#! /usr/bin/env perl
# Copyright 2015-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# ECP_NISTZ256 module for x86/SSE2.
#
# October 2014.
#
# Original ECP_NISTZ256 submission targeting x86_64 is detailed in
# http://eprint.iacr.org/2013/816. In the process of adaptation
# original .c module was made 32-bit savvy in order to make this
# implementation possible.
#
#		with/without -DECP_NISTZ256_ASM
# Pentium	+66-163%
# PIII		+72-172%
# P4		+65-132%
# Core2		+90-215%
# Sandy Bridge	+105-265% (contemporary i[57]-* are all close to this)
# Atom		+65-155%
# Opteron	+54-110%
# Bulldozer	+99-240%
# VIA Nano	+93-290%
#
# Ranges denote minimum and maximum improvement coefficients depending
# on benchmark. Lower coefficients are for ECDSA sign, server-side
# operation. Keep in mind that +200% means 3x improvement.
########################################################################
# Bootstrap: make the perlasm helpers loadable relative to this script
# and send the generated assembly to the requested output file.
#
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;	# directory part of the script path
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

# The last command-line argument is taken as the output file name.  A
# failed open used to go unnoticed and left the generated code on the
# original STDOUT, so fail loudly instead.
$output=pop;
if ($output) {
	open STDOUT,">$output" or die "can't open $output: $!";
}

&asm_init($ARGV[0],$ARGV[$#ARGV] eq "386");

# SSE2 code paths are emitted only if -DOPENSSL_IA32_SSE2 was passed.
$sse2=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }

&external_label("OPENSSL_ia32cap_P") if ($sse2);


########################################################################
# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
#
# The table source is looked for in the current directory first and then
# one level above this script's directory.
my $table_fh;	# declared up front: a "my" inside the first open() would
		# not yet be visible to the fallback open() of the same
		# statement
open($table_fh,"<","ecp_nistz256_table.c")		or
open($table_fh,"<","${dir}../ecp_nistz256_table.c")	or
die "failed to open ecp_nistz256_table.c:",$!;

use integer;	# $i/4 below must be an integer division; being a file-scope
		# pragma this also applies to the code that follows

# Each TOBN(x,y) initializer contributes two 32-bit words to @arr, the
# second argument first (presumably the least significant half).
while (my $row = <$table_fh>) {
	while ($row =~ /TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/g) {
		push @arr,hex($2),hex($1);
	}
}
close $table_fh;

# See ecp_nistz256_table.c for explanation for why it's 64*16*37.
die "insane number of elements" if (scalar(@arr) != 64*16*37);

&public_label("ecp_nistz256_precomputed");
&align(4096);
&set_label("ecp_nistz256_precomputed");

########################################################################
# this conversion smashes P256_POINT_AFFINE by individual bytes with
# 64 byte interval, similar to
#	1111222233334444
#	1234123412341234
#
# Each of the 37 sub-tables holds 64 entries of 16 words.  Output row $i
# carries byte $i of every entry: word $i/4 of entry $j, shifted down by
# ($i%4)*8 bits.  $i and $j stay package globals on purpose, the file runs
# without "use strict" and reuses $i further down.
for(1..37) {
	@tbl = splice(@arr,0,64*16);
	for($i=0;$i<64;$i++) {
		undef @line;
		for($j=0;$j<64;$j++) {
			# scalar element access; the original one-element
			# slice @tbl[...] only worked by accident
			push @line,($tbl[$j*16+$i/4]>>(($i%4)*8))&0xff;
		}
		&data_byte(join(',',map { sprintf "0x%02x",$_} @line));
	}
}

########################################################################
# Keep in mind that constants are stored least to most significant word
&static_label("RR");
&set_label("RR",64);
&data_word(3,0,-1,-5,-2,-1,-3,4);	# 2^512 mod P-256

&static_label("ONE_mont");
&set_label("ONE_mont");
&data_word(1,0,0,-1,-1,-1,-2,0);	# 2^256 mod P-256, i.e. 1 in Montgomery form

&static_label("ONE");
&set_label("ONE");
&data_word(1,0,0,0,0,0,0,0);		# plain 1
&asciz("ECP_NISZ256 for x86/SSE2, CRYPTOGAMS by <appro\@openssl.org>");
&align(64);

########################################################################
# void ecp_nistz256_mul_by_2(BN_ULONG edi[8],const BN_ULONG esi[8]);
#
# Doubling is done as a modular addition of the input to itself.
&function_begin("ecp_nistz256_mul_by_2");
	&mov	("esi",&wparam(1));
	&mov	("edi",&wparam(0));
	&mov	("ebp","esi");
########################################################################
# common pattern for internal functions is that %edi is result pointer,
# %esi and %ebp are input ones, %ebp being optional. %edi is preserved.
	&call	("_ecp_nistz256_add");
&function_end("ecp_nistz256_mul_by_2");

########################################################################
# void ecp_nistz256_mul_by_3(BN_ULONG edi[8],const BN_ULONG esi[8]);
&function_begin("ecp_nistz256_mul_by_3");
	&mov	("esi",&wparam(1));
					# multiplication by 3 is performed
					# as 2*n+n, but we can't use output
					# to store 2*n, because if output
					# pointer equals to input, then
					# we'll get 2*n+2*n.
	&stack_push(8);			# therefore we need to allocate
					# 256-bit intermediate buffer.
	&mov	("edi","esp");
	&mov	("ebp","esi");
	&call	("_ecp_nistz256_add");	# buffer = n+n
	&lea	("esi",&DWP(0,"edi"));	# first input = buffer
	&mov	("ebp",&wparam(1));	# second input = n
	&mov	("edi",&wparam(0));
	&call	("_ecp_nistz256_add");	# result = 2*n+n
	&stack_pop(8);
&function_end("ecp_nistz256_mul_by_3");

########################################################################
# void ecp_nistz256_div_by_2(BN_ULONG edi[8],const BN_ULONG esi[8]);
&function_begin("ecp_nistz256_div_by_2");
	&mov	("esi",&wparam(1));
	&mov	("edi",&wparam(0));
	&call	("_ecp_nistz256_div_by_2");
&function_end("ecp_nistz256_div_by_2");

# Internal helper: %edi is result pointer, %esi is input pointer.
# Clobbers %eax, %ebx, %ecx, %edx, %ebp and %esi.
&function_begin_B("_ecp_nistz256_div_by_2");
	# tmp = a is odd ? a+mod : a
	#
	# note that because mod has special form, i.e. consists of
	# 0xffffffff, 1 and 0s, we can conditionally synthesize it by
	# assigning least significant bit of input to one register,
	# %ebp, and its negative to another, %edx.

	&mov	("ebp",&DWP(0,"esi"));
	&xor	("edx","edx");
	&mov	("ebx",&DWP(4,"esi"));
	&mov	("eax","ebp");
	&and	("ebp",1);
	&mov	("ecx",&DWP(8,"esi"));
	&sub	("edx","ebp");

	&add	("eax","edx");
	&adc	("ebx","edx");
	&mov	(&DWP(0,"edi"),"eax");
	&adc	("ecx","edx");
	&mov	(&DWP(4,"edi"),"ebx");
	&mov	(&DWP(8,"edi"),"ecx");

	&mov	("eax",&DWP(12,"esi"));
	&mov	("ebx",&DWP(16,"esi"));
	&adc	("eax",0);
	&mov	("ecx",&DWP(20,"esi"));
	&adc	("ebx",0);
	&mov	(&DWP(12,"edi"),"eax");
	&adc	("ecx",0);
	&mov	(&DWP(16,"edi"),"ebx");
	&mov	(&DWP(20,"edi"),"ecx");

	&mov	("eax",&DWP(24,"esi"));
	&mov	("ebx",&DWP(28,"esi"));
	&adc	("eax","ebp");
	&adc	("ebx","edx");
	&mov	(&DWP(24,"edi"),"eax");
	&sbb	("esi","esi");			# broadcast carry bit
	&mov	(&DWP(28,"edi"),"ebx");

	# ret = tmp >> 1
	#
	# Every output word is (word >> 1) | (next word << 31); the
	# loads and stores of neighbouring words are interleaved.

	&mov	("eax",&DWP(0,"edi"));
	&mov	("ebx",&DWP(4,"edi"));
	&mov	("ecx",&DWP(8,"edi"));
	&mov	("edx",&DWP(12,"edi"));

	&shr	("eax",1);
	&mov	("ebp","ebx");
	&shl	("ebx",31);
	&or	("eax","ebx");

	&shr	("ebp",1);
	&mov	("ebx","ecx");
	&shl	("ecx",31);
	&mov	(&DWP(0,"edi"),"eax");
	&or	("ebp","ecx");
	&mov	("eax",&DWP(16,"edi"));

	&shr	("ebx",1);
	&mov	("ecx","edx");
	&shl	("edx",31);
	&mov	(&DWP(4,"edi"),"ebp");
	&or	("ebx","edx");
	&mov	("ebp",&DWP(20,"edi"));

	&shr	("ecx",1);
	&mov	("edx","eax");
	&shl	("eax",31);
	&mov	(&DWP(8,"edi"),"ebx");
	&or	("ecx","eax");
	&mov	("ebx",&DWP(24,"edi"));

	&shr	("edx",1);
	&mov	("eax","ebp");
	&shl	("ebp",31);
	&mov	(&DWP(12,"edi"),"ecx");
	&or	("edx","ebp");
	&mov	("ecx",&DWP(28,"edi"));

	&shr	("eax",1);
	&mov	("ebp","ebx");
	&shl	("ebx",31);
	&mov	(&DWP(16,"edi"),"edx");
	&or	("eax","ebx");

	&shr	("ebp",1);
	&mov	("ebx","ecx");
	&shl	("ecx",31);
	&mov	(&DWP(20,"edi"),"eax");
	&or	("ebp","ecx");

	&shr	("ebx",1);
	&shl	("esi",31);
	&mov	(&DWP(24,"edi"),"ebp");
	&or	("ebx","esi");			# handle top-most carry bit
	&mov	(&DWP(28,"edi"),"ebx");

	&ret	();
&function_end_B("_ecp_nistz256_div_by_2");

########################################################################
# void ecp_nistz256_add(BN_ULONG edi[8],const BN_ULONG esi[8],
#					const BN_ULONG ebp[8]);
&function_begin("ecp_nistz256_add");
	&mov	("esi",&wparam(1));
	&mov	("ebp",&wparam(2));
	&mov	("edi",&wparam(0));
	&call	("_ecp_nistz256_add");
&function_end("ecp_nistz256_add");

# Internal helper: %edi is result pointer, %esi and %ebp are inputs.
# Clobbers %eax, %ebx, %ecx, %edx, %ebp and %esi.
&function_begin_B("_ecp_nistz256_add");
	# Plain 256-bit addition, sum goes to the output buffer and the
	# carry out of the top word is collected in %esi.
	&mov	("eax",&DWP(0,"esi"));
	&mov	("ebx",&DWP(4,"esi"));
	&mov	("ecx",&DWP(8,"esi"));
	&add	("eax",&DWP(0,"ebp"));
	&mov	("edx",&DWP(12,"esi"));
	&adc	("ebx",&DWP(4,"ebp"));
	&mov	(&DWP(0,"edi"),"eax");
	&adc	("ecx",&DWP(8,"ebp"));
	&mov	(&DWP(4,"edi"),"ebx");
	&adc	("edx",&DWP(12,"ebp"));
	&mov	(&DWP(8,"edi"),"ecx");
	&mov	(&DWP(12,"edi"),"edx");

	&mov	("eax",&DWP(16,"esi"));
	&mov	("ebx",&DWP(20,"esi"));
	&mov	("ecx",&DWP(24,"esi"));
	&adc	("eax",&DWP(16,"ebp"));
	&mov	("edx",&DWP(28,"esi"));
	&adc	("ebx",&DWP(20,"ebp"));
	&mov	(&DWP(16,"edi"),"eax");
	&adc	("ecx",&DWP(24,"ebp"));
	&mov	(&DWP(20,"edi"),"ebx");
	&mov	("esi",0);			# mov leaves the carry flag intact
	&adc	("edx",&DWP(28,"ebp"));
	&mov	(&DWP(24,"edi"),"ecx");
	&adc	("esi",0);
	&mov	(&DWP(28,"edi"),"edx");

	# if a+b >= modulus, subtract modulus.
	#
	# But since comparison implies subtraction, we subtract modulus
	# to see if it borrows, and then subtract it for real if
	# subtraction didn't borrow.

	&mov	("eax",&DWP(0,"edi"));
	&mov	("ebx",&DWP(4,"edi"));
	&mov	("ecx",&DWP(8,"edi"));
	&sub	("eax",-1);
	&mov	("edx",&DWP(12,"edi"));
	&sbb	("ebx",-1);
	&mov	("eax",&DWP(16,"edi"));
	&sbb	("ecx",-1);
	&mov	("ebx",&DWP(20,"edi"));
	&sbb	("edx",0);
	&mov	("ecx",&DWP(24,"edi"));
	&sbb	("eax",0);
	&mov	("edx",&DWP(28,"edi"));
	&sbb	("ebx",0);
	&sbb	("ecx",1);
	&sbb	("edx",-1);
	&sbb	("esi",0);

	# Note that because mod has special form, i.e. consists of
	# 0xffffffff, 1 and 0s, we can conditionally synthesize it by
	# using borrow.

	# %esi is all-ones if the trial subtraction borrowed and zero
	# otherwise; invert it so that it is all-ones exactly when the
	# modulus has to be subtracted.  This must be the "not" instruction
	# helper spelled ampersand-n-o-t, it is easily mangled into a
	# single negation-sign character by HTML entity decoding.
	&not	("esi");
	&mov	("eax",&DWP(0,"edi"));
	&mov	("ebp","esi");
	&mov	("ebx",&DWP(4,"edi"));
	&shr	("ebp",31);
	&mov	("ecx",&DWP(8,"edi"));
	&sub	("eax","esi");
	&mov	("edx",&DWP(12,"edi"));
	&sbb	("ebx","esi");
	&mov	(&DWP(0,"edi"),"eax");
	&sbb	("ecx","esi");
	&mov	(&DWP(4,"edi"),"ebx");
	&sbb	("edx",0);
	&mov	(&DWP(8,"edi"),"ecx");
	&mov	(&DWP(12,"edi"),"edx");

	&mov	("eax",&DWP(16,"edi"));
	&mov	("ebx",&DWP(20,"edi"));
	&mov	("ecx",&DWP(24,"edi"));
	&sbb	("eax",0);
	&mov	("edx",&DWP(28,"edi"));
	&sbb	("ebx",0);
	&mov	(&DWP(16,"edi"),"eax");
	&sbb	("ecx","ebp");
	&mov	(&DWP(20,"edi"),"ebx");
	&sbb	("edx","esi");
	&mov	(&DWP(24,"edi"),"ecx");
	&mov	(&DWP(28,"edi"),"edx");

	&ret	();
&function_end_B("_ecp_nistz256_add");

########################################################################
# void ecp_nistz256_sub(BN_ULONG edi[8],const BN_ULONG esi[8],
#					const BN_ULONG ebp[8]);
&function_begin("ecp_nistz256_sub");
	&mov	("esi",&wparam(1));
	&mov	("ebp",&wparam(2));
	&mov	("edi",&wparam(0));
	&call	("_ecp_nistz256_sub");
&function_end("ecp_nistz256_sub");

# Internal helper: %edi is result pointer, %esi is minuend and %ebp is
# subtrahend.  Clobbers %eax, %ebx, %ecx, %edx, %ebp and %esi.
&function_begin_B("_ecp_nistz256_sub");
	&mov	("eax",&DWP(0,"esi"));
	&mov	("ebx",&DWP(4,"esi"));
	&mov	("ecx",&DWP(8,"esi"));
	&sub	("eax",&DWP(0,"ebp"));
	&mov	("edx",&DWP(12,"esi"));
	&sbb	("ebx",&DWP(4,"ebp"));
	&mov	(&DWP(0,"edi"),"eax");
	&sbb	("ecx",&DWP(8,"ebp"));
	&mov	(&DWP(4,"edi"),"ebx");
	&sbb	("edx",&DWP(12,"ebp"));
	&mov	(&DWP(8,"edi"),"ecx");
	&mov	(&DWP(12,"edi"),"edx");

	&mov	("eax",&DWP(16,"esi"));
	&mov	("ebx",&DWP(20,"esi"));
	&mov	("ecx",&DWP(24,"esi"));
	&sbb	("eax",&DWP(16,"ebp"));
	&mov	("edx",&DWP(28,"esi"));
	&sbb	("ebx",&DWP(20,"ebp"));
	&sbb	("ecx",&DWP(24,"ebp"));
	&mov	(&DWP(16,"edi"),"eax");
	&sbb	("edx",&DWP(28,"ebp"));
	&mov	(&DWP(20,"edi"),"ebx");
	&sbb	("esi","esi");			# broadcast borrow bit
	&mov	(&DWP(24,"edi"),"ecx");
	&mov	(&DWP(28,"edi"),"edx");

	# if a-b borrows, add modulus.
	#
	# Note that because mod has special form, i.e. consists of
	# 0xffffffff, 1 and 0s, we can conditionally synthesize it by
	# assigning borrow bit to one register, %ebp, and its negative
	# to another, %esi. But we started by calculating %esi...

	&mov	("eax",&DWP(0,"edi"));
	&mov	("ebp","esi");
	&mov	("ebx",&DWP(4,"edi"));
	&shr	("ebp",31);
	&mov	("ecx",&DWP(8,"edi"));
	&add	("eax","esi");
	&mov	("edx",&DWP(12,"edi"));
	&adc	("ebx","esi");
	&mov	(&DWP(0,"edi"),"eax");
	&adc	("ecx","esi");
	&mov	(&DWP(4,"edi"),"ebx");
	&adc	("edx",0);
	&mov	(&DWP(8,"edi"),"ecx");
	&mov	(&DWP(12,"edi"),"edx");

	&mov	("eax",&DWP(16,"edi"));
	&mov	("ebx",&DWP(20,"edi"));
	&mov	("ecx",&DWP(24,"edi"));
	&adc	("eax",0);
	&mov	("edx",&DWP(28,"edi"));
	&adc	("ebx",0);
	&mov	(&DWP(16,"edi"),"eax");
	&adc	("ecx","ebp");
	&mov	(&DWP(20,"edi"),"ebx");
	&adc	("edx","esi");
	&mov	(&DWP(24,"edi"),"ecx");
	&mov	(&DWP(28,"edi"),"edx");

	&ret	();
&function_end_B("_ecp_nistz256_sub");

########################################################################
# void ecp_nistz256_neg(BN_ULONG edi[8],const BN_ULONG esi[8]);
#
# Negation is computed as 0-a: a zeroed 256-bit stack buffer is passed
# as minuend and the input as subtrahend.
&function_begin("ecp_nistz256_neg");
	&mov	("ebp",&wparam(1));
	&mov	("edi",&wparam(0));

	&xor	("eax","eax");
	&stack_push(8);
	&mov	(&DWP(0,"esp"),"eax");
	&mov	("esi","esp");
	&mov	(&DWP(4,"esp"),"eax");
	&mov	(&DWP(8,"esp"),"eax");
	&mov	(&DWP(12,"esp"),"eax");
	&mov	(&DWP(16,"esp"),"eax");
	&mov	(&DWP(20,"esp"),"eax");
	&mov	(&DWP(24,"esp"),"eax");
	&mov	(&DWP(28,"esp"),"eax");

	&call	("_ecp_nistz256_sub");

	&stack_pop(8);
&function_end("ecp_nistz256_neg");

# Returns in %eax the dword found at the top of the stack on entry, i.e.
# the caller's return address.  Callers place the "pic" label right after
# the call and use %eax as base for position-independent addressing.
&function_begin_B("_picup_eax");
	&mov	("eax",&DWP(0,"esp"));
	&ret	();
&function_end_B("_picup_eax");

########################################################################
# void ecp_nistz256_to_mont(BN_ULONG edi[8],const BN_ULONG esi[8]);
#
# Montgomery multiplication of the input by RR (2^512 mod P-256).
&function_begin("ecp_nistz256_to_mont");
	&mov	("esi",&wparam(1));
	&call	("_picup_eax");
    &set_label("pic");
	&lea	("ebp",&DWP(&label("RR")."-".&label("pic"),"eax"));
	if ($sse2) {
		# _ecp_nistz256_mul_mont expects the first word of
		# OPENSSL_ia32cap_P in %eax; picmeup presumably resolves
		# its address position-independently.
		&picmeup("eax","OPENSSL_ia32cap_P","eax",&label("pic"));
		&mov	("eax",&DWP(0,"eax"));
	}
	&mov	("edi",&wparam(0));
	&call	("_ecp_nistz256_mul_mont");
&function_end("ecp_nistz256_to_mont");

########################################################################
# void ecp_nistz256_from_mont(BN_ULONG edi[8],const BN_ULONG esi[8]);
#
# Montgomery multiplication of the input by ONE (plain 1).
&function_begin("ecp_nistz256_from_mont");
	&mov	("esi",&wparam(1));
	&call	("_picup_eax");
    &set_label("pic");
	&lea	("ebp",&DWP(&label("ONE")."-".&label("pic"),"eax"));
	if ($sse2) {
		&picmeup("eax","OPENSSL_ia32cap_P","eax",&label("pic"));
		&mov	("eax",&DWP(0,"eax"));
	}
	&mov	("edi",&wparam(0));
	&call	("_ecp_nistz256_mul_mont");
&function_end("ecp_nistz256_from_mont");

########################################################################
# void ecp_nistz256_mul_mont(BN_ULONG edi[8],const BN_ULONG esi[8],
#					     const BN_ULONG ebp[8]);
&function_begin("ecp_nistz256_mul_mont");
	&mov	("esi",&wparam(1));
	&mov	("ebp",&wparam(2));
	if ($sse2) {
		&call	("_picup_eax");
	    &set_label("pic");
		&picmeup("eax","OPENSSL_ia32cap_P","eax",&label("pic"));
		&mov	("eax",&DWP(0,"eax"));
	}
	&mov	("edi",&wparam(0));
	&call	("_ecp_nistz256_mul_mont");
&function_end("ecp_nistz256_mul_mont");

########################################################################
# void ecp_nistz256_sqr_mont(BN_ULONG edi[8],const BN_ULONG esi[8]);
#
# Squaring is a Montgomery multiplication of the input by itself.
&function_begin("ecp_nistz256_sqr_mont");
	&mov	("esi",&wparam(1));
	if ($sse2) {
		&call	("_picup_eax");
	    &set_label("pic");
		&picmeup("eax","OPENSSL_ia32cap_P","eax",&label("pic"));
		&mov	("eax",&DWP(0,"eax"));
	}
	&mov	("edi",&wparam(0));
	&mov	("ebp","esi");
	&call	("_ecp_nistz256_mul_mont");
&function_end("ecp_nistz256_sqr_mont");

&function_begin_B("_ecp_nistz256_mul_mont");
						if ($sse2) {
	&and	("eax",1<<24|1<<26);
	&cmp	("eax",1<<24|1<<26);		# see if XMM+SSE2 is on
	&jne	(&label("mul_mont_ialu"));

	########################################
	# SSE2 code path featuring 32x16-bit
	# multiplications is ~2x faster than
	# IALU counterpart (except on Atom)...
	########################################
	# stack layout:
	# +------------------------------------+< %esp
	# | 7 16-byte temporary XMM words,     |
	# | "sliding" toward lower address     |
	# .                                    .
	# +------------------------------------+
	# | unused XMM word                    |
	# +------------------------------------+< +128,%ebx
	# | 8 16-byte XMM words holding copies |
	# | of a[i]<<64|a[i]                   |
	# .                                    .
	# .                                    .
536 # +------------------------------------+< +256 537 &mov ("edx","esp"); 538 &sub ("esp",0x100); 539 540 &movd ("xmm7",&DWP(0,"ebp")); # b[0] -> 0000.00xy 541 &lea ("ebp",&DWP(4,"ebp")); 542 &pcmpeqd("xmm6","xmm6"); 543 &psrlq ("xmm6",48); # compose 0xffff<<64|0xffff 544 545 &pshuflw("xmm7","xmm7",0b11011100); # 0000.00xy -> 0000.0x0y 546 &and ("esp",-64); 547 &pshufd ("xmm7","xmm7",0b11011100); # 0000.0x0y -> 000x.000y 548 &lea ("ebx",&DWP(0x80,"esp")); 549 550 &movd ("xmm0",&DWP(4*0,"esi")); # a[0] -> 0000.00xy 551 &pshufd ("xmm0","xmm0",0b11001100); # 0000.00xy -> 00xy.00xy 552 &movd ("xmm1",&DWP(4*1,"esi")); # a[1] -> ... 553 &movdqa (&QWP(0x00,"ebx"),"xmm0"); # offload converted a[0] 554 &pmuludq("xmm0","xmm7"); # a[0]*b[0] 555 556 &movd ("xmm2",&DWP(4*2,"esi")); 557 &pshufd ("xmm1","xmm1",0b11001100); 558 &movdqa (&QWP(0x10,"ebx"),"xmm1"); 559 &pmuludq("xmm1","xmm7"); # a[1]*b[0] 560 561 &movq ("xmm4","xmm0"); # clear upper 64 bits 562 &pslldq("xmm4",6); 563 &paddq ("xmm4","xmm0"); 564 &movdqa("xmm5","xmm4"); 565 &psrldq("xmm4",10); # upper 32 bits of a[0]*b[0] 566 &pand ("xmm5","xmm6"); # lower 32 bits of a[0]*b[0] 567 568 # Upper half of a[0]*b[i] is carried into next multiplication 569 # iteration, while lower one "participates" in actual reduction. 570 # Normally latter is done by accumulating result of multiplication 571 # of modulus by "magic" digit, but thanks to special form of modulus 572 # and "magic" digit it can be performed only with additions and 573 # subtractions (see note in IALU section below). Note that we are 574 # not bothered with carry bits, they are accumulated in "flatten" 575 # phase after all multiplications and reductions. 
576 577 &movd ("xmm3",&DWP(4*3,"esi")); 578 &pshufd ("xmm2","xmm2",0b11001100); 579 &movdqa (&QWP(0x20,"ebx"),"xmm2"); 580 &pmuludq("xmm2","xmm7"); # a[2]*b[0] 581 &paddq ("xmm1","xmm4"); # a[1]*b[0]+hw(a[0]*b[0]), carry 582 &movdqa (&QWP(0x00,"esp"),"xmm1"); # t[0] 583 584 &movd ("xmm0",&DWP(4*4,"esi")); 585 &pshufd ("xmm3","xmm3",0b11001100); 586 &movdqa (&QWP(0x30,"ebx"),"xmm3"); 587 &pmuludq("xmm3","xmm7"); # a[3]*b[0] 588 &movdqa (&QWP(0x10,"esp"),"xmm2"); 589 590 &movd ("xmm1",&DWP(4*5,"esi")); 591 &pshufd ("xmm0","xmm0",0b11001100); 592 &movdqa (&QWP(0x40,"ebx"),"xmm0"); 593 &pmuludq("xmm0","xmm7"); # a[4]*b[0] 594 &paddq ("xmm3","xmm5"); # a[3]*b[0]+lw(a[0]*b[0]), reduction step 595 &movdqa (&QWP(0x20,"esp"),"xmm3"); 596 597 &movd ("xmm2",&DWP(4*6,"esi")); 598 &pshufd ("xmm1","xmm1",0b11001100); 599 &movdqa (&QWP(0x50,"ebx"),"xmm1"); 600 &pmuludq("xmm1","xmm7"); # a[5]*b[0] 601 &movdqa (&QWP(0x30,"esp"),"xmm0"); 602 &pshufd("xmm4","xmm5",0b10110001); # xmm4 = xmm5<<32, reduction step 603 604 &movd ("xmm3",&DWP(4*7,"esi")); 605 &pshufd ("xmm2","xmm2",0b11001100); 606 &movdqa (&QWP(0x60,"ebx"),"xmm2"); 607 &pmuludq("xmm2","xmm7"); # a[6]*b[0] 608 &movdqa (&QWP(0x40,"esp"),"xmm1"); 609 &psubq ("xmm4","xmm5"); # xmm4 = xmm5*0xffffffff, reduction step 610 611 &movd ("xmm0",&DWP(0,"ebp")); # b[1] -> 0000.00xy 612 &pshufd ("xmm3","xmm3",0b11001100); 613 &movdqa (&QWP(0x70,"ebx"),"xmm3"); 614 &pmuludq("xmm3","xmm7"); # a[7]*b[0] 615 616 &pshuflw("xmm7","xmm0",0b11011100); # 0000.00xy -> 0000.0x0y 617 &movdqa ("xmm0",&QWP(0x00,"ebx")); # pre-load converted a[0] 618 &pshufd ("xmm7","xmm7",0b11011100); # 0000.0x0y -> 000x.000y 619 620 &mov ("ecx",6); 621 &lea ("ebp",&DWP(4,"ebp")); 622 &jmp (&label("madd_sse2")); 623 624&set_label("madd_sse2",16); 625 &paddq ("xmm2","xmm5"); # a[6]*b[i-1]+lw(a[0]*b[i-1]), reduction step [modulo-scheduled] 626 &paddq ("xmm3","xmm4"); # a[7]*b[i-1]+lw(a[0]*b[i-1])*0xffffffff, reduction step [modulo-scheduled] 627 &movdqa 
("xmm1",&QWP(0x10,"ebx")); 628 &pmuludq("xmm0","xmm7"); # a[0]*b[i] 629 &movdqa(&QWP(0x50,"esp"),"xmm2"); 630 631 &movdqa ("xmm2",&QWP(0x20,"ebx")); 632 &pmuludq("xmm1","xmm7"); # a[1]*b[i] 633 &movdqa(&QWP(0x60,"esp"),"xmm3"); 634 &paddq ("xmm0",&QWP(0x00,"esp")); 635 636 &movdqa ("xmm3",&QWP(0x30,"ebx")); 637 &pmuludq("xmm2","xmm7"); # a[2]*b[i] 638 &movq ("xmm4","xmm0"); # clear upper 64 bits 639 &pslldq("xmm4",6); 640 &paddq ("xmm1",&QWP(0x10,"esp")); 641 &paddq ("xmm4","xmm0"); 642 &movdqa("xmm5","xmm4"); 643 &psrldq("xmm4",10); # upper 33 bits of a[0]*b[i]+t[0] 644 645 &movdqa ("xmm0",&QWP(0x40,"ebx")); 646 &pmuludq("xmm3","xmm7"); # a[3]*b[i] 647 &paddq ("xmm1","xmm4"); # a[1]*b[i]+hw(a[0]*b[i]), carry 648 &paddq ("xmm2",&QWP(0x20,"esp")); 649 &movdqa (&QWP(0x00,"esp"),"xmm1"); 650 651 &movdqa ("xmm1",&QWP(0x50,"ebx")); 652 &pmuludq("xmm0","xmm7"); # a[4]*b[i] 653 &paddq ("xmm3",&QWP(0x30,"esp")); 654 &movdqa (&QWP(0x10,"esp"),"xmm2"); 655 &pand ("xmm5","xmm6"); # lower 32 bits of a[0]*b[i] 656 657 &movdqa ("xmm2",&QWP(0x60,"ebx")); 658 &pmuludq("xmm1","xmm7"); # a[5]*b[i] 659 &paddq ("xmm3","xmm5"); # a[3]*b[i]+lw(a[0]*b[i]), reduction step 660 &paddq ("xmm0",&QWP(0x40,"esp")); 661 &movdqa (&QWP(0x20,"esp"),"xmm3"); 662 &pshufd("xmm4","xmm5",0b10110001); # xmm4 = xmm5<<32, reduction step 663 664 &movdqa ("xmm3","xmm7"); 665 &pmuludq("xmm2","xmm7"); # a[6]*b[i] 666 &movd ("xmm7",&DWP(0,"ebp")); # b[i++] -> 0000.00xy 667 &lea ("ebp",&DWP(4,"ebp")); 668 &paddq ("xmm1",&QWP(0x50,"esp")); 669 &psubq ("xmm4","xmm5"); # xmm4 = xmm5*0xffffffff, reduction step 670 &movdqa (&QWP(0x30,"esp"),"xmm0"); 671 &pshuflw("xmm7","xmm7",0b11011100); # 0000.00xy -> 0000.0x0y 672 673 &pmuludq("xmm3",&QWP(0x70,"ebx")); # a[7]*b[i] 674 &pshufd("xmm7","xmm7",0b11011100); # 0000.0x0y -> 000x.000y 675 &movdqa("xmm0",&QWP(0x00,"ebx")); # pre-load converted a[0] 676 &movdqa (&QWP(0x40,"esp"),"xmm1"); 677 &paddq ("xmm2",&QWP(0x60,"esp")); 678 679 &dec ("ecx"); 680 &jnz 
(&label("madd_sse2")); 681 682 &paddq ("xmm2","xmm5"); # a[6]*b[6]+lw(a[0]*b[6]), reduction step [modulo-scheduled] 683 &paddq ("xmm3","xmm4"); # a[7]*b[6]+lw(a[0]*b[6])*0xffffffff, reduction step [modulo-scheduled] 684 &movdqa ("xmm1",&QWP(0x10,"ebx")); 685 &pmuludq("xmm0","xmm7"); # a[0]*b[7] 686 &movdqa(&QWP(0x50,"esp"),"xmm2"); 687 688 &movdqa ("xmm2",&QWP(0x20,"ebx")); 689 &pmuludq("xmm1","xmm7"); # a[1]*b[7] 690 &movdqa(&QWP(0x60,"esp"),"xmm3"); 691 &paddq ("xmm0",&QWP(0x00,"esp")); 692 693 &movdqa ("xmm3",&QWP(0x30,"ebx")); 694 &pmuludq("xmm2","xmm7"); # a[2]*b[7] 695 &movq ("xmm4","xmm0"); # clear upper 64 bits 696 &pslldq("xmm4",6); 697 &paddq ("xmm1",&QWP(0x10,"esp")); 698 &paddq ("xmm4","xmm0"); 699 &movdqa("xmm5","xmm4"); 700 &psrldq("xmm4",10); # upper 33 bits of a[0]*b[i]+t[0] 701 702 &movdqa ("xmm0",&QWP(0x40,"ebx")); 703 &pmuludq("xmm3","xmm7"); # a[3]*b[7] 704 &paddq ("xmm1","xmm4"); # a[1]*b[7]+hw(a[0]*b[7]), carry 705 &paddq ("xmm2",&QWP(0x20,"esp")); 706 &movdqa (&QWP(0x00,"esp"),"xmm1"); 707 708 &movdqa ("xmm1",&QWP(0x50,"ebx")); 709 &pmuludq("xmm0","xmm7"); # a[4]*b[7] 710 &paddq ("xmm3",&QWP(0x30,"esp")); 711 &movdqa (&QWP(0x10,"esp"),"xmm2"); 712 &pand ("xmm5","xmm6"); # lower 32 bits of a[0]*b[i] 713 714 &movdqa ("xmm2",&QWP(0x60,"ebx")); 715 &pmuludq("xmm1","xmm7"); # a[5]*b[7] 716 &paddq ("xmm3","xmm5"); # reduction step 717 &paddq ("xmm0",&QWP(0x40,"esp")); 718 &movdqa (&QWP(0x20,"esp"),"xmm3"); 719 &pshufd("xmm4","xmm5",0b10110001); # xmm4 = xmm5<<32, reduction step 720 721 &movdqa ("xmm3",&QWP(0x70,"ebx")); 722 &pmuludq("xmm2","xmm7"); # a[6]*b[7] 723 &paddq ("xmm1",&QWP(0x50,"esp")); 724 &psubq ("xmm4","xmm5"); # xmm4 = xmm5*0xffffffff, reduction step 725 &movdqa (&QWP(0x30,"esp"),"xmm0"); 726 727 &pmuludq("xmm3","xmm7"); # a[7]*b[7] 728 &pcmpeqd("xmm7","xmm7"); 729 &movdqa ("xmm0",&QWP(0x00,"esp")); 730 &pslldq ("xmm7",8); 731 &movdqa (&QWP(0x40,"esp"),"xmm1"); 732 &paddq ("xmm2",&QWP(0x60,"esp")); 733 734 &paddq ("xmm2","xmm5"); # 
a[6]*b[7]+lw(a[0]*b[7]), reduction step 735 &paddq ("xmm3","xmm4"); # a[6]*b[7]+lw(a[0]*b[7])*0xffffffff, reduction step 736 &movdqa(&QWP(0x50,"esp"),"xmm2"); 737 &movdqa(&QWP(0x60,"esp"),"xmm3"); 738 739 &movdqa ("xmm1",&QWP(0x10,"esp")); 740 &movdqa ("xmm2",&QWP(0x20,"esp")); 741 &movdqa ("xmm3",&QWP(0x30,"esp")); 742 743 &movq ("xmm4","xmm0"); # "flatten" 744 &pand ("xmm0","xmm7"); 745 &xor ("ebp","ebp"); 746 &pslldq ("xmm4",6); 747 &movq ("xmm5","xmm1"); 748 &paddq ("xmm0","xmm4"); 749 &pand ("xmm1","xmm7"); 750 &psrldq ("xmm0",6); 751 &movd ("eax","xmm0"); 752 &psrldq ("xmm0",4); 753 754 &paddq ("xmm5","xmm0"); 755 &movdqa ("xmm0",&QWP(0x40,"esp")); 756 &sub ("eax",-1); # start subtracting modulus, 757 # this is used to determine 758 # if result is larger/smaller 759 # than modulus (see below) 760 &pslldq ("xmm5",6); 761 &movq ("xmm4","xmm2"); 762 &paddq ("xmm1","xmm5"); 763 &pand ("xmm2","xmm7"); 764 &psrldq ("xmm1",6); 765 &mov (&DWP(4*0,"edi"),"eax"); 766 &movd ("eax","xmm1"); 767 &psrldq ("xmm1",4); 768 769 &paddq ("xmm4","xmm1"); 770 &movdqa ("xmm1",&QWP(0x50,"esp")); 771 &sbb ("eax",-1); 772 &pslldq ("xmm4",6); 773 &movq ("xmm5","xmm3"); 774 &paddq ("xmm2","xmm4"); 775 &pand ("xmm3","xmm7"); 776 &psrldq ("xmm2",6); 777 &mov (&DWP(4*1,"edi"),"eax"); 778 &movd ("eax","xmm2"); 779 &psrldq ("xmm2",4); 780 781 &paddq ("xmm5","xmm2"); 782 &movdqa ("xmm2",&QWP(0x60,"esp")); 783 &sbb ("eax",-1); 784 &pslldq ("xmm5",6); 785 &movq ("xmm4","xmm0"); 786 &paddq ("xmm3","xmm5"); 787 &pand ("xmm0","xmm7"); 788 &psrldq ("xmm3",6); 789 &mov (&DWP(4*2,"edi"),"eax"); 790 &movd ("eax","xmm3"); 791 &psrldq ("xmm3",4); 792 793 &paddq ("xmm4","xmm3"); 794 &sbb ("eax",0); 795 &pslldq ("xmm4",6); 796 &movq ("xmm5","xmm1"); 797 &paddq ("xmm0","xmm4"); 798 &pand ("xmm1","xmm7"); 799 &psrldq ("xmm0",6); 800 &mov (&DWP(4*3,"edi"),"eax"); 801 &movd ("eax","xmm0"); 802 &psrldq ("xmm0",4); 803 804 &paddq ("xmm5","xmm0"); 805 &sbb ("eax",0); 806 &pslldq ("xmm5",6); 807 &movq 
("xmm4","xmm2"); 808 &paddq ("xmm1","xmm5"); 809 &pand ("xmm2","xmm7"); 810 &psrldq ("xmm1",6); 811 &movd ("ebx","xmm1"); 812 &psrldq ("xmm1",4); 813 &mov ("esp","edx"); 814 815 &paddq ("xmm4","xmm1"); 816 &pslldq ("xmm4",6); 817 &paddq ("xmm2","xmm4"); 818 &psrldq ("xmm2",6); 819 &movd ("ecx","xmm2"); 820 &psrldq ("xmm2",4); 821 &sbb ("ebx",0); 822 &movd ("edx","xmm2"); 823 &pextrw ("esi","xmm2",2); # top-most overflow bit 824 &sbb ("ecx",1); 825 &sbb ("edx",-1); 826 &sbb ("esi",0); # borrow from subtraction 827 828 # Final step is "if result > mod, subtract mod", and at this point 829 # we have result - mod written to output buffer, as well as borrow 830 # bit from this subtraction, and if borrow bit is set, we add 831 # modulus back. 832 # 833 # Note that because mod has special form, i.e. consists of 834 # 0xffffffff, 1 and 0s, we can conditionally synthesize it by 835 # assigning borrow bit to one register, %ebp, and its negative 836 # to another, %esi. But we started by calculating %esi... 837 838 &sub ("ebp","esi"); 839 &add (&DWP(4*0,"edi"),"esi"); # add modulus or zero 840 &adc (&DWP(4*1,"edi"),"esi"); 841 &adc (&DWP(4*2,"edi"),"esi"); 842 &adc (&DWP(4*3,"edi"),0); 843 &adc ("eax",0); 844 &adc ("ebx",0); 845 &mov (&DWP(4*4,"edi"),"eax"); 846 &adc ("ecx","ebp"); 847 &mov (&DWP(4*5,"edi"),"ebx"); 848 &adc ("edx","esi"); 849 &mov (&DWP(4*6,"edi"),"ecx"); 850 &mov (&DWP(4*7,"edi"),"edx"); 851 852 &ret (); 853 854&set_label("mul_mont_ialu",16); } 855 856 ######################################## 857 # IALU code path suitable for all CPUs. 858 ######################################## 859 # stack layout: 860 # +------------------------------------+< %esp 861 # | 8 32-bit temporary words, accessed | 862 # | as circular buffer | 863 # . . 864 # . . 
865 # +------------------------------------+< +32 866 # | offloaded destination pointer | 867 # +------------------------------------+ 868 # | unused | 869 # +------------------------------------+< +40 870 &sub ("esp",10*4); 871 872 &mov ("eax",&DWP(0*4,"esi")); # a[0] 873 &mov ("ebx",&DWP(0*4,"ebp")); # b[0] 874 &mov (&DWP(8*4,"esp"),"edi"); # off-load dst ptr 875 876 &mul ("ebx"); # a[0]*b[0] 877 &mov (&DWP(0*4,"esp"),"eax"); # t[0] 878 &mov ("eax",&DWP(1*4,"esi")); 879 &mov ("ecx","edx") 880 881 &mul ("ebx"); # a[1]*b[0] 882 &add ("ecx","eax"); 883 &mov ("eax",&DWP(2*4,"esi")); 884 &adc ("edx",0); 885 &mov (&DWP(1*4,"esp"),"ecx"); # t[1] 886 &mov ("ecx","edx"); 887 888 &mul ("ebx"); # a[2]*b[0] 889 &add ("ecx","eax"); 890 &mov ("eax",&DWP(3*4,"esi")); 891 &adc ("edx",0); 892 &mov (&DWP(2*4,"esp"),"ecx"); # t[2] 893 &mov ("ecx","edx"); 894 895 &mul ("ebx"); # a[3]*b[0] 896 &add ("ecx","eax"); 897 &mov ("eax",&DWP(4*4,"esi")); 898 &adc ("edx",0); 899 &mov (&DWP(3*4,"esp"),"ecx"); # t[3] 900 &mov ("ecx","edx"); 901 902 &mul ("ebx"); # a[4]*b[0] 903 &add ("ecx","eax"); 904 &mov ("eax",&DWP(5*4,"esi")); 905 &adc ("edx",0); 906 &mov (&DWP(4*4,"esp"),"ecx"); # t[4] 907 &mov ("ecx","edx"); 908 909 &mul ("ebx"); # a[5]*b[0] 910 &add ("ecx","eax"); 911 &mov ("eax",&DWP(6*4,"esi")); 912 &adc ("edx",0); 913 &mov (&DWP(5*4,"esp"),"ecx"); # t[5] 914 &mov ("ecx","edx"); 915 916 &mul ("ebx"); # a[6]*b[0] 917 &add ("ecx","eax"); 918 &mov ("eax",&DWP(7*4,"esi")); 919 &adc ("edx",0); 920 &mov (&DWP(6*4,"esp"),"ecx"); # t[6] 921 &mov ("ecx","edx"); 922 923 &xor ("edi","edi"); # initial top-most carry 924 &mul ("ebx"); # a[7]*b[0] 925 &add ("ecx","eax"); # t[7] 926 &mov ("eax",&DWP(0*4,"esp")); # t[0] 927 &adc ("edx",0); # t[8] 928 929for ($i=0;$i<7;$i++) { 930 my $j=$i+1; 931 932 # Reduction iteration is normally performed by accumulating 933 # result of multiplication of modulus by "magic" digit [and 934 # omitting least significant word, which is guaranteed to 935 # be 0], but 
thanks to special form of modulus and "magic" 936 # digit being equal to least significant word, it can be 937 # performed with additions and subtractions alone. Indeed: 938 # 939 # ffff.0001.0000.0000.0000.ffff.ffff.ffff 940 # * abcd 941 # + xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd 942 # 943 # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we 944 # rewrite above as: 945 # 946 # xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd 947 # + abcd.0000.abcd.0000.0000.abcd.0000.0000.0000 948 # - abcd.0000.0000.0000.0000.0000.0000.abcd 949 # 950 # or marking redundant operations: 951 # 952 # xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.---- 953 # + abcd.0000.abcd.0000.0000.abcd.----.----.---- 954 # - abcd.----.----.----.----.----.----.---- 955 956 &add (&DWP((($i+3)%8)*4,"esp"),"eax"); # t[3]+=t[0] 957 &adc (&DWP((($i+4)%8)*4,"esp"),0); # t[4]+=0 958 &adc (&DWP((($i+5)%8)*4,"esp"),0); # t[5]+=0 959 &adc (&DWP((($i+6)%8)*4,"esp"),"eax"); # t[6]+=t[0] 960 &adc ("ecx",0); # t[7]+=0 961 &adc ("edx","eax"); # t[8]+=t[0] 962 &adc ("edi",0); # top-most carry 963 &mov ("ebx",&DWP($j*4,"ebp")); # b[i] 964 &sub ("ecx","eax"); # t[7]-=t[0] 965 &mov ("eax",&DWP(0*4,"esi")); # a[0] 966 &sbb ("edx",0); # t[8]-=0 967 &mov (&DWP((($i+7)%8)*4,"esp"),"ecx"); 968 &sbb ("edi",0); # top-most carry, 969 # keep in mind that 970 # netto result is 971 # *addition* of value 972 # with (abcd<<32)-abcd 973 # on top, so that 974 # underflow is 975 # impossible, because 976 # (abcd<<32)-abcd 977 # doesn't underflow 978 &mov (&DWP((($i+8)%8)*4,"esp"),"edx"); 979 980 &mul ("ebx"); # a[0]*b[i] 981 &add ("eax",&DWP((($j+0)%8)*4,"esp")); 982 &adc ("edx",0); 983 &mov (&DWP((($j+0)%8)*4,"esp"),"eax"); 984 &mov ("eax",&DWP(1*4,"esi")); 985 &mov ("ecx","edx") 986 987 &mul ("ebx"); # a[1]*b[i] 988 &add ("ecx",&DWP((($j+1)%8)*4,"esp")); 989 &adc ("edx",0); 990 &add ("ecx","eax"); 991 &adc ("edx",0); 992 &mov ("eax",&DWP(2*4,"esi")); 993 &mov (&DWP((($j+1)%8)*4,"esp"),"ecx"); 994 &mov ("ecx","edx"); 995 996 &mul 
("ebx"); # a[2]*b[i] 997 &add ("ecx",&DWP((($j+2)%8)*4,"esp")); 998 &adc ("edx",0); 999 &add ("ecx","eax"); 1000 &adc ("edx",0); 1001 &mov ("eax",&DWP(3*4,"esi")); 1002 &mov (&DWP((($j+2)%8)*4,"esp"),"ecx"); 1003 &mov ("ecx","edx"); 1004 1005 &mul ("ebx"); # a[3]*b[i] 1006 &add ("ecx",&DWP((($j+3)%8)*4,"esp")); 1007 &adc ("edx",0); 1008 &add ("ecx","eax"); 1009 &adc ("edx",0); 1010 &mov ("eax",&DWP(4*4,"esi")); 1011 &mov (&DWP((($j+3)%8)*4,"esp"),"ecx"); 1012 &mov ("ecx","edx"); 1013 1014 &mul ("ebx"); # a[4]*b[i] 1015 &add ("ecx",&DWP((($j+4)%8)*4,"esp")); 1016 &adc ("edx",0); 1017 &add ("ecx","eax"); 1018 &adc ("edx",0); 1019 &mov ("eax",&DWP(5*4,"esi")); 1020 &mov (&DWP((($j+4)%8)*4,"esp"),"ecx"); 1021 &mov ("ecx","edx"); 1022 1023 &mul ("ebx"); # a[5]*b[i] 1024 &add ("ecx",&DWP((($j+5)%8)*4,"esp")); 1025 &adc ("edx",0); 1026 &add ("ecx","eax"); 1027 &adc ("edx",0); 1028 &mov ("eax",&DWP(6*4,"esi")); 1029 &mov (&DWP((($j+5)%8)*4,"esp"),"ecx"); 1030 &mov ("ecx","edx"); 1031 1032 &mul ("ebx"); # a[6]*b[i] 1033 &add ("ecx",&DWP((($j+6)%8)*4,"esp")); 1034 &adc ("edx",0); 1035 &add ("ecx","eax"); 1036 &adc ("edx",0); 1037 &mov ("eax",&DWP(7*4,"esi")); 1038 &mov (&DWP((($j+6)%8)*4,"esp"),"ecx"); 1039 &mov ("ecx","edx"); 1040 1041 &mul ("ebx"); # a[7]*b[i] 1042 &add ("ecx",&DWP((($j+7)%8)*4,"esp")); 1043 &adc ("edx",0); 1044 &add ("ecx","eax"); # t[7] 1045 &mov ("eax",&DWP((($j+0)%8)*4,"esp")); # t[0] 1046 &adc ("edx","edi"); # t[8] 1047 &mov ("edi",0); 1048 &adc ("edi",0); # top-most carry 1049} 1050 &mov ("ebp",&DWP(8*4,"esp")); # restore dst ptr 1051 &xor ("esi","esi"); 1052 my $j=$i+1; 1053 1054 # last multiplication-less reduction 1055 &add (&DWP((($i+3)%8)*4,"esp"),"eax"); # t[3]+=t[0] 1056 &adc (&DWP((($i+4)%8)*4,"esp"),0); # t[4]+=0 1057 &adc (&DWP((($i+5)%8)*4,"esp"),0); # t[5]+=0 1058 &adc (&DWP((($i+6)%8)*4,"esp"),"eax"); # t[6]+=t[0] 1059 &adc ("ecx",0); # t[7]+=0 1060 &adc ("edx","eax"); # t[8]+=t[0] 1061 &adc ("edi",0); # top-most carry 1062 &mov 
("ebx",&DWP((($j+1)%8)*4,"esp")); 1063 &sub ("ecx","eax"); # t[7]-=t[0] 1064 &mov ("eax",&DWP((($j+0)%8)*4,"esp")); 1065 &sbb ("edx",0); # t[8]-=0 1066 &mov (&DWP((($i+7)%8)*4,"esp"),"ecx"); 1067 &sbb ("edi",0); # top-most carry 1068 &mov (&DWP((($i+8)%8)*4,"esp"),"edx"); 1069 1070 # Final step is "if result > mod, subtract mod", but we do it 1071 # "other way around", namely write result - mod to output buffer 1072 # and if subtraction borrowed, add modulus back. 1073 1074 &mov ("ecx",&DWP((($j+2)%8)*4,"esp")); 1075 &sub ("eax",-1); 1076 &mov ("edx",&DWP((($j+3)%8)*4,"esp")); 1077 &sbb ("ebx",-1); 1078 &mov (&DWP(0*4,"ebp"),"eax"); 1079 &sbb ("ecx",-1); 1080 &mov (&DWP(1*4,"ebp"),"ebx"); 1081 &sbb ("edx",0); 1082 &mov (&DWP(2*4,"ebp"),"ecx"); 1083 &mov (&DWP(3*4,"ebp"),"edx"); 1084 1085 &mov ("eax",&DWP((($j+4)%8)*4,"esp")); 1086 &mov ("ebx",&DWP((($j+5)%8)*4,"esp")); 1087 &mov ("ecx",&DWP((($j+6)%8)*4,"esp")); 1088 &sbb ("eax",0); 1089 &mov ("edx",&DWP((($j+7)%8)*4,"esp")); 1090 &sbb ("ebx",0); 1091 &sbb ("ecx",1); 1092 &sbb ("edx",-1); 1093 &sbb ("edi",0); 1094 1095 # Note that because mod has special form, i.e. consists of 1096 # 0xffffffff, 1 and 0s, we can conditionally synthesize it by 1097 # assigning borrow bit to one register, %ebp, and its negative 1098 # to another, %esi. But we started by calculating %esi... 
1099 1100 &sub ("esi","edi"); 1101 &add (&DWP(0*4,"ebp"),"edi"); # add modulus or zero 1102 &adc (&DWP(1*4,"ebp"),"edi"); 1103 &adc (&DWP(2*4,"ebp"),"edi"); 1104 &adc (&DWP(3*4,"ebp"),0); 1105 &adc ("eax",0); 1106 &adc ("ebx",0); 1107 &mov (&DWP(4*4,"ebp"),"eax"); 1108 &adc ("ecx","esi"); 1109 &mov (&DWP(5*4,"ebp"),"ebx"); 1110 &adc ("edx","edi"); 1111 &mov (&DWP(6*4,"ebp"),"ecx"); 1112 &mov ("edi","ebp"); # fulfill contract 1113 &mov (&DWP(7*4,"ebp"),"edx"); 1114 1115 &add ("esp",10*4); 1116 &ret (); 1117&function_end_B("_ecp_nistz256_mul_mont"); 1118 1119######################################################################## 1120# void ecp_nistz256_scatter_w5(void *edi,const P256_POINT *esi, 1121# int ebp); 1122&function_begin("ecp_nistz256_scatter_w5"); 1123 &mov ("edi",&wparam(0)); 1124 &mov ("esi",&wparam(1)); 1125 &mov ("ebp",&wparam(2)); 1126 1127 &lea ("edi",&DWP(128-4,"edi","ebp",4)); 1128 &mov ("ebp",96/16); 1129&set_label("scatter_w5_loop"); 1130 &mov ("eax",&DWP(0,"esi")); 1131 &mov ("ebx",&DWP(4,"esi")); 1132 &mov ("ecx",&DWP(8,"esi")); 1133 &mov ("edx",&DWP(12,"esi")); 1134 &lea ("esi",&DWP(16,"esi")); 1135 &mov (&DWP(64*0-128,"edi"),"eax"); 1136 &mov (&DWP(64*1-128,"edi"),"ebx"); 1137 &mov (&DWP(64*2-128,"edi"),"ecx"); 1138 &mov (&DWP(64*3-128,"edi"),"edx"); 1139 &lea ("edi",&DWP(64*4,"edi")); 1140 &dec ("ebp"); 1141 &jnz (&label("scatter_w5_loop")); 1142&function_end("ecp_nistz256_scatter_w5"); 1143 1144######################################################################## 1145# void ecp_nistz256_gather_w5(P256_POINT *edi,const void *esi, 1146# int ebp); 1147&function_begin("ecp_nistz256_gather_w5"); 1148 &mov ("esi",&wparam(1)); 1149 &mov ("ebp",&wparam(2)); 1150 1151 &lea ("esi",&DWP(0,"esi","ebp",4)); 1152 &neg ("ebp"); 1153 &sar ("ebp",31); 1154 &mov ("edi",&wparam(0)); 1155 &lea ("esi",&DWP(0,"esi","ebp",4)); 1156 1157 for($i=0;$i<24;$i+=4) { 1158 &mov ("eax",&DWP(64*($i+0),"esi")); 1159 &mov ("ebx",&DWP(64*($i+1),"esi")); 1160 &mov 
("ecx",&DWP(64*($i+2),"esi")); 1161 &mov ("edx",&DWP(64*($i+3),"esi")); 1162 &and ("eax","ebp"); 1163 &and ("ebx","ebp"); 1164 &and ("ecx","ebp"); 1165 &and ("edx","ebp"); 1166 &mov (&DWP(4*($i+0),"edi"),"eax"); 1167 &mov (&DWP(4*($i+1),"edi"),"ebx"); 1168 &mov (&DWP(4*($i+2),"edi"),"ecx"); 1169 &mov (&DWP(4*($i+3),"edi"),"edx"); 1170 } 1171&function_end("ecp_nistz256_gather_w5"); 1172 1173######################################################################## 1174# void ecp_nistz256_scatter_w7(void *edi,const P256_POINT_AFFINE *esi, 1175# int ebp); 1176&function_begin("ecp_nistz256_scatter_w7"); 1177 &mov ("edi",&wparam(0)); 1178 &mov ("esi",&wparam(1)); 1179 &mov ("ebp",&wparam(2)); 1180 1181 &lea ("edi",&DWP(0,"edi","ebp")); 1182 &mov ("ebp",64/4); 1183&set_label("scatter_w7_loop"); 1184 &mov ("eax",&DWP(0,"esi")); 1185 &lea ("esi",&DWP(4,"esi")); 1186 &mov (&BP(64*0,"edi"),"al"); 1187 &mov (&BP(64*1,"edi"),"ah"); 1188 &shr ("eax",16); 1189 &mov (&BP(64*2,"edi"),"al"); 1190 &mov (&BP(64*3,"edi"),"ah"); 1191 &lea ("edi",&DWP(64*4,"edi")); 1192 &dec ("ebp"); 1193 &jnz (&label("scatter_w7_loop")); 1194&function_end("ecp_nistz256_scatter_w7"); 1195 1196######################################################################## 1197# void ecp_nistz256_gather_w7(P256_POINT_AFFINE *edi,const void *esi, 1198# int ebp); 1199&function_begin("ecp_nistz256_gather_w7"); 1200 &mov ("esi",&wparam(1)); 1201 &mov ("ebp",&wparam(2)); 1202 1203 &add ("esi","ebp"); 1204 &neg ("ebp"), 1205 &sar ("ebp",31); 1206 &mov ("edi",&wparam(0)); 1207 &lea ("esi",&DWP(0,"esi","ebp")); 1208 1209 for($i=0;$i<64;$i+=4) { 1210 &movz ("eax",&BP(64*($i+0),"esi")); 1211 &movz ("ebx",&BP(64*($i+1),"esi")); 1212 &movz ("ecx",&BP(64*($i+2),"esi")); 1213 &and ("eax","ebp"); 1214 &movz ("edx",&BP(64*($i+3),"esi")); 1215 &and ("ebx","ebp"); 1216 &mov (&BP($i+0,"edi"),"al"); 1217 &and ("ecx","ebp"); 1218 &mov (&BP($i+1,"edi"),"bl"); 1219 &and ("edx","ebp"); 1220 &mov (&BP($i+2,"edi"),"cl"); 1221 &mov 
(&BP($i+3,"edi"),"dl"); 1222 } 1223&function_end("ecp_nistz256_gather_w7"); 1224 1225######################################################################## 1226# following subroutines are "literal" implementation of those found in 1227# ecp_nistz256.c 1228# 1229######################################################################## 1230# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp); 1231# 1232&static_label("point_double_shortcut"); 1233&function_begin("ecp_nistz256_point_double"); 1234{ my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4)); 1235 1236 &mov ("esi",&wparam(1)); 1237 1238 # above map() describes stack layout with 5 temporary 1239 # 256-bit vectors on top, then we take extra word for 1240 # OPENSSL_ia32cap_P copy. 1241 &stack_push(8*5+1); 1242 if ($sse2) { 1243 &call ("_picup_eax"); 1244 &set_label("pic"); 1245 &picmeup("edx","OPENSSL_ia32cap_P","eax",&label("pic")); 1246 &mov ("ebp",&DWP(0,"edx")); } 1247 1248&set_label("point_double_shortcut"); 1249 &mov ("eax",&DWP(0,"esi")); # copy in_x 1250 &mov ("ebx",&DWP(4,"esi")); 1251 &mov ("ecx",&DWP(8,"esi")); 1252 &mov ("edx",&DWP(12,"esi")); 1253 &mov (&DWP($in_x+0,"esp"),"eax"); 1254 &mov (&DWP($in_x+4,"esp"),"ebx"); 1255 &mov (&DWP($in_x+8,"esp"),"ecx"); 1256 &mov (&DWP($in_x+12,"esp"),"edx"); 1257 &mov ("eax",&DWP(16,"esi")); 1258 &mov ("ebx",&DWP(20,"esi")); 1259 &mov ("ecx",&DWP(24,"esi")); 1260 &mov ("edx",&DWP(28,"esi")); 1261 &mov (&DWP($in_x+16,"esp"),"eax"); 1262 &mov (&DWP($in_x+20,"esp"),"ebx"); 1263 &mov (&DWP($in_x+24,"esp"),"ecx"); 1264 &mov (&DWP($in_x+28,"esp"),"edx"); 1265 &mov (&DWP(32*5,"esp"),"ebp"); # OPENSSL_ia32cap_P copy 1266 1267 &lea ("ebp",&DWP(32,"esi")); 1268 &lea ("esi",&DWP(32,"esi")); 1269 &lea ("edi",&DWP($S,"esp")); 1270 &call ("_ecp_nistz256_add"); # p256_mul_by_2(S, in_y); 1271 1272 &mov ("eax",&DWP(32*5,"esp")); # OPENSSL_ia32cap_P copy 1273 &mov ("esi",64); 1274 &add ("esi",&wparam(1)); 1275 &lea ("edi",&DWP($Zsqr,"esp")); 1276 &mov ("ebp","esi"); 
1277 &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(Zsqr, in_z); 1278 1279 &mov ("eax",&DWP(32*5,"esp")); # OPENSSL_ia32cap_P copy 1280 &lea ("esi",&DWP($S,"esp")); 1281 &lea ("ebp",&DWP($S,"esp")); 1282 &lea ("edi",&DWP($S,"esp")); 1283 &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(S, S); 1284 1285 &mov ("eax",&DWP(32*5,"esp")); # OPENSSL_ia32cap_P copy 1286 &mov ("ebp",&wparam(1)); 1287 &lea ("esi",&DWP(32,"ebp")); 1288 &lea ("ebp",&DWP(64,"ebp")); 1289 &lea ("edi",&DWP($tmp0,"esp")); 1290 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(tmp0, in_z, in_y); 1291 1292 &lea ("esi",&DWP($in_x,"esp")); 1293 &lea ("ebp",&DWP($Zsqr,"esp")); 1294 &lea ("edi",&DWP($M,"esp")); 1295 &call ("_ecp_nistz256_add"); # p256_add(M, in_x, Zsqr); 1296 1297 &mov ("edi",64); 1298 &lea ("esi",&DWP($tmp0,"esp")); 1299 &lea ("ebp",&DWP($tmp0,"esp")); 1300 &add ("edi",&wparam(0)); 1301 &call ("_ecp_nistz256_add"); # p256_mul_by_2(res_z, tmp0); 1302 1303 &lea ("esi",&DWP($in_x,"esp")); 1304 &lea ("ebp",&DWP($Zsqr,"esp")); 1305 &lea ("edi",&DWP($Zsqr,"esp")); 1306 &call ("_ecp_nistz256_sub"); # p256_sub(Zsqr, in_x, Zsqr); 1307 1308 &mov ("eax",&DWP(32*5,"esp")); # OPENSSL_ia32cap_P copy 1309 &lea ("esi",&DWP($S,"esp")); 1310 &lea ("ebp",&DWP($S,"esp")); 1311 &lea ("edi",&DWP($tmp0,"esp")); 1312 &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(tmp0, S); 1313 1314 &mov ("eax",&DWP(32*5,"esp")); # OPENSSL_ia32cap_P copy 1315 &lea ("esi",&DWP($M,"esp")); 1316 &lea ("ebp",&DWP($Zsqr,"esp")); 1317 &lea ("edi",&DWP($M,"esp")); 1318 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(M, M, Zsqr); 1319 1320 &mov ("edi",32); 1321 &lea ("esi",&DWP($tmp0,"esp")); 1322 &add ("edi",&wparam(0)); 1323 &call ("_ecp_nistz256_div_by_2"); # p256_div_by_2(res_y, tmp0); 1324 1325 &lea ("esi",&DWP($M,"esp")); 1326 &lea ("ebp",&DWP($M,"esp")); 1327 &lea ("edi",&DWP($tmp0,"esp")); 1328 &call ("_ecp_nistz256_add"); # 1/2 p256_mul_by_3(M, M); 1329 1330 &mov ("eax",&DWP(32*5,"esp")); # OPENSSL_ia32cap_P copy 
1331 &lea ("esi",&DWP($in_x,"esp")); 1332 &lea ("ebp",&DWP($S,"esp")); 1333 &lea ("edi",&DWP($S,"esp")); 1334 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S, S, in_x); 1335 1336 &lea ("esi",&DWP($tmp0,"esp")); 1337 &lea ("ebp",&DWP($M,"esp")); 1338 &lea ("edi",&DWP($M,"esp")); 1339 &call ("_ecp_nistz256_add"); # 2/2 p256_mul_by_3(M, M); 1340 1341 &lea ("esi",&DWP($S,"esp")); 1342 &lea ("ebp",&DWP($S,"esp")); 1343 &lea ("edi",&DWP($tmp0,"esp")); 1344 &call ("_ecp_nistz256_add"); # p256_mul_by_2(tmp0, S); 1345 1346 &mov ("eax",&DWP(32*5,"esp")); # OPENSSL_ia32cap_P copy 1347 &lea ("esi",&DWP($M,"esp")); 1348 &lea ("ebp",&DWP($M,"esp")); 1349 &mov ("edi",&wparam(0)); 1350 &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(res_x, M); 1351 1352 &mov ("esi","edi"); # %edi is still res_x here 1353 &lea ("ebp",&DWP($tmp0,"esp")); 1354 &call ("_ecp_nistz256_sub"); # p256_sub(res_x, res_x, tmp0); 1355 1356 &lea ("esi",&DWP($S,"esp")); 1357 &mov ("ebp","edi"); # %edi is still res_x 1358 &lea ("edi",&DWP($S,"esp")); 1359 &call ("_ecp_nistz256_sub"); # p256_sub(S, S, res_x); 1360 1361 &mov ("eax",&DWP(32*5,"esp")); # OPENSSL_ia32cap_P copy 1362 &mov ("esi","edi"); # %edi is still &S 1363 &lea ("ebp",&DWP($M,"esp")); 1364 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S, S, M); 1365 1366 &mov ("ebp",32); 1367 &lea ("esi",&DWP($S,"esp")); 1368 &add ("ebp",&wparam(0)); 1369 &mov ("edi","ebp"); 1370 &call ("_ecp_nistz256_sub"); # p256_sub(res_y, S, res_y); 1371 1372 &stack_pop(8*5+1); 1373} &function_end("ecp_nistz256_point_double"); 1374 1375######################################################################## 1376# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1, 1377# const P256_POINT *in2); 1378&function_begin("ecp_nistz256_point_add"); 1379{ my ($res_x,$res_y,$res_z, 1380 $in1_x,$in1_y,$in1_z, 1381 $in2_x,$in2_y,$in2_z, 1382 $H,$Hsqr,$R,$Rsqr,$Hcub, 1383 $U1,$U2,$S1,$S2)=map(32*$_,(0..17)); 1384 my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr); 1385 1386 &mov 
("esi",&wparam(2)); 1387 1388 # above map() describes stack layout with 18 temporary 1389 # 256-bit vectors on top, then we take extra words for 1390 # ~in1infty, ~in2infty, result of check for zero and 1391 # OPENSSL_ia32cap_P copy. [one unused word for padding] 1392 &stack_push(8*18+5); 1393 if ($sse2) { 1394 &call ("_picup_eax"); 1395 &set_label("pic"); 1396 &picmeup("edx","OPENSSL_ia32cap_P","eax",&label("pic")); 1397 &mov ("ebp",&DWP(0,"edx")); } 1398 1399 &lea ("edi",&DWP($in2_x,"esp")); 1400 for($i=0;$i<96;$i+=16) { 1401 &mov ("eax",&DWP($i+0,"esi")); # copy in2 1402 &mov ("ebx",&DWP($i+4,"esi")); 1403 &mov ("ecx",&DWP($i+8,"esi")); 1404 &mov ("edx",&DWP($i+12,"esi")); 1405 &mov (&DWP($i+0,"edi"),"eax"); 1406 &mov (&DWP(32*18+12,"esp"),"ebp") if ($i==0); 1407 &mov ("ebp","eax") if ($i==64); 1408 &or ("ebp","eax") if ($i>64); 1409 &mov (&DWP($i+4,"edi"),"ebx"); 1410 &or ("ebp","ebx") if ($i>=64); 1411 &mov (&DWP($i+8,"edi"),"ecx"); 1412 &or ("ebp","ecx") if ($i>=64); 1413 &mov (&DWP($i+12,"edi"),"edx"); 1414 &or ("ebp","edx") if ($i>=64); 1415 } 1416 &xor ("eax","eax"); 1417 &mov ("esi",&wparam(1)); 1418 &sub ("eax","ebp"); 1419 &or ("ebp","eax"); 1420 &sar ("ebp",31); 1421 &mov (&DWP(32*18+4,"esp"),"ebp"); # ~in2infty 1422 1423 &lea ("edi",&DWP($in1_x,"esp")); 1424 for($i=0;$i<96;$i+=16) { 1425 &mov ("eax",&DWP($i+0,"esi")); # copy in1 1426 &mov ("ebx",&DWP($i+4,"esi")); 1427 &mov ("ecx",&DWP($i+8,"esi")); 1428 &mov ("edx",&DWP($i+12,"esi")); 1429 &mov (&DWP($i+0,"edi"),"eax"); 1430 &mov ("ebp","eax") if ($i==64); 1431 &or ("ebp","eax") if ($i>64); 1432 &mov (&DWP($i+4,"edi"),"ebx"); 1433 &or ("ebp","ebx") if ($i>=64); 1434 &mov (&DWP($i+8,"edi"),"ecx"); 1435 &or ("ebp","ecx") if ($i>=64); 1436 &mov (&DWP($i+12,"edi"),"edx"); 1437 &or ("ebp","edx") if ($i>=64); 1438 } 1439 &xor ("eax","eax"); 1440 &sub ("eax","ebp"); 1441 &or ("ebp","eax"); 1442 &sar ("ebp",31); 1443 &mov (&DWP(32*18+0,"esp"),"ebp"); # ~in1infty 1444 1445 &mov ("eax",&DWP(32*18+12,"esp")); # 
OPENSSL_ia32cap_P copy 1446 &lea ("esi",&DWP($in2_z,"esp")); 1447 &lea ("ebp",&DWP($in2_z,"esp")); 1448 &lea ("edi",&DWP($Z2sqr,"esp")); 1449 &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(Z2sqr, in2_z); 1450 1451 &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy 1452 &lea ("esi",&DWP($in1_z,"esp")); 1453 &lea ("ebp",&DWP($in1_z,"esp")); 1454 &lea ("edi",&DWP($Z1sqr,"esp")); 1455 &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(Z1sqr, in1_z); 1456 1457 &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy 1458 &lea ("esi",&DWP($Z2sqr,"esp")); 1459 &lea ("ebp",&DWP($in2_z,"esp")); 1460 &lea ("edi",&DWP($S1,"esp")); 1461 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S1, Z2sqr, in2_z); 1462 1463 &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy 1464 &lea ("esi",&DWP($Z1sqr,"esp")); 1465 &lea ("ebp",&DWP($in1_z,"esp")); 1466 &lea ("edi",&DWP($S2,"esp")); 1467 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S2, Z1sqr, in1_z); 1468 1469 &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy 1470 &lea ("esi",&DWP($in1_y,"esp")); 1471 &lea ("ebp",&DWP($S1,"esp")); 1472 &lea ("edi",&DWP($S1,"esp")); 1473 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S1, S1, in1_y); 1474 1475 &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy 1476 &lea ("esi",&DWP($in2_y,"esp")); 1477 &lea ("ebp",&DWP($S2,"esp")); 1478 &lea ("edi",&DWP($S2,"esp")); 1479 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S2, S2, in2_y); 1480 1481 &lea ("esi",&DWP($S2,"esp")); 1482 &lea ("ebp",&DWP($S1,"esp")); 1483 &lea ("edi",&DWP($R,"esp")); 1484 &call ("_ecp_nistz256_sub"); # p256_sub(R, S2, S1); 1485 1486 &or ("ebx","eax"); # see if result is zero 1487 &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy 1488 &or ("ebx","ecx"); 1489 &or ("ebx","edx"); 1490 &or ("ebx",&DWP(0,"edi")); 1491 &or ("ebx",&DWP(4,"edi")); 1492 &lea ("esi",&DWP($in1_x,"esp")); 1493 &or ("ebx",&DWP(8,"edi")); 1494 &lea ("ebp",&DWP($Z2sqr,"esp")); 1495 &or 
("ebx",&DWP(12,"edi")); 1496 &lea ("edi",&DWP($U1,"esp")); 1497 &mov (&DWP(32*18+8,"esp"),"ebx"); 1498 1499 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(U1, in1_x, Z2sqr); 1500 1501 &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy 1502 &lea ("esi",&DWP($in2_x,"esp")); 1503 &lea ("ebp",&DWP($Z1sqr,"esp")); 1504 &lea ("edi",&DWP($U2,"esp")); 1505 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(U2, in2_x, Z1sqr); 1506 1507 &lea ("esi",&DWP($U2,"esp")); 1508 &lea ("ebp",&DWP($U1,"esp")); 1509 &lea ("edi",&DWP($H,"esp")); 1510 &call ("_ecp_nistz256_sub"); # p256_sub(H, U2, U1); 1511 1512 &or ("eax","ebx"); # see if result is zero 1513 &or ("eax","ecx"); 1514 &or ("eax","edx"); 1515 &or ("eax",&DWP(0,"edi")); 1516 &or ("eax",&DWP(4,"edi")); 1517 &or ("eax",&DWP(8,"edi")); 1518 &or ("eax",&DWP(12,"edi")); # ~is_equal(U1,U2) 1519 1520 &mov ("ebx",&DWP(32*18+0,"esp")); # ~in1infty 1521 ¬ ("ebx"); # -1/0 -> 0/-1 1522 &or ("eax","ebx"); 1523 &mov ("ebx",&DWP(32*18+4,"esp")); # ~in2infty 1524 ¬ ("ebx"); # -1/0 -> 0/-1 1525 &or ("eax","ebx"); 1526 &or ("eax",&DWP(32*18+8,"esp")); # ~is_equal(S1,S2) 1527 1528 # if (~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2)) 1529 &data_byte(0x3e); # predict taken 1530 &jnz (&label("add_proceed")); 1531 1532&set_label("add_double",16); 1533 &mov ("esi",&wparam(1)); 1534 &mov ("ebp",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy 1535 &add ("esp",4*((8*18+5)-(8*5+1))); # difference in frame sizes 1536 &jmp (&label("point_double_shortcut")); 1537 1538&set_label("add_proceed",16); 1539 &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy 1540 &lea ("esi",&DWP($R,"esp")); 1541 &lea ("ebp",&DWP($R,"esp")); 1542 &lea ("edi",&DWP($Rsqr,"esp")); 1543 &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(Rsqr, R); 1544 1545 &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy 1546 &lea ("esi",&DWP($H,"esp")); 1547 &lea ("ebp",&DWP($in1_z,"esp")); 1548 &lea ("edi",&DWP($res_z,"esp")); 1549 &call 
("_ecp_nistz256_mul_mont"); # p256_mul_mont(res_z, H, in1_z); 1550 1551 &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy 1552 &lea ("esi",&DWP($H,"esp")); 1553 &lea ("ebp",&DWP($H,"esp")); 1554 &lea ("edi",&DWP($Hsqr,"esp")); 1555 &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(Hsqr, H); 1556 1557 &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy 1558 &lea ("esi",&DWP($in2_z,"esp")); 1559 &lea ("ebp",&DWP($res_z,"esp")); 1560 &lea ("edi",&DWP($res_z,"esp")); 1561 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(res_z, res_z, in2_z); 1562 1563 &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy 1564 &lea ("esi",&DWP($Hsqr,"esp")); 1565 &lea ("ebp",&DWP($U1,"esp")); 1566 &lea ("edi",&DWP($U2,"esp")); 1567 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(U2, U1, Hsqr); 1568 1569 &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy 1570 &lea ("esi",&DWP($H,"esp")); 1571 &lea ("ebp",&DWP($Hsqr,"esp")); 1572 &lea ("edi",&DWP($Hcub,"esp")); 1573 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(Hcub, Hsqr, H); 1574 1575 &lea ("esi",&DWP($U2,"esp")); 1576 &lea ("ebp",&DWP($U2,"esp")); 1577 &lea ("edi",&DWP($Hsqr,"esp")); 1578 &call ("_ecp_nistz256_add"); # p256_mul_by_2(Hsqr, U2); 1579 1580 &lea ("esi",&DWP($Rsqr,"esp")); 1581 &lea ("ebp",&DWP($Hsqr,"esp")); 1582 &lea ("edi",&DWP($res_x,"esp")); 1583 &call ("_ecp_nistz256_sub"); # p256_sub(res_x, Rsqr, Hsqr); 1584 1585 &lea ("esi",&DWP($res_x,"esp")); 1586 &lea ("ebp",&DWP($Hcub,"esp")); 1587 &lea ("edi",&DWP($res_x,"esp")); 1588 &call ("_ecp_nistz256_sub"); # p256_sub(res_x, res_x, Hcub); 1589 1590 &lea ("esi",&DWP($U2,"esp")); 1591 &lea ("ebp",&DWP($res_x,"esp")); 1592 &lea ("edi",&DWP($res_y,"esp")); 1593 &call ("_ecp_nistz256_sub"); # p256_sub(res_y, U2, res_x); 1594 1595 &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy 1596 &lea ("esi",&DWP($Hcub,"esp")); 1597 &lea ("ebp",&DWP($S1,"esp")); 1598 &lea ("edi",&DWP($S2,"esp")); 1599 &call ("_ecp_nistz256_mul_mont"); # 
p256_mul_mont(S2, S1, Hcub); 1600 1601 &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy 1602 &lea ("esi",&DWP($R,"esp")); 1603 &lea ("ebp",&DWP($res_y,"esp")); 1604 &lea ("edi",&DWP($res_y,"esp")); 1605 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(res_y, R, res_y); 1606 1607 &lea ("esi",&DWP($res_y,"esp")); 1608 &lea ("ebp",&DWP($S2,"esp")); 1609 &lea ("edi",&DWP($res_y,"esp")); 1610 &call ("_ecp_nistz256_sub"); # p256_sub(res_y, res_y, S2); 1611 1612 &mov ("ebp",&DWP(32*18+0,"esp")); # ~in1infty 1613 &mov ("esi",&DWP(32*18+4,"esp")); # ~in2infty 1614 &mov ("edi",&wparam(0)); 1615 &mov ("edx","ebp"); 1616 ¬ ("ebp"); 1617 &and ("edx","esi"); # ~in1infty & ~in2infty 1618 &and ("ebp","esi"); # in1infty & ~in2infty 1619 ¬ ("esi"); # in2infty 1620 1621 ######################################## 1622 # conditional moves 1623 for($i=64;$i<96;$i+=4) { 1624 &mov ("eax","edx"); # ~in1infty & ~in2infty 1625 &and ("eax",&DWP($res_x+$i,"esp")); 1626 &mov ("ebx","ebp"); # in1infty & ~in2infty 1627 &and ("ebx",&DWP($in2_x+$i,"esp")); 1628 &mov ("ecx","esi"); # in2infty 1629 &and ("ecx",&DWP($in1_x+$i,"esp")); 1630 &or ("eax","ebx"); 1631 &or ("eax","ecx"); 1632 &mov (&DWP($i,"edi"),"eax"); 1633 } 1634 for($i=0;$i<64;$i+=4) { 1635 &mov ("eax","edx"); # ~in1infty & ~in2infty 1636 &and ("eax",&DWP($res_x+$i,"esp")); 1637 &mov ("ebx","ebp"); # in1infty & ~in2infty 1638 &and ("ebx",&DWP($in2_x+$i,"esp")); 1639 &mov ("ecx","esi"); # in2infty 1640 &and ("ecx",&DWP($in1_x+$i,"esp")); 1641 &or ("eax","ebx"); 1642 &or ("eax","ecx"); 1643 &mov (&DWP($i,"edi"),"eax"); 1644 } 1645 &set_label("add_done"); 1646 &stack_pop(8*18+5); 1647} &function_end("ecp_nistz256_point_add"); 1648 1649######################################################################## 1650# void ecp_nistz256_point_add_affine(P256_POINT *out, 1651# const P256_POINT *in1, 1652# const P256_POINT_AFFINE *in2); 1653&function_begin("ecp_nistz256_point_add_affine"); 1654{ 1655 my ($res_x,$res_y,$res_z, 1656 
$in1_x,$in1_y,$in1_z, 1657 $in2_x,$in2_y, 1658 $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..14)); 1659 my $Z1sqr = $S2; 1660 my @ONE_mont=(1,0,0,-1,-1,-1,-2,0); 1661 1662 &mov ("esi",&wparam(1)); 1663 1664 # above map() describes stack layout with 15 temporary 1665 # 256-bit vectors on top, then we take extra words for 1666 # ~in1infty, ~in2infty, and OPENSSL_ia32cap_P copy. 1667 &stack_push(8*15+3); 1668 if ($sse2) { 1669 &call ("_picup_eax"); 1670 &set_label("pic"); 1671 &picmeup("edx","OPENSSL_ia32cap_P","eax",&label("pic")); 1672 &mov ("ebp",&DWP(0,"edx")); } 1673 1674 &lea ("edi",&DWP($in1_x,"esp")); 1675 for($i=0;$i<96;$i+=16) { 1676 &mov ("eax",&DWP($i+0,"esi")); # copy in1 1677 &mov ("ebx",&DWP($i+4,"esi")); 1678 &mov ("ecx",&DWP($i+8,"esi")); 1679 &mov ("edx",&DWP($i+12,"esi")); 1680 &mov (&DWP($i+0,"edi"),"eax"); 1681 &mov (&DWP(32*15+8,"esp"),"ebp") if ($i==0); 1682 &mov ("ebp","eax") if ($i==64); 1683 &or ("ebp","eax") if ($i>64); 1684 &mov (&DWP($i+4,"edi"),"ebx"); 1685 &or ("ebp","ebx") if ($i>=64); 1686 &mov (&DWP($i+8,"edi"),"ecx"); 1687 &or ("ebp","ecx") if ($i>=64); 1688 &mov (&DWP($i+12,"edi"),"edx"); 1689 &or ("ebp","edx") if ($i>=64); 1690 } 1691 &xor ("eax","eax"); 1692 &mov ("esi",&wparam(2)); 1693 &sub ("eax","ebp"); 1694 &or ("ebp","eax"); 1695 &sar ("ebp",31); 1696 &mov (&DWP(32*15+0,"esp"),"ebp"); # ~in1infty 1697 1698 &lea ("edi",&DWP($in2_x,"esp")); 1699 for($i=0;$i<64;$i+=16) { 1700 &mov ("eax",&DWP($i+0,"esi")); # copy in2 1701 &mov ("ebx",&DWP($i+4,"esi")); 1702 &mov ("ecx",&DWP($i+8,"esi")); 1703 &mov ("edx",&DWP($i+12,"esi")); 1704 &mov (&DWP($i+0,"edi"),"eax"); 1705 &mov ("ebp","eax") if ($i==0); 1706 &or ("ebp","eax") if ($i!=0); 1707 &mov (&DWP($i+4,"edi"),"ebx"); 1708 &or ("ebp","ebx"); 1709 &mov (&DWP($i+8,"edi"),"ecx"); 1710 &or ("ebp","ecx"); 1711 &mov (&DWP($i+12,"edi"),"edx"); 1712 &or ("ebp","edx"); 1713 } 1714 &xor ("ebx","ebx"); 1715 &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy 1716 &sub ("ebx","ebp"); 
1717 &lea ("esi",&DWP($in1_z,"esp")); 1718 &or ("ebx","ebp"); 1719 &lea ("ebp",&DWP($in1_z,"esp")); 1720 &sar ("ebx",31); 1721 &lea ("edi",&DWP($Z1sqr,"esp")); 1722 &mov (&DWP(32*15+4,"esp"),"ebx"); # ~in2infty 1723 1724 &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(Z1sqr, in1_z); 1725 1726 &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy 1727 &lea ("esi",&DWP($in2_x,"esp")); 1728 &mov ("ebp","edi"); # %esi is stull &Z1sqr 1729 &lea ("edi",&DWP($U2,"esp")); 1730 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(U2, Z1sqr, in2_x); 1731 1732 &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy 1733 &lea ("esi",&DWP($in1_z,"esp")); 1734 &lea ("ebp",&DWP($Z1sqr,"esp")); 1735 &lea ("edi",&DWP($S2,"esp")); 1736 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S2, Z1sqr, in1_z); 1737 1738 &lea ("esi",&DWP($U2,"esp")); 1739 &lea ("ebp",&DWP($in1_x,"esp")); 1740 &lea ("edi",&DWP($H,"esp")); 1741 &call ("_ecp_nistz256_sub"); # p256_sub(H, U2, in1_x); 1742 1743 &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy 1744 &lea ("esi",&DWP($in2_y,"esp")); 1745 &lea ("ebp",&DWP($S2,"esp")); 1746 &lea ("edi",&DWP($S2,"esp")); 1747 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S2, S2, in2_y); 1748 1749 &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy 1750 &lea ("esi",&DWP($in1_z,"esp")); 1751 &lea ("ebp",&DWP($H,"esp")); 1752 &lea ("edi",&DWP($res_z,"esp")); 1753 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(res_z, H, in1_z); 1754 1755 &lea ("esi",&DWP($S2,"esp")); 1756 &lea ("ebp",&DWP($in1_y,"esp")); 1757 &lea ("edi",&DWP($R,"esp")); 1758 &call ("_ecp_nistz256_sub"); # p256_sub(R, S2, in1_y); 1759 1760 &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy 1761 &lea ("esi",&DWP($H,"esp")); 1762 &lea ("ebp",&DWP($H,"esp")); 1763 &lea ("edi",&DWP($Hsqr,"esp")); 1764 &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(Hsqr, H); 1765 1766 &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy 1767 &lea ("esi",&DWP($R,"esp")); 1768 &lea 
("ebp",&DWP($R,"esp")); 1769 &lea ("edi",&DWP($Rsqr,"esp")); 1770 &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(Rsqr, R); 1771 1772 &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy 1773 &lea ("esi",&DWP($in1_x,"esp")); 1774 &lea ("ebp",&DWP($Hsqr,"esp")); 1775 &lea ("edi",&DWP($U2,"esp")); 1776 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(U2, in1_x, Hsqr); 1777 1778 &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy 1779 &lea ("esi",&DWP($H,"esp")); 1780 &lea ("ebp",&DWP($Hsqr,"esp")); 1781 &lea ("edi",&DWP($Hcub,"esp")); 1782 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(Hcub, Hsqr, H); 1783 1784 &lea ("esi",&DWP($U2,"esp")); 1785 &lea ("ebp",&DWP($U2,"esp")); 1786 &lea ("edi",&DWP($Hsqr,"esp")); 1787 &call ("_ecp_nistz256_add"); # p256_mul_by_2(Hsqr, U2); 1788 1789 &lea ("esi",&DWP($Rsqr,"esp")); 1790 &lea ("ebp",&DWP($Hsqr,"esp")); 1791 &lea ("edi",&DWP($res_x,"esp")); 1792 &call ("_ecp_nistz256_sub"); # p256_sub(res_x, Rsqr, Hsqr); 1793 1794 &lea ("esi",&DWP($res_x,"esp")); 1795 &lea ("ebp",&DWP($Hcub,"esp")); 1796 &lea ("edi",&DWP($res_x,"esp")); 1797 &call ("_ecp_nistz256_sub"); # p256_sub(res_x, res_x, Hcub); 1798 1799 &lea ("esi",&DWP($U2,"esp")); 1800 &lea ("ebp",&DWP($res_x,"esp")); 1801 &lea ("edi",&DWP($res_y,"esp")); 1802 &call ("_ecp_nistz256_sub"); # p256_sub(res_y, U2, res_x); 1803 1804 &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy 1805 &lea ("esi",&DWP($Hcub,"esp")); 1806 &lea ("ebp",&DWP($in1_y,"esp")); 1807 &lea ("edi",&DWP($S2,"esp")); 1808 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S2, Hcub, in1_y); 1809 1810 &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy 1811 &lea ("esi",&DWP($R,"esp")); 1812 &lea ("ebp",&DWP($res_y,"esp")); 1813 &lea ("edi",&DWP($res_y,"esp")); 1814 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(res_y, res_y, R); 1815 1816 &lea ("esi",&DWP($res_y,"esp")); 1817 &lea ("ebp",&DWP($S2,"esp")); 1818 &lea ("edi",&DWP($res_y,"esp")); 1819 &call ("_ecp_nistz256_sub"); 
# p256_sub(res_y, res_y, S2); 1820 1821 &mov ("ebp",&DWP(32*15+0,"esp")); # ~in1infty 1822 &mov ("esi",&DWP(32*15+4,"esp")); # ~in2infty 1823 &mov ("edi",&wparam(0)); 1824 &mov ("edx","ebp"); 1825 ¬ ("ebp"); 1826 &and ("edx","esi"); # ~in1infty & ~in2infty 1827 &and ("ebp","esi"); # in1infty & ~in2infty 1828 ¬ ("esi"); # in2infty 1829 1830 ######################################## 1831 # conditional moves 1832 for($i=64;$i<96;$i+=4) { 1833 my $one=@ONE_mont[($i-64)/4]; 1834 1835 &mov ("eax","edx"); 1836 &and ("eax",&DWP($res_x+$i,"esp")); 1837 &mov ("ebx","ebp") if ($one && $one!=-1); 1838 &and ("ebx",$one) if ($one && $one!=-1); 1839 &mov ("ecx","esi"); 1840 &and ("ecx",&DWP($in1_x+$i,"esp")); 1841 &or ("eax",$one==-1?"ebp":"ebx") if ($one); 1842 &or ("eax","ecx"); 1843 &mov (&DWP($i,"edi"),"eax"); 1844 } 1845 for($i=0;$i<64;$i+=4) { 1846 &mov ("eax","edx"); # ~in1infty & ~in2infty 1847 &and ("eax",&DWP($res_x+$i,"esp")); 1848 &mov ("ebx","ebp"); # in1infty & ~in2infty 1849 &and ("ebx",&DWP($in2_x+$i,"esp")); 1850 &mov ("ecx","esi"); # in2infty 1851 &and ("ecx",&DWP($in1_x+$i,"esp")); 1852 &or ("eax","ebx"); 1853 &or ("eax","ecx"); 1854 &mov (&DWP($i,"edi"),"eax"); 1855 } 1856 &stack_pop(8*15+3); 1857} &function_end("ecp_nistz256_point_add_affine"); 1858 1859&asm_finish(); 1860 1861close STDOUT or die "error closing STDOUT: $!"; 1862