#! /usr/bin/env perl
# Copyright 2015-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# ECP_NISTZ256 module for ARMv4.
#
# October 2014.
#
# Original ECP_NISTZ256 submission targeting x86_64 is detailed in
# http://eprint.iacr.org/2013/816. In the process of adaptation
# original .c module was made 32-bit savvy in order to make this
# implementation possible.
#
#			with/without -DECP_NISTZ256_ASM
# Cortex-A8		+53-170%
# Cortex-A9		+76-205%
# Cortex-A15		+100-316%
# Snapdragon S4		+66-187%
#
# Ranges denote minimum and maximum improvement coefficients depending
# on benchmark. Lower coefficients are for ECDSA sign, server-side
# operation. Keep in mind that +200% means 3x improvement.

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour \"$output\""
        or die "can't call $xlate: $!";
} else {
    $output and open STDOUT,">$output";
}

$code.=<<___;
#include "arm_arch.h"

#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif
___
########################################################################
# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
#
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
open TABLE,"<ecp_nistz256_table.c"		or
open TABLE,"<${dir}../ecp_nistz256_table.c"	or
die "failed to open ecp_nistz256_table.c:",$!;

use integer;

foreach(<TABLE>) {
	s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo;
}
close TABLE;

# See ecp_nistz256_table.c for explanation for why it's 64*16*37.
# 64*16*37-1 is because $#arr returns last valid index of @arr, not
# amount of elements.
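# (Illustrative recap, not taken from the table file itself: each
# P256_POINT_AFFINE is 2x256 bits = 16 32-bit words, a window holds 64
# points and there are 37 windows, hence 64*16*37 = 37888 words are
# expected in @arr.)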
die "insane number of elements" if ($#arr != 64*16*37-1);

$code.=<<___;
.rodata
.globl	ecp_nistz256_precomputed
.type	ecp_nistz256_precomputed,%object
.align	12
ecp_nistz256_precomputed:
___
########################################################################
# this conversion smashes P256_POINT_AFFINE by individual bytes with
# 64 byte interval, similar to
#	1111222233334444
#	1234123412341234
for(1..37) {
	@tbl = splice(@arr,0,64*16);
	for($i=0;$i<64;$i++) {
		undef @line;
		for($j=0;$j<64;$j++) {
			push @line,(@tbl[$j*16+$i/4]>>(($i%4)*8))&0xff;
		}
		$code.=".byte\t";
		$code.=join(',',map { sprintf "0x%02x",$_} @line);
		$code.="\n";
	}
}
$code.=<<___;
.size	ecp_nistz256_precomputed,.-ecp_nistz256_precomputed

.text
.align	5
.LRR:	@ 2^512 mod P precomputed for NIST P256 polynomial
.long	0x00000003, 0x00000000, 0xffffffff, 0xfffffffb
.long	0xfffffffe, 0xffffffff, 0xfffffffd, 0x00000004
.Lone:
.long	1,0,0,0,0,0,0,0
.asciz	"ECP_NISTZ256 for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
.align	6
___

########################################################################
# common register layout, note that $t2 is link register, so that if
# internal subroutine uses $t2, then it has to offload lr...

($r_ptr,$a_ptr,$b_ptr,$ff,$a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7,$t1,$t2)=
		map("r$_",(0..12,14));
($t0,$t3)=($ff,$a_ptr);

$code.=<<___;
@ void	ecp_nistz256_to_mont(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl	ecp_nistz256_to_mont
.type	ecp_nistz256_to_mont,%function
ecp_nistz256_to_mont:
	adr	$b_ptr,.LRR
	b	.Lecp_nistz256_mul_mont
.size	ecp_nistz256_to_mont,.-ecp_nistz256_to_mont

@ void	ecp_nistz256_from_mont(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl	ecp_nistz256_from_mont
.type	ecp_nistz256_from_mont,%function
ecp_nistz256_from_mont:
	adr	$b_ptr,.Lone
	b	.Lecp_nistz256_mul_mont
.size	ecp_nistz256_from_mont,.-ecp_nistz256_from_mont

@ void	ecp_nistz256_mul_by_2(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl	ecp_nistz256_mul_by_2
.type	ecp_nistz256_mul_by_2,%function
.align	4
ecp_nistz256_mul_by_2:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_mul_by_2
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2

.type	__ecp_nistz256_mul_by_2,%function
.align	4
__ecp_nistz256_mul_by_2:
	ldr	$a0,[$a_ptr,#0]
	ldr	$a1,[$a_ptr,#4]
	ldr	$a2,[$a_ptr,#8]
	adds	$a0,$a0,$a0		@ a[0:7]+=a[0:7], i.e. add with itself
	ldr	$a3,[$a_ptr,#12]
	adcs	$a1,$a1,$a1
	ldr	$a4,[$a_ptr,#16]
	adcs	$a2,$a2,$a2
	ldr	$a5,[$a_ptr,#20]
	adcs	$a3,$a3,$a3
	ldr	$a6,[$a_ptr,#24]
	adcs	$a4,$a4,$a4
	ldr	$a7,[$a_ptr,#28]
	adcs	$a5,$a5,$a5
	adcs	$a6,$a6,$a6
	mov	$ff,#0
	adcs	$a7,$a7,$a7
	adc	$ff,$ff,#0

	b	.Lreduce_by_sub
.size	__ecp_nistz256_mul_by_2,.-__ecp_nistz256_mul_by_2

@ void	ecp_nistz256_add(BN_ULONG r0[8],const BN_ULONG r1[8],
@					const BN_ULONG r2[8]);
.globl	ecp_nistz256_add
.type	ecp_nistz256_add,%function
.align	4
ecp_nistz256_add:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_add
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_add,.-ecp_nistz256_add

.type	__ecp_nistz256_add,%function
.align	4
__ecp_nistz256_add:
	str	lr,[sp,#-4]!		@ push lr

	ldr	$a0,[$a_ptr,#0]
	ldr	$a1,[$a_ptr,#4]
	ldr	$a2,[$a_ptr,#8]
	ldr	$a3,[$a_ptr,#12]
	ldr	$a4,[$a_ptr,#16]
	ldr	$t0,[$b_ptr,#0]
	ldr	$a5,[$a_ptr,#20]
	ldr	$t1,[$b_ptr,#4]
	ldr	$a6,[$a_ptr,#24]
	ldr	$t2,[$b_ptr,#8]
	ldr	$a7,[$a_ptr,#28]
	ldr	$t3,[$b_ptr,#12]
	adds	$a0,$a0,$t0
	ldr	$t0,[$b_ptr,#16]
	adcs	$a1,$a1,$t1
	ldr	$t1,[$b_ptr,#20]
	adcs	$a2,$a2,$t2
	ldr	$t2,[$b_ptr,#24]
	adcs	$a3,$a3,$t3
	ldr	$t3,[$b_ptr,#28]
	adcs	$a4,$a4,$t0
	adcs	$a5,$a5,$t1
	adcs	$a6,$a6,$t2
	mov	$ff,#0
	adcs	$a7,$a7,$t3
	adc	$ff,$ff,#0
	ldr	lr,[sp],#4		@ pop lr

.Lreduce_by_sub:

	@ if a+b >= modulus, subtract modulus.
	@
	@ But since comparison implies subtraction, we subtract
	@ modulus and then add it back if subtraction borrowed.

	subs	$a0,$a0,#-1
	sbcs	$a1,$a1,#-1
	sbcs	$a2,$a2,#-1
	sbcs	$a3,$a3,#0
	sbcs	$a4,$a4,#0
	sbcs	$a5,$a5,#0
	sbcs	$a6,$a6,#1
	sbcs	$a7,$a7,#-1
	sbc	$ff,$ff,#0

	@ Note that because mod has special form, i.e. consists of
	@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
	@ using value of borrow as a whole or extracting single bit.
	@ Follow $ff register...

	adds	$a0,$a0,$ff		@ add synthesized modulus
	adcs	$a1,$a1,$ff
	str	$a0,[$r_ptr,#0]
	adcs	$a2,$a2,$ff
	str	$a1,[$r_ptr,#4]
	adcs	$a3,$a3,#0
	str	$a2,[$r_ptr,#8]
	adcs	$a4,$a4,#0
	str	$a3,[$r_ptr,#12]
	adcs	$a5,$a5,#0
	str	$a4,[$r_ptr,#16]
	adcs	$a6,$a6,$ff,lsr#31
	str	$a5,[$r_ptr,#20]
	adcs	$a7,$a7,$ff
	str	$a6,[$r_ptr,#24]
	str	$a7,[$r_ptr,#28]

	mov	pc,lr
.size	__ecp_nistz256_add,.-__ecp_nistz256_add

@ void	ecp_nistz256_mul_by_3(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl	ecp_nistz256_mul_by_3
.type	ecp_nistz256_mul_by_3,%function
.align	4
ecp_nistz256_mul_by_3:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_mul_by_3
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3

.type	__ecp_nistz256_mul_by_3,%function
.align	4
__ecp_nistz256_mul_by_3:
	str	lr,[sp,#-4]!		@ push lr

	@ As multiplication by 3 is performed as 2*n+n, below are inline
	@ copies of __ecp_nistz256_mul_by_2 and __ecp_nistz256_add, see
	@ corresponding subroutines for details.
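	@ (Illustrative outline, comment only: a[0:7] is first doubled
	@ and folded back under the modulus, then the original a[0:7] is
	@ re-loaded and added on top, with the shared .Lreduce_by_sub
	@ tail performing the final fold and the stores.)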

	ldr	$a0,[$a_ptr,#0]
	ldr	$a1,[$a_ptr,#4]
	ldr	$a2,[$a_ptr,#8]
	adds	$a0,$a0,$a0		@ a[0:7]+=a[0:7]
	ldr	$a3,[$a_ptr,#12]
	adcs	$a1,$a1,$a1
	ldr	$a4,[$a_ptr,#16]
	adcs	$a2,$a2,$a2
	ldr	$a5,[$a_ptr,#20]
	adcs	$a3,$a3,$a3
	ldr	$a6,[$a_ptr,#24]
	adcs	$a4,$a4,$a4
	ldr	$a7,[$a_ptr,#28]
	adcs	$a5,$a5,$a5
	adcs	$a6,$a6,$a6
	mov	$ff,#0
	adcs	$a7,$a7,$a7
	adc	$ff,$ff,#0

	subs	$a0,$a0,#-1		@ .Lreduce_by_sub but without stores
	sbcs	$a1,$a1,#-1
	sbcs	$a2,$a2,#-1
	sbcs	$a3,$a3,#0
	sbcs	$a4,$a4,#0
	sbcs	$a5,$a5,#0
	sbcs	$a6,$a6,#1
	sbcs	$a7,$a7,#-1
	sbc	$ff,$ff,#0

	adds	$a0,$a0,$ff		@ add synthesized modulus
	adcs	$a1,$a1,$ff
	adcs	$a2,$a2,$ff
	adcs	$a3,$a3,#0
	adcs	$a4,$a4,#0
	ldr	$b_ptr,[$a_ptr,#0]
	adcs	$a5,$a5,#0
	ldr	$t1,[$a_ptr,#4]
	adcs	$a6,$a6,$ff,lsr#31
	ldr	$t2,[$a_ptr,#8]
	adc	$a7,$a7,$ff

	ldr	$t0,[$a_ptr,#12]
	adds	$a0,$a0,$b_ptr		@ 2*a[0:7]+=a[0:7]
	ldr	$b_ptr,[$a_ptr,#16]
	adcs	$a1,$a1,$t1
	ldr	$t1,[$a_ptr,#20]
	adcs	$a2,$a2,$t2
	ldr	$t2,[$a_ptr,#24]
	adcs	$a3,$a3,$t0
	ldr	$t3,[$a_ptr,#28]
	adcs	$a4,$a4,$b_ptr
	adcs	$a5,$a5,$t1
	adcs	$a6,$a6,$t2
	mov	$ff,#0
	adcs	$a7,$a7,$t3
	adc	$ff,$ff,#0
	ldr	lr,[sp],#4		@ pop lr

	b	.Lreduce_by_sub
.size	__ecp_nistz256_mul_by_3,.-__ecp_nistz256_mul_by_3

@ void	ecp_nistz256_div_by_2(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl	ecp_nistz256_div_by_2
.type	ecp_nistz256_div_by_2,%function
.align	4
ecp_nistz256_div_by_2:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_div_by_2
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2

.type	__ecp_nistz256_div_by_2,%function
.align	4
__ecp_nistz256_div_by_2:
	@ ret = (a is odd ? a+mod : a) >> 1

	ldr	$a0,[$a_ptr,#0]
	ldr	$a1,[$a_ptr,#4]
	ldr	$a2,[$a_ptr,#8]
	mov	$ff,$a0,lsl#31		@ place least significant bit to most
					@ significant position, now arithmetic
					@ right shift by 31 will produce -1 or
					@ 0, while logical right shift 1 or 0,
					@ this is how modulus is conditionally
					@ synthesized in this case...
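					@ (Illustrative example: for odd a,
					@ $ff = a[0]<<31 = 0x80000000, so
					@ $ff,asr#31 = 0xffffffff and
					@ $ff,lsr#31 = 1, exactly the
					@ 0xffffffff and 0x00000001 words
					@ of the modulus; for even a both
					@ shifts give 0 and nothing is added.)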
	ldr	$a3,[$a_ptr,#12]
	adds	$a0,$a0,$ff,asr#31
	ldr	$a4,[$a_ptr,#16]
	adcs	$a1,$a1,$ff,asr#31
	ldr	$a5,[$a_ptr,#20]
	adcs	$a2,$a2,$ff,asr#31
	ldr	$a6,[$a_ptr,#24]
	adcs	$a3,$a3,#0
	ldr	$a7,[$a_ptr,#28]
	adcs	$a4,$a4,#0
	mov	$a0,$a0,lsr#1		@ a[0:7]>>=1, we can start early
					@ because it doesn't affect flags
	adcs	$a5,$a5,#0
	orr	$a0,$a0,$a1,lsl#31
	adcs	$a6,$a6,$ff,lsr#31
	mov	$b_ptr,#0
	adcs	$a7,$a7,$ff,asr#31
	mov	$a1,$a1,lsr#1
	adc	$b_ptr,$b_ptr,#0	@ top-most carry bit from addition

	orr	$a1,$a1,$a2,lsl#31
	mov	$a2,$a2,lsr#1
	str	$a0,[$r_ptr,#0]
	orr	$a2,$a2,$a3,lsl#31
	mov	$a3,$a3,lsr#1
	str	$a1,[$r_ptr,#4]
	orr	$a3,$a3,$a4,lsl#31
	mov	$a4,$a4,lsr#1
	str	$a2,[$r_ptr,#8]
	orr	$a4,$a4,$a5,lsl#31
	mov	$a5,$a5,lsr#1
	str	$a3,[$r_ptr,#12]
	orr	$a5,$a5,$a6,lsl#31
	mov	$a6,$a6,lsr#1
	str	$a4,[$r_ptr,#16]
	orr	$a6,$a6,$a7,lsl#31
	mov	$a7,$a7,lsr#1
	str	$a5,[$r_ptr,#20]
	orr	$a7,$a7,$b_ptr,lsl#31	@ don't forget the top-most carry bit
	str	$a6,[$r_ptr,#24]
	str	$a7,[$r_ptr,#28]

	mov	pc,lr
.size	__ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2

@ void	ecp_nistz256_sub(BN_ULONG r0[8],const BN_ULONG r1[8],
@					const BN_ULONG r2[8]);
.globl	ecp_nistz256_sub
.type	ecp_nistz256_sub,%function
.align	4
ecp_nistz256_sub:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_sub
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_sub,.-ecp_nistz256_sub

.type	__ecp_nistz256_sub,%function
.align	4
__ecp_nistz256_sub:
	str	lr,[sp,#-4]!		@ push lr

	ldr	$a0,[$a_ptr,#0]
	ldr	$a1,[$a_ptr,#4]
	ldr	$a2,[$a_ptr,#8]
	ldr	$a3,[$a_ptr,#12]
	ldr	$a4,[$a_ptr,#16]
	ldr	$t0,[$b_ptr,#0]
	ldr	$a5,[$a_ptr,#20]
	ldr	$t1,[$b_ptr,#4]
	ldr	$a6,[$a_ptr,#24]
	ldr	$t2,[$b_ptr,#8]
	ldr	$a7,[$a_ptr,#28]
	ldr	$t3,[$b_ptr,#12]
	subs	$a0,$a0,$t0
	ldr	$t0,[$b_ptr,#16]
	sbcs	$a1,$a1,$t1
	ldr	$t1,[$b_ptr,#20]
	sbcs	$a2,$a2,$t2
	ldr	$t2,[$b_ptr,#24]
	sbcs	$a3,$a3,$t3
	ldr	$t3,[$b_ptr,#28]
	sbcs	$a4,$a4,$t0
	sbcs	$a5,$a5,$t1
	sbcs	$a6,$a6,$t2
	sbcs	$a7,$a7,$t3
	sbc	$ff,$ff,$ff		@ broadcast borrow bit
	ldr	lr,[sp],#4		@ pop lr

.Lreduce_by_add:

	@ if a-b borrows, add modulus.
	@
	@ Note that because mod has special form, i.e. consists of
	@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
	@ broadcasting borrow bit to a register, $ff, and using it as
	@ a whole or extracting single bit.
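	@ (Illustrative example: on borrow $ff is 0xffffffff and the words
	@ added below are $ff,$ff,$ff,0,0,0,$ff>>31,$ff, i.e. exactly
	@ ffffffff.ffffffff.ffffffff.0.0.0.1.ffffffff, the modulus in
	@ little-endian word order; without borrow $ff is 0 and the
	@ additions change nothing.)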

	adds	$a0,$a0,$ff		@ add synthesized modulus
	adcs	$a1,$a1,$ff
	str	$a0,[$r_ptr,#0]
	adcs	$a2,$a2,$ff
	str	$a1,[$r_ptr,#4]
	adcs	$a3,$a3,#0
	str	$a2,[$r_ptr,#8]
	adcs	$a4,$a4,#0
	str	$a3,[$r_ptr,#12]
	adcs	$a5,$a5,#0
	str	$a4,[$r_ptr,#16]
	adcs	$a6,$a6,$ff,lsr#31
	str	$a5,[$r_ptr,#20]
	adcs	$a7,$a7,$ff
	str	$a6,[$r_ptr,#24]
	str	$a7,[$r_ptr,#28]

	mov	pc,lr
.size	__ecp_nistz256_sub,.-__ecp_nistz256_sub

@ void	ecp_nistz256_neg(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl	ecp_nistz256_neg
.type	ecp_nistz256_neg,%function
.align	4
ecp_nistz256_neg:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_neg
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_neg,.-ecp_nistz256_neg

.type	__ecp_nistz256_neg,%function
.align	4
__ecp_nistz256_neg:
	ldr	$a0,[$a_ptr,#0]
	eor	$ff,$ff,$ff
	ldr	$a1,[$a_ptr,#4]
	ldr	$a2,[$a_ptr,#8]
	subs	$a0,$ff,$a0
	ldr	$a3,[$a_ptr,#12]
	sbcs	$a1,$ff,$a1
	ldr	$a4,[$a_ptr,#16]
	sbcs	$a2,$ff,$a2
	ldr	$a5,[$a_ptr,#20]
	sbcs	$a3,$ff,$a3
	ldr	$a6,[$a_ptr,#24]
	sbcs	$a4,$ff,$a4
	ldr	$a7,[$a_ptr,#28]
	sbcs	$a5,$ff,$a5
	sbcs	$a6,$ff,$a6
	sbcs	$a7,$ff,$a7
	sbc	$ff,$ff,$ff

	b	.Lreduce_by_add
.size	__ecp_nistz256_neg,.-__ecp_nistz256_neg
___
{
my @acc=map("r$_",(3..11));
my ($t0,$t1,$bj,$t2,$t3)=map("r$_",(0,1,2,12,14));

$code.=<<___;
@ void	ecp_nistz256_sqr_mont(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl	ecp_nistz256_sqr_mont
.type	ecp_nistz256_sqr_mont,%function
.align	4
ecp_nistz256_sqr_mont:
	mov	$b_ptr,$a_ptr
	b	.Lecp_nistz256_mul_mont
.size	ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont

@ void	ecp_nistz256_mul_mont(BN_ULONG r0[8],const BN_ULONG r1[8],
@					     const BN_ULONG r2[8]);
.globl	ecp_nistz256_mul_mont
.type	ecp_nistz256_mul_mont,%function
.align	4
ecp_nistz256_mul_mont:
.Lecp_nistz256_mul_mont:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_mul_mont
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont

.type	__ecp_nistz256_mul_mont,%function
.align	4
__ecp_nistz256_mul_mont:
	stmdb	sp!,{r0-r2,lr}		@ make a copy of arguments too

	ldr	$bj,[$b_ptr,#0]		@ b[0]
	ldmia	$a_ptr,{@acc[1]-@acc[8]}

	umull	@acc[0],$t3,@acc[1],$bj	@ r[0]=a[0]*b[0]
	stmdb	sp!,{$acc[1]-@acc[8]}	@ copy a[0-7] to stack, so
					@ that it can be addressed
					@ without spending register
					@ on address
	umull	@acc[1],$t0,@acc[2],$bj	@ r[1]=a[1]*b[0]
	umull	@acc[2],$t1,@acc[3],$bj
	adds	@acc[1],@acc[1],$t3	@ accumulate high part of mult
	umull	@acc[3],$t2,@acc[4],$bj
	adcs	@acc[2],@acc[2],$t0
	umull	@acc[4],$t3,@acc[5],$bj
	adcs	@acc[3],@acc[3],$t1
	umull	@acc[5],$t0,@acc[6],$bj
	adcs	@acc[4],@acc[4],$t2
	umull	@acc[6],$t1,@acc[7],$bj
	adcs	@acc[5],@acc[5],$t3
	umull	@acc[7],$t2,@acc[8],$bj
	adcs	@acc[6],@acc[6],$t0
	adcs	@acc[7],@acc[7],$t1
	eor	$t3,$t3,$t3		@ first overflow bit is zero
	adc	@acc[8],$t2,#0
___
for(my $i=1;$i<8;$i++) {
my $t4=@acc[0];

	# Reduction iteration is normally performed by accumulating
	# result of multiplication of modulus by "magic" digit [and
	# omitting least significant word, which is guaranteed to
	# be 0], but thanks to special form of modulus and "magic"
	# digit being equal to least significant word, it can be
	# performed with additions and subtractions alone. Indeed:
	#
	#        ffff.0001.0000.0000.0000.ffff.ffff.ffff
	# *                                         abcd
	# + xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd
	#
	# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
	# rewrite above as:
	#
	#   xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd
	# + abcd.0000.abcd.0000.0000.abcd.0000.0000.0000
	# - abcd.0000.0000.0000.0000.0000.0000.abcd
	#
	# or marking redundant operations:
	#
	#   xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.----
	# + abcd.0000.abcd.0000.0000.abcd.----.----.----
	# - abcd.----.----.----.----.----.----.----

$code.=<<___;
	@ multiplication-less reduction $i
	adds	@acc[3],@acc[3],@acc[0]	@ r[3]+=r[0]
	ldr	$bj,[sp,#40]		@ restore b_ptr
	adcs	@acc[4],@acc[4],#0	@ r[4]+=0
	adcs	@acc[5],@acc[5],#0	@ r[5]+=0
	adcs	@acc[6],@acc[6],@acc[0]	@ r[6]+=r[0]
	ldr	$t1,[sp,#0]		@ load a[0]
	adcs	@acc[7],@acc[7],#0	@ r[7]+=0
	ldr	$bj,[$bj,#4*$i]		@ load b[i]
	adcs	@acc[8],@acc[8],@acc[0]	@ r[8]+=r[0]
	eor	$t0,$t0,$t0
	adc	$t3,$t3,#0		@ overflow bit
	subs	@acc[7],@acc[7],@acc[0]	@ r[7]-=r[0]
	ldr	$t2,[sp,#4]		@ a[1]
	sbcs	@acc[8],@acc[8],#0	@ r[8]-=0
	umlal	@acc[1],$t0,$t1,$bj	@ "r[0]"+=a[0]*b[i]
	eor	$t1,$t1,$t1
	sbc	@acc[0],$t3,#0		@ overflow bit, keep in mind
					@ that netto result is
					@ addition of a value which
					@ makes underflow impossible

	ldr	$t3,[sp,#8]		@ a[2]
	umlal	@acc[2],$t1,$t2,$bj	@ "r[1]"+=a[1]*b[i]
	str	@acc[0],[sp,#36]	@ temporarily offload overflow
	eor	$t2,$t2,$t2
	ldr	$t4,[sp,#12]		@ a[3], $t4 is alias @acc[0]
	umlal	@acc[3],$t2,$t3,$bj	@ "r[2]"+=a[2]*b[i]
	eor	$t3,$t3,$t3
	adds	@acc[2],@acc[2],$t0	@ accumulate high part of mult
	ldr	$t0,[sp,#16]		@ a[4]
	umlal	@acc[4],$t3,$t4,$bj	@ "r[3]"+=a[3]*b[i]
	eor	$t4,$t4,$t4
	adcs	@acc[3],@acc[3],$t1
	ldr	$t1,[sp,#20]		@ a[5]
	umlal	@acc[5],$t4,$t0,$bj	@ "r[4]"+=a[4]*b[i]
	eor	$t0,$t0,$t0
	adcs	@acc[4],@acc[4],$t2
	ldr	$t2,[sp,#24]		@ a[6]
	umlal	@acc[6],$t0,$t1,$bj	@ "r[5]"+=a[5]*b[i]
	eor	$t1,$t1,$t1
	adcs	@acc[5],@acc[5],$t3
	ldr	$t3,[sp,#28]		@ a[7]
	umlal	@acc[7],$t1,$t2,$bj	@ "r[6]"+=a[6]*b[i]
	eor	$t2,$t2,$t2
	adcs	@acc[6],@acc[6],$t4
	ldr	@acc[0],[sp,#36]	@ restore overflow bit
	umlal	@acc[8],$t2,$t3,$bj	@ "r[7]"+=a[7]*b[i]
	eor	$t3,$t3,$t3
	adcs	@acc[7],@acc[7],$t0
	adcs	@acc[8],@acc[8],$t1
	adcs	@acc[0],$acc[0],$t2
	adc	$t3,$t3,#0		@ new overflow bit
___
	push(@acc,shift(@acc));		# rotate registers, so that
					# "r[i]" becomes r[i]
}
$code.=<<___;
	@ last multiplication-less reduction
	adds	@acc[3],@acc[3],@acc[0]
	ldr	$r_ptr,[sp,#32]		@ restore r_ptr
	adcs	@acc[4],@acc[4],#0
	adcs	@acc[5],@acc[5],#0
	adcs	@acc[6],@acc[6],@acc[0]
	adcs	@acc[7],@acc[7],#0
	adcs	@acc[8],@acc[8],@acc[0]
	adc	$t3,$t3,#0
	subs	@acc[7],@acc[7],@acc[0]
	sbcs	@acc[8],@acc[8],#0
	sbc	@acc[0],$t3,#0		@ overflow bit

	@ Final step is "if result > mod, subtract mod", but we do it
	@ "other way around", namely subtract modulus from result
	@ and if it borrowed, add modulus back.
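	@ (Illustrative note: the "adds #1" and "adcs #0" forms below are
	@ carry- and result-equivalent to "subs #-1" and "sbcs #-1", i.e.
	@ to subtracting the 0xffffffff words of the modulus, since both
	@ encodings set carry exactly when the word in question is
	@ 0xffffffff; the straight subtraction forms are kept as trailing
	@ comments.)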

	adds	@acc[1],@acc[1],#1	@ subs	@acc[1],@acc[1],#-1
	adcs	@acc[2],@acc[2],#0	@ sbcs	@acc[2],@acc[2],#-1
	adcs	@acc[3],@acc[3],#0	@ sbcs	@acc[3],@acc[3],#-1
	sbcs	@acc[4],@acc[4],#0
	sbcs	@acc[5],@acc[5],#0
	sbcs	@acc[6],@acc[6],#0
	sbcs	@acc[7],@acc[7],#1
	adcs	@acc[8],@acc[8],#0	@ sbcs	@acc[8],@acc[8],#-1
	ldr	lr,[sp,#44]		@ restore lr
	sbc	@acc[0],@acc[0],#0	@ broadcast borrow bit
	add	sp,sp,#48

	@ Note that because mod has special form, i.e. consists of
	@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
	@ broadcasting borrow bit to a register, @acc[0], and using it as
	@ a whole or extracting single bit.

	adds	@acc[1],@acc[1],@acc[0]	@ add modulus or zero
	adcs	@acc[2],@acc[2],@acc[0]
	str	@acc[1],[$r_ptr,#0]
	adcs	@acc[3],@acc[3],@acc[0]
	str	@acc[2],[$r_ptr,#4]
	adcs	@acc[4],@acc[4],#0
	str	@acc[3],[$r_ptr,#8]
	adcs	@acc[5],@acc[5],#0
	str	@acc[4],[$r_ptr,#12]
	adcs	@acc[6],@acc[6],#0
	str	@acc[5],[$r_ptr,#16]
	adcs	@acc[7],@acc[7],@acc[0],lsr#31
	str	@acc[6],[$r_ptr,#20]
	adc	@acc[8],@acc[8],@acc[0]
	str	@acc[7],[$r_ptr,#24]
	str	@acc[8],[$r_ptr,#28]

	mov	pc,lr
.size	__ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont
___
}

{
my ($out,$inp,$index,$mask)=map("r$_",(0..3));
$code.=<<___;
@ void	ecp_nistz256_scatter_w5(void *r0,const P256_POINT *r1,
@					 int r2);
.globl	ecp_nistz256_scatter_w5
.type	ecp_nistz256_scatter_w5,%function
.align	5
ecp_nistz256_scatter_w5:
	stmdb	sp!,{r4-r11}

	add	$out,$out,$index,lsl#2

	ldmia	$inp!,{r4-r11}		@ X
	str	r4,[$out,#64*0-4]
	str	r5,[$out,#64*1-4]
	str	r6,[$out,#64*2-4]
	str	r7,[$out,#64*3-4]
	str	r8,[$out,#64*4-4]
	str	r9,[$out,#64*5-4]
	str	r10,[$out,#64*6-4]
	str	r11,[$out,#64*7-4]
	add	$out,$out,#64*8

	ldmia	$inp!,{r4-r11}		@ Y
	str	r4,[$out,#64*0-4]
	str	r5,[$out,#64*1-4]
	str	r6,[$out,#64*2-4]
	str	r7,[$out,#64*3-4]
	str	r8,[$out,#64*4-4]
	str	r9,[$out,#64*5-4]
	str	r10,[$out,#64*6-4]
	str	r11,[$out,#64*7-4]
	add	$out,$out,#64*8

	ldmia	$inp,{r4-r11}		@ Z
	str	r4,[$out,#64*0-4]
	str	r5,[$out,#64*1-4]
	str	r6,[$out,#64*2-4]
	str	r7,[$out,#64*3-4]
	str	r8,[$out,#64*4-4]
	str	r9,[$out,#64*5-4]
	str	r10,[$out,#64*6-4]
	str	r11,[$out,#64*7-4]

	ldmia	sp!,{r4-r11}
#if __ARM_ARCH__>=5 || defined(__thumb__)
	bx	lr
#else
	mov	pc,lr
#endif
.size	ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5

@ void	ecp_nistz256_gather_w5(P256_POINT *r0,const void *r1,
@					      int r2);
.globl	ecp_nistz256_gather_w5
.type	ecp_nistz256_gather_w5,%function
.align	5
ecp_nistz256_gather_w5:
	stmdb	sp!,{r4-r11}

	cmp	$index,#0
	mov	$mask,#0
#ifdef	__thumb2__
	itt	ne
#endif
	subne	$index,$index,#1
	movne	$mask,#-1
	add	$inp,$inp,$index,lsl#2

	ldr	r4,[$inp,#64*0]
	ldr	r5,[$inp,#64*1]
	ldr	r6,[$inp,#64*2]
	and	r4,r4,$mask
	ldr	r7,[$inp,#64*3]
	and	r5,r5,$mask
	ldr	r8,[$inp,#64*4]
	and	r6,r6,$mask
	ldr	r9,[$inp,#64*5]
	and	r7,r7,$mask
	ldr	r10,[$inp,#64*6]
	and	r8,r8,$mask
	ldr	r11,[$inp,#64*7]
	add	$inp,$inp,#64*8
	and	r9,r9,$mask
	and	r10,r10,$mask
	and	r11,r11,$mask
	stmia	$out!,{r4-r11}		@ X

	ldr	r4,[$inp,#64*0]
	ldr	r5,[$inp,#64*1]
	ldr	r6,[$inp,#64*2]
	and	r4,r4,$mask
	ldr	r7,[$inp,#64*3]
	and	r5,r5,$mask
	ldr	r8,[$inp,#64*4]
	and	r6,r6,$mask
	ldr	r9,[$inp,#64*5]
	and	r7,r7,$mask
	ldr	r10,[$inp,#64*6]
	and	r8,r8,$mask
	ldr	r11,[$inp,#64*7]
	add	$inp,$inp,#64*8
	and	r9,r9,$mask
	and	r10,r10,$mask
	and	r11,r11,$mask
	stmia	$out!,{r4-r11}		@ Y

	ldr	r4,[$inp,#64*0]
	ldr	r5,[$inp,#64*1]
	ldr	r6,[$inp,#64*2]
	and	r4,r4,$mask
	ldr	r7,[$inp,#64*3]
	and	r5,r5,$mask
	ldr	r8,[$inp,#64*4]
	and	r6,r6,$mask
	ldr	r9,[$inp,#64*5]
	and	r7,r7,$mask
	ldr	r10,[$inp,#64*6]
	and	r8,r8,$mask
	ldr	r11,[$inp,#64*7]
	and	r9,r9,$mask
	and	r10,r10,$mask
	and	r11,r11,$mask
	stmia	$out,{r4-r11}		@ Z

	ldmia	sp!,{r4-r11}
#if __ARM_ARCH__>=5 || defined(__thumb__)
	bx	lr
#else
	mov	pc,lr
#endif
.size	ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5

@ void	ecp_nistz256_scatter_w7(void *r0,const P256_POINT_AFFINE *r1,
@					 int r2);
.globl	ecp_nistz256_scatter_w7
.type	ecp_nistz256_scatter_w7,%function
.align	5
ecp_nistz256_scatter_w7:
	add	$out,$out,$index
	mov	$index,#64/4
.Loop_scatter_w7:
	ldr	$mask,[$inp],#4
	subs	$index,$index,#1
	strb	$mask,[$out,#64*0]
	mov	$mask,$mask,lsr#8
	strb	$mask,[$out,#64*1]
	mov	$mask,$mask,lsr#8
	strb	$mask,[$out,#64*2]
	mov	$mask,$mask,lsr#8
	strb	$mask,[$out,#64*3]
	add	$out,$out,#64*4
	bne	.Loop_scatter_w7

#if __ARM_ARCH__>=5 || defined(__thumb__)
	bx	lr
#else
	mov	pc,lr
#endif
.size	ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7

@ void	ecp_nistz256_gather_w7(P256_POINT_AFFINE *r0,const void *r1,
@						      int r2);
.globl	ecp_nistz256_gather_w7
.type	ecp_nistz256_gather_w7,%function
.align	5
ecp_nistz256_gather_w7:
	stmdb	sp!,{r4-r7}

	cmp	$index,#0
	mov	$mask,#0
#ifdef	__thumb2__
	itt	ne
#endif
	subne	$index,$index,#1
	movne	$mask,#-1
	add	$inp,$inp,$index
	mov	$index,#64/4
	nop
.Loop_gather_w7:
	ldrb	r4,[$inp,#64*0]
	subs	$index,$index,#1
	ldrb	r5,[$inp,#64*1]
	ldrb	r6,[$inp,#64*2]
	ldrb	r7,[$inp,#64*3]
	add	$inp,$inp,#64*4
	orr	r4,r4,r5,lsl#8
	orr	r4,r4,r6,lsl#16
	orr	r4,r4,r7,lsl#24
	and	r4,r4,$mask
	str	r4,[$out],#4
	bne	.Loop_gather_w7

	ldmia	sp!,{r4-r7}
#if __ARM_ARCH__>=5 || defined(__thumb__)
	bx	lr
#else
	mov	pc,lr
#endif
.size	ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
___
}
if (0) {
# In comparison to integer-only equivalent of below subroutine:
#
# Cortex-A8	+10%
# Cortex-A9	-10%
# Snapdragon S4	+5%
#
# As not all time is spent in multiplication, overall impact is deemed
# too low to care about.

my ($A0,$A1,$A2,$A3,$Bi,$zero,$temp)=map("d$_",(0..7));
my $mask="q4";
my $mult="q5";
my @AxB=map("q$_",(8..15));

my ($rptr,$aptr,$bptr,$toutptr)=map("r$_",(0..3));

$code.=<<___;
#if __ARM_ARCH__>=7
.fpu	neon

.globl	ecp_nistz256_mul_mont_neon
.type	ecp_nistz256_mul_mont_neon,%function
.align	5
ecp_nistz256_mul_mont_neon:
	mov	ip,sp
	stmdb	sp!,{r4-r9}
	vstmdb	sp!,{q4-q5}		@ ABI specification says so

	sub	$toutptr,sp,#40
	vld1.32	{${Bi}[0]},[$bptr,:32]!
	veor	$zero,$zero,$zero
	vld1.32	{$A0-$A3}, [$aptr]	@ can't specify :32 :-(
	vzip.16	$Bi,$zero
	mov	sp,$toutptr		@ alloca
	vmov.i64 $mask,#0xffff

	vmull.u32 @AxB[0],$Bi,${A0}[0]
	vmull.u32 @AxB[1],$Bi,${A0}[1]
	vmull.u32 @AxB[2],$Bi,${A1}[0]
	vmull.u32 @AxB[3],$Bi,${A1}[1]
	vshr.u64 $temp,@AxB[0]#lo,#16
	vmull.u32 @AxB[4],$Bi,${A2}[0]
	vadd.u64 @AxB[0]#hi,@AxB[0]#hi,$temp
	vmull.u32 @AxB[5],$Bi,${A2}[1]
	vshr.u64 $temp,@AxB[0]#hi,#16	@ upper 32 bits of a[0]*b[0]
	vmull.u32 @AxB[6],$Bi,${A3}[0]
	vand.u64 @AxB[0],@AxB[0],$mask	@ lower 32 bits of a[0]*b[0]
	vmull.u32 @AxB[7],$Bi,${A3}[1]
___
for($i=1;$i<8;$i++) {
$code.=<<___;
	vld1.32	{${Bi}[0]},[$bptr,:32]!
	veor	$zero,$zero,$zero
	vadd.u64 @AxB[1]#lo,@AxB[1]#lo,$temp	@ reduction
	vshl.u64 $mult,@AxB[0],#32
	vadd.u64 @AxB[3],@AxB[3],@AxB[0]
	vsub.u64 $mult,$mult,@AxB[0]
	vzip.16	$Bi,$zero
	vadd.u64 @AxB[6],@AxB[6],@AxB[0]
	vadd.u64 @AxB[7],@AxB[7],$mult
___
	push(@AxB,shift(@AxB));
$code.=<<___;
	vmlal.u32 @AxB[0],$Bi,${A0}[0]
	vmlal.u32 @AxB[1],$Bi,${A0}[1]
	vmlal.u32 @AxB[2],$Bi,${A1}[0]
	vmlal.u32 @AxB[3],$Bi,${A1}[1]
	vshr.u64 $temp,@AxB[0]#lo,#16
	vmlal.u32 @AxB[4],$Bi,${A2}[0]
	vadd.u64 @AxB[0]#hi,@AxB[0]#hi,$temp
	vmlal.u32 @AxB[5],$Bi,${A2}[1]
	vshr.u64 $temp,@AxB[0]#hi,#16	@ upper 33 bits of a[0]*b[i]+t[0]
	vmlal.u32 @AxB[6],$Bi,${A3}[0]
	vand.u64 @AxB[0],@AxB[0],$mask	@ lower 32 bits of a[0]*b[0]
	vmull.u32 @AxB[7],$Bi,${A3}[1]
___
}
$code.=<<___;
	vadd.u64 @AxB[1]#lo,@AxB[1]#lo,$temp	@ last reduction
	vshl.u64 $mult,@AxB[0],#32
	vadd.u64 @AxB[3],@AxB[3],@AxB[0]
	vsub.u64 $mult,$mult,@AxB[0]
	vadd.u64 @AxB[6],@AxB[6],@AxB[0]
	vadd.u64 @AxB[7],@AxB[7],$mult

	vshr.u64 $temp,@AxB[1]#lo,#16	@ convert
	vadd.u64 @AxB[1]#hi,@AxB[1]#hi,$temp
	vshr.u64 $temp,@AxB[1]#hi,#16
	vzip.16	@AxB[1]#lo,@AxB[1]#hi
___
foreach (2..7) {
$code.=<<___;
	vadd.u64 @AxB[$_]#lo,@AxB[$_]#lo,$temp
	vst1.32	{@AxB[$_-1]#lo[0]},[$toutptr,:32]!
	vshr.u64 $temp,@AxB[$_]#lo,#16
	vadd.u64 @AxB[$_]#hi,@AxB[$_]#hi,$temp
	vshr.u64 $temp,@AxB[$_]#hi,#16
	vzip.16	@AxB[$_]#lo,@AxB[$_]#hi
___
}
$code.=<<___;
	vst1.32	{@AxB[7]#lo[0]},[$toutptr,:32]!
	vst1.32	{$temp},[$toutptr]	@ upper 33 bits

	ldr	r1,[sp,#0]
	ldr	r2,[sp,#4]
	ldr	r3,[sp,#8]
	subs	r1,r1,#-1
	ldr	r4,[sp,#12]
	sbcs	r2,r2,#-1
	ldr	r5,[sp,#16]
	sbcs	r3,r3,#-1
	ldr	r6,[sp,#20]
	sbcs	r4,r4,#0
	ldr	r7,[sp,#24]
	sbcs	r5,r5,#0
	ldr	r8,[sp,#28]
	sbcs	r6,r6,#0
	ldr	r9,[sp,#32]		@ top-most bit
	sbcs	r7,r7,#1
	sub	sp,ip,#40+16
	sbcs	r8,r8,#-1
	sbc	r9,r9,#0
	vldmia	sp!,{q4-q5}

	adds	r1,r1,r9
	adcs	r2,r2,r9
	str	r1,[$rptr,#0]
	adcs	r3,r3,r9
	str	r2,[$rptr,#4]
	adcs	r4,r4,#0
	str	r3,[$rptr,#8]
	adcs	r5,r5,#0
	str	r4,[$rptr,#12]
	adcs	r6,r6,#0
	str	r5,[$rptr,#16]
	adcs	r7,r7,r9,lsr#31
	str	r6,[$rptr,#20]
	adcs	r8,r8,r9
	str	r7,[$rptr,#24]
	str	r8,[$rptr,#28]

	ldmia	sp!,{r4-r9}
	bx	lr
.size	ecp_nistz256_mul_mont_neon,.-ecp_nistz256_mul_mont_neon
#endif
___
}

{{{
########################################################################
# Below $aN assignment matches order in which 256-bit result appears in
# register bank at return from __ecp_nistz256_mul_mont, so that we can
# skip over reloading it from memory. This means that below functions
# use custom calling sequence accepting 256-bit input in registers,
# output pointer in r0, $r_ptr, and optional pointer in r2, $b_ptr.
#
# See their "normal" counterparts for insights on calculations.

my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7,
    $t0,$t1,$t2,$t3)=map("r$_",(11,3..10,12,14,1));
my $ff=$b_ptr;

$code.=<<___;
.type	__ecp_nistz256_sub_from,%function
.align	5
__ecp_nistz256_sub_from:
	str	lr,[sp,#-4]!		@ push lr

	ldr	$t0,[$b_ptr,#0]
	ldr	$t1,[$b_ptr,#4]
	ldr	$t2,[$b_ptr,#8]
	ldr	$t3,[$b_ptr,#12]
	subs	$a0,$a0,$t0
	ldr	$t0,[$b_ptr,#16]
	sbcs	$a1,$a1,$t1
	ldr	$t1,[$b_ptr,#20]
	sbcs	$a2,$a2,$t2
	ldr	$t2,[$b_ptr,#24]
	sbcs	$a3,$a3,$t3
	ldr	$t3,[$b_ptr,#28]
	sbcs	$a4,$a4,$t0
	sbcs	$a5,$a5,$t1
	sbcs	$a6,$a6,$t2
	sbcs	$a7,$a7,$t3
	sbc	$ff,$ff,$ff		@ broadcast borrow bit
	ldr	lr,[sp],#4		@ pop lr

	adds	$a0,$a0,$ff		@ add synthesized modulus
	adcs	$a1,$a1,$ff
	str	$a0,[$r_ptr,#0]
	adcs	$a2,$a2,$ff
	str	$a1,[$r_ptr,#4]
	adcs	$a3,$a3,#0
	str	$a2,[$r_ptr,#8]
	adcs	$a4,$a4,#0
	str	$a3,[$r_ptr,#12]
	adcs	$a5,$a5,#0
	str	$a4,[$r_ptr,#16]
	adcs	$a6,$a6,$ff,lsr#31
	str	$a5,[$r_ptr,#20]
	adcs	$a7,$a7,$ff
	str	$a6,[$r_ptr,#24]
	str	$a7,[$r_ptr,#28]

	mov	pc,lr
.size	__ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from

.type	__ecp_nistz256_sub_morf,%function
.align	5
__ecp_nistz256_sub_morf:
	str	lr,[sp,#-4]!		@ push lr

	ldr	$t0,[$b_ptr,#0]
	ldr	$t1,[$b_ptr,#4]
	ldr	$t2,[$b_ptr,#8]
	ldr	$t3,[$b_ptr,#12]
	subs	$a0,$t0,$a0
	ldr	$t0,[$b_ptr,#16]
	sbcs	$a1,$t1,$a1
	ldr	$t1,[$b_ptr,#20]
	sbcs	$a2,$t2,$a2
	ldr	$t2,[$b_ptr,#24]
	sbcs	$a3,$t3,$a3
	ldr	$t3,[$b_ptr,#28]
	sbcs	$a4,$t0,$a4
	sbcs	$a5,$t1,$a5
	sbcs	$a6,$t2,$a6
	sbcs	$a7,$t3,$a7
	sbc	$ff,$ff,$ff		@ broadcast borrow bit
	ldr	lr,[sp],#4		@ pop lr

	adds	$a0,$a0,$ff		@ add synthesized modulus
	adcs	$a1,$a1,$ff
	str	$a0,[$r_ptr,#0]
	adcs	$a2,$a2,$ff
	str	$a1,[$r_ptr,#4]
	adcs	$a3,$a3,#0
	str	$a2,[$r_ptr,#8]
	adcs	$a4,$a4,#0
	str	$a3,[$r_ptr,#12]
	adcs	$a5,$a5,#0
	str	$a4,[$r_ptr,#16]
	adcs	$a6,$a6,$ff,lsr#31
	str	$a5,[$r_ptr,#20]
	adcs	$a7,$a7,$ff
	str	$a6,[$r_ptr,#24]
	str	$a7,[$r_ptr,#28]

	mov	pc,lr
.size	__ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf

.type	__ecp_nistz256_add_self,%function
.align	4
__ecp_nistz256_add_self:
	adds	$a0,$a0,$a0		@ a[0:7]+=a[0:7]
	adcs	$a1,$a1,$a1
	adcs	$a2,$a2,$a2
	adcs	$a3,$a3,$a3
	adcs	$a4,$a4,$a4
	adcs	$a5,$a5,$a5
	adcs	$a6,$a6,$a6
	mov	$ff,#0
	adcs	$a7,$a7,$a7
	adc	$ff,$ff,#0

	@ if a+b >= modulus, subtract modulus.
	@
	@ But since comparison implies subtraction, we subtract
	@ modulus and then add it back if subtraction borrowed.

	subs	$a0,$a0,#-1
	sbcs	$a1,$a1,#-1
	sbcs	$a2,$a2,#-1
	sbcs	$a3,$a3,#0
	sbcs	$a4,$a4,#0
	sbcs	$a5,$a5,#0
	sbcs	$a6,$a6,#1
	sbcs	$a7,$a7,#-1
	sbc	$ff,$ff,#0

	@ Note that because mod has special form, i.e. consists of
	@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
	@ using value of borrow as a whole or extracting single bit.
	@ Follow $ff register...

	adds	$a0,$a0,$ff		@ add synthesized modulus
	adcs	$a1,$a1,$ff
	str	$a0,[$r_ptr,#0]
	adcs	$a2,$a2,$ff
	str	$a1,[$r_ptr,#4]
	adcs	$a3,$a3,#0
	str	$a2,[$r_ptr,#8]
	adcs	$a4,$a4,#0
	str	$a3,[$r_ptr,#12]
	adcs	$a5,$a5,#0
	str	$a4,[$r_ptr,#16]
	adcs	$a6,$a6,$ff,lsr#31
	str	$a5,[$r_ptr,#20]
	adcs	$a7,$a7,$ff
	str	$a6,[$r_ptr,#24]
	str	$a7,[$r_ptr,#28]

	mov	pc,lr
.size	__ecp_nistz256_add_self,.-__ecp_nistz256_add_self

___

########################################################################
# following subroutines are "literal" implementation of those found in
# ecp_nistz256.c
#
########################################################################
# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
#
{
my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4));
# above map() describes stack layout with 5 temporary
# 256-bit vectors on top. Then note that we push
# starting from r0, which means that we have copy of
# input arguments just below these temporary vectors.
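#
# A rough sketch of the resulting frame, for orientation only (offsets
# are the byte values produced by the map() above, the saved registers
# are where stmdb sp!,{r0-r12,lr} left them):
#
#	sp+0	S		sp+96	in_x
#	sp+32	M		sp+128	tmp0
#	sp+64	Zsqr		sp+160	saved r0 (out), r1 (inp), r2, r3,
#					followed by r4-r12,lr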

$code.=<<___;
.globl	ecp_nistz256_point_double
.type	ecp_nistz256_point_double,%function
.align	5
ecp_nistz256_point_double:
	stmdb	sp!,{r0-r12,lr}		@ push from r0, unusual, but intentional
	sub	sp,sp,#32*5

.Lpoint_double_shortcut:
	add	r3,sp,#$in_x
	ldmia	$a_ptr!,{r4-r11}	@ copy in_x
	stmia	r3,{r4-r11}

	add	$r_ptr,sp,#$S
	bl	__ecp_nistz256_mul_by_2	@ p256_mul_by_2(S, in_y);

	add	$b_ptr,$a_ptr,#32
	add	$a_ptr,$a_ptr,#32
	add	$r_ptr,sp,#$Zsqr
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Zsqr, in_z);

	add	$a_ptr,sp,#$S
	add	$b_ptr,sp,#$S
	add	$r_ptr,sp,#$S
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(S, S);

	ldr	$b_ptr,[sp,#32*5+4]
	add	$a_ptr,$b_ptr,#32
	add	$b_ptr,$b_ptr,#64
	add	$r_ptr,sp,#$tmp0
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(tmp0, in_z, in_y);

	ldr	$r_ptr,[sp,#32*5]
	add	$r_ptr,$r_ptr,#64
	bl	__ecp_nistz256_add_self	@ p256_mul_by_2(res_z, tmp0);

	add	$a_ptr,sp,#$in_x
	add	$b_ptr,sp,#$Zsqr
	add	$r_ptr,sp,#$M
	bl	__ecp_nistz256_add	@ p256_add(M, in_x, Zsqr);

	add	$a_ptr,sp,#$in_x
	add	$b_ptr,sp,#$Zsqr
	add	$r_ptr,sp,#$Zsqr
	bl	__ecp_nistz256_sub	@ p256_sub(Zsqr, in_x, Zsqr);

	add	$a_ptr,sp,#$S
	add	$b_ptr,sp,#$S
	add	$r_ptr,sp,#$tmp0
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(tmp0, S);

	add	$a_ptr,sp,#$Zsqr
	add	$b_ptr,sp,#$M
	add	$r_ptr,sp,#$M
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(M, M, Zsqr);

	ldr	$r_ptr,[sp,#32*5]
	add	$a_ptr,sp,#$tmp0
	add	$r_ptr,$r_ptr,#32
	bl	__ecp_nistz256_div_by_2	@ p256_div_by_2(res_y, tmp0);

	add	$a_ptr,sp,#$M
	add	$r_ptr,sp,#$M
	bl	__ecp_nistz256_mul_by_3	@ p256_mul_by_3(M, M);

	add	$a_ptr,sp,#$in_x
	add	$b_ptr,sp,#$S
	add	$r_ptr,sp,#$S
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S, S, in_x);

	add	$r_ptr,sp,#$tmp0
	bl	__ecp_nistz256_add_self	@ p256_mul_by_2(tmp0, S);

	ldr	$r_ptr,[sp,#32*5]
	add	$a_ptr,sp,#$M
	add	$b_ptr,sp,#$M
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(res_x, M);

	add	$b_ptr,sp,#$tmp0
	bl	__ecp_nistz256_sub_from	@ p256_sub(res_x, res_x, tmp0);

	add	$b_ptr,sp,#$S
	add	$r_ptr,sp,#$S
	bl	__ecp_nistz256_sub_morf	@ p256_sub(S, S, res_x);

	add	$a_ptr,sp,#$M
	add	$b_ptr,sp,#$S
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S, S, M);

	ldr	$r_ptr,[sp,#32*5]
	add	$b_ptr,$r_ptr,#32
	add	$r_ptr,$r_ptr,#32
	bl	__ecp_nistz256_sub_from	@ p256_sub(res_y, S, res_y);

	add	sp,sp,#32*5+16		@ +16 means "skip even over saved r0-r3"
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_point_double,.-ecp_nistz256_point_double
___
}

########################################################################
# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
#			      const P256_POINT *in2);
{
my ($res_x,$res_y,$res_z,
    $in1_x,$in1_y,$in1_z,
    $in2_x,$in2_y,$in2_z,
    $H,$Hsqr,$R,$Rsqr,$Hcub,
    $U1,$U2,$S1,$S2)=map(32*$_,(0..17));
my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
# above map() describes stack layout with 18 temporary
# 256-bit vectors on top. Then note that we push
# starting from r0, which means that we have copy of
# input arguments just below these temporary vectors.
# We use three of them for ~in1infty, ~in2infty and
# result of check for zero.

$code.=<<___;
.globl	ecp_nistz256_point_add
.type	ecp_nistz256_point_add,%function
.align	5
ecp_nistz256_point_add:
	stmdb	sp!,{r0-r12,lr}		@ push from r0, unusual, but intentional
	sub	sp,sp,#32*18+16

	ldmia	$b_ptr!,{r4-r11}	@ copy in2_x
	add	r3,sp,#$in2_x
	stmia	r3!,{r4-r11}
	ldmia	$b_ptr!,{r4-r11}	@ copy in2_y
	stmia	r3!,{r4-r11}
	ldmia	$b_ptr,{r4-r11}		@ copy in2_z
	orr	r12,r4,r5
	orr	r12,r12,r6
	orr	r12,r12,r7
	orr	r12,r12,r8
	orr	r12,r12,r9
	orr	r12,r12,r10
	orr	r12,r12,r11
	cmp	r12,#0
#ifdef	__thumb2__
	it	ne
#endif
	movne	r12,#-1
	stmia	r3,{r4-r11}
	str	r12,[sp,#32*18+8]	@ ~in2infty

	ldmia	$a_ptr!,{r4-r11}	@ copy in1_x
	add	r3,sp,#$in1_x
	stmia	r3!,{r4-r11}
	ldmia	$a_ptr!,{r4-r11}	@ copy in1_y
	stmia	r3!,{r4-r11}
	ldmia	$a_ptr,{r4-r11}		@ copy in1_z
	orr	r12,r4,r5
	orr	r12,r12,r6
	orr	r12,r12,r7
	orr	r12,r12,r8
	orr	r12,r12,r9
	orr	r12,r12,r10
	orr	r12,r12,r11
	cmp	r12,#0
#ifdef	__thumb2__
	it	ne
#endif
	movne	r12,#-1
	stmia	r3,{r4-r11}
	str	r12,[sp,#32*18+4]	@ ~in1infty

	add	$a_ptr,sp,#$in2_z
	add	$b_ptr,sp,#$in2_z
	add	$r_ptr,sp,#$Z2sqr
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Z2sqr, in2_z);

	add	$a_ptr,sp,#$in1_z
	add	$b_ptr,sp,#$in1_z
	add	$r_ptr,sp,#$Z1sqr
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Z1sqr, in1_z);

	add	$a_ptr,sp,#$in2_z
	add	$b_ptr,sp,#$Z2sqr
	add	$r_ptr,sp,#$S1
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S1, Z2sqr, in2_z);

	add	$a_ptr,sp,#$in1_z
	add	$b_ptr,sp,#$Z1sqr
	add	$r_ptr,sp,#$S2
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S2, Z1sqr, in1_z);

	add	$a_ptr,sp,#$in1_y
	add	$b_ptr,sp,#$S1
	add	$r_ptr,sp,#$S1
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S1, S1, in1_y);

	add	$a_ptr,sp,#$in2_y
	add	$b_ptr,sp,#$S2
	add	$r_ptr,sp,#$S2
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S2, S2, in2_y);

	add	$b_ptr,sp,#$S1
	add	$r_ptr,sp,#$R
	bl	__ecp_nistz256_sub_from	@ p256_sub(R, S2, S1);

	orr	$a0,$a0,$a1		@ see if result is zero
	orr	$a2,$a2,$a3
	orr	$a4,$a4,$a5
	orr	$a0,$a0,$a2
	orr	$a4,$a4,$a6
	orr	$a0,$a0,$a7
	add	$a_ptr,sp,#$in1_x
	orr	$a0,$a0,$a4
	add	$b_ptr,sp,#$Z2sqr
	str	$a0,[sp,#32*18+12]

	add	$r_ptr,sp,#$U1
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(U1, in1_x, Z2sqr);

	add	$a_ptr,sp,#$in2_x
	add	$b_ptr,sp,#$Z1sqr
	add	$r_ptr,sp,#$U2
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(U2, in2_x, Z1sqr);

	add	$b_ptr,sp,#$U1
	add	$r_ptr,sp,#$H
	bl	__ecp_nistz256_sub_from	@ p256_sub(H, U2, U1);

	orr	$a0,$a0,$a1		@ see if result is zero
	orr	$a2,$a2,$a3
	orr	$a4,$a4,$a5
	orr	$a0,$a0,$a2
	orr	$a4,$a4,$a6
	orr	$a0,$a0,$a7
	orr	$a0,$a0,$a4		@ ~is_equal(U1,U2)

	ldr	$t0,[sp,#32*18+4]	@ ~in1infty
	ldr	$t1,[sp,#32*18+8]	@ ~in2infty
	ldr	$t2,[sp,#32*18+12]	@ ~is_equal(S1,S2)
	mvn	$t0,$t0			@ -1/0 -> 0/-1
	mvn	$t1,$t1			@ -1/0 -> 0/-1
	orr	$a0,$a0,$t0
	orr	$a0,$a0,$t1
	orrs	$a0,$a0,$t2		@ set flags

	@ if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2))
	bne	.Ladd_proceed

.Ladd_double:
	ldr	$a_ptr,[sp,#32*18+20]
	add	sp,sp,#32*(18-5)+16	@ difference in frame sizes
	b	.Lpoint_double_shortcut

.align	4
.Ladd_proceed:
	add	$a_ptr,sp,#$R
	add	$b_ptr,sp,#$R
	add	$r_ptr,sp,#$Rsqr
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Rsqr, R);

	add	$a_ptr,sp,#$H
	add	$b_ptr,sp,#$in1_z
	add	$r_ptr,sp,#$res_z
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(res_z, H, in1_z);

	add	$a_ptr,sp,#$H
	add	$b_ptr,sp,#$H
	add	$r_ptr,sp,#$Hsqr
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Hsqr, H);

	add	$a_ptr,sp,#$in2_z
	add	$b_ptr,sp,#$res_z
	add	$r_ptr,sp,#$res_z
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(res_z, res_z, in2_z);

	add	$a_ptr,sp,#$H
	add	$b_ptr,sp,#$Hsqr
	add	$r_ptr,sp,#$Hcub
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(Hcub, Hsqr, H);

	add	$a_ptr,sp,#$Hsqr
	add	$b_ptr,sp,#$U1
	add	$r_ptr,sp,#$U2
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(U2, U1, Hsqr);

	add	$r_ptr,sp,#$Hsqr
	bl	__ecp_nistz256_add_self	@ p256_mul_by_2(Hsqr, U2);

	add	$b_ptr,sp,#$Rsqr
	add	$r_ptr,sp,#$res_x
	bl	__ecp_nistz256_sub_morf	@ p256_sub(res_x, Rsqr, Hsqr);

	add	$b_ptr,sp,#$Hcub
	bl	__ecp_nistz256_sub_from	@ p256_sub(res_x, res_x, Hcub);

	add	$b_ptr,sp,#$U2
	add	$r_ptr,sp,#$res_y
	bl	__ecp_nistz256_sub_morf	@ p256_sub(res_y, U2, res_x);

	add	$a_ptr,sp,#$Hcub
	add	$b_ptr,sp,#$S1
	add	$r_ptr,sp,#$S2
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S2, S1, Hcub);

	add	$a_ptr,sp,#$R
	add	$b_ptr,sp,#$res_y
	add	$r_ptr,sp,#$res_y
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(res_y, res_y, R);

	add	$b_ptr,sp,#$S2
	bl	__ecp_nistz256_sub_from	@ p256_sub(res_y, res_y, S2);

	ldr	r11,[sp,#32*18+4]	@ ~in1infty
	ldr	r12,[sp,#32*18+8]	@ ~in2infty
	add	r1,sp,#$res_x
	add	r2,sp,#$in2_x
	and	r10,r11,r12		@ ~in1infty & ~in2infty
	mvn	r11,r11
	add	r3,sp,#$in1_x
	and	r11,r11,r12		@ in1infty & ~in2infty
	mvn	r12,r12			@ in2infty
	ldr	$r_ptr,[sp,#32*18+16]
___
for($i=0;$i<96;$i+=8) {			# conditional moves
$code.=<<___;
	ldmia	r1!,{r4-r5}		@ res_x
	ldmia	r2!,{r6-r7}		@ in2_x
	ldmia	r3!,{r8-r9}		@ in1_x
	and	r4,r4,r10		@ ~in1infty & ~in2infty
	and	r5,r5,r10
	and	r6,r6,r11		@ in1infty & ~in2infty
	and	r7,r7,r11
	and	r8,r8,r12		@ in2infty
	and	r9,r9,r12
	orr	r4,r4,r6
	orr	r5,r5,r7
	orr	r4,r4,r8
	orr	r5,r5,r9
	stmia	$r_ptr!,{r4-r5}
___
}
$code.=<<___;
.Ladd_done:
	add	sp,sp,#32*18+16+16	@ +16 means "skip even over saved r0-r3"
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_point_add,.-ecp_nistz256_point_add
___
}

########################################################################
# void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
#				     const P256_POINT_AFFINE *in2);
{
my ($res_x,$res_y,$res_z,
    $in1_x,$in1_y,$in1_z,
    $in2_x,$in2_y,
    $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..14));
my $Z1sqr = $S2;
# above map() describes stack layout with 15 temporary
# 256-bit vectors on top. Then note that we push
# starting from r0, which means that we have copy of
# input arguments just below these temporary vectors.
# We use two of them for ~in1infty, ~in2infty.

my @ONE_mont=(1,0,0,-1,-1,-1,-2,0);

$code.=<<___;
.globl	ecp_nistz256_point_add_affine
.type	ecp_nistz256_point_add_affine,%function
.align	5
ecp_nistz256_point_add_affine:
	stmdb	sp!,{r0-r12,lr}		@ push from r0, unusual, but intentional
	sub	sp,sp,#32*15

	ldmia	$a_ptr!,{r4-r11}	@ copy in1_x
	add	r3,sp,#$in1_x
	stmia	r3!,{r4-r11}
	ldmia	$a_ptr!,{r4-r11}	@ copy in1_y
	stmia	r3!,{r4-r11}
	ldmia	$a_ptr,{r4-r11}		@ copy in1_z
	orr	r12,r4,r5
	orr	r12,r12,r6
	orr	r12,r12,r7
	orr	r12,r12,r8
	orr	r12,r12,r9
	orr	r12,r12,r10
	orr	r12,r12,r11
	cmp	r12,#0
#ifdef	__thumb2__
	it	ne
#endif
	movne	r12,#-1
	stmia	r3,{r4-r11}
	str	r12,[sp,#32*15+4]	@ ~in1infty

	ldmia	$b_ptr!,{r4-r11}	@ copy in2_x
	add	r3,sp,#$in2_x
	orr	r12,r4,r5
	orr	r12,r12,r6
	orr	r12,r12,r7
	orr	r12,r12,r8
	orr	r12,r12,r9
	orr	r12,r12,r10
	orr	r12,r12,r11
	stmia	r3!,{r4-r11}
	ldmia	$b_ptr!,{r4-r11}	@ copy in2_y
	orr	r12,r12,r4
	orr	r12,r12,r5
	orr	r12,r12,r6
	orr	r12,r12,r7
	orr	r12,r12,r8
	orr	r12,r12,r9
	orr	r12,r12,r10
	orr	r12,r12,r11
	stmia	r3!,{r4-r11}
	cmp	r12,#0
#ifdef	__thumb2__
	it	ne
#endif
	movne	r12,#-1
	str	r12,[sp,#32*15+8]	@ ~in2infty

	add	$a_ptr,sp,#$in1_z
	add	$b_ptr,sp,#$in1_z
	add	$r_ptr,sp,#$Z1sqr
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Z1sqr, in1_z);

	add	$a_ptr,sp,#$Z1sqr
	add	$b_ptr,sp,#$in2_x
	add	$r_ptr,sp,#$U2
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(U2, Z1sqr, in2_x);

	add	$b_ptr,sp,#$in1_x
	add	$r_ptr,sp,#$H
	bl	__ecp_nistz256_sub_from	@ p256_sub(H, U2, in1_x);

	add	$a_ptr,sp,#$Z1sqr
	add	$b_ptr,sp,#$in1_z
	add	$r_ptr,sp,#$S2
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S2, Z1sqr, in1_z);

	add	$a_ptr,sp,#$H
	add	$b_ptr,sp,#$in1_z
	add	$r_ptr,sp,#$res_z
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(res_z, H, in1_z);

	add	$a_ptr,sp,#$in2_y
	add	$b_ptr,sp,#$S2
	add	$r_ptr,sp,#$S2
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S2, S2, in2_y);

	add	$b_ptr,sp,#$in1_y
	add	$r_ptr,sp,#$R
	bl	__ecp_nistz256_sub_from	@ p256_sub(R, S2, in1_y);

	add	$a_ptr,sp,#$H
	add	$b_ptr,sp,#$H
	add	$r_ptr,sp,#$Hsqr
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Hsqr, H);

	add	$a_ptr,sp,#$R
	add	$b_ptr,sp,#$R
	add	$r_ptr,sp,#$Rsqr
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Rsqr, R);

	add	$a_ptr,sp,#$H
	add	$b_ptr,sp,#$Hsqr
	add	$r_ptr,sp,#$Hcub
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(Hcub, Hsqr, H);

	add	$a_ptr,sp,#$Hsqr
	add	$b_ptr,sp,#$in1_x
	add	$r_ptr,sp,#$U2
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(U2, in1_x, Hsqr);

	add	$r_ptr,sp,#$Hsqr
	bl	__ecp_nistz256_add_self	@ p256_mul_by_2(Hsqr, U2);

	add	$b_ptr,sp,#$Rsqr
	add	$r_ptr,sp,#$res_x
	bl	__ecp_nistz256_sub_morf	@ p256_sub(res_x, Rsqr, Hsqr);

	add	$b_ptr,sp,#$Hcub
	bl	__ecp_nistz256_sub_from	@ p256_sub(res_x, res_x, Hcub);

	add	$b_ptr,sp,#$U2
	add	$r_ptr,sp,#$res_y
	bl	__ecp_nistz256_sub_morf	@ p256_sub(res_y, U2, res_x);

	add	$a_ptr,sp,#$Hcub
	add	$b_ptr,sp,#$in1_y
	add	$r_ptr,sp,#$S2
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S2, in1_y, Hcub);

	add	$a_ptr,sp,#$R
	add	$b_ptr,sp,#$res_y
	add	$r_ptr,sp,#$res_y
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(res_y, res_y, R);

	add	$b_ptr,sp,#$S2
	bl	__ecp_nistz256_sub_from	@ p256_sub(res_y, res_y, S2);

	ldr	r11,[sp,#32*15+4]	@ ~in1infty
	ldr	r12,[sp,#32*15+8]	@ ~in2infty
	add	r1,sp,#$res_x
	add	r2,sp,#$in2_x
	and	r10,r11,r12		@ ~in1infty & ~in2infty
	mvn	r11,r11
	add	r3,sp,#$in1_x
	and	r11,r11,r12		@ in1infty & ~in2infty
	mvn	r12,r12			@ in2infty
	ldr	$r_ptr,[sp,#32*15]
___
for($i=0;$i<64;$i+=8) {			# conditional moves
$code.=<<___;
	ldmia	r1!,{r4-r5}		@ res_x
	ldmia	r2!,{r6-r7}		@ in2_x
	ldmia	r3!,{r8-r9}		@ in1_x
	and	r4,r4,r10		@ ~in1infty & ~in2infty
	and	r5,r5,r10
	and	r6,r6,r11		@ in1infty & ~in2infty
	and	r7,r7,r11
	and	r8,r8,r12		@ in2infty
	and	r9,r9,r12
	orr	r4,r4,r6
	orr	r5,r5,r7
	orr	r4,r4,r8
	orr	r5,r5,r9
	stmia	$r_ptr!,{r4-r5}
___
}
for(;$i<96;$i+=8) {
my $j=($i-64)/4;
$code.=<<___;
	ldmia	r1!,{r4-r5}		@ res_z
	ldmia	r3!,{r8-r9}		@ in1_z
	and	r4,r4,r10
	and	r5,r5,r10
	and	r6,r11,#@ONE_mont[$j]
	and	r7,r11,#@ONE_mont[$j+1]
	and	r8,r8,r12
	and	r9,r9,r12
	orr	r4,r4,r6
	orr	r5,r5,r7
	orr	r4,r4,r8
	orr	r5,r5,r9
	stmia	$r_ptr!,{r4-r5}
___
}
$code.=<<___;
	add	sp,sp,#32*15+16		@ +16 means "skip even over saved r0-r3"
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
___
}	}}}

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";	# enforce flush