#! /usr/bin/env perl
# Copyright 2015-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# ECP_NISTZ256 module for ARMv8.
#
# February 2015.
#
# Original ECP_NISTZ256 submission targeting x86_64 is detailed in
# http://eprint.iacr.org/2013/816.
#
#			with/without -DECP_NISTZ256_ASM
# Apple A7		+190-360%
# Cortex-A53		+190-400%
# Cortex-A57		+190-350%
# Denver		+230-400%
#
# Ranges denote minimum and maximum improvement coefficients depending
# on benchmark. Lower coefficients are for ECDSA sign, server-side
# operation. Keep in mind that +400% means 5x improvement.

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Locate the arm-xlate.pl "assembler translator" either next to this
# script or in the shared perlasm directory; all generated code below is
# piped through it (STDOUT is redirected into the xlate process).
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

{
my ($rp,$ap,$bp,$bi,$a0,$a1,$a2,$a3,$t0,$t1,$t2,$t3,$poly1,$poly3,
    $acc0,$acc1,$acc2,$acc3,$acc4,$acc5) =
    map("x$_",(0..17,19,20));

my ($acc6,$acc7)=($ap,$bp);	# used in __ecp_nistz256_sqr_mont

$code.=<<___;
#include "arm_arch.h"

.text
___
########################################################################
# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
#
# The table source is looked up next to this script first, then relative
# to the shared build layout; the first successful open short-circuits
# the "or" chain.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
open TABLE,"<ecp_nistz256_table.c"		or
open TABLE,"<${dir}../ecp_nistz256_table.c"	or
die "failed to open ecp_nistz256_table.c:",$!;

use integer;

# Each TOBN(hi,lo) pair is pushed least-significant 64-bit word first,
# i.e. hex($2) (lo) before hex($1) (hi).
foreach(<TABLE>) {
	s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo;
}
close TABLE;

# See ecp_nistz256_table.c for explanation for why it's 64*16*37.
# 64*16*37-1 is because $#arr returns last valid index of @arr, not
# amount of elements.
78die "insane number of elements" if ($#arr != 64*16*37-1); 79 80$code.=<<___; 81.globl ecp_nistz256_precomputed 82.type ecp_nistz256_precomputed,%object 83.align 12 84ecp_nistz256_precomputed: 85___ 86######################################################################## 87# this conversion smashes P256_POINT_AFFINE by individual bytes with 88# 64 byte interval, similar to 89# 1111222233334444 90# 1234123412341234 91for(1..37) { 92 @tbl = splice(@arr,0,64*16); 93 for($i=0;$i<64;$i++) { 94 undef @line; 95 for($j=0;$j<64;$j++) { 96 push @line,(@tbl[$j*16+$i/4]>>(($i%4)*8))&0xff; 97 } 98 $code.=".byte\t"; 99 $code.=join(',',map { sprintf "0x%02x",$_} @line); 100 $code.="\n"; 101 } 102} 103$code.=<<___; 104.size ecp_nistz256_precomputed,.-ecp_nistz256_precomputed 105.align 5 106.Lpoly: 107.quad 0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001 108.LRR: // 2^512 mod P precomputed for NIST P256 polynomial 109.quad 0x0000000000000003,0xfffffffbffffffff,0xfffffffffffffffe,0x00000004fffffffd 110.Lone_mont: 111.quad 0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe 112.Lone: 113.quad 1,0,0,0 114.Lord: 115.quad 0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000 116.LordK: 117.quad 0xccd1c8aaee00bc4f 118.asciz "ECP_NISTZ256 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>" 119 120// void ecp_nistz256_to_mont(BN_ULONG x0[4],const BN_ULONG x1[4]); 121.globl ecp_nistz256_to_mont 122.type ecp_nistz256_to_mont,%function 123.align 6 124ecp_nistz256_to_mont: 125 AARCH64_SIGN_LINK_REGISTER 126 stp x29,x30,[sp,#-32]! 
127 add x29,sp,#0 128 stp x19,x20,[sp,#16] 129 130 ldr $bi,.LRR // bp[0] 131 ldp $a0,$a1,[$ap] 132 ldp $a2,$a3,[$ap,#16] 133 ldr $poly1,.Lpoly+8 134 ldr $poly3,.Lpoly+24 135 adr $bp,.LRR // &bp[0] 136 137 bl __ecp_nistz256_mul_mont 138 139 ldp x19,x20,[sp,#16] 140 ldp x29,x30,[sp],#32 141 AARCH64_VALIDATE_LINK_REGISTER 142 ret 143.size ecp_nistz256_to_mont,.-ecp_nistz256_to_mont 144 145// void ecp_nistz256_from_mont(BN_ULONG x0[4],const BN_ULONG x1[4]); 146.globl ecp_nistz256_from_mont 147.type ecp_nistz256_from_mont,%function 148.align 4 149ecp_nistz256_from_mont: 150 AARCH64_SIGN_LINK_REGISTER 151 stp x29,x30,[sp,#-32]! 152 add x29,sp,#0 153 stp x19,x20,[sp,#16] 154 155 mov $bi,#1 // bp[0] 156 ldp $a0,$a1,[$ap] 157 ldp $a2,$a3,[$ap,#16] 158 ldr $poly1,.Lpoly+8 159 ldr $poly3,.Lpoly+24 160 adr $bp,.Lone // &bp[0] 161 162 bl __ecp_nistz256_mul_mont 163 164 ldp x19,x20,[sp,#16] 165 ldp x29,x30,[sp],#32 166 AARCH64_VALIDATE_LINK_REGISTER 167 ret 168.size ecp_nistz256_from_mont,.-ecp_nistz256_from_mont 169 170// void ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4], 171// const BN_ULONG x2[4]); 172.globl ecp_nistz256_mul_mont 173.type ecp_nistz256_mul_mont,%function 174.align 4 175ecp_nistz256_mul_mont: 176 AARCH64_SIGN_LINK_REGISTER 177 stp x29,x30,[sp,#-32]! 178 add x29,sp,#0 179 stp x19,x20,[sp,#16] 180 181 ldr $bi,[$bp] // bp[0] 182 ldp $a0,$a1,[$ap] 183 ldp $a2,$a3,[$ap,#16] 184 ldr $poly1,.Lpoly+8 185 ldr $poly3,.Lpoly+24 186 187 bl __ecp_nistz256_mul_mont 188 189 ldp x19,x20,[sp,#16] 190 ldp x29,x30,[sp],#32 191 AARCH64_VALIDATE_LINK_REGISTER 192 ret 193.size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont 194 195// void ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]); 196.globl ecp_nistz256_sqr_mont 197.type ecp_nistz256_sqr_mont,%function 198.align 4 199ecp_nistz256_sqr_mont: 200 AARCH64_SIGN_LINK_REGISTER 201 stp x29,x30,[sp,#-32]! 
202 add x29,sp,#0 203 stp x19,x20,[sp,#16] 204 205 ldp $a0,$a1,[$ap] 206 ldp $a2,$a3,[$ap,#16] 207 ldr $poly1,.Lpoly+8 208 ldr $poly3,.Lpoly+24 209 210 bl __ecp_nistz256_sqr_mont 211 212 ldp x19,x20,[sp,#16] 213 ldp x29,x30,[sp],#32 214 AARCH64_VALIDATE_LINK_REGISTER 215 ret 216.size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont 217 218// void ecp_nistz256_add(BN_ULONG x0[4],const BN_ULONG x1[4], 219// const BN_ULONG x2[4]); 220.globl ecp_nistz256_add 221.type ecp_nistz256_add,%function 222.align 4 223ecp_nistz256_add: 224 AARCH64_SIGN_LINK_REGISTER 225 stp x29,x30,[sp,#-16]! 226 add x29,sp,#0 227 228 ldp $acc0,$acc1,[$ap] 229 ldp $t0,$t1,[$bp] 230 ldp $acc2,$acc3,[$ap,#16] 231 ldp $t2,$t3,[$bp,#16] 232 ldr $poly1,.Lpoly+8 233 ldr $poly3,.Lpoly+24 234 235 bl __ecp_nistz256_add 236 237 ldp x29,x30,[sp],#16 238 AARCH64_VALIDATE_LINK_REGISTER 239 ret 240.size ecp_nistz256_add,.-ecp_nistz256_add 241 242// void ecp_nistz256_div_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]); 243.globl ecp_nistz256_div_by_2 244.type ecp_nistz256_div_by_2,%function 245.align 4 246ecp_nistz256_div_by_2: 247 AARCH64_SIGN_LINK_REGISTER 248 stp x29,x30,[sp,#-16]! 249 add x29,sp,#0 250 251 ldp $acc0,$acc1,[$ap] 252 ldp $acc2,$acc3,[$ap,#16] 253 ldr $poly1,.Lpoly+8 254 ldr $poly3,.Lpoly+24 255 256 bl __ecp_nistz256_div_by_2 257 258 ldp x29,x30,[sp],#16 259 AARCH64_VALIDATE_LINK_REGISTER 260 ret 261.size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2 262 263// void ecp_nistz256_mul_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]); 264.globl ecp_nistz256_mul_by_2 265.type ecp_nistz256_mul_by_2,%function 266.align 4 267ecp_nistz256_mul_by_2: 268 AARCH64_SIGN_LINK_REGISTER 269 stp x29,x30,[sp,#-16]! 
270 add x29,sp,#0 271 272 ldp $acc0,$acc1,[$ap] 273 ldp $acc2,$acc3,[$ap,#16] 274 ldr $poly1,.Lpoly+8 275 ldr $poly3,.Lpoly+24 276 mov $t0,$acc0 277 mov $t1,$acc1 278 mov $t2,$acc2 279 mov $t3,$acc3 280 281 bl __ecp_nistz256_add // ret = a+a // 2*a 282 283 ldp x29,x30,[sp],#16 284 AARCH64_VALIDATE_LINK_REGISTER 285 ret 286.size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2 287 288// void ecp_nistz256_mul_by_3(BN_ULONG x0[4],const BN_ULONG x1[4]); 289.globl ecp_nistz256_mul_by_3 290.type ecp_nistz256_mul_by_3,%function 291.align 4 292ecp_nistz256_mul_by_3: 293 AARCH64_SIGN_LINK_REGISTER 294 stp x29,x30,[sp,#-16]! 295 add x29,sp,#0 296 297 ldp $acc0,$acc1,[$ap] 298 ldp $acc2,$acc3,[$ap,#16] 299 ldr $poly1,.Lpoly+8 300 ldr $poly3,.Lpoly+24 301 mov $t0,$acc0 302 mov $t1,$acc1 303 mov $t2,$acc2 304 mov $t3,$acc3 305 mov $a0,$acc0 306 mov $a1,$acc1 307 mov $a2,$acc2 308 mov $a3,$acc3 309 310 bl __ecp_nistz256_add // ret = a+a // 2*a 311 312 mov $t0,$a0 313 mov $t1,$a1 314 mov $t2,$a2 315 mov $t3,$a3 316 317 bl __ecp_nistz256_add // ret += a // 2*a+a=3*a 318 319 ldp x29,x30,[sp],#16 320 AARCH64_VALIDATE_LINK_REGISTER 321 ret 322.size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3 323 324// void ecp_nistz256_sub(BN_ULONG x0[4],const BN_ULONG x1[4], 325// const BN_ULONG x2[4]); 326.globl ecp_nistz256_sub 327.type ecp_nistz256_sub,%function 328.align 4 329ecp_nistz256_sub: 330 AARCH64_SIGN_LINK_REGISTER 331 stp x29,x30,[sp,#-16]! 332 add x29,sp,#0 333 334 ldp $acc0,$acc1,[$ap] 335 ldp $acc2,$acc3,[$ap,#16] 336 ldr $poly1,.Lpoly+8 337 ldr $poly3,.Lpoly+24 338 339 bl __ecp_nistz256_sub_from 340 341 ldp x29,x30,[sp],#16 342 AARCH64_VALIDATE_LINK_REGISTER 343 ret 344.size ecp_nistz256_sub,.-ecp_nistz256_sub 345 346// void ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]); 347.globl ecp_nistz256_neg 348.type ecp_nistz256_neg,%function 349.align 4 350ecp_nistz256_neg: 351 AARCH64_SIGN_LINK_REGISTER 352 stp x29,x30,[sp,#-16]! 
353 add x29,sp,#0 354 355 mov $bp,$ap 356 mov $acc0,xzr // a = 0 357 mov $acc1,xzr 358 mov $acc2,xzr 359 mov $acc3,xzr 360 ldr $poly1,.Lpoly+8 361 ldr $poly3,.Lpoly+24 362 363 bl __ecp_nistz256_sub_from 364 365 ldp x29,x30,[sp],#16 366 AARCH64_VALIDATE_LINK_REGISTER 367 ret 368.size ecp_nistz256_neg,.-ecp_nistz256_neg 369 370// note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded 371// to $a0-$a3 and b[0] - to $bi 372.type __ecp_nistz256_mul_mont,%function 373.align 4 374__ecp_nistz256_mul_mont: 375 mul $acc0,$a0,$bi // a[0]*b[0] 376 umulh $t0,$a0,$bi 377 378 mul $acc1,$a1,$bi // a[1]*b[0] 379 umulh $t1,$a1,$bi 380 381 mul $acc2,$a2,$bi // a[2]*b[0] 382 umulh $t2,$a2,$bi 383 384 mul $acc3,$a3,$bi // a[3]*b[0] 385 umulh $t3,$a3,$bi 386 ldr $bi,[$bp,#8] // b[1] 387 388 adds $acc1,$acc1,$t0 // accumulate high parts of multiplication 389 lsl $t0,$acc0,#32 390 adcs $acc2,$acc2,$t1 391 lsr $t1,$acc0,#32 392 adcs $acc3,$acc3,$t2 393 adc $acc4,xzr,$t3 394 mov $acc5,xzr 395___ 396for($i=1;$i<4;$i++) { 397 # Reduction iteration is normally performed by accumulating 398 # result of multiplication of modulus by "magic" digit [and 399 # omitting least significant word, which is guaranteed to 400 # be 0], but thanks to special form of modulus and "magic" 401 # digit being equal to least significant word, it can be 402 # performed with additions and subtractions alone. 
Indeed: 403 # 404 # ffff0001.00000000.0000ffff.ffffffff 405 # * abcdefgh 406 # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh 407 # 408 # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we 409 # rewrite above as: 410 # 411 # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh 412 # + abcdefgh.abcdefgh.0000abcd.efgh0000.00000000 413 # - 0000abcd.efgh0000.00000000.00000000.abcdefgh 414 # 415 # or marking redundant operations: 416 # 417 # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.-------- 418 # + abcdefgh.abcdefgh.0000abcd.efgh0000.-------- 419 # - 0000abcd.efgh0000.--------.--------.-------- 420 421$code.=<<___; 422 subs $t2,$acc0,$t0 // "*0xffff0001" 423 sbc $t3,$acc0,$t1 424 adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0] 425 mul $t0,$a0,$bi // lo(a[0]*b[i]) 426 adcs $acc1,$acc2,$t1 427 mul $t1,$a1,$bi // lo(a[1]*b[i]) 428 adcs $acc2,$acc3,$t2 // +=acc[0]*0xffff0001 429 mul $t2,$a2,$bi // lo(a[2]*b[i]) 430 adcs $acc3,$acc4,$t3 431 mul $t3,$a3,$bi // lo(a[3]*b[i]) 432 adc $acc4,$acc5,xzr 433 434 adds $acc0,$acc0,$t0 // accumulate low parts of multiplication 435 umulh $t0,$a0,$bi // hi(a[0]*b[i]) 436 adcs $acc1,$acc1,$t1 437 umulh $t1,$a1,$bi // hi(a[1]*b[i]) 438 adcs $acc2,$acc2,$t2 439 umulh $t2,$a2,$bi // hi(a[2]*b[i]) 440 adcs $acc3,$acc3,$t3 441 umulh $t3,$a3,$bi // hi(a[3]*b[i]) 442 adc $acc4,$acc4,xzr 443___ 444$code.=<<___ if ($i<3); 445 ldr $bi,[$bp,#8*($i+1)] // b[$i+1] 446___ 447$code.=<<___; 448 adds $acc1,$acc1,$t0 // accumulate high parts of multiplication 449 lsl $t0,$acc0,#32 450 adcs $acc2,$acc2,$t1 451 lsr $t1,$acc0,#32 452 adcs $acc3,$acc3,$t2 453 adcs $acc4,$acc4,$t3 454 adc $acc5,xzr,xzr 455___ 456} 457$code.=<<___; 458 // last reduction 459 subs $t2,$acc0,$t0 // "*0xffff0001" 460 sbc $t3,$acc0,$t1 461 adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0] 462 adcs $acc1,$acc2,$t1 463 adcs $acc2,$acc3,$t2 // +=acc[0]*0xffff0001 464 adcs $acc3,$acc4,$t3 465 adc $acc4,$acc5,xzr 466 467 adds $t0,$acc0,#1 // subs $t0,$acc0,#-1 // tmp = ret-modulus 468 sbcs 
$t1,$acc1,$poly1 469 sbcs $t2,$acc2,xzr 470 sbcs $t3,$acc3,$poly3 471 sbcs xzr,$acc4,xzr // did it borrow? 472 473 csel $acc0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus 474 csel $acc1,$acc1,$t1,lo 475 csel $acc2,$acc2,$t2,lo 476 stp $acc0,$acc1,[$rp] 477 csel $acc3,$acc3,$t3,lo 478 stp $acc2,$acc3,[$rp,#16] 479 480 ret 481.size __ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont 482 483// note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded 484// to $a0-$a3 485.type __ecp_nistz256_sqr_mont,%function 486.align 4 487__ecp_nistz256_sqr_mont: 488 // | | | | | |a1*a0| | 489 // | | | | |a2*a0| | | 490 // | |a3*a2|a3*a0| | | | 491 // | | | |a2*a1| | | | 492 // | | |a3*a1| | | | | 493 // *| | | | | | | | 2| 494 // +|a3*a3|a2*a2|a1*a1|a0*a0| 495 // |--+--+--+--+--+--+--+--| 496 // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx 497 // 498 // "can't overflow" below mark carrying into high part of 499 // multiplication result, which can't overflow, because it 500 // can never be all ones. 
501 502 mul $acc1,$a1,$a0 // a[1]*a[0] 503 umulh $t1,$a1,$a0 504 mul $acc2,$a2,$a0 // a[2]*a[0] 505 umulh $t2,$a2,$a0 506 mul $acc3,$a3,$a0 // a[3]*a[0] 507 umulh $acc4,$a3,$a0 508 509 adds $acc2,$acc2,$t1 // accumulate high parts of multiplication 510 mul $t0,$a2,$a1 // a[2]*a[1] 511 umulh $t1,$a2,$a1 512 adcs $acc3,$acc3,$t2 513 mul $t2,$a3,$a1 // a[3]*a[1] 514 umulh $t3,$a3,$a1 515 adc $acc4,$acc4,xzr // can't overflow 516 517 mul $acc5,$a3,$a2 // a[3]*a[2] 518 umulh $acc6,$a3,$a2 519 520 adds $t1,$t1,$t2 // accumulate high parts of multiplication 521 mul $acc0,$a0,$a0 // a[0]*a[0] 522 adc $t2,$t3,xzr // can't overflow 523 524 adds $acc3,$acc3,$t0 // accumulate low parts of multiplication 525 umulh $a0,$a0,$a0 526 adcs $acc4,$acc4,$t1 527 mul $t1,$a1,$a1 // a[1]*a[1] 528 adcs $acc5,$acc5,$t2 529 umulh $a1,$a1,$a1 530 adc $acc6,$acc6,xzr // can't overflow 531 532 adds $acc1,$acc1,$acc1 // acc[1-6]*=2 533 mul $t2,$a2,$a2 // a[2]*a[2] 534 adcs $acc2,$acc2,$acc2 535 umulh $a2,$a2,$a2 536 adcs $acc3,$acc3,$acc3 537 mul $t3,$a3,$a3 // a[3]*a[3] 538 adcs $acc4,$acc4,$acc4 539 umulh $a3,$a3,$a3 540 adcs $acc5,$acc5,$acc5 541 adcs $acc6,$acc6,$acc6 542 adc $acc7,xzr,xzr 543 544 adds $acc1,$acc1,$a0 // +a[i]*a[i] 545 adcs $acc2,$acc2,$t1 546 adcs $acc3,$acc3,$a1 547 adcs $acc4,$acc4,$t2 548 adcs $acc5,$acc5,$a2 549 lsl $t0,$acc0,#32 550 adcs $acc6,$acc6,$t3 551 lsr $t1,$acc0,#32 552 adc $acc7,$acc7,$a3 553___ 554for($i=0;$i<3;$i++) { # reductions, see commentary in 555 # multiplication for details 556$code.=<<___; 557 subs $t2,$acc0,$t0 // "*0xffff0001" 558 sbc $t3,$acc0,$t1 559 adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0] 560 adcs $acc1,$acc2,$t1 561 lsl $t0,$acc0,#32 562 adcs $acc2,$acc3,$t2 // +=acc[0]*0xffff0001 563 lsr $t1,$acc0,#32 564 adc $acc3,$t3,xzr // can't overflow 565___ 566} 567$code.=<<___; 568 subs $t2,$acc0,$t0 // "*0xffff0001" 569 sbc $t3,$acc0,$t1 570 adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0] 571 adcs $acc1,$acc2,$t1 572 adcs 
$acc2,$acc3,$t2 // +=acc[0]*0xffff0001 573 adc $acc3,$t3,xzr // can't overflow 574 575 adds $acc0,$acc0,$acc4 // accumulate upper half 576 adcs $acc1,$acc1,$acc5 577 adcs $acc2,$acc2,$acc6 578 adcs $acc3,$acc3,$acc7 579 adc $acc4,xzr,xzr 580 581 adds $t0,$acc0,#1 // subs $t0,$acc0,#-1 // tmp = ret-modulus 582 sbcs $t1,$acc1,$poly1 583 sbcs $t2,$acc2,xzr 584 sbcs $t3,$acc3,$poly3 585 sbcs xzr,$acc4,xzr // did it borrow? 586 587 csel $acc0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus 588 csel $acc1,$acc1,$t1,lo 589 csel $acc2,$acc2,$t2,lo 590 stp $acc0,$acc1,[$rp] 591 csel $acc3,$acc3,$t3,lo 592 stp $acc2,$acc3,[$rp,#16] 593 594 ret 595.size __ecp_nistz256_sqr_mont,.-__ecp_nistz256_sqr_mont 596 597// Note that __ecp_nistz256_add expects both input vectors pre-loaded to 598// $a0-$a3 and $t0-$t3. This is done because it's used in multiple 599// contexts, e.g. in multiplication by 2 and 3... 600.type __ecp_nistz256_add,%function 601.align 4 602__ecp_nistz256_add: 603 adds $acc0,$acc0,$t0 // ret = a+b 604 adcs $acc1,$acc1,$t1 605 adcs $acc2,$acc2,$t2 606 adcs $acc3,$acc3,$t3 607 adc $ap,xzr,xzr // zap $ap 608 609 adds $t0,$acc0,#1 // subs $t0,$a0,#-1 // tmp = ret-modulus 610 sbcs $t1,$acc1,$poly1 611 sbcs $t2,$acc2,xzr 612 sbcs $t3,$acc3,$poly3 613 sbcs xzr,$ap,xzr // did subtraction borrow? 614 615 csel $acc0,$acc0,$t0,lo // ret = borrow ? 
ret : ret-modulus 616 csel $acc1,$acc1,$t1,lo 617 csel $acc2,$acc2,$t2,lo 618 stp $acc0,$acc1,[$rp] 619 csel $acc3,$acc3,$t3,lo 620 stp $acc2,$acc3,[$rp,#16] 621 622 ret 623.size __ecp_nistz256_add,.-__ecp_nistz256_add 624 625.type __ecp_nistz256_sub_from,%function 626.align 4 627__ecp_nistz256_sub_from: 628 ldp $t0,$t1,[$bp] 629 ldp $t2,$t3,[$bp,#16] 630 subs $acc0,$acc0,$t0 // ret = a-b 631 sbcs $acc1,$acc1,$t1 632 sbcs $acc2,$acc2,$t2 633 sbcs $acc3,$acc3,$t3 634 sbc $ap,xzr,xzr // zap $ap 635 636 subs $t0,$acc0,#1 // adds $t0,$a0,#-1 // tmp = ret+modulus 637 adcs $t1,$acc1,$poly1 638 adcs $t2,$acc2,xzr 639 adc $t3,$acc3,$poly3 640 cmp $ap,xzr // did subtraction borrow? 641 642 csel $acc0,$acc0,$t0,eq // ret = borrow ? ret+modulus : ret 643 csel $acc1,$acc1,$t1,eq 644 csel $acc2,$acc2,$t2,eq 645 stp $acc0,$acc1,[$rp] 646 csel $acc3,$acc3,$t3,eq 647 stp $acc2,$acc3,[$rp,#16] 648 649 ret 650.size __ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from 651 652.type __ecp_nistz256_sub_morf,%function 653.align 4 654__ecp_nistz256_sub_morf: 655 ldp $t0,$t1,[$bp] 656 ldp $t2,$t3,[$bp,#16] 657 subs $acc0,$t0,$acc0 // ret = b-a 658 sbcs $acc1,$t1,$acc1 659 sbcs $acc2,$t2,$acc2 660 sbcs $acc3,$t3,$acc3 661 sbc $ap,xzr,xzr // zap $ap 662 663 subs $t0,$acc0,#1 // adds $t0,$a0,#-1 // tmp = ret+modulus 664 adcs $t1,$acc1,$poly1 665 adcs $t2,$acc2,xzr 666 adc $t3,$acc3,$poly3 667 cmp $ap,xzr // did subtraction borrow? 668 669 csel $acc0,$acc0,$t0,eq // ret = borrow ? ret+modulus : ret 670 csel $acc1,$acc1,$t1,eq 671 csel $acc2,$acc2,$t2,eq 672 stp $acc0,$acc1,[$rp] 673 csel $acc3,$acc3,$t3,eq 674 stp $acc2,$acc3,[$rp,#16] 675 676 ret 677.size __ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf 678 679.type __ecp_nistz256_div_by_2,%function 680.align 4 681__ecp_nistz256_div_by_2: 682 subs $t0,$acc0,#1 // adds $t0,$a0,#-1 // tmp = a+modulus 683 adcs $t1,$acc1,$poly1 684 adcs $t2,$acc2,xzr 685 adcs $t3,$acc3,$poly3 686 adc $ap,xzr,xzr // zap $ap 687 tst $acc0,#1 // is a even? 

	csel	$acc0,$acc0,$t0,eq	// ret = even ? a : a+modulus
	csel	$acc1,$acc1,$t1,eq
	csel	$acc2,$acc2,$t2,eq
	csel	$acc3,$acc3,$t3,eq
	csel	$ap,xzr,$ap,eq

	lsr	$acc0,$acc0,#1		// ret >>= 1
	orr	$acc0,$acc0,$acc1,lsl#63
	lsr	$acc1,$acc1,#1
	orr	$acc1,$acc1,$acc2,lsl#63
	lsr	$acc2,$acc2,#1
	orr	$acc2,$acc2,$acc3,lsl#63
	lsr	$acc3,$acc3,#1
	stp	$acc0,$acc1,[$rp]
	orr	$acc3,$acc3,$ap,lsl#63
	stp	$acc2,$acc3,[$rp,#16]

	ret
.size	__ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
___
########################################################################
# following subroutines are "literal" implementation of those found in
# ecp_nistz256.c
#
########################################################################
# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
#
{
# Frame-relative offsets of the four temporary 256-bit values used by
# point doubling: S at sp+0, M at sp+32, Zsqr at sp+64, tmp0 at sp+96.
my ($S,$M,$Zsqr,$tmp0)=map(32*$_,(0..3));
# above map() describes stack layout with 4 temporary
# 256-bit vectors on top.
# Callee-saved x21/x22 keep the caller's out/in pointers alive, because
# $rp/$ap are repointed at stack temporaries around the internal bl calls.
my ($rp_real,$ap_real) = map("x$_",(21,22));

$code.=<<___;
.globl	ecp_nistz256_point_double
.type	ecp_nistz256_point_double,%function
.align	5
ecp_nistz256_point_double:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-96]!
729 add x29,sp,#0 730 stp x19,x20,[sp,#16] 731 stp x21,x22,[sp,#32] 732 sub sp,sp,#32*4 733 734.Ldouble_shortcut: 735 ldp $acc0,$acc1,[$ap,#32] 736 mov $rp_real,$rp 737 ldp $acc2,$acc3,[$ap,#48] 738 mov $ap_real,$ap 739 ldr $poly1,.Lpoly+8 740 mov $t0,$acc0 741 ldr $poly3,.Lpoly+24 742 mov $t1,$acc1 743 ldp $a0,$a1,[$ap_real,#64] // forward load for p256_sqr_mont 744 mov $t2,$acc2 745 mov $t3,$acc3 746 ldp $a2,$a3,[$ap_real,#64+16] 747 add $rp,sp,#$S 748 bl __ecp_nistz256_add // p256_mul_by_2(S, in_y); 749 750 add $rp,sp,#$Zsqr 751 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Zsqr, in_z); 752 753 ldp $t0,$t1,[$ap_real] 754 ldp $t2,$t3,[$ap_real,#16] 755 mov $a0,$acc0 // put Zsqr aside for p256_sub 756 mov $a1,$acc1 757 mov $a2,$acc2 758 mov $a3,$acc3 759 add $rp,sp,#$M 760 bl __ecp_nistz256_add // p256_add(M, Zsqr, in_x); 761 762 add $bp,$ap_real,#0 763 mov $acc0,$a0 // restore Zsqr 764 mov $acc1,$a1 765 ldp $a0,$a1,[sp,#$S] // forward load for p256_sqr_mont 766 mov $acc2,$a2 767 mov $acc3,$a3 768 ldp $a2,$a3,[sp,#$S+16] 769 add $rp,sp,#$Zsqr 770 bl __ecp_nistz256_sub_morf // p256_sub(Zsqr, in_x, Zsqr); 771 772 add $rp,sp,#$S 773 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(S, S); 774 775 ldr $bi,[$ap_real,#32] 776 ldp $a0,$a1,[$ap_real,#64] 777 ldp $a2,$a3,[$ap_real,#64+16] 778 add $bp,$ap_real,#32 779 add $rp,sp,#$tmp0 780 bl __ecp_nistz256_mul_mont // p256_mul_mont(tmp0, in_z, in_y); 781 782 mov $t0,$acc0 783 mov $t1,$acc1 784 ldp $a0,$a1,[sp,#$S] // forward load for p256_sqr_mont 785 mov $t2,$acc2 786 mov $t3,$acc3 787 ldp $a2,$a3,[sp,#$S+16] 788 add $rp,$rp_real,#64 789 bl __ecp_nistz256_add // p256_mul_by_2(res_z, tmp0); 790 791 add $rp,sp,#$tmp0 792 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(tmp0, S); 793 794 ldr $bi,[sp,#$Zsqr] // forward load for p256_mul_mont 795 ldp $a0,$a1,[sp,#$M] 796 ldp $a2,$a3,[sp,#$M+16] 797 add $rp,$rp_real,#32 798 bl __ecp_nistz256_div_by_2 // p256_div_by_2(res_y, tmp0); 799 800 add $bp,sp,#$Zsqr 801 add $rp,sp,#$M 802 bl 
__ecp_nistz256_mul_mont // p256_mul_mont(M, M, Zsqr); 803 804 mov $t0,$acc0 // duplicate M 805 mov $t1,$acc1 806 mov $t2,$acc2 807 mov $t3,$acc3 808 mov $a0,$acc0 // put M aside 809 mov $a1,$acc1 810 mov $a2,$acc2 811 mov $a3,$acc3 812 add $rp,sp,#$M 813 bl __ecp_nistz256_add 814 mov $t0,$a0 // restore M 815 mov $t1,$a1 816 ldr $bi,[$ap_real] // forward load for p256_mul_mont 817 mov $t2,$a2 818 ldp $a0,$a1,[sp,#$S] 819 mov $t3,$a3 820 ldp $a2,$a3,[sp,#$S+16] 821 bl __ecp_nistz256_add // p256_mul_by_3(M, M); 822 823 add $bp,$ap_real,#0 824 add $rp,sp,#$S 825 bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, in_x); 826 827 mov $t0,$acc0 828 mov $t1,$acc1 829 ldp $a0,$a1,[sp,#$M] // forward load for p256_sqr_mont 830 mov $t2,$acc2 831 mov $t3,$acc3 832 ldp $a2,$a3,[sp,#$M+16] 833 add $rp,sp,#$tmp0 834 bl __ecp_nistz256_add // p256_mul_by_2(tmp0, S); 835 836 add $rp,$rp_real,#0 837 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(res_x, M); 838 839 add $bp,sp,#$tmp0 840 bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, tmp0); 841 842 add $bp,sp,#$S 843 add $rp,sp,#$S 844 bl __ecp_nistz256_sub_morf // p256_sub(S, S, res_x); 845 846 ldr $bi,[sp,#$M] 847 mov $a0,$acc0 // copy S 848 mov $a1,$acc1 849 mov $a2,$acc2 850 mov $a3,$acc3 851 add $bp,sp,#$M 852 bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, M); 853 854 add $bp,$rp_real,#32 855 add $rp,$rp_real,#32 856 bl __ecp_nistz256_sub_from // p256_sub(res_y, S, res_y); 857 858 add sp,x29,#0 // destroy frame 859 ldp x19,x20,[x29,#16] 860 ldp x21,x22,[x29,#32] 861 ldp x29,x30,[sp],#96 862 AARCH64_VALIDATE_LINK_REGISTER 863 ret 864.size ecp_nistz256_point_double,.-ecp_nistz256_point_double 865___ 866} 867 868######################################################################## 869# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1, 870# const P256_POINT *in2); 871{ 872my ($res_x,$res_y,$res_z, 873 $H,$Hsqr,$R,$Rsqr,$Hcub, 874 $U1,$U2,$S1,$S2)=map(32*$_,(0..11)); 875my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr); 876# 
above map() describes stack layout with 12 temporary 877# 256-bit vectors on top. 878my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp0,$temp1,$temp2)=map("x$_",(21..28)); 879 880$code.=<<___; 881.globl ecp_nistz256_point_add 882.type ecp_nistz256_point_add,%function 883.align 5 884ecp_nistz256_point_add: 885 AARCH64_SIGN_LINK_REGISTER 886 stp x29,x30,[sp,#-96]! 887 add x29,sp,#0 888 stp x19,x20,[sp,#16] 889 stp x21,x22,[sp,#32] 890 stp x23,x24,[sp,#48] 891 stp x25,x26,[sp,#64] 892 stp x27,x28,[sp,#80] 893 sub sp,sp,#32*12 894 895 ldp $a0,$a1,[$bp,#64] // in2_z 896 ldp $a2,$a3,[$bp,#64+16] 897 mov $rp_real,$rp 898 mov $ap_real,$ap 899 mov $bp_real,$bp 900 ldr $poly1,.Lpoly+8 901 ldr $poly3,.Lpoly+24 902 orr $t0,$a0,$a1 903 orr $t2,$a2,$a3 904 orr $in2infty,$t0,$t2 905 cmp $in2infty,#0 906 csetm $in2infty,ne // ~in2infty 907 add $rp,sp,#$Z2sqr 908 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z2sqr, in2_z); 909 910 ldp $a0,$a1,[$ap_real,#64] // in1_z 911 ldp $a2,$a3,[$ap_real,#64+16] 912 orr $t0,$a0,$a1 913 orr $t2,$a2,$a3 914 orr $in1infty,$t0,$t2 915 cmp $in1infty,#0 916 csetm $in1infty,ne // ~in1infty 917 add $rp,sp,#$Z1sqr 918 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z); 919 920 ldr $bi,[$bp_real,#64] 921 ldp $a0,$a1,[sp,#$Z2sqr] 922 ldp $a2,$a3,[sp,#$Z2sqr+16] 923 add $bp,$bp_real,#64 924 add $rp,sp,#$S1 925 bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, Z2sqr, in2_z); 926 927 ldr $bi,[$ap_real,#64] 928 ldp $a0,$a1,[sp,#$Z1sqr] 929 ldp $a2,$a3,[sp,#$Z1sqr+16] 930 add $bp,$ap_real,#64 931 add $rp,sp,#$S2 932 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z); 933 934 ldr $bi,[$ap_real,#32] 935 ldp $a0,$a1,[sp,#$S1] 936 ldp $a2,$a3,[sp,#$S1+16] 937 add $bp,$ap_real,#32 938 add $rp,sp,#$S1 939 bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, S1, in1_y); 940 941 ldr $bi,[$bp_real,#32] 942 ldp $a0,$a1,[sp,#$S2] 943 ldp $a2,$a3,[sp,#$S2+16] 944 add $bp,$bp_real,#32 945 add $rp,sp,#$S2 946 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, 
S2, in2_y); 947 948 add $bp,sp,#$S1 949 ldr $bi,[sp,#$Z2sqr] // forward load for p256_mul_mont 950 ldp $a0,$a1,[$ap_real] 951 ldp $a2,$a3,[$ap_real,#16] 952 add $rp,sp,#$R 953 bl __ecp_nistz256_sub_from // p256_sub(R, S2, S1); 954 955 orr $acc0,$acc0,$acc1 // see if result is zero 956 orr $acc2,$acc2,$acc3 957 orr $temp0,$acc0,$acc2 // ~is_equal(S1,S2) 958 959 add $bp,sp,#$Z2sqr 960 add $rp,sp,#$U1 961 bl __ecp_nistz256_mul_mont // p256_mul_mont(U1, in1_x, Z2sqr); 962 963 ldr $bi,[sp,#$Z1sqr] 964 ldp $a0,$a1,[$bp_real] 965 ldp $a2,$a3,[$bp_real,#16] 966 add $bp,sp,#$Z1sqr 967 add $rp,sp,#$U2 968 bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in2_x, Z1sqr); 969 970 add $bp,sp,#$U1 971 ldp $a0,$a1,[sp,#$R] // forward load for p256_sqr_mont 972 ldp $a2,$a3,[sp,#$R+16] 973 add $rp,sp,#$H 974 bl __ecp_nistz256_sub_from // p256_sub(H, U2, U1); 975 976 orr $acc0,$acc0,$acc1 // see if result is zero 977 orr $acc2,$acc2,$acc3 978 orr $acc0,$acc0,$acc2 // ~is_equal(U1,U2) 979 980 mvn $temp1,$in1infty // -1/0 -> 0/-1 981 mvn $temp2,$in2infty // -1/0 -> 0/-1 982 orr $acc0,$acc0,$temp1 983 orr $acc0,$acc0,$temp2 984 orr $acc0,$acc0,$temp0 985 cbnz $acc0,.Ladd_proceed // if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2)) 986 987.Ladd_double: 988 mov $ap,$ap_real 989 mov $rp,$rp_real 990 ldp x23,x24,[x29,#48] 991 ldp x25,x26,[x29,#64] 992 ldp x27,x28,[x29,#80] 993 add sp,sp,#32*(12-4) // difference in stack frames 994 b .Ldouble_shortcut 995 996.align 4 997.Ladd_proceed: 998 add $rp,sp,#$Rsqr 999 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R); 1000 1001 ldr $bi,[$ap_real,#64] 1002 ldp $a0,$a1,[sp,#$H] 1003 ldp $a2,$a3,[sp,#$H+16] 1004 add $bp,$ap_real,#64 1005 add $rp,sp,#$res_z 1006 bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z); 1007 1008 ldp $a0,$a1,[sp,#$H] 1009 ldp $a2,$a3,[sp,#$H+16] 1010 add $rp,sp,#$Hsqr 1011 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H); 1012 1013 ldr $bi,[$bp_real,#64] 1014 ldp $a0,$a1,[sp,#$res_z] 1015 ldp 
$a2,$a3,[sp,#$res_z+16] 1016 add $bp,$bp_real,#64 1017 add $rp,sp,#$res_z 1018 bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, res_z, in2_z); 1019 1020 ldr $bi,[sp,#$H] 1021 ldp $a0,$a1,[sp,#$Hsqr] 1022 ldp $a2,$a3,[sp,#$Hsqr+16] 1023 add $bp,sp,#$H 1024 add $rp,sp,#$Hcub 1025 bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H); 1026 1027 ldr $bi,[sp,#$Hsqr] 1028 ldp $a0,$a1,[sp,#$U1] 1029 ldp $a2,$a3,[sp,#$U1+16] 1030 add $bp,sp,#$Hsqr 1031 add $rp,sp,#$U2 1032 bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, U1, Hsqr); 1033 1034 mov $t0,$acc0 1035 mov $t1,$acc1 1036 mov $t2,$acc2 1037 mov $t3,$acc3 1038 add $rp,sp,#$Hsqr 1039 bl __ecp_nistz256_add // p256_mul_by_2(Hsqr, U2); 1040 1041 add $bp,sp,#$Rsqr 1042 add $rp,sp,#$res_x 1043 bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr); 1044 1045 add $bp,sp,#$Hcub 1046 bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub); 1047 1048 add $bp,sp,#$U2 1049 ldr $bi,[sp,#$Hcub] // forward load for p256_mul_mont 1050 ldp $a0,$a1,[sp,#$S1] 1051 ldp $a2,$a3,[sp,#$S1+16] 1052 add $rp,sp,#$res_y 1053 bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x); 1054 1055 add $bp,sp,#$Hcub 1056 add $rp,sp,#$S2 1057 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S1, Hcub); 1058 1059 ldr $bi,[sp,#$R] 1060 ldp $a0,$a1,[sp,#$res_y] 1061 ldp $a2,$a3,[sp,#$res_y+16] 1062 add $bp,sp,#$R 1063 add $rp,sp,#$res_y 1064 bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R); 1065 1066 add $bp,sp,#$S2 1067 bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2); 1068 1069 ldp $a0,$a1,[sp,#$res_x] // res 1070 ldp $a2,$a3,[sp,#$res_x+16] 1071 ldp $t0,$t1,[$bp_real] // in2 1072 ldp $t2,$t3,[$bp_real,#16] 1073___ 1074for($i=0;$i<64;$i+=32) { # conditional moves 1075$code.=<<___; 1076 ldp $acc0,$acc1,[$ap_real,#$i] // in1 1077 cmp $in1infty,#0 // ~$in1intfy, remember? 
1078 ldp $acc2,$acc3,[$ap_real,#$i+16] 1079 csel $t0,$a0,$t0,ne 1080 csel $t1,$a1,$t1,ne 1081 ldp $a0,$a1,[sp,#$res_x+$i+32] // res 1082 csel $t2,$a2,$t2,ne 1083 csel $t3,$a3,$t3,ne 1084 cmp $in2infty,#0 // ~$in2intfy, remember? 1085 ldp $a2,$a3,[sp,#$res_x+$i+48] 1086 csel $acc0,$t0,$acc0,ne 1087 csel $acc1,$t1,$acc1,ne 1088 ldp $t0,$t1,[$bp_real,#$i+32] // in2 1089 csel $acc2,$t2,$acc2,ne 1090 csel $acc3,$t3,$acc3,ne 1091 ldp $t2,$t3,[$bp_real,#$i+48] 1092 stp $acc0,$acc1,[$rp_real,#$i] 1093 stp $acc2,$acc3,[$rp_real,#$i+16] 1094___ 1095} 1096$code.=<<___; 1097 ldp $acc0,$acc1,[$ap_real,#$i] // in1 1098 cmp $in1infty,#0 // ~$in1intfy, remember? 1099 ldp $acc2,$acc3,[$ap_real,#$i+16] 1100 csel $t0,$a0,$t0,ne 1101 csel $t1,$a1,$t1,ne 1102 csel $t2,$a2,$t2,ne 1103 csel $t3,$a3,$t3,ne 1104 cmp $in2infty,#0 // ~$in2intfy, remember? 1105 csel $acc0,$t0,$acc0,ne 1106 csel $acc1,$t1,$acc1,ne 1107 csel $acc2,$t2,$acc2,ne 1108 csel $acc3,$t3,$acc3,ne 1109 stp $acc0,$acc1,[$rp_real,#$i] 1110 stp $acc2,$acc3,[$rp_real,#$i+16] 1111 1112.Ladd_done: 1113 add sp,x29,#0 // destroy frame 1114 ldp x19,x20,[x29,#16] 1115 ldp x21,x22,[x29,#32] 1116 ldp x23,x24,[x29,#48] 1117 ldp x25,x26,[x29,#64] 1118 ldp x27,x28,[x29,#80] 1119 ldp x29,x30,[sp],#96 1120 AARCH64_VALIDATE_LINK_REGISTER 1121 ret 1122.size ecp_nistz256_point_add,.-ecp_nistz256_point_add 1123___ 1124} 1125 1126######################################################################## 1127# void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1, 1128# const P256_POINT_AFFINE *in2); 1129{ 1130my ($res_x,$res_y,$res_z, 1131 $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..9)); 1132my $Z1sqr = $S2; 1133# above map() describes stack layout with 10 temporary 1134# 256-bit vectors on top. 
# ecp_nistz256_point_add_affine: out = in1 + in2 where in2 is an affine
# point (x,y only; its Z coordinate is implicitly 1).  The callee-saved
# registers x21..x26 carry the three pointers and the two "not at
# infinity" all-ones masks across the __ecp_nistz256_* subroutine calls.
my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("x$_",(21..26));

$code.=<<___;
.globl	ecp_nistz256_point_add_affine
.type	ecp_nistz256_point_add_affine,%function
.align	5
ecp_nistz256_point_add_affine:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	sub	sp,sp,#32*10

	mov	$rp_real,$rp
	mov	$ap_real,$ap
	mov	$bp_real,$bp
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24

	ldp	$a0,$a1,[$ap,#64]	// in1_z
	ldp	$a2,$a3,[$ap,#64+16]
	orr	$t0,$a0,$a1
	orr	$t2,$a2,$a3
	orr	$in1infty,$t0,$t2
	cmp	$in1infty,#0
	csetm	$in1infty,ne		// ~in1infty

	ldp	$acc0,$acc1,[$bp]	// in2_x
	ldp	$acc2,$acc3,[$bp,#16]
	ldp	$t0,$t1,[$bp,#32]	// in2_y
	ldp	$t2,$t3,[$bp,#48]
	orr	$acc0,$acc0,$acc1
	orr	$acc2,$acc2,$acc3
	orr	$t0,$t0,$t1
	orr	$t2,$t2,$t3
	orr	$acc0,$acc0,$acc2
	orr	$t0,$t0,$t2
	orr	$in2infty,$acc0,$t0
	cmp	$in2infty,#0
	csetm	$in2infty,ne		// ~in2infty

	add	$rp,sp,#$Z1sqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z1sqr, in1_z);

	mov	$a0,$acc0
	mov	$a1,$acc1
	mov	$a2,$acc2
	mov	$a3,$acc3
	ldr	$bi,[$bp_real]
	add	$bp,$bp_real,#0
	add	$rp,sp,#$U2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, Z1sqr, in2_x);

	add	$bp,$ap_real,#0
	ldr	$bi,[$ap_real,#64]	// forward load for p256_mul_mont
	ldp	$a0,$a1,[sp,#$Z1sqr]
	ldp	$a2,$a3,[sp,#$Z1sqr+16]
	add	$rp,sp,#$H
	bl	__ecp_nistz256_sub_from	// p256_sub(H, U2, in1_x);

	add	$bp,$ap_real,#64
	add	$rp,sp,#$S2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, Z1sqr, in1_z);

	ldr	$bi,[$ap_real,#64]
	ldp	$a0,$a1,[sp,#$H]
	ldp	$a2,$a3,[sp,#$H+16]
	add	$bp,$ap_real,#64
	add	$rp,sp,#$res_z
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, H, in1_z);

	ldr	$bi,[$bp_real,#32]
	ldp	$a0,$a1,[sp,#$S2]
	ldp	$a2,$a3,[sp,#$S2+16]
	add	$bp,$bp_real,#32
	add	$rp,sp,#$S2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S2, in2_y);

	add	$bp,$ap_real,#32
	ldp	$a0,$a1,[sp,#$H]	// forward load for p256_sqr_mont
	ldp	$a2,$a3,[sp,#$H+16]
	add	$rp,sp,#$R
	bl	__ecp_nistz256_sub_from	// p256_sub(R, S2, in1_y);

	add	$rp,sp,#$Hsqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Hsqr, H);

	ldp	$a0,$a1,[sp,#$R]
	ldp	$a2,$a3,[sp,#$R+16]
	add	$rp,sp,#$Rsqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Rsqr, R);

	ldr	$bi,[sp,#$H]
	ldp	$a0,$a1,[sp,#$Hsqr]
	ldp	$a2,$a3,[sp,#$Hsqr+16]
	add	$bp,sp,#$H
	add	$rp,sp,#$Hcub
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(Hcub, Hsqr, H);

	ldr	$bi,[$ap_real]
	ldp	$a0,$a1,[sp,#$Hsqr]
	ldp	$a2,$a3,[sp,#$Hsqr+16]
	add	$bp,$ap_real,#0
	add	$rp,sp,#$U2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, in1_x, Hsqr);

	mov	$t0,$acc0
	mov	$t1,$acc1
	mov	$t2,$acc2
	mov	$t3,$acc3
	add	$rp,sp,#$Hsqr
	bl	__ecp_nistz256_add	// p256_mul_by_2(Hsqr, U2);

	add	$bp,sp,#$Rsqr
	add	$rp,sp,#$res_x
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_x, Rsqr, Hsqr);

	add	$bp,sp,#$Hcub
	bl	__ecp_nistz256_sub_from	// p256_sub(res_x, res_x, Hcub);

	add	$bp,sp,#$U2
	ldr	$bi,[$ap_real,#32]	// forward load for p256_mul_mont
	ldp	$a0,$a1,[sp,#$Hcub]
	ldp	$a2,$a3,[sp,#$Hcub+16]
	add	$rp,sp,#$res_y
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_y, U2, res_x);

	add	$bp,$ap_real,#32
	add	$rp,sp,#$S2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, in1_y, Hcub);

	ldr	$bi,[sp,#$R]
	ldp	$a0,$a1,[sp,#$res_y]
	ldp	$a2,$a3,[sp,#$res_y+16]
	add	$bp,sp,#$R
	add	$rp,sp,#$res_y
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_y, res_y, R);

	add	$bp,sp,#$S2
	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, res_y, S2);

	ldp	$a0,$a1,[sp,#$res_x]		// res
	ldp	$a2,$a3,[sp,#$res_x+16]
	ldp	$t0,$t1,[$bp_real]		// in2
	ldp	$t2,$t3,[$bp_real,#16]
___
# Constant-time result selection: for each 32-byte chunk of the output
# point, pick res, in1 or in2 according to the all-ones "not at infinity"
# masks computed in the prologue (csetm ne => mask is all ones when the
# corresponding input is NOT the point at infinity).
for($i=0;$i<64;$i+=32) {		# conditional moves
$code.=<<___;
	ldp	$acc0,$acc1,[$ap_real,#$i]	// in1
	cmp	$in1infty,#0			// ~$in1intfy, remember?
	ldp	$acc2,$acc3,[$ap_real,#$i+16]
	csel	$t0,$a0,$t0,ne
	csel	$t1,$a1,$t1,ne
	ldp	$a0,$a1,[sp,#$res_x+$i+32]	// res
	csel	$t2,$a2,$t2,ne
	csel	$t3,$a3,$t3,ne
	cmp	$in2infty,#0			// ~$in2intfy, remember?
	ldp	$a2,$a3,[sp,#$res_x+$i+48]
	csel	$acc0,$t0,$acc0,ne
	csel	$acc1,$t1,$acc1,ne
	ldp	$t0,$t1,[$bp_real,#$i+32]	// in2
	csel	$acc2,$t2,$acc2,ne
	csel	$acc3,$t3,$acc3,ne
	ldp	$t2,$t3,[$bp_real,#$i+48]
	stp	$acc0,$acc1,[$rp_real,#$i]
	stp	$acc2,$acc3,[$rp_real,#$i+16]
___
# After the first (x) chunk, re-point $bp_real at .Lone_mont-64 so the
# "in2" words selected for the z coordinate are read from the stored
# constant rather than from the affine input, which carries no z.
# NOTE(review): presumably .Lone_mont is 1 in Montgomery form - confirm
# against its definition earlier in this file.
$code.=<<___	if ($i == 0);
	adr	$bp_real,.Lone_mont-64
___
}
# Third (z) chunk, with in2's z supplied via the redirected $bp_real.
$code.=<<___;
	ldp	$acc0,$acc1,[$ap_real,#$i]	// in1
	cmp	$in1infty,#0			// ~$in1intfy, remember?
	ldp	$acc2,$acc3,[$ap_real,#$i+16]
	csel	$t0,$a0,$t0,ne
	csel	$t1,$a1,$t1,ne
	csel	$t2,$a2,$t2,ne
	csel	$t3,$a3,$t3,ne
	cmp	$in2infty,#0			// ~$in2intfy, remember?
	csel	$acc0,$t0,$acc0,ne
	csel	$acc1,$t1,$acc1,ne
	csel	$acc2,$t2,$acc2,ne
	csel	$acc3,$t3,$acc3,ne
	stp	$acc0,$acc1,[$rp_real,#$i]
	stp	$acc2,$acc3,[$rp_real,#$i+16]

	add	sp,x29,#0		// destroy frame
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x29,x30,[sp],#80
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
___
}
# Scalar arithmetic modulo the group order.  $poly1/$poly3 are re-purposed
# as the first two order words; .Lord is expected to hold the four order
# words followed by the Montgomery constant loaded into $ordk.
if (1) {
my ($ord0,$ord1) = ($poly1,$poly3);
my ($ord2,$ord3,$ordk,$t4) = map("x$_",(21..24));
my $acc7 = $bi;		# $bi is free once b[i] has been consumed

$code.=<<___;
////////////////////////////////////////////////////////////////////////
// void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4],
//                                uint64_t b[4]);
.globl	ecp_nistz256_ord_mul_mont
.type	ecp_nistz256_ord_mul_mont,%function
.align	4
ecp_nistz256_ord_mul_mont:
	AARCH64_VALID_CALL_TARGET
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	adr	$ordk,.Lord
	ldr	$bi,[$bp]		// bp[0]
	ldp	$a0,$a1,[$ap]
	ldp	$a2,$a3,[$ap,#16]

	ldp	$ord0,$ord1,[$ordk,#0]
	ldp	$ord2,$ord3,[$ordk,#16]
	ldr	$ordk,[$ordk,#32]

	mul	$acc0,$a0,$bi		// a[0]*b[0]
	umulh	$t0,$a0,$bi

	mul	$acc1,$a1,$bi		// a[1]*b[0]
	umulh	$t1,$a1,$bi

	mul	$acc2,$a2,$bi		// a[2]*b[0]
	umulh	$t2,$a2,$bi

	mul	$acc3,$a3,$bi		// a[3]*b[0]
	umulh	$acc4,$a3,$bi

	mul	$t4,$acc0,$ordk

	adds	$acc1,$acc1,$t0		// accumulate high parts of multiplication
	adcs	$acc2,$acc2,$t1
	adcs	$acc3,$acc3,$t2
	adc	$acc4,$acc4,xzr
	mov	$acc5,xzr
___
# Three interleaved multiply-and-reduce rounds; the reduction exploits the
# ff..ff pattern in the top words of the order as described below.
for ($i=1;$i<4;$i++) {
	################################################################
	#            ffff0000.ffffffff.yyyyyyyy.zzzzzzzz
	# *                                     abcdefgh
	# + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
	#
	# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
	# rewrite above as:
	#
	#   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
	# - 0000abcd.efgh0000.abcdefgh.00000000.00000000
	# + abcdefgh.abcdefgh.yzayzbyz.cyzdyzey.zfyzgyzh
$code.=<<___;
	ldr	$bi,[$bp,#8*$i]		// b[i]

	lsl	$t0,$t4,#32
	subs	$acc2,$acc2,$t4
	lsr	$t1,$t4,#32
	sbcs	$acc3,$acc3,$t0
	sbcs	$acc4,$acc4,$t1
	sbc	$acc5,$acc5,xzr

	subs	xzr,$acc0,#1
	umulh	$t1,$ord0,$t4
	mul	$t2,$ord1,$t4
	umulh	$t3,$ord1,$t4

	adcs	$t2,$t2,$t1
	mul	$t0,$a0,$bi
	adc	$t3,$t3,xzr
	mul	$t1,$a1,$bi

	adds	$acc0,$acc1,$t2
	mul	$t2,$a2,$bi
	adcs	$acc1,$acc2,$t3
	mul	$t3,$a3,$bi
	adcs	$acc2,$acc3,$t4
	adcs	$acc3,$acc4,$t4
	adc	$acc4,$acc5,xzr

	adds	$acc0,$acc0,$t0		// accumulate low parts
	umulh	$t0,$a0,$bi
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$a1,$bi
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$a2,$bi
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$a3,$bi
	adc	$acc4,$acc4,xzr
	mul	$t4,$acc0,$ordk
	adds	$acc1,$acc1,$t0		// accumulate high parts
	adcs	$acc2,$acc2,$t1
	adcs	$acc3,$acc3,$t2
	adcs	$acc4,$acc4,$t3
	adc	$acc5,xzr,xzr
___
}
# Last reduction round, then a single conditional subtraction of the
# modulus selected branchlessly on the borrow flag.
$code.=<<___;
	lsl	$t0,$t4,#32		// last reduction
	subs	$acc2,$acc2,$t4
	lsr	$t1,$t4,#32
	sbcs	$acc3,$acc3,$t0
	sbcs	$acc4,$acc4,$t1
	sbc	$acc5,$acc5,xzr

	subs	xzr,$acc0,#1
	umulh	$t1,$ord0,$t4
	mul	$t2,$ord1,$t4
	umulh	$t3,$ord1,$t4

	adcs	$t2,$t2,$t1
	adc	$t3,$t3,xzr

	adds	$acc0,$acc1,$t2
	adcs	$acc1,$acc2,$t3
	adcs	$acc2,$acc3,$t4
	adcs	$acc3,$acc4,$t4
	adc	$acc4,$acc5,xzr

	subs	$t0,$acc0,$ord0		// ret -= modulus
	sbcs	$t1,$acc1,$ord1
	sbcs	$t2,$acc2,$ord2
	sbcs	$t3,$acc3,$ord3
	sbcs	xzr,$acc4,xzr

	csel	$acc0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
	csel	$acc1,$acc1,$t1,lo
	csel	$acc2,$acc2,$t2,lo
	stp	$acc0,$acc1,[$rp]
	csel	$acc3,$acc3,$t3,lo
	stp	$acc2,$acc3,[$rp,#16]

	ldp	x19,x20,[sp,#16]
	ldp	x21,x22,[sp,#32]
	ldp	x23,x24,[sp,#48]
	ldr	x29,[sp],#64
	ret
.size	ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont

////////////////////////////////////////////////////////////////////////
// void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4],
//                                uint64_t rep);
.globl	ecp_nistz256_ord_sqr_mont
.type	ecp_nistz256_ord_sqr_mont,%function
.align	4
ecp_nistz256_ord_sqr_mont:
	AARCH64_VALID_CALL_TARGET
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	adr	$ordk,.Lord
	ldp	$a0,$a1,[$ap]
	ldp	$a2,$a3,[$ap,#16]

	ldp	$ord0,$ord1,[$ordk,#0]
	ldp	$ord2,$ord3,[$ordk,#16]
	ldr	$ordk,[$ordk,#32]
	b	.Loop_ord_sqr

.align	4
.Loop_ord_sqr:
	sub	$bp,$bp,#1
	////////////////////////////////////////////////////////////////
	//  |  |  |  |  |  |a1*a0|  |
	//  |  |  |  |  |a2*a0|  |  |
	//  |  |a3*a2|a3*a0|  |  |  |
	//  |  |  |  |a2*a1|  |  |  |
	//  |  |  |a3*a1|  |  |  |  |
	// *|  |  |  |  |  |  |  | 2|
	// +|a3*a3|a2*a2|a1*a1|a0*a0|
	//  |--+--+--+--+--+--+--+--|
	//  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
	//
	//  "can't overflow" below mark carrying into high part of
	//  multiplication result, which can't overflow, because it
	//  can never be all ones.

	mul	$acc1,$a1,$a0		// a[1]*a[0]
	umulh	$t1,$a1,$a0
	mul	$acc2,$a2,$a0		// a[2]*a[0]
	umulh	$t2,$a2,$a0
	mul	$acc3,$a3,$a0		// a[3]*a[0]
	umulh	$acc4,$a3,$a0

	adds	$acc2,$acc2,$t1		// accumulate high parts of multiplication
	mul	$t0,$a2,$a1		// a[2]*a[1]
	umulh	$t1,$a2,$a1
	adcs	$acc3,$acc3,$t2
	mul	$t2,$a3,$a1		// a[3]*a[1]
	umulh	$t3,$a3,$a1
	adc	$acc4,$acc4,xzr		// can't overflow

	mul	$acc5,$a3,$a2		// a[3]*a[2]
	umulh	$acc6,$a3,$a2

	adds	$t1,$t1,$t2		// accumulate high parts of multiplication
	mul	$acc0,$a0,$a0		// a[0]*a[0]
	adc	$t2,$t3,xzr		// can't overflow

	adds	$acc3,$acc3,$t0		// accumulate low parts of multiplication
	umulh	$a0,$a0,$a0
	adcs	$acc4,$acc4,$t1
	mul	$t1,$a1,$a1		// a[1]*a[1]
	adcs	$acc5,$acc5,$t2
	umulh	$a1,$a1,$a1
	adc	$acc6,$acc6,xzr		// can't overflow

	adds	$acc1,$acc1,$acc1	// acc[1-6]*=2
	mul	$t2,$a2,$a2		// a[2]*a[2]
	adcs	$acc2,$acc2,$acc2
	umulh	$a2,$a2,$a2
	adcs	$acc3,$acc3,$acc3
	mul	$t3,$a3,$a3		// a[3]*a[3]
	adcs	$acc4,$acc4,$acc4
	umulh	$a3,$a3,$a3
	adcs	$acc5,$acc5,$acc5
	adcs	$acc6,$acc6,$acc6
	adc	$acc7,xzr,xzr

	adds	$acc1,$acc1,$a0		// +a[i]*a[i]
	mul	$t4,$acc0,$ordk
	adcs	$acc2,$acc2,$t1
	adcs	$acc3,$acc3,$a1
	adcs	$acc4,$acc4,$t2
	adcs	$acc5,$acc5,$a2
	adcs	$acc6,$acc6,$t3
	adc	$acc7,$acc7,$a3
___
# Four Montgomery reduction rounds over the low half of the 512-bit
# square; each round folds one word using the factor in $t4.
for($i=0; $i<4; $i++) {			# reductions
$code.=<<___;
	subs	xzr,$acc0,#1
	umulh	$t1,$ord0,$t4
	mul	$t2,$ord1,$t4
	umulh	$t3,$ord1,$t4

	adcs	$t2,$t2,$t1
	adc	$t3,$t3,xzr

	adds	$acc0,$acc1,$t2
	adcs	$acc1,$acc2,$t3
	adcs	$acc2,$acc3,$t4
	adc	$acc3,xzr,$t4		// can't overflow
___
$code.=<<___	if ($i<3);
	mul	$t3,$acc0,$ordk
___
$code.=<<___;
	lsl	$t0,$t4,#32
	subs	$acc1,$acc1,$t4
	lsr	$t1,$t4,#32
	sbcs	$acc2,$acc2,$t0
	sbc	$acc3,$acc3,$t1		// can't borrow
___
	# swap scratch registers so the factor computed above ($t3) becomes
	# next round's $t4 without an extra move in the generated code
	($t3,$t4) = ($t4,$t3);
}
# Fold in the upper half of the square, conditionally subtract the
# modulus, and loop "rep" times (counter lives in $bp).
$code.=<<___;
	adds	$acc0,$acc0,$acc4	// accumulate upper half
	adcs	$acc1,$acc1,$acc5
	adcs	$acc2,$acc2,$acc6
	adcs	$acc3,$acc3,$acc7
	adc	$acc4,xzr,xzr

	subs	$t0,$acc0,$ord0		// ret -= modulus
	sbcs	$t1,$acc1,$ord1
	sbcs	$t2,$acc2,$ord2
	sbcs	$t3,$acc3,$ord3
	sbcs	xzr,$acc4,xzr

	csel	$a0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
	csel	$a1,$acc1,$t1,lo
	csel	$a2,$acc2,$t2,lo
	csel	$a3,$acc3,$t3,lo

	cbnz	$bp,.Loop_ord_sqr

	stp	$a0,$a1,[$rp]
	stp	$a2,$a3,[$rp,#16]

	ldp	x19,x20,[sp,#16]
	ldp	x21,x22,[sp,#32]
	ldp	x23,x24,[sp,#48]
	ldr	x29,[sp],#64
	ret
.size	ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
___
} }

########################################################################
# scatter-gather subroutines
#
# Table entries are stored word-sliced at a fixed 64-byte stride so the
# gathers can select an entry in constant time; an index of zero makes
# the gathers return all-zero words (see the csetm/csel masks below).
{
my ($out,$inp,$index,$mask)=map("x$_",(0..3));
$code.=<<___;
// void	ecp_nistz256_scatter_w5(void *x0,const P256_POINT *x1,
//					 int x2);
.globl	ecp_nistz256_scatter_w5
.type	ecp_nistz256_scatter_w5,%function
.align	4
ecp_nistz256_scatter_w5:
	AARCH64_VALID_CALL_TARGET
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	add	$out,$out,$index,lsl#2

	ldp	x4,x5,[$inp]		// X
	ldp	x6,x7,[$inp,#16]
	stur	w4,[$out,#64*0-4]
	lsr	x4,x4,#32
	str	w5,[$out,#64*1-4]
	lsr	x5,x5,#32
	str	w6,[$out,#64*2-4]
	lsr	x6,x6,#32
	str	w7,[$out,#64*3-4]
	lsr	x7,x7,#32
	str	w4,[$out,#64*4-4]
	str	w5,[$out,#64*5-4]
	str	w6,[$out,#64*6-4]
	str	w7,[$out,#64*7-4]
	add	$out,$out,#64*8

	ldp	x4,x5,[$inp,#32]	// Y
	ldp	x6,x7,[$inp,#48]
	stur	w4,[$out,#64*0-4]
	lsr	x4,x4,#32
	str	w5,[$out,#64*1-4]
	lsr	x5,x5,#32
	str	w6,[$out,#64*2-4]
	lsr	x6,x6,#32
	str	w7,[$out,#64*3-4]
	lsr	x7,x7,#32
	str	w4,[$out,#64*4-4]
	str	w5,[$out,#64*5-4]
	str	w6,[$out,#64*6-4]
	str	w7,[$out,#64*7-4]
	add	$out,$out,#64*8

	ldp	x4,x5,[$inp,#64]	// Z
	ldp	x6,x7,[$inp,#80]
	stur	w4,[$out,#64*0-4]
	lsr	x4,x4,#32
	str	w5,[$out,#64*1-4]
	lsr	x5,x5,#32
	str	w6,[$out,#64*2-4]
	lsr	x6,x6,#32
	str	w7,[$out,#64*3-4]
	lsr	x7,x7,#32
	str	w4,[$out,#64*4-4]
	str	w5,[$out,#64*5-4]
	str	w6,[$out,#64*6-4]
	str	w7,[$out,#64*7-4]

	ldr	x29,[sp],#16
	ret
.size	ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5

// void	ecp_nistz256_gather_w5(P256_POINT *x0,const void *x1,
//					      int x2);
.globl	ecp_nistz256_gather_w5
.type	ecp_nistz256_gather_w5,%function
.align	4
ecp_nistz256_gather_w5:
	AARCH64_VALID_CALL_TARGET
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	cmp	$index,xzr
	csetm	x3,ne
	add	$index,$index,x3
	add	$inp,$inp,$index,lsl#2

	ldr	w4,[$inp,#64*0]
	ldr	w5,[$inp,#64*1]
	ldr	w6,[$inp,#64*2]
	ldr	w7,[$inp,#64*3]
	ldr	w8,[$inp,#64*4]
	ldr	w9,[$inp,#64*5]
	ldr	w10,[$inp,#64*6]
	ldr	w11,[$inp,#64*7]
	add	$inp,$inp,#64*8
	orr	x4,x4,x8,lsl#32
	orr	x5,x5,x9,lsl#32
	orr	x6,x6,x10,lsl#32
	orr	x7,x7,x11,lsl#32
	csel	x4,x4,xzr,ne
	csel	x5,x5,xzr,ne
	csel	x6,x6,xzr,ne
	csel	x7,x7,xzr,ne
	stp	x4,x5,[$out]		// X
	stp	x6,x7,[$out,#16]

	ldr	w4,[$inp,#64*0]
	ldr	w5,[$inp,#64*1]
	ldr	w6,[$inp,#64*2]
	ldr	w7,[$inp,#64*3]
	ldr	w8,[$inp,#64*4]
	ldr	w9,[$inp,#64*5]
	ldr	w10,[$inp,#64*6]
	ldr	w11,[$inp,#64*7]
	add	$inp,$inp,#64*8
	orr	x4,x4,x8,lsl#32
	orr	x5,x5,x9,lsl#32
	orr	x6,x6,x10,lsl#32
	orr	x7,x7,x11,lsl#32
	csel	x4,x4,xzr,ne
	csel	x5,x5,xzr,ne
	csel	x6,x6,xzr,ne
	csel	x7,x7,xzr,ne
	stp	x4,x5,[$out,#32]	// Y
	stp	x6,x7,[$out,#48]

	ldr	w4,[$inp,#64*0]
	ldr	w5,[$inp,#64*1]
	ldr	w6,[$inp,#64*2]
	ldr	w7,[$inp,#64*3]
	ldr	w8,[$inp,#64*4]
	ldr	w9,[$inp,#64*5]
	ldr	w10,[$inp,#64*6]
	ldr	w11,[$inp,#64*7]
	orr	x4,x4,x8,lsl#32
	orr	x5,x5,x9,lsl#32
	orr	x6,x6,x10,lsl#32
	orr	x7,x7,x11,lsl#32
	csel	x4,x4,xzr,ne
	csel	x5,x5,xzr,ne
	csel	x6,x6,xzr,ne
	csel	x7,x7,xzr,ne
	stp	x4,x5,[$out,#64]	// Z
	stp	x6,x7,[$out,#80]

	ldr	x29,[sp],#16
	ret
.size	ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5

// void	ecp_nistz256_scatter_w7(void *x0,const P256_POINT_AFFINE *x1,
//					 int x2);
.globl	ecp_nistz256_scatter_w7
.type	ecp_nistz256_scatter_w7,%function
.align	4
ecp_nistz256_scatter_w7:
	AARCH64_VALID_CALL_TARGET
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	add	$out,$out,$index
	mov	$index,#64/8
.Loop_scatter_w7:
	ldr	x3,[$inp],#8
	subs	$index,$index,#1
	prfm	pstl1strm,[$out,#4096+64*0]
	prfm	pstl1strm,[$out,#4096+64*1]
	prfm	pstl1strm,[$out,#4096+64*2]
	prfm	pstl1strm,[$out,#4096+64*3]
	prfm	pstl1strm,[$out,#4096+64*4]
	prfm	pstl1strm,[$out,#4096+64*5]
	prfm	pstl1strm,[$out,#4096+64*6]
	prfm	pstl1strm,[$out,#4096+64*7]
	strb	w3,[$out,#64*0]
	lsr	x3,x3,#8
	strb	w3,[$out,#64*1]
	lsr	x3,x3,#8
	strb	w3,[$out,#64*2]
	lsr	x3,x3,#8
	strb	w3,[$out,#64*3]
	lsr	x3,x3,#8
	strb	w3,[$out,#64*4]
	lsr	x3,x3,#8
	strb	w3,[$out,#64*5]
	lsr	x3,x3,#8
	strb	w3,[$out,#64*6]
	lsr	x3,x3,#8
	strb	w3,[$out,#64*7]
	add	$out,$out,#64*8
	b.ne	.Loop_scatter_w7

	ldr	x29,[sp],#16
	ret
.size	ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7

// void	ecp_nistz256_gather_w7(P256_POINT_AFFINE *x0,const void *x1,
//					     int x2);
.globl	ecp_nistz256_gather_w7
.type	ecp_nistz256_gather_w7,%function
.align	4
ecp_nistz256_gather_w7:
	AARCH64_VALID_CALL_TARGET
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	cmp	$index,xzr
	csetm	x3,ne
	add	$index,$index,x3
	add	$inp,$inp,$index
	mov	$index,#64/8
	nop
.Loop_gather_w7:
	ldrb	w4,[$inp,#64*0]
	prfm	pldl1strm,[$inp,#4096+64*0]
	subs	$index,$index,#1
	ldrb	w5,[$inp,#64*1]
	prfm	pldl1strm,[$inp,#4096+64*1]
	ldrb	w6,[$inp,#64*2]
	prfm	pldl1strm,[$inp,#4096+64*2]
	ldrb	w7,[$inp,#64*3]
	prfm	pldl1strm,[$inp,#4096+64*3]
	ldrb	w8,[$inp,#64*4]
	prfm	pldl1strm,[$inp,#4096+64*4]
	ldrb	w9,[$inp,#64*5]
	prfm	pldl1strm,[$inp,#4096+64*5]
	ldrb	w10,[$inp,#64*6]
	prfm	pldl1strm,[$inp,#4096+64*6]
	ldrb	w11,[$inp,#64*7]
	prfm	pldl1strm,[$inp,#4096+64*7]
	add	$inp,$inp,#64*8
	orr	x4,x4,x5,lsl#8
	orr	x6,x6,x7,lsl#8
	orr	x8,x8,x9,lsl#8
	orr	x4,x4,x6,lsl#16
	orr	x10,x10,x11,lsl#8
	orr	x4,x4,x8,lsl#32
	orr	x4,x4,x10,lsl#48
	and	x4,x4,x3
	str	x4,[$out],#8
	b.ne	.Loop_gather_w7

	ldr	x29,[sp],#16
	ret
.size	ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
___
}

# Post-process the accumulated assembly: evaluate any `...` expressions
# (perlasm convention for compile-time arithmetic) and emit the result
# line by line to the arm-xlate.pl pipe attached to STDOUT.
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";	# enforce flush