#! /usr/bin/env perl
# Copyright 2016-2022 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements Poly1305 hash for ARMv8.
#
# June 2015
#
# Numbers are cycles per processed byte with poly1305_blocks alone.
#
#		IALU/gcc-4.9	NEON
#
# Apple A7	1.86/+5%	0.72
# Cortex-A53	2.69/+58%	1.47
# Cortex-A57	2.70/+7%	1.14
# Denver	1.64/+50%	1.18(*)
# X-Gene	2.13/+68%	2.27
# Mongoose	1.77/+75%	1.12
# Kryo		2.70/+55%	1.13
# ThunderX2	1.17/+95%	1.36
#
# (*) the estimate based on resource availability is less than 1.0,
#     i.e. the measured result is worse than expected; presumably the
#     binary translator is not almighty;

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3));
my ($mac,$nonce)=($inp,$len);

my ($h0,$h1,$h2,$r0,$r1,$s1,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14));

$code.=<<___;
#include "arm_arch.h"

.text

// forward "declarations" are required for Apple
.extern	OPENSSL_armcap_P
.hidden	OPENSSL_armcap_P
.globl	poly1305_init
.hidden	poly1305_init
.globl	poly1305_blocks
.hidden	poly1305_blocks
.globl	poly1305_emit
.hidden	poly1305_emit

.type	poly1305_init,%function
.align	5
poly1305_init:
	AARCH64_VALID_CALL_TARGET
	cmp	$inp,xzr
	stp	xzr,xzr,[$ctx]		// zero hash value
	stp	xzr,xzr,[$ctx,#16]	// [along with is_base2_26]

	csel	x0,xzr,x0,eq
	b.eq	.Lno_key

	adrp	x17,OPENSSL_armcap_P
	ldr	w17,[x17,#:lo12:OPENSSL_armcap_P]

	ldp	$r0,$r1,[$inp]		// load key
	mov	$s1,#0xfffffffc0fffffff
	movk	$s1,#0x0fff,lsl#48
#ifdef	__AARCH64EB__
	rev	$r0,$r0			// flip bytes
	rev	$r1,$r1
#endif
	and	$r0,$r0,$s1		// &=0ffffffc0fffffff
	and	$s1,$s1,#-4
	and	$r1,$r1,$s1		// &=0ffffffc0ffffffc
	stp	$r0,$r1,[$ctx,#32]	// save key value

	tst	w17,#ARMV7_NEON

	adr	$d0,.Lpoly1305_blocks
	adr	$r0,.Lpoly1305_blocks_neon
	adr	$d1,.Lpoly1305_emit
	adr	$r1,.Lpoly1305_emit_neon

	csel	$d0,$d0,$r0,eq
	csel	$d1,$d1,$r1,eq

#ifdef	__ILP32__
	stp	w12,w13,[$len]
#else
	stp	$d0,$d1,[$len]
#endif

	mov	x0,#1
.Lno_key:
	ret
.size	poly1305_init,.-poly1305_init

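// Reference model for one 16-byte block in base 2^64, for orientation
// only -- the actual instruction schedule below is interleaved.  With
// the accumulator in h2:h1:h0 and the clamped key in r1:r0:
//
//	h2:h1:h0 += padbit:m1:m0		// accumulate input
//	d2:d1:d0  = (h2:h1:h0) * (r1:r0)	// schoolbook multiply
//	h2:h1:h0  = d2:d1:d0 mod 2^130-5	// partial reduction
//
// The pre-computed s1 = r1 + (r1>>2) works because r1 is a multiple
// of 4 after clamping and 2^130 == 5 (mod 2^130-5), so that
// h1*r1*2^128 = h1*(r1/4)*2^130 == h1*(5*r1/4) = h1*s1, i.e. the
// "h1*5*r1" and "h2*5*r1" products below drop into the low columns.
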
.type	poly1305_blocks,%function
.align	5
poly1305_blocks:
.Lpoly1305_blocks:
	// The symbol .Lpoly1305_blocks is not a .globl symbol
	// but a pointer to it is returned by poly1305_init
	AARCH64_VALID_CALL_TARGET
	ands	$len,$len,#-16
	b.eq	.Lno_data

	ldp	$h0,$h1,[$ctx]		// load hash value
	ldp	$r0,$r1,[$ctx,#32]	// load key value
	ldr	$h2,[$ctx,#16]
	add	$s1,$r1,$r1,lsr#2	// s1 = r1 + (r1 >> 2)
	b	.Loop

.align	5
.Loop:
	ldp	$t0,$t1,[$inp],#16	// load input
	sub	$len,$len,#16
#ifdef	__AARCH64EB__
	rev	$t0,$t0
	rev	$t1,$t1
#endif
	adds	$h0,$h0,$t0		// accumulate input
	adcs	$h1,$h1,$t1

	mul	$d0,$h0,$r0		// h0*r0
	adc	$h2,$h2,$padbit
	umulh	$d1,$h0,$r0

	mul	$t0,$h1,$s1		// h1*5*r1
	umulh	$t1,$h1,$s1

	adds	$d0,$d0,$t0
	mul	$t0,$h0,$r1		// h0*r1
	adc	$d1,$d1,$t1
	umulh	$d2,$h0,$r1

	adds	$d1,$d1,$t0
	mul	$t0,$h1,$r0		// h1*r0
	adc	$d2,$d2,xzr
	umulh	$t1,$h1,$r0

	adds	$d1,$d1,$t0
	mul	$t0,$h2,$s1		// h2*5*r1
	adc	$d2,$d2,$t1
	mul	$t1,$h2,$r0		// h2*r0

	adds	$d1,$d1,$t0
	adc	$d2,$d2,$t1

	and	$t0,$d2,#-4		// final reduction
	and	$h2,$d2,#3
	add	$t0,$t0,$d2,lsr#2
	adds	$h0,$d0,$t0
	adcs	$h1,$d1,xzr
	adc	$h2,$h2,xzr

	cbnz	$len,.Loop

	stp	$h0,$h1,[$ctx]		// store hash value
	str	$h2,[$ctx,#16]

.Lno_data:
	ret
.size	poly1305_blocks,.-poly1305_blocks

.type	poly1305_emit,%function
.align	5
poly1305_emit:
.Lpoly1305_emit:
	// The symbol .Lpoly1305_emit is not a .globl symbol
	// but a pointer to it is returned by poly1305_init
	AARCH64_VALID_CALL_TARGET
	ldp	$h0,$h1,[$ctx]		// load hash base 2^64
	ldr	$h2,[$ctx,#16]
	ldp	$t0,$t1,[$nonce]	// load nonce

	adds	$d0,$h0,#5		// compare to modulus
	adcs	$d1,$h1,xzr
	adc	$d2,$h2,xzr

	tst	$d2,#-4			// see if it's carried/borrowed

	csel	$h0,$h0,$d0,eq
	csel	$h1,$h1,$d1,eq

#ifdef	__AARCH64EB__
	ror	$t0,$t0,#32		// flip nonce words
	ror	$t1,$t1,#32
#endif
	adds	$h0,$h0,$t0		// accumulate nonce
	adc	$h1,$h1,$t1
#ifdef	__AARCH64EB__
	rev	$h0,$h0			// flip output bytes
	rev	$h1,$h1
#endif
	stp	$h0,$h1,[$mac]		// write result

	ret
.size	poly1305_emit,.-poly1305_emit
___
my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("v$_.4s",(0..8));
my ($IN01_0,$IN01_1,$IN01_2,$IN01_3,$IN01_4) = map("v$_.2s",(9..13));
my ($IN23_0,$IN23_1,$IN23_2,$IN23_3,$IN23_4) = map("v$_.2s",(14..18));
my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4) = map("v$_.2d",(19..23));
my ($H0,$H1,$H2,$H3,$H4) = map("v$_.2s",(24..28));
my ($T0,$T1,$MASK) = map("v$_",(29..31));

my ($in2,$zeros)=("x16","x17");
my $is_base2_26 = $zeros;		# borrow

$code.=<<___;
.type	poly1305_mult,%function
.align	5
poly1305_mult:
	mul	$d0,$h0,$r0		// h0*r0
	umulh	$d1,$h0,$r0

	mul	$t0,$h1,$s1		// h1*5*r1
	umulh	$t1,$h1,$s1

	adds	$d0,$d0,$t0
	mul	$t0,$h0,$r1		// h0*r1
	adc	$d1,$d1,$t1
	umulh	$d2,$h0,$r1

	adds	$d1,$d1,$t0
	mul	$t0,$h1,$r0		// h1*r0
	adc	$d2,$d2,xzr
	umulh	$t1,$h1,$r0

	adds	$d1,$d1,$t0
	mul	$t0,$h2,$s1		// h2*5*r1
	adc	$d2,$d2,$t1
	mul	$t1,$h2,$r0		// h2*r0

	adds	$d1,$d1,$t0
	adc	$d2,$d2,$t1

	and	$t0,$d2,#-4		// final reduction
	and	$h2,$d2,#3
	add	$t0,$t0,$d2,lsr#2
	adds	$h0,$d0,$t0
	adcs	$h1,$d1,xzr
	adc	$h2,$h2,xzr

	ret
.size	poly1305_mult,.-poly1305_mult

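// poly1305_splat stores one power of r, presented in base 2^64 in
// h2:h1:h0, as five 26-bit limbs r0-r4 together with the pre-scaled
// values s_i = 5*r_i.  Each 16-byte row of the table holds the same
// limb of r^4,r^3,r^2,r^1, one 32-bit lane per power: the caller
// steps $ctx back 4 bytes between calls, so consecutive powers fill
// consecutive lane columns (see the r^n table initialization in
// .Linit_neon below).
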
.type	poly1305_splat,%function
.align	5
poly1305_splat:
	and	x12,$h0,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x13,$h0,#26,#26
	extr	x14,$h1,$h0,#52
	and	x14,x14,#0x03ffffff
	ubfx	x15,$h1,#14,#26
	extr	x16,$h2,$h1,#40

	str	w12,[$ctx,#16*0]	// r0
	add	w12,w13,w13,lsl#2	// r1*5
	str	w13,[$ctx,#16*1]	// r1
	add	w13,w14,w14,lsl#2	// r2*5
	str	w12,[$ctx,#16*2]	// s1
	str	w14,[$ctx,#16*3]	// r2
	add	w14,w15,w15,lsl#2	// r3*5
	str	w13,[$ctx,#16*4]	// s2
	str	w15,[$ctx,#16*5]	// r3
	add	w15,w16,w16,lsl#2	// r4*5
	str	w14,[$ctx,#16*6]	// s3
	str	w16,[$ctx,#16*7]	// r4
	str	w15,[$ctx,#16*8]	// s4

	ret
.size	poly1305_splat,.-poly1305_splat

.type	poly1305_blocks_neon,%function
.align	5
poly1305_blocks_neon:
.Lpoly1305_blocks_neon:
	// The symbol .Lpoly1305_blocks_neon is not a .globl symbol
	// but a pointer to it is returned by poly1305_init
	AARCH64_VALID_CALL_TARGET
	ldr	$is_base2_26,[$ctx,#24]
	cmp	$len,#128
	b.hs	.Lblocks_neon
	cbz	$is_base2_26,.Lpoly1305_blocks

.Lblocks_neon:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0

	ands	$len,$len,#-16
	b.eq	.Lno_data_neon

	cbz	$is_base2_26,.Lbase2_64_neon

	ldp	w10,w11,[$ctx]		// load hash value base 2^26
	ldp	w12,w13,[$ctx,#8]
	ldr	w14,[$ctx,#16]

	tst	$len,#31
	b.eq	.Leven_neon

	ldp	$r0,$r1,[$ctx,#32]	// load key value

	add	$h0,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	$h1,x12,#12
	adds	$h0,$h0,x12,lsl#52
	add	$h1,$h1,x13,lsl#14
	adc	$h1,$h1,xzr
	lsr	$h2,x14,#24
	adds	$h1,$h1,x14,lsl#40
	adc	$d2,$h2,xzr		// can be partially reduced...

	ldp	$d0,$d1,[$inp],#16	// load input
	sub	$len,$len,#16
	add	$s1,$r1,$r1,lsr#2	// s1 = r1 + (r1 >> 2)

	and	$t0,$d2,#-4		// ... so reduce
	and	$h2,$d2,#3
	add	$t0,$t0,$d2,lsr#2
	adds	$h0,$h0,$t0
	adcs	$h1,$h1,xzr
	adc	$h2,$h2,xzr

#ifdef	__AARCH64EB__
	rev	$d0,$d0
	rev	$d1,$d1
#endif
	adds	$h0,$h0,$d0		// accumulate input
	adcs	$h1,$h1,$d1
	adc	$h2,$h2,$padbit

	bl	poly1305_mult
	ldr	x30,[sp,#8]

	cbz	$padbit,.Lstore_base2_64_neon

	and	x10,$h0,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,$h0,#26,#26
	extr	x12,$h1,$h0,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,$h1,#14,#26
	extr	x14,$h2,$h1,#40

	cbnz	$len,.Leven_neon

	stp	w10,w11,[$ctx]		// store hash value base 2^26
	stp	w12,w13,[$ctx,#8]
	str	w14,[$ctx,#16]
	b	.Lno_data_neon

.align	4
.Lstore_base2_64_neon:
	stp	$h0,$h1,[$ctx]		// store hash value base 2^64
	stp	$h2,xzr,[$ctx,#16]	// note that is_base2_26 is zeroed
	b	.Lno_data_neon

.align	4
.Lbase2_64_neon:
	ldp	$r0,$r1,[$ctx,#32]	// load key value

	ldp	$h0,$h1,[$ctx]		// load hash value base 2^64
	ldr	$h2,[$ctx,#16]

	tst	$len,#31
	b.eq	.Linit_neon

	ldp	$d0,$d1,[$inp],#16	// load input
	sub	$len,$len,#16
	add	$s1,$r1,$r1,lsr#2	// s1 = r1 + (r1 >> 2)
#ifdef	__AARCH64EB__
	rev	$d0,$d0
	rev	$d1,$d1
#endif
	adds	$h0,$h0,$d0		// accumulate input
	adcs	$h1,$h1,$d1
	adc	$h2,$h2,$padbit

	bl	poly1305_mult

.Linit_neon:
	and	x10,$h0,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,$h0,#26,#26
	extr	x12,$h1,$h0,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,$h1,#14,#26
	extr	x14,$h2,$h1,#40

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	fmov	${H0},x10
	fmov	${H1},x11
	fmov	${H2},x12
	fmov	${H3},x13
	fmov	${H4},x14

	////////////////////////////////// initialize r^n table
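	// The vectorized loop consumes four blocks per iteration, so the
	// table needs r^1..r^4: each poly1305_mult below multiplies the
	// running power by r, and poly1305_splat stores it limb-split,
	// with $ctx stepped back 4 bytes so that every power occupies
	// its own 32-bit lane column.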
	mov	$h0,$r0			// r^1
	add	$s1,$r1,$r1,lsr#2	// s1 = r1 + (r1 >> 2)
	mov	$h1,$r1
	mov	$h2,xzr
	add	$ctx,$ctx,#48+12
	bl	poly1305_splat

	bl	poly1305_mult		// r^2
	sub	$ctx,$ctx,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^3
	sub	$ctx,$ctx,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^4
	sub	$ctx,$ctx,#4
	bl	poly1305_splat
	ldr	x30,[sp,#8]

	add	$in2,$inp,#32
	adr	$zeros,.Lzeros
	subs	$len,$len,#64
	csel	$in2,$zeros,$in2,lo

	mov	x4,#1
	stur	x4,[$ctx,#-24]		// set is_base2_26
	sub	$ctx,$ctx,#48		// restore original $ctx
	b	.Ldo_neon

.align	4
.Leven_neon:
	add	$in2,$inp,#32
	adr	$zeros,.Lzeros
	subs	$len,$len,#64
	csel	$in2,$zeros,$in2,lo

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	fmov	${H0},x10
	fmov	${H1},x11
	fmov	${H2},x12
	fmov	${H3},x13
	fmov	${H4},x14

.Ldo_neon:
	ldp	x8,x12,[$in2],#16	// inp[2:3] (or zero)
	ldp	x9,x13,[$in2],#48

	lsl	$padbit,$padbit,#24
	add	x15,$ctx,#48

#ifdef	__AARCH64EB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi x6,x7,#32,#32
	fmov	$IN23_0,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,$padbit,x12,lsr#40
	add	x13,$padbit,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi x8,x9,#32,#32
	fmov	$IN23_1,x6
	add	x10,x10,x11,lsl#32	// bfi x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi x12,x13,#32,#32
	fmov	$IN23_2,x8
	fmov	$IN23_3,x10
	fmov	$IN23_4,x12

	ldp	x8,x12,[$inp],#16	// inp[0:1]
	ldp	x9,x13,[$inp],#48

	ld1	{$R0,$R1,$S1,$R2},[x15],#64
	ld1	{$S2,$R3,$S3,$R4},[x15],#64
	ld1	{$S4},[x15]

#ifdef	__AARCH64EB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi x6,x7,#32,#32
	fmov	$IN01_0,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,$padbit,x12,lsr#40
	add	x13,$padbit,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi x8,x9,#32,#32
	fmov	$IN01_1,x6
	add	x10,x10,x11,lsl#32	// bfi x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi x12,x13,#32,#32
	movi	$MASK.2d,#-1
	fmov	$IN01_2,x8
	fmov	$IN01_3,x10
	fmov	$IN01_4,x12
	ushr	$MASK.2d,$MASK.2d,#38

	b.ls	.Lskip_loop

.align	4
.Loop_neon:
	////////////////////////////////////////////////////////////////
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
	//   \___________________/
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
	//   \___________________/ \____________________/
	//
	// Note that we start with inp[2:3]*r^2, because it doesn't
	// depend on the reduction of the previous iteration.
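	//
	// With h = h0+h1*2^26+h2*2^52+h3*2^78+h4*2^104 and r split the
	// same way, the partial product h_i*r_j belongs in column
	// 2^(26*(i+j)); columns with i+j >= 5 wrap around through
	// 2^130 == 5 (mod 2^130-5), which is where the 5*r_j factors
	// in the schedule below come from.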
	////////////////////////////////////////////////////////////////
	// d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
	// d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
	// d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
	// d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
	// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1

	subs	$len,$len,#64
	umull	$ACC4,$IN23_0,${R4}[2]
	csel	$in2,$zeros,$in2,lo
	umull	$ACC3,$IN23_0,${R3}[2]
	umull	$ACC2,$IN23_0,${R2}[2]
	ldp	x8,x12,[$in2],#16	// inp[2:3] (or zero)
	umull	$ACC1,$IN23_0,${R1}[2]
	ldp	x9,x13,[$in2],#48
	umull	$ACC0,$IN23_0,${R0}[2]
#ifdef	__AARCH64EB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	umlal	$ACC4,$IN23_1,${R3}[2]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	$ACC3,$IN23_1,${R2}[2]
	and	x5,x9,#0x03ffffff
	umlal	$ACC2,$IN23_1,${R1}[2]
	ubfx	x6,x8,#26,#26
	umlal	$ACC1,$IN23_1,${R0}[2]
	ubfx	x7,x9,#26,#26
	umlal	$ACC0,$IN23_1,${S4}[2]
	add	x4,x4,x5,lsl#32		// bfi x4,x5,#32,#32

	umlal	$ACC4,$IN23_2,${R2}[2]
	extr	x8,x12,x8,#52
	umlal	$ACC3,$IN23_2,${R1}[2]
	extr	x9,x13,x9,#52
	umlal	$ACC2,$IN23_2,${R0}[2]
	add	x6,x6,x7,lsl#32		// bfi x6,x7,#32,#32
	umlal	$ACC1,$IN23_2,${S4}[2]
	fmov	$IN23_0,x4
	umlal	$ACC0,$IN23_2,${S3}[2]
	and	x8,x8,#0x03ffffff

	umlal	$ACC4,$IN23_3,${R1}[2]
	and	x9,x9,#0x03ffffff
	umlal	$ACC3,$IN23_3,${R0}[2]
	ubfx	x10,x12,#14,#26
	umlal	$ACC2,$IN23_3,${S4}[2]
	ubfx	x11,x13,#14,#26
	umlal	$ACC1,$IN23_3,${S3}[2]
	add	x8,x8,x9,lsl#32		// bfi x8,x9,#32,#32
	umlal	$ACC0,$IN23_3,${S2}[2]
	fmov	$IN23_1,x6

	add	$IN01_2,$IN01_2,$H2
	add	x12,$padbit,x12,lsr#40
	umlal	$ACC4,$IN23_4,${R0}[2]
	add	x13,$padbit,x13,lsr#40
	umlal	$ACC3,$IN23_4,${S4}[2]
	add	x10,x10,x11,lsl#32	// bfi x10,x11,#32,#32
	umlal	$ACC2,$IN23_4,${S3}[2]
	add	x12,x12,x13,lsl#32	// bfi x12,x13,#32,#32
	umlal	$ACC1,$IN23_4,${S2}[2]
	fmov	$IN23_2,x8
	umlal	$ACC0,$IN23_4,${S1}[2]
	fmov	$IN23_3,x10

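	// (the scalar base 2^64 -> base 2^26 conversion of the next
	//  inputs is interleaved with the vector multiplies above and
	//  below, keeping the integer and SIMD pipelines busy in
	//  parallel)
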
	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4 and accumulate

	add	$IN01_0,$IN01_0,$H0
	fmov	$IN23_4,x12
	umlal	$ACC3,$IN01_2,${R1}[0]
	ldp	x8,x12,[$inp],#16	// inp[0:1]
	umlal	$ACC0,$IN01_2,${S3}[0]
	ldp	x9,x13,[$inp],#48
	umlal	$ACC4,$IN01_2,${R2}[0]
	umlal	$ACC1,$IN01_2,${S4}[0]
	umlal	$ACC2,$IN01_2,${R0}[0]
#ifdef	__AARCH64EB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	add	$IN01_1,$IN01_1,$H1
	umlal	$ACC3,$IN01_0,${R3}[0]
	umlal	$ACC4,$IN01_0,${R4}[0]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	$ACC2,$IN01_0,${R2}[0]
	and	x5,x9,#0x03ffffff
	umlal	$ACC0,$IN01_0,${R0}[0]
	ubfx	x6,x8,#26,#26
	umlal	$ACC1,$IN01_0,${R1}[0]
	ubfx	x7,x9,#26,#26

	add	$IN01_3,$IN01_3,$H3
	add	x4,x4,x5,lsl#32		// bfi x4,x5,#32,#32
	umlal	$ACC3,$IN01_1,${R2}[0]
	extr	x8,x12,x8,#52
	umlal	$ACC4,$IN01_1,${R3}[0]
	extr	x9,x13,x9,#52
	umlal	$ACC0,$IN01_1,${S4}[0]
	add	x6,x6,x7,lsl#32		// bfi x6,x7,#32,#32
	umlal	$ACC2,$IN01_1,${R1}[0]
	fmov	$IN01_0,x4
	umlal	$ACC1,$IN01_1,${R0}[0]
	and	x8,x8,#0x03ffffff

	add	$IN01_4,$IN01_4,$H4
	and	x9,x9,#0x03ffffff
	umlal	$ACC3,$IN01_3,${R0}[0]
	ubfx	x10,x12,#14,#26
	umlal	$ACC0,$IN01_3,${S2}[0]
	ubfx	x11,x13,#14,#26
	umlal	$ACC4,$IN01_3,${R1}[0]
	add	x8,x8,x9,lsl#32		// bfi x8,x9,#32,#32
	umlal	$ACC1,$IN01_3,${S3}[0]
	fmov	$IN01_1,x6
	umlal	$ACC2,$IN01_3,${S4}[0]
	add	x12,$padbit,x12,lsr#40

	umlal	$ACC3,$IN01_4,${S4}[0]
	add	x13,$padbit,x13,lsr#40
	umlal	$ACC0,$IN01_4,${S1}[0]
	add	x10,x10,x11,lsl#32	// bfi x10,x11,#32,#32
	umlal	$ACC4,$IN01_4,${R0}[0]
	add	x12,x12,x13,lsl#32	// bfi x12,x13,#32,#32
	umlal	$ACC1,$IN01_4,${S2}[0]
	fmov	$IN01_2,x8
	umlal	$ACC2,$IN01_4,${S3}[0]
	fmov	$IN01_3,x10
	fmov	$IN01_4,x12

	/////////////////////////////////////////////////////////////////
	// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	// and P. Schwabe
	//
	// [see discussion in poly1305-armv4 module]

	ushr	$T0.2d,$ACC3,#26
	xtn	$H3,$ACC3
	ushr	$T1.2d,$ACC0,#26
	and	$ACC0,$ACC0,$MASK.2d
	add	$ACC4,$ACC4,$T0.2d	// h3 -> h4
	bic	$H3,#0xfc,lsl#24	// &=0x03ffffff
	add	$ACC1,$ACC1,$T1.2d	// h0 -> h1

	ushr	$T0.2d,$ACC4,#26
	xtn	$H4,$ACC4
	ushr	$T1.2d,$ACC1,#26
	xtn	$H1,$ACC1
	bic	$H4,#0xfc,lsl#24
	add	$ACC2,$ACC2,$T1.2d	// h1 -> h2

	add	$ACC0,$ACC0,$T0.2d
	shl	$T0.2d,$T0.2d,#2
	shrn	$T1.2s,$ACC2,#26
	xtn	$H2,$ACC2
	add	$ACC0,$ACC0,$T0.2d	// h4 -> h0
	bic	$H1,#0xfc,lsl#24
	add	$H3,$H3,$T1.2s		// h2 -> h3
	bic	$H2,#0xfc,lsl#24

	shrn	$T0.2s,$ACC0,#26
	xtn	$H0,$ACC0
	ushr	$T1.2s,$H3,#26
	bic	$H3,#0xfc,lsl#24
	bic	$H0,#0xfc,lsl#24
	add	$H1,$H1,$T0.2s		// h0 -> h1
	add	$H4,$H4,$T1.2s		// h3 -> h4

	b.hi	.Loop_neon

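	////////////////////////////////////////////////////////////////
	// Tail: 32 or 64 bytes remain.  With 64, hash+inp[0:1] is
	// multiplied by r^4:r^3 in .Long_tail while inp[2:3] is folded
	// in with r^2:r^1; with only 32, hash+inp[0:1] goes through
	// r^2:r^1 alone and the r^4:r^3 pass is skipped.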
.Lskip_loop:
	dup	$IN23_2,${IN23_2}[0]
	add	$IN01_2,$IN01_2,$H2

	////////////////////////////////////////////////////////////////
	// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

	adds	$len,$len,#32
	b.ne	.Long_tail

	dup	$IN23_2,${IN01_2}[0]
	add	$IN23_0,$IN01_0,$H0
	add	$IN23_3,$IN01_3,$H3
	add	$IN23_1,$IN01_1,$H1
	add	$IN23_4,$IN01_4,$H4

.Long_tail:
	dup	$IN23_0,${IN23_0}[0]
	umull2	$ACC0,$IN23_2,${S3}
	umull2	$ACC3,$IN23_2,${R1}
	umull2	$ACC4,$IN23_2,${R2}
	umull2	$ACC2,$IN23_2,${R0}
	umull2	$ACC1,$IN23_2,${S4}

	dup	$IN23_1,${IN23_1}[0]
	umlal2	$ACC0,$IN23_0,${R0}
	umlal2	$ACC2,$IN23_0,${R2}
	umlal2	$ACC3,$IN23_0,${R3}
	umlal2	$ACC4,$IN23_0,${R4}
	umlal2	$ACC1,$IN23_0,${R1}

	dup	$IN23_3,${IN23_3}[0]
	umlal2	$ACC0,$IN23_1,${S4}
	umlal2	$ACC3,$IN23_1,${R2}
	umlal2	$ACC2,$IN23_1,${R1}
	umlal2	$ACC4,$IN23_1,${R3}
	umlal2	$ACC1,$IN23_1,${R0}

	dup	$IN23_4,${IN23_4}[0]
	umlal2	$ACC3,$IN23_3,${R0}
	umlal2	$ACC4,$IN23_3,${R1}
	umlal2	$ACC0,$IN23_3,${S2}
	umlal2	$ACC1,$IN23_3,${S3}
	umlal2	$ACC2,$IN23_3,${S4}

	umlal2	$ACC3,$IN23_4,${S4}
	umlal2	$ACC0,$IN23_4,${S1}
	umlal2	$ACC4,$IN23_4,${R0}
	umlal2	$ACC1,$IN23_4,${S2}
	umlal2	$ACC2,$IN23_4,${S3}

	b.eq	.Lshort_tail

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4:r^3 and accumulate

	add	$IN01_0,$IN01_0,$H0
	umlal	$ACC3,$IN01_2,${R1}
	umlal	$ACC0,$IN01_2,${S3}
	umlal	$ACC4,$IN01_2,${R2}
	umlal	$ACC1,$IN01_2,${S4}
	umlal	$ACC2,$IN01_2,${R0}

	add	$IN01_1,$IN01_1,$H1
	umlal	$ACC3,$IN01_0,${R3}
	umlal	$ACC0,$IN01_0,${R0}
	umlal	$ACC4,$IN01_0,${R4}
	umlal	$ACC1,$IN01_0,${R1}
	umlal	$ACC2,$IN01_0,${R2}

	add	$IN01_3,$IN01_3,$H3
	umlal	$ACC3,$IN01_1,${R2}
	umlal	$ACC0,$IN01_1,${S4}
	umlal	$ACC4,$IN01_1,${R3}
	umlal	$ACC1,$IN01_1,${R0}
	umlal	$ACC2,$IN01_1,${R1}

	add	$IN01_4,$IN01_4,$H4
	umlal	$ACC3,$IN01_3,${R0}
	umlal	$ACC0,$IN01_3,${S2}
	umlal	$ACC4,$IN01_3,${R1}
	umlal	$ACC1,$IN01_3,${S3}
	umlal	$ACC2,$IN01_3,${S4}

	umlal	$ACC3,$IN01_4,${S4}
	umlal	$ACC0,$IN01_4,${S1}
	umlal	$ACC4,$IN01_4,${R0}
	umlal	$ACC1,$IN01_4,${S2}
	umlal	$ACC2,$IN01_4,${S3}

.Lshort_tail:
	////////////////////////////////////////////////////////////////
	// horizontal add

	addp	$ACC3,$ACC3,$ACC3
	ldp	d8,d9,[sp,#16]		// meet ABI requirements
	addp	$ACC0,$ACC0,$ACC0
	ldp	d10,d11,[sp,#32]
	addp	$ACC4,$ACC4,$ACC4
	ldp	d12,d13,[sp,#48]
	addp	$ACC1,$ACC1,$ACC1
	ldp	d14,d15,[sp,#64]
	addp	$ACC2,$ACC2,$ACC2

	////////////////////////////////////////////////////////////////
	// lazy reduction, but without narrowing

	ushr	$T0.2d,$ACC3,#26
	and	$ACC3,$ACC3,$MASK.2d
	ushr	$T1.2d,$ACC0,#26
	and	$ACC0,$ACC0,$MASK.2d

	add	$ACC4,$ACC4,$T0.2d	// h3 -> h4
	add	$ACC1,$ACC1,$T1.2d	// h0 -> h1

	ushr	$T0.2d,$ACC4,#26
	and	$ACC4,$ACC4,$MASK.2d
	ushr	$T1.2d,$ACC1,#26
	and	$ACC1,$ACC1,$MASK.2d
	add	$ACC2,$ACC2,$T1.2d	// h1 -> h2

	add	$ACC0,$ACC0,$T0.2d
	shl	$T0.2d,$T0.2d,#2
	ushr	$T1.2d,$ACC2,#26
	and	$ACC2,$ACC2,$MASK.2d
	add	$ACC0,$ACC0,$T0.2d	// h4 -> h0
	add	$ACC3,$ACC3,$T1.2d	// h2 -> h3

	ushr	$T0.2d,$ACC0,#26
	and	$ACC0,$ACC0,$MASK.2d
	ushr	$T1.2d,$ACC3,#26
	and	$ACC3,$ACC3,$MASK.2d
	add	$ACC1,$ACC1,$T0.2d	// h0 -> h1
	add	$ACC4,$ACC4,$T1.2d	// h3 -> h4

	////////////////////////////////////////////////////////////////
	// write the result, can be partially reduced

	st4	{$ACC0,$ACC1,$ACC2,$ACC3}[0],[$ctx],#16
	st1	{$ACC4}[0],[$ctx]

.Lno_data_neon:
	ldr	x29,[sp],#80
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	poly1305_blocks_neon,.-poly1305_blocks_neon

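// poly1305_emit_neon converts a base 2^26 hash left behind by the NEON
// code back to base 2^64, completes the possibly partial reduction,
// and then performs the same final conditional subtraction of 2^130-5
// and nonce addition as poly1305_emit.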
.type	poly1305_emit_neon,%function
.align	5
poly1305_emit_neon:
.Lpoly1305_emit_neon:
	// The symbol .Lpoly1305_emit_neon is not a .globl symbol
	// but a pointer to it is returned by poly1305_init
	AARCH64_VALID_CALL_TARGET
	ldr	$is_base2_26,[$ctx,#24]
	cbz	$is_base2_26,poly1305_emit

	ldp	w10,w11,[$ctx]		// load hash value base 2^26
	ldp	w12,w13,[$ctx,#8]
	ldr	w14,[$ctx,#16]

	add	$h0,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	$h1,x12,#12
	adds	$h0,$h0,x12,lsl#52
	add	$h1,$h1,x13,lsl#14
	adc	$h1,$h1,xzr
	lsr	$h2,x14,#24
	adds	$h1,$h1,x14,lsl#40
	adc	$h2,$h2,xzr		// can be partially reduced...

	ldp	$t0,$t1,[$nonce]	// load nonce

	and	$d0,$h2,#-4		// ... so reduce
	add	$d0,$d0,$h2,lsr#2
	and	$h2,$h2,#3
	adds	$h0,$h0,$d0
	adcs	$h1,$h1,xzr
	adc	$h2,$h2,xzr

	adds	$d0,$h0,#5		// compare to modulus
	adcs	$d1,$h1,xzr
	adc	$d2,$h2,xzr

	tst	$d2,#-4			// see if it's carried/borrowed

	csel	$h0,$h0,$d0,eq
	csel	$h1,$h1,$d1,eq

#ifdef	__AARCH64EB__
	ror	$t0,$t0,#32		// flip nonce words
	ror	$t1,$t1,#32
#endif
	adds	$h0,$h0,$t0		// accumulate nonce
	adc	$h1,$h1,$t1
#ifdef	__AARCH64EB__
	rev	$h0,$h0			// flip output bytes
	rev	$h1,$h1
#endif
	stp	$h0,$h1,[$mac]		// write result

	ret
.size	poly1305_emit_neon,.-poly1305_emit_neon

.align	5
.Lzeros:
.long	0,0,0,0,0,0,0,0
.asciz	"Poly1305 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___

# Post-process: fix up the vector arrangement specifiers, which are
# written above with the "logical" element sizes, to the forms the
# individual instructions actually take.
foreach (split("\n",$code)) {
	s/\b(shrn\s+v[0-9]+)\.[24]d/$1.2s/			or
	s/\b(fmov\s+)v([0-9]+)[^,]*,\s*x([0-9]+)/$1d$2,x$3/	or
	(m/\bdup\b/ and (s/\.[24]s/.2d/g or 1))			or
	(m/\b(eor|and)/ and (s/\.[248][sdh]/.16b/g or 1))	or
	(m/\bum(ul|la)l\b/ and (s/\.4s/.2s/g or 1))		or
	(m/\bum(ul|la)l2\b/ and (s/\.2s/.4s/g or 1))		or
	(m/\bst[1-4]\s+{[^}]+}\[/ and (s/\.[24]d/.s/g or 1));

	s/\.[124]([sd])\[/.$1\[/;

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";