#! /usr/bin/env perl
# Copyright 2016-2022 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements Poly1305 hash for ARMv8.
#
# June 2015
#
# Numbers are cycles per processed byte with poly1305_blocks alone.
#
#		IALU/gcc-4.9	NEON
#
# Apple A7	1.86/+5%	0.72
# Cortex-A53	2.69/+58%	1.47
# Cortex-A57	2.70/+7%	1.14
# Denver	1.64/+50%	1.18(*)
# X-Gene	2.13/+68%	2.27
# Mongoose	1.77/+75%	1.12
# Kryo		2.70/+55%	1.13
# ThunderX2	1.17/+95%	1.36
#
# (*)	estimate based on resources availability is less than 1.0,
#	i.e. measured result is worse than expected, presumably binary
#	translator is not almighty;
#
# Overview of this script: it appends AArch64 assembly text to $code in two
# heredocs, rewrites NEON arrangement suffixes line by line, and prints the
# result through a pipe to arm-xlate.pl.
#
# Context layout as accessed by the generated code:
#   [ctx+0]	hash value: three 64-bit words (base 2^64), or five 32-bit
#		limbs (base 2^26) when is_base2_26 is non-zero
#   [ctx+24]	is_base2_26 flag
#   [ctx+32]	clamped key words r0,r1
#   [ctx+48]	base 2^26 table of key powers, loaded by the NEON code

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Locate arm-xlate.pl: first in the directory of this script, then under
# ../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# From here on STDOUT is a pipe into arm-xlate.pl, which is started with
# $flavour and $output as its arguments.
open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# x0-x3, named for their role in poly1305_blocks.  poly1305_emit addresses
# x1/x2 as $mac/$nonce instead.
my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3));
my ($mac,$nonce)=($inp,$len);

# Scalar working registers x4-x14: hash words h0-h2, key words r0/r1,
# s1 = r1 + (r1 >> 2), temporaries t0/t1 and product words d0-d2.
my ($h0,$h1,$h2,$r0,$r1,$s1,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14));

# Scalar code: poly1305_init, poly1305_blocks and poly1305_emit.
$code.=<<___;
#include "arm_arch.h"

.text

// forward "declarations" are required for Apple
.extern	OPENSSL_armcap_P
.hidden	OPENSSL_armcap_P
.globl	poly1305_init
.hidden	poly1305_init
.globl	poly1305_blocks
.hidden	poly1305_blocks
.globl	poly1305_emit
.hidden	poly1305_emit

.type	poly1305_init,%function
.align	5
poly1305_init:
	AARCH64_VALID_CALL_TARGET
	cmp	$inp,xzr
	stp	xzr,xzr,[$ctx]		// zero hash value
	stp	xzr,xzr,[$ctx,#16]	// [along with is_base2_26]

	csel	x0,xzr,x0,eq
	b.eq	.Lno_key

	adrp	x17,OPENSSL_armcap_P
	ldr	w17,[x17,#:lo12:OPENSSL_armcap_P]

	ldp	$r0,$r1,[$inp]		// load key
	mov	$s1,#0xfffffffc0fffffff
	movk	$s1,#0x0fff,lsl#48
#ifdef	__AARCH64EB__
	rev	$r0,$r0			// flip bytes
	rev	$r1,$r1
#endif
	and	$r0,$r0,$s1		// &=0ffffffc0fffffff
	and	$s1,$s1,#-4
	and	$r1,$r1,$s1		// &=0ffffffc0ffffffc
	stp	$r0,$r1,[$ctx,#32]	// save key value

	tst	w17,#ARMV7_NEON

	adrp	$d0,poly1305_blocks
	add	$d0,$d0,#:lo12:.Lpoly1305_blocks
	adrp	$r0,poly1305_blocks_neon
	add	$r0,$r0,#:lo12:.Lpoly1305_blocks_neon
	adrp	$d1,poly1305_emit
	add	$d1,$d1,#:lo12:.Lpoly1305_emit
	adrp	$r1,poly1305_emit_neon
	add	$r1,$r1,#:lo12:.Lpoly1305_emit_neon

	csel	$d0,$d0,$r0,eq
	csel	$d1,$d1,$r1,eq

#ifdef	__ILP32__
	stp	w12,w13,[$len]
#else
	stp	$d0,$d1,[$len]
#endif

	mov	x0,#1
.Lno_key:
	ret
.size	poly1305_init,.-poly1305_init

.type	poly1305_blocks,%function
.align	5
poly1305_blocks:
.Lpoly1305_blocks:
	// The symbol .Lpoly1305_blocks is not a .globl symbol
	// but a pointer to it is returned by poly1305_init
	AARCH64_VALID_CALL_TARGET
	ands	$len,$len,#-16
	b.eq	.Lno_data

	ldp	$h0,$h1,[$ctx]		// load hash value
	ldp	$r0,$r1,[$ctx,#32]	// load key value
	ldr	$h2,[$ctx,#16]
	add	$s1,$r1,$r1,lsr#2	// s1 = r1 + (r1 >> 2)
	b	.Loop

.align	5
.Loop:
	ldp	$t0,$t1,[$inp],#16	// load input
	sub	$len,$len,#16
#ifdef	__AARCH64EB__
	rev	$t0,$t0
	rev	$t1,$t1
#endif
	adds	$h0,$h0,$t0		// accumulate input
	adcs	$h1,$h1,$t1

	mul	$d0,$h0,$r0		// h0*r0
	adc	$h2,$h2,$padbit
	umulh	$d1,$h0,$r0

	mul	$t0,$h1,$s1		// h1*5*r1
	umulh	$t1,$h1,$s1

	adds	$d0,$d0,$t0
	mul	$t0,$h0,$r1		// h0*r1
	adc	$d1,$d1,$t1
	umulh	$d2,$h0,$r1

	adds	$d1,$d1,$t0
	mul	$t0,$h1,$r0		// h1*r0
	adc	$d2,$d2,xzr
	umulh	$t1,$h1,$r0

	adds	$d1,$d1,$t0
	mul	$t0,$h2,$s1		// h2*5*r1
	adc	$d2,$d2,$t1
	mul	$t1,$h2,$r0		// h2*r0

	adds	$d1,$d1,$t0
	adc	$d2,$d2,$t1

	and	$t0,$d2,#-4		// final reduction
	and	$h2,$d2,#3
	add	$t0,$t0,$d2,lsr#2
	adds	$h0,$d0,$t0
	adcs	$h1,$d1,xzr
	adc	$h2,$h2,xzr

	cbnz	$len,.Loop

	stp	$h0,$h1,[$ctx]		// store hash value
	str	$h2,[$ctx,#16]

.Lno_data:
	ret
.size	poly1305_blocks,.-poly1305_blocks

.type	poly1305_emit,%function
.align	5
poly1305_emit:
.Lpoly1305_emit:
	// The symbol .poly1305_emit is not a .globl symbol
	// but a pointer to it is returned by poly1305_init
	AARCH64_VALID_CALL_TARGET
	ldp	$h0,$h1,[$ctx]		// load hash base 2^64
	ldr	$h2,[$ctx,#16]
	ldp	$t0,$t1,[$nonce]	// load nonce

	adds	$d0,$h0,#5		// compare to modulus
	adcs	$d1,$h1,xzr
	adc	$d2,$h2,xzr

	tst	$d2,#-4			// see if it's carried/borrowed

	csel	$h0,$h0,$d0,eq
	csel	$h1,$h1,$d1,eq

#ifdef	__AARCH64EB__
	ror	$t0,$t0,#32		// flip nonce words
	ror	$t1,$t1,#32
#endif
	adds	$h0,$h0,$t0		// accumulate nonce
	adc	$h1,$h1,$t1
#ifdef	__AARCH64EB__
	rev	$h0,$h0			// flip output bytes
	rev	$h1,$h1
#endif
	stp	$h0,$h1,[$mac]		// write result

	ret
.size	poly1305_emit,.-poly1305_emit
___
# NEON register allocation.  The arrangement suffixes attached here are
# defaults; the post-processing loop at the end of this script rewrites them
# for the instructions that need a different one.
#   v0-v8	key-power table: r0,r1,s1,r2,s2,r3,s3,r4,s4 (sN = rN*5)
#   v9-v13	input limbs for inp[0:1]
#   v14-v18	input limbs for inp[2:3]
#   v19-v23	64-bit accumulators
#   v24-v28	hash limbs
#   v29-v31	two temporaries and the 26-bit limb mask
my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("v$_.4s",(0..8));
my ($IN01_0,$IN01_1,$IN01_2,$IN01_3,$IN01_4) = map("v$_.2s",(9..13));
my ($IN23_0,$IN23_1,$IN23_2,$IN23_3,$IN23_4) = map("v$_.2s",(14..18));
my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4) = map("v$_.2d",(19..23));
my ($H0,$H1,$H2,$H3,$H4) = map("v$_.2s",(24..28));
my ($T0,$T1,$MASK) = map("v$_",(29..31));

# $in2 is the read pointer for inp[2:3]; it is redirected to .Lzeros when the
# remaining length goes below zero.  $is_base2_26 shares x17 with $zeros: the
# flag is only tested before $zeros is loaded.
my ($in2,$zeros)=("x16","x17");
my $is_base2_26 = $zeros;	# borrow

# NEON code: poly1305_mult and poly1305_splat helpers, poly1305_blocks_neon
# and poly1305_emit_neon, followed by the .Lzeros constant.
$code.=<<___;
.type	poly1305_mult,%function
.align	5
poly1305_mult:
	mul	$d0,$h0,$r0		// h0*r0
	umulh	$d1,$h0,$r0

	mul	$t0,$h1,$s1		// h1*5*r1
	umulh	$t1,$h1,$s1

	adds	$d0,$d0,$t0
	mul	$t0,$h0,$r1		// h0*r1
	adc	$d1,$d1,$t1
	umulh	$d2,$h0,$r1

	adds	$d1,$d1,$t0
	mul	$t0,$h1,$r0		// h1*r0
	adc	$d2,$d2,xzr
	umulh	$t1,$h1,$r0

	adds	$d1,$d1,$t0
	mul	$t0,$h2,$s1		// h2*5*r1
	adc	$d2,$d2,$t1
	mul	$t1,$h2,$r0		// h2*r0

	adds	$d1,$d1,$t0
	adc	$d2,$d2,$t1

	and	$t0,$d2,#-4		// final reduction
	and	$h2,$d2,#3
	add	$t0,$t0,$d2,lsr#2
	adds	$h0,$d0,$t0
	adcs	$h1,$d1,xzr
	adc	$h2,$h2,xzr

	ret
.size	poly1305_mult,.-poly1305_mult

.type	poly1305_splat,%function
.align	5
poly1305_splat:
	and	x12,$h0,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x13,$h0,#26,#26
	extr	x14,$h1,$h0,#52
	and	x14,x14,#0x03ffffff
	ubfx	x15,$h1,#14,#26
	extr	x16,$h2,$h1,#40

	str	w12,[$ctx,#16*0]	// r0
	add	w12,w13,w13,lsl#2	// r1*5
	str	w13,[$ctx,#16*1]	// r1
	add	w13,w14,w14,lsl#2	// r2*5
	str	w12,[$ctx,#16*2]	// s1
	str	w14,[$ctx,#16*3]	// r2
	add	w14,w15,w15,lsl#2	// r3*5
	str	w13,[$ctx,#16*4]	// s2
	str	w15,[$ctx,#16*5]	// r3
	add	w15,w16,w16,lsl#2	// r4*5
	str	w14,[$ctx,#16*6]	// s3
	str	w16,[$ctx,#16*7]	// r4
	str	w15,[$ctx,#16*8]	// s4

	ret
.size	poly1305_splat,.-poly1305_splat

.type	poly1305_blocks_neon,%function
.align	5
poly1305_blocks_neon:
.Lpoly1305_blocks_neon:
	// The symbol .Lpoly1305_blocks_neon is not a .globl symbol
	// but a pointer to it is returned by poly1305_init
	AARCH64_VALID_CALL_TARGET
	ldr	$is_base2_26,[$ctx,#24]
	cmp	$len,#128
	b.hs	.Lblocks_neon
	cbz	$is_base2_26,.Lpoly1305_blocks

.Lblocks_neon:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0

	ands	$len,$len,#-16
	b.eq	.Lno_data_neon

	cbz	$is_base2_26,.Lbase2_64_neon

	ldp	w10,w11,[$ctx]		// load hash value base 2^26
	ldp	w12,w13,[$ctx,#8]
	ldr	w14,[$ctx,#16]

	tst	$len,#31
	b.eq	.Leven_neon

	ldp	$r0,$r1,[$ctx,#32]	// load key value

	add	$h0,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	$h1,x12,#12
	adds	$h0,$h0,x12,lsl#52
	add	$h1,$h1,x13,lsl#14
	adc	$h1,$h1,xzr
	lsr	$h2,x14,#24
	adds	$h1,$h1,x14,lsl#40
	adc	$d2,$h2,xzr		// can be partially reduced...

	ldp	$d0,$d1,[$inp],#16	// load input
	sub	$len,$len,#16
	add	$s1,$r1,$r1,lsr#2	// s1 = r1 + (r1 >> 2)

	and	$t0,$d2,#-4		// ... so reduce
	and	$h2,$d2,#3
	add	$t0,$t0,$d2,lsr#2
	adds	$h0,$h0,$t0
	adcs	$h1,$h1,xzr
	adc	$h2,$h2,xzr

#ifdef	__AARCH64EB__
	rev	$d0,$d0
	rev	$d1,$d1
#endif
	adds	$h0,$h0,$d0		// accumulate input
	adcs	$h1,$h1,$d1
	adc	$h2,$h2,$padbit

	bl	poly1305_mult
	ldr	x30,[sp,#8]

	cbz	$padbit,.Lstore_base2_64_neon

	and	x10,$h0,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,$h0,#26,#26
	extr	x12,$h1,$h0,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,$h1,#14,#26
	extr	x14,$h2,$h1,#40

	cbnz	$len,.Leven_neon

	stp	w10,w11,[$ctx]		// store hash value base 2^26
	stp	w12,w13,[$ctx,#8]
	str	w14,[$ctx,#16]
	b	.Lno_data_neon

.align	4
.Lstore_base2_64_neon:
	stp	$h0,$h1,[$ctx]		// store hash value base 2^64
	stp	$h2,xzr,[$ctx,#16]	// note that is_base2_26 is zeroed
	b	.Lno_data_neon

.align	4
.Lbase2_64_neon:
	ldp	$r0,$r1,[$ctx,#32]	// load key value

	ldp	$h0,$h1,[$ctx]		// load hash value base 2^64
	ldr	$h2,[$ctx,#16]

	tst	$len,#31
	b.eq	.Linit_neon

	ldp	$d0,$d1,[$inp],#16	// load input
	sub	$len,$len,#16
	add	$s1,$r1,$r1,lsr#2	// s1 = r1 + (r1 >> 2)
#ifdef	__AARCH64EB__
	rev	$d0,$d0
	rev	$d1,$d1
#endif
	adds	$h0,$h0,$d0		// accumulate input
	adcs	$h1,$h1,$d1
	adc	$h2,$h2,$padbit

	bl	poly1305_mult

.Linit_neon:
	and	x10,$h0,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,$h0,#26,#26
	extr	x12,$h1,$h0,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,$h1,#14,#26
	extr	x14,$h2,$h1,#40

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	fmov	${H0},x10
	fmov	${H1},x11
	fmov	${H2},x12
	fmov	${H3},x13
	fmov	${H4},x14

	////////////////////////////////// initialize r^n table
	mov	$h0,$r0			// r^1
	add	$s1,$r1,$r1,lsr#2	// s1 = r1 + (r1 >> 2)
	mov	$h1,$r1
	mov	$h2,xzr
	add	$ctx,$ctx,#48+12
	bl	poly1305_splat

	bl	poly1305_mult		// r^2
	sub	$ctx,$ctx,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^3
	sub	$ctx,$ctx,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^4
	sub	$ctx,$ctx,#4
	bl	poly1305_splat
	ldr	x30,[sp,#8]

	add	$in2,$inp,#32
	adrp	$zeros,.Lzeros
	add	$zeros,$zeros,:lo12:.Lzeros
	subs	$len,$len,#64
	csel	$in2,$zeros,$in2,lo

	mov	x4,#1
	stur	x4,[$ctx,#-24]		// set is_base2_26
	sub	$ctx,$ctx,#48		// restore original $ctx
	b	.Ldo_neon

.align	4
.Leven_neon:
	add	$in2,$inp,#32
	adrp	$zeros,.Lzeros
	add	$zeros,$zeros,:lo12:.Lzeros
	subs	$len,$len,#64
	csel	$in2,$zeros,$in2,lo

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	fmov	${H0},x10
	fmov	${H1},x11
	fmov	${H2},x12
	fmov	${H3},x13
	fmov	${H4},x14

.Ldo_neon:
	ldp	x8,x12,[$in2],#16	// inp[2:3] (or zero)
	ldp	x9,x13,[$in2],#48

	lsl	$padbit,$padbit,#24
	add	x15,$ctx,#48

#ifdef	__AARCH64EB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	fmov	$IN23_0,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,$padbit,x12,lsr#40
	add	x13,$padbit,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	fmov	$IN23_1,x6
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	fmov	$IN23_2,x8
	fmov	$IN23_3,x10
	fmov	$IN23_4,x12

	ldp	x8,x12,[$inp],#16	// inp[0:1]
	ldp	x9,x13,[$inp],#48

	ld1	{$R0,$R1,$S1,$R2},[x15],#64
	ld1	{$S2,$R3,$S3,$R4},[x15],#64
	ld1	{$S4},[x15]

#ifdef	__AARCH64EB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	fmov	$IN01_0,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,$padbit,x12,lsr#40
	add	x13,$padbit,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	fmov	$IN01_1,x6
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	movi	$MASK.2d,#-1
	fmov	$IN01_2,x8
	fmov	$IN01_3,x10
	fmov	$IN01_4,x12
	ushr	$MASK.2d,$MASK.2d,#38

	b.ls	.Lskip_loop

.align	4
.Loop_neon:
	////////////////////////////////////////////////////////////////
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
	//   \___________________/
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
	//   \___________________/ \____________________/
	//
	// Note that we start with inp[2:3]*r^2. This is because it
	// doesn't depend on reduction in previous iteration.
	////////////////////////////////////////////////////////////////
	// d4 = h0*r4 + h1*r3   + h2*r2   + h3*r1   + h4*r0
	// d3 = h0*r3 + h1*r2   + h2*r1   + h3*r0   + h4*5*r4
	// d2 = h0*r2 + h1*r1   + h2*r0   + h3*5*r4 + h4*5*r3
	// d1 = h0*r1 + h1*r0   + h2*5*r4 + h3*5*r3 + h4*5*r2
	// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1

	subs	$len,$len,#64
	umull	$ACC4,$IN23_0,${R4}[2]
	csel	$in2,$zeros,$in2,lo
	umull	$ACC3,$IN23_0,${R3}[2]
	umull	$ACC2,$IN23_0,${R2}[2]
	ldp	x8,x12,[$in2],#16	// inp[2:3] (or zero)
	umull	$ACC1,$IN23_0,${R1}[2]
	ldp	x9,x13,[$in2],#48
	umull	$ACC0,$IN23_0,${R0}[2]
#ifdef	__AARCH64EB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	umlal	$ACC4,$IN23_1,${R3}[2]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	$ACC3,$IN23_1,${R2}[2]
	and	x5,x9,#0x03ffffff
	umlal	$ACC2,$IN23_1,${R1}[2]
	ubfx	x6,x8,#26,#26
	umlal	$ACC1,$IN23_1,${R0}[2]
	ubfx	x7,x9,#26,#26
	umlal	$ACC0,$IN23_1,${S4}[2]
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32

	umlal	$ACC4,$IN23_2,${R2}[2]
	extr	x8,x12,x8,#52
	umlal	$ACC3,$IN23_2,${R1}[2]
	extr	x9,x13,x9,#52
	umlal	$ACC2,$IN23_2,${R0}[2]
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	umlal	$ACC1,$IN23_2,${S4}[2]
	fmov	$IN23_0,x4
	umlal	$ACC0,$IN23_2,${S3}[2]
	and	x8,x8,#0x03ffffff

	umlal	$ACC4,$IN23_3,${R1}[2]
	and	x9,x9,#0x03ffffff
	umlal	$ACC3,$IN23_3,${R0}[2]
	ubfx	x10,x12,#14,#26
	umlal	$ACC2,$IN23_3,${S4}[2]
	ubfx	x11,x13,#14,#26
	umlal	$ACC1,$IN23_3,${S3}[2]
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	umlal	$ACC0,$IN23_3,${S2}[2]
	fmov	$IN23_1,x6

	add	$IN01_2,$IN01_2,$H2
	add	x12,$padbit,x12,lsr#40
	umlal	$ACC4,$IN23_4,${R0}[2]
	add	x13,$padbit,x13,lsr#40
	umlal	$ACC3,$IN23_4,${S4}[2]
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	umlal	$ACC2,$IN23_4,${S3}[2]
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	umlal	$ACC1,$IN23_4,${S2}[2]
	fmov	$IN23_2,x8
	umlal	$ACC0,$IN23_4,${S1}[2]
	fmov	$IN23_3,x10

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4 and accumulate

	add	$IN01_0,$IN01_0,$H0
	fmov	$IN23_4,x12
	umlal	$ACC3,$IN01_2,${R1}[0]
	ldp	x8,x12,[$inp],#16	// inp[0:1]
	umlal	$ACC0,$IN01_2,${S3}[0]
	ldp	x9,x13,[$inp],#48
	umlal	$ACC4,$IN01_2,${R2}[0]
	umlal	$ACC1,$IN01_2,${S4}[0]
	umlal	$ACC2,$IN01_2,${R0}[0]
#ifdef	__AARCH64EB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	add	$IN01_1,$IN01_1,$H1
	umlal	$ACC3,$IN01_0,${R3}[0]
	umlal	$ACC4,$IN01_0,${R4}[0]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	$ACC2,$IN01_0,${R2}[0]
	and	x5,x9,#0x03ffffff
	umlal	$ACC0,$IN01_0,${R0}[0]
	ubfx	x6,x8,#26,#26
	umlal	$ACC1,$IN01_0,${R1}[0]
	ubfx	x7,x9,#26,#26

	add	$IN01_3,$IN01_3,$H3
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	umlal	$ACC3,$IN01_1,${R2}[0]
	extr	x8,x12,x8,#52
	umlal	$ACC4,$IN01_1,${R3}[0]
	extr	x9,x13,x9,#52
	umlal	$ACC0,$IN01_1,${S4}[0]
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	umlal	$ACC2,$IN01_1,${R1}[0]
	fmov	$IN01_0,x4
	umlal	$ACC1,$IN01_1,${R0}[0]
	and	x8,x8,#0x03ffffff

	add	$IN01_4,$IN01_4,$H4
	and	x9,x9,#0x03ffffff
	umlal	$ACC3,$IN01_3,${R0}[0]
	ubfx	x10,x12,#14,#26
	umlal	$ACC0,$IN01_3,${S2}[0]
	ubfx	x11,x13,#14,#26
	umlal	$ACC4,$IN01_3,${R1}[0]
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	umlal	$ACC1,$IN01_3,${S3}[0]
	fmov	$IN01_1,x6
	umlal	$ACC2,$IN01_3,${S4}[0]
	add	x12,$padbit,x12,lsr#40

	umlal	$ACC3,$IN01_4,${S4}[0]
	add	x13,$padbit,x13,lsr#40
	umlal	$ACC0,$IN01_4,${S1}[0]
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	umlal	$ACC4,$IN01_4,${R0}[0]
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	umlal	$ACC1,$IN01_4,${S2}[0]
	fmov	$IN01_2,x8
	umlal	$ACC2,$IN01_4,${S3}[0]
	fmov	$IN01_3,x10
	fmov	$IN01_4,x12

	/////////////////////////////////////////////////////////////////
	// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	// and P. Schwabe
	//
	// [see discussion in poly1305-armv4 module]

	ushr	$T0.2d,$ACC3,#26
	xtn	$H3,$ACC3
	ushr	$T1.2d,$ACC0,#26
	and	$ACC0,$ACC0,$MASK.2d
	add	$ACC4,$ACC4,$T0.2d	// h3 -> h4
	bic	$H3,#0xfc,lsl#24	// &=0x03ffffff
	add	$ACC1,$ACC1,$T1.2d	// h0 -> h1

	ushr	$T0.2d,$ACC4,#26
	xtn	$H4,$ACC4
	ushr	$T1.2d,$ACC1,#26
	xtn	$H1,$ACC1
	bic	$H4,#0xfc,lsl#24
	add	$ACC2,$ACC2,$T1.2d	// h1 -> h2

	add	$ACC0,$ACC0,$T0.2d
	shl	$T0.2d,$T0.2d,#2
	shrn	$T1.2s,$ACC2,#26
	xtn	$H2,$ACC2
	add	$ACC0,$ACC0,$T0.2d	// h4 -> h0
	bic	$H1,#0xfc,lsl#24
	add	$H3,$H3,$T1.2s		// h2 -> h3
	bic	$H2,#0xfc,lsl#24

	shrn	$T0.2s,$ACC0,#26
	xtn	$H0,$ACC0
	ushr	$T1.2s,$H3,#26
	bic	$H3,#0xfc,lsl#24
	bic	$H0,#0xfc,lsl#24
	add	$H1,$H1,$T0.2s		// h0 -> h1
	add	$H4,$H4,$T1.2s		// h3 -> h4

	b.hi	.Loop_neon

.Lskip_loop:
	dup	$IN23_2,${IN23_2}[0]
	add	$IN01_2,$IN01_2,$H2

	////////////////////////////////////////////////////////////////
	// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

	adds	$len,$len,#32
	b.ne	.Long_tail

	dup	$IN23_2,${IN01_2}[0]
	add	$IN23_0,$IN01_0,$H0
	add	$IN23_3,$IN01_3,$H3
	add	$IN23_1,$IN01_1,$H1
	add	$IN23_4,$IN01_4,$H4

.Long_tail:
	dup	$IN23_0,${IN23_0}[0]
	umull2	$ACC0,$IN23_2,${S3}
	umull2	$ACC3,$IN23_2,${R1}
	umull2	$ACC4,$IN23_2,${R2}
	umull2	$ACC2,$IN23_2,${R0}
	umull2	$ACC1,$IN23_2,${S4}

	dup	$IN23_1,${IN23_1}[0]
	umlal2	$ACC0,$IN23_0,${R0}
	umlal2	$ACC2,$IN23_0,${R2}
	umlal2	$ACC3,$IN23_0,${R3}
	umlal2	$ACC4,$IN23_0,${R4}
	umlal2	$ACC1,$IN23_0,${R1}

	dup	$IN23_3,${IN23_3}[0]
	umlal2	$ACC0,$IN23_1,${S4}
	umlal2	$ACC3,$IN23_1,${R2}
	umlal2	$ACC2,$IN23_1,${R1}
	umlal2	$ACC4,$IN23_1,${R3}
	umlal2	$ACC1,$IN23_1,${R0}

	dup	$IN23_4,${IN23_4}[0]
	umlal2	$ACC3,$IN23_3,${R0}
	umlal2	$ACC4,$IN23_3,${R1}
	umlal2	$ACC0,$IN23_3,${S2}
	umlal2	$ACC1,$IN23_3,${S3}
	umlal2	$ACC2,$IN23_3,${S4}

	umlal2	$ACC3,$IN23_4,${S4}
	umlal2	$ACC0,$IN23_4,${S1}
	umlal2	$ACC4,$IN23_4,${R0}
	umlal2	$ACC1,$IN23_4,${S2}
	umlal2	$ACC2,$IN23_4,${S3}

	b.eq	.Lshort_tail

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4:r^3 and accumulate

	add	$IN01_0,$IN01_0,$H0
	umlal	$ACC3,$IN01_2,${R1}
	umlal	$ACC0,$IN01_2,${S3}
	umlal	$ACC4,$IN01_2,${R2}
	umlal	$ACC1,$IN01_2,${S4}
	umlal	$ACC2,$IN01_2,${R0}

	add	$IN01_1,$IN01_1,$H1
	umlal	$ACC3,$IN01_0,${R3}
	umlal	$ACC0,$IN01_0,${R0}
	umlal	$ACC4,$IN01_0,${R4}
	umlal	$ACC1,$IN01_0,${R1}
	umlal	$ACC2,$IN01_0,${R2}

	add	$IN01_3,$IN01_3,$H3
	umlal	$ACC3,$IN01_1,${R2}
	umlal	$ACC0,$IN01_1,${S4}
	umlal	$ACC4,$IN01_1,${R3}
	umlal	$ACC1,$IN01_1,${R0}
	umlal	$ACC2,$IN01_1,${R1}

	add	$IN01_4,$IN01_4,$H4
	umlal	$ACC3,$IN01_3,${R0}
	umlal	$ACC0,$IN01_3,${S2}
	umlal	$ACC4,$IN01_3,${R1}
	umlal	$ACC1,$IN01_3,${S3}
	umlal	$ACC2,$IN01_3,${S4}

	umlal	$ACC3,$IN01_4,${S4}
	umlal	$ACC0,$IN01_4,${S1}
	umlal	$ACC4,$IN01_4,${R0}
	umlal	$ACC1,$IN01_4,${S2}
	umlal	$ACC2,$IN01_4,${S3}

.Lshort_tail:
	////////////////////////////////////////////////////////////////
	// horizontal add

	addp	$ACC3,$ACC3,$ACC3
	ldp	d8,d9,[sp,#16]		// meet ABI requirements
	addp	$ACC0,$ACC0,$ACC0
	ldp	d10,d11,[sp,#32]
	addp	$ACC4,$ACC4,$ACC4
	ldp	d12,d13,[sp,#48]
	addp	$ACC1,$ACC1,$ACC1
	ldp	d14,d15,[sp,#64]
	addp	$ACC2,$ACC2,$ACC2

	////////////////////////////////////////////////////////////////
	// lazy reduction, but without narrowing

	ushr	$T0.2d,$ACC3,#26
	and	$ACC3,$ACC3,$MASK.2d
	ushr	$T1.2d,$ACC0,#26
	and	$ACC0,$ACC0,$MASK.2d

	add	$ACC4,$ACC4,$T0.2d	// h3 -> h4
	add	$ACC1,$ACC1,$T1.2d	// h0 -> h1

	ushr	$T0.2d,$ACC4,#26
	and	$ACC4,$ACC4,$MASK.2d
	ushr	$T1.2d,$ACC1,#26
	and	$ACC1,$ACC1,$MASK.2d
	add	$ACC2,$ACC2,$T1.2d	// h1 -> h2

	add	$ACC0,$ACC0,$T0.2d
	shl	$T0.2d,$T0.2d,#2
	ushr	$T1.2d,$ACC2,#26
	and	$ACC2,$ACC2,$MASK.2d
	add	$ACC0,$ACC0,$T0.2d	// h4 -> h0
	add	$ACC3,$ACC3,$T1.2d	// h2 -> h3

	ushr	$T0.2d,$ACC0,#26
	and	$ACC0,$ACC0,$MASK.2d
	ushr	$T1.2d,$ACC3,#26
	and	$ACC3,$ACC3,$MASK.2d
	add	$ACC1,$ACC1,$T0.2d	// h0 -> h1
	add	$ACC4,$ACC4,$T1.2d	// h3 -> h4

	////////////////////////////////////////////////////////////////
	// write the result, can be partially reduced

	st4	{$ACC0,$ACC1,$ACC2,$ACC3}[0],[$ctx],#16
	st1	{$ACC4}[0],[$ctx]

.Lno_data_neon:
	ldr	x29,[sp],#80
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	poly1305_blocks_neon,.-poly1305_blocks_neon

.type	poly1305_emit_neon,%function
.align	5
poly1305_emit_neon:
.Lpoly1305_emit_neon:
	// The symbol .Lpoly1305_emit_neon is not a .globl symbol
	// but a pointer to it is returned by poly1305_init
	AARCH64_VALID_CALL_TARGET
	ldr	$is_base2_26,[$ctx,#24]
	cbz	$is_base2_26,poly1305_emit

	ldp	w10,w11,[$ctx]		// load hash value base 2^26
	ldp	w12,w13,[$ctx,#8]
	ldr	w14,[$ctx,#16]

	add	$h0,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	$h1,x12,#12
	adds	$h0,$h0,x12,lsl#52
	add	$h1,$h1,x13,lsl#14
	adc	$h1,$h1,xzr
	lsr	$h2,x14,#24
	adds	$h1,$h1,x14,lsl#40
	adc	$h2,$h2,xzr		// can be partially reduced...

	ldp	$t0,$t1,[$nonce]	// load nonce

	and	$d0,$h2,#-4		// ... so reduce
	add	$d0,$d0,$h2,lsr#2
	and	$h2,$h2,#3
	adds	$h0,$h0,$d0
	adcs	$h1,$h1,xzr
	adc	$h2,$h2,xzr

	adds	$d0,$h0,#5		// compare to modulus
	adcs	$d1,$h1,xzr
	adc	$d2,$h2,xzr

	tst	$d2,#-4			// see if it's carried/borrowed

	csel	$h0,$h0,$d0,eq
	csel	$h1,$h1,$d1,eq

#ifdef	__AARCH64EB__
	ror	$t0,$t0,#32		// flip nonce words
	ror	$t1,$t1,#32
#endif
	adds	$h0,$h0,$t0		// accumulate nonce
	adc	$h1,$h1,$t1
#ifdef	__AARCH64EB__
	rev	$h0,$h0			// flip output bytes
	rev	$h1,$h1
#endif
	stp	$h0,$h1,[$mac]		// write result

	ret
.size	poly1305_emit_neon,.-poly1305_emit_neon

.rodata

.align	5
.Lzeros:
.long	0,0,0,0,0,0,0,0
.asciz	"Poly1305 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___

# Post-process the generated text one line at a time and print it.  The
# rules are chained with "or", so only the first rule that applies to a line
# takes effect (the "or 1" keeps a matched rule from falling through when
# its substitution changes nothing):
#   shrn		destination written as .2d/.4d becomes .2s
#   fmov vN.<T>,xM	becomes fmov dN,xM
#   dup		every .2s/.4s on the line becomes .2d
#   eor/and		every vector arrangement on the line becomes .16b
#   umull/umlal	every .4s becomes .2s (table registers are declared .4s)
#   umull2/umlal2	every .2s becomes .4s (input registers are declared .2s)
#   st1..st4 {..}[	every .2d/.4d becomes .s (lane store)
# Afterwards the first lane-indexed operand on the line loses its element
# count, e.g. ".2s[" becomes ".s[".
foreach (split("\n",$code)) {
	s/\b(shrn\s+v[0-9]+)\.[24]d/$1.2s/			or
	s/\b(fmov\s+)v([0-9]+)[^,]*,\s*x([0-9]+)/$1d$2,x$3/	or
	(m/\bdup\b/ and (s/\.[24]s/.2d/g or 1))			or
	(m/\b(eor|and)/ and (s/\.[248][sdh]/.16b/g or 1))	or
	(m/\bum(ul|la)l\b/ and (s/\.4s/.2s/g or 1))		or
	(m/\bum(ul|la)l2\b/ and (s/\.2s/.4s/g or 1))		or
	(m/\bst[1-4]\s+{[^}]+}\[/ and (s/\.[24]d/.s/g or 1));

	s/\.[124]([sd])\[/.$1\[/;

	print $_,"\n";
}
# STDOUT is the pipe to arm-xlate.pl, so a failed close is fatal.
close STDOUT or die "error closing STDOUT: $!";