1#! /usr/bin/env perl 2# Copyright 2014-2022 The OpenSSL Project Authors. All Rights Reserved. 3# 4# Licensed under the Apache License 2.0 (the "License"). You may not use 5# this file except in compliance with the License. You can obtain a copy 6# in the file LICENSE in the source distribution or at 7# https://www.openssl.org/source/license.html 8 9# 10# ==================================================================== 11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 12# project. The module is, however, dual licensed under OpenSSL and 13# CRYPTOGAMS licenses depending on where you obtain it. For further 14# details see http://www.openssl.org/~appro/cryptogams/. 15# ==================================================================== 16# 17# GHASH for ARMv8 Crypto Extension, 64-bit polynomial multiplication. 18# 19# June 2014 20# 21# Initial version was developed in tight cooperation with Ard 22# Biesheuvel of Linaro from bits-n-pieces from other assembly modules. 23# Just like aesv8-armx.pl this module supports both AArch32 and 24# AArch64 execution modes. 25# 26# July 2014 27# 28# Implement 2x aggregated reduction [see ghash-x86.pl for background 29# information]. 30# 31# November 2017 32# 33# AArch64 register bank to "accommodate" 4x aggregated reduction and 34# improve performance by 20-70% depending on processor. 35# 36# Current performance in cycles per processed byte: 37# 38# 64-bit PMULL 32-bit PMULL 32-bit NEON(*) 39# Apple A7 0.58 0.92 5.62 40# Cortex-A53 0.85 1.01 8.39 41# Cortex-A57 0.73 1.17 7.61 42# Denver 0.51 0.65 6.02 43# Mongoose 0.65 1.10 8.06 44# Kryo 0.76 1.16 8.00 45# ThunderX2 1.05 46# 47# (*) presented for reference/comparison purposes; 48 49# $output is the last argument if it looks like a file (it has an extension) 50# $flavour is the first argument if it doesn't look like a file 51$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; 52$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; 53 54$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 55( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or 56( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or 57die "can't locate arm-xlate.pl"; 58 59open OUT,"| \"$^X\" $xlate $flavour \"$output\"" 60 or die "can't call $xlate: $!"; 61*STDOUT=*OUT; 62 63$Xi="x0"; # argument block 64$Htbl="x1"; 65$inp="x2"; 66$len="x3"; 67 68$inc="x12"; 69 70{ 71my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3)); 72my ($t0,$t1,$t2,$xC2,$H,$Hhl,$H2)=map("q$_",(8..14)); 73my $_byte = ($flavour =~ /win/ ? "DCB" : ".byte"); 74 75$code=<<___; 76#include "arm_arch.h" 77 78#if __ARM_MAX_ARCH__>=7 79___ 80$code.=".arch armv8-a+crypto\n.text\n" if ($flavour =~ /64/); 81$code.=<<___ if ($flavour !~ /64/); 82.fpu neon 83#ifdef __thumb2__ 84.syntax unified 85.thumb 86# define INST(a,b,c,d) $_byte c,0xef,a,b 87#else 88.code 32 89# define INST(a,b,c,d) $_byte a,b,c,0xf2 90#endif 91 92.text 93___ 94 95################################################################################ 96# void gcm_init_v8(u128 Htable[16],const u64 H[2]); 97# 98# input: 128-bit H - secret parameter E(K,0^128) 99# output: precomputed table filled with degrees of twisted H; 100# H is twisted to handle reverse bitness of GHASH; 101# only few of 16 slots of Htable[16] are used; 102# data is opaque to outside world (which allows to 103# optimize the code independently); 104# 105$code.=<<___; 106.global gcm_init_v8 107.type gcm_init_v8,%function 108.align 4 109gcm_init_v8: 110___ 111$code.=<<___ if ($flavour =~ /64/); 112 AARCH64_VALID_CALL_TARGET 113___ 114$code.=<<___; 115 vld1.64 {$t1},[x1] @ load input H 116 vmov.i8 $xC2,#0xe1 117 vshl.i64 $xC2,$xC2,#57 @ 0xc2.0 118 vext.8 $IN,$t1,$t1,#8 119 vshr.u64 $t2,$xC2,#63 120 vdup.32 $t1,${t1}[1] 121 vext.8 $t0,$t2,$xC2,#8 @ t0=0xc2....01 122 vshr.u64 $t2,$IN,#63 123 vshr.s32 $t1,$t1,#31 @ broadcast carry bit 124 vand $t2,$t2,$t0 125 vshl.i64 $IN,$IN,#1 126 vext.8 $t2,$t2,$t2,#8 127 vand $t0,$t0,$t1 128 vorr $IN,$IN,$t2 @ H<<<=1 129 veor $H,$IN,$t0 @ twisted H 130 vst1.64 {$H},[x0],#16 @ store Htable[0] 131 132 @ calculate H^2 133 vext.8 $t0,$H,$H,#8 @ Karatsuba pre-processing 134 vpmull.p64 $Xl,$H,$H 135 veor $t0,$t0,$H 136 vpmull2.p64 $Xh,$H,$H 137 vpmull.p64 $Xm,$t0,$t0 138 139 vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing 140 veor $t2,$Xl,$Xh 141 veor $Xm,$Xm,$t1 142 veor $Xm,$Xm,$t2 143 vpmull.p64 $t2,$Xl,$xC2 @ 1st phase 144 145 vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result 146 vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl 147 veor $Xl,$Xm,$t2 148 149 vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase 150 vpmull.p64 $Xl,$Xl,$xC2 151 veor $t2,$t2,$Xh 152 veor $H2,$Xl,$t2 153 154 vext.8 $t1,$H2,$H2,#8 @ Karatsuba pre-processing 155 veor $t1,$t1,$H2 156 vext.8 $Hhl,$t0,$t1,#8 @ pack Karatsuba pre-processed 157 vst1.64 {$Hhl-$H2},[x0],#32 @ store Htable[1..2] 158___ 159if ($flavour =~ /64/) { 160my ($t3,$Yl,$Ym,$Yh) = map("q$_",(4..7)); 161my ($H3,$H34k,$H4,$H5,$H56k,$H6,$H7,$H78k,$H8) = map("q$_",(15..23)); 162 163$code.=<<___; 164 @ calculate H^3 and H^4 165 vpmull.p64 $Xl,$H, $H2 166 vpmull.p64 $Yl,$H2,$H2 167 vpmull2.p64 $Xh,$H, $H2 168 vpmull2.p64 $Yh,$H2,$H2 169 vpmull.p64 $Xm,$t0,$t1 170 vpmull.p64 $Ym,$t1,$t1 171 172 vext.8 $t0,$Xl,$Xh,#8 @ Karatsuba post-processing 173 vext.8 $t1,$Yl,$Yh,#8 174 veor $t2,$Xl,$Xh 175 veor $Xm,$Xm,$t0 176 veor $t3,$Yl,$Yh 177 veor $Ym,$Ym,$t1 178 veor $Xm,$Xm,$t2 179 vpmull.p64 $t2,$Xl,$xC2 @ 1st phase 180 veor $Ym,$Ym,$t3 181 vpmull.p64 $t3,$Yl,$xC2 182 183 vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result 184 vmov $Yh#lo,$Ym#hi 185 vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl 186 vmov $Ym#hi,$Yl#lo 187 veor $Xl,$Xm,$t2 188 veor $Yl,$Ym,$t3 189 190 vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase 191 vext.8 $t3,$Yl,$Yl,#8 192 vpmull.p64 $Xl,$Xl,$xC2 193 vpmull.p64 $Yl,$Yl,$xC2 194 veor $t2,$t2,$Xh 195 veor $t3,$t3,$Yh 196 veor $H3, $Xl,$t2 @ H^3 197 veor $H4,$Yl,$t3 @ H^4 198 199 vext.8 $t0,$H3, $H3,#8 @ Karatsuba pre-processing 200 vext.8 $t1,$H4,$H4,#8 201 vext.8 $t2,$H2,$H2,#8 202 veor $t0,$t0,$H3 203 veor $t1,$t1,$H4 204 veor $t2,$t2,$H2 205 vext.8 $H34k,$t0,$t1,#8 @ pack Karatsuba pre-processed 206 vst1.64 {$H3-$H4},[x0],#48 @ store Htable[3..5] 207 208 @ calculate H^5 and H^6 209 vpmull.p64 $Xl,$H2, $H3 210 vpmull.p64 $Yl,$H3,$H3 211 vpmull2.p64 $Xh,$H2, $H3 212 vpmull2.p64 $Yh,$H3,$H3 213 vpmull.p64 $Xm,$t0,$t2 214 vpmull.p64 $Ym,$t0,$t0 215 216 vext.8 $t0,$Xl,$Xh,#8 @ Karatsuba post-processing 217 vext.8 $t1,$Yl,$Yh,#8 218 veor $t2,$Xl,$Xh 219 veor $Xm,$Xm,$t0 220 veor $t3,$Yl,$Yh 221 veor $Ym,$Ym,$t1 222 veor $Xm,$Xm,$t2 223 vpmull.p64 $t2,$Xl,$xC2 @ 1st phase 224 veor $Ym,$Ym,$t3 225 vpmull.p64 $t3,$Yl,$xC2 226 227 vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result 228 vmov $Yh#lo,$Ym#hi 229 vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl 230 vmov $Ym#hi,$Yl#lo 231 veor $Xl,$Xm,$t2 232 veor $Yl,$Ym,$t3 233 234 vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase 235 vext.8 $t3,$Yl,$Yl,#8 236 vpmull.p64 $Xl,$Xl,$xC2 237 vpmull.p64 $Yl,$Yl,$xC2 238 veor $t2,$t2,$Xh 239 veor $t3,$t3,$Yh 240 veor $H5,$Xl,$t2 @ H^5 241 veor $H6,$Yl,$t3 @ H^6 242 243 vext.8 $t0,$H5, $H5,#8 @ Karatsuba pre-processing 244 vext.8 $t1,$H6,$H6,#8 245 vext.8 $t2,$H2,$H2,#8 246 veor $t0,$t0,$H5 247 veor $t1,$t1,$H6 248 veor $t2,$t2,$H2 249 vext.8 $H56k,$t0,$t1,#8 @ pack Karatsuba pre-processed 250 vst1.64 {$H5-$H6},[x0],#48 @ store Htable[6..8] 251 252 @ calculate H^7 and H^8 253 vpmull.p64 $Xl,$H2,$H5 254 vpmull.p64 $Yl,$H2,$H6 255 vpmull2.p64 $Xh,$H2,$H5 256 vpmull2.p64 $Yh,$H2,$H6 257 vpmull.p64 $Xm,$t0,$t2 258 vpmull.p64 $Ym,$t1,$t2 259 260 vext.8 $t0,$Xl,$Xh,#8 @ Karatsuba post-processing 261 vext.8 $t1,$Yl,$Yh,#8 262 veor $t2,$Xl,$Xh 263 veor $Xm,$Xm,$t0 264 veor $t3,$Yl,$Yh 265 veor $Ym,$Ym,$t1 266 veor $Xm,$Xm,$t2 267 vpmull.p64 $t2,$Xl,$xC2 @ 1st phase 268 veor $Ym,$Ym,$t3 269 vpmull.p64 $t3,$Yl,$xC2 270 271 vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result 272 vmov $Yh#lo,$Ym#hi 273 vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl 274 vmov $Ym#hi,$Yl#lo 275 veor $Xl,$Xm,$t2 276 veor $Yl,$Ym,$t3 277 278 vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase 279 vext.8 $t3,$Yl,$Yl,#8 280 vpmull.p64 $Xl,$Xl,$xC2 281 vpmull.p64 $Yl,$Yl,$xC2 282 veor $t2,$t2,$Xh 283 veor $t3,$t3,$Yh 284 veor $H7,$Xl,$t2 @ H^7 285 veor $H8,$Yl,$t3 @ H^8 286 287 vext.8 $t0,$H7,$H7,#8 @ Karatsuba pre-processing 288 vext.8 $t1,$H8,$H8,#8 289 veor $t0,$t0,$H7 290 veor $t1,$t1,$H8 291 vext.8 $H78k,$t0,$t1,#8 @ pack Karatsuba pre-processed 292 vst1.64 {$H7-$H8},[x0] @ store Htable[9..11] 293___ 294} 295$code.=<<___; 296 ret 297.size gcm_init_v8,.-gcm_init_v8 298___ 299################################################################################ 300# void gcm_gmult_v8(u64 Xi[2],const u128 Htable[16]); 301# 302# input: Xi - current hash value; 303# Htable - table precomputed in gcm_init_v8; 304# output: Xi - next hash value Xi; 305# 306$code.=<<___; 307.global gcm_gmult_v8 308.type gcm_gmult_v8,%function 309.align 4 310gcm_gmult_v8: 311___ 312$code.=<<___ if ($flavour =~ /64/); 313 AARCH64_VALID_CALL_TARGET 314___ 315$code.=<<___; 316 vld1.64 {$t1},[$Xi] @ load Xi 317 vmov.i8 $xC2,#0xe1 318 vld1.64 {$H-$Hhl},[$Htbl] @ load twisted H, ... 319 vshl.u64 $xC2,$xC2,#57 320#ifndef __ARMEB__ 321 vrev64.8 $t1,$t1 322#endif 323 vext.8 $IN,$t1,$t1,#8 324 325 vpmull.p64 $Xl,$H,$IN @ H.lo·Xi.lo 326 veor $t1,$t1,$IN @ Karatsuba pre-processing 327 vpmull2.p64 $Xh,$H,$IN @ H.hi·Xi.hi 328 vpmull.p64 $Xm,$Hhl,$t1 @ (H.lo+H.hi)·(Xi.lo+Xi.hi) 329 330 vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing 331 veor $t2,$Xl,$Xh 332 veor $Xm,$Xm,$t1 333 veor $Xm,$Xm,$t2 334 vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction 335 336 vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result 337 vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl 338 veor $Xl,$Xm,$t2 339 340 vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction 341 vpmull.p64 $Xl,$Xl,$xC2 342 veor $t2,$t2,$Xh 343 veor $Xl,$Xl,$t2 344 345#ifndef __ARMEB__ 346 vrev64.8 $Xl,$Xl 347#endif 348 vext.8 $Xl,$Xl,$Xl,#8 349 vst1.64 {$Xl},[$Xi] @ write out Xi 350 351 ret 352.size gcm_gmult_v8,.-gcm_gmult_v8 353___ 354################################################################################ 355# void gcm_ghash_v8(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len); 356# 357# input: table precomputed in gcm_init_v8; 358# current hash value Xi; 359# pointer to input data; 360# length of input data in bytes, but divisible by block size; 361# output: next hash value Xi; 362# 363$code.=<<___; 364.global gcm_ghash_v8 365.type gcm_ghash_v8,%function 366.align 4 367gcm_ghash_v8: 368___ 369$code.=<<___ if ($flavour =~ /64/); 370 AARCH64_VALID_CALL_TARGET 371 cmp $len,#64 372 b.hs .Lgcm_ghash_v8_4x 373___ 374$code.=<<___ if ($flavour !~ /64/); 375 vstmdb sp!,{d8-d15} @ 32-bit ABI says so 376___ 377$code.=<<___; 378 vld1.64 {$Xl},[$Xi] @ load [rotated] Xi 379 @ "[rotated]" means that 380 @ loaded value would have 381 @ to be rotated in order to 382 @ make it appear as in 383 @ algorithm specification 384 subs $len,$len,#32 @ see if $len is 32 or larger 385 mov $inc,#16 @ $inc is used as post- 386 @ increment for input pointer; 387 @ as loop is modulo-scheduled 388 @ $inc is zeroed just in time 389 @ to preclude overstepping 390 @ inp[len], which means that 391 @ last block[s] are actually 392 @ loaded twice, but last 393 @ copy is not processed 394 vld1.64 {$H-$Hhl},[$Htbl],#32 @ load twisted H, ..., H^2 395 vmov.i8 $xC2,#0xe1 396 vld1.64 {$H2},[$Htbl] 397 cclr $inc,eq @ is it time to zero $inc? 398 vext.8 $Xl,$Xl,$Xl,#8 @ rotate Xi 399 vld1.64 {$t0},[$inp],#16 @ load [rotated] I[0] 400 vshl.u64 $xC2,$xC2,#57 @ compose 0xc2.0 constant 401#ifndef __ARMEB__ 402 vrev64.8 $t0,$t0 403 vrev64.8 $Xl,$Xl 404#endif 405 vext.8 $IN,$t0,$t0,#8 @ rotate I[0] 406 b.lo .Lodd_tail_v8 @ $len was less than 32 407___ 408{ my ($Xln,$Xmn,$Xhn,$In) = map("q$_",(4..7)); 409 ####### 410 # Xi+2 =[H*(Ii+1 + Xi+1)] mod P = 411 # [(H*Ii+1) + (H*Xi+1)] mod P = 412 # [(H*Ii+1) + H^2*(Ii+Xi)] mod P 413 # 414$code.=<<___; 415 vld1.64 {$t1},[$inp],$inc @ load [rotated] I[1] 416#ifndef __ARMEB__ 417 vrev64.8 $t1,$t1 418#endif 419 vext.8 $In,$t1,$t1,#8 420 veor $IN,$IN,$Xl @ I[i]^=Xi 421 vpmull.p64 $Xln,$H,$In @ H·Ii+1 422 veor $t1,$t1,$In @ Karatsuba pre-processing 423 vpmull2.p64 $Xhn,$H,$In 424 b .Loop_mod2x_v8 425 426.align 4 427.Loop_mod2x_v8: 428 vext.8 $t2,$IN,$IN,#8 429 subs $len,$len,#32 @ is there more data? 430 vpmull.p64 $Xl,$H2,$IN @ H^2.lo·Xi.lo 431 cclr $inc,lo @ is it time to zero $inc? 432 433 vpmull.p64 $Xmn,$Hhl,$t1 434 veor $t2,$t2,$IN @ Karatsuba pre-processing 435 vpmull2.p64 $Xh,$H2,$IN @ H^2.hi·Xi.hi 436 veor $Xl,$Xl,$Xln @ accumulate 437 vpmull2.p64 $Xm,$Hhl,$t2 @ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi) 438 vld1.64 {$t0},[$inp],$inc @ load [rotated] I[i+2] 439 440 veor $Xh,$Xh,$Xhn 441 cclr $inc,eq @ is it time to zero $inc? 442 veor $Xm,$Xm,$Xmn 443 444 vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing 445 veor $t2,$Xl,$Xh 446 veor $Xm,$Xm,$t1 447 vld1.64 {$t1},[$inp],$inc @ load [rotated] I[i+3] 448#ifndef __ARMEB__ 449 vrev64.8 $t0,$t0 450#endif 451 veor $Xm,$Xm,$t2 452 vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction 453 454#ifndef __ARMEB__ 455 vrev64.8 $t1,$t1 456#endif 457 vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result 458 vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl 459 vext.8 $In,$t1,$t1,#8 460 vext.8 $IN,$t0,$t0,#8 461 veor $Xl,$Xm,$t2 462 vpmull.p64 $Xln,$H,$In @ H·Ii+1 463 veor $IN,$IN,$Xh @ accumulate $IN early 464 465 vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction 466 vpmull.p64 $Xl,$Xl,$xC2 467 veor $IN,$IN,$t2 468 veor $t1,$t1,$In @ Karatsuba pre-processing 469 veor $IN,$IN,$Xl 470 vpmull2.p64 $Xhn,$H,$In 471 b.hs .Loop_mod2x_v8 @ there was at least 32 more bytes 472 473 veor $Xh,$Xh,$t2 474 vext.8 $IN,$t0,$t0,#8 @ re-construct $IN 475 adds $len,$len,#32 @ re-construct $len 476 veor $Xl,$Xl,$Xh @ re-construct $Xl 477 b.eq .Ldone_v8 @ is $len zero? 478___ 479} 480$code.=<<___; 481.Lodd_tail_v8: 482 vext.8 $t2,$Xl,$Xl,#8 483 veor $IN,$IN,$Xl @ inp^=Xi 484 veor $t1,$t0,$t2 @ $t1 is rotated inp^Xi 485 486 vpmull.p64 $Xl,$H,$IN @ H.lo·Xi.lo 487 veor $t1,$t1,$IN @ Karatsuba pre-processing 488 vpmull2.p64 $Xh,$H,$IN @ H.hi·Xi.hi 489 vpmull.p64 $Xm,$Hhl,$t1 @ (H.lo+H.hi)·(Xi.lo+Xi.hi) 490 491 vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing 492 veor $t2,$Xl,$Xh 493 veor $Xm,$Xm,$t1 494 veor $Xm,$Xm,$t2 495 vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction 496 497 vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result 498 vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl 499 veor $Xl,$Xm,$t2 500 501 vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction 502 vpmull.p64 $Xl,$Xl,$xC2 503 veor $t2,$t2,$Xh 504 veor $Xl,$Xl,$t2 505 506.Ldone_v8: 507#ifndef __ARMEB__ 508 vrev64.8 $Xl,$Xl 509#endif 510 vext.8 $Xl,$Xl,$Xl,#8 511 vst1.64 {$Xl},[$Xi] @ write out Xi 512 513___ 514$code.=<<___ if ($flavour !~ /64/); 515 vldmia sp!,{d8-d15} @ 32-bit ABI says so 516___ 517$code.=<<___; 518 ret 519.size gcm_ghash_v8,.-gcm_ghash_v8 520___ 521 522if ($flavour =~ /64/) { # 4x subroutine 523my ($I0,$j1,$j2,$j3, 524 $I1,$I2,$I3,$H3,$H34,$H4,$Yl,$Ym,$Yh) = map("q$_",(4..7,15..23)); 525 526$code.=<<___; 527.type gcm_ghash_v8_4x,%function 528.align 4 529gcm_ghash_v8_4x: 530.Lgcm_ghash_v8_4x: 531 vld1.64 {$Xl},[$Xi] @ load [rotated] Xi 532 vld1.64 {$H-$H2},[$Htbl],#48 @ load twisted H, ..., H^2 533 vmov.i8 $xC2,#0xe1 534 vld1.64 {$H3-$H4},[$Htbl] @ load twisted H^3, ..., H^4 535 vshl.u64 $xC2,$xC2,#57 @ compose 0xc2.0 constant 536 537 vld1.64 {$I0-$j3},[$inp],#64 538#ifndef __ARMEB__ 539 vrev64.8 $Xl,$Xl 540 vrev64.8 $j1,$j1 541 vrev64.8 $j2,$j2 542 vrev64.8 $j3,$j3 543 vrev64.8 $I0,$I0 544#endif 545 vext.8 $I3,$j3,$j3,#8 546 vext.8 $I2,$j2,$j2,#8 547 vext.8 $I1,$j1,$j1,#8 548 549 vpmull.p64 $Yl,$H,$I3 @ H·Ii+3 550 veor $j3,$j3,$I3 551 vpmull2.p64 $Yh,$H,$I3 552 vpmull.p64 $Ym,$Hhl,$j3 553 554 vpmull.p64 $t0,$H2,$I2 @ H^2·Ii+2 555 veor $j2,$j2,$I2 556 vpmull2.p64 $I2,$H2,$I2 557 vpmull2.p64 $j2,$Hhl,$j2 558 559 veor $Yl,$Yl,$t0 560 veor $Yh,$Yh,$I2 561 veor $Ym,$Ym,$j2 562 563 vpmull.p64 $j3,$H3,$I1 @ H^3·Ii+1 564 veor $j1,$j1,$I1 565 vpmull2.p64 $I1,$H3,$I1 566 vpmull.p64 $j1,$H34,$j1 567 568 veor $Yl,$Yl,$j3 569 veor $Yh,$Yh,$I1 570 veor $Ym,$Ym,$j1 571 572 subs $len,$len,#128 573 b.lo .Ltail4x 574 575 b .Loop4x 576 577.align 4 578.Loop4x: 579 veor $t0,$I0,$Xl 580 vld1.64 {$I0-$j3},[$inp],#64 581 vext.8 $IN,$t0,$t0,#8 582#ifndef __ARMEB__ 583 vrev64.8 $j1,$j1 584 vrev64.8 $j2,$j2 585 vrev64.8 $j3,$j3 586 vrev64.8 $I0,$I0 587#endif 588 589 vpmull.p64 $Xl,$H4,$IN @ H^4·(Xi+Ii) 590 veor $t0,$t0,$IN 591 vpmull2.p64 $Xh,$H4,$IN 592 vext.8 $I3,$j3,$j3,#8 593 vpmull2.p64 $Xm,$H34,$t0 594 595 veor $Xl,$Xl,$Yl 596 veor $Xh,$Xh,$Yh 597 vext.8 $I2,$j2,$j2,#8 598 veor $Xm,$Xm,$Ym 599 vext.8 $I1,$j1,$j1,#8 600 601 vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing 602 veor $t2,$Xl,$Xh 603 vpmull.p64 $Yl,$H,$I3 @ H·Ii+3 604 veor $j3,$j3,$I3 605 veor $Xm,$Xm,$t1 606 vpmull2.p64 $Yh,$H,$I3 607 veor $Xm,$Xm,$t2 608 vpmull.p64 $Ym,$Hhl,$j3 609 610 vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction 611 vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result 612 vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl 613 vpmull.p64 $t0,$H2,$I2 @ H^2·Ii+2 614 veor $j2,$j2,$I2 615 vpmull2.p64 $I2,$H2,$I2 616 veor $Xl,$Xm,$t2 617 vpmull2.p64 $j2,$Hhl,$j2 618 619 veor $Yl,$Yl,$t0 620 veor $Yh,$Yh,$I2 621 veor $Ym,$Ym,$j2 622 623 vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction 624 vpmull.p64 $Xl,$Xl,$xC2 625 vpmull.p64 $j3,$H3,$I1 @ H^3·Ii+1 626 veor $j1,$j1,$I1 627 veor $t2,$t2,$Xh 628 vpmull2.p64 $I1,$H3,$I1 629 vpmull.p64 $j1,$H34,$j1 630 631 veor $Xl,$Xl,$t2 632 veor $Yl,$Yl,$j3 633 veor $Yh,$Yh,$I1 634 vext.8 $Xl,$Xl,$Xl,#8 635 veor $Ym,$Ym,$j1 636 637 subs $len,$len,#64 638 b.hs .Loop4x 639 640.Ltail4x: 641 veor $t0,$I0,$Xl 642 vext.8 $IN,$t0,$t0,#8 643 644 vpmull.p64 $Xl,$H4,$IN @ H^4·(Xi+Ii) 645 veor $t0,$t0,$IN 646 vpmull2.p64 $Xh,$H4,$IN 647 vpmull2.p64 $Xm,$H34,$t0 648 649 veor $Xl,$Xl,$Yl 650 veor $Xh,$Xh,$Yh 651 veor $Xm,$Xm,$Ym 652 653 adds $len,$len,#64 654 b.eq .Ldone4x 655 656 cmp $len,#32 657 b.lo .Lone 658 b.eq .Ltwo 659.Lthree: 660 vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing 661 veor $t2,$Xl,$Xh 662 veor $Xm,$Xm,$t1 663 vld1.64 {$I0-$j2},[$inp] 664 veor $Xm,$Xm,$t2 665#ifndef __ARMEB__ 666 vrev64.8 $j1,$j1 667 vrev64.8 $j2,$j2 668 vrev64.8 $I0,$I0 669#endif 670 671 vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction 672 vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result 673 vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl 674 vext.8 $I2,$j2,$j2,#8 675 vext.8 $I1,$j1,$j1,#8 676 veor $Xl,$Xm,$t2 677 678 vpmull.p64 $Yl,$H,$I2 @ H·Ii+2 679 veor $j2,$j2,$I2 680 681 vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction 682 vpmull.p64 $Xl,$Xl,$xC2 683 veor $t2,$t2,$Xh 684 vpmull2.p64 $Yh,$H,$I2 685 vpmull.p64 $Ym,$Hhl,$j2 686 veor $Xl,$Xl,$t2 687 vpmull.p64 $j3,$H2,$I1 @ H^2·Ii+1 688 veor $j1,$j1,$I1 689 vext.8 $Xl,$Xl,$Xl,#8 690 691 vpmull2.p64 $I1,$H2,$I1 692 veor $t0,$I0,$Xl 693 vpmull2.p64 $j1,$Hhl,$j1 694 vext.8 $IN,$t0,$t0,#8 695 696 veor $Yl,$Yl,$j3 697 veor $Yh,$Yh,$I1 698 veor $Ym,$Ym,$j1 699 700 vpmull.p64 $Xl,$H3,$IN @ H^3·(Xi+Ii) 701 veor $t0,$t0,$IN 702 vpmull2.p64 $Xh,$H3,$IN 703 vpmull.p64 $Xm,$H34,$t0 704 705 veor $Xl,$Xl,$Yl 706 veor $Xh,$Xh,$Yh 707 veor $Xm,$Xm,$Ym 708 b .Ldone4x 709 710.align 4 711.Ltwo: 712 vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing 713 veor $t2,$Xl,$Xh 714 veor $Xm,$Xm,$t1 715 vld1.64 {$I0-$j1},[$inp] 716 veor $Xm,$Xm,$t2 717#ifndef __ARMEB__ 718 vrev64.8 $j1,$j1 719 vrev64.8 $I0,$I0 720#endif 721 722 vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction 723 vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result 724 vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl 725 vext.8 $I1,$j1,$j1,#8 726 veor $Xl,$Xm,$t2 727 728 vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction 729 vpmull.p64 $Xl,$Xl,$xC2 730 veor $t2,$t2,$Xh 731 veor $Xl,$Xl,$t2 732 vext.8 $Xl,$Xl,$Xl,#8 733 734 vpmull.p64 $Yl,$H,$I1 @ H·Ii+1 735 veor $j1,$j1,$I1 736 737 veor $t0,$I0,$Xl 738 vext.8 $IN,$t0,$t0,#8 739 740 vpmull2.p64 $Yh,$H,$I1 741 vpmull.p64 $Ym,$Hhl,$j1 742 743 vpmull.p64 $Xl,$H2,$IN @ H^2·(Xi+Ii) 744 veor $t0,$t0,$IN 745 vpmull2.p64 $Xh,$H2,$IN 746 vpmull2.p64 $Xm,$Hhl,$t0 747 748 veor $Xl,$Xl,$Yl 749 veor $Xh,$Xh,$Yh 750 veor $Xm,$Xm,$Ym 751 b .Ldone4x 752 753.align 4 754.Lone: 755 vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing 756 veor $t2,$Xl,$Xh 757 veor $Xm,$Xm,$t1 758 vld1.64 {$I0},[$inp] 759 veor $Xm,$Xm,$t2 760#ifndef __ARMEB__ 761 vrev64.8 $I0,$I0 762#endif 763 764 vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction 765 vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result 766 vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl 767 veor $Xl,$Xm,$t2 768 769 vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction 770 vpmull.p64 $Xl,$Xl,$xC2 771 veor $t2,$t2,$Xh 772 veor $Xl,$Xl,$t2 773 vext.8 $Xl,$Xl,$Xl,#8 774 775 veor $t0,$I0,$Xl 776 vext.8 $IN,$t0,$t0,#8 777 778 vpmull.p64 $Xl,$H,$IN 779 veor $t0,$t0,$IN 780 vpmull2.p64 $Xh,$H,$IN 781 vpmull.p64 $Xm,$Hhl,$t0 782 783.Ldone4x: 784 vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing 785 veor $t2,$Xl,$Xh 786 veor $Xm,$Xm,$t1 787 veor $Xm,$Xm,$t2 788 789 vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction 790 vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result 791 vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl 792 veor $Xl,$Xm,$t2 793 794 vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction 795 vpmull.p64 $Xl,$Xl,$xC2 796 veor $t2,$t2,$Xh 797 veor $Xl,$Xl,$t2 798 vext.8 $Xl,$Xl,$Xl,#8 799 800#ifndef __ARMEB__ 801 vrev64.8 $Xl,$Xl 802#endif 803 vst1.64 {$Xl},[$Xi] @ write out Xi 804 805 ret 806.size gcm_ghash_v8_4x,.-gcm_ghash_v8_4x 807___ 808 809} 810} 811 812$code.=<<___; 813.asciz "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>" 814.align 2 815#endif 816___ 817 818if ($flavour =~ /64/) { ######## 64-bit code 819 sub unvmov { 820 my $arg=shift; 821 822 $arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o && 823 sprintf "ins v%d.d[%d],v%d.d[%d]",$1<8?$1:$1+8,($2 eq "lo")?0:1, 824 $3<8?$3:$3+8,($4 eq "lo")?0:1; 825 } 826 foreach(split("\n",$code)) { 827 s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or 828 s/vmov\.i8/movi/o or # fix up legacy mnemonics 829 s/vmov\s+(.*)/unvmov($1)/geo or 830 s/vext\.8/ext/o or 831 s/vshr\.s/sshr\.s/o or 832 s/vshr/ushr/o or 833 s/^(\s+)v/$1/o or # strip off v prefix 834 s/\bbx\s+lr\b/ret/o; 835 836 s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers 837 s/@\s/\/\//o; # old->new style commentary 838 839 # fix up remaining legacy suffixes 840 s/\.[ui]?8(\s)/$1/o; 841 s/\.[uis]?32//o and s/\.16b/\.4s/go; 842 m/\.p64/o and s/\.16b/\.1q/o; # 1st pmull argument 843 m/l\.p64/o and s/\.16b/\.1d/go; # 2nd and 3rd pmull arguments 844 s/\.[uisp]?64//o and s/\.16b/\.2d/go; 845 s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o; 846 847 # Switch preprocessor checks to aarch64 versions. 848 s/__ARME([BL])__/__AARCH64E$1__/go; 849 850 print $_,"\n"; 851 } 852} else { ######## 32-bit code 853 sub unvdup32 { 854 my $arg=shift; 855 856 $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o && 857 sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1; 858 } 859 sub unvpmullp64 { 860 my ($mnemonic,$arg)=@_; 861 862 if ($arg =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o) { 863 my $word = 0xf2a00e00|(($1&7)<<13)|(($1&8)<<19) 864 |(($2&7)<<17)|(($2&8)<<4) 865 |(($3&7)<<1) |(($3&8)<<2); 866 $word |= 0x00010001 if ($mnemonic =~ "2"); 867 # since ARMv7 instructions are always encoded little-endian. 868 # correct solution is to use .inst directive, but older 869 # assemblers don't implement it:-( 870 sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s", 871 $word&0xff,($word>>8)&0xff, 872 ($word>>16)&0xff,($word>>24)&0xff, 873 $mnemonic,$arg; 874 } 875 } 876 877 foreach(split("\n",$code)) { 878 s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers 879 s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers 880 s/\/\/\s?/@ /o; # new->old style commentary 881 882 # fix up remaining new-style suffixes 883 s/\],#[0-9]+/]!/o; 884 885 s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2 $1,#0/o or 886 s/vdup\.32\s+(.*)/unvdup32($1)/geo or 887 s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo or 888 s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or 889 s/^(\s+)b\./$1b/o or 890 s/^(\s+)ret/$1bx\tlr/o; 891 892 if (s/^(\s+)mov\.([a-z]+)/$1mov$2/) { 893 print " it $2\n"; 894 } 895 896 print $_,"\n"; 897 } 898} 899 900close STDOUT or die "error closing STDOUT: $!"; # enforce flush 901