1#!/usr/bin/env perl 2# Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved. 3# 4# Licensed under the Apache License 2.0 (the "License"). You may not use 5# this file except in compliance with the License. You can obtain a copy 6# in the file LICENSE in the source distribution or at 7# https://www.openssl.org/source/license.html 8# 9# ==================================================================== 10# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 11# project. The module is, however, dual licensed under OpenSSL and 12# CRYPTOGAMS licenses depending on where you obtain it. For further 13# details see http://www.openssl.org/~appro/cryptogams/. 14# ==================================================================== 15# 16# Keccak-1600 for ARMv4. 17# 18# June 2017. 19# 20# Non-NEON code is KECCAK_1X variant (see sha/keccak1600.c) with bit 21# interleaving. How does it compare to Keccak Code Package? It's as 22# fast, but several times smaller, and is endian- and ISA-neutral. ISA 23# neutrality means that minimum ISA requirement is ARMv4, yet it can 24# be assembled even as Thumb-2. NEON code path is KECCAK_1X_ALT with 25# register layout taken from Keccak Code Package. It's also as fast, 26# in fact faster by 10-15% on some processors, and endian-neutral. 27# 28# August 2017. 29# 30# Switch to KECCAK_2X variant for non-NEON code and merge almost 1/2 31# of rotate instructions with logical ones. This resulted in ~10% 32# improvement on most processors. Switch to KECCAK_2X effectively 33# minimizes re-loads from temporary storage, and merged rotates just 34# eliminate corresponding instructions. As for latter. When examining 35# code you'll notice commented ror instructions. These are eliminated 36# ones, and you should trace destination register below to see what's 37# going on. Just in case, why not all rotates are eliminated. Trouble 38# is that you have operations that require both inputs to be rotated, 39# e.g. 
'eor a,b>>>x,c>>>y'. This conundrum is resolved by using 40# 'eor a,b,c>>>(x-y)' and then merge-rotating 'a' in next operation 41# that takes 'a' as input. And thing is that this next operation can 42# be in next round. It's totally possible to "carry" rotate "factors" 43# to the next round, but it makes code more complex. And the last word 44# is the keyword, i.e. "almost 1/2" is kind of complexity cap [for the 45# time being]... 46# 47# Reduce per-round instruction count in Thumb-2 case by 16%. This is 48# achieved by folding ldr/str pairs to their double-word counterparts. 49# Theoretically this should have improved performance on single-issue 50# cores, such as Cortex-A5/A7, by 19%. Reality is a bit different, as 51# usual... 52# 53######################################################################## 54# Numbers are cycles per processed byte. Non-NEON results account even 55# for input bit interleaving. 56# 57# r=1088(*) Thumb-2(**) NEON 58# 59# ARM11xx 82/+150% 60# Cortex-A5 88/+160%, 86, 36 61# Cortex-A7 78/+160%, 68, 34 62# Cortex-A8 51/+230%, 57, 30 63# Cortex-A9 53/+210%, 51, 26 64# Cortex-A15 42/+160%, 38, 18 65# Snapdragon S4 43/+210%, 38, 24 66# 67# (*) Corresponds to SHA3-256. Percentage after slash is improvement 68# over compiler-generated KECCAK_2X reference code. 69# (**) Thumb-2 results for Cortex-A5/A7 are likely to apply even to 70# Cortex-Mx, x>=3. Otherwise, non-NEON results for NEON-capable 71# processors are presented mostly for reference purposes. 72 73# $output is the last argument if it looks like a file (it has an extension) 74# $flavour is the first argument if it doesn't look like a file 75$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; 76$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? 
shift : undef; 77 78if ($flavour && $flavour ne "void") { 79 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 80 ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or 81 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or 82 die "can't locate arm-xlate.pl"; 83 84 open STDOUT,"| \"$^X\" $xlate $flavour \"$output\"" 85 or die "can't call $xlate: $!"; 86} else { 87 $output and open STDOUT,">$output"; 88} 89 90my @C = map("r$_",(0..9)); 91my @E = map("r$_",(10..12,14)); 92 93######################################################################## 94# Stack layout 95# ----->+-----------------------+ 96# | uint64_t A[5][5] | 97# | ... | 98# +200->+-----------------------+ 99# | uint64_t D[5] | 100# | ... | 101# +240->+-----------------------+ 102# | uint64_t T[5][5] | 103# | ... | 104# +440->+-----------------------+ 105# | saved lr | 106# +444->+-----------------------+ 107# | loop counter | 108# +448->+-----------------------+ 109# | ... 110 111my @A = map([ 8*$_, 8*($_+1), 8*($_+2), 8*($_+3), 8*($_+4) ], (0,5,10,15,20)); 112my @D = map(8*$_, (25..29)); 113my @T = map([ 8*$_, 8*($_+1), 8*($_+2), 8*($_+3), 8*($_+4) ], (30,35,40,45,50)); 114 115$code.=<<___; 116#include "arm_arch.h" 117 118#if defined(__thumb2__) 119.syntax unified 120.thumb 121#else 122.code 32 123#endif 124 125.text 126 127.type iotas32, %object 128.align 5 129iotas32: 130 .long 0x00000001, 0x00000000 131 .long 0x00000000, 0x00000089 132 .long 0x00000000, 0x8000008b 133 .long 0x00000000, 0x80008080 134 .long 0x00000001, 0x0000008b 135 .long 0x00000001, 0x00008000 136 .long 0x00000001, 0x80008088 137 .long 0x00000001, 0x80000082 138 .long 0x00000000, 0x0000000b 139 .long 0x00000000, 0x0000000a 140 .long 0x00000001, 0x00008082 141 .long 0x00000000, 0x00008003 142 .long 0x00000001, 0x0000808b 143 .long 0x00000001, 0x8000000b 144 .long 0x00000001, 0x8000008a 145 .long 0x00000001, 0x80000081 146 .long 0x00000000, 0x80000081 147 .long 0x00000000, 0x80000008 148 .long 0x00000000, 0x00000083 149 .long 0x00000000, 
0x80008003 150 .long 0x00000001, 0x80008088 151 .long 0x00000000, 0x80000088 152 .long 0x00000001, 0x00008000 153 .long 0x00000000, 0x80008082 154.size iotas32,.-iotas32 155 156.type KeccakF1600_int, %function 157.align 5 158KeccakF1600_int: 159 add @C[9],sp,#$A[4][2] 160 add @E[2],sp,#$A[0][0] 161 add @E[0],sp,#$A[1][0] 162 ldmia @C[9],{@C[4]-@C[9]} @ A[4][2..4] 163KeccakF1600_enter: 164 str lr,[sp,#440] 165 eor @E[1],@E[1],@E[1] 166 str @E[1],[sp,#444] 167 b .Lround2x 168 169.align 4 170.Lround2x: 171___ 172sub Round { 173my (@A,@R); (@A[0..4],@R) = @_; 174 175$code.=<<___; 176 ldmia @E[2],{@C[0]-@C[3]} @ A[0][0..1] 177 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[1][0..1] 178#ifdef __thumb2__ 179 eor @C[0],@C[0],@E[0] 180 eor @C[1],@C[1],@E[1] 181 eor @C[2],@C[2],@E[2] 182 ldrd @E[0],@E[1],[sp,#$A[1][2]] 183 eor @C[3],@C[3],@E[3] 184 ldrd @E[2],@E[3],[sp,#$A[1][3]] 185 eor @C[4],@C[4],@E[0] 186 eor @C[5],@C[5],@E[1] 187 eor @C[6],@C[6],@E[2] 188 ldrd @E[0],@E[1],[sp,#$A[1][4]] 189 eor @C[7],@C[7],@E[3] 190 ldrd @E[2],@E[3],[sp,#$A[2][0]] 191 eor @C[8],@C[8],@E[0] 192 eor @C[9],@C[9],@E[1] 193 eor @C[0],@C[0],@E[2] 194 ldrd @E[0],@E[1],[sp,#$A[2][1]] 195 eor @C[1],@C[1],@E[3] 196 ldrd @E[2],@E[3],[sp,#$A[2][2]] 197 eor @C[2],@C[2],@E[0] 198 eor @C[3],@C[3],@E[1] 199 eor @C[4],@C[4],@E[2] 200 ldrd @E[0],@E[1],[sp,#$A[2][3]] 201 eor @C[5],@C[5],@E[3] 202 ldrd @E[2],@E[3],[sp,#$A[2][4]] 203 eor @C[6],@C[6],@E[0] 204 eor @C[7],@C[7],@E[1] 205 eor @C[8],@C[8],@E[2] 206 ldrd @E[0],@E[1],[sp,#$A[3][0]] 207 eor @C[9],@C[9],@E[3] 208 ldrd @E[2],@E[3],[sp,#$A[3][1]] 209 eor @C[0],@C[0],@E[0] 210 eor @C[1],@C[1],@E[1] 211 eor @C[2],@C[2],@E[2] 212 ldrd @E[0],@E[1],[sp,#$A[3][2]] 213 eor @C[3],@C[3],@E[3] 214 ldrd @E[2],@E[3],[sp,#$A[3][3]] 215 eor @C[4],@C[4],@E[0] 216 eor @C[5],@C[5],@E[1] 217 eor @C[6],@C[6],@E[2] 218 ldrd @E[0],@E[1],[sp,#$A[3][4]] 219 eor @C[7],@C[7],@E[3] 220 ldrd @E[2],@E[3],[sp,#$A[4][0]] 221 eor @C[8],@C[8],@E[0] 222 eor @C[9],@C[9],@E[1] 223 eor 
@C[0],@C[0],@E[2] 224 ldrd @E[0],@E[1],[sp,#$A[4][1]] 225 eor @C[1],@C[1],@E[3] 226 ldrd @E[2],@E[3],[sp,#$A[0][2]] 227 eor @C[2],@C[2],@E[0] 228 eor @C[3],@C[3],@E[1] 229 eor @C[4],@C[4],@E[2] 230 ldrd @E[0],@E[1],[sp,#$A[0][3]] 231 eor @C[5],@C[5],@E[3] 232 ldrd @E[2],@E[3],[sp,#$A[0][4]] 233#else 234 eor @C[0],@C[0],@E[0] 235 add @E[0],sp,#$A[1][2] 236 eor @C[1],@C[1],@E[1] 237 eor @C[2],@C[2],@E[2] 238 eor @C[3],@C[3],@E[3] 239 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[1][2..3] 240 eor @C[4],@C[4],@E[0] 241 add @E[0],sp,#$A[1][4] 242 eor @C[5],@C[5],@E[1] 243 eor @C[6],@C[6],@E[2] 244 eor @C[7],@C[7],@E[3] 245 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[1][4]..A[2][0] 246 eor @C[8],@C[8],@E[0] 247 add @E[0],sp,#$A[2][1] 248 eor @C[9],@C[9],@E[1] 249 eor @C[0],@C[0],@E[2] 250 eor @C[1],@C[1],@E[3] 251 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[2][1..2] 252 eor @C[2],@C[2],@E[0] 253 add @E[0],sp,#$A[2][3] 254 eor @C[3],@C[3],@E[1] 255 eor @C[4],@C[4],@E[2] 256 eor @C[5],@C[5],@E[3] 257 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[2][3..4] 258 eor @C[6],@C[6],@E[0] 259 add @E[0],sp,#$A[3][0] 260 eor @C[7],@C[7],@E[1] 261 eor @C[8],@C[8],@E[2] 262 eor @C[9],@C[9],@E[3] 263 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[3][0..1] 264 eor @C[0],@C[0],@E[0] 265 add @E[0],sp,#$A[3][2] 266 eor @C[1],@C[1],@E[1] 267 eor @C[2],@C[2],@E[2] 268 eor @C[3],@C[3],@E[3] 269 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[3][2..3] 270 eor @C[4],@C[4],@E[0] 271 add @E[0],sp,#$A[3][4] 272 eor @C[5],@C[5],@E[1] 273 eor @C[6],@C[6],@E[2] 274 eor @C[7],@C[7],@E[3] 275 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[3][4]..A[4][0] 276 eor @C[8],@C[8],@E[0] 277 ldr @E[0],[sp,#$A[4][1]] @ A[4][1] 278 eor @C[9],@C[9],@E[1] 279 ldr @E[1],[sp,#$A[4][1]+4] 280 eor @C[0],@C[0],@E[2] 281 ldr @E[2],[sp,#$A[0][2]] @ A[0][2] 282 eor @C[1],@C[1],@E[3] 283 ldr @E[3],[sp,#$A[0][2]+4] 284 eor @C[2],@C[2],@E[0] 285 add @E[0],sp,#$A[0][3] 286 eor @C[3],@C[3],@E[1] 287 eor @C[4],@C[4],@E[2] 288 eor @C[5],@C[5],@E[3] 289 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[0][3..4] 
290#endif 291 eor @C[6],@C[6],@E[0] 292 eor @C[7],@C[7],@E[1] 293 eor @C[8],@C[8],@E[2] 294 eor @C[9],@C[9],@E[3] 295 296 eor @E[0],@C[0],@C[5],ror#32-1 @ E[0] = ROL64(C[2], 1) ^ C[0]; 297 str.l @E[0],[sp,#$D[1]] @ D[1] = E[0] 298 eor @E[1],@C[1],@C[4] 299 str.h @E[1],[sp,#$D[1]+4] 300 eor @E[2],@C[6],@C[1],ror#32-1 @ E[1] = ROL64(C[0], 1) ^ C[3]; 301 eor @E[3],@C[7],@C[0] 302 str.l @E[2],[sp,#$D[4]] @ D[4] = E[1] 303 eor @C[0],@C[8],@C[3],ror#32-1 @ C[0] = ROL64(C[1], 1) ^ C[4]; 304 str.h @E[3],[sp,#$D[4]+4] 305 eor @C[1],@C[9],@C[2] 306 str.l @C[0],[sp,#$D[0]] @ D[0] = C[0] 307 eor @C[2],@C[2],@C[7],ror#32-1 @ C[1] = ROL64(C[3], 1) ^ C[1]; 308 ldr.l @C[7],[sp,#$A[3][3]] 309 eor @C[3],@C[3],@C[6] 310 str.h @C[1],[sp,#$D[0]+4] 311 ldr.h @C[6],[sp,#$A[3][3]+4] 312 str.l @C[2],[sp,#$D[2]] @ D[2] = C[1] 313 eor @C[4],@C[4],@C[9],ror#32-1 @ C[2] = ROL64(C[4], 1) ^ C[2]; 314 str.h @C[3],[sp,#$D[2]+4] 315 eor @C[5],@C[5],@C[8] 316 317 ldr.l @C[8],[sp,#$A[4][4]] 318 ldr.h @C[9],[sp,#$A[4][4]+4] 319 str.l @C[4],[sp,#$D[3]] @ D[3] = C[2] 320 eor @C[7],@C[7],@C[4] 321 str.h @C[5],[sp,#$D[3]+4] 322 eor @C[6],@C[6],@C[5] 323 ldr.l @C[4],[sp,#$A[0][0]] 324 @ ror @C[7],@C[7],#32-10 @ C[3] = ROL64(A[3][3] ^ C[2], rhotates[3][3]); /* D[3] */ 325 @ ror @C[6],@C[6],#32-11 326 ldr.h @C[5],[sp,#$A[0][0]+4] 327 eor @C[8],@C[8],@E[2] 328 eor @C[9],@C[9],@E[3] 329 ldr.l @E[2],[sp,#$A[2][2]] 330 eor @C[0],@C[0],@C[4] 331 ldr.h @E[3],[sp,#$A[2][2]+4] 332 @ ror @C[8],@C[8],#32-7 @ C[4] = ROL64(A[4][4] ^ E[1], rhotates[4][4]); /* D[4] */ 333 @ ror @C[9],@C[9],#32-7 334 eor @C[1],@C[1],@C[5] @ C[0] = A[0][0] ^ C[0]; /* rotate by 0 */ /* D[0] */ 335 eor @E[2],@E[2],@C[2] 336 ldr.l @C[2],[sp,#$A[1][1]] 337 eor @E[3],@E[3],@C[3] 338 ldr.h @C[3],[sp,#$A[1][1]+4] 339 ror @C[5],@E[2],#32-21 @ C[2] = ROL64(A[2][2] ^ C[1], rhotates[2][2]); /* D[2] */ 340 ldr @E[2],[sp,#444] @ load counter 341 eor @C[2],@C[2],@E[0] 342 adr @E[0],iotas32 343 ror @C[4],@E[3],#32-22 344 add @E[3],@E[0],@E[2] 345 eor 
@C[3],@C[3],@E[1] 346___ 347$code.=<<___ if ($A[0][0] != $T[0][0]); 348 ldmia @E[3],{@E[0],@E[1]} @ iotas[i] 349___ 350$code.=<<___ if ($A[0][0] == $T[0][0]); 351 ldr.l @E[0],[@E[3],#8] @ iotas[i].lo 352 add @E[2],@E[2],#16 353 ldr.h @E[1],[@E[3],#12] @ iotas[i].hi 354 cmp @E[2],#192 355 str @E[2],[sp,#444] @ store counter 356___ 357$code.=<<___; 358 bic @E[2],@C[4],@C[2],ror#32-22 359 bic @E[3],@C[5],@C[3],ror#32-22 360 ror @C[2],@C[2],#32-22 @ C[1] = ROL64(A[1][1] ^ E[0], rhotates[1][1]); /* D[1] */ 361 ror @C[3],@C[3],#32-22 362 eor @E[2],@E[2],@C[0] 363 eor @E[3],@E[3],@C[1] 364 eor @E[0],@E[0],@E[2] 365 eor @E[1],@E[1],@E[3] 366 str.l @E[0],[sp,#$R[0][0]] @ R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i]; 367 bic @E[2],@C[6],@C[4],ror#11 368 str.h @E[1],[sp,#$R[0][0]+4] 369 bic @E[3],@C[7],@C[5],ror#10 370 bic @E[0],@C[8],@C[6],ror#32-(11-7) 371 bic @E[1],@C[9],@C[7],ror#32-(10-7) 372 eor @E[2],@C[2],@E[2],ror#32-11 373 str.l @E[2],[sp,#$R[0][1]] @ R[0][1] = C[1] ^ (~C[2] & C[3]); 374 eor @E[3],@C[3],@E[3],ror#32-10 375 str.h @E[3],[sp,#$R[0][1]+4] 376 eor @E[0],@C[4],@E[0],ror#32-7 377 eor @E[1],@C[5],@E[1],ror#32-7 378 str.l @E[0],[sp,#$R[0][2]] @ R[0][2] = C[2] ^ (~C[3] & C[4]); 379 bic @E[2],@C[0],@C[8],ror#32-7 380 str.h @E[1],[sp,#$R[0][2]+4] 381 bic @E[3],@C[1],@C[9],ror#32-7 382 eor @E[2],@E[2],@C[6],ror#32-11 383 str.l @E[2],[sp,#$R[0][3]] @ R[0][3] = C[3] ^ (~C[4] & C[0]); 384 eor @E[3],@E[3],@C[7],ror#32-10 385 str.h @E[3],[sp,#$R[0][3]+4] 386 bic @E[0],@C[2],@C[0] 387 add @E[3],sp,#$D[3] 388 ldr.l @C[0],[sp,#$A[0][3]] @ A[0][3] 389 bic @E[1],@C[3],@C[1] 390 ldr.h @C[1],[sp,#$A[0][3]+4] 391 eor @E[0],@E[0],@C[8],ror#32-7 392 eor @E[1],@E[1],@C[9],ror#32-7 393 str.l @E[0],[sp,#$R[0][4]] @ R[0][4] = C[4] ^ (~C[0] & C[1]); 394 add @C[9],sp,#$D[0] 395 str.h @E[1],[sp,#$R[0][4]+4] 396 397 ldmia @E[3],{@E[0]-@E[2],@E[3]} @ D[3..4] 398 ldmia @C[9],{@C[6]-@C[9]} @ D[0..1] 399 400 ldr.l @C[2],[sp,#$A[1][4]] @ A[1][4] 401 eor @C[0],@C[0],@E[0] 402 ldr.h 
@C[3],[sp,#$A[1][4]+4] 403 eor @C[1],@C[1],@E[1] 404 @ ror @C[0],@C[0],#32-14 @ C[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]); 405 ldr.l @E[0],[sp,#$A[3][1]] @ A[3][1] 406 @ ror @C[1],@C[1],#32-14 407 ldr.h @E[1],[sp,#$A[3][1]+4] 408 409 eor @C[2],@C[2],@E[2] 410 ldr.l @C[4],[sp,#$A[2][0]] @ A[2][0] 411 eor @C[3],@C[3],@E[3] 412 ldr.h @C[5],[sp,#$A[2][0]+4] 413 @ ror @C[2],@C[2],#32-10 @ C[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]); 414 @ ror @C[3],@C[3],#32-10 415 416 eor @C[6],@C[6],@C[4] 417 ldr.l @E[2],[sp,#$D[2]] @ D[2] 418 eor @C[7],@C[7],@C[5] 419 ldr.h @E[3],[sp,#$D[2]+4] 420 ror @C[5],@C[6],#32-1 @ C[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]); 421 ror @C[4],@C[7],#32-2 422 423 eor @E[0],@E[0],@C[8] 424 ldr.l @C[8],[sp,#$A[4][2]] @ A[4][2] 425 eor @E[1],@E[1],@C[9] 426 ldr.h @C[9],[sp,#$A[4][2]+4] 427 ror @C[7],@E[0],#32-22 @ C[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]); 428 ror @C[6],@E[1],#32-23 429 430 bic @E[0],@C[4],@C[2],ror#32-10 431 bic @E[1],@C[5],@C[3],ror#32-10 432 eor @E[2],@E[2],@C[8] 433 eor @E[3],@E[3],@C[9] 434 ror @C[9],@E[2],#32-30 @ C[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]); 435 ror @C[8],@E[3],#32-31 436 eor @E[0],@E[0],@C[0],ror#32-14 437 eor @E[1],@E[1],@C[1],ror#32-14 438 str.l @E[0],[sp,#$R[1][0]] @ R[1][0] = C[0] ^ (~C[1] & C[2]) 439 bic @E[2],@C[6],@C[4] 440 str.h @E[1],[sp,#$R[1][0]+4] 441 bic @E[3],@C[7],@C[5] 442 eor @E[2],@E[2],@C[2],ror#32-10 443 str.l @E[2],[sp,#$R[1][1]] @ R[1][1] = C[1] ^ (~C[2] & C[3]); 444 eor @E[3],@E[3],@C[3],ror#32-10 445 str.h @E[3],[sp,#$R[1][1]+4] 446 bic @E[0],@C[8],@C[6] 447 bic @E[1],@C[9],@C[7] 448 bic @E[2],@C[0],@C[8],ror#14 449 bic @E[3],@C[1],@C[9],ror#14 450 eor @E[0],@E[0],@C[4] 451 eor @E[1],@E[1],@C[5] 452 str.l @E[0],[sp,#$R[1][2]] @ R[1][2] = C[2] ^ (~C[3] & C[4]); 453 bic @C[2],@C[2],@C[0],ror#32-(14-10) 454 str.h @E[1],[sp,#$R[1][2]+4] 455 eor @E[2],@C[6],@E[2],ror#32-14 456 bic @E[1],@C[3],@C[1],ror#32-(14-10) 457 str.l @E[2],[sp,#$R[1][3]] @ R[1][3] = C[3] ^ (~C[4] & C[0]); 458 
eor @E[3],@C[7],@E[3],ror#32-14 459 str.h @E[3],[sp,#$R[1][3]+4] 460 add @E[2],sp,#$D[1] 461 ldr.l @C[1],[sp,#$A[0][1]] @ A[0][1] 462 eor @E[0],@C[8],@C[2],ror#32-10 463 ldr.h @C[0],[sp,#$A[0][1]+4] 464 eor @E[1],@C[9],@E[1],ror#32-10 465 str.l @E[0],[sp,#$R[1][4]] @ R[1][4] = C[4] ^ (~C[0] & C[1]); 466 str.h @E[1],[sp,#$R[1][4]+4] 467 468 add @C[9],sp,#$D[3] 469 ldmia @E[2],{@E[0]-@E[2],@E[3]} @ D[1..2] 470 ldr.l @C[2],[sp,#$A[1][2]] @ A[1][2] 471 ldr.h @C[3],[sp,#$A[1][2]+4] 472 ldmia @C[9],{@C[6]-@C[9]} @ D[3..4] 473 474 eor @C[1],@C[1],@E[0] 475 ldr.l @C[4],[sp,#$A[2][3]] @ A[2][3] 476 eor @C[0],@C[0],@E[1] 477 ldr.h @C[5],[sp,#$A[2][3]+4] 478 ror @C[0],@C[0],#32-1 @ C[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]); 479 480 eor @C[2],@C[2],@E[2] 481 ldr.l @E[0],[sp,#$A[3][4]] @ A[3][4] 482 eor @C[3],@C[3],@E[3] 483 ldr.h @E[1],[sp,#$A[3][4]+4] 484 @ ror @C[2],@C[2],#32-3 @ C[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]); 485 ldr.l @E[2],[sp,#$D[0]] @ D[0] 486 @ ror @C[3],@C[3],#32-3 487 ldr.h @E[3],[sp,#$D[0]+4] 488 489 eor @C[4],@C[4],@C[6] 490 eor @C[5],@C[5],@C[7] 491 @ ror @C[5],@C[6],#32-12 @ C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]); 492 @ ror @C[4],@C[7],#32-13 @ [track reverse order below] 493 494 eor @E[0],@E[0],@C[8] 495 ldr.l @C[8],[sp,#$A[4][0]] @ A[4][0] 496 eor @E[1],@E[1],@C[9] 497 ldr.h @C[9],[sp,#$A[4][0]+4] 498 ror @C[6],@E[0],#32-4 @ C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]); 499 ror @C[7],@E[1],#32-4 500 501 eor @E[2],@E[2],@C[8] 502 eor @E[3],@E[3],@C[9] 503 ror @C[8],@E[2],#32-9 @ C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]); 504 ror @C[9],@E[3],#32-9 505 506 bic @E[0],@C[5],@C[2],ror#13-3 507 bic @E[1],@C[4],@C[3],ror#12-3 508 bic @E[2],@C[6],@C[5],ror#32-13 509 bic @E[3],@C[7],@C[4],ror#32-12 510 eor @E[0],@C[0],@E[0],ror#32-13 511 eor @E[1],@C[1],@E[1],ror#32-12 512 str.l @E[0],[sp,#$R[2][0]] @ R[2][0] = C[0] ^ (~C[1] & C[2]) 513 eor @E[2],@E[2],@C[2],ror#32-3 514 str.h @E[1],[sp,#$R[2][0]+4] 515 eor @E[3],@E[3],@C[3],ror#32-3 516 str.l 
@E[2],[sp,#$R[2][1]] @ R[2][1] = C[1] ^ (~C[2] & C[3]); 517 bic @E[0],@C[8],@C[6] 518 bic @E[1],@C[9],@C[7] 519 str.h @E[3],[sp,#$R[2][1]+4] 520 eor @E[0],@E[0],@C[5],ror#32-13 521 eor @E[1],@E[1],@C[4],ror#32-12 522 str.l @E[0],[sp,#$R[2][2]] @ R[2][2] = C[2] ^ (~C[3] & C[4]); 523 bic @E[2],@C[0],@C[8] 524 str.h @E[1],[sp,#$R[2][2]+4] 525 bic @E[3],@C[1],@C[9] 526 eor @E[2],@E[2],@C[6] 527 eor @E[3],@E[3],@C[7] 528 str.l @E[2],[sp,#$R[2][3]] @ R[2][3] = C[3] ^ (~C[4] & C[0]); 529 bic @E[0],@C[2],@C[0],ror#3 530 str.h @E[3],[sp,#$R[2][3]+4] 531 bic @E[1],@C[3],@C[1],ror#3 532 ldr.l @C[1],[sp,#$A[0][4]] @ A[0][4] [in reverse order] 533 eor @E[0],@C[8],@E[0],ror#32-3 534 ldr.h @C[0],[sp,#$A[0][4]+4] 535 eor @E[1],@C[9],@E[1],ror#32-3 536 str.l @E[0],[sp,#$R[2][4]] @ R[2][4] = C[4] ^ (~C[0] & C[1]); 537 add @C[9],sp,#$D[1] 538 str.h @E[1],[sp,#$R[2][4]+4] 539 540 ldr.l @E[0],[sp,#$D[4]] @ D[4] 541 ldr.h @E[1],[sp,#$D[4]+4] 542 ldr.l @E[2],[sp,#$D[0]] @ D[0] 543 ldr.h @E[3],[sp,#$D[0]+4] 544 545 ldmia @C[9],{@C[6]-@C[9]} @ D[1..2] 546 547 eor @C[1],@C[1],@E[0] 548 ldr.l @C[2],[sp,#$A[1][0]] @ A[1][0] 549 eor @C[0],@C[0],@E[1] 550 ldr.h @C[3],[sp,#$A[1][0]+4] 551 @ ror @C[1],@E[0],#32-13 @ C[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]); 552 ldr.l @C[4],[sp,#$A[2][1]] @ A[2][1] 553 @ ror @C[0],@E[1],#32-14 @ [was loaded in reverse order] 554 ldr.h @C[5],[sp,#$A[2][1]+4] 555 556 eor @C[2],@C[2],@E[2] 557 ldr.l @E[0],[sp,#$A[3][2]] @ A[3][2] 558 eor @C[3],@C[3],@E[3] 559 ldr.h @E[1],[sp,#$A[3][2]+4] 560 @ ror @C[2],@C[2],#32-18 @ C[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]); 561 ldr.l @E[2],[sp,#$D[3]] @ D[3] 562 @ ror @C[3],@C[3],#32-18 563 ldr.h @E[3],[sp,#$D[3]+4] 564 565 eor @C[6],@C[6],@C[4] 566 eor @C[7],@C[7],@C[5] 567 ror @C[4],@C[6],#32-5 @ C[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]); 568 ror @C[5],@C[7],#32-5 569 570 eor @E[0],@E[0],@C[8] 571 ldr.l @C[8],[sp,#$A[4][3]] @ A[4][3] 572 eor @E[1],@E[1],@C[9] 573 ldr.h @C[9],[sp,#$A[4][3]+4] 574 ror @C[7],@E[0],#32-7 @ 
C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]); 575 ror @C[6],@E[1],#32-8 576 577 eor @E[2],@E[2],@C[8] 578 eor @E[3],@E[3],@C[9] 579 ror @C[8],@E[2],#32-28 @ C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]); 580 ror @C[9],@E[3],#32-28 581 582 bic @E[0],@C[4],@C[2],ror#32-18 583 bic @E[1],@C[5],@C[3],ror#32-18 584 eor @E[0],@E[0],@C[0],ror#32-14 585 eor @E[1],@E[1],@C[1],ror#32-13 586 str.l @E[0],[sp,#$R[3][0]] @ R[3][0] = C[0] ^ (~C[1] & C[2]) 587 bic @E[2],@C[6],@C[4] 588 str.h @E[1],[sp,#$R[3][0]+4] 589 bic @E[3],@C[7],@C[5] 590 eor @E[2],@E[2],@C[2],ror#32-18 591 str.l @E[2],[sp,#$R[3][1]] @ R[3][1] = C[1] ^ (~C[2] & C[3]); 592 eor @E[3],@E[3],@C[3],ror#32-18 593 str.h @E[3],[sp,#$R[3][1]+4] 594 bic @E[0],@C[8],@C[6] 595 bic @E[1],@C[9],@C[7] 596 bic @E[2],@C[0],@C[8],ror#14 597 bic @E[3],@C[1],@C[9],ror#13 598 eor @E[0],@E[0],@C[4] 599 eor @E[1],@E[1],@C[5] 600 str.l @E[0],[sp,#$R[3][2]] @ R[3][2] = C[2] ^ (~C[3] & C[4]); 601 bic @C[2],@C[2],@C[0],ror#18-14 602 str.h @E[1],[sp,#$R[3][2]+4] 603 eor @E[2],@C[6],@E[2],ror#32-14 604 bic @E[1],@C[3],@C[1],ror#18-13 605 eor @E[3],@C[7],@E[3],ror#32-13 606 str.l @E[2],[sp,#$R[3][3]] @ R[3][3] = C[3] ^ (~C[4] & C[0]); 607 str.h @E[3],[sp,#$R[3][3]+4] 608 add @E[3],sp,#$D[2] 609 ldr.l @C[0],[sp,#$A[0][2]] @ A[0][2] 610 eor @E[0],@C[8],@C[2],ror#32-18 611 ldr.h @C[1],[sp,#$A[0][2]+4] 612 eor @E[1],@C[9],@E[1],ror#32-18 613 str.l @E[0],[sp,#$R[3][4]] @ R[3][4] = C[4] ^ (~C[0] & C[1]); 614 str.h @E[1],[sp,#$R[3][4]+4] 615 616 ldmia @E[3],{@E[0]-@E[2],@E[3]} @ D[2..3] 617 ldr.l @C[2],[sp,#$A[1][3]] @ A[1][3] 618 ldr.h @C[3],[sp,#$A[1][3]+4] 619 ldr.l @C[6],[sp,#$D[4]] @ D[4] 620 ldr.h @C[7],[sp,#$D[4]+4] 621 622 eor @C[0],@C[0],@E[0] 623 ldr.l @C[4],[sp,#$A[2][4]] @ A[2][4] 624 eor @C[1],@C[1],@E[1] 625 ldr.h @C[5],[sp,#$A[2][4]+4] 626 @ ror @C[0],@C[0],#32-31 @ C[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]); 627 ldr.l @C[8],[sp,#$D[0]] @ D[0] 628 @ ror @C[1],@C[1],#32-31 629 ldr.h @C[9],[sp,#$D[0]+4] 630 631 eor @E[2],@E[2],@C[2] 
632 ldr.l @E[0],[sp,#$A[3][0]] @ A[3][0] 633 eor @E[3],@E[3],@C[3] 634 ldr.h @E[1],[sp,#$A[3][0]+4] 635 ror @C[3],@E[2],#32-27 @ C[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]); 636 ldr.l @E[2],[sp,#$D[1]] @ D[1] 637 ror @C[2],@E[3],#32-28 638 ldr.h @E[3],[sp,#$D[1]+4] 639 640 eor @C[6],@C[6],@C[4] 641 eor @C[7],@C[7],@C[5] 642 ror @C[5],@C[6],#32-19 @ C[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]); 643 ror @C[4],@C[7],#32-20 644 645 eor @E[0],@E[0],@C[8] 646 ldr.l @C[8],[sp,#$A[4][1]] @ A[4][1] 647 eor @E[1],@E[1],@C[9] 648 ldr.h @C[9],[sp,#$A[4][1]+4] 649 ror @C[7],@E[0],#32-20 @ C[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]); 650 ror @C[6],@E[1],#32-21 651 652 eor @C[8],@C[8],@E[2] 653 eor @C[9],@C[9],@E[3] 654 @ ror @C[8],@C[2],#32-1 @ C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]); 655 @ ror @C[9],@C[3],#32-1 656 657 bic @E[0],@C[4],@C[2] 658 bic @E[1],@C[5],@C[3] 659 eor @E[0],@E[0],@C[0],ror#32-31 660 str.l @E[0],[sp,#$R[4][0]] @ R[4][0] = C[0] ^ (~C[1] & C[2]) 661 eor @E[1],@E[1],@C[1],ror#32-31 662 str.h @E[1],[sp,#$R[4][0]+4] 663 bic @E[2],@C[6],@C[4] 664 bic @E[3],@C[7],@C[5] 665 eor @E[2],@E[2],@C[2] 666 eor @E[3],@E[3],@C[3] 667 str.l @E[2],[sp,#$R[4][1]] @ R[4][1] = C[1] ^ (~C[2] & C[3]); 668 bic @E[0],@C[8],@C[6],ror#1 669 str.h @E[3],[sp,#$R[4][1]+4] 670 bic @E[1],@C[9],@C[7],ror#1 671 bic @E[2],@C[0],@C[8],ror#31-1 672 bic @E[3],@C[1],@C[9],ror#31-1 673 eor @C[4],@C[4],@E[0],ror#32-1 674 str.l @C[4],[sp,#$R[4][2]] @ R[4][2] = C[2] ^= (~C[3] & C[4]); 675 eor @C[5],@C[5],@E[1],ror#32-1 676 str.h @C[5],[sp,#$R[4][2]+4] 677 eor @C[6],@C[6],@E[2],ror#32-31 678 eor @C[7],@C[7],@E[3],ror#32-31 679 str.l @C[6],[sp,#$R[4][3]] @ R[4][3] = C[3] ^= (~C[4] & C[0]); 680 bic @E[0],@C[2],@C[0],ror#32-31 681 str.h @C[7],[sp,#$R[4][3]+4] 682 bic @E[1],@C[3],@C[1],ror#32-31 683 add @E[2],sp,#$R[0][0] 684 eor @C[8],@E[0],@C[8],ror#32-1 685 add @E[0],sp,#$R[1][0] 686 eor @C[9],@E[1],@C[9],ror#32-1 687 str.l @C[8],[sp,#$R[4][4]] @ R[4][4] = C[4] ^= (~C[0] & C[1]); 688 str.h 
@C[9],[sp,#$R[4][4]+4] 689___ 690} 691 Round(@A,@T); 692 Round(@T,@A); 693$code.=<<___; 694 blo .Lround2x 695 696#if __ARM_ARCH__>=5 697 ldr pc,[sp,#440] 698#else 699 ldr lr,[sp,#440] 700 tst lr,#1 701 moveq pc,lr @ be binary compatible with V4, yet 702 bx lr @ interoperable with Thumb ISA:-) 703#endif 704.size KeccakF1600_int,.-KeccakF1600_int 705 706.type KeccakF1600, %function 707.align 5 708KeccakF1600: 709 stmdb sp!,{r0,r4-r11,lr} 710 sub sp,sp,#440+16 @ space for A[5][5],D[5],T[5][5],... 711 712 add @E[0],r0,#$A[1][0] 713 add @E[1],sp,#$A[1][0] 714 ldmia r0, {@C[0]-@C[9]} @ copy A[5][5] to stack 715 stmia sp, {@C[0]-@C[9]} 716 ldmia @E[0]!,{@C[0]-@C[9]} 717 stmia @E[1]!,{@C[0]-@C[9]} 718 ldmia @E[0]!,{@C[0]-@C[9]} 719 stmia @E[1]!,{@C[0]-@C[9]} 720 ldmia @E[0]!,{@C[0]-@C[9]} 721 stmia @E[1]!,{@C[0]-@C[9]} 722 ldmia @E[0], {@C[0]-@C[9]} 723 add @E[2],sp,#$A[0][0] 724 add @E[0],sp,#$A[1][0] 725 stmia @E[1], {@C[0]-@C[9]} 726 727 bl KeccakF1600_enter 728 729 ldr @E[1], [sp,#440+16] @ restore pointer to A 730 ldmia sp, {@C[0]-@C[9]} 731 stmia @E[1]!,{@C[0]-@C[9]} @ return A[5][5] 732 ldmia @E[0]!,{@C[0]-@C[9]} 733 stmia @E[1]!,{@C[0]-@C[9]} 734 ldmia @E[0]!,{@C[0]-@C[9]} 735 stmia @E[1]!,{@C[0]-@C[9]} 736 ldmia @E[0]!,{@C[0]-@C[9]} 737 stmia @E[1]!,{@C[0]-@C[9]} 738 ldmia @E[0], {@C[0]-@C[9]} 739 stmia @E[1], {@C[0]-@C[9]} 740 741 add sp,sp,#440+20 742#if __ARM_ARCH__>=5 743 ldmia sp!,{r4-r11,pc} 744#else 745 ldmia sp!,{r4-r11,lr} 746 tst lr,#1 747 moveq pc,lr @ be binary compatible with V4, yet 748 bx lr @ interoperable with Thumb ISA:-) 749#endif 750.size KeccakF1600,.-KeccakF1600 751___ 752{ my ($A_flat,$inp,$len,$bsz) = map("r$_",(10..12,14)); 753 754######################################################################## 755# Stack layout 756# ----->+-----------------------+ 757# | uint64_t A[5][5] | 758# | ... | 759# | ... 
| 760# +456->+-----------------------+ 761# | 0x55555555 | 762# +460->+-----------------------+ 763# | 0x33333333 | 764# +464->+-----------------------+ 765# | 0x0f0f0f0f | 766# +468->+-----------------------+ 767# | 0x00ff00ff | 768# +472->+-----------------------+ 769# | uint64_t *A | 770# +476->+-----------------------+ 771# | const void *inp | 772# +480->+-----------------------+ 773# | size_t len | 774# +484->+-----------------------+ 775# | size_t bs | 776# +488->+-----------------------+ 777# | .... 778 779$code.=<<___; 780.global SHA3_absorb 781.type SHA3_absorb,%function 782.align 5 783SHA3_absorb: 784 stmdb sp!,{r0-r12,lr} 785 sub sp,sp,#456+16 786 787 add $A_flat,r0,#$A[1][0] 788 @ mov $inp,r1 789 mov $len,r2 790 mov $bsz,r3 791 cmp r2,r3 792 blo .Labsorb_abort 793 794 add $inp,sp,#0 795 ldmia r0, {@C[0]-@C[9]} @ copy A[5][5] to stack 796 stmia $inp!, {@C[0]-@C[9]} 797 ldmia $A_flat!,{@C[0]-@C[9]} 798 stmia $inp!, {@C[0]-@C[9]} 799 ldmia $A_flat!,{@C[0]-@C[9]} 800 stmia $inp!, {@C[0]-@C[9]} 801 ldmia $A_flat!,{@C[0]-@C[9]} 802 stmia $inp!, {@C[0]-@C[9]} 803 ldmia $A_flat!,{@C[0]-@C[9]} 804 stmia $inp, {@C[0]-@C[9]} 805 806 ldr $inp,[sp,#476] @ restore $inp 807#ifdef __thumb2__ 808 mov r9,#0x00ff00ff 809 mov r8,#0x0f0f0f0f 810 mov r7,#0x33333333 811 mov r6,#0x55555555 812#else 813 mov r6,#0x11 @ compose constants 814 mov r8,#0x0f 815 mov r9,#0xff 816 orr r6,r6,r6,lsl#8 817 orr r8,r8,r8,lsl#8 818 orr r6,r6,r6,lsl#16 @ 0x11111111 819 orr r9,r9,r9,lsl#16 @ 0x00ff00ff 820 orr r8,r8,r8,lsl#16 @ 0x0f0f0f0f 821 orr r7,r6,r6,lsl#1 @ 0x33333333 822 orr r6,r6,r6,lsl#2 @ 0x55555555 823#endif 824 str r9,[sp,#468] 825 str r8,[sp,#464] 826 str r7,[sp,#460] 827 str r6,[sp,#456] 828 b .Loop_absorb 829 830.align 4 831.Loop_absorb: 832 subs r0,$len,$bsz 833 blo .Labsorbed 834 add $A_flat,sp,#0 835 str r0,[sp,#480] @ save len - bsz 836 837.align 4 838.Loop_block: 839 ldrb r0,[$inp],#1 840 ldrb r1,[$inp],#1 841 ldrb r2,[$inp],#1 842 ldrb r3,[$inp],#1 843 ldrb r4,[$inp],#1 
844 orr r0,r0,r1,lsl#8 845 ldrb r1,[$inp],#1 846 orr r0,r0,r2,lsl#16 847 ldrb r2,[$inp],#1 848 orr r0,r0,r3,lsl#24 @ lo 849 ldrb r3,[$inp],#1 850 orr r1,r4,r1,lsl#8 851 orr r1,r1,r2,lsl#16 852 orr r1,r1,r3,lsl#24 @ hi 853 854 and r2,r0,r6 @ &=0x55555555 855 and r0,r0,r6,lsl#1 @ &=0xaaaaaaaa 856 and r3,r1,r6 @ &=0x55555555 857 and r1,r1,r6,lsl#1 @ &=0xaaaaaaaa 858 orr r2,r2,r2,lsr#1 859 orr r0,r0,r0,lsl#1 860 orr r3,r3,r3,lsr#1 861 orr r1,r1,r1,lsl#1 862 and r2,r2,r7 @ &=0x33333333 863 and r0,r0,r7,lsl#2 @ &=0xcccccccc 864 and r3,r3,r7 @ &=0x33333333 865 and r1,r1,r7,lsl#2 @ &=0xcccccccc 866 orr r2,r2,r2,lsr#2 867 orr r0,r0,r0,lsl#2 868 orr r3,r3,r3,lsr#2 869 orr r1,r1,r1,lsl#2 870 and r2,r2,r8 @ &=0x0f0f0f0f 871 and r0,r0,r8,lsl#4 @ &=0xf0f0f0f0 872 and r3,r3,r8 @ &=0x0f0f0f0f 873 and r1,r1,r8,lsl#4 @ &=0xf0f0f0f0 874 ldmia $A_flat,{r4-r5} @ A_flat[i] 875 orr r2,r2,r2,lsr#4 876 orr r0,r0,r0,lsl#4 877 orr r3,r3,r3,lsr#4 878 orr r1,r1,r1,lsl#4 879 and r2,r2,r9 @ &=0x00ff00ff 880 and r0,r0,r9,lsl#8 @ &=0xff00ff00 881 and r3,r3,r9 @ &=0x00ff00ff 882 and r1,r1,r9,lsl#8 @ &=0xff00ff00 883 orr r2,r2,r2,lsr#8 884 orr r0,r0,r0,lsl#8 885 orr r3,r3,r3,lsr#8 886 orr r1,r1,r1,lsl#8 887 888 lsl r2,r2,#16 889 lsr r1,r1,#16 890 eor r4,r4,r3,lsl#16 891 eor r5,r5,r0,lsr#16 892 eor r4,r4,r2,lsr#16 893 eor r5,r5,r1,lsl#16 894 stmia $A_flat!,{r4-r5} @ A_flat[i++] ^= BitInterleave(inp[0..7]) 895 896 subs $bsz,$bsz,#8 897 bhi .Loop_block 898 899 str $inp,[sp,#476] 900 901 bl KeccakF1600_int 902 903 add r14,sp,#456 904 ldmia r14,{r6-r12,r14} @ restore constants and variables 905 b .Loop_absorb 906 907.align 4 908.Labsorbed: 909 add $inp,sp,#$A[1][0] 910 ldmia sp, {@C[0]-@C[9]} 911 stmia $A_flat!,{@C[0]-@C[9]} @ return A[5][5] 912 ldmia $inp!, {@C[0]-@C[9]} 913 stmia $A_flat!,{@C[0]-@C[9]} 914 ldmia $inp!, {@C[0]-@C[9]} 915 stmia $A_flat!,{@C[0]-@C[9]} 916 ldmia $inp!, {@C[0]-@C[9]} 917 stmia $A_flat!,{@C[0]-@C[9]} 918 ldmia $inp, {@C[0]-@C[9]} 919 stmia $A_flat, {@C[0]-@C[9]} 920 
921.Labsorb_abort: 922 add sp,sp,#456+32 923 mov r0,$len @ return value 924#if __ARM_ARCH__>=5 925 ldmia sp!,{r4-r12,pc} 926#else 927 ldmia sp!,{r4-r12,lr} 928 tst lr,#1 929 moveq pc,lr @ be binary compatible with V4, yet 930 bx lr @ interoperable with Thumb ISA:-) 931#endif 932.size SHA3_absorb,.-SHA3_absorb 933___ 934} 935 936{ my ($out,$len,$A_flat,$bsz,$next) = map("r$_", (4,5,10,12,0)); 937 938 939# void SHA3_squeeze(uint64_t A[5][5], 940# unsigned char *out, size_t len, size_t r, int next) 941# 942# The first 4 parameters are passed in via r0..r3, 943# next is passed on the stack [sp, #0] 944 945$code.=<<___; 946.global SHA3_squeeze 947.type SHA3_squeeze,%function 948.align 5 949SHA3_squeeze: 950 stmdb sp!,{r0,r3-r10,lr} 951 952 mov $A_flat,r0 953 mov $out,r1 954 mov $len,r2 955 mov $bsz,r3 956 ldr $next, [sp, #40] @ next is after the 10 pushed registers (10*4) 957 958#ifdef __thumb2__ 959 mov r9,#0x00ff00ff 960 mov r8,#0x0f0f0f0f 961 mov r7,#0x33333333 962 mov r6,#0x55555555 963#else 964 mov r6,#0x11 @ compose constants 965 mov r8,#0x0f 966 mov r9,#0xff 967 orr r6,r6,r6,lsl#8 968 orr r8,r8,r8,lsl#8 969 orr r6,r6,r6,lsl#16 @ 0x11111111 970 orr r9,r9,r9,lsl#16 @ 0x00ff00ff 971 orr r8,r8,r8,lsl#16 @ 0x0f0f0f0f 972 orr r7,r6,r6,lsl#1 @ 0x33333333 973 orr r6,r6,r6,lsl#2 @ 0x55555555 974#endif 975 stmdb sp!,{r6-r9} 976 977 mov r14,$A_flat 978 cmp $next, #1 979 beq .Lnext_block 980 b .Loop_squeeze 981 982.align 4 983.Loop_squeeze: 984 ldmia $A_flat!,{r0,r1} @ A_flat[i++] 985 986 lsl r2,r0,#16 987 lsl r3,r1,#16 @ r3 = r1 << 16 988 lsr r2,r2,#16 @ r2 = r0 & 0x0000ffff 989 lsr r1,r1,#16 990 lsr r0,r0,#16 @ r0 = r0 >> 16 991 lsl r1,r1,#16 @ r1 = r1 & 0xffff0000 992 993 orr r2,r2,r2,lsl#8 994 orr r3,r3,r3,lsr#8 995 orr r0,r0,r0,lsl#8 996 orr r1,r1,r1,lsr#8 997 and r2,r2,r9 @ &=0x00ff00ff 998 and r3,r3,r9,lsl#8 @ &=0xff00ff00 999 and r0,r0,r9 @ &=0x00ff00ff 1000 and r1,r1,r9,lsl#8 @ &=0xff00ff00 1001 orr r2,r2,r2,lsl#4 1002 orr r3,r3,r3,lsr#4 1003 orr r0,r0,r0,lsl#4 1004 
	orr	r1,r1,r1,lsr#4
	and	r2,r2,r8		@ &=0x0f0f0f0f
	and	r3,r3,r8,lsl#4		@ &=0xf0f0f0f0
	and	r0,r0,r8		@ &=0x0f0f0f0f
	and	r1,r1,r8,lsl#4		@ &=0xf0f0f0f0
	orr	r2,r2,r2,lsl#2
	orr	r3,r3,r3,lsr#2
	orr	r0,r0,r0,lsl#2
	orr	r1,r1,r1,lsr#2
	and	r2,r2,r7		@ &=0x33333333
	and	r3,r3,r7,lsl#2		@ &=0xcccccccc
	and	r0,r0,r7		@ &=0x33333333
	and	r1,r1,r7,lsl#2		@ &=0xcccccccc
	orr	r2,r2,r2,lsl#1
	orr	r3,r3,r3,lsr#1
	orr	r0,r0,r0,lsl#1
	orr	r1,r1,r1,lsr#1
	and	r2,r2,r6		@ &=0x55555555
	and	r3,r3,r6,lsl#1		@ &=0xaaaaaaaa
	and	r0,r0,r6		@ &=0x55555555
	and	r1,r1,r6,lsl#1		@ &=0xaaaaaaaa

	orr	r2,r2,r3		@ r2 = low word of de-interleaved lane
	orr	r0,r0,r1		@ r0 = high word of de-interleaved lane

	cmp	$len,#8
	blo	.Lsqueeze_tail		@ fewer than 8 output bytes wanted
	@ full lane: store 8 bytes, a byte at a time for endian neutrality
	lsr	r1,r2,#8
	strb	r2,[$out],#1
	lsr	r3,r2,#16
	strb	r1,[$out],#1
	lsr	r2,r2,#24
	strb	r3,[$out],#1
	strb	r2,[$out],#1

	lsr	r1,r0,#8
	strb	r0,[$out],#1
	lsr	r3,r0,#16
	strb	r1,[$out],#1
	lsr	r0,r0,#24
	strb	r3,[$out],#1
	strb	r0,[$out],#1
	subs	$len,$len,#8
	beq	.Lsqueeze_done

	subs	$bsz,$bsz,#8		@ bsz -= 8
	bhi	.Loop_squeeze
.Lnext_block:
	@ block exhausted but more output wanted: permute again
	mov	r0,r14			@ original $A_flat

	bl	KeccakF1600

	ldmia	sp,{r6-r10,r12}		@ restore constants and variables
	mov	r14,$A_flat
	b	.Loop_squeeze

.align	4
.Lsqueeze_tail:
	@ store the remaining 1..7 bytes one byte at a time
	strb	r2,[$out],#1
	lsr	r2,r2,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	r2,[$out],#1
	lsr	r2,r2,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	r2,[$out],#1
	lsr	r2,r2,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	r2,[$out],#1
	subs	$len,$len,#1
	beq	.Lsqueeze_done

	strb	r0,[$out],#1
	lsr	r0,r0,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	r0,[$out],#1
	lsr	r0,r0,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	r0,[$out]
	b	.Lsqueeze_done

.align	4
.Lsqueeze_done:
	add	sp,sp,#24
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r10,pc}
#else
	ldmia	sp!,{r4-r10,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	SHA3_squeeze,.-SHA3_squeeze
___
}

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.fpu	neon

@ the 24 Keccak-f[1600] round constants
.type	iotas64, %object
.align 5
iotas64:
	.quad	0x0000000000000001
	.quad	0x0000000000008082
	.quad	0x800000000000808a
	.quad	0x8000000080008000
	.quad	0x000000000000808b
	.quad	0x0000000080000001
	.quad	0x8000000080008081
	.quad	0x8000000000008009
	.quad	0x000000000000008a
	.quad	0x0000000000000088
	.quad	0x0000000080008009
	.quad	0x000000008000000a
	.quad	0x000000008000808b
	.quad	0x800000000000008b
	.quad	0x8000000000008089
	.quad	0x8000000000008003
	.quad	0x8000000000008002
	.quad	0x8000000000000080
	.quad	0x000000000000800a
	.quad	0x800000008000000a
	.quad	0x8000000080008081
	.quad	0x8000000000008080
	.quad	0x0000000080000001
	.quad	0x8000000080008008
.size	iotas64,.-iotas64

@ Operates on the state held live in d0-d24 (register layout per the
@ per-instruction comments below); r0 points at the state buffer, the
@ first bytes of which are reused as spill space while the state sits
@ in registers. Clobbers r1-r3 and q13-q15.
.type	KeccakF1600_neon, %function
.align	5
KeccakF1600_neon:
	add	r1, r0, #16
	adr	r2, iotas64
	mov	r3, #24			@ loop counter
	b	.Loop_neon

.align	4
.Loop_neon:
	@ Theta
	vst1.64	{q4},  [r0,:64]		@ offload A[0..1][4]
	veor	q13, q0,  q5		@ A[0..1][0]^A[2..3][0]
	vst1.64	{d18}, [r1,:64]		@ offload A[2][4]
	veor	q14, q1,  q6		@ A[0..1][1]^A[2..3][1]
	veor	q15, q2,  q7		@ A[0..1][2]^A[2..3][2]
	veor	d26, d26, d27		@ C[0]=A[0][0]^A[1][0]^A[2][0]^A[3][0]
	veor	d27, d28, d29		@ C[1]=A[0][1]^A[1][1]^A[2][1]^A[3][1]
	veor	q14, q3,  q8		@ A[0..1][3]^A[2..3][3]
	veor	q4,  q4,  q9		@ A[0..1][4]^A[2..3][4]
	veor	d30, d30, d31		@ C[2]=A[0][2]^A[1][2]^A[2][2]^A[3][2]
	veor	d31, d28, d29		@ C[3]=A[0][3]^A[1][3]^A[2][3]^A[3][3]
	veor	d25, d8,  d9		@ C[4]=A[0][4]^A[1][4]^A[2][4]^A[3][4]
	veor	q13, q13, q10		@ C[0..1]^=A[4][0..1]
	veor	q14, q15, q11		@ C[2..3]^=A[4][2..3]
	veor	d25, d25, d24		@ C[4]^=A[4][4]

	vadd.u64	q4,  q13, q13	@ C[0..1]<<1
	vadd.u64	q15, q14, q14	@ C[2..3]<<1
	vadd.u64	d18, d25, d25	@ C[4]<<1
	vsri.u64	q4,  q13, #63	@ ROL64(C[0..1],1)
	vsri.u64	q15, q14, #63	@ ROL64(C[2..3],1)
	vsri.u64	d18, d25, #63	@ ROL64(C[4],1)
	veor	d25, d25, d9		@ D[0] = C[4] ^= ROL64(C[1],1)
	veor	q13, q13, q15		@ D[1..2] = C[0..1] ^ ROL64(C[2..3],1)
	veor	d28, d28, d18		@ D[3] = C[2] ^= ROL64(C[4],1)
	veor	d29, d29, d8		@ D[4] = C[3] ^= ROL64(C[0],1)

	veor	d0,  d0,  d25		@ A[0][0] ^= C[4]
	veor	d1,  d1,  d25		@ A[1][0] ^= C[4]
	veor	d10, d10, d25		@ A[2][0] ^= C[4]
	veor	d11, d11, d25		@ A[3][0] ^= C[4]
	veor	d20, d20, d25		@ A[4][0] ^= C[4]

	veor	d2,  d2,  d26		@ A[0][1] ^= D[1]
	veor	d3,  d3,  d26		@ A[1][1] ^= D[1]
	veor	d12, d12, d26		@ A[2][1] ^= D[1]
	veor	d13, d13, d26		@ A[3][1] ^= D[1]
	veor	d21, d21, d26		@ A[4][1] ^= D[1]
	vmov	d26, d27

	veor	d6,  d6,  d28		@ A[0][3] ^= C[2]
	veor	d7,  d7,  d28		@ A[1][3] ^= C[2]
	veor	d16, d16, d28		@ A[2][3] ^= C[2]
	veor	d17, d17, d28		@ A[3][3] ^= C[2]
	veor	d23, d23, d28		@ A[4][3] ^= C[2]
	vld1.64	{q4},  [r0,:64]		@ restore A[0..1][4]
	vmov	d28, d29

	vld1.64	{d18}, [r1,:64]		@ restore A[2][4]
	veor	q2,  q2,  q13		@ A[0..1][2] ^= D[2]
	veor	q7,  q7,  q13		@ A[2..3][2] ^= D[2]
	veor	d22, d22, d27		@ A[4][2] ^= D[2]

	veor	q4,  q4,  q14		@ A[0..1][4] ^= C[3]
	veor	q9,  q9,  q14		@ A[2..3][4] ^= C[3]
	veor	d24, d24, d29		@ A[4][4] ^= C[3]

	@ Rho + Pi
	vmov	d26, d2			@ C[1] = A[0][1]
	vshl.u64	d2,  d3,  #44
	vmov	d27, d4			@ C[2] = A[0][2]
	vshl.u64	d4,  d14, #43
	vmov	d28, d6			@ C[3] = A[0][3]
	vshl.u64	d6,  d17, #21
	vmov	d29, d8			@ C[4] = A[0][4]
	vshl.u64	d8,  d24, #14
	vsri.u64	d2,  d3,  #64-44	@ A[0][1] = ROL64(A[1][1], rhotates[1][1])
	vsri.u64	d4,  d14, #64-43	@ A[0][2] = ROL64(A[2][2], rhotates[2][2])
	vsri.u64	d6,  d17, #64-21	@ A[0][3] = ROL64(A[3][3], rhotates[3][3])
	vsri.u64	d8,  d24, #64-14	@ A[0][4] = ROL64(A[4][4], rhotates[4][4])

	vshl.u64	d3,  d9,  #20
	vshl.u64	d14, d16, #25
	vshl.u64	d17, d15, #15
	vshl.u64	d24, d21, #2
	vsri.u64	d3,  d9,  #64-20	@ A[1][1] = ROL64(A[1][4], rhotates[1][4])
	vsri.u64	d14, d16, #64-25	@ A[2][2] = ROL64(A[2][3], rhotates[2][3])
	vsri.u64	d17, d15, #64-15	@ A[3][3] = ROL64(A[3][2], rhotates[3][2])
	vsri.u64	d24, d21, #64-2		@ A[4][4] = ROL64(A[4][1], rhotates[4][1])

	vshl.u64	d9,  d22, #61
	@ vshl.u64	d16, d19, #8		@ rotation by 8 done with vext.8 below
	vshl.u64	d15, d12, #10
	vshl.u64	d21, d7,  #55
	vsri.u64	d9,  d22, #64-61	@ A[1][4] = ROL64(A[4][2], rhotates[4][2])
	vext.8		d16, d19, d19, #8-1	@ A[2][3] = ROL64(A[3][4], rhotates[3][4])
	vsri.u64	d15, d12, #64-10	@ A[3][2] = ROL64(A[2][1], rhotates[2][1])
	vsri.u64	d21, d7,  #64-55	@ A[4][1] = ROL64(A[1][3], rhotates[1][3])

	vshl.u64	d22, d18, #39
	@ vshl.u64	d19, d23, #56		@ rotation by 56 done with vext.8 below
	vshl.u64	d12, d5,  #6
	vshl.u64	d7,  d13, #45
	vsri.u64	d22, d18, #64-39	@ A[4][2] = ROL64(A[2][4], rhotates[2][4])
	vext.8		d19, d23, d23, #8-7	@ A[3][4] = ROL64(A[4][3], rhotates[4][3])
	vsri.u64	d12, d5,  #64-6		@ A[2][1] = ROL64(A[1][2], rhotates[1][2])
	vsri.u64	d7,  d13, #64-45	@ A[1][3] = ROL64(A[3][1], rhotates[3][1])

	vshl.u64	d18, d20, #18
	vshl.u64	d23, d11, #41
	vshl.u64	d5,  d10, #3
	vshl.u64	d13, d1,  #36
	vsri.u64	d18, d20, #64-18	@ A[2][4] = ROL64(A[4][0], rhotates[4][0])
	vsri.u64	d23, d11, #64-41	@ A[4][3] = ROL64(A[3][0], rhotates[3][0])
	vsri.u64	d5,  d10, #64-3		@ A[1][2] = ROL64(A[2][0], rhotates[2][0])
	vsri.u64	d13, d1,  #64-36	@ A[3][1] = ROL64(A[1][0], rhotates[1][0])

	vshl.u64	d1,  d28, #28
	vshl.u64	d10, d26, #1
	vshl.u64	d11, d29, #27
	vshl.u64	d20, d27, #62
	vsri.u64	d1,  d28, #64-28	@ A[1][0] = ROL64(C[3], rhotates[0][3])
	vsri.u64	d10, d26, #64-1		@ A[2][0] = ROL64(C[1], rhotates[0][1])
	vsri.u64	d11, d29, #64-27	@ A[3][0] = ROL64(C[4], rhotates[0][4])
	vsri.u64	d20, d27, #64-62	@ A[4][0] = ROL64(C[2], rhotates[0][2])

	@ Chi + Iota
	vbic	q13, q2,  q1
	vbic	q14, q3,  q2
	vbic	q15, q4,  q3
	veor	q13, q13, q0		@ A[0..1][0] ^ (~A[0..1][1] & A[0..1][2])
	veor	q14, q14, q1		@ A[0..1][1] ^ (~A[0..1][2] & A[0..1][3])
	veor	q2,  q2,  q15		@ A[0..1][2] ^= (~A[0..1][3] & A[0..1][4])
	vst1.64	{q13}, [r0,:64]		@ offload A[0..1][0]
	vbic	q13, q0,  q4
	vbic	q15, q1,  q0
	vmov	q1,  q14		@ A[0..1][1]
	veor	q3,  q3,  q13		@ A[0..1][3] ^= (~A[0..1][4] & A[0..1][0])
	veor	q4,  q4,  q15		@ A[0..1][4] ^= (~A[0..1][0] & A[0..1][1])

	vbic	q13, q7,  q6
	vmov	q0,  q5			@ A[2..3][0]
	vbic	q14, q8,  q7
	vmov	q15, q6			@ A[2..3][1]
	veor	q5,  q5,  q13		@ A[2..3][0] ^= (~A[2..3][1] & A[2..3][2])
	vbic	q13, q9,  q8
	veor	q6,  q6,  q14		@ A[2..3][1] ^= (~A[2..3][2] & A[2..3][3])
	vbic	q14, q0,  q9
	veor	q7,  q7,  q13		@ A[2..3][2] ^= (~A[2..3][3] & A[2..3][4])
	vbic	q13, q15, q0
	veor	q8,  q8,  q14		@ A[2..3][3] ^= (~A[2..3][4] & A[2..3][0])
	vmov	q14, q10		@ A[4][0..1]
	veor	q9,  q9,  q13		@ A[2..3][4] ^= (~A[2..3][0] & A[2..3][1])

	vld1.64	d25, [r2,:64]!		@ Iota[i++]
	vbic	d26, d22, d21
	vbic	d27, d23, d22
	vld1.64	{q0}, [r0,:64]		@ restore A[0..1][0]
	veor	d20, d20, d26		@ A[4][0] ^= (~A[4][1] & A[4][2])
	vbic	d26, d24, d23
	veor	d21, d21, d27		@ A[4][1] ^= (~A[4][2] & A[4][3])
	vbic	d27, d28, d24
	veor	d22, d22, d26		@ A[4][2] ^= (~A[4][3] & A[4][4])
	vbic	d26, d29, d28
	veor	d23, d23, d27		@ A[4][3] ^= (~A[4][4] & A[4][0])
	veor	d0,  d0,  d25		@ A[0][0] ^= Iota[i]
	veor	d24, d24, d26		@ A[4][4] ^= (~A[4][0] & A[4][1])

	subs	r3, r3, #1
	bne	.Loop_neon

	ret
.size	KeccakF1600_neon,.-KeccakF1600_neon

.global	SHA3_absorb_neon
.type	SHA3_absorb_neon, %function
.align	5
SHA3_absorb_neon:
	stmdb	sp!, {r4-r6,lr}
	vstmdb	sp!, {d8-d15}

	mov	r4, r1			@ inp
	mov	r5, r2			@ len
	mov	r6, r3			@ bsz

	@ load the whole state into d0-d24, layout as in KeccakF1600_neon
	vld1.32	{d0}, [r0,:64]!		@ A[0][0]
	vld1.32	{d2}, [r0,:64]!		@ A[0][1]
	vld1.32	{d4}, [r0,:64]!		@ A[0][2]
	vld1.32	{d6}, [r0,:64]!		@ A[0][3]
	vld1.32	{d8}, [r0,:64]!		@ A[0][4]

	vld1.32	{d1}, [r0,:64]!		@ A[1][0]
	vld1.32	{d3}, [r0,:64]!		@ A[1][1]
	vld1.32	{d5}, [r0,:64]!		@ A[1][2]
	vld1.32	{d7}, [r0,:64]!		@ A[1][3]
	vld1.32	{d9}, [r0,:64]!		@ A[1][4]

	vld1.32	{d10}, [r0,:64]!	@ A[2][0]
	vld1.32	{d12}, [r0,:64]!	@ A[2][1]
	vld1.32	{d14}, [r0,:64]!	@ A[2][2]
	vld1.32	{d16}, [r0,:64]!	@ A[2][3]
	vld1.32	{d18}, [r0,:64]!	@ A[2][4]

	vld1.32	{d11}, [r0,:64]!	@ A[3][0]
	vld1.32	{d13}, [r0,:64]!	@ A[3][1]
	vld1.32	{d15}, [r0,:64]!	@ A[3][2]
	vld1.32	{d17}, [r0,:64]!	@ A[3][3]
	vld1.32	{d19}, [r0,:64]!	@ A[3][4]

	vld1.32	{d20-d23}, [r0,:64]!	@ A[4][0..3]
	vld1.32	{d24}, [r0,:64]		@ A[4][4]
	sub	r0, r0, #24*8		@ rewind
	b	.Loop_absorb_neon

.align	4
.Loop_absorb_neon:
	subs	r12, r5, r6		@ len - bsz
	blo	.Labsorbed_neon
	mov	r5, r12

	vld1.8	{d31}, [r4]!		@ endian-neutral loads...
	@ ...XOR input into the state one 8-byte lane at a time, in lane
	@ order; cmp against running multiples of 8 so absorption stops
	@ after exactly bsz bytes (blo: below last pair, beq: exactly at it)
	cmp	r6, #8*2
	veor	d0, d0, d31		@ A[0][0] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d2, d2, d31		@ A[0][1] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*4
	veor	d4, d4, d31		@ A[0][2] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d6, d6, d31		@ A[0][3] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31},[r4]!
	cmp	r6, #8*6
	veor	d8, d8, d31		@ A[0][4] ^= *inp++
	blo	.Lprocess_neon

	vld1.8	{d31}, [r4]!
	veor	d1, d1, d31		@ A[1][0] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*8
	veor	d3, d3, d31		@ A[1][1] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d5, d5, d31		@ A[1][2] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*10
	veor	d7, d7, d31		@ A[1][3] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d9, d9, d31		@ A[1][4] ^= *inp++
	beq	.Lprocess_neon

	vld1.8	{d31}, [r4]!
	cmp	r6, #8*12
	veor	d10, d10, d31		@ A[2][0] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d12, d12, d31		@ A[2][1] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*14
	veor	d14, d14, d31		@ A[2][2] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d16, d16, d31		@ A[2][3] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*16
	veor	d18, d18, d31		@ A[2][4] ^= *inp++
	blo	.Lprocess_neon

	vld1.8	{d31}, [r4]!
	veor	d11, d11, d31		@ A[3][0] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*18
	veor	d13, d13, d31		@ A[3][1] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d15, d15, d31		@ A[3][2] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*20
	veor	d17, d17, d31		@ A[3][3] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
1429 veor d19, d19, d31 @ A[3][4] ^= *inp++ 1430 beq .Lprocess_neon 1431 1432 vld1.8 {d31}, [r4]! 1433 cmp r6, #8*22 1434 veor d20, d20, d31 @ A[4][0] ^= *inp++ 1435 blo .Lprocess_neon 1436 vld1.8 {d31}, [r4]! 1437 veor d21, d21, d31 @ A[4][1] ^= *inp++ 1438 beq .Lprocess_neon 1439 vld1.8 {d31}, [r4]! 1440 cmp r6, #8*24 1441 veor d22, d22, d31 @ A[4][2] ^= *inp++ 1442 blo .Lprocess_neon 1443 vld1.8 {d31}, [r4]! 1444 veor d23, d23, d31 @ A[4][3] ^= *inp++ 1445 beq .Lprocess_neon 1446 vld1.8 {d31}, [r4]! 1447 veor d24, d24, d31 @ A[4][4] ^= *inp++ 1448 1449.Lprocess_neon: 1450 bl KeccakF1600_neon 1451 b .Loop_absorb_neon 1452 1453.align 4 1454.Labsorbed_neon: 1455 vst1.32 {d0}, [r0,:64]! @ A[0][0..4] 1456 vst1.32 {d2}, [r0,:64]! 1457 vst1.32 {d4}, [r0,:64]! 1458 vst1.32 {d6}, [r0,:64]! 1459 vst1.32 {d8}, [r0,:64]! 1460 1461 vst1.32 {d1}, [r0,:64]! @ A[1][0..4] 1462 vst1.32 {d3}, [r0,:64]! 1463 vst1.32 {d5}, [r0,:64]! 1464 vst1.32 {d7}, [r0,:64]! 1465 vst1.32 {d9}, [r0,:64]! 1466 1467 vst1.32 {d10}, [r0,:64]! @ A[2][0..4] 1468 vst1.32 {d12}, [r0,:64]! 1469 vst1.32 {d14}, [r0,:64]! 1470 vst1.32 {d16}, [r0,:64]! 1471 vst1.32 {d18}, [r0,:64]! 1472 1473 vst1.32 {d11}, [r0,:64]! @ A[3][0..4] 1474 vst1.32 {d13}, [r0,:64]! 1475 vst1.32 {d15}, [r0,:64]! 1476 vst1.32 {d17}, [r0,:64]! 1477 vst1.32 {d19}, [r0,:64]! 1478 1479 vst1.32 {d20-d23}, [r0,:64]! @ A[4][0..4] 1480 vst1.32 {d24}, [r0,:64] 1481 1482 mov r0, r5 @ return value 1483 vldmia sp!, {d8-d15} 1484 ldmia sp!, {r4-r6,pc} 1485.size SHA3_absorb_neon,.-SHA3_absorb_neon 1486 1487.global SHA3_squeeze_neon 1488.type SHA3_squeeze_neon, %function 1489.align 5 1490SHA3_squeeze_neon: 1491 stmdb sp!, {r4-r6,lr} 1492 1493 mov r4, r1 @ out 1494 mov r5, r2 @ len 1495 mov r6, r3 @ bsz 1496 mov r12, r0 @ A_flat 1497 mov r14, r3 @ bsz 1498 b .Loop_squeeze_neon 1499 1500.align 4 1501.Loop_squeeze_neon: 1502 cmp r5, #8 1503 blo .Lsqueeze_neon_tail 1504 vld1.32 {d0}, [r12]! 1505 vst1.8 {d0}, [r4]! 
@ endian-neutral store 1506 1507 subs r5, r5, #8 @ len -= 8 1508 beq .Lsqueeze_neon_done 1509 1510 subs r14, r14, #8 @ bsz -= 8 1511 bhi .Loop_squeeze_neon 1512 1513 vstmdb sp!, {d8-d15} 1514 1515 vld1.32 {d0}, [r0,:64]! @ A[0][0..4] 1516 vld1.32 {d2}, [r0,:64]! 1517 vld1.32 {d4}, [r0,:64]! 1518 vld1.32 {d6}, [r0,:64]! 1519 vld1.32 {d8}, [r0,:64]! 1520 1521 vld1.32 {d1}, [r0,:64]! @ A[1][0..4] 1522 vld1.32 {d3}, [r0,:64]! 1523 vld1.32 {d5}, [r0,:64]! 1524 vld1.32 {d7}, [r0,:64]! 1525 vld1.32 {d9}, [r0,:64]! 1526 1527 vld1.32 {d10}, [r0,:64]! @ A[2][0..4] 1528 vld1.32 {d12}, [r0,:64]! 1529 vld1.32 {d14}, [r0,:64]! 1530 vld1.32 {d16}, [r0,:64]! 1531 vld1.32 {d18}, [r0,:64]! 1532 1533 vld1.32 {d11}, [r0,:64]! @ A[3][0..4] 1534 vld1.32 {d13}, [r0,:64]! 1535 vld1.32 {d15}, [r0,:64]! 1536 vld1.32 {d17}, [r0,:64]! 1537 vld1.32 {d19}, [r0,:64]! 1538 1539 vld1.32 {d20-d23}, [r0,:64]! @ A[4][0..4] 1540 vld1.32 {d24}, [r0,:64] 1541 sub r0, r0, #24*8 @ rewind 1542 1543 bl KeccakF1600_neon 1544 1545 mov r12, r0 @ A_flat 1546 vst1.32 {d0}, [r0,:64]! @ A[0][0..4] 1547 vst1.32 {d2}, [r0,:64]! 1548 vst1.32 {d4}, [r0,:64]! 1549 vst1.32 {d6}, [r0,:64]! 1550 vst1.32 {d8}, [r0,:64]! 1551 1552 vst1.32 {d1}, [r0,:64]! @ A[1][0..4] 1553 vst1.32 {d3}, [r0,:64]! 1554 vst1.32 {d5}, [r0,:64]! 1555 vst1.32 {d7}, [r0,:64]! 1556 vst1.32 {d9}, [r0,:64]! 1557 1558 vst1.32 {d10}, [r0,:64]! @ A[2][0..4] 1559 vst1.32 {d12}, [r0,:64]! 1560 vst1.32 {d14}, [r0,:64]! 1561 vst1.32 {d16}, [r0,:64]! 1562 vst1.32 {d18}, [r0,:64]! 1563 1564 vst1.32 {d11}, [r0,:64]! @ A[3][0..4] 1565 vst1.32 {d13}, [r0,:64]! 1566 vst1.32 {d15}, [r0,:64]! 1567 vst1.32 {d17}, [r0,:64]! 1568 vst1.32 {d19}, [r0,:64]! 1569 1570 vst1.32 {d20-d23}, [r0,:64]! 
@ A[4][0..4] 1571 mov r14, r6 @ bsz 1572 vst1.32 {d24}, [r0,:64] 1573 mov r0, r12 @ rewind 1574 1575 vldmia sp!, {d8-d15} 1576 b .Loop_squeeze_neon 1577 1578.align 4 1579.Lsqueeze_neon_tail: 1580 ldmia r12, {r2,r3} 1581 cmp r5, #2 1582 strb r2, [r4],#1 @ endian-neutral store 1583 lsr r2, r2, #8 1584 blo .Lsqueeze_neon_done 1585 strb r2, [r4], #1 1586 lsr r2, r2, #8 1587 beq .Lsqueeze_neon_done 1588 strb r2, [r4], #1 1589 lsr r2, r2, #8 1590 cmp r5, #4 1591 blo .Lsqueeze_neon_done 1592 strb r2, [r4], #1 1593 beq .Lsqueeze_neon_done 1594 1595 strb r3, [r4], #1 1596 lsr r3, r3, #8 1597 cmp r5, #6 1598 blo .Lsqueeze_neon_done 1599 strb r3, [r4], #1 1600 lsr r3, r3, #8 1601 beq .Lsqueeze_neon_done 1602 strb r3, [r4], #1 1603 1604.Lsqueeze_neon_done: 1605 ldmia sp!, {r4-r6,pc} 1606.size SHA3_squeeze_neon,.-SHA3_squeeze_neon 1607#endif 1608.asciz "Keccak-1600 absorb and squeeze for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>" 1609.align 2 1610___ 1611 1612{ 1613 my %ldr, %str; 1614 1615 sub ldrd { 1616 my ($mnemonic,$half,$reg,$ea) = @_; 1617 my $op = $mnemonic eq "ldr" ? \%ldr : \%str; 1618 1619 if ($half eq "l") { 1620 $$op{reg} = $reg; 1621 $$op{ea} = $ea; 1622 sprintf "#ifndef __thumb2__\n" . 1623 " %s\t%s,%s\n" . 1624 "#endif", $mnemonic,$reg,$ea; 1625 } else { 1626 sprintf "#ifndef __thumb2__\n" . 1627 " %s\t%s,%s\n" . 1628 "#else\n" . 1629 " %sd\t%s,%s,%s\n" . 1630 "#endif", $mnemonic,$reg,$ea, 1631 $mnemonic,$$op{reg},$reg,$$op{ea}; 1632 } 1633 } 1634} 1635 1636foreach (split($/,$code)) { 1637 s/\`([^\`]*)\`/eval $1/ge; 1638 1639 s/^\s+(ldr|str)\.([lh])\s+(r[0-9]+),\s*(\[.*)/ldrd($1,$2,$3,$4)/ge or 1640 s/\b(ror|ls[rl])\s+(r[0-9]+.*)#/mov $2$1#/g or 1641 s/\bret\b/bx lr/g or 1642 s/\bbx\s+lr\b/.word\t0xe12fff1e/g; # make it possible to compile with -march=armv4 1643 1644 print $_,"\n"; 1645} 1646 1647close STDOUT or die "error closing STDOUT: $!"; # enforce flush 1648