1 /*
2 * Copyright 2016-2024 The OpenSSL Project Authors. All Rights Reserved.
3 *
4 * Licensed under the Apache License 2.0 (the "License"). You may not use
5 * this file except in compliance with the License. You can obtain a copy
6 * in the file LICENSE in the source distribution or at
7 * https://www.openssl.org/source/license.html
8 */
9
10 #include <openssl/e_os2.h>
11 #include <string.h>
12 #include <assert.h>
13
14 #include "internal/nelem.h"
15
16 size_t SHA3_absorb(uint64_t A[5][5], const unsigned char *inp, size_t len,
17 size_t r);
18 void SHA3_squeeze(uint64_t A[5][5], unsigned char *out, size_t len, size_t r, int next);
19
20 #if !defined(KECCAK1600_ASM) || !defined(SELFTEST)
21
22 /*
23 * Choose some sensible defaults
24 */
25 #if !defined(KECCAK_REF) && !defined(KECCAK_1X) && !defined(KECCAK_1X_ALT) && \
26 !defined(KECCAK_2X) && !defined(KECCAK_INPLACE)
27 # define KECCAK_2X /* default to KECCAK_2X variant */
28 #endif
29
30 #if defined(__i386) || defined(__i386__) || defined(_M_IX86) || \
31 (defined(__x86_64) && !defined(__BMI__)) || defined(_M_X64) || \
32 defined(__mips) || defined(__riscv) || defined(__s390__) || \
33 defined(__EMSCRIPTEN__)
34 /*
35 * These don't have "and with complement" instruction, so minimize amount
36 * of "not"-s. Implemented only in the [default] KECCAK_2X variant.
37 */
38 # define KECCAK_COMPLEMENTING_TRANSFORM
39 #endif
40
41 #if defined(__x86_64__) || defined(__aarch64__) || \
42 defined(__mips64) || defined(__ia64) || \
43 (defined(__VMS) && !defined(__vax))
44 /*
45 * These are available even in ILP32 flavours, but even then they are
46 * capable of performing 64-bit operations as efficiently as in *P64.
47 * Since it's not given that we can use sizeof(void *), just shunt it.
48 */
49 # define BIT_INTERLEAVE (0)
50 #else
51 # define BIT_INTERLEAVE (sizeof(void *) < 8)
52 #endif
53
54 #define ROL32(a, offset) (((a) << (offset)) | ((a) >> ((32 - (offset)) & 31)))
55
ROL64(uint64_t val,int offset)56 static uint64_t ROL64(uint64_t val, int offset)
57 {
58 if (offset == 0) {
59 return val;
60 } else if (!BIT_INTERLEAVE) {
61 return (val << offset) | (val >> (64-offset));
62 } else {
63 uint32_t hi = (uint32_t)(val >> 32), lo = (uint32_t)val;
64
65 if (offset & 1) {
66 uint32_t tmp = hi;
67
68 offset >>= 1;
69 hi = ROL32(lo, offset);
70 lo = ROL32(tmp, offset + 1);
71 } else {
72 offset >>= 1;
73 lo = ROL32(lo, offset);
74 hi = ROL32(hi, offset);
75 }
76
77 return ((uint64_t)hi << 32) | lo;
78 }
79 }
80
81 static const unsigned char rhotates[5][5] = {
82 { 0, 1, 62, 28, 27 },
83 { 36, 44, 6, 55, 20 },
84 { 3, 10, 43, 25, 39 },
85 { 41, 45, 15, 21, 8 },
86 { 18, 2, 61, 56, 14 }
87 };
88
89 static const uint64_t iotas[] = {
90 BIT_INTERLEAVE ? 0x0000000000000001ULL : 0x0000000000000001ULL,
91 BIT_INTERLEAVE ? 0x0000008900000000ULL : 0x0000000000008082ULL,
92 BIT_INTERLEAVE ? 0x8000008b00000000ULL : 0x800000000000808aULL,
93 BIT_INTERLEAVE ? 0x8000808000000000ULL : 0x8000000080008000ULL,
94 BIT_INTERLEAVE ? 0x0000008b00000001ULL : 0x000000000000808bULL,
95 BIT_INTERLEAVE ? 0x0000800000000001ULL : 0x0000000080000001ULL,
96 BIT_INTERLEAVE ? 0x8000808800000001ULL : 0x8000000080008081ULL,
97 BIT_INTERLEAVE ? 0x8000008200000001ULL : 0x8000000000008009ULL,
98 BIT_INTERLEAVE ? 0x0000000b00000000ULL : 0x000000000000008aULL,
99 BIT_INTERLEAVE ? 0x0000000a00000000ULL : 0x0000000000000088ULL,
100 BIT_INTERLEAVE ? 0x0000808200000001ULL : 0x0000000080008009ULL,
101 BIT_INTERLEAVE ? 0x0000800300000000ULL : 0x000000008000000aULL,
102 BIT_INTERLEAVE ? 0x0000808b00000001ULL : 0x000000008000808bULL,
103 BIT_INTERLEAVE ? 0x8000000b00000001ULL : 0x800000000000008bULL,
104 BIT_INTERLEAVE ? 0x8000008a00000001ULL : 0x8000000000008089ULL,
105 BIT_INTERLEAVE ? 0x8000008100000001ULL : 0x8000000000008003ULL,
106 BIT_INTERLEAVE ? 0x8000008100000000ULL : 0x8000000000008002ULL,
107 BIT_INTERLEAVE ? 0x8000000800000000ULL : 0x8000000000000080ULL,
108 BIT_INTERLEAVE ? 0x0000008300000000ULL : 0x000000000000800aULL,
109 BIT_INTERLEAVE ? 0x8000800300000000ULL : 0x800000008000000aULL,
110 BIT_INTERLEAVE ? 0x8000808800000001ULL : 0x8000000080008081ULL,
111 BIT_INTERLEAVE ? 0x8000008800000000ULL : 0x8000000000008080ULL,
112 BIT_INTERLEAVE ? 0x0000800000000001ULL : 0x0000000080000001ULL,
113 BIT_INTERLEAVE ? 0x8000808200000000ULL : 0x8000000080008008ULL
114 };
115
116 #if defined(KECCAK_REF)
117 /*
118 * This is straightforward or "maximum clarity" implementation aiming
119 * to resemble section 3.2 of the FIPS PUB 202 "SHA-3 Standard:
120 * Permutation-Based Hash and Extendible-Output Functions" as much as
121 * possible. With one caveat. Because of the way C stores matrices,
122 * references to A[x,y] in the specification are presented as A[y][x].
123 * Implementation unrolls inner x-loops so that modulo 5 operations are
124 * explicitly pre-computed.
125 */
Theta(uint64_t A[5][5])126 static void Theta(uint64_t A[5][5])
127 {
128 uint64_t C[5], D[5];
129 size_t y;
130
131 C[0] = A[0][0];
132 C[1] = A[0][1];
133 C[2] = A[0][2];
134 C[3] = A[0][3];
135 C[4] = A[0][4];
136
137 for (y = 1; y < 5; y++) {
138 C[0] ^= A[y][0];
139 C[1] ^= A[y][1];
140 C[2] ^= A[y][2];
141 C[3] ^= A[y][3];
142 C[4] ^= A[y][4];
143 }
144
145 D[0] = ROL64(C[1], 1) ^ C[4];
146 D[1] = ROL64(C[2], 1) ^ C[0];
147 D[2] = ROL64(C[3], 1) ^ C[1];
148 D[3] = ROL64(C[4], 1) ^ C[2];
149 D[4] = ROL64(C[0], 1) ^ C[3];
150
151 for (y = 0; y < 5; y++) {
152 A[y][0] ^= D[0];
153 A[y][1] ^= D[1];
154 A[y][2] ^= D[2];
155 A[y][3] ^= D[3];
156 A[y][4] ^= D[4];
157 }
158 }
159
Rho(uint64_t A[5][5])160 static void Rho(uint64_t A[5][5])
161 {
162 size_t y;
163
164 for (y = 0; y < 5; y++) {
165 A[y][0] = ROL64(A[y][0], rhotates[y][0]);
166 A[y][1] = ROL64(A[y][1], rhotates[y][1]);
167 A[y][2] = ROL64(A[y][2], rhotates[y][2]);
168 A[y][3] = ROL64(A[y][3], rhotates[y][3]);
169 A[y][4] = ROL64(A[y][4], rhotates[y][4]);
170 }
171 }
172
Pi(uint64_t A[5][5])173 static void Pi(uint64_t A[5][5])
174 {
175 uint64_t T[5][5];
176
177 /*
178 * T = A
179 * A[y][x] = T[x][(3*y+x)%5]
180 */
181 memcpy(T, A, sizeof(T));
182
183 A[0][0] = T[0][0];
184 A[0][1] = T[1][1];
185 A[0][2] = T[2][2];
186 A[0][3] = T[3][3];
187 A[0][4] = T[4][4];
188
189 A[1][0] = T[0][3];
190 A[1][1] = T[1][4];
191 A[1][2] = T[2][0];
192 A[1][3] = T[3][1];
193 A[1][4] = T[4][2];
194
195 A[2][0] = T[0][1];
196 A[2][1] = T[1][2];
197 A[2][2] = T[2][3];
198 A[2][3] = T[3][4];
199 A[2][4] = T[4][0];
200
201 A[3][0] = T[0][4];
202 A[3][1] = T[1][0];
203 A[3][2] = T[2][1];
204 A[3][3] = T[3][2];
205 A[3][4] = T[4][3];
206
207 A[4][0] = T[0][2];
208 A[4][1] = T[1][3];
209 A[4][2] = T[2][4];
210 A[4][3] = T[3][0];
211 A[4][4] = T[4][1];
212 }
213
Chi(uint64_t A[5][5])214 static void Chi(uint64_t A[5][5])
215 {
216 uint64_t C[5];
217 size_t y;
218
219 for (y = 0; y < 5; y++) {
220 C[0] = A[y][0] ^ (~A[y][1] & A[y][2]);
221 C[1] = A[y][1] ^ (~A[y][2] & A[y][3]);
222 C[2] = A[y][2] ^ (~A[y][3] & A[y][4]);
223 C[3] = A[y][3] ^ (~A[y][4] & A[y][0]);
224 C[4] = A[y][4] ^ (~A[y][0] & A[y][1]);
225
226 A[y][0] = C[0];
227 A[y][1] = C[1];
228 A[y][2] = C[2];
229 A[y][3] = C[3];
230 A[y][4] = C[4];
231 }
232 }
233
Iota(uint64_t A[5][5],size_t i)234 static void Iota(uint64_t A[5][5], size_t i)
235 {
236 assert(i < OSSL_NELEM(iotas));
237 A[0][0] ^= iotas[i];
238 }
239
KeccakF1600(uint64_t A[5][5])240 static void KeccakF1600(uint64_t A[5][5])
241 {
242 size_t i;
243
244 for (i = 0; i < 24; i++) {
245 Theta(A);
246 Rho(A);
247 Pi(A);
248 Chi(A);
249 Iota(A, i);
250 }
251 }
252
253 #elif defined(KECCAK_1X)
254 /*
255 * This implementation is optimization of above code featuring unroll
256 * of even y-loops, their fusion and code motion. It also minimizes
257 * temporary storage. Compiler would normally do all these things for
258 * you, purpose of manual optimization is to provide "unobscured"
259 * reference for assembly implementation [in case this approach is
260 * chosen for implementation on some platform]. In the nutshell it's
261 * equivalent of "plane-per-plane processing" approach discussed in
262 * section 2.4 of "Keccak implementation overview".
263 */
Round(uint64_t A[5][5],size_t i)264 static void Round(uint64_t A[5][5], size_t i)
265 {
266 uint64_t C[5], E[2]; /* registers */
267 uint64_t D[5], T[2][5]; /* memory */
268
269 assert(i < OSSL_NELEM(iotas));
270
271 C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0];
272 C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1];
273 C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2];
274 C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3];
275 C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4];
276
277 #if defined(__arm__)
278 D[1] = E[0] = ROL64(C[2], 1) ^ C[0];
279 D[4] = E[1] = ROL64(C[0], 1) ^ C[3];
280 D[0] = C[0] = ROL64(C[1], 1) ^ C[4];
281 D[2] = C[1] = ROL64(C[3], 1) ^ C[1];
282 D[3] = C[2] = ROL64(C[4], 1) ^ C[2];
283
284 T[0][0] = A[3][0] ^ C[0]; /* borrow T[0][0] */
285 T[0][1] = A[0][1] ^ E[0]; /* D[1] */
286 T[0][2] = A[0][2] ^ C[1]; /* D[2] */
287 T[0][3] = A[0][3] ^ C[2]; /* D[3] */
288 T[0][4] = A[0][4] ^ E[1]; /* D[4] */
289
290 C[3] = ROL64(A[3][3] ^ C[2], rhotates[3][3]); /* D[3] */
291 C[4] = ROL64(A[4][4] ^ E[1], rhotates[4][4]); /* D[4] */
292 C[0] = A[0][0] ^ C[0]; /* rotate by 0 */ /* D[0] */
293 C[2] = ROL64(A[2][2] ^ C[1], rhotates[2][2]); /* D[2] */
294 C[1] = ROL64(A[1][1] ^ E[0], rhotates[1][1]); /* D[1] */
295 #else
296 D[0] = ROL64(C[1], 1) ^ C[4];
297 D[1] = ROL64(C[2], 1) ^ C[0];
298 D[2] = ROL64(C[3], 1) ^ C[1];
299 D[3] = ROL64(C[4], 1) ^ C[2];
300 D[4] = ROL64(C[0], 1) ^ C[3];
301
302 T[0][0] = A[3][0] ^ D[0]; /* borrow T[0][0] */
303 T[0][1] = A[0][1] ^ D[1];
304 T[0][2] = A[0][2] ^ D[2];
305 T[0][3] = A[0][3] ^ D[3];
306 T[0][4] = A[0][4] ^ D[4];
307
308 C[0] = A[0][0] ^ D[0]; /* rotate by 0 */
309 C[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]);
310 C[2] = ROL64(A[2][2] ^ D[2], rhotates[2][2]);
311 C[3] = ROL64(A[3][3] ^ D[3], rhotates[3][3]);
312 C[4] = ROL64(A[4][4] ^ D[4], rhotates[4][4]);
313 #endif
314 A[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
315 A[0][1] = C[1] ^ (~C[2] & C[3]);
316 A[0][2] = C[2] ^ (~C[3] & C[4]);
317 A[0][3] = C[3] ^ (~C[4] & C[0]);
318 A[0][4] = C[4] ^ (~C[0] & C[1]);
319
320 T[1][0] = A[1][0] ^ (C[3] = D[0]);
321 T[1][1] = A[2][1] ^ (C[4] = D[1]); /* borrow T[1][1] */
322 T[1][2] = A[1][2] ^ (E[0] = D[2]);
323 T[1][3] = A[1][3] ^ (E[1] = D[3]);
324 T[1][4] = A[2][4] ^ (C[2] = D[4]); /* borrow T[1][4] */
325
326 C[0] = ROL64(T[0][3], rhotates[0][3]);
327 C[1] = ROL64(A[1][4] ^ C[2], rhotates[1][4]); /* D[4] */
328 C[2] = ROL64(A[2][0] ^ C[3], rhotates[2][0]); /* D[0] */
329 C[3] = ROL64(A[3][1] ^ C[4], rhotates[3][1]); /* D[1] */
330 C[4] = ROL64(A[4][2] ^ E[0], rhotates[4][2]); /* D[2] */
331
332 A[1][0] = C[0] ^ (~C[1] & C[2]);
333 A[1][1] = C[1] ^ (~C[2] & C[3]);
334 A[1][2] = C[2] ^ (~C[3] & C[4]);
335 A[1][3] = C[3] ^ (~C[4] & C[0]);
336 A[1][4] = C[4] ^ (~C[0] & C[1]);
337
338 C[0] = ROL64(T[0][1], rhotates[0][1]);
339 C[1] = ROL64(T[1][2], rhotates[1][2]);
340 C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
341 C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
342 C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
343
344 A[2][0] = C[0] ^ (~C[1] & C[2]);
345 A[2][1] = C[1] ^ (~C[2] & C[3]);
346 A[2][2] = C[2] ^ (~C[3] & C[4]);
347 A[2][3] = C[3] ^ (~C[4] & C[0]);
348 A[2][4] = C[4] ^ (~C[0] & C[1]);
349
350 C[0] = ROL64(T[0][4], rhotates[0][4]);
351 C[1] = ROL64(T[1][0], rhotates[1][0]);
352 C[2] = ROL64(T[1][1], rhotates[2][1]); /* originally A[2][1] */
353 C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
354 C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
355
356 A[3][0] = C[0] ^ (~C[1] & C[2]);
357 A[3][1] = C[1] ^ (~C[2] & C[3]);
358 A[3][2] = C[2] ^ (~C[3] & C[4]);
359 A[3][3] = C[3] ^ (~C[4] & C[0]);
360 A[3][4] = C[4] ^ (~C[0] & C[1]);
361
362 C[0] = ROL64(T[0][2], rhotates[0][2]);
363 C[1] = ROL64(T[1][3], rhotates[1][3]);
364 C[2] = ROL64(T[1][4], rhotates[2][4]); /* originally A[2][4] */
365 C[3] = ROL64(T[0][0], rhotates[3][0]); /* originally A[3][0] */
366 C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
367
368 A[4][0] = C[0] ^ (~C[1] & C[2]);
369 A[4][1] = C[1] ^ (~C[2] & C[3]);
370 A[4][2] = C[2] ^ (~C[3] & C[4]);
371 A[4][3] = C[3] ^ (~C[4] & C[0]);
372 A[4][4] = C[4] ^ (~C[0] & C[1]);
373 }
374
KeccakF1600(uint64_t A[5][5])375 static void KeccakF1600(uint64_t A[5][5])
376 {
377 size_t i;
378
379 for (i = 0; i < 24; i++) {
380 Round(A, i);
381 }
382 }
383
384 #elif defined(KECCAK_1X_ALT)
385 /*
386 * This is variant of above KECCAK_1X that reduces requirement for
387 * temporary storage even further, but at cost of more updates to A[][].
388 * It's less suitable if A[][] is memory bound, but better if it's
389 * register bound.
390 */
391
Round(uint64_t A[5][5],size_t i)392 static void Round(uint64_t A[5][5], size_t i)
393 {
394 uint64_t C[5], D[5];
395
396 assert(i < OSSL_NELEM(iotas));
397
398 C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0];
399 C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1];
400 C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2];
401 C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3];
402 C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4];
403
404 D[1] = C[0] ^ ROL64(C[2], 1);
405 D[2] = C[1] ^ ROL64(C[3], 1);
406 D[3] = C[2] ^= ROL64(C[4], 1);
407 D[4] = C[3] ^= ROL64(C[0], 1);
408 D[0] = C[4] ^= ROL64(C[1], 1);
409
410 A[0][1] ^= D[1];
411 A[1][1] ^= D[1];
412 A[2][1] ^= D[1];
413 A[3][1] ^= D[1];
414 A[4][1] ^= D[1];
415
416 A[0][2] ^= D[2];
417 A[1][2] ^= D[2];
418 A[2][2] ^= D[2];
419 A[3][2] ^= D[2];
420 A[4][2] ^= D[2];
421
422 A[0][3] ^= C[2];
423 A[1][3] ^= C[2];
424 A[2][3] ^= C[2];
425 A[3][3] ^= C[2];
426 A[4][3] ^= C[2];
427
428 A[0][4] ^= C[3];
429 A[1][4] ^= C[3];
430 A[2][4] ^= C[3];
431 A[3][4] ^= C[3];
432 A[4][4] ^= C[3];
433
434 A[0][0] ^= C[4];
435 A[1][0] ^= C[4];
436 A[2][0] ^= C[4];
437 A[3][0] ^= C[4];
438 A[4][0] ^= C[4];
439
440 C[1] = A[0][1];
441 C[2] = A[0][2];
442 C[3] = A[0][3];
443 C[4] = A[0][4];
444
445 A[0][1] = ROL64(A[1][1], rhotates[1][1]);
446 A[0][2] = ROL64(A[2][2], rhotates[2][2]);
447 A[0][3] = ROL64(A[3][3], rhotates[3][3]);
448 A[0][4] = ROL64(A[4][4], rhotates[4][4]);
449
450 A[1][1] = ROL64(A[1][4], rhotates[1][4]);
451 A[2][2] = ROL64(A[2][3], rhotates[2][3]);
452 A[3][3] = ROL64(A[3][2], rhotates[3][2]);
453 A[4][4] = ROL64(A[4][1], rhotates[4][1]);
454
455 A[1][4] = ROL64(A[4][2], rhotates[4][2]);
456 A[2][3] = ROL64(A[3][4], rhotates[3][4]);
457 A[3][2] = ROL64(A[2][1], rhotates[2][1]);
458 A[4][1] = ROL64(A[1][3], rhotates[1][3]);
459
460 A[4][2] = ROL64(A[2][4], rhotates[2][4]);
461 A[3][4] = ROL64(A[4][3], rhotates[4][3]);
462 A[2][1] = ROL64(A[1][2], rhotates[1][2]);
463 A[1][3] = ROL64(A[3][1], rhotates[3][1]);
464
465 A[2][4] = ROL64(A[4][0], rhotates[4][0]);
466 A[4][3] = ROL64(A[3][0], rhotates[3][0]);
467 A[1][2] = ROL64(A[2][0], rhotates[2][0]);
468 A[3][1] = ROL64(A[1][0], rhotates[1][0]);
469
470 A[1][0] = ROL64(C[3], rhotates[0][3]);
471 A[2][0] = ROL64(C[1], rhotates[0][1]);
472 A[3][0] = ROL64(C[4], rhotates[0][4]);
473 A[4][0] = ROL64(C[2], rhotates[0][2]);
474
475 C[0] = A[0][0];
476 C[1] = A[1][0];
477 D[0] = A[0][1];
478 D[1] = A[1][1];
479
480 A[0][0] ^= (~A[0][1] & A[0][2]);
481 A[1][0] ^= (~A[1][1] & A[1][2]);
482 A[0][1] ^= (~A[0][2] & A[0][3]);
483 A[1][1] ^= (~A[1][2] & A[1][3]);
484 A[0][2] ^= (~A[0][3] & A[0][4]);
485 A[1][2] ^= (~A[1][3] & A[1][4]);
486 A[0][3] ^= (~A[0][4] & C[0]);
487 A[1][3] ^= (~A[1][4] & C[1]);
488 A[0][4] ^= (~C[0] & D[0]);
489 A[1][4] ^= (~C[1] & D[1]);
490
491 C[2] = A[2][0];
492 C[3] = A[3][0];
493 D[2] = A[2][1];
494 D[3] = A[3][1];
495
496 A[2][0] ^= (~A[2][1] & A[2][2]);
497 A[3][0] ^= (~A[3][1] & A[3][2]);
498 A[2][1] ^= (~A[2][2] & A[2][3]);
499 A[3][1] ^= (~A[3][2] & A[3][3]);
500 A[2][2] ^= (~A[2][3] & A[2][4]);
501 A[3][2] ^= (~A[3][3] & A[3][4]);
502 A[2][3] ^= (~A[2][4] & C[2]);
503 A[3][3] ^= (~A[3][4] & C[3]);
504 A[2][4] ^= (~C[2] & D[2]);
505 A[3][4] ^= (~C[3] & D[3]);
506
507 C[4] = A[4][0];
508 D[4] = A[4][1];
509
510 A[4][0] ^= (~A[4][1] & A[4][2]);
511 A[4][1] ^= (~A[4][2] & A[4][3]);
512 A[4][2] ^= (~A[4][3] & A[4][4]);
513 A[4][3] ^= (~A[4][4] & C[4]);
514 A[4][4] ^= (~C[4] & D[4]);
515 A[0][0] ^= iotas[i];
516 }
517
KeccakF1600(uint64_t A[5][5])518 static void KeccakF1600(uint64_t A[5][5])
519 {
520 size_t i;
521
522 for (i = 0; i < 24; i++) {
523 Round(A, i);
524 }
525 }
526
527 #elif defined(KECCAK_2X)
528 /*
529 * This implementation is variant of KECCAK_1X above with outer-most
530 * round loop unrolled twice. This allows to take temporary storage
531 * out of round procedure and simplify references to it by alternating
532 * it with actual data (see round loop below). Originally it was meant
533 * rather as reference for an assembly implementation, but it seems to
534 * play best with compilers [as well as provide best instruction per
535 * processed byte ratio at minimal round unroll factor]...
536 */
Round(uint64_t R[5][5],uint64_t A[5][5],size_t i)537 static void Round(uint64_t R[5][5], uint64_t A[5][5], size_t i)
538 {
539 uint64_t C[5], D[5];
540
541 assert(i < OSSL_NELEM(iotas));
542
543 C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0];
544 C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1];
545 C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2];
546 C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3];
547 C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4];
548
549 D[0] = ROL64(C[1], 1) ^ C[4];
550 D[1] = ROL64(C[2], 1) ^ C[0];
551 D[2] = ROL64(C[3], 1) ^ C[1];
552 D[3] = ROL64(C[4], 1) ^ C[2];
553 D[4] = ROL64(C[0], 1) ^ C[3];
554
555 C[0] = A[0][0] ^ D[0]; /* rotate by 0 */
556 C[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]);
557 C[2] = ROL64(A[2][2] ^ D[2], rhotates[2][2]);
558 C[3] = ROL64(A[3][3] ^ D[3], rhotates[3][3]);
559 C[4] = ROL64(A[4][4] ^ D[4], rhotates[4][4]);
560
561 #ifdef KECCAK_COMPLEMENTING_TRANSFORM
562 R[0][0] = C[0] ^ ( C[1] | C[2]) ^ iotas[i];
563 R[0][1] = C[1] ^ (~C[2] | C[3]);
564 R[0][2] = C[2] ^ ( C[3] & C[4]);
565 R[0][3] = C[3] ^ ( C[4] | C[0]);
566 R[0][4] = C[4] ^ ( C[0] & C[1]);
567 #else
568 R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
569 R[0][1] = C[1] ^ (~C[2] & C[3]);
570 R[0][2] = C[2] ^ (~C[3] & C[4]);
571 R[0][3] = C[3] ^ (~C[4] & C[0]);
572 R[0][4] = C[4] ^ (~C[0] & C[1]);
573 #endif
574
575 C[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);
576 C[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
577 C[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
578 C[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
579 C[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);
580
581 #ifdef KECCAK_COMPLEMENTING_TRANSFORM
582 R[1][0] = C[0] ^ (C[1] | C[2]);
583 R[1][1] = C[1] ^ (C[2] & C[3]);
584 R[1][2] = C[2] ^ (C[3] | ~C[4]);
585 R[1][3] = C[3] ^ (C[4] | C[0]);
586 R[1][4] = C[4] ^ (C[0] & C[1]);
587 #else
588 R[1][0] = C[0] ^ (~C[1] & C[2]);
589 R[1][1] = C[1] ^ (~C[2] & C[3]);
590 R[1][2] = C[2] ^ (~C[3] & C[4]);
591 R[1][3] = C[3] ^ (~C[4] & C[0]);
592 R[1][4] = C[4] ^ (~C[0] & C[1]);
593 #endif
594
595 C[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]);
596 C[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]);
597 C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
598 C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
599 C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
600
601 #ifdef KECCAK_COMPLEMENTING_TRANSFORM
602 R[2][0] = C[0] ^ ( C[1] | C[2]);
603 R[2][1] = C[1] ^ ( C[2] & C[3]);
604 R[2][2] = C[2] ^ (~C[3] & C[4]);
605 R[2][3] = ~C[3] ^ ( C[4] | C[0]);
606 R[2][4] = C[4] ^ ( C[0] & C[1]);
607 #else
608 R[2][0] = C[0] ^ (~C[1] & C[2]);
609 R[2][1] = C[1] ^ (~C[2] & C[3]);
610 R[2][2] = C[2] ^ (~C[3] & C[4]);
611 R[2][3] = C[3] ^ (~C[4] & C[0]);
612 R[2][4] = C[4] ^ (~C[0] & C[1]);
613 #endif
614
615 C[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]);
616 C[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]);
617 C[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]);
618 C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
619 C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
620
621 #ifdef KECCAK_COMPLEMENTING_TRANSFORM
622 R[3][0] = C[0] ^ ( C[1] & C[2]);
623 R[3][1] = C[1] ^ ( C[2] | C[3]);
624 R[3][2] = C[2] ^ (~C[3] | C[4]);
625 R[3][3] = ~C[3] ^ ( C[4] & C[0]);
626 R[3][4] = C[4] ^ ( C[0] | C[1]);
627 #else
628 R[3][0] = C[0] ^ (~C[1] & C[2]);
629 R[3][1] = C[1] ^ (~C[2] & C[3]);
630 R[3][2] = C[2] ^ (~C[3] & C[4]);
631 R[3][3] = C[3] ^ (~C[4] & C[0]);
632 R[3][4] = C[4] ^ (~C[0] & C[1]);
633 #endif
634
635 C[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]);
636 C[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]);
637 C[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]);
638 C[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]);
639 C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
640
641 #ifdef KECCAK_COMPLEMENTING_TRANSFORM
642 R[4][0] = C[0] ^ (~C[1] & C[2]);
643 R[4][1] = ~C[1] ^ ( C[2] | C[3]);
644 R[4][2] = C[2] ^ ( C[3] & C[4]);
645 R[4][3] = C[3] ^ ( C[4] | C[0]);
646 R[4][4] = C[4] ^ ( C[0] & C[1]);
647 #else
648 R[4][0] = C[0] ^ (~C[1] & C[2]);
649 R[4][1] = C[1] ^ (~C[2] & C[3]);
650 R[4][2] = C[2] ^ (~C[3] & C[4]);
651 R[4][3] = C[3] ^ (~C[4] & C[0]);
652 R[4][4] = C[4] ^ (~C[0] & C[1]);
653 #endif
654 }
655
KeccakF1600(uint64_t A[5][5])656 static void KeccakF1600(uint64_t A[5][5])
657 {
658 uint64_t T[5][5];
659 size_t i;
660
661 #ifdef KECCAK_COMPLEMENTING_TRANSFORM
662 A[0][1] = ~A[0][1];
663 A[0][2] = ~A[0][2];
664 A[1][3] = ~A[1][3];
665 A[2][2] = ~A[2][2];
666 A[3][2] = ~A[3][2];
667 A[4][0] = ~A[4][0];
668 #endif
669
670 for (i = 0; i < 24; i += 2) {
671 Round(T, A, i);
672 Round(A, T, i + 1);
673 }
674
675 #ifdef KECCAK_COMPLEMENTING_TRANSFORM
676 A[0][1] = ~A[0][1];
677 A[0][2] = ~A[0][2];
678 A[1][3] = ~A[1][3];
679 A[2][2] = ~A[2][2];
680 A[3][2] = ~A[3][2];
681 A[4][0] = ~A[4][0];
682 #endif
683 }
684
685 #else /* define KECCAK_INPLACE to compile this code path */
686 /*
687 * This implementation is KECCAK_1X from above combined 4 times with
688 * a twist that allows to omit temporary storage and perform in-place
689 * processing. It's discussed in section 2.5 of "Keccak implementation
690 * overview". It's likely to be best suited for processors with large
691 * register bank... On the other hand processor with large register
692 * bank can as well use KECCAK_1X_ALT, it would be as fast but much
693 * more compact...
694 */
FourRounds(uint64_t A[5][5],size_t i)695 static void FourRounds(uint64_t A[5][5], size_t i)
696 {
697 uint64_t B[5], C[5], D[5];
698
699 assert(i <= OSSL_NELEM(iotas) - 4);
700
701 /* Round 4*n */
702 C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0];
703 C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1];
704 C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2];
705 C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3];
706 C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4];
707
708 D[0] = ROL64(C[1], 1) ^ C[4];
709 D[1] = ROL64(C[2], 1) ^ C[0];
710 D[2] = ROL64(C[3], 1) ^ C[1];
711 D[3] = ROL64(C[4], 1) ^ C[2];
712 D[4] = ROL64(C[0], 1) ^ C[3];
713
714 B[0] = A[0][0] ^ D[0]; /* rotate by 0 */
715 B[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]);
716 B[2] = ROL64(A[2][2] ^ D[2], rhotates[2][2]);
717 B[3] = ROL64(A[3][3] ^ D[3], rhotates[3][3]);
718 B[4] = ROL64(A[4][4] ^ D[4], rhotates[4][4]);
719
720 C[0] = A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i];
721 C[1] = A[1][1] = B[1] ^ (~B[2] & B[3]);
722 C[2] = A[2][2] = B[2] ^ (~B[3] & B[4]);
723 C[3] = A[3][3] = B[3] ^ (~B[4] & B[0]);
724 C[4] = A[4][4] = B[4] ^ (~B[0] & B[1]);
725
726 B[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);
727 B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
728 B[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
729 B[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
730 B[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);
731
732 C[0] ^= A[2][0] = B[0] ^ (~B[1] & B[2]);
733 C[1] ^= A[3][1] = B[1] ^ (~B[2] & B[3]);
734 C[2] ^= A[4][2] = B[2] ^ (~B[3] & B[4]);
735 C[3] ^= A[0][3] = B[3] ^ (~B[4] & B[0]);
736 C[4] ^= A[1][4] = B[4] ^ (~B[0] & B[1]);
737
738 B[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]);
739 B[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]);
740 B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
741 B[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
742 B[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
743
744 C[0] ^= A[4][0] = B[0] ^ (~B[1] & B[2]);
745 C[1] ^= A[0][1] = B[1] ^ (~B[2] & B[3]);
746 C[2] ^= A[1][2] = B[2] ^ (~B[3] & B[4]);
747 C[3] ^= A[2][3] = B[3] ^ (~B[4] & B[0]);
748 C[4] ^= A[3][4] = B[4] ^ (~B[0] & B[1]);
749
750 B[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]);
751 B[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]);
752 B[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]);
753 B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
754 B[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
755
756 C[0] ^= A[1][0] = B[0] ^ (~B[1] & B[2]);
757 C[1] ^= A[2][1] = B[1] ^ (~B[2] & B[3]);
758 C[2] ^= A[3][2] = B[2] ^ (~B[3] & B[4]);
759 C[3] ^= A[4][3] = B[3] ^ (~B[4] & B[0]);
760 C[4] ^= A[0][4] = B[4] ^ (~B[0] & B[1]);
761
762 B[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]);
763 B[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]);
764 B[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]);
765 B[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]);
766 B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
767
768 C[0] ^= A[3][0] = B[0] ^ (~B[1] & B[2]);
769 C[1] ^= A[4][1] = B[1] ^ (~B[2] & B[3]);
770 C[2] ^= A[0][2] = B[2] ^ (~B[3] & B[4]);
771 C[3] ^= A[1][3] = B[3] ^ (~B[4] & B[0]);
772 C[4] ^= A[2][4] = B[4] ^ (~B[0] & B[1]);
773
774 /* Round 4*n+1 */
775 D[0] = ROL64(C[1], 1) ^ C[4];
776 D[1] = ROL64(C[2], 1) ^ C[0];
777 D[2] = ROL64(C[3], 1) ^ C[1];
778 D[3] = ROL64(C[4], 1) ^ C[2];
779 D[4] = ROL64(C[0], 1) ^ C[3];
780
781 B[0] = A[0][0] ^ D[0]; /* rotate by 0 */
782 B[1] = ROL64(A[3][1] ^ D[1], rhotates[1][1]);
783 B[2] = ROL64(A[1][2] ^ D[2], rhotates[2][2]);
784 B[3] = ROL64(A[4][3] ^ D[3], rhotates[3][3]);
785 B[4] = ROL64(A[2][4] ^ D[4], rhotates[4][4]);
786
787 C[0] = A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i + 1];
788 C[1] = A[3][1] = B[1] ^ (~B[2] & B[3]);
789 C[2] = A[1][2] = B[2] ^ (~B[3] & B[4]);
790 C[3] = A[4][3] = B[3] ^ (~B[4] & B[0]);
791 C[4] = A[2][4] = B[4] ^ (~B[0] & B[1]);
792
793 B[0] = ROL64(A[3][3] ^ D[3], rhotates[0][3]);
794 B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
795 B[2] = ROL64(A[4][0] ^ D[0], rhotates[2][0]);
796 B[3] = ROL64(A[2][1] ^ D[1], rhotates[3][1]);
797 B[4] = ROL64(A[0][2] ^ D[2], rhotates[4][2]);
798
799 C[0] ^= A[4][0] = B[0] ^ (~B[1] & B[2]);
800 C[1] ^= A[2][1] = B[1] ^ (~B[2] & B[3]);
801 C[2] ^= A[0][2] = B[2] ^ (~B[3] & B[4]);
802 C[3] ^= A[3][3] = B[3] ^ (~B[4] & B[0]);
803 C[4] ^= A[1][4] = B[4] ^ (~B[0] & B[1]);
804
805 B[0] = ROL64(A[1][1] ^ D[1], rhotates[0][1]);
806 B[1] = ROL64(A[4][2] ^ D[2], rhotates[1][2]);
807 B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
808 B[3] = ROL64(A[0][4] ^ D[4], rhotates[3][4]);
809 B[4] = ROL64(A[3][0] ^ D[0], rhotates[4][0]);
810
811 C[0] ^= A[3][0] = B[0] ^ (~B[1] & B[2]);
812 C[1] ^= A[1][1] = B[1] ^ (~B[2] & B[3]);
813 C[2] ^= A[4][2] = B[2] ^ (~B[3] & B[4]);
814 C[3] ^= A[2][3] = B[3] ^ (~B[4] & B[0]);
815 C[4] ^= A[0][4] = B[4] ^ (~B[0] & B[1]);
816
817 B[0] = ROL64(A[4][4] ^ D[4], rhotates[0][4]);
818 B[1] = ROL64(A[2][0] ^ D[0], rhotates[1][0]);
819 B[2] = ROL64(A[0][1] ^ D[1], rhotates[2][1]);
820 B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
821 B[4] = ROL64(A[1][3] ^ D[3], rhotates[4][3]);
822
823 C[0] ^= A[2][0] = B[0] ^ (~B[1] & B[2]);
824 C[1] ^= A[0][1] = B[1] ^ (~B[2] & B[3]);
825 C[2] ^= A[3][2] = B[2] ^ (~B[3] & B[4]);
826 C[3] ^= A[1][3] = B[3] ^ (~B[4] & B[0]);
827 C[4] ^= A[4][4] = B[4] ^ (~B[0] & B[1]);
828
829 B[0] = ROL64(A[2][2] ^ D[2], rhotates[0][2]);
830 B[1] = ROL64(A[0][3] ^ D[3], rhotates[1][3]);
831 B[2] = ROL64(A[3][4] ^ D[4], rhotates[2][4]);
832 B[3] = ROL64(A[1][0] ^ D[0], rhotates[3][0]);
833 B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
834
835 C[0] ^= A[1][0] = B[0] ^ (~B[1] & B[2]);
836 C[1] ^= A[4][1] = B[1] ^ (~B[2] & B[3]);
837 C[2] ^= A[2][2] = B[2] ^ (~B[3] & B[4]);
838 C[3] ^= A[0][3] = B[3] ^ (~B[4] & B[0]);
839 C[4] ^= A[3][4] = B[4] ^ (~B[0] & B[1]);
840
841 /* Round 4*n+2 */
842 D[0] = ROL64(C[1], 1) ^ C[4];
843 D[1] = ROL64(C[2], 1) ^ C[0];
844 D[2] = ROL64(C[3], 1) ^ C[1];
845 D[3] = ROL64(C[4], 1) ^ C[2];
846 D[4] = ROL64(C[0], 1) ^ C[3];
847
848 B[0] = A[0][0] ^ D[0]; /* rotate by 0 */
849 B[1] = ROL64(A[2][1] ^ D[1], rhotates[1][1]);
850 B[2] = ROL64(A[4][2] ^ D[2], rhotates[2][2]);
851 B[3] = ROL64(A[1][3] ^ D[3], rhotates[3][3]);
852 B[4] = ROL64(A[3][4] ^ D[4], rhotates[4][4]);
853
854 C[0] = A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i + 2];
855 C[1] = A[2][1] = B[1] ^ (~B[2] & B[3]);
856 C[2] = A[4][2] = B[2] ^ (~B[3] & B[4]);
857 C[3] = A[1][3] = B[3] ^ (~B[4] & B[0]);
858 C[4] = A[3][4] = B[4] ^ (~B[0] & B[1]);
859
860 B[0] = ROL64(A[4][3] ^ D[3], rhotates[0][3]);
861 B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
862 B[2] = ROL64(A[3][0] ^ D[0], rhotates[2][0]);
863 B[3] = ROL64(A[0][1] ^ D[1], rhotates[3][1]);
864 B[4] = ROL64(A[2][2] ^ D[2], rhotates[4][2]);
865
866 C[0] ^= A[3][0] = B[0] ^ (~B[1] & B[2]);
867 C[1] ^= A[0][1] = B[1] ^ (~B[2] & B[3]);
868 C[2] ^= A[2][2] = B[2] ^ (~B[3] & B[4]);
869 C[3] ^= A[4][3] = B[3] ^ (~B[4] & B[0]);
870 C[4] ^= A[1][4] = B[4] ^ (~B[0] & B[1]);
871
872 B[0] = ROL64(A[3][1] ^ D[1], rhotates[0][1]);
873 B[1] = ROL64(A[0][2] ^ D[2], rhotates[1][2]);
874 B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
875 B[3] = ROL64(A[4][4] ^ D[4], rhotates[3][4]);
876 B[4] = ROL64(A[1][0] ^ D[0], rhotates[4][0]);
877
878 C[0] ^= A[1][0] = B[0] ^ (~B[1] & B[2]);
879 C[1] ^= A[3][1] = B[1] ^ (~B[2] & B[3]);
880 C[2] ^= A[0][2] = B[2] ^ (~B[3] & B[4]);
881 C[3] ^= A[2][3] = B[3] ^ (~B[4] & B[0]);
882 C[4] ^= A[4][4] = B[4] ^ (~B[0] & B[1]);
883
884 B[0] = ROL64(A[2][4] ^ D[4], rhotates[0][4]);
885 B[1] = ROL64(A[4][0] ^ D[0], rhotates[1][0]);
886 B[2] = ROL64(A[1][1] ^ D[1], rhotates[2][1]);
887 B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
888 B[4] = ROL64(A[0][3] ^ D[3], rhotates[4][3]);
889
890 C[0] ^= A[4][0] = B[0] ^ (~B[1] & B[2]);
891 C[1] ^= A[1][1] = B[1] ^ (~B[2] & B[3]);
892 C[2] ^= A[3][2] = B[2] ^ (~B[3] & B[4]);
893 C[3] ^= A[0][3] = B[3] ^ (~B[4] & B[0]);
894 C[4] ^= A[2][4] = B[4] ^ (~B[0] & B[1]);
895
896 B[0] = ROL64(A[1][2] ^ D[2], rhotates[0][2]);
897 B[1] = ROL64(A[3][3] ^ D[3], rhotates[1][3]);
898 B[2] = ROL64(A[0][4] ^ D[4], rhotates[2][4]);
899 B[3] = ROL64(A[2][0] ^ D[0], rhotates[3][0]);
900 B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
901
902 C[0] ^= A[2][0] = B[0] ^ (~B[1] & B[2]);
903 C[1] ^= A[4][1] = B[1] ^ (~B[2] & B[3]);
904 C[2] ^= A[1][2] = B[2] ^ (~B[3] & B[4]);
905 C[3] ^= A[3][3] = B[3] ^ (~B[4] & B[0]);
906 C[4] ^= A[0][4] = B[4] ^ (~B[0] & B[1]);
907
908 /* Round 4*n+3 */
909 D[0] = ROL64(C[1], 1) ^ C[4];
910 D[1] = ROL64(C[2], 1) ^ C[0];
911 D[2] = ROL64(C[3], 1) ^ C[1];
912 D[3] = ROL64(C[4], 1) ^ C[2];
913 D[4] = ROL64(C[0], 1) ^ C[3];
914
915 B[0] = A[0][0] ^ D[0]; /* rotate by 0 */
916 B[1] = ROL64(A[0][1] ^ D[1], rhotates[1][1]);
917 B[2] = ROL64(A[0][2] ^ D[2], rhotates[2][2]);
918 B[3] = ROL64(A[0][3] ^ D[3], rhotates[3][3]);
919 B[4] = ROL64(A[0][4] ^ D[4], rhotates[4][4]);
920
921 /* C[0] = */ A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i + 3];
922 /* C[1] = */ A[0][1] = B[1] ^ (~B[2] & B[3]);
923 /* C[2] = */ A[0][2] = B[2] ^ (~B[3] & B[4]);
924 /* C[3] = */ A[0][3] = B[3] ^ (~B[4] & B[0]);
925 /* C[4] = */ A[0][4] = B[4] ^ (~B[0] & B[1]);
926
927 B[0] = ROL64(A[1][3] ^ D[3], rhotates[0][3]);
928 B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
929 B[2] = ROL64(A[1][0] ^ D[0], rhotates[2][0]);
930 B[3] = ROL64(A[1][1] ^ D[1], rhotates[3][1]);
931 B[4] = ROL64(A[1][2] ^ D[2], rhotates[4][2]);
932
933 /* C[0] ^= */ A[1][0] = B[0] ^ (~B[1] & B[2]);
934 /* C[1] ^= */ A[1][1] = B[1] ^ (~B[2] & B[3]);
935 /* C[2] ^= */ A[1][2] = B[2] ^ (~B[3] & B[4]);
936 /* C[3] ^= */ A[1][3] = B[3] ^ (~B[4] & B[0]);
937 /* C[4] ^= */ A[1][4] = B[4] ^ (~B[0] & B[1]);
938
939 B[0] = ROL64(A[2][1] ^ D[1], rhotates[0][1]);
940 B[1] = ROL64(A[2][2] ^ D[2], rhotates[1][2]);
941 B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
942 B[3] = ROL64(A[2][4] ^ D[4], rhotates[3][4]);
943 B[4] = ROL64(A[2][0] ^ D[0], rhotates[4][0]);
944
945 /* C[0] ^= */ A[2][0] = B[0] ^ (~B[1] & B[2]);
946 /* C[1] ^= */ A[2][1] = B[1] ^ (~B[2] & B[3]);
947 /* C[2] ^= */ A[2][2] = B[2] ^ (~B[3] & B[4]);
948 /* C[3] ^= */ A[2][3] = B[3] ^ (~B[4] & B[0]);
949 /* C[4] ^= */ A[2][4] = B[4] ^ (~B[0] & B[1]);
950
951 B[0] = ROL64(A[3][4] ^ D[4], rhotates[0][4]);
952 B[1] = ROL64(A[3][0] ^ D[0], rhotates[1][0]);
953 B[2] = ROL64(A[3][1] ^ D[1], rhotates[2][1]);
954 B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
955 B[4] = ROL64(A[3][3] ^ D[3], rhotates[4][3]);
956
957 /* C[0] ^= */ A[3][0] = B[0] ^ (~B[1] & B[2]);
958 /* C[1] ^= */ A[3][1] = B[1] ^ (~B[2] & B[3]);
959 /* C[2] ^= */ A[3][2] = B[2] ^ (~B[3] & B[4]);
960 /* C[3] ^= */ A[3][3] = B[3] ^ (~B[4] & B[0]);
961 /* C[4] ^= */ A[3][4] = B[4] ^ (~B[0] & B[1]);
962
963 B[0] = ROL64(A[4][2] ^ D[2], rhotates[0][2]);
964 B[1] = ROL64(A[4][3] ^ D[3], rhotates[1][3]);
965 B[2] = ROL64(A[4][4] ^ D[4], rhotates[2][4]);
966 B[3] = ROL64(A[4][0] ^ D[0], rhotates[3][0]);
967 B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
968
969 /* C[0] ^= */ A[4][0] = B[0] ^ (~B[1] & B[2]);
970 /* C[1] ^= */ A[4][1] = B[1] ^ (~B[2] & B[3]);
971 /* C[2] ^= */ A[4][2] = B[2] ^ (~B[3] & B[4]);
972 /* C[3] ^= */ A[4][3] = B[3] ^ (~B[4] & B[0]);
973 /* C[4] ^= */ A[4][4] = B[4] ^ (~B[0] & B[1]);
974 }
975
KeccakF1600(uint64_t A[5][5])976 static void KeccakF1600(uint64_t A[5][5])
977 {
978 size_t i;
979
980 for (i = 0; i < 24; i += 4) {
981 FourRounds(A, i);
982 }
983 }
984
985 #endif
986
BitInterleave(uint64_t Ai)987 static uint64_t BitInterleave(uint64_t Ai)
988 {
989 if (BIT_INTERLEAVE) {
990 uint32_t hi = (uint32_t)(Ai >> 32), lo = (uint32_t)Ai;
991 uint32_t t0, t1;
992
993 t0 = lo & 0x55555555;
994 t0 |= t0 >> 1; t0 &= 0x33333333;
995 t0 |= t0 >> 2; t0 &= 0x0f0f0f0f;
996 t0 |= t0 >> 4; t0 &= 0x00ff00ff;
997 t0 |= t0 >> 8; t0 &= 0x0000ffff;
998
999 t1 = hi & 0x55555555;
1000 t1 |= t1 >> 1; t1 &= 0x33333333;
1001 t1 |= t1 >> 2; t1 &= 0x0f0f0f0f;
1002 t1 |= t1 >> 4; t1 &= 0x00ff00ff;
1003 t1 |= t1 >> 8; t1 <<= 16;
1004
1005 lo &= 0xaaaaaaaa;
1006 lo |= lo << 1; lo &= 0xcccccccc;
1007 lo |= lo << 2; lo &= 0xf0f0f0f0;
1008 lo |= lo << 4; lo &= 0xff00ff00;
1009 lo |= lo << 8; lo >>= 16;
1010
1011 hi &= 0xaaaaaaaa;
1012 hi |= hi << 1; hi &= 0xcccccccc;
1013 hi |= hi << 2; hi &= 0xf0f0f0f0;
1014 hi |= hi << 4; hi &= 0xff00ff00;
1015 hi |= hi << 8; hi &= 0xffff0000;
1016
1017 Ai = ((uint64_t)(hi | lo) << 32) | (t1 | t0);
1018 }
1019
1020 return Ai;
1021 }
1022
BitDeinterleave(uint64_t Ai)1023 static uint64_t BitDeinterleave(uint64_t Ai)
1024 {
1025 if (BIT_INTERLEAVE) {
1026 uint32_t hi = (uint32_t)(Ai >> 32), lo = (uint32_t)Ai;
1027 uint32_t t0, t1;
1028
1029 t0 = lo & 0x0000ffff;
1030 t0 |= t0 << 8; t0 &= 0x00ff00ff;
1031 t0 |= t0 << 4; t0 &= 0x0f0f0f0f;
1032 t0 |= t0 << 2; t0 &= 0x33333333;
1033 t0 |= t0 << 1; t0 &= 0x55555555;
1034
1035 t1 = hi << 16;
1036 t1 |= t1 >> 8; t1 &= 0xff00ff00;
1037 t1 |= t1 >> 4; t1 &= 0xf0f0f0f0;
1038 t1 |= t1 >> 2; t1 &= 0xcccccccc;
1039 t1 |= t1 >> 1; t1 &= 0xaaaaaaaa;
1040
1041 lo >>= 16;
1042 lo |= lo << 8; lo &= 0x00ff00ff;
1043 lo |= lo << 4; lo &= 0x0f0f0f0f;
1044 lo |= lo << 2; lo &= 0x33333333;
1045 lo |= lo << 1; lo &= 0x55555555;
1046
1047 hi &= 0xffff0000;
1048 hi |= hi >> 8; hi &= 0xff00ff00;
1049 hi |= hi >> 4; hi &= 0xf0f0f0f0;
1050 hi |= hi >> 2; hi &= 0xcccccccc;
1051 hi |= hi >> 1; hi &= 0xaaaaaaaa;
1052
1053 Ai = ((uint64_t)(hi | lo) << 32) | (t1 | t0);
1054 }
1055
1056 return Ai;
1057 }
1058
1059 /*
1060 * SHA3_absorb can be called multiple times, but at each invocation
1061 * largest multiple of |r| out of |len| bytes are processed. Then
1062 * remaining amount of bytes is returned. This is done to spare caller
1063 * trouble of calculating the largest multiple of |r|. |r| can be viewed
1064 * as blocksize. It is commonly (1600 - 256*n)/8, e.g. 168, 136, 104,
1065 * 72, but can also be (1600 - 448)/8 = 144. All this means that message
1066 * padding and intermediate sub-block buffering, byte- or bitwise, is
1067 * caller's responsibility.
1068 */
SHA3_absorb(uint64_t A[5][5],const unsigned char * inp,size_t len,size_t r)1069 size_t SHA3_absorb(uint64_t A[5][5], const unsigned char *inp, size_t len,
1070 size_t r)
1071 {
1072 uint64_t *A_flat = (uint64_t *)A;
1073 size_t i, w = r / 8;
1074
1075 assert(r < (25 * sizeof(A[0][0])) && (r % 8) == 0);
1076
1077 while (len >= r) {
1078 for (i = 0; i < w; i++) {
1079 uint64_t Ai = (uint64_t)inp[0] | (uint64_t)inp[1] << 8 |
1080 (uint64_t)inp[2] << 16 | (uint64_t)inp[3] << 24 |
1081 (uint64_t)inp[4] << 32 | (uint64_t)inp[5] << 40 |
1082 (uint64_t)inp[6] << 48 | (uint64_t)inp[7] << 56;
1083 inp += 8;
1084
1085 A_flat[i] ^= BitInterleave(Ai);
1086 }
1087 KeccakF1600(A);
1088 len -= r;
1089 }
1090
1091 return len;
1092 }
1093
1094 /*
1095 * SHA3_squeeze may be called after SHA3_absorb to generate |out| hash value of
1096 * |len| bytes.
1097 * If multiple SHA3_squeeze calls are required the output length |len| must be a
1098 * multiple of the blocksize, with |next| being 0 on the first call and 1 on
1099 * subsequent calls. It is the callers responsibility to buffer the results.
1100 * When only a single call to SHA3_squeeze is required, |len| can be any size
1101 * and |next| must be 0.
1102 */
SHA3_squeeze(uint64_t A[5][5],unsigned char * out,size_t len,size_t r,int next)1103 void SHA3_squeeze(uint64_t A[5][5], unsigned char *out, size_t len, size_t r,
1104 int next)
1105 {
1106 uint64_t *A_flat = (uint64_t *)A;
1107 size_t i, w = r / 8;
1108
1109 assert(r < (25 * sizeof(A[0][0])) && (r % 8) == 0);
1110
1111 while (len != 0) {
1112 if (next)
1113 KeccakF1600(A);
1114 next = 1;
1115 for (i = 0; i < w && len != 0; i++) {
1116 uint64_t Ai = BitDeinterleave(A_flat[i]);
1117
1118 if (len < 8) {
1119 for (i = 0; i < len; i++) {
1120 *out++ = (unsigned char)Ai;
1121 Ai >>= 8;
1122 }
1123 return;
1124 }
1125
1126 out[0] = (unsigned char)(Ai);
1127 out[1] = (unsigned char)(Ai >> 8);
1128 out[2] = (unsigned char)(Ai >> 16);
1129 out[3] = (unsigned char)(Ai >> 24);
1130 out[4] = (unsigned char)(Ai >> 32);
1131 out[5] = (unsigned char)(Ai >> 40);
1132 out[6] = (unsigned char)(Ai >> 48);
1133 out[7] = (unsigned char)(Ai >> 56);
1134 out += 8;
1135 len -= 8;
1136 }
1137 }
1138 }
1139 #endif
1140
1141 #ifdef SELFTEST
1142 /*
1143 * Post-padding one-shot implementations would look as following:
1144 *
1145 * SHA3_224 SHA3_sponge(inp, len, out, 224/8, (1600-448)/8);
1146 * SHA3_256 SHA3_sponge(inp, len, out, 256/8, (1600-512)/8);
1147 * SHA3_384 SHA3_sponge(inp, len, out, 384/8, (1600-768)/8);
1148 * SHA3_512 SHA3_sponge(inp, len, out, 512/8, (1600-1024)/8);
1149 * SHAKE_128 SHA3_sponge(inp, len, out, d, (1600-256)/8);
1150 * SHAKE_256 SHA3_sponge(inp, len, out, d, (1600-512)/8);
1151 */
1152
SHA3_sponge(const unsigned char * inp,size_t len,unsigned char * out,size_t d,size_t r)1153 void SHA3_sponge(const unsigned char *inp, size_t len,
1154 unsigned char *out, size_t d, size_t r)
1155 {
1156 uint64_t A[5][5];
1157
1158 memset(A, 0, sizeof(A));
1159 SHA3_absorb(A, inp, len, r);
1160 SHA3_squeeze(A, out, d, r);
1161 }
1162
1163 # include <stdio.h>
1164
main(void)1165 int main(void)
1166 {
1167 /*
1168 * This is 5-bit SHAKE128 test from http://csrc.nist.gov/groups/ST/toolkit/examples.html#aHashing
1169 */
1170 unsigned char test[168] = { '\xf3', '\x3' };
1171 unsigned char out[512];
1172 size_t i;
1173 static const unsigned char result[512] = {
1174 0x2E, 0x0A, 0xBF, 0xBA, 0x83, 0xE6, 0x72, 0x0B,
1175 0xFB, 0xC2, 0x25, 0xFF, 0x6B, 0x7A, 0xB9, 0xFF,
1176 0xCE, 0x58, 0xBA, 0x02, 0x7E, 0xE3, 0xD8, 0x98,
1177 0x76, 0x4F, 0xEF, 0x28, 0x7D, 0xDE, 0xCC, 0xCA,
1178 0x3E, 0x6E, 0x59, 0x98, 0x41, 0x1E, 0x7D, 0xDB,
1179 0x32, 0xF6, 0x75, 0x38, 0xF5, 0x00, 0xB1, 0x8C,
1180 0x8C, 0x97, 0xC4, 0x52, 0xC3, 0x70, 0xEA, 0x2C,
1181 0xF0, 0xAF, 0xCA, 0x3E, 0x05, 0xDE, 0x7E, 0x4D,
1182 0xE2, 0x7F, 0xA4, 0x41, 0xA9, 0xCB, 0x34, 0xFD,
1183 0x17, 0xC9, 0x78, 0xB4, 0x2D, 0x5B, 0x7E, 0x7F,
1184 0x9A, 0xB1, 0x8F, 0xFE, 0xFF, 0xC3, 0xC5, 0xAC,
1185 0x2F, 0x3A, 0x45, 0x5E, 0xEB, 0xFD, 0xC7, 0x6C,
1186 0xEA, 0xEB, 0x0A, 0x2C, 0xCA, 0x22, 0xEE, 0xF6,
1187 0xE6, 0x37, 0xF4, 0xCA, 0xBE, 0x5C, 0x51, 0xDE,
1188 0xD2, 0xE3, 0xFA, 0xD8, 0xB9, 0x52, 0x70, 0xA3,
1189 0x21, 0x84, 0x56, 0x64, 0xF1, 0x07, 0xD1, 0x64,
1190 0x96, 0xBB, 0x7A, 0xBF, 0xBE, 0x75, 0x04, 0xB6,
1191 0xED, 0xE2, 0xE8, 0x9E, 0x4B, 0x99, 0x6F, 0xB5,
1192 0x8E, 0xFD, 0xC4, 0x18, 0x1F, 0x91, 0x63, 0x38,
1193 0x1C, 0xBE, 0x7B, 0xC0, 0x06, 0xA7, 0xA2, 0x05,
1194 0x98, 0x9C, 0x52, 0x6C, 0xD1, 0xBD, 0x68, 0x98,
1195 0x36, 0x93, 0xB4, 0xBD, 0xC5, 0x37, 0x28, 0xB2,
1196 0x41, 0xC1, 0xCF, 0xF4, 0x2B, 0xB6, 0x11, 0x50,
1197 0x2C, 0x35, 0x20, 0x5C, 0xAB, 0xB2, 0x88, 0x75,
1198 0x56, 0x55, 0xD6, 0x20, 0xC6, 0x79, 0x94, 0xF0,
1199 0x64, 0x51, 0x18, 0x7F, 0x6F, 0xD1, 0x7E, 0x04,
1200 0x66, 0x82, 0xBA, 0x12, 0x86, 0x06, 0x3F, 0xF8,
1201 0x8F, 0xE2, 0x50, 0x8D, 0x1F, 0xCA, 0xF9, 0x03,
1202 0x5A, 0x12, 0x31, 0xAD, 0x41, 0x50, 0xA9, 0xC9,
1203 0xB2, 0x4C, 0x9B, 0x2D, 0x66, 0xB2, 0xAD, 0x1B,
1204 0xDE, 0x0B, 0xD0, 0xBB, 0xCB, 0x8B, 0xE0, 0x5B,
1205 0x83, 0x52, 0x29, 0xEF, 0x79, 0x19, 0x73, 0x73,
1206 0x23, 0x42, 0x44, 0x01, 0xE1, 0xD8, 0x37, 0xB6,
1207 0x6E, 0xB4, 0xE6, 0x30, 0xFF, 0x1D, 0xE7, 0x0C,
1208 0xB3, 0x17, 0xC2, 0xBA, 0xCB, 0x08, 0x00, 0x1D,
1209 0x34, 0x77, 0xB7, 0xA7, 0x0A, 0x57, 0x6D, 0x20,
1210 0x86, 0x90, 0x33, 0x58, 0x9D, 0x85, 0xA0, 0x1D,
1211 0xDB, 0x2B, 0x66, 0x46, 0xC0, 0x43, 0xB5, 0x9F,
1212 0xC0, 0x11, 0x31, 0x1D, 0xA6, 0x66, 0xFA, 0x5A,
1213 0xD1, 0xD6, 0x38, 0x7F, 0xA9, 0xBC, 0x40, 0x15,
1214 0xA3, 0x8A, 0x51, 0xD1, 0xDA, 0x1E, 0xA6, 0x1D,
1215 0x64, 0x8D, 0xC8, 0xE3, 0x9A, 0x88, 0xB9, 0xD6,
1216 0x22, 0xBD, 0xE2, 0x07, 0xFD, 0xAB, 0xC6, 0xF2,
1217 0x82, 0x7A, 0x88, 0x0C, 0x33, 0x0B, 0xBF, 0x6D,
1218 0xF7, 0x33, 0x77, 0x4B, 0x65, 0x3E, 0x57, 0x30,
1219 0x5D, 0x78, 0xDC, 0xE1, 0x12, 0xF1, 0x0A, 0x2C,
1220 0x71, 0xF4, 0xCD, 0xAD, 0x92, 0xED, 0x11, 0x3E,
1221 0x1C, 0xEA, 0x63, 0xB9, 0x19, 0x25, 0xED, 0x28,
1222 0x19, 0x1E, 0x6D, 0xBB, 0xB5, 0xAA, 0x5A, 0x2A,
1223 0xFD, 0xA5, 0x1F, 0xC0, 0x5A, 0x3A, 0xF5, 0x25,
1224 0x8B, 0x87, 0x66, 0x52, 0x43, 0x55, 0x0F, 0x28,
1225 0x94, 0x8A, 0xE2, 0xB8, 0xBE, 0xB6, 0xBC, 0x9C,
1226 0x77, 0x0B, 0x35, 0xF0, 0x67, 0xEA, 0xA6, 0x41,
1227 0xEF, 0xE6, 0x5B, 0x1A, 0x44, 0x90, 0x9D, 0x1B,
1228 0x14, 0x9F, 0x97, 0xEE, 0xA6, 0x01, 0x39, 0x1C,
1229 0x60, 0x9E, 0xC8, 0x1D, 0x19, 0x30, 0xF5, 0x7C,
1230 0x18, 0xA4, 0xE0, 0xFA, 0xB4, 0x91, 0xD1, 0xCA,
1231 0xDF, 0xD5, 0x04, 0x83, 0x44, 0x9E, 0xDC, 0x0F,
1232 0x07, 0xFF, 0xB2, 0x4D, 0x2C, 0x6F, 0x9A, 0x9A,
1233 0x3B, 0xFF, 0x39, 0xAE, 0x3D, 0x57, 0xF5, 0x60,
1234 0x65, 0x4D, 0x7D, 0x75, 0xC9, 0x08, 0xAB, 0xE6,
1235 0x25, 0x64, 0x75, 0x3E, 0xAC, 0x39, 0xD7, 0x50,
1236 0x3D, 0xA6, 0xD3, 0x7C, 0x2E, 0x32, 0xE1, 0xAF,
1237 0x3B, 0x8A, 0xEC, 0x8A, 0xE3, 0x06, 0x9C, 0xD9
1238 };
1239
1240 test[167] = '\x80';
1241 SHA3_sponge(test, sizeof(test), out, sizeof(out), sizeof(test));
1242
1243 /*
1244 * Rationale behind keeping output [formatted as below] is that
1245 * one should be able to redirect it to a file, then copy-n-paste
1246 * final "output val" from official example to another file, and
1247 * compare the two with diff(1).
1248 */
1249 for (i = 0; i < sizeof(out);) {
1250 printf("%02X", out[i]);
1251 printf(++i % 16 && i != sizeof(out) ? " " : "\n");
1252 }
1253
1254 if (memcmp(out, result, sizeof(out))) {
1255 fprintf(stderr, "failure\n");
1256 return 1;
1257 } else {
1258 fprintf(stderr, "success\n");
1259 return 0;
1260 }
1261 }
1262 #endif
1263