xref: /openssl/crypto/sha/keccak1600.c (revision 7ed6de99)
1 /*
2  * Copyright 2016-2024 The OpenSSL Project Authors. All Rights Reserved.
3  *
4  * Licensed under the Apache License 2.0 (the "License").  You may not use
5  * this file except in compliance with the License.  You can obtain a copy
6  * in the file LICENSE in the source distribution or at
7  * https://www.openssl.org/source/license.html
8  */
9 
10 #include <openssl/e_os2.h>
11 #include <string.h>
12 #include <assert.h>
13 
14 #include "internal/nelem.h"
15 
16 size_t SHA3_absorb(uint64_t A[5][5], const unsigned char *inp, size_t len,
17                    size_t r);
18 void SHA3_squeeze(uint64_t A[5][5], unsigned char *out, size_t len, size_t r, int next);
19 
20 #if !defined(KECCAK1600_ASM) || !defined(SELFTEST)
21 
22 /*
23  * Choose some sensible defaults
24  */
25 #if !defined(KECCAK_REF) && !defined(KECCAK_1X) && !defined(KECCAK_1X_ALT) && \
26     !defined(KECCAK_2X) && !defined(KECCAK_INPLACE)
27 # define KECCAK_2X      /* default to KECCAK_2X variant */
28 #endif
29 
30 #if defined(__i386) || defined(__i386__) || defined(_M_IX86) || \
31     (defined(__x86_64) && !defined(__BMI__)) || defined(_M_X64) || \
32     defined(__mips) || defined(__riscv) || defined(__s390__) || \
33     defined(__EMSCRIPTEN__)
34 /*
35  * These don't have "and with complement" instruction, so minimize amount
36  * of "not"-s. Implemented only in the [default] KECCAK_2X variant.
37  */
38 # define KECCAK_COMPLEMENTING_TRANSFORM
39 #endif
40 
41 #if defined(__x86_64__) || defined(__aarch64__) || \
42     defined(__mips64) || defined(__ia64) || \
43     (defined(__VMS) && !defined(__vax))
44 /*
45  * These are available even in ILP32 flavours, but even then they are
46  * capable of performing 64-bit operations as efficiently as in *P64.
47  * Since it's not given that we can use sizeof(void *), just shunt it.
48  */
49 # define BIT_INTERLEAVE (0)
50 #else
51 # define BIT_INTERLEAVE (sizeof(void *) < 8)
52 #endif
53 
54 #define ROL32(a, offset) (((a) << (offset)) | ((a) >> ((32 - (offset)) & 31)))
55 
/*
 * Rotate the 64-bit lane |val| left by |offset| bits, 0 <= offset < 64.
 *
 * When BIT_INTERLEAVE is non-zero the lane is kept in bit-interleaved
 * form: the odd-numbered bits of the logical 64-bit value live in the
 * upper 32-bit half and the even-numbered bits in the lower half (see
 * the interleaved iotas constants below).  In that representation a
 * 64-bit rotation by 2*n is two 32-bit rotations by n, and a rotation
 * by 2*n+1 additionally swaps the halves, rotating them by n and n+1.
 */
static uint64_t ROL64(uint64_t val, int offset)
{
    if (offset == 0) {
        return val;
    } else if (!BIT_INTERLEAVE) {
        return (val << offset) | (val >> (64-offset));
    } else {
        uint32_t hi = (uint32_t)(val >> 32), lo = (uint32_t)val;

        if (offset & 1) {   /* odd rotation: the halves trade places */
            uint32_t tmp = hi;

            offset >>= 1;
            hi = ROL32(lo, offset);
            lo = ROL32(tmp, offset + 1);
        } else {            /* even rotation: each half rotates by offset/2 */
            offset >>= 1;
            lo = ROL32(lo, offset);
            hi = ROL32(hi, offset);
        }

        return ((uint64_t)hi << 32) | lo;
    }
}
80 
/*
 * Rho step rotation offsets, indexed as rhotates[y][x] to match the
 * A[y][x] state layout used throughout this file.
 */
static const unsigned char rhotates[5][5] = {
    {  0,  1, 62, 28, 27 },
    { 36, 44,  6, 55, 20 },
    {  3, 10, 43, 25, 39 },
    { 41, 45, 15, 21,  8 },
    { 18,  2, 61, 56, 14 }
};
88 
/*
 * Iota step round constants for all 24 rounds of Keccak-f[1600].
 * Each entry provides two alternative encodings of the same constant:
 * the bit-interleaved form (odd bits in the upper 32-bit half, even
 * bits in the lower half) used when BIT_INTERLEAVE is non-zero, and
 * the plain 64-bit value otherwise.  BIT_INTERLEAVE is a compile-time
 * constant, so each conditional folds to a single initializer.
 */
static const uint64_t iotas[] = {
    BIT_INTERLEAVE ? 0x0000000000000001ULL : 0x0000000000000001ULL,
    BIT_INTERLEAVE ? 0x0000008900000000ULL : 0x0000000000008082ULL,
    BIT_INTERLEAVE ? 0x8000008b00000000ULL : 0x800000000000808aULL,
    BIT_INTERLEAVE ? 0x8000808000000000ULL : 0x8000000080008000ULL,
    BIT_INTERLEAVE ? 0x0000008b00000001ULL : 0x000000000000808bULL,
    BIT_INTERLEAVE ? 0x0000800000000001ULL : 0x0000000080000001ULL,
    BIT_INTERLEAVE ? 0x8000808800000001ULL : 0x8000000080008081ULL,
    BIT_INTERLEAVE ? 0x8000008200000001ULL : 0x8000000000008009ULL,
    BIT_INTERLEAVE ? 0x0000000b00000000ULL : 0x000000000000008aULL,
    BIT_INTERLEAVE ? 0x0000000a00000000ULL : 0x0000000000000088ULL,
    BIT_INTERLEAVE ? 0x0000808200000001ULL : 0x0000000080008009ULL,
    BIT_INTERLEAVE ? 0x0000800300000000ULL : 0x000000008000000aULL,
    BIT_INTERLEAVE ? 0x0000808b00000001ULL : 0x000000008000808bULL,
    BIT_INTERLEAVE ? 0x8000000b00000001ULL : 0x800000000000008bULL,
    BIT_INTERLEAVE ? 0x8000008a00000001ULL : 0x8000000000008089ULL,
    BIT_INTERLEAVE ? 0x8000008100000001ULL : 0x8000000000008003ULL,
    BIT_INTERLEAVE ? 0x8000008100000000ULL : 0x8000000000008002ULL,
    BIT_INTERLEAVE ? 0x8000000800000000ULL : 0x8000000000000080ULL,
    BIT_INTERLEAVE ? 0x0000008300000000ULL : 0x000000000000800aULL,
    BIT_INTERLEAVE ? 0x8000800300000000ULL : 0x800000008000000aULL,
    BIT_INTERLEAVE ? 0x8000808800000001ULL : 0x8000000080008081ULL,
    BIT_INTERLEAVE ? 0x8000008800000000ULL : 0x8000000000008080ULL,
    BIT_INTERLEAVE ? 0x0000800000000001ULL : 0x0000000080000001ULL,
    BIT_INTERLEAVE ? 0x8000808200000000ULL : 0x8000000080008008ULL
};
115 
116 #if defined(KECCAK_REF)
117 /*
118  * This is straightforward or "maximum clarity" implementation aiming
119  * to resemble section 3.2 of the FIPS PUB 202 "SHA-3 Standard:
120  * Permutation-Based Hash and Extendible-Output Functions" as much as
121  * possible. With one caveat. Because of the way C stores matrices,
122  * references to A[x,y] in the specification are presented as A[y][x].
123  * Implementation unrolls inner x-loops so that modulo 5 operations are
124  * explicitly pre-computed.
125  */
/*
 * Theta step: XOR every lane with the combined parities of its two
 * neighbouring columns.  C[x] is the parity (XOR) of column x; D[x]
 * combines the parity of column x-1 with the one-bit-rotated parity
 * of column x+1 (indices mod 5).
 */
static void Theta(uint64_t A[5][5])
{
    uint64_t C[5], D[5];
    size_t x, y;

    for (x = 0; x < 5; x++)
        C[x] = A[0][x] ^ A[1][x] ^ A[2][x] ^ A[3][x] ^ A[4][x];

    for (x = 0; x < 5; x++)
        D[x] = ROL64(C[(x + 1) % 5], 1) ^ C[(x + 4) % 5];

    for (y = 0; y < 5; y++)
        for (x = 0; x < 5; x++)
            A[y][x] ^= D[x];
}
159 
/*
 * Rho step: rotate each lane left by its fixed, position-dependent
 * offset taken from the rhotates table.
 */
static void Rho(uint64_t A[5][5])
{
    size_t x, y;

    for (y = 0; y < 5; y++)
        for (x = 0; x < 5; x++)
            A[y][x] = ROL64(A[y][x], rhotates[y][x]);
}
172 
/*
 * Pi step: permute the lanes.  Working from a snapshot T of the state,
 * the mapping is A[y][x] = T[x][(3*y + x) % 5].  (Because of the way C
 * stores matrices, the specification's A[x,y] appears here as A[y][x],
 * hence the transposed-looking indexing.)
 */
static void Pi(uint64_t A[5][5])
{
    uint64_t T[5][5];
    size_t x, y;

    memcpy(T, A, sizeof(T));

    for (y = 0; y < 5; y++)
        for (x = 0; x < 5; x++)
            A[y][x] = T[x][(3 * y + x) % 5];
}
213 
Chi(uint64_t A[5][5])214 static void Chi(uint64_t A[5][5])
215 {
216     uint64_t C[5];
217     size_t y;
218 
219     for (y = 0; y < 5; y++) {
220         C[0] = A[y][0] ^ (~A[y][1] & A[y][2]);
221         C[1] = A[y][1] ^ (~A[y][2] & A[y][3]);
222         C[2] = A[y][2] ^ (~A[y][3] & A[y][4]);
223         C[3] = A[y][3] ^ (~A[y][4] & A[y][0]);
224         C[4] = A[y][4] ^ (~A[y][0] & A[y][1]);
225 
226         A[y][0] = C[0];
227         A[y][1] = C[1];
228         A[y][2] = C[2];
229         A[y][3] = C[3];
230         A[y][4] = C[4];
231     }
232 }
233 
/*
 * Iota step: break the symmetry between rounds by XOR-ing the round-|i|
 * constant into the origin lane A[0][0].
 */
static void Iota(uint64_t A[5][5], size_t i)
{
    assert(i < OSSL_NELEM(iotas));
    A[0][0] ^= iotas[i];
}
239 
/*
 * Keccak-f[1600] permutation: 24 rounds, each applying the five step
 * mappings in the order given by FIPS PUB 202.
 */
static void KeccakF1600(uint64_t A[5][5])
{
    size_t i;

    for (i = 0; i < 24; i++) {
        Theta(A);
        Rho(A);
        Pi(A);
        Chi(A);
        Iota(A, i);
    }
}
252 
253 #elif defined(KECCAK_1X)
254 /*
255  * This implementation is optimization of above code featuring unroll
256  * of even y-loops, their fusion and code motion. It also minimizes
257  * temporary storage. Compiler would normally do all these things for
258  * you, purpose of manual optimization is to provide "unobscured"
259  * reference for assembly implementation [in case this approach is
260  * chosen for implementation on some platform]. In the nutshell it's
261  * equivalent of "plane-per-plane processing" approach discussed in
262  * section 2.4 of "Keccak implementation overview".
263  */
/*
 * One Keccak-f[1600] round applied in place to A, processing the state
 * plane by plane.  C[] and E[] are meant to live in registers, D[] and
 * T[][] in memory; T holds the Theta-adjusted lanes of plane 0 (plus a
 * few borrowed lanes, flagged below) that later planes still need after
 * plane 0 has been overwritten.  Statement order is significant: lanes
 * are consumed just before the in-place Chi writes would clobber them.
 */
static void Round(uint64_t A[5][5], size_t i)
{
    uint64_t C[5], E[2];        /* registers */
    uint64_t D[5], T[2][5];     /* memory    */

    assert(i < OSSL_NELEM(iotas));

    /* Theta: column parities */
    C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0];
    C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1];
    C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2];
    C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3];
    C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4];

#if defined(__arm__)
    /* ARM ordering keeps D[0], D[2], D[3] in C[0..2] to save registers */
    D[1] = E[0] = ROL64(C[2], 1) ^ C[0];
    D[4] = E[1] = ROL64(C[0], 1) ^ C[3];
    D[0] = C[0] = ROL64(C[1], 1) ^ C[4];
    D[2] = C[1] = ROL64(C[3], 1) ^ C[1];
    D[3] = C[2] = ROL64(C[4], 1) ^ C[2];

    T[0][0] = A[3][0] ^ C[0]; /* borrow T[0][0] */
    T[0][1] = A[0][1] ^ E[0]; /* D[1] */
    T[0][2] = A[0][2] ^ C[1]; /* D[2] */
    T[0][3] = A[0][3] ^ C[2]; /* D[3] */
    T[0][4] = A[0][4] ^ E[1]; /* D[4] */

    C[3] = ROL64(A[3][3] ^ C[2], rhotates[3][3]);   /* D[3] */
    C[4] = ROL64(A[4][4] ^ E[1], rhotates[4][4]);   /* D[4] */
    C[0] =       A[0][0] ^ C[0]; /* rotate by 0 */  /* D[0] */
    C[2] = ROL64(A[2][2] ^ C[1], rhotates[2][2]);   /* D[2] */
    C[1] = ROL64(A[1][1] ^ E[0], rhotates[1][1]);   /* D[1] */
#else
    D[0] = ROL64(C[1], 1) ^ C[4];
    D[1] = ROL64(C[2], 1) ^ C[0];
    D[2] = ROL64(C[3], 1) ^ C[1];
    D[3] = ROL64(C[4], 1) ^ C[2];
    D[4] = ROL64(C[0], 1) ^ C[3];

    /* Save Theta-adjusted plane-0 lanes (and A[3][0]) before they are
     * overwritten by the Chi writes below. */
    T[0][0] = A[3][0] ^ D[0]; /* borrow T[0][0] */
    T[0][1] = A[0][1] ^ D[1];
    T[0][2] = A[0][2] ^ D[2];
    T[0][3] = A[0][3] ^ D[3];
    T[0][4] = A[0][4] ^ D[4];

    /* Rho + Pi for the lanes that land in output plane 0 */
    C[0] =       A[0][0] ^ D[0]; /* rotate by 0 */
    C[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]);
    C[2] = ROL64(A[2][2] ^ D[2], rhotates[2][2]);
    C[3] = ROL64(A[3][3] ^ D[3], rhotates[3][3]);
    C[4] = ROL64(A[4][4] ^ D[4], rhotates[4][4]);
#endif
    /* Chi + Iota for output plane 0 */
    A[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
    A[0][1] = C[1] ^ (~C[2] & C[3]);
    A[0][2] = C[2] ^ (~C[3] & C[4]);
    A[0][3] = C[3] ^ (~C[4] & C[0]);
    A[0][4] = C[4] ^ (~C[0] & C[1]);

    /* Stash more Theta-adjusted lanes; D values are copied into spare
     * registers (C[2..4], E[0..1]) as a side effect. */
    T[1][0] = A[1][0] ^ (C[3] = D[0]);
    T[1][1] = A[2][1] ^ (C[4] = D[1]); /* borrow T[1][1] */
    T[1][2] = A[1][2] ^ (E[0] = D[2]);
    T[1][3] = A[1][3] ^ (E[1] = D[3]);
    T[1][4] = A[2][4] ^ (C[2] = D[4]); /* borrow T[1][4] */

    /* Output plane 1 */
    C[0] = ROL64(T[0][3],        rhotates[0][3]);
    C[1] = ROL64(A[1][4] ^ C[2], rhotates[1][4]);   /* D[4] */
    C[2] = ROL64(A[2][0] ^ C[3], rhotates[2][0]);   /* D[0] */
    C[3] = ROL64(A[3][1] ^ C[4], rhotates[3][1]);   /* D[1] */
    C[4] = ROL64(A[4][2] ^ E[0], rhotates[4][2]);   /* D[2] */

    A[1][0] = C[0] ^ (~C[1] & C[2]);
    A[1][1] = C[1] ^ (~C[2] & C[3]);
    A[1][2] = C[2] ^ (~C[3] & C[4]);
    A[1][3] = C[3] ^ (~C[4] & C[0]);
    A[1][4] = C[4] ^ (~C[0] & C[1]);

    /* Output plane 2 */
    C[0] = ROL64(T[0][1],        rhotates[0][1]);
    C[1] = ROL64(T[1][2],        rhotates[1][2]);
    C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
    C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
    C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);

    A[2][0] = C[0] ^ (~C[1] & C[2]);
    A[2][1] = C[1] ^ (~C[2] & C[3]);
    A[2][2] = C[2] ^ (~C[3] & C[4]);
    A[2][3] = C[3] ^ (~C[4] & C[0]);
    A[2][4] = C[4] ^ (~C[0] & C[1]);

    /* Output plane 3 */
    C[0] = ROL64(T[0][4],        rhotates[0][4]);
    C[1] = ROL64(T[1][0],        rhotates[1][0]);
    C[2] = ROL64(T[1][1],        rhotates[2][1]); /* originally A[2][1] */
    C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
    C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);

    A[3][0] = C[0] ^ (~C[1] & C[2]);
    A[3][1] = C[1] ^ (~C[2] & C[3]);
    A[3][2] = C[2] ^ (~C[3] & C[4]);
    A[3][3] = C[3] ^ (~C[4] & C[0]);
    A[3][4] = C[4] ^ (~C[0] & C[1]);

    /* Output plane 4 */
    C[0] = ROL64(T[0][2],        rhotates[0][2]);
    C[1] = ROL64(T[1][3],        rhotates[1][3]);
    C[2] = ROL64(T[1][4],        rhotates[2][4]); /* originally A[2][4] */
    C[3] = ROL64(T[0][0],        rhotates[3][0]); /* originally A[3][0] */
    C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);

    A[4][0] = C[0] ^ (~C[1] & C[2]);
    A[4][1] = C[1] ^ (~C[2] & C[3]);
    A[4][2] = C[2] ^ (~C[3] & C[4]);
    A[4][3] = C[3] ^ (~C[4] & C[0]);
    A[4][4] = C[4] ^ (~C[0] & C[1]);
}
374 
/* Keccak-f[1600] permutation: 24 applications of the combined round. */
static void KeccakF1600(uint64_t A[5][5])
{
    size_t i;

    for (i = 0; i < 24; i++) {
        Round(A, i);
    }
}
383 
384 #elif defined(KECCAK_1X_ALT)
385 /*
386  * This is variant of above KECCAK_1X that reduces requirement for
387  * temporary storage even further, but at cost of more updates to A[][].
388  * It's less suitable if A[][] is memory bound, but better if it's
389  * register bound.
390  */
391 
/*
 * One Keccak-f[1600] round performed almost entirely in place, using
 * only C[] and D[] as temporaries.  Theta is applied as five columnwise
 * XOR sweeps, Rho and Pi as a chain of rotate-and-move lane updates,
 * Chi two rows at a time with a few saved lanes, then Iota.  Statement
 * order is significant throughout.
 */
static void Round(uint64_t A[5][5], size_t i)
{
    uint64_t C[5], D[5];

    assert(i < OSSL_NELEM(iotas));

    /* Theta: column parities */
    C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0];
    C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1];
    C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2];
    C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3];
    C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4];

    /* Note the ^=: after these lines C[2], C[3], C[4] hold D[3], D[4],
     * D[0] respectively, and are used as such below. */
    D[1] = C[0] ^  ROL64(C[2], 1);
    D[2] = C[1] ^  ROL64(C[3], 1);
    D[3] = C[2] ^= ROL64(C[4], 1);
    D[4] = C[3] ^= ROL64(C[0], 1);
    D[0] = C[4] ^= ROL64(C[1], 1);

    /* Theta: fold D into every column */
    A[0][1] ^= D[1];
    A[1][1] ^= D[1];
    A[2][1] ^= D[1];
    A[3][1] ^= D[1];
    A[4][1] ^= D[1];

    A[0][2] ^= D[2];
    A[1][2] ^= D[2];
    A[2][2] ^= D[2];
    A[3][2] ^= D[2];
    A[4][2] ^= D[2];

    A[0][3] ^= C[2];
    A[1][3] ^= C[2];
    A[2][3] ^= C[2];
    A[3][3] ^= C[2];
    A[4][3] ^= C[2];

    A[0][4] ^= C[3];
    A[1][4] ^= C[3];
    A[2][4] ^= C[3];
    A[3][4] ^= C[3];
    A[4][4] ^= C[3];

    A[0][0] ^= C[4];
    A[1][0] ^= C[4];
    A[2][0] ^= C[4];
    A[3][0] ^= C[4];
    A[4][0] ^= C[4];

    /* Rho + Pi: save row 0 lanes, then move-and-rotate lanes along the
     * Pi permutation cycle so each source is read before it is written. */
    C[1] = A[0][1];
    C[2] = A[0][2];
    C[3] = A[0][3];
    C[4] = A[0][4];

    A[0][1] = ROL64(A[1][1], rhotates[1][1]);
    A[0][2] = ROL64(A[2][2], rhotates[2][2]);
    A[0][3] = ROL64(A[3][3], rhotates[3][3]);
    A[0][4] = ROL64(A[4][4], rhotates[4][4]);

    A[1][1] = ROL64(A[1][4], rhotates[1][4]);
    A[2][2] = ROL64(A[2][3], rhotates[2][3]);
    A[3][3] = ROL64(A[3][2], rhotates[3][2]);
    A[4][4] = ROL64(A[4][1], rhotates[4][1]);

    A[1][4] = ROL64(A[4][2], rhotates[4][2]);
    A[2][3] = ROL64(A[3][4], rhotates[3][4]);
    A[3][2] = ROL64(A[2][1], rhotates[2][1]);
    A[4][1] = ROL64(A[1][3], rhotates[1][3]);

    A[4][2] = ROL64(A[2][4], rhotates[2][4]);
    A[3][4] = ROL64(A[4][3], rhotates[4][3]);
    A[2][1] = ROL64(A[1][2], rhotates[1][2]);
    A[1][3] = ROL64(A[3][1], rhotates[3][1]);

    A[2][4] = ROL64(A[4][0], rhotates[4][0]);
    A[4][3] = ROL64(A[3][0], rhotates[3][0]);
    A[1][2] = ROL64(A[2][0], rhotates[2][0]);
    A[3][1] = ROL64(A[1][0], rhotates[1][0]);

    A[1][0] = ROL64(C[3],    rhotates[0][3]);
    A[2][0] = ROL64(C[1],    rhotates[0][1]);
    A[3][0] = ROL64(C[4],    rhotates[0][4]);
    A[4][0] = ROL64(C[2],    rhotates[0][2]);

    /* Chi, rows 0 and 1: columns 0 and 1 are saved first because the
     * in-place updates of columns 3 and 4 still need their old values. */
    C[0] = A[0][0];
    C[1] = A[1][0];
    D[0] = A[0][1];
    D[1] = A[1][1];

    A[0][0] ^= (~A[0][1] & A[0][2]);
    A[1][0] ^= (~A[1][1] & A[1][2]);
    A[0][1] ^= (~A[0][2] & A[0][3]);
    A[1][1] ^= (~A[1][2] & A[1][3]);
    A[0][2] ^= (~A[0][3] & A[0][4]);
    A[1][2] ^= (~A[1][3] & A[1][4]);
    A[0][3] ^= (~A[0][4] & C[0]);
    A[1][3] ^= (~A[1][4] & C[1]);
    A[0][4] ^= (~C[0]    & D[0]);
    A[1][4] ^= (~C[1]    & D[1]);

    /* Chi, rows 2 and 3 */
    C[2] = A[2][0];
    C[3] = A[3][0];
    D[2] = A[2][1];
    D[3] = A[3][1];

    A[2][0] ^= (~A[2][1] & A[2][2]);
    A[3][0] ^= (~A[3][1] & A[3][2]);
    A[2][1] ^= (~A[2][2] & A[2][3]);
    A[3][1] ^= (~A[3][2] & A[3][3]);
    A[2][2] ^= (~A[2][3] & A[2][4]);
    A[3][2] ^= (~A[3][3] & A[3][4]);
    A[2][3] ^= (~A[2][4] & C[2]);
    A[3][3] ^= (~A[3][4] & C[3]);
    A[2][4] ^= (~C[2]    & D[2]);
    A[3][4] ^= (~C[3]    & D[3]);

    /* Chi, row 4, then Iota */
    C[4] = A[4][0];
    D[4] = A[4][1];

    A[4][0] ^= (~A[4][1] & A[4][2]);
    A[4][1] ^= (~A[4][2] & A[4][3]);
    A[4][2] ^= (~A[4][3] & A[4][4]);
    A[4][3] ^= (~A[4][4] & C[4]);
    A[4][4] ^= (~C[4]    & D[4]);
    A[0][0] ^= iotas[i];
}
517 
/* Keccak-f[1600] permutation: 24 applications of the combined round. */
static void KeccakF1600(uint64_t A[5][5])
{
    size_t i;

    for (i = 0; i < 24; i++) {
        Round(A, i);
    }
}
526 
527 #elif defined(KECCAK_2X)
528 /*
529  * This implementation is variant of KECCAK_1X above with outer-most
530  * round loop unrolled twice. This allows to take temporary storage
531  * out of round procedure and simplify references to it by alternating
532  * it with actual data (see round loop below). Originally it was meant
533  * rather as reference for an assembly implementation, but it seems to
534  * play best with compilers [as well as provide best instruction per
535  * processed byte ratio at minimal round unroll factor]...
536  */
/*
 * One Keccak-f[1600] round, reading the state from A and writing the
 * result to R (out of place, so only C[] and D[] temporaries are
 * needed; the caller alternates the two arrays between rounds).
 *
 * When KECCAK_COMPLEMENTING_TRANSFORM is defined, six lanes are kept
 * bitwise-complemented throughout the permutation (see KeccakF1600
 * below); the Chi formulas here are the correspondingly transformed
 * ones, which need fewer NOT operations on CPUs lacking an
 * and-with-complement instruction.
 */
static void Round(uint64_t R[5][5], uint64_t A[5][5], size_t i)
{
    uint64_t C[5], D[5];

    assert(i < OSSL_NELEM(iotas));

    /* Theta: column parities */
    C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0];
    C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1];
    C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2];
    C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3];
    C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4];

    D[0] = ROL64(C[1], 1) ^ C[4];
    D[1] = ROL64(C[2], 1) ^ C[0];
    D[2] = ROL64(C[3], 1) ^ C[1];
    D[3] = ROL64(C[4], 1) ^ C[2];
    D[4] = ROL64(C[0], 1) ^ C[3];

    /* Rho + Pi for output plane 0, then Chi (+ Iota) */
    C[0] =       A[0][0] ^ D[0]; /* rotate by 0 */
    C[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]);
    C[2] = ROL64(A[2][2] ^ D[2], rhotates[2][2]);
    C[3] = ROL64(A[3][3] ^ D[3], rhotates[3][3]);
    C[4] = ROL64(A[4][4] ^ D[4], rhotates[4][4]);

#ifdef KECCAK_COMPLEMENTING_TRANSFORM
    R[0][0] = C[0] ^ ( C[1] | C[2]) ^ iotas[i];
    R[0][1] = C[1] ^ (~C[2] | C[3]);
    R[0][2] = C[2] ^ ( C[3] & C[4]);
    R[0][3] = C[3] ^ ( C[4] | C[0]);
    R[0][4] = C[4] ^ ( C[0] & C[1]);
#else
    R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
    R[0][1] = C[1] ^ (~C[2] & C[3]);
    R[0][2] = C[2] ^ (~C[3] & C[4]);
    R[0][3] = C[3] ^ (~C[4] & C[0]);
    R[0][4] = C[4] ^ (~C[0] & C[1]);
#endif

    /* Output plane 1 */
    C[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);
    C[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
    C[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
    C[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
    C[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);

#ifdef KECCAK_COMPLEMENTING_TRANSFORM
    R[1][0] = C[0] ^ (C[1] |  C[2]);
    R[1][1] = C[1] ^ (C[2] &  C[3]);
    R[1][2] = C[2] ^ (C[3] | ~C[4]);
    R[1][3] = C[3] ^ (C[4] |  C[0]);
    R[1][4] = C[4] ^ (C[0] &  C[1]);
#else
    R[1][0] = C[0] ^ (~C[1] & C[2]);
    R[1][1] = C[1] ^ (~C[2] & C[3]);
    R[1][2] = C[2] ^ (~C[3] & C[4]);
    R[1][3] = C[3] ^ (~C[4] & C[0]);
    R[1][4] = C[4] ^ (~C[0] & C[1]);
#endif

    /* Output plane 2 */
    C[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]);
    C[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]);
    C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
    C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
    C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);

#ifdef KECCAK_COMPLEMENTING_TRANSFORM
    R[2][0] =  C[0] ^ ( C[1] | C[2]);
    R[2][1] =  C[1] ^ ( C[2] & C[3]);
    R[2][2] =  C[2] ^ (~C[3] & C[4]);
    R[2][3] = ~C[3] ^ ( C[4] | C[0]);
    R[2][4] =  C[4] ^ ( C[0] & C[1]);
#else
    R[2][0] = C[0] ^ (~C[1] & C[2]);
    R[2][1] = C[1] ^ (~C[2] & C[3]);
    R[2][2] = C[2] ^ (~C[3] & C[4]);
    R[2][3] = C[3] ^ (~C[4] & C[0]);
    R[2][4] = C[4] ^ (~C[0] & C[1]);
#endif

    /* Output plane 3 */
    C[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]);
    C[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]);
    C[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]);
    C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
    C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);

#ifdef KECCAK_COMPLEMENTING_TRANSFORM
    R[3][0] =  C[0] ^ ( C[1] & C[2]);
    R[3][1] =  C[1] ^ ( C[2] | C[3]);
    R[3][2] =  C[2] ^ (~C[3] | C[4]);
    R[3][3] = ~C[3] ^ ( C[4] & C[0]);
    R[3][4] =  C[4] ^ ( C[0] | C[1]);
#else
    R[3][0] = C[0] ^ (~C[1] & C[2]);
    R[3][1] = C[1] ^ (~C[2] & C[3]);
    R[3][2] = C[2] ^ (~C[3] & C[4]);
    R[3][3] = C[3] ^ (~C[4] & C[0]);
    R[3][4] = C[4] ^ (~C[0] & C[1]);
#endif

    /* Output plane 4 */
    C[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]);
    C[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]);
    C[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]);
    C[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]);
    C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);

#ifdef KECCAK_COMPLEMENTING_TRANSFORM
    R[4][0] =  C[0] ^ (~C[1] & C[2]);
    R[4][1] = ~C[1] ^ ( C[2] | C[3]);
    R[4][2] =  C[2] ^ ( C[3] & C[4]);
    R[4][3] =  C[3] ^ ( C[4] | C[0]);
    R[4][4] =  C[4] ^ ( C[0] & C[1]);
#else
    R[4][0] = C[0] ^ (~C[1] & C[2]);
    R[4][1] = C[1] ^ (~C[2] & C[3]);
    R[4][2] = C[2] ^ (~C[3] & C[4]);
    R[4][3] = C[3] ^ (~C[4] & C[0]);
    R[4][4] = C[4] ^ (~C[0] & C[1]);
#endif
}
655 
/*
 * Keccak-f[1600] permutation for the KECCAK_2X variant.  The round loop
 * is unrolled twice, ping-ponging between the caller's state A and the
 * temporary T so that Round() never writes the array it reads.
 */
static void KeccakF1600(uint64_t A[5][5])
{
    uint64_t T[5][5];
    size_t i;

#ifdef KECCAK_COMPLEMENTING_TRANSFORM
    /*
     * Enter the complemented representation: these six lanes are stored
     * bitwise-inverted for the duration of the permutation, allowing
     * Round() to express Chi with fewer NOT operations.
     */
    A[0][1] = ~A[0][1];
    A[0][2] = ~A[0][2];
    A[1][3] = ~A[1][3];
    A[2][2] = ~A[2][2];
    A[3][2] = ~A[3][2];
    A[4][0] = ~A[4][0];
#endif

    for (i = 0; i < 24; i += 2) {
        Round(T, A, i);
        Round(A, T, i + 1);
    }

#ifdef KECCAK_COMPLEMENTING_TRANSFORM
    /* Restore the plain representation before returning. */
    A[0][1] = ~A[0][1];
    A[0][2] = ~A[0][2];
    A[1][3] = ~A[1][3];
    A[2][2] = ~A[2][2];
    A[3][2] = ~A[3][2];
    A[4][0] = ~A[4][0];
#endif
}
684 
685 #else   /* define KECCAK_INPLACE to compile this code path */
686 /*
687  * This implementation is KECCAK_1X from above combined 4 times with
688  * a twist that allows to omit temporary storage and perform in-place
689  * processing. It's discussed in section 2.5 of "Keccak implementation
690  * overview". It's likely to be best suited for processors with large
691  * register bank... On the other hand processor with large register
692  * bank can as well use KECCAK_1X_ALT, it would be as fast but much
693  * more compact...
694  */
FourRounds(uint64_t A[5][5],size_t i)695 static void FourRounds(uint64_t A[5][5], size_t i)
696 {
697     uint64_t B[5], C[5], D[5];
698 
699     assert(i <= OSSL_NELEM(iotas) - 4);
700 
701     /* Round 4*n */
702     C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0];
703     C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1];
704     C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2];
705     C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3];
706     C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4];
707 
708     D[0] = ROL64(C[1], 1) ^ C[4];
709     D[1] = ROL64(C[2], 1) ^ C[0];
710     D[2] = ROL64(C[3], 1) ^ C[1];
711     D[3] = ROL64(C[4], 1) ^ C[2];
712     D[4] = ROL64(C[0], 1) ^ C[3];
713 
714     B[0] =       A[0][0] ^ D[0]; /* rotate by 0 */
715     B[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]);
716     B[2] = ROL64(A[2][2] ^ D[2], rhotates[2][2]);
717     B[3] = ROL64(A[3][3] ^ D[3], rhotates[3][3]);
718     B[4] = ROL64(A[4][4] ^ D[4], rhotates[4][4]);
719 
720     C[0] = A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i];
721     C[1] = A[1][1] = B[1] ^ (~B[2] & B[3]);
722     C[2] = A[2][2] = B[2] ^ (~B[3] & B[4]);
723     C[3] = A[3][3] = B[3] ^ (~B[4] & B[0]);
724     C[4] = A[4][4] = B[4] ^ (~B[0] & B[1]);
725 
726     B[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);
727     B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
728     B[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
729     B[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
730     B[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);
731 
732     C[0] ^= A[2][0] = B[0] ^ (~B[1] & B[2]);
733     C[1] ^= A[3][1] = B[1] ^ (~B[2] & B[3]);
734     C[2] ^= A[4][2] = B[2] ^ (~B[3] & B[4]);
735     C[3] ^= A[0][3] = B[3] ^ (~B[4] & B[0]);
736     C[4] ^= A[1][4] = B[4] ^ (~B[0] & B[1]);
737 
738     B[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]);
739     B[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]);
740     B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
741     B[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
742     B[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
743 
744     C[0] ^= A[4][0] = B[0] ^ (~B[1] & B[2]);
745     C[1] ^= A[0][1] = B[1] ^ (~B[2] & B[3]);
746     C[2] ^= A[1][2] = B[2] ^ (~B[3] & B[4]);
747     C[3] ^= A[2][3] = B[3] ^ (~B[4] & B[0]);
748     C[4] ^= A[3][4] = B[4] ^ (~B[0] & B[1]);
749 
750     B[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]);
751     B[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]);
752     B[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]);
753     B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
754     B[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
755 
756     C[0] ^= A[1][0] = B[0] ^ (~B[1] & B[2]);
757     C[1] ^= A[2][1] = B[1] ^ (~B[2] & B[3]);
758     C[2] ^= A[3][2] = B[2] ^ (~B[3] & B[4]);
759     C[3] ^= A[4][3] = B[3] ^ (~B[4] & B[0]);
760     C[4] ^= A[0][4] = B[4] ^ (~B[0] & B[1]);
761 
762     B[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]);
763     B[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]);
764     B[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]);
765     B[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]);
766     B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
767 
768     C[0] ^= A[3][0] = B[0] ^ (~B[1] & B[2]);
769     C[1] ^= A[4][1] = B[1] ^ (~B[2] & B[3]);
770     C[2] ^= A[0][2] = B[2] ^ (~B[3] & B[4]);
771     C[3] ^= A[1][3] = B[3] ^ (~B[4] & B[0]);
772     C[4] ^= A[2][4] = B[4] ^ (~B[0] & B[1]);
773 
774     /* Round 4*n+1 */
775     D[0] = ROL64(C[1], 1) ^ C[4];
776     D[1] = ROL64(C[2], 1) ^ C[0];
777     D[2] = ROL64(C[3], 1) ^ C[1];
778     D[3] = ROL64(C[4], 1) ^ C[2];
779     D[4] = ROL64(C[0], 1) ^ C[3];
780 
781     B[0] =       A[0][0] ^ D[0]; /* rotate by 0 */
782     B[1] = ROL64(A[3][1] ^ D[1], rhotates[1][1]);
783     B[2] = ROL64(A[1][2] ^ D[2], rhotates[2][2]);
784     B[3] = ROL64(A[4][3] ^ D[3], rhotates[3][3]);
785     B[4] = ROL64(A[2][4] ^ D[4], rhotates[4][4]);
786 
787     C[0] = A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i + 1];
788     C[1] = A[3][1] = B[1] ^ (~B[2] & B[3]);
789     C[2] = A[1][2] = B[2] ^ (~B[3] & B[4]);
790     C[3] = A[4][3] = B[3] ^ (~B[4] & B[0]);
791     C[4] = A[2][4] = B[4] ^ (~B[0] & B[1]);
792 
793     B[0] = ROL64(A[3][3] ^ D[3], rhotates[0][3]);
794     B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
795     B[2] = ROL64(A[4][0] ^ D[0], rhotates[2][0]);
796     B[3] = ROL64(A[2][1] ^ D[1], rhotates[3][1]);
797     B[4] = ROL64(A[0][2] ^ D[2], rhotates[4][2]);
798 
799     C[0] ^= A[4][0] = B[0] ^ (~B[1] & B[2]);
800     C[1] ^= A[2][1] = B[1] ^ (~B[2] & B[3]);
801     C[2] ^= A[0][2] = B[2] ^ (~B[3] & B[4]);
802     C[3] ^= A[3][3] = B[3] ^ (~B[4] & B[0]);
803     C[4] ^= A[1][4] = B[4] ^ (~B[0] & B[1]);
804 
805     B[0] = ROL64(A[1][1] ^ D[1], rhotates[0][1]);
806     B[1] = ROL64(A[4][2] ^ D[2], rhotates[1][2]);
807     B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
808     B[3] = ROL64(A[0][4] ^ D[4], rhotates[3][4]);
809     B[4] = ROL64(A[3][0] ^ D[0], rhotates[4][0]);
810 
811     C[0] ^= A[3][0] = B[0] ^ (~B[1] & B[2]);
812     C[1] ^= A[1][1] = B[1] ^ (~B[2] & B[3]);
813     C[2] ^= A[4][2] = B[2] ^ (~B[3] & B[4]);
814     C[3] ^= A[2][3] = B[3] ^ (~B[4] & B[0]);
815     C[4] ^= A[0][4] = B[4] ^ (~B[0] & B[1]);
816 
817     B[0] = ROL64(A[4][4] ^ D[4], rhotates[0][4]);
818     B[1] = ROL64(A[2][0] ^ D[0], rhotates[1][0]);
819     B[2] = ROL64(A[0][1] ^ D[1], rhotates[2][1]);
820     B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
821     B[4] = ROL64(A[1][3] ^ D[3], rhotates[4][3]);
822 
823     C[0] ^= A[2][0] = B[0] ^ (~B[1] & B[2]);
824     C[1] ^= A[0][1] = B[1] ^ (~B[2] & B[3]);
825     C[2] ^= A[3][2] = B[2] ^ (~B[3] & B[4]);
826     C[3] ^= A[1][3] = B[3] ^ (~B[4] & B[0]);
827     C[4] ^= A[4][4] = B[4] ^ (~B[0] & B[1]);
828 
829     B[0] = ROL64(A[2][2] ^ D[2], rhotates[0][2]);
830     B[1] = ROL64(A[0][3] ^ D[3], rhotates[1][3]);
831     B[2] = ROL64(A[3][4] ^ D[4], rhotates[2][4]);
832     B[3] = ROL64(A[1][0] ^ D[0], rhotates[3][0]);
833     B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
834 
835     C[0] ^= A[1][0] = B[0] ^ (~B[1] & B[2]);
836     C[1] ^= A[4][1] = B[1] ^ (~B[2] & B[3]);
837     C[2] ^= A[2][2] = B[2] ^ (~B[3] & B[4]);
838     C[3] ^= A[0][3] = B[3] ^ (~B[4] & B[0]);
839     C[4] ^= A[3][4] = B[4] ^ (~B[0] & B[1]);
840 
841     /* Round 4*n+2 */
842     D[0] = ROL64(C[1], 1) ^ C[4];
843     D[1] = ROL64(C[2], 1) ^ C[0];
844     D[2] = ROL64(C[3], 1) ^ C[1];
845     D[3] = ROL64(C[4], 1) ^ C[2];
846     D[4] = ROL64(C[0], 1) ^ C[3];
847 
848     B[0] =       A[0][0] ^ D[0]; /* rotate by 0 */
849     B[1] = ROL64(A[2][1] ^ D[1], rhotates[1][1]);
850     B[2] = ROL64(A[4][2] ^ D[2], rhotates[2][2]);
851     B[3] = ROL64(A[1][3] ^ D[3], rhotates[3][3]);
852     B[4] = ROL64(A[3][4] ^ D[4], rhotates[4][4]);
853 
854     C[0] = A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i + 2];
855     C[1] = A[2][1] = B[1] ^ (~B[2] & B[3]);
856     C[2] = A[4][2] = B[2] ^ (~B[3] & B[4]);
857     C[3] = A[1][3] = B[3] ^ (~B[4] & B[0]);
858     C[4] = A[3][4] = B[4] ^ (~B[0] & B[1]);
859 
860     B[0] = ROL64(A[4][3] ^ D[3], rhotates[0][3]);
861     B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
862     B[2] = ROL64(A[3][0] ^ D[0], rhotates[2][0]);
863     B[3] = ROL64(A[0][1] ^ D[1], rhotates[3][1]);
864     B[4] = ROL64(A[2][2] ^ D[2], rhotates[4][2]);
865 
866     C[0] ^= A[3][0] = B[0] ^ (~B[1] & B[2]);
867     C[1] ^= A[0][1] = B[1] ^ (~B[2] & B[3]);
868     C[2] ^= A[2][2] = B[2] ^ (~B[3] & B[4]);
869     C[3] ^= A[4][3] = B[3] ^ (~B[4] & B[0]);
870     C[4] ^= A[1][4] = B[4] ^ (~B[0] & B[1]);
871 
872     B[0] = ROL64(A[3][1] ^ D[1], rhotates[0][1]);
873     B[1] = ROL64(A[0][2] ^ D[2], rhotates[1][2]);
874     B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
875     B[3] = ROL64(A[4][4] ^ D[4], rhotates[3][4]);
876     B[4] = ROL64(A[1][0] ^ D[0], rhotates[4][0]);
877 
878     C[0] ^= A[1][0] = B[0] ^ (~B[1] & B[2]);
879     C[1] ^= A[3][1] = B[1] ^ (~B[2] & B[3]);
880     C[2] ^= A[0][2] = B[2] ^ (~B[3] & B[4]);
881     C[3] ^= A[2][3] = B[3] ^ (~B[4] & B[0]);
882     C[4] ^= A[4][4] = B[4] ^ (~B[0] & B[1]);
883 
884     B[0] = ROL64(A[2][4] ^ D[4], rhotates[0][4]);
885     B[1] = ROL64(A[4][0] ^ D[0], rhotates[1][0]);
886     B[2] = ROL64(A[1][1] ^ D[1], rhotates[2][1]);
887     B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
888     B[4] = ROL64(A[0][3] ^ D[3], rhotates[4][3]);
889 
890     C[0] ^= A[4][0] = B[0] ^ (~B[1] & B[2]);
891     C[1] ^= A[1][1] = B[1] ^ (~B[2] & B[3]);
892     C[2] ^= A[3][2] = B[2] ^ (~B[3] & B[4]);
893     C[3] ^= A[0][3] = B[3] ^ (~B[4] & B[0]);
894     C[4] ^= A[2][4] = B[4] ^ (~B[0] & B[1]);
895 
896     B[0] = ROL64(A[1][2] ^ D[2], rhotates[0][2]);
897     B[1] = ROL64(A[3][3] ^ D[3], rhotates[1][3]);
898     B[2] = ROL64(A[0][4] ^ D[4], rhotates[2][4]);
899     B[3] = ROL64(A[2][0] ^ D[0], rhotates[3][0]);
900     B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
901 
902     C[0] ^= A[2][0] = B[0] ^ (~B[1] & B[2]);
903     C[1] ^= A[4][1] = B[1] ^ (~B[2] & B[3]);
904     C[2] ^= A[1][2] = B[2] ^ (~B[3] & B[4]);
905     C[3] ^= A[3][3] = B[3] ^ (~B[4] & B[0]);
906     C[4] ^= A[0][4] = B[4] ^ (~B[0] & B[1]);
907 
908     /* Round 4*n+3 */
909     D[0] = ROL64(C[1], 1) ^ C[4];
910     D[1] = ROL64(C[2], 1) ^ C[0];
911     D[2] = ROL64(C[3], 1) ^ C[1];
912     D[3] = ROL64(C[4], 1) ^ C[2];
913     D[4] = ROL64(C[0], 1) ^ C[3];
914 
915     B[0] =       A[0][0] ^ D[0]; /* rotate by 0 */
916     B[1] = ROL64(A[0][1] ^ D[1], rhotates[1][1]);
917     B[2] = ROL64(A[0][2] ^ D[2], rhotates[2][2]);
918     B[3] = ROL64(A[0][3] ^ D[3], rhotates[3][3]);
919     B[4] = ROL64(A[0][4] ^ D[4], rhotates[4][4]);
920 
921     /* C[0] = */ A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i + 3];
922     /* C[1] = */ A[0][1] = B[1] ^ (~B[2] & B[3]);
923     /* C[2] = */ A[0][2] = B[2] ^ (~B[3] & B[4]);
924     /* C[3] = */ A[0][3] = B[3] ^ (~B[4] & B[0]);
925     /* C[4] = */ A[0][4] = B[4] ^ (~B[0] & B[1]);
926 
927     B[0] = ROL64(A[1][3] ^ D[3], rhotates[0][3]);
928     B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
929     B[2] = ROL64(A[1][0] ^ D[0], rhotates[2][0]);
930     B[3] = ROL64(A[1][1] ^ D[1], rhotates[3][1]);
931     B[4] = ROL64(A[1][2] ^ D[2], rhotates[4][2]);
932 
933     /* C[0] ^= */ A[1][0] = B[0] ^ (~B[1] & B[2]);
934     /* C[1] ^= */ A[1][1] = B[1] ^ (~B[2] & B[3]);
935     /* C[2] ^= */ A[1][2] = B[2] ^ (~B[3] & B[4]);
936     /* C[3] ^= */ A[1][3] = B[3] ^ (~B[4] & B[0]);
937     /* C[4] ^= */ A[1][4] = B[4] ^ (~B[0] & B[1]);
938 
939     B[0] = ROL64(A[2][1] ^ D[1], rhotates[0][1]);
940     B[1] = ROL64(A[2][2] ^ D[2], rhotates[1][2]);
941     B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
942     B[3] = ROL64(A[2][4] ^ D[4], rhotates[3][4]);
943     B[4] = ROL64(A[2][0] ^ D[0], rhotates[4][0]);
944 
945     /* C[0] ^= */ A[2][0] = B[0] ^ (~B[1] & B[2]);
946     /* C[1] ^= */ A[2][1] = B[1] ^ (~B[2] & B[3]);
947     /* C[2] ^= */ A[2][2] = B[2] ^ (~B[3] & B[4]);
948     /* C[3] ^= */ A[2][3] = B[3] ^ (~B[4] & B[0]);
949     /* C[4] ^= */ A[2][4] = B[4] ^ (~B[0] & B[1]);
950 
951     B[0] = ROL64(A[3][4] ^ D[4], rhotates[0][4]);
952     B[1] = ROL64(A[3][0] ^ D[0], rhotates[1][0]);
953     B[2] = ROL64(A[3][1] ^ D[1], rhotates[2][1]);
954     B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
955     B[4] = ROL64(A[3][3] ^ D[3], rhotates[4][3]);
956 
957     /* C[0] ^= */ A[3][0] = B[0] ^ (~B[1] & B[2]);
958     /* C[1] ^= */ A[3][1] = B[1] ^ (~B[2] & B[3]);
959     /* C[2] ^= */ A[3][2] = B[2] ^ (~B[3] & B[4]);
960     /* C[3] ^= */ A[3][3] = B[3] ^ (~B[4] & B[0]);
961     /* C[4] ^= */ A[3][4] = B[4] ^ (~B[0] & B[1]);
962 
963     B[0] = ROL64(A[4][2] ^ D[2], rhotates[0][2]);
964     B[1] = ROL64(A[4][3] ^ D[3], rhotates[1][3]);
965     B[2] = ROL64(A[4][4] ^ D[4], rhotates[2][4]);
966     B[3] = ROL64(A[4][0] ^ D[0], rhotates[3][0]);
967     B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
968 
969     /* C[0] ^= */ A[4][0] = B[0] ^ (~B[1] & B[2]);
970     /* C[1] ^= */ A[4][1] = B[1] ^ (~B[2] & B[3]);
971     /* C[2] ^= */ A[4][2] = B[2] ^ (~B[3] & B[4]);
972     /* C[3] ^= */ A[4][3] = B[3] ^ (~B[4] & B[0]);
973     /* C[4] ^= */ A[4][4] = B[4] ^ (~B[0] & B[1]);
974 }
975 
/*
 * Apply the full Keccak-f[1600] permutation (all 24 rounds) to the
 * state |A|, delegating to FourRounds in groups of four.
 */
static void KeccakF1600(uint64_t A[5][5])
{
    size_t round;

    for (round = 0; round < 24; round += 4)
        FourRounds(A, round);
}
984 
985 #endif
986 
BitInterleave(uint64_t Ai)987 static uint64_t BitInterleave(uint64_t Ai)
988 {
989     if (BIT_INTERLEAVE) {
990         uint32_t hi = (uint32_t)(Ai >> 32), lo = (uint32_t)Ai;
991         uint32_t t0, t1;
992 
993         t0 = lo & 0x55555555;
994         t0 |= t0 >> 1;  t0 &= 0x33333333;
995         t0 |= t0 >> 2;  t0 &= 0x0f0f0f0f;
996         t0 |= t0 >> 4;  t0 &= 0x00ff00ff;
997         t0 |= t0 >> 8;  t0 &= 0x0000ffff;
998 
999         t1 = hi & 0x55555555;
1000         t1 |= t1 >> 1;  t1 &= 0x33333333;
1001         t1 |= t1 >> 2;  t1 &= 0x0f0f0f0f;
1002         t1 |= t1 >> 4;  t1 &= 0x00ff00ff;
1003         t1 |= t1 >> 8;  t1 <<= 16;
1004 
1005         lo &= 0xaaaaaaaa;
1006         lo |= lo << 1;  lo &= 0xcccccccc;
1007         lo |= lo << 2;  lo &= 0xf0f0f0f0;
1008         lo |= lo << 4;  lo &= 0xff00ff00;
1009         lo |= lo << 8;  lo >>= 16;
1010 
1011         hi &= 0xaaaaaaaa;
1012         hi |= hi << 1;  hi &= 0xcccccccc;
1013         hi |= hi << 2;  hi &= 0xf0f0f0f0;
1014         hi |= hi << 4;  hi &= 0xff00ff00;
1015         hi |= hi << 8;  hi &= 0xffff0000;
1016 
1017         Ai = ((uint64_t)(hi | lo) << 32) | (t1 | t0);
1018     }
1019 
1020     return Ai;
1021 }
1022 
BitDeinterleave(uint64_t Ai)1023 static uint64_t BitDeinterleave(uint64_t Ai)
1024 {
1025     if (BIT_INTERLEAVE) {
1026         uint32_t hi = (uint32_t)(Ai >> 32), lo = (uint32_t)Ai;
1027         uint32_t t0, t1;
1028 
1029         t0 = lo & 0x0000ffff;
1030         t0 |= t0 << 8;  t0 &= 0x00ff00ff;
1031         t0 |= t0 << 4;  t0 &= 0x0f0f0f0f;
1032         t0 |= t0 << 2;  t0 &= 0x33333333;
1033         t0 |= t0 << 1;  t0 &= 0x55555555;
1034 
1035         t1 = hi << 16;
1036         t1 |= t1 >> 8;  t1 &= 0xff00ff00;
1037         t1 |= t1 >> 4;  t1 &= 0xf0f0f0f0;
1038         t1 |= t1 >> 2;  t1 &= 0xcccccccc;
1039         t1 |= t1 >> 1;  t1 &= 0xaaaaaaaa;
1040 
1041         lo >>= 16;
1042         lo |= lo << 8;  lo &= 0x00ff00ff;
1043         lo |= lo << 4;  lo &= 0x0f0f0f0f;
1044         lo |= lo << 2;  lo &= 0x33333333;
1045         lo |= lo << 1;  lo &= 0x55555555;
1046 
1047         hi &= 0xffff0000;
1048         hi |= hi >> 8;  hi &= 0xff00ff00;
1049         hi |= hi >> 4;  hi &= 0xf0f0f0f0;
1050         hi |= hi >> 2;  hi &= 0xcccccccc;
1051         hi |= hi >> 1;  hi &= 0xaaaaaaaa;
1052 
1053         Ai = ((uint64_t)(hi | lo) << 32) | (t1 | t0);
1054     }
1055 
1056     return Ai;
1057 }
1058 
/*
 * SHA3_absorb can be called multiple times; at each invocation the
 * largest multiple of |r| out of |len| bytes is processed and the
 * number of leftover bytes is returned, sparing the caller the trouble
 * of computing that multiple.  |r| can be viewed as the block size; it
 * is commonly (1600 - 256*n)/8, e.g. 168, 136, 104, 72, but can also
 * be (1600 - 448)/8 = 144.  Message padding and intermediate sub-block
 * buffering, byte- or bitwise, is entirely the caller's responsibility.
 */
size_t SHA3_absorb(uint64_t A[5][5], const unsigned char *inp, size_t len,
                   size_t r)
{
    uint64_t *lanes = (uint64_t *)A;
    size_t n, w = r / 8;                /* lanes per block */

    assert(r < (25 * sizeof(A[0][0])) && (r % 8) == 0);

    while (len >= r) {
        /* XOR one whole block into the state, least-significant byte first */
        for (n = 0; n < w; n++, inp += 8) {
            uint64_t lane = (uint64_t)inp[0]       | (uint64_t)inp[1] << 8  |
                            (uint64_t)inp[2] << 16 | (uint64_t)inp[3] << 24 |
                            (uint64_t)inp[4] << 32 | (uint64_t)inp[5] << 40 |
                            (uint64_t)inp[6] << 48 | (uint64_t)inp[7] << 56;

            lanes[n] ^= BitInterleave(lane);
        }
        KeccakF1600(A);
        len -= r;
    }

    return len;
}
1093 
/*
 * SHA3_squeeze may be called after SHA3_absorb to generate |out|, a hash
 * value of |len| bytes.
 * If multiple SHA3_squeeze calls are required, the output length |len|
 * must be a multiple of the block size, with |next| being 0 on the first
 * call and 1 on subsequent calls.  It is the caller's responsibility to
 * buffer the results.  When only a single SHA3_squeeze call is required,
 * |len| can be any size and |next| must be 0.
 */
void SHA3_squeeze(uint64_t A[5][5], unsigned char *out, size_t len, size_t r,
                  int next)
{
    uint64_t *lanes = (uint64_t *)A;
    size_t n, w = r / 8;                /* lanes per block */

    assert(r < (25 * sizeof(A[0][0])) && (r % 8) == 0);

    while (len != 0) {
        /* permute before every block except the very first squeeze */
        if (next)
            KeccakF1600(A);
        next = 1;

        for (n = 0; n < w && len != 0; n++) {
            uint64_t lane = BitDeinterleave(lanes[n]);

            if (len < 8) {
                /* partial final lane: emit the remaining bytes, LSB first */
                size_t b;

                for (b = 0; b < len; b++) {
                    *out++ = (unsigned char)lane;
                    lane >>= 8;
                }
                return;
            }

            out[0] = (unsigned char)(lane);
            out[1] = (unsigned char)(lane >> 8);
            out[2] = (unsigned char)(lane >> 16);
            out[3] = (unsigned char)(lane >> 24);
            out[4] = (unsigned char)(lane >> 32);
            out[5] = (unsigned char)(lane >> 40);
            out[6] = (unsigned char)(lane >> 48);
            out[7] = (unsigned char)(lane >> 56);
            out += 8;
            len -= 8;
        }
    }
}
1139 #endif
1140 
1141 #ifdef SELFTEST
/*
 * Post-padding one-shot implementations would look as follows:
 *
 * SHA3_224     SHA3_sponge(inp, len, out, 224/8, (1600-448)/8);
 * SHA3_256     SHA3_sponge(inp, len, out, 256/8, (1600-512)/8);
 * SHA3_384     SHA3_sponge(inp, len, out, 384/8, (1600-768)/8);
 * SHA3_512     SHA3_sponge(inp, len, out, 512/8, (1600-1024)/8);
 * SHAKE_128    SHA3_sponge(inp, len, out, d, (1600-256)/8);
 * SHAKE_256    SHA3_sponge(inp, len, out, d, (1600-512)/8);
 */
1152 
/*
 * One-shot sponge: absorb |len| bytes of pre-padded input from |inp|
 * and squeeze |d| bytes of digest into |out|, with block size |r|.
 * |len| must already include padding (see the SELFTEST main below).
 */
void SHA3_sponge(const unsigned char *inp, size_t len,
                 unsigned char *out, size_t d, size_t r)
{
    uint64_t A[5][5];

    memset(A, 0, sizeof(A));
    SHA3_absorb(A, inp, len, r);
    /*
     * Single squeeze call, so |next| is 0.  The previous four-argument
     * call did not match the five-argument SHA3_squeeze prototype and
     * failed to compile when SELFTEST was defined.
     */
    SHA3_squeeze(A, out, d, r, 0);
}
1162 
1163 # include <stdio.h>
1164 
/*
 * SELFTEST driver: runs the 5-bit SHAKE128 example from
 * http://csrc.nist.gov/groups/ST/toolkit/examples.html#aHashing
 * and compares 512 bytes of output against the published value.
 */
int main(void)
{
    unsigned char test[168] = { '\xf3', '\x3' };
    unsigned char out[512];
    size_t i;
    static const unsigned char result[512] = {
        0x2E, 0x0A, 0xBF, 0xBA, 0x83, 0xE6, 0x72, 0x0B,
        0xFB, 0xC2, 0x25, 0xFF, 0x6B, 0x7A, 0xB9, 0xFF,
        0xCE, 0x58, 0xBA, 0x02, 0x7E, 0xE3, 0xD8, 0x98,
        0x76, 0x4F, 0xEF, 0x28, 0x7D, 0xDE, 0xCC, 0xCA,
        0x3E, 0x6E, 0x59, 0x98, 0x41, 0x1E, 0x7D, 0xDB,
        0x32, 0xF6, 0x75, 0x38, 0xF5, 0x00, 0xB1, 0x8C,
        0x8C, 0x97, 0xC4, 0x52, 0xC3, 0x70, 0xEA, 0x2C,
        0xF0, 0xAF, 0xCA, 0x3E, 0x05, 0xDE, 0x7E, 0x4D,
        0xE2, 0x7F, 0xA4, 0x41, 0xA9, 0xCB, 0x34, 0xFD,
        0x17, 0xC9, 0x78, 0xB4, 0x2D, 0x5B, 0x7E, 0x7F,
        0x9A, 0xB1, 0x8F, 0xFE, 0xFF, 0xC3, 0xC5, 0xAC,
        0x2F, 0x3A, 0x45, 0x5E, 0xEB, 0xFD, 0xC7, 0x6C,
        0xEA, 0xEB, 0x0A, 0x2C, 0xCA, 0x22, 0xEE, 0xF6,
        0xE6, 0x37, 0xF4, 0xCA, 0xBE, 0x5C, 0x51, 0xDE,
        0xD2, 0xE3, 0xFA, 0xD8, 0xB9, 0x52, 0x70, 0xA3,
        0x21, 0x84, 0x56, 0x64, 0xF1, 0x07, 0xD1, 0x64,
        0x96, 0xBB, 0x7A, 0xBF, 0xBE, 0x75, 0x04, 0xB6,
        0xED, 0xE2, 0xE8, 0x9E, 0x4B, 0x99, 0x6F, 0xB5,
        0x8E, 0xFD, 0xC4, 0x18, 0x1F, 0x91, 0x63, 0x38,
        0x1C, 0xBE, 0x7B, 0xC0, 0x06, 0xA7, 0xA2, 0x05,
        0x98, 0x9C, 0x52, 0x6C, 0xD1, 0xBD, 0x68, 0x98,
        0x36, 0x93, 0xB4, 0xBD, 0xC5, 0x37, 0x28, 0xB2,
        0x41, 0xC1, 0xCF, 0xF4, 0x2B, 0xB6, 0x11, 0x50,
        0x2C, 0x35, 0x20, 0x5C, 0xAB, 0xB2, 0x88, 0x75,
        0x56, 0x55, 0xD6, 0x20, 0xC6, 0x79, 0x94, 0xF0,
        0x64, 0x51, 0x18, 0x7F, 0x6F, 0xD1, 0x7E, 0x04,
        0x66, 0x82, 0xBA, 0x12, 0x86, 0x06, 0x3F, 0xF8,
        0x8F, 0xE2, 0x50, 0x8D, 0x1F, 0xCA, 0xF9, 0x03,
        0x5A, 0x12, 0x31, 0xAD, 0x41, 0x50, 0xA9, 0xC9,
        0xB2, 0x4C, 0x9B, 0x2D, 0x66, 0xB2, 0xAD, 0x1B,
        0xDE, 0x0B, 0xD0, 0xBB, 0xCB, 0x8B, 0xE0, 0x5B,
        0x83, 0x52, 0x29, 0xEF, 0x79, 0x19, 0x73, 0x73,
        0x23, 0x42, 0x44, 0x01, 0xE1, 0xD8, 0x37, 0xB6,
        0x6E, 0xB4, 0xE6, 0x30, 0xFF, 0x1D, 0xE7, 0x0C,
        0xB3, 0x17, 0xC2, 0xBA, 0xCB, 0x08, 0x00, 0x1D,
        0x34, 0x77, 0xB7, 0xA7, 0x0A, 0x57, 0x6D, 0x20,
        0x86, 0x90, 0x33, 0x58, 0x9D, 0x85, 0xA0, 0x1D,
        0xDB, 0x2B, 0x66, 0x46, 0xC0, 0x43, 0xB5, 0x9F,
        0xC0, 0x11, 0x31, 0x1D, 0xA6, 0x66, 0xFA, 0x5A,
        0xD1, 0xD6, 0x38, 0x7F, 0xA9, 0xBC, 0x40, 0x15,
        0xA3, 0x8A, 0x51, 0xD1, 0xDA, 0x1E, 0xA6, 0x1D,
        0x64, 0x8D, 0xC8, 0xE3, 0x9A, 0x88, 0xB9, 0xD6,
        0x22, 0xBD, 0xE2, 0x07, 0xFD, 0xAB, 0xC6, 0xF2,
        0x82, 0x7A, 0x88, 0x0C, 0x33, 0x0B, 0xBF, 0x6D,
        0xF7, 0x33, 0x77, 0x4B, 0x65, 0x3E, 0x57, 0x30,
        0x5D, 0x78, 0xDC, 0xE1, 0x12, 0xF1, 0x0A, 0x2C,
        0x71, 0xF4, 0xCD, 0xAD, 0x92, 0xED, 0x11, 0x3E,
        0x1C, 0xEA, 0x63, 0xB9, 0x19, 0x25, 0xED, 0x28,
        0x19, 0x1E, 0x6D, 0xBB, 0xB5, 0xAA, 0x5A, 0x2A,
        0xFD, 0xA5, 0x1F, 0xC0, 0x5A, 0x3A, 0xF5, 0x25,
        0x8B, 0x87, 0x66, 0x52, 0x43, 0x55, 0x0F, 0x28,
        0x94, 0x8A, 0xE2, 0xB8, 0xBE, 0xB6, 0xBC, 0x9C,
        0x77, 0x0B, 0x35, 0xF0, 0x67, 0xEA, 0xA6, 0x41,
        0xEF, 0xE6, 0x5B, 0x1A, 0x44, 0x90, 0x9D, 0x1B,
        0x14, 0x9F, 0x97, 0xEE, 0xA6, 0x01, 0x39, 0x1C,
        0x60, 0x9E, 0xC8, 0x1D, 0x19, 0x30, 0xF5, 0x7C,
        0x18, 0xA4, 0xE0, 0xFA, 0xB4, 0x91, 0xD1, 0xCA,
        0xDF, 0xD5, 0x04, 0x83, 0x44, 0x9E, 0xDC, 0x0F,
        0x07, 0xFF, 0xB2, 0x4D, 0x2C, 0x6F, 0x9A, 0x9A,
        0x3B, 0xFF, 0x39, 0xAE, 0x3D, 0x57, 0xF5, 0x60,
        0x65, 0x4D, 0x7D, 0x75, 0xC9, 0x08, 0xAB, 0xE6,
        0x25, 0x64, 0x75, 0x3E, 0xAC, 0x39, 0xD7, 0x50,
        0x3D, 0xA6, 0xD3, 0x7C, 0x2E, 0x32, 0xE1, 0xAF,
        0x3B, 0x8A, 0xEC, 0x8A, 0xE3, 0x06, 0x9C, 0xD9
    };

    /* final padding byte of the pre-padded 5-bit message */
    test[167] = '\x80';
    SHA3_sponge(test, sizeof(test), out, sizeof(out), sizeof(test));

    /*
     * Rationale behind keeping output [formatted as below] is that
     * one should be able to redirect it to a file, then copy-n-paste
     * final "output val" from official example to another file, and
     * compare the two with diff(1).
     */
    for (i = 0; i < sizeof(out); i++) {
        printf("%02X", out[i]);
        putchar((i + 1) % 16 == 0 || i + 1 == sizeof(out) ? '\n' : ' ');
    }

    if (memcmp(out, result, sizeof(out)) != 0) {
        fprintf(stderr, "failure\n");
        return 1;
    }
    fprintf(stderr, "success\n");
    return 0;
}
1262 #endif
1263