xref: /php-src/ext/hash/hash_sha_sse2.c (revision 6eca7839)
1 /*-
2  * Copyright 2021 Tarsnap Backup Inc.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26 
27 #include "php_hash.h"
28 #include "php_hash_sha.h"
29 
30 #ifdef __SSE2__
31 # include <emmintrin.h>
32 
33 /* Original implementation from libcperciva follows.
34  *
35  * Modified to use `PHP_STATIC_RESTRICT` for MSVC compatibility.
36  */
37 
/**
 * mm_bswap_epi32(a):
 * Byte-swap each 32-bit word.
 */
static inline __m128i
mm_bswap_epi32(__m128i a)
{
	__m128i r;

	/* Reverse the two bytes inside every 16-bit lane. */
	r = _mm_or_si128(_mm_srli_epi16(a, 8), _mm_slli_epi16(a, 8));

	/* Reverse the two 16-bit lanes inside every 32-bit word. */
	r = _mm_shufflehi_epi16(r, _MM_SHUFFLE(2, 3, 0, 1));
	r = _mm_shufflelo_epi16(r, _MM_SHUFFLE(2, 3, 0, 1));

	return (r);
}
55 
/* SHA256 round constants (FIPS 180-4 section 4.2.2: the first 32 bits of
 * the fractional parts of the cube roots of the first 64 prime numbers). */
static const uint32_t Krnd[64] = {
	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
75 
/* Elementary functions used by SHA256 (FIPS 180-4 section 4.1.2).  Ch and
 * Maj are algebraically rearranged forms of the canonical
 * (x & y) ^ (~x & z) and (x & y) ^ (x & z) ^ (y & z), needing fewer
 * operations.  All macro arguments are fully parenthesized so that
 * compound expressions expand safely (operator-precedence hygiene). */
#define Ch(x, y, z)	(((x) & ((y) ^ (z))) ^ (z))
#define Maj(x, y, z)	(((x) & ((y) | (z))) | ((y) & (z)))
#define ROTR(x, n)	(((x) >> (n)) | ((x) << (32 - (n))))
#define S0(x)		(ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))
#define S1(x)		(ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25))

/* SHA256 round function: updates h (twice) and d in place; callers rotate
 * which of the eight working variables plays each role (see RNDr). */
#define RND(a, b, c, d, e, f, g, h, k)			\
	h += S1(e) + Ch(e, f, g) + k;			\
	d += h;						\
	h += S0(a) + Maj(a, b, c)

/* Adjusted round function for rotating state: the index arithmetic maps
 * round number i to the correct rotation of the working variables in S,
 * avoiding the usual a..h shuffling at the end of each round. */
#define RNDr(S, W, i, ii)			\
	RND(S[(64 - i) % 8], S[(65 - i) % 8],	\
	    S[(66 - i) % 8], S[(67 - i) % 8],	\
	    S[(68 - i) % 8], S[(69 - i) % 8],	\
	    S[(70 - i) % 8], S[(71 - i) % 8],	\
	    W[i + ii] + Krnd[i + ii])

/* Message schedule computation: vectorized sigma_0 over four words. */
#define SHR32(x, n) (_mm_srli_epi32((x), (n)))
#define ROTR32(x, n) (_mm_or_si128(SHR32(x, n), _mm_slli_epi32((x), (32 - (n)))))
#define s0_128(x) _mm_xor_si128(_mm_xor_si128(			\
	ROTR32(x, 7), ROTR32(x, 18)), SHR32(x, 3))
102 
static inline __m128i
s1_128_high(__m128i a)
{
	__m128i dup, t;

	/*
	 * Compute s1(x) = ROTR(x, 17) ^ ROTR(x, 19) ^ SHR(x, 10) for the two
	 * LOW 32-bit words of ${a}.  Duplicating each word across a 64-bit
	 * lane lets a 64-bit logical right shift act as a 32-bit rotate of
	 * the lane's low half.
	 */
	dup = _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 1, 0, 0));

	/* ROTR(x, 17) ^ ROTR(x, 19) in lanes 0 & 2; lanes 1 & 3 are junk. */
	t = _mm_xor_si128(_mm_srli_epi64(dup, 17), _mm_srli_epi64(dup, 19));

	/* XOR in SHR(x, 10); lanes 1 & 3 remain junk. */
	t = _mm_xor_si128(t, _mm_srli_epi32(dup, 10));

	/* Gather the good lanes (0 & 2) and park the results in the HIGH
	 * half of the register, zeroing the low half. */
	return (_mm_slli_si128(_mm_shuffle_epi32(t, _MM_SHUFFLE(2, 0, 2, 0)), 8));
}
122 
static inline __m128i
s1_128_low(__m128i a)
{
	__m128i dup, t;

	/*
	 * Compute s1(x) = ROTR(x, 17) ^ ROTR(x, 19) ^ SHR(x, 10) for the two
	 * HIGH 32-bit words of ${a}.  Duplicating each word across a 64-bit
	 * lane lets a 64-bit logical right shift act as a 32-bit rotate of
	 * the lane's low half.
	 */
	dup = _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 3, 2, 2));

	/* ROTR(x, 17) ^ ROTR(x, 19) in lanes 0 & 2; lanes 1 & 3 are junk. */
	t = _mm_xor_si128(_mm_srli_epi64(dup, 17), _mm_srli_epi64(dup, 19));

	/* XOR in SHR(x, 10); lanes 1 & 3 remain junk. */
	t = _mm_xor_si128(t, _mm_srli_epi32(dup, 10));

	/* Gather the good lanes (0 & 2) and park the results in the LOW
	 * half of the register, zeroing the high half. */
	return (_mm_srli_si128(_mm_shuffle_epi32(t, _MM_SHUFFLE(2, 0, 2, 0)), 8));
}
142 
/**
 * SPAN_ONE_THREE(a, b):
 * Combine the upper three words of ${a} with the lowest word of ${b}.  This
 * could also be thought of returning bits [159:32] of the 256-bit value
 * consisting of (b[127:0] a[127:0]).  In other words, set:
 *     dst[31:0] := a[63:32]
 *     dst[63:32] := a[95:64]
 *     dst[95:64] := a[127:96]
 *     dst[127:96] := b[31:0]
 *
 * Implementation: _mm_move_ss() copies b's lowest 32-bit lane into a's
 * lowest lane, then the shuffle rotates the four lanes down by one.  (This
 * is SSE2-only; on SSSE3 the same result could presumably be had from a
 * single _mm_alignr_epi8(b, a, 4) — kept this way for baseline SSE2.)
 */
#define SPAN_ONE_THREE(a, b) (_mm_shuffle_epi32(_mm_castps_si128(	\
	_mm_move_ss(_mm_castsi128_ps(a), _mm_castsi128_ps(b))),		\
	_MM_SHUFFLE(0, 3, 2, 1)))
156 
157 /**
158  * MSG4(X0, X1, X2, X3):
159  * Calculate the next four values of the message schedule.  If we define
160  * ${W[j]} as the first unknown value in the message schedule, then the input
161  * arguments are:
162  *     X0 = W[j - 16] : W[j - 13]
163  *     X1 = W[j - 12] : W[j - 9]
164  *     X2 = W[j - 8] : W[j - 5]
165  *     X3 = W[j - 4] : W[j - 1]
166  * This function therefore calculates:
167  *     X4 = W[j + 0] : W[j + 3]
168  */
169 static inline __m128i
MSG4(__m128i X0,__m128i X1,__m128i X2,__m128i X3)170 MSG4(__m128i X0, __m128i X1, __m128i X2, __m128i X3)
171 {
172 	__m128i X4;
173 	__m128i Xj_minus_seven, Xj_minus_fifteen;
174 
175 	/* Set up variables which span X values. */
176 	Xj_minus_seven = SPAN_ONE_THREE(X2, X3);
177 	Xj_minus_fifteen = SPAN_ONE_THREE(X0, X1);
178 
179 	/* Begin computing X4. */
180 	X4 = _mm_add_epi32(X0, Xj_minus_seven);
181 	X4 = _mm_add_epi32(X4, s0_128(Xj_minus_fifteen));
182 
183 	/* First half of s1. */
184 	X4 = _mm_add_epi32(X4, s1_128_low(X3));
185 
186 	/* Second half of s1; this depends on the above value of X4. */
187 	X4 = _mm_add_epi32(X4, s1_128_high(X4));
188 
189 	return (X4);
190 }
191 
/**
 * SHA256_Transform_sse2(state, block, W, S):
 * Compute the SHA256 block compression function, transforming ${state} using
 * the data in ${block}.  This implementation uses x86 SSE2 instructions, and
 * should only be used if _SSE2 is defined and cpusupport_x86_sse2() returns
 * nonzero.  The arrays W and S may be filled with sensitive data, and should
 * be cleared by the callee.
 */
void
SHA256_Transform_sse2(uint32_t state[PHP_STATIC_RESTRICT 8],
    const uint8_t block[PHP_STATIC_RESTRICT 64], uint32_t W[PHP_STATIC_RESTRICT 64],
    uint32_t S[PHP_STATIC_RESTRICT 8])
{
	__m128i Y[4];	/* Y[k] caches message-schedule words W[4k .. 4k+3]. */
	int i;

	/* 1. Prepare the first part of the message schedule W.  The input
	 * bytes are interpreted big-endian, so each 16-byte load is
	 * byte-swapped; the swapped words are kept both in W (for the scalar
	 * rounds) and in Y (for the vectorized schedule extension below). */
	Y[0] = mm_bswap_epi32(_mm_loadu_si128((const __m128i *)&block[0]));
	_mm_storeu_si128((__m128i *)&W[0], Y[0]);
	Y[1] = mm_bswap_epi32(_mm_loadu_si128((const __m128i *)&block[16]));
	_mm_storeu_si128((__m128i *)&W[4], Y[1]);
	Y[2] = mm_bswap_epi32(_mm_loadu_si128((const __m128i *)&block[32]));
	_mm_storeu_si128((__m128i *)&W[8], Y[2]);
	Y[3] = mm_bswap_epi32(_mm_loadu_si128((const __m128i *)&block[48]));
	_mm_storeu_si128((__m128i *)&W[12], Y[3]);

	/* 2. Initialize working variables (32 bytes = 8 x uint32_t). */
	memcpy(S, state, 32);

	/* 3. Mix.  Each RNDr performs one round; its index arithmetic
	 * rotates which element of S plays each of the roles a..h, so no
	 * shuffling of working variables is needed between rounds. */
	for (i = 0; i < 64; i += 16) {
		RNDr(S, W, 0, i);
		RNDr(S, W, 1, i);
		RNDr(S, W, 2, i);
		RNDr(S, W, 3, i);
		RNDr(S, W, 4, i);
		RNDr(S, W, 5, i);
		RNDr(S, W, 6, i);
		RNDr(S, W, 7, i);
		RNDr(S, W, 8, i);
		RNDr(S, W, 9, i);
		RNDr(S, W, 10, i);
		RNDr(S, W, 11, i);
		RNDr(S, W, 12, i);
		RNDr(S, W, 13, i);
		RNDr(S, W, 14, i);
		RNDr(S, W, 15, i);

		/* The final 16 rounds consume W[48..63], which were computed
		 * in the previous iteration; no further schedule is needed. */
		if (i == 48)
			break;
		/* Extend the message schedule: compute W[16+i .. 31+i],
		 * four words at a time.  Note that each MSG4 call consumes
		 * the freshly-updated Y value(s) from the calls before it. */
		Y[0] = MSG4(Y[0], Y[1], Y[2], Y[3]);
		_mm_storeu_si128((__m128i *)&W[16 + i + 0], Y[0]);
		Y[1] = MSG4(Y[1], Y[2], Y[3], Y[0]);
		_mm_storeu_si128((__m128i *)&W[16 + i + 4], Y[1]);
		Y[2] = MSG4(Y[2], Y[3], Y[0], Y[1]);
		_mm_storeu_si128((__m128i *)&W[16 + i + 8], Y[2]);
		Y[3] = MSG4(Y[3], Y[0], Y[1], Y[2]);
		_mm_storeu_si128((__m128i *)&W[16 + i + 12], Y[3]);
	}

	/* 4. Mix local working variables into global state. */
	for (i = 0; i < 8; i++)
		state[i] += S[i];
}
256 
257 #endif
258