xref: /php-src/ext/standard/crc32_x86.c (revision 0752baa5)
/*
  +----------------------------------------------------------------------+
  | Copyright (c) The PHP Group                                          |
  +----------------------------------------------------------------------+
  | This source file is subject to version 3.01 of the PHP license,      |
  | that is bundled with this package in the file LICENSE, and is        |
  | available through the world-wide-web at the following url:           |
  | https://www.php.net/license/3_01.txt                                 |
  | If you did not receive a copy of the PHP license and are unable to   |
  | obtain it through the world-wide-web, please send a note to          |
  | license@php.net so we can mail you a copy immediately.               |
  +----------------------------------------------------------------------+
  | Author: Frank Du <frank.du@intel.com>                                |
  +----------------------------------------------------------------------+
  | Compute the crc32 of the buffer. Based on:                           |
  | "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ"       |
  |  V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0          |
*/

#include "crc32_x86.h"

#if ZEND_INTRIN_SSE4_2_PCLMUL_NATIVE || ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER
# include <nmmintrin.h>
# include <wmmintrin.h>
#endif

#if ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER
# include "Zend/zend_cpuinfo.h"
#endif

#if ZEND_INTRIN_SSE4_2_PCLMUL_NATIVE || ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER

typedef struct _crc32_pclmul_bit_consts {
	uint64_t k1k2[2];
	uint64_t k3k4[2];
	uint64_t k5k6[2];
	uint64_t uPx[2];
} crc32_pclmul_consts;
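
/*
 * How the constant pairs are used below (a summary; see the Intel whitepaper
 * referenced in the header for the derivation of the values):
 *   k1k2 - fold constants for the 4-way parallel loop (64 bytes per pass)
 *   k3k4 - fold constants for a single 128-bit block
 *   k5k6 - constants for the final 128 -> 96 -> 64 bit reduction
 *   uPx  - pre-computed u and P(x) for the Barrett reduction step
 */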
static const crc32_pclmul_consts crc32_pclmul_consts_maps[X86_CRC32_MAX] = {
	{ /* X86_CRC32, polynomial: 0x04C11DB7 */
		{0x00e6228b11, 0x008833794c}, /* endianness swap */
		{0x00e8a45605, 0x00c5b9cd4c}, /* endianness swap */
		{0x00490d678d, 0x00f200aa66}, /* endianness swap */
		{0x0104d101df, 0x0104c11db7}
	},
	{ /* X86_CRC32B, polynomial: 0x04C11DB7 with reversed ordering */
		{0x0154442bd4, 0x01c6e41596},
		{0x01751997d0, 0x00ccaa009e},
		{0x0163cd6124, 0x01db710640},
		{0x01f7011641, 0x01db710641},
	},
	{ /* X86_CRC32C, polynomial: 0x1EDC6F41 with reversed ordering */
		{0x00740eef02, 0x009e4addf8},
		{0x00f20c0dfe, 0x014cd00bd6},
		{0x00dd45aab8, 0x0000000000},
		{0x00dea713f1, 0x0105ec76f0}
	}
};

static uint8_t pclmul_shuf_mask_table[16] = {
	0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08,
	0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,
};

/* Folding of 128-bit data chunks */
#define CRC32_FOLDING_BLOCK_SIZE (16)

/* PCLMUL version of non-reflected crc32 */
ZEND_INTRIN_SSE4_2_PCLMUL_FUNC_DECL(size_t crc32_pclmul_batch(uint32_t *crc, const unsigned char *p, size_t nr, const crc32_pclmul_consts *consts));
size_t crc32_pclmul_batch(uint32_t *crc, const unsigned char *p, size_t nr, const crc32_pclmul_consts *consts)
{
	size_t nr_in = nr;
	__m128i x0, x1, x2, k, shuf_mask;

	if (nr < CRC32_FOLDING_BLOCK_SIZE) {
		return 0;
	}
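
	/*
	 * Seed the fold state: the first 16-byte block is byte-swapped so the data
	 * is consumed most-significant-byte first, and the incoming CRC is shifted
	 * into the top 32 bits of x0 and XORed into that block.
	 */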
	shuf_mask = _mm_loadu_si128((__m128i *)(pclmul_shuf_mask_table));
	x0 = _mm_cvtsi32_si128(*crc);
	x1 = _mm_loadu_si128((__m128i *)(p + 0x00));
	x0 = _mm_slli_si128(x0, 12);
	x1 = _mm_shuffle_epi8(x1, shuf_mask); /* endianness swap */
	x0 = _mm_xor_si128(x1, x0);
	p += CRC32_FOLDING_BLOCK_SIZE;
	nr -= CRC32_FOLDING_BLOCK_SIZE;

	if (nr >= (CRC32_FOLDING_BLOCK_SIZE * 3)) {
		__m128i x3, x4;

		x1 = _mm_loadu_si128((__m128i *)(p + 0x00));
		x1 = _mm_shuffle_epi8(x1, shuf_mask); /* endianness swap */
		x2 = _mm_loadu_si128((__m128i *)(p + 0x10));
		x2 = _mm_shuffle_epi8(x2, shuf_mask); /* endianness swap */
		x3 = _mm_loadu_si128((__m128i *)(p + 0x20));
		x3 = _mm_shuffle_epi8(x3, shuf_mask); /* endianness swap */
		p += CRC32_FOLDING_BLOCK_SIZE * 3;
		nr -= CRC32_FOLDING_BLOCK_SIZE * 3;

		k = _mm_loadu_si128((__m128i *)consts->k1k2);
		/* parallel folding by 4 */
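		/*
		 * Four independent 128-bit accumulators (x0..x3) cover 64 bytes of
		 * input. Each pass multiplies every accumulator by the k1k2 fold
		 * constants (one PCLMULQDQ per 64-bit half) and XORs in the next
		 * 64 bytes.
		 */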
		while (nr >= (CRC32_FOLDING_BLOCK_SIZE * 4)) {
			__m128i x5, x6, x7, x8, x9, x10, x11;
			x4 = _mm_clmulepi64_si128(x0, k, 0x00);
			x5 = _mm_clmulepi64_si128(x1, k, 0x00);
			x6 = _mm_clmulepi64_si128(x2, k, 0x00);
			x7 = _mm_clmulepi64_si128(x3, k, 0x00);
			x0 = _mm_clmulepi64_si128(x0, k, 0x11);
			x1 = _mm_clmulepi64_si128(x1, k, 0x11);
			x2 = _mm_clmulepi64_si128(x2, k, 0x11);
			x3 = _mm_clmulepi64_si128(x3, k, 0x11);
			x8 = _mm_loadu_si128((__m128i *)(p + 0x00));
			x8 = _mm_shuffle_epi8(x8, shuf_mask); /* endianness swap */
			x9 = _mm_loadu_si128((__m128i *)(p + 0x10));
			x9 = _mm_shuffle_epi8(x9, shuf_mask); /* endianness swap */
			x10 = _mm_loadu_si128((__m128i *)(p + 0x20));
			x10 = _mm_shuffle_epi8(x10, shuf_mask); /* endianness swap */
			x11 = _mm_loadu_si128((__m128i *)(p + 0x30));
			x11 = _mm_shuffle_epi8(x11, shuf_mask); /* endianness swap */
			x0 = _mm_xor_si128(x0, x4);
			x1 = _mm_xor_si128(x1, x5);
			x2 = _mm_xor_si128(x2, x6);
			x3 = _mm_xor_si128(x3, x7);
			x0 = _mm_xor_si128(x0, x8);
			x1 = _mm_xor_si128(x1, x9);
			x2 = _mm_xor_si128(x2, x10);
			x3 = _mm_xor_si128(x3, x11);

			p += CRC32_FOLDING_BLOCK_SIZE * 4;
			nr -= CRC32_FOLDING_BLOCK_SIZE * 4;
		}

		k = _mm_loadu_si128((__m128i *)consts->k3k4);
		/* fold 4 to 1, [x1, x2, x3] -> x0 */
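		/* x1, x2 and x3 are folded into x0 one at a time using the
		 * single-block constants k3k4. */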
		x4 = _mm_clmulepi64_si128(x0, k, 0x00);
		x0 = _mm_clmulepi64_si128(x0, k, 0x11);
		x0 = _mm_xor_si128(x0, x1);
		x0 = _mm_xor_si128(x0, x4);
		x4 = _mm_clmulepi64_si128(x0, k, 0x00);
		x0 = _mm_clmulepi64_si128(x0, k, 0x11);
		x0 = _mm_xor_si128(x0, x2);
		x0 = _mm_xor_si128(x0, x4);
		x4 = _mm_clmulepi64_si128(x0, k, 0x00);
		x0 = _mm_clmulepi64_si128(x0, k, 0x11);
		x0 = _mm_xor_si128(x0, x3);
		x0 = _mm_xor_si128(x0, x4);
	}

	k = _mm_loadu_si128((__m128i *)consts->k3k4);
	/* folding by 1 */
	while (nr >= CRC32_FOLDING_BLOCK_SIZE) {
		/* load next to x2, fold to x0, x1 */
		x2 = _mm_loadu_si128((__m128i *)(p + 0x00));
		x2 = _mm_shuffle_epi8(x2, shuf_mask); /* endianness swap */
		x1 = _mm_clmulepi64_si128(x0, k, 0x00);
		x0 = _mm_clmulepi64_si128(x0, k, 0x11);
		x0 = _mm_xor_si128(x0, x2);
		x0 = _mm_xor_si128(x0, x1);
		p += CRC32_FOLDING_BLOCK_SIZE;
		nr -= CRC32_FOLDING_BLOCK_SIZE;
	}

	/* reduce 128 bits (final fold) to 96 bits */
	k = _mm_loadu_si128((__m128i*)consts->k5k6);
	x1 = _mm_clmulepi64_si128(x0, k, 0x11);
	x0 = _mm_slli_si128(x0, 8);
	x0 = _mm_srli_si128(x0, 4);
	x0 = _mm_xor_si128(x0, x1);
	/* reduce 96 bits to 64 bits */
	x1 = _mm_clmulepi64_si128(x0, k, 0x01);
	x0 = _mm_xor_si128(x0, x1);

	/* Barrett reduction */
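	/*
	 * The remaining 64-bit value is reduced to the final 32-bit CRC by
	 * multiplying with the pre-computed u and then with P(x) (Barrett
	 * reduction); the result is read from the low 32 bits of x0.
	 */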
	k = _mm_loadu_si128((__m128i*)consts->uPx);
	x1 = _mm_move_epi64(x0);
	x1 = _mm_srli_si128(x1, 4);
	x1 = _mm_clmulepi64_si128(x1, k, 0x00);
	x1 = _mm_srli_si128(x1, 4);
	x1 = _mm_clmulepi64_si128(x1, k, 0x10);
	x0 = _mm_xor_si128(x1, x0);
	*crc = _mm_extract_epi32(x0, 0);
	return (nr_in - nr); /* number of bytes processed */
}

/* PCLMUL version of reflected crc32 */
ZEND_INTRIN_SSE4_2_PCLMUL_FUNC_DECL(size_t crc32_pclmul_reflected_batch(uint32_t *crc, const unsigned char *p, size_t nr, const crc32_pclmul_consts *consts));
size_t crc32_pclmul_reflected_batch(uint32_t *crc, const unsigned char *p, size_t nr, const crc32_pclmul_consts *consts)
{
	size_t nr_in = nr;
	__m128i x0, x1, x2, k;

	if (nr < CRC32_FOLDING_BLOCK_SIZE) {
		return 0;
	}
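
	/*
	 * The reflected (bit-reversed) variants consume the data in memory order,
	 * so no byte swap is needed; the incoming CRC is XORed into the low 32 bits
	 * of the first block.
	 */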
	x0 = _mm_loadu_si128((__m128i *)(p + 0x00));
	x0 = _mm_xor_si128(x0, _mm_cvtsi32_si128(*crc));
	p += CRC32_FOLDING_BLOCK_SIZE;
	nr -= CRC32_FOLDING_BLOCK_SIZE;
	if (nr >= (CRC32_FOLDING_BLOCK_SIZE * 3)) {
		__m128i x3, x4;

		x1 = _mm_loadu_si128((__m128i *)(p + 0x00));
		x2 = _mm_loadu_si128((__m128i *)(p + 0x10));
		x3 = _mm_loadu_si128((__m128i *)(p + 0x20));
		p += CRC32_FOLDING_BLOCK_SIZE * 3;
		nr -= CRC32_FOLDING_BLOCK_SIZE * 3;

		k = _mm_loadu_si128((__m128i *)consts->k1k2);
		/* parallel folding by 4 */
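		/* Same 4-way folding as in the non-reflected version above, just
		 * without the byte swaps. */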
		while (nr >= (CRC32_FOLDING_BLOCK_SIZE * 4)) {
			__m128i x5, x6, x7, x8, x9, x10, x11;
			x4 = _mm_clmulepi64_si128(x0, k, 0x00);
			x5 = _mm_clmulepi64_si128(x1, k, 0x00);
			x6 = _mm_clmulepi64_si128(x2, k, 0x00);
			x7 = _mm_clmulepi64_si128(x3, k, 0x00);
			x0 = _mm_clmulepi64_si128(x0, k, 0x11);
			x1 = _mm_clmulepi64_si128(x1, k, 0x11);
			x2 = _mm_clmulepi64_si128(x2, k, 0x11);
			x3 = _mm_clmulepi64_si128(x3, k, 0x11);
			x8 = _mm_loadu_si128((__m128i *)(p + 0x00));
			x9 = _mm_loadu_si128((__m128i *)(p + 0x10));
			x10 = _mm_loadu_si128((__m128i *)(p + 0x20));
			x11 = _mm_loadu_si128((__m128i *)(p + 0x30));
			x0 = _mm_xor_si128(x0, x4);
			x1 = _mm_xor_si128(x1, x5);
			x2 = _mm_xor_si128(x2, x6);
			x3 = _mm_xor_si128(x3, x7);
			x0 = _mm_xor_si128(x0, x8);
			x1 = _mm_xor_si128(x1, x9);
			x2 = _mm_xor_si128(x2, x10);
			x3 = _mm_xor_si128(x3, x11);

			p += CRC32_FOLDING_BLOCK_SIZE * 4;
			nr -= CRC32_FOLDING_BLOCK_SIZE * 4;
		}

		k = _mm_loadu_si128((__m128i *)consts->k3k4);
		/* fold 4 to 1, [x1, x2, x3] -> x0 */
		x4 = _mm_clmulepi64_si128(x0, k, 0x00);
		x0 = _mm_clmulepi64_si128(x0, k, 0x11);
		x0 = _mm_xor_si128(x0, x1);
		x0 = _mm_xor_si128(x0, x4);
		x4 = _mm_clmulepi64_si128(x0, k, 0x00);
		x0 = _mm_clmulepi64_si128(x0, k, 0x11);
		x0 = _mm_xor_si128(x0, x2);
		x0 = _mm_xor_si128(x0, x4);
		x4 = _mm_clmulepi64_si128(x0, k, 0x00);
		x0 = _mm_clmulepi64_si128(x0, k, 0x11);
		x0 = _mm_xor_si128(x0, x3);
		x0 = _mm_xor_si128(x0, x4);
	}

	k = _mm_loadu_si128((__m128i *)consts->k3k4);
	/* folding by 1 */
	while (nr >= CRC32_FOLDING_BLOCK_SIZE) {
		/* load next to x2, fold to x0, x1 */
		x2 = _mm_loadu_si128((__m128i *)(p + 0x00));
		x1 = _mm_clmulepi64_si128(x0, k, 0x00);
		x0 = _mm_clmulepi64_si128(x0, k, 0x11);
		x0 = _mm_xor_si128(x0, x2);
		x0 = _mm_xor_si128(x0, x1);
		p += CRC32_FOLDING_BLOCK_SIZE;
		nr -= CRC32_FOLDING_BLOCK_SIZE;
	}

	/* reduce 128 bits (final fold) to 96 bits */
	x1 = _mm_clmulepi64_si128(x0, k, 0x10);
	x0 = _mm_srli_si128(x0, 8);
	x0 = _mm_xor_si128(x0, x1);
	/* reduce 96 bits to 64 bits */
	x1 = _mm_shuffle_epi32(x0, 0xfc);
	x0 = _mm_shuffle_epi32(x0, 0xf9);
	k = _mm_loadu_si128((__m128i*)consts->k5k6);
	x1 = _mm_clmulepi64_si128(x1, k, 0x00);
	x0 = _mm_xor_si128(x0, x1);

	/* Barrett reduction */
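	/*
	 * Barrett reduction for the reflected case: multiply by the pre-computed u,
	 * then by P(x), and XOR with the shifted remainder; the final CRC is read
	 * from dword lane 2 of x0.
	 */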
	x1 = _mm_shuffle_epi32(x0, 0xf3);
	x0 = _mm_slli_si128(x0, 4);
	k = _mm_loadu_si128((__m128i*)consts->uPx);
	x1 = _mm_clmulepi64_si128(x1, k, 0x00);
	x1 = _mm_clmulepi64_si128(x1, k, 0x10);
	x0 = _mm_xor_si128(x1, x0);
	*crc = _mm_extract_epi32(x0, 2);
	return (nr_in - nr); /* number of bytes processed */
}

# if ZEND_INTRIN_SSE4_2_PCLMUL_NATIVE
size_t crc32_x86_simd_update(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr)
# else /* ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER */
size_t crc32_sse42_pclmul_update(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr)
# endif
{
	if (type > X86_CRC32_MAX) {
		return 0;
	}
	const crc32_pclmul_consts *consts = &crc32_pclmul_consts_maps[type];

	switch (type) {
	case X86_CRC32:
		return crc32_pclmul_batch(crc, p, nr, consts);
	case X86_CRC32B:
	case X86_CRC32C:
		return crc32_pclmul_reflected_batch(crc, p, nr, consts);
	default:
		return 0;
	}
}
#endif

#if ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER
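/* Fallback for CPUs without SSE4.2/PCLMUL support: it reports that no bytes
 * were processed, leaving the whole buffer to the caller. */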
static size_t crc32_x86_simd_update_default(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr)
{
	return 0;
}

# if ZEND_INTRIN_SSE4_2_PCLMUL_FUNC_PROTO
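/* With ifunc support the implementation is selected once, at load time, by the
 * resolver below; otherwise (the function-pointer path in the #else branch) it
 * is installed in PHP_MINIT. */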
size_t crc32_x86_simd_update(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr) __attribute__((ifunc("resolve_crc32_x86_simd_update")));

typedef size_t (*crc32_x86_simd_func_t)(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr);

ZEND_NO_SANITIZE_ADDRESS
ZEND_ATTRIBUTE_UNUSED /* clang mistakenly warns about this */
static crc32_x86_simd_func_t resolve_crc32_x86_simd_update(void) {
	if (zend_cpu_supports_sse42() && zend_cpu_supports_pclmul()) {
		return crc32_sse42_pclmul_update;
	}
	return crc32_x86_simd_update_default;
}
# else /* ZEND_INTRIN_SSE4_2_PCLMUL_FUNC_PTR */
static size_t (*crc32_x86_simd_ptr)(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr) = crc32_x86_simd_update_default;

size_t crc32_x86_simd_update(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr) {
	return crc32_x86_simd_ptr(type, crc, p, nr);
}

/* {{{ PHP_MINIT_FUNCTION */
PHP_MINIT_FUNCTION(crc32_x86_intrin)
{
	if (zend_cpu_supports_sse42() && zend_cpu_supports_pclmul()) {
		crc32_x86_simd_ptr = crc32_sse42_pclmul_update;
	}
	return SUCCESS;
}
/* }}} */
# endif
#endif