/*
   +----------------------------------------------------------------------+
   | Copyright (c) The PHP Group                                          |
   +----------------------------------------------------------------------+
   | This source file is subject to version 3.01 of the PHP license,     |
   | that is bundled with this package in the file LICENSE, and is       |
   | available through the world-wide-web at the following url:          |
   | https://www.php.net/license/3_01.txt                                 |
   | If you did not receive a copy of the PHP license and are unable to  |
   | obtain it through the world-wide-web, please send a note to         |
   | license@php.net so we can mail you a copy immediately.              |
   +----------------------------------------------------------------------+
   | Author: Frank Du <frank.du@intel.com>                                |
   +----------------------------------------------------------------------+
   | Compute the crc32 of the buffer. Based on:                          |
   | "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ"      |
   | V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0          |
*/

#include "crc32_x86.h"

#if defined(ZEND_INTRIN_SSE4_2_PCLMUL_NATIVE) || defined(ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER)
# include <nmmintrin.h>
# include <wmmintrin.h>
#endif

#ifdef ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER
# include "Zend/zend_cpuinfo.h"
#endif

#if defined(ZEND_INTRIN_SSE4_2_PCLMUL_NATIVE) || defined(ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER)

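/*
 * Per-variant constants for the PCLMUL folding scheme from the Intel paper
 * cited above:
 *   k1k2 - used by the fold-by-4 loops (four 128-bit blocks per step)
 *   k3k4 - used when folding a single 128-bit block at a time
 *   k5k6 - used for the final 128-bit to 64-bit reduction
 *   uPx  - Barrett reduction constants (quotient estimate and polynomial)
 */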
typedef struct _crc32_pclmul_bit_consts {
	uint64_t k1k2[2];
	uint64_t k3k4[2];
	uint64_t k5k6[2];
	uint64_t uPx[2];
} crc32_pclmul_consts;

static const crc32_pclmul_consts crc32_pclmul_consts_maps[X86_CRC32_MAX] = {
	{ /* X86_CRC32, polynomial: 0x04C11DB7 */
		{0x00e6228b11, 0x008833794c}, /* endianness swap */
		{0x00e8a45605, 0x00c5b9cd4c}, /* endianness swap */
		{0x00490d678d, 0x00f200aa66}, /* endianness swap */
		{0x0104d101df, 0x0104c11db7}
	},
	{ /* X86_CRC32B, polynomial: 0x04C11DB7 with reversed ordering */
		{0x0154442bd4, 0x01c6e41596},
		{0x01751997d0, 0x00ccaa009e},
		{0x0163cd6124, 0x01db710640},
		{0x01f7011641, 0x01db710641},
	},
	{ /* X86_CRC32C, polynomial: 0x1EDC6F41 with reversed ordering */
		{0x00740eef02, 0x009e4addf8},
		{0x00f20c0dfe, 0x014cd00bd6},
		{0x00dd45aab8, 0x0000000000},
		{0x00dea713f1, 0x0105ec76f0}
	}
};

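/*
 * Byte-reversal mask for _mm_shuffle_epi8: reverses the 16 bytes of a
 * 128-bit block, giving the byte order expected by the non-reflected
 * CRC-32 variant (X86_CRC32).
 */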
static uint8_t pclmul_shuf_mask_table[16] = {
	0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08,
	0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,
};

/* Folding of 128-bit data chunks */
#define CRC32_FOLDING_BLOCK_SIZE (16)

/* PCLMUL version of non-reflected crc32 */
ZEND_INTRIN_SSE4_2_PCLMUL_FUNC_DECL(size_t crc32_pclmul_batch(uint32_t *crc, const unsigned char *p, size_t nr, const crc32_pclmul_consts *consts));
size_t crc32_pclmul_batch(uint32_t *crc, const unsigned char *p, size_t nr, const crc32_pclmul_consts *consts)
{
	size_t nr_in = nr;
	__m128i x0, x1, x2, k, shuf_mask;

	if (nr < CRC32_FOLDING_BLOCK_SIZE) {
		return 0;
	}

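	/* Seed: byte-swap the first 16-byte block and XOR the running CRC into
	 * its most-significant 32 bits (x0 is shifted left by 12 bytes). */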
	shuf_mask = _mm_loadu_si128((__m128i *)(pclmul_shuf_mask_table));
	x0 = _mm_cvtsi32_si128(*crc);
	x1 = _mm_loadu_si128((__m128i *)(p + 0x00));
	x0 = _mm_slli_si128(x0, 12);
	x1 = _mm_shuffle_epi8(x1, shuf_mask); /* endianness swap */
	x0 = _mm_xor_si128(x1, x0);
	p += CRC32_FOLDING_BLOCK_SIZE;
	nr -= CRC32_FOLDING_BLOCK_SIZE;

	if (nr >= (CRC32_FOLDING_BLOCK_SIZE * 3)) {
		__m128i x3, x4;

		x1 = _mm_loadu_si128((__m128i *)(p + 0x00));
		x1 = _mm_shuffle_epi8(x1, shuf_mask); /* endianness swap */
		x2 = _mm_loadu_si128((__m128i *)(p + 0x10));
		x2 = _mm_shuffle_epi8(x2, shuf_mask); /* endianness swap */
		x3 = _mm_loadu_si128((__m128i *)(p + 0x20));
		x3 = _mm_shuffle_epi8(x3, shuf_mask); /* endianness swap */
		p += CRC32_FOLDING_BLOCK_SIZE * 3;
		nr -= CRC32_FOLDING_BLOCK_SIZE * 3;

		k = _mm_loadu_si128((__m128i *)consts->k1k2);
		/* parallel folding by 4 */
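		/* Each iteration carries the four running 128-bit remainders forward
		 * by 64 bytes (two carry-less multiplies per remainder, one with each
		 * 64-bit half of k1k2) and XORs in the next four byte-swapped blocks. */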
		while (nr >= (CRC32_FOLDING_BLOCK_SIZE * 4)) {
			__m128i x5, x6, x7, x8, x9, x10, x11;
			x4 = _mm_clmulepi64_si128(x0, k, 0x00);
			x5 = _mm_clmulepi64_si128(x1, k, 0x00);
			x6 = _mm_clmulepi64_si128(x2, k, 0x00);
			x7 = _mm_clmulepi64_si128(x3, k, 0x00);
			x0 = _mm_clmulepi64_si128(x0, k, 0x11);
			x1 = _mm_clmulepi64_si128(x1, k, 0x11);
			x2 = _mm_clmulepi64_si128(x2, k, 0x11);
			x3 = _mm_clmulepi64_si128(x3, k, 0x11);
			x8 = _mm_loadu_si128((__m128i *)(p + 0x00));
			x8 = _mm_shuffle_epi8(x8, shuf_mask); /* endianness swap */
			x9 = _mm_loadu_si128((__m128i *)(p + 0x10));
			x9 = _mm_shuffle_epi8(x9, shuf_mask); /* endianness swap */
			x10 = _mm_loadu_si128((__m128i *)(p + 0x20));
			x10 = _mm_shuffle_epi8(x10, shuf_mask); /* endianness swap */
			x11 = _mm_loadu_si128((__m128i *)(p + 0x30));
			x11 = _mm_shuffle_epi8(x11, shuf_mask); /* endianness swap */
			x0 = _mm_xor_si128(x0, x4);
			x1 = _mm_xor_si128(x1, x5);
			x2 = _mm_xor_si128(x2, x6);
			x3 = _mm_xor_si128(x3, x7);
			x0 = _mm_xor_si128(x0, x8);
			x1 = _mm_xor_si128(x1, x9);
			x2 = _mm_xor_si128(x2, x10);
			x3 = _mm_xor_si128(x3, x11);

			p += CRC32_FOLDING_BLOCK_SIZE * 4;
			nr -= CRC32_FOLDING_BLOCK_SIZE * 4;
		}

		k = _mm_loadu_si128((__m128i *)consts->k3k4);
		/* fold 4 to 1, [x1, x2, x3] -> x0 */
		x4 = _mm_clmulepi64_si128(x0, k, 0x00);
		x0 = _mm_clmulepi64_si128(x0, k, 0x11);
		x0 = _mm_xor_si128(x0, x1);
		x0 = _mm_xor_si128(x0, x4);
		x4 = _mm_clmulepi64_si128(x0, k, 0x00);
		x0 = _mm_clmulepi64_si128(x0, k, 0x11);
		x0 = _mm_xor_si128(x0, x2);
		x0 = _mm_xor_si128(x0, x4);
		x4 = _mm_clmulepi64_si128(x0, k, 0x00);
		x0 = _mm_clmulepi64_si128(x0, k, 0x11);
		x0 = _mm_xor_si128(x0, x3);
		x0 = _mm_xor_si128(x0, x4);
	}

	k = _mm_loadu_si128((__m128i *)consts->k3k4);
	/* folding by 1 */
	while (nr >= CRC32_FOLDING_BLOCK_SIZE) {
		/* load next to x2, fold to x0, x1 */
		x2 = _mm_loadu_si128((__m128i *)(p + 0x00));
		x2 = _mm_shuffle_epi8(x2, shuf_mask); /* endianness swap */
		x1 = _mm_clmulepi64_si128(x0, k, 0x00);
		x0 = _mm_clmulepi64_si128(x0, k, 0x11);
		x0 = _mm_xor_si128(x0, x2);
		x0 = _mm_xor_si128(x0, x1);
		p += CRC32_FOLDING_BLOCK_SIZE;
		nr -= CRC32_FOLDING_BLOCK_SIZE;
	}

	/* reduce 128 bits (final fold) to 96 bits */
	k = _mm_loadu_si128((__m128i *)consts->k5k6);
	x1 = _mm_clmulepi64_si128(x0, k, 0x11);
	x0 = _mm_slli_si128(x0, 8);
	x0 = _mm_srli_si128(x0, 4);
	x0 = _mm_xor_si128(x0, x1);
	/* reduce 96 bits to 64 bits */
	x1 = _mm_clmulepi64_si128(x0, k, 0x01);
	x0 = _mm_xor_si128(x0, x1);

	/* Barrett reduction */
	k = _mm_loadu_si128((__m128i *)consts->uPx);
	x1 = _mm_move_epi64(x0);
	x1 = _mm_srli_si128(x1, 4);
	x1 = _mm_clmulepi64_si128(x1, k, 0x00);
	x1 = _mm_srli_si128(x1, 4);
	x1 = _mm_clmulepi64_si128(x1, k, 0x10);
	x0 = _mm_xor_si128(x1, x0);
	*crc = _mm_extract_epi32(x0, 0);
	return (nr_in - nr); /* the number of bytes processed */
}

/* PCLMUL version of reflected crc32 */
ZEND_INTRIN_SSE4_2_PCLMUL_FUNC_DECL(size_t crc32_pclmul_reflected_batch(uint32_t *crc, const unsigned char *p, size_t nr, const crc32_pclmul_consts *consts));
size_t crc32_pclmul_reflected_batch(uint32_t *crc, const unsigned char *p, size_t nr, const crc32_pclmul_consts *consts)
{
	size_t nr_in = nr;
	__m128i x0, x1, x2, k;

	if (nr < CRC32_FOLDING_BLOCK_SIZE) {
		return 0;
	}

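	/* Reflected variant: input is consumed in native little-endian order, so
	 * no byte shuffle is needed; XOR the running CRC into the low 32 bits of
	 * the first block. */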
	x0 = _mm_loadu_si128((__m128i *)(p + 0x00));
	x0 = _mm_xor_si128(x0, _mm_cvtsi32_si128(*crc));
	p += CRC32_FOLDING_BLOCK_SIZE;
	nr -= CRC32_FOLDING_BLOCK_SIZE;
	if (nr >= (CRC32_FOLDING_BLOCK_SIZE * 3)) {
		__m128i x3, x4;

		x1 = _mm_loadu_si128((__m128i *)(p + 0x00));
		x2 = _mm_loadu_si128((__m128i *)(p + 0x10));
		x3 = _mm_loadu_si128((__m128i *)(p + 0x20));
		p += CRC32_FOLDING_BLOCK_SIZE * 3;
		nr -= CRC32_FOLDING_BLOCK_SIZE * 3;

		k = _mm_loadu_si128((__m128i *)consts->k1k2);
		/* parallel folding by 4 */
		while (nr >= (CRC32_FOLDING_BLOCK_SIZE * 4)) {
			__m128i x5, x6, x7, x8, x9, x10, x11;
			x4 = _mm_clmulepi64_si128(x0, k, 0x00);
			x5 = _mm_clmulepi64_si128(x1, k, 0x00);
			x6 = _mm_clmulepi64_si128(x2, k, 0x00);
			x7 = _mm_clmulepi64_si128(x3, k, 0x00);
			x0 = _mm_clmulepi64_si128(x0, k, 0x11);
			x1 = _mm_clmulepi64_si128(x1, k, 0x11);
			x2 = _mm_clmulepi64_si128(x2, k, 0x11);
			x3 = _mm_clmulepi64_si128(x3, k, 0x11);
			x8 = _mm_loadu_si128((__m128i *)(p + 0x00));
			x9 = _mm_loadu_si128((__m128i *)(p + 0x10));
			x10 = _mm_loadu_si128((__m128i *)(p + 0x20));
			x11 = _mm_loadu_si128((__m128i *)(p + 0x30));
			x0 = _mm_xor_si128(x0, x4);
			x1 = _mm_xor_si128(x1, x5);
			x2 = _mm_xor_si128(x2, x6);
			x3 = _mm_xor_si128(x3, x7);
			x0 = _mm_xor_si128(x0, x8);
			x1 = _mm_xor_si128(x1, x9);
			x2 = _mm_xor_si128(x2, x10);
			x3 = _mm_xor_si128(x3, x11);

			p += CRC32_FOLDING_BLOCK_SIZE * 4;
			nr -= CRC32_FOLDING_BLOCK_SIZE * 4;
		}

		k = _mm_loadu_si128((__m128i *)consts->k3k4);
		/* fold 4 to 1, [x1, x2, x3] -> x0 */
		x4 = _mm_clmulepi64_si128(x0, k, 0x00);
		x0 = _mm_clmulepi64_si128(x0, k, 0x11);
		x0 = _mm_xor_si128(x0, x1);
		x0 = _mm_xor_si128(x0, x4);
		x4 = _mm_clmulepi64_si128(x0, k, 0x00);
		x0 = _mm_clmulepi64_si128(x0, k, 0x11);
		x0 = _mm_xor_si128(x0, x2);
		x0 = _mm_xor_si128(x0, x4);
		x4 = _mm_clmulepi64_si128(x0, k, 0x00);
		x0 = _mm_clmulepi64_si128(x0, k, 0x11);
		x0 = _mm_xor_si128(x0, x3);
		x0 = _mm_xor_si128(x0, x4);
	}

	k = _mm_loadu_si128((__m128i *)consts->k3k4);
	/* folding by 1 */
	while (nr >= CRC32_FOLDING_BLOCK_SIZE) {
		/* load next to x2, fold to x0, x1 */
		x2 = _mm_loadu_si128((__m128i *)(p + 0x00));
		x1 = _mm_clmulepi64_si128(x0, k, 0x00);
		x0 = _mm_clmulepi64_si128(x0, k, 0x11);
		x0 = _mm_xor_si128(x0, x2);
		x0 = _mm_xor_si128(x0, x1);
		p += CRC32_FOLDING_BLOCK_SIZE;
		nr -= CRC32_FOLDING_BLOCK_SIZE;
	}

	/* reduce 128 bits (final fold) to 96 bits */
	x1 = _mm_clmulepi64_si128(x0, k, 0x10);
	x0 = _mm_srli_si128(x0, 8);
	x0 = _mm_xor_si128(x0, x1);
	/* reduce 96 bits to 64 bits */
	x1 = _mm_shuffle_epi32(x0, 0xfc);
	x0 = _mm_shuffle_epi32(x0, 0xf9);
	k = _mm_loadu_si128((__m128i *)consts->k5k6);
	x1 = _mm_clmulepi64_si128(x1, k, 0x00);
	x0 = _mm_xor_si128(x0, x1);

	/* Barrett reduction */
	x1 = _mm_shuffle_epi32(x0, 0xf3);
	x0 = _mm_slli_si128(x0, 4);
	k = _mm_loadu_si128((__m128i *)consts->uPx);
	x1 = _mm_clmulepi64_si128(x1, k, 0x00);
	x1 = _mm_clmulepi64_si128(x1, k, 0x10);
	x0 = _mm_xor_si128(x1, x0);
	*crc = _mm_extract_epi32(x0, 2);
	return (nr_in - nr); /* the number of bytes processed */
}

# if defined(ZEND_INTRIN_SSE4_2_PCLMUL_NATIVE)
size_t crc32_x86_simd_update(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr)
# else /* ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER */
size_t crc32_sse42_pclmul_update(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr)
# endif
{
	if (type > X86_CRC32_MAX) {
		return 0;
	}
	const crc32_pclmul_consts *consts = &crc32_pclmul_consts_maps[type];

	switch (type) {
	case X86_CRC32:
		return crc32_pclmul_batch(crc, p, nr, consts);
	case X86_CRC32B:
	case X86_CRC32C:
		return crc32_pclmul_reflected_batch(crc, p, nr, consts);
	default:
		return 0;
	}
}
#endif

#ifdef ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER
static size_t crc32_x86_simd_update_default(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr)
{
	return 0;
}

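/* Dispatch: with ZEND_INTRIN_SSE4_2_PCLMUL_FUNC_PROTO the implementation is
 * selected once at load time via a GNU ifunc resolver; otherwise a function
 * pointer is installed during module startup (PHP_MINIT). */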
# ifdef ZEND_INTRIN_SSE4_2_PCLMUL_FUNC_PROTO
size_t crc32_x86_simd_update(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr) __attribute__((ifunc("resolve_crc32_x86_simd_update")));

typedef size_t (*crc32_x86_simd_func_t)(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr);

ZEND_NO_SANITIZE_ADDRESS
ZEND_ATTRIBUTE_UNUSED /* clang mistakenly warns about this */
static crc32_x86_simd_func_t resolve_crc32_x86_simd_update(void) {
	if (zend_cpu_supports_sse42() && zend_cpu_supports_pclmul()) {
		return crc32_sse42_pclmul_update;
	}
	return crc32_x86_simd_update_default;
}
# else /* ZEND_INTRIN_SSE4_2_PCLMUL_FUNC_PTR */
static size_t (*crc32_x86_simd_ptr)(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr) = crc32_x86_simd_update_default;

size_t crc32_x86_simd_update(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr) {
	return crc32_x86_simd_ptr(type, crc, p, nr);
}

/* {{{ PHP_MINIT_FUNCTION */
PHP_MINIT_FUNCTION(crc32_x86_intrin)
{
	if (zend_cpu_supports_sse42() && zend_cpu_supports_pclmul()) {
		crc32_x86_simd_ptr = crc32_sse42_pclmul_update;
	}
	return SUCCESS;
}
/* }}} */
# endif
#endif