1 /*
2 +----------------------------------------------------------------------+
3 | Copyright (c) The PHP Group |
4 +----------------------------------------------------------------------+
5 | This source file is subject to version 3.01 of the PHP license, |
6 | that is bundled with this package in the file LICENSE, and is |
7 | available through the world-wide-web at the following url: |
8 | https://www.php.net/license/3_01.txt |
9 | If you did not receive a copy of the PHP license and are unable to |
10 | obtain it through the world-wide-web, please send a note to |
11 | license@php.net so we can mail you a copy immediately. |
12 +----------------------------------------------------------------------+
13 | Author: Jim Winstead <jimw@php.net> |
14 | Xinchen Hui <laruence@php.net> |
15 +----------------------------------------------------------------------+
16 */
17
18 #include <string.h>
19
20 #include "php.h"
21 #include "base64.h"
22
23 /* {{{ base64 tables */
/* Encoding alphabet: index is a 6-bit value (0..63), element is the Base64
 * character for it. The string literal's implicit terminator supplies the
 * trailing NUL entry, so the array holds 65 bytes. */
static const char base64_table[] =
	"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
	"abcdefghijklmnopqrstuvwxyz"
	"0123456789+/";

/* Character appended to fill an incomplete trailing 4-character group. */
static const char base64_pad = '=';
33
/* Decoding table indexed by input byte value:
 *   0..63  value of a Base64 alphabet character
 *   -1     whitespace (TAB, LF, CR, SPACE)
 *   -2     any other byte */
static const short base64_reverse_table[256] = {
	/* 0x00 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -2, -2, -1, -2, -2,
	/* 0x10 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
	/* 0x20 */ -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, 62, -2, -2, -2, 63,
	/* 0x30 */ 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -2, -2, -2, -2, -2, -2,
	/* 0x40 */ -2,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
	/* 0x50 */ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -2, -2, -2, -2, -2,
	/* 0x60 */ -2, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
	/* 0x70 */ 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -2, -2, -2, -2, -2,
	/* 0x80 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
	/* 0x90 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
	/* 0xa0 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
	/* 0xb0 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
	/* 0xc0 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
	/* 0xd0 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
	/* 0xe0 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
	/* 0xf0 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2
};
52 /* }}} */
53
54 #if defined(__aarch64__) || defined(_M_ARM64)
55 #include <arm_neon.h>
56
/* Convert sixteen 6-bit values (one per byte lane of `input`) to their Base64
 * ASCII characters by adding a per-range offset fetched from `shift_LUT`.
 *
 * LUT slot chosen for each input range:
 *    0..25 -> 13       (offset 'A')
 *   26..51 -> 0        (offset 'a' - 26)
 *   52..61 -> 1 .. 10  (offset '0' - 52)
 *       62 -> 11       (offset '+' - 62)
 *       63 -> 12       (offset '/' - 63) */
static zend_always_inline uint8x16_t encode_toascii(const uint8x16_t input, const uint8x16x2_t shift_LUT)
{
	/* Saturating subtract collapses 0..51 to 0 and maps 52..63 to 1..12. */
	const uint8x16_t above_51 = vqsubq_u8(input, vdupq_n_u8(51));
	/* All-ones in lanes whose value is below 26. */
	const uint8x16_t below_26 = vcgtq_u8(vdupq_n_u8(26), input);
	/* Lanes below 26 currently hold 0; OR-ing in 13 moves them to slot 13. */
	const uint8x16_t slot = vorrq_u8(above_51, vandq_u8(below_26, vdupq_n_u8(13)));
	/* Fetch the offset and apply it (modulo-256 addition). */
	const uint8x16_t offset = vqtbl2q_u8(shift_LUT, slot);

	return vaddq_u8(offset, input);
}
73
/* NEON bulk encoder: consumes the input in 48-byte chunks and writes 64
 * Base64 characters per chunk.
 *
 * in    input bytes
 * inl   number of input bytes; the loop body always runs once, so the caller
 *       must guarantee inl >= 48
 * out   output buffer, 64 bytes are stored per chunk consumed
 * left  receives the count of trailing input bytes (< 48) not consumed
 *
 * Returns the output pointer advanced past everything written. */
static zend_always_inline unsigned char *neon_base64_encode(const unsigned char *in, size_t inl, unsigned char *out, size_t *left)
{
	/* Offset added to a 6-bit value to obtain its ASCII character; indexed by
	 * the slot computed in encode_toascii(). Both 16-byte halves are equal. */
	const uint8_t shift_LUT_[32] = {'a' - 26, '0' - 52, '0' - 52, '0' - 52,
					'0' - 52, '0' - 52, '0' - 52, '0' - 52,
					'0' - 52, '0' - 52, '0' - 52, '+' - 62,
					'/' - 63, 'A',      0,        0,
					'a' - 26, '0' - 52, '0' - 52, '0' - 52,
					'0' - 52, '0' - 52, '0' - 52, '0' - 52,
					'0' - 52, '0' - 52, '0' - 52, '+' - 62,
					'/' - 63, 'A',      0,        0};
	/* Load through the byte-load intrinsic rather than dereferencing a cast
	 * pointer: the array is only byte-aligned, the vector type is not. */
	uint8x16x2_t shift_LUT;
	shift_LUT.val[0] = vld1q_u8(shift_LUT_);
	shift_LUT.val[1] = vld1q_u8(shift_LUT_ + 16);

	do {
		/* De-interleaving load of 16 three-byte groups:
		   [ccdddddd | bbbbcccc | aaaaaabb]
		    x.val[2] | x.val[1] | x.val[0] */
		const uint8x16x3_t x = vld3q_u8((const uint8_t *)(in));

		/* [00aa_aaaa] */
		const uint8x16_t field_a = vshrq_n_u8(x.val[0], 2);

		const uint8x16_t field_b =             /* [00bb_bbbb] */
			vbslq_u8(vdupq_n_u8(0x30),         /* [0011_0000] */
			         vshlq_n_u8(x.val[0], 4),  /* [aabb_0000] */
			         vshrq_n_u8(x.val[1], 4)); /* [0000_bbbb] */

		const uint8x16_t field_c =             /* [00cc_cccc] */
			vbslq_u8(vdupq_n_u8(0x3c),         /* [0011_1100] */
			         vshlq_n_u8(x.val[1], 2),  /* [bbcc_cc00] */
			         vshrq_n_u8(x.val[2], 6)); /* [0000_00cc] */

		/* [00dd_dddd] */
		const uint8x16_t field_d = vandq_u8(x.val[2], vdupq_n_u8(0x3f));

		uint8x16x4_t result;
		result.val[0] = encode_toascii(field_a, shift_LUT);
		result.val[1] = encode_toascii(field_b, shift_LUT);
		result.val[2] = encode_toascii(field_c, shift_LUT);
		result.val[3] = encode_toascii(field_d, shift_LUT);

		/* Interleaving store: a0 b0 c0 d0 a1 b1 c1 d1 ... */
		vst4q_u8((uint8_t *)out, result);
		out += 64;
		in += 16 * 3;
		inl -= 16 * 3;
	} while (inl >= 16 * 3);

	*left = inl;
	return out;
}
121 #endif /* defined(__aarch64__) || defined(_M_ARM64) */
122
php_base64_encode_impl(const unsigned char * in,size_t inl,unsigned char * out)123 static zend_always_inline unsigned char *php_base64_encode_impl(const unsigned char *in, size_t inl, unsigned char *out) /* {{{ */
124 {
125 #if defined(__aarch64__) || defined(_M_ARM64)
126 if (inl >= 16 * 3) {
127 size_t left = 0;
128 out = neon_base64_encode(in, inl, out, &left);
129 in += inl - left;
130 inl = left;
131 }
132 #endif
133
134 while (inl > 2) { /* keep going until we have less than 24 bits */
135 *out++ = base64_table[in[0] >> 2];
136 *out++ = base64_table[((in[0] & 0x03) << 4) + (in[1] >> 4)];
137 *out++ = base64_table[((in[1] & 0x0f) << 2) + (in[2] >> 6)];
138 *out++ = base64_table[in[2] & 0x3f];
139
140 in += 3;
141 inl -= 3; /* we just handle 3 octets of data */
142 }
143
144 /* now deal with the tail end of things */
145 if (inl != 0) {
146 *out++ = base64_table[in[0] >> 2];
147 if (inl > 1) {
148 *out++ = base64_table[((in[0] & 0x03) << 4) + (in[1] >> 4)];
149 *out++ = base64_table[(in[1] & 0x0f) << 2];
150 *out++ = base64_pad;
151 } else {
152 *out++ = base64_table[(in[0] & 0x03) << 4];
153 *out++ = base64_pad;
154 *out++ = base64_pad;
155 }
156 }
157
158 *out = '\0';
159
160 return out;
161 }
162 /* }}} */
163
164 #if defined(__aarch64__) || defined(_M_ARM64)
/* Translate sixteen Base64 ASCII characters (one per byte lane) into their
 * 6-bit values and report validity.
 *
 * input      the characters
 * error      out: a lane is all-ones where (maskLUT[low nibble] &
 *            bitposLUT[high nibble]) is zero, otherwise zero
 * shiftLUT   per-high-nibble offset added to the character
 * maskLUT    per-low-nibble bit set used for validation
 * bitposLUT  per-high-nibble single-bit selector used for validation
 *
 * Returns input + offset (modulo 256); only meaningful for lanes that are
 * not flagged in *error. */
static zend_always_inline uint8x16_t decode_fromascii(const uint8x16_t input, uint8x16_t *error, const uint8x16x2_t shiftLUT, const uint8x16x2_t maskLUT, const uint8x16x2_t bitposLUT) {
	const uint8x16_t hi = vshrq_n_u8(input, 4);
	const uint8x16_t lo = vandq_u8(input, vdupq_n_u8(0x0f));

	/* validity check */
	const uint8x16_t allowed = vqtbl2q_u8(maskLUT, lo);
	const uint8x16_t selector = vqtbl2q_u8(bitposLUT, hi);
	*error = vceqq_u8(vandq_u8(allowed, selector), vdupq_n_u8(0));

	/* offset lookup; '/' (0x2f) shares its high nibble with '+', so it gets
	 * its own offset of 16 instead of the table value */
	const uint8x16_t table_offset = vqtbl2q_u8(shiftLUT, hi);
	const uint8x16_t is_slash = vceqq_u8(input, vdupq_n_u8(0x2f));
	const uint8x16_t offset = vbslq_u8(is_slash, vdupq_n_u8(16), table_offset);

	return vaddq_u8(input, offset);
}
176
/* NEON bulk decoder: consumes the input in 64-character chunks and writes 48
 * decoded bytes per chunk. Stops before the first chunk that contains a byte
 * rejected by decode_fromascii(), leaving that chunk and everything after it
 * for the scalar decoder.
 *
 * in    input characters
 * inl   number of input characters; the loop body always runs once, so the
 *       caller must guarantee inl >= 64
 * out   output buffer, 48 bytes are stored per chunk consumed
 * left  receives the count of input characters not consumed
 *
 * Returns the number of bytes written to `out`. */
static zend_always_inline size_t neon_base64_decode(const unsigned char *in, size_t inl, unsigned char *out, size_t *left) {
	unsigned char *out_orig = out;
	/* Each table holds the same 16 entries twice. */
	const uint8_t shiftLUT_[32] = {
		0,   0,  19,   4, (uint8_t)-65, (uint8_t)-65, (uint8_t)-71, (uint8_t)-71,
		0,   0,   0,   0,   0,   0,   0,   0,
		0,   0,  19,   4, (uint8_t)-65, (uint8_t)-65, (uint8_t)-71, (uint8_t)-71,
		0,   0,   0,   0,   0,   0,   0,   0};
	const uint8_t maskLUT_[32] = {
		/* 0        : 0b1010_1000*/ 0xa8,
		/* 1 .. 9   : 0b1111_1000*/ 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8,
		/* 10       : 0b1111_0000*/ 0xf0,
		/* 11       : 0b0101_0100*/ 0x54,
		/* 12 .. 14 : 0b0101_0000*/ 0x50, 0x50, 0x50,
		/* 15       : 0b0101_0100*/ 0x54,

		/* 0        : 0b1010_1000*/ 0xa8,
		/* 1 .. 9   : 0b1111_1000*/ 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8,
		/* 10       : 0b1111_0000*/ 0xf0,
		/* 11       : 0b0101_0100*/ 0x54,
		/* 12 .. 14 : 0b0101_0000*/ 0x50, 0x50, 0x50,
		/* 15       : 0b0101_0100*/ 0x54
	};
	const uint8_t bitposLUT_[32] = {
		0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,

		0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
	};
	/* Load through the byte-load intrinsic rather than dereferencing cast
	 * pointers: the arrays are only byte-aligned, the vector type is not. */
	uint8x16x2_t shiftLUT, maskLUT, bitposLUT;
	shiftLUT.val[0] = vld1q_u8(shiftLUT_);
	shiftLUT.val[1] = vld1q_u8(shiftLUT_ + 16);
	maskLUT.val[0] = vld1q_u8(maskLUT_);
	maskLUT.val[1] = vld1q_u8(maskLUT_ + 16);
	bitposLUT.val[0] = vld1q_u8(bitposLUT_);
	bitposLUT.val[1] = vld1q_u8(bitposLUT_ + 16);

	do {
		/* De-interleaving load: val[k] holds the k-th character of each of
		 * the 16 four-character groups. */
		const uint8x16x4_t x = vld4q_u8((const unsigned char *)in);
		uint8x16_t error_a;
		uint8x16_t error_b;
		uint8x16_t error_c;
		uint8x16_t error_d;
		uint8x16_t field_a = decode_fromascii(x.val[0], &error_a, shiftLUT, maskLUT, bitposLUT);
		uint8x16_t field_b = decode_fromascii(x.val[1], &error_b, shiftLUT, maskLUT, bitposLUT);
		uint8x16_t field_c = decode_fromascii(x.val[2], &error_c, shiftLUT, maskLUT, bitposLUT);
		uint8x16_t field_d = decode_fromascii(x.val[3], &error_d, shiftLUT, maskLUT, bitposLUT);

		const uint8x16_t err = vorrq_u8(vorrq_u8(error_a, error_b), vorrq_u8(error_c, error_d));
		union {uint8_t mem[16]; uint64_t dw[2]; } error;
		vst1q_u8(error.mem, err);

		/* Check that the input only contains bytes belonging to the alphabet of
		   Base64. If there are errors, decode the rest of the string with the
		   scalar decoder. */
		if (error.dw[0] | error.dw[1])
			break;

		/* Pack four 6-bit fields into three bytes. */
		uint8x16x3_t result;
		result.val[0] = vorrq_u8(vshrq_n_u8(field_b, 4), vshlq_n_u8(field_a, 2));
		result.val[1] = vorrq_u8(vshrq_n_u8(field_c, 2), vshlq_n_u8(field_b, 4));
		result.val[2] = vorrq_u8(field_d, vshlq_n_u8(field_c, 6));

		vst3q_u8((unsigned char *)out, result);
		out += 16 * 3;
		in += 16 * 4;
		inl -= 16 * 4;
	} while (inl >= 16 * 4);
	*left = inl;
	return out - out_orig;
}
244 #endif /* defined(__aarch64__) || defined(_M_ARM64) */
245
php_base64_decode_impl(const unsigned char * in,size_t inl,unsigned char * out,size_t * outl,bool strict)246 static zend_always_inline int php_base64_decode_impl(const unsigned char *in, size_t inl, unsigned char *out, size_t *outl, bool strict) /* {{{ */
247 {
248 int ch;
249 size_t i = 0, padding = 0, j = *outl;
250
251 #if defined(__aarch64__) || defined(_M_ARM64)
252 if (inl >= 16 * 4) {
253 size_t left = 0;
254 j += neon_base64_decode(in, inl, out, &left);
255 i = inl - left;
256 in += i;
257 inl = left;
258 }
259 #endif
260
261 /* run through the whole string, converting as we go */
262 while (inl-- > 0) {
263 ch = *in++;
264 if (ch == base64_pad) {
265 padding++;
266 continue;
267 }
268
269 ch = base64_reverse_table[ch];
270 if (!strict) {
271 /* skip unknown characters and whitespace */
272 if (ch < 0) {
273 continue;
274 }
275 } else {
276 /* skip whitespace */
277 if (ch == -1) {
278 continue;
279 }
280 /* fail on bad characters or if any data follows padding */
281 if (ch == -2 || padding) {
282 goto fail;
283 }
284 }
285
286 switch (i % 4) {
287 case 0:
288 out[j] = ch << 2;
289 break;
290 case 1:
291 out[j++] |= ch >> 4;
292 out[j] = (ch & 0x0f) << 4;
293 break;
294 case 2:
295 out[j++] |= ch >>2;
296 out[j] = (ch & 0x03) << 6;
297 break;
298 case 3:
299 out[j++] |= ch;
300 break;
301 }
302 i++;
303 }
304
305 /* fail if the input is truncated (only one char in last group) */
306 if (strict && i % 4 == 1) {
307 goto fail;
308 }
309
310 /* fail if the padding length is wrong (not VV==, VVV=), but accept zero padding
311 * RFC 4648: "In some circumstances, the use of padding [--] is not required" */
312 if (strict && padding && (padding > 2 || (i + padding) % 4 != 0)) {
313 goto fail;
314 }
315
316 *outl = j;
317 out[j] = '\0';
318
319 return 1;
320
321 fail:
322 return 0;
323 }
324 /* }}} */
325
326 /* {{{ php_base64_encode */
327
/* Adjust the SSSE3 configuration macros according to how AVX2 is provided. */
#if ZEND_INTRIN_AVX2_NATIVE
/* AVX2 is native: undefine every SSSE3 mode. */
# undef ZEND_INTRIN_SSSE3_NATIVE
# undef ZEND_INTRIN_SSSE3_RESOLVER
# undef ZEND_INTRIN_SSSE3_FUNC_PROTO
# undef ZEND_INTRIN_SSSE3_FUNC_PTR
#elif ZEND_INTRIN_AVX2_FUNC_PROTO && ZEND_INTRIN_SSSE3_NATIVE
/* AVX2 goes through the FUNC_PROTO resolver while SSSE3 is native: switch
 * SSSE3 to resolver/FUNC_PROTO mode and redefine its declaration macro. */
# undef ZEND_INTRIN_SSSE3_NATIVE
# undef ZEND_INTRIN_SSSE3_RESOLVER
# define ZEND_INTRIN_SSSE3_RESOLVER 1
# define ZEND_INTRIN_SSSE3_FUNC_PROTO 1
# undef ZEND_INTRIN_SSSE3_FUNC_DECL
# ifdef HAVE_FUNC_ATTRIBUTE_TARGET
#  define ZEND_INTRIN_SSSE3_FUNC_DECL(func) ZEND_API func __attribute__((target("ssse3")))
# else
#  define ZEND_INTRIN_SSSE3_FUNC_DECL(func) ZEND_API func
# endif
#elif ZEND_INTRIN_AVX2_FUNC_PTR && ZEND_INTRIN_SSSE3_NATIVE
/* Same as the previous branch, but for the FUNC_PTR dispatch mode. */
# undef ZEND_INTRIN_SSSE3_NATIVE
# undef ZEND_INTRIN_SSSE3_RESOLVER
# define ZEND_INTRIN_SSSE3_RESOLVER 1
# define ZEND_INTRIN_SSSE3_FUNC_PTR 1
# undef ZEND_INTRIN_SSSE3_FUNC_DECL
# ifdef HAVE_FUNC_ATTRIBUTE_TARGET
#  define ZEND_INTRIN_SSSE3_FUNC_DECL(func) ZEND_API func __attribute__((target("ssse3")))
# else
#  define ZEND_INTRIN_SSSE3_FUNC_DECL(func) ZEND_API func
# endif
#endif
356
/* Only enable the AVX512 / AVX512-VBMI resolvers when AVX2 uses a resolver of
 * the same kind (FUNC_PROTO with FUNC_PROTO, FUNC_PTR with FUNC_PTR). */
#if ZEND_INTRIN_AVX2_FUNC_PROTO && ZEND_INTRIN_AVX512_FUNC_PROTO
#define BASE64_INTRIN_AVX512_FUNC_PROTO 1
#endif
#if ZEND_INTRIN_AVX2_FUNC_PTR && ZEND_INTRIN_AVX512_FUNC_PTR
#define BASE64_INTRIN_AVX512_FUNC_PTR 1
#endif
#if ZEND_INTRIN_AVX2_FUNC_PROTO && ZEND_INTRIN_AVX512_VBMI_FUNC_PROTO
#define BASE64_INTRIN_AVX512_VBMI_FUNC_PROTO 1
#endif
#if ZEND_INTRIN_AVX2_FUNC_PTR && ZEND_INTRIN_AVX512_VBMI_FUNC_PTR
#define BASE64_INTRIN_AVX512_VBMI_FUNC_PTR 1
#endif
370
371 #if ZEND_INTRIN_AVX2_NATIVE
372 # include <immintrin.h>
373 #elif ZEND_INTRIN_SSSE3_NATIVE
374 # include <tmmintrin.h>
375 #elif (ZEND_INTRIN_SSSE3_RESOLVER || ZEND_INTRIN_AVX2_RESOLVER)
376 # if ZEND_INTRIN_AVX2_RESOLVER
377 # include <immintrin.h>
378 # else
379 # include <tmmintrin.h>
#  endif /* ZEND_INTRIN_AVX2_RESOLVER */
381 # include "Zend/zend_cpuinfo.h"
382
/* Prototypes of the instruction-set-specific implementations that the
 * dispatch code below chooses between. Each pair is declared only when the
 * corresponding resolver mode is enabled. */
# if BASE64_INTRIN_AVX512_FUNC_PROTO || BASE64_INTRIN_AVX512_FUNC_PTR
ZEND_INTRIN_AVX512_FUNC_DECL(zend_string *php_base64_encode_avx512(const unsigned char *str, size_t length));
ZEND_INTRIN_AVX512_FUNC_DECL(zend_string *php_base64_decode_ex_avx512(const unsigned char *str, size_t length, bool strict));
# endif
# if BASE64_INTRIN_AVX512_VBMI_FUNC_PROTO || BASE64_INTRIN_AVX512_VBMI_FUNC_PTR
ZEND_INTRIN_AVX512_VBMI_FUNC_DECL(zend_string *php_base64_encode_avx512_vbmi(const unsigned char *str, size_t length));
ZEND_INTRIN_AVX512_VBMI_FUNC_DECL(zend_string *php_base64_decode_ex_avx512_vbmi(const unsigned char *str, size_t length, bool strict));
# endif

# if ZEND_INTRIN_AVX2_RESOLVER
ZEND_INTRIN_AVX2_FUNC_DECL(zend_string *php_base64_encode_avx2(const unsigned char *str, size_t length));
ZEND_INTRIN_AVX2_FUNC_DECL(zend_string *php_base64_decode_ex_avx2(const unsigned char *str, size_t length, bool strict));
# endif

# if ZEND_INTRIN_SSSE3_RESOLVER
ZEND_INTRIN_SSSE3_FUNC_DECL(zend_string *php_base64_encode_ssse3(const unsigned char *str, size_t length));
ZEND_INTRIN_SSSE3_FUNC_DECL(zend_string *php_base64_decode_ex_ssse3(const unsigned char *str, size_t length, bool strict));
# endif

/* Fallback implementations, returned when no CPU feature check succeeds. */
zend_string *php_base64_encode_default(const unsigned char *str, size_t length);
zend_string *php_base64_decode_ex_default(const unsigned char *str, size_t length, bool strict);
404
405 # if (ZEND_INTRIN_AVX2_FUNC_PROTO || ZEND_INTRIN_SSSE3_FUNC_PROTO || BASE64_INTRIN_AVX512_FUNC_PROTO || BASE64_INTRIN_AVX512_VBMI_FUNC_PROTO)
/* FUNC_PROTO mode: the public entry points are declared with the ifunc
 * attribute, naming the resolver function that supplies the implementation. */
PHPAPI zend_string *php_base64_encode(const unsigned char *str, size_t length) __attribute__((ifunc("resolve_base64_encode")));
PHPAPI zend_string *php_base64_decode_ex(const unsigned char *str, size_t length, bool strict) __attribute__((ifunc("resolve_base64_decode")));

/* Function-pointer types returned by the two resolvers. */
typedef zend_string *(*base64_encode_func_t)(const unsigned char *, size_t);
typedef zend_string *(*base64_decode_func_t)(const unsigned char *, size_t, bool);
411
412 ZEND_NO_SANITIZE_ADDRESS
413 ZEND_ATTRIBUTE_UNUSED /* clang mistakenly warns about this */
resolve_base64_encode(void)414 static base64_encode_func_t resolve_base64_encode(void) {
415 # if BASE64_INTRIN_AVX512_VBMI_FUNC_PROTO
416 if (zend_cpu_supports_avx512_vbmi()) {
417 return php_base64_encode_avx512_vbmi;
418 } else
419 # endif
420 # if BASE64_INTRIN_AVX512_FUNC_PROTO
421 if (zend_cpu_supports_avx512()) {
422 return php_base64_encode_avx512;
423 } else
424 # endif
425 # if ZEND_INTRIN_AVX2_FUNC_PROTO
426 if (zend_cpu_supports_avx2()) {
427 return php_base64_encode_avx2;
428 } else
429 # endif
430 #if ZEND_INTRIN_SSSE3_FUNC_PROTO
431 if (zend_cpu_supports_ssse3()) {
432 return php_base64_encode_ssse3;
433 }
434 #endif
435 return php_base64_encode_default;
436 }
437
438 ZEND_NO_SANITIZE_ADDRESS
439 ZEND_ATTRIBUTE_UNUSED /* clang mistakenly warns about this */
resolve_base64_decode(void)440 static base64_decode_func_t resolve_base64_decode(void) {
441 # if BASE64_INTRIN_AVX512_VBMI_FUNC_PROTO
442 if (zend_cpu_supports_avx512_vbmi()) {
443 return php_base64_decode_ex_avx512_vbmi;
444 } else
445 # endif
446 # if BASE64_INTRIN_AVX512_FUNC_PROTO
447 if (zend_cpu_supports_avx512()) {
448 return php_base64_decode_ex_avx512;
449 } else
450 # endif
451 # if ZEND_INTRIN_AVX2_FUNC_PROTO
452 if (zend_cpu_supports_avx2()) {
453 return php_base64_decode_ex_avx2;
454 } else
455 # endif
456 #if ZEND_INTRIN_SSSE3_FUNC_PROTO
457 if (zend_cpu_supports_ssse3()) {
458 return php_base64_decode_ex_ssse3;
459 }
460 #endif
461 return php_base64_decode_ex_default;
462 }
463 # else /* (ZEND_INTRIN_AVX2_FUNC_PROTO || ZEND_INTRIN_SSSE3_FUNC_PROTO) */
464
/* FUNC_PTR mode: pointers to the selected implementations. They start out
 * NULL and are assigned in PHP_MINIT_FUNCTION(base64_intrin). */
PHPAPI zend_string *(*php_base64_encode_ptr)(const unsigned char *str, size_t length) = NULL;
PHPAPI zend_string *(*php_base64_decode_ex_ptr)(const unsigned char *str, size_t length, bool strict) = NULL;
467
php_base64_encode(const unsigned char * str,size_t length)468 PHPAPI zend_string *php_base64_encode(const unsigned char *str, size_t length) {
469 return php_base64_encode_ptr(str, length);
470 }
php_base64_decode_ex(const unsigned char * str,size_t length,bool strict)471 PHPAPI zend_string *php_base64_decode_ex(const unsigned char *str, size_t length, bool strict) {
472 return php_base64_decode_ex_ptr(str, length, strict);
473 }
474
PHP_MINIT_FUNCTION(base64_intrin)475 PHP_MINIT_FUNCTION(base64_intrin)
476 {
477 # if BASE64_INTRIN_AVX512_VBMI_FUNC_PTR
478 if (zend_cpu_supports_avx512_vbmi()) {
479 php_base64_encode_ptr = php_base64_encode_avx512_vbmi;
480 php_base64_decode_ex_ptr = php_base64_decode_ex_avx512_vbmi;
481 } else
482 # endif
483 # if BASE64_INTRIN_AVX512_FUNC_PTR
484 if (zend_cpu_supports_avx512()) {
485 php_base64_encode_ptr = php_base64_encode_avx512;
486 php_base64_decode_ex_ptr = php_base64_decode_ex_avx512;
487 } else
488 # endif
489 # if ZEND_INTRIN_AVX2_FUNC_PTR
490 if (zend_cpu_supports_avx2()) {
491 php_base64_encode_ptr = php_base64_encode_avx2;
492 php_base64_decode_ex_ptr = php_base64_decode_ex_avx2;
493 } else
494 # endif
495 #if ZEND_INTRIN_SSSE3_FUNC_PTR
496 if (zend_cpu_supports_ssse3()) {
497 php_base64_encode_ptr = php_base64_encode_ssse3;
498 php_base64_decode_ex_ptr = php_base64_decode_ex_ssse3;
499 } else
500 #endif
501 {
502 php_base64_encode_ptr = php_base64_encode_default;
503 php_base64_decode_ex_ptr = php_base64_decode_ex_default;
504 }
505 return SUCCESS;
506 }
507 # endif /* (ZEND_INTRIN_AVX2_FUNC_PROTO || ZEND_INTRIN_SSSE3_FUNC_PROTO) */
508 #endif /* ZEND_INTRIN_AVX2_NATIVE */
509
510 #if BASE64_INTRIN_AVX512_VBMI_FUNC_PROTO || BASE64_INTRIN_AVX512_VBMI_FUNC_PTR
php_base64_encode_avx512_vbmi(const unsigned char * str,size_t length)511 zend_string *php_base64_encode_avx512_vbmi(const unsigned char *str, size_t length)
512 {
513 const unsigned char *c = str;
514 unsigned char *o;
515 zend_string *result;
516
517 result = zend_string_safe_alloc(((length + 2) / 3), 4 * sizeof(char), 0, 0);
518 o = (unsigned char *)ZSTR_VAL(result);
519
520 const __m512i shuffle_splitting = _mm512_setr_epi32(
521 0x01020001, 0x04050304, 0x07080607, 0x0a0b090a, 0x0d0e0c0d, 0x10110f10,
522 0x13141213, 0x16171516, 0x191a1819, 0x1c1d1b1c, 0x1f201e1f, 0x22232122,
523 0x25262425, 0x28292728, 0x2b2c2a2b, 0x2e2f2d2e);
524 const __m512i multi_shifts = _mm512_set1_epi64(0x3036242a1016040a);
525 const char *ascii_lookup_tbl = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
526 const __m512i ascii_lookup = _mm512_loadu_si512((__m512i *)ascii_lookup_tbl);
527
528 while (length > 63) {
529 /* Step 1: load input data */
530 __m512i str = _mm512_loadu_si512((const __m512i *)c);
531
532 /* Step 2: splitting 24-bit words into 32-bit lanes */
533 str = _mm512_permutexvar_epi8(shuffle_splitting, str);
534
535 /* Step 3: moving 6-bit word to sperate bytes */
536 str = _mm512_multishift_epi64_epi8(multi_shifts, str);
537
538 /* Step 4: conversion to ASCII */
539 str = _mm512_permutexvar_epi8(str, ascii_lookup);
540
541 /* Step 5: store the final result */
542 _mm512_storeu_si512((__m512i *)o, str);
543 c += 48;
544 o += 64;
545 length -= 48;
546 }
547
548 o = php_base64_encode_impl(c, length, o);
549
550 ZSTR_LEN(result) = (o - (unsigned char *)ZSTR_VAL(result));
551
552 return result;
553 }
554
php_base64_decode_ex_avx512_vbmi(const unsigned char * str,size_t length,bool strict)555 zend_string *php_base64_decode_ex_avx512_vbmi(const unsigned char *str, size_t length, bool strict)
556 {
557 const unsigned char *c = str;
558 unsigned char *o;
559 size_t outl = 0;
560 zend_string *result;
561
562 result = zend_string_alloc(length, 0);
563 o = (unsigned char *)ZSTR_VAL(result);
564
565 const __m512i lookup_0 = _mm512_setr_epi32(
566 0x80808080, 0x80808080, 0x80808080, 0x80808080, 0x80808080, 0x80808080,
567 0x80808080, 0x80808080, 0x80808080, 0x80808080, 0x3e808080, 0x3f808080,
568 0x37363534, 0x3b3a3938, 0x80803d3c, 0x80808080);
569 const __m512i lookup_1 = _mm512_setr_epi32(
570 0x02010080, 0x06050403, 0x0a090807, 0x0e0d0c0b, 0x1211100f, 0x16151413,
571 0x80191817, 0x80808080, 0x1c1b1a80, 0x201f1e1d, 0x24232221, 0x28272625,
572 0x2c2b2a29, 0x302f2e2d, 0x80333231, 0x80808080);
573
574 const __m512i merge_mask1 = _mm512_set1_epi32(0x01400140);
575 const __m512i merge_mask2 = _mm512_set1_epi32(0x00011000);
576
577 const __m512i continuous_mask = _mm512_setr_epi32(
578 0x06000102, 0x090a0405, 0x0c0d0e08, 0x16101112, 0x191a1415, 0x1c1d1e18,
579 0x26202122, 0x292a2425, 0x2c2d2e28, 0x36303132, 0x393a3435, 0x3c3d3e38,
580 0x00000000, 0x00000000, 0x00000000, 0x00000000);
581
582 while (length > 64) {
583 /* Step 1: load input data */
584 const __m512i input = _mm512_loadu_si512((__m512i *)c);
585
586 /* Step 2: translation into 6-bit values(saved on bytes) from ASCII and error detection */
587 __m512i str = _mm512_permutex2var_epi8(lookup_0, input, lookup_1);
588 const uint64_t mask = _mm512_movepi8_mask(_mm512_or_epi64(str, input)); /* convert MSBs to the mask */
589 if (mask) {
590 break;
591 }
592
593 /* Step 3: pack four fields within 32-bit words into 24-bit words. */
594 const __m512i merge_ab_and_bc = _mm512_maddubs_epi16(str, merge_mask1);
595 str = _mm512_madd_epi16(merge_ab_and_bc, merge_mask2);
596
597 /* Step 4: move 3-byte words into the continuous array. */
598 str = _mm512_permutexvar_epi8(continuous_mask, str);
599
600 /* Step 5: store the final result */
601 _mm512_storeu_si512((__m512i *)o, str);
602
603 c += 64;
604 o += 48;
605 outl += 48;
606 length -= 64;
607 }
608
609 if (!php_base64_decode_impl(c, length, (unsigned char*)ZSTR_VAL(result), &outl, strict)) {
610 zend_string_efree(result);
611 return NULL;
612 }
613
614 ZSTR_LEN(result) = outl;
615
616 return result;
617 }
618 #endif
619
620 #if BASE64_INTRIN_AVX512_FUNC_PROTO || BASE64_INTRIN_AVX512_FUNC_PTR
php_base64_encode_avx512(const unsigned char * str,size_t length)621 zend_string *php_base64_encode_avx512(const unsigned char *str, size_t length)
622 {
623 const unsigned char *c = str;
624 unsigned char *o;
625 zend_string *result;
626
627 result = zend_string_safe_alloc(((length + 2) / 3), 4 * sizeof(char), 0, 0);
628 o = (unsigned char *)ZSTR_VAL(result);
629
630 while (length > 63) {
631 /* Step 1: load input data */
632 /* [????|????|????|????|PPPO|OONN|NMMM|LLLK|KKJJ|JIII|HHHG|GGFF|FEEE|DDDC|CCBB|BAAA] */
633 __m512i str = _mm512_loadu_si512((const __m512i *)c);
634
635 /* Step 2: splitting 24-bit words into 32-bit lanes */
636 /* [0000|PPPO|OONN|NMMM|0000|LLLK|KKJJ|JIII|0000|HHHG|GGFF|FEEE|0000|DDDC|CCBB|BAAA] */
637 str = _mm512_permutexvar_epi32(
638 _mm512_set_epi32(-1, 11, 10, 9, -1, 8, 7, 6, -1, 5, 4, 3, -1, 2, 1, 0), str);
639 /* [D1 D2 D0 D1|C1 C2 C0 C1|B1 B2 B0 B1|A1 A2 A0 A1] x 4 */
640 str = _mm512_shuffle_epi8(str, _mm512_set4_epi32(0x0a0b090a, 0x07080607, 0x04050304, 0x01020001));
641
642 /* Step 3: moving 6-bit word to sperate bytes */
643 /* in: [bbbbcccc|ccdddddd|aaaaaabb|bbbbcccc] */
644 /* t0: [0000cccc|cc000000|aaaaaa00|00000000] */
645 const __m512i t0 = _mm512_and_si512(str, _mm512_set1_epi32(0x0fc0fc00));
646 /* t1: [00000000|00cccccc|00000000|00aaaaaa] */
647 const __m512i t1 = _mm512_srlv_epi16(t0, _mm512_set1_epi32(0x0006000a));
648 /* t2: [ccdddddd|00000000|aabbbbbb|cccc0000] */
649 const __m512i t2 = _mm512_sllv_epi16(str, _mm512_set1_epi32(0x00080004));
650 /* str: [00dddddd|00cccccc|00bbbbbb|00aaaaaa] */
651 str = _mm512_ternarylogic_epi32(_mm512_set1_epi32(0x3f003f00), t2, t1, 0xca);
652
653 /* Step 4: conversion to ASCII */
654 __m512i result = _mm512_subs_epu8(str, _mm512_set1_epi8(51));
655 const __mmask64 less = _mm512_cmpgt_epi8_mask(_mm512_set1_epi8(26), str);
656 result = _mm512_mask_mov_epi8(result, less, _mm512_set1_epi8(13));
657 const __m512i lut = _mm512_set4_epi32(0x000041f0, 0xedfcfcfc, 0xfcfcfcfc, 0xfcfcfc47);
658 result = _mm512_shuffle_epi8(lut, result);
659 result = _mm512_add_epi8(result, str);
660
661 /* Step 5: store the final result */
662 _mm512_storeu_si512((__m512i *)o, result);
663 c += 48;
664 o += 64;
665 length -= 48;
666 }
667
668 o = php_base64_encode_impl(c, length, o);
669
670 ZSTR_LEN(result) = (o - (unsigned char *)ZSTR_VAL(result));
671
672 return result;
673 }
674
/* Assemble a 32-bit value from four bytes, b0 being the least significant.
 * Each argument is truncated to uint8_t first, so negative constants such as
 * -65 contribute their two's-complement byte. Arguments and the whole
 * expansion are parenthesized so the macro is safe inside larger expressions
 * and with expression arguments. */
#define build_dword(b0, b1, b2, b3) \
	(((uint32_t)(uint8_t)(b0) << 0) | ((uint32_t)(uint8_t)(b1) << 8) | \
	 ((uint32_t)(uint8_t)(b2) << 16) | ((uint32_t)(uint8_t)(b3) << 24))

/* Build a __m512i from 16 byte values (b0 lowest) via _mm512_setr4_epi32(),
 * i.e. the same 16-byte pattern in each of the four 128-bit lanes. */
#define _mm512_set4lanes_epi8(b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15) \
	_mm512_setr4_epi32(build_dword(b0, b1, b2, b3), build_dword(b4, b5, b6, b7), \
	                   build_dword(b8, b9, b10, b11), build_dword(b12, b13, b14, b15))
682
php_base64_decode_ex_avx512(const unsigned char * str,size_t length,bool strict)683 zend_string *php_base64_decode_ex_avx512(const unsigned char *str, size_t length, bool strict)
684 {
685 const unsigned char *c = str;
686 unsigned char *o;
687 size_t outl = 0;
688 zend_string *result;
689
690 result = zend_string_alloc(length, 0);
691 o = (unsigned char *)ZSTR_VAL(result);
692
693 while (length > 64) {
694 /* Step 1: load input data */
695 __m512i str = _mm512_loadu_si512((__m512i *)c);
696
697 /* Step 2: translation into 6-bit values(saved on bytes) from ASCII and error detection */
698 const __m512i higher_nibble = _mm512_and_si512(_mm512_srli_epi32(str, 4), _mm512_set1_epi8(0x0f));
699 const __m512i lower_nibble = _mm512_and_si512(str, _mm512_set1_epi8(0x0f));
700 const __m512i shiftLUT = _mm512_set4lanes_epi8(
701 0, 0, 19, 4, -65, -65, -71, -71, 0, 0, 0, 0, 0, 0, 0, 0);
702 const __m512i maskLUT = _mm512_set4lanes_epi8(
703 /* 0 : 0b1010_1000*/ 0xa8,
704 /* 1 .. 9 : 0b1111_1000*/ 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8,
705 /* 10 : 0b1111_0000*/ 0xf0,
706 /* 11 : 0b0101_0100*/ 0x54,
707 /* 12 .. 14 : 0b0101_0000*/ 0x50, 0x50, 0x50,
708 /* 15 : 0b0101_0100*/ 0x54);
709 const __m512i bitposLUT = _mm512_set4lanes_epi8(
710 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
711 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
712 const __m512i M = _mm512_shuffle_epi8(maskLUT, lower_nibble);
713 const __m512i bit = _mm512_shuffle_epi8(bitposLUT, higher_nibble);
714 const uint64_t match = _mm512_test_epi8_mask(M, bit);
715 if (match != (uint64_t)-1) {
716 break;
717 }
718 const __m512i sh = _mm512_shuffle_epi8(shiftLUT, higher_nibble);
719 const __mmask64 eq_2f = _mm512_cmpeq_epi8_mask(str, _mm512_set1_epi8(0x2f));
720 const __m512i shift = _mm512_mask_mov_epi8(sh, eq_2f, _mm512_set1_epi8(16));
721 str = _mm512_add_epi8(str, shift);
722
723 /* Step 3: pack four fields within 32-bit words into 24-bit words. */
724 const __m512i merge_ab_and_bc = _mm512_maddubs_epi16(str, _mm512_set1_epi32(0x01400140));
725 str = _mm512_madd_epi16(merge_ab_and_bc, _mm512_set1_epi32(0x00011000));
726
727 /* Step 4: move 3-byte words into the continuous array. */
728 const __m512i t1 = _mm512_shuffle_epi8(str,
729 _mm512_set4lanes_epi8(2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1));
730 const __m512i s6 = _mm512_setr_epi32(0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 0, 0, 0, 0);
731 const __m512i t2 = _mm512_permutexvar_epi32(s6, t1);
732
733 /* Step 5: store the final result */
734 _mm512_storeu_si512((__m512i *)o, t2);
735
736 c += 64;
737 o += 48;
738 outl += 48;
739 length -= 64;
740 }
741
742 if (!php_base64_decode_impl(c, length, (unsigned char*)ZSTR_VAL(result), &outl, strict)) {
743 zend_string_efree(result);
744 return NULL;
745 }
746
747 ZSTR_LEN(result) = outl;
748
749 return result;
750 }
751 #endif
752
753 #if ZEND_INTRIN_AVX2_NATIVE || ZEND_INTRIN_AVX2_RESOLVER
/* In resolver builds with target-attribute support, forward-declare the AVX2
 * helpers with target("avx2") so the attribute applies to their definitions
 * below. */
# if ZEND_INTRIN_AVX2_RESOLVER && defined(HAVE_FUNC_ATTRIBUTE_TARGET)
static __m256i php_base64_encode_avx2_reshuffle(__m256i in) __attribute__((target("avx2")));
static __m256i php_base64_encode_avx2_translate(__m256i in) __attribute__((target("avx2")));
# endif
/* Split 24 input bytes (12 per 128-bit lane) into 32 six-bit values, one per
 * output byte.
 *
 * This one works with shifted (4 bytes) input in order to be able to work
 * efficiently in the 2 128-bit lanes.
 *
 * input, bytes MSB to LSB:
 * 0 0 0 0 x w v u t s r q p o n m
 * l k j i h g f e d c b a 0 0 0 0
 *
 * output (upper case are MSB, lower case are LSB):
 * 00xxxxxx 00wwwwXX 00vvWWWW 00VVVVVV
 * 00uuuuuu 00ttttUU 00ssTTTT 00SSSSSS
 * 00rrrrrr 00qqqqRR 00ppQQQQ 00PPPPPP
 * 00oooooo 00nnnnOO 00mmNNNN 00MMMMMM
 * 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
 * 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
 * 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
 * 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA */
static __m256i php_base64_encode_avx2_reshuffle(__m256i in)
{
	/* place each 3-byte group in its own 32-bit word */
	const __m256i spread = _mm256_shuffle_epi8(in, _mm256_set_epi8(
		10, 11,  9, 10,
		 7,  8,  6,  7,
		 4,  5,  3,  4,
		 1,  2,  0,  1,

		14, 15, 13, 14,
		11, 12, 10, 11,
		 8,  9,  7,  8,
		 5,  6,  4,  5));

	/* two of the fields move right: the high half of a 16-bit multiply by
	 * 2^6 / 2^10 is a right shift by 10 / 6 */
	const __m256i moved_right = _mm256_mulhi_epu16(
		_mm256_and_si256(spread, _mm256_set1_epi32(0x0fc0fc00)),
		_mm256_set1_epi32(0x04000040));

	/* the other two move left: the low half of a 16-bit multiply by
	 * 2^4 / 2^8 is a left shift by 4 / 8 */
	const __m256i moved_left = _mm256_mullo_epi16(
		_mm256_and_si256(spread, _mm256_set1_epi32(0x003f03f0)),
		_mm256_set1_epi32(0x01000010));

	return _mm256_or_si256(moved_right, moved_left);
}
796
/* Map 32 six-bit values (one per byte) to Base64 ASCII by adding a per-range
 * offset:
 *   slot 0      values 0..25   +65  (A-Z)
 *   slot 1      values 26..51  +71  (a-z)
 *   slot 2..11  values 52..61   -4  (0-9)
 *   slot 12     value 62       -19  (+)
 *   slot 13     value 63       -16  (/) */
static __m256i php_base64_encode_avx2_translate(__m256i in)
{
	/* offsets by slot, repeated for both 128-bit lanes */
	const __m256i offsets = _mm256_setr_epi8(
		65, 71, -4, -4, -4, -4, -4, -4,
		-4, -4, -4, -4, -19, -16, 0, 0,
		65, 71, -4, -4, -4, -4, -4, -4,
		-4, -4, -4, -4, -19, -16, 0, 0);

	/* saturating subtract: 0 for values up to 51, 1..12 for 52..63 */
	const __m256i reduced = _mm256_subs_epu8(in, _mm256_set1_epi8(51));

	/* 0xFF (-1) for values above 25, 0x00 otherwise */
	const __m256i above_25 = _mm256_cmpgt_epi8(in, _mm256_set1_epi8(25));

	/* subtracting -1 bumps every slot except the one for 0..25 */
	const __m256i slot = _mm256_sub_epi8(reduced, above_25);

	return _mm256_add_epi8(in, _mm256_shuffle_epi8(offsets, slot));
}
#endif /* ZEND_INTRIN_AVX2_NATIVE || ZEND_INTRIN_AVX2_RESOLVER */
817
818 #if ZEND_INTRIN_SSSE3_NATIVE || ZEND_INTRIN_SSSE3_RESOLVER
819
820 # if ZEND_INTRIN_SSSE3_RESOLVER && defined(HAVE_FUNC_ATTRIBUTE_TARGET)
821 static __m128i php_base64_encode_ssse3_reshuffle(__m128i in) __attribute__((target("ssse3")));
822 static __m128i php_base64_encode_ssse3_translate(__m128i in) __attribute__((target("ssse3")));
823 # endif
824
/* Split the low 12 bytes of the input into 16 six-bit values, one per output
 * byte, ready to be translated to the Base64 alphabet.
 *
 * input, bytes MSB to LSB:
 * 0 0 0 0 l k j i h g f e d c b a */
static __m128i php_base64_encode_ssse3_reshuffle(__m128i in)
{
	const __m128i byte_order = _mm_set_epi8(
		10, 11,  9, 10,
		 7,  8,  6,  7,
		 4,  5,  3,  4,
		 1,  2,  0,  1);
	const __m128i right_mask = _mm_set1_epi32(0x0fc0fc00);
	const __m128i right_mul  = _mm_set1_epi32(0x04000040);
	const __m128i left_mask  = _mm_set1_epi32(0x003f03f0);
	const __m128i left_mul   = _mm_set1_epi32(0x01000010);
	__m128i moved_right, moved_left;

	in = _mm_shuffle_epi8(in, byte_order);

	/* High half of a product with a power of two: per-element right shift. */
	moved_right = _mm_mulhi_epu16(_mm_and_si128(in, right_mask), right_mul);

	/* Low half of a product with a power of two: per-element left shift. */
	moved_left = _mm_mullo_epi16(_mm_and_si128(in, left_mask), left_mul);

	/* output (upper case are MSB, lower case are LSB):
	 * 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
	 * 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
	 * 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
	 * 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA */
	return _mm_or_si128(moved_right, moved_left);
}
852
/* Translate 16 six-bit values (0..63, one per byte) to the Base64 alphabet.
 * There are five sets:
 *
 * #  From      To         Abs  Index    Characters
 * 0  [0..25]   [65..90]   +65  0        ABCDEFGHIJKLMNOPQRSTUVWXYZ
 * 1  [26..51]  [97..122]  +71  1        abcdefghijklmnopqrstuvwxyz
 * 2  [52..61]  [48..57]    -4  [2..11]  0123456789
 * 3  [62]      [43]       -19  12       +
 * 4  [63]      [47]       -16  13       /  */
static __m128i php_base64_encode_ssse3_translate(__m128i in)
{
	/* Offset to add for each set, addressed by the "Index" column above. */
	const __m128i offsets = _mm_setr_epi8(
			65,  71, -4, -4,
			-4,  -4, -4, -4,
			-4,  -4, -4, -4,
			-19, -16,  0,  0
	);
	__m128i slot, above_25;

	/* Create LUT indices from the input: the index for range #0 is right,
	 * all the others are 1 less than expected. */
	slot = _mm_subs_epu8(in, _mm_set1_epi8(51));

	/* 0xFF (-1) for range #[1..4] and 0x00 for range #0. */
	above_25 = _mm_cmpgt_epi8(in, _mm_set1_epi8(25));

	/* Subtracting -1 adds 1 for range #[1..4]; all indices are now correct. */
	slot = _mm_sub_epi8(slot, above_25);

	/* Add the selected offsets to the input values. */
	return _mm_add_epi8(in, _mm_shuffle_epi8(offsets, slot));
}
884
/* SSSE3 bulk encoder: turns 12 input bytes into 16 Base64 characters per
 * iteration. Expands in a scope that provides `c` (input cursor), `o` (output
 * cursor) and `length` (input bytes left), and advances all three.
 * Every iteration loads 16 bytes but consumes only 12 of them, which is why
 * the loop only runs while at least 16 input bytes remain. */
#define PHP_BASE64_ENCODE_SSSE3_LOOP \
	while (length > 15) { \
		__m128i block = _mm_loadu_si128((const __m128i *)c); \
		\
		block = php_base64_encode_ssse3_translate( \
			php_base64_encode_ssse3_reshuffle(block)); \
		\
		_mm_storeu_si128((__m128i *)o, block); \
		c += 12; \
		o += 16; \
		length -= 12; \
	}
898
#endif /* ZEND_INTRIN_SSSE3_NATIVE || ZEND_INTRIN_SSSE3_RESOLVER */
900
901 #if ZEND_INTRIN_AVX2_NATIVE || ZEND_INTRIN_AVX2_RESOLVER || ZEND_INTRIN_SSSE3_NATIVE || ZEND_INTRIN_SSSE3_RESOLVER
902 # if ZEND_INTRIN_AVX2_NATIVE || ZEND_INTRIN_SSSE3_NATIVE
php_base64_encode(const unsigned char * str,size_t length)903 PHPAPI zend_string *php_base64_encode(const unsigned char *str, size_t length)
904 # elif ZEND_INTRIN_AVX2_RESOLVER
905 zend_string *php_base64_encode_avx2(const unsigned char *str, size_t length)
906 # else /* ZEND_INTRIN_SSSE3_RESOLVER */
907 zend_string *php_base64_encode_ssse3(const unsigned char *str, size_t length)
908 # endif
909 {
910 const unsigned char *c = str;
911 unsigned char *o;
912 zend_string *result;
913
914 result = zend_string_safe_alloc(((length + 2) / 3), 4 * sizeof(char), 0, 0);
915 o = (unsigned char *)ZSTR_VAL(result);
916 # if ZEND_INTRIN_AVX2_NATIVE || ZEND_INTRIN_AVX2_RESOLVER
917 if (length > 31) {
918 __m256i s = _mm256_loadu_si256((__m256i *)c);
919
920 s = _mm256_permutevar8x32_epi32(s, _mm256_setr_epi32(0, 0, 1, 2, 3, 4, 5, 6));
921
922 for (;;) {
923 s = php_base64_encode_avx2_reshuffle(s);
924
925 s = php_base64_encode_avx2_translate(s);
926
927 _mm256_storeu_si256((__m256i *)o, s);
928 c += 24;
929 o += 32;
930 length -= 24;
931 if (length < 28) {
932 break;
933 }
934 s = _mm256_loadu_si256((__m256i *)(c - 4));
935 }
936 }
937 # else
938 PHP_BASE64_ENCODE_SSSE3_LOOP;
939 # endif
940
941 o = php_base64_encode_impl(c, length, o);
942
943 ZSTR_LEN(result) = (o - (unsigned char *)ZSTR_VAL(result));
944
945 return result;
946 }
947
948 # if ZEND_INTRIN_SSSE3_RESOLVER && ZEND_INTRIN_AVX2_RESOLVER
php_base64_encode_ssse3(const unsigned char * str,size_t length)949 zend_string *php_base64_encode_ssse3(const unsigned char *str, size_t length)
950 {
951 const unsigned char *c = str;
952 unsigned char *o;
953 zend_string *result;
954
955 result = zend_string_safe_alloc(((length + 2) / 3), 4 * sizeof(char), 0, 0);
956 o = (unsigned char *)ZSTR_VAL(result);
957
958 PHP_BASE64_ENCODE_SSSE3_LOOP;
959
960 o = php_base64_encode_impl(c, length, o);
961
962 ZSTR_LEN(result) = (o - (unsigned char *)ZSTR_VAL(result));
963
964 return result;
965 }
966 # endif
967 #endif /* ZEND_INTRIN_AVX2_NATIVE || ZEND_INTRIN_AVX2_RESOLVER || ZEND_INTRIN_SSSE3_NATIVE || ZEND_INTRIN_SSSE3_RESOLVER */
968
969 /* }}} */
970
971 #if ZEND_INTRIN_AVX2_NATIVE || ZEND_INTRIN_AVX2_RESOLVER
972 # if ZEND_INTRIN_AVX2_RESOLVER && defined(HAVE_FUNC_ATTRIBUTE_TARGET)
973 static __m256i php_base64_decode_avx2_reshuffle(__m256i in) __attribute__((target("avx2")));
974 # endif
975
/* Pack 32 decoded six-bit values (one per input byte) into 24 contiguous
 * output bytes located at the low end of the returned vector. */
static __m256i php_base64_decode_avx2_reshuffle(__m256i in)
{
	/* Picks the 3 data bytes of every 32-bit element in reversed order; the
	 * -1 entries zero the last 4 bytes of each 128-bit lane. */
	const __m256i pick_bytes = _mm256_setr_epi8(
			2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1,
			2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1);
	/* Moves dwords 0..2 of the upper lane right behind dwords 0..2 of the
	 * lower lane. */
	const __m256i join_lanes = _mm256_setr_epi32(0, 1, 2, 4, 5, 6, -1, -1);
	__m256i pairs, quads;

	/* Combine each pair of adjacent bytes into one 16-bit element. */
	pairs = _mm256_maddubs_epi16(in, _mm256_set1_epi32(0x01400140));

	/* Combine each pair of adjacent 16-bit elements into one 32-bit element. */
	quads = _mm256_madd_epi16(pairs, _mm256_set1_epi32(0x00011000));

	quads = _mm256_shuffle_epi8(quads, pick_bytes);

	return _mm256_permutevar8x32_epi32(quads, join_lanes);
}
990 #endif
991
992 #if ZEND_INTRIN_SSSE3_NATIVE || ZEND_INTRIN_SSSE3_RESOLVER
993 # if ZEND_INTRIN_SSSE3_RESOLVER && defined(HAVE_FUNC_ATTRIBUTE_TARGET)
994 static __m128i php_base64_decode_ssse3_reshuffle(__m128i in) __attribute__((target("ssse3")));
995 # endif
996
/* Pack 16 decoded six-bit values (one per input byte) into 12 contiguous
 * output bytes; the top 4 bytes of the returned vector are zero. */
static __m128i php_base64_decode_ssse3_reshuffle(__m128i in)
{
	const __m128i pick_bytes = _mm_setr_epi8(
			 2,  1,  0,
			 6,  5,  4,
			10,  9,  8,
			14, 13, 12,
			-1, -1, -1, -1);
	__m128i pairs, quads;

	pairs = _mm_maddubs_epi16(in, _mm_set1_epi32(0x01400140));
	/* 0000kkkk LLllllll 0000JJJJ JJjjKKKK
	 * 0000hhhh IIiiiiii 0000GGGG GGggHHHH
	 * 0000eeee FFffffff 0000DDDD DDddEEEE
	 * 0000bbbb CCcccccc 0000AAAA AAaaBBBB */

	quads = _mm_madd_epi16(pairs, _mm_set1_epi32(0x00011000));
	/* 00000000 JJJJJJjj KKKKkkkk LLllllll
	 * 00000000 GGGGGGgg HHHHhhhh IIiiiiii
	 * 00000000 DDDDDDdd EEEEeeee FFffffff
	 * 00000000 AAAAAAaa BBBBbbbb CCcccccc */

	/* Result after the byte shuffle below:
	 * 00000000 00000000 00000000 00000000
	 * LLllllll KKKKkkkk JJJJJJjj IIiiiiii
	 * HHHHhhhh GGGGGGgg FFffffff EEEEeeee
	 * DDDDDDdd CCcccccc BBBBbbbb AAAAAAaa */
	return _mm_shuffle_epi8(quads, pick_bytes);
}
1024
/* SSSE3 bulk decoder: turns 16 input characters into 12 output bytes per
 * iteration. Expands in a scope that provides `c` (input cursor), `o` (output
 * cursor), `outl` (bytes produced so far) and `length` (input bytes left),
 * and updates all four.
 *
 * Each chunk is first checked against two nibble-indexed lookup tables; as
 * soon as a chunk is flagged (presumably because it holds a byte outside the
 * Base64 alphabet - confirm against the tables) the loop stops without
 * consuming that chunk, so the code after the loop can deal with it.
 * The store writes a full 16 bytes although only 12 of them are counted. */
#define PHP_BASE64_DECODE_SSSE3_LOOP \
	while (length > 15 + 6 + 2) { \
		const __m128i mask_2f = _mm_set1_epi8(0x2f); \
		const __m128i lut_lo = _mm_setr_epi8( \
				0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, \
				0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A); \
		const __m128i lut_hi = _mm_setr_epi8( \
				0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08, \
				0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10); \
		const __m128i lut_roll = _mm_setr_epi8( \
				0,  16,  19,   4, -65, -65, -71, -71, \
				0,   0,   0,   0,   0,   0,   0,   0); \
		__m128i s = _mm_loadu_si128((const __m128i *)c); \
		const __m128i hi_nibbles = _mm_and_si128(_mm_srli_epi32(s, 4), mask_2f); \
		const __m128i lo_nibbles = _mm_and_si128(s, mask_2f); \
		const __m128i hi = _mm_shuffle_epi8(lut_hi, hi_nibbles); \
		const __m128i lo = _mm_shuffle_epi8(lut_lo, lo_nibbles); \
		__m128i roll; \
		\
		if (UNEXPECTED( \
			_mm_movemask_epi8( \
				_mm_cmpgt_epi8( \
					_mm_and_si128(lo, hi), _mm_set1_epi8(0))))) { \
			break; \
		} \
		\
		roll = _mm_shuffle_epi8( \
				lut_roll, \
				_mm_add_epi8(_mm_cmpeq_epi8(s, mask_2f), hi_nibbles)); \
		\
		s = php_base64_decode_ssse3_reshuffle(_mm_add_epi8(s, roll)); \
		\
		_mm_storeu_si128((__m128i *)o, s); \
		\
		c += 16; \
		o += 12; \
		outl += 12; \
		length -= 16; \
	}
1071
1072 #endif
1073
1074 #if ZEND_INTRIN_AVX2_NATIVE || ZEND_INTRIN_AVX2_RESOLVER || ZEND_INTRIN_SSSE3_NATIVE || ZEND_INTRIN_SSSE3_RESOLVER
1075 # if ZEND_INTRIN_AVX2_NATIVE || ZEND_INTRIN_SSSE3_NATIVE
php_base64_decode_ex(const unsigned char * str,size_t length,bool strict)1076 PHPAPI zend_string *php_base64_decode_ex(const unsigned char *str, size_t length, bool strict)
1077 # elif ZEND_INTRIN_AVX2_RESOLVER
1078 zend_string *php_base64_decode_ex_avx2(const unsigned char *str, size_t length, bool strict)
1079 # else
1080 zend_string *php_base64_decode_ex_ssse3(const unsigned char *str, size_t length, bool strict)
1081 # endif
1082 {
1083 const unsigned char *c = str;
1084 unsigned char *o;
1085 size_t outl = 0;
1086 zend_string *result;
1087
1088 result = zend_string_alloc(length, 0);
1089 o = (unsigned char *)ZSTR_VAL(result);
1090
1091 /* See: "Faster Base64 Encoding and Decoding using AVX2 Instructions"
1092 * https://arxiv.org/pdf/1704.00605.pdf */
1093 # if ZEND_INTRIN_AVX2_NATIVE || ZEND_INTRIN_AVX2_RESOLVER
1094 while (length > 31 + 11 + 2) {
1095 __m256i lut_lo, lut_hi, lut_roll;
1096 __m256i hi_nibbles, lo_nibbles, hi, lo;
1097 __m256i str = _mm256_loadu_si256((__m256i *)c);
1098
1099 lut_lo = _mm256_setr_epi8(
1100 0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
1101 0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A,
1102 0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
1103 0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A);
1104
1105 lut_hi = _mm256_setr_epi8(
1106 0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
1107 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
1108 0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
1109 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10);
1110
1111 lut_roll = _mm256_setr_epi8(
1112 0, 16, 19, 4, -65, -65, -71, -71,
1113 0, 0, 0, 0, 0, 0, 0, 0,
1114 0, 16, 19, 4, -65, -65, -71, -71,
1115 0, 0, 0, 0, 0, 0, 0, 0);
1116
1117 hi_nibbles = _mm256_and_si256(_mm256_srli_epi32(str, 4), _mm256_set1_epi8(0x2f));
1118 lo_nibbles = _mm256_and_si256(str, _mm256_set1_epi8(0x2f));
1119 hi = _mm256_shuffle_epi8(lut_hi, hi_nibbles);
1120 lo = _mm256_shuffle_epi8(lut_lo, lo_nibbles);
1121
1122 if (!_mm256_testz_si256(lo, hi)) {
1123 break;
1124 } else {
1125 __m256i eq_2f, roll;
1126 eq_2f = _mm256_cmpeq_epi8(str, _mm256_set1_epi8(0x2f));
1127 roll = _mm256_shuffle_epi8(lut_roll, _mm256_add_epi8(eq_2f, hi_nibbles));
1128
1129
1130 str = _mm256_add_epi8(str, roll);
1131
1132 str = php_base64_decode_avx2_reshuffle(str);
1133
1134 _mm256_storeu_si256((__m256i *)o, str);
1135
1136 c += 32;
1137 o += 24;
1138 outl += 24;
1139 length -= 32;
1140 }
1141 }
1142 # else
1143 PHP_BASE64_DECODE_SSSE3_LOOP;
1144 # endif
1145
1146 if (!php_base64_decode_impl(c, length, (unsigned char*)ZSTR_VAL(result), &outl, strict)) {
1147 zend_string_efree(result);
1148 return NULL;
1149 }
1150
1151 ZSTR_LEN(result) = outl;
1152
1153 return result;
1154 }
1155
1156 # if ZEND_INTRIN_SSSE3_RESOLVER && ZEND_INTRIN_AVX2_RESOLVER
php_base64_decode_ex_ssse3(const unsigned char * str,size_t length,bool strict)1157 zend_string *php_base64_decode_ex_ssse3(const unsigned char *str, size_t length, bool strict)
1158 {
1159 const unsigned char *c = str;
1160 unsigned char *o;
1161 size_t outl = 0;
1162 zend_string *result;
1163
1164 result = zend_string_alloc(length, 0);
1165 o = (unsigned char *)ZSTR_VAL(result);
1166
1167 PHP_BASE64_DECODE_SSSE3_LOOP;
1168
1169 if (!php_base64_decode_impl(c, length, (unsigned char*)ZSTR_VAL(result), &outl, strict)) {
1170 zend_string_efree(result);
1171 return NULL;
1172 }
1173
1174 ZSTR_LEN(result) = outl;
1175
1176 return result;
1177 }
1178 # endif
1179 #endif /* ZEND_INTRIN_AVX2_NATIVE || ZEND_INTRIN_AVX2_RESOLVER || ZEND_INTRIN_SSSE3_NATIVE || ZEND_INTRIN_SSSE3_RESOLVER */
1180
1181 #if !ZEND_INTRIN_AVX2_NATIVE && !ZEND_INTRIN_SSSE3_NATIVE
1182 #if ZEND_INTRIN_AVX2_RESOLVER || ZEND_INTRIN_SSSE3_RESOLVER
php_base64_encode_default(const unsigned char * str,size_t length)1183 zend_string *php_base64_encode_default(const unsigned char *str, size_t length)
1184 #else
1185 PHPAPI zend_string *php_base64_encode(const unsigned char *str, size_t length)
1186 #endif
1187 {
1188 unsigned char *p;
1189 zend_string *result;
1190
1191 result = zend_string_safe_alloc(((length + 2) / 3), 4 * sizeof(char), 0, 0);
1192 p = (unsigned char *)ZSTR_VAL(result);
1193
1194 p = php_base64_encode_impl(str, length, p);
1195
1196 ZSTR_LEN(result) = (p - (unsigned char *)ZSTR_VAL(result));
1197
1198 return result;
1199 }
1200 #endif
1201
1202 #if !ZEND_INTRIN_AVX2_NATIVE && !ZEND_INTRIN_SSSE3_NATIVE
1203 #if ZEND_INTRIN_AVX2_RESOLVER || ZEND_INTRIN_SSSE3_RESOLVER
php_base64_decode_ex_default(const unsigned char * str,size_t length,bool strict)1204 zend_string *php_base64_decode_ex_default(const unsigned char *str, size_t length, bool strict)
1205 #else
1206 PHPAPI zend_string *php_base64_decode_ex(const unsigned char *str, size_t length, bool strict)
1207 #endif
1208 {
1209 zend_string *result;
1210 size_t outl = 0;
1211
1212 result = zend_string_alloc(length, 0);
1213
1214 if (!php_base64_decode_impl(str, length, (unsigned char*)ZSTR_VAL(result), &outl, strict)) {
1215 zend_string_efree(result);
1216 return NULL;
1217 }
1218
1219 ZSTR_LEN(result) = outl;
1220
1221 return result;
1222 }
1223 #endif
1224 /* }}} */
1225
1226 /* {{{ Encodes string using MIME base64 algorithm */
PHP_FUNCTION(base64_encode)1227 PHP_FUNCTION(base64_encode)
1228 {
1229 char *str;
1230 size_t str_len;
1231 zend_string *result;
1232
1233 ZEND_PARSE_PARAMETERS_START(1, 1)
1234 Z_PARAM_STRING(str, str_len)
1235 ZEND_PARSE_PARAMETERS_END();
1236
1237 result = php_base64_encode((unsigned char*)str, str_len);
1238 RETURN_STR(result);
1239 }
1240 /* }}} */
1241
1242 /* {{{ Decodes string using MIME base64 algorithm */
PHP_FUNCTION(base64_decode)1243 PHP_FUNCTION(base64_decode)
1244 {
1245 char *str;
1246 bool strict = 0;
1247 size_t str_len;
1248 zend_string *result;
1249
1250 ZEND_PARSE_PARAMETERS_START(1, 2)
1251 Z_PARAM_STRING(str, str_len)
1252 Z_PARAM_OPTIONAL
1253 Z_PARAM_BOOL(strict)
1254 ZEND_PARSE_PARAMETERS_END();
1255
1256 result = php_base64_decode_ex((unsigned char*)str, str_len, strict);
1257 if (result != NULL) {
1258 RETURN_STR(result);
1259 } else {
1260 RETURN_FALSE;
1261 }
1262 }
1263 /* }}} */
1264