xref: /php-src/ext/standard/base64.c (revision 6c5814da)
1 /*
2    +----------------------------------------------------------------------+
3    | Copyright (c) The PHP Group                                          |
4    +----------------------------------------------------------------------+
5    | This source file is subject to version 3.01 of the PHP license,      |
6    | that is bundled with this package in the file LICENSE, and is        |
7    | available through the world-wide-web at the following url:           |
8    | https://www.php.net/license/3_01.txt                                 |
9    | If you did not receive a copy of the PHP license and are unable to   |
10    | obtain it through the world-wide-web, please send a note to          |
11    | license@php.net so we can mail you a copy immediately.               |
12    +----------------------------------------------------------------------+
13    | Author: Jim Winstead <jimw@php.net>                                  |
14    |         Xinchen Hui <laruence@php.net>                               |
15    +----------------------------------------------------------------------+
16  */
17 
18 #include <string.h>
19 
20 #include "php.h"
21 #include "base64.h"
22 
23 /* {{{ base64 tables */
/* Encoding alphabet: the array index is the 6-bit value, the element is the
 * ASCII character emitted for it.  The string literal's implicit terminator
 * provides the trailing '\0' element, so the array holds 65 bytes. */
static const char base64_table[] =
	"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
	"abcdefghijklmnopqrstuvwxyz"
	"0123456789+/";

/* Character used to fill the last output group up to four characters. */
static const char base64_pad = '=';

/* Decoding table, indexed by input byte:
 *   0..63  value of an alphabet character
 *   -1     TAB, LF, CR and SPACE (the scalar decoder always skips these)
 *   -2     every other byte, including '=' (rejected by the scalar decoder
 *          in strict mode, skipped otherwise) */
static const short base64_reverse_table[256] = {
	/* 0x00 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -2, -2, -1, -2, -2,
	/* 0x10 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
	/* 0x20 */ -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, 62, -2, -2, -2, 63,
	/* 0x30 */ 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -2, -2, -2, -2, -2, -2,
	/* 0x40 */ -2,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
	/* 0x50 */ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -2, -2, -2, -2, -2,
	/* 0x60 */ -2, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
	/* 0x70 */ 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -2, -2, -2, -2, -2,
	/* 0x80 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
	/* 0x90 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
	/* 0xa0 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
	/* 0xb0 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
	/* 0xc0 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
	/* 0xd0 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
	/* 0xe0 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
	/* 0xf0 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2
};
52 /* }}} */
53 
54 #if defined(__aarch64__) || defined(_M_ARM64)
55 #include <arm_neon.h>
56 
encode_toascii(const uint8x16_t input,const uint8x16x2_t shift_LUT)57 static zend_always_inline uint8x16_t encode_toascii(const uint8x16_t input, const uint8x16x2_t shift_LUT)
58 {
59 	/* reduce  0..51 -> 0
60 	          52..61 -> 1 .. 10
61 	              62 -> 11
62 	              63 -> 12 */
63 	uint8x16_t result = vqsubq_u8(input, vdupq_n_u8(51));
64 	/* distinguish between ranges 0..25 and 26..51:
65 	   0 .. 25 -> remains 0
66 	   26 .. 51 -> becomes 13 */
67 	const uint8x16_t less = vcgtq_u8(vdupq_n_u8(26), input);
68 	result = vorrq_u8(result, vandq_u8(less, vdupq_n_u8(13)));
69 	/* read shift */
70 	result = vqtbl2q_u8(shift_LUT, result);
71 	return vaddq_u8(result, input);
72 }
73 
neon_base64_encode(const unsigned char * in,size_t inl,unsigned char * out,size_t * left)74 static zend_always_inline unsigned char *neon_base64_encode(const unsigned char *in, size_t inl, unsigned char *out, size_t *left)
75 {
76 	const uint8_t shift_LUT_[32] = {'a' - 26, '0' - 52, '0' - 52, '0' - 52,
77 					'0' - 52, '0' - 52, '0' - 52, '0' - 52,
78 					'0' - 52, '0' - 52, '0' - 52, '+' - 62,
79 					'/' - 63, 'A',      0,        0,
80 					'a' - 26, '0' - 52, '0' - 52, '0' - 52,
81 					'0' - 52, '0' - 52, '0' - 52, '0' - 52,
82 					'0' - 52, '0' - 52, '0' - 52, '+' - 62,
83 					'/' - 63, 'A',      0,        0};
84 	const uint8x16x2_t shift_LUT = *((const uint8x16x2_t *)shift_LUT_);
85 	do {
86 		/* [ccdddddd | bbbbcccc | aaaaaabb]
87 		    x.val[2] | x.val[1] | x.val[0] */
88 		const uint8x16x3_t x = vld3q_u8((const uint8_t *)(in));
89 
90 		/* [00aa_aaaa] */
91 		const uint8x16_t field_a = vshrq_n_u8(x.val[0], 2);
92 
93 		const uint8x16_t field_b =             /* [00bb_bbbb] */
94 		    vbslq_u8(vdupq_n_u8(0x30),         /* [0011_0000] */
95 		             vshlq_n_u8(x.val[0], 4),  /* [aabb_0000] */
96 		             vshrq_n_u8(x.val[1], 4)); /* [0000_bbbb] */
97 
98 		const uint8x16_t field_c =             /* [00cc_cccc] */
99 		    vbslq_u8(vdupq_n_u8(0x3c),         /* [0011_1100] */
100 		             vshlq_n_u8(x.val[1], 2),  /* [bbcc_cc00] */
101 		             vshrq_n_u8(x.val[2], 6)); /* [0000_00cc] */
102 
103 		/* [00dd_dddd] */
104 		const uint8x16_t field_d = vandq_u8(x.val[2], vdupq_n_u8(0x3f));
105 
106 		uint8x16x4_t result;
107 		result.val[0] = encode_toascii(field_a, shift_LUT);
108 		result.val[1] = encode_toascii(field_b, shift_LUT);
109 		result.val[2] = encode_toascii(field_c, shift_LUT);
110 		result.val[3] = encode_toascii(field_d, shift_LUT);
111 
112 		vst4q_u8((uint8_t *)out, result);
113 		out += 64;
114 		in += 16 * 3;
115 		inl -= 16 * 3;
116 	} while (inl >= 16 * 3);
117 
118 	*left = inl;
119 	return out;
120 }
121 #endif /* defined(__aarch64__) || defined(_M_ARM64) */
122 
php_base64_encode_impl(const unsigned char * in,size_t inl,unsigned char * out,zend_long flags)123 static zend_always_inline unsigned char *php_base64_encode_impl(const unsigned char *in, size_t inl, unsigned char *out, zend_long flags) /* {{{ */
124 {
125 #if defined(__aarch64__) || defined(_M_ARM64)
126 	if (inl >= 16 * 3) {
127 		size_t left = 0;
128 		out = neon_base64_encode(in, inl, out, &left);
129 		in += inl - left;
130 		inl = left;
131 	}
132 #endif
133 
134 	while (inl > 2) { /* keep going until we have less than 24 bits */
135 		*out++ = base64_table[in[0] >> 2];
136 		*out++ = base64_table[((in[0] & 0x03) << 4) + (in[1] >> 4)];
137 		*out++ = base64_table[((in[1] & 0x0f) << 2) + (in[2] >> 6)];
138 		*out++ = base64_table[in[2] & 0x3f];
139 
140 		in += 3;
141 		inl -= 3; /* we just handle 3 octets of data */
142 	}
143 
144 	/* now deal with the tail end of things */
145 	if (inl != 0) {
146 		*out++ = base64_table[in[0] >> 2];
147 		if (inl > 1) {
148 			*out++ = base64_table[((in[0] & 0x03) << 4) + (in[1] >> 4)];
149 			*out++ = base64_table[(in[1] & 0x0f) << 2];
150 			if ((flags & PHP_BASE64_NO_PADDING) == 0) {
151 				*out++ = base64_pad;
152 			}
153 		} else {
154 			*out++ = base64_table[(in[0] & 0x03) << 4];
155 			if ((flags & PHP_BASE64_NO_PADDING) == 0) {
156 				*out++ = base64_pad;
157 				*out++ = base64_pad;
158 			}
159 		}
160 	}
161 
162 	*out = '\0';
163 
164 	return out;
165 }
166 /* }}} */
167 
168 #if defined(__aarch64__) || defined(_M_ARM64)
decode_fromascii(const uint8x16_t input,uint8x16_t * error,const uint8x16x2_t shiftLUT,const uint8x16x2_t maskLUT,const uint8x16x2_t bitposLUT)169 static zend_always_inline uint8x16_t decode_fromascii(const uint8x16_t input, uint8x16_t *error, const uint8x16x2_t shiftLUT, const uint8x16x2_t maskLUT, const uint8x16x2_t bitposLUT) {
170 	const uint8x16_t higher_nibble = vshrq_n_u8(input, 4);
171 	const uint8x16_t lower_nibble = vandq_u8(input, vdupq_n_u8(0x0f));
172 	const uint8x16_t sh = vqtbl2q_u8(shiftLUT, higher_nibble);
173 	const uint8x16_t eq_2f = vceqq_u8(input, vdupq_n_u8(0x2f));
174 	const uint8x16_t shift = vbslq_u8(eq_2f, vdupq_n_u8(16), sh);
175 	const uint8x16_t M = vqtbl2q_u8(maskLUT, lower_nibble);
176 	const uint8x16_t bit = vqtbl2q_u8(bitposLUT, higher_nibble);
177 	*error = vceqq_u8(vandq_u8(M, bit), vdupq_n_u8(0));
178 	return vaddq_u8(input, shift);
179 }
180 
neon_base64_decode(const unsigned char * in,size_t inl,unsigned char * out,size_t * left)181 static zend_always_inline size_t neon_base64_decode(const unsigned char *in, size_t inl, unsigned char *out, size_t *left) {
182 	unsigned char *out_orig = out;
183 	const uint8_t shiftLUT_[32] = {
184 		0,   0,  19,   4, (uint8_t)-65, (uint8_t)-65, (uint8_t)-71, (uint8_t)-71,
185 		0,   0,   0,   0,   0,   0,   0,   0,
186 		0,   0,  19,   4, (uint8_t)-65, (uint8_t)-65, (uint8_t)-71, (uint8_t)-71,
187 		0,   0,   0,   0,   0,   0,   0,   0};
188 	const uint8_t maskLUT_[32] = {
189 		/* 0        : 0b1010_1000*/ 0xa8,
190 		/* 1 .. 9   : 0b1111_1000*/ 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8,
191 		/* 10       : 0b1111_0000*/ 0xf0,
192 		/* 11       : 0b0101_0100*/ 0x54,
193 		/* 12 .. 14 : 0b0101_0000*/ 0x50, 0x50, 0x50,
194 		/* 15       : 0b0101_0100*/ 0x54,
195 
196 		/* 0        : 0b1010_1000*/ 0xa8,
197 		/* 1 .. 9   : 0b1111_1000*/ 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8,
198 		/* 10       : 0b1111_0000*/ 0xf0,
199 		/* 11       : 0b0101_0100*/ 0x54,
200 		/* 12 .. 14 : 0b0101_0000*/ 0x50, 0x50, 0x50,
201 		/* 15       : 0b0101_0100*/ 0x54
202 	};
203 	const uint8_t bitposLUT_[32] = {
204 		0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
205 		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
206 
207 		0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
208 		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
209 	};
210 	const uint8x16x2_t shiftLUT = *((const uint8x16x2_t *)shiftLUT_);
211 	const uint8x16x2_t maskLUT = *((const uint8x16x2_t *)maskLUT_);
212 	const uint8x16x2_t bitposLUT = *((const uint8x16x2_t *)bitposLUT_);;
213 
214 	do {
215 		const uint8x16x4_t x = vld4q_u8((const unsigned char *)in);
216 		uint8x16_t error_a;
217 		uint8x16_t error_b;
218 		uint8x16_t error_c;
219 		uint8x16_t error_d;
220 		uint8x16_t field_a = decode_fromascii(x.val[0], &error_a, shiftLUT, maskLUT, bitposLUT);
221 		uint8x16_t field_b = decode_fromascii(x.val[1], &error_b, shiftLUT, maskLUT, bitposLUT);
222 		uint8x16_t field_c = decode_fromascii(x.val[2], &error_c, shiftLUT, maskLUT, bitposLUT);
223 		uint8x16_t field_d = decode_fromascii(x.val[3], &error_d, shiftLUT, maskLUT, bitposLUT);
224 
225 		const uint8x16_t err = vorrq_u8(vorrq_u8(error_a, error_b), vorrq_u8(error_c, error_d));
226 		union {uint8_t mem[16]; uint64_t dw[2]; } error;
227 		vst1q_u8(error.mem, err);
228 
229 		/* Check that the input only contains bytes belonging to the alphabet of
230 		   Base64. If there are errors, decode the rest of the string with the
231 		   scalar decoder. */
232 		if (error.dw[0] | error.dw[1])
233 			break;
234 
235 		uint8x16x3_t result;
236 		result.val[0] = vorrq_u8(vshrq_n_u8(field_b, 4), vshlq_n_u8(field_a, 2));
237 		result.val[1] = vorrq_u8(vshrq_n_u8(field_c, 2), vshlq_n_u8(field_b, 4));
238 		result.val[2] = vorrq_u8(field_d, vshlq_n_u8(field_c, 6));
239 
240 		vst3q_u8((unsigned char *)out, result);
241 		out += 16 * 3;
242 		in += 16 * 4;
243 		inl -= 16 * 4;
244 	} while (inl >= 16 * 4);
245 	*left = inl;
246 	return out - out_orig;
247 }
248 #endif /* defined(__aarch64__) || defined(_M_ARM64) */
249 
php_base64_decode_impl(const unsigned char * in,size_t inl,unsigned char * out,size_t * outl,bool strict)250 static zend_always_inline int php_base64_decode_impl(const unsigned char *in, size_t inl, unsigned char *out, size_t *outl, bool strict) /* {{{ */
251 {
252 	int ch;
253 	size_t i = 0, padding = 0, j = *outl;
254 
255 #if defined(__aarch64__) || defined(_M_ARM64)
256 	if (inl >= 16 * 4) {
257 		size_t left = 0;
258 		j += neon_base64_decode(in, inl, out, &left);
259 		i = inl - left;
260 		in += i;
261 		inl = left;
262 	}
263 #endif
264 
265 	/* run through the whole string, converting as we go */
266 	while (inl-- > 0) {
267 		ch = *in++;
268 		if (ch == base64_pad) {
269 			padding++;
270 			continue;
271 		}
272 
273 		ch = base64_reverse_table[ch];
274 		if (!strict) {
275 			/* skip unknown characters and whitespace */
276 			if (ch < 0) {
277 				continue;
278 			}
279 		} else {
280 			/* skip whitespace */
281 			if (ch == -1) {
282 				continue;
283 			}
284 			/* fail on bad characters or if any data follows padding */
285 			if (ch == -2 || padding) {
286 				goto fail;
287 			}
288 		}
289 
290 		switch (i % 4) {
291 			case 0:
292 				out[j] = ch << 2;
293 				break;
294 			case 1:
295 				out[j++] |= ch >> 4;
296 				out[j] = (ch & 0x0f) << 4;
297 				break;
298 			case 2:
299 				out[j++] |= ch >>2;
300 				out[j] = (ch & 0x03) << 6;
301 				break;
302 			case 3:
303 				out[j++] |= ch;
304 				break;
305 		}
306 		i++;
307 	}
308 
309 	/* fail if the input is truncated (only one char in last group) */
310 	if (strict && i % 4 == 1) {
311 		goto fail;
312 	}
313 
314 	/* fail if the padding length is wrong (not VV==, VVV=), but accept zero padding
315 	 * RFC 4648: "In some circumstances, the use of padding [--] is not required" */
316 	if (strict && padding && (padding > 2 || (i + padding) % 4 != 0)) {
317 		goto fail;
318 	}
319 
320 	*outl = j;
321 	out[j] = '\0';
322 
323 	return 1;
324 
325 fail:
326 	return 0;
327 }
328 /* }}} */
329 
330 /* {{{ php_base64_encode */
331 
#if ZEND_INTRIN_AVX2_NATIVE
/* AVX2 is compiled in natively: drop every SSSE3 setting for this file. */
# undef ZEND_INTRIN_SSSE3_NATIVE
# undef ZEND_INTRIN_SSSE3_RESOLVER
# undef ZEND_INTRIN_SSSE3_FUNC_PROTO
# undef ZEND_INTRIN_SSSE3_FUNC_PTR
#elif ZEND_INTRIN_SSSE3_NATIVE && (ZEND_INTRIN_AVX2_FUNC_PROTO || ZEND_INTRIN_AVX2_FUNC_PTR)
/* AVX2 is selected at run time while SSSE3 is native: turn SSSE3 into a
 * resolver target as well, using the same dispatch mechanism as AVX2
 * (the prototype form wins when both AVX2 macros are set). */
# undef ZEND_INTRIN_SSSE3_NATIVE
# undef ZEND_INTRIN_SSSE3_RESOLVER
# define ZEND_INTRIN_SSSE3_RESOLVER 1
# if ZEND_INTRIN_AVX2_FUNC_PROTO
#  define ZEND_INTRIN_SSSE3_FUNC_PROTO 1
# else
#  define ZEND_INTRIN_SSSE3_FUNC_PTR 1
# endif
# undef ZEND_INTRIN_SSSE3_FUNC_DECL
# ifdef HAVE_FUNC_ATTRIBUTE_TARGET
#  define ZEND_INTRIN_SSSE3_FUNC_DECL(func) ZEND_API func __attribute__((target("ssse3")))
# else
#  define ZEND_INTRIN_SSSE3_FUNC_DECL(func) ZEND_API func
# endif
#endif
360 
/* The AVX-512 variants are only offered through a dispatch mechanism
 * (prototype or function pointer) that AVX2 uses as well. */
#if ZEND_INTRIN_AVX2_FUNC_PROTO
# if ZEND_INTRIN_AVX512_FUNC_PROTO
#  define BASE64_INTRIN_AVX512_FUNC_PROTO 1
# endif
# if ZEND_INTRIN_AVX512_VBMI_FUNC_PROTO
#  define BASE64_INTRIN_AVX512_VBMI_FUNC_PROTO 1
# endif
#endif
#if ZEND_INTRIN_AVX2_FUNC_PTR
# if ZEND_INTRIN_AVX512_FUNC_PTR
#  define BASE64_INTRIN_AVX512_FUNC_PTR 1
# endif
# if ZEND_INTRIN_AVX512_VBMI_FUNC_PTR
#  define BASE64_INTRIN_AVX512_VBMI_FUNC_PTR 1
# endif
#endif
374 
375 #if ZEND_INTRIN_AVX2_NATIVE
376 # include <immintrin.h>
377 #elif ZEND_INTRIN_SSSE3_NATIVE
378 # include <tmmintrin.h>
379 #elif (ZEND_INTRIN_SSSE3_RESOLVER || ZEND_INTRIN_AVX2_RESOLVER)
380 # if ZEND_INTRIN_AVX2_RESOLVER
381 #  include <immintrin.h>
382 # else
383 #  include <tmmintrin.h>
384 # endif /* ZEND_INTRIN_AVX2_RESOLVER */
385 # include "Zend/zend_cpuinfo.h"
386 
387 # if BASE64_INTRIN_AVX512_FUNC_PROTO || BASE64_INTRIN_AVX512_FUNC_PTR
388 ZEND_INTRIN_AVX512_FUNC_DECL(zend_string *php_base64_encode_avx512(const unsigned char *str, size_t length, zend_long flags));
389 ZEND_INTRIN_AVX512_FUNC_DECL(zend_string *php_base64_decode_ex_avx512(const unsigned char *str, size_t length, bool strict));
390 # endif
391 # if BASE64_INTRIN_AVX512_VBMI_FUNC_PROTO || BASE64_INTRIN_AVX512_VBMI_FUNC_PTR
392 ZEND_INTRIN_AVX512_VBMI_FUNC_DECL(zend_string *php_base64_encode_avx512_vbmi(const unsigned char *str, size_t length, zend_long flags));
393 ZEND_INTRIN_AVX512_VBMI_FUNC_DECL(zend_string *php_base64_decode_ex_avx512_vbmi(const unsigned char *str, size_t length, bool strict));
394 # endif
395 
396 # if ZEND_INTRIN_AVX2_RESOLVER
397 ZEND_INTRIN_AVX2_FUNC_DECL(zend_string *php_base64_encode_avx2(const unsigned char *str, size_t length, zend_long flags));
398 ZEND_INTRIN_AVX2_FUNC_DECL(zend_string *php_base64_decode_ex_avx2(const unsigned char *str, size_t length, bool strict));
399 # endif
400 
401 # if ZEND_INTRIN_SSSE3_RESOLVER
402 ZEND_INTRIN_SSSE3_FUNC_DECL(zend_string *php_base64_encode_ssse3(const unsigned char *str, size_t length, zend_long flags));
403 ZEND_INTRIN_SSSE3_FUNC_DECL(zend_string *php_base64_decode_ex_ssse3(const unsigned char *str, size_t length, bool strict));
404 # endif
405 
406 zend_string *php_base64_encode_default(const unsigned char *str, size_t length, zend_long flags);
407 zend_string *php_base64_decode_ex_default(const unsigned char *str, size_t length, bool strict);
408 
409 # if (ZEND_INTRIN_AVX2_FUNC_PROTO || ZEND_INTRIN_SSSE3_FUNC_PROTO || BASE64_INTRIN_AVX512_FUNC_PROTO || BASE64_INTRIN_AVX512_VBMI_FUNC_PROTO)
410 PHPAPI zend_string *php_base64_encode_ex(const unsigned char *str, size_t length, zend_long flags) __attribute__((ifunc("resolve_base64_encode")));
411 PHPAPI zend_string *php_base64_decode_ex(const unsigned char *str, size_t length, bool strict) __attribute__((ifunc("resolve_base64_decode")));
412 
413 typedef zend_string *(*base64_encode_func_t)(const unsigned char *, size_t, zend_long flags);
414 typedef zend_string *(*base64_decode_func_t)(const unsigned char *, size_t, bool);
415 
416 ZEND_NO_SANITIZE_ADDRESS
417 ZEND_ATTRIBUTE_UNUSED /* clang mistakenly warns about this */
resolve_base64_encode(void)418 static base64_encode_func_t resolve_base64_encode(void) {
419 # if BASE64_INTRIN_AVX512_VBMI_FUNC_PROTO
420 	if (zend_cpu_supports_avx512_vbmi()) {
421 		return php_base64_encode_avx512_vbmi;
422 	} else
423 # endif
424 # if BASE64_INTRIN_AVX512_FUNC_PROTO
425 	if (zend_cpu_supports_avx512()) {
426 		return php_base64_encode_avx512;
427 	} else
428 # endif
429 # if ZEND_INTRIN_AVX2_FUNC_PROTO
430 	if (zend_cpu_supports_avx2()) {
431 		return php_base64_encode_avx2;
432 	} else
433 # endif
434 #if ZEND_INTRIN_SSSE3_FUNC_PROTO
435 	if (zend_cpu_supports_ssse3()) {
436 		return php_base64_encode_ssse3;
437 	}
438 #endif
439 	return php_base64_encode_default;
440 }
441 
442 ZEND_NO_SANITIZE_ADDRESS
443 ZEND_ATTRIBUTE_UNUSED /* clang mistakenly warns about this */
resolve_base64_decode(void)444 static base64_decode_func_t resolve_base64_decode(void) {
445 # if BASE64_INTRIN_AVX512_VBMI_FUNC_PROTO
446 	if (zend_cpu_supports_avx512_vbmi()) {
447 		return php_base64_decode_ex_avx512_vbmi;
448 	} else
449 # endif
450 # if BASE64_INTRIN_AVX512_FUNC_PROTO
451 	if (zend_cpu_supports_avx512()) {
452 		return php_base64_decode_ex_avx512;
453 	} else
454 # endif
455 # if ZEND_INTRIN_AVX2_FUNC_PROTO
456 	if (zend_cpu_supports_avx2()) {
457 		return php_base64_decode_ex_avx2;
458 	} else
459 # endif
460 #if ZEND_INTRIN_SSSE3_FUNC_PROTO
461 	if (zend_cpu_supports_ssse3()) {
462 		return php_base64_decode_ex_ssse3;
463 	}
464 #endif
465 	return php_base64_decode_ex_default;
466 }
467 # else /* (ZEND_INTRIN_AVX2_FUNC_PROTO || ZEND_INTRIN_SSSE3_FUNC_PROTO) */
468 
469 PHPAPI zend_string *(*php_base64_encode_ptr)(const unsigned char *str, size_t length, zend_long flags) = NULL;
470 PHPAPI zend_string *(*php_base64_decode_ex_ptr)(const unsigned char *str, size_t length, bool strict) = NULL;
471 
php_base64_encode_ex(const unsigned char * str,size_t length,zend_long flags)472 PHPAPI zend_string *php_base64_encode_ex(const unsigned char *str, size_t length, zend_long flags) {
473 	return php_base64_encode_ptr(str, length, flags);
474 }
php_base64_decode_ex(const unsigned char * str,size_t length,bool strict)475 PHPAPI zend_string *php_base64_decode_ex(const unsigned char *str, size_t length, bool strict) {
476 	return php_base64_decode_ex_ptr(str, length, strict);
477 }
478 
PHP_MINIT_FUNCTION(base64_intrin)479 PHP_MINIT_FUNCTION(base64_intrin)
480 {
481 # if BASE64_INTRIN_AVX512_VBMI_FUNC_PTR
482 	if (zend_cpu_supports_avx512_vbmi()) {
483 		php_base64_encode_ptr = php_base64_encode_avx512_vbmi;
484 		php_base64_decode_ex_ptr = php_base64_decode_ex_avx512_vbmi;
485 	} else
486 # endif
487 # if BASE64_INTRIN_AVX512_FUNC_PTR
488 	if (zend_cpu_supports_avx512()) {
489 		php_base64_encode_ptr = php_base64_encode_avx512;
490 		php_base64_decode_ex_ptr = php_base64_decode_ex_avx512;
491 	} else
492 # endif
493 # if ZEND_INTRIN_AVX2_FUNC_PTR
494 	if (zend_cpu_supports_avx2()) {
495 		php_base64_encode_ptr = php_base64_encode_avx2;
496 		php_base64_decode_ex_ptr = php_base64_decode_ex_avx2;
497 	} else
498 # endif
499 #if ZEND_INTRIN_SSSE3_FUNC_PTR
500 	if (zend_cpu_supports_ssse3()) {
501 		php_base64_encode_ptr = php_base64_encode_ssse3;
502 		php_base64_decode_ex_ptr = php_base64_decode_ex_ssse3;
503 	} else
504 #endif
505 	{
506 		php_base64_encode_ptr = php_base64_encode_default;
507 		php_base64_decode_ex_ptr = php_base64_decode_ex_default;
508 	}
509 	return SUCCESS;
510 }
511 # endif /* (ZEND_INTRIN_AVX2_FUNC_PROTO || ZEND_INTRIN_SSSE3_FUNC_PROTO) */
512 #endif /* ZEND_INTRIN_AVX2_NATIVE */
513 
514 #if BASE64_INTRIN_AVX512_VBMI_FUNC_PROTO || BASE64_INTRIN_AVX512_VBMI_FUNC_PTR
php_base64_encode_avx512_vbmi(const unsigned char * str,size_t length,zend_long flags)515 zend_string *php_base64_encode_avx512_vbmi(const unsigned char *str, size_t length, zend_long flags)
516 {
517 	const unsigned char *c = str;
518 	unsigned char *o;
519 	zend_string *result;
520 
521 	result = zend_string_safe_alloc(((length + 2) / 3), 4 * sizeof(char), 0, 0);
522 	o = (unsigned char *)ZSTR_VAL(result);
523 
524 	const __m512i shuffle_splitting = _mm512_setr_epi32(
525 		0x01020001, 0x04050304, 0x07080607, 0x0a0b090a, 0x0d0e0c0d, 0x10110f10,
526 		0x13141213, 0x16171516, 0x191a1819, 0x1c1d1b1c, 0x1f201e1f, 0x22232122,
527 		0x25262425, 0x28292728, 0x2b2c2a2b, 0x2e2f2d2e);
528 	const __m512i multi_shifts = _mm512_set1_epi64(0x3036242a1016040a);
529 	const char *ascii_lookup_tbl = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
530 	const __m512i ascii_lookup = _mm512_loadu_si512((__m512i *)ascii_lookup_tbl);
531 
532 	while (length > 63) {
533 		/* Step 1: load input data */
534 		__m512i str = _mm512_loadu_si512((const __m512i *)c);
535 
536 		/* Step 2: splitting 24-bit words into 32-bit lanes */
537 		str = _mm512_permutexvar_epi8(shuffle_splitting, str);
538 
539 		/* Step 3: moving 6-bit word to sperate bytes */
540 		str = _mm512_multishift_epi64_epi8(multi_shifts, str);
541 
542 		/* Step 4: conversion to ASCII */
543 		str = _mm512_permutexvar_epi8(str, ascii_lookup);
544 
545 		/* Step 5: store the final result */
546 		_mm512_storeu_si512((__m512i *)o, str);
547 		c += 48;
548 		o += 64;
549 		length -= 48;
550 	}
551 
552 	o = php_base64_encode_impl(c, length, o, flags);
553 
554 	ZSTR_LEN(result) = (o - (unsigned char *)ZSTR_VAL(result));
555 
556 	return result;
557 }
558 
php_base64_decode_ex_avx512_vbmi(const unsigned char * str,size_t length,bool strict)559 zend_string *php_base64_decode_ex_avx512_vbmi(const unsigned char *str, size_t length, bool strict)
560 {
561 	const unsigned char *c = str;
562 	unsigned char *o;
563 	size_t outl = 0;
564 	zend_string *result;
565 
566 	result = zend_string_alloc(length, 0);
567 	o = (unsigned char *)ZSTR_VAL(result);
568 
569 	const __m512i lookup_0 = _mm512_setr_epi32(
570 		0x80808080, 0x80808080, 0x80808080, 0x80808080, 0x80808080, 0x80808080,
571 		0x80808080, 0x80808080, 0x80808080, 0x80808080, 0x3e808080, 0x3f808080,
572 		0x37363534, 0x3b3a3938, 0x80803d3c, 0x80808080);
573 	const __m512i lookup_1 = _mm512_setr_epi32(
574 		0x02010080, 0x06050403, 0x0a090807, 0x0e0d0c0b, 0x1211100f, 0x16151413,
575 		0x80191817, 0x80808080, 0x1c1b1a80, 0x201f1e1d, 0x24232221, 0x28272625,
576 		0x2c2b2a29, 0x302f2e2d, 0x80333231, 0x80808080);
577 
578 	const __m512i merge_mask1 = _mm512_set1_epi32(0x01400140);
579 	const __m512i merge_mask2 = _mm512_set1_epi32(0x00011000);
580 
581 	const __m512i continuous_mask = _mm512_setr_epi32(
582 		0x06000102, 0x090a0405, 0x0c0d0e08, 0x16101112, 0x191a1415, 0x1c1d1e18,
583 		0x26202122, 0x292a2425, 0x2c2d2e28, 0x36303132, 0x393a3435, 0x3c3d3e38,
584 		0x00000000, 0x00000000, 0x00000000, 0x00000000);
585 
586 	while (length > 64) {
587 		/* Step 1: load input data */
588 		const __m512i input = _mm512_loadu_si512((__m512i *)c);
589 
590 		/* Step 2: translation into 6-bit values(saved on bytes) from ASCII and error detection */
591 		__m512i str = _mm512_permutex2var_epi8(lookup_0, input, lookup_1);
592 		const uint64_t mask = _mm512_movepi8_mask(_mm512_or_epi64(str, input)); /* convert MSBs to the mask */
593 		if (mask) {
594 			break;
595 		}
596 
597 		/* Step 3: pack four fields within 32-bit words into 24-bit words. */
598 		const __m512i merge_ab_and_bc = _mm512_maddubs_epi16(str, merge_mask1);
599 		str = _mm512_madd_epi16(merge_ab_and_bc, merge_mask2);
600 
601 		/* Step 4: move 3-byte words into the continuous array. */
602 		str = _mm512_permutexvar_epi8(continuous_mask, str);
603 
604 		/* Step 5: store the final result */
605 		_mm512_storeu_si512((__m512i *)o, str);
606 
607 		c += 64;
608 		o += 48;
609 		outl += 48;
610 		length -= 64;
611 	}
612 
613 	if (!php_base64_decode_impl(c, length, (unsigned char*)ZSTR_VAL(result), &outl, strict)) {
614 		zend_string_efree(result);
615 		return NULL;
616 	}
617 
618 	ZSTR_LEN(result) = outl;
619 
620 	return result;
621 }
622 #endif
623 
624 #if BASE64_INTRIN_AVX512_FUNC_PROTO || BASE64_INTRIN_AVX512_FUNC_PTR
php_base64_encode_avx512(const unsigned char * str,size_t length,zend_long flags)625 zend_string *php_base64_encode_avx512(const unsigned char *str, size_t length, zend_long flags)
626 {
627 	const unsigned char *c = str;
628 	unsigned char *o;
629 	zend_string *result;
630 
631 	result = zend_string_safe_alloc(((length + 2) / 3), 4 * sizeof(char), 0, 0);
632 	o = (unsigned char *)ZSTR_VAL(result);
633 
634 	while (length > 63) {
635 		/* Step 1: load input data */
636 		/* [????|????|????|????|PPPO|OONN|NMMM|LLLK|KKJJ|JIII|HHHG|GGFF|FEEE|DDDC|CCBB|BAAA] */
637 		__m512i str = _mm512_loadu_si512((const __m512i *)c);
638 
639 		/* Step 2: splitting 24-bit words into 32-bit lanes */
640 		/* [0000|PPPO|OONN|NMMM|0000|LLLK|KKJJ|JIII|0000|HHHG|GGFF|FEEE|0000|DDDC|CCBB|BAAA] */
641 		str = _mm512_permutexvar_epi32(
642 			_mm512_set_epi32(-1, 11, 10, 9, -1, 8, 7, 6, -1, 5, 4, 3, -1, 2, 1, 0), str);
643 		/* [D1 D2 D0 D1|C1 C2 C0 C1|B1 B2 B0 B1|A1 A2 A0 A1] x 4 */
644 		str = _mm512_shuffle_epi8(str, _mm512_set4_epi32(0x0a0b090a, 0x07080607, 0x04050304, 0x01020001));
645 
646 		/* Step 3: moving 6-bit word to sperate bytes */
647 		/* in:  [bbbbcccc|ccdddddd|aaaaaabb|bbbbcccc] */
648 		/* t0:  [0000cccc|cc000000|aaaaaa00|00000000] */
649 		const __m512i t0 = _mm512_and_si512(str, _mm512_set1_epi32(0x0fc0fc00));
650 		/* t1:  [00000000|00cccccc|00000000|00aaaaaa] */
651 		const __m512i t1 = _mm512_srlv_epi16(t0, _mm512_set1_epi32(0x0006000a));
652 		/* t2:  [ccdddddd|00000000|aabbbbbb|cccc0000] */
653 		const __m512i t2 = _mm512_sllv_epi16(str, _mm512_set1_epi32(0x00080004));
654 		/* str: [00dddddd|00cccccc|00bbbbbb|00aaaaaa] */
655 		str = _mm512_ternarylogic_epi32(_mm512_set1_epi32(0x3f003f00), t2, t1, 0xca);
656 
657 		/* Step 4: conversion to ASCII */
658 		__m512i result = _mm512_subs_epu8(str, _mm512_set1_epi8(51));
659 		const __mmask64 less = _mm512_cmpgt_epi8_mask(_mm512_set1_epi8(26), str);
660 		result = _mm512_mask_mov_epi8(result, less, _mm512_set1_epi8(13));
661 		const __m512i lut = _mm512_set4_epi32(0x000041f0, 0xedfcfcfc, 0xfcfcfcfc, 0xfcfcfc47);
662 		result = _mm512_shuffle_epi8(lut, result);
663 		result = _mm512_add_epi8(result, str);
664 
665 		/* Step 5: store the final result */
666 		_mm512_storeu_si512((__m512i *)o, result);
667 		c += 48;
668 		o += 64;
669 		length -= 48;
670 	}
671 
672 	o = php_base64_encode_impl(c, length, o, flags);
673 
674 	ZSTR_LEN(result) = (o - (unsigned char *)ZSTR_VAL(result));
675 
676 	return result;
677 }
678 
/* Pack four values into one 32-bit word: each argument is truncated to a
 * byte, b0 lands in the least significant byte and b3 in the most
 * significant one. Arguments and the whole expansion are parenthesized so the
 * macro can be used with arbitrary argument expressions and inside larger
 * expressions. */
#define build_dword(b0, b1, b2, b3)						\
	(((uint32_t)(uint8_t)(b0) << 0) | ((uint32_t)(uint8_t)(b1) << 8) |	\
	 ((uint32_t)(uint8_t)(b2) << 16) | ((uint32_t)(uint8_t)(b3) << 24))

/* Build a __m512i in which the sixteen given bytes (b0 = lowest byte) are
 * repeated in each of the four 128-bit lanes. */
#define _mm512_set4lanes_epi8(b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15)	\
	_mm512_setr4_epi32(build_dword(b0, b1, b2, b3), build_dword(b4, b5, b6, b7),			\
			   build_dword(b8, b9, b10, b11), build_dword(b12, b13, b14, b15))
686 
php_base64_decode_ex_avx512(const unsigned char * str,size_t length,bool strict)687 zend_string *php_base64_decode_ex_avx512(const unsigned char *str, size_t length, bool strict)
688 {
689 	const unsigned char *c = str;
690 	unsigned char *o;
691 	size_t outl = 0;
692 	zend_string *result;
693 
694 	result = zend_string_alloc(length, 0);
695 	o = (unsigned char *)ZSTR_VAL(result);
696 
697 	while (length > 64) {
698 		/* Step 1: load input data */
699 		__m512i str = _mm512_loadu_si512((__m512i *)c);
700 
701 		/* Step 2: translation into 6-bit values(saved on bytes) from ASCII and error detection */
702 		const __m512i higher_nibble = _mm512_and_si512(_mm512_srli_epi32(str, 4), _mm512_set1_epi8(0x0f));
703 		const __m512i lower_nibble = _mm512_and_si512(str, _mm512_set1_epi8(0x0f));
704 		const __m512i shiftLUT = _mm512_set4lanes_epi8(
705 				0, 0, 19, 4, -65, -65, -71, -71, 0, 0, 0, 0, 0, 0, 0, 0);
706 		const __m512i maskLUT = _mm512_set4lanes_epi8(
707 				/* 0        : 0b1010_1000*/ 0xa8,
708 				/* 1 .. 9   : 0b1111_1000*/ 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8,
709 				/* 10       : 0b1111_0000*/ 0xf0,
710 				/* 11       : 0b0101_0100*/ 0x54,
711 				/* 12 .. 14 : 0b0101_0000*/ 0x50, 0x50, 0x50,
712 				/* 15       : 0b0101_0100*/ 0x54);
713 		const __m512i bitposLUT = _mm512_set4lanes_epi8(
714 				0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
715 				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
716 		const __m512i M = _mm512_shuffle_epi8(maskLUT, lower_nibble);
717 		const __m512i bit = _mm512_shuffle_epi8(bitposLUT, higher_nibble);
718 		const uint64_t match = _mm512_test_epi8_mask(M, bit);
719 		if (match != (uint64_t)-1) {
720 			break;
721 		}
722 		const __m512i sh = _mm512_shuffle_epi8(shiftLUT, higher_nibble);
723 		const __mmask64 eq_2f = _mm512_cmpeq_epi8_mask(str, _mm512_set1_epi8(0x2f));
724 		const __m512i shift = _mm512_mask_mov_epi8(sh, eq_2f, _mm512_set1_epi8(16));
725 		str = _mm512_add_epi8(str, shift);
726 
727 		/* Step 3: pack four fields within 32-bit words into 24-bit words. */
728 		const __m512i merge_ab_and_bc = _mm512_maddubs_epi16(str, _mm512_set1_epi32(0x01400140));
729 		str = _mm512_madd_epi16(merge_ab_and_bc, _mm512_set1_epi32(0x00011000));
730 
731 		/* Step 4: move 3-byte words into the continuous array. */
732 		const __m512i t1 = _mm512_shuffle_epi8(str,
733 			_mm512_set4lanes_epi8(2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1));
734 		const __m512i s6 = _mm512_setr_epi32(0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 0, 0, 0, 0);
735 		const __m512i t2 = _mm512_permutexvar_epi32(s6, t1);
736 
737 		/* Step 5: store the final result */
738 		_mm512_storeu_si512((__m512i *)o, t2);
739 
740 		c += 64;
741 		o += 48;
742 		outl += 48;
743 		length -= 64;
744 	}
745 
746 	if (!php_base64_decode_impl(c, length, (unsigned char*)ZSTR_VAL(result), &outl, strict)) {
747 		zend_string_efree(result);
748 		return NULL;
749 	}
750 
751 	ZSTR_LEN(result) = outl;
752 
753 	return result;
754 }
755 #endif
756 
757 #if ZEND_INTRIN_AVX2_NATIVE || ZEND_INTRIN_AVX2_RESOLVER
758 # if ZEND_INTRIN_AVX2_RESOLVER && defined(HAVE_FUNC_ATTRIBUTE_TARGET)
759 static __m256i php_base64_encode_avx2_reshuffle(__m256i in) __attribute__((target("avx2")));
760 static __m256i php_base64_encode_avx2_translate(__m256i in) __attribute__((target("avx2")));
761 # endif
/* Rearranges 24 packed input bytes into 32 six-bit values, one per byte.
 *
 * The caller hands in the data shifted by 4 bytes so that an in-lane byte
 * shuffle can serve both 128-bit lanes.
 *
 * input, bytes MSB to LSB:
 * 0 0 0 0 x w v u t s r q p o n m
 * l k j i h g f e d c b a 0 0 0 0 */
static __m256i php_base64_encode_avx2_reshuffle(__m256i in)
{
	/* Every group of four indices names three consecutive source bytes,
	 * with the middle one repeated, so each 32-bit element ends up holding
	 * the material for four output characters. */
	const __m256i spread = _mm256_set_epi8(
		10, 11,  9, 10,
		 7,  8,  6,  7,
		 4,  5,  3,  4,
		 1,  2,  0,  1,

		14, 15, 13, 14,
		11, 12, 10, 11,
		 8,  9,  7,  8,
		 5,  6,  4,  5);
	const __m256i grouped = _mm256_shuffle_epi8(in, spread);

	/* Fields that have to move towards bit 0: keeping the high half of a
	 * 16-bit product with 2^6 / 2^10 is a right shift by 10 / 6. */
	const __m256i moved_down = _mm256_mulhi_epu16(
		_mm256_and_si256(grouped, _mm256_set1_epi32(0x0fc0fc00)),
		_mm256_set1_epi32(0x04000040));

	/* Fields that have to move away from bit 0: keeping the low half of a
	 * 16-bit product with 2^4 / 2^8 is a left shift by 4 / 8. */
	const __m256i moved_up = _mm256_mullo_epi16(
		_mm256_and_si256(grouped, _mm256_set1_epi32(0x003f03f0)),
		_mm256_set1_epi32(0x01000010));

	/* output (upper case are MSB, lower case are LSB):
	 * 00xxxxxx 00wwwwXX 00vvWWWW 00VVVVVV
	 * 00uuuuuu 00ttttUU 00ssTTTT 00SSSSSS
	 * 00rrrrrr 00qqqqRR 00ppQQQQ 00PPPPPP
	 * 00oooooo 00nnnnOO 00mmNNNN 00MMMMMM
	 * 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
	 * 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
	 * 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
	 * 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA */
	return _mm256_or_si256(moved_down, moved_up);
}
800 
/* Maps 32 six-bit values (0..63, one per byte) to their Base64 characters.
 *
 * Each value gets an offset added that depends on the range it falls in:
 *   [0..25]  +65   (slot 0)       [52..61]  -4   (slots 2..11)
 *   [26..51] +71   (slot 1)       [62]     -19   (slot 12)
 *                                 [63]     -16   (slot 13)
 * The offset table is repeated for both 128-bit lanes because the byte
 * shuffle looks up within a lane. */
static __m256i php_base64_encode_avx2_translate(__m256i in)
{
	const __m256i offsets = _mm256_setr_epi8(
			65, 71, -4, -4, -4, -4, -4, -4,
			-4, -4, -4, -4, -19, -16, 0, 0,
			65, 71, -4, -4, -4, -4, -4, -4,
			-4, -4, -4, -4, -19, -16, 0, 0);

	/* 0xFF (-1) for every value above 25, 0x00 otherwise. */
	const __m256i past_upper = _mm256_cmpgt_epi8(in, _mm256_set1_epi8(25));

	/* Saturating subtract yields 0 for values up to 51 and 1..12 above;
	 * subtracting the -1 mask then bumps everything except [0..25] by one. */
	const __m256i slot = _mm256_sub_epi8(
			_mm256_subs_epu8(in, _mm256_set1_epi8(51)), past_upper);

	return _mm256_add_epi8(in, _mm256_shuffle_epi8(offsets, slot));
}
820 #endif /* ZEND_INTRIN_AVX2_NATIVE || ZEND_INTRIN_AVX2_RESOLVER */
821 
822 #if ZEND_INTRIN_SSSE3_NATIVE || ZEND_INTRIN_SSSE3_RESOLVER
823 
824 # if ZEND_INTRIN_SSSE3_RESOLVER && defined(HAVE_FUNC_ATTRIBUTE_TARGET)
825 static __m128i php_base64_encode_ssse3_reshuffle(__m128i in) __attribute__((target("ssse3")));
826 static __m128i php_base64_encode_ssse3_translate(__m128i in) __attribute__((target("ssse3")));
827 # endif
828 
/* Rearranges the 12 low input bytes into 16 six-bit values, one per byte.
 *
 * input, bytes MSB to LSB:
 * 0 0 0 0 l k j i h g f e d c b a */
static __m128i php_base64_encode_ssse3_reshuffle(__m128i in)
{
	/* Every group of four indices names three consecutive source bytes,
	 * with the middle one repeated. */
	const __m128i spread = _mm_set_epi8(
				10, 11,  9, 10,
				7,  8,  6,  7,
				4,  5,  3,  4,
				1,  2,  0,  1);
	const __m128i grouped = _mm_shuffle_epi8(in, spread);

	/* High half of the 16-bit product with 2^6 / 2^10: right shift by 10 / 6. */
	const __m128i moved_down = _mm_mulhi_epu16(
			_mm_and_si128(grouped, _mm_set1_epi32(0x0fc0fc00)),
			_mm_set1_epi32(0x04000040));

	/* Low half of the 16-bit product with 2^4 / 2^8: left shift by 4 / 8. */
	const __m128i moved_up = _mm_mullo_epi16(
			_mm_and_si128(grouped, _mm_set1_epi32(0x003f03f0)),
			_mm_set1_epi32(0x01000010));

	/* output (upper case are MSB, lower case are LSB):
	 * 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
	 * 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
	 * 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
	 * 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA */
	return _mm_or_si128(moved_down, moved_up);
}
856 
/* Maps 16 six-bit values (0..63, one per byte) to their Base64 characters.
 *
 * There are five sets, each with its own offset:
 * #  From      To         Abs    Index  Characters
 * 0  [0..25]   [65..90]   +65        0  ABCDEFGHIJKLMNOPQRSTUVWXYZ
 * 1  [26..51]  [97..122]  +71        1  abcdefghijklmnopqrstuvwxyz
 * 2  [52..61]  [48..57]    -4  [2..11]  0123456789
 * 3  [62]      [43]       -19       12  +
 * 4  [63]      [47]       -16       13  / */
static __m128i php_base64_encode_ssse3_translate(__m128i in)
{
	/* Offset for each index listed in the table above. */
	const __m128i offsets = _mm_setr_epi8(
			65,  71, -4, -4,
			-4,  -4, -4, -4,
			-4,  -4, -4, -4,
			-19, -16,  0,  0
			);

	/* 0xFF (-1) for sets #1..#4, 0x00 for set #0. */
	const __m128i past_upper = _mm_cmpgt_epi8(in, _mm_set1_epi8(25));

	/* The saturating subtract is already right for set #0 and one short for
	 * the others; subtracting the -1 mask adds the missing one. */
	const __m128i slot = _mm_sub_epi8(
			_mm_subs_epu8(in, _mm_set1_epi8(51)), past_upper);

	/* Apply the selected offsets to the input values. */
	return _mm_add_epi8(in, _mm_shuffle_epi8(offsets, slot));
}
888 
/* Bulk SSSE3 encode step, expanded inside the encode entry points.
 *
 * Expects `c` (read cursor), `o` (write cursor) and `length` (bytes left)
 * in the enclosing scope and updates all three. Each round loads 16 bytes
 * but consumes only 12 of them, which is why at least 16 bytes must remain;
 * it emits 16 output characters. Whatever is left over is for the caller to
 * finish. */
#define PHP_BASE64_ENCODE_SSSE3_LOOP							\
	for (; length >= 16; c += 12, o += 16, length -= 12) {		\
		__m128i chunk = _mm_loadu_si128((__m128i *)c);			\
																\
		chunk = php_base64_encode_ssse3_translate(				\
				php_base64_encode_ssse3_reshuffle(chunk));		\
																\
		_mm_storeu_si128((__m128i *)o, chunk);					\
	}
902 
903 #endif /* ZEND_INTRIN_SSSE3_NATIVE || ZEND_INTRIN_SSSE3_RESOLVER */
904 
905 #if ZEND_INTRIN_AVX2_NATIVE || ZEND_INTRIN_AVX2_RESOLVER || ZEND_INTRIN_SSSE3_NATIVE || ZEND_INTRIN_SSSE3_RESOLVER
906 # if ZEND_INTRIN_AVX2_NATIVE || ZEND_INTRIN_SSSE3_NATIVE
php_base64_encode_ex(const unsigned char * str,size_t length,zend_long flags)907 PHPAPI zend_string *php_base64_encode_ex(const unsigned char *str, size_t length, zend_long flags)
908 # elif ZEND_INTRIN_AVX2_RESOLVER
909 zend_string *php_base64_encode_avx2(const unsigned char *str, size_t length, zend_long flags)
910 # else /* ZEND_INTRIN_SSSE3_RESOLVER */
911 zend_string *php_base64_encode_ssse3(const unsigned char *str, size_t length, zend_long flags)
912 # endif
913 {
914 	const unsigned char *c = str;
915 	unsigned char *o;
916 	zend_string *result;
917 
918 	result = zend_string_safe_alloc(((length + 2) / 3), 4 * sizeof(char), 0, 0);
919 	o = (unsigned char *)ZSTR_VAL(result);
920 # if ZEND_INTRIN_AVX2_NATIVE || ZEND_INTRIN_AVX2_RESOLVER
921 	if (length > 31) {
922 		__m256i s = _mm256_loadu_si256((__m256i *)c);
923 
924 		s = _mm256_permutevar8x32_epi32(s, _mm256_setr_epi32(0, 0, 1, 2, 3, 4, 5, 6));
925 
926 		for (;;) {
927 			s = php_base64_encode_avx2_reshuffle(s);
928 
929 			s = php_base64_encode_avx2_translate(s);
930 
931 			_mm256_storeu_si256((__m256i *)o, s);
932 			c += 24;
933 			o += 32;
934 			length -= 24;
935 			if (length < 28) {
936 				break;
937 			}
938 			s = _mm256_loadu_si256((__m256i *)(c - 4));
939 		}
940 	}
941 # else
942 	PHP_BASE64_ENCODE_SSSE3_LOOP;
943 # endif
944 
945 	o = php_base64_encode_impl(c, length, o, flags);
946 
947 	ZSTR_LEN(result) = (o - (unsigned char *)ZSTR_VAL(result));
948 
949 	return result;
950 }
951 
# if ZEND_INTRIN_SSSE3_RESOLVER && ZEND_INTRIN_AVX2_RESOLVER
/* SSSE3 encoder variant, compiled separately only when both the SSSE3 and
 * the AVX2 resolver are enabled.
 *
 * Allocates room for four output characters per started group of three
 * input bytes, runs the SSSE3 bulk loop and lets php_base64_encode_impl()
 * encode the remainder. The result length is derived from the position that
 * php_base64_encode_impl() returns. */
zend_string *php_base64_encode_ssse3(const unsigned char *str, size_t length, zend_long flags)
{
	zend_string *result = zend_string_safe_alloc(((length + 2) / 3), 4 * sizeof(char), 0, 0);
	unsigned char *const out_begin = (unsigned char *)ZSTR_VAL(result);
	unsigned char *o = out_begin;
	const unsigned char *c = str;

	PHP_BASE64_ENCODE_SSSE3_LOOP;

	o = php_base64_encode_impl(c, length, o, flags);

	ZSTR_LEN(result) = (o - out_begin);

	return result;
}
# endif
971 #endif /* ZEND_INTRIN_AVX2_NATIVE || ZEND_INTRIN_AVX2_RESOLVER || ZEND_INTRIN_SSSE3_NATIVE || ZEND_INTRIN_SSSE3_RESOLVER */
972 
973 /* }}} */
974 
975 #if ZEND_INTRIN_AVX2_NATIVE || ZEND_INTRIN_AVX2_RESOLVER
976 # if ZEND_INTRIN_AVX2_RESOLVER && defined(HAVE_FUNC_ATTRIBUTE_TARGET)
977 static __m256i php_base64_decode_avx2_reshuffle(__m256i in) __attribute__((target("avx2")));
978 # endif
979 
/* Packs 32 six-bit values (one per byte) into 24 contiguous bytes that
 * occupy the low end of the returned vector. */
static __m256i php_base64_decode_avx2_reshuffle(__m256i in)
{
	/* Byte weights 0x40 and 0x01: join neighbouring 6-bit values into one
	 * 12-bit value per 16-bit element. */
	const __m256i pairs = _mm256_maddubs_epi16(in, _mm256_set1_epi32(0x01400140));

	/* Word weights 0x1000 and 0x0001: join neighbouring 12-bit values into
	 * one 24-bit value per 32-bit element. */
	const __m256i triples = _mm256_madd_epi16(pairs, _mm256_set1_epi32(0x00011000));

	/* Within each 128-bit lane: reverse the three payload bytes of every
	 * 32-bit element and squeeze out the empty fourth one; the -1 entries
	 * zero the last four bytes of the lane. */
	const __m256i per_lane = _mm256_shuffle_epi8(triples, _mm256_setr_epi8(
				2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1,
				2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1));

	/* Place the 12 payload bytes of the upper lane right after those of the
	 * lower lane. */
	return _mm256_permutevar8x32_epi32(per_lane, _mm256_setr_epi32(0, 1, 2, 4, 5, 6, -1, -1));
}
994 #endif
995 
996 #if ZEND_INTRIN_SSSE3_NATIVE || ZEND_INTRIN_SSSE3_RESOLVER
997 # if ZEND_INTRIN_SSSE3_RESOLVER && defined(HAVE_FUNC_ATTRIBUTE_TARGET)
998 static __m128i php_base64_decode_ssse3_reshuffle(__m128i in) __attribute__((target("ssse3")));
999 # endif
1000 
/* Packs 16 six-bit values (one per byte) into 12 contiguous bytes in the
 * low end of the returned vector; the top four bytes are zeroed. */
static __m128i php_base64_decode_ssse3_reshuffle(__m128i in)
{
	/* Join neighbouring 6-bit values into 12-bit values:
	 * 0000kkkk LLllllll 0000JJJJ JJjjKKKK
	 * 0000hhhh IIiiiiii 0000GGGG GGggHHHH
	 * 0000eeee FFffffff 0000DDDD DDddEEEE
	 * 0000bbbb CCcccccc 0000AAAA AAaaBBBB */
	const __m128i pairs = _mm_maddubs_epi16(in, _mm_set1_epi32(0x01400140));

	/* Join neighbouring 12-bit values into 24-bit values:
	 * 00000000 JJJJJJjj KKKKkkkk LLllllll
	 * 00000000 GGGGGGgg HHHHhhhh IIiiiiii
	 * 00000000 DDDDDDdd EEEEeeee FFffffff
	 * 00000000 AAAAAAaa BBBBbbbb CCcccccc */
	const __m128i triples = _mm_madd_epi16(pairs, _mm_set1_epi32(0x00011000));

	/* Reverse the payload bytes of every 32-bit element and close the gaps:
	 * 00000000 00000000 00000000 00000000
	 * LLllllll KKKKkkkk JJJJJJjj IIiiiiii
	 * HHHHhhhh GGGGGGgg FFffffff EEEEeeee
	 * DDDDDDdd CCcccccc BBBBbbbb AAAAAAaa */
	return _mm_shuffle_epi8(triples, _mm_setr_epi8(
		 2,  1,  0,
		 6,  5,  4,
		10,  9,  8,
		14, 13, 12,
		-1, -1, -1, -1));
}
1028 
/* Bulk SSSE3 decode step, expanded inside the decode entry points.
 *
 * Expects `c` (read cursor), `o` (write cursor), `outl` (bytes produced) and
 * `length` (bytes left) in the enclosing scope and updates all four. Each
 * round turns 16 input characters into 12 output bytes.
 *
 * Every byte is looked up twice, once by its low and once by its high
 * nibble; the round is abandoned, with nothing consumed, as soon as the two
 * lookups share a set bit for any byte. The caller then continues from the
 * current cursors. For accepted bytes the high nibble (minus one for '/',
 * 0x2f) picks the value added to turn the character into its 6-bit value,
 * e.g. +4 for digits, -65 for upper case and -71 for lower case letters. */
#define PHP_BASE64_DECODE_SSSE3_LOOP										\
	while (length > 15 + 6 + 2) {											\
		const __m128i nibble_mask = _mm_set1_epi8(0x2f);					\
		const __m128i lo_lut = _mm_setr_epi8(								\
				0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,				\
				0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A);			\
		const __m128i hi_lut = _mm_setr_epi8(								\
				0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,				\
				0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10);			\
		const __m128i roll_lut = _mm_setr_epi8(								\
				0,  16,  19,   4, -65, -65, -71, -71,						\
				0,   0,   0,   0,   0,   0,   0,   0);						\
		__m128i chunk = _mm_loadu_si128((__m128i *)c);						\
		const __m128i hi_idx = _mm_and_si128(								\
				_mm_srli_epi32(chunk, 4), nibble_mask);						\
		const __m128i lo_idx = _mm_and_si128(chunk, nibble_mask);			\
		const __m128i rejected = _mm_and_si128(								\
				_mm_shuffle_epi8(lo_lut, lo_idx),							\
				_mm_shuffle_epi8(hi_lut, hi_idx));							\
																			\
		if (UNEXPECTED(														\
			_mm_movemask_epi8(												\
				_mm_cmpgt_epi8(rejected, _mm_set1_epi8(0))))) {				\
			break;															\
		}																	\
																			\
		chunk = _mm_add_epi8(chunk, _mm_shuffle_epi8(roll_lut,				\
				_mm_add_epi8(												\
					_mm_cmpeq_epi8(chunk, _mm_set1_epi8(0x2f)), hi_idx)));	\
																			\
		_mm_storeu_si128((__m128i *)o,										\
				php_base64_decode_ssse3_reshuffle(chunk));					\
																			\
		c += 16;															\
		o += 12;															\
		outl += 12;															\
		length -= 16;														\
	}
1075 
1076 #endif
1077 
1078 #if ZEND_INTRIN_AVX2_NATIVE || ZEND_INTRIN_AVX2_RESOLVER || ZEND_INTRIN_SSSE3_NATIVE || ZEND_INTRIN_SSSE3_RESOLVER
1079 # if ZEND_INTRIN_AVX2_NATIVE || ZEND_INTRIN_SSSE3_NATIVE
php_base64_decode_ex(const unsigned char * str,size_t length,bool strict)1080 PHPAPI zend_string *php_base64_decode_ex(const unsigned char *str, size_t length, bool strict)
1081 # elif ZEND_INTRIN_AVX2_RESOLVER
1082 zend_string *php_base64_decode_ex_avx2(const unsigned char *str, size_t length, bool strict)
1083 # else
1084 zend_string *php_base64_decode_ex_ssse3(const unsigned char *str, size_t length, bool strict)
1085 # endif
1086 {
1087 	const unsigned char *c = str;
1088 	unsigned char *o;
1089 	size_t outl = 0;
1090 	zend_string *result;
1091 
1092 	result = zend_string_alloc(length, 0);
1093 	o = (unsigned char *)ZSTR_VAL(result);
1094 
1095 	/* See: "Faster Base64 Encoding and Decoding using AVX2 Instructions"
1096 	* https://arxiv.org/pdf/1704.00605.pdf */
1097 # if ZEND_INTRIN_AVX2_NATIVE || ZEND_INTRIN_AVX2_RESOLVER
1098 	while (length > 31 + 11 + 2) {
1099 		__m256i lut_lo, lut_hi, lut_roll;
1100 		__m256i hi_nibbles, lo_nibbles, hi, lo;
1101 		__m256i str = _mm256_loadu_si256((__m256i *)c);
1102 
1103 		lut_lo = _mm256_setr_epi8(
1104 				0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
1105 				0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A,
1106 				0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
1107 				0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A);
1108 
1109 		lut_hi = _mm256_setr_epi8(
1110 				0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
1111 				0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
1112 				0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
1113 				0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10);
1114 
1115 		lut_roll = _mm256_setr_epi8(
1116 				0,  16,  19,   4, -65, -65, -71, -71,
1117 				0,   0,   0,   0,   0,   0,   0,   0,
1118 				0,  16,  19,   4, -65, -65, -71, -71,
1119 				0,   0,   0,   0,   0,   0,   0,   0);
1120 
1121 		hi_nibbles  = _mm256_and_si256(_mm256_srli_epi32(str, 4), _mm256_set1_epi8(0x2f));
1122 		lo_nibbles  = _mm256_and_si256(str, _mm256_set1_epi8(0x2f));
1123 		hi          = _mm256_shuffle_epi8(lut_hi, hi_nibbles);
1124 		lo          = _mm256_shuffle_epi8(lut_lo, lo_nibbles);
1125 
1126 		if (!_mm256_testz_si256(lo, hi)) {
1127 			break;
1128 		} else {
1129 			__m256i eq_2f, roll;
1130 			eq_2f = _mm256_cmpeq_epi8(str, _mm256_set1_epi8(0x2f));
1131 			roll  = _mm256_shuffle_epi8(lut_roll, _mm256_add_epi8(eq_2f, hi_nibbles));
1132 
1133 
1134 			str = _mm256_add_epi8(str, roll);
1135 
1136 			str = php_base64_decode_avx2_reshuffle(str);
1137 
1138 			_mm256_storeu_si256((__m256i *)o, str);
1139 
1140 			c += 32;
1141 			o += 24;
1142 			outl += 24;
1143 			length -= 32;
1144 		}
1145 	}
1146 # else
1147 	PHP_BASE64_DECODE_SSSE3_LOOP;
1148 # endif
1149 
1150 	if (!php_base64_decode_impl(c, length, (unsigned char*)ZSTR_VAL(result), &outl, strict)) {
1151 		zend_string_efree(result);
1152 		return NULL;
1153 	}
1154 
1155 	ZSTR_LEN(result) = outl;
1156 
1157 	return result;
1158 }
1159 
# if ZEND_INTRIN_SSSE3_RESOLVER && ZEND_INTRIN_AVX2_RESOLVER
/* SSSE3 decoder variant, compiled separately only when both the SSSE3 and
 * the AVX2 resolver are enabled.
 *
 * Runs the SSSE3 bulk loop and then hands the unconsumed input to
 * php_base64_decode_impl(). Returns the decoded zend_string, or NULL after
 * releasing the buffer when php_base64_decode_impl() reports failure. */
zend_string *php_base64_decode_ex_ssse3(const unsigned char *str, size_t length, bool strict)
{
	zend_string *result = zend_string_alloc(length, 0);
	unsigned char *o = (unsigned char *)ZSTR_VAL(result);
	const unsigned char *c = str;
	size_t outl = 0;

	PHP_BASE64_DECODE_SSSE3_LOOP;

	if (php_base64_decode_impl(c, length, (unsigned char*)ZSTR_VAL(result), &outl, strict)) {
		ZSTR_LEN(result) = outl;

		return result;
	}

	zend_string_efree(result);
	return NULL;
}
# endif
1183 #endif /* ZEND_INTRIN_AVX2_NATIVE || ZEND_INTRIN_AVX2_RESOLVER || ZEND_INTRIN_SSSE3_NATIVE || ZEND_INTRIN_SSSE3_RESOLVER */
1184 
1185 #if !ZEND_INTRIN_AVX2_NATIVE && !ZEND_INTRIN_SSSE3_NATIVE
1186 #if ZEND_INTRIN_AVX2_RESOLVER || ZEND_INTRIN_SSSE3_RESOLVER
php_base64_encode_default(const unsigned char * str,size_t length,zend_long flags)1187 zend_string *php_base64_encode_default(const unsigned char *str, size_t length, zend_long flags)
1188 #else
1189 PHPAPI zend_string *php_base64_encode_ex(const unsigned char *str, size_t length, zend_long flags)
1190 #endif
1191 {
1192 	unsigned char *p;
1193 	zend_string *result;
1194 
1195 	result = zend_string_safe_alloc(((length + 2) / 3), 4 * sizeof(char), 0, 0);
1196 	p = (unsigned char *)ZSTR_VAL(result);
1197 
1198 	p = php_base64_encode_impl(str, length, p, flags);
1199 
1200 	ZSTR_LEN(result) = (p - (unsigned char *)ZSTR_VAL(result));
1201 
1202 	return result;
1203 }
1204 #endif
1205 
1206 #if !ZEND_INTRIN_AVX2_NATIVE && !ZEND_INTRIN_SSSE3_NATIVE
1207 #if ZEND_INTRIN_AVX2_RESOLVER || ZEND_INTRIN_SSSE3_RESOLVER
php_base64_decode_ex_default(const unsigned char * str,size_t length,bool strict)1208 zend_string *php_base64_decode_ex_default(const unsigned char *str, size_t length, bool strict)
1209 #else
1210 PHPAPI zend_string *php_base64_decode_ex(const unsigned char *str, size_t length, bool strict)
1211 #endif
1212 {
1213 	zend_string *result;
1214 	size_t outl = 0;
1215 
1216 	result = zend_string_alloc(length, 0);
1217 
1218 	if (!php_base64_decode_impl(str, length, (unsigned char*)ZSTR_VAL(result), &outl, strict)) {
1219 		zend_string_efree(result);
1220 		return NULL;
1221 	}
1222 
1223 	ZSTR_LEN(result) = outl;
1224 
1225 	return result;
1226 }
1227 #endif
1228 /* }}} */
1229 
1230 /* {{{ Encodes string using MIME base64 algorithm */
PHP_FUNCTION(base64_encode)1231 PHP_FUNCTION(base64_encode)
1232 {
1233 	char *str;
1234 	size_t str_len;
1235 	zend_string *result;
1236 
1237 	ZEND_PARSE_PARAMETERS_START(1, 1)
1238 		Z_PARAM_STRING(str, str_len)
1239 	ZEND_PARSE_PARAMETERS_END();
1240 
1241 	result = php_base64_encode((unsigned char*)str, str_len);
1242 	RETURN_STR(result);
1243 }
1244 /* }}} */
1245 
1246 /* {{{ Decodes string using MIME base64 algorithm */
PHP_FUNCTION(base64_decode)1247 PHP_FUNCTION(base64_decode)
1248 {
1249 	char *str;
1250 	bool strict = 0;
1251 	size_t str_len;
1252 	zend_string *result;
1253 
1254 	ZEND_PARSE_PARAMETERS_START(1, 2)
1255 		Z_PARAM_STRING(str, str_len)
1256 		Z_PARAM_OPTIONAL
1257 		Z_PARAM_BOOL(strict)
1258 	ZEND_PARSE_PARAMETERS_END();
1259 
1260 	result = php_base64_decode_ex((unsigned char*)str, str_len, strict);
1261 	if (result != NULL) {
1262 		RETURN_STR(result);
1263 	} else {
1264 		RETURN_FALSE;
1265 	}
1266 }
1267 /* }}} */
1268