xref: /PHP-8.2/ext/standard/base64.c (revision 413844d6)
1 /*
2    +----------------------------------------------------------------------+
3    | Copyright (c) The PHP Group                                          |
4    +----------------------------------------------------------------------+
5    | This source file is subject to version 3.01 of the PHP license,      |
6    | that is bundled with this package in the file LICENSE, and is        |
7    | available through the world-wide-web at the following url:           |
8    | https://www.php.net/license/3_01.txt                                 |
9    | If you did not receive a copy of the PHP license and are unable to   |
10    | obtain it through the world-wide-web, please send a note to          |
11    | license@php.net so we can mail you a copy immediately.               |
12    +----------------------------------------------------------------------+
13    | Author: Jim Winstead <jimw@php.net>                                  |
14    |         Xinchen Hui <laruence@php.net>                               |
15    +----------------------------------------------------------------------+
16  */
17 
18 #include <string.h>
19 
20 #include "php.h"
21 #include "base64.h"
22 
23 /* {{{ base64 tables */
/* Encoding alphabet: the character for 6-bit value v is base64_table[v].
 * Spelled as a string literal, so the array holds the 64 symbols followed by
 * the literal's terminating NUL (65 bytes in total). */
static const char base64_table[] =
	"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
	"abcdefghijklmnopqrstuvwxyz"
	"0123456789+/";

/* Character used to pad the final group of an encoded string. */
static const char base64_pad = '=';

/* Decoding table, indexed by input byte:
 *   0..63  the 6-bit value of an alphabet character
 *   -1     whitespace (tab, LF, CR, space)
 *   -2     any other byte (not part of the alphabet) */
static const short base64_reverse_table[256] = {
	/* 0x00 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -2, -2, -1, -2, -2,
	/* 0x10 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
	/* 0x20 */ -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, 62, -2, -2, -2, 63,
	/* 0x30 */ 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -2, -2, -2, -2, -2, -2,
	/* 0x40 */ -2,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
	/* 0x50 */ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -2, -2, -2, -2, -2,
	/* 0x60 */ -2, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
	/* 0x70 */ 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -2, -2, -2, -2, -2,
	/* 0x80 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
	/* 0x90 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
	/* 0xa0 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
	/* 0xb0 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
	/* 0xc0 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
	/* 0xd0 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
	/* 0xe0 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
	/* 0xf0 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2
};
52 /* }}} */
53 
54 #if defined(__aarch64__) || defined(_M_ARM64)
55 #include <arm_neon.h>
56 
encode_toascii(const uint8x16_t input,const uint8x16x2_t shift_LUT)57 static zend_always_inline uint8x16_t encode_toascii(const uint8x16_t input, const uint8x16x2_t shift_LUT)
58 {
59 	/* reduce  0..51 -> 0
60 	          52..61 -> 1 .. 10
61 	              62 -> 11
62 	              63 -> 12 */
63 	uint8x16_t result = vqsubq_u8(input, vdupq_n_u8(51));
64 	/* distinguish between ranges 0..25 and 26..51:
65 	   0 .. 25 -> remains 0
66 	   26 .. 51 -> becomes 13 */
67 	const uint8x16_t less = vcgtq_u8(vdupq_n_u8(26), input);
68 	result = vorrq_u8(result, vandq_u8(less, vdupq_n_u8(13)));
69 	/* read shift */
70 	result = vqtbl2q_u8(shift_LUT, result);
71 	return vaddq_u8(result, input);
72 }
73 
neon_base64_encode(const unsigned char * in,size_t inl,unsigned char * out,size_t * left)74 static zend_always_inline unsigned char *neon_base64_encode(const unsigned char *in, size_t inl, unsigned char *out, size_t *left)
75 {
76 	const uint8_t shift_LUT_[32] = {'a' - 26, '0' - 52, '0' - 52, '0' - 52,
77 					'0' - 52, '0' - 52, '0' - 52, '0' - 52,
78 					'0' - 52, '0' - 52, '0' - 52, '+' - 62,
79 					'/' - 63, 'A',      0,        0,
80 					'a' - 26, '0' - 52, '0' - 52, '0' - 52,
81 					'0' - 52, '0' - 52, '0' - 52, '0' - 52,
82 					'0' - 52, '0' - 52, '0' - 52, '+' - 62,
83 					'/' - 63, 'A',      0,        0};
84 	const uint8x16x2_t shift_LUT = *((const uint8x16x2_t *)shift_LUT_);
85 	do {
86 		/* [ccdddddd | bbbbcccc | aaaaaabb]
87 		    x.val[2] | x.val[1] | x.val[0] */
88 		const uint8x16x3_t x = vld3q_u8((const uint8_t *)(in));
89 
90 		/* [00aa_aaaa] */
91 		const uint8x16_t field_a = vshrq_n_u8(x.val[0], 2);
92 
93 		const uint8x16_t field_b =             /* [00bb_bbbb] */
94 		    vbslq_u8(vdupq_n_u8(0x30),         /* [0011_0000] */
95 		             vshlq_n_u8(x.val[0], 4),  /* [aabb_0000] */
96 		             vshrq_n_u8(x.val[1], 4)); /* [0000_bbbb] */
97 
98 		const uint8x16_t field_c =             /* [00cc_cccc] */
99 		    vbslq_u8(vdupq_n_u8(0x3c),         /* [0011_1100] */
100 		             vshlq_n_u8(x.val[1], 2),  /* [bbcc_cc00] */
101 		             vshrq_n_u8(x.val[2], 6)); /* [0000_00cc] */
102 
103 		/* [00dd_dddd] */
104 		const uint8x16_t field_d = vandq_u8(x.val[2], vdupq_n_u8(0x3f));
105 
106 		uint8x16x4_t result;
107 		result.val[0] = encode_toascii(field_a, shift_LUT);
108 		result.val[1] = encode_toascii(field_b, shift_LUT);
109 		result.val[2] = encode_toascii(field_c, shift_LUT);
110 		result.val[3] = encode_toascii(field_d, shift_LUT);
111 
112 		vst4q_u8((uint8_t *)out, result);
113 		out += 64;
114 		in += 16 * 3;
115 		inl -= 16 * 3;
116 	} while (inl >= 16 * 3);
117 
118 	*left = inl;
119 	return out;
120 }
121 #endif /* defined(__aarch64__) || defined(_M_ARM64) */
122 
php_base64_encode_impl(const unsigned char * in,size_t inl,unsigned char * out)123 static zend_always_inline unsigned char *php_base64_encode_impl(const unsigned char *in, size_t inl, unsigned char *out) /* {{{ */
124 {
125 #if defined(__aarch64__) || defined(_M_ARM64)
126 	if (inl >= 16 * 3) {
127 		size_t left = 0;
128 		out = neon_base64_encode(in, inl, out, &left);
129 		in += inl - left;
130 		inl = left;
131 	}
132 #endif
133 
134 	while (inl > 2) { /* keep going until we have less than 24 bits */
135 		*out++ = base64_table[in[0] >> 2];
136 		*out++ = base64_table[((in[0] & 0x03) << 4) + (in[1] >> 4)];
137 		*out++ = base64_table[((in[1] & 0x0f) << 2) + (in[2] >> 6)];
138 		*out++ = base64_table[in[2] & 0x3f];
139 
140 		in += 3;
141 		inl -= 3; /* we just handle 3 octets of data */
142 	}
143 
144 	/* now deal with the tail end of things */
145 	if (inl != 0) {
146 		*out++ = base64_table[in[0] >> 2];
147 		if (inl > 1) {
148 			*out++ = base64_table[((in[0] & 0x03) << 4) + (in[1] >> 4)];
149 			*out++ = base64_table[(in[1] & 0x0f) << 2];
150 			*out++ = base64_pad;
151 		} else {
152 			*out++ = base64_table[(in[0] & 0x03) << 4];
153 			*out++ = base64_pad;
154 			*out++ = base64_pad;
155 		}
156 	}
157 
158 	*out = '\0';
159 
160 	return out;
161 }
162 /* }}} */
163 
164 #if defined(__aarch64__) || defined(_M_ARM64)
decode_fromascii(const uint8x16_t input,uint8x16_t * error,const uint8x16x2_t shiftLUT,const uint8x16x2_t maskLUT,const uint8x16x2_t bitposLUT)165 static zend_always_inline uint8x16_t decode_fromascii(const uint8x16_t input, uint8x16_t *error, const uint8x16x2_t shiftLUT, const uint8x16x2_t maskLUT, const uint8x16x2_t bitposLUT) {
166 	const uint8x16_t higher_nibble = vshrq_n_u8(input, 4);
167 	const uint8x16_t lower_nibble = vandq_u8(input, vdupq_n_u8(0x0f));
168 	const uint8x16_t sh = vqtbl2q_u8(shiftLUT, higher_nibble);
169 	const uint8x16_t eq_2f = vceqq_u8(input, vdupq_n_u8(0x2f));
170 	const uint8x16_t shift = vbslq_u8(eq_2f, vdupq_n_u8(16), sh);
171 	const uint8x16_t M = vqtbl2q_u8(maskLUT, lower_nibble);
172 	const uint8x16_t bit = vqtbl2q_u8(bitposLUT, higher_nibble);
173 	*error = vceqq_u8(vandq_u8(M, bit), vdupq_n_u8(0));
174 	return vaddq_u8(input, shift);
175 }
176 
neon_base64_decode(const unsigned char * in,size_t inl,unsigned char * out,size_t * left)177 static zend_always_inline size_t neon_base64_decode(const unsigned char *in, size_t inl, unsigned char *out, size_t *left) {
178 	unsigned char *out_orig = out;
179 	const uint8_t shiftLUT_[32] = {
180 		0,   0,  19,   4, (uint8_t)-65, (uint8_t)-65, (uint8_t)-71, (uint8_t)-71,
181 		0,   0,   0,   0,   0,   0,   0,   0,
182 		0,   0,  19,   4, (uint8_t)-65, (uint8_t)-65, (uint8_t)-71, (uint8_t)-71,
183 		0,   0,   0,   0,   0,   0,   0,   0};
184 	const uint8_t maskLUT_[32] = {
185 		/* 0        : 0b1010_1000*/ 0xa8,
186 		/* 1 .. 9   : 0b1111_1000*/ 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8,
187 		/* 10       : 0b1111_0000*/ 0xf0,
188 		/* 11       : 0b0101_0100*/ 0x54,
189 		/* 12 .. 14 : 0b0101_0000*/ 0x50, 0x50, 0x50,
190 		/* 15       : 0b0101_0100*/ 0x54,
191 
192 		/* 0        : 0b1010_1000*/ 0xa8,
193 		/* 1 .. 9   : 0b1111_1000*/ 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8,
194 		/* 10       : 0b1111_0000*/ 0xf0,
195 		/* 11       : 0b0101_0100*/ 0x54,
196 		/* 12 .. 14 : 0b0101_0000*/ 0x50, 0x50, 0x50,
197 		/* 15       : 0b0101_0100*/ 0x54
198 	};
199 	const uint8_t bitposLUT_[32] = {
200 		0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
201 		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
202 
203 		0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
204 		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
205 	};
206 	const uint8x16x2_t shiftLUT = *((const uint8x16x2_t *)shiftLUT_);
207 	const uint8x16x2_t maskLUT = *((const uint8x16x2_t *)maskLUT_);
208 	const uint8x16x2_t bitposLUT = *((const uint8x16x2_t *)bitposLUT_);;
209 
210 	do {
211 		const uint8x16x4_t x = vld4q_u8((const unsigned char *)in);
212 		uint8x16_t error_a;
213 		uint8x16_t error_b;
214 		uint8x16_t error_c;
215 		uint8x16_t error_d;
216 		uint8x16_t field_a = decode_fromascii(x.val[0], &error_a, shiftLUT, maskLUT, bitposLUT);
217 		uint8x16_t field_b = decode_fromascii(x.val[1], &error_b, shiftLUT, maskLUT, bitposLUT);
218 		uint8x16_t field_c = decode_fromascii(x.val[2], &error_c, shiftLUT, maskLUT, bitposLUT);
219 		uint8x16_t field_d = decode_fromascii(x.val[3], &error_d, shiftLUT, maskLUT, bitposLUT);
220 
221 		const uint8x16_t err = vorrq_u8(vorrq_u8(error_a, error_b), vorrq_u8(error_c, error_d));
222 		union {uint8_t mem[16]; uint64_t dw[2]; } error;
223 		vst1q_u8(error.mem, err);
224 
225 		/* Check that the input only contains bytes belonging to the alphabet of
226 		   Base64. If there are errors, decode the rest of the string with the
227 		   scalar decoder. */
228 		if (error.dw[0] | error.dw[1])
229 			break;
230 
231 		uint8x16x3_t result;
232 		result.val[0] = vorrq_u8(vshrq_n_u8(field_b, 4), vshlq_n_u8(field_a, 2));
233 		result.val[1] = vorrq_u8(vshrq_n_u8(field_c, 2), vshlq_n_u8(field_b, 4));
234 		result.val[2] = vorrq_u8(field_d, vshlq_n_u8(field_c, 6));
235 
236 		vst3q_u8((unsigned char *)out, result);
237 		out += 16 * 3;
238 		in += 16 * 4;
239 		inl -= 16 * 4;
240 	} while (inl >= 16 * 4);
241 	*left = inl;
242 	return out - out_orig;
243 }
244 #endif /* defined(__aarch64__) || defined(_M_ARM64) */
245 
php_base64_decode_impl(const unsigned char * in,size_t inl,unsigned char * out,size_t * outl,bool strict)246 static zend_always_inline int php_base64_decode_impl(const unsigned char *in, size_t inl, unsigned char *out, size_t *outl, bool strict) /* {{{ */
247 {
248 	int ch;
249 	size_t i = 0, padding = 0, j = *outl;
250 
251 #if defined(__aarch64__) || defined(_M_ARM64)
252 	if (inl >= 16 * 4) {
253 		size_t left = 0;
254 		j += neon_base64_decode(in, inl, out, &left);
255 		i = inl - left;
256 		in += i;
257 		inl = left;
258 	}
259 #endif
260 
261 	/* run through the whole string, converting as we go */
262 	while (inl-- > 0) {
263 		ch = *in++;
264 		if (ch == base64_pad) {
265 			padding++;
266 			continue;
267 		}
268 
269 		ch = base64_reverse_table[ch];
270 		if (!strict) {
271 			/* skip unknown characters and whitespace */
272 			if (ch < 0) {
273 				continue;
274 			}
275 		} else {
276 			/* skip whitespace */
277 			if (ch == -1) {
278 				continue;
279 			}
280 			/* fail on bad characters or if any data follows padding */
281 			if (ch == -2 || padding) {
282 				goto fail;
283 			}
284 		}
285 
286 		switch (i % 4) {
287 			case 0:
288 				out[j] = ch << 2;
289 				break;
290 			case 1:
291 				out[j++] |= ch >> 4;
292 				out[j] = (ch & 0x0f) << 4;
293 				break;
294 			case 2:
295 				out[j++] |= ch >>2;
296 				out[j] = (ch & 0x03) << 6;
297 				break;
298 			case 3:
299 				out[j++] |= ch;
300 				break;
301 		}
302 		i++;
303 	}
304 
305 	/* fail if the input is truncated (only one char in last group) */
306 	if (strict && i % 4 == 1) {
307 		goto fail;
308 	}
309 
310 	/* fail if the padding length is wrong (not VV==, VVV=), but accept zero padding
311 	 * RFC 4648: "In some circumstances, the use of padding [--] is not required" */
312 	if (strict && padding && (padding > 2 || (i + padding) % 4 != 0)) {
313 		goto fail;
314 	}
315 
316 	*outl = j;
317 	out[j] = '\0';
318 
319 	return 1;
320 
321 fail:
322 	return 0;
323 }
324 /* }}} */
325 
326 /* {{{ php_base64_encode */
327 
328 #if ZEND_INTRIN_AVX2_NATIVE
329 # undef ZEND_INTRIN_SSSE3_NATIVE
330 # undef ZEND_INTRIN_SSSE3_RESOLVER
331 # undef ZEND_INTRIN_SSSE3_FUNC_PROTO
332 # undef ZEND_INTRIN_SSSE3_FUNC_PTR
333 #elif ZEND_INTRIN_AVX2_FUNC_PROTO && ZEND_INTRIN_SSSE3_NATIVE
334 # undef ZEND_INTRIN_SSSE3_NATIVE
335 # undef ZEND_INTRIN_SSSE3_RESOLVER
336 # define ZEND_INTRIN_SSSE3_RESOLVER 1
337 # define ZEND_INTRIN_SSSE3_FUNC_PROTO 1
338 # undef ZEND_INTRIN_SSSE3_FUNC_DECL
339 # ifdef HAVE_FUNC_ATTRIBUTE_TARGET
340 #  define ZEND_INTRIN_SSSE3_FUNC_DECL(func) ZEND_API func __attribute__((target("ssse3")))
341 # else
342 #  define ZEND_INTRIN_SSSE3_FUNC_DECL(func) ZEND_API func
343 # endif
344 #elif ZEND_INTRIN_AVX2_FUNC_PTR && ZEND_INTRIN_SSSE3_NATIVE
345 # undef ZEND_INTRIN_SSSE3_NATIVE
346 # undef ZEND_INTRIN_SSSE3_RESOLVER
347 # define ZEND_INTRIN_SSSE3_RESOLVER 1
348 # define ZEND_INTRIN_SSSE3_FUNC_PTR 1
349 # undef ZEND_INTRIN_SSSE3_FUNC_DECL
350 # ifdef HAVE_FUNC_ATTRIBUTE_TARGET
351 #  define ZEND_INTRIN_SSSE3_FUNC_DECL(func) ZEND_API func __attribute__((target("ssse3")))
352 # else
353 #  define ZEND_INTRIN_SSSE3_FUNC_DECL(func) ZEND_API func
354 # endif
355 #endif
356 
357 /* Only enable avx512 resolver if avx2 use resolver also */
358 #if ZEND_INTRIN_AVX2_FUNC_PROTO && ZEND_INTRIN_AVX512_FUNC_PROTO
359 #define BASE64_INTRIN_AVX512_FUNC_PROTO 1
360 #endif
361 #if ZEND_INTRIN_AVX2_FUNC_PTR && ZEND_INTRIN_AVX512_FUNC_PTR
362 #define BASE64_INTRIN_AVX512_FUNC_PTR 1
363 #endif
364 #if ZEND_INTRIN_AVX2_FUNC_PROTO && ZEND_INTRIN_AVX512_VBMI_FUNC_PROTO
365 #define BASE64_INTRIN_AVX512_VBMI_FUNC_PROTO 1
366 #endif
367 #if ZEND_INTRIN_AVX2_FUNC_PTR && ZEND_INTRIN_AVX512_VBMI_FUNC_PTR
368 #define BASE64_INTRIN_AVX512_VBMI_FUNC_PTR 1
369 #endif
370 
371 #if ZEND_INTRIN_AVX2_NATIVE
372 # include <immintrin.h>
373 #elif ZEND_INTRIN_SSSE3_NATIVE
374 # include <tmmintrin.h>
375 #elif (ZEND_INTRIN_SSSE3_RESOLVER || ZEND_INTRIN_AVX2_RESOLVER)
376 # if ZEND_INTRIN_AVX2_RESOLVER
377 #  include <immintrin.h>
378 # else
379 #  include <tmmintrin.h>
380 # endif /* (ZEND_INTRIN_SSSE3_RESOLVER || ZEND_INTRIN_AVX2_RESOLVER) */
381 # include "Zend/zend_cpuinfo.h"
382 
383 # if BASE64_INTRIN_AVX512_FUNC_PROTO || BASE64_INTRIN_AVX512_FUNC_PTR
384 ZEND_INTRIN_AVX512_FUNC_DECL(zend_string *php_base64_encode_avx512(const unsigned char *str, size_t length));
385 ZEND_INTRIN_AVX512_FUNC_DECL(zend_string *php_base64_decode_ex_avx512(const unsigned char *str, size_t length, bool strict));
386 # endif
387 # if BASE64_INTRIN_AVX512_VBMI_FUNC_PROTO || BASE64_INTRIN_AVX512_VBMI_FUNC_PTR
388 ZEND_INTRIN_AVX512_VBMI_FUNC_DECL(zend_string *php_base64_encode_avx512_vbmi(const unsigned char *str, size_t length));
389 ZEND_INTRIN_AVX512_VBMI_FUNC_DECL(zend_string *php_base64_decode_ex_avx512_vbmi(const unsigned char *str, size_t length, bool strict));
390 # endif
391 
392 # if ZEND_INTRIN_AVX2_RESOLVER
393 ZEND_INTRIN_AVX2_FUNC_DECL(zend_string *php_base64_encode_avx2(const unsigned char *str, size_t length));
394 ZEND_INTRIN_AVX2_FUNC_DECL(zend_string *php_base64_decode_ex_avx2(const unsigned char *str, size_t length, bool strict));
395 # endif
396 
397 # if ZEND_INTRIN_SSSE3_RESOLVER
398 ZEND_INTRIN_SSSE3_FUNC_DECL(zend_string *php_base64_encode_ssse3(const unsigned char *str, size_t length));
399 ZEND_INTRIN_SSSE3_FUNC_DECL(zend_string *php_base64_decode_ex_ssse3(const unsigned char *str, size_t length, bool strict));
400 # endif
401 
402 zend_string *php_base64_encode_default(const unsigned char *str, size_t length);
403 zend_string *php_base64_decode_ex_default(const unsigned char *str, size_t length, bool strict);
404 
405 # if (ZEND_INTRIN_AVX2_FUNC_PROTO || ZEND_INTRIN_SSSE3_FUNC_PROTO || BASE64_INTRIN_AVX512_FUNC_PROTO || BASE64_INTRIN_AVX512_VBMI_FUNC_PROTO)
406 PHPAPI zend_string *php_base64_encode(const unsigned char *str, size_t length) __attribute__((ifunc("resolve_base64_encode")));
407 PHPAPI zend_string *php_base64_decode_ex(const unsigned char *str, size_t length, bool strict) __attribute__((ifunc("resolve_base64_decode")));
408 
409 typedef zend_string *(*base64_encode_func_t)(const unsigned char *, size_t);
410 typedef zend_string *(*base64_decode_func_t)(const unsigned char *, size_t, bool);
411 
412 ZEND_NO_SANITIZE_ADDRESS
413 ZEND_ATTRIBUTE_UNUSED /* clang mistakenly warns about this */
resolve_base64_encode(void)414 static base64_encode_func_t resolve_base64_encode(void) {
415 # if BASE64_INTRIN_AVX512_VBMI_FUNC_PROTO
416 	if (zend_cpu_supports_avx512_vbmi()) {
417 		return php_base64_encode_avx512_vbmi;
418 	} else
419 # endif
420 # if BASE64_INTRIN_AVX512_FUNC_PROTO
421 	if (zend_cpu_supports_avx512()) {
422 		return php_base64_encode_avx512;
423 	} else
424 # endif
425 # if ZEND_INTRIN_AVX2_FUNC_PROTO
426 	if (zend_cpu_supports_avx2()) {
427 		return php_base64_encode_avx2;
428 	} else
429 # endif
430 #if ZEND_INTRIN_SSSE3_FUNC_PROTO
431 	if (zend_cpu_supports_ssse3()) {
432 		return php_base64_encode_ssse3;
433 	}
434 #endif
435 	return php_base64_encode_default;
436 }
437 
438 ZEND_NO_SANITIZE_ADDRESS
439 ZEND_ATTRIBUTE_UNUSED /* clang mistakenly warns about this */
resolve_base64_decode(void)440 static base64_decode_func_t resolve_base64_decode(void) {
441 # if BASE64_INTRIN_AVX512_VBMI_FUNC_PROTO
442 	if (zend_cpu_supports_avx512_vbmi()) {
443 		return php_base64_decode_ex_avx512_vbmi;
444 	} else
445 # endif
446 # if BASE64_INTRIN_AVX512_FUNC_PROTO
447 	if (zend_cpu_supports_avx512()) {
448 		return php_base64_decode_ex_avx512;
449 	} else
450 # endif
451 # if ZEND_INTRIN_AVX2_FUNC_PROTO
452 	if (zend_cpu_supports_avx2()) {
453 		return php_base64_decode_ex_avx2;
454 	} else
455 # endif
456 #if ZEND_INTRIN_SSSE3_FUNC_PROTO
457 	if (zend_cpu_supports_ssse3()) {
458 		return php_base64_decode_ex_ssse3;
459 	}
460 #endif
461 	return php_base64_decode_ex_default;
462 }
463 # else /* (ZEND_INTRIN_AVX2_FUNC_PROTO || ZEND_INTRIN_SSSE3_FUNC_PROTO) */
464 
465 PHPAPI zend_string *(*php_base64_encode_ptr)(const unsigned char *str, size_t length) = NULL;
466 PHPAPI zend_string *(*php_base64_decode_ex_ptr)(const unsigned char *str, size_t length, bool strict) = NULL;
467 
php_base64_encode(const unsigned char * str,size_t length)468 PHPAPI zend_string *php_base64_encode(const unsigned char *str, size_t length) {
469 	return php_base64_encode_ptr(str, length);
470 }
php_base64_decode_ex(const unsigned char * str,size_t length,bool strict)471 PHPAPI zend_string *php_base64_decode_ex(const unsigned char *str, size_t length, bool strict) {
472 	return php_base64_decode_ex_ptr(str, length, strict);
473 }
474 
PHP_MINIT_FUNCTION(base64_intrin)475 PHP_MINIT_FUNCTION(base64_intrin)
476 {
477 # if BASE64_INTRIN_AVX512_VBMI_FUNC_PTR
478 	if (zend_cpu_supports_avx512_vbmi()) {
479 		php_base64_encode_ptr = php_base64_encode_avx512_vbmi;
480 		php_base64_decode_ex_ptr = php_base64_decode_ex_avx512_vbmi;
481 	} else
482 # endif
483 # if BASE64_INTRIN_AVX512_FUNC_PTR
484 	if (zend_cpu_supports_avx512()) {
485 		php_base64_encode_ptr = php_base64_encode_avx512;
486 		php_base64_decode_ex_ptr = php_base64_decode_ex_avx512;
487 	} else
488 # endif
489 # if ZEND_INTRIN_AVX2_FUNC_PTR
490 	if (zend_cpu_supports_avx2()) {
491 		php_base64_encode_ptr = php_base64_encode_avx2;
492 		php_base64_decode_ex_ptr = php_base64_decode_ex_avx2;
493 	} else
494 # endif
495 #if ZEND_INTRIN_SSSE3_FUNC_PTR
496 	if (zend_cpu_supports_ssse3()) {
497 		php_base64_encode_ptr = php_base64_encode_ssse3;
498 		php_base64_decode_ex_ptr = php_base64_decode_ex_ssse3;
499 	} else
500 #endif
501 	{
502 		php_base64_encode_ptr = php_base64_encode_default;
503 		php_base64_decode_ex_ptr = php_base64_decode_ex_default;
504 	}
505 	return SUCCESS;
506 }
507 # endif /* (ZEND_INTRIN_AVX2_FUNC_PROTO || ZEND_INTRIN_SSSE3_FUNC_PROTO) */
508 #endif /* ZEND_INTRIN_AVX2_NATIVE */
509 
510 #if BASE64_INTRIN_AVX512_VBMI_FUNC_PROTO || BASE64_INTRIN_AVX512_VBMI_FUNC_PTR
php_base64_encode_avx512_vbmi(const unsigned char * str,size_t length)511 zend_string *php_base64_encode_avx512_vbmi(const unsigned char *str, size_t length)
512 {
513 	const unsigned char *c = str;
514 	unsigned char *o;
515 	zend_string *result;
516 
517 	result = zend_string_safe_alloc(((length + 2) / 3), 4 * sizeof(char), 0, 0);
518 	o = (unsigned char *)ZSTR_VAL(result);
519 
520 	const __m512i shuffle_splitting = _mm512_setr_epi32(
521 		0x01020001, 0x04050304, 0x07080607, 0x0a0b090a, 0x0d0e0c0d, 0x10110f10,
522 		0x13141213, 0x16171516, 0x191a1819, 0x1c1d1b1c, 0x1f201e1f, 0x22232122,
523 		0x25262425, 0x28292728, 0x2b2c2a2b, 0x2e2f2d2e);
524 	const __m512i multi_shifts = _mm512_set1_epi64(0x3036242a1016040a);
525 	const char *ascii_lookup_tbl = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
526 	const __m512i ascii_lookup = _mm512_loadu_si512((__m512i *)ascii_lookup_tbl);
527 
528 	while (length > 63) {
529 		/* Step 1: load input data */
530 		__m512i str = _mm512_loadu_si512((const __m512i *)c);
531 
532 		/* Step 2: splitting 24-bit words into 32-bit lanes */
533 		str = _mm512_permutexvar_epi8(shuffle_splitting, str);
534 
535 		/* Step 3: moving 6-bit word to sperate bytes */
536 		str = _mm512_multishift_epi64_epi8(multi_shifts, str);
537 
538 		/* Step 4: conversion to ASCII */
539 		str = _mm512_permutexvar_epi8(str, ascii_lookup);
540 
541 		/* Step 5: store the final result */
542 		_mm512_storeu_si512((__m512i *)o, str);
543 		c += 48;
544 		o += 64;
545 		length -= 48;
546 	}
547 
548 	o = php_base64_encode_impl(c, length, o);
549 
550 	ZSTR_LEN(result) = (o - (unsigned char *)ZSTR_VAL(result));
551 
552 	return result;
553 }
554 
php_base64_decode_ex_avx512_vbmi(const unsigned char * str,size_t length,bool strict)555 zend_string *php_base64_decode_ex_avx512_vbmi(const unsigned char *str, size_t length, bool strict)
556 {
557 	const unsigned char *c = str;
558 	unsigned char *o;
559 	size_t outl = 0;
560 	zend_string *result;
561 
562 	result = zend_string_alloc(length, 0);
563 	o = (unsigned char *)ZSTR_VAL(result);
564 
565 	const __m512i lookup_0 = _mm512_setr_epi32(
566 		0x80808080, 0x80808080, 0x80808080, 0x80808080, 0x80808080, 0x80808080,
567 		0x80808080, 0x80808080, 0x80808080, 0x80808080, 0x3e808080, 0x3f808080,
568 		0x37363534, 0x3b3a3938, 0x80803d3c, 0x80808080);
569 	const __m512i lookup_1 = _mm512_setr_epi32(
570 		0x02010080, 0x06050403, 0x0a090807, 0x0e0d0c0b, 0x1211100f, 0x16151413,
571 		0x80191817, 0x80808080, 0x1c1b1a80, 0x201f1e1d, 0x24232221, 0x28272625,
572 		0x2c2b2a29, 0x302f2e2d, 0x80333231, 0x80808080);
573 
574 	const __m512i merge_mask1 = _mm512_set1_epi32(0x01400140);
575 	const __m512i merge_mask2 = _mm512_set1_epi32(0x00011000);
576 
577 	const __m512i continuous_mask = _mm512_setr_epi32(
578 		0x06000102, 0x090a0405, 0x0c0d0e08, 0x16101112, 0x191a1415, 0x1c1d1e18,
579 		0x26202122, 0x292a2425, 0x2c2d2e28, 0x36303132, 0x393a3435, 0x3c3d3e38,
580 		0x00000000, 0x00000000, 0x00000000, 0x00000000);
581 
582 	while (length > 64) {
583 		/* Step 1: load input data */
584 		const __m512i input = _mm512_loadu_si512((__m512i *)c);
585 
586 		/* Step 2: translation into 6-bit values(saved on bytes) from ASCII and error detection */
587 		__m512i str = _mm512_permutex2var_epi8(lookup_0, input, lookup_1);
588 		const uint64_t mask = _mm512_movepi8_mask(_mm512_or_epi64(str, input)); /* convert MSBs to the mask */
589 		if (mask) {
590 			break;
591 		}
592 
593 		/* Step 3: pack four fields within 32-bit words into 24-bit words. */
594 		const __m512i merge_ab_and_bc = _mm512_maddubs_epi16(str, merge_mask1);
595 		str = _mm512_madd_epi16(merge_ab_and_bc, merge_mask2);
596 
597 		/* Step 4: move 3-byte words into the continuous array. */
598 		str = _mm512_permutexvar_epi8(continuous_mask, str);
599 
600 		/* Step 5: store the final result */
601 		_mm512_storeu_si512((__m512i *)o, str);
602 
603 		c += 64;
604 		o += 48;
605 		outl += 48;
606 		length -= 64;
607 	}
608 
609 	if (!php_base64_decode_impl(c, length, (unsigned char*)ZSTR_VAL(result), &outl, strict)) {
610 		zend_string_efree(result);
611 		return NULL;
612 	}
613 
614 	ZSTR_LEN(result) = outl;
615 
616 	return result;
617 }
618 #endif
619 
620 #if BASE64_INTRIN_AVX512_FUNC_PROTO || BASE64_INTRIN_AVX512_FUNC_PTR
php_base64_encode_avx512(const unsigned char * str,size_t length)621 zend_string *php_base64_encode_avx512(const unsigned char *str, size_t length)
622 {
623 	const unsigned char *c = str;
624 	unsigned char *o;
625 	zend_string *result;
626 
627 	result = zend_string_safe_alloc(((length + 2) / 3), 4 * sizeof(char), 0, 0);
628 	o = (unsigned char *)ZSTR_VAL(result);
629 
630 	while (length > 63) {
631 		/* Step 1: load input data */
632 		/* [????|????|????|????|PPPO|OONN|NMMM|LLLK|KKJJ|JIII|HHHG|GGFF|FEEE|DDDC|CCBB|BAAA] */
633 		__m512i str = _mm512_loadu_si512((const __m512i *)c);
634 
635 		/* Step 2: splitting 24-bit words into 32-bit lanes */
636 		/* [0000|PPPO|OONN|NMMM|0000|LLLK|KKJJ|JIII|0000|HHHG|GGFF|FEEE|0000|DDDC|CCBB|BAAA] */
637 		str = _mm512_permutexvar_epi32(
638 			_mm512_set_epi32(-1, 11, 10, 9, -1, 8, 7, 6, -1, 5, 4, 3, -1, 2, 1, 0), str);
639 		/* [D1 D2 D0 D1|C1 C2 C0 C1|B1 B2 B0 B1|A1 A2 A0 A1] x 4 */
640 		str = _mm512_shuffle_epi8(str, _mm512_set4_epi32(0x0a0b090a, 0x07080607, 0x04050304, 0x01020001));
641 
642 		/* Step 3: moving 6-bit word to sperate bytes */
643 		/* in:  [bbbbcccc|ccdddddd|aaaaaabb|bbbbcccc] */
644 		/* t0:  [0000cccc|cc000000|aaaaaa00|00000000] */
645 		const __m512i t0 = _mm512_and_si512(str, _mm512_set1_epi32(0x0fc0fc00));
646 		/* t1:  [00000000|00cccccc|00000000|00aaaaaa] */
647 		const __m512i t1 = _mm512_srlv_epi16(t0, _mm512_set1_epi32(0x0006000a));
648 		/* t2:  [ccdddddd|00000000|aabbbbbb|cccc0000] */
649 		const __m512i t2 = _mm512_sllv_epi16(str, _mm512_set1_epi32(0x00080004));
650 		/* str: [00dddddd|00cccccc|00bbbbbb|00aaaaaa] */
651 		str = _mm512_ternarylogic_epi32(_mm512_set1_epi32(0x3f003f00), t2, t1, 0xca);
652 
653 		/* Step 4: conversion to ASCII */
654 		__m512i result = _mm512_subs_epu8(str, _mm512_set1_epi8(51));
655 		const __mmask64 less = _mm512_cmpgt_epi8_mask(_mm512_set1_epi8(26), str);
656 		result = _mm512_mask_mov_epi8(result, less, _mm512_set1_epi8(13));
657 		const __m512i lut = _mm512_set4_epi32(0x000041f0, 0xedfcfcfc, 0xfcfcfcfc, 0xfcfcfc47);
658 		result = _mm512_shuffle_epi8(lut, result);
659 		result = _mm512_add_epi8(result, str);
660 
661 		/* Step 5: store the final result */
662 		_mm512_storeu_si512((__m512i *)o, result);
663 		c += 48;
664 		o += 64;
665 		length -= 48;
666 	}
667 
668 	o = php_base64_encode_impl(c, length, o);
669 
670 	ZSTR_LEN(result) = (o - (unsigned char *)ZSTR_VAL(result));
671 
672 	return result;
673 }
674 
/* Assemble a 32-bit value from four byte values; b0 becomes the least
 * significant byte and b3 the most significant one. Every argument is
 * truncated to 8 bits first, so a negative constant such as -65 contributes
 * its unsigned byte representation. The expansion and each argument are
 * parenthesized so the macro can be used inside larger expressions. */
#define build_dword(b0, b1, b2, b3)					\
	(((uint32_t)(uint8_t)(b0) << 0) | ((uint32_t)(uint8_t)(b1) << 8) |	\
	 ((uint32_t)(uint8_t)(b2) << 16) | ((uint32_t)(uint8_t)(b3) << 24))

/* Build a 512-bit vector from 16 byte values (b0 = lowest byte); the four
 * dwords assembled from them are passed to _mm512_setr4_epi32(). */
#define _mm512_set4lanes_epi8(b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15)	\
	_mm512_setr4_epi32(build_dword(b0, b1, b2, b3), build_dword(b4, b5, b6, b7),			\
			   build_dword(b8, b9, b10, b11), build_dword(b12, b13, b14, b15))
682 
php_base64_decode_ex_avx512(const unsigned char * str,size_t length,bool strict)683 zend_string *php_base64_decode_ex_avx512(const unsigned char *str, size_t length, bool strict)
684 {
685 	const unsigned char *c = str;
686 	unsigned char *o;
687 	size_t outl = 0;
688 	zend_string *result;
689 
690 	result = zend_string_alloc(length, 0);
691 	o = (unsigned char *)ZSTR_VAL(result);
692 
693 	while (length > 64) {
694 		/* Step 1: load input data */
695 		__m512i str = _mm512_loadu_si512((__m512i *)c);
696 
697 		/* Step 2: translation into 6-bit values(saved on bytes) from ASCII and error detection */
698 		const __m512i higher_nibble = _mm512_and_si512(_mm512_srli_epi32(str, 4), _mm512_set1_epi8(0x0f));
699 		const __m512i lower_nibble = _mm512_and_si512(str, _mm512_set1_epi8(0x0f));
700 		const __m512i shiftLUT = _mm512_set4lanes_epi8(
701 				0, 0, 19, 4, -65, -65, -71, -71, 0, 0, 0, 0, 0, 0, 0, 0);
702 		const __m512i maskLUT = _mm512_set4lanes_epi8(
703 				/* 0        : 0b1010_1000*/ 0xa8,
704 				/* 1 .. 9   : 0b1111_1000*/ 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8,
705 				/* 10       : 0b1111_0000*/ 0xf0,
706 				/* 11       : 0b0101_0100*/ 0x54,
707 				/* 12 .. 14 : 0b0101_0000*/ 0x50, 0x50, 0x50,
708 				/* 15       : 0b0101_0100*/ 0x54);
709 		const __m512i bitposLUT = _mm512_set4lanes_epi8(
710 				0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
711 				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
712 		const __m512i M = _mm512_shuffle_epi8(maskLUT, lower_nibble);
713 		const __m512i bit = _mm512_shuffle_epi8(bitposLUT, higher_nibble);
714 		const uint64_t match = _mm512_test_epi8_mask(M, bit);
715 		if (match != (uint64_t)-1) {
716 			break;
717 		}
718 		const __m512i sh = _mm512_shuffle_epi8(shiftLUT, higher_nibble);
719 		const __mmask64 eq_2f = _mm512_cmpeq_epi8_mask(str, _mm512_set1_epi8(0x2f));
720 		const __m512i shift = _mm512_mask_mov_epi8(sh, eq_2f, _mm512_set1_epi8(16));
721 		str = _mm512_add_epi8(str, shift);
722 
723 		/* Step 3: pack four fields within 32-bit words into 24-bit words. */
724 		const __m512i merge_ab_and_bc = _mm512_maddubs_epi16(str, _mm512_set1_epi32(0x01400140));
725 		str = _mm512_madd_epi16(merge_ab_and_bc, _mm512_set1_epi32(0x00011000));
726 
727 		/* Step 4: move 3-byte words into the continuous array. */
728 		const __m512i t1 = _mm512_shuffle_epi8(str,
729 			_mm512_set4lanes_epi8(2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1));
730 		const __m512i s6 = _mm512_setr_epi32(0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 0, 0, 0, 0);
731 		const __m512i t2 = _mm512_permutexvar_epi32(s6, t1);
732 
733 		/* Step 5: store the final result */
734 		_mm512_storeu_si512((__m512i *)o, t2);
735 
736 		c += 64;
737 		o += 48;
738 		outl += 48;
739 		length -= 64;
740 	}
741 
742 	if (!php_base64_decode_impl(c, length, (unsigned char*)ZSTR_VAL(result), &outl, strict)) {
743 		zend_string_efree(result);
744 		return NULL;
745 	}
746 
747 	ZSTR_LEN(result) = outl;
748 
749 	return result;
750 }
751 #endif
752 
753 #if ZEND_INTRIN_AVX2_NATIVE || ZEND_INTRIN_AVX2_RESOLVER
754 # if ZEND_INTRIN_AVX2_RESOLVER && defined(HAVE_FUNC_ATTRIBUTE_TARGET)
755 static __m256i php_base64_encode_avx2_reshuffle(__m256i in) __attribute__((target("avx2")));
756 static __m256i php_base64_encode_avx2_translate(__m256i in) __attribute__((target("avx2")));
757 # endif
/* Spread 24 input bytes over 32 byte lanes, one 6-bit field per byte.
 *
 * This one works with shifted (4 bytes) input in order to
 * be able to work efficiently in the 2 128-bit lanes.
 *
 * input, bytes MSB to LSB:
 * 0 0 0 0 x w v u t s r q p o n m
 * l k j i h g f e d c b a 0 0 0 0
 *
 * result, bytes MSB to LSB:
 * 00xxxxxx 00wwwwXX 00vvWWWW 00VVVVVV
 * 00uuuuuu 00ttttUU 00ssTTTT 00SSSSSS
 * 00rrrrrr 00qqqqRR 00ppQQQQ 00PPPPPP
 * 00oooooo 00nnnnOO 00mmNNNN 00MMMMMM
 * 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
 * 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
 * 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
 * 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA */
static __m256i php_base64_encode_avx2_reshuffle(__m256i in)
{
	/* per-lane byte selection that places every 3-byte group in a dword */
	const __m256i byte_order = _mm256_set_epi8(
		10, 11,  9, 10,
		 7,  8,  6,  7,
		 4,  5,  3,  4,
		 1,  2,  0,  1,

		14, 15, 13, 14,
		11, 12, 10, 11,
		 8,  9,  7,  8,
		 5,  6,  4,  5);
	const __m256i spread = _mm256_shuffle_epi8(in, byte_order);

	/* fields that have to move down: taking the high half of a 16-bit
	 * product with 0x0040 / 0x0400 is a right shift by 10 / 6 bits */
	const __m256i down_src = _mm256_and_si256(spread, _mm256_set1_epi32(0x0fc0fc00));
	const __m256i down = _mm256_mulhi_epu16(down_src, _mm256_set1_epi32(0x04000040));

	/* fields that have to move up: taking the low half of a 16-bit
	 * product with 0x0010 / 0x0100 is a left shift by 4 / 8 bits */
	const __m256i up_src = _mm256_and_si256(spread, _mm256_set1_epi32(0x003f03f0));
	const __m256i up = _mm256_mullo_epi16(up_src, _mm256_set1_epi32(0x01000010));

	return _mm256_or_si256(down, up);
}
796 
/* Turns every byte of `in` from a 6-bit value (0..63) into its base64 ASCII
 * character by adding an offset that depends on the range the value is in. */
static __m256i php_base64_encode_avx2_translate(__m256i in)
{
	/* Offset table, repeated for both 128-bit lanes:
	 * slot 0: +65 (0..25), slot 1: +71 (26..51), slots 2..11: -4 (52..61),
	 * slot 12: -19 (62), slot 13: -16 (63). */
	const __m256i offsets = _mm256_setr_epi8(
			65, 71, -4, -4, -4, -4, -4, -4,
			-4, -4, -4, -4, -19, -16, 0, 0,
			65, 71, -4, -4, -4, -4, -4, -4,
			-4, -4, -4, -4, -19, -16, 0, 0);

	/* Unsigned saturating subtract: 0 for values up to 51, value - 51 above. */
	const __m256i reduced = _mm256_subs_epu8(in, _mm256_set1_epi8(51));

	/* 0xFF (-1) for every value above 25, 0x00 otherwise. */
	const __m256i above_25 = _mm256_cmpgt_epi8(in, _mm256_set1_epi8(25));

	/* Subtracting -1 adds one, so only the 0..25 range stays on slot 0. */
	const __m256i slot = _mm256_sub_epi8(reduced, above_25);

	return _mm256_add_epi8(in, _mm256_shuffle_epi8(offsets, slot));
}
#endif /* ZEND_INTRIN_AVX2_NATIVE || ZEND_INTRIN_AVX2_RESOLVER */
817 
818 #if ZEND_INTRIN_SSSE3_NATIVE || ZEND_INTRIN_SSSE3_RESOLVER
819 
820 # if ZEND_INTRIN_SSSE3_RESOLVER && defined(HAVE_FUNC_ATTRIBUTE_TARGET)
821 static __m128i php_base64_encode_ssse3_reshuffle(__m128i in) __attribute__((target("ssse3")));
822 static __m128i php_base64_encode_ssse3_translate(__m128i in) __attribute__((target("ssse3")));
823 # endif
824 
/* Spreads 12 packed input bytes over four 32-bit words so that every byte of
 * the result carries one 6-bit base64 index (see the layout diagrams below). */
static __m128i php_base64_encode_ssse3_reshuffle(__m128i in)
{
	/* input, bytes MSB to LSB:
	 * 0 0 0 0 l k j i h g f e d c b a */
	const __m128i byte_picks = _mm_set_epi8(
				10, 11,  9, 10,
				7,  8,  6,  7,
				4,  5,  3,  4,
				1,  2,  0,  1);
	const __m128i spread = _mm_shuffle_epi8(in, byte_picks);

	/* Two of the four fields have to move down: multiplying by 2^6 / 2^10
	 * and keeping the high 16 bits shifts them right by 10 / 6 bits. */
	const __m128i moved_down = _mm_mulhi_epu16(
				_mm_and_si128(spread, _mm_set1_epi32(0x0fc0fc00)),
				_mm_set1_epi32(0x04000040));

	/* The other two fields have to move up: multiplying by 2^4 / 2^8 and
	 * keeping the low 16 bits shifts them left by 4 / 8 bits. */
	const __m128i moved_up = _mm_mullo_epi16(
				_mm_and_si128(spread, _mm_set1_epi32(0x003f03f0)),
				_mm_set1_epi32(0x01000010));

	/* output (upper case are MSB, lower case are LSB):
	 * 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
	 * 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
	 * 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
	 * 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA */
	return _mm_or_si128(moved_down, moved_up);
}
852 
/* Turns every byte of `in` from a 6-bit value (0..63) into its base64 ASCII
 * character by adding an offset that depends on the range the value is in.
 *
 * There are five ranges:
 * #  From      To         Offset  Slot     Characters
 * 0  [0..25]   [65..90]   +65        0     ABCDEFGHIJKLMNOPQRSTUVWXYZ
 * 1  [26..51]  [97..122]  +71        1     abcdefghijklmnopqrstuvwxyz
 * 2  [52..61]  [48..57]    -4  [2..11]     0123456789
 * 3  [62]      [43]       -19       12     +
 * 4  [63]      [47]       -16       13     (slash) */
static __m128i php_base64_encode_ssse3_translate(__m128i in)
{
	/* One offset per slot, in slot order. */
	const __m128i offsets = _mm_setr_epi8(
			65,  71, -4, -4,
			-4,  -4, -4, -4,
			-4,  -4, -4, -4,
			-19, -16,  0,  0
			);

	/* Unsigned saturating subtract: range #0 already lands on its slot,
	 * every other range ends up one slot too low. */
	const __m128i reduced = _mm_subs_epu8(in, _mm_set1_epi8(51));

	/* 0xFF (-1) for ranges #1..#4, 0x00 for range #0. */
	const __m128i above_25 = _mm_cmpgt_epi8(in, _mm_set1_epi8(25));

	/* Subtracting -1 adds the missing one for ranges #1..#4. */
	const __m128i slot = _mm_sub_epi8(reduced, above_25);

	/* Apply the selected offset to each input value. */
	return _mm_add_epi8(in, _mm_shuffle_epi8(offsets, slot));
}
884 
/* Encodes 12 input bytes into 16 base64 characters per iteration.
 * Each iteration loads and stores a full 16 bytes, and the loop only runs
 * while at least 16 input bytes remain.
 * Uses and advances `c` (input cursor), `o` (output cursor) and `length`
 * (remaining input bytes) from the scope the macro is expanded in. */
#define PHP_BASE64_ENCODE_SSSE3_LOOP									\
	while (length >= 16) {												\
		__m128i chunk = _mm_loadu_si128((const __m128i *)c);			\
																		\
		chunk = php_base64_encode_ssse3_translate(						\
					php_base64_encode_ssse3_reshuffle(chunk));			\
																		\
		_mm_storeu_si128((__m128i *)o, chunk);							\
		o += 16;														\
		c += 12;														\
		length -= 12;													\
	}
898 
#endif /* ZEND_INTRIN_SSSE3_NATIVE || ZEND_INTRIN_SSSE3_RESOLVER */
900 
901 #if ZEND_INTRIN_AVX2_NATIVE || ZEND_INTRIN_AVX2_RESOLVER || ZEND_INTRIN_SSSE3_NATIVE || ZEND_INTRIN_SSSE3_RESOLVER
902 # if ZEND_INTRIN_AVX2_NATIVE || ZEND_INTRIN_SSSE3_NATIVE
php_base64_encode(const unsigned char * str,size_t length)903 PHPAPI zend_string *php_base64_encode(const unsigned char *str, size_t length)
904 # elif ZEND_INTRIN_AVX2_RESOLVER
905 zend_string *php_base64_encode_avx2(const unsigned char *str, size_t length)
906 # else /* ZEND_INTRIN_SSSE3_RESOLVER */
907 zend_string *php_base64_encode_ssse3(const unsigned char *str, size_t length)
908 # endif
909 {
910 	const unsigned char *c = str;
911 	unsigned char *o;
912 	zend_string *result;
913 
914 	result = zend_string_safe_alloc(((length + 2) / 3), 4 * sizeof(char), 0, 0);
915 	o = (unsigned char *)ZSTR_VAL(result);
916 # if ZEND_INTRIN_AVX2_NATIVE || ZEND_INTRIN_AVX2_RESOLVER
917 	if (length > 31) {
918 		__m256i s = _mm256_loadu_si256((__m256i *)c);
919 
920 		s = _mm256_permutevar8x32_epi32(s, _mm256_setr_epi32(0, 0, 1, 2, 3, 4, 5, 6));
921 
922 		for (;;) {
923 			s = php_base64_encode_avx2_reshuffle(s);
924 
925 			s = php_base64_encode_avx2_translate(s);
926 
927 			_mm256_storeu_si256((__m256i *)o, s);
928 			c += 24;
929 			o += 32;
930 			length -= 24;
931 			if (length < 28) {
932 				break;
933 			}
934 			s = _mm256_loadu_si256((__m256i *)(c - 4));
935 		}
936 	}
937 # else
938 	PHP_BASE64_ENCODE_SSSE3_LOOP;
939 # endif
940 
941 	o = php_base64_encode_impl(c, length, o);
942 
943 	ZSTR_LEN(result) = (o - (unsigned char *)ZSTR_VAL(result));
944 
945 	return result;
946 }
947 
948 # if ZEND_INTRIN_SSSE3_RESOLVER && ZEND_INTRIN_AVX2_RESOLVER
php_base64_encode_ssse3(const unsigned char * str,size_t length)949 zend_string *php_base64_encode_ssse3(const unsigned char *str, size_t length)
950 {
951 	const unsigned char *c = str;
952 	unsigned char *o;
953 	zend_string *result;
954 
955 	result = zend_string_safe_alloc(((length + 2) / 3), 4 * sizeof(char), 0, 0);
956 	o = (unsigned char *)ZSTR_VAL(result);
957 
958 	PHP_BASE64_ENCODE_SSSE3_LOOP;
959 
960 	o = php_base64_encode_impl(c, length, o);
961 
962 	ZSTR_LEN(result) = (o - (unsigned char *)ZSTR_VAL(result));
963 
964 	return result;
965 }
966 # endif
967 #endif /* ZEND_INTRIN_AVX2_NATIVE || ZEND_INTRIN_AVX2_RESOLVER || ZEND_INTRIN_SSSE3_NATIVE || ZEND_INTRIN_SSSE3_RESOLVER */
968 
969 /* }}} */
970 
971 #if ZEND_INTRIN_AVX2_NATIVE || ZEND_INTRIN_AVX2_RESOLVER
972 # if ZEND_INTRIN_AVX2_RESOLVER && defined(HAVE_FUNC_ATTRIBUTE_TARGET)
973 static __m256i php_base64_decode_avx2_reshuffle(__m256i in) __attribute__((target("avx2")));
974 # endif
975 
/* Packs the decoded values held in `in` (one per byte) into contiguous output
 * bytes: every group of four input bytes becomes three output bytes, and the
 * results of both 128-bit lanes are moved next to each other at the low end
 * of the returned vector. */
static __m256i php_base64_decode_avx2_reshuffle(__m256i in)
{
	/* Combine neighbouring bytes: first * 0x40 + second, per 16-bit word. */
	const __m256i pairs = _mm256_maddubs_epi16(in, _mm256_set1_epi32(0x01400140));

	/* Combine neighbouring words: first * 0x1000 + second, per 32-bit word. */
	const __m256i quads = _mm256_madd_epi16(pairs, _mm256_set1_epi32(0x00011000));

	/* Reverse the three payload bytes of each dword and close the gaps inside
	 * each lane; the -1 entries zero the last four bytes of the lane. */
	const __m256i lane_packed = _mm256_shuffle_epi8(quads, _mm256_setr_epi8(
				2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1,
				2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1));

	/* Pull the three payload dwords of the upper lane down next to those of
	 * the lower lane. */
	return _mm256_permutevar8x32_epi32(lane_packed, _mm256_setr_epi32(0, 1, 2, 4, 5, 6, -1, -1));
}
990 #endif
991 
992 #if ZEND_INTRIN_SSSE3_NATIVE || ZEND_INTRIN_SSSE3_RESOLVER
993 # if ZEND_INTRIN_SSSE3_RESOLVER && defined(HAVE_FUNC_ATTRIBUTE_TARGET)
994 static __m128i php_base64_decode_ssse3_reshuffle(__m128i in) __attribute__((target("ssse3")));
995 # endif
996 
/* Packs the decoded values held in `in` (one per byte) into 12 contiguous
 * output bytes at the low end of the returned vector; every group of four
 * input bytes becomes three output bytes. */
static __m128i php_base64_decode_ssse3_reshuffle(__m128i in)
{
	/* Combine neighbouring bytes: first * 0x40 + second, per 16-bit word.
	 * 0000kkkk LLllllll 0000JJJJ JJjjKKKK
	 * 0000hhhh IIiiiiii 0000GGGG GGggHHHH
	 * 0000eeee FFffffff 0000DDDD DDddEEEE
	 * 0000bbbb CCcccccc 0000AAAA AAaaBBBB */
	const __m128i pairs = _mm_maddubs_epi16(in, _mm_set1_epi32(0x01400140));

	/* Combine neighbouring words: first * 0x1000 + second, per 32-bit word.
	 * 00000000 JJJJJJjj KKKKkkkk LLllllll
	 * 00000000 GGGGGGgg HHHHhhhh IIiiiiii
	 * 00000000 DDDDDDdd EEEEeeee FFffffff
	 * 00000000 AAAAAAaa BBBBbbbb CCcccccc */
	const __m128i quads = _mm_madd_epi16(pairs, _mm_set1_epi32(0x00011000));

	/* Reverse the three payload bytes of each dword and close the gaps; the
	 * -1 entries zero the last four bytes.
	 * 00000000 00000000 00000000 00000000
	 * LLllllll KKKKkkkk JJJJJJjj IIiiiiii
	 * HHHHhhhh GGGGGGgg FFffffff EEEEeeee
	 * DDDDDDdd CCcccccc BBBBbbbb AAAAAAaa */
	return _mm_shuffle_epi8(quads, _mm_setr_epi8(
		 2,  1,  0,
		 6,  5,  4,
		10,  9,  8,
		14, 13, 12,
		-1, -1, -1, -1));
}
1024 
/* Decodes 16 input characters into 12 output bytes per iteration, for as long
 * as more than 15 + 6 + 2 input bytes remain.
 *
 * Each input byte is looked up twice, once by its low and once by its high
 * nibble. The loop stops, without consuming the chunk, as soon as the two
 * lookups share a set bit for any byte; presumably that marks a byte outside
 * the plain base64 alphabet. Everything not consumed here is left to the code
 * that follows the macro.
 *
 * Uses and advances `c` (input cursor), `o` (output cursor), `outl` (bytes
 * produced so far) and `length` (remaining input bytes) from the scope the
 * macro is expanded in. */
#define PHP_BASE64_DECODE_SSSE3_LOOP											\
	while (length > 15 + 6 + 2) {												\
		const __m128i lut_lo = _mm_setr_epi8(									\
				0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,					\
				0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A);				\
		const __m128i lut_hi = _mm_setr_epi8(									\
				0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,					\
				0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10);				\
		const __m128i lut_roll = _mm_setr_epi8(									\
				0,  16,  19,   4, -65, -65, -71, -71,							\
				0,   0,   0,   0,   0,   0,   0,   0);							\
																				\
		const __m128i chunk = _mm_loadu_si128((const __m128i *)c);				\
		const __m128i hi_nibbles = _mm_and_si128(								\
				_mm_srli_epi32(chunk, 4), _mm_set1_epi8(0x2f));					\
		const __m128i lo_nibbles = _mm_and_si128(chunk, _mm_set1_epi8(0x2f));	\
		const __m128i hi = _mm_shuffle_epi8(lut_hi, hi_nibbles);				\
		const __m128i lo = _mm_shuffle_epi8(lut_lo, lo_nibbles);				\
		__m128i roll;															\
																				\
		if (UNEXPECTED(															\
			_mm_movemask_epi8(													\
				_mm_cmpgt_epi8(													\
					_mm_and_si128(lo, hi), _mm_set1_epi8(0))))) {				\
			break;																\
		}																		\
																				\
		/* bytes equal to 0x2f select the neighbouring roll entry */			\
		roll = _mm_shuffle_epi8(lut_roll, _mm_add_epi8(							\
				_mm_cmpeq_epi8(chunk, _mm_set1_epi8(0x2f)), hi_nibbles));		\
																				\
		_mm_storeu_si128((__m128i *)o,											\
				php_base64_decode_ssse3_reshuffle(_mm_add_epi8(chunk, roll)));	\
																				\
		c += 16;																\
		o += 12;																\
		outl += 12;																\
		length -= 16;															\
	}
1071 
1072 #endif
1073 
1074 #if ZEND_INTRIN_AVX2_NATIVE || ZEND_INTRIN_AVX2_RESOLVER || ZEND_INTRIN_SSSE3_NATIVE || ZEND_INTRIN_SSSE3_RESOLVER
1075 # if ZEND_INTRIN_AVX2_NATIVE || ZEND_INTRIN_SSSE3_NATIVE
php_base64_decode_ex(const unsigned char * str,size_t length,bool strict)1076 PHPAPI zend_string *php_base64_decode_ex(const unsigned char *str, size_t length, bool strict)
1077 # elif ZEND_INTRIN_AVX2_RESOLVER
1078 zend_string *php_base64_decode_ex_avx2(const unsigned char *str, size_t length, bool strict)
1079 # else
1080 zend_string *php_base64_decode_ex_ssse3(const unsigned char *str, size_t length, bool strict)
1081 # endif
1082 {
1083 	const unsigned char *c = str;
1084 	unsigned char *o;
1085 	size_t outl = 0;
1086 	zend_string *result;
1087 
1088 	result = zend_string_alloc(length, 0);
1089 	o = (unsigned char *)ZSTR_VAL(result);
1090 
1091 	/* See: "Faster Base64 Encoding and Decoding using AVX2 Instructions"
1092 	* https://arxiv.org/pdf/1704.00605.pdf */
1093 # if ZEND_INTRIN_AVX2_NATIVE || ZEND_INTRIN_AVX2_RESOLVER
1094 	while (length > 31 + 11 + 2) {
1095 		__m256i lut_lo, lut_hi, lut_roll;
1096 		__m256i hi_nibbles, lo_nibbles, hi, lo;
1097 		__m256i str = _mm256_loadu_si256((__m256i *)c);
1098 
1099 		lut_lo = _mm256_setr_epi8(
1100 				0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
1101 				0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A,
1102 				0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
1103 				0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A);
1104 
1105 		lut_hi = _mm256_setr_epi8(
1106 				0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
1107 				0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
1108 				0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
1109 				0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10);
1110 
1111 		lut_roll = _mm256_setr_epi8(
1112 				0,  16,  19,   4, -65, -65, -71, -71,
1113 				0,   0,   0,   0,   0,   0,   0,   0,
1114 				0,  16,  19,   4, -65, -65, -71, -71,
1115 				0,   0,   0,   0,   0,   0,   0,   0);
1116 
1117 		hi_nibbles  = _mm256_and_si256(_mm256_srli_epi32(str, 4), _mm256_set1_epi8(0x2f));
1118 		lo_nibbles  = _mm256_and_si256(str, _mm256_set1_epi8(0x2f));
1119 		hi          = _mm256_shuffle_epi8(lut_hi, hi_nibbles);
1120 		lo          = _mm256_shuffle_epi8(lut_lo, lo_nibbles);
1121 
1122 		if (!_mm256_testz_si256(lo, hi)) {
1123 			break;
1124 		} else {
1125 			__m256i eq_2f, roll;
1126 			eq_2f = _mm256_cmpeq_epi8(str, _mm256_set1_epi8(0x2f));
1127 			roll  = _mm256_shuffle_epi8(lut_roll, _mm256_add_epi8(eq_2f, hi_nibbles));
1128 
1129 
1130 			str = _mm256_add_epi8(str, roll);
1131 
1132 			str = php_base64_decode_avx2_reshuffle(str);
1133 
1134 			_mm256_storeu_si256((__m256i *)o, str);
1135 
1136 			c += 32;
1137 			o += 24;
1138 			outl += 24;
1139 			length -= 32;
1140 		}
1141 	}
1142 # else
1143 	PHP_BASE64_DECODE_SSSE3_LOOP;
1144 # endif
1145 
1146 	if (!php_base64_decode_impl(c, length, (unsigned char*)ZSTR_VAL(result), &outl, strict)) {
1147 		zend_string_efree(result);
1148 		return NULL;
1149 	}
1150 
1151 	ZSTR_LEN(result) = outl;
1152 
1153 	return result;
1154 }
1155 
1156 # if ZEND_INTRIN_SSSE3_RESOLVER && ZEND_INTRIN_AVX2_RESOLVER
php_base64_decode_ex_ssse3(const unsigned char * str,size_t length,bool strict)1157 zend_string *php_base64_decode_ex_ssse3(const unsigned char *str, size_t length, bool strict)
1158 {
1159 	const unsigned char *c = str;
1160 	unsigned char *o;
1161 	size_t outl = 0;
1162 	zend_string *result;
1163 
1164 	result = zend_string_alloc(length, 0);
1165 	o = (unsigned char *)ZSTR_VAL(result);
1166 
1167 	PHP_BASE64_DECODE_SSSE3_LOOP;
1168 
1169 	if (!php_base64_decode_impl(c, length, (unsigned char*)ZSTR_VAL(result), &outl, strict)) {
1170 		zend_string_efree(result);
1171 		return NULL;
1172 	}
1173 
1174 	ZSTR_LEN(result) = outl;
1175 
1176 	return result;
1177 }
1178 # endif
1179 #endif /* ZEND_INTRIN_AVX2_NATIVE || ZEND_INTRIN_AVX2_RESOLVER || ZEND_INTRIN_SSSE3_NATIVE || ZEND_INTRIN_SSSE3_RESOLVER */
1180 
1181 #if !ZEND_INTRIN_AVX2_NATIVE && !ZEND_INTRIN_SSSE3_NATIVE
1182 #if ZEND_INTRIN_AVX2_RESOLVER || ZEND_INTRIN_SSSE3_RESOLVER
php_base64_encode_default(const unsigned char * str,size_t length)1183 zend_string *php_base64_encode_default(const unsigned char *str, size_t length)
1184 #else
1185 PHPAPI zend_string *php_base64_encode(const unsigned char *str, size_t length)
1186 #endif
1187 {
1188 	unsigned char *p;
1189 	zend_string *result;
1190 
1191 	result = zend_string_safe_alloc(((length + 2) / 3), 4 * sizeof(char), 0, 0);
1192 	p = (unsigned char *)ZSTR_VAL(result);
1193 
1194 	p = php_base64_encode_impl(str, length, p);
1195 
1196 	ZSTR_LEN(result) = (p - (unsigned char *)ZSTR_VAL(result));
1197 
1198 	return result;
1199 }
1200 #endif
1201 
1202 #if !ZEND_INTRIN_AVX2_NATIVE && !ZEND_INTRIN_SSSE3_NATIVE
1203 #if ZEND_INTRIN_AVX2_RESOLVER || ZEND_INTRIN_SSSE3_RESOLVER
php_base64_decode_ex_default(const unsigned char * str,size_t length,bool strict)1204 zend_string *php_base64_decode_ex_default(const unsigned char *str, size_t length, bool strict)
1205 #else
1206 PHPAPI zend_string *php_base64_decode_ex(const unsigned char *str, size_t length, bool strict)
1207 #endif
1208 {
1209 	zend_string *result;
1210 	size_t outl = 0;
1211 
1212 	result = zend_string_alloc(length, 0);
1213 
1214 	if (!php_base64_decode_impl(str, length, (unsigned char*)ZSTR_VAL(result), &outl, strict)) {
1215 		zend_string_efree(result);
1216 		return NULL;
1217 	}
1218 
1219 	ZSTR_LEN(result) = outl;
1220 
1221 	return result;
1222 }
1223 #endif
1224 /* }}} */
1225 
1226 /* {{{ Encodes string using MIME base64 algorithm */
PHP_FUNCTION(base64_encode)1227 PHP_FUNCTION(base64_encode)
1228 {
1229 	char *str;
1230 	size_t str_len;
1231 	zend_string *result;
1232 
1233 	ZEND_PARSE_PARAMETERS_START(1, 1)
1234 		Z_PARAM_STRING(str, str_len)
1235 	ZEND_PARSE_PARAMETERS_END();
1236 
1237 	result = php_base64_encode((unsigned char*)str, str_len);
1238 	RETURN_STR(result);
1239 }
1240 /* }}} */
1241 
1242 /* {{{ Decodes string using MIME base64 algorithm */
PHP_FUNCTION(base64_decode)1243 PHP_FUNCTION(base64_decode)
1244 {
1245 	char *str;
1246 	bool strict = 0;
1247 	size_t str_len;
1248 	zend_string *result;
1249 
1250 	ZEND_PARSE_PARAMETERS_START(1, 2)
1251 		Z_PARAM_STRING(str, str_len)
1252 		Z_PARAM_OPTIONAL
1253 		Z_PARAM_BOOL(strict)
1254 	ZEND_PARSE_PARAMETERS_END();
1255 
1256 	result = php_base64_decode_ex((unsigned char*)str, str_len, strict);
1257 	if (result != NULL) {
1258 		RETURN_STR(result);
1259 	} else {
1260 		RETURN_FALSE;
1261 	}
1262 }
1263 /* }}} */
1264