1 /*
2 +----------------------------------------------------------------------+
3 | Copyright (c) The PHP Group |
4 +----------------------------------------------------------------------+
5 | This source file is subject to version 3.01 of the PHP license, |
6 | that is bundled with this package in the file LICENSE, and is |
7 | available through the world-wide-web at the following url: |
8 | https://www.php.net/license/3_01.txt |
9 | If you did not receive a copy of the PHP license and are unable to |
10 | obtain it through the world-wide-web, please send a note to |
11 | license@php.net so we can mail you a copy immediately. |
12 +----------------------------------------------------------------------+
13 | Author: Jim Winstead <jimw@php.net> |
14 | Xinchen Hui <laruence@php.net> |
15 +----------------------------------------------------------------------+
16 */
17
18 #include <string.h>
19
20 #include "php.h"
21 #include "base64.h"
22
23 /* {{{ base64 tables */
/* Encoding alphabet: indexed by a 6-bit value (0..63), yields the Base64
 * character for that value. The 64 symbols are followed by one extra NUL
 * entry. */
static const char base64_table[] = {
	'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
	'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
	'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
	'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
	'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/', '\0'
};
31
/* Padding character: written by the scalar encoder to complete the final
 * 4-character group (unless PHP_BASE64_NO_PADDING is set) and counted, not
 * decoded, by the scalar decoder. */
static const char base64_pad = '=';
33
/* Decoding table, indexed by the input byte value (0..255):
 *   0..63  the 6-bit value of a character of the Base64 alphabet
 *   -1     whitespace (TAB, LF, CR, space); always skipped by the scalar decoder
 *   -2     any other byte; skipped in non-strict mode, an error in strict mode
 * '=' also maps to -2 here, but the scalar decoder handles it before consulting
 * this table. */
static const short base64_reverse_table[256] = {
	-2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -2, -2, -1, -2, -2,
	-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
	-1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, 62, -2, -2, -2, 63,
	52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -2, -2, -2, -2, -2, -2,
	-2,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
	15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -2, -2, -2, -2, -2,
	-2, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
	41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -2, -2, -2, -2, -2,
	-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
	-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
	-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
	-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
	-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
	-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
	-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
	-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2
};
52 /* }}} */
53
54 #if defined(__aarch64__) || defined(_M_ARM64)
55 #include <arm_neon.h>
56
/* Translate sixteen 6-bit values into ASCII: each lane is reduced to a small
 * table index, the per-lane offset is fetched from shift_LUT, and the offset
 * is added to the original value. */
static zend_always_inline uint8x16_t encode_toascii(const uint8x16_t input, const uint8x16x2_t shift_LUT)
{
	/* saturating subtract collapses the value ranges:
	    0..51 -> 0
	   52..61 -> 1 .. 10
	       62 -> 11
	       63 -> 12 */
	const uint8x16_t reduced = vqsubq_u8(input, vdupq_n_u8(51));
	/* split the 0..51 range in two: lanes holding 0..25 compare as
	   "26 > input" and get index 13, lanes holding 26..51 keep index 0 */
	const uint8x16_t below_26 = vcgtq_u8(vdupq_n_u8(26), input);
	const uint8x16_t lut_index = vorrq_u8(reduced, vandq_u8(below_26, vdupq_n_u8(13)));
	/* fetch the offset for every lane and apply it */
	const uint8x16_t offset = vqtbl2q_u8(shift_LUT, lut_index);
	return vaddq_u8(offset, input);
}
73
/* Encode complete 48-byte input groups with NEON, producing 64 Base64
 * characters per group.
 *
 * in   - input bytes
 * inl  - number of input bytes; must be at least 48 on entry, because the
 *        loop body runs once before the remaining length is tested
 * out  - destination; 64 bytes are stored per processed group
 * left - receives the number of trailing input bytes (fewer than 48) that
 *        were not encoded
 *
 * Returns the position in out just past the last character written. */
static zend_always_inline unsigned char *neon_base64_encode(const unsigned char *in, size_t inl, unsigned char *out, size_t *left)
{
	/* Offsets that turn a 6-bit value into its ASCII character, indexed by the
	   value computed in encode_toascii(). The 16 entries are stored twice so
	   that both registers of the two-register table are filled. */
	const uint8_t shift_LUT_[32] = {'a' - 26, '0' - 52, '0' - 52, '0' - 52,
					'0' - 52, '0' - 52, '0' - 52, '0' - 52,
					'0' - 52, '0' - 52, '0' - 52, '+' - 62,
					'/' - 63, 'A',      0,        0,
					'a' - 26, '0' - 52, '0' - 52, '0' - 52,
					'0' - 52, '0' - 52, '0' - 52, '0' - 52,
					'0' - 52, '0' - 52, '0' - 52, '+' - 62,
					'/' - 63, 'A',      0,        0};
	/* Fill the table with byte-wise vector loads instead of dereferencing the
	   uint8_t array through a uint8x16x2_t pointer: the array is only
	   byte-aligned and reading it through an unrelated pointer type is not a
	   well-defined access in C. */
	uint8x16x2_t shift_LUT;
	shift_LUT.val[0] = vld1q_u8(shift_LUT_);
	shift_LUT.val[1] = vld1q_u8(shift_LUT_ + 16);

	do {
		/* de-interleaving load: val[0] holds the first byte of each of the 16
		   triplets, val[1] the second and val[2] the third
		   [ccdddddd | bbbbcccc | aaaaaabb]
		    x.val[2] | x.val[1] | x.val[0] */
		const uint8x16x3_t x = vld3q_u8((const uint8_t *)(in));

		/* [00aa_aaaa] */
		const uint8x16_t field_a = vshrq_n_u8(x.val[0], 2);

		const uint8x16_t field_b =             /* [00bb_bbbb] */
			vbslq_u8(vdupq_n_u8(0x30),         /* [0011_0000] */
			         vshlq_n_u8(x.val[0], 4),  /* [aabb_0000] */
			         vshrq_n_u8(x.val[1], 4)); /* [0000_bbbb] */

		const uint8x16_t field_c =             /* [00cc_cccc] */
			vbslq_u8(vdupq_n_u8(0x3c),         /* [0011_1100] */
			         vshlq_n_u8(x.val[1], 2),  /* [bbcc_cc00] */
			         vshrq_n_u8(x.val[2], 6)); /* [0000_00cc] */

		/* [00dd_dddd] */
		const uint8x16_t field_d = vandq_u8(x.val[2], vdupq_n_u8(0x3f));

		uint8x16x4_t result;
		result.val[0] = encode_toascii(field_a, shift_LUT);
		result.val[1] = encode_toascii(field_b, shift_LUT);
		result.val[2] = encode_toascii(field_c, shift_LUT);
		result.val[3] = encode_toascii(field_d, shift_LUT);

		/* interleaving store: 16 groups of 4 characters */
		vst4q_u8((uint8_t *)out, result);
		out += 64;
		in += 16 * 3;
		inl -= 16 * 3;
	} while (inl >= 16 * 3);

	*left = inl;
	return out;
}
121 #endif /* defined(__aarch64__) || defined(_M_ARM64) */
122
php_base64_encode_impl(const unsigned char * in,size_t inl,unsigned char * out,zend_long flags)123 static zend_always_inline unsigned char *php_base64_encode_impl(const unsigned char *in, size_t inl, unsigned char *out, zend_long flags) /* {{{ */
124 {
125 #if defined(__aarch64__) || defined(_M_ARM64)
126 if (inl >= 16 * 3) {
127 size_t left = 0;
128 out = neon_base64_encode(in, inl, out, &left);
129 in += inl - left;
130 inl = left;
131 }
132 #endif
133
134 while (inl > 2) { /* keep going until we have less than 24 bits */
135 *out++ = base64_table[in[0] >> 2];
136 *out++ = base64_table[((in[0] & 0x03) << 4) + (in[1] >> 4)];
137 *out++ = base64_table[((in[1] & 0x0f) << 2) + (in[2] >> 6)];
138 *out++ = base64_table[in[2] & 0x3f];
139
140 in += 3;
141 inl -= 3; /* we just handle 3 octets of data */
142 }
143
144 /* now deal with the tail end of things */
145 if (inl != 0) {
146 *out++ = base64_table[in[0] >> 2];
147 if (inl > 1) {
148 *out++ = base64_table[((in[0] & 0x03) << 4) + (in[1] >> 4)];
149 *out++ = base64_table[(in[1] & 0x0f) << 2];
150 if ((flags & PHP_BASE64_NO_PADDING) == 0) {
151 *out++ = base64_pad;
152 }
153 } else {
154 *out++ = base64_table[(in[0] & 0x03) << 4];
155 if ((flags & PHP_BASE64_NO_PADDING) == 0) {
156 *out++ = base64_pad;
157 *out++ = base64_pad;
158 }
159 }
160 }
161
162 *out = '\0';
163
164 return out;
165 }
166 /* }}} */
167
168 #if defined(__aarch64__) || defined(_M_ARM64)
/* Convert sixteen ASCII bytes to 6-bit values and flag invalid bytes.
 * *error receives 0xff in every lane whose byte fails the maskLUT/bitposLUT
 * check and 0x00 elsewhere. The returned lanes are input plus a per-lane
 * offset; they are only meaningful where *error is zero. */
static zend_always_inline uint8x16_t decode_fromascii(const uint8x16_t input, uint8x16_t *error, const uint8x16x2_t shiftLUT, const uint8x16x2_t maskLUT, const uint8x16x2_t bitposLUT) {
	const uint8x16_t hi = vshrq_n_u8(input, 4);
	const uint8x16_t lo = vandq_u8(input, vdupq_n_u8(0x0f));

	/* validity: the mask selected by the low nibble must contain the bit
	   selected by the high nibble */
	const uint8x16_t allowed = vandq_u8(vqtbl2q_u8(maskLUT, lo), vqtbl2q_u8(bitposLUT, hi));
	*error = vceqq_u8(allowed, vdupq_n_u8(0));

	/* offset: chosen by the high nibble, except that 0x2f ('/') gets 16 */
	const uint8x16_t is_slash = vceqq_u8(input, vdupq_n_u8(0x2f));
	const uint8x16_t offset = vbslq_u8(is_slash, vdupq_n_u8(16), vqtbl2q_u8(shiftLUT, hi));

	return vaddq_u8(input, offset);
}
180
/* Decode complete 64-character input blocks with NEON, producing 48 bytes per
 * block.
 *
 * in   - Base64 input
 * inl  - number of input bytes; must be at least 64 on entry, because the
 *        loop body runs once before the remaining length is tested
 * out  - destination; 48 bytes are stored per decoded block
 * left - receives the number of input bytes that were not consumed
 *
 * Decoding stops at the first block containing a byte rejected by
 * decode_fromascii(); that block and everything after it is left for the
 * caller. Returns the number of bytes written to out. */
static zend_always_inline size_t neon_base64_decode(const unsigned char *in, size_t inl, unsigned char *out, size_t *left) {
	unsigned char *out_orig = out;
	/* Each 16-entry table is stored twice to fill both registers of the
	   two-register lookups used by decode_fromascii(). */
	const uint8_t shiftLUT_[32] = {
		0,   0,  19,   4, (uint8_t)-65, (uint8_t)-65, (uint8_t)-71, (uint8_t)-71,
		0,   0,   0,   0,   0,   0,   0,   0,
		0,   0,  19,   4, (uint8_t)-65, (uint8_t)-65, (uint8_t)-71, (uint8_t)-71,
		0,   0,   0,   0,   0,   0,   0,   0};
	const uint8_t maskLUT_[32] = {
		/* 0        : 0b1010_1000*/ 0xa8,
		/* 1 .. 9   : 0b1111_1000*/ 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8,
		/* 10       : 0b1111_0000*/ 0xf0,
		/* 11       : 0b0101_0100*/ 0x54,
		/* 12 .. 14 : 0b0101_0000*/ 0x50, 0x50, 0x50,
		/* 15       : 0b0101_0100*/ 0x54,

		/* 0        : 0b1010_1000*/ 0xa8,
		/* 1 .. 9   : 0b1111_1000*/ 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8,
		/* 10       : 0b1111_0000*/ 0xf0,
		/* 11       : 0b0101_0100*/ 0x54,
		/* 12 .. 14 : 0b0101_0000*/ 0x50, 0x50, 0x50,
		/* 15       : 0b0101_0100*/ 0x54
	};
	const uint8_t bitposLUT_[32] = {
		0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,

		0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
	};
	/* Fill the tables with byte-wise vector loads instead of dereferencing the
	   uint8_t arrays through uint8x16x2_t pointers: the arrays are only
	   byte-aligned and reading them through an unrelated pointer type is not a
	   well-defined access in C. */
	uint8x16x2_t shiftLUT;
	uint8x16x2_t maskLUT;
	uint8x16x2_t bitposLUT;
	shiftLUT.val[0] = vld1q_u8(shiftLUT_);
	shiftLUT.val[1] = vld1q_u8(shiftLUT_ + 16);
	maskLUT.val[0] = vld1q_u8(maskLUT_);
	maskLUT.val[1] = vld1q_u8(maskLUT_ + 16);
	bitposLUT.val[0] = vld1q_u8(bitposLUT_);
	bitposLUT.val[1] = vld1q_u8(bitposLUT_ + 16);

	do {
		/* de-interleaving load: val[k] holds the k-th character of each of
		   the 16 four-character groups */
		const uint8x16x4_t x = vld4q_u8((const unsigned char *)in);
		uint8x16_t error_a;
		uint8x16_t error_b;
		uint8x16_t error_c;
		uint8x16_t error_d;
		uint8x16_t field_a = decode_fromascii(x.val[0], &error_a, shiftLUT, maskLUT, bitposLUT);
		uint8x16_t field_b = decode_fromascii(x.val[1], &error_b, shiftLUT, maskLUT, bitposLUT);
		uint8x16_t field_c = decode_fromascii(x.val[2], &error_c, shiftLUT, maskLUT, bitposLUT);
		uint8x16_t field_d = decode_fromascii(x.val[3], &error_d, shiftLUT, maskLUT, bitposLUT);

		const uint8x16_t err = vorrq_u8(vorrq_u8(error_a, error_b), vorrq_u8(error_c, error_d));
		union {uint8_t mem[16]; uint64_t dw[2]; } error;
		vst1q_u8(error.mem, err);

		/* Check that the input only contains bytes belonging to the alphabet of
		   Base64. If there are errors, decode the rest of the string with the
		   scalar decoder. */
		if (error.dw[0] | error.dw[1])
			break;

		/* pack four 6-bit fields into three bytes */
		uint8x16x3_t result;
		result.val[0] = vorrq_u8(vshrq_n_u8(field_b, 4), vshlq_n_u8(field_a, 2));
		result.val[1] = vorrq_u8(vshrq_n_u8(field_c, 2), vshlq_n_u8(field_b, 4));
		result.val[2] = vorrq_u8(field_d, vshlq_n_u8(field_c, 6));

		vst3q_u8((unsigned char *)out, result);
		out += 16 * 3;
		in += 16 * 4;
		inl -= 16 * 4;
	} while (inl >= 16 * 4);
	*left = inl;
	return out - out_orig;
}
248 #endif /* defined(__aarch64__) || defined(_M_ARM64) */
249
php_base64_decode_impl(const unsigned char * in,size_t inl,unsigned char * out,size_t * outl,bool strict)250 static zend_always_inline int php_base64_decode_impl(const unsigned char *in, size_t inl, unsigned char *out, size_t *outl, bool strict) /* {{{ */
251 {
252 int ch;
253 size_t i = 0, padding = 0, j = *outl;
254
255 #if defined(__aarch64__) || defined(_M_ARM64)
256 if (inl >= 16 * 4) {
257 size_t left = 0;
258 j += neon_base64_decode(in, inl, out, &left);
259 i = inl - left;
260 in += i;
261 inl = left;
262 }
263 #endif
264
265 /* run through the whole string, converting as we go */
266 while (inl-- > 0) {
267 ch = *in++;
268 if (ch == base64_pad) {
269 padding++;
270 continue;
271 }
272
273 ch = base64_reverse_table[ch];
274 if (!strict) {
275 /* skip unknown characters and whitespace */
276 if (ch < 0) {
277 continue;
278 }
279 } else {
280 /* skip whitespace */
281 if (ch == -1) {
282 continue;
283 }
284 /* fail on bad characters or if any data follows padding */
285 if (ch == -2 || padding) {
286 goto fail;
287 }
288 }
289
290 switch (i % 4) {
291 case 0:
292 out[j] = ch << 2;
293 break;
294 case 1:
295 out[j++] |= ch >> 4;
296 out[j] = (ch & 0x0f) << 4;
297 break;
298 case 2:
299 out[j++] |= ch >>2;
300 out[j] = (ch & 0x03) << 6;
301 break;
302 case 3:
303 out[j++] |= ch;
304 break;
305 }
306 i++;
307 }
308
309 /* fail if the input is truncated (only one char in last group) */
310 if (strict && i % 4 == 1) {
311 goto fail;
312 }
313
314 /* fail if the padding length is wrong (not VV==, VVV=), but accept zero padding
315 * RFC 4648: "In some circumstances, the use of padding [--] is not required" */
316 if (strict && padding && (padding > 2 || (i + padding) % 4 != 0)) {
317 goto fail;
318 }
319
320 *outl = j;
321 out[j] = '\0';
322
323 return 1;
324
325 fail:
326 return 0;
327 }
328 /* }}} */
329
330 /* {{{ php_base64_encode */
331
/* Decide how the SSSE3 code is built relative to AVX2:
 *  - AVX2 is native: every SSSE3 selection macro is cleared.
 *  - AVX2 is dispatched at runtime (FUNC_PROTO or FUNC_PTR mode) while SSSE3
 *    is native: SSSE3 is switched from "native" to "resolver" using the same
 *    dispatch mode as AVX2, and ZEND_INTRIN_SSSE3_FUNC_DECL is redefined to
 *    declare ZEND_API functions, with a target("ssse3") attribute when
 *    HAVE_FUNC_ATTRIBUTE_TARGET is defined. */
#ifdef ZEND_INTRIN_AVX2_NATIVE
# undef ZEND_INTRIN_SSSE3_NATIVE
# undef ZEND_INTRIN_SSSE3_RESOLVER
# undef ZEND_INTRIN_SSSE3_FUNC_PROTO
# undef ZEND_INTRIN_SSSE3_FUNC_PTR
#elif defined(ZEND_INTRIN_AVX2_FUNC_PROTO) && defined(ZEND_INTRIN_SSSE3_NATIVE)
# undef ZEND_INTRIN_SSSE3_NATIVE
# undef ZEND_INTRIN_SSSE3_RESOLVER
# define ZEND_INTRIN_SSSE3_RESOLVER 1
# define ZEND_INTRIN_SSSE3_FUNC_PROTO 1
# undef ZEND_INTRIN_SSSE3_FUNC_DECL
# ifdef HAVE_FUNC_ATTRIBUTE_TARGET
#  define ZEND_INTRIN_SSSE3_FUNC_DECL(func) ZEND_API func __attribute__((target("ssse3")))
# else
#  define ZEND_INTRIN_SSSE3_FUNC_DECL(func) ZEND_API func
# endif
#elif defined(ZEND_INTRIN_AVX2_FUNC_PTR) && defined(ZEND_INTRIN_SSSE3_NATIVE)
# undef ZEND_INTRIN_SSSE3_NATIVE
# undef ZEND_INTRIN_SSSE3_RESOLVER
# define ZEND_INTRIN_SSSE3_RESOLVER 1
# define ZEND_INTRIN_SSSE3_FUNC_PTR 1
# undef ZEND_INTRIN_SSSE3_FUNC_DECL
# ifdef HAVE_FUNC_ATTRIBUTE_TARGET
#  define ZEND_INTRIN_SSSE3_FUNC_DECL(func) ZEND_API func __attribute__((target("ssse3")))
# else
#  define ZEND_INTRIN_SSSE3_FUNC_DECL(func) ZEND_API func
# endif
#endif
360
/* Only enable the AVX512 resolvers if AVX2 is also dispatched through a resolver */
362 #if defined(ZEND_INTRIN_AVX2_FUNC_PROTO) && defined(ZEND_INTRIN_AVX512_FUNC_PROTO)
363 #define BASE64_INTRIN_AVX512_FUNC_PROTO 1
364 #endif
365 #if defined(ZEND_INTRIN_AVX2_FUNC_PTR) && defined(ZEND_INTRIN_AVX512_FUNC_PTR)
366 #define BASE64_INTRIN_AVX512_FUNC_PTR 1
367 #endif
368 #if defined(ZEND_INTRIN_AVX2_FUNC_PROTO) && defined(ZEND_INTRIN_AVX512_VBMI_FUNC_PROTO)
369 #define BASE64_INTRIN_AVX512_VBMI_FUNC_PROTO 1
370 #endif
371 #if defined(ZEND_INTRIN_AVX2_FUNC_PTR) && defined(ZEND_INTRIN_AVX512_VBMI_FUNC_PTR)
372 #define BASE64_INTRIN_AVX512_VBMI_FUNC_PTR 1
373 #endif
374
375 #ifdef ZEND_INTRIN_AVX2_NATIVE
376 # include <immintrin.h>
377 #elif defined(ZEND_INTRIN_SSSE3_NATIVE)
378 # include <tmmintrin.h>
379 #elif defined(ZEND_INTRIN_SSSE3_RESOLVER) || defined(ZEND_INTRIN_AVX2_RESOLVER)
380 # ifdef ZEND_INTRIN_AVX2_RESOLVER
381 # include <immintrin.h>
382 # else
383 # include <tmmintrin.h>
# endif /* ZEND_INTRIN_AVX2_RESOLVER */
385 # include "Zend/zend_cpuinfo.h"
386
387 # if defined(BASE64_INTRIN_AVX512_FUNC_PROTO) || defined(BASE64_INTRIN_AVX512_FUNC_PTR)
388 ZEND_INTRIN_AVX512_FUNC_DECL(zend_string *php_base64_encode_avx512(const unsigned char *str, size_t length, zend_long flags));
389 ZEND_INTRIN_AVX512_FUNC_DECL(zend_string *php_base64_decode_ex_avx512(const unsigned char *str, size_t length, bool strict));
390 # endif
391 # if defined(BASE64_INTRIN_AVX512_VBMI_FUNC_PROTO) || defined(BASE64_INTRIN_AVX512_VBMI_FUNC_PTR)
392 ZEND_INTRIN_AVX512_VBMI_FUNC_DECL(zend_string *php_base64_encode_avx512_vbmi(const unsigned char *str, size_t length, zend_long flags));
393 ZEND_INTRIN_AVX512_VBMI_FUNC_DECL(zend_string *php_base64_decode_ex_avx512_vbmi(const unsigned char *str, size_t length, bool strict));
394 # endif
395
396 # ifdef ZEND_INTRIN_AVX2_RESOLVER
397 ZEND_INTRIN_AVX2_FUNC_DECL(zend_string *php_base64_encode_avx2(const unsigned char *str, size_t length, zend_long flags));
398 ZEND_INTRIN_AVX2_FUNC_DECL(zend_string *php_base64_decode_ex_avx2(const unsigned char *str, size_t length, bool strict));
399 # endif
400
401 # ifdef ZEND_INTRIN_SSSE3_RESOLVER
402 ZEND_INTRIN_SSSE3_FUNC_DECL(zend_string *php_base64_encode_ssse3(const unsigned char *str, size_t length, zend_long flags));
403 ZEND_INTRIN_SSSE3_FUNC_DECL(zend_string *php_base64_decode_ex_ssse3(const unsigned char *str, size_t length, bool strict));
404 # endif
405
406 zend_string *php_base64_encode_default(const unsigned char *str, size_t length, zend_long flags);
407 zend_string *php_base64_decode_ex_default(const unsigned char *str, size_t length, bool strict);
408
409 # if (defined(ZEND_INTRIN_AVX2_FUNC_PROTO) || defined(ZEND_INTRIN_SSSE3_FUNC_PROTO) || defined(BASE64_INTRIN_AVX512_FUNC_PROTO) || defined(BASE64_INTRIN_AVX512_VBMI_FUNC_PROTO))
410 PHPAPI zend_string *php_base64_encode_ex(const unsigned char *str, size_t length, zend_long flags) __attribute__((ifunc("resolve_base64_encode")));
411 PHPAPI zend_string *php_base64_decode_ex(const unsigned char *str, size_t length, bool strict) __attribute__((ifunc("resolve_base64_decode")));
412
413 typedef zend_string *(*base64_encode_func_t)(const unsigned char *, size_t, zend_long flags);
414 typedef zend_string *(*base64_decode_func_t)(const unsigned char *, size_t, bool);
415
416 ZEND_NO_SANITIZE_ADDRESS
417 ZEND_ATTRIBUTE_UNUSED /* clang mistakenly warns about this */
resolve_base64_encode(void)418 static base64_encode_func_t resolve_base64_encode(void) {
419 # ifdef BASE64_INTRIN_AVX512_VBMI_FUNC_PROTO
420 if (zend_cpu_supports_avx512_vbmi()) {
421 return php_base64_encode_avx512_vbmi;
422 } else
423 # endif
424 # ifdef BASE64_INTRIN_AVX512_FUNC_PROTO
425 if (zend_cpu_supports_avx512()) {
426 return php_base64_encode_avx512;
427 } else
428 # endif
429 # ifdef ZEND_INTRIN_AVX2_FUNC_PROTO
430 if (zend_cpu_supports_avx2()) {
431 return php_base64_encode_avx2;
432 } else
433 # endif
434 #ifdef ZEND_INTRIN_SSSE3_FUNC_PROTO
435 if (zend_cpu_supports_ssse3()) {
436 return php_base64_encode_ssse3;
437 }
438 #endif
439 return php_base64_encode_default;
440 }
441
442 ZEND_NO_SANITIZE_ADDRESS
443 ZEND_ATTRIBUTE_UNUSED /* clang mistakenly warns about this */
resolve_base64_decode(void)444 static base64_decode_func_t resolve_base64_decode(void) {
445 # ifdef BASE64_INTRIN_AVX512_VBMI_FUNC_PROTO
446 if (zend_cpu_supports_avx512_vbmi()) {
447 return php_base64_decode_ex_avx512_vbmi;
448 } else
449 # endif
450 # ifdef BASE64_INTRIN_AVX512_FUNC_PROTO
451 if (zend_cpu_supports_avx512()) {
452 return php_base64_decode_ex_avx512;
453 } else
454 # endif
455 # ifdef ZEND_INTRIN_AVX2_FUNC_PROTO
456 if (zend_cpu_supports_avx2()) {
457 return php_base64_decode_ex_avx2;
458 } else
459 # endif
460 #ifdef ZEND_INTRIN_SSSE3_FUNC_PROTO
461 if (zend_cpu_supports_ssse3()) {
462 return php_base64_decode_ex_ssse3;
463 }
464 #endif
465 return php_base64_decode_ex_default;
466 }
467 # else /* (ZEND_INTRIN_AVX2_FUNC_PROTO || ZEND_INTRIN_SSSE3_FUNC_PROTO) */
468
469 PHPAPI zend_string *(*php_base64_encode_ptr)(const unsigned char *str, size_t length, zend_long flags) = NULL;
470 PHPAPI zend_string *(*php_base64_decode_ex_ptr)(const unsigned char *str, size_t length, bool strict) = NULL;
471
php_base64_encode_ex(const unsigned char * str,size_t length,zend_long flags)472 PHPAPI zend_string *php_base64_encode_ex(const unsigned char *str, size_t length, zend_long flags) {
473 return php_base64_encode_ptr(str, length, flags);
474 }
php_base64_decode_ex(const unsigned char * str,size_t length,bool strict)475 PHPAPI zend_string *php_base64_decode_ex(const unsigned char *str, size_t length, bool strict) {
476 return php_base64_decode_ex_ptr(str, length, strict);
477 }
478
PHP_MINIT_FUNCTION(base64_intrin)479 PHP_MINIT_FUNCTION(base64_intrin)
480 {
481 # ifdef BASE64_INTRIN_AVX512_VBMI_FUNC_PTR
482 if (zend_cpu_supports_avx512_vbmi()) {
483 php_base64_encode_ptr = php_base64_encode_avx512_vbmi;
484 php_base64_decode_ex_ptr = php_base64_decode_ex_avx512_vbmi;
485 } else
486 # endif
487 # ifdef BASE64_INTRIN_AVX512_FUNC_PTR
488 if (zend_cpu_supports_avx512()) {
489 php_base64_encode_ptr = php_base64_encode_avx512;
490 php_base64_decode_ex_ptr = php_base64_decode_ex_avx512;
491 } else
492 # endif
493 # ifdef ZEND_INTRIN_AVX2_FUNC_PTR
494 if (zend_cpu_supports_avx2()) {
495 php_base64_encode_ptr = php_base64_encode_avx2;
496 php_base64_decode_ex_ptr = php_base64_decode_ex_avx2;
497 } else
498 # endif
499 #ifdef ZEND_INTRIN_SSSE3_FUNC_PTR
500 if (zend_cpu_supports_ssse3()) {
501 php_base64_encode_ptr = php_base64_encode_ssse3;
502 php_base64_decode_ex_ptr = php_base64_decode_ex_ssse3;
503 } else
504 #endif
505 {
506 php_base64_encode_ptr = php_base64_encode_default;
507 php_base64_decode_ex_ptr = php_base64_decode_ex_default;
508 }
509 return SUCCESS;
510 }
511 # endif /* (ZEND_INTRIN_AVX2_FUNC_PROTO || ZEND_INTRIN_SSSE3_FUNC_PROTO) */
512 #endif /* ZEND_INTRIN_AVX2_NATIVE */
513
514 #if defined(BASE64_INTRIN_AVX512_VBMI_FUNC_PROTO) || defined(BASE64_INTRIN_AVX512_VBMI_FUNC_PTR)
php_base64_encode_avx512_vbmi(const unsigned char * str,size_t length,zend_long flags)515 zend_string *php_base64_encode_avx512_vbmi(const unsigned char *str, size_t length, zend_long flags)
516 {
517 const unsigned char *c = str;
518 unsigned char *o;
519 zend_string *result;
520
521 result = zend_string_safe_alloc(((length + 2) / 3), 4 * sizeof(char), 0, 0);
522 o = (unsigned char *)ZSTR_VAL(result);
523
524 const __m512i shuffle_splitting = _mm512_setr_epi32(
525 0x01020001, 0x04050304, 0x07080607, 0x0a0b090a, 0x0d0e0c0d, 0x10110f10,
526 0x13141213, 0x16171516, 0x191a1819, 0x1c1d1b1c, 0x1f201e1f, 0x22232122,
527 0x25262425, 0x28292728, 0x2b2c2a2b, 0x2e2f2d2e);
528 const __m512i multi_shifts = _mm512_set1_epi64(0x3036242a1016040a);
529 const char *ascii_lookup_tbl = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
530 const __m512i ascii_lookup = _mm512_loadu_si512((__m512i *)ascii_lookup_tbl);
531
532 while (length > 63) {
533 /* Step 1: load input data */
534 __m512i str = _mm512_loadu_si512((const __m512i *)c);
535
536 /* Step 2: splitting 24-bit words into 32-bit lanes */
537 str = _mm512_permutexvar_epi8(shuffle_splitting, str);
538
539 /* Step 3: moving 6-bit word to sperate bytes */
540 str = _mm512_multishift_epi64_epi8(multi_shifts, str);
541
542 /* Step 4: conversion to ASCII */
543 str = _mm512_permutexvar_epi8(str, ascii_lookup);
544
545 /* Step 5: store the final result */
546 _mm512_storeu_si512((__m512i *)o, str);
547 c += 48;
548 o += 64;
549 length -= 48;
550 }
551
552 o = php_base64_encode_impl(c, length, o, flags);
553
554 ZSTR_LEN(result) = (o - (unsigned char *)ZSTR_VAL(result));
555
556 return result;
557 }
558
php_base64_decode_ex_avx512_vbmi(const unsigned char * str,size_t length,bool strict)559 zend_string *php_base64_decode_ex_avx512_vbmi(const unsigned char *str, size_t length, bool strict)
560 {
561 const unsigned char *c = str;
562 unsigned char *o;
563 size_t outl = 0;
564 zend_string *result;
565
566 result = zend_string_alloc(length, 0);
567 o = (unsigned char *)ZSTR_VAL(result);
568
569 const __m512i lookup_0 = _mm512_setr_epi32(
570 0x80808080, 0x80808080, 0x80808080, 0x80808080, 0x80808080, 0x80808080,
571 0x80808080, 0x80808080, 0x80808080, 0x80808080, 0x3e808080, 0x3f808080,
572 0x37363534, 0x3b3a3938, 0x80803d3c, 0x80808080);
573 const __m512i lookup_1 = _mm512_setr_epi32(
574 0x02010080, 0x06050403, 0x0a090807, 0x0e0d0c0b, 0x1211100f, 0x16151413,
575 0x80191817, 0x80808080, 0x1c1b1a80, 0x201f1e1d, 0x24232221, 0x28272625,
576 0x2c2b2a29, 0x302f2e2d, 0x80333231, 0x80808080);
577
578 const __m512i merge_mask1 = _mm512_set1_epi32(0x01400140);
579 const __m512i merge_mask2 = _mm512_set1_epi32(0x00011000);
580
581 const __m512i continuous_mask = _mm512_setr_epi32(
582 0x06000102, 0x090a0405, 0x0c0d0e08, 0x16101112, 0x191a1415, 0x1c1d1e18,
583 0x26202122, 0x292a2425, 0x2c2d2e28, 0x36303132, 0x393a3435, 0x3c3d3e38,
584 0x00000000, 0x00000000, 0x00000000, 0x00000000);
585
586 while (length > 64) {
587 /* Step 1: load input data */
588 const __m512i input = _mm512_loadu_si512((__m512i *)c);
589
590 /* Step 2: translation into 6-bit values(saved on bytes) from ASCII and error detection */
591 __m512i str = _mm512_permutex2var_epi8(lookup_0, input, lookup_1);
592 const uint64_t mask = _mm512_movepi8_mask(_mm512_or_epi64(str, input)); /* convert MSBs to the mask */
593 if (mask) {
594 break;
595 }
596
597 /* Step 3: pack four fields within 32-bit words into 24-bit words. */
598 const __m512i merge_ab_and_bc = _mm512_maddubs_epi16(str, merge_mask1);
599 str = _mm512_madd_epi16(merge_ab_and_bc, merge_mask2);
600
601 /* Step 4: move 3-byte words into the continuous array. */
602 str = _mm512_permutexvar_epi8(continuous_mask, str);
603
604 /* Step 5: store the final result */
605 _mm512_storeu_si512((__m512i *)o, str);
606
607 c += 64;
608 o += 48;
609 outl += 48;
610 length -= 64;
611 }
612
613 if (!php_base64_decode_impl(c, length, (unsigned char*)ZSTR_VAL(result), &outl, strict)) {
614 zend_string_efree(result);
615 return NULL;
616 }
617
618 ZSTR_LEN(result) = outl;
619
620 return result;
621 }
622 #endif
623
624 #if defined(BASE64_INTRIN_AVX512_FUNC_PROTO) || defined(BASE64_INTRIN_AVX512_FUNC_PTR)
php_base64_encode_avx512(const unsigned char * str,size_t length,zend_long flags)625 zend_string *php_base64_encode_avx512(const unsigned char *str, size_t length, zend_long flags)
626 {
627 const unsigned char *c = str;
628 unsigned char *o;
629 zend_string *result;
630
631 result = zend_string_safe_alloc(((length + 2) / 3), 4 * sizeof(char), 0, 0);
632 o = (unsigned char *)ZSTR_VAL(result);
633
634 while (length > 63) {
635 /* Step 1: load input data */
636 /* [????|????|????|????|PPPO|OONN|NMMM|LLLK|KKJJ|JIII|HHHG|GGFF|FEEE|DDDC|CCBB|BAAA] */
637 __m512i str = _mm512_loadu_si512((const __m512i *)c);
638
639 /* Step 2: splitting 24-bit words into 32-bit lanes */
640 /* [0000|PPPO|OONN|NMMM|0000|LLLK|KKJJ|JIII|0000|HHHG|GGFF|FEEE|0000|DDDC|CCBB|BAAA] */
641 str = _mm512_permutexvar_epi32(
642 _mm512_set_epi32(-1, 11, 10, 9, -1, 8, 7, 6, -1, 5, 4, 3, -1, 2, 1, 0), str);
643 /* [D1 D2 D0 D1|C1 C2 C0 C1|B1 B2 B0 B1|A1 A2 A0 A1] x 4 */
644 str = _mm512_shuffle_epi8(str, _mm512_set4_epi32(0x0a0b090a, 0x07080607, 0x04050304, 0x01020001));
645
646 /* Step 3: moving 6-bit word to sperate bytes */
647 /* in: [bbbbcccc|ccdddddd|aaaaaabb|bbbbcccc] */
648 /* t0: [0000cccc|cc000000|aaaaaa00|00000000] */
649 const __m512i t0 = _mm512_and_si512(str, _mm512_set1_epi32(0x0fc0fc00));
650 /* t1: [00000000|00cccccc|00000000|00aaaaaa] */
651 const __m512i t1 = _mm512_srlv_epi16(t0, _mm512_set1_epi32(0x0006000a));
652 /* t2: [ccdddddd|00000000|aabbbbbb|cccc0000] */
653 const __m512i t2 = _mm512_sllv_epi16(str, _mm512_set1_epi32(0x00080004));
654 /* str: [00dddddd|00cccccc|00bbbbbb|00aaaaaa] */
655 str = _mm512_ternarylogic_epi32(_mm512_set1_epi32(0x3f003f00), t2, t1, 0xca);
656
657 /* Step 4: conversion to ASCII */
658 __m512i result = _mm512_subs_epu8(str, _mm512_set1_epi8(51));
659 const __mmask64 less = _mm512_cmpgt_epi8_mask(_mm512_set1_epi8(26), str);
660 result = _mm512_mask_mov_epi8(result, less, _mm512_set1_epi8(13));
661 const __m512i lut = _mm512_set4_epi32(0x000041f0, 0xedfcfcfc, 0xfcfcfcfc, 0xfcfcfc47);
662 result = _mm512_shuffle_epi8(lut, result);
663 result = _mm512_add_epi8(result, str);
664
665 /* Step 5: store the final result */
666 _mm512_storeu_si512((__m512i *)o, result);
667 c += 48;
668 o += 64;
669 length -= 48;
670 }
671
672 o = php_base64_encode_impl(c, length, o, flags);
673
674 ZSTR_LEN(result) = (o - (unsigned char *)ZSTR_VAL(result));
675
676 return result;
677 }
678
/* Pack four byte values into one 32-bit word, b0 in the least significant
 * byte. Each argument is truncated to uint8_t first, so negative constants
 * such as -65 are accepted. Arguments and the whole expansion are
 * parenthesized so the macro is safe inside larger expressions. */
#define build_dword(b0, b1, b2, b3) \
	(((uint32_t)(uint8_t)(b0) << 0) | ((uint32_t)(uint8_t)(b1) << 8) | \
	 ((uint32_t)(uint8_t)(b2) << 16) | ((uint32_t)(uint8_t)(b3) << 24))

/* Build a __m512i whose four 128-bit lanes each contain the same 16 bytes,
 * b0 being the lowest byte of every lane. */
#define _mm512_set4lanes_epi8(b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15) \
	_mm512_setr4_epi32(build_dword((b0), (b1), (b2), (b3)), build_dword((b4), (b5), (b6), (b7)), \
	                   build_dword((b8), (b9), (b10), (b11)), build_dword((b12), (b13), (b14), (b15)))
686
php_base64_decode_ex_avx512(const unsigned char * str,size_t length,bool strict)687 zend_string *php_base64_decode_ex_avx512(const unsigned char *str, size_t length, bool strict)
688 {
689 const unsigned char *c = str;
690 unsigned char *o;
691 size_t outl = 0;
692 zend_string *result;
693
694 result = zend_string_alloc(length, 0);
695 o = (unsigned char *)ZSTR_VAL(result);
696
697 while (length > 64) {
698 /* Step 1: load input data */
699 __m512i str = _mm512_loadu_si512((__m512i *)c);
700
701 /* Step 2: translation into 6-bit values(saved on bytes) from ASCII and error detection */
702 const __m512i higher_nibble = _mm512_and_si512(_mm512_srli_epi32(str, 4), _mm512_set1_epi8(0x0f));
703 const __m512i lower_nibble = _mm512_and_si512(str, _mm512_set1_epi8(0x0f));
704 const __m512i shiftLUT = _mm512_set4lanes_epi8(
705 0, 0, 19, 4, -65, -65, -71, -71, 0, 0, 0, 0, 0, 0, 0, 0);
706 const __m512i maskLUT = _mm512_set4lanes_epi8(
707 /* 0 : 0b1010_1000*/ 0xa8,
708 /* 1 .. 9 : 0b1111_1000*/ 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8,
709 /* 10 : 0b1111_0000*/ 0xf0,
710 /* 11 : 0b0101_0100*/ 0x54,
711 /* 12 .. 14 : 0b0101_0000*/ 0x50, 0x50, 0x50,
712 /* 15 : 0b0101_0100*/ 0x54);
713 const __m512i bitposLUT = _mm512_set4lanes_epi8(
714 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
715 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
716 const __m512i M = _mm512_shuffle_epi8(maskLUT, lower_nibble);
717 const __m512i bit = _mm512_shuffle_epi8(bitposLUT, higher_nibble);
718 const uint64_t match = _mm512_test_epi8_mask(M, bit);
719 if (match != (uint64_t)-1) {
720 break;
721 }
722 const __m512i sh = _mm512_shuffle_epi8(shiftLUT, higher_nibble);
723 const __mmask64 eq_2f = _mm512_cmpeq_epi8_mask(str, _mm512_set1_epi8(0x2f));
724 const __m512i shift = _mm512_mask_mov_epi8(sh, eq_2f, _mm512_set1_epi8(16));
725 str = _mm512_add_epi8(str, shift);
726
727 /* Step 3: pack four fields within 32-bit words into 24-bit words. */
728 const __m512i merge_ab_and_bc = _mm512_maddubs_epi16(str, _mm512_set1_epi32(0x01400140));
729 str = _mm512_madd_epi16(merge_ab_and_bc, _mm512_set1_epi32(0x00011000));
730
731 /* Step 4: move 3-byte words into the continuous array. */
732 const __m512i t1 = _mm512_shuffle_epi8(str,
733 _mm512_set4lanes_epi8(2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1));
734 const __m512i s6 = _mm512_setr_epi32(0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 0, 0, 0, 0);
735 const __m512i t2 = _mm512_permutexvar_epi32(s6, t1);
736
737 /* Step 5: store the final result */
738 _mm512_storeu_si512((__m512i *)o, t2);
739
740 c += 64;
741 o += 48;
742 outl += 48;
743 length -= 64;
744 }
745
746 if (!php_base64_decode_impl(c, length, (unsigned char*)ZSTR_VAL(result), &outl, strict)) {
747 zend_string_efree(result);
748 return NULL;
749 }
750
751 ZSTR_LEN(result) = outl;
752
753 return result;
754 }
755 #endif
756
757 #if defined(ZEND_INTRIN_AVX2_NATIVE) || defined(ZEND_INTRIN_AVX2_RESOLVER)
758 # if defined(ZEND_INTRIN_AVX2_RESOLVER) && defined(HAVE_FUNC_ATTRIBUTE_TARGET)
759 static __m256i php_base64_encode_avx2_reshuffle(__m256i in) __attribute__((target("avx2")));
760 static __m256i php_base64_encode_avx2_translate(__m256i in) __attribute__((target("avx2")));
761 # endif
/* Spread 24 input bytes (12 per 128-bit lane) into thirty-two 6-bit values,
 * one per output byte. */
static __m256i php_base64_encode_avx2_reshuffle(__m256i in)
{
	/* This one works with shifted (4 bytes) input in order to
	 * be able to work efficiently in the 2 128-bit lanes */

	/* input, bytes MSB to LSB:
	 * 0 0 0 0 x w v u t s r q p o n m
	 * l k j i h g f e d c b a 0 0 0 0 */
	const __m256i spread = _mm256_shuffle_epi8(in, _mm256_set_epi8(
		10, 11,  9, 10,
		 7,  8,  6,  7,
		 4,  5,  3,  4,
		 1,  2,  0,  1,

		14, 15, 13, 14,
		11, 12, 10, 11,
		 8,  9,  7,  8,
		 5,  6,  4,  5));

	/* the two fields that have to move right: mask them, then shift via the
	 * high half of a 16-bit multiply */
	const __m256i right_src = _mm256_and_si256(spread, _mm256_set1_epi32(0x0fc0fc00));
	const __m256i right = _mm256_mulhi_epu16(right_src, _mm256_set1_epi32(0x04000040));

	/* the two fields that have to move left: mask them, then shift via the
	 * low half of a 16-bit multiply */
	const __m256i left_src = _mm256_and_si256(spread, _mm256_set1_epi32(0x003f03f0));
	const __m256i left = _mm256_mullo_epi16(left_src, _mm256_set1_epi32(0x01000010));

	/* output (upper case are MSB, lower case are LSB):
	 * 00xxxxxx 00wwwwXX 00vvWWWW 00VVVVVV
	 * 00uuuuuu 00ttttUU 00ssTTTT 00SSSSSS
	 * 00rrrrrr 00qqqqRR 00ppQQQQ 00PPPPPP
	 * 00oooooo 00nnnnOO 00mmNNNN 00MMMMMM
	 * 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
	 * 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
	 * 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
	 * 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA */
	return _mm256_or_si256(right, left);
}
800
/* Turn 6-bit values into Base64 alphabet characters, byte by byte.
 *
 * For input bytes in 0..63 the per-byte offset added is:
 *   [0..25]  +65   [26..51] +71   [52..61] -4   [62] -19   [63] -16 */
static __m256i php_base64_encode_avx2_translate(__m256i in)
{
	/* Offset table, repeated for both 128-bit lanes. */
	const __m256i offsets = _mm256_setr_epi8(
		65, 71, -4, -4, -4, -4, -4, -4,
		-4, -4, -4, -4, -19, -16, 0, 0,
		65, 71, -4, -4, -4, -4, -4, -4,
		-4, -4, -4, -4, -19, -16, 0, 0);

	/* 0xFF (-1) for every value above 25, 0x00 otherwise. */
	const __m256i above_25 = _mm256_cmpgt_epi8(in, _mm256_set1_epi8(25));

	/* Saturating subtract: 0 for values up to 51, value - 51 above that. */
	const __m256i reduced = _mm256_subs_epu8(in, _mm256_set1_epi8(51));

	/* Subtracting -1 bumps the table slot by one for every value above 25. */
	const __m256i slot = _mm256_sub_epi8(reduced, above_25);

	return _mm256_add_epi8(in, _mm256_shuffle_epi8(offsets, slot));
}
#endif /* ZEND_INTRIN_AVX2_NATIVE || ZEND_INTRIN_AVX2_RESOLVER */
821
822 #if defined(ZEND_INTRIN_SSSE3_NATIVE) || defined(ZEND_INTRIN_SSSE3_RESOLVER)
823
824 # if defined(ZEND_INTRIN_SSSE3_RESOLVER) && defined(HAVE_FUNC_ATTRIBUTE_TARGET)
825 static __m128i php_base64_encode_ssse3_reshuffle(__m128i in) __attribute__((target("ssse3")));
826 static __m128i php_base64_encode_ssse3_translate(__m128i in) __attribute__((target("ssse3")));
827 # endif
828
php_base64_encode_ssse3_reshuffle(__m128i in)829 static __m128i php_base64_encode_ssse3_reshuffle(__m128i in)
830 {
831 __m128i t0, t1, t2, t3;
832
833 /* input, bytes MSB to LSB:
834 * 0 0 0 0 l k j i h g f e d c b a */
835 in = _mm_shuffle_epi8(in, _mm_set_epi8(
836 10, 11, 9, 10,
837 7, 8, 6, 7,
838 4, 5, 3, 4,
839 1, 2, 0, 1));
840
841 t0 = _mm_and_si128(in, _mm_set1_epi32(0x0fc0fc00));
842
843 t1 = _mm_mulhi_epu16(t0, _mm_set1_epi32(0x04000040));
844
845 t2 = _mm_and_si128(in, _mm_set1_epi32(0x003f03f0));
846
847 t3 = _mm_mullo_epi16(t2, _mm_set1_epi32(0x01000010));
848
849 /* output (upper case are MSB, lower case are LSB):
850 * 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
851 * 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
852 * 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
853 * 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA */
854 return _mm_or_si128(t1, t3);
855 }
856
/* Translate values 0..63 to the Base64 alphabet. There are five sets:
 * #  From      To         Abs   Index    Characters
 * 0  [0..25]   [65..90]   +65   0        ABCDEFGHIJKLMNOPQRSTUVWXYZ
 * 1  [26..51]  [97..122]  +71   1        abcdefghijklmnopqrstuvwxyz
 * 2  [52..61]  [48..57]    -4   [2..11]  0123456789
 * 3  [62]      [43]       -19   12       +
 * 4  [63]      [47]       -16   13       / */
static __m128i php_base64_encode_ssse3_translate(__m128i in)
{
	/* Offsets ("Abs" column) stored at their "Index" position. */
	const __m128i offsets = _mm_setr_epi8(
		 65,  71, -4, -4,
		 -4,  -4, -4, -4,
		 -4,  -4, -4, -4,
		-19, -16,  0,  0);

	/* 0xFF (-1) for sets #1..#4, 0x00 for set #0. */
	const __m128i above_25 = _mm_cmpgt_epi8(in, _mm_set1_epi8(25));

	/* Saturating subtract: already the right slot for set #0, one short of
	 * the right slot for every other set. */
	const __m128i reduced = _mm_subs_epu8(in, _mm_set1_epi8(51));

	/* Subtracting -1 adds the missing one for sets #1..#4. */
	const __m128i slot = _mm_sub_epi8(reduced, above_25);

	/* Apply the selected offset to every input value. */
	return _mm_add_epi8(in, _mm_shuffle_epi8(offsets, slot));
}
888
/* Bulk SSSE3 encoding loop, expanded inside the SSSE3 encoder functions.
 *
 * Uses and updates these variables of the expanding function:
 *   c      - input cursor (bytes still to encode start here)
 *   o      - output cursor
 *   length - number of input bytes remaining at c
 *
 * While more than 15 input bytes remain, 16 bytes are loaded, the first 12
 * of them are encoded into 16 output characters, and the cursors advance by
 * 12 (input) and 16 (output). Whatever is left is for the caller to encode.
 *
 * The body is wrapped in do { } while (0) so that "MACRO;" expands to exactly
 * one statement and stays safe under an unbraced if/else. */
#define PHP_BASE64_ENCODE_SSSE3_LOOP \
	do { \
		while (length > 15) { \
			__m128i s = _mm_loadu_si128((const __m128i *)c); \
			\
			s = php_base64_encode_ssse3_reshuffle(s); \
			s = php_base64_encode_ssse3_translate(s); \
			\
			_mm_storeu_si128((__m128i *)o, s); \
			c += 12; \
			o += 16; \
			length -= 12; \
		} \
	} while (0)
902
#endif /* ZEND_INTRIN_SSSE3_NATIVE || ZEND_INTRIN_SSSE3_RESOLVER */
904
905 #if defined(ZEND_INTRIN_AVX2_NATIVE) || defined(ZEND_INTRIN_AVX2_RESOLVER) || defined(ZEND_INTRIN_SSSE3_NATIVE) || defined(ZEND_INTRIN_SSSE3_RESOLVER)
906 # if defined(ZEND_INTRIN_AVX2_NATIVE) || defined(ZEND_INTRIN_SSSE3_NATIVE)
php_base64_encode_ex(const unsigned char * str,size_t length,zend_long flags)907 PHPAPI zend_string *php_base64_encode_ex(const unsigned char *str, size_t length, zend_long flags)
908 # elif defined(ZEND_INTRIN_AVX2_RESOLVER)
909 zend_string *php_base64_encode_avx2(const unsigned char *str, size_t length, zend_long flags)
910 # else /* ZEND_INTRIN_SSSE3_RESOLVER */
911 zend_string *php_base64_encode_ssse3(const unsigned char *str, size_t length, zend_long flags)
912 # endif
913 {
914 const unsigned char *c = str;
915 unsigned char *o;
916 zend_string *result;
917
918 result = zend_string_safe_alloc(((length + 2) / 3), 4 * sizeof(char), 0, 0);
919 o = (unsigned char *)ZSTR_VAL(result);
920 # if defined(ZEND_INTRIN_AVX2_NATIVE) || defined(ZEND_INTRIN_AVX2_RESOLVER)
921 if (length > 31) {
922 __m256i s = _mm256_loadu_si256((__m256i *)c);
923
924 s = _mm256_permutevar8x32_epi32(s, _mm256_setr_epi32(0, 0, 1, 2, 3, 4, 5, 6));
925
926 for (;;) {
927 s = php_base64_encode_avx2_reshuffle(s);
928
929 s = php_base64_encode_avx2_translate(s);
930
931 _mm256_storeu_si256((__m256i *)o, s);
932 c += 24;
933 o += 32;
934 length -= 24;
935 if (length < 28) {
936 break;
937 }
938 s = _mm256_loadu_si256((__m256i *)(c - 4));
939 }
940 }
941 # else
942 PHP_BASE64_ENCODE_SSSE3_LOOP;
943 # endif
944
945 o = php_base64_encode_impl(c, length, o, flags);
946
947 ZSTR_LEN(result) = (o - (unsigned char *)ZSTR_VAL(result));
948
949 return result;
950 }
951
952 # if defined(ZEND_INTRIN_SSSE3_RESOLVER) && defined(ZEND_INTRIN_AVX2_RESOLVER)
php_base64_encode_ssse3(const unsigned char * str,size_t length,zend_long flags)953 zend_string *php_base64_encode_ssse3(const unsigned char *str, size_t length, zend_long flags)
954 {
955 const unsigned char *c = str;
956 unsigned char *o;
957 zend_string *result;
958
959 result = zend_string_safe_alloc(((length + 2) / 3), 4 * sizeof(char), 0, 0);
960 o = (unsigned char *)ZSTR_VAL(result);
961
962 PHP_BASE64_ENCODE_SSSE3_LOOP;
963
964 o = php_base64_encode_impl(c, length, o, flags);
965
966 ZSTR_LEN(result) = (o - (unsigned char *)ZSTR_VAL(result));
967
968 return result;
969 }
970 # endif
971 #endif /* ZEND_INTRIN_AVX2_NATIVE || ZEND_INTRIN_AVX2_RESOLVER || ZEND_INTRIN_SSSE3_NATIVE || ZEND_INTRIN_SSSE3_RESOLVER */
972
973 /* }}} */
974
975 #if defined(ZEND_INTRIN_AVX2_NATIVE) || defined(ZEND_INTRIN_AVX2_RESOLVER)
976 # if defined(ZEND_INTRIN_AVX2_RESOLVER) && defined(HAVE_FUNC_ATTRIBUTE_TARGET)
977 static __m256i php_base64_decode_avx2_reshuffle(__m256i in) __attribute__((target("avx2")));
978 # endif
979
php_base64_decode_avx2_reshuffle(__m256i in)980 static __m256i php_base64_decode_avx2_reshuffle(__m256i in)
981 {
982 __m256i merge_ab_and_bc, out;
983
984 merge_ab_and_bc = _mm256_maddubs_epi16(in, _mm256_set1_epi32(0x01400140));
985
986 out = _mm256_madd_epi16(merge_ab_and_bc, _mm256_set1_epi32(0x00011000));
987
988 out = _mm256_shuffle_epi8(out, _mm256_setr_epi8(
989 2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1,
990 2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1));
991
992 return _mm256_permutevar8x32_epi32(out, _mm256_setr_epi32(0, 1, 2, 4, 5, 6, -1, -1));
993 }
994 #endif
995
996 #if defined(ZEND_INTRIN_SSSE3_NATIVE) || defined(ZEND_INTRIN_SSSE3_RESOLVER)
997 # if defined(ZEND_INTRIN_SSSE3_RESOLVER) && defined(HAVE_FUNC_ATTRIBUTE_TARGET)
998 static __m128i php_base64_decode_ssse3_reshuffle(__m128i in) __attribute__((target("ssse3")));
999 # endif
1000
php_base64_decode_ssse3_reshuffle(__m128i in)1001 static __m128i php_base64_decode_ssse3_reshuffle(__m128i in)
1002 {
1003 __m128i merge_ab_and_bc, out;
1004
1005 merge_ab_and_bc = _mm_maddubs_epi16(in, _mm_set1_epi32(0x01400140));
1006 /* 0000kkkk LLllllll 0000JJJJ JJjjKKKK
1007 * 0000hhhh IIiiiiii 0000GGGG GGggHHHH
1008 * 0000eeee FFffffff 0000DDDD DDddEEEE
1009 * 0000bbbb CCcccccc 0000AAAA AAaaBBBB */
1010
1011 out = _mm_madd_epi16(merge_ab_and_bc, _mm_set1_epi32(0x00011000));
1012 /* 00000000 JJJJJJjj KKKKkkkk LLllllll
1013 * 00000000 GGGGGGgg HHHHhhhh IIiiiiii
1014 * 00000000 DDDDDDdd EEEEeeee FFffffff
1015 * 00000000 AAAAAAaa BBBBbbbb CCcccccc */
1016
1017 return _mm_shuffle_epi8(out, _mm_setr_epi8(
1018 2, 1, 0,
1019 6, 5, 4,
1020 10, 9, 8,
1021 14, 13, 12,
1022 -1, -1, -1, -1));
1023 /* 00000000 00000000 00000000 00000000
1024 * LLllllll KKKKkkkk JJJJJJjj IIiiiiii
1025 * HHHHhhhh GGGGGGgg FFffffff EEEEeeee
1026 * DDDDDDdd CCcccccc BBBBbbbb AAAAAAaa */
1027 }
1028
/* Bulk SSSE3 decoding loop, expanded inside the SSSE3 decoder functions.
 *
 * Uses and updates these variables of the expanding function:
 *   c      - input cursor
 *   o      - output cursor
 *   outl   - number of bytes written to the output so far
 *   length - number of input bytes remaining at c
 *
 * While more than 15 + 6 + 2 input bytes remain, 16 characters are loaded
 * and classified with two nibble lookups (lut_lo by low nibble, lut_hi by
 * high nibble). If any byte's two table entries share a set bit the loop
 * stops without consuming the block, leaving it to the caller's scalar
 * decoder (presumably such bytes are outside the Base64 alphabet - confirm
 * against the tables' derivation).
 *
 * Otherwise each character is converted to its 6-bit value by adding an
 * offset chosen by its high nibble ('/' is special-cased via the 0x2f
 * compare: +16; other 0x2X: +19; 0x3X: +4; 0x4X/0x5X: -65; 0x6X/0x7X: -71),
 * the values are packed into 12 bytes and stored, and the cursors advance by
 * 16 (input) and 12 (output).
 *
 * The body is wrapped in do { } while (0) so that "MACRO;" expands to exactly
 * one statement; the inner break only leaves the while loop. */
#define PHP_BASE64_DECODE_SSSE3_LOOP \
	do { \
		while (length > 15 + 6 + 2) { \
			const __m128i lut_lo = _mm_setr_epi8( \
				0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, \
				0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A); \
			const __m128i lut_hi = _mm_setr_epi8( \
				0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08, \
				0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10); \
			const __m128i lut_roll = _mm_setr_epi8( \
				0, 16, 19, 4, -65, -65, -71, -71, \
				0, 0, 0, 0, 0, 0, 0, 0); \
			const __m128i byte_2f = _mm_set1_epi8(0x2f); \
			__m128i s = _mm_loadu_si128((const __m128i *)c); \
			const __m128i hi_nibbles = _mm_and_si128(_mm_srli_epi32(s, 4), byte_2f); \
			const __m128i lo_nibbles = _mm_and_si128(s, byte_2f); \
			const __m128i hi = _mm_shuffle_epi8(lut_hi, hi_nibbles); \
			const __m128i lo = _mm_shuffle_epi8(lut_lo, lo_nibbles); \
			__m128i eq_2f, roll; \
			\
			if (UNEXPECTED( \
				_mm_movemask_epi8( \
					_mm_cmpgt_epi8( \
						_mm_and_si128(lo, hi), _mm_set1_epi8(0))))) { \
				break; \
			} \
			\
			eq_2f = _mm_cmpeq_epi8(s, byte_2f); \
			roll = _mm_shuffle_epi8(lut_roll, _mm_add_epi8(eq_2f, hi_nibbles)); \
			\
			s = _mm_add_epi8(s, roll); \
			s = php_base64_decode_ssse3_reshuffle(s); \
			\
			_mm_storeu_si128((__m128i *)o, s); \
			\
			c += 16; \
			o += 12; \
			outl += 12; \
			length -= 16; \
		} \
	} while (0)
1075
1076 #endif
1077
1078 #if defined(ZEND_INTRIN_AVX2_NATIVE) || defined(ZEND_INTRIN_AVX2_RESOLVER) || defined(ZEND_INTRIN_SSSE3_NATIVE) || defined(ZEND_INTRIN_SSSE3_RESOLVER)
1079 # if defined(ZEND_INTRIN_AVX2_NATIVE) || defined(ZEND_INTRIN_SSSE3_NATIVE)
php_base64_decode_ex(const unsigned char * str,size_t length,bool strict)1080 PHPAPI zend_string *php_base64_decode_ex(const unsigned char *str, size_t length, bool strict)
1081 # elif defined(ZEND_INTRIN_AVX2_RESOLVER)
1082 zend_string *php_base64_decode_ex_avx2(const unsigned char *str, size_t length, bool strict)
1083 # else
1084 zend_string *php_base64_decode_ex_ssse3(const unsigned char *str, size_t length, bool strict)
1085 # endif
1086 {
1087 const unsigned char *c = str;
1088 unsigned char *o;
1089 size_t outl = 0;
1090 zend_string *result;
1091
1092 result = zend_string_alloc(length, 0);
1093 o = (unsigned char *)ZSTR_VAL(result);
1094
1095 /* See: "Faster Base64 Encoding and Decoding using AVX2 Instructions"
1096 * https://arxiv.org/pdf/1704.00605.pdf */
1097 # if defined(ZEND_INTRIN_AVX2_NATIVE) || defined(ZEND_INTRIN_AVX2_RESOLVER)
1098 while (length > 31 + 11 + 2) {
1099 __m256i lut_lo, lut_hi, lut_roll;
1100 __m256i hi_nibbles, lo_nibbles, hi, lo;
1101 __m256i str = _mm256_loadu_si256((__m256i *)c);
1102
1103 lut_lo = _mm256_setr_epi8(
1104 0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
1105 0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A,
1106 0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
1107 0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A);
1108
1109 lut_hi = _mm256_setr_epi8(
1110 0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
1111 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
1112 0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
1113 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10);
1114
1115 lut_roll = _mm256_setr_epi8(
1116 0, 16, 19, 4, -65, -65, -71, -71,
1117 0, 0, 0, 0, 0, 0, 0, 0,
1118 0, 16, 19, 4, -65, -65, -71, -71,
1119 0, 0, 0, 0, 0, 0, 0, 0);
1120
1121 hi_nibbles = _mm256_and_si256(_mm256_srli_epi32(str, 4), _mm256_set1_epi8(0x2f));
1122 lo_nibbles = _mm256_and_si256(str, _mm256_set1_epi8(0x2f));
1123 hi = _mm256_shuffle_epi8(lut_hi, hi_nibbles);
1124 lo = _mm256_shuffle_epi8(lut_lo, lo_nibbles);
1125
1126 if (!_mm256_testz_si256(lo, hi)) {
1127 break;
1128 } else {
1129 __m256i eq_2f, roll;
1130 eq_2f = _mm256_cmpeq_epi8(str, _mm256_set1_epi8(0x2f));
1131 roll = _mm256_shuffle_epi8(lut_roll, _mm256_add_epi8(eq_2f, hi_nibbles));
1132
1133
1134 str = _mm256_add_epi8(str, roll);
1135
1136 str = php_base64_decode_avx2_reshuffle(str);
1137
1138 _mm256_storeu_si256((__m256i *)o, str);
1139
1140 c += 32;
1141 o += 24;
1142 outl += 24;
1143 length -= 32;
1144 }
1145 }
1146 # else
1147 PHP_BASE64_DECODE_SSSE3_LOOP;
1148 # endif
1149
1150 if (!php_base64_decode_impl(c, length, (unsigned char*)ZSTR_VAL(result), &outl, strict)) {
1151 zend_string_efree(result);
1152 return NULL;
1153 }
1154
1155 ZSTR_LEN(result) = outl;
1156
1157 return result;
1158 }
1159
1160 # if defined(ZEND_INTRIN_SSSE3_RESOLVER) && defined(ZEND_INTRIN_AVX2_RESOLVER)
php_base64_decode_ex_ssse3(const unsigned char * str,size_t length,bool strict)1161 zend_string *php_base64_decode_ex_ssse3(const unsigned char *str, size_t length, bool strict)
1162 {
1163 const unsigned char *c = str;
1164 unsigned char *o;
1165 size_t outl = 0;
1166 zend_string *result;
1167
1168 result = zend_string_alloc(length, 0);
1169 o = (unsigned char *)ZSTR_VAL(result);
1170
1171 PHP_BASE64_DECODE_SSSE3_LOOP;
1172
1173 if (!php_base64_decode_impl(c, length, (unsigned char*)ZSTR_VAL(result), &outl, strict)) {
1174 zend_string_efree(result);
1175 return NULL;
1176 }
1177
1178 ZSTR_LEN(result) = outl;
1179
1180 return result;
1181 }
1182 # endif
1183 #endif /* ZEND_INTRIN_AVX2_NATIVE || ZEND_INTRIN_AVX2_RESOLVER || ZEND_INTRIN_SSSE3_NATIVE || ZEND_INTRIN_SSSE3_RESOLVER */
1184
1185 #if !defined(ZEND_INTRIN_AVX2_NATIVE) && !defined(ZEND_INTRIN_SSSE3_NATIVE)
1186 #if defined(ZEND_INTRIN_AVX2_RESOLVER) || defined(ZEND_INTRIN_SSSE3_RESOLVER)
php_base64_encode_default(const unsigned char * str,size_t length,zend_long flags)1187 zend_string *php_base64_encode_default(const unsigned char *str, size_t length, zend_long flags)
1188 #else
1189 PHPAPI zend_string *php_base64_encode_ex(const unsigned char *str, size_t length, zend_long flags)
1190 #endif
1191 {
1192 unsigned char *p;
1193 zend_string *result;
1194
1195 result = zend_string_safe_alloc(((length + 2) / 3), 4 * sizeof(char), 0, 0);
1196 p = (unsigned char *)ZSTR_VAL(result);
1197
1198 p = php_base64_encode_impl(str, length, p, flags);
1199
1200 ZSTR_LEN(result) = (p - (unsigned char *)ZSTR_VAL(result));
1201
1202 return result;
1203 }
1204 #endif
1205
1206 #if !defined(ZEND_INTRIN_AVX2_NATIVE) && !defined(ZEND_INTRIN_SSSE3_NATIVE)
1207 #if defined(ZEND_INTRIN_AVX2_RESOLVER) || defined(ZEND_INTRIN_SSSE3_RESOLVER)
php_base64_decode_ex_default(const unsigned char * str,size_t length,bool strict)1208 zend_string *php_base64_decode_ex_default(const unsigned char *str, size_t length, bool strict)
1209 #else
1210 PHPAPI zend_string *php_base64_decode_ex(const unsigned char *str, size_t length, bool strict)
1211 #endif
1212 {
1213 zend_string *result;
1214 size_t outl = 0;
1215
1216 result = zend_string_alloc(length, 0);
1217
1218 if (!php_base64_decode_impl(str, length, (unsigned char*)ZSTR_VAL(result), &outl, strict)) {
1219 zend_string_efree(result);
1220 return NULL;
1221 }
1222
1223 ZSTR_LEN(result) = outl;
1224
1225 return result;
1226 }
1227 #endif
1228 /* }}} */
1229
1230 /* {{{ Encodes string using MIME base64 algorithm */
PHP_FUNCTION(base64_encode)1231 PHP_FUNCTION(base64_encode)
1232 {
1233 char *str;
1234 size_t str_len;
1235 zend_string *result;
1236
1237 ZEND_PARSE_PARAMETERS_START(1, 1)
1238 Z_PARAM_STRING(str, str_len)
1239 ZEND_PARSE_PARAMETERS_END();
1240
1241 result = php_base64_encode((unsigned char*)str, str_len);
1242 RETURN_STR(result);
1243 }
1244 /* }}} */
1245
1246 /* {{{ Decodes string using MIME base64 algorithm */
PHP_FUNCTION(base64_decode)1247 PHP_FUNCTION(base64_decode)
1248 {
1249 char *str;
1250 bool strict = 0;
1251 size_t str_len;
1252 zend_string *result;
1253
1254 ZEND_PARSE_PARAMETERS_START(1, 2)
1255 Z_PARAM_STRING(str, str_len)
1256 Z_PARAM_OPTIONAL
1257 Z_PARAM_BOOL(strict)
1258 ZEND_PARSE_PARAMETERS_END();
1259
1260 result = php_base64_decode_ex((unsigned char*)str, str_len, strict);
1261 if (result != NULL) {
1262 RETURN_STR(result);
1263 } else {
1264 RETURN_FALSE;
1265 }
1266 }
1267 /* }}} */
1268