1 /*
2  * "streamable kanji code filter and converter"
3  * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
4  *
5  * LICENSE NOTICES
6  *
7  * This file is part of "streamable kanji code filter and converter",
8  * which is distributed under the terms of GNU Lesser General Public
9  * License (version 2) as published by the Free Software Foundation.
10  *
11  * This software is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with "streamable kanji code filter and converter";
18  * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
19  * Suite 330, Boston, MA  02111-1307  USA
20  *
21  * The author of this file:
22  *
23  */
24 /*
25  * The source code included in this file was separated from mbfilter.c
26  * by moriyoshi koizumi <moriyoshi@php.net> on 4 dec 2002.
27  *
28  */
29 
30 #include "mbfilter.h"
31 #include "mbfilter_utf7.h"
32 #include "utf7_helper.h"
33 
/* Forward declarations of file-local routines that are referenced by the
 * encoding descriptor and the conversion vtable before they are defined. */
static int mbfl_filt_conv_utf7_wchar_flush(mbfl_convert_filter *filter);
static size_t mb_utf7_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
static void mb_wchar_to_utf7(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
static bool mb_check_utf7(unsigned char *in, size_t in_len);
38 
/* Base64 alphabet: entries 0..63 map a 6-bit value to its ASCII digit.
 * The terminating NUL of the string literal is stored as a 65th element. */
static const unsigned char mbfl_base64_table[] =
	"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
	"abcdefghijklmnopqrstuvwxyz"
	"0123456789+/";
51 
/* NULL-terminated list of alternative names accepted for this encoding. */
static const char *mbfl_encoding_utf7_aliases[] = {"utf7", NULL};

/* Encoding descriptor for UTF-7. It ties together the alias list, the two
 * byte-at-a-time filter vtables, and the three buffer-oriented routines
 * (decode, encode, validate) implemented in this file.
 * NOTE(review): the meaning of each positional slot (including the NULL
 * entry) is fixed by the mbfl_encoding declaration, which is not visible
 * here -- confirm the field order against its header before editing. */
const mbfl_encoding mbfl_encoding_utf7 = {
	mbfl_no_encoding_utf7,
	"UTF-7",
	"UTF-7",
	mbfl_encoding_utf7_aliases,
	NULL,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_utf7_wchar,
	&vtbl_wchar_utf7,
	mb_utf7_to_wchar,
	mb_wchar_to_utf7,
	mb_check_utf7
};
67 
/* Filter vtable for the UTF-7 -> wchar direction: it names
 * mbfl_filt_conv_utf7_wchar as the per-byte function and
 * mbfl_filt_conv_utf7_wchar_flush as the flush function.
 * NOTE(review): slot meanings (including the two NULL entries) are defined
 * by struct mbfl_convert_vtbl, declared outside this file -- confirm there. */
const struct mbfl_convert_vtbl vtbl_utf7_wchar = {
	mbfl_no_encoding_utf7,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_utf7_wchar,
	mbfl_filt_conv_utf7_wchar_flush,
	NULL,
};
77 
/* Filter vtable for the wchar -> UTF-7 direction: it names
 * mbfl_filt_conv_wchar_utf7 as the per-codepoint function and
 * mbfl_filt_conv_wchar_utf7_flush as the flush function.
 * NOTE(review): slot meanings (including the two NULL entries) are defined
 * by struct mbfl_convert_vtbl, declared outside this file -- confirm there. */
const struct mbfl_convert_vtbl vtbl_wchar_utf7 = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_utf7,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_utf7,
	mbfl_filt_conv_wchar_utf7_flush,
	NULL,
};
87 
88 
/* Evaluate `statement`; if it yields a negative value, make the enclosing
 * function return -1 immediately. Only usable inside functions returning int. */
#define CK(statement)	do { if ((statement) < 0) return (-1); } while (0)
90 
/* Map one Base64 character to its 6-bit value.
 *
 * Returns 0..63 for 'A'-'Z', 'a'-'z', '0'-'9', '+' and '/', and -1 for every
 * other byte.
 *
 * The return type is signed so that the -1 sentinel is a genuinely negative
 * value. With an unsigned return type the sentinel would become UINT_MAX and
 * a caller testing `result < 0` would depend on an implementation-defined
 * unsigned-to-int conversion. */
static int decode_base64_char(unsigned char c)
{
	if (c >= 'A' && c <= 'Z') {
		return c - 'A';        /* 0..25 */
	}
	if (c >= 'a' && c <= 'z') {
		return c - 'a' + 26;   /* 26..51 */
	}
	if (c >= '0' && c <= '9') {
		return c - '0' + 52;   /* 52..61 */
	}
	if (c == '+') {
		return 62;
	}
	if (c == '/') {
		return 63;
	}
	return -1; /* not a Base64 character */
}
106 
/* Byte-at-a-time UTF-7 decoder: consumes one input byte `c` and forwards any
 * completed codepoint (or an MBFL_BAD_INPUT marker) to filter->output_function.
 *
 * filter->status tracks the position in the input:
 *   0      directly encoded text
 *   1      immediately after the '+' shift character
 *   2..9   inside a Base64 run; eight Base64 characters (48 bits) carry three
 *          16-bit UTF-16 units, after which the cycle restarts at 2
 * filter->cache holds the partially assembled bits of the current UTF-16 unit
 * in its low 16 bits and, under mask 0xfff0000, a pending first surrogate half.
 *
 * Returns 0, or -1 when a CK()-wrapped output call yields a negative value. */
int mbfl_filt_conv_utf7_wchar(int c, mbfl_convert_filter *filter)
{
	int s, n = -1;

	if (filter->status) { /* Modified Base64 */
		n = decode_base64_char(c);
		if (n < 0) {
			/* `c` is not a Base64 character, so the Base64 run ends here */
			if (filter->cache) {
				/* Either we were expecting the 2nd half of a surrogate pair which
				 * never came, or else the last Base64 data was not padded with zeroes */
				(*filter->output_function)(MBFL_BAD_INPUT, filter->data);
			}
			if (c == '-') {
				/* '-' terminates the run and is itself swallowed */
				if (filter->status == 1) { /* "+-" -> "+" */
					CK((*filter->output_function)('+', filter->data));
				}
			} else if (c >= 0 && c < 0x80) { /* ASCII exclude '-' */
				CK((*filter->output_function)(c, filter->data));
			} else { /* illegal character */
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			}
			/* Back to directly encoded text with nothing buffered */
			filter->cache = filter->status = 0;
			return 0;
		}
	}

	/* From here on, either status == 0 or `n` is a valid 6-bit Base64 value */
	switch (filter->status) {
	/* directly encoded characters */
	case 0:
		if (c == '+') { /* '+' shift character */
			filter->status = 1;
		} else if (c >= 0 && c < 0x80) { /* ASCII */
			CK((*filter->output_function)(c, filter->data));
		} else { /* illegal character */
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	/* decode Modified Base64 */
	case 1:
	case 2:
		/* 1st Base64 character: bits 15..10 of the 1st UTF-16 unit */
		filter->cache |= n << 10;
		filter->status = 3;
		break;
	case 3:
		/* 2nd Base64 character: bits 9..4 of the 1st UTF-16 unit */
		filter->cache |= n << 4;
		filter->status = 4;
		break;
	case 4:
		/* 3rd Base64 character: its top 4 bits complete the 1st UTF-16 unit;
		 * its low 2 bits become bits 15..14 of the 2nd unit */
		s = ((n >> 2) & 0xf) | (filter->cache & 0xffff);
		n = (n & 0x3) << 14;
		filter->status = 5;
		if (s >= 0xd800 && s < 0xdc00) {
			/* 1st part of surrogate pair */
			if (filter->cache & 0xfff0000) {
				/* We were waiting for the 2nd part of a surrogate pair */
				(*filter->output_function)(MBFL_BAD_INPUT, filter->data);
			}
			/* Park the 10 payload bits at bit 16 and add 0x400000 so that a later
			 * `>> 6` yields (payload << 10) + 0x10000; the leftover bits of the
			 * next unit go in the low 16 bits */
			s = (((s & 0x3ff) << 16) + 0x400000) | n;
			filter->cache = s;
		} else if (s >= 0xdc00 && s < 0xe000) {
			/* 2nd part of surrogate pair */
			if (filter->cache & 0xfff0000) {
				/* Combine with the parked 1st half into one codepoint */
				s &= 0x3ff;
				s |= (filter->cache & 0xfff0000) >> 6;
				filter->cache = n;
				CK((*filter->output_function)(s, filter->data));
			} else {
				/* 2nd half arrived without a preceding 1st half */
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
				filter->cache = n;
			}
		} else {
			/* Ordinary (non-surrogate) UTF-16 unit */
			if (filter->cache & 0xfff0000) {
				/* We were waiting for the 2nd part of a surrogate pair */
				(*filter->output_function)(MBFL_BAD_INPUT, filter->data);
			}
			filter->cache = n;
			CK((*filter->output_function)(s, filter->data));
		}
		break;

	case 5:
		/* 4th Base64 character: bits 13..8 of the 2nd UTF-16 unit */
		filter->cache |= n << 8;
		filter->status = 6;
		break;
	case 6:
		/* 5th Base64 character: bits 7..2 of the 2nd UTF-16 unit */
		filter->cache |= n << 2;
		filter->status = 7;
		break;
	case 7:
		/* 6th Base64 character: its top 2 bits complete the 2nd UTF-16 unit;
		 * its low 4 bits become bits 15..12 of the 3rd unit */
		s = ((n >> 4) & 0x3) | (filter->cache & 0xffff);
		n = (n & 0xf) << 12;
		filter->status = 8;
		if (s >= 0xd800 && s < 0xdc00) {
			/* 1st part of surrogate pair */
			if (filter->cache & 0xfff0000) {
				/* We were waiting for the 2nd part of a surrogate pair */
				(*filter->output_function)(MBFL_BAD_INPUT, filter->data);
			}
			s = (((s & 0x3ff) << 16) + 0x400000) | n;
			filter->cache = s;
		} else if (s >= 0xdc00 && s < 0xe000) {
			/* 2nd part of surrogate pair */
			if (filter->cache & 0xfff0000) {
				s &= 0x3ff;
				s |= (filter->cache & 0xfff0000) >> 6;
				filter->cache = n;
				CK((*filter->output_function)(s, filter->data));
			} else {
				/* 2nd half arrived without a preceding 1st half */
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
				filter->cache = n;
			}
		} else {
			/* Ordinary (non-surrogate) UTF-16 unit */
			if (filter->cache & 0xfff0000) {
				/* We were waiting for the 2nd part of a surrogate pair */
				(*filter->output_function)(MBFL_BAD_INPUT, filter->data);
			}
			filter->cache = n;
			CK((*filter->output_function)(s, filter->data));
		}
		break;

	case 8:
		/* 7th Base64 character: bits 11..6 of the 3rd UTF-16 unit */
		filter->cache |= n << 6;
		filter->status = 9;
		break;
	case 9:
		/* 8th Base64 character: completes the 3rd UTF-16 unit with no bits
		 * left over, so the cycle restarts at status 2 */
		s = n | (filter->cache & 0xffff);
		filter->status = 2;
		if (s >= 0xd800 && s < 0xdc00) {
			/* 1st part of surrogate pair */
			if (filter->cache & 0xfff0000) {
				/* We were waiting for the 2nd part of a surrogate pair */
				(*filter->output_function)(MBFL_BAD_INPUT, filter->data);
			}
			s = (((s & 0x3ff) << 16) + 0x400000);
			filter->cache = s;
		} else if (s >= 0xdc00 && s < 0xe000) {
			/* 2nd part of surrogate pair */
			if (filter->cache & 0xfff0000) {
				s &= 0x3ff;
				s |= (filter->cache & 0xfff0000) >> 6;
				filter->cache = 0;
				CK((*filter->output_function)(s, filter->data));
			} else {
				/* 2nd half arrived without a preceding 1st half */
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
				filter->cache = 0;
			}
		} else {
			/* Ordinary (non-surrogate) UTF-16 unit */
			if (filter->cache & 0xfff0000) {
				/* We were waiting for the 2nd part of a surrogate pair */
				(*filter->output_function)(MBFL_BAD_INPUT, filter->data);
			}
			filter->cache = 0;
			CK((*filter->output_function)(s, filter->data));
		}
		break;

		EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
267 
/* Flush hook for the UTF-7 decoder filter.
 *
 * If any bits are still buffered in filter->cache, the input stopped in the
 * middle of a surrogate pair or with non-zero Base64 padding: the cache is
 * cleared and one MBFL_BAD_INPUT marker is emitted. Afterwards the flush is
 * forwarded to filter->flush_function when one is set. Always returns 0. */
static int mbfl_filt_conv_utf7_wchar_flush(mbfl_convert_filter *filter)
{
	const bool has_leftover_bits = filter->cache != 0;

	if (has_leftover_bits) {
		filter->cache = 0;
		(*filter->output_function)(MBFL_BAD_INPUT, filter->data);
	}

	if (!filter->flush_function) {
		return 0;
	}

	(*filter->flush_function)(filter->data);
	return 0;
}
283 
/* Codepoint-at-a-time UTF-7 encoder: consumes one codepoint `c` and writes
 * the resulting bytes through filter->output_function.
 *
 * filter->status:
 *   0  emitting directly encoded text
 *   1  inside Base64 with 16 pending bits in filter->cache
 *   2  inside Base64 with 20 pending bits (4 left over + 16 new)
 *   3  inside Base64 with 18 pending bits (2 left over + 16 new)
 *
 * Returns 0, or -1 when a CK()-wrapped call yields a negative value. */
int mbfl_filt_conv_wchar_utf7(int c, mbfl_convert_filter *filter)
{
	int s;

	/* Classify `c`:
	 *   n == 0  must be Base64-encoded
	 *   n == 1  written directly; an open Base64 run is closed with '-' first
	 *   n == 2  written directly; an open Base64 run is closed without '-' */
	int n = 0;
	if (c >= 0 && c < 0x80) { /* ASCII */
		if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || c == '\0' || c == '/' || c == '-') {
			n = 1;
		} else if (c == ' ' || c == '\t' || c == '\r' || c == '\n' || c == '\'' || c == '(' || c == ')' || c == ',' || c == '.' || c == ':' || c == '?') {
			n = 2;
		}
	} else if (c >= 0 && c < MBFL_WCSPLANE_UCS2MAX) {
		/* Fits in one UTF-16 unit; handled by the state machine below with n == 0 */
		;
	} else if (c >= MBFL_WCSPLANE_SUPMIN && c < MBFL_WCSPLANE_UTF32MAX) {
		/* Split into two surrogate halves and feed each one back through
		 * filter->filter_function */
		CK((*filter->filter_function)(((c >> 10) - 0x40) | 0xd800, filter));
		CK((*filter->filter_function)((c & 0x3ff) | 0xdc00, filter));
		return 0;
	} else {
		/* Outside every accepted range */
		CK(mbfl_filt_conv_illegal_output(c, filter));
		return 0;
	}

	switch (filter->status) {
	case 0:
		if (n != 0) { /* directly encode characters */
			CK((*filter->output_function)(c, filter->data));
		} else { /* Modified Base64 */
			/* Open a Base64 run; the codepoint itself is emitted on later calls */
			CK((*filter->output_function)('+', filter->data));
			filter->status = 1;
			filter->cache = c;
		}
		break;

	/* encode Modified Base64 */
	case 1:
		/* 16 bits pending: emit the top 12 as two Base64 characters */
		s = filter->cache;
		CK((*filter->output_function)(mbfl_base64_table[(s >> 10) & 0x3f], filter->data));
		CK((*filter->output_function)(mbfl_base64_table[(s >> 4) & 0x3f], filter->data));
		if (n != 0) {
			/* Run ends: emit the last 4 bits padded with zeroes, then `c` */
			CK((*filter->output_function)(mbfl_base64_table[(s << 2) & 0x3c], filter->data));
			if (n == 1) {
				CK((*filter->output_function)('-', filter->data));
			}
			CK((*filter->output_function)(c, filter->data));
			filter->status = 0;
		} else {
			/* Keep the 4 leftover bits above the 16 bits of the new codepoint */
			filter->status = 2;
			filter->cache = ((s & 0xf) << 16) | c;
		}
		break;

	case 2:
		/* 20 bits pending: emit the top 18 as three Base64 characters */
		s = filter->cache;
		CK((*filter->output_function)(mbfl_base64_table[(s >> 14) & 0x3f], filter->data));
		CK((*filter->output_function)(mbfl_base64_table[(s >> 8) & 0x3f], filter->data));
		CK((*filter->output_function)(mbfl_base64_table[(s >> 2) & 0x3f], filter->data));
		if (n != 0) {
			/* Run ends: emit the last 2 bits padded with zeroes, then `c` */
			CK((*filter->output_function)(mbfl_base64_table[(s << 4) & 0x30], filter->data));
			if (n == 1) {
				CK((*filter->output_function)('-', filter->data));
			}
			CK((*filter->output_function)(c, filter->data));
			filter->status = 0;
		} else {
			/* Keep the 2 leftover bits above the 16 bits of the new codepoint */
			filter->status = 3;
			filter->cache = ((s & 0x3) << 16) | c;
		}
		break;

	case 3:
		/* 18 bits pending: exactly three Base64 characters, nothing left over */
		s = filter->cache;
		CK((*filter->output_function)(mbfl_base64_table[(s >> 12) & 0x3f], filter->data));
		CK((*filter->output_function)(mbfl_base64_table[(s >> 6) & 0x3f], filter->data));
		CK((*filter->output_function)(mbfl_base64_table[s & 0x3f], filter->data));
		if (n != 0) {
			if (n == 1) {
				CK((*filter->output_function)('-', filter->data));
			}
			CK((*filter->output_function)(c, filter->data));
			filter->status = 0;
		} else {
			/* Start the next 16-bit unit */
			filter->status = 1;
			filter->cache = c;
		}
		break;

		EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
375 
/* Flush hook for the UTF-7 encoder filter.
 *
 * Resets the filter state, writes out whatever bits were still pending in an
 * open Base64 run (zero-padded to a whole Base64 character) followed by the
 * closing '-', and finally forwards the flush to filter->flush_function when
 * one is set.
 *
 * Returns 0, or -1 when a CK()-wrapped output call yields a negative value. */
int mbfl_filt_conv_wchar_utf7_flush(mbfl_convert_filter *filter)
{
	const int pending_state = filter->status;
	const int pending_bits = filter->cache;

	filter->cache = 0;
	filter->status = 0;

	/* flush fragments */
	if (pending_state == 1) {
		/* 16 bits pending: 6 + 6 + 4 (padded) */
		CK((*filter->output_function)(mbfl_base64_table[(pending_bits >> 10) & 0x3f], filter->data));
		CK((*filter->output_function)(mbfl_base64_table[(pending_bits >> 4) & 0x3f], filter->data));
		CK((*filter->output_function)(mbfl_base64_table[(pending_bits << 2) & 0x3c], filter->data));
		CK((*filter->output_function)('-', filter->data));
	} else if (pending_state == 2) {
		/* 20 bits pending: 6 + 6 + 6 + 2 (padded) */
		CK((*filter->output_function)(mbfl_base64_table[(pending_bits >> 14) & 0x3f], filter->data));
		CK((*filter->output_function)(mbfl_base64_table[(pending_bits >> 8) & 0x3f], filter->data));
		CK((*filter->output_function)(mbfl_base64_table[(pending_bits >> 2) & 0x3f], filter->data));
		CK((*filter->output_function)(mbfl_base64_table[(pending_bits << 4) & 0x30], filter->data));
		CK((*filter->output_function)('-', filter->data));
	} else if (pending_state == 3) {
		/* 18 bits pending: 6 + 6 + 6, no padding needed */
		CK((*filter->output_function)(mbfl_base64_table[(pending_bits >> 12) & 0x3f], filter->data));
		CK((*filter->output_function)(mbfl_base64_table[(pending_bits >> 6) & 0x3f], filter->data));
		CK((*filter->output_function)(mbfl_base64_table[pending_bits & 0x3f], filter->data));
		CK((*filter->output_function)('-', filter->data));
	}

	if (filter->flush_function) {
		(*filter->flush_function)(filter->data);
	}

	return 0;
}
413 
is_base64_end(unsigned char c)414 static inline bool is_base64_end(unsigned char c)
415 {
416 	return c >= DASH;
417 }
418 
/* Return true for the punctuation characters that may appear in UTF-7 either
 * directly or Base64-encoded. */
static bool is_optional_direct(unsigned char c)
{
	switch (c) {
	case '!': case '"': case '#': case '$': case '%':
	case '&': case '*': case ';': case '<': case '=':
	case '>': case '@': case '[': case ']': case '^':
	case '_': case '`': case '{': case '|': case '}':
		return true;
	default:
		return false;
	}
}
426 
/* Return true for the whitespace and punctuation characters after which the
 * encoder does not write an explicit '-' when closing a Base64 section. */
static bool can_end_base64(uint32_t c)
{
	switch (c) {
	case ' ':
	case '\t':
	case '\r':
	case '\n':
	case '\'':
	case '(':
	case ')':
	case ',':
	case '.':
	case ':':
	case '?':
		return true;
	default:
		return false;
	}
}
431 
decode_base64(unsigned char c)432 static unsigned char decode_base64(unsigned char c)
433 {
434 	if (c >= 'A' && c <= 'Z') {
435 		return c - 65;
436 	} else if (c >= 'a' && c <= 'z') {
437 		return c - 71;
438 	} else if (c >= '0' && c <= '9') {
439 		return c + 4;
440 	} else if (c == '+') {
441 		return 62;
442 	} else if (c == '/') {
443 		return 63;
444 	} else if (c == '-') {
445 		return DASH;
446 	} else if (can_end_base64(c) || is_optional_direct(c) || c == '\0') {
447 		return DIRECT;
448 	} else if (c <= 0x7F) {
449 		return ASCII;
450 	}
451 	return ILLEGAL;
452 }
453 
handle_utf16_cp(uint16_t cp,uint32_t * out,uint16_t * surrogate1)454 static uint32_t* handle_utf16_cp(uint16_t cp, uint32_t *out, uint16_t *surrogate1)
455 {
456 retry:
457 	if (*surrogate1) {
458 		if (cp >= 0xDC00 && cp <= 0xDFFF) {
459 			*out++ = ((*surrogate1 & 0x3FF) << 10) + (cp & 0x3FF) + 0x10000;
460 			*surrogate1 = 0;
461 		} else {
462 			*out++ = MBFL_BAD_INPUT;
463 			*surrogate1 = 0;
464 			goto retry;
465 		}
466 	} else if (cp >= 0xD800 && cp <= 0xDBFF) {
467 		*surrogate1 = cp;
468 	} else if (cp >= 0xDC00 && cp <= 0xDFFF) {
469 		/* 2nd part of surrogate pair came unexpectedly */
470 		*out++ = MBFL_BAD_INPUT;
471 	} else {
472 		*out++ = cp;
473 	}
474 	return out;
475 }
476 
handle_base64_end(unsigned char n,unsigned char ** p,uint32_t * out,bool * base64,bool abrupt,uint16_t * surrogate1)477 static uint32_t* handle_base64_end(unsigned char n, unsigned char **p, uint32_t *out, bool *base64, bool abrupt, uint16_t *surrogate1)
478 {
479 	if (abrupt || *surrogate1) {
480 		*out++ = MBFL_BAD_INPUT;
481 		*surrogate1 = 0;
482 	}
483 
484 	if (n == ILLEGAL) {
485 		*out++ = MBFL_BAD_INPUT;
486 	} else if (n == DIRECT || n == ASCII) {
487 		(*p)--; /* Unconsume byte */
488 	}
489 
490 	*base64 = false;
491 	return out;
492 }
493 
/* Bulk UTF-7 decoder.
 *
 * Reads bytes starting at `*in` (`*in_len` of them) and writes up to `bufsize`
 * decoded codepoints, or MBFL_BAD_INPUT markers for malformed input, to `buf`.
 * On return `*in` and `*in_len` describe the input which has not been consumed
 * yet, and the number of wchars written to `buf` is returned.
 *
 * `*state` carries decoder state between calls: bit 0 is set while inside a
 * Base64 section, and the bits above it hold the first half of a surrogate
 * pair which is still waiting for its second half. */
static size_t mb_utf7_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
{
	ZEND_ASSERT(bufsize >= 5); /* This function will infinite-loop if called with a tiny output buffer */

	/* Why does this require a minimum output buffer size of 5?
	 * There is one case where one iteration of the main 'while' loop below will emit 5 wchars:
	 * that is if the first half of a surrogate pair is followed by an otherwise valid codepoint which
	 * is not the 2nd half of a surrogate pair, then another valid codepoint, then the Base64-encoded
	 * section ends with a byte which is not a valid Base64 character, AND which also is not in a
	 * position where we would expect the Base64-encoded section to end */

	unsigned char *p = *in, *e = p + *in_len;
	uint32_t *out = buf, *limit = buf + bufsize;

	/* Unpack the state saved by the previous call */
	bool base64 = *state & 1;
	uint16_t surrogate1 = (*state >> 1); /* First half of a surrogate pair which still needs 2nd half */

	while (p < e && out < limit) {
		if (base64) {
			/* Base64 section */
			/* One pass through this branch handles up to 8 Base64 characters
			 * (three UTF-16 units) and may write up to 5 wchars, so stop early
			 * if that much room is not available */
			if ((limit - out) < 5) {
				break;
			}

			/* Characters 1-3 carry the 1st UTF-16 unit. The section may end
			 * cleanly only before character 1; anywhere else is abrupt */
			unsigned char n1 = decode_base64(*p++);
			if (is_base64_end(n1)) {
				out = handle_base64_end(n1, &p, out, &base64, false, &surrogate1);
				continue;
			} else if (p == e) {
				out = handle_base64_end(n1, &p, out, &base64, true, &surrogate1);
				continue;
			}
			unsigned char n2 = decode_base64(*p++);
			if (is_base64_end(n2) || p == e) {
				out = handle_base64_end(n2, &p, out, &base64, true, &surrogate1);
				continue;
			}
			unsigned char n3 = decode_base64(*p++);
			if (is_base64_end(n3)) {
				out = handle_base64_end(n3, &p, out, &base64, true, &surrogate1);
				continue;
			}
			out = handle_utf16_cp((n1 << 10) | (n2 << 4) | ((n3 & 0x3C) >> 2), out, &surrogate1);
			if (p == e) {
				/* It is an error if trailing padding bits are not zeroes or if we were
				 * expecting the 2nd part of a surrogate pair when Base64 section ends */
				if ((n3 & 0x3) || surrogate1) {
					*out++ = MBFL_BAD_INPUT;
					surrogate1 = 0;
				}
				break;
			}

			/* Characters 4-6 (plus the low 2 bits of n3) carry the 2nd UTF-16
			 * unit. Ending before character 4 is clean only if those leftover
			 * bits of n3 are zero */
			unsigned char n4 = decode_base64(*p++);
			if (is_base64_end(n4)) {
				out = handle_base64_end(n4, &p, out, &base64, n3 & 0x3, &surrogate1);
				continue;
			} else if (p == e) {
				out = handle_base64_end(n4, &p, out, &base64, true, &surrogate1);
				continue;
			}
			unsigned char n5 = decode_base64(*p++);
			if (is_base64_end(n5) || p == e) {
				out = handle_base64_end(n5, &p, out, &base64, true, &surrogate1);
				continue;
			}
			unsigned char n6 = decode_base64(*p++);
			if (is_base64_end(n6)) {
				out = handle_base64_end(n6, &p, out, &base64, true, &surrogate1);
				continue;
			}
			out = handle_utf16_cp((n3 << 14) | (n4 << 8) | (n5 << 2) | ((n6 & 0x30) >> 4), out, &surrogate1);
			if (p == e) {
				/* Same end-of-input check as above, for the 4 leftover bits of n6 */
				if ((n6 & 0xF) || surrogate1) {
					*out++ = MBFL_BAD_INPUT;
					surrogate1 = 0;
				}
				break;
			}

			/* Characters 7-8 (plus the low 4 bits of n6) carry the 3rd UTF-16
			 * unit. Ending before character 7 is clean only if those leftover
			 * bits of n6 are zero */
			unsigned char n7 = decode_base64(*p++);
			if (is_base64_end(n7)) {
				out = handle_base64_end(n7, &p, out, &base64, n6 & 0xF, &surrogate1);
				continue;
			} else if (p == e) {
				out = handle_base64_end(n7, &p, out, &base64, true, &surrogate1);
				continue;
			}
			unsigned char n8 = decode_base64(*p++);
			if (is_base64_end(n8)) {
				out = handle_base64_end(n8, &p, out, &base64, true, &surrogate1);
				continue;
			}
			out = handle_utf16_cp((n6 << 12) | (n7 << 6) | n8, out, &surrogate1);
		} else {
			/* ASCII text section */
			unsigned char c = *p++;

			if (c == '+') {
				if (p < e) {
					if (*p == '-') {
						/* "+-" is the escape for a literal '+' */
						*out++ = '+';
						p++;
					} else {
						base64 = true;
					}
				}
				/* If a + comes at the end of the input string... do nothing about it */
			} else if (c <= 0x7F) {
				*out++ = c;
			} else {
				*out++ = MBFL_BAD_INPUT;
			}
		}
	}

	/* All input consumed while a first surrogate half is still pending */
	if (p == e && surrogate1) {
		ZEND_ASSERT(out < limit);
		*out++ = MBFL_BAD_INPUT;
	}

	/* Pack the state for the next call and report how much input remains */
	*state = (surrogate1 << 1) | base64;
	*in_len = e - p;
	*in = p;
	return out - buf;
}
620 
/* Return true if the bulk encoder writes codepoint `c` as a plain byte
 * instead of Base64: ASCII letters and digits, NUL, '/', '-', and every
 * character accepted by can_end_base64(). */
static bool should_direct_encode(uint32_t c)
{
	if (c >= 'A' && c <= 'Z') {
		return true;
	}
	if (c >= 'a' && c <= 'z') {
		return true;
	}
	if (c >= '0' && c <= '9') {
		return true;
	}
	if (c == '\0' || c == '/' || c == '-') {
		return true;
	}
	return can_end_base64(c);
}
625 
/* Pack / unpack the encoder's local variables into buf->state.
 * Layout: bit 0 = `base64` flag, bits 1-3 = `nbits`, bits 4 and up = `cache`.
 * Both macros expect locals named `base64`, `nbits`, `cache` and `buf` to be
 * in scope. RESTORE_CONVERSION_STATE expands to three separate statements,
 * so it must not be used as the unbraced body of an `if` or loop. */
#define SAVE_CONVERSION_STATE() buf->state = (cache << 4) | (nbits << 1) | base64
#define RESTORE_CONVERSION_STATE() base64 = (buf->state & 1); nbits = (buf->state >> 1) & 0x7; cache = (buf->state >> 4)
628 
/* Bulk UTF-7 encoder.
 *
 * Encodes `len` codepoints from `in` and appends the resulting bytes to `buf`.
 * Codepoints accepted by should_direct_encode() are copied as-is; codepoints
 * >= MBFL_WCSPLANE_UTF32MAX are handed to MB_CONVERT_ERROR; everything else
 * is written as UTF-16 inside a '+'-introduced Base64 section.
 *
 * When `end` is true, the open Base64 section (if any) is drained and closed
 * with '-'. Otherwise the encoder state is saved in buf->state so that a
 * later call can continue where this one stopped. */
static void mb_wchar_to_utf7(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);

	/* Make enough space such that if the input string is all ASCII (not including '+'),
	 * we can copy it to the output buffer without checking for available space.
	 * However, if we find anything which is not plain ASCII, additional checks for
	 * output buffer space will be needed. */
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	bool base64;
	unsigned char nbits, cache; /* `nbits` is the number of cached bits; either 0, 2, or 4 */
	RESTORE_CONVERSION_STATE();

	while (len--) {
		uint32_t w = *in++;
		if (base64) {
			if (should_direct_encode(w)) {
				/* End of Base64 section. Drain buffered bits (if any), close Base64 section */
				base64 = false;
				in--; len++; /* Unconsume codepoint; it will be handled by 'ASCII section' code below */
				/* Room for the remaining input plus a drain byte and a '-' */
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
				if (nbits) {
					/* Left-align the cached bits in a 6-bit group, zero-padded */
					out = mb_convert_buf_add(out, mbfl_base64_table[(cache << (6 - nbits)) & 0x3F]);
				}
				nbits = cache = 0;
				if (!can_end_base64(w)) {
					out = mb_convert_buf_add(out, '-');
				}
			} else if (w >= MBFL_WCSPLANE_UTF32MAX) {
				/* Make recursive call to add an error marker character */
				/* State is saved first so that the recursive call sees it, and
				 * reloaded afterwards because that call may have changed it */
				SAVE_CONVERSION_STATE();
				MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_utf7);
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
				RESTORE_CONVERSION_STATE();
			} else {
				/* Encode codepoint, preceded by any cached bits, as Base64
				 * Make enough space in the output buffer to hold both any bytes that
				 * we emit right here, plus any finishing byte which might need to
				 * be emitted if the input string ends abruptly */
				uint64_t bits;
				if (w >= MBFL_WCSPLANE_SUPMIN) {
					/* Must use surrogate pair */
					MB_CONVERT_BUF_ENSURE(buf, out, limit, 7);
					w -= 0x10000;
					/* High 10 bits go into the 0xD800 half, low 10 bits into the 0xDC00 half */
					bits = ((uint64_t)cache << 32) | 0xD800DC00L | ((w & 0xFFC00) << 6) | (w & 0x3FF);
					nbits += 32;
				} else {
					MB_CONVERT_BUF_ENSURE(buf, out, limit, 4);
					bits = (cache << 16) | w;
					nbits += 16;
				}

				/* Emit whole 6-bit groups from the top; fewer than 6 bits stay behind */
				while (nbits >= 6) {
					out = mb_convert_buf_add(out, mbfl_base64_table[(bits >> (nbits - 6)) & 0x3F]);
					nbits -= 6;
				}
				/* Truncating to unsigned char keeps the low bits, which include
				 * the `nbits` bits that have not been emitted yet */
				cache = bits;
			}
		} else {
			/* ASCII section */
			if (should_direct_encode(w)) {
				out = mb_convert_buf_add(out, w);
			} else if (w >= MBFL_WCSPLANE_UTF32MAX) {
				/* Not inside Base64, so the state handed to the recursive call is 0 */
				buf->state = 0;
				MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_utf7);
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
				RESTORE_CONVERSION_STATE();
			} else {
				/* Open a Base64 section */
				out = mb_convert_buf_add(out, '+');
				base64 = true;
				in--; len++; /* Unconsume codepoint; it will be handled by Base64 code above */
			}
		}
	}

	if (end) {
		/* Final call: drain leftover bits and close the Base64 section */
		if (nbits) {
			out = mb_convert_buf_add(out, mbfl_base64_table[(cache << (6 - nbits)) & 0x3F]);
		}
		if (base64) {
			MB_CONVERT_BUF_ENSURE(buf, out, limit, 1);
			out = mb_convert_buf_add(out, '-');
		}
	} else {
		SAVE_CONVERSION_STATE();
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
720 
/* Check whether UTF-16 unit `cp` is acceptable at the current position.
 *
 * `is_surrogate` is true when a first surrogate half is pending. In that case
 * `cp` must be a second half (0xDC00..0xDFFF); otherwise `cp` must not be a
 * second half. */
static bool is_utf16_cp_valid(uint16_t cp, bool is_surrogate)
{
	const bool is_second_half = cp >= 0xDC00 && cp <= 0xDFFF;

	return is_surrogate ? is_second_half : !is_second_half;
}
730 
/* Return true if byte `c` is allowed to appear outside a Base64 section:
 * anything accepted by should_direct_encode() or is_optional_direct(),
 * or NUL. */
static bool can_encode_directly(unsigned char c)
{
	if (should_direct_encode(c)) {
		return true;
	}
	if (is_optional_direct(c)) {
		return true;
	}
	return c == '\0';
}
735 
/* Validate that the `in_len` bytes at `in` are well-formed UTF-7.
 *
 * Walks the input with the same 8-character / three-unit Base64 grouping as
 * the bulk decoder, but only tracks validity: returns false at the first
 * violation, and true if the end of input is reached with no first surrogate
 * half left pending.
 *
 * NOTE(review): is_base64_end_valid() and has_surrogate() are not defined in
 * this file; presumably they come from utf7_helper.h -- confirm their exact
 * semantics there. */
static bool mb_check_utf7(unsigned char *in, size_t in_len)
{
	unsigned char *p = in, *e = p + in_len;
	bool base64 = false;
	bool is_surrogate = false; /* true while a first surrogate half awaits its partner */

	while (p < e) {
		if (base64) {
			/* Characters 1-3: 1st UTF-16 unit. The section may end before
			 * character 1; input ending or the section ending at any other
			 * point inside the unit is invalid */
			unsigned char n1 = decode_base64(*p++);
			if (is_base64_end(n1)) {
				if (!is_base64_end_valid(n1, false, is_surrogate)) {
					return false;
				}
				base64 = false;
				continue;
			} else if (p == e) {
				return false;
			}
			unsigned char n2 = decode_base64(*p++);
			if (is_base64_end(n2) || p == e) {
				return false;
			}
			unsigned char n3 = decode_base64(*p++);
			if (is_base64_end(n3)) {
				return false;
			}
			uint16_t cp1 = (n1 << 10) | (n2 << 4) | ((n3 & 0x3C) >> 2);
			if (!is_utf16_cp_valid(cp1, is_surrogate)) {
				return false;
			}
			is_surrogate = has_surrogate(cp1, is_surrogate);
			if (p == e) {
				/* It is an error if trailing padding bits are not zeroes or if we were
				 * expecting the 2nd part of a surrogate pair when Base64 section ends */
				return !((n3 & 0x3) || is_surrogate);
			}

			/* Characters 4-6 (plus the low 2 bits of n3): 2nd UTF-16 unit.
			 * The leftover bits of n3 are passed along when the section ends here */
			unsigned char n4 = decode_base64(*p++);
			if (is_base64_end(n4)) {
				if (!is_base64_end_valid(n4, n3 & 0x3, is_surrogate)) {
					return false;
				}
				base64 = false;
				continue;
			} else if (p == e) {
				return false;
			}
			unsigned char n5 = decode_base64(*p++);
			if (is_base64_end(n5) || p == e) {
				return false;
			}
			unsigned char n6 = decode_base64(*p++);
			if (is_base64_end(n6)) {
				return false;
			}
			uint16_t cp2 = (n3 << 14) | (n4 << 8) | (n5 << 2) | ((n6 & 0x30) >> 4);
			if (!is_utf16_cp_valid(cp2, is_surrogate)) {
				return false;
			}
			is_surrogate = has_surrogate(cp2, is_surrogate);
			if (p == e) {
				/* Same end-of-input rule, for the 4 leftover bits of n6 */
				return !((n6 & 0xF) || is_surrogate);
			}

			/* Characters 7-8 (plus the low 4 bits of n6): 3rd UTF-16 unit */
			unsigned char n7 = decode_base64(*p++);
			if (is_base64_end(n7)) {
				if (!is_base64_end_valid(n7, n6 & 0xF, is_surrogate)) {
					return false;
				}
				base64 = false;
				continue;
			} else if (p == e) {
				return false;
			}
			unsigned char n8 = decode_base64(*p++);
			if (is_base64_end(n8)) {
				return false;
			}
			uint16_t cp3 = (n6 << 12) | (n7 << 6) | n8;
			if (!is_utf16_cp_valid(cp3, is_surrogate)) {
				return false;
			}
			is_surrogate = has_surrogate(cp3, is_surrogate);
		} else {
			/* ASCII text section */
			unsigned char c = *p++;

			if (c == '+') {
				if (p == e) {
					/* '+' is the very last byte of the input */
					base64 = true;
					return !is_surrogate;
				}
				/* Peek at the byte after '+' without consuming it */
				unsigned char n = decode_base64(*p);
				if (n == DASH) {
					/* "+-": consume the '-' and stay in the ASCII section */
					p++;
				} else if (n > DASH) {
					/* If a "+" character followed immediately by any character other than base64 or "-" */
					return false;
				} else {
					base64 = true;
				}
			} else if (can_encode_directly(c)) {
				continue;
			} else {
				return false;
			}
		}
	}
	return !is_surrogate;
}
846