1 /*
2  * "streamable kanji code filter and converter"
3  * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
4  *
5  * LICENSE NOTICES
6  *
7  * This file is part of "streamable kanji code filter and converter",
8  * which is distributed under the terms of GNU Lesser General Public
9  * License (version 2) as published by the Free Software Foundation.
10  *
11  * This software is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with "streamable kanji code filter and converter";
18  * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
19  * Suite 330, Boston, MA  02111-1307  USA
20  *
21  * The author of this file:
22  *
23  */
24 /*
25  * The source code included in this files was separated from mbfilter.c
26  * by moriyoshi koizumi <moriyoshi@php.net> on 4 dec 2002.
27  *
28  */
29 
30 #include "mbfilter.h"
31 #include "mbfilter_utf7.h"
32 #include "utf7_helper.h"
33 
/* Forward declarations of static functions that are defined further down in
 * this file but are referenced earlier, by the mbfl_encoding_utf7 and
 * vtbl_utf7_wchar tables. */
34 static int mbfl_filt_conv_utf7_wchar_flush(mbfl_convert_filter *filter); /* flush handler of the UTF-7 -> wchar filter */
35 static size_t mb_utf7_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state); /* buffer-based decoder */
36 static void mb_wchar_to_utf7(uint32_t *in, size_t len, mb_convert_buf *buf, bool end); /* buffer-based encoder */
37 static bool mb_check_utf7(unsigned char *in, size_t in_len); /* validity check for a UTF-7 byte string */
38 
/* Modified Base64 alphabet: entry i (0..63) is the ASCII character that
 * encodes the 6-bit value i. The terminating NUL of the string literal
 * supplies the 65th element (0x00), so sizeof(mbfl_base64_table) is 65. */
static const unsigned char mbfl_base64_table[] =
	"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
	"abcdefghijklmnopqrstuvwxyz"
	"0123456789+/";
51 
/* NULL-terminated list of alias names; referenced from mbfl_encoding_utf7. */
static const char *mbfl_encoding_utf7_aliases[] = {
	"utf7",
	NULL
};
53 
/* Descriptor object for the UTF-7 encoding.
 *
 * The initializers are positional, so the meaning of each slot is fixed by the
 * field order of mbfl_encoding, which is declared outside this file. The notes
 * below say what each value is; slot meanings marked "presumably" should be
 * confirmed against that declaration. */
54 const mbfl_encoding mbfl_encoding_utf7 = {
55 	mbfl_no_encoding_utf7, /* identifier constant for UTF-7 */
56 	"UTF-7", /* presumably the canonical encoding name -- confirm */
57 	"UTF-7", /* presumably the MIME name -- confirm */
58 	mbfl_encoding_utf7_aliases, /* NULL-terminated alias list defined in this file */
59 	NULL, /* NOTE(review): slot left empty; its meaning is not visible here */
60 	MBFL_ENCTYPE_GL_UNSAFE, /* encoding-type flag; semantics defined elsewhere */
61 	&vtbl_utf7_wchar, /* filter table converting UTF-7 to wchar (defined below) */
62 	&vtbl_wchar_utf7, /* filter table converting wchar to UTF-7 (defined below) */
63 	mb_utf7_to_wchar, /* buffer-based decoder defined in this file */
64 	mb_wchar_to_utf7, /* buffer-based encoder defined in this file */
65 	mb_check_utf7 /* validity checker defined in this file */
66 };
67 
/* Filter table for the UTF-7 -> wchar direction.
 *
 * Initializers are positional; slot meanings follow struct mbfl_convert_vtbl,
 * which is declared outside this file (notes marked "presumably" need
 * confirming against it). */
68 const struct mbfl_convert_vtbl vtbl_utf7_wchar = {
69 	mbfl_no_encoding_utf7, /* presumably the source encoding id */
70 	mbfl_no_encoding_wchar, /* presumably the destination encoding id */
71 	mbfl_filt_conv_common_ctor, /* shared constructor, defined elsewhere */
72 	NULL, /* NOTE(review): empty slot, presumably the destructor -- confirm */
73 	mbfl_filt_conv_utf7_wchar, /* per-byte decoding function defined in this file */
74 	mbfl_filt_conv_utf7_wchar_flush, /* end-of-input handler defined in this file */
75 	NULL, /* NOTE(review): empty slot; meaning not visible here */
76 };
77 
/* Filter table for the wchar -> UTF-7 direction.
 *
 * Initializers are positional; slot meanings follow struct mbfl_convert_vtbl,
 * which is declared outside this file (notes marked "presumably" need
 * confirming against it). */
78 const struct mbfl_convert_vtbl vtbl_wchar_utf7 = {
79 	mbfl_no_encoding_wchar, /* presumably the source encoding id */
80 	mbfl_no_encoding_utf7, /* presumably the destination encoding id */
81 	mbfl_filt_conv_common_ctor, /* shared constructor, defined elsewhere */
82 	NULL, /* NOTE(review): empty slot, presumably the destructor -- confirm */
83 	mbfl_filt_conv_wchar_utf7, /* per-codepoint encoding function defined in this file */
84 	mbfl_filt_conv_wchar_utf7_flush, /* end-of-input handler defined in this file */
85 	NULL, /* NOTE(review): empty slot; meaning not visible here */
86 };
87 
88 
/* CK(statement): evaluate `statement` exactly once and, if its value is
 * negative, make the enclosing function return -1 immediately. Used to
 * propagate failures reported by output/filter callbacks. */
#define CK(statement) \
	do { \
		if ((statement) < 0) { \
			return (-1); \
		} \
	} while (0)
90 
/*
 * Decode a single Modified Base64 character.
 *
 * Returns the 6-bit value (0..63) for 'A'-'Z', 'a'-'z', '0'-'9', '+' and '/',
 * or -1 when `c` is not part of the Base64 alphabet.
 *
 * The return type is signed so that the -1 sentinel is really negative: with
 * an unsigned return type the value becomes UINT_MAX, and a caller's
 * "result < 0" test would only work through an implementation-defined
 * conversion back to int.
 */
static int decode_base64_char(unsigned char c)
{
	if (c >= 'A' && c <= 'Z') {
		return c - 'A';          /* 0..25 */
	} else if (c >= 'a' && c <= 'z') {
		return c - 'a' + 26;     /* 26..51 */
	} else if (c >= '0' && c <= '9') {
		return c - '0' + 52;     /* 52..61 */
	} else if (c == '+') {
		return 62;
	} else if (c == '/') {
		return 63;
	}
	return -1;
}
106 
/*
 * Process one complete UTF-16 code unit recovered from a Base64 section.
 *
 * `unit` is the 16-bit code unit. `leftover` holds the input bits which were
 * read beyond the end of this unit, already shifted to the position where the
 * next unit expects them (0 when there are none); it becomes the new cache.
 *
 * While a high surrogate is waiting for its partner, it is kept in bits 16..27
 * of filter->cache as ((unit & 0x3ff) << 16) + 0x400000. That field is never
 * zero while a surrogate is pending, and shifting it right by 6 gives the
 * upper part of the final codepoint with the 0x10000 offset already applied.
 *
 * Returns 0 on success, -1 if the output function reported an error.
 */
static int mbfl_filt_conv_utf7_wchar_unit(mbfl_convert_filter *filter, int unit, int leftover)
{
	int pending = filter->cache & 0xfff0000;

	if (unit >= 0xd800 && unit < 0xdc00) {
		/* 1st part of surrogate pair */
		if (pending) {
			/* We were waiting for the 2nd part of a surrogate pair */
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		filter->cache = (((unit & 0x3ff) << 16) + 0x400000) | leftover;
	} else if (unit >= 0xdc00 && unit < 0xe000) {
		/* 2nd part of surrogate pair */
		if (pending) {
			filter->cache = leftover;
			CK((*filter->output_function)((unit & 0x3ff) | (pending >> 6), filter->data));
		} else {
			/* Low surrogate without a preceding high surrogate */
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			filter->cache = leftover;
		}
	} else {
		if (pending) {
			/* We were waiting for the 2nd part of a surrogate pair */
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		filter->cache = leftover;
		CK((*filter->output_function)(unit, filter->data));
	}

	return 0;
}

/*
 * UTF-7 -> wchar filter: consume one input byte `c`.
 *
 * filter->status is 0 in directly-encoded text and 1..9 inside a Base64
 * section (1 = just after '+', 2..9 = position within a group of eight Base64
 * characters, which together carry three 16-bit code units). filter->cache
 * holds the bits of the code unit being assembled in its low 16 bits and a
 * pending high surrogate in bits 16..27.
 *
 * Decoded codepoints, or MBFL_BAD_INPUT for invalid input, are passed to
 * filter->output_function. Every call to the output function is checked, so
 * an error it reports stops processing of this byte.
 *
 * Returns 0 on success, -1 if the output function reported an error.
 */
int mbfl_filt_conv_utf7_wchar(int c, mbfl_convert_filter *filter)
{
	int s, n = -1;

	if (filter->status) { /* Modified Base64 */
		n = decode_base64_char(c);
		if (n < 0) {
			/* Not a Base64 character: the Base64 section ends here */
			if (filter->cache) {
				/* Either we were expecting the 2nd half of a surrogate pair which
				 * never came, or else the last Base64 data was not padded with zeroes */
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			}
			if (c == '-') {
				if (filter->status == 1) { /* "+-" -> "+" */
					CK((*filter->output_function)('+', filter->data));
				}
			} else if (c >= 0 && c < 0x80) { /* ASCII exclude '-' */
				CK((*filter->output_function)(c, filter->data));
			} else { /* illegal character */
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			}
			filter->cache = filter->status = 0;
			return 0;
		}
	}

	switch (filter->status) {
	/* directly encoded characters */
	case 0:
		if (c == '+') { /* '+' shift character */
			filter->status = 1;
		} else if (c >= 0 && c < 0x80) { /* ASCII */
			CK((*filter->output_function)(c, filter->data));
		} else { /* illegal character */
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	/* decode Modified Base64 */
	case 1:
	case 2:
		/* 1st unit, bits 15..10 */
		filter->cache |= n << 10;
		filter->status = 3;
		break;
	case 3:
		/* 1st unit, bits 9..4 */
		filter->cache |= n << 4;
		filter->status = 4;
		break;
	case 4:
		/* Top 4 bits complete the 1st unit; low 2 bits start the 2nd (bits 15..14) */
		s = ((n >> 2) & 0xf) | (filter->cache & 0xffff);
		filter->status = 5;
		CK(mbfl_filt_conv_utf7_wchar_unit(filter, s, (n & 0x3) << 14));
		break;

	case 5:
		/* 2nd unit, bits 13..8 */
		filter->cache |= n << 8;
		filter->status = 6;
		break;
	case 6:
		/* 2nd unit, bits 7..2 */
		filter->cache |= n << 2;
		filter->status = 7;
		break;
	case 7:
		/* Top 2 bits complete the 2nd unit; low 4 bits start the 3rd (bits 15..12) */
		s = ((n >> 4) & 0x3) | (filter->cache & 0xffff);
		filter->status = 8;
		CK(mbfl_filt_conv_utf7_wchar_unit(filter, s, (n & 0xf) << 12));
		break;

	case 8:
		/* 3rd unit, bits 11..6 */
		filter->cache |= n << 6;
		filter->status = 9;
		break;
	case 9:
		/* All 6 bits complete the 3rd unit; nothing is left over */
		s = n | (filter->cache & 0xffff);
		filter->status = 2;
		CK(mbfl_filt_conv_utf7_wchar_unit(filter, s, 0));
		break;

		EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
267 
/*
 * End-of-input handler for the UTF-7 -> wchar filter.
 *
 * A non-zero cache at this point means the input stopped in the middle of
 * something: a high surrogate never received its 2nd half, or the final
 * Base64 data was not padded with zero bits. In that case the cache is cleared
 * and MBFL_BAD_INPUT is emitted. The downstream flush function, if any, is
 * then invoked.
 *
 * NOTE(review): the results of output_function and flush_function are not
 * checked here. Always returns 0.
 */
static int mbfl_filt_conv_utf7_wchar_flush(mbfl_convert_filter *filter)
{
	if (filter->cache != 0) {
		filter->cache = 0;
		(*filter->output_function)(MBFL_BAD_INPUT, filter->data);
	}

	if (!filter->flush_function) {
		return 0;
	}

	(*filter->flush_function)(filter->data);
	return 0;
}
283 
/*
 * Leave a Base64 run in the wchar -> UTF-7 filter and write the directly
 * encoded character `c`. When `kind` is 1 an explicit '-' is written first;
 * when it is 2 the character is written without one. Sets filter->status
 * back to 0. Returns 0 on success, -1 on an output error.
 */
static int mbfl_filt_conv_wchar_utf7_to_direct(mbfl_convert_filter *filter, int c, int kind)
{
	if (kind == 1) {
		CK((*filter->output_function)('-', filter->data));
	}
	CK((*filter->output_function)(c, filter->data));
	filter->status = 0;
	return 0;
}

/*
 * wchar -> UTF-7 filter: consume one codepoint `c`.
 *
 * filter->status is 0 in directly-encoded text; 1, 2 and 3 are the three
 * positions inside a Base64 run, with filter->cache holding the codepoint
 * (plus 4, 2 or 0 carried-over bits in bits 16 and up) that has not been
 * written out yet.
 *
 * Codepoints at or above MBFL_WCSPLANE_SUPMIN (and below
 * MBFL_WCSPLANE_UTF32MAX) are split into a surrogate pair and fed back through
 * filter->filter_function. Anything outside the accepted ranges goes to
 * mbfl_filt_conv_illegal_output().
 *
 * Returns 0 on success, -1 if an output call reported an error.
 */
int mbfl_filt_conv_wchar_utf7(int c, mbfl_convert_filter *filter)
{
	int pending;

	/* 0: must be Base64-encoded
	 * 1: written directly, but needs an explicit '-' to close a Base64 run
	 * 2: written directly, and closes a Base64 run without a '-' */
	int kind = 0;

	if (c >= 0 && c < 0x80) { /* ASCII */
		switch (c) {
		case ' ': case '\t': case '\r': case '\n': case '\'':
		case '(': case ')': case ',': case '.': case ':': case '?':
			kind = 2;
			break;
		case '\0': case '/': case '-':
			kind = 1;
			break;
		default:
			if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9')) {
				kind = 1;
			}
			break;
		}
	} else if (c >= 0 && c < MBFL_WCSPLANE_UCS2MAX) {
		/* Single UTF-16 code unit; kind stays 0 */
	} else if (c >= MBFL_WCSPLANE_SUPMIN && c < MBFL_WCSPLANE_UTF32MAX) {
		/* Re-enter with the high, then the low surrogate */
		CK((*filter->filter_function)(((c >> 10) - 0x40) | 0xd800, filter));
		CK((*filter->filter_function)((c & 0x3ff) | 0xdc00, filter));
		return 0;
	} else {
		CK(mbfl_filt_conv_illegal_output(c, filter));
		return 0;
	}

	switch (filter->status) {
	case 0:
		if (kind == 0) {
			/* Open a Base64 run; the codepoint itself is written later */
			CK((*filter->output_function)('+', filter->data));
			filter->status = 1;
			filter->cache = c;
		} else {
			CK((*filter->output_function)(c, filter->data));
		}
		break;

	case 1:
		/* Cache holds 16 bits: write 12 of them, 4 remain */
		pending = filter->cache;
		CK((*filter->output_function)(mbfl_base64_table[(pending >> 10) & 0x3f], filter->data));
		CK((*filter->output_function)(mbfl_base64_table[(pending >> 4) & 0x3f], filter->data));
		if (kind == 0) {
			filter->status = 2;
			filter->cache = ((pending & 0xf) << 16) | c;
			break;
		}
		/* Remaining 4 bits, padded with zeroes */
		CK((*filter->output_function)(mbfl_base64_table[(pending << 2) & 0x3c], filter->data));
		CK(mbfl_filt_conv_wchar_utf7_to_direct(filter, c, kind));
		break;

	case 2:
		/* Cache holds 4 + 16 bits: write 18 of them, 2 remain */
		pending = filter->cache;
		CK((*filter->output_function)(mbfl_base64_table[(pending >> 14) & 0x3f], filter->data));
		CK((*filter->output_function)(mbfl_base64_table[(pending >> 8) & 0x3f], filter->data));
		CK((*filter->output_function)(mbfl_base64_table[(pending >> 2) & 0x3f], filter->data));
		if (kind == 0) {
			filter->status = 3;
			filter->cache = ((pending & 0x3) << 16) | c;
			break;
		}
		/* Remaining 2 bits, padded with zeroes */
		CK((*filter->output_function)(mbfl_base64_table[(pending << 4) & 0x30], filter->data));
		CK(mbfl_filt_conv_wchar_utf7_to_direct(filter, c, kind));
		break;

	case 3:
		/* Cache holds 2 + 16 bits: write all 18, nothing remains */
		pending = filter->cache;
		CK((*filter->output_function)(mbfl_base64_table[(pending >> 12) & 0x3f], filter->data));
		CK((*filter->output_function)(mbfl_base64_table[(pending >> 6) & 0x3f], filter->data));
		CK((*filter->output_function)(mbfl_base64_table[pending & 0x3f], filter->data));
		if (kind == 0) {
			filter->status = 1;
			filter->cache = c;
			break;
		}
		CK(mbfl_filt_conv_wchar_utf7_to_direct(filter, c, kind));
		break;

		EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
375 
/*
 * End-of-input handler for the wchar -> UTF-7 filter.
 *
 * Resets status and cache, writes out whatever a still-open Base64 run had
 * buffered (padding the last character with zero bits), closes the run with
 * '-', and finally calls the downstream flush function if there is one.
 *
 * Returns 0 on success, -1 if the output function reported an error (in which
 * case the downstream flush function is not called).
 */
int mbfl_filt_conv_wchar_utf7_flush(mbfl_convert_filter *filter)
{
	const int state = filter->status;
	const int bits = filter->cache;

	filter->cache = 0;
	filter->status = 0;

	/* flush fragments */
	if (state == 1) {
		/* 16 buffered bits -> 3 characters */
		CK((*filter->output_function)(mbfl_base64_table[(bits >> 10) & 0x3f], filter->data));
		CK((*filter->output_function)(mbfl_base64_table[(bits >> 4) & 0x3f], filter->data));
		CK((*filter->output_function)(mbfl_base64_table[(bits << 2) & 0x3c], filter->data));
	} else if (state == 2) {
		/* 20 buffered bits -> 4 characters */
		CK((*filter->output_function)(mbfl_base64_table[(bits >> 14) & 0x3f], filter->data));
		CK((*filter->output_function)(mbfl_base64_table[(bits >> 8) & 0x3f], filter->data));
		CK((*filter->output_function)(mbfl_base64_table[(bits >> 2) & 0x3f], filter->data));
		CK((*filter->output_function)(mbfl_base64_table[(bits << 4) & 0x30], filter->data));
	} else if (state == 3) {
		/* 18 buffered bits -> 3 characters */
		CK((*filter->output_function)(mbfl_base64_table[(bits >> 12) & 0x3f], filter->data));
		CK((*filter->output_function)(mbfl_base64_table[(bits >> 6) & 0x3f], filter->data));
		CK((*filter->output_function)(mbfl_base64_table[bits & 0x3f], filter->data));
	}

	if (state >= 1 && state <= 3) {
		/* Close the Base64 run */
		CK((*filter->output_function)('-', filter->data));
	}

	if (filter->flush_function) {
		(*filter->flush_function)(filter->data);
	}

	return 0;
}
413 
is_base64_end(unsigned char c)414 static inline bool is_base64_end(unsigned char c)
415 {
416 	return c >= DASH;
417 }
418 
/* Returns true for the 20 punctuation characters which may appear either
 * Base64-encoded or directly encoded. */
static bool is_optional_direct(unsigned char c)
{
	switch (c) {
	case '!': case '"': case '#': case '$': case '%':
	case '&': case '*': case ';': case '<': case '=':
	case '>': case '@': case '[': case ']': case '^':
	case '_': case '`': case '{': case '|': case '}':
		return true;
	default:
		return false;
	}
}
426 
/* Returns true for the whitespace and punctuation characters which the
 * encoder writes directly after a Base64 section without an explicit '-'. */
static bool can_end_base64(uint32_t c)
{
	switch (c) {
	case ' ': case '\t': case '\r': case '\n':
	case '\'': case '(': case ')': case ',':
	case '.': case ':': case '?':
		return true;
	default:
		return false;
	}
}
431 
decode_base64(unsigned char c)432 static unsigned char decode_base64(unsigned char c)
433 {
434 	if (c >= 'A' && c <= 'Z') {
435 		return c - 65;
436 	} else if (c >= 'a' && c <= 'z') {
437 		return c - 71;
438 	} else if (c >= '0' && c <= '9') {
439 		return c + 4;
440 	} else if (c == '+') {
441 		return 62;
442 	} else if (c == '/') {
443 		return 63;
444 	} else if (c == '-') {
445 		return DASH;
446 	} else if (can_end_base64(c) || is_optional_direct(c) || c == '\0') {
447 		return DIRECT;
448 	} else if (c <= 0x7F) {
449 		return ASCII;
450 	}
451 	return ILLEGAL;
452 }
453 
handle_utf16_cp(uint16_t cp,uint32_t * out,uint16_t * surrogate1)454 static uint32_t* handle_utf16_cp(uint16_t cp, uint32_t *out, uint16_t *surrogate1)
455 {
456 retry:
457 	if (*surrogate1) {
458 		if (cp >= 0xDC00 && cp <= 0xDFFF) {
459 			*out++ = ((*surrogate1 & 0x3FF) << 10) + (cp & 0x3FF) + 0x10000;
460 			*surrogate1 = 0;
461 		} else {
462 			*out++ = MBFL_BAD_INPUT;
463 			*surrogate1 = 0;
464 			goto retry;
465 		}
466 	} else if (cp >= 0xD800 && cp <= 0xDBFF) {
467 		*surrogate1 = cp;
468 	} else if (cp >= 0xDC00 && cp <= 0xDFFF) {
469 		/* 2nd part of surrogate pair came unexpectedly */
470 		*out++ = MBFL_BAD_INPUT;
471 	} else {
472 		*out++ = cp;
473 	}
474 	return out;
475 }
476 
handle_base64_end(unsigned char n,unsigned char ** p,uint32_t * out,bool * base64,bool abrupt,uint16_t * surrogate1)477 static uint32_t* handle_base64_end(unsigned char n, unsigned char **p, uint32_t *out, bool *base64, bool abrupt, uint16_t *surrogate1)
478 {
479 	if (abrupt || *surrogate1) {
480 		*out++ = MBFL_BAD_INPUT;
481 		*surrogate1 = 0;
482 	}
483 
484 	if (n == ILLEGAL) {
485 		*out++ = MBFL_BAD_INPUT;
486 	} else if (n == DIRECT || n == ASCII) {
487 		(*p)--; /* Unconsume byte */
488 	}
489 
490 	*base64 = false;
491 	return out;
492 }
493 
mb_utf7_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)494 static size_t mb_utf7_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
495 {
496 	ZEND_ASSERT(bufsize >= 5); /* This function will infinite-loop if called with a tiny output buffer */
497 
498 	/* Why does this require a minimum output buffer size of 5?
499 	 * There is one case where one iteration of the main 'while' loop below will emit 5 wchars:
500 	 * that is if the first half of a surrogate pair is followed by an otherwise valid codepoint which
501 	 * is not the 2nd half of a surrogate pair, then another valid codepoint, then the Base64-encoded
502 	 * section ends with a byte which is not a valid Base64 character, AND which also is not in a
503 	 * position where we would expect the Base64-encoded section to end */
504 
505 	unsigned char *p = *in, *e = p + *in_len;
506 	uint32_t *out = buf, *limit = buf + bufsize;
507 
508 	bool base64 = *state & 1;
509 	uint16_t surrogate1 = (*state >> 1); /* First half of a surrogate pair which still needs 2nd half */
510 
511 	while (p < e && out < limit) {
512 		if (base64) {
513 			/* Base64 section */
514 			if ((limit - out) < 5) {
515 				break;
516 			}
517 
518 			unsigned char n1 = decode_base64(*p++);
519 			if (is_base64_end(n1)) {
520 				out = handle_base64_end(n1, &p, out, &base64, false, &surrogate1);
521 				continue;
522 			} else if (p == e) {
523 				out = handle_base64_end(n1, &p, out, &base64, true, &surrogate1);
524 				continue;
525 			}
526 			unsigned char n2 = decode_base64(*p++);
527 			if (is_base64_end(n2) || p == e) {
528 				out = handle_base64_end(n2, &p, out, &base64, true, &surrogate1);
529 				continue;
530 			}
531 			unsigned char n3 = decode_base64(*p++);
532 			if (is_base64_end(n3)) {
533 				out = handle_base64_end(n3, &p, out, &base64, true, &surrogate1);
534 				continue;
535 			}
536 			out = handle_utf16_cp((n1 << 10) | (n2 << 4) | ((n3 & 0x3C) >> 2), out, &surrogate1);
537 			if (p == e) {
538 				/* It is an error if trailing padding bits are not zeroes or if we were
539 				 * expecting the 2nd part of a surrogate pair when Base64 section ends */
540 				if ((n3 & 0x3) || surrogate1)
541 					*out++ = MBFL_BAD_INPUT;
542 				break;
543 			}
544 
545 			unsigned char n4 = decode_base64(*p++);
546 			if (is_base64_end(n4)) {
547 				out = handle_base64_end(n4, &p, out, &base64, n3 & 0x3, &surrogate1);
548 				continue;
549 			} else if (p == e) {
550 				out = handle_base64_end(n4, &p, out, &base64, true, &surrogate1);
551 				continue;
552 			}
553 			unsigned char n5 = decode_base64(*p++);
554 			if (is_base64_end(n5) || p == e) {
555 				out = handle_base64_end(n5, &p, out, &base64, true, &surrogate1);
556 				continue;
557 			}
558 			unsigned char n6 = decode_base64(*p++);
559 			if (is_base64_end(n6)) {
560 				out = handle_base64_end(n6, &p, out, &base64, true, &surrogate1);
561 				continue;
562 			}
563 			out = handle_utf16_cp((n3 << 14) | (n4 << 8) | (n5 << 2) | ((n6 & 0x30) >> 4), out, &surrogate1);
564 			if (p == e) {
565 				if ((n6 & 0xF) || surrogate1)
566 					*out++ = MBFL_BAD_INPUT;
567 				break;
568 			}
569 
570 			unsigned char n7 = decode_base64(*p++);
571 			if (is_base64_end(n7)) {
572 				out = handle_base64_end(n7, &p, out, &base64, n6 & 0xF, &surrogate1);
573 				continue;
574 			} else if (p == e) {
575 				out = handle_base64_end(n7, &p, out, &base64, true, &surrogate1);
576 				continue;
577 			}
578 			unsigned char n8 = decode_base64(*p++);
579 			if (is_base64_end(n8)) {
580 				out = handle_base64_end(n8, &p, out, &base64, true, &surrogate1);
581 				continue;
582 			}
583 			out = handle_utf16_cp((n6 << 12) | (n7 << 6) | n8, out, &surrogate1);
584 		} else {
585 			/* ASCII text section */
586 			unsigned char c = *p++;
587 
588 			if (c == '+') {
589 				if (p < e) {
590 					if (*p == '-') {
591 						*out++ = '+';
592 						p++;
593 					} else {
594 						base64 = true;
595 					}
596 				}
597 				/* If a + comes at the end of the input string... do nothing about it */
598 			} else if (c <= 0x7F) {
599 				*out++ = c;
600 			} else {
601 				*out++ = MBFL_BAD_INPUT;
602 			}
603 		}
604 	}
605 
606 	*state = (surrogate1 << 1) | base64;
607 	*in_len = e - p;
608 	*in = p;
609 	return out - buf;
610 }
611 
/* Returns true if the encoder writes codepoint `c` as itself rather than as
 * Base64: ASCII letters and digits, NUL, '/', '-', and every character
 * accepted by can_end_base64(). */
static bool should_direct_encode(uint32_t c)
{
	if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9')) {
		return true;
	}
	if (c == '\0' || c == '/' || c == '-') {
		return true;
	}
	return can_end_base64(c);
}
616 
617 #define SAVE_CONVERSION_STATE() buf->state = (cache << 4) | (nbits << 1) | base64
618 #define RESTORE_CONVERSION_STATE() base64 = (buf->state & 1); nbits = (buf->state >> 1) & 0x7; cache = (buf->state >> 4)
619 
mb_wchar_to_utf7(uint32_t * in,size_t len,mb_convert_buf * buf,bool end)620 static void mb_wchar_to_utf7(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
621 {
622 	unsigned char *out, *limit;
623 	MB_CONVERT_BUF_LOAD(buf, out, limit);
624 
625 	/* Make enough space such that if the input string is all ASCII (not including '+'),
626 	 * we can copy it to the output buffer without checking for available space.
627 	 * However, if we find anything which is not plain ASCII, additional checks for
628 	 * output buffer space will be needed. */
629 	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
630 
631 	bool base64;
632 	unsigned char nbits, cache; /* `nbits` is the number of cached bits; either 0, 2, or 4 */
633 	RESTORE_CONVERSION_STATE();
634 
635 	while (len--) {
636 		uint32_t w = *in++;
637 		if (base64) {
638 			if (should_direct_encode(w)) {
639 				/* End of Base64 section. Drain buffered bits (if any), close Base64 section */
640 				base64 = false;
641 				in--; len++; /* Unconsume codepoint; it will be handled by 'ASCII section' code below */
642 				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
643 				if (nbits) {
644 					out = mb_convert_buf_add(out, mbfl_base64_table[(cache << (6 - nbits)) & 0x3F]);
645 				}
646 				nbits = cache = 0;
647 				if (!can_end_base64(w)) {
648 					out = mb_convert_buf_add(out, '-');
649 				}
650 			} else if (w >= MBFL_WCSPLANE_UTF32MAX) {
651 				/* Make recursive call to add an error marker character */
652 				SAVE_CONVERSION_STATE();
653 				MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_utf7);
654 				MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
655 				RESTORE_CONVERSION_STATE();
656 			} else {
657 				/* Encode codepoint, preceded by any cached bits, as Base64
658 				 * Make enough space in the output buffer to hold both any bytes that
659 				 * we emit right here, plus any finishing byte which might need to
660 				 * be emitted if the input string ends abruptly */
661 				uint64_t bits;
662 				if (w >= MBFL_WCSPLANE_SUPMIN) {
663 					/* Must use surrogate pair */
664 					MB_CONVERT_BUF_ENSURE(buf, out, limit, 7);
665 					w -= 0x10000;
666 					bits = ((uint64_t)cache << 32) | 0xD800DC00L | ((w & 0xFFC00) << 6) | (w & 0x3FF);
667 					nbits += 32;
668 				} else {
669 					MB_CONVERT_BUF_ENSURE(buf, out, limit, 4);
670 					bits = (cache << 16) | w;
671 					nbits += 16;
672 				}
673 
674 				while (nbits >= 6) {
675 					out = mb_convert_buf_add(out, mbfl_base64_table[(bits >> (nbits - 6)) & 0x3F]);
676 					nbits -= 6;
677 				}
678 				cache = bits;
679 			}
680 		} else {
681 			/* ASCII section */
682 			if (should_direct_encode(w)) {
683 				out = mb_convert_buf_add(out, w);
684 			} else if (w >= MBFL_WCSPLANE_UTF32MAX) {
685 				buf->state = 0;
686 				MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_utf7);
687 				MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
688 				RESTORE_CONVERSION_STATE();
689 			} else {
690 				out = mb_convert_buf_add(out, '+');
691 				base64 = true;
692 				in--; len++; /* Unconsume codepoint; it will be handled by Base64 code above */
693 			}
694 		}
695 	}
696 
697 	if (end) {
698 		if (nbits) {
699 			out = mb_convert_buf_add(out, mbfl_base64_table[(cache << (6 - nbits)) & 0x3F]);
700 		}
701 		if (base64) {
702 			MB_CONVERT_BUF_ENSURE(buf, out, limit, 1);
703 			out = mb_convert_buf_add(out, '-');
704 		}
705 	} else {
706 		SAVE_CONVERSION_STATE();
707 	}
708 
709 	MB_CONVERT_BUF_STORE(buf, out, limit);
710 }
711 
/* Checks a UTF-16 code unit against what is expected at this point: when
 * `is_surrogate` is true (a high surrogate came just before), `cp` must be a
 * low surrogate; otherwise it must not be one. */
static bool is_utf16_cp_valid(uint16_t cp, bool is_surrogate)
{
	const bool is_low_surrogate = (cp >= 0xDC00 && cp <= 0xDFFF);

	return is_low_surrogate == is_surrogate;
}
721 
/* Returns true if byte `c` may appear as itself outside a Base64 section:
 * anything accepted by should_direct_encode() or is_optional_direct(), or NUL. */
static bool can_encode_directly(unsigned char c)
{
	if (should_direct_encode(c)) {
		return true;
	}
	if (is_optional_direct(c)) {
		return true;
	}
	return c == '\0';
}
726 
mb_check_utf7(unsigned char * in,size_t in_len)727 static bool mb_check_utf7(unsigned char *in, size_t in_len)
728 {
729 	unsigned char *p = in, *e = p + in_len;
730 	bool base64 = false;
731 	bool is_surrogate = false;
732 
733 	while (p < e) {
734 		if (base64) {
735 			unsigned char n1 = decode_base64(*p++);
736 			if (is_base64_end(n1)) {
737 				if (!is_base64_end_valid(n1, false, is_surrogate)) {
738 					return false;
739 				}
740 				base64 = false;
741 				continue;
742 			} else if (p == e) {
743 				return false;
744 			}
745 			unsigned char n2 = decode_base64(*p++);
746 			if (is_base64_end(n2) || p == e) {
747 				return false;
748 			}
749 			unsigned char n3 = decode_base64(*p++);
750 			if (is_base64_end(n3)) {
751 				return false;
752 			}
753 			uint16_t cp1 = (n1 << 10) | (n2 << 4) | ((n3 & 0x3C) >> 2);
754 			if (!is_utf16_cp_valid(cp1, is_surrogate)) {
755 				return false;
756 			}
757 			is_surrogate = has_surrogate(cp1, is_surrogate);
758 			if (p == e) {
759 				/* It is an error if trailing padding bits are not zeroes or if we were
760 				 * expecting the 2nd part of a surrogate pair when Base64 section ends */
761 				return !((n3 & 0x3) || is_surrogate);
762 			}
763 
764 			unsigned char n4 = decode_base64(*p++);
765 			if (is_base64_end(n4)) {
766 				if (!is_base64_end_valid(n4, n3 & 0x3, is_surrogate)) {
767 					return false;
768 				}
769 				base64 = false;
770 				continue;
771 			} else if (p == e) {
772 				return false;
773 			}
774 			unsigned char n5 = decode_base64(*p++);
775 			if (is_base64_end(n5) || p == e) {
776 				return false;
777 			}
778 			unsigned char n6 = decode_base64(*p++);
779 			if (is_base64_end(n6)) {
780 				return false;
781 			}
782 			uint16_t cp2 = (n3 << 14) | (n4 << 8) | (n5 << 2) | ((n6 & 0x30) >> 4);
783 			if (!is_utf16_cp_valid(cp2, is_surrogate)) {
784 				return false;
785 			}
786 			is_surrogate = has_surrogate(cp2, is_surrogate);
787 			if (p == e) {
788 				return !((n6 & 0xF) || is_surrogate);
789 			}
790 
791 			unsigned char n7 = decode_base64(*p++);
792 			if (is_base64_end(n7)) {
793 				if (!is_base64_end_valid(n7, n6 & 0xF, is_surrogate)) {
794 					return false;
795 				}
796 				base64 = false;
797 				continue;
798 			} else if (p == e) {
799 				return false;
800 			}
801 			unsigned char n8 = decode_base64(*p++);
802 			if (is_base64_end(n8)) {
803 				return false;
804 			}
805 			uint16_t cp3 = (n6 << 12) | (n7 << 6) | n8;
806 			if (!is_utf16_cp_valid(cp3, is_surrogate)) {
807 				return false;
808 			}
809 			is_surrogate = has_surrogate(cp3, is_surrogate);
810 		} else {
811 			/* ASCII text section */
812 			unsigned char c = *p++;
813 
814 			if (c == '+') {
815 				if (p == e) {
816 					base64 = true;
817 					return !is_surrogate;
818 				}
819 				unsigned char n = decode_base64(*p);
820 				if (n == DASH) {
821 					p++;
822 				} else if (n > DASH) {
823 					/* If a "+" character followed immediately by any character other than base64 or "-" */
824 					return false;
825 				} else {
826 					base64 = true;
827 				}
828 			} else if (can_encode_directly(c)) {
829 				continue;
830 			} else {
831 				return false;
832 			}
833 		}
834 	}
835 	return !is_surrogate;
836 }
837