1 /*
2  * "streamable kanji code filter and converter"
3  * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
4  *
5  * LICENSE NOTICES
6  *
7  * This file is part of "streamable kanji code filter and converter",
8  * which is distributed under the terms of GNU Lesser General Public
9  * License (version 2) as published by the Free Software Foundation.
10  *
11  * This software is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with "streamable kanji code filter and converter";
18  * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
19  * Suite 330, Boston, MA  02111-1307  USA
20  *
21  * The author of this file:
22  *
23  */
24 /*
25  * The source code included in this file was separated from mbfilter.c
26  * by moriyoshi koizumi <moriyoshi@php.net> on 4 dec 2002.
27  *
28  */
29 
30 /* Modified UTF-7 used for 'international mailbox names' in the IMAP protocol
31  * Also known as mUTF-7
32  * Defined in RFC 3501 5.1.3 (https://tools.ietf.org/html/rfc3501)
33  *
34  * Quoting from the RFC:
35  *
36  ***********************************************************************
37  * In modified UTF-7, printable US-ASCII characters, except for "&",
38  * represent themselves; that is, characters with octet values 0x20-0x25
39  * and 0x27-0x7e. The character "&" (0x26) is represented by the
40  * two-octet sequence "&-".
41  *
42  * All other characters (octet values 0x00-0x1f and 0x7f-0xff) are
43  * represented in modified BASE64, with a further modification from
44  * UTF-7 that "," is used instead of "/". Modified BASE64 MUST NOT be
45  * used to represent any printing US-ASCII character which can represent
46  * itself.
47  *
48  * "&" is used to shift to modified BASE64 and "-" to shift back to
49  * US-ASCII. There is no implicit shift from BASE64 to US-ASCII, and
50  * null shifts ("-&" while in BASE64; note that "&-" while in US-ASCII
51  * means "&") are not permitted.  However, all names start in US-ASCII,
52  * and MUST end in US-ASCII; that is, a name that ends with a non-ASCII
53  * ISO-10646 character MUST end with a "-").
54  ***********************************************************************
55  *
56  * The purpose of all this is: 1) to keep all parts of IMAP messages 7-bit clean,
57  * 2) to avoid giving special treatment to +, /, \, and ~, since these are
58  * commonly used in mailbox names, and 3) to ensure there is only one
59  * representation of any mailbox name (vanilla UTF-7 does allow multiple
60  * representations of the same string, by Base64-encoding characters which
61  * could have been included as ASCII literals.)
62  *
63  * RFC 2152 also applies, since it defines vanilla UTF-7 (minus IMAP modifications)
64  * The following paragraph is notable:
65  *
66  ***********************************************************************
67  * Unicode is encoded using Modified Base64 by first converting Unicode
68  * 16-bit quantities to an octet stream (with the most significant octet first).
69  * Surrogate pairs (UTF-16) are converted by treating each half of the pair as
70  * a separate 16 bit quantity (i.e., no special treatment). Text with an odd
71  * number of octets is ill-formed. ISO 10646 characters outside the range
72  * addressable via surrogate pairs cannot be encoded.
73  ***********************************************************************
74  *
75  * So after reversing the modified Base64 encoding on an encoded section,
76  * the contents are interpreted as UTF-16BE. */
77 
78 #include "mbfilter.h"
79 #include "mbfilter_utf7imap.h"
80 #include "utf7_helper.h"
81 
82 static int mbfl_filt_conv_wchar_utf7imap_flush(mbfl_convert_filter *filter);
83 static int mbfl_filt_conv_utf7imap_wchar_flush(mbfl_convert_filter *filter);
84 static size_t mb_utf7imap_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
85 static void mb_wchar_to_utf7imap(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
86 static bool mb_check_utf7imap(unsigned char *in, size_t in_len);
87 
/* NULL-terminated list of alias names referenced by mbfl_encoding_utf7imap */
static const char *mbfl_encoding_utf7imap_aliases[] = {
	"mUTF-7",
	NULL
};
89 
90 const mbfl_encoding mbfl_encoding_utf7imap = {
91 	mbfl_no_encoding_utf7imap,
92 	"UTF7-IMAP",
93 	NULL,
94 	mbfl_encoding_utf7imap_aliases,
95 	NULL,
96 	0,
97 	&vtbl_utf7imap_wchar,
98 	&vtbl_wchar_utf7imap,
99 	mb_utf7imap_to_wchar,
100 	mb_wchar_to_utf7imap,
101 	mb_check_utf7imap
102 };
103 
104 const struct mbfl_convert_vtbl vtbl_utf7imap_wchar = {
105 	mbfl_no_encoding_utf7imap,
106 	mbfl_no_encoding_wchar,
107 	mbfl_filt_conv_common_ctor,
108 	NULL,
109 	mbfl_filt_conv_utf7imap_wchar,
110 	mbfl_filt_conv_utf7imap_wchar_flush,
111 	NULL,
112 };
113 
114 const struct mbfl_convert_vtbl vtbl_wchar_utf7imap = {
115 	mbfl_no_encoding_wchar,
116 	mbfl_no_encoding_utf7imap,
117 	mbfl_filt_conv_common_ctor,
118 	NULL,
119 	mbfl_filt_conv_wchar_utf7imap,
120 	mbfl_filt_conv_wchar_utf7imap_flush,
121 	NULL,
122 };
123 
/* Evaluate `statement`; if it yields a negative value, make the enclosing
 * function return -1 immediately */
#define CK(statement) \
	do { \
		if ((statement) < 0) { \
			return (-1); \
		} \
	} while (0)
125 
/*
 * Handle one complete UTF-16 code unit recovered from a Modified Base64 section.
 *
 * `unit` is the 16-bit code unit and `leftover` holds the Base64 bits which
 * already belong to the following code unit, pre-shifted into bits 0..15.
 *
 * Layout of filter->cache while inside a Base64 section:
 *   bits  0..15  partially assembled code unit
 *   bits 16..27  non-zero only while the 1st half of a surrogate pair is held;
 *                stored as ((unit & 0x3ff) << 16) + 0x400000, so that shifting
 *                right by 6 yields the high part of the codepoint with the
 *                0x10000 offset already added
 *
 * filter->cache is always updated before anything is emitted, so the filter is
 * left in a consistent state even after reporting an error.
 *
 * Returns 0 on success, -1 if the output function failed.
 */
static int mbfl_filt_utf7imap_handle_utf16(int unit, int leftover, mbfl_convert_filter *filter)
{
	int pending = filter->cache & 0xfff0000;

	if (unit >= 0xd800 && unit < 0xdc00) {
		/* 1st part of surrogate pair */
		filter->cache = (((unit & 0x3ff) << 16) + 0x400000) | leftover;
		if (pending) {
			/* We were still waiting for the 2nd part of the previous pair */
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		return 0;
	}

	/* Drop the bits of the code unit just completed (and any pending
	 * surrogate); keep only what belongs to the next code unit */
	filter->cache = leftover;

	if (unit >= 0xdc00 && unit < 0xe000) {
		/* 2nd part of surrogate pair */
		if (pending) {
			CK((*filter->output_function)((unit & 0x3ff) | (pending >> 6), filter->data));
		} else { /* no 1st part came before it */
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		return 0;
	}

	if (pending) {
		/* 1st part of a surrogate pair was not followed by a 2nd part */
		CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
	}

	/* Characters which can be expressed as literal, ASCII characters
	 * should not be Base64-encoded */
	if (unit < 0x20 || unit > 0x7E || unit == '&') {
		CK((*filter->output_function)(unit, filter->data));
	} else {
		CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
	}
	return 0;
}

/*
 * Decode one byte `c` of UTF7-IMAP input, emitting codepoints (or
 * MBFL_BAD_INPUT for invalid input) through filter->output_function.
 *
 * filter->status:
 *   0        outside of Base64 (directly encoded ASCII)
 *   1        '&' was just seen
 *   2, 5, 8  inside Base64, positioned between two UTF-16 code units
 *   3, 4, 6, 7, 9  inside Base64, in the middle of a code unit
 *
 * Three code units (48 bits) span exactly eight Base64 characters, after
 * which the state machine wraps around from status 9 to status 2.
 *
 * Returns 0 on success, -1 if the output function failed.
 */
int mbfl_filt_conv_utf7imap_wchar(int c, mbfl_convert_filter *filter)
{
	int s, n = -1;

	if (filter->status != 0) { /* Modified Base64 */
		if (c >= 'A' && c <= 'Z') {
			n = c - 65;
		} else if (c >= 'a' && c <= 'z') {
			n = c - 71;
		} else if (c >= '0' && c <= '9') {
			n = c + 4;
		} else if (c == '+') {
			n = 62;
		} else if (c == ',') {
			n = 63;
		}

		if (n < 0) {
			/* Not a Base64 character; the Base64 section ends here either way */
			int status = filter->status, cache = filter->cache;
			filter->cache = filter->status = 0;

			if (c != '-') { /* illegal character */
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			} else if (status == 1) { /* "&-" -> "&" */
				CK((*filter->output_function)('&', filter->data));
			} else if (cache || (status != 2 && status != 5 && status != 8)) {
				/* Base64-encoded section ended abruptly: in the middle of a code
				 * unit, with non-zero padding bits, or on the first half of a
				 * surrogate pair */
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			}
			/* Otherwise: Base64-encoded section properly terminated by - */
			return 0;
		}
	}

	switch (filter->status) {
	/* directly encoded characters */
	case 0:
		if (c == '&') { /* shift character */
			filter->status++;
		} else if (c >= 0x20 && c <= 0x7E) { /* ASCII */
			CK((*filter->output_function)(c, filter->data));
		} else { /* illegal character */
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	/* decode Modified Base64 */
	case 1:
	case 2:
		filter->cache |= n << 10;
		filter->status = 3;
		break;
	case 3:
		filter->cache |= n << 4;
		filter->status = 4;
		break;
	case 4:
		/* Top 4 bits complete the code unit; low 2 bits start the next one */
		s = ((n >> 2) & 0xf) | (filter->cache & 0xffff);
		filter->status = 5;
		CK(mbfl_filt_utf7imap_handle_utf16(s, (n & 0x3) << 14, filter));
		break;

	case 5:
		filter->cache |= n << 8;
		filter->status = 6;
		break;
	case 6:
		filter->cache |= n << 2;
		filter->status = 7;
		break;
	case 7:
		/* Top 2 bits complete the code unit; low 4 bits start the next one */
		s = ((n >> 4) & 0x3) | (filter->cache & 0xffff);
		filter->status = 8;
		CK(mbfl_filt_utf7imap_handle_utf16(s, (n & 0xf) << 12, filter));
		break;

	case 8:
		filter->cache |= n << 6;
		filter->status = 9;
		break;
	case 9:
		/* All 6 bits complete the code unit; nothing is left over */
		s = n | (filter->cache & 0xffff);
		filter->status = 2;
		CK(mbfl_filt_utf7imap_handle_utf16(s, 0, filter));
		break;

		EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
289 
/*
 * End-of-input handler for the UTF7-IMAP -> wchar filter.
 *
 * Emits MBFL_BAD_INPUT if the input ended while still inside a Base64
 * section, then invokes the downstream flush function if one is set.
 * Both `status` and `cache` are cleared, so that leftover bits of an
 * unterminated Base64 section cannot be OR-ed into a later one if the
 * filter is used again.
 *
 * Always returns 0.
 */
static int mbfl_filt_conv_utf7imap_wchar_flush(mbfl_convert_filter *filter)
{
	int status = filter->status;
	filter->status = filter->cache = 0;

	if (status) {
		/* It is illegal for a UTF-7 IMAP string to end in a Base-64 encoded
		 * section. It should always change back to ASCII before the end. */
		(*filter->output_function)(MBFL_BAD_INPUT, filter->data);
	}

	if (filter->flush_function) {
		(*filter->flush_function)(filter->data);
	}

	return 0;
}
305 
/* Modified Base64 alphabet: index 0..63 maps a 6-bit value to its character.
 * Differs from standard Base64 in that index 63 is ',' rather than '/'.
 * The string literal's terminating NUL supplies the 65th entry (0x00). */
static const unsigned char mbfl_utf7imap_base64_table[] =
	"ABCDEFGHIJKLM"
	"NOPQRSTUVWXYZ"
	"abcdefghijklm"
	"nopqrstuvwxyz"
	"0123456789+,";
319 
/*
 * Leave a Base64 section and write the directly encodable character `c`:
 * emits '-', then `c`, then one more '-' if `c` is '&' (direct == 1),
 * and finally switches the filter back to status 0.
 */
static int mbfl_filt_utf7imap_close_base64(int c, int direct, mbfl_convert_filter *filter)
{
	CK((*filter->output_function)('-', filter->data));
	CK((*filter->output_function)(c, filter->data));
	if (direct == 1) {
		CK((*filter->output_function)('-', filter->data));
	}
	filter->status = 0;
	return 0;
}

/*
 * Encode one codepoint `c` as UTF7-IMAP.
 *
 * filter->status / filter->cache between calls:
 *   0  not in Base64
 *   1  in Base64; cache holds one 16-bit code unit not yet written
 *   2  in Base64; cache holds 4 leftover bits (16..19) plus one code unit
 *   3  in Base64; cache holds 2 leftover bits (16..17) plus one code unit
 *
 * Codepoints above the BMP are split into a surrogate pair and fed back
 * through filter->filter_function one half at a time.
 *
 * Returns 0 on success, -1 if an output call failed.
 */
int mbfl_filt_conv_wchar_utf7imap(int c, mbfl_convert_filter *filter)
{
	const unsigned char *b64 = mbfl_utf7imap_base64_table;
	/* 0: must be Base64-encoded; 1: '&', written as "&-"; 2: written as is */
	int direct;
	int pending;

	if (c == '&') {
		direct = 1;
	} else if (c == 0 || (c >= 0x20 && c <= 0x7e)) {
		direct = 2;
	} else if (c >= 0 && c < MBFL_WCSPLANE_UCS2MAX) {
		direct = 0;
	} else if (c >= MBFL_WCSPLANE_SUPMIN && c < MBFL_WCSPLANE_SUPMAX) {
		CK((*filter->filter_function)(((c >> 10) - 0x40) | 0xd800, filter));
		CK((*filter->filter_function)((c & 0x3ff) | 0xdc00, filter));
		return 0;
	} else {
		CK(mbfl_filt_conv_illegal_output(c, filter));
		return 0;
	}

	pending = filter->cache;

	switch (filter->status) {
	case 0:
		if (direct) {	/* directly encode characters */
			CK((*filter->output_function)(c, filter->data));
			if (direct == 1) {
				CK((*filter->output_function)('-', filter->data));
			}
		} else {	/* Modified Base64 */
			CK((*filter->output_function)('&', filter->data));
			filter->status = 1;
			filter->cache = c;
		}
		break;

	/* encode Modified Base64 */
	case 1:
		/* 16 bits cached: write 12 of them, 4 remain */
		CK((*filter->output_function)(b64[(pending >> 10) & 0x3f], filter->data));
		CK((*filter->output_function)(b64[(pending >> 4) & 0x3f], filter->data));
		if (direct) {
			/* Pad the remaining 4 bits with zeroes and close the section */
			CK((*filter->output_function)(b64[(pending << 2) & 0x3c], filter->data));
			CK(mbfl_filt_utf7imap_close_base64(c, direct, filter));
		} else {
			filter->status = 2;
			filter->cache = ((pending & 0xf) << 16) | c;
		}
		break;

	case 2:
		/* 20 bits cached: write 18 of them, 2 remain */
		CK((*filter->output_function)(b64[(pending >> 14) & 0x3f], filter->data));
		CK((*filter->output_function)(b64[(pending >> 8) & 0x3f], filter->data));
		CK((*filter->output_function)(b64[(pending >> 2) & 0x3f], filter->data));
		if (direct) {
			/* Pad the remaining 2 bits with zeroes and close the section */
			CK((*filter->output_function)(b64[(pending << 4) & 0x30], filter->data));
			CK(mbfl_filt_utf7imap_close_base64(c, direct, filter));
		} else {
			filter->status = 3;
			filter->cache = ((pending & 0x3) << 16) | c;
		}
		break;

	case 3:
		/* 18 bits cached: write all of them, nothing remains */
		CK((*filter->output_function)(b64[(pending >> 12) & 0x3f], filter->data));
		CK((*filter->output_function)(b64[(pending >> 6) & 0x3f], filter->data));
		CK((*filter->output_function)(b64[pending & 0x3f], filter->data));
		if (direct) {
			CK(mbfl_filt_utf7imap_close_base64(c, direct, filter));
		} else {
			filter->status = 1;
			filter->cache = c;
		}
		break;

		EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
416 
/*
 * End-of-input handler for the wchar -> UTF7-IMAP filter.
 *
 * If the encoder is still inside a Base64 section, writes out the cached
 * bits (zero-padded to a whole Base64 character where needed) followed by
 * the closing '-'. The filter state is cleared before anything is written.
 *
 * Returns 0 on success, -1 if an output call failed.
 */
static int mbfl_filt_conv_wchar_utf7imap_flush(mbfl_convert_filter *filter)
{
	const unsigned char *b64 = mbfl_utf7imap_base64_table;
	int mode = filter->status, bits = filter->cache;

	filter->status = filter->cache = 0;

	/* flush fragments */
	if (mode == 1) {
		/* 16 cached bits -> 3 characters, the last one padded */
		CK((*filter->output_function)(b64[(bits >> 10) & 0x3f], filter->data));
		CK((*filter->output_function)(b64[(bits >> 4) & 0x3f], filter->data));
		CK((*filter->output_function)(b64[(bits << 2) & 0x3c], filter->data));
		CK((*filter->output_function)('-', filter->data));
	} else if (mode == 2) {
		/* 20 cached bits -> 4 characters, the last one padded */
		CK((*filter->output_function)(b64[(bits >> 14) & 0x3f], filter->data));
		CK((*filter->output_function)(b64[(bits >> 8) & 0x3f], filter->data));
		CK((*filter->output_function)(b64[(bits >> 2) & 0x3f], filter->data));
		CK((*filter->output_function)(b64[(bits << 4) & 0x30], filter->data));
		CK((*filter->output_function)('-', filter->data));
	} else if (mode == 3) {
		/* 18 cached bits -> exactly 3 characters */
		CK((*filter->output_function)(b64[(bits >> 12) & 0x3f], filter->data));
		CK((*filter->output_function)(b64[(bits >> 6) & 0x3f], filter->data));
		CK((*filter->output_function)(b64[bits & 0x3f], filter->data));
		CK((*filter->output_function)('-', filter->data));
	}

	return 0;
}
449 
is_base64_end(unsigned char c)450 static inline bool is_base64_end(unsigned char c)
451 {
452 	return c >= DASH;
453 }
454 
decode_base64(unsigned char c)455 static unsigned char decode_base64(unsigned char c)
456 {
457 	if (c >= 'A' && c <= 'Z') {
458 		return c - 65;
459 	} else if (c >= 'a' && c <= 'z') {
460 		return c - 71;
461 	} else if (c >= '0' && c <= '9') {
462 		return c + 4;
463 	} else if (c == '+') {
464 		return 62;
465 	} else if (c == ',') {
466 		return 63;
467 	} else if (c == '-') {
468 		return DASH;
469 	}
470 	return ILLEGAL;
471 }
472 
handle_utf16_cp(uint16_t cp,uint32_t * out,uint16_t * surrogate1)473 static uint32_t* handle_utf16_cp(uint16_t cp, uint32_t *out, uint16_t *surrogate1)
474 {
475 retry:
476 	if (*surrogate1) {
477 		if (cp >= 0xDC00 && cp <= 0xDFFF) {
478 			*out++ = ((*surrogate1 & 0x3FF) << 10) + (cp & 0x3FF) + 0x10000;
479 			*surrogate1 = 0;
480 		} else {
481 			*out++ = MBFL_BAD_INPUT;
482 			*surrogate1 = 0;
483 			goto retry;
484 		}
485 	} else if (cp >= 0xD800 && cp <= 0xDBFF) {
486 		*surrogate1 = cp;
487 	} else if (cp >= 0xDC00 && cp <= 0xDFFF) {
488 		/* 2nd part of surrogate pair came unexpectedly */
489 		*out++ = MBFL_BAD_INPUT;
490 	} else if (cp >= 0x20 && cp <= 0x7E && cp != '&') {
491 		*out++ = MBFL_BAD_INPUT;
492 	} else {
493 		*out++ = cp;
494 	}
495 	return out;
496 }
497 
handle_base64_end(unsigned char n,uint32_t * out,bool * base64,bool abrupt,uint16_t * surrogate1)498 static uint32_t* handle_base64_end(unsigned char n, uint32_t *out, bool *base64, bool abrupt, uint16_t *surrogate1)
499 {
500 	if (abrupt || n == ILLEGAL || *surrogate1) {
501 		*out++ = MBFL_BAD_INPUT;
502 		*surrogate1 = 0;
503 	}
504 
505 	*base64 = false;
506 	return out;
507 }
508 
mb_utf7imap_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)509 static size_t mb_utf7imap_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
510 {
511 	ZEND_ASSERT(bufsize >= 5); /* This function will infinite-loop if called with a tiny output buffer */
512 
513 	/* Why does this require a minimum output buffer size of 5?
514 	 * See comment in mb_utf7_to_wchar; the worst case for this function is similar,
515 	 * though not exactly the same. */
516 
517 	unsigned char *p = *in, *e = p + *in_len;
518 	/* Always leave one empty space in output buffer in case the string ends while
519 	 * in Base64 mode and we need to emit an error marker */
520 	uint32_t *out = buf, *limit = buf + bufsize - 1;
521 
522 	bool base64 = *state & 1;
523 	uint16_t surrogate1 = (*state >> 1); /* First half of a surrogate pair */
524 
525 	while (p < e && out < limit) {
526 		if (base64) {
527 			/* Base64 section */
528 			if ((limit - out) < 4) {
529 				break;
530 			}
531 
532 			unsigned char n1 = decode_base64(*p++);
533 			if (is_base64_end(n1)) {
534 				out = handle_base64_end(n1, out, &base64, false, &surrogate1);
535 				continue;
536 			} else if (p == e) {
537 				out = handle_base64_end(n1, out, &base64, true, &surrogate1);
538 				continue;
539 			}
540 			unsigned char n2 = decode_base64(*p++);
541 			if (is_base64_end(n2) || p == e) {
542 				out = handle_base64_end(n2, out, &base64, true, &surrogate1);
543 				continue;
544 			}
545 			unsigned char n3 = decode_base64(*p++);
546 			if (is_base64_end(n3)) {
547 				out = handle_base64_end(n3, out, &base64, true, &surrogate1);
548 				continue;
549 			}
550 			out = handle_utf16_cp((n1 << 10) | (n2 << 4) | ((n3 & 0x3C) >> 2), out, &surrogate1);
551 			if (p == e) {
552 				/* It is an error if trailing padding bits are not zeroes or if we were
553 				 * expecting the 2nd part of a surrogate pair when Base64 section ends */
554 				if ((n3 & 0x3) || surrogate1)
555 					*out++ = MBFL_BAD_INPUT;
556 				break;
557 			}
558 
559 			unsigned char n4 = decode_base64(*p++);
560 			if (is_base64_end(n4)) {
561 				out = handle_base64_end(n4, out, &base64, n3 & 0x3, &surrogate1);
562 				continue;
563 			} else if (p == e) {
564 				out = handle_base64_end(n4, out, &base64, true, &surrogate1);
565 				continue;
566 			}
567 			unsigned char n5 = decode_base64(*p++);
568 			if (is_base64_end(n5) || p == e) {
569 				out = handle_base64_end(n5, out, &base64, true, &surrogate1);
570 				continue;
571 			}
572 			unsigned char n6 = decode_base64(*p++);
573 			if (is_base64_end(n6)) {
574 				out = handle_base64_end(n6, out, &base64, true, &surrogate1);
575 				continue;
576 			}
577 			out = handle_utf16_cp((n3 << 14) | (n4 << 8) | (n5 << 2) | ((n6 & 0x30) >> 4), out, &surrogate1);
578 			if (p == e) {
579 				if ((n6 & 0xF) || surrogate1)
580 					*out++ = MBFL_BAD_INPUT;
581 				break;
582 			}
583 
584 			unsigned char n7 = decode_base64(*p++);
585 			if (is_base64_end(n7)) {
586 				out = handle_base64_end(n7, out, &base64, n6 & 0xF, &surrogate1);
587 				continue;
588 			} else if (p == e) {
589 				out = handle_base64_end(n7, out, &base64, true, &surrogate1);
590 				continue;
591 			}
592 			unsigned char n8 = decode_base64(*p++);
593 			if (is_base64_end(n8)) {
594 				out = handle_base64_end(n8, out, &base64, true, &surrogate1);
595 				continue;
596 			}
597 			out = handle_utf16_cp((n6 << 12) | (n7 << 6) | n8, out, &surrogate1);
598 		} else {
599 			unsigned char c = *p++;
600 
601 			if (c == '&') {
602 				if (p < e && *p == '-') {
603 					*out++ = '&';
604 					p++;
605 				} else {
606 					base64 = true;
607 				}
608 			} else if (c >= 0x20 && c <= 0x7E) {
609 				*out++ = c;
610 			} else {
611 				*out++ = MBFL_BAD_INPUT;
612 			}
613 		}
614 	}
615 
616 	if (p == e && base64) {
617 		/* UTF7-IMAP doesn't allow strings to end in Base64 mode
618 		 * One space in output buffer was reserved just for this */
619 		*out++ = MBFL_BAD_INPUT;
620 	}
621 
622 	*state = (surrogate1 << 1) | base64;
623 	*in_len = e - p;
624 	*in = p;
625 	return out - buf;
626 }
627 
/* Pack / unpack the encoder state kept in `buf->state` between calls:
 *   bit 0      `base64`  currently inside a Base64 section
 *   bits 1..3  `nbits`   number of bits held in `cache`
 *   bits 4..   `cache`   bits not yet written out
 * Both macros operate on the local variables `buf`, `base64`, `nbits` and
 * `cache` of the function using them. They are wrapped in do/while(0) so
 * that each behaves as a single statement, including under an unbraced `if`. */
#define SAVE_CONVERSION_STATE() \
	do { \
		buf->state = (cache << 4) | (nbits << 1) | base64; \
	} while (0)
#define RESTORE_CONVERSION_STATE() \
	do { \
		base64 = (buf->state & 1); \
		nbits = (buf->state >> 1) & 0x7; \
		cache = (buf->state >> 4); \
	} while (0)
630 
/* Modified Base64 alphabet used by mb_wchar_to_utf7imap: index 0..63 maps a
 * 6-bit value to its character, with ',' (not '/') at index 63. The string
 * literal's terminating NUL supplies the 65th entry (0x00). */
static const unsigned char mbfl_base64_table[] =
	"ABCDEFGHIJKLM"
	"NOPQRSTUVWXYZ"
	"abcdefghijklm"
	"nopqrstuvwxyz"
	"0123456789+,";
643 
/*
 * Encode `len` codepoints from `in` as UTF7-IMAP, appending to `buf`.
 *
 * Encoder state (Base64 flag, count of cached bits, cached bits) is loaded
 * from `buf->state` on entry. If `end` is true the cached bits are flushed
 * and an open Base64 section is closed with '-'; otherwise the state is
 * saved back to `buf->state` for the next call.
 */
static void mb_wchar_to_utf7imap(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	bool base64;
	unsigned char nbits, cache; /* `nbits` is the number of cached bits; either 0, 2, or 4 */
	RESTORE_CONVERSION_STATE();

	while (len--) {
		uint32_t cp = *in++;

		if (!base64) {
			/* ASCII section */
			if (cp == '&') {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
				out = mb_convert_buf_add2(out, '&', '-');
			} else if (cp >= 0x20 && cp <= 0x7E) {
				out = mb_convert_buf_add(out, cp);
			} else if (cp >= MBFL_WCSPLANE_UTF32MAX) {
				/* Not a codepoint; let the error handler produce the output */
				buf->state = 0;
				MB_CONVERT_ERROR(buf, out, limit, cp, mb_wchar_to_utf7imap);
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
				RESTORE_CONVERSION_STATE();
			} else {
				/* Open a Base64 section, then process this same codepoint
				 * again on the next iteration, in Base64 mode */
				out = mb_convert_buf_add(out, '&');
				base64 = true;
				in--; len++;
			}
			continue;
		}

		/* Base64 section */
		if (cp >= 0x20 && cp <= 0x7E) {
			/* End of Base64 section. Drain buffered bits (if any), close Base64 section
			 * Leave enough space in the output buffer such that even if the remainder of
			 * the input string is ASCII, we can output the whole thing without having to
			 * check for output buffer space again */
			base64 = false;
			in--; len++; /* Unconsume codepoint; the ASCII section code will handle it */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			if (nbits) {
				out = mb_convert_buf_add(out, mbfl_base64_table[(cache << (6 - nbits)) & 0x3F]);
			}
			nbits = cache = 0;
			out = mb_convert_buf_add(out, '-');
		} else if (cp >= MBFL_WCSPLANE_UTF32MAX) {
			/* Make recursive call to add an error marker character */
			SAVE_CONVERSION_STATE();
			MB_CONVERT_ERROR(buf, out, limit, cp, mb_wchar_to_utf7imap);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
			RESTORE_CONVERSION_STATE();
		} else {
			/* Encode codepoint, preceded by any cached bits, as Base64
			 * Make enough space in the output buffer to hold both any bytes that
			 * we emit right here, plus any finishing byte which might need to
			 * be emitted if the input string ends abruptly */
			uint64_t bits;
			if (cp >= MBFL_WCSPLANE_SUPMIN) {
				/* Must use surrogate pair: two 16-bit code units */
				MB_CONVERT_BUF_ENSURE(buf, out, limit, 7);
				cp -= 0x10000;
				bits = ((uint64_t)cache << 32) | 0xD800DC00L | ((cp & 0xFFC00) << 6) | (cp & 0x3FF);
				nbits += 32;
			} else {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, 4);
				bits = (cache << 16) | cp;
				nbits += 16;
			}

			/* Write out as many whole 6-bit groups as are available */
			while (nbits >= 6) {
				out = mb_convert_buf_add(out, mbfl_base64_table[(bits >> (nbits - 6)) & 0x3F]);
				nbits -= 6;
			}
			/* Only the low `nbits` bits kept here are meaningful */
			cache = bits;
		}
	}

	if (end) {
		if (nbits) {
			out = mb_convert_buf_add(out, mbfl_base64_table[(cache << (6 - nbits)) & 0x3F]);
		}
		if (base64) {
			MB_CONVERT_BUF_ENSURE(buf, out, limit, 1);
			out = mb_convert_buf_add(out, '-');
		}
	} else {
		SAVE_CONVERSION_STATE();
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
734 
/*
 * Check whether the UTF-16 code unit `cp` may appear at this point of a
 * Base64 section. `is_surrogate` tells whether the 2nd half of a surrogate
 * pair is expected.
 *
 * - When a 2nd half is expected, only 0xDC00..0xDFFF is acceptable.
 * - Otherwise a 2nd half is not acceptable, and neither is printable ASCII
 *   (0x20..0x7E) other than '&'.
 */
static bool is_utf16_cp_valid(uint16_t cp, bool is_surrogate)
{
	bool low_half = (cp >= 0xDC00 && cp <= 0xDFFF);

	if (is_surrogate) {
		return low_half;
	}
	if (low_half) {
		/* 2nd part of surrogate pair came unexpectedly */
		return false;
	}

	bool printable = (cp >= 0x20 && cp <= 0x7E);
	return !printable || cp == '&';
}
747 
mb_check_utf7imap(unsigned char * in,size_t in_len)748 static bool mb_check_utf7imap(unsigned char *in, size_t in_len)
749 {
750 	unsigned char *p = in, *e = p + in_len;
751 	bool base64 = false;
752 	bool is_surrogate = false;
753 
754 	while (p < e) {
755 		if (base64) {
756 			/* Base64 section */
757 			unsigned char n1 = decode_base64(*p++);
758 			if (is_base64_end(n1)) {
759 				if (!is_base64_end_valid(n1, false, is_surrogate)) {
760 					return false;
761 				}
762 				base64 = false;
763 				continue;
764 			} else if (p == e) {
765 				return false;
766 			}
767 			unsigned char n2 = decode_base64(*p++);
768 			if (is_base64_end(n2) || p == e) {
769 				return false;
770 			}
771 			unsigned char n3 = decode_base64(*p++);
772 			if (is_base64_end(n3)) {
773 				return false;
774 			}
775 			uint16_t cp1 = (n1 << 10) | (n2 << 4) | ((n3 & 0x3C) >> 2);
776 			if (!is_utf16_cp_valid(cp1, is_surrogate)) {
777 				return false;
778 			}
779 			is_surrogate = has_surrogate(cp1, is_surrogate);
780 			if (p == e) {
781 				return false;
782 			}
783 
784 			unsigned char n4 = decode_base64(*p++);
785 			if (is_base64_end(n4)) {
786 				if (!is_base64_end_valid(n4, n3 & 0x3, is_surrogate)) {
787 					return false;
788 				}
789 				base64 = false;
790 				continue;
791 			} else if (p == e) {
792 				return false;
793 			}
794 			unsigned char n5 = decode_base64(*p++);
795 			if (is_base64_end(n5) || p == e) {
796 				return false;
797 			}
798 			unsigned char n6 = decode_base64(*p++);
799 			if (is_base64_end(n6)) {
800 				return false;
801 			}
802 			uint16_t cp2 = (n3 << 14) | (n4 << 8) | (n5 << 2) | ((n6 & 0x30) >> 4);
803 			if (!is_utf16_cp_valid(cp2, is_surrogate)) {
804 				return false;
805 			}
806 			is_surrogate = has_surrogate(cp2, is_surrogate);
807 			if (p == e) {
808 				return false;
809 			}
810 
811 			unsigned char n7 = decode_base64(*p++);
812 			if (is_base64_end(n7)) {
813 				if (!is_base64_end_valid(n7, n6 & 0xF, is_surrogate)) {
814 					return false;
815 				}
816 				base64 = false;
817 				continue;
818 			} else if (p == e) {
819 				return false;
820 			}
821 			unsigned char n8 = decode_base64(*p++);
822 			if (is_base64_end(n8)) {
823 				return false;
824 			}
825 			uint16_t cp3 = (n6 << 12) | (n7 << 6) | n8;
826 			if (!is_utf16_cp_valid(cp3, is_surrogate)) {
827 				return false;
828 			}
829 			is_surrogate = has_surrogate(cp3, is_surrogate);
830 		} else {
831 			/* ASCII text section */
832 			unsigned char c = *p++;
833 
834 			if (c == '&') {
835 				if (p == e) {
836 					return false;
837 				}
838 				unsigned char n = decode_base64(*p);
839 				if (n == DASH) {
840 					p++;
841 				} else if (n == ILLEGAL) {
842 					return false;
843 				} else {
844 					base64 = true;
845 				}
846 			} else if (c >= 0x20 && c <= 0x7E) {
847 				continue;
848 			} else {
849 				return false;
850 			}
851 		}
852 	}
853 	return !base64;
854 }
855