1 /*
2 * "streamable kanji code filter and converter"
3 * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
4 *
5 * LICENSE NOTICES
6 *
7 * This file is part of "streamable kanji code filter and converter",
8 * which is distributed under the terms of GNU Lesser General Public
9 * License (version 2) as published by the Free Software Foundation.
10 *
11 * This software is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with "streamable kanji code filter and converter";
18 * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
19 * Suite 330, Boston, MA 02111-1307 USA
20 *
21 * The author of this file:
22 *
23 */
24 /*
 * The source code included in this file was separated from mbfilter.c
26 * by moriyoshi koizumi <moriyoshi@php.net> on 4 dec 2002.
27 *
28 */
29
30 /* Modified UTF-7 used for 'international mailbox names' in the IMAP protocol
31 * Also known as mUTF-7
32 * Defined in RFC 3501 5.1.3 (https://tools.ietf.org/html/rfc3501)
33 *
34 * Quoting from the RFC:
35 *
36 ***********************************************************************
37 * In modified UTF-7, printable US-ASCII characters, except for "&",
38 * represent themselves; that is, characters with octet values 0x20-0x25
39 * and 0x27-0x7e. The character "&" (0x26) is represented by the
40 * two-octet sequence "&-".
41 *
42 * All other characters (octet values 0x00-0x1f and 0x7f-0xff) are
43 * represented in modified BASE64, with a further modification from
44 * UTF-7 that "," is used instead of "/". Modified BASE64 MUST NOT be
45 * used to represent any printing US-ASCII character which can represent
46 * itself.
47 *
48 * "&" is used to shift to modified BASE64 and "-" to shift back to
49 * US-ASCII. There is no implicit shift from BASE64 to US-ASCII, and
50 * null shifts ("-&" while in BASE64; note that "&-" while in US-ASCII
51 * means "&") are not permitted. However, all names start in US-ASCII,
52 * and MUST end in US-ASCII; that is, a name that ends with a non-ASCII
53 * ISO-10646 character MUST end with a "-").
54 ***********************************************************************
55 *
56 * The purpose of all this is: 1) to keep all parts of IMAP messages 7-bit clean,
57 * 2) to avoid giving special treatment to +, /, \, and ~, since these are
58 * commonly used in mailbox names, and 3) to ensure there is only one
59 * representation of any mailbox name (vanilla UTF-7 does allow multiple
60 * representations of the same string, by Base64-encoding characters which
61 * could have been included as ASCII literals.)
62 *
63 * RFC 2152 also applies, since it defines vanilla UTF-7 (minus IMAP modifications)
64 * The following paragraph is notable:
65 *
66 ***********************************************************************
67 * Unicode is encoded using Modified Base64 by first converting Unicode
68 * 16-bit quantities to an octet stream (with the most significant octet first).
69 * Surrogate pairs (UTF-16) are converted by treating each half of the pair as
70 * a separate 16 bit quantity (i.e., no special treatment). Text with an odd
71 * number of octets is ill-formed. ISO 10646 characters outside the range
72 * addressable via surrogate pairs cannot be encoded.
73 ***********************************************************************
74 *
75 * So after reversing the modified Base64 encoding on an encoded section,
76 * the contents are interpreted as UTF-16BE. */
77
78 #include "mbfilter.h"
79 #include "mbfilter_utf7imap.h"
80 #include "utf7_helper.h"
81
/* Forward declarations of file-local functions which are defined further down
 * but are referenced by the encoding descriptor and filter tables below */
static int mbfl_filt_conv_wchar_utf7imap_flush(mbfl_convert_filter *filter);
static int mbfl_filt_conv_utf7imap_wchar_flush(mbfl_convert_filter *filter);
static size_t mb_utf7imap_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
static void mb_wchar_to_utf7imap(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
static bool mb_check_utf7imap(unsigned char *in, size_t in_len);
87
/* NULL-terminated list of alternate names for this encoding */
static const char *mbfl_encoding_utf7imap_aliases[] = {"mUTF-7", NULL};

/* Encoding descriptor for UTF7-IMAP.
 * The initializer is positional, so the meaning of each slot is fixed by the
 * declaration of mbfl_encoding (not visible in this file); notes on the
 * unlabeled slots below are assumptions to be confirmed against that declaration. */
const mbfl_encoding mbfl_encoding_utf7imap = {
	mbfl_no_encoding_utf7imap,
	"UTF7-IMAP",
	NULL, /* NOTE(review): presumably the MIME name slot - confirm */
	mbfl_encoding_utf7imap_aliases,
	NULL, /* NOTE(review): meaning not visible here - confirm */
	0, /* NOTE(review): presumably a flags word - confirm */
	&vtbl_utf7imap_wchar, /* filter table for UTF7-IMAP -> wchar */
	&vtbl_wchar_utf7imap, /* filter table for wchar -> UTF7-IMAP */
	mb_utf7imap_to_wchar, /* buffer-based decoder */
	mb_wchar_to_utf7imap, /* buffer-based encoder */
	mb_check_utf7imap /* validity checker */
};
103
/* Filter table for converting UTF7-IMAP to wchar one input byte at a time.
 * Lists the source and destination encoding IDs, the common constructor, the
 * per-byte filter function and its flush function. The two NULL slots are left
 * unset; their meaning is fixed by struct mbfl_convert_vtbl (not visible here). */
const struct mbfl_convert_vtbl vtbl_utf7imap_wchar = {
	mbfl_no_encoding_utf7imap,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_utf7imap_wchar,
	mbfl_filt_conv_utf7imap_wchar_flush,
	NULL,
};
113
/* Filter table for converting wchar to UTF7-IMAP one codepoint at a time.
 * Lists the source and destination encoding IDs, the common constructor, the
 * per-codepoint filter function and its flush function. The two NULL slots are
 * left unset; their meaning is fixed by struct mbfl_convert_vtbl (not visible here). */
const struct mbfl_convert_vtbl vtbl_wchar_utf7imap = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_utf7imap,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_utf7imap,
	mbfl_filt_conv_wchar_utf7imap_flush,
	NULL,
};
123
/* Evaluate `statement`; if its value is negative, return -1 from the enclosing
 * function. Used to propagate failures reported by output/filter callbacks. */
#define CK(statement) do { if ((statement) < 0) return (-1); } while (0)
125
/* Handle one complete UTF-16 code unit recovered from a Modified Base64 section.
 *
 * `unit`  is the 16-bit code unit which was just completed.
 * `carry` holds the leftover low bits of the last Base64 digit, already shifted
 *         into the position where the next code unit will pick them up
 *         (always within the low 16 bits).
 *
 * While the first half of a surrogate pair is waiting for its partner, it is
 * kept in bits 16-27 of filter->cache as ((high & 0x3ff) << 16) + 0x400000.
 * The added 0x400000 guarantees the field is non-zero even for a zero payload,
 * and shifted right by 6 it becomes the 0x10000 offset of the final codepoint.
 *
 * filter->cache is always rewritten before anything is emitted, so that stale
 * bits never leak into the next code unit, even on erroneous input.
 *
 * Returns 0 on success, -1 if the output callback reported failure. */
static int mbfl_filt_utf7imap_utf16_unit(int unit, int carry, mbfl_convert_filter *filter)
{
	int pending = filter->cache & 0xfff0000; /* Non-zero iff a 1st surrogate is waiting */

	if (unit >= 0xd800 && unit < 0xdc00) {
		/* 1st part of surrogate pair; remember it until the 2nd part arrives */
		filter->cache = (((unit & 0x3ff) << 16) + 0x400000) | carry;
		if (pending) {
			/* The previous 1st part never received its 2nd part */
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
	} else if (unit >= 0xdc00 && unit < 0xe000) {
		/* 2nd part of surrogate pair */
		filter->cache = carry;
		if (pending) {
			CK((*filter->output_function)((unit & 0x3ff) | (pending >> 6), filter->data));
		} else { /* 2nd part arrived without a 1st part */
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
	} else {
		filter->cache = carry;
		if (pending) {
			/* We were waiting for the 2nd part of a surrogate pair */
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		/* Characters which can be expressed as literal, ASCII characters
		 * should not be Base64-encoded */
		if (unit < 0x20 || unit > 0x7E || unit == '&') {
			CK((*filter->output_function)(unit, filter->data));
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
	}

	return 0;
}

/* Byte-at-a-time decoder: UTF7-IMAP -> wchar.
 *
 * `c` is the next input byte; decoded codepoints (or MBFL_BAD_INPUT for invalid
 * input) are passed to filter->output_function.
 *
 * filter->status tracks the position in the input:
 *   0     literal ASCII section
 *   1     just saw '&'
 *   2..9  inside a Modified Base64 section; 8 Base64 digits carry exactly three
 *         UTF-16 code units, and the status records how many digits of the
 *         current group of 8 have been consumed (2 = none, 3 = one, ... 9 = seven)
 * filter->cache accumulates the bits of the code unit under construction in its
 * low 16 bits, and a pending 1st surrogate in bits 16-27.
 *
 * Returns 0 on success, -1 if the output callback reported failure. */
int mbfl_filt_conv_utf7imap_wchar(int c, mbfl_convert_filter *filter)
{
	int n = -1;

	if (filter->status != 0) { /* Modified Base64 */
		if (c >= 'A' && c <= 'Z') {
			n = c - 65;
		} else if (c >= 'a' && c <= 'z') {
			n = c - 71;
		} else if (c >= '0' && c <= '9') {
			n = c + 4;
		} else if (c == '+') {
			n = 62;
		} else if (c == ',') {
			n = 63;
		}

		if (n < 0) {
			/* Not a Base64 digit; the Base64 section ends here one way or another */
			int status = filter->status, cache = filter->cache;
			filter->cache = filter->status = 0;

			if (c == '-' && status == 1) {
				/* "&-" -> "&" */
				CK((*filter->output_function)('&', filter->data));
			} else if (c == '-' && cache == 0 && (status == 2 || status == 5 || status == 8)) {
				/* Base64-encoded section properly terminated by '-': we are on a
				 * code unit boundary, all padding bits are zero, and no 1st part
				 * of a surrogate pair is left waiting */
			} else {
				/* Either an illegal character, or the section ended abruptly with
				 * a partially encoded character, non-zero padding bits, or only
				 * the first half of a surrogate pair */
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			}
			return 0;
		}
	}

	switch (filter->status) {
	/* directly encoded characters */
	case 0:
		if (c == '&') { /* shift character */
			filter->status++;
		} else if (c >= 0x20 && c <= 0x7E) { /* ASCII */
			CK((*filter->output_function)(c, filter->data));
		} else { /* illegal character */
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	/* decode Modified Base64 */
	case 1:
	case 2:
		filter->cache |= n << 10;
		filter->status = 3;
		break;
	case 3:
		filter->cache |= n << 4;
		filter->status = 4;
		break;
	case 4:
		/* Top 4 bits of this digit complete the 1st code unit; low 2 bits carry over */
		filter->status = 5;
		CK(mbfl_filt_utf7imap_utf16_unit(((n >> 2) & 0xf) | (filter->cache & 0xffff), (n & 0x3) << 14, filter));
		break;

	case 5:
		filter->cache |= n << 8;
		filter->status = 6;
		break;
	case 6:
		filter->cache |= n << 2;
		filter->status = 7;
		break;
	case 7:
		/* Top 2 bits of this digit complete the 2nd code unit; low 4 bits carry over */
		filter->status = 8;
		CK(mbfl_filt_utf7imap_utf16_unit(((n >> 4) & 0x3) | (filter->cache & 0xffff), (n & 0xf) << 12, filter));
		break;

	case 8:
		filter->cache |= n << 6;
		filter->status = 9;
		break;
	case 9:
		/* All 6 bits of this digit complete the 3rd code unit; nothing carries over */
		filter->status = 2;
		CK(mbfl_filt_utf7imap_utf16_unit(n | (filter->cache & 0xffff), 0, filter));
		break;

		EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
289
/* Finish decoding a UTF7-IMAP string.
 *
 * If the input ended while still inside a Base64 section (filter->status is
 * non-zero), one MBFL_BAD_INPUT marker is emitted. Both status and cache are
 * then cleared, so that no leftover bits or pending surrogate can leak into
 * a later Base64 section. Finally filter->flush_function is invoked, if set.
 *
 * Always returns 0. */
static int mbfl_filt_conv_utf7imap_wchar_flush(mbfl_convert_filter *filter)
{
	if (filter->status) {
		/* It is illegal for a UTF-7 IMAP string to end in a Base-64 encoded
		 * section. It should always change back to ASCII before the end. */
		(*filter->output_function)(MBFL_BAD_INPUT, filter->data);
	}
	filter->status = filter->cache = 0;

	if (filter->flush_function) {
		(*filter->flush_function)(filter->data);
	}

	return 0;
}
305
/* Modified Base64 alphabet used by the byte-at-a-time encoder: index 0-63 maps
 * to 'A'-'Z', 'a'-'z', '0'-'9', '+' and ','. The string literal's terminating
 * NUL supplies the 65th entry (0x00). */
static const unsigned char mbfl_utf7imap_base64_table[] =
	"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
	"abcdefghijklmnopqrstuvwxyz"
	"0123456789+,";
319
/* Codepoint-at-a-time encoder: wchar -> UTF7-IMAP.
 *
 * `c` is the next codepoint; encoded bytes go to filter->output_function.
 *
 * filter->status: 0 = literal ASCII section; 1, 2, 3 = inside a Base64 section
 * with 16, 20 or 18 not-yet-emitted bits (respectively) held in filter->cache.
 * Each codepoint which needs Base64 is buffered in the cache, and the digits
 * for the previously buffered bits are emitted when the next codepoint arrives
 * (or in the flush function).
 *
 * Codepoints in the supplementary planes are split into a surrogate pair, and
 * each half is fed back through filter->filter_function.
 *
 * NOTE(review): NUL (c == 0) is emitted literally here, whereas
 * mb_wchar_to_utf7imap in this file Base64-encodes it - confirm which is intended.
 *
 * Returns 0 on success, -1 if a callback reported failure. */
int mbfl_filt_conv_wchar_utf7imap(int c, mbfl_convert_filter *filter)
{
	/* 0: must be Base64-encoded; 1: '&' (written as "&-"); 2: any other literal */
	int literal = 0;
	int bits;

	if (c == '&') {
		literal = 1;
	} else if (c == 0 || (c >= 0x20 && c <= 0x7e)) {
		literal = 2;
	} else if (c < 0 || c >= MBFL_WCSPLANE_UCS2MAX) {
		if (c >= MBFL_WCSPLANE_SUPMIN && c < MBFL_WCSPLANE_SUPMAX) {
			/* Split into a surrogate pair and encode both halves */
			int half = ((c >> 10) - 0x40) | 0xd800;
			CK((*filter->filter_function)(half, filter));
			half = (c & 0x3ff) | 0xdc00;
			CK((*filter->filter_function)(half, filter));
			return 0;
		}
		CK(mbfl_filt_conv_illegal_output(c, filter));
		return 0;
	}

	switch (filter->status) {
	case 0:
		if (literal == 0) { /* open a Modified Base64 section */
			CK((*filter->output_function)('&', filter->data));
			filter->status = 1;
			filter->cache = c;
			return 0;
		}
		/* directly encoded character; nothing to close first */
		break;

	/* encode Modified Base64 */
	case 1: /* 16 bits buffered */
		bits = filter->cache;
		CK((*filter->output_function)(mbfl_utf7imap_base64_table[(bits >> 10) & 0x3f], filter->data));
		CK((*filter->output_function)(mbfl_utf7imap_base64_table[(bits >> 4) & 0x3f], filter->data));
		if (literal == 0) {
			filter->status = 2;
			filter->cache = ((bits & 0xf) << 16) | c;
			return 0;
		}
		/* Pad the remaining 4 bits out to a full digit and close the section */
		CK((*filter->output_function)(mbfl_utf7imap_base64_table[(bits << 2) & 0x3c], filter->data));
		CK((*filter->output_function)('-', filter->data));
		break;

	case 2: /* 20 bits buffered */
		bits = filter->cache;
		CK((*filter->output_function)(mbfl_utf7imap_base64_table[(bits >> 14) & 0x3f], filter->data));
		CK((*filter->output_function)(mbfl_utf7imap_base64_table[(bits >> 8) & 0x3f], filter->data));
		CK((*filter->output_function)(mbfl_utf7imap_base64_table[(bits >> 2) & 0x3f], filter->data));
		if (literal == 0) {
			filter->status = 3;
			filter->cache = ((bits & 0x3) << 16) | c;
			return 0;
		}
		/* Pad the remaining 2 bits out to a full digit and close the section */
		CK((*filter->output_function)(mbfl_utf7imap_base64_table[(bits << 4) & 0x30], filter->data));
		CK((*filter->output_function)('-', filter->data));
		break;

	case 3: /* 18 bits buffered */
		bits = filter->cache;
		CK((*filter->output_function)(mbfl_utf7imap_base64_table[(bits >> 12) & 0x3f], filter->data));
		CK((*filter->output_function)(mbfl_utf7imap_base64_table[(bits >> 6) & 0x3f], filter->data));
		CK((*filter->output_function)(mbfl_utf7imap_base64_table[bits & 0x3f], filter->data));
		if (literal == 0) {
			filter->status = 1;
			filter->cache = c;
			return 0;
		}
		/* No leftover bits; just close the section */
		CK((*filter->output_function)('-', filter->data));
		break;

		EMPTY_SWITCH_DEFAULT_CASE();
	}

	/* Only reached for a literal character, with any Base64 section already closed */
	CK((*filter->output_function)(c, filter->data));
	if (literal == 1) {
		CK((*filter->output_function)('-', filter->data)); /* '&' is written as "&-" */
	}
	filter->status = 0;

	return 0;
}
416
/* Finish encoding to UTF7-IMAP: if a Base64 section is still open, emit the
 * digits for whatever bits remain buffered (zero-padding the last digit) and
 * close the section with '-'. The filter state is cleared before anything is
 * emitted.
 *
 * Returns 0 on success, -1 if the output callback reported failure. */
static int mbfl_filt_conv_wchar_utf7imap_flush(mbfl_convert_filter *filter)
{
	const int pending = filter->status;
	const int bits = filter->cache;
	unsigned char digits[4];
	int ndigits = 0;

	filter->status = filter->cache = 0;

	/* flush fragments */
	if (pending == 1) {
		/* 16 bits buffered */
		digits[ndigits++] = mbfl_utf7imap_base64_table[(bits >> 10) & 0x3f];
		digits[ndigits++] = mbfl_utf7imap_base64_table[(bits >> 4) & 0x3f];
		digits[ndigits++] = mbfl_utf7imap_base64_table[(bits << 2) & 0x3c];
	} else if (pending == 2) {
		/* 20 bits buffered */
		digits[ndigits++] = mbfl_utf7imap_base64_table[(bits >> 14) & 0x3f];
		digits[ndigits++] = mbfl_utf7imap_base64_table[(bits >> 8) & 0x3f];
		digits[ndigits++] = mbfl_utf7imap_base64_table[(bits >> 2) & 0x3f];
		digits[ndigits++] = mbfl_utf7imap_base64_table[(bits << 4) & 0x30];
	} else if (pending == 3) {
		/* 18 bits buffered */
		digits[ndigits++] = mbfl_utf7imap_base64_table[(bits >> 12) & 0x3f];
		digits[ndigits++] = mbfl_utf7imap_base64_table[(bits >> 6) & 0x3f];
		digits[ndigits++] = mbfl_utf7imap_base64_table[bits & 0x3f];
	} else {
		/* Not inside a Base64 section; nothing to write */
		return 0;
	}

	for (int i = 0; i < ndigits; i++) {
		CK((*filter->output_function)(digits[i], filter->data));
	}
	CK((*filter->output_function)('-', filter->data));

	return 0;
}
449
is_base64_end(unsigned char c)450 static inline bool is_base64_end(unsigned char c)
451 {
452 return c >= DASH;
453 }
454
decode_base64(unsigned char c)455 static unsigned char decode_base64(unsigned char c)
456 {
457 if (c >= 'A' && c <= 'Z') {
458 return c - 65;
459 } else if (c >= 'a' && c <= 'z') {
460 return c - 71;
461 } else if (c >= '0' && c <= '9') {
462 return c + 4;
463 } else if (c == '+') {
464 return 62;
465 } else if (c == ',') {
466 return 63;
467 } else if (c == '-') {
468 return DASH;
469 }
470 return ILLEGAL;
471 }
472
handle_utf16_cp(uint16_t cp,uint32_t * out,uint16_t * surrogate1)473 static uint32_t* handle_utf16_cp(uint16_t cp, uint32_t *out, uint16_t *surrogate1)
474 {
475 retry:
476 if (*surrogate1) {
477 if (cp >= 0xDC00 && cp <= 0xDFFF) {
478 *out++ = ((*surrogate1 & 0x3FF) << 10) + (cp & 0x3FF) + 0x10000;
479 *surrogate1 = 0;
480 } else {
481 *out++ = MBFL_BAD_INPUT;
482 *surrogate1 = 0;
483 goto retry;
484 }
485 } else if (cp >= 0xD800 && cp <= 0xDBFF) {
486 *surrogate1 = cp;
487 } else if (cp >= 0xDC00 && cp <= 0xDFFF) {
488 /* 2nd part of surrogate pair came unexpectedly */
489 *out++ = MBFL_BAD_INPUT;
490 } else if (cp >= 0x20 && cp <= 0x7E && cp != '&') {
491 *out++ = MBFL_BAD_INPUT;
492 } else {
493 *out++ = cp;
494 }
495 return out;
496 }
497
handle_base64_end(unsigned char n,uint32_t * out,bool * base64,bool abrupt,uint16_t * surrogate1)498 static uint32_t* handle_base64_end(unsigned char n, uint32_t *out, bool *base64, bool abrupt, uint16_t *surrogate1)
499 {
500 if (abrupt || n == ILLEGAL || *surrogate1) {
501 *out++ = MBFL_BAD_INPUT;
502 *surrogate1 = 0;
503 }
504
505 *base64 = false;
506 return out;
507 }
508
/* Buffer-based decoder: UTF7-IMAP -> wchar.
 *
 * Reads bytes from *in (at most *in_len of them) and writes decoded codepoints,
 * or MBFL_BAD_INPUT for invalid input, to `buf` (capacity `bufsize` entries).
 * On return *in and *in_len describe the input which has not been consumed
 * yet, and the return value is the number of entries written to `buf`.
 *
 * *state carries decoder state between calls: bit 0 is set while inside a
 * Base64 section; the remaining bits hold a pending 1st surrogate (0 if none).
 *
 * Inside a Base64 section, input is handled in groups of up to 8 Base64 digits
 * (n1..n8), which carry exactly three UTF-16 code units. */
static size_t mb_utf7imap_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
{
	ZEND_ASSERT(bufsize >= 5); /* This function will infinite-loop if called with a tiny output buffer */

	/* Why does this require a minimum output buffer size of 5?
	 * See comment in mb_utf7_to_wchar; the worst case for this function is similar,
	 * though not exactly the same. */

	unsigned char *p = *in, *e = p + *in_len;
	/* Always leave one empty space in output buffer in case the string ends while
	 * in Base64 mode and we need to emit an error marker */
	uint32_t *out = buf, *limit = buf + bufsize - 1;

	bool base64 = *state & 1;
	uint16_t surrogate1 = (*state >> 1); /* First half of a surrogate pair */

	while (p < e && out < limit) {
		if (base64) {
			/* Base64 section */
			/* Do not start a group of digits unless 4 output slots are free */
			if ((limit - out) < 4) {
				break;
			}

			/* Digits 1-3: first code unit. A section may legitimately end before
			 * digit 1; ending (or running out of input) after that is abrupt */
			unsigned char n1 = decode_base64(*p++);
			if (is_base64_end(n1)) {
				out = handle_base64_end(n1, out, &base64, false, &surrogate1);
				continue;
			} else if (p == e) {
				out = handle_base64_end(n1, out, &base64, true, &surrogate1);
				continue;
			}
			unsigned char n2 = decode_base64(*p++);
			if (is_base64_end(n2) || p == e) {
				out = handle_base64_end(n2, out, &base64, true, &surrogate1);
				continue;
			}
			unsigned char n3 = decode_base64(*p++);
			if (is_base64_end(n3)) {
				out = handle_base64_end(n3, out, &base64, true, &surrogate1);
				continue;
			}
			out = handle_utf16_cp((n1 << 10) | (n2 << 4) | ((n3 & 0x3C) >> 2), out, &surrogate1);
			if (p == e) {
				/* It is an error if trailing padding bits are not zeroes or if we were
				 * expecting the 2nd part of a surrogate pair when Base64 section ends */
				if ((n3 & 0x3) || surrogate1)
					*out++ = MBFL_BAD_INPUT;
				break;
			}

			/* Digits 4-6: second code unit, which starts with the low 2 bits of n3.
			 * Ending at digit 4 is fine only if those 2 bits are zero */
			unsigned char n4 = decode_base64(*p++);
			if (is_base64_end(n4)) {
				out = handle_base64_end(n4, out, &base64, n3 & 0x3, &surrogate1);
				continue;
			} else if (p == e) {
				out = handle_base64_end(n4, out, &base64, true, &surrogate1);
				continue;
			}
			unsigned char n5 = decode_base64(*p++);
			if (is_base64_end(n5) || p == e) {
				out = handle_base64_end(n5, out, &base64, true, &surrogate1);
				continue;
			}
			unsigned char n6 = decode_base64(*p++);
			if (is_base64_end(n6)) {
				out = handle_base64_end(n6, out, &base64, true, &surrogate1);
				continue;
			}
			out = handle_utf16_cp((n3 << 14) | (n4 << 8) | (n5 << 2) | ((n6 & 0x30) >> 4), out, &surrogate1);
			if (p == e) {
				if ((n6 & 0xF) || surrogate1)
					*out++ = MBFL_BAD_INPUT;
				break;
			}

			/* Digits 7-8: third code unit, which starts with the low 4 bits of n6.
			 * Ending at digit 7 is fine only if those 4 bits are zero */
			unsigned char n7 = decode_base64(*p++);
			if (is_base64_end(n7)) {
				out = handle_base64_end(n7, out, &base64, n6 & 0xF, &surrogate1);
				continue;
			} else if (p == e) {
				out = handle_base64_end(n7, out, &base64, true, &surrogate1);
				continue;
			}
			unsigned char n8 = decode_base64(*p++);
			if (is_base64_end(n8)) {
				out = handle_base64_end(n8, out, &base64, true, &surrogate1);
				continue;
			}
			out = handle_utf16_cp((n6 << 12) | (n7 << 6) | n8, out, &surrogate1);
		} else {
			/* ASCII section */
			unsigned char c = *p++;

			if (c == '&') {
				if (p < e && *p == '-') {
					/* "&-" stands for a literal '&' */
					*out++ = '&';
					p++;
				} else {
					base64 = true;
				}
			} else if (c >= 0x20 && c <= 0x7E) {
				*out++ = c;
			} else {
				*out++ = MBFL_BAD_INPUT;
			}
		}
	}

	if (p == e && base64) {
		/* UTF7-IMAP doesn't allow strings to end in Base64 mode
		 * One space in output buffer was reserved just for this */
		*out++ = MBFL_BAD_INPUT;
	}

	*state = (surrogate1 << 1) | base64;
	*in_len = e - p;
	*in = p;
	return out - buf;
}
627
/* Pack / unpack the encoder state kept in buf->state between calls to
 * mb_wchar_to_utf7imap. Layout: bit 0 = `base64` flag, bits 1-3 = `nbits`,
 * bits 4 and up = `cache`. Both macros expect locals named `base64`, `nbits`
 * and `cache`, and a `buf` pointer, to be in scope. */
#define SAVE_CONVERSION_STATE() buf->state = (cache << 4) | (nbits << 1) | base64
#define RESTORE_CONVERSION_STATE() base64 = (buf->state & 1); nbits = (buf->state >> 1) & 0x7; cache = (buf->state >> 4)
630
/* Modified Base64 alphabet used by the buffer-based encoder: index 0-63 maps
 * to 'A'-'Z', 'a'-'z', '0'-'9', '+' and ','. The string literal's terminating
 * NUL supplies the 65th entry (0x00). */
static const unsigned char mbfl_base64_table[] =
	"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
	"abcdefghijklmnopqrstuvwxyz"
	"0123456789+,";
643
/* Buffer-based encoder: wchar -> UTF7-IMAP.
 *
 * Encodes `len` codepoints starting at `in` and appends the result to `buf`.
 * If `end` is true, any buffered bits are written out and an open Base64
 * section is closed with '-'; otherwise the encoder state (whether we are in
 * a Base64 section, plus up to 4 buffered bits) is saved in buf->state so that
 * a later call can pick up where this one stopped.
 *
 * Printable ASCII (0x20-0x7E) is written literally, with '&' written as "&-".
 * Values of MBFL_WCSPLANE_UTF32MAX and above are handed to MB_CONVERT_ERROR.
 * Everything else is written as UTF-16BE in Modified Base64, using a surrogate
 * pair for MBFL_WCSPLANE_SUPMIN and above.
 *
 * NOTE(review): the MB_CONVERT_BUF_* and MB_CONVERT_ERROR macros are defined
 * outside this file; the comments below assume MB_CONVERT_BUF_ENSURE(.., n)
 * guarantees room for n more output bytes - confirm. */
static void mb_wchar_to_utf7imap(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	bool base64;
	unsigned char nbits, cache; /* `nbits` is the number of cached bits; either 0, 2, or 4 */
	RESTORE_CONVERSION_STATE();

	while (len--) {
		uint32_t w = *in++;
		if (base64) {
			if (w >= 0x20 && w <= 0x7E) {
				/* End of Base64 section. Drain buffered bits (if any), close Base64 section
				 * Leave enough space in the output buffer such that even if the remainder of
				 * the input string is ASCII, we can output the whole thing without having to
				 * check for output buffer space again */
				base64 = false;
				in--; len++; /* Unconsume codepoint; it will be handled by 'ASCII section' code below */
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
				if (nbits) {
					/* Left-align the cached bits in a final, zero-padded digit */
					out = mb_convert_buf_add(out, mbfl_base64_table[(cache << (6 - nbits)) & 0x3F]);
				}
				nbits = cache = 0;
				out = mb_convert_buf_add(out, '-');
			} else if (w >= MBFL_WCSPLANE_UTF32MAX) {
				/* Make recursive call to add an error marker character */
				SAVE_CONVERSION_STATE();
				MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_utf7imap);
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
				RESTORE_CONVERSION_STATE();
			} else {
				/* Encode codepoint, preceded by any cached bits, as Base64
				 * Make enough space in the output buffer to hold both any bytes that
				 * we emit right here, plus any finishing byte which might need to
				 * be emitted if the input string ends abruptly */
				uint64_t bits;
				if (w >= MBFL_WCSPLANE_SUPMIN) {
					/* Must use surrogate pair */
					MB_CONVERT_BUF_ENSURE(buf, out, limit, 7);
					w -= 0x10000;
					/* High surrogate in the upper 16 bits, low surrogate in the lower 16 */
					bits = ((uint64_t)cache << 32) | 0xD800DC00L | ((w & 0xFFC00) << 6) | (w & 0x3FF);
					nbits += 32;
				} else {
					MB_CONVERT_BUF_ENSURE(buf, out, limit, 4);
					bits = (cache << 16) | w;
					nbits += 16;
				}

				/* Emit as many complete 6-bit digits as we have, most significant first */
				while (nbits >= 6) {
					out = mb_convert_buf_add(out, mbfl_base64_table[(bits >> (nbits - 6)) & 0x3F]);
					nbits -= 6;
				}
				/* Only the low `nbits` bits of `cache` are meaningful from here on */
				cache = bits;
			}
		} else {
			/* ASCII section */
			if (w == '&') {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
				out = mb_convert_buf_add2(out, '&', '-');
			} else if (w >= 0x20 && w <= 0x7E) {
				out = mb_convert_buf_add(out, w);
			} else if (w >= MBFL_WCSPLANE_UTF32MAX) {
				/* Not in a Base64 section and nothing cached, so the saved state is simply zero */
				buf->state = 0;
				MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_utf7imap);
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
				RESTORE_CONVERSION_STATE();
			} else {
				out = mb_convert_buf_add(out, '&');
				base64 = true;
				in--; len++; /* Unconsume codepoint; it will be handled by Base64 code above */
			}
		}
	}

	if (end) {
		if (nbits) {
			/* Final digit, with the cached bits left-aligned and zero-padded */
			out = mb_convert_buf_add(out, mbfl_base64_table[(cache << (6 - nbits)) & 0x3F]);
		}
		if (base64) {
			MB_CONVERT_BUF_ENSURE(buf, out, limit, 1);
			out = mb_convert_buf_add(out, '-');
		}
	} else {
		SAVE_CONVERSION_STATE();
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
734
/* Decide whether UTF-16 code unit `cp`, decoded from a Base64 section, is
 * acceptable at this point.
 *
 * `is_surrogate` tells whether the previous code unit was the 1st half of a
 * surrogate pair. If so, only a 2nd half (0xDC00-0xDFFF) is acceptable.
 * Otherwise, a 2nd half is rejected, and so is printable ASCII other than '&',
 * since such characters must be written literally rather than in Base64. */
static bool is_utf16_cp_valid(uint16_t cp, bool is_surrogate)
{
	const bool is_low_surrogate = (cp & 0xFC00) == 0xDC00;

	if (is_surrogate) {
		return is_low_surrogate;
	}
	if (is_low_surrogate) {
		/* 2nd part of surrogate pair came unexpectedly */
		return false;
	}

	const bool printable_ascii = (cp >= 0x20 && cp <= 0x7E);
	return !printable_ascii || cp == '&';
}
747
/* Check whether the `in_len` bytes at `in` form a valid UTF7-IMAP string.
 *
 * Returns false as soon as anything invalid is found: a byte outside
 * 0x20-0x7E in a literal section, '&' at the very end of the string or
 * followed by a byte which is neither '-' nor a Base64 digit, a Base64
 * section which ends in the middle of a code unit or with non-zero padding
 * bits, an unpaired surrogate, printable ASCII (other than '&') encoded in
 * Base64, or a string which ends while still inside a Base64 section.
 *
 * Inside a Base64 section, input is examined in groups of up to 8 Base64
 * digits (n1..n8), which carry exactly three UTF-16 code units (cp1..cp3). */
static bool mb_check_utf7imap(unsigned char *in, size_t in_len)
{
	unsigned char *p = in, *e = p + in_len;
	bool base64 = false;
	bool is_surrogate = false; /* Set while the 2nd half of a surrogate pair is expected */

	while (p < e) {
		if (base64) {
			/* Base64 section */
			/* Digits 1-3: first code unit. The section may end cleanly before digit 1 */
			unsigned char n1 = decode_base64(*p++);
			if (is_base64_end(n1)) {
				if (!is_base64_end_valid(n1, false, is_surrogate)) {
					return false;
				}
				base64 = false;
				continue;
			} else if (p == e) {
				return false;
			}
			unsigned char n2 = decode_base64(*p++);
			if (is_base64_end(n2) || p == e) {
				return false;
			}
			unsigned char n3 = decode_base64(*p++);
			if (is_base64_end(n3)) {
				return false;
			}
			uint16_t cp1 = (n1 << 10) | (n2 << 4) | ((n3 & 0x3C) >> 2);
			if (!is_utf16_cp_valid(cp1, is_surrogate)) {
				return false;
			}
			is_surrogate = has_surrogate(cp1, is_surrogate);
			if (p == e) {
				/* String ended while still in Base64 mode */
				return false;
			}

			/* Digits 4-6: second code unit. Ending at digit 4 requires the low
			 * 2 bits of n3 (padding) to be zero */
			unsigned char n4 = decode_base64(*p++);
			if (is_base64_end(n4)) {
				if (!is_base64_end_valid(n4, n3 & 0x3, is_surrogate)) {
					return false;
				}
				base64 = false;
				continue;
			} else if (p == e) {
				return false;
			}
			unsigned char n5 = decode_base64(*p++);
			if (is_base64_end(n5) || p == e) {
				return false;
			}
			unsigned char n6 = decode_base64(*p++);
			if (is_base64_end(n6)) {
				return false;
			}
			uint16_t cp2 = (n3 << 14) | (n4 << 8) | (n5 << 2) | ((n6 & 0x30) >> 4);
			if (!is_utf16_cp_valid(cp2, is_surrogate)) {
				return false;
			}
			is_surrogate = has_surrogate(cp2, is_surrogate);
			if (p == e) {
				/* String ended while still in Base64 mode */
				return false;
			}

			/* Digits 7-8: third code unit. Ending at digit 7 requires the low
			 * 4 bits of n6 (padding) to be zero */
			unsigned char n7 = decode_base64(*p++);
			if (is_base64_end(n7)) {
				if (!is_base64_end_valid(n7, n6 & 0xF, is_surrogate)) {
					return false;
				}
				base64 = false;
				continue;
			} else if (p == e) {
				return false;
			}
			unsigned char n8 = decode_base64(*p++);
			if (is_base64_end(n8)) {
				return false;
			}
			uint16_t cp3 = (n6 << 12) | (n7 << 6) | n8;
			if (!is_utf16_cp_valid(cp3, is_surrogate)) {
				return false;
			}
			is_surrogate = has_surrogate(cp3, is_surrogate);
		} else {
			/* ASCII text section */
			unsigned char c = *p++;

			if (c == '&') {
				if (p == e) {
					return false;
				}
				/* Peek at the byte after '&' without consuming it, unless it is '-' */
				unsigned char n = decode_base64(*p);
				if (n == DASH) {
					/* "&-" stands for a literal '&' */
					p++;
				} else if (n == ILLEGAL) {
					return false;
				} else {
					base64 = true;
				}
			} else if (c >= 0x20 && c <= 0x7E) {
				continue;
			} else {
				return false;
			}
		}
	}
	/* A valid string must not end inside a Base64 section */
	return !base64;
}
855