1 /*
2 * "streamable kanji code filter and converter"
3 * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
4 *
5 * LICENSE NOTICES
6 *
7 * This file is part of "streamable kanji code filter and converter",
8 * which is distributed under the terms of GNU Lesser General Public
9 * License (version 2) as published by the Free Software Foundation.
10 *
11 * This software is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with "streamable kanji code filter and converter";
18 * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
19 * Suite 330, Boston, MA 02111-1307 USA
20 *
21 * The author of this file:
22 *
23 */
/*
 * The source code included in this file was separated from mbfilter.c
 * by moriyoshi koizumi <moriyoshi@php.net> on 4 dec 2002.
 *
 */
29
30 #include "mbfilter.h"
31 #include "mbfilter_utf7.h"
32 #include "utf7_helper.h"
33
/* Forward declarations of file-local functions which are referenced by the
 * conversion/encoding tables below before their definitions appear */
static int mbfl_filt_conv_utf7_wchar_flush(mbfl_convert_filter *filter);
static size_t mb_utf7_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
static void mb_wchar_to_utf7(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
static bool mb_check_utf7(unsigned char *in, size_t in_len);
38
/* Base64 alphabet used by the encoders in this file: indexing with a 6-bit
 * value (0-63) yields the corresponding ASCII character.
 * The array is initialized from a string literal, so it has 65 elements:
 * the 64 alphabet characters followed by a terminating zero byte. */
static const unsigned char mbfl_base64_table[] =
	"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
	"abcdefghijklmnopqrstuvwxyz"
	"0123456789+/";
51
/* Alternative names for this encoding; the list is terminated by NULL */
static const char *mbfl_encoding_utf7_aliases[] = {"utf7", NULL};

/* Encoding descriptor for UTF-7.
 * The initializer is positional, so the meaning of each value depends on the
 * field order of `mbfl_encoding`, which is declared outside this file.
 * NOTE(review): the per-field remarks below are inferred from the values
 * themselves -- confirm against the struct declaration. */
const mbfl_encoding mbfl_encoding_utf7 = {
	mbfl_no_encoding_utf7,
	"UTF-7", /* presumably the canonical name */
	"UTF-7", /* presumably the MIME name */
	mbfl_encoding_utf7_aliases,
	NULL,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_utf7_wchar, /* conversion table: UTF-7 -> wchar */
	&vtbl_wchar_utf7, /* conversion table: wchar -> UTF-7 */
	mb_utf7_to_wchar, /* buffer-based decoder defined in this file */
	mb_wchar_to_utf7, /* buffer-based encoder defined in this file */
	mb_check_utf7     /* validity checker defined in this file */
};
67
/* Conversion table for the UTF-7 -> wchar direction. It combines the shared
 * constructor with the filter function and flush function for that direction;
 * two slots are left as NULL.
 * NOTE(review): slot meanings depend on the field order of
 * `struct mbfl_convert_vtbl`, declared outside this file -- confirm there. */
const struct mbfl_convert_vtbl vtbl_utf7_wchar = {
	mbfl_no_encoding_utf7,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_utf7_wchar,
	mbfl_filt_conv_utf7_wchar_flush,
	NULL,
};
77
/* Conversion table for the wchar -> UTF-7 direction. It combines the shared
 * constructor with the filter function and flush function for that direction;
 * two slots are left as NULL.
 * NOTE(review): slot meanings depend on the field order of
 * `struct mbfl_convert_vtbl`, declared outside this file -- confirm there. */
const struct mbfl_convert_vtbl vtbl_wchar_utf7 = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_utf7,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_utf7,
	mbfl_filt_conv_wchar_utf7_flush,
	NULL,
};
87
88
/* Evaluate `statement` (in this file, a call to an output/filter function);
 * if it yields a negative value, make the ENCLOSING function return -1.
 * Note the hidden `return`: any state update written after a CK(...) is
 * skipped when the wrapped call fails. */
#define CK(statement) do { if ((statement) < 0) return (-1); } while (0)
90
/* Translate one Base64 character into its 6-bit value (0-63).
 *
 * For a byte which is not part of the Base64 alphabet, -1 is returned. Because
 * the return type is unsigned, that is the largest `unsigned int` value; the
 * caller in this file stores the result in an `int` and tests it with `< 0`. */
static unsigned int decode_base64_char(unsigned char c)
{
	switch (c) {
	case '+':
		return 62;
	case '/':
		return 63;
	default:
		break;
	}

	if (c >= '0' && c <= '9') {
		return c - '0' + 52;
	}
	if (c >= 'a' && c <= 'z') {
		return c - 'a' + 26;
	}
	if (c >= 'A' && c <= 'Z') {
		return c - 'A';
	}

	return (unsigned int)-1;
}
106
/* Handle one complete UTF-16 code unit which was recovered from a Base64
 * section (helper for mbfl_filt_conv_utf7_wchar).
 *
 * unit   - the 16-bit code unit
 * carry  - bits of the NEXT code unit which arrived in the same Base64
 *          character, already shifted into position; they are stored in the
 *          low part of filter->cache once `unit` has been dealt with
 * filter - the conversion filter; bits 16-27 of filter->cache (mask 0xfff0000)
 *          are non-zero while the 1st half of a surrogate pair is waiting for
 *          its 2nd half
 *
 * Returns 0, or -1 if a CK-wrapped output call reported failure. */
static int mbfl_filt_conv_utf7_wchar_unit(int unit, int carry, mbfl_convert_filter *filter)
{
	int pending_high = filter->cache & 0xfff0000;

	if (unit >= 0xd800 && unit < 0xdc00) {
		/* 1st part of surrogate pair */
		if (pending_high) {
			/* We were waiting for the 2nd part of a surrogate pair.
			 * (The result of this call is not checked.) */
			(*filter->output_function)(MBFL_BAD_INPUT, filter->data);
		}
		/* Stash the high 10 bits above the 16 working bits; adding 0x400000 here
		 * becomes the +0x10000 offset once the value is shifted right by 6 below */
		filter->cache = (((unit & 0x3ff) << 16) + 0x400000) | carry;
	} else if (unit >= 0xdc00 && unit < 0xe000) {
		/* 2nd part of surrogate pair */
		if (pending_high) {
			filter->cache = carry;
			CK((*filter->output_function)((unit & 0x3ff) | (pending_high >> 6), filter->data));
		} else {
			/* No 1st part was pending */
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			filter->cache = carry;
		}
	} else {
		if (pending_high) {
			/* We were waiting for the 2nd part of a surrogate pair.
			 * (The result of this call is not checked.) */
			(*filter->output_function)(MBFL_BAD_INPUT, filter->data);
		}
		filter->cache = carry;
		CK((*filter->output_function)(unit, filter->data));
	}

	return 0;
}

/* Streaming UTF-7 -> wchar filter: consumes one input byte `c` per call and
 * passes decoded codepoints (or MBFL_BAD_INPUT) to filter->output_function.
 *
 * filter->status tracks the position in the input:
 *   0    directly encoded text
 *   1    a '+' was just seen (so "+-" means a literal '+')
 *   2-9  position inside a group of 8 Base64 characters, which carries three
 *        UTF-16 code units; units are completed in states 4, 7 and 9
 * filter->cache accumulates the bits of the code unit being assembled in its
 * low 16 bits, and keeps a pending 1st half of a surrogate pair in bits 16-27.
 *
 * Returns 0, or -1 if a CK-wrapped output call reported failure. */
int mbfl_filt_conv_utf7_wchar(int c, mbfl_convert_filter *filter)
{
	int n = -1;

	if (filter->status) { /* Modified Base64 */
		n = decode_base64_char(c);
		if (n < 0) {
			/* Not a Base64 character: the Base64 section ends here */
			if (filter->cache) {
				/* Either we were expecting the 2nd half of a surrogate pair which
				 * never came, or else the last Base64 data was not padded with zeroes */
				(*filter->output_function)(MBFL_BAD_INPUT, filter->data);
			}
			if (c == '-') {
				if (filter->status == 1) { /* "+-" -> "+" */
					CK((*filter->output_function)('+', filter->data));
				}
			} else if (c >= 0 && c < 0x80) { /* ASCII exclude '-' */
				CK((*filter->output_function)(c, filter->data));
			} else { /* illegal character */
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			}
			filter->cache = filter->status = 0;
			return 0;
		}
	}

	switch (filter->status) {
	/* directly encoded characters */
	case 0:
		if (c == '+') { /* '+' shift character */
			filter->status = 1;
		} else if (c >= 0 && c < 0x80) { /* ASCII */
			CK((*filter->output_function)(c, filter->data));
		} else { /* illegal character */
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	/* decode Modified Base64 */
	case 1:
	case 2:
		filter->cache |= n << 10;
		filter->status = 3;
		break;
	case 3:
		filter->cache |= n << 4;
		filter->status = 4;
		break;
	case 4:
		/* Top 4 bits of `n` finish the 1st unit; low 2 bits start the 2nd */
		filter->status = 5;
		CK(mbfl_filt_conv_utf7_wchar_unit(((n >> 2) & 0xf) | (filter->cache & 0xffff), (n & 0x3) << 14, filter));
		break;

	case 5:
		filter->cache |= n << 8;
		filter->status = 6;
		break;
	case 6:
		filter->cache |= n << 2;
		filter->status = 7;
		break;
	case 7:
		/* Top 2 bits of `n` finish the 2nd unit; low 4 bits start the 3rd */
		filter->status = 8;
		CK(mbfl_filt_conv_utf7_wchar_unit(((n >> 4) & 0x3) | (filter->cache & 0xffff), (n & 0xf) << 12, filter));
		break;

	case 8:
		filter->cache |= n << 6;
		filter->status = 9;
		break;
	case 9:
		/* All 6 bits of `n` finish the 3rd unit; nothing is carried over */
		filter->status = 2;
		CK(mbfl_filt_conv_utf7_wchar_unit(n | (filter->cache & 0xffff), 0, filter));
		break;

		EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
267
/* End-of-input handling for the UTF-7 -> wchar filter.
 * If any bits are still held in filter->cache, the cache is cleared and one
 * MBFL_BAD_INPUT is emitted; then the downstream flush function, if one is
 * set, is invoked. Always returns 0. */
static int mbfl_filt_conv_utf7_wchar_flush(mbfl_convert_filter *filter)
{
	if (filter->cache != 0) {
		/* Leftover data means either the 2nd half of a surrogate pair never
		 * arrived, or the final Base64 data was not zero-padded */
		filter->cache = 0;
		filter->output_function(MBFL_BAD_INPUT, filter->data);
	}

	if (filter->flush_function != NULL) {
		filter->flush_function(filter->data);
	}

	return 0;
}
283
/* Streaming wchar -> UTF-7 filter: consumes one codepoint `c` per call and
 * writes UTF-7 bytes through filter->output_function.
 *
 * Codepoints in the supplementary range are split into a surrogate pair and
 * fed back through filter->filter_function one half at a time; codepoints
 * which match none of the accepted ranges go to mbfl_filt_conv_illegal_output.
 *
 * filter->status: 0 = outside Base64; 1, 2, 3 = inside a Base64 section, with
 * one 16-bit unit held in filter->cache plus 0, 4 or 2 leftover bits above it
 * (respectively) which have not been written yet.
 *
 * Returns 0, or -1 if a CK-wrapped call reported failure. */
int mbfl_filt_conv_wchar_utf7(int c, mbfl_convert_filter *filter)
{
	int held;

	/* 0: must be Base64-encoded
	 * 1: written directly; an open Base64 section must be closed with '-' first
	 * 2: written directly; also terminates an open Base64 section by itself */
	int direct = 0;

	if (c >= 0 && c < 0x80) { /* ASCII */
		if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || c == '\0' || c == '/' || c == '-') {
			direct = 1;
		} else if (c == ' ' || c == '\t' || c == '\r' || c == '\n' || c == '\'' || c == '(' || c == ')' || c == ',' || c == '.' || c == ':' || c == '?') {
			direct = 2;
		}
	} else if (c >= 0 && c < MBFL_WCSPLANE_UCS2MAX) {
		/* Nothing to classify; `direct` stays 0, so Base64 is used */
	} else if (c >= MBFL_WCSPLANE_SUPMIN && c < MBFL_WCSPLANE_UTF32MAX) {
		int high_half = ((c >> 10) - 0x40) | 0xd800;
		int low_half = (c & 0x3ff) | 0xdc00;
		CK(filter->filter_function(high_half, filter));
		CK(filter->filter_function(low_half, filter));
		return 0;
	} else {
		CK(mbfl_filt_conv_illegal_output(c, filter));
		return 0;
	}

	switch (filter->status) {
	case 0:
		if (direct == 0) {
			/* Open a Modified Base64 section and hold the codepoint */
			CK(filter->output_function('+', filter->data));
			filter->status = 1;
			filter->cache = c;
		} else {
			/* directly encode characters */
			CK(filter->output_function(c, filter->data));
		}
		break;

	/* encode Modified Base64 */
	case 1:
		/* 16 bits held */
		held = filter->cache;
		CK(filter->output_function(mbfl_base64_table[(held >> 10) & 0x3f], filter->data));
		CK(filter->output_function(mbfl_base64_table[(held >> 4) & 0x3f], filter->data));
		if (direct == 0) {
			/* Keep the 4 unwritten bits above the new 16-bit unit */
			filter->status = 2;
			filter->cache = ((held & 0xf) << 16) | c;
			break;
		}
		/* Section ends: write the last 4 bits, padded with zeroes */
		CK(filter->output_function(mbfl_base64_table[(held << 2) & 0x3c], filter->data));
		if (direct == 1) {
			CK(filter->output_function('-', filter->data));
		}
		CK(filter->output_function(c, filter->data));
		filter->status = 0;
		break;

	case 2:
		/* 20 bits held */
		held = filter->cache;
		CK(filter->output_function(mbfl_base64_table[(held >> 14) & 0x3f], filter->data));
		CK(filter->output_function(mbfl_base64_table[(held >> 8) & 0x3f], filter->data));
		CK(filter->output_function(mbfl_base64_table[(held >> 2) & 0x3f], filter->data));
		if (direct == 0) {
			/* Keep the 2 unwritten bits above the new 16-bit unit */
			filter->status = 3;
			filter->cache = ((held & 0x3) << 16) | c;
			break;
		}
		/* Section ends: write the last 2 bits, padded with zeroes */
		CK(filter->output_function(mbfl_base64_table[(held << 4) & 0x30], filter->data));
		if (direct == 1) {
			CK(filter->output_function('-', filter->data));
		}
		CK(filter->output_function(c, filter->data));
		filter->status = 0;
		break;

	case 3:
		/* 18 bits held: exactly three Base64 characters, nothing left over */
		held = filter->cache;
		CK(filter->output_function(mbfl_base64_table[(held >> 12) & 0x3f], filter->data));
		CK(filter->output_function(mbfl_base64_table[(held >> 6) & 0x3f], filter->data));
		CK(filter->output_function(mbfl_base64_table[held & 0x3f], filter->data));
		if (direct == 0) {
			filter->status = 1;
			filter->cache = c;
			break;
		}
		if (direct == 1) {
			CK(filter->output_function('-', filter->data));
		}
		CK(filter->output_function(c, filter->data));
		filter->status = 0;
		break;

		EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
375
/* End-of-input handling for the wchar -> UTF-7 filter.
 * The filter state is reset first; then, if a Base64 section was open, the
 * bits still held are written out (zero-padded to a whole Base64 character)
 * followed by the closing '-'. Finally the downstream flush function, if one
 * is set, is invoked.
 *
 * Returns 0, or -1 if a CK-wrapped output call reported failure (in which
 * case the downstream flush function is not called). */
int mbfl_filt_conv_wchar_utf7_flush(mbfl_convert_filter *filter)
{
	const int open_state = filter->status;
	const int held = filter->cache;

	filter->cache = 0;
	filter->status = 0;

	/* flush fragments */
	if (open_state == 1) {
		/* 16 bits held */
		CK(filter->output_function(mbfl_base64_table[(held >> 10) & 0x3f], filter->data));
		CK(filter->output_function(mbfl_base64_table[(held >> 4) & 0x3f], filter->data));
		CK(filter->output_function(mbfl_base64_table[(held << 2) & 0x3c], filter->data));
	} else if (open_state == 2) {
		/* 20 bits held */
		CK(filter->output_function(mbfl_base64_table[(held >> 14) & 0x3f], filter->data));
		CK(filter->output_function(mbfl_base64_table[(held >> 8) & 0x3f], filter->data));
		CK(filter->output_function(mbfl_base64_table[(held >> 2) & 0x3f], filter->data));
		CK(filter->output_function(mbfl_base64_table[(held << 4) & 0x30], filter->data));
	} else if (open_state == 3) {
		/* 18 bits held */
		CK(filter->output_function(mbfl_base64_table[(held >> 12) & 0x3f], filter->data));
		CK(filter->output_function(mbfl_base64_table[(held >> 6) & 0x3f], filter->data));
		CK(filter->output_function(mbfl_base64_table[held & 0x3f], filter->data));
	}

	if (open_state == 1 || open_state == 2 || open_state == 3) {
		/* Close the Base64 section */
		CK(filter->output_function('-', filter->data));
	}

	if (filter->flush_function != NULL) {
		filter->flush_function(filter->data);
	}

	return 0;
}
413
is_base64_end(unsigned char c)414 static inline bool is_base64_end(unsigned char c)
415 {
416 return c >= DASH;
417 }
418
/* Return true for the characters which are allowed to be either encoded by
 * Base64 or directly encoded:
 *   ! " # $ % & * ; < = > @ [ ] ^ _ ` { | }  */
static bool is_optional_direct(unsigned char c)
{
	switch (c) {
	case '!': case '"': case '#': case '$': case '%':
	case '&': case '*': case ';': case '<': case '=':
	case '>': case '@': case '[': case ']': case '^':
	case '_': case '`': case '{': case '|': case '}':
		return true;
	default:
		return false;
	}
}
426
/* Return true for the characters which the encoder in this file lets follow
 * a Base64 section without a closing '-': space, tab, CR, LF and
 *   ' ( ) , . : ?  */
static bool can_end_base64(uint32_t c)
{
	switch (c) {
	case ' ': case '\t': case '\r': case '\n':
	case '\'': case '(': case ')': case ',':
	case '.': case ':': case '?':
		return true;
	default:
		return false;
	}
}
431
decode_base64(unsigned char c)432 static unsigned char decode_base64(unsigned char c)
433 {
434 if (c >= 'A' && c <= 'Z') {
435 return c - 65;
436 } else if (c >= 'a' && c <= 'z') {
437 return c - 71;
438 } else if (c >= '0' && c <= '9') {
439 return c + 4;
440 } else if (c == '+') {
441 return 62;
442 } else if (c == '/') {
443 return 63;
444 } else if (c == '-') {
445 return DASH;
446 } else if (can_end_base64(c) || is_optional_direct(c) || c == '\0') {
447 return DIRECT;
448 } else if (c <= 0x7F) {
449 return ASCII;
450 }
451 return ILLEGAL;
452 }
453
handle_utf16_cp(uint16_t cp,uint32_t * out,uint16_t * surrogate1)454 static uint32_t* handle_utf16_cp(uint16_t cp, uint32_t *out, uint16_t *surrogate1)
455 {
456 retry:
457 if (*surrogate1) {
458 if (cp >= 0xDC00 && cp <= 0xDFFF) {
459 *out++ = ((*surrogate1 & 0x3FF) << 10) + (cp & 0x3FF) + 0x10000;
460 *surrogate1 = 0;
461 } else {
462 *out++ = MBFL_BAD_INPUT;
463 *surrogate1 = 0;
464 goto retry;
465 }
466 } else if (cp >= 0xD800 && cp <= 0xDBFF) {
467 *surrogate1 = cp;
468 } else if (cp >= 0xDC00 && cp <= 0xDFFF) {
469 /* 2nd part of surrogate pair came unexpectedly */
470 *out++ = MBFL_BAD_INPUT;
471 } else {
472 *out++ = cp;
473 }
474 return out;
475 }
476
handle_base64_end(unsigned char n,unsigned char ** p,uint32_t * out,bool * base64,bool abrupt,uint16_t * surrogate1)477 static uint32_t* handle_base64_end(unsigned char n, unsigned char **p, uint32_t *out, bool *base64, bool abrupt, uint16_t *surrogate1)
478 {
479 if (abrupt || *surrogate1) {
480 *out++ = MBFL_BAD_INPUT;
481 *surrogate1 = 0;
482 }
483
484 if (n == ILLEGAL) {
485 *out++ = MBFL_BAD_INPUT;
486 } else if (n == DIRECT || n == ASCII) {
487 (*p)--; /* Unconsume byte */
488 }
489
490 *base64 = false;
491 return out;
492 }
493
/* Buffer-based UTF-7 -> wchar decoder.
 *
 * in, in_len - input cursor and remaining byte count; both are updated on
 *              return to describe the input which has not been consumed
 * buf        - output buffer for decoded wchars
 * bufsize    - capacity of `buf` in wchars; must be at least 5
 * state      - carries decoder state between calls: bit 0 is set while inside
 *              a Base64 section, and the bits above it hold the 1st half of a
 *              surrogate pair which is still waiting for its 2nd half
 *
 * Returns the number of wchars written to `buf`. Invalid input is reported
 * in-band as MBFL_BAD_INPUT. */
static size_t mb_utf7_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
{
	ZEND_ASSERT(bufsize >= 5); /* This function will infinite-loop if called with a tiny output buffer */

	/* Why does this require a minimum output buffer size of 5?
	 * There is one case where one iteration of the main 'while' loop below will emit 5 wchars:
	 * that is if the first half of a surrogate pair is followed by an otherwise valid codepoint which
	 * is not the 2nd half of a surrogate pair, then another valid codepoint, then the Base64-encoded
	 * section ends with a byte which is not a valid Base64 character, AND which also is not in a
	 * position where we would expect the Base64-encoded section to end */

	unsigned char *p = *in, *e = p + *in_len;
	uint32_t *out = buf, *limit = buf + bufsize;

	bool base64 = *state & 1;
	uint16_t surrogate1 = (*state >> 1); /* First half of a surrogate pair which still needs 2nd half */

	while (p < e && out < limit) {
		if (base64) {
			/* Base64 section */
			/* One pass through this branch consumes up to 8 Base64 characters
			 * (three UTF-16 code units); stop early unless the worst-case
			 * output of such a pass still fits */
			if ((limit - out) < 5) {
				break;
			}

			/* 1st code unit: n1 (6 bits), n2 (6 bits), top 4 bits of n3 */
			unsigned char n1 = decode_base64(*p++);
			if (is_base64_end(n1)) {
				/* Ending on a code unit boundary is not an error by itself */
				out = handle_base64_end(n1, &p, out, &base64, false, &surrogate1);
				continue;
			} else if (p == e) {
				out = handle_base64_end(n1, &p, out, &base64, true, &surrogate1);
				continue;
			}
			unsigned char n2 = decode_base64(*p++);
			if (is_base64_end(n2) || p == e) {
				out = handle_base64_end(n2, &p, out, &base64, true, &surrogate1);
				continue;
			}
			unsigned char n3 = decode_base64(*p++);
			if (is_base64_end(n3)) {
				out = handle_base64_end(n3, &p, out, &base64, true, &surrogate1);
				continue;
			}
			out = handle_utf16_cp((n1 << 10) | (n2 << 4) | ((n3 & 0x3C) >> 2), out, &surrogate1);
			if (p == e) {
				/* It is an error if trailing padding bits are not zeroes or if we were
				 * expecting the 2nd part of a surrogate pair when Base64 section ends */
				if ((n3 & 0x3) || surrogate1)
					*out++ = MBFL_BAD_INPUT;
				break;
			}

			/* 2nd code unit: low 2 bits of n3, n4, n5, top 2 bits of n6
			 * (bits shifted past bit 15 are dropped by the uint16_t parameter) */
			unsigned char n4 = decode_base64(*p++);
			if (is_base64_end(n4)) {
				/* Leftover non-zero bits from n3 make this ending abrupt */
				out = handle_base64_end(n4, &p, out, &base64, n3 & 0x3, &surrogate1);
				continue;
			} else if (p == e) {
				out = handle_base64_end(n4, &p, out, &base64, true, &surrogate1);
				continue;
			}
			unsigned char n5 = decode_base64(*p++);
			if (is_base64_end(n5) || p == e) {
				out = handle_base64_end(n5, &p, out, &base64, true, &surrogate1);
				continue;
			}
			unsigned char n6 = decode_base64(*p++);
			if (is_base64_end(n6)) {
				out = handle_base64_end(n6, &p, out, &base64, true, &surrogate1);
				continue;
			}
			out = handle_utf16_cp((n3 << 14) | (n4 << 8) | (n5 << 2) | ((n6 & 0x30) >> 4), out, &surrogate1);
			if (p == e) {
				/* Same end-of-input check as after the 1st code unit */
				if ((n6 & 0xF) || surrogate1)
					*out++ = MBFL_BAD_INPUT;
				break;
			}

			/* 3rd code unit: low 4 bits of n6, n7, n8 */
			unsigned char n7 = decode_base64(*p++);
			if (is_base64_end(n7)) {
				/* Leftover non-zero bits from n6 make this ending abrupt */
				out = handle_base64_end(n7, &p, out, &base64, n6 & 0xF, &surrogate1);
				continue;
			} else if (p == e) {
				out = handle_base64_end(n7, &p, out, &base64, true, &surrogate1);
				continue;
			}
			unsigned char n8 = decode_base64(*p++);
			if (is_base64_end(n8)) {
				out = handle_base64_end(n8, &p, out, &base64, true, &surrogate1);
				continue;
			}
			out = handle_utf16_cp((n6 << 12) | (n7 << 6) | n8, out, &surrogate1);
		} else {
			/* ASCII text section */
			unsigned char c = *p++;

			if (c == '+') {
				if (p < e) {
					if (*p == '-') {
						/* "+-" is a literal '+' */
						*out++ = '+';
						p++;
					} else {
						base64 = true;
					}
				}
				/* If a + comes at the end of the input string... do nothing about it */
			} else if (c <= 0x7F) {
				*out++ = c;
			} else {
				*out++ = MBFL_BAD_INPUT;
			}
		}
	}

	/* Pack the state for the next call and report how much input is left */
	*state = (surrogate1 << 1) | base64;
	*in_len = e - p;
	*in = p;
	return out - buf;
}
611
/* Return true if the encoder writes codepoint `c` as-is rather than through
 * Base64: ASCII letters and digits, NUL, '/', '-', and every character
 * accepted by can_end_base64(). */
static bool should_direct_encode(uint32_t c)
{
	if (c >= 'A' && c <= 'Z') {
		return true;
	}
	if (c >= 'a' && c <= 'z') {
		return true;
	}
	if (c >= '0' && c <= '9') {
		return true;
	}
	if (c == '\0' || c == '/' || c == '-') {
		return true;
	}
	return can_end_base64(c);
}
616
/* Pack / unpack the encoder's local variables into `buf->state`:
 *   bit 0     - `base64` flag
 *   bits 1-3  - `nbits`
 *   bits 4+   - `cache`
 * Both macros expect locals named `base64`, `nbits`, `cache` and `buf` to be
 * in scope. RESTORE_CONVERSION_STATE expands to three separate statements, so
 * it must not be used as the unbraced body of an `if`/`else`/loop. */
#define SAVE_CONVERSION_STATE() buf->state = (cache << 4) | (nbits << 1) | base64
#define RESTORE_CONVERSION_STATE() base64 = (buf->state & 1); nbits = (buf->state >> 1) & 0x7; cache = (buf->state >> 4)
619
/* Buffer-based wchar -> UTF-7 encoder.
 *
 * in, len - input codepoints and their count
 * buf     - output conversion buffer; `buf->state` carries the encoder state
 *           (see SAVE_CONVERSION_STATE) from one call to the next
 * end     - true if this is the final chunk: leftover bits are written out
 *           and an open Base64 section is closed with '-'. Otherwise the
 *           state is saved into `buf->state`.
 *
 * NOTE(review): the MB_CONVERT_* macros and mb_convert_buf_add are defined
 * outside this file; the remarks about them below follow the existing
 * comments and the way they are used here -- confirm against their definitions. */
static void mb_wchar_to_utf7(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);

	/* Make enough space such that if the input string is all ASCII (not including '+'),
	 * we can copy it to the output buffer without checking for available space.
	 * However, if we find anything which is not plain ASCII, additional checks for
	 * output buffer space will be needed. */
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	bool base64;
	unsigned char nbits, cache; /* `nbits` is the number of cached bits; either 0, 2, or 4 */
	RESTORE_CONVERSION_STATE();

	while (len--) {
		uint32_t w = *in++;
		if (base64) {
			if (should_direct_encode(w)) {
				/* End of Base64 section. Drain buffered bits (if any), close Base64 section */
				base64 = false;
				in--; len++; /* Unconsume codepoint; it will be handled by 'ASCII section' code below */
				/* Room for the remaining input plus the two bytes which may be added right here */
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
				if (nbits) {
					/* Left-align the cached bits in a 6-bit group, padding with zeroes */
					out = mb_convert_buf_add(out, mbfl_base64_table[(cache << (6 - nbits)) & 0x3F]);
				}
				nbits = cache = 0;
				if (!can_end_base64(w)) {
					out = mb_convert_buf_add(out, '-');
				}
			} else if (w >= MBFL_WCSPLANE_UTF32MAX) {
				/* Make recursive call to add an error marker character */
				SAVE_CONVERSION_STATE();
				MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_utf7);
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
				/* Pick up whatever state the recursive call left in buf->state */
				RESTORE_CONVERSION_STATE();
			} else {
				/* Encode codepoint, preceded by any cached bits, as Base64
				 * Make enough space in the output buffer to hold both any bytes that
				 * we emit right here, plus any finishing byte which might need to
				 * be emitted if the input string ends abruptly */
				uint64_t bits;
				if (w >= MBFL_WCSPLANE_SUPMIN) {
					/* Must use surrogate pair */
					MB_CONVERT_BUF_ENSURE(buf, out, limit, 7);
					w -= 0x10000;
					/* Cached bits on top, then both surrogate halves: 0xD800 | high 10 bits,
					 * 0xDC00 | low 10 bits */
					bits = ((uint64_t)cache << 32) | 0xD800DC00L | ((w & 0xFFC00) << 6) | (w & 0x3FF);
					nbits += 32;
				} else {
					MB_CONVERT_BUF_ENSURE(buf, out, limit, 4);
					bits = (cache << 16) | w;
					nbits += 16;
				}

				/* Emit every complete 6-bit group, most significant first */
				while (nbits >= 6) {
					out = mb_convert_buf_add(out, mbfl_base64_table[(bits >> (nbits - 6)) & 0x3F]);
					nbits -= 6;
				}
				/* Only the low `nbits` bits are meaningful from here on; the
				 * assignment truncates `bits` to the width of `cache` */
				cache = bits;
			}
		} else {
			/* ASCII section */
			if (should_direct_encode(w)) {
				out = mb_convert_buf_add(out, w);
			} else if (w >= MBFL_WCSPLANE_UTF32MAX) {
				/* Not in a Base64 section and no bits cached, so the state to hand
				 * to the recursive call is simply zero */
				buf->state = 0;
				MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_utf7);
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
				RESTORE_CONVERSION_STATE();
			} else {
				out = mb_convert_buf_add(out, '+');
				base64 = true;
				in--; len++; /* Unconsume codepoint; it will be handled by Base64 code above */
			}
		}
	}

	if (end) {
		if (nbits) {
			/* Space for this finishing byte was reserved when the bits were cached */
			out = mb_convert_buf_add(out, mbfl_base64_table[(cache << (6 - nbits)) & 0x3F]);
		}
		if (base64) {
			MB_CONVERT_BUF_ENSURE(buf, out, limit, 1);
			out = mb_convert_buf_add(out, '-');
		}
	} else {
		SAVE_CONVERSION_STATE();
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
711
/* Check whether UTF-16 code unit `cp` is acceptable at the current position.
 *
 * is_surrogate - true if the previous code unit was the 1st half of a
 *                surrogate pair
 *
 * A 2nd-half surrogate (0xDC00-0xDFFF) is valid exactly when a 1st half is
 * pending; every other code unit is valid exactly when none is pending. */
static bool is_utf16_cp_valid(uint16_t cp, bool is_surrogate)
{
	const bool is_second_half = (cp >= 0xDC00 && cp <= 0xDFFF);

	return is_surrogate ? is_second_half : !is_second_half;
}
721
/* Return true if byte `c` may appear as-is outside of a Base64 section:
 * NUL, any character accepted by is_optional_direct(), or any character
 * accepted by should_direct_encode(). */
static bool can_encode_directly(unsigned char c)
{
	if (c == '\0') {
		return true;
	}
	if (is_optional_direct(c)) {
		return true;
	}
	return should_direct_encode(c);
}
726
/* Check whether the `in_len` bytes at `in` form a valid UTF-7 string.
 *
 * The scan mirrors the structure of the decoder: outside Base64, every byte
 * must be '+' or accepted by can_encode_directly(); inside Base64, characters
 * are taken in groups of up to 8 (three UTF-16 code units), and the section
 * may only end on a code unit boundary, with zero padding bits and no
 * surrogate pair left half-finished.
 *
 * Returns true if the input is valid, false otherwise.
 *
 * NOTE(review): is_base64_end_valid() and has_surrogate() are defined outside
 * this file; remarks about them below are based on how they are used here. */
static bool mb_check_utf7(unsigned char *in, size_t in_len)
{
	unsigned char *p = in, *e = p + in_len;
	bool base64 = false;
	bool is_surrogate = false; /* Updated from has_surrogate() after each code unit */

	while (p < e) {
		if (base64) {
			/* 1st code unit: n1, n2, top 4 bits of n3 */
			unsigned char n1 = decode_base64(*p++);
			if (is_base64_end(n1)) {
				if (!is_base64_end_valid(n1, false, is_surrogate)) {
					return false;
				}
				base64 = false;
				continue;
			} else if (p == e) {
				/* Input ends in the middle of a code unit */
				return false;
			}
			unsigned char n2 = decode_base64(*p++);
			if (is_base64_end(n2) || p == e) {
				return false;
			}
			unsigned char n3 = decode_base64(*p++);
			if (is_base64_end(n3)) {
				return false;
			}
			uint16_t cp1 = (n1 << 10) | (n2 << 4) | ((n3 & 0x3C) >> 2);
			if (!is_utf16_cp_valid(cp1, is_surrogate)) {
				return false;
			}
			is_surrogate = has_surrogate(cp1, is_surrogate);
			if (p == e) {
				/* It is an error if trailing padding bits are not zeroes or if we were
				 * expecting the 2nd part of a surrogate pair when Base64 section ends */
				return !((n3 & 0x3) || is_surrogate);
			}

			/* 2nd code unit: low 2 bits of n3, n4, n5, top 2 bits of n6 */
			unsigned char n4 = decode_base64(*p++);
			if (is_base64_end(n4)) {
				/* Non-zero leftover bits from n3 are passed as the 2nd argument */
				if (!is_base64_end_valid(n4, n3 & 0x3, is_surrogate)) {
					return false;
				}
				base64 = false;
				continue;
			} else if (p == e) {
				return false;
			}
			unsigned char n5 = decode_base64(*p++);
			if (is_base64_end(n5) || p == e) {
				return false;
			}
			unsigned char n6 = decode_base64(*p++);
			if (is_base64_end(n6)) {
				return false;
			}
			uint16_t cp2 = (n3 << 14) | (n4 << 8) | (n5 << 2) | ((n6 & 0x30) >> 4);
			if (!is_utf16_cp_valid(cp2, is_surrogate)) {
				return false;
			}
			is_surrogate = has_surrogate(cp2, is_surrogate);
			if (p == e) {
				/* Same end-of-input check as after the 1st code unit */
				return !((n6 & 0xF) || is_surrogate);
			}

			/* 3rd code unit: low 4 bits of n6, n7, n8 */
			unsigned char n7 = decode_base64(*p++);
			if (is_base64_end(n7)) {
				/* Non-zero leftover bits from n6 are passed as the 2nd argument */
				if (!is_base64_end_valid(n7, n6 & 0xF, is_surrogate)) {
					return false;
				}
				base64 = false;
				continue;
			} else if (p == e) {
				return false;
			}
			unsigned char n8 = decode_base64(*p++);
			if (is_base64_end(n8)) {
				return false;
			}
			uint16_t cp3 = (n6 << 12) | (n7 << 6) | n8;
			if (!is_utf16_cp_valid(cp3, is_surrogate)) {
				return false;
			}
			is_surrogate = has_surrogate(cp3, is_surrogate);
		} else {
			/* ASCII text section */
			unsigned char c = *p++;

			if (c == '+') {
				if (p == e) {
					/* NOTE(review): this store has no effect, since the function
					 * returns on the next line */
					base64 = true;
					return !is_surrogate;
				}
				/* Peek at the byte after '+' without consuming it */
				unsigned char n = decode_base64(*p);
				if (n == DASH) {
					/* "+-": consume the '-' as well */
					p++;
				} else if (n > DASH) {
					/* If a "+" character followed immediately by any character other than base64 or "-" */
					return false;
				} else {
					base64 = true;
				}
			} else if (can_encode_directly(c)) {
				continue;
			} else {
				return false;
			}
		}
	}
	return !is_surrogate;
}
837