1 /*
2  * "streamable kanji code filter and converter"
3  * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
4  *
5  * LICENSE NOTICES
6  *
7  * This file is part of "streamable kanji code filter and converter",
8  * which is distributed under the terms of GNU Lesser General Public
9  * License (version 2) as published by the Free Software Foundation.
10  *
11  * This software is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with "streamable kanji code filter and converter";
18  * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
19  * Suite 330, Boston, MA  02111-1307  USA
20  *
21  * The author of this file:
22  *
23  */
24 /*
25  * The source code included in this file was separated from mbfilter.c
26  * by moriyoshi koizumi <moriyoshi@php.net> on 4 dec 2002.
27  *
28  */
29 
30 #include "mbfilter.h"
31 #include "mbfilter_utf16.h"
32 
/* Forward declarations of the file-local helpers defined further down: the
 * flush handler used by the UTF-16 -> wchar filter tables, and the
 * buffer-based converters referenced by the encoding descriptors below. */
33 static int mbfl_filt_conv_utf16_wchar_flush(mbfl_convert_filter *filter);
34 static size_t mb_utf16_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
35 static size_t mb_utf16be_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
36 static void mb_wchar_to_utf16be(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
37 static size_t mb_utf16le_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
38 static void mb_wchar_to_utf16le(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
39 
/* NULL-terminated list of alternative names for the generic UTF-16 encoding;
 * referenced from the mbfl_encoding_utf16 descriptor. */
40 static const char *mbfl_encoding_utf16_aliases[] = {"utf16", NULL};
41 
/* Descriptor for the generic "UTF-16" encoding.
 * Decoding goes through mb_utf16_to_wchar / vtbl_utf16_wchar, which pick the
 * byte order from a leading BOM and otherwise treat the input as big-endian.
 * Encoding reuses the big-endian writers (mb_wchar_to_utf16be and
 * vtbl_wchar_utf16).
 * NOTE(review): the initializers are positional; the meaning of each slot
 * (including the NULL and 0 entries) comes from the mbfl_encoding declaration,
 * which is not visible here -- confirm against mbfilter.h. */
42 const mbfl_encoding mbfl_encoding_utf16 = {
43 	mbfl_no_encoding_utf16,
44 	"UTF-16",
45 	"UTF-16",
46 	mbfl_encoding_utf16_aliases,
47 	NULL,
48 	0,
49 	&vtbl_utf16_wchar,
50 	&vtbl_wchar_utf16,
51 	mb_utf16_to_wchar,
52 	mb_wchar_to_utf16be,
53 	NULL
54 };
55 
/* Descriptor for "UTF-16BE" (fixed big-endian byte order, no alias list).
 * Uses vtbl_utf16be_wchar / vtbl_wchar_utf16be for the per-byte filters and
 * mb_utf16be_to_wchar / mb_wchar_to_utf16be for the buffer-based converters.
 * NOTE(review): the initializers are positional; slot meanings come from the
 * mbfl_encoding declaration, which is not visible here. */
56 const mbfl_encoding mbfl_encoding_utf16be = {
57 	mbfl_no_encoding_utf16be,
58 	"UTF-16BE",
59 	"UTF-16BE",
60 	NULL,
61 	NULL,
62 	0,
63 	&vtbl_utf16be_wchar,
64 	&vtbl_wchar_utf16be,
65 	mb_utf16be_to_wchar,
66 	mb_wchar_to_utf16be,
67 	NULL
68 };
69 
/* Descriptor for "UTF-16LE" (fixed little-endian byte order, no alias list).
 * Uses vtbl_utf16le_wchar / vtbl_wchar_utf16le for the per-byte filters and
 * mb_utf16le_to_wchar / mb_wchar_to_utf16le for the buffer-based converters.
 * NOTE(review): the initializers are positional; slot meanings come from the
 * mbfl_encoding declaration, which is not visible here. */
70 const mbfl_encoding mbfl_encoding_utf16le = {
71 	mbfl_no_encoding_utf16le,
72 	"UTF-16LE",
73 	"UTF-16LE",
74 	NULL,
75 	NULL,
76 	0,
77 	&vtbl_utf16le_wchar,
78 	&vtbl_wchar_utf16le,
79 	mb_utf16le_to_wchar,
80 	mb_wchar_to_utf16le,
81 	NULL
82 };
83 
/* Filter table for UTF-16 (byte order auto-detected) -> wchar.
 * Lists mbfl_filt_conv_utf16_wchar as the conversion routine and
 * mbfl_filt_conv_utf16_wchar_flush as the flush routine, alongside the shared
 * mbfl_filt_conv_common_ctor.
 * NOTE(review): entries are positional; the NULL slots are presumably unused
 * optional hooks -- confirm against struct mbfl_convert_vtbl. */
84 const struct mbfl_convert_vtbl vtbl_utf16_wchar = {
85 	mbfl_no_encoding_utf16,
86 	mbfl_no_encoding_wchar,
87 	mbfl_filt_conv_common_ctor,
88 	NULL,
89 	mbfl_filt_conv_utf16_wchar,
90 	mbfl_filt_conv_utf16_wchar_flush,
91 	NULL,
92 };
93 
/* Filter table for wchar -> UTF-16. Output is produced by the big-endian
 * routine mbfl_filt_conv_wchar_utf16be; flushing uses the shared
 * mbfl_filt_conv_common_flush.
 * NOTE(review): entries are positional; confirm slot meanings against
 * struct mbfl_convert_vtbl. */
94 const struct mbfl_convert_vtbl vtbl_wchar_utf16 = {
95 	mbfl_no_encoding_wchar,
96 	mbfl_no_encoding_utf16,
97 	mbfl_filt_conv_common_ctor,
98 	NULL,
99 	mbfl_filt_conv_wchar_utf16be,
100 	mbfl_filt_conv_common_flush,
101 	NULL,
102 };
103 
/* Filter table for UTF-16BE -> wchar. Lists mbfl_filt_conv_utf16be_wchar as
 * the conversion routine and mbfl_filt_conv_utf16_wchar_flush (shared by all
 * three UTF-16 decoders in this file) as the flush routine.
 * NOTE(review): entries are positional; confirm slot meanings against
 * struct mbfl_convert_vtbl. */
104 const struct mbfl_convert_vtbl vtbl_utf16be_wchar = {
105 	mbfl_no_encoding_utf16be,
106 	mbfl_no_encoding_wchar,
107 	mbfl_filt_conv_common_ctor,
108 	NULL,
109 	mbfl_filt_conv_utf16be_wchar,
110 	mbfl_filt_conv_utf16_wchar_flush,
111 	NULL,
112 };
113 
/* Filter table for wchar -> UTF-16BE. Lists mbfl_filt_conv_wchar_utf16be as
 * the conversion routine and the shared mbfl_filt_conv_common_flush as the
 * flush routine.
 * NOTE(review): entries are positional; confirm slot meanings against
 * struct mbfl_convert_vtbl. */
114 const struct mbfl_convert_vtbl vtbl_wchar_utf16be = {
115 	mbfl_no_encoding_wchar,
116 	mbfl_no_encoding_utf16be,
117 	mbfl_filt_conv_common_ctor,
118 	NULL,
119 	mbfl_filt_conv_wchar_utf16be,
120 	mbfl_filt_conv_common_flush,
121 	NULL,
122 };
123 
/* Filter table for UTF-16LE -> wchar. Lists mbfl_filt_conv_utf16le_wchar as
 * the conversion routine and mbfl_filt_conv_utf16_wchar_flush (shared by all
 * three UTF-16 decoders in this file) as the flush routine.
 * NOTE(review): entries are positional; confirm slot meanings against
 * struct mbfl_convert_vtbl. */
124 const struct mbfl_convert_vtbl vtbl_utf16le_wchar = {
125 	mbfl_no_encoding_utf16le,
126 	mbfl_no_encoding_wchar,
127 	mbfl_filt_conv_common_ctor,
128 	NULL,
129 	mbfl_filt_conv_utf16le_wchar,
130 	mbfl_filt_conv_utf16_wchar_flush,
131 	NULL,
132 };
133 
/* Filter table for wchar -> UTF-16LE. Lists mbfl_filt_conv_wchar_utf16le as
 * the conversion routine and the shared mbfl_filt_conv_common_flush as the
 * flush routine.
 * NOTE(review): entries are positional; confirm slot meanings against
 * struct mbfl_convert_vtbl. */
134 const struct mbfl_convert_vtbl vtbl_wchar_utf16le = {
135 	mbfl_no_encoding_wchar,
136 	mbfl_no_encoding_utf16le,
137 	mbfl_filt_conv_common_ctor,
138 	NULL,
139 	mbfl_filt_conv_wchar_utf16le,
140 	mbfl_filt_conv_common_flush,
141 	NULL,
142 };
143 
/* Evaluate `statement`; if the result is negative, make the *enclosing
 * function* return -1 immediately. Only usable inside functions returning int. */
144 #define CK(statement)	do { if ((statement) < 0) return (-1); } while (0)
145 
/*
 * Per-byte decoder for "UTF-16" input whose byte order is not yet known.
 *
 * c      - next input byte (only the low 8 bits are used)
 * filter - conversion filter holding the decoder state
 *
 * Returns 0 on success, or -1 if the output function reports failure.
 *
 * The first two bytes are assembled as a big-endian code unit. A value of
 * 0xFFFE is a byte-swapped BOM, so decoding is handed over to the
 * little-endian routine; anything else hands over to the big-endian routine.
 * In both cases filter->filter_function is replaced, and the first code unit
 * is processed here (a BOM of either kind is swallowed, not emitted).
 */
int mbfl_filt_conv_utf16_wchar(int c, mbfl_convert_filter *filter)
{
	int unit;

	if (filter->status == 0) {
		/* First byte of the leading code unit: remember it and wait for its partner */
		filter->cache = c & 0xFF;
		filter->status = 1;
		return 0;
	}

	unit = (filter->cache << 8) | (c & 0xFF);
	filter->status = 0;
	filter->cache = 0;

	if (unit == 0xFFFE) {
		/* Little-endian BOM: all following bytes go to the LE decoder */
		filter->filter_function = mbfl_filt_conv_utf16le_wchar;
		return 0;
	}

	/* No little-endian BOM, so the input is taken to be big-endian from here on */
	filter->filter_function = mbfl_filt_conv_utf16be_wchar;

	if (unit >= 0xD800 && unit <= 0xDBFF) {
		/* High surrogate: keep its 10 payload bits and leave the filter in the
		 * state where the BE decoder expects the low surrogate next */
		filter->cache = unit & 0x3FF;
		filter->status = 2;
	} else if (unit >= 0xDC00 && unit <= 0xDFFF) {
		/* A low surrogate cannot start a pair */
		CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
	} else if (unit != 0xFEFF) {
		/* Ordinary code unit; a big-endian BOM (0xFEFF) is dropped silently */
		CK((*filter->output_function)(unit, filter->data));
	}

	return 0;
}
176 
/*
 * Per-byte decoder for big-endian UTF-16.
 *
 * c      - next input byte (only the low 8 bits are used)
 * filter - conversion filter; filter->status selects the state and
 *          filter->cache carries partial data between calls
 *
 * Returns 0 on success, or -1 if the output function reports failure.
 *
 * States:
 *   0  waiting for the high byte of a code unit
 *   1  waiting for the low byte of a code unit
 *   2  high surrogate seen (10 payload bits in cache); waiting for the
 *      high byte of the next unit
 *   3  waiting for the low byte of the unit following a high surrogate
 */
int mbfl_filt_conv_utf16be_wchar(int c, mbfl_convert_filter *filter)
{
	const int byte = c & 0xFF;

	if (filter->status == 0) {
		filter->cache = byte;
		filter->status = 1;
	} else if (filter->status == 1) {
		int unit = (filter->cache << 8) | byte;

		if (unit >= 0xD800 && unit <= 0xDBFF) {
			/* High surrogate: hold on to its 10 payload bits */
			filter->cache = unit & 0x3FF;
			filter->status = 2;
		} else {
			filter->status = 0;
			if (unit >= 0xDC00 && unit <= 0xDFFF) {
				/* Low surrogate without a preceding high surrogate */
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			} else {
				CK((*filter->output_function)(unit, filter->data));
			}
		}
	} else if (filter->status == 2) {
		/* Push the new byte in below the saved payload bits */
		filter->cache = (filter->cache << 8) | byte;
		filter->status = 3;
	} else if (filter->status == 3) {
		int unit = ((filter->cache & 0xFF) << 8) | byte;

		if (unit >= 0xD800 && unit <= 0xDBFF) {
			/* A second high surrogate: the earlier one is bad, this one
			 * becomes the pending high surrogate */
			filter->cache = unit & 0x3FF;
			filter->status = 2;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		} else if (unit >= 0xDC00 && unit <= 0xDFFF) {
			/* Valid pair. The high payload sits in bits 8..17 of cache, so
			 * shifting by 2 moves it to bits 10..19 of the codepoint */
			int codepoint = ((filter->cache & 0x3FF00) << 2) + (unit & 0x3FF) + 0x10000;

			filter->status = 0;
			CK((*filter->output_function)(codepoint, filter->data));
		} else {
			/* High surrogate followed by an ordinary unit: report the error,
			 * then pass the ordinary unit through */
			filter->status = 0;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			CK((*filter->output_function)(unit, filter->data));
		}
	}

	return 0;
}
227 
/*
 * Per-codepoint encoder: write one wchar as big-endian UTF-16 bytes.
 *
 * c      - codepoint to encode
 * filter - conversion filter; bytes are emitted one at a time through
 *          filter->output_function
 *
 * Returns 0 on success, or -1 as soon as an output call reports failure.
 *
 * Codepoints in [0, MBFL_WCSPLANE_UCS2MAX) are written as a single 16-bit
 * unit. Codepoints in [MBFL_WCSPLANE_SUPMIN, MBFL_WCSPLANE_UTF32MAX) are
 * written as a surrogate pair. Every other value is passed to
 * mbfl_filt_conv_illegal_output.
 */
int mbfl_filt_conv_wchar_utf16be(int c, mbfl_convert_filter *filter)
{
	if (c >= 0 && c < MBFL_WCSPLANE_UCS2MAX) {
		CK((*filter->output_function)((c >> 8) & 0xff, filter->data));
		CK((*filter->output_function)(c & 0xff, filter->data));
	} else if (c >= MBFL_WCSPLANE_SUPMIN && c < MBFL_WCSPLANE_UTF32MAX) {
		/* The upper bound is the same one mb_wchar_to_utf16be uses.
		 * `(c >> 10) - 0x40` only fits in the 10 payload bits of a high
		 * surrogate while c is below 0x110000; a larger value would spill
		 * into bit 10 and turn the leading unit into a low surrogate.
		 * NOTE(review): assumes MBFL_WCSPLANE_UTF32MAX is 0x110000 -- confirm. */
		int high = ((c >> 10) - 0x40) | 0xd800;
		int low = (c & 0x3ff) | 0xdc00;

		CK((*filter->output_function)((high >> 8) & 0xff, filter->data));
		CK((*filter->output_function)(high & 0xff, filter->data));
		CK((*filter->output_function)((low >> 8) & 0xff, filter->data));
		CK((*filter->output_function)(low & 0xff, filter->data));
	} else {
		/* Negative or out-of-range value: not representable in UTF-16 */
		CK(mbfl_filt_conv_illegal_output(c, filter));
	}

	return 0;
}
248 
/*
 * Per-byte decoder for little-endian UTF-16.
 *
 * c      - next input byte (only the low 8 bits are used)
 * filter - conversion filter; filter->status selects the state and
 *          filter->cache carries partial data between calls
 *
 * Returns 0 on success, or -1 if the output function reports failure.
 *
 * States:
 *   0  waiting for the low byte of a code unit
 *   1  waiting for the high byte of a code unit
 *   2  high surrogate seen (10 payload bits in cache); waiting for the
 *      low byte of the next unit
 *   3  waiting for the high byte of the unit following a high surrogate
 */
int mbfl_filt_conv_utf16le_wchar(int c, mbfl_convert_filter *filter)
{
	const int byte = c & 0xff;

	if (filter->status == 0) {
		filter->cache = byte;
		filter->status = 1;
	} else if (filter->status == 1) {
		/* The top six bits of the high byte identify surrogates */
		const int tag = byte & 0xfc;

		if (tag == 0xd8) {
			/* High surrogate: add its two remaining payload bits above the
			 * low byte already cached, giving 10 payload bits in total */
			filter->cache += (byte & 0x3) << 8;
			filter->status = 2;
		} else {
			filter->status = 0;
			if (tag == 0xdc) {
				/* Low surrogate without a preceding high surrogate */
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			} else {
				CK((*filter->output_function)(filter->cache + (byte << 8), filter->data));
			}
		}
	} else if (filter->status == 2) {
		/* Move the high payload to bits 10..19 and park the new low byte in bits 0..7 */
		filter->cache = (filter->cache << 10) + byte;
		filter->status = 3;
	} else if (filter->status == 3) {
		int unit = (filter->cache & 0xFF) | (byte << 8);

		if (unit >= 0xD800 && unit <= 0xDBFF) {
			/* Expected a low surrogate but got another high one: the earlier
			 * one is bad, this one becomes the pending high surrogate */
			filter->cache = unit & 0x3FF;
			filter->status = 2;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		} else if (unit >= 0xDC00 && unit <= 0xDFFF) {
			/* Valid pair: cache already holds the high payload plus the low
			 * eight bits; add the last two payload bits and the plane offset */
			int codepoint = filter->cache + ((byte & 0x3) << 8) + 0x10000;

			filter->status = 0;
			CK((*filter->output_function)(codepoint, filter->data));
		} else {
			/* High surrogate followed by an ordinary unit: report the error,
			 * then pass the ordinary unit through */
			filter->status = 0;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			CK((*filter->output_function)(unit, filter->data));
		}
	}

	return 0;
}
303 
/*
 * Per-codepoint encoder: write one wchar as little-endian UTF-16 bytes.
 *
 * c      - codepoint to encode
 * filter - conversion filter; bytes are emitted one at a time through
 *          filter->output_function
 *
 * Returns 0 on success, or -1 as soon as an output call reports failure.
 *
 * Codepoints in [0, MBFL_WCSPLANE_UCS2MAX) are written as a single 16-bit
 * unit. Codepoints in [MBFL_WCSPLANE_SUPMIN, MBFL_WCSPLANE_UTF32MAX) are
 * written as a surrogate pair. Every other value is passed to
 * mbfl_filt_conv_illegal_output.
 */
int mbfl_filt_conv_wchar_utf16le(int c, mbfl_convert_filter *filter)
{
	if (c >= 0 && c < MBFL_WCSPLANE_UCS2MAX) {
		CK((*filter->output_function)(c & 0xff, filter->data));
		CK((*filter->output_function)((c >> 8) & 0xff, filter->data));
	} else if (c >= MBFL_WCSPLANE_SUPMIN && c < MBFL_WCSPLANE_UTF32MAX) {
		/* The upper bound is the same one mb_wchar_to_utf16le uses.
		 * `(c >> 10) - 0x40` only fits in the 10 payload bits of a high
		 * surrogate while c is below 0x110000; a larger value would spill
		 * into bit 10 and turn the leading unit into a low surrogate.
		 * NOTE(review): assumes MBFL_WCSPLANE_UTF32MAX is 0x110000 -- confirm. */
		int high = ((c >> 10) - 0x40) | 0xd800;
		int low = (c & 0x3ff) | 0xdc00;

		CK((*filter->output_function)(high & 0xff, filter->data));
		CK((*filter->output_function)((high >> 8) & 0xff, filter->data));
		CK((*filter->output_function)(low & 0xff, filter->data));
		CK((*filter->output_function)((low >> 8) & 0xff, filter->data));
	} else {
		/* Negative or out-of-range value: not representable in UTF-16 */
		CK(mbfl_filt_conv_illegal_output(c, filter));
	}

	return 0;
}
324 
/*
 * Flush handler shared by the three UTF-16 -> wchar filters.
 *
 * A non-zero status at flush time means the input stopped in the middle of a
 * code unit or surrogate pair; this is reported as one MBFL_BAD_INPUT and the
 * state is reset. The flush is then passed on via filter->flush_function if
 * one is set.
 *
 * Returns 0 on success, or -1 if the output function reports failure (in
 * which case the flush is not passed on).
 */
static int mbfl_filt_conv_utf16_wchar_flush(mbfl_convert_filter *filter)
{
	const int truncated = (filter->status != 0);

	if (truncated) {
		filter->status = 0;
		CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
	}

	if (filter->flush_function != NULL) {
		(*filter->flush_function)(filter->data);
	}

	return 0;
}
339 
/* Values stored in `*state` by mb_utf16_to_wchar once the byte order of the
 * input has been decided; any other value means it has not been decided yet. */
340 #define DETECTED_BE 1
341 #define DETECTED_LE 2
342 
mb_utf16_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)343 static size_t mb_utf16_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
344 {
345 	if (*state == DETECTED_BE) {
346 		return mb_utf16be_to_wchar(in, in_len, buf, bufsize, NULL);
347 	} else if (*state == DETECTED_LE) {
348 		return mb_utf16le_to_wchar(in, in_len, buf, bufsize, NULL);
349 	} else if (*in_len >= 2) {
350 		unsigned char *p = *in;
351 		unsigned char c1 = *p++;
352 		unsigned char c2 = *p++;
353 		uint16_t n = (c1 << 8) | c2;
354 
355 		if (n == 0xFFFE) {
356 			/* Little-endian BOM */
357 			*in = p;
358 			*in_len -= 2;
359 			*state = DETECTED_LE;
360 			return mb_utf16le_to_wchar(in, in_len, buf, bufsize, NULL);
361 		} if (n == 0xFEFF) {
362 			/* Big-endian BOM; don't send to output */
363 			*in = p;
364 			*in_len -= 2;
365 		}
366 	}
367 
368 	*state = DETECTED_BE;
369 	return mb_utf16be_to_wchar(in, in_len, buf, bufsize, NULL);
370 }
371 
mb_utf16be_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)372 static size_t mb_utf16be_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
373 {
374 	/* We only want to read 16-bit words out of `str`; any trailing byte will be handled at the end */
375 	unsigned char *p = *in, *e = p + (*in_len & ~1);
376 	/* Set `limit` to one less than the actual amount of space in the buffer; this is because
377 	 * on some iterations of the below loop, we might produce two output words */
378 	uint32_t *out = buf, *limit = buf + bufsize - 1;
379 
380 	while (p < e && out < limit) {
381 		unsigned char c1 = *p++;
382 		unsigned char c2 = *p++;
383 		uint16_t n = (c1 << 8) | c2;
384 
385 		if (n >= 0xD800 && n <= 0xDBFF) {
386 			/* Handle surrogate */
387 			if (p < e) {
388 				unsigned char c3 = *p++;
389 				unsigned char c4 = *p++;
390 				uint16_t n2 = (c3 << 8) | c4;
391 
392 				if (n2 >= 0xD800 && n2 <= 0xDBFF) {
393 					/* Wrong; that's the first half of a surrogate pair, when we were expecting the second */
394 					*out++ = MBFL_BAD_INPUT;
395 					p -= 2;
396 				} else if (n2 >= 0xDC00 && n2 <= 0xDFFF) {
397 					*out++ = (((n & 0x3FF) << 10) | (n2 & 0x3FF)) + 0x10000;
398 				} else {
399 					/* The first half of a surrogate pair was followed by a 'normal' codepoint */
400 					*out++ = MBFL_BAD_INPUT;
401 					*out++ = n2;
402 				}
403 			} else {
404 				*out++ = MBFL_BAD_INPUT;
405 			}
406 		} else if (n >= 0xDC00 && n <= 0xDFFF) {
407 			/* This is wrong; second part of surrogate pair has come first */
408 			*out++ = MBFL_BAD_INPUT;
409 		} else {
410 			*out++ = n;
411 		}
412 	}
413 
414 	if (p == e && (*in_len & 0x1) && out < limit) {
415 		/* There is an extra trailing byte (which shouldn't be there) */
416 		*out++ = MBFL_BAD_INPUT;
417 		p++;
418 	}
419 
420 	*in_len -= (p - *in);
421 	*in = p;
422 	return out - buf;
423 }
424 
/*
 * Buffer-based encoder: write `len` codepoints from `in` to `buf` as
 * big-endian UTF-16.
 *
 * in  - codepoints to encode
 * len - number of codepoints in `in`
 * buf - destination conversion buffer
 * end - not used by this function
 *
 * Space for two bytes per codepoint is requested up front; a surrogate pair
 * asks for four bytes plus two for every codepoint still to come.
 * Codepoints at or above MBFL_WCSPLANE_UTF32MAX are reported through
 * MB_CONVERT_ERROR.
 */
static void mb_wchar_to_utf16be(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);

	for (; len > 0; in++) {
		uint32_t cp = *in;

		/* From here on `len` counts the codepoints still left after this one */
		len--;

		if (cp < MBFL_WCSPLANE_UCS2MAX) {
			out = mb_convert_buf_add2(out, (cp >> 8) & 0xFF, cp & 0xFF);
		} else if (cp < MBFL_WCSPLANE_UTF32MAX) {
			uint16_t high = ((cp >> 10) - 0x40) | 0xD800;
			uint16_t low = (cp & 0x3FF) | 0xDC00;
			MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 4);
			out = mb_convert_buf_add4(out, (high >> 8) & 0xFF, high & 0xFF, (low >> 8) & 0xFF, low & 0xFF);
		} else {
			MB_CONVERT_ERROR(buf, out, limit, cp, mb_wchar_to_utf16be);
			/* Ask again for room for the remaining codepoints */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
449 
mb_utf16le_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)450 static size_t mb_utf16le_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
451 {
452 	/* We only want to read 16-bit words out of `str`; any trailing byte will be handled at the end */
453 	unsigned char *p = *in, *e = p + (*in_len & ~1);
454 	/* Set `limit` to one less than the actual amount of space in the buffer; this is because
455 	 * on some iterations of the below loop, we might produce two output words */
456 	uint32_t *out = buf, *limit = buf + bufsize - 1;
457 
458 	while (p < e && out < limit) {
459 		unsigned char c1 = *p++;
460 		unsigned char c2 = *p++;
461 		uint16_t n = (c2 << 8) | c1;
462 
463 		if (n >= 0xD800 && n <= 0xDBFF) {
464 			/* Handle surrogate */
465 			if (p < e) {
466 				unsigned char c3 = *p++;
467 				unsigned char c4 = *p++;
468 				uint16_t n2 = (c4 << 8) | c3;
469 
470 				if (n2 >= 0xD800 && n2 <= 0xDBFF) {
471 					/* Wrong; that's the first half of a surrogate pair, when we were expecting the second */
472 					*out++ = MBFL_BAD_INPUT;
473 					p -= 2;
474 				} else if (n2 >= 0xDC00 && n2 <= 0xDFFF) {
475 					*out++ = (((n & 0x3FF) << 10) | (n2 & 0x3FF)) + 0x10000;
476 				} else {
477 					/* The first half of a surrogate pair was followed by a 'normal' codepoint */
478 					*out++ = MBFL_BAD_INPUT;
479 					*out++ = n2;
480 				}
481 			} else {
482 				*out++ = MBFL_BAD_INPUT;
483 			}
484 		} else if (n >= 0xDC00 && n <= 0xDFFF) {
485 			/* This is wrong; second part of surrogate pair has come first */
486 			*out++ = MBFL_BAD_INPUT;
487 		} else {
488 			*out++ = n;
489 		}
490 	}
491 
492 	if (p == e && (*in_len & 0x1) && out < limit) {
493 		/* There is an extra trailing byte (which shouldn't be there) */
494 		*out++ = MBFL_BAD_INPUT;
495 		p++;
496 	}
497 
498 	*in_len -= (p - *in);
499 	*in = p;
500 	return out - buf;
501 }
502 
/*
 * Buffer-based encoder: write `len` codepoints from `in` to `buf` as
 * little-endian UTF-16.
 *
 * in  - codepoints to encode
 * len - number of codepoints in `in`
 * buf - destination conversion buffer
 * end - not used by this function
 *
 * Space for two bytes per codepoint is requested up front; a surrogate pair
 * asks for four bytes plus two for every codepoint still to come.
 * Codepoints at or above MBFL_WCSPLANE_UTF32MAX are reported through
 * MB_CONVERT_ERROR.
 */
static void mb_wchar_to_utf16le(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);

	for (; len > 0; in++) {
		uint32_t cp = *in;

		/* From here on `len` counts the codepoints still left after this one */
		len--;

		if (cp < MBFL_WCSPLANE_UCS2MAX) {
			out = mb_convert_buf_add2(out, cp & 0xFF, (cp >> 8) & 0xFF);
		} else if (cp < MBFL_WCSPLANE_UTF32MAX) {
			uint16_t high = ((cp >> 10) - 0x40) | 0xD800;
			uint16_t low = (cp & 0x3FF) | 0xDC00;
			MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 4);
			out = mb_convert_buf_add4(out, high & 0xFF, (high >> 8) & 0xFF, low & 0xFF, (low >> 8) & 0xFF);
		} else {
			MB_CONVERT_ERROR(buf, out, limit, cp, mb_wchar_to_utf16le);
			/* Ask again for room for the remaining codepoints */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
527