1 /*
2 * "streamable kanji code filter and converter"
3 * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
4 *
5 * LICENSE NOTICES
6 *
7 * This file is part of "streamable kanji code filter and converter",
8 * which is distributed under the terms of GNU Lesser General Public
9 * License (version 2) as published by the Free Software Foundation.
10 *
11 * This software is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with "streamable kanji code filter and converter";
18 * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
19 * Suite 330, Boston, MA 02111-1307 USA
20 *
21 * The author of this file:
22 *
23 */
24 /*
25 * The source code included in this file was separated from mbfilter.c
26 * by moriyoshi koizumi <moriyoshi@php.net> on 4 dec 2002.
27 *
28 */
29
30 #include "mbfilter.h"
31 #include "mbfilter_utf16.h"
32
33 static int mbfl_filt_conv_utf16_wchar_flush(mbfl_convert_filter *filter);
34 static size_t mb_utf16_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
35 static size_t mb_utf16be_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
36 static void mb_wchar_to_utf16be(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
37 static size_t mb_utf16le_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
38 static void mb_wchar_to_utf16le(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
39
/* Alias list referenced by the mbfl_encoding_utf16 descriptor; the only alias
 * is "utf16" and the array is terminated by NULL. */
40 static const char *mbfl_encoding_utf16_aliases[] = {"utf16", NULL};
41
/* Encoding descriptor for "UTF-16" (byte order decided from an optional BOM).
 * It ties the encoding identifier, its two name strings and its alias list to
 * the filter vtables vtbl_utf16_wchar / vtbl_wchar_utf16 and to the buffer
 * converters mb_utf16_to_wchar (decode) and mb_wchar_to_utf16be (encode, i.e.
 * output is written big-endian).
 * NOTE(review): the meaning of the NULL and 0 slots depends on the
 * mbfl_encoding declaration, which is not visible in this file. */
42 const mbfl_encoding mbfl_encoding_utf16 = {
43 mbfl_no_encoding_utf16,
44 "UTF-16",
45 "UTF-16",
46 mbfl_encoding_utf16_aliases,
47 NULL,
48 0,
49 &vtbl_utf16_wchar,
50 &vtbl_wchar_utf16,
51 mb_utf16_to_wchar,
52 mb_wchar_to_utf16be,
53 NULL
54 };
55
/* Encoding descriptor for "UTF-16BE" (fixed big-endian byte order).
 * No alias list is provided (NULL). Uses the filter vtables
 * vtbl_utf16be_wchar / vtbl_wchar_utf16be and the buffer converters
 * mb_utf16be_to_wchar (decode) and mb_wchar_to_utf16be (encode).
 * NOTE(review): the meaning of the remaining NULL and 0 slots depends on the
 * mbfl_encoding declaration, which is not visible in this file. */
56 const mbfl_encoding mbfl_encoding_utf16be = {
57 mbfl_no_encoding_utf16be,
58 "UTF-16BE",
59 "UTF-16BE",
60 NULL,
61 NULL,
62 0,
63 &vtbl_utf16be_wchar,
64 &vtbl_wchar_utf16be,
65 mb_utf16be_to_wchar,
66 mb_wchar_to_utf16be,
67 NULL
68 };
69
/* Encoding descriptor for "UTF-16LE" (fixed little-endian byte order).
 * No alias list is provided (NULL). Uses the filter vtables
 * vtbl_utf16le_wchar / vtbl_wchar_utf16le and the buffer converters
 * mb_utf16le_to_wchar (decode) and mb_wchar_to_utf16le (encode).
 * NOTE(review): the meaning of the remaining NULL and 0 slots depends on the
 * mbfl_encoding declaration, which is not visible in this file. */
70 const mbfl_encoding mbfl_encoding_utf16le = {
71 mbfl_no_encoding_utf16le,
72 "UTF-16LE",
73 "UTF-16LE",
74 NULL,
75 NULL,
76 0,
77 &vtbl_utf16le_wchar,
78 &vtbl_wchar_utf16le,
79 mb_utf16le_to_wchar,
80 mb_wchar_to_utf16le,
81 NULL
82 };
83
/* Filter vtable for UTF-16 -> wchar. Bytes are handled by
 * mbfl_filt_conv_utf16_wchar (which picks the byte order from the first
 * 16-bit unit) and truncated input is reported by
 * mbfl_filt_conv_utf16_wchar_flush.
 * NOTE(review): the two NULL slots are presumably optional hooks; confirm
 * against the mbfl_convert_vtbl declaration. */
84 const struct mbfl_convert_vtbl vtbl_utf16_wchar = {
85 mbfl_no_encoding_utf16,
86 mbfl_no_encoding_wchar,
87 mbfl_filt_conv_common_ctor,
88 NULL,
89 mbfl_filt_conv_utf16_wchar,
90 mbfl_filt_conv_utf16_wchar_flush,
91 NULL,
92 };
93
/* Filter vtable for wchar -> UTF-16. It reuses the big-endian encoder
 * mbfl_filt_conv_wchar_utf16be, so plain "UTF-16" output is written high byte
 * first; flushing is delegated to mbfl_filt_conv_common_flush.
 * NOTE(review): the two NULL slots are presumably optional hooks; confirm
 * against the mbfl_convert_vtbl declaration. */
94 const struct mbfl_convert_vtbl vtbl_wchar_utf16 = {
95 mbfl_no_encoding_wchar,
96 mbfl_no_encoding_utf16,
97 mbfl_filt_conv_common_ctor,
98 NULL,
99 mbfl_filt_conv_wchar_utf16be,
100 mbfl_filt_conv_common_flush,
101 NULL,
102 };
103
/* Filter vtable for UTF-16BE -> wchar. Bytes are handled by
 * mbfl_filt_conv_utf16be_wchar; truncated input is reported by
 * mbfl_filt_conv_utf16_wchar_flush.
 * NOTE(review): the two NULL slots are presumably optional hooks; confirm
 * against the mbfl_convert_vtbl declaration. */
104 const struct mbfl_convert_vtbl vtbl_utf16be_wchar = {
105 mbfl_no_encoding_utf16be,
106 mbfl_no_encoding_wchar,
107 mbfl_filt_conv_common_ctor,
108 NULL,
109 mbfl_filt_conv_utf16be_wchar,
110 mbfl_filt_conv_utf16_wchar_flush,
111 NULL,
112 };
113
/* Filter vtable for wchar -> UTF-16BE. Code points are encoded by
 * mbfl_filt_conv_wchar_utf16be; flushing is delegated to
 * mbfl_filt_conv_common_flush.
 * NOTE(review): the two NULL slots are presumably optional hooks; confirm
 * against the mbfl_convert_vtbl declaration. */
114 const struct mbfl_convert_vtbl vtbl_wchar_utf16be = {
115 mbfl_no_encoding_wchar,
116 mbfl_no_encoding_utf16be,
117 mbfl_filt_conv_common_ctor,
118 NULL,
119 mbfl_filt_conv_wchar_utf16be,
120 mbfl_filt_conv_common_flush,
121 NULL,
122 };
123
/* Filter vtable for UTF-16LE -> wchar. Bytes are handled by
 * mbfl_filt_conv_utf16le_wchar; truncated input is reported by
 * mbfl_filt_conv_utf16_wchar_flush.
 * NOTE(review): the two NULL slots are presumably optional hooks; confirm
 * against the mbfl_convert_vtbl declaration. */
124 const struct mbfl_convert_vtbl vtbl_utf16le_wchar = {
125 mbfl_no_encoding_utf16le,
126 mbfl_no_encoding_wchar,
127 mbfl_filt_conv_common_ctor,
128 NULL,
129 mbfl_filt_conv_utf16le_wchar,
130 mbfl_filt_conv_utf16_wchar_flush,
131 NULL,
132 };
133
/* Filter vtable for wchar -> UTF-16LE. Code points are encoded by
 * mbfl_filt_conv_wchar_utf16le; flushing is delegated to
 * mbfl_filt_conv_common_flush.
 * NOTE(review): the two NULL slots are presumably optional hooks; confirm
 * against the mbfl_convert_vtbl declaration. */
134 const struct mbfl_convert_vtbl vtbl_wchar_utf16le = {
135 mbfl_no_encoding_wchar,
136 mbfl_no_encoding_utf16le,
137 mbfl_filt_conv_common_ctor,
138 NULL,
139 mbfl_filt_conv_wchar_utf16le,
140 mbfl_filt_conv_common_flush,
141 NULL,
142 };
143
/* CK(statement): evaluate `statement` once; if its value is negative, return
 * -1 from the enclosing function. Used below to propagate failures reported
 * by the downstream output function. */
144 #define CK(statement) do { if ((statement) < 0) return (-1); } while (0)
145
/*
 * Byte-wise decoder for "UTF-16" input whose byte order is not yet known.
 *
 * The first two bytes are collected and read as a big-endian 16-bit unit:
 *  - 0xFFFE  : little-endian BOM; all further bytes go to the UTF-16LE filter.
 *  - 0xFEFF  : big-endian BOM; it is swallowed and decoding continues as BE.
 *  - other   : the input is treated as big-endian and the unit is processed
 *              right here (it may be the first half of a surrogate pair).
 * In every case filter->filter_function is replaced, so this function only
 * ever sees the first 16-bit unit of a stream.
 *
 * c:      next input byte (only the low 8 bits are used)
 * filter: conversion filter holding the decoder state
 * Returns 0 on success, -1 if the output function reported a failure.
 */
int mbfl_filt_conv_utf16_wchar(int c, mbfl_convert_filter *filter)
{
	int unit;

	if (filter->status == 0) {
		/* First byte of the leading unit: just remember it */
		filter->cache = c & 0xFF;
		filter->status = 1;
		return 0;
	}

	unit = (filter->cache << 8) | (c & 0xFF);
	filter->status = 0;
	filter->cache = 0;

	if (unit == 0xFFFE) {
		/* Byte-swapped BOM: the stream is little-endian */
		filter->filter_function = mbfl_filt_conv_utf16le_wchar;
		return 0;
	}

	/* Anything else means big-endian from here on */
	filter->filter_function = mbfl_filt_conv_utf16be_wchar;

	if (unit >= 0xD800 && unit <= 0xDBFF) {
		/* High surrogate: keep its 10 payload bits and hand over to the
		 * BE filter in the "expecting low surrogate" state */
		filter->cache = unit & 0x3FF;
		filter->status = 2;
		return 0;
	}

	if (unit >= 0xDC00 && unit <= 0xDFFF) {
		/* A low surrogate cannot start a pair */
		CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
	} else if (unit != 0xFEFF) {
		/* Ordinary BMP code point (a big-endian BOM is silently dropped) */
		CK((*filter->output_function)(unit, filter->data));
	}

	return 0;
}
176
/*
 * Byte-wise decoder for big-endian UTF-16.
 *
 * filter->status tracks the position inside the current code point:
 *   0  waiting for the high byte of a 16-bit unit
 *   1  waiting for the low byte of that unit
 *   2  a high surrogate was seen; waiting for the high byte of the next unit
 *   3  waiting for the low byte of the unit following a high surrogate
 * Any other status value is ignored and the byte is dropped.
 *
 * c:      next input byte (only the low 8 bits are used)
 * filter: conversion filter holding the decoder state
 * Returns 0 on success, -1 if the output function reported a failure.
 */
int mbfl_filt_conv_utf16be_wchar(int c, mbfl_convert_filter *filter)
{
	const int state = filter->status;
	const int lo = c & 0xFF;
	int unit;

	if (state == 0) {
		filter->cache = lo;
		filter->status = 1;
	} else if (state == 1) {
		unit = (filter->cache << 8) | lo;
		if (unit >= 0xD800 && unit <= 0xDBFF) {
			/* High surrogate: stash its 10 payload bits */
			filter->cache = unit & 0x3FF;
			filter->status = 2;
		} else {
			filter->status = 0;
			if (unit >= 0xDC00 && unit <= 0xDFFF) {
				/* Low surrogate without a preceding high surrogate */
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			} else {
				CK((*filter->output_function)(unit, filter->data));
			}
		}
	} else if (state == 2) {
		/* Bits 8..17 of cache now hold the high surrogate payload, bits 0..7
		 * the high byte of the following unit */
		filter->cache = (filter->cache << 8) | lo;
		filter->status = 3;
	} else if (state == 3) {
		unit = ((filter->cache & 0xFF) << 8) | lo;
		if (unit >= 0xD800 && unit <= 0xDBFF) {
			/* Another high surrogate: the earlier one is bad, and this one
			 * becomes the pending first half */
			filter->cache = unit & 0x3FF;
			filter->status = 2;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		} else {
			filter->status = 0;
			if (unit >= 0xDC00 && unit <= 0xDFFF) {
				/* Valid pair: combine both 10-bit payloads */
				int cp = ((filter->cache & 0x3FF00) << 2) + (unit & 0x3FF) + 0x10000;
				CK((*filter->output_function)(cp, filter->data));
			} else {
				/* High surrogate followed by an ordinary unit: report the
				 * broken surrogate, then pass the ordinary unit through */
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
				CK((*filter->output_function)(unit, filter->data));
			}
		}
	}

	return 0;
}
227
/*
 * Encode one code point as big-endian UTF-16 and push the resulting bytes to
 * the output function, high byte first.
 *
 *  - 0 <= c < MBFL_WCSPLANE_UCS2MAX: one 16-bit unit (2 bytes).
 *  - MBFL_WCSPLANE_SUPMIN <= c < MBFL_WCSPLANE_UTF32MAX: a surrogate pair
 *    (4 bytes).
 *  - anything else (negative values, values past the last Unicode code
 *    point): handed to mbfl_filt_conv_illegal_output().
 *
 * The upper bound of the surrogate branch is MBFL_WCSPLANE_UTF32MAX, the same
 * bound used by mb_wchar_to_utf16be() in this file. A larger value does not
 * fit in the 10 payload bits of the high surrogate; `(c >> 10) - 0x40` would
 * spill into the 0xDC00 range and produce a malformed pair.
 *
 * c:      code point to encode
 * filter: conversion filter providing the output function
 * Returns 0 on success, -1 if an output call reported a failure.
 */
int mbfl_filt_conv_wchar_utf16be(int c, mbfl_convert_filter *filter)
{
	int n;

	if (c >= 0 && c < MBFL_WCSPLANE_UCS2MAX) {
		CK((*filter->output_function)((c >> 8) & 0xff, filter->data));
		CK((*filter->output_function)(c & 0xff, filter->data));
	} else if (c >= MBFL_WCSPLANE_SUPMIN && c < MBFL_WCSPLANE_UTF32MAX) {
		/* High surrogate: top 10 bits of (c - 0x10000) */
		n = ((c >> 10) - 0x40) | 0xd800;
		CK((*filter->output_function)((n >> 8) & 0xff, filter->data));
		CK((*filter->output_function)(n & 0xff, filter->data));
		/* Low surrogate: bottom 10 bits */
		n = (c & 0x3ff) | 0xdc00;
		CK((*filter->output_function)((n >> 8) & 0xff, filter->data));
		CK((*filter->output_function)(n & 0xff, filter->data));
	} else {
		CK(mbfl_filt_conv_illegal_output(c, filter));
	}

	return 0;
}
248
/*
 * Byte-wise decoder for little-endian UTF-16.
 *
 * filter->status tracks the position inside the current code point:
 *   0  waiting for the low byte of a 16-bit unit
 *   1  waiting for the high byte of that unit
 *   2  a high surrogate was seen; waiting for the low byte of the next unit
 *   3  waiting for the high byte of the unit following a high surrogate
 * Any other status value is ignored and the byte is dropped.
 *
 * c:      next input byte
 * filter: conversion filter holding the decoder state
 * Returns 0 on success, -1 if the output function reported a failure.
 */
int mbfl_filt_conv_utf16le_wchar(int c, mbfl_convert_filter *filter)
{
	int unit;

	if (filter->status == 0) {
		filter->cache = c & 0xff;
		filter->status = 1;
		return 0;
	}

	if (filter->status == 1) {
		const int top = c & 0xfc;

		if (top == 0xd8) {
			/* High surrogate: cache now holds its 10 payload bits */
			filter->cache += ((c & 0x3) << 8);
			filter->status = 2;
		} else if (top == 0xdc) {
			/* Low surrogate without a preceding high surrogate */
			filter->status = 0;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		} else {
			filter->status = 0;
			CK((*filter->output_function)(filter->cache + ((c & 0xff) << 8), filter->data));
		}
		return 0;
	}

	if (filter->status == 2) {
		/* Move the high-surrogate payload up to bits 10..19 and keep the low
		 * byte of the following unit in bits 0..7 */
		filter->cache = (filter->cache << 10) + (c & 0xff);
		filter->status = 3;
		return 0;
	}

	if (filter->status == 3) {
		unit = (filter->cache & 0xFF) | ((c & 0xFF) << 8);

		if (unit >= 0xD800 && unit <= 0xDBFF) {
			/* A second high surrogate: the earlier one is bad, this one
			 * becomes the pending first half */
			filter->cache = unit & 0x3FF;
			filter->status = 2;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		} else if (unit >= 0xDC00 && unit <= 0xDFFF) {
			/* Valid pair: add the two remaining payload bits and the offset */
			unit = filter->cache + ((c & 0x3) << 8) + 0x10000;
			filter->status = 0;
			CK((*filter->output_function)(unit, filter->data));
		} else {
			/* High surrogate followed by an ordinary unit: report the broken
			 * surrogate, then pass the ordinary unit through */
			filter->status = 0;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			CK((*filter->output_function)(unit, filter->data));
		}
		return 0;
	}

	return 0;
}
303
/*
 * Encode one code point as little-endian UTF-16 and push the resulting bytes
 * to the output function, low byte first.
 *
 *  - 0 <= c < MBFL_WCSPLANE_UCS2MAX: one 16-bit unit (2 bytes).
 *  - MBFL_WCSPLANE_SUPMIN <= c < MBFL_WCSPLANE_UTF32MAX: a surrogate pair
 *    (4 bytes).
 *  - anything else (negative values, values past the last Unicode code
 *    point): handed to mbfl_filt_conv_illegal_output().
 *
 * The upper bound of the surrogate branch is MBFL_WCSPLANE_UTF32MAX, the same
 * bound used by mb_wchar_to_utf16le() in this file. A larger value does not
 * fit in the 10 payload bits of the high surrogate; `(c >> 10) - 0x40` would
 * spill into the 0xDC00 range and produce a malformed pair.
 *
 * c:      code point to encode
 * filter: conversion filter providing the output function
 * Returns 0 on success, -1 if an output call reported a failure.
 */
int mbfl_filt_conv_wchar_utf16le(int c, mbfl_convert_filter *filter)
{
	int n;

	if (c >= 0 && c < MBFL_WCSPLANE_UCS2MAX) {
		CK((*filter->output_function)(c & 0xff, filter->data));
		CK((*filter->output_function)((c >> 8) & 0xff, filter->data));
	} else if (c >= MBFL_WCSPLANE_SUPMIN && c < MBFL_WCSPLANE_UTF32MAX) {
		/* High surrogate: top 10 bits of (c - 0x10000) */
		n = ((c >> 10) - 0x40) | 0xd800;
		CK((*filter->output_function)(n & 0xff, filter->data));
		CK((*filter->output_function)((n >> 8) & 0xff, filter->data));
		/* Low surrogate: bottom 10 bits */
		n = (c & 0x3ff) | 0xdc00;
		CK((*filter->output_function)(n & 0xff, filter->data));
		CK((*filter->output_function)((n >> 8) & 0xff, filter->data));
	} else {
		CK(mbfl_filt_conv_illegal_output(c, filter));
	}

	return 0;
}
324
/*
 * Flush handler shared by the UTF-16, UTF-16BE and UTF-16LE decoders.
 *
 * A non-zero status at end of input means the stream stopped in the middle of
 * a code unit or surrogate pair; that is reported as a single MBFL_BAD_INPUT
 * and the state is reset. Afterwards the downstream flush function, if any,
 * is invoked (its return value is not inspected).
 *
 * Returns 0 on success, -1 if the output function reported a failure.
 */
static int mbfl_filt_conv_utf16_wchar_flush(mbfl_convert_filter *filter)
{
	const int truncated = (filter->status != 0);

	if (truncated) {
		filter->status = 0;
		CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
	}

	if (filter->flush_function != NULL) {
		filter->flush_function(filter->data);
	}

	return 0;
}
339
/* Values stored in `*state` by mb_utf16_to_wchar once the byte order of a
 * "UTF-16" input has been decided; any other value means "not decided yet". */
340 #define DETECTED_BE 1
341 #define DETECTED_LE 2
342
mb_utf16_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)343 static size_t mb_utf16_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
344 {
345 if (*state == DETECTED_BE) {
346 return mb_utf16be_to_wchar(in, in_len, buf, bufsize, NULL);
347 } else if (*state == DETECTED_LE) {
348 return mb_utf16le_to_wchar(in, in_len, buf, bufsize, NULL);
349 } else if (*in_len >= 2) {
350 unsigned char *p = *in;
351 unsigned char c1 = *p++;
352 unsigned char c2 = *p++;
353 uint16_t n = (c1 << 8) | c2;
354
355 if (n == 0xFFFE) {
356 /* Little-endian BOM */
357 *in = p;
358 *in_len -= 2;
359 *state = DETECTED_LE;
360 return mb_utf16le_to_wchar(in, in_len, buf, bufsize, NULL);
361 } if (n == 0xFEFF) {
362 /* Big-endian BOM; don't send to output */
363 *in = p;
364 *in_len -= 2;
365 }
366 }
367
368 *state = DETECTED_BE;
369 return mb_utf16be_to_wchar(in, in_len, buf, bufsize, NULL);
370 }
371
mb_utf16be_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)372 static size_t mb_utf16be_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
373 {
374 /* We only want to read 16-bit words out of `str`; any trailing byte will be handled at the end */
375 unsigned char *p = *in, *e = p + (*in_len & ~1);
376 /* Set `limit` to one less than the actual amount of space in the buffer; this is because
377 * on some iterations of the below loop, we might produce two output words */
378 uint32_t *out = buf, *limit = buf + bufsize - 1;
379
380 while (p < e && out < limit) {
381 unsigned char c1 = *p++;
382 unsigned char c2 = *p++;
383 uint16_t n = (c1 << 8) | c2;
384
385 if (n >= 0xD800 && n <= 0xDBFF) {
386 /* Handle surrogate */
387 if (p < e) {
388 unsigned char c3 = *p++;
389 unsigned char c4 = *p++;
390 uint16_t n2 = (c3 << 8) | c4;
391
392 if (n2 >= 0xD800 && n2 <= 0xDBFF) {
393 /* Wrong; that's the first half of a surrogate pair, when we were expecting the second */
394 *out++ = MBFL_BAD_INPUT;
395 p -= 2;
396 } else if (n2 >= 0xDC00 && n2 <= 0xDFFF) {
397 *out++ = (((n & 0x3FF) << 10) | (n2 & 0x3FF)) + 0x10000;
398 } else {
399 /* The first half of a surrogate pair was followed by a 'normal' codepoint */
400 *out++ = MBFL_BAD_INPUT;
401 *out++ = n2;
402 }
403 } else {
404 *out++ = MBFL_BAD_INPUT;
405 }
406 } else if (n >= 0xDC00 && n <= 0xDFFF) {
407 /* This is wrong; second part of surrogate pair has come first */
408 *out++ = MBFL_BAD_INPUT;
409 } else {
410 *out++ = n;
411 }
412 }
413
414 if (p == e && (*in_len & 0x1) && out < limit) {
415 /* There is an extra trailing byte (which shouldn't be there) */
416 *out++ = MBFL_BAD_INPUT;
417 p++;
418 }
419
420 *in_len -= (p - *in);
421 *in = p;
422 return out - buf;
423 }
424
/*
 * Buffer encoder: write `len` code points from `in` to `buf` as big-endian
 * UTF-16.
 *
 * Values below MBFL_WCSPLANE_UCS2MAX become one 16-bit unit, values below
 * MBFL_WCSPLANE_UTF32MAX become a surrogate pair, and everything else is
 * routed through MB_CONVERT_ERROR. Space for two bytes per remaining code
 * point is reserved up front and re-checked whenever more than that may have
 * been used.
 *
 * `end` is not used by this function.
 */
static void mb_wchar_to_utf16be(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);

	while (len > 0) {
		uint32_t cp = *in++;
		len--;

		if (cp < MBFL_WCSPLANE_UCS2MAX) {
			out = mb_convert_buf_add2(out, (cp >> 8) & 0xFF, cp & 0xFF);
			continue;
		}

		if (cp < MBFL_WCSPLANE_UTF32MAX) {
			uint16_t hi = ((cp >> 10) - 0x40) | 0xD800;
			uint16_t lo = (cp & 0x3FF) | 0xDC00;
			/* A pair takes 4 bytes, two more than were budgeted for it */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 4);
			out = mb_convert_buf_add4(out, (hi >> 8) & 0xFF, hi & 0xFF, (lo >> 8) & 0xFF, lo & 0xFF);
			continue;
		}

		/* Not representable in UTF-16; the error path may have written to the
		 * buffer, so re-check the space for the remaining code points */
		MB_CONVERT_ERROR(buf, out, limit, cp, mb_wchar_to_utf16be);
		MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
449
mb_utf16le_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)450 static size_t mb_utf16le_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
451 {
452 /* We only want to read 16-bit words out of `str`; any trailing byte will be handled at the end */
453 unsigned char *p = *in, *e = p + (*in_len & ~1);
454 /* Set `limit` to one less than the actual amount of space in the buffer; this is because
455 * on some iterations of the below loop, we might produce two output words */
456 uint32_t *out = buf, *limit = buf + bufsize - 1;
457
458 while (p < e && out < limit) {
459 unsigned char c1 = *p++;
460 unsigned char c2 = *p++;
461 uint16_t n = (c2 << 8) | c1;
462
463 if (n >= 0xD800 && n <= 0xDBFF) {
464 /* Handle surrogate */
465 if (p < e) {
466 unsigned char c3 = *p++;
467 unsigned char c4 = *p++;
468 uint16_t n2 = (c4 << 8) | c3;
469
470 if (n2 >= 0xD800 && n2 <= 0xDBFF) {
471 /* Wrong; that's the first half of a surrogate pair, when we were expecting the second */
472 *out++ = MBFL_BAD_INPUT;
473 p -= 2;
474 } else if (n2 >= 0xDC00 && n2 <= 0xDFFF) {
475 *out++ = (((n & 0x3FF) << 10) | (n2 & 0x3FF)) + 0x10000;
476 } else {
477 /* The first half of a surrogate pair was followed by a 'normal' codepoint */
478 *out++ = MBFL_BAD_INPUT;
479 *out++ = n2;
480 }
481 } else {
482 *out++ = MBFL_BAD_INPUT;
483 }
484 } else if (n >= 0xDC00 && n <= 0xDFFF) {
485 /* This is wrong; second part of surrogate pair has come first */
486 *out++ = MBFL_BAD_INPUT;
487 } else {
488 *out++ = n;
489 }
490 }
491
492 if (p == e && (*in_len & 0x1) && out < limit) {
493 /* There is an extra trailing byte (which shouldn't be there) */
494 *out++ = MBFL_BAD_INPUT;
495 p++;
496 }
497
498 *in_len -= (p - *in);
499 *in = p;
500 return out - buf;
501 }
502
/*
 * Buffer encoder: write `len` code points from `in` to `buf` as little-endian
 * UTF-16.
 *
 * Values below MBFL_WCSPLANE_UCS2MAX become one 16-bit unit, values below
 * MBFL_WCSPLANE_UTF32MAX become a surrogate pair, and everything else is
 * routed through MB_CONVERT_ERROR. Space for two bytes per remaining code
 * point is reserved up front and re-checked whenever more than that may have
 * been used.
 *
 * `end` is not used by this function.
 */
static void mb_wchar_to_utf16le(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);

	while (len > 0) {
		uint32_t cp = *in++;
		len--;

		if (cp < MBFL_WCSPLANE_UCS2MAX) {
			out = mb_convert_buf_add2(out, cp & 0xFF, (cp >> 8) & 0xFF);
			continue;
		}

		if (cp < MBFL_WCSPLANE_UTF32MAX) {
			uint16_t hi = ((cp >> 10) - 0x40) | 0xD800;
			uint16_t lo = (cp & 0x3FF) | 0xDC00;
			/* A pair takes 4 bytes, two more than were budgeted for it */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 4);
			out = mb_convert_buf_add4(out, hi & 0xFF, (hi >> 8) & 0xFF, lo & 0xFF, (lo >> 8) & 0xFF);
			continue;
		}

		/* Not representable in UTF-16; the error path may have written to the
		 * buffer, so re-check the space for the remaining code points */
		MB_CONVERT_ERROR(buf, out, limit, cp, mb_wchar_to_utf16le);
		MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
527