1 /*
2  * "streamable kanji code filter and converter"
3  * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
4  *
5  * LICENSE NOTICES
6  *
7  * This file is part of "streamable kanji code filter and converter",
8  * which is distributed under the terms of GNU Lesser General Public
9  * License (version 2) as published by the Free Software Foundation.
10  *
11  * This software is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with "streamable kanji code filter and converter";
18  * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
19  * Suite 330, Boston, MA  02111-1307  USA
20  *
21  * The author of this file:
22  *
23  */
24 /*
25  * The source code included in this file was separated from mbfilter.c
26  * by moriyoshi koizumi <moriyoshi@php.net> on 20 dec 2002.
27  *
28  */
29 
30 #include "mbfilter.h"
31 #include "mbfilter_utf32.h"
32 
/*
 * Forward declarations of the file-local functions. They are referenced by
 * the encoding and conversion tables in this file before their definitions
 * appear.
 */

/* Flush handler shared by the three UTF-32 decoding filters */
static int mbfl_filt_conv_utf32_wchar_flush(mbfl_convert_filter *filter);
/* Buffer-based decoders (bytes -> wchars) and encoders (wchars -> bytes) */
static size_t mb_utf32_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
static size_t mb_utf32be_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
static void mb_wchar_to_utf32be(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
static size_t mb_utf32le_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
static void mb_wchar_to_utf32le(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
39 
/* NULL-terminated list of alternative names for the "UTF-32" encoding */
static const char *mbfl_encoding_utf32_aliases[] = {"utf32", NULL};
41 
/*
 * Encoding descriptor for "UTF-32" without an explicit byte order.
 *
 * Decoding goes through the BOM-detecting entry points (vtbl_utf32_wchar and
 * mb_utf32_to_wchar); encoding uses the big-endian encoder
 * (mb_wchar_to_utf32be).
 *
 * NOTE(review): the initializer is positional; the exact meaning of each slot
 * (including the NULL ones) is fixed by the mbfl_encoding declaration in the
 * project headers, which is not visible here -- confirm against it.
 */
const mbfl_encoding mbfl_encoding_utf32 = {
	mbfl_no_encoding_utf32,
	"UTF-32",
	"UTF-32",
	mbfl_encoding_utf32_aliases,
	NULL,
	MBFL_ENCTYPE_WCS4,
	&vtbl_utf32_wchar,
	&vtbl_wchar_utf32,
	mb_utf32_to_wchar,
	mb_wchar_to_utf32be,
	NULL,
	NULL,
};
56 
/*
 * Encoding descriptor for "UTF-32BE" (fixed big-endian byte order).
 * No alias list is supplied for this encoding.
 *
 * NOTE(review): the initializer is positional; the exact meaning of each slot
 * (including the NULL ones) is fixed by the mbfl_encoding declaration in the
 * project headers, which is not visible here -- confirm against it.
 */
const mbfl_encoding mbfl_encoding_utf32be = {
	mbfl_no_encoding_utf32be,
	"UTF-32BE",
	"UTF-32BE",
	NULL,
	NULL,
	MBFL_ENCTYPE_WCS4,
	&vtbl_utf32be_wchar,
	&vtbl_wchar_utf32be,
	mb_utf32be_to_wchar,
	mb_wchar_to_utf32be,
	NULL,
	NULL,
};
71 
/*
 * Encoding descriptor for "UTF-32LE" (fixed little-endian byte order).
 * No alias list is supplied for this encoding.
 *
 * NOTE(review): the initializer is positional; the exact meaning of each slot
 * (including the NULL ones) is fixed by the mbfl_encoding declaration in the
 * project headers, which is not visible here -- confirm against it.
 */
const mbfl_encoding mbfl_encoding_utf32le = {
	mbfl_no_encoding_utf32le,
	"UTF-32LE",
	"UTF-32LE",
	NULL,
	NULL,
	MBFL_ENCTYPE_WCS4,
	&vtbl_utf32le_wchar,
	&vtbl_wchar_utf32le,
	mb_utf32le_to_wchar,
	mb_wchar_to_utf32le,
	NULL,
	NULL,
};
86 
/*
 * Conversion table: UTF-32 (byte order taken from a leading BOM) -> wchar.
 * Input bytes are handled by mbfl_filt_conv_utf32_wchar; leftover state is
 * handled by mbfl_filt_conv_utf32_wchar_flush.
 *
 * NOTE(review): the roles of the two NULL slots are defined by
 * struct mbfl_convert_vtbl, which is declared outside this file.
 */
const struct mbfl_convert_vtbl vtbl_utf32_wchar = {
	mbfl_no_encoding_utf32,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_utf32_wchar,
	mbfl_filt_conv_utf32_wchar_flush,
	NULL,
};
96 
/*
 * Conversion table: wchar -> UTF-32.
 * Output is produced by the big-endian encoder mbfl_filt_conv_wchar_utf32be,
 * i.e. plain "UTF-32" is written in big-endian byte order.
 *
 * NOTE(review): the roles of the two NULL slots are defined by
 * struct mbfl_convert_vtbl, which is declared outside this file.
 */
const struct mbfl_convert_vtbl vtbl_wchar_utf32 = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_utf32,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_utf32be,
	mbfl_filt_conv_common_flush,
	NULL,
};
106 
/*
 * Conversion table: UTF-32BE -> wchar.
 * Input bytes are handled by mbfl_filt_conv_utf32be_wchar; leftover state is
 * handled by mbfl_filt_conv_utf32_wchar_flush.
 *
 * NOTE(review): the roles of the two NULL slots are defined by
 * struct mbfl_convert_vtbl, which is declared outside this file.
 */
const struct mbfl_convert_vtbl vtbl_utf32be_wchar = {
	mbfl_no_encoding_utf32be,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_utf32be_wchar,
	mbfl_filt_conv_utf32_wchar_flush,
	NULL,
};
116 
/*
 * Conversion table: wchar -> UTF-32BE.
 * Output is produced by mbfl_filt_conv_wchar_utf32be.
 *
 * NOTE(review): the roles of the two NULL slots are defined by
 * struct mbfl_convert_vtbl, which is declared outside this file.
 */
const struct mbfl_convert_vtbl vtbl_wchar_utf32be = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_utf32be,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_utf32be,
	mbfl_filt_conv_common_flush,
	NULL,
};
126 
/*
 * Conversion table: UTF-32LE -> wchar.
 * Input bytes are handled by mbfl_filt_conv_utf32le_wchar; leftover state is
 * handled by mbfl_filt_conv_utf32_wchar_flush.
 *
 * NOTE(review): the roles of the two NULL slots are defined by
 * struct mbfl_convert_vtbl, which is declared outside this file.
 */
const struct mbfl_convert_vtbl vtbl_utf32le_wchar = {
	mbfl_no_encoding_utf32le,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_utf32le_wchar,
	mbfl_filt_conv_utf32_wchar_flush,
	NULL,
};
136 
/*
 * Conversion table: wchar -> UTF-32LE.
 * Output is produced by mbfl_filt_conv_wchar_utf32le.
 *
 * NOTE(review): the roles of the two NULL slots are defined by
 * struct mbfl_convert_vtbl, which is declared outside this file.
 */
const struct mbfl_convert_vtbl vtbl_wchar_utf32le = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_utf32le,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_utf32le,
	mbfl_filt_conv_common_flush,
	NULL,
};
146 
/*
 * Evaluate `statement` once; if its value is negative, return -1 from the
 * enclosing function. Only usable inside functions that return an integer.
 */
#define CK(statement)	do { if ((statement) < 0) return (-1); } while (0)
148 
/*
 * Hand one decoded 32-bit unit to the filter's output function.
 *
 * `n` is passed through unchanged when it lies in [0, MBFL_WCSPLANE_UTF32MAX)
 * and is not in the surrogate range 0xD800..0xDFFF; otherwise MBFL_BAD_INPUT
 * is sent in its place.
 *
 * Returns 0 on success, or -1 if the output function returned a negative
 * value.
 */
static int emit_char_if_valid(int n, mbfl_convert_filter *filter)
{
	bool in_range = (n >= 0 && n < MBFL_WCSPLANE_UTF32MAX);
	bool is_surrogate = (n >= 0xD800 && n <= 0xDFFF);
	int to_emit;

	if (in_range && !is_surrogate) {
		to_emit = n;
	} else {
		to_emit = MBFL_BAD_INPUT;
	}

	CK((*filter->output_function)(to_emit, filter->data));
	return 0;
}
158 
/*
 * Byte-at-a-time decoder for "UTF-32" whose byte order is not yet known.
 *
 * The first three bytes are accumulated big-endian in filter->cache, with
 * filter->status counting them. On the fourth byte the assembled value
 * selects the byte order for the rest of the stream by replacing
 * filter->filter_function:
 *   - 0xFFFE0000 (little-endian BOM): switch to the LE decoder, emit nothing;
 *   - 0xFEFF (big-endian BOM): switch to the BE decoder, emit nothing;
 *   - anything else: switch to the BE decoder and emit the value (or
 *     MBFL_BAD_INPUT if it is not valid) via emit_char_if_valid.
 *
 * Returns 0 on success, -1 if emitting failed.
 */
int mbfl_filt_conv_utf32_wchar(int c, mbfl_convert_filter *filter)
{
	int low_byte = c & 0xFF;
	int first_unit;

	if (filter->status < 3) {
		/* Still collecting the first 4-byte unit */
		filter->cache = (filter->cache << 8) | low_byte;
		filter->status++;
		return 0;
	}

	first_unit = ((unsigned int)filter->cache << 8) | low_byte;
	filter->status = 0;
	filter->cache = 0;

	if (first_unit == 0xFFFE0000) {
		/* Byte-swapped BOM: the stream is little-endian */
		filter->filter_function = mbfl_filt_conv_utf32le_wchar;
		return 0;
	}

	/* Big-endian, either by BOM or by default */
	filter->filter_function = mbfl_filt_conv_utf32be_wchar;
	if (first_unit != 0xFEFF) {
		CK(emit_char_if_valid(first_unit, filter));
	}
	return 0;
}
181 
/*
 * Byte-at-a-time decoder for big-endian UTF-32.
 *
 * Three bytes are accumulated in filter->cache (most significant first),
 * counted by filter->status. The fourth byte completes the unit, the state is
 * reset, and the value is passed to emit_char_if_valid.
 *
 * Returns 0 on success, -1 if emitting failed.
 */
int mbfl_filt_conv_utf32be_wchar(int c, mbfl_convert_filter *filter)
{
	int low_byte = c & 0xFF;
	int unit;

	if (filter->status < 3) {
		filter->cache = (filter->cache << 8) | low_byte;
		filter->status++;
		return 0;
	}

	unit = ((unsigned int)filter->cache << 8) | low_byte;
	filter->status = 0;
	filter->cache = 0;
	CK(emit_char_if_valid(unit, filter));
	return 0;
}
194 
/*
 * Encode one wchar as four big-endian UTF-32 bytes.
 *
 * Values outside [0, MBFL_WCSPLANE_UTF32MAX) are handed to
 * mbfl_filt_conv_illegal_output instead of being encoded.
 *
 * Returns 0 on success, -1 if any downstream call returned a negative value.
 */
int mbfl_filt_conv_wchar_utf32be(int c, mbfl_convert_filter *filter)
{
	int shift;

	if (c < 0 || c >= MBFL_WCSPLANE_UTF32MAX) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
		return 0;
	}

	/* Most significant byte first: shifts of 24, 16, 8, 0 */
	for (shift = 24; shift >= 0; shift -= 8) {
		CK((*filter->output_function)((c >> shift) & 0xff, filter->data));
	}

	return 0;
}
208 
/*
 * Byte-at-a-time decoder for little-endian UTF-32.
 *
 * Each of the first three bytes is OR-ed into filter->cache at bit position
 * 8 * filter->status (least significant byte first). The fourth byte supplies
 * the top 8 bits, the state is reset, and the value is passed to
 * emit_char_if_valid.
 *
 * Returns 0 on success, -1 if emitting failed.
 */
int mbfl_filt_conv_utf32le_wchar(int c, mbfl_convert_filter *filter)
{
	if (filter->status >= 3) {
		int unit = ((c & 0xFFU) << 24) | filter->cache;

		filter->status = 0;
		filter->cache = 0;
		CK(emit_char_if_valid(unit, filter));
		return 0;
	}

	filter->cache |= ((c & 0xFFU) << (8 * filter->status));
	filter->status++;
	return 0;
}
221 
/*
 * Encode one wchar as four little-endian UTF-32 bytes.
 *
 * Values outside [0, MBFL_WCSPLANE_UTF32MAX) are handed to
 * mbfl_filt_conv_illegal_output instead of being encoded.
 *
 * Returns 0 on success, -1 if any downstream call returned a negative value.
 */
int mbfl_filt_conv_wchar_utf32le(int c, mbfl_convert_filter *filter)
{
	int shift;

	if (c < 0 || c >= MBFL_WCSPLANE_UTF32MAX) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
		return 0;
	}

	/* Least significant byte first: shifts of 0, 8, 16, 24 */
	for (shift = 0; shift <= 24; shift += 8) {
		CK((*filter->output_function)((c >> shift) & 0xff, filter->data));
	}

	return 0;
}
235 
/*
 * Flush handler shared by the UTF-32 decoding filters.
 *
 * A non-zero filter->status means the input ended partway through a 4-byte
 * unit; in that case one MBFL_BAD_INPUT is emitted. The decoder state is then
 * cleared and, if the filter has a flush_function, it is called with
 * filter->data (its result is not inspected).
 *
 * Returns 0 on success, or -1 if emitting MBFL_BAD_INPUT failed; in the
 * failure case the state is left untouched and flush_function is not called.
 */
static int mbfl_filt_conv_utf32_wchar_flush(mbfl_convert_filter *filter)
{
	int truncated = (filter->status != 0);

	if (truncated) {
		/* 1-3 dangling bytes at end of input */
		CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
	}

	filter->status = 0;
	filter->cache = 0;

	if (filter->flush_function != NULL) {
		(*filter->flush_function)(filter->data);
	}

	return 0;
}
250 
/*
 * Values stored in the `state` word of mb_utf32_to_wchar once the byte order
 * of the input has been decided. Any other value means "not decided yet".
 */
#define DETECTED_BE 1
#define DETECTED_LE 2
253 
mb_utf32_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)254 static size_t mb_utf32_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
255 {
256 	if (*state == DETECTED_BE) {
257 		return mb_utf32be_to_wchar(in, in_len, buf, bufsize, NULL);
258 	} else if (*state == DETECTED_LE) {
259 		return mb_utf32le_to_wchar(in, in_len, buf, bufsize, NULL);
260 	} else if (*in_len >= 4) {
261 		unsigned char *p = *in;
262 		uint32_t c1 = *p++;
263 		uint32_t c2 = *p++;
264 		uint32_t c3 = *p++;
265 		uint32_t c4 = *p++;
266 		uint32_t w = (c1 << 24) | (c2 << 16) | (c3 << 8) | c4;
267 
268 		if (w == 0xFFFE0000) {
269 			/* Little-endian BOM */
270 			*in = p;
271 			*in_len -= 4;
272 			*state = DETECTED_LE;
273 			return mb_utf32le_to_wchar(in, in_len, buf, bufsize, NULL);
274 		} else if (w == 0xFEFF) {
275 			/* Big-endian BOM; don't send it to output */
276 			*in = p;
277 			*in_len -= 4;
278 		}
279 	}
280 
281 	*state = DETECTED_BE;
282 	return mb_utf32be_to_wchar(in, in_len, buf, bufsize, NULL);
283 }
284 
mb_utf32be_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)285 static size_t mb_utf32be_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
286 {
287 	unsigned char *p = *in, *e = p + (*in_len & ~3);
288 	uint32_t *out = buf, *limit = buf + bufsize;
289 
290 	while (p < e && out < limit) {
291 		uint32_t c1 = *p++;
292 		uint32_t c2 = *p++;
293 		uint32_t c3 = *p++;
294 		uint32_t c4 = *p++;
295 		uint32_t w = (c1 << 24) | (c2 << 16) | (c3 << 8) | c4;
296 
297 		if (w < MBFL_WCSPLANE_UTF32MAX && (w < 0xD800 || w > 0xDFFF)) {
298 			*out++ = w;
299 		} else {
300 			*out++ = MBFL_BAD_INPUT;
301 		}
302 	}
303 
304 	if (p == e && (*in_len & 0x3) && out < limit) {
305 		/* There are 1-3 trailing bytes, which shouldn't be there */
306 		*out++ = MBFL_BAD_INPUT;
307 		p = *in + *in_len;
308 	}
309 
310 	*in_len -= (p - *in);
311 	*in = p;
312 	return out - buf;
313 }
314 
/*
 * Buffer-based encoder: writes `len` wchars from `in` to `buf` as big-endian
 * UTF-32, four bytes per wchar.
 *
 * Values not below MBFL_WCSPLANE_UTF32MAX are reported through
 * MB_CONVERT_ERROR rather than encoded. `end` is not used here.
 *
 * NOTE(review): the MB_CONVERT_* macros are defined outside this file.
 * Presumably ENSURE guarantees room for the requested byte count and is
 * repeated after an error because error handling may itself consume buffer
 * space -- confirm against their definitions.
 */
static void mb_wchar_to_utf32be(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 4);

	while (len--) {
		uint32_t cp = *in++;

		if (cp >= MBFL_WCSPLANE_UTF32MAX) {
			/* Not encodable: report it, then re-reserve room for the rest */
			MB_CONVERT_ERROR(buf, out, limit, cp, mb_wchar_to_utf32be);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 4);
			continue;
		}

		out = mb_convert_buf_add4(out, (cp >> 24) & 0xFF, (cp >> 16) & 0xFF, (cp >> 8) & 0xFF, cp & 0xFF);
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
333 
mb_utf32le_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)334 static size_t mb_utf32le_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
335 {
336 	unsigned char *p = *in, *e = p + (*in_len & ~3);
337 	uint32_t *out = buf, *limit = buf + bufsize;
338 
339 	while (p < e && out < limit) {
340 		uint32_t c1 = *p++;
341 		uint32_t c2 = *p++;
342 		uint32_t c3 = *p++;
343 		uint32_t c4 = *p++;
344 		uint32_t w = (c4 << 24) | (c3 << 16) | (c2 << 8) | c1;
345 
346 		if (w < MBFL_WCSPLANE_UTF32MAX && (w < 0xD800 || w > 0xDFFF)) {
347 			*out++ = w;
348 		} else {
349 			*out++ = MBFL_BAD_INPUT;
350 		}
351 	}
352 
353 	if (p == e && (*in_len & 0x3) && out < limit) {
354 		/* There are 1-3 trailing bytes, which shouldn't be there */
355 		*out++ = MBFL_BAD_INPUT;
356 		p = *in + *in_len;
357 	}
358 
359 	*in_len -= (p - *in);
360 	*in = p;
361 	return out - buf;
362 }
363 
/*
 * Buffer-based encoder: writes `len` wchars from `in` to `buf` as
 * little-endian UTF-32, four bytes per wchar.
 *
 * Values not below MBFL_WCSPLANE_UTF32MAX are reported through
 * MB_CONVERT_ERROR rather than encoded. `end` is not used here.
 *
 * NOTE(review): the MB_CONVERT_* macros are defined outside this file.
 * Presumably ENSURE guarantees room for the requested byte count and is
 * repeated after an error because error handling may itself consume buffer
 * space -- confirm against their definitions.
 */
static void mb_wchar_to_utf32le(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 4);

	while (len--) {
		uint32_t cp = *in++;

		if (cp >= MBFL_WCSPLANE_UTF32MAX) {
			/* Not encodable: report it, then re-reserve room for the rest */
			MB_CONVERT_ERROR(buf, out, limit, cp, mb_wchar_to_utf32le);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 4);
			continue;
		}

		out = mb_convert_buf_add4(out, cp & 0xFF, (cp >> 8) & 0xFF, (cp >> 16) & 0xFF, (cp >> 24) & 0xFF);
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
382