1 /*
2  * "streamable kanji code filter and converter"
3  * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
4  *
5  * LICENSE NOTICES
6  *
7  * This file is part of "streamable kanji code filter and converter",
8  * which is distributed under the terms of GNU Lesser General Public
9  * License (version 2) as published by the Free Software Foundation.
10  *
11  * This software is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with "streamable kanji code filter and converter";
18  * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
19  * Suite 330, Boston, MA  02111-1307  USA
20  *
21  * The author of this file:
22  *
23  */
/*
 * The source code included in this file was separated from mbfilter.c
 * by moriyoshi koizumi <moriyoshi@php.net> on 4 dec 2002.
 *
 */
29 
30 #include "mbfilter.h"
31 #include "mbfilter_utf8.h"
32 
/*
 * Sequence length in bytes, indexed by the first byte of a UTF-8 sequence.
 * Bytes which cannot start a multi-byte sequence (ASCII, trail bytes
 * 0x80-0xBF, 0xC0/0xC1 and 0xF5-0xFF) are all given a length of 1.
 */
const unsigned char mblen_table_utf8[] = {
	/* 0x00-0x7F: single-byte (ASCII) */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x00-0x0F */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x10-0x1F */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x20-0x2F */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x30-0x3F */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x40-0x4F */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x50-0x5F */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x60-0x6F */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x70-0x7F */

	/* 0x80-0xBF: trail bytes, never valid as a first byte */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x80-0x8F */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x90-0x9F */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xA0-0xAF */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xB0-0xBF */

	/* 0xC0-0xC1 are not usable as lead bytes; 0xC2-0xDF start 2-byte sequences */
	1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xC0-0xCF */
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xD0-0xDF */

	/* 0xE0-0xEF start 3-byte sequences */
	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 0xE0-0xEF */

	/* 0xF0-0xF4 start 4-byte sequences; 0xF5-0xFF are not usable */
	4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1  /* 0xF0-0xFF */
};
51 
52 static size_t mb_utf8_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
53 static void mb_wchar_to_utf8(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
54 
/* Alternative names for this encoding; the list is terminated by NULL */
static const char *mbfl_encoding_utf8_aliases[] = {
	"utf8",
	NULL
};
56 
57 const mbfl_encoding mbfl_encoding_utf8 = {
58 	mbfl_no_encoding_utf8,
59 	"UTF-8",
60 	"UTF-8",
61 	mbfl_encoding_utf8_aliases,
62 	mblen_table_utf8,
63 	0,
64 	&vtbl_utf8_wchar,
65 	&vtbl_wchar_utf8,
66 	mb_utf8_to_wchar,
67 	mb_wchar_to_utf8,
68 	NULL
69 };
70 
71 const struct mbfl_convert_vtbl vtbl_utf8_wchar = {
72 	mbfl_no_encoding_utf8,
73 	mbfl_no_encoding_wchar,
74 	mbfl_filt_conv_common_ctor,
75 	NULL,
76 	mbfl_filt_conv_utf8_wchar,
77 	mbfl_filt_conv_utf8_wchar_flush,
78 	NULL,
79 };
80 
81 const struct mbfl_convert_vtbl vtbl_wchar_utf8 = {
82 	mbfl_no_encoding_wchar,
83 	mbfl_no_encoding_utf8,
84 	mbfl_filt_conv_common_ctor,
85 	NULL,
86 	mbfl_filt_conv_wchar_utf8,
87 	mbfl_filt_conv_common_flush,
88 	NULL,
89 };
90 
/*
 * Evaluate `expr` once; if the result is negative, make the enclosing
 * function return -1. Only usable inside functions returning an integer.
 */
#define CK(expr) \
	do { \
		if ((expr) < 0) { \
			return (-1); \
		} \
	} while (0)
92 
/*
 * Abandon whatever sequence is in progress and pass MBFL_BAD_INPUT to the
 * filter's output function.
 *
 * The decoder state (status and cache) is cleared before the output call,
 * so it is reset even when that call fails.
 *
 * Returns 0 on success, -1 if the output function returned a negative value.
 */
int mbfl_filt_put_invalid_char(mbfl_convert_filter *filter)
{
	filter->cache = 0;
	filter->status = 0;

	if ((*filter->output_function)(MBFL_BAD_INPUT, filter->data) < 0) {
		return -1;
	}

	return 0;
}
99 
/*
 * Streaming UTF-8 decoder: takes one input byte per call and passes a code
 * point to filter->output_function each time a sequence is completed.
 *
 * filter->status records how far into a sequence the decoder is:
 *   0x00              waiting for the first byte of a sequence
 *   0x10              2-byte sequence, waiting for byte 2
 *   0x20, 0x21        3-byte sequence, waiting for byte 2 / byte 3
 *   0x30, 0x31, 0x32  4-byte sequence, waiting for byte 2 / 3 / 4
 * filter->cache holds the payload bits collected so far.
 *
 * If `c` cannot continue the sequence in progress, the sequence is reported
 * through mbfl_filt_put_invalid_char() (which also resets the state to 0x00)
 * and `c` is then examined again as the possible start of a new sequence.
 *
 * Returns 0 on success, -1 if an output call returned a negative value.
 */
int mbfl_filt_conv_utf8_wchar(int c, mbfl_convert_filter *filter)
{
	/* All bytes after the first one of a sequence must lie in 0x80-0xbf */
	const int is_trail = (c >= 0x80 && c <= 0xbf);

	for (;;) {
		int lead_bits;
		int rejected = 0;

		switch (filter->status) {
		case 0x00:
			if (c < 0x80) {
				/* Single byte; passed through unchanged */
				CK((*filter->output_function)(c, filter->data));
			} else if (c >= 0xc2 && c <= 0xdf) { /* first byte of a 2-byte sequence */
				filter->cache = c & 0x1f;
				filter->status = 0x10;
			} else if (c >= 0xe0 && c <= 0xef) { /* first byte of a 3-byte sequence */
				filter->cache = c & 0xf;
				filter->status = 0x20;
			} else if (c >= 0xf0 && c <= 0xf4) { /* first byte of a 4-byte sequence */
				filter->cache = c & 0x7;
				filter->status = 0x30;
			} else {
				/* 0x80-0xc1 or anything above 0xf4 cannot start a sequence */
				CK(mbfl_filt_put_invalid_char(filter));
			}
			break;

		case 0x10: /* last byte of a 2-byte sequence */
		case 0x21: /* last byte of a 3-byte sequence */
		case 0x32: /* last byte of a 4-byte sequence */
			if (is_trail) {
				int decoded = (filter->cache << 6) | (c & 0x3f);
				/* Reset before emitting, so the state is clean even if output fails */
				filter->cache = 0;
				filter->status = 0;
				CK((*filter->output_function)(decoded, filter->data));
			} else {
				rejected = 1;
			}
			break;

		case 0x20:
			/*
			 * Second byte of a 3-byte sequence. The accepted range depends on
			 * the first byte:
			 *   0xe0:       0xa0-0xbf (lower values would be over-long)
			 *   0xed:       0x80-0x9f (higher values would give U+D800-DFFF)
			 *   all others: 0x80-0xbf
			 */
			lead_bits = filter->cache & 0xf;
			if (is_trail &&
				(lead_bits != 0x0 || c >= 0xa0) &&
				(lead_bits != 0xd || c < 0xa0)) {
				filter->cache = (filter->cache << 6) | (c & 0x3f);
				filter->status = 0x21;
			} else {
				rejected = 1;
			}
			break;

		case 0x30:
			/*
			 * Second byte of a 4-byte sequence. The accepted range depends on
			 * the first byte:
			 *   0xf0:      0x90-0xbf (lower values would be over-long)
			 *   0xf4:      0x80-0x8f (higher values would exceed U+10FFFF)
			 *   0xf1-0xf3: 0x80-0xbf
			 */
			lead_bits = filter->cache & 0x7;
			if (is_trail &&
				(lead_bits != 0x0 || c >= 0x90) &&
				(lead_bits != 0x4 || c < 0x90)) {
				filter->cache = (filter->cache << 6) | (c & 0x3f);
				filter->status = 0x31;
			} else {
				rejected = 1;
			}
			break;

		case 0x31: /* third byte of a 4-byte sequence */
			if (is_trail) {
				filter->cache = (filter->cache << 6) | (c & 0x3f);
				filter->status = 0x32;
			} else {
				rejected = 1;
			}
			break;

			EMPTY_SWITCH_DEFAULT_CASE();
		}

		if (!rejected) {
			return 0;
		}

		/*
		 * `c` did not fit the sequence in progress. Report that sequence as
		 * invalid; this puts the decoder back in state 0x00, so the next
		 * pass over the switch treats `c` as a fresh first byte and returns.
		 */
		CK(mbfl_filt_put_invalid_char(filter));
	}
}
179 
/*
 * End-of-input handler for the UTF-8 decoder.
 *
 * If the input stopped part-way through a multi-byte sequence (status is
 * non-zero), MBFL_BAD_INPUT is emitted for it and the status is cleared.
 * The filter's flush function is then called, if one is set.
 *
 * The results of both calls are deliberately ignored; this function always
 * returns 0.
 */
int mbfl_filt_conv_utf8_wchar_flush(mbfl_convert_filter *filter)
{
	if (filter->status != 0) {
		(void) (*filter->output_function)(MBFL_BAD_INPUT, filter->data);
		filter->status = 0;
	}

	if (filter->flush_function != NULL) {
		(void) (*filter->flush_function)(filter->data);
	}

	return 0;
}
193 
/*
 * Encode one code point as UTF-8, passing each resulting byte to
 * filter->output_function in order.
 *
 * Values outside 0..0x10FFFF are handed to mbfl_filt_conv_illegal_output()
 * instead of being encoded.
 *
 * Returns 0 on success, -1 as soon as any call returns a negative value
 * (bytes already emitted are not retracted).
 */
int mbfl_filt_conv_wchar_utf8(int c, mbfl_convert_filter *filter)
{
	int first_byte;
	int shift; /* bit offset of the payload bits carried by the first byte */

	if (c < 0 || c >= 0x110000) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
		return 0;
	}

	if (c < 0x80) {
		/* 1 byte: 0xxxxxxx */
		first_byte = c;
		shift = 0;
	} else if (c < 0x800) {
		/* 2 bytes: 110xxxxx 10xxxxxx */
		first_byte = ((c >> 6) & 0x1f) | 0xc0;
		shift = 6;
	} else if (c < 0x10000) {
		/* 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx */
		first_byte = ((c >> 12) & 0x0f) | 0xe0;
		shift = 12;
	} else {
		/* 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
		first_byte = ((c >> 18) & 0x07) | 0xf0;
		shift = 18;
	}

	CK((*filter->output_function)(first_byte, filter->data));

	/* Each trail byte carries the next lower group of six bits */
	while (shift > 0) {
		shift -= 6;
		CK((*filter->output_function)(((c >> shift) & 0x3f) | 0x80, filter->data));
	}

	return 0;
}
218 
mb_utf8_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)219 static size_t mb_utf8_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
220 {
221 	unsigned char *p = *in, *e = p + *in_len;
222 	uint32_t *out = buf, *limit = buf + bufsize;
223 
224 	while (p < e && out < limit) {
225 		unsigned char c = *p++;
226 
227 		if (c < 0x80) {
228 			*out++ = c;
229 		} else if (c < 0xC2) {
230 			*out++ = MBFL_BAD_INPUT;
231 		} else if (c <= 0xDF) { /* 2 byte character */
232 			if (p < e) {
233 				unsigned char c2 = *p++;
234 				if ((c2 & 0xC0) != 0x80) {
235 					*out++ = MBFL_BAD_INPUT;
236 					p--;
237 				} else {
238 					*out++ = ((c & 0x1F) << 6) | (c2 & 0x3F);
239 				}
240 			} else {
241 				*out++ = MBFL_BAD_INPUT;
242 			}
243 		} else if (c <= 0xEF) { /* 3 byte character */
244 			if ((e - p) >= 2) {
245 				unsigned char c2 = *p++;
246 				unsigned char c3 = *p++;
247 				if ((c2 & 0xC0) != 0x80 || (c == 0xE0 && c2 < 0xA0) || (c == 0xED && c2 >= 0xA0)) {
248 					*out++ = MBFL_BAD_INPUT;
249 					p -= 2;
250 				} else if ((c3 & 0xC0) != 0x80) {
251 					*out++ = MBFL_BAD_INPUT;
252 					p--;
253 				} else {
254 					uint32_t decoded = ((c & 0xF) << 12) | ((c2 & 0x3F) << 6) | (c3 & 0x3F);
255 					ZEND_ASSERT(decoded >= 0x800); /* Not an overlong code unit */
256 					ZEND_ASSERT(decoded < 0xD800 || decoded > 0xDFFF); /* U+D800-DFFF are reserved, illegal code points */
257 					*out++ = decoded;
258 				}
259 			} else {
260 				*out++ = MBFL_BAD_INPUT;
261 				if (p < e && (c != 0xE0 || *p >= 0xA0) && (c != 0xED || *p < 0xA0) && (*p & 0xC0) == 0x80) {
262 					p++;
263 					if (p < e && (*p & 0xC0) == 0x80) {
264 						p++;
265 					}
266 				}
267 			}
268 		} else if (c <= 0xF4) { /* 4 byte character */
269 			if ((e - p) >= 3) {
270 				unsigned char c2 = *p++;
271 				unsigned char c3 = *p++;
272 				unsigned char c4 = *p++;
273 				/* If c == 0xF0 and c2 < 0x90, then this is an over-long code unit; it could have
274 				 * fit in 3 bytes only. If c == 0xF4 and c2 >= 0x90, then this codepoint is
275 				 * greater than U+10FFFF, which is the highest legal codepoint */
276 				if ((c2 & 0xC0) != 0x80 || (c == 0xF0 && c2 < 0x90) || (c == 0xF4 && c2 >= 0x90)) {
277 					*out++ = MBFL_BAD_INPUT;
278 					p -= 3;
279 				} else if ((c3 & 0xC0) != 0x80) {
280 					*out++ = MBFL_BAD_INPUT;
281 					p -= 2;
282 				} else if ((c4 & 0xC0) != 0x80) {
283 					*out++ = MBFL_BAD_INPUT;
284 					p--;
285 				} else {
286 					uint32_t decoded = ((c & 0x7) << 18) | ((c2 & 0x3F) << 12) | ((c3 & 0x3F) << 6) | (c4 & 0x3F);
287 					ZEND_ASSERT(decoded >= 0x10000); /* Not an overlong code unit */
288 					*out++ = decoded;
289 				}
290 			} else {
291 				*out++ = MBFL_BAD_INPUT;
292 				if (p < e) {
293 					unsigned char c2 = *p;
294 					if ((c == 0xF0 && c2 >= 0x90) || (c == 0xF4 && c2 < 0x90) || (c >= 0xF1 && c <= 0xF3)) {
295 						while (p < e && (*p & 0xC0) == 0x80) {
296 							p++;
297 						}
298 					}
299 				}
300 			}
301 		} else {
302 			*out++ = MBFL_BAD_INPUT;
303 		}
304 	}
305 
306 	*in_len = e - p;
307 	*in = p;
308 	return out - buf;
309 }
310 
/*
 * Encode `len` code points from `in` as UTF-8, appending the bytes to `buf`.
 *
 * Values of 0x110000 and above cannot be encoded and are passed to
 * MB_CONVERT_ERROR. The `end` flag is not used by this encoder.
 *
 * NOTE(review): the MB_CONVERT_BUF_* macros and mb_convert_buf_add*() are
 * defined outside this file. The comments below assume that
 * MB_CONVERT_BUF_ENSURE(buf, out, limit, n) makes room for at least n more
 * bytes and may update `out` and `limit`; confirm against their definitions.
 */
static void mb_wchar_to_utf8(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;

	MB_CONVERT_BUF_LOAD(buf, out, limit);
	/* Presumably reserves one byte per code point, enough if all are ASCII */
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	while (len > 0) {
		uint32_t codepoint = *in++;
		len--; /* From here on, `len` counts the code points still to come */

		if (codepoint < 0x80) {
			out = mb_convert_buf_add(out, codepoint & 0xFF);
		} else if (codepoint < 0x800) {
			/* Room for this 2-byte sequence plus one byte for each remaining code point */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			out = mb_convert_buf_add2(out,
				((codepoint >> 6) & 0x1F) | 0xC0,
				(codepoint & 0x3F) | 0x80);
		} else if (codepoint < 0x10000) {
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 3);
			out = mb_convert_buf_add3(out,
				((codepoint >> 12) & 0xF) | 0xE0,
				((codepoint >> 6) & 0x3F) | 0x80,
				(codepoint & 0x3F) | 0x80);
		} else if (codepoint < 0x110000) {
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
			out = mb_convert_buf_add4(out,
				((codepoint >> 18) & 0x7) | 0xF0,
				((codepoint >> 12) & 0x3F) | 0x80,
				((codepoint >> 6) & 0x3F) | 0x80,
				(codepoint & 0x3F) | 0x80);
		} else {
			MB_CONVERT_ERROR(buf, out, limit, codepoint, mb_wchar_to_utf8);
			/* Re-check the space for the remaining input after error handling */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
338