1 /*
2  * "streamable kanji code filter and converter"
3  * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
4  *
5  * LICENSE NOTICES
6  *
7  * This file is part of "streamable kanji code filter and converter",
8  * which is distributed under the terms of GNU Lesser General Public
9  * License (version 2) as published by the Free Software Foundation.
10  *
11  * This software is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with "streamable kanji code filter and converter";
18  * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
19  * Suite 330, Boston, MA  02111-1307  USA
20  *
21  * The author of this file:
22  *
23  */
24 /*
 * The source code included in this file was separated from mbfilter.c
26  * by moriyoshi koizumi <moriyoshi@php.net> on 4 dec 2002.
27  *
28  */
29 
30 #include "mbfilter.h"
31 #include "mbfilter_utf8.h"
32 #include "mbfilter_cjk.h"
33 #include "emoji2uni.h"
34 
/* Byte-length lookup table indexed by the first byte of a UTF-8 sequence.
 * 0x00-0xC1 and 0xF5-0xFF map to 1 (ASCII, plus bytes which cannot start a
 * valid multi-byte sequence); 0xC2-0xDF map to 2; 0xE0-0xEF map to 3;
 * 0xF0-0xF4 map to 4. */
const unsigned char mblen_table_utf8[] = {
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
	4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
};
53 
/* Helpers defined in other translation units */
extern int mbfl_bisec_srch2(int w, const unsigned short tbl[], int n);
extern int mbfl_filt_conv_sjis_mobile_flush(mbfl_convert_filter *filter);

/* Per-character filters for the UTF-8-Mobile variants */
static int mbfl_filt_conv_utf8_mobile_wchar(int c, mbfl_convert_filter *filter);
static int mbfl_filt_conv_wchar_utf8_mobile(int c, mbfl_convert_filter *filter);

/* Per-character filters for plain UTF-8 */
static int mbfl_filt_conv_utf8_wchar(int c, mbfl_convert_filter *filter);
static int mbfl_filt_conv_wchar_utf8(int c, mbfl_convert_filter *filter);
static int mbfl_filt_conv_utf8_wchar_flush(mbfl_convert_filter *filter);

/* Buffer-at-a-time conversion and string cutting for plain UTF-8 */
static size_t mb_utf8_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
static void mb_wchar_to_utf8(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
static zend_string* mb_cut_utf8(unsigned char *str, size_t from, size_t len, unsigned char *end);

/* Buffer-at-a-time conversion for the UTF-8-Mobile variants (DoCoMo, KDDI-A, KDDI-B, SoftBank) */
static size_t mb_utf8_docomo_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
static void mb_wchar_to_utf8_docomo(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
static size_t mb_utf8_kddi_a_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
static void mb_wchar_to_utf8_kddi_a(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
static size_t mb_utf8_kddi_b_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
static void mb_wchar_to_utf8_kddi_b(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
static size_t mb_utf8_sb_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
static void mb_wchar_to_utf8_sb(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
76 
/* Forward range mapping.
 *
 * Each row of `map` is {first, last, base}. If `c` lies within [first, last]
 * of some row, store `base + (c - first)` in `*w` and return true; the first
 * matching row wins. If no row matches, `*w` is left untouched and false is
 * returned. `n` is the number of rows in `map`. */
static bool mbfilter_conv_map_tbl(int c, int *w, unsigned int n, const unsigned short map[/* n */][3])
{
	const unsigned short (*row)[3] = map;
	const unsigned short (*stop)[3] = map + n;

	for (; row != stop; row++) {
		int first = (*row)[0];
		int last = (*row)[1];
		int base = (*row)[2];

		if (c < first || c > last) {
			continue;
		}
		*w = (c - first) + base;
		return true;
	}

	return false;
}
87 
/* Reverse range mapping (inverse of the {first, last, base} forward mapping).
 *
 * Each row of `map` is {first, last, base}. If `c` lies within
 * [base, base + (last - first)] of some row, store `first + (c - base)` in
 * `*w` and return true; the first matching row wins. If no row matches, `*w`
 * is left untouched and false is returned. `n` is the number of rows. */
static bool mbfilter_conv_r_map_tbl(int c, int *w, unsigned int n, const unsigned short map[/* n */][3])
{
	const unsigned short (*row)[3] = map;
	const unsigned short (*stop)[3] = map + n;

	for (; row != stop; row++) {
		int first = (*row)[0];
		int last = (*row)[1];
		int base = (*row)[2];

		if (c < base || c > base - first + last) {
			continue;
		}
		*w = c + first - base;
		return true;
	}

	return false;
}
99 
/* Range table for DoCoMo emoji, used with mbfilter_conv_map_tbl and
 * mbfilter_conv_r_map_tbl. Each row is {first, last, base}: codes in
 * [first, last] correspond one-to-one to the values starting at `base`
 * (all bases here lie in the Unicode Private Use Area). */
static const unsigned short mbfl_docomo2uni_pua[4][3] = {
	{0x28c2, 0x292f, 0xe63e},
	{0x2930, 0x2934, 0xe6ac},
	{0x2935, 0x2951, 0xe6b1},
	{0x2952, 0x29db, 0xe6ce},
};
106 
/* Range table for KDDI emoji (used for the KDDI-A variant), consumed by
 * mbfilter_conv_map_tbl and mbfilter_conv_r_map_tbl. Each row is
 * {first, last, base}: codes in [first, last] correspond one-to-one to the
 * values starting at `base` (all bases here lie in the Unicode Private Use Area). */
static const unsigned short mbfl_kddi2uni_pua[7][3] = {
	{0x26ec, 0x2838, 0xe468},
	{0x284c, 0x2863, 0xe5b5},
	{0x24b8, 0x24ca, 0xe5cd},
	{0x24cb, 0x2545, 0xea80},
	{0x2839, 0x284b, 0xeafb},
	{0x2546, 0x25c0, 0xeb0e},
	{0x25c1, 0x25c6, 0xeb89},
};
116 
/* Range table for KDDI emoji (used for the KDDI-B variant), consumed by
 * mbfilter_conv_map_tbl and mbfilter_conv_r_map_tbl. Each row is
 * {first, last, base}: codes in [first, last] correspond one-to-one to the
 * values starting at `base` (all bases here lie in the Unicode Private Use Area). */
static const unsigned short mbfl_kddi2uni_pua_b[8][3] = {
	{0x24b8, 0x24f6, 0xec40},
	{0x24f7, 0x2573, 0xec80},
	{0x2574, 0x25b2, 0xed40},
	{0x25b3, 0x25c6, 0xed80},
	{0x26ec, 0x272a, 0xef40},
	{0x272b, 0x27a7, 0xef80},
	{0x27a8, 0x27e6, 0xf040},
	{0x27e7, 0x2863, 0xf080},
};
127 
/* Range table for SoftBank emoji, used with mbfilter_conv_map_tbl and
 * mbfilter_conv_r_map_tbl. Each row is {first, last, base}: codes in
 * [first, last] correspond one-to-one to the values starting at `base`
 * (all bases here lie in the Unicode Private Use Area). */
static const unsigned short mbfl_sb2uni_pua[6][3] = {
	{0x27a9, 0x2802, 0xe101},
	{0x2808, 0x2861, 0xe201},
	{0x2921, 0x297a, 0xe001},
	{0x2980, 0x29cc, 0xe301},
	{0x2a99, 0x2ae4, 0xe401},
	{0x2af8, 0x2b35, 0xe501},
};
136 
/* NULL-terminated list of alternative names for UTF-8 */
static const char *mbfl_encoding_utf8_aliases[] = {"utf8", NULL};

/* Encoding descriptor for plain UTF-8. The initializer is positional; field
 * meanings are defined by the mbfl_encoding declaration (not in this file).
 * It wires together the alias list, the byte-length table, the per-character
 * filter vtables and the buffer-based converters defined in this file. */
const mbfl_encoding mbfl_encoding_utf8 = {
	mbfl_no_encoding_utf8,
	"UTF-8",
	"UTF-8",
	mbfl_encoding_utf8_aliases,
	mblen_table_utf8,
	0,
	&vtbl_utf8_wchar,
	&vtbl_wchar_utf8,
	mb_utf8_to_wchar,
	mb_wchar_to_utf8,
	NULL,
	mb_cut_utf8
};
153 
/* Filter vtable for UTF-8 -> wchar: bytes are fed to mbfl_filt_conv_utf8_wchar,
 * and mbfl_filt_conv_utf8_wchar_flush is used as the flush function.
 * (Positional initializer; see the mbfl_convert_vtbl declaration for field meanings.) */
const struct mbfl_convert_vtbl vtbl_utf8_wchar = {
	mbfl_no_encoding_utf8,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_utf8_wchar,
	mbfl_filt_conv_utf8_wchar_flush,
	NULL,
};
163 
/* Filter vtable for wchar -> UTF-8: code points are fed to mbfl_filt_conv_wchar_utf8,
 * and the common flush function is used.
 * (Positional initializer; see the mbfl_convert_vtbl declaration for field meanings.) */
const struct mbfl_convert_vtbl vtbl_wchar_utf8 = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_utf8,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_utf8,
	mbfl_filt_conv_common_flush,
	NULL,
};
173 
/* NULL-terminated alias lists for the UTF-8-Mobile variants.
 * (The KDDI-A variant has no alias list; its descriptor passes NULL.) */
static const char *mbfl_encoding_utf8_docomo_aliases[] = {"UTF-8-DOCOMO", "UTF8-DOCOMO", NULL};
static const char *mbfl_encoding_utf8_kddi_b_aliases[] = {"UTF-8-Mobile#KDDI", "UTF-8-KDDI", "UTF8-KDDI", NULL};
static const char *mbfl_encoding_utf8_sb_aliases[] = {"UTF-8-SOFTBANK", "UTF8-SOFTBANK", NULL};
177 
/* Encoding descriptor for the DoCoMo variant of UTF-8. Positional initializer;
 * field meanings are defined by the mbfl_encoding declaration (not in this file).
 * Shares the byte-length table and cut function with plain UTF-8, but uses the
 * DoCoMo-specific filter vtables and buffer converters. */
const mbfl_encoding mbfl_encoding_utf8_docomo = {
	mbfl_no_encoding_utf8_docomo,
	"UTF-8-Mobile#DOCOMO",
	"UTF-8",
	mbfl_encoding_utf8_docomo_aliases,
	mblen_table_utf8,
	0,
	&vtbl_utf8_docomo_wchar,
	&vtbl_wchar_utf8_docomo,
	mb_utf8_docomo_to_wchar,
	mb_wchar_to_utf8_docomo,
	NULL,
	mb_cut_utf8,
};
192 
/* Encoding descriptor for the KDDI-A variant of UTF-8. Positional initializer;
 * field meanings are defined by the mbfl_encoding declaration (not in this file).
 * No alias list is provided (NULL). Shares the byte-length table and cut
 * function with plain UTF-8, but uses the KDDI-A filter vtables and converters. */
const mbfl_encoding mbfl_encoding_utf8_kddi_a = {
	mbfl_no_encoding_utf8_kddi_a,
	"UTF-8-Mobile#KDDI-A",
	"UTF-8",
	NULL,
	mblen_table_utf8,
	0,
	&vtbl_utf8_kddi_a_wchar,
	&vtbl_wchar_utf8_kddi_a,
	mb_utf8_kddi_a_to_wchar,
	mb_wchar_to_utf8_kddi_a,
	NULL,
	mb_cut_utf8,
};
207 
/* Encoding descriptor for the KDDI-B variant of UTF-8. Positional initializer;
 * field meanings are defined by the mbfl_encoding declaration (not in this file).
 * Shares the byte-length table and cut function with plain UTF-8, but uses the
 * KDDI-B filter vtables and buffer converters. */
const mbfl_encoding mbfl_encoding_utf8_kddi_b = {
	mbfl_no_encoding_utf8_kddi_b,
	"UTF-8-Mobile#KDDI-B",
	"UTF-8",
	mbfl_encoding_utf8_kddi_b_aliases,
	mblen_table_utf8,
	0,
	&vtbl_utf8_kddi_b_wchar,
	&vtbl_wchar_utf8_kddi_b,
	mb_utf8_kddi_b_to_wchar,
	mb_wchar_to_utf8_kddi_b,
	NULL,
	mb_cut_utf8,
};
222 
/* Encoding descriptor for the SoftBank variant of UTF-8. Positional initializer;
 * field meanings are defined by the mbfl_encoding declaration (not in this file).
 * Shares the byte-length table and cut function with plain UTF-8, but uses the
 * SoftBank filter vtables and buffer converters. */
const mbfl_encoding mbfl_encoding_utf8_sb = {
	mbfl_no_encoding_utf8_sb,
	"UTF-8-Mobile#SOFTBANK",
	"UTF-8",
	mbfl_encoding_utf8_sb_aliases,
	mblen_table_utf8,
	0,
	&vtbl_utf8_sb_wchar,
	&vtbl_wchar_utf8_sb,
	mb_utf8_sb_to_wchar,
	mb_wchar_to_utf8_sb,
	NULL,
	mb_cut_utf8,
};
237 
/* Filter vtable for UTF-8 (DoCoMo) -> wchar. The filter function is shared by
 * all mobile variants and selects its emoji table from the source encoding;
 * the flush function is the same one used by plain UTF-8. */
const struct mbfl_convert_vtbl vtbl_utf8_docomo_wchar = {
	mbfl_no_encoding_utf8_docomo,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_utf8_mobile_wchar,
	mbfl_filt_conv_utf8_wchar_flush,
	NULL,
};
247 
/* Filter vtable for wchar -> UTF-8 (DoCoMo). The filter function is shared by
 * all mobile variants and selects its emoji table from the target encoding;
 * flushing is delegated to mbfl_filt_conv_sjis_mobile_flush (defined elsewhere). */
const struct mbfl_convert_vtbl vtbl_wchar_utf8_docomo = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_utf8_docomo,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_utf8_mobile,
	mbfl_filt_conv_sjis_mobile_flush,
	NULL,
};
257 
/* Filter vtable for UTF-8 (KDDI-A) -> wchar. The filter function is shared by
 * all mobile variants and selects its emoji table from the source encoding;
 * the flush function is the same one used by plain UTF-8. */
const struct mbfl_convert_vtbl vtbl_utf8_kddi_a_wchar = {
	mbfl_no_encoding_utf8_kddi_a,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_utf8_mobile_wchar,
	mbfl_filt_conv_utf8_wchar_flush,
	NULL,
};
267 
/* Filter vtable for wchar -> UTF-8 (KDDI-A). The filter function is shared by
 * all mobile variants and selects its emoji table from the target encoding;
 * flushing is delegated to mbfl_filt_conv_sjis_mobile_flush (defined elsewhere). */
const struct mbfl_convert_vtbl vtbl_wchar_utf8_kddi_a = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_utf8_kddi_a,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_utf8_mobile,
	mbfl_filt_conv_sjis_mobile_flush,
	NULL,
};
277 
/* Filter vtable for UTF-8 (KDDI-B) -> wchar. The filter function is shared by
 * all mobile variants and selects its emoji table from the source encoding;
 * the flush function is the same one used by plain UTF-8. */
const struct mbfl_convert_vtbl vtbl_utf8_kddi_b_wchar = {
	mbfl_no_encoding_utf8_kddi_b,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_utf8_mobile_wchar,
	mbfl_filt_conv_utf8_wchar_flush,
	NULL,
};
287 
/* Filter vtable for wchar -> UTF-8 (KDDI-B). The filter function is shared by
 * all mobile variants and selects its emoji table from the target encoding;
 * flushing is delegated to mbfl_filt_conv_sjis_mobile_flush (defined elsewhere). */
const struct mbfl_convert_vtbl vtbl_wchar_utf8_kddi_b = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_utf8_kddi_b,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_utf8_mobile,
	mbfl_filt_conv_sjis_mobile_flush,
	NULL,
};
297 
/* Filter vtable for UTF-8 (SoftBank) -> wchar. The filter function is shared by
 * all mobile variants and selects its emoji table from the source encoding;
 * the flush function is the same one used by plain UTF-8. */
const struct mbfl_convert_vtbl vtbl_utf8_sb_wchar = {
	mbfl_no_encoding_utf8_sb,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_utf8_mobile_wchar,
	mbfl_filt_conv_utf8_wchar_flush,
	NULL,
};
307 
/* Filter vtable for wchar -> UTF-8 (SoftBank). The filter function is shared by
 * all mobile variants and selects its emoji table from the target encoding;
 * flushing is delegated to mbfl_filt_conv_sjis_mobile_flush (defined elsewhere). */
const struct mbfl_convert_vtbl vtbl_wchar_utf8_sb = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_utf8_sb,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_utf8_mobile,
	mbfl_filt_conv_sjis_mobile_flush,
	NULL,
};
317 
/* Evaluate `statement`; if its value is negative, return -1 from the enclosing function */
#define CK(statement)	do { if ((statement) < 0) return (-1); } while (0)
319 
/* Discard any partially decoded character and emit MBFL_BAD_INPUT in its place.
 * The decoder state is cleared before the marker is passed downstream.
 * Returns 0 on success, or -1 if the output function reports a negative result. */
static int mbfl_filt_put_invalid_char(mbfl_convert_filter *filter)
{
	filter->cache = 0;
	filter->status = 0;

	if ((*filter->output_function)(MBFL_BAD_INPUT, filter->data) < 0) {
		return -1;
	}
	return 0;
}
326 
/* Byte-at-a-time UTF-8 decoder.
 *
 * `c` is the next input byte. `filter->status` records how far into a
 * multi-byte sequence we are (0x00: between characters; 0x10: 2-byte sequence;
 * 0x20/0x21: 3-byte sequence; 0x30/0x31/0x32: 4-byte sequence) and
 * `filter->cache` accumulates the payload bits decoded so far.
 *
 * When a byte is not valid at the current position, MBFL_BAD_INPUT is emitted,
 * the state is reset, and the same byte is re-examined as the possible start
 * of a new character (`goto retry`).
 *
 * Returns 0, or -1 if the output function reports a negative result. */
static int mbfl_filt_conv_utf8_wchar(int c, mbfl_convert_filter *filter)
{
	int s, c1;

retry:
	switch (filter->status) {
	case 0x00:
		if (c < 0x80) {
			CK((*filter->output_function)(c, filter->data));
		} else if (c >= 0xc2 && c <= 0xdf) { /* 2byte code first char: 0xc2-0xdf */
			filter->status = 0x10;
			filter->cache = c & 0x1f;
		} else if (c >= 0xe0 && c <= 0xef) { /* 3byte code first char: 0xe0-0xef */
			filter->status = 0x20;
			filter->cache = c & 0xf;
		} else if (c >= 0xf0 && c <= 0xf4) { /* 4byte code first char: 0xf0-0xf4 */
			filter->status = 0x30;
			filter->cache = c & 0x7;
		} else {
			/* Stray continuation byte, or a byte which can never start a valid sequence */
			CK(mbfl_filt_put_invalid_char(filter));
		}
		break;
	case 0x10: /* 2byte code 2nd char: 0x80-0xbf */
	case 0x21: /* 3byte code 3rd char: 0x80-0xbf */
	case 0x32: /* 4byte code 4th char: 0x80-0xbf */
		/* Final byte of a sequence: complete the code point and emit it */
		if (c >= 0x80 && c <= 0xbf) {
			s = (filter->cache<<6) | (c & 0x3f);
			filter->status = filter->cache = 0;
			CK((*filter->output_function)(s, filter->data));
		} else {
			CK(mbfl_filt_put_invalid_char(filter));
			goto retry;
		}
		break;
	case 0x20: /* 3byte code 2nd char: 0:0xa0-0xbf,D:0x80-0x9f,1-C,E-F:0x80-0xbf */
		s = (filter->cache<<6) | (c & 0x3f);
		c1 = filter->cache & 0xf;

		/* Lead 0xe0 requires >= 0xa0 (rejects overlong forms);
		 * lead 0xed requires < 0xa0 (rejects U+D800-DFFF) */
		if ((c >= 0x80 && c <= 0xbf) &&
			((c1 == 0x0 && c >= 0xa0) ||
			 (c1 == 0xd && c < 0xa0) ||
			 (c1 > 0x0 && c1 != 0xd))) {
			filter->cache = s;
			filter->status++;
		} else {
			CK(mbfl_filt_put_invalid_char(filter));
			goto retry;
		}
		break;
	case 0x30: /* 4byte code 2nd char: 0:0x90-0xbf,1-3:0x80-0xbf,4:0x80-0x8f */
		s = (filter->cache<<6) | (c & 0x3f);
		c1 = filter->cache & 0x7;

		/* Lead 0xf0 requires >= 0x90 (rejects overlong forms);
		 * lead 0xf4 requires < 0x90 (rejects values above U+10FFFF) */
		if ((c >= 0x80 && c <= 0xbf) &&
			((c1 == 0x0 && c >= 0x90) ||
			 (c1 == 0x4 && c < 0x90) ||
			 (c1 > 0x0 && c1 != 0x4))) {
			filter->cache = s;
			filter->status++;
		} else {
			CK(mbfl_filt_put_invalid_char(filter));
			goto retry;
		}
		break;
	case 0x31: /* 4byte code 3rd char: 0x80-0xbf */
		if (c >= 0x80 && c <= 0xbf) {
			filter->cache = (filter->cache<<6) | (c & 0x3f);
			filter->status++;
		} else {
			CK(mbfl_filt_put_invalid_char(filter));
			goto retry;
		}
		break;

		EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
406 
/* End-of-input handler for the UTF-8 decoders.
 * If the input stopped in the middle of a multi-byte sequence, a single
 * MBFL_BAD_INPUT is emitted for it. The downstream flush function, when one is
 * set, is then invoked. Results of both calls are ignored; always returns 0. */
static int mbfl_filt_conv_utf8_wchar_flush(mbfl_convert_filter *filter)
{
	if (filter->status != 0) {
		/* Truncated character at end of input */
		filter->output_function(MBFL_BAD_INPUT, filter->data);
		filter->status = 0;
	}

	if (filter->flush_function) {
		filter->flush_function(filter->data);
	}

	return 0;
}
420 
/* Encode one code point `c` as UTF-8, passing each resulting byte to the
 * filter's output function in order.
 * Values outside 0..0x10FFFF are handed to mbfl_filt_conv_illegal_output.
 * Returns 0, or -1 as soon as any downstream call reports a negative result. */
static int mbfl_filt_conv_wchar_utf8(int c, mbfl_convert_filter *filter)
{
	int bytes[4];
	int nbytes;

	if (c < 0 || c >= 0x110000) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
		return 0;
	}

	if (c < 0x80) {
		bytes[0] = c;
		nbytes = 1;
	} else if (c < 0x800) {
		bytes[0] = ((c >> 6) & 0x1f) | 0xc0;
		bytes[1] = (c & 0x3f) | 0x80;
		nbytes = 2;
	} else if (c < 0x10000) {
		bytes[0] = ((c >> 12) & 0x0f) | 0xe0;
		bytes[1] = ((c >> 6) & 0x3f) | 0x80;
		bytes[2] = (c & 0x3f) | 0x80;
		nbytes = 3;
	} else {
		bytes[0] = ((c >> 18) & 0x07) | 0xf0;
		bytes[1] = ((c >> 12) & 0x3f) | 0x80;
		bytes[2] = ((c >> 6) & 0x3f) | 0x80;
		bytes[3] = (c & 0x3f) | 0x80;
		nbytes = 4;
	}

	for (int i = 0; i < nbytes; i++) {
		CK((*filter->output_function)(bytes[i], filter->data));
	}

	return 0;
}
445 
/* Decode a buffer of UTF-8 bytes into code points.
 *
 * Reads from `*in` (at most `*in_len` bytes) and writes at most `bufsize`
 * values to `buf`, stopping when either the input is exhausted or the output
 * buffer is full. On return, `*in` and `*in_len` describe the input which has
 * not been consumed yet. Returns the number of values written to `buf`.
 * `state` is not used by this function.
 *
 * Each invalid or truncated sequence produces one MBFL_BAD_INPUT. When a byte
 * is found to be invalid as a continuation, `p` is moved back so that the
 * offending byte is re-examined as the possible start of a new character. */
static size_t mb_utf8_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
{
	unsigned char *p = *in, *e = p + *in_len;
	uint32_t *out = buf, *limit = buf + bufsize;

	while (p < e && out < limit) {
		unsigned char c = *p++;

		if (c < 0x80) {
			*out++ = c;
		} else if (c < 0xC2) {
			/* 0x80-0xBF: continuation byte with no lead byte;
			 * 0xC0-0xC1: could only start an overlong 2-byte sequence */
			*out++ = MBFL_BAD_INPUT;
		} else if (c <= 0xDF) { /* 2 byte character */
			if (p < e) {
				unsigned char c2 = *p++;
				if ((c2 & 0xC0) != 0x80) {
					*out++ = MBFL_BAD_INPUT;
					p--;
				} else {
					*out++ = ((c & 0x1F) << 6) | (c2 & 0x3F);
				}
			} else {
				*out++ = MBFL_BAD_INPUT;
			}
		} else if (c <= 0xEF) { /* 3 byte character */
			if ((e - p) >= 2) {
				unsigned char c2 = *p++;
				unsigned char c3 = *p++;
				/* 0xE0 followed by < 0xA0 would be overlong;
				 * 0xED followed by >= 0xA0 would encode U+D800-DFFF */
				if ((c2 & 0xC0) != 0x80 || (c == 0xE0 && c2 < 0xA0) || (c == 0xED && c2 >= 0xA0)) {
					*out++ = MBFL_BAD_INPUT;
					p -= 2;
				} else if ((c3 & 0xC0) != 0x80) {
					*out++ = MBFL_BAD_INPUT;
					p--;
				} else {
					uint32_t decoded = ((c & 0xF) << 12) | ((c2 & 0x3F) << 6) | (c3 & 0x3F);
					ZEND_ASSERT(decoded >= 0x800); /* Not an overlong code unit */
					ZEND_ASSERT(decoded < 0xD800 || decoded > 0xDFFF); /* U+D800-DFFF are reserved, illegal code points */
					*out++ = decoded;
				}
			} else {
				/* Truncated at end of input: emit one error marker and also
				 * consume any following byte(s) which validly continue this sequence */
				*out++ = MBFL_BAD_INPUT;
				if (p < e && (c != 0xE0 || *p >= 0xA0) && (c != 0xED || *p < 0xA0) && (*p & 0xC0) == 0x80) {
					p++;
					if (p < e && (*p & 0xC0) == 0x80) {
						p++;
					}
				}
			}
		} else if (c <= 0xF4) { /* 4 byte character */
			if ((e - p) >= 3) {
				unsigned char c2 = *p++;
				unsigned char c3 = *p++;
				unsigned char c4 = *p++;
				/* If c == 0xF0 and c2 < 0x90, then this is an over-long code unit; it could have
				 * fit in 3 bytes only. If c == 0xF4 and c2 >= 0x90, then this codepoint is
				 * greater than U+10FFFF, which is the highest legal codepoint */
				if ((c2 & 0xC0) != 0x80 || (c == 0xF0 && c2 < 0x90) || (c == 0xF4 && c2 >= 0x90)) {
					*out++ = MBFL_BAD_INPUT;
					p -= 3;
				} else if ((c3 & 0xC0) != 0x80) {
					*out++ = MBFL_BAD_INPUT;
					p -= 2;
				} else if ((c4 & 0xC0) != 0x80) {
					*out++ = MBFL_BAD_INPUT;
					p--;
				} else {
					uint32_t decoded = ((c & 0x7) << 18) | ((c2 & 0x3F) << 12) | ((c3 & 0x3F) << 6) | (c4 & 0x3F);
					ZEND_ASSERT(decoded >= 0x10000); /* Not an overlong code unit */
					*out++ = decoded;
				}
			} else {
				/* Truncated at end of input: emit one error marker; if the next byte
				 * is acceptable after this lead byte, consume the run of continuation bytes */
				*out++ = MBFL_BAD_INPUT;
				if (p < e) {
					unsigned char c2 = *p;
					if ((c == 0xF0 && c2 >= 0x90) || (c == 0xF4 && c2 < 0x90) || (c >= 0xF1 && c <= 0xF3)) {
						while (p < e && (*p & 0xC0) == 0x80) {
							p++;
						}
					}
				}
			}
		} else {
			/* 0xF5-0xFF never appear in valid UTF-8 */
			*out++ = MBFL_BAD_INPUT;
		}
	}

	*in_len = e - p;
	*in = p;
	return out - buf;
}
537 
/* Encode `len` code points from `in` as UTF-8 and append them to `buf`.
 *
 * Code points of 0x110000 and above cannot be encoded and are passed to
 * MB_CONVERT_ERROR. `end` is not used by this function.
 *
 * NOTE(review): the MB_CONVERT_BUF_* macros are defined elsewhere; the comments
 * below assume MB_CONVERT_BUF_ENSURE guarantees at least the requested number
 * of free bytes in the output buffer - confirm against their definition. */
static void mb_wchar_to_utf8(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	/* Request one byte per input code point up front */
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	while (len--) {
		/* From here on, `len` is the number of code points still to be processed
		 * after this one */
		uint32_t w = *in++;
		if (w < 0x80) {
			out = mb_convert_buf_add(out, w & 0xFF);
		} else if (w < 0x800) {
			/* Bytes for this character plus one byte for each remaining code point */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			out = mb_convert_buf_add2(out, ((w >> 6) & 0x1F) | 0xC0, (w & 0x3F) | 0x80);
		} else if (w < 0x10000) {
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 3);
			out = mb_convert_buf_add3(out, ((w >> 12) & 0xF) | 0xE0, ((w >> 6) & 0x3F) | 0x80, (w & 0x3F) | 0x80);
		} else if (w < 0x110000) {
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
			out = mb_convert_buf_add4(out, ((w >> 18) & 0x7) | 0xF0, ((w >> 12) & 0x3F) | 0x80, ((w >> 6) & 0x3F) | 0x80, (w & 0x3F) | 0x80);
		} else {
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_utf8);
			/* Re-request space for the remaining input after error handling */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
565 
mb_cut_utf8(unsigned char * str,size_t from,size_t len,unsigned char * end)566 static zend_string* mb_cut_utf8(unsigned char *str, size_t from, size_t len, unsigned char *end)
567 {
568 	unsigned char *start = str + from;
569 	/* Byte values less than -64 are UTF-8 continuation bytes, that is,
570 	 * the 2nd, 3rd, or 4th byte of a multi-byte character */
571 	while (start > str && ((signed char)*start) < -64) {
572 		start--;
573 	}
574 	unsigned char *_end = start + len;
575 	if (_end >= end) {
576 		return zend_string_init_fast((char*)start, end - start);
577 	}
578 	while (_end > start && ((signed char)*_end) < -64) {
579 		_end--;
580 	}
581 	return zend_string_init_fast((char*)start, _end - start);
582 }
583 
/* Byte-at-a-time decoder shared by all UTF-8-Mobile variants.
 *
 * UTF-8 decoding uses the low 8 bits of `filter->status` as the position
 * within a multi-byte sequence (0x00: between characters; 0x10: 2-byte;
 * 0x20/0x21: 3-byte; 0x30/0x31/0x32: 4-byte) and `filter->cache` as the
 * accumulator for payload bits.
 *
 * Once a character has been decoded, it is looked up in the emoji range table
 * belonging to the source encoding (`filter->from->no_encoding`). On a match it
 * is converted by the corresponding mbfilter_sjis_emoji_*2unicode function,
 * which may also provide an extra code point in `snd`; if so, `snd` is emitted
 * before the main code point.
 *
 * On invalid input, MBFL_BAD_INPUT is emitted, the state is reset, and the same
 * byte is re-examined as the possible start of a new character (`goto retry`).
 *
 * Returns 0, or -1 if the output function reports a negative result. */
static int mbfl_filt_conv_utf8_mobile_wchar(int c, mbfl_convert_filter *filter)
{
	int s, s1 = 0, c1 = 0, snd = 0;

retry:
	switch (filter->status & 0xff) {
	case 0x00:
		if (c < 0x80) {
			CK((*filter->output_function)(c, filter->data));
		} else if (c >= 0xc2 && c <= 0xdf) { /* 2byte code first char: 0xc2-0xdf */
			filter->status = 0x10;
			filter->cache = c & 0x1f;
		} else if (c >= 0xe0 && c <= 0xef) { /* 3byte code first char: 0xe0-0xef */
			filter->status = 0x20;
			filter->cache = c & 0xf;
		} else if (c >= 0xf0 && c <= 0xf4) { /* 4byte code first char: 0xf0-0xf4 */
			filter->status = 0x30;
			filter->cache = c & 0x7;
		} else {
			CK(mbfl_filt_put_invalid_char(filter));
		}
		break;

	case 0x10: /* 2byte code 2nd char: 0x80-0xbf */
	case 0x21: /* 3byte code 3rd char: 0x80-0xbf */
	case 0x32: /* 4byte code 4th char: 0x80-0xbf */
		filter->status = 0;
		if (c >= 0x80 && c <= 0xbf) {
			s = (filter->cache << 6) | (c & 0x3f);
			filter->cache = 0;

			/* Translate carrier-specific emoji values, using the table for the source encoding */
			if (filter->from->no_encoding == mbfl_no_encoding_utf8_docomo && mbfilter_conv_r_map_tbl(s, &s1, 4, mbfl_docomo2uni_pua)) {
				s = mbfilter_sjis_emoji_docomo2unicode(s1, &snd);
			} else if (filter->from->no_encoding == mbfl_no_encoding_utf8_kddi_a && mbfilter_conv_r_map_tbl(s, &s1, 7, mbfl_kddi2uni_pua)) {
				s = mbfilter_sjis_emoji_kddi2unicode(s1, &snd);
			} else if (filter->from->no_encoding == mbfl_no_encoding_utf8_kddi_b && mbfilter_conv_r_map_tbl(s, &s1, 8, mbfl_kddi2uni_pua_b)) {
				s = mbfilter_sjis_emoji_kddi2unicode(s1, &snd);
			} else if (filter->from->no_encoding == mbfl_no_encoding_utf8_sb && mbfilter_conv_r_map_tbl(s, &s1, 6, mbfl_sb2uni_pua)) {
				s = mbfilter_sjis_emoji_sb2unicode(s1, &snd);
			}

			/* Some emoji convert to a pair of code points; `snd` is output first */
			if (snd > 0) {
				CK((*filter->output_function)(snd, filter->data));
			}
			CK((*filter->output_function)(s, filter->data));
		} else {
			CK(mbfl_filt_put_invalid_char(filter));
			goto retry;
		}
		break;

	case 0x20: /* 3byte code 2nd char: 0:0xa0-0xbf,D:0x80-0x9f,1-C,E-F:0x80-0xbf */
		s = (filter->cache << 6) | (c & 0x3f);
		c1 = filter->cache & 0xf;

		if ((c >= 0x80 && c <= 0xbf) &&
			 ((c1 == 0x0 && c >= 0xa0) ||
			 (c1 == 0xd && c < 0xa0) ||
			 (c1 > 0x0 && c1 != 0xd))) {
			filter->cache = s;
			filter->status++;
		} else {
			CK(mbfl_filt_put_invalid_char(filter));
			goto retry;
		}
		break;

	case 0x30: /* 4byte code 2nd char: 0:0x90-0xbf,1-3:0x80-0xbf,4:0x80-0x8f */
		s = (filter->cache << 6) | (c & 0x3f);
		c1 = filter->cache & 0x7;

		if ((c >= 0x80 && c <= 0xbf) &&
			 ((c1 == 0x0 && c >= 0x90) ||
			 (c1 == 0x4 && c < 0x90) ||
			 (c1 > 0x0 && c1 != 0x4))) {
			filter->cache = s;
			filter->status++;
		} else {
			CK(mbfl_filt_put_invalid_char(filter));
			goto retry;
		}
		break;

	case 0x31: /* 4byte code 3rd char: 0x80-0xbf */
		if (c >= 0x80 && c <= 0xbf) {
			filter->cache = (filter->cache << 6) | (c & 0x3f);
			filter->status++;
		} else {
			CK(mbfl_filt_put_invalid_char(filter));
			goto retry;
		}
		break;

		EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
682 
/* Encode one code point `c` for a UTF-8-Mobile variant.
 *
 * The emoji converter for the target encoding (`filter->to->no_encoding`) is
 * tried first; if it yields a carrier code which the matching range table can
 * map, the mapped value is encoded instead of `c`. If `filter->status` is
 * non-zero after that step, nothing is output for this call. Otherwise the
 * value is written as UTF-8, one byte at a time, through the output function.
 *
 * Values outside 0..0x10FFFF are handed to mbfl_filt_conv_illegal_output.
 * Returns 0, or -1 as soon as any downstream call reports a negative result. */
static int mbfl_filt_conv_wchar_utf8_mobile(int c, mbfl_convert_filter *filter)
{
	int s1, c1;
	int bytes[4];
	int nbytes;

	if (c < 0 || c >= 0x110000) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
		return 0;
	}

	if ((filter->to->no_encoding == mbfl_no_encoding_utf8_docomo && mbfilter_unicode2sjis_emoji_docomo(c, &s1, filter) > 0 && mbfilter_conv_map_tbl(s1, &c1, 4, mbfl_docomo2uni_pua)) ||
		  (filter->to->no_encoding == mbfl_no_encoding_utf8_kddi_a && mbfilter_unicode2sjis_emoji_kddi_sjis(c, &s1, filter) > 0 && mbfilter_conv_map_tbl(s1, &c1, 7, mbfl_kddi2uni_pua)) ||
		  (filter->to->no_encoding == mbfl_no_encoding_utf8_kddi_b && mbfilter_unicode2sjis_emoji_kddi_sjis(c, &s1, filter) > 0 && mbfilter_conv_map_tbl(s1, &c1, 8, mbfl_kddi2uni_pua_b)) ||
		  (filter->to->no_encoding == mbfl_no_encoding_utf8_sb && mbfilter_unicode2sjis_emoji_sb(c, &s1, filter) > 0 && mbfilter_conv_map_tbl(s1, &c1, 6, mbfl_sb2uni_pua))) {
		c = c1;
	}

	/* Checked only after the emoji converters above have run */
	if (filter->status) {
		return 0;
	}

	if (c < 0x80) {
		bytes[0] = c;
		nbytes = 1;
	} else if (c < 0x800) {
		bytes[0] = ((c >> 6) & 0x1f) | 0xc0;
		bytes[1] = (c & 0x3f) | 0x80;
		nbytes = 2;
	} else if (c < 0x10000) {
		bytes[0] = ((c >> 12) & 0x0f) | 0xe0;
		bytes[1] = ((c >> 6) & 0x3f) | 0x80;
		bytes[2] = (c & 0x3f) | 0x80;
		nbytes = 3;
	} else {
		bytes[0] = ((c >> 18) & 0x07) | 0xf0;
		bytes[1] = ((c >> 12) & 0x3f) | 0x80;
		bytes[2] = ((c >> 6) & 0x3f) | 0x80;
		bytes[3] = (c & 0x3f) | 0x80;
		nbytes = 4;
	}

	for (int i = 0; i < nbytes; i++) {
		CK((*filter->output_function)(bytes[i], filter->data));
	}

	return 0;
}
720 
/* Regional Indicator codepoints run from U+1F1E6 to U+1F1FF and correspond
 * to the letters A-Z.
 * To display the flag emoji for a country, two such codepoints are combined,
 * spelling the two-letter code for that country.
 * This macro converts an uppercase ASCII letter to its Regional Indicator
 * codepoint (0x1F1A5 + 'A' == 0x1F1E6). */
#define NFLAGS(c) (0x1F1A5+(int)(c))

/* Two-letter country codes of the ten supported flags. Each entry holds exactly
 * two characters, with no NUL terminator. */
static const char nflags_s[10][2] = {"CN","DE","ES","FR","GB","IT","JP","KR","RU","US"};
/* Ten-entry tables, one per carrier; presumably entry i is the carrier emoji code
 * for the flag nflags_s[i] (their use is not within this part of the file - confirm) */
static const int nflags_code_kddi[10] = { 0x2549, 0x2546, 0x24C0, 0x2545, 0x2548, 0x2547, 0x2750, 0x254A, 0x24C1, 0x27F7 };
static const int nflags_code_sb[10] = { 0x2B0A, 0x2B05, 0x2B08, 0x2B04, 0x2B07, 0x2B06, 0x2B02, 0x2B0B, 0x2B09, 0x2B03 };
731 
/* Shared buffer decoder for the UTF-8-Mobile variants.
 *
 * Decodes UTF-8 from `*in` (at most `*in_len` bytes) into `buf`. Each decoded
 * value is looked up in `emoji_map` (which has `n` rows); on a match it is
 * translated by `convert_emoji`, which may additionally provide a second code
 * point in `snd`. When `snd` is non-zero it is written before the main code
 * point, so one input character can produce two output values.
 *
 * On return, `*in` and `*in_len` describe the input which has not been consumed
 * yet. Returns the number of values written to `buf`. `state` is not used.
 * Each invalid or truncated sequence produces one MBFL_BAD_INPUT.
 *
 * NOTE(review): `limit` is computed as `buf + bufsize - 1`, which presumes
 * `bufsize` is at least 1 - confirm against callers. */
static size_t mb_mobile_utf8_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state, const unsigned short emoji_map[][3], int (*convert_emoji)(int s, int *snd), int n)
{
	unsigned char *p = *in, *e = p + *in_len;
	/* Keep one slot in reserve, since a single character may yield two output values */
	uint32_t *out = buf, *limit = buf + bufsize - 1;

	while (p < e && out < limit) {
		unsigned char c = *p++;
		unsigned int s = 0;

		if (c <= 0x7F) {
			*out++ = c;
			continue;
		} else if (c >= 0xC2 && c <= 0xDF && p < e) {
			/* 2-byte character; a lead byte at the very end of input falls through
			 * to the final `else` and is reported as bad input there */
			unsigned char c2 = *p++;

			if ((c2 & 0xC0) == 0x80) {
				s = ((c & 0x1F) << 6) | (c2 & 0x3F);
			} else {
				*out++ = MBFL_BAD_INPUT;
				p--;
				continue;
			}
		} else if (c >= 0xE0 && c <= 0xEF) {
			if ((e - p) < 2) {
				/* Truncated 3-byte character: emit one error marker and also consume
				 * any following byte(s) which validly continue this sequence */
				*out++ = MBFL_BAD_INPUT;
				if (p < e && (c != 0xE0 || *p >= 0xA0) && (c != 0xED || *p < 0xA0) && (*p & 0xC0) == 0x80) {
					p++;
					if (p < e && (*p & 0xC0) == 0x80) {
						p++;
					}
				}
				continue;
			}
			unsigned char c2 = *p++;
			unsigned char c3 = *p++;

			/* 0xE0 followed by < 0xA0 would be overlong;
			 * 0xED followed by >= 0xA0 would encode U+D800-DFFF */
			if ((c2 & 0xC0) != 0x80 || (c == 0xE0 && c2 < 0xA0) || (c == 0xED && c2 >= 0xA0)) {
				*out++ = MBFL_BAD_INPUT;
				p -= 2;
				continue;
			} else if ((c3 & 0xC0) != 0x80) {
				*out++ = MBFL_BAD_INPUT;
				p--;
				continue;
			} else {
				s = ((c & 0xF) << 12) | ((c2 & 0x3F) << 6) | (c3 & 0x3F);
			}
		} else if (c >= 0xF0 && c <= 0xF4) {
			if ((e - p) < 3) {
				/* Truncated 4-byte character: emit one error marker; if the next byte
				 * is acceptable after this lead byte, consume the run of continuation bytes */
				*out++ = MBFL_BAD_INPUT;
				if (p < e) {
					unsigned char c2 = *p;
					if ((c == 0xF0 && c2 >= 0x90) || (c == 0xF4 && c2 < 0x90) || (c >= 0xF1 && c <= 0xF3)) {
						while (p < e && (*p & 0xC0) == 0x80) {
							p++;
						}
					}
				}
				continue;
			}
			unsigned char c2 = *p++;
			unsigned char c3 = *p++;
			unsigned char c4 = *p++;

			/* 0xF0 followed by < 0x90 would be overlong;
			 * 0xF4 followed by >= 0x90 would exceed U+10FFFF */
			if ((c2 & 0xC0) != 0x80 || (c == 0xF0 && c2 < 0x90) || (c == 0xF4 && c2 >= 0x90)) {
				*out++ = MBFL_BAD_INPUT;
				p -= 3;
				continue;
			} else if ((c3 & 0xC0) != 0x80) {
				*out++ = MBFL_BAD_INPUT;
				p -= 2;
				continue;
			} else if ((c4 & 0xC0) != 0x80) {
				*out++ = MBFL_BAD_INPUT;
				p--;
				continue;
			} else {
				s = ((c & 0x7) << 18) | ((c2 & 0x3F) << 12) | ((c3 & 0x3F) << 6) | (c4 & 0x3F);
			}
		} else {
			*out++ = MBFL_BAD_INPUT;
			continue;
		}

		/* Only successfully decoded multi-byte characters reach this point */
		int s1 = 0, snd = 0;
		if (mbfilter_conv_r_map_tbl(s, &s1, n, emoji_map)) {
			s = convert_emoji(s1, &snd);
			if (snd) {
				*out++ = snd;
			}
		}
		*out++ = s;
	}

	*in_len = e - p;
	*in = p;
	return out - buf;
}
830 
/* UTF-8 (DoCoMo) -> wchar: the shared mobile decoder, supplied with the 4-row
 * DoCoMo range table and the DoCoMo emoji converter */
static size_t mb_utf8_docomo_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
{
	return mb_mobile_utf8_to_wchar(in, in_len, buf, bufsize, state, mbfl_docomo2uni_pua, mbfilter_sjis_emoji_docomo2unicode, 4);
}
835 
/* Encode `len` code points from `in` as UTF-8 (DoCoMo) and append them to `buf`.
 *
 * Code points recognized as DoCoMo emoji are first converted to a carrier code
 * `s`; if `s` can be mapped through mbfl_docomo2uni_pua, the mapped value is
 * encoded in place of the original code point. Everything else below 0x110000
 * is encoded as ordinary UTF-8. Code points of 0x110000 and above are passed to
 * MB_CONVERT_ERROR. `end` is not used by this function.
 *
 * NOTE(review): the MB_CONVERT_BUF_* macros are defined elsewhere; it is assumed
 * here that MB_CONVERT_BUF_ENSURE guarantees at least the requested number of
 * free bytes in the output buffer - confirm against their definition. */
static void mb_wchar_to_utf8_docomo(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	while (len--) {
		uint32_t w = *in++;
		unsigned int s = 0;
		int c1 = 0;

		if (w < 0x110000) {
			if ((w == '#' || (w >= '0' && w <= '9')) && len) {
				/* Peek at the following code point: '#' or a digit followed by U+20E3
				 * forms a two-codepoint sequence which maps to a single emoji code */
				uint32_t w2 = *in++; len--;

				if (w2 == 0x20E3) {
					if (w == '#') {
						s = 0x2964;
					} else if (w == '0') {
						s = 0x296F;
					} else {
						s = 0x2966 + (w - '1');
					}
				} else {
					/* Not such a sequence; un-read the following code point */
					in--; len++;
				}
			} else if (w == 0xA9) { /* Copyright sign */
				s = 0x29B5;
			} else if (w == 0xAE) { /* Registered sign */
				s = 0x29BA;
			} else if (w >= mb_tbl_uni_docomo2code2_min && w <= mb_tbl_uni_docomo2code2_max) {
				int i = mbfl_bisec_srch2(w, mb_tbl_uni_docomo2code2_key, mb_tbl_uni_docomo2code2_len);
				if (i >= 0) {
					s = mb_tbl_uni_docomo2code2_value[i];
				}
			} else if (w >= mb_tbl_uni_docomo2code3_min && w <= mb_tbl_uni_docomo2code3_max) {
				int i = mbfl_bisec_srch2(w - 0x10000, mb_tbl_uni_docomo2code3_key, mb_tbl_uni_docomo2code3_len);
				if (i >= 0) {
					s = mb_tbl_uni_docomo2code3_value[i];
				}
			} else if (w >= mb_tbl_uni_docomo2code5_min && w <= mb_tbl_uni_docomo2code5_max) {
				int i = mbfl_bisec_srch2(w - 0xF0000, mb_tbl_uni_docomo2code5_key, mb_tbl_uni_docomo2code5_len);
				if (i >= 0) {
					s = mb_tbl_uni_docomo2code5_val[i];
				}
			}

			/* If an emoji code was found and the range table covers it, encode the
			 * mapped value instead of the original code point */
			if (s && mbfilter_conv_map_tbl(s, &c1, 4, mbfl_docomo2uni_pua)) {
				w = c1;
			}

			if (w <= 0x7F) {
				out = mb_convert_buf_add(out, w);
			} else if (w <= 0x7FF) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
				out = mb_convert_buf_add2(out, ((w >> 6) & 0x1F) | 0xC0, (w & 0x3F) | 0x80);
			} else if (w <= 0xFFFF) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 3);
				out = mb_convert_buf_add3(out, ((w >> 12) & 0xF) | 0xE0, ((w >> 6) & 0x3F) | 0x80, (w & 0x3F) | 0x80);
			} else {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
				out = mb_convert_buf_add4(out, ((w >> 18) & 0x7) | 0xF0, ((w >> 12) & 0x3F) | 0x80, ((w >> 6) & 0x3F) | 0x80, (w & 0x3F) | 0x80);
			}
		} else {
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_utf8_docomo);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
907 
mb_utf8_kddi_a_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)908 static size_t mb_utf8_kddi_a_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
909 {
910 	return mb_mobile_utf8_to_wchar(in, in_len, buf, bufsize, state, mbfl_kddi2uni_pua, mbfilter_sjis_emoji_kddi2unicode, 7);
911 }
912 
/*
 * Convert `len` wchars starting at `in` to UTF-8 bytes appended to `buf`,
 * substituting KDDI emoji along the way. Shared by the "a" and "b" variants,
 * which differ only in the `emoji_map`, `n` and `error_handler` they pass.
 *
 * For each input value a kuten code is looked up from, in order:
 *   - a '#' or ASCII digit followed by U+20E3 (combining enclosing keycap),
 *   - a pair of values in the NFLAGS() range matching a row of nflags_s,
 *   - U+00A9 / U+00AE,
 *   - the mb_tbl_uni_kddi2code2 / code3 / code5 tables.
 * If a code was found and mbfilter_conv_map_tbl() translates it through
 * `emoji_map`, the translated value is encoded instead of the input value.
 *
 * Values >= 0x110000, and values in the NFLAGS() range that do not start one
 * of the pairs listed in nflags_s, are handed to MB_CONVERT_ERROR together
 * with `error_handler`.
 *
 * `n` is forwarded as the third argument of mbfilter_conv_map_tbl()
 * (presumably the row count of `emoji_map` -- confirm at the definition).
 *
 * NOTE(review): `end` is never read here, and the look-ahead for two-wchar
 * sequences is only attempted while `len` is non-zero, so a sequence split
 * across two calls is not combined. Confirm whether that is intended.
 */
static void mb_wchar_to_utf8_kddi(uint32_t *in, size_t len, mb_convert_buf *buf, bool end, const unsigned short emoji_map[][3], int n, mb_from_wchar_fn error_handler)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	while (len--) {
		uint32_t cp = *in++;
		unsigned int kuten = 0;
		int mapped = 0;

		if (cp >= 0x110000) {
			/* Outside the Unicode code space */
			MB_CONVERT_ERROR(buf, out, limit, cp, error_handler);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
			continue;
		}

		if (len && (cp == '#' || (cp >= '0' && cp <= '9'))) {
			/* Peek at the following wchar; it is consumed only when it is
			 * the enclosing keycap mark */
			if (*in == 0x20E3) {
				in++;
				len--;
				kuten = (cp == '#') ? 0x25BC : (cp == '0') ? 0x2830 : 0x27A6 + (cp - '1');
			}
		} else if (cp >= NFLAGS('C') && cp <= NFLAGS('U')) { /* C for CN, U for US */
			bool is_flag = false;

			if (len) {
				uint32_t next = *in;

				if (next >= NFLAGS('B') && next <= NFLAGS('U')) { /* B for GB, U for RU */
					for (int idx = 0; idx < 10; idx++) {
						if (cp == NFLAGS(nflags_s[idx][0]) && next == NFLAGS(nflags_s[idx][1])) {
							/* Both halves of the pair are consumed */
							kuten = nflags_code_kddi[idx];
							in++;
							len--;
							is_flag = true;
							break;
						}
					}
				}
			}

			if (!is_flag) {
				/* Only the first value is reported; the following one (if
				 * any) is left in the input for the next iteration */
				MB_CONVERT_ERROR(buf, out, limit, cp, error_handler);
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
				continue;
			}
		} else if (cp == 0xA9) { /* Copyright sign */
			kuten = 0x27DC;
		} else if (cp == 0xAE) { /* Registered sign */
			kuten = 0x27DD;
		} else if (cp >= mb_tbl_uni_kddi2code2_min && cp <= mb_tbl_uni_kddi2code2_max) {
			int pos = mbfl_bisec_srch2(cp, mb_tbl_uni_kddi2code2_key, mb_tbl_uni_kddi2code2_len);
			if (pos >= 0) {
				kuten = mb_tbl_uni_kddi2code2_value[pos];
			}
		} else if (cp >= mb_tbl_uni_kddi2code3_min && cp <= mb_tbl_uni_kddi2code3_max) {
			/* This table is searched with the value minus 0x10000 */
			int pos = mbfl_bisec_srch2(cp - 0x10000, mb_tbl_uni_kddi2code3_key, mb_tbl_uni_kddi2code3_len);
			if (pos >= 0) {
				kuten = mb_tbl_uni_kddi2code3_value[pos];
			}
		} else if (cp >= mb_tbl_uni_kddi2code5_min && cp <= mb_tbl_uni_kddi2code5_max) {
			/* This table is searched with the value minus 0xF0000 */
			int pos = mbfl_bisec_srch2(cp - 0xF0000, mb_tbl_uni_kddi2code5_key, mb_tbl_uni_kddi2code5_len);
			if (pos >= 0) {
				kuten = mb_tbl_uni_kddi2code5_val[pos];
			}
		}

		/* Replace the value with its mapped form when the map has one */
		if (kuten && mbfilter_conv_map_tbl(kuten, &mapped, n, emoji_map)) {
			cp = mapped;
		}

		/* Emit as UTF-8, longest form first */
		if (cp > 0xFFFF) {
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
			out = mb_convert_buf_add4(out, ((cp >> 18) & 0x7) | 0xF0, ((cp >> 12) & 0x3F) | 0x80, ((cp >> 6) & 0x3F) | 0x80, (cp & 0x3F) | 0x80);
		} else if (cp > 0x7FF) {
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 3);
			out = mb_convert_buf_add3(out, ((cp >> 12) & 0xF) | 0xE0, ((cp >> 6) & 0x3F) | 0x80, (cp & 0x3F) | 0x80);
		} else if (cp > 0x7F) {
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			out = mb_convert_buf_add2(out, ((cp >> 6) & 0x1F) | 0xC0, (cp & 0x3F) | 0x80);
		} else {
			/* No explicit ensure for a single byte; presumably covered by
			 * the `len`-sized reservations made earlier */
			out = mb_convert_buf_add(out, cp);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
1004 
/*
 * wchar to UTF-8 (KDDI, variant "a") conversion.
 *
 * Delegates to mb_wchar_to_utf8_kddi() with the mbfl_kddi2uni_pua map and the
 * count 7, and passes this function itself as the error handler.
 */
static void mb_wchar_to_utf8_kddi_a(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	mb_wchar_to_utf8_kddi(
		in, len, buf, end,
		mbfl_kddi2uni_pua, 7,
		mb_wchar_to_utf8_kddi_a);
}
1009 
mb_utf8_kddi_b_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)1010 static size_t mb_utf8_kddi_b_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
1011 {
1012 	return mb_mobile_utf8_to_wchar(in, in_len, buf, bufsize, state, mbfl_kddi2uni_pua_b, mbfilter_sjis_emoji_kddi2unicode, 8);
1013 }
1014 
/*
 * wchar to UTF-8 (KDDI, variant "b") conversion.
 *
 * Delegates to mb_wchar_to_utf8_kddi() with the mbfl_kddi2uni_pua_b map and
 * the count 8, and passes this function itself as the error handler.
 */
static void mb_wchar_to_utf8_kddi_b(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	mb_wchar_to_utf8_kddi(
		in, len, buf, end,
		mbfl_kddi2uni_pua_b, 8,
		mb_wchar_to_utf8_kddi_b);
}
1019 
mb_utf8_sb_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)1020 static size_t mb_utf8_sb_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
1021 {
1022 	return mb_mobile_utf8_to_wchar(in, in_len, buf, bufsize, state, mbfl_sb2uni_pua, mbfilter_sjis_emoji_sb2unicode, 6);
1023 }
1024 
/*
 * Convert `len` wchars starting at `in` to UTF-8 bytes appended to `buf`,
 * substituting SoftBank emoji along the way.
 *
 * For each input value a kuten code is looked up from, in order:
 *   - a '#' or ASCII digit followed by U+20E3 (combining enclosing keycap),
 *   - a pair of values in the NFLAGS() range matching a row of nflags_s,
 *   - U+00A9 / U+00AE,
 *   - the mb_tbl_uni_sb2code2 / code3 / code5 tables.
 * If a code was found and mbfilter_conv_map_tbl() translates it through
 * mbfl_sb2uni_pua, the translated value is encoded instead of the input
 * value.
 *
 * Values >= 0x110000, and values in the NFLAGS() range that do not start one
 * of the pairs listed in nflags_s, are handed to MB_CONVERT_ERROR with this
 * function as the handler.
 *
 * NOTE(review): `end` is never read here, and the look-ahead for two-wchar
 * sequences is only attempted while `len` is non-zero, so a sequence split
 * across two calls is not combined. Confirm whether that is intended.
 */
static void mb_wchar_to_utf8_sb(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	while (len--) {
		uint32_t cp = *in++;
		unsigned int kuten = 0;
		int mapped = 0;

		if (cp >= 0x110000) {
			/* Outside the Unicode code space */
			MB_CONVERT_ERROR(buf, out, limit, cp, mb_wchar_to_utf8_sb);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
			continue;
		}

		if (len && (cp == '#' || (cp >= '0' && cp <= '9'))) {
			/* Peek at the following wchar; it is consumed only when it is
			 * the enclosing keycap mark */
			if (*in == 0x20E3) {
				in++;
				len--;
				kuten = (cp == '#') ? 0x2817 : (cp == '0') ? 0x282C : 0x2823 + (cp - '1');
			}
		} else if (cp >= NFLAGS('C') && cp <= NFLAGS('U')) { /* C for CN, U for US */
			bool is_flag = false;

			if (len) {
				uint32_t next = *in;

				if (next >= NFLAGS('B') && next <= NFLAGS('U')) { /* B for GB, U for RU */
					for (int idx = 0; idx < 10; idx++) {
						if (cp == NFLAGS(nflags_s[idx][0]) && next == NFLAGS(nflags_s[idx][1])) {
							/* Both halves of the pair are consumed */
							kuten = nflags_code_sb[idx];
							in++;
							len--;
							is_flag = true;
							break;
						}
					}
				}
			}

			if (!is_flag) {
				/* Only the first value is reported; the following one (if
				 * any) is left in the input for the next iteration */
				MB_CONVERT_ERROR(buf, out, limit, cp, mb_wchar_to_utf8_sb);
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
				continue;
			}
		} else if (cp == 0xA9) { /* Copyright sign */
			kuten = 0x2855;
		} else if (cp == 0xAE) { /* Registered sign */
			kuten = 0x2856;
		} else if (cp >= mb_tbl_uni_sb2code2_min && cp <= mb_tbl_uni_sb2code2_max) {
			int pos = mbfl_bisec_srch2(cp, mb_tbl_uni_sb2code2_key, mb_tbl_uni_sb2code2_len);
			if (pos >= 0) {
				kuten = mb_tbl_uni_sb2code2_value[pos];
			}
		} else if (cp >= mb_tbl_uni_sb2code3_min && cp <= mb_tbl_uni_sb2code3_max) {
			/* This table is searched with the value minus 0x10000 */
			int pos = mbfl_bisec_srch2(cp - 0x10000, mb_tbl_uni_sb2code3_key, mb_tbl_uni_sb2code3_len);
			if (pos >= 0) {
				kuten = mb_tbl_uni_sb2code3_value[pos];
			}
		} else if (cp >= mb_tbl_uni_sb2code5_min && cp <= mb_tbl_uni_sb2code5_max) {
			/* This table is searched with the value minus 0xF0000 */
			int pos = mbfl_bisec_srch2(cp - 0xF0000, mb_tbl_uni_sb2code5_key, mb_tbl_uni_sb2code5_len);
			if (pos >= 0) {
				kuten = mb_tbl_uni_sb2code5_val[pos];
			}
		}

		/* Replace the value with its mapped form when the map has one */
		if (kuten && mbfilter_conv_map_tbl(kuten, &mapped, 6, mbfl_sb2uni_pua)) {
			cp = mapped;
		}

		/* Emit as UTF-8, longest form first */
		if (cp > 0xFFFF) {
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
			out = mb_convert_buf_add4(out, ((cp >> 18) & 0x7) | 0xF0, ((cp >> 12) & 0x3F) | 0x80, ((cp >> 6) & 0x3F) | 0x80, (cp & 0x3F) | 0x80);
		} else if (cp > 0x7FF) {
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 3);
			out = mb_convert_buf_add3(out, ((cp >> 12) & 0xF) | 0xE0, ((cp >> 6) & 0x3F) | 0x80, (cp & 0x3F) | 0x80);
		} else if (cp > 0x7F) {
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			out = mb_convert_buf_add2(out, ((cp >> 6) & 0x1F) | 0xC0, (cp & 0x3F) | 0x80);
		} else {
			/* No explicit ensure for a single byte; presumably covered by
			 * the `len`-sized reservations made earlier */
			out = mb_convert_buf_add(out, cp);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
1116