1 #include "mbfilter_cjk.h"
2 
3 #include "unicode_table_jis.h"
4 #include "unicode_table_jis2004.h"
5 #include "unicode_table_big5.h"
6 #include "unicode_table_cns11643.h"
7 #include "unicode_table_cp932_ext.h"
8 #include "unicode_table_cp936.h"
9 #include "unicode_table_gb18030.h"
10 #include "unicode_table_gb2312.h"
11 #include "unicode_table_uhc.h"
12 #include "cp932_table.h"
13 #include "sjis_mac2uni.h"
14 #include "translit_kana_jisx0201_jisx0208.h"
15 #include "emoji2uni.h"
16 
/* Flag emoji are written in Unicode as two Regional Indicator Symbols, which
 * occupy U+1F1E6..U+1F1FF and stand for the letters A-Z. The pair spells the
 * two-letter code of the country whose flag is shown.
 * NFLAGS converts an uppercase ASCII letter to its Regional Indicator
 * codepoint ('A' is 0x41, and 0x1F1A5 + 0x41 == 0x1F1E6). */
#define NFLAGS(c) (0x1F1A5 + ((unsigned int)(c)))

/* Two-letter country codes; each entry is exactly 2 chars, with no NUL */
static const char nflags_s[10][2] = {
	"CN", "DE", "ES", "FR", "GB",
	"IT", "JP", "KR", "RU", "US"
};
/* KDDI emoji code of the flag for nflags_s[i] */
static const int nflags_code_kddi[10] = {
	0x2549, 0x2546, 0x24C0, 0x2545, 0x2548,
	0x2547, 0x2750, 0x254A, 0x24C1, 0x27F7
};
/* Presumably the SoftBank emoji code of the flag for nflags_s[i];
 * TODO confirm against the code which reads this table */
static const int nflags_code_sb[10] = {
	0x2B0A, 0x2B05, 0x2B08, 0x2B04, 0x2B07,
	0x2B06, 0x2B02, 0x2B0B, 0x2B09, 0x2B03
};

/* The EMIT_* macros return from the enclosing function, which must have an
 * `int *snd` in scope: the first codepoint of the pair is stored in *snd and
 * the second codepoint is the return value. */

/* Keypad emoji: character `c` followed by U+20E3 */
#define EMIT_KEYPAD_EMOJI(c) \
	do { \
		*snd = (c); \
		return 0x20E3; \
	} while(0)

/* Flag emoji: the Regional Indicators for both letters of `country` */
#define EMIT_FLAG_EMOJI(country) \
	do { \
		*snd = NFLAGS((country)[0]); \
		return NFLAGS((country)[1]); \
	} while(0)

/* Country codes for KDDI emoji codes 0x2545..0x254A, indexed by (code - 0x2545) */
static const char nflags_kddi[6][2] = {
	"FR", "DE", "IT", "GB", "CN", "KR"
};
/* Country codes for SoftBank flag emoji; presumably indexed by (code - 0x2B02),
 * TODO confirm against the code which reads this table */
static const char nflags_sb[10][2] = {
	"JP", "US", "FR", "DE", "IT",
	"GB", "ES", "RU", "CN", "KR"
};

/* number -> (ku*94)+ten value for telephone keypad character */
#define DOCOMO_KEYPAD(n) ((n) == 0 ? 0x296F : (0x2965 + (n)))
/* (ku*94)+ten value for the '#' keypad character */
#define DOCOMO_KEYPAD_HASH 0x2964
37 
/* Binary search for the range which contains `w`.
 *
 * `tbl` holds `n` inclusive ranges, each stored as a (low, high) pair of
 * unsigned shorts; the search assumes they are sorted in ascending order.
 * Returns the index of the matching range (not of the array element), or -1
 * if no range contains `w`. */
static int mbfl_bisec_srch(int w, const unsigned short *tbl, int n)
{
	int first = 0, last = n - 1;

	while (first <= last) {
		int mid = (first + last) >> 1;
		const unsigned short *range = &tbl[2 * mid];
		unsigned short range_lo = range[0], range_hi = range[1];

		if (w < range_lo) {
			/* `w` can only be in a range to the left */
			last = mid - 1;
			continue;
		}
		if (w > range_hi) {
			/* `w` can only be in a range to the right */
			first = mid + 1;
			continue;
		}
		return mid;
	}

	return -1;
}
55 
/* Binary search for the exact value `w`.
 *
 * `tbl` holds `n` single values (not ranges); the search assumes they are
 * sorted in ascending order. Returns the index of `w`, or -1 if absent. */
int mbfl_bisec_srch2(int w, const unsigned short tbl[], int n)
{
	int first = 0, last = n - 1;

	while (first <= last) {
		int mid = (first + last) >> 1;
		unsigned short candidate = tbl[mid];

		if (w < candidate) {
			last = mid - 1;
			continue;
		}
		if (w > candidate) {
			first = mid + 1;
			continue;
		}
		return mid;
	}

	return -1;
}
73 
/* Binary search in a table of (key, value) pairs.
 *
 * `tbl` holds `n` pairs; the search assumes they are sorted by key (element 0)
 * in ascending order. Returns a pointer to the value (element 1) of the pair
 * whose key equals `w`, or NULL if there is no such pair. */
static const unsigned short *mbfl_binary_search_paired_sorted_table(uint32_t w, const unsigned short tbl[][2], int n)
{
	/* Search the half-open interval [lower, upper) */
	int lower = 0;
	int upper = n;

	while (lower < upper) {
		int mid = (lower + upper) >> 1;
		const unsigned short *pair = tbl[mid];

		if (w < pair[0]) {
			upper = mid;
			continue;
		}
		if (w > pair[0]) {
			lower = mid + 1;
			continue;
		}
		return &pair[1];
	}

	return NULL;
}
90 
/* Convert a two-byte JIS code (c1 = first byte, c2 = second byte) to the
 * corresponding Shift-JIS byte pair, stored in s1 and s2.
 *
 * Two JIS rows share one Shift-JIS lead byte: odd rows land in the lower half
 * of the trail byte range (skipping 0x7F), even rows in the upper half.
 *
 * s1 and s2 must be lvalues. Every argument is parenthesized so that
 * expressions can be passed safely, but c1 and c2 are evaluated more than
 * once, so they must not have side effects. */
#define SJIS_ENCODE(c1,c2,s1,s2) \
	do { \
		(s1) = (((c1) - 1) >> 1) + ((c1) < 0x5F ? 0x71 : 0xB1); \
		(s2) = (c2); \
		if ((c1) & 1) { \
			if ((c2) < 0x60) { \
				(s2)--; \
			} \
			(s2) += 0x20; \
		} else { \
			(s2) += 0x7e; \
		} \
	} while (0)
104 
/* Convert a Shift-JIS byte pair (c1 = lead byte, c2 = trail byte) to the
 * corresponding two-byte JIS code, stored in s1 (first byte) and s2 (second
 * byte). This is the inverse of SJIS_ENCODE.
 *
 * Each lead byte covers two JIS rows: trail bytes below 0x9F belong to the
 * odd row (with 0x7F skipped), the rest to the following even row.
 *
 * s1 and s2 must be lvalues. Every argument is parenthesized so that
 * expressions can be passed safely, but c1 and c2 are evaluated more than
 * once, so they must not have side effects. */
#define SJIS_DECODE(c1,c2,s1,s2) \
	do { \
		if ((c1) < 0xa0) { \
			(s1) = (((c1) - 0x81) << 1) + 0x21; \
		} else { \
			(s1) = (((c1) - 0xc1) << 1) + 0x21; \
		} \
		(s2) = (c2); \
		if ((c2) < 0x9f) { \
			if ((c2) < 0x7f) { \
				(s2)++; \
			} \
			(s2) -= 0x20; \
		} else { \
			(s1)++; \
			(s2) -= 0x7e; \
		} \
	} while (0)
123 
/* Evaluate `statement` once; if its value is negative, return -1 from the
 * enclosing function. Used to propagate failure of output functions. */
#define CK(statement) \
	do { \
		if ((statement) < 0) { \
			return (-1); \
		} \
	} while (0)
125 
126 /*
127  * ISO-2022 variants
128  */
129 
/* Character set selectors compared against `filter->status` by the legacy
 * byte-at-a-time decoders; they leave the low 4 bits free, which those
 * decoders use to track progress through a multi-byte sequence */
#define ASCII          0    /* plain ASCII */
#define JISX0201_KANA  0x20 /* JIS X 0201 kana */
#define JISX0208_KANJI 0x80 /* JIS X 0208 double-byte characters */
133 
/* Legacy byte-at-a-time decoder for JIS (ISO-2022 style) input.
 *
 * `c` is the next input byte. Decoded codepoints, or MBFL_BAD_INPUT for
 * invalid input, are handed to filter->output_function. Returns 0, or -1 as
 * soon as the output function reports failure (see CK).
 *
 * filter->status combines two pieces of state:
 *   bits above the low nibble - the selected character set:
 *     0x00 ASCII, 0x10 JIS X 0201 Latin, 0x20 JIS X 0201 kana,
 *     0x80 JIS X 0208, 0x90 JIS X 0212
 *   low nibble - position inside a multi-byte unit:
 *     0 idle, 1 expecting 2nd byte of a double-byte character,
 *     2 after ESC, 3 after ESC $, 4 after ESC $ (, 5 after ESC (
 */
static int mbfl_filt_conv_jis_wchar(int c, mbfl_convert_filter *filter)
{
	int lead, idx, wc;

retry:
	switch (filter->status & 0xf) {
	case 0: /* not inside an escape sequence or double-byte character */
		if (c == 0x1b) {
			/* ESC opens an escape sequence */
			filter->status += 2;
		} else if (c == 0x0e) {
			/* "kana in" */
			filter->status = 0x20;
		} else if (c == 0x0f) {
			/* "kana out" */
			filter->status = 0;
		} else if (filter->status == 0x10 && (c == 0x5c || c == 0x7e)) {
			/* JIS X 0201 Latin: 0x5C is YEN SIGN and 0x7E is OVER LINE */
			CK(filter->output_function(c == 0x5c ? 0xa5 : 0x203e, filter->data));
		} else if (filter->status == 0x20 && c >= 0x21 && c <= 0x5f) {
			/* 7-bit kana */
			CK(filter->output_function(0xff40 + c, filter->data));
		} else if ((filter->status == 0x80 || filter->status == 0x90) && c >= 0x21 && c <= 0x7e) {
			/* first byte of a double-byte character; wait for the second */
			filter->cache = c;
			filter->status += 1;
		} else if (c >= 0 && c < 0x80) {
			/* Latin letters and control characters pass through */
			CK(filter->output_function(c, filter->data));
		} else if (c >= 0xa1 && c <= 0xdf) {
			/* GR-invoked (8-bit) kana */
			CK(filter->output_function(0xfec0 + c, filter->data));
		} else {
			CK(filter->output_function(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 1: /* second byte of a JIS X 0208 or JIS X 0212 character */
		filter->status &= ~0xf;
		lead = filter->cache;
		if (c <= 0x20 || c >= 0x7f) {
			CK(filter->output_function(MBFL_BAD_INPUT, filter->data));
			break;
		}

		/* table index is (row * 94) + column, both counted from zero */
		idx = (lead - 0x21)*94 + c - 0x21;
		wc = 0;
		if (filter->status == 0x80) {
			if (idx >= 0 && idx < jisx0208_ucs_table_size) {
				wc = jisx0208_ucs_table[idx];
			}
		} else {
			if (idx >= 0 && idx < jisx0212_ucs_table_size) {
				wc = jisx0212_ucs_table[idx];
			}
		}
		if (wc <= 0) {
			/* outside the table, or an unassigned position */
			wc = MBFL_BAD_INPUT;
		}
		CK(filter->output_function(wc, filter->data));
		break;

	case 2: /* seen: ESC */
		if (c == 0x24) {
			/* '$' */
			filter->status++;
		} else if (c == 0x28) {
			/* '(' */
			filter->status += 3;
		} else {
			/* Not an escape sequence; report the ESC as bad input, then
			 * process this byte again as ordinary input */
			filter->status &= ~0xf;
			CK(filter->output_function(MBFL_BAD_INPUT, filter->data));
			goto retry;
		}
		break;

	case 3: /* seen: ESC $ */
		if (c == 0x40 || c == 0x42) {
			/* '@' or 'B' selects JIS X 0208 */
			filter->status = 0x80;
		} else if (c == 0x28) {
			/* '(' */
			filter->status++;
		} else {
			/* Bad sequence; the ESC becomes an error marker, the '$' is
			 * emitted literally and this byte is processed again */
			filter->status &= ~0xf;
			CK(filter->output_function(MBFL_BAD_INPUT, filter->data));
			CK(filter->output_function(0x24, filter->data));
			goto retry;
		}
		break;

	case 4: /* seen: ESC $ ( */
		if (c == 0x40 || c == 0x42) {
			/* '@' or 'B' selects JIS X 0208 */
			filter->status = 0x80;
		} else if (c == 0x44) {
			/* 'D' selects JIS X 0212 */
			filter->status = 0x90;
		} else {
			filter->status &= ~0xf;
			CK(filter->output_function(MBFL_BAD_INPUT, filter->data));
			CK(filter->output_function(0x24, filter->data));
			CK(filter->output_function(0x28, filter->data));
			goto retry;
		}
		break;

	case 5: /* seen: ESC ( */
		if (c == 0x42 || c == 0x48) {
			/* 'B' or 'H' selects ASCII */
			filter->status = 0;
		} else if (c == 0x4a) {
			/* 'J' selects JIS X 0201 Latin */
			filter->status = 0x10;
		} else if (c == 0x49) {
			/* 'I' selects JIS X 0201 kana */
			filter->status = 0x20;
		} else {
			filter->status &= ~0xf;
			CK(filter->output_function(MBFL_BAD_INPUT, filter->data));
			CK(filter->output_function(0x28, filter->data));
			goto retry;
		}
		break;

		EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
287 
/* End-of-input handler for the legacy JIS decoder.
 *
 * A non-zero low nibble in filter->status means the input stopped in the
 * middle of a double-byte character or an escape sequence; that is reported
 * as MBFL_BAD_INPUT. The status is then reset and the flush is passed on to
 * filter->flush_function, if one is set. Returns 0, or -1 if the output
 * function fails (in which case the status is left untouched). */
static int mbfl_filt_conv_jis_wchar_flush(mbfl_convert_filter *filter)
{
	int pending = filter->status & 0xF;

	if (pending != 0) {
		CK(filter->output_function(MBFL_BAD_INPUT, filter->data));
	}
	filter->status = 0;

	if (filter->flush_function) {
		filter->flush_function(filter->data);
	}

	return 0;
}
303 
/* Legacy encoder: convert one Unicode codepoint `c` to JIS and pass the
 * resulting bytes to filter->output_function.
 *
 * The intermediate value `s` identifies the target character set:
 *   s < 0x80          ASCII byte
 *   0xA1 <= s <= 0xDF JIS X 0201 kana (emitted as the 7-bit byte s & 0x7F)
 *   s < 0x8080        JIS X 0208, two bytes
 *   s < 0x10000       JIS X 0212, two bytes
 *   otherwise         JIS X 0201 Latin (0x10000 + byte)
 *   s < 0             not representable
 *
 * filter->status remembers which character set the output is currently
 * switched to (0 ASCII, 0x100 kana, 0x200 JIS X 0208, 0x300 JIS X 0212,
 * 0x400 JIS X 0201 Latin), so that an escape sequence is emitted only when
 * the character set changes.
 *
 * Returns 0, or -1 if an output call fails (see CK). */
static int mbfl_filt_conv_wchar_jis(int c, mbfl_convert_filter *filter)
{
	int s = 0;

	if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
		s = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
	} else if (c == 0x203E) { /* OVERLINE */
		s = 0x1007E; /* Convert to JISX 0201 OVERLINE */
	} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
		s = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
	} else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
		s = ucs_i_jis_table[c - ucs_i_jis_table_min];
	} else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
		s = ucs_r_jis_table[c - ucs_r_jis_table_min];
	}

	if (s <= 0) {
		/* Not found in the tables; try a few hard-coded mappings */
		if (c == 0xa5) {		/* YEN SIGN */
			s = 0x1005c;
		} else if (c == 0xff3c) {	/* FULLWIDTH REVERSE SOLIDUS */
			s = 0x2140;
		} else if (c == 0x2225) {	/* PARALLEL TO */
			s = 0x2142;
		} else if (c == 0xff0d) {	/* FULLWIDTH HYPHEN-MINUS */
			s = 0x215d;
		} else if (c == 0xffe0) {	/* FULLWIDTH CENT SIGN */
			s = 0x2171;
		} else if (c == 0xffe1) {	/* FULLWIDTH POUND SIGN */
			s = 0x2172;
		} else if (c == 0xffe2) {	/* FULLWIDTH NOT SIGN */
			s = 0x224c;
		}
		if (c == 0) {
			/* NUL is a legitimate ASCII byte, not a failed lookup */
			s = 0;
		} else if (s <= 0) {
			s = -1;
		}
	}

	if (s < 0) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
		return 0;
	}

	if (s < 0x80) { /* ASCII */
		if ((filter->status & 0xff00) != 0) {
			CK((*filter->output_function)(0x1b, filter->data));		/* ESC */
			CK((*filter->output_function)(0x28, filter->data));		/* '(' */
			CK((*filter->output_function)(0x42, filter->data));		/* 'B' */
		}
		filter->status = 0;
		CK((*filter->output_function)(s, filter->data));
	} else if (s >= 0xA1 && s <= 0xDF) { /* X 0201 kana */
		/* Without this branch a half-width kana would be written as a
		 * two-byte JIS X 0208 character whose first byte is NUL */
		if ((filter->status & 0xff00) != 0x100) {
			CK((*filter->output_function)(0x1b, filter->data));		/* ESC */
			CK((*filter->output_function)(0x28, filter->data));		/* '(' */
			CK((*filter->output_function)(0x49, filter->data));		/* 'I' */
		}
		filter->status = 0x100;
		CK((*filter->output_function)(s & 0x7f, filter->data));
	} else if (s < 0x8080) { /* X 0208 */
		if ((filter->status & 0xff00) != 0x200) {
			CK((*filter->output_function)(0x1b, filter->data));		/* ESC */
			CK((*filter->output_function)(0x24, filter->data));		/* '$' */
			CK((*filter->output_function)(0x42, filter->data));		/* 'B' */
		}
		filter->status = 0x200;
		CK((*filter->output_function)((s >> 8) & 0x7f, filter->data));
		CK((*filter->output_function)(s & 0x7f, filter->data));
	} else if (s < 0x10000) { /* X 0212 */
		if ((filter->status & 0xff00) != 0x300) {
			CK((*filter->output_function)(0x1b, filter->data));		/* ESC */
			CK((*filter->output_function)(0x24, filter->data));		/* '$' */
			CK((*filter->output_function)(0x28, filter->data));		/* '(' */
			CK((*filter->output_function)(0x44, filter->data));		/* 'D' */
		}
		filter->status = 0x300;
		CK((*filter->output_function)((s >> 8) & 0x7f, filter->data));
		CK((*filter->output_function)(s & 0x7f, filter->data));
	} else { /* X 0201 latin */
		if ((filter->status & 0xff00) != 0x400) {
			CK((*filter->output_function)(0x1b, filter->data));		/* ESC */
			CK((*filter->output_function)(0x28, filter->data));		/* '(' */
			CK((*filter->output_function)(0x4a, filter->data));		/* 'J' */
		}
		filter->status = 0x400;
		CK((*filter->output_function)(s & 0x7f, filter->data));
	}

	return 0;
}
384 
/* Legacy encoder: convert one Unicode codepoint `c` to ISO-2022-JP and pass
 * the resulting bytes to filter->output_function.
 *
 * Only ASCII, JIS X 0208 and JIS X 0201 Latin are emitted here: table values
 * from 0x80 up to (but not including) 0x2121, and values above 0x8080, are
 * rejected. Codepoints which cannot be encoded are reported through
 * mbfl_filt_conv_illegal_output() rather than silently dropped.
 *
 * filter->status remembers which character set the output is currently
 * switched to (0 ASCII, 0x200 JIS X 0208, 0x400 JIS X 0201 Latin), so that an
 * escape sequence is emitted only when the character set changes.
 *
 * Returns 0, or -1 if an output call fails (see CK). */
static int mbfl_filt_conv_wchar_2022jp(int c, mbfl_convert_filter *filter)
{
	int s = 0;

	if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
		s = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
	} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
		s = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
	} else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
		s = ucs_i_jis_table[c - ucs_i_jis_table_min];
	} else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
		s = ucs_r_jis_table[c - ucs_r_jis_table_min];
	}

	if (s <= 0) {
		/* Not found in the tables; try a few hard-coded mappings */
		if (c == 0xa5) {			/* YEN SIGN */
			s = 0x1005c;
		} else if (c == 0xff3c) {	/* FULLWIDTH REVERSE SOLIDUS */
			s = 0x2140;
		} else if (c == 0x2225) {	/* PARALLEL TO */
			s = 0x2142;
		} else if (c == 0xff0d) {	/* FULLWIDTH HYPHEN-MINUS */
			s = 0x215d;
		} else if (c == 0xffe0) {	/* FULLWIDTH CENT SIGN */
			s = 0x2171;
		} else if (c == 0xffe1) {	/* FULLWIDTH POUND SIGN */
			s = 0x2172;
		} else if (c == 0xffe2) {	/* FULLWIDTH NOT SIGN */
			s = 0x224c;
		}
		if (c == 0) {
			/* NUL is a legitimate ASCII byte, not a failed lookup */
			s = 0;
		} else if (s <= 0) {
			s = -1;
		}
	} else if ((s >= 0x80 && s < 0x2121) || (s > 0x8080)) {
		/* Found in the tables, but not in a character set emitted here */
		s = -1;
	}

	if (s >= 0) {
		if (s < 0x80) { /* ASCII */
			if ((filter->status & 0xff00) != 0) {
				CK((*filter->output_function)(0x1b, filter->data));		/* ESC */
				CK((*filter->output_function)(0x28, filter->data));		/* '(' */
				CK((*filter->output_function)(0x42, filter->data));		/* 'B' */
			}
			filter->status = 0;
			CK((*filter->output_function)(s, filter->data));
		} else if (s < 0x10000) { /* X 0208 */
			if ((filter->status & 0xff00) != 0x200) {
				CK((*filter->output_function)(0x1b, filter->data));		/* ESC */
				CK((*filter->output_function)(0x24, filter->data));		/* '$' */
				CK((*filter->output_function)(0x42, filter->data));		/* 'B' */
			}
			filter->status = 0x200;
			CK((*filter->output_function)((s >> 8) & 0x7f, filter->data));
			CK((*filter->output_function)(s & 0x7f, filter->data));
		} else { /* X 0201 latin */
			if ((filter->status & 0xff00) != 0x400) {
				CK((*filter->output_function)(0x1b, filter->data));		/* ESC */
				CK((*filter->output_function)(0x28, filter->data));		/* '(' */
				CK((*filter->output_function)(0x4a, filter->data));		/* 'J' */
			}
			filter->status = 0x400;
			CK((*filter->output_function)(s & 0x7f, filter->data));
		}
	} else {
		CK(mbfl_filt_conv_illegal_output(c, filter));
	}

	return 0;
}
455 
/* Currently selected character set, as tracked by the buffer-based
 * converters and validators below */
#define ASCII           0 /* plain ASCII */
#define JISX_0201_LATIN 1 /* selected by ESC ( J */
#define JISX_0201_KANA  2 /* selected by ESC ( I */
#define JISX_0208       3 /* selected by ESC $ @, ESC $ B, ESC $ ( @ or ESC $ ( B */
#define JISX_0212       4 /* selected by ESC $ ( D */
461 
mb_iso2022jp_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)462 static size_t mb_iso2022jp_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
463 {
464 	ZEND_ASSERT(bufsize >= 3);
465 
466 	unsigned char *p = *in, *e = p + *in_len;
467 	uint32_t *out = buf, *limit = buf + bufsize;
468 
469 	while (p < e && out < limit) {
470 		unsigned char c = *p++;
471 
472 		if (c == 0x1B) {
473 			/* ESC seen; this is an escape sequence */
474 			if ((e - p) < 2) {
475 				*out++ = MBFL_BAD_INPUT;
476 				if (p != e && (*p == '$' || *p == '('))
477 					p++;
478 				continue;
479 			}
480 
481 			unsigned char c2 = *p++;
482 			if (c2 == '$') {
483 				unsigned char c3 = *p++;
484 				if (c3 == '@' || c3 == 'B') {
485 					*state = JISX_0208;
486 				} else if (c3 == '(') {
487 					if (p == e) {
488 						*out++ = MBFL_BAD_INPUT;
489 						break;
490 					}
491 					unsigned char c4 = *p++;
492 					if (c4 == '@' || c4 == 'B') {
493 						*state = JISX_0208;
494 					} else if (c4 == 'D') {
495 						*state = JISX_0212;
496 					} else {
497 						if ((limit - out) < 3) {
498 							p -= 4;
499 							break;
500 						}
501 						*out++ = MBFL_BAD_INPUT;
502 						*out++ = '$';
503 						*out++ = '(';
504 						p--;
505 					}
506 				} else {
507 					if ((limit - out) < 2) {
508 						p -= 3;
509 						break;
510 					}
511 					*out++ = MBFL_BAD_INPUT;
512 					*out++ = '$';
513 					p--;
514 				}
515 			} else if (c2 == '(') {
516 				unsigned char c3 = *p++;
517 				if (c3 == 'B' || c3 == 'H') {
518 					*state = ASCII;
519 				} else if (c3 == 'J') {
520 					*state = JISX_0201_LATIN;
521 				} else if (c3 == 'I') {
522 					*state = JISX_0201_KANA;
523 				} else {
524 					if ((limit - out) < 2) {
525 						p -= 3;
526 						break;
527 					}
528 					*out++ = MBFL_BAD_INPUT;
529 					*out++ = '(';
530 					p--;
531 				}
532 			} else {
533 				*out++ = MBFL_BAD_INPUT;
534 				p--;
535 			}
536 		} else if (c == 0xE) {
537 			/* "Kana In" marker; this is just for JIS-7/8, but we also accept it for ISO-2022-JP */
538 			*state = JISX_0201_KANA;
539 		} else if (c == 0xF) {
540 			/* "Kana Out" marker */
541 			*state = ASCII;
542 		} else if (*state == JISX_0201_LATIN && c == 0x5C) { /* YEN SIGN */
543 			*out++ = 0xA5;
544 		} else if (*state == JISX_0201_LATIN && c == 0x7E) { /* OVER LINE */
545 			*out++ = 0x203E;
546 		} else if (*state == JISX_0201_KANA && c > 0x20 && c < 0x60) {
547 			*out++ = 0xFF40 + c;
548 		} else if (*state >= JISX_0208 && c > 0x20 && c < 0x7F) {
549 			if (p == e) {
550 				*out++ = MBFL_BAD_INPUT;
551 				break;
552 			}
553 			unsigned char c2 = *p++;
554 			if (c2 > 0x20 && c2 < 0x7F) {
555 				unsigned int s = (c - 0x21)*94 + c2 - 0x21;
556 				uint32_t w = 0;
557 				if (*state == JISX_0208) {
558 					if (s < jisx0208_ucs_table_size) {
559 						w = jisx0208_ucs_table[s];
560 					}
561 					if (!w) {
562 						w = MBFL_BAD_INPUT;
563 					}
564 				} else {
565 					if (s < jisx0212_ucs_table_size) {
566 						w = jisx0212_ucs_table[s];
567 					}
568 					if (!w) {
569 						w = MBFL_BAD_INPUT;
570 					}
571 				}
572 				*out++ = w;
573 			} else {
574 				*out++ = MBFL_BAD_INPUT;
575 			}
576 		} else if (c < 0x80) {
577 			*out++ = c;
578 		} else if (c >= 0xA1 && c <= 0xDF) {
579 			/* GR-invoked Kana; "GR" stands for "graphics right" and refers to bytes
580 			 * with the MSB bit (in the context of ISO-2022 encoding).
581 			 *
582 			 * In this regard, Wikipedia states:
583 			 * "Other, older variants known as JIS7 and JIS8 build directly on the 7-bit and 8-bit
584 			 * encodings defined by JIS X 0201 and allow use of JIS X 0201 kana from G1 without
585 			 * escape sequences, using Shift Out and Shift In or setting the eighth bit
586 			 * (GR-invoked), respectively."
587 			 *
588 			 * Note that we support both the 'JIS7' use of 0xE/0xF Shift In/Shift Out codes
589 			 * and the 'JIS8' use of GR-invoked Kana */
590 			*out++ = 0xFEC0 + c;
591 		} else {
592 			*out++ = MBFL_BAD_INPUT;
593 		}
594 	}
595 
596 	*in_len = e - p;
597 	*in = p;
598 	return out - buf;
599 }
600 
/* Encode `len` codepoints from `in` as ISO-2022-JP, appending to `buf`.
 *
 * buf->state tracks the character set the output is switched to, so an escape
 * sequence is written only when it changes. Codepoints which cannot be
 * encoded go through MB_CONVERT_ERROR. If `end` is true and the output is not
 * in ASCII mode, a final ESC ( B is appended. */
static void mb_wchar_to_iso2022jp(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	/* every codepoint needs at most 2 bytes while the character set is unchanged */
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);

	while (len--) {
		uint32_t cp = *in++;
		unsigned int jis = 0;

		if (cp >= ucs_a1_jis_table_min && cp < ucs_a1_jis_table_max) {
			jis = ucs_a1_jis_table[cp - ucs_a1_jis_table_min];
		} else if (cp >= ucs_a2_jis_table_min && cp < ucs_a2_jis_table_max) {
			jis = ucs_a2_jis_table[cp - ucs_a2_jis_table_min];
		} else if (cp >= ucs_i_jis_table_min && cp < ucs_i_jis_table_max) {
			jis = ucs_i_jis_table[cp - ucs_i_jis_table_min];
		} else if (cp >= ucs_r_jis_table_min && cp < ucs_r_jis_table_max) {
			jis = ucs_r_jis_table[cp - ucs_r_jis_table_min];
		}

		if (jis == 0) {
			/* Not found in the tables; try a few hard-coded mappings */
			switch (cp) {
			case 0xA5: /* YEN SIGN */
				jis = 0x1005C;
				break;
			case 0xFF3C: /* FULLWIDTH REVERSE SOLIDUS */
				jis = 0x2140;
				break;
			case 0x2225: /* PARALLEL TO */
				jis = 0x2142;
				break;
			case 0xFF0D: /* FULLWIDTH HYPHEN-MINUS */
				jis = 0x215D;
				break;
			case 0xFFE0: /* FULLWIDTH CENT SIGN */
				jis = 0x2171;
				break;
			case 0xFFE1: /* FULLWIDTH POUND SIGN */
				jis = 0x2172;
				break;
			case 0xFFE2: /* FULLWIDTH NOT SIGN */
				jis = 0x224C;
				break;
			case 0:
				/* NUL is written as an ASCII zero byte */
				break;
			default: {
				MB_CONVERT_ERROR(buf, out, limit, cp, mb_wchar_to_iso2022jp);
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
				continue;
			}
			}
		} else if ((jis >= 0x80 && jis < 0x2121) || (jis > 0x8080)) {
			/* Found in the tables, but not in a character set emitted here */
			MB_CONVERT_ERROR(buf, out, limit, cp, mb_wchar_to_iso2022jp);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
			continue;
		}

		/* When switching character set, reserve room for the escape sequence
		 * and this character, plus 2 bytes for each remaining codepoint */
		if (jis < 0x80) { /* ASCII */
			if (buf->state != ASCII) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 4);
				out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
				buf->state = ASCII;
			}
			out = mb_convert_buf_add(out, jis);
		} else if (jis < 0x8080) { /* JIS X 0208 */
			if (buf->state != JISX_0208) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 5);
				out = mb_convert_buf_add3(out, 0x1B, '$', 'B');
				buf->state = JISX_0208;
			}
			out = mb_convert_buf_add2(out, (jis >> 8) & 0x7F, jis & 0x7F);
		} else if (jis < 0x10000) { /* JIS X 0212 */
			if (buf->state != JISX_0212) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 6);
				out = mb_convert_buf_add4(out, 0x1B, '$', '(', 'D');
				buf->state = JISX_0212;
			}
			out = mb_convert_buf_add2(out, (jis >> 8) & 0x7F, jis & 0x7F);
		} else { /* X 0201 Latin */
			if (buf->state != JISX_0201_LATIN) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 4);
				out = mb_convert_buf_add3(out, 0x1B, '(', 'J');
				buf->state = JISX_0201_LATIN;
			}
			out = mb_convert_buf_add(out, jis & 0x7F);
		}
	}

	if (end && buf->state != ASCII) {
		/* finish in ASCII mode */
		MB_CONVERT_BUF_ENSURE(buf, out, limit, 3);
		out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
685 
/* Encode `len` codepoints from `in` as JIS, appending to `buf`.
 *
 * Unlike mb_wchar_to_iso2022jp, this also emits JIS X 0201 kana (ESC ( I) and
 * JIS X 0212 (ESC $ ( D), and maps U+203E OVERLINE to JIS X 0201 Latin.
 *
 * buf->state tracks the character set the output is switched to, so an escape
 * sequence is written only when it changes. Codepoints which cannot be
 * encoded go through MB_CONVERT_ERROR, which is handed this same function as
 * the encoder. If `end` is true and the output is not in ASCII mode, a final
 * ESC ( B is appended. */
static void mb_wchar_to_jis(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	/* every codepoint needs at most 2 bytes while the character set is unchanged */
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);

	while (len--) {
		uint32_t w = *in++;
		unsigned int s = 0;

		if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) {
			s = ucs_a1_jis_table[w - ucs_a1_jis_table_min];
		} else if (w == 0x203E) { /* OVERLINE */
			s = 0x1007E; /* Convert to JISX 0201 OVERLINE */
		} else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) {
			s = ucs_a2_jis_table[w - ucs_a2_jis_table_min];
		} else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) {
			s = ucs_i_jis_table[w - ucs_i_jis_table_min];
		} else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) {
			s = ucs_r_jis_table[w - ucs_r_jis_table_min];
		}

		if (s == 0) {
			/* Not found in the tables; try a few hard-coded mappings */
			if (w == 0xA5) { /* YEN SIGN */
				s = 0x1005C;
			} else if (w == 0xFF3C) { /* FULLWIDTH REVERSE SOLIDUS */
				s = 0x2140;
			} else if (w == 0x2225) { /* PARALLEL TO */
				s = 0x2142;
			} else if (w == 0xFF0D) { /* FULLWIDTH HYPHEN-MINUS */
				s = 0x215D;
			} else if (w == 0xFFE0) { /* FULLWIDTH CENT SIGN */
				s = 0x2171;
			} else if (w == 0xFFE1) { /* FULLWIDTH POUND SIGN */
				s = 0x2172;
			} else if (w == 0xFFE2) { /* FULLWIDTH NOT SIGN */
				s = 0x224C;
			} else if (w != 0) {
				/* Pass this encoder (not the ISO-2022-JP one) to the error
				 * handler */
				MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_jis);
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
				continue;
			}
		}

		/* When switching character set, reserve room for the escape sequence
		 * and this character, plus 2 bytes for each remaining codepoint */
		if (s < 0x80) { /* ASCII */
			if (buf->state != ASCII) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 4);
				out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
				buf->state = ASCII;
			}
			out = mb_convert_buf_add(out, s);
		} else if (s >= 0xA1 && s <= 0xDF) { /* JIS X 0201 kana */
			if (buf->state != JISX_0201_KANA) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 4);
				out = mb_convert_buf_add3(out, 0x1B, '(', 'I');
				buf->state = JISX_0201_KANA;
			}
			out = mb_convert_buf_add(out, s & 0x7F);
		} else if (s < 0x8080) { /* JIS X 0208 */
			if (buf->state != JISX_0208) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 5);
				out = mb_convert_buf_add3(out, 0x1B, '$', 'B');
				buf->state = JISX_0208;
			}
			out = mb_convert_buf_add2(out, (s >> 8) & 0x7F, s & 0x7F);
		} else if (s < 0x10000) { /* JIS X 0212 */
			if (buf->state != JISX_0212) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 6);
				out = mb_convert_buf_add4(out, 0x1B, '$', '(', 'D');
				buf->state = JISX_0212;
			}
			out = mb_convert_buf_add2(out, (s >> 8) & 0x7F, s & 0x7F);
		} else { /* X 0201 Latin */
			if (buf->state != JISX_0201_LATIN) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 4);
				out = mb_convert_buf_add3(out, 0x1B, '(', 'J');
				buf->state = JISX_0201_LATIN;
			}
			out = mb_convert_buf_add(out, s & 0x7F);
		}
	}

	if (end && buf->state != ASCII) {
		/* finish in ASCII mode */
		MB_CONVERT_BUF_ENSURE(buf, out, limit, 3);
		out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
775 
/* Validator-only state: kana mode entered with the 0xE "Kana In" byte, as
 * opposed to the ESC ( I escape sequence */
#define JISX_0201_KANA_SO 5
777 
mb_check_jis(unsigned char * in,size_t in_len)778 static bool mb_check_jis(unsigned char *in, size_t in_len)
779 {
780 	unsigned char *p = in, *e = p + in_len;
781 	unsigned int state = ASCII;
782 
783 	while (p < e) {
784 		unsigned char c = *p++;
785 		if (c == 0x1B) {
786 			/* ESC seen; this is an escape sequence */
787 			if (state == JISX_0201_KANA_SO) {
788 				return false;
789 			}
790 			if ((e - p) < 2) {
791 				return false;
792 			}
793 			unsigned char c2 = *p++;
794 			if (c2 == '$') {
795 				unsigned char c3 = *p++;
796 				if (c3 == '@' || c3 == 'B') {
797 					state = JISX_0208;
798 				} else if (c3 == '(') {
799 					if (p == e) {
800 						return false;
801 					}
802 					unsigned char c4 = *p++;
803 					if (c4 == '@' || c4 == 'B') {
804 						state = JISX_0208;
805 					} else if (c4 == 'D') {
806 						state = JISX_0212;
807 					} else {
808 						return false;
809 					}
810 				} else {
811 					return false;
812 				}
813 			} else if (c2 == '(') {
814 				unsigned char c3 = *p++;
815 				/* ESC ( H is treated as a sequence transitioning to ASCII for historical reasons.
816 				 * see https://github.com/php/php-src/pull/10828#issuecomment-1478342432. */
817 				if (c3 == 'B' || c3 == 'H') {
818 					state = ASCII;
819 				} else if (c3 == 'J') {
820 					state = JISX_0201_LATIN;
821 				} else if (c3 == 'I') {
822 					state = JISX_0201_KANA;
823 				} else {
824 					return false;
825 				}
826 			} else {
827 				return false;
828 			}
829 		} else if (c == 0xE) {
830 			/* "Kana In" marker */
831 			if (state != ASCII) {
832 				return false;
833 			}
834 			state = JISX_0201_KANA_SO;
835 		} else if (c == 0xF) {
836 			/* "Kana Out" marker */
837 			if (state != JISX_0201_KANA_SO) {
838 				return false;
839 			}
840 			state = ASCII;
841 		} else if ((state == JISX_0208 || state == JISX_0212) && (c > 0x20 && c < 0x7F)) {
842 			if (p == e) {
843 				return false;
844 			}
845 			unsigned char c2 = *p++;
846 			if (c2 > 0x20 && c2 < 0x7F) {
847 				unsigned int s = (c - 0x21)*94 + c2 - 0x21;
848 				if (state == JISX_0208) {
849 					if (s < jisx0208_ucs_table_size && jisx0208_ucs_table[s]) {
850 						continue;
851 					}
852 				} else {
853 					if (s < jisx0212_ucs_table_size && jisx0212_ucs_table[s]) {
854 						continue;
855 					}
856 				}
857 				return false;
858 			} else {
859 				return false;
860 			}
861 		} else if (c < 0x80) {
862 			continue;
863 		} else if (c >= 0xA1 && c <= 0xDF) {
864 			/* GR-invoked Kana */
865 			continue;
866 		} else {
867 			return false;
868 		}
869 	}
870 
871 	return state == ASCII;
872 }
873 
mb_check_iso2022jp(unsigned char * in,size_t in_len)874 static bool mb_check_iso2022jp(unsigned char *in, size_t in_len)
875 {
876 	unsigned char *p = in, *e = p + in_len;
877 	unsigned int state = ASCII;
878 
879 	while (p < e) {
880 		unsigned char c = *p++;
881 		if (c == 0x1B) {
882 			/* ESC seen; this is an escape sequence */
883 			if ((e - p) < 2) {
884 				return false;
885 			}
886 			unsigned char c2 = *p++;
887 			if (c2 == '$') {
888 				unsigned char c3 = *p++;
889 				if (c3 == '@' || c3 == 'B') {
890 					state = JISX_0208;
891 				} else {
892 					return false;
893 				}
894 			} else if (c2 == '(') {
895 				unsigned char c3 = *p++;
896 				if (c3 == 'B') {
897 					state = ASCII;
898 				} else if (c3 == 'J') {
899 					state = JISX_0201_LATIN;
900 				} else {
901 					return false;
902 				}
903 			} else {
904 				return false;
905 			}
906 		} else if (c == 0xE || c == 0xF) {
907 			/* "Kana In" or "Kana Out" marker; ISO-2022-JP is not accepted. */
908 			return false;
909 		} else if (state == JISX_0208 && (c > 0x20 && c < 0x7F)) {
910 			if (p == e) {
911 				return false;
912 			}
913 			unsigned char c2 = *p++;
914 			if (c2 > 0x20 && c2 < 0x7F) {
915 				unsigned int s = (c - 0x21)*94 + c2 - 0x21;
916 				if (s < jisx0208_ucs_table_size && jisx0208_ucs_table[s]) {
917 					continue;
918 				}
919 				return false;
920 			} else {
921 				return false;
922 			}
923 		} else if (c < 0x80) {
924 			continue;
925 		} else {
926 			return false;
927 		}
928 	}
929 
930 	return state == ASCII;
931 }
932 
/* The emoji tables store only 16 bits per entry, although the real codepoints
 * are larger. This function recovers the real value:
 *
 * - Unicode emoji live above 0x1F000; stored values above 0xF000 get
 *   0x10000 added.
 * - Emoji which Unicode does not support use Private Use Area codepoints
 *   above 0xFE000; stored values above 0xE000 (up to 0xF000) get 0xF0000
 *   added.
 * - Anything else is returned unchanged. */
static inline int convert_emoji_cp(int cp)
{
	if (cp <= 0xE000) {
		return cp;
	}
	return cp + (cp > 0xF000 ? 0x10000 : 0xF0000);
}
947 
mbfilter_sjis_emoji_kddi2unicode(int s,int * snd)948 int mbfilter_sjis_emoji_kddi2unicode(int s, int *snd)
949 {
950 	if (s >= mb_tbl_code2uni_kddi1_min && s <= mb_tbl_code2uni_kddi1_max) {
951 		if (s == 0x24C0) { /* Spain */
952 			EMIT_FLAG_EMOJI("ES");
953 		} else if (s == 0x24C1) { /* Russia */
954 			EMIT_FLAG_EMOJI("RU");
955 		} else if (s >= 0x2545 && s <= 0x254A) {
956 			EMIT_FLAG_EMOJI(nflags_kddi[s - 0x2545]);
957 		} else if (s == 0x25BC) {
958 			EMIT_KEYPAD_EMOJI('#');
959 		} else {
960 			*snd = 0;
961 			return convert_emoji_cp(mb_tbl_code2uni_kddi1[s - mb_tbl_code2uni_kddi1_min]);
962 		}
963 	} else if (s >= mb_tbl_code2uni_kddi2_min && s <= mb_tbl_code2uni_kddi2_max) {
964 		if (s == 0x2750) { /* Japan */
965 			EMIT_FLAG_EMOJI("JP");
966 		} else if (s >= 0x27A6 && s <= 0x27AE) {
967 			EMIT_KEYPAD_EMOJI(s - 0x27A6 + '1');
968 		} else if (s == 0x27F7) { /* United States */
969 			EMIT_FLAG_EMOJI("US");
970 		} else if (s == 0x2830) {
971 			EMIT_KEYPAD_EMOJI('0');
972 		} else {
973 			*snd = 0;
974 			return convert_emoji_cp(mb_tbl_code2uni_kddi2[s - mb_tbl_code2uni_kddi2_min]);
975 		}
976 	}
977 	return 0;
978 }
979 
/* Decode one input byte `c` of ISO-2022-JP text with KDDI emoji extensions,
 * passing the resulting codepoint(s) to filter->output_function.
 *
 * The low 4 bits of filter->status hold the parser position:
 *   0 = start of a character
 *   1 = first byte of a double-byte character is in filter->cache
 *   2 = seen ESC, 3 = seen ESC $, 4 = seen ESC $ (, 5 = seen ESC (
 * The remaining bits hold the selected character set: zero for ASCII, or
 * JISX0201_KANA / JISX0208_KANJI.
 *
 * Invalid input is reported by emitting MBFL_BAD_INPUT. Returns 0; every
 * emit goes through CK(), which is defined outside this view and presumably
 * returns early with an error when the output callback fails. */
static int mbfl_filt_conv_2022jp_mobile_wchar(int c, mbfl_convert_filter *filter)
{
	int c1, s, w, snd = 0;

	switch (filter->status & 0xF) {
	case 0:
		if (c == 0x1B) {
			filter->status += 2;
		} else if (filter->status == JISX0201_KANA && c > 0x20 && c < 0x60) {
			CK((*filter->output_function)(0xFF40 + c, filter->data));
		} else if (filter->status == JISX0208_KANJI && c > 0x20 && c < 0x80) {
			filter->cache = c;
			filter->status += 1;
		} else if (c >= 0 && c < 0x80) { /* ASCII */
			CK((*filter->output_function)(c, filter->data));
		} else if (c > 0xA0 && c < 0xE0) { /* Kana */
			CK((*filter->output_function)(0xFEC0 + c, filter->data));
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	/* JISX 0208, second byte */
	case 1:
		w = 0;
		filter->status &= ~0xF;
		c1 = filter->cache;
		if (c > 0x20 && c < 0x7F) {
			/* (ku*94)+ten index of the character */
			s = ((c1 - 0x21) * 94) + c - 0x21;

			/* A few characters are mapped to fixed fullwidth codepoints
			 * instead of going through the lookup tables */
			if (s <= 137) {
				if (s == 31) {
					w = 0xFF3C; /* FULLWIDTH REVERSE SOLIDUS */
				} else if (s == 32) {
					w = 0xFF5E; /* FULLWIDTH TILDE */
				} else if (s == 33) {
					w = 0x2225; /* PARALLEL TO */
				} else if (s == 60) {
					w = 0xFF0D; /* FULLWIDTH HYPHEN-MINUS */
				} else if (s == 80) {
					w = 0xFFE0; /* FULLWIDTH CENT SIGN */
				} else if (s == 81) {
					w = 0xFFE1; /* FULLWIDTH POUND SIGN */
				} else if (s == 137) {
					w = 0xFFE2; /* FULLWIDTH NOT SIGN */
				}
			}

			/* Rows 84 to 90 (zero-based) are looked up as KDDI emoji, after
			 * shifting the index up by 22 rows */
			if (s >= (84 * 94) && s < (91 * 94)) {
				s += 22 * 94;
				w = mbfilter_sjis_emoji_kddi2unicode(s, &snd);
				if (w > 0 && snd > 0) {
					/* Two-codepoint emoji: `snd` holds the first codepoint.
					 * Use CK() like every other emit in this function, so a
					 * failure of the output callback is not silently lost. */
					CK((*filter->output_function)(snd, filter->data));
				}
			}

			if (w == 0) {
				if (s >= cp932ext1_ucs_table_min && s < cp932ext1_ucs_table_max) {
					w = cp932ext1_ucs_table[s - cp932ext1_ucs_table_min];
				} else if (s >= 0 && s < jisx0208_ucs_table_size) {
					w = jisx0208_ucs_table[s];
				}
			}

			if (w <= 0) {
				w = MBFL_BAD_INPUT;
			}
			CK((*filter->output_function)(w, filter->data));
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	/* ESC */
	case 2:
		if (c == '$') {
			filter->status++;
		} else if (c == '(') {
			filter->status += 3;
		} else {
			filter->status &= ~0xF;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	/* ESC $ */
	case 3:
		if (c == '@' || c == 'B') {
			filter->status = JISX0208_KANJI;
		} else if (c == '(') {
			filter->status++;
		} else {
			filter->status &= ~0xF;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	/* ESC $ ( */
	case 4:
		if (c == '@' || c == 'B') {
			filter->status = JISX0208_KANJI;
		} else {
			filter->status &= ~0xF;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	/* ESC ( */
	case 5:
		if (c == 'B' || c == 'J') {
			filter->status = 0; /* ASCII mode */
		} else if (c == 'I') {
			filter->status = JISX0201_KANA;
		} else {
			filter->status &= ~0xF;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;
	}

	return 0;
}
1101 
/* Finish decoding: if the input stopped in the middle of an escape sequence
 * or a double-byte character (low 4 bits of status are non-zero), emit
 * MBFL_BAD_INPUT. Then reset the status and invoke the flush callback, if one
 * is set. Results of the callbacks are not inspected; always returns 0. */
static int mbfl_filt_conv_2022jp_mobile_wchar_flush(mbfl_convert_filter *filter)
{
	if ((filter->status & 0xF) != 0) {
		(*filter->output_function)(MBFL_BAD_INPUT, filter->data);
	}
	filter->status = 0;

	if (!filter->flush_function) {
		return 0;
	}

	(*filter->flush_function)(filter->data);
	return 0;
}
1115 
/* Try to convert codepoint `c` to a KDDI emoji code.
 *
 * Returns 1 and stores the code in `*s1` if an emoji was recognized; returns
 * 0 otherwise, leaving `*s1` untouched.
 *
 * Keypad emoji are two codepoints long ('#' or a digit, then U+20E3), so '#'
 * and digits are not converted right away. They are saved in filter->cache
 * and flagged in the low bits of filter->status, and 0 is returned. On the
 * next call, the saved character either completes a keypad emoji or is
 * written out as plain ASCII (preceded by ESC ( B if another character set
 * was selected) before `c` itself is examined. */
static int mbfilter_unicode2sjis_emoji_kddi(int c, int *s1, mbfl_convert_filter *filter)
{
	if ((filter->status & 0xF) == 1) {
		int pending = filter->cache;

		filter->cache = 0;
		filter->status &= ~0xFF;

		if (c == 0x20E3) {
			switch (pending) {
			case '#':
				*s1 = 0x25BC;
				break;
			case '0':
				*s1 = 0x2830;
				break;
			default: /* Saved character was '1'-'9' */
				*s1 = 0x27A6 + (pending - '1');
				break;
			}
			return 1;
		}

		/* Not a keypad emoji after all; write the saved character as ASCII */
		if (filter->status & 0xFF00) {
			CK((*filter->output_function)(0x1B, filter->data)); /* ESC */
			CK((*filter->output_function)('(', filter->data));
			CK((*filter->output_function)('B', filter->data));
		}
		CK((*filter->output_function)(pending, filter->data));
		filter->status = 0;
	}

	/* Possible start of a keypad emoji: hold it back until the next call */
	if (c == '#' || (c >= '0' && c <= '9')) {
		filter->status |= 1;
		filter->cache = c;
		return 0;
	}

	if (c == 0xA9) { /* Copyright sign */
		*s1 = 0x27DC;
		return 1;
	}
	if (c == 0xAE) { /* Registered sign */
		*s1 = 0x27DD;
		return 1;
	}

	/* At most one table is consulted, chosen by the range `c` falls in */
	int found = -1;

	if (c >= mb_tbl_uni_kddi2code2_min && c <= mb_tbl_uni_kddi2code2_max) {
		found = mbfl_bisec_srch2(c, mb_tbl_uni_kddi2code2_key, mb_tbl_uni_kddi2code2_len);
		if (found >= 0) {
			*s1 = mb_tbl_uni_kddi2code2_value[found];
		}
	} else if (c >= mb_tbl_uni_kddi2code3_min && c <= mb_tbl_uni_kddi2code3_max) {
		found = mbfl_bisec_srch2(c - 0x10000, mb_tbl_uni_kddi2code3_key, mb_tbl_uni_kddi2code3_len);
		if (found >= 0) {
			*s1 = mb_tbl_uni_kddi2code3_value[found];
		}
	} else if (c >= mb_tbl_uni_kddi2code5_min && c <= mb_tbl_uni_kddi2code5_max) {
		found = mbfl_bisec_srch2(c - 0xF0000, mb_tbl_uni_kddi2code5_key, mb_tbl_uni_kddi2code5_len);
		if (found >= 0) {
			*s1 = mb_tbl_uni_kddi2code5_val[found];
		}
	}

	return (found >= 0) ? 1 : 0;
}
1175 
/* (ku*94)+ten value -> packed two-byte code (this is NOT Shift-JIS encoding)
 *
 * On entry `s1` holds the (ku*94)+ten value. On exit:
 *   c1 = row byte (ku + 0x21)
 *   c2 = cell byte (ten + 0x21)
 *   s1 = (c1 << 8) | c2
 *   s2 = 1
 *
 * All four arguments are assigned to and are evaluated more than once, so
 * they must be plain variables without side effects. The body is wrapped in
 * do { } while (0) so the macro acts as a single statement, even as the body
 * of an unbraced `if`. */
#define CODE2JIS(c1,c2,s1,s2) \
	do { \
		c1 = (s1)/94+0x21; \
		c2 = (s1)-94*((c1)-0x21)+0x21; \
		s1 = ((c1) << 8) | (c2); \
		s2 = 1; \
	} while (0)
1182 
/* Encode one codepoint `c` as ISO-2022-JP with KDDI emoji extensions, writing
 * the resulting bytes (and any escape sequences needed to switch character
 * set) to filter->output_function.
 *
 * Bits 8-15 of filter->status record the character set last selected in the
 * output: 0 for ASCII, 0x100 for JIS X 0201 kana, 0x200 for JIS X 0208. A low
 * byte of 1 means a '#' or digit is being held in filter->cache until the
 * next codepoint shows whether it begins a keypad emoji.
 *
 * Codepoints which cannot be encoded are handed to
 * mbfl_filt_conv_illegal_output(). Returns 0; every emit goes through CK(),
 * which is defined outside this view and presumably returns early with an
 * error when the output callback fails. */
static int mbfl_filt_conv_wchar_2022jp_mobile(int c, mbfl_convert_filter *filter)
{
	int c1, c2, s1 = 0, s2 = 0;

	if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
		s1 = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
	} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
		s1 = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
	} else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
		s1 = ucs_i_jis_table[c - ucs_i_jis_table_min];
	} else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
		s1 = ucs_r_jis_table[c - ucs_r_jis_table_min];
	}

	if (s1 <= 0) {
		if (c == 0xA5) { /* YEN SIGN */
			s1 = 0x216F; /* FULLWIDTH YEN SIGN */
		} else if (c == 0xFF3C) { /* FULLWIDTH REVERSE SOLIDUS */
			s1 = 0x2140;
		} else if (c == 0x2225) { /* PARALLEL TO */
			s1 = 0x2142;
		} else if (c == 0xFF0D) { /* FULLWIDTH HYPHEN-MINUS */
			s1 = 0x215d;
		} else if (c == 0xFFE0) { /* FULLWIDTH CENT SIGN */
			s1 = 0x2171;
		} else if (c == 0xFFE1) { /* FULLWIDTH POUND SIGN */
			s1 = 0x2172;
		} else if (c == 0xFFE2) { /* FULLWIDTH NOT SIGN */
			s1 = 0x224c;
		}
	}

	if (mbfilter_unicode2sjis_emoji_kddi(c, &s1, filter) > 0) {
		/* A KDDI emoji was detected and stored in s1 */
		CODE2JIS(c1,c2,s1,s2);
		s1 -= 0x1600;
	} else if ((filter->status & 0xFF) == 1 && filter->cache) {
		/* We are just processing one of KDDI's special emoji for a phone keypad button */
		return 0;
	}

	if ((s1 <= 0) || (s1 >= 0xa1a1 && s2 == 0)) { /* not found or X 0212 */
		/* Last resort: linear search of the CP932 extension table */
		s1 = -1;
		for (c1 = 0; c1 < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; c1++) {
			if (c == cp932ext1_ucs_table[c1]) {
				s1 = (((c1 / 94) + 0x2D) << 8) + (c1 % 94) + 0x21;
				break;
			}
		}

		/* U+0000 is always encoded as a zero byte */
		if (c == 0) {
			s1 = 0;
		}
	}

	if (s1 >= 0) {
		if (s1 < 0x80) { /* ASCII */
			if (filter->status & 0xFF00) {
				CK((*filter->output_function)(0x1B, filter->data)); /* ESC */
				CK((*filter->output_function)('(', filter->data));
				CK((*filter->output_function)('B', filter->data));
			}
			CK((*filter->output_function)(s1, filter->data));
			filter->status = 0;
		} else if (s1 > 0xA0 && s1 < 0xE0) { /* Kana */
			if ((filter->status & 0xFF00) != 0x100) {
				CK((*filter->output_function)(0x1B, filter->data)); /* ESC */
				CK((*filter->output_function)('(', filter->data));
				CK((*filter->output_function)('I', filter->data));
			}
			filter->status = 0x100;
			CK((*filter->output_function)(s1 & 0x7F, filter->data));
		} else if (s1 < 0x7E7F) { /* JIS X 0208 */
			if ((filter->status & 0xFF00) != 0x200) {
				CK((*filter->output_function)(0x1B, filter->data)); /* ESC */
				CK((*filter->output_function)('$', filter->data));
				CK((*filter->output_function)('B', filter->data));
			}
			filter->status = 0x200;
			CK((*filter->output_function)((s1 >> 8) & 0xFF, filter->data));
			CK((*filter->output_function)(s1 & 0x7F, filter->data));
		} else {
			/* The code does not fit any supported character set. Report it
			 * as unencodable rather than dropping the character silently;
			 * mb_wchar_to_iso2022jp_kddi treats this case the same way. */
			CK(mbfl_filt_conv_illegal_output(c, filter));
		}
	} else {
		CK(mbfl_filt_conv_illegal_output(c, filter));
	}

	return 0;
}
1271 
/* Finish encoding: switch the output back to ASCII if another character set
 * is selected, write out any '#' or digit which was being held back as the
 * possible start of a keypad emoji, reset the filter state, and invoke the
 * flush callback if one is set. Results of the callbacks are not inspected;
 * always returns 0. */
static int mbfl_filt_conv_wchar_2022jp_mobile_flush(mbfl_convert_filter *filter)
{
	/* Go back to ASCII mode (so strings can be safely concatenated) */
	if ((filter->status & 0xFF00) != 0) {
		(*filter->output_function)(0x1B, filter->data); /* ESC */
		(*filter->output_function)('(', filter->data);
		(*filter->output_function)('B', filter->data);
	}

	int held = filter->cache;
	int is_keypad_char = (held == '#') || (held >= '0' && held <= '9');

	if ((filter->status & 0xFF) == 1 && is_keypad_char) {
		(*filter->output_function)(held, filter->data);
	}

	filter->cache = 0;
	filter->status = 0;

	if (!filter->flush_function) {
		return 0;
	}

	(*filter->flush_function)(filter->data);
	return 0;
}
1293 
mb_iso2022jp_kddi_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)1294 static size_t mb_iso2022jp_kddi_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
1295 {
1296 	unsigned char *p = *in, *e = p + *in_len;
1297 	uint32_t *out = buf, *limit = buf + bufsize - 1;
1298 
1299 	while (p < e && out < limit) {
1300 		unsigned char c = *p++;
1301 
1302 		if (c == 0x1B) {
1303 			if ((e - p) < 2) {
1304 				p = e;
1305 				*out++ = MBFL_BAD_INPUT;
1306 				break;
1307 			}
1308 			unsigned char c2 = *p++;
1309 			unsigned char c3 = *p++;
1310 
1311 			if (c2 == '$') {
1312 				if (c3 == '@' || c3 == 'B') {
1313 					*state = JISX0208_KANJI;
1314 				} else if (c3 == '(') {
1315 					if (p == e) {
1316 						*out++ = MBFL_BAD_INPUT;
1317 						break;
1318 					}
1319 					unsigned char c4 = *p++;
1320 
1321 					if (c4 == '@' || c4 == 'B') {
1322 						*state = JISX0208_KANJI;
1323 					} else {
1324 						*out++ = MBFL_BAD_INPUT;
1325 					}
1326 				} else {
1327 					*out++ = MBFL_BAD_INPUT;
1328 				}
1329 			} else if (c2 == '(') {
1330 				if (c3 == 'B' || c3 == 'J') {
1331 					*state = ASCII;
1332 				} else if (c3 == 'I') {
1333 					*state = JISX0201_KANA;
1334 				} else {
1335 					*out++ = MBFL_BAD_INPUT;
1336 				}
1337 			} else {
1338 				p--;
1339 				*out++ = MBFL_BAD_INPUT;
1340 			}
1341 		} else if (*state == JISX0201_KANA && c >= 0x21 && c <= 0x5F) {
1342 			*out++ = 0xFF40 + c;
1343 		} else if (*state == JISX0208_KANJI && c >= 0x21 && c <= 0x7F) {
1344 			if (p == e) {
1345 				*out++ = MBFL_BAD_INPUT;
1346 				break;
1347 			}
1348 			unsigned char c2 = *p++;
1349 
1350 			if (c2 >= 0x21 && c2 <= 0x7E) {
1351 				unsigned int s = ((c - 0x21) * 94) + c2 - 0x21;
1352 				uint32_t w = 0;
1353 
1354 				if (s <= 137) {
1355 					if (s == 31) {
1356 						w = 0xFF3C; /* FULLWIDTH REVERSE SOLIDUS */
1357 					} else if (s == 32) {
1358 						w = 0xFF5E; /* FULLWIDTH TILDE */
1359 					} else if (s == 33) {
1360 						w = 0x2225; /* PARALLEL TO */
1361 					} else if (s == 60) {
1362 						w = 0xFF0D; /* FULLWIDTH HYPHEN-MINUS */
1363 					} else if (s == 80) {
1364 						w = 0xFFE0; /* FULLWIDTH CENT SIGN */
1365 					} else if (s == 81) {
1366 						w = 0xFFE1; /* FULLWIDTH POUND SIGN */
1367 					} else if (s == 137) {
1368 						w = 0xFFE2; /* FULLWIDTH NOT SIGN */
1369 					}
1370 				}
1371 
1372 				if (s >= (84 * 94) && s < (91 * 94)) {
1373 					int snd = 0;
1374 					s += 22 * 94;
1375 					w = mbfilter_sjis_emoji_kddi2unicode(s, &snd);
1376 					if (w && snd) {
1377 						*out++ = snd;
1378 					}
1379 				}
1380 
1381 				if (!w) {
1382 					if (s >= cp932ext1_ucs_table_min && s < cp932ext1_ucs_table_max) {
1383 						w = cp932ext1_ucs_table[s - cp932ext1_ucs_table_min];
1384 					} else if (s < jisx0208_ucs_table_size) {
1385 						w = jisx0208_ucs_table[s];
1386 					}
1387 				}
1388 
1389 				*out++ = w ? w : MBFL_BAD_INPUT;
1390 			} else {
1391 				*out++ = MBFL_BAD_INPUT;
1392 			}
1393 		} else if (c <= 0x7F) {
1394 			*out++ = c;
1395 		} else if (c >= 0xA1 && c <= 0xDF) {
1396 			*out++ = 0xFEC0 + c;
1397 		} else {
1398 			*out++ = MBFL_BAD_INPUT;
1399 		}
1400 	}
1401 
1402 	*in_len = e - p;
1403 	*in = p;
1404 	return out - buf;
1405 }
1406 
/* Pack a KDDI emoji code, given as a (ku*94)+ten value, into the two-byte
 * form which is written to the output in JIS X 0208 mode */
static inline unsigned int iso2022jp_kddi_pack_emoji(unsigned int code)
{
	return (((code / 94) + 0x21) << 8) + ((code % 94) + 0x21) - 0x1600;
}

/* Encode `len` codepoints from `in` as ISO-2022-JP with KDDI emoji
 * extensions, appending the bytes to `buf`.
 *
 * buf->state records the character set currently selected in the output
 * (ASCII, JISX0201_KANA or JISX0208_KANJI); an escape sequence is written
 * whenever it has to change. Keypad emoji ('#' or a digit followed by U+20E3)
 * and national flags (a pair of Regional Indicator codepoints) take up two
 * input codepoints, but only if both are present in this batch of input.
 * Codepoints which cannot be encoded are reported through MB_CONVERT_ERROR.
 *
 * If `end` is true and the output is not in ASCII mode, ESC ( B is appended
 * after the last character. */
static void mb_wchar_to_iso2022jp_kddi(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	while (len--) {
		uint32_t w = *in++;
		unsigned int s = 0;

		if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) {
			s = ucs_a1_jis_table[w - ucs_a1_jis_table_min];
		} else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) {
			s = ucs_a2_jis_table[w - ucs_a2_jis_table_min];
		} else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) {
			s = ucs_i_jis_table[w - ucs_i_jis_table_min];
		} else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) {
			s = ucs_r_jis_table[w - ucs_r_jis_table_min];
		}

		if (!s) {
			switch (w) {
			case 0xA5: /* YEN SIGN */
				s = 0x216F; /* FULLWIDTH YEN SIGN */
				break;
			case 0xFF3C: /* FULLWIDTH REVERSE SOLIDUS */
				s = 0x2140;
				break;
			case 0x2225: /* PARALLEL TO */
				s = 0x2142;
				break;
			case 0xFF0D: /* FULLWIDTH HYPHEN-MINUS */
				s = 0x215D;
				break;
			case 0xFFE0: /* FULLWIDTH CENT SIGN */
				s = 0x2171;
				break;
			case 0xFFE1: /* FULLWIDTH POUND SIGN */
				s = 0x2172;
				break;
			case 0xFFE2: /* FULLWIDTH NOT SIGN */
				s = 0x224C;
				break;
			default:
				break;
			}
		}

		if ((w == '#' || (w >= '0' && w <= '9')) && len) {
			/* Keypad emoji? Only consume the next codepoint if it is U+20E3 */
			if (*in == 0x20E3) {
				unsigned int keypad;

				in++;
				len--;

				if (w == '#') {
					keypad = 0x25BC;
				} else if (w == '0') {
					keypad = 0x2830;
				} else { /* Previous character was '1'-'9' */
					keypad = 0x27A6 + (w - '1');
				}
				s = iso2022jp_kddi_pack_emoji(keypad);
			}
		} else if (w >= NFLAGS('C') && w <= NFLAGS('U') && len) { /* C for CN, U for US */
			/* Flag emoji? Only consume the next codepoint if the pair is a known flag */
			uint32_t next = *in;
			bool is_flag = false;

			if (next >= NFLAGS('B') && next <= NFLAGS('U')) { /* B for GB, U for RU */
				for (int flag = 0; flag < 10; flag++) {
					if (w == NFLAGS(nflags_s[flag][0]) && next == NFLAGS(nflags_s[flag][1])) {
						s = iso2022jp_kddi_pack_emoji(nflags_code_kddi[flag]);
						is_flag = true;
						break;
					}
				}
			}

			if (is_flag) {
				in++;
				len--;
			}
		}

		if (w == 0xA9) { /* Copyright sign */
			s = iso2022jp_kddi_pack_emoji(0x27DC);
		} else if (w == 0xAE) { /* Registered sign */
			s = iso2022jp_kddi_pack_emoji(0x27DD);
		} else if (w >= mb_tbl_uni_kddi2code2_min && w <= mb_tbl_uni_kddi2code2_max) {
			int pos = mbfl_bisec_srch2(w, mb_tbl_uni_kddi2code2_key, mb_tbl_uni_kddi2code2_len);
			if (pos >= 0) {
				s = iso2022jp_kddi_pack_emoji(mb_tbl_uni_kddi2code2_value[pos]);
			}
		} else if (w >= mb_tbl_uni_kddi2code3_min && w <= mb_tbl_uni_kddi2code3_max) {
			int pos = mbfl_bisec_srch2(w - 0x10000, mb_tbl_uni_kddi2code3_key, mb_tbl_uni_kddi2code3_len);
			if (pos >= 0) {
				s = iso2022jp_kddi_pack_emoji(mb_tbl_uni_kddi2code3_value[pos]);
			}
		} else if (w >= mb_tbl_uni_kddi2code5_min && w <= mb_tbl_uni_kddi2code5_max) {
			int pos = mbfl_bisec_srch2(w - 0xF0000, mb_tbl_uni_kddi2code5_key, mb_tbl_uni_kddi2code5_len);
			if (pos >= 0) {
				s = iso2022jp_kddi_pack_emoji(mb_tbl_uni_kddi2code5_val[pos]);
			}
		}

		if (!s || s >= 0xA1A1) {
			/* Last resort: linear search of the CP932 extension table */
			s = 0;
			for (int ext = 0; ext < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; ext++) {
				if (w == cp932ext1_ucs_table[ext]) {
					s = (((ext / 94) + 0x2D) << 8) + (ext % 94) + 0x21;
					break;
				}
			}
			/* U+0000 must stay zero, even if it matched a table entry */
			if (w == 0) {
				s = 0;
			}
		}

		if (!s && w) {
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_iso2022jp_kddi);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		} else if (s <= 0x7F) {
			/* ASCII */
			if (buf->state != ASCII) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
				out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
				buf->state = ASCII;
			}
			out = mb_convert_buf_add(out, s);
		} else if (s >= 0xA1 && s <= 0xDF) {
			/* JIS X 0201 kana */
			if (buf->state != JISX0201_KANA) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
				out = mb_convert_buf_add3(out, 0x1B, '(', 'I');
				buf->state = JISX0201_KANA;
			}
			out = mb_convert_buf_add(out, s & 0x7F);
		} else if (s <= 0x7E7E) {
			/* JIS X 0208 (two bytes) */
			if (buf->state != JISX0208_KANJI) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 5);
				out = mb_convert_buf_add3(out, 0x1B, '$', 'B');
				buf->state = JISX0208_KANJI;
			} else {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			}
			out = mb_convert_buf_add2(out, (s >> 8) & 0xFF, s & 0xFF);
		} else {
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_iso2022jp_kddi);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		}
	}

	if (end && buf->state != ASCII) {
		MB_CONVERT_BUF_ENSURE(buf, out, limit, 3);
		out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
1555 
/* Decode one input byte `c` of EUC-JP-2004, Shift_JIS-2004 or
 * ISO-2022-JP-2004 text (selected by filter->from->no_encoding), passing the
 * resulting codepoint(s) to filter->output_function.
 *
 * The low 4 bits of filter->status hold the parser position:
 *   0 = start of a character
 *   1 = expecting the 2nd byte of a double-byte character
 *   2 = seen 0x8E (EUC-JP-2004 kana)
 *   3 = seen 0x8F (EUC-JP-2004, JIS X 0213 plane 2)
 *   4 = expecting the 2nd byte of a JIS X 0213 plane 2 character
 *   5 = expecting the 2nd byte of a JIS X 0208 character (ISO-2022-JP-2004)
 *   6 = seen ESC, 7 = seen ESC $, 8 = seen ESC $ (, 9 = seen ESC (
 * For ISO-2022-JP-2004, the upper bits hold the selected character set:
 * 0 for ASCII, 0x80 for JIS X 0208, 0x90 for JIS X 0213 plane 1 and 0xa0 for
 * JIS X 0213 plane 2.
 *
 * Invalid input is reported by emitting MBFL_BAD_INPUT. Returns 0; every
 * emit goes through CK(), which is defined outside this view and presumably
 * returns early with an error when the output callback fails. */
static int mbfl_filt_conv_jis2004_wchar(int c, mbfl_convert_filter *filter)
{
	int k;
	int c1, c2, s, s1 = 0, s2 = 0, w = 0, w1;

	switch (filter->status & 0xf) {
	case 0:
		if (c >= 0 && c < 0x80) { /* latin */
			if (filter->from->no_encoding == mbfl_no_encoding_eucjp2004) {
				CK((*filter->output_function)(c, filter->data));
			} else if (filter->from->no_encoding == mbfl_no_encoding_sjis2004) {
				if (c == 0x5c) {
					CK((*filter->output_function)(0x00a5, filter->data));
				} else if (c == 0x7e) {
					CK((*filter->output_function)(0x203e, filter->data));
				} else {
					CK((*filter->output_function)(c, filter->data));
				}
			} else { /* ISO-2022-JP-2004 */
				if (c == 0x1b) {
					filter->status += 6;
				} else if ((filter->status == 0x80 || filter->status == 0x90 || filter->status == 0xa0)
				   && c > 0x20 && c < 0x7f) { /* kanji first char */
					filter->cache = c;
					if (filter->status == 0x90) {
						filter->status += 1; /* JIS X 0213 plane 1 */
					} else if (filter->status == 0xa0) {
						filter->status += 4; /* JIS X 0213 plane 2 */
					} else {
						filter->status += 5; /* JIS X 0208 */
					}
				} else {
					CK((*filter->output_function)(c, filter->data));
				}
			}
		} else {
			if (filter->from->no_encoding == mbfl_no_encoding_eucjp2004) {
				if (c > 0xa0 && c < 0xff) { /* X 0213 plane 1 first char */
					filter->status = 1;
					filter->cache = c;
				} else if (c == 0x8e) { /* kana first char */
					filter->cache = 0x8E; /* So error will be reported if input is truncated right here */
					filter->status = 2;
				} else if (c == 0x8f) { /* X 0213 plane 2 first char */
					filter->status = 3;
				} else {
					CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
				}
			} else if (filter->from->no_encoding == mbfl_no_encoding_sjis2004) {
				if (c > 0xa0 && c < 0xe0) { /* kana */
					CK((*filter->output_function)(0xfec0 + c, filter->data));
				} else if (c > 0x80 && c < 0xfd && c != 0xa0) {	/* kanji first char */
					filter->status = 1;
					filter->cache = c;
				} else {
					CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
				}
			} else {
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			}
		}
		break;

	case 1: /* kanji second char */
		filter->status &= ~0xf;
		c1 = filter->cache;

		/* Bring both bytes into the 0x21-0x7E form, whatever the encoding */
		if (filter->from->no_encoding == mbfl_no_encoding_eucjp2004) {
			if (c > 0xa0 && c < 0xff) {
				s1 = c1 - 0x80;
				s2 = c - 0x80;
			} else {
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
				break;
			}
		} else if (filter->from->no_encoding == mbfl_no_encoding_sjis2004) {
			if (c >= 0x40 && c <= 0xfc && c != 0x7f) {
				SJIS_DECODE(c1, c, s1, s2);
			} else {
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
				break;
			}
		} else { /* ISO-2022-JP-2004 */
			if (c >= 0x21 && c <= 0x7E) {
				s1 = c1;
				s2 = c;
			} else {
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
				break;
			}
		}
		w1 = (s1 << 8) | s2;

		/* conversion for combining characters: these decode to a pair of
		 * codepoints; the first is emitted here, the second further below */
		if ((w1 >= 0x2477 && w1 <= 0x247B) ||
			(w1 >= 0x2577 && w1 <= 0x257E) || w1 == 0x2678 || w1 == 0x2B44 ||
			(w1 >= 0x2B48 && w1 <= 0x2B4F) || (w1 >= 0x2B65 && w1 <= 0x2B66)) {
			k = mbfl_bisec_srch2(w1, jisx0213_u2_key, jisx0213_u2_tbl_len);
			if (k >= 0) {
				w = jisx0213_u2_tbl[2*k];
				CK((*filter->output_function)(w, filter->data));
				w = jisx0213_u2_tbl[2*k+1];
			}
		}

		/* conversion for BMP  */
		if (w <= 0) {
			/* From here on, w1 is the (ku*94)+ten index */
			w1 = (s1 - 0x21)*94 + s2 - 0x21;
			if (w1 >= 0 && w1 < jisx0213_ucs_table_size) {
				w = jisx0213_ucs_table[w1];
			}
		}

		/* conversion for CJK Unified Ideographs ext.B (U+2XXXX) */
		if (w <= 0) {
			k = mbfl_bisec_srch2(w1, jisx0213_jis_u5_key, jisx0213_u5_tbl_len);
			if (k >= 0) {
				w = jisx0213_jis_u5_tbl[k] + 0x20000;
			}
		}

		if (w <= 0) {
			w = MBFL_BAD_INPUT;
		}
		CK((*filter->output_function)(w, filter->data));
		break;

	case 2: /* got 0x8e: EUC-JP-2004 kana */
		filter->status = 0;
		if (c > 0xa0 && c < 0xe0) {
			w = 0xfec0 + c;
			CK((*filter->output_function)(w, filter->data));
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 3: /* X 0213 plane 2 first char: EUC-JP-2004 (0x8f) */
		if (c == 0xA1 || (c >= 0xA3 && c <= 0xA5) || c == 0xA8 || (c >= 0xAC && c <= 0xAF) || (c >= 0xEE && c <= 0xFE)) {
			filter->cache = c - 0x80;
			filter->status++;
		} else {
			filter->status = 0;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 4: /* X 0213 plane 2 second char: EUC-JP-2004, ISO-2022-JP-2004 */
		filter->status &= ~0xF;
		c1 = filter->cache;
		if (filter->from->no_encoding == mbfl_no_encoding_eucjp2004) {
			c2 = c - 0x80;
		} else {
			c2 = c;
		}

		if (c2 < 0x21 || c2 > 0x7E) {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			break;
		}

		s1 = c1 - 0x21;
		s2 = c2 - 0x21;

		if (((s1 >= 0 && s1 <= 4 && s1 != 1) || s1 == 7 || (s1 >= 11 && s1 <= 14) ||
			(s1 >= 77 && s1 < 94)) && s2 >= 0 && s2 < 94) {
			/* calc offset from ku */
			for (k = 0; k < jisx0213_p2_ofst_len; k++) {
				if (s1 == jisx0213_p2_ofst[k]) {
					break;
				}
			}
			if (k >= jisx0213_p2_ofst_len) {
				/* The ku is not listed in the offset table. Treat the
				 * character as invalid instead of reading past the end of
				 * the table. */
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
				break;
			}
			k -= jisx0213_p2_ofst[k];

			/* check for japanese chars in BMP */
			s = (s1 + 94 + k)*94 + s2;
			ZEND_ASSERT(s < jisx0213_ucs_table_size);
			w = jisx0213_ucs_table[s];

			/* check for japanese chars in CJK Unified Ideographs ext.B (U+2XXXX) */
			if (w <= 0) {
				k = mbfl_bisec_srch2(s, jisx0213_jis_u5_key, jisx0213_u5_tbl_len);
				if (k >= 0) {
					w = jisx0213_jis_u5_tbl[k] + 0x20000;
				}
			}

			if (w <= 0) {
				w = MBFL_BAD_INPUT;
			}

			CK((*filter->output_function)(w, filter->data));
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 5: /* X 0208: ISO-2022-JP-2004 */
		filter->status &= ~0xf;
		c1 = filter->cache;
		if (c > 0x20 && c < 0x7f) {
			s = (c1 - 0x21)*94 + c - 0x21;
			if (s >= 0 && s < jisx0208_ucs_table_size) {
				w = jisx0208_ucs_table[s];
			}
		}

		if (w <= 0) {
			w = MBFL_BAD_INPUT;
		}

		CK((*filter->output_function)(w, filter->data));
		break;

	/* ESC: ISO-2022-JP-2004 */
/*	case 0x06:	*/
/*	case 0x16:	*/
/*	case 0x26:	*/
/*	case 0x86:	*/
/*	case 0x96:	*/
/*	case 0xa6:	*/
	case 6:
		if (c == '$') {
			filter->status++;
		} else if (c == '(') {
			filter->status += 3;
		} else {
			filter->status &= ~0xf;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	/* ESC $: ISO-2022-JP-2004 */
/*	case 0x07:	*/
/*	case 0x17:	*/
/*	case 0x27:	*/
/*	case 0x87:	*/
/*	case 0x97:	*/
/*	case 0xa7:	*/
	case 7:
		if (c == 'B') { /* JIS X 0208-1983 */
			filter->status = 0x80;
		} else if (c == '(') {
			filter->status++;
		} else {
			filter->status &= ~0xf;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	/* ESC $ (: ISO-2022-JP-2004 */
/*	case 0x08:	*/
/*	case 0x18:	*/
/*	case 0x28:	*/
/*	case 0x88:	*/
/*	case 0x98:	*/
/*	case 0xa8:	*/
	case 8:
		if (c == 'Q') { /* JIS X 0213 plane 1 */
			filter->status = 0x90;
		} else if (c == 'P') { /* JIS X 0213 plane 2 */
			filter->status = 0xa0;
		} else {
			filter->status &= ~0xf;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	/* ESC (: ISO-2022-JP-2004 */
/*	case 0x09:	*/
/*	case 0x19:	*/
/*	case 0x29:	*/
/*	case 0x89:	*/
/*	case 0x99:	*/
	case 9:
		if (c == 'B') {
			filter->status = 0;
		} else {
			filter->status &= ~0xf;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

		EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
1844 
/* Finish decoding: if the input stopped in the middle of a character or an
 * escape sequence (low 4 bits of status are non-zero), emit MBFL_BAD_INPUT.
 * Then reset the status and hand over to the flush callback, if one is set.
 *
 * Returns the flush callback's result, or 0 if there is no callback. */
static int mbfl_filt_conv_jis2004_wchar_flush(mbfl_convert_filter *filter)
{
	int unfinished = filter->status & 0xF;

	if (unfinished != 0) {
		CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
	}
	filter->status = 0;

	if (!filter->flush_function) {
		return 0;
	}

	return (*filter->flush_function)(filter->data);
}
1858 
mbfl_filt_conv_wchar_jis2004(int c,mbfl_convert_filter * filter)1859 static int mbfl_filt_conv_wchar_jis2004(int c, mbfl_convert_filter *filter)
1860 {
1861 	int k;
1862 	int c1, c2, s1, s2;
1863 
1864 retry:
1865 	s1 = 0;
1866 	/* check for 1st char of combining characters */
1867 	if ((filter->status & 0xf) == 0 && (
1868 			c == 0x00E6 ||
1869 			(c >= 0x0254 && c <= 0x02E9) ||
1870 			(c >= 0x304B && c <= 0x3053) ||
1871 			(c >= 0x30AB && c <= 0x30C8) ||
1872 			c == 0x31F7)) {
1873 		for (k = 0; k < jisx0213_u2_tbl_len; k++) {
1874 			if (c == jisx0213_u2_tbl[2*k]) {
1875 				filter->status++;
1876 				filter->cache = k;
1877 				return 0;
1878 			}
1879 		}
1880 	}
1881 
1882 	/* check for 2nd char of combining characters */
1883 	if ((filter->status & 0xf) == 1 && filter->cache >= 0 && filter->cache < jisx0213_u2_tbl_len) {
1884 		k = filter->cache;
1885 		filter->status &= ~0xf;
1886 		filter->cache = 0;
1887 
1888 		c1 = jisx0213_u2_tbl[2*k];
1889 		if ((c1 == 0x0254 || c1 == 0x028C || c1 == 0x0259 || c1 == 0x025A) && c == 0x0301) {
1890 			k++;
1891 		}
1892 		if (c == jisx0213_u2_tbl[2*k+1]) {
1893 			s1 = jisx0213_u2_key[k];
1894 		} else { /* fallback */
1895 			s1 = jisx0213_u2_fb_tbl[k];
1896 
1897 			if (filter->to->no_encoding == mbfl_no_encoding_sjis2004) {
1898 				c1 = (s1 >> 8) & 0xff;
1899 				c2 = s1 & 0xff;
1900 				SJIS_ENCODE(c1, c2, s1, s2);
1901 			} else if (filter->to->no_encoding == mbfl_no_encoding_eucjp2004) {
1902 				s2 = (s1 & 0xff) + 0x80;
1903 				s1 = ((s1 >> 8) & 0xff) + 0x80;
1904 			} else {
1905 				if (filter->status != 0x200) {
1906 					CK((*filter->output_function)(0x1b, filter->data));
1907 					CK((*filter->output_function)('$', filter->data));
1908 					CK((*filter->output_function)('(', filter->data));
1909 					CK((*filter->output_function)('Q', filter->data));
1910 				}
1911 				filter->status = 0x200;
1912 
1913 				s2 = s1 & 0x7f;
1914 				s1 = (s1 >> 8) & 0x7f;
1915 			}
1916 
1917 			/* Flush out cached data */
1918 			CK((*filter->output_function)(s1, filter->data));
1919 			CK((*filter->output_function)(s2, filter->data));
1920 			goto retry;
1921 		}
1922 	}
1923 
1924 	/* check for major japanese chars: U+4E00 - U+9FFF */
1925 	if (s1 <= 0) {
1926 		for (k = 0; k < uni2jis_tbl_len; k++) {
1927 			if (c >= uni2jis_tbl_range[k][0] && c <= uni2jis_tbl_range[k][1]) {
1928 				s1 = uni2jis_tbl[k][c-uni2jis_tbl_range[k][0]];
1929 				break;
1930 			}
1931 		}
1932 	}
1933 
1934 	/* check for japanese chars in compressed mapping area: U+1E00 - U+4DBF */
1935 	if (s1 <= 0 && c >= ucs_c1_jisx0213_min && c <= ucs_c1_jisx0213_max) {
1936 		k = mbfl_bisec_srch(c, ucs_c1_jisx0213_tbl, ucs_c1_jisx0213_tbl_len);
1937 		if (k >= 0) {
1938 			s1 = ucs_c1_jisx0213_ofst[k] + c - ucs_c1_jisx0213_tbl[2*k];
1939 		}
1940 	}
1941 
1942 	/* check for japanese chars in CJK Unified Ideographs ext.B (U+2XXXX) */
1943 	if (s1 <= 0 && c >= jisx0213_u5_tbl_min && c <= jisx0213_u5_tbl_max) {
1944 		k = mbfl_bisec_srch2(c - 0x20000, jisx0213_u5_jis_key, jisx0213_u5_tbl_len);
1945 		if (k >= 0) {
1946 			s1 = jisx0213_u5_jis_tbl[k];
1947 		}
1948 	}
1949 
1950 	if (s1 <= 0) {
1951 		/* CJK Compatibility Forms: U+FE30 - U+FE4F */
1952 		if (c == 0xfe45) {
1953 			s1 = 0x233e;
1954 		} else if (c == 0xfe46) {
1955 			s1 = 0x233d;
1956 		} else if (c >= 0xf91d && c <= 0xf9dc) {
1957 			/* CJK Compatibility Ideographs: U+F900 - U+F92A */
1958 			k = mbfl_bisec_srch2(c, ucs_r2b_jisx0213_cmap_key, ucs_r2b_jisx0213_cmap_len);
1959 			if (k >= 0) {
1960 				s1 = ucs_r2b_jisx0213_cmap_val[k];
1961 			}
1962 		}
1963 	}
1964 
1965 	if (s1 <= 0) {
1966 		if (c == 0) {
1967 			s1 = 0;
1968 		} else {
1969 			s1 = -1;
1970 		}
1971 	}
1972 
1973 	if (s1 >= 0) {
1974 		if (s1 < 0x80) { /* ASCII */
1975 			if (filter->to->no_encoding == mbfl_no_encoding_2022jp_2004 && (filter->status & 0xff00)) {
1976 				CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
1977 				CK((*filter->output_function)('(', filter->data));
1978 				CK((*filter->output_function)('B', filter->data));
1979 			}
1980 			filter->status = 0;
1981 			CK((*filter->output_function)(s1, filter->data));
1982 		} else if (s1 < 0x100) { /* latin or kana */
1983 			if (filter->to->no_encoding == mbfl_no_encoding_eucjp2004) {
1984 				CK((*filter->output_function)(0x8e, filter->data));
1985 				CK((*filter->output_function)(s1, filter->data));
1986 			} else if (filter->to->no_encoding == mbfl_no_encoding_sjis2004 && (s1 >= 0xA1 && s1 <= 0xDF)) {
1987 				CK((*filter->output_function)(s1, filter->data));
1988 			} else {
1989 				CK(mbfl_filt_conv_illegal_output(c, filter));
1990 			}
1991 		} else if (s1 < 0x7f00) { /* X 0213 plane 1 */
1992 			if (filter->to->no_encoding == mbfl_no_encoding_sjis2004) {
1993 				c1 = (s1 >> 8) & 0xff;
1994 				c2 = s1 & 0xff;
1995 				SJIS_ENCODE(c1, c2, s1, s2);
1996 			} else if  (filter->to->no_encoding == mbfl_no_encoding_eucjp2004) {
1997 				s2 = (s1 & 0xff) + 0x80;
1998 				s1 = ((s1 >> 8) & 0xff) + 0x80;
1999 			} else {
2000 				if ((filter->status & 0xff00) != 0x200) {
2001 					CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
2002 					CK((*filter->output_function)('$', filter->data));
2003 					CK((*filter->output_function)('(', filter->data));
2004 					CK((*filter->output_function)('Q', filter->data));
2005 				}
2006 				filter->status = 0x200;
2007 				s2 = s1 & 0xff;
2008 				s1 = (s1 >> 8) & 0xff;
2009 			}
2010 			CK((*filter->output_function)(s1, filter->data));
2011 			CK((*filter->output_function)(s2, filter->data));
2012 		} else { /* X 0213 plane 2 */
2013 			if (filter->to->no_encoding == mbfl_no_encoding_sjis2004) {
2014 				c1 = (s1 >> 8) & 0xff;
2015 				c2 = s1 & 0xff;
2016 				SJIS_ENCODE(c1, c2, s1, s2);
2017 			} else {
2018 				s2 = s1 & 0xff;
2019 				k = ((s1 >> 8) & 0xff) - 0x7f;
2020 				if (k >= 0 && k < jisx0213_p2_ofst_len) {
2021 					s1 = jisx0213_p2_ofst[k] + 0x21;
2022 				}
2023 				if  (filter->to->no_encoding == mbfl_no_encoding_eucjp2004) {
2024 					s2 |= 0x80;
2025 					s1 |= 0x80;
2026 					CK((*filter->output_function)(0x8f, filter->data));
2027 				} else {
2028 					if ((filter->status & 0xff00) != 0x200) {
2029 						CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
2030 						CK((*filter->output_function)('$', filter->data));
2031 						CK((*filter->output_function)('(', filter->data));
2032 						CK((*filter->output_function)('P', filter->data));
2033 					}
2034 					filter->status = 0x200;
2035 				}
2036 			}
2037 
2038 			CK((*filter->output_function)(s1, filter->data));
2039 			CK((*filter->output_function)(s2, filter->data));
2040 		}
2041 	} else {
2042 		CK(mbfl_filt_conv_illegal_output(c, filter));
2043 	}
2044 
2045 	return 0;
2046 }
2047 
/* Flush function for the wchar -> SJIS-2004 / EUC-JP-2004 / ISO-2022-JP-2004
 * conversion filter.
 *
 * The conversion function holds back a codepoint which might be the first half
 * of a two-codepoint sequence: it increments `filter->status` (setting the low
 * nibble to 1) and stores an index into the jisx0213_u2_* tables in
 * `filter->cache`. If the input ends while such a codepoint is still pending,
 * it is emitted here using its stand-alone form from `jisx0213_u2_fb_tbl`.
 *
 * Afterwards, if a charset other than ASCII was selected (high byte of
 * `filter->status` nonzero), the escape sequence ESC ( B is emitted.
 *
 * Returns the result of the downstream flush function, or 0 if there is none.
 * Every output call is wrapped in CK. */
static int mbfl_filt_conv_wchar_jis2004_flush(mbfl_convert_filter *filter)
{
	int k, c1, c2, s1, s2;

	k = filter->cache;
	filter->cache = 0;

	/* Only the low nibble marks a pending codepoint; the high byte holds the
	 * currently selected charset and may be nonzero (e.g. status 0x201), so it
	 * must be masked off, just as the conversion function does.
	 * `k` is used as a table index, hence the exclusive upper bound. */
	if ((filter->status & 0xf) == 1 && k >= 0 && k < jisx0213_u2_tbl_len) {
		s1 = jisx0213_u2_fb_tbl[k];

		if (filter->to->no_encoding == mbfl_no_encoding_sjis2004) {
			c1 = (s1 >> 8) & 0xff;
			c2 = s1 & 0xff;
			SJIS_ENCODE(c1, c2, s1, s2);
		} else if (filter->to->no_encoding == mbfl_no_encoding_eucjp2004) {
			s2 = (s1 & 0xff) | 0x80;
			s1 = ((s1 >> 8) & 0xff) | 0x80;
		} else {
			s2 = s1 & 0x7f;
			s1 = (s1 >> 8) & 0x7f;
			/* Select JIS X 0213 plane 1 unless it is already selected */
			if ((filter->status & 0xff00) != 0x200) {
				CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
				CK((*filter->output_function)('$', filter->data));
				CK((*filter->output_function)('(', filter->data));
				CK((*filter->output_function)('Q', filter->data));
			}
			filter->status = 0x200;
		}

		CK((*filter->output_function)(s1, filter->data));
		CK((*filter->output_function)(s2, filter->data));
	}

	/* If we had switched to a different charset, go back to ASCII mode
	 * This makes it possible to concatenate arbitrary valid strings
	 * together and get a valid string */
	if (filter->status & 0xff00) {
		CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
		CK((*filter->output_function)('(', filter->data));
		CK((*filter->output_function)('B', filter->data));
	}

	filter->status = 0;

	if (filter->flush_function) {
		return (*filter->flush_function)(filter->data);
	}

	return 0;
}
2098 
2099 #define ASCII 0
2100 #define JISX0208 1
2101 #define JISX0213_PLANE1 2
2102 #define JISX0213_PLANE2 3
2103 
mb_iso2022jp2004_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)2104 static size_t mb_iso2022jp2004_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
2105 {
2106 	unsigned char *p = *in, *e = p + *in_len;
2107 	uint32_t *out = buf, *limit = buf + bufsize - 1;
2108 
2109 	while (p < e && out < limit) {
2110 		unsigned char c = *p++;
2111 
2112 		if (c <= 0x7F) {
2113 			if (c == 0x1B) {
2114 				if ((e - p) < 2) {
2115 					*out++ = MBFL_BAD_INPUT;
2116 					p = e;
2117 					break;
2118 				}
2119 				unsigned char c2 = *p++;
2120 				unsigned char c3 = *p++;
2121 				if (c2 == '$') {
2122 					if (c3 == 'B') {
2123 						*state = JISX0208;
2124 					} else if (c3 == '(') {
2125 						if (p == e) {
2126 							*out++ = MBFL_BAD_INPUT;
2127 							break;
2128 						}
2129 						unsigned char c4 = *p++;
2130 						if (c4 == 'Q') {
2131 							*state = JISX0213_PLANE1;
2132 						} else if (c4 == 'P') {
2133 							*state = JISX0213_PLANE2;
2134 						} else {
2135 							*out++ = MBFL_BAD_INPUT;
2136 						}
2137 					} else {
2138 						*out++ = MBFL_BAD_INPUT;
2139 					}
2140 				} else if (c2 == '(') {
2141 					if (c3 == 'B') {
2142 						*state = ASCII;
2143 					} else {
2144 						*out++ = MBFL_BAD_INPUT;
2145 					}
2146 				} else {
2147 					p--;
2148 					*out++ = MBFL_BAD_INPUT;
2149 				}
2150 			} else if (*state >= JISX0208 && c > 0x20 && c < 0x7F) {
2151 				if (p == e) {
2152 					*out++ = MBFL_BAD_INPUT;
2153 					break;
2154 				}
2155 				unsigned char c2 = *p++;
2156 				if (c2 < 0x21 || c2 > 0x7E) {
2157 					*out++ = MBFL_BAD_INPUT;
2158 					continue;
2159 				}
2160 
2161 				if (*state == JISX0213_PLANE1) {
2162 					unsigned int w1 = (c << 8) | c2;
2163 
2164 					/* Conversion for combining characters */
2165 					if ((w1 >= 0x2477 && w1 <= 0x2479) || (w1 >= 0x2479 && w1 <= 0x247B) || (w1 >= 0x2577 && w1 <= 0x257E) || w1 == 0x2678 || w1 == 0x2B44 || (w1 >= 0x2B48 && w1 <= 0x2B4F) || (w1 >= 0x2B65 && w1 <= 0x2B66)) {
2166 						int k = mbfl_bisec_srch2(w1, jisx0213_u2_key, jisx0213_u2_tbl_len);
2167 						if (k >= 0) {
2168 							*out++ = jisx0213_u2_tbl[2*k];
2169 							*out++ = jisx0213_u2_tbl[2*k+1];
2170 							continue;
2171 						}
2172 					}
2173 
2174 					/* Conversion for BMP */
2175 					uint32_t w = 0;
2176 					w1 = (c - 0x21)*94 + c2 - 0x21;
2177 					if (w1 < jisx0213_ucs_table_size) {
2178 						w = jisx0213_ucs_table[w1];
2179 					}
2180 
2181 					/* Conversion for CJK Unified Ideographs ext.B (U+2XXXX) */
2182 					if (!w) {
2183 						int k = mbfl_bisec_srch2(w1, jisx0213_jis_u5_key, jisx0213_u5_tbl_len);
2184 						if (k >= 0) {
2185 							w = jisx0213_jis_u5_tbl[k] + 0x20000;
2186 						}
2187 					}
2188 
2189 					*out++ = w ? w : MBFL_BAD_INPUT;
2190 				} else if (*state == JISX0213_PLANE2) {
2191 
2192 					unsigned int s1 = c - 0x21, s2 = c2 - 0x21;
2193 
2194 					if (((s1 <= 4 && s1 != 1) || s1 == 7 || (s1 >= 11 && s1 <= 14) || (s1 >= 77 && s1 < 94)) && s2 < 94) {
2195 						int k;
2196 						for (k = 0; k < jisx0213_p2_ofst_len; k++) {
2197 							if (s1 == jisx0213_p2_ofst[k]) {
2198 								break;
2199 							}
2200 						}
2201 						k -= jisx0213_p2_ofst[k];
2202 
2203 						/* Check for Japanese chars in BMP */
2204 						unsigned int s = (s1 + 94 + k)*94 + s2;
2205 						ZEND_ASSERT(s < jisx0213_ucs_table_size);
2206 						uint32_t w = jisx0213_ucs_table[s];
2207 
2208 						/* Check for Japanese chars in CJK Unified Ideographs ext B (U+2XXXX) */
2209 						if (!w) {
2210 							k = mbfl_bisec_srch2(s, jisx0213_jis_u5_key, jisx0213_u5_tbl_len);
2211 							if (k >= 0) {
2212 								w = jisx0213_jis_u5_tbl[k] + 0x20000;
2213 							}
2214 						}
2215 
2216 						*out++ = w ? w : MBFL_BAD_INPUT;
2217 					} else {
2218 						*out++ = MBFL_BAD_INPUT;
2219 					}
2220 				} else { /* state == JISX0208 */
2221 					unsigned int s = (c - 0x21)*94 + c2 - 0x21;
2222 					uint32_t w = 0;
2223 					if (s < jisx0208_ucs_table_size) {
2224 						w = jisx0208_ucs_table[s];
2225 					}
2226 					*out++ = w ? w : MBFL_BAD_INPUT;
2227 				}
2228 			} else {
2229 				*out++ = c;
2230 			}
2231 		} else {
2232 			*out++ = MBFL_BAD_INPUT;
2233 		}
2234 	}
2235 
2236 	*in_len = e - p;
2237 	*in = p;
2238 	return out - buf;
2239 }
2240 
/* Encode `len` codepoints from `in` as ISO-2022-JP-2004, appending to `buf`.
 *
 * The low byte of `buf->state` holds the currently selected charset. When the
 * final codepoint of a chunk might combine with a following one and `end` is
 * false, (table index + 1) is saved in the bits above the low byte and the
 * codepoint is re-processed at the start of the next call. When `end` is true
 * and the output is not in ASCII mode, ESC ( B is appended. */
static void mb_wchar_to_iso2022jp2004(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	uint32_t cp;
	if (buf->state & 0xFF00) {
		/* A codepoint was held back by the previous call; recover it from the
		 * saved table index and handle it before reading any new input */
		int saved = (buf->state >> 8) - 1;
		cp = jisx0213_u2_tbl[2*saved];
		buf->state &= 0xFF;
		goto process_codepoint;
	}

	while (len--) {
		cp = *in++;
process_codepoint: ;
		unsigned int jis = 0;

		/* Codepoints which may form a pair with the following codepoint */
		if (cp == 0xE6 || (cp >= 0x254 && cp <= 0x2E9) || (cp >= 0x304B && cp <= 0x3053) || (cp >= 0x30AB && cp <= 0x30C8) || cp == 0x31F7) {
			for (int idx = 0; idx < jisx0213_u2_tbl_len; idx++) {
				if (cp != jisx0213_u2_tbl[2*idx]) {
					continue;
				}

				if (!len) {
					if (!end) {
						/* More input may follow; remember this codepoint and stop */
						buf->state |= (idx+1) << 8;
						MB_CONVERT_BUF_STORE(buf, out, limit);
						return;
					}
				} else {
					/* Peek at the next codepoint; it is consumed only on a match */
					uint32_t following = *in;
					if ((cp == 0x254 || cp == 0x28C || cp == 0x259 || cp == 0x25A) && following == 0x301) {
						idx++;
					}
					if (following == jisx0213_u2_tbl[2*idx+1]) {
						in++;
						len--;
						jis = jisx0213_u2_key[idx];
						break;
					}
				}

				/* No pair was formed; use the stand-alone mapping */
				jis = jisx0213_u2_fb_tbl[idx];
				break;
			}
		}

		/* Major Japanese characters: U+4E00-U+9FFF */
		if (!jis) {
			for (int r = 0; r < uni2jis_tbl_len; r++) {
				if (cp >= uni2jis_tbl_range[r][0] && cp <= uni2jis_tbl_range[r][1]) {
					jis = uni2jis_tbl[r][cp - uni2jis_tbl_range[r][0]];
					break;
				}
			}
		}

		/* Compressed mapping area: U+1E00-U+4DBF */
		if (!jis && cp >= ucs_c1_jisx0213_min && cp <= ucs_c1_jisx0213_max) {
			int found = mbfl_bisec_srch(cp, ucs_c1_jisx0213_tbl, ucs_c1_jisx0213_tbl_len);
			if (found >= 0) {
				jis = ucs_c1_jisx0213_ofst[found] + cp - ucs_c1_jisx0213_tbl[2*found];
			}
		}

		/* CJK Unified Ideographs ext B (U+2XXXX) */
		if (!jis && cp >= jisx0213_u5_tbl_min && cp <= jisx0213_u5_tbl_max) {
			int found = mbfl_bisec_srch2(cp - 0x20000, jisx0213_u5_jis_key, jisx0213_u5_tbl_len);
			if (found >= 0) {
				jis = jisx0213_u5_jis_tbl[found];
			}
		}

		if (!jis) {
			if (cp == 0xFE45) {
				/* CJK Compatibility Forms: U+FE30-U+FE4F */
				jis = 0x233E;
			} else if (cp == 0xFE46) {
				jis = 0x233D;
			} else if (cp >= 0xF91D && cp <= 0xF9DC) {
				/* CJK Compatibility Ideographs: U+F900-U+F92A */
				int found = mbfl_bisec_srch2(cp, ucs_r2b_jisx0213_cmap_key, ucs_r2b_jisx0213_cmap_len);
				if (found >= 0) {
					jis = ucs_r2b_jisx0213_cmap_val[found];
				}
			}
		}

		/* Emit the result, switching charset first where necessary */
		if (!jis && cp) {
			MB_CONVERT_ERROR(buf, out, limit, cp, mb_wchar_to_iso2022jp2004);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		} else if (jis <= 0x7F) {
			if (buf->state != ASCII) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
				out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
				buf->state = ASCII;
			}
			out = mb_convert_buf_add(out, jis);
		} else if (jis <= 0xFF) {
			/* Single-byte values above 0x7F cannot be emitted in this encoding */
			MB_CONVERT_ERROR(buf, out, limit, cp, mb_wchar_to_iso2022jp2004);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		} else if (jis <= 0x7EFF) {
			/* JIS X 0213 plane 1 */
			if (buf->state != JISX0213_PLANE1) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 6);
				out = mb_convert_buf_add4(out, 0x1B, '$', '(', 'Q');
				buf->state = JISX0213_PLANE1;
			} else {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			}
			out = mb_convert_buf_add2(out, (jis >> 8) & 0xFF, jis & 0xFF);
		} else {
			/* JIS X 0213 plane 2 */
			if (buf->state != JISX0213_PLANE2) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 6);
				out = mb_convert_buf_add4(out, 0x1B, '$', '(', 'P');
				buf->state = JISX0213_PLANE2;
			} else {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			}
			unsigned int low = jis & 0xFF;
			int row_idx = ((jis >> 8) & 0xFF) - 0x7F;
			ZEND_ASSERT(row_idx < jisx0213_p2_ofst_len);
			jis = jisx0213_p2_ofst[row_idx] + 0x21;
			out = mb_convert_buf_add2(out, jis, low);
		}
	}

	if (end && buf->state != ASCII) {
		/* Finish in ASCII mode */
		MB_CONVERT_BUF_ENSURE(buf, out, limit, 3);
		out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
2373 
/* Byte-at-a-time decoder shared by the CP5022x encodings.
 *
 * `filter->status` is split in two:
 *   - the high nibble selects the active charset
 *     (0x00 ASCII, 0x10 JIS X 0201 latin, 0x20 JIS X 0201 kana,
 *      0x80 JIS X 0208, 0x90 JIS X 0212)
 *   - the low nibble is the parser position
 *     (0 idle, 1 second byte of a double-byte character expected,
 *      2 after ESC, 3 after ESC $, 4 after ESC $ (, 5 after ESC ( )
 *
 * Decoded codepoints (or MBFL_BAD_INPUT) are passed to the output function. */
static int mbfl_filt_conv_cp5022x_wchar(int c, mbfl_convert_filter *filter)
{
	int lead, pos, ucs;

retry:
	switch (filter->status & 0xf) {
	/* Idle: `c` starts something new */
	case 0:
		if (c == 0x1b) {
			/* ESC */
			filter->status += 2;
		} else if (c == 0x0e) {
			/* "kana in" */
			filter->status = 0x20;
		} else if (c == 0x0f) {
			/* "kana out" */
			filter->status = 0;
		} else if (filter->status == 0x10 && c == 0x5c) {
			/* JIS X 0201 latin: YEN SIGN */
			CK((*filter->output_function)(0xa5, filter->data));
		} else if (filter->status == 0x10 && c == 0x7e) {
			/* JIS X 0201 latin: OVER LINE */
			CK((*filter->output_function)(0x203e, filter->data));
		} else if (filter->status == 0x20 && c > 0x20 && c < 0x60) {
			/* JIS X 0201 kana */
			CK((*filter->output_function)(0xff40 + c, filter->data));
		} else if ((filter->status == 0x80 || filter->status == 0x90) && c > 0x20 && c <= 0x97) {
			/* First byte of a double-byte character; wait for the second */
			filter->cache = c;
			filter->status += 1;
		} else if (c >= 0 && c < 0x80) {
			/* latin, CTLs */
			CK((*filter->output_function)(c, filter->data));
		} else if (c > 0xa0 && c < 0xe0) {
			/* GR kana */
			CK((*filter->output_function)(0xfec0 + c, filter->data));
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	/* Second byte of a JIS X 0208 (0x81) or JIS X 0212 (0x91) character */
	case 1:
		filter->status &= ~0xf;
		lead = filter->cache;

		if (c <= 0x20 || c >= 0x7f) {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			break;
		}

		pos = (lead - 0x21)*94 + c - 0x21;
		ucs = 0;
		if (filter->status == 0x80) {
			if (pos >= cp932ext1_ucs_table_min && pos < cp932ext1_ucs_table_max) {
				ucs = cp932ext1_ucs_table[pos - cp932ext1_ucs_table_min];
			} else if (pos >= 0 && pos < jisx0208_ucs_table_size) {
				ucs = jisx0208_ucs_table[pos];
			} else if (pos >= cp932ext2_ucs_table_min && pos < cp932ext2_ucs_table_max) {
				ucs = cp932ext2_ucs_table[pos - cp932ext2_ucs_table_min];
			} else if (pos >= cp932ext3_ucs_table_min && pos < cp932ext3_ucs_table_max) {
				ucs = cp932ext3_ucs_table[pos - cp932ext3_ucs_table_min];
			} else if (pos >= 94 * 94 && pos < 114 * 94) {
				/* user-defined => PUA (Microsoft extended) */
				ucs = pos - 94*94 + 0xe000;
			}
		} else if (pos >= 0 && pos < jisx0212_ucs_table_size) {
			ucs = jisx0212_ucs_table[pos];
		}

		if (ucs <= 0) {
			ucs = MBFL_BAD_INPUT;
		}
		CK((*filter->output_function)(ucs, filter->data));
		break;

	/* After ESC */
	case 2:
		switch (c) {
		case '$':
			filter->status++;
			break;
		case '(':
			filter->status += 3;
			break;
		default:
			/* Not an escape sequence; report it and reprocess `c` normally */
			filter->status &= ~0xf;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			goto retry;
		}
		break;

	/* After ESC $ */
	case 3:
		switch (c) {
		case '@':
		case 'B':
			filter->status = 0x80;
			break;
		case '(':
			filter->status++;
			break;
		default:
			filter->status &= ~0xf;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			CK((*filter->output_function)(0x24, filter->data));
			goto retry;
		}
		break;

	/* After ESC $ ( */
	case 4:
		switch (c) {
		case '@':
		case 'B':
			filter->status = 0x80;
			break;
		case 'D':
			filter->status = 0x90;
			break;
		default:
			filter->status &= ~0xf;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			CK((*filter->output_function)(0x24, filter->data));
			CK((*filter->output_function)(0x28, filter->data));
			goto retry;
		}
		break;

	/* After ESC ( */
	case 5:
		switch (c) {
		case 'B':
		case 'H':
			filter->status = 0;
			break;
		case 'J':
			filter->status = 0x10;
			break;
		case 'I':
			filter->status = 0x20;
			break;
		default:
			filter->status &= ~0xf;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			CK((*filter->output_function)(0x28, filter->data));
			goto retry;
		}
		break;

		EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
2536 
/* Flush function for the CP5022x -> wchar filter.
 *
 * A nonzero low nibble in `filter->status` means the input ended in the middle
 * of a double-byte character or of an escape sequence; that is reported by
 * emitting MBFL_BAD_INPUT. The status is then reset.
 *
 * Returns the result of the downstream flush function, or 0 if there is none,
 * matching the other flush functions in this file. */
static int mbfl_filt_conv_cp5022x_wchar_flush(mbfl_convert_filter *filter)
{
	if (filter->status & 0xF) {
		/* 2-byte (JIS X 0208 or 0212) character was truncated, or else
		 * escape sequence was truncated */
		CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
	}
	filter->status = 0;

	if (filter->flush_function) {
		/* Propagate the downstream result instead of discarding it */
		return (*filter->flush_function)(filter->data);
	}

	return 0;
}
2552 
/* Halfwidth kana -> fullwidth katakana.
 * Indexed by (codepoint - 0xFF60); the entry is added to 0x3000 to obtain the
 * resulting codepoint. */
static const unsigned char hankana2zenkana_table[64] = {
	/*  0 */ 0x00, 0x02, 0x0C, 0x0D, 0x01, 0xFB, 0xF2, 0xA1,
	/*  8 */ 0xA3, 0xA5, 0xA7, 0xA9, 0xE3, 0xE5, 0xE7, 0xC3,
	/* 16 */ 0xFC, 0xA2, 0xA4, 0xA6, 0xA8, 0xAA, 0xAB, 0xAD,
	/* 24 */ 0xAF, 0xB1, 0xB3, 0xB5, 0xB7, 0xB9, 0xBB, 0xBD,
	/* 32 */ 0xBF, 0xC1, 0xC4, 0xC6, 0xC8, 0xCA, 0xCB, 0xCC,
	/* 40 */ 0xCD, 0xCE, 0xCF, 0xD2, 0xD5, 0xD8, 0xDB, 0xDE,
	/* 48 */ 0xDF, 0xE0, 0xE1, 0xE2, 0xE4, 0xE6, 0xE8, 0xE9,
	/* 56 */ 0xEA, 0xEB, 0xEC, 0xED, 0xEF, 0xF3, 0x9B, 0x9C
};
2562 
/* Halfwidth kana -> fullwidth hiragana.
 * Indexed by (codepoint - 0xFF60); the entry is added to 0x3000 to obtain the
 * resulting codepoint. */
static const unsigned char hankana2zenhira_table[64] = {
	/*  0 */ 0x00, 0x02, 0x0C, 0x0D, 0x01, 0xFB, 0x92, 0x41,
	/*  8 */ 0x43, 0x45, 0x47, 0x49, 0x83, 0x85, 0x87, 0x63,
	/* 16 */ 0xFC, 0x42, 0x44, 0x46, 0x48, 0x4A, 0x4B, 0x4D,
	/* 24 */ 0x4F, 0x51, 0x53, 0x55, 0x57, 0x59, 0x5B, 0x5D,
	/* 32 */ 0x5F, 0x61, 0x64, 0x66, 0x68, 0x6A, 0x6B, 0x6C,
	/* 40 */ 0x6D, 0x6E, 0x6F, 0x72, 0x75, 0x78, 0x7B, 0x7E,
	/* 48 */ 0x7F, 0x80, 0x81, 0x82, 0x84, 0x86, 0x88, 0x89,
	/* 56 */ 0x8A, 0x8B, 0x8C, 0x8D, 0x8F, 0x93, 0x9B, 0x9C
};
2572 
/* Fullwidth kana -> halfwidth kana.
 * Indexed by (codepoint - 0x30A1) for katakana or (codepoint - 0x3041) for
 * hiragana. Each entry is a pair of offsets from 0xFF00: the first gives the
 * halfwidth codepoint, and the second, when nonzero, gives an additional
 * codepoint which follows it. */
static const unsigned char zenkana2hankana_table[84][2] = {
	/*  0 */ {0x67,0x00}, {0x71,0x00}, {0x68,0x00}, {0x72,0x00},
	/*  4 */ {0x69,0x00}, {0x73,0x00}, {0x6A,0x00}, {0x74,0x00},
	/*  8 */ {0x6B,0x00}, {0x75,0x00}, {0x76,0x00}, {0x76,0x9E},
	/* 12 */ {0x77,0x00}, {0x77,0x9E}, {0x78,0x00}, {0x78,0x9E},
	/* 16 */ {0x79,0x00}, {0x79,0x9E}, {0x7A,0x00}, {0x7A,0x9E},
	/* 20 */ {0x7B,0x00}, {0x7B,0x9E}, {0x7C,0x00}, {0x7C,0x9E},
	/* 24 */ {0x7D,0x00}, {0x7D,0x9E}, {0x7E,0x00}, {0x7E,0x9E},
	/* 28 */ {0x7F,0x00}, {0x7F,0x9E}, {0x80,0x00}, {0x80,0x9E},
	/* 32 */ {0x81,0x00}, {0x81,0x9E}, {0x6F,0x00}, {0x82,0x00},
	/* 36 */ {0x82,0x9E}, {0x83,0x00}, {0x83,0x9E}, {0x84,0x00},
	/* 40 */ {0x84,0x9E}, {0x85,0x00}, {0x86,0x00}, {0x87,0x00},
	/* 44 */ {0x88,0x00}, {0x89,0x00}, {0x8A,0x00}, {0x8A,0x9E},
	/* 48 */ {0x8A,0x9F}, {0x8B,0x00}, {0x8B,0x9E}, {0x8B,0x9F},
	/* 52 */ {0x8C,0x00}, {0x8C,0x9E}, {0x8C,0x9F}, {0x8D,0x00},
	/* 56 */ {0x8D,0x9E}, {0x8D,0x9F}, {0x8E,0x00}, {0x8E,0x9E},
	/* 60 */ {0x8E,0x9F}, {0x8F,0x00}, {0x90,0x00}, {0x91,0x00},
	/* 64 */ {0x92,0x00}, {0x93,0x00}, {0x6C,0x00}, {0x94,0x00},
	/* 68 */ {0x6D,0x00}, {0x95,0x00}, {0x6E,0x00}, {0x96,0x00},
	/* 72 */ {0x97,0x00}, {0x98,0x00}, {0x99,0x00}, {0x9A,0x00},
	/* 76 */ {0x9B,0x00}, {0x9C,0x00}, {0x9C,0x00}, {0x72,0x00},
	/* 80 */ {0x74,0x00}, {0x66,0x00}, {0x9D,0x00}, {0x73,0x9E}
};
2592 
2593 /* Apply various transforms to input codepoint, such as converting halfwidth katakana
2594  * to fullwidth katakana. `mode` is a bitfield which controls which transforms are
2595  * actually performed. The bit values are defined in translit_kana_jisx0201_jisx0208.h.
2596  * `mode` must not call for transforms which are inverses (i.e. which would cancel
2597  * each other out).
2598  *
2599  * In some cases, successive input codepoints may be merged into one output codepoint.
2600  * (That is the purpose of the `next` parameter.) If the `next` codepoint is consumed
2601  * and should be skipped over, `*consumed` will be set to true. Otherwise, `*consumed`
2602  * will not be modified. If there is no following codepoint, `next` should be zero.
2603  *
2604  * Again, in some cases, one input codepoint may convert to two output codepoints.
2605  * If so, the second output codepoint will be stored in `*second`.
2606  *
2607  * Return the resulting codepoint. If none of the requested transforms apply, return
2608  * the input codepoint unchanged.
2609  */
mb_convert_kana_codepoint(uint32_t c,uint32_t next,bool * consumed,uint32_t * second,unsigned int mode)2610 uint32_t mb_convert_kana_codepoint(uint32_t c, uint32_t next, bool *consumed, uint32_t *second, unsigned int mode)
2611 {
2612 	if ((mode & MBFL_HAN2ZEN_ALL) && c >= 0x21 && c <= 0x7D && c != '"' && c != '\'' && c != '\\') {
2613 		return c + 0xFEE0;
2614 	}
2615 	if ((mode & MBFL_HAN2ZEN_ALPHA) && ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))) {
2616 		return c + 0xFEE0;
2617 	}
2618 	if ((mode & MBFL_HAN2ZEN_NUMERIC) && c >= '0' && c <= '9') {
2619 		return c + 0xFEE0;
2620 	}
2621 	if ((mode & MBFL_HAN2ZEN_SPACE) && c == ' ') {
2622 		return 0x3000;
2623 	}
2624 
2625 	if (mode & (MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_HIRAGANA)) {
2626 		/* Convert Hankaku kana to Zenkaku kana
2627 		 * Either all Hankaku kana (including katakana and hiragana) will be converted
2628 		 * to Zenkaku katakana, or to Zenkaku hiragana */
2629 		if ((mode & MBFL_HAN2ZEN_KATAKANA) && (mode & MBFL_HAN2ZEN_GLUE)) {
2630 			if (c >= 0xFF61 && c <= 0xFF9F) {
2631 				int n = c - 0xFF60;
2632 
2633 				if (next >= 0xFF61 && next <= 0xFF9F) {
2634 					if (next == 0xFF9E && ((n >= 22 && n <= 36) || (n >= 42 && n <= 46))) {
2635 						*consumed = true;
2636 						return 0x3001 + hankana2zenkana_table[n];
2637 					}
2638 					if (next == 0xFF9E && n == 19) {
2639 						*consumed = true;
2640 						return 0x30F4;
2641 					}
2642 					if (next == 0xFF9F && n >= 42 && n <= 46) {
2643 						*consumed = true;
2644 						return 0x3002 + hankana2zenkana_table[n];
2645 					}
2646 				}
2647 
2648 				return 0x3000 + hankana2zenkana_table[n];
2649 			}
2650 		}
2651 		if ((mode & MBFL_HAN2ZEN_HIRAGANA) && (mode & MBFL_HAN2ZEN_GLUE)) {
2652 			if (c >= 0xFF61 && c <= 0xFF9F) {
2653 				int n = c - 0xFF60;
2654 
2655 				if (next >= 0xFF61 && next <= 0xFF9F) {
2656 					if (next == 0xFF9E && ((n >= 22 && n <= 36) || (n >= 42 && n <= 46))) {
2657 						*consumed = true;
2658 						return 0x3001 + hankana2zenhira_table[n];
2659 					}
2660 					if (next == 0xFF9F && n >= 42 && n <= 46) {
2661 						*consumed = true;
2662 						return 0x3002 + hankana2zenhira_table[n];
2663 					}
2664 				}
2665 
2666 				return 0x3000 + hankana2zenhira_table[n];
2667 			}
2668 		}
2669 		if ((mode & MBFL_HAN2ZEN_KATAKANA) && c >= 0xFF61 && c <= 0xFF9F) {
2670 			return 0x3000 + hankana2zenkana_table[c - 0xFF60];
2671 		}
2672 		if ((mode & MBFL_HAN2ZEN_HIRAGANA) && c >= 0xFF61 && c <= 0xFF9F) {
2673 			return 0x3000 + hankana2zenhira_table[c - 0xFF60];
2674 		}
2675 	}
2676 
2677 	if (mode & MBFL_HAN2ZEN_SPECIAL) { /* special ascii to symbol */
2678 		if (c == '\\' || c == 0xA5) { /* YEN SIGN */
2679 			return 0xFFE5; /* FULLWIDTH YEN SIGN */
2680 		}
2681 		if (c == 0x7E || c == 0x203E) {
2682 			return 0xFFE3; /* FULLWIDTH MACRON */
2683 		}
2684 		if (c == '\'') {
2685 			return 0x2019; /* RIGHT SINGLE QUOTATION MARK */
2686 		}
2687 		if (c == '"') {
2688 			return 0x201D; /* RIGHT DOUBLE QUOTATION MARK */
2689 		}
2690 	}
2691 
2692 	if (mode & (MBFL_ZEN2HAN_ALL | MBFL_ZEN2HAN_ALPHA | MBFL_ZEN2HAN_NUMERIC | MBFL_ZEN2HAN_SPACE)) {
2693 		/* Zenkaku to Hankaku */
2694 		if ((mode & MBFL_ZEN2HAN_ALL) && c >= 0xFF01 && c <= 0xFF5D && c != 0xFF02 && c != 0xFF07 && c != 0xFF3C) {
2695 			/* all except " ' \ ~ */
2696 			return c - 0xFEE0;
2697 		}
2698 		if ((mode & MBFL_ZEN2HAN_ALPHA) && ((c >= 0xFF21 && c <= 0xFF3A) || (c >= 0xFF41 && c <= 0xFF5A))) {
2699 			return c - 0xFEE0;
2700 		}
2701 		if ((mode & MBFL_ZEN2HAN_NUMERIC) && (c >= 0xFF10 && c <= 0xFF19)) {
2702 			return c - 0xFEE0;
2703 		}
2704 		if ((mode & MBFL_ZEN2HAN_SPACE) && (c == 0x3000)) {
2705 			return ' ';
2706 		}
2707 		if ((mode & MBFL_ZEN2HAN_ALL) && (c == 0x2212)) { /* MINUS SIGN */
2708 			return '-';
2709 		}
2710 	}
2711 
2712 	if (mode & (MBFL_ZEN2HAN_KATAKANA | MBFL_ZEN2HAN_HIRAGANA)) {
2713 		/* Zenkaku kana to hankaku kana */
2714 		if ((mode & MBFL_ZEN2HAN_KATAKANA) && c >= 0x30A1 && c <= 0x30F4) {
2715 			/* Zenkaku katakana to hankaku kana */
2716 			int n = c - 0x30A1;
2717 			if (zenkana2hankana_table[n][1]) {
2718 				*second = 0xFF00 + zenkana2hankana_table[n][1];
2719 			}
2720 			return 0xFF00 + zenkana2hankana_table[n][0];
2721 		}
2722 		if ((mode & MBFL_ZEN2HAN_HIRAGANA) && c >= 0x3041 && c <= 0x3093) {
2723 			/* Zenkaku hiragana to hankaku kana */
2724 			int n = c - 0x3041;
2725 			if (zenkana2hankana_table[n][1]) {
2726 				*second = 0xFF00 + zenkana2hankana_table[n][1];
2727 			}
2728 			return 0xFF00 + zenkana2hankana_table[n][0];
2729 		}
2730 		if (c == 0x3001) {
2731 			return 0xFF64; /* HALFWIDTH IDEOGRAPHIC COMMA */
2732 		}
2733 		if (c == 0x3002) {
2734 			return 0xFF61; /* HALFWIDTH IDEOGRAPHIC FULL STOP */
2735 		}
2736 		if (c == 0x300C) {
2737 			return 0xFF62; /* HALFWIDTH LEFT CORNER BRACKET */
2738 		}
2739 		if (c == 0x300D) {
2740 			return 0xFF63; /* HALFWIDTH RIGHT CORNER BRACKET */
2741 		}
2742 		if (c == 0x309B) {
2743 			return 0xFF9E; /* HALFWIDTH KATAKANA VOICED SOUND MARK */
2744 		}
2745 		if (c == 0x309C) {
2746 			return 0xff9f; /* HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK */
2747 		}
2748 		if (c == 0x30FC) {
2749 			return 0xFF70; /* HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK */
2750 		}
2751 		if (c == 0x30FB) {
2752 			return 0xFF65; /* HALFWIDTH KATAKANA MIDDLE DOT */
2753 		}
2754 	}
2755 
2756 	if (mode & (MBFL_ZENKAKU_HIRA2KATA | MBFL_ZENKAKU_KATA2HIRA)) {
2757 		if ((mode & MBFL_ZENKAKU_HIRA2KATA) && ((c >= 0x3041 && c <= 0x3093) || c == 0x309D || c == 0x309E)) {
2758 			/* Zenkaku hiragana to Zenkaku katakana */
2759 			return c + 0x60;
2760 		}
2761 		if ((mode & MBFL_ZENKAKU_KATA2HIRA) && ((c >= 0x30A1 && c <= 0x30F3) || c == 0x30FD || c == 0x30FE)) {
2762 			/* Zenkaku katakana to Zenkaku hiragana */
2763 			return c - 0x60;
2764 		}
2765 	}
2766 
2767 	if (mode & MBFL_ZEN2HAN_SPECIAL) { /* special symbol to ascii */
2768 		if (c == 0xFFE5 || c == 0xFF3C) { /* FULLWIDTH YEN SIGN/FULLWIDTH REVERSE SOLIDUS */
2769 			return '\\';
2770 		}
2771 		if (c == 0xFFE3 || c == 0x203E) { /* FULLWIDTH MACRON/OVERLINE */
2772 			return '~';
2773 		}
2774 		if (c == 0x2018 || c == 0x2019) { /* LEFT/RIGHT SINGLE QUOTATION MARK*/
2775 			return '\'';
2776 		}
2777 		if (c == 0x201C || c == 0x201D) { /* LEFT/RIGHT DOUBLE QUOTATION MARK */
2778 			return '"';
2779 		}
2780 	}
2781 
2782 	return c;
2783 }
2784 
2785 static int mbfl_filt_conv_wchar_cp50221(int c, mbfl_convert_filter *filter);
2786 
mbfl_filt_conv_wchar_cp50220(int c,mbfl_convert_filter * filter)2787 static int mbfl_filt_conv_wchar_cp50220(int c, mbfl_convert_filter *filter)
2788 {
2789 	int mode = MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE;
2790 	bool consumed = false;
2791 
2792 	if (filter->cache) {
2793 		int s = mb_convert_kana_codepoint(filter->cache, c, &consumed, NULL, mode);
2794 		filter->cache = consumed ? 0 : c;
2795 		/* Terrible hack to get CP50220 to emit error markers in the proper
2796 		 * position, not reordering them with subsequent characters */
2797 		filter->filter_function = mbfl_filt_conv_wchar_cp50221;
2798 		mbfl_filt_conv_wchar_cp50221(s, filter);
2799 		filter->filter_function = mbfl_filt_conv_wchar_cp50220;
2800 		if (c == 0 && !consumed) {
2801 			(*filter->output_function)(0, filter->data);
2802 		}
2803 	} else if (c == 0) {
2804 		/* This case has to be handled separately, since `filter->cache == 0` means
2805 		 * no codepoint is cached */
2806 		(*filter->output_function)(0, filter->data);
2807 	} else {
2808 		filter->cache = c;
2809 	}
2810 
2811 	return 0;
2812 }
2813 
/* Flush function which emits ESC ( B if the high byte of `filter->status` is
 * nonzero, resets the status, and then invokes the downstream flush function
 * (if any), returning its result. */
static int mbfl_filt_conv_any_jis_flush(mbfl_convert_filter *filter)
{
	if (filter->status & 0xff00) {
		/* back to latin */
		CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
		CK((*filter->output_function)('(', filter->data));
		CK((*filter->output_function)('B', filter->data));
	}
	filter->status = 0;

	return filter->flush_function != NULL ? (*filter->flush_function)(filter->data) : 0;
}
2830 
/* Flush handler for the CP50220 output filter.
 *
 * A codepoint still waiting in `filter->cache` is converted by
 * `mb_convert_kana_codepoint` with no following codepoint (0 is passed in its
 * place) and encoded via `mbfl_filt_conv_wchar_cp50221`. The remaining work is
 * delegated to `mbfl_filt_conv_any_jis_flush`, whose result is returned. */
static int mbfl_filt_conv_wchar_cp50220_flush(mbfl_convert_filter *filter)
{
	int pending = filter->cache;

	if (pending) {
		int converted = mb_convert_kana_codepoint(pending, 0, NULL, NULL,
			MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE);

		/* Pose as the CP50221 filter while the last codepoint is encoded */
		filter->filter_function = mbfl_filt_conv_wchar_cp50221;
		mbfl_filt_conv_wchar_cp50221(converted, filter);
		filter->filter_function = mbfl_filt_conv_wchar_cp50220;

		filter->cache = 0;
	}

	return mbfl_filt_conv_any_jis_flush(filter);
}
2845 
/* Conversion filter: Unicode codepoint -> CP50221.
 *
 * Maps the codepoint `c` to a code `s`, then writes the corresponding byte(s)
 * through `filter->output_function`. An escape sequence is written first
 * whenever the character set recorded in the high byte of `filter->status`
 * differs from the one needed:
 *   0x000  ASCII             ESC ( B
 *   0x200  JIS X 0208        ESC $ B
 *   0x400  JIS X 0201 Latin  ESC ( J
 *   0x500  JIS X 0201 Kana   ESC ( I
 * Codepoints for which no usable code is found are passed to
 * `mbfl_filt_conv_illegal_output`.
 *
 * Returns 0 when the end of the function is reached (the CK wrapper around
 * each output call is defined outside this view). */
static int mbfl_filt_conv_wchar_cp50221(int c, mbfl_convert_filter *filter)
{
	int s = 0;

	/* Primary lookup in the Unicode -> JIS tables; each table covers its own
	 * range of codepoints */
	if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
		s = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
	} else if (c == 0x203E) { /* OVERLINE */
		s = 0x1007E; /* Convert to JISX 0201 OVERLINE */
	} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
		s = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
	} else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
		s = ucs_i_jis_table[c - ucs_i_jis_table_min];
	} else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
		s = ucs_r_jis_table[c - ucs_r_jis_table_min];
	} else if (c >= 0xE000 && c <= 0xE757) {
		/* 'private'/'user' codepoints; 94 per row, first byte starting from 0x7F */
		s = c - 0xE000;
		s = ((s / 94) + 0x7F) << 8 | ((s % 94) + 0x21);
	}

	/* Codepoints which the tables did not map, but for which a fixed code is used.
	 * A value with 0x10000 added marks a JIS X 0201 Latin character */
	if (s <= 0) {
		if (c == 0xa5) {			/* YEN SIGN */
			s = 0x1005c;
		} else if (c == 0xff3c) {	/* FULLWIDTH REVERSE SOLIDUS */
			s = 0x2140;
		} else if (c == 0x2225) {	/* PARALLEL TO */
			s = 0x2142;
		} else if (c == 0xff0d) {	/* FULLWIDTH HYPHEN-MINUS */
			s = 0x215d;
		} else if (c == 0xffe0) {	/* FULLWIDTH CENT SIGN */
			s = 0x2171;
		} else if (c == 0xffe1) {	/* FULLWIDTH POUND SIGN */
			s = 0x2172;
		} else if (c == 0xffe2) {	/* FULLWIDTH NOT SIGN */
			s = 0x224c;
		}
	}

	/* Above, we do a series of lookups in `ucs_*_jis_table` to find a
	 * corresponding kuten code for this Unicode codepoint
	 * If we get zero, that means the codepoint is not in JIS X 0208
	 * On the other hand, if we get a result with the high bits set on both
	 * upper and lower bytes, that is not a code in JIS X 0208 but rather
	 * in JIS X 0212
	 * In either case, check if this codepoint is one of the extensions added
	 * to JIS X 0208 by MicroSoft (to make CP932) */
	if (s == 0 || ((s & 0x8000) && (s & 0x80))) {
		int i;
		s = -1;

		/* Linear search of the first extension table; the index of the match
		 * gives the row (i / 94, offset by the table's first row) and the
		 * position within the row (i % 94) */
		for (i = 0;
				i < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min;
				i++) {
			const int oh = cp932ext1_ucs_table_min / 94;

			if (c == cp932ext1_ucs_table[i]) {
				s = ((i / 94 + oh + 0x21) << 8) + (i % 94 + 0x21);
				break;
			}
		}

		/* Not found there; search the second extension table in the same way */
		if (s < 0) {
			const int oh = cp932ext2_ucs_table_min / 94;
			const int cp932ext2_ucs_table_size =
					cp932ext2_ucs_table_max - cp932ext2_ucs_table_min;
			for (i = 0; i < cp932ext2_ucs_table_size; i++) {
				if (c == cp932ext2_ucs_table[i]) {
					s = ((i / 94 + oh + 0x21) << 8) + (i % 94 + 0x21);
					break;
				}
			}
		}

		/* Codepoint zero is always output as a zero byte; anything else which
		 * was not found is marked as unconvertible (-1) */
		if (c == 0) {
			s = 0;
		} else if (s <= 0) {
			s = -1;
		}
	}

	if (s >= 0) {
		if (s < 0x80) { /* ASCII */
			if ((filter->status & 0xff00) != 0) {
				CK((*filter->output_function)(0x1b, filter->data));		/* ESC */
				CK((*filter->output_function)(0x28, filter->data));		/* '(' */
				CK((*filter->output_function)(0x42, filter->data));		/* 'B' */
				filter->status = 0;
			}
			CK((*filter->output_function)(s, filter->data));
		} else if (s >= 0xa0 && s < 0xe0) { /* X 0201 kana */
			if ((filter->status & 0xff00) != 0x500) {
				CK((*filter->output_function)(0x1b, filter->data));		/* ESC */
				CK((*filter->output_function)(0x28, filter->data));		/* '(' */
				CK((*filter->output_function)(0x49, filter->data));		/* 'I' */
				filter->status = 0x500;
			}
			/* Kana are written as 7-bit bytes */
			CK((*filter->output_function)(s - 0x80, filter->data));
		} else if (s <= 0x927E) { /* X 0208 + extensions */
			if ((filter->status & 0xff00) != 0x200) {
				CK((*filter->output_function)(0x1b, filter->data));		/* ESC */
				CK((*filter->output_function)(0x24, filter->data));		/* '$' */
				CK((*filter->output_function)(0x42, filter->data));		/* 'B' */
				filter->status = 0x200;
			}
			/* Two bytes: high byte of `s`, then low byte */
			CK((*filter->output_function)((s >> 8) & 0xff, filter->data));
			CK((*filter->output_function)(s & 0xff, filter->data));
		} else if (s < 0x10000) { /* X0212 */
			/* This filter has no output form for such codes; treat as unconvertible */
			CK(mbfl_filt_conv_illegal_output(c, filter));
		} else { /* X 0201 latin */
			if ((filter->status & 0xff00) != 0x400) {
				CK((*filter->output_function)(0x1b, filter->data));		/* ESC */
				CK((*filter->output_function)(0x28, filter->data));		/* '(' */
				CK((*filter->output_function)(0x4a, filter->data));		/* 'J' */
			}
			filter->status = 0x400;
			/* Masking with 0x7f removes the 0x10000 tag */
			CK((*filter->output_function)(s & 0x7f, filter->data));
		}
	} else {
		CK(mbfl_filt_conv_illegal_output(c, filter));
	}

	return 0;
}
2969 
/* Conversion filter: Unicode codepoint -> CP50222.
 *
 * The codepoint lookup is the same sequence of steps as in
 * `mbfl_filt_conv_wchar_cp50221`. The output differs in how JIS X 0201 Kana
 * are selected: instead of an escape sequence, the single control byte 0x0E
 * is written before kana, and 0x0F is written when leaving kana again.
 *
 * The character set in use is recorded in the high byte of `filter->status`:
 *   0x000  ASCII
 *   0x200  JIS X 0208        (entered with ESC $ B)
 *   0x400  JIS X 0201 Latin  (entered with ESC ( J)
 *   0x500  JIS X 0201 Kana   (entered with 0x0E)
 *
 * Returns 0 when the end of the function is reached (the CK wrapper around
 * each output call is defined outside this view). */
static int mbfl_filt_conv_wchar_cp50222(int c, mbfl_convert_filter *filter)
{
	int s = 0;

	/* Primary lookup in the Unicode -> JIS tables */
	if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
		s = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
	} else if (c == 0x203E) { /* OVERLINE */
		s = 0x1007E; /* Convert to JISX 0201 OVERLINE */
	} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
		s = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
	} else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
		s = ucs_i_jis_table[c - ucs_i_jis_table_min];
	} else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
		s = ucs_r_jis_table[c - ucs_r_jis_table_min];
	} else if (c >= 0xE000 && c <= 0xE757) {
		/* 'private'/'user' codepoints; 94 per row, first byte starting from 0x7F */
		s = c - 0xE000;
		s = ((s / 94) + 0x7F) << 8 | ((s % 94) + 0x21);
	}

	/* Codepoints which the tables did not map, but for which a fixed code is used.
	 * A value with 0x10000 added marks a JIS X 0201 Latin character */
	if (s <= 0) {
		if (c == 0xa5) {			/* YEN SIGN */
			s = 0x1005c;
		} else if (c == 0xff3c) {	/* FULLWIDTH REVERSE SOLIDUS */
			s = 0x2140;
		} else if (c == 0x2225) {	/* PARALLEL TO */
			s = 0x2142;
		} else if (c == 0xff0d) {	/* FULLWIDTH HYPHEN-MINUS */
			s = 0x215d;
		} else if (c == 0xffe0) {	/* FULLWIDTH CENT SIGN */
			s = 0x2171;
		} else if (c == 0xffe1) {	/* FULLWIDTH POUND SIGN */
			s = 0x2172;
		} else if (c == 0xffe2) {	/* FULLWIDTH NOT SIGN */
			s = 0x224c;
		}
	}
	/* Nothing found, or the code has the high bit set in both bytes (the
	 * output section below labels such codes X0212): try the CP932
	 * extension tables */
	if (s == 0 || ((s & 0x8000) && (s & 0x80))) {
		int i;
		s = -1;

		/* Linear search; the index of the match gives the row (i / 94, offset
		 * by the table's first row) and the position within the row (i % 94) */
		for (i = 0;
				i < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; i++) {
			const int oh = cp932ext1_ucs_table_min / 94;

			if (c == cp932ext1_ucs_table[i]) {
				s = ((i / 94 + oh + 0x21) << 8) + (i % 94 + 0x21);
				break;
			}
		}

		/* Not found there; search the second extension table in the same way */
		if (s <= 0) {
			const int oh = cp932ext2_ucs_table_min / 94;
			const int cp932ext2_ucs_table_size =
					cp932ext2_ucs_table_max - cp932ext2_ucs_table_min;
			for (i = 0; i < cp932ext2_ucs_table_size; i++) {
				if (c == cp932ext2_ucs_table[i]) {
					s = ((i / 94 + oh + 0x21) << 8) + (i % 94 + 0x21);
					break;
				}
			}
		}

		/* Codepoint zero is always output as a zero byte; anything else which
		 * was not found is marked as unconvertible (-1) */
		if (c == 0) {
			s = 0;
		} else if (s <= 0) {
			s = -1;
		}
	}

	if (s >= 0) {
		if (s < 0x80) { /* ASCII */
			if ((filter->status & 0xff00) == 0x500) {
				CK((*filter->output_function)(0x0f, filter->data));		/* SI (Shift In) */
				filter->status = 0;
			} else if ((filter->status & 0xff00) != 0) {
				CK((*filter->output_function)(0x1b, filter->data));		/* ESC */
				CK((*filter->output_function)(0x28, filter->data));		/* '(' */
				CK((*filter->output_function)(0x42, filter->data));		/* 'B' */
				filter->status = 0;
			}
			CK((*filter->output_function)(s, filter->data));
		} else if (s >= 0xa0 && s < 0xe0) { /* X 0201 kana */
			if ((filter->status & 0xff00) != 0x500) {
				CK((*filter->output_function)(0x0e, filter->data));		/* SO (Shift Out) */
				filter->status = 0x500;
			}
			/* Kana are written as 7-bit bytes */
			CK((*filter->output_function)(s - 0x80, filter->data));
		} else if (s <= 0x927E) { /* X 0208 */
			/* Leave kana mode first; the status becomes 0, so the escape
			 * sequence below is then written as well */
			if ((filter->status & 0xff00) == 0x500) {
				CK((*filter->output_function)(0x0f, filter->data));		/* SI (Shift In) */
				filter->status = 0;
			}
			if ((filter->status & 0xff00) != 0x200) {
				CK((*filter->output_function)(0x1b, filter->data));		/* ESC */
				CK((*filter->output_function)(0x24, filter->data));		/* '$' */
				CK((*filter->output_function)(0x42, filter->data));		/* 'B' */
				filter->status = 0x200;
			}
			/* Two bytes: high byte of `s`, then low byte */
			CK((*filter->output_function)((s >> 8) & 0xff, filter->data));
			CK((*filter->output_function)(s & 0xff, filter->data));
		} else if (s < 0x10000) { /* X0212 */
			/* This filter has no output form for such codes; treat as unconvertible */
			CK(mbfl_filt_conv_illegal_output(c, filter));
		} else { /* X 0201 latin */
			/* Leave kana mode first, as for X 0208 above */
			if ((filter->status & 0xff00) == 0x500) {
				CK((*filter->output_function)(0x0f, filter->data));		/* SI (Shift In) */
				filter->status = 0;
			}
			if ((filter->status & 0xff00) != 0x400) {
				CK((*filter->output_function)(0x1b, filter->data));		/* ESC */
				CK((*filter->output_function)(0x28, filter->data));		/* '(' */
				CK((*filter->output_function)(0x4a, filter->data));		/* 'J' */
			}
			filter->status = 0x400;
			/* Masking with 0x7f removes the 0x10000 tag */
			CK((*filter->output_function)(s & 0x7f, filter->data));
		}
	} else {
		CK(mbfl_filt_conv_illegal_output(c, filter));
	}

	return 0;
}
3092 
/* Flush handler for the CP50222 output filter.
 *
 * Brings the output back to the initial character set: the byte 0x0F when
 * JIS X 0201 Kana (status 0x500) is active, ESC ( B for any other non-zero
 * status. The status is then cleared and the chained `flush_function`, if any,
 * is called; its return value is not used.
 *
 * Returns 0 when the end of the function is reached. */
static int mbfl_filt_conv_wchar_cp50222_flush(mbfl_convert_filter *filter)
{
	int charset = filter->status & 0xff00;

	if (charset == 0x500) {
		/* Kana was selected with a control byte, so it is left with one too */
		CK((*filter->output_function)(0x0f, filter->data)); /* SI (Shift In) */
	} else if (charset != 0) {
		CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
		CK((*filter->output_function)('(', filter->data));
		CK((*filter->output_function)('B', filter->data));
	}
	filter->status = 0;

	if (filter->flush_function) {
		(*filter->flush_function)(filter->data);
	}
	return 0;
}
3111 
/* Character set identifiers used as conversion state by the CP5022x functions
 * which follow (`*state` in `mb_cp5022x_to_wchar`; `buf->state` in the
 * `mb_wchar_to_cp5022x` functions) */
#define ASCII 0
#define JISX_0201_LATIN 1
#define JISX_0201_KANA 2
#define JISX_0208 3
#define JISX_0212 4
3117 
/* Decode CP5022x-encoded bytes into Unicode codepoints.
 *
 * in, in_len  Pointer to / remaining length of the input; both are updated on
 *             return to reflect how much input was consumed.
 * buf         Receives the decoded codepoints.
 * bufsize     Capacity of `buf`, in codepoints; must be at least 3.
 * state       Currently selected character set (ASCII, JISX_0201_LATIN,
 *             JISX_0201_KANA, JISX_0208 or JISX_0212); read and updated here.
 *
 * Input which cannot be decoded produces MBFL_BAD_INPUT in the output.
 * Returns the number of codepoints written to `buf`. */
static size_t mb_cp5022x_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
{
	/* One invalid escape sequence can produce up to 3 output codepoints */
	ZEND_ASSERT(bufsize >= 3);

	unsigned char *p = *in, *e = p + *in_len;
	uint32_t *out = buf, *limit = buf + bufsize;

	while (p < e && out < limit) {
		unsigned char c = *p++;

		if (c == 0x1B) {
			/* Escape sequence */
			if ((e - p) < 2) {
				/* Fewer than 2 bytes follow the ESC; too short for any
				 * escape sequence recognized below */
				*out++ = MBFL_BAD_INPUT;
				/* Duplicate error-handling behavior of legacy code */
				if (p < e && (*p == '(' || *p == '$'))
					p++;
				continue;
			}
			unsigned char c2 = *p++;
			if (c2 == '$') {
				unsigned char c3 = *p++;
				if (c3 == '@' || c3 == 'B') {
					*state = JISX_0208;
				} else if (c3 == '(') {
					/* 4-byte sequence: ESC $ ( x */
					if (p == e) {
						*out++ = MBFL_BAD_INPUT;
						break;
					}
					unsigned char c4 = *p++;
					if (c4 == '@' || c4 == 'B') {
						*state = JISX_0208;
					} else if (c4 == 'D') {
						*state = JISX_0212;
					} else {
						if ((limit - out) < 3) {
							/* No room for all 3 output codepoints; back up to
							 * the ESC so the whole sequence remains unconsumed */
							p -= 4;
							break;
						}
						*out++ = MBFL_BAD_INPUT;
						*out++ = '$';
						*out++ = '(';
						/* The 4th byte is processed again as ordinary input */
						p--;
					}
				} else {
					if ((limit - out) < 2) {
						/* No room for both output codepoints; back up to the ESC */
						p -= 3;
						break;
					}
					*out++ = MBFL_BAD_INPUT;
					*out++ = '$';
					/* The 3rd byte is processed again as ordinary input */
					p--;
				}
			} else if (c2 == '(') {
				unsigned char c3 = *p++;
				if (c3 == 'B' || c3 == 'H') {
					*state = ASCII;
				} else if (c3 == 'J') {
					*state = JISX_0201_LATIN;
				} else if (c3 == 'I') {
					*state = JISX_0201_KANA;
				} else {
					if ((limit - out) < 2) {
						/* No room for both output codepoints; back up to the ESC */
						p -= 3;
						break;
					}
					*out++ = MBFL_BAD_INPUT;
					*out++ = '(';
					/* The 3rd byte is processed again as ordinary input */
					p--;
				}
			} else {
				/* ESC followed by something other than '$' or '('; that byte
				 * is processed again as ordinary input */
				*out++ = MBFL_BAD_INPUT;
				p--;
			}
		} else if (c == 0xE) {
			/* Control byte 0x0E selects JIS X 0201 Kana */
			*state = JISX_0201_KANA;
		} else if (c == 0xF) {
			/* Control byte 0x0F selects ASCII */
			*state = ASCII;
		} else if (*state == JISX_0201_LATIN && c == 0x5C) { /* YEN SIGN */
			*out++ = 0xA5;
		} else if (*state == JISX_0201_LATIN && c == 0x7E) { /* OVER LINE */
			*out++ = 0x203E;
		} else if (*state == JISX_0201_KANA && c > 0x20 && c < 0x60) {
			/* 7-bit kana: 0x21..0x5F -> U+FF61..U+FF9F */
			*out++ = 0xFF40 + c;
		} else if (*state >= JISX_0208 && c > 0x20 && c <= 0x97) {
			/* First byte of a 2-byte character (JISX_0208 or JISX_0212 state) */
			if (p == e) {
				*out++ = MBFL_BAD_INPUT;
				break;
			}
			unsigned char c2 = *p++;
			if (c2 > 0x20 && c2 < 0x7F) {
				/* Linear index: 94 positions per row */
				unsigned int s = (c - 0x21)*94 + c2 - 0x21;
				uint32_t w = 0;
				if (*state == JISX_0208) {
					/* The first extension table is consulted before the
					 * JIS X 0208 table */
					if (s >= cp932ext1_ucs_table_min && s < cp932ext1_ucs_table_max) {
						w = cp932ext1_ucs_table[s - cp932ext1_ucs_table_min];
					} else if (s < jisx0208_ucs_table_size) {
						w = jisx0208_ucs_table[s];
					} else if (s >= cp932ext2_ucs_table_min && s < cp932ext2_ucs_table_max) {
						w = cp932ext2_ucs_table[s - cp932ext2_ucs_table_min];
					} else if (s >= cp932ext3_ucs_table_min && s < cp932ext3_ucs_table_max) {
						w = cp932ext3_ucs_table[s - cp932ext3_ucs_table_min];
					} else if (s >= 94*94 && s < 114*94) {
						/* MicroSoft extension; these 20 rows map to codepoints
						 * from U+E000 upward */
						w = s - 94*94 + 0xE000;
					}
					if (!w)
						w = MBFL_BAD_INPUT;
				} else {
					if (s < jisx0212_ucs_table_size) {
						w = jisx0212_ucs_table[s];
					}
					if (!w)
						w = MBFL_BAD_INPUT;
				}
				*out++ = w;
			} else {
				*out++ = MBFL_BAD_INPUT;
			}
		} else if (c < 0x80) {
			/* Passed through unchanged */
			*out++ = c;
		} else if (c >= 0xA1 && c <= 0xDF) {
			/* 8-bit kana: 0xA1..0xDF -> U+FF61..U+FF9F */
			*out++ = 0xFEC0 + c;
		} else {
			*out++ = MBFL_BAD_INPUT;
		}
	}

	/* Tell the caller how much input remains and where it starts */
	*in_len = e - p;
	*in = p;
	return out - buf;
}
3250 
lookup_wchar(uint32_t w)3251 static unsigned int lookup_wchar(uint32_t w)
3252 {
3253 	unsigned int s = 0;
3254 
3255 	if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) {
3256 		s = ucs_a1_jis_table[w - ucs_a1_jis_table_min];
3257 	} else if (w == 0x203E) { /* OVERLINE */
3258 		s = 0x1007E; /* Convert to JISX 0201 OVERLINE */
3259 	} else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) {
3260 		s = ucs_a2_jis_table[w - ucs_a2_jis_table_min];
3261 	} else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) {
3262 		s = ucs_i_jis_table[w - ucs_i_jis_table_min];
3263 	} else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) {
3264 		s = ucs_r_jis_table[w - ucs_r_jis_table_min];
3265 	} else if (w >= 0xE000 && w <= 0xE757) {
3266 		/* Private Use Area codepoints */
3267 		s = w - 0xE000;
3268 		s = ((s / 94) + 0x7F) << 8 | ((s % 94) + 0x21);
3269 	}
3270 
3271 	if (!s) {
3272 		if (w == 0xA5) { /* YEN SIGN */
3273 			s = 0x1005C;
3274 		} else if (w == 0xFF3C) { /* FULLWIDTH REVERSE SOLIDUS */
3275 			s = 0x2140;
3276 		} else if (w == 0x2225) { /* PARALLEL TO */
3277 			s = 0x2142;
3278 		} else if (w == 0xFF0D) { /* FULLWIDTH HYPHEN-MINUS */
3279 			s = 0x215D;
3280 		} else if (w == 0xFFE0) { /* FULLWIDTH CENT SIGN */
3281 			s = 0x2171;
3282 		} else if (w == 0xFFE1) { /* FULLWIDTH POUND SIGN */
3283 			s = 0x2172;
3284 		} else if (w == 0xFFE2) { /* FULLWIDTH NOT SIGN */
3285 			s = 0x224C;
3286 		} else if (w == 0) {
3287 			return 0;
3288 		}
3289 	}
3290 
3291 	/* Above, we do a series of lookups in `ucs_*_jis_table` to find a
3292 	 * corresponding kuten code for this Unicode codepoint
3293 	 * If we get zero, that means the codepoint is not in JIS X 0208
3294 	 * On the other hand, if we get a result with the high bits set on both
3295 	 * upper and lower bytes, that is not a code in JIS X 0208 but rather
3296 	 * in JIS X 0213
3297 	 * In either case, check if this codepoint is one of the extensions added
3298 	 * to JIS X 0208 by MicroSoft (to make CP932) */
3299 	if (!s || s >= 0x8080) {
3300 		for (int i = 0; i < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; i++) {
3301 			if (w == cp932ext1_ucs_table[i]) {
3302 				return (((i / 94) + (cp932ext1_ucs_table_min / 94) + 0x21) << 8) + (i % 94) + 0x21;
3303 			}
3304 		}
3305 
3306 		for (int i = 0; i < cp932ext2_ucs_table_max - cp932ext2_ucs_table_min; i++) {
3307 			if (w == cp932ext2_ucs_table[i]) {
3308 				return (((i / 94) + (cp932ext2_ucs_table_min / 94) + 0x21) << 8) + (i % 94) + 0x21;
3309 			}
3310 		}
3311 	}
3312 
3313 	return s;
3314 }
3315 
3316 static void mb_wchar_to_cp50221(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
3317 
/* Encode Unicode codepoints as CP50220, appending the bytes to `buf`.
 *
 * in, len  The codepoints to encode.
 * buf      Output buffer. The low byte of `buf->state` is the character set
 *          currently selected in the output (ASCII, JISX_0201_LATIN,
 *          JISX_0201_KANA or JISX_0208); the bits selected by the mask
 *          0xFFFF00 hold a codepoint carried over from the previous call,
 *          shifted left by 8.
 * end      True if this is the last batch of input; if so, the output is
 *          returned to ASCII with ESC ( B where needed.
 *
 * Every codepoint first passes through `mb_convert_kana_codepoint` (mode
 * MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE), which may merge it with the
 * codepoint that follows. */
static void mb_wchar_to_cp50220(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	uint32_t w;

	if (buf->state & 0xFFFF00) {
		/* Reprocess cached codepoint */
		w = buf->state >> 8;
		buf->state &= 0xFF;
		/* Jumps into the loop body, skipping the `len--` test and the read
		 * from `in` for this one codepoint */
		goto reprocess_codepoint;
	}

	while (len--) {
		w = *in++;
reprocess_codepoint:

		if (w >= 0xFF61 && w <= 0xFF9F && !len && !end) {
			/* This codepoint may need to combine with the next one,
			 * but the 'next one' will come in a separate buffer */
			buf->state |= w << 8;
			break;
		}

		bool consumed = false;
		/* When no input remains, 0 is passed as the following codepoint */
		w = mb_convert_kana_codepoint(w, len ? *in : 0, &consumed, NULL, MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE);
		if (consumed) {
			/* Two successive codepoints were converted into one */
			in++; len--; consumed = false;
		}

		unsigned int s = lookup_wchar(w);

		if (!s && w) {
			/* No mapping. NOTE(review): the CP50221 encoder is passed as the
			 * conversion function here, not this one; presumably so that any
			 * output produced by error handling skips the kana conversion
			 * above - confirm against MB_CONVERT_ERROR */
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_cp50221);
		} else if (s < 0x80) {
			/* ASCII */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
			if (buf->state != ASCII) {
				out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
				buf->state = ASCII;
			}
			out = mb_convert_buf_add(out, s);
		} else if (s >= 0xA0 && s < 0xE0) {
			/* JISX 0201 Kana */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
			if (buf->state != JISX_0201_KANA) {
				out = mb_convert_buf_add3(out, 0x1B, '(', 'I');
				buf->state = JISX_0201_KANA;
			}
			out = mb_convert_buf_add(out, s - 0x80);
		} else if (s <= 0x927E) {
			/* JISX 0208 Kanji */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 5);
			if (buf->state != JISX_0208) {
				out = mb_convert_buf_add3(out, 0x1B, '$', 'B');
				buf->state = JISX_0208;
			}
			out = mb_convert_buf_add2(out, (s >> 8) & 0xFF, s & 0xFF);
		} else if (s >= 0x10000) {
			/* JISX 0201 Latin; we 'tag' these by adding 0x10000 */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
			if (buf->state != JISX_0201_LATIN) {
				out = mb_convert_buf_add3(out, 0x1B, '(', 'J');
				buf->state = JISX_0201_LATIN;
			}
			out = mb_convert_buf_add(out, s & 0x7F);
		} else {
			/* A code above 0x927E without the 0x10000 tag; this encoding has
			 * no output form for it */
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_cp50221);
		}
	}

	/* At the end of the input, switch back to ASCII if necessary */
	if (end && buf->state != ASCII) {
		MB_CONVERT_BUF_ENSURE(buf, out, limit, 3);
		out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
3399 
/* Encode Unicode codepoints as CP50221, appending the bytes to `buf`.
 *
 * in, len  The codepoints to encode.
 * buf      Output buffer; `buf->state` records which character set is
 *          currently selected in the output (ASCII, JISX_0201_LATIN,
 *          JISX_0201_KANA or JISX_0208).
 * end      True if this is the last batch of input; if so, the output is
 *          returned to ASCII with ESC ( B where needed.
 *
 * An escape sequence is written each time the character set changes. */
static void mb_wchar_to_cp50221(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	while (len--) {
		uint32_t w = *in++;
		unsigned int s = lookup_wchar(w);

		/* Unconvertible: either nothing was found for a non-zero codepoint, or
		 * the code is above 0x927E without carrying the 0x10000 tag */
		if ((w && !s) || (s > 0x927E && s < 0x10000)) {
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_cp50221);
			continue;
		}

		if (s >= 0x10000) {
			/* JISX 0201 Latin; such codes are 'tagged' by adding 0x10000 */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
			if (buf->state != JISX_0201_LATIN) {
				out = mb_convert_buf_add3(out, 0x1B, '(', 'J');
				buf->state = JISX_0201_LATIN;
			}
			out = mb_convert_buf_add(out, s & 0x7F);
		} else if (s < 0x80) {
			/* ASCII */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
			if (buf->state != ASCII) {
				out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
				buf->state = ASCII;
			}
			out = mb_convert_buf_add(out, s);
		} else if (s >= 0xA0 && s < 0xE0) {
			/* JISX 0201 Kana, written as a 7-bit byte */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
			if (buf->state != JISX_0201_KANA) {
				out = mb_convert_buf_add3(out, 0x1B, '(', 'I');
				buf->state = JISX_0201_KANA;
			}
			out = mb_convert_buf_add(out, s - 0x80);
		} else {
			/* JISX 0208 Kanji: two bytes, high byte first */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 5);
			if (buf->state != JISX_0208) {
				out = mb_convert_buf_add3(out, 0x1B, '$', 'B');
				buf->state = JISX_0208;
			}
			out = mb_convert_buf_add2(out, (s >> 8) & 0xFF, s & 0xFF);
		}
	}

	if (end) {
		/* Last batch of input; switch back to ASCII if necessary */
		if (buf->state != ASCII) {
			MB_CONVERT_BUF_ENSURE(buf, out, limit, 3);
			out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
3456 
/* Encode Unicode codepoints as CP50222, appending the bytes to `buf`.
 *
 * in, len  The codepoints to encode.
 * buf      Output buffer; `buf->state` records which character set is
 *          currently selected in the output (ASCII, JISX_0201_LATIN,
 *          JISX_0201_KANA or JISX_0208).
 * end      True if this is the last batch of input; if so, the output is
 *          returned to ASCII where needed.
 *
 * JIS X 0201 Kana are entered with the single control byte 0x0E and left
 * with 0x0F; the other character sets are selected by escape sequences. */
static void mb_wchar_to_cp50222(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	while (len--) {
		uint32_t w = *in++;
		unsigned int s = lookup_wchar(w);

		/* Unconvertible: either nothing was found for a non-zero codepoint, or
		 * the code is above 0x927E without carrying the 0x10000 tag */
		if ((w && !s) || (s > 0x927E && s < 0x10000)) {
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_cp50222);
			continue;
		}

		if (s >= 0xA0 && s < 0xE0) {
			/* JISX 0201 Kana, written as a 7-bit byte */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			if (buf->state != JISX_0201_KANA) {
				out = mb_convert_buf_add(out, 0xE);
				buf->state = JISX_0201_KANA;
			}
			out = mb_convert_buf_add(out, s - 0x80);
			continue;
		}

		if (s < 0x80) {
			/* ASCII */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
			if (buf->state == JISX_0201_KANA) {
				/* Leaving kana with 0x0F is enough to get back to ASCII */
				out = mb_convert_buf_add(out, 0xF);
			} else if (buf->state != ASCII) {
				out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
			}
			buf->state = ASCII;
			out = mb_convert_buf_add(out, s);
		} else if (s >= 0x10000) {
			/* JISX 0201 Latin; such codes are 'tagged' by adding 0x10000 */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 5);
			if (buf->state == JISX_0201_KANA) {
				/* Leave kana first; the escape sequence below still follows */
				out = mb_convert_buf_add(out, 0xF);
			}
			if (buf->state != JISX_0201_LATIN) {
				out = mb_convert_buf_add3(out, 0x1B, '(', 'J');
				buf->state = JISX_0201_LATIN;
			}
			out = mb_convert_buf_add(out, s & 0x7F);
		} else {
			/* JISX 0208 Kanji: two bytes, high byte first */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 6);
			if (buf->state == JISX_0201_KANA) {
				/* Leave kana first; the escape sequence below still follows */
				out = mb_convert_buf_add(out, 0xF);
			}
			if (buf->state != JISX_0208) {
				out = mb_convert_buf_add3(out, 0x1B, '$', 'B');
				buf->state = JISX_0208;
			}
			out = mb_convert_buf_add2(out, (s >> 8) & 0xFF, s & 0xFF);
		}
	}

	if (end && buf->state != ASCII) {
		/* Last batch of input; return the output to ASCII */
		if (buf->state == JISX_0201_KANA) {
			MB_CONVERT_BUF_ENSURE(buf, out, limit, 1);
			out = mb_convert_buf_add(out, 0xF);
		} else {
			MB_CONVERT_BUF_ENSURE(buf, out, limit, 3);
			out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
3527 
/* Character set selectors stored in `filter->status` by
 * `mbfl_filt_conv_2022jpms_wchar`. All of them leave the low 4 bits clear;
 * that function uses those bits for its position within a multi-byte
 * character or escape sequence.
 * (ASCII is defined again here with the same value as before) */
#define ASCII          0
#define JISX0201_KANA  0x20
#define JISX0208_KANJI 0x80
#define UDC            0xA0
3532 
/* Conversion filter: one input byte `c` at a time -> Unicode codepoints.
 *
 * `filter->status` combines two things:
 *   - bits above the low 4: the selected character set (0 for ASCII,
 *     JISX0201_KANA, JISX0208_KANJI or UDC)
 *   - low 4 bits: position within a sequence
 *       0  not inside any sequence
 *       1  first byte of a 2-byte character is in `filter->cache`
 *       2  ESC seen
 *       3  ESC $ seen
 *       4  ESC $ ( seen
 *       5  ESC ( seen
 *
 * Invalid input produces MBFL_BAD_INPUT in the output.
 * Returns 0 when the end of the function is reached (the CK wrapper around
 * each output call is defined outside this view). */
static int mbfl_filt_conv_2022jpms_wchar(int c, mbfl_convert_filter *filter)
{
	int c1, s, w;

	switch (filter->status & 0xF) {
	case 0:
		if (c == 0x1B) {
			/* Start of an escape sequence; go to position 2 */
			filter->status += 2;
		} else if (filter->status == JISX0201_KANA && c > 0x20 && c < 0x60) {
			/* 7-bit kana: 0x21..0x5F -> U+FF61..U+FF9F */
			CK((*filter->output_function)(0xFF40 + c, filter->data));
		} else if ((filter->status == JISX0208_KANJI || filter->status == UDC) && c > 0x20 && c < 0x80) {
			/* First byte of a 2-byte character; keep it and wait for the second */
			filter->cache = c;
			filter->status += 1;
		} else if (c >= 0 && c < 0x80) { /* ASCII */
			CK((*filter->output_function)(c, filter->data));
		} else if (c > 0xA0 && c < 0xE0) { /* Kana */
			/* 8-bit kana: 0xA1..0xDF -> U+FF61..U+FF9F */
			CK((*filter->output_function)(0xFEC0 + c, filter->data));
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	/* Kanji, second byte */
	case 1:
		w = 0;
		/* Back to position 0; only the character set selector remains */
		filter->status &= ~0xF;
		c1 = filter->cache;
		if (c > 0x20 && c < 0x7F) {
			/* Linear index: 94 positions per row */
			s = ((c1 - 0x21) * 94) + c - 0x21;
			if (filter->status == JISX0208_KANJI) {
				/* A few positions are mapped to fixed codepoints, without
				 * consulting the tables */
				if (s <= 137) {
					if (s == 31) {
						w = 0xFF3C; /* FULLWIDTH REVERSE SOLIDUS */
					} else if (s == 32) {
						w = 0xFF5E; /* FULLWIDTH TILDE */
					} else if (s == 33) {
						w = 0x2225; /* PARALLEL TO */
					} else if (s == 60) {
						w = 0xFF0D; /* FULLWIDTH HYPHEN-MINUS */
					} else if (s == 80) {
						w = 0xFFE0; /* FULLWIDTH CENT SIGN */
					} else if (s == 81) {
						w = 0xFFE1; /* FULLWIDTH POUND SIGN */
					} else if (s == 137) {
						w = 0xFFE2; /* FULLWIDTH NOT SIGN */
					}
				}

				if (w == 0) {
					if (s >= cp932ext1_ucs_table_min && s < cp932ext1_ucs_table_max) {		/* vendor ext1 (13ku) */
						w = cp932ext1_ucs_table[s - cp932ext1_ucs_table_min];
					} else if (s >= 0 && s < jisx0208_ucs_table_size) {
						w = jisx0208_ucs_table[s];
					} else if (s >= cp932ext2_ucs_table_min && s < cp932ext2_ucs_table_max) {		/* vendor ext2 (89ku - 92ku) */
						w = cp932ext2_ucs_table[s - cp932ext2_ucs_table_min];
					}
				}

				if (w <= 0) {
					w = MBFL_BAD_INPUT;
				}
			} else {
				/* UDC: first bytes 0x21..0x34 (20 rows) map to codepoints
				 * from U+E000 upward */
				if (c1 > 0x20 && c1 < 0x35) {
					w = 0xE000 + ((c1 - 0x21) * 94) + c - 0x21;
				} else {
					w = MBFL_BAD_INPUT;
				}
			}
			CK((*filter->output_function)(w, filter->data));
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	/* ESC */
	case 2:
		if (c == '$') {
			filter->status++;
		} else if (c == '(') {
			/* From position 2 to position 5 */
			filter->status += 3;
		} else {
			/* Not a recognized escape sequence; drop out of it, keeping the
			 * selected character set */
			filter->status &= ~0xF;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	/* ESC $ */
	case 3:
		if (c == '@' || c == 'B') {
			filter->status = JISX0208_KANJI;
		} else if (c == '(') {
			filter->status++;
		} else {
			filter->status &= ~0xF;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	/* ESC $ ( */
	case 4:
		if (c == '@' || c == 'B') {
			filter->status = JISX0208_KANJI;
		} else if (c == '?') {
			filter->status = UDC;
		} else {
			filter->status &= ~0xF;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	/* ESC ( */
	case 5:
		if (c == 'B' || c == 'J') {
			/* Both final bytes select status 0, which is decoded as ASCII */
			filter->status = 0;
		} else if (c == 'I') {
			filter->status = JISX0201_KANA;
		} else {
			filter->status &= ~0xF;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
	}

	return 0;
}
3657 
/* Flush handler for `mbfl_filt_conv_2022jpms_wchar`.
 *
 * Non-zero low 4 bits of `filter->status` mean that the input ended inside a
 * multi-byte character or an escape sequence; MBFL_BAD_INPUT is emitted for
 * it. The status is then reset and the chained `flush_function`, if any, is
 * called; its return value is not used.
 *
 * Always returns 0. */
static int mbfl_filt_conv_2022jpms_wchar_flush(mbfl_convert_filter *filter)
{
	const int incomplete = (filter->status & 0xF) != 0;

	if (incomplete) {
		(*filter->output_function)(MBFL_BAD_INPUT, filter->data);
	}
	filter->status = 0;

	if (filter->flush_function) {
		(*filter->flush_function)(filter->data);
	}
	return 0;
}
3671 
/* sjistoidx: linear index of the two-byte code (c1, c2); 188 positions per
 * value of c1. The amount subtracted from c1 depends on whether it is above
 * 0x9f, and one more is subtracted from c2 when it is above 0x7e.
 * idxtojis1 / idxtojis2: first and second byte for a linear index, at 94
 * positions per row, both starting from 0x21. */
#define sjistoidx(c1, c2) \
	(((c1) > 0x9f) ? (((c1) - 0xc1) * 188 + (c2) - (((c2) > 0x7e) ? 0x41 : 0x40)) : (((c1) - 0x81) * 188 + (c2) - (((c2) > 0x7e) ? 0x41 : 0x40)))
#define idxtojis1(c) (((c) / 94) + 0x21)
#define idxtojis2(c) (((c) % 94) + 0x21)

/* Convert `c`, an offset counted from the two-byte code (0xfa, 0x40), into a
 * two-byte JIS-style code (first byte in bits 8 and up, second byte in the low
 * 8 bits).
 *
 * The positions from (0xfa, 0x40) onward are divided into three segments, each
 * of which is relocated to a different starting position:
 *   from (0xfa, 0x40)  ->  from (0xee, 0xef)
 *   from (0xfa, 0x55)  ->  from (0xee, 0xfa)
 *   from (0xfa, 0x5c)  ->  from (0xed, 0x40)
 * Positions below (0xfa, 0x40) (negative `c`) are not relocated. */
static int cp932ext3_cp932ext2_jis(int c)
{
	const int seg1_start = sjistoidx(0xfa, 0x40);
	const int seg2_start = sjistoidx(0xfa, 0x55);
	const int seg3_start = sjistoidx(0xfa, 0x5c);
	int idx = seg1_start + c;

	if (idx >= seg3_start) {
		idx = idx - seg3_start + sjistoidx(0xed, 0x40);
	} else if (idx >= seg2_start) {
		idx = idx - seg2_start + sjistoidx(0xee, 0xfa);
	} else if (idx >= seg1_start) {
		idx = idx - seg1_start + sjistoidx(0xee, 0xef);
	}

	return (idxtojis1(idx) << 8) | idxtojis2(idx);
}
3690 
mbfl_filt_conv_wchar_2022jpms(int c,mbfl_convert_filter * filter)3691 static int mbfl_filt_conv_wchar_2022jpms(int c, mbfl_convert_filter *filter)
3692 {
3693 	int c1, c2, s1 = 0, s2 = 0;
3694 
3695 	if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
3696 		s1 = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
3697 	} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
3698 		s1 = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
3699 	} else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
3700 		s1 = ucs_i_jis_table[c - ucs_i_jis_table_min];
3701 	} else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
3702 		s1 = ucs_r_jis_table[c - ucs_r_jis_table_min];
3703 	} else if (c >= 0xE000 && c < (0xE000 + 20*94)) {
3704 		/* Private User Area (95ku - 114ku) */
3705 		s1 = c - 0xE000;
3706 		c1 = (s1 / 94) + 0x7f;
3707 		c2 = (s1 % 94) + 0x21;
3708 		s1 = (c1 << 8) | c2;
3709 	}
3710 
3711 	if (s1 <= 0) {
3712 		if (c == 0xA5) { /* YEN SIGN */
3713 			s1 = 0x216F; /* FULLWIDTH YEN SIGN */
3714 		} else if (c == 0xFF3C) { /* FULLWIDTH REVERSE SOLIDUS */
3715 			s1 = 0x2140;
3716 		} else if (c == 0x2225) { /* PARALLEL TO */
3717 			s1 = 0x2142;
3718 		} else if (c == 0xFF0D) { /* FULLWIDTH HYPHEN-MINUS */
3719 			s1 = 0x215d;
3720 		} else if (c == 0xFFE0) { /* FULLWIDTH CENT SIGN */
3721 			s1 = 0x2171;
3722 		} else if (c == 0xFFE1) { /* FULLWIDTH POUND SIGN */
3723 			s1 = 0x2172;
3724 		} else if (c == 0xFFE2) { /* FULLWIDTH NOT SIGN */
3725 			s1 = 0x224C;
3726 		}
3727 	}
3728 
3729 	if ((s1 <= 0) || (s1 >= 0xa1a1 && s2 == 0)) { /* not found or X 0212 */
3730 		s1 = -1;
3731 		for (c1 = 0; c1 < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; c1++) {
3732 			if (c == cp932ext1_ucs_table[c1]) {
3733 				s1 = (((c1 / 94) + 0x2D) << 8) + (c1 % 94) + 0x21;
3734 				break;
3735 			}
3736 		}
3737 
3738 		if (s1 <= 0) {
3739 			for (c1 = 0; c1 < cp932ext3_ucs_table_max - cp932ext3_ucs_table_min; c1++) {
3740 				if (c == cp932ext3_ucs_table[c1]) {
3741 					s1 = cp932ext3_cp932ext2_jis(c1);
3742 					break;
3743 				}
3744 			}
3745 		}
3746 
3747 		if (c == 0) {
3748 			s1 = 0;
3749 		}
3750 	}
3751 
3752 	if (s1 >= 0) {
3753 		if (s1 < 0x80) { /* latin */
3754 			if (filter->status & 0xFF00) {
3755 				CK((*filter->output_function)(0x1B, filter->data)); /* ESC */
3756 				CK((*filter->output_function)('(', filter->data));
3757 				CK((*filter->output_function)('B', filter->data));
3758 			}
3759 			CK((*filter->output_function)(s1, filter->data));
3760 			filter->status = 0;
3761 		} else if (s1 > 0xA0 && s1 < 0xE0) { /* kana */
3762 			if ((filter->status & 0xFF00) != 0x100) {
3763 				CK((*filter->output_function)(0x1B, filter->data)); /* ESC */
3764 				CK((*filter->output_function)('(', filter->data));
3765 				CK((*filter->output_function)('I', filter->data));
3766 			}
3767 			filter->status = 0x100;
3768 			CK((*filter->output_function)(s1 & 0x7F, filter->data));
3769 		} else if (s1 < 0x7E7F) { /* X 0208 */
3770 			if ((filter->status & 0xFF00) != 0x200) {
3771 				CK((*filter->output_function)(0x1B, filter->data)); /* ESC */
3772 				CK((*filter->output_function)('$', filter->data));
3773 				CK((*filter->output_function)('B', filter->data));
3774 			}
3775 			filter->status = 0x200;
3776 			CK((*filter->output_function)((s1 >> 8) & 0xFF, filter->data));
3777 			CK((*filter->output_function)(s1 & 0x7F, filter->data));
3778 		} else if (s1 < 0x927F) { /* UDC */
3779 			if ((filter->status & 0xFF00) != 0x800) {
3780 				CK((*filter->output_function)(0x1B, filter->data)); /* ESC */
3781 				CK((*filter->output_function)('$', filter->data));
3782 				CK((*filter->output_function)('(', filter->data));
3783 				CK((*filter->output_function)('?', filter->data));
3784 			}
3785 			filter->status = 0x800;
3786 			CK((*filter->output_function)(((s1 >> 8) - 0x5E) & 0x7F, filter->data));
3787 			CK((*filter->output_function)(s1 & 0x7F, filter->data));
3788 		}
3789 	} else {
3790 		CK(mbfl_filt_conv_illegal_output(c, filter));
3791 	}
3792 
3793 	return 0;
3794 }
3795 
mbfl_filt_conv_any_2022jpms_flush(mbfl_convert_filter * filter)3796 static int mbfl_filt_conv_any_2022jpms_flush(mbfl_convert_filter *filter)
3797 {
3798 	/* Go back to ASCII (so strings can be safely concatenated) */
3799 	if ((filter->status & 0xFF00) != 0) {
3800 		CK((*filter->output_function)(0x1B, filter->data)); /* ESC */
3801 		CK((*filter->output_function)('(', filter->data));
3802 		CK((*filter->output_function)('B', filter->data));
3803 	}
3804 	filter->status = 0;
3805 
3806 	if (filter->flush_function) {
3807 		(*filter->flush_function)(filter->data);
3808 	}
3809 
3810 	return 0;
3811 }
3812 
mb_iso2022jpms_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)3813 static size_t mb_iso2022jpms_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
3814 {
3815 	unsigned char *p = *in, *e = p + *in_len;
3816 	uint32_t *out = buf, *limit = buf + bufsize;
3817 
3818 	while (p < e && out < limit) {
3819 		unsigned char c = *p++;
3820 
3821 		if (c == 0x1B) {
3822 			if ((e - p) < 2) {
3823 				*out++ = MBFL_BAD_INPUT;
3824 				p = e;
3825 				break;
3826 			}
3827 			unsigned char c2 = *p++;
3828 			unsigned char c3 = *p++;
3829 
3830 			if (c2 == '$') {
3831 				if (c3 == '@' || c3 == 'B') {
3832 					*state = JISX0208_KANJI;
3833 				} else if (c3 == '(' && p < e) {
3834 					unsigned char c4 = *p++;
3835 
3836 					if (c4 == '@' || c4 == 'B') {
3837 						*state = JISX0208_KANJI;
3838 					} else if (c4 == '?') {
3839 						*state = UDC;
3840 					} else {
3841 						*out++ = MBFL_BAD_INPUT;
3842 					}
3843 				} else {
3844 					*out++ = MBFL_BAD_INPUT;
3845 				}
3846 			} else if (c2 == '(') {
3847 				if (c3 == 'B' || c3 == 'J') {
3848 					*state = ASCII;
3849 				} else if (c3 == 'I') {
3850 					*state = JISX0201_KANA;
3851 				} else {
3852 					*out++ = MBFL_BAD_INPUT;
3853 				}
3854 			} else {
3855 				p--;
3856 				*out++ = MBFL_BAD_INPUT;
3857 			}
3858 		} else if (*state == JISX0201_KANA && c >= 0x21 && c <= 0x5F) {
3859 			*out++ = 0xFF40 + c;
3860 		} else if ((*state == JISX0208_KANJI || *state == UDC) && c >= 0x21 && c <= 0x7F) {
3861 			if (p == e) {
3862 				*out++ = MBFL_BAD_INPUT;
3863 				break;
3864 			}
3865 			unsigned char c2 = *p++;
3866 			unsigned int w = 0;
3867 
3868 			if (c2 >= 0x21 && c2 <= 0x7E) {
3869 				unsigned int s = ((c - 0x21) * 94) + c2 - 0x21;
3870 				if (*state == JISX0208_KANJI) {
3871 					if (s <= 137) {
3872 						if (s == 31) {
3873 							w = 0xFF3C; /* FULLWIDTH REVERSE SOLIDUS */
3874 						} else if (s == 32) {
3875 							w = 0xFF5E; /* FULLWIDTH TILDE */
3876 						} else if (s == 33) {
3877 							w = 0x2225; /* PARALLEL TO */
3878 						} else if (s == 60) {
3879 							w = 0xFF0D; /* FULLWIDTH HYPHEN-MINUS */
3880 						} else if (s == 80) {
3881 							w = 0xFFE0; /* FULLWIDTH CENT SIGN */
3882 						} else if (s == 81) {
3883 							w = 0xFFE1; /* FULLWIDTH POUND SIGN */
3884 						} else if (s == 137) {
3885 							w = 0xFFE2; /* FULLWIDTH NOT SIGN */
3886 						}
3887 					}
3888 
3889 					if (!w) {
3890 						if (s >= cp932ext1_ucs_table_min && s < cp932ext1_ucs_table_max) {
3891 							w = cp932ext1_ucs_table[s - cp932ext1_ucs_table_min];
3892 						} else if (s < jisx0208_ucs_table_size) {
3893 							w = jisx0208_ucs_table[s];
3894 						} else if (s >= cp932ext2_ucs_table_min && s < cp932ext2_ucs_table_max) {
3895 							w = cp932ext2_ucs_table[s - cp932ext2_ucs_table_min];
3896 						}
3897 					}
3898 				} else if (c >= 0x21 && c <= 0x34) {
3899 					w = 0xE000 + ((c - 0x21) * 94) + c2 - 0x21;
3900 				}
3901 
3902 				*out++ = w ? w : MBFL_BAD_INPUT;
3903 			} else {
3904 				*out++ = MBFL_BAD_INPUT;
3905 			}
3906 		} else if (c <= 0x7F) {
3907 			*out++ = c;
3908 		} else if (c >= 0xA1 && c <= 0xDF) {
3909 			*out++ = 0xFEC0 + c;
3910 		} else {
3911 			*out++ = MBFL_BAD_INPUT;
3912 		}
3913 	}
3914 
3915 	*in_len = e - p;
3916 	*in = p;
3917 	return out - buf;
3918 }
3919 
mb_wchar_to_iso2022jpms(uint32_t * in,size_t len,mb_convert_buf * buf,bool end)3920 static void mb_wchar_to_iso2022jpms(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
3921 {
3922 	unsigned char *out, *limit;
3923 	MB_CONVERT_BUF_LOAD(buf, out, limit);
3924 	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
3925 
3926 	while (len--) {
3927 		uint32_t w = *in++;
3928 		unsigned int s = 0;
3929 
3930 		if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) {
3931 			s = ucs_a1_jis_table[w - ucs_a1_jis_table_min];
3932 		} else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) {
3933 			s = ucs_a2_jis_table[w - ucs_a2_jis_table_min];
3934 		} else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) {
3935 			s = ucs_i_jis_table[w - ucs_i_jis_table_min];
3936 		} else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) {
3937 			s = ucs_r_jis_table[w - ucs_r_jis_table_min];
3938 		} else if (w >= 0xE000 && w < (0xE000 + 20*94)) {
3939 			/* Private User Area (95ku - 114ku) */
3940 			s = ((((w - 0xE000) / 94) + 0x7F) << 8) | (((w - 0xE000) % 94) + 0x21);
3941 		}
3942 
3943 		if (!s) {
3944 			if (w == 0xA5) { /* YEN SIGN */
3945 				s = 0x216F; /* FULLWIDTH YEN SIGN */
3946 			} else if (w == 0xFF3C) { /* FULLWIDTH REVERSE SOLIDUS */
3947 				s = 0x2140;
3948 			} else if (w == 0x2225) { /* PARALLEL TO */
3949 				s = 0x2142;
3950 			} else if (w == 0xFF0D) { /* FULLWIDTH HYPHEN-MINUS */
3951 				s = 0x215D;
3952 			} else if (w == 0xFFE0) { /* FULLWIDTH CENT SIGN */
3953 				s = 0x2171;
3954 			} else if (w == 0xFFE1) { /* FULLWIDTH POUND SIGN */
3955 				s = 0x2172;
3956 			} else if (w == 0xFFE2) { /* FULLWIDTH NOT SIGN */
3957 				s = 0x224C;
3958 			}
3959 		}
3960 
3961 		if (s >= 0xA1A1) /* JISX 0212 */
3962 			s = 0;
3963 
3964 		if (!s && w) {
3965 			for (int i = 0; i < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; i++) {
3966 				if (w == cp932ext1_ucs_table[i]) {
3967 					s = (((i / 94) + 0x2D) << 8) + (i % 94) + 0x21;
3968 					break;
3969 				}
3970 			}
3971 
3972 			if (!s) {
3973 				for (int i = 0; i < cp932ext3_ucs_table_max - cp932ext3_ucs_table_min; i++) {
3974 					if (w == cp932ext3_ucs_table[i]) {
3975 						s = cp932ext3_cp932ext2_jis(i);
3976 						break;
3977 					}
3978 				}
3979 			}
3980 		}
3981 
3982 		if (!s && w) {
3983 			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_iso2022jpms);
3984 			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
3985 		} else if (s <= 0x7F) {
3986 			if (buf->state != ASCII) {
3987 				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
3988 				out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
3989 				buf->state = ASCII;
3990 			}
3991 			out = mb_convert_buf_add(out, s);
3992 		} else if (s >= 0xA1 && s <= 0xDF) {
3993 			if (buf->state != JISX0201_KANA) {
3994 				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
3995 				out = mb_convert_buf_add3(out, 0x1B, '(', 'I');
3996 				buf->state = JISX0201_KANA;
3997 			}
3998 			out = mb_convert_buf_add(out, s & 0x7F);
3999 		} else if (s <= 0x7E7E) {
4000 			if (buf->state != JISX0208_KANJI) {
4001 				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 5);
4002 				out = mb_convert_buf_add3(out, 0x1B, '$', 'B');
4003 				buf->state = JISX0208_KANJI;
4004 			} else {
4005 				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
4006 			}
4007 			out = mb_convert_buf_add2(out, (s >> 8) & 0xFF, s & 0x7F);
4008 		} else if (s < 0x927F) {
4009 			if (buf->state != UDC) {
4010 				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 6);
4011 				out = mb_convert_buf_add4(out, 0x1B, '$', '(', '?');
4012 				buf->state = UDC;
4013 			} else {
4014 				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
4015 			}
4016 			out = mb_convert_buf_add2(out, ((s >> 8) - 0x5E) & 0x7F, s & 0x7F);
4017 		} else {
4018 			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_iso2022jpms);
4019 			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
4020 		}
4021 	}
4022 
4023 	if (end && buf->state != ASCII) {
4024 		MB_CONVERT_BUF_ENSURE(buf, out, limit, 3);
4025 		out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
4026 	}
4027 
4028 	MB_CONVERT_BUF_STORE(buf, out, limit);
4029 }
4030 
mbfl_filt_conv_2022kr_wchar(int c,mbfl_convert_filter * filter)4031 static int mbfl_filt_conv_2022kr_wchar(int c, mbfl_convert_filter *filter)
4032 {
4033 	int w = 0;
4034 
4035 	switch (filter->status & 0xf) {
4036 	/* case 0x00: ASCII */
4037 	/* case 0x10: KSC5601 */
4038 	case 0:
4039 		if (c == 0x1b) { /* ESC */
4040 			filter->status += 2;
4041 		} else if (c == 0x0f) { /* shift in (ASCII) */
4042 			filter->status = 0;
4043 		} else if (c == 0x0e) { /* shift out (KSC5601) */
4044 			filter->status = 0x10;
4045 		} else if ((filter->status & 0x10) && c > 0x20 && c < 0x7f) {
4046 			/* KSC5601 lead byte */
4047 			filter->cache = c;
4048 			filter->status = 0x11;
4049 		} else if ((filter->status & 0x10) == 0 && c >= 0 && c < 0x80) {
4050 			/* latin, CTLs */
4051 			CK((*filter->output_function)(c, filter->data));
4052 		} else {
4053 			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
4054 		}
4055 		break;
4056 
4057 	case 1: /* dbcs second byte */
4058 		filter->status = 0x10;
4059 		int c1 = filter->cache;
4060 		int flag = 0;
4061 
4062 		if (c1 > 0x20 && c1 < 0x47) {
4063 			flag = 1;
4064 		} else if (c1 >= 0x47 && c1 <= 0x7e && c1 != 0x49) {
4065 			flag = 2;
4066 		}
4067 
4068 		if (flag > 0 && c > 0x20 && c < 0x7f) {
4069 			if (flag == 1) {
4070 				if (c1 != 0x22 || c <= 0x65) {
4071 					w = (c1 - 1)*190 + (c - 0x41) + 0x80;
4072 					ZEND_ASSERT(w < uhc1_ucs_table_size);
4073 					w = uhc1_ucs_table[w];
4074 				}
4075 			} else {
4076 				w = (c1 - 0x47)*94 + c - 0x21;
4077 				if (w < uhc3_ucs_table_size) {
4078 					w = uhc3_ucs_table[w];
4079 				} else {
4080 					w = MBFL_BAD_INPUT;
4081 				}
4082 			}
4083 
4084 			if (w <= 0) {
4085 				w = MBFL_BAD_INPUT;
4086 			}
4087 			CK((*filter->output_function)(w, filter->data));
4088 		} else {
4089 			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
4090 		}
4091 		break;
4092 
4093 	case 2: /* ESC */
4094 		if (c == '$') {
4095 			filter->status++;
4096 		} else {
4097 			filter->status &= ~0xF;
4098 			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
4099 		}
4100 		break;
4101 
4102 	case 3: /* ESC $ */
4103 		if (c == ')') {
4104 			filter->status++;
4105 		} else {
4106 			filter->status &= ~0xF;
4107 			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
4108 		}
4109 		break;
4110 
4111 	case 4: /* ESC $ ) */
4112 		filter->status = 0;
4113 		if (c != 'C') {
4114 			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
4115 		}
4116 		break;
4117 
4118 		EMPTY_SWITCH_DEFAULT_CASE();
4119 	}
4120 
4121 	return 0;
4122 }
4123 
mbfl_filt_conv_2022kr_wchar_flush(mbfl_convert_filter * filter)4124 static int mbfl_filt_conv_2022kr_wchar_flush(mbfl_convert_filter *filter)
4125 {
4126 	if (filter->status & 0xF) {
4127 		/* 2-byte character was truncated */
4128 		CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
4129 	}
4130 	filter->status = 0;
4131 
4132 	if (filter->flush_function) {
4133 		(*filter->flush_function)(filter->data);
4134 	}
4135 
4136 	return 0;
4137 }
4138 
mbfl_filt_conv_wchar_2022kr(int c,mbfl_convert_filter * filter)4139 static int mbfl_filt_conv_wchar_2022kr(int c, mbfl_convert_filter *filter)
4140 {
4141 	int c1, c2, s = 0;
4142 
4143 	if ((filter->status & 0x100) == 0) {
4144 		CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
4145 		CK((*filter->output_function)('$', filter->data));
4146 		CK((*filter->output_function)(')', filter->data));
4147 		CK((*filter->output_function)('C', filter->data));
4148 		filter->status |= 0x100;
4149 	}
4150 
4151 	if (c >= ucs_a1_uhc_table_min && c < ucs_a1_uhc_table_max) {
4152 		s = ucs_a1_uhc_table[c - ucs_a1_uhc_table_min];
4153 	} else if (c >= ucs_a2_uhc_table_min && c < ucs_a2_uhc_table_max) {
4154 		s = ucs_a2_uhc_table[c - ucs_a2_uhc_table_min];
4155 	} else if (c >= ucs_a3_uhc_table_min && c < ucs_a3_uhc_table_max) {
4156 		s = ucs_a3_uhc_table[c - ucs_a3_uhc_table_min];
4157 	} else if (c >= ucs_i_uhc_table_min && c < ucs_i_uhc_table_max) {
4158 		s = ucs_i_uhc_table[c - ucs_i_uhc_table_min];
4159 	} else if (c >= ucs_s_uhc_table_min && c < ucs_s_uhc_table_max) {
4160 		s = ucs_s_uhc_table[c - ucs_s_uhc_table_min];
4161 	} else if (c >= ucs_r1_uhc_table_min && c < ucs_r1_uhc_table_max) {
4162 		s = ucs_r1_uhc_table[c - ucs_r1_uhc_table_min];
4163 	} else if (c >= ucs_r2_uhc_table_min && c < ucs_r2_uhc_table_max) {
4164 		s = ucs_r2_uhc_table[c - ucs_r2_uhc_table_min];
4165 	}
4166 
4167 	c1 = (s >> 8) & 0xff;
4168 	c2 = s & 0xff;
4169 	/* exclude UHC extension area */
4170 	if (c1 < 0xa1 || c2 < 0xa1) {
4171 		s = c;
4172 	} else if (s & 0x8000) {
4173 		s -= 0x8080;
4174 	}
4175 
4176 	if (s <= 0) {
4177 		if (c == 0) {
4178 			s = 0;
4179 		} else {
4180 			s = -1;
4181 		}
4182 	} else if ((s >= 0x80 && s < 0x2121) || (s > 0x8080)) {
4183 		s = -1;
4184 	}
4185 
4186 	if (s >= 0) {
4187 		if (s < 0x80 && s >= 0) { /* ASCII */
4188 			if (filter->status & 0x10) {
4189 				CK((*filter->output_function)(0x0f, filter->data)); /* shift in */
4190 				filter->status &= ~0x10;
4191 			}
4192 			CK((*filter->output_function)(s, filter->data));
4193 		} else {
4194 			if ((filter->status & 0x10) == 0) {
4195 				CK((*filter->output_function)(0x0e, filter->data)); /* shift out */
4196 				filter->status |= 0x10;
4197 			}
4198 			CK((*filter->output_function)((s >> 8) & 0xff, filter->data));
4199 			CK((*filter->output_function)(s & 0xff, filter->data));
4200 		}
4201 	} else {
4202 		CK(mbfl_filt_conv_illegal_output(c, filter));
4203 	}
4204 
4205 	return 0;
4206 }
4207 
mbfl_filt_conv_any_2022kr_flush(mbfl_convert_filter * filter)4208 static int mbfl_filt_conv_any_2022kr_flush(mbfl_convert_filter *filter)
4209 {
4210 	if (filter->status & 0xF) {
4211 		/* Escape sequence or 2-byte character was truncated */
4212 		(*filter->output_function)(MBFL_BAD_INPUT, filter->data);
4213 	}
4214 	/* back to ascii */
4215 	if (filter->status & 0x10) {
4216 		CK((*filter->output_function)(0x0f, filter->data)); /* shift in */
4217 	}
4218 
4219 	filter->status = filter->cache = 0;
4220 
4221 	if (filter->flush_function) {
4222 		return (*filter->flush_function)(filter->data);
4223 	}
4224 
4225 	return 0;
4226 }
4227 
4228 #define ASCII 0
4229 #define KSC5601 1
4230 
mb_iso2022kr_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)4231 static size_t mb_iso2022kr_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
4232 {
4233 	unsigned char *p = *in, *e = p + *in_len;
4234 	uint32_t *out = buf, *limit = buf + bufsize;
4235 
4236 	while (p < e && out < limit) {
4237 		unsigned char c = *p++;
4238 
4239 		if (c == 0x1B) {
4240 			if ((e - p) < 3) {
4241 				*out++ = MBFL_BAD_INPUT;
4242 				if (p < e && *p++ == '$') {
4243 					if (p < e) {
4244 						p++;
4245 					}
4246 				}
4247 				continue;
4248 			}
4249 			unsigned char c2 = *p++;
4250 			unsigned char c3 = *p++;
4251 			unsigned char c4 = *p++;
4252 			if (c2 == '$' && c3 == ')' && c4 == 'C') {
4253 				*state = ASCII;
4254 			} else {
4255 				if (c3 != ')') {
4256 					p--;
4257 					if (c2 != '$')
4258 						p--;
4259 				}
4260 				*out++ = MBFL_BAD_INPUT;
4261 			}
4262 		} else if (c == 0xF) {
4263 			*state = ASCII;
4264 		} else if (c == 0xE) {
4265 			*state = KSC5601;
4266 		} else if (c >= 0x21 && c <= 0x7E && *state == KSC5601) {
4267 			if (p == e) {
4268 				*out++ = MBFL_BAD_INPUT;
4269 				break;
4270 			}
4271 			unsigned char c2 = *p++;
4272 			unsigned int w = 0;
4273 
4274 			if (c2 < 0x21 || c2 > 0x7E) {
4275 				*out++ = MBFL_BAD_INPUT;
4276 				continue;
4277 			}
4278 
4279 			if (c < 0x47) {
4280 				if (c != 0x22 || c2 <= 0x65) {
4281 					w = (c - 1)*190 + c2 - 0x41 + 0x80;
4282 					ZEND_ASSERT(w < uhc1_ucs_table_size);
4283 					w = uhc1_ucs_table[w];
4284 				}
4285 			} else if (c != 0x49 && c <= 0x7D) {
4286 				w = (c - 0x47)*94 + c2 - 0x21;
4287 				ZEND_ASSERT(w < uhc3_ucs_table_size);
4288 				w = uhc3_ucs_table[w];
4289 			}
4290 
4291 			if (!w)
4292 				w = MBFL_BAD_INPUT;
4293 			*out++ = w;
4294 		} else if (c < 0x80 && *state == ASCII) {
4295 			*out++ = c;
4296 		} else {
4297 			*out++ = MBFL_BAD_INPUT;
4298 		}
4299 	}
4300 
4301 	*in_len = e - p;
4302 	*in = p;
4303 	return out - buf;
4304 }
4305 
/* Bit in buf->state recording that the ESC $ ) C designator has been written;
 * bit 0 of buf->state holds the current mode (ASCII or KSC5601) */
#define EMITTED_ESC_SEQUENCE 0x10

/* Encode `len` Unicode codepoints from `in` as ISO-2022-KR, appending the
 * resulting bytes to `buf`.
 *
 * Codepoints with no usable mapping are handed to MB_CONVERT_ERROR. When `end`
 * is true and the output is still in KSC5601 mode, a final Shift In byte is
 * appended.
 *
 * NOTE(review): MB_CONVERT_BUF_ENSURE presumably guarantees that at least the
 * given number of output bytes are available (possibly updating `out` and
 * `limit`); confirm against the macro definition. */
static void mb_wchar_to_iso2022kr(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);

	/* This escape sequence needs to come *somewhere* at the beginning of a line before
	 * we can use the Shift In/Shift Out bytes, but it only needs to come once in a string
	 * Rather than tracking newlines, we can just emit the sequence once at the beginning
	 * of the output string... since that will always be "the beginning of a line" */
	if (len && !(buf->state & EMITTED_ESC_SEQUENCE)) {
		MB_CONVERT_BUF_ENSURE(buf, out, limit, 4 + len);
		out = mb_convert_buf_add4(out, 0x1B, '$', ')', 'C');
		buf->state |= EMITTED_ESC_SEQUENCE;
	} else {
		MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
	}

	while (len--) {
		uint32_t w = *in++;
		unsigned int s = 0;

		/* Unicode -> UHC lookup */
		if (w >= ucs_a1_uhc_table_min && w < ucs_a1_uhc_table_max) {
			s = ucs_a1_uhc_table[w - ucs_a1_uhc_table_min];
		} else if (w >= ucs_a2_uhc_table_min && w < ucs_a2_uhc_table_max) {
			s = ucs_a2_uhc_table[w - ucs_a2_uhc_table_min];
		} else if (w >= ucs_a3_uhc_table_min && w < ucs_a3_uhc_table_max) {
			s = ucs_a3_uhc_table[w - ucs_a3_uhc_table_min];
		} else if (w >= ucs_i_uhc_table_min && w < ucs_i_uhc_table_max) {
			s = ucs_i_uhc_table[w - ucs_i_uhc_table_min];
		} else if (w >= ucs_s_uhc_table_min && w < ucs_s_uhc_table_max) {
			s = ucs_s_uhc_table[w - ucs_s_uhc_table_min];
		} else if (w >= ucs_r1_uhc_table_min && w < ucs_r1_uhc_table_max) {
			s = ucs_r1_uhc_table[w - ucs_r1_uhc_table_min];
		} else if (w >= ucs_r2_uhc_table_min && w < ucs_r2_uhc_table_max) {
			s = ucs_r2_uhc_table[w - ucs_r2_uhc_table_min];
		}

		/* Only codes with both bytes >= 0xA1 are usable; they are reduced to
		 * 7-bit bytes. Anything else falls back to the codepoint itself, which
		 * is then accepted below only if it is below 0x80 */
		if (((s >> 8) & 0xFF) < 0xA1 || (s & 0xFF) < 0xA1) {
			s = w;
		} else {
			s -= 0x8080;
		}

		if ((s >= 0x80 && s < 0x2121) || (s > 0x8080)) {
			/* Not representable */
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_iso2022kr);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		} else if (s < 0x80) {
			/* Single byte; emit Shift In first if currently in KSC5601 mode */
			if ((buf->state & 1) != ASCII) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
				out = mb_convert_buf_add(out, 0xF);
				buf->state &= ~KSC5601;
			}
			out = mb_convert_buf_add(out, s);
		} else {
			/* Two bytes; emit Shift Out first if currently in ASCII mode */
			if ((buf->state & 1) != KSC5601) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 3);
				out = mb_convert_buf_add(out, 0xE);
				buf->state |= KSC5601;
			} else {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			}
			out = mb_convert_buf_add2(out, (s >> 8) & 0xFF, s & 0xFF);
		}
	}

	/* At end of input, return to ASCII mode */
	if (end && (buf->state & 1) != ASCII) {
		MB_CONVERT_BUF_ENSURE(buf, out, limit, 1);
		out = mb_convert_buf_add(out, 0xF);
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
4380 
/* Filter vtable pairing mbfl_no_encoding_jis with mbfl_no_encoding_wchar,
 * using mbfl_filt_conv_jis_wchar and its flush routine.
 * NOTE(review): initializers are positional; the field order is defined by
 * struct mbfl_convert_vtbl, declared elsewhere. */
static const struct mbfl_convert_vtbl vtbl_jis_wchar = {
	mbfl_no_encoding_jis,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_jis_wchar,
	mbfl_filt_conv_jis_wchar_flush,
	NULL,
};

/* Filter vtable for the opposite direction (wchar paired with JIS) */
static const struct mbfl_convert_vtbl vtbl_wchar_jis = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_jis,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_jis,
	mbfl_filt_conv_any_jis_flush,
	NULL,
};

/* Encoding descriptor for "JIS". It reuses mb_iso2022jp_to_wchar for decoding
 * but has its own encoder (mb_wchar_to_jis) and checker (mb_check_jis).
 * NOTE(review): positional initializer; see the mbfl_encoding declaration for
 * the meaning of each slot. */
const mbfl_encoding mbfl_encoding_jis = {
	mbfl_no_encoding_jis,
	"JIS",
	"ISO-2022-JP",
	NULL,
	NULL,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_jis_wchar,
	&vtbl_wchar_jis,
	mb_iso2022jp_to_wchar,
	mb_wchar_to_jis,
	mb_check_jis,
	NULL,
};
4415 
/* Filter vtable pairing mbfl_no_encoding_2022jp with mbfl_no_encoding_wchar.
 * It uses the same mbfl_filt_conv_jis_wchar filter and flush routine as the
 * "JIS" vtable.
 * NOTE(review): initializers are positional; the field order is defined by
 * struct mbfl_convert_vtbl, declared elsewhere. */
static const struct mbfl_convert_vtbl vtbl_2022jp_wchar = {
	mbfl_no_encoding_2022jp,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_jis_wchar,
	mbfl_filt_conv_jis_wchar_flush,
	NULL,
};

/* Filter vtable for the opposite direction (wchar paired with ISO-2022-JP) */
static const struct mbfl_convert_vtbl vtbl_wchar_2022jp = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_2022jp,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_2022jp,
	mbfl_filt_conv_any_jis_flush,
	NULL,
};

/* Encoding descriptor for "ISO-2022-JP".
 * NOTE(review): positional initializer; see the mbfl_encoding declaration for
 * the meaning of each slot. */
const mbfl_encoding mbfl_encoding_2022jp = {
	mbfl_no_encoding_2022jp,
	"ISO-2022-JP",
	"ISO-2022-JP",
	NULL,
	NULL,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_2022jp_wchar,
	&vtbl_wchar_2022jp,
	mb_iso2022jp_to_wchar,
	mb_wchar_to_iso2022jp,
	mb_check_iso2022jp,
	NULL,
};
4450 
/* NULL-terminated list of alternative names for ISO-2022-JP-MOBILE#KDDI */
static const char *mbfl_encoding_2022jp_kddi_aliases[] = {"ISO-2022-JP-KDDI", NULL};

/* Filter vtable pairing mbfl_no_encoding_2022jp_kddi with
 * mbfl_no_encoding_wchar.
 * NOTE(review): initializers are positional; the field order is defined by
 * struct mbfl_convert_vtbl, declared elsewhere. */
static const struct mbfl_convert_vtbl vtbl_2022jp_kddi_wchar = {
	mbfl_no_encoding_2022jp_kddi,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_2022jp_mobile_wchar,
	mbfl_filt_conv_2022jp_mobile_wchar_flush,
	NULL,
};

/* Filter vtable for the opposite direction (wchar paired with the KDDI variant) */
static const struct mbfl_convert_vtbl vtbl_wchar_2022jp_kddi = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_2022jp_kddi,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_2022jp_mobile,
	mbfl_filt_conv_wchar_2022jp_mobile_flush,
	NULL,
};

/* Encoding descriptor for "ISO-2022-JP-MOBILE#KDDI"; unlike the JIS and
 * ISO-2022-JP descriptors, the slot after the two conversion functions is NULL.
 * NOTE(review): positional initializer; see the mbfl_encoding declaration for
 * the meaning of each slot. */
const mbfl_encoding mbfl_encoding_2022jp_kddi = {
	mbfl_no_encoding_2022jp_kddi,
	"ISO-2022-JP-MOBILE#KDDI",
	"ISO-2022-JP",
	mbfl_encoding_2022jp_kddi_aliases,
	NULL,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_2022jp_kddi_wchar,
	&vtbl_wchar_2022jp_kddi,
	mb_iso2022jp_kddi_to_wchar,
	mb_wchar_to_iso2022jp_kddi,
	NULL,
	NULL,
};
4487 
/* Filter vtable pairing mbfl_no_encoding_2022jp_2004 with
 * mbfl_no_encoding_wchar, using the jis2004 filter functions.
 * NOTE(review): initializers are positional; the field order is defined by
 * struct mbfl_convert_vtbl, declared elsewhere. */
static const struct mbfl_convert_vtbl vtbl_2022jp_2004_wchar = {
	mbfl_no_encoding_2022jp_2004,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_jis2004_wchar,
	mbfl_filt_conv_jis2004_wchar_flush,
	NULL,
};

/* Filter vtable for the opposite direction (wchar paired with ISO-2022-JP-2004) */
static const struct mbfl_convert_vtbl vtbl_wchar_2022jp_2004 = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_2022jp_2004,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_jis2004,
	mbfl_filt_conv_wchar_jis2004_flush,
	NULL,
};

/* Encoding descriptor for "ISO-2022-JP-2004".
 * NOTE(review): positional initializer; see the mbfl_encoding declaration for
 * the meaning of each slot. */
const mbfl_encoding mbfl_encoding_2022jp_2004 = {
	mbfl_no_encoding_2022jp_2004,
	"ISO-2022-JP-2004",
	"ISO-2022-JP-2004",
	NULL,
	NULL,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_2022jp_2004_wchar,
	&vtbl_wchar_2022jp_2004,
	mb_iso2022jp2004_to_wchar,
	mb_wchar_to_iso2022jp2004,
	NULL,
	NULL,
};
4522 
/* Previously, a dubious 'encoding' called 'cp50220raw' was supported.
 * This was just CP50220, but the implementation was less strict regarding
 * invalid characters; it would silently pass some through.
 * This 'encoding' only existed in mbstring. In case some poor, lost soul is
 * still using it, retain minimal support by aliasing it to CP50220.
 *
 * Further, mbstring also had a made-up encoding called "JIS-ms".
 * This was the same as CP5022{0,1,2}, but without their special ways of
 * handling conversion of Unicode half-width katakana. */
static const char *cp50220_aliases[] = {"cp50220raw", "cp50220-raw", "JIS-ms", NULL};

/* The three CP5022x -> wchar vtables below all use the same filter function,
 * mbfl_filt_conv_cp5022x_wchar, and the same flush routine.
 * NOTE(review): initializers are positional; the field order is defined by
 * struct mbfl_convert_vtbl, declared elsewhere. */
static const struct mbfl_convert_vtbl vtbl_cp50220_wchar = {
	mbfl_no_encoding_cp50220,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_cp5022x_wchar,
	mbfl_filt_conv_cp5022x_wchar_flush,
	NULL,
};

/* wchar paired with CP50220: dedicated filter and flush functions */
static const struct mbfl_convert_vtbl vtbl_wchar_cp50220 = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_cp50220,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_cp50220,
	mbfl_filt_conv_wchar_cp50220_flush,
	NULL,
};

static const struct mbfl_convert_vtbl vtbl_cp50221_wchar = {
	mbfl_no_encoding_cp50221,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_cp5022x_wchar,
	mbfl_filt_conv_cp5022x_wchar_flush,
	NULL,
};

/* wchar paired with CP50221: dedicated filter function, but the flush routine
 * is mbfl_filt_conv_any_jis_flush rather than a CP50221-specific one */
static const struct mbfl_convert_vtbl vtbl_wchar_cp50221 = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_cp50221,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_cp50221,
	mbfl_filt_conv_any_jis_flush,
	NULL,
};

static const struct mbfl_convert_vtbl vtbl_cp50222_wchar = {
	mbfl_no_encoding_cp50222,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_cp5022x_wchar,
	mbfl_filt_conv_cp5022x_wchar_flush,
	NULL,
};

/* wchar paired with CP50222: dedicated filter and flush functions */
static const struct mbfl_convert_vtbl vtbl_wchar_cp50222 = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_cp50222,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_cp50222,
	mbfl_filt_conv_wchar_cp50222_flush,
	NULL,
};

/* Encoding descriptors. All three share mb_cp5022x_to_wchar for decoding and
 * differ only in their encoder; only CP50220 carries an alias list.
 * NOTE(review): positional initializers; see the mbfl_encoding declaration
 * for the meaning of each slot. */
const mbfl_encoding mbfl_encoding_cp50220 = {
	mbfl_no_encoding_cp50220,
	"CP50220",
	"ISO-2022-JP",
	cp50220_aliases,
	NULL,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_cp50220_wchar,
	&vtbl_wchar_cp50220,
	mb_cp5022x_to_wchar,
	mb_wchar_to_cp50220,
	NULL,
	NULL,
};

const mbfl_encoding mbfl_encoding_cp50221 = {
	mbfl_no_encoding_cp50221,
	"CP50221",
	"ISO-2022-JP",
	NULL,
	NULL,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_cp50221_wchar,
	&vtbl_wchar_cp50221,
	mb_cp5022x_to_wchar,
	mb_wchar_to_cp50221,
	NULL,
	NULL,
};

const mbfl_encoding mbfl_encoding_cp50222 = {
	mbfl_no_encoding_cp50222,
	"CP50222",
	"ISO-2022-JP",
	NULL,
	NULL,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_cp50222_wchar,
	&vtbl_wchar_cp50222,
	mb_cp5022x_to_wchar,
	mb_wchar_to_cp50222,
	NULL,
	NULL,
};
4638 
/* NULL-terminated list of alternative names for ISO-2022-JP-MS */
static const char *mbfl_encoding_2022jpms_aliases[] = {"ISO2022JPMS", NULL};

/* Filter vtable pairing mbfl_no_encoding_2022jpms with mbfl_no_encoding_wchar.
 * NOTE(review): initializers are positional; the field order is defined by
 * struct mbfl_convert_vtbl, declared elsewhere. */
static const struct mbfl_convert_vtbl vtbl_2022jpms_wchar = {
	mbfl_no_encoding_2022jpms,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_2022jpms_wchar,
	mbfl_filt_conv_2022jpms_wchar_flush,
	NULL,
};

/* Filter vtable for the opposite direction (wchar paired with ISO-2022-JP-MS) */
static const struct mbfl_convert_vtbl vtbl_wchar_2022jpms = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_2022jpms,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_2022jpms,
	mbfl_filt_conv_any_2022jpms_flush,
	NULL,
};

/* Encoding descriptor for "ISO-2022-JP-MS".
 * NOTE(review): positional initializer; see the mbfl_encoding declaration for
 * the meaning of each slot. */
const mbfl_encoding mbfl_encoding_2022jpms = {
	mbfl_no_encoding_2022jpms,
	"ISO-2022-JP-MS",
	"ISO-2022-JP",
	mbfl_encoding_2022jpms_aliases,
	NULL,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_2022jpms_wchar,
	&vtbl_wchar_2022jpms,
	mb_iso2022jpms_to_wchar,
	mb_wchar_to_iso2022jpms,
	NULL,
	NULL,
};
4675 
/* ISO-2022-KR is defined in RFC 1557
 *
 * The RFC says that ESC $ ) C must appear once in an ISO-2022-KR string,
 * at the beginning of a line, before any instances of the Shift In or
 * Shift Out bytes which are used to switch between ASCII/KSC 5601 modes
 *
 * We don't enforce that for ISO-2022-KR input */

/* Filter vtable pairing mbfl_no_encoding_wchar with mbfl_no_encoding_2022kr.
 * NOTE(review): initializers are positional; the field order is defined by
 * struct mbfl_convert_vtbl, declared elsewhere. */
static const struct mbfl_convert_vtbl vtbl_wchar_2022kr = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_2022kr,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_2022kr,
	mbfl_filt_conv_any_2022kr_flush,
	NULL,
};

/* Filter vtable for the opposite direction (ISO-2022-KR paired with wchar) */
static const struct mbfl_convert_vtbl vtbl_2022kr_wchar = {
	mbfl_no_encoding_2022kr,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_2022kr_wchar,
	mbfl_filt_conv_2022kr_wchar_flush,
	NULL,
};

/* Encoding descriptor for "ISO-2022-KR".
 * NOTE(review): positional initializer; see the mbfl_encoding declaration for
 * the meaning of each slot. */
const mbfl_encoding mbfl_encoding_2022kr = {
	mbfl_no_encoding_2022kr,
	"ISO-2022-KR",
	"ISO-2022-KR",
	NULL,
	NULL,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_2022kr_wchar,
	&vtbl_wchar_2022kr,
	mb_iso2022kr_to_wchar,
	mb_wchar_to_iso2022kr,
	NULL,
	NULL,
};
4718 
4719 /*
4720  * SJIS variants
4721  */
4722 
mbfl_filt_conv_sjis_wchar(int c,mbfl_convert_filter * filter)4723 static int mbfl_filt_conv_sjis_wchar(int c, mbfl_convert_filter *filter)
4724 {
4725 	int s1, s2, w;
4726 
4727 	switch (filter->status) {
4728 	case 0:
4729 		if (c >= 0 && c < 0x80) { /* ASCII */
4730 			CK((*filter->output_function)(c, filter->data));
4731 		} else if (c > 0xA0 && c < 0xE0) { /* Kana */
4732 			CK((*filter->output_function)(0xFEC0 + c, filter->data));
4733 		} else if (c > 0x80 && c < 0xF0 && c != 0xA0) { /* Kanji, first byte */
4734 			filter->status = 1;
4735 			filter->cache = c;
4736 		} else {
4737 			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
4738 		}
4739 		break;
4740 
4741 	case 1: /* Kanji, second byte */
4742 		filter->status = 0;
4743 		int c1 = filter->cache;
4744 		if (c >= 0x40 && c <= 0xFC && c != 0x7F) {
4745 			SJIS_DECODE(c1, c, s1, s2);
4746 			w = (s1 - 0x21)*94 + s2 - 0x21;
4747 			if (w >= 0 && w < jisx0208_ucs_table_size) {
4748 				w = jisx0208_ucs_table[w];
4749 				if (!w)
4750 					w = MBFL_BAD_INPUT;
4751 			} else {
4752 				w = MBFL_BAD_INPUT;
4753 			}
4754 			CK((*filter->output_function)(w, filter->data));
4755 		} else {
4756 			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
4757 		}
4758 	}
4759 
4760 	return 0;
4761 }
4762 
/* End-of-input handler for the SJIS decoder. If the filter stopped in any
 * state other than 0 or 4, an MBFL_BAD_INPUT marker is emitted. The state is
 * then reset and the downstream flush function (if any) is invoked.
 * Always returns 0; the results of the callbacks are not inspected. */
static int mbfl_filt_conv_sjis_wchar_flush(mbfl_convert_filter *filter)
{
	switch (filter->status) {
	case 0:
	case 4:
		/* Nothing to report */
		break;
	default:
		(*filter->output_function)(MBFL_BAD_INPUT, filter->data);
		break;
	}
	filter->status = 0;

	if (filter->flush_function != NULL) {
		(*filter->flush_function)(filter->data);
	}

	return 0;
}
4776 
/* Streaming SJIS encoder: converts one Unicode codepoint `c` per call and
 * sends the resulting byte(s) to filter->output_function. Codepoints with no
 * SJIS representation are passed to mbfl_filt_conv_illegal_output. */
static int mbfl_filt_conv_wchar_sjis(int c, mbfl_convert_filter *filter)
{
	int jis = 0;

	/* Look the codepoint up in whichever Unicode -> JIS table covers it */
	if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
		jis = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
	} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
		jis = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
	} else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
		jis = ucs_i_jis_table[c - ucs_i_jis_table_min];
	} else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
		jis = ucs_r_jis_table[c - ucs_r_jis_table_min];
	}

	if (jis >= 0x8080) {
		/* JIS X 0212; not supported */
		jis = -1;
	} else if (jis <= 0) {
		/* No table entry: try the handful of hard-coded mappings */
		switch (c) {
		case 0xA5: /* YEN SIGN -> FULLWIDTH YEN SIGN */
			jis = 0x216F;
			break;
		case 0xAF:   /* MACRON */
		case 0x203E: /* OVERLINE */
			jis = 0x2131; /* FULLWIDTH MACRON */
			break;
		case 0xFF3C: /* FULLWIDTH REVERSE SOLIDUS */
			jis = 0x2140;
			break;
		case 0x2225: /* PARALLEL TO */
			jis = 0x2142;
			break;
		case 0xFF0D: /* FULLWIDTH HYPHEN-MINUS */
			jis = 0x215D;
			break;
		case 0xFFE0: /* FULLWIDTH CENT SIGN */
			jis = 0x2171;
			break;
		case 0xFFE1: /* FULLWIDTH POUND SIGN */
			jis = 0x2172;
			break;
		case 0xFFE2: /* FULLWIDTH NOT SIGN */
			jis = 0x224C;
			break;
		case 0:
			jis = 0;
			break;
		default:
			jis = -1;
			break;
		}
	}

	if (jis < 0) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
		return 0;
	}

	if (jis < 0x100) {
		/* Latin/Kana: a single byte */
		CK((*filter->output_function)(jis, filter->data));
		return 0;
	}

	/* Kanji: split into the two JIS bytes, then re-encode as two SJIS bytes */
	int row = (jis >> 8) & 0xFF;
	int cell = jis & 0xFF;
	int b1, b2;
	SJIS_ENCODE(row, cell, b1, b2);
	CK((*filter->output_function)(b1, filter->data));
	CK((*filter->output_function)(b2, filter->data));

	return 0;
}
4832 
4833 static const unsigned short sjis_decode_tbl1[] = {
4834 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFFFF, 0, 188, 376, 564, 752, 940, 1128, 1316, 1504, 1692, 1880, 2068, 2256, 2444, 2632, 2820, 3008, 3196, 3384, 3572, 3760, 3948, 4136, 4324, 4512, 4700, 4888, 5076, 5264, 5452, 5640, 0xFFFF, -6016, -5828, -5640, -5452, -5264, -5076, -4888, -4700, -4512, -4324, -4136, -3948, -3760, -3572, -3384, -3196, -3008, -2820, -2632, -2444, -2256, -2068, -1880, -1692, -1504, -1316, -1128, -940, -752, -564, -376, -188, 0, 188, 376, 564, 752, 940, 1128, 1316, 1504, 1692, 1880, 2068, 2256, 2444, 2632, 2820, 3008, 3196, 3384, 3572, 3760, 3948, 4136, 4324, 4512, 4700, 4888, 5076, 5264, 5452, 5640, 5828, 6016, 6204, 6392, 6580, 6768, 6956, 7144, 7332, 7520, 7708, 7896, 8084, 8272, 8460, 8648, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF
4835 };
4836 
4837 static const unsigned short sjis_decode_tbl2[] = {
4838 	0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 0xFFFF, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 0xFFFF, 0xFFFF, 0xFFFF
4839 };
4840 
mb_sjis_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)4841 static size_t mb_sjis_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
4842 {
4843 	unsigned char *p = *in, *e = p + *in_len;
4844 	uint32_t *out = buf, *limit = buf + bufsize;
4845 
4846 	e--; /* Stop the main loop 1 byte short of the end of the input */
4847 
4848 	while (p < e && out < limit) {
4849 		unsigned char c = *p++;
4850 
4851 		if (c <= 0x7F) {
4852 			*out++ = c;
4853 		} else if (c >= 0xA1 && c <= 0xDF) { /* Kana */
4854 			*out++ = 0xFEC0 + c;
4855 		} else {
4856 			/* Don't need to check p < e; it's not possible to go out of bounds here, due to e-- above */
4857 			unsigned char c2 = *p++;
4858 			/* This is only legal if c2 >= 0x40 && c2 <= 0xFC && c2 != 0x7F
4859 			 * But the values in the above conversion tables have been chosen such that
4860 			 * illegal values of c2 will always result in w > jisx0208_ucs_table_size,
4861 			 * so we don't need to do a separate bounds check on c2
4862 			 * Likewise, the values in the conversion tables are such that illegal values
4863 			 * for c will always result in w > jisx0208_ucs_table_size */
4864 			uint32_t w = sjis_decode_tbl1[c] + sjis_decode_tbl2[c2];
4865 			if (w < jisx0208_ucs_table_size) {
4866 				w = jisx0208_ucs_table[w];
4867 				if (!w)
4868 					w = MBFL_BAD_INPUT;
4869 				*out++ = w;
4870 			} else {
4871 				if (c == 0x80 || c == 0xA0 || c > 0xEF) {
4872 					p--;
4873 				}
4874 				*out++ = MBFL_BAD_INPUT;
4875 			}
4876 		}
4877 	}
4878 
4879 	/* Finish up last byte of input string if there is one */
4880 	if (p == e && out < limit) {
4881 		unsigned char c = *p++;
4882 		if (c <= 0x7F) {
4883 			*out++ = c;
4884 		} else if (c >= 0xA1 && c <= 0xDF) {
4885 			*out++ = 0xFEC0 + c;
4886 		} else {
4887 			*out++ = MBFL_BAD_INPUT;
4888 		}
4889 	}
4890 
4891 	*in_len = e - p + 1;
4892 	*in = p;
4893 	return out - buf;
4894 }
4895 
/* Buffer-based SJIS encoder.
 *
 * Converts `len` codepoints from `in` and appends the resulting bytes to
 * `buf`. Codepoints with no SJIS representation are reported through
 * MB_CONVERT_ERROR. `end` is not used by this encoding. */
static void mb_wchar_to_sjis(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	/* At most two bytes are produced per codepoint */
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);

	while (len--) {
		uint32_t w = *in++;
		unsigned int jis = 0;
		bool unmappable = false;

		/* Look the codepoint up in whichever Unicode -> JIS table covers it */
		if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) {
			jis = ucs_a1_jis_table[w - ucs_a1_jis_table_min];
		} else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) {
			jis = ucs_a2_jis_table[w - ucs_a2_jis_table_min];
		} else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) {
			jis = ucs_i_jis_table[w - ucs_i_jis_table_min];
		} else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) {
			jis = ucs_r_jis_table[w - ucs_r_jis_table_min];
		}

		if (jis >= 0x8080) {
			/* JIS X 0212; not supported */
			unmappable = true;
		} else if (jis == 0) {
			/* No table entry: try the handful of hard-coded mappings */
			switch (w) {
			case 0xA5: /* YEN SIGN -> FULLWIDTH YEN SIGN */
				jis = 0x216F;
				break;
			case 0xAF:
			case 0x203E:
				jis = 0x2131; /* FULLWIDTH MACRON */
				break;
			case 0xFF3C: /* FULLWIDTH REVERSE SOLIDUS */
				jis = 0x2140;
				break;
			case 0x2225: /* PARALLEL TO */
				jis = 0x2142;
				break;
			case 0xFF0D: /* FULLWIDTH HYPHEN-MINUS */
				jis = 0x215D;
				break;
			case 0xFFE0: /* FULLWIDTH CENT SIGN */
				jis = 0x2171;
				break;
			case 0xFFE1: /* FULLWIDTH POUND SIGN */
				jis = 0x2172;
				break;
			case 0xFFE2: /* FULLWIDTH NOT SIGN */
				jis = 0x224C;
				break;
			case 0:
				/* U+0000 is emitted as a zero byte */
				break;
			default:
				unmappable = true;
				break;
			}
		}

		if (unmappable) {
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjis);
			/* Make sure there is still room for the remaining codepoints */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
			continue;
		}

		if (jis > 0xFF) {
			/* Kanji: split into the two JIS bytes, re-encode as two SJIS bytes */
			unsigned int row = (jis >> 8) & 0xFF, cell = jis & 0xFF;
			unsigned int b1, b2;
			SJIS_ENCODE(row, cell, b1, b2);
			out = mb_convert_buf_add2(out, b1, b2);
		} else {
			/* Latin/Kana */
			out = mb_convert_buf_add(out, jis);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
4957 
/* Streaming SJIS-Mac decoder: takes one input byte `c` per call and hands the
 * decoded codepoint(s) to filter->output_function. One 2-byte character may
 * decode to several codepoints.
 *
 * filter->status == 0: expecting the start of a character
 * filter->status == 1: the first byte of a 2-byte character is in filter->cache
 *
 * Undecodable input is reported as MBFL_BAD_INPUT. */
static int mbfl_filt_conv_sjis_mac_wchar(int c, mbfl_convert_filter *filter)
{
	int i, j, n;
	int c1, s, s1, s2, w;

	switch (filter->status) {
	case 0:
		if (c >= 0 && c < 0x80 && c != 0x5c) {	/* latin */
			CK((*filter->output_function)(c, filter->data));
		} else if (c > 0xa0 && c < 0xe0) {	/* kana */
			CK((*filter->output_function)(0xfec0 + c, filter->data));
		} else if (c > 0x80 && c <= 0xed && c != 0xa0) {	/* kanji first char */
			filter->status = 1;
			filter->cache = c;
		} else if (c == 0x5c) {
			/* byte 0x5c decodes to U+00A5 YEN SIGN */
			CK((*filter->output_function)(0x00a5, filter->data));
		} else if (c == 0x80) {
			/* byte 0x80 decodes to U+005C REVERSE SOLIDUS */
			CK((*filter->output_function)(0x005c, filter->data));
		} else if (c == 0xa0) {
			/* U+00A0 NO-BREAK SPACE */
			CK((*filter->output_function)(0x00a0, filter->data));
		} else if (c == 0xfd) {
			/* U+00A9 COPYRIGHT SIGN */
			CK((*filter->output_function)(0x00a9, filter->data));
		} else if (c == 0xfe) {
			/* U+2122 TRADE MARK SIGN */
			CK((*filter->output_function)(0x2122, filter->data));
		} else if (c == 0xff) {
			/* Two codepoints: U+2026 HORIZONTAL ELLIPSIS followed by the
			 * private-use codepoint U+F87F */
			CK((*filter->output_function)(0x2026, filter->data));
			CK((*filter->output_function)(0xf87f, filter->data));
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 1:		/* kanji second char */
		filter->status = 0;
		c1 = filter->cache;
		if (c >= 0x40 && c <= 0xfc && c != 0x7f) {
			/* w == 0 means "no mapping found yet"; each lookup stage below
			 * runs only while that is still the case */
			w = 0;
			SJIS_DECODE(c1, c, s1, s2);
			/* s is the linear index (row*94 + cell) of the character */
			s = (s1 - 0x21)*94 + s2 - 0x21;
			/* Characters which this encoding maps differently from the
			 * generic jisx0208_ucs_table lookup further below */
			if (s <= 0x89) {
				if (s == 0x1c) {
					w = 0x2014;		    /* EM DASH */
				} else if (s == 0x1f) {
					w = 0xff3c;			/* FULLWIDTH REVERSE SOLIDUS */
				} else if (s == 0x20) {
					w = 0x301c;			/* WAVE DASH */
				} else if (s == 0x21) {
					w = 0x2016;			/* DOUBLE VERTICAL LINE */
				} else if (s == 0x3c) {
					w = 0x2212;			/* MINUS SIGN */
				} else if (s == 0x50) {
					w = 0x00a2;			/* CENT SIGN */
				} else if (s == 0x51) {
					w = 0x00a3;			/* POUND SIGN */
				} else if (s == 0x89) {
					w = 0x00ac;			/* NOT SIGN */
				}
			}

			/* apple gaiji area 0x8540 - 0x886d */
			/* code_tbl rows: [0]..[1] is an inclusive range of indexes,
			 * [2] is the codepoint for the first index of the range */
			if (w == 0) {
				for (i=0; i<7; i++) {
					if (s >= code_tbl[i][0] && s <= code_tbl[i][1]) {
						w = s - code_tbl[i][0] + code_tbl[i][2];
						break;
					}
				}
			}

			/* Characters which decode to a sequence of codepoints.
			 * code_tbl_m rows: [0] is the index, [1] is the leading codepoint
			 * of the sequence (0xf860, 0xf861 or another value) and selects
			 * the sequence length; [1]..[n-1] are the codepoints. All but the
			 * last are emitted here; the last is emitted via w below. */
			if (w == 0) {

				for (i=0; i<code_tbl_m_len; i++) {
					if (s == code_tbl_m[i][0]) {
						if (code_tbl_m[i][1] == 0xf860) {
							n = 4;
						} else if (code_tbl_m[i][1] == 0xf861) {
							n = 5;
						} else {
							n = 6;
						}
						for (j=1; j<n-1; j++) {
							CK((*filter->output_function)(code_tbl_m[i][j], filter->data));
						}
						w = code_tbl_m[i][n-1];
						break;
					}
				}
			}

			/* Per-range mapping arrays: code_map[i] covers the inclusive index
			 * range code_ofst_tbl[i][0]..[1]; a zero entry means "unmapped" */
			if (w == 0) {
				for (i=0; i<8; i++) {
					if (s >= code_ofst_tbl[i][0] && s <= code_ofst_tbl[i][1]) {
						w = code_map[i][s - code_ofst_tbl[i][0]];
						if (w == 0) {
							CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
							return 0;
						}
						/* s2 is reused here: a non-zero value is an extra
						 * codepoint to emit AFTER the mapped one */
						s2 = 0;
						if (s >= 0x043e && s <= 0x0441) {
							s2 = 0xf87a;
						} else if (s == 0x03b1 || s == 0x03b7) {
							s2 = 0xf87f;
						} else if (s == 0x04b8 || s == 0x04b9 || s == 0x04c4) {
							s2 = 0x20dd;
						} else if (s == 0x1ed9 || s == 0x1eda || s == 0x1ee8 || s == 0x1ef3 ||
								   (s >= 0x1ef5 && s <= 0x1efb) || s == 0x1f05 || s == 0x1f06 ||
								   s == 0x1f18 || (s >= 0x1ff2 && s <= 0x20a5)) {
							s2 = 0xf87e;
						}
						if (s2 > 0) {
							CK((*filter->output_function)(w, filter->data));
							w = s2;
						}
						break;
					}
				}
			}

			if (w == 0 && s >= 0 && s < jisx0208_ucs_table_size) {	/* X 0208 */
				w = jisx0208_ucs_table[s];
			}

			/* Nothing matched in any table */
			if (w <= 0) {
				w = MBFL_BAD_INPUT;
			}
			CK((*filter->output_function)(w, filter->data));
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

		EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
5094 
/* Streaming SJIS-Mac encoder: takes one Unicode codepoint `c` per call and
 * sends the resulting byte(s) to filter->output_function. Some output
 * characters correspond to a sequence of codepoints, so codepoints may be
 * buffered in filter->cache across calls.
 *
 * filter->status values:
 *   0: no codepoint is buffered
 *   1: filter->cache holds a codepoint found in s_form_tbl; the next codepoint
 *      decides whether the pair has a combined form
 *      (0xf87a / 0x20dd / 0xf87f / 0xf87e)
 *   2: filter->cache holds one of 0xf860 / 0xf861 / 0xf862
 *   3: the first codepoint after 0xf860..0xf862 was accepted; filter->cache
 *      holds it in the low 16 bits plus a mode flag (0x10000 for 0xf860,
 *      0x20000 for 0xf861, 0x40000 for 0xf862)
 *   4: as 3, after the second codepoint was accepted (modes 0x2 and 0x4 only)
 *   5: as 3, after the third codepoint was accepted (mode 0x4 only)
 *
 * Codepoints which cannot be encoded go to mbfl_filt_conv_illegal_output. */
static int mbfl_filt_conv_wchar_sjis_mac(int c, mbfl_convert_filter *filter)
{
	int i, c1, c2, s1 = 0, s2 = 0, mode;

	// a1: U+0000 -> U+046F
	// a2: U+2000 -> U+30FF
	//  i: U+4E00 -> U+9FFF
	//  r: U+FF00 -> U+FFFF

	switch (filter->status) {
	case 1:
		c1 = filter->cache;
		filter->cache = filter->status = 0;

		/* Each combining codepoint applies to its own slice of s_form_tbl:
		 * [0..33] for 0xf87e, [34..36] for 0xf87f, [37..39] for 0x20dd and
		 * [40..43] for 0xf87a. A non-zero s2 below means "no combined form
		 * was found; fall back to encoding the buffered codepoint by itself" */
		if (c == 0xf87a) {
			for (i = 0; i < 4; i++) {
				if (c1 == s_form_tbl[i+34+3+3]) {
					s1 = s_form_sjis_tbl[i+34+3+3];
					break;
				}
			}
			if (s1 <= 0) {
				s2 = c1;
			}
		} else if (c == 0x20dd) {
			for (i = 0; i < 3; i++) {
				if (c1 == s_form_tbl[i+34+3]) {
					s1 = s_form_sjis_tbl[i+34+3];
					break;
				}
			}
			if (s1 <= 0) {
				s2 = c1;
			}
		} else if (c == 0xf87f) {
			for (i = 0; i < 3; i++) {
				if (c1 == s_form_tbl[i+34]) {
					s1 = s_form_sjis_tbl[i+34];
					break;
				}
			}
			if (s1 <= 0) {
				s2 = c1;
				s1 = -1;
			}
		} else if (c == 0xf87e) {
			for (i = 0; i < 34; i++) {
				if (c1 == s_form_tbl[i]) {
					s1 = s_form_sjis_tbl[i];
					break;
				}
			}
			if (s1 <= 0) {
				s2 = c1;
				s1 = -1;
			}
		} else {
			/* `c` is not a combining codepoint at all */
			s2 = c1;
			s1 = c;
		}

		/* Fallback: encode the buffered codepoint on its own. When a table
		 * row matches, this overwrites whatever s1 was set to above. */
		if (s2 > 0) {
			for (i = 0; i < s_form_tbl_len; i++) {
				if (c1 == s_form_tbl[i]) {
					s1 = s_form_sjis_fallback_tbl[i];
					break;
				}
			}
		}

		/* The values from s_form_sjis_tbl / s_form_sjis_fallback_tbl are
		 * written out as-is, high byte first (no SJIS_ENCODE step) */
		if (s1 >= 0) {
			if (s1 < 0x100) {
				CK((*filter->output_function)(s1, filter->data));
			} else {
				CK((*filter->output_function)((s1 >> 8) & 0xff, filter->data));
				CK((*filter->output_function)(s1 & 0xff, filter->data));
			}
		} else {
			CK(mbfl_filt_conv_illegal_output(c, filter));
		}

		/* Done if `c` was consumed as part of a combined form, or was
		 * reported as illegal. Otherwise only the buffered codepoint has been
		 * written so far, and `c` itself is processed by case 0 below. */
		if (s2 <= 0 || s1 == -1) {
			break;
		}
		s1 = s2 = 0;
		ZEND_FALLTHROUGH;

	case 0:
		/* Table lookup, with a few SJIS-Mac specific overrides */
		if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
			s1 = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
			if (c == 0x5c) {
				s1 = 0x80;
			} else if (c == 0xa9) {
				s1 = 0xfd;
			}
		} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
			s1 = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
			if (c == 0x2122) {
				s1 = 0xfe;
			} else if (c == 0x2014) {
				s1 = 0x213d;
			} else if (c == 0x2116) {
				s1 = 0x2c1d;
			}
		} else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
			s1 = ucs_i_jis_table[c - ucs_i_jis_table_min];
		} else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
			s1 = ucs_r_jis_table[c - ucs_r_jis_table_min];
		}

		/* Codepoints which may start a multi-codepoint sequence are buffered
		 * instead of being written immediately */
		if (c >= 0x2000) {
			for (i = 0; i < s_form_tbl_len; i++) {
				if (c == s_form_tbl[i]) {
					filter->status = 1;
					filter->cache = c;
					return 0;
				}
			}

			if (c == 0xf860 || c == 0xf861 || c == 0xf862) {
				/* Apple 'transcoding hint' codepoints (from private use area) */
				filter->status = 2;
				filter->cache = c;
				return 0;
			}
		}

		if (s1 <= 0) {
			if (c == 0xa0) {
				s1 = 0x00a0;
			} else if (c == 0xa5) { /* YEN SIGN */
				/* Unicode has codepoint 0xFFE5 for a fullwidth Yen sign;
				 * convert codepoint 0xA5 to halfwidth Yen sign */
				s1 = 0x5c; /* HALFWIDTH YEN SIGN */
			} else if (c == 0xff3c) {	/* FULLWIDTH REVERSE SOLIDUS */
				s1 = 0x2140;
			}
		}

		/* Still nothing: consult the SJIS-Mac specific reverse tables. These
		 * yield a linear index (row*94 + cell), which is converted to a
		 * two-byte JIS value afterwards. */
		if (s1 <= 0) {
			for (i=0; i<wchar2sjis_mac_r_tbl_len; i++) {
				if (c >= wchar2sjis_mac_r_tbl[i][0] && c <= wchar2sjis_mac_r_tbl[i][1]) {
					s1 = c - wchar2sjis_mac_r_tbl[i][0] + wchar2sjis_mac_r_tbl[i][2];
					break;
				}
			}

			if (s1 <= 0) {
				for (i=0; i<wchar2sjis_mac_r_map_len; i++) {
					if (c >= wchar2sjis_mac_r_map[i][0] && c <= wchar2sjis_mac_r_map[i][1]) {
						s1 = wchar2sjis_mac_code_map[i][c-wchar2sjis_mac_r_map[i][0]];
						break;
					}
				}
			}

			if (s1 <= 0) {
				for (i=0; i<wchar2sjis_mac_wchar_tbl_len ; i++) {
					if ( c == wchar2sjis_mac_wchar_tbl[i][0]) {
						s1 = wchar2sjis_mac_wchar_tbl[i][1] & 0xffff;
						break;
					}
				}
			}

			if (s1 > 0) {
				c1 = s1/94+0x21;
				c2 = s1-94*(c1-0x21)+0x21;
				s1 = (c1 << 8) | c2;
				/* s2 is used as a flag here: it exempts this value from the
				 * "s1 >= 0x8080" rejection just below */
				s2 = 1;
			}
		}

		if ((s1 <= 0) || (s1 >= 0x8080 && s2 == 0)) {	/* not found or X 0212 */
			s1 = -1;
			c1 = 0;

			/* U+0000 is encodable: it is written as a zero byte */
			if (c == 0) {
				s1 = 0;
			} else if (s1 <= 0) {
				s1 = -1;
			}
		}

		if (s1 >= 0) {
			if (s1 < 0x100) { /* latin or kana */
				CK((*filter->output_function)(s1, filter->data));
			} else { /* kanji */
				c1 = (s1 >> 8) & 0xff;
				c2 = s1 & 0xff;
				SJIS_ENCODE(c1, c2, s1, s2);
				CK((*filter->output_function)(s1, filter->data));
				CK((*filter->output_function)(s2, filter->data));
			}
		} else {
			CK(mbfl_filt_conv_illegal_output(c, filter));
		}
		break;

	case 2:
		/* `c` is the first codepoint after a transcoding hint. It must match
		 * column [2] of one of the code_tbl_m rows belonging to that hint:
		 * rows 0..4 for 0xf860, rows 5..7 for 0xf861, rows 8..11 for 0xf862. */
		c1 = filter->cache;
		filter->cache = 0;
		filter->status = 0;
		if (c1 == 0xf860) {
			for (i = 0; i < 5; i++) {
				if (c == code_tbl_m[i][2]) {
					filter->cache = c | 0x10000;
					filter->status = 3;
					break;
				}
			}
		} else if (c1 == 0xf861) {
			for (i = 0; i < 3; i++) {
				if (c == code_tbl_m[i+5][2]) {
					filter->cache = c | 0x20000;
					filter->status = 3;
					break;
				}
			}
		} else if (c1 == 0xf862) {
			for (i = 0; i < 4; i++) {
				if (c == code_tbl_m[i+5+3][2]) {
					filter->cache = c | 0x40000;
					filter->status = 3;
					break;
				}
			}
		}

		if (filter->status == 0) {
			/* Didn't find any of expected codepoints after Apple transcoding hint */
			CK(mbfl_filt_conv_illegal_output(c1, filter));
			/* The hint has been reported; now process `c` as a fresh
			 * codepoint (status is 0 again at this point) */
			return mbfl_filt_conv_wchar_sjis_mac(c, filter);
		}
		break;

	case 3:
		/* `c` is the second codepoint after the hint; c1 is the first one */
		s1 = 0;
		c1 = filter->cache & 0xffff;
		mode = (filter->cache & 0xf0000) >> 16;

		filter->cache = filter->status = 0;

		if (mode == 0x1) {
			/* 0xf860 sequences are complete after two codepoints */
			for (i = 0; i < 5; i++) {
				if (c1 == code_tbl_m[i][2] && c == code_tbl_m[i][3]) {
					s1 = code_tbl_m[i][0];
					break;
				}
			}

			if (s1 > 0) {
				c1 = s1/94+0x21;
				c2 = s1-94*(c1-0x21)+0x21;
				SJIS_ENCODE(c1, c2, s1, s2);
				CK((*filter->output_function)(s1, filter->data));
				CK((*filter->output_function)(s2, filter->data));
			} else {
				CK(mbfl_filt_conv_illegal_output(0xf860, filter));
				CK(mbfl_filt_conv_illegal_output(c1, filter));
				CK(mbfl_filt_conv_illegal_output(c, filter));
			}
		} else if (mode == 0x2) {
			/* NOTE(review): if no row matches in this branch or the next one,
			 * the buffered codepoints and `c` are dropped without any output
			 * or error report -- confirm whether that is intended */
			for (i = 0; i < 3; i++) {
				if (c1 == code_tbl_m[i+5][2] && c == code_tbl_m[i+5][3]) {
					filter->cache = c | 0x20000;
					filter->status = 4;
					break;
				}
			}
		} else if (mode == 0x4) {
			for (i = 0; i < 4; i++) {
				if (c1 == code_tbl_m[i+8][2] && c == code_tbl_m[i+8][3]) {
					filter->cache = c | 0x40000;
					filter->status = 4;
					break;
				}
			}
		}
		break;

	case 4:
		/* `c` is the third codepoint after the hint; c1 is the second one */
		s1 = 0;
		c1 = filter->cache & 0xffff;
		mode = (filter->cache & 0xf0000) >> 16;

		filter->cache = 0;
		filter->status = 0;

		if (mode == 0x2) {
			/* 0xf861 sequences are complete after three codepoints */
			for (i = 0; i < 3; i++) {
				if (c1 == code_tbl_m[i+5][3] && c == code_tbl_m[i+5][4]) {
					s1 = code_tbl_m[i+5][0];
					break;
				}
			}

			if (s1 > 0) {
				c1 = s1/94+0x21;
				c2 = s1-94*(c1-0x21)+0x21;
				SJIS_ENCODE(c1, c2, s1, s2);
				CK((*filter->output_function)(s1, filter->data));
				CK((*filter->output_function)(s2, filter->data));
			} else {
				CK(mbfl_filt_conv_illegal_output(0xf861, filter));
				/* Only the second codepoint is still held in c1; the first
				 * is recovered from the first table row whose column [3]
				 * equals c1 */
				for (i = 0; i < 3; i++) {
					if (c1 == code_tbl_m[i+5][3]) {
						CK(mbfl_filt_conv_illegal_output(code_tbl_m[i+5][2], filter));
						break;
					}
				}
				CK(mbfl_filt_conv_illegal_output(c1, filter));
				CK(mbfl_filt_conv_illegal_output(c, filter));
			}
		} else if (mode == 0x4) {
			/* NOTE(review): if no row matches here, the buffered codepoints
			 * and `c` are dropped without any output or error report */
			for (i = 0; i < 4; i++) {
				if (c1 == code_tbl_m[i+8][3] && c == code_tbl_m[i+8][4]) {
					filter->cache = c | 0x40000;
					filter->status = 5;
					break;
				}
			}
		}
		break;

	case 5:
		/* `c` is the fourth codepoint after the hint; c1 is the third one */
		s1 = 0;
		c1 = filter->cache & 0xffff;
		mode = (filter->cache & 0xf0000) >> 16;

		filter->cache = filter->status = 0;

		if (mode == 0x4) {
			for (i = 0; i < 4; i++) {
				if (c1 == code_tbl_m[i+8][4] && c == code_tbl_m[i+8][5]) {
					s1 = code_tbl_m[i+8][0];
					break;
				}
			}

			if (s1 > 0) {
				c1 = s1/94+0x21;
				c2 = s1-94*(c1-0x21)+0x21;
				SJIS_ENCODE(c1, c2, s1, s2);
				CK((*filter->output_function)(s1, filter->data));
				CK((*filter->output_function)(s2, filter->data));
			} else {
				CK(mbfl_filt_conv_illegal_output(0xf862, filter));
				/* Only the third codepoint is still held in c1; the first two
				 * are recovered from the first table row whose column [4]
				 * equals c1 */
				for (i = 0; i < 4; i++) {
					if (c1 == code_tbl_m[i+8][4]) {
						CK(mbfl_filt_conv_illegal_output( code_tbl_m[i+8][2], filter));
						CK(mbfl_filt_conv_illegal_output( code_tbl_m[i+8][3], filter));
						break;
					}
				}
				CK(mbfl_filt_conv_illegal_output(c1, filter));
				CK(mbfl_filt_conv_illegal_output(c, filter));
			}
		}
		break;

		EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
5461 
/* End-of-input handler for the SJIS-Mac encoder. If a codepoint from
 * s_form_tbl is still buffered (status 1), its stand-alone fallback encoding
 * is written out. The filter state is then cleared and the downstream flush
 * function, if there is one, is called and its result returned. */
static int mbfl_filt_conv_wchar_sjis_mac_flush(mbfl_convert_filter *filter)
{
	if (filter->status == 1 && filter->cache > 0) {
		int pending = filter->cache;
		int fallback = 0;
		int idx = 0;

		/* Find the buffered codepoint's fallback encoding */
		while (idx < s_form_tbl_len) {
			if (pending == s_form_tbl[idx]) {
				fallback = s_form_sjis_fallback_tbl[idx];
				break;
			}
			idx++;
		}

		if (fallback > 0) {
			/* Written as two bytes, high byte first */
			CK((*filter->output_function)((fallback >> 8) & 0xff, filter->data));
			CK((*filter->output_function)(fallback & 0xff, filter->data));
		}
	}

	filter->status = 0;
	filter->cache = 0;

	if (filter->flush_function == NULL) {
		return 0;
	}
	return (*filter->flush_function)(filter->data);
}
5487 
/* Buffer-based SJIS-Mac decoder.
 *
 * Reads bytes from *in (length *in_len) and writes up to `bufsize` codepoints
 * into `buf`. On return *in and *in_len describe the unconsumed remainder of
 * the input; the return value is the number of codepoints written.
 * A character is either decoded completely or not consumed at all: when the
 * output buffer lacks room for a multi-codepoint result, the input pointer is
 * moved back to the start of that character before returning.
 * `state` is not used by this function. */
static size_t mb_sjismac_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
{
	/* A single SJIS-Mac kuten code can convert to up to 5 Unicode codepoints, oh my! */
	ZEND_ASSERT(bufsize >= 5);

	unsigned char *p = *in, *e = p + *in_len;
	uint32_t *out = buf, *limit = buf + bufsize;

	while (p < e && out < limit) {
		unsigned char c = *p++;

		if (c <= 0x80 || c == 0xA0) {
			if (c == 0x5C) {
				/* byte 0x5C decodes to U+00A5 YEN SIGN */
				*out++ = 0xA5;
			} else if (c == 0x80) {
				/* byte 0x80 decodes to U+005C REVERSE SOLIDUS */
				*out++ = 0x5C;
			} else {
				*out++ = c;
			}
		} else if (c >= 0xA1 && c <= 0xDF) {
			/* Single-byte kana */
			*out++ = 0xFEC0 + c;
		} else if (c <= 0xED) {
			/* First byte of a 2-byte character */
			if (p == e) {
				/* Input ends in the middle of a character */
				*out++ = MBFL_BAD_INPUT;
				break;
			}
			unsigned char c2 = *p++;
			/* w is the linear index of the character; an invalid second byte
			 * makes it larger than any of the ranges tested below */
			uint32_t w = sjis_decode_tbl1[c] + sjis_decode_tbl2[c2];

			if (w <= 0x89) {
				/* Characters which this encoding maps differently from the
				 * generic jisx0208_ucs_table lookup further below */
				if (w == 0x1C) {
					*out++ = 0x2014; /* EM DASH */
					continue;
				} else if (w == 0x1F) {
					*out++ = 0xFF3C; /* FULLWIDTH REVERSE SOLIDUS */
					continue;
				} else if (w == 0x20) {
					*out++ = 0x301C; /* WAVE DASH */
					continue;
				} else if (w == 0x21) {
					*out++ = 0x2016; /* DOUBLE VERTICAL LINE */
					continue;
				} else if (w == 0x3C) {
					*out++ = 0x2212; /* MINUS SIGN */
					continue;
				} else if (w == 0x50) {
					*out++ = 0xA2; /* CENT SIGN */
					continue;
				} else if (w == 0x51) {
					*out++ = 0xA3; /* POUND SIGN */
					continue;
				} else if (w == 0x89) {
					*out++ = 0xAC; /* NOT SIGN */
					continue;
				}
			} else {
				/* The range checks in front of each loop skip table scans
				 * which cannot match */
				if (w >= 0x2F0 && w <= 0x3A3) {
					/* code_tbl rows: [0]..[1] is an inclusive index range,
					 * [2] is the codepoint for the first index of the range */
					for (int i = 0; i < 7; i++) {
						if (w >= code_tbl[i][0] && w <= code_tbl[i][1]) {
							*out++ = w - code_tbl[i][0] + code_tbl[i][2];
							goto next_iteration;
						}
					}
				}

				if (w >= 0x340 && w <= 0x523) {
					/* Characters which decode to a sequence of n codepoints,
					 * stored in columns [1]..[n] of the code_tbl_m row; the
					 * value in column [1] selects n */
					for (int i = 0; i < code_tbl_m_len; i++) {
						if (w == code_tbl_m[i][0]) {
							int n = 5;
							if (code_tbl_m[i][1] == 0xF860) {
								n = 3;
							} else if (code_tbl_m[i][1] == 0xF861) {
								n = 4;
							}
							if ((limit - out) < n) {
								/* Not enough room: un-read both bytes and stop */
								p -= 2;
								goto finished;
							}
							for (int j = 1; j <= n; j++) {
								*out++ = code_tbl_m[i][j];
							}
							goto next_iteration;
						}
					}
				}

				if (w >= 0x3AC && w <= 0x20A5) {
					/* code_map[i] covers the inclusive index range
					 * code_ofst_tbl[i][0]..[1]; a zero entry means "unmapped" */
					for (int i = 0; i < 8; i++) {
						if (w >= code_ofst_tbl[i][0] && w <= code_ofst_tbl[i][1]) {
							uint32_t w2 = code_map[i][w - code_ofst_tbl[i][0]];
							if (!w2) {
								*out++ = MBFL_BAD_INPUT;
								goto next_iteration;
							}
							/* Room for two codepoints is required even when
							 * only one ends up being written */
							if ((limit - out) < 2) {
								p -= 2;
								goto finished;
							}
							*out++ = w2;
							/* Some characters get an extra trailing codepoint */
							if (w >= 0x43E && w <= 0x441) {
								*out++ = 0xF87A;
							} else if (w == 0x3B1 || w == 0x3B7) {
								*out++ = 0xF87F;
							} else if (w == 0x4B8 || w == 0x4B9 || w == 0x4C4) {
								*out++ = 0x20DD;
							} else if (w == 0x1ED9 || w == 0x1EDA || w == 0x1EE8 || w == 0x1EF3 || (w >= 0x1EF5 && w <= 0x1EFB) || w == 0x1F05 || w == 0x1F06 || w == 0x1F18 || (w >= 0x1FF2 && w <= 0x20A5)) {
								*out++ = 0xF87E;
							}
							goto next_iteration;
						}
					}
				}
			}

			/* No SJIS-Mac specific mapping: use the generic table */
			if (w < jisx0208_ucs_table_size) {
				w = jisx0208_ucs_table[w];
				if (!w)
					w = MBFL_BAD_INPUT;
				*out++ = w;
			} else {
				*out++ = MBFL_BAD_INPUT;
			}
		} else if (c == 0xFD) {
			/* U+00A9 COPYRIGHT SIGN */
			*out++ = 0xA9;
		} else if (c == 0xFE) {
			/* U+2122 TRADE MARK SIGN */
			*out++ = 0x2122;
		} else if (c == 0xFF) {
			/* Two codepoints: U+2026 HORIZONTAL ELLIPSIS followed by the
			 * private-use codepoint U+F87F */
			if ((limit - out) < 2) {
				/* Not enough room: un-read the byte and stop */
				p--;
				break;
			}
			*out++ = 0x2026;
			*out++ = 0xF87F;
		} else {
			*out++ = MBFL_BAD_INPUT;
		}
next_iteration: ;
	}

finished:
	*in_len = e - p;
	*in = p;
	return out - buf;
}
5632 
process_s_form(uint32_t w,uint32_t w2,unsigned int * s)5633 static bool process_s_form(uint32_t w, uint32_t w2, unsigned int *s)
5634 {
5635 	if (w2 == 0xF87A) {
5636 		for (int i = 0; i < 4; i++) {
5637 			if (w == s_form_tbl[i+34+3+3]) {
5638 				*s = s_form_sjis_tbl[i+34+3+3];
5639 				return true;
5640 			}
5641 		}
5642 	} else if (w2 == 0x20DD) {
5643 		for (int i = 0; i < 3; i++) {
5644 			if (w == s_form_tbl[i+34+3]) {
5645 				*s = s_form_sjis_tbl[i+34+3];
5646 				return true;
5647 			}
5648 		}
5649 	} else if (w2 == 0xF87F) {
5650 		for (int i = 0; i < 3; i++) {
5651 			if (w == s_form_tbl[i+34]) {
5652 				*s = s_form_sjis_tbl[i+34];
5653 				return true;
5654 			}
5655 		}
5656 	} else if (w2 == 0xF87E) {
5657 		for (int i = 0; i < 34; i++) {
5658 			if (w == s_form_tbl[i]) {
5659 				*s = s_form_sjis_tbl[i];
5660 				return true;
5661 			}
5662 		}
5663 	}
5664 
5665 	return false;
5666 }
5667 
/* For codepoints F860-F862, which are treated specially in MacJapanese.
 * NOTE(review): the values equal the number of codepoints mb_sjismac_to_wchar
 * writes for a sequence starting with F860, F861 and F862 respectively (the
 * hint itself included); presumably indexed by (hint - 0xF860) -- confirm at
 * the use site. */
static int transcoding_hint_cp_width[3] = { 3, 4, 5 };
5670 
/* Convert `len` codepoints starting at `in` to MacJapanese (SJIS-mac) bytes,
 * appending them to `buf`.
 *
 * `end` tells whether this is the last batch of input. Some codepoints can only
 * be converted after looking at the codepoint(s) which follow them; if the
 * batch runs out in the middle of such a sequence and `end` is false, the
 * progress made so far is saved in `buf->state` and picked up on the next call.
 *
 * Layout of `buf->state` as written/read by this function:
 *   bits  0-15: the pending codepoint `w`
 *   bits 16-23: `index`, next column of the `code_tbl_m` row to be matched
 *   bits 24-31: `i`, the `code_tbl_m` row being matched
 * The upper 16 bits are only non-zero while in the middle of a 'transcoding
 * hint' sequence (codepoints 0xF860-0xF862). */
static void mb_wchar_to_sjismac(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	/* Presumably reserves room for at least one output byte per input codepoint;
	 * 2-byte outputs below ask for extra room themselves */
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	uint32_t w;

	if (buf->state) {
		/* Resume a sequence which was cut off at the end of the previous batch */
		w = buf->state & 0xFFFF;
		/* NOTE(review): a transcoding hint match in row 0 of `code_tbl_m` would leave
		 * bits 24-31 zero and so would not be recognized here -- confirm that no
		 * hint sequence lives in row 0 */
		if (buf->state & 0xFF000000L) {
			goto resume_transcoding_hint;
		} else {
			buf->state = 0;
			goto process_codepoint;
		}
	}

	while (len--) {
		w = *in++;
process_codepoint: ;
		unsigned int s = 0;

		/* First try the regular Unicode -> JIS tables, with a few overrides which
		 * are specific to this encoding */
		if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) {
			if (w == 0x5C) {
				s = 0x80;
			} else if (w == 0xA9) {
				s = 0xFD;
			} else {
				s = ucs_a1_jis_table[w - ucs_a1_jis_table_min];
			}
		} else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) {
			if (w == 0x2122) {
				s = 0xFE;
			} else if (w == 0x2014) {
				s = 0x213D;
			} else if (w == 0x2116) {
				s = 0x2C1D;
			} else {
				s = ucs_a2_jis_table[w - ucs_a2_jis_table_min];
			}
		} else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) {
			s = ucs_i_jis_table[w - ucs_i_jis_table_min];
		} else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) {
			s = ucs_r_jis_table[w - ucs_r_jis_table_min];
		}

		if (w >= 0x2000) {
			/* Codepoints listed in `s_form_tbl` may combine with the codepoint which
			 * follows them (see process_s_form) */
			for (int i = 0; i < s_form_tbl_len; i++) {
				if (w == s_form_tbl[i]) {
					if (!len) {
						/* No following codepoint is available in this batch */
						if (end) {
							/* Nothing more is coming; use the fallback for `w` by itself */
							s = s_form_sjis_fallback_tbl[i];
							if (s) {
								MB_CONVERT_BUF_ENSURE(buf, out, limit, 2);
								out = mb_convert_buf_add2(out, (s >> 8) & 0xFF, s & 0xFF);
							} else {
								MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjismac);
							}
						} else {
							/* Reprocess `w` when the next batch arrives */
							buf->state = w;
						}
						MB_CONVERT_BUF_STORE(buf, out, limit);
						return;
					}
					uint32_t w2 = *in++;
					len--;

					if (!process_s_form(w, w2, &s)) {
						/* `w2` does not combine with `w`; put it back and use the fallback
						 * for `w` by itself */
						in--; len++;

						for (int i = 0; i < s_form_tbl_len; i++) {
							if (w == s_form_tbl[i]) {
								s = s_form_sjis_fallback_tbl[i];
								break;
							}
						}
					}

					/* NOTE(review): at end of input (above) a zero fallback is reported via
					 * MB_CONVERT_ERROR, but here a zero `s` is written out as a single 0x00
					 * byte -- confirm whether the fallback table can hold 0 for `w` */
					if (s <= 0xFF) {
						out = mb_convert_buf_add(out, s);
					} else {
						MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
						out = mb_convert_buf_add2(out, (s >> 8) & 0xFF, s & 0xFF);
					}

					goto next_iteration;
				}
			}

			if (w == 0xF860 || w == 0xF861 || w == 0xF862) {
				/* Apple 'transcoding hint' codepoints (from private use area) */
				if (!len) {
					if (end) {
						MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjismac);
					} else {
						/* Only the hint itself has been seen; upper state bits stay zero */
						buf->state = w;
					}
					MB_CONVERT_BUF_STORE(buf, out, limit);
					return;
				}

				uint32_t w2 = *in++;
				len--;

				/* Row layout of `code_tbl_m` as used here: column 0 is the output code,
				 * column 1 the hint codepoint, columns 2 and up the codepoints expected
				 * to follow the hint */
				for (int i = 0; i < code_tbl_m_len; i++) {
					if (w == code_tbl_m[i][1] && w2 == code_tbl_m[i][2]) {
						/* This might be a valid transcoding hint sequence */
						int index = 3;

						if (buf->state) {
							/* This label is jumped to from the top of the function; `i` and
							 * `index` are restored from the saved state before being used */
resume_transcoding_hint:
							i = buf->state >> 24;
							index = (buf->state >> 16) & 0xFF;
							buf->state = 0;
						}

						/* Last column of the row which must match the input */
						int expected = transcoding_hint_cp_width[w - 0xF860];

						while (index <= expected) {
							if (!len) {
								if (end) {
									/* Sequence was cut short; report every codepoint which was
									 * already matched */
									for (int j = 1; j < index; j++) {
										MB_CONVERT_ERROR(buf, out, limit, code_tbl_m[i][j], mb_wchar_to_sjismac);
									}
								} else {
									/* Remember row, column and hint codepoint for the next call */
									buf->state = (i << 24) | (index << 16) | (w & 0xFFFF);
								}
								MB_CONVERT_BUF_STORE(buf, out, limit);
								return;
							}

							w2 = *in++;
							len--;

							if (w2 != code_tbl_m[i][index]) {
								/* Didn't match */
								for (int j = 1; j < index; j++) {
									MB_CONVERT_ERROR(buf, out, limit, code_tbl_m[i][j], mb_wchar_to_sjismac);
								}
								MB_CONVERT_ERROR(buf, out, limit, w2, mb_wchar_to_sjismac);
								MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
								goto next_iteration;
							}

							index++;
						}

						/* Successful match, emit SJIS-mac bytes */
						s = code_tbl_m[i][0];
						unsigned int c1 = (s / 94) + 0x21, c2 = (s % 94) + 0x21, s1, s2;
						SJIS_ENCODE(c1, c2, s1, s2);
						MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
						out = mb_convert_buf_add2(out, s1, s2);
						goto next_iteration;
					}
				}

				/* No valid transcoding hint sequence found */
				/* Put `w2` back so that it is converted on its own */
				in--; len++;
				MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjismac);
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
				continue;
			}
		}

		if (!s) {
			/* The regular tables had nothing; try mappings specific to this encoding */
			if (w == 0xA0) {
				s = 0xA0;
			} else if (w == 0xA5) { /* YEN SIGN */
				/* Unicode has codepoint 0xFFE5 for a fullwidth Yen sign;
				 * convert codepoint 0xA5 to halfwidth Yen sign */
				s = 0x5C; /* HALFWIDTH YEN SIGN */
			} else if (w == 0xFF3C) {	/* FULLWIDTH REVERSE SOLIDUS */
				s = 0x2140;
			} else {
				/* Each of the three lookups below yields a (row * 94) + column value,
				 * which is converted to a 2-byte code with 0x21 added to each part */

				/* Ranges of codepoints which map onto a contiguous run of codes */
				for (int i = 0; i < wchar2sjis_mac_r_tbl_len; i++) {
					if (w >= wchar2sjis_mac_r_tbl[i][0] && w <= wchar2sjis_mac_r_tbl[i][1]) {
						s = w - wchar2sjis_mac_r_tbl[i][0] + wchar2sjis_mac_r_tbl[i][2];
						s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
						goto found_kuten_code;
					}
				}

				/* Ranges of codepoints with a per-codepoint mapping array; a zero entry
				 * means no mapping, so keep searching */
				for (int i = 0; i < wchar2sjis_mac_r_map_len; i++) {
					if (w >= wchar2sjis_mac_r_map[i][0] && w <= wchar2sjis_mac_r_map[i][1]) {
						s = wchar2sjis_mac_code_map[i][w - wchar2sjis_mac_r_map[i][0]];
						if (s) {
							s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
							goto found_kuten_code;
						}
					}
				}

				/* Individual codepoint -> code pairs */
				for (int i = 0; i < wchar2sjis_mac_wchar_tbl_len; i++) {
					if (w == wchar2sjis_mac_wchar_tbl[i][0]) {
						s = wchar2sjis_mac_wchar_tbl[i][1];
						s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
						goto found_kuten_code;
					}
				}
			}
		}

found_kuten_code:
		/* `s` of zero is only acceptable for codepoint zero itself; values of 0x8080
		 * and up are rejected as well */
		if ((!s && w) || s >= 0x8080) {
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjismac);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		} else if (s <= 0xFF) {
			/* Single-byte output */
			out = mb_convert_buf_add(out, s);
		} else {
			/* Double-byte output */
			unsigned int c1 = (s >> 8) & 0xFF, c2 = s & 0xFF, s1, s2;
			SJIS_ENCODE(c1, c2, s1, s2);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			out = mb_convert_buf_add2(out, s1, s2);
		}

next_iteration: ;
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
5893 
mbfilter_sjis_emoji_docomo2unicode(int s,int * snd)5894 int mbfilter_sjis_emoji_docomo2unicode(int s, int *snd)
5895 {
5896 	/* All three mobile vendors had emoji for numbers on a telephone keypad
5897 	 * Unicode doesn't have those, but it has a combining character which puts
5898 	 * a 'keypad button' around the following character, making it look like
5899 	 * a key on a telephone or keyboard. That combining char is codepoint 0x20E3. */
5900 	if (s >= mb_tbl_code2uni_docomo1_min && s <= mb_tbl_code2uni_docomo1_max) {
5901 		if ((s >= DOCOMO_KEYPAD(1) && s <= DOCOMO_KEYPAD(9)) || s == DOCOMO_KEYPAD(0) || s == DOCOMO_KEYPAD_HASH) {
5902 			EMIT_KEYPAD_EMOJI(convert_emoji_cp(mb_tbl_code2uni_docomo1[s - mb_tbl_code2uni_docomo1_min]));
5903 		} else {
5904 			*snd = 0;
5905 			return convert_emoji_cp(mb_tbl_code2uni_docomo1[s - mb_tbl_code2uni_docomo1_min]);
5906 		}
5907 	}
5908 	return 0;
5909 }
5910 
mbfilter_sjis_emoji_sb2unicode(int s,int * snd)5911 int mbfilter_sjis_emoji_sb2unicode(int s, int *snd)
5912 {
5913 	if (s >= mb_tbl_code2uni_sb1_min && s <= mb_tbl_code2uni_sb1_max) {
5914 		if (s == 0x2817 || (s >= 0x2823 && s <= 0x282C)) {
5915 			EMIT_KEYPAD_EMOJI(mb_tbl_code2uni_sb1[s - mb_tbl_code2uni_sb1_min]);
5916 		} else {
5917 			*snd = 0;
5918 			return convert_emoji_cp(mb_tbl_code2uni_sb1[s - mb_tbl_code2uni_sb1_min]);
5919 		}
5920 	} else if (s >= mb_tbl_code2uni_sb2_min && s <= mb_tbl_code2uni_sb2_max) {
5921 		*snd = 0;
5922 		return convert_emoji_cp(mb_tbl_code2uni_sb2[s - mb_tbl_code2uni_sb2_min]);
5923 	} else if (s >= mb_tbl_code2uni_sb3_min && s <= mb_tbl_code2uni_sb3_max) {
5924 		if (s >= 0x2B02 && s <= 0x2B0B) {
5925 			EMIT_FLAG_EMOJI(nflags_sb[s - 0x2B02]);
5926 		} else {
5927 			*snd = 0;
5928 			return convert_emoji_cp(mb_tbl_code2uni_sb3[s - mb_tbl_code2uni_sb3_min]);
5929 		}
5930 	}
5931 	return 0;
5932 }
5933 
/* Try to convert Unicode codepoint `c` to a DoCoMo emoji code.
 *
 * Returns 1 if an emoji code was stored in `*s1`, and 0 if not. '#' and the
 * digits '0'-'9' are not converted right away; they are held in
 * `filter->cache` (with `filter->status` set to 1) until the next codepoint
 * shows whether they are part of a keypad emoji. */
int mbfilter_unicode2sjis_emoji_docomo(int c, int *s1, mbfl_convert_filter *filter)
{
	/* When converting SJIS-Mobile to Unicode, keypad symbol emoji become a
	 * sequence of 2 codepoints: the plain character, then combining character
	 * 0x20E3 which adds the 'key' image around it
	 *
	 * In this direction, look for such sequences and turn them back into a
	 * single emoji */
	if (filter->status == 1) {
		int held = filter->cache;
		filter->cache = filter->status = 0;

		if (c == 0x20E3) {
			switch (held) {
			case '#':
				*s1 = 0x2964;
				break;
			case '0':
				*s1 = 0x296F;
				break;
			default: /* Held character was '1'-'9' */
				*s1 = 0x2966 + (held - '1');
				break;
			}
			return 1;
		}

		/* The held character was not followed by the combining character, so
		 * pass it through as is, then go on to process the current codepoint
		 * (Single-byte ASCII characters are valid in Shift-JIS...) */
		CK((*filter->output_function)(held, filter->data));
	}

	if (c == '#' || (c >= '0' && c <= '9')) {
		/* Possible start of a keypad emoji; wait for the next codepoint */
		filter->cache = c;
		filter->status = 1;
		return 0;
	}

	if (c == 0xA9) { /* Copyright sign */
		*s1 = 0x29B5;
		return 1;
	}
	if (c == 0x00AE) { /* Registered sign */
		*s1 = 0x29BA;
		return 1;
	}

	int pos;
	if (c >= mb_tbl_uni_docomo2code2_min && c <= mb_tbl_uni_docomo2code2_max) {
		pos = mbfl_bisec_srch2(c, mb_tbl_uni_docomo2code2_key, mb_tbl_uni_docomo2code2_len);
		if (pos >= 0) {
			*s1 = mb_tbl_uni_docomo2code2_value[pos];
			return 1;
		}
	} else if (c >= mb_tbl_uni_docomo2code3_min && c <= mb_tbl_uni_docomo2code3_max) {
		pos = mbfl_bisec_srch2(c - 0x10000, mb_tbl_uni_docomo2code3_key, mb_tbl_uni_docomo2code3_len);
		if (pos >= 0) {
			*s1 = mb_tbl_uni_docomo2code3_value[pos];
			return 1;
		}
	} else if (c >= mb_tbl_uni_docomo2code5_min && c <= mb_tbl_uni_docomo2code5_max) {
		pos = mbfl_bisec_srch2(c - 0xF0000, mb_tbl_uni_docomo2code5_key, mb_tbl_uni_docomo2code5_len);
		if (pos >= 0) {
			*s1 = mb_tbl_uni_docomo2code5_val[pos];
			return 1;
		}
	}

	return 0;
}
5996 
/* Try to convert Unicode codepoint `c` to a KDDI emoji code.
 *
 * Returns 1 if an emoji code was stored in `*s1`, and 0 if not. Codepoints
 * which may start a 2-codepoint emoji are not converted right away; they are
 * held in `filter->cache` until the next codepoint arrives, with
 * `filter->status` set to 1 for '#' and '0'-'9' (keypad emoji) or 2 for
 * Regional Indicators (national flag emoji). */
int mbfilter_unicode2sjis_emoji_kddi_sjis(int c, int *s1, mbfl_convert_filter *filter)
{
	switch (filter->status) {
	case 1: {
		/* Holding '#' or a digit; combining character 0x20E3 completes a keypad emoji */
		int held = filter->cache;
		filter->cache = filter->status = 0;
		if (c == 0x20E3) {
			*s1 = (held == '#') ? 0x25BC
				: (held == '0') ? 0x2830
				: 0x27a6 + (held - '1'); /* Held character was '1'-'9' */
			return 1;
		}
		/* Not a keypad emoji; pass the held character through as is */
		CK((*filter->output_function)(held, filter->data));
		break;
	}

	case 2: {
		/* Holding a Regional Indicator; a second one may complete a flag emoji */
		int held = filter->cache;
		filter->cache = filter->status = 0;
		if (c >= NFLAGS('B') && c <= NFLAGS('U')) { /* B for GB, U for RU */
			for (int n = 0; n < 10; n++) {
				if (held == NFLAGS(nflags_s[n][0]) && c == NFLAGS(nflags_s[n][1])) {
					*s1 = nflags_code_kddi[n];
					return 1;
				}
			}
		}

		/* If none of the KDDI national flag emoji matched, then we have no way
		 * to convert the previous codepoint... */
		mbfl_filt_conv_illegal_output(held, filter);
		break;
	}
	}

	if (c == '#' || (c >= '0' && c <= '9')) {
		/* Possible start of a keypad emoji */
		filter->cache = c;
		filter->status = 1;
		return 0;
	}
	if (c >= NFLAGS('C') && c <= NFLAGS('U')) { /* C for CN, U for US */
		/* Possible start of a national flag emoji */
		filter->cache = c;
		filter->status = 2;
		return 0;
	}

	if (c == 0xA9) { /* Copyright sign */
		*s1 = 0x27DC;
		return 1;
	}
	if (c == 0xAE) { /* Registered sign */
		*s1 = 0x27DD;
		return 1;
	}

	int pos;
	if (c >= mb_tbl_uni_kddi2code2_min && c <= mb_tbl_uni_kddi2code2_max) {
		pos = mbfl_bisec_srch2(c, mb_tbl_uni_kddi2code2_key, mb_tbl_uni_kddi2code2_len);
		if (pos >= 0) {
			*s1 = mb_tbl_uni_kddi2code2_value[pos];
			return 1;
		}
	} else if (c >= mb_tbl_uni_kddi2code3_min && c <= mb_tbl_uni_kddi2code3_max) {
		pos = mbfl_bisec_srch2(c - 0x10000, mb_tbl_uni_kddi2code3_key, mb_tbl_uni_kddi2code3_len);
		if (pos >= 0) {
			*s1 = mb_tbl_uni_kddi2code3_value[pos];
			return 1;
		}
	} else if (c >= mb_tbl_uni_kddi2code5_min && c <= mb_tbl_uni_kddi2code5_max) {
		pos = mbfl_bisec_srch2(c - 0xF0000, mb_tbl_uni_kddi2code5_key, mb_tbl_uni_kddi2code5_len);
		if (pos >= 0) {
			*s1 = mb_tbl_uni_kddi2code5_val[pos];
			return 1;
		}
	}

	return 0;
}
6068 
/* Try to convert Unicode codepoint `c` to a SoftBank emoji code.
 *
 * Returns 1 if an emoji code was stored in `*s1`, and 0 if not. Codepoints
 * which may start a 2-codepoint emoji are not converted right away; they are
 * held in `filter->cache` until the next codepoint arrives, with
 * `filter->status` set to 1 for '#' and '0'-'9' (keypad emoji) or 2 for
 * Regional Indicators (national flag emoji).
 *
 * If passing a held character through to the output function fails, the
 * failure is propagated via CK(), the same as in the DoCoMo and KDDI variants
 * of this function. */
int mbfilter_unicode2sjis_emoji_sb(int c, int *s1, mbfl_convert_filter *filter)
{
	if (filter->status == 1) {
		/* Holding '#' or a digit; combining character 0x20E3 completes a keypad emoji */
		int c1 = filter->cache;
		filter->cache = filter->status = 0;
		if (c == 0x20E3) {
			if (c1 == '#') {
				*s1 = 0x2817;
			} else if (c1 == '0') {
				*s1 = 0x282c;
			} else { /* Previous character was '1'-'9' */
				*s1 = 0x2823 + (c1 - '1');
			}
			return 1;
		} else {
			/* Not a keypad emoji; pass the held character through as is, and
			 * don't silently drop a failure reported by the output function */
			CK((*filter->output_function)(c1, filter->data));
		}
	} else if (filter->status == 2) {
		/* Holding a Regional Indicator; a second one may complete a flag emoji */
		int c1 = filter->cache;
		filter->cache = filter->status = 0;
		if (c >= NFLAGS('B') && c <= NFLAGS('U')) { /* B for GB, U for RU */
			for (int i = 0; i < 10; i++) {
				if (c1 == NFLAGS(nflags_s[i][0]) && c == NFLAGS(nflags_s[i][1])) {
					*s1 = nflags_code_sb[i];
					return 1;
				}
			}
		}

		/* If none of the SoftBank national flag emoji matched, then we have no way
		 * to convert the previous codepoint... */
		mbfl_filt_conv_illegal_output(c1, filter);
	}

	if (c == '#' || (c >= '0' && c <= '9')) {
		/* Possible start of a keypad emoji */
		filter->status = 1;
		filter->cache = c;
		return 0;
	} else if (c >= NFLAGS('C') && c <= NFLAGS('U')) { /* C for CN, U for US */
		/* Possible start of a national flag emoji */
		filter->status = 2;
		filter->cache = c;
		return 0;
	}

	if (c == 0xA9) { /* Copyright sign */
		*s1 = 0x2855;
		return 1;
	} else if (c == 0xAE) { /* Registered sign */
		*s1 = 0x2856;
		return 1;
	} else if (c >= mb_tbl_uni_sb2code2_min && c <= mb_tbl_uni_sb2code2_max) {
		int i = mbfl_bisec_srch2(c, mb_tbl_uni_sb2code2_key, mb_tbl_uni_sb2code2_len);
		if (i >= 0) {
			*s1 = mb_tbl_uni_sb2code2_value[i];
			return 1;
		}
	} else if (c >= mb_tbl_uni_sb2code3_min && c <= mb_tbl_uni_sb2code3_max) {
		int i = mbfl_bisec_srch2(c - 0x10000, mb_tbl_uni_sb2code3_key, mb_tbl_uni_sb2code3_len);
		if (i >= 0) {
			*s1 = mb_tbl_uni_sb2code3_value[i];
			return 1;
		}
	} else if (c >= mb_tbl_uni_sb2code5_min && c <= mb_tbl_uni_sb2code5_max) {
		int i = mbfl_bisec_srch2(c - 0xF0000, mb_tbl_uni_sb2code5_key, mb_tbl_uni_sb2code5_len);
		if (i >= 0) {
			*s1 = mb_tbl_uni_sb2code5_val[i];
			return 1;
		}
	}
	return 0;
}
6140 
/* Byte-at-a-time decoder for the mobile variants of Shift-JIS (DoCoMo, KDDI,
 * SoftBank; the variant is taken from `filter->from`).
 *
 * `c` is the next input byte; decoded codepoints are passed to
 * `filter->output_function`. `filter->status` tracks progress:
 *   0: at the start of a character
 *   1: first byte of a double-byte character is held in `filter->cache`
 *   2: ESC was seen (SoftBank only)
 *   3: ESC $ was seen
 *   4: ESC $ <page letter> was seen; the letter is held in `filter->cache` */
static int mbfl_filt_conv_sjis_mobile_wchar(int c, mbfl_convert_filter *filter)
{
	int c1, s, s1, s2, w, snd = 0;

	switch (filter->status) {
	case 0:
		if (c >= 0 && c < 0x80) { /* ASCII */
			if (c == 0x1B && filter->from == &mbfl_encoding_sjis_sb) {
				/* ESC; escape sequences were used on older SoftBank phones for emoji */
				filter->cache = c;
				filter->status = 2;
			} else {
				CK((*filter->output_function)(c, filter->data));
			}
		} else if (c > 0xA0 && c < 0xE0) { /* Kana */
			CK((*filter->output_function)(0xFEC0 + c, filter->data));
		} else if (c > 0x80 && c < 0xFD && c != 0xA0) { /* Kanji, first byte */
			filter->status = 1;
			filter->cache = c;
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 1: /* Kanji, second byte */
		filter->status = 0;
		c1 = filter->cache;

		if (c < 0x40 || c > 0xFC || c == 0x7F) {
			/* Not a valid second byte */
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			break;
		}

		SJIS_DECODE(c1, c, s1, s2);
		s = ((s1 - 0x21) * 94) + s2 - 0x21;

		/* A handful of characters do not use the mapping from the tables */
		switch (s) {
		case 31:
			w = 0xFF3C; /* FULLWIDTH REVERSE SOLIDUS */
			break;
		case 32:
			w = 0xFF5E; /* FULLWIDTH TILDE */
			break;
		case 33:
			w = 0x2225; /* PARALLEL TO */
			break;
		case 60:
			w = 0xFF0D; /* FULLWIDTH HYPHEN-MINUS */
			break;
		case 80:
			w = 0xFFE0; /* FULLWIDTH CENT SIGN */
			break;
		case 81:
			w = 0xFFE1; /* FULLWIDTH POUND SIGN */
			break;
		case 137:
			w = 0xFFE2; /* FULLWIDTH NOT SIGN */
			break;
		default:
			w = 0;
			break;
		}

		if (w == 0) {
			if (s >= cp932ext1_ucs_table_min && s < cp932ext1_ucs_table_max) { /* vendor ext1 (13ku) */
				w = cp932ext1_ucs_table[s - cp932ext1_ucs_table_min];
			} else if (s >= 0 && s < jisx0208_ucs_table_size) { /* X 0208 */
				w = jisx0208_ucs_table[s];
			} else if (s >= cp932ext2_ucs_table_min && s < cp932ext2_ucs_table_max) { /* vendor ext2 (89ku - 92ku) */
				w = cp932ext2_ucs_table[s - cp932ext2_ucs_table_min];
			}

			/* Emoji; within the vendor's emoji range, the result of the emoji
			 * lookup replaces anything found above */
			if (filter->from == &mbfl_encoding_sjis_docomo && s >= mb_tbl_code2uni_docomo1_min && s <= mb_tbl_code2uni_docomo1_max) {
				w = mbfilter_sjis_emoji_docomo2unicode(s, &snd);
			} else if (filter->from == &mbfl_encoding_sjis_kddi && s >= mb_tbl_code2uni_kddi1_min && s <= mb_tbl_code2uni_kddi2_max) {
				w = mbfilter_sjis_emoji_kddi2unicode(s, &snd);
			} else if (filter->from == &mbfl_encoding_sjis_sb && s >= mb_tbl_code2uni_sb1_min && s <= mb_tbl_code2uni_sb3_max) {
				w = mbfilter_sjis_emoji_sb2unicode(s, &snd);
			}

			/* `snd` is still zero if no emoji lookup was done. For an emoji made of
			 * two codepoints, it holds the one which must be emitted first */
			if (snd > 0) {
				CK((*filter->output_function)(snd, filter->data));
			}

			if (w == 0) {
				if (s >= cp932ext3_ucs_table_min && s < cp932ext3_ucs_table_max) { /* vendor ext3 (115ku - 119ku) */
					w = cp932ext3_ucs_table[s - cp932ext3_ucs_table_min];
				} else if (s >= (94*94) && s < (114*94)) { /* user (95ku - 114ku) */
					w = s - (94*94) + 0xe000;
				}
			}
		}

		if (w <= 0) {
			w = MBFL_BAD_INPUT;
		}
		CK((*filter->output_function)(w, filter->data));
		break;

	/* ESC: Softbank Emoji */
	case 2:
		if (c == '$') {
			filter->cache = c;
			filter->status++;
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			filter->status = filter->cache = 0;
		}
		break;

	/* ESC $: Softbank Emoji */
	case 3:
		if ((c >= 'E' && c <= 'G') || (c >= 'O' && c <= 'Q')) {
			filter->cache = c;
			filter->status++;
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			filter->status = filter->cache = 0;
		}
		break;

	/* ESC $ [GEFOPQ]: Softbank Emoji */
	case 4:
		c1 = filter->cache;
		if (c == 0xF) { /* Terminate sequence of emoji */
			filter->status = filter->cache = 0;
			return 0;
		}

		{
			/* Each page letter selects a base value for the emoji code, and has
			 * its own highest acceptable byte; zero means 'unknown letter' */
			int page = 0, last = 0;

			switch (c1) {
			case 'G':
				page = 0x91;
				last = 0x7a;
				break;
			case 'E':
				page = 0x8D;
				last = 0x7A;
				break;
			case 'F':
				page = 0x8E;
				last = 0x7A;
				break;
			case 'O':
				page = 0x92;
				last = 0x6D;
				break;
			case 'P':
				page = 0x95;
				last = 0x6C;
				break;
			case 'Q':
				page = 0x96;
				last = 0x5E;
				break;
			}

			if (page == 0 || c < 0x21 || c > last) {
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
				filter->status = filter->cache = 0;
				return 0;
			}

			s1 = (page - 0x21) * 94;
			w = mbfilter_sjis_emoji_sb2unicode(s1 + c - 0x21, &snd);
			if (w > 0) {
				if (snd > 0) {
					CK((*filter->output_function)(snd, filter->data));
				}
				CK((*filter->output_function)(w, filter->data));
			} else {
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
				filter->status = filter->cache = 0;
			}
		}
	}

	return 0;
}
6295 
/* Codepoint-at-a-time encoder for the mobile variants of Shift-JIS (DoCoMo,
 * KDDI, SoftBank; the variant is taken from `filter->to`).
 *
 * `c` is the next Unicode codepoint; the resulting bytes are passed to
 * `filter->output_function`. If the vendor-specific emoji function decided to
 * hold `c` back (it sets `filter->status` when doing so), nothing is emitted
 * for this call. */
static int mbfl_filt_conv_wchar_sjis_mobile(int c, mbfl_convert_filter *filter)
{
	int c1, c2, s1 = 0, s2 = 0;

	/* Regular Unicode -> JIS tables */
	if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
		s1 = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
	} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
		s1 = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
	} else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
		s1 = ucs_i_jis_table[c - ucs_i_jis_table_min];
	} else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
		s1 = ucs_r_jis_table[c - ucs_r_jis_table_min];
	} else if (c >= 0xE000 && c < (0xE000 + 20*94)) {
		/* Private User Area (95ku - 114ku) */
		int offset = c - 0xE000;
		c1 = (offset / 94) + 0x7F;
		c2 = (offset % 94) + 0x21;
		s1 = (c1 << 8) | c2;
		/* Non-zero `s2` keeps this value from being rejected below */
		s2 = 1;
	}

	if (s1 <= 0) {
		switch (c) {
		case 0xA5: /* YEN SIGN */
			s1 = 0x216F; /* FULLWIDTH YEN SIGN */
			break;
		case 0xFF3c: /* FULLWIDTH REVERSE SOLIDUS */
			s1 = 0x2140;
			break;
		case 0x2225: /* PARALLEL TO */
			s1 = 0x2142;
			break;
		case 0xFF0D: /* FULLWIDTH HYPHEN-MINUS */
			s1 = 0x215D;
			break;
		case 0xFFE0: /* FULLWIDTH CENT SIGN */
			s1 = 0x2171;
			break;
		case 0xFFE1: /* FULLWIDTH POUND SIGN */
			s1 = 0x2172;
			break;
		case 0xFFE2: /* FULLWIDTH NOT SIGN */
			s1 = 0x224C;
			break;
		}
	}

	if ((s1 <= 0) || (s1 >= 0x8080 && s2 == 0)) {	/* not found or X 0212 */
		int n;

		s1 = -1;

		/* CP932 vendor ext1 (13ku) */
		for (n = 0; n < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; n++) {
			if (c == cp932ext1_ucs_table[n]) {
				s1 = (((n / 94) + 0x2D) << 8) + (n % 94) + 0x21;
				break;
			}
		}

		if (s1 <= 0) {
			/* CP932 vendor ext2 (115ku - 119ku) */
			for (n = 0; n < cp932ext2_ucs_table_max - cp932ext2_ucs_table_min; n++) {
				if (c == cp932ext2_ucs_table[n]) {
					s1 = (((n / 94) + 0x79) << 8) + (n % 94) + 0x21;
					break;
				}
			}
		}

		/* Codepoint zero is converted to a zero byte */
		if (c == 0) {
			s1 = 0;
		}
	}

	/* Vendor-specific emoji; on success `s1` is overwritten with the emoji code
	 * as a (row * 94) + column value, which is converted to a 2-byte code here */
	int is_emoji = 0;
	if (filter->to == &mbfl_encoding_sjis_docomo) {
		is_emoji = mbfilter_unicode2sjis_emoji_docomo(c, &s1, filter);
	} else if (filter->to == &mbfl_encoding_sjis_kddi) {
		is_emoji = mbfilter_unicode2sjis_emoji_kddi_sjis(c, &s1, filter);
	} else if (filter->to == &mbfl_encoding_sjis_sb) {
		is_emoji = mbfilter_unicode2sjis_emoji_sb(c, &s1, filter);
	}
	if (is_emoji) {
		s1 = (((s1 / 94) + 0x21) << 8) | ((s1 % 94) + 0x21);
	}

	if (filter->status) {
		/* `c` is being held until the next codepoint is seen */
		return 0;
	}

	if (s1 < 0) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
	} else if (s1 < 0x100) { /* Latin/Kana */
		CK((*filter->output_function)(s1, filter->data));
	} else { /* Kanji */
		c1 = (s1 >> 8) & 0xff;
		c2 = s1 & 0xff;
		SJIS_ENCODE(c1, c2, s1, s2);
		CK((*filter->output_function)(s1, filter->data));
		CK((*filter->output_function)(s2, filter->data));
	}

	return 0;
}
6387 
/* Called at the end of the input; deals with a codepoint which is still held
 * in `filter->cache`, then invokes `filter->flush_function` (if there is one).
 * Always returns 0, unless CK() bails out because the output function failed. */
int mbfl_filt_conv_sjis_mobile_flush(mbfl_convert_filter *filter)
{
	int held = filter->cache;

	switch (filter->status) {
	case 1:
		/* '#' or a digit which might have started a keypad emoji; since nothing
		 * followed it, emit it as an ordinary character */
		if (held == '#' || (held >= '0' && held <= '9')) {
			filter->cache = filter->status = 0;
			CK((*filter->output_function)(held, filter->data));
		}
		break;

	case 2:
		/* First of a pair of Regional Indicator codepoints came at the end of a string */
		filter->cache = filter->status = 0;
		mbfl_filt_conv_illegal_output(held, filter);
		break;
	}

	if (filter->flush_function) {
		(*filter->flush_function)(filter->data);
	}

	return 0;
}
6406 
6407 static const unsigned short sjis_mobile_decode_tbl1[] = {
6408 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFFFF, 0, 188, 376, 564, 752, 940, 1128, 1316, 1504, 1692, 1880, 2068, 2256, 2444, 2632, 2820, 3008, 3196, 3384, 3572, 3760, 3948, 4136, 4324, 4512, 4700, 4888, 5076, 5264, 5452, 5640, 0xFFFF, -6016, -5828, -5640, -5452, -5264, -5076, -4888, -4700, -4512, -4324, -4136, -3948, -3760, -3572, -3384, -3196, -3008, -2820, -2632, -2444, -2256, -2068, -1880, -1692, -1504, -1316, -1128, -940, -752, -564, -376, -188, 0, 188, 376, 564, 752, 940, 1128, 1316, 1504, 1692, 1880, 2068, 2256, 2444, 2632, 2820, 3008, 3196, 3384, 3572, 3760, 3948, 4136, 4324, 4512, 4700, 4888, 5076, 5264, 5452, 5640, 5828, 6016, 6204, 6392, 6580, 6768, 6956, 7144, 7332, 7520, 7708, 7896, 8084, 8272, 8460, 8648, 8836, 9024, 9212, 9400, 9588, 9776, 9964, 10152, 10340, 10528, 10716, 10904, 11092, 0xFFFF, 0xFFFF, 0xFFFF
6409 };
6410 
mb_sjis_docomo_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)6411 static size_t mb_sjis_docomo_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
6412 {
6413 	unsigned char *p = *in, *e = p + *in_len;
6414 	/* Leave one extra space available in output buffer, since some iterations of
6415 	 * main loop (below) may emit two wchars */
6416 	uint32_t *out = buf, *limit = buf + bufsize - 1;
6417 
6418 	while (p < e && out < limit) {
6419 		unsigned char c = *p++;
6420 
6421 		if (c <= 0x7F) {
6422 			*out++ = c;
6423 		} else if (c >= 0xA1 && c <= 0xDF) {
6424 			/* Kana */
6425 			*out++ = 0xFEC0 + c;
6426 		} else {
6427 			/* Kanji */
6428 			if (p == e) {
6429 				*out++ = MBFL_BAD_INPUT;
6430 				break;
6431 			}
6432 			unsigned char c2 = *p++;
6433 			uint32_t w = sjis_mobile_decode_tbl1[c] + sjis_decode_tbl2[c2];
6434 
6435 			if (w <= 137) {
6436 				if (w == 31) {
6437 					*out++ = 0xFF3C; /* FULLWIDTH REVERSE SOLIDUS */
6438 					continue;
6439 				} else if (w == 32) {
6440 					*out++ =  0xFF5E; /* FULLWIDTH TILDE */
6441 					continue;
6442 				} else if (w == 33) {
6443 					*out++ =  0x2225; /* PARALLEL TO */
6444 					continue;
6445 				} else if (w == 60) {
6446 					*out++ =  0xFF0D; /* FULLWIDTH HYPHEN-MINUS */
6447 					continue;
6448 				} else if (w == 80) {
6449 					*out++ =  0xFFE0; /* FULLWIDTH CENT SIGN */
6450 					continue;
6451 				} else if (w == 81) {
6452 					*out++ =  0xFFE1; /* FULLWIDTH POUND SIGN */
6453 					continue;
6454 				} else if (w == 137) {
6455 					*out++ =  0xFFE2; /* FULLWIDTH NOT SIGN */
6456 					continue;
6457 				}
6458 			}
6459 
6460 			if (w >= mb_tbl_code2uni_docomo1_min && w <= mb_tbl_code2uni_docomo1_max) {
6461 				int snd = 0;
6462 				w = mbfilter_sjis_emoji_docomo2unicode(w, &snd);
6463 				if (snd) {
6464 					*out++ = snd;
6465 				}
6466 			} else if (w >= cp932ext1_ucs_table_min && w < cp932ext1_ucs_table_max) {
6467 				w = cp932ext1_ucs_table[w - cp932ext1_ucs_table_min];
6468 			} else if (w < jisx0208_ucs_table_size) {
6469 				w = jisx0208_ucs_table[w];
6470 			} else if (w >= cp932ext2_ucs_table_min && w < cp932ext2_ucs_table_max) {
6471 				w = cp932ext2_ucs_table[w - cp932ext2_ucs_table_min];
6472 			} else if (w >= cp932ext3_ucs_table_min && w < cp932ext3_ucs_table_max) {
6473 				w = cp932ext3_ucs_table[w - cp932ext3_ucs_table_min];
6474 			} else if (w >= (94*94) && w < (114*94)) {
6475 				w = w - (94*94) + 0xE000;
6476 			} else {
6477 				if (c == 0x80 || c == 0xA0 || c >= 0xFD) {
6478 					p--;
6479 				}
6480 				*out++ = MBFL_BAD_INPUT;
6481 				continue;
6482 			}
6483 
6484 			*out++ = w ? w : MBFL_BAD_INPUT;
6485 		}
6486 	}
6487 
6488 	*in_len = e - p;
6489 	*in = p;
6490 	return out - buf;
6491 }
6492 
/* Encode `len` Unicode codepoints read from `in` as SJIS-Mobile (DoCoMo variant)
 * bytes, appending the result to `buf`.
 *
 * For each codepoint `w`, a candidate code `s` is looked up in this order:
 *   1. the ucs_*_jis tables, or the Private Use Area range starting at U+E000
 *   2. a few hard-coded symbol mappings
 *   3. a linear scan of the CP932 extension tables (ext1, then ext2)
 *   4. DoCoMo emoji handling: keypad sequences, the copyright/registered signs
 *      and the mb_tbl_uni_docomo2code* tables; a hit here replaces any `s`
 *      found by the earlier steps
 *
 * `buf->state` is set by this function only when a '#' or ASCII digit was the
 * last codepoint of a buffer and `end` was false; the codepoint is then
 * reprocessed at the start of the next call so that a following U+20E3 can
 * still be combined with it.
 *
 * `end` is true when no further input will follow this buffer. */
static void mb_wchar_to_sjis_docomo(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	/* Reserve one byte per input codepoint, plus one for a codepoint carried
	 * over in `buf->state`; two-byte outputs reserve extra space further down */
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len + (buf->state ? 1 : 0));

	uint32_t w;
	unsigned int s = 0;

	if (buf->state) {
		/* Continue what we were doing on the previous call */
		w = buf->state;
		buf->state = 0;
		goto reprocess_wchar;
	}

	while (len--) {
		w = *in++;
reprocess_wchar:
		/* `s` stays 0 until some table yields a code for `w` */
		s = 0;

		if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) {
			s = ucs_a1_jis_table[w - ucs_a1_jis_table_min];
		} else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) {
			s = ucs_a2_jis_table[w - ucs_a2_jis_table_min];
		} else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) {
			s = ucs_i_jis_table[w - ucs_i_jis_table_min];
		} else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) {
			s = ucs_r_jis_table[w - ucs_r_jis_table_min];
		} else if (w >= 0xE000 && w < (0xE000 + 20*94)) {
			/* Private User Area (95ku - 114ku) */
			s = w - 0xE000;
			/* 94 cells per row: high byte counts rows from 0x7F, low byte counts
			 * cells from 0x21 */
			s = (((s / 94) + 0x7F) << 8) | ((s % 94) + 0x21);
			goto process_emoji;
		}

		if (!s) {
			/* Symbols for which the tables above gave nothing */
			if (w == 0xA5) { /* YEN SIGN */
				s = 0x216F; /* FULLWIDTH YEN SIGN */
			} else if (w == 0xFF3C) { /* FULLWIDTH REVERSE SOLIDUS */
				s = 0x2140;
			} else if (w == 0x2225) { /* PARALLEL TO */
				s = 0x2142;
			} else if (w == 0xFF0D) { /* FULLWIDTH HYPHEN-MINUS */
				s = 0x215D;
			} else if (w == 0xFFE0) { /* FULLWIDTH CENT SIGN */
				s = 0x2171;
			} else if (w == 0xFFE1) { /* FULLWIDTH POUND SIGN */
				s = 0x2172;
			} else if (w == 0xFFE2) { /* FULLWIDTH NOT SIGN */
				s = 0x224C;
			}
		}

		/* Nothing found yet, or a value >= 0x8080 which is discarded here
		 * (NOTE(review): presumably those are JIS X 0212 codes - confirm against
		 * the table definitions). Fall back to scanning the CP932 extensions. */
		if (w && (!s || s >= 0x8080)) {
			s = 0;

			for (int i = 0; i < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; i++) {
				if (w == cp932ext1_ucs_table[i]) {
					s = (((i / 94) + 0x2D) << 8) + (i % 94) + 0x21;
					goto process_emoji;
				}
			}

			for (int i = 0; i < cp932ext2_ucs_table_max - cp932ext2_ucs_table_min; i++) {
				if (w == cp932ext2_ucs_table[i]) {
					s = (((i / 94) + 0x79) << 8) + (i % 94) + 0x21;
					goto process_emoji;
				}
			}
		}

process_emoji:
		/* When converting SJIS-Mobile to Unicode, we convert keypad symbol emoji
		 * to a sequence of 2 codepoints, one of which is a combining character which
		 * adds the 'key' image around the other
		 *
		 * In the other direction, look for such sequences and convert them to a
		 * single emoji */
		if (w == '#' || (w >= '0' && w <= '9')) {
			if (!len) {
				if (end) {
					/* No following codepoint will ever arrive; emit `w` using
					 * whatever `s` was found above */
					goto emit_output;
				} else {
					/* If we are at the end of the current buffer of codepoints, but another
					 * buffer is coming, then remember that we have to reprocess `w` */
					buf->state = w;
					break;
				}
			}
			/* Peek at the next codepoint; it is pushed back below if it is not
			 * U+20E3 */
			uint32_t w2 = *in++; len--;
			if (w2 == 0x20E3) {
				/* These are linear (ku*94)+ten values for the keypad emoji */
				if (w == '#') {
					s = 0x2964;
				} else if (w == '0') {
					s = 0x296F;
				} else { /* Previous character was '1'-'9' */
					s = 0x2966 + (w - '1');
				}
				/* Split the linear value into two bytes, each offset by 0x21 */
				s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
			} else {
				in--; len++;
			}
		} else if (w == 0xA9) { /* Copyright sign */
			s = (((0x29B5 / 94) + 0x21) << 8) | ((0x29B5 % 94) + 0x21);
		} else if (w == 0xAE) { /* Registered sign */
			s = (((0x29BA / 94) + 0x21) << 8) | ((0x29BA % 94) + 0x21);
		} else if (w >= mb_tbl_uni_docomo2code2_min && w <= mb_tbl_uni_docomo2code2_max) {
			/* A negative search result means `w` is not in the table; `s` is then
			 * left as it was */
			int i = mbfl_bisec_srch2(w, mb_tbl_uni_docomo2code2_key, mb_tbl_uni_docomo2code2_len);
			if (i >= 0) {
				s = mb_tbl_uni_docomo2code2_value[i];
				s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
			}
		} else if (w >= mb_tbl_uni_docomo2code3_min && w <= mb_tbl_uni_docomo2code3_max) {
			/* This table is searched with 0x10000 subtracted from the codepoint */
			int i = mbfl_bisec_srch2(w - 0x10000, mb_tbl_uni_docomo2code3_key, mb_tbl_uni_docomo2code3_len);
			if (i >= 0) {
				s = mb_tbl_uni_docomo2code3_value[i];
				s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
			}
		} else if (w >= mb_tbl_uni_docomo2code5_min && w <= mb_tbl_uni_docomo2code5_max) {
			/* This table is searched with 0xF0000 subtracted from the codepoint */
			int i = mbfl_bisec_srch2(w - 0xF0000, mb_tbl_uni_docomo2code5_key, mb_tbl_uni_docomo2code5_len);
			if (i >= 0) {
				s = mb_tbl_uni_docomo2code5_val[i];
				s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
			}
		}

emit_output:
		if (!s && w) {
			/* No mapping for a non-zero codepoint: report it, then make sure there
			 * is still one byte available for each remaining input codepoint */
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjis_docomo);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		} else if (s <= 0xFF) {
			/* Single-byte output (this also covers w == 0, which emits a zero byte) */
			out = mb_convert_buf_add(out, s);
		} else {
			/* Two-byte output: split `s` and run it through SJIS_ENCODE */
			unsigned int c1 = (s >> 8) & 0xFF, c2 = s & 0xFF, s1, s2;
			SJIS_ENCODE(c1, c2, s1, s2);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			out = mb_convert_buf_add2(out, s1, s2);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
6636 
mb_sjis_kddi_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)6637 static size_t mb_sjis_kddi_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
6638 {
6639 	unsigned char *p = *in, *e = p + *in_len;
6640 	uint32_t *out = buf, *limit = buf + bufsize - 1;
6641 
6642 	while (p < e && out < limit) {
6643 		unsigned char c = *p++;
6644 
6645 		if (c <= 0x7F) {
6646 			*out++ = c;
6647 		} else if (c >= 0xA1 && c <= 0xDF) {
6648 			/* Kana */
6649 			*out++ = 0xFEC0 + c;
6650 		} else {
6651 			/* Kanji */
6652 			if (p == e) {
6653 				*out++ = MBFL_BAD_INPUT;
6654 				break;
6655 			}
6656 			unsigned char c2 = *p++;
6657 			uint32_t w = sjis_mobile_decode_tbl1[c] + sjis_decode_tbl2[c2];
6658 
6659 			if (w <= 137) {
6660 				if (w == 31) {
6661 					*out++ = 0xFF3C; /* FULLWIDTH REVERSE SOLIDUS */
6662 					continue;
6663 				} else if (w == 32) {
6664 					*out++ = 0xFF5E; /* FULLWIDTH TILDE */
6665 					continue;
6666 				} else if (w == 33) {
6667 					*out++ = 0x2225; /* PARALLEL TO */
6668 					continue;
6669 				} else if (w == 60) {
6670 					*out++ = 0xFF0D; /* FULLWIDTH HYPHEN-MINUS */
6671 					continue;
6672 				} else if (w == 80) {
6673 					*out++ = 0xFFE0; /* FULLWIDTH CENT SIGN */
6674 					continue;
6675 				} else if (w == 81) {
6676 					*out++ = 0xFFE1; /* FULLWIDTH POUND SIGN */
6677 					continue;
6678 				} else if (w == 137) {
6679 					*out++ = 0xFFE2; /* FULLWIDTH NOT SIGN */
6680 					continue;
6681 				}
6682 			}
6683 
6684 			if (w >= mb_tbl_code2uni_kddi1_min && w <= mb_tbl_code2uni_kddi2_max) {
6685 				int snd = 0;
6686 				w = mbfilter_sjis_emoji_kddi2unicode(w, &snd);
6687 				if (!w) {
6688 					w = sjis_mobile_decode_tbl1[c] + sjis_decode_tbl2[c2];
6689 					if (w >= (94*94) && w < (114*94)) {
6690 						w = w - (94*94) + 0xE000;
6691 					}
6692 				} else if (snd) {
6693 					*out++ = snd;
6694 				}
6695 			} else if (w >= cp932ext1_ucs_table_min && w < cp932ext1_ucs_table_max) {
6696 				w = cp932ext1_ucs_table[w - cp932ext1_ucs_table_min];
6697 			} else if (w < jisx0208_ucs_table_size) {
6698 				w = jisx0208_ucs_table[w];
6699 			} else if (w >= cp932ext2_ucs_table_min && w < cp932ext2_ucs_table_max) {
6700 				w = cp932ext2_ucs_table[w - cp932ext2_ucs_table_min];
6701 			} else if (w >= cp932ext3_ucs_table_min && w < cp932ext3_ucs_table_max) {
6702 				w = cp932ext3_ucs_table[w - cp932ext3_ucs_table_min];
6703 			} else if (w >= (94*94) && w < (114*94)) {
6704 				w = w - (94*94) + 0xE000;
6705 			} else {
6706 				if (c == 0x80 || c == 0xA0 || c >= 0xFD) {
6707 					p--;
6708 				}
6709 				*out++ = MBFL_BAD_INPUT;
6710 				continue;
6711 			}
6712 
6713 			*out++ = w ? w : MBFL_BAD_INPUT;
6714 		}
6715 	}
6716 
6717 	*in_len = e - p;
6718 	*in = p;
6719 	return out - buf;
6720 }
6721 
/* Encode `len` Unicode codepoints read from `in` as SJIS-Mobile (KDDI variant)
 * bytes, appending the result to `buf`.
 *
 * For each codepoint `w`, a candidate code `s` is looked up in the ucs_*_jis
 * tables (or derived from the Private Use Area range starting at U+E000), then
 * in a few hard-coded symbol mappings, then by scanning the CP932 extension
 * tables. After that, KDDI emoji handling runs and may replace `s`:
 *   - '#' or a digit followed by U+20E3 becomes a keypad emoji
 *   - a pair of Regional Indicator codepoints listed in `nflags_s` becomes a
 *     flag emoji taken from `nflags_code_kddi`
 *   - the copyright/registered signs and the mb_tbl_uni_kddi2code* tables
 *
 * `buf->state` is set by this function when the first codepoint of a possible
 * two-codepoint sequence ('#', a digit, or a Regional Indicator) was the last
 * one in a buffer and `end` was false; it is reprocessed on the next call.
 *
 * `end` is true when no further input will follow this buffer. */
static void mb_wchar_to_sjis_kddi(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	/* Reserve one byte per input codepoint, plus one for a codepoint carried
	 * over in `buf->state`; two-byte outputs reserve extra space further down */
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len + (buf->state ? 1 : 0));

	uint32_t w;
	unsigned int s = 0;

	if (buf->state) {
		/* Resume with the codepoint left pending by the previous call */
		w = buf->state;
		buf->state = 0;
		goto reprocess_wchar;
	}

	while (len--) {
		w = *in++;
reprocess_wchar:
		/* `s` stays 0 until some table yields a code for `w` */
		s = 0;

		if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) {
			s = ucs_a1_jis_table[w - ucs_a1_jis_table_min];
		} else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) {
			s = ucs_a2_jis_table[w - ucs_a2_jis_table_min];
		} else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) {
			s = ucs_i_jis_table[w - ucs_i_jis_table_min];
		} else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) {
			s = ucs_r_jis_table[w - ucs_r_jis_table_min];
		} else if (w >= 0xE000 && w < (0xE000 + 20*94)) {
			/* Private User Area (95ku - 114ku) */
			s = w - 0xE000;
			/* 94 cells per row: high byte counts rows from 0x7F, low byte counts
			 * cells from 0x21 */
			s = (((s / 94) + 0x7F) << 8) | ((s % 94) + 0x21);
			goto process_emoji;
		}

		if (!s) {
			/* Symbols for which the tables above gave nothing */
			if (w == 0xA5) { /* YEN SIGN */
				s = 0x216F; /* FULLWIDTH YEN SIGN */
			} else if (w == 0xFF3c) { /* FULLWIDTH REVERSE SOLIDUS */
				s = 0x2140;
			} else if (w == 0x2225) { /* PARALLEL TO */
				s = 0x2142;
			} else if (w == 0xFF0D) { /* FULLWIDTH HYPHEN-MINUS */
				s = 0x215D;
			} else if (w == 0xFFE0) { /* FULLWIDTH CENT SIGN */
				s = 0x2171;
			} else if (w == 0xFFE1) { /* FULLWIDTH POUND SIGN */
				s = 0x2172;
			} else if (w == 0xFFE2) { /* FULLWIDTH NOT SIGN */
				s = 0x224C;
			}
		}

		/* Nothing found yet, or a value >= 0x8080 which is discarded here
		 * (NOTE(review): presumably those are JIS X 0212 codes - confirm against
		 * the table definitions). Fall back to scanning the CP932 extensions. */
		if (w && (!s || s >= 0x8080)) {
			s = 0;

			for (int i = 0; i < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; i++) {
				if (w == cp932ext1_ucs_table[i]) {
					s = (((i / 94) + 0x2D) << 8) + (i % 94) + 0x21;
					goto process_emoji;
				}
			}

			for (int i = 0; i < cp932ext2_ucs_table_max - cp932ext2_ucs_table_min; i++) {
				if (w == cp932ext2_ucs_table[i]) {
					s = (((i / 94) + 0x79) << 8) + (i % 94) + 0x21;
					goto process_emoji;
				}
			}
		}

process_emoji:
		/* Keypad emoji: '#' or a digit followed by U+20E3 */
		if (w == '#' || (w >= '0' && w <= '9')) {
			if (!len) {
				if (end) {
					/* No following codepoint will ever arrive; emit `w` using
					 * whatever `s` was found above */
					goto emit_output;
				} else {
					/* If we are at the end of the current buffer of codepoints, but another
					 * buffer is coming, then remember that we have to reprocess `w` */
					buf->state = w;
					break;
				}
			}
			/* Peek at the next codepoint; it is pushed back below if it is not
			 * U+20E3 */
			uint32_t w2 = *in++; len--;
			if (w2 == 0x20E3) {
				/* Linear values for the keypad emoji, split into two bytes below */
				if (w == '#') {
					s = 0x25BC;
				} else if (w == '0') {
					s = 0x2830;
				} else { /* Previous character was '1'-'9' */
					s = 0x27A6 + (w - '1');
				}
				s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
			} else {
				in--; len++;
			}
		} else if (w >= NFLAGS('C') && w <= NFLAGS('U')) { /* C for CN, U for US */
			/* Possible first half of a flag emoji (two Regional Indicators) */
			if (!len) {
				if (end) {
					/* A lone Regional Indicator at the very end is an error */
					MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjis_kddi);
				} else {
					/* Reprocess `w` when this function is called again with another buffer
					 * of wchars */
					buf->state = w;
				}
				break;
			}
			uint32_t w2 = *in++; len--;
			if (w2 >= NFLAGS('B') && w2 <= NFLAGS('U')) { /* B for GB, U for RU */
				/* Look for the pair among the country codes in `nflags_s` */
				for (int i = 0; i < 10; i++) {
					if (w == NFLAGS(nflags_s[i][0]) && w2 == NFLAGS(nflags_s[i][1])) {
						s = nflags_code_kddi[i];
						s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
						goto emit_output;
					}
				}
			}
			/* Not a known flag: push `w2` back so it is processed on its own, and
			 * report `w` as unconvertible */
			in--; len++;
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjis_kddi);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
			continue;
		} else if (w == 0xA9) { /* Copyright sign */
			s = (((0x27DC / 94) + 0x21) << 8) | ((0x27DC % 94) + 0x21);
		} else if (w == 0xAE) { /* Registered sign */
			s = (((0x27DD / 94) + 0x21) << 8) | ((0x27DD % 94) + 0x21);
		} else if (w >= mb_tbl_uni_kddi2code2_min && w <= mb_tbl_uni_kddi2code2_max) {
			/* A negative search result means `w` is not in the table; `s` is then
			 * left as it was */
			int i = mbfl_bisec_srch2(w, mb_tbl_uni_kddi2code2_key, mb_tbl_uni_kddi2code2_len);
			if (i >= 0) {
				s = mb_tbl_uni_kddi2code2_value[i];
				s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
			}
		} else if (w >= mb_tbl_uni_kddi2code3_min && w <= mb_tbl_uni_kddi2code3_max) {
			/* This table is searched with 0x10000 subtracted from the codepoint */
			int i = mbfl_bisec_srch2(w - 0x10000, mb_tbl_uni_kddi2code3_key, mb_tbl_uni_kddi2code3_len);
			if (i >= 0) {
				s = mb_tbl_uni_kddi2code3_value[i];
				s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
			}
		} else if (w >= mb_tbl_uni_kddi2code5_min && w <= mb_tbl_uni_kddi2code5_max) {
			/* This table is searched with 0xF0000 subtracted from the codepoint */
			int i = mbfl_bisec_srch2(w - 0xF0000, mb_tbl_uni_kddi2code5_key, mb_tbl_uni_kddi2code5_len);
			if (i >= 0) {
				s = mb_tbl_uni_kddi2code5_val[i];
				s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
			}
		}

emit_output:
		if (!s && w) {
			/* No mapping for a non-zero codepoint: report it, then make sure there
			 * is still one byte available for each remaining input codepoint */
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjis_kddi);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		} else if (s <= 0xFF) {
			/* Single-byte output (this also covers w == 0, which emits a zero byte) */
			out = mb_convert_buf_add(out, s);
		} else {
			/* Two-byte output: split `s` and run it through SJIS_ENCODE */
			unsigned int c1 = (s >> 8) & 0xFF, c2 = s & 0xFF, s1, s2;
			SJIS_ENCODE(c1, c2, s1, s2);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			out = mb_convert_buf_add2(out, s1, s2);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
6883 
/* Decode SJIS-Mobile (SoftBank variant) bytes into Unicode codepoints.
 *
 * Reads from `*in` (`*in_len` bytes available) and writes at most `bufsize`
 * codepoints to `buf`. On return, `*in` and `*in_len` describe the input which
 * has not been consumed yet. Returns the number of codepoints written.
 *
 * Besides ordinary one- and two-byte characters, this variant accepts escape
 * sequences of the form ESC '$' X, where X is one of E, F, G, O, P, Q. Each
 * byte after such a sequence selects one emoji, until a 0x0F byte (or an
 * invalid byte) ends the run. While inside a run, `*state` holds X so that
 * decoding can resume in the middle of the run on the next call; otherwise
 * `*state` is 0. */
static size_t mb_sjis_sb_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
{
	unsigned char *p = *in, *e = p + *in_len;
	/* One pass of either loop can store two codepoints, so stop one slot before
	 * the real end of `buf` */
	uint32_t *out = buf, *limit = buf + bufsize - 1;

	if (*state) {
		/* The previous call ended inside an emoji run; pick up where it left off */
		goto softbank_emoji_escapes;
	}

	while (p < e && out < limit) {
		unsigned char c = *p++;

		if (c == 0x1B) {
			/* Escape sequence */
			/* Expect '$' and then one more byte; anything else (including running
			 * out of input) is reported as a single bad-input marker */
			if (p == e || *p++ != '$' || p == e) {
				*out++ = MBFL_BAD_INPUT;
				continue;
			}
			unsigned char c2 = *p++;
			/* Only the letters E-G and O-Q are accepted */
			if ((c2 < 'E' || c2 > 'G') && (c2 < 'O' || c2 > 'Q')) {
				*out++ = MBFL_BAD_INPUT;
				continue;
			}
			/* Escape sequence was valid, next should be a series of specially
			 * encoded Softbank emoji */
			*state = c2;

softbank_emoji_escapes:
			while (p < e && out < limit) {
				c = *p++;
				if (c == 0xF) {
					/* End of the emoji run */
					*state = 0;
					break;
				}
				/* `s` is the base index for the current escape letter; each letter
				 * accepts its own range of bytes starting at 0x21 */
				unsigned int s = 0;
				if (*state == 'G' && c >= 0x21 && c <= 0x7A) {
					s = (0x91 - 0x21) * 94;
				} else if (*state == 'E' && c >= 0x21 && c <= 0x7A) {
					s = (0x8D - 0x21) * 94;
				} else if (*state == 'F' && c >= 0x21 && c <= 0x7A) {
					s = (0x8E - 0x21) * 94;
				} else if (*state == 'O' && c >= 0x21 && c <= 0x6D) {
					s = (0x92 - 0x21) * 94;
				} else if (*state == 'P' && c >= 0x21 && c <= 0x6C) {
					s = (0x95 - 0x21) * 94;
				} else if (*state == 'Q' && c >= 0x21 && c <= 0x5E) {
					s = (0x96 - 0x21) * 94;
				} else {
					/* Byte is out of range for this escape letter: report it and
					 * leave emoji mode */
					*out++ = MBFL_BAD_INPUT;
					*state = 0;
					break;
				}

				int snd = 0;
				uint32_t w = mbfilter_sjis_emoji_sb2unicode(s + c - 0x21, &snd);
				if (w) {
					/* Some emoji decode to two codepoints; `snd` goes out first */
					if (snd) {
						*out++ = snd;
					}
					*out++ = w;
				} else {
					/* No emoji at this position: report it and leave emoji mode */
					*out++ = MBFL_BAD_INPUT;
					*state = 0;
					break;
				}
			}
		} else if (c <= 0x7F) {
			/* Single byte, passed through unchanged */
			*out++ = c;
		} else if (c >= 0xA1 && c <= 0xDF) {
			/* Kana */
			*out++ = 0xFEC0 + c;
		} else {
			/* Kanji */
			if (p == e) {
				/* Input ends after the first byte of a two-byte character */
				*out++ = MBFL_BAD_INPUT;
				break;
			}
			unsigned char c2 = *p++;
			/* Combine both bytes into a single table index */
			uint32_t w = sjis_mobile_decode_tbl1[c] + sjis_decode_tbl2[c2];

			/* A handful of cells have fixed mappings which take priority over
			 * every table below */
			if (w <= 137) {
				if (w == 31) {
					*out++ = 0xFF3C; /* FULLWIDTH REVERSE SOLIDUS */
					continue;
				} else if (w == 32) {
					*out++ = 0xFF5E; /* FULLWIDTH TILDE */
					continue;
				} else if (w == 33) {
					*out++ = 0x2225; /* PARALLEL TO */
					continue;
				} else if (w == 60) {
					*out++ = 0xFF0D; /* FULLWIDTH HYPHEN-MINUS */
					continue;
				} else if (w == 80) {
					*out++ = 0xFFE0; /* FULLWIDTH CENT SIGN */
					continue;
				} else if (w == 81) {
					*out++ = 0xFFE1; /* FULLWIDTH POUND SIGN */
					continue;
				} else if (w == 137) {
					*out++ = 0xFFE2; /* FULLWIDTH NOT SIGN */
					continue;
				}
			}

			if (w >= mb_tbl_code2uni_sb1_min && w <= mb_tbl_code2uni_sb3_max) {
				/* SoftBank emoji area */
				int snd = 0;
				w = mbfilter_sjis_emoji_sb2unicode(w, &snd);
				if (!w) {
					/* Not an emoji after all; recompute the table index and try the
					 * CP932 ext3 table and the user-defined range instead */
					w = sjis_mobile_decode_tbl1[c] + sjis_decode_tbl2[c2];
					if (w >= cp932ext3_ucs_table_min && w < cp932ext3_ucs_table_max) {
						w = cp932ext3_ucs_table[w - cp932ext3_ucs_table_min];
					} else if (w >= (94*94) && w < (114*94)) {
						w = w - (94*94) + 0xE000;
					}
				} else if (snd) {
					/* Two-codepoint emoji: `snd` goes out before `w` */
					*out++ = snd;
				}
			} else if (w >= cp932ext1_ucs_table_min && w < cp932ext1_ucs_table_max) {
				w = cp932ext1_ucs_table[w - cp932ext1_ucs_table_min];
			} else if (w < jisx0208_ucs_table_size) {
				w = jisx0208_ucs_table[w];
			} else if (w >= cp932ext2_ucs_table_min && w < cp932ext2_ucs_table_max) {
				w = cp932ext2_ucs_table[w - cp932ext2_ucs_table_min];
			} else if (w >= cp932ext3_ucs_table_min && w < cp932ext3_ucs_table_max) {
				w = cp932ext3_ucs_table[w - cp932ext3_ucs_table_min];
			} else if (w >= (94*94) && w < (114*94)) {
				/* User-defined range mapped onto the Private Use Area */
				w = w - (94*94) + 0xE000;
			} else {
				/* For these lead bytes, only the lead byte itself is reported as
				 * bad; the following byte is given back to the input */
				if (c == 0x80 || c == 0xA0 || c >= 0xFD) {
					p--;
				}
				*out++ = MBFL_BAD_INPUT;
				continue;
			}

			/* A zero table entry means the cell has no mapping */
			*out++ = w ? w : MBFL_BAD_INPUT;
		}
	}

	*in_len = e - p;
	*in = p;
	return out - buf;
}
7028 
/* Encode `len` Unicode codepoints read from `in` as SJIS-Mobile (SoftBank
 * variant) bytes, appending the result to `buf`.
 *
 * For each codepoint `w`, a candidate code `s` is looked up in the ucs_*_jis
 * tables (or derived from the Private Use Area range starting at U+E000), then
 * in a few hard-coded symbol mappings, then by scanning the CP932 extension
 * tables. After that, SoftBank emoji handling runs and may replace `s`:
 *   - '#' or a digit followed by U+20E3 becomes a keypad emoji
 *   - a pair of Regional Indicator codepoints listed in `nflags_s` becomes a
 *     flag emoji taken from `nflags_code_sb`
 *   - the copyright/registered signs and the mb_tbl_uni_sb2code* tables
 *
 * `buf->state` is set by this function when the first codepoint of a possible
 * two-codepoint sequence ('#', a digit, or a Regional Indicator) was the last
 * one in a buffer and `end` was false; it is reprocessed on the next call.
 *
 * `end` is true when no further input will follow this buffer. */
static void mb_wchar_to_sjis_sb(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	/* Reserve one byte per input codepoint, plus one for a codepoint carried
	 * over in `buf->state`; two-byte outputs reserve extra space further down */
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len + (buf->state ? 1 : 0));

	uint32_t w;
	unsigned int s = 0;

	if (buf->state) {
		/* Resume with the codepoint left pending by the previous call */
		w = buf->state;
		buf->state = 0;
		goto reprocess_wchar;
	}

	while (len--) {
		w = *in++;
reprocess_wchar:
		/* `s` stays 0 until some table yields a code for `w` */
		s = 0;

		if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) {
			s = ucs_a1_jis_table[w - ucs_a1_jis_table_min];
		} else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) {
			s = ucs_a2_jis_table[w - ucs_a2_jis_table_min];
		} else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) {
			s = ucs_i_jis_table[w - ucs_i_jis_table_min];
		} else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) {
			s = ucs_r_jis_table[w - ucs_r_jis_table_min];
		} else if (w >= 0xE000 && w < (0xE000 + 20*94)) {
			/* Private User Area (95ku - 114ku) */
			s = w - 0xE000;
			/* 94 cells per row: high byte counts rows from 0x7F, low byte counts
			 * cells from 0x21 */
			s = (((s / 94) + 0x7F) << 8) | ((s % 94) + 0x21);
			goto process_emoji;
		}

		if (!s) {
			/* Symbols for which the tables above gave nothing */
			if (w == 0xA5) { /* YEN SIGN */
				s = 0x216F; /* FULLWIDTH YEN SIGN */
			} else if (w == 0xFF3C) { /* FULLWIDTH REVERSE SOLIDUS */
				s = 0x2140;
			} else if (w == 0x2225) { /* PARALLEL TO */
				s = 0x2142;
			} else if (w == 0xFF0D) { /* FULLWIDTH HYPHEN-MINUS */
				s = 0x215D;
			} else if (w == 0xFFE0) { /* FULLWIDTH CENT SIGN */
				s = 0x2171;
			} else if (w == 0xFFE1) { /* FULLWIDTH POUND SIGN */
				s = 0x2172;
			} else if (w == 0xFFE2) { /* FULLWIDTH NOT SIGN */
				s = 0x224C;
			}
		}

		/* Nothing found yet, or a value >= 0x8080 which is discarded here
		 * (NOTE(review): presumably those are JIS X 0212 codes - confirm against
		 * the table definitions). Fall back to scanning the CP932 extensions. */
		if (w && (!s || s >= 0x8080)) {
			s = 0;

			for (int i = 0; i < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; i++) {
				if (w == cp932ext1_ucs_table[i]) {
					s = (((i / 94) + 0x2D) << 8) + (i % 94) + 0x21;
					goto process_emoji;
				}
			}

			for (int i = 0; i < cp932ext2_ucs_table_max - cp932ext2_ucs_table_min; i++) {
				if (w == cp932ext2_ucs_table[i]) {
					s = (((i / 94) + 0x79) << 8) + (i % 94) + 0x21;
					goto process_emoji;
				}
			}
		}

process_emoji:
		/* Keypad emoji: '#' or a digit followed by U+20E3 */
		if (w == '#' || (w >= '0' && w <= '9')) {
			if (!len) {
				if (end) {
					/* No following codepoint will ever arrive; emit `w` using
					 * whatever `s` was found above */
					goto emit_output;
				} else {
					/* If we are at the end of the current buffer of codepoints, but another
					 * buffer is coming, then remember that we have to reprocess `w` */
					buf->state = w;
					break;
				}
			}
			/* Peek at the next codepoint; it is pushed back below if it is not
			 * U+20E3 */
			uint32_t w2 = *in++; len--;
			if (w2 == 0x20E3) {
				/* Linear values for the keypad emoji, split into two bytes below */
				if (w == '#') {
					s = 0x2817;
				} else if (w == '0') {
					s = 0x282c;
				} else { /* Previous character was '1'-'9' */
					s = 0x2823 + (w - '1');
				}
				s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
			} else {
				in--; len++;
			}
		} else if (w >= NFLAGS('C') && w <= NFLAGS('U')) { /* C for CN, U for US */
			/* Possible first half of a flag emoji (two Regional Indicators) */
			if (!len) {
				if (end) {
					/* A lone Regional Indicator at the very end is an error */
					MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjis_sb);
				} else {
					/* Reprocess `w` when this function is called again with
					 * another buffer of wchars */
					buf->state = w;
				}
				break;
			}
			uint32_t w2 = *in++; len--;
			if (w2 >= NFLAGS('B') && w2 <= NFLAGS('U')) { /* B for GB, U for RU */
				/* Look for the pair among the country codes in `nflags_s` */
				for (int i = 0; i < 10; i++) {
					if (w == NFLAGS(nflags_s[i][0]) && w2 == NFLAGS(nflags_s[i][1])) {
						s = nflags_code_sb[i];
						s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
						goto emit_output;
					}
				}
			}
			/* Not a known flag: push `w2` back so it is processed on its own, and
			 * report `w` as unconvertible */
			in--; len++;
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjis_sb);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
			continue;
		} else if (w == 0xA9) { /* Copyright sign */
			s = (((0x2855 / 94) + 0x21) << 8) | ((0x2855 % 94) + 0x21);
		} else if (w == 0xAE) { /* Registered sign */
			s = (((0x2856 / 94) + 0x21) << 8) | ((0x2856 % 94) + 0x21);
		} else if (w >= mb_tbl_uni_sb2code2_min && w <= mb_tbl_uni_sb2code2_max) {
			/* A negative search result means `w` is not in the table; `s` is then
			 * left as it was */
			int i = mbfl_bisec_srch2(w, mb_tbl_uni_sb2code2_key, mb_tbl_uni_sb2code2_len);
			if (i >= 0) {
				s = mb_tbl_uni_sb2code2_value[i];
				s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
			}
		} else if (w >= mb_tbl_uni_sb2code3_min && w <= mb_tbl_uni_sb2code3_max) {
			/* This table is searched with 0x10000 subtracted from the codepoint */
			int i = mbfl_bisec_srch2(w - 0x10000, mb_tbl_uni_sb2code3_key, mb_tbl_uni_sb2code3_len);
			if (i >= 0) {
				s = mb_tbl_uni_sb2code3_value[i];
				s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
			}
		} else if (w >= mb_tbl_uni_sb2code5_min && w <= mb_tbl_uni_sb2code5_max) {
			/* This table is searched with 0xF0000 subtracted from the codepoint */
			int i = mbfl_bisec_srch2(w - 0xF0000, mb_tbl_uni_sb2code5_key, mb_tbl_uni_sb2code5_len);
			if (i >= 0) {
				s = mb_tbl_uni_sb2code5_val[i];
				s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
			}
		}

emit_output:
		if (!s && w) {
			/* No mapping for a non-zero codepoint: report it, then make sure there
			 * is still one byte available for each remaining input codepoint */
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjis_sb);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		} else if (s <= 0xFF) {
			/* Single-byte output (this also covers w == 0, which emits a zero byte) */
			out = mb_convert_buf_add(out, s);
		} else {
			/* Two-byte output: split `s` and run it through SJIS_ENCODE */
			unsigned int c1 = (s >> 8) & 0xFF, c2 = s & 0xFF, s1, s2;
			SJIS_ENCODE(c1, c2, s1, s2);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			out = mb_convert_buf_add2(out, s1, s2);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
7190 
mb_sjis2004_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)7191 static size_t mb_sjis2004_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
7192 {
7193 	unsigned char *p = *in, *e = p + *in_len;
7194 	uint32_t *out = buf, *limit = buf + bufsize - 1;
7195 
7196 	while (p < e && out < limit) {
7197 		unsigned char c = *p++;
7198 
7199 		if (c <= 0x7F) {
7200 			if (c == 0x5C) {
7201 				*out++ = 0xA5;
7202 			} else if (c == 0x7E) {
7203 				*out++ = 0x203E;
7204 			} else {
7205 				*out++ = c;
7206 			}
7207 		} else if (c >= 0xA1 && c <= 0xDF) {
7208 			*out++ = 0xFEC0 + c;
7209 		} else {
7210 			if (p == e) {
7211 				*out++ = MBFL_BAD_INPUT;
7212 				break;
7213 			}
7214 			unsigned char c2 = *p++;
7215 			uint32_t w1 = sjis_mobile_decode_tbl1[c] + sjis_decode_tbl2[c2];
7216 
7217 			/* Conversion for combining characters */
7218 			if (w1 >= 0x0170 && w1 <= 0x03F1) {
7219 				int k = mbfl_bisec_srch2(w1, jisx0213_u2_key_b, jisx0213_u2_tbl_len);
7220 				if (k >= 0) {
7221 					*out++ = jisx0213_u2_tbl[2*k];
7222 					*out++ = jisx0213_u2_tbl[2*k+1];
7223 					continue;
7224 				}
7225 			}
7226 
7227 			/* Conversion for BMP */
7228 			if (w1 < jisx0213_ucs_table_size) {
7229 				uint32_t w = jisx0213_ucs_table[w1];
7230 				if (w) {
7231 					*out++ = w;
7232 					continue;
7233 				}
7234 			}
7235 
7236 			/* Conversion for CJK Unified Ideographs extension B (U+2XXXX) */
7237 			int k = mbfl_bisec_srch2(w1, jisx0213_jis_u5_key, jisx0213_u5_tbl_len);
7238 			if (k >= 0) {
7239 				*out++ = jisx0213_jis_u5_tbl[k] + 0x20000;
7240 			} else {
7241 				if (c == 0x80 || c == 0xA0 || c >= 0xFD) {
7242 					p--;
7243 				}
7244 				*out++ = MBFL_BAD_INPUT;
7245 			}
7246 		}
7247 	}
7248 
7249 	 *in_len = e - p;
7250 	 *in = p;
7251 	 return out - buf;
7252 }
7253 
/* Encode `len` Unicode codepoints read from `in` as SJIS-2004 bytes, appending
 * the result to `buf`.
 *
 * Some characters in this encoding correspond to a sequence of two codepoints
 * (listed pairwise in `jisx0213_u2_tbl`). When a codepoint which can start such
 * a sequence is the last one in a buffer and `end` is false, it is saved in
 * `buf->state` and reprocessed at the start of the next call, so that it can
 * still be combined with the codepoint which follows.
 *
 * `end` is true when no further input will follow this buffer. */
static void mb_wchar_to_sjis2004(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	/* Reserve one byte per input codepoint; two-byte outputs reserve extra
	 * space further down */
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	uint32_t w;
	if (buf->state) {
		/* Resume with the codepoint left pending by the previous call */
		w = buf->state;
		buf->state = 0;
		goto process_codepoint;
	}

	while (len--) {
		w = *in++;
process_codepoint: ;
		/* `s` stays 0 until one of the lookups below yields a code for `w` */
		unsigned int s = 0;

		/* Quick range filter for codepoints which may appear as the first half
		 * of an entry in `jisx0213_u2_tbl` */
		if (w == 0xE6 || (w >= 0x254 && w <= 0x2E9) || (w >= 0x304B && w <= 0x3053) || (w >= 0x30AB && w <= 0x30C8) || w == 0x31F7) {
			for (int k = 0; k < jisx0213_u2_tbl_len; k++) {
				if (w == jisx0213_u2_tbl[2*k]) {
					if (!len) {
						if (!end) {
							/* More input is coming; wait for it before deciding
							 * whether `w` is part of a pair */
							buf->state = w;
							MB_CONVERT_BUF_STORE(buf, out, limit);
							return;
						}
						/* Otherwise this is the very last codepoint; fall through
						 * to the fallback below */
					} else {
						/* Peek at the next codepoint; it is pushed back below if
						 * it does not complete the pair */
						uint32_t w2 = *in++; len--;
						/* For these four base characters followed by U+0301, the
						 * next table entry is the one to compare against
						 * (NOTE(review): presumably each has two consecutive
						 * entries in the table - confirm) */
						if ((w == 0x254 || w == 0x28C || w == 0x259 || w == 0x25A) && w2 == 0x301) {
							k++;
						}
						if (w2 == jisx0213_u2_tbl[2*k+1]) {
							s = jisx0213_u2_key[k];
							break;
						}
						in--; len++;
					}

					/* Fallback */
					s = jisx0213_u2_fb_tbl[k];
					break;
				}
			}
		}

		/* Check for major Japanese chars: U+4E00-U+9FFF */
		if (!s) {
			/* Use the first range which contains `w` */
			for (int k = 0; k < uni2jis_tbl_len; k++) {
				if (w >= uni2jis_tbl_range[k][0] && w <= uni2jis_tbl_range[k][1]) {
					s = uni2jis_tbl[k][w - uni2jis_tbl_range[k][0]];
					break;
				}
			}
		}

		/* Check for Japanese chars in compressed mapping area: U+1E00-U+4DBF */
		if (!s && w >= ucs_c1_jisx0213_min && w <= ucs_c1_jisx0213_max) {
			int k = mbfl_bisec_srch(w, ucs_c1_jisx0213_tbl, ucs_c1_jisx0213_tbl_len);
			if (k >= 0) {
				/* Code for the start of range `k`, plus the offset of `w` within
				 * that range */
				s = ucs_c1_jisx0213_ofst[k] + w - ucs_c1_jisx0213_tbl[2*k];
			}
		}

		/* Check for Japanese chars in CJK Unified Ideographs ext.B (U+2XXXX) */
		if (!s && w >= jisx0213_u5_tbl_min && w <= jisx0213_u5_tbl_max) {
			int k = mbfl_bisec_srch2(w - 0x20000, jisx0213_u5_jis_key, jisx0213_u5_tbl_len);
			if (k >= 0) {
				s = jisx0213_u5_jis_tbl[k];
			}
		}

		if (!s) {
			/* CJK Compatibility Forms: U+FE30-U+FE4F */
			if (w == 0xFE45) {
				s = 0x233E;
			} else if (w == 0xFE46) {
				s = 0x233D;
			} else if (w >= 0xF91D && w <= 0xF9DC) {
				/* CJK Compatibility Ideographs: U+F900-U+F92A */
				int k = mbfl_bisec_srch2(w, ucs_r2b_jisx0213_cmap_key, ucs_r2b_jisx0213_cmap_len);
				if (k >= 0) {
					s = ucs_r2b_jisx0213_cmap_val[k];
				}
			}
		}

		if (!s && w) {
			/* No mapping for a non-zero codepoint: report it, then make sure there
			 * is still one byte available for each remaining input codepoint */
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjis2004);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		} else if (s <= 0xFF) {
			/* Single-byte output (this also covers w == 0, which emits a zero byte) */
			out = mb_convert_buf_add(out, s);
		} else {
			/* Two-byte output: split `s` and run it through SJIS_ENCODE */
			unsigned int c1 = (s >> 8) & 0xFF, c2 = s & 0xFF, s1, s2;
			SJIS_ENCODE(c1, c2, s1, s2);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			out = mb_convert_buf_add2(out, s1, s2);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
7356 
/* Byte-at-a-time CP932 decoder.
 *
 * `c` is the next input byte. `filter->status` is 0 when a new character is
 * expected and 1 when the first byte of a two-byte character is being held in
 * `filter->cache`. Decoded codepoints (or MBFL_BAD_INPUT) are passed to
 * `filter->output_function`.
 *
 * Returns 0; the CK() wrapper around each output call may return early if the
 * output function reports failure. */
static int mbfl_filt_conv_cp932_wchar(int c, mbfl_convert_filter *filter)
{
	int c1, s, s1, s2, w;

	switch (filter->status) {
	case 0:
		if (c >= 0 && c <= 0x7f) {
			/* latin */
			CK((*filter->output_function)(c, filter->data));
		} else if (c >= 0xa1 && c <= 0xdf) {
			/* kana */
			CK((*filter->output_function)(0xfec0 + c, filter->data));
		} else if (c >= 0x81 && c <= 0xfc && c != 0xa0) {
			/* first byte of a kanji: hold it until the second byte arrives */
			filter->cache = c;
			filter->status = 1;
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 1:
		/* second byte of a kanji */
		filter->status = 0;
		c1 = filter->cache;

		if (c < 0x40 || c > 0xfc || c == 0x7f) {
			/* not a valid second byte */
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			break;
		}

		SJIS_DECODE(c1, c, s1, s2);
		s = (s1 - 0x21)*94 + s2 - 0x21;

		/* A handful of cells have fixed mappings which take priority over
		 * the tables */
		switch (s) {
		case 31:
			w = 0xff3c; /* FULLWIDTH REVERSE SOLIDUS */
			break;
		case 32:
			w = 0xff5e; /* FULLWIDTH TILDE */
			break;
		case 33:
			w = 0x2225; /* PARALLEL TO */
			break;
		case 60:
			w = 0xff0d; /* FULLWIDTH HYPHEN-MINUS */
			break;
		case 80:
			w = 0xffe0; /* FULLWIDTH CENT SIGN */
			break;
		case 81:
			w = 0xffe1; /* FULLWIDTH POUND SIGN */
			break;
		case 137:
			w = 0xffe2; /* FULLWIDTH NOT SIGN */
			break;
		default:
			w = 0;
			break;
		}

		if (w == 0) {
			if (s >= cp932ext1_ucs_table_min && s < cp932ext1_ucs_table_max) {
				/* vendor ext1 (13ku) */
				w = cp932ext1_ucs_table[s - cp932ext1_ucs_table_min];
			} else if (s >= 0 && s < jisx0208_ucs_table_size) {
				/* X 0208 */
				w = jisx0208_ucs_table[s];
			} else if (s >= cp932ext2_ucs_table_min && s < cp932ext2_ucs_table_max) {
				/* vendor ext2 (89ku - 92ku) */
				w = cp932ext2_ucs_table[s - cp932ext2_ucs_table_min];
			} else if (s >= cp932ext3_ucs_table_min && s < cp932ext3_ucs_table_max) {
				/* vendor ext3 (115ku - 119ku) */
				w = cp932ext3_ucs_table[s - cp932ext3_ucs_table_min];
			} else if (s >= (94*94) && s < (114*94)) {
				/* user (95ku - 114ku) */
				w = s - (94*94) + 0xe000;
			}
		}

		/* Nothing matched, or the table entry was empty */
		if (w <= 0) {
			w = MBFL_BAD_INPUT;
		}

		CK((*filter->output_function)(w, filter->data));
		break;

		EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
7428 
/* Finish decoding: if the first byte of a two-byte character is still being
 * held (non-zero `filter->status`), emit MBFL_BAD_INPUT for it and reset the
 * status. Then invoke `filter->flush_function`, if one is set.
 *
 * Always returns 0. */
static int mbfl_filt_conv_cp932_wchar_flush(mbfl_convert_filter *filter)
{
	if (filter->status != 0) {
		/* Input ended in the middle of a character */
		filter->output_function(MBFL_BAD_INPUT, filter->data);
		filter->status = 0;
	}

	if (!filter->flush_function) {
		return 0;
	}

	filter->flush_function(filter->data);
	return 0;
}
7442 
/* Filter function converting one wchar codepoint `c` to CP932 bytes.
 *
 * Lookup order:
 *   1. the ucs_*_jis tables (with U+203E mapped to 0x7E) and the
 *      user-defined area U+E000.. (95ku - 114ku);
 *   2. a handful of hard-coded codepoints, consulted only if step 1 found
 *      nothing;
 *   3. a linear scan of the CP932 vendor extension tables (ext1, then ext3),
 *      used when nothing was found or the table value is >= 0x8080
 *      (JIS X 0212, which CP932 cannot represent).
 *
 * Values below 0x100 are emitted as a single byte; anything else is split
 * into row/cell, passed through SJIS_ENCODE and emitted as two bytes.
 * Unconvertible codepoints go to mbfl_filt_conv_illegal_output().
 * Returns 0; the CK() wrapper handles failures of the output calls. */
static int mbfl_filt_conv_wchar_cp932(int c, mbfl_convert_filter *filter)
{
	int c1, c2;
	int s1 = 0;
	int s2 = 0; /* set to 1 for user-defined characters, exempting them from the X 0212 check */

	if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
		s1 = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
	} else if (c == 0x203E) {
		s1 = 0x7E;
	} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
		s1 = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
	} else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
		s1 = ucs_i_jis_table[c - ucs_i_jis_table_min];
	} else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
		s1 = ucs_r_jis_table[c - ucs_r_jis_table_min];
	} else if (c >= 0xe000 && c < (0xe000 + 20*94)) {
		/* user (95ku - 114ku) */
		int offset = c - 0xe000;
		s1 = ((offset / 94 + 0x7f) << 8) | (offset % 94 + 0x21);
		s2 = 1;
	}

	if (s1 <= 0) {
		switch (c) {
		case 0xa5:   /* YEN SIGN */
			s1 = 0x5C;
			break;
		case 0xff3c: /* FULLWIDTH REVERSE SOLIDUS */
			s1 = 0x2140;
			break;
		case 0x2225: /* PARALLEL TO */
			s1 = 0x2142;
			break;
		case 0xff0d: /* FULLWIDTH HYPHEN-MINUS */
			s1 = 0x215d;
			break;
		case 0xffe0: /* FULLWIDTH CENT SIGN */
			s1 = 0x2171;
			break;
		case 0xffe1: /* FULLWIDTH POUND SIGN */
			s1 = 0x2172;
			break;
		case 0xffe2: /* FULLWIDTH NOT SIGN */
			s1 = 0x224c;
			break;
		default:
			break;
		}
	}

	if (s1 <= 0 || (s1 >= 0x8080 && s2 == 0)) {
		/* not found or X 0212: try the vendor extension tables */
		const int ext1_count = cp932ext1_ucs_table_max - cp932ext1_ucs_table_min;
		const int ext3_count = cp932ext3_ucs_table_max - cp932ext3_ucs_table_min;

		s1 = -1;

		/* CP932 vendor ext1 (13ku) */
		for (int i = 0; i < ext1_count; i++) {
			if (c == cp932ext1_ucs_table[i]) {
				s1 = ((i / 94 + 0x2d) << 8) + (i % 94 + 0x21);
				break;
			}
		}

		if (s1 <= 0) {
			/* CP932 vendor ext3 (115ku - 119ku) */
			for (int i = 0; i < ext3_count; i++) {
				if (c == cp932ext3_ucs_table[i]) {
					s1 = ((i / 94 + 0x93) << 8) + (i % 94 + 0x21);
					break;
				}
			}
		}

		/* U+0000 always converts to a zero byte, even if a scan above
		 * happened to match an entry holding 0 */
		if (c == 0) {
			s1 = 0;
		} else if (s1 <= 0) {
			s1 = -1;
		}
	}

	if (s1 < 0) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
		return 0;
	}

	if (s1 < 0x100) {
		/* latin or kana */
		CK((*filter->output_function)(s1, filter->data));
	} else {
		/* kanji */
		c1 = (s1 >> 8) & 0xff;
		c2 = s1 & 0xff;
		SJIS_ENCODE(c1, c2, s1, s2);
		CK((*filter->output_function)(s1, filter->data));
		CK((*filter->output_function)(s2, filter->data));
	}

	return 0;
}
7527 
/* Filter function converting one wchar codepoint to "SJIS-win".
 *
 * Only two codepoints are handled here: U+00A5 is emitted as 0x81 0x8F and
 * U+203E as 0x81 0x50. Every other codepoint is delegated unchanged to
 * mbfl_filt_conv_wchar_cp932(). */
static int mbfl_filt_conv_wchar_sjiswin(int c, mbfl_convert_filter *filter)
{
	int second_byte;

	switch (c) {
	case 0xA5:
		second_byte = 0x8F;
		break;
	case 0x203E:
		second_byte = 0x50;
		break;
	default:
		return mbfl_filt_conv_wchar_cp932(c, filter);
	}

	CK((*filter->output_function)(0x81, filter->data));
	CK((*filter->output_function)(second_byte, filter->data));
	return 0;
}
7541 
mb_cp932_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)7542 static size_t mb_cp932_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
7543 {
7544 	unsigned char *p = *in, *e = p + *in_len;
7545 	uint32_t *out = buf, *limit = buf + bufsize;
7546 
7547 	while (p < e && out < limit) {
7548 		unsigned char c = *p++;
7549 
7550 		if (c < 0x80) {
7551 			*out++ = c;
7552 		} else if (c > 0xA0 && c < 0xE0) {
7553 			/* Kana */
7554 			*out++ = 0xFEC0 + c;
7555 		} else {
7556 			if (p == e) {
7557 				*out++ = MBFL_BAD_INPUT;
7558 				break;
7559 			}
7560 			unsigned char c2 = *p++;
7561 			unsigned int w = 0;
7562 			unsigned int s = sjis_mobile_decode_tbl1[c] + sjis_decode_tbl2[c2];
7563 
7564 			if (s <= 137) {
7565 				if (s == 31) {
7566 					w = 0xFF3C; /* FULLWIDTH REVERSE SOLIDUS */
7567 				} else if (s == 32) {
7568 					w = 0xFF5E; /* FULLWIDTH TILDE */
7569 				} else if (s == 33) {
7570 					w = 0x2225; /* PARALLEL TO */
7571 				} else if (s == 60) {
7572 					w = 0xFF0D; /* FULLWIDTH HYPHEN-MINUS */
7573 				} else if (s == 80) {
7574 					w = 0xFFE0; /* FULLWIDTH CENT SIGN */
7575 				} else if (s == 81) {
7576 					w = 0xFFE1; /* FULLWIDTH POUND SIGN */
7577 				} else if (s == 137) {
7578 					w = 0xFFE2; /* FULLWIDTH NOT SIGN */
7579 				}
7580 			}
7581 
7582 			if (w == 0) {
7583 				if (s >= cp932ext1_ucs_table_min && s < cp932ext1_ucs_table_max) {
7584 					w = cp932ext1_ucs_table[s - cp932ext1_ucs_table_min];
7585 				} else if (s < jisx0208_ucs_table_size) {
7586 					w = jisx0208_ucs_table[s];
7587 				} else if (s >= cp932ext2_ucs_table_min && s < cp932ext2_ucs_table_max) {
7588 					w = cp932ext2_ucs_table[s - cp932ext2_ucs_table_min];
7589 				} else if (s >= cp932ext3_ucs_table_min && s < cp932ext3_ucs_table_max) {
7590 					w = cp932ext3_ucs_table[s - cp932ext3_ucs_table_min];
7591 				} else if (s >= (94*94) && s < (114*94)) {
7592 					w = s - (94*94) + 0xE000;
7593 				}
7594 			}
7595 
7596 			if (!w) {
7597 				if (c == 0x80 || c == 0xA0 || c >= 0xFD) {
7598 					p--;
7599 				}
7600 				w = MBFL_BAD_INPUT;
7601 			}
7602 			*out++ = w;
7603 		}
7604 	}
7605 
7606 	*in_len = e - p;
7607 	*in = p;
7608 	return out - buf;
7609 }
7610 
/* Buffer-based encoder from wchar codepoints to CP932.
 *
 * Converts `len` codepoints from `in`, appending bytes to `buf`. `end` is
 * not used. Each codepoint is first looked up in the ucs_*_jis tables (or
 * mapped from the user-defined area U+E000..), then a fixed set of
 * codepoints overrides that result. If nothing was found, or the value is
 * >= 0x8080 (JIS X 0212), the CP932 vendor extension tables are searched;
 * failing that, the codepoint is reported through MB_CONVERT_ERROR. */
static void mb_wchar_to_cp932(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	/* Each converted codepoint emits at most two bytes */
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);

	while (len--) {
		uint32_t w = *in++;
		unsigned int s1 = 0, s2 = 0;

		if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) {
			s1 = ucs_a1_jis_table[w - ucs_a1_jis_table_min];
		} else if (w == 0x203E) {
			s1 = 0x7E;
		} else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) {
			s1 = ucs_a2_jis_table[w - ucs_a2_jis_table_min];
		} else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) {
			s1 = ucs_i_jis_table[w - ucs_i_jis_table_min];
		} else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) {
			s1 = ucs_r_jis_table[w - ucs_r_jis_table_min];
		} else if (w >= 0xE000 && w < (0xE000 + 20*94)) {
			/* User-defined area */
			unsigned int offset = w - 0xE000;
			s1 = ((offset/94 + 0x7F) << 8) | (offset%94 + 0x21);
			s2 = 1; /* marks the value as exempt from the X 0212 check below */
		}

		/* These codepoints override whatever the tables returned */
		switch (w) {
		case 0xA5:   /* YEN SIGN */
			s1 = 0x5C;
			break;
		case 0xFF3C: /* FULLWIDTH REVERSE SOLIDUS */
			s1 = 0x2140;
			break;
		case 0x2225: /* PARALLEL TO */
			s1 = 0x2142;
			break;
		case 0xFF0D: /* FULLWIDTH HYPHEN-MINUS */
			s1 = 0x215D;
			break;
		case 0xFFE0: /* FULLWIDTH CENT SIGN */
			s1 = 0x2171;
			break;
		case 0xFFE1: /* FULLWIDTH POUND SIGN */
			s1 = 0x2172;
			break;
		case 0xFFE2: /* FULLWIDTH NOT SIGN */
			s1 = 0x224C;
			break;
		case 0:
			out = mb_convert_buf_add(out, 0);
			continue;
		default:
			break;
		}

		if (!s1 || (s1 >= 0x8080 && !s2)) { /* not found or X 0212 */
			const unsigned short *lookup = mbfl_binary_search_paired_sorted_table(w, cp932ext1_ucs_table_paired_sorted, sizeof(cp932ext1_ucs_table_paired_sorted) / sizeof(*cp932ext1_ucs_table_paired_sorted));
			if (lookup) {
				s1 = ((*lookup/94 + 0x2D) << 8) + (*lookup%94 + 0x21);
			} else {
				lookup = mbfl_binary_search_paired_sorted_table(w, cp932ext3_ucs_table_paired_sorted, sizeof(cp932ext3_ucs_table_paired_sorted) / sizeof(*cp932ext3_ucs_table_paired_sorted));
				if (lookup) {
					s1 = ((*lookup/94 + 0x93) << 8) + (*lookup%94 + 0x21);
				} else {
					MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_cp932);
					/* Re-establish room for the codepoints still to come */
					MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
					continue;
				}
			}
		}

		if (s1 < 0x100) {
			out = mb_convert_buf_add(out, s1);
		} else {
			unsigned int c1 = (s1 >> 8) & 0xFF;
			unsigned int c2 = s1 & 0xFF;
			SJIS_ENCODE(c1, c2, s1, s2);
			out = mb_convert_buf_add2(out, s1, s2);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
7689 
/* Buffer-based encoder from wchar codepoints to "SJIS-win".
 *
 * Converts `len` codepoints from `in`, appending bytes to `buf`. `end` is
 * not used. The logic mirrors mb_wchar_to_cp932() with two differences:
 * U+00A5 (YEN SIGN) is mapped to 0x216F (FULLWIDTH YEN SIGN) instead of the
 * single byte 0x5C, and U+203E has no special case, so it takes whatever the
 * ucs_*_jis tables return.
 *
 * Codepoints which cannot be converted are reported through
 * MB_CONVERT_ERROR. The callback passed there is this function itself, so
 * that any output produced while handling the error follows the SJIS-win
 * mappings rather than the (differing) CP932 ones. */
static void mb_wchar_to_sjiswin(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	/* Each converted codepoint emits at most two bytes */
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);

	while (len--) {
		uint32_t w = *in++;
		unsigned int s1 = 0, s2 = 0, c1, c2;

		if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) {
			s1 = ucs_a1_jis_table[w - ucs_a1_jis_table_min];
		} else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) {
			s1 = ucs_a2_jis_table[w - ucs_a2_jis_table_min];
		} else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) {
			s1 = ucs_i_jis_table[w - ucs_i_jis_table_min];
		} else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) {
			s1 = ucs_r_jis_table[w - ucs_r_jis_table_min];
		} else if (w >= 0xE000 && w < (0xE000 + 20*94)) {
			/* User-defined area */
			s1 = w - 0xE000;
			c1 = s1/94 + 0x7F;
			c2 = s1%94 + 0x21;
			s1 = (c1 << 8) | c2;
			s2 = 1; /* marks the value as exempt from the X 0212 check below */
		}

		/* These codepoints override whatever the tables returned */
		if (w == 0xA5) { /* YEN SIGN */
			s1 = 0x216F; /* FULLWIDTH YEN SIGN */
		} else if (w == 0xFF3C) { /* FULLWIDTH REVERSE SOLIDUS */
			s1 = 0x2140;
		} else if (w == 0x2225) { /* PARALLEL TO */
			s1 = 0x2142;
		} else if (w == 0xFF0D) { /* FULLWIDTH HYPHEN-MINUS */
			s1 = 0x215D;
		} else if (w == 0xFFE0) { /* FULLWIDTH CENT SIGN */
			s1 = 0x2171;
		} else if (w == 0xFFE1) { /* FULLWIDTH POUND SIGN */
			s1 = 0x2172;
		} else if (w == 0xFFE2) { /* FULLWIDTH NOT SIGN */
			s1 = 0x224C;
		} else if (w == 0) {
			out = mb_convert_buf_add(out, 0);
			continue;
		}

		if (!s1 || (s1 >= 0x8080 && !s2)) { /* not found or X 0212 */
			const unsigned short *lookup = mbfl_binary_search_paired_sorted_table(w, cp932ext1_ucs_table_paired_sorted, sizeof(cp932ext1_ucs_table_paired_sorted) / sizeof(*cp932ext1_ucs_table_paired_sorted));
			if (lookup) {
				s1 = ((*lookup/94 + 0x2D) << 8) + (*lookup%94 + 0x21);
				goto emit_output;
			}

			lookup = mbfl_binary_search_paired_sorted_table(w, cp932ext3_ucs_table_paired_sorted, sizeof(cp932ext3_ucs_table_paired_sorted) / sizeof(*cp932ext3_ucs_table_paired_sorted));
			if (lookup) {
				s1 = ((*lookup/94 + 0x93) << 8) + (*lookup%94 + 0x21);
				goto emit_output;
			}

			/* Pass this encoder (not the CP932 one) as the error callback */
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjiswin);
			/* Re-establish room for the codepoints still to come */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
			continue;
		}

emit_output:
		if (s1 < 0x100) {
			out = mb_convert_buf_add(out, s1);
		} else {
			c1 = (s1 >> 8) & 0xFF;
			c2 = s1 & 0xFF;
			SJIS_ENCODE(c1, c2, s1, s2);
			out = mb_convert_buf_add2(out, s1, s2);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
7766 
/* Byte-length table for Shift-JIS with one entry per possible byte value.
 * Entries are 2 for indices 0x81-0x9F and 0xE0-0xEF, and 1 everywhere else. */
static const unsigned char mblen_table_sjis[] = { /* 0x81-0x9F,0xE0-0xEF */
	/* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x80 */ 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0x90 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xA0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xB0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xC0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xD0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xE0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xF0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
};
7785 
/* Byte-length table for SJIS-mac with one entry per possible byte value.
 * Entries are 2 for indices 0x81-0x9F and 0xE0-0xED, and 1 everywhere else. */
static const unsigned char mblen_table_sjismac[] = { /* 0x81-0x9F,0xE0-0xED */
	/* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x80 */ 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0x90 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xA0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xB0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xC0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xD0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xE0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1,
	/* 0xF0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
};
7804 
/* Byte-length table for the mobile Shift-JIS variants with one entry per
 * possible byte value. Entries are 2 for indices 0x81-0x9F and 0xE0-0xFC,
 * and 1 everywhere else. */
static const unsigned char mblen_table_sjis_mobile[] = { /* 0x81-0x9F,0xE0-0xFC */
	/* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x80 */ 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0x90 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xA0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xB0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xC0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xD0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xE0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xF0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1
};
7823 
/* Alias names listed for the Shift-JIS encoding; the array is NULL-terminated */
static const char *mbfl_encoding_sjis_aliases[] = {"x-sjis", "SHIFT-JIS", NULL};

/* Filter table for the SJIS -> wchar direction. It lists the two encoding
 * IDs, the shared constructor, the filter function and its flush function;
 * the remaining slots are NULL. */
static const struct mbfl_convert_vtbl vtbl_sjis_wchar = {
	mbfl_no_encoding_sjis,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_sjis_wchar,
	mbfl_filt_conv_sjis_wchar_flush,
	NULL
};

/* Filter table for the wchar -> SJIS direction; uses the common flush function */
static const struct mbfl_convert_vtbl vtbl_wchar_sjis = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_sjis,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_sjis,
	mbfl_filt_conv_common_flush,
	NULL
};

/* Encoding descriptor for Shift-JIS. It ties together the encoding ID, the
 * name strings, the alias list, the byte-length table, a flag, both filter
 * tables above and the two buffer-based conversion functions. The last two
 * slots are left NULL. */
const mbfl_encoding mbfl_encoding_sjis = {
	mbfl_no_encoding_sjis,
	"SJIS",
	"Shift_JIS",
	mbfl_encoding_sjis_aliases,
	mblen_table_sjis,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_sjis_wchar,
	&vtbl_wchar_sjis,
	mb_sjis_to_wchar,
	mb_wchar_to_sjis,
	NULL,
	NULL,
};
7860 
/* Alias names listed for the SJIS-mac encoding; the array is NULL-terminated */
static const char *mbfl_encoding_sjis_mac_aliases[] = {"MacJapanese", "x-Mac-Japanese", NULL};

/* Filter table for the SJIS-mac -> wchar direction. Note that the flush
 * function is the one shared with plain SJIS (mbfl_filt_conv_sjis_wchar_flush). */
static const struct mbfl_convert_vtbl vtbl_sjis_mac_wchar = {
	mbfl_no_encoding_sjis_mac,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_sjis_mac_wchar,
	mbfl_filt_conv_sjis_wchar_flush,
	NULL,
};

/* Filter table for the wchar -> SJIS-mac direction; this direction has its
 * own dedicated flush function */
static const struct mbfl_convert_vtbl vtbl_wchar_sjis_mac = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_sjis_mac,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_sjis_mac,
	mbfl_filt_conv_wchar_sjis_mac_flush,
	NULL,
};

/* Encoding descriptor for SJIS-mac: encoding ID, name strings, alias list,
 * byte-length table, a flag, both filter tables above and the two
 * buffer-based conversion functions. The last two slots are left NULL. */
const mbfl_encoding mbfl_encoding_sjis_mac = {
	mbfl_no_encoding_sjis_mac,
	"SJIS-mac",
	"Shift_JIS",
	mbfl_encoding_sjis_mac_aliases,
	mblen_table_sjismac,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_sjis_mac_wchar,
	&vtbl_wchar_sjis_mac,
	mb_sjismac_to_wchar,
	mb_wchar_to_sjismac,
	NULL,
	NULL,
};
7897 
/* Alias names listed for the three mobile Shift-JIS variants (DoCoMo, KDDI,
 * SoftBank); each array is NULL-terminated */
static const char *mbfl_encoding_sjis_docomo_aliases[] = {"SJIS-DOCOMO", "shift_jis-imode", "x-sjis-emoji-docomo", NULL};
static const char *mbfl_encoding_sjis_kddi_aliases[] = {"SJIS-KDDI", "shift_jis-kddi", "x-sjis-emoji-kddi", NULL};
static const char *mbfl_encoding_sjis_sb_aliases[] = {"SJIS-SOFTBANK", "shift_jis-softbank", "x-sjis-emoji-softbank", NULL};

/* Filter table for the SJIS-DoCoMo -> wchar direction. The filter function
 * is the one shared by all mobile variants; the flush function is the one
 * used for plain SJIS. */
static const struct mbfl_convert_vtbl vtbl_sjis_docomo_wchar = {
	mbfl_no_encoding_sjis_docomo,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_sjis_mobile_wchar,
	mbfl_filt_conv_sjis_wchar_flush,
	NULL,
};

/* Filter table for the wchar -> SJIS-DoCoMo direction, using the shared
 * mobile filter and flush functions */
static const struct mbfl_convert_vtbl vtbl_wchar_sjis_docomo = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_sjis_docomo,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_sjis_mobile,
	mbfl_filt_conv_sjis_mobile_flush,
	NULL,
};

/* Encoding descriptor for SJIS-Mobile#DOCOMO: encoding ID, name strings,
 * alias list, the mobile byte-length table, a flag, both filter tables above
 * and the DoCoMo-specific buffer-based conversion functions. The last two
 * slots are left NULL. */
const mbfl_encoding mbfl_encoding_sjis_docomo = {
	mbfl_no_encoding_sjis_docomo,
	"SJIS-Mobile#DOCOMO",
	"Shift_JIS",
	mbfl_encoding_sjis_docomo_aliases,
	mblen_table_sjis_mobile,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_sjis_docomo_wchar,
	&vtbl_wchar_sjis_docomo,
	mb_sjis_docomo_to_wchar,
	mb_wchar_to_sjis_docomo,
	NULL,
	NULL,
};
7936 
/* Filter table for the SJIS-KDDI -> wchar direction. The filter function is
 * the one shared by all mobile variants; the flush function is the one used
 * for plain SJIS. */
static const struct mbfl_convert_vtbl vtbl_sjis_kddi_wchar = {
	mbfl_no_encoding_sjis_kddi,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_sjis_mobile_wchar,
	mbfl_filt_conv_sjis_wchar_flush,
	NULL,
};

/* Filter table for the wchar -> SJIS-KDDI direction, using the shared mobile
 * filter and flush functions */
static const struct mbfl_convert_vtbl vtbl_wchar_sjis_kddi = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_sjis_kddi,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_sjis_mobile,
	mbfl_filt_conv_sjis_mobile_flush,
	NULL,
};

/* Encoding descriptor for SJIS-Mobile#KDDI: encoding ID, name strings, alias
 * list, the mobile byte-length table, a flag, both filter tables above and
 * the KDDI-specific buffer-based conversion functions. The last two slots
 * are left NULL. */
const mbfl_encoding mbfl_encoding_sjis_kddi = {
	mbfl_no_encoding_sjis_kddi,
	"SJIS-Mobile#KDDI",
	"Shift_JIS",
	mbfl_encoding_sjis_kddi_aliases,
	mblen_table_sjis_mobile,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_sjis_kddi_wchar,
	&vtbl_wchar_sjis_kddi,
	mb_sjis_kddi_to_wchar,
	mb_wchar_to_sjis_kddi,
	NULL,
	NULL,
};
7971 
/* Filter table for the SJIS-SoftBank -> wchar direction. The filter function
 * is the one shared by all mobile variants; the flush function is the one
 * used for plain SJIS. */
static const struct mbfl_convert_vtbl vtbl_sjis_sb_wchar = {
	mbfl_no_encoding_sjis_sb,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_sjis_mobile_wchar,
	mbfl_filt_conv_sjis_wchar_flush,
	NULL,
};

/* Filter table for the wchar -> SJIS-SoftBank direction, using the shared
 * mobile filter and flush functions */
static const struct mbfl_convert_vtbl vtbl_wchar_sjis_sb = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_sjis_sb,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_sjis_mobile,
	mbfl_filt_conv_sjis_mobile_flush,
	NULL,
};

/* Encoding descriptor for SJIS-Mobile#SOFTBANK: encoding ID, name strings,
 * alias list, the mobile byte-length table, a flag, both filter tables above
 * and the SoftBank-specific buffer-based conversion functions. The last two
 * slots are left NULL. */
const mbfl_encoding mbfl_encoding_sjis_sb = {
	mbfl_no_encoding_sjis_sb,
	"SJIS-Mobile#SOFTBANK",
	"Shift_JIS",
	mbfl_encoding_sjis_sb_aliases,
	mblen_table_sjis_mobile,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_sjis_sb_wchar,
	&vtbl_wchar_sjis_sb,
	mb_sjis_sb_to_wchar,
	mb_wchar_to_sjis_sb,
	NULL,
	NULL,
};
8006 
8007 /* Although the specification for Shift-JIS-2004 indicates that 0x5C and
8008  * 0x7E should (respectively) represent a Yen sign and an overbar, feedback
8009  * from Japanese PHP users indicates that they prefer 0x5C and 0x7E to be
8010  * treated as equivalent to U+005C and U+007E. This is the historical
8011  * behavior of mbstring, and promotes compatibility with other software
8012  * which handles Shift-JIS and Shift-JIS-2004 text in this way. */
8013 
/* Alias names listed for the SJIS-2004 encoding; the array is NULL-terminated */
static const char *mbfl_encoding_sjis2004_aliases[] = {"SJIS2004","Shift_JIS-2004", NULL};

/* Filter table for the SJIS-2004 -> wchar direction. Both the filter and the
 * flush function carry the generic "jis2004" name rather than an
 * SJIS-specific one. */
static const struct mbfl_convert_vtbl vtbl_sjis2004_wchar = {
	mbfl_no_encoding_sjis2004,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_jis2004_wchar,
	mbfl_filt_conv_jis2004_wchar_flush,
	NULL,
};

/* Filter table for the wchar -> SJIS-2004 direction, likewise built on the
 * generic "jis2004" filter and flush functions */
static const struct mbfl_convert_vtbl vtbl_wchar_sjis2004 = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_sjis2004,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_jis2004,
	mbfl_filt_conv_wchar_jis2004_flush,
	NULL,
};

/* Encoding descriptor for SJIS-2004: encoding ID, name strings, alias list,
 * byte-length table, a flag, both filter tables above and the two
 * buffer-based conversion functions. The last two slots are left NULL. */
const mbfl_encoding mbfl_encoding_sjis2004 = {
	mbfl_no_encoding_sjis2004,
	"SJIS-2004",
	"Shift_JIS",
	mbfl_encoding_sjis2004_aliases,
	mblen_table_sjis_mobile, /* Leading byte values used for SJIS-2004 are the same as mobile SJIS variants */
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_sjis2004_wchar,
	&vtbl_wchar_sjis2004,
	mb_sjis2004_to_wchar,
	mb_wchar_to_sjis2004,
	NULL,
	NULL,
};
8050 
8051 /* CP932 is Microsoft's version of Shift-JIS.
8052  *
8053  * What we call "SJIS-win" is a variant of CP932 which maps U+00A5
8054  * and U+203E the same way as eucJP-win; namely, instead of mapping
8055  * U+00A5 (YEN SIGN) to 0x5C and U+203E (OVERLINE) to 0x7E,
8056  * these codepoints are mapped to appropriate JIS X 0208 characters.
8057  *
8058  * When converting from Shift-JIS to Unicode, there is no difference
8059  * between CP932 and "SJIS-win".
8060  *
8061  * Additional facts:
8062  *
8063  * • In the libmbfl library which formed the base for mbstring, "CP932" and
8064  *   "SJIS-win" were originally aliases. The differing mappings were added in
8065  *   December 2002. The libmbfl author later stated that this was done so that
8066  *   "CP932" would comply with a certain specification, while "SJIS-win" would
8067  *   maintain the existing mappings. He does not remember which specification
8068  *   it was.
8069  * • The WHATWG specification for "Shift_JIS" (followed by web browsers)
8070  *   agrees with our mappings for "CP932".
8071  * • Microsoft Windows' "best-fit" mappings for CP932 (via the
8072  *   WideCharToMultiByte API) convert U+00A5 to 0x5C, which also agrees with
8073  *   our mappings for "CP932".
8074  * • glibc's iconv converts U+203E to CP932 0x7E, which again agrees with
8075  *   our mappings for "CP932".
8076  * • When converting Shift-JIS to CP932, the conversion goes through Unicode.
8077  *   Shift-JIS 0x7E converts to U+203E, so mapping U+203E to 0x7E means that
8078  *   0x7E will go to 0x7E when converting Shift-JIS to CP932.
8079  */
8080 
/* Byte-length table used for CP932 and SJIS-win with one entry per possible
 * byte value. Entries are 2 for indices 0x81-0x9F and 0xE0-0xFF, and 1
 * everywhere else. */
static const unsigned char mblen_table_sjiswin[] = { /* 0x81-0x9F,0xE0-0xFF */
	/* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x80 */ 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0x90 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xA0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xB0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xC0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xD0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xE0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xF0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
};
8099 
/* Alias names listed for CP932 and for SJIS-win; each array is NULL-terminated */
static const char *mbfl_encoding_cp932_aliases[] = {"MS932", "Windows-31J", "MS_Kanji", NULL};
static const char *mbfl_encoding_sjiswin_aliases[] = {"SJIS-ms", "SJIS-open", NULL};

/* Filter table for the CP932 -> wchar direction: the two encoding IDs, the
 * shared constructor, the CP932 filter function and its flush function; the
 * remaining slots are NULL. */
static const struct mbfl_convert_vtbl vtbl_cp932_wchar = {
	mbfl_no_encoding_cp932,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_cp932_wchar,
	mbfl_filt_conv_cp932_wchar_flush,
	NULL,
};

/* Filter table for the wchar -> CP932 direction; uses the common flush function */
static const struct mbfl_convert_vtbl vtbl_wchar_cp932 = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_cp932,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_cp932,
	mbfl_filt_conv_common_flush,
	NULL,
};

/* Encoding descriptor for CP932: encoding ID, name strings, alias list, the
 * byte-length table shared with SJIS-win, a flag, both filter tables above
 * and the two buffer-based conversion functions. The last two slots are
 * left NULL. */
const mbfl_encoding mbfl_encoding_cp932 = {
	mbfl_no_encoding_cp932,
	"CP932",
	"Shift_JIS",
	mbfl_encoding_cp932_aliases,
	mblen_table_sjiswin,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_cp932_wchar,
	&vtbl_wchar_cp932,
	mb_cp932_to_wchar,
	mb_wchar_to_cp932,
	NULL,
	NULL,
};
8137 
/* Filter table for the SJIS-win -> wchar direction. Decoding reuses the
 * CP932 filter and flush functions unchanged. */
static const struct mbfl_convert_vtbl vtbl_sjiswin_wchar = {
	mbfl_no_encoding_sjiswin,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_cp932_wchar,
	mbfl_filt_conv_cp932_wchar_flush,
	NULL,
};

/* Filter table for the wchar -> SJIS-win direction; this direction has its
 * own filter function and uses the common flush function */
static const struct mbfl_convert_vtbl vtbl_wchar_sjiswin = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_sjiswin,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_sjiswin,
	mbfl_filt_conv_common_flush,
	NULL,
};

/* Encoding descriptor for SJIS-win: encoding ID, name strings, alias list,
 * byte-length table, a flag and both filter tables above. The buffer-based
 * decoder is the CP932 one (mb_cp932_to_wchar); only the encoder is
 * SJIS-win specific. The last two slots are left NULL. */
const mbfl_encoding mbfl_encoding_sjiswin = {
	mbfl_no_encoding_sjiswin,
	"SJIS-win",
	"Shift_JIS",
	mbfl_encoding_sjiswin_aliases,
	mblen_table_sjiswin,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_sjiswin_wchar,
	&vtbl_wchar_sjiswin,
	mb_cp932_to_wchar,
	mb_wchar_to_sjiswin,
	NULL,
	NULL,
};
8172 
8173 /*
8174  * EUC variants
8175  */
8176 
/* Filter function decoding EUC-JP one byte at a time into wchar codepoints.
 *
 * filter->status tracks the position inside a multi-byte sequence:
 *   0  no sequence in progress
 *   1  first byte of a JIS X 0208 pair is held in filter->cache
 *   2  0x8E seen, a kana byte is expected
 *   3  0x8F seen, first JIS X 0212 byte is expected
 *   4  first JIS X 0212 byte is held in filter->cache
 * Anything which cannot be decoded is emitted as MBFL_BAD_INPUT.
 * Returns 0; the CK() wrapper handles failures of the output calls. */
static int mbfl_filt_conv_eucjp_wchar(int c, mbfl_convert_filter *filter)
{
	int c1, s, w = 0;

	switch (filter->status) {
	case 0:
		switch (c) {
		case 0x8e: /* kana first char */
			filter->status = 2;
			break;
		case 0x8f: /* X 0212 first char */
			filter->status = 3;
			break;
		default:
			if (c >= 0 && c < 0x80) { /* latin */
				CK((*filter->output_function)(c, filter->data));
			} else if (c > 0xa0 && c < 0xff) { /* X 0208 first char */
				filter->status = 1;
				filter->cache = c;
			} else {
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			}
			break;
		}
		break;

	case 1: /* got first half */
		filter->status = 0;
		c1 = filter->cache;
		w = MBFL_BAD_INPUT;
		if (c > 0xa0 && c < 0xff) {
			s = (c1 - 0xa1)*94 + c - 0xa1;
			/* A zero table entry means the slot is unassigned */
			if (s >= 0 && s < jisx0208_ucs_table_size && jisx0208_ucs_table[s]) {
				w = jisx0208_ucs_table[s];
			}
		}
		CK((*filter->output_function)(w, filter->data));
		break;

	case 2: /* got 0x8e */
		filter->status = 0;
		w = MBFL_BAD_INPUT;
		if (c > 0xa0 && c < 0xe0) {
			w = 0xfec0 + c;
		}
		CK((*filter->output_function)(w, filter->data));
		break;

	case 3: /* got 0x8f, JIS X 0212 first byte */
		filter->status = 4;
		filter->cache = c;
		break;

	case 4: /* got 0x8f, JIS X 0212 second byte */
		filter->status = 0;
		c1 = filter->cache;
		w = MBFL_BAD_INPUT;
		/* The cached first byte is only validated here, together with the second */
		if (c > 0xA0 && c < 0xFF && c1 > 0xA0 && c1 < 0xFF) {
			s = (c1 - 0xa1)*94 + c - 0xa1;
			if (s >= 0 && s < jisx0212_ucs_table_size && jisx0212_ucs_table[s]) {
				w = jisx0212_ucs_table[s];
			}
		}
		CK((*filter->output_function)(w, filter->data));
		break;

		EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
8255 
/* Flush routine for the EUC-JP -> wchar filter.
 *
 * A non-zero status means a multi-byte sequence was still in progress when
 * the input ended; one MBFL_BAD_INPUT marker is emitted for it and the
 * status is reset. Afterwards the downstream flush function is invoked if
 * one is set. Return values of both callbacks are ignored; the function
 * always returns 0. */
static int mbfl_filt_conv_eucjp_wchar_flush(mbfl_convert_filter *filter)
{
	if (filter->status != 0) {
		(*filter->output_function)(MBFL_BAD_INPUT, filter->data);
		filter->status = 0;
	}

	if (filter->flush_function != NULL) {
		(*filter->flush_function)(filter->data);
	}

	return 0;
}
8269 
/* Filter function converting one wchar codepoint `c` to EUC-JP bytes.
 *
 * U+00AF is special-cased first; otherwise the ucs_*_jis tables are
 * consulted, and a few hard-coded codepoints serve as a fallback when the
 * tables yield nothing. The resulting value `s` selects the output form:
 *   s < 0x80     one byte
 *   s < 0x100    0x8E followed by the byte (kana)
 *   s < 0x8080   two bytes, each with the high bit set (JIS X 0208)
 *   otherwise    0x8F followed by two high-bit bytes (JIS X 0212)
 * Unconvertible codepoints go to mbfl_filt_conv_illegal_output().
 * Returns 0; the CK() wrapper handles failures of the output calls. */
static int mbfl_filt_conv_wchar_eucjp(int c, mbfl_convert_filter *filter)
{
	int s = 0;

	if (c == 0xAF) { /* U+00AF is MACRON */
		s = 0xA2B4; /* Use JIS X 0212 overline */
	} else if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
		s = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
	} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
		s = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
	} else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
		s = ucs_i_jis_table[c - ucs_i_jis_table_min];
	} else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
		s = ucs_r_jis_table[c - ucs_r_jis_table_min];
	}

	if (s <= 0) {
		switch (c) {
		case 0xff3c: /* FULLWIDTH REVERSE SOLIDUS */
			s = 0x2140;
			break;
		case 0x2225: /* PARALLEL TO */
			s = 0x2142;
			break;
		case 0xff0d: /* FULLWIDTH HYPHEN-MINUS */
			s = 0x215d;
			break;
		case 0xffe0: /* FULLWIDTH CENT SIGN */
			s = 0x2171;
			break;
		case 0xffe1: /* FULLWIDTH POUND SIGN */
			s = 0x2172;
			break;
		case 0xffe2: /* FULLWIDTH NOT SIGN */
			s = 0x224c;
			break;
		case 0: /* U+0000 converts to a zero byte */
			s = 0;
			break;
		default:
			s = -1;
			break;
		}
	}

	if (s < 0) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
		return 0;
	}

	if (s < 0x80) { /* latin */
		CK((*filter->output_function)(s, filter->data));
	} else if (s < 0x100) { /* kana */
		CK((*filter->output_function)(0x8e, filter->data));
		CK((*filter->output_function)(s, filter->data));
	} else {
		if (s >= 0x8080) { /* X 0212 gets an extra prefix byte; X 0208 does not */
			CK((*filter->output_function)(0x8f, filter->data));
		}
		CK((*filter->output_function)(((s >> 8) & 0xff) | 0x80, filter->data));
		CK((*filter->output_function)((s & 0xff) | 0x80, filter->data));
	}

	return 0;
}
8324 
mb_eucjp_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)8325 static size_t mb_eucjp_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
8326 {
8327 	unsigned char *p = *in, *e = p + *in_len;
8328 	uint32_t *out = buf, *limit = buf + bufsize;
8329 
8330 	while (p < e && out < limit) {
8331 		unsigned char c = *p++;
8332 
8333 		if (c < 0x80) {
8334 			*out++ = c;
8335 		} else if (c >= 0xA1 && c <= 0xFE && p < e) {
8336 			/* JISX 0208 */
8337 			unsigned char c2 = *p++;
8338 			if (c2 >= 0xA1 && c2 <= 0xFE) {
8339 				unsigned int s = (c - 0xA1)*94 + c2 - 0xA1;
8340 				if (s < jisx0208_ucs_table_size) {
8341 					uint32_t w = jisx0208_ucs_table[s];
8342 					if (!w)
8343 						w = MBFL_BAD_INPUT;
8344 					*out++ = w;
8345 				} else {
8346 					*out++ = MBFL_BAD_INPUT;
8347 				}
8348 			} else {
8349 				*out++ = MBFL_BAD_INPUT;
8350 			}
8351 		} else if (c == 0x8E && p < e) {
8352 			/* Kana */
8353 			unsigned char c2 = *p++;
8354 			*out++ = (c2 >= 0xA1 && c2 <= 0xDF) ? 0xFEC0 + c2 : MBFL_BAD_INPUT;
8355 		} else if (c == 0x8F) {
8356 			/* JISX 0212 */
8357 			if ((e - p) >= 2) {
8358 				unsigned char c2 = *p++;
8359 				unsigned char c3 = *p++;
8360 				if (c3 >= 0xA1 && c3 <= 0xFE && c2 >= 0xA1 && c2 <= 0xFE) {
8361 					unsigned int s = (c2 - 0xA1)*94 + c3 - 0xA1;
8362 					if (s < jisx0212_ucs_table_size) {
8363 						uint32_t w = jisx0212_ucs_table[s];
8364 						if (!w)
8365 							w = MBFL_BAD_INPUT;
8366 						*out++ = w;
8367 					} else {
8368 						*out++ = MBFL_BAD_INPUT;
8369 					}
8370 				} else {
8371 					*out++ = MBFL_BAD_INPUT;
8372 				}
8373 			} else {
8374 				*out++ = MBFL_BAD_INPUT;
8375 				p = e; /* Jump to end of string */
8376 			}
8377 		} else {
8378 			*out++ = MBFL_BAD_INPUT;
8379 		}
8380 	}
8381 
8382 	*in_len = e - p;
8383 	*in = p;
8384 	return out - buf;
8385 }
8386 
/* Fast-path encoder: Unicode codepoints -> EUC-JP bytes.
 *
 * Converts `len` codepoints from `in` and appends the encoded bytes to `buf`.
 * Codepoints with no mapping are handed to MB_CONVERT_ERROR.
 * `end` is not used here. */
static void mb_wchar_to_eucjp(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	/* Reserve two bytes per codepoint; three-byte output tops this up below */
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);

	while (len--) {
		uint32_t cp = *in++;
		unsigned int jis = 0;

		/* Primary lookup in the Unicode -> JIS tables */
		if (cp == 0xAF) { /* U+00AF is MACRON */
			jis = 0xA2B4; /* Use JIS X 0212 overline */
		} else if (cp >= ucs_a1_jis_table_min && cp < ucs_a1_jis_table_max) {
			jis = ucs_a1_jis_table[cp - ucs_a1_jis_table_min];
		} else if (cp >= ucs_a2_jis_table_min && cp < ucs_a2_jis_table_max) {
			jis = ucs_a2_jis_table[cp - ucs_a2_jis_table_min];
		} else if (cp >= ucs_i_jis_table_min && cp < ucs_i_jis_table_max) {
			jis = ucs_i_jis_table[cp - ucs_i_jis_table_min];
		} else if (cp >= ucs_r_jis_table_min && cp < ucs_r_jis_table_max) {
			jis = ucs_r_jis_table[cp - ucs_r_jis_table_min];
		}

		if (jis == 0) {
			/* Nothing in the tables; try the hard-coded fallbacks */
			switch (cp) {
			case 0xFF3C: /* FULLWIDTH REVERSE SOLIDUS */
				jis = 0x2140;
				break;
			case 0x2225: /* PARALLEL TO */
				jis = 0x2142;
				break;
			case 0xFF0D: /* FULLWIDTH HYPHEN-MINUS */
				jis = 0x215D;
				break;
			case 0xFFE0: /* FULLWIDTH CENT SIGN */
				jis = 0x2171;
				break;
			case 0xFFE1: /* FULLWIDTH POUND SIGN */
				jis = 0x2172;
				break;
			case 0xFFE2: /* FULLWIDTH NOT SIGN */
				jis = 0x224C;
				break;
			case 0: {
				/* NUL is passed through as a single zero byte */
				out = mb_convert_buf_add(out, 0);
				continue;
			}
			default: {
				MB_CONVERT_ERROR(buf, out, limit, cp, mb_wchar_to_eucjp);
				/* Error handling may have used up space; reserve again for the rest */
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
				continue;
			}
			}
		}

		if (jis < 0x80) {
			/* Single byte */
			out = mb_convert_buf_add(out, jis);
		} else if (jis < 0x100) {
			/* Kana: 0x8E prefix + one byte */
			out = mb_convert_buf_add2(out, 0x8E, jis);
		} else {
			unsigned int hi = ((jis >> 8) & 0xFF) | 0x80;
			unsigned int lo = (jis & 0xFF) | 0x80;

			if (jis < 0x8080) {
				/* JIS X 0208: two bytes */
				out = mb_convert_buf_add2(out, hi, lo);
			} else {
				/* JIS X 0212: 0x8F prefix + two bytes */
				MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 3);
				out = mb_convert_buf_add3(out, 0x8F, hi, lo);
			}
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
8446 
/* Legacy (byte-at-a-time) decoder for eucJP-win.
 *
 * `c` is the next input byte; decoded codepoints are passed to
 * filter->output_function. filter->status tracks progress through a
 * multi-byte character:
 *   0 - between characters
 *   1 - lead byte of a two-byte character is held in filter->cache
 *   2 - 0x8E (kana prefix) seen
 *   3 - 0x8F (JIS X 0212 prefix) seen
 *   4 - 0x8F plus one further byte (held in filter->cache) seen
 * Invalid sequences produce MBFL_BAD_INPUT. Returns 0 when the end of the
 * function is reached (CK() presumably returns early if output fails). */
static int mbfl_filt_conv_eucjpwin_wchar(int c, mbfl_convert_filter *filter)
{
	int lead, idx, w, n;

	switch (filter->status) {
	case 0:
		if (c >= 0 && c < 0x80) {
			/* ASCII passes straight through */
			CK((*filter->output_function)(c, filter->data));
		} else if (c >= 0xa1 && c <= 0xfe) {
			/* First byte of a two-byte character */
			filter->cache = c;
			filter->status = 1;
		} else if (c == 0x8e) {
			/* Kana prefix */
			filter->status = 2;
		} else if (c == 0x8f) {
			/* JIS X 0212 prefix */
			filter->status = 3;
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 1: /* second byte of a two-byte character */
		filter->status = 0;
		lead = filter->cache;
		if (c < 0xa1 || c > 0xfe) {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			break;
		}

		idx = (lead - 0xa1)*94 + c - 0xa1;

		/* A handful of positions are mapped to fullwidth forms */
		switch (idx) {
		case 31:
			w = 0xff3c; /* FULLWIDTH REVERSE SOLIDUS */
			break;
		case 32:
			w = 0xff5e; /* FULLWIDTH TILDE */
			break;
		case 33:
			w = 0x2225; /* PARALLEL TO */
			break;
		case 60:
			w = 0xff0d; /* FULLWIDTH HYPHEN-MINUS */
			break;
		case 80:
			w = 0xffe0; /* FULLWIDTH CENT SIGN */
			break;
		case 81:
			w = 0xffe1; /* FULLWIDTH POUND SIGN */
			break;
		case 137:
			w = 0xffe2; /* FULLWIDTH NOT SIGN */
			break;
		default:
			w = 0;
			break;
		}

		if (w == 0) {
			if (idx >= cp932ext1_ucs_table_min && idx < cp932ext1_ucs_table_max) {
				/* Vendor extension 1 (13ku) */
				w = cp932ext1_ucs_table[idx - cp932ext1_ucs_table_min];
			} else if (idx >= 0 && idx < jisx0208_ucs_table_size) {
				/* JIS X 0208 */
				w = jisx0208_ucs_table[idx];
			} else if (idx >= (84 * 94)) {
				/* User-defined area (85ku - 94ku) */
				w = idx - (84 * 94) + 0xe000;
			}
		}

		if (w <= 0) {
			w = MBFL_BAD_INPUT;
		}
		CK((*filter->output_function)(w, filter->data));
		break;

	case 2: /* byte following 0x8e: X0201 kana */
		filter->status = 0;
		if (c < 0xa1 || c > 0xdf) {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		} else {
			w = 0xfec0 + c;
			CK((*filter->output_function)(w, filter->data));
		}
		break;

	case 3: /* first byte following 0x8f; validated once the next byte arrives */
		filter->cache = c;
		filter->status = 4;
		break;

	case 4: /* second byte following 0x8f */
		filter->status = 0;
		lead = filter->cache;
		if (lead < 0xa1 || lead > 0xfe || c < 0xa1 || c > 0xfe) {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			break;
		}

		idx = (lead - 0xa1)*94 + c - 0xa1;

		if (idx >= 0 && idx < jisx0212_ucs_table_size) {
			w = jisx0212_ucs_table[idx];
			if (w == 0x007e) {
				w = 0xff5e; /* FULLWIDTH TILDE */
			}
		} else if (idx >= (82*94) && idx < (84*94)) {
			/* Vendor extension 3 (83ku - 84ku): look the byte pair up in
			 * cp932ext3_eucjp_table and take the matching Unicode entry */
			const int packed = (lead << 8) | c;

			w = 0;
			for (n = 0; n < cp932ext3_eucjp_table_size; n++) {
				if (packed == cp932ext3_eucjp_table[n]) {
					if (n < (cp932ext3_ucs_table_max - cp932ext3_ucs_table_min)) {
						w = cp932ext3_ucs_table[n];
					}
					break;
				}
			}
		} else if (idx >= (84*94)) {
			/* User-defined area (85ku - 94ku) */
			w = idx - (84*94) + (0xe000 + (94*10));
		} else {
			w = 0;
		}

		if (w == 0x00A6) {
			w = 0xFFE4; /* FULLWIDTH BROKEN BAR */
		}

		if (w <= 0) {
			w = MBFL_BAD_INPUT;
		}
		CK((*filter->output_function)(w, filter->data));
		break;

		EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
8574 
/* Flush handler for the eucJP-win decoding filter.
 *
 * If the decoder is part-way through a character (non-zero status), emits
 * MBFL_BAD_INPUT and resets the status. Then calls filter->flush_function,
 * if one is set. Always returns 0. */
static int mbfl_filt_conv_eucjpwin_wchar_flush(mbfl_convert_filter *filter)
{
	if (filter->status != 0) {
		/* Still waiting for the remaining bytes of a character */
		filter->output_function(MBFL_BAD_INPUT, filter->data);
		filter->status = 0;
	}

	if (!filter->flush_function) {
		return 0;
	}

	filter->flush_function(filter->data);
	return 0;
}
8588 
/* Legacy (codepoint-at-a-time) encoder for eucJP-win.
 *
 * `c` is a Unicode codepoint; the encoded bytes are passed one by one to
 * filter->output_function. Codepoints with no mapping are passed to
 * mbfl_filt_conv_illegal_output. Returns 0 when the end of the function is
 * reached (CK() presumably returns early if output fails). */
static int mbfl_filt_conv_wchar_eucjpwin(int c, mbfl_convert_filter *filter)
{
	int i, n, offset, s1 = 0;

	/* Primary lookup: fixed overrides, conversion tables, user-defined areas */
	if (c == 0xAF) { /* U+00AF is MACRON */
		s1 = 0xA2B4; /* Use JIS X 0212 overline */
	} else if (c == 0x203E) {
		s1 = 0x7E;
	} else if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
		s1 = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
	} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
		s1 = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
	} else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
		s1 = ucs_i_jis_table[c - ucs_i_jis_table_min];
	} else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
		s1 = ucs_r_jis_table[c - ucs_r_jis_table_min];
	} else if (c >= 0xe000 && c < (0xe000 + 10*94)) {
		/* User-defined area mapped onto X0208 85ku - 94ku */
		offset = c - 0xe000;
		s1 = ((offset/94 + 0x75) << 8) | (offset%94 + 0x21);
	} else if (c >= (0xe000 + 10*94) && c < (0xe000 + 20*94)) {
		/* User-defined area mapped onto X0212 85ku - 94ku */
		offset = c - (0xe000 + 10*94);
		s1 = ((offset/94 + 0xf5) << 8) | (offset%94 + 0xa1);
	}

	if (s1 == 0xa2f1) {
		s1 = 0x2d62; /* NUMERO SIGN */
	}

	if (s1 <= 0) {
		/* Nothing found so far; try hard-coded fallbacks, then the vendor tables */
		switch (c) {
		case 0xa5: /* YEN SIGN */
			s1 = 0x5C;
			break;
		case 0x2014:
			s1 = 0x213D;
			break;
		case 0xff3c: /* FULLWIDTH REVERSE SOLIDUS */
			s1 = 0x2140;
			break;
		case 0x2225: /* PARALLEL TO */
			s1 = 0x2142;
			break;
		case 0xff0d: /* FULLWIDTH HYPHEN-MINUS */
			s1 = 0x215d;
			break;
		case 0xffe0: /* FULLWIDTH CENT SIGN */
			s1 = 0x2171;
			break;
		case 0xffe1: /* FULLWIDTH POUND SIGN */
			s1 = 0x2172;
			break;
		case 0xffe2: /* FULLWIDTH NOT SIGN */
			s1 = 0x224c;
			break;
		default: {
			const int ext1_row = cp932ext1_ucs_table_min / 94;

			s1 = -1;

			/* CP932 vendor extension 1 (13ku) */
			n = cp932ext1_ucs_table_max - cp932ext1_ucs_table_min;
			for (i = 0; i < n; i++) {
				if (c == cp932ext1_ucs_table[i]) {
					s1 = ((i / 94 + ext1_row + 0x21) << 8) + (i % 94 + 0x21);
					break;
				}
			}

			if (s1 < 0) {
				/* CP932 vendor extension 3 (115ku - 119ku) */
				n = cp932ext3_ucs_table_max - cp932ext3_ucs_table_min;
				for (i = 0; i < n; i++) {
					if (c == cp932ext3_ucs_table[i]) {
						if (i < cp932ext3_eucjp_table_size) {
							s1 = cp932ext3_eucjp_table[i];
						}
						break;
					}
				}
			}
			break;
		}
		}

		/* NUL always encodes as a zero byte, whatever the searches above found */
		if (c == 0) {
			s1 = 0;
		} else if (s1 <= 0) {
			s1 = -1;
		}
	}

	if (s1 < 0) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
	} else if (s1 < 0x80) {
		/* Single byte */
		CK((*filter->output_function)(s1, filter->data));
	} else if (s1 < 0x100) {
		/* Kana: 0x8e prefix + one byte */
		CK((*filter->output_function)(0x8e, filter->data));
		CK((*filter->output_function)(s1, filter->data));
	} else {
		if (s1 >= 0x8080) {
			/* X 0212 characters carry a 0x8f prefix; X 0208 ones do not */
			CK((*filter->output_function)(0x8f, filter->data));
		}
		CK((*filter->output_function)(((s1 >> 8) & 0xff) | 0x80, filter->data));
		CK((*filter->output_function)((s1 & 0xff) | 0x80, filter->data));
	}

	return 0;
}
8693 
mb_eucjpwin_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)8694 static size_t mb_eucjpwin_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
8695 {
8696 	unsigned char *p = *in, *e = p + *in_len;
8697 	uint32_t *out = buf, *limit = buf + bufsize;
8698 
8699 	while (p < e && out < limit) {
8700 		unsigned char c = *p++;
8701 
8702 		if (c < 0x80) {
8703 			*out++ = c;
8704 		} else if (c >= 0xA1 && c <= 0xFE && p < e) {
8705 			unsigned char c2 = *p++;
8706 
8707 			if (c2 >= 0xA1 && c2 <= 0xFE) {
8708 				unsigned int s = (c - 0xA1)*94 + c2 - 0xA1, w = 0;
8709 
8710 				if (s <= 137) {
8711 					if (s == 31) {
8712 						w = 0xFF3C; /* FULLWIDTH REVERSE SOLIDUS */
8713 					} else if (s == 32) {
8714 						w = 0xFF5E; /* FULLWIDTH TILDE */
8715 					} else if (s == 33) {
8716 						w = 0x2225; /* PARALLEL TO */
8717 					} else if (s == 60) {
8718 						w = 0xFF0D; /* FULLWIDTH HYPHEN-MINUS */
8719 					} else if (s == 80) {
8720 						w = 0xFFE0; /* FULLWIDTH CENT SIGN */
8721 					} else if (s == 81) {
8722 						w = 0xFFE1; /* FULLWIDTH POUND SIGN */
8723 					} else if (s == 137) {
8724 						w = 0xFFE2; /* FULLWIDTH NOT SIGN */
8725 					}
8726 				}
8727 
8728 				if (w == 0) {
8729 					if (s >= cp932ext1_ucs_table_min && s < cp932ext1_ucs_table_max) {
8730 						w = cp932ext1_ucs_table[s - cp932ext1_ucs_table_min];
8731 					} else if (s < jisx0208_ucs_table_size) {
8732 						w = jisx0208_ucs_table[s];
8733 					} else if (s >= (84 * 94)) {
8734 						w = s - (84 * 94) + 0xE000;
8735 					}
8736 				}
8737 
8738 				if (!w)
8739 					w = MBFL_BAD_INPUT;
8740 				*out++ = w;
8741 			} else {
8742 				*out++ = MBFL_BAD_INPUT;
8743 			}
8744 		} else if (c == 0x8E && p < e) {
8745 			unsigned char c2 = *p++;
8746 			if (c2 >= 0xA1 && c2 <= 0xDF) {
8747 				*out++ = 0xFEC0 + c2;
8748 			} else {
8749 				*out++ = MBFL_BAD_INPUT;
8750 			}
8751 		} else if (c == 0x8F && p < e) {
8752 			unsigned char c2 = *p++;
8753 			if (p == e) {
8754 				*out++ = MBFL_BAD_INPUT;
8755 				continue;
8756 			}
8757 			unsigned char c3 = *p++;
8758 
8759 			if (c2 >= 0xA1 && c2 <= 0xFE && c3 >= 0xA1 && c3 <= 0xFE) {
8760 				unsigned int s = (c2 - 0xA1)*94 + c3 - 0xA1, w = 0;
8761 
8762 				if (s < jisx0212_ucs_table_size) {
8763 					w = jisx0212_ucs_table[s];
8764 					if (w == 0x7E)
8765 						w = 0xFF5E; /* FULLWIDTH TILDE */
8766 				} else if (s >= (82*94) && s < (84*94)) {
8767 					s = (c2 << 8) | c3;
8768 					for (int i = 0; i < cp932ext3_eucjp_table_size; i++) {
8769 						if (cp932ext3_eucjp_table[i] == s) {
8770 							w = cp932ext3_ucs_table[i];
8771 							break;
8772 						}
8773 					}
8774 				} else if (s >= (84*94)) {
8775 					w = s - (84*94) + 0xE000 + (94*10);
8776 				}
8777 
8778 				if (w == 0xA6)
8779 					w = 0xFFE4; /* FULLWIDTH BROKEN BAR */
8780 
8781 				if (!w)
8782 					w = MBFL_BAD_INPUT;
8783 				*out++ = w;
8784 			} else {
8785 				*out++ = MBFL_BAD_INPUT;
8786 			}
8787 		} else {
8788 			*out++ = MBFL_BAD_INPUT;
8789 		}
8790 	}
8791 
8792 	*in_len = e - p;
8793 	*in = p;
8794 	return out - buf;
8795 }
8796 
/* Fast-path encoder: Unicode codepoints -> eucJP-win bytes.
 *
 * Converts `len` codepoints from `in` and appends the encoded bytes to `buf`.
 * Codepoints with no mapping are handed to MB_CONVERT_ERROR.
 * `end` is not used here. */
static void mb_wchar_to_eucjpwin(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	/* Reserve two bytes per codepoint; three-byte output tops this up below */
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);

	while (len--) {
		uint32_t w = *in++;
		unsigned int s = 0;

		if (w == 0) {
			/* NUL is passed through as a single zero byte */
			out = mb_convert_buf_add(out, 0);
			continue;
		} else if (w == 0xAF) { /* U+00AF is MACRON */
			s = 0xA2B4; /* Use JIS X 0212 overline */
		} else if (w == 0x203E) {
			s = 0x7E;
		} else if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) {
			s = ucs_a1_jis_table[w - ucs_a1_jis_table_min];
		} else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) {
			s = ucs_a2_jis_table[w - ucs_a2_jis_table_min];
		} else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) {
			s = ucs_i_jis_table[w - ucs_i_jis_table_min];
		} else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) {
			s = ucs_r_jis_table[w - ucs_r_jis_table_min];
		} else if (w >= 0xE000 && w < (0xE000 + 10*94)) {
			/* User-defined area mapped onto X0208 85ku - 94ku */
			s = w - 0xE000;
			s = ((s/94 + 0x75) << 8) + (s%94) + 0x21;
		} else if (w >= (0xE000 + 10*94) && w < (0xE000 + 20*94)) {
			/* User-defined area mapped onto X0212 85ku - 94ku */
			s = w - (0xE000 + 10*94);
			s = ((s/94 + 0xF5) << 8) + (s%94) + 0xA1;
		}

		if (s == 0xA2F1)
			s = 0x2D62; /* NUMERO SIGN */

		if (s == 0) {
			/* Nothing found so far; try hard-coded fallbacks, then the vendor tables */
			if (w == 0xA5) { /* YEN SIGN */
				s = 0x5C;
			} else if (w == 0x2014) { /* EM DASH */
				s = 0x213D;
			} else if (w == 0xFF3C) { /* FULLWIDTH REVERSE SOLIDUS */
				s = 0x2140;
			} else if (w == 0x2225) { /* PARALLEL TO */
				s = 0x2142;
			} else if (w == 0xFF0D) { /* FULLWIDTH HYPHEN-MINUS */
				s = 0x215D;
			} else if (w == 0xFFE0) { /* FULLWIDTH CENT SIGN */
				s = 0x2171;
			} else if (w == 0xFFE1) { /* FULLWIDTH POUND SIGN */
				s = 0x2172;
			} else if (w == 0xFFE2) { /* FULLWIDTH NOT SIGN */
				s = 0x224C;
			} else {
				/* CP932 vendor extension 1 */
				for (int i = 0; i < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; i++) {
					if (cp932ext1_ucs_table[i] == w) {
						s = (((i/94) + (cp932ext1_ucs_table_min/94) + 0x21) << 8) + (i%94) + 0x21;
						break;
					}
				}

				if (!s) {
					/* CP932 vendor extension 3 */
					for (int i = 0; i < cp932ext3_ucs_table_max - cp932ext3_ucs_table_min; i++) {
						if (cp932ext3_ucs_table[i] == w) {
							/* Same guard as mbfl_filt_conv_wchar_eucjpwin: `i` is only
							 * known to be in range for cp932ext3_ucs_table, so do not
							 * index past the end of cp932ext3_eucjp_table */
							if (i < cp932ext3_eucjp_table_size) {
								s = cp932ext3_eucjp_table[i];
							}
							break;
						}
					}
				}
			}
		}

		if (!s) {
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_eucjpwin);
			/* Error handling may have used up space; reserve again for the rest */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
		} else if (s < 0x80) {
			/* Single byte */
			out = mb_convert_buf_add(out, s);
		} else if (s < 0x100) {
			/* Kana: 0x8E prefix + one byte */
			out = mb_convert_buf_add2(out, 0x8E, s);
		} else if (s < 0x8080) {
			/* JIS X 0208: two bytes */
			out = mb_convert_buf_add2(out, ((s >> 8) & 0xFF) | 0x80, (s & 0xFF) | 0x80);
		} else {
			/* JIS X 0212: 0x8F prefix + two bytes */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 3);
			out = mb_convert_buf_add3(out, 0x8F, ((s >> 8) & 0xFF) | 0x80, (s & 0xFF) | 0x80);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
8886 
/* Legacy (byte-at-a-time) decoder for CP51932.
 *
 * `c` is the next input byte; decoded codepoints are passed to
 * filter->output_function. filter->status tracks progress through a
 * multi-byte character:
 *   0 - between characters
 *   1 - lead byte of a two-byte character is held in filter->cache
 *   2 - 0x8E (kana prefix) seen
 * Invalid sequences produce MBFL_BAD_INPUT. Returns 0 when the end of the
 * function is reached (CK() presumably returns early if output fails). */
static int mbfl_filt_conv_cp51932_wchar(int c, mbfl_convert_filter *filter)
{
	int lead, idx, w;

	switch (filter->status) {
	case 0:
		if (c >= 0 && c < 0x80) {
			/* ASCII passes straight through */
			CK((*filter->output_function)(c, filter->data));
		} else if (c >= 0xA1 && c <= 0xFE) {
			/* First byte of a two-byte character */
			filter->cache = c;
			filter->status = 1;
		} else if (c == 0x8e) {
			/* Kana prefix */
			filter->status = 2;
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 1: /* second byte of a two-byte character */
		filter->status = 0;
		lead = filter->cache;
		if (c < 0xa1 || c > 0xfe) {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			break;
		}

		idx = (lead - 0xa1)*94 + c - 0xa1;

		/* A handful of positions are mapped to fullwidth forms */
		switch (idx) {
		case 31:
			w = 0xff3c; /* FULLWIDTH REVERSE SOLIDUS */
			break;
		case 32:
			w = 0xff5e; /* FULLWIDTH TILDE */
			break;
		case 33:
			w = 0x2225; /* PARALLEL TO */
			break;
		case 60:
			w = 0xff0d; /* FULLWIDTH HYPHEN-MINUS */
			break;
		case 80:
			w = 0xffe0; /* FULLWIDTH CENT SIGN */
			break;
		case 81:
			w = 0xffe1; /* FULLWIDTH POUND SIGN */
			break;
		case 137:
			w = 0xffe2; /* FULLWIDTH NOT SIGN */
			break;
		default:
			w = 0;
			break;
		}

		if (w == 0) {
			if (idx >= cp932ext1_ucs_table_min && idx < cp932ext1_ucs_table_max) {
				/* Vendor extension 1 (13ku) */
				w = cp932ext1_ucs_table[idx - cp932ext1_ucs_table_min];
			} else if (idx >= 0 && idx < jisx0208_ucs_table_size) {
				/* JIS X 0208 */
				w = jisx0208_ucs_table[idx];
			} else if (idx >= cp932ext2_ucs_table_min && idx < cp932ext2_ucs_table_max) {
				/* Vendor extension 2 (89ku - 92ku) */
				w = cp932ext2_ucs_table[idx - cp932ext2_ucs_table_min];
			}
		}

		if (w <= 0) {
			w = MBFL_BAD_INPUT;
		}
		CK((*filter->output_function)(w, filter->data));
		break;

	case 2: /* byte following 0x8e: X0201 kana */
		filter->status = 0;
		if (c < 0xa1 || c > 0xdf) {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		} else {
			w = 0xfec0 + c;
			CK((*filter->output_function)(w, filter->data));
		}
		break;

		EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
8961 
/* Flush handler for the CP51932 decoding filter.
 *
 * If the decoder is part-way through a character (non-zero status), emits
 * MBFL_BAD_INPUT and resets the status. Then calls filter->flush_function,
 * if one is set. Always returns 0. */
static int mbfl_filt_conv_cp51932_wchar_flush(mbfl_convert_filter *filter)
{
	if (filter->status != 0) {
		/* The input ended in the middle of a character */
		filter->output_function(MBFL_BAD_INPUT, filter->data);
		filter->status = 0;
	}

	if (!filter->flush_function) {
		return 0;
	}

	filter->flush_function(filter->data);
	return 0;
}
8976 
/* Legacy (codepoint-at-a-time) encoder for CP51932.
 *
 * `c` is a Unicode codepoint; the encoded bytes are passed one by one to
 * filter->output_function. Codepoints with no mapping are passed to
 * mbfl_filt_conv_illegal_output. Returns 0 when the end of the function is
 * reached (CK() presumably returns early if output fails). */
static int mbfl_filt_conv_wchar_cp51932(int c, mbfl_convert_filter *filter)
{
	int i, n, s1 = 0;

	/* Primary lookup in the Unicode -> JIS tables */
	if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
		s1 = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
	} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
		s1 = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
	} else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
		s1 = ucs_i_jis_table[c - ucs_i_jis_table_min];
	} else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
		s1 = ucs_r_jis_table[c - ucs_r_jis_table_min];
	}

	if (s1 >= 0x8080) {
		s1 = -1; /* we don't support JIS X0213 */
	}

	if (s1 <= 0) {
		/* Nothing usable so far; try hard-coded fallbacks, then the vendor tables */
		switch (c) {
		case 0xa5: /* YEN SIGN */
			s1 = 0x216F; /* FULLWIDTH YEN SIGN */
			break;
		case 0xff3c: /* FULLWIDTH REVERSE SOLIDUS */
			s1 = 0x2140;
			break;
		case 0x2225: /* PARALLEL TO */
			s1 = 0x2142;
			break;
		case 0xff0d: /* FULLWIDTH HYPHEN-MINUS */
			s1 = 0x215d;
			break;
		case 0xffe0: /* FULLWIDTH CENT SIGN */
			s1 = 0x2171;
			break;
		case 0xffe1: /* FULLWIDTH POUND SIGN */
			s1 = 0x2172;
			break;
		case 0xffe2: /* FULLWIDTH NOT SIGN */
			s1 = 0x224c;
			break;
		default:
			s1 = -1;

			/* CP932 vendor extension 1 (13ku) */
			n = cp932ext1_ucs_table_max - cp932ext1_ucs_table_min;
			for (i = 0; i < n; i++) {
				if (c == cp932ext1_ucs_table[i]) {
					s1 = ((i/94 + 0x2d) << 8) + (i%94 + 0x21);
					break;
				}
			}

			if (s1 < 0) {
				/* CP932 vendor extension 2 (cp932ext2_ucs_table) */
				n = cp932ext2_ucs_table_max - cp932ext2_ucs_table_min;
				for (i = 0; i < n; i++) {
					if (c == cp932ext2_ucs_table[i]) {
						s1 = ((i/94 + 0x79) << 8) + (i%94 + 0x21);
						break;
					}
				}
			}
			break;
		}

		/* NUL always encodes as a zero byte, whatever the searches above found */
		if (c == 0) {
			s1 = 0;
		} else if (s1 <= 0) {
			s1 = -1;
		}
	}

	if (s1 < 0 || s1 >= 0x8080) {
		/* No mapping, or a value outside the one- and two-byte forms */
		CK(mbfl_filt_conv_illegal_output(c, filter));
	} else if (s1 < 0x80) {
		/* Single byte */
		CK((*filter->output_function)(s1, filter->data));
	} else if (s1 < 0x100) {
		/* Kana: 0x8e prefix + one byte */
		CK((*filter->output_function)(0x8e, filter->data));
		CK((*filter->output_function)(s1, filter->data));
	} else {
		/* X 0208: two bytes */
		CK((*filter->output_function)(((s1 >> 8) & 0xff) | 0x80, filter->data));
		CK((*filter->output_function)((s1 & 0xff) | 0x80, filter->data));
	}

	return 0;
}
9055 
mb_cp51932_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)9056 static size_t mb_cp51932_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
9057 {
9058 	unsigned char *p = *in, *e = p + *in_len;
9059 	uint32_t *out = buf, *limit = buf + bufsize;
9060 
9061 	while (p < e && out < limit) {
9062 		unsigned char c = *p++;
9063 
9064 		if (c < 0x80) {
9065 			*out++ = c;
9066 		} else if (c >= 0xA1 && c <= 0xFE && p < e) {
9067 			unsigned char c2 = *p++;
9068 			if (c2 >= 0xA1 && c2 <= 0xFE) {
9069 				unsigned int s = (c - 0xA1)*94 + c2 - 0xA1, w = 0;
9070 
9071 				if (s <= 137) {
9072 					if (s == 31) {
9073 						w = 0xFF3C; /* FULLWIDTH REVERSE SOLIDUS */
9074 					} else if (s == 32) {
9075 						w = 0xFF5E; /* FULLWIDTH TILDE */
9076 					} else if (s == 33) {
9077 						w = 0x2225; /* PARALLEL TO */
9078 					} else if (s == 60) {
9079 						w = 0xFF0D; /* FULLWIDTH HYPHEN-MINUS */
9080 					} else if (s == 80) {
9081 						w = 0xFFE0; /* FULLWIDTH CENT SIGN */
9082 					} else if (s == 81) {
9083 						w = 0xFFE1; /* FULLWIDTH POUND SIGN */
9084 					} else if (s == 137) {
9085 						w = 0xFFE2; /* FULLWIDTH NOT SIGN */
9086 					}
9087 				}
9088 
9089 				if (w == 0) {
9090 					if (s >= cp932ext1_ucs_table_min && s < cp932ext1_ucs_table_max) {
9091 						w = cp932ext1_ucs_table[s - cp932ext1_ucs_table_min];
9092 					} else if (s < jisx0208_ucs_table_size) {
9093 						w = jisx0208_ucs_table[s];
9094 					} else if (s >= cp932ext2_ucs_table_min && s < cp932ext2_ucs_table_max) {
9095 						w = cp932ext2_ucs_table[s - cp932ext2_ucs_table_min];
9096 					}
9097 				}
9098 
9099 				if (!w)
9100 					w = MBFL_BAD_INPUT;
9101 				*out++ = w;
9102 			} else {
9103 				*out++ = MBFL_BAD_INPUT;
9104 			}
9105 		} else if (c == 0x8E && p < e) {
9106 			unsigned char c2 = *p++;
9107 			if (c2 >= 0xA1 && c2 <= 0xDF) {
9108 				*out++ = 0xFEC0 + c2;
9109 			} else {
9110 				*out++ = MBFL_BAD_INPUT;
9111 			}
9112 		} else {
9113 			*out++ = MBFL_BAD_INPUT;
9114 		}
9115 	}
9116 
9117 	*in_len = e - p;
9118 	*in = p;
9119 	return out - buf;
9120 }
9121 
/* Fast-path encoder: Unicode codepoints -> CP51932 bytes.
 *
 * Converts `len` codepoints from `in` and appends the encoded bytes to `buf`.
 * Output is at most two bytes per codepoint. Codepoints with no mapping are
 * handed to MB_CONVERT_ERROR. `end` is not used here. */
static void mb_wchar_to_cp51932(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);

	while (len--) {
		uint32_t cp = *in++;
		unsigned int jis = 0;

		if (cp == 0) {
			/* NUL is passed through as a single zero byte */
			out = mb_convert_buf_add(out, 0);
			continue;
		}

		/* Primary lookup in the Unicode -> JIS tables */
		if (cp >= ucs_a1_jis_table_min && cp < ucs_a1_jis_table_max) {
			jis = ucs_a1_jis_table[cp - ucs_a1_jis_table_min];
		} else if (cp >= ucs_a2_jis_table_min && cp < ucs_a2_jis_table_max) {
			jis = ucs_a2_jis_table[cp - ucs_a2_jis_table_min];
		} else if (cp >= ucs_i_jis_table_min && cp < ucs_i_jis_table_max) {
			jis = ucs_i_jis_table[cp - ucs_i_jis_table_min];
		} else if (cp >= ucs_r_jis_table_min && cp < ucs_r_jis_table_max) {
			jis = ucs_r_jis_table[cp - ucs_r_jis_table_min];
		}

		if (jis >= 0x8080) {
			jis = 0; /* We don't support JIS X0213 */
		}

		if (jis == 0) {
			/* Nothing usable so far; try hard-coded fallbacks, then the vendor tables */
			switch (cp) {
			case 0xA5: /* YEN SIGN */
				jis = 0x216F; /* FULLWIDTH YEN SIGN */
				break;
			case 0xFF3C: /* FULLWIDTH REVERSE SOLIDUS */
				jis = 0x2140;
				break;
			case 0x2225: /* PARALLEL TO */
				jis = 0x2142;
				break;
			case 0xFF0D: /* FULLWIDTH HYPHEN-MINUS */
				jis = 0x215D;
				break;
			case 0xFFE0: /* FULLWIDTH CENT SIGN */
				jis = 0x2171;
				break;
			case 0xFFE1: /* FULLWIDTH POUND SIGN */
				jis = 0x2172;
				break;
			case 0xFFE2: /* FULLWIDTH NOT SIGN */
				jis = 0x224C;
				break;
			default:
				/* CP932 vendor extension 1 */
				for (int i = 0; i < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; i++) {
					if (cp932ext1_ucs_table[i] == cp) {
						jis = ((i/94 + 0x2D) << 8) + (i%94) + 0x21;
						break;
					}
				}

				/* CP932 vendor extension 2, only if extension 1 had no match
				 * (a match always yields a non-zero value) */
				if (!jis) {
					for (int i = 0; i < cp932ext2_ucs_table_max - cp932ext2_ucs_table_min; i++) {
						if (cp932ext2_ucs_table[i] == cp) {
							jis = ((i/94 + 0x79) << 8) + (i%94) + 0x21;
							break;
						}
					}
				}
				break;
			}
		}

		if (!jis || jis >= 0x8080) {
			MB_CONVERT_ERROR(buf, out, limit, cp, mb_wchar_to_cp51932);
			/* Error handling may have used up space; reserve again for the rest */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
		} else if (jis < 0x80) {
			/* Single byte */
			out = mb_convert_buf_add(out, jis);
		} else if (jis < 0x100) {
			/* Kana: 0x8E prefix + one byte */
			out = mb_convert_buf_add2(out, 0x8E, jis);
		} else {
			/* Two bytes, both with the high bit set */
			out = mb_convert_buf_add2(out, ((jis >> 8) & 0xFF) | 0x80, (jis & 0xFF) | 0x80);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
9194 
mb_eucjp2004_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)9195 static size_t mb_eucjp2004_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
9196 {
9197 	unsigned char *p = *in, *e = p + *in_len;
9198 	uint32_t *out = buf, *limit = buf + bufsize - 1;
9199 
9200 	while (p < e && out < limit) {
9201 		unsigned char c = *p++;
9202 
9203 		if (c <= 0x7F) {
9204 			*out++ = c;
9205 		} else if (c >= 0xA1 && c <= 0xFE) {
9206 			/* Kanji */
9207 			if (p == e) {
9208 				*out++ = MBFL_BAD_INPUT;
9209 				break;
9210 			}
9211 			unsigned char c2 = *p++;
9212 			if (c2 <= 0xA0 || c2 == 0xFF) {
9213 				*out++ = MBFL_BAD_INPUT;
9214 				continue;
9215 			}
9216 
9217 			unsigned int s1 = c - 0x80, s2 = c2 - 0x80;
9218 			unsigned int w1 = (s1 << 8) | s2, w = 0;
9219 
9220 			/* Conversion for combining characters */
9221 			if ((w1 >= 0x2477 && w1 <= 0x2479) || (w1 >= 0x2479 && w1 <= 0x247B) || (w1 >= 0x2577 && w1 <= 0x257E) || w1 == 0x2678 || w1 == 0x2B44 || (w1 >= 0x2B48 && w1 <= 0x2B4F) || (w1 >= 0x2B65 && w1 <= 0x2B66)) {
9222 				int k = mbfl_bisec_srch2(w1, jisx0213_u2_key, jisx0213_u2_tbl_len);
9223 				if (k >= 0) {
9224 					*out++ = jisx0213_u2_tbl[2*k];
9225 					*out++ = jisx0213_u2_tbl[2*k+1];
9226 					continue;
9227 				}
9228 			}
9229 
9230 			/* Conversion for BMP  */
9231 			w1 = (s1 - 0x21)*94 + s2 - 0x21;
9232 			if (w1 < jisx0213_ucs_table_size) {
9233 				w = jisx0213_ucs_table[w1];
9234 			}
9235 
9236 			/* Conversion for CJK Unified Ideographs ext.B (U+2XXXX) */
9237 			if (!w) {
9238 				int k = mbfl_bisec_srch2(w1, jisx0213_jis_u5_key, jisx0213_u5_tbl_len);
9239 				if (k >= 0) {
9240 					w = jisx0213_jis_u5_tbl[k] + 0x20000;
9241 				}
9242 			}
9243 
9244 			*out++ = w ? w : MBFL_BAD_INPUT;
9245 		} else if (c == 0x8E && p < e) {
9246 			/* Kana */
9247 			unsigned char c2 = *p++;
9248 			if (c2 >= 0xA1 && c2 <= 0xDF) {
9249 				*out++ = 0xFEC0 + c2;
9250 			} else {
9251 				*out++ = MBFL_BAD_INPUT;
9252 			}
9253 		} else if (c == 0x8F && p < e) {
9254 			unsigned char c2 = *p++;
9255 			if ((c2 == 0xA1 || (c2 >= 0xA3 && c2 <= 0xA5) || c2 == 0xA8 || (c2 >= 0xAC && c2 <= 0xAF) || (c2 >= 0xEE && c2 <= 0xFE)) && p < e) {
9256 				unsigned char c3 = *p++;
9257 
9258 				if (c3 < 0xA1 || c3 == 0xFF) {
9259 					*out++ = MBFL_BAD_INPUT;
9260 					continue;
9261 				}
9262 
9263 				unsigned int s1 = c2 - 0xA1, s2 = c3 - 0xA1;
9264 
9265 				if (((s1 <= 4 && s1 != 1) || s1 == 7 || (s1 >= 11 && s1 <= 14) || (s1 >= 77 && s1 < 94)) && s2 < 94) {
9266 					int k;
9267 					for (k = 0; k < jisx0213_p2_ofst_len; k++) {
9268 						if (s1 == jisx0213_p2_ofst[k]) {
9269 							break;
9270 						}
9271 					}
9272 					k -= jisx0213_p2_ofst[k];
9273 
9274 					/* Check for Japanese chars in BMP */
9275 					unsigned int s = (s1 + 94 + k)*94 + s2;
9276 					ZEND_ASSERT(s < jisx0213_ucs_table_size);
9277 					unsigned int w = jisx0213_ucs_table[s];
9278 
9279 					/* Check for Japanese chars in CJK Unified Ideographs ext B (U+2XXXX) */
9280 					if (!w) {
9281 						k = mbfl_bisec_srch2(s, jisx0213_jis_u5_key, jisx0213_u5_tbl_len);
9282 						if (k >= 0) {
9283 							w = jisx0213_jis_u5_tbl[k] + 0x20000;
9284 						}
9285 					}
9286 
9287 					*out++ = w ? w : MBFL_BAD_INPUT;
9288 				} else {
9289 					*out++ = MBFL_BAD_INPUT;
9290 				}
9291 			} else {
9292 				*out++ = MBFL_BAD_INPUT;
9293 			}
9294 		} else {
9295 			*out++ = MBFL_BAD_INPUT;
9296 		}
9297 	}
9298 
9299 	*in_len = e - p;
9300 	*in = p;
9301 	return out - buf;
9302 }
9303 
/* Encode `len` Unicode codepoints from `in` as EUC-JP-2004 and append the bytes to `buf`.
 *
 * Some codepoints can be the first half of a two-codepoint sequence which maps to a single
 * character (the pairs are listed in jisx0213_u2_tbl). If such a codepoint is the last one
 * in `in` and `end` is false, it is saved in `buf->state` and processed first on the next
 * call. Codepoints for which no mapping is found are passed to MB_CONVERT_ERROR. */
static void mb_wchar_to_eucjp2004(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	uint32_t w;
	if (buf->state) {
		/* Resume with the codepoint which was held back at the end of the previous call */
		w = buf->state;
		buf->state = 0;
		goto process_codepoint;
	}

	while (len--) {
		w = *in++;
process_codepoint: ;
		/* `s` stays 0 until one of the lookups below finds a mapping for `w` */
		unsigned int s = 0;

		/* Check for 1st char of combining characters */
		if (w == 0xE6 || (w >= 0x254 && w <= 0x2E9) || (w >= 0x304B && w <= 0x3053) || (w >= 0x30AB && w <= 0x30C8) || w == 0x31F7) {
			for (int k = 0; k < jisx0213_u2_tbl_len; k++) {
				if (w == jisx0213_u2_tbl[2*k]) {
					if (!len) {
						/* No following codepoint is available in this chunk of input */
						if (!end) {
							/* More input may follow; keep `w` for the next call */
							buf->state = w;
							MB_CONVERT_BUF_STORE(buf, out, limit);
							return;
						}
					} else {
						/* Peek at the following codepoint to see if it completes a pair */
						uint32_t w2 = *in++; len--;
						if ((w == 0x254 || w == 0x28C || w == 0x259 || w == 0x25A) && w2 == 0x301) {
							/* For these four first codepoints, a following U+0301 selects the next table entry */
							k++;
						}
						if (w2 == jisx0213_u2_tbl[2*k+1]) {
							s = jisx0213_u2_key[k];
							break;
						}
						/* Not a pair; push the second codepoint back so it is converted by itself */
						in--; len++;
					}

					/* Fallback */
					s = jisx0213_u2_fb_tbl[k];
					break;
				}
			}
		}

		/* Check for major Japanese chars: U+4E00-U+9FFF */
		if (!s) {
			for (int k = 0; k < uni2jis_tbl_len; k++) {
				if (w >= uni2jis_tbl_range[k][0] && w <= uni2jis_tbl_range[k][1]) {
					s = uni2jis_tbl[k][w - uni2jis_tbl_range[k][0]];
					break;
				}
			}
		}

		/* Check for Japanese chars in compressed mapping area: U+1E00-U+4DBF */
		if (!s && w >= ucs_c1_jisx0213_min && w <= ucs_c1_jisx0213_max) {
			int k = mbfl_bisec_srch(w, ucs_c1_jisx0213_tbl, ucs_c1_jisx0213_tbl_len);
			if (k >= 0) {
				/* Entry k is an inclusive range; add the offset of `w` within that range */
				s = ucs_c1_jisx0213_ofst[k] + w - ucs_c1_jisx0213_tbl[2*k];
			}
		}

		/* Check for Japanese chars in CJK Unified Ideographs ext.B (U+2XXXX) */
		if (!s && w >= jisx0213_u5_tbl_min && w <= jisx0213_u5_tbl_max) {
			int k = mbfl_bisec_srch2(w - 0x20000, jisx0213_u5_jis_key, jisx0213_u5_tbl_len);
			if (k >= 0) {
				s = jisx0213_u5_jis_tbl[k];
			}
		}

		if (!s) {
			/* CJK Compatibility Forms: U+FE30-U+FE4F */
			if (w == 0xFE45) {
				s = 0x233E;
			} else if (w == 0xFE46) {
				s = 0x233D;
			} else if (w >= 0xF91D && w <= 0xF9DC) {
				/* CJK Compatibility Ideographs: U+F900-U+F92A */
				int k = mbfl_bisec_srch2(w, ucs_r2b_jisx0213_cmap_key, ucs_r2b_jisx0213_cmap_len);
				if (k >= 0) {
					s = ucs_r2b_jisx0213_cmap_val[k];
				}
			}
		}

		/* The magnitude of `s` selects the output form:
		 *   0 (and w != 0)   -> no mapping; report an error
		 *   0x00..0x7F       -> one byte
		 *   0x80..0xFF       -> 0x8E followed by `s`
		 *   0x100..0x7EFF    -> two bytes, each with 0x80 added
		 *   anything larger  -> 0x8F followed by two bytes with the high bit set */
		if (!s && w) {
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_eucjp2004);
			/* Re-check available space for the codepoints which remain */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		} else if (s <= 0x7F) {
			out = mb_convert_buf_add(out, s);
		} else if (s <= 0xFF) {
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			out = mb_convert_buf_add2(out, 0x8E, s);
		} else if (s <= 0x7EFF) {
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			out = mb_convert_buf_add2(out, ((s >> 8) & 0xFF) + 0x80, (s & 0xFF) + 0x80);
		} else {
			/* The high byte of `s` (minus 0x7F) indexes jisx0213_p2_ofst, which supplies the
			 * actual first-byte value; presumably these are JIS X 0213 plane 2 rows -- confirm
			 * against the table definition */
			unsigned int s2 = s & 0xFF;
			int k = ((s >> 8) & 0xFF) - 0x7F;
			ZEND_ASSERT(k < jisx0213_p2_ofst_len);
			s = jisx0213_p2_ofst[k] + 0x21;
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 3);
			out = mb_convert_buf_add3(out, 0x8F, s | 0x80, s2 | 0x80);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
9415 
/* Streaming EUC-CN decoder: takes one input byte per call in `c` and forwards each decoded
 * codepoint (or MBFL_BAD_INPUT) to filter->output_function.
 * filter->status is 1 while a lead byte is held in filter->cache, and 0 otherwise. */
static int mbfl_filt_conv_euccn_wchar(int c, mbfl_convert_filter *filter)
{
	switch (filter->status) {
	case 0: {
		int is_lead_byte = (c >= 0xA1 && c <= 0xA9) || (c >= 0xB0 && c <= 0xF7);

		if (c >= 0 && c < 0x80) {
			/* Single-byte character; passed through unchanged */
			CK((*filter->output_function)(c, filter->data));
		} else if (is_lead_byte) {
			/* Hold on to the lead byte until the trail byte arrives */
			filter->cache = c;
			filter->status = 1;
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;
	}

	case 1: {
		/* Trail byte of a 2-byte character */
		int lead = filter->cache;
		int wchar = 0;

		filter->status = 0;

		if (c > 0xA0 && c < 0xFF) {
			int idx = (lead - 0x81)*192 + c - 0x40;
			ZEND_ASSERT(idx < cp936_ucs_table_size);

			if (idx == 0x1864) {
				wchar = 0x30FB;
			} else if (idx == 0x186A) {
				wchar = 0x2015;
			} else if (!((idx >= 0x1921 && idx <= 0x192A) || idx == 0x1963 || (idx >= 0x1C59 && idx <= 0x1C7E) || (idx >= 0x1DBB && idx <= 0x1DC4))) {
				/* Table slots in the ranges tested above are not used for EUC-CN */
				wchar = cp936_ucs_table[idx];
			}
		}

		/* Either the trail byte was invalid, or there is no mapping for this byte pair */
		if (wchar <= 0) {
			wchar = MBFL_BAD_INPUT;
		}
		CK((*filter->output_function)(wchar, filter->data));
		break;
	}

		EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
9463 
/* Streaming EUC-CN encoder: takes one codepoint per call in `c` and sends the encoded byte(s)
 * to filter->output_function. Codepoints which cannot be encoded are handed to
 * mbfl_filt_conv_illegal_output. */
static int mbfl_filt_conv_wchar_euccn(int c, mbfl_convert_filter *filter)
{
	int code = 0;

	/* Look the codepoint up in whichever CP936 table covers it */
	if (c >= ucs_a1_cp936_table_min && c < ucs_a1_cp936_table_max) {
		/* These five codepoints are skipped (left as 0 = no mapping) */
		if (c != 0xB7 && c != 0x144 && c != 0x148 && c != 0x251 && c != 0x261) {
			code = ucs_a1_cp936_table[c - ucs_a1_cp936_table_min];
		}
	} else if (c >= ucs_a2_cp936_table_min && c < ucs_a2_cp936_table_max) {
		if (c == 0x2015) {
			code = 0xA1AA;
		} else if (c != 0x2014 && (c < 0x2170 || c > 0x2179)) {
			code = ucs_a2_cp936_table[c - ucs_a2_cp936_table_min];
		}
	} else if (c >= ucs_a3_cp936_table_min && c < ucs_a3_cp936_table_max) {
		code = (c == 0x30FB) ? 0xA1A4 : ucs_a3_cp936_table[c - ucs_a3_cp936_table_min];
	} else if (c >= ucs_i_cp936_table_min && c < ucs_i_cp936_table_max) {
		code = ucs_i_cp936_table[c - ucs_i_cp936_table_min];
	} else if (c >= ucs_hff_cp936_table_min && c < ucs_hff_cp936_table_max) {
		if (c == 0xFF04) {
			code = 0xA1E7;
		} else if (c == 0xFF5E) {
			code = 0xA1AB;
		} else if (c >= 0xFF01 && c <= 0xFF5D) {
			code = c - 0xFF01 + 0xA3A1;
		} else if (c >= 0xFFE0 && c <= 0xFFE5) {
			code = ucs_hff_s_cp936_table[c - 0xFFE0];
		}
	}

	/* exclude CP936 extensions: both bytes must be 0xA1 or higher */
	if (((code >> 8) & 0xFF) < 0xA1 || (code & 0xFF) < 0xA1) {
		code = 0;
	}

	/* With no table mapping, only codepoints below 0x80 can still be emitted (as themselves) */
	if (code <= 0) {
		code = (c < 0x80) ? c : -1;
	}

	if (code < 0) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
		return 0;
	}

	if (code < 0x80) { /* latin */
		CK((*filter->output_function)(code, filter->data));
	} else {
		CK((*filter->output_function)((code >> 8) & 0xFF, filter->data));
		CK((*filter->output_function)(code & 0xFF, filter->data));
	}

	return 0;
}
9528 
/* Called at end of input for the streaming EUC-CN decoder. If a lead byte is still waiting
 * for its trail byte, one MBFL_BAD_INPUT is emitted; then the next filter is flushed. */
static int mbfl_filt_conv_euccn_wchar_flush(mbfl_convert_filter *filter)
{
	int lead_byte_pending = (filter->status == 1);

	if (lead_byte_pending) {
		/* Input ended in the middle of a 2-byte character */
		filter->status = 0;
		CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
	}

	if (filter->flush_function) {
		(*filter->flush_function)(filter->data);
	}

	return 0;
}
9543 
mb_euccn_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)9544 static size_t mb_euccn_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
9545 {
9546 	unsigned char *p = *in, *e = p + *in_len;
9547 	uint32_t *out = buf, *limit = buf + bufsize;
9548 
9549 	while (p < e && out < limit) {
9550 		unsigned char c = *p++;
9551 
9552 		if (c < 0x80) {
9553 			*out++ = c;
9554 		} else if (((c >= 0xA1 && c <= 0xA9) || (c >= 0xB0 && c <= 0xF7)) && p < e) {
9555 			unsigned char c2 = *p++;
9556 
9557 			if (c2 >= 0xA1 && c2 <= 0xFE) {
9558 				unsigned int w = (c - 0x81)*192 + c2 - 0x40;
9559 				ZEND_ASSERT(w < cp936_ucs_table_size);
9560 				if (w == 0x1864) {
9561 					w = 0x30FB;
9562 				} else if (w == 0x186A) {
9563 					w = 0x2015;
9564 				} else if ((w >= 0x1921 && w <= 0x192A) || w == 0x1963 || (w >= 0x1C59 && w <= 0x1C7E) || (w >= 0x1DBB && w <= 0x1DC4)) {
9565 					w = 0;
9566 				} else {
9567 					w = cp936_ucs_table[w];
9568 				}
9569 
9570 				if (!w)
9571 					w = MBFL_BAD_INPUT;
9572 				*out++ = w;
9573 			} else {
9574 				*out++ = MBFL_BAD_INPUT;
9575 			}
9576 		} else {
9577 			*out++ = MBFL_BAD_INPUT;
9578 		}
9579 	}
9580 
9581 	*in_len = e - p;
9582 	*in = p;
9583 	return out - buf;
9584 }
9585 
/* Encode `len` codepoints from `in` as EUC-CN and append the bytes to `buf`.
 * Codepoints with no EUC-CN representation are passed to MB_CONVERT_ERROR.
 * `end` is not used. */
static void mb_wchar_to_euccn(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	/* Each codepoint becomes at most 2 bytes */
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);

	while (len--) {
		uint32_t cp = *in++;
		unsigned int code = 0;

		if (cp >= ucs_a1_cp936_table_min && cp < ucs_a1_cp936_table_max) {
			int skipped = (cp == 0xB7 || cp == 0x144 || cp == 0x148 || cp == 0x251 || cp == 0x261);
			if (!skipped) {
				code = ucs_a1_cp936_table[cp - ucs_a1_cp936_table_min];
			}
		} else if (cp >= ucs_a2_cp936_table_min && cp < ucs_a2_cp936_table_max) {
			if (cp == 0x2015) {
				code = 0xA1AA;
			} else if (!(cp == 0x2014 || (cp >= 0x2170 && cp <= 0x2179))) {
				code = ucs_a2_cp936_table[cp - ucs_a2_cp936_table_min];
			}
		} else if (cp >= ucs_a3_cp936_table_min && cp < ucs_a3_cp936_table_max) {
			code = (cp == 0x30FB) ? 0xA1A4 : ucs_a3_cp936_table[cp - ucs_a3_cp936_table_min];
		} else if (cp >= ucs_i_cp936_table_min && cp < ucs_i_cp936_table_max) {
			code = ucs_i_cp936_table[cp - ucs_i_cp936_table_min];
		} else if (cp >= ucs_hff_cp936_table_min && cp < ucs_hff_cp936_table_max) {
			if (cp == 0xFF04) {
				code = 0xA1E7;
			} else if (cp == 0xFF5E) {
				code = 0xA1AB;
			} else if (cp >= 0xFF01 && cp <= 0xFF5D) {
				code = cp - 0xFF01 + 0xA3A1;
			} else if (cp >= 0xFFE0 && cp <= 0xFFE5) {
				code = ucs_hff_s_cp936_table[cp - 0xFFE0];
			}
		}

		/* Exclude CP936 extensions: both bytes must be 0xA1 or higher */
		unsigned int hi = (code >> 8) & 0xFF, lo = code & 0xFF;
		if (hi < 0xA1 || lo < 0xA1) {
			code = 0;
		}

		if (code >= 0x80) {
			out = mb_convert_buf_add2(out, hi, lo);
		} else if (code) {
			out = mb_convert_buf_add(out, code);
		} else if (cp < 0x80) {
			/* No table mapping, but codepoints below 0x80 are emitted as themselves */
			out = mb_convert_buf_add(out, cp);
		} else {
			MB_CONVERT_ERROR(buf, out, limit, cp, mb_wchar_to_euccn);
			/* Re-check available space for the codepoints which remain */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
9647 
/* Streaming EUC-TW decoder: takes one input byte per call in `c` and forwards each decoded
 * codepoint (or MBFL_BAD_INPUT) to filter->output_function.
 *
 * filter->status values:
 *   0  start of a character
 *   1  have the first byte of a 2-byte character (saved in filter->cache)
 *   2  have 0x8E, the first byte of a 4-byte character
 *   3  have 0x8E and the second byte (filter->cache = second byte - 0xA1)
 *   4  have three bytes (filter->cache = (state-3 cache << 8) + third byte - 0xA1) */
static int mbfl_filt_conv_euctw_wchar(int c, mbfl_convert_filter *filter)
{
	switch (filter->status) {
	case 0:
		if (c >= 0 && c < 0x80) { /* latin */
			CK((*filter->output_function)(c, filter->data));
		} else if (c == 0x8E) { /* 4-byte character, first byte */
			filter->status = 2;
		} else if (((c >= 0xA1 && c <= 0xA6) || (c >= 0xC2 && c <= 0xFD)) && c != 0xC3) { /* 2-byte character, first byte */
			filter->cache = c;
			filter->status = 1;
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 1: { /* 2-byte character, second byte */
		int lead = filter->cache;

		filter->status = 0;

		if (c > 0xA0 && c < 0xFF) {
			int idx = (lead - 0xA1)*94 + (c - 0xA1);
			int wchar = 0;

			if (idx >= 0 && idx < cns11643_1_ucs_table_size) {
				wchar = cns11643_1_ucs_table[idx];
			}
			if (wchar <= 0) {
				wchar = MBFL_BAD_INPUT;
			}

			CK((*filter->output_function)(wchar, filter->data));
		} else {
			filter->cache = 0;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;
	}

	case 2: /* got 0x8e, second byte */
		if (c == 0xA1 || c == 0xA2 || c == 0xAE) {
			filter->cache = c - 0xA1;
			filter->status = 3;
		} else {
			filter->status = filter->cache = 0;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 3: { /* got 0x8e, third byte */
		int plane = filter->cache; /* value saved in state 2: second byte minus 0xA1 */
		int third_byte_ok = 0;

		/* The accepted range for the third byte depends on the value saved in state 2 */
		if (c >= 0xA1) {
			if (plane == 0) {
				third_byte_ok = ((c >= 0xA1 && c <= 0xA6) || (c >= 0xC2 && c <= 0xFD)) && c != 0xC3;
			} else if (plane == 1) {
				third_byte_ok = (c <= 0xF2);
			} else if (plane == 13) {
				third_byte_ok = (c <= 0xE7);
			}
		}

		if (third_byte_ok) {
			filter->cache = (plane << 8) + c - 0xA1;
			filter->status = 4;
		} else {
			filter->status = filter->cache = 0;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;
	}

	case 4: { /* multi-byte character, fourth byte */
		int packed = filter->cache;

		filter->status = 0;

		if (packed <= 0xDFF && c > 0xA0 && c < 0xFF) {
			int plane = (packed & 0xF00) >> 8; /* This is actually the CNS-11643 plane minus one */
			int idx = (packed & 0xFF)*94 + c - 0xA1;
			int wchar = 0;

			if (idx >= 0) {
				/* A later version of CNS-11643 moved all the characters in "plane 14" to "plane 3",
				 * and added tens of thousands more characters in planes 4, 5, 6, and 7
				 * We only support the older version of CNS-11643
				 * This is the same as iconv from glibc 2.2 */
				if (plane == 0 && idx < cns11643_1_ucs_table_size) {
					wchar = cns11643_1_ucs_table[idx];
				} else if (plane == 1 && idx < cns11643_2_ucs_table_size) {
					wchar = cns11643_2_ucs_table[idx];
				} else if (plane == 13 && idx < cns11643_14_ucs_table_size) {
					wchar = cns11643_14_ucs_table[idx];
				}
			}

			if (wchar <= 0) {
				wchar = MBFL_BAD_INPUT;
			}

			CK((*filter->output_function)(wchar, filter->data));
		} else {
			filter->cache = 0;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;
	}

		EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
9748 
/* Streaming EUC-TW encoder: takes one codepoint per call in `c` and sends the encoded byte(s)
 * to filter->output_function. Codepoints which are not found in the CNS-11643 tables are
 * handed to mbfl_filt_conv_illegal_output.
 *
 * A table value holds a plane number in bits 16-20 and a 2-byte code in the low 16 bits.
 * Plane values 0 and 1 are emitted as 1 or 2 bytes; any other plane is emitted as 4 bytes:
 * 0x8E, 0xA0 + plane, and then the 2-byte code with the high bit of each byte set. */
static int mbfl_filt_conv_wchar_euctw(int c, mbfl_convert_filter *filter)
{
	int s = 0;

	if (c >= ucs_a1_cns11643_table_min && c < ucs_a1_cns11643_table_max) {
		s = ucs_a1_cns11643_table[c - ucs_a1_cns11643_table_min];
	} else if (c >= ucs_a2_cns11643_table_min && c < ucs_a2_cns11643_table_max) {
		s = ucs_a2_cns11643_table[c - ucs_a2_cns11643_table_min];
	} else if (c >= ucs_a3_cns11643_table_min && c < ucs_a3_cns11643_table_max) {
		s = ucs_a3_cns11643_table[c - ucs_a3_cns11643_table_min];
	} else if (c >= ucs_i_cns11643_table_min && c < ucs_i_cns11643_table_max) {
		s = ucs_i_cns11643_table[c - ucs_i_cns11643_table_min];
	} else if (c >= ucs_r_cns11643_table_min && c < ucs_r_cns11643_table_max) {
		s = ucs_r_cns11643_table[c - ucs_r_cns11643_table_min];
	}

	if (s <= 0) {
		if (c != 0) {
			/* No mapping was found */
			CK(mbfl_filt_conv_illegal_output(c, filter));
			return 0;
		}
		/* U+0000 is emitted as a single zero byte */
		s = 0;
	}

	int plane = (s & 0x1F0000) >> 16;
	int byte1 = ((s >> 8) & 0xFF) | 0x80;
	int byte2 = (s & 0xFF) | 0x80;

	if (plane <= 1) {
		if (s < 0x80) { /* latin */
			CK((*filter->output_function)(s, filter->data));
		} else {
			CK((*filter->output_function)(byte1, filter->data));
			CK((*filter->output_function)(byte2, filter->data));
		}
	} else {
		/* The plane byte is computed directly. Packing all four bytes into one `int`
		 * (0x8EA00000 + ...) would need a value larger than INT_MAX, and converting that
		 * to `int` and then right-shifting it is implementation-defined. */
		CK((*filter->output_function)(0x8E, filter->data));
		CK((*filter->output_function)(0xA0 + plane, filter->data));
		CK((*filter->output_function)(byte1, filter->data));
		CK((*filter->output_function)(byte2, filter->data));
	}

	return 0;
}
9795 
/* Called at end of input for the streaming EUC-TW decoder. If the decoder is part-way
 * through a character (any non-zero status), one MBFL_BAD_INPUT is emitted; then the next
 * filter is flushed. */
static int mbfl_filt_conv_euctw_wchar_flush(mbfl_convert_filter *filter)
{
	int char_in_progress = (filter->status != 0);

	if (char_in_progress) {
		/* Input ended in the middle of a 2-byte or 4-byte character */
		filter->status = 0;
		CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
	}

	if (filter->flush_function) {
		(*filter->flush_function)(filter->data);
	}

	return 0;
}
9810 
mb_euctw_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)9811 static size_t mb_euctw_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
9812 {
9813 	unsigned char *p = *in, *e = p + *in_len;
9814 	uint32_t *out = buf, *limit = buf + bufsize;
9815 
9816 	while (p < e && out < limit) {
9817 		unsigned char c = *p++;
9818 
9819 		if (c < 0x80) {
9820 			*out++ = c;
9821 		} else if (((c >= 0xA1 && c <= 0xA6) || (c >= 0xC2 && c <= 0xFD)) && c != 0xC3 && p < e) {
9822 			unsigned char c2 = *p++;
9823 
9824 			if (c2 >= 0xA1 && c2 <= 0xFE) {
9825 				unsigned int w = (c - 0xA1)*94 + (c2 - 0xA1);
9826 				if (w < cns11643_1_ucs_table_size) {
9827 					w = cns11643_1_ucs_table[w];
9828 				} else {
9829 					w = 0;
9830 				}
9831 				if (!w)
9832 					w = MBFL_BAD_INPUT;
9833 				*out++ = w;
9834 			} else {
9835 				*out++ = MBFL_BAD_INPUT;
9836 			}
9837 		} else if (c == 0x8E && p < e) {
9838 			unsigned char c2 = *p++;
9839 
9840 			if ((c2 == 0xA1 || c2 == 0xA2 || c2 == 0xAE) && p < e) {
9841 				unsigned int plane = c2 - 0xA1; /* This is actually the CNS-11643 plane minus one */
9842 				unsigned char c3 = *p++;
9843 
9844 				if (c3 >= 0xA1 && ((plane == 0 && ((c3 >= 0xA1 && c3 <= 0xA6) || (c3 >= 0xC2 && c3 <= 0xFD)) && c3 != 0xC3) || (plane == 1 && c3 <= 0xF2) || (plane == 13 && c3 <= 0xE7)) && p < e) {
9845 					unsigned char c4 = *p++;
9846 
9847 					if (c2 <= 0xAE && c4 > 0xA0 && c4 < 0xFF) {
9848 						unsigned int s = (c3 - 0xA1)*94 + c4 - 0xA1, w = 0;
9849 
9850 						/* A later version of CNS-11643 moved all the characters in "plane 14" to "plane 3",
9851 						 * and added tens of thousands more characters in planes 4, 5, 6, and 7
9852 						 * We only support the older version of CNS-11643
9853 						 * This is the same as iconv from glibc 2.2 */
9854 						if (plane == 0 && s < cns11643_1_ucs_table_size) {
9855 							w = cns11643_1_ucs_table[s];
9856 						} else if (plane == 1 && s < cns11643_2_ucs_table_size) {
9857 							w = cns11643_2_ucs_table[s];
9858 						} else if (plane == 13 && s < cns11643_14_ucs_table_size) {
9859 							w = cns11643_14_ucs_table[s];
9860 						}
9861 
9862 						if (!w)
9863 							w = MBFL_BAD_INPUT;
9864 						*out++ = w;
9865 						continue;
9866 					}
9867 				}
9868 			}
9869 
9870 			*out++ = MBFL_BAD_INPUT;
9871 		} else {
9872 			*out++ = MBFL_BAD_INPUT;
9873 		}
9874 	}
9875 
9876 	*in_len = e - p;
9877 	*in = p;
9878 	return out - buf;
9879 }
9880 
/* Encode `len` codepoints from `in` as EUC-TW and append the bytes to `buf`.
 * Codepoints which are not found in the CNS-11643 tables are passed to MB_CONVERT_ERROR.
 * `end` is not used. */
static void mb_wchar_to_euctw(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	/* Space for 2 bytes per codepoint is reserved up front; 4-byte characters reserve more below */
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);

	while (len--) {
		uint32_t cp = *in++;
		unsigned int code = 0;

		if (cp >= ucs_a1_cns11643_table_min && cp < ucs_a1_cns11643_table_max) {
			code = ucs_a1_cns11643_table[cp - ucs_a1_cns11643_table_min];
		} else if (cp >= ucs_a2_cns11643_table_min && cp < ucs_a2_cns11643_table_max) {
			code = ucs_a2_cns11643_table[cp - ucs_a2_cns11643_table_min];
		} else if (cp >= ucs_a3_cns11643_table_min && cp < ucs_a3_cns11643_table_max) {
			code = ucs_a3_cns11643_table[cp - ucs_a3_cns11643_table_min];
		} else if (cp >= ucs_i_cns11643_table_min && cp < ucs_i_cns11643_table_max) {
			code = ucs_i_cns11643_table[cp - ucs_i_cns11643_table_min];
		} else if (cp >= ucs_r_cns11643_table_min && cp < ucs_r_cns11643_table_max) {
			code = ucs_r_cns11643_table[cp - ucs_r_cns11643_table_min];
		}

		if (code == 0) {
			if (cp != 0) {
				MB_CONVERT_ERROR(buf, out, limit, cp, mb_wchar_to_euctw);
				/* Re-check available space for the codepoints which remain */
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
			} else {
				/* U+0000 is emitted as a single zero byte */
				out = mb_convert_buf_add(out, 0);
			}
			continue;
		}

		/* Bits 16 and up of the table value select the plane */
		unsigned int plane = code >> 16;

		if (plane > 1) {
			/* 4-byte form: 0x8E, plane byte, then the 2-byte code with high bits set */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 4);
			out = mb_convert_buf_add4(out, 0x8E, 0xA0 + plane, ((code >> 8) & 0xFF) | 0x80, (code & 0xFF) | 0x80);
		} else if (code < 0x80) {
			out = mb_convert_buf_add(out, code);
		} else {
			out = mb_convert_buf_add2(out, ((code >> 8) & 0xFF) | 0x80, (code & 0xFF) | 0x80);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
9927 
/* Streaming EUC-KR decoder: takes one input byte per call in `c` and forwards each decoded
 * codepoint (or MBFL_BAD_INPUT) to filter->output_function.
 * filter->status is 1 while a lead byte is held in filter->cache, and 0 otherwise. */
static int mbfl_filt_conv_euckr_wchar(int c, mbfl_convert_filter *filter)
{
	switch (filter->status) {
	case 0: {
		int is_lead_byte = ((c >= 0xA1 && c <= 0xAC) || (c >= 0xB0 && c <= 0xFD)) && c != 0xC9;

		if (c >= 0 && c < 0x80) { /* latin */
			CK((*filter->output_function)(c, filter->data));
		} else if (is_lead_byte) {
			filter->cache = c;
			filter->status = 1;
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;
	}

	case 1: { /* dbcs second byte */
		int lead = filter->cache;
		int wchar = 0;

		filter->status = 0;

		if (c >= 0xa1 && c <= 0xfe) {
			if (lead >= 0xa1 && lead <= 0xc6) {
				/* 1st: 0xa1..0xc6 -- looked up in the first UHC table */
				int idx = (lead - 0x81)*190 + c - 0x41;
				ZEND_ASSERT(idx < uhc1_ucs_table_size);
				wchar = uhc1_ucs_table[idx];
			} else if (lead >= 0xc7 && lead <= 0xfe && lead != 0xc9) {
				/* 1st: 0xc7..0xc8,0xca..0xfe -- looked up in the third UHC table */
				int idx = (lead - 0xc7)*94 + c - 0xa1;
				ZEND_ASSERT(idx < uhc3_ucs_table_size);
				wchar = uhc3_ucs_table[idx];
			}
		}

		/* Either a byte was out of range, or there is no mapping for this byte pair */
		if (wchar <= 0) {
			wchar = MBFL_BAD_INPUT;
		}
		CK((*filter->output_function)(wchar, filter->data));
		break;
	}

		EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
9978 
/* Streaming EUC-KR encoder: takes one codepoint per call in `c` and sends the encoded byte(s)
 * to filter->output_function. Codepoints which cannot be encoded are handed to
 * mbfl_filt_conv_illegal_output. */
static int mbfl_filt_conv_wchar_euckr(int c, mbfl_convert_filter *filter)
{
	int code = 0;

	if (c >= ucs_a1_uhc_table_min && c < ucs_a1_uhc_table_max) {
		code = ucs_a1_uhc_table[c - ucs_a1_uhc_table_min];
	} else if (c >= ucs_a2_uhc_table_min && c < ucs_a2_uhc_table_max) {
		code = ucs_a2_uhc_table[c - ucs_a2_uhc_table_min];
	} else if (c >= ucs_a3_uhc_table_min && c < ucs_a3_uhc_table_max) {
		code = ucs_a3_uhc_table[c - ucs_a3_uhc_table_min];
	} else if (c >= ucs_i_uhc_table_min && c < ucs_i_uhc_table_max) {
		code = ucs_i_uhc_table[c - ucs_i_uhc_table_min];
	} else if (c >= ucs_s_uhc_table_min && c < ucs_s_uhc_table_max) {
		code = ucs_s_uhc_table[c - ucs_s_uhc_table_min];
	} else if (c >= ucs_r1_uhc_table_min && c < ucs_r1_uhc_table_max) {
		code = ucs_r1_uhc_table[c - ucs_r1_uhc_table_min];
	} else if (c >= ucs_r2_uhc_table_min && c < ucs_r2_uhc_table_max) {
		code = ucs_r2_uhc_table[c - ucs_r2_uhc_table_min];
	}

	/* exclude UHC extension area (although we are using the UHC conversion tables):
	 * both bytes must be 0xA1 or higher */
	if (((code >> 8) & 0xFF) < 0xA1 || (code & 0xFF) < 0xA1) {
		code = 0;
	}

	/* With no table mapping, only codepoints below 0x80 can still be emitted (as themselves) */
	if (code <= 0) {
		code = (c < 0x80) ? c : -1;
	}

	if (code < 0) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
		return 0;
	}

	if (code < 0x80) { /* latin */
		CK((*filter->output_function)(code, filter->data));
	} else {
		CK((*filter->output_function)((code >> 8) & 0xff, filter->data));
		CK((*filter->output_function)(code & 0xff, filter->data));
	}

	return 0;
}
10025 
/* Called at end of input for the streaming EUC-KR decoder. If a lead byte is still waiting
 * for its trail byte, one MBFL_BAD_INPUT is emitted; then the next filter is flushed. */
static int mbfl_filt_conv_euckr_wchar_flush(mbfl_convert_filter *filter)
{
	int lead_byte_pending = (filter->status == 1);

	if (lead_byte_pending) {
		/* Input ended in the middle of a 2-byte character */
		filter->status = 0;
		CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
	}

	if (filter->flush_function) {
		(*filter->flush_function)(filter->data);
	}

	return 0;
}
10040 
mb_euckr_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)10041 static size_t mb_euckr_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
10042 {
10043 	unsigned char *p = *in, *e = p + *in_len;
10044 	uint32_t *out = buf, *limit = buf + bufsize;
10045 
10046 	while (p < e && out < limit) {
10047 		unsigned char c = *p++;
10048 
10049 		if (c < 0x80) {
10050 			*out++ = c;
10051 		} else if (((c >= 0xA1 && c <= 0xAC) || (c >= 0xB0 && c <= 0xFD)) && c != 0xC9 && p < e) {
10052 			unsigned char c2 = *p++;
10053 			if (c2 < 0xA1 || c2 == 0xFF) {
10054 				*out++ = MBFL_BAD_INPUT;
10055 				continue;
10056 			}
10057 
10058 			if (c <= 0xC6) {
10059 				unsigned int w = (c - 0x81)*190 + c2 - 0x41;
10060 				ZEND_ASSERT(w < uhc1_ucs_table_size);
10061 				w = uhc1_ucs_table[w];
10062 				if (!w)
10063 					w = MBFL_BAD_INPUT;
10064 				*out++ = w;
10065 			} else {
10066 				unsigned int w = (c - 0xC7)*94 + c2 - 0xA1;
10067 				ZEND_ASSERT(w < uhc3_ucs_table_size);
10068 				w = uhc3_ucs_table[w];
10069 				if (!w)
10070 					w = MBFL_BAD_INPUT;
10071 				*out++ = w;
10072 			}
10073 		} else {
10074 			*out++ = MBFL_BAD_INPUT;
10075 		}
10076 	}
10077 
10078 	*in_len = e - p;
10079 	*in = p;
10080 	return out - buf;
10081 }
10082 
/* Encode `len` codepoints from `in` as EUC-KR and append the bytes to `buf`.
 * Codepoints with no EUC-KR representation are passed to MB_CONVERT_ERROR.
 * `end` is not used. */
static void mb_wchar_to_euckr(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	/* Space for 1 byte per codepoint is reserved up front; 2-byte characters reserve more below */
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	while (len--) {
		uint32_t cp = *in++;
		unsigned int code = 0;

		if (cp >= ucs_a1_uhc_table_min && cp < ucs_a1_uhc_table_max) {
			code = ucs_a1_uhc_table[cp - ucs_a1_uhc_table_min];
		} else if (cp >= ucs_a2_uhc_table_min && cp < ucs_a2_uhc_table_max) {
			code = ucs_a2_uhc_table[cp - ucs_a2_uhc_table_min];
		} else if (cp >= ucs_a3_uhc_table_min && cp < ucs_a3_uhc_table_max) {
			code = ucs_a3_uhc_table[cp - ucs_a3_uhc_table_min];
		} else if (cp >= ucs_i_uhc_table_min && cp < ucs_i_uhc_table_max) {
			code = ucs_i_uhc_table[cp - ucs_i_uhc_table_min];
		} else if (cp >= ucs_s_uhc_table_min && cp < ucs_s_uhc_table_max) {
			code = ucs_s_uhc_table[cp - ucs_s_uhc_table_min];
		} else if (cp >= ucs_r1_uhc_table_min && cp < ucs_r1_uhc_table_max) {
			code = ucs_r1_uhc_table[cp - ucs_r1_uhc_table_min];
		} else if (cp >= ucs_r2_uhc_table_min && cp < ucs_r2_uhc_table_max) {
			code = ucs_r2_uhc_table[cp - ucs_r2_uhc_table_min];
		}

		/* Exclude UHC extension area (although we are using the UHC conversion tables):
		 * both bytes must be 0xA1 or higher */
		unsigned int hi = (code >> 8) & 0xFF, lo = code & 0xFF;
		if (hi < 0xA1 || lo < 0xA1) {
			code = 0;
		}

		if (code >= 0x80) {
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			out = mb_convert_buf_add2(out, hi, lo);
		} else if (code) {
			out = mb_convert_buf_add(out, code);
		} else if (cp < 0x80) {
			/* No table mapping, but codepoints below 0x80 are emitted as themselves */
			out = mb_convert_buf_add(out, cp);
		} else {
			MB_CONVERT_ERROR(buf, out, limit, cp, mb_wchar_to_euckr);
			/* Re-check available space for the codepoints which remain */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
10131 
/* Streaming UHC decoder: takes one input byte per call in `c` and forwards each decoded
 * codepoint (or MBFL_BAD_INPUT) to filter->output_function.
 * filter->status is 1 while a lead byte is held in filter->cache, and 0 otherwise. */
static int mbfl_filt_conv_uhc_wchar(int c, mbfl_convert_filter *filter)
{
	switch (filter->status) {
	case 0:
		if (c >= 0 && c < 0x80) { /* latin */
			CK((*filter->output_function)(c, filter->data));
		} else if (c > 0x80 && c < 0xfe && c != 0xc9) { /* dbcs lead byte */
			filter->status = 1;
			filter->cache = c;
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 1: { /* dbcs second byte */
		int c1 = filter->cache, w = 0;

		filter->status = 0;

		/* The table index is kept in its own variable, so that if it ever falls outside the
		 * table, `w` stays 0 and MBFL_BAD_INPUT is emitted (rather than the raw index being
		 * passed on as if it were a codepoint) */
		if (c1 >= 0x81 && c1 <= 0xc6 && c >= 0x41 && c <= 0xfe) {
			int idx = (c1 - 0x81)*190 + (c - 0x41);
			if (idx >= 0 && idx < uhc1_ucs_table_size) {
				w = uhc1_ucs_table[idx];
			}
		} else if (c1 >= 0xc7 && c1 < 0xfe && c >= 0xa1 && c <= 0xfe) {
			int idx = (c1 - 0xc7)*94 + (c - 0xa1);
			if (idx >= 0 && idx < uhc3_ucs_table_size) {
				w = uhc3_ucs_table[idx];
			}
		}

		/* Either a byte was out of range, or there is no mapping for this byte pair */
		if (w == 0) {
			w = MBFL_BAD_INPUT;
		}
		CK((*filter->output_function)(w, filter->data));
		break;
	}

		EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
10173 
/* Called at end of input for the streaming UHC decoder. If a lead byte is still waiting
 * for its trail byte, one MBFL_BAD_INPUT is emitted; then the next filter is flushed. */
static int mbfl_filt_conv_uhc_wchar_flush(mbfl_convert_filter *filter)
{
	int lead_byte_pending = (filter->status == 1);

	if (lead_byte_pending) {
		/* Input ended in the middle of a 2-byte character */
		filter->status = 0;
		CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
	}

	if (filter->flush_function) {
		(*filter->flush_function)(filter->data);
	}

	return 0;
}
10188 
/* Streaming UHC encoder: takes one codepoint per call in `c` and sends the encoded byte(s)
 * to filter->output_function. Codepoints which are not found in the UHC tables are handed
 * to mbfl_filt_conv_illegal_output. */
static int mbfl_filt_conv_wchar_uhc(int c, mbfl_convert_filter *filter)
{
	int code = 0;

	if (c >= ucs_a1_uhc_table_min && c < ucs_a1_uhc_table_max) {
		code = ucs_a1_uhc_table[c - ucs_a1_uhc_table_min];
	} else if (c >= ucs_a2_uhc_table_min && c < ucs_a2_uhc_table_max) {
		code = ucs_a2_uhc_table[c - ucs_a2_uhc_table_min];
	} else if (c >= ucs_a3_uhc_table_min && c < ucs_a3_uhc_table_max) {
		code = ucs_a3_uhc_table[c - ucs_a3_uhc_table_min];
	} else if (c >= ucs_i_uhc_table_min && c < ucs_i_uhc_table_max) {
		code = ucs_i_uhc_table[c - ucs_i_uhc_table_min];
	} else if (c >= ucs_s_uhc_table_min && c < ucs_s_uhc_table_max) {
		code = ucs_s_uhc_table[c - ucs_s_uhc_table_min];
	} else if (c >= ucs_r1_uhc_table_min && c < ucs_r1_uhc_table_max) {
		code = ucs_r1_uhc_table[c - ucs_r1_uhc_table_min];
	} else if (c >= ucs_r2_uhc_table_min && c < ucs_r2_uhc_table_max) {
		code = ucs_r2_uhc_table[c - ucs_r2_uhc_table_min];
	}

	/* A zero result is only acceptable for U+0000 itself */
	if ((code == 0 && c != 0) || code < 0) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
		return 0;
	}

	if (code < 0x80) { /* latin */
		CK((*filter->output_function)(code, filter->data));
	} else {
		CK((*filter->output_function)((code >> 8) & 0xff, filter->data));
		CK((*filter->output_function)(code & 0xff, filter->data));
	}

	return 0;
}
10226 
mb_uhc_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)10227 static size_t mb_uhc_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
10228 {
10229 	unsigned char *p = *in, *e = p + *in_len;
10230 	uint32_t *out = buf, *limit = buf + bufsize;
10231 
10232 	e--; /* Stop the main loop 1 byte short of the end of the input */
10233 
10234 	while (p < e && out < limit) {
10235 		unsigned char c = *p++;
10236 
10237 		if (c < 0x80) {
10238 			*out++ = c;
10239 		} else if (c > 0x80 && c < 0xFE) {
10240 			/* We don't need to check p < e here; it's not possible that this pointer dereference
10241 			 * will be outside the input string, because of e-- above */
10242 			unsigned char c2 = *p++;
10243 			if (c2 < 0x41 || c2 == 0xFF) {
10244 				*out++ = MBFL_BAD_INPUT;
10245 				continue;
10246 			}
10247 			unsigned int w = 0;
10248 
10249 			if (c <= 0xC6) {
10250 				w = (c - 0x81)*190 + c2 - 0x41;
10251 				ZEND_ASSERT(w < uhc1_ucs_table_size);
10252 				w = uhc1_ucs_table[w];
10253 			} else if (c2 >= 0xA1) {
10254 				w = (c - 0xC7)*94 + c2 - 0xA1;
10255 				ZEND_ASSERT(w < uhc3_ucs_table_size);
10256 				w = uhc3_ucs_table[w];
10257 			}
10258 			if (!w) {
10259 				/* If c == 0xC9, we shouldn't have tried to read a 2-byte char at all... but it is faster
10260 				 * to fix up that rare case here rather than include an extra check in the hot path */
10261 				if (c == 0xC9) {
10262 					p--;
10263 				}
10264 				w = MBFL_BAD_INPUT;
10265 			}
10266 			*out++ = w;
10267 		} else {
10268 			*out++ = MBFL_BAD_INPUT;
10269 		}
10270 	}
10271 
10272 	/* Finish up last byte of input string if there is one */
10273 	if (p == e && out < limit) {
10274 		unsigned char c = *p++;
10275 		*out++ = (c < 0x80) ? c : MBFL_BAD_INPUT;
10276 	}
10277 
10278 	*in_len = e - p + 1;
10279 	*in = p;
10280 	return out - buf;
10281 }
10282 
/* Bulk encoder: convert `len` codepoints from `in` to UHC bytes appended to `buf`.
 *
 * Each codepoint is looked up in whichever ucs_*_uhc_table covers its range. A non-zero
 * result below 0x80 is emitted as one byte, anything else non-zero as two bytes (high byte
 * first). U+0000 is emitted as a zero byte; every other codepoint without a mapping is
 * handed to MB_CONVERT_ERROR. `end` is not used. */
static void mb_wchar_to_uhc(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	while (len--) {
		uint32_t cp = *in++;
		unsigned int code = 0;

		if (cp >= ucs_a1_uhc_table_min && cp < ucs_a1_uhc_table_max) {
			code = ucs_a1_uhc_table[cp - ucs_a1_uhc_table_min];
		} else if (cp >= ucs_a2_uhc_table_min && cp < ucs_a2_uhc_table_max) {
			code = ucs_a2_uhc_table[cp - ucs_a2_uhc_table_min];
		} else if (cp >= ucs_a3_uhc_table_min && cp < ucs_a3_uhc_table_max) {
			code = ucs_a3_uhc_table[cp - ucs_a3_uhc_table_min];
		} else if (cp >= ucs_i_uhc_table_min && cp < ucs_i_uhc_table_max) {
			code = ucs_i_uhc_table[cp - ucs_i_uhc_table_min];
		} else if (cp >= ucs_s_uhc_table_min && cp < ucs_s_uhc_table_max) {
			code = ucs_s_uhc_table[cp - ucs_s_uhc_table_min];
		} else if (cp >= ucs_r1_uhc_table_min && cp < ucs_r1_uhc_table_max) {
			code = ucs_r1_uhc_table[cp - ucs_r1_uhc_table_min];
		} else if (cp >= ucs_r2_uhc_table_min && cp < ucs_r2_uhc_table_max) {
			code = ucs_r2_uhc_table[cp - ucs_r2_uhc_table_min];
		}

		if (code >= 0x80) {
			/* Two-byte code: make sure there is room for it on top of the
			 * remaining `len` codepoints */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			out = mb_convert_buf_add2(out, (code >> 8) & 0xFF, code & 0xFF);
		} else if (code != 0) {
			out = mb_convert_buf_add(out, code);
		} else if (cp == 0) {
			/* A zero table result is only legitimate for U+0000 itself */
			out = mb_convert_buf_add(out, 0);
		} else {
			MB_CONVERT_ERROR(buf, out, limit, cp, mb_wchar_to_uhc);
			/* Re-check the space needed for the remaining codepoints after error handling */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
10326 
/* Byte-length table used by the EUC-JP, EUC-JP-2004, eucJP-win and CP51932 descriptors below.
 * It has 256 entries, 16 per row: 0x8E is 2, 0x8F is 3, 0xA1-0xFE are 2, everything else is 1.
 * Presumably indexed by the first byte of a character to get the character's length in
 * bytes; confirm against the code which reads the descriptor's length table. */
static const unsigned char mblen_table_eucjp[] = { /* 0xA1-0xFE */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1
};
10345 
/* Alternate names for EUC-JP; the list is NULL-terminated */
static const char *mbfl_encoding_euc_jp_aliases[] = {"EUC", "EUC_JP", "eucJP", "x-euc-jp", NULL};

/* Filter vtable pairing mbfl_no_encoding_euc_jp with mbfl_no_encoding_wchar
 * (EUC-JP => wchar, judging by the entry order and the vtable name) */
static const struct mbfl_convert_vtbl vtbl_eucjp_wchar = {
	mbfl_no_encoding_euc_jp,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_eucjp_wchar,
	mbfl_filt_conv_eucjp_wchar_flush, /* encoding-specific flush function */
	NULL,
};

/* Filter vtable for the opposite pairing (wchar => EUC-JP) */
static const struct mbfl_convert_vtbl vtbl_wchar_eucjp = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_euc_jp,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_eucjp,
	mbfl_filt_conv_common_flush, /* shared flush function; nothing EUC-JP-specific */
	NULL,
};

/* Descriptor for EUC-JP: ties together its identifier, name strings, alias list,
 * byte-length table, both filter vtables and the two bulk conversion functions.
 * The last two slots are left NULL. */
const mbfl_encoding mbfl_encoding_euc_jp = {
	mbfl_no_encoding_euc_jp,
	"EUC-JP",
	"EUC-JP", /* presumably the MIME name; same as the primary name here */
	mbfl_encoding_euc_jp_aliases,
	mblen_table_eucjp,
	0,
	&vtbl_eucjp_wchar,
	&vtbl_wchar_eucjp,
	mb_eucjp_to_wchar,
	mb_wchar_to_eucjp,
	NULL,
	NULL,
};
10382 
/* Alternate names for EUC-JP-2004; the list is NULL-terminated */
static const char *mbfl_encoding_eucjp2004_aliases[] = {"EUC_JP-2004", NULL};

/* Filter vtable pairing mbfl_no_encoding_eucjp2004 with mbfl_no_encoding_wchar.
 * Note that it uses the jis2004 filter functions rather than functions
 * specific to EUC-JP-2004. */
static const struct mbfl_convert_vtbl vtbl_eucjp2004_wchar = {
	mbfl_no_encoding_eucjp2004,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_jis2004_wchar,
	mbfl_filt_conv_jis2004_wchar_flush,
	NULL,
};

/* Filter vtable for the opposite pairing (wchar => EUC-JP-2004). Unlike the other
 * wchar => EUC vtables in this file, which use mbfl_filt_conv_common_flush, this one
 * has a dedicated flush function. */
static const struct mbfl_convert_vtbl vtbl_wchar_eucjp2004 = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_eucjp2004,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_jis2004,
	mbfl_filt_conv_wchar_jis2004_flush,
	NULL,
};

/* Descriptor for EUC-JP-2004: ties together its identifier, name strings, alias list,
 * byte-length table (shared with EUC-JP), both filter vtables and the two bulk
 * conversion functions. The last two slots are left NULL. */
const mbfl_encoding mbfl_encoding_eucjp2004 = {
	mbfl_no_encoding_eucjp2004,
	"EUC-JP-2004",
	"EUC-JP", /* presumably the MIME name; differs from the primary name */
	mbfl_encoding_eucjp2004_aliases,
	mblen_table_eucjp,
	0,
	&vtbl_eucjp2004_wchar,
	&vtbl_wchar_eucjp2004,
	mb_eucjp2004_to_wchar,
	mb_wchar_to_eucjp2004,
	NULL,
	NULL,
};
10419 
/* Alternate names for eucJP-win; the list is NULL-terminated */
static const char *mbfl_encoding_eucjp_win_aliases[] = {"eucJP-open", "eucJP-ms", NULL};

/* Filter vtable pairing mbfl_no_encoding_eucjp_win with mbfl_no_encoding_wchar
 * (eucJP-win => wchar, judging by the entry order and the vtable name) */
static const struct mbfl_convert_vtbl vtbl_eucjpwin_wchar = {
	mbfl_no_encoding_eucjp_win,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_eucjpwin_wchar,
	mbfl_filt_conv_eucjpwin_wchar_flush, /* encoding-specific flush function */
	NULL,
};

/* Filter vtable for the opposite pairing (wchar => eucJP-win) */
static const struct mbfl_convert_vtbl vtbl_wchar_eucjpwin = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_eucjp_win,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_eucjpwin,
	mbfl_filt_conv_common_flush, /* shared flush function */
	NULL,
};

/* Descriptor for eucJP-win: ties together its identifier, name strings, alias list,
 * byte-length table (shared with EUC-JP), both filter vtables and the two bulk
 * conversion functions. The last two slots are left NULL. */
const mbfl_encoding mbfl_encoding_eucjp_win = {
	mbfl_no_encoding_eucjp_win,
	"eucJP-win",
	"EUC-JP", /* presumably the MIME name; differs from the primary name */
	mbfl_encoding_eucjp_win_aliases,
	mblen_table_eucjp,
	0,
	&vtbl_eucjpwin_wchar,
	&vtbl_wchar_eucjpwin,
	mb_eucjpwin_to_wchar,
	mb_wchar_to_eucjpwin,
	NULL,
	NULL,
};
10456 
/* Alternate names for CP51932; the list is NULL-terminated */
static const char *mbfl_encoding_cp51932_aliases[] = {"cp51932", NULL};

/* Filter vtable pairing mbfl_no_encoding_cp51932 with mbfl_no_encoding_wchar
 * (CP51932 => wchar, judging by the entry order and the vtable name) */
static const struct mbfl_convert_vtbl vtbl_cp51932_wchar = {
	mbfl_no_encoding_cp51932,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_cp51932_wchar,
	mbfl_filt_conv_cp51932_wchar_flush, /* encoding-specific flush function */
	NULL,
};

/* Filter vtable for the opposite pairing (wchar => CP51932) */
static const struct mbfl_convert_vtbl vtbl_wchar_cp51932 = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_cp51932,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_cp51932,
	mbfl_filt_conv_common_flush, /* shared flush function */
	NULL,
};

/* Descriptor for CP51932: ties together its identifier, name strings, alias list,
 * byte-length table (shared with EUC-JP), both filter vtables and the two bulk
 * conversion functions. The last two slots are left NULL. */
const mbfl_encoding mbfl_encoding_cp51932 = {
	mbfl_no_encoding_cp51932,
	"CP51932",
	"CP51932", /* presumably the MIME name; same as the primary name here */
	mbfl_encoding_cp51932_aliases,
	mblen_table_eucjp,
	0,
	&vtbl_cp51932_wchar,
	&vtbl_wchar_cp51932,
	mb_cp51932_to_wchar,
	mb_wchar_to_cp51932,
	NULL,
	NULL,
};
10493 
/* Byte-length table used by the EUC-CN, EUC-TW and EUC-KR descriptors below.
 * It has 256 entries, 16 per row: 0xA1-0xFE are 2, everything else is 1.
 * Presumably indexed by the first byte of a character to get the character's length in
 * bytes; confirm against the code which reads the descriptor's length table. */
static const unsigned char mblen_table_euccn[] = { /* 0xA1-0xFE */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1
};
10512 
/* Alternate names for EUC-CN; the list is NULL-terminated */
static const char *mbfl_encoding_euc_cn_aliases[] = {"CN-GB", "EUC_CN", "eucCN", "x-euc-cn", "gb2312", NULL};

/* Filter vtable pairing mbfl_no_encoding_euc_cn with mbfl_no_encoding_wchar
 * (EUC-CN => wchar, judging by the entry order and the vtable name) */
static const struct mbfl_convert_vtbl vtbl_euccn_wchar = {
	mbfl_no_encoding_euc_cn,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_euccn_wchar,
	mbfl_filt_conv_euccn_wchar_flush, /* encoding-specific flush function */
	NULL,
};

/* Filter vtable for the opposite pairing (wchar => EUC-CN) */
static const struct mbfl_convert_vtbl vtbl_wchar_euccn = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_euc_cn,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_euccn,
	mbfl_filt_conv_common_flush, /* shared flush function */
	NULL,
};

/* Descriptor for EUC-CN: ties together its identifier, name strings, alias list,
 * byte-length table, both filter vtables and the two bulk conversion functions.
 * The last two slots are left NULL. */
const mbfl_encoding mbfl_encoding_euc_cn = {
	mbfl_no_encoding_euc_cn,
	"EUC-CN",
	"CN-GB", /* presumably the MIME name; also appears in the alias list */
	mbfl_encoding_euc_cn_aliases,
	mblen_table_euccn,
	0,
	&vtbl_euccn_wchar,
	&vtbl_wchar_euccn,
	mb_euccn_to_wchar,
	mb_wchar_to_euccn,
	NULL,
	NULL,
};
10549 
/* Alternate names for EUC-TW; the list is NULL-terminated */
static const char *mbfl_encoding_euc_tw_aliases[] = {"EUC_TW", "eucTW", "x-euc-tw", NULL};

/* Filter vtable pairing mbfl_no_encoding_euc_tw with mbfl_no_encoding_wchar
 * (EUC-TW => wchar, judging by the entry order and the vtable name) */
static const struct mbfl_convert_vtbl vtbl_euctw_wchar = {
	mbfl_no_encoding_euc_tw,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_euctw_wchar,
	mbfl_filt_conv_euctw_wchar_flush, /* encoding-specific flush function */
	NULL,
};

/* Filter vtable for the opposite pairing (wchar => EUC-TW) */
static const struct mbfl_convert_vtbl vtbl_wchar_euctw = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_euc_tw,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_euctw,
	mbfl_filt_conv_common_flush, /* shared flush function */
	NULL,
};

/* Descriptor for EUC-TW: ties together its identifier, name strings, alias list,
 * byte-length table (the EUC-CN one is reused), both filter vtables and the two bulk
 * conversion functions. The last two slots are left NULL. */
const mbfl_encoding mbfl_encoding_euc_tw = {
	mbfl_no_encoding_euc_tw,
	"EUC-TW",
	"EUC-TW", /* presumably the MIME name; same as the primary name here */
	mbfl_encoding_euc_tw_aliases,
	mblen_table_euccn,
	0,
	&vtbl_euctw_wchar,
	&vtbl_wchar_euctw,
	mb_euctw_to_wchar,
	mb_wchar_to_euctw,
	NULL,
	NULL,
};
10586 
/* Alternate names for EUC-KR; the list is NULL-terminated */
static const char *mbfl_encoding_euc_kr_aliases[] = {"EUC_KR", "eucKR", "x-euc-kr", NULL};

/* Filter vtable pairing mbfl_no_encoding_euc_kr with mbfl_no_encoding_wchar
 * (EUC-KR => wchar, judging by the entry order and the vtable name) */
static const struct mbfl_convert_vtbl vtbl_euckr_wchar = {
	mbfl_no_encoding_euc_kr,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_euckr_wchar,
	mbfl_filt_conv_euckr_wchar_flush, /* encoding-specific flush function */
	NULL,
};

/* Filter vtable for the opposite pairing (wchar => EUC-KR) */
static const struct mbfl_convert_vtbl vtbl_wchar_euckr = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_euc_kr,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_euckr,
	mbfl_filt_conv_common_flush, /* shared flush function */
	NULL,
};

/* Descriptor for EUC-KR: ties together its identifier, name strings, alias list,
 * byte-length table (the EUC-CN one is reused), both filter vtables and the two bulk
 * conversion functions. The last two slots are left NULL. */
const mbfl_encoding mbfl_encoding_euc_kr = {
	mbfl_no_encoding_euc_kr,
	"EUC-KR",
	"EUC-KR", /* presumably the MIME name; same as the primary name here */
	mbfl_encoding_euc_kr_aliases,
	mblen_table_euccn,
	0,
	&vtbl_euckr_wchar,
	&vtbl_wchar_euckr,
	mb_euckr_to_wchar,
	mb_wchar_to_euckr,
	NULL,
	NULL,
};
10623 
/* UHC was introduced by Microsoft in Windows 95, and is also known as CP949.
 * It is the same as EUC-KR, but with 8,822 additional characters added to
 * complete all the characters in the Johab charset. */
10627 
/* Byte-length table used by the UHC descriptor below.
 * It has 256 entries, 16 per row: 0x81-0xFE are 2, everything else is 1.
 * Presumably indexed by the first byte of a character to get the character's length in
 * bytes; confirm against the code which reads the descriptor's length table. */
static const unsigned char mblen_table_81_to_fe[] = { /* 0x81-0xFE */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1
};
10646 
/* Alternate names for UHC; the list is NULL-terminated */
static const char *mbfl_encoding_uhc_aliases[] = {"CP949", NULL};

/* Filter vtable pairing mbfl_no_encoding_uhc with mbfl_no_encoding_wchar
 * (UHC => wchar, judging by the entry order and the vtable name) */
static const struct mbfl_convert_vtbl vtbl_uhc_wchar = {
	mbfl_no_encoding_uhc,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_uhc_wchar,
	mbfl_filt_conv_uhc_wchar_flush, /* encoding-specific flush function */
	NULL,
};

/* Filter vtable for the opposite pairing (wchar => UHC) */
static const struct mbfl_convert_vtbl vtbl_wchar_uhc = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_uhc,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_uhc,
	mbfl_filt_conv_common_flush, /* shared flush function */
	NULL,
};

/* Descriptor for UHC: ties together its identifier, name strings, alias list,
 * byte-length table, both filter vtables and the two bulk conversion functions.
 * The last two slots are left NULL. */
const mbfl_encoding mbfl_encoding_uhc = {
	mbfl_no_encoding_uhc,
	"UHC",
	"UHC", /* presumably the MIME name; same as the primary name here */
	mbfl_encoding_uhc_aliases,
	mblen_table_81_to_fe,
	0,
	&vtbl_uhc_wchar,
	&vtbl_wchar_uhc,
	mb_uhc_to_wchar,
	mb_wchar_to_uhc,
	NULL,
	NULL,
};
10683 
10684 /*
10685  * GB18030/CP936
10686  */
10687 
/* GB18030 => wchar conversion filter, fed one input byte `c` per call.
 *
 * `filter->status` records how many bytes of an unfinished multi-byte character have been
 * seen so far (0 to 3); those bytes are kept packed in `filter->cache`. Each decoded
 * codepoint, or MBFL_BAD_INPUT for an invalid sequence, is passed to
 * `filter->output_function`. Every path visible here returns 0; CK presumably returns
 * early when the output function reports failure (its definition is not in this part of
 * the file). */
static int mbfl_filt_conv_gb18030_wchar(int c, mbfl_convert_filter *filter)
{
	int k;
	int c1, c2, c3, w = -1; /* w stays <= 0 until a codepoint has been found and emitted */

	switch (filter->status) {
	case 0:
		if (c >= 0 && c < 0x80) { /* latin */
			CK((*filter->output_function)(c, filter->data));
		} else if (c > 0x80 && c < 0xff) { /* dbcs/qbcs lead byte */
			filter->status = 1;
			filter->cache = c;
		} else {
			/* 0x80 and 0xFF cannot start a character */
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 1: /* dbcs/qbcs second byte */
		c1 = filter->cache;
		filter->status = 0;

		if (c1 >= 0x81 && c1 <= 0x84 && c >= 0x30 && c <= 0x39) {
			/* 4 byte range: Unicode BMP */
			filter->status = 2;
			filter->cache = (c1 << 8) | c;
			return 0;
		} else if (c1 >= 0x90 && c1 <= 0xe3 && c >= 0x30 && c <= 0x39) {
			/* 4 byte range: Unicode 16 planes */
			filter->status = 2;
			filter->cache = (c1 << 8) | c;
			return 0;
		} else if (((c1 >= 0xaa && c1 <= 0xaf) || (c1 >= 0xf8 && c1 <= 0xfe)) && (c >= 0xa1 && c <= 0xfe)) {
			/* UDA part 1,2: U+E000-U+E4C5 */
			w = 94*(c1 >= 0xf8 ? c1 - 0xf2 : c1 - 0xaa) + (c - 0xa1) + 0xe000;
			CK((*filter->output_function)(w, filter->data));
		} else if (c1 >= 0xa1 && c1 <= 0xa7 && c >= 0x40 && c < 0xa1 && c != 0x7f) {
			/* UDA part 3: U+E4C6-U+E765 (trail byte 0x7F is skipped, hence the 0x41 offset) */
			w = 96*(c1 - 0xa1) + c - (c >= 0x80 ? 0x41 : 0x40) + 0xe4c6;
			CK((*filter->output_function)(w, filter->data));
		}

		c2 = (c1 << 8) | c;

		/* Not yet decoded: if the 2-byte code lies in one of three coarse ranges, scan
		 * mbfl_gb18030_pua_tbl, whose rows are used as
		 * { first codepoint, last codepoint, first 2-byte code } */
		if (w <= 0 &&
			((c2 >= 0xa2ab && c2 <= 0xa9f0 + (0xe80f-0xe801)) ||
			 (c2 >= 0xd7fa && c2 <= 0xd7fa + (0xe814-0xe810)) ||
			 (c2 >= 0xfe50 && c2 <= 0xfe80 + (0xe864-0xe844)))) {
			for (k = 0; k < mbfl_gb18030_pua_tbl_max; k++) {
				if (c2 >= mbfl_gb18030_pua_tbl[k][2] && c2 <= mbfl_gb18030_pua_tbl[k][2] + mbfl_gb18030_pua_tbl[k][1] - mbfl_gb18030_pua_tbl[k][0]) {
					w = c2 - mbfl_gb18030_pua_tbl[k][2] + mbfl_gb18030_pua_tbl[k][0];
					CK((*filter->output_function)(w, filter->data));
					break;
				}
			}
		}

		/* Still not decoded: fall back to the CP936 table for the valid lead/trail
		 * combinations (192 table slots per lead byte, trail bytes start at 0x40) */
		if (w <= 0) {
			if ((c1 >= 0xa1 && c1 <= 0xa9 && c >= 0xa1 && c <= 0xfe) ||
				(c1 >= 0xb0 && c1 <= 0xf7 && c >= 0xa1 && c <= 0xfe) ||
				(c1 >= 0x81 && c1 <= 0xa0 && c >= 0x40 && c <= 0xfe && c != 0x7f) ||
				(c1 >= 0xaa && c1 <= 0xfe && c >= 0x40 && c <= 0xa0 && c != 0x7f) ||
				(c1 >= 0xa8 && c1 <= 0xa9 && c >= 0x40 && c <= 0xa0 && c != 0x7f)) {
				w = (c1 - 0x81)*192 + c - 0x40;
				ZEND_ASSERT(w < cp936_ucs_table_size);
				CK((*filter->output_function)(cp936_ucs_table[w], filter->data));
			} else {
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			}
		}
		break;

	case 2: /* qbcs third byte */
		c1 = (filter->cache >> 8) & 0xff;
		c2 = filter->cache & 0xff;
		filter->status = filter->cache = 0;
		if (((c1 >= 0x81 && c1 <= 0x84) || (c1 >= 0x90 && c1 <= 0xe3)) && c2 >= 0x30 && c2 <= 0x39 && c >= 0x81 && c <= 0xfe) {
			filter->cache = (c1 << 16) | (c2 << 8) | c;
			filter->status = 3;
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 3: /* qbcs fourth byte */
		c1 = (filter->cache >> 16) & 0xff;
		c2 = (filter->cache >> 8) & 0xff;
		c3 = filter->cache & 0xff;
		filter->status = filter->cache = 0;
		if (((c1 >= 0x81 && c1 <= 0x84) || (c1 >= 0x90 && c1 <= 0xe3)) && c2 >= 0x30 && c2 <= 0x39 && c3 >= 0x81 && c3 <= 0xfe && c >= 0x30 && c <= 0x39) {
			/* The four bytes form a mixed-radix number: bytes 2 and 4 have 10 possible
			 * values each (0x30-0x39), byte 3 has 126 (0x81-0xFE) */
			if (c1 >= 0x90 && c1 <= 0xe3) {
				w = ((((c1 - 0x90)*10 + (c2 - 0x30))*126 + (c3 - 0x81)))*10 + (c - 0x30) + 0x10000;
				if (w > 0x10FFFF) {
					/* Beyond the last Unicode codepoint */
					CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
					return 0;
				}
			} else { /* Unicode BMP */
				w = (((c1 - 0x81)*10 + (c2 - 0x30))*126 + (c3 - 0x81))*10 + (c - 0x30);
				if (w >= 0 && w <= 39419) {
					/* Find the range containing this linear index and add that range's
					 * offset to get the codepoint */
					k = mbfl_bisec_srch(w, mbfl_gb2uni_tbl, mbfl_gb_uni_max);
					w += mbfl_gb_uni_ofst[k];
				} else {
					CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
					return 0;
				}
			}
			CK((*filter->output_function)(w, filter->data));
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

		EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
10804 
/* Flush handler for the GB18030 => wchar filter.
 *
 * A non-zero status at this point means the input ended in the middle of a multi-byte
 * character; the status is reset and MBFL_BAD_INPUT is emitted in its place. Afterwards the
 * flush function of the next stage is invoked, if one is set. */
static int mbfl_filt_conv_gb18030_wchar_flush(mbfl_convert_filter *filter)
{
	if (filter->status != 0) {
		filter->status = 0;
		CK(filter->output_function(MBFL_BAD_INPUT, filter->data));
	}

	if (filter->flush_function != NULL) {
		filter->flush_function(filter->data);
	}

	return 0;
}
10819 
/* wchar => GB18030 conversion filter, called once per codepoint `c`.
 *
 * The encoded form is built up in `s`: a value up to 0x80 is emitted as one byte, otherwise
 * `s` holds a 2-byte code, or the last three bytes of a 4-byte code whose first byte is kept
 * in `s1`. Bytes are passed one at a time to `filter->output_function`. Codepoints with no
 * encoding are handed to mbfl_filt_conv_illegal_output. Every path visible here returns 0. */
static int mbfl_filt_conv_wchar_gb18030(int c, mbfl_convert_filter *filter)
{
	int k, k1, k2;
	int c1, s = 0, s1 = 0;

	/* First try the CP936 tables, with a few GB18030-specific overrides */
	if (c >= ucs_a1_cp936_table_min && c < ucs_a1_cp936_table_max) {
		if (c == 0x01f9) {
			s = 0xa8bf;
		} else {
			s = ucs_a1_cp936_table[c - ucs_a1_cp936_table_min];
		}
	} else if (c >= ucs_a2_cp936_table_min && c < ucs_a2_cp936_table_max) {
		if (c == 0x20ac) { /* euro-sign */
			s = 0xa2e3;
		} else {
			s = ucs_a2_cp936_table[c - ucs_a2_cp936_table_min];
		}
	} else if (c >= ucs_a3_cp936_table_min && c < ucs_a3_cp936_table_max) {
		s = ucs_a3_cp936_table[c - ucs_a3_cp936_table_min];
	} else if (c >= ucs_i_cp936_table_min && c < ucs_i_cp936_table_max) {
		s = ucs_i_cp936_table[c - ucs_i_cp936_table_min];
	} else if (c >= ucs_ci_cp936_table_min && c < ucs_ci_cp936_table_max) {
		/* U+F900-FA2F CJK Compatibility Ideographs */
		if (c == 0xf92c) {
			s = 0xfd9c;
		} else if (c == 0xf979) {
			s = 0xfd9d;
		} else if (c == 0xf995) {
			s = 0xfd9e;
		} else if (c == 0xf9e7) {
			s = 0xfd9f;
		} else if (c == 0xf9f1) {
			s = 0xfda0;
		} else if (c >= 0xfa0c && c <= 0xfa29) {
			s = ucs_ci_s_cp936_table[c - 0xfa0c];
		}
	} else if (c >= ucs_cf_cp936_table_min && c < ucs_cf_cp936_table_max) {
		/* FE30h CJK Compatibility Forms  */
		s = ucs_cf_cp936_table[c - ucs_cf_cp936_table_min];
	} else if (c >= ucs_sfv_cp936_table_min && c < ucs_sfv_cp936_table_max) {
		/* U+FE50-FE6F Small Form Variants */
		s = ucs_sfv_cp936_table[c - ucs_sfv_cp936_table_min];
	} else if (c >= ucs_hff_cp936_table_min && c < ucs_hff_cp936_table_max) {
		/* U+FF00-FFFF HW/FW Forms */
		if (c == 0xff04) {
			s = 0xa1e7;
		} else if (c == 0xff5e) {
			s = 0xa1ab;
		} else if (c >= 0xff01 && c <= 0xff5d) {
			s = c - 0xff01 + 0xa3a1;
		} else if (c >= 0xffe0 && c <= 0xffe5) {
			s = ucs_hff_s_cp936_table[c-0xffe0];
		}
	}

	/* While GB18030 and CP936 are very similar, some mappings are different between these encodings;
	 * do a binary search in a table of differing codepoints to see if we have one */
	if (s <= 0 && c >= mbfl_gb18030_c_tbl_key[0] && c <= mbfl_gb18030_c_tbl_key[mbfl_gb18030_c_tbl_max-1]) {
		k1 = mbfl_bisec_srch2(c, mbfl_gb18030_c_tbl_key, mbfl_gb18030_c_tbl_max);
		if (k1 >= 0) {
			s = mbfl_gb18030_c_tbl_val[k1];
		}
	}

	/* This block is not guarded by `s <= 0`, so it replaces any value found above */
	if (c >= 0xe000 && c <= 0xe864) { /* PUA */
		if (c < 0xe766) {
			if (c < 0xe4c6) {
				/* U+E000..U+E4C5: 94 codes per lead byte; lead bytes 0xAA-0xAF, then 0xF8 onwards */
				c1 = c - 0xe000;
				s = (c1 % 94) + 0xa1;
				c1 /= 94;
				s |= (c1 < 0x06 ? c1 + 0xaa : c1 + 0xf2) << 8;
			} else {
				/* U+E4C6..U+E765: 96 codes per lead byte starting at 0xA1; trail bytes start
				 * at 0x40 and skip 0x7F */
				c1 = c - 0xe4c6;
				s = ((c1 / 96) + 0xa1) << 8;
				c1 %= 96;
				s |= c1 + (c1 >= 0x3f ? 0x41 : 0x40);
			}
		} else {
			/* U+E766..U+E864 */
			/* Binary search over rows used as { first codepoint, last codepoint, first 2-byte code } */
			k1 = 0;
			k2 = mbfl_gb18030_pua_tbl_max;
			while (k1 < k2) {
				k = (k1 + k2) >> 1;
				if (c < mbfl_gb18030_pua_tbl[k][0]) {
					k2 = k;
				} else if (c > mbfl_gb18030_pua_tbl[k][1]) {
					k1 = k + 1;
				} else {
					s = c - mbfl_gb18030_pua_tbl[k][0] + mbfl_gb18030_pua_tbl[k][2];
					break;
				}
			}
		}
	}

	/* If we have not yet found a suitable mapping for this codepoint, it requires a 4-byte code */
	if (s <= 0 && c >= 0x0080 && c <= 0xffff) {
		/* BMP */
		/* `s` temporarily holds the search result; a negative value means no range matched
		 * and is left in place, so the codepoint is reported as illegal below */
		s = mbfl_bisec_srch(c, mbfl_uni2gb_tbl, mbfl_gb_uni_max);
		if (s >= 0) {
			/* Split the linear index into mixed-radix digits, last byte first: 10, 126, 10 */
			c1 = c - mbfl_gb_uni_ofst[s];
			s = (c1 % 10) + 0x30;
			c1 /= 10;
			s |= ((c1 % 126) + 0x81) << 8;
			c1 /= 126;
			s |= ((c1 % 10) + 0x30) << 16;
			c1 /= 10;
			s1 = c1 + 0x81;
		}
	} else if (c >= 0x10000 && c <= 0x10ffff) {
		/* Code set 3: Unicode U+10000..U+10FFFF */
		c1 = c - 0x10000;
		s = (c1 % 10) + 0x30;
		c1 /= 10;
		s |= ((c1 % 126) + 0x81) << 8;
		c1 /= 126;
		s |= ((c1 % 10) + 0x30) << 16;
		c1 /= 10;
		s1 = c1 + 0x90;
	}

	/* U+0000 encodes as a zero byte; any other codepoint still at zero has no mapping */
	if (c == 0) {
		s = 0;
	} else if (s == 0) {
		s = -1;
	}

	if (s >= 0) {
		if (s <= 0x80) { /* latin */
			CK((*filter->output_function)(s, filter->data));
		} else if (s1 > 0) { /* qbcs */
			CK((*filter->output_function)(s1 & 0xff, filter->data));
			CK((*filter->output_function)((s >> 16) & 0xff, filter->data));
			CK((*filter->output_function)((s >> 8) & 0xff, filter->data));
			CK((*filter->output_function)(s & 0xff, filter->data));
		} else { /* dbcs */
			CK((*filter->output_function)((s >> 8) & 0xff, filter->data));
			CK((*filter->output_function)(s & 0xff, filter->data));
		}
	} else {
		CK(mbfl_filt_conv_illegal_output(c, filter));
	}

	return 0;
}
10965 
/* Lookup table used by mb_gb18030_to_wchar for 2-byte codes 0xFE50 through 0xFEA0.
 * It is indexed by (linear table index - 0x5DD0), where the linear index is
 * (lead - 0x81)*192 + (trail - 0x40), so entry 0 corresponds to 0xFE50. A non-zero entry is
 * the codepoint to emit; a zero entry means this table has no mapping for that code and
 * the caller carries on to its regular table lookup. */
static const unsigned short gb18030_pua_tbl3[] = {
	/* 0xFE50 */
	0x0000,0xE816,0xE817,0xE818,0x0000,0x0000,0x0000,0x0000,
	0x0000,0xE81E,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
	0x0000,0xE826,0x0000,0x0000,0x0000,0x0000,0xE82B,0xE82C,
	0x0000,0x0000,0x0000,0x0000,0xE831,0xE832,0x0000,0x0000,
	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0xE83B,0x0000,
	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0xE843,0x0000,
	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
	0xE854,0xE855,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
	/* 0xFEA0 */
	0xE864
};
10981 
mb_gb18030_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)10982 static size_t mb_gb18030_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
10983 {
10984 	unsigned char *p = *in, *e = p + *in_len;
10985 	uint32_t *out = buf, *limit = buf + bufsize;
10986 
10987 	while (p < e && out < limit) {
10988 		unsigned char c = *p++;
10989 
10990 		if (c < 0x80) {
10991 			*out++ = c;
10992 		} else if (c == 0x80 || c == 0xFF) {
10993 			*out++ = MBFL_BAD_INPUT;
10994 		} else {
10995 			if (p == e) {
10996 				*out++ = MBFL_BAD_INPUT;
10997 				break;
10998 			}
10999 			unsigned char c2 = *p++;
11000 
11001 			if (((c >= 0x81 && c <= 0x84) || (c >= 0x90 && c <= 0xE3)) && c2 >= 0x30 && c2 <= 0x39) {
11002 				if (p >= e) {
11003 					*out++ = MBFL_BAD_INPUT;
11004 					break;
11005 				}
11006 				unsigned char c3 = *p++;
11007 
11008 				if (c3 >= 0x81 && c3 <= 0xFE && p < e) {
11009 					unsigned char c4 = *p++;
11010 
11011 					if (c4 >= 0x30 && c4 <= 0x39) {
11012 						if (c >= 0x90 && c <= 0xE3) {
11013 							unsigned int w = ((((c - 0x90)*10 + (c2 - 0x30))*126 + (c3 - 0x81)))*10 + (c4 - 0x30) + 0x10000;
11014 							*out++ = (w > 0x10FFFF) ? MBFL_BAD_INPUT : w;
11015 						} else {
11016 							/* Unicode BMP */
11017 							unsigned int w = (((c - 0x81)*10 + (c2 - 0x30))*126 + (c3 - 0x81))*10 + (c4 - 0x30);
11018 							if (w <= 39419) {
11019 								*out++ = w + mbfl_gb_uni_ofst[mbfl_bisec_srch(w, mbfl_gb2uni_tbl, mbfl_gb_uni_max)];
11020 							} else {
11021 								*out++ = MBFL_BAD_INPUT;
11022 							}
11023 						}
11024 					} else {
11025 						*out++ = MBFL_BAD_INPUT;
11026 					}
11027 				} else {
11028 					*out++ = MBFL_BAD_INPUT;
11029 				}
11030 			} else if (((c >= 0xAA && c <= 0xAF) || (c >= 0xF8 && c <= 0xFE)) && (c2 >= 0xA1 && c2 <= 0xFE)) {
11031 				/* UDA part 1, 2: U+E000-U+E4C5 */
11032 				*out++ = 94*(c >= 0xF8 ? c - 0xF2 : c - 0xAA) + (c2 - 0xA1) + 0xE000;
11033 			} else if (c >= 0xA1 && c <= 0xA7 && c2 >= 0x40 && c2 < 0xA1 && c2 != 0x7F) {
11034 				/* UDA part 3: U+E4C6-U+E765 */
11035 				*out++ = 96*(c - 0xA1) + c2 - (c2 >= 0x80 ? 0x41 : 0x40) + 0xE4C6;
11036 			} else if (c2 >= 0x40 && c2 != 0x7F && c2 != 0xFF) {
11037 				unsigned int w = (c - 0x81)*192 + c2 - 0x40;
11038 
11039 				if (w >= 0x192B) {
11040 					if (w <= 0x1EBE) {
11041 						if (w != 0x1963 && w != 0x1DBF && (w < 0x1E49 || w > 0x1E55) && w != 0x1E7F) {
11042 							*out++ = cp936_pua_tbl1[w - 0x192B];
11043 							continue;
11044 						}
11045 					} else if (w >= 0x413A) {
11046 						if (w <= 0x413E) {
11047 							*out++ = cp936_pua_tbl2[w - 0x413A];
11048 							continue;
11049 						} else if (w >= 0x5DD0 && w <= 0x5E20) {
11050 							unsigned int c = gb18030_pua_tbl3[w - 0x5DD0];
11051 							if (c) {
11052 								*out++ = c;
11053 								continue;
11054 							}
11055 						}
11056 					}
11057 				}
11058 
11059 				if ((c >= 0x81 && c <= 0xA9) || (c >= 0xB0 && c <= 0xF7 && c2 >= 0xA1) || (c >= 0xAA && c <= 0xFE && c2 <= 0xA0)) {
11060 					ZEND_ASSERT(w < cp936_ucs_table_size);
11061 					*out++ = cp936_ucs_table[w];
11062 				} else {
11063 					*out++ = MBFL_BAD_INPUT;
11064 				}
11065 			} else {
11066 				*out++ = MBFL_BAD_INPUT;
11067 			}
11068 		}
11069 	}
11070 
11071 	*in_len = e - p;
11072 	*in = p;
11073 	return out - buf;
11074 }
11075 
mb_wchar_to_gb18030(uint32_t * in,size_t len,mb_convert_buf * buf,bool end)11076 static void mb_wchar_to_gb18030(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
11077 {
11078 	unsigned char *out, *limit;
11079 	MB_CONVERT_BUF_LOAD(buf, out, limit);
11080 	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
11081 
11082 	while (len--) {
11083 		uint32_t w = *in++;
11084 		unsigned int s = 0;
11085 
11086 		if (w == 0) {
11087 			out = mb_convert_buf_add(out, 0);
11088 			continue;
11089 		} else if (w >= ucs_a1_cp936_table_min && w < ucs_a1_cp936_table_max) {
11090 			if (w == 0x1F9) {
11091 				s = 0xA8BF;
11092 			} else {
11093 				s = ucs_a1_cp936_table[w - ucs_a1_cp936_table_min];
11094 			}
11095 		} else if (w >= ucs_a2_cp936_table_min && w < ucs_a2_cp936_table_max) {
11096 			if (w == 0x20AC) { /* Euro sign */
11097 				s = 0xA2E3;
11098 			} else {
11099 				s = ucs_a2_cp936_table[w - ucs_a2_cp936_table_min];
11100 			}
11101 		} else if (w >= ucs_a3_cp936_table_min && w < ucs_a3_cp936_table_max) {
11102 			s = ucs_a3_cp936_table[w - ucs_a3_cp936_table_min];
11103 		} else if (w >= ucs_i_cp936_table_min && w < ucs_i_cp936_table_max) {
11104 			s = ucs_i_cp936_table[w - ucs_i_cp936_table_min];
11105 		} else if (w >= ucs_ci_cp936_table_min && w < ucs_ci_cp936_table_max) {
11106 			/* U+F900-U+FA2F CJK Compatibility Ideographs */
11107 			if (w == 0xF92C) {
11108 				s = 0xFD9C;
11109 			} else if (w == 0xF979) {
11110 				s = 0xFD9D;
11111 			} else if (w == 0xF995) {
11112 				s = 0xFD9E;
11113 			} else if (w == 0xF9E7) {
11114 				s = 0xFD9F;
11115 			} else if (w == 0xF9F1) {
11116 				s = 0xFDA0;
11117 			} else if (w >= 0xFA0C && w <= 0xFA29) {
11118 				s = ucs_ci_s_cp936_table[w - 0xFA0C];
11119 			}
11120 		} else if (w >= ucs_cf_cp936_table_min && w < ucs_cf_cp936_table_max) {
11121 			/* CJK Compatibility Forms  */
11122 			s = ucs_cf_cp936_table[w - ucs_cf_cp936_table_min];
11123 		} else if (w >= ucs_sfv_cp936_table_min && w < ucs_sfv_cp936_table_max) {
11124 			/* U+FE50-U+FE6F Small Form Variants */
11125 			s = ucs_sfv_cp936_table[w - ucs_sfv_cp936_table_min];
11126 		} else if (w >= ucs_hff_cp936_table_min && w < ucs_hff_cp936_table_max) {
11127 			/* U+FF00-U+FFFF HW/FW Forms */
11128 			if (w == 0xFF04) {
11129 				s = 0xA1E7;
11130 			} else if (w == 0xFF5E) {
11131 				s = 0xA1AB;
11132 			} else if (w >= 0xFF01 && w <= 0xFF5D) {
11133 				s = w - 0xFF01 + 0xA3A1;
11134 			} else if (w >= 0xFFE0 && w <= 0xFFE5) {
11135 				s = ucs_hff_s_cp936_table[w - 0xFFE0];
11136 			}
11137 		} else if (w >= 0xE000 && w <= 0xE864) {
11138 			/* PUA */
11139 			if (w < 0xE766) {
11140 				if (w < 0xE4C6) {
11141 					unsigned int c1 = w - 0xE000;
11142 					s = (c1 % 94) + 0xA1;
11143 					c1 /= 94;
11144 					s |= (c1 + (c1 < 0x06 ? 0xAA : 0xF2)) << 8;
11145 				} else {
11146 					unsigned int c1 = w - 0xE4C6;
11147 					s = ((c1 / 96) + 0xA1) << 8;
11148 					c1 %= 96;
11149 					s |= c1 + (c1 >= 0x3F ? 0x41 : 0x40);
11150 				}
11151 			} else {
11152 				/* U+E766-U+E864 */
11153 				unsigned int k1 = 0, k2 = mbfl_gb18030_pua_tbl_max;
11154 				while (k1 < k2) {
11155 					unsigned int k = (k1 + k2) >> 1;
11156 					if (w < mbfl_gb18030_pua_tbl[k][0]) {
11157 						k2 = k;
11158 					} else if (w > mbfl_gb18030_pua_tbl[k][1]) {
11159 						k1 = k + 1;
11160 					} else {
11161 						s = w - mbfl_gb18030_pua_tbl[k][0] + mbfl_gb18030_pua_tbl[k][2];
11162 						break;
11163 					}
11164 				}
11165 			}
11166 		}
11167 
11168 		/* While GB18030 and CP936 are very similar, some mappings are different between these encodings;
11169 		 * do a binary search in a table of differing codepoints to see if we have one */
11170 		if (!s && w >= mbfl_gb18030_c_tbl_key[0] && w <= mbfl_gb18030_c_tbl_key[mbfl_gb18030_c_tbl_max-1]) {
11171 			int i = mbfl_bisec_srch2(w, mbfl_gb18030_c_tbl_key, mbfl_gb18030_c_tbl_max);
11172 			if (i >= 0) {
11173 				s = mbfl_gb18030_c_tbl_val[i];
11174 			}
11175 		}
11176 
11177 		/* If we have not yet found a suitable mapping for this codepoint, it requires a 4-byte code */
11178 		if (!s && w >= 0x80 && w <= 0xFFFF) {
11179 			/* BMP */
11180 			int i = mbfl_bisec_srch(w, mbfl_uni2gb_tbl, mbfl_gb_uni_max);
11181 			if (i >= 0) {
11182 				unsigned int c1 = w - mbfl_gb_uni_ofst[i];
11183 				s = (c1 % 10) + 0x30;
11184 				c1 /= 10;
11185 				s |= ((c1 % 126) + 0x81) << 8;
11186 				c1 /= 126;
11187 				s |= ((c1 % 10) + 0x30) << 16;
11188 				c1 /= 10;
11189 				s |= (c1 + 0x81) << 24;
11190 			}
11191 		} else if (w >= 0x10000 && w <= 0x10FFFF) {
11192 			/* Code set 3: Unicode U+10000-U+10FFFF */
11193 			unsigned int c1 = w - 0x10000;
11194 			s = (c1 % 10) + 0x30;
11195 			c1 /= 10;
11196 			s |= ((c1 % 126) + 0x81) << 8;
11197 			c1 /= 126;
11198 			s |= ((c1 % 10) + 0x30) << 16;
11199 			c1 /= 10;
11200 			s |= (c1 + 0x90) << 24;
11201 		}
11202 
11203 		if (!s) {
11204 			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_gb18030);
11205 			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
11206 		} else if (s < 0x80) {
11207 			out = mb_convert_buf_add(out, s);
11208 		} else if (s > 0xFFFFFF) {
11209 			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
11210 			out = mb_convert_buf_add4(out, (s >> 24) & 0xFF, (s >> 16) & 0xFF, (s >> 8) & 0xFF, s & 0xFF);
11211 		} else {
11212 			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
11213 			out = mb_convert_buf_add2(out, (s >> 8) & 0xFF, s & 0xFF);
11214 		}
11215 	}
11216 
11217 	MB_CONVERT_BUF_STORE(buf, out, limit);
11218 }
11219 
/* Legacy byte-at-a-time decoder for CP936 (GBK).
 *
 * `c` is the next input byte. filter->status is 0 when a new character is expected and 1
 * when a lead byte has been stored in filter->cache and the trail byte is awaited.
 * Decoded codepoints (or MBFL_BAD_INPUT) are passed to filter->output_function.
 * Returns 0 when the end of the function is reached; each output call is wrapped in CK()
 * (defined elsewhere). */
static int mbfl_filt_conv_cp936_wchar(int c, mbfl_convert_filter *filter)
{
	int i;
	int lead, pair, wc = -1;

	switch (filter->status) {
	case 0:
		if (c >= 0 && c < 0x80) {
			/* ASCII is passed through unchanged */
			CK((*filter->output_function)(c, filter->data));
		} else if (c == 0x80) {
			/* A lone 0x80 byte is the Euro sign */
			CK((*filter->output_function)(0x20ac, filter->data));
		} else if (c >= 0xff) {
			/* 0xFF is a single-byte code mapped to U+F8F5 */
			CK((*filter->output_function)(0xf8f5, filter->data));
		} else {
			/* Everything else is remembered as the lead byte of a double-byte character */
			filter->cache = c;
			filter->status = 1;
		}
		break;

	case 1:
		/* Trail byte: whatever happens, the next byte starts a fresh character */
		lead = filter->cache;
		filter->status = 0;

		if (c >= 0xa1 && c <= 0xfe &&
			((lead >= 0xaa && lead <= 0xaf) || (lead >= 0xf8 && lead <= 0xfe))) {
			/* User-defined area, parts 1 and 2: U+E000-U+E4C5, 94 cells per lead byte */
			int row = (lead >= 0xf8) ? lead - 0xf2 : lead - 0xaa;
			wc = 0xe000 + 94*row + (c - 0xa1);
			CK((*filter->output_function)(wc, filter->data));
		} else if (lead >= 0xa1 && lead <= 0xa7 && c >= 0x40 && c < 0xa1 && c != 0x7f) {
			/* User-defined area, part 3: U+E4C6-U+E765, 96 cells per lead byte
			 * (trail byte 0x7F is not used, so bytes above it shift down by one) */
			int col = c - (c >= 0x80 ? 0x41 : 0x40);
			wc = 0xe4c6 + 96*(lead - 0xa1) + col;
			CK((*filter->output_function)(wc, filter->data));
		}

		pair = (lead << 8) | c;

		if (wc <= 0) {
			/* Only byte pairs inside these three spans can appear in mbfl_cp936_pua_tbl,
			 * so the table is scanned only for them */
			int maybe_in_pua_tbl =
				(pair >= 0xa2ab && pair <= 0xa9f0 + (0xe80f-0xe801)) ||
				(pair >= 0xd7fa && pair <= 0xd7fa + (0xe814-0xe810)) ||
				(pair >= 0xfe50 && pair <= 0xfe80 + (0xe864-0xe844));

			if (maybe_in_pua_tbl) {
				/* Each row is read as { first codepoint, last codepoint, first byte pair } */
				for (i = 0; i < mbfl_cp936_pua_tbl_max; i++) {
					if (pair >= mbfl_cp936_pua_tbl[i][2] &&
						pair <= mbfl_cp936_pua_tbl[i][2] + mbfl_cp936_pua_tbl[i][1] - mbfl_cp936_pua_tbl[i][0]) {
						wc = pair - mbfl_cp936_pua_tbl[i][2] + mbfl_cp936_pua_tbl[i][0];
						CK((*filter->output_function)(wc, filter->data));
						break;
					}
				}
			}
		}

		if (wc <= 0) {
			/* Nothing emitted yet: fall back to the main lookup table, or report bad input */
			int valid_pair = lead > 0x80 && lead < 0xff && c >= 0x40 && c != 0x7f && c < 0xff;

			if (!valid_pair) {
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			} else {
				/* 192 table slots per lead byte, starting from lead 0x81 / trail 0x40 */
				wc = (lead - 0x81)*192 + c - 0x40;
				ZEND_ASSERT(wc < cp936_ucs_table_size);
				CK((*filter->output_function)(cp936_ucs_table[wc], filter->data));
			}
		}
		break;

		EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
11287 
/* Flush handler for the legacy CP936 decoder.
 * If the input ended after a lead byte (status is non-zero), the state is reset and
 * MBFL_BAD_INPUT is emitted for the truncated character. The filter's own flush_function,
 * when set, is then invoked. Returns 0 when the end of the function is reached. */
static int mbfl_filt_conv_cp936_wchar_flush(mbfl_convert_filter *filter)
{
	int truncated = (filter->status != 0);

	if (truncated) {
		/* A double-byte character was cut off after its first byte */
		filter->status = 0;
		CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
	}

	if (filter->flush_function != NULL) {
		filter->flush_function(filter->data);
	}

	return 0;
}
11302 
/* Legacy codepoint-at-a-time encoder for CP936 (GBK).
 *
 * Looks up the CP936 code for codepoint `c` and passes it to filter->output_function as
 * one byte (codes up to 0x80, and 0xFF) or as two bytes, high byte first. A codepoint with
 * no mapping, other than U+0000, is handed to mbfl_filt_conv_illegal_output().
 * Returns 0 when the end of the function is reached. */
static int mbfl_filt_conv_wchar_cp936(int c, mbfl_convert_filter *filter)
{
	int lo, hi, mid;
	int code = 0;

	if (c >= ucs_a1_cp936_table_min && c < ucs_a1_cp936_table_max) {
		/* U+0000 - U+0451 */
		code = ucs_a1_cp936_table[c - ucs_a1_cp936_table_min];
	} else if (c >= ucs_a2_cp936_table_min && c < ucs_a2_cp936_table_max) {
		/* U+2000 - U+26FF, with three codepoints handled outside the table */
		switch (c) {
		case 0x203e:
			code = 0xa3fe;
			break;
		case 0x2218:
			code = 0xa1e3;
			break;
		case 0x223c:
			code = 0xa1ab;
			break;
		default:
			code = ucs_a2_cp936_table[c - ucs_a2_cp936_table_min];
			break;
		}
	} else if (c >= ucs_a3_cp936_table_min && c < ucs_a3_cp936_table_max) {
		/* U+2F00 - U+33FF */
		code = ucs_a3_cp936_table[c - ucs_a3_cp936_table_min];
	} else if (c >= ucs_i_cp936_table_min && c < ucs_i_cp936_table_max) {
		/* U+4D00-9FFF CJK Unified Ideographs (+ Extension A) */
		code = ucs_i_cp936_table[c - ucs_i_cp936_table_min];
	} else if (c >= 0xe000 && c <= 0xe864) {
		/* Private Use Area */
		if (c < 0xe4c6) {
			/* U+E000-U+E4C5: 94 cells per lead byte; lead bytes 0xAA.. for the first
			 * six rows, 0xF8.. afterwards */
			int row = (c - 0xe000) / 94;
			int col = (c - 0xe000) % 94;
			code = ((row < 0x06 ? row + 0xaa : row + 0xf2) << 8) | (col + 0xa1);
		} else if (c < 0xe766) {
			/* U+E4C6-U+E765: 96 cells per lead byte, trail byte 0x7F skipped */
			int row = (c - 0xe4c6) / 96;
			int col = (c - 0xe4c6) % 96;
			code = ((row + 0xa1) << 8) | (col + (col >= 0x3f ? 0x41 : 0x40));
		} else {
			/* U+E766..U+E864: binary search over rows read as
			 * { first codepoint, last codepoint, first byte pair } */
			for (lo = 0, hi = mbfl_cp936_pua_tbl_max; lo < hi; ) {
				mid = (lo + hi) >> 1;
				if (c < mbfl_cp936_pua_tbl[mid][0]) {
					hi = mid;
				} else if (c > mbfl_cp936_pua_tbl[mid][1]) {
					lo = mid + 1;
				} else {
					code = c - mbfl_cp936_pua_tbl[mid][0] + mbfl_cp936_pua_tbl[mid][2];
					break;
				}
			}
		}
	} else if (c == 0xf8f5) {
		/* Encoded as the single byte 0xFF */
		code = 0xff;
	} else if (c >= ucs_ci_cp936_table_min && c < ucs_ci_cp936_table_max) {
		/* U+F900-FA2F CJK Compatibility Ideographs */
		code = ucs_ci_cp936_table[c - ucs_ci_cp936_table_min];
	} else if (c >= ucs_cf_cp936_table_min && c < ucs_cf_cp936_table_max) {
		code = ucs_cf_cp936_table[c - ucs_cf_cp936_table_min];
	} else if (c >= ucs_sfv_cp936_table_min && c < ucs_sfv_cp936_table_max) {
		/* U+FE50-FE6F Small Form Variants */
		code = ucs_sfv_cp936_table[c - ucs_sfv_cp936_table_min];
	} else if (c >= ucs_hff_cp936_table_min && c < ucs_hff_cp936_table_max) {
		/* U+FF00-FFFF HW/FW Forms; the two exceptions must be tested before the linear run */
		if (c == 0xff04) {
			code = 0xa1e7;
		} else if (c == 0xff5e) {
			code = 0xa1ab;
		} else if (c >= 0xff01 && c <= 0xff5d) {
			code = c - 0xff01 + 0xa3a1;
		} else if (c >= 0xffe0 && c <= 0xffe5) {
			code = ucs_hff_s_cp936_table[c - 0xffe0];
		}
	}

	/* A zero (or negative) code means "no mapping", except that U+0000 really encodes as 0 */
	if (code <= 0) {
		code = (c == 0) ? 0 : -1;
	}

	if (code < 0) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
		return 0;
	}

	if (code <= 0x80 || code == 0xff) {
		/* Single-byte code */
		CK((*filter->output_function)(code, filter->data));
	} else {
		/* Double-byte code, high byte first */
		CK((*filter->output_function)((code >> 8) & 0xff, filter->data));
		CK((*filter->output_function)(code & 0xff, filter->data));
	}

	return 0;
}
11397 
mb_cp936_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)11398 static size_t mb_cp936_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
11399 {
11400 	unsigned char *p = *in, *e = p + *in_len;
11401 	uint32_t *out = buf, *limit = buf + bufsize;
11402 
11403 	while (p < e && out < limit) {
11404 		unsigned char c = *p++;
11405 
11406 		if (c < 0x80) {
11407 			*out++ = c;
11408 		} else if (c == 0x80) {
11409 			*out++ = 0x20AC; /* Euro sign */
11410 		} else if (c < 0xFF) {
11411 			if (p >= e) {
11412 				*out++ = MBFL_BAD_INPUT;
11413 				continue;
11414 			}
11415 
11416 			unsigned char c2 = *p++;
11417 			if (c2 < 0x40 || c2 == 0x7F || c2 == 0xFF) {
11418 				*out++ = MBFL_BAD_INPUT;
11419 				continue;
11420 			}
11421 
11422 			if (((c >= 0xAA && c <= 0xAF) || (c >= 0xF8 && c <= 0xFE)) && c2 >= 0xA1) {
11423 				/* UDA part 1, 2: U+E000-U+E4C5 */
11424 				*out++ = 94*(c >= 0xF8 ? c - 0xF2 : c - 0xAA) + (c2 - 0xA1) + 0xE000;
11425 			} else if (c >= 0xA1 && c <= 0xA7 && c2 < 0xA1) {
11426 				/* UDA part 3: U+E4C6-U+E765*/
11427 				*out++ = 96*(c - 0xA1) + c2 - (c2 >= 0x80 ? 0x41 : 0x40) + 0xE4C6;
11428 			} else {
11429 				unsigned int w = (c - 0x81)*192 + c2 - 0x40; /* Convert c, c2 into GB 2312 table lookup index */
11430 
11431 				/* For CP936 and GB18030, certain GB 2312 byte combinations are mapped to PUA codepoints,
11432 				 * whereas the same combinations aren't mapped to any codepoint for HZ and EUC-CN
11433 				 * To avoid duplicating the entire GB 2312 -> Unicode lookup table, we have three
11434 				 * auxiliary tables which are consulted instead for specific ranges of lookup indices */
11435 				if (w >= 0x192B) {
11436 					if (w <= 0x1EBE) {
11437 						*out++ = cp936_pua_tbl1[w - 0x192B];
11438 						continue;
11439 					} else if (w >= 0x413A) {
11440 						if (w <= 0x413E) {
11441 							*out++ = cp936_pua_tbl2[w - 0x413A];
11442 							continue;
11443 						} else if (w >= 0x5DD0 && w <= 0x5E20) {
11444 							*out++ = cp936_pua_tbl3[w - 0x5DD0];
11445 							continue;
11446 						}
11447 					}
11448 				}
11449 
11450 				ZEND_ASSERT(w < cp936_ucs_table_size);
11451 				*out++ = cp936_ucs_table[w];
11452 			}
11453 		} else {
11454 			*out++ = 0xF8F5;
11455 		}
11456 	}
11457 
11458 	*in_len = e - p;
11459 	*in = p;
11460 	return out - buf;
11461 }
11462 
/* Encode `len` Unicode codepoints from `in` as CP936 (GBK) into `buf`.
 *
 * Each codepoint becomes one byte (codes up to 0x80, and 0xFF) or two bytes, high byte
 * first. A codepoint with no mapping, other than U+0000, goes through MB_CONVERT_ERROR.
 * `end` is not used. */
static void mb_wchar_to_cp936(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	/* Two bytes per codepoint is the most this encoder writes for a mapped character */
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);

	while (len-- > 0) {
		uint32_t w = *in++;
		unsigned int code = 0;

		if (w >= ucs_a1_cp936_table_min && w < ucs_a1_cp936_table_max) {
			/* U+0000-U+0451 */
			code = ucs_a1_cp936_table[w - ucs_a1_cp936_table_min];
		} else if (w >= ucs_a2_cp936_table_min && w < ucs_a2_cp936_table_max) {
			/* U+2000-U+26FF, with three codepoints handled outside the table */
			switch (w) {
			case 0x203E:
				code = 0xA3FE;
				break;
			case 0x2218:
				code = 0xA1E3;
				break;
			case 0x223C:
				code = 0xA1AB;
				break;
			default:
				code = ucs_a2_cp936_table[w - ucs_a2_cp936_table_min];
				break;
			}
		} else if (w >= ucs_a3_cp936_table_min && w < ucs_a3_cp936_table_max) {
			/* U+2F00-U+33FF */
			code = ucs_a3_cp936_table[w - ucs_a3_cp936_table_min];
		} else if (w >= ucs_i_cp936_table_min && w < ucs_i_cp936_table_max) {
			/* U+4D00-9FFF CJK Unified Ideographs (+ Extension A) */
			code = ucs_i_cp936_table[w - ucs_i_cp936_table_min];
		} else if (w >= 0xE000 && w <= 0xE864) {
			/* Private Use Area */
			if (w < 0xE4C6) {
				/* U+E000-U+E4C5: 94 cells per lead byte */
				unsigned int row = (w - 0xE000) / 94;
				unsigned int col = (w - 0xE000) % 94;
				code = ((row < 0x6 ? row + 0xAA : row + 0xF2) << 8) | (col + 0xA1);
			} else if (w < 0xE766) {
				/* U+E4C6-U+E765: 96 cells per lead byte, trail byte 0x7F skipped */
				unsigned int row = (w - 0xE4C6) / 96;
				unsigned int col = (w - 0xE4C6) % 96;
				code = ((row + 0xA1) << 8) | (col + (col >= 0x3F ? 0x41 : 0x40));
			} else {
				/* U+E766-U+E864: binary search over rows read as
				 * { first codepoint, last codepoint, first byte pair } */
				unsigned int lo = 0;
				unsigned int hi = mbfl_cp936_pua_tbl_max;

				while (lo < hi) {
					unsigned int mid = (lo + hi) >> 1;

					if (w < mbfl_cp936_pua_tbl[mid][0]) {
						hi = mid;
					} else if (w > mbfl_cp936_pua_tbl[mid][1]) {
						lo = mid + 1;
					} else {
						code = w - mbfl_cp936_pua_tbl[mid][0] + mbfl_cp936_pua_tbl[mid][2];
						break;
					}
				}
			}
		} else if (w == 0xF8F5) {
			/* Encoded as the single byte 0xFF */
			code = 0xFF;
		} else if (w >= ucs_ci_cp936_table_min && w < ucs_ci_cp936_table_max) {
			/* U+F900-U+FA2F CJK Compatibility Ideographs */
			code = ucs_ci_cp936_table[w - ucs_ci_cp936_table_min];
		} else if (w >= ucs_cf_cp936_table_min && w < ucs_cf_cp936_table_max) {
			code = ucs_cf_cp936_table[w - ucs_cf_cp936_table_min];
		} else if (w >= ucs_sfv_cp936_table_min && w < ucs_sfv_cp936_table_max) {
			/* U+FE50-U+FE6F Small Form Variants */
			code = ucs_sfv_cp936_table[w - ucs_sfv_cp936_table_min];
		} else if (w >= ucs_hff_cp936_table_min && w < ucs_hff_cp936_table_max) {
			/* U+FF00-U+FFFF HW/FW Forms; the two exceptions come before the linear run */
			if (w == 0xFF04) {
				code = 0xA1E7;
			} else if (w == 0xFF5E) {
				code = 0xA1AB;
			} else if (w >= 0xFF01 && w <= 0xFF5D) {
				code = w - 0xFF01 + 0xA3A1;
			} else if (w >= 0xFFE0 && w <= 0xFFE5) {
				code = ucs_hff_s_cp936_table[w - 0xFFE0];
			}
		}

		if (code == 0 && w != 0) {
			/* No mapping found; after the error macro, re-reserve room for the
			 * codepoints that are still to come */
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_cp936);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
		} else if (code <= 0x80 || code == 0xFF) {
			/* Single-byte code; this also covers U+0000, which encodes as byte 0 */
			out = mb_convert_buf_add(out, code);
		} else {
			out = mb_convert_buf_add2(out, (code >> 8) & 0xFF, code & 0xFF);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
11562 
/* Codepoints for GB18030-2022 double-byte codes 0xFE50-0xFEA0, indexed by (trail byte - 0x50).
 * Only the non-zero entries are listed; every other slot is implicitly zero, and the
 * decoder treats a zero entry as "no entry here". The highest index (0x50) fixes the
 * table length at 81 entries. */
static const unsigned short gb18030_2022_pua_tbl3[] = {
	[0x01] = 0xE816, /* 0xFE51 */
	[0x02] = 0xE817, /* 0xFE52 */
	[0x03] = 0xE818, /* 0xFE53 */
	[0x09] = 0x9FB4, /* 0xFE59 */
	[0x11] = 0x9FB5, /* 0xFE61 */
	[0x16] = 0x9FB6, /* 0xFE66 */
	[0x17] = 0x9FB7, /* 0xFE67 */
	[0x1C] = 0xE831, /* 0xFE6C */
	[0x1D] = 0x9FB8, /* 0xFE6D */
	[0x26] = 0xE83B, /* 0xFE76 */
	[0x2E] = 0x9FB9, /* 0xFE7E */
	[0x40] = 0x9FBA, /* 0xFE90 */
	[0x41] = 0xE855, /* 0xFE91 */
	[0x50] = 0x9FBB, /* 0xFEA0 */
};
11578 
/* Decode GB18030-2022 bytes into Unicode codepoints.
 *
 * Reads from *in (at most *in_len bytes) and writes at most `bufsize` codepoints to `buf`.
 * On return *in and *in_len describe the input that has not been consumed yet.
 * Returns the number of codepoints written. `state` is not used.
 *
 * Input is handled as single bytes (< 0x80), 4-byte codes (recognised by a second byte in
 * 0x30-0x39 after a lead byte in 0x81-0x84 or 0x90-0xE3) or 2-byte codes; anything
 * malformed or unmapped yields MBFL_BAD_INPUT. */
static size_t mb_gb18030_2022_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
{
	unsigned char *p = *in, *e = p + *in_len;
	uint32_t *out = buf, *limit = buf + bufsize;

	while (p < e && out < limit) {
		unsigned char c = *p++;

		if (c < 0x80) {
			*out++ = c;
		} else if (c == 0x80 || c == 0xFF) {
			/* Unlike in the CP936 decoder, these two bytes are not valid on their own */
			*out++ = MBFL_BAD_INPUT;
		} else {
			if (p == e) {
				/* Lead byte at the very end of the input */
				*out++ = MBFL_BAD_INPUT;
				break;
			}
			unsigned char c2 = *p++;

			if (((c >= 0x81 && c <= 0x84) || (c >= 0x90 && c <= 0xE3)) && c2 >= 0x30 && c2 <= 0x39) {
				/* 4-byte code */
				if (p >= e) {
					*out++ = MBFL_BAD_INPUT;
					break;
				}
				unsigned char c3 = *p++;

				if (c3 >= 0x81 && c3 <= 0xFE && p < e) {
					unsigned char c4 = *p++;

					if (c4 >= 0x30 && c4 <= 0x39) {
						if (c >= 0x90 && c <= 0xE3) {
							/* Lead bytes 0x90-0xE3: linear mapping onto U+10000 and up;
							 * results past U+10FFFF are rejected */
							unsigned int w = ((((c - 0x90)*10 + (c2 - 0x30))*126 + (c3 - 0x81)))*10 + (c4 - 0x30) + 0x10000;
							*out++ = (w > 0x10FFFF) ? MBFL_BAD_INPUT : w;
						} else {
							/* Unicode BMP */
							/* `w` is the linear index of the 4-byte code (10 x 126 x 10 codes per lead byte).
							 * A handful of indices have dedicated codepoints and are tested first;
							 * the remainder are resolved through the range tables */
							unsigned int w = (((c - 0x81)*10 + (c2 - 0x30))*126 + (c3 - 0x81))*10 + (c4 - 0x30);
							if (w == 0x98A4) {
								*out++ = 0xE78D;
							} else if (w == 0x98A6) {
								*out++ = 0xE78E;
							} else if (w == 0x98A5) {
								*out++ = 0xE78F;
							} else if (w >= 0x98A7 && w <= 0x98AD) {
								*out++ = w + (0xE790 - 0x98A7);
							} else if (w == 0x1D21) {
								*out++ = 0xE7C7;
							} else if (w == 0x4A71) {
								*out++ = 0xE81E;
							} else if (w == 0x4A72) {
								*out++ = 0xE826;
							} else if (w >= 0x4A73 && w <= 0x4A74) {
								*out++ = w + (0xE82B - 0x4A73);
							} else if (w == 0x4A75) {
								*out++ = 0xE832;
							} else if (w == 0x4A76) {
								*out++ = 0xE843;
							} else if (w == 0x4A77) {
								*out++ = 0xE854;
							} else if (w == 0x4A78) {
								*out++ = 0xE864;
							} else if (w <= 0x99FB) {
								/* NOTE(review): the search result is used as an index without checking for
								 * a negative value; presumably every w <= 0x99FB lies inside some range of
								 * mbfl_gb2uni_tbl — confirm against the table */
								*out++ = w + mbfl_gb_uni_ofst[mbfl_bisec_srch(w, mbfl_gb2uni_tbl, mbfl_gb_uni_max)];
							} else {
								*out++ = MBFL_BAD_INPUT;
							}
						}
					} else {
						/* Fourth byte is not a digit byte */
						*out++ = MBFL_BAD_INPUT;
					}
				} else {
					/* Third byte out of range, or the input ended before the fourth byte */
					*out++ = MBFL_BAD_INPUT;
				}
			} else if (((c >= 0xAA && c <= 0xAF) || (c >= 0xF8 && c <= 0xFE)) && (c2 >= 0xA1 && c2 <= 0xFE)) {
				/* UDA part 1, 2: U+E000-U+E4C5 */
				*out++ = 94*(c >= 0xF8 ? c - 0xF2 : c - 0xAA) + (c2 - 0xA1) + 0xE000;
			} else if (c >= 0xA1 && c <= 0xA7 && c2 >= 0x40 && c2 < 0xA1 && c2 != 0x7F) {
				/* UDA part 3: U+E4C6-U+E765 */
				*out++ = 96*(c - 0xA1) + c2 - (c2 >= 0x80 ? 0x41 : 0x40) + 0xE4C6;
			} else if (c2 >= 0x40 && c2 != 0x7F && c2 != 0xFF) {
				/* Index into the 2-byte lookup table: 192 slots per lead byte */
				unsigned int w = (c - 0x81)*192 + c2 - 0x40;

				/* Three index ranges are first tried against auxiliary tables. Where a range
				 * excludes an index or its table entry is zero, control falls through to the
				 * main table lookup below */
				if (w >= 0x192B) {
					if (w <= 0x1EBE) {
						if (w != 0x1963 && w != 0x1DBF && (w < 0x1E49 || w > 0x1E55) && w != 0x1E7F) {
							*out++ = gb18030_2022_pua_tbl1[w - 0x192B];
							continue;
						}
					} else if (w >= 0x413A) {
						if (w <= 0x413E) {
							*out++ = cp936_pua_tbl2[w - 0x413A];
							continue;
						} else if (w >= 0x5DD0 && w <= 0x5E20) {
							/* This local `c` shadows the lead byte only inside this block */
							unsigned int c = gb18030_2022_pua_tbl3[w - 0x5DD0];
							if (c) {
								*out++ = c;
								continue;
							}
						}
					}
				}

				/* Only these lead/trail combinations are looked up in the main table */
				if ((c >= 0x81 && c <= 0xA9) || (c >= 0xB0 && c <= 0xF7 && c2 >= 0xA1) || (c >= 0xAA && c <= 0xFE && c2 <= 0xA0)) {
					ZEND_ASSERT(w < cp936_ucs_table_size);
					*out++ = cp936_ucs_table[w];
				} else {
					*out++ = MBFL_BAD_INPUT;
				}
			} else {
				/* Second byte cannot be a trail byte */
				*out++ = MBFL_BAD_INPUT;
			}
		}
	}

	*in_len = e - p;
	*in = p;
	return out - buf;
}
11696 
/* Encode `len` Unicode codepoints from `in` as GB18030-2022 into `buf`.
 *
 * Each codepoint becomes one byte (code < 0x80), two bytes, or four bytes. The lookup
 * order is: the CP936 tables (with GB18030-2022 specific overrides), then the table of
 * codepoints whose mapping differs from CP936, then the 4-byte range tables for the BMP,
 * and finally the linear 4-byte mapping for U+10000-U+10FFFF. A codepoint for which
 * nothing is found goes through MB_CONVERT_ERROR. `end` is not used. */
static void mb_wchar_to_gb18030_2022(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	/* One byte per codepoint is reserved up front; longer codes reserve extra room below */
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	while (len--) {
		uint32_t w = *in++;
		unsigned int s = 0;

		if (w == 0) {
			/* U+0000 is emitted directly, because s == 0 means "no mapping" below */
			out = mb_convert_buf_add(out, 0);
			continue;
		} else if (w >= ucs_a1_cp936_table_min && w < ucs_a1_cp936_table_max) {
			if (w == 0x1F9) {
				s = 0xA8BF;
			} else {
				s = ucs_a1_cp936_table[w - ucs_a1_cp936_table_min];
			}
		} else if (w >= ucs_a2_cp936_table_min && w < ucs_a2_cp936_table_max) {
			if (w == 0x20AC) { /* Euro sign */
				s = 0xA2E3;
			} else {
				s = ucs_a2_cp936_table[w - ucs_a2_cp936_table_min];
			}
		} else if (w >= ucs_a3_cp936_table_min && w < ucs_a3_cp936_table_max) {
			s = ucs_a3_cp936_table[w - ucs_a3_cp936_table_min];
		} else if (w >= 0x9FB4 && w <= 0x9FBB) {
			/* Newly mapped in GB18030-2022 */
			if (w == 0x9FB4) {
				s = 0xFE59;
			} else if (w == 0x9FB5) {
				s = 0xFE61;
			} else if (w == 0x9FB6) {
				s = 0xFE66;
			} else if (w == 0x9FB7) {
				s = 0xFE67;
			} else if (w == 0x9FB8) {
				s = 0xFE6D;
			} else if (w == 0x9FB9) {
				s = 0xFE7E;
			} else if (w == 0x9FBA) {
				s = 0xFE90;
			} else {
				s = 0xFEA0;
			}
		} else if (w >= ucs_i_cp936_table_min && w < ucs_i_cp936_table_max) {
			s = ucs_i_cp936_table[w - ucs_i_cp936_table_min];
		} else if (w >= ucs_ci_cp936_table_min && w < ucs_ci_cp936_table_max) {
			/* U+F900-U+FA2F CJK Compatibility Ideographs */
			if (w == 0xF92C) {
				s = 0xFD9C;
			} else if (w == 0xF979) {
				s = 0xFD9D;
			} else if (w == 0xF995) {
				s = 0xFD9E;
			} else if (w == 0xF9E7) {
				s = 0xFD9F;
			} else if (w == 0xF9F1) {
				s = 0xFDA0;
			} else if (w >= 0xFA0C && w <= 0xFA29) {
				s = ucs_ci_s_cp936_table[w - 0xFA0C];
			}
		} else if (w >= ucs_cf_cp936_table_min && w < ucs_cf_cp936_table_max) {
			/* CJK Compatibility Forms  */
			s = ucs_cf_cp936_table[w - ucs_cf_cp936_table_min];
		} else if (w >= ucs_sfv_cp936_table_min && w < ucs_sfv_cp936_table_max) {
			/* U+FE50-U+FE6F Small Form Variants */
			s = ucs_sfv_cp936_table[w - ucs_sfv_cp936_table_min];
		} else if (w >= ucs_hff_cp936_table_min && w < ucs_hff_cp936_table_max) {
			/* U+FF00-U+FFFF HW/FW Forms */
			if (w == 0xFF04) {
				s = 0xA1E7;
			} else if (w == 0xFF5E) {
				s = 0xA1AB;
			} else if (w >= 0xFF01 && w <= 0xFF5D) {
				s = w - 0xFF01 + 0xA3A1;
			} else if (w >= 0xFFE0 && w <= 0xFFE5) {
				s = ucs_hff_s_cp936_table[w - 0xFFE0];
			}
		} else if (w >= 0xE000 && w <= 0xE864) {
			/* PUA */
			if (w < 0xE766) {
				if (w < 0xE4C6) {
					/* U+E000-U+E4C5: 94 cells per lead byte */
					unsigned int c1 = w - 0xE000;
					s = (c1 % 94) + 0xA1;
					c1 /= 94;
					s |= (c1 + (c1 < 0x06 ? 0xAA : 0xF2)) << 8;
				} else {
					/* U+E4C6-U+E765: 96 cells per lead byte, trail byte 0x7F skipped */
					unsigned int c1 = w - 0xE4C6;
					s = ((c1 / 96) + 0xA1) << 8;
					c1 %= 96;
					s |= c1 + (c1 >= 0x3F ? 0x41 : 0x40);
				}
			} else {
				/* U+E766-U+E864 */
				/* Binary search over rows read as { first codepoint, last codepoint, first code };
				 * s stays 0 when no row contains w */
				unsigned int k1 = 0, k2 = mbfl_gb18030_2022_pua_tbl_max;
				while (k1 < k2) {
					unsigned int k = (k1 + k2) >> 1;
					if (w < mbfl_gb18030_2022_pua_tbl[k][0]) {
						k2 = k;
					} else if (w > mbfl_gb18030_2022_pua_tbl[k][1]) {
						k1 = k + 1;
					} else {
						s = w - mbfl_gb18030_2022_pua_tbl[k][0] + mbfl_gb18030_2022_pua_tbl[k][2];
						break;
					}
				}
			}
		} else if (w >= 0xFE10 && w <= 0xFE19) {
			/* Newly mapped codepoints in GB18030-2022 */
			if (w == 0xFE11) {
				s = 0xA6DB;
			} else if (w == 0xFE12) {
				s = 0xA6DA;
			} else if (w <= 0xFE16) {
				s = w - (0xFE10 - 0xA6D9);
			} else if (w <= 0xFE18) {
				s = w - (0xFE17 - 0xA6EC);
			} else {
				s = 0xA6F3;
			}
		} else if (w == 0x1E3F) {
			/* Newly mapped codepoint in GB18030-2022 */
			s = 0xA8BC;
		}

		/* While GB18030 and CP936 are very similar, some mappings are different between these encodings;
		 * do a binary search in a table of differing codepoints to see if we have one */
		if (!s && w >= mbfl_gb18030_c_tbl_key[0] && w <= mbfl_gb18030_c_tbl_key[mbfl_gb18030_c_tbl_max-1]) {
			int i = mbfl_bisec_srch2(w, mbfl_gb18030_c_tbl_key, mbfl_gb18030_c_tbl_max);
			if (i >= 0) {
				s = mbfl_gb18030_c_tbl_val[i];
			}
		}

		/* If we have not yet found a suitable mapping for this codepoint, it requires a 4-byte code */
		if (!s && w >= 0x80 && w <= 0xFFFF) {
			/* BMP */
			int i = mbfl_bisec_srch(w, mbfl_uni2gb2022_tbl, mbfl_gb2022_uni_max);
			if (i >= 0) {
				/* Split the linear index into digit / 0x81-based / digit / 0x81-based bytes,
				 * least significant byte first */
				unsigned int c1 = w - mbfl_gb2022_uni_ofst[i];
				s = (c1 % 10) + 0x30;
				c1 /= 10;
				s |= ((c1 % 126) + 0x81) << 8;
				c1 /= 126;
				s |= ((c1 % 10) + 0x30) << 16;
				c1 /= 10;
				s |= (c1 + 0x81) << 24;
			}
		} else if (w >= 0x10000 && w <= 0x10FFFF) {
			/* Code set 3: Unicode U+10000-U+10FFFF */
			unsigned int c1 = w - 0x10000;
			s = (c1 % 10) + 0x30;
			c1 /= 10;
			s |= ((c1 % 126) + 0x81) << 8;
			c1 /= 126;
			s |= ((c1 % 10) + 0x30) << 16;
			c1 /= 10;
			s |= (c1 + 0x90) << 24;
		}

		if (!s) {
			/* No mapping. The converter named here is this function itself rather than the
			 * GB18030 (non-2022) encoder, whose mappings differ for some codepoints;
			 * presumably the error macro uses it to encode whatever replacement it emits —
			 * confirm against MB_CONVERT_ERROR */
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_gb18030_2022);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		} else if (s < 0x80) {
			out = mb_convert_buf_add(out, s);
		} else if (s > 0xFFFFFF) {
			/* 4-byte code: room for it plus one byte per codepoint still to come */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
			out = mb_convert_buf_add4(out, (s >> 24) & 0xFF, (s >> 16) & 0xFF, (s >> 8) & 0xFF, s & 0xFF);
		} else {
			/* 2-byte code: room for it plus one byte per codepoint still to come */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			out = mb_convert_buf_add2(out, (s >> 8) & 0xFF, s & 0xFF);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
11875 
11876 /* Step through a GB18030 string one character at a time. Find the last position at or
11877  * before `limit` which falls directly after the end of a (single or multi-byte) character */
step_through_gb18030_str(unsigned char * p,unsigned char * limit)11878 static zend_always_inline unsigned char* step_through_gb18030_str(unsigned char *p, unsigned char *limit)
11879 {
11880 	while (p < limit) {
11881 		unsigned char c = *p;
11882 		if (c < 0x81 || c == 0xFF) {
11883 			p++;
11884 		} else {
11885 			if (limit - p == 1) {
11886 				break;
11887 			}
11888 			unsigned char c2 = p[1];
11889 			/* For a 4-byte char, the 2nd byte will be 0x30-0x39 */
11890 			unsigned int w = (c2 >= 0x30 && c2 <= 0x39) ? 4 : 2;
11891 			if (limit - p < w) {
11892 				break;
11893 			}
11894 			p += w;
11895 		}
11896 	}
11897 	return p;
11898 }
11899 
mb_cut_gb18030(unsigned char * str,size_t from,size_t len,unsigned char * end)11900 static zend_string* mb_cut_gb18030(unsigned char *str, size_t from, size_t len, unsigned char *end)
11901 {
11902 	ZEND_ASSERT(str + from <= end);
11903 	unsigned char *start = step_through_gb18030_str(str, str + from);
11904 	if (str + from + len > end) {
11905 		len = (end - str) - from;
11906 	}
11907 	if (start + len >= end) {
11908 		return zend_string_init_fast((const char*)start, end - start);
11909 	} else {
11910 		unsigned char *_end = step_through_gb18030_str(start, start + len);
11911 		return zend_string_init_fast((const char*)start, _end - start);
11912 	}
11913 }
11914 
/* Alias names listed for GB18030; the list is NULL-terminated */
static const char *mbfl_encoding_gb18030_aliases[] = {"gb-18030", "gb-18030-2000", NULL};

/* Legacy filter table for the GB18030 -> wchar direction: registers
 * mbfl_filt_conv_gb18030_wchar together with its dedicated flush function */
static const struct mbfl_convert_vtbl vtbl_gb18030_wchar = {
	mbfl_no_encoding_gb18030,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_gb18030_wchar,
	mbfl_filt_conv_gb18030_wchar_flush,
	NULL,
};

/* Legacy filter table for the wchar -> GB18030 direction: registers
 * mbfl_filt_conv_wchar_gb18030 with the common flush function */
static const struct mbfl_convert_vtbl vtbl_wchar_gb18030 = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_gb18030,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_gb18030,
	mbfl_filt_conv_common_flush,
	NULL,
};

/* Encoding descriptor for GB18030. The struct layout is declared elsewhere; the per-field
 * notes below are inferred from the values and should be confirmed against mbfl_encoding. */
const mbfl_encoding mbfl_encoding_gb18030 = {
	mbfl_no_encoding_gb18030,
	"GB18030",
	"GB18030",
	mbfl_encoding_gb18030_aliases,
	NULL, /* CP936 places mblen_table_81_to_fe in this position; nothing is supplied here */
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_gb18030_wchar,
	&vtbl_wchar_gb18030,
	mb_gb18030_to_wchar, /* buffer-based decoder */
	mb_wchar_to_gb18030, /* buffer-based encoder */
	NULL,
	mb_cut_gb18030, /* character-boundary-aware byte cutting */
};
11951 
/* Alias names listed for CP936; the list is NULL-terminated */
static const char *mbfl_encoding_cp936_aliases[] = {"CP-936", "GBK", NULL};

/* Legacy filter table for the CP936 -> wchar direction: registers
 * mbfl_filt_conv_cp936_wchar together with its flush function, which reports a
 * truncated double-byte character */
static const struct mbfl_convert_vtbl vtbl_cp936_wchar = {
	mbfl_no_encoding_cp936,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_cp936_wchar,
	mbfl_filt_conv_cp936_wchar_flush,
	NULL,
};

/* Legacy filter table for the wchar -> CP936 direction: registers
 * mbfl_filt_conv_wchar_cp936 with the common flush function */
static const struct mbfl_convert_vtbl vtbl_wchar_cp936 = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_cp936,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_cp936,
	mbfl_filt_conv_common_flush,
	NULL,
};

/* Encoding descriptor for CP936. The struct layout is declared elsewhere; the per-field
 * notes below are inferred from the values and should be confirmed against mbfl_encoding. */
const mbfl_encoding mbfl_encoding_cp936 = {
	mbfl_no_encoding_cp936,
	"CP936",
	"CP936",
	mbfl_encoding_cp936_aliases,
	mblen_table_81_to_fe, /* presumably a per-lead-byte character length table — confirm */
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_cp936_wchar,
	&vtbl_wchar_cp936,
	mb_cp936_to_wchar, /* buffer-based decoder */
	mb_wchar_to_cp936, /* buffer-based encoder */
	NULL,
	NULL, /* the GB18030 descriptors place mb_cut_gb18030 in this position; CP936 supplies nothing */
};
11988 
/* Encoding descriptor for GB18030-2022. The struct layout is declared elsewhere; the
 * per-field notes below are inferred from the values and should be confirmed against
 * mbfl_encoding. Unlike GB18030 and CP936, no aliases and no legacy filter tables are
 * supplied; only the buffer-based converters are. */
const mbfl_encoding mbfl_encoding_gb18030_2022 = {
	mbfl_no_encoding_gb18030_2022,
	"GB18030-2022",
	"GB18030-2022",
	NULL, /* no alias list */
	NULL,
	MBFL_ENCTYPE_GL_UNSAFE,
	NULL, /* GB18030 and CP936 place a pointer to their "-> wchar" filter table here */
	NULL, /* GB18030 and CP936 place a pointer to their "wchar ->" filter table here */
	mb_gb18030_2022_to_wchar, /* buffer-based decoder */
	mb_wchar_to_gb18030_2022, /* buffer-based encoder */
	NULL,
	mb_cut_gb18030, /* same cutting routine as GB18030 */
};
12003 
12004 /*
12005  * BIG5/CP950
12006  */
12007 
/* CP950 byte pairs that map to Private Use Area codepoints.
 * Each row is { first codepoint, last codepoint, first byte pair, last byte pair }.
 *
 * A lead byte covers 63 + 94 = 157 trail bytes (0x40-0x7E and 0xA1-0xFE), or only the
 * upper 94 (0xA1-0xFE) in the row whose byte pairs start at 0xC6A1.
 *
 * The table is only ever read, so it is declared const. */
static const unsigned short cp950_pua_tbl[][4] = {
	{0xe000, 0xe310, 0xfa40, 0xfefe},
	{0xe311, 0xeeb7, 0x8e40, 0xa0fe},
	{0xeeb8, 0xf6b0, 0x8140, 0x8dfe},
	{0xf6b1, 0xf70e, 0xc6a1, 0xc6fe},
	{0xf70f, 0xf848, 0xc740, 0xc8fe},
};
12016 
/* Report whether the byte pair (lead byte `c1`, trail byte `c`) lies in one of the CP950
 * areas that are mapped to Private Use Area codepoints. Returns 1 if so, 0 otherwise.
 *
 * Lead byte 0xC6 qualifies only with trail bytes 0xA1-0xFE; lead bytes 0x81-0xA0,
 * 0xC7-0xC8 and 0xFA-0xFE qualify with trail bytes 0x40-0x7E as well. */
static inline int is_in_cp950_pua(int c1, int c)
{
	int upper_trail = (c >= 0xa1 && c <= 0xfe);

	if (c1 == 0xc6) {
		return upper_trail;
	}

	int full_row_lead = (c1 >= 0x81 && c1 <= 0xa0) ||
		(c1 >= 0xc7 && c1 <= 0xc8) ||
		(c1 >= 0xfa && c1 <= 0xfe);
	if (!full_row_lead) {
		return 0;
	}

	return upper_trail || (c >= 0x40 && c <= 0x7e);
}
12026 
/* Filter callback which consumes one byte `c` of Big5 or CP950 input and emits wchars
 * through filter->output_function.
 *
 * filter->status is 0 between characters and 1 while the lead byte of a double-byte
 * character is held in filter->cache. The CP950-specific lead byte range, PUA mapping
 * and per-character overrides apply only when filter->from is CP950.
 *
 * Returns 0 unless one of the CK() wrapped output calls bails out early. */
static int mbfl_filt_conv_big5_wchar(int c, mbfl_convert_filter *filter)
{
	int row, lead, w;

	switch (filter->status) {
	case 0:
		if (c >= 0 && c < 0x80) { /* latin */
			CK((*filter->output_function)(c, filter->data));
			break;
		}

		/* Lead byte: CP950 accepts 0x81-0xFE; plain Big5 accepts 0xA1-0xF9 except 0xC8 */
		if ((filter->from->no_encoding == mbfl_no_encoding_cp950)
				? (c > 0x80 && c <= 0xFE)
				: (c > 0xA0 && c <= 0xF9 && c != 0xC8)) {
			filter->cache = c;
			filter->status = 1;
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 1: /* dbcs second byte */
		lead = filter->cache;
		filter->status = 0;

		/* Valid trail bytes are 0x40-0x7E and 0xA1-0xFE */
		if (!((c >= 0x40 && c <= 0x7e) || (c >= 0xa1 && c <= 0xfe))) {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			break;
		}

		/* Each lead byte owns 157 table slots: 63 for trail bytes 0x40-0x7E,
		 * followed by 94 for trail bytes 0xA1-0xFE */
		w = (lead - 0xa1)*157 + ((c <= 0x7e) ? (c - 0x40) : (c - 0xa1 + 0x3f));
		w = (w >= 0 && w < big5_ucs_table_size) ? big5_ucs_table[w] : 0;

		if (filter->from->no_encoding == mbfl_no_encoding_cp950) {
			if (is_in_cp950_pua(lead, c)) {
				/* PUA for CP950 */
				int code = (lead << 8) | c;
				const unsigned short *range;

				for (row = 0; row < sizeof(cp950_pua_tbl) / (sizeof(unsigned short)*4); row++) {
					if (code >= cp950_pua_tbl[row][2] && code <= cp950_pua_tbl[row][3]) {
						break;
					}
				}

				range = cp950_pua_tbl[row];
				if ((range[2] & 0xff) == 0x40) {
					/* Range spans whole lead bytes, each with 157 trail bytes */
					w = range[0] + 157*(lead - (range[2] >> 8)) + (c - (c >= 0xa1 ? 0x62 : 0x40));
				} else {
					/* Range is one contiguous run of codes */
					w = range[0] + (code - range[2]);
				}
			} else if (lead == 0xA1) {
				/* CP950 overrides for the generic Big5 table */
				switch (c) {
				case 0x45: w = 0x2027; break;
				case 0x4E: w = 0xFE51; break;
				case 0x5A: w = 0x2574; break;
				case 0xC2: w = 0x00AF; break;
				case 0xC3: w = 0xFFE3; break;
				case 0xC5: w = 0x02CD; break;
				case 0xE3: w = 0xFF5E; break;
				case 0xF2: w = 0x2295; break;
				case 0xF3: w = 0x2299; break;
				case 0xFE: w = 0xFF0F; break;
				}
			} else if (lead == 0xA2) {
				switch (c) {
				case 0x40: w = 0xFF3C; break;
				case 0x41: w = 0x2215; break;
				case 0x42: w = 0xFE68; break;
				case 0x46: w = 0xFFE0; break;
				case 0x47: w = 0xFFE1; break;
				case 0xCC: w = 0x5341; break;
				case 0xCE: w = 0x5345; break;
				}
			}
		}

		if (w <= 0) {
			w = MBFL_BAD_INPUT;
		}
		CK((*filter->output_function)(w, filter->data));
		break;

		EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
12132 
/* Flush callback for the Big5/CP950 -> wchar filter: reports a dangling lead byte as
 * bad input, then invokes the downstream flush function if one is set. */
static int mbfl_filt_conv_big5_wchar_flush(mbfl_convert_filter *filter)
{
	/* status 1 means a lead byte was consumed but its trail byte never arrived */
	const int truncated = (filter->status == 1);

	if (truncated) {
		filter->status = 0;
		CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
	}

	if (filter->flush_function != NULL) {
		filter->flush_function(filter->data);
	}

	return 0;
}
12147 
/* Filter callback which encodes one wchar `c` as Big5 or CP950 bytes and passes them to
 * filter->output_function.
 *
 * The generic Big5 tables are consulted first; when filter->to is CP950, the PUA
 * mapping and a set of per-codepoint overrides are applied on top (an override of 0
 * makes the codepoint unencodable). Codepoints with no encoding are handed to
 * mbfl_filt_conv_illegal_output.
 *
 * Returns 0 unless one of the CK() wrapped calls bails out early. */
static int mbfl_filt_conv_wchar_big5(int c, mbfl_convert_filter *filter)
{
	int row, s = 0;

	if (c >= ucs_a1_big5_table_min && c < ucs_a1_big5_table_max) {
		s = ucs_a1_big5_table[c - ucs_a1_big5_table_min];
	} else if (c >= ucs_a2_big5_table_min && c < ucs_a2_big5_table_max) {
		s = ucs_a2_big5_table[c - ucs_a2_big5_table_min];
	} else if (c >= ucs_a3_big5_table_min && c < ucs_a3_big5_table_max) {
		s = ucs_a3_big5_table[c - ucs_a3_big5_table_min];
	} else if (c >= ucs_i_big5_table_min && c < ucs_i_big5_table_max) {
		s = ucs_i_big5_table[c - ucs_i_big5_table_min];
	} else if (c >= ucs_r1_big5_table_min && c < ucs_r1_big5_table_max) {
		s = ucs_r1_big5_table[c - ucs_r1_big5_table_min];
	} else if (c >= ucs_r2_big5_table_min && c < ucs_r2_big5_table_max) {
		s = ucs_r2_big5_table[c - ucs_r2_big5_table_min];
	}

	if (filter->to->no_encoding == mbfl_no_encoding_cp950) {
		if (c >= 0xe000 && c <= 0xf848) { /* PUA for CP950 */
			for (row = 0; row < sizeof(cp950_pua_tbl) / (sizeof(unsigned short)*4); row++) {
				if (c <= cp950_pua_tbl[row][1]) {
					break;
				}
			}

			const unsigned short *range = cp950_pua_tbl[row];
			int offset = c - range[0];

			if ((range[2] & 0xff) == 0x40) {
				/* 157 codes per lead byte; trail bytes skip the gap between 0x7E and 0xA1 */
				int lead = (range[2] >> 8) + (offset / 157);
				int trail = offset % 157;
				trail += (trail >= 0x3f) ? 0x62 : 0x40;
				s = (lead << 8) | trail;
			} else {
				s = offset + range[2];
			}
		} else {
			switch (c) {
			/* Codepoints which CP950 does not encode, whatever the Big5 tables say */
			case 0x00A2:
			case 0x00A3:
			case 0x0401:
			case 0x0451:
			case 0x2022:
			case 0x203E:
			case 0x223C:
			case 0x2609:
			case 0x2641:
			case 0x3005:
			case 0xFF64:
				s = 0;
				break;

			/* Codepoints which CP950 encodes differently from the Big5 tables */
			case 0x00AF: s = 0xA1C2; break;
			case 0x02CD: s = 0xA1C5; break;
			case 0x2027: s = 0xA145; break;
			case 0x2215: s = 0xA241; break;
			case 0x2295: s = 0xA1F2; break;
			case 0x2299: s = 0xA1F3; break;
			case 0x2574: s = 0xA15A; break;
			case 0xFE51: s = 0xA14E; break;
			case 0xFE68: s = 0xA242; break;
			case 0xFF3C: s = 0xA240; break;
			case 0xFF5E: s = 0xA1E3; break;
			case 0xFFE0: s = 0xA246; break;
			case 0xFFE1: s = 0xA247; break;
			case 0xFFE3: s = 0xA1C3; break;
			case 0xFF0F: s = 0xA1FE; break;

			default:
				/* Whole ranges which CP950 does not encode */
				if ((c >= 0x0414 && c <= 0x041C) || (c >= 0x0423 && c <= 0x044F) ||
						(c >= 0x2460 && c <= 0x247D) || (c >= 0x302A && c <= 0x30FF)) {
					s = 0;
				}
				break;
			}
		}
	}

	if (s <= 0) {
		if (c != 0) {
			/* No encoding available */
			CK(mbfl_filt_conv_illegal_output(c, filter));
			return 0;
		}
		/* U+0000 is emitted as a single zero byte */
		s = 0;
	}

	if (s <= 0x80) { /* latin */
		CK((*filter->output_function)(s, filter->data));
	} else {
		CK((*filter->output_function)((s >> 8) & 0xff, filter->data));
		CK((*filter->output_function)(s & 0xff, filter->data));
	}

	return 0;
}
12265 
mb_big5_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)12266 static size_t mb_big5_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
12267 {
12268 	unsigned char *p = *in, *e = p + *in_len;
12269 	uint32_t *out = buf, *limit = buf + bufsize;
12270 
12271 	e--; /* Stop the main loop 1 byte short of the end of the input */
12272 
12273 	while (p < e && out < limit) {
12274 		unsigned char c = *p++;
12275 
12276 		if (c <= 0x7F) {
12277 			*out++ = c;
12278 		} else if (c > 0xA0 && c <= 0xF9) {
12279 			/* We don't need to check p < e here; it's not possible that this pointer dereference
12280 			 * will be outside the input string, because of e-- above */
12281 			unsigned char c2 = *p++;
12282 
12283 			if ((c2 >= 0x40 && c2 <= 0x7E) || (c2 >= 0xA1 && c2 <= 0xFE)) {
12284 				unsigned int w = (c - 0xA1)*157 + c2 - ((c2 <= 0x7E) ? 0x40 : 0xA1 - 0x3F);
12285 				ZEND_ASSERT(w < big5_ucs_table_size);
12286 				w = big5_ucs_table[w];
12287 				if (!w) {
12288 					if (c == 0xC8) {
12289 						p--;
12290 					}
12291 					w = MBFL_BAD_INPUT;
12292 				}
12293 				*out++ = w;
12294 			} else {
12295 				*out++ = MBFL_BAD_INPUT;
12296 			}
12297 		} else {
12298 			*out++ = MBFL_BAD_INPUT;
12299 		}
12300 	}
12301 
12302 	/* Finish up last byte of input string if there is one */
12303 	if (p == e && out < limit) {
12304 		unsigned char c = *p++;
12305 		*out++ = (c <= 0x7F) ? c : MBFL_BAD_INPUT;
12306 	}
12307 
12308 	*in_len = e - p + 1;
12309 	*in = p;
12310 	return out - buf;
12311 }
12312 
/* Encodes `len` wchars from `in` as Big5 and appends the bytes to `buf`.
 * Codepoints with no Big5 encoding (other than U+0000, which is written as a zero byte)
 * go through MB_CONVERT_ERROR. `end` is not used. */
static void mb_wchar_to_big5(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	while (len--) {
		uint32_t cp = *in++;
		unsigned int code = 0;

		if (cp >= ucs_a1_big5_table_min && cp < ucs_a1_big5_table_max) {
			code = ucs_a1_big5_table[cp - ucs_a1_big5_table_min];
		} else if (cp >= ucs_a2_big5_table_min && cp < ucs_a2_big5_table_max) {
			code = ucs_a2_big5_table[cp - ucs_a2_big5_table_min];
		} else if (cp >= ucs_a3_big5_table_min && cp < ucs_a3_big5_table_max) {
			code = ucs_a3_big5_table[cp - ucs_a3_big5_table_min];
		} else if (cp >= ucs_i_big5_table_min && cp < ucs_i_big5_table_max) {
			code = ucs_i_big5_table[cp - ucs_i_big5_table_min];
		} else if (cp >= ucs_r1_big5_table_min && cp < ucs_r1_big5_table_max) {
			code = ucs_r1_big5_table[cp - ucs_r1_big5_table_min];
		} else if (cp >= ucs_r2_big5_table_min && cp < ucs_r2_big5_table_max) {
			code = ucs_r2_big5_table[cp - ucs_r2_big5_table_min];
		}

		if (code > 0x80) {
			/* Double-byte code: one byte more than the initial estimate of 1 per wchar */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			out = mb_convert_buf_add2(out, (code >> 8) & 0xFF, code & 0xFF);
		} else if (code != 0) {
			out = mb_convert_buf_add(out, code);
		} else if (cp == 0) {
			out = mb_convert_buf_add(out, 0);
		} else {
			MB_CONVERT_ERROR(buf, out, limit, cp, mb_wchar_to_big5);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
12354 
mb_cp950_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)12355 static size_t mb_cp950_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
12356 {
12357 	unsigned char *p = *in, *e = p + *in_len;
12358 	uint32_t *out = buf, *limit = buf + bufsize;
12359 
12360 	while (p < e && out < limit) {
12361 		unsigned char c = *p++;
12362 
12363 		if (c <= 0x7F) {
12364 			*out++ = c;
12365 		} else if (c > 0x80 && c <= 0xFE && p < e) {
12366 			unsigned char c2 = *p++;
12367 
12368 			if ((c2 >= 0x40 && c2 <= 0x7E) || (c2 >= 0xA1 && c2 <= 0xFE)) {
12369 				unsigned int w = ((c - 0xA1)*157) + c2 - ((c2 <= 0x7E) ? 0x40 : 0xA1 - 0x3F);
12370 				w = (w < big5_ucs_table_size) ? big5_ucs_table[w] : 0;
12371 
12372 				/* PUA for CP950 */
12373 				if (is_in_cp950_pua(c, c2)) {
12374 					unsigned int s = (c << 8) | c2;
12375 
12376 					int k;
12377 					for (k = 0; k < sizeof(cp950_pua_tbl) / (sizeof(unsigned short)*4); k++) {
12378 						if (s >= cp950_pua_tbl[k][2] && s <= cp950_pua_tbl[k][3]) {
12379 							break;
12380 						}
12381 					}
12382 
12383 					if ((cp950_pua_tbl[k][2] & 0xFF) == 0x40) {
12384 						w = 157*(c - (cp950_pua_tbl[k][2] >> 8)) + c2 - (c2 >= 0xA1 ? 0x62 : 0x40) + cp950_pua_tbl[k][0];
12385 					} else {
12386 						w = s - cp950_pua_tbl[k][2] + cp950_pua_tbl[k][0];
12387 					}
12388 				} else if (c == 0xA1) {
12389 					if (c2 == 0x45) {
12390 						w = 0x2027;
12391 					} else if (c2 == 0x4E) {
12392 						w = 0xFE51;
12393 					} else if (c2 == 0x5A) {
12394 						w = 0x2574;
12395 					} else if (c2 == 0xC2) {
12396 						w = 0x00AF;
12397 					} else if (c2 == 0xC3) {
12398 						w = 0xFFE3;
12399 					} else if (c2 == 0xC5) {
12400 						w = 0x02CD;
12401 					} else if (c2 == 0xE3) {
12402 						w = 0xFF5E;
12403 					} else if (c2 == 0xF2) {
12404 						w = 0x2295;
12405 					} else if (c2 == 0xF3) {
12406 						w = 0x2299;
12407 					} else if (c2 == 0xFE) {
12408 						w = 0xFF0F;
12409 					}
12410 				} else if (c == 0xA2) {
12411 					if (c2 == 0x40) {
12412 						w = 0xFF3C;
12413 					} else if (c2 == 0x41) {
12414 						w = 0x2215;
12415 					} else if (c2 == 0x42) {
12416 						w = 0xFE68;
12417 					} else if (c2 == 0x46) {
12418 						w = 0xFFE0;
12419 					} else if (c2 == 0x47) {
12420 						w = 0xFFE1;
12421 					} else if (c2 == 0xCC) {
12422 						w = 0x5341;
12423 					} else if (c2 == 0xCE) {
12424 						w = 0x5345;
12425 					}
12426 				}
12427 
12428 				if (!w)
12429 					w = MBFL_BAD_INPUT;
12430 				*out++ = w;
12431 			} else {
12432 				*out++ = MBFL_BAD_INPUT;
12433 			}
12434 		} else {
12435 			*out++ = MBFL_BAD_INPUT;
12436 		}
12437 	}
12438 
12439 	*in_len = e - p;
12440 	*in = p;
12441 	return out - buf;
12442 }
12443 
/* Encodes `len` wchars from `in` as CP950 and appends the bytes to `buf`.
 *
 * The generic Big5 tables are consulted first; the CP950 PUA mapping and a set of
 * per-codepoint overrides are then applied on top (an override of 0 makes the codepoint
 * unencodable). Codepoints with no CP950 encoding (other than U+0000, which is written
 * as a zero byte) go through MB_CONVERT_ERROR. `end` is not used. */
static void mb_wchar_to_cp950(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	while (len--) {
		uint32_t w = *in++;
		unsigned int s = 0;

		if (w >= ucs_a1_big5_table_min && w < ucs_a1_big5_table_max) {
			s = ucs_a1_big5_table[w - ucs_a1_big5_table_min];
		} else if (w >= ucs_a2_big5_table_min && w < ucs_a2_big5_table_max) {
			s = ucs_a2_big5_table[w - ucs_a2_big5_table_min];
		} else if (w >= ucs_a3_big5_table_min && w < ucs_a3_big5_table_max) {
			s = ucs_a3_big5_table[w - ucs_a3_big5_table_min];
		} else if (w >= ucs_i_big5_table_min && w < ucs_i_big5_table_max) {
			s = ucs_i_big5_table[w - ucs_i_big5_table_min];
		} else if (w >= ucs_r1_big5_table_min && w < ucs_r1_big5_table_max) {
			s = ucs_r1_big5_table[w - ucs_r1_big5_table_min];
		} else if (w >= ucs_r2_big5_table_min && w < ucs_r2_big5_table_max) {
			s = ucs_r2_big5_table[w - ucs_r2_big5_table_min];
		}

		if (w >= 0xE000 && w <= 0xF848) {
			/* PUA for CP950; the last row of the table ends at 0xF848, so a row is always found */
			int k;
			for (k = 0; k < sizeof(cp950_pua_tbl) / (sizeof(unsigned short)*4); k++) {
				if (w <= cp950_pua_tbl[k][1]) {
					break;
				}
			}

			int c1 = w - cp950_pua_tbl[k][0];
			if ((cp950_pua_tbl[k][2] & 0xFF) == 0x40) {
				/* 157 codes per lead byte; trail bytes skip the gap between 0x7E and 0xA1 */
				int c2 = cp950_pua_tbl[k][2] >> 8;
				s = ((c1 / 157) + c2) << 8;
				c1 %= 157;
				s |= c1 + (c1 >= 0x3F ? 0x62 : 0x40);
			} else {
				s = c1 + cp950_pua_tbl[k][2];
			}
		} else if (w == 0xA2 || w == 0xA3 || w == 0x401 || (w >= 0x414 && w <= 0x41C) || (w >= 0x423 && w <= 0x44F) || w == 0x451 || w == 0x2022 || w == 0x203E || w == 0x223C || (w >= 0x2460 && w <= 0x247D) || w == 0x2609 || w == 0x2641 || w == 0x3005 || (w >= 0x302A && w <= 0x30FF) || w == 0xFF64) {
			/* Codepoints which CP950 does not encode, whatever the Big5 tables say */
			s = 0;
		} else if (w == 0xAF) {
			s = 0xA1C2;
		} else if (w == 0x2CD) {
			s = 0xA1C5;
		} else if (w == 0x2027) {
			s = 0xA145;
		} else if (w == 0x2215) {
			s = 0xA241;
		} else if (w == 0x2295) {
			s = 0xA1F2;
		} else if (w == 0x2299) {
			s = 0xA1F3;
		} else if (w == 0x2574) {
			s = 0xA15A;
		} else if (w == 0xFE51) {
			s = 0xA14E;
		} else if (w == 0xFE68) {
			s = 0xA242;
		} else if (w == 0xFF3C) {
			s = 0xA240;
		} else if (w == 0xFF5E) {
			s = 0xA1E3;
		} else if (w == 0xFFE0) {
			s = 0xA246;
		} else if (w == 0xFFE1) {
			s = 0xA247;
		} else if (w == 0xFFE3) {
			s = 0xA1C3;
		} else if (w == 0xFF0F) {
			s = 0xA1FE;
		}

		if (!s) {
			if (w == 0) {
				out = mb_convert_buf_add(out, 0);
			} else {
				/* Pass this CP950 encoder (not the plain Big5 one) as the callback, so that
				 * whatever the error handler emits is encoded with the CP950 mappings */
				MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_cp950);
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
			}
		} else if (s <= 0x80) {
			out = mb_convert_buf_add(out, s);
		} else {
			/* Double-byte code: one byte more than the initial estimate of 1 per wchar */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			out = mb_convert_buf_add2(out, (s >> 8) & 0xFF, s & 0xFF);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
12536 
/* NULL-terminated list of alternative names, referenced by the mbfl_encoding_big5 descriptor */
static const char *mbfl_encoding_big5_aliases[] = {"CN-BIG5", "BIG-FIVE", "BIGFIVE", NULL};
12538 
/* Filter vtable for Big5 -> wchar conversion.
 * NOTE(review): slot roles are inferred from the initializer names; confirm against
 * the declaration of struct mbfl_convert_vtbl. */
static const struct mbfl_convert_vtbl vtbl_big5_wchar = {
	mbfl_no_encoding_big5, /* source encoding */
	mbfl_no_encoding_wchar, /* destination encoding */
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_big5_wchar, /* per-byte conversion callback */
	mbfl_filt_conv_big5_wchar_flush, /* reports a truncated double-byte character */
	NULL,
};
12548 
/* Filter vtable for wchar -> Big5 conversion.
 * NOTE(review): slot roles are inferred from the initializer names; confirm against
 * the declaration of struct mbfl_convert_vtbl. */
static const struct mbfl_convert_vtbl vtbl_wchar_big5 = {
	mbfl_no_encoding_wchar, /* source encoding */
	mbfl_no_encoding_big5, /* destination encoding */
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_big5, /* per-codepoint conversion callback */
	mbfl_filt_conv_common_flush,
	NULL
};
12558 
/* Encoding descriptor for Big5.
 * NOTE(review): the per-slot remarks below are inferred from the names of the
 * initializers; confirm them against the declaration of mbfl_encoding. */
const mbfl_encoding mbfl_encoding_big5 = {
	mbfl_no_encoding_big5,
	"BIG-5", /* presumably the canonical name */
	"BIG5", /* presumably the MIME name */
	mbfl_encoding_big5_aliases,
	mblen_table_81_to_fe,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_big5_wchar, /* filter vtable: Big5 -> wchar */
	&vtbl_wchar_big5, /* filter vtable: wchar -> Big5 */
	mb_big5_to_wchar, /* buffer-based decoder */
	mb_wchar_to_big5, /* buffer-based encoder */
	NULL,
	NULL,
};
12573 
/* Filter vtable for CP950 -> wchar conversion.
 * It reuses the Big5 callbacks; mbfl_filt_conv_big5_wchar enables its CP950-specific
 * handling by checking filter->from->no_encoding.
 * NOTE(review): slot roles are inferred from the initializer names; confirm against
 * the declaration of struct mbfl_convert_vtbl. */
static const struct mbfl_convert_vtbl vtbl_cp950_wchar = {
	mbfl_no_encoding_cp950, /* source encoding */
	mbfl_no_encoding_wchar, /* destination encoding */
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_big5_wchar, /* per-byte conversion callback */
	mbfl_filt_conv_big5_wchar_flush, /* reports a truncated double-byte character */
	NULL,
};
12583 
/* Filter vtable for wchar -> CP950 conversion.
 * It reuses the Big5 callback; mbfl_filt_conv_wchar_big5 enables its CP950-specific
 * handling by checking filter->to->no_encoding.
 * NOTE(review): slot roles are inferred from the initializer names; confirm against
 * the declaration of struct mbfl_convert_vtbl. */
static const struct mbfl_convert_vtbl vtbl_wchar_cp950 = {
	mbfl_no_encoding_wchar, /* source encoding */
	mbfl_no_encoding_cp950, /* destination encoding */
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_big5, /* per-codepoint conversion callback */
	mbfl_filt_conv_common_flush,
	NULL,
};
12593 
/* Encoding descriptor for CP950. No alias list is supplied.
 * NOTE(review): the per-slot remarks below are inferred from the names of the
 * initializers; confirm them against the declaration of mbfl_encoding. */
const mbfl_encoding mbfl_encoding_cp950 = {
	mbfl_no_encoding_cp950,
	"CP950", /* presumably the canonical name */
	"BIG5", /* presumably the MIME name; same string as in the Big5 descriptor */
	NULL,
	mblen_table_81_to_fe,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_cp950_wchar, /* filter vtable: CP950 -> wchar */
	&vtbl_wchar_cp950, /* filter vtable: wchar -> CP950 */
	mb_cp950_to_wchar, /* buffer-based decoder */
	mb_wchar_to_cp950, /* buffer-based encoder */
	NULL,
	NULL,
};
12608 
12609 /*
12610  * HZ
12611  */
12612 
/* Filter callback which consumes one byte `c` of HZ input and emits wchars through
 * filter->output_function.
 *
 * filter->status encodes two things: the high nibble is the current mode (0x00 = ASCII,
 * 0x10 = GB2312) and the low nibble is the position within a sequence (0 = idle,
 * 1 = first byte of a GB2312 pair is held in filter->cache, 2 = '~' has just been seen).
 *
 * Returns 0 unless one of the CK() wrapped output calls bails out early. */
static int mbfl_filt_conv_hz_wchar(int c, mbfl_convert_filter *filter)
{
	int lead, idx, w;

	switch (filter->status & 0xf) {
	case 0: /* idle, in either ASCII (0x00) or GB2312 (0x10) mode */
		if (c == '~') {
			filter->status += 2;
			break;
		}

		if (filter->status == 0x10) {
			if ((c > 0x20 && c <= 0x29) || (c >= 0x30 && c <= 0x77)) {
				/* DBCS first char */
				filter->cache = c;
				filter->status += 1;
				break;
			}
		} else if (filter->status == 0 && c >= 0 && c < 0x80) { /* latin, CTLs */
			CK((*filter->output_function)(c, filter->data));
			break;
		}

		CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		break;

	case 1: /* 0x11: GB2312 second char */
		filter->status &= ~0xf;
		lead = filter->cache;

		if (!(lead > 0x20 && lead < 0x7F && c > 0x20 && c < 0x7F)) {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			break;
		}

		idx = (lead - 1)*192 + c + 0x40; /* index into the CP936 table */
		ZEND_ASSERT(idx < cp936_ucs_table_size);

		switch (idx) {
		/* Entries where the result differs from what the CP936 table holds */
		case 0x1864: w = 0x30FB; break;
		case 0x186A: w = 0x2015; break;
		case 0x186C: w = 0x2225; break;
		default:
			if ((idx >= 0x1920 && idx <= 0x192A) || idx == 0x1963 || (idx >= 0x1C60 && idx <= 0x1C7F) || (idx >= 0x1DBB && idx <= 0x1DC4)) {
				/* CP936 table entries which are rejected here */
				w = 0;
			} else {
				w = cp936_ucs_table[idx];
			}
			break;
		}

		if (w <= 0) {
			w = MBFL_BAD_INPUT;
		}
		CK((*filter->output_function)(w, filter->data));
		break;

	case 2: /* byte following '~' */
		if (c == '\n') {
			/* "~\n" is a line continuation; no output is needed, nor should we shift modes */
			filter->status -= 2;
		} else if (filter->status == 2 && c == '{') {
			filter->status = 0x10;
		} else if (filter->status == 2 && c == '~') {
			CK((*filter->output_function)('~', filter->data));
			filter->status -= 2;
		} else if (filter->status == 0x12 && c == '}') {
			filter->status = 0;
		} else {
			/* Invalid character after ~ */
			filter->status -= 2;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

		EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
12687 
/* Flush callback for the HZ -> wchar filter: reports a dangling first byte of a GB2312
 * pair as bad input, resets the filter to its initial state, then invokes the
 * downstream flush function if one is set. */
static int mbfl_filt_conv_hz_wchar_flush(mbfl_convert_filter *filter)
{
	/* 0x11 = GB2312 mode with the first byte of a pair cached */
	const int truncated = (filter->status == 0x11);

	if (truncated) {
		CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
	}

	filter->status = 0;

	if (filter->flush_function != NULL) {
		filter->flush_function(filter->data);
	}

	return 0;
}
12703 
/* Filter callback which encodes one wchar `c` as HZ and passes the bytes to
 * filter->output_function.
 *
 * The CP936/GB2312 tables are consulted, with a number of entries suppressed or
 * replaced; only results below 0x80 or with both bytes in the GB2312 range are used.
 * (filter->status & 0xFF00) is 0 in ASCII mode and 0x200 in GB2312 mode; the "~{" and
 * "~}" shift sequences are emitted whenever the mode has to change. Codepoints with no
 * encoding are handed to mbfl_filt_conv_illegal_output.
 *
 * Returns 0 unless one of the CK() wrapped calls bails out early. */
static int mbfl_filt_conv_wchar_hz(int c, mbfl_convert_filter *filter)
{
	int s = 0;

	if (c >= ucs_a1_cp936_table_min && c < ucs_a1_cp936_table_max) {
		switch (c) {
		case 0xB7:
		case 0x144:
		case 0x148:
		case 0x251:
		case 0x261:
		case 0x2CA:
		case 0x2CB:
		case 0x2D9:
			s = 0;
			break;
		default:
			s = ucs_a1_cp936_table[c - ucs_a1_cp936_table_min];
			break;
		}
	} else if (c >= ucs_a2_cp936_table_min && c < ucs_a2_cp936_table_max) {
		switch (c) {
		case 0x2015:
			s = 0xA1AA;
			break;
		case 0x2010:
		case 0x2013:
		case 0x2014:
		case 0x2016:
		case 0x2025:
		case 0x2035:
		case 0x2105:
		case 0x2109:
		case 0x2121:
		case 0x2215:
		case 0x221F:
		case 0x2223:
		case 0x2252:
		case 0x2266:
		case 0x2267:
		case 0x2295:
		case 0x22BF:
		case 0x2609:
		case 0x25BC:
		case 0x25BD:
			s = 0;
			break;
		default:
			if ((c >= 0x2170 && c <= 0x2179) || (c >= 0x2196 && c <= 0x2199) ||
					(c >= 0x2550 && c <= 0x2573) || (c >= 0x2581 && c <= 0x258F) ||
					(c >= 0x2593 && c <= 0x2595) || (c >= 0x25E2 && c <= 0x25E5)) {
				s = 0;
			} else {
				s = ucs_a2_cp936_table[c - ucs_a2_cp936_table_min];
			}
			break;
		}
	} else if (c >= ucs_a3_cp936_table_min && c < ucs_a3_cp936_table_max) {
		switch (c) {
		case 0x30FB:
			s = 0xA1A4;
			break;
		case 0x3006:
		case 0x3007:
		case 0x3012:
		case 0x3231:
		case 0x32A3:
			s = 0;
			break;
		default:
			if (c >= 0x3300 || (c >= 0x3018 && c <= 0x3040) ||
					(c >= 0x309B && c <= 0x309E) || (c >= 0x30FC && c <= 0x30FE)) {
				s = 0;
			} else {
				s = ucs_a3_cp936_table[c - ucs_a3_cp936_table_min];
			}
			break;
		}
	} else if (c >= ucs_i_gb2312_table_min && c < ucs_i_gb2312_table_max) {
		s = ucs_i_gb2312_table[c - ucs_i_gb2312_table_min];
	} else if (c >= ucs_hff_cp936_table_min && c < ucs_hff_cp936_table_max) {
		switch (c) {
		case 0xFF04:
			s = 0xA1E7;
			break;
		case 0xFF5E:
			s = 0xA1AB;
			break;
		case 0xFFE0:
		case 0xFFE1:
		case 0xFFE3:
		case 0xFFE5:
			s = ucs_hff_s_cp936_table[c - 0xFFE0];
			break;
		default:
			if (c >= 0xFF01 && c <= 0xFF5D) {
				s = c - 0xFF01 + 0xA3A1;
			}
			break;
		}
	}

	/* Strip the high bit from both bytes of a double-byte code */
	if (s & 0x8000) {
		s -= 0x8080;
	}

	if (s <= 0) {
		s = (c == 0) ? 0 : -1;
	} else if ((s >= 0x80 && s < 0x2121) || s > 0x8080) {
		/* Not representable as a pair of 7-bit GB2312 bytes */
		s = -1;
	}

	if (s < 0) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
		return 0;
	}

	if (s >= 0x80) { /* GB 2312-80 */
		if ((filter->status & 0xFF00) != 0x200) {
			CK((*filter->output_function)('~', filter->data));
			CK((*filter->output_function)('{', filter->data));
		}
		filter->status = 0x200;
		CK((*filter->output_function)((s >> 8) & 0x7F, filter->data));
		CK((*filter->output_function)(s & 0x7F, filter->data));
		return 0;
	}

	/* ASCII */
	if ((filter->status & 0xff00) != 0) {
		CK((*filter->output_function)('~', filter->data));
		CK((*filter->output_function)('}', filter->data));
	}
	filter->status = 0;
	if (s == 0x7E) {
		/* A literal '~' is written as "~~" */
		CK((*filter->output_function)('~', filter->data));
	}
	CK((*filter->output_function)(s, filter->data));

	return 0;
}
12785 
/* Flush callback for the wchar -> HZ filter: if the output is not in ASCII mode, emits
 * the "~}" sequence to shift back, then resets the filter state.
 * NOTE(review): unlike the other flush callbacks in this section, this one does not
 * invoke filter->flush_function; confirm whether that is intended. */
static int mbfl_filt_conv_any_hz_flush(mbfl_convert_filter *filter)
{
	const int shifted = (filter->status & 0xFF00) != 0;

	if (shifted) {
		/* back to latin */
		CK((*filter->output_function)('~', filter->data));
		CK((*filter->output_function)('}', filter->data));
	}

	filter->status = 0;
	return 0;
}
12796 
/* Mode values used by the buffer-based HZ converters below: mb_hz_to_wchar keeps the
 * current mode in `*state`, and mb_wchar_to_hz keeps it in `buf->state` */
#define ASCII 0
#define GB2312 1
12799 
mb_hz_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)12800 static size_t mb_hz_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
12801 {
12802 	unsigned char *p = *in, *e = p + *in_len;
12803 	uint32_t *out = buf, *limit = buf + bufsize;
12804 
12805 	while (p < e && out < limit) {
12806 		unsigned char c = *p++;
12807 
12808 		if (c == '~') {
12809 			if (p == e) {
12810 				break;
12811 			}
12812 			unsigned char c2 = *p++;
12813 
12814 			if (c2 == '}' && *state == GB2312) {
12815 				*state = ASCII;
12816 			} else if (c2 == '{' && *state == ASCII) {
12817 				*state = GB2312;
12818 			} else if (c2 == '~' && *state == ASCII) {
12819 				*out++ = '~';
12820 			} else if (c2 == '\n') {
12821 				/* "~\n" is a line continuation; no output is needed, nor should we shift modes */
12822 			} else {
12823 				/* Invalid character after ~ */
12824 				*out++ = MBFL_BAD_INPUT;
12825 			}
12826 		} else if (((c > 0x20 && c <= 0x29) || (c >= 0x30 && c <= 0x77)) && p < e && *state == GB2312) {
12827 			unsigned char c2 = *p++;
12828 
12829 			if (c > 0x20 && c < 0x7F && c2 > 0x20 && c2 < 0x7F) {
12830 				unsigned int s = (c - 1)*192 + c2 + 0x40;
12831 				ZEND_ASSERT(s < cp936_ucs_table_size);
12832 
12833 				if (s == 0x1864) {
12834 					s = 0x30FB;
12835 				} else if (s == 0x186A) {
12836 					s = 0x2015;
12837 				} else if (s == 0x186C) {
12838 					s = 0x2225;
12839 				} else if ((s >= 0x1920 && s <= 0x192A) || s == 0x1963 || (s >= 0x1C60 && s <= 0x1C7F) || (s >= 0x1DBB && s <= 0x1DC4)) {
12840 					s = 0;
12841 				} else {
12842 					s = cp936_ucs_table[s];
12843 				}
12844 				if (!s)
12845 					s = MBFL_BAD_INPUT;
12846 				*out++ = s;
12847 			} else {
12848 				*out++ = MBFL_BAD_INPUT;
12849 			}
12850 		} else if (c < 0x80 && *state == ASCII) {
12851 			*out++ = c;
12852 		} else {
12853 			*out++ = MBFL_BAD_INPUT;
12854 		}
12855 	}
12856 
12857 	*in_len = e - p;
12858 	*in = p;
12859 	return out - buf;
12860 }
12861 
/* Encodes `len` wchars from `in` as HZ and appends the bytes to `buf`.
 *
 * buf->state holds the current output mode (ASCII or GB2312); "~{" and "~}" are emitted
 * whenever the mode has to change, and a literal '~' is written as "~~". Codepoints with
 * no encoding go through MB_CONVERT_ERROR. When `end` is true and the output is not in
 * ASCII mode, a closing "~}" is appended. */
static void mb_wchar_to_hz(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	while (len--) {
		uint32_t cp = *in++;
		unsigned int gb = 0;

		if (cp >= ucs_a1_cp936_table_min && cp < ucs_a1_cp936_table_max) {
			switch (cp) {
			case 0xB7:
			case 0x144:
			case 0x148:
			case 0x251:
			case 0x261:
			case 0x2CA:
			case 0x2CB:
			case 0x2D9:
				gb = 0;
				break;
			default:
				gb = ucs_a1_cp936_table[cp - ucs_a1_cp936_table_min];
				break;
			}
		} else if (cp >= ucs_a2_cp936_table_min && cp < ucs_a2_cp936_table_max) {
			switch (cp) {
			case 0x2015:
				gb = 0xA1AA;
				break;
			case 0x2010:
			case 0x2013:
			case 0x2014:
			case 0x2016:
			case 0x2025:
			case 0x2035:
			case 0x2105:
			case 0x2109:
			case 0x2121:
			case 0x2215:
			case 0x221F:
			case 0x2223:
			case 0x2252:
			case 0x2266:
			case 0x2267:
			case 0x2295:
			case 0x22BF:
			case 0x2609:
			case 0x25BC:
			case 0x25BD:
				gb = 0;
				break;
			default:
				if ((cp >= 0x2170 && cp <= 0x2179) || (cp >= 0x2196 && cp <= 0x2199) ||
						(cp >= 0x2550 && cp <= 0x2573) || (cp >= 0x2581 && cp <= 0x258F) ||
						(cp >= 0x2593 && cp <= 0x2595) || (cp >= 0x25E2 && cp <= 0x25E5)) {
					gb = 0;
				} else {
					gb = ucs_a2_cp936_table[cp - ucs_a2_cp936_table_min];
				}
				break;
			}
		} else if (cp >= ucs_a3_cp936_table_min && cp < ucs_a3_cp936_table_max) {
			switch (cp) {
			case 0x30FB:
				gb = 0xA1A4;
				break;
			case 0x3006:
			case 0x3007:
			case 0x3012:
			case 0x3231:
			case 0x32A3:
				gb = 0;
				break;
			default:
				if (cp >= 0x3300 || (cp >= 0x3018 && cp <= 0x3040) ||
						(cp >= 0x309B && cp <= 0x309E) || (cp >= 0x30FC && cp <= 0x30FE)) {
					gb = 0;
				} else {
					gb = ucs_a3_cp936_table[cp - ucs_a3_cp936_table_min];
				}
				break;
			}
		} else if (cp >= ucs_i_gb2312_table_min && cp < ucs_i_gb2312_table_max) {
			gb = ucs_i_gb2312_table[cp - ucs_i_gb2312_table_min];
		} else if (cp >= ucs_hff_cp936_table_min && cp < ucs_hff_cp936_table_max) {
			switch (cp) {
			case 0xFF04:
				gb = 0xA1E7;
				break;
			case 0xFF5E:
				gb = 0xA1AB;
				break;
			case 0xFFE0:
			case 0xFFE1:
			case 0xFFE3:
			case 0xFFE5:
				gb = ucs_hff_s_cp936_table[cp - 0xFFE0];
				break;
			default:
				if (cp >= 0xFF01 && cp <= 0xFF5D) {
					gb = cp - 0xFF01 + 0xA3A1;
				}
				break;
			}
		}

		/* Strip the high bit from both bytes */
		gb &= ~0x8080;

		if ((gb == 0 && cp != 0) || (gb >= 0x80 && gb < 0x2121)) {
			/* No usable encoding */
			MB_CONVERT_ERROR(buf, out, limit, cp, mb_wchar_to_hz);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
			continue;
		}

		if (gb >= 0x80) {
			/* GB 2312-80 */
			if (buf->state == GB2312) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			} else {
				/* Two bytes for "~{" plus two for the character itself */
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
				out = mb_convert_buf_add2(out, '~', '{');
				buf->state = GB2312;
			}
			out = mb_convert_buf_add2(out, (gb >> 8) & 0x7F, gb & 0x7F);
			continue;
		}

		/* ASCII */
		if (buf->state != ASCII) {
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 3);
			out = mb_convert_buf_add2(out, '~', '}');
			buf->state = ASCII;
		}
		if (gb == '~') {
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			out = mb_convert_buf_add2(out, '~', '~');
		} else {
			out = mb_convert_buf_add(out, gb);
		}
	}

	if (end && buf->state != ASCII) {
		/* If not in ASCII state, need to emit closing control chars */
		MB_CONVERT_BUF_ENSURE(buf, out, limit, 2);
		out = mb_convert_buf_add2(out, '~', '}');
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
12947 
/* Filter vtable for HZ -> wchar conversion.
 * NOTE(review): slot roles are inferred from the initializer names; confirm against
 * the declaration of struct mbfl_convert_vtbl. */
static const struct mbfl_convert_vtbl vtbl_hz_wchar = {
	mbfl_no_encoding_hz, /* source encoding */
	mbfl_no_encoding_wchar, /* destination encoding */
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_hz_wchar, /* per-byte conversion callback */
	mbfl_filt_conv_hz_wchar_flush, /* reports a truncated GB2312 pair and resets state */
	NULL,
};
12957 
/* Filter vtable for wchar -> HZ conversion.
 * NOTE(review): slot roles are inferred from the initializer names; confirm against
 * the declaration of struct mbfl_convert_vtbl. */
static const struct mbfl_convert_vtbl vtbl_wchar_hz = {
	mbfl_no_encoding_wchar, /* source encoding */
	mbfl_no_encoding_hz, /* destination encoding */
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_hz, /* per-codepoint conversion callback */
	mbfl_filt_conv_any_hz_flush, /* emits "~}" if the output is not in ASCII mode */
	NULL,
};
12967 
/* Encoding descriptor for HZ. No alias list and no byte-length table are supplied.
 * NOTE(review): the per-slot remarks below are inferred from the names of the
 * initializers; confirm them against the declaration of mbfl_encoding. */
const mbfl_encoding mbfl_encoding_hz = {
	mbfl_no_encoding_hz,
	"HZ", /* presumably the canonical name */
	"HZ-GB-2312", /* presumably the MIME name */
	NULL,
	NULL,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_hz_wchar, /* filter vtable: HZ -> wchar */
	&vtbl_wchar_hz, /* filter vtable: wchar -> HZ */
	mb_hz_to_wchar, /* buffer-based decoder */
	mb_wchar_to_hz, /* buffer-based encoder */
	NULL,
	NULL,
};
12982