1 #include "mbfilter_cjk.h"
2 
3 #include "unicode_table_jis.h"
4 #include "unicode_table_jis2004.h"
5 #include "unicode_table_big5.h"
6 #include "unicode_table_cns11643.h"
7 #include "unicode_table_cp932_ext.h"
8 #include "unicode_table_cp936.h"
9 #include "unicode_table_gb18030.h"
10 #include "unicode_table_gb2312.h"
11 #include "unicode_table_uhc.h"
12 #include "cp932_table.h"
13 #include "sjis_mac2uni.h"
14 #include "translit_kana_jisx0201_jisx0208.h"
15 #include "emoji2uni.h"
16 
17 /* Regional Indicator Unicode codepoints are from 0x1F1E6-0x1F1FF
18  * These correspond to the letters A-Z
19  * To display the flag emoji for a country, two unicode codepoints are combined,
20  * which correspond to the two-letter code for that country
21  * This macro converts uppercase ASCII values to Regional Indicator codepoints */
/* 'A' (0x41) maps to 0x1F1E6, the first Regional Indicator codepoint.
 * The argument is expected to be an uppercase ASCII letter; no range check is done. */
#define NFLAGS(c) (0x1F1A5+((unsigned int)(c)))

/* Three parallel tables: entry i of nflags_s is a two-letter country code (stored
 * without a terminating NUL), and entry i of the two nflags_code_* tables is the
 * numeric code paired with that country for KDDI and SoftBank ("sb") respectively. */
static const char nflags_s[10][2] = {"CN", "DE", "ES", "FR", "GB", "IT", "JP", "KR", "RU", "US"};
static const int nflags_code_kddi[10] = { 0x2549, 0x2546, 0x24C0, 0x2545, 0x2548, 0x2547, 0x2750, 0x254A, 0x24C1, 0x27F7 };
static const int nflags_code_sb[10] = { 0x2B0A, 0x2B05, 0x2B08, 0x2B04, 0x2B07, 0x2B06, 0x2B02, 0x2B0B, 0x2B09, 0x2B03 };

/* Both macros below RETURN from the enclosing function and require an `int *snd`
 * to be in scope: the first codepoint of the pair is stored in *snd and the second
 * codepoint is the return value.
 * EMIT_KEYPAD_EMOJI: key character `c` followed by U+20E3.
 * EMIT_FLAG_EMOJI: the two Regional Indicators for a two-letter country code. */
#define EMIT_KEYPAD_EMOJI(c) do { *snd = (c); return 0x20E3; } while(0)
#define EMIT_FLAG_EMOJI(country) do { *snd = NFLAGS((country)[0]); return NFLAGS((country)[1]); } while(0)

/* Country codes for consecutive code ranges: nflags_kddi is indexed by
 * (code - 0x2545) in mbfilter_sjis_emoji_kddi2unicode; the order of nflags_sb is
 * consistent with indexing by (code - 0x2B02), judging from nflags_code_sb above. */
static const char nflags_kddi[6][2] = {"FR", "DE", "IT", "GB", "CN", "KR"};
static const char nflags_sb[10][2] = {"JP", "US", "FR", "DE", "IT", "GB", "ES", "RU", "CN", "KR"};

/* number -> (ku*94)+ten value for telephone keypad character */
#define DOCOMO_KEYPAD(n) ((n) == 0 ? 0x296F : (0x2965 + (n)))
/* Value used for the '#' keypad character (same units as DOCOMO_KEYPAD) */
#define DOCOMO_KEYPAD_HASH 0x2964
37 
38 /* `tbl` contains inclusive ranges, each represented by a pair of unsigned shorts */
/* Binary search over `n` inclusive ranges stored in `tbl` as consecutive
 * (low, high) pairs, sorted in ascending order.
 * Returns the index of the range which contains `w`, or -1 if no range does. */
static int mbfl_bisec_srch(int w, const unsigned short *tbl, int n)
{
	int first = 0;
	int last = n - 1;

	while (first <= last) {
		int mid = (first + last) / 2;
		const unsigned short *range = tbl + (2 * mid);

		if (w < range[0]) {
			/* Below this range; keep looking in the lower half */
			last = mid - 1;
			continue;
		}
		if (w > range[1]) {
			/* Above this range; keep looking in the upper half */
			first = mid + 1;
			continue;
		}
		return mid;
	}

	return -1;
}
55 
56 /* `tbl` contains single values, not ranges */
/* Binary search over `n` single values stored in ascending order in `tbl`.
 * Returns the index at which `w` is found, or -1 if it is not present. */
int mbfl_bisec_srch2(int w, const unsigned short tbl[], int n)
{
	int first = 0;
	int last = n - 1;

	while (first <= last) {
		int mid = (first + last) / 2;
		unsigned short candidate = tbl[mid];

		if (w < candidate) {
			last = mid - 1;
			continue;
		}
		if (w > candidate) {
			first = mid + 1;
			continue;
		}
		return mid;
	}

	return -1;
}
73 
/* Compute the byte pair (s1, s2) from the byte pair (c1, c2); as the name
 * indicates, this is the JIS X 0208 -> Shift-JIS direction (e.g. 0x21,0x21
 * becomes 0x81,0x40).
 *
 * c1, c2: input bytes (any integer expressions; each is fully parenthesized
 *         in the expansion, so operators inside an argument cannot be
 *         re-associated with the macro's own arithmetic)
 * s1, s2: modifiable lvalues which receive the result
 *
 * NOTE: c1 and c2 are evaluated more than once; do not pass expressions
 * with side effects. */
#define SJIS_ENCODE(c1,c2,s1,s2) \
	do { \
		(s1) = (((c1) - 1) >> 1) + ((c1) < 0x5F ? 0x71 : 0xB1); \
		(s2) = (c2); \
		if ((c1) & 1) { \
			if ((c2) < 0x60) { \
				(s2)--; \
			} \
			(s2) += 0x20; \
		} else { \
			(s2) += 0x7e; \
		} \
	} while (0)
87 
/* Inverse of SJIS_ENCODE: compute the byte pair (s1, s2) from the byte pair
 * (c1, c2); as the name indicates, this is the Shift-JIS -> JIS X 0208
 * direction (e.g. 0x81,0x40 becomes 0x21,0x21).
 *
 * c1, c2: input bytes (any integer expressions; each is fully parenthesized
 *         in the expansion, so operators inside an argument cannot be
 *         re-associated with the macro's own comparisons and arithmetic)
 * s1, s2: modifiable lvalues which receive the result
 *
 * NOTE: c1 and c2 are evaluated more than once; do not pass expressions
 * with side effects. */
#define SJIS_DECODE(c1,c2,s1,s2) \
	do { \
		if ((c1) < 0xa0) { \
			(s1) = (((c1) - 0x81) << 1) + 0x21; \
		} else { \
			(s1) = (((c1) - 0xc1) << 1) + 0x21; \
		} \
		(s2) = (c2); \
		if ((c2) < 0x9f) { \
			if ((c2) < 0x7f) { \
				(s2)++; \
			} \
			(s2) -= 0x20; \
		} else { \
			(s1)++; \
			(s2) -= 0x7e; \
		} \
	} while (0)
106 
/* Evaluate `statement` once; if the result is negative, return -1 from the
 * enclosing function. Used to propagate failures of output callbacks. */
#define CK(statement)	do { if ((statement) < 0) return (-1); } while (0)

/*
 * ISO-2022 variants
 */

/* Values held in filter->status by the byte-at-a-time decoders to record the
 * currently selected character set (low 4 bits of status are used separately
 * for the position inside an escape sequence or multi-byte character). */
#define ASCII          0
#define JISX0201_KANA  0x20
#define JISX0208_KANJI 0x80
116 
/* Decode one input byte `c` of JIS text and pass any resulting codepoint(s)
 * to filter->output_function.
 *
 * filter->status encodes two things:
 *   - high bits: selected character set (0x00 ASCII, 0x10 JIS X 0201 Latin,
 *     0x20 JIS X 0201 Kana, 0x80 JIS X 0208, 0x90 JIS X 0212)
 *   - low 4 bits: progress inside a multi-byte unit (0 = idle, 1 = expecting
 *     second byte of a 2-byte character, 2..5 = inside an escape sequence)
 * filter->cache holds the first byte of a 2-byte character.
 *
 * Returns 0 on success, or -1 if the output function reported failure. */
static int mbfl_filt_conv_jis_wchar(int c, mbfl_convert_filter *filter)
{
	int c1, s, w;

	/* Jumped to after an invalid escape sequence: the low status bits have been
	 * cleared, and `c` is processed again as an ordinary byte */
retry:
	switch (filter->status & 0xf) {
/*	case 0x00:	 ASCII */
/*	case 0x10:	 X 0201 latin */
/*	case 0x20:	 X 0201 kana */
/*	case 0x80:	 X 0208 */
/*	case 0x90:	 X 0212 */
	case 0:
		/* The order of these tests matters: charset-specific interpretations
		 * are tried before the generic "7-bit byte passes through" rule */
		if (c == 0x1b) {
			filter->status += 2;
		} else if (c == 0x0e) {		/* "kana in" */
			filter->status = 0x20;
		} else if (c == 0x0f) {		/* "kana out" */
			filter->status = 0;
		} else if (filter->status == 0x10 && c == 0x5c) {	/* YEN SIGN */
			CK((*filter->output_function)(0xa5, filter->data));
		} else if (filter->status == 0x10 && c == 0x7e) {	/* OVER LINE */
			CK((*filter->output_function)(0x203e, filter->data));
		} else if (filter->status == 0x20 && c > 0x20 && c < 0x60) {		/* kana */
			CK((*filter->output_function)(0xff40 + c, filter->data));
		} else if ((filter->status == 0x80 || filter->status == 0x90) && c > 0x20 && c < 0x7f) {		/* kanji first char */
			filter->cache = c;
			filter->status += 1;
		} else if (c >= 0 && c < 0x80) {		/* latin, CTLs */
			CK((*filter->output_function)(c, filter->data));
		} else if (c > 0xa0 && c < 0xe0) {	/* GR kana */
			CK((*filter->output_function)(0xfec0 + c, filter->data));
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

/*	case 0x81:	 X 0208 second char */
/*	case 0x91:	 X 0212 second char */
	case 1:
		/* Back to the idle state, keeping the selected character set */
		filter->status &= ~0xf;
		c1 = filter->cache;
		if (c > 0x20 && c < 0x7f) {
			/* Linear table index: 94 cells per row */
			s = (c1 - 0x21)*94 + c - 0x21;
			if (filter->status == 0x80) {
				if (s >= 0 && s < jisx0208_ucs_table_size) {
					w = jisx0208_ucs_table[s];
				} else {
					w = 0;
				}

				/* A zero table entry means there is no mapping */
				if (w <= 0) {
					w = MBFL_BAD_INPUT;
				}
			} else {
				if (s >= 0 && s < jisx0212_ucs_table_size) {
					w = jisx0212_ucs_table[s];
				} else {
					w = 0;
				}

				if (w <= 0) {
					w = MBFL_BAD_INPUT;
				}
			}
			CK((*filter->output_function)(w, filter->data));
		} else {
			/* Second byte out of range; the whole 2-byte unit is reported as one error */
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	/* ESC */
/*	case 0x02:	*/
/*	case 0x12:	*/
/*	case 0x22:	*/
/*	case 0x82:	*/
/*	case 0x92:	*/
	case 2:
		if (c == 0x24) {		/* '$' */
			filter->status++;
		} else if (c == 0x28) {		/* '(' */
			filter->status += 3;
		} else {
			/* Invalid escape sequence: report the ESC as an error,
			 * then process `c` again as a normal byte */
			filter->status &= ~0xf;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			goto retry;
		}
		break;

	/* ESC $ */
/*	case 0x03:	*/
/*	case 0x13:	*/
/*	case 0x23:	*/
/*	case 0x83:	*/
/*	case 0x93:	*/
	case 3:
		if (c == 0x40 || c == 0x42) {	/* '@' or 'B' */
			filter->status = 0x80;
		} else if (c == 0x28) {			/* '(' */
			filter->status++;
		} else {
			/* Invalid: error marker for ESC, then re-emit the '$' which was
			 * already consumed, then process `c` again */
			filter->status &= ~0xf;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			CK((*filter->output_function)(0x24, filter->data));
			goto retry;
		}
		break;

	/* ESC $ ( */
/*	case 0x04:	*/
/*	case 0x14:	*/
/*	case 0x24:	*/
/*	case 0x84:	*/
/*	case 0x94:	*/
	case 4:
		if (c == 0x40 || c == 0x42) {	/* '@' or 'B' */
			filter->status = 0x80;
		} else if (c == 0x44) {			/* 'D' */
			filter->status = 0x90;
		} else {
			/* Invalid: error marker for ESC, re-emit '$' and '(', then
			 * process `c` again */
			filter->status &= ~0xf;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			CK((*filter->output_function)(0x24, filter->data));
			CK((*filter->output_function)(0x28, filter->data));
			goto retry;
		}
		break;

	/* ESC ( */
/*	case 0x05:	*/
/*	case 0x15:	*/
/*	case 0x25:	*/
/*	case 0x85:	*/
/*	case 0x95:	*/
	case 5:
		if (c == 0x42 || c == 0x48) {		/* 'B' or 'H' */
			filter->status = 0;
		} else if (c == 0x4a) {		/* 'J' */
			filter->status = 0x10;
		} else if (c == 0x49) {		/* 'I' */
			filter->status = 0x20;
		} else {
			/* Invalid: error marker for ESC, re-emit '(', then process `c` again */
			filter->status &= ~0xf;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			CK((*filter->output_function)(0x28, filter->data));
			goto retry;
		}
		break;

		EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
270 
/* Finish decoding: if input ended in the middle of a 2-byte character or an
 * escape sequence (low 4 status bits non-zero), emit one error marker. Then
 * reset the decoder state and forward the flush to the next stage, if any.
 * Returns 0, or -1 if emitting the error marker failed. */
static int mbfl_filt_conv_jis_wchar_flush(mbfl_convert_filter *filter)
{
	int pending = filter->status & 0xF;

	if (pending != 0) {
		/* Truncated multi-byte unit at end of input */
		CK(filter->output_function(MBFL_BAD_INPUT, filter->data));
	}
	filter->status = 0;

	if (!filter->flush_function) {
		return 0;
	}
	filter->flush_function(filter->data);
	return 0;
}
286 
/* Encode one Unicode codepoint `c` as JIS and pass the resulting bytes
 * (including any escape sequence needed to switch character set) to
 * filter->output_function.
 *
 * The looked-up value `s` identifies the target character set by magnitude:
 *   s < 0x80          ASCII
 *   0xA1..0xDF        JIS X 0201 Kana (emitted as 7-bit byte after ESC ( I)
 *   s < 0x8080        JIS X 0208
 *   s < 0x10000       JIS X 0212
 *   otherwise         JIS X 0201 Latin (0x1005C yen sign, 0x1007E overline)
 *
 * The high byte of filter->status records which character set is currently
 * selected in the output: 0 ASCII, 0x100 Kana, 0x200 JIS X 0208,
 * 0x300 JIS X 0212, 0x400 JIS X 0201 Latin.
 *
 * Returns 0 on success, or -1 if an output callback reported failure. */
static int mbfl_filt_conv_wchar_jis(int c, mbfl_convert_filter *filter)
{
	int s = 0;

	if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
		s = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
	} else if (c == 0x203E) { /* OVERLINE */
		s = 0x1007E; /* Convert to JISX 0201 OVERLINE */
	} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
		s = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
	} else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
		s = ucs_i_jis_table[c - ucs_i_jis_table_min];
	} else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
		s = ucs_r_jis_table[c - ucs_r_jis_table_min];
	}
	if (s <= 0) {
		/* No table entry; try a few codepoints which have hard-coded mappings */
		if (c == 0xa5) {		/* YEN SIGN */
			s = 0x1005c;
		} else if (c == 0xff3c) {	/* FULLWIDTH REVERSE SOLIDUS */
			s = 0x2140;
		} else if (c == 0x2225) {	/* PARALLEL TO */
			s = 0x2142;
		} else if (c == 0xff0d) {	/* FULLWIDTH HYPHEN-MINUS */
			s = 0x215d;
		} else if (c == 0xffe0) {	/* FULLWIDTH CENT SIGN */
			s = 0x2171;
		} else if (c == 0xffe1) {	/* FULLWIDTH POUND SIGN */
			s = 0x2172;
		} else if (c == 0xffe2) {	/* FULLWIDTH NOT SIGN */
			s = 0x224c;
		}
		if (c == 0) {
			/* NUL is encoded as a NUL byte */
			s = 0;
		} else if (s <= 0) {
			s = -1;
		}
	}
	if (s >= 0) {
		if (s < 0x80) { /* ASCII */
			if ((filter->status & 0xff00) != 0) {
				CK((*filter->output_function)(0x1b, filter->data));		/* ESC */
				CK((*filter->output_function)(0x28, filter->data));		/* '(' */
				CK((*filter->output_function)(0x42, filter->data));		/* 'B' */
			}
			filter->status = 0;
			CK((*filter->output_function)(s, filter->data));
		} else if (s >= 0xa1 && s <= 0xdf) { /* X 0201 kana */
			/* Without this branch, kana values would fall into the X 0208 branch
			 * below and be emitted as a 2-byte sequence with a zero lead byte.
			 * Same escape sequence and byte value as mb_wchar_to_jis uses. */
			if ((filter->status & 0xff00) != 0x100) {
				CK((*filter->output_function)(0x1b, filter->data));		/* ESC */
				CK((*filter->output_function)(0x28, filter->data));		/* '(' */
				CK((*filter->output_function)(0x49, filter->data));		/* 'I' */
			}
			filter->status = 0x100;
			CK((*filter->output_function)(s & 0x7f, filter->data));
		} else if (s < 0x8080) { /* X 0208 */
			if ((filter->status & 0xff00) != 0x200) {
				CK((*filter->output_function)(0x1b, filter->data));		/* ESC */
				CK((*filter->output_function)(0x24, filter->data));		/* '$' */
				CK((*filter->output_function)(0x42, filter->data));		/* 'B' */
			}
			filter->status = 0x200;
			CK((*filter->output_function)((s >> 8) & 0x7f, filter->data));
			CK((*filter->output_function)(s & 0x7f, filter->data));
		} else if (s < 0x10000) { /* X 0212 */
			if ((filter->status & 0xff00) != 0x300) {
				CK((*filter->output_function)(0x1b, filter->data));		/* ESC */
				CK((*filter->output_function)(0x24, filter->data));		/* '$' */
				CK((*filter->output_function)(0x28, filter->data));		/* '(' */
				CK((*filter->output_function)(0x44, filter->data));		/* 'D' */
			}
			filter->status = 0x300;
			CK((*filter->output_function)((s >> 8) & 0x7f, filter->data));
			CK((*filter->output_function)(s & 0x7f, filter->data));
		} else { /* X 0201 latin */
			if ((filter->status & 0xff00) != 0x400) {
				CK((*filter->output_function)(0x1b, filter->data));		/* ESC */
				CK((*filter->output_function)(0x28, filter->data));		/* '(' */
				CK((*filter->output_function)(0x4a, filter->data));		/* 'J' */
			}
			filter->status = 0x400;
			CK((*filter->output_function)(s & 0x7f, filter->data));
		}
	} else {
		/* Codepoint cannot be represented; let the filter's error handling decide */
		CK(mbfl_filt_conv_illegal_output(c, filter));
	}

	return 0;
}
367 
/* Encode one Unicode codepoint `c` as ISO-2022-JP and pass the resulting bytes
 * (including any escape sequence needed to switch character set) to
 * filter->output_function.
 *
 * Only ASCII, JIS X 0208 and JIS X 0201 Latin are produced; table values in
 * 0x80..0x2120 or above 0x8080 are treated as unencodable.
 * The high byte of filter->status records the character set currently
 * selected in the output: 0 ASCII, 0x200 JIS X 0208, 0x400 JIS X 0201 Latin.
 *
 * Codepoints which cannot be encoded are handed to
 * mbfl_filt_conv_illegal_output, as mbfl_filt_conv_wchar_jis does, rather than
 * being silently dropped.
 *
 * Returns 0 on success, or -1 if an output callback reported failure. */
static int mbfl_filt_conv_wchar_2022jp(int c, mbfl_convert_filter *filter)
{
	int s;

	s = 0;
	if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
		s = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
	} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
		s = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
	} else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
		s = ucs_i_jis_table[c - ucs_i_jis_table_min];
	} else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
		s = ucs_r_jis_table[c - ucs_r_jis_table_min];
	}

	if (s <= 0) {
		/* No table entry; try a few codepoints which have hard-coded mappings */
		if (c == 0xa5) {			/* YEN SIGN */
			s = 0x1005c;
		} else if (c == 0xff3c) {	/* FULLWIDTH REVERSE SOLIDUS */
			s = 0x2140;
		} else if (c == 0x2225) {	/* PARALLEL TO */
			s = 0x2142;
		} else if (c == 0xff0d) {	/* FULLWIDTH HYPHEN-MINUS */
			s = 0x215d;
		} else if (c == 0xffe0) {	/* FULLWIDTH CENT SIGN */
			s = 0x2171;
		} else if (c == 0xffe1) {	/* FULLWIDTH POUND SIGN */
			s = 0x2172;
		} else if (c == 0xffe2) {	/* FULLWIDTH NOT SIGN */
			s = 0x224c;
		}
		if (c == 0) {
			/* NUL is encoded as a NUL byte */
			s = 0;
		} else if (s <= 0) {
			s = -1;
		}
	} else if ((s >= 0x80 && s < 0x2121) || (s > 0x8080)) {
		/* Table value is outside the character sets this encoding may use */
		s = -1;
	}
	if (s >= 0) {
		if (s < 0x80) { /* ASCII */
			if ((filter->status & 0xff00) != 0) {
				CK((*filter->output_function)(0x1b, filter->data));		/* ESC */
				CK((*filter->output_function)(0x28, filter->data));		/* '(' */
				CK((*filter->output_function)(0x42, filter->data));		/* 'B' */
			}
			filter->status = 0;
			CK((*filter->output_function)(s, filter->data));
		} else if (s < 0x10000) { /* X 0208 */
			if ((filter->status & 0xff00) != 0x200) {
				CK((*filter->output_function)(0x1b, filter->data));		/* ESC */
				CK((*filter->output_function)(0x24, filter->data));		/* '$' */
				CK((*filter->output_function)(0x42, filter->data));		/* 'B' */
			}
			filter->status = 0x200;
			CK((*filter->output_function)((s >> 8) & 0x7f, filter->data));
			CK((*filter->output_function)(s & 0x7f, filter->data));
		} else { /* X 0201 latin */
			if ((filter->status & 0xff00) != 0x400) {
				CK((*filter->output_function)(0x1b, filter->data));		/* ESC */
				CK((*filter->output_function)(0x28, filter->data));		/* '(' */
				CK((*filter->output_function)(0x4a, filter->data));		/* 'J' */
			}
			filter->status = 0x400;
			CK((*filter->output_function)(s & 0x7f, filter->data));
		}
	} else {
		/* Codepoint cannot be represented; let the filter's error handling decide */
		CK(mbfl_filt_conv_illegal_output(c, filter));
	}

	return 0;
}
438 
/* Character set selectors stored in `*state` (decoding), `buf->state`
 * (encoding) and the local `state` of the validation functions.
 * ASCII is re-defined here with the same value it was given earlier. */
#define ASCII 0
#define JISX_0201_LATIN 1
#define JISX_0201_KANA 2
#define JISX_0208 3
#define JISX_0212 4
444 
/* Decode a run of JIS / ISO-2022-JP bytes into codepoints.
 *
 * in, in_len: pointer to and length of the remaining input; both are updated
 *             on return to reflect how much was consumed
 * buf, bufsize: output array and its capacity in codepoints (must be >= 3,
 *             since one invalid escape sequence can produce 3 outputs)
 * state:      currently selected character set (ASCII, JISX_0201_LATIN, ...);
 *             read and updated here
 *
 * Invalid input is reported by storing MBFL_BAD_INPUT in the output.
 * Returns the number of codepoints written to `buf`. */
static size_t mb_iso2022jp_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
{
	ZEND_ASSERT(bufsize >= 3);

	unsigned char *p = *in, *e = p + *in_len;
	uint32_t *out = buf, *limit = buf + bufsize;

	while (p < e && out < limit) {
		unsigned char c = *p++;

		if (c == 0x1B) {
			/* ESC seen; this is an escape sequence */
			if ((e - p) < 2) {
				/* Fewer than 2 bytes follow the ESC, so no complete escape
				 * sequence is possible. Emit one error marker, and also swallow
				 * a following '$' or '(' so it is not decoded as text. */
				*out++ = MBFL_BAD_INPUT;
				if (p != e && (*p == '$' || *p == '('))
					p++;
				continue;
			}

			unsigned char c2 = *p++;
			if (c2 == '$') {
				unsigned char c3 = *p++;
				if (c3 == '@' || c3 == 'B') {
					*state = JISX_0208;
				} else if (c3 == '(') {
					if (p == e) {
						/* Input ends after "ESC $ (" */
						*out++ = MBFL_BAD_INPUT;
						break;
					}
					unsigned char c4 = *p++;
					if (c4 == '@' || c4 == 'B') {
						*state = JISX_0208;
					} else if (c4 == 'D') {
						*state = JISX_0212;
					} else {
						if ((limit - out) < 3) {
							/* Not enough room for all 3 outputs; rewind to the
							 * ESC so the sequence is handled on the next call */
							p -= 4;
							break;
						}
						*out++ = MBFL_BAD_INPUT;
						*out++ = '$';
						*out++ = '(';
						/* Un-read c4 so it is decoded as an ordinary byte */
						p--;
					}
				} else {
					if ((limit - out) < 2) {
						/* Not enough room for both outputs; rewind to the ESC */
						p -= 3;
						break;
					}
					*out++ = MBFL_BAD_INPUT;
					*out++ = '$';
					/* Un-read c3 so it is decoded as an ordinary byte */
					p--;
				}
			} else if (c2 == '(') {
				unsigned char c3 = *p++;
				if (c3 == 'B' || c3 == 'H') {
					*state = ASCII;
				} else if (c3 == 'J') {
					*state = JISX_0201_LATIN;
				} else if (c3 == 'I') {
					*state = JISX_0201_KANA;
				} else {
					if ((limit - out) < 2) {
						/* Not enough room for both outputs; rewind to the ESC */
						p -= 3;
						break;
					}
					*out++ = MBFL_BAD_INPUT;
					*out++ = '(';
					/* Un-read c3 so it is decoded as an ordinary byte */
					p--;
				}
			} else {
				/* ESC followed by something other than '$' or '(';
				 * un-read c2 so it is decoded as an ordinary byte */
				*out++ = MBFL_BAD_INPUT;
				p--;
			}
		} else if (c == 0xE) {
			/* "Kana In" marker; this is just for JIS-7/8, but we also accept it for ISO-2022-JP */
			*state = JISX_0201_KANA;
		} else if (c == 0xF) {
			/* "Kana Out" marker */
			*state = ASCII;
		} else if (*state == JISX_0201_LATIN && c == 0x5C) { /* YEN SIGN */
			*out++ = 0xA5;
		} else if (*state == JISX_0201_LATIN && c == 0x7E) { /* OVER LINE */
			*out++ = 0x203E;
		} else if (*state == JISX_0201_KANA && c > 0x20 && c < 0x60) {
			*out++ = 0xFF40 + c;
		} else if (*state >= JISX_0208 && c > 0x20 && c < 0x7F) {
			/* First byte of a 2-byte JIS X 0208 or JIS X 0212 character */
			if (p == e) {
				/* Input ends after the first byte */
				*out++ = MBFL_BAD_INPUT;
				break;
			}
			unsigned char c2 = *p++;
			if (c2 > 0x20 && c2 < 0x7F) {
				/* Linear table index: 94 cells per row */
				unsigned int s = (c - 0x21)*94 + c2 - 0x21;
				uint32_t w = 0;
				if (*state == JISX_0208) {
					if (s < jisx0208_ucs_table_size) {
						w = jisx0208_ucs_table[s];
					}
					/* A zero table entry means there is no mapping */
					if (!w) {
						w = MBFL_BAD_INPUT;
					}
				} else {
					if (s < jisx0212_ucs_table_size) {
						w = jisx0212_ucs_table[s];
					}
					if (!w) {
						w = MBFL_BAD_INPUT;
					}
				}
				*out++ = w;
			} else {
				/* Second byte out of range; both bytes are reported as one error */
				*out++ = MBFL_BAD_INPUT;
			}
		} else if (c < 0x80) {
			*out++ = c;
		} else if (c >= 0xA1 && c <= 0xDF) {
			/* GR-invoked Kana; "GR" stands for "graphics right" and refers to bytes
			 * with the most significant bit set (in the context of ISO-2022 encoding).
			 *
			 * In this regard, Wikipedia states:
			 * "Other, older variants known as JIS7 and JIS8 build directly on the 7-bit and 8-bit
			 * encodings defined by JIS X 0201 and allow use of JIS X 0201 kana from G1 without
			 * escape sequences, using Shift Out and Shift In or setting the eighth bit
			 * (GR-invoked), respectively."
			 *
			 * Note that we support both the 'JIS7' use of 0xE/0xF Shift In/Shift Out codes
			 * and the 'JIS8' use of GR-invoked Kana */
			*out++ = 0xFEC0 + c;
		} else {
			*out++ = MBFL_BAD_INPUT;
		}
	}

	/* Report how much input remains unconsumed */
	*in_len = e - p;
	*in = p;
	return out - buf;
}
583 
/* Encode `len` codepoints from `in` as ISO-2022-JP, appending the bytes to `buf`.
 *
 * buf->state tracks which character set is currently selected in the output;
 * an escape sequence is emitted whenever the next character needs a different
 * one. If `end` is true and the output is not in ASCII mode when all input has
 * been consumed, "ESC ( B" is appended.
 * Codepoints with no usable mapping are reported through MB_CONVERT_ERROR. */
static void mb_wchar_to_iso2022jp(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);

	while (len > 0) {
		/* From here on, `len` is the number of codepoints remaining after this one */
		len--;

		uint32_t w = *in++;
		unsigned int s = 0;

		if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) {
			s = ucs_a1_jis_table[w - ucs_a1_jis_table_min];
		} else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) {
			s = ucs_a2_jis_table[w - ucs_a2_jis_table_min];
		} else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) {
			s = ucs_i_jis_table[w - ucs_i_jis_table_min];
		} else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) {
			s = ucs_r_jis_table[w - ucs_r_jis_table_min];
		}

		if (s == 0) {
			/* No table entry; a few codepoints have hard-coded mappings */
			switch (w) {
			case 0xA5: /* YEN SIGN */
				s = 0x1005C;
				break;
			case 0xFF3C: /* FULLWIDTH REVERSE SOLIDUS */
				s = 0x2140;
				break;
			case 0x2225: /* PARALLEL TO */
				s = 0x2142;
				break;
			case 0xFF0D: /* FULLWIDTH HYPHEN-MINUS */
				s = 0x215D;
				break;
			case 0xFFE0: /* FULLWIDTH CENT SIGN */
				s = 0x2171;
				break;
			case 0xFFE1: /* FULLWIDTH POUND SIGN */
				s = 0x2172;
				break;
			case 0xFFE2: /* FULLWIDTH NOT SIGN */
				s = 0x224C;
				break;
			case 0:
				/* NUL is emitted as a NUL byte */
				break;
			default: {
				MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_iso2022jp);
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
				continue;
			}
			}
		} else if ((s >= 0x80 && s < 0x2121) || (s > 0x8080)) {
			/* Table value is outside the character sets this encoding may use */
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_iso2022jp);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
			continue;
		}

		/* Which character set does `s` belong to? */
		unsigned int charset;
		if (s < 0x80) {
			charset = ASCII;
		} else if (s < 0x8080) {
			charset = JISX_0208;
		} else if (s < 0x10000) {
			charset = JISX_0212;
		} else {
			charset = JISX_0201_LATIN;
		}

		/* Switch the output to that character set if necessary. The extra space
		 * requested covers the escape sequence plus the bytes of this character. */
		if (buf->state != charset) {
			switch (charset) {
			case ASCII:
				MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 4);
				out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
				break;
			case JISX_0208:
				MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 5);
				out = mb_convert_buf_add3(out, 0x1B, '$', 'B');
				break;
			case JISX_0212:
				MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 6);
				out = mb_convert_buf_add4(out, 0x1B, '$', '(', 'D');
				break;
			default: /* JISX_0201_LATIN */
				MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 4);
				out = mb_convert_buf_add3(out, 0x1B, '(', 'J');
				break;
			}
			buf->state = charset;
		}

		if (charset == ASCII) {
			out = mb_convert_buf_add(out, s);
		} else if (charset == JISX_0201_LATIN) {
			out = mb_convert_buf_add(out, s & 0x7F);
		} else {
			out = mb_convert_buf_add2(out, (s >> 8) & 0x7F, s & 0x7F);
		}
	}

	if (end && buf->state != ASCII) {
		/* Leave the output in ASCII mode */
		MB_CONVERT_BUF_ENSURE(buf, out, limit, 3);
		out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
668 
/* Encode `len` codepoints from `in` as JIS, appending the bytes to `buf`.
 *
 * Compared with mb_wchar_to_iso2022jp, this encoder also emits JIS X 0201
 * Kana (via "ESC ( I"), JIS X 0212, and maps U+203E to JIS X 0201 overline.
 *
 * buf->state tracks which character set is currently selected in the output;
 * an escape sequence is emitted whenever the next character needs a different
 * one. If `end` is true and the output is not in ASCII mode when all input has
 * been consumed, "ESC ( B" is appended.
 *
 * Codepoints with no mapping are reported through MB_CONVERT_ERROR, passing
 * this function itself as the encoder (every other encoder in this file passes
 * itself), so that any error marker is written using JIS rules rather than
 * those of ISO-2022-JP. */
static void mb_wchar_to_jis(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);

	while (len--) {
		uint32_t w = *in++;
		unsigned int s = 0;

		if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) {
			s = ucs_a1_jis_table[w - ucs_a1_jis_table_min];
		} else if (w == 0x203E) { /* OVERLINE */
			s = 0x1007E; /* Convert to JISX 0201 OVERLINE */
		} else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) {
			s = ucs_a2_jis_table[w - ucs_a2_jis_table_min];
		} else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) {
			s = ucs_i_jis_table[w - ucs_i_jis_table_min];
		} else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) {
			s = ucs_r_jis_table[w - ucs_r_jis_table_min];
		}

		if (s == 0) {
			/* No table entry; a few codepoints have hard-coded mappings */
			if (w == 0xA5) { /* YEN SIGN */
				s = 0x1005C;
			} else if (w == 0xFF3C) { /* FULLWIDTH REVERSE SOLIDUS */
				s = 0x2140;
			} else if (w == 0x2225) { /* PARALLEL TO */
				s = 0x2142;
			} else if (w == 0xFF0D) { /* FULLWIDTH HYPHEN-MINUS */
				s = 0x215D;
			} else if (w == 0xFFE0) { /* FULLWIDTH CENT SIGN */
				s = 0x2171;
			} else if (w == 0xFFE1) { /* FULLWIDTH POUND SIGN */
				s = 0x2172;
			} else if (w == 0xFFE2) { /* FULLWIDTH NOT SIGN */
				s = 0x224C;
			} else if (w != 0) {
				MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_jis);
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
				continue;
			}
		}

		/* In each branch, the extra space requested when switching character set
		 * covers the escape sequence plus the bytes of this character */
		if (s < 0x80) { /* ASCII */
			if (buf->state != ASCII) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 4);
				out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
				buf->state = ASCII;
			}
			out = mb_convert_buf_add(out, s);
		} else if (s >= 0xA1 && s <= 0xDF) { /* JIS X 0201 Kana */
			if (buf->state != JISX_0201_KANA) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 4);
				out = mb_convert_buf_add3(out, 0x1B, '(', 'I');
				buf->state = JISX_0201_KANA;
			}
			/* Kana is written in its 7-bit form */
			out = mb_convert_buf_add(out, s & 0x7F);
		} else if (s < 0x8080) { /* JIS X 0208 */
			if (buf->state != JISX_0208) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 5);
				out = mb_convert_buf_add3(out, 0x1B, '$', 'B');
				buf->state = JISX_0208;
			}
			out = mb_convert_buf_add2(out, (s >> 8) & 0x7F, s & 0x7F);
		} else if (s < 0x10000) { /* JIS X 0212 */
			if (buf->state != JISX_0212) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 6);
				out = mb_convert_buf_add4(out, 0x1B, '$', '(', 'D');
				buf->state = JISX_0212;
			}
			out = mb_convert_buf_add2(out, (s >> 8) & 0x7F, s & 0x7F);
		} else { /* X 0201 Latin */
			if (buf->state != JISX_0201_LATIN) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 4);
				out = mb_convert_buf_add3(out, 0x1B, '(', 'J');
				buf->state = JISX_0201_LATIN;
			}
			out = mb_convert_buf_add(out, s & 0x7F);
		}
	}

	if (end && buf->state != ASCII) {
		/* Leave the output in ASCII mode */
		MB_CONVERT_BUF_ENSURE(buf, out, limit, 3);
		out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
758 
/* Extra state used by mb_check_jis: kana mode which was entered via the 0x0E
 * ("Kana In") control code rather than via an escape sequence */
#define JISX_0201_KANA_SO 5
760 
mb_check_jis(unsigned char * in,size_t in_len)761 static bool mb_check_jis(unsigned char *in, size_t in_len)
762 {
763 	unsigned char *p = in, *e = p + in_len;
764 	unsigned int state = ASCII;
765 
766 	while (p < e) {
767 		unsigned char c = *p++;
768 		if (c == 0x1B) {
769 			/* ESC seen; this is an escape sequence */
770 			if (state == JISX_0201_KANA_SO) {
771 				return false;
772 			}
773 			if ((e - p) < 2) {
774 				return false;
775 			}
776 			unsigned char c2 = *p++;
777 			if (c2 == '$') {
778 				unsigned char c3 = *p++;
779 				if (c3 == '@' || c3 == 'B') {
780 					state = JISX_0208;
781 				} else if (c3 == '(') {
782 					if (p == e) {
783 						return false;
784 					}
785 					unsigned char c4 = *p++;
786 					if (c4 == '@' || c4 == 'B') {
787 						state = JISX_0208;
788 					} else if (c4 == 'D') {
789 						state = JISX_0212;
790 					} else {
791 						return false;
792 					}
793 				} else {
794 					return false;
795 				}
796 			} else if (c2 == '(') {
797 				unsigned char c3 = *p++;
798 				/* ESC ( H is treated as a sequence transitioning to ASCII for historical reasons.
799 				 * see https://github.com/php/php-src/pull/10828#issuecomment-1478342432. */
800 				if (c3 == 'B' || c3 == 'H') {
801 					state = ASCII;
802 				} else if (c3 == 'J') {
803 					state = JISX_0201_LATIN;
804 				} else if (c3 == 'I') {
805 					state = JISX_0201_KANA;
806 				} else {
807 					return false;
808 				}
809 			} else {
810 				return false;
811 			}
812 		} else if (c == 0xE) {
813 			/* "Kana In" marker */
814 			if (state != ASCII) {
815 				return false;
816 			}
817 			state = JISX_0201_KANA_SO;
818 		} else if (c == 0xF) {
819 			/* "Kana Out" marker */
820 			if (state != JISX_0201_KANA_SO) {
821 				return false;
822 			}
823 			state = ASCII;
824 		} else if ((state == JISX_0208 || state == JISX_0212) && (c > 0x20 && c < 0x7F)) {
825 			if (p == e) {
826 				return false;
827 			}
828 			unsigned char c2 = *p++;
829 			if (c2 > 0x20 && c2 < 0x7F) {
830 				unsigned int s = (c - 0x21)*94 + c2 - 0x21;
831 				if (state == JISX_0208) {
832 					if (s < jisx0208_ucs_table_size && jisx0208_ucs_table[s]) {
833 						continue;
834 					}
835 				} else {
836 					if (s < jisx0212_ucs_table_size && jisx0212_ucs_table[s]) {
837 						continue;
838 					}
839 				}
840 				return false;
841 			} else {
842 				return false;
843 			}
844 		} else if (c < 0x80) {
845 			continue;
846 		} else if (c >= 0xA1 && c <= 0xDF) {
847 			/* GR-invoked Kana */
848 			continue;
849 		} else {
850 			return false;
851 		}
852 	}
853 
854 	return state == ASCII;
855 }
856 
mb_check_iso2022jp(unsigned char * in,size_t in_len)857 static bool mb_check_iso2022jp(unsigned char *in, size_t in_len)
858 {
859 	unsigned char *p = in, *e = p + in_len;
860 	unsigned int state = ASCII;
861 
862 	while (p < e) {
863 		unsigned char c = *p++;
864 		if (c == 0x1B) {
865 			/* ESC seen; this is an escape sequence */
866 			if ((e - p) < 2) {
867 				return false;
868 			}
869 			unsigned char c2 = *p++;
870 			if (c2 == '$') {
871 				unsigned char c3 = *p++;
872 				if (c3 == '@' || c3 == 'B') {
873 					state = JISX_0208;
874 				} else {
875 					return false;
876 				}
877 			} else if (c2 == '(') {
878 				unsigned char c3 = *p++;
879 				if (c3 == 'B') {
880 					state = ASCII;
881 				} else if (c3 == 'J') {
882 					state = JISX_0201_LATIN;
883 				} else {
884 					return false;
885 				}
886 			} else {
887 				return false;
888 			}
889 		} else if (c == 0xE || c == 0xF) {
890 			/* "Kana In" or "Kana Out" marker; ISO-2022-JP is not accepted. */
891 			return false;
892 		} else if (state == JISX_0208 && (c > 0x20 && c < 0x7F)) {
893 			if (p == e) {
894 				return false;
895 			}
896 			unsigned char c2 = *p++;
897 			if (c2 > 0x20 && c2 < 0x7F) {
898 				unsigned int s = (c - 0x21)*94 + c2 - 0x21;
899 				if (s < jisx0208_ucs_table_size && jisx0208_ucs_table[s]) {
900 					continue;
901 				}
902 				return false;
903 			} else {
904 				return false;
905 			}
906 		} else if (c < 0x80) {
907 			continue;
908 		} else {
909 			return false;
910 		}
911 	}
912 
913 	return state == ASCII;
914 }
915 
916 /* Unicode codepoints for emoji are above 0x1F000, but we only store 16-bits
917  * in our tables. Therefore, add 0x10000 to recover the true values.
918  *
919  * Again, for some emoji which are not supported by Unicode, we use codepoints
920  * in the Private Use Area above 0xFE000. Again, add 0xF0000 to recover the
921  * true value. */
/* Expand a 16-bit table value into a full codepoint: values above 0xF000 get
 * 0x10000 added, values above 0xE000 (up to 0xF000) get 0xF0000 added, and
 * all other values are returned unchanged. */
static inline int convert_emoji_cp(int cp)
{
	int offset = 0;

	if (cp > 0xF000) {
		offset = 0x10000;
	} else if (cp > 0xE000) {
		offset = 0xF0000;
	}

	return cp + offset;
}
930 
mbfilter_sjis_emoji_kddi2unicode(int s,int * snd)931 int mbfilter_sjis_emoji_kddi2unicode(int s, int *snd)
932 {
933 	if (s >= mb_tbl_code2uni_kddi1_min && s <= mb_tbl_code2uni_kddi1_max) {
934 		if (s == 0x24C0) { /* Spain */
935 			EMIT_FLAG_EMOJI("ES");
936 		} else if (s == 0x24C1) { /* Russia */
937 			EMIT_FLAG_EMOJI("RU");
938 		} else if (s >= 0x2545 && s <= 0x254A) {
939 			EMIT_FLAG_EMOJI(nflags_kddi[s - 0x2545]);
940 		} else if (s == 0x25BC) {
941 			EMIT_KEYPAD_EMOJI('#');
942 		} else {
943 			*snd = 0;
944 			return convert_emoji_cp(mb_tbl_code2uni_kddi1[s - mb_tbl_code2uni_kddi1_min]);
945 		}
946 	} else if (s >= mb_tbl_code2uni_kddi2_min && s <= mb_tbl_code2uni_kddi2_max) {
947 		if (s == 0x2750) { /* Japan */
948 			EMIT_FLAG_EMOJI("JP");
949 		} else if (s >= 0x27A6 && s <= 0x27AE) {
950 			EMIT_KEYPAD_EMOJI(s - 0x27A6 + '1');
951 		} else if (s == 0x27F7) { /* United States */
952 			EMIT_FLAG_EMOJI("US");
953 		} else if (s == 0x2830) {
954 			EMIT_KEYPAD_EMOJI('0');
955 		} else {
956 			*snd = 0;
957 			return convert_emoji_cp(mb_tbl_code2uni_kddi2[s - mb_tbl_code2uni_kddi2_min]);
958 		}
959 	}
960 	return 0;
961 }
962 
/* Decode one input byte `c` of ISO-2022-JP-KDDI text and pass the resulting
 * codepoint(s) to filter->output_function.
 *
 * The low 4 bits of filter->status record the position inside a multi-byte
 * sequence:
 *   0 - not inside a sequence
 *   1 - first byte of a JIS X 0208 character is stored in filter->cache
 *   2 - ESC was seen
 *   3 - ESC $ was seen
 *   4 - ESC $ ( was seen
 *   5 - ESC ( was seen
 * The remaining bits record the selected character set (0 for ASCII,
 * JISX0201_KANA or JISX0208_KANJI).
 *
 * Invalid input is reported by emitting MBFL_BAD_INPUT. Every call to the
 * output function is wrapped in CK(), which is expected to return early from
 * this function when the output function reports failure. Otherwise 0 is
 * returned. */
static int mbfl_filt_conv_2022jp_mobile_wchar(int c, mbfl_convert_filter *filter)
{
	int c1, s, w, snd = 0;

	switch (filter->status & 0xF) {
	case 0:
		if (c == 0x1B) {
			filter->status += 2;
		} else if (filter->status == JISX0201_KANA && c > 0x20 && c < 0x60) {
			CK((*filter->output_function)(0xFF40 + c, filter->data));
		} else if (filter->status == JISX0208_KANJI && c > 0x20 && c < 0x80) {
			filter->cache = c;
			filter->status += 1;
		} else if (c >= 0 && c < 0x80) { /* ASCII */
			CK((*filter->output_function)(c, filter->data));
		} else if (c > 0xA0 && c < 0xE0) { /* Kana */
			CK((*filter->output_function)(0xFEC0 + c, filter->data));
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	/* JISX 0208, second byte */
	case 1:
		w = 0;
		filter->status &= ~0xF;
		c1 = filter->cache;
		if (c > 0x20 && c < 0x7F) {
			/* (ku*94)+ten index of the two-byte character */
			s = ((c1 - 0x21) * 94) + c - 0x21;

			if (s <= 137) {
				if (s == 31) {
					w = 0xFF3C; /* FULLWIDTH REVERSE SOLIDUS */
				} else if (s == 32) {
					w = 0xFF5E; /* FULLWIDTH TILDE */
				} else if (s == 33) {
					w = 0x2225; /* PARALLEL TO */
				} else if (s == 60) {
					w = 0xFF0D; /* FULLWIDTH HYPHEN-MINUS */
				} else if (s == 80) {
					w = 0xFFE0; /* FULLWIDTH CENT SIGN */
				} else if (s == 81) {
					w = 0xFFE1; /* FULLWIDTH POUND SIGN */
				} else if (s == 137) {
					w = 0xFFE2; /* FULLWIDTH NOT SIGN */
				}
			}

			/* Rows 84-90 are looked up as KDDI emoji; the emoji tables are
			 * indexed 22 rows higher */
			if (s >= (84 * 94) && s < (91 * 94)) {
				s += 22 * 94;
				w = mbfilter_sjis_emoji_kddi2unicode(s, &snd);
				if (w > 0 && snd > 0) {
					/* Two-codepoint emoji: emit the first codepoint now, the
					 * second one (w) is emitted below. Check the result like
					 * every other output call in this function. */
					CK((*filter->output_function)(snd, filter->data));
				}
			}

			if (w == 0) {
				if (s >= cp932ext1_ucs_table_min && s < cp932ext1_ucs_table_max) {
					w = cp932ext1_ucs_table[s - cp932ext1_ucs_table_min];
				} else if (s >= 0 && s < jisx0208_ucs_table_size) {
					w = jisx0208_ucs_table[s];
				}
			}

			if (w <= 0) {
				w = MBFL_BAD_INPUT;
			}
			CK((*filter->output_function)(w, filter->data));
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	/* ESC */
	case 2:
		if (c == '$') {
			filter->status++;
		} else if (c == '(') {
			filter->status += 3;
		} else {
			filter->status &= ~0xF;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	/* ESC $ */
	case 3:
		if (c == '@' || c == 'B') {
			filter->status = JISX0208_KANJI;
		} else if (c == '(') {
			filter->status++;
		} else {
			filter->status &= ~0xF;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	/* ESC $ ( */
	case 4:
		if (c == '@' || c == 'B') {
			filter->status = JISX0208_KANJI;
		} else {
			filter->status &= ~0xF;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	/* ESC ( */
	case 5:
		if (c == 'B' || c == 'J') {
			filter->status = 0; /* ASCII mode */
		} else if (c == 'I') {
			filter->status = JISX0201_KANA;
		} else {
			filter->status &= ~0xF;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;
	}

	return 0;
}
1084 
/* Finish decoding: if the input stopped in the middle of an escape sequence
 * or a two-byte character (low 4 bits of status are non-zero), emit
 * MBFL_BAD_INPUT. Then reset the status and call the downstream flush
 * function, if there is one. Always returns 0. */
static int mbfl_filt_conv_2022jp_mobile_wchar_flush(mbfl_convert_filter *filter)
{
	const int pending_step = filter->status & 0xF;

	if (pending_step != 0) {
		(*filter->output_function)(MBFL_BAD_INPUT, filter->data);
	}
	filter->status = 0;

	if (!filter->flush_function) {
		return 0;
	}

	(*filter->flush_function)(filter->data);
	return 0;
}
1098 
/* Try to convert codepoint `c` to a KDDI emoji code.
 *
 * Returns 1 and stores the emoji code in `*s1` when an emoji was recognized,
 * 0 otherwise.
 *
 * '#' and '0'-'9' may be the first half of a keypad emoji (followed by
 * U+20E3), so they are held back in filter->cache with the low status bits
 * set to 1, and 0 is returned. On the next call either the keypad emoji is
 * produced, or the held character is written out (after switching back to
 * ASCII if another character set was selected) and `c` is processed
 * normally. */
static int mbfilter_unicode2sjis_emoji_kddi(int c, int *s1, mbfl_convert_filter *filter)
{
	int idx;

	if ((filter->status & 0xF) == 1) {
		const int held = filter->cache;

		filter->cache = 0;
		filter->status &= ~0xFF;

		if (c == 0x20E3) {
			switch (held) {
			case '#':
				*s1 = 0x25BC;
				break;
			case '0':
				*s1 = 0x2830;
				break;
			default: /* Held character was '1'-'9' */
				*s1 = 0x27A6 + (held - '1');
				break;
			}
			return 1;
		}

		/* Not a keypad emoji after all; write out the held character */
		if (filter->status & 0xFF00) {
			CK((*filter->output_function)(0x1B, filter->data)); /* ESC */
			CK((*filter->output_function)('(', filter->data));
			CK((*filter->output_function)('B', filter->data));
		}
		CK((*filter->output_function)(held, filter->data));
		filter->status = 0;
	}

	if (c == '#' || (c >= '0' && c <= '9')) {
		/* Possible start of a keypad emoji; wait for the next codepoint */
		filter->status |= 1;
		filter->cache = c;
		return 0;
	}

	if (c == 0xA9) { /* Copyright sign */
		*s1 = 0x27DC;
		return 1;
	}
	if (c == 0xAE) { /* Registered sign */
		*s1 = 0x27DD;
		return 1;
	}

	/* Only the first table whose range contains `c` is searched */
	if (c >= mb_tbl_uni_kddi2code2_min && c <= mb_tbl_uni_kddi2code2_max) {
		idx = mbfl_bisec_srch2(c, mb_tbl_uni_kddi2code2_key, mb_tbl_uni_kddi2code2_len);
		if (idx < 0) {
			return 0;
		}
		*s1 = mb_tbl_uni_kddi2code2_value[idx];
		return 1;
	}

	if (c >= mb_tbl_uni_kddi2code3_min && c <= mb_tbl_uni_kddi2code3_max) {
		idx = mbfl_bisec_srch2(c - 0x10000, mb_tbl_uni_kddi2code3_key, mb_tbl_uni_kddi2code3_len);
		if (idx < 0) {
			return 0;
		}
		*s1 = mb_tbl_uni_kddi2code3_value[idx];
		return 1;
	}

	if (c >= mb_tbl_uni_kddi2code5_min && c <= mb_tbl_uni_kddi2code5_max) {
		idx = mbfl_bisec_srch2(c - 0xF0000, mb_tbl_uni_kddi2code5_key, mb_tbl_uni_kddi2code5_len);
		if (idx < 0) {
			return 0;
		}
		*s1 = mb_tbl_uni_kddi2code5_val[idx];
		return 1;
	}

	return 0;
}
1158 
/* (ku*94)+ten value -> packed two-byte code
 *
 * On entry `s1` holds a (ku*94)+ten index. On exit:
 *   c1 = row byte  (index / 94 + 0x21)
 *   c2 = cell byte (index % 94 + 0x21)
 *   s1 = (c1 << 8) | c2
 *   s2 = 1
 * No Shift-JIS folding is applied; the bytes are plain 0x21-based row/cell
 * values.
 *
 * All four arguments must be assignable lvalues. The body is wrapped in
 * do { } while (0) so that the macro behaves as a single statement, e.g. as
 * the body of an unbraced `if`. */
#define CODE2JIS(c1,c2,s1,s2) do { \
	c1 = (s1)/94+0x21; \
	c2 = (s1)-94*((c1)-0x21)+0x21; \
	s1 = ((c1) << 8) | (c2); \
	s2 = 1; \
} while (0)
1165 
/* Encode one codepoint `c` as ISO-2022-JP-KDDI and pass the resulting bytes
 * to filter->output_function.
 *
 * Bits 8-15 of filter->status record the character set selected in the
 * output (0 = ASCII, 0x100 = JIS X 0201 kana, 0x200 = JIS X 0208); an escape
 * sequence is written whenever the required set differs from the current
 * one. The low bits and filter->cache are used by
 * mbfilter_unicode2sjis_emoji_kddi to hold back a possible keypad emoji
 * prefix.
 *
 * Codepoints with no mapping are handed to mbfl_filt_conv_illegal_output. */
static int mbfl_filt_conv_wchar_2022jp_mobile(int c, mbfl_convert_filter *filter)
{
	int c1, c2, s1 = 0, s2 = 0;

	/* Regular Unicode -> JIS tables */
	if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
		s1 = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
	} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
		s1 = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
	} else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
		s1 = ucs_i_jis_table[c - ucs_i_jis_table_min];
	} else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
		s1 = ucs_r_jis_table[c - ucs_r_jis_table_min];
	}

	/* A few extra mappings for codepoints the tables do not cover */
	if (s1 <= 0) {
		switch (c) {
		case 0xA5: /* YEN SIGN */
			s1 = 0x216F; /* FULLWIDTH YEN SIGN */
			break;
		case 0xFF3C: /* FULLWIDTH REVERSE SOLIDUS */
			s1 = 0x2140;
			break;
		case 0x2225: /* PARALLEL TO */
			s1 = 0x2142;
			break;
		case 0xFF0D: /* FULLWIDTH HYPHEN-MINUS */
			s1 = 0x215d;
			break;
		case 0xFFE0: /* FULLWIDTH CENT SIGN */
			s1 = 0x2171;
			break;
		case 0xFFE1: /* FULLWIDTH POUND SIGN */
			s1 = 0x2172;
			break;
		case 0xFFE2: /* FULLWIDTH NOT SIGN */
			s1 = 0x224c;
			break;
		default:
			break;
		}
	}

	if (mbfilter_unicode2sjis_emoji_kddi(c, &s1, filter) > 0) {
		/* s1 now holds a KDDI emoji code; turn it into the two bytes which
		 * are written in JIS X 0208 mode */
		CODE2JIS(c1,c2,s1,s2);
		s1 -= 0x1600;
	} else if ((filter->status & 0xFF) == 1 && filter->cache) {
		/* `c` was held back as a possible keypad emoji prefix; nothing to
		 * output yet */
		return 0;
	}

	if ((s1 <= 0) || (s1 >= 0xa1a1 && s2 == 0)) { /* not found or X 0212 */
		s1 = -1;
		for (int i = 0; i < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; i++) {
			if (c == cp932ext1_ucs_table[i]) {
				s1 = (((i / 94) + 0x2D) << 8) + (i % 94) + 0x21;
				break;
			}
		}

		if (c == 0) {
			s1 = 0;
		}
	}

	if (s1 < 0) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
		return 0;
	}

	if (s1 < 0x80) { /* ASCII */
		if (filter->status & 0xFF00) {
			CK((*filter->output_function)(0x1B, filter->data)); /* ESC */
			CK((*filter->output_function)('(', filter->data));
			CK((*filter->output_function)('B', filter->data));
		}
		CK((*filter->output_function)(s1, filter->data));
		filter->status = 0;
		return 0;
	}

	if (s1 > 0xA0 && s1 < 0xE0) { /* Kana */
		if ((filter->status & 0xFF00) != 0x100) {
			CK((*filter->output_function)(0x1B, filter->data)); /* ESC */
			CK((*filter->output_function)('(', filter->data));
			CK((*filter->output_function)('I', filter->data));
		}
		filter->status = 0x100;
		CK((*filter->output_function)(s1 & 0x7F, filter->data));
		return 0;
	}

	if (s1 < 0x7E7F) { /* JIS X 0208 */
		if ((filter->status & 0xFF00) != 0x200) {
			CK((*filter->output_function)(0x1B, filter->data)); /* ESC */
			CK((*filter->output_function)('$', filter->data));
			CK((*filter->output_function)('B', filter->data));
		}
		filter->status = 0x200;
		CK((*filter->output_function)((s1 >> 8) & 0xFF, filter->data));
		CK((*filter->output_function)(s1 & 0x7F, filter->data));
	}

	return 0;
}
1254 
/* Finish encoding: switch the output back to ASCII if another character set
 * is selected, write out a held-back '#' or digit (a keypad emoji prefix
 * which was never completed), reset the filter state and call the downstream
 * flush function, if there is one. Always returns 0. */
static int mbfl_filt_conv_wchar_2022jp_mobile_flush(mbfl_convert_filter *filter)
{
	int held;

	/* Go back to ASCII mode (so strings can be safely concatenated) */
	if (filter->status & 0xFF00) {
		(*filter->output_function)(0x1B, filter->data); /* ESC */
		(*filter->output_function)('(', filter->data);
		(*filter->output_function)('B', filter->data);
	}

	held = filter->cache;
	if ((filter->status & 0xFF) == 1) {
		if (held == '#' || (held >= '0' && held <= '9')) {
			(*filter->output_function)(held, filter->data);
		}
	}
	filter->cache = 0;
	filter->status = 0;

	if (!filter->flush_function) {
		return 0;
	}

	(*filter->flush_function)(filter->data);
	return 0;
}
1276 
/* Decode ISO-2022-JP-KDDI bytes into codepoints.
 *
 * in, in_len: input position and number of remaining bytes; both are updated
 *             to reflect how much input was consumed
 * buf, bufsize: output array and its capacity in codepoints
 * state: currently selected character set (ASCII, JISX0201_KANA or
 *        JISX0208_KANJI); updated when an escape sequence is decoded
 *
 * Returns the number of codepoints written to `buf`. Invalid or truncated
 * sequences are reported as MBFL_BAD_INPUT. */
static size_t mb_iso2022jp_kddi_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
{
	unsigned char *p = *in, *e = p + *in_len;
	/* One slot is kept in reserve, because a single two-byte character can
	 * decode to two codepoints (flag and keypad emoji) */
	uint32_t *out = buf, *limit = buf + bufsize - 1;

	while (p < e && out < limit) {
		unsigned char c = *p++;

		if (c == 0x1B) {
			/* Escape sequence; at least two more bytes are needed */
			if ((e - p) < 2) {
				p = e;
				*out++ = MBFL_BAD_INPUT;
				break;
			}
			unsigned char c2 = *p++;
			unsigned char c3 = *p++;

			if (c2 == '$') {
				if (c3 == '@' || c3 == 'B') {
					*state = JISX0208_KANJI;
				} else if (c3 == '(') {
					/* ESC $ ( needs a fourth byte */
					if (p == e) {
						*out++ = MBFL_BAD_INPUT;
						break;
					}
					unsigned char c4 = *p++;

					if (c4 == '@' || c4 == 'B') {
						*state = JISX0208_KANJI;
					} else {
						*out++ = MBFL_BAD_INPUT;
					}
				} else {
					*out++ = MBFL_BAD_INPUT;
				}
			} else if (c2 == '(') {
				if (c3 == 'B' || c3 == 'J') {
					*state = ASCII;
				} else if (c3 == 'I') {
					*state = JISX0201_KANA;
				} else {
					*out++ = MBFL_BAD_INPUT;
				}
			} else {
				/* Unknown byte after ESC: only ESC and c2 are consumed; c3 is
				 * put back so that it is decoded on the next iteration */
				p--;
				*out++ = MBFL_BAD_INPUT;
			}
		} else if (*state == JISX0201_KANA && c >= 0x21 && c <= 0x5F) {
			*out++ = 0xFF40 + c;
		} else if (*state == JISX0208_KANJI && c >= 0x21 && c <= 0x7F) {
			/* First byte of a two-byte character */
			if (p == e) {
				*out++ = MBFL_BAD_INPUT;
				break;
			}
			unsigned char c2 = *p++;

			if (c2 >= 0x21 && c2 <= 0x7E) {
				/* (ku*94)+ten index of the character */
				unsigned int s = ((c - 0x21) * 94) + c2 - 0x21;
				uint32_t w = 0;

				if (s <= 137) {
					if (s == 31) {
						w = 0xFF3C; /* FULLWIDTH REVERSE SOLIDUS */
					} else if (s == 32) {
						w = 0xFF5E; /* FULLWIDTH TILDE */
					} else if (s == 33) {
						w = 0x2225; /* PARALLEL TO */
					} else if (s == 60) {
						w = 0xFF0D; /* FULLWIDTH HYPHEN-MINUS */
					} else if (s == 80) {
						w = 0xFFE0; /* FULLWIDTH CENT SIGN */
					} else if (s == 81) {
						w = 0xFFE1; /* FULLWIDTH POUND SIGN */
					} else if (s == 137) {
						w = 0xFFE2; /* FULLWIDTH NOT SIGN */
					}
				}

				/* Rows 84-90 are looked up as KDDI emoji; the emoji tables
				 * are indexed 22 rows higher */
				if (s >= (84 * 94) && s < (91 * 94)) {
					int snd = 0;
					s += 22 * 94;
					w = mbfilter_sjis_emoji_kddi2unicode(s, &snd);
					if (w && snd) {
						/* Two-codepoint emoji: `snd` goes first, `w` follows
						 * below (this is what the reserved slot is for) */
						*out++ = snd;
					}
				}

				if (!w) {
					if (s >= cp932ext1_ucs_table_min && s < cp932ext1_ucs_table_max) {
						w = cp932ext1_ucs_table[s - cp932ext1_ucs_table_min];
					} else if (s < jisx0208_ucs_table_size) {
						w = jisx0208_ucs_table[s];
					}
				}

				*out++ = w ? w : MBFL_BAD_INPUT;
			} else {
				*out++ = MBFL_BAD_INPUT;
			}
		} else if (c <= 0x7F) {
			/* Passed through unchanged */
			*out++ = c;
		} else if (c >= 0xA1 && c <= 0xDF) {
			/* Single-byte value in the 8-bit kana range, accepted whichever
			 * character set is selected */
			*out++ = 0xFEC0 + c;
		} else {
			*out++ = MBFL_BAD_INPUT;
		}
	}

	*in_len = e - p;
	*in = p;
	return out - buf;
}
1389 
/* Encode `len` codepoints from `in` as ISO-2022-JP-KDDI, appending the bytes
 * to `buf`.
 *
 * buf->state records the character set currently selected in the output
 * (ASCII, JISX0201_KANA or JISX0208_KANJI); an escape sequence is written
 * whenever the required set differs from it. When `end` is true and the
 * output is not in ASCII mode, a final ESC ( B is appended.
 *
 * Codepoints which cannot be encoded are passed to MB_CONVERT_ERROR.
 *
 * NOTE(review): the MB_CONVERT_BUF_* macros are defined elsewhere; they
 * appear to load/store the output pointers and to make sure that the given
 * number of bytes can still be written - confirm against their definition. */
static void mb_wchar_to_iso2022jp_kddi(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	while (len--) {
		uint32_t w = *in++;
		unsigned int s = 0;

		/* Regular Unicode -> JIS tables */
		if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) {
			s = ucs_a1_jis_table[w - ucs_a1_jis_table_min];
		} else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) {
			s = ucs_a2_jis_table[w - ucs_a2_jis_table_min];
		} else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) {
			s = ucs_i_jis_table[w - ucs_i_jis_table_min];
		} else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) {
			s = ucs_r_jis_table[w - ucs_r_jis_table_min];
		}

		/* A few extra mappings for codepoints the tables do not cover */
		if (!s) {
			if (w == 0xA5) { /* YEN SIGN */
				s = 0x216F; /* FULLWIDTH YEN SIGN */
			} else if (w == 0xFF3C) { /* FULLWIDTH REVERSE SOLIDUS */
				s = 0x2140;
			} else if (w == 0x2225) { /* PARALLEL TO */
				s = 0x2142;
			} else if (w == 0xFF0D) { /* FULLWIDTH HYPHEN-MINUS */
				s = 0x215D;
			} else if (w == 0xFFE0) { /* FULLWIDTH CENT SIGN */
				s = 0x2171;
			} else if (w == 0xFFE1) { /* FULLWIDTH POUND SIGN */
				s = 0x2172;
			} else if (w == 0xFFE2) { /* FULLWIDTH NOT SIGN */
				s = 0x224C;
			}
		}

		/* '#' or a digit followed by U+20E3 is a keypad emoji. The lookahead
		 * is only possible when another codepoint is available in this call
		 * (`len` is non-zero) */
		if ((w == '#' || (w >= '0' && w <= '9')) && len) {
			uint32_t w2 = *in++; len--;

			if (w2 == 0x20E3) {
				unsigned int s1 = 0;
				if (w == '#') {
					s1 = 0x25BC;
				} else if (w == '0') {
					s1 = 0x2830;
				} else { /* Previous character was '1'-'9' */
					s1 = 0x27A6 + (w - '1');
				}
				/* KDDI emoji code -> two bytes written in JIS X 0208 mode */
				s = (((s1 / 94) + 0x21) << 8) + ((s1 % 94) + 0x21) - 0x1600;
			} else {
				/* Not a keypad emoji; put the second codepoint back */
				in--; len++;
			}
		} else if (w >= NFLAGS('C') && w <= NFLAGS('U') && len) { /* C for CN, U for US */
			/* Possibly the first Regional Indicator of a flag emoji */
			uint32_t w2 = *in++; len--;

			if (w2 >= NFLAGS('B') && w2 <= NFLAGS('U')) { /* B for GB, U for RU */
				for (int i = 0; i < 10; i++) {
					if (w == NFLAGS(nflags_s[i][0]) && w2 == NFLAGS(nflags_s[i][1])) {
						unsigned int s1 = nflags_code_kddi[i];
						s = (((s1 / 94) + 0x21) << 8) + ((s1 % 94) + 0x21) - 0x1600;
						goto found_flag_emoji;
					}
				}
			}

			/* Not one of the supported flags; put the second codepoint back */
			in--; len++;
found_flag_emoji: ;
		}

		/* Single-codepoint KDDI emoji */
		if (w == 0xA9) { /* Copyright sign */
			unsigned int s1 = 0x27DC;
			s = (((s1 / 94) + 0x21) << 8) + ((s1 % 94) + 0x21) - 0x1600;
		} else if (w == 0xAE) { /* Registered sign */
			unsigned int s1 = 0x27DD;
			s = (((s1 / 94) + 0x21) << 8) + ((s1 % 94) + 0x21) - 0x1600;
		} else if (w >= mb_tbl_uni_kddi2code2_min && w <= mb_tbl_uni_kddi2code2_max) {
			int i = mbfl_bisec_srch2(w, mb_tbl_uni_kddi2code2_key, mb_tbl_uni_kddi2code2_len);
			if (i >= 0) {
				unsigned int s1 = mb_tbl_uni_kddi2code2_value[i];
				s = (((s1 / 94) + 0x21) << 8) + ((s1 % 94) + 0x21) - 0x1600;
			}
		} else if (w >= mb_tbl_uni_kddi2code3_min && w <= mb_tbl_uni_kddi2code3_max) {
			int i = mbfl_bisec_srch2(w - 0x10000, mb_tbl_uni_kddi2code3_key, mb_tbl_uni_kddi2code3_len);
			if (i >= 0) {
				unsigned int s1 = mb_tbl_uni_kddi2code3_value[i];
				s = (((s1 / 94) + 0x21) << 8) + ((s1 % 94) + 0x21) - 0x1600;
			}
		} else if (w >= mb_tbl_uni_kddi2code5_min && w <= mb_tbl_uni_kddi2code5_max) {
			int i = mbfl_bisec_srch2(w - 0xF0000, mb_tbl_uni_kddi2code5_key, mb_tbl_uni_kddi2code5_len);
			if (i >= 0) {
				unsigned int s1 = mb_tbl_uni_kddi2code5_val[i];
				s = (((s1 / 94) + 0x21) << 8) + ((s1 % 94) + 0x21) - 0x1600;
			}
		}

		/* Nothing found so far, or the value is 0xA1A1 or above: fall back to
		 * a linear search of the CP932 extension table */
		if (!s || s >= 0xA1A1) {
			s = 0;
			for (int i = 0; i < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; i++) {
				if (w == cp932ext1_ucs_table[i]) {
					s = (((i / 94) + 0x2D) << 8) + (i % 94) + 0x21;
					break;
				}
			}
			/* U+0000 must not pick up a match from the table search above
			 * (presumably the table has zero entries for unused slots) */
			if (w == 0)
				s = 0;
		}

		if (!s && w) {
			/* No mapping for this codepoint */
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_iso2022jp_kddi);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		} else if (s <= 0x7F) {
			/* Single byte, ASCII mode */
			if (buf->state != ASCII) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
				out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
				buf->state = ASCII;
			}
			out = mb_convert_buf_add(out, s);
		} else if (s >= 0xA1 && s <= 0xDF) {
			/* Single byte, JIS X 0201 kana mode */
			if (buf->state != JISX0201_KANA) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
				out = mb_convert_buf_add3(out, 0x1B, '(', 'I');
				buf->state = JISX0201_KANA;
			}
			out = mb_convert_buf_add(out, s & 0x7F);
		} else if (s <= 0x7E7E) {
			/* Two bytes, JIS X 0208 mode */
			if (buf->state != JISX0208_KANJI) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 5);
				out = mb_convert_buf_add3(out, 0x1B, '$', 'B');
				buf->state = JISX0208_KANJI;
			} else {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			}
			out = mb_convert_buf_add2(out, (s >> 8) & 0xFF, s & 0xFF);
		} else {
			/* Value is outside of what this encoding can represent */
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_iso2022jp_kddi);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		}
	}

	/* Go back to ASCII mode at the end of the string */
	if (end && buf->state != ASCII) {
		MB_CONVERT_BUF_ENSURE(buf, out, limit, 3);
		out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
1538 
/* Decode one input byte `c` of EUC-JP-2004, SJIS-2004 or ISO-2022-JP-2004
 * text (selected by filter->from->no_encoding) and pass the resulting
 * codepoint(s) to filter->output_function.
 *
 * The low 4 bits of filter->status record the position inside a multi-byte
 * sequence:
 *   0 - not inside a sequence
 *   1 - first byte of a two-byte character is stored in filter->cache
 *   2 - 0x8E was seen (EUC-JP-2004 kana)
 *   3 - 0x8F was seen (EUC-JP-2004, JIS X 0213 plane 2)
 *   4 - first byte of a JIS X 0213 plane 2 character is stored in filter->cache
 *   5 - first byte of a JIS X 0208 character is stored in filter->cache
 *   6 - ESC was seen
 *   7 - ESC $ was seen
 *   8 - ESC $ ( was seen
 *   9 - ESC ( was seen
 * For ISO-2022-JP-2004, the upper bits record the selected character set:
 * 0 = ASCII, 0x80 = JIS X 0208, 0x90 = JIS X 0213 plane 1,
 * 0xa0 = JIS X 0213 plane 2.
 *
 * Invalid input is reported by emitting MBFL_BAD_INPUT. Returns 0, unless
 * CK() returns early (it wraps every call to the output function). */
static int mbfl_filt_conv_jis2004_wchar(int c, mbfl_convert_filter *filter)
{
	int k;
	int c1, c2, s, s1 = 0, s2 = 0, w = 0, w1;

	switch (filter->status & 0xf) {
	case 0:
		if (c >= 0 && c < 0x80) { /* latin */
			if (filter->from->no_encoding == mbfl_no_encoding_eucjp2004) {
				CK((*filter->output_function)(c, filter->data));
			} else if (filter->from->no_encoding == mbfl_no_encoding_sjis2004) {
				/* 0x5c and 0x7e decode to U+00A5 and U+203E instead of
				 * themselves */
				if (c == 0x5c) {
					CK((*filter->output_function)(0x00a5, filter->data));
				} else if (c == 0x7e) {
					CK((*filter->output_function)(0x203e, filter->data));
				} else {
					CK((*filter->output_function)(c, filter->data));
				}
			} else { /* ISO-2022-JP-2004 */
				if (c == 0x1b) {
					filter->status += 6;
				} else if ((filter->status == 0x80 || filter->status == 0x90 || filter->status == 0xa0)
				   && c > 0x20 && c < 0x7f) { /* kanji first char */
					filter->cache = c;
					if (filter->status == 0x90) {
						filter->status += 1; /* JIS X 0213 plane 1 */
					} else if (filter->status == 0xa0) {
						filter->status += 4; /* JIS X 0213 plane 2 */
					} else {
						filter->status += 5; /* JIS X 0208 */
					}
				} else {
					CK((*filter->output_function)(c, filter->data));
				}
			}
		} else {
			if (filter->from->no_encoding == mbfl_no_encoding_eucjp2004) {
				if (c > 0xa0 && c < 0xff) { /* X 0213 plane 1 first char */
					filter->status = 1;
					filter->cache = c;
				} else if (c == 0x8e) { /* kana first char */
					filter->cache = 0x8E; /* So error will be reported if input is truncated right here */
					filter->status = 2;
				} else if (c == 0x8f) { /* X 0213 plane 2 first char */
					filter->status = 3;
				} else {
					CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
				}
			} else if (filter->from->no_encoding == mbfl_no_encoding_sjis2004) {
				if (c > 0xa0 && c < 0xe0) { /* kana */
					CK((*filter->output_function)(0xfec0 + c, filter->data));
				} else if (c > 0x80 && c < 0xfd && c != 0xa0) {	/* kanji first char */
					filter->status = 1;
					filter->cache = c;
				} else {
					CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
				}
			} else {
				/* ISO-2022-JP-2004 does not accept bytes outside 0x00-0x7f */
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			}
		}
		break;

	case 1: /* kanji second char */
		filter->status &= ~0xf;
		c1 = filter->cache;

		/* Bring both bytes into a common 7-bit form (s1, s2) */
		if (filter->from->no_encoding == mbfl_no_encoding_eucjp2004) {
			if (c > 0xa0 && c < 0xff) {
				s1 = c1 - 0x80;
				s2 = c - 0x80;
			} else {
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
				break;
			}
		} else if (filter->from->no_encoding == mbfl_no_encoding_sjis2004) {
			if (c >= 0x40 && c <= 0xfc && c != 0x7f) {
				SJIS_DECODE(c1, c, s1, s2);
			} else {
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
				break;
			}
		} else { /* ISO-2022-JP-2004 */
			if (c >= 0x21 && c <= 0x7E) {
				s1 = c1;
				s2 = c;
			} else {
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
				break;
			}
		}
		w1 = (s1 << 8) | s2;

		/* conversion for combining characters */
		/* These codes decode to two codepoints: the first one is emitted
		 * right here, the second one is left in `w` and emitted at the end
		 * of this case */
		if ((w1 >= 0x2477 && w1 <= 0x2479) || (w1 >= 0x2479 && w1 <= 0x247B) ||
			(w1 >= 0x2577 && w1 <= 0x257E) || w1 == 0x2678 || w1 == 0x2B44 ||
			(w1 >= 0x2B48 && w1 <= 0x2B4F) || (w1 >= 0x2B65 && w1 <= 0x2B66)) {
			k = mbfl_bisec_srch2(w1, jisx0213_u2_key, jisx0213_u2_tbl_len);
			if (k >= 0) {
				w = jisx0213_u2_tbl[2*k];
				CK((*filter->output_function)(w, filter->data));
				w = jisx0213_u2_tbl[2*k+1];
			}
		}

		/* conversion for BMP  */
		if (w <= 0) {
			/* From here on w1 is the (ku*94)+ten table index */
			w1 = (s1 - 0x21)*94 + s2 - 0x21;
			if (w1 >= 0 && w1 < jisx0213_ucs_table_size) {
				w = jisx0213_ucs_table[w1];
			}
		}

		/* conversion for CJK Unified Ideographs ext.B (U+2XXXX) */
		if (w <= 0) {
			k = mbfl_bisec_srch2(w1, jisx0213_jis_u5_key, jisx0213_u5_tbl_len);
			if (k >= 0) {
				w = jisx0213_jis_u5_tbl[k] + 0x20000;
			}
		}

		if (w <= 0) {
			w = MBFL_BAD_INPUT;
		}
		CK((*filter->output_function)(w, filter->data));
		break;

	case 2: /* got 0x8e: EUC-JP-2004 kana */
		filter->status = 0;
		if (c > 0xa0 && c < 0xe0) {
			w = 0xfec0 + c;
			CK((*filter->output_function)(w, filter->data));
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 3: /* X 0213 plane 2 first char: EUC-JP-2004 (0x8f) */
		/* Only these lead bytes are accepted; the same set of rows is
		 * checked again (as s1) in case 4 */
		if (c == 0xA1 || (c >= 0xA3 && c <= 0xA5) || c == 0xA8 || (c >= 0xAC && c <= 0xAF) || (c >= 0xEE && c <= 0xFE)) {
			filter->cache = c - 0x80;
			filter->status++;
		} else {
			filter->status = 0;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 4: /* X 0213 plane 2 second char: EUC-JP-2004, ISO-2022-JP-2004 */
		filter->status &= ~0xF;
		c1 = filter->cache;
		if (filter->from->no_encoding == mbfl_no_encoding_eucjp2004) {
			c2 = c - 0x80;
		} else {
			c2 = c;
		}

		if (c2 < 0x21 || c2 > 0x7E) {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			break;
		}

		/* Zero-based row and cell */
		s1 = c1 - 0x21;
		s2 = c2 - 0x21;

		if (((s1 >= 0 && s1 <= 4 && s1 != 1) || s1 == 7 || (s1 >= 11 && s1 <= 14) ||
			(s1 >= 77 && s1 < 94)) && s2 >= 0 && s2 < 94) {
			/* calc offset from ku */
			for (k = 0; k < jisx0213_p2_ofst_len; k++) {
				if (s1 == jisx0213_p2_ofst[k]) {
					break;
				}
			}
			k -= jisx0213_p2_ofst[k];

			/* check for japanese chars in BMP */
			s = (s1 + 94 + k)*94 + s2;
			ZEND_ASSERT(s < jisx0213_ucs_table_size);
			w = jisx0213_ucs_table[s];

			/* check for japanese chars in CJK Unified Ideographs ext.B (U+2XXXX) */
			if (w <= 0) {
				k = mbfl_bisec_srch2(s, jisx0213_jis_u5_key, jisx0213_u5_tbl_len);
				if (k >= 0) {
					w = jisx0213_jis_u5_tbl[k] + 0x20000;
				}
			}

			if (w <= 0) {
				w = MBFL_BAD_INPUT;
			}

			CK((*filter->output_function)(w, filter->data));
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 5: /* X 0208: ISO-2022-JP-2004 */
		filter->status &= ~0xf;
		c1 = filter->cache;
		if (c > 0x20 && c < 0x7f) {
			s = (c1 - 0x21)*94 + c - 0x21;
			if (s >= 0 && s < jisx0208_ucs_table_size) {
				w = jisx0208_ucs_table[s];
			}
		}

		/* `w` is still 0 here if the second byte was out of range */
		if (w <= 0) {
			w = MBFL_BAD_INPUT;
		}

		CK((*filter->output_function)(w, filter->data));
		break;

	/* ESC: ISO-2022-JP-2004 */
/*	case 0x06:	*/
/*	case 0x16:	*/
/*	case 0x26:	*/
/*	case 0x86:	*/
/*	case 0x96:	*/
/*	case 0xa6:	*/
	case 6:
		if (c == '$') {
			filter->status++;
		} else if (c == '(') {
			filter->status += 3;
		} else {
			/* Invalid escape sequence; the selected character set (upper
			 * bits of status) is kept */
			filter->status &= ~0xf;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	/* ESC $: ISO-2022-JP-2004 */
/*	case 0x07:	*/
/*	case 0x17:	*/
/*	case 0x27:	*/
/*	case 0x87:	*/
/*	case 0x97:	*/
/*	case 0xa7:	*/
	case 7:
		if (c == 'B') { /* JIS X 0208-1983 */
			filter->status = 0x80;
		} else if (c == '(') {
			filter->status++;
		} else {
			filter->status &= ~0xf;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	/* ESC $ (: ISO-2022-JP-2004 */
/*	case 0x08:	*/
/*	case 0x18:	*/
/*	case 0x28:	*/
/*	case 0x88:	*/
/*	case 0x98:	*/
/*	case 0xa8:	*/
	case 8:
		if (c == 'Q') { /* JIS X 0213 plane 1 */
			filter->status = 0x90;
		} else if (c == 'P') { /* JIS X 0213 plane 2 */
			filter->status = 0xa0;
		} else {
			filter->status &= ~0xf;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	/* ESC (: ISO-2022-JP-2004 */
/*	case 0x09:	*/
/*	case 0x19:	*/
/*	case 0x29:	*/
/*	case 0x89:	*/
/*	case 0x99:	*/
	case 9:
		if (c == 'B') {
			/* Back to ASCII */
			filter->status = 0;
		} else {
			filter->status &= ~0xf;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

		EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
1827 
/* Finish decoding: if the input stopped in the middle of an escape sequence
 * or a multi-byte character (low 4 bits of status are non-zero), emit
 * MBFL_BAD_INPUT. Then reset the status and hand over to the downstream
 * flush function, whose result is returned; 0 is returned when there is no
 * flush function. */
static int mbfl_filt_conv_jis2004_wchar_flush(mbfl_convert_filter *filter)
{
	if ((filter->status & 0xF) != 0) {
		CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
	}
	filter->status = 0;

	return filter->flush_function ? (*filter->flush_function)(filter->data) : 0;
}
1841 
mbfl_filt_conv_wchar_jis2004(int c,mbfl_convert_filter * filter)1842 static int mbfl_filt_conv_wchar_jis2004(int c, mbfl_convert_filter *filter)
1843 {
1844 	int k;
1845 	int c1, c2, s1, s2;
1846 
1847 retry:
1848 	s1 = 0;
1849 	/* check for 1st char of combining characters */
1850 	if ((filter->status & 0xf) == 0 && (
1851 			c == 0x00E6 ||
1852 			(c >= 0x0254 && c <= 0x02E9) ||
1853 			(c >= 0x304B && c <= 0x3053) ||
1854 			(c >= 0x30AB && c <= 0x30C8) ||
1855 			c == 0x31F7)) {
1856 		for (k = 0; k < jisx0213_u2_tbl_len; k++) {
1857 			if (c == jisx0213_u2_tbl[2*k]) {
1858 				filter->status++;
1859 				filter->cache = k;
1860 				return 0;
1861 			}
1862 		}
1863 	}
1864 
1865 	/* check for 2nd char of combining characters */
1866 	if ((filter->status & 0xf) == 1 && filter->cache >= 0 && filter->cache < jisx0213_u2_tbl_len) {
1867 		k = filter->cache;
1868 		filter->status &= ~0xf;
1869 		filter->cache = 0;
1870 
1871 		c1 = jisx0213_u2_tbl[2*k];
1872 		if ((c1 == 0x0254 || c1 == 0x028C || c1 == 0x0259 || c1 == 0x025A) && c == 0x0301) {
1873 			k++;
1874 		}
1875 		if (c == jisx0213_u2_tbl[2*k+1]) {
1876 			s1 = jisx0213_u2_key[k];
1877 		} else { /* fallback */
1878 			s1 = jisx0213_u2_fb_tbl[k];
1879 
1880 			if (filter->to->no_encoding == mbfl_no_encoding_sjis2004) {
1881 				c1 = (s1 >> 8) & 0xff;
1882 				c2 = s1 & 0xff;
1883 				SJIS_ENCODE(c1, c2, s1, s2);
1884 			} else if (filter->to->no_encoding == mbfl_no_encoding_eucjp2004) {
1885 				s2 = (s1 & 0xff) + 0x80;
1886 				s1 = ((s1 >> 8) & 0xff) + 0x80;
1887 			} else {
1888 				if (filter->status != 0x200) {
1889 					CK((*filter->output_function)(0x1b, filter->data));
1890 					CK((*filter->output_function)('$', filter->data));
1891 					CK((*filter->output_function)('(', filter->data));
1892 					CK((*filter->output_function)('Q', filter->data));
1893 				}
1894 				filter->status = 0x200;
1895 
1896 				s2 = s1 & 0x7f;
1897 				s1 = (s1 >> 8) & 0x7f;
1898 			}
1899 
1900 			/* Flush out cached data */
1901 			CK((*filter->output_function)(s1, filter->data));
1902 			CK((*filter->output_function)(s2, filter->data));
1903 			goto retry;
1904 		}
1905 	}
1906 
1907 	/* check for major japanese chars: U+4E00 - U+9FFF */
1908 	if (s1 <= 0) {
1909 		for (k = 0; k < uni2jis_tbl_len; k++) {
1910 			if (c >= uni2jis_tbl_range[k][0] && c <= uni2jis_tbl_range[k][1]) {
1911 				s1 = uni2jis_tbl[k][c-uni2jis_tbl_range[k][0]];
1912 				break;
1913 			}
1914 		}
1915 	}
1916 
1917 	/* check for japanese chars in compressed mapping area: U+1E00 - U+4DBF */
1918 	if (s1 <= 0 && c >= ucs_c1_jisx0213_min && c <= ucs_c1_jisx0213_max) {
1919 		k = mbfl_bisec_srch(c, ucs_c1_jisx0213_tbl, ucs_c1_jisx0213_tbl_len);
1920 		if (k >= 0) {
1921 			s1 = ucs_c1_jisx0213_ofst[k] + c - ucs_c1_jisx0213_tbl[2*k];
1922 		}
1923 	}
1924 
1925 	/* check for japanese chars in CJK Unified Ideographs ext.B (U+2XXXX) */
1926 	if (s1 <= 0 && c >= jisx0213_u5_tbl_min && c <= jisx0213_u5_tbl_max) {
1927 		k = mbfl_bisec_srch2(c - 0x20000, jisx0213_u5_jis_key, jisx0213_u5_tbl_len);
1928 		if (k >= 0) {
1929 			s1 = jisx0213_u5_jis_tbl[k];
1930 		}
1931 	}
1932 
1933 	if (s1 <= 0) {
1934 		/* CJK Compatibility Forms: U+FE30 - U+FE4F */
1935 		if (c == 0xfe45) {
1936 			s1 = 0x233e;
1937 		} else if (c == 0xfe46) {
1938 			s1 = 0x233d;
1939 		} else if (c >= 0xf91d && c <= 0xf9dc) {
1940 			/* CJK Compatibility Ideographs: U+F900 - U+F92A */
1941 			k = mbfl_bisec_srch2(c, ucs_r2b_jisx0213_cmap_key, ucs_r2b_jisx0213_cmap_len);
1942 			if (k >= 0) {
1943 				s1 = ucs_r2b_jisx0213_cmap_val[k];
1944 			}
1945 		}
1946 	}
1947 
1948 	if (s1 <= 0) {
1949 		if (c == 0) {
1950 			s1 = 0;
1951 		} else {
1952 			s1 = -1;
1953 		}
1954 	}
1955 
1956 	if (s1 >= 0) {
1957 		if (s1 < 0x80) { /* ASCII */
1958 			if (filter->to->no_encoding == mbfl_no_encoding_2022jp_2004 && (filter->status & 0xff00)) {
1959 				CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
1960 				CK((*filter->output_function)('(', filter->data));
1961 				CK((*filter->output_function)('B', filter->data));
1962 			}
1963 			filter->status = 0;
1964 			CK((*filter->output_function)(s1, filter->data));
1965 		} else if (s1 < 0x100) { /* latin or kana */
1966 			if (filter->to->no_encoding == mbfl_no_encoding_eucjp2004) {
1967 				CK((*filter->output_function)(0x8e, filter->data));
1968 				CK((*filter->output_function)(s1, filter->data));
1969 			} else if (filter->to->no_encoding == mbfl_no_encoding_sjis2004 && (s1 >= 0xA1 && s1 <= 0xDF)) {
1970 				CK((*filter->output_function)(s1, filter->data));
1971 			} else {
1972 				CK(mbfl_filt_conv_illegal_output(c, filter));
1973 			}
1974 		} else if (s1 < 0x7f00) { /* X 0213 plane 1 */
1975 			if (filter->to->no_encoding == mbfl_no_encoding_sjis2004) {
1976 				c1 = (s1 >> 8) & 0xff;
1977 				c2 = s1 & 0xff;
1978 				SJIS_ENCODE(c1, c2, s1, s2);
1979 			} else if  (filter->to->no_encoding == mbfl_no_encoding_eucjp2004) {
1980 				s2 = (s1 & 0xff) + 0x80;
1981 				s1 = ((s1 >> 8) & 0xff) + 0x80;
1982 			} else {
1983 				if ((filter->status & 0xff00) != 0x200) {
1984 					CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
1985 					CK((*filter->output_function)('$', filter->data));
1986 					CK((*filter->output_function)('(', filter->data));
1987 					CK((*filter->output_function)('Q', filter->data));
1988 				}
1989 				filter->status = 0x200;
1990 				s2 = s1 & 0xff;
1991 				s1 = (s1 >> 8) & 0xff;
1992 			}
1993 			CK((*filter->output_function)(s1, filter->data));
1994 			CK((*filter->output_function)(s2, filter->data));
1995 		} else { /* X 0213 plane 2 */
1996 			if (filter->to->no_encoding == mbfl_no_encoding_sjis2004) {
1997 				c1 = (s1 >> 8) & 0xff;
1998 				c2 = s1 & 0xff;
1999 				SJIS_ENCODE(c1, c2, s1, s2);
2000 			} else {
2001 				s2 = s1 & 0xff;
2002 				k = ((s1 >> 8) & 0xff) - 0x7f;
2003 				if (k >= 0 && k < jisx0213_p2_ofst_len) {
2004 					s1 = jisx0213_p2_ofst[k] + 0x21;
2005 				}
2006 				if  (filter->to->no_encoding == mbfl_no_encoding_eucjp2004) {
2007 					s2 |= 0x80;
2008 					s1 |= 0x80;
2009 					CK((*filter->output_function)(0x8f, filter->data));
2010 				} else {
2011 					if ((filter->status & 0xff00) != 0x200) {
2012 						CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
2013 						CK((*filter->output_function)('$', filter->data));
2014 						CK((*filter->output_function)('(', filter->data));
2015 						CK((*filter->output_function)('P', filter->data));
2016 					}
2017 					filter->status = 0x200;
2018 				}
2019 			}
2020 
2021 			CK((*filter->output_function)(s1, filter->data));
2022 			CK((*filter->output_function)(s2, filter->data));
2023 		}
2024 	} else {
2025 		CK(mbfl_filt_conv_illegal_output(c, filter));
2026 	}
2027 
2028 	return 0;
2029 }
2030 
/* Flush handler for the Unicode -> SJIS-2004 / EUC-JP-2004 / ISO-2022-JP-2004
 * conversion filter.
 *
 * If a codepoint is still being held back (`filter->status == 1`, with
 * `filter->cache` holding an index into `jisx0213_u2_fb_tbl`), its fallback
 * encoding is written out in the form required by the target encoding. Then, if
 * any of the bits in `filter->status & 0xff00` are set, the escape sequence
 * ESC ( B is emitted. Finally the state is reset and the downstream flush
 * function (if any) is invoked.
 *
 * Returns the downstream flush function's result, or 0 if there is none.
 * (The CK() wrapper around each output call is defined outside this chunk;
 * presumably it returns early when the output function reports failure.) */
static int mbfl_filt_conv_wchar_jis2004_flush(mbfl_convert_filter *filter)
{
	int k, c1, c2, s1, s2;

	k = filter->cache;
	filter->cache = 0;

	/* `k` must be a valid index into `jisx0213_u2_fb_tbl`. The lookup loops in this
	 * file only ever index that table with values strictly below
	 * `jisx0213_u2_tbl_len`, so `k == jisx0213_u2_tbl_len` must be rejected too */
	if (filter->status == 1 && k >= 0 && k < jisx0213_u2_tbl_len) {
		s1 = jisx0213_u2_fb_tbl[k];

		if (filter->to->no_encoding == mbfl_no_encoding_sjis2004) {
			c1 = (s1 >> 8) & 0xff;
			c2 = s1 & 0xff;
			SJIS_ENCODE(c1, c2, s1, s2);
		} else if (filter->to->no_encoding == mbfl_no_encoding_eucjp2004) {
			s2 = (s1 & 0xff) | 0x80;
			s1 = ((s1 >> 8) & 0xff) | 0x80;
		} else {
			s2 = s1 & 0x7f;
			s1 = (s1 >> 8) & 0x7f;
			/* `filter->status` is 1 on this path, so this test always passes and the
			 * ESC $ ( Q sequence is always written before the character */
			if ((filter->status & 0xff00) != 0x200) {
				CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
				CK((*filter->output_function)('$', filter->data));
				CK((*filter->output_function)('(', filter->data));
				CK((*filter->output_function)('Q', filter->data));
			}
			filter->status = 0x200;
		}

		CK((*filter->output_function)(s1, filter->data));
		CK((*filter->output_function)(s2, filter->data));
	}

	/* If we had switched to a different charset, go back to ASCII mode
	 * This makes it possible to concatenate arbitrary valid strings
	 * together and get a valid string */
	if (filter->status & 0xff00) {
		CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
		CK((*filter->output_function)('(', filter->data));
		CK((*filter->output_function)('B', filter->data));
	}

	filter->status = 0;

	if (filter->flush_function) {
		return (*filter->flush_function)(filter->data);
	}

	return 0;
}
2081 
/* Charset-selection states used by the ISO-2022-JP-2004 conversion functions
 * which follow. The decoder stores one of these in `*state`; the encoder compares
 * `buf->state` against them to decide whether an escape sequence is needed. */
#define ASCII 0
#define JISX0208 1
#define JISX0213_PLANE1 2
#define JISX0213_PLANE2 3
2086 
mb_iso2022jp2004_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)2087 static size_t mb_iso2022jp2004_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
2088 {
2089 	unsigned char *p = *in, *e = p + *in_len;
2090 	uint32_t *out = buf, *limit = buf + bufsize - 1;
2091 
2092 	while (p < e && out < limit) {
2093 		unsigned char c = *p++;
2094 
2095 		if (c <= 0x7F) {
2096 			if (c == 0x1B) {
2097 				if ((e - p) < 2) {
2098 					*out++ = MBFL_BAD_INPUT;
2099 					p = e;
2100 					break;
2101 				}
2102 				unsigned char c2 = *p++;
2103 				unsigned char c3 = *p++;
2104 				if (c2 == '$') {
2105 					if (c3 == 'B') {
2106 						*state = JISX0208;
2107 					} else if (c3 == '(') {
2108 						if (p == e) {
2109 							*out++ = MBFL_BAD_INPUT;
2110 							break;
2111 						}
2112 						unsigned char c4 = *p++;
2113 						if (c4 == 'Q') {
2114 							*state = JISX0213_PLANE1;
2115 						} else if (c4 == 'P') {
2116 							*state = JISX0213_PLANE2;
2117 						} else {
2118 							*out++ = MBFL_BAD_INPUT;
2119 						}
2120 					} else {
2121 						*out++ = MBFL_BAD_INPUT;
2122 					}
2123 				} else if (c2 == '(') {
2124 					if (c3 == 'B') {
2125 						*state = ASCII;
2126 					} else {
2127 						*out++ = MBFL_BAD_INPUT;
2128 					}
2129 				} else {
2130 					p--;
2131 					*out++ = MBFL_BAD_INPUT;
2132 				}
2133 			} else if (*state >= JISX0208 && c > 0x20 && c < 0x7F) {
2134 				if (p == e) {
2135 					*out++ = MBFL_BAD_INPUT;
2136 					break;
2137 				}
2138 				unsigned char c2 = *p++;
2139 				if (c2 < 0x21 || c2 > 0x7E) {
2140 					*out++ = MBFL_BAD_INPUT;
2141 					continue;
2142 				}
2143 
2144 				if (*state == JISX0213_PLANE1) {
2145 					unsigned int w1 = (c << 8) | c2;
2146 
2147 					/* Conversion for combining characters */
2148 					if ((w1 >= 0x2477 && w1 <= 0x2479) || (w1 >= 0x2479 && w1 <= 0x247B) || (w1 >= 0x2577 && w1 <= 0x257E) || w1 == 0x2678 || w1 == 0x2B44 || (w1 >= 0x2B48 && w1 <= 0x2B4F) || (w1 >= 0x2B65 && w1 <= 0x2B66)) {
2149 						int k = mbfl_bisec_srch2(w1, jisx0213_u2_key, jisx0213_u2_tbl_len);
2150 						if (k >= 0) {
2151 							*out++ = jisx0213_u2_tbl[2*k];
2152 							*out++ = jisx0213_u2_tbl[2*k+1];
2153 							continue;
2154 						}
2155 					}
2156 
2157 					/* Conversion for BMP */
2158 					uint32_t w = 0;
2159 					w1 = (c - 0x21)*94 + c2 - 0x21;
2160 					if (w1 < jisx0213_ucs_table_size) {
2161 						w = jisx0213_ucs_table[w1];
2162 					}
2163 
2164 					/* Conversion for CJK Unified Ideographs ext.B (U+2XXXX) */
2165 					if (!w) {
2166 						int k = mbfl_bisec_srch2(w1, jisx0213_jis_u5_key, jisx0213_u5_tbl_len);
2167 						if (k >= 0) {
2168 							w = jisx0213_jis_u5_tbl[k] + 0x20000;
2169 						}
2170 					}
2171 
2172 					*out++ = w ? w : MBFL_BAD_INPUT;
2173 				} else if (*state == JISX0213_PLANE2) {
2174 
2175 					unsigned int s1 = c - 0x21, s2 = c2 - 0x21;
2176 
2177 					if (((s1 <= 4 && s1 != 1) || s1 == 7 || (s1 >= 11 && s1 <= 14) || (s1 >= 77 && s1 < 94)) && s2 < 94) {
2178 						int k;
2179 						for (k = 0; k < jisx0213_p2_ofst_len; k++) {
2180 							if (s1 == jisx0213_p2_ofst[k]) {
2181 								break;
2182 							}
2183 						}
2184 						k -= jisx0213_p2_ofst[k];
2185 
2186 						/* Check for Japanese chars in BMP */
2187 						unsigned int s = (s1 + 94 + k)*94 + s2;
2188 						ZEND_ASSERT(s < jisx0213_ucs_table_size);
2189 						uint32_t w = jisx0213_ucs_table[s];
2190 
2191 						/* Check for Japanese chars in CJK Unified Ideographs ext B (U+2XXXX) */
2192 						if (!w) {
2193 							k = mbfl_bisec_srch2(s, jisx0213_jis_u5_key, jisx0213_u5_tbl_len);
2194 							if (k >= 0) {
2195 								w = jisx0213_jis_u5_tbl[k] + 0x20000;
2196 							}
2197 						}
2198 
2199 						*out++ = w ? w : MBFL_BAD_INPUT;
2200 					} else {
2201 						*out++ = MBFL_BAD_INPUT;
2202 					}
2203 				} else { /* state == JISX0208 */
2204 					unsigned int s = (c - 0x21)*94 + c2 - 0x21;
2205 					uint32_t w = 0;
2206 					if (s < jisx0208_ucs_table_size) {
2207 						w = jisx0208_ucs_table[s];
2208 					}
2209 					*out++ = w ? w : MBFL_BAD_INPUT;
2210 				}
2211 			} else {
2212 				*out++ = c;
2213 			}
2214 		} else {
2215 			*out++ = MBFL_BAD_INPUT;
2216 		}
2217 	}
2218 
2219 	*in_len = e - p;
2220 	*in = p;
2221 	return out - buf;
2222 }
2223 
/* Encode `len` codepoints from `in` as ISO-2022-JP-2004, appending the bytes to `buf`.
 *
 * `buf->state` carries state between calls:
 *  - The low byte is compared against ASCII / JISX0213_PLANE1 / JISX0213_PLANE2 to
 *    decide whether an escape sequence must be written before the next character.
 *  - The bits above the low byte, when non-zero, hold (k+1) where `k` is an index
 *    into `jisx0213_u2_tbl` for a codepoint which was the last one of the previous
 *    call, and which might combine with the first codepoint of this call.
 *
 * `end` is true when no more input will follow; in that case a held-back codepoint
 * is written out using its fallback mapping, and ESC ( B is appended if the
 * current charset is not ASCII.
 *
 * (The MB_CONVERT_BUF_* and MB_CONVERT_ERROR macros are defined outside this chunk;
 * presumably ENSURE guarantees the given number of free bytes and may update
 * `out`/`limit` -- confirm against their definitions.) */
static void mb_wchar_to_iso2022jp2004(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	uint32_t w;
	if (buf->state & 0xFF00) {
		/* A codepoint was held back by the previous call; recover it from the
		 * stored table index and resume processing as if it had just been read */
		int k = (buf->state >> 8) - 1;
		w = jisx0213_u2_tbl[2*k];
		buf->state &= 0xFF;
		goto process_codepoint;
	}

	while (len--) {
		w = *in++;
process_codepoint: ;
		/* `s` is the value looked up for `w`; zero means 'not found yet' */
		unsigned int s = 0;

		/* Codepoints which may be the first of a two-codepoint sequence */
		if (w == 0xE6 || (w >= 0x254 && w <= 0x2E9) || (w >= 0x304B && w <= 0x3053) || (w >= 0x30AB && w <= 0x30C8) || w == 0x31F7) {
			for (int k = 0; k < jisx0213_u2_tbl_len; k++) {
				if (w == jisx0213_u2_tbl[2*k]) {
					if (!len) {
						if (!end) {
							/* No following codepoint is available yet; remember
							 * this one in `buf->state` and wait for the next call */
							buf->state |= (k+1) << 8;
							MB_CONVERT_BUF_STORE(buf, out, limit);
							return;
						}
					}	else {
						/* Peek at the following codepoint */
						uint32_t w2 = *in++; len--;
						if ((w == 0x254 || w == 0x28C || w == 0x259 || w == 0x25A) && w2 == 0x301) {
							k++;
						}
						if (w2 == jisx0213_u2_tbl[2*k+1]) {
							s = jisx0213_u2_key[k];
							break;
						}
						/* Not a matching pair; un-read the following codepoint */
						in--; len++;
					}

					/* Fallback: convert the first codepoint on its own */
					s = jisx0213_u2_fb_tbl[k];
					break;
				}
			}
		}

		/* Check for major Japanese chars: U+4E00-U+9FFF */
		if (!s) {
			for (int k = 0; k < uni2jis_tbl_len; k++) {
				if (w >= uni2jis_tbl_range[k][0] && w <= uni2jis_tbl_range[k][1]) {
					s = uni2jis_tbl[k][w - uni2jis_tbl_range[k][0]];
					break;
				}
			}
		}

		/* Check for Japanese chars in compressed mapping area: U+1E00-U+4DBF */
		if (!s && w >= ucs_c1_jisx0213_min && w <= ucs_c1_jisx0213_max) {
			int k = mbfl_bisec_srch(w, ucs_c1_jisx0213_tbl, ucs_c1_jisx0213_tbl_len);
			if (k >= 0) {
				s = ucs_c1_jisx0213_ofst[k] + w - ucs_c1_jisx0213_tbl[2*k];
			}
		}

		/* Check for Japanese chars in CJK Unified Ideographs ext B (U+2XXXX) */
		if (!s && w >= jisx0213_u5_tbl_min && w <= jisx0213_u5_tbl_max) {
			int k = mbfl_bisec_srch2(w - 0x20000, jisx0213_u5_jis_key, jisx0213_u5_tbl_len);
			if (k >= 0) {
				s = jisx0213_u5_jis_tbl[k];
			}
		}

		if (!s) {
			/* CJK Compatibility Forms: U+FE30-U+FE4F */
			if (w == 0xFE45) {
				s = 0x233E;
			} else if (w == 0xFE46) {
				s = 0x233D;
			} else if (w >= 0xF91D && w <= 0xF9DC) {
				/* CJK Compatibility Ideographs: U+F900-U+F92A */
				int k = mbfl_bisec_srch2(w, ucs_r2b_jisx0213_cmap_key, ucs_r2b_jisx0213_cmap_len);
				if (k >= 0) {
					s = ucs_r2b_jisx0213_cmap_val[k];
				}
			}
		}

		/* Emit `s` according to its magnitude:
		 *   0 with non-zero `w`: no mapping was found (error)
		 *   0x00-0x7F:           single byte, ASCII mode
		 *   0x80-0xFF:           treated as an error
		 *   0x100-0x7EFF:        two bytes, after ESC $ ( Q if needed
		 *   above 0x7EFF:        two bytes, after ESC $ ( P if needed */
		if (!s && w) {
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_iso2022jp2004);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		} else if (s <= 0x7F) {
			if (buf->state != ASCII) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
				out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
				buf->state = ASCII;
			}
			out = mb_convert_buf_add(out, s);
		} else if (s <= 0xFF) {
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_iso2022jp2004);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		} else if (s <= 0x7EFF) {
			if (buf->state != JISX0213_PLANE1) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 6);
				out = mb_convert_buf_add4(out, 0x1B, '$', '(', 'Q');
				buf->state = JISX0213_PLANE1;
			} else {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			}
			out = mb_convert_buf_add2(out, (s >> 8) & 0xFF, s & 0xFF);
		} else {
			if (buf->state != JISX0213_PLANE2) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 6);
				out = mb_convert_buf_add4(out, 0x1B, '$', '(', 'P');
				buf->state = JISX0213_PLANE2;
			} else {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			}
			/* The high byte of `s`, less 0x7F, selects an entry of
			 * `jisx0213_p2_ofst`; that entry plus 0x21 is the first output byte */
			unsigned int s2 = s & 0xFF;
			int k = ((s >> 8) & 0xFF) - 0x7F;
			ZEND_ASSERT(k < jisx0213_p2_ofst_len);
			s = jisx0213_p2_ofst[k] + 0x21;
			out = mb_convert_buf_add2(out, s, s2);
		}
	}

	/* At the very end of the output, return to ASCII mode */
	if (end && buf->state != ASCII) {
		MB_CONVERT_BUF_ENSURE(buf, out, limit, 3);
		out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
2356 
/* Decoding filter for the CP5022x encodings: takes one input byte `c` per call and
 * passes any resulting codepoints to `filter->output_function`.
 *
 * `filter->status` is split in two:
 *  - The low 4 bits are the parser state (0 = ready for a new character,
 *    1 = expecting the second byte of a double-byte character, 2-5 = inside an
 *    escape sequence).
 *  - The remaining bits identify the selected charset (0x00, 0x10, 0x20, 0x80 or
 *    0x90; see the comments at the top of the switch).
 * `filter->cache` holds the first byte of a double-byte character.
 *
 * Invalid input is reported by emitting MBFL_BAD_INPUT. Always returns 0, except
 * where the CK() wrapper (defined outside this chunk) returns early -- presumably
 * when the output function fails. */
static int mbfl_filt_conv_cp5022x_wchar(int c, mbfl_convert_filter *filter)
{
	int c1, s, w;

retry:
	switch (filter->status & 0xf) {
/*	case 0x00:	 ASCII */
/*	case 0x10:	 X 0201 latin */
/*	case 0x20:	 X 0201 kana */
/*	case 0x80:	 X 0208 */
/*	case 0x90:	 X 0212 */
	case 0:
		if (c == 0x1b) {
			/* ESC; move to parser state 2, keeping the charset bits */
			filter->status += 2;
		} else if (c == 0x0e) {		/* "kana in" */
			filter->status = 0x20;
		} else if (c == 0x0f) {		/* "kana out" */
			filter->status = 0;
		} else if (filter->status == 0x10 && c == 0x5c) {	/* YEN SIGN */
			CK((*filter->output_function)(0xa5, filter->data));
		} else if (filter->status == 0x10 && c == 0x7e) {	/* OVER LINE */
			CK((*filter->output_function)(0x203e, filter->data));
		} else if (filter->status == 0x20 && c > 0x20 && c < 0x60) {		/* kana */
			CK((*filter->output_function)(0xff40 + c, filter->data));
		} else if ((filter->status == 0x80 || filter->status == 0x90) && c > 0x20 && c <= 0x97) { /* kanji first char */
			filter->cache = c;
			filter->status += 1;
		} else if (c >= 0 && c < 0x80) {		/* latin, CTLs */
			CK((*filter->output_function)(c, filter->data));
		} else if (c > 0xa0 && c < 0xe0) {	/* GR kana */
			CK((*filter->output_function)(0xfec0 + c, filter->data));
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

/*	case 0x81:	 X 0208 second char */
/*	case 0x91:	 X 0212 second char */
	case 1:
		/* Back to parser state 0; only the charset bits remain in `status` */
		filter->status &= ~0xf;
		c1 = filter->cache;
		if (c > 0x20 && c < 0x7f) {
			/* Combine both bytes into a single table index */
			s = (c1 - 0x21)*94 + c - 0x21;
			if (filter->status == 0x80) {
				if (s >= cp932ext1_ucs_table_min && s < cp932ext1_ucs_table_max) {
					w = cp932ext1_ucs_table[s - cp932ext1_ucs_table_min];
				} else if (s >= 0 && s < jisx0208_ucs_table_size) {
					w = jisx0208_ucs_table[s];
				} else if (s >= cp932ext2_ucs_table_min && s < cp932ext2_ucs_table_max) {
					w = cp932ext2_ucs_table[s - cp932ext2_ucs_table_min];
				} else if (s >= cp932ext3_ucs_table_min && s < cp932ext3_ucs_table_max) {
					w = cp932ext3_ucs_table[s - cp932ext3_ucs_table_min];
				} else if (s >= 94 * 94 && s < 114 * 94) {
					/* user-defined => PUA (Microsoft extended) */
					w = s - 94*94 + 0xe000;
				} else {
					w = 0;
				}

				if (w <= 0) {
					w = MBFL_BAD_INPUT;
				}
			} else {
				if (s >= 0 && s < jisx0212_ucs_table_size) {
					w = jisx0212_ucs_table[s];
				} else {
					w = 0;
				}

				if (w <= 0) {
					w = MBFL_BAD_INPUT;
				}
			}
			CK((*filter->output_function)(w, filter->data));
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	/* ESC */
/*	case 0x02:	*/
/*	case 0x12:	*/
/*	case 0x22:	*/
/*	case 0x82:	*/
/*	case 0x92:	*/
	case 2:
		if (c == 0x24) {		/* '$' */
			filter->status++;
		} else if (c == 0x28) {		/* '(' */
			filter->status += 3;
		} else {
			/* Not a recognized escape sequence; emit an error marker, then
			 * process `c` again from parser state 0 */
			filter->status &= ~0xf;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			goto retry;
		}
		break;

	/* ESC $ */
/*	case 0x03:	*/
/*	case 0x13:	*/
/*	case 0x23:	*/
/*	case 0x83:	*/
/*	case 0x93:	*/
	case 3:
		if (c == 0x40 || c == 0x42) {	/* '@' or 'B' */
			filter->status = 0x80;
		} else if (c == 0x28) {			/* '(' */
			filter->status++;
		} else {
			/* Emit an error marker and the '$' which was already consumed,
			 * then process `c` again from parser state 0 */
			filter->status &= ~0xf;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			CK((*filter->output_function)(0x24, filter->data));
			goto retry;
		}
		break;

	/* ESC $ ( */
/*	case 0x04:	*/
/*	case 0x14:	*/
/*	case 0x24:	*/
/*	case 0x84:	*/
/*	case 0x94:	*/
	case 4:
		if (c == 0x40 || c == 0x42) {	/* '@' or 'B' */
			filter->status = 0x80;
		} else if (c == 0x44) {			/* 'D' */
			filter->status = 0x90;
		} else {
			/* Emit an error marker and the '$' '(' which were already consumed,
			 * then process `c` again from parser state 0 */
			filter->status &= ~0xf;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			CK((*filter->output_function)(0x24, filter->data));
			CK((*filter->output_function)(0x28, filter->data));
			goto retry;
		}
		break;

	/* ESC ( */
/*	case 0x05:	*/
/*	case 0x15:	*/
/*	case 0x25:	*/
/*	case 0x85:	*/
/*	case 0x95:	*/
	case 5:
		if (c == 0x42 || c == 0x48) {		/* 'B' or 'H' */
			filter->status = 0;
		} else if (c == 0x4a) {		/* 'J' */
			filter->status = 0x10;
		} else if (c == 0x49) {		/* 'I' */
			filter->status = 0x20;
		} else {
			/* Emit an error marker and the '(' which was already consumed,
			 * then process `c` again from parser state 0 */
			filter->status &= ~0xf;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			CK((*filter->output_function)(0x28, filter->data));
			goto retry;
		}
		break;

		EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
2519 
/* Flush handler for the CP5022x decoding filter.
 *
 * If the low 4 bits of `filter->status` are non-zero, the input ended in the
 * middle of a double-byte character or an escape sequence, so an error marker is
 * emitted. The state is then reset and the downstream flush function (if any) is
 * invoked.
 *
 * Returns the downstream flush function's result (as the other flush handlers in
 * this file do), or 0 if there is no downstream flush function. */
static int mbfl_filt_conv_cp5022x_wchar_flush(mbfl_convert_filter *filter)
{
	if (filter->status & 0xF) {
		/* 2-byte (JIS X 0208 or 0212) character was truncated, or else
		 * escape sequence was truncated */
		CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
	}
	filter->status = 0;

	if (filter->flush_function) {
		/* Propagate the result rather than silently discarding it */
		return (*filter->flush_function)(filter->data);
	}

	return 0;
}
2535 
/* Halfwidth -> fullwidth (katakana) lookup, used by mb_convert_kana_codepoint.
 * Indexed by (codepoint - 0xFF60) for codepoints U+FF61..U+FF9F, so entry 0 is never
 * looked up. Each entry is added to 0x3000 to form the output codepoint (or to
 * 0x3001 / 0x3002 when a following U+FF9E / U+FF9F is merged into it). */
static const unsigned char hankana2zenkana_table[64] = {
	0x00,0x02,0x0C,0x0D,0x01,0xFB,0xF2,0xA1,0xA3,0xA5,
	0xA7,0xA9,0xE3,0xE5,0xE7,0xC3,0xFC,0xA2,0xA4,0xA6,
	0xA8,0xAA,0xAB,0xAD,0xAF,0xB1,0xB3,0xB5,0xB7,0xB9,
	0xBB,0xBD,0xBF,0xC1,0xC4,0xC6,0xC8,0xCA,0xCB,0xCC,
	0xCD,0xCE,0xCF,0xD2,0xD5,0xD8,0xDB,0xDE,0xDF,0xE0,
	0xE1,0xE2,0xE4,0xE6,0xE8,0xE9,0xEA,0xEB,0xEC,0xED,
	0xEF,0xF3,0x9B,0x9C
};
2545 
/* Halfwidth -> fullwidth (hiragana) lookup, used by mb_convert_kana_codepoint.
 * Indexed by (codepoint - 0xFF60) for codepoints U+FF61..U+FF9F, so entry 0 is never
 * looked up. Each entry is added to 0x3000 to form the output codepoint (or to
 * 0x3001 / 0x3002 when a following U+FF9E / U+FF9F is merged into it). */
static const unsigned char hankana2zenhira_table[64] = {
	0x00,0x02,0x0C,0x0D,0x01,0xFB,0x92,0x41,0x43,0x45,
	0x47,0x49,0x83,0x85,0x87,0x63,0xFC,0x42,0x44,0x46,
	0x48,0x4A,0x4B,0x4D,0x4F,0x51,0x53,0x55,0x57,0x59,
	0x5B,0x5D,0x5F,0x61,0x64,0x66,0x68,0x6A,0x6B,0x6C,
	0x6D,0x6E,0x6F,0x72,0x75,0x78,0x7B,0x7E,0x7F,0x80,
	0x81,0x82,0x84,0x86,0x88,0x89,0x8A,0x8B,0x8C,0x8D,
	0x8F,0x93,0x9B,0x9C
};
2555 
/* Fullwidth kana -> halfwidth kana lookup, used by mb_convert_kana_codepoint.
 * Indexed by (codepoint - 0x30A1) for fullwidth katakana U+30A1..U+30F4, or by
 * (codepoint - 0x3041) for fullwidth hiragana U+3041..U+3093.
 * Each entry is a pair of values which are added to 0xFF00:
 *   [0] gives the output codepoint
 *   [1] if non-zero, gives a second output codepoint (0x9E -> U+FF9E, halfwidth
 *       voiced sound mark; 0x9F -> U+FF9F, halfwidth semi-voiced sound mark) */
static const unsigned char zenkana2hankana_table[84][2] = {
	{0x67,0x00},{0x71,0x00},{0x68,0x00},{0x72,0x00},{0x69,0x00},
	{0x73,0x00},{0x6A,0x00},{0x74,0x00},{0x6B,0x00},{0x75,0x00},
	{0x76,0x00},{0x76,0x9E},{0x77,0x00},{0x77,0x9E},{0x78,0x00},
	{0x78,0x9E},{0x79,0x00},{0x79,0x9E},{0x7A,0x00},{0x7A,0x9E},
	{0x7B,0x00},{0x7B,0x9E},{0x7C,0x00},{0x7C,0x9E},{0x7D,0x00},
	{0x7D,0x9E},{0x7E,0x00},{0x7E,0x9E},{0x7F,0x00},{0x7F,0x9E},
	{0x80,0x00},{0x80,0x9E},{0x81,0x00},{0x81,0x9E},{0x6F,0x00},
	{0x82,0x00},{0x82,0x9E},{0x83,0x00},{0x83,0x9E},{0x84,0x00},
	{0x84,0x9E},{0x85,0x00},{0x86,0x00},{0x87,0x00},{0x88,0x00},
	{0x89,0x00},{0x8A,0x00},{0x8A,0x9E},{0x8A,0x9F},{0x8B,0x00},
	{0x8B,0x9E},{0x8B,0x9F},{0x8C,0x00},{0x8C,0x9E},{0x8C,0x9F},
	{0x8D,0x00},{0x8D,0x9E},{0x8D,0x9F},{0x8E,0x00},{0x8E,0x9E},
	{0x8E,0x9F},{0x8F,0x00},{0x90,0x00},{0x91,0x00},{0x92,0x00},
	{0x93,0x00},{0x6C,0x00},{0x94,0x00},{0x6D,0x00},{0x95,0x00},
	{0x6E,0x00},{0x96,0x00},{0x97,0x00},{0x98,0x00},{0x99,0x00},
	{0x9A,0x00},{0x9B,0x00},{0x9C,0x00},{0x9C,0x00},{0x72,0x00},
	{0x74,0x00},{0x66,0x00},{0x9D,0x00},{0x73,0x9E}
};
2575 
2576 /* Apply various transforms to input codepoint, such as converting halfwidth katakana
2577  * to fullwidth katakana. `mode` is a bitfield which controls which transforms are
2578  * actually performed. The bit values are defined in translit_kana_jisx0201_jisx0208.h.
2579  * `mode` must not call for transforms which are inverses (i.e. which would cancel
2580  * each other out).
2581  *
2582  * In some cases, successive input codepoints may be merged into one output codepoint.
2583  * (That is the purpose of the `next` parameter.) If the `next` codepoint is consumed
2584  * and should be skipped over, `*consumed` will be set to true. Otherwise, `*consumed`
2585  * will not be modified. If there is no following codepoint, `next` should be zero.
2586  *
2587  * Again, in some cases, one input codepoint may convert to two output codepoints.
2588  * If so, the second output codepoint will be stored in `*second`.
2589  *
2590  * Return the resulting codepoint. If none of the requested transforms apply, return
2591  * the input codepoint unchanged.
2592  */
mb_convert_kana_codepoint(uint32_t c,uint32_t next,bool * consumed,uint32_t * second,unsigned int mode)2593 uint32_t mb_convert_kana_codepoint(uint32_t c, uint32_t next, bool *consumed, uint32_t *second, unsigned int mode)
2594 {
2595 	if ((mode & MBFL_HAN2ZEN_ALL) && c >= 0x21 && c <= 0x7D && c != '"' && c != '\'' && c != '\\') {
2596 		return c + 0xFEE0;
2597 	}
2598 	if ((mode & MBFL_HAN2ZEN_ALPHA) && ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))) {
2599 		return c + 0xFEE0;
2600 	}
2601 	if ((mode & MBFL_HAN2ZEN_NUMERIC) && c >= '0' && c <= '9') {
2602 		return c + 0xFEE0;
2603 	}
2604 	if ((mode & MBFL_HAN2ZEN_SPACE) && c == ' ') {
2605 		return 0x3000;
2606 	}
2607 
2608 	if (mode & (MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_HIRAGANA)) {
2609 		/* Convert Hankaku kana to Zenkaku kana
2610 		 * Either all Hankaku kana (including katakana and hiragana) will be converted
2611 		 * to Zenkaku katakana, or to Zenkaku hiragana */
2612 		if ((mode & MBFL_HAN2ZEN_KATAKANA) && (mode & MBFL_HAN2ZEN_GLUE)) {
2613 			if (c >= 0xFF61 && c <= 0xFF9F) {
2614 				int n = c - 0xFF60;
2615 
2616 				if (next >= 0xFF61 && next <= 0xFF9F) {
2617 					if (next == 0xFF9E && ((n >= 22 && n <= 36) || (n >= 42 && n <= 46))) {
2618 						*consumed = true;
2619 						return 0x3001 + hankana2zenkana_table[n];
2620 					}
2621 					if (next == 0xFF9E && n == 19) {
2622 						*consumed = true;
2623 						return 0x30F4;
2624 					}
2625 					if (next == 0xFF9F && n >= 42 && n <= 46) {
2626 						*consumed = true;
2627 						return 0x3002 + hankana2zenkana_table[n];
2628 					}
2629 				}
2630 
2631 				return 0x3000 + hankana2zenkana_table[n];
2632 			}
2633 		}
2634 		if ((mode & MBFL_HAN2ZEN_HIRAGANA) && (mode & MBFL_HAN2ZEN_GLUE)) {
2635 			if (c >= 0xFF61 && c <= 0xFF9F) {
2636 				int n = c - 0xFF60;
2637 
2638 				if (next >= 0xFF61 && next <= 0xFF9F) {
2639 					if (next == 0xFF9E && ((n >= 22 && n <= 36) || (n >= 42 && n <= 46))) {
2640 						*consumed = true;
2641 						return 0x3001 + hankana2zenhira_table[n];
2642 					}
2643 					if (next == 0xFF9F && n >= 42 && n <= 46) {
2644 						*consumed = true;
2645 						return 0x3002 + hankana2zenhira_table[n];
2646 					}
2647 				}
2648 
2649 				return 0x3000 + hankana2zenhira_table[n];
2650 			}
2651 		}
2652 		if ((mode & MBFL_HAN2ZEN_KATAKANA) && c >= 0xFF61 && c <= 0xFF9F) {
2653 			return 0x3000 + hankana2zenkana_table[c - 0xFF60];
2654 		}
2655 		if ((mode & MBFL_HAN2ZEN_HIRAGANA) && c >= 0xFF61 && c <= 0xFF9F) {
2656 			return 0x3000 + hankana2zenhira_table[c - 0xFF60];
2657 		}
2658 	}
2659 
2660 	if (mode & MBFL_HAN2ZEN_SPECIAL) { /* special ascii to symbol */
2661 		if (c == '\\' || c == 0xA5) { /* YEN SIGN */
2662 			return 0xFFE5; /* FULLWIDTH YEN SIGN */
2663 		}
2664 		if (c == 0x7E || c == 0x203E) {
2665 			return 0xFFE3; /* FULLWIDTH MACRON */
2666 		}
2667 		if (c == '\'') {
2668 			return 0x2019; /* RIGHT SINGLE QUOTATION MARK */
2669 		}
2670 		if (c == '"') {
2671 			return 0x201D; /* RIGHT DOUBLE QUOTATION MARK */
2672 		}
2673 	}
2674 
2675 	if (mode & (MBFL_ZEN2HAN_ALL | MBFL_ZEN2HAN_ALPHA | MBFL_ZEN2HAN_NUMERIC | MBFL_ZEN2HAN_SPACE)) {
2676 		/* Zenkaku to Hankaku */
2677 		if ((mode & MBFL_ZEN2HAN_ALL) && c >= 0xFF01 && c <= 0xFF5D && c != 0xFF02 && c != 0xFF07 && c != 0xFF3C) {
2678 			/* all except " ' \ ~ */
2679 			return c - 0xFEE0;
2680 		}
2681 		if ((mode & MBFL_ZEN2HAN_ALPHA) && ((c >= 0xFF21 && c <= 0xFF3A) || (c >= 0xFF41 && c <= 0xFF5A))) {
2682 			return c - 0xFEE0;
2683 		}
2684 		if ((mode & MBFL_ZEN2HAN_NUMERIC) && (c >= 0xFF10 && c <= 0xFF19)) {
2685 			return c - 0xFEE0;
2686 		}
2687 		if ((mode & MBFL_ZEN2HAN_SPACE) && (c == 0x3000)) {
2688 			return ' ';
2689 		}
2690 		if ((mode & MBFL_ZEN2HAN_ALL) && (c == 0x2212)) { /* MINUS SIGN */
2691 			return '-';
2692 		}
2693 	}
2694 
2695 	if (mode & (MBFL_ZEN2HAN_KATAKANA | MBFL_ZEN2HAN_HIRAGANA)) {
2696 		/* Zenkaku kana to hankaku kana */
2697 		if ((mode & MBFL_ZEN2HAN_KATAKANA) && c >= 0x30A1 && c <= 0x30F4) {
2698 			/* Zenkaku katakana to hankaku kana */
2699 			int n = c - 0x30A1;
2700 			if (zenkana2hankana_table[n][1]) {
2701 				*second = 0xFF00 + zenkana2hankana_table[n][1];
2702 			}
2703 			return 0xFF00 + zenkana2hankana_table[n][0];
2704 		}
2705 		if ((mode & MBFL_ZEN2HAN_HIRAGANA) && c >= 0x3041 && c <= 0x3093) {
2706 			/* Zenkaku hiragana to hankaku kana */
2707 			int n = c - 0x3041;
2708 			if (zenkana2hankana_table[n][1]) {
2709 				*second = 0xFF00 + zenkana2hankana_table[n][1];
2710 			}
2711 			return 0xFF00 + zenkana2hankana_table[n][0];
2712 		}
2713 		if (c == 0x3001) {
2714 			return 0xFF64; /* HALFWIDTH IDEOGRAPHIC COMMA */
2715 		}
2716 		if (c == 0x3002) {
2717 			return 0xFF61; /* HALFWIDTH IDEOGRAPHIC FULL STOP */
2718 		}
2719 		if (c == 0x300C) {
2720 			return 0xFF62; /* HALFWIDTH LEFT CORNER BRACKET */
2721 		}
2722 		if (c == 0x300D) {
2723 			return 0xFF63; /* HALFWIDTH RIGHT CORNER BRACKET */
2724 		}
2725 		if (c == 0x309B) {
2726 			return 0xFF9E; /* HALFWIDTH KATAKANA VOICED SOUND MARK */
2727 		}
2728 		if (c == 0x309C) {
2729 			return 0xff9f; /* HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK */
2730 		}
2731 		if (c == 0x30FC) {
2732 			return 0xFF70; /* HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK */
2733 		}
2734 		if (c == 0x30FB) {
2735 			return 0xFF65; /* HALFWIDTH KATAKANA MIDDLE DOT */
2736 		}
2737 	}
2738 
2739 	if (mode & (MBFL_ZENKAKU_HIRA2KATA | MBFL_ZENKAKU_KATA2HIRA)) {
2740 		if ((mode & MBFL_ZENKAKU_HIRA2KATA) && ((c >= 0x3041 && c <= 0x3093) || c == 0x309D || c == 0x309E)) {
2741 			/* Zenkaku hiragana to Zenkaku katakana */
2742 			return c + 0x60;
2743 		}
2744 		if ((mode & MBFL_ZENKAKU_KATA2HIRA) && ((c >= 0x30A1 && c <= 0x30F3) || c == 0x30FD || c == 0x30FE)) {
2745 			/* Zenkaku katakana to Zenkaku hiragana */
2746 			return c - 0x60;
2747 		}
2748 	}
2749 
2750 	if (mode & MBFL_ZEN2HAN_SPECIAL) { /* special symbol to ascii */
2751 		if (c == 0xFFE5 || c == 0xFF3C) { /* FULLWIDTH YEN SIGN/FULLWIDTH REVERSE SOLIDUS */
2752 			return '\\';
2753 		}
2754 		if (c == 0xFFE3 || c == 0x203E) { /* FULLWIDTH MACRON/OVERLINE */
2755 			return '~';
2756 		}
2757 		if (c == 0x2018 || c == 0x2019) { /* LEFT/RIGHT SINGLE QUOTATION MARK*/
2758 			return '\'';
2759 		}
2760 		if (c == 0x201C || c == 0x201D) { /* LEFT/RIGHT DOUBLE QUOTATION MARK */
2761 			return '"';
2762 		}
2763 	}
2764 
2765 	return c;
2766 }
2767 
2768 static int mbfl_filt_conv_wchar_cp50221(int c, mbfl_convert_filter *filter);
2769 
/* Unicode -> CP50220 encoding filter.
 *
 * Each incoming codepoint is held in `filter->cache` until the following one
 * arrives, so that mb_convert_kana_codepoint can merge the two if appropriate.
 * The converted codepoint is then passed to mbfl_filt_conv_wchar_cp50221 for the
 * actual encoding. A zero value in `filter->cache` means nothing is being held;
 * for that reason, an input codepoint of zero is written straight to the output.
 *
 * Always returns 0. */
static int mbfl_filt_conv_wchar_cp50220(int c, mbfl_convert_filter *filter)
{
	int pending = filter->cache;

	if (!pending) {
		if (c != 0) {
			filter->cache = c;
		} else {
			/* Zero cannot be stored in the cache, since `filter->cache == 0`
			 * means no codepoint is cached */
			(*filter->output_function)(0, filter->data);
		}
		return 0;
	}

	bool merged = false;
	int converted = mb_convert_kana_codepoint(pending, c, &merged, NULL, MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE);
	filter->cache = merged ? 0 : c;

	/* Terrible hack to get CP50220 to emit error markers in the proper
	 * position, not reordering them with subsequent characters */
	filter->filter_function = mbfl_filt_conv_wchar_cp50221;
	mbfl_filt_conv_wchar_cp50221(converted, filter);
	filter->filter_function = mbfl_filt_conv_wchar_cp50220;

	if (c == 0 && !merged) {
		(*filter->output_function)(0, filter->data);
	}

	return 0;
}
2796 
/* Flush handler which writes ESC ( B if any of the bits in
 * `filter->status & 0xff00` are set, then resets `filter->status` to zero.
 *
 * Returns the result of the downstream flush function, or 0 if there is none. */
static int mbfl_filt_conv_any_jis_flush(mbfl_convert_filter *filter)
{
	if (filter->status & 0xff00) {
		/* back to latin */
		CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
		CK((*filter->output_function)('(', filter->data));
		CK((*filter->output_function)('B', filter->data));
	}
	filter->status = 0;

	if (filter->flush_function == NULL) {
		return 0;
	}
	return (*filter->flush_function)(filter->data);
}
2813 
/* Flush handler for the Unicode -> CP50220 filter.
 *
 * If a codepoint is still being held in `filter->cache`, it is converted with no
 * following codepoint (`next` is zero) and passed to mbfl_filt_conv_wchar_cp50221.
 * The rest of the work is done by mbfl_filt_conv_any_jis_flush, whose result is
 * returned. */
static int mbfl_filt_conv_wchar_cp50220_flush(mbfl_convert_filter *filter)
{
	int pending = filter->cache;

	if (pending) {
		int converted = mb_convert_kana_codepoint(pending, 0, NULL, NULL, MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE);

		/* Temporarily swap the filter function while the codepoint is encoded */
		filter->filter_function = mbfl_filt_conv_wchar_cp50221;
		mbfl_filt_conv_wchar_cp50221(converted, filter);
		filter->filter_function = mbfl_filt_conv_wchar_cp50220;

		filter->cache = 0;
	}

	return mbfl_filt_conv_any_jis_flush(filter);
}
2828 
/* Unicode -> CP50221 filter function
 *
 * Looks up a code for codepoint `c`, then emits it, preceded by an escape
 * sequence if the character set recorded in `filter->status` has to change:
 *   0x000 = ASCII          (ESC ( B)
 *   0x200 = JIS X 0208     (ESC $ B)
 *   0x400 = JIS X 0201 latin (ESC ( J)
 *   0x500 = JIS X 0201 kana  (ESC ( I)
 * Codepoints with no usable code go to `mbfl_filt_conv_illegal_output`.
 *
 * Returns 0, or -1 (via CK) if an output call fails */
static int mbfl_filt_conv_wchar_cp50221(int c, mbfl_convert_filter *filter)
{
	int jis = 0;

	if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
		jis = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
	} else if (c == 0x203E) {
		/* OVERLINE; convert to JISX 0201 OVERLINE */
		jis = 0x1007E;
	} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
		jis = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
	} else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
		jis = ucs_i_jis_table[c - ucs_i_jis_table_min];
	} else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
		jis = ucs_r_jis_table[c - ucs_r_jis_table_min];
	} else if (c >= 0xE000 && c <= 0xE757) {
		/* 'private'/'user' codepoints */
		int offset = c - 0xE000;
		jis = (((offset / 94) + 0x7F) << 8) | ((offset % 94) + 0x21);
	}

	if (jis <= 0) {
		/* A few codepoints which the tables did not yield a code for */
		switch (c) {
		case 0xA5:   jis = 0x1005C; break; /* YEN SIGN */
		case 0xFF3C: jis = 0x2140;  break; /* FULLWIDTH REVERSE SOLIDUS */
		case 0x2225: jis = 0x2142;  break; /* PARALLEL TO */
		case 0xFF0D: jis = 0x215D;  break; /* FULLWIDTH HYPHEN-MINUS */
		case 0xFFE0: jis = 0x2171;  break; /* FULLWIDTH CENT SIGN */
		case 0xFFE1: jis = 0x2172;  break; /* FULLWIDTH POUND SIGN */
		case 0xFFE2: jis = 0x224C;  break; /* FULLWIDTH NOT SIGN */
		default: break;
		}
	}

	/* The lookups above search `ucs_*_jis_table` for a kuten code
	 * Zero means the codepoint is not in JIS X 0208
	 * A result with the high bit set in both the upper and the lower byte
	 * is not a JIS X 0208 code either, but a JIS X 0213 one
	 * In both situations, see whether the codepoint is one of the extensions
	 * which MicroSoft added to JIS X 0208 (to make CP932) */
	if (jis == 0 || ((jis & 0x8000) && (jis & 0x80))) {
		const int ext1_row = cp932ext1_ucs_table_min / 94;
		const int ext2_row = cp932ext2_ucs_table_min / 94;
		const int ext2_len = cp932ext2_ucs_table_max - cp932ext2_ucs_table_min;
		int k;

		jis = -1;

		for (k = 0; k < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; k++) {
			if (cp932ext1_ucs_table[k] == c) {
				jis = ((k / 94 + ext1_row + 0x21) << 8) + (k % 94 + 0x21);
				break;
			}
		}

		/* Only consult the second extension table if the first had no match */
		for (k = 0; jis < 0 && k < ext2_len; k++) {
			if (cp932ext2_ucs_table[k] == c) {
				jis = ((k / 94 + ext2_row + 0x21) << 8) + (k % 94 + 0x21);
			}
		}

		if (c == 0) {
			/* NUL is always encodable, as itself */
			jis = 0;
		} else if (jis <= 0) {
			jis = -1;
		}
	}

	if (jis < 0) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
		return 0;
	}

	if (jis < 0x80) {
		/* ASCII */
		if ((filter->status & 0xff00) != 0) {
			CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
			CK((*filter->output_function)('(', filter->data));
			CK((*filter->output_function)('B', filter->data));
			filter->status = 0;
		}
		CK((*filter->output_function)(jis, filter->data));
	} else if (jis >= 0xa0 && jis < 0xe0) {
		/* X 0201 kana */
		if ((filter->status & 0xff00) != 0x500) {
			CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
			CK((*filter->output_function)('(', filter->data));
			CK((*filter->output_function)('I', filter->data));
			filter->status = 0x500;
		}
		CK((*filter->output_function)(jis - 0x80, filter->data));
	} else if (jis <= 0x927E) {
		/* X 0208 + extensions */
		if ((filter->status & 0xff00) != 0x200) {
			CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
			CK((*filter->output_function)('$', filter->data));
			CK((*filter->output_function)('B', filter->data));
			filter->status = 0x200;
		}
		CK((*filter->output_function)((jis >> 8) & 0xff, filter->data));
		CK((*filter->output_function)(jis & 0xff, filter->data));
	} else if (jis < 0x10000) {
		/* X0212; not representable here */
		CK(mbfl_filt_conv_illegal_output(c, filter));
	} else {
		/* X 0201 latin (codes tagged with 0x10000 above) */
		if ((filter->status & 0xff00) != 0x400) {
			CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
			CK((*filter->output_function)('(', filter->data));
			CK((*filter->output_function)('J', filter->data));
		}
		filter->status = 0x400;
		CK((*filter->output_function)(jis & 0x7f, filter->data));
	}

	return 0;
}
2952 
/* Unicode -> CP50222 filter function
 *
 * Same code lookup as the CP50221 filter, but JIS X 0201 kana is selected
 * with the single byte 0x0E (SO, shift out) and deselected with 0x0F (SI,
 * shift in), rather than with an escape sequence.
 * `filter->status` records the selected character set:
 *   0x000 = ASCII, 0x200 = JIS X 0208, 0x400 = JIS X 0201 latin,
 *   0x500 = JIS X 0201 kana (shifted out)
 *
 * Returns 0, or -1 (via CK) if an output call fails */
static int mbfl_filt_conv_wchar_cp50222(int c, mbfl_convert_filter *filter)
{
	int jis = 0;

	if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
		jis = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
	} else if (c == 0x203E) {
		/* OVERLINE; convert to JISX 0201 OVERLINE */
		jis = 0x1007E;
	} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
		jis = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
	} else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
		jis = ucs_i_jis_table[c - ucs_i_jis_table_min];
	} else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
		jis = ucs_r_jis_table[c - ucs_r_jis_table_min];
	} else if (c >= 0xE000 && c <= 0xE757) {
		/* 'private'/'user' codepoints */
		int offset = c - 0xE000;
		jis = (((offset / 94) + 0x7F) << 8) | ((offset % 94) + 0x21);
	}

	if (jis <= 0) {
		/* A few codepoints which the tables did not yield a code for */
		switch (c) {
		case 0xA5:   jis = 0x1005C; break; /* YEN SIGN */
		case 0xFF3C: jis = 0x2140;  break; /* FULLWIDTH REVERSE SOLIDUS */
		case 0x2225: jis = 0x2142;  break; /* PARALLEL TO */
		case 0xFF0D: jis = 0x215D;  break; /* FULLWIDTH HYPHEN-MINUS */
		case 0xFFE0: jis = 0x2171;  break; /* FULLWIDTH CENT SIGN */
		case 0xFFE1: jis = 0x2172;  break; /* FULLWIDTH POUND SIGN */
		case 0xFFE2: jis = 0x224C;  break; /* FULLWIDTH NOT SIGN */
		default: break;
		}
	}

	/* No code found, or a code with the high bit set in both bytes:
	 * try the two CP932 extension tables instead */
	if (jis == 0 || ((jis & 0x8000) && (jis & 0x80))) {
		const int ext1_row = cp932ext1_ucs_table_min / 94;
		const int ext2_row = cp932ext2_ucs_table_min / 94;
		const int ext2_len = cp932ext2_ucs_table_max - cp932ext2_ucs_table_min;
		int k;

		jis = -1;

		for (k = 0; k < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; k++) {
			if (cp932ext1_ucs_table[k] == c) {
				jis = ((k / 94 + ext1_row + 0x21) << 8) + (k % 94 + 0x21);
				break;
			}
		}

		/* Only consult the second extension table if the first had no match */
		for (k = 0; jis <= 0 && k < ext2_len; k++) {
			if (cp932ext2_ucs_table[k] == c) {
				jis = ((k / 94 + ext2_row + 0x21) << 8) + (k % 94 + 0x21);
			}
		}

		if (c == 0) {
			/* NUL is always encodable, as itself */
			jis = 0;
		} else if (jis <= 0) {
			jis = -1;
		}
	}

	if (jis < 0) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
		return 0;
	}

	if (jis < 0x80) {
		/* ASCII */
		switch (filter->status & 0xff00) {
		case 0:
			break;
		case 0x500:
			CK((*filter->output_function)(0x0f, filter->data)); /* SI */
			filter->status = 0;
			break;
		default:
			CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
			CK((*filter->output_function)('(', filter->data));
			CK((*filter->output_function)('B', filter->data));
			filter->status = 0;
			break;
		}
		CK((*filter->output_function)(jis, filter->data));
	} else if (jis >= 0xa0 && jis < 0xe0) {
		/* X 0201 kana */
		if ((filter->status & 0xff00) != 0x500) {
			CK((*filter->output_function)(0x0e, filter->data)); /* SO */
			filter->status = 0x500;
		}
		CK((*filter->output_function)(jis - 0x80, filter->data));
	} else if (jis <= 0x927E) {
		/* X 0208 */
		if ((filter->status & 0xff00) == 0x500) {
			/* Leave kana mode first */
			CK((*filter->output_function)(0x0f, filter->data)); /* SI */
			filter->status = 0;
		}
		if ((filter->status & 0xff00) != 0x200) {
			CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
			CK((*filter->output_function)('$', filter->data));
			CK((*filter->output_function)('B', filter->data));
			filter->status = 0x200;
		}
		CK((*filter->output_function)((jis >> 8) & 0xff, filter->data));
		CK((*filter->output_function)(jis & 0xff, filter->data));
	} else if (jis < 0x10000) {
		/* X0212; not representable here */
		CK(mbfl_filt_conv_illegal_output(c, filter));
	} else {
		/* X 0201 latin (codes tagged with 0x10000 above) */
		if ((filter->status & 0xff00) == 0x500) {
			/* Leave kana mode first */
			CK((*filter->output_function)(0x0f, filter->data)); /* SI */
			filter->status = 0;
		}
		if ((filter->status & 0xff00) != 0x400) {
			CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
			CK((*filter->output_function)('(', filter->data));
			CK((*filter->output_function)('J', filter->data));
		}
		filter->status = 0x400;
		CK((*filter->output_function)(jis & 0x7f, filter->data));
	}

	return 0;
}
3075 
/* Flush function for the Unicode -> CP50222 filter
 *
 * Returns the output to ASCII: a single 0x0F (SI) byte if JIS X 0201 kana is
 * shifted out, `ESC ( B` if any other non-ASCII character set is selected.
 * Then resets the status and calls the next flush function, if any (its
 * return value is not propagated).
 *
 * Returns 0, or -1 (via CK) if an output call fails */
static int mbfl_filt_conv_wchar_cp50222_flush(mbfl_convert_filter *filter)
{
	/* back to latin */
	switch (filter->status & 0xff00) {
	case 0:
		break;
	case 0x500:
		CK((*filter->output_function)(0x0f, filter->data)); /* SI */
		break;
	default:
		CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
		CK((*filter->output_function)('(', filter->data));
		CK((*filter->output_function)('B', filter->data));
		break;
	}
	filter->status = 0;

	if (filter->flush_function != NULL) {
		(*filter->flush_function)(filter->data);
	}

	return 0;
}
3094 
/* Character set selectors; `mb_cp5022x_to_wchar` keeps one of these in
 * `*state`, and the `mb_wchar_to_cp5022x` encoders below compare and assign
 * them to `buf->state` */
#define ASCII 0
#define JISX_0201_LATIN 1
#define JISX_0201_KANA 2
#define JISX_0208 3
#define JISX_0212 4

/* Decode CP5022x bytes to Unicode codepoints
 *
 * Reads bytes starting at `*in` (`*in_len` of them) and writes at most
 * `bufsize` codepoints to `buf`. `*state` holds the currently selected
 * character set (one of the constants above) and is updated whenever an
 * escape sequence or shift byte selects a different one. Undecodable input
 * is reported in-band as MBFL_BAD_INPUT.
 *
 * On return, `*in` and `*in_len` describe the input which has not been
 * consumed yet. Returns the number of codepoints written to `buf`. */
static size_t mb_cp5022x_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
{
	/* A single malformed escape sequence can produce three output codepoints
	 * at once (see the `ESC $ (` case below) */
	ZEND_ASSERT(bufsize >= 3);

	unsigned char *p = *in, *e = p + *in_len;
	uint32_t *out = buf, *limit = buf + bufsize;

	while (p < e && out < limit) {
		unsigned char c = *p++;

		if (c == 0x1B) {
			/* Escape sequence */
			if ((e - p) < 2) {
				/* Fewer than two bytes follow the ESC; it cannot be complete */
				*out++ = MBFL_BAD_INPUT;
				/* Duplicate error-handling behavior of legacy code */
				if (p < e && (*p == '(' || *p == '$'))
					p++;
				continue;
			}
			unsigned char c2 = *p++;
			if (c2 == '$') {
				unsigned char c3 = *p++;
				if (c3 == '@' || c3 == 'B') {
					/* ESC $ @  or  ESC $ B */
					*state = JISX_0208;
				} else if (c3 == '(') {
					/* ESC $ ( needs one more byte */
					if (p == e) {
						*out++ = MBFL_BAD_INPUT;
						break;
					}
					unsigned char c4 = *p++;
					if (c4 == '@' || c4 == 'B') {
						*state = JISX_0208;
					} else if (c4 == 'D') {
						*state = JISX_0212;
					} else {
						if ((limit - out) < 3) {
							/* Not enough room for all three codepoints; move
							 * back to the ESC byte (4 bytes were read) and stop */
							p -= 4;
							break;
						}
						*out++ = MBFL_BAD_INPUT;
						*out++ = '$';
						*out++ = '(';
						/* Un-read the 4th byte so that the next iteration
						 * handles it as ordinary input */
						p--;
					}
				} else {
					if ((limit - out) < 2) {
						/* Not enough room; move back to the ESC byte
						 * (3 bytes were read) and stop */
						p -= 3;
						break;
					}
					*out++ = MBFL_BAD_INPUT;
					*out++ = '$';
					/* Un-read the 3rd byte */
					p--;
				}
			} else if (c2 == '(') {
				unsigned char c3 = *p++;
				if (c3 == 'B' || c3 == 'H') {
					*state = ASCII;
				} else if (c3 == 'J') {
					*state = JISX_0201_LATIN;
				} else if (c3 == 'I') {
					*state = JISX_0201_KANA;
				} else {
					if ((limit - out) < 2) {
						/* Not enough room; move back to the ESC byte
						 * (3 bytes were read) and stop */
						p -= 3;
						break;
					}
					*out++ = MBFL_BAD_INPUT;
					*out++ = '(';
					/* Un-read the 3rd byte */
					p--;
				}
			} else {
				/* ESC followed by something other than '$' or '('; report the
				 * ESC as bad and un-read the following byte */
				*out++ = MBFL_BAD_INPUT;
				p--;
			}
		} else if (c == 0xE) {
			/* Byte 0x0E switches to JIS X 0201 kana */
			*state = JISX_0201_KANA;
		} else if (c == 0xF) {
			/* Byte 0x0F switches back to ASCII */
			*state = ASCII;
		} else if (*state == JISX_0201_LATIN && c == 0x5C) { /* YEN SIGN */
			*out++ = 0xA5;
		} else if (*state == JISX_0201_LATIN && c == 0x7E) { /* OVER LINE */
			*out++ = 0x203E;
		} else if (*state == JISX_0201_KANA && c > 0x20 && c < 0x60) {
			/* 7-bit kana byte; maps to 0xFF61..0xFF9F */
			*out++ = 0xFF40 + c;
		} else if (*state >= JISX_0208 && c > 0x20 && c <= 0x97) {
			/* First byte of a two-byte character (JISX_0208 or JISX_0212) */
			if (p == e) {
				/* Input ends in the middle of a two-byte character */
				*out++ = MBFL_BAD_INPUT;
				break;
			}
			unsigned char c2 = *p++;
			if (c2 > 0x20 && c2 < 0x7F) {
				/* Linear index into the 94-column tables */
				unsigned int s = (c - 0x21)*94 + c2 - 0x21;
				uint32_t w = 0;
				if (*state == JISX_0208) {
					if (s >= cp932ext1_ucs_table_min && s < cp932ext1_ucs_table_max) {
						w = cp932ext1_ucs_table[s - cp932ext1_ucs_table_min];
					} else if (s < jisx0208_ucs_table_size) {
						w = jisx0208_ucs_table[s];
					} else if (s >= cp932ext2_ucs_table_min && s < cp932ext2_ucs_table_max) {
						w = cp932ext2_ucs_table[s - cp932ext2_ucs_table_min];
					} else if (s >= cp932ext3_ucs_table_min && s < cp932ext3_ucs_table_max) {
						w = cp932ext3_ucs_table[s - cp932ext3_ucs_table_min];
					} else if (s >= 94*94 && s < 114*94) {
						/* MicroSoft extension */
						w = s - 94*94 + 0xE000;
					}
					/* A zero table entry means 'no mapping' */
					if (!w)
						w = MBFL_BAD_INPUT;
				} else {
					if (s < jisx0212_ucs_table_size) {
						w = jisx0212_ucs_table[s];
					}
					if (!w)
						w = MBFL_BAD_INPUT;
				}
				*out++ = w;
			} else {
				/* Invalid second byte */
				*out++ = MBFL_BAD_INPUT;
			}
		} else if (c < 0x80) {
			/* Any other 7-bit byte is passed through unchanged */
			*out++ = c;
		} else if (c >= 0xA1 && c <= 0xDF) {
			/* 8-bit kana byte; maps to 0xFF61..0xFF9F */
			*out++ = 0xFEC0 + c;
		} else {
			*out++ = MBFL_BAD_INPUT;
		}
	}

	/* Tell the caller how much input is left */
	*in_len = e - p;
	*in = p;
	return out - buf;
}
3233 
lookup_wchar(uint32_t w)3234 static unsigned int lookup_wchar(uint32_t w)
3235 {
3236 	unsigned int s = 0;
3237 
3238 	if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) {
3239 		s = ucs_a1_jis_table[w - ucs_a1_jis_table_min];
3240 	} else if (w == 0x203E) { /* OVERLINE */
3241 		s = 0x1007E; /* Convert to JISX 0201 OVERLINE */
3242 	} else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) {
3243 		s = ucs_a2_jis_table[w - ucs_a2_jis_table_min];
3244 	} else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) {
3245 		s = ucs_i_jis_table[w - ucs_i_jis_table_min];
3246 	} else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) {
3247 		s = ucs_r_jis_table[w - ucs_r_jis_table_min];
3248 	} else if (w >= 0xE000 && w <= 0xE757) {
3249 		/* Private Use Area codepoints */
3250 		s = w - 0xE000;
3251 		s = ((s / 94) + 0x7F) << 8 | ((s % 94) + 0x21);
3252 	}
3253 
3254 	if (!s) {
3255 		if (w == 0xA5) { /* YEN SIGN */
3256 			s = 0x1005C;
3257 		} else if (w == 0xFF3C) { /* FULLWIDTH REVERSE SOLIDUS */
3258 			s = 0x2140;
3259 		} else if (w == 0x2225) { /* PARALLEL TO */
3260 			s = 0x2142;
3261 		} else if (w == 0xFF0D) { /* FULLWIDTH HYPHEN-MINUS */
3262 			s = 0x215D;
3263 		} else if (w == 0xFFE0) { /* FULLWIDTH CENT SIGN */
3264 			s = 0x2171;
3265 		} else if (w == 0xFFE1) { /* FULLWIDTH POUND SIGN */
3266 			s = 0x2172;
3267 		} else if (w == 0xFFE2) { /* FULLWIDTH NOT SIGN */
3268 			s = 0x224C;
3269 		} else if (w == 0) {
3270 			return 0;
3271 		}
3272 	}
3273 
3274 	/* Above, we do a series of lookups in `ucs_*_jis_table` to find a
3275 	 * corresponding kuten code for this Unicode codepoint
3276 	 * If we get zero, that means the codepoint is not in JIS X 0208
3277 	 * On the other hand, if we get a result with the high bits set on both
3278 	 * upper and lower bytes, that is not a code in JIS X 0208 but rather
3279 	 * in JIS X 0213
3280 	 * In either case, check if this codepoint is one of the extensions added
3281 	 * to JIS X 0208 by MicroSoft (to make CP932) */
3282 	if (!s || s >= 0x8080) {
3283 		for (int i = 0; i < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; i++) {
3284 			if (w == cp932ext1_ucs_table[i]) {
3285 				return (((i / 94) + (cp932ext1_ucs_table_min / 94) + 0x21) << 8) + (i % 94) + 0x21;
3286 			}
3287 		}
3288 
3289 		for (int i = 0; i < cp932ext2_ucs_table_max - cp932ext2_ucs_table_min; i++) {
3290 			if (w == cp932ext2_ucs_table[i]) {
3291 				return (((i / 94) + (cp932ext2_ucs_table_min / 94) + 0x21) << 8) + (i % 94) + 0x21;
3292 			}
3293 		}
3294 	}
3295 
3296 	return s;
3297 }
3298 
/* Forward declaration; mb_wchar_to_cp50220 passes this function to
 * MB_CONVERT_ERROR */
static void mb_wchar_to_cp50221(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);

/* Encode `len` Unicode codepoints from `in` as CP50220, appending to `buf`
 *
 * Every codepoint first goes through `mb_convert_kana_codepoint` (which may
 * merge it with the codepoint after it), and the result is then encoded just
 * as mb_wchar_to_cp50221 does.
 *
 * `buf->state` is used as follows: the low 8 bits hold the selected character
 * set (ASCII, JISX_0201_LATIN, JISX_0201_KANA or JISX_0208), and bits 8-23
 * hold a codepoint which was left over at the end of the previous call (zero
 * if there is none).
 *
 * If `end` is true, this is the last call for the string, and the output is
 * returned to ASCII at the end. */
static void mb_wchar_to_cp50220(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	uint32_t w;

	if (buf->state & 0xFFFF00) {
		/* Reprocess cached codepoint */
		w = buf->state >> 8;
		buf->state &= 0xFF;
		/* Jump into the loop body without consuming anything from `in` */
		goto reprocess_codepoint;
	}

	while (len--) {
		w = *in++;
reprocess_codepoint:

		/* 0xFF61..0xFF9F is the range which the decoders in this file produce
		 * for JIS X 0201 kana bytes */
		if (w >= 0xFF61 && w <= 0xFF9F && !len && !end) {
			/* This codepoint may need to combine with the next one,
			 * but the 'next one' will come in a separate buffer */
			buf->state |= w << 8;
			break;
		}

		bool consumed = false;
		/* Pass zero as the 'next' codepoint if this is the last one we have */
		w = mb_convert_kana_codepoint(w, len ? *in : 0, &consumed, NULL, MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE);
		if (consumed) {
			/* Two successive codepoints were converted into one */
			in++; len--; consumed = false;
		}

		unsigned int s = lookup_wchar(w);

		if (!s && w) {
			/* No mapping for this codepoint
			 * NOTE(review): mb_wchar_to_cp50221 is passed here rather than
			 * this function; presumably so that the error output is encoded
			 * without kana conversion - confirm against MB_CONVERT_ERROR */
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_cp50221);
		} else if (s < 0x80) {
			/* ASCII */
			/* Worst case here: 3-byte escape sequence + 1 byte */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
			if (buf->state != ASCII) {
				out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
				buf->state = ASCII;
			}
			out = mb_convert_buf_add(out, s);
		} else if (s >= 0xA0 && s < 0xE0) {
			/* JISX 0201 Kana */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
			if (buf->state != JISX_0201_KANA) {
				out = mb_convert_buf_add3(out, 0x1B, '(', 'I');
				buf->state = JISX_0201_KANA;
			}
			/* Emitted as a 7-bit byte */
			out = mb_convert_buf_add(out, s - 0x80);
		} else if (s <= 0x927E) {
			/* JISX 0208 Kanji */
			/* Worst case here: 3-byte escape sequence + 2 bytes */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 5);
			if (buf->state != JISX_0208) {
				out = mb_convert_buf_add3(out, 0x1B, '$', 'B');
				buf->state = JISX_0208;
			}
			out = mb_convert_buf_add2(out, (s >> 8) & 0xFF, s & 0xFF);
		} else if (s >= 0x10000) {
			/* JISX 0201 Latin; we 'tag' these by adding 0x10000 */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
			if (buf->state != JISX_0201_LATIN) {
				out = mb_convert_buf_add3(out, 0x1B, '(', 'J');
				buf->state = JISX_0201_LATIN;
			}
			out = mb_convert_buf_add(out, s & 0x7F);
		} else {
			/* Codes above 0x927E but below 0x10000 cannot be emitted */
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_cp50221);
		}
	}

	if (end && buf->state != ASCII) {
		/* Finish in ASCII */
		MB_CONVERT_BUF_ENSURE(buf, out, limit, 3);
		out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
3382 
/* Encode `len` Unicode codepoints from `in` as CP50221, appending to `buf`
 *
 * `buf->state` tracks the selected character set (ASCII, JISX_0201_LATIN,
 * JISX_0201_KANA or JISX_0208); a 3-byte escape sequence is emitted whenever
 * it has to change. If `end` is true, the output is returned to ASCII after
 * the last codepoint. */
static void mb_wchar_to_cp50221(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	while (len--) {
		uint32_t cp = *in++;
		unsigned int jis = lookup_wchar(cp);

		if (jis == 0 && cp != 0) {
			/* No mapping for this codepoint */
			MB_CONVERT_ERROR(buf, out, limit, cp, mb_wchar_to_cp50221);
			continue;
		}

		if (jis < 0x80) {
			/* ASCII: at most an escape sequence plus one byte */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
			if (buf->state != ASCII) {
				out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
				buf->state = ASCII;
			}
			out = mb_convert_buf_add(out, jis);
			continue;
		}

		if (jis >= 0xA0 && jis < 0xE0) {
			/* JISX 0201 Kana, emitted as a 7-bit byte */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
			if (buf->state != JISX_0201_KANA) {
				out = mb_convert_buf_add3(out, 0x1B, '(', 'I');
				buf->state = JISX_0201_KANA;
			}
			out = mb_convert_buf_add(out, jis - 0x80);
			continue;
		}

		if (jis <= 0x927E) {
			/* JISX 0208 Kanji: at most an escape sequence plus two bytes */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 5);
			if (buf->state != JISX_0208) {
				out = mb_convert_buf_add3(out, 0x1B, '$', 'B');
				buf->state = JISX_0208;
			}
			out = mb_convert_buf_add2(out, (jis >> 8) & 0xFF, jis & 0xFF);
			continue;
		}

		if (jis >= 0x10000) {
			/* JISX 0201 Latin; we 'tag' these by adding 0x10000 */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
			if (buf->state != JISX_0201_LATIN) {
				out = mb_convert_buf_add3(out, 0x1B, '(', 'J');
				buf->state = JISX_0201_LATIN;
			}
			out = mb_convert_buf_add(out, jis & 0x7F);
			continue;
		}

		/* Codes above 0x927E but below 0x10000 cannot be emitted */
		MB_CONVERT_ERROR(buf, out, limit, cp, mb_wchar_to_cp50221);
	}

	if (end && buf->state != ASCII) {
		/* Finish in ASCII */
		MB_CONVERT_BUF_ENSURE(buf, out, limit, 3);
		out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
3439 
/* Encode `len` Unicode codepoints from `in` as CP50222, appending to `buf`
 *
 * Like mb_wchar_to_cp50221, except that JIS X 0201 kana is entered with the
 * single byte 0x0E and left with the single byte 0x0F, rather than with an
 * escape sequence. `buf->state` tracks the selected character set.
 * If `end` is true, the output is returned to ASCII after the last
 * codepoint. */
static void mb_wchar_to_cp50222(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	while (len--) {
		uint32_t cp = *in++;
		unsigned int jis = lookup_wchar(cp);

		if (jis == 0 && cp != 0) {
			/* No mapping for this codepoint */
			MB_CONVERT_ERROR(buf, out, limit, cp, mb_wchar_to_cp50222);
			continue;
		}

		if (jis < 0x80) {
			/* ASCII */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
			if (buf->state == JISX_0201_KANA) {
				out = mb_convert_buf_add(out, 0xF);
			} else if (buf->state != ASCII) {
				out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
			}
			buf->state = ASCII;
			out = mb_convert_buf_add(out, jis);
			continue;
		}

		if (jis >= 0xA0 && jis < 0xE0) {
			/* JISX 0201 Kana: at most the shift byte plus one byte */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			if (buf->state != JISX_0201_KANA) {
				out = mb_convert_buf_add(out, 0xE);
				buf->state = JISX_0201_KANA;
			}
			out = mb_convert_buf_add(out, jis - 0x80);
			continue;
		}

		if (jis <= 0x927E) {
			/* JISX 0208 Kanji: at most shift byte + escape sequence + 2 bytes */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 6);
			if (buf->state == JISX_0201_KANA) {
				/* Leave kana mode; the state is updated just below */
				out = mb_convert_buf_add(out, 0xF);
			}
			if (buf->state != JISX_0208) {
				out = mb_convert_buf_add3(out, 0x1B, '$', 'B');
				buf->state = JISX_0208;
			}
			out = mb_convert_buf_add2(out, (jis >> 8) & 0xFF, jis & 0xFF);
			continue;
		}

		if (jis >= 0x10000) {
			/* JISX 0201 Latin; we 'tag' these by adding 0x10000 */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 5);
			if (buf->state == JISX_0201_KANA) {
				/* Leave kana mode; the state is updated just below */
				out = mb_convert_buf_add(out, 0xF);
			}
			if (buf->state != JISX_0201_LATIN) {
				out = mb_convert_buf_add3(out, 0x1B, '(', 'J');
				buf->state = JISX_0201_LATIN;
			}
			out = mb_convert_buf_add(out, jis & 0x7F);
			continue;
		}

		/* Codes above 0x927E but below 0x10000 cannot be emitted */
		MB_CONVERT_ERROR(buf, out, limit, cp, mb_wchar_to_cp50222);
	}

	if (end) {
		/* Finish in ASCII */
		if (buf->state == JISX_0201_KANA) {
			MB_CONVERT_BUF_ENSURE(buf, out, limit, 1);
			out = mb_convert_buf_add(out, 0xF);
		} else if (buf->state != ASCII) {
			MB_CONVERT_BUF_ENSURE(buf, out, limit, 3);
			out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
3510 
3511 #define ASCII          0
3512 #define JISX0201_KANA  0x20
3513 #define JISX0208_KANJI 0x80
3514 #define UDC            0xA0
3515 
mbfl_filt_conv_2022jpms_wchar(int c,mbfl_convert_filter * filter)3516 static int mbfl_filt_conv_2022jpms_wchar(int c, mbfl_convert_filter *filter)
3517 {
3518 	int c1, s, w;
3519 
3520 	switch (filter->status & 0xF) {
3521 	case 0:
3522 		if (c == 0x1B) {
3523 			filter->status += 2;
3524 		} else if (filter->status == JISX0201_KANA && c > 0x20 && c < 0x60) {
3525 			CK((*filter->output_function)(0xFF40 + c, filter->data));
3526 		} else if ((filter->status == JISX0208_KANJI || filter->status == UDC) && c > 0x20 && c < 0x80) {
3527 			filter->cache = c;
3528 			filter->status += 1;
3529 		} else if (c >= 0 && c < 0x80) { /* ASCII */
3530 			CK((*filter->output_function)(c, filter->data));
3531 		} else if (c > 0xA0 && c < 0xE0) { /* Kana */
3532 			CK((*filter->output_function)(0xFEC0 + c, filter->data));
3533 		} else {
3534 			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
3535 		}
3536 		break;
3537 
3538 	/* Kanji, second byte */
3539 	case 1:
3540 		w = 0;
3541 		filter->status &= ~0xF;
3542 		c1 = filter->cache;
3543 		if (c > 0x20 && c < 0x7F) {
3544 			s = ((c1 - 0x21) * 94) + c - 0x21;
3545 			if (filter->status == JISX0208_KANJI) {
3546 				if (s <= 137) {
3547 					if (s == 31) {
3548 						w = 0xFF3C; /* FULLWIDTH REVERSE SOLIDUS */
3549 					} else if (s == 32) {
3550 						w = 0xFF5E; /* FULLWIDTH TILDE */
3551 					} else if (s == 33) {
3552 						w = 0x2225; /* PARALLEL TO */
3553 					} else if (s == 60) {
3554 						w = 0xFF0D; /* FULLWIDTH HYPHEN-MINUS */
3555 					} else if (s == 80) {
3556 						w = 0xFFE0; /* FULLWIDTH CENT SIGN */
3557 					} else if (s == 81) {
3558 						w = 0xFFE1; /* FULLWIDTH POUND SIGN */
3559 					} else if (s == 137) {
3560 						w = 0xFFE2; /* FULLWIDTH NOT SIGN */
3561 					}
3562 				}
3563 
3564 				if (w == 0) {
3565 					if (s >= cp932ext1_ucs_table_min && s < cp932ext1_ucs_table_max) {		/* vendor ext1 (13ku) */
3566 						w = cp932ext1_ucs_table[s - cp932ext1_ucs_table_min];
3567 					} else if (s >= 0 && s < jisx0208_ucs_table_size) {
3568 						w = jisx0208_ucs_table[s];
3569 					} else if (s >= cp932ext2_ucs_table_min && s < cp932ext2_ucs_table_max) {		/* vendor ext2 (89ku - 92ku) */
3570 						w = cp932ext2_ucs_table[s - cp932ext2_ucs_table_min];
3571 					}
3572 				}
3573 
3574 				if (w <= 0) {
3575 					w = MBFL_BAD_INPUT;
3576 				}
3577 			} else {
3578 				if (c1 > 0x20 && c1 < 0x35) {
3579 					w = 0xE000 + ((c1 - 0x21) * 94) + c - 0x21;
3580 				} else {
3581 					w = MBFL_BAD_INPUT;
3582 				}
3583 			}
3584 			CK((*filter->output_function)(w, filter->data));
3585 		} else {
3586 			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
3587 		}
3588 		break;
3589 
3590 	/* ESC */
3591 	case 2:
3592 		if (c == '$') {
3593 			filter->status++;
3594 		} else if (c == '(') {
3595 			filter->status += 3;
3596 		} else {
3597 			filter->status &= ~0xF;
3598 			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
3599 		}
3600 		break;
3601 
3602 	/* ESC $ */
3603 	case 3:
3604 		if (c == '@' || c == 'B') {
3605 			filter->status = JISX0208_KANJI;
3606 		} else if (c == '(') {
3607 			filter->status++;
3608 		} else {
3609 			filter->status &= ~0xF;
3610 			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
3611 		}
3612 		break;
3613 
3614 	/* ESC $ ( */
3615 	case 4:
3616 		if (c == '@' || c == 'B') {
3617 			filter->status = JISX0208_KANJI;
3618 		} else if (c == '?') {
3619 			filter->status = UDC;
3620 		} else {
3621 			filter->status &= ~0xF;
3622 			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
3623 		}
3624 		break;
3625 
3626 	/* ESC ( */
3627 	case 5:
3628 		if (c == 'B' || c == 'J') {
3629 			filter->status = 0;
3630 		} else if (c == 'I') {
3631 			filter->status = JISX0201_KANA;
3632 		} else {
3633 			filter->status &= ~0xF;
3634 			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
3635 		}
3636 	}
3637 
3638 	return 0;
3639 }
3640 
/* Flush function for the ISO-2022-JP-MS -> Unicode filter
 *
 * If the input stopped in the middle of an escape sequence or a 2-byte
 * character (low 4 bits of the status are non-zero), emit one error marker.
 * Then reset the status and call the next flush function, if any.
 * Always returns 0. */
static int mbfl_filt_conv_2022jpms_wchar_flush(mbfl_convert_filter *filter)
{
	const int incomplete = filter->status & 0xF;

	if (incomplete != 0) {
		(*filter->output_function)(MBFL_BAD_INPUT, filter->data);
	}
	filter->status = 0;

	if (filter->flush_function != NULL) {
		(*filter->flush_function)(filter->data);
	}

	return 0;
}
3654 
/* Convert a pair of Shift-JIS bytes to a linear index; each lead byte covers
 * 188 positions, lead bytes above 0x9F are counted from 0xC1 and the others
 * from 0x81, and trail bytes above 0x7E are shifted down by one more */
#define sjistoidx(c1, c2) \
	((((c1) > 0x9f) ? ((c1) - 0xc1) : ((c1) - 0x81)) * 188 + (c2) - (((c2) > 0x7e) ? 0x41 : 0x40))
/* Split a linear index into the two bytes of a 94x94 JIS code */
#define idxtojis1(c) (((c) / 94) + 0x21)
#define idxtojis2(c) (((c) % 94) + 0x21)

/* Map an offset `c` into the CP932 'ext3' area (which begins at Shift-JIS
 * 0xFA40) to a two-byte JIS code
 *
 * The ext3 area is treated as three consecutive segments, beginning at
 * 0xFA40, 0xFA55 and 0xFA5C; each one is relocated so that it begins at
 * 0xEEEF, 0xEEFA and 0xED40 respectively, and the resulting index is then
 * expressed as (high byte << 8) | low byte */
static int cp932ext3_cp932ext2_jis(int c)
{
	const int seg1_start = sjistoidx(0xfa, 0x40);
	const int seg2_start = sjistoidx(0xfa, 0x55);
	const int seg3_start = sjistoidx(0xfa, 0x5c);
	int idx = seg1_start + c;

	if (idx >= seg3_start) {
		idx = idx - seg3_start + sjistoidx(0xed, 0x40);
	} else if (idx >= seg2_start) {
		idx = idx - seg2_start + sjistoidx(0xee, 0xfa);
	} else if (idx >= seg1_start) {
		idx = idx - seg1_start + sjistoidx(0xee, 0xef);
	}

	return (idxtojis1(idx) << 8) | idxtojis2(idx);
}
3673 
/* Filter callback: encode one Unicode codepoint `c` as ISO-2022-JP-MS and hand the
 * resulting bytes to filter->output_function
 *
 * Bits 8-15 of filter->status record which character set the last emitted escape
 * sequence selected: 0 = ASCII, 0x100 = JIS X 0201 kana, 0x200 = JIS X 0208,
 * 0x800 = user-defined characters. An escape sequence is written only when the
 * set needed for `c` differs from the recorded one
 *
 * Codepoints for which no mapping is found are passed to
 * mbfl_filt_conv_illegal_output. Returns 0 at the end; any CK()-wrapped call may
 * end the function earlier (CK is defined elsewhere in the project) */
static int mbfl_filt_conv_wchar_2022jpms(int c, mbfl_convert_filter *filter)
{
	int jis = 0;

	/* First try the direct Unicode -> JIS tables and the private use range */
	if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
		jis = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
	} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
		jis = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
	} else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
		jis = ucs_i_jis_table[c - ucs_i_jis_table_min];
	} else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
		jis = ucs_r_jis_table[c - ucs_r_jis_table_min];
	} else if (c >= 0xE000 && c < (0xE000 + 20*94)) {
		/* Private User Area (95ku - 114ku) */
		int pua = c - 0xE000;
		jis = (((pua / 94) + 0x7f) << 8) | ((pua % 94) + 0x21);
	}

	/* A handful of codepoints get fixed JIS codes when the tables gave nothing */
	if (jis <= 0) {
		switch (c) {
		case 0xA5: /* YEN SIGN */
			jis = 0x216F; /* FULLWIDTH YEN SIGN */
			break;
		case 0xFF3C: /* FULLWIDTH REVERSE SOLIDUS */
			jis = 0x2140;
			break;
		case 0x2225: /* PARALLEL TO */
			jis = 0x2142;
			break;
		case 0xFF0D: /* FULLWIDTH HYPHEN-MINUS */
			jis = 0x215d;
			break;
		case 0xFFE0: /* FULLWIDTH CENT SIGN */
			jis = 0x2171;
			break;
		case 0xFFE1: /* FULLWIDTH POUND SIGN */
			jis = 0x2172;
			break;
		case 0xFFE2: /* FULLWIDTH NOT SIGN */
			jis = 0x224C;
			break;
		default:
			break;
		}
	}

	if (jis <= 0 || jis >= 0xa1a1) { /* not found or X 0212 */
		int i;

		/* Fall back to a linear search of the CP932 extension tables */
		jis = -1;
		for (i = 0; i < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; i++) {
			if (c == cp932ext1_ucs_table[i]) {
				jis = (((i / 94) + 0x2D) << 8) + (i % 94) + 0x21;
				break;
			}
		}

		if (jis <= 0) {
			for (i = 0; i < cp932ext3_ucs_table_max - cp932ext3_ucs_table_min; i++) {
				if (c == cp932ext3_ucs_table[i]) {
					jis = cp932ext3_cp932ext2_jis(i);
					break;
				}
			}
		}

		/* U+0000 is always encodable as a NUL byte */
		if (c == 0) {
			jis = 0;
		}
	}

	if (jis < 0) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
		return 0;
	}

	const int charset = filter->status & 0xFF00;

	if (jis < 0x80) { /* latin */
		if (charset != 0) {
			CK((*filter->output_function)(0x1B, filter->data)); /* ESC */
			CK((*filter->output_function)('(', filter->data));
			CK((*filter->output_function)('B', filter->data));
		}
		CK((*filter->output_function)(jis, filter->data));
		filter->status = 0;
	} else if (jis > 0xA0 && jis < 0xE0) { /* kana */
		if (charset != 0x100) {
			CK((*filter->output_function)(0x1B, filter->data)); /* ESC */
			CK((*filter->output_function)('(', filter->data));
			CK((*filter->output_function)('I', filter->data));
		}
		filter->status = 0x100;
		CK((*filter->output_function)(jis & 0x7F, filter->data));
	} else if (jis < 0x7E7F) { /* X 0208 */
		if (charset != 0x200) {
			CK((*filter->output_function)(0x1B, filter->data)); /* ESC */
			CK((*filter->output_function)('$', filter->data));
			CK((*filter->output_function)('B', filter->data));
		}
		filter->status = 0x200;
		CK((*filter->output_function)((jis >> 8) & 0xFF, filter->data));
		CK((*filter->output_function)(jis & 0x7F, filter->data));
	} else if (jis < 0x927F) { /* UDC */
		if (charset != 0x800) {
			CK((*filter->output_function)(0x1B, filter->data)); /* ESC */
			CK((*filter->output_function)('$', filter->data));
			CK((*filter->output_function)('(', filter->data));
			CK((*filter->output_function)('?', filter->data));
		}
		filter->status = 0x800;
		/* The lead byte is moved down by 0x5E before being written */
		CK((*filter->output_function)(((jis >> 8) - 0x5E) & 0x7F, filter->data));
		CK((*filter->output_function)(jis & 0x7F, filter->data));
	}

	return 0;
}
3778 
/* Flush callback for the wchar -> ISO-2022-JP-MS filter
 *
 * If a non-ASCII character set is still selected (bits 8-15 of filter->status),
 * ESC ( B is written so the output ends in ASCII mode and strings can be safely
 * concatenated. The status is then cleared and the downstream flush function,
 * if there is one, is invoked; its result is not used */
static int mbfl_filt_conv_any_2022jpms_flush(mbfl_convert_filter *filter)
{
	const int charset = filter->status & 0xFF00;

	if (charset != 0) {
		CK((*filter->output_function)(0x1B, filter->data)); /* ESC */
		CK((*filter->output_function)('(', filter->data));
		CK((*filter->output_function)('B', filter->data));
	}
	filter->status = 0;

	if (!filter->flush_function) {
		return 0;
	}
	(*filter->flush_function)(filter->data);
	return 0;
}
3795 
/* Decode ISO-2022-JP-MS bytes into codepoints
 *
 * in/in_len: input cursor and remaining byte count; both are updated on return
 *            to describe the bytes which were not consumed
 * buf/bufsize: output array and its capacity (in uint32_t elements)
 * state: character set currently selected by escape sequences; this function
 *        sets it to ASCII, JISX0201_KANA, JISX0208_KANJI or UDC
 *
 * Decoding stops when the input is exhausted or `bufsize` values have been
 * written. Each loop iteration writes at most one value. Invalid or truncated
 * input is reported in-band as MBFL_BAD_INPUT
 *
 * Returns the number of values written to `buf` */
static size_t mb_iso2022jpms_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
{
	unsigned char *p = *in, *e = p + *in_len;
	uint32_t *out = buf, *limit = buf + bufsize;

	while (p < e && out < limit) {
		unsigned char c = *p++;

		if (c == 0x1B) {
			/* Every escape sequence needs at least two more bytes; if they are
			 * missing, report one error and discard whatever is left */
			if ((e - p) < 2) {
				*out++ = MBFL_BAD_INPUT;
				p = e;
				break;
			}
			unsigned char c2 = *p++;
			unsigned char c3 = *p++;

			if (c2 == '$') {
				if (c3 == '@' || c3 == 'B') {
					/* ESC $ @ or ESC $ B */
					*state = JISX0208_KANJI;
				} else if (c3 == '(' && p < e) {
					/* Four-byte sequences: ESC $ ( @, ESC $ ( B, ESC $ ( ? */
					unsigned char c4 = *p++;

					if (c4 == '@' || c4 == 'B') {
						*state = JISX0208_KANJI;
					} else if (c4 == '?') {
						*state = UDC;
					} else {
						*out++ = MBFL_BAD_INPUT;
					}
				} else {
					*out++ = MBFL_BAD_INPUT;
				}
			} else if (c2 == '(') {
				if (c3 == 'B' || c3 == 'J') {
					/* ESC ( B or ESC ( J */
					*state = ASCII;
				} else if (c3 == 'I') {
					/* ESC ( I */
					*state = JISX0201_KANA;
				} else {
					*out++ = MBFL_BAD_INPUT;
				}
			} else {
				/* Unknown byte after ESC: un-read the third byte so that only
				 * ESC and the byte following it are treated as the error */
				p--;
				*out++ = MBFL_BAD_INPUT;
			}
		} else if (*state == JISX0201_KANA && c >= 0x21 && c <= 0x5F) {
			/* 7-bit kana: 0x21..0x5F becomes U+FF61..U+FF9F */
			*out++ = 0xFF40 + c;
		} else if ((*state == JISX0208_KANJI || *state == UDC) && c >= 0x21 && c <= 0x7F) {
			/* Two-byte character; a missing second byte is an error */
			if (p == e) {
				*out++ = MBFL_BAD_INPUT;
				break;
			}
			unsigned char c2 = *p++;
			unsigned int w = 0;

			if (c2 >= 0x21 && c2 <= 0x7E) {
				/* Linear index: 94 cells per row, both bytes based at 0x21 */
				unsigned int s = ((c - 0x21) * 94) + c2 - 0x21;
				if (*state == JISX0208_KANJI) {
					/* A few low indexes have fixed mappings which take
					 * priority over the tables */
					if (s <= 137) {
						if (s == 31) {
							w = 0xFF3C; /* FULLWIDTH REVERSE SOLIDUS */
						} else if (s == 32) {
							w = 0xFF5E; /* FULLWIDTH TILDE */
						} else if (s == 33) {
							w = 0x2225; /* PARALLEL TO */
						} else if (s == 60) {
							w = 0xFF0D; /* FULLWIDTH HYPHEN-MINUS */
						} else if (s == 80) {
							w = 0xFFE0; /* FULLWIDTH CENT SIGN */
						} else if (s == 81) {
							w = 0xFFE1; /* FULLWIDTH POUND SIGN */
						} else if (s == 137) {
							w = 0xFFE2; /* FULLWIDTH NOT SIGN */
						}
					}

					if (!w) {
						/* The cp932ext1 range is checked before the JIS X 0208 table */
						if (s >= cp932ext1_ucs_table_min && s < cp932ext1_ucs_table_max) {
							w = cp932ext1_ucs_table[s - cp932ext1_ucs_table_min];
						} else if (s < jisx0208_ucs_table_size) {
							w = jisx0208_ucs_table[s];
						} else if (s >= cp932ext2_ucs_table_min && s < cp932ext2_ucs_table_max) {
							w = cp932ext2_ucs_table[s - cp932ext2_ucs_table_min];
						}
					}
				} else if (c >= 0x21 && c <= 0x34) {
					/* UDC state: lead bytes 0x21..0x34 map linearly onto
					 * codepoints starting at U+E000 */
					w = 0xE000 + ((c - 0x21) * 94) + c2 - 0x21;
				}

				/* w is still zero if nothing above produced a mapping */
				*out++ = w ? w : MBFL_BAD_INPUT;
			} else {
				*out++ = MBFL_BAD_INPUT;
			}
		} else if (c <= 0x7F) {
			/* Any other 7-bit byte is passed through unchanged */
			*out++ = c;
		} else if (c >= 0xA1 && c <= 0xDF) {
			/* 8-bit kana byte, accepted in any state: becomes U+FF61..U+FF9F */
			*out++ = 0xFEC0 + c;
		} else {
			*out++ = MBFL_BAD_INPUT;
		}
	}

	*in_len = e - p;
	*in = p;
	return out - buf;
}
3902 
/* Encode `len` codepoints from `in` as ISO-2022-JP-MS, appending to `buf`
 *
 * buf->state holds the character set currently selected in the output (ASCII,
 * JISX0201_KANA, JISX0208_KANJI or UDC); an escape sequence is written whenever
 * the next character needs a different one. If `end` is true and the output is
 * not in ASCII mode, a final ESC ( B is appended
 *
 * Codepoints with no mapping are handed to MB_CONVERT_ERROR
 *
 * NOTE(review): MB_CONVERT_BUF_LOAD/ENSURE/STORE and MB_CONVERT_ERROR are macros
 * defined elsewhere; ENSURE presumably guarantees room for the given number of
 * further output bytes - confirm against their definitions */
static void mb_wchar_to_iso2022jpms(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	while (len--) {
		uint32_t w = *in++;
		/* s stays zero for as long as no mapping has been found */
		unsigned int s = 0;

		if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) {
			s = ucs_a1_jis_table[w - ucs_a1_jis_table_min];
		} else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) {
			s = ucs_a2_jis_table[w - ucs_a2_jis_table_min];
		} else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) {
			s = ucs_i_jis_table[w - ucs_i_jis_table_min];
		} else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) {
			s = ucs_r_jis_table[w - ucs_r_jis_table_min];
		} else if (w >= 0xE000 && w < (0xE000 + 20*94)) {
			/* Private User Area (95ku - 114ku) */
			s = ((((w - 0xE000) / 94) + 0x7F) << 8) | (((w - 0xE000) % 94) + 0x21);
		}

		/* Fixed mappings for codepoints the tables did not resolve */
		if (!s) {
			if (w == 0xA5) { /* YEN SIGN */
				s = 0x216F; /* FULLWIDTH YEN SIGN */
			} else if (w == 0xFF3C) { /* FULLWIDTH REVERSE SOLIDUS */
				s = 0x2140;
			} else if (w == 0x2225) { /* PARALLEL TO */
				s = 0x2142;
			} else if (w == 0xFF0D) { /* FULLWIDTH HYPHEN-MINUS */
				s = 0x215D;
			} else if (w == 0xFFE0) { /* FULLWIDTH CENT SIGN */
				s = 0x2171;
			} else if (w == 0xFFE1) { /* FULLWIDTH POUND SIGN */
				s = 0x2172;
			} else if (w == 0xFFE2) { /* FULLWIDTH NOT SIGN */
				s = 0x224C;
			}
		}

		/* Values of 0xA1A1 and above are discarded so that the extension
		 * tables below get a chance instead */
		if (s >= 0xA1A1) /* JISX 0212 */
			s = 0;

		if (!s && w) {
			/* Linear search of the CP932 extension tables */
			for (int i = 0; i < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; i++) {
				if (w == cp932ext1_ucs_table[i]) {
					s = (((i / 94) + 0x2D) << 8) + (i % 94) + 0x21;
					break;
				}
			}

			if (!s) {
				for (int i = 0; i < cp932ext3_ucs_table_max - cp932ext3_ucs_table_min; i++) {
					if (w == cp932ext3_ucs_table[i]) {
						s = cp932ext3_cp932ext2_jis(i);
						break;
					}
				}
			}
		}

		if (!s && w) {
			/* No mapping at all (w == 0 is not an error; it is written as a NUL byte) */
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_iso2022jpms);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		} else if (s <= 0x7F) {
			/* Single byte, ASCII mode */
			if (buf->state != ASCII) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
				out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
				buf->state = ASCII;
			}
			out = mb_convert_buf_add(out, s);
		} else if (s >= 0xA1 && s <= 0xDF) {
			/* Single byte, kana mode; written with the high bit cleared */
			if (buf->state != JISX0201_KANA) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
				out = mb_convert_buf_add3(out, 0x1B, '(', 'I');
				buf->state = JISX0201_KANA;
			}
			out = mb_convert_buf_add(out, s & 0x7F);
		} else if (s <= 0x7E7E) {
			/* Two bytes, JIS X 0208 mode */
			if (buf->state != JISX0208_KANJI) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 5);
				out = mb_convert_buf_add3(out, 0x1B, '$', 'B');
				buf->state = JISX0208_KANJI;
			} else {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			}
			out = mb_convert_buf_add2(out, (s >> 8) & 0xFF, s & 0x7F);
		} else if (s < 0x927F) {
			/* Two bytes, user-defined character mode; the lead byte is moved
			 * down by 0x5E before being written */
			if (buf->state != UDC) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 6);
				out = mb_convert_buf_add4(out, 0x1B, '$', '(', '?');
				buf->state = UDC;
			} else {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			}
			out = mb_convert_buf_add2(out, ((s >> 8) - 0x5E) & 0x7F, s & 0x7F);
		} else {
			/* Value lies outside every range this encoding can represent */
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_iso2022jpms);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		}
	}

	/* Finish in ASCII mode; buf->state itself is left as it was */
	if (end && buf->state != ASCII) {
		MB_CONVERT_BUF_ENSURE(buf, out, limit, 3);
		out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
4013 
/* Filter callback: consume one byte `c` of ISO-2022-KR input and pass any
 * resulting codepoint to filter->output_function
 *
 * filter->status layout:
 *   low 4 bits - parser position: 0 = between characters, 1 = lead byte stored in
 *                filter->cache, 2 = ESC seen, 3 = ESC $ seen, 4 = ESC $ ) seen
 *   bit 0x10   - set while in KSC5601 (shifted out) mode, clear in ASCII mode
 *
 * Invalid input is reported by emitting MBFL_BAD_INPUT
 * Returns 0 at the end; any CK()-wrapped call may end the function earlier
 * (CK is defined elsewhere in the project) */
static int mbfl_filt_conv_2022kr_wchar(int c, mbfl_convert_filter *filter)
{
	int w = 0;

	switch (filter->status & 0xf) {
	/* case 0x00: ASCII */
	/* case 0x10: KSC5601 */
	case 0:
		if (c == 0x1b) { /* ESC */
			/* Enter position 2 while keeping the 0x10 mode bit */
			filter->status += 2;
		} else if (c == 0x0f) { /* shift in (ASCII) */
			filter->status = 0;
		} else if (c == 0x0e) { /* shift out (KSC5601) */
			filter->status = 0x10;
		} else if ((filter->status & 0x10) && c > 0x20 && c < 0x7f) {
			/* KSC5601 lead byte */
			filter->cache = c;
			filter->status = 0x11;
		} else if ((filter->status & 0x10) == 0 && c >= 0 && c < 0x80) {
			/* latin, CTLs */
			CK((*filter->output_function)(c, filter->data));
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 1: /* dbcs second byte */
		/* Back to "between characters", still in KSC5601 mode */
		filter->status = 0x10;
		int c1 = filter->cache;
		/* flag selects the lookup table: 1 = uhc1_ucs_table, 2 = uhc3_ucs_table,
		 * 0 = lead byte not valid */
		int flag = 0;

		if (c1 > 0x20 && c1 < 0x47) {
			flag = 1;
		} else if (c1 >= 0x47 && c1 <= 0x7e && c1 != 0x49) {
			flag = 2;
		}

		if (flag > 0 && c > 0x20 && c < 0x7f) {
			if (flag == 1) {
				/* With lead byte 0x22, trail bytes above 0x65 are not looked up;
				 * w stays 0 and is turned into MBFL_BAD_INPUT below */
				if (c1 != 0x22 || c <= 0x65) {
					w = (c1 - 1)*190 + (c - 0x41) + 0x80;
					ZEND_ASSERT(w < uhc1_ucs_table_size);
					w = uhc1_ucs_table[w];
				}
			} else {
				w = (c1 - 0x47)*94 + c - 0x21;
				if (w < uhc3_ucs_table_size) {
					w = uhc3_ucs_table[w];
				} else {
					w = MBFL_BAD_INPUT;
				}
			}

			if (w <= 0) {
				w = MBFL_BAD_INPUT;
			}
			CK((*filter->output_function)(w, filter->data));
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 2: /* ESC */
		if (c == '$') {
			filter->status++;
		} else {
			/* Abandon the escape sequence but keep the current mode bit */
			filter->status &= ~0xF;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 3: /* ESC $ */
		if (c == ')') {
			filter->status++;
		} else {
			filter->status &= ~0xF;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 4: /* ESC $ ) */
		/* Whether or not the final byte is 'C', the filter ends up in ASCII mode */
		filter->status = 0;
		if (c != 'C') {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

		EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
4106 
/* Flush callback for the ISO-2022-KR -> wchar filter
 *
 * A non-zero low nibble in filter->status means the input ended in the middle of
 * an escape sequence or a 2-byte character, which is reported with one
 * MBFL_BAD_INPUT. The status is then reset and the downstream flush function,
 * if any, is called; its result is not used */
static int mbfl_filt_conv_2022kr_wchar_flush(mbfl_convert_filter *filter)
{
	const int pending = filter->status & 0xF;

	if (pending != 0) {
		/* 2-byte character was truncated */
		CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
	}
	filter->status = 0;

	if (!filter->flush_function) {
		return 0;
	}
	(*filter->flush_function)(filter->data);
	return 0;
}
4121 
/* Filter callback: encode one Unicode codepoint `c` as ISO-2022-KR and hand the
 * resulting bytes to filter->output_function
 *
 * filter->status bits used here:
 *   0x100 - the ESC $ ) C header has already been written
 *   0x10  - output is currently shifted out (2-byte mode)
 *
 * Codepoints which cannot be encoded go to mbfl_filt_conv_illegal_output
 * Returns 0 at the end; any CK()-wrapped call may end the function earlier
 * (CK is defined elsewhere in the project) */
static int mbfl_filt_conv_wchar_2022kr(int c, mbfl_convert_filter *filter)
{
	int code = 0;

	/* The header goes out once, ahead of the first character */
	if (!(filter->status & 0x100)) {
		CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
		CK((*filter->output_function)('$', filter->data));
		CK((*filter->output_function)(')', filter->data));
		CK((*filter->output_function)('C', filter->data));
		filter->status |= 0x100;
	}

	if (c >= ucs_a1_uhc_table_min && c < ucs_a1_uhc_table_max) {
		code = ucs_a1_uhc_table[c - ucs_a1_uhc_table_min];
	} else if (c >= ucs_a2_uhc_table_min && c < ucs_a2_uhc_table_max) {
		code = ucs_a2_uhc_table[c - ucs_a2_uhc_table_min];
	} else if (c >= ucs_a3_uhc_table_min && c < ucs_a3_uhc_table_max) {
		code = ucs_a3_uhc_table[c - ucs_a3_uhc_table_min];
	} else if (c >= ucs_i_uhc_table_min && c < ucs_i_uhc_table_max) {
		code = ucs_i_uhc_table[c - ucs_i_uhc_table_min];
	} else if (c >= ucs_s_uhc_table_min && c < ucs_s_uhc_table_max) {
		code = ucs_s_uhc_table[c - ucs_s_uhc_table_min];
	} else if (c >= ucs_r1_uhc_table_min && c < ucs_r1_uhc_table_max) {
		code = ucs_r1_uhc_table[c - ucs_r1_uhc_table_min];
	} else if (c >= ucs_r2_uhc_table_min && c < ucs_r2_uhc_table_max) {
		code = ucs_r2_uhc_table[c - ucs_r2_uhc_table_min];
	}

	/* exclude UHC extension area */
	const int lead = (code >> 8) & 0xff;
	const int trail = code & 0xff;
	if (lead < 0xa1 || trail < 0xa1) {
		/* Not usable: fall back to the codepoint itself, which is only
		 * accepted further down if it is in the single-byte range */
		code = c;
	} else if (code & 0x8000) {
		/* Clear the high bit of both bytes to get the 7-bit form */
		code -= 0x8080;
	}

	if (code <= 0) {
		code = (c == 0) ? 0 : -1;
	} else if ((code >= 0x80 && code < 0x2121) || (code > 0x8080)) {
		code = -1;
	}

	if (code < 0) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
		return 0;
	}

	if (code < 0x80) { /* ASCII */
		if (filter->status & 0x10) {
			CK((*filter->output_function)(0x0f, filter->data)); /* shift in */
			filter->status &= ~0x10;
		}
		CK((*filter->output_function)(code, filter->data));
	} else {
		if (!(filter->status & 0x10)) {
			CK((*filter->output_function)(0x0e, filter->data)); /* shift out */
			filter->status |= 0x10;
		}
		CK((*filter->output_function)((code >> 8) & 0xff, filter->data));
		CK((*filter->output_function)(code & 0xff, filter->data));
	}

	return 0;
}
4190 
/* Flush callback: finish an ISO-2022-KR output stream
 *
 * Emits MBFL_BAD_INPUT if the low nibble of filter->status is non-zero (the
 * result of that call is not checked), writes a Shift In byte if the output is
 * still shifted out (bit 0x10), clears status and cache, and finally returns
 * whatever the downstream flush function returns (0 if there is none) */
static int mbfl_filt_conv_any_2022kr_flush(mbfl_convert_filter *filter)
{
	if ((filter->status & 0xF) != 0) {
		/* Escape sequence or 2-byte character was truncated */
		(*filter->output_function)(MBFL_BAD_INPUT, filter->data);
	}

	/* back to ascii */
	if ((filter->status & 0x10) != 0) {
		CK((*filter->output_function)(0x0f, filter->data)); /* shift in */
	}

	filter->cache = 0;
	filter->status = 0;

	if (!filter->flush_function) {
		return 0;
	}
	return (*filter->flush_function)(filter->data);
}
4210 
4211 #define ASCII 0
4212 #define KSC5601 1
4213 
mb_iso2022kr_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)4214 static size_t mb_iso2022kr_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
4215 {
4216 	unsigned char *p = *in, *e = p + *in_len;
4217 	uint32_t *out = buf, *limit = buf + bufsize;
4218 
4219 	while (p < e && out < limit) {
4220 		unsigned char c = *p++;
4221 
4222 		if (c == 0x1B) {
4223 			if ((e - p) < 3) {
4224 				*out++ = MBFL_BAD_INPUT;
4225 				if (p < e && *p++ == '$') {
4226 					if (p < e) {
4227 						p++;
4228 					}
4229 				}
4230 				continue;
4231 			}
4232 			unsigned char c2 = *p++;
4233 			unsigned char c3 = *p++;
4234 			unsigned char c4 = *p++;
4235 			if (c2 == '$' && c3 == ')' && c4 == 'C') {
4236 				*state = ASCII;
4237 			} else {
4238 				if (c3 != ')') {
4239 					p--;
4240 					if (c2 != '$')
4241 						p--;
4242 				}
4243 				*out++ = MBFL_BAD_INPUT;
4244 			}
4245 		} else if (c == 0xF) {
4246 			*state = ASCII;
4247 		} else if (c == 0xE) {
4248 			*state = KSC5601;
4249 		} else if (c >= 0x21 && c <= 0x7E && *state == KSC5601) {
4250 			if (p == e) {
4251 				*out++ = MBFL_BAD_INPUT;
4252 				break;
4253 			}
4254 			unsigned char c2 = *p++;
4255 			unsigned int w = 0;
4256 
4257 			if (c2 < 0x21 || c2 > 0x7E) {
4258 				*out++ = MBFL_BAD_INPUT;
4259 				continue;
4260 			}
4261 
4262 			if (c < 0x47) {
4263 				if (c != 0x22 || c2 <= 0x65) {
4264 					w = (c - 1)*190 + c2 - 0x41 + 0x80;
4265 					ZEND_ASSERT(w < uhc1_ucs_table_size);
4266 					w = uhc1_ucs_table[w];
4267 				}
4268 			} else if (c != 0x49 && c <= 0x7D) {
4269 				w = (c - 0x47)*94 + c2 - 0x21;
4270 				ZEND_ASSERT(w < uhc3_ucs_table_size);
4271 				w = uhc3_ucs_table[w];
4272 			}
4273 
4274 			if (!w)
4275 				w = MBFL_BAD_INPUT;
4276 			*out++ = w;
4277 		} else if (c < 0x80 && *state == ASCII) {
4278 			*out++ = c;
4279 		} else {
4280 			*out++ = MBFL_BAD_INPUT;
4281 		}
4282 	}
4283 
4284 	*in_len = e - p;
4285 	*in = p;
4286 	return out - buf;
4287 }
4288 
/* Flag kept in buf->state by mb_wchar_to_iso2022kr once ESC $ ) C has been written;
 * bit 0 of the same field holds the shift state (ASCII or KSC5601) */
#define EMITTED_ESC_SEQUENCE 0x10

/* Encode `len` codepoints from `in` as ISO-2022-KR, appending to `buf`
 *
 * The ESC $ ) C header is written before the first character of the first
 * non-empty call. Shift Out (0x0E) / Shift In (0x0F) bytes are inserted whenever
 * the output switches between 1-byte and 2-byte characters. If `end` is true
 * and the output is still shifted out, a final Shift In is appended
 *
 * Codepoints which cannot be encoded are handed to MB_CONVERT_ERROR
 *
 * NOTE(review): MB_CONVERT_BUF_LOAD/ENSURE/STORE and MB_CONVERT_ERROR are macros
 * defined elsewhere; ENSURE presumably guarantees room for the given number of
 * further output bytes - confirm against their definitions */
static void mb_wchar_to_iso2022kr(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);

	/* This escape sequence needs to come *somewhere* at the beginning of a line before
	 * we can use the Shift In/Shift Out bytes, but it only needs to come once in a string
	 * Rather than tracking newlines, we can just emit the sequence once at the beginning
	 * of the output string... since that will always be "the beginning of a line" */
	if (len && !(buf->state & EMITTED_ESC_SEQUENCE)) {
		MB_CONVERT_BUF_ENSURE(buf, out, limit, 4 + len);
		out = mb_convert_buf_add4(out, 0x1B, '$', ')', 'C');
		buf->state |= EMITTED_ESC_SEQUENCE;
	} else {
		MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
	}

	while (len--) {
		uint32_t w = *in++;
		unsigned int s = 0;

		if (w >= ucs_a1_uhc_table_min && w < ucs_a1_uhc_table_max) {
			s = ucs_a1_uhc_table[w - ucs_a1_uhc_table_min];
		} else if (w >= ucs_a2_uhc_table_min && w < ucs_a2_uhc_table_max) {
			s = ucs_a2_uhc_table[w - ucs_a2_uhc_table_min];
		} else if (w >= ucs_a3_uhc_table_min && w < ucs_a3_uhc_table_max) {
			s = ucs_a3_uhc_table[w - ucs_a3_uhc_table_min];
		} else if (w >= ucs_i_uhc_table_min && w < ucs_i_uhc_table_max) {
			s = ucs_i_uhc_table[w - ucs_i_uhc_table_min];
		} else if (w >= ucs_s_uhc_table_min && w < ucs_s_uhc_table_max) {
			s = ucs_s_uhc_table[w - ucs_s_uhc_table_min];
		} else if (w >= ucs_r1_uhc_table_min && w < ucs_r1_uhc_table_max) {
			s = ucs_r1_uhc_table[w - ucs_r1_uhc_table_min];
		} else if (w >= ucs_r2_uhc_table_min && w < ucs_r2_uhc_table_max) {
			s = ucs_r2_uhc_table[w - ucs_r2_uhc_table_min];
		}

		/* Only table values with both bytes >= 0xA1 are usable; they are written
		 * with the high bit of each byte cleared. Anything else falls back to the
		 * codepoint itself, which is accepted below only if it is < 0x80 */
		if (((s >> 8) & 0xFF) < 0xA1 || (s & 0xFF) < 0xA1) {
			s = w;
		} else {
			s -= 0x8080;
		}

		if ((s >= 0x80 && s < 0x2121) || (s > 0x8080)) {
			/* Not representable */
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_iso2022kr);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		} else if (s < 0x80) {
			/* Single byte: must be in ASCII mode */
			if ((buf->state & 1) != ASCII) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
				out = mb_convert_buf_add(out, 0xF);
				buf->state &= ~KSC5601;
			}
			out = mb_convert_buf_add(out, s);
		} else {
			/* Two bytes: must be in KSC5601 mode */
			if ((buf->state & 1) != KSC5601) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 3);
				out = mb_convert_buf_add(out, 0xE);
				buf->state |= KSC5601;
			} else {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			}
			out = mb_convert_buf_add2(out, (s >> 8) & 0xFF, s & 0xFF);
		}
	}

	/* Finish in ASCII mode; buf->state itself is left as it was */
	if (end && (buf->state & 1) != ASCII) {
		MB_CONVERT_BUF_ENSURE(buf, out, limit, 1);
		out = mb_convert_buf_add(out, 0xF);
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
4363 
/* Filter table for JIS -> wchar: names mbfl_filt_conv_jis_wchar as the
 * per-character function and mbfl_filt_conv_jis_wchar_flush as the flush function */
static const struct mbfl_convert_vtbl vtbl_jis_wchar = {
	mbfl_no_encoding_jis,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_jis_wchar,
	mbfl_filt_conv_jis_wchar_flush,
	NULL,
};

/* Filter table for wchar -> JIS: names mbfl_filt_conv_wchar_jis as the
 * per-character function and mbfl_filt_conv_any_jis_flush as the flush function */
static const struct mbfl_convert_vtbl vtbl_wchar_jis = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_jis,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_jis,
	mbfl_filt_conv_any_jis_flush,
	NULL,
};

/* Encoding descriptor for "JIS": ties together the two filter tables above and the
 * buffer-based converters mb_iso2022jp_to_wchar / mb_wchar_to_jis; mb_check_jis is
 * presumably the validity checker (field meanings are defined by mbfl_encoding,
 * declared elsewhere) */
const mbfl_encoding mbfl_encoding_jis = {
	mbfl_no_encoding_jis,
	"JIS",
	"ISO-2022-JP",
	NULL,
	NULL,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_jis_wchar,
	&vtbl_wchar_jis,
	mb_iso2022jp_to_wchar,
	mb_wchar_to_jis,
	mb_check_jis
};
4397 
/* Filter table for ISO-2022-JP -> wchar; it reuses the JIS decoding functions
 * (mbfl_filt_conv_jis_wchar and its flush function) */
static const struct mbfl_convert_vtbl vtbl_2022jp_wchar = {
	mbfl_no_encoding_2022jp,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_jis_wchar,
	mbfl_filt_conv_jis_wchar_flush,
	NULL,
};

/* Filter table for wchar -> ISO-2022-JP: names mbfl_filt_conv_wchar_2022jp as the
 * per-character function and mbfl_filt_conv_any_jis_flush as the flush function */
static const struct mbfl_convert_vtbl vtbl_wchar_2022jp = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_2022jp,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_2022jp,
	mbfl_filt_conv_any_jis_flush,
	NULL,
};

/* Encoding descriptor for "ISO-2022-JP": ties together the two filter tables above
 * and the buffer-based converters mb_iso2022jp_to_wchar / mb_wchar_to_iso2022jp;
 * mb_check_iso2022jp is presumably the validity checker */
const mbfl_encoding mbfl_encoding_2022jp = {
	mbfl_no_encoding_2022jp,
	"ISO-2022-JP",
	"ISO-2022-JP",
	NULL,
	NULL,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_2022jp_wchar,
	&vtbl_wchar_2022jp,
	mb_iso2022jp_to_wchar,
	mb_wchar_to_iso2022jp,
	mb_check_iso2022jp
};
4431 
/* NULL-terminated list of alternative names for ISO-2022-JP-MOBILE#KDDI */
static const char *mbfl_encoding_2022jp_kddi_aliases[] = {"ISO-2022-JP-KDDI", NULL};

/* Filter table for ISO-2022-JP-MOBILE#KDDI -> wchar: names
 * mbfl_filt_conv_2022jp_mobile_wchar and its flush function */
static const struct mbfl_convert_vtbl vtbl_2022jp_kddi_wchar = {
	mbfl_no_encoding_2022jp_kddi,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_2022jp_mobile_wchar,
	mbfl_filt_conv_2022jp_mobile_wchar_flush,
	NULL,
};

/* Filter table for wchar -> ISO-2022-JP-MOBILE#KDDI: names
 * mbfl_filt_conv_wchar_2022jp_mobile and its flush function */
static const struct mbfl_convert_vtbl vtbl_wchar_2022jp_kddi = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_2022jp_kddi,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_2022jp_mobile,
	mbfl_filt_conv_wchar_2022jp_mobile_flush,
	NULL,
};

/* Encoding descriptor for "ISO-2022-JP-MOBILE#KDDI": ties together the alias list
 * and filter tables above and the buffer-based converters
 * mb_iso2022jp_kddi_to_wchar / mb_wchar_to_iso2022jp_kddi; the final entry is NULL */
const mbfl_encoding mbfl_encoding_2022jp_kddi = {
	mbfl_no_encoding_2022jp_kddi,
	"ISO-2022-JP-MOBILE#KDDI",
	"ISO-2022-JP",
	mbfl_encoding_2022jp_kddi_aliases,
	NULL,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_2022jp_kddi_wchar,
	&vtbl_wchar_2022jp_kddi,
	mb_iso2022jp_kddi_to_wchar,
	mb_wchar_to_iso2022jp_kddi,
	NULL
};
4467 
/* Filter table for ISO-2022-JP-2004 -> wchar: names mbfl_filt_conv_jis2004_wchar
 * and its flush function */
static const struct mbfl_convert_vtbl vtbl_2022jp_2004_wchar = {
	mbfl_no_encoding_2022jp_2004,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_jis2004_wchar,
	mbfl_filt_conv_jis2004_wchar_flush,
	NULL,
};

/* Filter table for wchar -> ISO-2022-JP-2004: names mbfl_filt_conv_wchar_jis2004
 * and its flush function */
static const struct mbfl_convert_vtbl vtbl_wchar_2022jp_2004 = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_2022jp_2004,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_jis2004,
	mbfl_filt_conv_wchar_jis2004_flush,
	NULL,
};

/* Encoding descriptor for "ISO-2022-JP-2004": ties together the two filter tables
 * above and the buffer-based converters mb_iso2022jp2004_to_wchar /
 * mb_wchar_to_iso2022jp2004; the final entry is NULL */
const mbfl_encoding mbfl_encoding_2022jp_2004 = {
	mbfl_no_encoding_2022jp_2004,
	"ISO-2022-JP-2004",
	"ISO-2022-JP-2004",
	NULL,
	NULL,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_2022jp_2004_wchar,
	&vtbl_wchar_2022jp_2004,
	mb_iso2022jp2004_to_wchar,
	mb_wchar_to_iso2022jp2004,
	NULL
};
4501 
4502 /* Previously, a dubious 'encoding' called 'cp50220raw' was supported
4503  * This was just CP50220, but the implementation was less strict regarding
4504  * invalid characters; it would silently pass some through
4505  * This 'encoding' only existed in mbstring. In case some poor, lost soul is
4506  * still using it, retain minimal support by aliasing it to CP50220
4507  *
4508  * Further, mbstring also had a made-up encoding called "JIS-ms"
4509  * This was the same as CP5022{0,1,2}, but without their special ways of
4510  * handling conversion of Unicode half-width katakana */
/* NULL-terminated list of legacy names which now resolve to CP50220 */
static const char *cp50220_aliases[] = {"cp50220raw", "cp50220-raw", "JIS-ms", NULL};

/* Filter table for CP50220 -> wchar; CP50220, CP50221 and CP50222 all name the
 * same decoding functions (mbfl_filt_conv_cp5022x_wchar and its flush function) */
static const struct mbfl_convert_vtbl vtbl_cp50220_wchar = {
	mbfl_no_encoding_cp50220,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_cp5022x_wchar,
	mbfl_filt_conv_cp5022x_wchar_flush,
	NULL,
};

/* Filter table for wchar -> CP50220: names mbfl_filt_conv_wchar_cp50220 and its
 * own flush function */
static const struct mbfl_convert_vtbl vtbl_wchar_cp50220 = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_cp50220,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_cp50220,
	mbfl_filt_conv_wchar_cp50220_flush,
	NULL,
};

/* Filter table for CP50221 -> wchar (shared cp5022x decoding functions) */
static const struct mbfl_convert_vtbl vtbl_cp50221_wchar = {
	mbfl_no_encoding_cp50221,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_cp5022x_wchar,
	mbfl_filt_conv_cp5022x_wchar_flush,
	NULL,
};

/* Filter table for wchar -> CP50221: names mbfl_filt_conv_wchar_cp50221 together
 * with the generic mbfl_filt_conv_any_jis_flush */
static const struct mbfl_convert_vtbl vtbl_wchar_cp50221 = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_cp50221,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_cp50221,
	mbfl_filt_conv_any_jis_flush,
	NULL,
};

/* Filter table for CP50222 -> wchar (shared cp5022x decoding functions) */
static const struct mbfl_convert_vtbl vtbl_cp50222_wchar = {
	mbfl_no_encoding_cp50222,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_cp5022x_wchar,
	mbfl_filt_conv_cp5022x_wchar_flush,
	NULL,
};

/* Filter table for wchar -> CP50222: names mbfl_filt_conv_wchar_cp50222 and its
 * own flush function */
static const struct mbfl_convert_vtbl vtbl_wchar_cp50222 = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_cp50222,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_cp50222,
	mbfl_filt_conv_wchar_cp50222_flush,
	NULL,
};
4572 
/* Encoding descriptor for "CP50220"; the only one of the three CP5022x
 * descriptors which carries an alias list (cp50220_aliases). Decoding uses the
 * shared mb_cp5022x_to_wchar, encoding uses mb_wchar_to_cp50220 */
const mbfl_encoding mbfl_encoding_cp50220 = {
	mbfl_no_encoding_cp50220,
	"CP50220",
	"ISO-2022-JP",
	cp50220_aliases,
	NULL,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_cp50220_wchar,
	&vtbl_wchar_cp50220,
	mb_cp5022x_to_wchar,
	mb_wchar_to_cp50220,
	NULL
};

/* Encoding descriptor for "CP50221": shared mb_cp5022x_to_wchar for decoding,
 * mb_wchar_to_cp50221 for encoding */
const mbfl_encoding mbfl_encoding_cp50221 = {
	mbfl_no_encoding_cp50221,
	"CP50221",
	"ISO-2022-JP",
	NULL,
	NULL,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_cp50221_wchar,
	&vtbl_wchar_cp50221,
	mb_cp5022x_to_wchar,
	mb_wchar_to_cp50221,
	NULL
};

/* Encoding descriptor for "CP50222": shared mb_cp5022x_to_wchar for decoding,
 * mb_wchar_to_cp50222 for encoding */
const mbfl_encoding mbfl_encoding_cp50222 = {
	mbfl_no_encoding_cp50222,
	"CP50222",
	"ISO-2022-JP",
	NULL,
	NULL,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_cp50222_wchar,
	&vtbl_wchar_cp50222,
	mb_cp5022x_to_wchar,
	mb_wchar_to_cp50222,
	NULL
};
4614 
/* NULL-terminated list of alternative names for ISO-2022-JP-MS */
static const char *mbfl_encoding_2022jpms_aliases[] = {"ISO2022JPMS", NULL};

/* Filter table for ISO-2022-JP-MS -> wchar: names mbfl_filt_conv_2022jpms_wchar
 * and its flush function */
static const struct mbfl_convert_vtbl vtbl_2022jpms_wchar = {
	mbfl_no_encoding_2022jpms,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_2022jpms_wchar,
	mbfl_filt_conv_2022jpms_wchar_flush,
	NULL,
};

/* Filter table for wchar -> ISO-2022-JP-MS: names mbfl_filt_conv_wchar_2022jpms
 * and mbfl_filt_conv_any_2022jpms_flush */
static const struct mbfl_convert_vtbl vtbl_wchar_2022jpms = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_2022jpms,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_2022jpms,
	mbfl_filt_conv_any_2022jpms_flush,
	NULL,
};

/* Encoding descriptor for "ISO-2022-JP-MS": ties together the alias list and
 * filter tables above and the buffer-based converters mb_iso2022jpms_to_wchar /
 * mb_wchar_to_iso2022jpms; the final entry is NULL */
const mbfl_encoding mbfl_encoding_2022jpms = {
	mbfl_no_encoding_2022jpms,
	"ISO-2022-JP-MS",
	"ISO-2022-JP",
	mbfl_encoding_2022jpms_aliases,
	NULL,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_2022jpms_wchar,
	&vtbl_wchar_2022jpms,
	mb_iso2022jpms_to_wchar,
	mb_wchar_to_iso2022jpms,
	NULL
};
4650 
/* ISO-2022-KR is defined in RFC 1557
 *
 * The RFC says that ESC $ ) C must appear once in an ISO-2022-KR string,
 * at the beginning of a line, before any instances of the Shift In or
 * Shift Out bytes which are used to switch between ASCII/KSC 5601 modes
 *
 * We don't enforce that for ISO-2022-KR input */
4658 
/* Filter table for wchar -> ISO-2022-KR: names mbfl_filt_conv_wchar_2022kr and
 * mbfl_filt_conv_any_2022kr_flush */
static const struct mbfl_convert_vtbl vtbl_wchar_2022kr = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_2022kr,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_2022kr,
	mbfl_filt_conv_any_2022kr_flush,
	NULL,
};

/* Filter table for ISO-2022-KR -> wchar: names mbfl_filt_conv_2022kr_wchar and
 * mbfl_filt_conv_2022kr_wchar_flush */
static const struct mbfl_convert_vtbl vtbl_2022kr_wchar = {
	mbfl_no_encoding_2022kr,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_2022kr_wchar,
	mbfl_filt_conv_2022kr_wchar_flush,
	NULL,
};

/* Encoding descriptor for "ISO-2022-KR": ties together the two filter tables above
 * and the buffer-based converters mb_iso2022kr_to_wchar / mb_wchar_to_iso2022kr;
 * the final entry is NULL */
const mbfl_encoding mbfl_encoding_2022kr = {
	mbfl_no_encoding_2022kr,
	"ISO-2022-KR",
	"ISO-2022-KR",
	NULL,
	NULL,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_2022kr_wchar,
	&vtbl_wchar_2022kr,
	mb_iso2022kr_to_wchar,
	mb_wchar_to_iso2022kr,
	NULL
};
4692 
4693 /*
4694  * SJIS variants
4695  */
4696 
/* Filter callback: consume one byte `c` of Shift-JIS input and pass any resulting
 * codepoint to filter->output_function
 *
 * filter->status is 0 between characters and 1 while the first byte of a 2-byte
 * character is held in filter->cache; any other status value is ignored here
 *
 * Invalid input is reported by emitting MBFL_BAD_INPUT
 * Returns 0 at the end; any CK()-wrapped call may end the function earlier
 * (CK is defined elsewhere in the project) */
static int mbfl_filt_conv_sjis_wchar(int c, mbfl_convert_filter *filter)
{
	if (filter->status == 0) {
		if (c >= 0 && c < 0x80) { /* ASCII */
			CK((*filter->output_function)(c, filter->data));
		} else if (c > 0xA0 && c < 0xE0) { /* Kana */
			CK((*filter->output_function)(0xFEC0 + c, filter->data));
		} else if (c > 0x80 && c < 0xF0 && c != 0xA0) { /* Kanji, first byte */
			filter->status = 1;
			filter->cache = c;
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
	} else if (filter->status == 1) { /* Kanji, second byte */
		int lead = filter->cache;
		int row, cell;
		/* Stays MBFL_BAD_INPUT unless a non-zero table entry is found */
		int wc = MBFL_BAD_INPUT;

		filter->status = 0;
		if (c >= 0x40 && c <= 0xFC && c != 0x7F) {
			SJIS_DECODE(lead, c, row, cell);
			int pos = (row - 0x21)*94 + cell - 0x21;
			if (pos >= 0 && pos < jisx0208_ucs_table_size && jisx0208_ucs_table[pos]) {
				wc = jisx0208_ucs_table[pos];
			}
		}
		CK((*filter->output_function)(wc, filter->data));
	}

	return 0;
}
4736 
/* End-of-input handler for the SJIS => wchar filter.
 * If the filter stopped in any state other than 0 or 4, the input ended in the middle
 * of a character, so one MBFL_BAD_INPUT marker is emitted. The state is then reset and
 * the downstream flush function (if any) is invoked. Always returns 0. */
static int mbfl_filt_conv_sjis_wchar_flush(mbfl_convert_filter *filter)
{
	switch (filter->status) {
	case 0:
	case 4:
		/* Nothing is pending which needs an error marker */
		break;
	default:
		(*filter->output_function)(MBFL_BAD_INPUT, filter->data);
		break;
	}
	filter->status = 0;

	if (filter->flush_function) {
		(*filter->flush_function)(filter->data);
	}

	return 0;
}
4750 
/* Streaming Shift-JIS encoder: converts one codepoint `c` per call and writes the
 * resulting byte(s) through filter->output_function.
 *
 * The codepoint is first looked up in the ucs_*_jis_table tables. If no table yields
 * a positive value, a small set of hard-coded substitutions is tried. Table values of
 * 0x8080 and above (JIS X 0212) are rejected, as Shift-JIS cannot express them.
 * Unconvertible codepoints are handed to mbfl_filt_conv_illegal_output().
 *
 * Returns 0; CK() returns early if an output call reports failure. */
static int mbfl_filt_conv_wchar_sjis(int c, mbfl_convert_filter *filter)
{
	int jis = 0;

	if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
		jis = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
	} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
		jis = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
	} else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
		jis = ucs_i_jis_table[c - ucs_i_jis_table_min];
	} else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
		jis = ucs_r_jis_table[c - ucs_r_jis_table_min];
	}

	if (jis <= 0) {
		/* No table entry; try the hard-coded substitutions */
		switch (c) {
		case 0xA5: /* U+00A5 YEN SIGN */
			jis = 0x216F;
			break;
		case 0xAF:   /* U+00AF MACRON */
		case 0x203E: /* U+203E OVERLINE */
			jis = 0x2131;
			break;
		case 0xFF3C: /* U+FF3C FULLWIDTH REVERSE SOLIDUS */
			jis = 0x2140;
			break;
		case 0x2225: /* U+2225 PARALLEL TO */
			jis = 0x2142;
			break;
		case 0xFF0D: /* U+FF0D FULLWIDTH HYPHEN-MINUS */
			jis = 0x215D;
			break;
		case 0xFFE0: /* U+FFE0 FULLWIDTH CENT SIGN */
			jis = 0x2171;
			break;
		case 0xFFE1: /* U+FFE1 FULLWIDTH POUND SIGN */
			jis = 0x2172;
			break;
		case 0xFFE2: /* U+FFE2 FULLWIDTH NOT SIGN */
			jis = 0x224C;
			break;
		case 0: /* NUL is emitted as a single zero byte */
			jis = 0;
			break;
		default:
			jis = -1;
			break;
		}
	} else if (jis >= 0x8080) {
		/* JIS X 0212; not supported */
		jis = -1;
	}

	if (jis < 0) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
		return 0;
	}

	if (jis < 0x100) {
		/* Latin/Kana: one output byte */
		CK((*filter->output_function)(jis, filter->data));
		return 0;
	}

	/* Kanji: split into two 7-bit halves, then convert to two Shift-JIS bytes */
	int hi = (jis >> 8) & 0xFF;
	int lo = jis & 0xFF;
	int byte1, byte2;
	SJIS_ENCODE(hi, lo, byte1, byte2);
	CK((*filter->output_function)(byte1, filter->data));
	CK((*filter->output_function)(byte2, filter->data));

	return 0;
}
4806 
4807 static const unsigned short sjis_decode_tbl1[] = {
4808 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFFFF, 0, 188, 376, 564, 752, 940, 1128, 1316, 1504, 1692, 1880, 2068, 2256, 2444, 2632, 2820, 3008, 3196, 3384, 3572, 3760, 3948, 4136, 4324, 4512, 4700, 4888, 5076, 5264, 5452, 5640, 0xFFFF, -6016, -5828, -5640, -5452, -5264, -5076, -4888, -4700, -4512, -4324, -4136, -3948, -3760, -3572, -3384, -3196, -3008, -2820, -2632, -2444, -2256, -2068, -1880, -1692, -1504, -1316, -1128, -940, -752, -564, -376, -188, 0, 188, 376, 564, 752, 940, 1128, 1316, 1504, 1692, 1880, 2068, 2256, 2444, 2632, 2820, 3008, 3196, 3384, 3572, 3760, 3948, 4136, 4324, 4512, 4700, 4888, 5076, 5264, 5452, 5640, 5828, 6016, 6204, 6392, 6580, 6768, 6956, 7144, 7332, 7520, 7708, 7896, 8084, 8272, 8460, 8648, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF
4809 };
4810 
4811 static const unsigned short sjis_decode_tbl2[] = {
4812 	0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 0xFFFF, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 0xFFFF, 0xFFFF, 0xFFFF
4813 };
4814 
mb_sjis_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)4815 static size_t mb_sjis_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
4816 {
4817 	unsigned char *p = *in, *e = p + *in_len;
4818 	uint32_t *out = buf, *limit = buf + bufsize;
4819 
4820 	e--; /* Stop the main loop 1 byte short of the end of the input */
4821 
4822 	while (p < e && out < limit) {
4823 		unsigned char c = *p++;
4824 
4825 		if (c <= 0x7F) {
4826 			*out++ = c;
4827 		} else if (c >= 0xA1 && c <= 0xDF) { /* Kana */
4828 			*out++ = 0xFEC0 + c;
4829 		} else {
4830 			/* Don't need to check p < e; it's not possible to go out of bounds here, due to e-- above */
4831 			unsigned char c2 = *p++;
4832 			/* This is only legal if c2 >= 0x40 && c2 <= 0xFC && c2 != 0x7F
4833 			 * But the values in the above conversion tables have been chosen such that
4834 			 * illegal values of c2 will always result in w > jisx0208_ucs_table_size,
4835 			 * so we don't need to do a separate bounds check on c2
4836 			 * Likewise, the values in the conversion tables are such that illegal values
4837 			 * for c will always result in w > jisx0208_ucs_table_size */
4838 			uint32_t w = sjis_decode_tbl1[c] + sjis_decode_tbl2[c2];
4839 			if (w < jisx0208_ucs_table_size) {
4840 				w = jisx0208_ucs_table[w];
4841 				if (!w)
4842 					w = MBFL_BAD_INPUT;
4843 				*out++ = w;
4844 			} else {
4845 				if (c == 0x80 || c == 0xA0 || c > 0xEF) {
4846 					p--;
4847 				}
4848 				*out++ = MBFL_BAD_INPUT;
4849 			}
4850 		}
4851 	}
4852 
4853 	/* Finish up last byte of input string if there is one */
4854 	if (p == e && out < limit) {
4855 		unsigned char c = *p++;
4856 		if (c <= 0x7F) {
4857 			*out++ = c;
4858 		} else if (c >= 0xA1 && c <= 0xDF) {
4859 			*out++ = 0xFEC0 + c;
4860 		} else {
4861 			*out++ = MBFL_BAD_INPUT;
4862 		}
4863 	}
4864 
4865 	*in_len = e - p + 1;
4866 	*in = p;
4867 	return out - buf;
4868 }
4869 
/* Bulk Shift-JIS encoder.
 *
 * Converts `len` codepoints from `in` and appends the resulting bytes to `buf`.
 * Each codepoint produces at most 2 bytes, so room for 2 bytes per remaining codepoint
 * is reserved up front, and reserved again after each MB_CONVERT_ERROR.
 * `end` is not used. */
static void mb_wchar_to_sjis(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);

	while (len) {
		uint32_t cp = *in++;
		unsigned int jis = 0;
		bool unmappable = false;

		/* From here on, `len` counts the codepoints which come AFTER this one */
		len--;

		if (cp >= ucs_a1_jis_table_min && cp < ucs_a1_jis_table_max) {
			jis = ucs_a1_jis_table[cp - ucs_a1_jis_table_min];
		} else if (cp >= ucs_a2_jis_table_min && cp < ucs_a2_jis_table_max) {
			jis = ucs_a2_jis_table[cp - ucs_a2_jis_table_min];
		} else if (cp >= ucs_i_jis_table_min && cp < ucs_i_jis_table_max) {
			jis = ucs_i_jis_table[cp - ucs_i_jis_table_min];
		} else if (cp >= ucs_r_jis_table_min && cp < ucs_r_jis_table_max) {
			jis = ucs_r_jis_table[cp - ucs_r_jis_table_min];
		}

		if (jis == 0) {
			/* No table entry; try the hard-coded substitutions */
			switch (cp) {
			case 0xA5: /* U+00A5 YEN SIGN */
				jis = 0x216F;
				break;
			case 0xAF:   /* U+00AF MACRON */
			case 0x203E: /* U+203E OVERLINE */
				jis = 0x2131;
				break;
			case 0xFF3C: /* U+FF3C FULLWIDTH REVERSE SOLIDUS */
				jis = 0x2140;
				break;
			case 0x2225: /* U+2225 PARALLEL TO */
				jis = 0x2142;
				break;
			case 0xFF0D: /* U+FF0D FULLWIDTH HYPHEN-MINUS */
				jis = 0x215D;
				break;
			case 0xFFE0: /* U+FFE0 FULLWIDTH CENT SIGN */
				jis = 0x2171;
				break;
			case 0xFFE1: /* U+FFE1 FULLWIDTH POUND SIGN */
				jis = 0x2172;
				break;
			case 0xFFE2: /* U+FFE2 FULLWIDTH NOT SIGN */
				jis = 0x224C;
				break;
			case 0: /* NUL is emitted as a single zero byte */
				break;
			default:
				unmappable = true;
				break;
			}
		} else if (jis >= 0x8080) {
			/* JIS X 0212; not supported */
			unmappable = true;
		}

		if (unmappable) {
			MB_CONVERT_ERROR(buf, out, limit, cp, mb_wchar_to_sjis);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
			continue;
		}

		if (jis <= 0xFF) {
			/* Latin/Kana: one output byte */
			out = mb_convert_buf_add(out, jis);
		} else {
			/* Kanji: two output bytes */
			unsigned int hi = (jis >> 8) & 0xFF;
			unsigned int lo = jis & 0xFF;
			unsigned int byte1, byte2;
			SJIS_ENCODE(hi, lo, byte1, byte2);
			out = mb_convert_buf_add2(out, byte1, byte2);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
4931 
/* Streaming SJIS-Mac decoder: consumes one input byte `c` per call and passes the
 * resulting codepoint(s) to filter->output_function.
 *
 * filter->status == 0: expecting the start of a character
 * filter->status == 1: filter->cache holds the first byte of a 2-byte character
 *
 * One 2-byte character may expand to several codepoints (see the code_tbl_m and
 * code_map lookups below). Returns 0; CK() returns early if an output call fails. */
static int mbfl_filt_conv_sjis_mac_wchar(int c, mbfl_convert_filter *filter)
{
	int i, j, n;
	int c1, s, s1, s2, w;

	switch (filter->status) {
	case 0:
		if (c >= 0 && c < 0x80 && c != 0x5c) {	/* latin */
			CK((*filter->output_function)(c, filter->data));
		} else if (c > 0xa0 && c < 0xe0) {	/* kana */
			CK((*filter->output_function)(0xfec0 + c, filter->data));
		} else if (c > 0x80 && c <= 0xed && c != 0xa0) {	/* kanji first char */
			filter->status = 1;
			filter->cache = c;
		} else if (c == 0x5c) {
			/* Byte 0x5C decodes to U+00A5 YEN SIGN... */
			CK((*filter->output_function)(0x00a5, filter->data));
		} else if (c == 0x80) {
			/* ...and byte 0x80 decodes to U+005C REVERSE SOLIDUS */
			CK((*filter->output_function)(0x005c, filter->data));
		} else if (c == 0xa0) {
			/* U+00A0 NO-BREAK SPACE */
			CK((*filter->output_function)(0x00a0, filter->data));
		} else if (c == 0xfd) {
			/* U+00A9 COPYRIGHT SIGN */
			CK((*filter->output_function)(0x00a9, filter->data));
		} else if (c == 0xfe) {
			/* U+2122 TRADE MARK SIGN */
			CK((*filter->output_function)(0x2122, filter->data));
		} else if (c == 0xff) {
			/* Two codepoints: U+2026 HORIZONTAL ELLIPSIS followed by private-use U+F87F */
			CK((*filter->output_function)(0x2026, filter->data));
			CK((*filter->output_function)(0xf87f, filter->data));
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 1:		/* kanji second char */
		filter->status = 0;
		c1 = filter->cache;
		if (c >= 0x40 && c <= 0xfc && c != 0x7f) {
			w = 0;
			SJIS_DECODE(c1, c, s1, s2);
			/* Flatten the decoded pair into a single index (row * 94 + cell) */
			s = (s1 - 0x21)*94 + s2 - 0x21;
			if (s <= 0x89) {
				/* Indices which get a hard-coded codepoint instead of the
				 * jisx0208_ucs_table entry used further below */
				if (s == 0x1c) {
					w = 0x2014;		    /* U+2014 EM DASH */
				} else if (s == 0x1f) {
					w = 0xff3c;			/* U+FF3C FULLWIDTH REVERSE SOLIDUS */
				} else if (s == 0x20) {
					w = 0x301c;			/* U+301C WAVE DASH */
				} else if (s == 0x21) {
					w = 0x2016;			/* U+2016 DOUBLE VERTICAL LINE */
				} else if (s == 0x3c) {
					w = 0x2212;			/* U+2212 MINUS SIGN */
				} else if (s == 0x50) {
					w = 0x00a2;			/* U+00A2 CENT SIGN */
				} else if (s == 0x51) {
					w = 0x00a3;			/* U+00A3 POUND SIGN */
				} else if (s == 0x89) {
					w = 0x00ac;			/* U+00AC NOT SIGN */
				}
			}

			/* apple gaiji area 0x8540 - 0x886d */
			if (w == 0) {
				/* code_tbl rows are { first index, last index, first codepoint }:
				 * a contiguous index range maps onto a contiguous codepoint range */
				for (i=0; i<7; i++) {
					if (s >= code_tbl[i][0] && s <= code_tbl[i][1]) {
						w = s - code_tbl[i][0] + code_tbl[i][2];
						break;
					}
				}
			}

			if (w == 0) {

				/* code_tbl_m rows are { index, hint, codepoints... }: the character expands
				 * to a transcoding hint (0xf860, 0xf861 or another value) followed by
				 * 2, 3 or 4 more codepoints respectively. All but the last are emitted
				 * here; the last one is emitted through `w` at the end of this case. */
				for (i=0; i<code_tbl_m_len; i++) {
					if (s == code_tbl_m[i][0]) {
						if (code_tbl_m[i][1] == 0xf860) {
							n = 4;
						} else if (code_tbl_m[i][1] == 0xf861) {
							n = 5;
						} else {
							n = 6;
						}
						for (j=1; j<n-1; j++) {
							CK((*filter->output_function)(code_tbl_m[i][j], filter->data));
						}
						w = code_tbl_m[i][n-1];
						break;
					}
				}
			}

			if (w == 0) {
				/* code_ofst_tbl rows give an index range; code_map[i] holds the codepoint
				 * for each index within range i (0 meaning "no mapping") */
				for (i=0; i<8; i++) {
					if (s >= code_ofst_tbl[i][0] && s <= code_ofst_tbl[i][1]) {
						w = code_map[i][s - code_ofst_tbl[i][0]];
						if (w == 0) {
							CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
							return 0;
						}
						/* Some indices additionally get a trailing modifier codepoint */
						s2 = 0;
						if (s >= 0x043e && s <= 0x0441) {
							s2 = 0xf87a;
						} else if (s == 0x03b1 || s == 0x03b7) {
							s2 = 0xf87f;
						} else if (s == 0x04b8 || s == 0x04b9 || s == 0x04c4) {
							s2 = 0x20dd;
						} else if (s == 0x1ed9 || s == 0x1eda || s == 0x1ee8 || s == 0x1ef3 ||
								   (s >= 0x1ef5 && s <= 0x1efb) || s == 0x1f05 || s == 0x1f06 ||
								   s == 0x1f18 || (s >= 0x1ff2 && s <= 0x20a5)) {
							s2 = 0xf87e;
						}
						if (s2 > 0) {
							/* Emit the base codepoint now; the modifier goes out via `w` below */
							CK((*filter->output_function)(w, filter->data));
							w = s2;
						}
						break;
					}
				}
			}

			if (w == 0 && s >= 0 && s < jisx0208_ucs_table_size) {	/* X 0208 */
				w = jisx0208_ucs_table[s];
			}

			if (w <= 0) {
				w = MBFL_BAD_INPUT;
			}
			CK((*filter->output_function)(w, filter->data));
		} else {
			/* Illegal second byte */
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

		EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
5068 
/* Streaming SJIS-Mac encoder: converts one codepoint `c` per call and writes the
 * resulting byte(s) through filter->output_function.
 *
 * Because some SJIS-Mac characters correspond to a SEQUENCE of codepoints, this is a
 * state machine driven by filter->status:
 *
 *   0: no pending input
 *   1: filter->cache holds a codepoint which appears in s_form_tbl; the next codepoint
 *      may be a modifier (0xf87a, 0x20dd, 0xf87f, 0xf87e) which selects a different output
 *   2: filter->cache holds a transcoding hint (0xf860, 0xf861 or 0xf862)
 *   3: hint + 1 codepoint matched against code_tbl_m
 *   4: hint + 2 codepoints matched (hints 0xf861 and 0xf862 only)
 *   5: hint + 3 codepoints matched (hint 0xf862 only)
 *
 * In states 3-5, filter->cache holds the most recently matched codepoint in its low
 * 16 bits and a hint marker in bits 16-19 (0x1 for 0xf860, 0x2 for 0xf861, 0x4 for 0xf862).
 * Rows 0-4 of code_tbl_m are searched for hint 0xf860, rows 5-7 for 0xf861 and
 * rows 8-11 for 0xf862.
 *
 * Returns 0; CK() returns early if an output call reports failure. */
static int mbfl_filt_conv_wchar_sjis_mac(int c, mbfl_convert_filter *filter)
{
	int i, c1, c2, s1 = 0, s2 = 0, mode;

	// a1: U+0000 -> U+046F
	// a2: U+2000 -> U+30FF
	//  i: U+4E00 -> U+9FFF
	//  r: U+FF00 -> U+FFFF

	switch (filter->status) {
	case 1:
		/* c1 is the cached base codepoint; c is the codepoint which followed it */
		c1 = filter->cache;
		filter->cache = filter->status = 0;

		/* For each modifier, a different slice of s_form_tbl is searched for c1:
		 * entries 0-33 for 0xf87e, 34-36 for 0xf87f, 37-39 for 0x20dd, 40-43 for 0xf87a.
		 * If c1 is not found in that slice, s2 is set to c1 so that the fallback
		 * lookup below is used instead. */
		if (c == 0xf87a) {
			for (i = 0; i < 4; i++) {
				if (c1 == s_form_tbl[i+34+3+3]) {
					s1 = s_form_sjis_tbl[i+34+3+3];
					break;
				}
			}
			if (s1 <= 0) {
				s2 = c1;
			}
		} else if (c == 0x20dd) {
			for (i = 0; i < 3; i++) {
				if (c1 == s_form_tbl[i+34+3]) {
					s1 = s_form_sjis_tbl[i+34+3];
					break;
				}
			}
			if (s1 <= 0) {
				s2 = c1;
			}
		} else if (c == 0xf87f) {
			for (i = 0; i < 3; i++) {
				if (c1 == s_form_tbl[i+34]) {
					s1 = s_form_sjis_tbl[i+34];
					break;
				}
			}
			if (s1 <= 0) {
				s2 = c1;
				s1 = -1;
			}
		} else if (c == 0xf87e) {
			for (i = 0; i < 34; i++) {
				if (c1 == s_form_tbl[i]) {
					s1 = s_form_sjis_tbl[i];
					break;
				}
			}
			if (s1 <= 0) {
				s2 = c1;
				s1 = -1;
			}
		} else {
			/* c is not one of the modifiers */
			s2 = c1;
			s1 = c;
		}

		if (s2 > 0) {
			/* Use the fallback encoding for the cached codepoint, if it has one */
			for (i = 0; i < s_form_tbl_len; i++) {
				if (c1 == s_form_tbl[i]) {
					s1 = s_form_sjis_fallback_tbl[i];
					break;
				}
			}
		}

		if (s1 >= 0) {
			/* Values from s_form_sjis_tbl / s_form_sjis_fallback_tbl are written out
			 * as-is (high byte, then low byte), without going through SJIS_ENCODE */
			if (s1 < 0x100) {
				CK((*filter->output_function)(s1, filter->data));
			} else {
				CK((*filter->output_function)((s1 >> 8) & 0xff, filter->data));
				CK((*filter->output_function)(s1 & 0xff, filter->data));
			}
		} else {
			CK(mbfl_filt_conv_illegal_output(c, filter));
		}

		if (s2 <= 0 || s1 == -1) {
			break;
		}
		/* Only the cached codepoint has been output so far;
		 * fall through to convert `c` itself as in the initial state */
		s1 = s2 = 0;
		ZEND_FALLTHROUGH;

	case 0:
		/* Table lookup, with a few entries overridden for SJIS-Mac */
		if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
			s1 = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
			if (c == 0x5c) {
				s1 = 0x80;
			} else if (c == 0xa9) {
				s1 = 0xfd;
			}
		} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
			s1 = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
			if (c == 0x2122) {
				s1 = 0xfe;
			} else if (c == 0x2014) {
				s1 = 0x213d;
			} else if (c == 0x2116) {
				s1 = 0x2c1d;
			}
		} else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
			s1 = ucs_i_jis_table[c - ucs_i_jis_table_min];
		} else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
			s1 = ucs_r_jis_table[c - ucs_r_jis_table_min];
		}

		if (c >= 0x2000) {
			/* A codepoint listed in s_form_tbl may be followed by a modifier,
			 * so hold it back until the next codepoint is seen */
			for (i = 0; i < s_form_tbl_len; i++) {
				if (c == s_form_tbl[i]) {
					filter->status = 1;
					filter->cache = c;
					return 0;
				}
			}

			if (c == 0xf860 || c == 0xf861 || c == 0xf862) {
				/* Apple 'transcoding hint' codepoints (from private use area) */
				filter->status = 2;
				filter->cache = c;
				return 0;
			}
		}

		if (s1 <= 0) {
			if (c == 0xa0) {
				s1 = 0x00a0;
			} else if (c == 0xa5) { /* YEN SIGN */
				/* Unicode has codepoint 0xFFE5 for a fullwidth Yen sign;
				 * convert codepoint 0xA5 to halfwidth Yen sign */
				s1 = 0x5c; /* HALFWIDTH YEN SIGN */
			} else if (c == 0xff3c) {	/* FULLWIDTH REVERSE SOLIDUS */
				s1 = 0x2140;
			}
		}

		if (s1 <= 0) {
			/* Still nothing: consult the SJIS-Mac specific reverse tables.
			 * These yield a flat index (row * 94 + cell), which is converted
			 * to a 2-byte JIS value at the end of this block. */
			for (i=0; i<wchar2sjis_mac_r_tbl_len; i++) {
				if (c >= wchar2sjis_mac_r_tbl[i][0] && c <= wchar2sjis_mac_r_tbl[i][1]) {
					s1 = c - wchar2sjis_mac_r_tbl[i][0] + wchar2sjis_mac_r_tbl[i][2];
					break;
				}
			}

			if (s1 <= 0) {
				for (i=0; i<wchar2sjis_mac_r_map_len; i++) {
					if (c >= wchar2sjis_mac_r_map[i][0] && c <= wchar2sjis_mac_r_map[i][1]) {
						s1 = wchar2sjis_mac_code_map[i][c-wchar2sjis_mac_r_map[i][0]];
						break;
					}
				}
			}

			if (s1 <= 0) {
				for (i=0; i<wchar2sjis_mac_wchar_tbl_len ; i++) {
					if ( c == wchar2sjis_mac_wchar_tbl[i][0]) {
						s1 = wchar2sjis_mac_wchar_tbl[i][1] & 0xffff;
						break;
					}
				}
			}

			if (s1 > 0) {
				c1 = s1/94+0x21;
				c2 = s1-94*(c1-0x21)+0x21;
				s1 = (c1 << 8) | c2;
				/* s2 != 0 marks s1 as coming from the SJIS-Mac tables,
				 * which exempts it from the >= 0x8080 rejection below */
				s2 = 1;
			}
		}

		if ((s1 <= 0) || (s1 >= 0x8080 && s2 == 0)) {	/* not found or X 0212 */
			s1 = -1;
			c1 = 0;

			if (c == 0) {
				s1 = 0;
			} else if (s1 <= 0) {
				s1 = -1;
			}
		}

		if (s1 >= 0) {
			if (s1 < 0x100) { /* latin or kana */
				CK((*filter->output_function)(s1, filter->data));
			} else { /* kanji */
				c1 = (s1 >> 8) & 0xff;
				c2 = s1 & 0xff;
				SJIS_ENCODE(c1, c2, s1, s2);
				CK((*filter->output_function)(s1, filter->data));
				CK((*filter->output_function)(s2, filter->data));
			}
		} else {
			CK(mbfl_filt_conv_illegal_output(c, filter));
		}
		break;

	case 2:
		/* c1 is the cached transcoding hint; check whether c can follow it */
		c1 = filter->cache;
		filter->cache = 0;
		filter->status = 0;
		if (c1 == 0xf860) {
			for (i = 0; i < 5; i++) {
				if (c == code_tbl_m[i][2]) {
					filter->cache = c | 0x10000;
					filter->status = 3;
					break;
				}
			}
		} else if (c1 == 0xf861) {
			for (i = 0; i < 3; i++) {
				if (c == code_tbl_m[i+5][2]) {
					filter->cache = c | 0x20000;
					filter->status = 3;
					break;
				}
			}
		} else if (c1 == 0xf862) {
			for (i = 0; i < 4; i++) {
				if (c == code_tbl_m[i+5+3][2]) {
					filter->cache = c | 0x40000;
					filter->status = 3;
					break;
				}
			}
		}

		if (filter->status == 0) {
			/* Didn't find any of expected codepoints after Apple transcoding hint */
			CK(mbfl_filt_conv_illegal_output(c1, filter));
			/* The state is 0 again, so this re-processes `c` as a fresh codepoint */
			return mbfl_filt_conv_wchar_sjis_mac(c, filter);
		}
		break;

	case 3:
		s1 = 0;
		c1 = filter->cache & 0xffff;
		mode = (filter->cache & 0xf0000) >> 16;

		filter->cache = filter->status = 0;

		if (mode == 0x1) {
			/* Hint 0xf860: sequences are complete after 2 codepoints */
			for (i = 0; i < 5; i++) {
				if (c1 == code_tbl_m[i][2] && c == code_tbl_m[i][3]) {
					s1 = code_tbl_m[i][0];
					break;
				}
			}

			if (s1 > 0) {
				c1 = s1/94+0x21;
				c2 = s1-94*(c1-0x21)+0x21;
				SJIS_ENCODE(c1, c2, s1, s2);
				CK((*filter->output_function)(s1, filter->data));
				CK((*filter->output_function)(s2, filter->data));
			} else {
				/* No match: report every codepoint of the failed sequence */
				CK(mbfl_filt_conv_illegal_output(0xf860, filter));
				CK(mbfl_filt_conv_illegal_output(c1, filter));
				CK(mbfl_filt_conv_illegal_output(c, filter));
			}
		} else if (mode == 0x2) {
			/* NOTE(review): if nothing matches here (or in the mode 0x4 branch below),
			 * the state is simply reset and neither output nor an error is produced
			 * for the codepoints consumed so far — confirm this is intended */
			for (i = 0; i < 3; i++) {
				if (c1 == code_tbl_m[i+5][2] && c == code_tbl_m[i+5][3]) {
					filter->cache = c | 0x20000;
					filter->status = 4;
					break;
				}
			}
		} else if (mode == 0x4) {
			for (i = 0; i < 4; i++) {
				if (c1 == code_tbl_m[i+8][2] && c == code_tbl_m[i+8][3]) {
					filter->cache = c | 0x40000;
					filter->status = 4;
					break;
				}
			}
		}
		break;

	case 4:
		s1 = 0;
		c1 = filter->cache & 0xffff;
		mode = (filter->cache & 0xf0000) >> 16;

		filter->cache = 0;
		filter->status = 0;

		if (mode == 0x2) {
			/* Hint 0xf861: sequences are complete after 3 codepoints */
			for (i = 0; i < 3; i++) {
				if (c1 == code_tbl_m[i+5][3] && c == code_tbl_m[i+5][4]) {
					s1 = code_tbl_m[i+5][0];
					break;
				}
			}

			if (s1 > 0) {
				c1 = s1/94+0x21;
				c2 = s1-94*(c1-0x21)+0x21;
				SJIS_ENCODE(c1, c2, s1, s2);
				CK((*filter->output_function)(s1, filter->data));
				CK((*filter->output_function)(s2, filter->data));
			} else {
				CK(mbfl_filt_conv_illegal_output(0xf861, filter));
				/* Only the latest codepoint was cached; recover the one before it
				 * from the first table row whose second sequence member equals c1 */
				for (i = 0; i < 3; i++) {
					if (c1 == code_tbl_m[i+5][3]) {
						CK(mbfl_filt_conv_illegal_output(code_tbl_m[i+5][2], filter));
						break;
					}
				}
				CK(mbfl_filt_conv_illegal_output(c1, filter));
				CK(mbfl_filt_conv_illegal_output(c, filter));
			}
		} else if (mode == 0x4) {
			/* NOTE(review): as in state 3, a mismatch here is dropped silently */
			for (i = 0; i < 4; i++) {
				if (c1 == code_tbl_m[i+8][3] && c == code_tbl_m[i+8][4]) {
					filter->cache = c | 0x40000;
					filter->status = 5;
					break;
				}
			}
		}
		break;

	case 5:
		s1 = 0;
		c1 = filter->cache & 0xffff;
		mode = (filter->cache & 0xf0000) >> 16;

		filter->cache = filter->status = 0;

		if (mode == 0x4) {
			/* Hint 0xf862: sequences are complete after 4 codepoints */
			for (i = 0; i < 4; i++) {
				if (c1 == code_tbl_m[i+8][4] && c == code_tbl_m[i+8][5]) {
					s1 = code_tbl_m[i+8][0];
					break;
				}
			}

			if (s1 > 0) {
				c1 = s1/94+0x21;
				c2 = s1-94*(c1-0x21)+0x21;
				SJIS_ENCODE(c1, c2, s1, s2);
				CK((*filter->output_function)(s1, filter->data));
				CK((*filter->output_function)(s2, filter->data));
			} else {
				CK(mbfl_filt_conv_illegal_output(0xf862, filter));
				/* Recover the two earlier codepoints from the first table row
				 * whose third sequence member equals c1 */
				for (i = 0; i < 4; i++) {
					if (c1 == code_tbl_m[i+8][4]) {
						CK(mbfl_filt_conv_illegal_output( code_tbl_m[i+8][2], filter));
						CK(mbfl_filt_conv_illegal_output( code_tbl_m[i+8][3], filter));
						break;
					}
				}
				CK(mbfl_filt_conv_illegal_output(c1, filter));
				CK(mbfl_filt_conv_illegal_output(c, filter));
			}
		}
		break;

		EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
5435 
/* End-of-input handler for the wchar => SJIS-Mac filter.
 *
 * If a codepoint from s_form_tbl is still being held back (status 1), its fallback
 * encoding is written out as two bytes, provided that the fallback table has a
 * positive entry for it. Pending input in any other state is discarded.
 * The state is then cleared and the downstream flush function (if any) is invoked;
 * its return value becomes the return value of this function. */
static int mbfl_filt_conv_wchar_sjis_mac_flush(mbfl_convert_filter *filter)
{
	if (filter->status == 1 && filter->cache > 0) {
		int pending = filter->cache;
		int fallback = 0;
		int idx;

		/* Find the first table slot which holds the pending codepoint */
		for (idx = 0; idx < s_form_tbl_len; idx++) {
			if (pending == s_form_tbl[idx]) {
				break;
			}
		}
		if (idx < s_form_tbl_len) {
			fallback = s_form_sjis_fallback_tbl[idx];
		}

		if (fallback > 0) {
			CK((*filter->output_function)((fallback >> 8) & 0xff, filter->data));
			CK((*filter->output_function)(fallback & 0xff, filter->data));
		}
	}

	filter->status = 0;
	filter->cache = 0;

	if (filter->flush_function == NULL) {
		return 0;
	}
	return (*filter->flush_function)(filter->data);
}
5461 
/* Bulk SJIS-Mac decoder.
 *
 * Reads bytes from *in (at most *in_len of them) and writes up to `bufsize` codepoints
 * into `buf`. On return, *in and *in_len describe the input which has not been consumed
 * yet, and the return value is the number of codepoints written. `state` is not used.
 *
 * One input character can expand to several codepoints. When the remaining output
 * space is too small for such a sequence, the input pointer is moved back to the start
 * of that character and conversion stops, so the character is decoded again on the
 * next call. */
static size_t mb_sjismac_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
{
	/* A single SJIS-Mac kuten code can convert to up to 5 Unicode codepoints, oh my! */
	ZEND_ASSERT(bufsize >= 5);

	unsigned char *p = *in, *e = p + *in_len;
	uint32_t *out = buf, *limit = buf + bufsize;

	while (p < e && out < limit) {
		unsigned char c = *p++;

		if (c <= 0x80 || c == 0xA0) {
			/* Single bytes which pass through unchanged, except that 0x5C and 0x80
			 * decode to U+00A5 YEN SIGN and U+005C REVERSE SOLIDUS respectively */
			if (c == 0x5C) {
				*out++ = 0xA5;
			} else if (c == 0x80) {
				*out++ = 0x5C;
			} else {
				*out++ = c;
			}
		} else if (c >= 0xA1 && c <= 0xDF) {
			/* Single-byte kana */
			*out++ = 0xFEC0 + c;
		} else if (c <= 0xED) {
			/* First byte of a 2-byte character */
			if (p == e) {
				/* Input ends in the middle of the character */
				*out++ = MBFL_BAD_INPUT;
				break;
			}
			unsigned char c2 = *p++;
			/* An illegal second byte yields a sum too large to match any of the
			 * ranges below, so it ends up as MBFL_BAD_INPUT at the final range check */
			uint32_t w = sjis_decode_tbl1[c] + sjis_decode_tbl2[c2];

			if (w <= 0x89) {
				/* Indices which get a hard-coded codepoint instead of the
				 * jisx0208_ucs_table entry used further below */
				if (w == 0x1C) {
					*out++ = 0x2014; /* U+2014 EM DASH */
					continue;
				} else if (w == 0x1F) {
					*out++ = 0xFF3C; /* U+FF3C FULLWIDTH REVERSE SOLIDUS */
					continue;
				} else if (w == 0x20) {
					*out++ = 0x301C; /* U+301C WAVE DASH */
					continue;
				} else if (w == 0x21) {
					*out++ = 0x2016; /* U+2016 DOUBLE VERTICAL LINE */
					continue;
				} else if (w == 0x3C) {
					*out++ = 0x2212; /* U+2212 MINUS SIGN */
					continue;
				} else if (w == 0x50) {
					*out++ = 0xA2; /* U+00A2 CENT SIGN */
					continue;
				} else if (w == 0x51) {
					*out++ = 0xA3; /* U+00A3 POUND SIGN */
					continue;
				} else if (w == 0x89) {
					*out++ = 0xAC; /* U+00AC NOT SIGN */
					continue;
				}
			} else {
				/* The explicit range tests only serve to skip table searches which
				 * cannot succeed — NOTE(review): presumably the bounds equal the
				 * smallest/largest index in each table; the tables are not visible here */
				if (w >= 0x2F0 && w <= 0x3A3) {
					/* code_tbl rows are { first index, last index, first codepoint } */
					for (int i = 0; i < 7; i++) {
						if (w >= code_tbl[i][0] && w <= code_tbl[i][1]) {
							*out++ = w - code_tbl[i][0] + code_tbl[i][2];
							goto next_iteration;
						}
					}
				}

				if (w >= 0x340 && w <= 0x523) {
					/* code_tbl_m rows are { index, hint, codepoints... }; `n` is the number
					 * of codepoints to emit, counting the transcoding hint itself */
					for (int i = 0; i < code_tbl_m_len; i++) {
						if (w == code_tbl_m[i][0]) {
							int n = 5;
							if (code_tbl_m[i][1] == 0xF860) {
								n = 3;
							} else if (code_tbl_m[i][1] == 0xF861) {
								n = 4;
							}
							if ((limit - out) < n) {
								/* Not enough room; un-read both bytes and stop */
								p -= 2;
								goto finished;
							}
							for (int j = 1; j <= n; j++) {
								*out++ = code_tbl_m[i][j];
							}
							goto next_iteration;
						}
					}
				}

				if (w >= 0x3AC && w <= 0x20A5) {
					for (int i = 0; i < 8; i++) {
						if (w >= code_ofst_tbl[i][0] && w <= code_ofst_tbl[i][1]) {
							uint32_t w2 = code_map[i][w - code_ofst_tbl[i][0]];
							if (!w2) {
								*out++ = MBFL_BAD_INPUT;
								goto next_iteration;
							}
							/* Room for 2 codepoints is required here even when no
							 * modifier will be appended below */
							if ((limit - out) < 2) {
								p -= 2;
								goto finished;
							}
							*out++ = w2;
							/* Some indices additionally get a trailing modifier codepoint */
							if (w >= 0x43E && w <= 0x441) {
								*out++ = 0xF87A;
							} else if (w == 0x3B1 || w == 0x3B7) {
								*out++ = 0xF87F;
							} else if (w == 0x4B8 || w == 0x4B9 || w == 0x4C4) {
								*out++ = 0x20DD;
							} else if (w == 0x1ED9 || w == 0x1EDA || w == 0x1EE8 || w == 0x1EF3 || (w >= 0x1EF5 && w <= 0x1EFB) || w == 0x1F05 || w == 0x1F06 || w == 0x1F18 || (w >= 0x1FF2 && w <= 0x20A5)) {
								*out++ = 0xF87E;
							}
							goto next_iteration;
						}
					}
				}
			}

			/* None of the SJIS-Mac specific mappings applied; use the JIS X 0208 table */
			if (w < jisx0208_ucs_table_size) {
				w = jisx0208_ucs_table[w];
				if (!w)
					w = MBFL_BAD_INPUT;
				*out++ = w;
			} else {
				*out++ = MBFL_BAD_INPUT;
			}
		} else if (c == 0xFD) {
			*out++ = 0xA9; /* U+00A9 COPYRIGHT SIGN */
		} else if (c == 0xFE) {
			*out++ = 0x2122; /* U+2122 TRADE MARK SIGN */
		} else if (c == 0xFF) {
			/* Decodes to two codepoints: U+2026 HORIZONTAL ELLIPSIS, then private-use U+F87F */
			if ((limit - out) < 2) {
				p--;
				break;
			}
			*out++ = 0x2026;
			*out++ = 0xF87F;
		} else {
			/* Bytes 0xEE-0xFC are not valid in SJIS-Mac */
			*out++ = MBFL_BAD_INPUT;
		}
next_iteration: ;
	}

finished:
	*in_len = e - p;
	*in = p;
	return out - buf;
}
5606 
process_s_form(uint32_t w,uint32_t w2,unsigned int * s)5607 static bool process_s_form(uint32_t w, uint32_t w2, unsigned int *s)
5608 {
5609 	if (w2 == 0xF87A) {
5610 		for (int i = 0; i < 4; i++) {
5611 			if (w == s_form_tbl[i+34+3+3]) {
5612 				*s = s_form_sjis_tbl[i+34+3+3];
5613 				return true;
5614 			}
5615 		}
5616 	} else if (w2 == 0x20DD) {
5617 		for (int i = 0; i < 3; i++) {
5618 			if (w == s_form_tbl[i+34+3]) {
5619 				*s = s_form_sjis_tbl[i+34+3];
5620 				return true;
5621 			}
5622 		}
5623 	} else if (w2 == 0xF87F) {
5624 		for (int i = 0; i < 3; i++) {
5625 			if (w == s_form_tbl[i+34]) {
5626 				*s = s_form_sjis_tbl[i+34];
5627 				return true;
5628 			}
5629 		}
5630 	} else if (w2 == 0xF87E) {
5631 		for (int i = 0; i < 34; i++) {
5632 			if (w == s_form_tbl[i]) {
5633 				*s = s_form_sjis_tbl[i];
5634 				return true;
5635 			}
5636 		}
5637 	}
5638 
5639 	return false;
5640 }
5641 
/* For codepoints F860-F862, which are treated specially in MacJapanese.
 * The three values agree with the sequence lengths used in mb_sjismac_to_wchar:
 * 3 codepoints for a sequence starting with U+F860, 4 for U+F861 and 5 for U+F862,
 * counting the hint codepoint itself.
 * NOTE(review): presumably indexed by (hint codepoint - 0xF860); the code which reads
 * this table is not visible here — confirm. */
static int transcoding_hint_cp_width[3] = { 3, 4, 5 };
5644 
/* Convert `len` Unicode codepoints starting at `in` to SJIS-mac bytes, which are
 * appended to `buf`.
 *
 * Some SJIS-mac characters correspond to a sequence of several codepoints, and
 * such a sequence may be cut in two by the end of the input buffer. When `end`
 * is false, the partial match is remembered in `buf->state` and picked up again
 * on the next call:
 *   - bits 0-15:  the codepoint `w` which started the sequence
 *   - bits 16-23: (transcoding hint sequences only) index of the next column of
 *                 the `code_tbl_m` row which still has to be matched; always >= 3
 *   - bits 24-31: (transcoding hint sequences only) index of the `code_tbl_m` row
 * When `end` is true, a sequence which is cut short is either replaced by its
 * fallback code (if it has one) or reported through MB_CONVERT_ERROR.
 *
 * NOTE(review): the MB_CONVERT_BUF_* / MB_CONVERT_ERROR macros are defined
 * elsewhere; the comments below assume ENSURE guarantees room for the given
 * number of output bytes and ERROR emits an error marker for a codepoint. */
static void mb_wchar_to_sjismac(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	uint32_t w;

	if (buf->state) {
		/* Something was left pending by the previous call */
		w = buf->state & 0xFFFF;
		/* A suspended transcoding hint always carries a nonzero column index
		 * (>= 3) in bits 16-23. The row index in bits 24-31 is 0 for the first
		 * row of `code_tbl_m`, so testing it would misclassify that row as a
		 * plain pending codepoint. */
		if (buf->state & 0xFF0000L) {
			goto resume_transcoding_hint;
		} else {
			buf->state = 0;
			goto process_codepoint;
		}
	}

	while (len--) {
		w = *in++;
process_codepoint: ;
		unsigned int s = 0;

		/* Start from the generic Unicode -> JIS tables, overriding the few
		 * codepoints which SJIS-mac encodes differently */
		if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) {
			if (w == 0x5C) {
				s = 0x80;
			} else if (w == 0xA9) {
				s = 0xFD;
			} else {
				s = ucs_a1_jis_table[w - ucs_a1_jis_table_min];
			}
		} else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) {
			if (w == 0x2122) {
				s = 0xFE;
			} else if (w == 0x2014) {
				s = 0x213D;
			} else if (w == 0x2116) {
				s = 0x2C1D;
			} else {
				s = ucs_a2_jis_table[w - ucs_a2_jis_table_min];
			}
		} else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) {
			s = ucs_i_jis_table[w - ucs_i_jis_table_min];
		} else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) {
			s = ucs_r_jis_table[w - ucs_r_jis_table_min];
		}

		if (w >= 0x2000) {
			/* Codepoints listed in `s_form_tbl` may combine with the codepoint
			 * which follows them (see process_s_form) */
			for (int i = 0; i < s_form_tbl_len; i++) {
				if (w == s_form_tbl[i]) {
					if (!len) {
						/* No following codepoint available in this buffer */
						if (end) {
							s = s_form_sjis_fallback_tbl[i];
							if (s) {
								MB_CONVERT_BUF_ENSURE(buf, out, limit, 2);
								out = mb_convert_buf_add2(out, (s >> 8) & 0xFF, s & 0xFF);
							} else {
								MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjismac);
							}
						} else {
							/* Reprocess `w` when the next buffer arrives */
							buf->state = w;
						}
						MB_CONVERT_BUF_STORE(buf, out, limit);
						return;
					}
					uint32_t w2 = *in++;
					len--;

					if (!process_s_form(w, w2, &s)) {
						/* `w2` does not combine with `w`; put it back and use
						 * the fallback code for `w` alone */
						in--; len++;

						for (int i = 0; i < s_form_tbl_len; i++) {
							if (w == s_form_tbl[i]) {
								s = s_form_sjis_fallback_tbl[i];
								break;
							}
						}
					}

					if (s <= 0xFF) {
						out = mb_convert_buf_add(out, s);
					} else {
						MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
						out = mb_convert_buf_add2(out, (s >> 8) & 0xFF, s & 0xFF);
					}

					goto next_iteration;
				}
			}

			if (w == 0xF860 || w == 0xF861 || w == 0xF862) {
				/* Apple 'transcoding hint' codepoints (from private use area) */
				if (!len) {
					if (end) {
						MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjismac);
					} else {
						buf->state = w;
					}
					MB_CONVERT_BUF_STORE(buf, out, limit);
					return;
				}

				uint32_t w2 = *in++;
				len--;

				/* Each `code_tbl_m` row is: [0] = output code, [1] = hint
				 * codepoint, [2..] = the codepoints which must follow it */
				for (int i = 0; i < code_tbl_m_len; i++) {
					if (w == code_tbl_m[i][1] && w2 == code_tbl_m[i][2]) {
						/* This might be a valid transcoding hint sequence */
						int index = 3;

						if (buf->state) {
							/* Only entered via the goto at the top of the
							 * function: restore row and column from the state */
resume_transcoding_hint:
							i = buf->state >> 24;
							index = (buf->state >> 16) & 0xFF;
							buf->state = 0;
						}

						int expected = transcoding_hint_cp_width[w - 0xF860];

						while (index <= expected) {
							if (!len) {
								if (end) {
									/* Sequence cut short: report what was consumed */
									for (int j = 1; j < index; j++) {
										MB_CONVERT_ERROR(buf, out, limit, code_tbl_m[i][j], mb_wchar_to_sjismac);
									}
								} else {
									/* Suspend; see the state layout in the header comment */
									buf->state = (i << 24) | (index << 16) | (w & 0xFFFF);
								}
								MB_CONVERT_BUF_STORE(buf, out, limit);
								return;
							}

							w2 = *in++;
							len--;

							if (w2 != code_tbl_m[i][index]) {
								/* Didn't match */
								for (int j = 1; j < index; j++) {
									MB_CONVERT_ERROR(buf, out, limit, code_tbl_m[i][j], mb_wchar_to_sjismac);
								}
								MB_CONVERT_ERROR(buf, out, limit, w2, mb_wchar_to_sjismac);
								MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
								goto next_iteration;
							}

							index++;
						}

						/* Successful match, emit SJIS-mac bytes */
						s = code_tbl_m[i][0];
						unsigned int c1 = (s / 94) + 0x21, c2 = (s % 94) + 0x21, s1, s2;
						SJIS_ENCODE(c1, c2, s1, s2);
						MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
						out = mb_convert_buf_add2(out, s1, s2);
						goto next_iteration;
					}
				}

				/* No valid transcoding hint sequence found */
				in--; len++;
				MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjismac);
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
				continue;
			}
		}

		if (!s) {
			if (w == 0xA0) {
				s = 0xA0;
			} else if (w == 0xA5) { /* YEN SIGN */
				/* Unicode has codepoint 0xFFE5 for a fullwidth Yen sign;
				 * convert codepoint 0xA5 to halfwidth Yen sign */
				s = 0x5C; /* HALFWIDTH YEN SIGN */
			} else if (w == 0xFF3C) {	/* FULLWIDTH REVERSE SOLIDUS */
				s = 0x2140;
			} else {
				/* The SJIS-mac specific tables hold (ku*94)+ten values, which
				 * are converted to a two-byte JIS code once found */
				for (int i = 0; i < wchar2sjis_mac_r_tbl_len; i++) {
					if (w >= wchar2sjis_mac_r_tbl[i][0] && w <= wchar2sjis_mac_r_tbl[i][1]) {
						s = w - wchar2sjis_mac_r_tbl[i][0] + wchar2sjis_mac_r_tbl[i][2];
						s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
						goto found_kuten_code;
					}
				}

				for (int i = 0; i < wchar2sjis_mac_r_map_len; i++) {
					if (w >= wchar2sjis_mac_r_map[i][0] && w <= wchar2sjis_mac_r_map[i][1]) {
						s = wchar2sjis_mac_code_map[i][w - wchar2sjis_mac_r_map[i][0]];
						if (s) {
							s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
							goto found_kuten_code;
						}
					}
				}

				for (int i = 0; i < wchar2sjis_mac_wchar_tbl_len; i++) {
					if (w == wchar2sjis_mac_wchar_tbl[i][0]) {
						s = wchar2sjis_mac_wchar_tbl[i][1];
						s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
						goto found_kuten_code;
					}
				}
			}
		}

found_kuten_code:
		if ((!s && w) || s >= 0x8080) {
			/* Nothing found (codepoint 0 legitimately maps to byte 0), or the
			 * code is outside the range this encoding can represent */
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjismac);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		} else if (s <= 0xFF) {
			out = mb_convert_buf_add(out, s);
		} else {
			unsigned int c1 = (s >> 8) & 0xFF, c2 = s & 0xFF, s1, s2;
			SJIS_ENCODE(c1, c2, s1, s2);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			out = mb_convert_buf_add2(out, s1, s2);
		}

next_iteration: ;
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
5867 
mbfilter_sjis_emoji_docomo2unicode(int s,int * snd)5868 int mbfilter_sjis_emoji_docomo2unicode(int s, int *snd)
5869 {
5870 	/* All three mobile vendors had emoji for numbers on a telephone keypad
5871 	 * Unicode doesn't have those, but it has a combining character which puts
5872 	 * a 'keypad button' around the following character, making it look like
5873 	 * a key on a telephone or keyboard. That combining char is codepoint 0x20E3. */
5874 	if (s >= mb_tbl_code2uni_docomo1_min && s <= mb_tbl_code2uni_docomo1_max) {
5875 		if ((s >= DOCOMO_KEYPAD(1) && s <= DOCOMO_KEYPAD(9)) || s == DOCOMO_KEYPAD(0) || s == DOCOMO_KEYPAD_HASH) {
5876 			EMIT_KEYPAD_EMOJI(convert_emoji_cp(mb_tbl_code2uni_docomo1[s - mb_tbl_code2uni_docomo1_min]));
5877 		} else {
5878 			*snd = 0;
5879 			return convert_emoji_cp(mb_tbl_code2uni_docomo1[s - mb_tbl_code2uni_docomo1_min]);
5880 		}
5881 	}
5882 	return 0;
5883 }
5884 
mbfilter_sjis_emoji_sb2unicode(int s,int * snd)5885 int mbfilter_sjis_emoji_sb2unicode(int s, int *snd)
5886 {
5887 	if (s >= mb_tbl_code2uni_sb1_min && s <= mb_tbl_code2uni_sb1_max) {
5888 		if (s == 0x2817 || (s >= 0x2823 && s <= 0x282C)) {
5889 			EMIT_KEYPAD_EMOJI(mb_tbl_code2uni_sb1[s - mb_tbl_code2uni_sb1_min]);
5890 		} else {
5891 			*snd = 0;
5892 			return convert_emoji_cp(mb_tbl_code2uni_sb1[s - mb_tbl_code2uni_sb1_min]);
5893 		}
5894 	} else if (s >= mb_tbl_code2uni_sb2_min && s <= mb_tbl_code2uni_sb2_max) {
5895 		*snd = 0;
5896 		return convert_emoji_cp(mb_tbl_code2uni_sb2[s - mb_tbl_code2uni_sb2_min]);
5897 	} else if (s >= mb_tbl_code2uni_sb3_min && s <= mb_tbl_code2uni_sb3_max) {
5898 		if (s >= 0x2B02 && s <= 0x2B0B) {
5899 			EMIT_FLAG_EMOJI(nflags_sb[s - 0x2B02]);
5900 		} else {
5901 			*snd = 0;
5902 			return convert_emoji_cp(mb_tbl_code2uni_sb3[s - mb_tbl_code2uni_sb3_min]);
5903 		}
5904 	}
5905 	return 0;
5906 }
5907 
/* Try to convert codepoint `c` to a DoCoMo emoji.
 *
 * Returns 1 with the emoji's (ku*94)+ten value in `*s1` on success, 0 if `c`
 * is not a DoCoMo emoji or has been held back in `filter->cache` (in which
 * case `filter->status` is 1 on return).
 *
 * When converting SJIS-Mobile to Unicode, keypad symbol emoji become a
 * sequence of 2 codepoints: the plain character followed by combining
 * character 0x20E3. In this direction, '#' and '0'-'9' are therefore held back
 * for one call to see whether 0x20E3 follows.
 *
 * NOTE(review): CK is defined elsewhere; it presumably makes this function
 * return a negative value if the output function fails. */
int mbfilter_unicode2sjis_emoji_docomo(int c, int *s1, mbfl_convert_filter *filter)
{
	if (filter->status == 1) {
		int held = filter->cache;
		filter->cache = filter->status = 0;

		if (c == 0x20E3) {
			switch (held) {
			case '#':
				*s1 = 0x2964;
				break;
			case '0':
				*s1 = 0x296F;
				break;
			default: /* Held character was '1'-'9' */
				*s1 = 0x2966 + (held - '1');
				break;
			}
			return 1;
		}

		/* Not a keypad sequence after all, so pass the held character through
		 * (single-byte ASCII characters are valid in Shift-JIS) and carry on
		 * with the current one as usual */
		CK((*filter->output_function)(held, filter->data));
	}

	if (c == '#' || (c >= '0' && c <= '9')) {
		filter->cache = c;
		filter->status = 1;
		return 0;
	}

	if (c == 0xA9) { /* Copyright sign */
		*s1 = 0x29B5;
		return 1;
	}
	if (c == 0x00AE) { /* Registered sign */
		*s1 = 0x29BA;
		return 1;
	}

	/* Only the first table whose range contains `c` is searched */
	if (c >= mb_tbl_uni_docomo2code2_min && c <= mb_tbl_uni_docomo2code2_max) {
		int pos = mbfl_bisec_srch2(c, mb_tbl_uni_docomo2code2_key, mb_tbl_uni_docomo2code2_len);
		if (pos >= 0) {
			*s1 = mb_tbl_uni_docomo2code2_value[pos];
			return 1;
		}
	} else if (c >= mb_tbl_uni_docomo2code3_min && c <= mb_tbl_uni_docomo2code3_max) {
		int pos = mbfl_bisec_srch2(c - 0x10000, mb_tbl_uni_docomo2code3_key, mb_tbl_uni_docomo2code3_len);
		if (pos >= 0) {
			*s1 = mb_tbl_uni_docomo2code3_value[pos];
			return 1;
		}
	} else if (c >= mb_tbl_uni_docomo2code5_min && c <= mb_tbl_uni_docomo2code5_max) {
		int pos = mbfl_bisec_srch2(c - 0xF0000, mb_tbl_uni_docomo2code5_key, mb_tbl_uni_docomo2code5_len);
		if (pos >= 0) {
			*s1 = mb_tbl_uni_docomo2code5_val[pos];
			return 1;
		}
	}

	return 0;
}
5970 
/* Try to convert codepoint `c` to a KDDI emoji.
 *
 * Returns 1 with the emoji's (ku*94)+ten value in `*s1` on success, 0 if `c`
 * is not a KDDI emoji or has been held back in `filter->cache`.
 *
 * Two kinds of 2-codepoint sequences are recognized, so two kinds of codepoint
 * are held back for one call:
 *   status 1: '#' or '0'-'9', which forms a keypad emoji if 0x20E3 follows
 *   status 2: a Regional Indicator, which forms a national flag emoji if a
 *             matching second Regional Indicator follows
 *
 * NOTE(review): CK is defined elsewhere; it presumably makes this function
 * return a negative value if the output function fails. */
int mbfilter_unicode2sjis_emoji_kddi_sjis(int c, int *s1, mbfl_convert_filter *filter)
{
	int pending = filter->status;

	if (pending == 1 || pending == 2) {
		int held = filter->cache;
		filter->cache = filter->status = 0;

		if (pending == 1) {
			if (c == 0x20E3) {
				switch (held) {
				case '#':
					*s1 = 0x25BC;
					break;
				case '0':
					*s1 = 0x2830;
					break;
				default: /* Held character was '1'-'9' */
					*s1 = 0x27a6 + (held - '1');
					break;
				}
				return 1;
			}

			/* Not a keypad sequence; pass the held character through */
			CK((*filter->output_function)(held, filter->data));
		} else {
			if (c >= NFLAGS('B') && c <= NFLAGS('U')) { /* B for GB, U for RU */
				for (int k = 0; k < 10; k++) {
					if (held == NFLAGS(nflags_s[k][0]) && c == NFLAGS(nflags_s[k][1])) {
						*s1 = nflags_code_kddi[k];
						return 1;
					}
				}
			}

			/* If none of the KDDI national flag emoji matched, then we have no
			 * way to convert the held codepoint... */
			mbfl_filt_conv_illegal_output(held, filter);
		}
	}

	if (c == '#' || (c >= '0' && c <= '9')) {
		filter->cache = c;
		filter->status = 1;
		return 0;
	}
	if (c >= NFLAGS('C') && c <= NFLAGS('U')) { /* C for CN, U for US */
		filter->cache = c;
		filter->status = 2;
		return 0;
	}

	if (c == 0xA9) { /* Copyright sign */
		*s1 = 0x27DC;
		return 1;
	}
	if (c == 0xAE) { /* Registered sign */
		*s1 = 0x27DD;
		return 1;
	}

	/* Only the first table whose range contains `c` is searched */
	if (c >= mb_tbl_uni_kddi2code2_min && c <= mb_tbl_uni_kddi2code2_max) {
		int pos = mbfl_bisec_srch2(c, mb_tbl_uni_kddi2code2_key, mb_tbl_uni_kddi2code2_len);
		if (pos >= 0) {
			*s1 = mb_tbl_uni_kddi2code2_value[pos];
			return 1;
		}
	} else if (c >= mb_tbl_uni_kddi2code3_min && c <= mb_tbl_uni_kddi2code3_max) {
		int pos = mbfl_bisec_srch2(c - 0x10000, mb_tbl_uni_kddi2code3_key, mb_tbl_uni_kddi2code3_len);
		if (pos >= 0) {
			*s1 = mb_tbl_uni_kddi2code3_value[pos];
			return 1;
		}
	} else if (c >= mb_tbl_uni_kddi2code5_min && c <= mb_tbl_uni_kddi2code5_max) {
		int pos = mbfl_bisec_srch2(c - 0xF0000, mb_tbl_uni_kddi2code5_key, mb_tbl_uni_kddi2code5_len);
		if (pos >= 0) {
			*s1 = mb_tbl_uni_kddi2code5_val[pos];
			return 1;
		}
	}

	return 0;
}
6042 
/* Try to convert codepoint `c` to a SoftBank emoji.
 *
 * Returns 1 with the emoji's (ku*94)+ten value in `*s1` on success, 0 if `c`
 * is not a SoftBank emoji or has been held back in `filter->cache`.
 *
 * Two kinds of 2-codepoint sequences are recognized, so two kinds of codepoint
 * are held back for one call:
 *   status 1: '#' or '0'-'9', which forms a keypad emoji if 0x20E3 follows
 *   status 2: a Regional Indicator, which forms a national flag emoji if a
 *             matching second Regional Indicator follows
 *
 * NOTE(review): CK is defined elsewhere; it presumably makes this function
 * return a negative value if the output function fails. */
int mbfilter_unicode2sjis_emoji_sb(int c, int *s1, mbfl_convert_filter *filter)
{
	if (filter->status == 1) {
		int c1 = filter->cache;
		filter->cache = filter->status = 0;
		if (c == 0x20E3) {
			if (c1 == '#') {
				*s1 = 0x2817;
			} else if (c1 == '0') {
				*s1 = 0x282c;
			} else { /* Previous character was '1'-'9' */
				*s1 = 0x2823 + (c1 - '1');
			}
			return 1;
		} else {
			/* Not a keypad sequence; pass the held character through. The
			 * result is checked with CK, exactly as the DoCoMo and KDDI
			 * versions of this function do, so that an output failure is not
			 * silently dropped. */
			CK((*filter->output_function)(c1, filter->data));
		}
	} else if (filter->status == 2) {
		int c1 = filter->cache;
		filter->cache = filter->status = 0;
		if (c >= NFLAGS('B') && c <= NFLAGS('U')) { /* B for GB, U for RU */
			for (int i = 0; i < 10; i++) {
				if (c1 == NFLAGS(nflags_s[i][0]) && c == NFLAGS(nflags_s[i][1])) {
					*s1 = nflags_code_sb[i];
					return 1;
				}
			}
		}

		/* If none of the SoftBank national flag emoji matched, then we have no way
		 * to convert the previous codepoint... */
		mbfl_filt_conv_illegal_output(c1, filter);
	}

	if (c == '#' || (c >= '0' && c <= '9')) {
		/* Might be the start of a keypad emoji; wait for the next codepoint */
		filter->status = 1;
		filter->cache = c;
		return 0;
	} else if (c >= NFLAGS('C') && c <= NFLAGS('U')) { /* C for CN, U for US */
		/* Might be the start of a national flag emoji */
		filter->status = 2;
		filter->cache = c;
		return 0;
	}

	if (c == 0xA9) { /* Copyright sign */
		*s1 = 0x2855;
		return 1;
	} else if (c == 0xAE) { /* Registered sign */
		*s1 = 0x2856;
		return 1;
	} else if (c >= mb_tbl_uni_sb2code2_min && c <= mb_tbl_uni_sb2code2_max) {
		int i = mbfl_bisec_srch2(c, mb_tbl_uni_sb2code2_key, mb_tbl_uni_sb2code2_len);
		if (i >= 0) {
			*s1 = mb_tbl_uni_sb2code2_value[i];
			return 1;
		}
	} else if (c >= mb_tbl_uni_sb2code3_min && c <= mb_tbl_uni_sb2code3_max) {
		int i = mbfl_bisec_srch2(c - 0x10000, mb_tbl_uni_sb2code3_key, mb_tbl_uni_sb2code3_len);
		if (i >= 0) {
			*s1 = mb_tbl_uni_sb2code3_value[i];
			return 1;
		}
	} else if (c >= mb_tbl_uni_sb2code5_min && c <= mb_tbl_uni_sb2code5_max) {
		int i = mbfl_bisec_srch2(c - 0xF0000, mb_tbl_uni_sb2code5_key, mb_tbl_uni_sb2code5_len);
		if (i >= 0) {
			*s1 = mb_tbl_uni_sb2code5_val[i];
			return 1;
		}
	}
	return 0;
}
6114 
/* Byte-at-a-time decoder for the SJIS-Mobile encodings; `filter->from` tells
 * which vendor's emoji (DoCoMo, KDDI or SoftBank) apply. Decoded codepoints
 * are passed to `filter->output_function`.
 *
 * `filter->status` values:
 *   0: at the start of a character
 *   1: first byte of a double-byte character is in `filter->cache`
 *   2: ESC was seen (SoftBank only)
 *   3: ESC $ was seen
 *   4: ESC $ <letter> was seen, with the letter in `filter->cache`; every
 *      following byte is one emoji, until byte 0x0F ends the sequence
 *
 * Returns 0. NOTE(review): CK is defined elsewhere; it presumably returns
 * early with a negative value if the output function fails. */
static int mbfl_filt_conv_sjis_mobile_wchar(int c, mbfl_convert_filter *filter)
{
	int c1, s, s1, s2, w, snd = 0;

	switch (filter->status) {
	case 0:
		if (c >= 0 && c < 0x80) { /* ASCII */
			if (filter->from == &mbfl_encoding_sjis_sb && c == 0x1B) {
				/* ESC; escape sequences were used on older SoftBank phones for emoji */
				filter->cache = c;
				filter->status = 2;
			} else {
				CK((*filter->output_function)(c, filter->data));
			}
		} else if (c > 0xA0 && c < 0xE0) { /* Kana */
			/* Bytes 0xA1-0xDF become codepoints 0xFF61-0xFF9F */
			CK((*filter->output_function)(0xFEC0 + c, filter->data));
		} else if (c > 0x80 && c < 0xFD && c != 0xA0) { /* Kanji, first byte */
			filter->status = 1;
			filter->cache = c;
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 1: /* Kanji, second byte */
		filter->status = 0;
		c1 = filter->cache;
		if (c >= 0x40 && c <= 0xFC && c != 0x7F) {
			w = 0;
			SJIS_DECODE(c1, c, s1, s2);
			/* `s` is the (ku*94)+ten index used by all the lookup tables below */
			s = ((s1 - 0x21) * 94) + s2 - 0x21;
			if (s <= 137) {
				/* A few characters are mapped differently from the JIS X 0208 table */
				if (s == 31) {
					w = 0xFF3C; /* FULLWIDTH REVERSE SOLIDUS */
				} else if (s == 32) {
					w = 0xFF5E; /* FULLWIDTH TILDE */
				} else if (s == 33) {
					w = 0x2225; /* PARALLEL TO */
				} else if (s == 60) {
					w = 0xFF0D; /* FULLWIDTH HYPHEN-MINUS */
				} else if (s == 80) {
					w = 0xFFE0; /* FULLWIDTH CENT SIGN */
				} else if (s == 81) {
					w = 0xFFE1; /* FULLWIDTH POUND SIGN */
				} else if (s == 137) {
					w = 0xFFE2; /* FULLWIDTH NOT SIGN */
				}
			}
			if (w == 0) {
				if (s >= cp932ext1_ucs_table_min && s < cp932ext1_ucs_table_max) {		/* vendor ext1 (13ku) */
					w = cp932ext1_ucs_table[s - cp932ext1_ucs_table_min];
				} else if (s >= 0 && s < jisx0208_ucs_table_size) {		/* X 0208 */
					w = jisx0208_ucs_table[s];
				} else if (s >= cp932ext2_ucs_table_min && s < cp932ext2_ucs_table_max) {		/* vendor ext2 (89ku - 92ku) */
					w = cp932ext2_ucs_table[s - cp932ext2_ucs_table_min];
				}

				/* Emoji */
				/* An emoji lookup overrides whatever the tables above produced.
				 * If the emoji is a 2-codepoint sequence, its first codepoint
				 * (`snd`) is emitted here and the second one (`w`) below. */
				if (filter->from == &mbfl_encoding_sjis_docomo && s >= mb_tbl_code2uni_docomo1_min && s <= mb_tbl_code2uni_docomo1_max) {
					w = mbfilter_sjis_emoji_docomo2unicode(s, &snd);
					if (snd > 0) {
						CK((*filter->output_function)(snd, filter->data));
					}
				} else if (filter->from == &mbfl_encoding_sjis_kddi && s >= mb_tbl_code2uni_kddi1_min && s <= mb_tbl_code2uni_kddi2_max) {
					w = mbfilter_sjis_emoji_kddi2unicode(s, &snd);
					if (snd > 0) {
						CK((*filter->output_function)(snd, filter->data));
					}
				} else if (filter->from == &mbfl_encoding_sjis_sb && s >= mb_tbl_code2uni_sb1_min && s <= mb_tbl_code2uni_sb3_max) {
					w = mbfilter_sjis_emoji_sb2unicode(s, &snd);
					if (snd > 0) {
						CK((*filter->output_function)(snd, filter->data));
					}
				}

				if (w == 0) {
					if (s >= cp932ext3_ucs_table_min && s < cp932ext3_ucs_table_max) { /* vendor ext3 (115ku - 119ku) */
						w = cp932ext3_ucs_table[s - cp932ext3_ucs_table_min];
					} else if (s >= (94*94) && s < (114*94)) { /* user (95ku - 114ku) */
						w = s - (94*94) + 0xe000;
					}
				}
			}
			if (w <= 0) {
				w = MBFL_BAD_INPUT;
			}
			CK((*filter->output_function)(w, filter->data));
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	/* ESC: Softbank Emoji */
	case 2:
		if (c == '$') {
			filter->cache = c;
			filter->status++;
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			filter->status = filter->cache = 0;
		}
		break;

	/* ESC $: Softbank Emoji */
	case 3:
		if ((c >= 'E' && c <= 'G') || (c >= 'O' && c <= 'Q')) {
			filter->cache = c;
			filter->status++;
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			filter->status = filter->cache = 0;
		}
		break;

	/* ESC $ [GEFOPQ]: Softbank Emoji */
	case 4:
		c1 = filter->cache;
		if (c == 0xF) { /* Terminate sequence of emoji */
			filter->status = filter->cache = 0;
			return 0;
		} else {
			/* The letter selects a row of the emoji table (`s1` is the index
			 * of the row's first entry) and bounds the bytes valid in that row */
			if (c1 == 'G' && c >= 0x21 && c <= 0x7a) {
				s1 = (0x91 - 0x21) * 94;
			} else if (c1 == 'E' && c >= 0x21 && c <= 0x7A) {
				s1 = (0x8D - 0x21) * 94;
			} else if (c1 == 'F' && c >= 0x21 && c <= 0x7A) {
				s1 = (0x8E - 0x21) * 94;
			} else if (c1 == 'O' && c >= 0x21 && c <= 0x6D) {
				s1 = (0x92 - 0x21) * 94;
			} else if (c1 == 'P' && c >= 0x21 && c <= 0x6C) {
				s1 = (0x95 - 0x21) * 94;
			} else if (c1 == 'Q' && c >= 0x21 && c <= 0x5E) {
				s1 = (0x96 - 0x21) * 94;
			} else {
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
				filter->status = filter->cache = 0;
				return 0;
			}

			w = mbfilter_sjis_emoji_sb2unicode(s1 + c - 0x21, &snd);
			if (w > 0) {
				/* Status stays at 4, so the next byte is decoded as another emoji */
				if (snd > 0) {
					CK((*filter->output_function)(snd, filter->data));
				}
				CK((*filter->output_function)(w, filter->data));
			} else {
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
				filter->status = filter->cache = 0;
			}
		}
	}

	return 0;
}
6269 
/* Encode one Unicode codepoint `c` for the SJIS-Mobile encodings; `filter->to`
 * tells which vendor's emoji (DoCoMo, KDDI or SoftBank) apply. The resulting
 * bytes are passed to `filter->output_function`.
 *
 * `s1` holds the candidate code: < 0x100 for a single byte, otherwise a
 * two-byte JIS code, or a negative value if nothing was found.
 *
 * Returns 0. NOTE(review): CK is defined elsewhere; it presumably returns
 * early with a negative value if the output function fails. */
static int mbfl_filt_conv_wchar_sjis_mobile(int c, mbfl_convert_filter *filter)
{
	int c1, c2, s1 = 0, s2 = 0;

	if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
		s1 = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
	} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
		s1 = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
	} else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
		s1 = ucs_i_jis_table[c - ucs_i_jis_table_min];
	} else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
		s1 = ucs_r_jis_table[c - ucs_r_jis_table_min];
	} else if (c >= 0xE000 && c < (0xE000 + 20*94)) {
		/* Private User Area (95ku - 114ku) */
		s1 = c - 0xE000;
		c1 = (s1 / 94) + 0x7F;
		c2 = (s1 % 94) + 0x21;
		s1 = (c1 << 8) | c2;
		/* Flag: this code is >= 0x8080 on purpose; don't reject it below */
		s2 = 1;
	}

	if (s1 <= 0) {
		if (c == 0xA5) { /* YEN SIGN */
			s1 = 0x216F; /* FULLWIDTH YEN SIGN */
		} else if (c == 0xFF3c) { /* FULLWIDTH REVERSE SOLIDUS */
			s1 = 0x2140;
		} else if (c == 0x2225) { /* PARALLEL TO */
			s1 = 0x2142;
		} else if (c == 0xFF0D) { /* FULLWIDTH HYPHEN-MINUS */
			s1 = 0x215D;
		} else if (c == 0xFFE0) { /* FULLWIDTH CENT SIGN */
			s1 = 0x2171;
		} else if (c == 0xFFE1) { /* FULLWIDTH POUND SIGN */
			s1 = 0x2172;
		} else if (c == 0xFFE2) { /* FULLWIDTH NOT SIGN */
			s1 = 0x224C;
		}
	}

	if ((s1 <= 0) || (s1 >= 0x8080 && s2 == 0)) {	/* not found or X 0212 */
		s1 = -1;

		/* CP932 vendor ext1 (13ku) */
		for (c1 = 0; c1 < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; c1++) {
			if (c == cp932ext1_ucs_table[c1]) {
				s1 = (((c1 / 94) + 0x2D) << 8) + (c1 % 94) + 0x21;
				break;
			}
		}

		if (s1 <= 0) {
			/* CP932 vendor ext2; rows start at first byte 0x79, i.e. 89ku */
			for (c1 = 0; c1 < cp932ext2_ucs_table_max - cp932ext2_ucs_table_min; c1++) {
				if (c == cp932ext2_ucs_table[c1]) {
					s1 = (((c1 / 94) + 0x79) << 8) + (c1 % 94) + 0x21;
					break;
				}
			}
		}

		/* Codepoint 0 is valid and encodes as byte 0 */
		if (c == 0) {
			s1 = 0;
		}
	}

	/* Emoji take precedence over whatever was found above. On success the
	 * helper stores a (ku*94)+ten value in `s1`, which is converted to a
	 * two-byte JIS code here.
	 * NOTE(review): the helpers contain CK(...) calls. If CK makes them return
	 * a negative value on output failure, that value is nonzero and is treated
	 * as 'emoji found' by the && below - confirm whether that is intended. */
	if ((filter->to == &mbfl_encoding_sjis_docomo && mbfilter_unicode2sjis_emoji_docomo(c, &s1, filter)) ||
		  (filter->to == &mbfl_encoding_sjis_kddi   && mbfilter_unicode2sjis_emoji_kddi_sjis(c, &s1, filter)) ||
		  (filter->to == &mbfl_encoding_sjis_sb     && mbfilter_unicode2sjis_emoji_sb(c, &s1, filter))) {
		s1 = (((s1 / 94) + 0x21) << 8) | ((s1 % 94) + 0x21);
	}

	/* The emoji helper is holding `c` back to see whether the next codepoint
	 * combines with it; nothing to emit yet */
	if (filter->status) {
		return 0;
	}

	if (s1 >= 0) {
		if (s1 < 0x100) { /* Latin/Kana */
			CK((*filter->output_function)(s1, filter->data));
		} else { /* Kanji */
			c1 = (s1 >> 8) & 0xff;
			c2 = s1 & 0xff;
			SJIS_ENCODE(c1, c2, s1, s2);
			CK((*filter->output_function)(s1, filter->data));
			CK((*filter->output_function)(s2, filter->data));
		}
	} else {
		CK(mbfl_filt_conv_illegal_output(c, filter));
	}

	return 0;
}
6361 
/* Called at the end of input: deal with a codepoint which was held back in
 * `filter->cache` while waiting for a possible second half of a 2-codepoint
 * emoji sequence, then call the filter's `flush_function`, if it has one.
 *
 * Returns 0. NOTE(review): CK is defined elsewhere; it presumably returns
 * early with a negative value if the output function fails. */
int mbfl_filt_conv_sjis_mobile_flush(mbfl_convert_filter *filter)
{
	int held = filter->cache;

	switch (filter->status) {
	case 1:
		/* A '#' or digit which never got its combining keypad character is
		 * just an ordinary character */
		if (held == '#' || (held >= '0' && held <= '9')) {
			filter->cache = filter->status = 0;
			CK((*filter->output_function)(held, filter->data));
		}
		break;
	case 2:
		/* First of a pair of Regional Indicator codepoints came at the end of a string */
		filter->cache = filter->status = 0;
		mbfl_filt_conv_illegal_output(held, filter);
		break;
	default:
		break;
	}

	if (filter->flush_function) {
		(*filter->flush_function)(filter->data);
	}

	return 0;
}
6380 
6381 static const unsigned short sjis_mobile_decode_tbl1[] = {
6382 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFFFF, 0, 188, 376, 564, 752, 940, 1128, 1316, 1504, 1692, 1880, 2068, 2256, 2444, 2632, 2820, 3008, 3196, 3384, 3572, 3760, 3948, 4136, 4324, 4512, 4700, 4888, 5076, 5264, 5452, 5640, 0xFFFF, -6016, -5828, -5640, -5452, -5264, -5076, -4888, -4700, -4512, -4324, -4136, -3948, -3760, -3572, -3384, -3196, -3008, -2820, -2632, -2444, -2256, -2068, -1880, -1692, -1504, -1316, -1128, -940, -752, -564, -376, -188, 0, 188, 376, 564, 752, 940, 1128, 1316, 1504, 1692, 1880, 2068, 2256, 2444, 2632, 2820, 3008, 3196, 3384, 3572, 3760, 3948, 4136, 4324, 4512, 4700, 4888, 5076, 5264, 5452, 5640, 5828, 6016, 6204, 6392, 6580, 6768, 6956, 7144, 7332, 7520, 7708, 7896, 8084, 8272, 8460, 8648, 8836, 9024, 9212, 9400, 9588, 9776, 9964, 10152, 10340, 10528, 10716, 10904, 11092, 0xFFFF, 0xFFFF, 0xFFFF
6383 };
6384 
mb_sjis_docomo_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)6385 static size_t mb_sjis_docomo_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
6386 {
6387 	unsigned char *p = *in, *e = p + *in_len;
6388 	/* Leave one extra space available in output buffer, since some iterations of
6389 	 * main loop (below) may emit two wchars */
6390 	uint32_t *out = buf, *limit = buf + bufsize - 1;
6391 
6392 	while (p < e && out < limit) {
6393 		unsigned char c = *p++;
6394 
6395 		if (c <= 0x7F) {
6396 			*out++ = c;
6397 		} else if (c >= 0xA1 && c <= 0xDF) {
6398 			/* Kana */
6399 			*out++ = 0xFEC0 + c;
6400 		} else {
6401 			/* Kanji */
6402 			if (p == e) {
6403 				*out++ = MBFL_BAD_INPUT;
6404 				break;
6405 			}
6406 			unsigned char c2 = *p++;
6407 			uint32_t w = sjis_mobile_decode_tbl1[c] + sjis_decode_tbl2[c2];
6408 
6409 			if (w <= 137) {
6410 				if (w == 31) {
6411 					*out++ = 0xFF3C; /* FULLWIDTH REVERSE SOLIDUS */
6412 					continue;
6413 				} else if (w == 32) {
6414 					*out++ =  0xFF5E; /* FULLWIDTH TILDE */
6415 					continue;
6416 				} else if (w == 33) {
6417 					*out++ =  0x2225; /* PARALLEL TO */
6418 					continue;
6419 				} else if (w == 60) {
6420 					*out++ =  0xFF0D; /* FULLWIDTH HYPHEN-MINUS */
6421 					continue;
6422 				} else if (w == 80) {
6423 					*out++ =  0xFFE0; /* FULLWIDTH CENT SIGN */
6424 					continue;
6425 				} else if (w == 81) {
6426 					*out++ =  0xFFE1; /* FULLWIDTH POUND SIGN */
6427 					continue;
6428 				} else if (w == 137) {
6429 					*out++ =  0xFFE2; /* FULLWIDTH NOT SIGN */
6430 					continue;
6431 				}
6432 			}
6433 
6434 			if (w >= mb_tbl_code2uni_docomo1_min && w <= mb_tbl_code2uni_docomo1_max) {
6435 				int snd = 0;
6436 				w = mbfilter_sjis_emoji_docomo2unicode(w, &snd);
6437 				if (snd) {
6438 					*out++ = snd;
6439 				}
6440 			} else if (w >= cp932ext1_ucs_table_min && w < cp932ext1_ucs_table_max) {
6441 				w = cp932ext1_ucs_table[w - cp932ext1_ucs_table_min];
6442 			} else if (w < jisx0208_ucs_table_size) {
6443 				w = jisx0208_ucs_table[w];
6444 			} else if (w >= cp932ext2_ucs_table_min && w < cp932ext2_ucs_table_max) {
6445 				w = cp932ext2_ucs_table[w - cp932ext2_ucs_table_min];
6446 			} else if (w >= cp932ext3_ucs_table_min && w < cp932ext3_ucs_table_max) {
6447 				w = cp932ext3_ucs_table[w - cp932ext3_ucs_table_min];
6448 			} else if (w >= (94*94) && w < (114*94)) {
6449 				w = w - (94*94) + 0xE000;
6450 			} else {
6451 				if (c == 0x80 || c == 0xA0 || c >= 0xFD) {
6452 					p--;
6453 				}
6454 				*out++ = MBFL_BAD_INPUT;
6455 				continue;
6456 			}
6457 
6458 			*out++ = w ? w : MBFL_BAD_INPUT;
6459 		}
6460 	}
6461 
6462 	*in_len = e - p;
6463 	*in = p;
6464 	return out - buf;
6465 }
6466 
mb_wchar_to_sjis_docomo(uint32_t * in,size_t len,mb_convert_buf * buf,bool end)6467 static void mb_wchar_to_sjis_docomo(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
6468 {
6469 	unsigned char *out, *limit;
6470 	MB_CONVERT_BUF_LOAD(buf, out, limit);
6471 	MB_CONVERT_BUF_ENSURE(buf, out, limit, len + (buf->state ? 1 : 0));
6472 
6473 	uint32_t w;
6474 	unsigned int s = 0;
6475 
6476 	if (buf->state) {
6477 		/* Continue what we were doing on the previous call */
6478 		w = buf->state;
6479 		buf->state = 0;
6480 		goto reprocess_wchar;
6481 	}
6482 
6483 	while (len--) {
6484 		w = *in++;
6485 reprocess_wchar:
6486 		s = 0;
6487 
6488 		if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) {
6489 			s = ucs_a1_jis_table[w - ucs_a1_jis_table_min];
6490 		} else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) {
6491 			s = ucs_a2_jis_table[w - ucs_a2_jis_table_min];
6492 		} else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) {
6493 			s = ucs_i_jis_table[w - ucs_i_jis_table_min];
6494 		} else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) {
6495 			s = ucs_r_jis_table[w - ucs_r_jis_table_min];
6496 		} else if (w >= 0xE000 && w < (0xE000 + 20*94)) {
6497 			/* Private User Area (95ku - 114ku) */
6498 			s = w - 0xE000;
6499 			s = (((s / 94) + 0x7F) << 8) | ((s % 94) + 0x21);
6500 			goto process_emoji;
6501 		}
6502 
6503 		if (!s) {
6504 			if (w == 0xA5) { /* YEN SIGN */
6505 				s = 0x216F; /* FULLWIDTH YEN SIGN */
6506 			} else if (w == 0xFF3C) { /* FULLWIDTH REVERSE SOLIDUS */
6507 				s = 0x2140;
6508 			} else if (w == 0x2225) { /* PARALLEL TO */
6509 				s = 0x2142;
6510 			} else if (w == 0xFF0D) { /* FULLWIDTH HYPHEN-MINUS */
6511 				s = 0x215D;
6512 			} else if (w == 0xFFE0) { /* FULLWIDTH CENT SIGN */
6513 				s = 0x2171;
6514 			} else if (w == 0xFFE1) { /* FULLWIDTH POUND SIGN */
6515 				s = 0x2172;
6516 			} else if (w == 0xFFE2) { /* FULLWIDTH NOT SIGN */
6517 				s = 0x224C;
6518 			}
6519 		}
6520 
6521 		if (w && (!s || s >= 0x8080)) {
6522 			s = 0;
6523 
6524 			for (int i = 0; i < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; i++) {
6525 				if (w == cp932ext1_ucs_table[i]) {
6526 					s = (((i / 94) + 0x2D) << 8) + (i % 94) + 0x21;
6527 					goto process_emoji;
6528 				}
6529 			}
6530 
6531 			for (int i = 0; i < cp932ext2_ucs_table_max - cp932ext2_ucs_table_min; i++) {
6532 				if (w == cp932ext2_ucs_table[i]) {
6533 					s = (((i / 94) + 0x79) << 8) + (i % 94) + 0x21;
6534 					goto process_emoji;
6535 				}
6536 			}
6537 		}
6538 
6539 process_emoji:
6540 		/* When converting SJIS-Mobile to Unicode, we convert keypad symbol emoji
6541 		 * to a sequence of 2 codepoints, one of which is a combining character which
6542 		 * adds the 'key' image around the other
6543 		 *
6544 		 * In the other direction, look for such sequences and convert them to a
6545 		 * single emoji */
6546 		if (w == '#' || (w >= '0' && w <= '9')) {
6547 			if (!len) {
6548 				if (end) {
6549 					goto emit_output;
6550 				} else {
6551 					/* If we are at the end of the current buffer of codepoints, but another
6552 					 * buffer is coming, then remember that we have to reprocess `w` */
6553 					buf->state = w;
6554 					break;
6555 				}
6556 			}
6557 			uint32_t w2 = *in++; len--;
6558 			if (w2 == 0x20E3) {
6559 				if (w == '#') {
6560 					s = 0x2964;
6561 				} else if (w == '0') {
6562 					s = 0x296F;
6563 				} else { /* Previous character was '1'-'9' */
6564 					s = 0x2966 + (w - '1');
6565 				}
6566 				s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
6567 			} else {
6568 				in--; len++;
6569 			}
6570 		} else if (w == 0xA9) { /* Copyright sign */
6571 			s = (((0x29B5 / 94) + 0x21) << 8) | ((0x29B5 % 94) + 0x21);
6572 		} else if (w == 0xAE) { /* Registered sign */
6573 			s = (((0x29BA / 94) + 0x21) << 8) | ((0x29BA % 94) + 0x21);
6574 		} else if (w >= mb_tbl_uni_docomo2code2_min && w <= mb_tbl_uni_docomo2code2_max) {
6575 			int i = mbfl_bisec_srch2(w, mb_tbl_uni_docomo2code2_key, mb_tbl_uni_docomo2code2_len);
6576 			if (i >= 0) {
6577 				s = mb_tbl_uni_docomo2code2_value[i];
6578 				s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
6579 			}
6580 		} else if (w >= mb_tbl_uni_docomo2code3_min && w <= mb_tbl_uni_docomo2code3_max) {
6581 			int i = mbfl_bisec_srch2(w - 0x10000, mb_tbl_uni_docomo2code3_key, mb_tbl_uni_docomo2code3_len);
6582 			if (i >= 0) {
6583 				s = mb_tbl_uni_docomo2code3_value[i];
6584 				s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
6585 			}
6586 		} else if (w >= mb_tbl_uni_docomo2code5_min && w <= mb_tbl_uni_docomo2code5_max) {
6587 			int i = mbfl_bisec_srch2(w - 0xF0000, mb_tbl_uni_docomo2code5_key, mb_tbl_uni_docomo2code5_len);
6588 			if (i >= 0) {
6589 				s = mb_tbl_uni_docomo2code5_val[i];
6590 				s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
6591 			}
6592 		}
6593 
6594 emit_output:
6595 		if (!s && w) {
6596 			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjis_docomo);
6597 			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
6598 		} else if (s <= 0xFF) {
6599 			out = mb_convert_buf_add(out, s);
6600 		} else {
6601 			unsigned int c1 = (s >> 8) & 0xFF, c2 = s & 0xFF, s1, s2;
6602 			SJIS_ENCODE(c1, c2, s1, s2);
6603 			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
6604 			out = mb_convert_buf_add2(out, s1, s2);
6605 		}
6606 	}
6607 
6608 	MB_CONVERT_BUF_STORE(buf, out, limit);
6609 }
6610 
mb_sjis_kddi_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)6611 static size_t mb_sjis_kddi_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
6612 {
6613 	unsigned char *p = *in, *e = p + *in_len;
6614 	uint32_t *out = buf, *limit = buf + bufsize - 1;
6615 
6616 	while (p < e && out < limit) {
6617 		unsigned char c = *p++;
6618 
6619 		if (c <= 0x7F) {
6620 			*out++ = c;
6621 		} else if (c >= 0xA1 && c <= 0xDF) {
6622 			/* Kana */
6623 			*out++ = 0xFEC0 + c;
6624 		} else {
6625 			/* Kanji */
6626 			if (p == e) {
6627 				*out++ = MBFL_BAD_INPUT;
6628 				break;
6629 			}
6630 			unsigned char c2 = *p++;
6631 			uint32_t w = sjis_mobile_decode_tbl1[c] + sjis_decode_tbl2[c2];
6632 
6633 			if (w <= 137) {
6634 				if (w == 31) {
6635 					*out++ = 0xFF3C; /* FULLWIDTH REVERSE SOLIDUS */
6636 					continue;
6637 				} else if (w == 32) {
6638 					*out++ = 0xFF5E; /* FULLWIDTH TILDE */
6639 					continue;
6640 				} else if (w == 33) {
6641 					*out++ = 0x2225; /* PARALLEL TO */
6642 					continue;
6643 				} else if (w == 60) {
6644 					*out++ = 0xFF0D; /* FULLWIDTH HYPHEN-MINUS */
6645 					continue;
6646 				} else if (w == 80) {
6647 					*out++ = 0xFFE0; /* FULLWIDTH CENT SIGN */
6648 					continue;
6649 				} else if (w == 81) {
6650 					*out++ = 0xFFE1; /* FULLWIDTH POUND SIGN */
6651 					continue;
6652 				} else if (w == 137) {
6653 					*out++ = 0xFFE2; /* FULLWIDTH NOT SIGN */
6654 					continue;
6655 				}
6656 			}
6657 
6658 			if (w >= mb_tbl_code2uni_kddi1_min && w <= mb_tbl_code2uni_kddi2_max) {
6659 				int snd = 0;
6660 				w = mbfilter_sjis_emoji_kddi2unicode(w, &snd);
6661 				if (!w) {
6662 					w = sjis_mobile_decode_tbl1[c] + sjis_decode_tbl2[c2];
6663 					if (w >= (94*94) && w < (114*94)) {
6664 						w = w - (94*94) + 0xE000;
6665 					}
6666 				} else if (snd) {
6667 					*out++ = snd;
6668 				}
6669 			} else if (w >= cp932ext1_ucs_table_min && w < cp932ext1_ucs_table_max) {
6670 				w = cp932ext1_ucs_table[w - cp932ext1_ucs_table_min];
6671 			} else if (w < jisx0208_ucs_table_size) {
6672 				w = jisx0208_ucs_table[w];
6673 			} else if (w >= cp932ext2_ucs_table_min && w < cp932ext2_ucs_table_max) {
6674 				w = cp932ext2_ucs_table[w - cp932ext2_ucs_table_min];
6675 			} else if (w >= cp932ext3_ucs_table_min && w < cp932ext3_ucs_table_max) {
6676 				w = cp932ext3_ucs_table[w - cp932ext3_ucs_table_min];
6677 			} else if (w >= (94*94) && w < (114*94)) {
6678 				w = w - (94*94) + 0xE000;
6679 			} else {
6680 				if (c == 0x80 || c == 0xA0 || c >= 0xFD) {
6681 					p--;
6682 				}
6683 				*out++ = MBFL_BAD_INPUT;
6684 				continue;
6685 			}
6686 
6687 			*out++ = w ? w : MBFL_BAD_INPUT;
6688 		}
6689 	}
6690 
6691 	*in_len = e - p;
6692 	*in = p;
6693 	return out - buf;
6694 }
6695 
/* Encode Unicode codepoints as SJIS bytes, using the KDDI emoji tables.
 *
 * `in` points to `len` codepoints to convert; the output bytes are appended to
 * `buf`. `end` is true when no further buffers of codepoints will follow.
 *
 * A keypad character ('#', '0'-'9') or a Regional Indicator codepoint may need
 * to be combined with the codepoint which comes after it. If such a codepoint
 * is the last one in `in` and `end` is false, it is stored in `buf->state` and
 * processed again at the start of the next call. */
static void mb_wchar_to_sjis_kddi(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	/* Request room for `len` bytes, plus one more if a codepoint was carried
	 * over from the previous call in `buf->state` */
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len + (buf->state ? 1 : 0));

	uint32_t w;
	unsigned int s = 0;

	if (buf->state) {
		/* Continue with the codepoint which was deferred by the previous call */
		w = buf->state;
		buf->state = 0;
		goto reprocess_wchar;
	}

	while (len--) {
		w = *in++;
reprocess_wchar:
		s = 0;

		/* Look `w` up in whichever Unicode -> JIS table covers its range */
		if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) {
			s = ucs_a1_jis_table[w - ucs_a1_jis_table_min];
		} else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) {
			s = ucs_a2_jis_table[w - ucs_a2_jis_table_min];
		} else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) {
			s = ucs_i_jis_table[w - ucs_i_jis_table_min];
		} else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) {
			s = ucs_r_jis_table[w - ucs_r_jis_table_min];
		} else if (w >= 0xE000 && w < (0xE000 + 20*94)) {
			/* Private User Area (95ku - 114ku) */
			s = w - 0xE000;
			s = (((s / 94) + 0x7F) << 8) | ((s % 94) + 0x21);
			goto process_emoji;
		}

		if (!s) {
			/* Fixed mappings for codepoints which the tables did not resolve */
			if (w == 0xA5) { /* YEN SIGN */
				s = 0x216F; /* FULLWIDTH YEN SIGN */
			} else if (w == 0xFF3c) { /* FULLWIDTH REVERSE SOLIDUS */
				s = 0x2140;
			} else if (w == 0x2225) { /* PARALLEL TO */
				s = 0x2142;
			} else if (w == 0xFF0D) { /* FULLWIDTH HYPHEN-MINUS */
				s = 0x215D;
			} else if (w == 0xFFE0) { /* FULLWIDTH CENT SIGN */
				s = 0x2171;
			} else if (w == 0xFFE1) { /* FULLWIDTH POUND SIGN */
				s = 0x2172;
			} else if (w == 0xFFE2) { /* FULLWIDTH NOT SIGN */
				s = 0x224C;
			}
		}

		/* Table values of 0x8080 and above are treated the same as "not found";
		 * in both cases, do a linear search of the CP932 extension tables */
		if (w && (!s || s >= 0x8080)) {
			s = 0;

			for (int i = 0; i < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; i++) {
				if (w == cp932ext1_ucs_table[i]) {
					s = (((i / 94) + 0x2D) << 8) + (i % 94) + 0x21;
					goto process_emoji;
				}
			}

			for (int i = 0; i < cp932ext2_ucs_table_max - cp932ext2_ucs_table_min; i++) {
				if (w == cp932ext2_ucs_table[i]) {
					s = (((i / 94) + 0x79) << 8) + (i % 94) + 0x21;
					goto process_emoji;
				}
			}
		}

process_emoji:
		/* Emoji handling; any match below overrides the value of `s` found above */
		if (w == '#' || (w >= '0' && w <= '9')) {
			/* Keypad symbol: '#' or a digit followed by U+20E3 is converted to
			 * a single emoji */
			if (!len) {
				if (end) {
					/* Nothing follows; emit `w` as an ordinary character */
					goto emit_output;
				} else {
					/* If we are at the end of the current buffer of codepoints, but another
					 * buffer is coming, then remember that we have to reprocess `w` */
					buf->state = w;
					break;
				}
			}
			uint32_t w2 = *in++; len--;
			if (w2 == 0x20E3) {
				if (w == '#') {
					s = 0x25BC;
				} else if (w == '0') {
					s = 0x2830;
				} else { /* Previous character was '1'-'9' */
					s = 0x27A6 + (w - '1');
				}
				/* Convert (ku*94)+ten value to a two-byte code */
				s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
			} else {
				/* Not a keypad sequence; put `w2` back so it is processed next */
				in--; len++;
			}
		} else if (w >= NFLAGS('C') && w <= NFLAGS('U')) { /* C for CN, U for US */
			/* Possibly the first half of a two-codepoint flag emoji */
			if (!len) {
				if (end) {
					/* A Regional Indicator with nothing after it is an error */
					MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjis_kddi);
				} else {
					/* Reprocess `w` when this function is called again with another buffer
					 * of wchars */
					buf->state = w;
				}
				break;
			}
			uint32_t w2 = *in++; len--;
			if (w2 >= NFLAGS('B') && w2 <= NFLAGS('U')) { /* B for GB, U for RU */
				/* Search for the country code formed by `w` and `w2` */
				for (int i = 0; i < 10; i++) {
					if (w == NFLAGS(nflags_s[i][0]) && w2 == NFLAGS(nflags_s[i][1])) {
						s = nflags_code_kddi[i];
						s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
						goto emit_output;
					}
				}
			}
			/* No flag matched; put `w2` back and report `w` alone as an error */
			in--; len++;
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjis_kddi);
			/* NOTE(review): presumably the error handler may write output of its
			 * own, hence space for the remaining input is requested again */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
			continue;
		} else if (w == 0xA9) { /* Copyright sign */
			s = (((0x27DC / 94) + 0x21) << 8) | ((0x27DC % 94) + 0x21);
		} else if (w == 0xAE) { /* Registered sign */
			s = (((0x27DD / 94) + 0x21) << 8) | ((0x27DD % 94) + 0x21);
		} else if (w >= mb_tbl_uni_kddi2code2_min && w <= mb_tbl_uni_kddi2code2_max) {
			int i = mbfl_bisec_srch2(w, mb_tbl_uni_kddi2code2_key, mb_tbl_uni_kddi2code2_len);
			if (i >= 0) {
				s = mb_tbl_uni_kddi2code2_value[i];
				s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
			}
		} else if (w >= mb_tbl_uni_kddi2code3_min && w <= mb_tbl_uni_kddi2code3_max) {
			/* The keys of this table are searched with 0x10000 subtracted */
			int i = mbfl_bisec_srch2(w - 0x10000, mb_tbl_uni_kddi2code3_key, mb_tbl_uni_kddi2code3_len);
			if (i >= 0) {
				s = mb_tbl_uni_kddi2code3_value[i];
				s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
			}
		} else if (w >= mb_tbl_uni_kddi2code5_min && w <= mb_tbl_uni_kddi2code5_max) {
			/* The keys of this table are searched with 0xF0000 subtracted */
			int i = mbfl_bisec_srch2(w - 0xF0000, mb_tbl_uni_kddi2code5_key, mb_tbl_uni_kddi2code5_len);
			if (i >= 0) {
				s = mb_tbl_uni_kddi2code5_val[i];
				s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
			}
		}

emit_output:
		if (!s && w) {
			/* No mapping was found for a non-zero codepoint */
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjis_kddi);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		} else if (s <= 0xFF) {
			/* Single-byte output */
			out = mb_convert_buf_add(out, s);
		} else {
			/* Two-byte output; split `s` into bytes and convert them to SJIS */
			unsigned int c1 = (s >> 8) & 0xFF, c2 = s & 0xFF, s1, s2;
			SJIS_ENCODE(c1, c2, s1, s2);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			out = mb_convert_buf_add2(out, s1, s2);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
6857 
/* Decode SJIS bytes to Unicode codepoints, using the Softbank emoji tables.
 *
 * Consumes bytes from `*in` (`*in_len` of them) and stores codepoints in `buf`,
 * which has room for `bufsize` entries. Before returning, `*in` and `*in_len`
 * are updated to describe the input which was not consumed. The return value
 * is the number of codepoints stored.
 *
 * `*state` is non-zero while we are in the middle of a run of escape-encoded
 * emoji; it then holds the final byte of the escape sequence ('E'-'G' or
 * 'O'-'Q'). This allows such a run to continue across calls. */
static size_t mb_sjis_sb_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
{
	unsigned char *p = *in, *e = p + *in_len;
	/* `limit` is one slot short of the end of `buf`, because one input
	 * character may be written as two codepoints */
	uint32_t *out = buf, *limit = buf + bufsize - 1;

	if (*state) {
		/* The previous call ended inside a run of escape-encoded emoji */
		goto softbank_emoji_escapes;
	}

	while (p < e && out < limit) {
		unsigned char c = *p++;

		if (c == 0x1B) {
			/* Escape sequence */
			/* It must be 0x1B, '$', and then one more byte */
			if (p == e || *p++ != '$' || p == e) {
				*out++ = MBFL_BAD_INPUT;
				continue;
			}
			unsigned char c2 = *p++;
			if ((c2 < 'E' || c2 > 'G') && (c2 < 'O' || c2 > 'Q')) {
				*out++ = MBFL_BAD_INPUT;
				continue;
			}
			/* Escape sequence was valid, next should be a series of specially
			 * encoded Softbank emoji */
			*state = c2;

softbank_emoji_escapes:
			while (p < e && out < limit) {
				c = *p++;
				if (c == 0xF) {
					/* Byte 0x0F ends the run of emoji */
					*state = 0;
					break;
				}
				/* `s` is a base offset chosen by the escape type; each type
				 * accepts bytes from 0x21 up to its own upper bound */
				unsigned int s = 0;
				if (*state == 'G' && c >= 0x21 && c <= 0x7A) {
					s = (0x91 - 0x21) * 94;
				} else if (*state == 'E' && c >= 0x21 && c <= 0x7A) {
					s = (0x8D - 0x21) * 94;
				} else if (*state == 'F' && c >= 0x21 && c <= 0x7A) {
					s = (0x8E - 0x21) * 94;
				} else if (*state == 'O' && c >= 0x21 && c <= 0x6D) {
					s = (0x92 - 0x21) * 94;
				} else if (*state == 'P' && c >= 0x21 && c <= 0x6C) {
					s = (0x95 - 0x21) * 94;
				} else if (*state == 'Q' && c >= 0x21 && c <= 0x5E) {
					s = (0x96 - 0x21) * 94;
				} else {
					/* Byte is out of range for this escape type; leave the run */
					*out++ = MBFL_BAD_INPUT;
					*state = 0;
					break;
				}

				int snd = 0;
				uint32_t w = mbfilter_sjis_emoji_sb2unicode(s + c - 0x21, &snd);
				if (w) {
					/* Some emoji are two codepoints long; `snd` goes first */
					if (snd) {
						*out++ = snd;
					}
					*out++ = w;
				} else {
					*out++ = MBFL_BAD_INPUT;
					*state = 0;
					break;
				}
			}
		} else if (c <= 0x7F) {
			*out++ = c;
		} else if (c >= 0xA1 && c <= 0xDF) {
			/* Kana */
			*out++ = 0xFEC0 + c;
		} else {
			/* Kanji */
			if (p == e) {
				/* Input ended where a second byte was needed */
				*out++ = MBFL_BAD_INPUT;
				break;
			}
			unsigned char c2 = *p++;
			uint32_t w = sjis_mobile_decode_tbl1[c] + sjis_decode_tbl2[c2];

			/* A handful of codes have fixed mappings which bypass the tables */
			if (w <= 137) {
				if (w == 31) {
					*out++ = 0xFF3C; /* FULLWIDTH REVERSE SOLIDUS */
					continue;
				} else if (w == 32) {
					*out++ = 0xFF5E; /* FULLWIDTH TILDE */
					continue;
				} else if (w == 33) {
					*out++ = 0x2225; /* PARALLEL TO */
					continue;
				} else if (w == 60) {
					*out++ = 0xFF0D; /* FULLWIDTH HYPHEN-MINUS */
					continue;
				} else if (w == 80) {
					*out++ = 0xFFE0; /* FULLWIDTH CENT SIGN */
					continue;
				} else if (w == 81) {
					*out++ = 0xFFE1; /* FULLWIDTH POUND SIGN */
					continue;
				} else if (w == 137) {
					*out++ = 0xFFE2; /* FULLWIDTH NOT SIGN */
					continue;
				}
			}

			if (w >= mb_tbl_code2uni_sb1_min && w <= mb_tbl_code2uni_sb3_max) {
				int snd = 0;
				w = mbfilter_sjis_emoji_sb2unicode(w, &snd);
				if (!w) {
					/* The emoji lookup found nothing; recompute the original
					 * code and try the remaining mappings */
					w = sjis_mobile_decode_tbl1[c] + sjis_decode_tbl2[c2];
					if (w >= cp932ext3_ucs_table_min && w < cp932ext3_ucs_table_max) {
						w = cp932ext3_ucs_table[w - cp932ext3_ucs_table_min];
					} else if (w >= (94*94) && w < (114*94)) {
						w = w - (94*94) + 0xE000;
					}
				} else if (snd) {
					*out++ = snd;
				}
			} else if (w >= cp932ext1_ucs_table_min && w < cp932ext1_ucs_table_max) {
				w = cp932ext1_ucs_table[w - cp932ext1_ucs_table_min];
			} else if (w < jisx0208_ucs_table_size) {
				w = jisx0208_ucs_table[w];
			} else if (w >= cp932ext2_ucs_table_min && w < cp932ext2_ucs_table_max) {
				w = cp932ext2_ucs_table[w - cp932ext2_ucs_table_min];
			} else if (w >= cp932ext3_ucs_table_min && w < cp932ext3_ucs_table_max) {
				w = cp932ext3_ucs_table[w - cp932ext3_ucs_table_min];
			} else if (w >= (94*94) && w < (114*94)) {
				/* Mapped into the range starting at U+E000 */
				w = w - (94*94) + 0xE000;
			} else {
				/* For these first bytes, the second byte is put back so that it
				 * will be examined again as the start of the next character */
				if (c == 0x80 || c == 0xA0 || c >= 0xFD) {
					p--;
				}
				*out++ = MBFL_BAD_INPUT;
				continue;
			}

			/* A zero from any table means there is no mapping */
			*out++ = w ? w : MBFL_BAD_INPUT;
		}
	}

	*in_len = e - p;
	*in = p;
	return out - buf;
}
7002 
/* Encode Unicode codepoints as SJIS bytes, using the Softbank emoji tables.
 *
 * `in` points to `len` codepoints to convert; the output bytes are appended to
 * `buf`. `end` is true when no further buffers of codepoints will follow.
 *
 * A keypad character ('#', '0'-'9') or a Regional Indicator codepoint may need
 * to be combined with the codepoint which comes after it. If such a codepoint
 * is the last one in `in` and `end` is false, it is stored in `buf->state` and
 * processed again at the start of the next call. */
static void mb_wchar_to_sjis_sb(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	/* Request room for `len` bytes, plus one more if a codepoint was carried
	 * over from the previous call in `buf->state` */
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len + (buf->state ? 1 : 0));

	uint32_t w;
	unsigned int s = 0;

	if (buf->state) {
		/* Continue with the codepoint which was deferred by the previous call */
		w = buf->state;
		buf->state = 0;
		goto reprocess_wchar;
	}

	while (len--) {
		w = *in++;
reprocess_wchar:
		s = 0;

		/* Look `w` up in whichever Unicode -> JIS table covers its range */
		if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) {
			s = ucs_a1_jis_table[w - ucs_a1_jis_table_min];
		} else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) {
			s = ucs_a2_jis_table[w - ucs_a2_jis_table_min];
		} else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) {
			s = ucs_i_jis_table[w - ucs_i_jis_table_min];
		} else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) {
			s = ucs_r_jis_table[w - ucs_r_jis_table_min];
		} else if (w >= 0xE000 && w < (0xE000 + 20*94)) {
			/* Private User Area (95ku - 114ku) */
			s = w - 0xE000;
			s = (((s / 94) + 0x7F) << 8) | ((s % 94) + 0x21);
			goto process_emoji;
		}

		if (!s) {
			/* Fixed mappings for codepoints which the tables did not resolve */
			if (w == 0xA5) { /* YEN SIGN */
				s = 0x216F; /* FULLWIDTH YEN SIGN */
			} else if (w == 0xFF3C) { /* FULLWIDTH REVERSE SOLIDUS */
				s = 0x2140;
			} else if (w == 0x2225) { /* PARALLEL TO */
				s = 0x2142;
			} else if (w == 0xFF0D) { /* FULLWIDTH HYPHEN-MINUS */
				s = 0x215D;
			} else if (w == 0xFFE0) { /* FULLWIDTH CENT SIGN */
				s = 0x2171;
			} else if (w == 0xFFE1) { /* FULLWIDTH POUND SIGN */
				s = 0x2172;
			} else if (w == 0xFFE2) { /* FULLWIDTH NOT SIGN */
				s = 0x224C;
			}
		}

		/* Table values of 0x8080 and above are treated the same as "not found";
		 * in both cases, do a linear search of the CP932 extension tables */
		if (w && (!s || s >= 0x8080)) {
			s = 0;

			for (int i = 0; i < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; i++) {
				if (w == cp932ext1_ucs_table[i]) {
					s = (((i / 94) + 0x2D) << 8) + (i % 94) + 0x21;
					goto process_emoji;
				}
			}

			for (int i = 0; i < cp932ext2_ucs_table_max - cp932ext2_ucs_table_min; i++) {
				if (w == cp932ext2_ucs_table[i]) {
					s = (((i / 94) + 0x79) << 8) + (i % 94) + 0x21;
					goto process_emoji;
				}
			}
		}

process_emoji:
		/* Emoji handling; any match below overrides the value of `s` found above */
		if (w == '#' || (w >= '0' && w <= '9')) {
			/* Keypad symbol: '#' or a digit followed by U+20E3 is converted to
			 * a single emoji */
			if (!len) {
				if (end) {
					/* Nothing follows; emit `w` as an ordinary character */
					goto emit_output;
				} else {
					/* If we are at the end of the current buffer of codepoints, but another
					 * buffer is coming, then remember that we have to reprocess `w` */
					buf->state = w;
					break;
				}
			}
			uint32_t w2 = *in++; len--;
			if (w2 == 0x20E3) {
				if (w == '#') {
					s = 0x2817;
				} else if (w == '0') {
					s = 0x282c;
				} else { /* Previous character was '1'-'9' */
					s = 0x2823 + (w - '1');
				}
				/* Convert (ku*94)+ten value to a two-byte code */
				s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
			} else {
				/* Not a keypad sequence; put `w2` back so it is processed next */
				in--; len++;
			}
		} else if (w >= NFLAGS('C') && w <= NFLAGS('U')) { /* C for CN, U for US */
			/* Possibly the first half of a two-codepoint flag emoji */
			if (!len) {
				if (end) {
					/* A Regional Indicator with nothing after it is an error */
					MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjis_sb);
				} else {
					/* Reprocess `w` when this function is called again with
					 * another buffer of wchars */
					buf->state = w;
				}
				break;
			}
			uint32_t w2 = *in++; len--;
			if (w2 >= NFLAGS('B') && w2 <= NFLAGS('U')) { /* B for GB, U for RU */
				/* Search for the country code formed by `w` and `w2` */
				for (int i = 0; i < 10; i++) {
					if (w == NFLAGS(nflags_s[i][0]) && w2 == NFLAGS(nflags_s[i][1])) {
						s = nflags_code_sb[i];
						s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
						goto emit_output;
					}
				}
			}
			/* No flag matched; put `w2` back and report `w` alone as an error */
			in--; len++;
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjis_sb);
			/* NOTE(review): presumably the error handler may write output of its
			 * own, hence space for the remaining input is requested again */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
			continue;
		} else if (w == 0xA9) { /* Copyright sign */
			s = (((0x2855 / 94) + 0x21) << 8) | ((0x2855 % 94) + 0x21);
		} else if (w == 0xAE) { /* Registered sign */
			s = (((0x2856 / 94) + 0x21) << 8) | ((0x2856 % 94) + 0x21);
		} else if (w >= mb_tbl_uni_sb2code2_min && w <= mb_tbl_uni_sb2code2_max) {
			int i = mbfl_bisec_srch2(w, mb_tbl_uni_sb2code2_key, mb_tbl_uni_sb2code2_len);
			if (i >= 0) {
				s = mb_tbl_uni_sb2code2_value[i];
				s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
			}
		} else if (w >= mb_tbl_uni_sb2code3_min && w <= mb_tbl_uni_sb2code3_max) {
			/* The keys of this table are searched with 0x10000 subtracted */
			int i = mbfl_bisec_srch2(w - 0x10000, mb_tbl_uni_sb2code3_key, mb_tbl_uni_sb2code3_len);
			if (i >= 0) {
				s = mb_tbl_uni_sb2code3_value[i];
				s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
			}
		} else if (w >= mb_tbl_uni_sb2code5_min && w <= mb_tbl_uni_sb2code5_max) {
			/* The keys of this table are searched with 0xF0000 subtracted */
			int i = mbfl_bisec_srch2(w - 0xF0000, mb_tbl_uni_sb2code5_key, mb_tbl_uni_sb2code5_len);
			if (i >= 0) {
				s = mb_tbl_uni_sb2code5_val[i];
				s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21);
			}
		}

emit_output:
		if (!s && w) {
			/* No mapping was found for a non-zero codepoint */
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjis_sb);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		} else if (s <= 0xFF) {
			/* Single-byte output */
			out = mb_convert_buf_add(out, s);
		} else {
			/* Two-byte output; split `s` into bytes and convert them to SJIS */
			unsigned int c1 = (s >> 8) & 0xFF, c2 = s & 0xFF, s1, s2;
			SJIS_ENCODE(c1, c2, s1, s2);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			out = mb_convert_buf_add2(out, s1, s2);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
7164 
mb_sjis2004_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)7165 static size_t mb_sjis2004_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
7166 {
7167 	unsigned char *p = *in, *e = p + *in_len;
7168 	uint32_t *out = buf, *limit = buf + bufsize - 1;
7169 
7170 	while (p < e && out < limit) {
7171 		unsigned char c = *p++;
7172 
7173 		if (c <= 0x7F) {
7174 			if (c == 0x5C) {
7175 				*out++ = 0xA5;
7176 			} else if (c == 0x7E) {
7177 				*out++ = 0x203E;
7178 			} else {
7179 				*out++ = c;
7180 			}
7181 		} else if (c >= 0xA1 && c <= 0xDF) {
7182 			*out++ = 0xFEC0 + c;
7183 		} else {
7184 			if (p == e) {
7185 				*out++ = MBFL_BAD_INPUT;
7186 				break;
7187 			}
7188 			unsigned char c2 = *p++;
7189 			uint32_t w1 = sjis_mobile_decode_tbl1[c] + sjis_decode_tbl2[c2];
7190 
7191 			/* Conversion for combining characters */
7192 			if (w1 >= 0x0170 && w1 <= 0x03F1) {
7193 				int k = mbfl_bisec_srch2(w1, jisx0213_u2_key_b, jisx0213_u2_tbl_len);
7194 				if (k >= 0) {
7195 					*out++ = jisx0213_u2_tbl[2*k];
7196 					*out++ = jisx0213_u2_tbl[2*k+1];
7197 					continue;
7198 				}
7199 			}
7200 
7201 			/* Conversion for BMP */
7202 			if (w1 < jisx0213_ucs_table_size) {
7203 				uint32_t w = jisx0213_ucs_table[w1];
7204 				if (w) {
7205 					*out++ = w;
7206 					continue;
7207 				}
7208 			}
7209 
7210 			/* Conversion for CJK Unified Ideographs extension B (U+2XXXX) */
7211 			int k = mbfl_bisec_srch2(w1, jisx0213_jis_u5_key, jisx0213_u5_tbl_len);
7212 			if (k >= 0) {
7213 				*out++ = jisx0213_jis_u5_tbl[k] + 0x20000;
7214 			} else {
7215 				if (c == 0x80 || c == 0xA0 || c >= 0xFD) {
7216 					p--;
7217 				}
7218 				*out++ = MBFL_BAD_INPUT;
7219 			}
7220 		}
7221 	}
7222 
7223 	 *in_len = e - p;
7224 	 *in = p;
7225 	 return out - buf;
7226 }
7227 
/* Encode Unicode codepoints as SJIS-2004 bytes, using the JIS X 0213 tables.
 *
 * `in` points to `len` codepoints to convert; the output bytes are appended to
 * `buf`. `end` is true when no further buffers of codepoints will follow.
 *
 * Some codepoints may combine with the codepoint which comes after them. If
 * such a codepoint is the last one in `in` and `end` is false, it is stored in
 * `buf->state` and processed again at the start of the next call. */
static void mb_wchar_to_sjis2004(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	uint32_t w;
	if (buf->state) {
		/* Continue with the codepoint which was deferred by the previous call */
		w = buf->state;
		buf->state = 0;
		goto process_codepoint;
	}

	while (len--) {
		w = *in++;
process_codepoint: ;
		unsigned int s = 0;

		/* These are the ranges which may contain the first codepoint of a
		 * two-codepoint sequence listed in `jisx0213_u2_tbl` */
		if (w == 0xE6 || (w >= 0x254 && w <= 0x2E9) || (w >= 0x304B && w <= 0x3053) || (w >= 0x30AB && w <= 0x30C8) || w == 0x31F7) {
			for (int k = 0; k < jisx0213_u2_tbl_len; k++) {
				if (w == jisx0213_u2_tbl[2*k]) {
					if (!len) {
						if (!end) {
							/* The next codepoint is not available yet; save `w`
							 * and wait for the next buffer */
							buf->state = w;
							MB_CONVERT_BUF_STORE(buf, out, limit);
							return;
						}
					} else {
						uint32_t w2 = *in++; len--;
						/* NOTE(review): presumably these four codepoints have
						 * two consecutive table entries, the second of which is
						 * the pairing with U+0301 -- confirm against the table */
						if ((w == 0x254 || w == 0x28C || w == 0x259 || w == 0x25A) && w2 == 0x301) {
							k++;
						}
						if (w2 == jisx0213_u2_tbl[2*k+1]) {
							s = jisx0213_u2_key[k];
							break;
						}
						/* `w2` does not complete the sequence; put it back */
						in--; len++;
					}

					/* Fallback */
					s = jisx0213_u2_fb_tbl[k];
					break;
				}
			}
		}

		/* Check for major Japanese chars: U+4E00-U+9FFF */
		if (!s) {
			for (int k = 0; k < uni2jis_tbl_len; k++) {
				if (w >= uni2jis_tbl_range[k][0] && w <= uni2jis_tbl_range[k][1]) {
					s = uni2jis_tbl[k][w - uni2jis_tbl_range[k][0]];
					break;
				}
			}
		}

		/* Check for Japanese chars in compressed mapping area: U+1E00-U+4DBF */
		if (!s && w >= ucs_c1_jisx0213_min && w <= ucs_c1_jisx0213_max) {
			int k = mbfl_bisec_srch(w, ucs_c1_jisx0213_tbl, ucs_c1_jisx0213_tbl_len);
			if (k >= 0) {
				/* Offset for the range, plus the position of `w` within it */
				s = ucs_c1_jisx0213_ofst[k] + w - ucs_c1_jisx0213_tbl[2*k];
			}
		}

		/* Check for Japanese chars in CJK Unified Ideographs ext.B (U+2XXXX) */
		if (!s && w >= jisx0213_u5_tbl_min && w <= jisx0213_u5_tbl_max) {
			int k = mbfl_bisec_srch2(w - 0x20000, jisx0213_u5_jis_key, jisx0213_u5_tbl_len);
			if (k >= 0) {
				s = jisx0213_u5_jis_tbl[k];
			}
		}

		if (!s) {
			/* CJK Compatibility Forms: U+FE30-U+FE4F */
			if (w == 0xFE45) {
				s = 0x233E;
			} else if (w == 0xFE46) {
				s = 0x233D;
			} else if (w >= 0xF91D && w <= 0xF9DC) {
				/* CJK Compatibility Ideographs; only U+F91D-U+F9DC is searched */
				int k = mbfl_bisec_srch2(w, ucs_r2b_jisx0213_cmap_key, ucs_r2b_jisx0213_cmap_len);
				if (k >= 0) {
					s = ucs_r2b_jisx0213_cmap_val[k];
				}
			}
		}

		if (!s && w) {
			/* No mapping was found for a non-zero codepoint */
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjis2004);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		} else if (s <= 0xFF) {
			/* Single-byte output */
			out = mb_convert_buf_add(out, s);
		} else {
			/* Two-byte output; split `s` into bytes and convert them to SJIS */
			unsigned int c1 = (s >> 8) & 0xFF, c2 = s & 0xFF, s1, s2;
			SJIS_ENCODE(c1, c2, s1, s2);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			out = mb_convert_buf_add2(out, s1, s2);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
7330 
/* Byte-at-a-time filter which converts CP932 to Unicode codepoints.
 *
 * `c` is the next input byte. `filter->status` is 0 when a new character is
 * expected, and 1 when the first byte of a kanji has been saved in
 * `filter->cache` and its second byte is awaited. Codepoints are passed on via
 * `filter->output_function`. Returns 0 when the end of the function is reached. */
static int mbfl_filt_conv_cp932_wchar(int c, mbfl_convert_filter *filter)
{
	int lead, idx, ku, ten, w;

	switch (filter->status) {
	case 0:
		if (c >= 0 && c < 0x80) {
			/* Latin */
			CK((*filter->output_function)(c, filter->data));
		} else if (c > 0xa0 && c < 0xe0) {
			/* Kana */
			CK((*filter->output_function)(0xfec0 + c, filter->data));
		} else if (c > 0x80 && c < 0xfd && c != 0xa0) {
			/* First byte of a kanji; keep it until the second byte arrives */
			filter->cache = c;
			filter->status = 1;
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 1:
		/* Second byte of a kanji */
		filter->status = 0;
		lead = filter->cache;
		if (c < 0x40 || c > 0xfc || c == 0x7f) {
			/* Not acceptable as a second byte */
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			break;
		}

		SJIS_DECODE(lead, c, ku, ten);
		idx = (ku - 0x21)*94 + ten - 0x21;

		/* A handful of codes have fixed mappings which bypass the tables */
		switch (idx) {
		case 31:
			w = 0xff3c;	/* FULLWIDTH REVERSE SOLIDUS */
			break;
		case 32:
			w = 0xff5e;	/* FULLWIDTH TILDE */
			break;
		case 33:
			w = 0x2225;	/* PARALLEL TO */
			break;
		case 60:
			w = 0xff0d;	/* FULLWIDTH HYPHEN-MINUS */
			break;
		case 80:
			w = 0xffe0;	/* FULLWIDTH CENT SIGN */
			break;
		case 81:
			w = 0xffe1;	/* FULLWIDTH POUND SIGN */
			break;
		case 137:
			w = 0xffe2;	/* FULLWIDTH NOT SIGN */
			break;
		default:
			w = 0;
			break;
		}

		if (w == 0) {
			if (idx >= cp932ext1_ucs_table_min && idx < cp932ext1_ucs_table_max) {
				/* vendor ext1 (13ku) */
				w = cp932ext1_ucs_table[idx - cp932ext1_ucs_table_min];
			} else if (idx >= 0 && idx < jisx0208_ucs_table_size) {
				/* X 0208 */
				w = jisx0208_ucs_table[idx];
			} else if (idx >= cp932ext2_ucs_table_min && idx < cp932ext2_ucs_table_max) {
				/* vendor ext2 (89ku - 92ku) */
				w = cp932ext2_ucs_table[idx - cp932ext2_ucs_table_min];
			} else if (idx >= cp932ext3_ucs_table_min && idx < cp932ext3_ucs_table_max) {
				/* vendor ext3 (115ku - 119ku) */
				w = cp932ext3_ucs_table[idx - cp932ext3_ucs_table_min];
			} else if (idx >= (94*94) && idx < (114*94)) {
				/* user (95ku - 114ku) */
				w = idx - (94*94) + 0xe000;
			}
		}

		/* Nothing matched, or the table had no mapping for this code */
		if (w <= 0) {
			w = MBFL_BAD_INPUT;
		}
		CK((*filter->output_function)(w, filter->data));
		break;

		EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
7402 
/* Flush handler for the CP932 -> Unicode filter.
 *
 * If input ended while the second byte of a character was still awaited
 * (non-zero `filter->status`), an error marker is emitted and the status is
 * reset. After that, `filter->flush_function` is called if one is set.
 * Always returns 0. */
static int mbfl_filt_conv_cp932_wchar_flush(mbfl_convert_filter *filter)
{
	if (filter->status != 0) {
		/* Truncated character at end of input */
		filter->output_function(MBFL_BAD_INPUT, filter->data);
		filter->status = 0;
	}

	if (!filter->flush_function) {
		return 0;
	}
	filter->flush_function(filter->data);
	return 0;
}
7416 
/* Codepoint-at-a-time filter which converts Unicode to CP932.
 *
 * `c` is the codepoint to convert. The resulting one or two bytes are passed
 * on via `filter->output_function`; a codepoint with no mapping is handed to
 * mbfl_filt_conv_illegal_output() instead. Returns 0 when the end of the
 * function is reached. */
static int mbfl_filt_conv_wchar_cp932(int c, mbfl_convert_filter *filter)
{
	int code = 0;      /* Converted value; <= 0 while nothing has been found */
	int user_area = 0; /* Set when `code` was derived from the U+E000 range */

	/* Look `c` up in whichever Unicode -> JIS table covers its range */
	if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
		code = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
	} else if (c == 0x203E) {
		code = 0x7E;
	} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
		code = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
	} else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
		code = ucs_i_jis_table[c - ucs_i_jis_table_min];
	} else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
		code = ucs_r_jis_table[c - ucs_r_jis_table_min];
	} else if (c >= 0xe000 && c < (0xe000 + 20*94)) {
		/* user (95ku - 114ku) */
		int offset = c - 0xe000;
		code = ((offset/94 + 0x7f) << 8) | (offset%94 + 0x21);
		user_area = 1;
	}

	if (code <= 0) {
		/* Fixed mappings for codepoints which the tables did not resolve */
		switch (c) {
		case 0xa5:	/* YEN SIGN */
			code = 0x5C;
			break;
		case 0xff3c:	/* FULLWIDTH REVERSE SOLIDUS */
			code = 0x2140;
			break;
		case 0x2225:	/* PARALLEL TO */
			code = 0x2142;
			break;
		case 0xff0d:	/* FULLWIDTH HYPHEN-MINUS */
			code = 0x215d;
			break;
		case 0xffe0:	/* FULLWIDTH CENT SIGN */
			code = 0x2171;
			break;
		case 0xffe1:	/* FULLWIDTH POUND SIGN */
			code = 0x2172;
			break;
		case 0xffe2:	/* FULLWIDTH NOT SIGN */
			code = 0x224c;
			break;
		default:
			break;
		}
	}

	if (code <= 0 || (code >= 0x8080 && !user_area)) {
		/* not found or X 0212; search the vendor extension tables */
		code = -1;

		/* CP932 vendor ext1 (13ku) */
		int count = cp932ext1_ucs_table_max - cp932ext1_ucs_table_min;
		for (int i = 0; i < count; i++) {
			if (c == cp932ext1_ucs_table[i]) {
				code = ((i/94 + 0x2d) << 8) + (i%94 + 0x21);
				break;
			}
		}

		if (code <= 0) {
			/* CP932 vendor ext3 (115ku - 119ku) */
			count = cp932ext3_ucs_table_max - cp932ext3_ucs_table_min;
			for (int i = 0; i < count; i++) {
				if (c == cp932ext3_ucs_table[i]) {
					code = ((i/94 + 0x93) << 8) + (i%94 + 0x21);
					break;
				}
			}
		}

		/* Codepoint zero is always output as a zero byte */
		if (c == 0) {
			code = 0;
		} else if (code <= 0) {
			code = -1;
		}
	}

	if (code < 0) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
	} else if (code < 0x100) {
		/* latin or kana */
		CK((*filter->output_function)(code, filter->data));
	} else {
		/* kanji; split `code` into bytes and convert them to SJIS */
		int hi = (code >> 8) & 0xff, lo = code & 0xff, b1, b2;
		SJIS_ENCODE(hi, lo, b1, b2);
		CK((*filter->output_function)(b1, filter->data));
		CK((*filter->output_function)(b2, filter->data));
	}

	return 0;
}
7501 
/* Legacy filter: encode one Unicode codepoint as "SJIS-win".
 *
 * Only two codepoints are handled here: U+00A5 is emitted as the byte pair
 * 0x81 0x8F and U+203E as 0x81 0x50. Every other codepoint is delegated to
 * mbfl_filt_conv_wchar_cp932(), whose result is returned unchanged. */
static int mbfl_filt_conv_wchar_sjiswin(int c, mbfl_convert_filter *filter)
{
	switch (c) {
	case 0xA5:
		CK((*filter->output_function)(0x81, filter->data));
		CK((*filter->output_function)(0x8F, filter->data));
		break;

	case 0x203E:
		CK((*filter->output_function)(0x81, filter->data));
		CK((*filter->output_function)(0x50, filter->data));
		break;

	default:
		return mbfl_filt_conv_wchar_cp932(c, filter);
	}

	return 0;
}
7515 
mb_cp932_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)7516 static size_t mb_cp932_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
7517 {
7518 	unsigned char *p = *in, *e = p + *in_len;
7519 	uint32_t *out = buf, *limit = buf + bufsize;
7520 
7521 	while (p < e && out < limit) {
7522 		unsigned char c = *p++;
7523 
7524 		if (c < 0x80) {
7525 			*out++ = c;
7526 		} else if (c > 0xA0 && c < 0xE0) {
7527 			/* Kana */
7528 			*out++ = 0xFEC0 + c;
7529 		} else {
7530 			if (p == e) {
7531 				*out++ = MBFL_BAD_INPUT;
7532 				break;
7533 			}
7534 			unsigned char c2 = *p++;
7535 			unsigned int w = 0;
7536 			unsigned int s = sjis_mobile_decode_tbl1[c] + sjis_decode_tbl2[c2];
7537 
7538 			if (s <= 137) {
7539 				if (s == 31) {
7540 					w = 0xFF3C; /* FULLWIDTH REVERSE SOLIDUS */
7541 				} else if (s == 32) {
7542 					w = 0xFF5E; /* FULLWIDTH TILDE */
7543 				} else if (s == 33) {
7544 					w = 0x2225; /* PARALLEL TO */
7545 				} else if (s == 60) {
7546 					w = 0xFF0D; /* FULLWIDTH HYPHEN-MINUS */
7547 				} else if (s == 80) {
7548 					w = 0xFFE0; /* FULLWIDTH CENT SIGN */
7549 				} else if (s == 81) {
7550 					w = 0xFFE1; /* FULLWIDTH POUND SIGN */
7551 				} else if (s == 137) {
7552 					w = 0xFFE2; /* FULLWIDTH NOT SIGN */
7553 				}
7554 			}
7555 
7556 			if (w == 0) {
7557 				if (s >= cp932ext1_ucs_table_min && s < cp932ext1_ucs_table_max) {
7558 					w = cp932ext1_ucs_table[s - cp932ext1_ucs_table_min];
7559 				} else if (s < jisx0208_ucs_table_size) {
7560 					w = jisx0208_ucs_table[s];
7561 				} else if (s >= cp932ext2_ucs_table_min && s < cp932ext2_ucs_table_max) {
7562 					w = cp932ext2_ucs_table[s - cp932ext2_ucs_table_min];
7563 				} else if (s >= cp932ext3_ucs_table_min && s < cp932ext3_ucs_table_max) {
7564 					w = cp932ext3_ucs_table[s - cp932ext3_ucs_table_min];
7565 				} else if (s >= (94*94) && s < (114*94)) {
7566 					w = s - (94*94) + 0xE000;
7567 				}
7568 			}
7569 
7570 			if (!w) {
7571 				if (c == 0x80 || c == 0xA0 || c >= 0xFD) {
7572 					p--;
7573 				}
7574 				w = MBFL_BAD_INPUT;
7575 			}
7576 			*out++ = w;
7577 		}
7578 	}
7579 
7580 	*in_len = e - p;
7581 	*in = p;
7582 	return out - buf;
7583 }
7584 
/* Encode `len` Unicode codepoints from `in` as CP932, appending the bytes to
 * `buf`. `end` is not used by this function.
 *
 * For every codepoint: the ucs_a1/a2/i/r JIS tables are consulted (U+203E is
 * special-cased to 0x7E, and the U+E000 range is converted arithmetically);
 * a fixed set of codepoints then overrides whatever the tables returned;
 * finally, values that are still unmapped (or are >= 0x8080 without coming
 * from the U+E000 range) are searched for in cp932ext1_ucs_table and
 * cp932ext3_ucs_table. Codepoints with no mapping are reported through
 * MB_CONVERT_ERROR. */
static void mb_wchar_to_cp932(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);

	while (len--) {
		uint32_t w = *in++;
		unsigned int s1 = 0, s2 = 0, c1, c2;

		if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) {
			s1 = ucs_a1_jis_table[w - ucs_a1_jis_table_min];
		} else if (w == 0x203E) {
			s1 = 0x7E;
		} else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) {
			s1 = ucs_a2_jis_table[w - ucs_a2_jis_table_min];
		} else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) {
			s1 = ucs_i_jis_table[w - ucs_i_jis_table_min];
		} else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) {
			s1 = ucs_r_jis_table[w - ucs_r_jis_table_min];
		} else if (w >= 0xE000 && w < (0xE000 + 20*94)) {
			s1 = w - 0xE000;
			c1 = s1/94 + 0x7F;
			c2 = s1%94 + 0x21;
			s1 = (c1 << 8) | c2;
			s2 = 1; /* marks the value as coming from the U+E000 range */
		}

		/* These codepoints take a fixed value regardless of the table result */
		switch (w) {
		case 0xA5: /* YEN SIGN */
			s1 = 0x5C;
			break;
		case 0xFF3C: /* FULLWIDTH REVERSE SOLIDUS */
			s1 = 0x2140;
			break;
		case 0x2225: /* PARALLEL TO */
			s1 = 0x2142;
			break;
		case 0xFF0D: /* FULLWIDTH HYPHEN-MINUS */
			s1 = 0x215D;
			break;
		case 0xFFE0: /* FULLWIDTH CENT SIGN */
			s1 = 0x2171;
			break;
		case 0xFFE1: /* FULLWIDTH POUND SIGN */
			s1 = 0x2172;
			break;
		case 0xFFE2: /* FULLWIDTH NOT SIGN */
			s1 = 0x224C;
			break;
		case 0:
			/* U+0000 is written as a single zero byte */
			out = mb_convert_buf_add(out, 0);
			continue;
		default:
			break;
		}

		if (!s1 || (s1 >= 0x8080 && !s2)) { /* not found or X 0212 */
			bool found = false;

			for (unsigned int i = 0; i < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; i++) {
				if (cp932ext1_ucs_table[i] == w) {
					s1 = ((i/94 + 0x2D) << 8) + (i%94 + 0x21);
					found = true;
					break;
				}
			}

			if (!found) {
				for (unsigned int i = 0; i < cp932ext3_ucs_table_max - cp932ext3_ucs_table_min; i++) {
					if (cp932ext3_ucs_table[i] == w) {
						s1 = ((i/94 + 0x93) << 8) + (i%94 + 0x21);
						found = true;
						break;
					}
				}
			}

			if (!found) {
				MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_cp932);
				/* Re-reserve space for the remaining codepoints after the error output */
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
				continue;
			}
		}

		if (s1 < 0x100) {
			out = mb_convert_buf_add(out, s1);
		} else {
			c1 = (s1 >> 8) & 0xFF;
			c2 = s1 & 0xFF;
			SJIS_ENCODE(c1, c2, s1, s2);
			out = mb_convert_buf_add2(out, s1, s2);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
7665 
/* Encode `len` Unicode codepoints from `in` as "SJIS-win", appending the
 * bytes to `buf`. `end` is not used by this function.
 *
 * The procedure mirrors mb_wchar_to_cp932 with two visible differences:
 * U+00A5 is given the value 0x216F (FULLWIDTH YEN SIGN) instead of 0x5C, and
 * U+203E has no special case, so it takes whatever the JIS tables provide.
 *
 * Codepoints with no mapping are reported through MB_CONVERT_ERROR with this
 * function as the callback. NOTE(review): the macro is defined elsewhere;
 * the callback is presumably used to encode the error marker, which is why
 * it should be the SJIS-win encoder and not the CP932 one (otherwise a
 * substitute such as U+00A5 would be written with the CP932 mapping). */
static void mb_wchar_to_sjiswin(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);

	while (len--) {
		uint32_t w = *in++;
		unsigned int s1 = 0, s2 = 0, c1, c2;

		if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) {
			s1 = ucs_a1_jis_table[w - ucs_a1_jis_table_min];
		} else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) {
			s1 = ucs_a2_jis_table[w - ucs_a2_jis_table_min];
		} else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) {
			s1 = ucs_i_jis_table[w - ucs_i_jis_table_min];
		} else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) {
			s1 = ucs_r_jis_table[w - ucs_r_jis_table_min];
		} else if (w >= 0xE000 && w < (0xE000 + 20*94)) {
			s1 = w - 0xE000;
			c1 = s1/94 + 0x7F;
			c2 = s1%94 + 0x21;
			s1 = (c1 << 8) | c2;
			s2 = 1; /* marks the value as coming from the U+E000 range */
		}

		/* These codepoints take a fixed value regardless of the table result */
		if (w == 0xA5) { /* YEN SIGN */
			s1 = 0x216F; /* FULLWIDTH YEN SIGN */
		} else if (w == 0xFF3C) { /* FULLWIDTH REVERSE SOLIDUS */
			s1 = 0x2140;
		} else if (w == 0x2225) { /* PARALLEL TO */
			s1 = 0x2142;
		} else if (w == 0xFF0D) { /* FULLWIDTH HYPHEN-MINUS */
			s1 = 0x215D;
		} else if (w == 0xFFE0) { /* FULLWIDTH CENT SIGN */
			s1 = 0x2171;
		} else if (w == 0xFFE1) { /* FULLWIDTH POUND SIGN */
			s1 = 0x2172;
		} else if (w == 0xFFE2) { /* FULLWIDTH NOT SIGN */
			s1 = 0x224C;
		} else if (w == 0) {
			/* U+0000 is written as a single zero byte */
			out = mb_convert_buf_add(out, 0);
			continue;
		}

		if (!s1 || (s1 >= 0x8080 && !s2)) { /* not found or X 0212 */
			for (unsigned int i = 0; i < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; i++) {
				if (cp932ext1_ucs_table[i] == w) {
					s1 = ((i/94 + 0x2D) << 8) + (i%94 + 0x21);
					goto emit_output;
				}
			}

			for (unsigned int i = 0; i < cp932ext3_ucs_table_max - cp932ext3_ucs_table_min; i++) {
				if (cp932ext3_ucs_table[i] == w) {
					s1 = ((i/94 + 0x93) << 8) + (i%94 + 0x21);
					goto emit_output;
				}
			}

			/* Report the error with this encoder as the callback */
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjiswin);
			/* Re-reserve space for the remaining codepoints after the error output */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
			continue;
		}

emit_output:
		if (s1 < 0x100) {
			out = mb_convert_buf_add(out, s1);
		} else {
			c1 = (s1 >> 8) & 0xFF;
			c2 = s1 & 0xFF;
			SJIS_ENCODE(c1, c2, s1, s2);
			out = mb_convert_buf_add2(out, s1, s2);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
7744 
/* Character byte length indexed by byte value: 2 for 0x81-0x9F and
 * 0xE0-0xEF, 1 for every other value. */
static const unsigned char mblen_table_sjis[256] = { /* 0x81-0x9F,0xE0-0xEF */
	/* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x80 */ 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0x90 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xA0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xB0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xC0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xD0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xE0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xF0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
};
7763 
/* Character byte length indexed by byte value: 2 for 0x81-0x9F and
 * 0xE0-0xED, 1 for every other value. */
static const unsigned char mblen_table_sjismac[256] = { /* 0x81-0x9F,0xE0-0xED */
	/* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x80 */ 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0x90 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xA0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xB0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xC0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xD0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xE0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1,
	/* 0xF0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
};
7782 
/* Character byte length indexed by byte value: 2 for 0x81-0x9F and
 * 0xE0-0xFC, 1 for every other value. */
static const unsigned char mblen_table_sjis_mobile[256] = { /* 0x81-0x9F,0xE0-0xFC */
	/* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x80 */ 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0x90 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xA0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xB0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xC0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xD0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xE0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xF0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1
};
7801 
/* NULL-terminated list of alias strings referenced by mbfl_encoding_sjis. */
static const char *mbfl_encoding_sjis_aliases[] = {"x-sjis", "SHIFT-JIS", NULL};

/* Conversion vtable listing mbfl_no_encoding_sjis first and
 * mbfl_no_encoding_wchar second (presumably source and destination — confirm
 * against struct mbfl_convert_vtbl). It installs mbfl_filt_conv_sjis_wchar
 * together with mbfl_filt_conv_sjis_wchar_flush. */
static const struct mbfl_convert_vtbl vtbl_sjis_wchar = {
	mbfl_no_encoding_sjis,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_sjis_wchar,
	mbfl_filt_conv_sjis_wchar_flush,
	NULL
};

/* Counterpart vtable with the two encoding identifiers swapped; it installs
 * mbfl_filt_conv_wchar_sjis together with the shared
 * mbfl_filt_conv_common_flush. */
static const struct mbfl_convert_vtbl vtbl_wchar_sjis = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_sjis,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_sjis,
	mbfl_filt_conv_common_flush,
	NULL
};

/* Encoding descriptor labelled "SJIS" / "Shift_JIS". It ties together the
 * alias list, mblen_table_sjis, the two vtables above and the
 * mb_sjis_to_wchar / mb_wchar_to_sjis conversion functions. The last entry
 * is left NULL. */
const mbfl_encoding mbfl_encoding_sjis = {
	mbfl_no_encoding_sjis,
	"SJIS",
	"Shift_JIS",
	mbfl_encoding_sjis_aliases,
	mblen_table_sjis,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_sjis_wchar,
	&vtbl_wchar_sjis,
	mb_sjis_to_wchar,
	mb_wchar_to_sjis,
	NULL
};
7837 
/* NULL-terminated list of alias strings referenced by mbfl_encoding_sjis_mac. */
static const char *mbfl_encoding_sjis_mac_aliases[] = {"MacJapanese", "x-Mac-Japanese", NULL};

/* Conversion vtable listing mbfl_no_encoding_sjis_mac first and
 * mbfl_no_encoding_wchar second (presumably source and destination — confirm
 * against struct mbfl_convert_vtbl). It installs
 * mbfl_filt_conv_sjis_mac_wchar and reuses mbfl_filt_conv_sjis_wchar_flush,
 * the same flush routine the plain SJIS vtable uses. */
static const struct mbfl_convert_vtbl vtbl_sjis_mac_wchar = {
	mbfl_no_encoding_sjis_mac,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_sjis_mac_wchar,
	mbfl_filt_conv_sjis_wchar_flush,
	NULL,
};

/* Counterpart vtable with the two encoding identifiers swapped; unlike most
 * of the wchar -> X vtables in this file it has its own flush routine,
 * mbfl_filt_conv_wchar_sjis_mac_flush. */
static const struct mbfl_convert_vtbl vtbl_wchar_sjis_mac = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_sjis_mac,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_sjis_mac,
	mbfl_filt_conv_wchar_sjis_mac_flush,
	NULL,
};

/* Encoding descriptor labelled "SJIS-mac" / "Shift_JIS". It ties together
 * the alias list, mblen_table_sjismac, the two vtables above and the
 * mb_sjismac_to_wchar / mb_wchar_to_sjismac conversion functions. The last
 * entry is left NULL. */
const mbfl_encoding mbfl_encoding_sjis_mac = {
	mbfl_no_encoding_sjis_mac,
	"SJIS-mac",
	"Shift_JIS",
	mbfl_encoding_sjis_mac_aliases,
	mblen_table_sjismac,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_sjis_mac_wchar,
	&vtbl_wchar_sjis_mac,
	mb_sjismac_to_wchar,
	mb_wchar_to_sjismac,
	NULL
};
7873 
/* NULL-terminated alias lists for the three mobile Shift-JIS variants; each
 * is referenced by the encoding descriptor of the same carrier below. */
static const char *mbfl_encoding_sjis_docomo_aliases[] = {"SJIS-DOCOMO", "shift_jis-imode", "x-sjis-emoji-docomo", NULL};
static const char *mbfl_encoding_sjis_kddi_aliases[] = {"SJIS-KDDI", "shift_jis-kddi", "x-sjis-emoji-kddi", NULL};
static const char *mbfl_encoding_sjis_sb_aliases[] = {"SJIS-SOFTBANK", "shift_jis-softbank", "x-sjis-emoji-softbank", NULL};
7877 
/* Conversion vtable listing mbfl_no_encoding_sjis_docomo first and
 * mbfl_no_encoding_wchar second (presumably source and destination — confirm
 * against struct mbfl_convert_vtbl). The filter routine
 * mbfl_filt_conv_sjis_mobile_wchar is the same one the KDDI and SoftBank
 * vtables use; the flush routine is shared with plain SJIS. */
static const struct mbfl_convert_vtbl vtbl_sjis_docomo_wchar = {
	mbfl_no_encoding_sjis_docomo,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_sjis_mobile_wchar,
	mbfl_filt_conv_sjis_wchar_flush,
	NULL,
};

/* Counterpart vtable with the two encoding identifiers swapped; it installs
 * mbfl_filt_conv_wchar_sjis_mobile with mbfl_filt_conv_sjis_mobile_flush. */
static const struct mbfl_convert_vtbl vtbl_wchar_sjis_docomo = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_sjis_docomo,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_sjis_mobile,
	mbfl_filt_conv_sjis_mobile_flush,
	NULL,
};

/* Encoding descriptor labelled "SJIS-Mobile#DOCOMO" / "Shift_JIS". It ties
 * together the DOCOMO alias list, mblen_table_sjis_mobile, the two vtables
 * above and the mb_sjis_docomo_to_wchar / mb_wchar_to_sjis_docomo conversion
 * functions. The last entry is left NULL. */
const mbfl_encoding mbfl_encoding_sjis_docomo = {
	mbfl_no_encoding_sjis_docomo,
	"SJIS-Mobile#DOCOMO",
	"Shift_JIS",
	mbfl_encoding_sjis_docomo_aliases,
	mblen_table_sjis_mobile,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_sjis_docomo_wchar,
	&vtbl_wchar_sjis_docomo,
	mb_sjis_docomo_to_wchar,
	mb_wchar_to_sjis_docomo,
	NULL
};
7911 
/* Conversion vtable listing mbfl_no_encoding_sjis_kddi first and
 * mbfl_no_encoding_wchar second (presumably source and destination — confirm
 * against struct mbfl_convert_vtbl). The filter routine
 * mbfl_filt_conv_sjis_mobile_wchar is the same one the DOCOMO and SoftBank
 * vtables use; the flush routine is shared with plain SJIS. */
static const struct mbfl_convert_vtbl vtbl_sjis_kddi_wchar = {
	mbfl_no_encoding_sjis_kddi,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_sjis_mobile_wchar,
	mbfl_filt_conv_sjis_wchar_flush,
	NULL,
};

/* Counterpart vtable with the two encoding identifiers swapped; it installs
 * mbfl_filt_conv_wchar_sjis_mobile with mbfl_filt_conv_sjis_mobile_flush. */
static const struct mbfl_convert_vtbl vtbl_wchar_sjis_kddi = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_sjis_kddi,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_sjis_mobile,
	mbfl_filt_conv_sjis_mobile_flush,
	NULL,
};

/* Encoding descriptor labelled "SJIS-Mobile#KDDI" / "Shift_JIS". It ties
 * together the KDDI alias list, mblen_table_sjis_mobile, the two vtables
 * above and the mb_sjis_kddi_to_wchar / mb_wchar_to_sjis_kddi conversion
 * functions. The last entry is left NULL. */
const mbfl_encoding mbfl_encoding_sjis_kddi = {
	mbfl_no_encoding_sjis_kddi,
	"SJIS-Mobile#KDDI",
	"Shift_JIS",
	mbfl_encoding_sjis_kddi_aliases,
	mblen_table_sjis_mobile,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_sjis_kddi_wchar,
	&vtbl_wchar_sjis_kddi,
	mb_sjis_kddi_to_wchar,
	mb_wchar_to_sjis_kddi,
	NULL
};
7945 
/* Conversion vtable listing mbfl_no_encoding_sjis_sb first and
 * mbfl_no_encoding_wchar second (presumably source and destination — confirm
 * against struct mbfl_convert_vtbl). The filter routine
 * mbfl_filt_conv_sjis_mobile_wchar is the same one the DOCOMO and KDDI
 * vtables use; the flush routine is shared with plain SJIS. */
static const struct mbfl_convert_vtbl vtbl_sjis_sb_wchar = {
	mbfl_no_encoding_sjis_sb,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_sjis_mobile_wchar,
	mbfl_filt_conv_sjis_wchar_flush,
	NULL,
};

/* Counterpart vtable with the two encoding identifiers swapped; it installs
 * mbfl_filt_conv_wchar_sjis_mobile with mbfl_filt_conv_sjis_mobile_flush. */
static const struct mbfl_convert_vtbl vtbl_wchar_sjis_sb = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_sjis_sb,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_sjis_mobile,
	mbfl_filt_conv_sjis_mobile_flush,
	NULL,
};

/* Encoding descriptor labelled "SJIS-Mobile#SOFTBANK" / "Shift_JIS". It ties
 * together the SoftBank alias list, mblen_table_sjis_mobile, the two vtables
 * above and the mb_sjis_sb_to_wchar / mb_wchar_to_sjis_sb conversion
 * functions. The last entry is left NULL. */
const mbfl_encoding mbfl_encoding_sjis_sb = {
	mbfl_no_encoding_sjis_sb,
	"SJIS-Mobile#SOFTBANK",
	"Shift_JIS",
	mbfl_encoding_sjis_sb_aliases,
	mblen_table_sjis_mobile,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_sjis_sb_wchar,
	&vtbl_wchar_sjis_sb,
	mb_sjis_sb_to_wchar,
	mb_wchar_to_sjis_sb,
	NULL
};
7979 
7980 /* Although the specification for Shift-JIS-2004 indicates that 0x5C and
7981  * 0x7E should (respectively) represent a Yen sign and an overbar, feedback
7982  * from Japanese PHP users indicates that they prefer 0x5C and 0x7E to be
7983  * treated as equivalent to U+005C and U+007E. This is the historical
7984  * behavior of mbstring, and promotes compatibility with other software
7985  * which handles Shift-JIS and Shift-JIS-2004 text in this way. */
7986 
/* NULL-terminated list of alias strings referenced by mbfl_encoding_sjis2004. */
static const char *mbfl_encoding_sjis2004_aliases[] = {"SJIS2004","Shift_JIS-2004", NULL};

/* Conversion vtable listing mbfl_no_encoding_sjis2004 first and
 * mbfl_no_encoding_wchar second (presumably source and destination — confirm
 * against struct mbfl_convert_vtbl). It installs the jis2004 routines
 * mbfl_filt_conv_jis2004_wchar and mbfl_filt_conv_jis2004_wchar_flush, not
 * SJIS-specific ones. */
static const struct mbfl_convert_vtbl vtbl_sjis2004_wchar = {
	mbfl_no_encoding_sjis2004,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_jis2004_wchar,
	mbfl_filt_conv_jis2004_wchar_flush,
	NULL,
};

/* Counterpart vtable with the two encoding identifiers swapped; it installs
 * mbfl_filt_conv_wchar_jis2004 with mbfl_filt_conv_wchar_jis2004_flush. */
static const struct mbfl_convert_vtbl vtbl_wchar_sjis2004 = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_sjis2004,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_jis2004,
	mbfl_filt_conv_wchar_jis2004_flush,
	NULL,
};

/* Encoding descriptor labelled "SJIS-2004" / "Shift_JIS". It ties together
 * the alias list, the mobile byte-length table, the two vtables above and
 * the mb_sjis2004_to_wchar / mb_wchar_to_sjis2004 conversion functions. The
 * last entry is left NULL. */
const mbfl_encoding mbfl_encoding_sjis2004 = {
	mbfl_no_encoding_sjis2004,
	"SJIS-2004",
	"Shift_JIS",
	mbfl_encoding_sjis2004_aliases,
	mblen_table_sjis_mobile, /* Leading byte values used for SJIS-2004 are the same as mobile SJIS variants */
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_sjis2004_wchar,
	&vtbl_wchar_sjis2004,
	mb_sjis2004_to_wchar,
	mb_wchar_to_sjis2004,
	NULL
};
8022 
8023 /* CP932 is Microsoft's version of Shift-JIS.
8024  *
8025  * What we call "SJIS-win" is a variant of CP932 which maps U+00A5
8026  * and U+203E the same way as eucJP-win; namely, instead of mapping
8027  * U+00A5 (YEN SIGN) to 0x5C and U+203E (OVERLINE) to 0x7E,
8028  * these codepoints are mapped to appropriate JIS X 0208 characters.
8029  *
8030  * When converting from Shift-JIS to Unicode, there is no difference
8031  * between CP932 and "SJIS-win".
8032  *
8033  * Additional facts:
8034  *
8035  * • In the libmbfl library which formed the base for mbstring, "CP932" and
8036  *   "SJIS-win" were originally aliases. The differing mappings were added in
8037  *   December 2002. The libmbfl author later stated that this was done so that
8038  *   "CP932" would comply with a certain specification, while "SJIS-win" would
8039  *   maintain the existing mappings. He does not remember which specification
8040  *   it was.
8041  * • The WHATWG specification for "Shift_JIS" (followed by web browsers)
8042  *   agrees with our mappings for "CP932".
8043  * • Microsoft Windows' "best-fit" mappings for CP932 (via the
8044  *   WideCharToMultiByte API) convert U+00A5 to 0x5C, which also agrees with
8045  *   our mappings for "CP932".
8046  * • glibc's iconv converts U+203E to CP932 0x7E, which again agrees with
8047  *   our mappings for "CP932".
8048  * • When converting Shift-JIS to CP932, the conversion goes through Unicode.
8049  *   Shift-JIS 0x7E converts to U+203E, so mapping U+203E to 0x7E means that
8050  *   0x7E will go to 0x7E when converting Shift-JIS to CP932.
8051  */
8052 
/* Character byte length indexed by byte value: 2 for 0x81-0x9F and
 * 0xE0-0xFF, 1 for every other value. */
static const unsigned char mblen_table_sjiswin[256] = { /* 0x81-0x9F,0xE0-0xFF */
	/* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x80 */ 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0x90 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xA0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xB0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xC0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xD0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xE0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xF0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
};
8071 
/* NULL-terminated alias lists referenced by the CP932 and SJIS-win encoding
 * descriptors respectively. */
static const char *mbfl_encoding_cp932_aliases[] = {"MS932", "Windows-31J", "MS_Kanji", NULL};
static const char *mbfl_encoding_sjiswin_aliases[] = {"SJIS-ms", "SJIS-open", NULL};
8074 
/* Conversion vtable listing mbfl_no_encoding_cp932 first and
 * mbfl_no_encoding_wchar second (presumably source and destination — confirm
 * against struct mbfl_convert_vtbl). It installs mbfl_filt_conv_cp932_wchar
 * with mbfl_filt_conv_cp932_wchar_flush. */
static const struct mbfl_convert_vtbl vtbl_cp932_wchar = {
	mbfl_no_encoding_cp932,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_cp932_wchar,
	mbfl_filt_conv_cp932_wchar_flush,
	NULL,
};

/* Counterpart vtable with the two encoding identifiers swapped; it installs
 * mbfl_filt_conv_wchar_cp932 with the shared mbfl_filt_conv_common_flush. */
static const struct mbfl_convert_vtbl vtbl_wchar_cp932 = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_cp932,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_cp932,
	mbfl_filt_conv_common_flush,
	NULL,
};

/* Encoding descriptor labelled "CP932" / "Shift_JIS". It ties together the
 * CP932 alias list, mblen_table_sjiswin, the two vtables above and the
 * mb_cp932_to_wchar / mb_wchar_to_cp932 conversion functions. The last entry
 * is left NULL. */
const mbfl_encoding mbfl_encoding_cp932 = {
	mbfl_no_encoding_cp932,
	"CP932",
	"Shift_JIS",
	mbfl_encoding_cp932_aliases,
	mblen_table_sjiswin,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_cp932_wchar,
	&vtbl_wchar_cp932,
	mb_cp932_to_wchar,
	mb_wchar_to_cp932,
	NULL
};
8108 
/* Conversion vtable listing mbfl_no_encoding_sjiswin first and
 * mbfl_no_encoding_wchar second (presumably source and destination — confirm
 * against struct mbfl_convert_vtbl). It reuses the CP932 routines
 * mbfl_filt_conv_cp932_wchar and mbfl_filt_conv_cp932_wchar_flush. */
static const struct mbfl_convert_vtbl vtbl_sjiswin_wchar = {
	mbfl_no_encoding_sjiswin,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_cp932_wchar,
	mbfl_filt_conv_cp932_wchar_flush,
	NULL,
};

/* Counterpart vtable with the two encoding identifiers swapped; it installs
 * the SJIS-win specific mbfl_filt_conv_wchar_sjiswin with the shared
 * mbfl_filt_conv_common_flush. */
static const struct mbfl_convert_vtbl vtbl_wchar_sjiswin = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_sjiswin,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_sjiswin,
	mbfl_filt_conv_common_flush,
	NULL,
};

/* Encoding descriptor labelled "SJIS-win" / "Shift_JIS". It ties together
 * the SJIS-win alias list, mblen_table_sjiswin and the two vtables above.
 * Decoding reuses mb_cp932_to_wchar, while encoding uses the dedicated
 * mb_wchar_to_sjiswin. The last entry is left NULL. */
const mbfl_encoding mbfl_encoding_sjiswin = {
	mbfl_no_encoding_sjiswin,
	"SJIS-win",
	"Shift_JIS",
	mbfl_encoding_sjiswin_aliases,
	mblen_table_sjiswin,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_sjiswin_wchar,
	&vtbl_wchar_sjiswin,
	mb_cp932_to_wchar,
	mb_wchar_to_sjiswin,
	NULL
};
8142 
8143 /*
8144  * EUC variants
8145  */
8146 
/* Legacy filter: decode EUC-JP one byte at a time.
 *
 * filter->status tracks the position within a character:
 *   0  expecting the first byte
 *   1  got the first byte of a two-byte JIS X 0208 character (in filter->cache)
 *   2  got 0x8E, expecting a kana byte
 *   3  got 0x8F, expecting the first JIS X 0212 byte
 *   4  got 0x8F and the first JIS X 0212 byte (in filter->cache)
 * Completed characters are passed to filter->output_function; bytes that do
 * not form a valid character produce MBFL_BAD_INPUT. Returns 0 when it runs
 * to completion (CK() is defined outside this view; presumably it returns
 * early when the output call reports failure). */
static int mbfl_filt_conv_eucjp_wchar(int c, mbfl_convert_filter *filter)
{
	int lead, idx, w = 0;

	switch (filter->status) {
	case 0:
		if (c >= 0 && c < 0x80) {	/* latin */
			CK((*filter->output_function)(c, filter->data));
		} else if (c > 0xa0 && c < 0xff) {	/* X 0208 first char */
			filter->cache = c;
			filter->status = 1;
		} else if (c == 0x8e) {	/* kana first char */
			filter->status = 2;
		} else if (c == 0x8f) {	/* X 0212 first char */
			filter->status = 3;
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 1:	/* got first half */
		filter->status = 0;
		lead = filter->cache;
		if (c <= 0xa0 || c >= 0xff) {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			break;
		}

		idx = (lead - 0xa1)*94 + c - 0xa1;
		if (idx >= 0 && idx < jisx0208_ucs_table_size) {
			w = jisx0208_ucs_table[idx];
		}
		/* Out-of-range index and empty table cell are both invalid input */
		if (!w) {
			w = MBFL_BAD_INPUT;
		}
		CK((*filter->output_function)(w, filter->data));
		break;

	case 2:	/* got 0x8e */
		filter->status = 0;
		if (c <= 0xa0 || c >= 0xe0) {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			break;
		}

		w = 0xfec0 + c;
		CK((*filter->output_function)(w, filter->data));
		break;

	case 3: /* got 0x8f, JIS X 0212 first byte */
		/* The byte is stored unchecked; both bytes are validated in state 4 */
		filter->cache = c;
		filter->status = 4;
		break;

	case 4: /* got 0x8f, JIS X 0212 second byte */
		filter->status = 0;
		lead = filter->cache;
		if (!(c > 0xA0 && c < 0xFF && lead > 0xA0 && lead < 0xFF)) {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			break;
		}

		idx = (lead - 0xa1)*94 + c - 0xa1;
		if (idx >= 0 && idx < jisx0212_ucs_table_size) {
			w = jisx0212_ucs_table[idx];
		}
		if (!w) {
			w = MBFL_BAD_INPUT;
		}
		CK((*filter->output_function)(w, filter->data));
		break;

		EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
8225 
/* End-of-input handler for the EUC-JP -> wchar legacy filter.
 *
 * A non-zero filter->status means the input stopped in the middle of a
 * multi-byte character (see the states used by mbfl_filt_conv_eucjp_wchar);
 * in that case a single MBFL_BAD_INPUT marker is emitted and the status is
 * cleared. Afterwards the downstream flush function is invoked if one is
 * installed. Always returns 0. */
static int mbfl_filt_conv_eucjp_wchar_flush(mbfl_convert_filter *filter)
{
	if (filter->status != 0) {
		filter->output_function(MBFL_BAD_INPUT, filter->data);
		filter->status = 0;
	}

	if (filter->flush_function != NULL) {
		filter->flush_function(filter->data);
	}

	return 0;
}
8239 
/* Legacy filter: encode one Unicode codepoint `c` as EUC-JP and pass the
 * resulting byte(s) to filter->output_function.
 *
 * U+00AF is given the fixed value 0xA2B4; other codepoints are looked up in
 * the ucs_a1/a2/i/r JIS tables, followed by a short list of hard-coded
 * fallbacks. The resulting value selects the output form:
 *   < 0x80     one byte
 *   < 0x100    0x8E followed by the byte (kana)
 *   < 0x8080   two bytes, each with the high bit set (JIS X 0208)
 *   otherwise  0x8F followed by two bytes with the high bit set (JIS X 0212)
 * Codepoints with no mapping go to mbfl_filt_conv_illegal_output().
 * Returns 0 when it runs to completion. */
static int mbfl_filt_conv_wchar_eucjp(int c, mbfl_convert_filter *filter)
{
	int s = 0;

	if (c == 0xAF) { /* U+00AF is MACRON */
		s = 0xA2B4; /* Use JIS X 0212 overline */
	} else if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
		s = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
	} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
		s = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
	} else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
		s = ucs_i_jis_table[c - ucs_i_jis_table_min];
	} else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
		s = ucs_r_jis_table[c - ucs_r_jis_table_min];
	}

	if (s <= 0) {
		switch (c) {
		case 0xff3c:	/* FULLWIDTH REVERSE SOLIDUS */
			s = 0x2140;
			break;
		case 0x2225:	/* PARALLEL TO */
			s = 0x2142;
			break;
		case 0xff0d:	/* FULLWIDTH HYPHEN-MINUS */
			s = 0x215d;
			break;
		case 0xffe0:	/* FULLWIDTH CENT SIGN */
			s = 0x2171;
			break;
		case 0xffe1:	/* FULLWIDTH POUND SIGN */
			s = 0x2172;
			break;
		case 0xffe2:	/* FULLWIDTH NOT SIGN */
			s = 0x224c;
			break;
		case 0:			/* U+0000 is emitted as a single zero byte */
			s = 0;
			break;
		default:		/* no mapping */
			s = -1;
			break;
		}
	}

	if (s < 0) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
		return 0;
	}

	if (s < 0x80) {	/* latin */
		CK((*filter->output_function)(s, filter->data));
	} else if (s < 0x100) {	/* kana */
		CK((*filter->output_function)(0x8e, filter->data));
		CK((*filter->output_function)(s, filter->data));
	} else if (s < 0x8080)  {	/* X 0208 */
		CK((*filter->output_function)(((s >> 8) & 0xff) | 0x80, filter->data));
		CK((*filter->output_function)((s & 0xff) | 0x80, filter->data));
	} else {	/* X 0212 */
		CK((*filter->output_function)(0x8f, filter->data));
		CK((*filter->output_function)(((s >> 8) & 0xff) | 0x80, filter->data));
		CK((*filter->output_function)((s & 0xff) | 0x80, filter->data));
	}

	return 0;
}
8294 
mb_eucjp_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)8295 static size_t mb_eucjp_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
8296 {
8297 	unsigned char *p = *in, *e = p + *in_len;
8298 	uint32_t *out = buf, *limit = buf + bufsize;
8299 
8300 	while (p < e && out < limit) {
8301 		unsigned char c = *p++;
8302 
8303 		if (c < 0x80) {
8304 			*out++ = c;
8305 		} else if (c >= 0xA1 && c <= 0xFE && p < e) {
8306 			/* JISX 0208 */
8307 			unsigned char c2 = *p++;
8308 			if (c2 >= 0xA1 && c2 <= 0xFE) {
8309 				unsigned int s = (c - 0xA1)*94 + c2 - 0xA1;
8310 				if (s < jisx0208_ucs_table_size) {
8311 					uint32_t w = jisx0208_ucs_table[s];
8312 					if (!w)
8313 						w = MBFL_BAD_INPUT;
8314 					*out++ = w;
8315 				} else {
8316 					*out++ = MBFL_BAD_INPUT;
8317 				}
8318 			} else {
8319 				*out++ = MBFL_BAD_INPUT;
8320 			}
8321 		} else if (c == 0x8E && p < e) {
8322 			/* Kana */
8323 			unsigned char c2 = *p++;
8324 			*out++ = (c2 >= 0xA1 && c2 <= 0xDF) ? 0xFEC0 + c2 : MBFL_BAD_INPUT;
8325 		} else if (c == 0x8F) {
8326 			/* JISX 0212 */
8327 			if ((e - p) >= 2) {
8328 				unsigned char c2 = *p++;
8329 				unsigned char c3 = *p++;
8330 				if (c3 >= 0xA1 && c3 <= 0xFE && c2 >= 0xA1 && c2 <= 0xFE) {
8331 					unsigned int s = (c2 - 0xA1)*94 + c3 - 0xA1;
8332 					if (s < jisx0212_ucs_table_size) {
8333 						uint32_t w = jisx0212_ucs_table[s];
8334 						if (!w)
8335 							w = MBFL_BAD_INPUT;
8336 						*out++ = w;
8337 					} else {
8338 						*out++ = MBFL_BAD_INPUT;
8339 					}
8340 				} else {
8341 					*out++ = MBFL_BAD_INPUT;
8342 				}
8343 			} else {
8344 				*out++ = MBFL_BAD_INPUT;
8345 				p = e; /* Jump to end of string */
8346 			}
8347 		} else {
8348 			*out++ = MBFL_BAD_INPUT;
8349 		}
8350 	}
8351 
8352 	*in_len = e - p;
8353 	*in = p;
8354 	return out - buf;
8355 }
8356 
/* Encode `len` Unicode codepoints from `in` as EUC-JP, appending the bytes to
 * `buf`. `end` is not used by this function.
 *
 * U+00AF is given the fixed value 0xA2B4; other codepoints are looked up in
 * the ucs_a1/a2/i/r JIS tables, followed by a short list of hard-coded
 * fallbacks. Codepoints with no mapping are reported through
 * MB_CONVERT_ERROR. The output buffer is sized for two bytes per codepoint
 * up front and grown when a three-byte sequence has to be written. */
static void mb_wchar_to_eucjp(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);

	while (len--) {
		uint32_t w = *in++;
		unsigned int s = 0;

		if (w == 0xAF) { /* U+00AF is MACRON */
			s = 0xA2B4; /* Use JIS X 0212 overline */
		} else if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) {
			s = ucs_a1_jis_table[w - ucs_a1_jis_table_min];
		} else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) {
			s = ucs_a2_jis_table[w - ucs_a2_jis_table_min];
		} else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) {
			s = ucs_i_jis_table[w - ucs_i_jis_table_min];
		} else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) {
			s = ucs_r_jis_table[w - ucs_r_jis_table_min];
		}

		if (s == 0) {
			switch (w) {
			case 0xFF3C: /* FULLWIDTH REVERSE SOLIDUS */
				s = 0x2140;
				break;
			case 0x2225: /* PARALLEL TO */
				s = 0x2142;
				break;
			case 0xFF0D: /* FULLWIDTH HYPHEN-MINUS */
				s = 0x215D;
				break;
			case 0xFFE0: /* FULLWIDTH CENT SIGN */
				s = 0x2171;
				break;
			case 0xFFE1: /* FULLWIDTH POUND SIGN */
				s = 0x2172;
				break;
			case 0xFFE2: /* FULLWIDTH NOT SIGN */
				s = 0x224C;
				break;
			case 0:
				/* U+0000 is written as a single zero byte */
				out = mb_convert_buf_add(out, 0);
				continue;
			default: {
				MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_eucjp);
				/* Re-reserve space for the remaining codepoints after the error output */
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
				continue;
			}
			}
		}

		if (s < 0x80) {
			out = mb_convert_buf_add(out, s);
		} else if (s < 0x100) {
			out = mb_convert_buf_add2(out, 0x8E, s);
		} else if (s < 0x8080)  {
			out = mb_convert_buf_add2(out, ((s >> 8) & 0xFF) | 0x80, (s & 0xFF) | 0x80);
		} else {
			/* Three bytes exceed the two-per-codepoint reservation */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 3);
			out = mb_convert_buf_add3(out, 0x8F, ((s >> 8) & 0xFF) | 0x80, (s & 0xFF) | 0x80);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
8416 
/* Byte-at-a-time eucJP-win decoder.
 *
 * `filter->status` records how far into a multi-byte character we are:
 *   0  expecting the first byte of a character
 *   1  a two-byte lead byte (0xA1..0xFE) is held in `filter->cache`
 *   2  0x8E was seen; one kana byte follows
 *   3  0x8F was seen; the first of two trailing bytes follows
 *   4  0x8F and its first trailing byte (in `filter->cache`) were seen
 * Each completed character is passed to `filter->output_function`; invalid
 * sequences produce MBFL_BAD_INPUT. Returns 0, or leaves early through CK()
 * when the output function reports failure. */
static int mbfl_filt_conv_eucjpwin_wchar(int c, mbfl_convert_filter *filter)
{
	int lead, pos, ucs, i;

	switch (filter->status) {
	case 0:
		if (c >= 0 && c < 0x80) {
			/* latin: passes through unchanged */
			CK((*filter->output_function)(c, filter->data));
		} else if (c >= 0xa1 && c <= 0xfe) {
			/* first byte of a two-byte character */
			filter->cache = c;
			filter->status = 1;
		} else if (c == 0x8e) {
			/* kana prefix */
			filter->status = 2;
		} else if (c == 0x8f) {
			/* X 0212 prefix */
			filter->status = 3;
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 1: /* second byte of a two-byte character */
		filter->status = 0;
		if (c <= 0xa0 || c >= 0xff) {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			break;
		}

		lead = filter->cache;
		pos = (lead - 0xa1)*94 + c - 0xa1;

		/* A few cells are mapped by hand rather than through the tables */
		switch (pos) {
		case 31:
			ucs = 0xff3c; /* FULLWIDTH REVERSE SOLIDUS */
			break;
		case 32:
			ucs = 0xff5e; /* FULLWIDTH TILDE */
			break;
		case 33:
			ucs = 0x2225; /* PARALLEL TO */
			break;
		case 60:
			ucs = 0xff0d; /* FULLWIDTH HYPHEN-MINUS */
			break;
		case 80:
			ucs = 0xffe0; /* FULLWIDTH CENT SIGN */
			break;
		case 81:
			ucs = 0xffe1; /* FULLWIDTH POUND SIGN */
			break;
		case 137:
			ucs = 0xffe2; /* FULLWIDTH NOT SIGN */
			break;
		default:
			ucs = 0;
			break;
		}

		if (ucs == 0) {
			if (pos >= cp932ext1_ucs_table_min && pos < cp932ext1_ucs_table_max) {
				/* vendor ext1 (13ku) */
				ucs = cp932ext1_ucs_table[pos - cp932ext1_ucs_table_min];
			} else if (pos >= 0 && pos < jisx0208_ucs_table_size) {
				/* X 0208 */
				ucs = jisx0208_ucs_table[pos];
			} else if (pos >= (84 * 94)) {
				/* user-defined area (85ku - 94ku) maps linearly from U+E000 */
				ucs = pos - (84 * 94) + 0xe000;
			}
		}

		if (ucs <= 0) {
			ucs = MBFL_BAD_INPUT;
		}
		CK((*filter->output_function)(ucs, filter->data));
		break;

	case 2: /* byte following 0x8e: X0201 kana */
		filter->status = 0;
		if (c > 0xa0 && c < 0xe0) {
			ucs = 0xfec0 + c;
			CK((*filter->output_function)(ucs, filter->data));
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 3: /* first byte following 0x8f: remember it, validate on the next byte */
		filter->cache = c;
		filter->status = 4;
		break;

	case 4: /* second byte following 0x8f */
		filter->status = 0;
		lead = filter->cache;
		if (lead <= 0xa0 || lead >= 0xff || c <= 0xa0 || c >= 0xff) {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			break;
		}

		pos = (lead - 0xa1)*94 + c - 0xa1;
		ucs = 0;

		if (pos >= 0 && pos < jisx0212_ucs_table_size) {
			ucs = jisx0212_ucs_table[pos];
			if (ucs == 0x007e) {
				ucs = 0xff5e; /* FULLWIDTH TILDE */
			}
		} else if (pos >= (82*94) && pos < (84*94)) {
			/* vendor ext3 (83ku - 84ku): find the two-byte code in the
			 * EUC-JP list and take the codepoint at the same index */
			const int packed = (lead << 8) | c;

			for (i = 0; i < cp932ext3_eucjp_table_size; i++) {
				if (packed == cp932ext3_eucjp_table[i]) {
					if (i < (cp932ext3_ucs_table_max - cp932ext3_ucs_table_min)) {
						ucs = cp932ext3_ucs_table[i];
					}
					break;
				}
			}
		} else if (pos >= (84*94)) {
			/* user-defined area (85ku - 94ku), placed after the X 0208 user area */
			ucs = pos - (84*94) + (0xe000 + (94*10));
		}

		if (ucs == 0x00A6) {
			ucs = 0xFFE4; /* FULLWIDTH BROKEN BAR */
		}

		if (ucs <= 0) {
			ucs = MBFL_BAD_INPUT;
		}
		CK((*filter->output_function)(ucs, filter->data));
		break;

		EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
8544 
/* Finish an eucJP-win decode. If the filter is still part-way through a
 * character (status is non-zero), emit one MBFL_BAD_INPUT for it and reset
 * the status; then invoke the flush function, if one is set. Always returns 0. */
static int mbfl_filt_conv_eucjpwin_wchar_flush(mbfl_convert_filter *filter)
{
	if (filter->status != 0) {
		/* Pending bytes never formed a complete character */
		filter->output_function(MBFL_BAD_INPUT, filter->data);
		filter->status = 0;
	}

	if (filter->flush_function != NULL) {
		filter->flush_function(filter->data);
	}

	return 0;
}
8558 
/* Encode one Unicode codepoint `c` as eucJP-win and pass the resulting bytes
 * to `filter->output_function`.
 *
 * The codepoint is resolved to an internal code through, in order: two
 * hard-wired cases, the ucs_*_jis tables, the two user-defined ranges
 * starting at U+E000, a list of hand-mapped punctuation, and finally linear
 * searches of the CP932 vendor extension tables. A code below 0 means "no
 * mapping" and is reported via mbfl_filt_conv_illegal_output().
 * Returns 0, or leaves early through CK() when output fails. */
static int mbfl_filt_conv_wchar_eucjpwin(int c, mbfl_convert_filter *filter)
{
	int code = 0, i, n;

	if (c == 0xAF) {
		/* U+00AF MACRON is written as the JIS X 0212 overline */
		code = 0xA2B4;
	} else if (c == 0x203E) {
		code = 0x7E;
	} else if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
		code = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
	} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
		code = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
	} else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
		code = ucs_i_jis_table[c - ucs_i_jis_table_min];
	} else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
		code = ucs_r_jis_table[c - ucs_r_jis_table_min];
	} else if (c >= 0xe000 && c < (0xe000 + 10*94)) {
		/* user-defined area, X0208 85ku - 94ku */
		const int off = c - 0xe000;
		code = ((off/94 + 0x75) << 8) | (off%94 + 0x21);
	} else if (c >= (0xe000 + 10*94) && c < (0xe000 + 20*94)) {
		/* user-defined area, X0212 85ku - 94ku */
		const int off = c - (0xe000 + 10*94);
		code = ((off/94 + 0xf5) << 8) | (off%94 + 0xa1);
	}

	if (code == 0xa2f1) {
		code = 0x2d62; /* NUMERO SIGN */
	}

	if (code <= 0) {
		switch (c) {
		case 0xa5: /* YEN SIGN */
			code = 0x5C;
			break;
		case 0x2014:
			code = 0x213D;
			break;
		case 0xff3c: /* FULLWIDTH REVERSE SOLIDUS */
			code = 0x2140;
			break;
		case 0x2225: /* PARALLEL TO */
			code = 0x2142;
			break;
		case 0xff0d: /* FULLWIDTH HYPHEN-MINUS */
			code = 0x215d;
			break;
		case 0xffe0: /* FULLWIDTH CENT SIGN */
			code = 0x2171;
			break;
		case 0xffe1: /* FULLWIDTH POUND SIGN */
			code = 0x2172;
			break;
		case 0xffe2: /* FULLWIDTH NOT SIGN */
			code = 0x224c;
			break;
		default: {
			/* Row offset of the first ext1 entry within the 94x94 grid */
			const int first_row = cp932ext1_ucs_table_min / 94;

			code = -1;

			/* CP932 vendor ext1 (13ku): the table index determines the cell */
			n = cp932ext1_ucs_table_max - cp932ext1_ucs_table_min;
			for (i = 0; i < n; i++) {
				if (c == cp932ext1_ucs_table[i]) {
					code = ((i / 94 + first_row + 0x21) << 8) + (i % 94 + 0x21);
					break;
				}
			}

			/* CP932 vendor ext3: the code comes from the parallel EUC-JP list */
			if (code < 0) {
				n = cp932ext3_ucs_table_max - cp932ext3_ucs_table_min;
				for (i = 0; i < n; i++) {
					if (c == cp932ext3_ucs_table[i]) {
						if (i < cp932ext3_eucjp_table_size) {
							code = cp932ext3_eucjp_table[i];
						}
						break;
					}
				}
			}
			break;
		}
		}

		/* NUL always encodes as a zero byte, whatever the searches found */
		if (c == 0) {
			code = 0;
		} else if (code <= 0) {
			code = -1;
		}
	}

	if (code < 0) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
	} else if (code < 0x80) {
		/* latin */
		CK((*filter->output_function)(code, filter->data));
	} else if (code < 0x100) {
		/* kana */
		CK((*filter->output_function)(0x8e, filter->data));
		CK((*filter->output_function)(code, filter->data));
	} else if (code < 0x8080) {
		/* X 0208 */
		CK((*filter->output_function)(((code >> 8) & 0xff) | 0x80, filter->data));
		CK((*filter->output_function)((code & 0xff) | 0x80, filter->data));
	} else {
		/* X 0212 */
		CK((*filter->output_function)(0x8f, filter->data));
		CK((*filter->output_function)(((code >> 8) & 0xff) | 0x80, filter->data));
		CK((*filter->output_function)((code & 0xff) | 0x80, filter->data));
	}

	return 0;
}
8663 
mb_eucjpwin_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)8664 static size_t mb_eucjpwin_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
8665 {
8666 	unsigned char *p = *in, *e = p + *in_len;
8667 	uint32_t *out = buf, *limit = buf + bufsize;
8668 
8669 	while (p < e && out < limit) {
8670 		unsigned char c = *p++;
8671 
8672 		if (c < 0x80) {
8673 			*out++ = c;
8674 		} else if (c >= 0xA1 && c <= 0xFE && p < e) {
8675 			unsigned char c2 = *p++;
8676 
8677 			if (c2 >= 0xA1 && c2 <= 0xFE) {
8678 				unsigned int s = (c - 0xA1)*94 + c2 - 0xA1, w = 0;
8679 
8680 				if (s <= 137) {
8681 					if (s == 31) {
8682 						w = 0xFF3C; /* FULLWIDTH REVERSE SOLIDUS */
8683 					} else if (s == 32) {
8684 						w = 0xFF5E; /* FULLWIDTH TILDE */
8685 					} else if (s == 33) {
8686 						w = 0x2225; /* PARALLEL TO */
8687 					} else if (s == 60) {
8688 						w = 0xFF0D; /* FULLWIDTH HYPHEN-MINUS */
8689 					} else if (s == 80) {
8690 						w = 0xFFE0; /* FULLWIDTH CENT SIGN */
8691 					} else if (s == 81) {
8692 						w = 0xFFE1; /* FULLWIDTH POUND SIGN */
8693 					} else if (s == 137) {
8694 						w = 0xFFE2; /* FULLWIDTH NOT SIGN */
8695 					}
8696 				}
8697 
8698 				if (w == 0) {
8699 					if (s >= cp932ext1_ucs_table_min && s < cp932ext1_ucs_table_max) {
8700 						w = cp932ext1_ucs_table[s - cp932ext1_ucs_table_min];
8701 					} else if (s < jisx0208_ucs_table_size) {
8702 						w = jisx0208_ucs_table[s];
8703 					} else if (s >= (84 * 94)) {
8704 						w = s - (84 * 94) + 0xE000;
8705 					}
8706 				}
8707 
8708 				if (!w)
8709 					w = MBFL_BAD_INPUT;
8710 				*out++ = w;
8711 			} else {
8712 				*out++ = MBFL_BAD_INPUT;
8713 			}
8714 		} else if (c == 0x8E && p < e) {
8715 			unsigned char c2 = *p++;
8716 			if (c2 >= 0xA1 && c2 <= 0xDF) {
8717 				*out++ = 0xFEC0 + c2;
8718 			} else {
8719 				*out++ = MBFL_BAD_INPUT;
8720 			}
8721 		} else if (c == 0x8F && p < e) {
8722 			unsigned char c2 = *p++;
8723 			if (p == e) {
8724 				*out++ = MBFL_BAD_INPUT;
8725 				continue;
8726 			}
8727 			unsigned char c3 = *p++;
8728 
8729 			if (c2 >= 0xA1 && c2 <= 0xFE && c3 >= 0xA1 && c3 <= 0xFE) {
8730 				unsigned int s = (c2 - 0xA1)*94 + c3 - 0xA1, w = 0;
8731 
8732 				if (s < jisx0212_ucs_table_size) {
8733 					w = jisx0212_ucs_table[s];
8734 					if (w == 0x7E)
8735 						w = 0xFF5E; /* FULLWIDTH TILDE */
8736 				} else if (s >= (82*94) && s < (84*94)) {
8737 					s = (c2 << 8) | c3;
8738 					for (int i = 0; i < cp932ext3_eucjp_table_size; i++) {
8739 						if (cp932ext3_eucjp_table[i] == s) {
8740 							w = cp932ext3_ucs_table[i];
8741 							break;
8742 						}
8743 					}
8744 				} else if (s >= (84*94)) {
8745 					w = s - (84*94) + 0xE000 + (94*10);
8746 				}
8747 
8748 				if (w == 0xA6)
8749 					w = 0xFFE4; /* FULLWIDTH BROKEN BAR */
8750 
8751 				if (!w)
8752 					w = MBFL_BAD_INPUT;
8753 				*out++ = w;
8754 			} else {
8755 				*out++ = MBFL_BAD_INPUT;
8756 			}
8757 		} else {
8758 			*out++ = MBFL_BAD_INPUT;
8759 		}
8760 	}
8761 
8762 	*in_len = e - p;
8763 	*in = p;
8764 	return out - buf;
8765 }
8766 
/* Encode `len` Unicode codepoints from `in` as eucJP-win, appending the bytes
 * to `buf`.
 *
 * Each codepoint is resolved to an internal code `s` through, in order: a few
 * hard-wired cases, the ucs_*_jis tables, the two user-defined ranges starting
 * at U+E000, a list of hand-mapped punctuation, and finally linear searches of
 * the CP932 vendor extension tables. The code then selects the output form:
 *   below 0x80       a single byte
 *   0x80 .. 0xFF     0x8E followed by the byte (kana)
 *   0x100 .. 0x807F  two bytes, each with the high bit set
 *   0x8080 and up    0x8F followed by two bytes with the high bit set
 * Codepoints with no mapping are reported through MB_CONVERT_ERROR.
 * `end` is not consulted by this encoder. */
static void mb_wchar_to_eucjpwin(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	/* Two bytes per codepoint is reserved up front; the three-byte form
	 * asks for extra room when it is actually needed */
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);

	while (len--) {
		uint32_t w = *in++;
		unsigned int s = 0;

		if (w == 0) {
			/* NUL is passed straight through */
			out = mb_convert_buf_add(out, 0);
			continue;
		} else if (w == 0xAF) { /* U+00AF is MACRON */
			s = 0xA2B4; /* Use JIS X 0212 overline */
		} else if (w == 0x203E) {
			s = 0x7E;
		} else if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) {
			s = ucs_a1_jis_table[w - ucs_a1_jis_table_min];
		} else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) {
			s = ucs_a2_jis_table[w - ucs_a2_jis_table_min];
		} else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) {
			s = ucs_i_jis_table[w - ucs_i_jis_table_min];
		} else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) {
			s = ucs_r_jis_table[w - ucs_r_jis_table_min];
		} else if (w >= 0xE000 && w < (0xE000 + 10*94)) {
			/* user-defined area, two-byte form */
			s = w - 0xE000;
			s = ((s/94 + 0x75) << 8) + (s%94) + 0x21;
		} else if (w >= (0xE000 + 10*94) && w < (0xE000 + 20*94)) {
			/* user-defined area, three-byte form */
			s = w - (0xE000 + 10*94);
			s = ((s/94 + 0xF5) << 8) + (s%94) + 0xA1;
		}

		if (s == 0xA2F1)
			s = 0x2D62; /* NUMERO SIGN */

		if (s == 0) {
			if (w == 0xA5) { /* YEN SIGN */
				s = 0x5C;
			} else if (w == 0x2014) { /* EM DASH */
				s = 0x213D;
			} else if (w == 0xFF3C) { /* FULLWIDTH REVERSE SOLIDUS */
				s = 0x2140;
			} else if (w == 0x2225) { /* PARALLEL TO */
				s = 0x2142;
			} else if (w == 0xFF0D) { /* FULLWIDTH HYPHEN-MINUS */
				s = 0x215D;
			} else if (w == 0xFFE0) { /* FULLWIDTH CENT SIGN */
				s = 0x2171;
			} else if (w == 0xFFE1) { /* FULLWIDTH POUND SIGN */
				s = 0x2172;
			} else if (w == 0xFFE2) { /* FULLWIDTH NOT SIGN */
				s = 0x224C;
			} else {
				/* CP932 vendor ext1: the table index determines the cell */
				for (int i = 0; i < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; i++) {
					if (cp932ext1_ucs_table[i] == w) {
						s = (((i/94) + (cp932ext1_ucs_table_min/94) + 0x21) << 8) + (i%94) + 0x21;
						break;
					}
				}

				/* CP932 vendor ext3: the code comes from the parallel EUC-JP list */
				if (!s) {
					for (int i = 0; i < cp932ext3_ucs_table_max - cp932ext3_ucs_table_min; i++) {
						if (cp932ext3_ucs_table[i] == w) {
							/* Never index past the end of the EUC-JP list,
							 * even if the two tables differ in length */
							if (i < cp932ext3_eucjp_table_size) {
								s = cp932ext3_eucjp_table[i];
							}
							break;
						}
					}
				}
			}
		}

		if (!s) {
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_eucjpwin);
			/* Error handling may have written output; reserve room again */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
		} else if (s < 0x80) {
			out = mb_convert_buf_add(out, s);
		} else if (s < 0x100) {
			out = mb_convert_buf_add2(out, 0x8E, s);
		} else if (s < 0x8080) {
			out = mb_convert_buf_add2(out, ((s >> 8) & 0xFF) | 0x80, (s & 0xFF) | 0x80);
		} else {
			/* Three-byte form needs more than the two bytes reserved per codepoint */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 3);
			out = mb_convert_buf_add3(out, 0x8F, ((s >> 8) & 0xFF) | 0x80, (s & 0xFF) | 0x80);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
8856 
/* Byte-at-a-time CP51932 decoder.
 *
 * `filter->status` records how far into a multi-byte character we are:
 *   0  expecting the first byte of a character
 *   1  a two-byte lead byte (0xA1..0xFE) is held in `filter->cache`
 *   2  0x8E was seen; one kana byte follows
 * Each completed character is passed to `filter->output_function`; invalid
 * sequences produce MBFL_BAD_INPUT. Returns 0, or leaves early through CK()
 * when the output function reports failure. */
static int mbfl_filt_conv_cp51932_wchar(int c, mbfl_convert_filter *filter)
{
	int lead, pos, ucs;

	switch (filter->status) {
	case 0:
		if (c >= 0 && c < 0x80) {
			/* latin: passes through unchanged */
			CK((*filter->output_function)(c, filter->data));
		} else if (c >= 0xA1 && c <= 0xFE) {
			/* first byte of a two-byte character */
			filter->cache = c;
			filter->status = 1;
		} else if (c == 0x8e) {
			/* kana prefix */
			filter->status = 2;
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 1: /* second byte of a two-byte character */
		filter->status = 0;
		if (c <= 0xa0 || c >= 0xff) {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			break;
		}

		lead = filter->cache;
		pos = (lead - 0xa1)*94 + c - 0xa1;

		/* A few cells are mapped by hand rather than through the tables */
		switch (pos) {
		case 31:
			ucs = 0xff3c; /* FULLWIDTH REVERSE SOLIDUS */
			break;
		case 32:
			ucs = 0xff5e; /* FULLWIDTH TILDE */
			break;
		case 33:
			ucs = 0x2225; /* PARALLEL TO */
			break;
		case 60:
			ucs = 0xff0d; /* FULLWIDTH HYPHEN-MINUS */
			break;
		case 80:
			ucs = 0xffe0; /* FULLWIDTH CENT SIGN */
			break;
		case 81:
			ucs = 0xffe1; /* FULLWIDTH POUND SIGN */
			break;
		case 137:
			ucs = 0xffe2; /* FULLWIDTH NOT SIGN */
			break;
		default:
			ucs = 0;
			break;
		}

		if (ucs == 0) {
			if (pos >= cp932ext1_ucs_table_min && pos < cp932ext1_ucs_table_max) {
				/* vendor ext1 (13ku) */
				ucs = cp932ext1_ucs_table[pos - cp932ext1_ucs_table_min];
			} else if (pos >= 0 && pos < jisx0208_ucs_table_size) {
				/* X 0208 */
				ucs = jisx0208_ucs_table[pos];
			} else if (pos >= cp932ext2_ucs_table_min && pos < cp932ext2_ucs_table_max) {
				/* vendor ext2 (89ku - 92ku) */
				ucs = cp932ext2_ucs_table[pos - cp932ext2_ucs_table_min];
			}
		}

		if (ucs <= 0) {
			ucs = MBFL_BAD_INPUT;
		}
		CK((*filter->output_function)(ucs, filter->data));
		break;

	case 2: /* byte following 0x8e: X0201 kana */
		filter->status = 0;
		if (c > 0xa0 && c < 0xe0) {
			ucs = 0xfec0 + c;
			CK((*filter->output_function)(ucs, filter->data));
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

		EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
8931 
/* Finish a CP51932 decode. If the filter is still part-way through a
 * character (status is non-zero), emit one MBFL_BAD_INPUT for it and reset
 * the status; then invoke the flush function, if one is set. Always returns 0. */
static int mbfl_filt_conv_cp51932_wchar_flush(mbfl_convert_filter *filter)
{
	if (filter->status != 0) {
		/* The input ended in the middle of a multi-byte character */
		filter->output_function(MBFL_BAD_INPUT, filter->data);
		filter->status = 0;
	}

	if (filter->flush_function != NULL) {
		filter->flush_function(filter->data);
	}

	return 0;
}
8946 
/* Encode one Unicode codepoint `c` as CP51932 and pass the resulting bytes to
 * `filter->output_function`.
 *
 * The codepoint is resolved to an internal code through the ucs_*_jis tables,
 * then a list of hand-mapped punctuation, then linear searches of the CP932
 * vendor ext1 and ext2 tables. A code below 0 means "no mapping" and is
 * reported via mbfl_filt_conv_illegal_output().
 * Returns 0, or leaves early through CK() when output fails. */
static int mbfl_filt_conv_wchar_cp51932(int c, mbfl_convert_filter *filter)
{
	int code = 0, i, n;

	if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
		code = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
	} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
		code = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
	} else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
		code = ucs_i_jis_table[c - ucs_i_jis_table_min];
	} else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
		code = ucs_r_jis_table[c - ucs_r_jis_table_min];
	}

	/* Table codes of 0x8080 and above cannot be written in this encoding
	 * (the eucJP-win encoder in this file emits them with a 0x8F prefix as
	 * X 0212), so treat them as unmapped and try the fallbacks instead */
	if (code >= 0x8080) {
		code = -1;
	}

	if (code <= 0) {
		switch (c) {
		case 0xa5: /* YEN SIGN */
			code = 0x216F; /* FULLWIDTH YEN SIGN */
			break;
		case 0xff3c: /* FULLWIDTH REVERSE SOLIDUS */
			code = 0x2140;
			break;
		case 0x2225: /* PARALLEL TO */
			code = 0x2142;
			break;
		case 0xff0d: /* FULLWIDTH HYPHEN-MINUS */
			code = 0x215d;
			break;
		case 0xffe0: /* FULLWIDTH CENT SIGN */
			code = 0x2171;
			break;
		case 0xffe1: /* FULLWIDTH POUND SIGN */
			code = 0x2172;
			break;
		case 0xffe2: /* FULLWIDTH NOT SIGN */
			code = 0x224c;
			break;
		default:
			code = -1;

			/* CP932 vendor ext1 (13ku) */
			n = cp932ext1_ucs_table_max - cp932ext1_ucs_table_min;
			for (i = 0; i < n; i++) {
				if (c == cp932ext1_ucs_table[i]) {
					code = ((i/94 + 0x2d) << 8) + (i%94 + 0x21);
					break;
				}
			}

			/* CP932 vendor ext2 (the table the decoder labels 89ku - 92ku) */
			if (code < 0) {
				n = cp932ext2_ucs_table_max - cp932ext2_ucs_table_min;
				for (i = 0; i < n; i++) {
					if (c == cp932ext2_ucs_table[i]) {
						code = ((i/94 + 0x79) << 8) + (i%94 + 0x21);
						break;
					}
				}
			}
			break;
		}

		/* NUL always encodes as a zero byte, whatever the searches found */
		if (c == 0) {
			code = 0;
		} else if (code <= 0) {
			code = -1;
		}
	}

	if (code < 0 || code >= 0x8080) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
	} else if (code < 0x80) {
		/* latin */
		CK((*filter->output_function)(code, filter->data));
	} else if (code < 0x100) {
		/* kana */
		CK((*filter->output_function)(0x8e, filter->data));
		CK((*filter->output_function)(code, filter->data));
	} else {
		/* X 0208 */
		CK((*filter->output_function)(((code >> 8) & 0xff) | 0x80, filter->data));
		CK((*filter->output_function)((code & 0xff) | 0x80, filter->data));
	}

	return 0;
}
9025 
mb_cp51932_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)9026 static size_t mb_cp51932_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
9027 {
9028 	unsigned char *p = *in, *e = p + *in_len;
9029 	uint32_t *out = buf, *limit = buf + bufsize;
9030 
9031 	while (p < e && out < limit) {
9032 		unsigned char c = *p++;
9033 
9034 		if (c < 0x80) {
9035 			*out++ = c;
9036 		} else if (c >= 0xA1 && c <= 0xFE && p < e) {
9037 			unsigned char c2 = *p++;
9038 			if (c2 >= 0xA1 && c2 <= 0xFE) {
9039 				unsigned int s = (c - 0xA1)*94 + c2 - 0xA1, w = 0;
9040 
9041 				if (s <= 137) {
9042 					if (s == 31) {
9043 						w = 0xFF3C; /* FULLWIDTH REVERSE SOLIDUS */
9044 					} else if (s == 32) {
9045 						w = 0xFF5E; /* FULLWIDTH TILDE */
9046 					} else if (s == 33) {
9047 						w = 0x2225; /* PARALLEL TO */
9048 					} else if (s == 60) {
9049 						w = 0xFF0D; /* FULLWIDTH HYPHEN-MINUS */
9050 					} else if (s == 80) {
9051 						w = 0xFFE0; /* FULLWIDTH CENT SIGN */
9052 					} else if (s == 81) {
9053 						w = 0xFFE1; /* FULLWIDTH POUND SIGN */
9054 					} else if (s == 137) {
9055 						w = 0xFFE2; /* FULLWIDTH NOT SIGN */
9056 					}
9057 				}
9058 
9059 				if (w == 0) {
9060 					if (s >= cp932ext1_ucs_table_min && s < cp932ext1_ucs_table_max) {
9061 						w = cp932ext1_ucs_table[s - cp932ext1_ucs_table_min];
9062 					} else if (s < jisx0208_ucs_table_size) {
9063 						w = jisx0208_ucs_table[s];
9064 					} else if (s >= cp932ext2_ucs_table_min && s < cp932ext2_ucs_table_max) {
9065 						w = cp932ext2_ucs_table[s - cp932ext2_ucs_table_min];
9066 					}
9067 				}
9068 
9069 				if (!w)
9070 					w = MBFL_BAD_INPUT;
9071 				*out++ = w;
9072 			} else {
9073 				*out++ = MBFL_BAD_INPUT;
9074 			}
9075 		} else if (c == 0x8E && p < e) {
9076 			unsigned char c2 = *p++;
9077 			if (c2 >= 0xA1 && c2 <= 0xDF) {
9078 				*out++ = 0xFEC0 + c2;
9079 			} else {
9080 				*out++ = MBFL_BAD_INPUT;
9081 			}
9082 		} else {
9083 			*out++ = MBFL_BAD_INPUT;
9084 		}
9085 	}
9086 
9087 	*in_len = e - p;
9088 	*in = p;
9089 	return out - buf;
9090 }
9091 
/* Encode `len` Unicode codepoints from `in` as CP51932, appending the bytes
 * to `buf`.
 *
 * Each codepoint is resolved to an internal code through the ucs_*_jis tables,
 * then a list of hand-mapped punctuation, then linear searches of the CP932
 * vendor ext1 and ext2 tables. The code then selects the output form:
 *   below 0x80      a single byte
 *   0x80 .. 0xFF    0x8E followed by the byte (kana)
 *   0x100 and up    two bytes, each with the high bit set
 * Codepoints with no usable code are reported through MB_CONVERT_ERROR.
 * `end` is not consulted by this encoder. */
static void mb_wchar_to_cp51932(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	/* No output form here is longer than two bytes */
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);

	while (len > 0) {
		uint32_t cp = *in++;
		unsigned int jis = 0;
		len--; /* from here on, `len` counts the codepoints still to come */

		if (cp == 0) {
			/* NUL is passed straight through */
			out = mb_convert_buf_add(out, 0);
			continue;
		}

		if (cp >= ucs_a1_jis_table_min && cp < ucs_a1_jis_table_max) {
			jis = ucs_a1_jis_table[cp - ucs_a1_jis_table_min];
		} else if (cp >= ucs_a2_jis_table_min && cp < ucs_a2_jis_table_max) {
			jis = ucs_a2_jis_table[cp - ucs_a2_jis_table_min];
		} else if (cp >= ucs_i_jis_table_min && cp < ucs_i_jis_table_max) {
			jis = ucs_i_jis_table[cp - ucs_i_jis_table_min];
		} else if (cp >= ucs_r_jis_table_min && cp < ucs_r_jis_table_max) {
			jis = ucs_r_jis_table[cp - ucs_r_jis_table_min];
		}

		/* Table codes of 0x8080 and above cannot be written in this
		 * encoding; treat them as unmapped and try the fallbacks instead */
		if (jis >= 0x8080) {
			jis = 0;
		}

		if (!jis) {
			switch (cp) {
			case 0xA5: /* YEN SIGN */
				jis = 0x216F; /* FULLWIDTH YEN SIGN */
				break;
			case 0xFF3C: /* FULLWIDTH REVERSE SOLIDUS */
				jis = 0x2140;
				break;
			case 0x2225: /* PARALLEL TO */
				jis = 0x2142;
				break;
			case 0xFF0D: /* FULLWIDTH HYPHEN-MINUS */
				jis = 0x215D;
				break;
			case 0xFFE0: /* FULLWIDTH CENT SIGN */
				jis = 0x2171;
				break;
			case 0xFFE1: /* FULLWIDTH POUND SIGN */
				jis = 0x2172;
				break;
			case 0xFFE2: /* FULLWIDTH NOT SIGN */
				jis = 0x224C;
				break;
			default:
				/* CP932 vendor ext1; a hit always yields a non-zero code */
				for (int i = 0; i < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; i++) {
					if (cp932ext1_ucs_table[i] == cp) {
						jis = ((i/94 + 0x2D) << 8) + (i%94) + 0x21;
						break;
					}
				}

				/* CP932 vendor ext2, only when ext1 had no match */
				if (!jis) {
					for (int i = 0; i < cp932ext2_ucs_table_max - cp932ext2_ucs_table_min; i++) {
						if (cp932ext2_ucs_table[i] == cp) {
							jis = ((i/94 + 0x79) << 8) + (i%94) + 0x21;
							break;
						}
					}
				}
				break;
			}
		}

		if (!jis || jis >= 0x8080) {
			MB_CONVERT_ERROR(buf, out, limit, cp, mb_wchar_to_cp51932);
			/* Error handling may have written output; reserve room again */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
		} else if (jis < 0x80) {
			out = mb_convert_buf_add(out, jis);
		} else if (jis < 0x100) {
			out = mb_convert_buf_add2(out, 0x8E, jis);
		} else {
			out = mb_convert_buf_add2(out, ((jis >> 8) & 0xFF) | 0x80, (jis & 0xFF) | 0x80);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
9164 
mb_eucjp2004_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)9165 static size_t mb_eucjp2004_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
9166 {
9167 	unsigned char *p = *in, *e = p + *in_len;
9168 	uint32_t *out = buf, *limit = buf + bufsize - 1;
9169 
9170 	while (p < e && out < limit) {
9171 		unsigned char c = *p++;
9172 
9173 		if (c <= 0x7F) {
9174 			*out++ = c;
9175 		} else if (c >= 0xA1 && c <= 0xFE) {
9176 			/* Kanji */
9177 			if (p == e) {
9178 				*out++ = MBFL_BAD_INPUT;
9179 				break;
9180 			}
9181 			unsigned char c2 = *p++;
9182 			if (c2 <= 0xA0 || c2 == 0xFF) {
9183 				*out++ = MBFL_BAD_INPUT;
9184 				continue;
9185 			}
9186 
9187 			unsigned int s1 = c - 0x80, s2 = c2 - 0x80;
9188 			unsigned int w1 = (s1 << 8) | s2, w = 0;
9189 
9190 			/* Conversion for combining characters */
9191 			if ((w1 >= 0x2477 && w1 <= 0x2479) || (w1 >= 0x2479 && w1 <= 0x247B) || (w1 >= 0x2577 && w1 <= 0x257E) || w1 == 0x2678 || w1 == 0x2B44 || (w1 >= 0x2B48 && w1 <= 0x2B4F) || (w1 >= 0x2B65 && w1 <= 0x2B66)) {
9192 				int k = mbfl_bisec_srch2(w1, jisx0213_u2_key, jisx0213_u2_tbl_len);
9193 				if (k >= 0) {
9194 					*out++ = jisx0213_u2_tbl[2*k];
9195 					*out++ = jisx0213_u2_tbl[2*k+1];
9196 					continue;
9197 				}
9198 			}
9199 
9200 			/* Conversion for BMP  */
9201 			w1 = (s1 - 0x21)*94 + s2 - 0x21;
9202 			if (w1 < jisx0213_ucs_table_size) {
9203 				w = jisx0213_ucs_table[w1];
9204 			}
9205 
9206 			/* Conversion for CJK Unified Ideographs ext.B (U+2XXXX) */
9207 			if (!w) {
9208 				int k = mbfl_bisec_srch2(w1, jisx0213_jis_u5_key, jisx0213_u5_tbl_len);
9209 				if (k >= 0) {
9210 					w = jisx0213_jis_u5_tbl[k] + 0x20000;
9211 				}
9212 			}
9213 
9214 			*out++ = w ? w : MBFL_BAD_INPUT;
9215 		} else if (c == 0x8E && p < e) {
9216 			/* Kana */
9217 			unsigned char c2 = *p++;
9218 			if (c2 >= 0xA1 && c2 <= 0xDF) {
9219 				*out++ = 0xFEC0 + c2;
9220 			} else {
9221 				*out++ = MBFL_BAD_INPUT;
9222 			}
9223 		} else if (c == 0x8F && p < e) {
9224 			unsigned char c2 = *p++;
9225 			if ((c2 == 0xA1 || (c2 >= 0xA3 && c2 <= 0xA5) || c2 == 0xA8 || (c2 >= 0xAC && c2 <= 0xAF) || (c2 >= 0xEE && c2 <= 0xFE)) && p < e) {
9226 				unsigned char c3 = *p++;
9227 
9228 				if (c3 < 0xA1 || c3 == 0xFF) {
9229 					*out++ = MBFL_BAD_INPUT;
9230 					continue;
9231 				}
9232 
9233 				unsigned int s1 = c2 - 0xA1, s2 = c3 - 0xA1;
9234 
9235 				if (((s1 <= 4 && s1 != 1) || s1 == 7 || (s1 >= 11 && s1 <= 14) || (s1 >= 77 && s1 < 94)) && s2 < 94) {
9236 					int k;
9237 					for (k = 0; k < jisx0213_p2_ofst_len; k++) {
9238 						if (s1 == jisx0213_p2_ofst[k]) {
9239 							break;
9240 						}
9241 					}
9242 					k -= jisx0213_p2_ofst[k];
9243 
9244 					/* Check for Japanese chars in BMP */
9245 					unsigned int s = (s1 + 94 + k)*94 + s2;
9246 					ZEND_ASSERT(s < jisx0213_ucs_table_size);
9247 					unsigned int w = jisx0213_ucs_table[s];
9248 
9249 					/* Check for Japanese chars in CJK Unified Ideographs ext B (U+2XXXX) */
9250 					if (!w) {
9251 						k = mbfl_bisec_srch2(s, jisx0213_jis_u5_key, jisx0213_u5_tbl_len);
9252 						if (k >= 0) {
9253 							w = jisx0213_jis_u5_tbl[k] + 0x20000;
9254 						}
9255 					}
9256 
9257 					*out++ = w ? w : MBFL_BAD_INPUT;
9258 				} else {
9259 					*out++ = MBFL_BAD_INPUT;
9260 				}
9261 			} else {
9262 				*out++ = MBFL_BAD_INPUT;
9263 			}
9264 		} else {
9265 			*out++ = MBFL_BAD_INPUT;
9266 		}
9267 	}
9268 
9269 	*in_len = e - p;
9270 	*in = p;
9271 	return out - buf;
9272 }
9273 
/*
 * Encode `len` Unicode codepoints from `in` as EUC-JP-2004 bytes appended to `buf`.
 *
 * Some codepoints can start a two-codepoint sequence which maps to a single
 * JIS X 0213 character. If such a codepoint is the last one of this chunk and
 * `end` is false, it is parked in buf->state and handled first on the next call,
 * when the following codepoint is available.
 *
 * Codepoints without a mapping (other than U+0000) go through MB_CONVERT_ERROR.
 */
static void mb_wchar_to_eucjp2004(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	uint32_t w;
	if (buf->state) {
		/* Pick up the codepoint which was held back at the end of the previous chunk */
		w = buf->state;
		buf->state = 0;
		goto process_codepoint;
	}

	while (len--) {
		w = *in++;
process_codepoint: ;
		unsigned int jis = 0;

		/* Could this be the 1st codepoint of a combining pair? */
		if (w == 0xE6 || w == 0x31F7 || (w >= 0x254 && w <= 0x2E9) || (w >= 0x304B && w <= 0x3053) || (w >= 0x30AB && w <= 0x30C8)) {
			for (int i = 0; i < jisx0213_u2_tbl_len; i++) {
				if (w != jisx0213_u2_tbl[2*i]) {
					continue;
				}

				if (len) {
					/* Peek at the following codepoint */
					uint32_t next = *in++;
					len--;
					if (next == 0x301 && (w == 0x254 || w == 0x28C || w == 0x259 || w == 0x25A)) {
						/* For these four, the pairing with U+0301 is in the next table entry */
						i++;
					}
					if (next == jisx0213_u2_tbl[2*i+1]) {
						jis = jisx0213_u2_key[i];
						break;
					}
					/* Not a recognized pair; push the 2nd codepoint back */
					in--;
					len++;
				} else if (!end) {
					/* Out of input, but more may follow; wait for the next chunk */
					buf->state = w;
					MB_CONVERT_BUF_STORE(buf, out, limit);
					return;
				}

				/* Encode the 1st codepoint alone, using the fallback table */
				jis = jisx0213_u2_fb_tbl[i];
				break;
			}
		}

		/* Directly indexed tables (per the original comment: major Japanese chars, U+4E00-U+9FFF) */
		if (!jis) {
			for (int i = 0; i < uni2jis_tbl_len; i++) {
				if (w < uni2jis_tbl_range[i][0] || w > uni2jis_tbl_range[i][1]) {
					continue;
				}
				jis = uni2jis_tbl[i][w - uni2jis_tbl_range[i][0]];
				break;
			}
		}

		/* Compressed mapping area (per the original comment: U+1E00-U+4DBF) */
		if (!jis && w >= ucs_c1_jisx0213_min && w <= ucs_c1_jisx0213_max) {
			int i = mbfl_bisec_srch(w, ucs_c1_jisx0213_tbl, ucs_c1_jisx0213_tbl_len);
			if (i >= 0) {
				jis = ucs_c1_jisx0213_ofst[i] + w - ucs_c1_jisx0213_tbl[2*i];
			}
		}

		/* CJK Unified Ideographs ext.B (U+2XXXX); the key table is searched with 0x20000 subtracted */
		if (!jis && w >= jisx0213_u5_tbl_min && w <= jisx0213_u5_tbl_max) {
			int i = mbfl_bisec_srch2(w - 0x20000, jisx0213_u5_jis_key, jisx0213_u5_tbl_len);
			if (i >= 0) {
				jis = jisx0213_u5_jis_tbl[i];
			}
		}

		if (!jis) {
			switch (w) {
			/* Two codepoints from CJK Compatibility Forms */
			case 0xFE45:
				jis = 0x233E;
				break;
			case 0xFE46:
				jis = 0x233D;
				break;
			default:
				/* CJK Compatibility Ideographs */
				if (w >= 0xF91D && w <= 0xF9DC) {
					int i = mbfl_bisec_srch2(w, ucs_r2b_jisx0213_cmap_key, ucs_r2b_jisx0213_cmap_len);
					if (i >= 0) {
						jis = ucs_r2b_jisx0213_cmap_val[i];
					}
				}
				break;
			}
		}

		if (!jis && w) {
			/* No mapping found (U+0000 is excluded; it is written as a zero byte below) */
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_eucjp2004);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		} else if (jis <= 0x7F) {
			/* Single byte */
			out = mb_convert_buf_add(out, jis);
		} else if (jis <= 0xFF) {
			/* Two bytes: 0x8E prefix, then the value itself */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			out = mb_convert_buf_add2(out, 0x8E, jis);
		} else if (jis <= 0x7EFF) {
			/* Two bytes: both halves of the value, each offset by 0x80 */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			out = mb_convert_buf_add2(out, ((jis >> 8) & 0xFF) + 0x80, (jis & 0xFF) + 0x80);
		} else {
			/* Three bytes: 0x8F prefix; the row byte is looked up in jisx0213_p2_ofst */
			unsigned int cell = jis & 0xFF;
			int idx = ((jis >> 8) & 0xFF) - 0x7F;
			ZEND_ASSERT(idx < jisx0213_p2_ofst_len);
			unsigned int row = jisx0213_p2_ofst[idx] + 0x21;
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 3);
			out = mb_convert_buf_add3(out, 0x8F, row | 0x80, cell | 0x80);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
9385 
/*
 * Byte-at-a-time decoder: EUC-CN -> Unicode.
 *
 * `c` is the next input byte. filter->status is 0 when a new character is
 * expected and 1 when a lead byte is stored in filter->cache. Every decoded
 * codepoint, or MBFL_BAD_INPUT for an invalid sequence, is passed to
 * filter->output_function.
 */
static int mbfl_filt_conv_euccn_wchar(int c, mbfl_convert_filter *filter)
{
	switch (filter->status) {
	case 0:
		if (c >= 0 && c < 0x80) {
			/* ASCII range is passed through */
			CK(filter->output_function(c, filter->data));
		} else if ((c >= 0xB0 && c <= 0xF7) || (c >= 0xA1 && c <= 0xA9)) {
			/* Lead byte of a 2-byte character; keep it until the trail byte arrives */
			filter->cache = c;
			filter->status = 1;
		} else {
			CK(filter->output_function(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 1: { /* trail byte of a 2-byte character */
		int lead = filter->cache;
		int wc = 0;

		filter->status = 0;

		if (c >= 0xA1 && c <= 0xFE) {
			/* The CP936 table is reused; it has 192 entries per lead byte, starting at 0x81/0x40 */
			int idx = (lead - 0x81)*192 + c - 0x40;
			ZEND_ASSERT(idx < cp936_ucs_table_size);

			if (idx == 0x1864) {
				wc = 0x30FB;
			} else if (idx == 0x186A) {
				wc = 0x2015;
			} else if (idx == 0x1963 || (idx >= 0x1921 && idx <= 0x192A) || (idx >= 0x1C59 && idx <= 0x1C7E) || (idx >= 0x1DBB && idx <= 0x1DC4)) {
				/* These table slots are deliberately treated as unmapped */
				wc = 0;
			} else {
				wc = cp936_ucs_table[idx];
			}
		}

		/* Both an invalid trail byte and an unmapped slot end up here with wc == 0 */
		if (wc <= 0) {
			wc = MBFL_BAD_INPUT;
		}
		CK(filter->output_function(wc, filter->data));
		break;
	}

		EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
9433 
/*
 * Codepoint-at-a-time encoder: Unicode -> EUC-CN.
 *
 * The CP936 reverse tables are consulted, with a handful of codepoints
 * overridden or suppressed, and any result whose bytes are not both >= 0xA1
 * (i.e. a CP936 extension) is discarded. Codepoints below 0x80 are emitted as
 * a single byte; anything else without a mapping is reported through
 * mbfl_filt_conv_illegal_output.
 */
static int mbfl_filt_conv_wchar_euccn(int c, mbfl_convert_filter *filter)
{
	int s = 0;

	if (c >= ucs_a1_cp936_table_min && c < ucs_a1_cp936_table_max) {
		bool suppressed = (c == 0xB7 || c == 0x144 || c == 0x148 || c == 0x251 || c == 0x261);
		if (!suppressed) {
			s = ucs_a1_cp936_table[c - ucs_a1_cp936_table_min];
		}
	} else if (c >= ucs_a2_cp936_table_min && c < ucs_a2_cp936_table_max) {
		switch (c) {
		case 0x2015:
			s = 0xA1AA;
			break;
		case 0x2014:
			s = 0;
			break;
		default:
			if (c < 0x2170 || c > 0x2179) {
				s = ucs_a2_cp936_table[c - ucs_a2_cp936_table_min];
			}
			break;
		}
	} else if (c >= ucs_a3_cp936_table_min && c < ucs_a3_cp936_table_max) {
		s = (c == 0x30FB) ? 0xA1A4 : ucs_a3_cp936_table[c - ucs_a3_cp936_table_min];
	} else if (c >= ucs_i_cp936_table_min && c < ucs_i_cp936_table_max) {
		s = ucs_i_cp936_table[c - ucs_i_cp936_table_min];
	} else if (c >= ucs_hff_cp936_table_min && c < ucs_hff_cp936_table_max) {
		if (c == 0xFF04) {
			s = 0xA1E7;
		} else if (c == 0xFF5E) {
			s = 0xA1AB;
		} else if (c >= 0xFF01 && c <= 0xFF5D) {
			/* This run is mapped arithmetically, starting at 0xA3A1 */
			s = c - 0xFF01 + 0xA3A1;
		} else if (c >= 0xFFE0 && c <= 0xFFE5) {
			s = ucs_hff_s_cp936_table[c - 0xFFE0];
		}
	}

	/* Drop CP936 extensions: both bytes must be 0xA1 or higher */
	if ((s & 0xFF) < 0xA1 || ((s >> 8) & 0xFF) < 0xA1) {
		s = 0;
	}

	if (s <= 0) {
		/* No 2-byte mapping; codepoints below 0x80 are written as-is */
		s = (c < 0x80) ? c : -1;
	}

	if (s < 0) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
	} else if (s < 0x80) {
		CK(filter->output_function(s, filter->data));
	} else {
		CK(filter->output_function((s >> 8) & 0xFF, filter->data));
		CK(filter->output_function(s & 0xFF, filter->data));
	}

	return 0;
}
9498 
/*
 * Flush hook for the EUC-CN decoder. If input stopped while a lead byte was
 * pending (status 1), MBFL_BAD_INPUT is emitted for it. Then
 * filter->flush_function, when set, is invoked with filter->data.
 */
static int mbfl_filt_conv_euccn_wchar_flush(mbfl_convert_filter *filter)
{
	int lead_byte_pending = (filter->status == 1);

	if (lead_byte_pending) {
		filter->status = 0;
		CK(filter->output_function(MBFL_BAD_INPUT, filter->data));
	}

	if (filter->flush_function) {
		filter->flush_function(filter->data);
	}

	return 0;
}
9513 
mb_euccn_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)9514 static size_t mb_euccn_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
9515 {
9516 	unsigned char *p = *in, *e = p + *in_len;
9517 	uint32_t *out = buf, *limit = buf + bufsize;
9518 
9519 	while (p < e && out < limit) {
9520 		unsigned char c = *p++;
9521 
9522 		if (c < 0x80) {
9523 			*out++ = c;
9524 		} else if (((c >= 0xA1 && c <= 0xA9) || (c >= 0xB0 && c <= 0xF7)) && p < e) {
9525 			unsigned char c2 = *p++;
9526 
9527 			if (c2 >= 0xA1 && c2 <= 0xFE) {
9528 				unsigned int w = (c - 0x81)*192 + c2 - 0x40;
9529 				ZEND_ASSERT(w < cp936_ucs_table_size);
9530 				if (w == 0x1864) {
9531 					w = 0x30FB;
9532 				} else if (w == 0x186A) {
9533 					w = 0x2015;
9534 				} else if ((w >= 0x1921 && w <= 0x192A) || w == 0x1963 || (w >= 0x1C59 && w <= 0x1C7E) || (w >= 0x1DBB && w <= 0x1DC4)) {
9535 					w = 0;
9536 				} else {
9537 					w = cp936_ucs_table[w];
9538 				}
9539 
9540 				if (!w)
9541 					w = MBFL_BAD_INPUT;
9542 				*out++ = w;
9543 			} else {
9544 				*out++ = MBFL_BAD_INPUT;
9545 			}
9546 		} else {
9547 			*out++ = MBFL_BAD_INPUT;
9548 		}
9549 	}
9550 
9551 	*in_len = e - p;
9552 	*in = p;
9553 	return out - buf;
9554 }
9555 
/*
 * Bulk encoder: Unicode -> EUC-CN.
 *
 * Appends the encoding of `len` codepoints from `in` to `buf`. Space for two
 * bytes per remaining codepoint is reserved up front and again after every
 * MB_CONVERT_ERROR. Mapping rules are the same as in the
 * codepoint-at-a-time encoder: CP936 reverse tables with a few overrides, and
 * CP936 extensions rejected. `end` is not used.
 */
static void mb_wchar_to_euccn(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);

	while (len--) {
		uint32_t w = *in++;
		unsigned int code = 0;

		if (w >= ucs_a1_cp936_table_min && w < ucs_a1_cp936_table_max) {
			bool suppressed = (w == 0xB7 || w == 0x144 || w == 0x148 || w == 0x251 || w == 0x261);
			if (!suppressed) {
				code = ucs_a1_cp936_table[w - ucs_a1_cp936_table_min];
			}
		} else if (w >= ucs_a2_cp936_table_min && w < ucs_a2_cp936_table_max) {
			if (w == 0x2015) {
				code = 0xA1AA;
			} else if (w == 0x2014 || (w >= 0x2170 && w <= 0x2179)) {
				/* deliberately left unmapped */
			} else {
				code = ucs_a2_cp936_table[w - ucs_a2_cp936_table_min];
			}
		} else if (w >= ucs_a3_cp936_table_min && w < ucs_a3_cp936_table_max) {
			if (w == 0x30FB) {
				code = 0xA1A4;
			} else {
				code = ucs_a3_cp936_table[w - ucs_a3_cp936_table_min];
			}
		} else if (w >= ucs_i_cp936_table_min && w < ucs_i_cp936_table_max) {
			code = ucs_i_cp936_table[w - ucs_i_cp936_table_min];
		} else if (w >= ucs_hff_cp936_table_min && w < ucs_hff_cp936_table_max) {
			if (w == 0xFF04) {
				code = 0xA1E7;
			} else if (w == 0xFF5E) {
				code = 0xA1AB;
			} else if (w >= 0xFF01 && w <= 0xFF5D) {
				code = w - 0xFF01 + 0xA3A1;
			} else if (w >= 0xFFE0 && w <= 0xFFE5) {
				code = ucs_hff_s_cp936_table[w - 0xFFE0];
			}
		}

		/* Exclude CP936 extensions: both bytes must be 0xA1 or higher */
		unsigned int hi = (code >> 8) & 0xFF, lo = code & 0xFF;
		if (hi < 0xA1 || lo < 0xA1) {
			code = 0;
		}

		if (code >= 0x80) {
			out = mb_convert_buf_add2(out, hi, lo);
		} else if (code) {
			out = mb_convert_buf_add(out, code);
		} else if (w < 0x80) {
			/* No 2-byte mapping, but the codepoint fits in a single byte */
			out = mb_convert_buf_add(out, w);
		} else {
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_euccn);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
9617 
/*
 * Byte-at-a-time decoder: EUC-TW -> Unicode.
 *
 * filter->status tracks the position inside a character:
 *   0  start of a character
 *   1  got the lead byte of a 2-byte character (stored in filter->cache)
 *   2  got 0x8E, the first byte of a 4-byte character
 *   3  got the plane byte; filter->cache holds (plane byte - 0xA1)
 *   4  got the third byte; filter->cache holds ((plane - 1) << 8) + (byte - 0xA1)
 * Invalid input results in MBFL_BAD_INPUT being passed to the output function.
 */
static int mbfl_filt_conv_euctw_wchar(int c, mbfl_convert_filter *filter)
{
	switch (filter->status) {
	case 0:
		if (c >= 0 && c < 0x80) {
			/* ASCII range is passed through */
			CK(filter->output_function(c, filter->data));
		} else if (c == 0x8E) {
			/* First byte of a 4-byte character */
			filter->status = 2;
		} else if (c != 0xC3 && ((c >= 0xA1 && c <= 0xA6) || (c >= 0xC2 && c <= 0xFD))) {
			/* First byte of a 2-byte character */
			filter->cache = c;
			filter->status = 1;
		} else {
			CK(filter->output_function(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 1: { /* second byte of a 2-byte character */
		int lead = filter->cache;
		filter->status = 0;

		if (c <= 0xA0 || c >= 0xFF) {
			filter->status = filter->cache = 0;
			CK(filter->output_function(MBFL_BAD_INPUT, filter->data));
			break;
		}

		int idx = (lead - 0xA1)*94 + (c - 0xA1);
		int wc = 0;
		if (idx >= 0 && idx < cns11643_1_ucs_table_size) {
			wc = cns11643_1_ucs_table[idx];
		}
		if (wc <= 0) {
			wc = MBFL_BAD_INPUT;
		}
		CK(filter->output_function(wc, filter->data));
		break;
	}

	case 2: /* byte following 0x8E; only three values are accepted */
		if (c == 0xA1 || c == 0xA2 || c == 0xAE) {
			filter->cache = c - 0xA1;
			filter->status = 3;
		} else {
			filter->status = filter->cache = 0;
			CK(filter->output_function(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 3: { /* third byte of a 4-byte character */
		int plane = filter->cache; /* CNS-11643 plane minus one */
		bool row_ok = false;

		filter->status = 0;

		/* The range of acceptable bytes depends on the plane */
		if (c >= 0xA1) {
			if (plane == 0) {
				row_ok = (c <= 0xA6 || (c >= 0xC2 && c <= 0xFD)) && c != 0xC3;
			} else if (plane == 1) {
				row_ok = (c <= 0xF2);
			} else if (plane == 13) {
				row_ok = (c <= 0xE7);
			}
		}

		if (row_ok) {
			filter->status = 4;
			filter->cache = (plane << 8) + c - 0xA1;
		} else {
			filter->status = filter->cache = 0;
			CK(filter->output_function(MBFL_BAD_INPUT, filter->data));
		}
		break;
	}

	case 4: { /* fourth byte of a 4-byte character */
		int packed = filter->cache;
		filter->status = 0;

		if (packed > 0xDFF || c <= 0xA0 || c >= 0xFF) {
			filter->status = filter->cache = 0;
			CK(filter->output_function(MBFL_BAD_INPUT, filter->data));
			break;
		}

		int plane = (packed & 0xF00) >> 8; /* This is actually the CNS-11643 plane minus one */
		int idx = (packed & 0xFF)*94 + c - 0xA1;
		int wc = 0;

		if (idx >= 0) {
			/* A later version of CNS-11643 moved all the characters in "plane 14" to "plane 3",
			 * and added tens of thousands more characters in planes 4, 5, 6, and 7
			 * We only support the older version of CNS-11643
			 * This is the same as iconv from glibc 2.2 */
			switch (plane) {
			case 0:
				if (idx < cns11643_1_ucs_table_size) {
					wc = cns11643_1_ucs_table[idx];
				}
				break;
			case 1:
				if (idx < cns11643_2_ucs_table_size) {
					wc = cns11643_2_ucs_table[idx];
				}
				break;
			case 13:
				if (idx < cns11643_14_ucs_table_size) {
					wc = cns11643_14_ucs_table[idx];
				}
				break;
			default:
				break;
			}
		}

		if (wc <= 0) {
			wc = MBFL_BAD_INPUT;
		}
		CK(filter->output_function(wc, filter->data));
		break;
	}

		EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
9718 
/*
 * Codepoint-at-a-time encoder: Unicode -> EUC-TW.
 *
 * The CNS-11643 reverse tables yield a value whose bits 16-20 hold the plane
 * and whose low 16 bits hold the two code bytes (without the high bit set).
 * Plane values 0 and 1 are written as one byte (value < 0x80) or two bytes;
 * higher planes are written as four bytes: 0x8E, 0xA0 + plane, and the two
 * code bytes. U+0000 is written as a zero byte; any other codepoint without a
 * mapping is reported through mbfl_filt_conv_illegal_output.
 */
static int mbfl_filt_conv_wchar_euctw(int c, mbfl_convert_filter *filter)
{
	int s = 0;

	if (c >= ucs_a1_cns11643_table_min && c < ucs_a1_cns11643_table_max) {
		s = ucs_a1_cns11643_table[c - ucs_a1_cns11643_table_min];
	} else if (c >= ucs_a2_cns11643_table_min && c < ucs_a2_cns11643_table_max) {
		s = ucs_a2_cns11643_table[c - ucs_a2_cns11643_table_min];
	} else if (c >= ucs_a3_cns11643_table_min && c < ucs_a3_cns11643_table_max) {
		s = ucs_a3_cns11643_table[c - ucs_a3_cns11643_table_min];
	} else if (c >= ucs_i_cns11643_table_min && c < ucs_i_cns11643_table_max) {
		s = ucs_i_cns11643_table[c - ucs_i_cns11643_table_min];
	} else if (c >= ucs_r_cns11643_table_min && c < ucs_r_cns11643_table_max) {
		s = ucs_r_cns11643_table[c - ucs_r_cns11643_table_min];
	}

	if (s <= 0) {
		/* Only U+0000 may legitimately have no table entry */
		s = (c == 0) ? 0 : -1;
	}

	if (s < 0) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
		return 0;
	}

	int plane = (s & 0x1F0000) >> 16;

	if (plane <= 1 && s < 0x80) { /* latin */
		CK((*filter->output_function)(s, filter->data));
		return 0;
	}

	/* The two code bytes, with the high bit set as EUC requires.
	 * These are computed directly from the non-negative table value, rather than
	 * by first packing all four bytes into an `int` (0x8EA00000 + ... does not
	 * fit in a 32-bit signed int, so storing it and shifting it right again
	 * relied on implementation-defined behavior). */
	int hi = ((s >> 8) & 0xFF) | 0x80;
	int lo = (s & 0xFF) | 0x80;

	if (plane > 1) {
		CK((*filter->output_function)(0x8E, filter->data));
		CK((*filter->output_function)(0xA0 + plane, filter->data));
	}
	CK((*filter->output_function)(hi, filter->data));
	CK((*filter->output_function)(lo, filter->data));

	return 0;
}
9765 
/*
 * Flush hook for the EUC-TW decoder. Any non-zero status means that input
 * stopped in the middle of a 2-byte or 4-byte character, which is reported as
 * a single MBFL_BAD_INPUT. Then filter->flush_function, when set, is invoked
 * with filter->data.
 */
static int mbfl_filt_conv_euctw_wchar_flush(mbfl_convert_filter *filter)
{
	int mid_character = (filter->status != 0);

	if (mid_character) {
		filter->status = 0;
		CK(filter->output_function(MBFL_BAD_INPUT, filter->data));
	}

	if (filter->flush_function) {
		filter->flush_function(filter->data);
	}

	return 0;
}
9780 
mb_euctw_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)9781 static size_t mb_euctw_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
9782 {
9783 	unsigned char *p = *in, *e = p + *in_len;
9784 	uint32_t *out = buf, *limit = buf + bufsize;
9785 
9786 	while (p < e && out < limit) {
9787 		unsigned char c = *p++;
9788 
9789 		if (c < 0x80) {
9790 			*out++ = c;
9791 		} else if (((c >= 0xA1 && c <= 0xA6) || (c >= 0xC2 && c <= 0xFD)) && c != 0xC3 && p < e) {
9792 			unsigned char c2 = *p++;
9793 
9794 			if (c2 >= 0xA1 && c2 <= 0xFE) {
9795 				unsigned int w = (c - 0xA1)*94 + (c2 - 0xA1);
9796 				if (w < cns11643_1_ucs_table_size) {
9797 					w = cns11643_1_ucs_table[w];
9798 				} else {
9799 					w = 0;
9800 				}
9801 				if (!w)
9802 					w = MBFL_BAD_INPUT;
9803 				*out++ = w;
9804 			} else {
9805 				*out++ = MBFL_BAD_INPUT;
9806 			}
9807 		} else if (c == 0x8E && p < e) {
9808 			unsigned char c2 = *p++;
9809 
9810 			if ((c2 == 0xA1 || c2 == 0xA2 || c2 == 0xAE) && p < e) {
9811 				unsigned int plane = c2 - 0xA1; /* This is actually the CNS-11643 plane minus one */
9812 				unsigned char c3 = *p++;
9813 
9814 				if (c3 >= 0xA1 && ((plane == 0 && ((c3 >= 0xA1 && c3 <= 0xA6) || (c3 >= 0xC2 && c3 <= 0xFD)) && c3 != 0xC3) || (plane == 1 && c3 <= 0xF2) || (plane == 13 && c3 <= 0xE7)) && p < e) {
9815 					unsigned char c4 = *p++;
9816 
9817 					if (c2 <= 0xAE && c4 > 0xA0 && c4 < 0xFF) {
9818 						unsigned int s = (c3 - 0xA1)*94 + c4 - 0xA1, w = 0;
9819 
9820 						/* A later version of CNS-11643 moved all the characters in "plane 14" to "plane 3",
9821 						 * and added tens of thousands more characters in planes 4, 5, 6, and 7
9822 						 * We only support the older version of CNS-11643
9823 						 * This is the same as iconv from glibc 2.2 */
9824 						if (plane == 0 && s < cns11643_1_ucs_table_size) {
9825 							w = cns11643_1_ucs_table[s];
9826 						} else if (plane == 1 && s < cns11643_2_ucs_table_size) {
9827 							w = cns11643_2_ucs_table[s];
9828 						} else if (plane == 13 && s < cns11643_14_ucs_table_size) {
9829 							w = cns11643_14_ucs_table[s];
9830 						}
9831 
9832 						if (!w)
9833 							w = MBFL_BAD_INPUT;
9834 						*out++ = w;
9835 						continue;
9836 					}
9837 				}
9838 			}
9839 
9840 			*out++ = MBFL_BAD_INPUT;
9841 		} else {
9842 			*out++ = MBFL_BAD_INPUT;
9843 		}
9844 	}
9845 
9846 	*in_len = e - p;
9847 	*in = p;
9848 	return out - buf;
9849 }
9850 
/*
 * Bulk encoder: Unicode -> EUC-TW.
 *
 * Appends the encoding of `len` codepoints from `in` to `buf`. The table value
 * carries the plane in its bits above 15 and the two code bytes in its low 16
 * bits. Plane values 0 and 1 take one or two bytes; higher planes take four
 * bytes (0x8E, 0xA0 + plane, two code bytes), for which extra space is
 * reserved on demand. `end` is not used.
 */
static void mb_wchar_to_euctw(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);

	while (len--) {
		uint32_t w = *in++;
		unsigned int code = 0;

		if (w >= ucs_a1_cns11643_table_min && w < ucs_a1_cns11643_table_max) {
			code = ucs_a1_cns11643_table[w - ucs_a1_cns11643_table_min];
		} else if (w >= ucs_a2_cns11643_table_min && w < ucs_a2_cns11643_table_max) {
			code = ucs_a2_cns11643_table[w - ucs_a2_cns11643_table_min];
		} else if (w >= ucs_a3_cns11643_table_min && w < ucs_a3_cns11643_table_max) {
			code = ucs_a3_cns11643_table[w - ucs_a3_cns11643_table_min];
		} else if (w >= ucs_i_cns11643_table_min && w < ucs_i_cns11643_table_max) {
			code = ucs_i_cns11643_table[w - ucs_i_cns11643_table_min];
		} else if (w >= ucs_r_cns11643_table_min && w < ucs_r_cns11643_table_max) {
			code = ucs_r_cns11643_table[w - ucs_r_cns11643_table_min];
		}

		if (!code) {
			if (w == 0) {
				/* U+0000 has no table entry; it is written as a zero byte */
				out = mb_convert_buf_add(out, 0);
			} else {
				MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_euctw);
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
			}
			continue;
		}

		unsigned int plane = code >> 16;

		if (plane > 1) {
			/* 4-byte form; the initial reservation only covered 2 bytes per codepoint */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 4);
			out = mb_convert_buf_add4(out, 0x8E, 0xA0 + plane, ((code >> 8) & 0xFF) | 0x80, (code & 0xFF) | 0x80);
		} else if (code < 0x80) {
			out = mb_convert_buf_add(out, code);
		} else {
			out = mb_convert_buf_add2(out, ((code >> 8) & 0xFF) | 0x80, (code & 0xFF) | 0x80);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
9897 
/*
 * Byte-at-a-time decoder: EUC-KR -> Unicode.
 *
 * filter->status is 0 at the start of a character and 1 when a lead byte is
 * stored in filter->cache. The UHC tables are used for the lookup, but only
 * trail bytes in 0xA1..0xFE are accepted. Invalid input results in
 * MBFL_BAD_INPUT being passed to the output function.
 */
static int mbfl_filt_conv_euckr_wchar(int c, mbfl_convert_filter *filter)
{
	switch (filter->status) {
	case 0:
		if (c >= 0 && c < 0x80) {
			/* ASCII range is passed through */
			CK(filter->output_function(c, filter->data));
		} else if (c != 0xC9 && ((c >= 0xA1 && c <= 0xAC) || (c >= 0xB0 && c <= 0xFD))) {
			/* Lead byte of a 2-byte character */
			filter->cache = c;
			filter->status = 1;
		} else {
			CK(filter->output_function(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 1: { /* trail byte of a 2-byte character */
		int lead = filter->cache;
		int wc = 0;

		filter->status = 0;

		if (c >= 0xA1 && c <= 0xFE) {
			if (lead >= 0xA1 && lead <= 0xC6) {
				/* uhc1 table: 190 entries per lead byte, starting at 0x81/0x41 */
				int idx = (lead - 0x81)*190 + c - 0x41;
				ZEND_ASSERT(idx < uhc1_ucs_table_size);
				wc = uhc1_ucs_table[idx];
			} else if (lead >= 0xC7 && lead <= 0xFE && lead != 0xC9) {
				/* uhc3 table: 94 entries per lead byte, starting at 0xC7/0xA1 */
				int idx = (lead - 0xC7)*94 + c - 0xA1;
				ZEND_ASSERT(idx < uhc3_ucs_table_size);
				wc = uhc3_ucs_table[idx];
			}
		}

		/* Invalid byte combinations and unmapped table slots both leave wc at zero */
		if (wc <= 0) {
			wc = MBFL_BAD_INPUT;
		}
		CK(filter->output_function(wc, filter->data));
		break;
	}

		EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
9948 
/*
 * Codepoint-at-a-time encoder: Unicode -> EUC-KR.
 *
 * The UHC reverse tables are consulted, and any result whose bytes are not
 * both >= 0xA1 (the UHC extension area) is discarded. Codepoints below 0x80
 * are emitted as a single byte; anything else without a mapping is reported
 * through mbfl_filt_conv_illegal_output.
 */
static int mbfl_filt_conv_wchar_euckr(int c, mbfl_convert_filter *filter)
{
	int code = 0;

	if (c >= ucs_a1_uhc_table_min && c < ucs_a1_uhc_table_max) {
		code = ucs_a1_uhc_table[c - ucs_a1_uhc_table_min];
	} else if (c >= ucs_a2_uhc_table_min && c < ucs_a2_uhc_table_max) {
		code = ucs_a2_uhc_table[c - ucs_a2_uhc_table_min];
	} else if (c >= ucs_a3_uhc_table_min && c < ucs_a3_uhc_table_max) {
		code = ucs_a3_uhc_table[c - ucs_a3_uhc_table_min];
	} else if (c >= ucs_i_uhc_table_min && c < ucs_i_uhc_table_max) {
		code = ucs_i_uhc_table[c - ucs_i_uhc_table_min];
	} else if (c >= ucs_s_uhc_table_min && c < ucs_s_uhc_table_max) {
		code = ucs_s_uhc_table[c - ucs_s_uhc_table_min];
	} else if (c >= ucs_r1_uhc_table_min && c < ucs_r1_uhc_table_max) {
		code = ucs_r1_uhc_table[c - ucs_r1_uhc_table_min];
	} else if (c >= ucs_r2_uhc_table_min && c < ucs_r2_uhc_table_max) {
		code = ucs_r2_uhc_table[c - ucs_r2_uhc_table_min];
	}

	/* Exclude the UHC extension area (although the UHC conversion tables are used) */
	if ((code & 0xFF) < 0xA1 || ((code >> 8) & 0xFF) < 0xA1) {
		code = 0;
	}

	if (code <= 0) {
		/* No 2-byte mapping; codepoints below 0x80 are written as-is */
		code = (c < 0x80) ? c : -1;
	}

	if (code < 0) {
		CK(mbfl_filt_conv_illegal_output(c, filter));
	} else if (code < 0x80) {
		CK(filter->output_function(code, filter->data));
	} else {
		CK(filter->output_function((code >> 8) & 0xff, filter->data));
		CK(filter->output_function(code & 0xff, filter->data));
	}

	return 0;
}
9995 
/*
 * Flush hook for the EUC-KR decoder. If input stopped while a lead byte was
 * pending (status 1), MBFL_BAD_INPUT is emitted for it. Then
 * filter->flush_function, when set, is invoked with filter->data.
 */
static int mbfl_filt_conv_euckr_wchar_flush(mbfl_convert_filter *filter)
{
	int lead_byte_pending = (filter->status == 1);

	if (lead_byte_pending) {
		filter->status = 0;
		CK(filter->output_function(MBFL_BAD_INPUT, filter->data));
	}

	if (filter->flush_function) {
		filter->flush_function(filter->data);
	}

	return 0;
}
10010 
mb_euckr_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)10011 static size_t mb_euckr_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
10012 {
10013 	unsigned char *p = *in, *e = p + *in_len;
10014 	uint32_t *out = buf, *limit = buf + bufsize;
10015 
10016 	while (p < e && out < limit) {
10017 		unsigned char c = *p++;
10018 
10019 		if (c < 0x80) {
10020 			*out++ = c;
10021 		} else if (((c >= 0xA1 && c <= 0xAC) || (c >= 0xB0 && c <= 0xFD)) && c != 0xC9 && p < e) {
10022 			unsigned char c2 = *p++;
10023 			if (c2 < 0xA1 || c2 == 0xFF) {
10024 				*out++ = MBFL_BAD_INPUT;
10025 				continue;
10026 			}
10027 
10028 			if (c <= 0xC6) {
10029 				unsigned int w = (c - 0x81)*190 + c2 - 0x41;
10030 				ZEND_ASSERT(w < uhc1_ucs_table_size);
10031 				w = uhc1_ucs_table[w];
10032 				if (!w)
10033 					w = MBFL_BAD_INPUT;
10034 				*out++ = w;
10035 			} else {
10036 				unsigned int w = (c - 0xC7)*94 + c2 - 0xA1;
10037 				ZEND_ASSERT(w < uhc3_ucs_table_size);
10038 				w = uhc3_ucs_table[w];
10039 				if (!w)
10040 					w = MBFL_BAD_INPUT;
10041 				*out++ = w;
10042 			}
10043 		} else {
10044 			*out++ = MBFL_BAD_INPUT;
10045 		}
10046 	}
10047 
10048 	*in_len = e - p;
10049 	*in = p;
10050 	return out - buf;
10051 }
10052 
/*
 * Bulk encoder: Unicode -> EUC-KR.
 *
 * Appends the encoding of `len` codepoints from `in` to `buf`. One byte per
 * remaining codepoint is reserved up front; before each 2-byte character the
 * reservation is topped up. Mapping rules are the same as in the
 * codepoint-at-a-time encoder: UHC reverse tables, with the UHC extension
 * area rejected. `end` is not used.
 */
static void mb_wchar_to_euckr(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	while (len--) {
		uint32_t w = *in++;
		unsigned int code = 0;

		if (w >= ucs_a1_uhc_table_min && w < ucs_a1_uhc_table_max) {
			code = ucs_a1_uhc_table[w - ucs_a1_uhc_table_min];
		} else if (w >= ucs_a2_uhc_table_min && w < ucs_a2_uhc_table_max) {
			code = ucs_a2_uhc_table[w - ucs_a2_uhc_table_min];
		} else if (w >= ucs_a3_uhc_table_min && w < ucs_a3_uhc_table_max) {
			code = ucs_a3_uhc_table[w - ucs_a3_uhc_table_min];
		} else if (w >= ucs_i_uhc_table_min && w < ucs_i_uhc_table_max) {
			code = ucs_i_uhc_table[w - ucs_i_uhc_table_min];
		} else if (w >= ucs_s_uhc_table_min && w < ucs_s_uhc_table_max) {
			code = ucs_s_uhc_table[w - ucs_s_uhc_table_min];
		} else if (w >= ucs_r1_uhc_table_min && w < ucs_r1_uhc_table_max) {
			code = ucs_r1_uhc_table[w - ucs_r1_uhc_table_min];
		} else if (w >= ucs_r2_uhc_table_min && w < ucs_r2_uhc_table_max) {
			code = ucs_r2_uhc_table[w - ucs_r2_uhc_table_min];
		}

		/* Exclude the UHC extension area (although the UHC conversion tables are used) */
		unsigned int hi = (code >> 8) & 0xFF, lo = code & 0xFF;
		if (hi < 0xA1 || lo < 0xA1) {
			code = 0;
		}

		if (code >= 0x80) {
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			out = mb_convert_buf_add2(out, hi, lo);
		} else if (code) {
			out = mb_convert_buf_add(out, code);
		} else if (w < 0x80) {
			/* No 2-byte mapping, but the codepoint fits in a single byte */
			out = mb_convert_buf_add(out, w);
		} else {
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_euckr);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
10101 
/*
 * Byte-at-a-time decoder: UHC -> Unicode.
 *
 * filter->status is 0 at the start of a character and 1 when a lead byte is
 * stored in filter->cache. Lead bytes 0x81..0xC6 are looked up in
 * uhc1_ucs_table (trail bytes 0x41..0xFE); lead bytes 0xC7..0xFD are looked up
 * in uhc3_ucs_table (trail bytes 0xA1..0xFE). Invalid input results in
 * MBFL_BAD_INPUT being passed to the output function.
 */
static int mbfl_filt_conv_uhc_wchar(int c, mbfl_convert_filter *filter)
{
	switch (filter->status) {
	case 0:
		if (c >= 0 && c < 0x80) { /* latin */
			CK((*filter->output_function)(c, filter->data));
		} else if (c > 0x80 && c < 0xfe && c != 0xc9) { /* dbcs lead byte */
			filter->status = 1;
			filter->cache = c;
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 1: { /* dbcs second byte */
		int c1 = filter->cache, w = 0;

		filter->status = 0;

		/* The table index is kept in its own variable, so that if it were ever
		 * outside the table, `w` stays zero and the input is reported as bad
		 * (rather than the raw index being emitted as if it were a codepoint) */
		if (c1 >= 0x81 && c1 <= 0xc6 && c >= 0x41 && c <= 0xfe) {
			int idx = (c1 - 0x81)*190 + (c - 0x41);
			if (idx >= 0 && idx < uhc1_ucs_table_size) {
				w = uhc1_ucs_table[idx];
			}
		} else if (c1 >= 0xc7 && c1 < 0xfe && c >= 0xa1 && c <= 0xfe) {
			int idx = (c1 - 0xc7)*94 + (c - 0xa1);
			if (idx >= 0 && idx < uhc3_ucs_table_size) {
				w = uhc3_ucs_table[idx];
			}
		}

		if (w == 0) {
			w = MBFL_BAD_INPUT;
		}
		CK((*filter->output_function)(w, filter->data));
		break;
	}

		EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
10143 
/*
 * Flush hook for the UHC decoder. If input stopped while a lead byte was
 * pending (status 1), MBFL_BAD_INPUT is emitted for it. Then
 * filter->flush_function, when set, is invoked with filter->data.
 */
static int mbfl_filt_conv_uhc_wchar_flush(mbfl_convert_filter *filter)
{
	int lead_byte_pending = (filter->status == 1);

	if (lead_byte_pending) {
		filter->status = 0;
		CK(filter->output_function(MBFL_BAD_INPUT, filter->data));
	}

	if (filter->flush_function) {
		filter->flush_function(filter->data);
	}

	return 0;
}
10158 
/*
 * Codepoint-at-a-time encoder: Unicode -> UHC.
 *
 * The UHC reverse tables are consulted. A table value below 0x80 is emitted as
 * one byte, anything larger as two bytes. U+0000 is written as a zero byte;
 * any other codepoint without a mapping is reported through
 * mbfl_filt_conv_illegal_output.
 */
static int mbfl_filt_conv_wchar_uhc(int c, mbfl_convert_filter *filter)
{
	int code = 0;

	if (c >= ucs_a1_uhc_table_min && c < ucs_a1_uhc_table_max) {
		code = ucs_a1_uhc_table[c - ucs_a1_uhc_table_min];
	} else if (c >= ucs_a2_uhc_table_min && c < ucs_a2_uhc_table_max) {
		code = ucs_a2_uhc_table[c - ucs_a2_uhc_table_min];
	} else if (c >= ucs_a3_uhc_table_min && c < ucs_a3_uhc_table_max) {
		code = ucs_a3_uhc_table[c - ucs_a3_uhc_table_min];
	} else if (c >= ucs_i_uhc_table_min && c < ucs_i_uhc_table_max) {
		code = ucs_i_uhc_table[c - ucs_i_uhc_table_min];
	} else if (c >= ucs_s_uhc_table_min && c < ucs_s_uhc_table_max) {
		code = ucs_s_uhc_table[c - ucs_s_uhc_table_min];
	} else if (c >= ucs_r1_uhc_table_min && c < ucs_r1_uhc_table_max) {
		code = ucs_r1_uhc_table[c - ucs_r1_uhc_table_min];
	} else if (c >= ucs_r2_uhc_table_min && c < ucs_r2_uhc_table_max) {
		code = ucs_r2_uhc_table[c - ucs_r2_uhc_table_min];
	}

	if (code < 0 || (code == 0 && c != 0)) {
		/* Nothing found, and the input was not U+0000 */
		CK(mbfl_filt_conv_illegal_output(c, filter));
	} else if (code < 0x80) {
		CK(filter->output_function(code, filter->data));
	} else {
		CK(filter->output_function((code >> 8) & 0xff, filter->data));
		CK(filter->output_function(code & 0xff, filter->data));
	}

	return 0;
}
10196 
mb_uhc_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)10197 static size_t mb_uhc_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
10198 {
10199 	unsigned char *p = *in, *e = p + *in_len;
10200 	uint32_t *out = buf, *limit = buf + bufsize;
10201 
10202 	e--; /* Stop the main loop 1 byte short of the end of the input */
10203 
10204 	while (p < e && out < limit) {
10205 		unsigned char c = *p++;
10206 
10207 		if (c < 0x80) {
10208 			*out++ = c;
10209 		} else if (c > 0x80 && c < 0xFE) {
10210 			/* We don't need to check p < e here; it's not possible that this pointer dereference
10211 			 * will be outside the input string, because of e-- above */
10212 			unsigned char c2 = *p++;
10213 			if (c2 < 0x41 || c2 == 0xFF) {
10214 				*out++ = MBFL_BAD_INPUT;
10215 				continue;
10216 			}
10217 			unsigned int w = 0;
10218 
10219 			if (c <= 0xC6) {
10220 				w = (c - 0x81)*190 + c2 - 0x41;
10221 				ZEND_ASSERT(w < uhc1_ucs_table_size);
10222 				w = uhc1_ucs_table[w];
10223 			} else if (c2 >= 0xA1) {
10224 				w = (c - 0xC7)*94 + c2 - 0xA1;
10225 				ZEND_ASSERT(w < uhc3_ucs_table_size);
10226 				w = uhc3_ucs_table[w];
10227 			}
10228 			if (!w) {
10229 				/* If c == 0xC9, we shouldn't have tried to read a 2-byte char at all... but it is faster
10230 				 * to fix up that rare case here rather than include an extra check in the hot path */
10231 				if (c == 0xC9) {
10232 					p--;
10233 				}
10234 				w = MBFL_BAD_INPUT;
10235 			}
10236 			*out++ = w;
10237 		} else {
10238 			*out++ = MBFL_BAD_INPUT;
10239 		}
10240 	}
10241 
10242 	/* Finish up last byte of input string if there is one */
10243 	if (p == e && out < limit) {
10244 		unsigned char c = *p++;
10245 		*out++ = (c < 0x80) ? c : MBFL_BAD_INPUT;
10246 	}
10247 
10248 	*in_len = e - p + 1;
10249 	*in = p;
10250 	return out - buf;
10251 }
10252 
/* Encode `len` Unicode codepoints from `in` as UHC bytes appended to `buf`.
 *
 * Each codepoint is looked up in the ucs_*_uhc_table whose range contains it. A table
 * value below 0x80 is written as one byte, any other non-zero value as two bytes (high
 * byte first). U+0000 is written as a single zero byte; any other codepoint without a
 * mapping is handed to MB_CONVERT_ERROR. `end` is not used. */
static void mb_wchar_to_uhc(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	while (len--) {
		uint32_t cp = *in++;
		unsigned int code = 0;

		if (cp >= ucs_a1_uhc_table_min && cp < ucs_a1_uhc_table_max) {
			code = ucs_a1_uhc_table[cp - ucs_a1_uhc_table_min];
		} else if (cp >= ucs_a2_uhc_table_min && cp < ucs_a2_uhc_table_max) {
			code = ucs_a2_uhc_table[cp - ucs_a2_uhc_table_min];
		} else if (cp >= ucs_a3_uhc_table_min && cp < ucs_a3_uhc_table_max) {
			code = ucs_a3_uhc_table[cp - ucs_a3_uhc_table_min];
		} else if (cp >= ucs_i_uhc_table_min && cp < ucs_i_uhc_table_max) {
			code = ucs_i_uhc_table[cp - ucs_i_uhc_table_min];
		} else if (cp >= ucs_s_uhc_table_min && cp < ucs_s_uhc_table_max) {
			code = ucs_s_uhc_table[cp - ucs_s_uhc_table_min];
		} else if (cp >= ucs_r1_uhc_table_min && cp < ucs_r1_uhc_table_max) {
			code = ucs_r1_uhc_table[cp - ucs_r1_uhc_table_min];
		} else if (cp >= ucs_r2_uhc_table_min && cp < ucs_r2_uhc_table_max) {
			code = ucs_r2_uhc_table[cp - ucs_r2_uhc_table_min];
		}

		if (code >= 0x80) {
			/* Two-byte character; make sure there is room for it plus the remaining input */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			out = mb_convert_buf_add2(out, (code >> 8) & 0xFF, code & 0xFF);
		} else if (code) {
			out = mb_convert_buf_add(out, code);
		} else if (cp == 0) {
			/* A zero table value cannot be told apart from "no mapping", so U+0000 is special-cased */
			out = mb_convert_buf_add(out, 0);
		} else {
			MB_CONVERT_ERROR(buf, out, limit, cp, mb_wchar_to_uhc);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
10296 
/* Length table with one entry per byte value 0x00-0xFF (16 rows of 16 entries), shared by
 * the EUC-JP family of encodings defined below; presumably indexed by the first byte of a
 * character to get its length in bytes.
 * Entries 0xA1-0xFE and 0x8E are 2, entry 0x8F is 3, and every other entry is 1. */
static const unsigned char mblen_table_eucjp[] = { /* 0xA1-0xFE */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1
};
10315 
/* Alternative names for EUC-JP; the list is NULL-terminated */
static const char *mbfl_encoding_euc_jp_aliases[] = {"EUC", "EUC_JP", "eucJP", "x-euc-jp", NULL};

/* Filter table for the EUC-JP -> wchar direction.
 * The initializers are positional; presumably (from, to, ctor, dtor, filter function,
 * flush function, copy function) -- confirm against struct mbfl_convert_vtbl. */
static const struct mbfl_convert_vtbl vtbl_eucjp_wchar = {
	mbfl_no_encoding_euc_jp,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_eucjp_wchar,
	mbfl_filt_conv_eucjp_wchar_flush,
	NULL,
};

/* Filter table for the wchar -> EUC-JP direction; uses the common flush function */
static const struct mbfl_convert_vtbl vtbl_wchar_eucjp = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_euc_jp,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_eucjp,
	mbfl_filt_conv_common_flush,
	NULL,
};

/* Descriptor for EUC-JP, tying together its identifier, two name strings, aliases, the
 * mblen_table_eucjp length table, both filter tables and the mb_eucjp_to_wchar /
 * mb_wchar_to_eucjp conversion functions. The meaning of the `0` and trailing `NULL`
 * fields is not visible here -- see struct mbfl_encoding. */
const mbfl_encoding mbfl_encoding_euc_jp = {
	mbfl_no_encoding_euc_jp,
	"EUC-JP",
	"EUC-JP",
	mbfl_encoding_euc_jp_aliases,
	mblen_table_eucjp,
	0,
	&vtbl_eucjp_wchar,
	&vtbl_wchar_eucjp,
	mb_eucjp_to_wchar,
	mb_wchar_to_eucjp,
	NULL
};
10351 
/* Alternative names for EUC-JP-2004; the list is NULL-terminated */
static const char *mbfl_encoding_eucjp2004_aliases[] = {"EUC_JP-2004", NULL};

/* Filter table for the EUC-JP-2004 -> wchar direction. Note that it uses the
 * mbfl_filt_conv_jis2004_* filter functions rather than EUC-JP-2004-specific ones. */
static const struct mbfl_convert_vtbl vtbl_eucjp2004_wchar = {
	mbfl_no_encoding_eucjp2004,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_jis2004_wchar,
	mbfl_filt_conv_jis2004_wchar_flush,
	NULL,
};

/* Filter table for the wchar -> EUC-JP-2004 direction; this direction has its own
 * flush function (mbfl_filt_conv_wchar_jis2004_flush) instead of the common one */
static const struct mbfl_convert_vtbl vtbl_wchar_eucjp2004 = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_eucjp2004,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_jis2004,
	mbfl_filt_conv_wchar_jis2004_flush,
	NULL,
};

/* Descriptor for EUC-JP-2004. The second name string is "EUC-JP" (not "EUC-JP-2004"),
 * and the length table is shared with EUC-JP. */
const mbfl_encoding mbfl_encoding_eucjp2004 = {
	mbfl_no_encoding_eucjp2004,
	"EUC-JP-2004",
	"EUC-JP",
	mbfl_encoding_eucjp2004_aliases,
	mblen_table_eucjp,
	0,
	&vtbl_eucjp2004_wchar,
	&vtbl_wchar_eucjp2004,
	mb_eucjp2004_to_wchar,
	mb_wchar_to_eucjp2004,
	NULL
};
10387 
/* Alternative names for eucJP-win; the list is NULL-terminated */
static const char *mbfl_encoding_eucjp_win_aliases[] = {"eucJP-open", "eucJP-ms", NULL};

/* Filter table for the eucJP-win -> wchar direction */
static const struct mbfl_convert_vtbl vtbl_eucjpwin_wchar = {
	mbfl_no_encoding_eucjp_win,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_eucjpwin_wchar,
	mbfl_filt_conv_eucjpwin_wchar_flush,
	NULL,
};

/* Filter table for the wchar -> eucJP-win direction; uses the common flush function */
static const struct mbfl_convert_vtbl vtbl_wchar_eucjpwin = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_eucjp_win,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_eucjpwin,
	mbfl_filt_conv_common_flush,
	NULL,
};

/* Descriptor for eucJP-win. The second name string is "EUC-JP", and the length table is
 * shared with EUC-JP. */
const mbfl_encoding mbfl_encoding_eucjp_win = {
	mbfl_no_encoding_eucjp_win,
	"eucJP-win",
	"EUC-JP",
	mbfl_encoding_eucjp_win_aliases,
	mblen_table_eucjp,
	0,
	&vtbl_eucjpwin_wchar,
	&vtbl_wchar_eucjpwin,
	mb_eucjpwin_to_wchar,
	mb_wchar_to_eucjpwin,
	NULL
};
10423 
/* Alternative names for CP51932; the list is NULL-terminated. The only alias is the
 * lower-case spelling of the primary name. */
static const char *mbfl_encoding_cp51932_aliases[] = {"cp51932", NULL};

/* Filter table for the CP51932 -> wchar direction */
static const struct mbfl_convert_vtbl vtbl_cp51932_wchar = {
	mbfl_no_encoding_cp51932,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_cp51932_wchar,
	mbfl_filt_conv_cp51932_wchar_flush,
	NULL,
};

/* Filter table for the wchar -> CP51932 direction; uses the common flush function */
static const struct mbfl_convert_vtbl vtbl_wchar_cp51932 = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_cp51932,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_cp51932,
	mbfl_filt_conv_common_flush,
	NULL,
};

/* Descriptor for CP51932; the length table is shared with EUC-JP */
const mbfl_encoding mbfl_encoding_cp51932 = {
	mbfl_no_encoding_cp51932,
	"CP51932",
	"CP51932",
	mbfl_encoding_cp51932_aliases,
	mblen_table_eucjp,
	0,
	&vtbl_cp51932_wchar,
	&vtbl_wchar_cp51932,
	mb_cp51932_to_wchar,
	mb_wchar_to_cp51932,
	NULL
};
10459 
/* Length table with one entry per byte value 0x00-0xFF (16 rows of 16 entries), used by
 * the EUC-CN, EUC-TW and EUC-KR descriptors below; presumably indexed by the first byte
 * of a character to get its length in bytes.
 * Entries 0xA1-0xFE are 2; every other entry (including 0x8E and 0x8F) is 1. */
static const unsigned char mblen_table_euccn[] = { /* 0xA1-0xFE */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1
};
10478 
/* Alternative names for EUC-CN; the list is NULL-terminated */
static const char *mbfl_encoding_euc_cn_aliases[] = {"CN-GB", "EUC_CN", "eucCN", "x-euc-cn", "gb2312", NULL};

/* Filter table for the EUC-CN -> wchar direction */
static const struct mbfl_convert_vtbl vtbl_euccn_wchar = {
	mbfl_no_encoding_euc_cn,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_euccn_wchar,
	mbfl_filt_conv_euccn_wchar_flush,
	NULL,
};

/* Filter table for the wchar -> EUC-CN direction; uses the common flush function */
static const struct mbfl_convert_vtbl vtbl_wchar_euccn = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_euc_cn,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_euccn,
	mbfl_filt_conv_common_flush,
	NULL,
};

/* Descriptor for EUC-CN. The second name string is "CN-GB", which also appears in the
 * alias list. */
const mbfl_encoding mbfl_encoding_euc_cn = {
	mbfl_no_encoding_euc_cn,
	"EUC-CN",
	"CN-GB",
	mbfl_encoding_euc_cn_aliases,
	mblen_table_euccn,
	0,
	&vtbl_euccn_wchar,
	&vtbl_wchar_euccn,
	mb_euccn_to_wchar,
	mb_wchar_to_euccn,
	NULL
};
10514 
/* Alternative names for EUC-TW; the list is NULL-terminated */
static const char *mbfl_encoding_euc_tw_aliases[] = {"EUC_TW", "eucTW", "x-euc-tw", NULL};

/* Filter table for the EUC-TW -> wchar direction */
static const struct mbfl_convert_vtbl vtbl_euctw_wchar = {
	mbfl_no_encoding_euc_tw,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_euctw_wchar,
	mbfl_filt_conv_euctw_wchar_flush,
	NULL,
};

/* Filter table for the wchar -> EUC-TW direction; uses the common flush function */
static const struct mbfl_convert_vtbl vtbl_wchar_euctw = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_euc_tw,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_euctw,
	mbfl_filt_conv_common_flush,
	NULL,
};

/* Descriptor for EUC-TW.
 * NOTE(review): this uses mblen_table_euccn, whose entry for 0x8E is 1. EUC-TW as
 * commonly specified has 4-byte sequences introduced by 0x8E; confirm that sharing the
 * EUC-CN length table here is intended. */
const mbfl_encoding mbfl_encoding_euc_tw = {
	mbfl_no_encoding_euc_tw,
	"EUC-TW",
	"EUC-TW",
	mbfl_encoding_euc_tw_aliases,
	mblen_table_euccn,
	0,
	&vtbl_euctw_wchar,
	&vtbl_wchar_euctw,
	mb_euctw_to_wchar,
	mb_wchar_to_euctw,
	NULL
};
10550 
/* Alternative names for EUC-KR; the list is NULL-terminated */
static const char *mbfl_encoding_euc_kr_aliases[] = {"EUC_KR", "eucKR", "x-euc-kr", NULL};

/* Filter table for the EUC-KR -> wchar direction */
static const struct mbfl_convert_vtbl vtbl_euckr_wchar = {
	mbfl_no_encoding_euc_kr,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_euckr_wchar,
	mbfl_filt_conv_euckr_wchar_flush,
	NULL,
};

/* Filter table for the wchar -> EUC-KR direction; uses the common flush function */
static const struct mbfl_convert_vtbl vtbl_wchar_euckr = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_euc_kr,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_euckr,
	mbfl_filt_conv_common_flush,
	NULL,
};

/* Descriptor for EUC-KR; the length table is shared with EUC-CN */
const mbfl_encoding mbfl_encoding_euc_kr = {
	mbfl_no_encoding_euc_kr,
	"EUC-KR",
	"EUC-KR",
	mbfl_encoding_euc_kr_aliases,
	mblen_table_euccn,
	0,
	&vtbl_euckr_wchar,
	&vtbl_wchar_euckr,
	mb_euckr_to_wchar,
	mb_wchar_to_euckr,
	NULL
};
10586 
/* UHC was introduced by Microsoft in Windows 95, and is also known as CP949.
 * It is the same as EUC-KR, but with 8,822 additional characters added to
 * complete all the characters in the Johab charset. */
10590 
/* Length table with one entry per byte value 0x00-0xFF (16 rows of 16 entries);
 * presumably indexed by the first byte of a character to get its length in bytes.
 * Entries 0x81-0xFE are 2; entries 0x00-0x80 and 0xFF are 1. */
static const unsigned char mblen_table_81_to_fe[] = { /* 0x81-0xFE */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1
};
10609 
/* Alternative names for UHC; the list is NULL-terminated */
static const char *mbfl_encoding_uhc_aliases[] = {"CP949", NULL};

/* Filter table for the UHC -> wchar direction */
static const struct mbfl_convert_vtbl vtbl_uhc_wchar = {
	mbfl_no_encoding_uhc,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_uhc_wchar,
	mbfl_filt_conv_uhc_wchar_flush,
	NULL,
};

/* Filter table for the wchar -> UHC direction; uses the common flush function */
static const struct mbfl_convert_vtbl vtbl_wchar_uhc = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_uhc,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_uhc,
	mbfl_filt_conv_common_flush,
	NULL,
};

/* Descriptor for UHC, tying together its identifier, names, aliases, the
 * mblen_table_81_to_fe length table, both filter tables and the mb_uhc_to_wchar /
 * mb_wchar_to_uhc conversion functions */
const mbfl_encoding mbfl_encoding_uhc = {
	mbfl_no_encoding_uhc,
	"UHC",
	"UHC",
	mbfl_encoding_uhc_aliases,
	mblen_table_81_to_fe,
	0,
	&vtbl_uhc_wchar,
	&vtbl_wchar_uhc,
	mb_uhc_to_wchar,
	mb_wchar_to_uhc,
	NULL
};
10645 
10646 /*
10647  * GB18030/CP936
10648  */
10649 
/* Byte-at-a-time GB18030 -> wchar conversion filter.
 *
 * `c` is the next input byte. `filter->status` records how many bytes of the current
 * character have been seen so far (0-3) and `filter->cache` holds those bytes. Each
 * decoded codepoint, or MBFL_BAD_INPUT for an invalid sequence, is passed to
 * `filter->output_function`. Returns 0 on every path visible here; CK() presumably
 * returns early if the output function reports failure (macro not visible here). */
static int mbfl_filt_conv_gb18030_wchar(int c, mbfl_convert_filter *filter)
{
	int k;
	/* `w` stays at -1 until some branch finds a mapping; later stages test `w <= 0` to
	 * decide whether they still need to run */
	int c1, c2, c3, w = -1;

	switch (filter->status) {
	case 0:
		if (c >= 0 && c < 0x80) { /* latin */
			CK((*filter->output_function)(c, filter->data));
		} else if (c > 0x80 && c < 0xff) { /* dbcs/qbcs lead byte */
			filter->status = 1;
			filter->cache = c;
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 1: /* dbcs/qbcs second byte */
		c1 = filter->cache;
		filter->status = 0;

		if (c1 >= 0x81 && c1 <= 0x84 && c >= 0x30 && c <= 0x39) {
			/* 4 byte range: Unicode BMP */
			filter->status = 2;
			filter->cache = (c1 << 8) | c;
			return 0;
		} else if (c1 >= 0x90 && c1 <= 0xe3 && c >= 0x30 && c <= 0x39) {
			/* 4 byte range: Unicode 16 planes */
			filter->status = 2;
			filter->cache = (c1 << 8) | c;
			return 0;
		} else if (((c1 >= 0xaa && c1 <= 0xaf) || (c1 >= 0xf8 && c1 <= 0xfe)) && (c >= 0xa1 && c <= 0xfe)) {
			/* UDA part 1,2: U+E000-U+E4C5 */
			w = 94*(c1 >= 0xf8 ? c1 - 0xf2 : c1 - 0xaa) + (c - 0xa1) + 0xe000;
			CK((*filter->output_function)(w, filter->data));
		} else if (c1 >= 0xa1 && c1 <= 0xa7 && c >= 0x40 && c < 0xa1 && c != 0x7f) {
			/* UDA part3 : U+E4C6-U+E765*/
			w = 96*(c1 - 0xa1) + c - (c >= 0x80 ? 0x41 : 0x40) + 0xe4c6;
			CK((*filter->output_function)(w, filter->data));
		}

		c2 = (c1 << 8) | c;

		/* Not in a user-defined area: if the byte pair lies in one of three coarse ranges,
		 * scan mbfl_gb18030_pua_tbl for an entry covering it. Each entry is used here as
		 * { first codepoint, last codepoint, first byte pair }. */
		if (w <= 0 &&
			((c2 >= 0xa2ab && c2 <= 0xa9f0 + (0xe80f-0xe801)) ||
			 (c2 >= 0xd7fa && c2 <= 0xd7fa + (0xe814-0xe810)) ||
			 (c2 >= 0xfe50 && c2 <= 0xfe80 + (0xe864-0xe844)))) {
			for (k = 0; k < mbfl_gb18030_pua_tbl_max; k++) {
				if (c2 >= mbfl_gb18030_pua_tbl[k][2] && c2 <= mbfl_gb18030_pua_tbl[k][2] + mbfl_gb18030_pua_tbl[k][1] - mbfl_gb18030_pua_tbl[k][0]) {
					w = c2 - mbfl_gb18030_pua_tbl[k][2] + mbfl_gb18030_pua_tbl[k][0];
					CK((*filter->output_function)(w, filter->data));
					break;
				}
			}
		}

		/* Still nothing: fall back to the CP936 table for the valid 2-byte ranges */
		if (w <= 0) {
			if ((c1 >= 0xa1 && c1 <= 0xa9 && c >= 0xa1 && c <= 0xfe) ||
				(c1 >= 0xb0 && c1 <= 0xf7 && c >= 0xa1 && c <= 0xfe) ||
				(c1 >= 0x81 && c1 <= 0xa0 && c >= 0x40 && c <= 0xfe && c != 0x7f) ||
				(c1 >= 0xaa && c1 <= 0xfe && c >= 0x40 && c <= 0xa0 && c != 0x7f) ||
				(c1 >= 0xa8 && c1 <= 0xa9 && c >= 0x40 && c <= 0xa0 && c != 0x7f)) {
				/* 192 table slots per lead byte, starting from lead 0x81 / trail 0x40 */
				w = (c1 - 0x81)*192 + c - 0x40;
				ZEND_ASSERT(w < cp936_ucs_table_size);
				CK((*filter->output_function)(cp936_ucs_table[w], filter->data));
			} else {
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			}
		}
		break;

	case 2: /* qbcs third byte */
		c1 = (filter->cache >> 8) & 0xff;
		c2 = filter->cache & 0xff;
		filter->status = filter->cache = 0;
		if (((c1 >= 0x81 && c1 <= 0x84) || (c1 >= 0x90 && c1 <= 0xe3)) && c2 >= 0x30 && c2 <= 0x39 && c >= 0x81 && c <= 0xfe) {
			filter->cache = (c1 << 16) | (c2 << 8) | c;
			filter->status = 3;
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 3: /* qbcs fourth byte */
		c1 = (filter->cache >> 16) & 0xff;
		c2 = (filter->cache >> 8) & 0xff;
		c3 = filter->cache & 0xff;
		filter->status = filter->cache = 0;
		if (((c1 >= 0x81 && c1 <= 0x84) || (c1 >= 0x90 && c1 <= 0xe3)) && c2 >= 0x30 && c2 <= 0x39 && c3 >= 0x81 && c3 <= 0xfe && c >= 0x30 && c <= 0x39) {
			if (c1 >= 0x90 && c1 <= 0xe3) {
				/* The four bytes form a mixed-radix number (10, 126, 10 values for bytes
				 * 2-4) which is an offset from U+10000 */
				w = ((((c1 - 0x90)*10 + (c2 - 0x30))*126 + (c3 - 0x81)))*10 + (c - 0x30) + 0x10000;
				if (w > 0x10FFFF) {
					CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
					return 0;
				}
			} else { /* Unicode BMP */
				/* Same mixed-radix scheme; the resulting linear index is converted to a
				 * codepoint by adding the offset of the range it falls in */
				w = (((c1 - 0x81)*10 + (c2 - 0x30))*126 + (c3 - 0x81))*10 + (c - 0x30);
				if (w >= 0 && w <= 39419) {
					k = mbfl_bisec_srch(w, mbfl_gb2uni_tbl, mbfl_gb_uni_max);
					w += mbfl_gb_uni_ofst[k];
				} else {
					CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
					return 0;
				}
			}
			CK((*filter->output_function)(w, filter->data));
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

		EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
10766 
/* Flush handler for the GB18030 -> wchar filter.
 *
 * A non-zero `filter->status` means the input ended in the middle of a multi-byte
 * character; the state is reset and a single MBFL_BAD_INPUT is emitted for it. The
 * downstream flush function, if one is set, is then called. Returns 0 on the paths
 * visible here. */
static int mbfl_filt_conv_gb18030_wchar_flush(mbfl_convert_filter *filter)
{
	int truncated = (filter->status != 0);

	if (truncated) {
		filter->status = 0;
		CK(filter->output_function(MBFL_BAD_INPUT, filter->data));
	}

	if (filter->flush_function != NULL) {
		filter->flush_function(filter->data);
	}

	return 0;
}
10781 
/* Codepoint-at-a-time wchar -> GB18030 conversion filter.
 *
 * `c` is the codepoint to encode. The resulting bytes are passed one at a time to
 * `filter->output_function`; a codepoint for which no encoding is found is passed to
 * mbfl_filt_conv_illegal_output() instead. Returns 0 on every path visible here; CK()
 * presumably returns early on output failure (macro not visible here).
 *
 * `s` accumulates the encoded bytes. For a 4-byte code, `s` holds bytes 2-4 and `s1`
 * holds the first byte; `s1 > 0` is what selects 4-byte output at the end. */
static int mbfl_filt_conv_wchar_gb18030(int c, mbfl_convert_filter *filter)
{
	int k, k1, k2;
	int c1, s = 0, s1 = 0;

	/* First try the CP936 tables, with a few GB18030-specific overrides */
	if (c >= ucs_a1_cp936_table_min && c < ucs_a1_cp936_table_max) {
		if (c == 0x01f9) {
			s = 0xa8bf;
		} else {
			s = ucs_a1_cp936_table[c - ucs_a1_cp936_table_min];
		}
	} else if (c >= ucs_a2_cp936_table_min && c < ucs_a2_cp936_table_max) {
		if (c == 0x20ac) { /* euro-sign */
			s = 0xa2e3;
		} else {
			s = ucs_a2_cp936_table[c - ucs_a2_cp936_table_min];
		}
	} else if (c >= ucs_a3_cp936_table_min && c < ucs_a3_cp936_table_max) {
		s = ucs_a3_cp936_table[c - ucs_a3_cp936_table_min];
	} else if (c >= ucs_i_cp936_table_min && c < ucs_i_cp936_table_max) {
		s = ucs_i_cp936_table[c - ucs_i_cp936_table_min];
	} else if (c >= ucs_ci_cp936_table_min && c < ucs_ci_cp936_table_max) {
		/* U+F900-FA2F CJK Compatibility Ideographs */
		if (c == 0xf92c) {
			s = 0xfd9c;
		} else if (c == 0xf979) {
			s = 0xfd9d;
		} else if (c == 0xf995) {
			s = 0xfd9e;
		} else if (c == 0xf9e7) {
			s = 0xfd9f;
		} else if (c == 0xf9f1) {
			s = 0xfda0;
		} else if (c >= 0xfa0c && c <= 0xfa29) {
			s = ucs_ci_s_cp936_table[c - 0xfa0c];
		}
	} else if (c >= ucs_cf_cp936_table_min && c < ucs_cf_cp936_table_max) {
		/* FE30h CJK Compatibility Forms  */
		s = ucs_cf_cp936_table[c - ucs_cf_cp936_table_min];
	} else if (c >= ucs_sfv_cp936_table_min && c < ucs_sfv_cp936_table_max) {
		/* U+FE50-FE6F Small Form Variants */
		s = ucs_sfv_cp936_table[c - ucs_sfv_cp936_table_min];
	} else if (c >= ucs_hff_cp936_table_min && c < ucs_hff_cp936_table_max) {
		/* U+FF00-FFFF HW/FW Forms */
		if (c == 0xff04) {
			s = 0xa1e7;
		} else if (c == 0xff5e) {
			s = 0xa1ab;
		} else if (c >= 0xff01 && c <= 0xff5d) {
			s = c - 0xff01 + 0xa3a1;
		} else if (c >= 0xffe0 && c <= 0xffe5) {
			s = ucs_hff_s_cp936_table[c-0xffe0];
		}
	}

	/* While GB18030 and CP936 are very similar, some mappings are different between these encodings;
	 * do a binary search in a table of differing codepoints to see if we have one */
	if (s <= 0 && c >= mbfl_gb18030_c_tbl_key[0] && c <= mbfl_gb18030_c_tbl_key[mbfl_gb18030_c_tbl_max-1]) {
		k1 = mbfl_bisec_srch2(c, mbfl_gb18030_c_tbl_key, mbfl_gb18030_c_tbl_max);
		if (k1 >= 0) {
			s = mbfl_gb18030_c_tbl_val[k1];
		}
	}

	/* This block is not guarded by `s <= 0`, so for U+E000-U+E864 it overwrites any
	 * value found above (except when the U+E766.. search below finds no entry) */
	if (c >= 0xe000 && c <= 0xe864) { /* PUA */
		if (c < 0xe766) {
			if (c < 0xe4c6) {
				/* U+E000-U+E4C5: rows of 94, trail bytes from 0xA1; the first 6 rows use
				 * lead bytes 0xAA-0xAF and the rest use lead bytes from 0xF8 */
				c1 = c - 0xe000;
				s = (c1 % 94) + 0xa1;
				c1 /= 94;
				s |= (c1 < 0x06 ? c1 + 0xaa : c1 + 0xf2) << 8;
			} else {
				/* U+E4C6-U+E765: rows of 96, lead bytes from 0xA1, trail bytes from 0x40
				 * with 0x7F skipped */
				c1 = c - 0xe4c6;
				s = ((c1 / 96) + 0xa1) << 8;
				c1 %= 96;
				s |= c1 + (c1 >= 0x3f ? 0x41 : 0x40);
			}
		} else {
			/* U+E766..U+E864 */
			/* Binary search of mbfl_gb18030_pua_tbl; each entry is used here as
			 * { first codepoint, last codepoint, first byte pair } */
			k1 = 0;
			k2 = mbfl_gb18030_pua_tbl_max;
			while (k1 < k2) {
				k = (k1 + k2) >> 1;
				if (c < mbfl_gb18030_pua_tbl[k][0]) {
					k2 = k;
				} else if (c > mbfl_gb18030_pua_tbl[k][1]) {
					k1 = k + 1;
				} else {
					s = c - mbfl_gb18030_pua_tbl[k][0] + mbfl_gb18030_pua_tbl[k][2];
					break;
				}
			}
		}
	}

	/* If we have not yet found a suitable mapping for this codepoint, it requires a 4-byte code */
	if (s <= 0 && c >= 0x0080 && c <= 0xffff) {
		/* BMP */
		/* `s` temporarily holds the range index returned by the search (negative if
		 * no range matched) before being reused for the encoded bytes */
		s = mbfl_bisec_srch(c, mbfl_uni2gb_tbl, mbfl_gb_uni_max);
		if (s >= 0) {
			/* Split the linear index into mixed-radix digits: 10, 126, 10, then lead byte */
			c1 = c - mbfl_gb_uni_ofst[s];
			s = (c1 % 10) + 0x30;
			c1 /= 10;
			s |= ((c1 % 126) + 0x81) << 8;
			c1 /= 126;
			s |= ((c1 % 10) + 0x30) << 16;
			c1 /= 10;
			s1 = c1 + 0x81;
		}
	} else if (c >= 0x10000 && c <= 0x10ffff) {
		/* Code set 3: Unicode U+10000..U+10FFFF */
		c1 = c - 0x10000;
		s = (c1 % 10) + 0x30;
		c1 /= 10;
		s |= ((c1 % 126) + 0x81) << 8;
		c1 /= 126;
		s |= ((c1 % 10) + 0x30) << 16;
		c1 /= 10;
		s1 = c1 + 0x90;
	}

	/* Only U+0000 may legitimately encode as zero; for anything else, zero means "not found" */
	if (c == 0) {
		s = 0;
	} else if (s == 0) {
		s = -1;
	}

	if (s >= 0) {
		if (s <= 0x80) { /* latin */
			CK((*filter->output_function)(s, filter->data));
		} else if (s1 > 0) { /* qbcs */
			CK((*filter->output_function)(s1 & 0xff, filter->data));
			CK((*filter->output_function)((s >> 16) & 0xff, filter->data));
			CK((*filter->output_function)((s >> 8) & 0xff, filter->data));
			CK((*filter->output_function)(s & 0xff, filter->data));
		} else { /* dbcs */
			CK((*filter->output_function)((s >> 8) & 0xff, filter->data));
			CK((*filter->output_function)(s & 0xff, filter->data));
		}
	} else {
		CK(mbfl_filt_conv_illegal_output(c, filter));
	}

	return 0;
}
10927 
/* Codepoints for 2-byte GB18030 sequences 0xFE50-0xFEA0 which differ from the CP936 table.
 * mb_gb18030_to_wchar() indexes this table with `w - 0x5DD0`, where
 * `w = (lead - 0x81)*192 + trail - 0x40`, so index 0 corresponds to bytes 0xFE 0x50 and
 * the last index (0x50) to bytes 0xFE 0xA0. A zero entry means "no override here"; the
 * caller then falls through to its regular table lookup. */
static const unsigned short gb18030_pua_tbl3[] = {
	/* 0xFE50 */
	0x0000,0xE816,0xE817,0xE818,0x0000,0x0000,0x0000,0x0000,
	0x0000,0xE81E,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
	0x0000,0xE826,0x0000,0x0000,0x0000,0x0000,0xE82B,0xE82C,
	0x0000,0x0000,0x0000,0x0000,0xE831,0xE832,0x0000,0x0000,
	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0xE83B,0x0000,
	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0xE843,0x0000,
	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
	0xE854,0xE855,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
	0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
	/* 0xFEA0 */
	0xE864
};
10943 
/* Decode GB18030 bytes from `*in` into Unicode codepoints stored in `buf`.
 *
 * Decoding stops when either the input or the `bufsize`-entry output buffer is used up.
 * On return, `*in` and `*in_len` describe the input which has not been consumed yet, and
 * the return value is the number of codepoints written to `buf`. Invalid or truncated
 * sequences are reported as MBFL_BAD_INPUT. `state` is not used. */
static size_t mb_gb18030_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
{
	unsigned char *p = *in, *e = p + *in_len;
	uint32_t *out = buf, *limit = buf + bufsize;

	while (p < e && out < limit) {
		unsigned char c = *p++;

		if (c < 0x80) {
			*out++ = c;
		} else if (c == 0x80 || c == 0xFF) {
			*out++ = MBFL_BAD_INPUT;
		} else {
			/* Lead byte of a 2- or 4-byte character; input ending here is a truncation */
			if (p == e) {
				*out++ = MBFL_BAD_INPUT;
				break;
			}
			unsigned char c2 = *p++;

			/* A second byte of 0x30-0x39 after these lead bytes marks a 4-byte character */
			if (((c >= 0x81 && c <= 0x84) || (c >= 0x90 && c <= 0xE3)) && c2 >= 0x30 && c2 <= 0x39) {
				if (p >= e) {
					*out++ = MBFL_BAD_INPUT;
					break;
				}
				unsigned char c3 = *p++;

				/* The 4th byte is only read if the 3rd is valid and input remains; otherwise
				 * the 3 bytes consumed so far yield a single error marker */
				if (c3 >= 0x81 && c3 <= 0xFE && p < e) {
					unsigned char c4 = *p++;

					if (c4 >= 0x30 && c4 <= 0x39) {
						if (c >= 0x90 && c <= 0xE3) {
							/* Mixed-radix number (10, 126, 10 values for bytes 2-4) used as
							 * an offset from U+10000 */
							unsigned int w = ((((c - 0x90)*10 + (c2 - 0x30))*126 + (c3 - 0x81)))*10 + (c4 - 0x30) + 0x10000;
							*out++ = (w > 0x10FFFF) ? MBFL_BAD_INPUT : w;
						} else {
							/* Unicode BMP */
							unsigned int w = (((c - 0x81)*10 + (c2 - 0x30))*126 + (c3 - 0x81))*10 + (c4 - 0x30);
							if (w <= 39419) {
								*out++ = w + mbfl_gb_uni_ofst[mbfl_bisec_srch(w, mbfl_gb2uni_tbl, mbfl_gb_uni_max)];
							} else {
								*out++ = MBFL_BAD_INPUT;
							}
						}
					} else {
						*out++ = MBFL_BAD_INPUT;
					}
				} else {
					*out++ = MBFL_BAD_INPUT;
				}
			} else if (((c >= 0xAA && c <= 0xAF) || (c >= 0xF8 && c <= 0xFE)) && (c2 >= 0xA1 && c2 <= 0xFE)) {
				/* UDA part 1, 2: U+E000-U+E4C5 */
				*out++ = 94*(c >= 0xF8 ? c - 0xF2 : c - 0xAA) + (c2 - 0xA1) + 0xE000;
			} else if (c >= 0xA1 && c <= 0xA7 && c2 >= 0x40 && c2 < 0xA1 && c2 != 0x7F) {
				/* UDA part 3: U+E4C6-U+E765 */
				*out++ = 96*(c - 0xA1) + c2 - (c2 >= 0x80 ? 0x41 : 0x40) + 0xE4C6;
			} else if (c2 >= 0x40 && c2 != 0x7F && c2 != 0xFF) {
				/* Linear index of the byte pair: 192 slots per lead byte */
				unsigned int w = (c - 0x81)*192 + c2 - 0x40;

				/* Check the three index ranges which have their own override tables. Each
				 * hit emits a codepoint and moves on to the next character; anything else
				 * falls through to the regular table lookup below. */
				if (w >= 0x192B) {
					if (w <= 0x1EBE) {
						if (w != 0x1963 && w != 0x1DBF && (w < 0x1E49 || w > 0x1E55) && w != 0x1E7F) {
							*out++ = cp936_pua_tbl1[w - 0x192B];
							continue;
						}
					} else if (w >= 0x413A) {
						if (w <= 0x413E) {
							*out++ = cp936_pua_tbl2[w - 0x413A];
							continue;
						} else if (w >= 0x5DD0 && w <= 0x5E20) {
							/* Note: this `c` shadows the lead byte, but only inside this block */
							unsigned int c = gb18030_pua_tbl3[w - 0x5DD0];
							if (c) {
								*out++ = c;
								continue;
							}
						}
					}
				}

				if ((c >= 0x81 && c <= 0xA9) || (c >= 0xB0 && c <= 0xF7 && c2 >= 0xA1) || (c >= 0xAA && c <= 0xFE && c2 <= 0xA0)) {
					ZEND_ASSERT(w < cp936_ucs_table_size);
					*out++ = cp936_ucs_table[w];
				} else {
					*out++ = MBFL_BAD_INPUT;
				}
			} else {
				*out++ = MBFL_BAD_INPUT;
			}
		}
	}

	*in_len = e - p;
	*in = p;
	return out - buf;
}
11037 
/* Encode `len` Unicode codepoints from `in` as GB18030 bytes appended to `buf`.
 *
 * For each codepoint, `s` collects the encoded bytes: a value below 0x80 is written as
 * one byte, a value above 0xFFFFFF as four bytes, and anything else as two bytes (most
 * significant byte first). A codepoint for which `s` is still zero at the end is handed
 * to MB_CONVERT_ERROR. `end` is not used. */
static void mb_wchar_to_gb18030(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	while (len--) {
		uint32_t w = *in++;
		unsigned int s = 0;

		if (w == 0) {
			/* U+0000 is handled up front, since s == 0 means "no mapping" below */
			out = mb_convert_buf_add(out, 0);
			continue;
		} else if (w >= ucs_a1_cp936_table_min && w < ucs_a1_cp936_table_max) {
			if (w == 0x1F9) {
				s = 0xA8Bf;
			} else {
				s = ucs_a1_cp936_table[w - ucs_a1_cp936_table_min];
			}
		} else if (w >= ucs_a2_cp936_table_min && w < ucs_a2_cp936_table_max) {
			if (w == 0x20AC) { /* Euro sign */
				s = 0xA2E3;
			} else {
				s = ucs_a2_cp936_table[w - ucs_a2_cp936_table_min];
			}
		} else if (w >= ucs_a3_cp936_table_min && w < ucs_a3_cp936_table_max) {
			s = ucs_a3_cp936_table[w - ucs_a3_cp936_table_min];
		} else if (w >= ucs_i_cp936_table_min && w < ucs_i_cp936_table_max) {
			s = ucs_i_cp936_table[w - ucs_i_cp936_table_min];
		} else if (w >= ucs_ci_cp936_table_min && w < ucs_ci_cp936_table_max) {
			/* U+F900-U+FA2F CJK Compatibility Ideographs */
			if (w == 0xF92C) {
				s = 0xFD9C;
			} else if (w == 0xF979) {
				s = 0xFD9D;
			} else if (w == 0xF995) {
				s = 0xFD9E;
			} else if (w == 0xF9E7) {
				s = 0xFD9F;
			} else if (w == 0xF9F1) {
				s = 0xFDA0;
			} else if (w >= 0xFA0C && w <= 0xFA29) {
				s = ucs_ci_s_cp936_table[w - 0xFA0C];
			}
		} else if (w >= ucs_cf_cp936_table_min && w < ucs_cf_cp936_table_max) {
			/* CJK Compatibility Forms  */
			s = ucs_cf_cp936_table[w - ucs_cf_cp936_table_min];
		} else if (w >= ucs_sfv_cp936_table_min && w < ucs_sfv_cp936_table_max) {
			/* U+FE50-U+FE6F Small Form Variants */
			s = ucs_sfv_cp936_table[w - ucs_sfv_cp936_table_min];
		} else if (w >= ucs_hff_cp936_table_min && w < ucs_hff_cp936_table_max) {
			/* U+FF00-U+FFFF HW/FW Forms */
			if (w == 0xFF04) {
				s = 0xA1E7;
			} else if (w == 0xFF5E) {
				s = 0xA1AB;
			} else if (w >= 0xFF01 && w <= 0xFF5D) {
				s = w - 0xFF01 + 0xA3A1;
			} else if (w >= 0xFFE0 && w <= 0xFFE5) {
				s = ucs_hff_s_cp936_table[w - 0xFFE0];
			}
		} else if (w >= 0xE000 && w <= 0xE864) {
			/* PUA */
			if (w < 0xE766) {
				if (w < 0xE4C6) {
					/* U+E000-U+E4C5: rows of 94, trail bytes from 0xA1; the first 6 rows
					 * use lead bytes 0xAA-0xAF and the rest use lead bytes from 0xF8 */
					unsigned int c1 = w - 0xE000;
					s = (c1 % 94) + 0xA1;
					c1 /= 94;
					s |= (c1 + (c1 < 0x06 ? 0xAA : 0xF2)) << 8;
				} else {
					/* U+E4C6-U+E765: rows of 96, lead bytes from 0xA1, trail bytes from
					 * 0x40 with 0x7F skipped */
					unsigned int c1 = w - 0xE4C6;
					s = ((c1 / 96) + 0xA1) << 8;
					c1 %= 96;
					s |= c1 + (c1 >= 0x3F ? 0x41 : 0x40);
				}
			} else {
				/* U+E766-U+E864 */
				/* Binary search of mbfl_gb18030_pua_tbl; each entry is used here as
				 * { first codepoint, last codepoint, first byte pair } */
				unsigned int k1 = 0, k2 = mbfl_gb18030_pua_tbl_max;
				while (k1 < k2) {
					unsigned int k = (k1 + k2) >> 1;
					if (w < mbfl_gb18030_pua_tbl[k][0]) {
						k2 = k;
					} else if (w > mbfl_gb18030_pua_tbl[k][1]) {
						k1 = k + 1;
					} else {
						s = w - mbfl_gb18030_pua_tbl[k][0] + mbfl_gb18030_pua_tbl[k][2];
						break;
					}
				}
			}
		}

		/* While GB18030 and CP936 are very similar, some mappings are different between these encodings;
		 * do a binary search in a table of differing codepoints to see if we have one */
		if (!s && w >= mbfl_gb18030_c_tbl_key[0] && w <= mbfl_gb18030_c_tbl_key[mbfl_gb18030_c_tbl_max-1]) {
			int i = mbfl_bisec_srch2(w, mbfl_gb18030_c_tbl_key, mbfl_gb18030_c_tbl_max);
			if (i >= 0) {
				s = mbfl_gb18030_c_tbl_val[i];
			}
		}

		/* If we have not yet found a suitable mapping for this codepoint, it requires a 4-byte code */
		if (!s && w >= 0x80 && w <= 0xFFFF) {
			/* BMP */
			int i = mbfl_bisec_srch(w, mbfl_uni2gb_tbl, mbfl_gb_uni_max);
			if (i >= 0) {
				/* Split the linear index into mixed-radix digits (10, 126, 10, then the
				 * lead byte) and pack all four bytes into `s` */
				unsigned int c1 = w - mbfl_gb_uni_ofst[i];
				s = (c1 % 10) + 0x30;
				c1 /= 10;
				s |= ((c1 % 126) + 0x81) << 8;
				c1 /= 126;
				s |= ((c1 % 10) + 0x30) << 16;
				c1 /= 10;
				s |= (c1 + 0x81) << 24;
			}
		} else if (w >= 0x10000 && w <= 0x10FFFF) {
			/* Code set 3: Unicode U+10000-U+10FFFF */
			unsigned int c1 = w - 0x10000;
			s = (c1 % 10) + 0x30;
			c1 /= 10;
			s |= ((c1 % 126) + 0x81) << 8;
			c1 /= 126;
			s |= ((c1 % 10) + 0x30) << 16;
			c1 /= 10;
			s |= (c1 + 0x90) << 24;
		}

		if (!s) {
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_gb18030);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		} else if (s < 0x80) {
			out = mb_convert_buf_add(out, s);
		} else if (s > 0xFFFFFF) {
			/* A non-zero top byte means a packed 4-byte code */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
			out = mb_convert_buf_add4(out, (s >> 24) & 0xFF, (s >> 16) & 0xFF, (s >> 8) & 0xFF, s & 0xFF);
		} else {
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			out = mb_convert_buf_add2(out, (s >> 8) & 0xFF, s & 0xFF);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
11181 
mbfl_filt_conv_cp936_wchar(int c,mbfl_convert_filter * filter)11182 static int mbfl_filt_conv_cp936_wchar(int c, mbfl_convert_filter *filter)
11183 {
11184 	int k;
11185 	int c1, c2, w = -1;
11186 
11187 	switch (filter->status) {
11188 	case 0:
11189 		if (c >= 0 && c < 0x80) {	/* latin */
11190 			CK((*filter->output_function)(c, filter->data));
11191 		} else if (c == 0x80) {	/* euro sign */
11192 			CK((*filter->output_function)(0x20ac, filter->data));
11193 		} else if (c < 0xff) {	/* dbcs lead byte */
11194 			filter->status = 1;
11195 			filter->cache = c;
11196 		} else { /* 0xff */
11197 			CK((*filter->output_function)(0xf8f5, filter->data));
11198 		}
11199 		break;
11200 
11201 	case 1:		/* dbcs second byte */
11202 		filter->status = 0;
11203 		c1 = filter->cache;
11204 
11205 		if (((c1 >= 0xaa && c1 <= 0xaf) || (c1 >= 0xf8 && c1 <= 0xfe)) &&
11206 			(c >= 0xa1 && c <= 0xfe)) {
11207 			/* UDA part1,2: U+E000-U+E4C5 */
11208 			w = 94*(c1 >= 0xf8 ? c1 - 0xf2 : c1 - 0xaa) + (c - 0xa1) + 0xe000;
11209 			CK((*filter->output_function)(w, filter->data));
11210 		} else if (c1 >= 0xa1 && c1 <= 0xa7 && c >= 0x40 && c < 0xa1 && c != 0x7f) {
11211 			/* UDA part3 : U+E4C6-U+E765*/
11212 			w = 96*(c1 - 0xa1) + c - (c >= 0x80 ? 0x41 : 0x40) + 0xe4c6;
11213 			CK((*filter->output_function)(w, filter->data));
11214 		}
11215 
11216 		c2 = (c1 << 8) | c;
11217 
11218 		if (w <= 0 &&
11219 			((c2 >= 0xa2ab && c2 <= 0xa9f0 + (0xe80f-0xe801)) ||
11220 			 (c2 >= 0xd7fa && c2 <= 0xd7fa + (0xe814-0xe810)) ||
11221 			 (c2 >= 0xfe50 && c2 <= 0xfe80 + (0xe864-0xe844)))) {
11222 			for (k = 0; k < mbfl_cp936_pua_tbl_max; k++) {
11223 				if (c2 >= mbfl_cp936_pua_tbl[k][2] &&
11224 					c2 <= mbfl_cp936_pua_tbl[k][2] +
11225 					mbfl_cp936_pua_tbl[k][1] -  mbfl_cp936_pua_tbl[k][0]) {
11226 					w = c2 -  mbfl_cp936_pua_tbl[k][2] + mbfl_cp936_pua_tbl[k][0];
11227 					CK((*filter->output_function)(w, filter->data));
11228 					break;
11229 				}
11230 			}
11231 		}
11232 
11233 		if (w <= 0) {
11234 			if (c1 < 0xff && c1 > 0x80 && c >= 0x40 && c < 0xff && c != 0x7f) {
11235 				w = (c1 - 0x81)*192 + c - 0x40;
11236 				ZEND_ASSERT(w < cp936_ucs_table_size);
11237 				CK((*filter->output_function)(cp936_ucs_table[w], filter->data));
11238 			} else {
11239 				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
11240 			}
11241 		}
11242 		break;
11243 
11244 		EMPTY_SWITCH_DEFAULT_CASE();
11245 	}
11246 
11247 	return 0;
11248 }
11249 
mbfl_filt_conv_cp936_wchar_flush(mbfl_convert_filter * filter)11250 static int mbfl_filt_conv_cp936_wchar_flush(mbfl_convert_filter *filter)
11251 {
11252 	if (filter->status) {
11253 		/* 2-byte character was truncated */
11254 		filter->status = 0;
11255 		CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
11256 	}
11257 
11258 	if (filter->flush_function) {
11259 		(*filter->flush_function)(filter->data);
11260 	}
11261 
11262 	return 0;
11263 }
11264 
mbfl_filt_conv_wchar_cp936(int c,mbfl_convert_filter * filter)11265 static int mbfl_filt_conv_wchar_cp936(int c, mbfl_convert_filter *filter)
11266 {
11267 	int k, k1, k2;
11268 	int c1, s = 0;
11269 
11270 	if (c >= ucs_a1_cp936_table_min && c < ucs_a1_cp936_table_max) {
11271 		/* U+0000 - U+0451 */
11272 		s = ucs_a1_cp936_table[c - ucs_a1_cp936_table_min];
11273 	} else if (c >= ucs_a2_cp936_table_min && c < ucs_a2_cp936_table_max) {
11274 		/* U+2000 - U+26FF */
11275 		if (c == 0x203e) {
11276 			s = 0xa3fe;
11277 		} else if (c == 0x2218) {
11278 			s = 0xa1e3;
11279 		} else if (c == 0x223c) {
11280 			s = 0xa1ab;
11281 		} else {
11282 			s = ucs_a2_cp936_table[c - ucs_a2_cp936_table_min];
11283 		}
11284 	} else if (c >= ucs_a3_cp936_table_min && c < ucs_a3_cp936_table_max) {
11285 		/* U+2F00 - U+33FF */
11286 		s = ucs_a3_cp936_table[c - ucs_a3_cp936_table_min];
11287 	} else if (c >= ucs_i_cp936_table_min && c < ucs_i_cp936_table_max) {
11288 		/* U+4D00-9FFF CJK Unified Ideographs (+ Extension A) */
11289 		s = ucs_i_cp936_table[c - ucs_i_cp936_table_min];
11290 	} else if (c >= 0xe000 && c <= 0xe864) { /* PUA */
11291 		if (c < 0xe766) {
11292 			if (c < 0xe4c6) {
11293 				c1 = c - 0xe000;
11294 				s = (c1 % 94) + 0xa1; c1 /= 94;
11295 				s |= (c1 < 0x06 ? c1 + 0xaa : c1 + 0xf2) << 8;
11296 			} else {
11297 				c1 = c - 0xe4c6;
11298 				s = ((c1 / 96) + 0xa1) << 8; c1 %= 96;
11299 				s |= c1 + (c1 >= 0x3f ? 0x41 : 0x40);
11300 			}
11301 		} else {
11302 			/* U+E766..U+E864 */
11303 			k1 = 0; k2 = mbfl_cp936_pua_tbl_max;
11304 			while (k1 < k2) {
11305 				k = (k1 + k2) >> 1;
11306 				if (c < mbfl_cp936_pua_tbl[k][0]) {
11307 					k2 = k;
11308 				} else if (c > mbfl_cp936_pua_tbl[k][1]) {
11309 					k1 = k + 1;
11310 				} else {
11311 					s = c - mbfl_cp936_pua_tbl[k][0] + mbfl_cp936_pua_tbl[k][2];
11312 					break;
11313 				}
11314 			}
11315 		}
11316 	} else if (c == 0xf8f5) {
11317 		s = 0xff;
11318 	} else if (c >= ucs_ci_cp936_table_min && c < ucs_ci_cp936_table_max) {
11319 		/* U+F900-FA2F CJK Compatibility Ideographs */
11320 		s = ucs_ci_cp936_table[c - ucs_ci_cp936_table_min];
11321 	} else if (c >= ucs_cf_cp936_table_min && c < ucs_cf_cp936_table_max) {
11322 		s = ucs_cf_cp936_table[c - ucs_cf_cp936_table_min];
11323 	} else if (c >= ucs_sfv_cp936_table_min && c < ucs_sfv_cp936_table_max) {
11324 		s = ucs_sfv_cp936_table[c - ucs_sfv_cp936_table_min]; /* U+FE50-FE6F Small Form Variants */
11325 	} else if (c >= ucs_hff_cp936_table_min && c < ucs_hff_cp936_table_max) {
11326 		/* U+FF00-FFFF HW/FW Forms */
11327 		if (c == 0xff04) {
11328 			s = 0xa1e7;
11329 		} else if (c == 0xff5e) {
11330 			s = 0xa1ab;
11331 		} else if (c >= 0xff01 && c <= 0xff5d) {
11332 			s = c - 0xff01 + 0xa3a1;
11333 		} else if (c >= 0xffe0 && c <= 0xffe5) {
11334 			s = ucs_hff_s_cp936_table[c-0xffe0];
11335 		}
11336 	}
11337 
11338 	if (s <= 0) {
11339 		if (c == 0) {
11340 			s = 0;
11341 		} else if (s <= 0) {
11342 			s = -1;
11343 		}
11344 	}
11345 
11346 	if (s >= 0) {
11347 		if (s <= 0x80 || s == 0xff) {	/* latin */
11348 			CK((*filter->output_function)(s, filter->data));
11349 		} else {
11350 			CK((*filter->output_function)((s >> 8) & 0xff, filter->data));
11351 			CK((*filter->output_function)(s & 0xff, filter->data));
11352 		}
11353 	} else {
11354 		CK(mbfl_filt_conv_illegal_output(c, filter));
11355 	}
11356 
11357 	return 0;
11358 }
11359 
mb_cp936_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)11360 static size_t mb_cp936_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
11361 {
11362 	unsigned char *p = *in, *e = p + *in_len;
11363 	uint32_t *out = buf, *limit = buf + bufsize;
11364 
11365 	while (p < e && out < limit) {
11366 		unsigned char c = *p++;
11367 
11368 		if (c < 0x80) {
11369 			*out++ = c;
11370 		} else if (c == 0x80) {
11371 			*out++ = 0x20AC; /* Euro sign */
11372 		} else if (c < 0xFF) {
11373 			if (p >= e) {
11374 				*out++ = MBFL_BAD_INPUT;
11375 				continue;
11376 			}
11377 
11378 			unsigned char c2 = *p++;
11379 			if (c2 < 0x40 || c2 == 0x7F || c2 == 0xFF) {
11380 				*out++ = MBFL_BAD_INPUT;
11381 				continue;
11382 			}
11383 
11384 			if (((c >= 0xAA && c <= 0xAF) || (c >= 0xF8 && c <= 0xFE)) && c2 >= 0xA1) {
11385 				/* UDA part 1, 2: U+E000-U+E4C5 */
11386 				*out++ = 94*(c >= 0xF8 ? c - 0xF2 : c - 0xAA) + (c2 - 0xA1) + 0xE000;
11387 			} else if (c >= 0xA1 && c <= 0xA7 && c2 < 0xA1) {
11388 				/* UDA part 3: U+E4C6-U+E765*/
11389 				*out++ = 96*(c - 0xA1) + c2 - (c2 >= 0x80 ? 0x41 : 0x40) + 0xE4C6;
11390 			} else {
11391 				unsigned int w = (c - 0x81)*192 + c2 - 0x40; /* Convert c, c2 into GB 2312 table lookup index */
11392 
11393 				/* For CP936 and GB18030, certain GB 2312 byte combinations are mapped to PUA codepoints,
11394 				 * whereas the same combinations aren't mapped to any codepoint for HZ and EUC-CN
11395 				 * To avoid duplicating the entire GB 2312 -> Unicode lookup table, we have three
11396 				 * auxiliary tables which are consulted instead for specific ranges of lookup indices */
11397 				if (w >= 0x192B) {
11398 					if (w <= 0x1EBE) {
11399 						*out++ = cp936_pua_tbl1[w - 0x192B];
11400 						continue;
11401 					} else if (w >= 0x413A) {
11402 						if (w <= 0x413E) {
11403 							*out++ = cp936_pua_tbl2[w - 0x413A];
11404 							continue;
11405 						} else if (w >= 0x5DD0 && w <= 0x5E20) {
11406 							*out++ = cp936_pua_tbl3[w - 0x5DD0];
11407 							continue;
11408 						}
11409 					}
11410 				}
11411 
11412 				ZEND_ASSERT(w < cp936_ucs_table_size);
11413 				*out++ = cp936_ucs_table[w];
11414 			}
11415 		} else {
11416 			*out++ = 0xF8F5;
11417 		}
11418 	}
11419 
11420 	*in_len = e - p;
11421 	*in = p;
11422 	return out - buf;
11423 }
11424 
mb_wchar_to_cp936(uint32_t * in,size_t len,mb_convert_buf * buf,bool end)11425 static void mb_wchar_to_cp936(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
11426 {
11427 	unsigned char *out, *limit;
11428 	MB_CONVERT_BUF_LOAD(buf, out, limit);
11429 	MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
11430 
11431 	while (len--) {
11432 		uint32_t w = *in++;
11433 		unsigned int s = 0;
11434 
11435 		if (w >= ucs_a1_cp936_table_min && w < ucs_a1_cp936_table_max) {
11436 			/* U+0000-U+0451 */
11437 			s = ucs_a1_cp936_table[w - ucs_a1_cp936_table_min];
11438 		} else if (w >= ucs_a2_cp936_table_min && w < ucs_a2_cp936_table_max) {
11439 			/* U+2000-U+26FF */
11440 			if (w == 0x203E) {
11441 				s = 0xA3FE;
11442 			} else if (w == 0x2218) {
11443 				s = 0xA1E3;
11444 			} else if (w == 0x223C) {
11445 				s = 0xA1AB;
11446 			} else {
11447 				s = ucs_a2_cp936_table[w - ucs_a2_cp936_table_min];
11448 			}
11449 		} else if (w >= ucs_a3_cp936_table_min && w < ucs_a3_cp936_table_max) {
11450 			/* U+2F00-U+33FF */
11451 			s = ucs_a3_cp936_table[w - ucs_a3_cp936_table_min];
11452 		} else if (w >= ucs_i_cp936_table_min && w < ucs_i_cp936_table_max) {
11453 			/* U+4D00-9FFF CJK Unified Ideographs (+ Extension A) */
11454 			s = ucs_i_cp936_table[w - ucs_i_cp936_table_min];
11455 		} else if (w >= 0xE000 && w <= 0xE864) {
11456 			/* PUA */
11457 			if (w < 0xe766) {
11458 				if (w < 0xe4c6) {
11459 					unsigned int c1 = w - 0xE000;
11460 					s = (c1 % 94) + 0xA1;
11461 					c1 /= 94;
11462 					s |= (c1 < 0x6 ? c1 + 0xAA : c1 + 0xF2) << 8;
11463 				} else {
11464 					unsigned int c1 = w - 0xE4C6;
11465 					s = ((c1 / 96) + 0xA1) << 8;
11466 					c1 %= 96;
11467 					s |= c1 + (c1 >= 0x3F ? 0x41 : 0x40);
11468 				}
11469 			} else {
11470 				/* U+E766-U+E864 */
11471 				unsigned int k1 = 0;
11472 				unsigned int k2 = mbfl_cp936_pua_tbl_max;
11473 				while (k1 < k2) {
11474 					int k = (k1 + k2) >> 1;
11475 					if (w < mbfl_cp936_pua_tbl[k][0]) {
11476 						k2 = k;
11477 					} else if (w > mbfl_cp936_pua_tbl[k][1]) {
11478 						k1 = k + 1;
11479 					} else {
11480 						s = w - mbfl_cp936_pua_tbl[k][0] + mbfl_cp936_pua_tbl[k][2];
11481 						break;
11482 					}
11483 				}
11484 			}
11485 		} else if (w == 0xF8F5) {
11486 			s = 0xFF;
11487 		} else if (w >= ucs_ci_cp936_table_min && w < ucs_ci_cp936_table_max) {
11488 			/* U+F900-U+FA2F CJK Compatibility Ideographs */
11489 			s = ucs_ci_cp936_table[w - ucs_ci_cp936_table_min];
11490 		} else if (w >= ucs_cf_cp936_table_min && w < ucs_cf_cp936_table_max) {
11491 			s = ucs_cf_cp936_table[w - ucs_cf_cp936_table_min];
11492 		} else if (w >= ucs_sfv_cp936_table_min && w < ucs_sfv_cp936_table_max) {
11493 			/* U+FE50-U+FE6F Small Form Variants */
11494 			s = ucs_sfv_cp936_table[w - ucs_sfv_cp936_table_min];
11495 		} else if (w >= ucs_hff_cp936_table_min && w < ucs_hff_cp936_table_max) {
11496 			/* U+FF00-U+FFFF HW/FW Forms */
11497 			if (w == 0xFF04) {
11498 				s = 0xA1E7;
11499 			} else if (w == 0xFF5E) {
11500 				s = 0xA1AB;
11501 			} else if (w >= 0xFF01 && w <= 0xFF5D) {
11502 				s = w - 0xFF01 + 0xA3A1;
11503 			} else if (w >= 0xFFE0 && w <= 0xFFE5) {
11504 				s = ucs_hff_s_cp936_table[w - 0xFFE0];
11505 			}
11506 		}
11507 
11508 		if (!s) {
11509 			if (w == 0) {
11510 				out = mb_convert_buf_add(out, 0);
11511 			} else {
11512 				MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_cp936);
11513 				MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
11514 			}
11515 		} else if (s <= 0x80 || s == 0xFF) {
11516 			out = mb_convert_buf_add(out, s);
11517 		} else {
11518 			out = mb_convert_buf_add2(out, (s >> 8) & 0xFF, s & 0xFF);
11519 		}
11520 	}
11521 
11522 	MB_CONVERT_BUF_STORE(buf, out, limit);
11523 }
11524 
/* NULL-terminated list of alternative names for GB18030 */
static const char *mbfl_encoding_gb18030_aliases[] = {"gb-18030", "gb-18030-2000", NULL};

/* Legacy filter table for GB18030 -> wchar: the per-byte conversion function together with
 * its dedicated flush function */
static const struct mbfl_convert_vtbl vtbl_gb18030_wchar = {
	mbfl_no_encoding_gb18030,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_gb18030_wchar,
	mbfl_filt_conv_gb18030_wchar_flush,
	NULL,
};

/* Legacy filter table for wchar -> GB18030; uses the common flush function */
static const struct mbfl_convert_vtbl vtbl_wchar_gb18030 = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_gb18030,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_gb18030,
	mbfl_filt_conv_common_flush,
	NULL,
};

/* Encoding descriptor for GB18030: ties together its names, alias list, the two legacy filter
 * tables above and the buffer-based converters mb_gb18030_to_wchar / mb_wchar_to_gb18030.
 * NOTE(review): the NULL in the fifth position is where the CP936 descriptor supplies
 * mblen_table_81_to_fe; presumably no byte-length table is given here because GB18030
 * characters may also be 4 bytes long (confirm against the mbfl_encoding declaration) */
const mbfl_encoding mbfl_encoding_gb18030 = {
	mbfl_no_encoding_gb18030,
	"GB18030",
	"GB18030",
	mbfl_encoding_gb18030_aliases,
	NULL,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_gb18030_wchar,
	&vtbl_wchar_gb18030,
	mb_gb18030_to_wchar,
	mb_wchar_to_gb18030,
	NULL
};
11560 
/* NULL-terminated list of alternative names for CP936 */
static const char *mbfl_encoding_cp936_aliases[] = {"CP-936", "GBK", NULL};

/* Legacy filter table for CP936 -> wchar: the per-byte conversion function together with its
 * flush function, which reports a truncated trailing character */
static const struct mbfl_convert_vtbl vtbl_cp936_wchar = {
	mbfl_no_encoding_cp936,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_cp936_wchar,
	mbfl_filt_conv_cp936_wchar_flush,
	NULL,
};

/* Legacy filter table for wchar -> CP936; uses the common flush function */
static const struct mbfl_convert_vtbl vtbl_wchar_cp936 = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_cp936,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_cp936,
	mbfl_filt_conv_common_flush,
	NULL,
};

/* Encoding descriptor for CP936: ties together its names, alias list, the two legacy filter
 * tables above and the buffer-based converters mb_cp936_to_wchar / mb_wchar_to_cp936.
 * NOTE(review): mblen_table_81_to_fe is presumably a per-lead-byte character length table
 * marking 0x81-0xFE as 2-byte lead bytes; confirm against its definition */
const mbfl_encoding mbfl_encoding_cp936 = {
	mbfl_no_encoding_cp936,
	"CP936",
	"CP936",
	mbfl_encoding_cp936_aliases,
	mblen_table_81_to_fe,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_cp936_wchar,
	&vtbl_wchar_cp936,
	mb_cp936_to_wchar,
	mb_wchar_to_cp936,
	NULL
};
11596 
11597 /*
11598  * BIG5/CP950
11599  */
11600 
/* Mapping between Unicode Private Use Area ranges and CP950 user-defined byte ranges.
 * Each row is { first codepoint, last codepoint, first CP950 code, last CP950 code }.
 * Rows whose first code has trail byte 0x40 use 157 codes per lead byte (63 trail bytes in
 * 0x40-0x7E plus 94 in 0xA1-0xFE); the 0xC6A1-0xC6FE row covers only the upper 94 trail bytes
 * of a single lead byte, so it maps linearly.
 * The table is lookup data that is only read, hence it is declared const. */
static const unsigned short cp950_pua_tbl[][4] = {
	{0xe000, 0xe310, 0xfa40, 0xfefe},
	{0xe311, 0xeeb7, 0x8e40, 0xa0fe},
	{0xeeb8, 0xf6b0, 0x8140, 0x8dfe},
	{0xf6b1, 0xf70e, 0xc6a1, 0xc6fe},
	{0xf70f, 0xf848, 0xc740, 0xc8fe},
};
11609 
is_in_cp950_pua(int c1,int c)11610 static inline int is_in_cp950_pua(int c1, int c)
11611 {
11612 	if ((c1 >= 0xfa && c1 <= 0xfe) || (c1 >= 0x8e && c1 <= 0xa0) || (c1 >= 0x81 && c1 <= 0x8d) || (c1 >= 0xc7 && c1 <= 0xc8)) {
11613 		return (c >= 0x40 && c <= 0x7e) || (c >= 0xa1 && c <= 0xfe);
11614 	} else if (c1 == 0xc6) {
11615 		return c >= 0xa1 && c <= 0xfe;
11616 	}
11617 	return 0;
11618 }
11619 
mbfl_filt_conv_big5_wchar(int c,mbfl_convert_filter * filter)11620 static int mbfl_filt_conv_big5_wchar(int c, mbfl_convert_filter *filter)
11621 {
11622 	int k, c1, w;
11623 
11624 	switch (filter->status) {
11625 	case 0:
11626 		if (c >= 0 && c < 0x80) { /* latin */
11627 			CK((*filter->output_function)(c, filter->data));
11628 		} else if (filter->from->no_encoding != mbfl_no_encoding_cp950 && c > 0xA0 && c <= 0xF9 && c != 0xC8) {
11629 			filter->status = 1;
11630 			filter->cache = c;
11631 		} else if (filter->from->no_encoding == mbfl_no_encoding_cp950 && c > 0x80 && c <= 0xFE) {
11632 			filter->status = 1;
11633 			filter->cache = c;
11634 		} else {
11635 			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
11636 		}
11637 		break;
11638 
11639 	case 1: /* dbcs second byte */
11640 		filter->status = 0;
11641 		c1 = filter->cache;
11642 		if ((c > 0x3f && c < 0x7f) || (c > 0xa0 && c < 0xff)) {
11643 			if (c < 0x7f) {
11644 				w = (c1 - 0xa1)*157 + (c - 0x40);
11645 			} else {
11646 				w = (c1 - 0xa1)*157 + (c - 0xa1) + 0x3f;
11647 			}
11648 			if (w >= 0 && w < big5_ucs_table_size) {
11649 				w = big5_ucs_table[w];
11650 			} else {
11651 				w = 0;
11652 			}
11653 
11654 			if (filter->from->no_encoding == mbfl_no_encoding_cp950) {
11655 				/* PUA for CP950 */
11656 				if (is_in_cp950_pua(c1, c)) {
11657 					int c2 = (c1 << 8) | c;
11658 
11659 					for (k = 0; k < sizeof(cp950_pua_tbl) / (sizeof(unsigned short)*4); k++) {
11660 						if (c2 >= cp950_pua_tbl[k][2] && c2 <= cp950_pua_tbl[k][3]) {
11661 							break;
11662 						}
11663 					}
11664 
11665 					if ((cp950_pua_tbl[k][2] & 0xff) == 0x40) {
11666 						w = 157*(c1 - (cp950_pua_tbl[k][2]>>8)) + c - (c >= 0xa1 ? 0x62 : 0x40) + cp950_pua_tbl[k][0];
11667 					} else {
11668 						w = c2 - cp950_pua_tbl[k][2] + cp950_pua_tbl[k][0];
11669 					}
11670 				} else if (c1 == 0xA1) {
11671 					if (c == 0x45) {
11672 						w = 0x2027;
11673 					} else if (c == 0x4E) {
11674 						w = 0xFE51;
11675 					} else if (c == 0x5A) {
11676 						w = 0x2574;
11677 					} else if (c == 0xC2) {
11678 						w = 0x00AF;
11679 					} else if (c == 0xC3) {
11680 						w = 0xFFE3;
11681 					} else if (c == 0xC5) {
11682 						w = 0x02CD;
11683 					} else if (c == 0xE3) {
11684 						w = 0xFF5E;
11685 					} else if (c == 0xF2) {
11686 						w = 0x2295;
11687 					} else if (c == 0xF3) {
11688 						w = 0x2299;
11689 					} else if (c == 0xFE) {
11690 						w = 0xFF0F;
11691 					}
11692 				} else if (c1 == 0xA2) {
11693 					if (c == 0x40) {
11694 						w = 0xFF3C;
11695 					} else if (c == 0x41) {
11696 						w = 0x2215;
11697 					} else if (c == 0x42) {
11698 						w = 0xFE68;
11699 					} else if (c == 0x46) {
11700 						w = 0xFFE0;
11701 					} else if (c == 0x47) {
11702 						w = 0xFFE1;
11703 					} else if (c == 0xCC) {
11704 						w = 0x5341;
11705 					} else if (c == 0xCE) {
11706 						w = 0x5345;
11707 					}
11708 				}
11709 			}
11710 
11711 			if (w <= 0) {
11712 				w = MBFL_BAD_INPUT;
11713 			}
11714 			CK((*filter->output_function)(w, filter->data));
11715 		} else {
11716 			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
11717 		}
11718 		break;
11719 
11720 		EMPTY_SWITCH_DEFAULT_CASE();
11721 	}
11722 
11723 	return 0;
11724 }
11725 
mbfl_filt_conv_big5_wchar_flush(mbfl_convert_filter * filter)11726 static int mbfl_filt_conv_big5_wchar_flush(mbfl_convert_filter *filter)
11727 {
11728 	if (filter->status == 1) {
11729 		/* 2-byte character was truncated */
11730 		filter->status = 0;
11731 		CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
11732 	}
11733 
11734 	if (filter->flush_function) {
11735 		(*filter->flush_function)(filter->data);
11736 	}
11737 
11738 	return 0;
11739 }
11740 
mbfl_filt_conv_wchar_big5(int c,mbfl_convert_filter * filter)11741 static int mbfl_filt_conv_wchar_big5(int c, mbfl_convert_filter *filter)
11742 {
11743 	int k, s = 0;
11744 
11745 	if (c >= ucs_a1_big5_table_min && c < ucs_a1_big5_table_max) {
11746 		s = ucs_a1_big5_table[c - ucs_a1_big5_table_min];
11747 	} else if (c >= ucs_a2_big5_table_min && c < ucs_a2_big5_table_max) {
11748 		s = ucs_a2_big5_table[c - ucs_a2_big5_table_min];
11749 	} else if (c >= ucs_a3_big5_table_min && c < ucs_a3_big5_table_max) {
11750 		s = ucs_a3_big5_table[c - ucs_a3_big5_table_min];
11751 	} else if (c >= ucs_i_big5_table_min && c < ucs_i_big5_table_max) {
11752 		s = ucs_i_big5_table[c - ucs_i_big5_table_min];
11753 	} else if (c >= ucs_r1_big5_table_min && c < ucs_r1_big5_table_max) {
11754 		s = ucs_r1_big5_table[c - ucs_r1_big5_table_min];
11755 	} else if (c >= ucs_r2_big5_table_min && c < ucs_r2_big5_table_max) {
11756 		s = ucs_r2_big5_table[c - ucs_r2_big5_table_min];
11757 	}
11758 
11759 	if (filter->to->no_encoding == mbfl_no_encoding_cp950) {
11760 		if (c >= 0xe000 && c <= 0xf848) { /* PUA for CP950 */
11761 			for (k = 0; k < sizeof(cp950_pua_tbl) / (sizeof(unsigned short)*4); k++) {
11762 				if (c <= cp950_pua_tbl[k][1]) {
11763 					break;
11764 				}
11765 			}
11766 
11767 			int c1 = c - cp950_pua_tbl[k][0];
11768 			if ((cp950_pua_tbl[k][2] & 0xff) == 0x40) {
11769 				int c2 = cp950_pua_tbl[k][2] >> 8;
11770 				s = ((c1 / 157) + c2) << 8;
11771 				c1 %= 157;
11772 				s |= c1 + (c1 >= 0x3f ? 0x62 : 0x40);
11773 			} else {
11774 				s = c1 + cp950_pua_tbl[k][2];
11775 			}
11776 		} else if (c == 0x00A2) {
11777 			s = 0;
11778 		} else if (c == 0x00A3) {
11779 			s = 0;
11780 		} else if (c == 0x00AF) {
11781 			s = 0xA1C2;
11782 		} else if (c == 0x02CD) {
11783 			s = 0xA1C5;
11784 		} else if (c == 0x0401) {
11785 			s = 0;
11786 		} else if (c >= 0x0414 && c <= 0x041C) {
11787 			s = 0;
11788 		} else if (c >= 0x0423 && c <= 0x044F) {
11789 			s = 0;
11790 		} else if (c == 0x0451) {
11791 			s = 0;
11792 		} else if (c == 0x2022) {
11793 			s = 0;
11794 		} else if (c == 0x2027) {
11795 			s = 0xA145;
11796 		} else if (c == 0x203E) {
11797 			s = 0;
11798 		} else if (c == 0x2215) {
11799 			s = 0xA241;
11800 		} else if (c == 0x223C) {
11801 			s = 0;
11802 		} else if (c == 0x2295) {
11803 			s = 0xA1F2;
11804 		} else if (c == 0x2299) {
11805 			s = 0xA1F3;
11806 		} else if (c >= 0x2460 && c <= 0x247D) {
11807 			s = 0;
11808 		} else if (c == 0x2574) {
11809 			s = 0xA15A;
11810 		} else if (c == 0x2609) {
11811 			s = 0;
11812 		} else if (c == 0x2641) {
11813 			s = 0;
11814 		} else if (c == 0x3005 || (c >= 0x302A && c <= 0x30FF)) {
11815 			s = 0;
11816 		} else if (c == 0xFE51) {
11817 			s = 0xA14E;
11818 		} else if (c == 0xFE68) {
11819 			s = 0xA242;
11820 		} else if (c == 0xFF3C) {
11821 			s = 0xA240;
11822 		} else if (c == 0xFF5E) {
11823 			s = 0xA1E3;
11824 		} else if (c == 0xFF64) {
11825 			s = 0;
11826 		} else if (c == 0xFFE0) {
11827 			s = 0xA246;
11828 		} else if (c == 0xFFE1) {
11829 			s = 0xA247;
11830 		} else if (c == 0xFFE3) {
11831 			s = 0xA1C3;
11832 		} else if (c == 0xFF0F) {
11833 			s = 0xA1FE;
11834 		}
11835 	}
11836 
11837 	if (s <= 0) {
11838 		if (c == 0) {
11839 			s = 0;
11840 		} else {
11841 			s = -1;
11842 		}
11843 	}
11844 
11845 	if (s >= 0) {
11846 		if (s <= 0x80) { /* latin */
11847 			CK((*filter->output_function)(s, filter->data));
11848 		} else {
11849 			CK((*filter->output_function)((s >> 8) & 0xff, filter->data));
11850 			CK((*filter->output_function)(s & 0xff, filter->data));
11851 		}
11852 	} else {
11853 		CK(mbfl_filt_conv_illegal_output(c, filter));
11854 	}
11855 
11856 	return 0;
11857 }
11858 
mb_big5_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)11859 static size_t mb_big5_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
11860 {
11861 	unsigned char *p = *in, *e = p + *in_len;
11862 	uint32_t *out = buf, *limit = buf + bufsize;
11863 
11864 	e--; /* Stop the main loop 1 byte short of the end of the input */
11865 
11866 	while (p < e && out < limit) {
11867 		unsigned char c = *p++;
11868 
11869 		if (c <= 0x7F) {
11870 			*out++ = c;
11871 		} else if (c > 0xA0 && c <= 0xF9) {
11872 			/* We don't need to check p < e here; it's not possible that this pointer dereference
11873 			 * will be outside the input string, because of e-- above */
11874 			unsigned char c2 = *p++;
11875 
11876 			if ((c2 >= 0x40 && c2 <= 0x7E) || (c2 >= 0xA1 && c2 <= 0xFE)) {
11877 				unsigned int w = (c - 0xA1)*157 + c2 - ((c2 <= 0x7E) ? 0x40 : 0xA1 - 0x3F);
11878 				ZEND_ASSERT(w < big5_ucs_table_size);
11879 				w = big5_ucs_table[w];
11880 				if (!w) {
11881 					if (c == 0xC8) {
11882 						p--;
11883 					}
11884 					w = MBFL_BAD_INPUT;
11885 				}
11886 				*out++ = w;
11887 			} else {
11888 				*out++ = MBFL_BAD_INPUT;
11889 			}
11890 		} else {
11891 			*out++ = MBFL_BAD_INPUT;
11892 		}
11893 	}
11894 
11895 	/* Finish up last byte of input string if there is one */
11896 	if (p == e && out < limit) {
11897 		unsigned char c = *p++;
11898 		*out++ = (c <= 0x7F) ? c : MBFL_BAD_INPUT;
11899 	}
11900 
11901 	*in_len = e - p + 1;
11902 	*in = p;
11903 	return out - buf;
11904 }
11905 
mb_wchar_to_big5(uint32_t * in,size_t len,mb_convert_buf * buf,bool end)11906 static void mb_wchar_to_big5(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
11907 {
11908 	unsigned char *out, *limit;
11909 	MB_CONVERT_BUF_LOAD(buf, out, limit);
11910 	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
11911 
11912 	while (len--) {
11913 		uint32_t w = *in++;
11914 		unsigned int s = 0;
11915 
11916 		if (w >= ucs_a1_big5_table_min && w < ucs_a1_big5_table_max) {
11917 			s = ucs_a1_big5_table[w - ucs_a1_big5_table_min];
11918 		} else if (w >= ucs_a2_big5_table_min && w < ucs_a2_big5_table_max) {
11919 			s = ucs_a2_big5_table[w - ucs_a2_big5_table_min];
11920 		} else if (w >= ucs_a3_big5_table_min && w < ucs_a3_big5_table_max) {
11921 			s = ucs_a3_big5_table[w - ucs_a3_big5_table_min];
11922 		} else if (w >= ucs_i_big5_table_min && w < ucs_i_big5_table_max) {
11923 			s = ucs_i_big5_table[w - ucs_i_big5_table_min];
11924 		} else if (w >= ucs_r1_big5_table_min && w < ucs_r1_big5_table_max) {
11925 			s = ucs_r1_big5_table[w - ucs_r1_big5_table_min];
11926 		} else if (w >= ucs_r2_big5_table_min && w < ucs_r2_big5_table_max) {
11927 			s = ucs_r2_big5_table[w - ucs_r2_big5_table_min];
11928 		}
11929 
11930 		if (!s) {
11931 			if (w == 0) {
11932 				out = mb_convert_buf_add(out, 0);
11933 			} else {
11934 				MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_big5);
11935 				MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
11936 			}
11937 		} else if (s <= 0x80) {
11938 			out = mb_convert_buf_add(out, s);
11939 		} else {
11940 			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
11941 			out = mb_convert_buf_add2(out, (s >> 8) & 0xFF, s & 0xFF);
11942 		}
11943 	}
11944 
11945 	MB_CONVERT_BUF_STORE(buf, out, limit);
11946 }
11947 
mb_cp950_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)11948 static size_t mb_cp950_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
11949 {
11950 	unsigned char *p = *in, *e = p + *in_len;
11951 	uint32_t *out = buf, *limit = buf + bufsize;
11952 
11953 	while (p < e && out < limit) {
11954 		unsigned char c = *p++;
11955 
11956 		if (c <= 0x7F) {
11957 			*out++ = c;
11958 		} else if (c > 0x80 && c <= 0xFE && p < e) {
11959 			unsigned char c2 = *p++;
11960 
11961 			if ((c2 >= 0x40 && c2 <= 0x7E) || (c2 >= 0xA1 && c2 <= 0xFE)) {
11962 				unsigned int w = ((c - 0xA1)*157) + c2 - ((c2 <= 0x7E) ? 0x40 : 0xA1 - 0x3F);
11963 				w = (w < big5_ucs_table_size) ? big5_ucs_table[w] : 0;
11964 
11965 				/* PUA for CP950 */
11966 				if (is_in_cp950_pua(c, c2)) {
11967 					unsigned int s = (c << 8) | c2;
11968 
11969 					int k;
11970 					for (k = 0; k < sizeof(cp950_pua_tbl) / (sizeof(unsigned short)*4); k++) {
11971 						if (s >= cp950_pua_tbl[k][2] && s <= cp950_pua_tbl[k][3]) {
11972 							break;
11973 						}
11974 					}
11975 
11976 					if ((cp950_pua_tbl[k][2] & 0xFF) == 0x40) {
11977 						w = 157*(c - (cp950_pua_tbl[k][2] >> 8)) + c2 - (c2 >= 0xA1 ? 0x62 : 0x40) + cp950_pua_tbl[k][0];
11978 					} else {
11979 						w = s - cp950_pua_tbl[k][2] + cp950_pua_tbl[k][0];
11980 					}
11981 				} else if (c == 0xA1) {
11982 					if (c2 == 0x45) {
11983 						w = 0x2027;
11984 					} else if (c2 == 0x4E) {
11985 						w = 0xFE51;
11986 					} else if (c2 == 0x5A) {
11987 						w = 0x2574;
11988 					} else if (c2 == 0xC2) {
11989 						w = 0x00AF;
11990 					} else if (c2 == 0xC3) {
11991 						w = 0xFFE3;
11992 					} else if (c2 == 0xC5) {
11993 						w = 0x02CD;
11994 					} else if (c2 == 0xE3) {
11995 						w = 0xFF5E;
11996 					} else if (c2 == 0xF2) {
11997 						w = 0x2295;
11998 					} else if (c2 == 0xF3) {
11999 						w = 0x2299;
12000 					} else if (c2 == 0xFE) {
12001 						w = 0xFF0F;
12002 					}
12003 				} else if (c == 0xA2) {
12004 					if (c2 == 0x40) {
12005 						w = 0xFF3C;
12006 					} else if (c2 == 0x41) {
12007 						w = 0x2215;
12008 					} else if (c2 == 0x42) {
12009 						w = 0xFE68;
12010 					} else if (c2 == 0x46) {
12011 						w = 0xFFE0;
12012 					} else if (c2 == 0x47) {
12013 						w = 0xFFE1;
12014 					} else if (c2 == 0xCC) {
12015 						w = 0x5341;
12016 					} else if (c2 == 0xCE) {
12017 						w = 0x5345;
12018 					}
12019 				}
12020 
12021 				if (!w)
12022 					w = MBFL_BAD_INPUT;
12023 				*out++ = w;
12024 			} else {
12025 				*out++ = MBFL_BAD_INPUT;
12026 			}
12027 		} else {
12028 			*out++ = MBFL_BAD_INPUT;
12029 		}
12030 	}
12031 
12032 	*in_len = e - p;
12033 	*in = p;
12034 	return out - buf;
12035 }
12036 
/*
 * Encode Unicode codepoints as CP950 and append the resulting bytes to `buf`.
 *
 * in:  codepoints to convert
 * len: number of codepoints available at `in`
 * buf: conversion buffer receiving the output
 * end: not consulted by this function
 *
 * Codepoints with no CP950 representation are routed through MB_CONVERT_ERROR,
 * except U+0000, which is written as a single zero byte.
 */
static void mb_wchar_to_cp950(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	while (len--) {
		uint32_t w = *in++;
		unsigned int s = 0;

		/* Start from the shared Big5 tables; the CP950-specific rules below may override the result */
		if (w >= ucs_a1_big5_table_min && w < ucs_a1_big5_table_max) {
			s = ucs_a1_big5_table[w - ucs_a1_big5_table_min];
		} else if (w >= ucs_a2_big5_table_min && w < ucs_a2_big5_table_max) {
			s = ucs_a2_big5_table[w - ucs_a2_big5_table_min];
		} else if (w >= ucs_a3_big5_table_min && w < ucs_a3_big5_table_max) {
			s = ucs_a3_big5_table[w - ucs_a3_big5_table_min];
		} else if (w >= ucs_i_big5_table_min && w < ucs_i_big5_table_max) {
			s = ucs_i_big5_table[w - ucs_i_big5_table_min];
		} else if (w >= ucs_r1_big5_table_min && w < ucs_r1_big5_table_max) {
			s = ucs_r1_big5_table[w - ucs_r1_big5_table_min];
		} else if (w >= ucs_r2_big5_table_min && w < ucs_r2_big5_table_max) {
			s = ucs_r2_big5_table[w - ucs_r2_big5_table_min];
		}

		if (w >= 0xE000 && w <= 0xF848) {
			/* Range handled arithmetically via cp950_pua_tbl. Each row is used as
			 * { first codepoint, last codepoint, starting byte pair }.
			 * NOTE(review): the lookup assumes some row's last codepoint is >= 0xF848,
			 * otherwise `k` runs past the table -- confirm against the table definition */
			size_t k;
			for (k = 0; k < sizeof(cp950_pua_tbl) / (sizeof(unsigned short)*4); k++) {
				if (w <= cp950_pua_tbl[k][1]) {
					break;
				}
			}

			int c1 = w - cp950_pua_tbl[k][0];
			if ((cp950_pua_tbl[k][2] & 0xFF) == 0x40) {
				/* Row starts at trail byte 0x40: spread the offset over lead bytes,
				 * 157 trail positions each (0x40.. for the first 0x3F, then 0xA1..) */
				int c2 = cp950_pua_tbl[k][2] >> 8;
				s = ((c1 / 157) + c2) << 8;
				c1 %= 157;
				s |= c1 + (c1 >= 0x3F ? 0x62 : 0x40);
			} else {
				/* Otherwise the offset is simply added to the starting byte pair */
				s = c1 + cp950_pua_tbl[k][2];
			}
		} else if (w == 0xA2 || w == 0xA3 || w == 0x401 || (w >= 0x414 && w <= 0x41C) || (w >= 0x423 && w <= 0x44F) || w == 0x451 || w == 0x2022 || w == 0x203E || w == 0x223C || (w >= 0x2460 && w <= 0x247D) || w == 0x2609 || w == 0x2641 || w == 0x3005 || (w >= 0x302A && w <= 0x30FF) || w == 0xFF64) {
			/* Discard whatever the shared tables returned for these codepoints */
			s = 0;
		} else if (w == 0xAF) {
			/* From here on: fixed byte pairs that replace the table result */
			s = 0xA1C2;
		} else if (w == 0x2CD) {
			s = 0xA1C5;
		} else if (w == 0x2027) {
			s = 0xA145;
		} else if (w == 0x2215) {
			s = 0xA241;
		} else if (w == 0x2295) {
			s = 0xA1F2;
		} else if (w == 0x2299) {
			s = 0xA1F3;
		} else if (w == 0x2574) {
			s = 0xA15A;
		} else if (w == 0xFE51) {
			s = 0xA14E;
		} else if (w == 0xFE68) {
			s = 0xA242;
		} else if (w == 0xFF3C) {
			s = 0xA240;
		} else if (w == 0xFF5E) {
			s = 0xA1E3;
		} else if (w == 0xFFE0) {
			s = 0xA246;
		} else if (w == 0xFFE1) {
			s = 0xA247;
		} else if (w == 0xFFE3) {
			s = 0xA1C3;
		} else if (w == 0xFF0F) {
			s = 0xA1FE;
		}

		if (!s) {
			if (w == 0) {
				out = mb_convert_buf_add(out, 0);
			} else {
				/* Hand the error path this encoder rather than the plain Big5 one, so
				 * anything it emits through the callback follows the CP950 mappings.
				 * NOTE(review): exact use of the callback is defined by MB_CONVERT_ERROR,
				 * which is declared elsewhere */
				MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_cp950);
				/* Re-establish room for the codepoints still to be processed */
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
			}
		} else if (s <= 0x80) {
			/* Single-byte result */
			out = mb_convert_buf_add(out, s);
		} else {
			/* Two-byte result: lead byte in the high 8 bits, trail byte in the low 8 bits */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			out = mb_convert_buf_add2(out, (s >> 8) & 0xFF, s & 0xFF);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
12129 
/* NULL-terminated list of alternative names, referenced from mbfl_encoding_big5 */
static const char *mbfl_encoding_big5_aliases[] = {"CN-BIG5", "BIG-FIVE", "BIGFIVE", NULL};
12131 
/* Filter table pairing mbfl_no_encoding_big5 with mbfl_no_encoding_wchar; it installs
 * mbfl_filt_conv_big5_wchar and mbfl_filt_conv_big5_wchar_flush.
 * NOTE(review): initializers are positional; the meaning of each slot (including the
 * two NULL entries) is defined by struct mbfl_convert_vtbl, declared elsewhere */
static const struct mbfl_convert_vtbl vtbl_big5_wchar = {
	mbfl_no_encoding_big5,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_big5_wchar,
	mbfl_filt_conv_big5_wchar_flush,
	NULL,
};
12141 
/* Filter table pairing mbfl_no_encoding_wchar with mbfl_no_encoding_big5; it installs
 * mbfl_filt_conv_wchar_big5 together with the common flush routine.
 * NOTE(review): initializers are positional; slot meanings come from
 * struct mbfl_convert_vtbl, declared elsewhere */
static const struct mbfl_convert_vtbl vtbl_wchar_big5 = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_big5,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_big5,
	mbfl_filt_conv_common_flush,
	NULL
};
12151 
/* Encoding descriptor for Big5: bundles its names and alias list, the
 * mblen_table_81_to_fe table, the two filter tables above and the
 * mb_big5_to_wchar / mb_wchar_to_big5 converters.
 * NOTE(review): initializers are positional; slot meanings (including the trailing
 * NULL) come from the mbfl_encoding type, declared elsewhere */
const mbfl_encoding mbfl_encoding_big5 = {
	mbfl_no_encoding_big5,
	"BIG-5",
	"BIG5",
	mbfl_encoding_big5_aliases,
	mblen_table_81_to_fe,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_big5_wchar,
	&vtbl_wchar_big5,
	mb_big5_to_wchar,
	mb_wchar_to_big5,
	NULL
};
12165 
/* Filter table pairing mbfl_no_encoding_cp950 with mbfl_no_encoding_wchar. It reuses
 * the same filter and flush functions as vtbl_big5_wchar (mbfl_filt_conv_big5_wchar*);
 * only the encoding identifier differs.
 * NOTE(review): initializers are positional; slot meanings come from
 * struct mbfl_convert_vtbl, declared elsewhere */
static const struct mbfl_convert_vtbl vtbl_cp950_wchar = {
	mbfl_no_encoding_cp950,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_big5_wchar,
	mbfl_filt_conv_big5_wchar_flush,
	NULL,
};
12175 
/* Filter table pairing mbfl_no_encoding_wchar with mbfl_no_encoding_cp950. It reuses
 * mbfl_filt_conv_wchar_big5, the same filter function installed by vtbl_wchar_big5;
 * only the encoding identifier differs.
 * NOTE(review): initializers are positional; slot meanings come from
 * struct mbfl_convert_vtbl, declared elsewhere */
static const struct mbfl_convert_vtbl vtbl_wchar_cp950 = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_cp950,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_big5,
	mbfl_filt_conv_common_flush,
	NULL,
};
12185 
/* Encoding descriptor for CP950: bundles its names, the mblen_table_81_to_fe table,
 * the CP950 filter tables and the mb_cp950_to_wchar / mb_wchar_to_cp950 converters.
 * The second string is "BIG5", the same value used in mbfl_encoding_big5, and no
 * alias list is supplied (NULL).
 * NOTE(review): initializers are positional; slot meanings come from the
 * mbfl_encoding type, declared elsewhere */
const mbfl_encoding mbfl_encoding_cp950 = {
	mbfl_no_encoding_cp950,
	"CP950",
	"BIG5",
	NULL,
	mblen_table_81_to_fe,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_cp950_wchar,
	&vtbl_wchar_cp950,
	mb_cp950_to_wchar,
	mb_wchar_to_cp950,
	NULL
};
12199 
12200 /*
12201  * HZ
12202  */
12203 
/*
 * Byte-at-a-time HZ decoder: consumes one input byte `c` and forwards any decoded
 * codepoint to filter->output_function.
 *
 * filter->status layout as used here:
 *   low nibble 0 - at the start of a unit
 *   low nibble 1 - lead byte stored in filter->cache, trail byte expected
 *   low nibble 2 - '~' seen, escape byte expected
 *   0x10 bit     - GB2312 mode is active (clear means ASCII mode)
 *
 * Returns 0; CK is presumed to return early if the output function reports failure.
 */
static int mbfl_filt_conv_hz_wchar(int c, mbfl_convert_filter *filter)
{
	switch (filter->status & 0xf) {
	case 0:
		if (c == '~') {
			/* Escape introducer: decide what to do once the next byte arrives */
			filter->status += 2;
			break;
		}
		if (filter->status == 0x10 && ((c > 0x20 && c <= 0x29) || (c >= 0x30 && c <= 0x77))) {
			/* Acceptable lead byte while in GB2312 mode: remember it */
			filter->cache = c;
			filter->status += 1;
			break;
		}
		if (filter->status == 0 && c >= 0 && c < 0x80) {
			/* ASCII mode: 7-bit values pass through unchanged */
			CK((*filter->output_function)(c, filter->data));
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 1: {
		/* Trail byte of a two-byte GB2312 character */
		int lead = filter->cache;
		int ucs = 0;

		filter->status &= ~0xf;

		if (lead > 0x20 && lead < 0x7F && c > 0x20 && c < 0x7F) {
			int idx = (lead - 1)*192 + c + 0x40; /* index into cp936_ucs_table */
			ZEND_ASSERT(idx < cp936_ucs_table_size);

			switch (idx) {
			case 0x1864:
				ucs = 0x30FB;
				break;
			case 0x186A:
				ucs = 0x2015;
				break;
			case 0x186C:
				ucs = 0x2225;
				break;
			default:
				/* These index ranges are treated as unmapped; everything else comes from the table */
				if (!((idx >= 0x1920 && idx <= 0x192A) || idx == 0x1963 || (idx >= 0x1C60 && idx <= 0x1C7F) || (idx >= 0x1DBB && idx <= 0x1DC4))) {
					ucs = cp936_ucs_table[idx];
				}
				break;
			}
		}

		/* Covers both an out-of-range byte pair and a pair with no mapping */
		if (ucs <= 0) {
			ucs = MBFL_BAD_INPUT;
		}
		CK((*filter->output_function)(ucs, filter->data));
		break;
	}

	case 2: {
		/* Byte following '~' */
		int was_gb2312 = (filter->status == 0x12);
		int was_ascii = (filter->status == 2);

		if (was_gb2312 && c == '}') {
			/* "~}" leaves GB2312 mode */
			filter->status = 0;
		} else if (was_ascii && c == '{') {
			/* "~{" enters GB2312 mode */
			filter->status = 0x10;
		} else if (was_ascii && c == '~') {
			/* "~~" stands for a literal tilde */
			CK((*filter->output_function)('~', filter->data));
			filter->status -= 2;
		} else {
			filter->status -= 2;
			/* "~\n" is a line continuation and produces nothing; any other byte is invalid here */
			if (c != '\n') {
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			}
		}
		break;
	}

		EMPTY_SWITCH_DEFAULT_CASE();
	}

	return 0;
}
12278 
/*
 * Finish HZ decoding: report a dangling lead byte, reset the decoder state and
 * pass the flush on to the next stage when one is registered. Returns 0.
 */
static int mbfl_filt_conv_hz_wchar_flush(mbfl_convert_filter *filter)
{
	/* Status 0x11: a GB2312 lead byte was stored but its trail byte never arrived */
	int lead_pending = (filter->status == 0x11);

	if (lead_pending) {
		CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
	}
	filter->status = 0;

	if (!filter->flush_function) {
		return 0;
	}
	(*filter->flush_function)(filter->data);
	return 0;
}
12294 
/*
 * Codepoint-at-a-time HZ encoder: maps `c` to ASCII or a GB2312 byte pair and
 * writes it through filter->output_function, emitting "~{" / "~}" whenever the
 * output mode has to change. filter->status is 0 in ASCII mode and 0x200 in
 * GB2312 mode. Codepoints without a mapping go to mbfl_filt_conv_illegal_output.
 * Returns 0; CK is presumed to return early if an output call reports failure.
 */
static int mbfl_filt_conv_wchar_hz(int c, mbfl_convert_filter *filter)
{
	int code = 0; /* stays 0 when no mapping is found */

	if (c >= ucs_a1_cp936_table_min && c < ucs_a1_cp936_table_max) {
		switch (c) {
		case 0xB7: case 0x144: case 0x148: case 0x251:
		case 0x261: case 0x2CA: case 0x2CB: case 0x2D9:
			/* Present in the CP936 table but not used for HZ */
			break;
		default:
			code = ucs_a1_cp936_table[c - ucs_a1_cp936_table_min];
			break;
		}
	} else if (c >= ucs_a2_cp936_table_min && c < ucs_a2_cp936_table_max) {
		if (c == 0x2015) {
			code = 0xA1AA;
		} else {
			/* Codepoints whose CP936 table entry is not used for HZ */
			int excluded =
				c == 0x2010 || c == 0x2013 || c == 0x2014 || c == 0x2016 || c == 0x2025 || c == 0x2035 ||
				c == 0x2105 || c == 0x2109 || c == 0x2121 || (c >= 0x2170 && c <= 0x2179) || (c >= 0x2196 && c <= 0x2199) ||
				c == 0x2215 || c == 0x221F || c == 0x2223 || c == 0x2252 || c == 0x2266 || c == 0x2267 || c == 0x2295 ||
				(c >= 0x2550 && c <= 0x2573) || c == 0x22BF || c == 0x2609 || (c >= 0x2581 && c <= 0x258F) ||
				(c >= 0x2593 && c <= 0x2595) || c == 0x25BC || c == 0x25BD || (c >= 0x25E2 && c <= 0x25E5);
			if (!excluded) {
				code = ucs_a2_cp936_table[c - ucs_a2_cp936_table_min];
			}
		}
	} else if (c >= ucs_a3_cp936_table_min && c < ucs_a3_cp936_table_max) {
		if (c == 0x30FB) {
			code = 0xA1A4;
		} else {
			int excluded =
				c == 0x3006 || c == 0x3007 || c == 0x3012 || c == 0x3231 || c == 0x32A3 || c >= 0x3300 ||
				(c >= 0x3018 && c <= 0x3040) || (c >= 0x309B && c <= 0x309E) || (c >= 0x30FC && c <= 0x30FE);
			if (!excluded) {
				code = ucs_a3_cp936_table[c - ucs_a3_cp936_table_min];
			}
		}
	} else if (c >= ucs_i_gb2312_table_min && c < ucs_i_gb2312_table_max) {
		code = ucs_i_gb2312_table[c - ucs_i_gb2312_table_min];
	} else if (c >= ucs_hff_cp936_table_min && c < ucs_hff_cp936_table_max) {
		if (c == 0xFF04) {
			code = 0xA1E7;
		} else if (c == 0xFF5E) {
			code = 0xA1AB;
		} else if (c >= 0xFF01 && c <= 0xFF5D) {
			code = c - 0xFF01 + 0xA3A1;
		} else if (c == 0xFFE0 || c == 0xFFE1 || c == 0xFFE3 || c == 0xFFE5) {
			code = ucs_hff_s_cp936_table[c - 0xFFE0];
		}
	}

	/* Drop the high bit of both bytes to obtain the 7-bit form written to the stream */
	if (code & 0x8000) {
		code -= 0x8080;
	}

	if (code <= 0) {
		/* No mapping: only U+0000 itself may be written out (as a zero byte) */
		if (c != 0) {
			CK(mbfl_filt_conv_illegal_output(c, filter));
			return 0;
		}
		code = 0;
	} else if ((code >= 0x80 && code < 0x2121) || code > 0x8080) {
		/* Neither 7-bit ASCII nor an acceptable two-byte value */
		CK(mbfl_filt_conv_illegal_output(c, filter));
		return 0;
	}

	if (code < 0x80) {
		/* ASCII output; leave GB2312 mode first if necessary */
		if ((filter->status & 0xff00) != 0) {
			CK((*filter->output_function)('~', filter->data));
			CK((*filter->output_function)('}', filter->data));
		}
		filter->status = 0;
		if (code == 0x7E) {
			/* A literal tilde is written doubled */
			CK((*filter->output_function)('~', filter->data));
		}
		CK((*filter->output_function)(code, filter->data));
		return 0;
	}

	/* GB 2312-80 output; enter GB2312 mode first if necessary */
	if ((filter->status & 0xFF00) != 0x200) {
		CK((*filter->output_function)('~', filter->data));
		CK((*filter->output_function)('{', filter->data));
	}
	filter->status = 0x200;
	CK((*filter->output_function)((code >> 8) & 0x7F, filter->data));
	CK((*filter->output_function)(code & 0x7F, filter->data));

	return 0;
}
12376 
/*
 * Finish HZ encoding: if the output is not in ASCII mode, write the closing "~}"
 * sequence, then reset filter->status. This routine does not invoke
 * filter->flush_function. Returns 0.
 */
static int mbfl_filt_conv_any_hz_flush(mbfl_convert_filter *filter)
{
	if ((filter->status & 0xFF00) == 0) {
		/* Already in ASCII mode; nothing needs to be written */
		filter->status = 0;
		return 0;
	}

	CK((*filter->output_function)('~', filter->data));
	CK((*filter->output_function)('}', filter->data));
	filter->status = 0;
	return 0;
}
12387 
/* Mode values kept in the state word of mb_hz_to_wchar (`*state`) and
 * mb_wchar_to_hz (`buf->state`): which character set is currently active */
#define ASCII 0
#define GB2312 1
12390 
mb_hz_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)12391 static size_t mb_hz_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
12392 {
12393 	unsigned char *p = *in, *e = p + *in_len;
12394 	uint32_t *out = buf, *limit = buf + bufsize;
12395 
12396 	while (p < e && out < limit) {
12397 		unsigned char c = *p++;
12398 
12399 		if (c == '~') {
12400 			if (p == e) {
12401 				break;
12402 			}
12403 			unsigned char c2 = *p++;
12404 
12405 			if (c2 == '}' && *state == GB2312) {
12406 				*state = ASCII;
12407 			} else if (c2 == '{' && *state == ASCII) {
12408 				*state = GB2312;
12409 			} else if (c2 == '~' && *state == ASCII) {
12410 				*out++ = '~';
12411 			} else if (c2 == '\n') {
12412 				/* "~\n" is a line continuation; no output is needed, nor should we shift modes */
12413 			} else {
12414 				/* Invalid character after ~ */
12415 				*out++ = MBFL_BAD_INPUT;
12416 			}
12417 		} else if (((c > 0x20 && c <= 0x29) || (c >= 0x30 && c <= 0x77)) && p < e && *state == GB2312) {
12418 			unsigned char c2 = *p++;
12419 
12420 			if (c > 0x20 && c < 0x7F && c2 > 0x20 && c2 < 0x7F) {
12421 				unsigned int s = (c - 1)*192 + c2 + 0x40;
12422 				ZEND_ASSERT(s < cp936_ucs_table_size);
12423 
12424 				if (s == 0x1864) {
12425 					s = 0x30FB;
12426 				} else if (s == 0x186A) {
12427 					s = 0x2015;
12428 				} else if (s == 0x186C) {
12429 					s = 0x2225;
12430 				} else if ((s >= 0x1920 && s <= 0x192A) || s == 0x1963 || (s >= 0x1C60 && s <= 0x1C7F) || (s >= 0x1DBB && s <= 0x1DC4)) {
12431 					s = 0;
12432 				} else {
12433 					s = cp936_ucs_table[s];
12434 				}
12435 				if (!s)
12436 					s = MBFL_BAD_INPUT;
12437 				*out++ = s;
12438 			} else {
12439 				*out++ = MBFL_BAD_INPUT;
12440 			}
12441 		} else if (c < 0x80 && *state == ASCII) {
12442 			*out++ = c;
12443 		} else {
12444 			*out++ = MBFL_BAD_INPUT;
12445 		}
12446 	}
12447 
12448 	*in_len = e - p;
12449 	*in = p;
12450 	return out - buf;
12451 }
12452 
/*
 * Encode Unicode codepoints as HZ and append the bytes to `buf`.
 *
 * in / len: codepoints to convert and their count
 * buf:      conversion buffer; buf->state holds the current output mode
 *           (ASCII or GB2312) between calls
 * end:      when true and the output is not in ASCII mode, the closing "~}"
 *           sequence is appended after the last codepoint
 *
 * Codepoints without a usable mapping are routed through MB_CONVERT_ERROR.
 */
static void mb_wchar_to_hz(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	while (len--) {
		uint32_t w = *in++;
		unsigned int code = 0; /* stays 0 when no mapping is found */

		if (w >= ucs_a1_cp936_table_min && w < ucs_a1_cp936_table_max) {
			switch (w) {
			case 0xB7: case 0x144: case 0x148: case 0x251:
			case 0x261: case 0x2CA: case 0x2CB: case 0x2D9:
				/* Present in the CP936 table but not used for HZ */
				break;
			default:
				code = ucs_a1_cp936_table[w - ucs_a1_cp936_table_min];
				break;
			}
		} else if (w >= ucs_a2_cp936_table_min && w < ucs_a2_cp936_table_max) {
			if (w == 0x2015) {
				code = 0xA1AA;
			} else {
				/* Codepoints whose CP936 table entry is not used for HZ */
				bool excluded =
					w == 0x2010 || w == 0x2013 || w == 0x2014 || w == 0x2016 || w == 0x2025 || w == 0x2035 ||
					w == 0x2105 || w == 0x2109 || w == 0x2121 || (w >= 0x2170 && w <= 0x2179) || (w >= 0x2196 && w <= 0x2199) ||
					w == 0x2215 || w == 0x221F || w == 0x2223 || w == 0x2252 || w == 0x2266 || w == 0x2267 || w == 0x2295 ||
					(w >= 0x2550 && w <= 0x2573) || w == 0x22BF || w == 0x2609 || (w >= 0x2581 && w <= 0x258F) ||
					(w >= 0x2593 && w <= 0x2595) || w == 0x25BC || w == 0x25BD || (w >= 0x25E2 && w <= 0x25E5);
				if (!excluded) {
					code = ucs_a2_cp936_table[w - ucs_a2_cp936_table_min];
				}
			}
		} else if (w >= ucs_a3_cp936_table_min && w < ucs_a3_cp936_table_max) {
			if (w == 0x30FB) {
				code = 0xA1A4;
			} else {
				bool excluded =
					w == 0x3006 || w == 0x3007 || w == 0x3012 || w == 0x3231 || w == 0x32A3 || w >= 0x3300 ||
					(w >= 0x3018 && w <= 0x3040) || (w >= 0x309B && w <= 0x309E) || (w >= 0x30FC && w <= 0x30FE);
				if (!excluded) {
					code = ucs_a3_cp936_table[w - ucs_a3_cp936_table_min];
				}
			}
		} else if (w >= ucs_i_gb2312_table_min && w < ucs_i_gb2312_table_max) {
			code = ucs_i_gb2312_table[w - ucs_i_gb2312_table_min];
		} else if (w >= ucs_hff_cp936_table_min && w < ucs_hff_cp936_table_max) {
			if (w == 0xFF04) {
				code = 0xA1E7;
			} else if (w == 0xFF5E) {
				code = 0xA1AB;
			} else if (w >= 0xFF01 && w <= 0xFF5D) {
				code = w - 0xFF01 + 0xA3A1;
			} else if (w == 0xFFE0 || w == 0xFFE1 || w == 0xFFE3 || w == 0xFFE5) {
				code = ucs_hff_s_cp936_table[w - 0xFFE0];
			}
		}

		/* Clear the high bit of both bytes to obtain the 7-bit form written to the stream */
		code &= ~0x8080;

		if ((!code && w) || (code >= 0x80 && code < 0x2121)) {
			/* No mapping (U+0000 excepted), or a value that is neither ASCII nor a valid pair */
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_hz);
			/* Re-establish room for the codepoints still to be processed */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
			continue;
		}

		if (code < 0x80) {
			/* ASCII output; leave GB2312 mode first if necessary */
			if (buf->state != ASCII) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 3);
				out = mb_convert_buf_add2(out, '~', '}');
				buf->state = ASCII;
			}
			if (code == '~') {
				/* A literal tilde is written doubled */
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
				out = mb_convert_buf_add2(out, '~', '~');
			} else {
				out = mb_convert_buf_add(out, code);
			}
			continue;
		}

		/* GB 2312-80 output; enter GB2312 mode first if necessary */
		if (buf->state != GB2312) {
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
			out = mb_convert_buf_add2(out, '~', '{');
			buf->state = GB2312;
		} else {
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
		}
		out = mb_convert_buf_add2(out, (code >> 8) & 0x7F, code & 0x7F);
	}

	if (end && buf->state != ASCII) {
		/* Input is finished while still in GB2312 mode: close it */
		MB_CONVERT_BUF_ENSURE(buf, out, limit, 2);
		out = mb_convert_buf_add2(out, '~', '}');
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
12538 
/* Filter table pairing mbfl_no_encoding_hz with mbfl_no_encoding_wchar; it installs
 * mbfl_filt_conv_hz_wchar and mbfl_filt_conv_hz_wchar_flush.
 * NOTE(review): initializers are positional; slot meanings come from
 * struct mbfl_convert_vtbl, declared elsewhere */
static const struct mbfl_convert_vtbl vtbl_hz_wchar = {
	mbfl_no_encoding_hz,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_hz_wchar,
	mbfl_filt_conv_hz_wchar_flush,
	NULL,
};
12548 
/* Filter table pairing mbfl_no_encoding_wchar with mbfl_no_encoding_hz; it installs
 * mbfl_filt_conv_wchar_hz and the HZ-specific flush mbfl_filt_conv_any_hz_flush,
 * which writes the closing "~}" when needed.
 * NOTE(review): initializers are positional; slot meanings come from
 * struct mbfl_convert_vtbl, declared elsewhere */
static const struct mbfl_convert_vtbl vtbl_wchar_hz = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_hz,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_hz,
	mbfl_filt_conv_any_hz_flush,
	NULL,
};
12558 
/* Encoding descriptor for HZ: bundles its names, the HZ filter tables and the
 * mb_hz_to_wchar / mb_wchar_to_hz converters. The two slots that hold an alias list
 * and a length table in the Big5/CP950 descriptors are NULL here.
 * NOTE(review): initializers are positional; slot meanings come from the
 * mbfl_encoding type, declared elsewhere */
const mbfl_encoding mbfl_encoding_hz = {
	mbfl_no_encoding_hz,
	"HZ",
	"HZ-GB-2312",
	NULL,
	NULL,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_hz_wchar,
	&vtbl_wchar_hz,
	mb_hz_to_wchar,
	mb_wchar_to_hz,
	NULL
};
12572