1 /*
2  * "streamable kanji code filter and converter"
3  * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
4  *
5  * LICENSE NOTICES
6  *
7  * This file is part of "streamable kanji code filter and converter",
8  * which is distributed under the terms of GNU Lesser General Public
9  * License (version 2) as published by the Free Software Foundation.
10  *
11  * This software is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with "streamable kanji code filter and converter";
18  * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
19  * Suite 330, Boston, MA  02111-1307  USA
20  *
21  * The author of this file:
22  *
23  */
24 /*
25  * the source code included in this files was separated from mbfilter_cp936.c
26  * by rui hirokawa <hirokawa@php.net> on 11 Aug 2011.
27  *
28  */
29 
30 #include "mbfilter.h"
31 #include "mbfilter_gb18030.h"
32 
33 #include "unicode_table_cp936.h"
34 #include "unicode_table_gb18030.h"
35 
/* Forward declaration: the identify callback is defined at the end of this
 * file and is referenced by vtbl_identify_gb18030 before that point. */
36 static int mbfl_filt_ident_gb18030(int c, mbfl_identify_filter *filter);
37 
/* NULL-terminated alias list referenced by mbfl_encoding_gb18030. */
38 static const char *mbfl_encoding_gb18030_aliases[] = {"gb-18030", "gb-18030-2000", NULL};
39 
/*
 * Encoding descriptor for GB18030.  In order it holds: the encoding id, the
 * string "GB18030" twice (presumably the canonical name and the MIME name --
 * confirm against the mbfl_encoding declaration), a pointer to the alias
 * list, a NULL slot, the MBCS | GL_UNSAFE type flags, and pointers to the two
 * conversion vtables defined in this file.
 */
40 const mbfl_encoding mbfl_encoding_gb18030 = {
41 	mbfl_no_encoding_gb18030,
42 	"GB18030",
43 	"GB18030",
44 	(const char *(*)[])&mbfl_encoding_gb18030_aliases,
45 	NULL,
46 	MBFL_ENCTYPE_MBCS | MBFL_ENCTYPE_GL_UNSAFE,
47 	&vtbl_gb18030_wchar,
48 	&vtbl_wchar_gb18030
49 };
50 
/*
 * Identify vtable for GB18030: the encoding id followed by
 * mbfl_filt_ident_common_ctor and the per-byte checker
 * mbfl_filt_ident_gb18030 from this file.
 */
51 const struct mbfl_identify_vtbl vtbl_identify_gb18030 = {
52 	mbfl_no_encoding_gb18030,
53 	mbfl_filt_ident_common_ctor,
54 	mbfl_filt_ident_gb18030
55 };
56 
/*
 * Conversion vtable for the GB18030 -> wchar direction.  Slots in order:
 * source encoding id, destination encoding id, mbfl_filt_conv_common_ctor,
 * NULL, the per-byte decoder mbfl_filt_conv_gb18030_wchar,
 * mbfl_filt_conv_common_flush, NULL.  (The meaning of the two NULL slots is
 * defined by struct mbfl_convert_vtbl, which is not visible here.)
 */
57 const struct mbfl_convert_vtbl vtbl_gb18030_wchar = {
58 	mbfl_no_encoding_gb18030,
59 	mbfl_no_encoding_wchar,
60 	mbfl_filt_conv_common_ctor,
61 	NULL,
62 	mbfl_filt_conv_gb18030_wchar,
63 	mbfl_filt_conv_common_flush,
64 	NULL,
65 };
66 
/*
 * Conversion vtable for the wchar -> GB18030 direction.  Slots in order:
 * source encoding id, destination encoding id, mbfl_filt_conv_common_ctor,
 * NULL, the per-character encoder mbfl_filt_conv_wchar_gb18030,
 * mbfl_filt_conv_common_flush, NULL.  (The meaning of the two NULL slots is
 * defined by struct mbfl_convert_vtbl, which is not visible here.)
 */
67 const struct mbfl_convert_vtbl vtbl_wchar_gb18030 = {
68 	mbfl_no_encoding_wchar,
69 	mbfl_no_encoding_gb18030,
70 	mbfl_filt_conv_common_ctor,
71 	NULL,
72 	mbfl_filt_conv_wchar_gb18030,
73 	mbfl_filt_conv_common_flush,
74 	NULL,
75 };
76 
/*
 * Evaluate `statement` once; if its value is negative, return -1 from the
 * ENCLOSING function.  Note the hidden control flow: code after a CK(...)
 * only runs when the wrapped call reported success.
 */
77 #define CK(statement)	do { if ((statement) < 0) return (-1); } while (0)
78 
79 
/*
 * Find the range that contains w in a table of n inclusive [low, high] pairs.
 *
 * The pairs are stored back to back (tbl[2*i] is the low bound of range i,
 * tbl[2*i + 1] its high bound) and must be sorted in ascending order without
 * overlap.
 *
 * Returns the index of the range containing w, or -1 when w belongs to no
 * range: it falls in a gap between two ranges, lies below the first range,
 * lies above the last range, or the table is empty (n <= 0).
 */
int
mbfl_bisec_srch(int w, const unsigned short *tbl, int n)
{
	int lo = 0, hi = n - 1;

	while (lo <= hi) {
		int mid = lo + ((hi - lo) >> 1);

		if (w < tbl[2*mid]) {
			hi = mid - 1;
		} else if (w > tbl[2*mid + 1]) {
			lo = mid + 1;
		} else {
			/* tbl[2*mid] <= w <= tbl[2*mid + 1] */
			return mid;
		}
	}
	return -1;
}
97 
/*
 * Exact-match binary search for w in tbl[0..n-1], which must be sorted in
 * ascending order.
 *
 * Returns the index of an element equal to w, or -1 when there is none.
 * An empty table (n <= 0) yields -1 without touching tbl.
 */
int
mbfl_bisec_srch2(int w, const unsigned short tbl[], int n)
{
	int mid, below = 0, above = n;

	/* Guard first: the tbl[0] probe below would read past an empty table. */
	if (n <= 0) {
		return -1;
	}

	/* Index 0 is checked up front because the loop treats `below` as an
	 * exclusive lower bound and therefore never probes it. */
	if (w == tbl[0]) {
		return 0;
	}

	/* Invariant: any match lies strictly between `below` and `above`. */
	while (above - below > 1) {
		mid = (below + above) >> 1;
		if (w < tbl[mid]) {
			above = mid;
		} else if (w > tbl[mid]) {
			below = mid;
		} else {
			return mid;
		}
	}
	return -1;
}
119 
120 /*
121  * GB18030 => wchar
122  */
123 int
mbfl_filt_conv_gb18030_wchar(int c,mbfl_convert_filter * filter)124 mbfl_filt_conv_gb18030_wchar(int c, mbfl_convert_filter *filter)
125 {
126 	int k;
127 	int c1, c2, c3, w = -1;
128 
129 	switch (filter->status) {
130 	case 0:
131 		if (c >= 0 && c < 0x80) {	/* latin */
132 			CK((*filter->output_function)(c, filter->data));
133 		} else if (c == 0x80) {	/* euro sign */
134 			CK((*filter->output_function)(0x20ac, filter->data));
135 		} else if (c == 0xff) {
136 			CK((*filter->output_function)(0x00ff, filter->data));
137 		} else if (c > 0x80 && c < 0xff) {	/* dbcs/qbcs lead byte */
138 			filter->status = 1;
139 			filter->cache = c;
140 		} else {
141 			w = c & MBFL_WCSGROUP_MASK;
142 			w |= MBFL_WCSGROUP_THROUGH;
143 			CK((*filter->output_function)(w, filter->data));
144 		}
145 		break;
146 
147 	case 1:		/* dbcs/qbcs second byte */
148 		c1 = filter->cache;
149 		filter->status = 0;
150 
151 		if (c1 >= 0x81 && c1 <= 0x84 && c >= 0x30 && c <= 0x39) { /* 4 byte range: Unicode BMP */
152 			filter->status = 2;
153 			filter->cache = (c1 << 8) | c;
154 			return c;
155 		} else if (c1 >= 0x90 && c1 <= 0xe3 && c >= 0x30 && c <= 0x39) {
156 			/* 4 byte range: Unicode 16 planes */
157 			filter->status = 2;
158 			filter->cache = (c1 << 8) | c;
159 			return c;
160 		} else if (((c1 >= 0xaa && c1 <= 0xaf) || (c1 >= 0xf8 && c1 <= 0xfe)) &&
161 				   (c >= 0xa1 && c <= 0xfe)) { /* UDA part1,2: U+E000-U+E4C5 */
162 			w = 94*(c1 >= 0xf8 ? c1 - 0xf2 : c1 - 0xaa) + (c - 0xa1) + 0xe000;
163 			CK((*filter->output_function)(w, filter->data));
164 		} else if (c1 >= 0xa1 && c1 <= 0xa7 && c >= 0x40 && c < 0xa1 && c != 0x7f) {
165 			/* UDA part3 : U+E4C6-U+E765*/
166 			w = 96*(c1 - 0xa1) + c - (c >= 0x80 ? 0x41 : 0x40) + 0xe4c6;
167 			CK((*filter->output_function)(w, filter->data));
168 		}
169 
170 		c2 = (c1 << 8) | c;
171 
172 		if (w <= 0 &&
173 			((c2 >= 0xa2ab && c2 <= 0xa9f0 + (0xe80f-0xe801)) ||
174 			 (c2 >= 0xd7fa && c2 <= 0xd7fa + (0xe814-0xe810)) ||
175 			 (c2 >= 0xfe50 && c2 <= 0xfe80 + (0xe864-0xe844)))) {
176 			for (k = 0; k < mbfl_gb18030_pua_tbl_max; k++) {
177 				if (c2 >= mbfl_gb18030_pua_tbl[k][2] &&
178 					c2 <= mbfl_gb18030_pua_tbl[k][2] +  mbfl_gb18030_pua_tbl[k][1]
179 					-  mbfl_gb18030_pua_tbl[k][0]) {
180 					w = c2 -  mbfl_gb18030_pua_tbl[k][2] + mbfl_gb18030_pua_tbl[k][0];
181 					CK((*filter->output_function)(w, filter->data));
182 					break;
183 				}
184 			}
185 		}
186 
187 		if (w <= 0) {
188 			if ((c1 >= 0xa1 && c1 <= 0xa9 && c >= 0xa1 && c <= 0xfe) ||
189 				(c1 >= 0xb0 && c1 <= 0xf7 && c >= 0xa1 && c <= 0xfe) ||
190 				(c1 >= 0x81 && c1 <= 0xa0 && c >= 0x40 && c <= 0xfe && c != 0x7f) ||
191 				(c1 >= 0xaa && c1 <= 0xfe && c >= 0x40 && c <= 0xa0 && c != 0x7f) ||
192 				(c1 >= 0xa8 && c1 <= 0xa9 && c >= 0x40 && c <= 0xa0 && c != 0x7f)) {
193 				w = (c1 - 0x81)*192 + (c - 0x40);
194 				if (w >= 0 && w < cp936_ucs_table_size) {
195 					w = cp936_ucs_table[w];
196 				} else {
197 					w = 0;
198 				}
199 				if (w <= 0) {
200 					w = (c1 << 8) | c;
201 					w &= MBFL_WCSPLANE_MASK;
202 					w |= MBFL_WCSPLANE_GB18030;
203 				}
204 				CK((*filter->output_function)(w, filter->data));
205 			} else if ((c >= 0 && c < 0x21) || c == 0x7f) {		/* CTLs */
206 				CK((*filter->output_function)(c, filter->data));
207 			} else {
208 				w = (c1 << 8) | c;
209 				w &= MBFL_WCSGROUP_MASK;
210 				w |= MBFL_WCSGROUP_THROUGH;
211 				CK((*filter->output_function)(w, filter->data));
212 			}
213 		}
214 		break;
215 	case 2: /* qbcs third byte */
216 		c1 = (filter->cache >> 8) & 0xff;
217 		c2 = filter->cache & 0xff;
218 		filter->status = 0;
219 		filter->cache = 0;
220 		if (((c1 >= 0x81 && c1 <= 0x84) || (c1 >= 0x90 && c1 <= 0xe3)) &&
221 			c2 >= 0x30 && c2 <= 0x39 && c >= 0x81 && c <= 0xfe) {
222 			filter->cache = (c1 << 16) | (c2 << 8) | c;
223 			filter->status = 3;
224 		} else {
225 			w = (c1 << 16) | (c2 << 8) | c;
226 			w &= MBFL_WCSGROUP_MASK;
227 			w |= MBFL_WCSGROUP_THROUGH;
228 			CK((*filter->output_function)(w, filter->data));
229 		}
230  		break;
231 
232 	case 3: /* qbcs fourth byte */
233 		c1 = (filter->cache >> 16) & 0xff;
234 		c2 = (filter->cache >> 8) & 0xff;
235 		c3 = filter->cache & 0xff;
236 		filter->status = 0;
237 		filter->cache = 0;
238 		if (((c1 >= 0x81 && c1 <= 0x84) || (c1 >= 0x90 && c1 <= 0xe3)) &&
239 			c2 >= 0x30 && c2 <= 0x39 && c3 >= 0x81 && c3 <= 0xfe && c >= 0x30 && c <= 0x39) {
240 			if (c1 >= 0x90 && c1 <= 0xe3) {
241 				w = ((((c1 - 0x90)*10 + (c2 - 0x30))*126 + (c3 - 0x81)))*10 + (c - 0x30) + 0x10000;
242 			} else { /* Unicode BMP */
243 				w = (((c1 - 0x81)*10 + (c2 - 0x30))*126 + (c3 - 0x81))*10 + (c - 0x30);
244 				if (w >= 0 && w <= 39419) {
245 					k = mbfl_bisec_srch(w, mbfl_gb2uni_tbl, mbfl_gb_uni_max);
246 					if (k<0) {
247 						/* error */
248 						w = (c1 << 24) | (c2 << 16) | (c3 << 8) | c;
249 						w &= MBFL_WCSGROUP_MASK;
250 						w |= MBFL_WCSGROUP_THROUGH;
251 						CK((*filter->output_function)(w, filter->data));
252 						return c;
253 					}
254 					w += mbfl_gb_uni_ofst[k];
255 				} else {
256 					w = (c1 << 24) | (c2 << 16) | (c3 << 8) | c;
257 					w &= MBFL_WCSGROUP_MASK;
258 					w |= MBFL_WCSGROUP_THROUGH;
259 					CK((*filter->output_function)(w, filter->data));
260 					return c;
261 				}
262 			}
263 			CK((*filter->output_function)(w, filter->data));
264 		} else {
265 			w = (c1 << 24) | (c2 << 16) | (c3 << 8) | c;
266 			w &= MBFL_WCSGROUP_MASK;
267 			w |= MBFL_WCSGROUP_THROUGH;
268 			CK((*filter->output_function)(w, filter->data));
269 		}
270  		break;
271 
272 	default:
273 		filter->status = 0;
274 		break;
275 	}
276 
277 	return c;
278 }
279 
280 /*
281  * wchar => GB18030
282  */
283 int
mbfl_filt_conv_wchar_gb18030(int c,mbfl_convert_filter * filter)284 mbfl_filt_conv_wchar_gb18030(int c, mbfl_convert_filter *filter)
285 {
286 	int k, k1, k2;
287 	int c1, s = 0, s1 = 0;
288 
289 	if (c >= ucs_a1_cp936_table_min && c < ucs_a1_cp936_table_max) {
290 		s = ucs_a1_cp936_table[c - ucs_a1_cp936_table_min];
291 	} else if (c >= ucs_a2_cp936_table_min && c < ucs_a2_cp936_table_max) {
292 		s = ucs_a2_cp936_table[c - ucs_a2_cp936_table_min];
293 	} else if (c >= ucs_a3_cp936_table_min && c < ucs_a3_cp936_table_max) {
294 		s = ucs_a3_cp936_table[c - ucs_a3_cp936_table_min];
295 	} else if (c >= ucs_i_cp936_table_min && c < ucs_i_cp936_table_max) {
296 		s = ucs_i_cp936_table[c - ucs_i_cp936_table_min];
297 	} else if (c >= ucs_ci_cp936_table_min && c < ucs_ci_cp936_table_max) {
298 		/* U+F900-FA2F CJK Compatibility Ideographs */
299 		if (c == 0xf92c) {
300 			s = 0xfd9c;
301 		} else if (c == 0xf979) {
302 			s = 0xfd9d;
303 		} else if (c == 0xf995) {
304 			s = 0xfd9e;
305 		} else if (c == 0xf9e7) {
306 			s = 0xfd9f;
307 		} else if (c == 0xf9f1) {
308 			s = 0xfda0;
309 		} else if (c >= 0xfa0c && c <= 0xfa29) {
310 			s = ucs_ci_s_cp936_table[c - 0xfa0c];
311 		}
312 	} else if (c >= ucs_cf_cp936_table_min && c < ucs_cf_cp936_table_max) {
313 		/* FE30h CJK Compatibility Forms  */
314 		s = ucs_cf_cp936_table[c - ucs_cf_cp936_table_min];
315 	} else if (c >= ucs_sfv_cp936_table_min && c < ucs_sfv_cp936_table_max) {
316 		/* U+FE50-FE6F Small Form Variants */
317 		s = ucs_sfv_cp936_table[c - ucs_sfv_cp936_table_min];
318 	} else if (c >= ucs_hff_cp936_table_min && c < ucs_hff_cp936_table_max) {
319 		/* U+FF00-FFFF HW/FW Forms */
320 		if (c == 0xff04) {
321 			s = 0xa1e7;
322 		} else if (c == 0xff5e) {
323 			s = 0xa1ab;
324 		} else if (c >= 0xff01 && c <= 0xff5d) {
325 			s = c - 0xff01 + 0xa3a1;
326 		} else if (c >= 0xffe0 && c <= 0xffe5) {
327 			s = ucs_hff_s_cp936_table[c-0xffe0];
328 		}
329 	}
330 
331 	if (c == 0x20ac) { /* euro-sign */
332 		s = 0xa2e3;
333 	}
334 
335 	if (s <= 0 && c >= mbfl_gb18030_c_tbl_key[0] &&
336 		c <= mbfl_gb18030_c_tbl_key[mbfl_gb18030_c_tbl_max-1]) {
337 		k1 = mbfl_bisec_srch2(c, mbfl_gb18030_c_tbl_key, mbfl_gb18030_c_tbl_max);
338 		if (k1 >= 0) {
339 			s = mbfl_gb18030_c_tbl_val[k1];
340 		}
341 	}
342 
343 	if (c >= 0xe000 && c <= 0xe864) { /* PUA */
344 		if (c < 0xe766) {
345 			if (c < 0xe4c6) {
346 				c1 = c - 0xe000;
347 				s = (c1 % 94) + 0xa1; c1 /= 94;
348 				s |= (c1 < 0x06 ? c1 + 0xaa : c1 + 0xf2) << 8;
349 			} else {
350 				c1 = c - 0xe4c6;
351 				s = ((c1 / 96) + 0xa1) << 8; c1 %= 96;
352 				s |= c1 + (c1 >= 0x3f ? 0x41 : 0x40);
353 			}
354 		} else {
355 			/* U+E766..U+E864 */
356 			k1 = 0; k2 = mbfl_gb18030_pua_tbl_max;
357 			while (k1 < k2) {
358 				k = (k1 + k2) >> 1;
359 				if (c < mbfl_gb18030_pua_tbl[k][0]) {
360 					k2 = k;
361 				} else if (c > mbfl_gb18030_pua_tbl[k][1]) {
362 					k1 = k + 1;
363 				} else {
364 					s = c - mbfl_gb18030_pua_tbl[k][0] + mbfl_gb18030_pua_tbl[k][2];
365 					break;
366 				}
367 			}
368 		}
369 	}
370 
371 	if (s <= 0 && c >= 0x0080 && c <= 0xffff) { /* BMP */
372 		s = mbfl_bisec_srch(c, mbfl_uni2gb_tbl, mbfl_gb_uni_max);
373 		if (s >= 0) {
374 			c1 = c - mbfl_gb_uni_ofst[s];
375 			s = (c1 % 10) + 0x30; c1 /= 10;
376 			s |= ((c1 % 126) + 0x81) << 8; c1 /= 126;
377 			s |= ((c1 % 10) + 0x30) << 16; c1 /= 10;
378 			s1 = c1 + 0x81;
379 		}
380 	} else if (c >= 0x10000 && c <= 0x10ffff) { /* Code set 3: Unicode U+10000..U+10FFFF */
381 		c1 = c - 0x10000;
382 		s = (c1 % 10) + 0x30; c1 /= 10;
383 		s |= ((c1 % 126) + 0x81) << 8; c1 /= 126;
384 		s |= ((c1 % 10) + 0x30) << 16; c1 /= 10;
385 		s1 = c1 + 0x90;
386 	}
387 
388 	if (s <= 0) {
389 		c1 = c & ~MBFL_WCSPLANE_MASK;
390 		if (c1 == MBFL_WCSPLANE_WINCP936) {
391 			s = c & MBFL_WCSPLANE_MASK;
392 		}
393 		if (c == 0) {
394 			s = 0;
395 		} else if (s <= 0) {
396 			s = -1;
397 		}
398 	}
399 	if (s >= 0) {
400 		if (s <= 0x80) {	/* latin */
401 			CK((*filter->output_function)(s, filter->data));
402 		} else if (s1 > 0) { /* qbcs */
403 			CK((*filter->output_function)(s1 & 0xff, filter->data));
404 			CK((*filter->output_function)((s >> 16) & 0xff, filter->data));
405 			CK((*filter->output_function)((s >> 8) & 0xff, filter->data));
406 			CK((*filter->output_function)(s & 0xff, filter->data));
407 		} else { /* dbcs */
408 			CK((*filter->output_function)((s >> 8) & 0xff, filter->data));
409 			CK((*filter->output_function)(s & 0xff, filter->data));
410 		}
411 	} else {
412 		CK(mbfl_filt_conv_illegal_output(c, filter));
413 	}
414 
415 	return c;
416 }
417 
/*
 * Identify filter for GB18030: called once per input byte `c`.
 *
 * filter->status packs two things: the low 8 bits are the position inside
 * the current character (0 = start, 1/2/3 = waiting for byte 2/3/4) and the
 * next 8 bits hold the lead byte while waiting for byte 2.  filter->flag is
 * set to 1 as soon as a byte is seen that cannot continue a GB18030
 * sequence.
 *
 * Returns c.
 */
static int mbfl_filt_ident_gb18030(int c, mbfl_identify_filter *filter)
{
	int c1;

	/* unpack the saved lead byte, then keep only the position in status */
	c1 = (filter->status >> 8) & 0xff;
	filter->status &= 0xff;

	if (filter->status == 0) {
		if (c <= 0x80 || c == 0xff) {
			/* single-byte value: nothing to remember */
			filter->status = 0;
		} else {
			filter->status = 1;
			filter->status |= (c << 8);
		}
	} else if (filter->status == 1) { /* dbcs/qbcs 2nd byte */
		if (((c1 >= 0x81 && c1 <= 0x84) || (c1 >= 0x90 && c1 <= 0xe3)) && c >= 0x30 && c <= 0x39) { /* qbcs */
			filter->status = 2;
		} else if (((c1 >= 0xaa && c1 <= 0xaf) || (c1 >= 0xf8 && c1 <= 0xfe)) && (c >= 0xa1 && c <= 0xfe)) {
			filter->status = 0; /* UDA part 1,2 */
		} else if (c1 >= 0xa1 && c1 <= 0xa7 && c >= 0x40 && c < 0xa1 && c != 0x7f) {
			filter->status = 0; /* UDA part 3 */
		} else if ((c1 >= 0xa1 && c1 <= 0xa9 && c >= 0xa1 && c <= 0xfe) ||
				   (c1 >= 0xb0 && c1 <= 0xf7 && c >= 0xa1 && c <= 0xfe) ||
				   (c1 >= 0x81 && c1 <= 0xa0 && c >= 0x40 && c <= 0xfe && c != 0x7f) ||
				   (c1 >= 0xaa && c1 <= 0xfe && c >= 0x40 && c <= 0xa0 && c != 0x7f) ||
				   (c1 >= 0xa8 && c1 <= 0xa9 && c >= 0x40 && c <= 0xa0 && c != 0x7f)) {
			filter->status = 0; /* DBCS */
		} else {
			filter->flag = 1; /* bad */
			filter->status = 0;
		}
	} else if (filter->status == 2) { /* qbcs 3rd byte */
		if (c > 0x80 && c < 0xff) {
			filter->status = 3;
		} else {
			filter->flag = 1; /* bad */
			filter->status = 0;
		}
	} else if (filter->status == 3) { /* qbcs 4th byte */
		/* The 4th byte must be a digit byte 0x30-0x39, the same range that is
		 * required of the 2nd byte above; 0x3a-0x3f are not valid here. */
		if (c >= 0x30 && c <= 0x39) {
			filter->status = 0;
		} else {
			filter->flag = 1; /* bad */
			filter->status = 0;
		}
	} else {							/* bad */
		filter->flag = 1;
	}

	return c;
}
469