1 /*
2  * "streamable kanji code filter and converter"
3  * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
4  *
5  * LICENSE NOTICES
6  *
7  * This file is part of "streamable kanji code filter and converter",
8  * which is distributed under the terms of GNU Lesser General Public
9  * License (version 2) as published by the Free Software Foundation.
10  *
11  * This software is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with "streamable kanji code filter and converter";
18  * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
19  * Suite 330, Boston, MA  02111-1307  USA
20  *
21  * The author of this file: Rui Hirokawa <hirokawa@php.net>
22  *
23  */
24 /*
 * The source code included in this file was separated from mbfilter_tw.c
 * by moriyoshi koizumi <moriyoshi@php.net> on 4 dec 2002.
27  *
28  */
29 
30 #include "mbfilter.h"
31 #include "mbfilter_euc_tw.h"
32 
33 #include "unicode_table_cns11643.h"
34 
/* Forward declaration: the flush handler is defined at the end of this file
 * but is referenced earlier, by the vtbl_euctw_wchar initializer. */
static int mbfl_filt_conv_euctw_wchar_flush(mbfl_convert_filter *filter);
36 
/*
 * Byte-length table for EUC-TW: 256 entries laid out as 16 rows of 16, one
 * entry per possible byte value (presumably looked up by the first byte of a
 * character -- the lookup itself is not in this file).
 *
 *   0x8E        -> 4
 *   0xA1..0xFE  -> 2
 *   all others  -> 1
 */
static const unsigned char mblen_table_euctw[] = { /* 2-byte range: 0xA1-0xFE */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1
};
55 
56 
/* NULL-terminated list of alternative names for EUC-TW; referenced from the
 * mbfl_encoding_euc_tw descriptor. */
static const char *mbfl_encoding_euc_tw_aliases[] = {"EUC_TW", "eucTW", "x-euc-tw", NULL};
58 
/*
 * Encoding descriptor for EUC-TW. It ties together the encoding identifier,
 * the "EUC-TW" name strings, the alias list, the byte-length table and the
 * two conversion vtables (EUC-TW -> wchar and wchar -> EUC-TW).
 *
 * NOTE(review): the initializer is positional; the meaning of each slot
 * (including the literal 0 and the trailing NULL) is fixed by the
 * mbfl_encoding declaration, which is not part of this file -- confirm there
 * before reordering or inserting fields.
 */
const mbfl_encoding mbfl_encoding_euc_tw = {
	mbfl_no_encoding_euc_tw,
	"EUC-TW",
	"EUC-TW",
	mbfl_encoding_euc_tw_aliases,
	mblen_table_euctw,
	0,
	&vtbl_euctw_wchar,
	&vtbl_wchar_euctw,
	NULL
};
70 
/*
 * Conversion vtable for the EUC-TW -> wchar direction. It wires up the common
 * constructor, the byte-wise decoder mbfl_filt_conv_euctw_wchar and the
 * decoder-specific flush handler (which reports truncated input).
 *
 * NOTE(review): slot order is positional and defined by struct
 * mbfl_convert_vtbl (declared elsewhere); the NULL slots are presumably
 * optional hooks -- confirm against that declaration.
 */
const struct mbfl_convert_vtbl vtbl_euctw_wchar = {
	mbfl_no_encoding_euc_tw,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_euctw_wchar,
	mbfl_filt_conv_euctw_wchar_flush,
	NULL,
};
80 
/*
 * Conversion vtable for the wchar -> EUC-TW direction. It wires up the common
 * constructor, the encoder mbfl_filt_conv_wchar_euctw and the common flush
 * handler (the encoder in this file does not read or write filter->status or
 * filter->cache, so it needs no flush logic of its own).
 *
 * NOTE(review): slot order is positional and defined by struct
 * mbfl_convert_vtbl (declared elsewhere); the NULL slots are presumably
 * optional hooks -- confirm against that declaration.
 */
const struct mbfl_convert_vtbl vtbl_wchar_euctw = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_euc_tw,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_euctw,
	mbfl_filt_conv_common_flush,
	NULL,
};
90 
/* Evaluate `statement` once; if its value is negative, return -1 from the
 * ENCLOSING function. Note the hidden early return: any state that must be
 * updated has to be written before the CK(...) call. */
#define CK(statement)	do { if ((statement) < 0) return (-1); } while (0)
92 
/*
 * Returns nonzero when `b` is accepted as the first byte of a two-byte
 * (CNS 11643 plane 1) character: 0xA1..0xA6, 0xC2, or 0xC4..0xFD.
 */
static int euctw_plane1_lead_ok(int b)
{
	return (b >= 0xA1 && b <= 0xA6) || b == 0xC2 || (b >= 0xC4 && b <= 0xFD);
}

/*
 * Decode EUC-TW, one input byte per call.
 *
 * `c` is the next input byte; `filter` carries the decoder state between
 * calls (filter->status = position inside the current character,
 * filter->cache = bytes remembered so far). Each completed character is
 * passed to filter->output_function as the value found in the CNS 11643
 * lookup tables; anything invalid or unmapped is reported as MBFL_BAD_INPUT.
 *
 * States:
 *   0  expecting the start of a character
 *   1  expecting the 2nd byte of a 2-byte character
 *   2  saw 0x8E, expecting the plane selector byte
 *   3  expecting the 3rd byte of a 4-byte character
 *   4  expecting the 4th byte of a 4-byte character
 *
 * Returns 0 on success, -1 as soon as the output function returns a negative
 * value. State is always updated before the output function is called.
 */
int mbfl_filt_conv_euctw_wchar(int c, mbfl_convert_filter *filter)
{
	int held, idx, wc, accept;

	switch (filter->status) {
	case 0:
		if (c >= 0 && c < 0x80) {
			/* single-byte range is passed through unchanged */
			CK((*filter->output_function)(c, filter->data));
		} else if (c == 0x8E) {
			/* introducer of a 4-byte character */
			filter->status = 2;
		} else if (euctw_plane1_lead_ok(c)) {
			/* first byte of a 2-byte character: remember it */
			filter->cache = c;
			filter->status = 1;
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 1:
		held = filter->cache;
		filter->status = 0;

		if (c <= 0xA0 || c >= 0xFF) {
			/* trail byte outside 0xA1..0xFE */
			filter->cache = 0;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			break;
		}

		/* 94 cells per row, both bytes are offset by 0xA1 */
		idx = (held - 0xA1)*94 + (c - 0xA1);
		wc = 0;
		if (idx >= 0 && idx < cns11643_1_ucs_table_size) {
			wc = cns11643_1_ucs_table[idx];
		}
		if (wc <= 0) {
			/* out of table, or a hole in the table */
			wc = MBFL_BAD_INPUT;
		}
		CK((*filter->output_function)(wc, filter->data));
		break;

	case 2:
		switch (c) {
		case 0xA1:
		case 0xA2:
		case 0xAE:
			/* store the selector relative to 0xA1: 0, 1 or 13 */
			filter->cache = c - 0xA1;
			filter->status = 3;
			break;
		default:
			filter->status = filter->cache = 0;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			break;
		}
		break;

	case 3:
		held = filter->cache;

		/* Each selector accepts its own range of third bytes */
		switch (held) {
		case 0:
			accept = euctw_plane1_lead_ok(c);
			break;
		case 1:
			accept = (c >= 0xA1 && c <= 0xF2);
			break;
		case 13:
			accept = (c >= 0xA1 && c <= 0xE7);
			break;
		default:
			accept = 0;
			break;
		}

		if (accept) {
			/* selector goes in bits 8 and up, row index in the low byte */
			filter->cache = (held << 8) + c - 0xA1;
			filter->status = 4;
		} else {
			filter->status = filter->cache = 0;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		break;

	case 4:
		held = filter->cache;
		filter->status = 0;

		if (held > 0xDFF || c <= 0xA0 || c >= 0xFF) {
			filter->cache = 0;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			break;
		}

		idx = (held & 0xFF)*94 + c - 0xA1;
		wc = 0;
		if (idx >= 0) {
			/* A later version of CNS-11643 moved all the characters in "plane 14"
			 * to "plane 3", and added tens of thousands more characters in
			 * planes 4, 5, 6, and 7. Only the older version is supported here,
			 * the same as iconv from glibc 2.2.
			 * The switch value is the CNS-11643 plane number minus one. */
			switch ((held & 0xF00) >> 8) {
			case 0:
				if (idx < cns11643_1_ucs_table_size) {
					wc = cns11643_1_ucs_table[idx];
				}
				break;
			case 1:
				if (idx < cns11643_2_ucs_table_size) {
					wc = cns11643_2_ucs_table[idx];
				}
				break;
			case 13:
				if (idx < cns11643_14_ucs_table_size) {
					wc = cns11643_14_ucs_table[idx];
				}
				break;
			default:
				break;
			}
		}
		if (wc <= 0) {
			wc = MBFL_BAD_INPUT;
		}
		CK((*filter->output_function)(wc, filter->data));
		break;

	default:
		/* unknown state: resynchronise without emitting anything */
		filter->status = 0;
		break;
	}

	return 0;
}
195 
/*
 * Encode one code point as EUC-TW.
 *
 * `c` is looked up in the ucs_*_cns11643 range tables. The table value keeps
 * the CNS 11643 plane number in bits 16..20 and the two-byte code in the low
 * 16 bits. The bytes are handed to filter->output_function one at a time:
 *
 *   value < 0x80 (plane <= 1)  -> 1 byte, as is
 *   plane <= 1                 -> 2 bytes: code | 0x8080
 *   any other plane            -> 4 bytes: 0x8E, 0xA0 + plane, code | 0x8080
 *
 * A code point with no mapping (table value <= 0, or outside every table) is
 * passed to mbfl_filt_conv_illegal_output(), except for c == 0, which is
 * emitted as a single zero byte.
 *
 * Returns 0 on success, -1 as soon as an output call returns a negative value.
 */
int mbfl_filt_conv_wchar_euctw(int c, mbfl_convert_filter *filter)
{
	int s = 0;
	int plane;

	if (c >= ucs_a1_cns11643_table_min && c < ucs_a1_cns11643_table_max) {
		s = ucs_a1_cns11643_table[c - ucs_a1_cns11643_table_min];
	} else if (c >= ucs_a2_cns11643_table_min && c < ucs_a2_cns11643_table_max) {
		s = ucs_a2_cns11643_table[c - ucs_a2_cns11643_table_min];
	} else if (c >= ucs_a3_cns11643_table_min && c < ucs_a3_cns11643_table_max) {
		s = ucs_a3_cns11643_table[c - ucs_a3_cns11643_table_min];
	} else if (c >= ucs_i_cns11643_table_min && c < ucs_i_cns11643_table_max) {
		s = ucs_i_cns11643_table[c - ucs_i_cns11643_table_min];
	} else if (c >= ucs_r_cns11643_table_min && c < ucs_r_cns11643_table_max) {
		s = ucs_r_cns11643_table[c - ucs_r_cns11643_table_min];
	}

	if (s <= 0) {
		if (c != 0) {
			/* no EUC-TW representation for this code point */
			CK(mbfl_filt_conv_illegal_output(c, filter));
			return 0;
		}
		/* U+0000 is the only input that legitimately encodes as 0 */
		s = 0;
	}

	plane = (s & 0x1F0000) >> 16;
	if (plane <= 1) {
		if (s < 0x80) { /* latin */
			CK((*filter->output_function)(s, filter->data));
		} else {
			s = (s & 0xFFFF) | 0x8080;
			CK((*filter->output_function)((s >> 8) & 0xFF, filter->data));
			CK((*filter->output_function)(s & 0xFF, filter->data));
		}
	} else {
		/* The second byte is 0xA0 + plane. It is computed directly instead of
		 * packing 0x8EA00000 into a (signed) int and shifting it back out:
		 * that constant does not fit in int, so both the store and the right
		 * shift of the resulting negative value were implementation-defined. */
		int code = (s & 0xFFFF) | 0x8080;

		CK((*filter->output_function)(0x8e, filter->data));
		CK((*filter->output_function)((0xA0 + plane) & 0xFF, filter->data));
		CK((*filter->output_function)((code >> 8) & 0xFF, filter->data));
		CK((*filter->output_function)(code & 0xFF, filter->data));
	}

	return 0;
}
242 
/*
 * End-of-input handler for the EUC-TW decoder.
 *
 * A non-zero filter->status means the input stopped in the middle of a
 * 2-byte or 4-byte character: the state is reset and MBFL_BAD_INPUT is
 * emitted for the truncated character. Afterwards the flush is forwarded to
 * filter->flush_function when one is set (its return value is not examined).
 *
 * Returns 0, or -1 if the output function returned a negative value.
 */
static int mbfl_filt_conv_euctw_wchar_flush(mbfl_convert_filter *filter)
{
	const int truncated = (filter->status != 0);

	if (truncated) {
		/* reset first: CK() returns early when the output call fails */
		filter->status = 0;
		CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
	}

	if (filter->flush_function) {
		(*filter->flush_function)(filter->data);
	}

	return 0;
}
257