1 /*
2 * "streamable kanji code filter and converter"
3 * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
4 *
5 * LICENSE NOTICES
6 *
7 * This file is part of "streamable kanji code filter and converter",
8 * which is distributed under the terms of GNU Lesser General Public
9 * License (version 2) as published by the Free Software Foundation.
10 *
11 * This software is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with "streamable kanji code filter and converter";
18 * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
19 * Suite 330, Boston, MA 02111-1307 USA
20 *
21 * The author of this file: Rui Hirokawa <hirokawa@php.net>
22 *
23 */
24 /*
 * The source code included in this file was separated from mbfilter_tw.c
26 * by moriyoshi koizumi <moriyoshi@php.net> on 4 dec 2002.
27 *
28 */
29
30 #include "mbfilter.h"
31 #include "mbfilter_euc_tw.h"
32
33 #include "unicode_table_cns11643.h"
34
35 static int mbfl_filt_ident_euctw(int c, mbfl_identify_filter *filter);
36
/*
 * Sequence length for each possible first byte (256 entries, indexed by the
 * byte value 0x00-0xFF):
 *   0x8E        -> 4 (start of a four-byte sequence)
 *   0xA1-0xFE   -> 2 (start of a two-byte sequence)
 *   anything else -> 1
 */
static const unsigned char mblen_table_euctw[] = {
	/* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1,
	/* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xA0 */ 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xB0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xC0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xD0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xE0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xF0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1
};
55
56
/* Alternative names for EUC-TW; the list is terminated by NULL. */
static const char *mbfl_encoding_euc_tw_aliases[] = {
	"EUC_TW",
	"eucTW",
	"x-euc-tw",
	NULL
};
58
59 const mbfl_encoding mbfl_encoding_euc_tw = {
60 mbfl_no_encoding_euc_tw,
61 "EUC-TW",
62 "EUC-TW",
63 (const char *(*)[])&mbfl_encoding_euc_tw_aliases,
64 mblen_table_euctw,
65 MBFL_ENCTYPE_MBCS,
66 &vtbl_euctw_wchar,
67 &vtbl_wchar_euctw
68 };
69
70 const struct mbfl_identify_vtbl vtbl_identify_euctw = {
71 mbfl_no_encoding_euc_tw,
72 mbfl_filt_ident_common_ctor,
73 mbfl_filt_ident_euctw
74 };
75
76 const struct mbfl_convert_vtbl vtbl_euctw_wchar = {
77 mbfl_no_encoding_euc_tw,
78 mbfl_no_encoding_wchar,
79 mbfl_filt_conv_common_ctor,
80 NULL,
81 mbfl_filt_conv_euctw_wchar,
82 mbfl_filt_conv_common_flush,
83 NULL,
84 };
85
86 const struct mbfl_convert_vtbl vtbl_wchar_euctw = {
87 mbfl_no_encoding_wchar,
88 mbfl_no_encoding_euc_tw,
89 mbfl_filt_conv_common_ctor,
90 NULL,
91 mbfl_filt_conv_wchar_euctw,
92 mbfl_filt_conv_common_flush,
93 NULL,
94 };
95
/*
 * CK(expr): evaluate expr exactly once and, if its value is negative, make the
 * enclosing function return -1. Only usable inside functions returning int.
 */
#define CK(expr) \
	do { \
		if ((expr) < 0) \
			return (-1); \
	} while (0)
97
98 /*
99 * EUC-TW => wchar
100 */
101 int
mbfl_filt_conv_euctw_wchar(int c,mbfl_convert_filter * filter)102 mbfl_filt_conv_euctw_wchar(int c, mbfl_convert_filter *filter)
103 {
104 int c1, s, w, plane;
105
106 switch (filter->status) {
107 case 0:
108 if (c >= 0 && c < 0x80) { /* latin */
109 CK((*filter->output_function)(c, filter->data));
110 } else if (c > 0xa0 && c < 0xff) { /* dbcs first byte */
111 filter->status = 1;
112 filter->cache = c;
113 } else if (c == 0x8e) { /* mbcs first byte */
114 filter->status = 2;
115 filter->cache = c;
116 } else {
117 w = c & MBFL_WCSGROUP_MASK;
118 w |= MBFL_WCSGROUP_THROUGH;
119 CK((*filter->output_function)(w, filter->data));
120 }
121 break;
122
123 case 1: /* mbcs second byte */
124 filter->status = 0;
125 c1 = filter->cache;
126 if (c > 0xa0 && c < 0xff) {
127 w = (c1 - 0xa1)*94 + (c - 0xa1);
128 if (w >= 0 && w < cns11643_1_ucs_table_size) {
129 w = cns11643_1_ucs_table[w];
130 } else {
131 w = 0;
132 }
133 if (w <= 0) {
134 w = (c1 << 8) | c;
135 w &= MBFL_WCSPLANE_MASK;
136 w |= MBFL_WCSPLANE_CNS11643;
137 }
138 CK((*filter->output_function)(w, filter->data));
139 } else if ((c >= 0 && c < 0x21) || c == 0x7f) { /* CTLs */
140 CK((*filter->output_function)(c, filter->data));
141 } else {
142 w = (c1 << 8) | c;
143 w &= MBFL_WCSGROUP_MASK;
144 w |= MBFL_WCSGROUP_THROUGH;
145 CK((*filter->output_function)(w, filter->data));
146 }
147 break;
148
149 case 2: /* got 0x8e, first char */
150 c1 = filter->cache;
151 if ((c >= 0 && c < 0x21) || c == 0x7f) { /* CTLs */
152 CK((*filter->output_function)(c, filter->data));
153 filter->status = 0;
154 } else if (c > 0xa0 && c < 0xaf) {
155 filter->status = 3;
156 filter->cache = c - 0xa1;
157 } else {
158 w = (c1 << 8) | c;
159 w &= MBFL_WCSGROUP_MASK;
160 w |= MBFL_WCSGROUP_THROUGH;
161 CK((*filter->output_function)(w, filter->data));
162 }
163 break;
164
165 case 3: /* got 0x8e, third char */
166 filter->status = 0;
167 c1 = filter->cache;
168 if ((c >= 0 && c < 0x21) || c == 0x7f) { /* CTLs */
169 CK((*filter->output_function)(c, filter->data));
170 filter->status = 0;
171 } else if (c > 0xa0 && c < 0xff) {
172 filter->status = 4;
173 filter->cache = (c1 << 8) + c - 0xa1;
174 } else {
175 w = (c1 << 8) | c;
176 w &= MBFL_WCSGROUP_MASK;
177 w |= MBFL_WCSGROUP_THROUGH;
178 CK((*filter->output_function)(w, filter->data));
179 }
180 break;
181
182 case 4: /* mbcs fourth char */
183 filter->status = 0;
184 c1 = filter->cache;
185 if (c1 >= 0x100 && c1 <= 0xdff && c > 0xa0 && c < 0xff) {
186 plane = (c1 & 0xf00) >> 8;
187 s = (c1 & 0xff)*94 + c - 0xa1;
188 w = 0;
189 if (s >= 0) {
190 if (plane == 1 && s < cns11643_2_ucs_table_size) {
191 w = cns11643_2_ucs_table[s];
192 }
193 if (plane == 13 && s < cns11643_14_ucs_table_size) {
194 w = cns11643_14_ucs_table[s];
195 }
196 }
197 if (w <= 0) {
198 w = ((c1 & 0x7f) << 8) | (c & 0x7f);
199 w &= MBFL_WCSPLANE_MASK;
200 w |= MBFL_WCSPLANE_CNS11643;
201 }
202 CK((*filter->output_function)(w, filter->data));
203 } else if ((c >= 0 && c < 0x21) || c == 0x7f) { /* CTLs */
204 CK((*filter->output_function)(c, filter->data));
205 } else {
206 w = (c1 << 8) | c | 0x8e0000;
207 w &= MBFL_WCSGROUP_MASK;
208 w |= MBFL_WCSGROUP_THROUGH;
209 CK((*filter->output_function)(w, filter->data));
210 }
211 break;
212
213 default:
214 filter->status = 0;
215 break;
216 }
217
218 return c;
219 }
220
221 /*
222 * wchar => EUC-TW
223 */
224 int
mbfl_filt_conv_wchar_euctw(int c,mbfl_convert_filter * filter)225 mbfl_filt_conv_wchar_euctw(int c, mbfl_convert_filter *filter)
226 {
227 int c1, s, plane;
228
229 s = 0;
230 if (c >= ucs_a1_cns11643_table_min && c < ucs_a1_cns11643_table_max) {
231 s = ucs_a1_cns11643_table[c - ucs_a1_cns11643_table_min];
232 } else if (c >= ucs_a2_cns11643_table_min && c < ucs_a2_cns11643_table_max) {
233 s = ucs_a2_cns11643_table[c - ucs_a2_cns11643_table_min];
234 } else if (c >= ucs_a3_cns11643_table_min && c < ucs_a3_cns11643_table_max) {
235 s = ucs_a3_cns11643_table[c - ucs_a3_cns11643_table_min];
236 } else if (c >= ucs_i_cns11643_table_min && c < ucs_i_cns11643_table_max) {
237 s = ucs_i_cns11643_table[c - ucs_i_cns11643_table_min];
238 } else if (c >= ucs_r_cns11643_table_min && c < ucs_r_cns11643_table_max) {
239 s = ucs_r_cns11643_table[c - ucs_r_cns11643_table_min];
240 }
241 if (s <= 0) {
242 c1 = c & ~MBFL_WCSPLANE_MASK;
243 if (c1 == MBFL_WCSPLANE_CNS11643) {
244 s = c & MBFL_WCSPLANE_MASK;
245 }
246 if (c == 0) {
247 s = 0;
248 } else if (s <= 0) {
249 s = -1;
250 }
251 }
252 if (s >= 0) {
253 plane = (s & 0x1f0000) >> 16;
254 if (plane <= 1){
255 if (s < 0x80) { /* latin */
256 CK((*filter->output_function)(s, filter->data));
257 } else {
258 s = (s & 0xffff) | 0x8080;
259 CK((*filter->output_function)((s >> 8) & 0xff, filter->data));
260 CK((*filter->output_function)(s & 0xff, filter->data));
261 }
262 } else {
263 s = (0x8ea00000 + (plane << 16)) | ((s & 0xffff) | 0x8080);
264 CK((*filter->output_function)(0x8e , filter->data));
265 CK((*filter->output_function)((s >> 16) & 0xff, filter->data));
266 CK((*filter->output_function)((s >> 8) & 0xff, filter->data));
267 CK((*filter->output_function)(s & 0xff, filter->data));
268 }
269 } else {
270 CK(mbfl_filt_conv_illegal_output(c, filter));
271 }
272 return c;
273 }
274
/*
 * EUC-TW identification filter.
 *
 * Fed one byte at a time; sets filter->flag to 1 when the byte cannot occur
 * at the current position of an EUC-TW stream. filter->status tracks the
 * position: 0 = between characters, 1 = inside a two-byte character,
 * 2/3/4 = expecting the 2nd/3rd/4th byte of a 0x8E-led four-byte character.
 *
 * Always returns c.
 */
static int mbfl_filt_ident_euctw(int c, mbfl_identify_filter *filter)
{
	/* 0xA1-0xFE: valid two-byte lead, and valid trail byte everywhere else */
	const int in_high_range = (c >= 0xa1 && c <= 0xfe);

	switch (filter->status) {
	case 0:	/* between characters */
		if (c == 0x8e) {	/* four-byte lead */
			filter->status = 2;
		} else if (in_high_range) {	/* two-byte lead */
			filter->status = 1;
		} else if (c < 0 || c >= 0x80) {	/* neither ASCII nor a lead byte */
			filter->flag = 1;
		}
		break;

	case 1:	/* second byte of a two-byte character */
		if (!in_high_range) {
			filter->flag = 1;
		}
		filter->status = 0;
		break;

	case 2:	/* plane selector after 0x8e */
		if (c >= 0xa1 && c < 0xaf) {
			filter->status = 3;
		} else {
			filter->flag = 1;
		}
		break;

	case 3:	/* third byte of a four-byte character */
		if (!in_high_range) {
			filter->flag = 1;
		}
		filter->status = 4;
		break;

	case 4:	/* fourth byte of a four-byte character */
		if (!in_high_range) {
			filter->flag = 1;
		}
		filter->status = 0;
		break;

	default:
		filter->status = 0;
		break;
	}

	return c;
}
326