1 /*
2  * "streamable kanji code filter and converter"
3  * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
4  *
5  * LICENSE NOTICES
6  *
7  * This file is part of "streamable kanji code filter and converter",
8  * which is distributed under the terms of GNU Lesser General Public
9  * License (version 2) as published by the Free Software Foundation.
10  *
11  * This software is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with "streamable kanji code filter and converter";
18  * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
19  * Suite 330, Boston, MA  02111-1307  USA
20  *
21  * The author of this file:
22  *
23  */
/*
 * The source code included in this file was separated from mbfilter_ja.c
 * by moriyoshi koizumi <moriyoshi@php.net> on 4 dec 2002.
 *
 */
29 
30 #include "mbfilter.h"
31 #include "mbfilter_cp51932.h"
32 
33 #include "unicode_table_cp932_ext.h"
34 #include "unicode_table_jis.h"
35 #include "cp932_table.h"
36 
37 static int mbfl_filt_conv_cp51932_wchar_flush(mbfl_convert_filter *filter);
38 static size_t mb_cp51932_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
39 static void mb_wchar_to_cp51932(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
40 
/*
 * Length table indexed by a byte value (0x00-0xFF), one entry per byte.
 * An entry is 2 for 0x8E and for 0xA1-0xFE -- the same bytes the decoders
 * in this file treat as the start of a two-byte sequence -- and 1 for
 * every other byte.
 */
static const unsigned char mblen_table_eucjp[] = {
	/* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, /* 0x8E: kana prefix */
	/* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xA0 */ 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xA0 itself is not a lead byte */
	/* 0xB0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xC0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xD0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xE0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xF0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1  /* 0xFF is not a lead byte */
};
59 
/* NULL-terminated list of alias names, referenced by the encoding descriptor. */
static const char *mbfl_encoding_cp51932_aliases[] = {
	"cp51932",
	NULL
};
61 
62 const mbfl_encoding mbfl_encoding_cp51932 = {
63 	mbfl_no_encoding_cp51932,
64 	"CP51932",
65 	"CP51932",
66 	mbfl_encoding_cp51932_aliases,
67 	mblen_table_eucjp,
68 	0,
69 	&vtbl_cp51932_wchar,
70 	&vtbl_wchar_cp51932,
71 	mb_cp51932_to_wchar,
72 	mb_wchar_to_cp51932,
73 	NULL
74 };
75 
76 const struct mbfl_convert_vtbl vtbl_cp51932_wchar = {
77 	mbfl_no_encoding_cp51932,
78 	mbfl_no_encoding_wchar,
79 	mbfl_filt_conv_common_ctor,
80 	NULL,
81 	mbfl_filt_conv_cp51932_wchar,
82 	mbfl_filt_conv_cp51932_wchar_flush,
83 	NULL,
84 };
85 
86 const struct mbfl_convert_vtbl vtbl_wchar_cp51932 = {
87 	mbfl_no_encoding_wchar,
88 	mbfl_no_encoding_cp51932,
89 	mbfl_filt_conv_common_ctor,
90 	NULL,
91 	mbfl_filt_conv_wchar_cp51932,
92 	mbfl_filt_conv_common_flush,
93 	NULL,
94 };
95 
/*
 * Evaluate `statement` once; if the result is negative, return -1 from the
 * enclosing function.  Only usable inside functions returning an integer.
 */
#define CK(statement) \
	do { \
		if ((statement) < 0) { \
			return (-1); \
		} \
	} while (0)
97 
98 /*
99  * cp51932 => wchar
100  */
101 int
mbfl_filt_conv_cp51932_wchar(int c,mbfl_convert_filter * filter)102 mbfl_filt_conv_cp51932_wchar(int c, mbfl_convert_filter *filter)
103 {
104 	int c1, s, w;
105 
106 	switch (filter->status) {
107 	case 0:
108 		if (c >= 0 && c < 0x80) { /* latin */
109 			CK((*filter->output_function)(c, filter->data));
110 		} else if (c >= 0xA1 && c <= 0xFE) { /* CP932, first byte */
111 			filter->status = 1;
112 			filter->cache = c;
113 		} else if (c == 0x8e) { /* kana first char */
114 			filter->status = 2;
115 		} else {
116 			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
117 		}
118 		break;
119 
120 	case 1:	/* got first half */
121 		filter->status = 0;
122 		c1 = filter->cache;
123 		if (c > 0xa0 && c < 0xff) {
124 			w = 0;
125 			s = (c1 - 0xa1)*94 + c - 0xa1;
126 			if (s <= 137) {
127 				if (s == 31) {
128 					w = 0xff3c;			/* FULLWIDTH REVERSE SOLIDUS */
129 				} else if (s == 32) {
130 					w = 0xff5e;			/* FULLWIDTH TILDE */
131 				} else if (s == 33) {
132 					w = 0x2225;			/* PARALLEL TO */
133 				} else if (s == 60) {
134 					w = 0xff0d;			/* FULLWIDTH HYPHEN-MINUS */
135 				} else if (s == 80) {
136 					w = 0xffe0;			/* FULLWIDTH CENT SIGN */
137 				} else if (s == 81) {
138 					w = 0xffe1;			/* FULLWIDTH POUND SIGN */
139 				} else if (s == 137) {
140 					w = 0xffe2;			/* FULLWIDTH NOT SIGN */
141 				}
142 			}
143 			if (w == 0) {
144 				if (s >= cp932ext1_ucs_table_min && s < cp932ext1_ucs_table_max) {		/* vendor ext1 (13ku) */
145 					w = cp932ext1_ucs_table[s - cp932ext1_ucs_table_min];
146 				} else if (s >= 0 && s < jisx0208_ucs_table_size) {		/* X 0208 */
147 					w = jisx0208_ucs_table[s];
148 				} else if (s >= cp932ext2_ucs_table_min && s < cp932ext2_ucs_table_max) {		/* vendor ext2 (89ku - 92ku) */
149 					w = cp932ext2_ucs_table[s - cp932ext2_ucs_table_min];
150 				}
151 			}
152 			if (w <= 0) {
153 				w = MBFL_BAD_INPUT;
154 			}
155 			CK((*filter->output_function)(w, filter->data));
156 		} else {
157 			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
158 		}
159 		break;
160 
161 	case 2:	/* got 0x8e, X0201 kana */
162 		filter->status = 0;
163 		if (c > 0xa0 && c < 0xe0) {
164 			w = 0xfec0 + c;
165 			CK((*filter->output_function)(w, filter->data));
166 		} else {
167 			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
168 		}
169 		break;
170 
171 		EMPTY_SWITCH_DEFAULT_CASE();
172 	}
173 
174 	return 0;
175 }
176 
/*
 * Flush function for the CP51932 -> wchar filter.
 *
 * A non-zero status at this point means the input stopped after the first
 * byte of a two-byte sequence, so one MBFL_BAD_INPUT is emitted and the
 * status is cleared.  The downstream flush function, if any, is then called.
 * Results of both calls are ignored; the function always returns 0.
 */
static int mbfl_filt_conv_cp51932_wchar_flush(mbfl_convert_filter *filter)
{
	if (filter->status != 0) {
		/* Input string was truncated */
		(*filter->output_function)(MBFL_BAD_INPUT, filter->data);
		filter->status = 0;
	}

	if (filter->flush_function != NULL) {
		(*filter->flush_function)(filter->data);
	}

	return 0;
}
191 
192 /*
193  * wchar => cp51932
194  */
195 int
mbfl_filt_conv_wchar_cp51932(int c,mbfl_convert_filter * filter)196 mbfl_filt_conv_wchar_cp51932(int c, mbfl_convert_filter *filter)
197 {
198 	int c1, c2, s1;
199 
200 	s1 = 0;
201 	if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
202 		s1 = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
203 	} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
204 		s1 = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
205 	} else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
206 		s1 = ucs_i_jis_table[c - ucs_i_jis_table_min];
207 	} else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
208 		s1 = ucs_r_jis_table[c - ucs_r_jis_table_min];
209 	}
210 	if (s1 >= 0x8080) s1 = -1; /* we don't support JIS X0213 */
211 	if (s1 <= 0) {
212 		if (c == 0xa5) { /* YEN SIGN */
213 			s1 = 0x216F; /* FULLWIDTH YEN SIGN */
214 		} else if (c == 0xff3c) {	/* FULLWIDTH REVERSE SOLIDUS */
215 			s1 = 0x2140;
216 		} else if (c == 0x2225) {	/* PARALLEL TO */
217 			s1 = 0x2142;
218 		} else if (c == 0xff0d) {	/* FULLWIDTH HYPHEN-MINUS */
219 			s1 = 0x215d;
220 		} else if (c == 0xffe0) {	/* FULLWIDTH CENT SIGN */
221 			s1 = 0x2171;
222 		} else if (c == 0xffe1) {	/* FULLWIDTH POUND SIGN */
223 			s1 = 0x2172;
224 		} else if (c == 0xffe2) {	/* FULLWIDTH NOT SIGN */
225 			s1 = 0x224c;
226 		} else {
227 			s1 = -1;
228 			c1 = 0;
229 			c2 = cp932ext1_ucs_table_max - cp932ext1_ucs_table_min;
230 			while (c1 < c2) {		/* CP932 vendor ext1 (13ku) */
231 				if (c == cp932ext1_ucs_table[c1]) {
232 					s1 = ((c1/94 + 0x2d) << 8) + (c1%94 + 0x21);
233 					break;
234 				}
235 				c1++;
236 			}
237 			if (s1 < 0) {
238 				c1 = 0;
239 				c2 = cp932ext2_ucs_table_max - cp932ext2_ucs_table_min;
240 				while (c1 < c2) {		/* CP932 vendor ext3 (115ku - 119ku) */
241 					if (c == cp932ext2_ucs_table[c1]) {
242 					  s1 = ((c1/94 + 0x79) << 8) +(c1%94 + 0x21);
243 					  break;
244 					}
245 					c1++;
246 				}
247 			}
248 		}
249 		if (c == 0) {
250 			s1 = 0;
251 		} else if (s1 <= 0) {
252 			s1 = -1;
253 		}
254 	}
255 
256 	if (s1 >= 0) {
257 		if (s1 < 0x80) {	/* latin */
258 			CK((*filter->output_function)(s1, filter->data));
259 		} else if (s1 < 0x100) {	/* kana */
260 			CK((*filter->output_function)(0x8e, filter->data));
261 			CK((*filter->output_function)(s1, filter->data));
262 		} else if (s1 < 0x8080)  {	/* X 0208 */
263 			CK((*filter->output_function)(((s1 >> 8) & 0xff) | 0x80, filter->data));
264 			CK((*filter->output_function)((s1 & 0xff) | 0x80, filter->data));
265 		} else {
266 		    CK(mbfl_filt_conv_illegal_output(c, filter));
267 		}
268 	} else {
269 		CK(mbfl_filt_conv_illegal_output(c, filter));
270 	}
271 
272 	return 0;
273 }
274 
mb_cp51932_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)275 static size_t mb_cp51932_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
276 {
277 	unsigned char *p = *in, *e = p + *in_len;
278 	uint32_t *out = buf, *limit = buf + bufsize;
279 
280 	while (p < e && out < limit) {
281 		unsigned char c = *p++;
282 
283 		if (c < 0x80) {
284 			*out++ = c;
285 		} else if (c >= 0xA1 && c <= 0xFE && p < e) {
286 			unsigned char c2 = *p++;
287 			if (c2 >= 0xA1 && c2 <= 0xFE) {
288 				unsigned int s = (c - 0xA1)*94 + c2 - 0xA1, w = 0;
289 
290 				if (s <= 137) {
291 					if (s == 31) {
292 						w = 0xFF3C; /* FULLWIDTH REVERSE SOLIDUS */
293 					} else if (s == 32) {
294 						w = 0xFF5E; /* FULLWIDTH TILDE */
295 					} else if (s == 33) {
296 						w = 0x2225; /* PARALLEL TO */
297 					} else if (s == 60) {
298 						w = 0xFF0D; /* FULLWIDTH HYPHEN-MINUS */
299 					} else if (s == 80) {
300 						w = 0xFFE0; /* FULLWIDTH CENT SIGN */
301 					} else if (s == 81) {
302 						w = 0xFFE1; /* FULLWIDTH POUND SIGN */
303 					} else if (s == 137) {
304 						w = 0xFFE2; /* FULLWIDTH NOT SIGN */
305 					}
306 				}
307 
308 				if (w == 0) {
309 					if (s >= cp932ext1_ucs_table_min && s < cp932ext1_ucs_table_max) {
310 						w = cp932ext1_ucs_table[s - cp932ext1_ucs_table_min];
311 					} else if (s < jisx0208_ucs_table_size) {
312 						w = jisx0208_ucs_table[s];
313 					} else if (s >= cp932ext2_ucs_table_min && s < cp932ext2_ucs_table_max) {
314 						w = cp932ext2_ucs_table[s - cp932ext2_ucs_table_min];
315 					}
316 				}
317 
318 				if (!w)
319 					w = MBFL_BAD_INPUT;
320 				*out++ = w;
321 			} else {
322 				*out++ = MBFL_BAD_INPUT;
323 			}
324 		} else if (c == 0x8E && p < e) {
325 			unsigned char c2 = *p++;
326 			if (c2 >= 0xA1 && c2 <= 0xDF) {
327 				*out++ = 0xFEC0 + c2;
328 			} else {
329 				*out++ = MBFL_BAD_INPUT;
330 			}
331 		} else {
332 			*out++ = MBFL_BAD_INPUT;
333 		}
334 	}
335 
336 	*in_len = e - p;
337 	*in = p;
338 	return out - buf;
339 }
340 
/*
 * Buffer-based encoder: code points -> CP51932 bytes.
 *
 * Encodes `len` code points from `in` into `buf`.  Space for two bytes per
 * remaining code point is reserved up front and again after every error,
 * since the error path goes through MB_CONVERT_ERROR.  `end` is not used.
 */
static void mb_wchar_to_cp51932(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);

	while (len--) {
		uint32_t w = *in++;
		unsigned int s = 0;	/* 0 means "no mapping found (yet)" */

		/* U+0000 is written as a single zero byte */
		if (w == 0) {
			out = mb_convert_buf_add(out, 0);
			continue;
		}

		/* First choice: the Unicode -> JIS range tables */
		if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) {
			s = ucs_a1_jis_table[w - ucs_a1_jis_table_min];
		} else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) {
			s = ucs_a2_jis_table[w - ucs_a2_jis_table_min];
		} else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) {
			s = ucs_i_jis_table[w - ucs_i_jis_table_min];
		} else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) {
			s = ucs_r_jis_table[w - ucs_r_jis_table_min];
		}

		if (s >= 0x8080) {
			s = 0; /* We don't support JIS X0213 */
		}

		if (s == 0) {
			switch (w) {
			case 0xA5: /* YEN SIGN */
				s = 0x216F; /* FULLWIDTH YEN SIGN */
				break;
			case 0xFF3C: /* FULLWIDTH REVERSE SOLIDUS */
				s = 0x2140;
				break;
			case 0x2225: /* PARALLEL TO */
				s = 0x2142;
				break;
			case 0xFF0D: /* FULLWIDTH HYPHEN-MINUS */
				s = 0x215D;
				break;
			case 0xFFE0: /* FULLWIDTH CENT SIGN */
				s = 0x2171;
				break;
			case 0xFFE1: /* FULLWIDTH POUND SIGN */
				s = 0x2172;
				break;
			case 0xFFE2: /* FULLWIDTH NOT SIGN */
				s = 0x224C;
				break;
			default:
				/* Reverse lookup in vendor extension 1; a hit always yields non-zero s */
				for (int i = 0; i < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; i++) {
					if (cp932ext1_ucs_table[i] == w) {
						s = ((i/94 + 0x2D) << 8) + (i%94) + 0x21;
						break;
					}
				}

				/* Vendor extension 2 is scanned only if extension 1 had no match */
				if (s == 0) {
					for (int i = 0; i < cp932ext2_ucs_table_max - cp932ext2_ucs_table_min; i++) {
						if (cp932ext2_ucs_table[i] == w) {
							s = ((i/94 + 0x79) << 8) + (i%94) + 0x21;
							break;
						}
					}
				}
				break;
			}
		}

		if (s == 0 || s >= 0x8080) {
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_cp51932);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
		} else if (s < 0x80) {
			/* single byte */
			out = mb_convert_buf_add(out, s);
		} else if (s < 0x100) {
			/* kana: prefixed with 0x8E */
			out = mb_convert_buf_add2(out, 0x8E, s);
		} else {
			/* double byte: both bytes get their high bit set */
			out = mb_convert_buf_add2(out, ((s >> 8) & 0xFF) | 0x80, (s & 0xFF) | 0x80);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
413