1 /*
2  * "streamable kanji code filter and converter"
3  * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
4  *
5  * LICENSE NOTICES
6  *
7  * This file is part of "streamable kanji code filter and converter",
8  * which is distributed under the terms of GNU Lesser General Public
9  * License (version 2) as published by the Free Software Foundation.
10  *
11  * This software is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with "streamable kanji code filter and converter";
18  * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
19  * Suite 330, Boston, MA  02111-1307  USA
20  *
21  * The author of this file:
22  *
23  */
24 /*
 * The source code included in this file was separated from mbfilter_ja.c
26  * by moriyoshi koizumi <moriyoshi@php.net> on 4 dec 2002.
27  *
28  */
29 
30 /* CP932 is Microsoft's version of Shift-JIS.
31  *
32  * What we call "SJIS-win" is a variant of CP932 which maps U+00A5
33  * and U+203E the same way as eucJP-win; namely, instead of mapping
34  * U+00A5 (YEN SIGN) to 0x5C and U+203E (OVERLINE) to 0x7E,
35  * these codepoints are mapped to appropriate JIS X 0208 characters.
36  *
37  * When converting from Shift-JIS to Unicode, there is no difference
38  * between CP932 and "SJIS-win".
39  *
40  * Additional facts:
41  *
42  * • In the libmbfl library which formed the base for mbstring, "CP932" and
43  *   "SJIS-win" were originally aliases. The differing mappings were added in
44  *   December 2002. The libmbfl author later stated that this was done so that
45  *   "CP932" would comply with a certain specification, while "SJIS-win" would
46  *   maintain the existing mappings. He does not remember which specification
47  *   it was.
48  * • The WHATWG specification for "Shift_JIS" (followed by web browsers)
49  *   agrees with our mappings for "CP932".
50  * • Microsoft Windows' "best-fit" mappings for CP932 (via the
51  *   WideCharToMultiByte API) convert U+00A5 to 0x5C, which also agrees with
52  *   our mappings for "CP932".
53  * • glibc's iconv converts U+203E to CP932 0x7E, which again agrees with
54  *   our mappings for "CP932".
55  * • When converting Shift-JIS to CP932, the conversion goes through Unicode.
56  *   Shift-JIS 0x7E converts to U+203E, so mapping U+203E to 0x7E means that
57  *   0x7E will go to 0x7E when converting Shift-JIS to CP932.
58  */
59 
60 #include "mbfilter.h"
61 #include "mbfilter_cp932.h"
62 
63 #include "unicode_table_cp932_ext.h"
64 #include "unicode_table_jis.h"
65 
/* Forward declaration: the conversion vtables in this file take the address
 * of this function before its definition appears. */
static int mbfl_filt_conv_cp932_wchar_flush(mbfl_convert_filter *filter);
67 
/*
 * Per-byte length table with 256 entries, one for each possible byte value
 * (16 entries per row, so row N covers bytes 0xN0-0xNF).
 * Entries are 2 for bytes 0x81-0x9F and 0xE0-0xFF and 1 for everything else.
 * Presumably looked up by the first byte of a character to get its length in
 * bytes -- confirm against the users of mbfl_encoding.
 */
static const unsigned char mblen_table_sjis[] = { /* 0x81-0x9f,0xE0-0xFF */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
};
86 
/* NULL-terminated lists of alternative names, referenced by the "CP932" and
 * "SJIS-win" encoding descriptors respectively. */
static const char *mbfl_encoding_cp932_aliases[] = {"MS932", "Windows-31J", "MS_Kanji", NULL};
static const char *mbfl_encoding_sjiswin_aliases[] = {"SJIS-ms", "SJIS-open", NULL};
89 
/*
 * Encoding descriptor for "CP932". It ties together the encoding id, the
 * name strings ("CP932" and "Shift_JIS" -- the latter is presumably the MIME
 * name; confirm against the mbfl_encoding definition), the alias list, the
 * shared Shift-JIS byte-length table and the two conversion vtables
 * (CP932 -> wchar and wchar -> CP932).
 */
const mbfl_encoding mbfl_encoding_cp932 = {
	mbfl_no_encoding_cp932,
	"CP932",
	"Shift_JIS",
	mbfl_encoding_cp932_aliases,
	mblen_table_sjis,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_cp932_wchar,
	&vtbl_wchar_cp932,
	NULL
};
101 
/*
 * Conversion vtable CP932 -> wchar. Decoding is done by
 * mbfl_filt_conv_cp932_wchar; mbfl_filt_conv_cp932_wchar_flush reports a
 * dangling lead byte when the input ends in the middle of a character.
 */
const struct mbfl_convert_vtbl vtbl_cp932_wchar = {
	mbfl_no_encoding_cp932,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_cp932_wchar,
	mbfl_filt_conv_cp932_wchar_flush,
	NULL,
};
111 
/*
 * Conversion vtable wchar -> CP932. Encoding is done by
 * mbfl_filt_conv_wchar_cp932; the common constructor and common flush
 * handlers are used.
 */
const struct mbfl_convert_vtbl vtbl_wchar_cp932 = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_cp932,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_cp932,
	mbfl_filt_conv_common_flush,
	NULL,
};
121 
/*
 * Encoding descriptor for "SJIS-win". Same layout as the "CP932" descriptor:
 * it shares the Shift-JIS byte-length table but uses its own alias list and
 * its own pair of conversion vtables.
 */
const mbfl_encoding mbfl_encoding_sjiswin = {
	mbfl_no_encoding_sjiswin,
	"SJIS-win",
	"Shift_JIS",
	mbfl_encoding_sjiswin_aliases,
	mblen_table_sjis,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_sjiswin_wchar,
	&vtbl_wchar_sjiswin,
	NULL
};
133 
/*
 * Conversion vtable SJIS-win -> wchar. It reuses the CP932 decoder and flush
 * functions unchanged, since decoding does not differ between the two
 * encodings.
 */
const struct mbfl_convert_vtbl vtbl_sjiswin_wchar = {
	mbfl_no_encoding_sjiswin,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_cp932_wchar,
	mbfl_filt_conv_cp932_wchar_flush,
	NULL,
};
143 
/*
 * Conversion vtable wchar -> SJIS-win. Encoding is done by
 * mbfl_filt_conv_wchar_sjiswin, which special-cases U+00A5 and U+203E and
 * defers to the CP932 encoder for everything else.
 */
const struct mbfl_convert_vtbl vtbl_wchar_sjiswin = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_sjiswin,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_sjiswin,
	mbfl_filt_conv_common_flush,
	NULL,
};
153 
/*
 * Evaluate `statement` exactly once; if its value is negative, make the
 * enclosing function return -1 immediately. Only usable inside functions
 * that return an integer.
 */
#define CK(statement)				\
	do {							\
		if ((statement) < 0) {		\
			return (-1);			\
		}							\
	} while (0)
155 
/*
 * Convert a two-byte JIS row/cell pair (c1, c2) into the corresponding
 * Shift-JIS lead/trail byte pair, stored into the lvalues s1 and s2.
 *
 *   - Two consecutive rows share one lead byte: rows below 0x5f map to lead
 *     bytes starting at 0x81, rows from 0x5f upwards map to lead bytes
 *     starting at 0xe0.
 *   - Odd rows use the lower trail-byte range (0x40 upwards, skipping 0x7f),
 *     even rows use the upper range (0x9f upwards).
 *
 * c1 and c2 are evaluated more than once, so they must not have side effects.
 * All arguments are parenthesized so that expressions such as `*p` can be
 * passed safely as s1/s2 (otherwise `s1--` would expand to `*p--`).
 */
#define SJIS_ENCODE(c1,c2,s1,s2)	\
		do {						\
			(s1) = (c1);			\
			(s1)--;					\
			(s1) >>= 1;				\
			if ((c1) < 0x5f) {		\
				(s1) += 0x71;		\
			} else {				\
				(s1) += 0xb1;		\
			}						\
			(s2) = (c2);			\
			if ((c1) & 1) {			\
				if ((c2) < 0x60) {	\
					(s2)--;			\
				}					\
				(s2) += 0x20;		\
			} else {				\
				(s2) += 0x7e;		\
			}						\
		} while (0)
176 
/*
 * Inverse of SJIS_ENCODE: convert a Shift-JIS lead/trail byte pair (c1, c2)
 * into a two-byte JIS row/cell pair, stored into the lvalues s1 and s2.
 *
 *   - Lead bytes below 0xa0 are rebased from 0x81, the others from 0xc1;
 *     each lead byte covers two rows, starting at row 0x21.
 *   - Trail bytes below 0x9f select the first (odd) row of the pair, with
 *     the gap at 0x7f closed up; trail bytes from 0x9f upwards select the
 *     second (even) row.
 *
 * No range validation is performed here; callers must pass valid bytes.
 * All arguments are parenthesized so that expressions such as `*p` can be
 * passed safely as s1/s2 (otherwise `s1++` would expand to `*p++`).
 */
#define SJIS_DECODE(c1,c2,s1,s2)	\
		do {						\
			(s1) = (c1);			\
			if ((s1) < 0xa0) {		\
				(s1) -= 0x81;		\
			} else {				\
				(s1) -= 0xc1;		\
			}						\
			(s1) <<= 1;				\
			(s1) += 0x21;			\
			(s2) = (c2);			\
			if ((s2) < 0x9f) {		\
				if ((s2) < 0x7f) {	\
					(s2)++;			\
				}					\
				(s2) -= 0x20;		\
			} else {				\
				(s1)++;				\
				(s2) -= 0x7e;		\
			}						\
		} while (0)
198 
/*
 * CP932 / SJIS-win -> wchar filter function.
 *
 * Consumes one input byte `c` per call. filter->status tracks the decoder
 * state (0: expecting the first byte of a character, 1: a lead byte is held
 * in filter->cache and the trail byte is expected). Decoded codepoints, or
 * MBFL_BAD_INPUT for undecodable input, are passed to
 * filter->output_function.
 *
 * Returns 0 on success, or -1 as soon as the output function returns a
 * negative value.
 */
int mbfl_filt_conv_cp932_wchar(int c, mbfl_convert_filter *filter)
{
	int lead, row, cell, pos, ucs;

	if (filter->status == 0) {
		if (c >= 0 && c < 0x80) {
			/* single-byte range below 0x80 is passed through unchanged */
			CK((*filter->output_function)(c, filter->data));
		} else if (c > 0xa0 && c < 0xe0) {
			/* single-byte kana: 0xA1..0xDF -> U+FF61..U+FF9F */
			CK((*filter->output_function)(0xfec0 + c, filter->data));
		} else if (c > 0x80 && c < 0xfd && c != 0xa0) {
			/* lead byte of a double-byte character: remember it and wait */
			filter->status = 1;
			filter->cache = c;
		} else {
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		}
		return 0;
	}

	if (filter->status != 1) {
		/* unexpected state: reset it and drop the byte */
		filter->status = 0;
		return 0;
	}

	/* Trail byte of a double-byte character */
	filter->status = 0;
	lead = filter->cache;

	if (c < 0x40 || c > 0xfc || c == 0x7f) {
		/* not a valid trail byte */
		CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		return 0;
	}

	SJIS_DECODE(lead, c, row, cell);
	/* linear index into the 94-cells-per-row grid */
	pos = (row - 0x21)*94 + cell - 0x21;

	/* Positions whose mapping is fixed here instead of coming from the tables */
	switch (pos) {
	case 31:
		ucs = 0xff3c;	/* FULLWIDTH REVERSE SOLIDUS */
		break;
	case 32:
		ucs = 0xff5e;	/* FULLWIDTH TILDE */
		break;
	case 33:
		ucs = 0x2225;	/* PARALLEL TO */
		break;
	case 60:
		ucs = 0xff0d;	/* FULLWIDTH HYPHEN-MINUS */
		break;
	case 80:
		ucs = 0xffe0;	/* FULLWIDTH CENT SIGN */
		break;
	case 81:
		ucs = 0xffe1;	/* FULLWIDTH POUND SIGN */
		break;
	case 137:
		ucs = 0xffe2;	/* FULLWIDTH NOT SIGN */
		break;
	default:
		ucs = 0;
		break;
	}

	if (ucs == 0) {
		/* The vendor ext1 range is tested before the X 0208 table, so it
		 * wins for any index that falls into both */
		if (pos >= cp932ext1_ucs_table_min && pos < cp932ext1_ucs_table_max) {
			/* vendor ext1 (13ku) */
			ucs = cp932ext1_ucs_table[pos - cp932ext1_ucs_table_min];
		} else if (pos >= 0 && pos < jisx0208_ucs_table_size) {
			/* X 0208 */
			ucs = jisx0208_ucs_table[pos];
		} else if (pos >= cp932ext2_ucs_table_min && pos < cp932ext2_ucs_table_max) {
			/* vendor ext2 (89ku - 92ku) */
			ucs = cp932ext2_ucs_table[pos - cp932ext2_ucs_table_min];
		} else if (pos >= cp932ext3_ucs_table_min && pos < cp932ext3_ucs_table_max) {
			/* vendor ext3 (115ku - 119ku) */
			ucs = cp932ext3_ucs_table[pos - cp932ext3_ucs_table_min];
		} else if (pos >= (94*94) && pos < (114*94)) {
			/* user-defined area (95ku - 114ku) maps linearly from U+E000 */
			ucs = pos - (94*94) + 0xe000;
		}
	}

	if (ucs <= 0) {
		/* no mapping found */
		ucs = MBFL_BAD_INPUT;
	}

	CK((*filter->output_function)(ucs, filter->data));
	return 0;
}
272 
/*
 * Flush handler for the CP932 / SJIS-win -> wchar filter.
 *
 * If the input ended while a lead byte was still pending (non-zero
 * filter->status), MBFL_BAD_INPUT is emitted for the truncated character
 * and the state is reset. Afterwards filter->flush_function is invoked when
 * one is set. The results of both calls are ignored; always returns 0.
 */
static int mbfl_filt_conv_cp932_wchar_flush(mbfl_convert_filter *filter)
{
	if (filter->status != 0) {
		/* input was cut off in the middle of a double-byte character */
		(void)(*filter->output_function)(MBFL_BAD_INPUT, filter->data);
		filter->status = 0;
	}

	if (filter->flush_function != 0) {
		(void)(*filter->flush_function)(filter->data);
	}

	return 0;
}
286 
/*
 * wchar -> CP932 filter function.
 *
 * Looks up the codepoint `c` in the Unicode -> JIS tables (plus a few
 * hard-coded mappings and the CP932 vendor extension tables), converts the
 * result to Shift-JIS bytes and writes them to filter->output_function.
 * Codepoints without a mapping are handed to
 * mbfl_filt_conv_illegal_output().
 *
 * Returns 0 on success, or -1 as soon as an output call returns a negative
 * value.
 */
int mbfl_filt_conv_wchar_cp932(int c, mbfl_convert_filter *filter)
{
	int code = 0;		/* single byte (< 0x100) or two-byte JIS code */
	int in_user_area = 0;	/* set when `code` comes from the U+E000 user area */
	int idx, count, hi, lo, lead, trail;

	if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
		code = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
	} else if (c == 0x203E) {
		/* OVERLINE maps to single byte 0x7E */
		code = 0x7E;
	} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
		code = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
	} else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
		code = ucs_i_jis_table[c - ucs_i_jis_table_min];
	} else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
		code = ucs_r_jis_table[c - ucs_r_jis_table_min];
	} else if (c >= 0xe000 && c < (0xe000 + 20*94)) {
		/* user-defined area (95ku - 114ku): linear mapping from U+E000 */
		idx = c - 0xe000;
		hi = idx/94 + 0x7f;
		lo = idx%94 + 0x21;
		code = (hi << 8) | lo;
		in_user_area = 1;
	}

	if (code <= 0) {
		/* Mappings applied only when the tables produced nothing */
		switch (c) {
		case 0xa5:	/* YEN SIGN */
			code = 0x5C;
			break;
		case 0xff3c:	/* FULLWIDTH REVERSE SOLIDUS */
			code = 0x2140;
			break;
		case 0x2225:	/* PARALLEL TO */
			code = 0x2142;
			break;
		case 0xff0d:	/* FULLWIDTH HYPHEN-MINUS */
			code = 0x215d;
			break;
		case 0xffe0:	/* FULLWIDTH CENT SIGN */
			code = 0x2171;
			break;
		case 0xffe1:	/* FULLWIDTH POUND SIGN */
			code = 0x2172;
			break;
		case 0xffe2:	/* FULLWIDTH NOT SIGN */
			code = 0x224c;
			break;
		default:
			break;
		}
	}

	/* Not found, or a table value >= 0x8080 (X 0212) that did not come from
	 * the user-defined area: fall back to the vendor extension tables */
	if (code <= 0 || (code >= 0x8080 && !in_user_area)) {
		code = -1;

		/* CP932 vendor ext1 (13ku): reverse lookup by linear search */
		count = cp932ext1_ucs_table_max - cp932ext1_ucs_table_min;
		for (idx = 0; idx < count; idx++) {
			if (c == cp932ext1_ucs_table[idx]) {
				code = ((idx/94 + 0x2d) << 8) + (idx%94 + 0x21);
				break;
			}
		}

		if (code <= 0) {
			/* CP932 vendor ext3 (115ku - 119ku) */
			count = cp932ext3_ucs_table_max - cp932ext3_ucs_table_min;
			for (idx = 0; idx < count; idx++) {
				if (c == cp932ext3_ucs_table[idx]) {
					code = ((idx/94 + 0x93) << 8) + (idx%94 + 0x21);
					break;
				}
			}
		}

		if (c == 0) {
			/* U+0000 always encodes as a single zero byte, whatever the
			 * searches above found */
			code = 0;
		} else if (code <= 0) {
			code = -1;
		}
	}

	if (code < 0) {
		/* no representation in CP932 */
		CK(mbfl_filt_conv_illegal_output(c, filter));
		return 0;
	}

	if (code < 0x100) {
		/* latin or kana: single byte */
		CK((*filter->output_function)(code, filter->data));
	} else {
		/* kanji: split into the two JIS bytes and convert to Shift-JIS */
		hi = (code >> 8) & 0xff;
		lo = code & 0xff;
		SJIS_ENCODE(hi, lo, lead, trail);
		CK((*filter->output_function)(lead, filter->data));
		CK((*filter->output_function)(trail, filter->data));
	}

	return 0;
}
371 
/*
 * wchar -> SJIS-win filter function.
 *
 * Identical to the CP932 encoder except for two codepoints, which are
 * emitted as double-byte sequences instead of single bytes:
 *   U+00A5 (YEN SIGN) -> 0x81 0x8F
 *   U+203E (OVERLINE) -> 0x81 0x50
 *
 * Returns 0 on success or -1 if an output call returns a negative value;
 * for all other codepoints, returns whatever mbfl_filt_conv_wchar_cp932()
 * returns.
 */
int mbfl_filt_conv_wchar_sjiswin(int c, mbfl_convert_filter *filter)
{
	int trail;

	switch (c) {
	case 0xA5:
		trail = 0x8F;
		break;
	case 0x203E:
		trail = 0x50;
		break;
	default:
		/* everything else is encoded exactly as in CP932 */
		return mbfl_filt_conv_wchar_cp932(c, filter);
	}

	/* both special cases share the lead byte 0x81 */
	CK((*filter->output_function)(0x81, filter->data));
	CK((*filter->output_function)(trail, filter->data));
	return 0;
}
385