1 /*
2  * "streamable kanji code filter and converter"
3  * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
4  *
5  * LICENSE NOTICES
6  *
7  * This file is part of "streamable kanji code filter and converter",
8  * which is distributed under the terms of GNU Lesser General Public
9  * License (version 2) as published by the Free Software Foundation.
10  *
11  * This software is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with "streamable kanji code filter and converter";
18  * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
19  * Suite 330, Boston, MA  02111-1307  USA
20  *
21  * The author of this file:
22  *
23  */
/*
 * The source code included in this file was separated from mbfilter.c
 * by Rui Hirokawa <hirokawa@php.net> on 8 Aug 2011.
 *
 */
29 
30 #ifdef HAVE_CONFIG_H
31 #include "config.h"
32 #endif
33 
34 #include "mbfilter.h"
35 
36 #include "mbfilter_utf8_mobile.h"
37 #include "mbfilter_sjis_mobile.h"
38 
39 extern int mbfl_filt_ident_utf8(int c, mbfl_identify_filter *filter);
40 extern int mbfl_filt_conv_utf8_wchar_flush(mbfl_convert_filter *filter);
41 
42 extern const unsigned char mblen_table_utf8[];
43 
44 static const char *mbfl_encoding_utf8_docomo_aliases[] = {"UTF-8-DOCOMO", "UTF8-DOCOMO", NULL};
45 static const char *mbfl_encoding_utf8_kddi_a_aliases[] = {NULL};
46 static const char *mbfl_encoding_utf8_kddi_b_aliases[] = {"UTF-8-Mobile#KDDI", "UTF-8-KDDI", "UTF8-KDDI", NULL};
47 static const char *mbfl_encoding_utf8_sb_aliases[] = {"UTF-8-SOFTBANK", "UTF8-SOFTBANK", NULL};
48 
49 const mbfl_encoding mbfl_encoding_utf8_docomo = {
50 	mbfl_no_encoding_utf8_docomo,
51 	"UTF-8-Mobile#DOCOMO",
52 	"UTF-8",
53 	(const char *(*)[])&mbfl_encoding_utf8_docomo_aliases,
54 	mblen_table_utf8,
55 	MBFL_ENCTYPE_MBCS
56 };
57 
58 const mbfl_encoding mbfl_encoding_utf8_kddi_a = {
59 	mbfl_no_encoding_utf8_kddi_a,
60 	"UTF-8-Mobile#KDDI-A",
61 	"UTF-8",
62 	(const char *(*)[])&mbfl_encoding_utf8_kddi_a_aliases,
63 	mblen_table_utf8,
64 	MBFL_ENCTYPE_MBCS
65 };
66 
67 const mbfl_encoding mbfl_encoding_utf8_kddi_b = {
68 	mbfl_no_encoding_utf8_kddi_b,
69 	"UTF-8-Mobile#KDDI-B",
70 	"UTF-8",
71 	(const char *(*)[])&mbfl_encoding_utf8_kddi_b_aliases,
72 	mblen_table_utf8,
73 	MBFL_ENCTYPE_MBCS
74 };
75 
76 const mbfl_encoding mbfl_encoding_utf8_sb = {
77 	mbfl_no_encoding_utf8_sb,
78 	"UTF-8-Mobile#SOFTBANK",
79 	"UTF-8",
80 	(const char *(*)[])&mbfl_encoding_utf8_sb_aliases,
81 	mblen_table_utf8,
82 	MBFL_ENCTYPE_MBCS
83 };
84 
85 const struct mbfl_identify_vtbl vtbl_identify_utf8_docomo = {
86 	mbfl_no_encoding_utf8_docomo,
87 	mbfl_filt_ident_common_ctor,
88 	mbfl_filt_ident_common_dtor,
89 	mbfl_filt_ident_utf8
90 };
91 
92 const struct mbfl_identify_vtbl vtbl_identify_utf8_kddi_a = {
93 	mbfl_no_encoding_utf8_kddi_a,
94 	mbfl_filt_ident_common_ctor,
95 	mbfl_filt_ident_common_dtor,
96 	mbfl_filt_ident_utf8
97 };
98 
99 const struct mbfl_identify_vtbl vtbl_identify_utf8_kddi_b = {
100 	mbfl_no_encoding_utf8_kddi_b,
101 	mbfl_filt_ident_common_ctor,
102 	mbfl_filt_ident_common_dtor,
103 	mbfl_filt_ident_utf8
104 };
105 
106 const struct mbfl_identify_vtbl vtbl_identify_utf8_sb = {
107 	mbfl_no_encoding_utf8_sb,
108 	mbfl_filt_ident_common_ctor,
109 	mbfl_filt_ident_common_dtor,
110 	mbfl_filt_ident_utf8
111 };
112 
113 const struct mbfl_convert_vtbl vtbl_utf8_docomo_wchar = {
114 	mbfl_no_encoding_utf8_docomo,
115 	mbfl_no_encoding_wchar,
116 	mbfl_filt_conv_common_ctor,
117 	mbfl_filt_conv_common_dtor,
118 	mbfl_filt_conv_utf8_mobile_wchar,
119 	mbfl_filt_conv_utf8_wchar_flush
120 };
121 
122 const struct mbfl_convert_vtbl vtbl_wchar_utf8_docomo = {
123 	mbfl_no_encoding_wchar,
124 	mbfl_no_encoding_utf8_docomo,
125 	mbfl_filt_conv_common_ctor,
126 	mbfl_filt_conv_common_dtor,
127 	mbfl_filt_conv_wchar_utf8_mobile,
128 	mbfl_filt_conv_common_flush
129 };
130 
131 const struct mbfl_convert_vtbl vtbl_utf8_kddi_a_wchar = {
132 	mbfl_no_encoding_utf8_kddi_a,
133 	mbfl_no_encoding_wchar,
134 	mbfl_filt_conv_common_ctor,
135 	mbfl_filt_conv_common_dtor,
136 	mbfl_filt_conv_utf8_mobile_wchar,
137 	mbfl_filt_conv_utf8_wchar_flush
138 };
139 
140 const struct mbfl_convert_vtbl vtbl_wchar_utf8_kddi_a = {
141 	mbfl_no_encoding_wchar,
142 	mbfl_no_encoding_utf8_kddi_a,
143 	mbfl_filt_conv_common_ctor,
144 	mbfl_filt_conv_common_dtor,
145 	mbfl_filt_conv_wchar_utf8_mobile,
146 	mbfl_filt_conv_common_flush
147 };
148 
149 const struct mbfl_convert_vtbl vtbl_utf8_kddi_b_wchar = {
150 	mbfl_no_encoding_utf8_kddi_b,
151 	mbfl_no_encoding_wchar,
152 	mbfl_filt_conv_common_ctor,
153 	mbfl_filt_conv_common_dtor,
154 	mbfl_filt_conv_utf8_mobile_wchar,
155 	mbfl_filt_conv_utf8_wchar_flush
156 };
157 
158 const struct mbfl_convert_vtbl vtbl_wchar_utf8_kddi_b = {
159 	mbfl_no_encoding_wchar,
160 	mbfl_no_encoding_utf8_kddi_b,
161 	mbfl_filt_conv_common_ctor,
162 	mbfl_filt_conv_common_dtor,
163 	mbfl_filt_conv_wchar_utf8_mobile,
164 	mbfl_filt_conv_common_flush
165 };
166 
167 const struct mbfl_convert_vtbl vtbl_utf8_sb_wchar = {
168 	mbfl_no_encoding_utf8_sb,
169 	mbfl_no_encoding_wchar,
170 	mbfl_filt_conv_common_ctor,
171 	mbfl_filt_conv_common_dtor,
172 	mbfl_filt_conv_utf8_mobile_wchar,
173 	mbfl_filt_conv_utf8_wchar_flush
174 };
175 
176 const struct mbfl_convert_vtbl vtbl_wchar_utf8_sb = {
177 	mbfl_no_encoding_wchar,
178 	mbfl_no_encoding_utf8_sb,
179 	mbfl_filt_conv_common_ctor,
180 	mbfl_filt_conv_common_dtor,
181 	mbfl_filt_conv_wchar_utf8_mobile,
182 	mbfl_filt_conv_common_flush
183 };
184 
/*
 * Evaluate `statement` once; if its value is negative, make the ENCLOSING
 * function return -1.  Only usable inside functions returning int.
 */
#define CK(statement)	do { if ((statement) < 0) return (-1); } while (0)
186 
187 /*
188  * UTF-8 => wchar
189  */
mbfl_filt_conv_utf8_mobile_wchar(int c,mbfl_convert_filter * filter)190 int mbfl_filt_conv_utf8_mobile_wchar(int c, mbfl_convert_filter *filter)
191 {
192 	int s, w = 0, flag = 0;
193 	int s1 = 0, c1 = 0, snd = 0;
194 
195 retry:
196 	switch (filter->status & 0xff) {
197 	case 0x00:
198 		if (c < 0x80) {
199 			CK((*filter->output_function)(c, filter->data));
200 		} else if (c >= 0xc2 && c <= 0xdf) { /* 2byte code first char: 0xc2-0xdf */
201 			filter->status = 0x10;
202 			filter->cache = c & 0x1f;
203 		} else if (c >= 0xe0 && c <= 0xef) { /* 3byte code first char: 0xe0-0xef */
204 			filter->status = 0x20;
205 			filter->cache = c & 0xf;
206 		} else if (c >= 0xf0 && c <= 0xf4) { /* 3byte code first char: 0xf0-0xf4 */
207 			filter->status = 0x30;
208 			filter->cache = c & 0x7;
209 		} else {
210 			mbfl_filt_put_invalid_char(c, filter);
211 		}
212 		break;
213 	case 0x10: /* 2byte code 2nd char: 0x80-0xbf */
214 	case 0x21: /* 3byte code 3rd char: 0x80-0xbf */
215 	case 0x32: /* 4byte code 4th char: 0x80-0xbf */
216 		filter->status = 0;
217 		if (c >= 0x80 && c <= 0xbf) {
218 			s = (filter->cache<<6) | (c & 0x3f);
219 			filter->cache = 0;
220 
221 			if (filter->from->no_encoding == mbfl_no_encoding_utf8_docomo &&
222 				mbfilter_conv_r_map_tbl(s, &s1, mbfl_docomo2uni_pua, 4) > 0) {
223 				s = mbfilter_sjis_emoji_docomo2unicode(s1, &snd);
224 			} else if (filter->from->no_encoding == mbfl_no_encoding_utf8_kddi_a &&
225 					   mbfilter_conv_r_map_tbl(s, &s1, mbfl_kddi2uni_pua, 7) > 0) {
226 				s = mbfilter_sjis_emoji_kddi2unicode(s1, &snd);
227 			} else if (filter->from->no_encoding == mbfl_no_encoding_utf8_kddi_b &&
228 					   mbfilter_conv_r_map_tbl(s, &s1, mbfl_kddi2uni_pua_b, 8) > 0) {
229 				s = mbfilter_sjis_emoji_kddi2unicode(s1, &snd);
230 			} else if (filter->from->no_encoding == mbfl_no_encoding_utf8_sb &&
231 					   mbfilter_conv_r_map_tbl(s, &s1, mbfl_sb2uni_pua, 6) > 0) {
232 				s = mbfilter_sjis_emoji_sb2unicode(s1, &snd);
233 			}
234 
235 			if (snd > 0) {
236 				CK((*filter->output_function)(snd, filter->data));
237 			}
238 			CK((*filter->output_function)(s, filter->data));
239 		} else {
240 			mbfl_filt_put_invalid_char(filter->cache, filter);
241 			goto retry;
242 		}
243 		break;
244 	case 0x20: /* 3byte code 2nd char: 0:0xa0-0xbf,D:0x80-9F,1-C,E-F:0x80-0x9f */
245 		s = (filter->cache<<6) | (c & 0x3f);
246 		c1 = filter->cache & 0xf;
247 
248 		if ((c >= 0x80 && c <= 0xbf) &&
249 			((c1 == 0x0 && c >= 0xa0) ||
250 			 (c1 == 0xd && c < 0xa0) ||
251 			 (c1 > 0x0 && c1 != 0xd))) {
252 			filter->cache = s;
253 			filter->status++;
254 		} else {
255 			mbfl_filt_put_invalid_char(filter->cache, filter);
256 			goto retry;
257 		}
258 		break;
259 	case 0x30: /* 4byte code 2nd char: 0:0x90-0xbf,1-3:0x80-0xbf,4:0x80-0x8f */
260 		s = (filter->cache<<6) | (c & 0x3f);
261 		c1 = filter->cache & 0x7;
262 
263 		if ((c >= 0x80 && c <= 0xbf) &&
264 			((c1 == 0x0 && c >= 0x90) ||
265 			 (c1 == 0x4 && c < 0x90) ||
266 			 (c1 > 0x0 && c1 != 0x4))) {
267 			filter->cache = s;
268 			filter->status++;
269 		} else {
270 			mbfl_filt_put_invalid_char(filter->cache, filter);
271 			goto retry;
272 		}
273 		break;
274 	case 0x31: /* 4byte code 3rd char: 0x80-0xbf */
275 		if (c >= 0x80 && c <= 0xbf) {
276 			filter->cache = (filter->cache<<6) | (c & 0x3f);
277 			filter->status++;
278 		} else {
279 			mbfl_filt_put_invalid_char(filter->cache, filter);
280 			goto retry;
281 		}
282 		break;
283 	default:
284 		filter->status = 0;
285 		break;
286 	}
287 
288 	return c;
289 }
290 
291 /*
292  * wchar => UTF-8
293  */
mbfl_filt_conv_wchar_utf8_mobile(int c,mbfl_convert_filter * filter)294 int mbfl_filt_conv_wchar_utf8_mobile(int c, mbfl_convert_filter *filter)
295 {
296 	if (c >= 0 && c < 0x110000) {
297 		int s1, c1;
298 
299 		if ((filter->to->no_encoding == mbfl_no_encoding_utf8_docomo &&
300 			 mbfilter_unicode2sjis_emoji_docomo(c, &s1, filter) > 0 &&
301 			 mbfilter_conv_map_tbl(s1, &c1, mbfl_docomo2uni_pua, 4) > 0) ||
302 			(filter->to->no_encoding == mbfl_no_encoding_utf8_kddi_a &&
303 			 mbfilter_unicode2sjis_emoji_kddi(c, &s1, filter) > 0 &&
304 			 mbfilter_conv_map_tbl(s1, &c1, mbfl_kddi2uni_pua, 7) > 0) ||
305 			(filter->to->no_encoding == mbfl_no_encoding_utf8_kddi_b &&
306 			 mbfilter_unicode2sjis_emoji_kddi(c, &s1, filter) > 0 &&
307 			 mbfilter_conv_map_tbl(s1, &c1, mbfl_kddi2uni_pua_b, 8) > 0) ||
308 			(filter->to->no_encoding == mbfl_no_encoding_utf8_sb &&
309 			 mbfilter_unicode2sjis_emoji_sb(c, &s1, filter) > 0 &&
310 			 mbfilter_conv_map_tbl(s1, &c1, mbfl_sb2uni_pua, 6) > 0)) {
311 			c = c1;
312 		}
313 
314 		if (filter->status == 1 && filter->cache > 0) {
315 			return c;
316 		}
317 
318 		if (c < 0x80) {
319 			CK((*filter->output_function)(c, filter->data));
320 		} else if (c < 0x800) {
321 			CK((*filter->output_function)(((c >> 6) & 0x1f) | 0xc0, filter->data));
322 			CK((*filter->output_function)((c & 0x3f) | 0x80, filter->data));
323 		} else if (c < 0x10000) {
324 			CK((*filter->output_function)(((c >> 12) & 0x0f) | 0xe0, filter->data));
325 			CK((*filter->output_function)(((c >> 6) & 0x3f) | 0x80, filter->data));
326 			CK((*filter->output_function)((c & 0x3f) | 0x80, filter->data));
327 		} else {
328 			CK((*filter->output_function)(((c >> 18) & 0x07) | 0xf0, filter->data));
329 			CK((*filter->output_function)(((c >> 12) & 0x3f) | 0x80, filter->data));
330 			CK((*filter->output_function)(((c >> 6) & 0x3f) | 0x80, filter->data));
331 			CK((*filter->output_function)((c & 0x3f) | 0x80, filter->data));
332 		}
333 	} else {
334 		if (filter->illegal_mode != MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
335 			CK(mbfl_filt_conv_illegal_output(c, filter));
336 		}
337 	}
338 
339 	return c;
340 }
341 
342