1 /*
2 * "streamable kanji code filter and converter"
3 * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
4 *
5 * LICENSE NOTICES
6 *
7 * This file is part of "streamable kanji code filter and converter",
8 * which is distributed under the terms of GNU Lesser General Public
9 * License (version 2) as published by the Free Software Foundation.
10 *
11 * This software is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with "streamable kanji code filter and converter";
18 * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
19 * Suite 330, Boston, MA 02111-1307 USA
20 *
21 * The author of this file:
22 *
23 */
/*
 * The source code included in this file was separated from mbfilter.c
 * by Rui Hirokawa <hirokawa@php.net> on 8 Aug 2011.
 *
 */
29
30 #ifdef HAVE_CONFIG_H
31 #include "config.h"
32 #endif
33
34 #include "mbfilter.h"
35
36 #include "mbfilter_utf8_mobile.h"
37 #include "mbfilter_sjis_mobile.h"
38
39 extern int mbfl_filt_ident_utf8(int c, mbfl_identify_filter *filter);
40 extern int mbfl_filt_conv_utf8_wchar_flush(mbfl_convert_filter *filter);
41
42 extern const unsigned char mblen_table_utf8[];
43
/*
 * NULL-terminated alias lists, one per mobile UTF-8 variant.
 * The KDDI-A variant has no aliases, so its list holds only the terminator.
 */
static const char *mbfl_encoding_utf8_docomo_aliases[] = {
	"UTF-8-DOCOMO",
	"UTF8-DOCOMO",
	NULL
};

static const char *mbfl_encoding_utf8_kddi_a_aliases[] = {
	NULL
};

static const char *mbfl_encoding_utf8_kddi_b_aliases[] = {
	"UTF-8-Mobile#KDDI",
	"UTF-8-KDDI",
	"UTF8-KDDI",
	NULL
};

static const char *mbfl_encoding_utf8_sb_aliases[] = {
	"UTF-8-SOFTBANK",
	"UTF8-SOFTBANK",
	NULL
};
48
49 const mbfl_encoding mbfl_encoding_utf8_docomo = {
50 mbfl_no_encoding_utf8_docomo,
51 "UTF-8-Mobile#DOCOMO",
52 "UTF-8",
53 (const char *(*)[])&mbfl_encoding_utf8_docomo_aliases,
54 mblen_table_utf8,
55 MBFL_ENCTYPE_MBCS,
56 &vtbl_utf8_docomo_wchar,
57 &vtbl_wchar_utf8_docomo
58 };
59
60 const mbfl_encoding mbfl_encoding_utf8_kddi_a = {
61 mbfl_no_encoding_utf8_kddi_a,
62 "UTF-8-Mobile#KDDI-A",
63 "UTF-8",
64 (const char *(*)[])&mbfl_encoding_utf8_kddi_a_aliases,
65 mblen_table_utf8,
66 MBFL_ENCTYPE_MBCS,
67 &vtbl_utf8_kddi_a_wchar,
68 &vtbl_wchar_utf8_kddi_a
69 };
70
71 const mbfl_encoding mbfl_encoding_utf8_kddi_b = {
72 mbfl_no_encoding_utf8_kddi_b,
73 "UTF-8-Mobile#KDDI-B",
74 "UTF-8",
75 (const char *(*)[])&mbfl_encoding_utf8_kddi_b_aliases,
76 mblen_table_utf8,
77 MBFL_ENCTYPE_MBCS,
78 &vtbl_utf8_kddi_b_wchar,
79 &vtbl_wchar_utf8_kddi_b
80 };
81
82 const mbfl_encoding mbfl_encoding_utf8_sb = {
83 mbfl_no_encoding_utf8_sb,
84 "UTF-8-Mobile#SOFTBANK",
85 "UTF-8",
86 (const char *(*)[])&mbfl_encoding_utf8_sb_aliases,
87 mblen_table_utf8,
88 MBFL_ENCTYPE_MBCS,
89 &vtbl_utf8_sb_wchar,
90 &vtbl_wchar_utf8_sb
91 };
92
93 const struct mbfl_identify_vtbl vtbl_identify_utf8_docomo = {
94 mbfl_no_encoding_utf8_docomo,
95 mbfl_filt_ident_common_ctor,
96 mbfl_filt_ident_common_dtor,
97 mbfl_filt_ident_utf8
98 };
99
100 const struct mbfl_identify_vtbl vtbl_identify_utf8_kddi_a = {
101 mbfl_no_encoding_utf8_kddi_a,
102 mbfl_filt_ident_common_ctor,
103 mbfl_filt_ident_common_dtor,
104 mbfl_filt_ident_utf8
105 };
106
107 const struct mbfl_identify_vtbl vtbl_identify_utf8_kddi_b = {
108 mbfl_no_encoding_utf8_kddi_b,
109 mbfl_filt_ident_common_ctor,
110 mbfl_filt_ident_common_dtor,
111 mbfl_filt_ident_utf8
112 };
113
114 const struct mbfl_identify_vtbl vtbl_identify_utf8_sb = {
115 mbfl_no_encoding_utf8_sb,
116 mbfl_filt_ident_common_ctor,
117 mbfl_filt_ident_common_dtor,
118 mbfl_filt_ident_utf8
119 };
120
121 const struct mbfl_convert_vtbl vtbl_utf8_docomo_wchar = {
122 mbfl_no_encoding_utf8_docomo,
123 mbfl_no_encoding_wchar,
124 mbfl_filt_conv_common_ctor,
125 mbfl_filt_conv_common_dtor,
126 mbfl_filt_conv_utf8_mobile_wchar,
127 mbfl_filt_conv_utf8_wchar_flush
128 };
129
130 const struct mbfl_convert_vtbl vtbl_wchar_utf8_docomo = {
131 mbfl_no_encoding_wchar,
132 mbfl_no_encoding_utf8_docomo,
133 mbfl_filt_conv_common_ctor,
134 mbfl_filt_conv_common_dtor,
135 mbfl_filt_conv_wchar_utf8_mobile,
136 mbfl_filt_conv_common_flush
137 };
138
139 const struct mbfl_convert_vtbl vtbl_utf8_kddi_a_wchar = {
140 mbfl_no_encoding_utf8_kddi_a,
141 mbfl_no_encoding_wchar,
142 mbfl_filt_conv_common_ctor,
143 mbfl_filt_conv_common_dtor,
144 mbfl_filt_conv_utf8_mobile_wchar,
145 mbfl_filt_conv_utf8_wchar_flush
146 };
147
148 const struct mbfl_convert_vtbl vtbl_wchar_utf8_kddi_a = {
149 mbfl_no_encoding_wchar,
150 mbfl_no_encoding_utf8_kddi_a,
151 mbfl_filt_conv_common_ctor,
152 mbfl_filt_conv_common_dtor,
153 mbfl_filt_conv_wchar_utf8_mobile,
154 mbfl_filt_conv_common_flush
155 };
156
157 const struct mbfl_convert_vtbl vtbl_utf8_kddi_b_wchar = {
158 mbfl_no_encoding_utf8_kddi_b,
159 mbfl_no_encoding_wchar,
160 mbfl_filt_conv_common_ctor,
161 mbfl_filt_conv_common_dtor,
162 mbfl_filt_conv_utf8_mobile_wchar,
163 mbfl_filt_conv_utf8_wchar_flush
164 };
165
166 const struct mbfl_convert_vtbl vtbl_wchar_utf8_kddi_b = {
167 mbfl_no_encoding_wchar,
168 mbfl_no_encoding_utf8_kddi_b,
169 mbfl_filt_conv_common_ctor,
170 mbfl_filt_conv_common_dtor,
171 mbfl_filt_conv_wchar_utf8_mobile,
172 mbfl_filt_conv_common_flush
173 };
174
175 const struct mbfl_convert_vtbl vtbl_utf8_sb_wchar = {
176 mbfl_no_encoding_utf8_sb,
177 mbfl_no_encoding_wchar,
178 mbfl_filt_conv_common_ctor,
179 mbfl_filt_conv_common_dtor,
180 mbfl_filt_conv_utf8_mobile_wchar,
181 mbfl_filt_conv_utf8_wchar_flush
182 };
183
184 const struct mbfl_convert_vtbl vtbl_wchar_utf8_sb = {
185 mbfl_no_encoding_wchar,
186 mbfl_no_encoding_utf8_sb,
187 mbfl_filt_conv_common_ctor,
188 mbfl_filt_conv_common_dtor,
189 mbfl_filt_conv_wchar_utf8_mobile,
190 mbfl_filt_conv_common_flush
191 };
192
193 #define CK(statement) do { if ((statement) < 0) return (-1); } while (0)
194 int mbfl_filt_put_invalid_char(int c, mbfl_convert_filter *filter);
195
196 /*
197 * UTF-8 => wchar
198 */
mbfl_filt_conv_utf8_mobile_wchar(int c,mbfl_convert_filter * filter)199 int mbfl_filt_conv_utf8_mobile_wchar(int c, mbfl_convert_filter *filter)
200 {
201 int s, s1 = 0, c1 = 0, snd = 0;
202
203 retry:
204 switch (filter->status & 0xff) {
205 case 0x00:
206 if (c < 0x80) {
207 CK((*filter->output_function)(c, filter->data));
208 } else if (c >= 0xc2 && c <= 0xdf) { /* 2byte code first char: 0xc2-0xdf */
209 filter->status = 0x10;
210 filter->cache = c & 0x1f;
211 } else if (c >= 0xe0 && c <= 0xef) { /* 3byte code first char: 0xe0-0xef */
212 filter->status = 0x20;
213 filter->cache = c & 0xf;
214 } else if (c >= 0xf0 && c <= 0xf4) { /* 3byte code first char: 0xf0-0xf4 */
215 filter->status = 0x30;
216 filter->cache = c & 0x7;
217 } else {
218 CK(mbfl_filt_put_invalid_char(c, filter));
219 }
220 break;
221 case 0x10: /* 2byte code 2nd char: 0x80-0xbf */
222 case 0x21: /* 3byte code 3rd char: 0x80-0xbf */
223 case 0x32: /* 4byte code 4th char: 0x80-0xbf */
224 filter->status = 0;
225 if (c >= 0x80 && c <= 0xbf) {
226 s = (filter->cache<<6) | (c & 0x3f);
227 filter->cache = 0;
228
229 if (filter->from->no_encoding == mbfl_no_encoding_utf8_docomo &&
230 mbfilter_conv_r_map_tbl(s, &s1, mbfl_docomo2uni_pua, 4) > 0) {
231 s = mbfilter_sjis_emoji_docomo2unicode(s1, &snd);
232 } else if (filter->from->no_encoding == mbfl_no_encoding_utf8_kddi_a &&
233 mbfilter_conv_r_map_tbl(s, &s1, mbfl_kddi2uni_pua, 7) > 0) {
234 s = mbfilter_sjis_emoji_kddi2unicode(s1, &snd);
235 } else if (filter->from->no_encoding == mbfl_no_encoding_utf8_kddi_b &&
236 mbfilter_conv_r_map_tbl(s, &s1, mbfl_kddi2uni_pua_b, 8) > 0) {
237 s = mbfilter_sjis_emoji_kddi2unicode(s1, &snd);
238 } else if (filter->from->no_encoding == mbfl_no_encoding_utf8_sb &&
239 mbfilter_conv_r_map_tbl(s, &s1, mbfl_sb2uni_pua, 6) > 0) {
240 s = mbfilter_sjis_emoji_sb2unicode(s1, &snd);
241 }
242
243 if (snd > 0) {
244 CK((*filter->output_function)(snd, filter->data));
245 }
246 CK((*filter->output_function)(s, filter->data));
247 } else {
248 CK(mbfl_filt_put_invalid_char(filter->cache, filter));
249 goto retry;
250 }
251 break;
252 case 0x20: /* 3byte code 2nd char: 0:0xa0-0xbf,D:0x80-9F,1-C,E-F:0x80-0x9f */
253 s = (filter->cache<<6) | (c & 0x3f);
254 c1 = filter->cache & 0xf;
255
256 if ((c >= 0x80 && c <= 0xbf) &&
257 ((c1 == 0x0 && c >= 0xa0) ||
258 (c1 == 0xd && c < 0xa0) ||
259 (c1 > 0x0 && c1 != 0xd))) {
260 filter->cache = s;
261 filter->status++;
262 } else {
263 CK(mbfl_filt_put_invalid_char(filter->cache, filter));
264 goto retry;
265 }
266 break;
267 case 0x30: /* 4byte code 2nd char: 0:0x90-0xbf,1-3:0x80-0xbf,4:0x80-0x8f */
268 s = (filter->cache<<6) | (c & 0x3f);
269 c1 = filter->cache & 0x7;
270
271 if ((c >= 0x80 && c <= 0xbf) &&
272 ((c1 == 0x0 && c >= 0x90) ||
273 (c1 == 0x4 && c < 0x90) ||
274 (c1 > 0x0 && c1 != 0x4))) {
275 filter->cache = s;
276 filter->status++;
277 } else {
278 CK(mbfl_filt_put_invalid_char(filter->cache, filter));
279 goto retry;
280 }
281 break;
282 case 0x31: /* 4byte code 3rd char: 0x80-0xbf */
283 if (c >= 0x80 && c <= 0xbf) {
284 filter->cache = (filter->cache<<6) | (c & 0x3f);
285 filter->status++;
286 } else {
287 CK(mbfl_filt_put_invalid_char(filter->cache, filter));
288 goto retry;
289 }
290 break;
291 default:
292 filter->status = 0;
293 break;
294 }
295
296 return c;
297 }
298
299 /*
300 * wchar => UTF-8
301 */
mbfl_filt_conv_wchar_utf8_mobile(int c,mbfl_convert_filter * filter)302 int mbfl_filt_conv_wchar_utf8_mobile(int c, mbfl_convert_filter *filter)
303 {
304 if (c >= 0 && c < 0x110000) {
305 int s1, c1;
306
307 if ((filter->to->no_encoding == mbfl_no_encoding_utf8_docomo &&
308 mbfilter_unicode2sjis_emoji_docomo(c, &s1, filter) > 0 &&
309 mbfilter_conv_map_tbl(s1, &c1, mbfl_docomo2uni_pua, 4) > 0) ||
310 (filter->to->no_encoding == mbfl_no_encoding_utf8_kddi_a &&
311 mbfilter_unicode2sjis_emoji_kddi(c, &s1, filter) > 0 &&
312 mbfilter_conv_map_tbl(s1, &c1, mbfl_kddi2uni_pua, 7) > 0) ||
313 (filter->to->no_encoding == mbfl_no_encoding_utf8_kddi_b &&
314 mbfilter_unicode2sjis_emoji_kddi(c, &s1, filter) > 0 &&
315 mbfilter_conv_map_tbl(s1, &c1, mbfl_kddi2uni_pua_b, 8) > 0) ||
316 (filter->to->no_encoding == mbfl_no_encoding_utf8_sb &&
317 mbfilter_unicode2sjis_emoji_sb(c, &s1, filter) > 0 &&
318 mbfilter_conv_map_tbl(s1, &c1, mbfl_sb2uni_pua, 6) > 0)) {
319 c = c1;
320 }
321
322 if (filter->status == 1 && filter->cache > 0) {
323 return c;
324 }
325
326 if (c < 0x80) {
327 CK((*filter->output_function)(c, filter->data));
328 } else if (c < 0x800) {
329 CK((*filter->output_function)(((c >> 6) & 0x1f) | 0xc0, filter->data));
330 CK((*filter->output_function)((c & 0x3f) | 0x80, filter->data));
331 } else if (c < 0x10000) {
332 CK((*filter->output_function)(((c >> 12) & 0x0f) | 0xe0, filter->data));
333 CK((*filter->output_function)(((c >> 6) & 0x3f) | 0x80, filter->data));
334 CK((*filter->output_function)((c & 0x3f) | 0x80, filter->data));
335 } else {
336 CK((*filter->output_function)(((c >> 18) & 0x07) | 0xf0, filter->data));
337 CK((*filter->output_function)(((c >> 12) & 0x3f) | 0x80, filter->data));
338 CK((*filter->output_function)(((c >> 6) & 0x3f) | 0x80, filter->data));
339 CK((*filter->output_function)((c & 0x3f) | 0x80, filter->data));
340 }
341 } else {
342 if (filter->illegal_mode != MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
343 CK(mbfl_filt_conv_illegal_output(c, filter));
344 }
345 }
346
347 return c;
348 }
349