1 /*
2 * "streamable kanji code filter and converter"
3 * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
4 *
5 * LICENSE NOTICES
6 *
7 * This file is part of "streamable kanji code filter and converter",
8 * which is distributed under the terms of GNU Lesser General Public
9 * License (version 2) as published by the Free Software Foundation.
10 *
11 * This software is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with "streamable kanji code filter and converter";
18 * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
19 * Suite 330, Boston, MA 02111-1307 USA
20 *
21 * The author of this file:
22 *
23 */
24 /*
 * The source code included in this file was separated from mbfilter.c
 * by Moriyoshi Koizumi <moriyoshi@php.net> on 20 Dec 2002. The file
 * mbfilter.c is included in this package.
28 *
29 */
30
31 #ifdef HAVE_CONFIG_H
32 #include "config.h"
33 #endif
34
35 #ifdef HAVE_STDDEF_H
36 #include <stddef.h>
37 #endif
38
39 #include "mbfl_encoding.h"
40 #include "mbfl_allocators.h"
41 #include "mbfl_filter_output.h"
42 #include "mbfilter_pass.h"
43 #include "mbfilter_8bit.h"
44 #include "mbfilter_wchar.h"
45
46 #include "filters/mbfilter_euc_cn.h"
47 #include "filters/mbfilter_hz.h"
48 #include "filters/mbfilter_euc_tw.h"
49 #include "filters/mbfilter_big5.h"
50 #include "filters/mbfilter_uhc.h"
51 #include "filters/mbfilter_euc_kr.h"
52 #include "filters/mbfilter_iso2022_kr.h"
53 #include "filters/mbfilter_sjis.h"
54 #include "filters/mbfilter_sjis_open.h"
55 #include "filters/mbfilter_cp51932.h"
56 #include "filters/mbfilter_jis.h"
57 #include "filters/mbfilter_iso2022_jp_ms.h"
58 #include "filters/mbfilter_euc_jp.h"
59 #include "filters/mbfilter_euc_jp_win.h"
60 #include "filters/mbfilter_ascii.h"
61 #include "filters/mbfilter_koi8r.h"
62 #include "filters/mbfilter_koi8u.h"
63 #include "filters/mbfilter_cp866.h"
64 #include "filters/mbfilter_cp932.h"
65 #include "filters/mbfilter_cp936.h"
66 #include "filters/mbfilter_cp1251.h"
67 #include "filters/mbfilter_cp1252.h"
68 #include "filters/mbfilter_cp1254.h"
69 #include "filters/mbfilter_cp5022x.h"
70 #include "filters/mbfilter_iso8859_1.h"
71 #include "filters/mbfilter_iso8859_2.h"
72 #include "filters/mbfilter_iso8859_3.h"
73 #include "filters/mbfilter_iso8859_4.h"
74 #include "filters/mbfilter_iso8859_5.h"
75 #include "filters/mbfilter_iso8859_6.h"
76 #include "filters/mbfilter_iso8859_7.h"
77 #include "filters/mbfilter_iso8859_8.h"
78 #include "filters/mbfilter_iso8859_9.h"
79 #include "filters/mbfilter_iso8859_10.h"
80 #include "filters/mbfilter_iso8859_13.h"
81 #include "filters/mbfilter_iso8859_14.h"
82 #include "filters/mbfilter_iso8859_15.h"
83 #include "filters/mbfilter_base64.h"
84 #include "filters/mbfilter_qprint.h"
85 #include "filters/mbfilter_uuencode.h"
86 #include "filters/mbfilter_7bit.h"
87 #include "filters/mbfilter_utf7.h"
88 #include "filters/mbfilter_utf7imap.h"
89 #include "filters/mbfilter_utf8.h"
90 #include "filters/mbfilter_utf16.h"
91 #include "filters/mbfilter_utf32.h"
92 #include "filters/mbfilter_byte2.h"
93 #include "filters/mbfilter_byte4.h"
94 #include "filters/mbfilter_ucs4.h"
95 #include "filters/mbfilter_ucs2.h"
96 #include "filters/mbfilter_htmlent.h"
97 #include "filters/mbfilter_armscii8.h"
98 #include "filters/mbfilter_cp850.h"
99
100 /* hex character table "0123456789ABCDEF" */
static const char mbfl_hexchar_table[] = {
	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,	/* digits 0-7 */
	0x38, 0x39, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46	/* digits 8-F */
};
104
105 const struct mbfl_convert_vtbl *mbfl_convert_filter_list[] = {
106 &vtbl_utf8_wchar,
107 &vtbl_wchar_utf8,
108 &vtbl_eucjp_wchar,
109 &vtbl_wchar_eucjp,
110 &vtbl_sjis_wchar,
111 &vtbl_wchar_sjis,
112 &vtbl_sjis_open_wchar,
113 &vtbl_wchar_sjis_open,
114 &vtbl_cp51932_wchar,
115 &vtbl_wchar_cp51932,
116 &vtbl_jis_wchar,
117 &vtbl_wchar_jis,
118 &vtbl_jis_ms_wchar,
119 &vtbl_wchar_jis_ms,
120 &vtbl_2022jp_wchar,
121 &vtbl_wchar_2022jp,
122 &vtbl_2022jpms_wchar,
123 &vtbl_wchar_2022jpms,
124 &vtbl_eucjpwin_wchar,
125 &vtbl_wchar_eucjpwin,
126 &vtbl_cp932_wchar,
127 &vtbl_wchar_cp932,
128 &vtbl_euccn_wchar,
129 &vtbl_wchar_euccn,
130 &vtbl_cp936_wchar,
131 &vtbl_wchar_cp936,
132 &vtbl_hz_wchar,
133 &vtbl_wchar_hz,
134 &vtbl_euctw_wchar,
135 &vtbl_wchar_euctw,
136 &vtbl_big5_wchar,
137 &vtbl_wchar_big5,
138 &vtbl_euckr_wchar,
139 &vtbl_wchar_euckr,
140 &vtbl_uhc_wchar,
141 &vtbl_wchar_uhc,
142 &vtbl_2022kr_wchar,
143 &vtbl_wchar_2022kr,
144 &vtbl_cp1251_wchar,
145 &vtbl_wchar_cp1251,
146 &vtbl_cp866_wchar,
147 &vtbl_wchar_cp866,
148 &vtbl_koi8r_wchar,
149 &vtbl_wchar_koi8r,
150 &vtbl_koi8u_wchar,
151 &vtbl_wchar_koi8u,
152 &vtbl_cp1252_wchar,
153 &vtbl_wchar_cp1252,
154 &vtbl_cp1254_wchar,
155 &vtbl_wchar_cp1254,
156 &vtbl_cp50220_wchar,
157 &vtbl_wchar_cp50220,
158 &vtbl_cp50220raw_wchar,
159 &vtbl_wchar_cp50220raw,
160 &vtbl_cp50221_wchar,
161 &vtbl_wchar_cp50221,
162 &vtbl_cp50222_wchar,
163 &vtbl_wchar_cp50222,
164 &vtbl_ascii_wchar,
165 &vtbl_wchar_ascii,
166 &vtbl_8859_1_wchar,
167 &vtbl_wchar_8859_1,
168 &vtbl_8859_2_wchar,
169 &vtbl_wchar_8859_2,
170 &vtbl_8859_3_wchar,
171 &vtbl_wchar_8859_3,
172 &vtbl_8859_4_wchar,
173 &vtbl_wchar_8859_4,
174 &vtbl_8859_5_wchar,
175 &vtbl_wchar_8859_5,
176 &vtbl_8859_6_wchar,
177 &vtbl_wchar_8859_6,
178 &vtbl_8859_7_wchar,
179 &vtbl_wchar_8859_7,
180 &vtbl_8859_8_wchar,
181 &vtbl_wchar_8859_8,
182 &vtbl_8859_9_wchar,
183 &vtbl_wchar_8859_9,
184 &vtbl_8859_10_wchar,
185 &vtbl_wchar_8859_10,
186 &vtbl_8859_13_wchar,
187 &vtbl_wchar_8859_13,
188 &vtbl_8859_14_wchar,
189 &vtbl_wchar_8859_14,
190 &vtbl_8859_15_wchar,
191 &vtbl_wchar_8859_15,
192 &vtbl_8bit_b64,
193 &vtbl_b64_8bit,
194 &vtbl_uuencode_8bit,
195 &vtbl_wchar_html,
196 &vtbl_html_wchar,
197 &vtbl_8bit_qprint,
198 &vtbl_qprint_8bit,
199 &vtbl_8bit_7bit,
200 &vtbl_7bit_8bit,
201 &vtbl_utf7_wchar,
202 &vtbl_wchar_utf7,
203 &vtbl_utf7imap_wchar,
204 &vtbl_wchar_utf7imap,
205 &vtbl_utf16_wchar,
206 &vtbl_wchar_utf16,
207 &vtbl_utf16be_wchar,
208 &vtbl_wchar_utf16be,
209 &vtbl_utf16le_wchar,
210 &vtbl_wchar_utf16le,
211 &vtbl_utf32_wchar,
212 &vtbl_wchar_utf32,
213 &vtbl_utf32be_wchar,
214 &vtbl_wchar_utf32be,
215 &vtbl_utf32le_wchar,
216 &vtbl_wchar_utf32le,
217 &vtbl_ucs4_wchar,
218 &vtbl_wchar_ucs4,
219 &vtbl_ucs4be_wchar,
220 &vtbl_wchar_ucs4be,
221 &vtbl_ucs4le_wchar,
222 &vtbl_wchar_ucs4le,
223 &vtbl_ucs2_wchar,
224 &vtbl_wchar_ucs2,
225 &vtbl_ucs2be_wchar,
226 &vtbl_wchar_ucs2be,
227 &vtbl_ucs2le_wchar,
228 &vtbl_wchar_ucs2le,
229 &vtbl_byte4be_wchar,
230 &vtbl_wchar_byte4be,
231 &vtbl_byte4le_wchar,
232 &vtbl_wchar_byte4le,
233 &vtbl_byte2be_wchar,
234 &vtbl_wchar_byte2be,
235 &vtbl_byte2le_wchar,
236 &vtbl_wchar_byte2le,
237 &vtbl_armscii8_wchar,
238 &vtbl_wchar_armscii8,
239 &vtbl_cp850_wchar,
240 &vtbl_wchar_cp850,
241 &vtbl_pass,
242 NULL
243 };
244
245 static int
mbfl_convert_filter_common_init(mbfl_convert_filter * filter,enum mbfl_no_encoding from,enum mbfl_no_encoding to,const struct mbfl_convert_vtbl * vtbl,int (* output_function)(int,void *),int (* flush_function)(void *),void * data)246 mbfl_convert_filter_common_init(
247 mbfl_convert_filter *filter,
248 enum mbfl_no_encoding from,
249 enum mbfl_no_encoding to,
250 const struct mbfl_convert_vtbl *vtbl,
251 int (*output_function)(int, void* ),
252 int (*flush_function)(void*),
253 void* data)
254 {
255 /* encoding structure */
256 if ((filter->from = mbfl_no2encoding(from)) == NULL) {
257 return 1;
258 }
259
260 if ((filter->to = mbfl_no2encoding(to)) == NULL) {
261 return 1;
262 }
263
264 if (output_function != NULL) {
265 filter->output_function = output_function;
266 } else {
267 filter->output_function = mbfl_filter_output_null;
268 }
269
270 filter->flush_function = flush_function;
271 filter->data = data;
272 filter->illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
273 filter->illegal_substchar = 0x3f; /* '?' */
274 filter->num_illegalchar = 0;
275 filter->filter_ctor = vtbl->filter_ctor;
276 filter->filter_dtor = vtbl->filter_dtor;
277 filter->filter_function = vtbl->filter_function;
278 filter->filter_flush = vtbl->filter_flush;
279 filter->filter_copy = vtbl->filter_copy;
280
281 (*filter->filter_ctor)(filter);
282
283 return 0;
284 }
285
286
287 mbfl_convert_filter *
mbfl_convert_filter_new(enum mbfl_no_encoding from,enum mbfl_no_encoding to,int (* output_function)(int,void *),int (* flush_function)(void *),void * data)288 mbfl_convert_filter_new(
289 enum mbfl_no_encoding from,
290 enum mbfl_no_encoding to,
291 int (*output_function)(int, void* ),
292 int (*flush_function)(void*),
293 void* data)
294 {
295 mbfl_convert_filter * filter;
296 const struct mbfl_convert_vtbl *vtbl;
297
298 vtbl = mbfl_convert_filter_get_vtbl(from, to);
299
300 if (vtbl == NULL) {
301 vtbl = &vtbl_pass;
302 }
303
304 /* allocate */
305 filter = (mbfl_convert_filter *)mbfl_malloc(sizeof(mbfl_convert_filter));
306 if (filter == NULL) {
307 return NULL;
308 }
309
310 if (mbfl_convert_filter_common_init(filter, from, to, vtbl,
311 output_function, flush_function, data)) {
312 mbfl_free(filter);
313 return NULL;
314 }
315
316 return filter;
317 }
318
319 mbfl_convert_filter *
mbfl_convert_filter_new2(const struct mbfl_convert_vtbl * vtbl,int (* output_function)(int,void *),int (* flush_function)(void *),void * data)320 mbfl_convert_filter_new2(
321 const struct mbfl_convert_vtbl *vtbl,
322 int (*output_function)(int, void* ),
323 int (*flush_function)(void*),
324 void* data)
325 {
326 mbfl_convert_filter * filter;
327
328 if (vtbl == NULL) {
329 vtbl = &vtbl_pass;
330 }
331
332 /* allocate */
333 filter = (mbfl_convert_filter *)mbfl_malloc(sizeof(mbfl_convert_filter));
334 if (filter == NULL) {
335 return NULL;
336 }
337
338 if (mbfl_convert_filter_common_init(filter, vtbl->from, vtbl->to, vtbl,
339 output_function, flush_function, data)) {
340 mbfl_free(filter);
341 return NULL;
342 }
343
344 return filter;
345 }
346
347 void
mbfl_convert_filter_delete(mbfl_convert_filter * filter)348 mbfl_convert_filter_delete(mbfl_convert_filter *filter)
349 {
350 if (filter) {
351 (*filter->filter_dtor)(filter);
352 mbfl_free((void*)filter);
353 }
354 }
355
356 int
mbfl_convert_filter_feed(int c,mbfl_convert_filter * filter)357 mbfl_convert_filter_feed(int c, mbfl_convert_filter *filter)
358 {
359 return (*filter->filter_function)(c, filter);
360 }
361
362 int
mbfl_convert_filter_flush(mbfl_convert_filter * filter)363 mbfl_convert_filter_flush(mbfl_convert_filter *filter)
364 {
365 (*filter->filter_flush)(filter);
366 return (filter->flush_function ? (*filter->flush_function)(filter->data) : 0);
367 }
368
/*
 * Re-target an existing filter to a new pair of encodings.
 *
 * The current destructor is run first, then the filter is initialised
 * again with the vtbl registered for from/to (vtbl_pass when there is
 * none), keeping the output callback, flush callback and data pointer it
 * already had.
 *
 * NOTE(review): the result of mbfl_convert_filter_common_init() is not
 * propagated (this function returns void), so a failed encoding lookup
 * goes unreported to the caller.
 */
void mbfl_convert_filter_reset(mbfl_convert_filter *filter,
	enum mbfl_no_encoding from, enum mbfl_no_encoding to)
{
	const struct mbfl_convert_vtbl *conv;

	/* tear down the state of the previous conversion */
	(*filter->filter_dtor)(filter);

	conv = mbfl_convert_filter_get_vtbl(from, to);
	if (!conv) {
		conv = &vtbl_pass;
	}

	(void)mbfl_convert_filter_common_init(filter, from, to, conv,
		filter->output_function, filter->flush_function, filter->data);
}
386
387 void
mbfl_convert_filter_copy(mbfl_convert_filter * src,mbfl_convert_filter * dest)388 mbfl_convert_filter_copy(
389 mbfl_convert_filter *src,
390 mbfl_convert_filter *dest)
391 {
392 if (src->filter_copy != NULL) {
393 src->filter_copy(src, dest);
394 return;
395 }
396
397 *dest = *src;
398 }
399
/*
 * Feed the first src->pos bytes of the memory device's buffer through the
 * filter, one byte at a time.
 *
 * Returns -1 as soon as the filter function reports a negative result;
 * otherwise returns the remaining byte counter, which is 0 once every
 * byte has been consumed.
 */
int mbfl_convert_filter_devcat(mbfl_convert_filter *filter, mbfl_memory_device *src)
{
	unsigned char *cursor = src->buffer;
	int remaining;

	for (remaining = src->pos; remaining > 0; remaining--) {
		if ((*filter->filter_function)(*cursor++, filter) < 0) {
			return -1;
		}
	}

	return remaining;
}
416
/*
 * Feed a NUL-terminated byte string through the filter; the terminator
 * itself is not fed.
 *
 * Returns 0 when the whole string was accepted, or -1 as soon as the
 * filter function reports a negative result.
 */
int mbfl_convert_filter_strcat(mbfl_convert_filter *filter, const unsigned char *p)
{
	for (; *p != '\0'; p++) {
		int byte = *p;

		if ((*filter->filter_function)(byte, filter) < 0) {
			return -1;
		}
	}

	return 0;
}
429
430 /* illegal character output function for conv-filter */
431 int
mbfl_filt_conv_illegal_output(int c,mbfl_convert_filter * filter)432 mbfl_filt_conv_illegal_output(int c, mbfl_convert_filter *filter)
433 {
434 int mode_backup, ret, n, m, r;
435
436 ret = 0;
437 mode_backup = filter->illegal_mode;
438 filter->illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
439 switch (mode_backup) {
440 case MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR:
441 ret = (*filter->filter_function)(filter->illegal_substchar, filter);
442 break;
443 case MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG:
444 if (c >= 0) {
445 if (c < MBFL_WCSGROUP_UCS4MAX) { /* unicode */
446 ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)"U+");
447 } else {
448 if (c < MBFL_WCSGROUP_WCHARMAX) {
449 m = c & ~MBFL_WCSPLANE_MASK;
450 switch (m) {
451 case MBFL_WCSPLANE_JIS0208:
452 ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)"JIS+");
453 break;
454 case MBFL_WCSPLANE_JIS0212:
455 ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)"JIS2+");
456 break;
457 case MBFL_WCSPLANE_WINCP932:
458 ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)"W932+");
459 break;
460 case MBFL_WCSPLANE_8859_1:
461 ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)"I8859_1+");
462 break;
463 default:
464 ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)"?+");
465 break;
466 }
467 c &= MBFL_WCSPLANE_MASK;
468 } else {
469 ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)"BAD+");
470 c &= MBFL_WCSGROUP_MASK;
471 }
472 }
473 if (ret >= 0) {
474 m = 0;
475 r = 28;
476 while (r >= 0) {
477 n = (c >> r) & 0xf;
478 if (n || m) {
479 m = 1;
480 ret = (*filter->filter_function)(mbfl_hexchar_table[n], filter);
481 if (ret < 0) {
482 break;
483 }
484 }
485 r -= 4;
486 }
487 if (m == 0 && ret >= 0) {
488 ret = (*filter->filter_function)(mbfl_hexchar_table[0], filter);
489 }
490 }
491 }
492 break;
493 case MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY:
494 if (c >= 0) {
495 if (c < MBFL_WCSGROUP_UCS4MAX) { /* unicode */
496 ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)"&#x");
497 if (ret < 0)
498 break;
499
500 m = 0;
501 r = 28;
502 while (r >= 0) {
503 n = (c >> r) & 0xf;
504 if (n || m) {
505 m = 1;
506 ret = (*filter->filter_function)(mbfl_hexchar_table[n], filter);
507 if (ret < 0) {
508 break;
509 }
510 }
511 r -= 4;
512 }
513 if (ret < 0) {
514 break;
515 }
516 if (m == 0) {
517 ret = (*filter->filter_function)(mbfl_hexchar_table[0], filter);
518 }
519 ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)";");
520 } else {
521 ret = (*filter->filter_function)(filter->illegal_substchar, filter);
522 }
523 }
524 break;
525 default:
526 break;
527 }
528 filter->illegal_mode = mode_backup;
529 filter->num_illegalchar++;
530
531 return ret;
532 }
533
mbfl_convert_filter_get_vtbl(enum mbfl_no_encoding from,enum mbfl_no_encoding to)534 const struct mbfl_convert_vtbl * mbfl_convert_filter_get_vtbl(enum mbfl_no_encoding from, enum mbfl_no_encoding to)
535 {
536 const struct mbfl_convert_vtbl *vtbl;
537 int i;
538
539 if (to == mbfl_no_encoding_base64 ||
540 to == mbfl_no_encoding_qprint ||
541 to == mbfl_no_encoding_7bit) {
542 from = mbfl_no_encoding_8bit;
543 } else if (from == mbfl_no_encoding_base64 ||
544 from == mbfl_no_encoding_qprint ||
545 from == mbfl_no_encoding_uuencode) {
546 to = mbfl_no_encoding_8bit;
547 }
548
549 i = 0;
550 while ((vtbl = mbfl_convert_filter_list[i++]) != NULL){
551 if (vtbl->from == from && vtbl->to == to) {
552 return vtbl;
553 }
554 }
555
556 return NULL;
557 }
558
559 /*
560 * commonly used constructor and destructor
561 */
/* Common constructor: clears the filter's status and cache members. */
void mbfl_filt_conv_common_ctor(mbfl_convert_filter *filter)
{
	filter->cache = 0;
	filter->status = 0;
}
567
/*
 * Common flush handler: clears the filter's status and cache members and
 * then invokes the flush callback, if one is installed. The callback's
 * result is ignored; the handler always returns 0.
 */
int mbfl_filt_conv_common_flush(mbfl_convert_filter *filter)
{
	filter->cache = 0;
	filter->status = 0;

	if (filter->flush_function) {
		(void)(*filter->flush_function)(filter->data);
	}

	return 0;
}
578
/* Common destructor: clears the filter's status and cache members. */
void mbfl_filt_conv_common_dtor(mbfl_convert_filter *filter)
{
	filter->cache = 0;
	filter->status = 0;
}
584
585
586