1 /*
2  * "streamable kanji code filter and converter"
3  * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
4  *
5  * LICENSE NOTICES
6  *
7  * This file is part of "streamable kanji code filter and converter",
8  * which is distributed under the terms of GNU Lesser General Public
9  * License (version 2) as published by the Free Software Foundation.
10  *
11  * This software is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with "streamable kanji code filter and converter";
18  * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
19  * Suite 330, Boston, MA  02111-1307  USA
20  *
21  * The author of this file:
22  *
23  */
/*
 * The source code included in this file was separated from mbfilter.c
 * by Moriyoshi Koizumi <moriyoshi@php.net> on 20 Dec 2002. The file
 * mbfilter.c is included in this package.
 *
 */
30 
31 #ifdef HAVE_CONFIG_H
32 #include "config.h"
33 #endif
34 
35 #ifdef HAVE_STDDEF_H
36 #include <stddef.h>
37 #endif
38 
39 #include "mbfl_encoding.h"
40 #include "mbfl_allocators.h"
41 #include "mbfl_filter_output.h"
42 #include "mbfilter_pass.h"
43 #include "mbfilter_8bit.h"
44 #include "mbfilter_wchar.h"
45 
46 #include "filters/mbfilter_euc_cn.h"
47 #include "filters/mbfilter_hz.h"
48 #include "filters/mbfilter_euc_tw.h"
49 #include "filters/mbfilter_big5.h"
50 #include "filters/mbfilter_uhc.h"
51 #include "filters/mbfilter_euc_kr.h"
52 #include "filters/mbfilter_iso2022_kr.h"
53 #include "filters/mbfilter_sjis.h"
54 #include "filters/mbfilter_sjis_open.h"
55 #include "filters/mbfilter_cp51932.h"
56 #include "filters/mbfilter_jis.h"
57 #include "filters/mbfilter_iso2022_jp_ms.h"
58 #include "filters/mbfilter_euc_jp.h"
59 #include "filters/mbfilter_euc_jp_win.h"
60 #include "filters/mbfilter_ascii.h"
61 #include "filters/mbfilter_koi8r.h"
62 #include "filters/mbfilter_koi8u.h"
63 #include "filters/mbfilter_cp866.h"
64 #include "filters/mbfilter_cp932.h"
65 #include "filters/mbfilter_cp936.h"
66 #include "filters/mbfilter_cp1251.h"
67 #include "filters/mbfilter_cp1252.h"
68 #include "filters/mbfilter_cp1254.h"
69 #include "filters/mbfilter_cp5022x.h"
70 #include "filters/mbfilter_iso8859_1.h"
71 #include "filters/mbfilter_iso8859_2.h"
72 #include "filters/mbfilter_iso8859_3.h"
73 #include "filters/mbfilter_iso8859_4.h"
74 #include "filters/mbfilter_iso8859_5.h"
75 #include "filters/mbfilter_iso8859_6.h"
76 #include "filters/mbfilter_iso8859_7.h"
77 #include "filters/mbfilter_iso8859_8.h"
78 #include "filters/mbfilter_iso8859_9.h"
79 #include "filters/mbfilter_iso8859_10.h"
80 #include "filters/mbfilter_iso8859_13.h"
81 #include "filters/mbfilter_iso8859_14.h"
82 #include "filters/mbfilter_iso8859_15.h"
83 #include "filters/mbfilter_base64.h"
84 #include "filters/mbfilter_qprint.h"
85 #include "filters/mbfilter_uuencode.h"
86 #include "filters/mbfilter_7bit.h"
87 #include "filters/mbfilter_utf7.h"
88 #include "filters/mbfilter_utf7imap.h"
89 #include "filters/mbfilter_utf8.h"
90 #include "filters/mbfilter_utf16.h"
91 #include "filters/mbfilter_utf32.h"
92 #include "filters/mbfilter_byte2.h"
93 #include "filters/mbfilter_byte4.h"
94 #include "filters/mbfilter_ucs4.h"
95 #include "filters/mbfilter_ucs2.h"
96 #include "filters/mbfilter_htmlent.h"
97 #include "filters/mbfilter_armscii8.h"
98 #include "filters/mbfilter_cp850.h"
99 
/*
 * Hex digit lookup table: index 0..15 yields the ASCII character
 * '0'..'9', 'A'..'F'.  The table is only ever read, so it is const.
 */
static const char mbfl_hexchar_table[] = {
	'0', '1', '2', '3', '4', '5', '6', '7',
	'8', '9', 'A', 'B', 'C', 'D', 'E', 'F'
};
104 
105 const struct mbfl_convert_vtbl *mbfl_convert_filter_list[] = {
106 	&vtbl_utf8_wchar,
107 	&vtbl_wchar_utf8,
108 	&vtbl_eucjp_wchar,
109 	&vtbl_wchar_eucjp,
110 	&vtbl_sjis_wchar,
111 	&vtbl_wchar_sjis,
112 	&vtbl_sjis_open_wchar,
113 	&vtbl_wchar_sjis_open,
114 	&vtbl_cp51932_wchar,
115 	&vtbl_wchar_cp51932,
116 	&vtbl_jis_wchar,
117 	&vtbl_wchar_jis,
118 	&vtbl_jis_ms_wchar,
119 	&vtbl_wchar_jis_ms,
120 	&vtbl_2022jp_wchar,
121 	&vtbl_wchar_2022jp,
122 	&vtbl_2022jpms_wchar,
123 	&vtbl_wchar_2022jpms,
124 	&vtbl_eucjpwin_wchar,
125 	&vtbl_wchar_eucjpwin,
126 	&vtbl_cp932_wchar,
127 	&vtbl_wchar_cp932,
128 	&vtbl_euccn_wchar,
129 	&vtbl_wchar_euccn,
130 	&vtbl_cp936_wchar,
131 	&vtbl_wchar_cp936,
132 	&vtbl_hz_wchar,
133 	&vtbl_wchar_hz,
134 	&vtbl_euctw_wchar,
135 	&vtbl_wchar_euctw,
136 	&vtbl_big5_wchar,
137 	&vtbl_wchar_big5,
138 	&vtbl_euckr_wchar,
139 	&vtbl_wchar_euckr,
140 	&vtbl_uhc_wchar,
141 	&vtbl_wchar_uhc,
142 	&vtbl_2022kr_wchar,
143 	&vtbl_wchar_2022kr,
144 	&vtbl_cp1251_wchar,
145 	&vtbl_wchar_cp1251,
146 	&vtbl_cp866_wchar,
147 	&vtbl_wchar_cp866,
148 	&vtbl_koi8r_wchar,
149 	&vtbl_wchar_koi8r,
150 	&vtbl_koi8u_wchar,
151 	&vtbl_wchar_koi8u,
152 	&vtbl_cp1252_wchar,
153 	&vtbl_wchar_cp1252,
154 	&vtbl_cp1254_wchar,
155 	&vtbl_wchar_cp1254,
156 	&vtbl_cp50220_wchar,
157 	&vtbl_wchar_cp50220,
158 	&vtbl_cp50220raw_wchar,
159 	&vtbl_wchar_cp50220raw,
160 	&vtbl_cp50221_wchar,
161 	&vtbl_wchar_cp50221,
162 	&vtbl_cp50222_wchar,
163 	&vtbl_wchar_cp50222,
164 	&vtbl_ascii_wchar,
165 	&vtbl_wchar_ascii,
166 	&vtbl_8859_1_wchar,
167 	&vtbl_wchar_8859_1,
168 	&vtbl_8859_2_wchar,
169 	&vtbl_wchar_8859_2,
170 	&vtbl_8859_3_wchar,
171 	&vtbl_wchar_8859_3,
172 	&vtbl_8859_4_wchar,
173 	&vtbl_wchar_8859_4,
174 	&vtbl_8859_5_wchar,
175 	&vtbl_wchar_8859_5,
176 	&vtbl_8859_6_wchar,
177 	&vtbl_wchar_8859_6,
178 	&vtbl_8859_7_wchar,
179 	&vtbl_wchar_8859_7,
180 	&vtbl_8859_8_wchar,
181 	&vtbl_wchar_8859_8,
182 	&vtbl_8859_9_wchar,
183 	&vtbl_wchar_8859_9,
184 	&vtbl_8859_10_wchar,
185 	&vtbl_wchar_8859_10,
186 	&vtbl_8859_13_wchar,
187 	&vtbl_wchar_8859_13,
188 	&vtbl_8859_14_wchar,
189 	&vtbl_wchar_8859_14,
190 	&vtbl_8859_15_wchar,
191 	&vtbl_wchar_8859_15,
192 	&vtbl_8bit_b64,
193 	&vtbl_b64_8bit,
194 	&vtbl_uuencode_8bit,
195 	&vtbl_wchar_html,
196 	&vtbl_html_wchar,
197 	&vtbl_8bit_qprint,
198 	&vtbl_qprint_8bit,
199 	&vtbl_8bit_7bit,
200 	&vtbl_7bit_8bit,
201 	&vtbl_utf7_wchar,
202 	&vtbl_wchar_utf7,
203 	&vtbl_utf7imap_wchar,
204 	&vtbl_wchar_utf7imap,
205 	&vtbl_utf16_wchar,
206 	&vtbl_wchar_utf16,
207 	&vtbl_utf16be_wchar,
208 	&vtbl_wchar_utf16be,
209 	&vtbl_utf16le_wchar,
210 	&vtbl_wchar_utf16le,
211 	&vtbl_utf32_wchar,
212 	&vtbl_wchar_utf32,
213 	&vtbl_utf32be_wchar,
214 	&vtbl_wchar_utf32be,
215 	&vtbl_utf32le_wchar,
216 	&vtbl_wchar_utf32le,
217 	&vtbl_ucs4_wchar,
218 	&vtbl_wchar_ucs4,
219 	&vtbl_ucs4be_wchar,
220 	&vtbl_wchar_ucs4be,
221 	&vtbl_ucs4le_wchar,
222 	&vtbl_wchar_ucs4le,
223 	&vtbl_ucs2_wchar,
224 	&vtbl_wchar_ucs2,
225 	&vtbl_ucs2be_wchar,
226 	&vtbl_wchar_ucs2be,
227 	&vtbl_ucs2le_wchar,
228 	&vtbl_wchar_ucs2le,
229 	&vtbl_byte4be_wchar,
230 	&vtbl_wchar_byte4be,
231 	&vtbl_byte4le_wchar,
232 	&vtbl_wchar_byte4le,
233 	&vtbl_byte2be_wchar,
234 	&vtbl_wchar_byte2be,
235 	&vtbl_byte2le_wchar,
236 	&vtbl_wchar_byte2le,
237 	&vtbl_armscii8_wchar,
238 	&vtbl_wchar_armscii8,
239 	&vtbl_cp850_wchar,
240 	&vtbl_wchar_cp850,
241 	&vtbl_pass,
242 	NULL
243 };
244 
245 static int
mbfl_convert_filter_common_init(mbfl_convert_filter * filter,enum mbfl_no_encoding from,enum mbfl_no_encoding to,const struct mbfl_convert_vtbl * vtbl,int (* output_function)(int,void *),int (* flush_function)(void *),void * data)246 mbfl_convert_filter_common_init(
247 	mbfl_convert_filter *filter,
248 	enum mbfl_no_encoding from,
249 	enum mbfl_no_encoding to,
250 	const struct mbfl_convert_vtbl *vtbl,
251     int (*output_function)(int, void* ),
252     int (*flush_function)(void*),
253     void* data)
254 {
255 	/* encoding structure */
256 	if ((filter->from = mbfl_no2encoding(from)) == NULL) {
257 		return 1;
258 	}
259 
260 	if ((filter->to = mbfl_no2encoding(to)) == NULL) {
261 		return 1;
262 	}
263 
264 	if (output_function != NULL) {
265 		filter->output_function = output_function;
266 	} else {
267 		filter->output_function = mbfl_filter_output_null;
268 	}
269 
270 	filter->flush_function = flush_function;
271 	filter->data = data;
272 	filter->illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
273 	filter->illegal_substchar = 0x3f;		/* '?' */
274 	filter->num_illegalchar = 0;
275 	filter->filter_ctor = vtbl->filter_ctor;
276 	filter->filter_dtor = vtbl->filter_dtor;
277 	filter->filter_function = vtbl->filter_function;
278 	filter->filter_flush = vtbl->filter_flush;
279 	filter->filter_copy = vtbl->filter_copy;
280 
281 	(*filter->filter_ctor)(filter);
282 
283 	return 0;
284 }
285 
286 
287 mbfl_convert_filter *
mbfl_convert_filter_new(enum mbfl_no_encoding from,enum mbfl_no_encoding to,int (* output_function)(int,void *),int (* flush_function)(void *),void * data)288 mbfl_convert_filter_new(
289     enum mbfl_no_encoding from,
290     enum mbfl_no_encoding to,
291     int (*output_function)(int, void* ),
292     int (*flush_function)(void*),
293     void* data)
294 {
295 	mbfl_convert_filter * filter;
296 	const struct mbfl_convert_vtbl *vtbl;
297 
298 	vtbl = mbfl_convert_filter_get_vtbl(from, to);
299 
300 	if (vtbl == NULL) {
301 		vtbl = &vtbl_pass;
302 	}
303 
304 	/* allocate */
305 	filter = (mbfl_convert_filter *)mbfl_malloc(sizeof(mbfl_convert_filter));
306 	if (filter == NULL) {
307 		return NULL;
308 	}
309 
310 	if (mbfl_convert_filter_common_init(filter, from, to, vtbl,
311 			output_function, flush_function, data)) {
312 		mbfl_free(filter);
313 		return NULL;
314 	}
315 
316 	return filter;
317 }
318 
319 mbfl_convert_filter *
mbfl_convert_filter_new2(const struct mbfl_convert_vtbl * vtbl,int (* output_function)(int,void *),int (* flush_function)(void *),void * data)320 mbfl_convert_filter_new2(
321 	const struct mbfl_convert_vtbl *vtbl,
322     int (*output_function)(int, void* ),
323     int (*flush_function)(void*),
324     void* data)
325 {
326 	mbfl_convert_filter * filter;
327 
328 	if (vtbl == NULL) {
329 		vtbl = &vtbl_pass;
330 	}
331 
332 	/* allocate */
333 	filter = (mbfl_convert_filter *)mbfl_malloc(sizeof(mbfl_convert_filter));
334 	if (filter == NULL) {
335 		return NULL;
336 	}
337 
338 	if (mbfl_convert_filter_common_init(filter, vtbl->from, vtbl->to, vtbl,
339 			output_function, flush_function, data)) {
340 		mbfl_free(filter);
341 		return NULL;
342 	}
343 
344 	return filter;
345 }
346 
347 void
mbfl_convert_filter_delete(mbfl_convert_filter * filter)348 mbfl_convert_filter_delete(mbfl_convert_filter *filter)
349 {
350 	if (filter) {
351 		(*filter->filter_dtor)(filter);
352 		mbfl_free((void*)filter);
353 	}
354 }
355 
356 int
mbfl_convert_filter_feed(int c,mbfl_convert_filter * filter)357 mbfl_convert_filter_feed(int c, mbfl_convert_filter *filter)
358 {
359 	return (*filter->filter_function)(c, filter);
360 }
361 
362 int
mbfl_convert_filter_flush(mbfl_convert_filter * filter)363 mbfl_convert_filter_flush(mbfl_convert_filter *filter)
364 {
365 	(*filter->filter_flush)(filter);
366 	return (filter->flush_function ? (*filter->flush_function)(filter->data) : 0);
367 }
368 
/*
 * Re-target an existing filter to a new from/to encoding pair while
 * keeping its output function, flush function and user data.
 *
 * The old filter state is destructed and the filter is re-initialised
 * through the common init routine, which also restores the default
 * illegal-character settings.  When no conversion table entry exists for
 * the pair, the pass-through entry (vtbl_pass) is used.
 *
 * This function cannot report failure, so both encoding numbers are
 * validated BEFORE the old state is torn down: if either one cannot be
 * resolved the filter is left completely untouched, instead of being
 * destructed and left with NULL encoding pointers.
 */
void mbfl_convert_filter_reset(mbfl_convert_filter *filter,
	    enum mbfl_no_encoding from, enum mbfl_no_encoding to)
{
	const struct mbfl_convert_vtbl *vtbl;

	/* refuse to reset to an unknown encoding; keep the filter usable */
	if (mbfl_no2encoding(from) == NULL || mbfl_no2encoding(to) == NULL) {
		return;
	}

	/* destruct old filter */
	(*filter->filter_dtor)(filter);

	vtbl = mbfl_convert_filter_get_vtbl(from, to);

	if (vtbl == NULL) {
		vtbl = &vtbl_pass;
	}

	/* cannot fail any more: both encodings were validated above */
	(void)mbfl_convert_filter_common_init(filter, from, to, vtbl,
			filter->output_function, filter->flush_function, filter->data);
}
386 
387 void
mbfl_convert_filter_copy(mbfl_convert_filter * src,mbfl_convert_filter * dest)388 mbfl_convert_filter_copy(
389     mbfl_convert_filter *src,
390     mbfl_convert_filter *dest)
391 {
392 	if (src->filter_copy != NULL) {
393 		src->filter_copy(src, dest);
394 		return;
395 	}
396 
397 	*dest = *src;
398 }
399 
/*
 * Feed the first `src->pos` bytes of the memory device's buffer through
 * the filter, one byte at a time.
 *
 * Returns -1 as soon as the filter function reports an error (a negative
 * value); otherwise returns the remaining count, which is 0 after a
 * complete pass over a non-empty buffer.
 */
int mbfl_convert_filter_devcat(mbfl_convert_filter *filter, mbfl_memory_device *src)
{
	unsigned char *cursor = src->buffer;
	int remaining;

	for (remaining = src->pos; remaining > 0; remaining--) {
		if ((*filter->filter_function)(*cursor++, filter) < 0) {
			return -1;
		}
	}

	return remaining;
}
416 
/*
 * Feed a NUL-terminated byte string through the filter, one byte at a
 * time; the terminator itself is not fed.
 *
 * Returns 0 on success, or -1 as soon as the filter function reports an
 * error (a negative value).
 */
int mbfl_convert_filter_strcat(mbfl_convert_filter *filter, const unsigned char *p)
{
	int ch;

	for (ch = *p++; ch != '\0'; ch = *p++) {
		if ((*filter->filter_function)(ch, filter) < 0) {
			return -1;
		}
	}

	return 0;
}
429 
430 /* illegal character output function for conv-filter */
431 int
mbfl_filt_conv_illegal_output(int c,mbfl_convert_filter * filter)432 mbfl_filt_conv_illegal_output(int c, mbfl_convert_filter *filter)
433 {
434 	int mode_backup, ret, n, m, r;
435 
436 	ret = 0;
437 	mode_backup = filter->illegal_mode;
438 	filter->illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
439 	switch (mode_backup) {
440 	case MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR:
441 		ret = (*filter->filter_function)(filter->illegal_substchar, filter);
442 		break;
443 	case MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG:
444 		if (c >= 0) {
445 			if (c < MBFL_WCSGROUP_UCS4MAX) {	/* unicode */
446 				ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)"U+");
447 			} else {
448 				if (c < MBFL_WCSGROUP_WCHARMAX) {
449 					m = c & ~MBFL_WCSPLANE_MASK;
450 					switch (m) {
451 					case MBFL_WCSPLANE_JIS0208:
452 						ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)"JIS+");
453 						break;
454 					case MBFL_WCSPLANE_JIS0212:
455 						ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)"JIS2+");
456 						break;
457 					case MBFL_WCSPLANE_WINCP932:
458 						ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)"W932+");
459 						break;
460 					case MBFL_WCSPLANE_8859_1:
461 						ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)"I8859_1+");
462 						break;
463 					default:
464 						ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)"?+");
465 						break;
466 					}
467 					c &= MBFL_WCSPLANE_MASK;
468 				} else {
469 					ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)"BAD+");
470 					c &= MBFL_WCSGROUP_MASK;
471 				}
472 			}
473 			if (ret >= 0) {
474 				m = 0;
475 				r = 28;
476 				while (r >= 0) {
477 					n = (c >> r) & 0xf;
478 					if (n || m) {
479 						m = 1;
480 						ret = (*filter->filter_function)(mbfl_hexchar_table[n], filter);
481 						if (ret < 0) {
482 							break;
483 						}
484 					}
485 					r -= 4;
486 				}
487 				if (m == 0 && ret >= 0) {
488 					ret = (*filter->filter_function)(mbfl_hexchar_table[0], filter);
489 				}
490 			}
491 		}
492 		break;
493 	case MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY:
494 		if (c >= 0) {
495 			if (c < MBFL_WCSGROUP_UCS4MAX) {	/* unicode */
496 				ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)"&#x");
497 				if (ret < 0)
498 					break;
499 
500 				m = 0;
501 				r = 28;
502 				while (r >= 0) {
503 					n = (c >> r) & 0xf;
504 					if (n || m) {
505 						m = 1;
506 						ret = (*filter->filter_function)(mbfl_hexchar_table[n], filter);
507 						if (ret < 0) {
508 							break;
509 						}
510 					}
511 					r -= 4;
512 				}
513 				if (ret < 0) {
514 					break;
515 				}
516 				if (m == 0) {
517 					ret = (*filter->filter_function)(mbfl_hexchar_table[0], filter);
518 				}
519 				ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)";");
520 			} else {
521 				ret = (*filter->filter_function)(filter->illegal_substchar, filter);
522 			}
523 		}
524 		break;
525 	default:
526 		break;
527 	}
528 	filter->illegal_mode = mode_backup;
529 	filter->num_illegalchar++;
530 
531 	return ret;
532 }
533 
mbfl_convert_filter_get_vtbl(enum mbfl_no_encoding from,enum mbfl_no_encoding to)534 const struct mbfl_convert_vtbl * mbfl_convert_filter_get_vtbl(enum mbfl_no_encoding from, enum mbfl_no_encoding to)
535 {
536 	const struct mbfl_convert_vtbl *vtbl;
537 	int i;
538 
539 	if (to == mbfl_no_encoding_base64 ||
540 	    to == mbfl_no_encoding_qprint ||
541 	    to == mbfl_no_encoding_7bit) {
542 		from = mbfl_no_encoding_8bit;
543 	} else if (from == mbfl_no_encoding_base64 ||
544 			   from == mbfl_no_encoding_qprint ||
545 			   from == mbfl_no_encoding_uuencode) {
546 		to = mbfl_no_encoding_8bit;
547 	}
548 
549 	i = 0;
550 	while ((vtbl = mbfl_convert_filter_list[i++]) != NULL){
551 		if (vtbl->from == from && vtbl->to == to) {
552 			return vtbl;
553 		}
554 	}
555 
556 	return NULL;
557 }
558 
559 /*
560  * commonly used constructor and destructor
561  */
mbfl_filt_conv_common_ctor(mbfl_convert_filter * filter)562 void mbfl_filt_conv_common_ctor(mbfl_convert_filter *filter)
563 {
564 	filter->status = 0;
565 	filter->cache = 0;
566 }
567 
/*
 * Default flush handler: clears the filter's status and cache fields,
 * then invokes the flush callback (if any) with the filter's user data.
 * The callback's result is ignored; this function always returns 0.
 */
int mbfl_filt_conv_common_flush(mbfl_convert_filter *filter)
{
	filter->cache = 0;
	filter->status = 0;

	if (filter->flush_function) {
		(*filter->flush_function)(filter->data);
	}

	return 0;
}
578 
/* Default destructor: clears the filter's status and cache fields. */
void mbfl_filt_conv_common_dtor(mbfl_convert_filter *filter)
{
	filter->cache = 0;
	filter->status = 0;
}
584 
585 
586