1 /*
2  * "streamable kanji code filter and converter"
3  * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
4  *
5  * LICENSE NOTICES
6  *
7  * This file is part of "streamable kanji code filter and converter",
8  * which is distributed under the terms of GNU Lesser General Public
9  * License (version 2) as published by the Free Software Foundation.
10  *
11  * This software is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with "streamable kanji code filter and converter";
18  * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
19  * Suite 330, Boston, MA  02111-1307  USA
20  *
21  * The author of this file:
22  *
23  */
/*
 * The source code included in this file was separated from mbfilter.c
 * by Moriyoshi Koizumi <moriyoshi@php.net> on 20 Dec 2002. The file
 * mbfilter.c is included in this package.
 *
 */
30 
31 #ifdef HAVE_CONFIG_H
32 #include "config.h"
33 #endif
34 
35 #ifdef HAVE_STDDEF_H
36 #include <stddef.h>
37 #endif
38 
39 #include "mbfl_encoding.h"
40 #include "mbfl_allocators.h"
41 #include "mbfl_filter_output.h"
42 #include "mbfilter_pass.h"
43 #include "mbfilter_8bit.h"
44 #include "mbfilter_wchar.h"
45 
46 #include "filters/mbfilter_euc_cn.h"
47 #include "filters/mbfilter_hz.h"
48 #include "filters/mbfilter_euc_tw.h"
49 #include "filters/mbfilter_big5.h"
50 #include "filters/mbfilter_uhc.h"
51 #include "filters/mbfilter_euc_kr.h"
52 #include "filters/mbfilter_iso2022_kr.h"
53 #include "filters/mbfilter_sjis.h"
54 #include "filters/mbfilter_sjis_open.h"
55 #include "filters/mbfilter_sjis_2004.h"
56 #include "filters/mbfilter_sjis_mobile.h"
57 #include "filters/mbfilter_sjis_mac.h"
58 #include "filters/mbfilter_cp51932.h"
59 #include "filters/mbfilter_jis.h"
60 #include "filters/mbfilter_iso2022_jp_ms.h"
61 #include "filters/mbfilter_iso2022jp_2004.h"
62 #include "filters/mbfilter_iso2022jp_mobile.h"
63 #include "filters/mbfilter_euc_jp.h"
64 #include "filters/mbfilter_euc_jp_2004.h"
65 #include "filters/mbfilter_euc_jp_win.h"
66 #include "filters/mbfilter_gb18030.h"
67 #include "filters/mbfilter_ascii.h"
68 #include "filters/mbfilter_koi8r.h"
69 #include "filters/mbfilter_koi8u.h"
70 #include "filters/mbfilter_cp866.h"
71 #include "filters/mbfilter_cp932.h"
72 #include "filters/mbfilter_cp936.h"
73 #include "filters/mbfilter_cp1251.h"
74 #include "filters/mbfilter_cp1252.h"
75 #include "filters/mbfilter_cp1254.h"
76 #include "filters/mbfilter_cp5022x.h"
77 #include "filters/mbfilter_iso8859_1.h"
78 #include "filters/mbfilter_iso8859_2.h"
79 #include "filters/mbfilter_iso8859_3.h"
80 #include "filters/mbfilter_iso8859_4.h"
81 #include "filters/mbfilter_iso8859_5.h"
82 #include "filters/mbfilter_iso8859_6.h"
83 #include "filters/mbfilter_iso8859_7.h"
84 #include "filters/mbfilter_iso8859_8.h"
85 #include "filters/mbfilter_iso8859_9.h"
86 #include "filters/mbfilter_iso8859_10.h"
87 #include "filters/mbfilter_iso8859_13.h"
88 #include "filters/mbfilter_iso8859_14.h"
89 #include "filters/mbfilter_iso8859_15.h"
90 #include "filters/mbfilter_base64.h"
91 #include "filters/mbfilter_qprint.h"
92 #include "filters/mbfilter_uuencode.h"
93 #include "filters/mbfilter_7bit.h"
94 #include "filters/mbfilter_utf7.h"
95 #include "filters/mbfilter_utf7imap.h"
96 #include "filters/mbfilter_utf8.h"
97 #include "filters/mbfilter_utf8_mobile.h"
98 #include "filters/mbfilter_utf16.h"
99 #include "filters/mbfilter_utf32.h"
100 #include "filters/mbfilter_byte2.h"
101 #include "filters/mbfilter_byte4.h"
102 #include "filters/mbfilter_ucs4.h"
103 #include "filters/mbfilter_ucs2.h"
104 #include "filters/mbfilter_htmlent.h"
105 #include "filters/mbfilter_armscii8.h"
106 #include "filters/mbfilter_cp850.h"
107 
/*
 * Hex character table "0123456789ABCDEF", indexed by nibble value (0..15).
 * The digits are spelled as explicit ASCII codes, so the table holds exactly
 * 16 bytes and has no terminating NUL.  It is only ever read, hence const.
 */
static const char mbfl_hexchar_table[] = {
	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,	/* '0'..'7' */
	0x38, 0x39, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46	/* '8', '9', 'A'..'F' */
};
112 
113 static const struct mbfl_convert_vtbl *mbfl_special_filter_list[] = {
114 	&vtbl_8bit_b64,
115 	&vtbl_b64_8bit,
116 	&vtbl_uuencode_8bit,
117 	&vtbl_8bit_qprint,
118 	&vtbl_qprint_8bit,
119 	&vtbl_8bit_7bit,
120 	&vtbl_7bit_8bit,
121 	&vtbl_pass,
122 	NULL
123 };
124 
125 static int
mbfl_convert_filter_common_init(mbfl_convert_filter * filter,const mbfl_encoding * from,const mbfl_encoding * to,const struct mbfl_convert_vtbl * vtbl,int (* output_function)(int,void *),int (* flush_function)(void *),void * data)126 mbfl_convert_filter_common_init(
127 	mbfl_convert_filter *filter,
128 	const mbfl_encoding *from,
129 	const mbfl_encoding *to,
130 	const struct mbfl_convert_vtbl *vtbl,
131     int (*output_function)(int, void* ),
132     int (*flush_function)(void*),
133     void* data)
134 {
135 	/* encoding structure */
136 	filter->from = from;
137 	filter->to = to;
138 
139 	if (output_function != NULL) {
140 		filter->output_function = output_function;
141 	} else {
142 		filter->output_function = mbfl_filter_output_null;
143 	}
144 
145 	filter->flush_function = flush_function;
146 	filter->data = data;
147 	filter->illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
148 	filter->illegal_substchar = 0x3f;		/* '?' */
149 	filter->num_illegalchar = 0;
150 	filter->filter_ctor = vtbl->filter_ctor;
151 	filter->filter_dtor = vtbl->filter_dtor;
152 	filter->filter_function = vtbl->filter_function;
153 	filter->filter_flush = vtbl->filter_flush;
154 	filter->filter_copy = vtbl->filter_copy;
155 
156 	(*filter->filter_ctor)(filter);
157 
158 	return 0;
159 }
160 
161 
162 mbfl_convert_filter *
mbfl_convert_filter_new(const mbfl_encoding * from,const mbfl_encoding * to,int (* output_function)(int,void *),int (* flush_function)(void *),void * data)163 mbfl_convert_filter_new(
164     const mbfl_encoding *from,
165     const mbfl_encoding *to,
166     int (*output_function)(int, void* ),
167     int (*flush_function)(void*),
168     void* data)
169 {
170 	mbfl_convert_filter * filter;
171 	const struct mbfl_convert_vtbl *vtbl;
172 
173 	vtbl = mbfl_convert_filter_get_vtbl(from, to);
174 	if (vtbl == NULL) {
175 		return NULL;
176 	}
177 
178 	/* allocate */
179 	filter = (mbfl_convert_filter *)mbfl_malloc(sizeof(mbfl_convert_filter));
180 	if (filter == NULL) {
181 		return NULL;
182 	}
183 
184 	if (mbfl_convert_filter_common_init(filter, from, to, vtbl,
185 			output_function, flush_function, data)) {
186 		mbfl_free(filter);
187 		return NULL;
188 	}
189 
190 	return filter;
191 }
192 
193 mbfl_convert_filter *
mbfl_convert_filter_new2(const struct mbfl_convert_vtbl * vtbl,int (* output_function)(int,void *),int (* flush_function)(void *),void * data)194 mbfl_convert_filter_new2(
195 	const struct mbfl_convert_vtbl *vtbl,
196     int (*output_function)(int, void* ),
197     int (*flush_function)(void*),
198     void* data)
199 {
200 	mbfl_convert_filter * filter;
201 	const mbfl_encoding *from_encoding, *to_encoding;
202 
203 	if (vtbl == NULL) {
204 		vtbl = &vtbl_pass;
205 	}
206 
207 	from_encoding = mbfl_no2encoding(vtbl->from);
208 	to_encoding = mbfl_no2encoding(vtbl->to);
209 
210 	/* allocate */
211 	filter = (mbfl_convert_filter *)mbfl_malloc(sizeof(mbfl_convert_filter));
212 	if (filter == NULL) {
213 		return NULL;
214 	}
215 
216 	if (mbfl_convert_filter_common_init(filter, from_encoding, to_encoding, vtbl,
217 			output_function, flush_function, data)) {
218 		mbfl_free(filter);
219 		return NULL;
220 	}
221 
222 	return filter;
223 }
224 
225 void
mbfl_convert_filter_delete(mbfl_convert_filter * filter)226 mbfl_convert_filter_delete(mbfl_convert_filter *filter)
227 {
228 	if (filter) {
229 		(*filter->filter_dtor)(filter);
230 		mbfl_free((void*)filter);
231 	}
232 }
233 
234 int
mbfl_convert_filter_feed(int c,mbfl_convert_filter * filter)235 mbfl_convert_filter_feed(int c, mbfl_convert_filter *filter)
236 {
237 	return (*filter->filter_function)(c, filter);
238 }
239 
240 int
mbfl_convert_filter_feed_string(mbfl_convert_filter * filter,const unsigned char * p,size_t len)241 mbfl_convert_filter_feed_string(mbfl_convert_filter *filter, const unsigned char *p, size_t len) {
242 	while (len > 0) {
243 		if ((*filter->filter_function)(*p++, filter) < 0) {
244 			return -1;
245 		}
246 		len--;
247 	}
248 	return 0;
249 }
250 
251 int
mbfl_convert_filter_flush(mbfl_convert_filter * filter)252 mbfl_convert_filter_flush(mbfl_convert_filter *filter)
253 {
254 	(*filter->filter_flush)(filter);
255 	return (filter->flush_function ? (*filter->flush_function)(filter->data) : 0);
256 }
257 
/*
 * Re-target an existing filter to a new `from` -> `to` conversion.
 *
 * The current filter is destructed first.  The output function, flush
 * function and user data already stored in the filter are kept; everything
 * else is re-initialised.  When no conversion table exists for the encoding
 * pair, vtbl_pass is used instead.
 */
void
mbfl_convert_filter_reset(mbfl_convert_filter *filter,
		const mbfl_encoding *from, const mbfl_encoding *to)
{
	const struct mbfl_convert_vtbl *new_vtbl;

	/* destruct old filter */
	filter->filter_dtor(filter);

	new_vtbl = mbfl_convert_filter_get_vtbl(from, to);
	if (new_vtbl == NULL) {
		new_vtbl = &vtbl_pass;
	}

	mbfl_convert_filter_common_init(filter, from, to, new_vtbl,
			filter->output_function, filter->flush_function, filter->data);
}
275 
276 void
mbfl_convert_filter_copy(mbfl_convert_filter * src,mbfl_convert_filter * dest)277 mbfl_convert_filter_copy(
278     mbfl_convert_filter *src,
279     mbfl_convert_filter *dest)
280 {
281 	if (src->filter_copy != NULL) {
282 		src->filter_copy(src, dest);
283 		return;
284 	}
285 
286 	*dest = *src;
287 }
288 
/*
 * Push the first `src->pos` bytes of the memory device's buffer through the
 * filter.  Returns -1 as soon as the conversion callback returns a negative
 * value, 0 otherwise.
 */
int
mbfl_convert_filter_devcat(mbfl_convert_filter *filter, mbfl_memory_device *src)
{
	unsigned char *cursor = src->buffer;
	size_t remaining;

	for (remaining = src->pos; remaining > 0; remaining--) {
		if (filter->filter_function(*cursor++, filter) < 0) {
			return -1;
		}
	}

	return 0;
}
305 
/*
 * Push a NUL-terminated byte string through the filter; the terminator itself
 * is not fed.  Returns -1 as soon as the conversion callback returns a
 * negative value, 0 otherwise.
 */
int
mbfl_convert_filter_strcat(mbfl_convert_filter *filter, const unsigned char *p)
{
	for (; *p != '\0'; p++) {
		if (filter->filter_function(*p, filter) < 0) {
			return -1;
		}
	}

	return 0;
}
318 
319 /* illegal character output function for conv-filter */
320 int
mbfl_filt_conv_illegal_output(int c,mbfl_convert_filter * filter)321 mbfl_filt_conv_illegal_output(int c, mbfl_convert_filter *filter)
322 {
323 	int mode_backup, substchar_backup, ret, n, m, r;
324 
325 	ret = 0;
326 
327 	mode_backup = filter->illegal_mode;
328 	substchar_backup = filter->illegal_substchar;
329 
330 	/* The used substitution character may not be supported by the target character encoding.
331 	 * If that happens, first try to use "?" instead and if that also fails, silently drop the
332 	 * character. */
333 	if (filter->illegal_mode == MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR
334 			&& filter->illegal_substchar != 0x3f) {
335 		filter->illegal_substchar = 0x3f;
336 	} else {
337 		filter->illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
338 	}
339 
340 	switch (mode_backup) {
341 	case MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR:
342 		ret = (*filter->filter_function)(substchar_backup, filter);
343 		break;
344 	case MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG:
345 		if (c >= 0) {
346 			if (c < MBFL_WCSGROUP_UCS4MAX) {	/* unicode */
347 				ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)"U+");
348 			} else {
349 				if (c < MBFL_WCSGROUP_WCHARMAX) {
350 					m = c & ~MBFL_WCSPLANE_MASK;
351 					switch (m) {
352 					case MBFL_WCSPLANE_JIS0208:
353 						ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)"JIS+");
354 						break;
355 					case MBFL_WCSPLANE_JIS0212:
356 						ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)"JIS2+");
357 						break;
358 					case MBFL_WCSPLANE_JIS0213:
359 						ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)"JIS3+");
360 						break;
361 					case MBFL_WCSPLANE_WINCP932:
362 						ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)"W932+");
363 						break;
364 					case MBFL_WCSPLANE_GB18030:
365 						ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)"GB+");
366 						break;
367 					case MBFL_WCSPLANE_8859_1:
368 						ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)"I8859_1+");
369 						break;
370 					default:
371 						ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)"?+");
372 						break;
373 					}
374 					c &= MBFL_WCSPLANE_MASK;
375 				} else {
376 					ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)"BAD+");
377 					c &= MBFL_WCSGROUP_MASK;
378 				}
379 			}
380 			if (ret >= 0) {
381 				m = 0;
382 				r = 28;
383 				while (r >= 0) {
384 					n = (c >> r) & 0xf;
385 					if (n || m) {
386 						m = 1;
387 						ret = (*filter->filter_function)(mbfl_hexchar_table[n], filter);
388 						if (ret < 0) {
389 							break;
390 						}
391 					}
392 					r -= 4;
393 				}
394 				if (m == 0 && ret >= 0) {
395 					ret = (*filter->filter_function)(mbfl_hexchar_table[0], filter);
396 				}
397 			}
398 		}
399 		break;
400 	case MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY:
401 		if (c >= 0) {
402 			if (c < MBFL_WCSGROUP_UCS4MAX) {	/* unicode */
403 				ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)"&#x");
404 				if (ret < 0)
405 					break;
406 
407 				m = 0;
408 				r = 28;
409 				while (r >= 0) {
410 					n = (c >> r) & 0xf;
411 					if (n || m) {
412 						m = 1;
413 						ret = (*filter->filter_function)(mbfl_hexchar_table[n], filter);
414 						if (ret < 0) {
415 							break;
416 						}
417 					}
418 					r -= 4;
419 				}
420 				if (ret < 0) {
421 					break;
422 				}
423 				if (m == 0) {
424 					ret = (*filter->filter_function)(mbfl_hexchar_table[0], filter);
425 				}
426 				ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)";");
427 			} else {
428 				ret = (*filter->filter_function)(substchar_backup, filter);
429 			}
430 		}
431 		break;
432 	default:
433 		break;
434 	}
435 
436 	filter->illegal_mode = mode_backup;
437 	filter->illegal_substchar = substchar_backup;
438 	filter->num_illegalchar++;
439 
440 	return ret;
441 }
442 
mbfl_convert_filter_get_vtbl(const mbfl_encoding * from,const mbfl_encoding * to)443 const struct mbfl_convert_vtbl * mbfl_convert_filter_get_vtbl(
444 		const mbfl_encoding *from, const mbfl_encoding *to)
445 {
446 	if (to->no_encoding == mbfl_no_encoding_base64 ||
447 	    to->no_encoding == mbfl_no_encoding_qprint ||
448 	    to->no_encoding == mbfl_no_encoding_7bit) {
449 		from = &mbfl_encoding_8bit;
450 	} else if (from->no_encoding == mbfl_no_encoding_base64 ||
451 			   from->no_encoding == mbfl_no_encoding_qprint ||
452 			   from->no_encoding == mbfl_no_encoding_uuencode) {
453 		to = &mbfl_encoding_8bit;
454 	}
455 
456 	if (to == from && (to == &mbfl_encoding_wchar || to == &mbfl_encoding_8bit)) {
457 		return &vtbl_pass;
458 	}
459 
460 	if (to->no_encoding == mbfl_no_encoding_wchar) {
461 		return from->input_filter;
462 	} else if (from->no_encoding == mbfl_no_encoding_wchar) {
463 		return to->output_filter;
464 	} else {
465 		int i = 0;
466 		const struct mbfl_convert_vtbl *vtbl;
467 		while ((vtbl = mbfl_special_filter_list[i++]) != NULL){
468 			if (vtbl->from == from->no_encoding && vtbl->to == to->no_encoding) {
469 				return vtbl;
470 			}
471 		}
472 		return NULL;
473 	}
474 }
475 
476 /*
477  * commonly used constructor and destructor
478  */
mbfl_filt_conv_common_ctor(mbfl_convert_filter * filter)479 void mbfl_filt_conv_common_ctor(mbfl_convert_filter *filter)
480 {
481 	filter->status = 0;
482 	filter->cache = 0;
483 }
484 
/*
 * Flush: clear the filter's status word and cache, then call the output flush
 * function if one is installed.  The flush function's result is not used;
 * this always returns 0.
 */
int
mbfl_filt_conv_common_flush(mbfl_convert_filter *filter)
{
	filter->cache = 0;
	filter->status = 0;

	if (filter->flush_function) {
		filter->flush_function(filter->data);
	}

	return 0;
}
495 
/* Destructor: clear the filter's status word and cache. */
void
mbfl_filt_conv_common_dtor(mbfl_convert_filter *filter)
{
	filter->cache = 0;
	filter->status = 0;
}
501