1 /*
2  * "streamable kanji code filter and converter"
3  * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
4  *
5  * LICENSE NOTICES
6  *
7  * This file is part of "streamable kanji code filter and converter",
8  * which is distributed under the terms of GNU Lesser General Public
9  * License (version 2) as published by the Free Software Foundation.
10  *
11  * This software is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with "streamable kanji code filter and converter";
18  * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
19  * Suite 330, Boston, MA  02111-1307  USA
20  *
21  * The author of this file:
22  *
23  */
/*
 * The source code included in this file was separated from mbfilter.c
 * by Moriyoshi Koizumi <moriyoshi@php.net> on 20 Dec 2002. The file
 * mbfilter.c is included in this package.
 *
 */
30 
31 #include <stddef.h>
32 
33 #include "mbfl_encoding.h"
34 #include "mbfl_filter_output.h"
35 #include "mbfilter_pass.h"
36 #include "mbfilter_8bit.h"
37 #include "mbfilter_wchar.h"
38 
39 #include "filters/mbfilter_euc_cn.h"
40 #include "filters/mbfilter_hz.h"
41 #include "filters/mbfilter_euc_tw.h"
42 #include "filters/mbfilter_big5.h"
43 #include "filters/mbfilter_uhc.h"
44 #include "filters/mbfilter_euc_kr.h"
45 #include "filters/mbfilter_iso2022_kr.h"
46 #include "filters/mbfilter_sjis.h"
47 #include "filters/mbfilter_sjis_2004.h"
48 #include "filters/mbfilter_sjis_mobile.h"
49 #include "filters/mbfilter_sjis_mac.h"
50 #include "filters/mbfilter_cp51932.h"
51 #include "filters/mbfilter_jis.h"
52 #include "filters/mbfilter_iso2022_jp_ms.h"
53 #include "filters/mbfilter_iso2022jp_2004.h"
54 #include "filters/mbfilter_iso2022jp_mobile.h"
55 #include "filters/mbfilter_euc_jp.h"
56 #include "filters/mbfilter_euc_jp_2004.h"
57 #include "filters/mbfilter_euc_jp_win.h"
58 #include "filters/mbfilter_gb18030.h"
59 #include "filters/mbfilter_cp932.h"
60 #include "filters/mbfilter_cp936.h"
61 #include "filters/mbfilter_cp5022x.h"
62 #include "filters/mbfilter_base64.h"
63 #include "filters/mbfilter_qprint.h"
64 #include "filters/mbfilter_uuencode.h"
65 #include "filters/mbfilter_7bit.h"
66 #include "filters/mbfilter_utf7.h"
67 #include "filters/mbfilter_utf7imap.h"
68 #include "filters/mbfilter_utf8.h"
69 #include "filters/mbfilter_utf8_mobile.h"
70 #include "filters/mbfilter_utf16.h"
71 #include "filters/mbfilter_utf32.h"
72 #include "filters/mbfilter_ucs4.h"
73 #include "filters/mbfilter_ucs2.h"
74 #include "filters/mbfilter_htmlent.h"
75 #include "filters/mbfilter_singlebyte.h"
76 
/*
 * Hex digit lookup table: index 0..15 yields the ASCII code of the matching
 * upper-case hexadecimal digit ("0123456789ABCDEF").
 * The table is only ever read, so it is const-qualified.
 */
static const char mbfl_hexchar_table[] = {
	0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x41,0x42,0x43,0x44,0x45,0x46
};
81 
82 static const struct mbfl_convert_vtbl *mbfl_special_filter_list[] = {
83 	&vtbl_8bit_b64,
84 	&vtbl_b64_8bit,
85 	&vtbl_uuencode_8bit,
86 	&vtbl_8bit_qprint,
87 	&vtbl_qprint_8bit,
88 	&vtbl_pass,
89 	NULL
90 };
91 
/*
 * Populate an already-allocated convert filter.
 *
 * Stores the source/target encodings and the caller's callbacks (falling back to
 * mbfl_filter_output_null when no output callback is supplied), resets the
 * illegal-character settings to "substitute with '?'", copies the handlers out of
 * `vtbl`, and finally invokes the constructor provided by `vtbl`.
 */
static void mbfl_convert_filter_init(mbfl_convert_filter *filter, const mbfl_encoding *from, const mbfl_encoding *to,
	const struct mbfl_convert_vtbl *vtbl, output_function_t output_function, flush_function_t flush_function, void* data)
{
	/* Handlers taken from the conversion table */
	filter->filter_dtor = vtbl->filter_dtor;
	filter->filter_function = vtbl->filter_function;
	filter->filter_flush = (filter_flush_t)vtbl->filter_flush;
	filter->filter_copy = vtbl->filter_copy;

	/* Encodings on both ends of the conversion */
	filter->from = from;
	filter->to = to;

	/* Output side; a missing output callback is replaced by the null sink */
	filter->output_function = output_function;
	if (filter->output_function == NULL) {
		filter->output_function = mbfl_filter_output_null;
	}
	filter->flush_function = flush_function;
	filter->data = data;

	/* Default handling of illegal characters */
	filter->illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
	filter->illegal_substchar = '?';
	filter->num_illegalchar = 0;

	/* The constructor runs last, once every field above is in place */
	vtbl->filter_ctor(filter);
}
117 
mbfl_convert_filter_new(const mbfl_encoding * from,const mbfl_encoding * to,output_function_t output_function,flush_function_t flush_function,void * data)118 mbfl_convert_filter* mbfl_convert_filter_new(const mbfl_encoding *from, const mbfl_encoding *to, output_function_t output_function,
119 	flush_function_t flush_function, void* data)
120 {
121 	const struct mbfl_convert_vtbl *vtbl = mbfl_convert_filter_get_vtbl(from, to);
122 	if (vtbl == NULL) {
123 		return NULL;
124 	}
125 
126 	mbfl_convert_filter *filter = emalloc(sizeof(mbfl_convert_filter));
127 	mbfl_convert_filter_init(filter, from, to, vtbl, output_function, flush_function, data);
128 	return filter;
129 }
130 
mbfl_convert_filter_new2(const struct mbfl_convert_vtbl * vtbl,output_function_t output_function,flush_function_t flush_function,void * data)131 mbfl_convert_filter* mbfl_convert_filter_new2(const struct mbfl_convert_vtbl *vtbl, output_function_t output_function,
132 	flush_function_t flush_function, void* data)
133 {
134 	const mbfl_encoding *from_encoding = mbfl_no2encoding(vtbl->from);
135 	const mbfl_encoding *to_encoding = mbfl_no2encoding(vtbl->to);
136 
137 	mbfl_convert_filter *filter = emalloc(sizeof(mbfl_convert_filter));
138 	mbfl_convert_filter_init(filter, from_encoding, to_encoding, vtbl, output_function, flush_function, data);
139 	return filter;
140 }
141 
/*
 * Destroy a filter: run its destructor, if it has one, then release the
 * filter structure itself with efree().
 */
void mbfl_convert_filter_delete(mbfl_convert_filter *filter)
{
	if (filter->filter_dtor != NULL) {
		filter->filter_dtor(filter);
	}

	efree(filter);
}
149 
150 /* Feed a char, return 0 if ok - used by mailparse ext */
mbfl_convert_filter_feed(int c,mbfl_convert_filter * filter)151 int mbfl_convert_filter_feed(int c, mbfl_convert_filter *filter)
152 {
153 	return (*filter->filter_function)(c, filter);
154 }
155 
156 /* Feed string into `filter` byte by byte; return pointer to first byte not processed */
mbfl_convert_filter_feed_string(mbfl_convert_filter * filter,unsigned char * p,size_t len)157 unsigned char* mbfl_convert_filter_feed_string(mbfl_convert_filter *filter, unsigned char *p, size_t len)
158 {
159 	while (len--) {
160 		if ((*filter->filter_function)(*p++, filter) < 0) {
161 			break;
162 		}
163 	}
164 	return p;
165 }
166 
/*
 * Invoke the filter's flush handler. Whatever the handler reports is ignored;
 * this function always returns 0.
 */
int mbfl_convert_filter_flush(mbfl_convert_filter *filter)
{
	(void)filter->filter_flush(filter);
	return 0;
}
172 
/*
 * Re-target an existing filter to convert from `from` to `to`.
 *
 * The current destructor (if any) runs first, then the filter is initialised
 * again with its existing output callback, flush callback and user data.
 * When no conversion table exists for the pair, the pass-through table is used.
 */
void mbfl_convert_filter_reset(mbfl_convert_filter *filter, const mbfl_encoding *from, const mbfl_encoding *to)
{
	if (filter->filter_dtor != NULL) {
		filter->filter_dtor(filter);
	}

	const struct mbfl_convert_vtbl *table = mbfl_convert_filter_get_vtbl(from, to);

	mbfl_convert_filter_init(filter, from, to, table != NULL ? table : &vtbl_pass,
		filter->output_function, filter->flush_function, filter->data);
}
187 
/*
 * Copy the state of `src` into `dest`.
 *
 * A filter which provides its own filter_copy handler is copied by that
 * handler; any other filter is copied by plain structure assignment.
 */
void mbfl_convert_filter_copy(mbfl_convert_filter *src, mbfl_convert_filter *dest)
{
	if (src->filter_copy == NULL) {
		*dest = *src;
	} else {
		src->filter_copy(src, dest);
	}
}
197 
/*
 * Feed the first `src->pos` bytes of the memory device's buffer into `filter`.
 * The position at which feeding stopped is not reported to the caller.
 */
void mbfl_convert_filter_devcat(mbfl_convert_filter *filter, mbfl_memory_device *src)
{
	(void)mbfl_convert_filter_feed_string(filter, src->buffer, src->pos);
}
202 
/*
 * Feed the NUL-terminated byte string `p` into `filter` (the terminator itself
 * is not fed). Returns 0 when every byte was accepted, or -1 as soon as the
 * filter function returns a negative value.
 */
int mbfl_convert_filter_strcat(mbfl_convert_filter *filter, const unsigned char *p)
{
	for (; *p != 0; p++) {
		if (filter->filter_function(*p, filter) < 0) {
			return -1;
		}
	}

	return 0;
}
214 
/*
 * Feed the upper-case hexadecimal digits of the low 32 bits of `w`, without
 * leading zeros, to the filter's own filter_function. A value of zero is
 * written as the single digit "0".
 *
 * Returns the result of the last filter_function call; output stops at the
 * first call which returns a negative value.
 */
static int mbfl_filt_conv_output_hex(unsigned int w, mbfl_convert_filter *filter)
{
	int shift = 28;

	/* Skip leading zero nibbles, but never the lowest one, so that at least
	 * one digit is always emitted */
	while (shift > 0 && ((w >> shift) & 0xF) == 0) {
		shift -= 4;
	}

	int status;
	do {
		status = filter->filter_function(mbfl_hexchar_table[(w >> shift) & 0xF], filter);
		shift -= 4;
	} while (status >= 0 && shift >= 0);

	return status;
}
239 
240 /* illegal character output function for conv-filter */
mbfl_filt_conv_illegal_output(int c,mbfl_convert_filter * filter)241 int mbfl_filt_conv_illegal_output(int c, mbfl_convert_filter *filter)
242 {
243 	unsigned int w = c;
244 	int ret = 0;
245 	int mode_backup = filter->illegal_mode;
246 	int substchar_backup = filter->illegal_substchar;
247 
248 	/* The used substitution character may not be supported by the target character encoding.
249 	 * If that happens, first try to use "?" instead and if that also fails, silently drop the
250 	 * character. */
251 	if (filter->illegal_mode == MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR
252 			&& filter->illegal_substchar != '?') {
253 		filter->illegal_substchar = '?';
254 	} else {
255 		filter->illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
256 	}
257 
258 	switch (mode_backup) {
259 	case MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR:
260 		ret = (*filter->filter_function)(substchar_backup, filter);
261 		break;
262 
263 	case MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG:
264 		if (w != MBFL_BAD_INPUT) {
265 			ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)"U+");
266 			if (ret < 0)
267 				break;
268 			ret = mbfl_filt_conv_output_hex(w, filter);
269 		} else {
270 			ret = (*filter->filter_function)(substchar_backup, filter);
271 		}
272 		break;
273 
274 	case MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY:
275 		if (w != MBFL_BAD_INPUT) {
276 			ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)"&#x");
277 			if (ret < 0)
278 				break;
279 			ret = mbfl_filt_conv_output_hex(w, filter);
280 			if (ret < 0)
281 				break;
282 			ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)";");
283 		} else {
284 			ret = (*filter->filter_function)(substchar_backup, filter);
285 		}
286 		break;
287 
288 	case MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE:
289 	default:
290 		break;
291 	}
292 
293 	filter->illegal_mode = mode_backup;
294 	filter->illegal_substchar = substchar_backup;
295 	filter->num_illegalchar++;
296 
297 	return ret;
298 }
299 
mbfl_convert_filter_get_vtbl(const mbfl_encoding * from,const mbfl_encoding * to)300 const struct mbfl_convert_vtbl* mbfl_convert_filter_get_vtbl(const mbfl_encoding *from, const mbfl_encoding *to)
301 {
302 	if (to->no_encoding == mbfl_no_encoding_base64 ||
303 	    to->no_encoding == mbfl_no_encoding_qprint) {
304 		from = &mbfl_encoding_8bit;
305 	} else if (from->no_encoding == mbfl_no_encoding_base64 ||
306 			   from->no_encoding == mbfl_no_encoding_qprint ||
307 			   from->no_encoding == mbfl_no_encoding_uuencode) {
308 		to = &mbfl_encoding_8bit;
309 	}
310 
311 	if (to == from && (to == &mbfl_encoding_wchar || to == &mbfl_encoding_8bit)) {
312 		return &vtbl_pass;
313 	}
314 
315 	if (to->no_encoding == mbfl_no_encoding_wchar) {
316 		return from->input_filter;
317 	} else if (from->no_encoding == mbfl_no_encoding_wchar) {
318 		return to->output_filter;
319 	} else {
320 		int i = 0;
321 		const struct mbfl_convert_vtbl *vtbl;
322 		while ((vtbl = mbfl_special_filter_list[i++])) {
323 			if (vtbl->from == from->no_encoding && vtbl->to == to->no_encoding) {
324 				return vtbl;
325 			}
326 		}
327 		return NULL;
328 	}
329 }
330 
331 /*
332  * commonly used constructor
333  */
mbfl_filt_conv_common_ctor(mbfl_convert_filter * filter)334 void mbfl_filt_conv_common_ctor(mbfl_convert_filter *filter)
335 {
336 	filter->status = filter->cache = 0;
337 }
338 
/*
 * Commonly used flush handler: calls the filter's flush_function with the
 * filter's user data when one is set. Always returns 0.
 */
int mbfl_filt_conv_common_flush(mbfl_convert_filter *filter)
{
	if (filter->flush_function == NULL) {
		return 0;
	}

	filter->flush_function(filter->data);
	return 0;
}
346 
mb_fast_convert(unsigned char * in,size_t in_len,const mbfl_encoding * from,const mbfl_encoding * to,uint32_t replacement_char,unsigned int error_mode,unsigned int * num_errors)347 zend_string* mb_fast_convert(unsigned char *in, size_t in_len, const mbfl_encoding *from, const mbfl_encoding *to, uint32_t replacement_char, unsigned int error_mode, unsigned int *num_errors)
348 {
349 	uint32_t wchar_buf[128];
350 	unsigned int state = 0;
351 
352 	if (to == &mbfl_encoding_base64 || to == &mbfl_encoding_qprint) {
353 		from = &mbfl_encoding_8bit;
354 	} else if (from == &mbfl_encoding_base64 || from == &mbfl_encoding_qprint || from == &mbfl_encoding_uuencode) {
355 		to = &mbfl_encoding_8bit;
356 	}
357 
358 	mb_convert_buf buf;
359 	mb_convert_buf_init(&buf, in_len, replacement_char, error_mode);
360 
361 	while (in_len) {
362 		size_t out_len = from->to_wchar(&in, &in_len, wchar_buf, 128, &state);
363 		ZEND_ASSERT(out_len <= 128);
364 		to->from_wchar(wchar_buf, out_len, &buf, !in_len);
365 	}
366 
367 	*num_errors = buf.errors;
368 	return mb_convert_buf_result(&buf);
369 }
370 
convert_cp_to_hex(uint32_t cp,uint32_t * out)371 static uint32_t* convert_cp_to_hex(uint32_t cp, uint32_t *out)
372 {
373 	bool nonzero = false;
374 	int shift = 28;
375 
376 	while (shift >= 0) {
377 		int n = (cp >> shift) & 0xF;
378 		if (n || nonzero) {
379 			nonzero = true;
380 			*out++ = mbfl_hexchar_table[n];
381 		}
382 		shift -= 4;
383 	}
384 
385 	if (!nonzero) {
386 		/* No hex digits were output by above loop */
387 		*out++ = '0';
388 	}
389 
390 	return out;
391 }
392 
mb_illegal_marker(uint32_t bad_cp,uint32_t * out,unsigned int err_mode,uint32_t replacement_char)393 static size_t mb_illegal_marker(uint32_t bad_cp, uint32_t *out, unsigned int err_mode, uint32_t replacement_char)
394 {
395 	uint32_t *start = out;
396 
397 	if (bad_cp == MBFL_BAD_INPUT && err_mode != MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
398 		*out++ = replacement_char;
399 	} else {
400 		switch (err_mode) {
401 		case MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR:
402 			*out++ = replacement_char;
403 			break;
404 
405 		case MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG:
406 			out[0] = 'U';
407 			out[1] = '+';
408 			out = convert_cp_to_hex(bad_cp, &out[2]);
409 			break;
410 
411 		case MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY:
412 			out[0] = '&'; out[1] = '#'; out[2] = 'x';
413 			out = convert_cp_to_hex(bad_cp, &out[3]);
414 			*out++ = ';';
415 			break;
416 		}
417 	}
418 
419 	return out - start;
420 }
421 
/*
 * Record an error for `bad_cp` in `buf` and pass the matching error marker
 * (see mb_illegal_marker) to `fn` for output.
 *
 * While `fn` runs, the buffer's error settings are temporarily weakened; they
 * are restored before returning.
 */
void mb_illegal_output(uint32_t bad_cp, mb_from_wchar_fn fn, mb_convert_buf* buf)
{
	/* The longest marker is "&#x" + 8 hex digits + ";" = 12 elements */
	uint32_t marker[12];
	const uint32_t saved_repl_char = buf->replacement_char;
	const unsigned int saved_err_mode = buf->error_mode;

	buf->errors++;

	size_t marker_len = mb_illegal_marker(bad_cp, marker, saved_err_mode, saved_repl_char);

	/* Avoid infinite loop if `fn` is not able to handle the replacement
	 * character: retry with '?' once, and otherwise output nothing at all for
	 * errors raised while the marker is being written */
	if (saved_err_mode != MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR || saved_repl_char == '?') {
		buf->error_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
	} else {
		buf->replacement_char = '?';
	}

	fn(marker, marker_len, buf, false);

	buf->replacement_char = saved_repl_char;
	buf->error_mode = saved_err_mode;
}
444