1 /*
2  * "streamable kanji code filter and converter"
3  * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
4  *
5  * LICENSE NOTICES
6  *
7  * This file is part of "streamable kanji code filter and converter",
8  * which is distributed under the terms of GNU Lesser General Public
9  * License (version 2) as published by the Free Software Foundation.
10  *
11  * This software is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with "streamable kanji code filter and converter";
18  * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
19  * Suite 330, Boston, MA  02111-1307  USA
20  *
21  * The author of this file:
22  *
23  */
/*
 * The source code included in this file was separated from mbfilter.c
 * by Moriyoshi Koizumi <moriyoshi@php.net> on 20 Dec 2002. The file
 * mbfilter.c is included in this package.
 *
 */
30 
31 #include <stddef.h>
32 
33 #include "mbfl_encoding.h"
34 #include "mbfl_filter_output.h"
35 #include "mbfilter_pass.h"
36 #include "mbfilter_8bit.h"
37 #include "mbfilter_wchar.h"
38 
39 #include "filters/mbfilter_base64.h"
40 #include "filters/mbfilter_cjk.h"
41 #include "filters/mbfilter_qprint.h"
42 #include "filters/mbfilter_uuencode.h"
43 #include "filters/mbfilter_7bit.h"
44 #include "filters/mbfilter_utf7.h"
45 #include "filters/mbfilter_utf7imap.h"
46 #include "filters/mbfilter_utf8.h"
47 #include "filters/mbfilter_utf16.h"
48 #include "filters/mbfilter_utf32.h"
49 #include "filters/mbfilter_ucs4.h"
50 #include "filters/mbfilter_ucs2.h"
51 #include "filters/mbfilter_htmlent.h"
52 #include "filters/mbfilter_singlebyte.h"
53 
/*
 * Hex digit lookup table: entry n (0..15) holds the ASCII code of the
 * uppercase hexadecimal digit for n, i.e. "0123456789ABCDEF".
 * The table is only ever read, so it is const-qualified.
 */
static const char mbfl_hexchar_table[] = {
	0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x41,0x42,0x43,0x44,0x45,0x46
};
58 
/*
 * Conversion tables consulted by mbfl_convert_filter_get_vtbl() when neither
 * side of the requested conversion is wchar. The list is scanned in order and
 * the first entry whose from/to encoding numbers match is returned; the
 * trailing NULL terminates the scan.
 */
static const struct mbfl_convert_vtbl *mbfl_special_filter_list[] = {
	&vtbl_8bit_b64,
	&vtbl_b64_8bit,
	&vtbl_uuencode_8bit,
	&vtbl_8bit_qprint,
	&vtbl_qprint_8bit,
	&vtbl_pass,
	NULL
};
68 
/*
 * Populate an already-allocated conversion filter and run the constructor
 * supplied by `vtbl`.
 *
 * filter           filter structure to fill in
 * from, to         source and destination encodings, stored as given
 * vtbl             table providing the ctor/dtor/function/flush/copy callbacks
 * output_function  receives converted output; NULL selects mbfl_filter_output_null
 * flush_function   stored as given (may be NULL)
 * data             opaque pointer stored as given
 *
 * Illegal-character handling starts out in CHAR mode with '?' as the
 * substitution character and a zeroed illegal-character counter.
 */
static void mbfl_convert_filter_init(mbfl_convert_filter *filter, const mbfl_encoding *from, const mbfl_encoding *to,
	const struct mbfl_convert_vtbl *vtbl, output_function_t output_function, flush_function_t flush_function, void* data)
{
	/* Fall back to the null sink when the caller supplied no output callback */
	if (output_function == NULL) {
		output_function = mbfl_filter_output_null;
	}

	/* Callbacks taken from the conversion table */
	filter->filter_function = vtbl->filter_function;
	filter->filter_flush = (filter_flush_t)vtbl->filter_flush;
	filter->filter_copy = vtbl->filter_copy;
	filter->filter_dtor = vtbl->filter_dtor;

	/* Encodings on both ends of the conversion */
	filter->from = from;
	filter->to = to;

	/* Output side */
	filter->output_function = output_function;
	filter->flush_function = flush_function;
	filter->data = data;

	/* Default policy for characters which cannot be converted */
	filter->illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
	filter->illegal_substchar = '?';
	filter->num_illegalchar = 0;

	/* The constructor runs last, once every field above is in place */
	vtbl->filter_ctor(filter);
}
94 
mbfl_convert_filter_new(const mbfl_encoding * from,const mbfl_encoding * to,output_function_t output_function,flush_function_t flush_function,void * data)95 mbfl_convert_filter* mbfl_convert_filter_new(const mbfl_encoding *from, const mbfl_encoding *to, output_function_t output_function,
96 	flush_function_t flush_function, void* data)
97 {
98 	const struct mbfl_convert_vtbl *vtbl = mbfl_convert_filter_get_vtbl(from, to);
99 	if (vtbl == NULL) {
100 		return NULL;
101 	}
102 
103 	mbfl_convert_filter *filter = emalloc(sizeof(mbfl_convert_filter));
104 	mbfl_convert_filter_init(filter, from, to, vtbl, output_function, flush_function, data);
105 	return filter;
106 }
107 
mbfl_convert_filter_new2(const struct mbfl_convert_vtbl * vtbl,output_function_t output_function,flush_function_t flush_function,void * data)108 mbfl_convert_filter* mbfl_convert_filter_new2(const struct mbfl_convert_vtbl *vtbl, output_function_t output_function,
109 	flush_function_t flush_function, void* data)
110 {
111 	const mbfl_encoding *from_encoding = mbfl_no2encoding(vtbl->from);
112 	const mbfl_encoding *to_encoding = mbfl_no2encoding(vtbl->to);
113 
114 	mbfl_convert_filter *filter = emalloc(sizeof(mbfl_convert_filter));
115 	mbfl_convert_filter_init(filter, from_encoding, to_encoding, vtbl, output_function, flush_function, data);
116 	return filter;
117 }
118 
/*
 * Destroy a conversion filter: run its destructor callback, if it has one,
 * and then release the filter structure itself with efree().
 */
void mbfl_convert_filter_delete(mbfl_convert_filter *filter)
{
	if (filter->filter_dtor != NULL) {
		filter->filter_dtor(filter);
	}

	efree(filter);
}
126 
127 /* Feed a char, return 0 if ok - used by mailparse ext */
mbfl_convert_filter_feed(int c,mbfl_convert_filter * filter)128 int mbfl_convert_filter_feed(int c, mbfl_convert_filter *filter)
129 {
130 	return (*filter->filter_function)(c, filter);
131 }
132 
133 /* Feed string into `filter` byte by byte; return pointer to first byte not processed */
mbfl_convert_filter_feed_string(mbfl_convert_filter * filter,unsigned char * p,size_t len)134 unsigned char* mbfl_convert_filter_feed_string(mbfl_convert_filter *filter, unsigned char *p, size_t len)
135 {
136 	while (len--) {
137 		if ((*filter->filter_function)(*p++, filter) < 0) {
138 			break;
139 		}
140 	}
141 	return p;
142 }
143 
/*
 * Invoke the filter's flush callback. The callback's own result is not
 * propagated: this function always returns 0.
 */
int mbfl_convert_filter_flush(mbfl_convert_filter *filter)
{
	filter->filter_flush(filter);

	return 0;
}
149 
/*
 * Re-target an existing filter to convert `from` -> `to`.
 *
 * The current destructor (if any) is run first, then the filter is
 * initialized again while keeping its output function, flush function and
 * data pointer. When no conversion table exists for the pair, the
 * pass-through table (vtbl_pass) is used instead.
 */
void mbfl_convert_filter_reset(mbfl_convert_filter *filter, const mbfl_encoding *from, const mbfl_encoding *to)
{
	if (filter->filter_dtor != NULL) {
		filter->filter_dtor(filter);
	}

	const struct mbfl_convert_vtbl *table = mbfl_convert_filter_get_vtbl(from, to);

	mbfl_convert_filter_init(filter, from, to, table != NULL ? table : &vtbl_pass,
		filter->output_function, filter->flush_function, filter->data);
}
164 
/*
 * Copy the state of `src` into `dest`. A filter which provides its own copy
 * callback performs the copy itself; otherwise the whole structure is
 * copied by plain assignment.
 */
void mbfl_convert_filter_copy(mbfl_convert_filter *src, mbfl_convert_filter *dest)
{
	if (src->filter_copy == NULL) {
		*dest = *src;
	} else {
		src->filter_copy(src, dest);
	}
}
174 
/*
 * Feed the first `src->pos` bytes of the memory device's buffer into
 * `filter`. The pointer returned by the feeding routine is discarded.
 */
void mbfl_convert_filter_devcat(mbfl_convert_filter *filter, mbfl_memory_device *src)
{
	unsigned char *bytes = src->buffer;
	size_t nbytes = src->pos;

	mbfl_convert_filter_feed_string(filter, bytes, nbytes);
}
179 
/*
 * Feed the NUL-terminated byte string `p` into `filter`, byte by byte; the
 * terminating NUL itself is not fed.
 *
 * Returns 0 when every byte was accepted, or -1 as soon as the filter's
 * conversion callback returns a negative value.
 */
int mbfl_convert_filter_strcat(mbfl_convert_filter *filter, const unsigned char *p)
{
	for (int ch = *p; ch != 0; ch = *++p) {
		if (filter->filter_function(ch, filter) < 0) {
			return -1;
		}
	}

	return 0;
}
191 
/*
 * Feed the uppercase hexadecimal representation of `w` into `filter`, most
 * significant digit first and without leading zeros. A value of zero is
 * written as the single digit "0".
 *
 * Returns the result of the last call to the filter's conversion callback;
 * output stops at the first call which returns a negative value.
 */
static int mbfl_filt_conv_output_hex(unsigned int w, mbfl_convert_filter *filter)
{
	int shift = 28;
	int status;

	/* Skip leading zero nibbles, but always keep the lowest one */
	while (shift > 0 && ((w >> shift) & 0xF) == 0) {
		shift -= 4;
	}

	/* Emit every remaining nibble, from high to low */
	do {
		status = filter->filter_function(mbfl_hexchar_table[(w >> shift) & 0xF], filter);
		shift -= 4;
	} while (status >= 0 && shift >= 0);

	return status;
}
216 
217 /* illegal character output function for conv-filter */
mbfl_filt_conv_illegal_output(int c,mbfl_convert_filter * filter)218 int mbfl_filt_conv_illegal_output(int c, mbfl_convert_filter *filter)
219 {
220 	unsigned int w = c;
221 	int ret = 0;
222 	int mode_backup = filter->illegal_mode;
223 	uint32_t substchar_backup = filter->illegal_substchar;
224 
225 	/* The used substitution character may not be supported by the target character encoding.
226 	 * If that happens, first try to use "?" instead and if that also fails, silently drop the
227 	 * character. */
228 	if (filter->illegal_mode == MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR
229 			&& filter->illegal_substchar != '?') {
230 		filter->illegal_substchar = '?';
231 	} else {
232 		filter->illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
233 	}
234 
235 	switch (mode_backup) {
236 	case MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR:
237 		ret = (*filter->filter_function)(substchar_backup, filter);
238 		break;
239 
240 	case MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG:
241 		if (w != MBFL_BAD_INPUT) {
242 			ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)"U+");
243 			if (ret < 0)
244 				break;
245 			ret = mbfl_filt_conv_output_hex(w, filter);
246 		} else {
247 			ret = (*filter->filter_function)(substchar_backup, filter);
248 		}
249 		break;
250 
251 	case MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY:
252 		if (w != MBFL_BAD_INPUT) {
253 			ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)"&#x");
254 			if (ret < 0)
255 				break;
256 			ret = mbfl_filt_conv_output_hex(w, filter);
257 			if (ret < 0)
258 				break;
259 			ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)";");
260 		} else {
261 			ret = (*filter->filter_function)(substchar_backup, filter);
262 		}
263 		break;
264 
265 	case MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE:
266 	default:
267 		break;
268 	}
269 
270 	filter->illegal_mode = mode_backup;
271 	filter->illegal_substchar = substchar_backup;
272 	filter->num_illegalchar++;
273 
274 	return ret;
275 }
276 
mbfl_convert_filter_get_vtbl(const mbfl_encoding * from,const mbfl_encoding * to)277 const struct mbfl_convert_vtbl* mbfl_convert_filter_get_vtbl(const mbfl_encoding *from, const mbfl_encoding *to)
278 {
279 	if (to->no_encoding == mbfl_no_encoding_base64 ||
280 	    to->no_encoding == mbfl_no_encoding_qprint) {
281 		from = &mbfl_encoding_8bit;
282 	} else if (from->no_encoding == mbfl_no_encoding_base64 ||
283 			   from->no_encoding == mbfl_no_encoding_qprint ||
284 			   from->no_encoding == mbfl_no_encoding_uuencode) {
285 		to = &mbfl_encoding_8bit;
286 	}
287 
288 	if (to == from && (to == &mbfl_encoding_wchar || to == &mbfl_encoding_8bit)) {
289 		return &vtbl_pass;
290 	}
291 
292 	if (to->no_encoding == mbfl_no_encoding_wchar) {
293 		return from->input_filter;
294 	} else if (from->no_encoding == mbfl_no_encoding_wchar) {
295 		return to->output_filter;
296 	} else {
297 		int i = 0;
298 		const struct mbfl_convert_vtbl *vtbl;
299 		while ((vtbl = mbfl_special_filter_list[i++])) {
300 			if (vtbl->from == from->no_encoding && vtbl->to == to->no_encoding) {
301 				return vtbl;
302 			}
303 		}
304 		return NULL;
305 	}
306 }
307 
308 /*
309  * commonly used constructor
310  */
mbfl_filt_conv_common_ctor(mbfl_convert_filter * filter)311 void mbfl_filt_conv_common_ctor(mbfl_convert_filter *filter)
312 {
313 	filter->status = filter->cache = 0;
314 }
315 
/*
 * Commonly used flush routine: calls the filter's flush function with the
 * filter's data pointer, when a flush function is set. Always returns 0.
 */
int mbfl_filt_conv_common_flush(mbfl_convert_filter *filter)
{
	if (filter->flush_function != NULL) {
		filter->flush_function(filter->data);
	}

	return 0;
}
323 
mb_fast_convert(unsigned char * in,size_t in_len,const mbfl_encoding * from,const mbfl_encoding * to,uint32_t replacement_char,unsigned int error_mode,unsigned int * num_errors)324 zend_string* mb_fast_convert(unsigned char *in, size_t in_len, const mbfl_encoding *from, const mbfl_encoding *to, uint32_t replacement_char, unsigned int error_mode, unsigned int *num_errors)
325 {
326 	uint32_t wchar_buf[128];
327 	unsigned int state = 0;
328 
329 	if (to == &mbfl_encoding_base64 || to == &mbfl_encoding_qprint) {
330 		from = &mbfl_encoding_8bit;
331 	} else if (from == &mbfl_encoding_base64 || from == &mbfl_encoding_qprint || from == &mbfl_encoding_uuencode) {
332 		to = &mbfl_encoding_8bit;
333 	}
334 
335 	mb_convert_buf buf;
336 	mb_convert_buf_init(&buf, in_len, replacement_char, error_mode);
337 
338 	while (in_len) {
339 		size_t out_len = from->to_wchar(&in, &in_len, wchar_buf, 128, &state);
340 		ZEND_ASSERT(out_len <= 128);
341 		to->from_wchar(wchar_buf, out_len, &buf, !in_len);
342 	}
343 
344 	*num_errors = buf.errors;
345 	return mb_convert_buf_result(&buf, to);
346 }
347 
convert_cp_to_hex(uint32_t cp,uint32_t * out)348 static uint32_t* convert_cp_to_hex(uint32_t cp, uint32_t *out)
349 {
350 	bool nonzero = false;
351 	int shift = 28;
352 
353 	while (shift >= 0) {
354 		int n = (cp >> shift) & 0xF;
355 		if (n || nonzero) {
356 			nonzero = true;
357 			*out++ = mbfl_hexchar_table[n];
358 		}
359 		shift -= 4;
360 	}
361 
362 	if (!nonzero) {
363 		/* No hex digits were output by above loop */
364 		*out++ = '0';
365 	}
366 
367 	return out;
368 }
369 
mb_illegal_marker(uint32_t bad_cp,uint32_t * out,unsigned int err_mode,uint32_t replacement_char)370 static size_t mb_illegal_marker(uint32_t bad_cp, uint32_t *out, unsigned int err_mode, uint32_t replacement_char)
371 {
372 	uint32_t *start = out;
373 
374 	if (bad_cp == MBFL_BAD_INPUT) {
375 		/* Input string contained a byte sequence which was invalid in the 'from' encoding
376 		 * Unless the error handling mode is set to NONE, insert the replacement character */
377 		if (err_mode != MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
378 			*out++ = replacement_char;
379 		}
380 	} else {
381 		/* Input string contained a byte sequence which was valid in the 'from' encoding,
382 		 * but decoded to a Unicode codepoint which cannot be represented in the 'to' encoding */
383 		switch (err_mode) {
384 		case MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR:
385 			*out++ = replacement_char;
386 			break;
387 
388 		case MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG:
389 			out[0] = 'U';
390 			out[1] = '+';
391 			out = convert_cp_to_hex(bad_cp, &out[2]);
392 			break;
393 
394 		case MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY:
395 			out[0] = '&'; out[1] = '#'; out[2] = 'x';
396 			out = convert_cp_to_hex(bad_cp, &out[3]);
397 			*out++ = ';';
398 			break;
399 		}
400 	}
401 
402 	return out - start;
403 }
404 
/*
 * Record a conversion error in `buf` and write the matching error marker.
 *
 * bad_cp  the offending codepoint, or MBFL_BAD_INPUT
 * fn      encoder through which the marker is written
 * buf     conversion buffer; its error counter is incremented on every call
 *
 * In BADUTF8 mode a single 0xFF byte is appended directly to the buffer and
 * `fn` is not called. In every other mode the marker built by
 * mb_illegal_marker() is passed to `fn`.
 */
void mb_illegal_output(uint32_t bad_cp, mb_from_wchar_fn fn, mb_convert_buf* buf)
{
	const uint32_t saved_repl = buf->replacement_char;
	const unsigned int saved_mode = buf->error_mode;

	buf->errors++;

	if (saved_mode == MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8) {
		/* This mode is for internal use only, when converting a string to
		 * UTF-8 before searching it; it uses a byte which is illegal in
		 * UTF-8 as an error marker. This ensures that error markers will
		 * never 'accidentally' match valid text, as could happen when a
		 * character like '?' is used as an error marker. */
		MB_CONVERT_BUF_ENSURE(buf, buf->out, buf->limit, 1);
		buf->out = mb_convert_buf_add(buf->out, 0xFF);
		return;
	}

	/* 12 elements hold the longest marker: "&#x" + 8 hex digits + ";" */
	uint32_t marker[12];
	size_t marker_len = mb_illegal_marker(bad_cp, marker, saved_mode, saved_repl);

	/* Avoid infinite loop if `fn` is not able to handle the replacement
	 * character: a custom replacement falls back to '?', and anything else
	 * falls back to writing no marker at all */
	if (saved_mode != MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR || saved_repl == '?') {
		buf->error_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
	} else {
		buf->replacement_char = '?';
	}

	fn(marker, marker_len, buf, false);

	/* Restore the caller's settings */
	buf->error_mode = saved_mode;
	buf->replacement_char = saved_repl;
}
438