1 /*
2 * "streamable kanji code filter and converter"
3 * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
4 *
5 * LICENSE NOTICES
6 *
7 * This file is part of "streamable kanji code filter and converter",
8 * which is distributed under the terms of GNU Lesser General Public
9 * License (version 2) as published by the Free Software Foundation.
10 *
11 * This software is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with "streamable kanji code filter and converter";
18 * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
19 * Suite 330, Boston, MA 02111-1307 USA
20 *
21 * The author of this file:
22 *
23 */
24 /*
 * The source code included in this file was separated from mbfilter.c
 * by Moriyoshi Koizumi <moriyoshi@php.net> on 20 Dec 2002. The file
 * mbfilter.c is included in this package.
28 *
29 */
30
31 #include <stddef.h>
32
33 #include "mbfl_encoding.h"
34 #include "mbfl_filter_output.h"
35 #include "mbfilter_pass.h"
36 #include "mbfilter_8bit.h"
37 #include "mbfilter_wchar.h"
38
39 #include "filters/mbfilter_base64.h"
40 #include "filters/mbfilter_cjk.h"
41 #include "filters/mbfilter_qprint.h"
42 #include "filters/mbfilter_uuencode.h"
43 #include "filters/mbfilter_7bit.h"
44 #include "filters/mbfilter_utf7.h"
45 #include "filters/mbfilter_utf7imap.h"
46 #include "filters/mbfilter_utf8.h"
47 #include "filters/mbfilter_utf16.h"
48 #include "filters/mbfilter_utf32.h"
49 #include "filters/mbfilter_ucs4.h"
50 #include "filters/mbfilter_ucs2.h"
51 #include "filters/mbfilter_htmlent.h"
52 #include "filters/mbfilter_singlebyte.h"
53
/* Upper-case hexadecimal digits "0123456789ABCDEF", indexed by nibble value (0-15).
 * The entries are spelled as numeric character codes, exactly as before.
 * The table is only ever read, so it is declared const. */
static const char mbfl_hexchar_table[] = {
	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
	0x38, 0x39, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46
};
58
59 static const struct mbfl_convert_vtbl *mbfl_special_filter_list[] = {
60 &vtbl_8bit_b64,
61 &vtbl_b64_8bit,
62 &vtbl_uuencode_8bit,
63 &vtbl_8bit_qprint,
64 &vtbl_qprint_8bit,
65 &vtbl_pass,
66 NULL
67 };
68
/* Fill in every field of an already allocated `filter` and run the constructor
 * supplied by `vtbl`.
 *
 * If `output_function` is NULL, mbfl_filter_output_null is installed in its place.
 * Illegal characters are initially replaced by '?' (MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR)
 * and the illegal character counter starts at zero. */
static void mbfl_convert_filter_init(mbfl_convert_filter *filter, const mbfl_encoding *from, const mbfl_encoding *to,
	const struct mbfl_convert_vtbl *vtbl, output_function_t output_function, flush_function_t flush_function, void* data)
{
	/* Fall back to the null output function when the caller did not provide one */
	output_function_t sink = output_function;
	if (sink == NULL) {
		sink = mbfl_filter_output_null;
	}

	/* Source and destination encodings */
	filter->from = from;
	filter->to = to;

	/* Where converted output goes */
	filter->output_function = sink;
	filter->flush_function = flush_function;
	filter->data = data;

	/* Default handling of characters which cannot be converted */
	filter->illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
	filter->illegal_substchar = '?';
	filter->num_illegalchar = 0;

	/* Hooks taken from the conversion table */
	filter->filter_function = vtbl->filter_function;
	filter->filter_flush = (filter_flush_t)vtbl->filter_flush;
	filter->filter_copy = vtbl->filter_copy;
	filter->filter_dtor = vtbl->filter_dtor;

	/* The constructor runs last, after all the fields above are set */
	vtbl->filter_ctor(filter);
}
94
mbfl_convert_filter_new(const mbfl_encoding * from,const mbfl_encoding * to,output_function_t output_function,flush_function_t flush_function,void * data)95 mbfl_convert_filter* mbfl_convert_filter_new(const mbfl_encoding *from, const mbfl_encoding *to, output_function_t output_function,
96 flush_function_t flush_function, void* data)
97 {
98 const struct mbfl_convert_vtbl *vtbl = mbfl_convert_filter_get_vtbl(from, to);
99 if (vtbl == NULL) {
100 return NULL;
101 }
102
103 mbfl_convert_filter *filter = emalloc(sizeof(mbfl_convert_filter));
104 mbfl_convert_filter_init(filter, from, to, vtbl, output_function, flush_function, data);
105 return filter;
106 }
107
mbfl_convert_filter_new2(const struct mbfl_convert_vtbl * vtbl,output_function_t output_function,flush_function_t flush_function,void * data)108 mbfl_convert_filter* mbfl_convert_filter_new2(const struct mbfl_convert_vtbl *vtbl, output_function_t output_function,
109 flush_function_t flush_function, void* data)
110 {
111 const mbfl_encoding *from_encoding = mbfl_no2encoding(vtbl->from);
112 const mbfl_encoding *to_encoding = mbfl_no2encoding(vtbl->to);
113
114 mbfl_convert_filter *filter = emalloc(sizeof(mbfl_convert_filter));
115 mbfl_convert_filter_init(filter, from_encoding, to_encoding, vtbl, output_function, flush_function, data);
116 return filter;
117 }
118
/* Run the filter's destructor hook (if it has one), then free the filter itself */
void mbfl_convert_filter_delete(mbfl_convert_filter *filter)
{
	if (filter->filter_dtor != NULL) {
		filter->filter_dtor(filter);
	}

	efree(filter);
}
126
127 /* Feed a char, return 0 if ok - used by mailparse ext */
mbfl_convert_filter_feed(int c,mbfl_convert_filter * filter)128 int mbfl_convert_filter_feed(int c, mbfl_convert_filter *filter)
129 {
130 return (*filter->filter_function)(c, filter);
131 }
132
133 /* Feed string into `filter` byte by byte; return pointer to first byte not processed */
mbfl_convert_filter_feed_string(mbfl_convert_filter * filter,unsigned char * p,size_t len)134 unsigned char* mbfl_convert_filter_feed_string(mbfl_convert_filter *filter, unsigned char *p, size_t len)
135 {
136 while (len--) {
137 if ((*filter->filter_function)(*p++, filter) < 0) {
138 break;
139 }
140 }
141 return p;
142 }
143
/* Invoke the filter's flush hook. Always returns 0; nothing reported by the
 * hook itself is passed back to the caller. */
int mbfl_convert_filter_flush(mbfl_convert_filter *filter)
{
	(void) filter->filter_flush(filter);

	return 0;
}
149
/* Re-purpose an existing filter for a new `from` -> `to` conversion.
 *
 * The current destructor hook (if any) is run first. The filter is then
 * initialized again, keeping its existing output function, flush function and
 * data pointer. If no conversion table exists for the new encoding pair,
 * the pass-through table (vtbl_pass) is used.
 *
 * Note that re-initialization also puts the illegal character mode,
 * substitution character and illegal character count back to their defaults. */
void mbfl_convert_filter_reset(mbfl_convert_filter *filter, const mbfl_encoding *from, const mbfl_encoding *to)
{
	if (filter->filter_dtor != NULL) {
		filter->filter_dtor(filter);
	}

	const struct mbfl_convert_vtbl *vtbl = mbfl_convert_filter_get_vtbl(from, to);

	mbfl_convert_filter_init(filter, from, to, (vtbl != NULL) ? vtbl : &vtbl_pass,
		filter->output_function, filter->flush_function, filter->data);
}
164
/* Copy the state of `src` into `dest`.
 *
 * A filter which provides its own copy hook is copied by that hook; all other
 * filters are copied by plain structure assignment. */
void mbfl_convert_filter_copy(mbfl_convert_filter *src, mbfl_convert_filter *dest)
{
	if (src->filter_copy == NULL) {
		*dest = *src;
	} else {
		src->filter_copy(src, dest);
	}
}
174
/* Feed the first `src->pos` bytes of the memory device's buffer into `filter`.
 * The pointer returned by mbfl_convert_filter_feed_string is not needed here. */
void mbfl_convert_filter_devcat(mbfl_convert_filter *filter, mbfl_memory_device *src)
{
	(void) mbfl_convert_filter_feed_string(filter, src->buffer, src->pos);
}
179
/* Feed the NUL-terminated string `p` into `filter`, one byte at a time.
 * The terminating NUL itself is not fed.
 *
 * Returns 0 when the whole string was accepted, or -1 as soon as the filter
 * function returns a negative value. */
int mbfl_convert_filter_strcat(mbfl_convert_filter *filter, const unsigned char *p)
{
	for (; *p != '\0'; p++) {
		if (filter->filter_function(*p, filter) < 0) {
			return -1;
		}
	}

	return 0;
}
191
/* Feed `w` into `filter` as upper-case hexadecimal digits without leading zeroes.
 * A value of zero is output as the single digit '0'.
 *
 * Returns the (negative) result of the first filter call which failed, or
 * otherwise the result of the last filter call which was made. */
static int mbfl_filt_conv_output_hex(unsigned int w, mbfl_convert_filter *filter)
{
	bool emitted = false;
	int ret = 0;

	/* Walk over the low 32 bits, 4 bits at a time, most significant nibble first */
	for (int shift = 28; shift >= 0; shift -= 4) {
		int digit = (w >> shift) & 0xF;

		if (digit == 0 && !emitted) {
			/* Still skipping leading zeroes */
			continue;
		}

		emitted = true;
		ret = filter->filter_function(mbfl_hexchar_table[digit], filter);
		if (ret < 0) {
			return ret;
		}
	}

	if (emitted) {
		return ret;
	}

	/* Every nibble was zero, so nothing has been output yet */
	return filter->filter_function('0', filter);
}
216
217 /* illegal character output function for conv-filter */
mbfl_filt_conv_illegal_output(int c,mbfl_convert_filter * filter)218 int mbfl_filt_conv_illegal_output(int c, mbfl_convert_filter *filter)
219 {
220 unsigned int w = c;
221 int ret = 0;
222 int mode_backup = filter->illegal_mode;
223 uint32_t substchar_backup = filter->illegal_substchar;
224
225 /* The used substitution character may not be supported by the target character encoding.
226 * If that happens, first try to use "?" instead and if that also fails, silently drop the
227 * character. */
228 if (filter->illegal_mode == MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR
229 && filter->illegal_substchar != '?') {
230 filter->illegal_substchar = '?';
231 } else {
232 filter->illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
233 }
234
235 switch (mode_backup) {
236 case MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR:
237 ret = (*filter->filter_function)(substchar_backup, filter);
238 break;
239
240 case MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG:
241 if (w != MBFL_BAD_INPUT) {
242 ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)"U+");
243 if (ret < 0)
244 break;
245 ret = mbfl_filt_conv_output_hex(w, filter);
246 } else {
247 ret = (*filter->filter_function)(substchar_backup, filter);
248 }
249 break;
250
251 case MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY:
252 if (w != MBFL_BAD_INPUT) {
253 ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)"&#x");
254 if (ret < 0)
255 break;
256 ret = mbfl_filt_conv_output_hex(w, filter);
257 if (ret < 0)
258 break;
259 ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)";");
260 } else {
261 ret = (*filter->filter_function)(substchar_backup, filter);
262 }
263 break;
264
265 case MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE:
266 default:
267 break;
268 }
269
270 filter->illegal_mode = mode_backup;
271 filter->illegal_substchar = substchar_backup;
272 filter->num_illegalchar++;
273
274 return ret;
275 }
276
mbfl_convert_filter_get_vtbl(const mbfl_encoding * from,const mbfl_encoding * to)277 const struct mbfl_convert_vtbl* mbfl_convert_filter_get_vtbl(const mbfl_encoding *from, const mbfl_encoding *to)
278 {
279 if (to->no_encoding == mbfl_no_encoding_base64 ||
280 to->no_encoding == mbfl_no_encoding_qprint) {
281 from = &mbfl_encoding_8bit;
282 } else if (from->no_encoding == mbfl_no_encoding_base64 ||
283 from->no_encoding == mbfl_no_encoding_qprint ||
284 from->no_encoding == mbfl_no_encoding_uuencode) {
285 to = &mbfl_encoding_8bit;
286 }
287
288 if (to == from && (to == &mbfl_encoding_wchar || to == &mbfl_encoding_8bit)) {
289 return &vtbl_pass;
290 }
291
292 if (to->no_encoding == mbfl_no_encoding_wchar) {
293 return from->input_filter;
294 } else if (from->no_encoding == mbfl_no_encoding_wchar) {
295 return to->output_filter;
296 } else {
297 int i = 0;
298 const struct mbfl_convert_vtbl *vtbl;
299 while ((vtbl = mbfl_special_filter_list[i++])) {
300 if (vtbl->from == from->no_encoding && vtbl->to == to->no_encoding) {
301 return vtbl;
302 }
303 }
304 return NULL;
305 }
306 }
307
308 /*
309 * commonly used constructor
310 */
mbfl_filt_conv_common_ctor(mbfl_convert_filter * filter)311 void mbfl_filt_conv_common_ctor(mbfl_convert_filter *filter)
312 {
313 filter->status = filter->cache = 0;
314 }
315
/* Commonly used flush hook: calls the filter's flush function (when one is set)
 * with the filter's data pointer. Always returns 0. */
int mbfl_filt_conv_common_flush(mbfl_convert_filter *filter)
{
	if (filter->flush_function != NULL) {
		filter->flush_function(filter->data);
	}

	return 0;
}
323
mb_fast_convert(unsigned char * in,size_t in_len,const mbfl_encoding * from,const mbfl_encoding * to,uint32_t replacement_char,unsigned int error_mode,unsigned int * num_errors)324 zend_string* mb_fast_convert(unsigned char *in, size_t in_len, const mbfl_encoding *from, const mbfl_encoding *to, uint32_t replacement_char, unsigned int error_mode, unsigned int *num_errors)
325 {
326 uint32_t wchar_buf[128];
327 unsigned int state = 0;
328
329 if (to == &mbfl_encoding_base64 || to == &mbfl_encoding_qprint) {
330 from = &mbfl_encoding_8bit;
331 } else if (from == &mbfl_encoding_base64 || from == &mbfl_encoding_qprint || from == &mbfl_encoding_uuencode) {
332 to = &mbfl_encoding_8bit;
333 }
334
335 mb_convert_buf buf;
336 mb_convert_buf_init(&buf, in_len, replacement_char, error_mode);
337
338 while (in_len) {
339 size_t out_len = from->to_wchar(&in, &in_len, wchar_buf, 128, &state);
340 ZEND_ASSERT(out_len <= 128);
341 to->from_wchar(wchar_buf, out_len, &buf, !in_len);
342 }
343
344 *num_errors = buf.errors;
345 return mb_convert_buf_result(&buf, to);
346 }
347
convert_cp_to_hex(uint32_t cp,uint32_t * out)348 static uint32_t* convert_cp_to_hex(uint32_t cp, uint32_t *out)
349 {
350 bool nonzero = false;
351 int shift = 28;
352
353 while (shift >= 0) {
354 int n = (cp >> shift) & 0xF;
355 if (n || nonzero) {
356 nonzero = true;
357 *out++ = mbfl_hexchar_table[n];
358 }
359 shift -= 4;
360 }
361
362 if (!nonzero) {
363 /* No hex digits were output by above loop */
364 *out++ = '0';
365 }
366
367 return out;
368 }
369
mb_illegal_marker(uint32_t bad_cp,uint32_t * out,unsigned int err_mode,uint32_t replacement_char)370 static size_t mb_illegal_marker(uint32_t bad_cp, uint32_t *out, unsigned int err_mode, uint32_t replacement_char)
371 {
372 uint32_t *start = out;
373
374 if (bad_cp == MBFL_BAD_INPUT) {
375 /* Input string contained a byte sequence which was invalid in the 'from' encoding
376 * Unless the error handling mode is set to NONE, insert the replacement character */
377 if (err_mode != MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
378 *out++ = replacement_char;
379 }
380 } else {
381 /* Input string contained a byte sequence which was valid in the 'from' encoding,
382 * but decoded to a Unicode codepoint which cannot be represented in the 'to' encoding */
383 switch (err_mode) {
384 case MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR:
385 *out++ = replacement_char;
386 break;
387
388 case MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG:
389 out[0] = 'U';
390 out[1] = '+';
391 out = convert_cp_to_hex(bad_cp, &out[2]);
392 break;
393
394 case MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY:
395 out[0] = '&'; out[1] = '#'; out[2] = 'x';
396 out = convert_cp_to_hex(bad_cp, &out[3]);
397 *out++ = ';';
398 break;
399 }
400 }
401
402 return out - start;
403 }
404
/* Record a conversion error in `buf` and output an error marker for `bad_cp`.
 *
 * The error count of `buf` is always incremented. In
 * MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, a single 0xFF byte is appended to the
 * output buffer directly. In every other mode, the marker built by
 * mb_illegal_marker is converted by calling `fn`. */
void mb_illegal_output(uint32_t bad_cp, mb_from_wchar_fn fn, mb_convert_buf* buf)
{
	buf->errors++;

	const unsigned int saved_mode = buf->error_mode;
	const uint32_t saved_repl_char = buf->replacement_char;

	if (saved_mode == MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8) {
		/* Internal use only: this mode is used when a string is converted to UTF-8
		 * before it is searched. The error marker is a byte which is illegal in
		 * UTF-8, so that it can never match valid text by accident (as a marker
		 * like '?' could). */
		MB_CONVERT_BUF_ENSURE(buf, buf->out, buf->limit, 1);
		buf->out = mb_convert_buf_add(buf->out, 0xFF);
		return;
	}

	/* 12 elements are enough for the longest marker: "&#x", 8 hex digits and ";" */
	uint32_t marker[12];
	size_t marker_len = mb_illegal_marker(bad_cp, marker, saved_mode, saved_repl_char);

	/* `fn` may not be able to convert the marker either, and would then call back
	 * into this function. To avoid an infinite loop: if a custom replacement
	 * character is in use, fall back to '?' while the marker is converted;
	 * in all other cases, have errors produce no marker at all. */
	if (saved_mode != MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR || saved_repl_char == '?') {
		buf->error_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
	} else {
		buf->replacement_char = '?';
	}

	fn(marker, marker_len, buf, false);

	/* Put the caller's error handling settings back */
	buf->error_mode = saved_mode;
	buf->replacement_char = saved_repl_char;
}
438