1 /*
2 * "streamable kanji code filter and converter"
3 * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
4 *
5 * LICENSE NOTICES
6 *
7 * This file is part of "streamable kanji code filter and converter",
8 * which is distributed under the terms of GNU Lesser General Public
9 * License (version 2) as published by the Free Software Foundation.
10 *
11 * This software is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with "streamable kanji code filter and converter";
18 * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
19 * Suite 330, Boston, MA 02111-1307 USA
20 *
21 * The author of this file:
22 *
23 */
24 /*
25 * The source code included in this file was separated from mbfilter.c
26 * by Moriyoshi Koizumi <moriyoshi@php.net> on 20 Dec 2002. The file
27 * mbfilter.c is included in this package.
28 *
29 */
30
31 #include <stddef.h>
32
33 #include "mbfl_encoding.h"
34 #include "mbfl_filter_output.h"
35 #include "mbfilter_pass.h"
36 #include "mbfilter_8bit.h"
37 #include "mbfilter_wchar.h"
38
39 #include "filters/mbfilter_euc_cn.h"
40 #include "filters/mbfilter_hz.h"
41 #include "filters/mbfilter_euc_tw.h"
42 #include "filters/mbfilter_big5.h"
43 #include "filters/mbfilter_uhc.h"
44 #include "filters/mbfilter_euc_kr.h"
45 #include "filters/mbfilter_iso2022_kr.h"
46 #include "filters/mbfilter_sjis.h"
47 #include "filters/mbfilter_sjis_2004.h"
48 #include "filters/mbfilter_sjis_mobile.h"
49 #include "filters/mbfilter_sjis_mac.h"
50 #include "filters/mbfilter_cp51932.h"
51 #include "filters/mbfilter_jis.h"
52 #include "filters/mbfilter_iso2022_jp_ms.h"
53 #include "filters/mbfilter_iso2022jp_2004.h"
54 #include "filters/mbfilter_iso2022jp_mobile.h"
55 #include "filters/mbfilter_euc_jp.h"
56 #include "filters/mbfilter_euc_jp_2004.h"
57 #include "filters/mbfilter_euc_jp_win.h"
58 #include "filters/mbfilter_gb18030.h"
59 #include "filters/mbfilter_cp932.h"
60 #include "filters/mbfilter_cp936.h"
61 #include "filters/mbfilter_cp5022x.h"
62 #include "filters/mbfilter_base64.h"
63 #include "filters/mbfilter_qprint.h"
64 #include "filters/mbfilter_uuencode.h"
65 #include "filters/mbfilter_7bit.h"
66 #include "filters/mbfilter_utf7.h"
67 #include "filters/mbfilter_utf7imap.h"
68 #include "filters/mbfilter_utf8.h"
69 #include "filters/mbfilter_utf8_mobile.h"
70 #include "filters/mbfilter_utf16.h"
71 #include "filters/mbfilter_utf32.h"
72 #include "filters/mbfilter_ucs4.h"
73 #include "filters/mbfilter_ucs2.h"
74 #include "filters/mbfilter_htmlent.h"
75 #include "filters/mbfilter_singlebyte.h"
76
/* Hex digit lookup table: index 0..15 maps to the character codes of
 * "0123456789ABCDEF". Read-only, so it is declared const. */
static const char mbfl_hexchar_table[16] = {
	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* '0'..'7' */
	0x38, 0x39, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46  /* '8','9','A'..'F' */
};
81
82 static const struct mbfl_convert_vtbl *mbfl_special_filter_list[] = {
83 &vtbl_8bit_b64,
84 &vtbl_b64_8bit,
85 &vtbl_uuencode_8bit,
86 &vtbl_8bit_qprint,
87 &vtbl_qprint_8bit,
88 &vtbl_pass,
89 NULL
90 };
91
/* Fill in every field of `filter` from the given encodings, vtable and
 * callbacks, then run the vtable's constructor on it.
 * A NULL `output_function` is replaced by mbfl_filter_output_null. */
static void mbfl_convert_filter_init(mbfl_convert_filter *filter, const mbfl_encoding *from, const mbfl_encoding *to,
	const struct mbfl_convert_vtbl *vtbl, output_function_t output_function, flush_function_t flush_function, void* data)
{
	/* Source and destination encodings */
	filter->from = from;
	filter->to = to;

	/* Caller-supplied callbacks and their opaque argument */
	if (output_function == NULL) {
		filter->output_function = mbfl_filter_output_null;
	} else {
		filter->output_function = output_function;
	}
	filter->flush_function = flush_function;
	filter->data = data;

	/* Default handling of unconvertible input: substitute '?' */
	filter->illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
	filter->illegal_substchar = '?';
	filter->num_illegalchar = 0;

	/* Operations taken from the conversion vtable */
	filter->filter_function = vtbl->filter_function;
	filter->filter_flush = (filter_flush_t)vtbl->filter_flush;
	filter->filter_copy = vtbl->filter_copy;
	filter->filter_dtor = vtbl->filter_dtor;

	/* The constructor runs last, on a fully populated filter */
	vtbl->filter_ctor(filter);
}
117
mbfl_convert_filter_new(const mbfl_encoding * from,const mbfl_encoding * to,output_function_t output_function,flush_function_t flush_function,void * data)118 mbfl_convert_filter* mbfl_convert_filter_new(const mbfl_encoding *from, const mbfl_encoding *to, output_function_t output_function,
119 flush_function_t flush_function, void* data)
120 {
121 const struct mbfl_convert_vtbl *vtbl = mbfl_convert_filter_get_vtbl(from, to);
122 if (vtbl == NULL) {
123 return NULL;
124 }
125
126 mbfl_convert_filter *filter = emalloc(sizeof(mbfl_convert_filter));
127 mbfl_convert_filter_init(filter, from, to, vtbl, output_function, flush_function, data);
128 return filter;
129 }
130
mbfl_convert_filter_new2(const struct mbfl_convert_vtbl * vtbl,output_function_t output_function,flush_function_t flush_function,void * data)131 mbfl_convert_filter* mbfl_convert_filter_new2(const struct mbfl_convert_vtbl *vtbl, output_function_t output_function,
132 flush_function_t flush_function, void* data)
133 {
134 const mbfl_encoding *from_encoding = mbfl_no2encoding(vtbl->from);
135 const mbfl_encoding *to_encoding = mbfl_no2encoding(vtbl->to);
136
137 mbfl_convert_filter *filter = emalloc(sizeof(mbfl_convert_filter));
138 mbfl_convert_filter_init(filter, from_encoding, to_encoding, vtbl, output_function, flush_function, data);
139 return filter;
140 }
141
/* Run the filter's destructor (if it has one) and free the filter itself. */
void mbfl_convert_filter_delete(mbfl_convert_filter *filter)
{
	if (filter->filter_dtor != NULL) {
		filter->filter_dtor(filter);
	}

	efree(filter);
}
149
150 /* Feed a char, return 0 if ok - used by mailparse ext */
mbfl_convert_filter_feed(int c,mbfl_convert_filter * filter)151 int mbfl_convert_filter_feed(int c, mbfl_convert_filter *filter)
152 {
153 return (*filter->filter_function)(c, filter);
154 }
155
156 /* Feed string into `filter` byte by byte; return pointer to first byte not processed */
mbfl_convert_filter_feed_string(mbfl_convert_filter * filter,unsigned char * p,size_t len)157 unsigned char* mbfl_convert_filter_feed_string(mbfl_convert_filter *filter, unsigned char *p, size_t len)
158 {
159 while (len--) {
160 if ((*filter->filter_function)(*p++, filter) < 0) {
161 break;
162 }
163 }
164 return p;
165 }
166
/* Invoke the filter's flush operation. The flush result is discarded;
 * this function always returns 0. */
int mbfl_convert_filter_flush(mbfl_convert_filter *filter)
{
	filter->filter_flush(filter);

	return 0;
}
172
/* Tear down the current state of `filter` and re-initialize it in place for a
 * conversion from `from` to `to`, keeping its existing output function, flush
 * function and data pointer. If no conversion vtable exists for the new pair
 * of encodings, the pass-through vtable is used. */
void mbfl_convert_filter_reset(mbfl_convert_filter *filter, const mbfl_encoding *from, const mbfl_encoding *to)
{
	/* Let the old conversion clean up before its state is overwritten */
	if (filter->filter_dtor != NULL) {
		filter->filter_dtor(filter);
	}

	const struct mbfl_convert_vtbl *new_vtbl = mbfl_convert_filter_get_vtbl(from, to);

	mbfl_convert_filter_init(filter, from, to, (new_vtbl != NULL) ? new_vtbl : &vtbl_pass,
		filter->output_function, filter->flush_function, filter->data);
}
187
/* Copy the state of `src` into `dest`. A filter which provides its own copy
 * operation is copied with it; otherwise a plain struct assignment is used. */
void mbfl_convert_filter_copy(mbfl_convert_filter *src, mbfl_convert_filter *dest)
{
	if (src->filter_copy == NULL) {
		*dest = *src;
	} else {
		src->filter_copy(src, dest);
	}
}
197
/* Feed the first `src->pos` bytes of the memory device's buffer into `filter`. */
void mbfl_convert_filter_devcat(mbfl_convert_filter *filter, mbfl_memory_device *src)
{
	(void) mbfl_convert_filter_feed_string(filter, src->buffer, src->pos);
}
202
/* Feed the NUL-terminated byte string `p` into `filter` (the terminator itself
 * is not fed). Returns -1 as soon as the filter function returns a negative
 * value, or 0 once the whole string has been fed. */
int mbfl_convert_filter_strcat(mbfl_convert_filter *filter, const unsigned char *p)
{
	for (; *p != 0; p++) {
		int byte = *p;
		if (filter->filter_function(byte, filter) < 0) {
			return -1;
		}
	}

	return 0;
}
214
/* Feed the low 32 bits of `w` into `filter` as uppercase hexadecimal digits
 * without leading zeroes (a value of zero is written as a single '0').
 * Stops and returns the filter function's result as soon as it is negative;
 * otherwise returns the result of the last digit fed. */
static int mbfl_filt_conv_output_hex(unsigned int w, mbfl_convert_filter *filter)
{
	int shift = 28;

	/* Skip over leading zero nibbles */
	while (shift >= 0 && ((w >> shift) & 0xF) == 0) {
		shift -= 4;
	}

	if (shift < 0) {
		/* Every nibble was zero, so no digit would be output otherwise */
		return filter->filter_function('0', filter);
	}

	int result = 0;
	for (; shift >= 0; shift -= 4) {
		int nibble = (w >> shift) & 0xF;
		result = filter->filter_function(mbfl_hexchar_table[nibble], filter);
		if (result < 0) {
			break;
		}
	}

	return result;
}
239
240 /* illegal character output function for conv-filter */
mbfl_filt_conv_illegal_output(int c,mbfl_convert_filter * filter)241 int mbfl_filt_conv_illegal_output(int c, mbfl_convert_filter *filter)
242 {
243 unsigned int w = c;
244 int ret = 0;
245 int mode_backup = filter->illegal_mode;
246 int substchar_backup = filter->illegal_substchar;
247
248 /* The used substitution character may not be supported by the target character encoding.
249 * If that happens, first try to use "?" instead and if that also fails, silently drop the
250 * character. */
251 if (filter->illegal_mode == MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR
252 && filter->illegal_substchar != '?') {
253 filter->illegal_substchar = '?';
254 } else {
255 filter->illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
256 }
257
258 switch (mode_backup) {
259 case MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR:
260 ret = (*filter->filter_function)(substchar_backup, filter);
261 break;
262
263 case MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG:
264 if (w != MBFL_BAD_INPUT) {
265 ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)"U+");
266 if (ret < 0)
267 break;
268 ret = mbfl_filt_conv_output_hex(w, filter);
269 } else {
270 ret = (*filter->filter_function)(substchar_backup, filter);
271 }
272 break;
273
274 case MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY:
275 if (w != MBFL_BAD_INPUT) {
276 ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)"&#x");
277 if (ret < 0)
278 break;
279 ret = mbfl_filt_conv_output_hex(w, filter);
280 if (ret < 0)
281 break;
282 ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)";");
283 } else {
284 ret = (*filter->filter_function)(substchar_backup, filter);
285 }
286 break;
287
288 case MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE:
289 default:
290 break;
291 }
292
293 filter->illegal_mode = mode_backup;
294 filter->illegal_substchar = substchar_backup;
295 filter->num_illegalchar++;
296
297 return ret;
298 }
299
mbfl_convert_filter_get_vtbl(const mbfl_encoding * from,const mbfl_encoding * to)300 const struct mbfl_convert_vtbl* mbfl_convert_filter_get_vtbl(const mbfl_encoding *from, const mbfl_encoding *to)
301 {
302 if (to->no_encoding == mbfl_no_encoding_base64 ||
303 to->no_encoding == mbfl_no_encoding_qprint) {
304 from = &mbfl_encoding_8bit;
305 } else if (from->no_encoding == mbfl_no_encoding_base64 ||
306 from->no_encoding == mbfl_no_encoding_qprint ||
307 from->no_encoding == mbfl_no_encoding_uuencode) {
308 to = &mbfl_encoding_8bit;
309 }
310
311 if (to == from && (to == &mbfl_encoding_wchar || to == &mbfl_encoding_8bit)) {
312 return &vtbl_pass;
313 }
314
315 if (to->no_encoding == mbfl_no_encoding_wchar) {
316 return from->input_filter;
317 } else if (from->no_encoding == mbfl_no_encoding_wchar) {
318 return to->output_filter;
319 } else {
320 int i = 0;
321 const struct mbfl_convert_vtbl *vtbl;
322 while ((vtbl = mbfl_special_filter_list[i++])) {
323 if (vtbl->from == from->no_encoding && vtbl->to == to->no_encoding) {
324 return vtbl;
325 }
326 }
327 return NULL;
328 }
329 }
330
331 /*
332 * commonly used constructor
333 */
mbfl_filt_conv_common_ctor(mbfl_convert_filter * filter)334 void mbfl_filt_conv_common_ctor(mbfl_convert_filter *filter)
335 {
336 filter->status = filter->cache = 0;
337 }
338
/* Commonly used flush operation: calls the filter's flush function (if one is
 * set) with the filter's data pointer. Always returns 0. */
int mbfl_filt_conv_common_flush(mbfl_convert_filter *filter)
{
	if (filter->flush_function != NULL) {
		filter->flush_function(filter->data);
	}

	return 0;
}
346
mb_fast_convert(unsigned char * in,size_t in_len,const mbfl_encoding * from,const mbfl_encoding * to,uint32_t replacement_char,unsigned int error_mode,unsigned int * num_errors)347 zend_string* mb_fast_convert(unsigned char *in, size_t in_len, const mbfl_encoding *from, const mbfl_encoding *to, uint32_t replacement_char, unsigned int error_mode, unsigned int *num_errors)
348 {
349 uint32_t wchar_buf[128];
350 unsigned int state = 0;
351
352 if (to == &mbfl_encoding_base64 || to == &mbfl_encoding_qprint) {
353 from = &mbfl_encoding_8bit;
354 } else if (from == &mbfl_encoding_base64 || from == &mbfl_encoding_qprint || from == &mbfl_encoding_uuencode) {
355 to = &mbfl_encoding_8bit;
356 }
357
358 mb_convert_buf buf;
359 mb_convert_buf_init(&buf, in_len, replacement_char, error_mode);
360
361 while (in_len) {
362 size_t out_len = from->to_wchar(&in, &in_len, wchar_buf, 128, &state);
363 ZEND_ASSERT(out_len <= 128);
364 to->from_wchar(wchar_buf, out_len, &buf, !in_len);
365 }
366
367 *num_errors = buf.errors;
368 return mb_convert_buf_result(&buf);
369 }
370
convert_cp_to_hex(uint32_t cp,uint32_t * out)371 static uint32_t* convert_cp_to_hex(uint32_t cp, uint32_t *out)
372 {
373 bool nonzero = false;
374 int shift = 28;
375
376 while (shift >= 0) {
377 int n = (cp >> shift) & 0xF;
378 if (n || nonzero) {
379 nonzero = true;
380 *out++ = mbfl_hexchar_table[n];
381 }
382 shift -= 4;
383 }
384
385 if (!nonzero) {
386 /* No hex digits were output by above loop */
387 *out++ = '0';
388 }
389
390 return out;
391 }
392
mb_illegal_marker(uint32_t bad_cp,uint32_t * out,unsigned int err_mode,uint32_t replacement_char)393 static size_t mb_illegal_marker(uint32_t bad_cp, uint32_t *out, unsigned int err_mode, uint32_t replacement_char)
394 {
395 uint32_t *start = out;
396
397 if (bad_cp == MBFL_BAD_INPUT && err_mode != MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
398 *out++ = replacement_char;
399 } else {
400 switch (err_mode) {
401 case MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR:
402 *out++ = replacement_char;
403 break;
404
405 case MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG:
406 out[0] = 'U';
407 out[1] = '+';
408 out = convert_cp_to_hex(bad_cp, &out[2]);
409 break;
410
411 case MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY:
412 out[0] = '&'; out[1] = '#'; out[2] = 'x';
413 out = convert_cp_to_hex(bad_cp, &out[3]);
414 *out++ = ';';
415 break;
416 }
417 }
418
419 return out - start;
420 }
421
/* Record a conversion error in `buf` and output the error marker for `bad_cp`
 * by passing it through the output conversion function `fn`.
 * The replacement character and error mode of `buf` are changed only for the
 * duration of the call to `fn` and are restored afterwards. */
void mb_illegal_output(uint32_t bad_cp, mb_from_wchar_fn fn, mb_convert_buf* buf)
{
	buf->errors++;

	/* 12 elements fit the longest marker: "&#x" + 8 hex digits + ";" */
	uint32_t marker[12];
	const uint32_t saved_repl_char = buf->replacement_char;
	const unsigned int saved_err_mode = buf->error_mode;

	size_t marker_len = mb_illegal_marker(bad_cp, marker, saved_err_mode, saved_repl_char);

	/* Avoid infinite loop if `fn` is not able to handle the replacement char:
	 * fall back to '?' first, and if '?' is already in use (or the marker is
	 * not a single replacement char), disable error output entirely */
	if (saved_err_mode != MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR || saved_repl_char == '?') {
		buf->error_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
	} else {
		buf->replacement_char = '?';
	}

	fn(marker, marker_len, buf, false);

	buf->replacement_char = saved_repl_char;
	buf->error_mode = saved_err_mode;
}
444