xref: /PHP-8.1/ext/mbstring/libmbfl/mbfl/mbfilter.c (revision b721d0f7)
1 /*
2  * charset=UTF-8
3  */
4 
5 /*
6  * "streamable kanji code filter and converter"
7  *
8  * Copyright (c) 1998,1999,2000,2001 HappySize, Inc. All rights reserved.
9  *
10  * This software is released under the GNU Lesser General Public License.
11  * (Version 2.1, February 1999)
12  * Please read the following detail of the licence (in japanese).
13  *
14  * ◆使用許諾条件◆
15  *
16  * このソフトウェアは株式会社ハッピーサイズによって開発されました。株式会社ハッ
17  * ピーサイズは、著作権法および万国著作権条約の定めにより、このソフトウェアに関
18  * するすべての権利を留保する権利を持ち、ここに行使します。株式会社ハッピーサイ
19  * ズは以下に明記した条件に従って、このソフトウェアを使用する排他的ではない権利
20  * をお客様に許諾します。何人たりとも、以下の条件に反してこのソフトウェアを使用
21  * することはできません。
22  *
23  * このソフトウェアを「GNU Lesser General Public License (Version 2.1, February
24  * 1999)」に示された条件で使用することを、全ての方に許諾します。「GNU Lesser
25  * General Public License」を満たさない使用には、株式会社ハッピーサイズから書面
26  * による許諾を得る必要があります。
27  *
28  * 「GNU Lesser General Public License」の全文は以下のウェブページから取得でき
29  * ます。「GNU Lesser General Public License」とは、これまでLibrary General
30  * Public Licenseと呼ばれていたものです。
31  *     http://www.gnu.org/ --- GNUウェブサイト
32  *     http://www.gnu.org/copyleft/lesser.html --- ライセンス文面
33  * このライセンスの内容がわからない方、守れない方には使用を許諾しません。
34  *
35  * しかしながら、当社とGNUプロジェクトとの特定の関係を示唆または主張するもので
36  * はありません。
37  *
38  * ◆保証内容◆
39  *
40  * このソフトウェアは、期待された動作・機能・性能を持つことを目標として設計され
41  * 開発されていますが、これを保証するものではありません。このソフトウェアは「こ
42  * のまま」の状態で提供されており、たとえばこのソフトウェアの有用性ないし特定の
43  * 目的に合致することといった、何らかの保証内容が、明示されたり暗黙に示されてい
44  * る場合であっても、その保証は無効です。このソフトウェアを使用した結果ないし使
45  * 用しなかった結果によって、直接あるいは間接に受けた身体的な傷害、財産上の損害
46  * 、データの損失あるいはその他の全ての損害については、その損害の可能性が使用者
47  * 、当社あるいは第三者によって警告されていた場合であっても、当社はその損害の賠
48  * 償および補填を行いません。この規定は他の全ての、書面上または書面に無い保証・
49  * 契約・規定に優先します。
50  *
51  * ◆著作権者の連絡先および使用条件についての問い合わせ先◆
52  *
53  * 〒102-0073
54  * 東京都千代田区九段北1-13-5日本地所第一ビル4F
55  * 株式会社ハッピーサイズ
56  * Phone: 03-3512-3655, Fax: 03-3512-3656
57  * Email: sales@happysize.co.jp
58  * Web: http://happysize.com/
59  *
60  * ◆著者◆
61  *
62  * 金本 茂 <sgk@happysize.co.jp>
63  *
64  * ◆履歴◆
65  *
66  * 1998/11/10 sgk implementation in C++
67  * 1999/4/25  sgk Cで書きなおし。
68  * 1999/4/26  sgk 入力フィルタを実装。漢字コードを推定しながらフィルタを追加。
69  * 1999/6/??      Unicodeサポート。
70  * 1999/6/22  sgk ライセンスをLGPLに変更。
71  *
72  */
73 
74 /*
75  * Unicode support
76  *
77  * Portions copyright (c) 1999,2000,2001 by the PHP3 internationalization team.
78  * All rights reserved.
79  *
80  */
81 
82 #include <stddef.h>
83 #include <string.h>
84 
85 #include "mbfilter.h"
86 #include "mbfl_filter_output.h"
87 #include "mbfilter_8bit.h"
88 #include "mbfilter_wchar.h"
89 #include "mbstring.h"
90 #include "php_unicode.h"
91 #include "filters/mbfilter_base64.h"
92 #include "filters/mbfilter_qprint.h"
93 #include "filters/mbfilter_singlebyte.h"
94 #include "filters/mbfilter_tl_jisx0201_jisx0208.h"
95 #include "filters/mbfilter_utf8.h"
96 
97 #include "eaw_table.h"
98 #include "rare_cp_bitvec.h"
99 
/* Uppercase hexadecimal digits "0123456789ABCDEF", indexed by nibble value (0..15).
 * Not NUL-terminated: the array holds exactly 16 elements. */
static char mbfl_hexchar_table[] = {
	'0', '1', '2', '3', '4', '5', '6', '7',
	'8', '9', 'A', 'B', 'C', 'D', 'E', 'F'
};
104 
105 
106 
107 /*
108  * encoding filter
109  */
/* Evaluate `statement` once; if its value is negative, make the enclosing
 * function return -1. Usable wherever a single statement is expected. */
#define CK(statement)	do { if ((statement) < 0) { return -1; } } while (0)
111 
112 
113 /*
114  *  buffering converter
115  */
116 mbfl_buffer_converter *
mbfl_buffer_converter_new(const mbfl_encoding * from,const mbfl_encoding * to,size_t buf_initsz)117 mbfl_buffer_converter_new(
118 	const mbfl_encoding *from,
119 	const mbfl_encoding *to,
120     size_t buf_initsz)
121 {
122 	mbfl_buffer_converter *convd = emalloc(sizeof(mbfl_buffer_converter));
123 	convd->to = to;
124 
125 	/* create convert filter */
126 	convd->filter1 = NULL;
127 	convd->filter2 = NULL;
128 	if (mbfl_convert_filter_get_vtbl(from, to) != NULL) {
129 		convd->filter1 = mbfl_convert_filter_new(from, to, mbfl_memory_device_output, NULL, &convd->device);
130 	} else {
131 		convd->filter2 = mbfl_convert_filter_new(&mbfl_encoding_wchar, to, mbfl_memory_device_output, NULL, &convd->device);
132 		if (convd->filter2 != NULL) {
133 			convd->filter1 = mbfl_convert_filter_new(from,
134 					&mbfl_encoding_wchar,
135 					(output_function_t)convd->filter2->filter_function,
136 					(flush_function_t)convd->filter2->filter_flush,
137 					convd->filter2);
138 			if (convd->filter1 == NULL) {
139 				mbfl_convert_filter_delete(convd->filter2);
140 			}
141 		}
142 	}
143 	if (convd->filter1 == NULL) {
144 		efree(convd);
145 		return NULL;
146 	}
147 
148 	mbfl_memory_device_init(&convd->device, buf_initsz, buf_initsz/4);
149 
150 	return convd;
151 }
152 
153 
154 void
mbfl_buffer_converter_delete(mbfl_buffer_converter * convd)155 mbfl_buffer_converter_delete(mbfl_buffer_converter *convd)
156 {
157 	if (convd != NULL) {
158 		if (convd->filter1) {
159 			mbfl_convert_filter_delete(convd->filter1);
160 		}
161 		if (convd->filter2) {
162 			mbfl_convert_filter_delete(convd->filter2);
163 		}
164 		mbfl_memory_device_clear(&convd->device);
165 		efree((void*)convd);
166 	}
167 }
168 
169 int
mbfl_buffer_converter_illegal_mode(mbfl_buffer_converter * convd,int mode)170 mbfl_buffer_converter_illegal_mode(mbfl_buffer_converter *convd, int mode)
171 {
172 	if (convd != NULL) {
173 		if (convd->filter2 != NULL) {
174 			convd->filter2->illegal_mode = mode;
175 		} else if (convd->filter1 != NULL) {
176 			convd->filter1->illegal_mode = mode;
177 		} else {
178 			return 0;
179 		}
180 	}
181 
182 	return 1;
183 }
184 
185 int
mbfl_buffer_converter_illegal_substchar(mbfl_buffer_converter * convd,int substchar)186 mbfl_buffer_converter_illegal_substchar(mbfl_buffer_converter *convd, int substchar)
187 {
188 	if (convd != NULL) {
189 		if (convd->filter2 != NULL) {
190 			convd->filter2->illegal_substchar = substchar;
191 		} else if (convd->filter1 != NULL) {
192 			convd->filter1->illegal_substchar = substchar;
193 		} else {
194 			return 0;
195 		}
196 	}
197 
198 	return 1;
199 }
200 
/*
 * Push every byte of `string` through the converter's first filter.
 *
 * The output device is grown up front to hold at least the current position
 * plus string->len bytes. Feeding stops early if the filter function returns
 * a negative value.
 *
 * Returns the number of input bytes handed to the filter (the byte on which
 * the filter reported failure is included in the count). If the converter has
 * no first filter, nothing is consumed and 0 is returned.
 */
size_t mbfl_buffer_converter_feed(mbfl_buffer_converter *convd, mbfl_string *string)
{
	ZEND_ASSERT(convd);
	ZEND_ASSERT(string);

	mbfl_memory_device_realloc(&convd->device, convd->device.pos + string->len, string->len/4);

	unsigned char *cursor = string->val;
	mbfl_convert_filter *first = convd->filter1;

	if (first != NULL) {
		for (size_t remaining = string->len; remaining > 0; remaining--) {
			if ((*first->filter_function)(*cursor++, first) < 0) {
				break;
			}
		}
	}

	return cursor - string->val;
}
226 
227 
228 int
mbfl_buffer_converter_flush(mbfl_buffer_converter * convd)229 mbfl_buffer_converter_flush(mbfl_buffer_converter *convd)
230 {
231 	if (convd == NULL) {
232 		return -1;
233 	}
234 
235 	if (convd->filter1 != NULL) {
236 		mbfl_convert_filter_flush(convd->filter1);
237 	}
238 
239 	return 0;
240 }
241 
242 mbfl_string *
mbfl_buffer_converter_result(mbfl_buffer_converter * convd,mbfl_string * result)243 mbfl_buffer_converter_result(mbfl_buffer_converter *convd, mbfl_string *result)
244 {
245 	if (convd == NULL || result == NULL) {
246 		return NULL;
247 	}
248 	result->encoding = convd->to;
249 	return mbfl_memory_device_result(&convd->device, result);
250 }
251 
252 mbfl_string *
mbfl_buffer_converter_feed_result(mbfl_buffer_converter * convd,mbfl_string * string,mbfl_string * result)253 mbfl_buffer_converter_feed_result(mbfl_buffer_converter *convd, mbfl_string *string,
254 				  mbfl_string *result)
255 {
256 	if (convd == NULL || string == NULL || result == NULL) {
257 		return NULL;
258 	}
259 	mbfl_buffer_converter_feed(convd, string);
260 	if (convd->filter1 != NULL) {
261 		mbfl_convert_filter_flush(convd->filter1);
262 	}
263 	result->encoding = convd->to;
264 	return mbfl_memory_device_result(&convd->device, result);
265 }
266 
/*
 * Total number of illegal characters counted so far by the converter's
 * filters (filter1 plus filter2, whichever exist). Returns 0 for a NULL
 * converter.
 */
size_t mbfl_buffer_illegalchars(mbfl_buffer_converter *convd)
{
	if (convd == NULL) {
		return 0;
	}

	size_t total = 0;
	mbfl_convert_filter *stages[2] = { convd->filter1, convd->filter2 };

	for (int i = 0; i < 2; i++) {
		if (stages[i] != NULL) {
			total += stages[i]->num_illegalchar;
		}
	}
	return total;
}
285 
286 /*
287  * encoding detector
288  */
mbfl_estimate_encoding_likelihood(int input_cp,void * void_data)289 static int mbfl_estimate_encoding_likelihood(int input_cp, void *void_data)
290 {
291 	mbfl_encoding_detector_data *data = void_data;
292 	unsigned int c = input_cp;
293 
294 	/* Receive wchars decoded from input string using candidate encoding.
295 	 * If the string was invalid in the candidate encoding, we assume
296 	 * it's the wrong one. Otherwise, give the candidate many 'demerits'
297 	 * for each 'rare' codepoint found, a smaller number for each ASCII
298 	 * punctuation character, and 1 for all other codepoints.
299 	 *
300 	 * The 'common' codepoints should cover the vast majority of
301 	 * codepoints we are likely to see in practice, while only covering
302 	 * a small minority of the entire Unicode encoding space. Why?
303 	 * Well, if the test string happens to be valid in an incorrect
304 	 * candidate encoding, the bogus codepoints which it decodes to will
305 	 * be more or less random. By treating the majority of codepoints as
306 	 * 'rare', we ensure that in almost all such cases, the bogus
307 	 * codepoints will include plenty of 'rares', thus giving the
308 	 * incorrect candidate encoding lots of demerits. See
309 	 * common_codepoints.txt for the actual list used.
310 	 *
311 	 * So, why give extra demerits for ASCII punctuation characters? It's
312 	 * because there are some text encodings, like UTF-7, HZ, and ISO-2022,
313 	 * which deliberately only use bytes in the ASCII range. When
314 	 * misinterpreted as ASCII/UTF-8, strings in these encodings will
315 	 * have an unusually high number of ASCII punctuation characters.
316 	 * So giving extra demerits for such characters will improve
317 	 * detection accuracy for UTF-7 and similar encodings.
318 	 *
319 	 * Finally, why 1 demerit for all other characters? That penalizes
320 	 * long strings, meaning we will tend to choose a candidate encoding
321 	 * in which the test string decodes to a smaller number of
322 	 * codepoints. That prevents single-byte encodings in which almost
323 	 * every possible input byte decodes to a 'common' codepoint from
324 	 * being favored too much. */
325 	if (c == MBFL_BAD_INPUT) {
326 		data->num_illegalchars++;
327 	} else if (c > 0xFFFF) {
328 		data->score += 40;
329 	} else if (c >= 0x21 && c <= 0x2F) {
330 		data->score += 6;
331 	} else if ((rare_codepoint_bitvec[c >> 5] >> (c & 0x1F)) & 1) {
332 		data->score += 30;
333 	} else {
334 		data->score += 1;
335 	}
336 	return 0;
337 }
338 
mbfl_encoding_detector_new(const mbfl_encoding ** elist,int elistsz,int strict)339 mbfl_encoding_detector *mbfl_encoding_detector_new(const mbfl_encoding **elist, int elistsz, int strict)
340 {
341 	if (!elistsz) {
342 		return NULL;
343 	}
344 
345 	mbfl_encoding_detector *identd = emalloc(sizeof(mbfl_encoding_detector));
346 	identd->filter_list = ecalloc(elistsz, sizeof(mbfl_convert_filter*));
347 	identd->filter_data = ecalloc(elistsz, sizeof(mbfl_encoding_detector_data));
348 
349 	int filter_list_size = 0;
350 	for (int i = 0; i < elistsz; i++) {
351 		mbfl_convert_filter *filter = mbfl_convert_filter_new(elist[i], &mbfl_encoding_wchar,
352 			mbfl_estimate_encoding_likelihood, NULL, &identd->filter_data[filter_list_size]);
353 		if (filter) {
354 			identd->filter_list[filter_list_size++] = filter;
355 		}
356 	}
357 	identd->filter_list_size = filter_list_size;
358 	identd->strict = strict;
359 	return identd;
360 }
361 
/*
 * Destroy an encoding detector: delete every candidate filter, then free the
 * filter list, the per-candidate data and the detector itself.
 * `identd` must not be NULL.
 */
void mbfl_encoding_detector_delete(mbfl_encoding_detector *identd)
{
	int remaining = identd->filter_list_size;
	int idx = 0;

	while (idx < remaining) {
		mbfl_convert_filter_delete(identd->filter_list[idx]);
		idx++;
	}

	efree(identd->filter_list);
	efree(identd->filter_data);
	efree(identd);
}
371 
/*
 * Feed `string` to every candidate filter of the detector.
 *
 * In strict mode, each candidate whose encoding provides a `check` function
 * is first validated against the whole string; a failed check marks the
 * candidate as illegal before any byte is fed.
 *
 * The string is then fed byte by byte to every candidate that has not been
 * marked illegal yet. In non-strict mode feeding stops, returning 1, as soon
 * as the number of candidates rejected during this call reaches
 * (number of candidates - 1); the filters are not flushed in that case.
 *
 * Otherwise all filters are flushed once the input is exhausted and 0 is
 * returned.
 */
int mbfl_encoding_detector_feed(mbfl_encoding_detector *identd, mbfl_string *string)
{
	const int ncandidates = identd->filter_list_size;
	unsigned char *cursor = string->val;
	int rejected = 0;

	if (identd->strict) {
		for (int i = 0; i < ncandidates; i++) {
			const mbfl_encoding *enc = identd->filter_list[i]->from;
			if (enc->check != NULL && !(enc->check)(cursor, string->len)) {
				identd->filter_data[i].num_illegalchars++;
			}
		}
	}

	for (size_t remaining = string->len; remaining > 0; remaining--, cursor++) {
		for (int i = 0; i < ncandidates; i++) {
			mbfl_encoding_detector_data *data = &identd->filter_data[i];
			if (data->num_illegalchars) {
				/* already ruled out */
				continue;
			}

			mbfl_convert_filter *filter = identd->filter_list[i];
			(*filter->filter_function)(*cursor, filter);
			if (data->num_illegalchars) {
				/* this byte just ruled the candidate out */
				rejected++;
			}
		}

		if (!identd->strict && rejected >= (ncandidates - 1)) {
			return 1;
		}
	}

	for (int i = 0; i < ncandidates; i++) {
		mbfl_convert_filter *filter = identd->filter_list[i];
		(filter->filter_flush)(filter);
	}

	return 0;
}
413 
mbfl_encoding_detector_judge(mbfl_encoding_detector * identd)414 const mbfl_encoding *mbfl_encoding_detector_judge(mbfl_encoding_detector *identd)
415 {
416 	size_t best_score = SIZE_MAX; /* Low score is 'better' */
417 	const mbfl_encoding *enc = NULL;
418 
419 	for (int i = 0; i < identd->filter_list_size; i++) {
420 		mbfl_convert_filter *filter = identd->filter_list[i];
421 		mbfl_encoding_detector_data *data = &identd->filter_data[i];
422 		if (!data->num_illegalchars && data->score < best_score) {
423 			enc = filter->from;
424 			best_score = data->score;
425 		}
426 	}
427 
428 	return enc;
429 }
430 
431 /*
432  * encoding converter
433  */
434 mbfl_string *
mbfl_convert_encoding(mbfl_string * string,mbfl_string * result,const mbfl_encoding * toenc)435 mbfl_convert_encoding(
436     mbfl_string *string,
437     mbfl_string *result,
438     const mbfl_encoding *toenc)
439 {
440 	size_t n;
441 	unsigned char *p;
442 	mbfl_memory_device device;
443 	mbfl_convert_filter *filter1;
444 	mbfl_convert_filter *filter2;
445 
446 	/* initialize */
447 	if (toenc == NULL || string == NULL || result == NULL) {
448 		return NULL;
449 	}
450 
451 	filter1 = NULL;
452 	filter2 = NULL;
453 	if (mbfl_convert_filter_get_vtbl(string->encoding, toenc) != NULL) {
454 		filter1 = mbfl_convert_filter_new(string->encoding, toenc, mbfl_memory_device_output, 0, &device);
455 	} else {
456 		filter2 = mbfl_convert_filter_new(&mbfl_encoding_wchar, toenc, mbfl_memory_device_output, 0, &device);
457 		if (filter2 != NULL) {
458 			filter1 = mbfl_convert_filter_new(string->encoding, &mbfl_encoding_wchar, (int (*)(int, void*))filter2->filter_function, NULL, filter2);
459 			if (filter1 == NULL) {
460 				mbfl_convert_filter_delete(filter2);
461 			}
462 		}
463 	}
464 	if (filter1 == NULL) {
465 		return NULL;
466 	}
467 
468 	if (filter2 != NULL) {
469 		filter2->illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
470 		filter2->illegal_substchar = 0x3f;		/* '?' */
471 	}
472 
473 	mbfl_memory_device_init(&device, string->len, (string->len >> 2) + 8);
474 
475 	/* feed data */
476 	n = string->len;
477 	p = string->val;
478 	if (p != NULL) {
479 		while (n > 0) {
480 			if ((*filter1->filter_function)(*p++, filter1) < 0) {
481 				break;
482 			}
483 			n--;
484 		}
485 	}
486 
487 	mbfl_convert_filter_flush(filter1);
488 	mbfl_convert_filter_delete(filter1);
489 	if (filter2 != NULL) {
490 		mbfl_convert_filter_flush(filter2);
491 		mbfl_convert_filter_delete(filter2);
492 	}
493 
494 	return mbfl_memory_device_result(&device, result);
495 }
496 
497 /*
498  * identify encoding
499  */
/*
 * One-shot encoding detection: build a detector for the candidates in
 * `elist`, feed it `string`, and return the candidate it judges best.
 *
 * Returns NULL when elistsz is 0 or when no candidate is acceptable.
 */
const mbfl_encoding *mbfl_identify_encoding(mbfl_string *string, const mbfl_encoding **elist, int elistsz, int strict)
{
	const mbfl_encoding *winner = NULL;

	if (elistsz != 0) {
		mbfl_encoding_detector *detector = mbfl_encoding_detector_new(elist, elistsz, strict);

		mbfl_encoding_detector_feed(detector, string);
		winner = mbfl_encoding_detector_judge(detector);
		mbfl_encoding_detector_delete(detector);
	}

	return winner;
}
511 
512 /*
513  *  strlen
514  */
/*
 * Output callback that simply counts calls: `data` points to a size_t which
 * is incremented once per received character. The character value itself is
 * ignored. Always returns 0.
 */
static int
filter_count_output(int c, void *data)
{
	size_t *counter = (size_t *)data;

	(void)c;
	*counter += 1;
	return 0;
}
521 
522 size_t
mbfl_strlen(const mbfl_string * string)523 mbfl_strlen(const mbfl_string *string)
524 {
525 	size_t len, n, k;
526 	unsigned char *p;
527 	const mbfl_encoding *encoding = string->encoding;
528 
529 	len = 0;
530 	if (encoding->flag & MBFL_ENCTYPE_SBCS) {
531 		len = string->len;
532 	} else if (encoding->flag & MBFL_ENCTYPE_WCS2) {
533 		len = string->len/2;
534 	} else if (encoding->flag & MBFL_ENCTYPE_WCS4) {
535 		len = string->len/4;
536 	} else if (encoding->mblen_table != NULL) {
537 		const unsigned char *mbtab = encoding->mblen_table;
538 		n = 0;
539 		p = string->val;
540 		k = string->len;
541 		/* count */
542 		if (p != NULL) {
543 			while (n < k) {
544 				unsigned m = mbtab[*p];
545 				n += m;
546 				p += m;
547 				len++;
548 			}
549 		}
550 	} else {
551 		/* wchar filter */
552 		mbfl_convert_filter *filter = mbfl_convert_filter_new(
553 		  string->encoding,
554 		  &mbfl_encoding_wchar,
555 		  filter_count_output, 0, &len);
556 		if (filter == NULL) {
557 			return (size_t) -1;
558 		}
559 		/* count */
560 		n = string->len;
561 		p = string->val;
562 		if (p != NULL) {
563 			while (n > 0) {
564 				(*filter->filter_function)(*p++, filter);
565 				n--;
566 			}
567 		}
568 		mbfl_convert_filter_delete(filter);
569 	}
570 
571 	return len;
572 }
573 
574 
575 /*
576  *  strpos
577  */
578 struct collector_strpos_data {
579 	mbfl_convert_filter *next_filter;
580 	mbfl_wchar_device needle;
581 	size_t needle_len;
582 	size_t start;
583 	size_t output;
584 	size_t found_pos;
585 	size_t needle_pos;
586 	size_t matched_pos;
587 };
588 
589 static int
collector_strpos(int c,void * data)590 collector_strpos(int c, void* data)
591 {
592 	int *p, *h, *m;
593 	ssize_t n;
594 	struct collector_strpos_data *pc = (struct collector_strpos_data*)data;
595 
596 	if (pc->output >= pc->start) {
597 		if (c == (int)pc->needle.buffer[pc->needle_pos]) {
598 			if (pc->needle_pos == 0) {
599 				pc->found_pos = pc->output;			/* found position */
600 			}
601 			pc->needle_pos++;						/* needle pointer */
602 			if (pc->needle_pos >= pc->needle_len) {
603 				pc->matched_pos = pc->found_pos;	/* matched position */
604 				pc->needle_pos--;
605 				goto retry;
606 			}
607 		} else if (pc->needle_pos != 0) {
608 retry:
609 			h = (int *)pc->needle.buffer;
610 			h++;
611 			for (;;) {
612 				pc->found_pos++;
613 				p = h;
614 				m = (int *)pc->needle.buffer;
615 				n = pc->needle_pos - 1;
616 				while (n > 0 && *p == *m) {
617 					n--;
618 					p++;
619 					m++;
620 				}
621 				if (n <= 0) {
622 					if (*m != c) {
623 						pc->needle_pos = 0;
624 					}
625 					break;
626 				} else {
627 					h++;
628 					pc->needle_pos--;
629 				}
630 			}
631 		}
632 	}
633 
634 	pc->output++;
635 	return 0;
636 }
637 
mbfl_find_offset_utf8(const unsigned char * str,const unsigned char * end,ssize_t offset)638 static const unsigned char *mbfl_find_offset_utf8(
639 		const unsigned char *str, const unsigned char *end, ssize_t offset) {
640 	if (offset < 0) {
641 		const unsigned char *pos = end;
642 		while (offset < 0) {
643 			if (pos <= str) {
644 				return NULL;
645 			}
646 
647 			unsigned char c = *(--pos);
648 			if (c < 0x80) {
649 				++offset;
650 			} else if ((c & 0xc0) != 0x80) {
651 				++offset;
652 			}
653 		}
654 		return pos;
655 	} else {
656 		const unsigned char *u8_tbl = mbfl_encoding_utf8.mblen_table;
657 		const unsigned char *pos = str;
658 		while (offset-- > 0) {
659 			if (pos >= end) {
660 				return NULL;
661 			}
662 			pos += u8_tbl[*pos];
663 		}
664 		return pos;
665 	}
666 }
667 
/*
 * Convert a byte position inside a UTF-8 buffer into a character offset:
 * the number of characters that begin in [start, pos). A character is
 * counted at every byte that is not a continuation byte (10xxxxxx).
 */
static size_t mbfl_pointer_to_offset_utf8(const unsigned char *start, const unsigned char *pos) {
	size_t nchars = 0;
	const unsigned char *cursor;

	for (cursor = pos; cursor > start; ) {
		--cursor;
		if ((*cursor & 0xc0) != 0x80) {
			nchars++;
		}
	}
	return nchars;
}
680 
681 size_t
mbfl_strpos(mbfl_string * haystack,mbfl_string * needle,ssize_t offset,int reverse)682 mbfl_strpos(
683     mbfl_string *haystack,
684     mbfl_string *needle,
685     ssize_t offset,
686     int reverse)
687 {
688 	size_t result;
689 	mbfl_string _haystack_u8, _needle_u8;
690 	const mbfl_string *haystack_u8, *needle_u8 = NULL;
691 	const unsigned char *offset_pointer;
692 
693 	if (haystack->encoding->no_encoding != mbfl_no_encoding_utf8) {
694 		mbfl_string_init(&_haystack_u8);
695 		haystack_u8 = mbfl_convert_encoding(haystack, &_haystack_u8, &mbfl_encoding_utf8);
696 		if (haystack_u8 == NULL) {
697 			result = MBFL_ERROR_ENCODING;
698 			goto out;
699 		}
700 	} else {
701 		haystack_u8 = haystack;
702 	}
703 
704 	if (needle->encoding->no_encoding != mbfl_no_encoding_utf8) {
705 		mbfl_string_init(&_needle_u8);
706 		needle_u8 = mbfl_convert_encoding(needle, &_needle_u8, &mbfl_encoding_utf8);
707 		if (needle_u8 == NULL) {
708 			result = MBFL_ERROR_ENCODING;
709 			goto out;
710 		}
711 	} else {
712 		needle_u8 = needle;
713 	}
714 
715 	offset_pointer = mbfl_find_offset_utf8(
716 		haystack_u8->val, haystack_u8->val + haystack_u8->len, offset);
717 	if (!offset_pointer) {
718 		result = MBFL_ERROR_OFFSET;
719 		goto out;
720 	}
721 
722 	result = MBFL_ERROR_NOT_FOUND;
723 	if (haystack_u8->len < needle_u8->len) {
724 		goto out;
725 	}
726 
727 	const char *found_pos;
728 	if (!reverse) {
729 		found_pos = zend_memnstr(
730 			(const char *) offset_pointer,
731 			(const char *) needle_u8->val, needle_u8->len,
732 			(const char *) haystack_u8->val + haystack_u8->len);
733 	} else {
734 		if (offset >= 0) {
735 			found_pos = zend_memnrstr(
736 				(const char *) offset_pointer,
737 				(const char *) needle_u8->val, needle_u8->len,
738 				(const char *) haystack_u8->val + haystack_u8->len);
739 		} else {
740 			size_t needle_len = mbfl_strlen(needle_u8);
741 			offset_pointer = mbfl_find_offset_utf8(
742 				offset_pointer, haystack_u8->val + haystack_u8->len, needle_len);
743 			if (!offset_pointer) {
744 				offset_pointer = haystack_u8->val + haystack_u8->len;
745 			}
746 
747 			found_pos = zend_memnrstr(
748 				(const char *) haystack_u8->val,
749 				(const char *) needle_u8->val, needle_u8->len,
750 				(const char *) offset_pointer);
751 		}
752 	}
753 
754 	if (found_pos) {
755 		result = mbfl_pointer_to_offset_utf8(haystack_u8->val, (const unsigned char *) found_pos);
756 	}
757 
758 out:
759 	if (haystack_u8 == &_haystack_u8) {
760 		mbfl_string_clear(&_haystack_u8);
761 	}
762 	if (needle_u8 == &_needle_u8) {
763 		mbfl_string_clear(&_needle_u8);
764 	}
765 	return result;
766 }
767 
768 /*
769  *  substr_count
770  */
771 
772 size_t
mbfl_substr_count(mbfl_string * haystack,mbfl_string * needle)773 mbfl_substr_count(
774     mbfl_string *haystack,
775     mbfl_string *needle
776    )
777 {
778 	size_t n, result = 0;
779 	unsigned char *p;
780 	mbfl_convert_filter *filter;
781 	struct collector_strpos_data pc;
782 
783 	/* needle is converted into wchar */
784 	mbfl_wchar_device_init(&pc.needle);
785 	filter = mbfl_convert_filter_new(
786 	  needle->encoding,
787 	  &mbfl_encoding_wchar,
788 	  mbfl_wchar_device_output, 0, &pc.needle);
789 	if (filter == NULL) {
790 		return MBFL_ERROR_ENCODING;
791 	}
792 	mbfl_convert_filter_feed_string(filter, needle->val, needle->len);
793 	mbfl_convert_filter_flush(filter);
794 	mbfl_convert_filter_delete(filter);
795 	pc.needle_len = pc.needle.pos;
796 	if (pc.needle.buffer == NULL) {
797 		return MBFL_ERROR_ENCODING;
798 	}
799 	if (pc.needle_len == 0) {
800 		mbfl_wchar_device_clear(&pc.needle);
801 		return MBFL_ERROR_EMPTY;
802 	}
803 	/* initialize filter and collector data */
804 	filter = mbfl_convert_filter_new(
805 	  haystack->encoding,
806 	  &mbfl_encoding_wchar,
807 	  collector_strpos, 0, &pc);
808 	if (filter == NULL) {
809 		mbfl_wchar_device_clear(&pc.needle);
810 		return MBFL_ERROR_ENCODING;
811 	}
812 	pc.start = 0;
813 	pc.output = 0;
814 	pc.needle_pos = 0;
815 	pc.found_pos = 0;
816 	pc.matched_pos = MBFL_ERROR_NOT_FOUND;
817 
818 	/* feed data */
819 	p = haystack->val;
820 	n = haystack->len;
821 	if (p != NULL) {
822 		while (n > 0) {
823 			if ((*filter->filter_function)(*p++, filter) < 0) {
824 				pc.matched_pos = MBFL_ERROR_ENCODING;
825 				break;
826 			}
827 			if (pc.matched_pos != MBFL_ERROR_NOT_FOUND) {
828 				++result;
829 				pc.matched_pos = MBFL_ERROR_NOT_FOUND;
830 				pc.needle_pos = 0;
831 			}
832 			n--;
833 		}
834 	}
835 	mbfl_convert_filter_flush(filter);
836 	mbfl_convert_filter_delete(filter);
837 	mbfl_wchar_device_clear(&pc.needle);
838 
839 	return result;
840 }
841 
842 /*
843  *  substr
844  */
845 struct collector_substr_data {
846 	mbfl_convert_filter *next_filter;
847 	size_t start;
848 	size_t stop;
849 	size_t output;
850 };
851 
852 static int
collector_substr(int c,void * data)853 collector_substr(int c, void* data)
854 {
855 	struct collector_substr_data *pc = (struct collector_substr_data*)data;
856 
857 	if (pc->output >= pc->stop) {
858 		return -1;
859 	}
860 
861 	if (pc->output >= pc->start) {
862 		(*pc->next_filter->filter_function)(c, pc->next_filter);
863 	}
864 
865 	pc->output++;
866 
867 	return 0;
868 }
869 
870 mbfl_string *
mbfl_substr(mbfl_string * string,mbfl_string * result,size_t from,size_t length)871 mbfl_substr(
872     mbfl_string *string,
873     mbfl_string *result,
874     size_t from,
875     size_t length)
876 {
877 	const mbfl_encoding *encoding = string->encoding;
878 	size_t n, k, len, start, end;
879 	unsigned m;
880 	unsigned char *p, *w;
881 
882 	mbfl_string_init(result);
883 	result->encoding = string->encoding;
884 
885 	if ((encoding->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4)) ||
886 	   encoding->mblen_table != NULL) {
887 		len = string->len;
888 		if (encoding->flag & MBFL_ENCTYPE_SBCS) {
889 			start = from;
890 		} else if (encoding->flag & MBFL_ENCTYPE_WCS2) {
891 			start = from*2;
892 		} else if (encoding->flag & MBFL_ENCTYPE_WCS4) {
893 			start = from*4;
894 		} else {
895 			const unsigned char *mbtab = encoding->mblen_table;
896 			start = 0;
897 			n = 0;
898 			k = 0;
899 			p = string->val;
900 			/* search start position */
901 			while (k <= from) {
902 				start = n;
903 				if (n >= len) {
904 					break;
905 				}
906 				m = mbtab[*p];
907 				n += m;
908 				p += m;
909 				k++;
910 			}
911 		}
912 
913 		if (length == MBFL_SUBSTR_UNTIL_END) {
914 			end = len;
915 		} else if (encoding->flag & MBFL_ENCTYPE_SBCS) {
916 			end = start + length;
917 		} else if (encoding->flag & MBFL_ENCTYPE_WCS2) {
918 			end = start + length*2;
919 		} else if (encoding->flag & MBFL_ENCTYPE_WCS4) {
920 			end = start + length*4;
921 		} else {
922 			const unsigned char *mbtab = encoding->mblen_table;
923 			end = start;
924 			n = start;
925 			k = 0;
926 			p = string->val + start;
927 			/* detect end position */
928 			while (k <= length) {
929 				end = n;
930 				if (n >= len) {
931 					break;
932 				}
933 				m = mbtab[*p];
934 				n += m;
935 				p += m;
936 				k++;
937 			}
938 		}
939 
940 		if (start > len) {
941 			start = len;
942 		}
943 		if (end > len) {
944 			end = len;
945 		}
946 		if (start > end) {
947 			start = end;
948 		}
949 
950 		/* allocate memory and copy */
951 		n = end - start;
952 		result->len = 0;
953 		result->val = w = (unsigned char*)emalloc(n + 1);
954 		result->len = n;
955 		memcpy(w, string->val + start, n);
956 		w[n] = '\0';
957 	} else {
958 		mbfl_memory_device device;
959 		struct collector_substr_data pc;
960 		mbfl_convert_filter *decoder;
961 		mbfl_convert_filter *encoder;
962 
963 		if (length == MBFL_SUBSTR_UNTIL_END) {
964 			length = mbfl_strlen(string) - from;
965 		}
966 
967 		mbfl_memory_device_init(&device, length + 1, 0);
968 		mbfl_string_init(result);
969 		result->encoding = string->encoding;
970 		/* output code filter */
971 		decoder = mbfl_convert_filter_new(
972 		    &mbfl_encoding_wchar,
973 		    string->encoding,
974 		    mbfl_memory_device_output, 0, &device);
975 		/* wchar filter */
976 		encoder = mbfl_convert_filter_new(
977 		    string->encoding,
978 		    &mbfl_encoding_wchar,
979 		    collector_substr, 0, &pc);
980 		if (decoder == NULL || encoder == NULL) {
981 			mbfl_convert_filter_delete(encoder);
982 			mbfl_convert_filter_delete(decoder);
983 			return NULL;
984 		}
985 		pc.next_filter = decoder;
986 		pc.start = from;
987 		pc.stop = from + length;
988 		pc.output = 0;
989 
990 		/* feed data */
991 		p = string->val;
992 		n = string->len;
993 		if (p != NULL) {
994 			while (n > 0) {
995 				if ((*encoder->filter_function)(*p++, encoder) < 0) {
996 					break;
997 				}
998 				n--;
999 			}
1000 		}
1001 
1002 		mbfl_convert_filter_flush(encoder);
1003 		mbfl_convert_filter_flush(decoder);
1004 		result = mbfl_memory_device_result(&device, result);
1005 		mbfl_convert_filter_delete(encoder);
1006 		mbfl_convert_filter_delete(decoder);
1007 	}
1008 
1009 	return result;
1010 }
1011 
1012 /*
1013  *  strcut
1014  */
1015 mbfl_string *
mbfl_strcut(mbfl_string * string,mbfl_string * result,size_t from,size_t length)1016 mbfl_strcut(
1017     mbfl_string *string,
1018     mbfl_string *result,
1019     size_t from,
1020     size_t length)
1021 {
1022 	const mbfl_encoding *encoding = string->encoding;
1023 	mbfl_memory_device device;
1024 
1025 	if (from >= string->len) {
1026 		from = string->len;
1027 	}
1028 
1029 	mbfl_string_init(result);
1030 	result->encoding = string->encoding;
1031 
1032 	if ((encoding->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4)) || encoding->mblen_table != NULL) {
1033 		const unsigned char *start = NULL;
1034 		const unsigned char *end = NULL;
1035 		unsigned char *w;
1036 		size_t sz;
1037 
1038 		if (encoding->flag & MBFL_ENCTYPE_WCS2) {
1039 			from &= -2;
1040 
1041 			if (length >= string->len - from) {
1042 				length = string->len - from;
1043 			}
1044 
1045 			start = string->val + from;
1046 			end   = start + (length & -2);
1047 		} else if (encoding->flag & MBFL_ENCTYPE_WCS4) {
1048 			from &= -4;
1049 
1050 			if (length >= string->len - from) {
1051 				length = string->len - from;
1052 			}
1053 
1054 			start = string->val + from;
1055 			end   = start + (length & -4);
1056 		} else if ((encoding->flag & MBFL_ENCTYPE_SBCS)) {
1057 			if (length >= string->len - from) {
1058 				length = string->len - from;
1059 			}
1060 
1061 			start = string->val + from;
1062 			end = start + length;
1063 		} else if (encoding->mblen_table != NULL) {
1064 			const unsigned char *mbtab = encoding->mblen_table;
1065 			const unsigned char *p, *q;
1066 			int m;
1067 
1068 			/* search start position */
1069 			for (m = 0, p = string->val, q = p + from;
1070 					p < q; p += (m = mbtab[*p]));
1071 
1072 			if (p > q) {
1073 				p -= m;
1074 			}
1075 
1076 			start = p;
1077 
1078 			/* search end position */
1079 			if (length >= string->len - (start - string->val)) {
1080 				end = string->val + string->len;
1081 			} else {
1082 				for (q = p + length; p < q; p += (m = mbtab[*p]));
1083 
1084 				if (p > q) {
1085 					p -= m;
1086 				}
1087 				end = p;
1088 			}
1089 		} else {
1090 			/* never reached */
1091 			return NULL;
1092 		}
1093 
1094 		/* allocate memory and copy string */
1095 		sz = end - start;
1096 		w = ecalloc(sz + 8, sizeof(unsigned char));
1097 
1098 		memcpy(w, start, sz);
1099 		w[sz] = '\0';
1100 		w[sz + 1] = '\0';
1101 		w[sz + 2] = '\0';
1102 		w[sz + 3] = '\0';
1103 
1104 		result->val = w;
1105 		result->len = sz;
1106 	} else {
1107 		mbfl_convert_filter *encoder     = NULL;
1108 		mbfl_convert_filter *decoder     = NULL;
1109 		const unsigned char *p, *q, *r;
1110 		struct {
1111 			mbfl_convert_filter encoder;
1112 			mbfl_convert_filter decoder;
1113 			const unsigned char *p;
1114 			size_t pos;
1115 		} bk, _bk;
1116 
1117 		/* output code filter */
1118 		if (!(decoder = mbfl_convert_filter_new(
1119 				&mbfl_encoding_wchar,
1120 				string->encoding,
1121 				mbfl_memory_device_output, 0, &device))) {
1122 			return NULL;
1123 		}
1124 
1125 		/* wchar filter */
1126 		if (!(encoder = mbfl_convert_filter_new(
1127 				string->encoding,
1128 				&mbfl_encoding_wchar,
1129 				mbfl_filter_output_null,
1130 				NULL, NULL))) {
1131 			mbfl_convert_filter_delete(decoder);
1132 			return NULL;
1133 		}
1134 
1135 		mbfl_memory_device_init(&device, length + 8, 0);
1136 
1137 		p = string->val;
1138 
1139 		/* search start position */
1140 		for (q = string->val + from; p < q; p++) {
1141 			(*encoder->filter_function)(*p, encoder);
1142 		}
1143 
1144 		/* switch the drain direction */
1145 		encoder->output_function = (output_function_t)decoder->filter_function;
1146 		encoder->flush_function = (flush_function_t)decoder->filter_flush;
1147 		encoder->data = decoder;
1148 
1149 		q = string->val + string->len;
1150 
1151 		/* save the encoder, decoder state and the pointer */
1152 		mbfl_convert_filter_copy(decoder, &_bk.decoder);
1153 		mbfl_convert_filter_copy(encoder, &_bk.encoder);
1154 		_bk.p = p;
1155 		_bk.pos = device.pos;
1156 
1157 		if (length > q - p) {
1158 			length = q - p;
1159 		}
1160 
1161 		if (length >= 20) {
1162 			/* output a little shorter than "length" */
1163 			/* XXX: the constant "20" was determined purely on the heuristics. */
1164 			for (r = p + length - 20; p < r; p++) {
1165 				(*encoder->filter_function)(*p, encoder);
1166 			}
1167 
1168 			/* if the offset of the resulting string exceeds the length,
1169 			 * then restore the state */
1170 			if (device.pos > length) {
1171 				p = _bk.p;
1172 				device.pos = _bk.pos;
1173 				if (decoder->filter_dtor)
1174 					decoder->filter_dtor(decoder);
1175 				if (encoder->filter_dtor)
1176 					encoder->filter_dtor(encoder);
1177 				mbfl_convert_filter_copy(&_bk.decoder, decoder);
1178 				mbfl_convert_filter_copy(&_bk.encoder, encoder);
1179 				bk = _bk;
1180 			} else {
1181 				/* save the encoder, decoder state and the pointer */
1182 				mbfl_convert_filter_copy(decoder, &bk.decoder);
1183 				mbfl_convert_filter_copy(encoder, &bk.encoder);
1184 				bk.p = p;
1185 				bk.pos = device.pos;
1186 
1187 				/* flush the stream */
1188 				(*encoder->filter_flush)(encoder);
1189 
1190 				/* if the offset of the resulting string exceeds the length,
1191 				 * then restore the state */
1192 				if (device.pos > length) {
1193 					if (bk.decoder.filter_dtor)
1194 						bk.decoder.filter_dtor(&bk.decoder);
1195 					if (bk.encoder.filter_dtor)
1196 						bk.encoder.filter_dtor(&bk.encoder);
1197 
1198 					p = _bk.p;
1199 					device.pos = _bk.pos;
1200 					if (decoder->filter_dtor)
1201 						decoder->filter_dtor(decoder);
1202 					if (encoder->filter_dtor)
1203 						encoder->filter_dtor(encoder);
1204 					mbfl_convert_filter_copy(&_bk.decoder, decoder);
1205 					mbfl_convert_filter_copy(&_bk.encoder, encoder);
1206 					bk = _bk;
1207 				} else {
1208 					if (_bk.decoder.filter_dtor)
1209 						_bk.decoder.filter_dtor(&_bk.decoder);
1210 					if (_bk.encoder.filter_dtor)
1211 						_bk.encoder.filter_dtor(&_bk.encoder);
1212 
1213 					p = bk.p;
1214 					device.pos = bk.pos;
1215 					if (decoder->filter_dtor)
1216 						decoder->filter_dtor(decoder);
1217 					if (encoder->filter_dtor)
1218 						encoder->filter_dtor(encoder);
1219 					mbfl_convert_filter_copy(&bk.decoder, decoder);
1220 					mbfl_convert_filter_copy(&bk.encoder, encoder);
1221 				}
1222 			}
1223 		} else {
1224 			bk = _bk;
1225 		}
1226 
1227 		/* detect end position */
1228 		while (p < q) {
1229 			(*encoder->filter_function)(*p, encoder);
1230 
1231 			if (device.pos > length) {
1232 				/* restore filter */
1233 				p = bk.p;
1234 				device.pos = bk.pos;
1235 				if (decoder->filter_dtor)
1236 					decoder->filter_dtor(decoder);
1237 				if (encoder->filter_dtor)
1238 					encoder->filter_dtor(encoder);
1239 				mbfl_convert_filter_copy(&bk.decoder, decoder);
1240 				mbfl_convert_filter_copy(&bk.encoder, encoder);
1241 				break;
1242 			}
1243 
1244 			p++;
1245 
1246 			/* backup current state */
1247 			mbfl_convert_filter_copy(decoder, &_bk.decoder);
1248 			mbfl_convert_filter_copy(encoder, &_bk.encoder);
1249 			_bk.pos = device.pos;
1250 			_bk.p = p;
1251 
1252 			(*encoder->filter_flush)(encoder);
1253 
1254 			if (device.pos > length) {
1255 				if (_bk.decoder.filter_dtor)
1256 					_bk.decoder.filter_dtor(&_bk.decoder);
1257 				if (_bk.encoder.filter_dtor)
1258 					_bk.encoder.filter_dtor(&_bk.encoder);
1259 
1260 				/* restore filter */
1261 				p = bk.p;
1262 				device.pos = bk.pos;
1263 				if (decoder->filter_dtor)
1264 					decoder->filter_dtor(decoder);
1265 				if (encoder->filter_dtor)
1266 					encoder->filter_dtor(encoder);
1267 				mbfl_convert_filter_copy(&bk.decoder, decoder);
1268 				mbfl_convert_filter_copy(&bk.encoder, encoder);
1269 				break;
1270 			}
1271 
1272 			if (bk.decoder.filter_dtor)
1273 				bk.decoder.filter_dtor(&bk.decoder);
1274 			if (bk.encoder.filter_dtor)
1275 				bk.encoder.filter_dtor(&bk.encoder);
1276 
1277 			p = _bk.p;
1278 			device.pos = _bk.pos;
1279 			if (decoder->filter_dtor)
1280 				decoder->filter_dtor(decoder);
1281 			if (encoder->filter_dtor)
1282 				encoder->filter_dtor(encoder);
1283 			mbfl_convert_filter_copy(&_bk.decoder, decoder);
1284 			mbfl_convert_filter_copy(&_bk.encoder, encoder);
1285 
1286 			bk = _bk;
1287 		}
1288 
1289 		decoder->illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
1290 		(*encoder->filter_flush)(encoder);
1291 
1292 		if (bk.decoder.filter_dtor)
1293 			bk.decoder.filter_dtor(&bk.decoder);
1294 		if (bk.encoder.filter_dtor)
1295 			bk.encoder.filter_dtor(&bk.encoder);
1296 
1297 		result = mbfl_memory_device_result(&device, result);
1298 
1299 		mbfl_convert_filter_delete(encoder);
1300 		mbfl_convert_filter_delete(decoder);
1301 	}
1302 
1303 	return result;
1304 }
1305 
1306 
1307 /*
1308  *  strwidth
1309  */
is_fullwidth(int c)1310 static size_t is_fullwidth(int c)
1311 {
1312 	int i;
1313 
1314 	if (c < mbfl_eaw_table[0].begin) {
1315 		return 0;
1316 	}
1317 
1318 	for (i = 0; i < sizeof(mbfl_eaw_table) / sizeof(mbfl_eaw_table[0]); i++) {
1319 		if (mbfl_eaw_table[i].begin <= c && c <= mbfl_eaw_table[i].end) {
1320 			return 1;
1321 		}
1322 	}
1323 
1324 	return 0;
1325 }
1326 
/*
 * Filter output callback that accumulates display width.
 *
 * `data` points at a size_t counter; it is advanced by 2 when
 * is_fullwidth(c) is non-zero and by 1 otherwise.  Always returns 0.
 */
static int
filter_count_width(int c, void* data)
{
	size_t *total = (size_t *)data;
	size_t step = 1;

	if (is_fullwidth(c)) {
		step = 2;
	}
	*total += step;

	return 0;
}
1333 
1334 size_t
mbfl_strwidth(mbfl_string * string)1335 mbfl_strwidth(mbfl_string *string)
1336 {
1337 	size_t len, n;
1338 	unsigned char *p;
1339 	mbfl_convert_filter *filter;
1340 
1341 	len = 0;
1342 	if (string->len > 0 && string->val != NULL) {
1343 		/* wchar filter */
1344 		filter = mbfl_convert_filter_new(
1345 		    string->encoding,
1346 		    &mbfl_encoding_wchar,
1347 		    filter_count_width, 0, &len);
1348 		if (filter == NULL) {
1349 			mbfl_convert_filter_delete(filter);
1350 			return -1;
1351 		}
1352 
1353 		/* feed data */
1354 		p = string->val;
1355 		n = string->len;
1356 		while (n > 0) {
1357 			(*filter->filter_function)(*p++, filter);
1358 			n--;
1359 		}
1360 
1361 		mbfl_convert_filter_flush(filter);
1362 		mbfl_convert_filter_delete(filter);
1363 	}
1364 
1365 	return len;
1366 }
1367 
1368 
1369 /*
1370  *  strimwidth
1371  */
/*
 * Working state shared between mbfl_strimwidth() and its filter output
 * callback collector_strimwidth().
 */
struct collector_strimwidth_data {
	mbfl_convert_filter *decoder;        /* wchar -> string encoding; writes into `device` */
	mbfl_convert_filter *decoder_backup; /* receives a copy of `decoder` when the width limit is first exceeded */
	mbfl_memory_device device;           /* output buffer */
	size_t from;                         /* characters whose index is below this are not output */
	size_t width;                        /* width limit currently in force */
	size_t outwidth;                     /* accumulated width of the characters at or after `from` */
	size_t outchar;                      /* number of characters processed so far */
	size_t endpos;                       /* device.pos recorded when the limit was first exceeded */
	int status;                          /* 0: limit not exceeded; incremented on each overflow; 10: pass-through */
};
1383 
1384 static int
collector_strimwidth(int c,void * data)1385 collector_strimwidth(int c, void* data)
1386 {
1387 	struct collector_strimwidth_data *pc = (struct collector_strimwidth_data*)data;
1388 
1389 	switch (pc->status) {
1390 	case 10:
1391 		(*pc->decoder->filter_function)(c, pc->decoder);
1392 		break;
1393 	default:
1394 		if (pc->outchar >= pc->from) {
1395 			pc->outwidth += (is_fullwidth(c) ? 2: 1);
1396 
1397 			if (pc->outwidth > pc->width) {
1398 				if (pc->status == 0) {
1399 					pc->endpos = pc->device.pos;
1400 					mbfl_convert_filter_copy(pc->decoder, pc->decoder_backup);
1401 				}
1402 				pc->status++;
1403 				(*pc->decoder->filter_function)(c, pc->decoder);
1404 				pc->outchar++;
1405 				return -1;
1406 			} else {
1407 				(*pc->decoder->filter_function)(c, pc->decoder);
1408 			}
1409 		}
1410 		pc->outchar++;
1411 		break;
1412 	}
1413 
1414 	return 0;
1415 }
1416 
1417 mbfl_string *
mbfl_strimwidth(mbfl_string * string,mbfl_string * marker,mbfl_string * result,size_t from,size_t width)1418 mbfl_strimwidth(
1419     mbfl_string *string,
1420     mbfl_string *marker,
1421     mbfl_string *result,
1422     size_t from,
1423     size_t width)
1424 {
1425 	struct collector_strimwidth_data pc;
1426 	mbfl_convert_filter *encoder;
1427 	size_t n, mkwidth;
1428 	unsigned char *p;
1429 
1430 	if (string == NULL || result == NULL) {
1431 		return NULL;
1432 	}
1433 	mbfl_string_init(result);
1434 	result->encoding = string->encoding;
1435 	mbfl_memory_device_init(&pc.device, MIN(string->len, width), 0);
1436 
1437 	/* output code filter */
1438 	pc.decoder = mbfl_convert_filter_new(
1439 	    &mbfl_encoding_wchar,
1440 	    string->encoding,
1441 	    mbfl_memory_device_output, 0, &pc.device);
1442 	pc.decoder_backup = mbfl_convert_filter_new(
1443 	    &mbfl_encoding_wchar,
1444 	    string->encoding,
1445 	    mbfl_memory_device_output, 0, &pc.device);
1446 	/* wchar filter */
1447 	encoder = mbfl_convert_filter_new(
1448 	    string->encoding,
1449 	    &mbfl_encoding_wchar,
1450 	    collector_strimwidth, 0, &pc);
1451 	if (pc.decoder == NULL || pc.decoder_backup == NULL || encoder == NULL) {
1452 		mbfl_convert_filter_delete(encoder);
1453 		mbfl_convert_filter_delete(pc.decoder);
1454 		mbfl_convert_filter_delete(pc.decoder_backup);
1455 		return NULL;
1456 	}
1457 	mkwidth = 0;
1458 	if (marker) {
1459 		mkwidth = mbfl_strwidth(marker);
1460 	}
1461 	pc.from = from;
1462 	pc.width = width - mkwidth;
1463 	pc.outwidth = 0;
1464 	pc.outchar = 0;
1465 	pc.status = 0;
1466 	pc.endpos = 0;
1467 
1468 	/* feed data */
1469 	p = string->val;
1470 	n = string->len;
1471 	if (p != NULL) {
1472 		while (n > 0) {
1473 			n--;
1474 			if ((*encoder->filter_function)(*p++, encoder) < 0) {
1475 				break;
1476 			}
1477 		}
1478 		mbfl_convert_filter_flush(encoder);
1479 		if (pc.status != 0 && mkwidth > 0) {
1480 			pc.width += mkwidth;
1481 			if (n > 0) {
1482 				while (n > 0) {
1483 					if ((*encoder->filter_function)(*p++, encoder) < 0) {
1484 						break;
1485 					}
1486 					n--;
1487 				}
1488 				mbfl_convert_filter_flush(encoder);
1489 			} else if (pc.outwidth > pc.width) {
1490 				pc.status++;
1491 			}
1492 			if (pc.status != 1) {
1493 				pc.status = 10;
1494 				pc.device.pos = pc.endpos;
1495 				mbfl_convert_filter_copy(pc.decoder_backup, pc.decoder);
1496 				mbfl_convert_filter_reset(encoder, marker->encoding, &mbfl_encoding_wchar);
1497 				p = marker->val;
1498 				n = marker->len;
1499 				while (n > 0) {
1500 					if ((*encoder->filter_function)(*p++, encoder) < 0) {
1501 						break;
1502 					}
1503 					n--;
1504 				}
1505 				mbfl_convert_filter_flush(encoder);
1506 			}
1507 		} else if (pc.status != 0) {
1508 			pc.device.pos = pc.endpos;
1509 			mbfl_convert_filter_copy(pc.decoder_backup, pc.decoder);
1510 		}
1511 		mbfl_convert_filter_flush(pc.decoder);
1512 	}
1513 	result = mbfl_memory_device_result(&pc.device, result);
1514 	mbfl_convert_filter_delete(encoder);
1515 	mbfl_convert_filter_delete(pc.decoder);
1516 	mbfl_convert_filter_delete(pc.decoder_backup);
1517 
1518 	return result;
1519 }
1520 
1521 mbfl_string *
mbfl_ja_jp_hantozen(mbfl_string * string,mbfl_string * result,int mode)1522 mbfl_ja_jp_hantozen(
1523     mbfl_string *string,
1524     mbfl_string *result,
1525     int mode)
1526 {
1527 	size_t n;
1528 	unsigned char *p;
1529 	mbfl_memory_device device;
1530 	mbfl_convert_filter *decoder = NULL;
1531 	mbfl_convert_filter *encoder = NULL;
1532 	mbfl_convert_filter *tl_filter = NULL;
1533 	mbfl_convert_filter *next_filter = NULL;
1534 
1535 	mbfl_memory_device_init(&device, string->len, 0);
1536 	mbfl_string_init(result);
1537 
1538 	result->encoding = string->encoding;
1539 
1540 	decoder = mbfl_convert_filter_new(
1541 		&mbfl_encoding_wchar,
1542 		string->encoding,
1543 		mbfl_memory_device_output, 0, &device);
1544 	if (decoder == NULL) {
1545 		goto out;
1546 	}
1547 	next_filter = decoder;
1548 
1549 	tl_filter = mbfl_convert_filter_new2(
1550 		&vtbl_tl_jisx0201_jisx0208,
1551 		(int(*)(int, void*))next_filter->filter_function,
1552 		(flush_function_t)next_filter->filter_flush,
1553 		next_filter);
1554 	if (tl_filter == NULL) {
1555 		goto out;
1556 	}
1557 
1558 	tl_filter->opaque = (void*)((intptr_t)mode);
1559 	next_filter = tl_filter;
1560 
1561 	encoder = mbfl_convert_filter_new(
1562 		string->encoding,
1563 		&mbfl_encoding_wchar,
1564 		(int(*)(int, void*))next_filter->filter_function,
1565 		(flush_function_t)next_filter->filter_flush,
1566 		next_filter);
1567 	if (encoder == NULL) {
1568 		goto out;
1569 	}
1570 
1571 	/* feed data */
1572 	p = string->val;
1573 	n = string->len;
1574 	if (p != NULL) {
1575 		while (n > 0) {
1576 			if ((*encoder->filter_function)(*p++, encoder) < 0) {
1577 				break;
1578 			}
1579 			n--;
1580 		}
1581 	}
1582 
1583 	mbfl_convert_filter_flush(encoder);
1584 	result = mbfl_memory_device_result(&device, result);
1585 out:
1586 	if (tl_filter != NULL) {
1587 		mbfl_convert_filter_delete(tl_filter);
1588 	}
1589 
1590 	if (decoder != NULL) {
1591 		mbfl_convert_filter_delete(decoder);
1592 	}
1593 
1594 	if (encoder != NULL) {
1595 		mbfl_convert_filter_delete(encoder);
1596 	}
1597 
1598 	return result;
1599 }
1600 
1601 
1602 /*
1603  *  MIME header encode
1604  */
/*
 * State of one MIME header encoding run; created by
 * mime_header_encoder_new() and released by mime_header_encoder_delete().
 */
struct mime_header_encoder_data {
	mbfl_convert_filter *conv1_filter;        /* input encoding -> wchar; output goes to mime_header_encoder_collector */
	mbfl_convert_filter *block_filter;        /* wchar -> wchar; output goes to mime_header_encoder_block_collector */
	mbfl_convert_filter *conv2_filter;        /* wchar -> output charset; piped into encod_filter */
	mbfl_convert_filter *conv2_filter_backup; /* scratch copy used to save/restore conv2_filter */
	mbfl_convert_filter *encod_filter;        /* output charset -> transfer encoding; writes into outdev */
	mbfl_convert_filter *encod_filter_backup; /* scratch copy used to save/restore encod_filter */
	mbfl_memory_device outdev;                /* encoded header being built */
	mbfl_memory_device tmpdev;                /* characters collected but not yet written to outdev */
	int status1;                              /* state of mime_header_encoder_collector (0, 1 or 11) */
	int status2;                              /* state of mime_header_encoder_block_collector (1 once the encoded-word prefix was written) */
	size_t prevpos;                           /* outdev.pos saved before a trial encode */
	size_t linehead;                          /* outdev.pos at which the current output line starts */
	size_t firstindent;                       /* extra columns counted for the first line; reset to 0 after a line break */
	int encnamelen;                           /* length of encname */
	int lwsplen;                              /* length of lwsp */
	char encname[128];                        /* encoded-word prefix, e.g. "=?ISO-2022-JP?B?" */
	char lwsp[16];                            /* line break sequence followed by a space */
};
1624 
1625 static int
mime_header_encoder_block_collector(int c,void * data)1626 mime_header_encoder_block_collector(int c, void *data)
1627 {
1628 	size_t n;
1629 	struct mime_header_encoder_data *pe = (struct mime_header_encoder_data *)data;
1630 
1631 	switch (pe->status2) {
1632 	case 1:	/* encoded word */
1633 		pe->prevpos = pe->outdev.pos;
1634 		mbfl_convert_filter_copy(pe->conv2_filter, pe->conv2_filter_backup);
1635 		mbfl_convert_filter_copy(pe->encod_filter, pe->encod_filter_backup);
1636 		(*pe->conv2_filter->filter_function)(c, pe->conv2_filter);
1637 		(*pe->conv2_filter->filter_flush)(pe->conv2_filter);
1638 		(*pe->encod_filter->filter_flush)(pe->encod_filter);
1639 		n = pe->outdev.pos - pe->linehead + pe->firstindent;
1640 		pe->outdev.pos = pe->prevpos;
1641 		mbfl_convert_filter_copy(pe->conv2_filter_backup, pe->conv2_filter);
1642 		mbfl_convert_filter_copy(pe->encod_filter_backup, pe->encod_filter);
1643 		if (n >= 74) {
1644 			(*pe->conv2_filter->filter_flush)(pe->conv2_filter);
1645 			(*pe->encod_filter->filter_flush)(pe->encod_filter);
1646 			mbfl_memory_device_strncat(&pe->outdev, "\x3f\x3d", 2);	/* ?= */
1647 			mbfl_memory_device_strncat(&pe->outdev, pe->lwsp, pe->lwsplen);
1648 			pe->linehead = pe->outdev.pos;
1649 			pe->firstindent = 0;
1650 			mbfl_memory_device_strncat(&pe->outdev, pe->encname, pe->encnamelen);
1651 			c = (*pe->conv2_filter->filter_function)(c, pe->conv2_filter);
1652 		} else {
1653 			c = (*pe->conv2_filter->filter_function)(c, pe->conv2_filter);
1654 		}
1655 		break;
1656 
1657 	default:
1658 		mbfl_memory_device_strncat(&pe->outdev, pe->encname, pe->encnamelen);
1659 		c = (*pe->conv2_filter->filter_function)(c, pe->conv2_filter);
1660 		pe->status2 = 1;
1661 		break;
1662 	}
1663 
1664 	return 0;
1665 }
1666 
1667 static int
mime_header_encoder_collector(int c,void * data)1668 mime_header_encoder_collector(int c, void *data)
1669 {
1670 	static int qp_table[256] = {
1671 		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x00 */
1672 		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x00 */
1673 		1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x20 */
1674 		0, 0, 0, 0, 0, 0, 0 ,0, 0, 0, 0, 0, 0, 1, 0, 1, /* 0x10 */
1675 		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x40 */
1676 		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, /* 0x50 */
1677 		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x60 */
1678 		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, /* 0x70 */
1679 		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x80 */
1680 		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x90 */
1681 		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xA0 */
1682 		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xB0 */
1683 		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xC0 */
1684 		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xD0 */
1685 		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xE0 */
1686 		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1  /* 0xF0 */
1687 	};
1688 
1689 	size_t n;
1690 	struct mime_header_encoder_data *pe = (struct mime_header_encoder_data *)data;
1691 
1692 	switch (pe->status1) {
1693 	case 11:	/* encoded word */
1694 		(*pe->block_filter->filter_function)(c, pe->block_filter);
1695 		break;
1696 
1697 	default:	/* ASCII */
1698 		if (c <= 0x00ff && !qp_table[(c & 0xff)]) { /* ordinary characters */
1699 			mbfl_memory_device_output(c, &pe->tmpdev);
1700 			pe->status1 = 1;
1701 		} else if (pe->status1 == 0 && c == 0x20) {	/* repeat SPACE */
1702 			mbfl_memory_device_output(c, &pe->tmpdev);
1703 		} else {
1704 			if (pe->tmpdev.pos < 74 && c == 0x20) {
1705 				n = pe->outdev.pos - pe->linehead + pe->tmpdev.pos + pe->firstindent;
1706 				if (n > 74) {
1707 					mbfl_memory_device_strncat(&pe->outdev, pe->lwsp, pe->lwsplen);		/* LWSP */
1708 					pe->linehead = pe->outdev.pos;
1709 					pe->firstindent = 0;
1710 				} else if (pe->outdev.pos > 0) {
1711 					mbfl_memory_device_output(0x20, &pe->outdev);
1712 				}
1713 				mbfl_memory_device_devcat(&pe->outdev, &pe->tmpdev);
1714 				mbfl_memory_device_reset(&pe->tmpdev);
1715 				pe->status1 = 0;
1716 			} else {
1717 				n = pe->outdev.pos - pe->linehead + pe->encnamelen + pe->firstindent;
1718 				if (n > 60)  {
1719 					mbfl_memory_device_strncat(&pe->outdev, pe->lwsp, pe->lwsplen);		/* LWSP */
1720 					pe->linehead = pe->outdev.pos;
1721 					pe->firstindent = 0;
1722 				} else if (pe->outdev.pos > 0)  {
1723 					mbfl_memory_device_output(0x20, &pe->outdev);
1724 				}
1725 				mbfl_convert_filter_devcat(pe->block_filter, &pe->tmpdev);
1726 				mbfl_memory_device_reset(&pe->tmpdev);
1727 				(*pe->block_filter->filter_function)(c, pe->block_filter);
1728 				pe->status1 = 11;
1729 			}
1730 		}
1731 		break;
1732 	}
1733 
1734 	return 0;
1735 }
1736 
1737 mbfl_string *
mime_header_encoder_result(struct mime_header_encoder_data * pe,mbfl_string * result)1738 mime_header_encoder_result(struct mime_header_encoder_data *pe, mbfl_string *result)
1739 {
1740 	if (pe->status1 >= 10) {
1741 		(*pe->conv2_filter->filter_flush)(pe->conv2_filter);
1742 		(*pe->encod_filter->filter_flush)(pe->encod_filter);
1743 		mbfl_memory_device_strncat(&pe->outdev, "\x3f\x3d", 2);		/* ?= */
1744 	} else if (pe->tmpdev.pos > 0) {
1745 		if (pe->outdev.pos > 0) {
1746 			if ((pe->outdev.pos - pe->linehead + pe->tmpdev.pos + pe->firstindent) > 74) {
1747 				mbfl_memory_device_strncat(&pe->outdev, pe->lwsp, pe->lwsplen);
1748 			} else {
1749 				mbfl_memory_device_output(0x20, &pe->outdev);
1750 			}
1751 		}
1752 		mbfl_memory_device_devcat(&pe->outdev, &pe->tmpdev);
1753 	}
1754 	mbfl_memory_device_reset(&pe->tmpdev);
1755 	pe->prevpos = 0;
1756 	pe->linehead = 0;
1757 	pe->status1 = 0;
1758 	pe->status2 = 0;
1759 
1760 	return mbfl_memory_device_result(&pe->outdev, result);
1761 }
1762 
1763 struct mime_header_encoder_data*
mime_header_encoder_new(const mbfl_encoding * incode,const mbfl_encoding * outcode,const mbfl_encoding * transenc)1764 mime_header_encoder_new(
1765     const mbfl_encoding *incode,
1766     const mbfl_encoding *outcode,
1767     const mbfl_encoding *transenc)
1768 {
1769 	size_t n;
1770 	const char *s;
1771 	struct mime_header_encoder_data *pe;
1772 
1773 	/* get output encoding and check MIME charset name */
1774 	if (outcode->mime_name == NULL || outcode->mime_name[0] == '\0') {
1775 		return NULL;
1776 	}
1777 
1778 	pe = emalloc(sizeof(struct mime_header_encoder_data));
1779 	mbfl_memory_device_init(&pe->outdev, 0, 0);
1780 	mbfl_memory_device_init(&pe->tmpdev, 0, 0);
1781 	pe->prevpos = 0;
1782 	pe->linehead = 0;
1783 	pe->firstindent = 0;
1784 	pe->status1 = 0;
1785 	pe->status2 = 0;
1786 
1787 	/* make the encoding description string  exp. "=?ISO-2022-JP?B?" */
1788 	n = 0;
1789 	pe->encname[n++] = 0x3d;
1790 	pe->encname[n++] = 0x3f;
1791 	s = outcode->mime_name;
1792 	while (*s) {
1793 		pe->encname[n++] = *s++;
1794 	}
1795 	pe->encname[n++] = 0x3f;
1796 	if (transenc->no_encoding == mbfl_no_encoding_qprint) {
1797 		pe->encname[n++] = 0x51;
1798 	} else {
1799 		pe->encname[n++] = 0x42;
1800 		transenc = &mbfl_encoding_base64;
1801 	}
1802 	pe->encname[n++] = 0x3f;
1803 	pe->encname[n] = '\0';
1804 	pe->encnamelen = n;
1805 
1806 	n = 0;
1807 	pe->lwsp[n++] = 0x0d;
1808 	pe->lwsp[n++] = 0x0a;
1809 	pe->lwsp[n++] = 0x20;
1810 	pe->lwsp[n] = '\0';
1811 	pe->lwsplen = n;
1812 
1813 	/* transfer encode filter */
1814 	pe->encod_filter = mbfl_convert_filter_new(outcode, transenc, mbfl_memory_device_output, 0, &(pe->outdev));
1815 	pe->encod_filter_backup = mbfl_convert_filter_new(outcode, transenc, mbfl_memory_device_output, 0, &(pe->outdev));
1816 
1817 	/* Output code filter */
1818 	pe->conv2_filter = mbfl_convert_filter_new(&mbfl_encoding_wchar, outcode, mbfl_filter_output_pipe, 0, pe->encod_filter);
1819 	pe->conv2_filter_backup = mbfl_convert_filter_new(&mbfl_encoding_wchar, outcode, mbfl_filter_output_pipe, 0, pe->encod_filter);
1820 
1821 	/* encoded block filter */
1822 	pe->block_filter = mbfl_convert_filter_new(&mbfl_encoding_wchar, &mbfl_encoding_wchar, mime_header_encoder_block_collector, 0, pe);
1823 
1824 	/* Input code filter */
1825 	pe->conv1_filter = mbfl_convert_filter_new(incode, &mbfl_encoding_wchar, mime_header_encoder_collector, 0, pe);
1826 
1827 	if (pe->encod_filter == NULL ||
1828 	    pe->encod_filter_backup == NULL ||
1829 	    pe->conv2_filter == NULL ||
1830 	    pe->conv2_filter_backup == NULL ||
1831 	    pe->conv1_filter == NULL) {
1832 		mime_header_encoder_delete(pe);
1833 		return NULL;
1834 	}
1835 
1836 	if (transenc->no_encoding == mbfl_no_encoding_qprint) {
1837 		pe->encod_filter->status |= MBFL_QPRINT_STS_MIME_HEADER;
1838 		pe->encod_filter_backup->status |= MBFL_QPRINT_STS_MIME_HEADER;
1839 	} else {
1840 		pe->encod_filter->status |= MBFL_BASE64_STS_MIME_HEADER;
1841 		pe->encod_filter_backup->status |= MBFL_BASE64_STS_MIME_HEADER;
1842 	}
1843 
1844 	return pe;
1845 }
1846 
1847 void
mime_header_encoder_delete(struct mime_header_encoder_data * pe)1848 mime_header_encoder_delete(struct mime_header_encoder_data *pe)
1849 {
1850 	if (pe) {
1851 		mbfl_convert_filter_delete(pe->conv1_filter);
1852 		mbfl_convert_filter_delete(pe->block_filter);
1853 		mbfl_convert_filter_delete(pe->conv2_filter);
1854 		mbfl_convert_filter_delete(pe->conv2_filter_backup);
1855 		mbfl_convert_filter_delete(pe->encod_filter);
1856 		mbfl_convert_filter_delete(pe->encod_filter_backup);
1857 		mbfl_memory_device_clear(&pe->outdev);
1858 		mbfl_memory_device_clear(&pe->tmpdev);
1859 		efree((void*)pe);
1860 	}
1861 }
1862 
1863 mbfl_string *
mbfl_mime_header_encode(mbfl_string * string,mbfl_string * result,const mbfl_encoding * outcode,const mbfl_encoding * encoding,const char * linefeed,int indent)1864 mbfl_mime_header_encode(
1865     mbfl_string *string,
1866     mbfl_string *result,
1867     const mbfl_encoding *outcode,
1868     const mbfl_encoding *encoding,
1869     const char *linefeed,
1870     int indent)
1871 {
1872 	size_t n;
1873 	unsigned char *p;
1874 	struct mime_header_encoder_data *pe;
1875 
1876 	mbfl_string_init(result);
1877 	result->encoding = &mbfl_encoding_ascii;
1878 
1879 	pe = mime_header_encoder_new(string->encoding, outcode, encoding);
1880 	if (pe == NULL) {
1881 		return NULL;
1882 	}
1883 
1884 	if (linefeed != NULL) {
1885 		n = 0;
1886 		while (*linefeed && n < 8) {
1887 			pe->lwsp[n++] = *linefeed++;
1888 		}
1889 		pe->lwsp[n++] = 0x20;
1890 		pe->lwsp[n] = '\0';
1891 		pe->lwsplen = n;
1892 	}
1893 	if (indent > 0 && indent < 74) {
1894 		pe->firstindent = indent;
1895 	}
1896 
1897 	n = string->len;
1898 	p = string->val;
1899 	while (n > 0) {
1900 		(*pe->conv1_filter->filter_function)(*p++, pe->conv1_filter);
1901 		n--;
1902 	}
1903 
1904 	result = mime_header_encoder_result(pe, result);
1905 	mime_header_encoder_delete(pe);
1906 
1907 	return result;
1908 }
1909 
1910 
1911 /*
1912  *  MIME header decode
1913  */
/*
 * State of one MIME header decoding run.
 */
struct mime_header_decoder_data {
	mbfl_convert_filter *deco_filter;  /* transfer decoding; reset to (encoding -> 8bit) when an encoded word starts */
	mbfl_convert_filter *conv1_filter; /* incode -> wchar; piped into conv2_filter */
	mbfl_convert_filter *conv2_filter; /* wchar -> outcode; writes into outdev */
	mbfl_memory_device outdev;         /* decoded output */
	mbfl_memory_device tmpdev;         /* bytes held back until it is known whether they start an encoded word */
	size_t cspos;                      /* position in tmpdev at which the charset name begins */
	int status;                        /* state of mime_header_decoder_collector */
	const mbfl_encoding *encoding;     /* transfer encoding of the current encoded word (base64 or qprint) */
	const mbfl_encoding *incode;       /* charset of the current encoded word */
	const mbfl_encoding *outcode;      /* encoding of the decoded output */
};
1926 
/*
 * State machine that decodes a MIME header one character at a time.
 *
 * Bytes that might start an encoded word ("=?charset?X?text?=") are held in
 * pd->tmpdev; once it turns out that they do not, they are pushed into
 * conv1_filter as plain text.
 *
 * States (pd->status):
 *   0  plain text (also any value not listed below)
 *   1  '=' seen
 *   2  collecting the charset name
 *   3  expecting the transfer-encoding letter (B or Q)
 *   4  expecting the '?' that follows the letter
 *   5  inside the encoded text
 *   6  '?' seen inside the encoded text
 *   7  just after the closing "?="
 *   8  line break seen after an encoded word
 *   9  line break seen elsewhere
 *
 * Always returns 0.
 */
static int
mime_header_decoder_collector(int c, void* data)
{
	const mbfl_encoding *encoding;
	struct mime_header_decoder_data *pd = (struct mime_header_decoder_data*)data;

	switch (pd->status) {
	case 1:
		if (c == 0x3f) {		/* ? */
			mbfl_memory_device_output(c, &pd->tmpdev);
			/* the charset name starts right after "=?" */
			pd->cspos = pd->tmpdev.pos;
			pd->status = 2;
		} else {
			/* not an encoded word: release what was held back */
			mbfl_convert_filter_devcat(pd->conv1_filter, &pd->tmpdev);
			mbfl_memory_device_reset(&pd->tmpdev);
			if (c == 0x3d) {		/* = */
				mbfl_memory_device_output(c, &pd->tmpdev);
			} else if (c == 0x0d || c == 0x0a) {	/* CR or LF */
				pd->status = 9;
			} else {
				(*pd->conv1_filter->filter_function)(c, pd->conv1_filter);
				pd->status = 0;
			}
		}
		break;
	case 2:		/* store charset string */
		if (c == 0x3f) {		/* ? */
			/* identify charset */
			/* terminate the name temporarily so that it can be looked up */
			mbfl_memory_device_output('\0', &pd->tmpdev);
			encoding = mbfl_name2encoding((const char *)&pd->tmpdev.buffer[pd->cspos]);
			if (encoding != NULL) {
				pd->incode = encoding;
				pd->status = 3;
			}
			/* drop the terminator again and keep the '?' */
			mbfl_memory_device_unput(&pd->tmpdev);
			mbfl_memory_device_output(c, &pd->tmpdev);
		} else {
			mbfl_memory_device_output(c, &pd->tmpdev);
			if (pd->tmpdev.pos > 100) {		/* too long charset string */
				pd->status = 0;
			} else if (c == 0x0d || c == 0x0a) {	/* CR or LF */
				mbfl_memory_device_unput(&pd->tmpdev);
				pd->status = 9;
			}
			if (pd->status != 2) {
				/* gave up on the encoded word: emit the held bytes as plain text */
				mbfl_convert_filter_devcat(pd->conv1_filter, &pd->tmpdev);
				mbfl_memory_device_reset(&pd->tmpdev);
			}
		}
		break;
	case 3:		/* identify encoding */
		mbfl_memory_device_output(c, &pd->tmpdev);
		if (c == 0x42 || c == 0x62) {		/* 'B' or 'b' */
			pd->encoding = &mbfl_encoding_base64;
			pd->status = 4;
		} else if (c == 0x51 || c == 0x71) {	/* 'Q' or 'q' */
			pd->encoding = &mbfl_encoding_qprint;
			pd->status = 4;
		} else {
			if (c == 0x0d || c == 0x0a) {	/* CR or LF */
				mbfl_memory_device_unput(&pd->tmpdev);
				pd->status = 9;
			} else {
				pd->status = 0;
			}
			mbfl_convert_filter_devcat(pd->conv1_filter, &pd->tmpdev);
			mbfl_memory_device_reset(&pd->tmpdev);
		}
		break;
	case 4:		/* reset filter */
		mbfl_memory_device_output(c, &pd->tmpdev);
		if (c == 0x3f) {		/* ? */
			/* charset convert filter */
			mbfl_convert_filter_reset(pd->conv1_filter, pd->incode, &mbfl_encoding_wchar);
			/* decode filter */
			mbfl_convert_filter_reset(pd->deco_filter, pd->encoding, &mbfl_encoding_8bit);
			pd->status = 5;
		} else {
			if (c == 0x0d || c == 0x0a) {	/* CR or LF */
				mbfl_memory_device_unput(&pd->tmpdev);
				pd->status = 9;
			} else {
				pd->status = 0;
			}
			mbfl_convert_filter_devcat(pd->conv1_filter, &pd->tmpdev);
		}
		/* the held bytes are either the consumed prefix or were just emitted */
		mbfl_memory_device_reset(&pd->tmpdev);
		break;
	case 5:		/* encoded block */
		if (c == 0x3f) {		/* ? */
			pd->status = 6;
		} else {
			(*pd->deco_filter->filter_function)(c, pd->deco_filter);
		}
		break;
	case 6:		/* check end position */
		if (c == 0x3d) {		/* = */
			/* flush and reset filter */
			(*pd->deco_filter->filter_flush)(pd->deco_filter);
			(*pd->conv1_filter->filter_flush)(pd->conv1_filter);
			mbfl_convert_filter_reset(pd->conv1_filter, &mbfl_encoding_ascii, &mbfl_encoding_wchar);
			pd->status = 7;
		} else {
			/* the '?' was part of the encoded text after all */
			(*pd->deco_filter->filter_function)(0x3f, pd->deco_filter);
			if (c != 0x3f) {		/* ? */
				(*pd->deco_filter->filter_function)(c, pd->deco_filter);
				pd->status = 5;
			}
		}
		break;
	case 7:		/* after encoded block */
		if (c == 0x0d || c == 0x0a) {	/* CR LF */
			pd->status = 8;
		} else {
			mbfl_memory_device_output(c, &pd->tmpdev);
			if (c == 0x3d) {		/* = */
				pd->status = 1;
			} else if (c != 0x20 && c != 0x09) {		/* not space */
				mbfl_convert_filter_devcat(pd->conv1_filter, &pd->tmpdev);
				mbfl_memory_device_reset(&pd->tmpdev);
				pd->status = 0;
			}
		}
		break;
	case 8:		/* folding */
	case 9:		/* folding */
		/* CR, LF, SPACE and TAB are skipped while in a folding state */
		if (c != 0x0d && c != 0x0a && c != 0x20 && c != 0x09) {
			if (c == 0x3d) {		/* = */
				if (pd->status == 8) {
					/* after an encoded word: hold the space back with the '=' */
					mbfl_memory_device_output(0x20, &pd->tmpdev);	/* SPACE */
				} else {
					(*pd->conv1_filter->filter_function)(0x20, pd->conv1_filter);
				}
				mbfl_memory_device_output(c, &pd->tmpdev);
				pd->status = 1;
			} else {
				/* the fold is replaced by a single space */
				mbfl_memory_device_output(0x20, &pd->tmpdev);
				mbfl_memory_device_output(c, &pd->tmpdev);
				mbfl_convert_filter_devcat(pd->conv1_filter, &pd->tmpdev);
				mbfl_memory_device_reset(&pd->tmpdev);
				pd->status = 0;
			}
		}
		break;
	default:		/* non encoded block */
		if (c == 0x0d || c == 0x0a) {	/* CR LF */
			pd->status = 9;
		} else if (c == 0x3d) {		/* = */
			mbfl_memory_device_output(c, &pd->tmpdev);
			pd->status = 1;
		} else {
			(*pd->conv1_filter->filter_function)(c, pd->conv1_filter);
		}
		break;
	}

	return 0;
}
2085 
2086 mbfl_string *
mime_header_decoder_result(struct mime_header_decoder_data * pd,mbfl_string * result)2087 mime_header_decoder_result(struct mime_header_decoder_data *pd, mbfl_string *result)
2088 {
2089 	switch (pd->status) {
2090 	case 1:
2091 	case 2:
2092 	case 3:
2093 	case 4:
2094 	case 7:
2095 	case 8:
2096 	case 9:
2097 		mbfl_convert_filter_devcat(pd->conv1_filter, &pd->tmpdev);
2098 		break;
2099 	case 5:
2100 	case 6:
2101 		(*pd->deco_filter->filter_flush)(pd->deco_filter);
2102 		(*pd->conv1_filter->filter_flush)(pd->conv1_filter);
2103 		break;
2104 	}
2105 	(*pd->conv2_filter->filter_flush)(pd->conv2_filter);
2106 	mbfl_memory_device_reset(&pd->tmpdev);
2107 	pd->status = 0;
2108 
2109 	return mbfl_memory_device_result(&pd->outdev, result);
2110 }
2111 
2112 struct mime_header_decoder_data*
mime_header_decoder_new(const mbfl_encoding * outcode)2113 mime_header_decoder_new(const mbfl_encoding *outcode)
2114 {
2115 	struct mime_header_decoder_data *pd = emalloc(sizeof(struct mime_header_decoder_data));
2116 
2117 	mbfl_memory_device_init(&pd->outdev, 0, 0);
2118 	mbfl_memory_device_init(&pd->tmpdev, 0, 0);
2119 	pd->cspos = 0;
2120 	pd->status = 0;
2121 	pd->encoding = &mbfl_encoding_8bit;
2122 	pd->incode = &mbfl_encoding_ascii;
2123 	pd->outcode = outcode;
2124 	/* charset convert filter */
2125 	pd->conv2_filter = mbfl_convert_filter_new(&mbfl_encoding_wchar, pd->outcode, mbfl_memory_device_output, 0, &pd->outdev);
2126 	pd->conv1_filter = mbfl_convert_filter_new(pd->incode, &mbfl_encoding_wchar, mbfl_filter_output_pipe, 0, pd->conv2_filter);
2127 	/* decode filter */
2128 	pd->deco_filter = mbfl_convert_filter_new(pd->encoding, &mbfl_encoding_8bit, mbfl_filter_output_pipe, 0, pd->conv1_filter);
2129 
2130 	if (pd->conv1_filter == NULL || pd->conv2_filter == NULL || pd->deco_filter == NULL) {
2131 		mime_header_decoder_delete(pd);
2132 		return NULL;
2133 	}
2134 
2135 	return pd;
2136 }
2137 
2138 void
mime_header_decoder_delete(struct mime_header_decoder_data * pd)2139 mime_header_decoder_delete(struct mime_header_decoder_data *pd)
2140 {
2141 	if (pd) {
2142 		mbfl_convert_filter_delete(pd->conv2_filter);
2143 		mbfl_convert_filter_delete(pd->conv1_filter);
2144 		mbfl_convert_filter_delete(pd->deco_filter);
2145 		mbfl_memory_device_clear(&pd->outdev);
2146 		mbfl_memory_device_clear(&pd->tmpdev);
2147 		efree((void*)pd);
2148 	}
2149 }
2150 
2151 mbfl_string *
mbfl_mime_header_decode(mbfl_string * string,mbfl_string * result,const mbfl_encoding * outcode)2152 mbfl_mime_header_decode(
2153     mbfl_string *string,
2154     mbfl_string *result,
2155     const mbfl_encoding *outcode)
2156 {
2157 	size_t n;
2158 	unsigned char *p;
2159 	struct mime_header_decoder_data *pd;
2160 
2161 	mbfl_string_init(result);
2162 	result->encoding = outcode;
2163 
2164 	pd = mime_header_decoder_new(outcode);
2165 	if (pd == NULL) {
2166 		return NULL;
2167 	}
2168 
2169 	/* feed data */
2170 	n = string->len;
2171 	p = string->val;
2172 	while (n > 0) {
2173 		mime_header_decoder_collector(*p++, pd);
2174 		n--;
2175 	}
2176 
2177 	result = mime_header_decoder_result(pd, result);
2178 	mime_header_decoder_delete(pd);
2179 
2180 	return result;
2181 }
2182 
2183 
2184 
2185 /*
2186  *  convert HTML numeric entity
2187  */
/*
 * State shared by the HTML numeric entity collectors in this file.
 * The encoding collectors only read decoder, convmap and mapsize; the
 * decoding collector additionally keeps its parser state in status,
 * cache and digit.
 */
struct collector_htmlnumericentity_data {
	mbfl_convert_filter *decoder;	/* downstream filter receiving every emitted code point */
	int status;	/* decoding parser state; 0 means "not inside a candidate entity" */
	int cache;	/* numeric value accumulated from the digits read so far */
	int digit;	/* number of digits accumulated into cache */
	int *convmap;	/* conversion map, four ints per entry: low, high, offset, mask */
	int mapsize;	/* number of four-int entries in convmap */
};
2196 
2197 static int
collector_encode_htmlnumericentity(int c,void * data)2198 collector_encode_htmlnumericentity(int c, void *data)
2199 {
2200 	struct collector_htmlnumericentity_data *pc = (struct collector_htmlnumericentity_data *)data;
2201 	int f, n, s, r, d, size, *mapelm;
2202 
2203 	size = pc->mapsize;
2204 	f = 0;
2205 	n = 0;
2206 	while (n < size) {
2207 		mapelm = &(pc->convmap[n*4]);
2208 		if (c >= mapelm[0] && c <= mapelm[1]) {
2209 			s = (c + mapelm[2]) & mapelm[3];
2210 			if (s >= 0) {
2211 				(*pc->decoder->filter_function)(0x26, pc->decoder);	/* '&' */
2212 				(*pc->decoder->filter_function)(0x23, pc->decoder);	/* '#' */
2213 				r = 100000000;
2214 				s %= r;
2215 				while (r > 0) {
2216 					d = s/r;
2217 					if (d || f) {
2218 						f = 1;
2219 						s %= r;
2220 						(*pc->decoder->filter_function)(mbfl_hexchar_table[d], pc->decoder);
2221 					}
2222 					r /= 10;
2223 				}
2224 				if (!f) {
2225 					f = 1;
2226 					(*pc->decoder->filter_function)(mbfl_hexchar_table[0], pc->decoder);
2227 				}
2228 				(*pc->decoder->filter_function)(0x3b, pc->decoder);		/* ';' */
2229 			}
2230 		}
2231 		if (f) {
2232 			break;
2233 		}
2234 		n++;
2235 	}
2236 	if (!f) {
2237 		(*pc->decoder->filter_function)(c, pc->decoder);
2238 	}
2239 
2240 	return 0;
2241 }
2242 
2243 static int
collector_decode_htmlnumericentity(int c,void * data)2244 collector_decode_htmlnumericentity(int c, void *data)
2245 {
2246 	struct collector_htmlnumericentity_data *pc = (struct collector_htmlnumericentity_data *)data;
2247 	int f, n, s, r, d, size, *mapelm;
2248 
2249 	switch (pc->status) {
2250 	case 1:
2251 		if (c == 0x23) {	/* '#' */
2252 			pc->status = 2;
2253 		} else {
2254 			pc->status = 0;
2255 			(*pc->decoder->filter_function)(0x26, pc->decoder);		/* '&' */
2256 			(*pc->decoder->filter_function)(c, pc->decoder);
2257 		}
2258 		break;
2259 	case 2:
2260 		if (c == 0x78) {	/* 'x' */
2261 			pc->status = 4;
2262 		} else if (c >= 0x30 && c <= 0x39) { /* '0' - '9' */
2263 			pc->cache = c - 0x30;
2264 			pc->status = 3;
2265 			pc->digit = 1;
2266 		} else {
2267 			pc->status = 0;
2268 			(*pc->decoder->filter_function)(0x26, pc->decoder);		/* '&' */
2269 			(*pc->decoder->filter_function)(0x23, pc->decoder);		/* '#' */
2270 			(*pc->decoder->filter_function)(c, pc->decoder);
2271 		}
2272 		break;
2273 	case 3:
2274 		s = 0;
2275 		f = 0;
2276 		if (c >= 0x30 && c <= 0x39) {	/* '0' - '9' */
2277 			s = pc->cache;
2278 			if (pc->digit > 9 || s > INT_MAX/10) {
2279 				pc->status = 0;
2280 				f = 1;
2281 			} else {
2282 				s = s*10 + (c - 0x30);
2283 				pc->cache = s;
2284 				pc->digit++;
2285 			}
2286 		} else {
2287 			pc->status = 0;
2288 			s = pc->cache;
2289 			f = 1;
2290 			n = 0;
2291 			size = pc->mapsize;
2292 			while (n < size) {
2293 				mapelm = &(pc->convmap[n*4]);
2294 				d = s - mapelm[2];
2295 				if (d >= mapelm[0] && d <= mapelm[1]) {
2296 					f = 0;
2297 					(*pc->decoder->filter_function)(d, pc->decoder);
2298 					if (c != 0x3b) {	/* ';' */
2299 						(*pc->decoder->filter_function)(c, pc->decoder);
2300 					}
2301 					break;
2302 				}
2303 				n++;
2304 			}
2305 		}
2306 		if (f) {
2307 			(*pc->decoder->filter_function)(0x26, pc->decoder);		/* '&' */
2308 			(*pc->decoder->filter_function)(0x23, pc->decoder);		/* '#' */
2309 			r = 1;
2310 			n = pc->digit;
2311 			while (n > 1) {
2312 				r *= 10;
2313 				n--;
2314 			}
2315 			while (r > 0) {
2316 				d = s/r;
2317 				s %= r;
2318 				r /= 10;
2319 				(*pc->decoder->filter_function)(mbfl_hexchar_table[d], pc->decoder);
2320 			}
2321 			(*pc->decoder->filter_function)(c, pc->decoder);
2322 		}
2323 		break;
2324 	case 4:
2325 		if (c >= 0x30 && c <= 0x39) { /* '0' - '9' */
2326 			pc->cache = c - 0x30;
2327 			pc->status = 5;
2328 			pc->digit = 1;
2329 		} else if (c >= 0x41 && c <= 0x46) { /* 'A' - 'F'  */
2330 			pc->cache = c - 0x41 + 10;
2331 			pc->status = 5;
2332 			pc->digit = 1;
2333 		} else if (c >= 0x61 && c <= 0x66) { /* 'a' - 'f'  */
2334 			pc->cache = c - 0x61 + 10;
2335 			pc->status = 5;
2336 			pc->digit = 1;
2337 		} else {
2338 			pc->status = 0;
2339 			(*pc->decoder->filter_function)(0x26, pc->decoder);		/* '&' */
2340 			(*pc->decoder->filter_function)(0x23, pc->decoder);		/* '#' */
2341 			(*pc->decoder->filter_function)(0x78, pc->decoder);		/* 'x' */
2342 			(*pc->decoder->filter_function)(c, pc->decoder);
2343 		}
2344 		break;
2345 	case 5:
2346 		s = 0;
2347 		f = 0;
2348 		if ((c >= 0x30 && c <= 0x39) ||
2349 			(c >= 0x41 && c <= 0x46) ||
2350 			(c >= 0x61 && c <= 0x66)) {	/* '0' - '9' or 'a' - 'f'  */
2351 			if (pc->digit > 9) {
2352 				pc->status = 0;
2353 				s = pc->cache;
2354 				f = 1;
2355 			} else {
2356 				if (c >= 0x30 && c <= 0x39) {
2357 					s = pc->cache*16 + (c - 0x30);
2358 				} else if (c >= 0x41 && c <= 0x46)  {
2359 					s = pc->cache*16 + (c - 0x41 + 10);
2360 				} else {
2361 					s = pc->cache*16 + (c - 0x61 + 10);
2362 				}
2363 				pc->cache = s;
2364 				pc->digit++;
2365 			}
2366 		} else {
2367 			pc->status = 0;
2368 			s = pc->cache;
2369 			f = 1;
2370 			n = 0;
2371 			size = pc->mapsize;
2372 			while (n < size) {
2373 				mapelm = &(pc->convmap[n*4]);
2374 				d = s - mapelm[2];
2375 				if (d >= mapelm[0] && d <= mapelm[1]) {
2376 					f = 0;
2377 					(*pc->decoder->filter_function)(d, pc->decoder);
2378 					if (c != 0x3b) {	/* ';' */
2379 						(*pc->decoder->filter_function)(c, pc->decoder);
2380 					}
2381 					break;
2382 				}
2383 				n++;
2384 			}
2385 		}
2386 		if (f) {
2387 			(*pc->decoder->filter_function)(0x26, pc->decoder);		/* '&' */
2388 			(*pc->decoder->filter_function)(0x23, pc->decoder);		/* '#' */
2389 			(*pc->decoder->filter_function)(0x78, pc->decoder);		/* 'x' */
2390 			r = 1;
2391 			n = pc->digit;
2392 			while (n > 0) {
2393 				r *= 16;
2394 				n--;
2395 			}
2396 			s %= r;
2397 			r /= 16;
2398 			while (r > 0) {
2399 				d = s/r;
2400 				s %= r;
2401 				r /= 16;
2402 				(*pc->decoder->filter_function)(mbfl_hexchar_table[d], pc->decoder);
2403 			}
2404 			(*pc->decoder->filter_function)(c, pc->decoder);
2405 		}
2406 		break;
2407 	default:
2408 		if (c == 0x26) {	/* '&' */
2409 			pc->status = 1;
2410 		} else {
2411 			(*pc->decoder->filter_function)(c, pc->decoder);
2412 		}
2413 		break;
2414 	}
2415 
2416 	return 0;
2417 }
2418 
2419 static int
collector_encode_hex_htmlnumericentity(int c,void * data)2420 collector_encode_hex_htmlnumericentity(int c, void *data)
2421 {
2422 	struct collector_htmlnumericentity_data *pc = (struct collector_htmlnumericentity_data *)data;
2423 	int f, n, s, r, d, size, *mapelm;
2424 
2425 	size = pc->mapsize;
2426 	f = 0;
2427 	n = 0;
2428 	while (n < size) {
2429 		mapelm = &(pc->convmap[n*4]);
2430 		if (c >= mapelm[0] && c <= mapelm[1]) {
2431 			s = (c + mapelm[2]) & mapelm[3];
2432 			if (s >= 0) {
2433 				(*pc->decoder->filter_function)(0x26, pc->decoder);	/* '&' */
2434 				(*pc->decoder->filter_function)(0x23, pc->decoder);	/* '#' */
2435 				(*pc->decoder->filter_function)(0x78, pc->decoder);	/* 'x' */
2436 				r = 0x1000000;
2437 				s %= r;
2438 				while (r > 0) {
2439 					d = s/r;
2440 					if (d || f) {
2441 						f = 1;
2442 						s %= r;
2443 						(*pc->decoder->filter_function)(mbfl_hexchar_table[d], pc->decoder);
2444 					}
2445 					r /= 16;
2446 				}
2447 				if (!f) {
2448 					f = 1;
2449 					(*pc->decoder->filter_function)(mbfl_hexchar_table[0], pc->decoder);
2450 				}
2451 				(*pc->decoder->filter_function)(0x3b, pc->decoder);		/* ';' */
2452 			}
2453 		}
2454 		if (f) {
2455 			break;
2456 		}
2457 		n++;
2458 	}
2459 	if (!f) {
2460 		(*pc->decoder->filter_function)(c, pc->decoder);
2461 	}
2462 
2463 	return 0;
2464 }
2465 
/*
 * Flush handler for the numeric entity decoding collector.
 *
 * If the input ended in the middle of a candidate reference, the part read
 * so far ("&", "&#", "&#x" and any digits, regenerated from pc->cache and
 * pc->digit via mbfl_hexchar_table) is emitted literally.  The parser state
 * is then reset.
 *
 * NOTE(review): the argument is treated as a pointer to
 * struct collector_htmlnumericentity_data, not as a convert filter; in this
 * file the function is registered through a flush_function_t cast together
 * with that structure as callback data -- confirm against the filter API.
 *
 * Always returns 0.
 */
int mbfl_filt_decode_htmlnumericentity_flush(mbfl_convert_filter *filter)
{
	struct collector_htmlnumericentity_data *pc = (struct collector_htmlnumericentity_data *)filter;
	int n, s, r, d;

	switch (pc->status) {
	case 1: /* '&' */
		(*pc->decoder->filter_function)(0x26, pc->decoder);		/* '&' */
		break;
	case 2: /* '#' */
		(*pc->decoder->filter_function)(0x26, pc->decoder);		/* '&' */
		(*pc->decoder->filter_function)(0x23, pc->decoder);		/* '#' */
		break;
	case 3: /* '0'-'9' */
		(*pc->decoder->filter_function)(0x26, pc->decoder);		/* '&' */
		(*pc->decoder->filter_function)(0x23, pc->decoder);		/* '#' */

		s = pc->cache;
		/* r = 10^(digit-1): weight of the most significant digit read */
		r = 1;
		n = pc->digit;
		while (n > 1) {
			r *= 10;
			n--;
		}
		while (r > 0) {
			d = s/r;
			s %= r;
			r /= 10;
			(*pc->decoder->filter_function)(mbfl_hexchar_table[d], pc->decoder);
		}
		break;
	case 4: /* 'x' */
		(*pc->decoder->filter_function)(0x26, pc->decoder);		/* '&' */
		(*pc->decoder->filter_function)(0x23, pc->decoder);		/* '#' */
		(*pc->decoder->filter_function)(0x78, pc->decoder);		/* 'x' */
		break;
	case 5: /* '0'-'9','a'-'f' */
		(*pc->decoder->filter_function)(0x26, pc->decoder);		/* '&' */
		(*pc->decoder->filter_function)(0x23, pc->decoder);		/* '#' */
		(*pc->decoder->filter_function)(0x78, pc->decoder);		/* 'x' */

		/*
		 * Emit the low pc->digit hex digits of the cached value, most
		 * significant first.  Nibbles are taken with shifts on an unsigned
		 * 64-bit value: the former 16^digit divisor overflowed int for
		 * eight or more digits and could end up as a zero divisor.
		 */
		s = pc->cache;
		for (n = pc->digit - 1; n >= 0; n--) {
			d = (int)(((unsigned long long)(unsigned int)s >> (4*n)) & 0xf);
			(*pc->decoder->filter_function)(mbfl_hexchar_table[d], pc->decoder);
		}
		break;
	default:	/* 0: nothing pending */
		break;
	}

	pc->status = 0;
	pc->cache = 0;
	pc->digit = 0;

	return 0;
}
2536 
2537 
2538 mbfl_string *
mbfl_html_numeric_entity(mbfl_string * string,mbfl_string * result,int * convmap,int mapsize,int type)2539 mbfl_html_numeric_entity(
2540     mbfl_string *string,
2541     mbfl_string *result,
2542     int *convmap,
2543     int mapsize,
2544     int type)
2545 {
2546 	struct collector_htmlnumericentity_data pc;
2547 	mbfl_memory_device device;
2548 	mbfl_convert_filter *encoder;
2549 	size_t n;
2550 	unsigned char *p;
2551 
2552 	if (string == NULL || result == NULL) {
2553 		return NULL;
2554 	}
2555 	mbfl_string_init(result);
2556 	result->encoding = string->encoding;
2557 	mbfl_memory_device_init(&device, string->len, 0);
2558 
2559 	/* output code filter */
2560 	pc.decoder = mbfl_convert_filter_new(
2561 	    &mbfl_encoding_wchar,
2562 	    string->encoding,
2563 	    mbfl_memory_device_output, 0, &device);
2564 	/* wchar filter */
2565 	if (type == 0) { /* decimal output */
2566 		encoder = mbfl_convert_filter_new(
2567 		    string->encoding,
2568 		    &mbfl_encoding_wchar,
2569 		    collector_encode_htmlnumericentity, 0, &pc);
2570 	} else if (type == 2) { /* hex output */
2571 		encoder = mbfl_convert_filter_new(
2572 		    string->encoding,
2573 		    &mbfl_encoding_wchar,
2574 		    collector_encode_hex_htmlnumericentity, 0, &pc);
2575 	} else { /* type == 1: decimal/hex input */
2576 		encoder = mbfl_convert_filter_new(
2577 		    string->encoding,
2578 		    &mbfl_encoding_wchar,
2579 		    collector_decode_htmlnumericentity,
2580 		    (flush_function_t)mbfl_filt_decode_htmlnumericentity_flush, &pc);
2581 	}
2582 	if (pc.decoder == NULL || encoder == NULL) {
2583 		mbfl_convert_filter_delete(encoder);
2584 		mbfl_convert_filter_delete(pc.decoder);
2585 		return NULL;
2586 	}
2587 	pc.status = 0;
2588 	pc.cache = 0;
2589 	pc.digit = 0;
2590 	pc.convmap = convmap;
2591 	pc.mapsize = mapsize;
2592 
2593 	/* feed data */
2594 	p = string->val;
2595 	n = string->len;
2596 	if (p != NULL) {
2597 		while (n > 0) {
2598 			if ((*encoder->filter_function)(*p++, encoder) < 0) {
2599 				break;
2600 			}
2601 			n--;
2602 		}
2603 	}
2604 	mbfl_convert_filter_flush(encoder);
2605 	mbfl_convert_filter_flush(pc.decoder);
2606 	result = mbfl_memory_device_result(&device, result);
2607 	mbfl_convert_filter_delete(encoder);
2608 	mbfl_convert_filter_delete(pc.decoder);
2609 
2610 	return result;
2611 }
2612