xref: /PHP-8.2/ext/mbstring/libmbfl/mbfl/mbfilter.c (revision aa553af9)
1 /*
2  * charset=UTF-8
3  */
4 
5 /*
6  * "streamable kanji code filter and converter"
7  *
8  * Copyright (c) 1998,1999,2000,2001 HappySize, Inc. All rights reserved.
9  *
10  * This software is released under the GNU Lesser General Public License.
11  * (Version 2.1, February 1999)
12  * Please read the following detail of the licence (in japanese).
13  *
14  * ◆使用許諾条件◆
15  *
16  * このソフトウェアは株式会社ハッピーサイズによって開発されました。株式会社ハッ
17  * ピーサイズは、著作権法および万国著作権条約の定めにより、このソフトウェアに関
18  * するすべての権利を留保する権利を持ち、ここに行使します。株式会社ハッピーサイ
19  * ズは以下に明記した条件に従って、このソフトウェアを使用する排他的ではない権利
20  * をお客様に許諾します。何人たりとも、以下の条件に反してこのソフトウェアを使用
21  * することはできません。
22  *
23  * このソフトウェアを「GNU Lesser General Public License (Version 2.1, February
24  * 1999)」に示された条件で使用することを、全ての方に許諾します。「GNU Lesser
25  * General Public License」を満たさない使用には、株式会社ハッピーサイズから書面
26  * による許諾を得る必要があります。
27  *
28  * 「GNU Lesser General Public License」の全文は以下のウェブページから取得でき
29  * ます。「GNU Lesser General Public License」とは、これまでLibrary General
30  * Public Licenseと呼ばれていたものです。
31  *     http://www.gnu.org/ --- GNUウェブサイト
32  *     http://www.gnu.org/copyleft/lesser.html --- ライセンス文面
33  * このライセンスの内容がわからない方、守れない方には使用を許諾しません。
34  *
35  * しかしながら、当社とGNUプロジェクトとの特定の関係を示唆または主張するもので
36  * はありません。
37  *
38  * ◆保証内容◆
39  *
40  * このソフトウェアは、期待された動作・機能・性能を持つことを目標として設計され
41  * 開発されていますが、これを保証するものではありません。このソフトウェアは「こ
42  * のまま」の状態で提供されており、たとえばこのソフトウェアの有用性ないし特定の
43  * 目的に合致することといった、何らかの保証内容が、明示されたり暗黙に示されてい
44  * る場合であっても、その保証は無効です。このソフトウェアを使用した結果ないし使
45  * 用しなかった結果によって、直接あるいは間接に受けた身体的な傷害、財産上の損害
46  * 、データの損失あるいはその他の全ての損害については、その損害の可能性が使用者
47  * 、当社あるいは第三者によって警告されていた場合であっても、当社はその損害の賠
48  * 償および補填を行いません。この規定は他の全ての、書面上または書面に無い保証・
49  * 契約・規定に優先します。
50  *
51  * ◆著作権者の連絡先および使用条件についての問い合わせ先◆
52  *
53  * 〒102-0073
54  * 東京都千代田区九段北1-13-5日本地所第一ビル4F
55  * 株式会社ハッピーサイズ
56  * Phone: 03-3512-3655, Fax: 03-3512-3656
57  * Email: sales@happysize.co.jp
58  * Web: http://happysize.com/
59  *
60  * ◆著者◆
61  *
62  * 金本 茂 <sgk@happysize.co.jp>
63  *
64  * ◆履歴◆
65  *
66  * 1998/11/10 sgk implementation in C++
67  * 1999/4/25  sgk Cで書きなおし。
68  * 1999/4/26  sgk 入力フィルタを実装。漢字コードを推定しながらフィルタを追加。
69  * 1999/6/??      Unicodeサポート。
70  * 1999/6/22  sgk ライセンスをLGPLに変更。
71  *
72  */
73 
74 /*
75  * Unicode support
76  *
77  * Portions copyright (c) 1999,2000,2001 by the PHP3 internationalization team.
78  * All rights reserved.
79  *
80  */
81 
82 #include <stddef.h>
83 #include <string.h>
84 
85 #include "mbfilter.h"
86 #include "mbfl_filter_output.h"
87 #include "mbfilter_8bit.h"
88 #include "mbfilter_wchar.h"
89 #include "mbstring.h"
90 #include "php_unicode.h"
91 #include "filters/mbfilter_base64.h"
92 #include "filters/mbfilter_qprint.h"
93 #include "filters/mbfilter_singlebyte.h"
94 #include "filters/mbfilter_utf8.h"
95 
96 #include "rare_cp_bitvec.h"
97 
98 /*
99  *  buffering converter
100  */
101 mbfl_buffer_converter *
mbfl_buffer_converter_new(const mbfl_encoding * from,const mbfl_encoding * to,size_t buf_initsz)102 mbfl_buffer_converter_new(
103 	const mbfl_encoding *from,
104 	const mbfl_encoding *to,
105     size_t buf_initsz)
106 {
107 	mbfl_buffer_converter *convd = emalloc(sizeof(mbfl_buffer_converter));
108 	convd->to = to;
109 
110 	/* create convert filter */
111 	convd->filter1 = NULL;
112 	convd->filter2 = NULL;
113 	if (mbfl_convert_filter_get_vtbl(from, to) != NULL) {
114 		convd->filter1 = mbfl_convert_filter_new(from, to, mbfl_memory_device_output, NULL, &convd->device);
115 	} else {
116 		convd->filter2 = mbfl_convert_filter_new(&mbfl_encoding_wchar, to, mbfl_memory_device_output, NULL, &convd->device);
117 		if (convd->filter2 != NULL) {
118 			convd->filter1 = mbfl_convert_filter_new(from,
119 					&mbfl_encoding_wchar,
120 					(output_function_t)convd->filter2->filter_function,
121 					(flush_function_t)convd->filter2->filter_flush,
122 					convd->filter2);
123 			if (convd->filter1 == NULL) {
124 				mbfl_convert_filter_delete(convd->filter2);
125 			}
126 		}
127 	}
128 	if (convd->filter1 == NULL) {
129 		efree(convd);
130 		return NULL;
131 	}
132 
133 	mbfl_memory_device_init(&convd->device, buf_initsz, buf_initsz/4);
134 
135 	return convd;
136 }
137 
/*
 * Destroy a buffering converter: delete its filter(s), clear the output
 * memory device and free the converter itself.
 */
void mbfl_buffer_converter_delete(mbfl_buffer_converter *convd)
{
	mbfl_convert_filter_delete(convd->filter1);
	/* filter2 only exists when the conversion goes through wchar. */
	if (convd->filter2 != NULL) {
		mbfl_convert_filter_delete(convd->filter2);
	}
	mbfl_memory_device_clear(&convd->device);
	efree((void*)convd);
}
147 
/*
 * Set the illegal-character handling mode of the converter. The mode is
 * stored on filter2 when the converter is a two-filter chain, otherwise
 * on filter1.
 */
void mbfl_buffer_converter_illegal_mode(mbfl_buffer_converter *convd, int mode)
{
	mbfl_convert_filter *target = convd->filter2 ? convd->filter2 : convd->filter1;

	target->illegal_mode = mode;
}
156 
/*
 * Set the substitution character used for illegal characters. The value
 * is stored on filter2 when the converter is a two-filter chain,
 * otherwise on filter1.
 */
void mbfl_buffer_converter_illegal_substchar(mbfl_buffer_converter *convd, int substchar)
{
	mbfl_convert_filter *target = convd->filter2 ? convd->filter2 : convd->filter1;

	target->illegal_substchar = substchar;
}
165 
/*
 * Push every byte of `string` through the converter's first filter.
 *
 * The output device is grown up front to hold at least string->len more
 * bytes. Feeding stops early if the filter function returns a negative
 * value.
 *
 * Returns the number of input bytes handed to the filter (the byte that
 * triggered an early stop is included in the count).
 */
size_t mbfl_buffer_converter_feed(mbfl_buffer_converter *convd, mbfl_string *string)
{
	ZEND_ASSERT(convd);
	ZEND_ASSERT(string);

	mbfl_memory_device_realloc(&convd->device, convd->device.pos + string->len, string->len/4);

	unsigned char *cursor = string->val;
	mbfl_convert_filter *filter = convd->filter1;

	if (filter != NULL) {
		for (size_t remaining = string->len; remaining > 0; remaining--) {
			if ((*filter->filter_function)(*cursor++, filter) < 0) {
				break;
			}
		}
	}

	return cursor - string->val;
}
191 
/*
 * Flush the converter by flushing its first filter.
 */
void mbfl_buffer_converter_flush(mbfl_buffer_converter *convd)
{
	mbfl_convert_filter *head = convd->filter1;

	mbfl_convert_filter_flush(head);
}
196 
/*
 * Hand the accumulated output over to `result`, tagging it with the
 * converter's target encoding. Returns whatever
 * mbfl_memory_device_result() returns.
 */
mbfl_string* mbfl_buffer_converter_result(mbfl_buffer_converter *convd, mbfl_string *result)
{
	mbfl_memory_device *out = &convd->device;

	result->encoding = convd->to;
	return mbfl_memory_device_result(out, result);
}
202 
/*
 * Convenience wrapper: feed `string`, flush the first filter, then hand
 * the accumulated output over to `result` tagged with the target
 * encoding. Returns whatever mbfl_memory_device_result() returns.
 */
mbfl_string* mbfl_buffer_converter_feed_result(mbfl_buffer_converter *convd, mbfl_string *string, mbfl_string *result)
{
	mbfl_memory_device *out = &convd->device;

	mbfl_buffer_converter_feed(convd, string);
	mbfl_convert_filter_flush(convd->filter1);

	result->encoding = convd->to;
	return mbfl_memory_device_result(out, result);
}
210 
/*
 * Return the number of illegal characters seen so far: filter1's counter,
 * plus filter2's counter when the converter is a two-filter chain.
 */
size_t mbfl_buffer_illegalchars(mbfl_buffer_converter *convd)
{
	size_t total = convd->filter1->num_illegalchar;

	if (convd->filter2 != NULL) {
		total += convd->filter2->num_illegalchar;
	}
	return total;
}
221 
222 /*
223  * encoding detector
224  */
mbfl_estimate_encoding_likelihood(int input_cp,void * void_data)225 static int mbfl_estimate_encoding_likelihood(int input_cp, void *void_data)
226 {
227 	mbfl_encoding_detector_data *data = void_data;
228 	unsigned int c = input_cp;
229 
230 	/* Receive wchars decoded from input string using candidate encoding.
231 	 * If the string was invalid in the candidate encoding, we assume
232 	 * it's the wrong one. Otherwise, give the candidate many 'demerits'
233 	 * for each 'rare' codepoint found, a smaller number for each ASCII
234 	 * punctuation character, and 1 for all other codepoints.
235 	 *
236 	 * The 'common' codepoints should cover the vast majority of
237 	 * codepoints we are likely to see in practice, while only covering
238 	 * a small minority of the entire Unicode encoding space. Why?
239 	 * Well, if the test string happens to be valid in an incorrect
240 	 * candidate encoding, the bogus codepoints which it decodes to will
241 	 * be more or less random. By treating the majority of codepoints as
242 	 * 'rare', we ensure that in almost all such cases, the bogus
243 	 * codepoints will include plenty of 'rares', thus giving the
244 	 * incorrect candidate encoding lots of demerits. See
245 	 * common_codepoints.txt for the actual list used.
246 	 *
247 	 * So, why give extra demerits for ASCII punctuation characters? It's
248 	 * because there are some text encodings, like UTF-7, HZ, and ISO-2022,
249 	 * which deliberately only use bytes in the ASCII range. When
250 	 * misinterpreted as ASCII/UTF-8, strings in these encodings will
251 	 * have an unusually high number of ASCII punctuation characters.
252 	 * So giving extra demerits for such characters will improve
253 	 * detection accuracy for UTF-7 and similar encodings.
254 	 *
255 	 * Finally, why 1 demerit for all other characters? That penalizes
256 	 * long strings, meaning we will tend to choose a candidate encoding
257 	 * in which the test string decodes to a smaller number of
258 	 * codepoints. That prevents single-byte encodings in which almost
259 	 * every possible input byte decodes to a 'common' codepoint from
260 	 * being favored too much. */
261 	if (c == MBFL_BAD_INPUT) {
262 		data->num_illegalchars++;
263 	} else if (c > 0xFFFF) {
264 		data->score += 40;
265 	} else if (c >= 0x21 && c <= 0x2F) {
266 		data->score += 6;
267 	} else if ((rare_codepoint_bitvec[c >> 5] >> (c & 0x1F)) & 1) {
268 		data->score += 30;
269 	} else {
270 		data->score += 1;
271 	}
272 	return 0;
273 }
274 
mbfl_encoding_detector_new(const mbfl_encoding ** elist,int elistsz,int strict)275 mbfl_encoding_detector *mbfl_encoding_detector_new(const mbfl_encoding **elist, int elistsz, int strict)
276 {
277 	if (!elistsz) {
278 		return NULL;
279 	}
280 
281 	mbfl_encoding_detector *identd = emalloc(sizeof(mbfl_encoding_detector));
282 	identd->filter_list = ecalloc(elistsz, sizeof(mbfl_convert_filter*));
283 	identd->filter_data = ecalloc(elistsz, sizeof(mbfl_encoding_detector_data));
284 
285 	int filter_list_size = 0;
286 	for (int i = 0; i < elistsz; i++) {
287 		mbfl_convert_filter *filter = mbfl_convert_filter_new(elist[i], &mbfl_encoding_wchar,
288 			mbfl_estimate_encoding_likelihood, NULL, &identd->filter_data[filter_list_size]);
289 		if (filter) {
290 			identd->filter_list[filter_list_size++] = filter;
291 		}
292 	}
293 	identd->filter_list_size = filter_list_size;
294 	identd->strict = strict;
295 	return identd;
296 }
297 
/*
 * Destroy an encoding detector: delete every candidate filter, then free
 * the filter list, the per-candidate data and the detector itself.
 */
void mbfl_encoding_detector_delete(mbfl_encoding_detector *identd)
{
	const int count = identd->filter_list_size;

	for (int idx = 0; idx < count; idx++) {
		mbfl_convert_filter_delete(identd->filter_list[idx]);
	}

	efree(identd->filter_list);
	efree(identd->filter_data);
	efree(identd);
}
307 
/*
 * Feed `string` to every candidate filter of the detector.
 *
 * In strict mode, each candidate whose source encoding provides a check()
 * function is first validated against the whole buffer; a failed check
 * counts as an illegal character for that candidate.
 *
 * The input is then pushed byte by byte through every candidate that has
 * no illegal characters recorded yet.
 *
 * Returns 1 (non-strict mode only) as soon as the number of candidates
 * rejected during this call reaches filter_list_size - 1; in that case
 * the filters are not flushed. Otherwise all filters are flushed after
 * the last byte and 0 is returned.
 */
int mbfl_encoding_detector_feed(mbfl_encoding_detector *identd, mbfl_string *string)
{
	const int num = identd->filter_list_size;
	size_t remaining = string->len;
	unsigned char *cursor = string->val;
	int rejected = 0;

	if (identd->strict) {
		for (int i = 0; i < num; i++) {
			const mbfl_encoding *src = identd->filter_list[i]->from;
			if (src->check != NULL && !(src->check)(cursor, remaining)) {
				identd->filter_data[i].num_illegalchars++;
			}
		}
	}

	for (; remaining > 0; remaining--, cursor++) {
		for (int i = 0; i < num; i++) {
			mbfl_encoding_detector_data *data = &identd->filter_data[i];
			if (data->num_illegalchars) {
				/* Candidate already ruled out. */
				continue;
			}

			mbfl_convert_filter *filter = identd->filter_list[i];
			(*filter->filter_function)(*cursor, filter);
			if (data->num_illegalchars) {
				rejected++;
			}
		}

		if (!identd->strict && (num - 1) <= rejected) {
			return 1;
		}
	}

	for (int i = 0; i < num; i++) {
		mbfl_convert_filter *filter = identd->filter_list[i];
		(filter->filter_flush)(filter);
	}

	return 0;
}
349 
mbfl_encoding_detector_judge(mbfl_encoding_detector * identd)350 const mbfl_encoding *mbfl_encoding_detector_judge(mbfl_encoding_detector *identd)
351 {
352 	size_t best_score = SIZE_MAX; /* Low score is 'better' */
353 	const mbfl_encoding *enc = NULL;
354 
355 	for (int i = 0; i < identd->filter_list_size; i++) {
356 		mbfl_convert_filter *filter = identd->filter_list[i];
357 		mbfl_encoding_detector_data *data = &identd->filter_data[i];
358 		if (!data->num_illegalchars && data->score < best_score) {
359 			enc = filter->from;
360 			best_score = data->score;
361 		}
362 	}
363 
364 	return enc;
365 }
366 
367 /*
368  * encoding converter
369  */
370 mbfl_string *
mbfl_convert_encoding(mbfl_string * string,mbfl_string * result,const mbfl_encoding * toenc)371 mbfl_convert_encoding(
372     mbfl_string *string,
373     mbfl_string *result,
374     const mbfl_encoding *toenc)
375 {
376 	size_t n;
377 	unsigned char *p;
378 	mbfl_memory_device device;
379 	mbfl_convert_filter *filter1 = NULL;
380 	mbfl_convert_filter *filter2 = NULL;
381 
382 	/* initialize */
383 	if (mbfl_convert_filter_get_vtbl(string->encoding, toenc) != NULL) {
384 		filter1 = mbfl_convert_filter_new(string->encoding, toenc, mbfl_memory_device_output, 0, &device);
385 	} else {
386 		filter2 = mbfl_convert_filter_new(&mbfl_encoding_wchar, toenc, mbfl_memory_device_output, 0, &device);
387 		if (filter2 != NULL) {
388 			filter1 = mbfl_convert_filter_new(string->encoding, &mbfl_encoding_wchar, (int (*)(int, void*))filter2->filter_function, NULL, filter2);
389 			if (filter1 == NULL) {
390 				mbfl_convert_filter_delete(filter2);
391 			}
392 		}
393 	}
394 	if (filter1 == NULL) {
395 		return NULL;
396 	}
397 
398 	if (filter2 != NULL) {
399 		filter2->illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
400 		filter2->illegal_substchar = 0x3f;		/* '?' */
401 	}
402 
403 	mbfl_memory_device_init(&device, string->len, (string->len >> 2) + 8);
404 
405 	/* feed data */
406 	n = string->len;
407 	p = string->val;
408 	if (p != NULL) {
409 		while (n > 0) {
410 			if ((*filter1->filter_function)(*p++, filter1) < 0) {
411 				break;
412 			}
413 			n--;
414 		}
415 	}
416 
417 	mbfl_convert_filter_flush(filter1);
418 	mbfl_convert_filter_delete(filter1);
419 	if (filter2 != NULL) {
420 		mbfl_convert_filter_flush(filter2);
421 		mbfl_convert_filter_delete(filter2);
422 	}
423 
424 	return mbfl_memory_device_result(&device, result);
425 }
426 
427 /*
428  * identify encoding
429  */
/*
 * One-shot encoding detection: build a detector over `elist`, feed it
 * `string`, take its verdict and destroy it again.
 *
 * Returns NULL when elistsz is 0 or no candidate was accepted.
 */
const mbfl_encoding *mbfl_identify_encoding(mbfl_string *string, const mbfl_encoding **elist, int elistsz, int strict)
{
	if (elistsz == 0) {
		return NULL;
	}

	mbfl_encoding_detector *detector = mbfl_encoding_detector_new(elist, elistsz, strict);
	mbfl_encoding_detector_feed(detector, string);

	const mbfl_encoding *verdict = mbfl_encoding_detector_judge(detector);

	mbfl_encoding_detector_delete(detector);
	return verdict;
}
441 
442 /*
443  *  strlen
444  */
mbfl_strlen(const mbfl_string * string)445 size_t mbfl_strlen(const mbfl_string *string)
446 {
447 	size_t len = 0;
448 	const mbfl_encoding *encoding = string->encoding;
449 
450 	if (encoding->flag & MBFL_ENCTYPE_SBCS) {
451 		len = string->len;
452 	} else if (encoding->flag & MBFL_ENCTYPE_WCS2) {
453 		len = string->len/2;
454 	} else if (encoding->flag & MBFL_ENCTYPE_WCS4) {
455 		len = string->len/4;
456 	} else if (encoding->mblen_table) {
457 		const unsigned char *mbtab = encoding->mblen_table;
458 		unsigned char *p = string->val, *e = p + string->len;
459 		while (p < e) {
460 			p += mbtab[*p];
461 			len++;
462 		}
463 	} else {
464 		uint32_t wchar_buf[128];
465 		unsigned char *in = string->val;
466 		size_t in_len = string->len;
467 		unsigned int state = 0;
468 
469 		while (in_len) {
470 			len += encoding->to_wchar(&in, &in_len, wchar_buf, 128, &state);
471 		}
472 	}
473 
474 	return len;
475 }
476 
477 
478 /*
479  *  strpos
480  */
481 struct collector_strpos_data {
482 	mbfl_convert_filter *next_filter;
483 	mbfl_wchar_device needle;
484 	size_t needle_len;
485 	size_t start;
486 	size_t output;
487 	size_t found_pos;
488 	size_t needle_pos;
489 	size_t matched_pos;
490 };
491 
492 static int
collector_strpos(int c,void * data)493 collector_strpos(int c, void* data)
494 {
495 	int *p, *h, *m;
496 	ssize_t n;
497 	struct collector_strpos_data *pc = (struct collector_strpos_data*)data;
498 
499 	if (pc->output >= pc->start) {
500 		if (c == (int)pc->needle.buffer[pc->needle_pos]) {
501 			if (pc->needle_pos == 0) {
502 				pc->found_pos = pc->output;			/* found position */
503 			}
504 			pc->needle_pos++;						/* needle pointer */
505 			if (pc->needle_pos >= pc->needle_len) {
506 				pc->matched_pos = pc->found_pos;	/* matched position */
507 				pc->needle_pos--;
508 				goto retry;
509 			}
510 		} else if (pc->needle_pos != 0) {
511 retry:
512 			h = (int *)pc->needle.buffer;
513 			h++;
514 			for (;;) {
515 				pc->found_pos++;
516 				p = h;
517 				m = (int *)pc->needle.buffer;
518 				n = pc->needle_pos - 1;
519 				while (n > 0 && *p == *m) {
520 					n--;
521 					p++;
522 					m++;
523 				}
524 				if (n <= 0) {
525 					if (*m != c) {
526 						pc->needle_pos = 0;
527 					}
528 					break;
529 				} else {
530 					h++;
531 					pc->needle_pos--;
532 				}
533 			}
534 		}
535 	}
536 
537 	pc->output++;
538 	return 0;
539 }
540 
/*
 * Translate a character offset into a byte pointer inside the UTF-8
 * buffer [str, end).
 *
 * offset >= 0: walk forward `offset` characters from `str`, using the
 *              UTF-8 mblen_table to step over each character.
 * offset <  0: walk backward `-offset` characters from `end`, counting
 *              every byte that is not a continuation byte (10xxxxxx).
 *
 * Returns NULL when the buffer holds fewer characters than requested.
 * The returned pointer always lies within [str, end]: if the table length
 * of a leading byte runs past `end` (e.g. a truncated sequence at the end
 * of the buffer), the position is clamped to `end` instead of pointing
 * beyond the buffer.
 */
static const unsigned char *mbfl_find_offset_utf8(
		const unsigned char *str, const unsigned char *end, ssize_t offset) {
	if (offset < 0) {
		const unsigned char *pos = end;
		while (offset < 0) {
			if (pos <= str) {
				return NULL;
			}

			unsigned char c = *(--pos);
			/* ASCII bytes and leading bytes both start a character. */
			if ((c & 0xc0) != 0x80) {
				++offset;
			}
		}
		return pos;
	} else {
		const unsigned char *u8_tbl = mbfl_encoding_utf8.mblen_table;
		const unsigned char *pos = str;
		while (offset-- > 0) {
			if (pos >= end) {
				return NULL;
			}

			size_t step = u8_tbl[*pos];
			size_t left = (size_t)(end - pos);
			/* Never step past `end`: callers use the result as a bound
			 * for memory searches. */
			pos = (step < left) ? pos + step : end;
		}
		return pos;
	}
}
570 
/*
 * Convert a byte pointer inside a UTF-8 buffer into a character offset:
 * the number of bytes in [start, pos) that are not continuation bytes
 * (10xxxxxx).
 */
static size_t mbfl_pointer_to_offset_utf8(const unsigned char *start, const unsigned char *pos) {
	size_t chars = 0;

	for (const unsigned char *cur = pos; cur > start; ) {
		--cur;
		if ((*cur & 0xc0) != 0x80) {
			chars++;
		}
	}

	return chars;
}
583 
584 size_t
mbfl_strpos(mbfl_string * haystack,mbfl_string * needle,ssize_t offset,int reverse)585 mbfl_strpos(
586     mbfl_string *haystack,
587     mbfl_string *needle,
588     ssize_t offset,
589     int reverse)
590 {
591 	size_t result;
592 	mbfl_string _haystack_u8, _needle_u8;
593 	const mbfl_string *haystack_u8, *needle_u8 = NULL;
594 	const unsigned char *offset_pointer;
595 
596 	if (haystack->encoding->no_encoding != mbfl_no_encoding_utf8) {
597 		mbfl_string_init_set(&_haystack_u8, haystack->encoding);
598 		haystack_u8 = mbfl_convert_encoding(haystack, &_haystack_u8, &mbfl_encoding_utf8);
599 		if (haystack_u8 == NULL) {
600 			result = MBFL_ERROR_ENCODING;
601 			goto out;
602 		}
603 	} else {
604 		haystack_u8 = haystack;
605 	}
606 
607 	if (needle->encoding->no_encoding != mbfl_no_encoding_utf8) {
608 		mbfl_string_init_set(&_needle_u8, needle->encoding);
609 		needle_u8 = mbfl_convert_encoding(needle, &_needle_u8, &mbfl_encoding_utf8);
610 		if (needle_u8 == NULL) {
611 			result = MBFL_ERROR_ENCODING;
612 			goto out;
613 		}
614 	} else {
615 		needle_u8 = needle;
616 	}
617 
618 	offset_pointer = mbfl_find_offset_utf8(
619 		haystack_u8->val, haystack_u8->val + haystack_u8->len, offset);
620 	if (!offset_pointer) {
621 		result = MBFL_ERROR_OFFSET;
622 		goto out;
623 	}
624 
625 	result = MBFL_ERROR_NOT_FOUND;
626 	if (haystack_u8->len < needle_u8->len) {
627 		goto out;
628 	}
629 
630 	const char *found_pos;
631 	if (!reverse) {
632 		found_pos = zend_memnstr(
633 			(const char *) offset_pointer,
634 			(const char *) needle_u8->val, needle_u8->len,
635 			(const char *) haystack_u8->val + haystack_u8->len);
636 	} else {
637 		if (offset >= 0) {
638 			found_pos = zend_memnrstr(
639 				(const char *) offset_pointer,
640 				(const char *) needle_u8->val, needle_u8->len,
641 				(const char *) haystack_u8->val + haystack_u8->len);
642 		} else {
643 			size_t needle_len = mbfl_strlen(needle_u8);
644 			offset_pointer = mbfl_find_offset_utf8(
645 				offset_pointer, haystack_u8->val + haystack_u8->len, needle_len);
646 			if (!offset_pointer) {
647 				offset_pointer = haystack_u8->val + haystack_u8->len;
648 			}
649 
650 			found_pos = zend_memnrstr(
651 				(const char *) haystack_u8->val,
652 				(const char *) needle_u8->val, needle_u8->len,
653 				(const char *) offset_pointer);
654 		}
655 	}
656 
657 	if (found_pos) {
658 		result = mbfl_pointer_to_offset_utf8(haystack_u8->val, (const unsigned char *) found_pos);
659 	}
660 
661 out:
662 	if (haystack_u8 == &_haystack_u8) {
663 		mbfl_string_clear(&_haystack_u8);
664 	}
665 	if (needle_u8 == &_needle_u8) {
666 		mbfl_string_clear(&_needle_u8);
667 	}
668 	return result;
669 }
670 
671 /*
672  *  substr_count
673  */
674 
675 size_t
mbfl_substr_count(mbfl_string * haystack,mbfl_string * needle)676 mbfl_substr_count(
677     mbfl_string *haystack,
678     mbfl_string *needle
679    )
680 {
681 	size_t n, result = 0;
682 	unsigned char *p;
683 	mbfl_convert_filter *filter;
684 	struct collector_strpos_data pc;
685 
686 	/* needle is converted into wchar */
687 	mbfl_wchar_device_init(&pc.needle);
688 	filter = mbfl_convert_filter_new(
689 	  needle->encoding,
690 	  &mbfl_encoding_wchar,
691 	  mbfl_wchar_device_output, 0, &pc.needle);
692 	ZEND_ASSERT(filter);
693 	mbfl_convert_filter_feed_string(filter, needle->val, needle->len);
694 	mbfl_convert_filter_flush(filter);
695 	mbfl_convert_filter_delete(filter);
696 	pc.needle_len = pc.needle.pos;
697 	if (pc.needle.buffer == NULL) {
698 		return MBFL_ERROR_ENCODING;
699 	}
700 	if (pc.needle_len == 0) {
701 		mbfl_wchar_device_clear(&pc.needle);
702 		return MBFL_ERROR_EMPTY;
703 	}
704 	/* initialize filter and collector data */
705 	filter = mbfl_convert_filter_new(
706 	  haystack->encoding,
707 	  &mbfl_encoding_wchar,
708 	  collector_strpos, 0, &pc);
709 	ZEND_ASSERT(filter);
710 	pc.start = 0;
711 	pc.output = 0;
712 	pc.needle_pos = 0;
713 	pc.found_pos = 0;
714 	pc.matched_pos = MBFL_ERROR_NOT_FOUND;
715 
716 	/* feed data */
717 	p = haystack->val;
718 	n = haystack->len;
719 	if (p != NULL) {
720 		while (n > 0) {
721 			if ((*filter->filter_function)(*p++, filter) < 0) {
722 				pc.matched_pos = MBFL_ERROR_ENCODING;
723 				break;
724 			}
725 			if (pc.matched_pos != MBFL_ERROR_NOT_FOUND) {
726 				++result;
727 				pc.matched_pos = MBFL_ERROR_NOT_FOUND;
728 				pc.needle_pos = 0;
729 			}
730 			n--;
731 		}
732 	}
733 	mbfl_convert_filter_flush(filter);
734 	mbfl_convert_filter_delete(filter);
735 	mbfl_wchar_device_clear(&pc.needle);
736 
737 	return result;
738 }
739 
740 /*
741  *  substr
742  */
743 struct collector_substr_data {
744 	mbfl_convert_filter *next_filter;
745 	size_t start;
746 	size_t stop;
747 	size_t output;
748 };
749 
750 static int
collector_substr(int c,void * data)751 collector_substr(int c, void* data)
752 {
753 	struct collector_substr_data *pc = (struct collector_substr_data*)data;
754 
755 	if (pc->output >= pc->stop) {
756 		return -1;
757 	}
758 
759 	if (pc->output >= pc->start) {
760 		(*pc->next_filter->filter_function)(c, pc->next_filter);
761 	}
762 
763 	pc->output++;
764 
765 	return 0;
766 }
767 
768 mbfl_string *
mbfl_substr(mbfl_string * string,mbfl_string * result,size_t from,size_t length)769 mbfl_substr(
770     mbfl_string *string,
771     mbfl_string *result,
772     size_t from,
773     size_t length)
774 {
775 	const mbfl_encoding *encoding = string->encoding;
776 	size_t n, k, len, start, end;
777 	unsigned m;
778 	unsigned char *p, *w;
779 
780 	mbfl_string_init(result);
781 	result->encoding = string->encoding;
782 
783 	if ((encoding->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4)) ||
784 	   encoding->mblen_table != NULL) {
785 		len = string->len;
786 		if (encoding->flag & MBFL_ENCTYPE_SBCS) {
787 			start = from;
788 		} else if (encoding->flag & MBFL_ENCTYPE_WCS2) {
789 			start = from*2;
790 		} else if (encoding->flag & MBFL_ENCTYPE_WCS4) {
791 			start = from*4;
792 		} else {
793 			const unsigned char *mbtab = encoding->mblen_table;
794 			start = 0;
795 			n = 0;
796 			k = 0;
797 			p = string->val;
798 			/* search start position */
799 			while (k <= from) {
800 				start = n;
801 				if (n >= len) {
802 					break;
803 				}
804 				m = mbtab[*p];
805 				n += m;
806 				p += m;
807 				k++;
808 			}
809 		}
810 
811 		if (length == MBFL_SUBSTR_UNTIL_END) {
812 			end = len;
813 		} else if (encoding->flag & MBFL_ENCTYPE_SBCS) {
814 			end = start + length;
815 		} else if (encoding->flag & MBFL_ENCTYPE_WCS2) {
816 			end = start + length*2;
817 		} else if (encoding->flag & MBFL_ENCTYPE_WCS4) {
818 			end = start + length*4;
819 		} else {
820 			const unsigned char *mbtab = encoding->mblen_table;
821 			end = start;
822 			n = start;
823 			k = 0;
824 			p = string->val + start;
825 			/* detect end position */
826 			while (k <= length) {
827 				end = n;
828 				if (n >= len) {
829 					break;
830 				}
831 				m = mbtab[*p];
832 				n += m;
833 				p += m;
834 				k++;
835 			}
836 		}
837 
838 		if (start > len) {
839 			start = len;
840 		}
841 		if (end > len) {
842 			end = len;
843 		}
844 		if (start > end) {
845 			start = end;
846 		}
847 
848 		/* allocate memory and copy */
849 		n = end - start;
850 		result->len = 0;
851 		result->val = w = (unsigned char*)emalloc(n + 1);
852 		result->len = n;
853 		memcpy(w, string->val + start, n);
854 		w[n] = '\0';
855 	} else {
856 		mbfl_memory_device device;
857 		struct collector_substr_data pc;
858 		mbfl_convert_filter *decoder;
859 		mbfl_convert_filter *encoder;
860 
861 		if (length == MBFL_SUBSTR_UNTIL_END) {
862 			length = mbfl_strlen(string) - from;
863 		}
864 
865 		mbfl_memory_device_init(&device, length + 1, 0);
866 		mbfl_string_init(result);
867 		result->encoding = string->encoding;
868 		/* output code filter */
869 		decoder = mbfl_convert_filter_new(
870 		    &mbfl_encoding_wchar,
871 		    string->encoding,
872 		    mbfl_memory_device_output, 0, &device);
873 		/* wchar filter */
874 		encoder = mbfl_convert_filter_new(
875 		    string->encoding,
876 		    &mbfl_encoding_wchar,
877 		    collector_substr, 0, &pc);
878 		if (decoder == NULL || encoder == NULL) {
879 			mbfl_convert_filter_delete(encoder);
880 			mbfl_convert_filter_delete(decoder);
881 			return NULL;
882 		}
883 		pc.next_filter = decoder;
884 		pc.start = from;
885 		pc.stop = from + length;
886 		pc.output = 0;
887 
888 		/* feed data */
889 		p = string->val;
890 		n = string->len;
891 		if (p != NULL) {
892 			while (n > 0) {
893 				if ((*encoder->filter_function)(*p++, encoder) < 0) {
894 					break;
895 				}
896 				n--;
897 			}
898 		}
899 
900 		mbfl_convert_filter_flush(encoder);
901 		mbfl_convert_filter_flush(decoder);
902 		result = mbfl_memory_device_result(&device, result);
903 		mbfl_convert_filter_delete(encoder);
904 		mbfl_convert_filter_delete(decoder);
905 	}
906 
907 	return result;
908 }
909 
910 /*
911  *  strcut
912  */
913 mbfl_string *
mbfl_strcut(mbfl_string * string,mbfl_string * result,size_t from,size_t length)914 mbfl_strcut(
915     mbfl_string *string,
916     mbfl_string *result,
917     size_t from,
918     size_t length)
919 {
920 	const mbfl_encoding *encoding = string->encoding;
921 	mbfl_memory_device device;
922 
923 	if (from >= string->len) {
924 		from = string->len;
925 	}
926 
927 	mbfl_string_init(result);
928 	result->encoding = string->encoding;
929 
930 	if ((encoding->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4)) || encoding->mblen_table != NULL) {
931 		const unsigned char *start = NULL;
932 		const unsigned char *end = NULL;
933 		unsigned char *w;
934 		size_t sz;
935 
936 		if (encoding->flag & MBFL_ENCTYPE_WCS2) {
937 			from &= -2;
938 
939 			if (length >= string->len - from) {
940 				length = string->len - from;
941 			}
942 
943 			start = string->val + from;
944 			end   = start + (length & -2);
945 		} else if (encoding->flag & MBFL_ENCTYPE_WCS4) {
946 			from &= -4;
947 
948 			if (length >= string->len - from) {
949 				length = string->len - from;
950 			}
951 
952 			start = string->val + from;
953 			end   = start + (length & -4);
954 		} else if ((encoding->flag & MBFL_ENCTYPE_SBCS)) {
955 			if (length >= string->len - from) {
956 				length = string->len - from;
957 			}
958 
959 			start = string->val + from;
960 			end = start + length;
961 		} else if (encoding->mblen_table != NULL) {
962 			const unsigned char *mbtab = encoding->mblen_table;
963 			const unsigned char *p, *q;
964 			int m;
965 
966 			/* search start position */
967 			for (m = 0, p = string->val, q = p + from;
968 					p < q; p += (m = mbtab[*p]));
969 
970 			if (p > q) {
971 				p -= m;
972 			}
973 
974 			start = p;
975 
976 			/* search end position */
977 			if (length >= string->len - (start - string->val)) {
978 				end = string->val + string->len;
979 			} else {
980 				for (q = p + length; p < q; p += (m = mbtab[*p]));
981 
982 				if (p > q) {
983 					p -= m;
984 				}
985 				end = p;
986 			}
987 		} else {
988 			/* never reached */
989 			return NULL;
990 		}
991 
992 		/* allocate memory and copy string */
993 		sz = end - start;
994 		w = ecalloc(sz + 8, sizeof(unsigned char));
995 
996 		memcpy(w, start, sz);
997 		w[sz] = '\0';
998 		w[sz + 1] = '\0';
999 		w[sz + 2] = '\0';
1000 		w[sz + 3] = '\0';
1001 
1002 		result->val = w;
1003 		result->len = sz;
1004 	} else {
1005 		mbfl_convert_filter *encoder     = NULL;
1006 		mbfl_convert_filter *decoder     = NULL;
1007 		const unsigned char *p, *q, *r;
1008 		struct {
1009 			mbfl_convert_filter encoder;
1010 			mbfl_convert_filter decoder;
1011 			const unsigned char *p;
1012 			size_t pos;
1013 		} bk, _bk;
1014 
1015 		/* output code filter */
1016 		if (!(decoder = mbfl_convert_filter_new(
1017 				&mbfl_encoding_wchar,
1018 				string->encoding,
1019 				mbfl_memory_device_output, 0, &device))) {
1020 			return NULL;
1021 		}
1022 
1023 		/* wchar filter */
1024 		if (!(encoder = mbfl_convert_filter_new(
1025 				string->encoding,
1026 				&mbfl_encoding_wchar,
1027 				mbfl_filter_output_null,
1028 				NULL, NULL))) {
1029 			mbfl_convert_filter_delete(decoder);
1030 			return NULL;
1031 		}
1032 
1033 		mbfl_memory_device_init(&device, length + 8, 0);
1034 
1035 		p = string->val;
1036 
1037 		/* search start position */
1038 		for (q = string->val + from; p < q; p++) {
1039 			(*encoder->filter_function)(*p, encoder);
1040 		}
1041 
1042 		/* switch the drain direction */
1043 		encoder->output_function = (output_function_t)decoder->filter_function;
1044 		encoder->flush_function = (flush_function_t)decoder->filter_flush;
1045 		encoder->data = decoder;
1046 
1047 		q = string->val + string->len;
1048 
1049 		/* save the encoder, decoder state and the pointer */
1050 		mbfl_convert_filter_copy(decoder, &_bk.decoder);
1051 		mbfl_convert_filter_copy(encoder, &_bk.encoder);
1052 		_bk.p = p;
1053 		_bk.pos = device.pos;
1054 
1055 		if (length > q - p) {
1056 			length = q - p;
1057 		}
1058 
1059 		if (length >= 20) {
1060 			/* output a little shorter than "length" */
1061 			/* XXX: the constant "20" was determined purely on the heuristics. */
1062 			for (r = p + length - 20; p < r; p++) {
1063 				(*encoder->filter_function)(*p, encoder);
1064 			}
1065 
1066 			/* if the offset of the resulting string exceeds the length,
1067 			 * then restore the state */
1068 			if (device.pos > length) {
1069 				p = _bk.p;
1070 				device.pos = _bk.pos;
1071 				if (decoder->filter_dtor)
1072 					decoder->filter_dtor(decoder);
1073 				if (encoder->filter_dtor)
1074 					encoder->filter_dtor(encoder);
1075 				mbfl_convert_filter_copy(&_bk.decoder, decoder);
1076 				mbfl_convert_filter_copy(&_bk.encoder, encoder);
1077 				bk = _bk;
1078 			} else {
1079 				/* save the encoder, decoder state and the pointer */
1080 				mbfl_convert_filter_copy(decoder, &bk.decoder);
1081 				mbfl_convert_filter_copy(encoder, &bk.encoder);
1082 				bk.p = p;
1083 				bk.pos = device.pos;
1084 
1085 				/* flush the stream */
1086 				(*encoder->filter_flush)(encoder);
1087 
1088 				/* if the offset of the resulting string exceeds the length,
1089 				 * then restore the state */
1090 				if (device.pos > length) {
1091 					if (bk.decoder.filter_dtor)
1092 						bk.decoder.filter_dtor(&bk.decoder);
1093 					if (bk.encoder.filter_dtor)
1094 						bk.encoder.filter_dtor(&bk.encoder);
1095 
1096 					p = _bk.p;
1097 					device.pos = _bk.pos;
1098 					if (decoder->filter_dtor)
1099 						decoder->filter_dtor(decoder);
1100 					if (encoder->filter_dtor)
1101 						encoder->filter_dtor(encoder);
1102 					mbfl_convert_filter_copy(&_bk.decoder, decoder);
1103 					mbfl_convert_filter_copy(&_bk.encoder, encoder);
1104 					bk = _bk;
1105 				} else {
1106 					if (_bk.decoder.filter_dtor)
1107 						_bk.decoder.filter_dtor(&_bk.decoder);
1108 					if (_bk.encoder.filter_dtor)
1109 						_bk.encoder.filter_dtor(&_bk.encoder);
1110 
1111 					p = bk.p;
1112 					device.pos = bk.pos;
1113 					if (decoder->filter_dtor)
1114 						decoder->filter_dtor(decoder);
1115 					if (encoder->filter_dtor)
1116 						encoder->filter_dtor(encoder);
1117 					mbfl_convert_filter_copy(&bk.decoder, decoder);
1118 					mbfl_convert_filter_copy(&bk.encoder, encoder);
1119 				}
1120 			}
1121 		} else {
1122 			bk = _bk;
1123 		}
1124 
1125 		/* detect end position */
1126 		while (p < q) {
1127 			(*encoder->filter_function)(*p, encoder);
1128 
1129 			if (device.pos > length) {
1130 				/* restore filter */
1131 				p = bk.p;
1132 				device.pos = bk.pos;
1133 				if (decoder->filter_dtor)
1134 					decoder->filter_dtor(decoder);
1135 				if (encoder->filter_dtor)
1136 					encoder->filter_dtor(encoder);
1137 				mbfl_convert_filter_copy(&bk.decoder, decoder);
1138 				mbfl_convert_filter_copy(&bk.encoder, encoder);
1139 				break;
1140 			}
1141 
1142 			p++;
1143 
1144 			/* backup current state */
1145 			mbfl_convert_filter_copy(decoder, &_bk.decoder);
1146 			mbfl_convert_filter_copy(encoder, &_bk.encoder);
1147 			_bk.pos = device.pos;
1148 			_bk.p = p;
1149 
1150 			(*encoder->filter_flush)(encoder);
1151 
1152 			if (device.pos > length) {
1153 				if (_bk.decoder.filter_dtor)
1154 					_bk.decoder.filter_dtor(&_bk.decoder);
1155 				if (_bk.encoder.filter_dtor)
1156 					_bk.encoder.filter_dtor(&_bk.encoder);
1157 
1158 				/* restore filter */
1159 				p = bk.p;
1160 				device.pos = bk.pos;
1161 				if (decoder->filter_dtor)
1162 					decoder->filter_dtor(decoder);
1163 				if (encoder->filter_dtor)
1164 					encoder->filter_dtor(encoder);
1165 				mbfl_convert_filter_copy(&bk.decoder, decoder);
1166 				mbfl_convert_filter_copy(&bk.encoder, encoder);
1167 				break;
1168 			}
1169 
1170 			if (bk.decoder.filter_dtor)
1171 				bk.decoder.filter_dtor(&bk.decoder);
1172 			if (bk.encoder.filter_dtor)
1173 				bk.encoder.filter_dtor(&bk.encoder);
1174 
1175 			p = _bk.p;
1176 			device.pos = _bk.pos;
1177 			if (decoder->filter_dtor)
1178 				decoder->filter_dtor(decoder);
1179 			if (encoder->filter_dtor)
1180 				encoder->filter_dtor(encoder);
1181 			mbfl_convert_filter_copy(&_bk.decoder, decoder);
1182 			mbfl_convert_filter_copy(&_bk.encoder, encoder);
1183 
1184 			bk = _bk;
1185 		}
1186 
1187 		decoder->illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
1188 		(*encoder->filter_flush)(encoder);
1189 
1190 		if (bk.decoder.filter_dtor)
1191 			bk.decoder.filter_dtor(&bk.decoder);
1192 		if (bk.encoder.filter_dtor)
1193 			bk.encoder.filter_dtor(&bk.encoder);
1194 
1195 		result = mbfl_memory_device_result(&device, result);
1196 
1197 		mbfl_convert_filter_delete(encoder);
1198 		mbfl_convert_filter_delete(decoder);
1199 	}
1200 
1201 	return result;
1202 }
1203 
1204 
1205 /*
1206  *  MIME header encode
1207  */
1208 struct mime_header_encoder_data {
1209 	mbfl_convert_filter *conv1_filter;
1210 	mbfl_convert_filter *block_filter;
1211 	mbfl_convert_filter *conv2_filter;
1212 	mbfl_convert_filter *conv2_filter_backup;
1213 	mbfl_convert_filter *encod_filter;
1214 	mbfl_convert_filter *encod_filter_backup;
1215 	mbfl_memory_device outdev;
1216 	mbfl_memory_device tmpdev;
1217 	int status1;
1218 	int status2;
1219 	size_t prevpos;
1220 	size_t linehead;
1221 	size_t firstindent;
1222 	int encnamelen;
1223 	int lwsplen;
1224 	char encname[128];
1225 	char lwsp[16];
1226 };
1227 
1228 static int
mime_header_encoder_block_collector(int c,void * data)1229 mime_header_encoder_block_collector(int c, void *data)
1230 {
1231 	size_t n;
1232 	struct mime_header_encoder_data *pe = (struct mime_header_encoder_data *)data;
1233 
1234 	switch (pe->status2) {
1235 	case 1:	/* encoded word */
1236 		pe->prevpos = pe->outdev.pos;
1237 		mbfl_convert_filter_copy(pe->conv2_filter, pe->conv2_filter_backup);
1238 		mbfl_convert_filter_copy(pe->encod_filter, pe->encod_filter_backup);
1239 		(*pe->conv2_filter->filter_function)(c, pe->conv2_filter);
1240 		(*pe->conv2_filter->filter_flush)(pe->conv2_filter);
1241 		(*pe->encod_filter->filter_flush)(pe->encod_filter);
1242 		n = pe->outdev.pos - pe->linehead + pe->firstindent;
1243 		pe->outdev.pos = pe->prevpos;
1244 		mbfl_convert_filter_copy(pe->conv2_filter_backup, pe->conv2_filter);
1245 		mbfl_convert_filter_copy(pe->encod_filter_backup, pe->encod_filter);
1246 		if (n >= 74) {
1247 			(*pe->conv2_filter->filter_flush)(pe->conv2_filter);
1248 			(*pe->encod_filter->filter_flush)(pe->encod_filter);
1249 			mbfl_memory_device_strncat(&pe->outdev, "\x3f\x3d", 2);	/* ?= */
1250 			mbfl_memory_device_strncat(&pe->outdev, pe->lwsp, pe->lwsplen);
1251 			pe->linehead = pe->outdev.pos;
1252 			pe->firstindent = 0;
1253 			mbfl_memory_device_strncat(&pe->outdev, pe->encname, pe->encnamelen);
1254 			c = (*pe->conv2_filter->filter_function)(c, pe->conv2_filter);
1255 		} else {
1256 			c = (*pe->conv2_filter->filter_function)(c, pe->conv2_filter);
1257 		}
1258 		break;
1259 
1260 	default:
1261 		mbfl_memory_device_strncat(&pe->outdev, pe->encname, pe->encnamelen);
1262 		c = (*pe->conv2_filter->filter_function)(c, pe->conv2_filter);
1263 		pe->status2 = 1;
1264 		break;
1265 	}
1266 
1267 	return 0;
1268 }
1269 
1270 static int
mime_header_encoder_collector(int c,void * data)1271 mime_header_encoder_collector(int c, void *data)
1272 {
1273 	static int qp_table[256] = {
1274 		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x00 */
1275 		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x00 */
1276 		1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x20 */
1277 		0, 0, 0, 0, 0, 0, 0 ,0, 0, 0, 0, 0, 0, 1, 0, 1, /* 0x10 */
1278 		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x40 */
1279 		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, /* 0x50 */
1280 		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x60 */
1281 		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, /* 0x70 */
1282 		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x80 */
1283 		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x90 */
1284 		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xA0 */
1285 		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xB0 */
1286 		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xC0 */
1287 		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xD0 */
1288 		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xE0 */
1289 		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1  /* 0xF0 */
1290 	};
1291 
1292 	size_t n;
1293 	struct mime_header_encoder_data *pe = (struct mime_header_encoder_data *)data;
1294 
1295 	switch (pe->status1) {
1296 	case 11:	/* encoded word */
1297 		(*pe->block_filter->filter_function)(c, pe->block_filter);
1298 		break;
1299 
1300 	default:	/* ASCII */
1301 		if (c <= 0x00ff && !qp_table[(c & 0xff)]) { /* ordinary characters */
1302 			mbfl_memory_device_output(c, &pe->tmpdev);
1303 			pe->status1 = 1;
1304 		} else if (pe->status1 == 0 && c == 0x20) {	/* repeat SPACE */
1305 			mbfl_memory_device_output(c, &pe->tmpdev);
1306 		} else {
1307 			if (pe->tmpdev.pos < 74 && c == 0x20) {
1308 				n = pe->outdev.pos - pe->linehead + pe->tmpdev.pos + pe->firstindent;
1309 				if (n > 74) {
1310 					mbfl_memory_device_strncat(&pe->outdev, pe->lwsp, pe->lwsplen);		/* LWSP */
1311 					pe->linehead = pe->outdev.pos;
1312 					pe->firstindent = 0;
1313 				} else if (pe->outdev.pos > 0) {
1314 					mbfl_memory_device_output(0x20, &pe->outdev);
1315 				}
1316 				mbfl_memory_device_devcat(&pe->outdev, &pe->tmpdev);
1317 				mbfl_memory_device_reset(&pe->tmpdev);
1318 				pe->status1 = 0;
1319 			} else {
1320 				n = pe->outdev.pos - pe->linehead + pe->encnamelen + pe->firstindent;
1321 				if (n > 60)  {
1322 					mbfl_memory_device_strncat(&pe->outdev, pe->lwsp, pe->lwsplen);		/* LWSP */
1323 					pe->linehead = pe->outdev.pos;
1324 					pe->firstindent = 0;
1325 				} else if (pe->outdev.pos > 0)  {
1326 					mbfl_memory_device_output(0x20, &pe->outdev);
1327 				}
1328 				mbfl_convert_filter_devcat(pe->block_filter, &pe->tmpdev);
1329 				mbfl_memory_device_reset(&pe->tmpdev);
1330 				(*pe->block_filter->filter_function)(c, pe->block_filter);
1331 				pe->status1 = 11;
1332 			}
1333 		}
1334 		break;
1335 	}
1336 
1337 	return 0;
1338 }
1339 
1340 mbfl_string *
mime_header_encoder_result(struct mime_header_encoder_data * pe,mbfl_string * result)1341 mime_header_encoder_result(struct mime_header_encoder_data *pe, mbfl_string *result)
1342 {
1343 	if (pe->status1 >= 10) {
1344 		(*pe->conv2_filter->filter_flush)(pe->conv2_filter);
1345 		(*pe->encod_filter->filter_flush)(pe->encod_filter);
1346 		mbfl_memory_device_strncat(&pe->outdev, "\x3f\x3d", 2);		/* ?= */
1347 	} else if (pe->tmpdev.pos > 0) {
1348 		if (pe->outdev.pos > 0) {
1349 			if ((pe->outdev.pos - pe->linehead + pe->tmpdev.pos + pe->firstindent) > 74) {
1350 				mbfl_memory_device_strncat(&pe->outdev, pe->lwsp, pe->lwsplen);
1351 			} else {
1352 				mbfl_memory_device_output(0x20, &pe->outdev);
1353 			}
1354 		}
1355 		mbfl_memory_device_devcat(&pe->outdev, &pe->tmpdev);
1356 	}
1357 	mbfl_memory_device_reset(&pe->tmpdev);
1358 	pe->prevpos = 0;
1359 	pe->linehead = 0;
1360 	pe->status1 = 0;
1361 	pe->status2 = 0;
1362 
1363 	return mbfl_memory_device_result(&pe->outdev, result);
1364 }
1365 
1366 struct mime_header_encoder_data*
mime_header_encoder_new(const mbfl_encoding * incode,const mbfl_encoding * outcode,const mbfl_encoding * transenc)1367 mime_header_encoder_new(
1368     const mbfl_encoding *incode,
1369     const mbfl_encoding *outcode,
1370     const mbfl_encoding *transenc)
1371 {
1372 	size_t n;
1373 	const char *s;
1374 	struct mime_header_encoder_data *pe;
1375 
1376 	/* get output encoding and check MIME charset name */
1377 	if (outcode->mime_name == NULL || outcode->mime_name[0] == '\0') {
1378 		return NULL;
1379 	}
1380 
1381 	pe = emalloc(sizeof(struct mime_header_encoder_data));
1382 	mbfl_memory_device_init(&pe->outdev, 0, 0);
1383 	mbfl_memory_device_init(&pe->tmpdev, 0, 0);
1384 	pe->prevpos = 0;
1385 	pe->linehead = 0;
1386 	pe->firstindent = 0;
1387 	pe->status1 = 0;
1388 	pe->status2 = 0;
1389 
1390 	/* make the encoding description string  exp. "=?ISO-2022-JP?B?" */
1391 	n = 0;
1392 	pe->encname[n++] = 0x3d;
1393 	pe->encname[n++] = 0x3f;
1394 	s = outcode->mime_name;
1395 	while (*s) {
1396 		pe->encname[n++] = *s++;
1397 	}
1398 	pe->encname[n++] = 0x3f;
1399 	if (transenc->no_encoding == mbfl_no_encoding_qprint) {
1400 		pe->encname[n++] = 0x51;
1401 	} else {
1402 		pe->encname[n++] = 0x42;
1403 		transenc = &mbfl_encoding_base64;
1404 	}
1405 	pe->encname[n++] = 0x3f;
1406 	pe->encname[n] = '\0';
1407 	pe->encnamelen = n;
1408 
1409 	n = 0;
1410 	pe->lwsp[n++] = 0x0d;
1411 	pe->lwsp[n++] = 0x0a;
1412 	pe->lwsp[n++] = 0x20;
1413 	pe->lwsp[n] = '\0';
1414 	pe->lwsplen = n;
1415 
1416 	/* transfer encode filter */
1417 	pe->encod_filter = mbfl_convert_filter_new(outcode, transenc, mbfl_memory_device_output, 0, &(pe->outdev));
1418 	pe->encod_filter_backup = mbfl_convert_filter_new(outcode, transenc, mbfl_memory_device_output, 0, &(pe->outdev));
1419 
1420 	/* Output code filter */
1421 	pe->conv2_filter = mbfl_convert_filter_new(&mbfl_encoding_wchar, outcode, mbfl_filter_output_pipe, 0, pe->encod_filter);
1422 	pe->conv2_filter_backup = mbfl_convert_filter_new(&mbfl_encoding_wchar, outcode, mbfl_filter_output_pipe, 0, pe->encod_filter);
1423 
1424 	/* encoded block filter */
1425 	pe->block_filter = mbfl_convert_filter_new(&mbfl_encoding_wchar, &mbfl_encoding_wchar, mime_header_encoder_block_collector, 0, pe);
1426 
1427 	/* Input code filter */
1428 	pe->conv1_filter = mbfl_convert_filter_new(incode, &mbfl_encoding_wchar, mime_header_encoder_collector, 0, pe);
1429 
1430 	if (pe->encod_filter == NULL ||
1431 	    pe->encod_filter_backup == NULL ||
1432 	    pe->conv2_filter == NULL ||
1433 	    pe->conv2_filter_backup == NULL ||
1434 	    pe->conv1_filter == NULL) {
1435 		mime_header_encoder_delete(pe);
1436 		return NULL;
1437 	}
1438 
1439 	if (transenc->no_encoding == mbfl_no_encoding_qprint) {
1440 		pe->encod_filter->status |= MBFL_QPRINT_STS_MIME_HEADER;
1441 		pe->encod_filter_backup->status |= MBFL_QPRINT_STS_MIME_HEADER;
1442 	} else {
1443 		pe->encod_filter->status |= MBFL_BASE64_STS_MIME_HEADER;
1444 		pe->encod_filter_backup->status |= MBFL_BASE64_STS_MIME_HEADER;
1445 	}
1446 
1447 	return pe;
1448 }
1449 
1450 void
mime_header_encoder_delete(struct mime_header_encoder_data * pe)1451 mime_header_encoder_delete(struct mime_header_encoder_data *pe)
1452 {
1453 	if (pe) {
1454 		mbfl_convert_filter_delete(pe->conv1_filter);
1455 		mbfl_convert_filter_delete(pe->block_filter);
1456 		mbfl_convert_filter_delete(pe->conv2_filter);
1457 		mbfl_convert_filter_delete(pe->conv2_filter_backup);
1458 		mbfl_convert_filter_delete(pe->encod_filter);
1459 		mbfl_convert_filter_delete(pe->encod_filter_backup);
1460 		mbfl_memory_device_clear(&pe->outdev);
1461 		mbfl_memory_device_clear(&pe->tmpdev);
1462 		efree((void*)pe);
1463 	}
1464 }
1465 
1466 mbfl_string *
mbfl_mime_header_encode(mbfl_string * string,mbfl_string * result,const mbfl_encoding * outcode,const mbfl_encoding * encoding,const char * linefeed,int indent)1467 mbfl_mime_header_encode(
1468     mbfl_string *string,
1469     mbfl_string *result,
1470     const mbfl_encoding *outcode,
1471     const mbfl_encoding *encoding,
1472     const char *linefeed,
1473     int indent)
1474 {
1475 	size_t n;
1476 	unsigned char *p;
1477 	struct mime_header_encoder_data *pe;
1478 
1479 	mbfl_string_init(result);
1480 	result->encoding = &mbfl_encoding_ascii;
1481 
1482 	pe = mime_header_encoder_new(string->encoding, outcode, encoding);
1483 	if (pe == NULL) {
1484 		return NULL;
1485 	}
1486 
1487 	if (linefeed != NULL) {
1488 		n = 0;
1489 		while (*linefeed && n < 8) {
1490 			pe->lwsp[n++] = *linefeed++;
1491 		}
1492 		pe->lwsp[n++] = 0x20;
1493 		pe->lwsp[n] = '\0';
1494 		pe->lwsplen = n;
1495 	}
1496 	if (indent > 0 && indent < 74) {
1497 		pe->firstindent = indent;
1498 	}
1499 
1500 	n = string->len;
1501 	p = string->val;
1502 	while (n > 0) {
1503 		(*pe->conv1_filter->filter_function)(*p++, pe->conv1_filter);
1504 		n--;
1505 	}
1506 
1507 	result = mime_header_encoder_result(pe, result);
1508 	mime_header_encoder_delete(pe);
1509 
1510 	return result;
1511 }
1512 
1513 
1514 /*
1515  *  MIME header decode
1516  */
1517 struct mime_header_decoder_data {
1518 	mbfl_convert_filter *deco_filter;
1519 	mbfl_convert_filter *conv1_filter;
1520 	mbfl_convert_filter *conv2_filter;
1521 	mbfl_memory_device outdev;
1522 	mbfl_memory_device tmpdev;
1523 	size_t cspos;
1524 	int status;
1525 	const mbfl_encoding *encoding;
1526 	const mbfl_encoding *incode;
1527 	const mbfl_encoding *outcode;
1528 };
1529 
1530 static int
mime_header_decoder_collector(int c,void * data)1531 mime_header_decoder_collector(int c, void* data)
1532 {
1533 	const mbfl_encoding *encoding;
1534 	struct mime_header_decoder_data *pd = (struct mime_header_decoder_data*)data;
1535 
1536 	switch (pd->status) {
1537 	case 1:
1538 		if (c == 0x3f) {		/* ? */
1539 			mbfl_memory_device_output(c, &pd->tmpdev);
1540 			pd->cspos = pd->tmpdev.pos;
1541 			pd->status = 2;
1542 		} else {
1543 			mbfl_convert_filter_devcat(pd->conv1_filter, &pd->tmpdev);
1544 			mbfl_memory_device_reset(&pd->tmpdev);
1545 			if (c == 0x3d) {		/* = */
1546 				mbfl_memory_device_output(c, &pd->tmpdev);
1547 			} else if (c == 0x0d || c == 0x0a) {	/* CR or LF */
1548 				pd->status = 9;
1549 			} else {
1550 				(*pd->conv1_filter->filter_function)(c, pd->conv1_filter);
1551 				pd->status = 0;
1552 			}
1553 		}
1554 		break;
1555 	case 2:		/* store charset string */
1556 		if (c == 0x3f) {		/* ? */
1557 			/* identify charset */
1558 			mbfl_memory_device_output('\0', &pd->tmpdev);
1559 			encoding = mbfl_name2encoding((const char *)&pd->tmpdev.buffer[pd->cspos]);
1560 			if (encoding != NULL) {
1561 				pd->incode = encoding;
1562 				pd->status = 3;
1563 			}
1564 			mbfl_memory_device_unput(&pd->tmpdev);
1565 			mbfl_memory_device_output(c, &pd->tmpdev);
1566 		} else {
1567 			mbfl_memory_device_output(c, &pd->tmpdev);
1568 			if (pd->tmpdev.pos > 100) {		/* too long charset string */
1569 				pd->status = 0;
1570 			} else if (c == 0x0d || c == 0x0a) {	/* CR or LF */
1571 				mbfl_memory_device_unput(&pd->tmpdev);
1572 				pd->status = 9;
1573 			}
1574 			if (pd->status != 2) {
1575 				mbfl_convert_filter_devcat(pd->conv1_filter, &pd->tmpdev);
1576 				mbfl_memory_device_reset(&pd->tmpdev);
1577 			}
1578 		}
1579 		break;
1580 	case 3:		/* identify encoding */
1581 		mbfl_memory_device_output(c, &pd->tmpdev);
1582 		if (c == 0x42 || c == 0x62) {		/* 'B' or 'b' */
1583 			pd->encoding = &mbfl_encoding_base64;
1584 			pd->status = 4;
1585 		} else if (c == 0x51 || c == 0x71) {	/* 'Q' or 'q' */
1586 			pd->encoding = &mbfl_encoding_qprint;
1587 			pd->status = 4;
1588 		} else {
1589 			if (c == 0x0d || c == 0x0a) {	/* CR or LF */
1590 				mbfl_memory_device_unput(&pd->tmpdev);
1591 				pd->status = 9;
1592 			} else {
1593 				pd->status = 0;
1594 			}
1595 			mbfl_convert_filter_devcat(pd->conv1_filter, &pd->tmpdev);
1596 			mbfl_memory_device_reset(&pd->tmpdev);
1597 		}
1598 		break;
1599 	case 4:		/* reset filter */
1600 		mbfl_memory_device_output(c, &pd->tmpdev);
1601 		if (c == 0x3f) {		/* ? */
1602 			/* charset convert filter */
1603 			mbfl_convert_filter_reset(pd->conv1_filter, pd->incode, &mbfl_encoding_wchar);
1604 			/* decode filter */
1605 			mbfl_convert_filter_reset(pd->deco_filter, pd->encoding, &mbfl_encoding_8bit);
1606 			pd->status = 5;
1607 		} else {
1608 			if (c == 0x0d || c == 0x0a) {	/* CR or LF */
1609 				mbfl_memory_device_unput(&pd->tmpdev);
1610 				pd->status = 9;
1611 			} else {
1612 				pd->status = 0;
1613 			}
1614 			mbfl_convert_filter_devcat(pd->conv1_filter, &pd->tmpdev);
1615 		}
1616 		mbfl_memory_device_reset(&pd->tmpdev);
1617 		break;
1618 	case 5:		/* encoded block */
1619 		if (c == 0x3f) {		/* ? */
1620 			pd->status = 6;
1621 		} else {
1622 			(*pd->deco_filter->filter_function)(c, pd->deco_filter);
1623 		}
1624 		break;
1625 	case 6:		/* check end position */
1626 		if (c == 0x3d) {		/* = */
1627 			/* flush and reset filter */
1628 			(*pd->deco_filter->filter_flush)(pd->deco_filter);
1629 			(*pd->conv1_filter->filter_flush)(pd->conv1_filter);
1630 			mbfl_convert_filter_reset(pd->conv1_filter, &mbfl_encoding_ascii, &mbfl_encoding_wchar);
1631 			pd->status = 7;
1632 		} else {
1633 			(*pd->deco_filter->filter_function)(0x3f, pd->deco_filter);
1634 			if (c != 0x3f) {		/* ? */
1635 				(*pd->deco_filter->filter_function)(c, pd->deco_filter);
1636 				pd->status = 5;
1637 			}
1638 		}
1639 		break;
1640 	case 7:		/* after encoded block */
1641 		if (c == 0x0d || c == 0x0a) {	/* CR LF */
1642 			pd->status = 8;
1643 		} else {
1644 			mbfl_memory_device_output(c, &pd->tmpdev);
1645 			if (c == 0x3d) {		/* = */
1646 				pd->status = 1;
1647 			} else if (c != 0x20 && c != 0x09) {		/* not space */
1648 				mbfl_convert_filter_devcat(pd->conv1_filter, &pd->tmpdev);
1649 				mbfl_memory_device_reset(&pd->tmpdev);
1650 				pd->status = 0;
1651 			}
1652 		}
1653 		break;
1654 	case 8:		/* folding */
1655 	case 9:		/* folding */
1656 		if (c != 0x0d && c != 0x0a && c != 0x20 && c != 0x09) {
1657 			if (c == 0x3d) {		/* = */
1658 				if (pd->status == 8) {
1659 					mbfl_memory_device_output(0x20, &pd->tmpdev);	/* SPACE */
1660 				} else {
1661 					(*pd->conv1_filter->filter_function)(0x20, pd->conv1_filter);
1662 				}
1663 				mbfl_memory_device_output(c, &pd->tmpdev);
1664 				pd->status = 1;
1665 			} else {
1666 				mbfl_memory_device_output(0x20, &pd->tmpdev);
1667 				mbfl_memory_device_output(c, &pd->tmpdev);
1668 				mbfl_convert_filter_devcat(pd->conv1_filter, &pd->tmpdev);
1669 				mbfl_memory_device_reset(&pd->tmpdev);
1670 				pd->status = 0;
1671 			}
1672 		}
1673 		break;
1674 	default:		/* non encoded block */
1675 		if (c == 0x0d || c == 0x0a) {	/* CR LF */
1676 			pd->status = 9;
1677 		} else if (c == 0x3d) {		/* = */
1678 			mbfl_memory_device_output(c, &pd->tmpdev);
1679 			pd->status = 1;
1680 		} else {
1681 			(*pd->conv1_filter->filter_function)(c, pd->conv1_filter);
1682 		}
1683 		break;
1684 	}
1685 
1686 	return 0;
1687 }
1688 
1689 mbfl_string *
mime_header_decoder_result(struct mime_header_decoder_data * pd,mbfl_string * result)1690 mime_header_decoder_result(struct mime_header_decoder_data *pd, mbfl_string *result)
1691 {
1692 	switch (pd->status) {
1693 	case 1:
1694 	case 2:
1695 	case 3:
1696 	case 4:
1697 	case 7:
1698 	case 8:
1699 	case 9:
1700 		mbfl_convert_filter_devcat(pd->conv1_filter, &pd->tmpdev);
1701 		break;
1702 	case 5:
1703 	case 6:
1704 		(*pd->deco_filter->filter_flush)(pd->deco_filter);
1705 		(*pd->conv1_filter->filter_flush)(pd->conv1_filter);
1706 		break;
1707 	}
1708 	(*pd->conv2_filter->filter_flush)(pd->conv2_filter);
1709 	mbfl_memory_device_reset(&pd->tmpdev);
1710 	pd->status = 0;
1711 
1712 	return mbfl_memory_device_result(&pd->outdev, result);
1713 }
1714 
1715 struct mime_header_decoder_data*
mime_header_decoder_new(const mbfl_encoding * outcode)1716 mime_header_decoder_new(const mbfl_encoding *outcode)
1717 {
1718 	struct mime_header_decoder_data *pd = emalloc(sizeof(struct mime_header_decoder_data));
1719 
1720 	mbfl_memory_device_init(&pd->outdev, 0, 0);
1721 	mbfl_memory_device_init(&pd->tmpdev, 0, 0);
1722 	pd->cspos = 0;
1723 	pd->status = 0;
1724 	pd->encoding = &mbfl_encoding_8bit;
1725 	pd->incode = &mbfl_encoding_ascii;
1726 	pd->outcode = outcode;
1727 	/* charset convert filter */
1728 	pd->conv2_filter = mbfl_convert_filter_new(&mbfl_encoding_wchar, pd->outcode, mbfl_memory_device_output, 0, &pd->outdev);
1729 	pd->conv1_filter = mbfl_convert_filter_new(pd->incode, &mbfl_encoding_wchar, mbfl_filter_output_pipe, 0, pd->conv2_filter);
1730 	/* decode filter */
1731 	pd->deco_filter = mbfl_convert_filter_new(pd->encoding, &mbfl_encoding_8bit, mbfl_filter_output_pipe, 0, pd->conv1_filter);
1732 
1733 	if (pd->conv1_filter == NULL || pd->conv2_filter == NULL || pd->deco_filter == NULL) {
1734 		mime_header_decoder_delete(pd);
1735 		return NULL;
1736 	}
1737 
1738 	return pd;
1739 }
1740 
1741 void
mime_header_decoder_delete(struct mime_header_decoder_data * pd)1742 mime_header_decoder_delete(struct mime_header_decoder_data *pd)
1743 {
1744 	if (pd) {
1745 		mbfl_convert_filter_delete(pd->conv2_filter);
1746 		mbfl_convert_filter_delete(pd->conv1_filter);
1747 		mbfl_convert_filter_delete(pd->deco_filter);
1748 		mbfl_memory_device_clear(&pd->outdev);
1749 		mbfl_memory_device_clear(&pd->tmpdev);
1750 		efree((void*)pd);
1751 	}
1752 }
1753 
1754 mbfl_string *
mbfl_mime_header_decode(mbfl_string * string,mbfl_string * result,const mbfl_encoding * outcode)1755 mbfl_mime_header_decode(
1756     mbfl_string *string,
1757     mbfl_string *result,
1758     const mbfl_encoding *outcode)
1759 {
1760 	size_t n;
1761 	unsigned char *p;
1762 	struct mime_header_decoder_data *pd;
1763 
1764 	mbfl_string_init(result);
1765 	result->encoding = outcode;
1766 
1767 	pd = mime_header_decoder_new(outcode);
1768 	if (pd == NULL) {
1769 		return NULL;
1770 	}
1771 
1772 	/* feed data */
1773 	n = string->len;
1774 	p = string->val;
1775 	while (n > 0) {
1776 		mime_header_decoder_collector(*p++, pd);
1777 		n--;
1778 	}
1779 
1780 	result = mime_header_decoder_result(pd, result);
1781 	mime_header_decoder_delete(pd);
1782 
1783 	return result;
1784 }
1785