1 /*
2 * charset=UTF-8
3 */
4
5 /*
6 * "streamable kanji code filter and converter"
7 *
8 * Copyright (c) 1998,1999,2000,2001 HappySize, Inc. All rights reserved.
9 *
10 * This software is released under the GNU Lesser General Public License.
11 * (Version 2.1, February 1999)
12 * Please read the following detail of the licence (in japanese).
13 *
14 * ◆使用許諾条件◆
15 *
16 * このソフトウェアは株式会社ハッピーサイズによって開発されました。株式会社ハッ
17 * ピーサイズは、著作権法および万国著作権条約の定めにより、このソフトウェアに関
18 * するすべての権利を留保する権利を持ち、ここに行使します。株式会社ハッピーサイ
19 * ズは以下に明記した条件に従って、このソフトウェアを使用する排他的ではない権利
20 * をお客様に許諾します。何人たりとも、以下の条件に反してこのソフトウェアを使用
21 * することはできません。
22 *
23 * このソフトウェアを「GNU Lesser General Public License (Version 2.1, February
24 * 1999)」に示された条件で使用することを、全ての方に許諾します。「GNU Lesser
25 * General Public License」を満たさない使用には、株式会社ハッピーサイズから書面
26 * による許諾を得る必要があります。
27 *
28 * 「GNU Lesser General Public License」の全文は以下のウェブページから取得でき
29 * ます。「GNU Lesser General Public License」とは、これまでLibrary General
30 * Public Licenseと呼ばれていたものです。
31 * http://www.gnu.org/ --- GNUウェブサイト
32 * http://www.gnu.org/copyleft/lesser.html --- ライセンス文面
33 * このライセンスの内容がわからない方、守れない方には使用を許諾しません。
34 *
35 * しかしながら、当社とGNUプロジェクトとの特定の関係を示唆または主張するもので
36 * はありません。
37 *
38 * ◆保証内容◆
39 *
40 * このソフトウェアは、期待された動作・機能・性能を持つことを目標として設計され
41 * 開発されていますが、これを保証するものではありません。このソフトウェアは「こ
42 * のまま」の状態で提供されており、たとえばこのソフトウェアの有用性ないし特定の
43 * 目的に合致することといった、何らかの保証内容が、明示されたり暗黙に示されてい
44 * る場合であっても、その保証は無効です。このソフトウェアを使用した結果ないし使
45 * 用しなかった結果によって、直接あるいは間接に受けた身体的な傷害、財産上の損害
46 * 、データの損失あるいはその他の全ての損害については、その損害の可能性が使用者
47 * 、当社あるいは第三者によって警告されていた場合であっても、当社はその損害の賠
48 * 償および補填を行いません。この規定は他の全ての、書面上または書面に無い保証・
49 * 契約・規定に優先します。
50 *
51 * ◆著作権者の連絡先および使用条件についての問い合わせ先◆
52 *
53 * 〒102-0073
54 * 東京都千代田区九段北1-13-5日本地所第一ビル4F
55 * 株式会社ハッピーサイズ
56 * Phone: 03-3512-3655, Fax: 03-3512-3656
57 * Email: sales@happysize.co.jp
58 * Web: http://happysize.com/
59 *
60 * ◆著者◆
61 *
62 * 金本 茂 <sgk@happysize.co.jp>
63 *
64 * ◆履歴◆
65 *
66 * 1998/11/10 sgk implementation in C++
67 * 1999/4/25 sgk Cで書きなおし。
68 * 1999/4/26 sgk 入力フィルタを実装。漢字コードを推定しながらフィルタを追加。
69 * 1999/6/?? Unicodeサポート。
70 * 1999/6/22 sgk ライセンスをLGPLに変更。
71 *
72 */
73
74 /*
75 * Unicode support
76 *
77 * Portions copyright (c) 1999,2000,2001 by the PHP3 internationalization team.
78 * All rights reserved.
79 *
80 */
81
82 #include <stddef.h>
83 #include <string.h>
84
85 #include "mbfilter.h"
86 #include "mbfl_filter_output.h"
87 #include "mbfilter_8bit.h"
88 #include "mbfilter_wchar.h"
89 #include "mbstring.h"
90 #include "php_unicode.h"
91 #include "filters/mbfilter_base64.h"
92 #include "filters/mbfilter_qprint.h"
93 #include "filters/mbfilter_singlebyte.h"
94 #include "filters/mbfilter_utf8.h"
95
96 #include "rare_cp_bitvec.h"
97
98 /*
99 * buffering converter
100 */
101 mbfl_buffer_converter *
mbfl_buffer_converter_new(const mbfl_encoding * from,const mbfl_encoding * to,size_t buf_initsz)102 mbfl_buffer_converter_new(
103 const mbfl_encoding *from,
104 const mbfl_encoding *to,
105 size_t buf_initsz)
106 {
107 mbfl_buffer_converter *convd = emalloc(sizeof(mbfl_buffer_converter));
108 convd->to = to;
109
110 /* create convert filter */
111 convd->filter1 = NULL;
112 convd->filter2 = NULL;
113 if (mbfl_convert_filter_get_vtbl(from, to) != NULL) {
114 convd->filter1 = mbfl_convert_filter_new(from, to, mbfl_memory_device_output, NULL, &convd->device);
115 } else {
116 convd->filter2 = mbfl_convert_filter_new(&mbfl_encoding_wchar, to, mbfl_memory_device_output, NULL, &convd->device);
117 if (convd->filter2 != NULL) {
118 convd->filter1 = mbfl_convert_filter_new(from,
119 &mbfl_encoding_wchar,
120 (output_function_t)convd->filter2->filter_function,
121 (flush_function_t)convd->filter2->filter_flush,
122 convd->filter2);
123 if (convd->filter1 == NULL) {
124 mbfl_convert_filter_delete(convd->filter2);
125 }
126 }
127 }
128 if (convd->filter1 == NULL) {
129 efree(convd);
130 return NULL;
131 }
132
133 mbfl_memory_device_init(&convd->device, buf_initsz, buf_initsz/4);
134
135 return convd;
136 }
137
/*
 * Destroy a converter obtained from mbfl_buffer_converter_new(): delete the
 * first filter, delete the second filter if one was chained, clear the
 * output memory device and finally free the converter structure itself.
 */
void mbfl_buffer_converter_delete(mbfl_buffer_converter *convd)
{
	mbfl_convert_filter_delete(convd->filter1);

	/* filter2 only exists when the conversion goes through wchar */
	if (convd->filter2 != NULL) {
		mbfl_convert_filter_delete(convd->filter2);
	}

	mbfl_memory_device_clear(&convd->device);
	efree(convd);
}
147
/*
 * Set the illegal-character handling mode of the converter.
 *
 * The setting is stored on the second filter when the converter is a
 * two-stage chain, and on the first filter otherwise.
 */
void mbfl_buffer_converter_illegal_mode(mbfl_buffer_converter *convd, int mode)
{
	mbfl_convert_filter *target = convd->filter2 ? convd->filter2 : convd->filter1;

	target->illegal_mode = mode;
}
156
/*
 * Set the substitution character stored in the converter's filter.
 *
 * The setting is stored on the second filter when the converter is a
 * two-stage chain, and on the first filter otherwise.
 */
void mbfl_buffer_converter_illegal_substchar(mbfl_buffer_converter *convd, int substchar)
{
	mbfl_convert_filter *target = convd->filter2 ? convd->filter2 : convd->filter1;

	target->illegal_substchar = substchar;
}
165
/*
 * Push the bytes of `string` through the converter's first filter.
 *
 * The output device is resized up front via mbfl_memory_device_realloc().
 * Feeding stops early as soon as the filter function returns a negative
 * value.
 *
 * Returns the number of input bytes handed to the filter; the byte on which
 * the filter reported failure is included in that count.
 */
size_t mbfl_buffer_converter_feed(mbfl_buffer_converter *convd, mbfl_string *string)
{
	ZEND_ASSERT(convd);
	ZEND_ASSERT(string);

	mbfl_memory_device_realloc(&convd->device, convd->device.pos + string->len, string->len/4);

	unsigned char *cursor = string->val;
	mbfl_convert_filter *filter = convd->filter1;

	if (filter != NULL) {
		for (size_t left = string->len; left > 0; left--) {
			if ((*filter->filter_function)(*cursor++, filter) < 0) {
				break;
			}
		}
	}

	return cursor - string->val;
}
191
/*
 * Invoke mbfl_convert_filter_flush() on the converter's first (input-side)
 * filter.
 */
void mbfl_buffer_converter_flush(mbfl_buffer_converter *convd)
{
	mbfl_convert_filter *entry = convd->filter1;

	mbfl_convert_filter_flush(entry);
}
196
/*
 * Hand the converted data accumulated so far over to `result`.
 *
 * `result->encoding` is set to the converter's target encoding and the
 * string is then filled in by mbfl_memory_device_result(), whose return
 * value is passed through to the caller.
 */
mbfl_string* mbfl_buffer_converter_result(mbfl_buffer_converter *convd, mbfl_string *result)
{
	mbfl_memory_device *out = &convd->device;

	result->encoding = convd->to;
	return mbfl_memory_device_result(out, result);
}
202
/*
 * Convenience wrapper: feed `string` into the converter, flush the first
 * filter and return the converted output in `result`.
 *
 * The number of bytes consumed by the feed step is not reported.
 */
mbfl_string* mbfl_buffer_converter_feed_result(mbfl_buffer_converter *convd, mbfl_string *string, mbfl_string *result)
{
	(void) mbfl_buffer_converter_feed(convd, string);
	mbfl_buffer_converter_flush(convd);
	return mbfl_buffer_converter_result(convd, result);
}
210
/*
 * Return the number of illegal characters counted by the converter: the
 * first filter's counter, plus the second filter's counter when the
 * converter is a two-stage chain.
 */
size_t mbfl_buffer_illegalchars(mbfl_buffer_converter *convd)
{
	size_t total = convd->filter1->num_illegalchar;

	if (convd->filter2 != NULL) {
		total += convd->filter2->num_illegalchar;
	}
	return total;
}
221
222 /*
223 * encoding detector
224 */
mbfl_estimate_encoding_likelihood(int input_cp,void * void_data)225 static int mbfl_estimate_encoding_likelihood(int input_cp, void *void_data)
226 {
227 mbfl_encoding_detector_data *data = void_data;
228 unsigned int c = input_cp;
229
230 /* Receive wchars decoded from input string using candidate encoding.
231 * If the string was invalid in the candidate encoding, we assume
232 * it's the wrong one. Otherwise, give the candidate many 'demerits'
233 * for each 'rare' codepoint found, a smaller number for each ASCII
234 * punctuation character, and 1 for all other codepoints.
235 *
236 * The 'common' codepoints should cover the vast majority of
237 * codepoints we are likely to see in practice, while only covering
238 * a small minority of the entire Unicode encoding space. Why?
239 * Well, if the test string happens to be valid in an incorrect
240 * candidate encoding, the bogus codepoints which it decodes to will
241 * be more or less random. By treating the majority of codepoints as
242 * 'rare', we ensure that in almost all such cases, the bogus
243 * codepoints will include plenty of 'rares', thus giving the
244 * incorrect candidate encoding lots of demerits. See
245 * common_codepoints.txt for the actual list used.
246 *
247 * So, why give extra demerits for ASCII punctuation characters? It's
248 * because there are some text encodings, like UTF-7, HZ, and ISO-2022,
249 * which deliberately only use bytes in the ASCII range. When
250 * misinterpreted as ASCII/UTF-8, strings in these encodings will
251 * have an unusually high number of ASCII punctuation characters.
252 * So giving extra demerits for such characters will improve
253 * detection accuracy for UTF-7 and similar encodings.
254 *
255 * Finally, why 1 demerit for all other characters? That penalizes
256 * long strings, meaning we will tend to choose a candidate encoding
257 * in which the test string decodes to a smaller number of
258 * codepoints. That prevents single-byte encodings in which almost
259 * every possible input byte decodes to a 'common' codepoint from
260 * being favored too much. */
261 if (c == MBFL_BAD_INPUT) {
262 data->num_illegalchars++;
263 } else if (c > 0xFFFF) {
264 data->score += 40;
265 } else if (c >= 0x21 && c <= 0x2F) {
266 data->score += 6;
267 } else if ((rare_codepoint_bitvec[c >> 5] >> (c & 0x1F)) & 1) {
268 data->score += 30;
269 } else {
270 data->score += 1;
271 }
272 return 0;
273 }
274
mbfl_encoding_detector_new(const mbfl_encoding ** elist,int elistsz,int strict)275 mbfl_encoding_detector *mbfl_encoding_detector_new(const mbfl_encoding **elist, int elistsz, int strict)
276 {
277 if (!elistsz) {
278 return NULL;
279 }
280
281 mbfl_encoding_detector *identd = emalloc(sizeof(mbfl_encoding_detector));
282 identd->filter_list = ecalloc(elistsz, sizeof(mbfl_convert_filter*));
283 identd->filter_data = ecalloc(elistsz, sizeof(mbfl_encoding_detector_data));
284
285 int filter_list_size = 0;
286 for (int i = 0; i < elistsz; i++) {
287 mbfl_convert_filter *filter = mbfl_convert_filter_new(elist[i], &mbfl_encoding_wchar,
288 mbfl_estimate_encoding_likelihood, NULL, &identd->filter_data[filter_list_size]);
289 if (filter) {
290 identd->filter_list[filter_list_size++] = filter;
291 }
292 }
293 identd->filter_list_size = filter_list_size;
294 identd->strict = strict;
295 return identd;
296 }
297
/*
 * Destroy a detector created by mbfl_encoding_detector_new(): delete every
 * candidate filter, then free the filter list, the per-candidate data and
 * the detector itself.
 */
void mbfl_encoding_detector_delete(mbfl_encoding_detector *identd)
{
	int idx = 0;

	while (idx < identd->filter_list_size) {
		mbfl_convert_filter_delete(identd->filter_list[idx]);
		idx++;
	}

	efree(identd->filter_list);
	efree(identd->filter_data);
	efree(identd);
}
307
/*
 * Feed the bytes of `string` to every candidate filter of the detector.
 *
 * In strict mode each candidate encoding that provides a `check` function
 * is first asked to validate the whole string; a failed check marks the
 * candidate as having illegal characters.
 *
 * The string is then pushed byte by byte through every candidate which has
 * no illegal characters recorded. In non-strict mode the function returns 1
 * early, without flushing, once the number of candidates that became
 * invalid during this call reaches (number of candidates - 1) or more.
 *
 * Otherwise every filter is flushed after the last byte and 0 is returned.
 */
int mbfl_encoding_detector_feed(mbfl_encoding_detector *identd, mbfl_string *string)
{
	const int num = identd->filter_list_size;
	unsigned char *p = string->val;
	size_t remaining = string->len;
	int bad = 0; /* candidates which turned invalid while feeding */

	if (identd->strict) {
		for (int i = 0; i < num; i++) {
			mbfl_convert_filter *filter = identd->filter_list[i];
			if (filter->from->check != NULL && !(filter->from->check)(p, remaining)) {
				identd->filter_data[i].num_illegalchars++;
			}
		}
	}

	for (; remaining > 0; remaining--, p++) {
		for (int i = 0; i < num; i++) {
			mbfl_encoding_detector_data *data = &identd->filter_data[i];
			if (data->num_illegalchars) {
				/* this candidate is already ruled out */
				continue;
			}

			mbfl_convert_filter *filter = identd->filter_list[i];
			(*filter->filter_function)(*p, filter);
			if (data->num_illegalchars) {
				bad++;
			}
		}

		if (!identd->strict && bad >= num - 1) {
			return 1;
		}
	}

	for (int i = 0; i < num; i++) {
		mbfl_convert_filter *filter = identd->filter_list[i];
		(filter->filter_flush)(filter);
	}

	return 0;
}
349
mbfl_encoding_detector_judge(mbfl_encoding_detector * identd)350 const mbfl_encoding *mbfl_encoding_detector_judge(mbfl_encoding_detector *identd)
351 {
352 size_t best_score = SIZE_MAX; /* Low score is 'better' */
353 const mbfl_encoding *enc = NULL;
354
355 for (int i = 0; i < identd->filter_list_size; i++) {
356 mbfl_convert_filter *filter = identd->filter_list[i];
357 mbfl_encoding_detector_data *data = &identd->filter_data[i];
358 if (!data->num_illegalchars && data->score < best_score) {
359 enc = filter->from;
360 best_score = data->score;
361 }
362 }
363
364 return enc;
365 }
366
367 /*
368 * encoding converter
369 */
370 mbfl_string *
mbfl_convert_encoding(mbfl_string * string,mbfl_string * result,const mbfl_encoding * toenc)371 mbfl_convert_encoding(
372 mbfl_string *string,
373 mbfl_string *result,
374 const mbfl_encoding *toenc)
375 {
376 size_t n;
377 unsigned char *p;
378 mbfl_memory_device device;
379 mbfl_convert_filter *filter1 = NULL;
380 mbfl_convert_filter *filter2 = NULL;
381
382 /* initialize */
383 if (mbfl_convert_filter_get_vtbl(string->encoding, toenc) != NULL) {
384 filter1 = mbfl_convert_filter_new(string->encoding, toenc, mbfl_memory_device_output, 0, &device);
385 } else {
386 filter2 = mbfl_convert_filter_new(&mbfl_encoding_wchar, toenc, mbfl_memory_device_output, 0, &device);
387 if (filter2 != NULL) {
388 filter1 = mbfl_convert_filter_new(string->encoding, &mbfl_encoding_wchar, (int (*)(int, void*))filter2->filter_function, NULL, filter2);
389 if (filter1 == NULL) {
390 mbfl_convert_filter_delete(filter2);
391 }
392 }
393 }
394 if (filter1 == NULL) {
395 return NULL;
396 }
397
398 if (filter2 != NULL) {
399 filter2->illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
400 filter2->illegal_substchar = 0x3f; /* '?' */
401 }
402
403 mbfl_memory_device_init(&device, string->len, (string->len >> 2) + 8);
404
405 /* feed data */
406 n = string->len;
407 p = string->val;
408 if (p != NULL) {
409 while (n > 0) {
410 if ((*filter1->filter_function)(*p++, filter1) < 0) {
411 break;
412 }
413 n--;
414 }
415 }
416
417 mbfl_convert_filter_flush(filter1);
418 mbfl_convert_filter_delete(filter1);
419 if (filter2 != NULL) {
420 mbfl_convert_filter_flush(filter2);
421 mbfl_convert_filter_delete(filter2);
422 }
423
424 return mbfl_memory_device_result(&device, result);
425 }
426
427 /*
428 * identify encoding
429 */
/*
 * One-shot encoding detection: build a detector for the candidates in
 * `elist`, feed it `string`, ask it for a verdict and dispose of it again.
 *
 * Returns the detected encoding, or NULL when elistsz is 0 or no candidate
 * qualifies.
 */
const mbfl_encoding *mbfl_identify_encoding(mbfl_string *string, const mbfl_encoding **elist, int elistsz, int strict)
{
	const mbfl_encoding *detected = NULL;

	if (elistsz != 0) {
		mbfl_encoding_detector *identd = mbfl_encoding_detector_new(elist, elistsz, strict);

		mbfl_encoding_detector_feed(identd, string);
		detected = mbfl_encoding_detector_judge(identd);
		mbfl_encoding_detector_delete(identd);
	}

	return detected;
}
441
442 /*
443 * strlen
444 */
mbfl_strlen(const mbfl_string * string)445 size_t mbfl_strlen(const mbfl_string *string)
446 {
447 size_t len = 0;
448 const mbfl_encoding *encoding = string->encoding;
449
450 if (encoding->flag & MBFL_ENCTYPE_SBCS) {
451 len = string->len;
452 } else if (encoding->flag & MBFL_ENCTYPE_WCS2) {
453 len = string->len/2;
454 } else if (encoding->flag & MBFL_ENCTYPE_WCS4) {
455 len = string->len/4;
456 } else if (encoding->mblen_table) {
457 const unsigned char *mbtab = encoding->mblen_table;
458 unsigned char *p = string->val, *e = p + string->len;
459 while (p < e) {
460 p += mbtab[*p];
461 len++;
462 }
463 } else {
464 uint32_t wchar_buf[128];
465 unsigned char *in = string->val;
466 size_t in_len = string->len;
467 unsigned int state = 0;
468
469 while (in_len) {
470 len += encoding->to_wchar(&in, &in_len, wchar_buf, 128, &state);
471 }
472 }
473
474 return len;
475 }
476
477
478 /*
479 * strpos
480 */
481 struct collector_strpos_data {
482 mbfl_convert_filter *next_filter;
483 mbfl_wchar_device needle;
484 size_t needle_len;
485 size_t start;
486 size_t output;
487 size_t found_pos;
488 size_t needle_pos;
489 size_t matched_pos;
490 };
491
492 static int
collector_strpos(int c,void * data)493 collector_strpos(int c, void* data)
494 {
495 int *p, *h, *m;
496 ssize_t n;
497 struct collector_strpos_data *pc = (struct collector_strpos_data*)data;
498
499 if (pc->output >= pc->start) {
500 if (c == (int)pc->needle.buffer[pc->needle_pos]) {
501 if (pc->needle_pos == 0) {
502 pc->found_pos = pc->output; /* found position */
503 }
504 pc->needle_pos++; /* needle pointer */
505 if (pc->needle_pos >= pc->needle_len) {
506 pc->matched_pos = pc->found_pos; /* matched position */
507 pc->needle_pos--;
508 goto retry;
509 }
510 } else if (pc->needle_pos != 0) {
511 retry:
512 h = (int *)pc->needle.buffer;
513 h++;
514 for (;;) {
515 pc->found_pos++;
516 p = h;
517 m = (int *)pc->needle.buffer;
518 n = pc->needle_pos - 1;
519 while (n > 0 && *p == *m) {
520 n--;
521 p++;
522 m++;
523 }
524 if (n <= 0) {
525 if (*m != c) {
526 pc->needle_pos = 0;
527 }
528 break;
529 } else {
530 h++;
531 pc->needle_pos--;
532 }
533 }
534 }
535 }
536
537 pc->output++;
538 return 0;
539 }
540
mbfl_find_offset_utf8(const unsigned char * str,const unsigned char * end,ssize_t offset)541 static const unsigned char *mbfl_find_offset_utf8(
542 const unsigned char *str, const unsigned char *end, ssize_t offset) {
543 if (offset < 0) {
544 const unsigned char *pos = end;
545 while (offset < 0) {
546 if (pos <= str) {
547 return NULL;
548 }
549
550 unsigned char c = *(--pos);
551 if (c < 0x80) {
552 ++offset;
553 } else if ((c & 0xc0) != 0x80) {
554 ++offset;
555 }
556 }
557 return pos;
558 } else {
559 const unsigned char *u8_tbl = mbfl_encoding_utf8.mblen_table;
560 const unsigned char *pos = str;
561 while (offset-- > 0) {
562 if (pos >= end) {
563 return NULL;
564 }
565 pos += u8_tbl[*pos];
566 }
567 return pos;
568 }
569 }
570
/*
 * Convert a byte pointer into a character offset: count the characters
 * which start inside [start, pos) of a UTF-8 buffer. Every byte that is not
 * a continuation byte (10xxxxxx) counts as the start of one character.
 *
 * Returns 0 when `pos` does not lie beyond `start`.
 */
static size_t mbfl_pointer_to_offset_utf8(const unsigned char *start, const unsigned char *pos) {
	size_t chars = 0;

	for (const unsigned char *p = start; p < pos; p++) {
		if ((*p & 0xc0) != 0x80) {
			chars++;
		}
	}
	return chars;
}
583
584 size_t
mbfl_strpos(mbfl_string * haystack,mbfl_string * needle,ssize_t offset,int reverse)585 mbfl_strpos(
586 mbfl_string *haystack,
587 mbfl_string *needle,
588 ssize_t offset,
589 int reverse)
590 {
591 size_t result;
592 mbfl_string _haystack_u8, _needle_u8;
593 const mbfl_string *haystack_u8, *needle_u8 = NULL;
594 const unsigned char *offset_pointer;
595
596 if (haystack->encoding->no_encoding != mbfl_no_encoding_utf8) {
597 mbfl_string_init_set(&_haystack_u8, haystack->encoding);
598 haystack_u8 = mbfl_convert_encoding(haystack, &_haystack_u8, &mbfl_encoding_utf8);
599 if (haystack_u8 == NULL) {
600 result = MBFL_ERROR_ENCODING;
601 goto out;
602 }
603 } else {
604 haystack_u8 = haystack;
605 }
606
607 if (needle->encoding->no_encoding != mbfl_no_encoding_utf8) {
608 mbfl_string_init_set(&_needle_u8, needle->encoding);
609 needle_u8 = mbfl_convert_encoding(needle, &_needle_u8, &mbfl_encoding_utf8);
610 if (needle_u8 == NULL) {
611 result = MBFL_ERROR_ENCODING;
612 goto out;
613 }
614 } else {
615 needle_u8 = needle;
616 }
617
618 offset_pointer = mbfl_find_offset_utf8(
619 haystack_u8->val, haystack_u8->val + haystack_u8->len, offset);
620 if (!offset_pointer) {
621 result = MBFL_ERROR_OFFSET;
622 goto out;
623 }
624
625 result = MBFL_ERROR_NOT_FOUND;
626 if (haystack_u8->len < needle_u8->len) {
627 goto out;
628 }
629
630 const char *found_pos;
631 if (!reverse) {
632 found_pos = zend_memnstr(
633 (const char *) offset_pointer,
634 (const char *) needle_u8->val, needle_u8->len,
635 (const char *) haystack_u8->val + haystack_u8->len);
636 } else {
637 if (offset >= 0) {
638 found_pos = zend_memnrstr(
639 (const char *) offset_pointer,
640 (const char *) needle_u8->val, needle_u8->len,
641 (const char *) haystack_u8->val + haystack_u8->len);
642 } else {
643 size_t needle_len = mbfl_strlen(needle_u8);
644 offset_pointer = mbfl_find_offset_utf8(
645 offset_pointer, haystack_u8->val + haystack_u8->len, needle_len);
646 if (!offset_pointer) {
647 offset_pointer = haystack_u8->val + haystack_u8->len;
648 }
649
650 found_pos = zend_memnrstr(
651 (const char *) haystack_u8->val,
652 (const char *) needle_u8->val, needle_u8->len,
653 (const char *) offset_pointer);
654 }
655 }
656
657 if (found_pos) {
658 result = mbfl_pointer_to_offset_utf8(haystack_u8->val, (const unsigned char *) found_pos);
659 }
660
661 out:
662 if (haystack_u8 == &_haystack_u8) {
663 mbfl_string_clear(&_haystack_u8);
664 }
665 if (needle_u8 == &_needle_u8) {
666 mbfl_string_clear(&_needle_u8);
667 }
668 return result;
669 }
670
671 /*
672 * substr_count
673 */
674
675 size_t
mbfl_substr_count(mbfl_string * haystack,mbfl_string * needle)676 mbfl_substr_count(
677 mbfl_string *haystack,
678 mbfl_string *needle
679 )
680 {
681 size_t n, result = 0;
682 unsigned char *p;
683 mbfl_convert_filter *filter;
684 struct collector_strpos_data pc;
685
686 /* needle is converted into wchar */
687 mbfl_wchar_device_init(&pc.needle);
688 filter = mbfl_convert_filter_new(
689 needle->encoding,
690 &mbfl_encoding_wchar,
691 mbfl_wchar_device_output, 0, &pc.needle);
692 ZEND_ASSERT(filter);
693 mbfl_convert_filter_feed_string(filter, needle->val, needle->len);
694 mbfl_convert_filter_flush(filter);
695 mbfl_convert_filter_delete(filter);
696 pc.needle_len = pc.needle.pos;
697 if (pc.needle.buffer == NULL) {
698 return MBFL_ERROR_ENCODING;
699 }
700 if (pc.needle_len == 0) {
701 mbfl_wchar_device_clear(&pc.needle);
702 return MBFL_ERROR_EMPTY;
703 }
704 /* initialize filter and collector data */
705 filter = mbfl_convert_filter_new(
706 haystack->encoding,
707 &mbfl_encoding_wchar,
708 collector_strpos, 0, &pc);
709 ZEND_ASSERT(filter);
710 pc.start = 0;
711 pc.output = 0;
712 pc.needle_pos = 0;
713 pc.found_pos = 0;
714 pc.matched_pos = MBFL_ERROR_NOT_FOUND;
715
716 /* feed data */
717 p = haystack->val;
718 n = haystack->len;
719 if (p != NULL) {
720 while (n > 0) {
721 if ((*filter->filter_function)(*p++, filter) < 0) {
722 pc.matched_pos = MBFL_ERROR_ENCODING;
723 break;
724 }
725 if (pc.matched_pos != MBFL_ERROR_NOT_FOUND) {
726 ++result;
727 pc.matched_pos = MBFL_ERROR_NOT_FOUND;
728 pc.needle_pos = 0;
729 }
730 n--;
731 }
732 }
733 mbfl_convert_filter_flush(filter);
734 mbfl_convert_filter_delete(filter);
735 mbfl_wchar_device_clear(&pc.needle);
736
737 return result;
738 }
739
740 /*
741 * substr
742 */
743 struct collector_substr_data {
744 mbfl_convert_filter *next_filter;
745 size_t start;
746 size_t stop;
747 size_t output;
748 };
749
750 static int
collector_substr(int c,void * data)751 collector_substr(int c, void* data)
752 {
753 struct collector_substr_data *pc = (struct collector_substr_data*)data;
754
755 if (pc->output >= pc->stop) {
756 return -1;
757 }
758
759 if (pc->output >= pc->start) {
760 (*pc->next_filter->filter_function)(c, pc->next_filter);
761 }
762
763 pc->output++;
764
765 return 0;
766 }
767
768 mbfl_string *
mbfl_substr(mbfl_string * string,mbfl_string * result,size_t from,size_t length)769 mbfl_substr(
770 mbfl_string *string,
771 mbfl_string *result,
772 size_t from,
773 size_t length)
774 {
775 const mbfl_encoding *encoding = string->encoding;
776 size_t n, k, len, start, end;
777 unsigned m;
778 unsigned char *p, *w;
779
780 mbfl_string_init(result);
781 result->encoding = string->encoding;
782
783 if ((encoding->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4)) ||
784 encoding->mblen_table != NULL) {
785 len = string->len;
786 if (encoding->flag & MBFL_ENCTYPE_SBCS) {
787 start = from;
788 } else if (encoding->flag & MBFL_ENCTYPE_WCS2) {
789 start = from*2;
790 } else if (encoding->flag & MBFL_ENCTYPE_WCS4) {
791 start = from*4;
792 } else {
793 const unsigned char *mbtab = encoding->mblen_table;
794 start = 0;
795 n = 0;
796 k = 0;
797 p = string->val;
798 /* search start position */
799 while (k <= from) {
800 start = n;
801 if (n >= len) {
802 break;
803 }
804 m = mbtab[*p];
805 n += m;
806 p += m;
807 k++;
808 }
809 }
810
811 if (length == MBFL_SUBSTR_UNTIL_END) {
812 end = len;
813 } else if (encoding->flag & MBFL_ENCTYPE_SBCS) {
814 end = start + length;
815 } else if (encoding->flag & MBFL_ENCTYPE_WCS2) {
816 end = start + length*2;
817 } else if (encoding->flag & MBFL_ENCTYPE_WCS4) {
818 end = start + length*4;
819 } else {
820 const unsigned char *mbtab = encoding->mblen_table;
821 end = start;
822 n = start;
823 k = 0;
824 p = string->val + start;
825 /* detect end position */
826 while (k <= length) {
827 end = n;
828 if (n >= len) {
829 break;
830 }
831 m = mbtab[*p];
832 n += m;
833 p += m;
834 k++;
835 }
836 }
837
838 if (start > len) {
839 start = len;
840 }
841 if (end > len) {
842 end = len;
843 }
844 if (start > end) {
845 start = end;
846 }
847
848 /* allocate memory and copy */
849 n = end - start;
850 result->len = 0;
851 result->val = w = (unsigned char*)emalloc(n + 1);
852 result->len = n;
853 memcpy(w, string->val + start, n);
854 w[n] = '\0';
855 } else {
856 mbfl_memory_device device;
857 struct collector_substr_data pc;
858 mbfl_convert_filter *decoder;
859 mbfl_convert_filter *encoder;
860
861 if (length == MBFL_SUBSTR_UNTIL_END) {
862 length = mbfl_strlen(string) - from;
863 }
864
865 mbfl_memory_device_init(&device, length + 1, 0);
866 mbfl_string_init(result);
867 result->encoding = string->encoding;
868 /* output code filter */
869 decoder = mbfl_convert_filter_new(
870 &mbfl_encoding_wchar,
871 string->encoding,
872 mbfl_memory_device_output, 0, &device);
873 /* wchar filter */
874 encoder = mbfl_convert_filter_new(
875 string->encoding,
876 &mbfl_encoding_wchar,
877 collector_substr, 0, &pc);
878 if (decoder == NULL || encoder == NULL) {
879 mbfl_convert_filter_delete(encoder);
880 mbfl_convert_filter_delete(decoder);
881 return NULL;
882 }
883 pc.next_filter = decoder;
884 pc.start = from;
885 pc.stop = from + length;
886 pc.output = 0;
887
888 /* feed data */
889 p = string->val;
890 n = string->len;
891 if (p != NULL) {
892 while (n > 0) {
893 if ((*encoder->filter_function)(*p++, encoder) < 0) {
894 break;
895 }
896 n--;
897 }
898 }
899
900 mbfl_convert_filter_flush(encoder);
901 mbfl_convert_filter_flush(decoder);
902 result = mbfl_memory_device_result(&device, result);
903 mbfl_convert_filter_delete(encoder);
904 mbfl_convert_filter_delete(decoder);
905 }
906
907 return result;
908 }
909
910 /*
911 * strcut
912 */
913 mbfl_string *
mbfl_strcut(mbfl_string * string,mbfl_string * result,size_t from,size_t length)914 mbfl_strcut(
915 mbfl_string *string,
916 mbfl_string *result,
917 size_t from,
918 size_t length)
919 {
920 const mbfl_encoding *encoding = string->encoding;
921 mbfl_memory_device device;
922
923 if (from >= string->len) {
924 from = string->len;
925 }
926
927 mbfl_string_init(result);
928 result->encoding = string->encoding;
929
930 if ((encoding->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4)) || encoding->mblen_table != NULL) {
931 const unsigned char *start = NULL;
932 const unsigned char *end = NULL;
933 unsigned char *w;
934 size_t sz;
935
936 if (encoding->flag & MBFL_ENCTYPE_WCS2) {
937 from &= -2;
938
939 if (length >= string->len - from) {
940 length = string->len - from;
941 }
942
943 start = string->val + from;
944 end = start + (length & -2);
945 } else if (encoding->flag & MBFL_ENCTYPE_WCS4) {
946 from &= -4;
947
948 if (length >= string->len - from) {
949 length = string->len - from;
950 }
951
952 start = string->val + from;
953 end = start + (length & -4);
954 } else if ((encoding->flag & MBFL_ENCTYPE_SBCS)) {
955 if (length >= string->len - from) {
956 length = string->len - from;
957 }
958
959 start = string->val + from;
960 end = start + length;
961 } else if (encoding->mblen_table != NULL) {
962 const unsigned char *mbtab = encoding->mblen_table;
963 const unsigned char *p, *q;
964 int m;
965
966 /* search start position */
967 for (m = 0, p = string->val, q = p + from;
968 p < q; p += (m = mbtab[*p]));
969
970 if (p > q) {
971 p -= m;
972 }
973
974 start = p;
975
976 /* search end position */
977 if (length >= string->len - (start - string->val)) {
978 end = string->val + string->len;
979 } else {
980 for (q = p + length; p < q; p += (m = mbtab[*p]));
981
982 if (p > q) {
983 p -= m;
984 }
985 end = p;
986 }
987 } else {
988 /* never reached */
989 return NULL;
990 }
991
992 /* allocate memory and copy string */
993 sz = end - start;
994 w = ecalloc(sz + 8, sizeof(unsigned char));
995
996 memcpy(w, start, sz);
997 w[sz] = '\0';
998 w[sz + 1] = '\0';
999 w[sz + 2] = '\0';
1000 w[sz + 3] = '\0';
1001
1002 result->val = w;
1003 result->len = sz;
1004 } else {
1005 mbfl_convert_filter *encoder = NULL;
1006 mbfl_convert_filter *decoder = NULL;
1007 const unsigned char *p, *q, *r;
1008 struct {
1009 mbfl_convert_filter encoder;
1010 mbfl_convert_filter decoder;
1011 const unsigned char *p;
1012 size_t pos;
1013 } bk, _bk;
1014
1015 /* output code filter */
1016 if (!(decoder = mbfl_convert_filter_new(
1017 &mbfl_encoding_wchar,
1018 string->encoding,
1019 mbfl_memory_device_output, 0, &device))) {
1020 return NULL;
1021 }
1022
1023 /* wchar filter */
1024 if (!(encoder = mbfl_convert_filter_new(
1025 string->encoding,
1026 &mbfl_encoding_wchar,
1027 mbfl_filter_output_null,
1028 NULL, NULL))) {
1029 mbfl_convert_filter_delete(decoder);
1030 return NULL;
1031 }
1032
1033 mbfl_memory_device_init(&device, length + 8, 0);
1034
1035 p = string->val;
1036
1037 /* search start position */
1038 for (q = string->val + from; p < q; p++) {
1039 (*encoder->filter_function)(*p, encoder);
1040 }
1041
1042 /* switch the drain direction */
1043 encoder->output_function = (output_function_t)decoder->filter_function;
1044 encoder->flush_function = (flush_function_t)decoder->filter_flush;
1045 encoder->data = decoder;
1046
1047 q = string->val + string->len;
1048
1049 /* save the encoder, decoder state and the pointer */
1050 mbfl_convert_filter_copy(decoder, &_bk.decoder);
1051 mbfl_convert_filter_copy(encoder, &_bk.encoder);
1052 _bk.p = p;
1053 _bk.pos = device.pos;
1054
1055 if (length > q - p) {
1056 length = q - p;
1057 }
1058
1059 if (length >= 20) {
1060 /* output a little shorter than "length" */
1061 /* XXX: the constant "20" was determined purely on the heuristics. */
1062 for (r = p + length - 20; p < r; p++) {
1063 (*encoder->filter_function)(*p, encoder);
1064 }
1065
1066 /* if the offset of the resulting string exceeds the length,
1067 * then restore the state */
1068 if (device.pos > length) {
1069 p = _bk.p;
1070 device.pos = _bk.pos;
1071 if (decoder->filter_dtor)
1072 decoder->filter_dtor(decoder);
1073 if (encoder->filter_dtor)
1074 encoder->filter_dtor(encoder);
1075 mbfl_convert_filter_copy(&_bk.decoder, decoder);
1076 mbfl_convert_filter_copy(&_bk.encoder, encoder);
1077 bk = _bk;
1078 } else {
1079 /* save the encoder, decoder state and the pointer */
1080 mbfl_convert_filter_copy(decoder, &bk.decoder);
1081 mbfl_convert_filter_copy(encoder, &bk.encoder);
1082 bk.p = p;
1083 bk.pos = device.pos;
1084
1085 /* flush the stream */
1086 (*encoder->filter_flush)(encoder);
1087
1088 /* if the offset of the resulting string exceeds the length,
1089 * then restore the state */
1090 if (device.pos > length) {
1091 if (bk.decoder.filter_dtor)
1092 bk.decoder.filter_dtor(&bk.decoder);
1093 if (bk.encoder.filter_dtor)
1094 bk.encoder.filter_dtor(&bk.encoder);
1095
1096 p = _bk.p;
1097 device.pos = _bk.pos;
1098 if (decoder->filter_dtor)
1099 decoder->filter_dtor(decoder);
1100 if (encoder->filter_dtor)
1101 encoder->filter_dtor(encoder);
1102 mbfl_convert_filter_copy(&_bk.decoder, decoder);
1103 mbfl_convert_filter_copy(&_bk.encoder, encoder);
1104 bk = _bk;
1105 } else {
1106 if (_bk.decoder.filter_dtor)
1107 _bk.decoder.filter_dtor(&_bk.decoder);
1108 if (_bk.encoder.filter_dtor)
1109 _bk.encoder.filter_dtor(&_bk.encoder);
1110
1111 p = bk.p;
1112 device.pos = bk.pos;
1113 if (decoder->filter_dtor)
1114 decoder->filter_dtor(decoder);
1115 if (encoder->filter_dtor)
1116 encoder->filter_dtor(encoder);
1117 mbfl_convert_filter_copy(&bk.decoder, decoder);
1118 mbfl_convert_filter_copy(&bk.encoder, encoder);
1119 }
1120 }
1121 } else {
1122 bk = _bk;
1123 }
1124
1125 /* detect end position */
1126 while (p < q) {
1127 (*encoder->filter_function)(*p, encoder);
1128
1129 if (device.pos > length) {
1130 /* restore filter */
1131 p = bk.p;
1132 device.pos = bk.pos;
1133 if (decoder->filter_dtor)
1134 decoder->filter_dtor(decoder);
1135 if (encoder->filter_dtor)
1136 encoder->filter_dtor(encoder);
1137 mbfl_convert_filter_copy(&bk.decoder, decoder);
1138 mbfl_convert_filter_copy(&bk.encoder, encoder);
1139 break;
1140 }
1141
1142 p++;
1143
1144 /* backup current state */
1145 mbfl_convert_filter_copy(decoder, &_bk.decoder);
1146 mbfl_convert_filter_copy(encoder, &_bk.encoder);
1147 _bk.pos = device.pos;
1148 _bk.p = p;
1149
1150 (*encoder->filter_flush)(encoder);
1151
1152 if (device.pos > length) {
1153 if (_bk.decoder.filter_dtor)
1154 _bk.decoder.filter_dtor(&_bk.decoder);
1155 if (_bk.encoder.filter_dtor)
1156 _bk.encoder.filter_dtor(&_bk.encoder);
1157
1158 /* restore filter */
1159 p = bk.p;
1160 device.pos = bk.pos;
1161 if (decoder->filter_dtor)
1162 decoder->filter_dtor(decoder);
1163 if (encoder->filter_dtor)
1164 encoder->filter_dtor(encoder);
1165 mbfl_convert_filter_copy(&bk.decoder, decoder);
1166 mbfl_convert_filter_copy(&bk.encoder, encoder);
1167 break;
1168 }
1169
1170 if (bk.decoder.filter_dtor)
1171 bk.decoder.filter_dtor(&bk.decoder);
1172 if (bk.encoder.filter_dtor)
1173 bk.encoder.filter_dtor(&bk.encoder);
1174
1175 p = _bk.p;
1176 device.pos = _bk.pos;
1177 if (decoder->filter_dtor)
1178 decoder->filter_dtor(decoder);
1179 if (encoder->filter_dtor)
1180 encoder->filter_dtor(encoder);
1181 mbfl_convert_filter_copy(&_bk.decoder, decoder);
1182 mbfl_convert_filter_copy(&_bk.encoder, encoder);
1183
1184 bk = _bk;
1185 }
1186
1187 decoder->illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
1188 (*encoder->filter_flush)(encoder);
1189
1190 if (bk.decoder.filter_dtor)
1191 bk.decoder.filter_dtor(&bk.decoder);
1192 if (bk.encoder.filter_dtor)
1193 bk.encoder.filter_dtor(&bk.encoder);
1194
1195 result = mbfl_memory_device_result(&device, result);
1196
1197 mbfl_convert_filter_delete(encoder);
1198 mbfl_convert_filter_delete(decoder);
1199 }
1200
1201 return result;
1202 }
1203
1204
1205 /*
1206 * MIME header encode
1207 */
/*
 * Working state of one MIME header encoder instance.  Created by
 * mime_header_encoder_new() and released by mime_header_encoder_delete().
 */
struct mime_header_encoder_data {
	mbfl_convert_filter *conv1_filter;		/* input encoding -> wchar; outputs to mime_header_encoder_collector */
	mbfl_convert_filter *block_filter;		/* wchar -> wchar; outputs to mime_header_encoder_block_collector */
	mbfl_convert_filter *conv2_filter;		/* wchar -> output charset; piped into encod_filter */
	mbfl_convert_filter *conv2_filter_backup;	/* holds a saved copy of conv2_filter during a trial encode */
	mbfl_convert_filter *encod_filter;		/* output charset -> transfer encoding; writes to outdev */
	mbfl_convert_filter *encod_filter_backup;	/* holds a saved copy of encod_filter during a trial encode */
	mbfl_memory_device outdev;			/* the encoded header built so far */
	mbfl_memory_device tmpdev;			/* pending characters not yet committed to outdev */
	int status1;					/* collector state: 0 = no word pending, 1 = plain word pending, 11 = inside an encoded word */
	int status2;					/* block collector state: 1 once the encoded-word prefix has been written */
	size_t prevpos;					/* outdev.pos saved before a trial encode */
	size_t linehead;				/* outdev.pos at which the current output line starts */
	size_t firstindent;				/* extra width counted on the first line; zeroed after the first fold */
	int encnamelen;					/* length of encname, excluding the terminator */
	int lwsplen;					/* length of lwsp, excluding the terminator */
	char encname[128];				/* encoded-word prefix, e.g. "=?ISO-2022-JP?B?" */
	char lwsp[16];					/* folding sequence: line feed followed by a SPACE */
};
1227
1228 static int
mime_header_encoder_block_collector(int c,void * data)1229 mime_header_encoder_block_collector(int c, void *data)
1230 {
1231 size_t n;
1232 struct mime_header_encoder_data *pe = (struct mime_header_encoder_data *)data;
1233
1234 switch (pe->status2) {
1235 case 1: /* encoded word */
1236 pe->prevpos = pe->outdev.pos;
1237 mbfl_convert_filter_copy(pe->conv2_filter, pe->conv2_filter_backup);
1238 mbfl_convert_filter_copy(pe->encod_filter, pe->encod_filter_backup);
1239 (*pe->conv2_filter->filter_function)(c, pe->conv2_filter);
1240 (*pe->conv2_filter->filter_flush)(pe->conv2_filter);
1241 (*pe->encod_filter->filter_flush)(pe->encod_filter);
1242 n = pe->outdev.pos - pe->linehead + pe->firstindent;
1243 pe->outdev.pos = pe->prevpos;
1244 mbfl_convert_filter_copy(pe->conv2_filter_backup, pe->conv2_filter);
1245 mbfl_convert_filter_copy(pe->encod_filter_backup, pe->encod_filter);
1246 if (n >= 74) {
1247 (*pe->conv2_filter->filter_flush)(pe->conv2_filter);
1248 (*pe->encod_filter->filter_flush)(pe->encod_filter);
1249 mbfl_memory_device_strncat(&pe->outdev, "\x3f\x3d", 2); /* ?= */
1250 mbfl_memory_device_strncat(&pe->outdev, pe->lwsp, pe->lwsplen);
1251 pe->linehead = pe->outdev.pos;
1252 pe->firstindent = 0;
1253 mbfl_memory_device_strncat(&pe->outdev, pe->encname, pe->encnamelen);
1254 c = (*pe->conv2_filter->filter_function)(c, pe->conv2_filter);
1255 } else {
1256 c = (*pe->conv2_filter->filter_function)(c, pe->conv2_filter);
1257 }
1258 break;
1259
1260 default:
1261 mbfl_memory_device_strncat(&pe->outdev, pe->encname, pe->encnamelen);
1262 c = (*pe->conv2_filter->filter_function)(c, pe->conv2_filter);
1263 pe->status2 = 1;
1264 break;
1265 }
1266
1267 return 0;
1268 }
1269
1270 static int
mime_header_encoder_collector(int c,void * data)1271 mime_header_encoder_collector(int c, void *data)
1272 {
1273 static int qp_table[256] = {
1274 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x00 */
1275 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x00 */
1276 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x20 */
1277 0, 0, 0, 0, 0, 0, 0 ,0, 0, 0, 0, 0, 0, 1, 0, 1, /* 0x10 */
1278 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x40 */
1279 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, /* 0x50 */
1280 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x60 */
1281 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, /* 0x70 */
1282 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x80 */
1283 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x90 */
1284 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xA0 */
1285 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xB0 */
1286 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xC0 */
1287 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xD0 */
1288 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xE0 */
1289 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 /* 0xF0 */
1290 };
1291
1292 size_t n;
1293 struct mime_header_encoder_data *pe = (struct mime_header_encoder_data *)data;
1294
1295 switch (pe->status1) {
1296 case 11: /* encoded word */
1297 (*pe->block_filter->filter_function)(c, pe->block_filter);
1298 break;
1299
1300 default: /* ASCII */
1301 if (c <= 0x00ff && !qp_table[(c & 0xff)]) { /* ordinary characters */
1302 mbfl_memory_device_output(c, &pe->tmpdev);
1303 pe->status1 = 1;
1304 } else if (pe->status1 == 0 && c == 0x20) { /* repeat SPACE */
1305 mbfl_memory_device_output(c, &pe->tmpdev);
1306 } else {
1307 if (pe->tmpdev.pos < 74 && c == 0x20) {
1308 n = pe->outdev.pos - pe->linehead + pe->tmpdev.pos + pe->firstindent;
1309 if (n > 74) {
1310 mbfl_memory_device_strncat(&pe->outdev, pe->lwsp, pe->lwsplen); /* LWSP */
1311 pe->linehead = pe->outdev.pos;
1312 pe->firstindent = 0;
1313 } else if (pe->outdev.pos > 0) {
1314 mbfl_memory_device_output(0x20, &pe->outdev);
1315 }
1316 mbfl_memory_device_devcat(&pe->outdev, &pe->tmpdev);
1317 mbfl_memory_device_reset(&pe->tmpdev);
1318 pe->status1 = 0;
1319 } else {
1320 n = pe->outdev.pos - pe->linehead + pe->encnamelen + pe->firstindent;
1321 if (n > 60) {
1322 mbfl_memory_device_strncat(&pe->outdev, pe->lwsp, pe->lwsplen); /* LWSP */
1323 pe->linehead = pe->outdev.pos;
1324 pe->firstindent = 0;
1325 } else if (pe->outdev.pos > 0) {
1326 mbfl_memory_device_output(0x20, &pe->outdev);
1327 }
1328 mbfl_convert_filter_devcat(pe->block_filter, &pe->tmpdev);
1329 mbfl_memory_device_reset(&pe->tmpdev);
1330 (*pe->block_filter->filter_function)(c, pe->block_filter);
1331 pe->status1 = 11;
1332 }
1333 }
1334 break;
1335 }
1336
1337 return 0;
1338 }
1339
1340 mbfl_string *
mime_header_encoder_result(struct mime_header_encoder_data * pe,mbfl_string * result)1341 mime_header_encoder_result(struct mime_header_encoder_data *pe, mbfl_string *result)
1342 {
1343 if (pe->status1 >= 10) {
1344 (*pe->conv2_filter->filter_flush)(pe->conv2_filter);
1345 (*pe->encod_filter->filter_flush)(pe->encod_filter);
1346 mbfl_memory_device_strncat(&pe->outdev, "\x3f\x3d", 2); /* ?= */
1347 } else if (pe->tmpdev.pos > 0) {
1348 if (pe->outdev.pos > 0) {
1349 if ((pe->outdev.pos - pe->linehead + pe->tmpdev.pos + pe->firstindent) > 74) {
1350 mbfl_memory_device_strncat(&pe->outdev, pe->lwsp, pe->lwsplen);
1351 } else {
1352 mbfl_memory_device_output(0x20, &pe->outdev);
1353 }
1354 }
1355 mbfl_memory_device_devcat(&pe->outdev, &pe->tmpdev);
1356 }
1357 mbfl_memory_device_reset(&pe->tmpdev);
1358 pe->prevpos = 0;
1359 pe->linehead = 0;
1360 pe->status1 = 0;
1361 pe->status2 = 0;
1362
1363 return mbfl_memory_device_result(&pe->outdev, result);
1364 }
1365
1366 struct mime_header_encoder_data*
mime_header_encoder_new(const mbfl_encoding * incode,const mbfl_encoding * outcode,const mbfl_encoding * transenc)1367 mime_header_encoder_new(
1368 const mbfl_encoding *incode,
1369 const mbfl_encoding *outcode,
1370 const mbfl_encoding *transenc)
1371 {
1372 size_t n;
1373 const char *s;
1374 struct mime_header_encoder_data *pe;
1375
1376 /* get output encoding and check MIME charset name */
1377 if (outcode->mime_name == NULL || outcode->mime_name[0] == '\0') {
1378 return NULL;
1379 }
1380
1381 pe = emalloc(sizeof(struct mime_header_encoder_data));
1382 mbfl_memory_device_init(&pe->outdev, 0, 0);
1383 mbfl_memory_device_init(&pe->tmpdev, 0, 0);
1384 pe->prevpos = 0;
1385 pe->linehead = 0;
1386 pe->firstindent = 0;
1387 pe->status1 = 0;
1388 pe->status2 = 0;
1389
1390 /* make the encoding description string exp. "=?ISO-2022-JP?B?" */
1391 n = 0;
1392 pe->encname[n++] = 0x3d;
1393 pe->encname[n++] = 0x3f;
1394 s = outcode->mime_name;
1395 while (*s) {
1396 pe->encname[n++] = *s++;
1397 }
1398 pe->encname[n++] = 0x3f;
1399 if (transenc->no_encoding == mbfl_no_encoding_qprint) {
1400 pe->encname[n++] = 0x51;
1401 } else {
1402 pe->encname[n++] = 0x42;
1403 transenc = &mbfl_encoding_base64;
1404 }
1405 pe->encname[n++] = 0x3f;
1406 pe->encname[n] = '\0';
1407 pe->encnamelen = n;
1408
1409 n = 0;
1410 pe->lwsp[n++] = 0x0d;
1411 pe->lwsp[n++] = 0x0a;
1412 pe->lwsp[n++] = 0x20;
1413 pe->lwsp[n] = '\0';
1414 pe->lwsplen = n;
1415
1416 /* transfer encode filter */
1417 pe->encod_filter = mbfl_convert_filter_new(outcode, transenc, mbfl_memory_device_output, 0, &(pe->outdev));
1418 pe->encod_filter_backup = mbfl_convert_filter_new(outcode, transenc, mbfl_memory_device_output, 0, &(pe->outdev));
1419
1420 /* Output code filter */
1421 pe->conv2_filter = mbfl_convert_filter_new(&mbfl_encoding_wchar, outcode, mbfl_filter_output_pipe, 0, pe->encod_filter);
1422 pe->conv2_filter_backup = mbfl_convert_filter_new(&mbfl_encoding_wchar, outcode, mbfl_filter_output_pipe, 0, pe->encod_filter);
1423
1424 /* encoded block filter */
1425 pe->block_filter = mbfl_convert_filter_new(&mbfl_encoding_wchar, &mbfl_encoding_wchar, mime_header_encoder_block_collector, 0, pe);
1426
1427 /* Input code filter */
1428 pe->conv1_filter = mbfl_convert_filter_new(incode, &mbfl_encoding_wchar, mime_header_encoder_collector, 0, pe);
1429
1430 if (pe->encod_filter == NULL ||
1431 pe->encod_filter_backup == NULL ||
1432 pe->conv2_filter == NULL ||
1433 pe->conv2_filter_backup == NULL ||
1434 pe->conv1_filter == NULL) {
1435 mime_header_encoder_delete(pe);
1436 return NULL;
1437 }
1438
1439 if (transenc->no_encoding == mbfl_no_encoding_qprint) {
1440 pe->encod_filter->status |= MBFL_QPRINT_STS_MIME_HEADER;
1441 pe->encod_filter_backup->status |= MBFL_QPRINT_STS_MIME_HEADER;
1442 } else {
1443 pe->encod_filter->status |= MBFL_BASE64_STS_MIME_HEADER;
1444 pe->encod_filter_backup->status |= MBFL_BASE64_STS_MIME_HEADER;
1445 }
1446
1447 return pe;
1448 }
1449
1450 void
mime_header_encoder_delete(struct mime_header_encoder_data * pe)1451 mime_header_encoder_delete(struct mime_header_encoder_data *pe)
1452 {
1453 if (pe) {
1454 mbfl_convert_filter_delete(pe->conv1_filter);
1455 mbfl_convert_filter_delete(pe->block_filter);
1456 mbfl_convert_filter_delete(pe->conv2_filter);
1457 mbfl_convert_filter_delete(pe->conv2_filter_backup);
1458 mbfl_convert_filter_delete(pe->encod_filter);
1459 mbfl_convert_filter_delete(pe->encod_filter_backup);
1460 mbfl_memory_device_clear(&pe->outdev);
1461 mbfl_memory_device_clear(&pe->tmpdev);
1462 efree((void*)pe);
1463 }
1464 }
1465
1466 mbfl_string *
mbfl_mime_header_encode(mbfl_string * string,mbfl_string * result,const mbfl_encoding * outcode,const mbfl_encoding * encoding,const char * linefeed,int indent)1467 mbfl_mime_header_encode(
1468 mbfl_string *string,
1469 mbfl_string *result,
1470 const mbfl_encoding *outcode,
1471 const mbfl_encoding *encoding,
1472 const char *linefeed,
1473 int indent)
1474 {
1475 size_t n;
1476 unsigned char *p;
1477 struct mime_header_encoder_data *pe;
1478
1479 mbfl_string_init(result);
1480 result->encoding = &mbfl_encoding_ascii;
1481
1482 pe = mime_header_encoder_new(string->encoding, outcode, encoding);
1483 if (pe == NULL) {
1484 return NULL;
1485 }
1486
1487 if (linefeed != NULL) {
1488 n = 0;
1489 while (*linefeed && n < 8) {
1490 pe->lwsp[n++] = *linefeed++;
1491 }
1492 pe->lwsp[n++] = 0x20;
1493 pe->lwsp[n] = '\0';
1494 pe->lwsplen = n;
1495 }
1496 if (indent > 0 && indent < 74) {
1497 pe->firstindent = indent;
1498 }
1499
1500 n = string->len;
1501 p = string->val;
1502 while (n > 0) {
1503 (*pe->conv1_filter->filter_function)(*p++, pe->conv1_filter);
1504 n--;
1505 }
1506
1507 result = mime_header_encoder_result(pe, result);
1508 mime_header_encoder_delete(pe);
1509
1510 return result;
1511 }
1512
1513
1514 /*
1515 * MIME header decode
1516 */
/*
 * Working state of one MIME header decoder instance.  Created by
 * mime_header_decoder_new() and released by mime_header_decoder_delete().
 */
struct mime_header_decoder_data {
	mbfl_convert_filter *deco_filter;	/* transfer decoding (encoding -> 8bit); piped into conv1_filter */
	mbfl_convert_filter *conv1_filter;	/* incode -> wchar; piped into conv2_filter */
	mbfl_convert_filter *conv2_filter;	/* wchar -> outcode; writes to outdev */
	mbfl_memory_device outdev;		/* the decoded output */
	mbfl_memory_device tmpdev;		/* characters held back while a possible encoded word is examined */
	size_t cspos;				/* offset in tmpdev at which the charset name starts */
	int status;				/* state of mime_header_decoder_collector */
	const mbfl_encoding *encoding;		/* transfer encoding of the current encoded word (initially 8bit) */
	const mbfl_encoding *incode;		/* charset of the current encoded word (initially ASCII) */
	const mbfl_encoding *outcode;		/* encoding of the decoded output */
};
1529
/*
 * Feed one input byte to the MIME header decoder state machine.
 *
 * Characters that might start an encoded word ("=?charset?X?...?=") are
 * held back in tmpdev.  Once the prefix is recognized the held-back bytes
 * are discarded and the payload goes through deco_filter; if recognition
 * fails, the held-back bytes are passed on unchanged through conv1_filter.
 *
 * pd->status values:
 *   0 (default)  plain, non-encoded text
 *   1            a '=' has been seen
 *   2            "=?" seen, collecting the charset name
 *   3            charset recognized, expecting the encoding letter (B or Q)
 *   4            encoding letter seen, expecting the '?' before the payload
 *   5            inside the encoded payload
 *   6            '?' seen inside the payload, checking for the closing '='
 *   7            just after an encoded word
 *   8            line break seen after an encoded word
 *   9            line break seen in non-encoded text
 *
 * c    - input byte
 * data - the struct mime_header_decoder_data of this decoder
 *
 * Always returns 0.
 */
static int
mime_header_decoder_collector(int c, void* data)
{
	const mbfl_encoding *encoding;
	struct mime_header_decoder_data *pd = (struct mime_header_decoder_data*)data;

	switch (pd->status) {
	case 1:		/* previous character was '=' */
		if (c == 0x3f) {		/* ? */
			mbfl_memory_device_output(c, &pd->tmpdev);
			/* the charset name starts right after the '?' */
			pd->cspos = pd->tmpdev.pos;
			pd->status = 2;
		} else {
			/* not an encoded word: pass on what was held back */
			mbfl_convert_filter_devcat(pd->conv1_filter, &pd->tmpdev);
			mbfl_memory_device_reset(&pd->tmpdev);
			if (c == 0x3d) {		/* = */
				/* another '=': hold it back and stay in this state */
				mbfl_memory_device_output(c, &pd->tmpdev);
			} else if (c == 0x0d || c == 0x0a) {	/* CR or LF */
				pd->status = 9;
			} else {
				(*pd->conv1_filter->filter_function)(c, pd->conv1_filter);
				pd->status = 0;
			}
		}
		break;
	case 2:		/* store charset string */
		if (c == 0x3f) {		/* ? */
			/* identify charset: terminate the name temporarily for the lookup */
			mbfl_memory_device_output('\0', &pd->tmpdev);
			encoding = mbfl_name2encoding((const char *)&pd->tmpdev.buffer[pd->cspos]);
			if (encoding != NULL) {
				pd->incode = encoding;
				pd->status = 3;
			}
			/* unknown name: the state is left unchanged and collecting continues */
			/* take the terminator back (presumably unput drops the last byte) and keep the '?' */
			mbfl_memory_device_unput(&pd->tmpdev);
			mbfl_memory_device_output(c, &pd->tmpdev);
		} else {
			mbfl_memory_device_output(c, &pd->tmpdev);
			if (pd->tmpdev.pos > 100) {		/* too long charset string */
				pd->status = 0;
			} else if (c == 0x0d || c == 0x0a) {	/* CR or LF */
				mbfl_memory_device_unput(&pd->tmpdev);
				pd->status = 9;
			}
			if (pd->status != 2) {
				/* gave up: pass on what was held back */
				mbfl_convert_filter_devcat(pd->conv1_filter, &pd->tmpdev);
				mbfl_memory_device_reset(&pd->tmpdev);
			}
		}
		break;
	case 3:		/* identify encoding */
		mbfl_memory_device_output(c, &pd->tmpdev);
		if (c == 0x42 || c == 0x62) {		/* 'B' or 'b' */
			pd->encoding = &mbfl_encoding_base64;
			pd->status = 4;
		} else if (c == 0x51 || c == 0x71) {	/* 'Q' or 'q' */
			pd->encoding = &mbfl_encoding_qprint;
			pd->status = 4;
		} else {
			/* unknown encoding letter: not an encoded word */
			if (c == 0x0d || c == 0x0a) {	/* CR or LF */
				mbfl_memory_device_unput(&pd->tmpdev);
				pd->status = 9;
			} else {
				pd->status = 0;
			}
			mbfl_convert_filter_devcat(pd->conv1_filter, &pd->tmpdev);
			mbfl_memory_device_reset(&pd->tmpdev);
		}
		break;
	case 4:		/* reset filter */
		mbfl_memory_device_output(c, &pd->tmpdev);
		if (c == 0x3f) {		/* ? */
			/* prefix complete: set both filters up for the payload */
			/* charset convert filter */
			mbfl_convert_filter_reset(pd->conv1_filter, pd->incode, &mbfl_encoding_wchar);
			/* decode filter */
			mbfl_convert_filter_reset(pd->deco_filter, pd->encoding, &mbfl_encoding_8bit);
			pd->status = 5;
		} else {
			if (c == 0x0d || c == 0x0a) {	/* CR or LF */
				mbfl_memory_device_unput(&pd->tmpdev);
				pd->status = 9;
			} else {
				pd->status = 0;
			}
			mbfl_convert_filter_devcat(pd->conv1_filter, &pd->tmpdev);
		}
		/* on success this discards the held-back prefix without outputting it */
		mbfl_memory_device_reset(&pd->tmpdev);
		break;
	case 5:		/* encoded block */
		if (c == 0x3f) {		/* ? */
			pd->status = 6;
		} else {
			(*pd->deco_filter->filter_function)(c, pd->deco_filter);
		}
		break;
	case 6:		/* check end position */
		if (c == 0x3d) {		/* = */
			/* flush and reset filter */
			(*pd->deco_filter->filter_flush)(pd->deco_filter);
			(*pd->conv1_filter->filter_flush)(pd->conv1_filter);
			mbfl_convert_filter_reset(pd->conv1_filter, &mbfl_encoding_ascii, &mbfl_encoding_wchar);
			pd->status = 7;
		} else {
			/* the '?' was payload after all: hand it to the decoder */
			(*pd->deco_filter->filter_function)(0x3f, pd->deco_filter);
			if (c != 0x3f) {		/* ? */
				(*pd->deco_filter->filter_function)(c, pd->deco_filter);
				pd->status = 5;
			}
			/* a second '?' keeps the state: it may still be followed by '=' */
		}
		break;
	case 7:		/* after encoded block */
		if (c == 0x0d || c == 0x0a) {	/* CR LF */
			pd->status = 8;
		} else {
			/* hold the character back: white space is dropped if another encoded word follows */
			mbfl_memory_device_output(c, &pd->tmpdev);
			if (c == 0x3d) {		/* = */
				pd->status = 1;
			} else if (c != 0x20 && c != 0x09) {		/* not space */
				mbfl_convert_filter_devcat(pd->conv1_filter, &pd->tmpdev);
				mbfl_memory_device_reset(&pd->tmpdev);
				pd->status = 0;
			}
		}
		break;
	case 8:		/* folding */
	case 9:		/* folding */
		/* skip line breaks and white space; act on the first other character */
		if (c != 0x0d && c != 0x0a && c != 0x20 && c != 0x09) {
			if (c == 0x3d) {		/* = */
				if (pd->status == 8) {
					/* after an encoded word: only hold the SPACE back */
					mbfl_memory_device_output(0x20, &pd->tmpdev);	/* SPACE */
				} else {
					/* after plain text: the fold becomes one SPACE in the output */
					(*pd->conv1_filter->filter_function)(0x20, pd->conv1_filter);
				}
				mbfl_memory_device_output(c, &pd->tmpdev);
				pd->status = 1;
			} else {
				/* plain text continues: the fold is replaced by one SPACE */
				mbfl_memory_device_output(0x20, &pd->tmpdev);
				mbfl_memory_device_output(c, &pd->tmpdev);
				mbfl_convert_filter_devcat(pd->conv1_filter, &pd->tmpdev);
				mbfl_memory_device_reset(&pd->tmpdev);
				pd->status = 0;
			}
		}
		break;
	default:		/* non encoded block */
		if (c == 0x0d || c == 0x0a) {	/* CR LF */
			pd->status = 9;
		} else if (c == 0x3d) {		/* = */
			/* possible start of an encoded word: hold it back */
			mbfl_memory_device_output(c, &pd->tmpdev);
			pd->status = 1;
		} else {
			(*pd->conv1_filter->filter_function)(c, pd->conv1_filter);
		}
		break;
	}

	return 0;
}
1688
1689 mbfl_string *
mime_header_decoder_result(struct mime_header_decoder_data * pd,mbfl_string * result)1690 mime_header_decoder_result(struct mime_header_decoder_data *pd, mbfl_string *result)
1691 {
1692 switch (pd->status) {
1693 case 1:
1694 case 2:
1695 case 3:
1696 case 4:
1697 case 7:
1698 case 8:
1699 case 9:
1700 mbfl_convert_filter_devcat(pd->conv1_filter, &pd->tmpdev);
1701 break;
1702 case 5:
1703 case 6:
1704 (*pd->deco_filter->filter_flush)(pd->deco_filter);
1705 (*pd->conv1_filter->filter_flush)(pd->conv1_filter);
1706 break;
1707 }
1708 (*pd->conv2_filter->filter_flush)(pd->conv2_filter);
1709 mbfl_memory_device_reset(&pd->tmpdev);
1710 pd->status = 0;
1711
1712 return mbfl_memory_device_result(&pd->outdev, result);
1713 }
1714
1715 struct mime_header_decoder_data*
mime_header_decoder_new(const mbfl_encoding * outcode)1716 mime_header_decoder_new(const mbfl_encoding *outcode)
1717 {
1718 struct mime_header_decoder_data *pd = emalloc(sizeof(struct mime_header_decoder_data));
1719
1720 mbfl_memory_device_init(&pd->outdev, 0, 0);
1721 mbfl_memory_device_init(&pd->tmpdev, 0, 0);
1722 pd->cspos = 0;
1723 pd->status = 0;
1724 pd->encoding = &mbfl_encoding_8bit;
1725 pd->incode = &mbfl_encoding_ascii;
1726 pd->outcode = outcode;
1727 /* charset convert filter */
1728 pd->conv2_filter = mbfl_convert_filter_new(&mbfl_encoding_wchar, pd->outcode, mbfl_memory_device_output, 0, &pd->outdev);
1729 pd->conv1_filter = mbfl_convert_filter_new(pd->incode, &mbfl_encoding_wchar, mbfl_filter_output_pipe, 0, pd->conv2_filter);
1730 /* decode filter */
1731 pd->deco_filter = mbfl_convert_filter_new(pd->encoding, &mbfl_encoding_8bit, mbfl_filter_output_pipe, 0, pd->conv1_filter);
1732
1733 if (pd->conv1_filter == NULL || pd->conv2_filter == NULL || pd->deco_filter == NULL) {
1734 mime_header_decoder_delete(pd);
1735 return NULL;
1736 }
1737
1738 return pd;
1739 }
1740
1741 void
mime_header_decoder_delete(struct mime_header_decoder_data * pd)1742 mime_header_decoder_delete(struct mime_header_decoder_data *pd)
1743 {
1744 if (pd) {
1745 mbfl_convert_filter_delete(pd->conv2_filter);
1746 mbfl_convert_filter_delete(pd->conv1_filter);
1747 mbfl_convert_filter_delete(pd->deco_filter);
1748 mbfl_memory_device_clear(&pd->outdev);
1749 mbfl_memory_device_clear(&pd->tmpdev);
1750 efree((void*)pd);
1751 }
1752 }
1753
1754 mbfl_string *
mbfl_mime_header_decode(mbfl_string * string,mbfl_string * result,const mbfl_encoding * outcode)1755 mbfl_mime_header_decode(
1756 mbfl_string *string,
1757 mbfl_string *result,
1758 const mbfl_encoding *outcode)
1759 {
1760 size_t n;
1761 unsigned char *p;
1762 struct mime_header_decoder_data *pd;
1763
1764 mbfl_string_init(result);
1765 result->encoding = outcode;
1766
1767 pd = mime_header_decoder_new(outcode);
1768 if (pd == NULL) {
1769 return NULL;
1770 }
1771
1772 /* feed data */
1773 n = string->len;
1774 p = string->val;
1775 while (n > 0) {
1776 mime_header_decoder_collector(*p++, pd);
1777 n--;
1778 }
1779
1780 result = mime_header_decoder_result(pd, result);
1781 mime_header_decoder_delete(pd);
1782
1783 return result;
1784 }
1785