1 /*
2 * charset=UTF-8
3 */
4
5 /*
6 * "streamable kanji code filter and converter"
7 *
8 * Copyright (c) 1998,1999,2000,2001 HappySize, Inc. All rights reserved.
9 *
10 * This software is released under the GNU Lesser General Public License.
11 * (Version 2.1, February 1999)
12 * Please read the following detail of the licence (in japanese).
13 *
14 * ◆使用許諾条件◆
15 *
16 * このソフトウェアは株式会社ハッピーサイズによって開発されました。株式会社ハッ
17 * ピーサイズは、著作権法および万国著作権条約の定めにより、このソフトウェアに関
18 * するすべての権利を留保する権利を持ち、ここに行使します。株式会社ハッピーサイ
19 * ズは以下に明記した条件に従って、このソフトウェアを使用する排他的ではない権利
20 * をお客様に許諾します。何人たりとも、以下の条件に反してこのソフトウェアを使用
21 * することはできません。
22 *
23 * このソフトウェアを「GNU Lesser General Public License (Version 2.1, February
24 * 1999)」に示された条件で使用することを、全ての方に許諾します。「GNU Lesser
25 * General Public License」を満たさない使用には、株式会社ハッピーサイズから書面
26 * による許諾を得る必要があります。
27 *
28 * 「GNU Lesser General Public License」の全文は以下のウェブページから取得でき
29 * ます。「GNU Lesser General Public License」とは、これまでLibrary General
30 * Public Licenseと呼ばれていたものです。
31 * http://www.gnu.org/ --- GNUウェブサイト
32 * http://www.gnu.org/copyleft/lesser.html --- ライセンス文面
33 * このライセンスの内容がわからない方、守れない方には使用を許諾しません。
34 *
35 * しかしながら、当社とGNUプロジェクトとの特定の関係を示唆または主張するもので
36 * はありません。
37 *
38 * ◆保証内容◆
39 *
40 * このソフトウェアは、期待された動作・機能・性能を持つことを目標として設計され
41 * 開発されていますが、これを保証するものではありません。このソフトウェアは「こ
42 * のまま」の状態で提供されており、たとえばこのソフトウェアの有用性ないし特定の
43 * 目的に合致することといった、何らかの保証内容が、明示されたり暗黙に示されてい
44 * る場合であっても、その保証は無効です。このソフトウェアを使用した結果ないし使
45 * 用しなかった結果によって、直接あるいは間接に受けた身体的な傷害、財産上の損害
46 * 、データの損失あるいはその他の全ての損害については、その損害の可能性が使用者
47 * 、当社あるいは第三者によって警告されていた場合であっても、当社はその損害の賠
48 * 償および補填を行いません。この規定は他の全ての、書面上または書面に無い保証・
49 * 契約・規定に優先します。
50 *
51 * ◆著作権者の連絡先および使用条件についての問い合わせ先◆
52 *
53 * 〒102-0073
54 * 東京都千代田区九段北1-13-5日本地所第一ビル4F
55 * 株式会社ハッピーサイズ
56 * Phone: 03-3512-3655, Fax: 03-3512-3656
57 * Email: sales@happysize.co.jp
58 * Web: http://happysize.com/
59 *
60 * ◆著者◆
61 *
62 * 金本 茂 <sgk@happysize.co.jp>
63 *
64 * ◆履歴◆
65 *
66 * 1998/11/10 sgk implementation in C++
67 * 1999/4/25 sgk Cで書きなおし。
68 * 1999/4/26 sgk 入力フィルタを実装。漢字コードを推定しながらフィルタを追加。
69 * 1999/6/?? Unicodeサポート。
70 * 1999/6/22 sgk ライセンスをLGPLに変更。
71 *
72 */
73
74 /*
75 * Unicode support
76 *
77 * Portions copyright (c) 1999,2000,2001 by the PHP3 internationalization team.
78 * All rights reserved.
79 *
80 */
81
82 #include <stddef.h>
83 #include <string.h>
84
85 #include "mbfilter.h"
86 #include "mbfl_filter_output.h"
87 #include "mbfilter_8bit.h"
88 #include "mbfilter_wchar.h"
89 #include "filters/mbfilter_ascii.h"
90 #include "filters/mbfilter_base64.h"
91 #include "filters/mbfilter_qprint.h"
92 #include "filters/mbfilter_tl_jisx0201_jisx0208.h"
93 #include "filters/mbfilter_utf8.h"
94
95 #include "eaw_table.h"
96
/*
 * Hexadecimal digit table: entry i (0..15) holds the ASCII code of the
 * i-th character of "0123456789ABCDEF".
 */
static char mbfl_hexchar_table[] = {
	0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x41,0x42,0x43,0x44,0x45,0x46
};
101
102
103
104 /*
105 * encoding filter
106 */
/* Evaluate `statement`; if its value is negative, return -1 from the enclosing function. */
#define CK(statement)	do { if ((statement) < 0) return (-1); } while (0)
108
109
110 /*
111 * buffering converter
112 */
113 mbfl_buffer_converter *
mbfl_buffer_converter_new(const mbfl_encoding * from,const mbfl_encoding * to,size_t buf_initsz)114 mbfl_buffer_converter_new(
115 const mbfl_encoding *from,
116 const mbfl_encoding *to,
117 size_t buf_initsz)
118 {
119 mbfl_buffer_converter *convd = emalloc(sizeof(mbfl_buffer_converter));
120 convd->to = to;
121
122 /* create convert filter */
123 convd->filter1 = NULL;
124 convd->filter2 = NULL;
125 if (mbfl_convert_filter_get_vtbl(from, to) != NULL) {
126 convd->filter1 = mbfl_convert_filter_new(from, to, mbfl_memory_device_output, NULL, &convd->device);
127 } else {
128 convd->filter2 = mbfl_convert_filter_new(&mbfl_encoding_wchar, to, mbfl_memory_device_output, NULL, &convd->device);
129 if (convd->filter2 != NULL) {
130 convd->filter1 = mbfl_convert_filter_new(from,
131 &mbfl_encoding_wchar,
132 (output_function_t)convd->filter2->filter_function,
133 (flush_function_t)convd->filter2->filter_flush,
134 convd->filter2);
135 if (convd->filter1 == NULL) {
136 mbfl_convert_filter_delete(convd->filter2);
137 }
138 }
139 }
140 if (convd->filter1 == NULL) {
141 efree(convd);
142 return NULL;
143 }
144
145 mbfl_memory_device_init(&convd->device, buf_initsz, buf_initsz/4);
146
147 return convd;
148 }
149
150
151 void
mbfl_buffer_converter_delete(mbfl_buffer_converter * convd)152 mbfl_buffer_converter_delete(mbfl_buffer_converter *convd)
153 {
154 if (convd != NULL) {
155 if (convd->filter1) {
156 mbfl_convert_filter_delete(convd->filter1);
157 }
158 if (convd->filter2) {
159 mbfl_convert_filter_delete(convd->filter2);
160 }
161 mbfl_memory_device_clear(&convd->device);
162 efree((void*)convd);
163 }
164 }
165
166 int
mbfl_buffer_converter_illegal_mode(mbfl_buffer_converter * convd,int mode)167 mbfl_buffer_converter_illegal_mode(mbfl_buffer_converter *convd, int mode)
168 {
169 if (convd != NULL) {
170 if (convd->filter2 != NULL) {
171 convd->filter2->illegal_mode = mode;
172 } else if (convd->filter1 != NULL) {
173 convd->filter1->illegal_mode = mode;
174 } else {
175 return 0;
176 }
177 }
178
179 return 1;
180 }
181
182 int
mbfl_buffer_converter_illegal_substchar(mbfl_buffer_converter * convd,int substchar)183 mbfl_buffer_converter_illegal_substchar(mbfl_buffer_converter *convd, int substchar)
184 {
185 if (convd != NULL) {
186 if (convd->filter2 != NULL) {
187 convd->filter2->illegal_substchar = substchar;
188 } else if (convd->filter1 != NULL) {
189 convd->filter1->illegal_substchar = substchar;
190 } else {
191 return 0;
192 }
193 }
194
195 return 1;
196 }
197
/*
 * Push every byte of `string` through the converter's first filter.
 *
 * The output device is grown up front to hold the current position plus
 * string->len bytes.  Feeding stops as soon as the filter function returns a
 * negative value.
 *
 * Returns the number of bytes consumed from string->val; the byte on which
 * the filter reported failure is included in that count.  Returns 0 when the
 * converter has no first filter.
 */
size_t mbfl_buffer_converter_feed(mbfl_buffer_converter *convd, mbfl_string *string)
{
	ZEND_ASSERT(convd);
	ZEND_ASSERT(string);

	mbfl_memory_device_realloc(&convd->device, convd->device.pos + string->len, string->len/4);

	unsigned char *cursor = string->val;
	mbfl_convert_filter *filter = convd->filter1;

	if (filter != NULL) {
		/* the function pointer is read once, before any byte is fed */
		int (*feed_byte)(int c, mbfl_convert_filter *filter) = filter->filter_function;
		size_t remaining;

		for (remaining = string->len; remaining > 0; remaining--) {
			if ((*feed_byte)(*cursor++, filter) < 0) {
				break;
			}
		}
	}

	return cursor - string->val;
}
225
226
227 int
mbfl_buffer_converter_flush(mbfl_buffer_converter * convd)228 mbfl_buffer_converter_flush(mbfl_buffer_converter *convd)
229 {
230 if (convd == NULL) {
231 return -1;
232 }
233
234 if (convd->filter1 != NULL) {
235 mbfl_convert_filter_flush(convd->filter1);
236 }
237 if (convd->filter2 != NULL) {
238 mbfl_convert_filter_flush(convd->filter2);
239 }
240
241 return 0;
242 }
243
244 mbfl_string *
mbfl_buffer_converter_result(mbfl_buffer_converter * convd,mbfl_string * result)245 mbfl_buffer_converter_result(mbfl_buffer_converter *convd, mbfl_string *result)
246 {
247 if (convd == NULL || result == NULL) {
248 return NULL;
249 }
250 result->encoding = convd->to;
251 return mbfl_memory_device_result(&convd->device, result);
252 }
253
254 mbfl_string *
mbfl_buffer_converter_feed_result(mbfl_buffer_converter * convd,mbfl_string * string,mbfl_string * result)255 mbfl_buffer_converter_feed_result(mbfl_buffer_converter *convd, mbfl_string *string,
256 mbfl_string *result)
257 {
258 if (convd == NULL || string == NULL || result == NULL) {
259 return NULL;
260 }
261 mbfl_buffer_converter_feed(convd, string);
262 if (convd->filter1 != NULL) {
263 mbfl_convert_filter_flush(convd->filter1);
264 }
265 if (convd->filter2 != NULL) {
266 mbfl_convert_filter_flush(convd->filter2);
267 }
268 result->encoding = convd->to;
269 return mbfl_memory_device_result(&convd->device, result);
270 }
271
/*
 * Return the sum of the num_illegalchar counters of the converter's filters
 * (0 when convd is NULL).
 */
size_t mbfl_buffer_illegalchars(mbfl_buffer_converter *convd)
{
	size_t total = 0;

	if (convd != NULL) {
		if (convd->filter1 != NULL) {
			total += convd->filter1->num_illegalchar;
		}
		if (convd->filter2 != NULL) {
			total += convd->filter2->num_illegalchar;
		}
	}

	return total;
}
290
291 /*
292 * encoding detector
293 */
294 mbfl_encoding_detector *
mbfl_encoding_detector_new(const mbfl_encoding ** elist,int elistsz,int strict)295 mbfl_encoding_detector_new(const mbfl_encoding **elist, int elistsz, int strict)
296 {
297 mbfl_encoding_detector *identd;
298
299 int i, num;
300 mbfl_identify_filter *filter;
301
302 if (elist == NULL || elistsz <= 0) {
303 return NULL;
304 }
305
306 /* allocate */
307 identd = emalloc(sizeof(mbfl_encoding_detector));
308 identd->filter_list = ecalloc(elistsz, sizeof(mbfl_identify_filter *));
309
310 /* create filters */
311 i = 0;
312 num = 0;
313 while (i < elistsz) {
314 filter = mbfl_identify_filter_new2(elist[i]);
315 if (filter != NULL) {
316 identd->filter_list[num] = filter;
317 num++;
318 }
319 i++;
320 }
321 identd->filter_list_size = num;
322
323 /* set strict flag */
324 identd->strict = strict;
325
326 return identd;
327 }
328
329
330 void
mbfl_encoding_detector_delete(mbfl_encoding_detector * identd)331 mbfl_encoding_detector_delete(mbfl_encoding_detector *identd)
332 {
333 int i;
334
335 if (identd != NULL) {
336 if (identd->filter_list != NULL) {
337 i = identd->filter_list_size;
338 while (i > 0) {
339 i--;
340 mbfl_identify_filter_delete(identd->filter_list[i]);
341 }
342 efree((void *)identd->filter_list);
343 }
344 efree((void *)identd);
345 }
346 }
347
348 int
mbfl_encoding_detector_feed(mbfl_encoding_detector * identd,mbfl_string * string)349 mbfl_encoding_detector_feed(mbfl_encoding_detector *identd, mbfl_string *string)
350 {
351 int res = 0;
352 /* feed data */
353 if (identd != NULL && string != NULL && string->val != NULL) {
354 int num = identd->filter_list_size;
355 size_t n = string->len;
356 unsigned char *p = string->val;
357 int bad = 0;
358 while (n > 0) {
359 int i;
360 for (i = 0; i < num; i++) {
361 mbfl_identify_filter *filter = identd->filter_list[i];
362 if (!filter->flag) {
363 (*filter->filter_function)(*p, filter);
364 if (filter->flag) {
365 bad++;
366 }
367 }
368 }
369 if ((num - 1) <= bad) {
370 res = 1;
371 break;
372 }
373 p++;
374 n--;
375 }
376 }
377
378 return res;
379 }
380
mbfl_encoding_detector_judge(mbfl_encoding_detector * identd)381 const mbfl_encoding *mbfl_encoding_detector_judge(mbfl_encoding_detector *identd)
382 {
383 mbfl_identify_filter *filter;
384 const mbfl_encoding *encoding = NULL;
385 int n;
386
387 /* judge */
388 if (identd != NULL) {
389 n = identd->filter_list_size - 1;
390 while (n >= 0) {
391 filter = identd->filter_list[n];
392 if (!filter->flag) {
393 if (!identd->strict || !filter->status) {
394 encoding = filter->encoding;
395 }
396 }
397 n--;
398 }
399
400 /* fallback judge */
401 if (!encoding) {
402 n = identd->filter_list_size - 1;
403 while (n >= 0) {
404 filter = identd->filter_list[n];
405 if (!filter->flag) {
406 encoding = filter->encoding;
407 }
408 n--;
409 }
410 }
411 }
412
413 return encoding;
414 }
415
416 /*
417 * encoding converter
418 */
419 mbfl_string *
mbfl_convert_encoding(mbfl_string * string,mbfl_string * result,const mbfl_encoding * toenc)420 mbfl_convert_encoding(
421 mbfl_string *string,
422 mbfl_string *result,
423 const mbfl_encoding *toenc)
424 {
425 size_t n;
426 unsigned char *p;
427 mbfl_memory_device device;
428 mbfl_convert_filter *filter1;
429 mbfl_convert_filter *filter2;
430
431 /* initialize */
432 if (toenc == NULL || string == NULL || result == NULL) {
433 return NULL;
434 }
435
436 filter1 = NULL;
437 filter2 = NULL;
438 if (mbfl_convert_filter_get_vtbl(string->encoding, toenc) != NULL) {
439 filter1 = mbfl_convert_filter_new(string->encoding, toenc, mbfl_memory_device_output, 0, &device);
440 } else {
441 filter2 = mbfl_convert_filter_new(&mbfl_encoding_wchar, toenc, mbfl_memory_device_output, 0, &device);
442 if (filter2 != NULL) {
443 filter1 = mbfl_convert_filter_new(string->encoding, &mbfl_encoding_wchar, (int (*)(int, void*))filter2->filter_function, NULL, filter2);
444 if (filter1 == NULL) {
445 mbfl_convert_filter_delete(filter2);
446 }
447 }
448 }
449 if (filter1 == NULL) {
450 return NULL;
451 }
452
453 if (filter2 != NULL) {
454 filter2->illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
455 filter2->illegal_substchar = 0x3f; /* '?' */
456 }
457
458 mbfl_memory_device_init(&device, string->len, (string->len >> 2) + 8);
459
460 /* feed data */
461 n = string->len;
462 p = string->val;
463 if (p != NULL) {
464 while (n > 0) {
465 if ((*filter1->filter_function)(*p++, filter1) < 0) {
466 break;
467 }
468 n--;
469 }
470 }
471
472 mbfl_convert_filter_flush(filter1);
473 mbfl_convert_filter_delete(filter1);
474 if (filter2 != NULL) {
475 mbfl_convert_filter_flush(filter2);
476 mbfl_convert_filter_delete(filter2);
477 }
478
479 return mbfl_memory_device_result(&device, result);
480 }
481
482
483 /*
484 * identify encoding
485 */
486 const mbfl_encoding *
mbfl_identify_encoding(mbfl_string * string,const mbfl_encoding ** elist,int elistsz,int strict)487 mbfl_identify_encoding(mbfl_string *string, const mbfl_encoding **elist, int elistsz, int strict)
488 {
489 int i, num, bad;
490 size_t n;
491 unsigned char *p;
492 mbfl_identify_filter *flist, *filter;
493 const mbfl_encoding *encoding;
494
495 /* flist is an array of mbfl_identify_filter instances */
496 flist = ecalloc(elistsz, sizeof(mbfl_identify_filter));
497
498 num = 0;
499 if (elist != NULL) {
500 for (i = 0; i < elistsz; i++) {
501 if (!mbfl_identify_filter_init2(&flist[num], elist[i])) {
502 num++;
503 }
504 }
505 }
506
507 /* feed data */
508 n = string->len;
509 p = string->val;
510
511 if (p != NULL) {
512 bad = 0;
513 while (n > 0) {
514 for (i = 0; i < num; i++) {
515 filter = &flist[i];
516 if (!filter->flag) {
517 (*filter->filter_function)(*p, filter);
518 if (filter->flag) {
519 bad++;
520 }
521 }
522 }
523 if ((num - 1) <= bad && !strict) {
524 break;
525 }
526 p++;
527 n--;
528 }
529 }
530
531 /* judge */
532 encoding = NULL;
533
534 for (i = 0; i < num; i++) {
535 filter = &flist[i];
536 if (!filter->flag) {
537 if (strict && filter->status) {
538 continue;
539 }
540 encoding = filter->encoding;
541 break;
542 }
543 }
544
545 /* fall-back judge */
546 if (!encoding) {
547 for (i = 0; i < num; i++) {
548 filter = &flist[i];
549 if (!filter->flag && (!strict || !filter->status)) {
550 encoding = filter->encoding;
551 break;
552 }
553 }
554 }
555
556 efree((void *)flist);
557
558 return encoding;
559 }
560
561 /*
562 * strlen
563 */
/*
 * Output callback that counts characters: `data` points at a size_t counter
 * which is incremented once per call.  Returns `c` unchanged.
 */
static int
filter_count_output(int c, void *data)
{
	size_t *counter = data;

	*counter += 1;
	return c;
}
570
571 size_t
mbfl_strlen(const mbfl_string * string)572 mbfl_strlen(const mbfl_string *string)
573 {
574 size_t len, n, k;
575 unsigned char *p;
576 const mbfl_encoding *encoding = string->encoding;
577
578 len = 0;
579 if (encoding->flag & MBFL_ENCTYPE_SBCS) {
580 len = string->len;
581 } else if (encoding->flag & (MBFL_ENCTYPE_WCS2BE | MBFL_ENCTYPE_WCS2LE)) {
582 len = string->len/2;
583 } else if (encoding->flag & (MBFL_ENCTYPE_WCS4BE | MBFL_ENCTYPE_WCS4LE)) {
584 len = string->len/4;
585 } else if (encoding->mblen_table != NULL) {
586 const unsigned char *mbtab = encoding->mblen_table;
587 n = 0;
588 p = string->val;
589 k = string->len;
590 /* count */
591 if (p != NULL) {
592 while (n < k) {
593 unsigned m = mbtab[*p];
594 n += m;
595 p += m;
596 len++;
597 }
598 }
599 } else {
600 /* wchar filter */
601 mbfl_convert_filter *filter = mbfl_convert_filter_new(
602 string->encoding,
603 &mbfl_encoding_wchar,
604 filter_count_output, 0, &len);
605 if (filter == NULL) {
606 return (size_t) -1;
607 }
608 /* count */
609 n = string->len;
610 p = string->val;
611 if (p != NULL) {
612 while (n > 0) {
613 (*filter->filter_function)(*p++, filter);
614 n--;
615 }
616 }
617 mbfl_convert_filter_delete(filter);
618 }
619
620 return len;
621 }
622
623
624 /*
625 * strpos
626 */
/* State shared between a needle-searching caller and the collector_strpos() callback. */
struct collector_strpos_data {
	mbfl_convert_filter *next_filter;	/* not referenced by collector_strpos() */
	mbfl_wchar_device needle;		/* needle converted to wchar */
	size_t needle_len;			/* number of characters in the needle */
	size_t start;				/* characters before this index are not matched against */
	size_t output;				/* number of haystack characters seen so far */
	size_t found_pos;			/* haystack index where the current candidate match starts */
	size_t needle_pos;			/* number of needle characters matched so far */
	size_t matched_pos;			/* start index of the most recent complete match */
};
637
/*
 * Output callback that searches a stream of characters for the needle held
 * in a struct collector_strpos_data (passed as `data`).
 *
 * Called once per haystack character `c`.  Characters with an index below
 * pc->start are only counted.  When a full needle has been matched,
 * pc->matched_pos is set to the index where that match began.  Always
 * returns `c`.
 */
static int
collector_strpos(int c, void* data)
{
	int *p, *h, *m;
	ssize_t n;
	struct collector_strpos_data *pc = (struct collector_strpos_data*)data;

	if (pc->output >= pc->start) {
		if (c == (int)pc->needle.buffer[pc->needle_pos]) {
			if (pc->needle_pos == 0) {
				pc->found_pos = pc->output;		/* a candidate match starts here */
			}
			pc->needle_pos++;						/* one more needle character matched */
			if (pc->needle_pos >= pc->needle_len) {
				pc->matched_pos = pc->found_pos;	/* whole needle matched: record where it began */
				/* step back one and realign, so a match overlapping this one can still be found */
				pc->needle_pos--;
				goto retry;
			}
		} else if (pc->needle_pos != 0) {
			/*
			 * Mismatch after a partial match (or continuation after a full
			 * match): slide the candidate start forward one character at a
			 * time, looking for a shift at which the already-seen characters
			 * still agree with the beginning of the needle.
			 */
retry:
			h = (int *)pc->needle.buffer;
			h++;
			for (;;) {
				pc->found_pos++;
				p = h;
				m = (int *)pc->needle.buffer;
				n = pc->needle_pos - 1;
				/* compare the needle shifted by (h - buffer) against its own start */
				while (n > 0 && *p == *m) {
					n--;
					p++;
					m++;
				}
				if (n <= 0) {
					/* overlap agrees; the current character must match the next needle character too */
					if (*m != c) {
						pc->needle_pos = 0;
					}
					break;
				} else {
					/* this shift does not work, try the next one with a shorter overlap */
					h++;
					pc->needle_pos--;
				}
			}
		}
	}

	pc->output++;
	return c;
}
686
mbfl_find_offset_utf8(const unsigned char * str,const unsigned char * end,ssize_t offset)687 static const unsigned char *mbfl_find_offset_utf8(
688 const unsigned char *str, const unsigned char *end, ssize_t offset) {
689 if (offset < 0) {
690 const unsigned char *pos = end;
691 while (offset < 0) {
692 if (pos <= str) {
693 return NULL;
694 }
695
696 unsigned char c = *(--pos);
697 if (c < 0x80) {
698 ++offset;
699 } else if ((c & 0xc0) != 0x80) {
700 ++offset;
701 }
702 }
703 return pos;
704 } else {
705 const unsigned char *u8_tbl = mbfl_encoding_utf8.mblen_table;
706 const unsigned char *pos = str;
707 while (offset-- > 0) {
708 if (pos >= end) {
709 return NULL;
710 }
711 pos += u8_tbl[*pos];
712 }
713 return pos;
714 }
715 }
716
/*
 * Convert a byte pointer inside a UTF-8 buffer into a character offset:
 * counts the bytes in [start, pos) that are not continuation bytes
 * (10xxxxxx).  Returns 0 when pos is not beyond start.
 */
static size_t mbfl_pointer_to_offset_utf8(const unsigned char *start, const unsigned char *pos) {
	size_t chars = 0;
	const unsigned char *cursor;

	for (cursor = start; cursor < pos; cursor++) {
		if ((*cursor & 0xc0) != 0x80) {
			chars++;
		}
	}

	return chars;
}
729
730 size_t
mbfl_strpos(mbfl_string * haystack,mbfl_string * needle,ssize_t offset,int reverse)731 mbfl_strpos(
732 mbfl_string *haystack,
733 mbfl_string *needle,
734 ssize_t offset,
735 int reverse)
736 {
737 size_t result;
738 mbfl_string _haystack_u8, _needle_u8;
739 const mbfl_string *haystack_u8, *needle_u8 = NULL;
740 const unsigned char *offset_pointer;
741
742 if (haystack->encoding->no_encoding != mbfl_no_encoding_utf8) {
743 mbfl_string_init(&_haystack_u8);
744 haystack_u8 = mbfl_convert_encoding(haystack, &_haystack_u8, &mbfl_encoding_utf8);
745 if (haystack_u8 == NULL) {
746 result = MBFL_ERROR_ENCODING;
747 goto out;
748 }
749 } else {
750 haystack_u8 = haystack;
751 }
752
753 if (needle->encoding->no_encoding != mbfl_no_encoding_utf8) {
754 mbfl_string_init(&_needle_u8);
755 needle_u8 = mbfl_convert_encoding(needle, &_needle_u8, &mbfl_encoding_utf8);
756 if (needle_u8 == NULL) {
757 result = MBFL_ERROR_ENCODING;
758 goto out;
759 }
760 } else {
761 needle_u8 = needle;
762 }
763
764 offset_pointer = mbfl_find_offset_utf8(
765 haystack_u8->val, haystack_u8->val + haystack_u8->len, offset);
766 if (!offset_pointer) {
767 result = MBFL_ERROR_OFFSET;
768 goto out;
769 }
770
771 result = MBFL_ERROR_NOT_FOUND;
772 if (haystack_u8->len < needle_u8->len) {
773 goto out;
774 }
775
776 const char *found_pos;
777 if (!reverse) {
778 found_pos = zend_memnstr(
779 (const char *) offset_pointer,
780 (const char *) needle_u8->val, needle_u8->len,
781 (const char *) haystack_u8->val + haystack_u8->len);
782 } else {
783 if (offset >= 0) {
784 found_pos = zend_memnrstr(
785 (const char *) offset_pointer,
786 (const char *) needle_u8->val, needle_u8->len,
787 (const char *) haystack_u8->val + haystack_u8->len);
788 } else {
789 size_t needle_len = mbfl_strlen(needle_u8);
790 offset_pointer = mbfl_find_offset_utf8(
791 offset_pointer, haystack_u8->val + haystack_u8->len, needle_len);
792 if (!offset_pointer) {
793 offset_pointer = haystack_u8->val + haystack_u8->len;
794 }
795
796 found_pos = zend_memnrstr(
797 (const char *) haystack_u8->val,
798 (const char *) needle_u8->val, needle_u8->len,
799 (const char *) offset_pointer);
800 }
801 }
802
803 if (found_pos) {
804 result = mbfl_pointer_to_offset_utf8(haystack_u8->val, (const unsigned char *) found_pos);
805 }
806
807 out:
808 if (haystack_u8 == &_haystack_u8) {
809 mbfl_string_clear(&_haystack_u8);
810 }
811 if (needle_u8 == &_needle_u8) {
812 mbfl_string_clear(&_needle_u8);
813 }
814 return result;
815 }
816
817 /*
818 * substr_count
819 */
820
821 size_t
mbfl_substr_count(mbfl_string * haystack,mbfl_string * needle)822 mbfl_substr_count(
823 mbfl_string *haystack,
824 mbfl_string *needle
825 )
826 {
827 size_t n, result = 0;
828 unsigned char *p;
829 mbfl_convert_filter *filter;
830 struct collector_strpos_data pc;
831
832 /* needle is converted into wchar */
833 mbfl_wchar_device_init(&pc.needle);
834 filter = mbfl_convert_filter_new(
835 needle->encoding,
836 &mbfl_encoding_wchar,
837 mbfl_wchar_device_output, 0, &pc.needle);
838 if (filter == NULL) {
839 return MBFL_ERROR_ENCODING;
840 }
841 mbfl_convert_filter_feed_string(filter, needle->val, needle->len);
842 mbfl_convert_filter_flush(filter);
843 mbfl_convert_filter_delete(filter);
844 pc.needle_len = pc.needle.pos;
845 if (pc.needle.buffer == NULL) {
846 return MBFL_ERROR_ENCODING;
847 }
848 if (pc.needle_len == 0) {
849 mbfl_wchar_device_clear(&pc.needle);
850 return MBFL_ERROR_EMPTY;
851 }
852 /* initialize filter and collector data */
853 filter = mbfl_convert_filter_new(
854 haystack->encoding,
855 &mbfl_encoding_wchar,
856 collector_strpos, 0, &pc);
857 if (filter == NULL) {
858 mbfl_wchar_device_clear(&pc.needle);
859 return MBFL_ERROR_ENCODING;
860 }
861 pc.start = 0;
862 pc.output = 0;
863 pc.needle_pos = 0;
864 pc.found_pos = 0;
865 pc.matched_pos = MBFL_ERROR_NOT_FOUND;
866
867 /* feed data */
868 p = haystack->val;
869 n = haystack->len;
870 if (p != NULL) {
871 while (n > 0) {
872 if ((*filter->filter_function)(*p++, filter) < 0) {
873 pc.matched_pos = MBFL_ERROR_ENCODING;
874 break;
875 }
876 if (pc.matched_pos != MBFL_ERROR_NOT_FOUND) {
877 ++result;
878 pc.matched_pos = MBFL_ERROR_NOT_FOUND;
879 pc.needle_pos = 0;
880 }
881 n--;
882 }
883 }
884 mbfl_convert_filter_flush(filter);
885 mbfl_convert_filter_delete(filter);
886 mbfl_wchar_device_clear(&pc.needle);
887
888 return result;
889 }
890
891 /*
892 * substr
893 */
/* State for the collector_substr() callback, which forwards a window of characters. */
struct collector_substr_data {
	mbfl_convert_filter *next_filter;	/* filter that receives the characters inside the window */
	size_t start;				/* index of the first character to forward */
	size_t stop;				/* index at which forwarding ends (exclusive) */
	size_t output;				/* number of characters seen so far */
};
900
901 static int
collector_substr(int c,void * data)902 collector_substr(int c, void* data)
903 {
904 struct collector_substr_data *pc = (struct collector_substr_data*)data;
905
906 if (pc->output >= pc->stop) {
907 return -1;
908 }
909
910 if (pc->output >= pc->start) {
911 (*pc->next_filter->filter_function)(c, pc->next_filter);
912 }
913
914 pc->output++;
915
916 return c;
917 }
918
919 mbfl_string *
mbfl_substr(mbfl_string * string,mbfl_string * result,size_t from,size_t length)920 mbfl_substr(
921 mbfl_string *string,
922 mbfl_string *result,
923 size_t from,
924 size_t length)
925 {
926 const mbfl_encoding *encoding = string->encoding;
927 size_t n, k, len, start, end;
928 unsigned m;
929 unsigned char *p, *w;
930
931 mbfl_string_init(result);
932 result->encoding = string->encoding;
933
934 if ((encoding->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2BE | MBFL_ENCTYPE_WCS2LE | MBFL_ENCTYPE_WCS4BE | MBFL_ENCTYPE_WCS4LE)) ||
935 encoding->mblen_table != NULL) {
936 len = string->len;
937 if (encoding->flag & MBFL_ENCTYPE_SBCS) {
938 start = from;
939 } else if (encoding->flag & (MBFL_ENCTYPE_WCS2BE | MBFL_ENCTYPE_WCS2LE)) {
940 start = from*2;
941 } else if (encoding->flag & (MBFL_ENCTYPE_WCS4BE | MBFL_ENCTYPE_WCS4LE)) {
942 start = from*4;
943 } else {
944 const unsigned char *mbtab = encoding->mblen_table;
945 start = 0;
946 n = 0;
947 k = 0;
948 p = string->val;
949 /* search start position */
950 while (k <= from) {
951 start = n;
952 if (n >= len) {
953 break;
954 }
955 m = mbtab[*p];
956 n += m;
957 p += m;
958 k++;
959 }
960 }
961
962 if (length == MBFL_SUBSTR_UNTIL_END) {
963 end = len;
964 } else if (encoding->flag & MBFL_ENCTYPE_SBCS) {
965 end = start + length;
966 } else if (encoding->flag & (MBFL_ENCTYPE_WCS2BE | MBFL_ENCTYPE_WCS2LE)) {
967 end = start + length*2;
968 } else if (encoding->flag & (MBFL_ENCTYPE_WCS4BE | MBFL_ENCTYPE_WCS4LE)) {
969 end = start + length*4;
970 } else {
971 const unsigned char *mbtab = encoding->mblen_table;
972 end = start;
973 n = start;
974 k = 0;
975 p = string->val + start;
976 /* detect end position */
977 while (k <= length) {
978 end = n;
979 if (n >= len) {
980 break;
981 }
982 m = mbtab[*p];
983 n += m;
984 p += m;
985 k++;
986 }
987 }
988
989 if (start > len) {
990 start = len;
991 }
992 if (end > len) {
993 end = len;
994 }
995 if (start > end) {
996 start = end;
997 }
998
999 /* allocate memory and copy */
1000 n = end - start;
1001 result->len = 0;
1002 result->val = w = (unsigned char*)emalloc(n + 1);
1003 result->len = n;
1004 memcpy(w, string->val + start, n);
1005 w[n] = '\0';
1006 } else {
1007 mbfl_memory_device device;
1008 struct collector_substr_data pc;
1009 mbfl_convert_filter *decoder;
1010 mbfl_convert_filter *encoder;
1011
1012 if (length == MBFL_SUBSTR_UNTIL_END) {
1013 length = mbfl_strlen(string) - from;
1014 }
1015
1016 mbfl_memory_device_init(&device, length + 1, 0);
1017 mbfl_string_init(result);
1018 result->encoding = string->encoding;
1019 /* output code filter */
1020 decoder = mbfl_convert_filter_new(
1021 &mbfl_encoding_wchar,
1022 string->encoding,
1023 mbfl_memory_device_output, 0, &device);
1024 /* wchar filter */
1025 encoder = mbfl_convert_filter_new(
1026 string->encoding,
1027 &mbfl_encoding_wchar,
1028 collector_substr, 0, &pc);
1029 if (decoder == NULL || encoder == NULL) {
1030 mbfl_convert_filter_delete(encoder);
1031 mbfl_convert_filter_delete(decoder);
1032 return NULL;
1033 }
1034 pc.next_filter = decoder;
1035 pc.start = from;
1036 pc.stop = from + length;
1037 pc.output = 0;
1038
1039 /* feed data */
1040 p = string->val;
1041 n = string->len;
1042 if (p != NULL) {
1043 while (n > 0) {
1044 if ((*encoder->filter_function)(*p++, encoder) < 0) {
1045 break;
1046 }
1047 n--;
1048 }
1049 }
1050
1051 mbfl_convert_filter_flush(encoder);
1052 mbfl_convert_filter_flush(decoder);
1053 result = mbfl_memory_device_result(&device, result);
1054 mbfl_convert_filter_delete(encoder);
1055 mbfl_convert_filter_delete(decoder);
1056 }
1057
1058 return result;
1059 }
1060
1061 /*
1062 * strcut
1063 */
1064 mbfl_string *
mbfl_strcut(mbfl_string * string,mbfl_string * result,size_t from,size_t length)1065 mbfl_strcut(
1066 mbfl_string *string,
1067 mbfl_string *result,
1068 size_t from,
1069 size_t length)
1070 {
1071 const mbfl_encoding *encoding = string->encoding;
1072 mbfl_memory_device device;
1073
1074 if (from >= string->len) {
1075 from = string->len;
1076 }
1077
1078 mbfl_string_init(result);
1079 result->encoding = string->encoding;
1080
1081 if ((encoding->flag & (MBFL_ENCTYPE_SBCS
1082 | MBFL_ENCTYPE_WCS2BE
1083 | MBFL_ENCTYPE_WCS2LE
1084 | MBFL_ENCTYPE_WCS4BE
1085 | MBFL_ENCTYPE_WCS4LE))
1086 || encoding->mblen_table != NULL) {
1087 const unsigned char *start = NULL;
1088 const unsigned char *end = NULL;
1089 unsigned char *w;
1090 size_t sz;
1091
1092 if (encoding->flag & (MBFL_ENCTYPE_WCS2BE | MBFL_ENCTYPE_WCS2LE)) {
1093 from &= -2;
1094
1095 if (length >= string->len - from) {
1096 length = string->len - from;
1097 }
1098
1099 start = string->val + from;
1100 end = start + (length & -2);
1101 } else if (encoding->flag & (MBFL_ENCTYPE_WCS4BE | MBFL_ENCTYPE_WCS4LE)) {
1102 from &= -4;
1103
1104 if (length >= string->len - from) {
1105 length = string->len - from;
1106 }
1107
1108 start = string->val + from;
1109 end = start + (length & -4);
1110 } else if ((encoding->flag & MBFL_ENCTYPE_SBCS)) {
1111 if (length >= string->len - from) {
1112 length = string->len - from;
1113 }
1114
1115 start = string->val + from;
1116 end = start + length;
1117 } else if (encoding->mblen_table != NULL) {
1118 const unsigned char *mbtab = encoding->mblen_table;
1119 const unsigned char *p, *q;
1120 int m;
1121
1122 /* search start position */
1123 for (m = 0, p = string->val, q = p + from;
1124 p < q; p += (m = mbtab[*p]));
1125
1126 if (p > q) {
1127 p -= m;
1128 }
1129
1130 start = p;
1131
1132 /* search end position */
1133 if (length >= string->len - (start - string->val)) {
1134 end = string->val + string->len;
1135 } else {
1136 for (q = p + length; p < q; p += (m = mbtab[*p]));
1137
1138 if (p > q) {
1139 p -= m;
1140 }
1141 end = p;
1142 }
1143 } else {
1144 /* never reached */
1145 return NULL;
1146 }
1147
1148 /* allocate memory and copy string */
1149 sz = end - start;
1150 w = ecalloc(sz + 8, sizeof(unsigned char));
1151
1152 memcpy(w, start, sz);
1153 w[sz] = '\0';
1154 w[sz + 1] = '\0';
1155 w[sz + 2] = '\0';
1156 w[sz + 3] = '\0';
1157
1158 result->val = w;
1159 result->len = sz;
1160 } else {
1161 mbfl_convert_filter *encoder = NULL;
1162 mbfl_convert_filter *decoder = NULL;
1163 const unsigned char *p, *q, *r;
1164 struct {
1165 mbfl_convert_filter encoder;
1166 mbfl_convert_filter decoder;
1167 const unsigned char *p;
1168 size_t pos;
1169 } bk, _bk;
1170
1171 /* output code filter */
1172 if (!(decoder = mbfl_convert_filter_new(
1173 &mbfl_encoding_wchar,
1174 string->encoding,
1175 mbfl_memory_device_output, 0, &device))) {
1176 return NULL;
1177 }
1178
1179 /* wchar filter */
1180 if (!(encoder = mbfl_convert_filter_new(
1181 string->encoding,
1182 &mbfl_encoding_wchar,
1183 mbfl_filter_output_null,
1184 NULL, NULL))) {
1185 mbfl_convert_filter_delete(decoder);
1186 return NULL;
1187 }
1188
1189 mbfl_memory_device_init(&device, length + 8, 0);
1190
1191 p = string->val;
1192
1193 /* search start position */
1194 for (q = string->val + from; p < q; p++) {
1195 (*encoder->filter_function)(*p, encoder);
1196 }
1197
1198 /* switch the drain direction */
1199 encoder->output_function = (output_function_t)decoder->filter_function;
1200 encoder->flush_function = (flush_function_t)decoder->filter_flush;
1201 encoder->data = decoder;
1202
1203 q = string->val + string->len;
1204
1205 /* save the encoder, decoder state and the pointer */
1206 mbfl_convert_filter_copy(decoder, &_bk.decoder);
1207 mbfl_convert_filter_copy(encoder, &_bk.encoder);
1208 _bk.p = p;
1209 _bk.pos = device.pos;
1210
1211 if (length > q - p) {
1212 length = q - p;
1213 }
1214
1215 if (length >= 20) {
1216 /* output a little shorter than "length" */
1217 /* XXX: the constant "20" was determined purely on the heuristics. */
1218 for (r = p + length - 20; p < r; p++) {
1219 (*encoder->filter_function)(*p, encoder);
1220 }
1221
1222 /* if the offset of the resulting string exceeds the length,
1223 * then restore the state */
1224 if (device.pos > length) {
1225 p = _bk.p;
1226 device.pos = _bk.pos;
1227 if (decoder->filter_dtor)
1228 decoder->filter_dtor(decoder);
1229 if (encoder->filter_dtor)
1230 encoder->filter_dtor(encoder);
1231 mbfl_convert_filter_copy(&_bk.decoder, decoder);
1232 mbfl_convert_filter_copy(&_bk.encoder, encoder);
1233 bk = _bk;
1234 } else {
1235 /* save the encoder, decoder state and the pointer */
1236 mbfl_convert_filter_copy(decoder, &bk.decoder);
1237 mbfl_convert_filter_copy(encoder, &bk.encoder);
1238 bk.p = p;
1239 bk.pos = device.pos;
1240
1241 /* flush the stream */
1242 (*encoder->filter_flush)(encoder);
1243
1244 /* if the offset of the resulting string exceeds the length,
1245 * then restore the state */
1246 if (device.pos > length) {
1247 if (bk.decoder.filter_dtor)
1248 bk.decoder.filter_dtor(&bk.decoder);
1249 if (bk.encoder.filter_dtor)
1250 bk.encoder.filter_dtor(&bk.encoder);
1251
1252 p = _bk.p;
1253 device.pos = _bk.pos;
1254 if (decoder->filter_dtor)
1255 decoder->filter_dtor(decoder);
1256 if (encoder->filter_dtor)
1257 encoder->filter_dtor(encoder);
1258 mbfl_convert_filter_copy(&_bk.decoder, decoder);
1259 mbfl_convert_filter_copy(&_bk.encoder, encoder);
1260 bk = _bk;
1261 } else {
1262 if (_bk.decoder.filter_dtor)
1263 _bk.decoder.filter_dtor(&_bk.decoder);
1264 if (_bk.encoder.filter_dtor)
1265 _bk.encoder.filter_dtor(&_bk.encoder);
1266
1267 p = bk.p;
1268 device.pos = bk.pos;
1269 if (decoder->filter_dtor)
1270 decoder->filter_dtor(decoder);
1271 if (encoder->filter_dtor)
1272 encoder->filter_dtor(encoder);
1273 mbfl_convert_filter_copy(&bk.decoder, decoder);
1274 mbfl_convert_filter_copy(&bk.encoder, encoder);
1275 }
1276 }
1277 } else {
1278 bk = _bk;
1279 }
1280
1281 /* detect end position */
1282 while (p < q) {
1283 (*encoder->filter_function)(*p, encoder);
1284
1285 if (device.pos > length) {
1286 /* restore filter */
1287 p = bk.p;
1288 device.pos = bk.pos;
1289 if (decoder->filter_dtor)
1290 decoder->filter_dtor(decoder);
1291 if (encoder->filter_dtor)
1292 encoder->filter_dtor(encoder);
1293 mbfl_convert_filter_copy(&bk.decoder, decoder);
1294 mbfl_convert_filter_copy(&bk.encoder, encoder);
1295 break;
1296 }
1297
1298 p++;
1299
1300 /* backup current state */
1301 mbfl_convert_filter_copy(decoder, &_bk.decoder);
1302 mbfl_convert_filter_copy(encoder, &_bk.encoder);
1303 _bk.pos = device.pos;
1304 _bk.p = p;
1305
1306 (*encoder->filter_flush)(encoder);
1307
1308 if (device.pos > length) {
1309 if (_bk.decoder.filter_dtor)
1310 _bk.decoder.filter_dtor(&_bk.decoder);
1311 if (_bk.encoder.filter_dtor)
1312 _bk.encoder.filter_dtor(&_bk.encoder);
1313
1314 /* restore filter */
1315 p = bk.p;
1316 device.pos = bk.pos;
1317 if (decoder->filter_dtor)
1318 decoder->filter_dtor(decoder);
1319 if (encoder->filter_dtor)
1320 encoder->filter_dtor(encoder);
1321 mbfl_convert_filter_copy(&bk.decoder, decoder);
1322 mbfl_convert_filter_copy(&bk.encoder, encoder);
1323 break;
1324 }
1325
1326 if (bk.decoder.filter_dtor)
1327 bk.decoder.filter_dtor(&bk.decoder);
1328 if (bk.encoder.filter_dtor)
1329 bk.encoder.filter_dtor(&bk.encoder);
1330
1331 p = _bk.p;
1332 device.pos = _bk.pos;
1333 if (decoder->filter_dtor)
1334 decoder->filter_dtor(decoder);
1335 if (encoder->filter_dtor)
1336 encoder->filter_dtor(encoder);
1337 mbfl_convert_filter_copy(&_bk.decoder, decoder);
1338 mbfl_convert_filter_copy(&_bk.encoder, encoder);
1339
1340 bk = _bk;
1341 }
1342
1343 (*encoder->filter_flush)(encoder);
1344
1345 if (bk.decoder.filter_dtor)
1346 bk.decoder.filter_dtor(&bk.decoder);
1347 if (bk.encoder.filter_dtor)
1348 bk.encoder.filter_dtor(&bk.encoder);
1349
1350 result = mbfl_memory_device_result(&device, result);
1351
1352 mbfl_convert_filter_delete(encoder);
1353 mbfl_convert_filter_delete(decoder);
1354 }
1355
1356 return result;
1357 }
1358
1359
1360 /*
1361 * strwidth
1362 */
is_fullwidth(int c)1363 static size_t is_fullwidth(int c)
1364 {
1365 int i;
1366
1367 if (c < mbfl_eaw_table[0].begin) {
1368 return 0;
1369 }
1370
1371 for (i = 0; i < sizeof(mbfl_eaw_table) / sizeof(mbfl_eaw_table[0]); i++) {
1372 if (mbfl_eaw_table[i].begin <= c && c <= mbfl_eaw_table[i].end) {
1373 return 1;
1374 }
1375 }
1376
1377 return 0;
1378 }
1379
/*
 * Output callback for the wchar filter used by mbfl_strwidth().
 *
 * data points at a size_t accumulator; it is increased by 2 when
 * is_fullwidth() reports c as full-width and by 1 otherwise.
 * Returns c unchanged.
 */
static int
filter_count_width(int c, void *data)
{
	size_t *total = (size_t *)data;

	if (is_fullwidth(c)) {
		*total += 2;
	} else {
		*total += 1;
	}

	return c;
}
1386
1387 size_t
mbfl_strwidth(mbfl_string * string)1388 mbfl_strwidth(mbfl_string *string)
1389 {
1390 size_t len, n;
1391 unsigned char *p;
1392 mbfl_convert_filter *filter;
1393
1394 len = 0;
1395 if (string->len > 0 && string->val != NULL) {
1396 /* wchar filter */
1397 filter = mbfl_convert_filter_new(
1398 string->encoding,
1399 &mbfl_encoding_wchar,
1400 filter_count_width, 0, &len);
1401 if (filter == NULL) {
1402 mbfl_convert_filter_delete(filter);
1403 return -1;
1404 }
1405
1406 /* feed data */
1407 p = string->val;
1408 n = string->len;
1409 while (n > 0) {
1410 (*filter->filter_function)(*p++, filter);
1411 n--;
1412 }
1413
1414 mbfl_convert_filter_flush(filter);
1415 mbfl_convert_filter_delete(filter);
1416 }
1417
1418 return len;
1419 }
1420
1421
1422 /*
1423 * strimwidth
1424 */
/* Working state shared between mbfl_strimwidth() and collector_strimwidth(). */
struct collector_strimwidth_data {
	mbfl_convert_filter *decoder;        /* wchar -> string encoding, writes into device */
	mbfl_convert_filter *decoder_backup; /* copy of decoder taken when the width is first exceeded */
	mbfl_memory_device device;           /* output buffer */
	size_t from;                         /* number of leading characters to skip */
	size_t width;                        /* width limit currently in effect */
	size_t outwidth;                     /* width accumulated so far (after skipping) */
	size_t outchar;                      /* number of characters seen so far */
	size_t endpos;                       /* device.pos recorded when the width is first exceeded */
	int status;                          /* 0: within limit, >0: times the limit was exceeded, 10: pass-through */
};
1436
1437 static int
collector_strimwidth(int c,void * data)1438 collector_strimwidth(int c, void* data)
1439 {
1440 struct collector_strimwidth_data *pc = (struct collector_strimwidth_data*)data;
1441
1442 switch (pc->status) {
1443 case 10:
1444 (*pc->decoder->filter_function)(c, pc->decoder);
1445 break;
1446 default:
1447 if (pc->outchar >= pc->from) {
1448 pc->outwidth += (is_fullwidth(c) ? 2: 1);
1449
1450 if (pc->outwidth > pc->width) {
1451 if (pc->status == 0) {
1452 pc->endpos = pc->device.pos;
1453 mbfl_convert_filter_copy(pc->decoder, pc->decoder_backup);
1454 }
1455 pc->status++;
1456 (*pc->decoder->filter_function)(c, pc->decoder);
1457 c = -1;
1458 } else {
1459 (*pc->decoder->filter_function)(c, pc->decoder);
1460 }
1461 }
1462 pc->outchar++;
1463 break;
1464 }
1465
1466 return c;
1467 }
1468
1469 mbfl_string *
mbfl_strimwidth(mbfl_string * string,mbfl_string * marker,mbfl_string * result,size_t from,size_t width)1470 mbfl_strimwidth(
1471 mbfl_string *string,
1472 mbfl_string *marker,
1473 mbfl_string *result,
1474 size_t from,
1475 size_t width)
1476 {
1477 struct collector_strimwidth_data pc;
1478 mbfl_convert_filter *encoder;
1479 size_t n, mkwidth;
1480 unsigned char *p;
1481
1482 if (string == NULL || result == NULL) {
1483 return NULL;
1484 }
1485 mbfl_string_init(result);
1486 result->encoding = string->encoding;
1487 mbfl_memory_device_init(&pc.device, MIN(string->len, width), 0);
1488
1489 /* output code filter */
1490 pc.decoder = mbfl_convert_filter_new(
1491 &mbfl_encoding_wchar,
1492 string->encoding,
1493 mbfl_memory_device_output, 0, &pc.device);
1494 pc.decoder_backup = mbfl_convert_filter_new(
1495 &mbfl_encoding_wchar,
1496 string->encoding,
1497 mbfl_memory_device_output, 0, &pc.device);
1498 /* wchar filter */
1499 encoder = mbfl_convert_filter_new(
1500 string->encoding,
1501 &mbfl_encoding_wchar,
1502 collector_strimwidth, 0, &pc);
1503 if (pc.decoder == NULL || pc.decoder_backup == NULL || encoder == NULL) {
1504 mbfl_convert_filter_delete(encoder);
1505 mbfl_convert_filter_delete(pc.decoder);
1506 mbfl_convert_filter_delete(pc.decoder_backup);
1507 return NULL;
1508 }
1509 mkwidth = 0;
1510 if (marker) {
1511 mkwidth = mbfl_strwidth(marker);
1512 }
1513 pc.from = from;
1514 pc.width = width - mkwidth;
1515 pc.outwidth = 0;
1516 pc.outchar = 0;
1517 pc.status = 0;
1518 pc.endpos = 0;
1519
1520 /* feed data */
1521 p = string->val;
1522 n = string->len;
1523 if (p != NULL) {
1524 while (n > 0) {
1525 n--;
1526 if ((*encoder->filter_function)(*p++, encoder) < 0) {
1527 break;
1528 }
1529 }
1530 mbfl_convert_filter_flush(encoder);
1531 if (pc.status != 0 && mkwidth > 0) {
1532 pc.width += mkwidth;
1533 if (n > 0) {
1534 while (n > 0) {
1535 if ((*encoder->filter_function)(*p++, encoder) < 0) {
1536 break;
1537 }
1538 n--;
1539 }
1540 mbfl_convert_filter_flush(encoder);
1541 } else if (pc.outwidth > pc.width) {
1542 pc.status++;
1543 }
1544 if (pc.status != 1) {
1545 pc.status = 10;
1546 pc.device.pos = pc.endpos;
1547 mbfl_convert_filter_copy(pc.decoder_backup, pc.decoder);
1548 mbfl_convert_filter_reset(encoder, marker->encoding, &mbfl_encoding_wchar);
1549 p = marker->val;
1550 n = marker->len;
1551 while (n > 0) {
1552 if ((*encoder->filter_function)(*p++, encoder) < 0) {
1553 break;
1554 }
1555 n--;
1556 }
1557 mbfl_convert_filter_flush(encoder);
1558 }
1559 } else if (pc.status != 0) {
1560 pc.device.pos = pc.endpos;
1561 mbfl_convert_filter_copy(pc.decoder_backup, pc.decoder);
1562 }
1563 mbfl_convert_filter_flush(pc.decoder);
1564 }
1565 result = mbfl_memory_device_result(&pc.device, result);
1566 mbfl_convert_filter_delete(encoder);
1567 mbfl_convert_filter_delete(pc.decoder);
1568 mbfl_convert_filter_delete(pc.decoder_backup);
1569
1570 return result;
1571 }
1572
1573 mbfl_string *
mbfl_ja_jp_hantozen(mbfl_string * string,mbfl_string * result,int mode)1574 mbfl_ja_jp_hantozen(
1575 mbfl_string *string,
1576 mbfl_string *result,
1577 int mode)
1578 {
1579 size_t n;
1580 unsigned char *p;
1581 mbfl_memory_device device;
1582 mbfl_convert_filter *decoder = NULL;
1583 mbfl_convert_filter *encoder = NULL;
1584 mbfl_convert_filter *tl_filter = NULL;
1585 mbfl_convert_filter *next_filter = NULL;
1586 mbfl_filt_tl_jisx0201_jisx0208_param *param = NULL;
1587
1588 mbfl_memory_device_init(&device, string->len, 0);
1589 mbfl_string_init(result);
1590
1591 result->encoding = string->encoding;
1592
1593 decoder = mbfl_convert_filter_new(
1594 &mbfl_encoding_wchar,
1595 string->encoding,
1596 mbfl_memory_device_output, 0, &device);
1597 if (decoder == NULL) {
1598 goto out;
1599 }
1600 next_filter = decoder;
1601
1602 param = emalloc(sizeof(mbfl_filt_tl_jisx0201_jisx0208_param));
1603 param->mode = mode;
1604
1605 tl_filter = mbfl_convert_filter_new2(
1606 &vtbl_tl_jisx0201_jisx0208,
1607 (int(*)(int, void*))next_filter->filter_function,
1608 (flush_function_t)next_filter->filter_flush,
1609 next_filter);
1610 if (tl_filter == NULL) {
1611 efree(param);
1612 goto out;
1613 }
1614
1615 tl_filter->opaque = param;
1616 next_filter = tl_filter;
1617
1618 encoder = mbfl_convert_filter_new(
1619 string->encoding,
1620 &mbfl_encoding_wchar,
1621 (int(*)(int, void*))next_filter->filter_function,
1622 (flush_function_t)next_filter->filter_flush,
1623 next_filter);
1624 if (encoder == NULL) {
1625 goto out;
1626 }
1627
1628 /* feed data */
1629 p = string->val;
1630 n = string->len;
1631 if (p != NULL) {
1632 while (n > 0) {
1633 if ((*encoder->filter_function)(*p++, encoder) < 0) {
1634 break;
1635 }
1636 n--;
1637 }
1638 }
1639
1640 mbfl_convert_filter_flush(encoder);
1641 result = mbfl_memory_device_result(&device, result);
1642 out:
1643 if (tl_filter != NULL) {
1644 if (tl_filter->opaque != NULL) {
1645 efree(tl_filter->opaque);
1646 }
1647 mbfl_convert_filter_delete(tl_filter);
1648 }
1649
1650 if (decoder != NULL) {
1651 mbfl_convert_filter_delete(decoder);
1652 }
1653
1654 if (encoder != NULL) {
1655 mbfl_convert_filter_delete(encoder);
1656 }
1657
1658 return result;
1659 }
1660
1661
1662 /*
1663 * MIME header encode
1664 */
/* State of one MIME header encoding run (see mime_header_encoder_new()). */
struct mime_header_encoder_data {
	mbfl_convert_filter *conv1_filter;        /* input encoding -> wchar, feeds mime_header_encoder_collector */
	mbfl_convert_filter *block_filter;        /* wchar -> wchar, feeds mime_header_encoder_block_collector */
	mbfl_convert_filter *conv2_filter;        /* wchar -> output charset, piped into encod_filter */
	mbfl_convert_filter *conv2_filter_backup; /* scratch copy used to save/restore conv2_filter */
	mbfl_convert_filter *encod_filter;        /* output charset -> transfer encoding, writes to outdev */
	mbfl_convert_filter *encod_filter_backup; /* scratch copy used to save/restore encod_filter */
	mbfl_memory_device outdev;                /* encoded header being built */
	mbfl_memory_device tmpdev;                /* characters collected but not yet emitted */
	int status1;                              /* state of mime_header_encoder_collector (11: inside encoded word) */
	int status2;                              /* state of mime_header_encoder_block_collector (1: prefix emitted) */
	size_t prevpos;                           /* outdev.pos saved before a trial encode */
	size_t linehead;                          /* outdev.pos at which the current line starts */
	size_t firstindent;                       /* extra columns counted against the line length */
	int encnamelen;                           /* length of encname */
	int lwsplen;                              /* length of lwsp */
	char encname[128];                        /* encoded-word prefix, e.g. "=?ISO-2022-JP?B?" */
	char lwsp[16];                            /* line break followed by a space */
};
1684
1685 static int
mime_header_encoder_block_collector(int c,void * data)1686 mime_header_encoder_block_collector(int c, void *data)
1687 {
1688 size_t n;
1689 struct mime_header_encoder_data *pe = (struct mime_header_encoder_data *)data;
1690
1691 switch (pe->status2) {
1692 case 1: /* encoded word */
1693 pe->prevpos = pe->outdev.pos;
1694 mbfl_convert_filter_copy(pe->conv2_filter, pe->conv2_filter_backup);
1695 mbfl_convert_filter_copy(pe->encod_filter, pe->encod_filter_backup);
1696 (*pe->conv2_filter->filter_function)(c, pe->conv2_filter);
1697 (*pe->conv2_filter->filter_flush)(pe->conv2_filter);
1698 (*pe->encod_filter->filter_flush)(pe->encod_filter);
1699 n = pe->outdev.pos - pe->linehead + pe->firstindent;
1700 pe->outdev.pos = pe->prevpos;
1701 mbfl_convert_filter_copy(pe->conv2_filter_backup, pe->conv2_filter);
1702 mbfl_convert_filter_copy(pe->encod_filter_backup, pe->encod_filter);
1703 if (n >= 74) {
1704 (*pe->conv2_filter->filter_flush)(pe->conv2_filter);
1705 (*pe->encod_filter->filter_flush)(pe->encod_filter);
1706 mbfl_memory_device_strncat(&pe->outdev, "\x3f\x3d", 2); /* ?= */
1707 mbfl_memory_device_strncat(&pe->outdev, pe->lwsp, pe->lwsplen);
1708 pe->linehead = pe->outdev.pos;
1709 pe->firstindent = 0;
1710 mbfl_memory_device_strncat(&pe->outdev, pe->encname, pe->encnamelen);
1711 c = (*pe->conv2_filter->filter_function)(c, pe->conv2_filter);
1712 } else {
1713 c = (*pe->conv2_filter->filter_function)(c, pe->conv2_filter);
1714 }
1715 break;
1716
1717 default:
1718 mbfl_memory_device_strncat(&pe->outdev, pe->encname, pe->encnamelen);
1719 c = (*pe->conv2_filter->filter_function)(c, pe->conv2_filter);
1720 pe->status2 = 1;
1721 break;
1722 }
1723
1724 return c;
1725 }
1726
1727 static int
mime_header_encoder_collector(int c,void * data)1728 mime_header_encoder_collector(int c, void *data)
1729 {
1730 static int qp_table[256] = {
1731 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x00 */
1732 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x00 */
1733 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x20 */
1734 0, 0, 0, 0, 0, 0, 0 ,0, 0, 0, 0, 0, 0, 1, 0, 1, /* 0x10 */
1735 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x40 */
1736 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, /* 0x50 */
1737 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x60 */
1738 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, /* 0x70 */
1739 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x80 */
1740 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x90 */
1741 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xA0 */
1742 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xB0 */
1743 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xC0 */
1744 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xD0 */
1745 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xE0 */
1746 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 /* 0xF0 */
1747 };
1748
1749 size_t n;
1750 struct mime_header_encoder_data *pe = (struct mime_header_encoder_data *)data;
1751
1752 switch (pe->status1) {
1753 case 11: /* encoded word */
1754 (*pe->block_filter->filter_function)(c, pe->block_filter);
1755 break;
1756
1757 default: /* ASCII */
1758 if (c <= 0x00ff && !qp_table[(c & 0xff)]) { /* ordinary characters */
1759 mbfl_memory_device_output(c, &pe->tmpdev);
1760 pe->status1 = 1;
1761 } else if (pe->status1 == 0 && c == 0x20) { /* repeat SPACE */
1762 mbfl_memory_device_output(c, &pe->tmpdev);
1763 } else {
1764 if (pe->tmpdev.pos < 74 && c == 0x20) {
1765 n = pe->outdev.pos - pe->linehead + pe->tmpdev.pos + pe->firstindent;
1766 if (n > 74) {
1767 mbfl_memory_device_strncat(&pe->outdev, pe->lwsp, pe->lwsplen); /* LWSP */
1768 pe->linehead = pe->outdev.pos;
1769 pe->firstindent = 0;
1770 } else if (pe->outdev.pos > 0) {
1771 mbfl_memory_device_output(0x20, &pe->outdev);
1772 }
1773 mbfl_memory_device_devcat(&pe->outdev, &pe->tmpdev);
1774 mbfl_memory_device_reset(&pe->tmpdev);
1775 pe->status1 = 0;
1776 } else {
1777 n = pe->outdev.pos - pe->linehead + pe->encnamelen + pe->firstindent;
1778 if (n > 60) {
1779 mbfl_memory_device_strncat(&pe->outdev, pe->lwsp, pe->lwsplen); /* LWSP */
1780 pe->linehead = pe->outdev.pos;
1781 pe->firstindent = 0;
1782 } else if (pe->outdev.pos > 0) {
1783 mbfl_memory_device_output(0x20, &pe->outdev);
1784 }
1785 mbfl_convert_filter_devcat(pe->block_filter, &pe->tmpdev);
1786 mbfl_memory_device_reset(&pe->tmpdev);
1787 (*pe->block_filter->filter_function)(c, pe->block_filter);
1788 pe->status1 = 11;
1789 }
1790 }
1791 break;
1792 }
1793
1794 return c;
1795 }
1796
1797 mbfl_string *
mime_header_encoder_result(struct mime_header_encoder_data * pe,mbfl_string * result)1798 mime_header_encoder_result(struct mime_header_encoder_data *pe, mbfl_string *result)
1799 {
1800 if (pe->status1 >= 10) {
1801 (*pe->conv2_filter->filter_flush)(pe->conv2_filter);
1802 (*pe->encod_filter->filter_flush)(pe->encod_filter);
1803 mbfl_memory_device_strncat(&pe->outdev, "\x3f\x3d", 2); /* ?= */
1804 } else if (pe->tmpdev.pos > 0) {
1805 if (pe->outdev.pos > 0) {
1806 if ((pe->outdev.pos - pe->linehead + pe->tmpdev.pos + pe->firstindent) > 74) {
1807 mbfl_memory_device_strncat(&pe->outdev, pe->lwsp, pe->lwsplen);
1808 } else {
1809 mbfl_memory_device_output(0x20, &pe->outdev);
1810 }
1811 }
1812 mbfl_memory_device_devcat(&pe->outdev, &pe->tmpdev);
1813 }
1814 mbfl_memory_device_reset(&pe->tmpdev);
1815 pe->prevpos = 0;
1816 pe->linehead = 0;
1817 pe->status1 = 0;
1818 pe->status2 = 0;
1819
1820 return mbfl_memory_device_result(&pe->outdev, result);
1821 }
1822
1823 struct mime_header_encoder_data*
mime_header_encoder_new(const mbfl_encoding * incode,const mbfl_encoding * outcode,const mbfl_encoding * transenc)1824 mime_header_encoder_new(
1825 const mbfl_encoding *incode,
1826 const mbfl_encoding *outcode,
1827 const mbfl_encoding *transenc)
1828 {
1829 size_t n;
1830 const char *s;
1831 struct mime_header_encoder_data *pe;
1832
1833 /* get output encoding and check MIME charset name */
1834 if (outcode->mime_name == NULL || outcode->mime_name[0] == '\0') {
1835 return NULL;
1836 }
1837
1838 pe = emalloc(sizeof(struct mime_header_encoder_data));
1839 mbfl_memory_device_init(&pe->outdev, 0, 0);
1840 mbfl_memory_device_init(&pe->tmpdev, 0, 0);
1841 pe->prevpos = 0;
1842 pe->linehead = 0;
1843 pe->firstindent = 0;
1844 pe->status1 = 0;
1845 pe->status2 = 0;
1846
1847 /* make the encoding description string exp. "=?ISO-2022-JP?B?" */
1848 n = 0;
1849 pe->encname[n++] = 0x3d;
1850 pe->encname[n++] = 0x3f;
1851 s = outcode->mime_name;
1852 while (*s) {
1853 pe->encname[n++] = *s++;
1854 }
1855 pe->encname[n++] = 0x3f;
1856 if (transenc->no_encoding == mbfl_no_encoding_qprint) {
1857 pe->encname[n++] = 0x51;
1858 } else {
1859 pe->encname[n++] = 0x42;
1860 transenc = &mbfl_encoding_base64;
1861 }
1862 pe->encname[n++] = 0x3f;
1863 pe->encname[n] = '\0';
1864 pe->encnamelen = n;
1865
1866 n = 0;
1867 pe->lwsp[n++] = 0x0d;
1868 pe->lwsp[n++] = 0x0a;
1869 pe->lwsp[n++] = 0x20;
1870 pe->lwsp[n] = '\0';
1871 pe->lwsplen = n;
1872
1873 /* transfer encode filter */
1874 pe->encod_filter = mbfl_convert_filter_new(outcode, transenc, mbfl_memory_device_output, 0, &(pe->outdev));
1875 pe->encod_filter_backup = mbfl_convert_filter_new(outcode, transenc, mbfl_memory_device_output, 0, &(pe->outdev));
1876
1877 /* Output code filter */
1878 pe->conv2_filter = mbfl_convert_filter_new(&mbfl_encoding_wchar, outcode, mbfl_filter_output_pipe, 0, pe->encod_filter);
1879 pe->conv2_filter_backup = mbfl_convert_filter_new(&mbfl_encoding_wchar, outcode, mbfl_filter_output_pipe, 0, pe->encod_filter);
1880
1881 /* encoded block filter */
1882 pe->block_filter = mbfl_convert_filter_new(&mbfl_encoding_wchar, &mbfl_encoding_wchar, mime_header_encoder_block_collector, 0, pe);
1883
1884 /* Input code filter */
1885 pe->conv1_filter = mbfl_convert_filter_new(incode, &mbfl_encoding_wchar, mime_header_encoder_collector, 0, pe);
1886
1887 if (pe->encod_filter == NULL ||
1888 pe->encod_filter_backup == NULL ||
1889 pe->conv2_filter == NULL ||
1890 pe->conv2_filter_backup == NULL ||
1891 pe->conv1_filter == NULL) {
1892 mime_header_encoder_delete(pe);
1893 return NULL;
1894 }
1895
1896 if (transenc->no_encoding == mbfl_no_encoding_qprint) {
1897 pe->encod_filter->status |= MBFL_QPRINT_STS_MIME_HEADER;
1898 pe->encod_filter_backup->status |= MBFL_QPRINT_STS_MIME_HEADER;
1899 } else {
1900 pe->encod_filter->status |= MBFL_BASE64_STS_MIME_HEADER;
1901 pe->encod_filter_backup->status |= MBFL_BASE64_STS_MIME_HEADER;
1902 }
1903
1904 return pe;
1905 }
1906
1907 void
mime_header_encoder_delete(struct mime_header_encoder_data * pe)1908 mime_header_encoder_delete(struct mime_header_encoder_data *pe)
1909 {
1910 if (pe) {
1911 mbfl_convert_filter_delete(pe->conv1_filter);
1912 mbfl_convert_filter_delete(pe->block_filter);
1913 mbfl_convert_filter_delete(pe->conv2_filter);
1914 mbfl_convert_filter_delete(pe->conv2_filter_backup);
1915 mbfl_convert_filter_delete(pe->encod_filter);
1916 mbfl_convert_filter_delete(pe->encod_filter_backup);
1917 mbfl_memory_device_clear(&pe->outdev);
1918 mbfl_memory_device_clear(&pe->tmpdev);
1919 efree((void*)pe);
1920 }
1921 }
1922
1923 mbfl_string *
mbfl_mime_header_encode(mbfl_string * string,mbfl_string * result,const mbfl_encoding * outcode,const mbfl_encoding * encoding,const char * linefeed,int indent)1924 mbfl_mime_header_encode(
1925 mbfl_string *string,
1926 mbfl_string *result,
1927 const mbfl_encoding *outcode,
1928 const mbfl_encoding *encoding,
1929 const char *linefeed,
1930 int indent)
1931 {
1932 size_t n;
1933 unsigned char *p;
1934 struct mime_header_encoder_data *pe;
1935
1936 mbfl_string_init(result);
1937 result->encoding = &mbfl_encoding_ascii;
1938
1939 pe = mime_header_encoder_new(string->encoding, outcode, encoding);
1940 if (pe == NULL) {
1941 return NULL;
1942 }
1943
1944 if (linefeed != NULL) {
1945 n = 0;
1946 while (*linefeed && n < 8) {
1947 pe->lwsp[n++] = *linefeed++;
1948 }
1949 pe->lwsp[n++] = 0x20;
1950 pe->lwsp[n] = '\0';
1951 pe->lwsplen = n;
1952 }
1953 if (indent > 0 && indent < 74) {
1954 pe->firstindent = indent;
1955 }
1956
1957 n = string->len;
1958 p = string->val;
1959 while (n > 0) {
1960 (*pe->conv1_filter->filter_function)(*p++, pe->conv1_filter);
1961 n--;
1962 }
1963
1964 result = mime_header_encoder_result(pe, result);
1965 mime_header_encoder_delete(pe);
1966
1967 return result;
1968 }
1969
1970
1971 /*
1972 * MIME header decode
1973 */
/* State of one MIME header decoding run (see mime_header_decoder_new()). */
struct mime_header_decoder_data {
	mbfl_convert_filter *deco_filter;  /* transfer decoding -> 8bit, piped into conv1_filter */
	mbfl_convert_filter *conv1_filter; /* current charset -> wchar, piped into conv2_filter */
	mbfl_convert_filter *conv2_filter; /* wchar -> outcode, writes to outdev */
	mbfl_memory_device outdev;         /* decoded output */
	mbfl_memory_device tmpdev;         /* bytes held back while an encoded word is being recognised */
	size_t cspos;                      /* offset in tmpdev where the charset name starts */
	int status;                        /* state of mime_header_decoder_collector */
	const mbfl_encoding *encoding;     /* transfer encoding of the current encoded word */
	const mbfl_encoding *incode;       /* charset of the current encoded word */
	const mbfl_encoding *outcode;      /* encoding of the decoded output */
};
1986
/*
 * State machine that consumes one input byte of a MIME header and decodes
 * encoded words of the form "=?charset?B|Q?payload?=".
 *
 * pd->status values:
 *   0 (default) plain text, forwarded to conv1_filter
 *   1  '=' seen, waiting for '?'
 *   2  collecting the charset name
 *   3  waiting for the transfer encoding letter (B/b or Q/q)
 *   4  waiting for the '?' that starts the payload
 *   5  inside the encoded payload
 *   6  '?' seen inside the payload, waiting for '='
 *   7  just after an encoded word
 *   8  line folding after an encoded word
 *   9  line folding after plain text
 *
 * Bytes that might start an encoded word are held in tmpdev and pushed to
 * conv1_filter unchanged when the candidate turns out not to be one.
 *
 * Returns c unchanged.
 */
static int
mime_header_decoder_collector(int c, void* data)
{
	const mbfl_encoding *encoding;
	struct mime_header_decoder_data *pd = (struct mime_header_decoder_data*)data;

	switch (pd->status) {
	case 1:
		if (c == 0x3f) {		/* ? */
			mbfl_memory_device_output(c, &pd->tmpdev);
			/* the charset name begins right after "=?" */
			pd->cspos = pd->tmpdev.pos;
			pd->status = 2;
		} else {
			/* not an encoded word: release what was held back */
			mbfl_convert_filter_devcat(pd->conv1_filter, &pd->tmpdev);
			mbfl_memory_device_reset(&pd->tmpdev);
			if (c == 0x3d) {		/* = */
				mbfl_memory_device_output(c, &pd->tmpdev);
			} else if (c == 0x0d || c == 0x0a) {	/* CR or LF */
				pd->status = 9;
			} else {
				(*pd->conv1_filter->filter_function)(c, pd->conv1_filter);
				pd->status = 0;
			}
		}
		break;
	case 2:		/* store charset string */
		if (c == 0x3f) {		/* ? */
			/* identify charset */
			/* temporarily NUL-terminate the name for the lookup */
			mbfl_memory_device_output('\0', &pd->tmpdev);
			encoding = mbfl_name2encoding((const char *)&pd->tmpdev.buffer[pd->cspos]);
			if (encoding != NULL) {
				pd->incode = encoding;
				pd->status = 3;
			}
			/* drop the NUL again and keep the '?' */
			mbfl_memory_device_unput(&pd->tmpdev);
			mbfl_memory_device_output(c, &pd->tmpdev);
		} else {
			mbfl_memory_device_output(c, &pd->tmpdev);
			if (pd->tmpdev.pos > 100) {		/* too long charset string */
				pd->status = 0;
			} else if (c == 0x0d || c == 0x0a) {	/* CR or LF */
				mbfl_memory_device_unput(&pd->tmpdev);
				pd->status = 9;
			}
			if (pd->status != 2) {
				/* gave up on this candidate: emit the held bytes as plain text */
				mbfl_convert_filter_devcat(pd->conv1_filter, &pd->tmpdev);
				mbfl_memory_device_reset(&pd->tmpdev);
			}
		}
		break;
	case 3:		/* identify encoding */
		mbfl_memory_device_output(c, &pd->tmpdev);
		if (c == 0x42 || c == 0x62) {		/* 'B' or 'b' */
			pd->encoding = &mbfl_encoding_base64;
			pd->status = 4;
		} else if (c == 0x51 || c == 0x71) {	/* 'Q' or 'q' */
			pd->encoding = &mbfl_encoding_qprint;
			pd->status = 4;
		} else {
			if (c == 0x0d || c == 0x0a) {	/* CR or LF */
				mbfl_memory_device_unput(&pd->tmpdev);
				pd->status = 9;
			} else {
				pd->status = 0;
			}
			mbfl_convert_filter_devcat(pd->conv1_filter, &pd->tmpdev);
			mbfl_memory_device_reset(&pd->tmpdev);
		}
		break;
	case 4:		/* reset filter */
		mbfl_memory_device_output(c, &pd->tmpdev);
		if (c == 0x3f) {		/* ? */
			/* charset convert filter */
			mbfl_convert_filter_reset(pd->conv1_filter, pd->incode, &mbfl_encoding_wchar);
			/* decode filter */
			mbfl_convert_filter_reset(pd->deco_filter, pd->encoding, &mbfl_encoding_8bit);
			pd->status = 5;
		} else {
			if (c == 0x0d || c == 0x0a) {	/* CR or LF */
				mbfl_memory_device_unput(&pd->tmpdev);
				pd->status = 9;
			} else {
				pd->status = 0;
			}
			mbfl_convert_filter_devcat(pd->conv1_filter, &pd->tmpdev);
		}
		/* the held prefix is discarded on success and already emitted on failure */
		mbfl_memory_device_reset(&pd->tmpdev);
		break;
	case 5:		/* encoded block */
		if (c == 0x3f) {		/* ? */
			pd->status = 6;
		} else {
			(*pd->deco_filter->filter_function)(c, pd->deco_filter);
		}
		break;
	case 6:		/* check end position */
		if (c == 0x3d) {		/* = */
			/* flush and reset filter */
			(*pd->deco_filter->filter_flush)(pd->deco_filter);
			(*pd->conv1_filter->filter_flush)(pd->conv1_filter);
			mbfl_convert_filter_reset(pd->conv1_filter, &mbfl_encoding_ascii, &mbfl_encoding_wchar);
			pd->status = 7;
		} else {
			/* the '?' was payload after all: pass it on */
			(*pd->deco_filter->filter_function)(0x3f, pd->deco_filter);
			if (c != 0x3f) {		/* ? */
				(*pd->deco_filter->filter_function)(c, pd->deco_filter);
				pd->status = 5;
			}
		}
		break;
	case 7:		/* after encoded block */
		if (c == 0x0d || c == 0x0a) {	/* CR LF */
			pd->status = 8;
		} else {
			mbfl_memory_device_output(c, &pd->tmpdev);
			if (c == 0x3d) {		/* = */
				pd->status = 1;
			} else if (c != 0x20 && c != 0x09) {		/* not space */
				mbfl_convert_filter_devcat(pd->conv1_filter, &pd->tmpdev);
				mbfl_memory_device_reset(&pd->tmpdev);
				pd->status = 0;
			}
		}
		break;
	case 8:		/* folding */
	case 9:		/* folding */
		/* CR, LF, space and tab are skipped while folding */
		if (c != 0x0d && c != 0x0a && c != 0x20 && c != 0x09) {
			if (c == 0x3d) {		/* = */
				if (pd->status == 8) {
					mbfl_memory_device_output(0x20, &pd->tmpdev);	/* SPACE */
				} else {
					(*pd->conv1_filter->filter_function)(0x20, pd->conv1_filter);
				}
				mbfl_memory_device_output(c, &pd->tmpdev);
				pd->status = 1;
			} else {
				/* the fold is replaced by a single space */
				mbfl_memory_device_output(0x20, &pd->tmpdev);
				mbfl_memory_device_output(c, &pd->tmpdev);
				mbfl_convert_filter_devcat(pd->conv1_filter, &pd->tmpdev);
				mbfl_memory_device_reset(&pd->tmpdev);
				pd->status = 0;
			}
		}
		break;
	default:		/* non encoded block */
		if (c == 0x0d || c == 0x0a) {	/* CR LF */
			pd->status = 9;
		} else if (c == 0x3d) {		/* = */
			mbfl_memory_device_output(c, &pd->tmpdev);
			pd->status = 1;
		} else {
			(*pd->conv1_filter->filter_function)(c, pd->conv1_filter);
		}
		break;
	}

	return c;
}
2145
2146 mbfl_string *
mime_header_decoder_result(struct mime_header_decoder_data * pd,mbfl_string * result)2147 mime_header_decoder_result(struct mime_header_decoder_data *pd, mbfl_string *result)
2148 {
2149 switch (pd->status) {
2150 case 1:
2151 case 2:
2152 case 3:
2153 case 4:
2154 case 7:
2155 case 8:
2156 case 9:
2157 mbfl_convert_filter_devcat(pd->conv1_filter, &pd->tmpdev);
2158 break;
2159 case 5:
2160 case 6:
2161 (*pd->deco_filter->filter_flush)(pd->deco_filter);
2162 (*pd->conv1_filter->filter_flush)(pd->conv1_filter);
2163 break;
2164 }
2165 (*pd->conv2_filter->filter_flush)(pd->conv2_filter);
2166 mbfl_memory_device_reset(&pd->tmpdev);
2167 pd->status = 0;
2168
2169 return mbfl_memory_device_result(&pd->outdev, result);
2170 }
2171
2172 struct mime_header_decoder_data*
mime_header_decoder_new(const mbfl_encoding * outcode)2173 mime_header_decoder_new(const mbfl_encoding *outcode)
2174 {
2175 struct mime_header_decoder_data *pd = emalloc(sizeof(struct mime_header_decoder_data));
2176
2177 mbfl_memory_device_init(&pd->outdev, 0, 0);
2178 mbfl_memory_device_init(&pd->tmpdev, 0, 0);
2179 pd->cspos = 0;
2180 pd->status = 0;
2181 pd->encoding = &mbfl_encoding_8bit;
2182 pd->incode = &mbfl_encoding_ascii;
2183 pd->outcode = outcode;
2184 /* charset convert filter */
2185 pd->conv2_filter = mbfl_convert_filter_new(&mbfl_encoding_wchar, pd->outcode, mbfl_memory_device_output, 0, &pd->outdev);
2186 pd->conv1_filter = mbfl_convert_filter_new(pd->incode, &mbfl_encoding_wchar, mbfl_filter_output_pipe, 0, pd->conv2_filter);
2187 /* decode filter */
2188 pd->deco_filter = mbfl_convert_filter_new(pd->encoding, &mbfl_encoding_8bit, mbfl_filter_output_pipe, 0, pd->conv1_filter);
2189
2190 if (pd->conv1_filter == NULL || pd->conv2_filter == NULL || pd->deco_filter == NULL) {
2191 mime_header_decoder_delete(pd);
2192 return NULL;
2193 }
2194
2195 return pd;
2196 }
2197
2198 void
mime_header_decoder_delete(struct mime_header_decoder_data * pd)2199 mime_header_decoder_delete(struct mime_header_decoder_data *pd)
2200 {
2201 if (pd) {
2202 mbfl_convert_filter_delete(pd->conv2_filter);
2203 mbfl_convert_filter_delete(pd->conv1_filter);
2204 mbfl_convert_filter_delete(pd->deco_filter);
2205 mbfl_memory_device_clear(&pd->outdev);
2206 mbfl_memory_device_clear(&pd->tmpdev);
2207 efree((void*)pd);
2208 }
2209 }
2210
2211 mbfl_string *
mbfl_mime_header_decode(mbfl_string * string,mbfl_string * result,const mbfl_encoding * outcode)2212 mbfl_mime_header_decode(
2213 mbfl_string *string,
2214 mbfl_string *result,
2215 const mbfl_encoding *outcode)
2216 {
2217 size_t n;
2218 unsigned char *p;
2219 struct mime_header_decoder_data *pd;
2220
2221 mbfl_string_init(result);
2222 result->encoding = outcode;
2223
2224 pd = mime_header_decoder_new(outcode);
2225 if (pd == NULL) {
2226 return NULL;
2227 }
2228
2229 /* feed data */
2230 n = string->len;
2231 p = string->val;
2232 while (n > 0) {
2233 mime_header_decoder_collector(*p++, pd);
2234 n--;
2235 }
2236
2237 result = mime_header_decoder_result(pd, result);
2238 mime_header_decoder_delete(pd);
2239
2240 return result;
2241 }
2242
2243
2244
2245 /*
2246 * convert HTML numeric entity
2247 */
/* State shared by the HTML numeric entity encode/decode collectors. */
struct collector_htmlnumericentity_data {
	mbfl_convert_filter *decoder; /* filter that receives the produced characters */
	int status;                   /* decode parser state (1: '&' seen, 2: "&#" seen, 3: reading decimal digits) */
	int cache;                    /* numeric value accumulated while decoding */
	int digit;                    /* number of digits accumulated while decoding */
	int *convmap;                 /* conversion map: mapsize entries of 4 ints (range low, range high, offset, mask) */
	int mapsize;                  /* number of entries in convmap */
};
2256
2257 static int
collector_encode_htmlnumericentity(int c,void * data)2258 collector_encode_htmlnumericentity(int c, void *data)
2259 {
2260 struct collector_htmlnumericentity_data *pc = (struct collector_htmlnumericentity_data *)data;
2261 int f, n, s, r, d, size, *mapelm;
2262
2263 size = pc->mapsize;
2264 f = 0;
2265 n = 0;
2266 while (n < size) {
2267 mapelm = &(pc->convmap[n*4]);
2268 if (c >= mapelm[0] && c <= mapelm[1]) {
2269 s = (c + mapelm[2]) & mapelm[3];
2270 if (s >= 0) {
2271 (*pc->decoder->filter_function)(0x26, pc->decoder); /* '&' */
2272 (*pc->decoder->filter_function)(0x23, pc->decoder); /* '#' */
2273 r = 100000000;
2274 s %= r;
2275 while (r > 0) {
2276 d = s/r;
2277 if (d || f) {
2278 f = 1;
2279 s %= r;
2280 (*pc->decoder->filter_function)(mbfl_hexchar_table[d], pc->decoder);
2281 }
2282 r /= 10;
2283 }
2284 if (!f) {
2285 f = 1;
2286 (*pc->decoder->filter_function)(mbfl_hexchar_table[0], pc->decoder);
2287 }
2288 (*pc->decoder->filter_function)(0x3b, pc->decoder); /* ';' */
2289 }
2290 }
2291 if (f) {
2292 break;
2293 }
2294 n++;
2295 }
2296 if (!f) {
2297 (*pc->decoder->filter_function)(c, pc->decoder);
2298 }
2299
2300 return c;
2301 }
2302
2303 static int
collector_decode_htmlnumericentity(int c,void * data)2304 collector_decode_htmlnumericentity(int c, void *data)
2305 {
2306 struct collector_htmlnumericentity_data *pc = (struct collector_htmlnumericentity_data *)data;
2307 int f, n, s, r, d, size, *mapelm;
2308
2309 switch (pc->status) {
2310 case 1:
2311 if (c == 0x23) { /* '#' */
2312 pc->status = 2;
2313 } else {
2314 pc->status = 0;
2315 (*pc->decoder->filter_function)(0x26, pc->decoder); /* '&' */
2316 (*pc->decoder->filter_function)(c, pc->decoder);
2317 }
2318 break;
2319 case 2:
2320 if (c == 0x78) { /* 'x' */
2321 pc->status = 4;
2322 } else if (c >= 0x30 && c <= 0x39) { /* '0' - '9' */
2323 pc->cache = c - 0x30;
2324 pc->status = 3;
2325 pc->digit = 1;
2326 } else {
2327 pc->status = 0;
2328 (*pc->decoder->filter_function)(0x26, pc->decoder); /* '&' */
2329 (*pc->decoder->filter_function)(0x23, pc->decoder); /* '#' */
2330 (*pc->decoder->filter_function)(c, pc->decoder);
2331 }
2332 break;
2333 case 3:
2334 s = 0;
2335 f = 0;
2336 if (c >= 0x30 && c <= 0x39) { /* '0' - '9' */
2337 s = pc->cache;
2338 if (pc->digit > 9 || s > INT_MAX/10) {
2339 pc->status = 0;
2340 f = 1;
2341 } else {
2342 s = s*10 + (c - 0x30);
2343 pc->cache = s;
2344 pc->digit++;
2345 }
2346 } else {
2347 pc->status = 0;
2348 s = pc->cache;
2349 f = 1;
2350 n = 0;
2351 size = pc->mapsize;
2352 while (n < size) {
2353 mapelm = &(pc->convmap[n*4]);
2354 d = s - mapelm[2];
2355 if (d >= mapelm[0] && d <= mapelm[1]) {
2356 f = 0;
2357 (*pc->decoder->filter_function)(d, pc->decoder);
2358 if (c != 0x3b) { /* ';' */
2359 (*pc->decoder->filter_function)(c, pc->decoder);
2360 }
2361 break;
2362 }
2363 n++;
2364 }
2365 }
2366 if (f) {
2367 (*pc->decoder->filter_function)(0x26, pc->decoder); /* '&' */
2368 (*pc->decoder->filter_function)(0x23, pc->decoder); /* '#' */
2369 r = 1;
2370 n = pc->digit;
2371 while (n > 1) {
2372 r *= 10;
2373 n--;
2374 }
2375 while (r > 0) {
2376 d = s/r;
2377 s %= r;
2378 r /= 10;
2379 (*pc->decoder->filter_function)(mbfl_hexchar_table[d], pc->decoder);
2380 }
2381 (*pc->decoder->filter_function)(c, pc->decoder);
2382 }
2383 break;
2384 case 4:
2385 if (c >= 0x30 && c <= 0x39) { /* '0' - '9' */
2386 pc->cache = c - 0x30;
2387 pc->status = 5;
2388 pc->digit = 1;
2389 } else if (c >= 0x41 && c <= 0x46) { /* 'A' - 'F' */
2390 pc->cache = c - 0x41 + 10;
2391 pc->status = 5;
2392 pc->digit = 1;
2393 } else if (c >= 0x61 && c <= 0x66) { /* 'a' - 'f' */
2394 pc->cache = c - 0x61 + 10;
2395 pc->status = 5;
2396 pc->digit = 1;
2397 } else {
2398 pc->status = 0;
2399 (*pc->decoder->filter_function)(0x26, pc->decoder); /* '&' */
2400 (*pc->decoder->filter_function)(0x23, pc->decoder); /* '#' */
2401 (*pc->decoder->filter_function)(0x78, pc->decoder); /* 'x' */
2402 (*pc->decoder->filter_function)(c, pc->decoder);
2403 }
2404 break;
2405 case 5:
2406 s = 0;
2407 f = 0;
2408 if ((c >= 0x30 && c <= 0x39) ||
2409 (c >= 0x41 && c <= 0x46) ||
2410 (c >= 0x61 && c <= 0x66)) { /* '0' - '9' or 'a' - 'f' */
2411 if (pc->digit > 9) {
2412 pc->status = 0;
2413 s = pc->cache;
2414 f = 1;
2415 } else {
2416 if (c >= 0x30 && c <= 0x39) {
2417 s = pc->cache*16 + (c - 0x30);
2418 } else if (c >= 0x41 && c <= 0x46) {
2419 s = pc->cache*16 + (c - 0x41 + 10);
2420 } else {
2421 s = pc->cache*16 + (c - 0x61 + 10);
2422 }
2423 pc->cache = s;
2424 pc->digit++;
2425 }
2426 } else {
2427 pc->status = 0;
2428 s = pc->cache;
2429 f = 1;
2430 n = 0;
2431 size = pc->mapsize;
2432 while (n < size) {
2433 mapelm = &(pc->convmap[n*4]);
2434 d = s - mapelm[2];
2435 if (d >= mapelm[0] && d <= mapelm[1]) {
2436 f = 0;
2437 (*pc->decoder->filter_function)(d, pc->decoder);
2438 if (c != 0x3b) { /* ';' */
2439 (*pc->decoder->filter_function)(c, pc->decoder);
2440 }
2441 break;
2442 }
2443 n++;
2444 }
2445 }
2446 if (f) {
2447 (*pc->decoder->filter_function)(0x26, pc->decoder); /* '&' */
2448 (*pc->decoder->filter_function)(0x23, pc->decoder); /* '#' */
2449 (*pc->decoder->filter_function)(0x78, pc->decoder); /* 'x' */
2450 r = 1;
2451 n = pc->digit;
2452 while (n > 0) {
2453 r *= 16;
2454 n--;
2455 }
2456 s %= r;
2457 r /= 16;
2458 while (r > 0) {
2459 d = s/r;
2460 s %= r;
2461 r /= 16;
2462 (*pc->decoder->filter_function)(mbfl_hexchar_table[d], pc->decoder);
2463 }
2464 (*pc->decoder->filter_function)(c, pc->decoder);
2465 }
2466 break;
2467 default:
2468 if (c == 0x26) { /* '&' */
2469 pc->status = 1;
2470 } else {
2471 (*pc->decoder->filter_function)(c, pc->decoder);
2472 }
2473 break;
2474 }
2475
2476 return c;
2477 }
2478
2479 static int
collector_encode_hex_htmlnumericentity(int c,void * data)2480 collector_encode_hex_htmlnumericentity(int c, void *data)
2481 {
2482 struct collector_htmlnumericentity_data *pc = (struct collector_htmlnumericentity_data *)data;
2483 int f, n, s, r, d, size, *mapelm;
2484
2485 size = pc->mapsize;
2486 f = 0;
2487 n = 0;
2488 while (n < size) {
2489 mapelm = &(pc->convmap[n*4]);
2490 if (c >= mapelm[0] && c <= mapelm[1]) {
2491 s = (c + mapelm[2]) & mapelm[3];
2492 if (s >= 0) {
2493 (*pc->decoder->filter_function)(0x26, pc->decoder); /* '&' */
2494 (*pc->decoder->filter_function)(0x23, pc->decoder); /* '#' */
2495 (*pc->decoder->filter_function)(0x78, pc->decoder); /* 'x' */
2496 r = 0x1000000;
2497 s %= r;
2498 while (r > 0) {
2499 d = s/r;
2500 if (d || f) {
2501 f = 1;
2502 s %= r;
2503 (*pc->decoder->filter_function)(mbfl_hexchar_table[d], pc->decoder);
2504 }
2505 r /= 16;
2506 }
2507 if (!f) {
2508 f = 1;
2509 (*pc->decoder->filter_function)(mbfl_hexchar_table[0], pc->decoder);
2510 }
2511 (*pc->decoder->filter_function)(0x3b, pc->decoder); /* ';' */
2512 }
2513 }
2514 if (f) {
2515 break;
2516 }
2517 n++;
2518 }
2519 if (!f) {
2520 (*pc->decoder->filter_function)(c, pc->decoder);
2521 }
2522
2523 return c;
2524 }
2525
/*
 * Flush handler for the numeric entity decoder: writes out any entity
 * text that was still being held back and resets the parser state.
 *
 * filter - despite its declared type, this pointer is used as a
 *          struct collector_htmlnumericentity_data.  The function is
 *          registered (cast to flush_function_t) next to that structure
 *          when the decoding filter is created.
 *          NOTE(review): presumably the filter passes its data pointer to
 *          the flush callback - confirm against mbfl_convert_filter_flush().
 *
 * The held-back prefix ("&", "&#" or "&#x") is emitted literally and the
 * digits are regenerated from pc->cache through mbfl_hexchar_table.
 * Always returns 0.
 */
int mbfl_filt_decode_htmlnumericentity_flush(mbfl_convert_filter *filter)
{
	struct collector_htmlnumericentity_data *pc = (struct collector_htmlnumericentity_data *)filter;
	int n, s, r, d;
	unsigned int u;

	switch (pc->status) {
	case 1: /* '&' */
		(*pc->decoder->filter_function)(0x26, pc->decoder);		/* '&' */
		break;
	case 2: /* '#' */
		(*pc->decoder->filter_function)(0x26, pc->decoder);		/* '&' */
		(*pc->decoder->filter_function)(0x23, pc->decoder);		/* '#' */
		break;
	case 3: /* '0'-'9' */
		(*pc->decoder->filter_function)(0x26, pc->decoder);		/* '&' */
		(*pc->decoder->filter_function)(0x23, pc->decoder);		/* '#' */

		/* r = 10^(digit-1), then peel off one decimal digit per step */
		s = pc->cache;
		r = 1;
		n = pc->digit;
		while (n > 1) {
			r *= 10;
			n--;
		}
		while (r > 0) {
			d = s/r;
			s %= r;
			r /= 10;
			(*pc->decoder->filter_function)(mbfl_hexchar_table[d], pc->decoder);
		}

		break;
	case 4: /* 'x' */
		(*pc->decoder->filter_function)(0x26, pc->decoder);		/* '&' */
		(*pc->decoder->filter_function)(0x23, pc->decoder);		/* '#' */
		(*pc->decoder->filter_function)(0x78, pc->decoder);		/* 'x' */
		break;
	case 5: /* '0'-'9','a'-'f' */
		(*pc->decoder->filter_function)(0x26, pc->decoder);		/* '&' */
		(*pc->decoder->filter_function)(0x23, pc->decoder);		/* '#' */
		(*pc->decoder->filter_function)(0x78, pc->decoder);		/* 'x' */

		/*
		 * Write pc->digit hex digits, most significant first, by
		 * extracting nibbles from the value viewed as unsigned.
		 * Computing 16^digit in an int instead overflows for eight or
		 * more digits and can end in a division by zero.  Positions
		 * beyond the width of an int (two hex digits per byte, assuming
		 * 8-bit bytes) are written as '0'.
		 */
		u = (unsigned int)pc->cache;
		n = pc->digit;
		while (n > 0) {
			n--;
			if (n < (int)(sizeof(unsigned int) * 2)) {
				d = (int)((u >> (n*4)) & 0xf);
			} else {
				d = 0;
			}
			(*pc->decoder->filter_function)(mbfl_hexchar_table[d], pc->decoder);
		}
		break;
	default:	/* idle: nothing is pending */
		break;
	}

	pc->status = 0;
	pc->cache = 0;
	pc->digit = 0;

	return 0;
}
2596
2597
2598 mbfl_string *
mbfl_html_numeric_entity(mbfl_string * string,mbfl_string * result,int * convmap,int mapsize,int type)2599 mbfl_html_numeric_entity(
2600 mbfl_string *string,
2601 mbfl_string *result,
2602 int *convmap,
2603 int mapsize,
2604 int type)
2605 {
2606 struct collector_htmlnumericentity_data pc;
2607 mbfl_memory_device device;
2608 mbfl_convert_filter *encoder;
2609 size_t n;
2610 unsigned char *p;
2611
2612 if (string == NULL || result == NULL) {
2613 return NULL;
2614 }
2615 mbfl_string_init(result);
2616 result->encoding = string->encoding;
2617 mbfl_memory_device_init(&device, string->len, 0);
2618
2619 /* output code filter */
2620 pc.decoder = mbfl_convert_filter_new(
2621 &mbfl_encoding_wchar,
2622 string->encoding,
2623 mbfl_memory_device_output, 0, &device);
2624 /* wchar filter */
2625 if (type == 0) { /* decimal output */
2626 encoder = mbfl_convert_filter_new(
2627 string->encoding,
2628 &mbfl_encoding_wchar,
2629 collector_encode_htmlnumericentity, 0, &pc);
2630 } else if (type == 2) { /* hex output */
2631 encoder = mbfl_convert_filter_new(
2632 string->encoding,
2633 &mbfl_encoding_wchar,
2634 collector_encode_hex_htmlnumericentity, 0, &pc);
2635 } else { /* type == 1: decimal/hex input */
2636 encoder = mbfl_convert_filter_new(
2637 string->encoding,
2638 &mbfl_encoding_wchar,
2639 collector_decode_htmlnumericentity,
2640 (flush_function_t)mbfl_filt_decode_htmlnumericentity_flush, &pc);
2641 }
2642 if (pc.decoder == NULL || encoder == NULL) {
2643 mbfl_convert_filter_delete(encoder);
2644 mbfl_convert_filter_delete(pc.decoder);
2645 return NULL;
2646 }
2647 pc.status = 0;
2648 pc.cache = 0;
2649 pc.digit = 0;
2650 pc.convmap = convmap;
2651 pc.mapsize = mapsize;
2652
2653 /* feed data */
2654 p = string->val;
2655 n = string->len;
2656 if (p != NULL) {
2657 while (n > 0) {
2658 if ((*encoder->filter_function)(*p++, encoder) < 0) {
2659 break;
2660 }
2661 n--;
2662 }
2663 }
2664 mbfl_convert_filter_flush(encoder);
2665 mbfl_convert_filter_flush(pc.decoder);
2666 result = mbfl_memory_device_result(&device, result);
2667 mbfl_convert_filter_delete(encoder);
2668 mbfl_convert_filter_delete(pc.decoder);
2669
2670 return result;
2671 }
2672