1 /*
2 * charset=UTF-8
3 */
4
5 /*
6 * "streamable kanji code filter and converter"
7 *
8 * Copyright (c) 1998,1999,2000,2001 HappySize, Inc. All rights reserved.
9 *
10 * This software is released under the GNU Lesser General Public License.
11 * (Version 2.1, February 1999)
12 * Please read the following detail of the licence (in japanese).
13 *
14 * ◆使用許諾条件◆
15 *
16 * このソフトウェアは株式会社ハッピーサイズによって開発されました。株式会社ハッ
17 * ピーサイズは、著作権法および万国著作権条約の定めにより、このソフトウェアに関
18 * するすべての権利を留保する権利を持ち、ここに行使します。株式会社ハッピーサイ
19 * ズは以下に明記した条件に従って、このソフトウェアを使用する排他的ではない権利
20 * をお客様に許諾します。何人たりとも、以下の条件に反してこのソフトウェアを使用
21 * することはできません。
22 *
23 * このソフトウェアを「GNU Lesser General Public License (Version 2.1, February
24 * 1999)」に示された条件で使用することを、全ての方に許諾します。「GNU Lesser
25 * General Public License」を満たさない使用には、株式会社ハッピーサイズから書面
26 * による許諾を得る必要があります。
27 *
28 * 「GNU Lesser General Public License」の全文は以下のウェブページから取得でき
29 * ます。「GNU Lesser General Public License」とは、これまでLibrary General
30 * Public Licenseと呼ばれていたものです。
31 * http://www.gnu.org/ --- GNUウェブサイト
32 * http://www.gnu.org/copyleft/lesser.html --- ライセンス文面
33 * このライセンスの内容がわからない方、守れない方には使用を許諾しません。
34 *
35 * しかしながら、当社とGNUプロジェクトとの特定の関係を示唆または主張するもので
36 * はありません。
37 *
38 * ◆保証内容◆
39 *
40 * このソフトウェアは、期待された動作・機能・性能を持つことを目標として設計され
41 * 開発されていますが、これを保証するものではありません。このソフトウェアは「こ
42 * のまま」の状態で提供されており、たとえばこのソフトウェアの有用性ないし特定の
43 * 目的に合致することといった、何らかの保証内容が、明示されたり暗黙に示されてい
44 * る場合であっても、その保証は無効です。このソフトウェアを使用した結果ないし使
45 * 用しなかった結果によって、直接あるいは間接に受けた身体的な傷害、財産上の損害
46 * 、データの損失あるいはその他の全ての損害については、その損害の可能性が使用者
47 * 、当社あるいは第三者によって警告されていた場合であっても、当社はその損害の賠
48 * 償および補填を行いません。この規定は他の全ての、書面上または書面に無い保証・
49 * 契約・規定に優先します。
50 *
51 * ◆著作権者の連絡先および使用条件についての問い合わせ先◆
52 *
53 * 〒102-0073
54 * 東京都千代田区九段北1-13-5日本地所第一ビル4F
55 * 株式会社ハッピーサイズ
56 * Phone: 03-3512-3655, Fax: 03-3512-3656
57 * Email: sales@happysize.co.jp
58 * Web: http://happysize.com/
59 *
60 * ◆著者◆
61 *
62 * 金本 茂 <sgk@happysize.co.jp>
63 *
64 * ◆履歴◆
65 *
66 * 1998/11/10 sgk implementation in C++
67 * 1999/4/25 sgk Cで書きなおし。
68 * 1999/4/26 sgk 入力フィルタを実装。漢字コードを推定しながらフィルタを追加。
69 * 1999/6/?? Unicodeサポート。
70 * 1999/6/22 sgk ライセンスをLGPLに変更。
71 *
72 */
73
74 /*
75 * Unicode support
76 *
77 * Portions copyright (c) 1999,2000,2001 by the PHP3 internationalization team.
78 * All rights reserved.
79 *
80 */
81
82 #include <stddef.h>
83 #include <string.h>
84
85 #include "mbfilter.h"
86 #include "mbfl_filter_output.h"
87 #include "mbfilter_8bit.h"
88 #include "mbfilter_wchar.h"
89 #include "mbstring.h"
90 #include "php_unicode.h"
91 #include "filters/mbfilter_base64.h"
92 #include "filters/mbfilter_qprint.h"
93 #include "filters/mbfilter_singlebyte.h"
94 #include "filters/mbfilter_tl_jisx0201_jisx0208.h"
95 #include "filters/mbfilter_utf8.h"
96
97 #include "eaw_table.h"
98 #include "rare_cp_bitvec.h"
99
/* Upper-case hexadecimal digits "0123456789ABCDEF" stored as ASCII codes,
 * indexed by nibble value (0..15). */
static char mbfl_hexchar_table[] = {
	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* '0' .. '7' */
	0x38, 0x39, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46  /* '8', '9', 'A' .. 'F' */
};
104
105
106
107 /*
108 * encoding filter
109 */
/* Evaluate `statement` once; when it yields a negative value, make the
 * enclosing function return -1 immediately. */
#define CK(statement) \
	do { \
		if ((statement) < 0) { \
			return (-1); \
		} \
	} while (0)
111
112
113 /*
114 * buffering converter
115 */
116 mbfl_buffer_converter *
mbfl_buffer_converter_new(const mbfl_encoding * from,const mbfl_encoding * to,size_t buf_initsz)117 mbfl_buffer_converter_new(
118 const mbfl_encoding *from,
119 const mbfl_encoding *to,
120 size_t buf_initsz)
121 {
122 mbfl_buffer_converter *convd = emalloc(sizeof(mbfl_buffer_converter));
123 convd->to = to;
124
125 /* create convert filter */
126 convd->filter1 = NULL;
127 convd->filter2 = NULL;
128 if (mbfl_convert_filter_get_vtbl(from, to) != NULL) {
129 convd->filter1 = mbfl_convert_filter_new(from, to, mbfl_memory_device_output, NULL, &convd->device);
130 } else {
131 convd->filter2 = mbfl_convert_filter_new(&mbfl_encoding_wchar, to, mbfl_memory_device_output, NULL, &convd->device);
132 if (convd->filter2 != NULL) {
133 convd->filter1 = mbfl_convert_filter_new(from,
134 &mbfl_encoding_wchar,
135 (output_function_t)convd->filter2->filter_function,
136 (flush_function_t)convd->filter2->filter_flush,
137 convd->filter2);
138 if (convd->filter1 == NULL) {
139 mbfl_convert_filter_delete(convd->filter2);
140 }
141 }
142 }
143 if (convd->filter1 == NULL) {
144 efree(convd);
145 return NULL;
146 }
147
148 mbfl_memory_device_init(&convd->device, buf_initsz, buf_initsz/4);
149
150 return convd;
151 }
152
153
154 void
mbfl_buffer_converter_delete(mbfl_buffer_converter * convd)155 mbfl_buffer_converter_delete(mbfl_buffer_converter *convd)
156 {
157 if (convd != NULL) {
158 if (convd->filter1) {
159 mbfl_convert_filter_delete(convd->filter1);
160 }
161 if (convd->filter2) {
162 mbfl_convert_filter_delete(convd->filter2);
163 }
164 mbfl_memory_device_clear(&convd->device);
165 efree((void*)convd);
166 }
167 }
168
169 int
mbfl_buffer_converter_illegal_mode(mbfl_buffer_converter * convd,int mode)170 mbfl_buffer_converter_illegal_mode(mbfl_buffer_converter *convd, int mode)
171 {
172 if (convd != NULL) {
173 if (convd->filter2 != NULL) {
174 convd->filter2->illegal_mode = mode;
175 } else if (convd->filter1 != NULL) {
176 convd->filter1->illegal_mode = mode;
177 } else {
178 return 0;
179 }
180 }
181
182 return 1;
183 }
184
185 int
mbfl_buffer_converter_illegal_substchar(mbfl_buffer_converter * convd,int substchar)186 mbfl_buffer_converter_illegal_substchar(mbfl_buffer_converter *convd, int substchar)
187 {
188 if (convd != NULL) {
189 if (convd->filter2 != NULL) {
190 convd->filter2->illegal_substchar = substchar;
191 } else if (convd->filter1 != NULL) {
192 convd->filter1->illegal_substchar = substchar;
193 } else {
194 return 0;
195 }
196 }
197
198 return 1;
199 }
200
/*
 * Push every byte of `string` through the converter's first filter.
 *
 * The memory device is grown up front to hold at least the current output
 * position plus the input length. Feeding stops early when the filter
 * function returns a negative value.
 *
 * Returns the number of input bytes handed to the filter, counting the byte
 * on which the filter reported an error. Returns 0 when there is no filter.
 */
size_t mbfl_buffer_converter_feed(mbfl_buffer_converter *convd, mbfl_string *string)
{
	ZEND_ASSERT(convd);
	ZEND_ASSERT(string);

	mbfl_memory_device_realloc(&convd->device, convd->device.pos + string->len, string->len/4);

	mbfl_convert_filter *head = convd->filter1;
	unsigned char *cursor = string->val;

	if (head != NULL) {
		for (size_t remaining = string->len; remaining > 0; remaining--) {
			if ((*head->filter_function)(*cursor++, head) < 0) {
				break;
			}
		}
	}

	return cursor - string->val;
}
226
227
228 int
mbfl_buffer_converter_flush(mbfl_buffer_converter * convd)229 mbfl_buffer_converter_flush(mbfl_buffer_converter *convd)
230 {
231 if (convd == NULL) {
232 return -1;
233 }
234
235 if (convd->filter1 != NULL) {
236 mbfl_convert_filter_flush(convd->filter1);
237 }
238
239 return 0;
240 }
241
242 mbfl_string *
mbfl_buffer_converter_result(mbfl_buffer_converter * convd,mbfl_string * result)243 mbfl_buffer_converter_result(mbfl_buffer_converter *convd, mbfl_string *result)
244 {
245 if (convd == NULL || result == NULL) {
246 return NULL;
247 }
248 result->encoding = convd->to;
249 return mbfl_memory_device_result(&convd->device, result);
250 }
251
252 mbfl_string *
mbfl_buffer_converter_feed_result(mbfl_buffer_converter * convd,mbfl_string * string,mbfl_string * result)253 mbfl_buffer_converter_feed_result(mbfl_buffer_converter *convd, mbfl_string *string,
254 mbfl_string *result)
255 {
256 if (convd == NULL || string == NULL || result == NULL) {
257 return NULL;
258 }
259 mbfl_buffer_converter_feed(convd, string);
260 if (convd->filter1 != NULL) {
261 mbfl_convert_filter_flush(convd->filter1);
262 }
263 result->encoding = convd->to;
264 return mbfl_memory_device_result(&convd->device, result);
265 }
266
/*
 * Sum of the illegal-character counters of the converter's filters.
 * Returns 0 for a NULL converter.
 */
size_t mbfl_buffer_illegalchars(mbfl_buffer_converter *convd)
{
	if (convd == NULL) {
		return 0;
	}

	size_t total = 0;
	if (convd->filter1 != NULL) {
		total += convd->filter1->num_illegalchar;
	}
	if (convd->filter2 != NULL) {
		total += convd->filter2->num_illegalchar;
	}

	return total;
}
285
286 /*
287 * encoding detector
288 */
mbfl_estimate_encoding_likelihood(int input_cp,void * void_data)289 static int mbfl_estimate_encoding_likelihood(int input_cp, void *void_data)
290 {
291 mbfl_encoding_detector_data *data = void_data;
292 unsigned int c = input_cp;
293
294 /* Receive wchars decoded from input string using candidate encoding.
295 * If the string was invalid in the candidate encoding, we assume
296 * it's the wrong one. Otherwise, give the candidate many 'demerits'
297 * for each 'rare' codepoint found, a smaller number for each ASCII
298 * punctuation character, and 1 for all other codepoints.
299 *
300 * The 'common' codepoints should cover the vast majority of
301 * codepoints we are likely to see in practice, while only covering
302 * a small minority of the entire Unicode encoding space. Why?
303 * Well, if the test string happens to be valid in an incorrect
304 * candidate encoding, the bogus codepoints which it decodes to will
305 * be more or less random. By treating the majority of codepoints as
306 * 'rare', we ensure that in almost all such cases, the bogus
307 * codepoints will include plenty of 'rares', thus giving the
308 * incorrect candidate encoding lots of demerits. See
309 * common_codepoints.txt for the actual list used.
310 *
311 * So, why give extra demerits for ASCII punctuation characters? It's
312 * because there are some text encodings, like UTF-7, HZ, and ISO-2022,
313 * which deliberately only use bytes in the ASCII range. When
314 * misinterpreted as ASCII/UTF-8, strings in these encodings will
315 * have an unusually high number of ASCII punctuation characters.
316 * So giving extra demerits for such characters will improve
317 * detection accuracy for UTF-7 and similar encodings.
318 *
319 * Finally, why 1 demerit for all other characters? That penalizes
320 * long strings, meaning we will tend to choose a candidate encoding
321 * in which the test string decodes to a smaller number of
322 * codepoints. That prevents single-byte encodings in which almost
323 * every possible input byte decodes to a 'common' codepoint from
324 * being favored too much. */
325 if (c == MBFL_BAD_INPUT) {
326 data->num_illegalchars++;
327 } else if (c > 0xFFFF) {
328 data->score += 40;
329 } else if (c >= 0x21 && c <= 0x2F) {
330 data->score += 6;
331 } else if ((rare_codepoint_bitvec[c >> 5] >> (c & 0x1F)) & 1) {
332 data->score += 30;
333 } else {
334 data->score += 1;
335 }
336 return 0;
337 }
338
mbfl_encoding_detector_new(const mbfl_encoding ** elist,int elistsz,int strict)339 mbfl_encoding_detector *mbfl_encoding_detector_new(const mbfl_encoding **elist, int elistsz, int strict)
340 {
341 if (!elistsz) {
342 return NULL;
343 }
344
345 mbfl_encoding_detector *identd = emalloc(sizeof(mbfl_encoding_detector));
346 identd->filter_list = ecalloc(elistsz, sizeof(mbfl_convert_filter*));
347 identd->filter_data = ecalloc(elistsz, sizeof(mbfl_encoding_detector_data));
348
349 int filter_list_size = 0;
350 for (int i = 0; i < elistsz; i++) {
351 mbfl_convert_filter *filter = mbfl_convert_filter_new(elist[i], &mbfl_encoding_wchar,
352 mbfl_estimate_encoding_likelihood, NULL, &identd->filter_data[filter_list_size]);
353 if (filter) {
354 identd->filter_list[filter_list_size++] = filter;
355 }
356 }
357 identd->filter_list_size = filter_list_size;
358 identd->strict = strict;
359 return identd;
360 }
361
/*
 * Release an encoding detector: every candidate filter, the filter list,
 * the per-candidate statistics, and the detector itself.
 *
 * A NULL argument is ignored. mbfl_encoding_detector_new() returns NULL for
 * an empty candidate list, so a create/delete pair must not crash on it;
 * this also matches mbfl_buffer_converter_delete().
 */
void mbfl_encoding_detector_delete(mbfl_encoding_detector *identd)
{
	if (identd == NULL) {
		return;
	}

	for (int i = 0; i < identd->filter_list_size; i++) {
		mbfl_convert_filter_delete(identd->filter_list[i]);
	}
	efree(identd->filter_list);
	efree(identd->filter_data);
	efree(identd);
}
371
/*
 * Feed `string` to every candidate filter of the detector.
 *
 * In strict mode each candidate whose encoding provides a `check` function
 * is first validated against the whole string; a failed check marks the
 * candidate as having illegal characters.
 *
 * The string is then fed byte by byte to every candidate that has not been
 * marked illegal. In non-strict mode feeding stops, and 1 is returned, as
 * soon as the number of candidates rejected during this call reaches the
 * candidate count minus one.
 *
 * Otherwise all filters are flushed after the last byte and 0 is returned.
 */
int mbfl_encoding_detector_feed(mbfl_encoding_detector *identd, mbfl_string *string)
{
	const int candidates = identd->filter_list_size;
	unsigned char *cursor = string->val;
	size_t remaining = string->len;
	int rejected = 0; /* candidates that turned illegal during this call */
	int i;

	if (identd->strict) {
		for (i = 0; i < candidates; i++) {
			mbfl_convert_filter *filter = identd->filter_list[i];
			if (filter->from->check != NULL && !(filter->from->check)(cursor, remaining)) {
				identd->filter_data[i].num_illegalchars++;
			}
		}
	}

	for (; remaining > 0; remaining--, cursor++) {
		for (i = 0; i < candidates; i++) {
			mbfl_encoding_detector_data *stats = &identd->filter_data[i];
			if (stats->num_illegalchars) {
				continue; /* already ruled out */
			}

			mbfl_convert_filter *filter = identd->filter_list[i];
			(*filter->filter_function)(*cursor, filter);
			if (stats->num_illegalchars) {
				rejected++;
			}
		}

		if (!identd->strict && (candidates - 1) <= rejected) {
			return 1;
		}
	}

	for (i = 0; i < candidates; i++) {
		mbfl_convert_filter *filter = identd->filter_list[i];
		(filter->filter_flush)(filter);
	}

	return 0;
}
413
mbfl_encoding_detector_judge(mbfl_encoding_detector * identd)414 const mbfl_encoding *mbfl_encoding_detector_judge(mbfl_encoding_detector *identd)
415 {
416 size_t best_score = SIZE_MAX; /* Low score is 'better' */
417 const mbfl_encoding *enc = NULL;
418
419 for (int i = 0; i < identd->filter_list_size; i++) {
420 mbfl_convert_filter *filter = identd->filter_list[i];
421 mbfl_encoding_detector_data *data = &identd->filter_data[i];
422 if (!data->num_illegalchars && data->score < best_score) {
423 enc = filter->from;
424 best_score = data->score;
425 }
426 }
427
428 return enc;
429 }
430
431 /*
432 * encoding converter
433 */
434 mbfl_string *
mbfl_convert_encoding(mbfl_string * string,mbfl_string * result,const mbfl_encoding * toenc)435 mbfl_convert_encoding(
436 mbfl_string *string,
437 mbfl_string *result,
438 const mbfl_encoding *toenc)
439 {
440 size_t n;
441 unsigned char *p;
442 mbfl_memory_device device;
443 mbfl_convert_filter *filter1;
444 mbfl_convert_filter *filter2;
445
446 /* initialize */
447 if (toenc == NULL || string == NULL || result == NULL) {
448 return NULL;
449 }
450
451 filter1 = NULL;
452 filter2 = NULL;
453 if (mbfl_convert_filter_get_vtbl(string->encoding, toenc) != NULL) {
454 filter1 = mbfl_convert_filter_new(string->encoding, toenc, mbfl_memory_device_output, 0, &device);
455 } else {
456 filter2 = mbfl_convert_filter_new(&mbfl_encoding_wchar, toenc, mbfl_memory_device_output, 0, &device);
457 if (filter2 != NULL) {
458 filter1 = mbfl_convert_filter_new(string->encoding, &mbfl_encoding_wchar, (int (*)(int, void*))filter2->filter_function, NULL, filter2);
459 if (filter1 == NULL) {
460 mbfl_convert_filter_delete(filter2);
461 }
462 }
463 }
464 if (filter1 == NULL) {
465 return NULL;
466 }
467
468 if (filter2 != NULL) {
469 filter2->illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
470 filter2->illegal_substchar = 0x3f; /* '?' */
471 }
472
473 mbfl_memory_device_init(&device, string->len, (string->len >> 2) + 8);
474
475 /* feed data */
476 n = string->len;
477 p = string->val;
478 if (p != NULL) {
479 while (n > 0) {
480 if ((*filter1->filter_function)(*p++, filter1) < 0) {
481 break;
482 }
483 n--;
484 }
485 }
486
487 mbfl_convert_filter_flush(filter1);
488 mbfl_convert_filter_delete(filter1);
489 if (filter2 != NULL) {
490 mbfl_convert_filter_flush(filter2);
491 mbfl_convert_filter_delete(filter2);
492 }
493
494 return mbfl_memory_device_result(&device, result);
495 }
496
497 /*
498 * identify encoding
499 */
/*
 * One-shot encoding detection: build a detector for the candidates in
 * `elist`, feed it `string`, and return the best candidate.
 * Returns NULL when `elistsz` is zero or no candidate is acceptable.
 */
const mbfl_encoding *mbfl_identify_encoding(mbfl_string *string, const mbfl_encoding **elist, int elistsz, int strict)
{
	const mbfl_encoding *detected = NULL;

	if (elistsz) {
		mbfl_encoding_detector *detector = mbfl_encoding_detector_new(elist, elistsz, strict);
		mbfl_encoding_detector_feed(detector, string);
		detected = mbfl_encoding_detector_judge(detector);
		mbfl_encoding_detector_delete(detector);
	}

	return detected;
}
511
512 /*
513 * strlen
514 */
/*
 * Output callback that counts characters: `data` points to a size_t counter
 * which is incremented once per call. The character itself is ignored.
 * Always returns 0.
 */
static int
filter_count_output(int c, void *data)
{
	size_t *counter = data;

	*counter += 1;
	return 0;
}
521
522 size_t
mbfl_strlen(const mbfl_string * string)523 mbfl_strlen(const mbfl_string *string)
524 {
525 size_t len, n, k;
526 unsigned char *p;
527 const mbfl_encoding *encoding = string->encoding;
528
529 len = 0;
530 if (encoding->flag & MBFL_ENCTYPE_SBCS) {
531 len = string->len;
532 } else if (encoding->flag & MBFL_ENCTYPE_WCS2) {
533 len = string->len/2;
534 } else if (encoding->flag & MBFL_ENCTYPE_WCS4) {
535 len = string->len/4;
536 } else if (encoding->mblen_table != NULL) {
537 const unsigned char *mbtab = encoding->mblen_table;
538 n = 0;
539 p = string->val;
540 k = string->len;
541 /* count */
542 if (p != NULL) {
543 while (n < k) {
544 unsigned m = mbtab[*p];
545 n += m;
546 p += m;
547 len++;
548 }
549 }
550 } else {
551 /* wchar filter */
552 mbfl_convert_filter *filter = mbfl_convert_filter_new(
553 string->encoding,
554 &mbfl_encoding_wchar,
555 filter_count_output, 0, &len);
556 if (filter == NULL) {
557 return (size_t) -1;
558 }
559 /* count */
560 n = string->len;
561 p = string->val;
562 if (p != NULL) {
563 while (n > 0) {
564 (*filter->filter_function)(*p++, filter);
565 n--;
566 }
567 }
568 mbfl_convert_filter_delete(filter);
569 }
570
571 return len;
572 }
573
574
575 /*
576 * strpos
577 */
578 struct collector_strpos_data {
579 mbfl_convert_filter *next_filter;
580 mbfl_wchar_device needle;
581 size_t needle_len;
582 size_t start;
583 size_t output;
584 size_t found_pos;
585 size_t needle_pos;
586 size_t matched_pos;
587 };
588
589 static int
collector_strpos(int c,void * data)590 collector_strpos(int c, void* data)
591 {
592 int *p, *h, *m;
593 ssize_t n;
594 struct collector_strpos_data *pc = (struct collector_strpos_data*)data;
595
596 if (pc->output >= pc->start) {
597 if (c == (int)pc->needle.buffer[pc->needle_pos]) {
598 if (pc->needle_pos == 0) {
599 pc->found_pos = pc->output; /* found position */
600 }
601 pc->needle_pos++; /* needle pointer */
602 if (pc->needle_pos >= pc->needle_len) {
603 pc->matched_pos = pc->found_pos; /* matched position */
604 pc->needle_pos--;
605 goto retry;
606 }
607 } else if (pc->needle_pos != 0) {
608 retry:
609 h = (int *)pc->needle.buffer;
610 h++;
611 for (;;) {
612 pc->found_pos++;
613 p = h;
614 m = (int *)pc->needle.buffer;
615 n = pc->needle_pos - 1;
616 while (n > 0 && *p == *m) {
617 n--;
618 p++;
619 m++;
620 }
621 if (n <= 0) {
622 if (*m != c) {
623 pc->needle_pos = 0;
624 }
625 break;
626 } else {
627 h++;
628 pc->needle_pos--;
629 }
630 }
631 }
632 }
633
634 pc->output++;
635 return 0;
636 }
637
mbfl_find_offset_utf8(const unsigned char * str,const unsigned char * end,ssize_t offset)638 static const unsigned char *mbfl_find_offset_utf8(
639 const unsigned char *str, const unsigned char *end, ssize_t offset) {
640 if (offset < 0) {
641 const unsigned char *pos = end;
642 while (offset < 0) {
643 if (pos <= str) {
644 return NULL;
645 }
646
647 unsigned char c = *(--pos);
648 if (c < 0x80) {
649 ++offset;
650 } else if ((c & 0xc0) != 0x80) {
651 ++offset;
652 }
653 }
654 return pos;
655 } else {
656 const unsigned char *u8_tbl = mbfl_encoding_utf8.mblen_table;
657 const unsigned char *pos = str;
658 while (offset-- > 0) {
659 if (pos >= end) {
660 return NULL;
661 }
662 pos += u8_tbl[*pos];
663 }
664 return pos;
665 }
666 }
667
/*
 * Number of UTF-8 characters that begin in [start, pos): every byte that is
 * not a continuation byte (10xxxxxx) counts as one character.
 * Returns 0 when pos is not beyond start.
 */
static size_t mbfl_pointer_to_offset_utf8(const unsigned char *start, const unsigned char *pos) {
	size_t count = 0;

	for (const unsigned char *cur = start; cur < pos; cur++) {
		if ((*cur & 0xc0) != 0x80) {
			count++;
		}
	}

	return count;
}
680
681 size_t
mbfl_strpos(mbfl_string * haystack,mbfl_string * needle,ssize_t offset,int reverse)682 mbfl_strpos(
683 mbfl_string *haystack,
684 mbfl_string *needle,
685 ssize_t offset,
686 int reverse)
687 {
688 size_t result;
689 mbfl_string _haystack_u8, _needle_u8;
690 const mbfl_string *haystack_u8, *needle_u8 = NULL;
691 const unsigned char *offset_pointer;
692
693 if (haystack->encoding->no_encoding != mbfl_no_encoding_utf8) {
694 mbfl_string_init(&_haystack_u8);
695 haystack_u8 = mbfl_convert_encoding(haystack, &_haystack_u8, &mbfl_encoding_utf8);
696 if (haystack_u8 == NULL) {
697 result = MBFL_ERROR_ENCODING;
698 goto out;
699 }
700 } else {
701 haystack_u8 = haystack;
702 }
703
704 if (needle->encoding->no_encoding != mbfl_no_encoding_utf8) {
705 mbfl_string_init(&_needle_u8);
706 needle_u8 = mbfl_convert_encoding(needle, &_needle_u8, &mbfl_encoding_utf8);
707 if (needle_u8 == NULL) {
708 result = MBFL_ERROR_ENCODING;
709 goto out;
710 }
711 } else {
712 needle_u8 = needle;
713 }
714
715 offset_pointer = mbfl_find_offset_utf8(
716 haystack_u8->val, haystack_u8->val + haystack_u8->len, offset);
717 if (!offset_pointer) {
718 result = MBFL_ERROR_OFFSET;
719 goto out;
720 }
721
722 result = MBFL_ERROR_NOT_FOUND;
723 if (haystack_u8->len < needle_u8->len) {
724 goto out;
725 }
726
727 const char *found_pos;
728 if (!reverse) {
729 found_pos = zend_memnstr(
730 (const char *) offset_pointer,
731 (const char *) needle_u8->val, needle_u8->len,
732 (const char *) haystack_u8->val + haystack_u8->len);
733 } else {
734 if (offset >= 0) {
735 found_pos = zend_memnrstr(
736 (const char *) offset_pointer,
737 (const char *) needle_u8->val, needle_u8->len,
738 (const char *) haystack_u8->val + haystack_u8->len);
739 } else {
740 size_t needle_len = mbfl_strlen(needle_u8);
741 offset_pointer = mbfl_find_offset_utf8(
742 offset_pointer, haystack_u8->val + haystack_u8->len, needle_len);
743 if (!offset_pointer) {
744 offset_pointer = haystack_u8->val + haystack_u8->len;
745 }
746
747 found_pos = zend_memnrstr(
748 (const char *) haystack_u8->val,
749 (const char *) needle_u8->val, needle_u8->len,
750 (const char *) offset_pointer);
751 }
752 }
753
754 if (found_pos) {
755 result = mbfl_pointer_to_offset_utf8(haystack_u8->val, (const unsigned char *) found_pos);
756 }
757
758 out:
759 if (haystack_u8 == &_haystack_u8) {
760 mbfl_string_clear(&_haystack_u8);
761 }
762 if (needle_u8 == &_needle_u8) {
763 mbfl_string_clear(&_needle_u8);
764 }
765 return result;
766 }
767
768 /*
769 * substr_count
770 */
771
772 size_t
mbfl_substr_count(mbfl_string * haystack,mbfl_string * needle)773 mbfl_substr_count(
774 mbfl_string *haystack,
775 mbfl_string *needle
776 )
777 {
778 size_t n, result = 0;
779 unsigned char *p;
780 mbfl_convert_filter *filter;
781 struct collector_strpos_data pc;
782
783 /* needle is converted into wchar */
784 mbfl_wchar_device_init(&pc.needle);
785 filter = mbfl_convert_filter_new(
786 needle->encoding,
787 &mbfl_encoding_wchar,
788 mbfl_wchar_device_output, 0, &pc.needle);
789 if (filter == NULL) {
790 return MBFL_ERROR_ENCODING;
791 }
792 mbfl_convert_filter_feed_string(filter, needle->val, needle->len);
793 mbfl_convert_filter_flush(filter);
794 mbfl_convert_filter_delete(filter);
795 pc.needle_len = pc.needle.pos;
796 if (pc.needle.buffer == NULL) {
797 return MBFL_ERROR_ENCODING;
798 }
799 if (pc.needle_len == 0) {
800 mbfl_wchar_device_clear(&pc.needle);
801 return MBFL_ERROR_EMPTY;
802 }
803 /* initialize filter and collector data */
804 filter = mbfl_convert_filter_new(
805 haystack->encoding,
806 &mbfl_encoding_wchar,
807 collector_strpos, 0, &pc);
808 if (filter == NULL) {
809 mbfl_wchar_device_clear(&pc.needle);
810 return MBFL_ERROR_ENCODING;
811 }
812 pc.start = 0;
813 pc.output = 0;
814 pc.needle_pos = 0;
815 pc.found_pos = 0;
816 pc.matched_pos = MBFL_ERROR_NOT_FOUND;
817
818 /* feed data */
819 p = haystack->val;
820 n = haystack->len;
821 if (p != NULL) {
822 while (n > 0) {
823 if ((*filter->filter_function)(*p++, filter) < 0) {
824 pc.matched_pos = MBFL_ERROR_ENCODING;
825 break;
826 }
827 if (pc.matched_pos != MBFL_ERROR_NOT_FOUND) {
828 ++result;
829 pc.matched_pos = MBFL_ERROR_NOT_FOUND;
830 pc.needle_pos = 0;
831 }
832 n--;
833 }
834 }
835 mbfl_convert_filter_flush(filter);
836 mbfl_convert_filter_delete(filter);
837 mbfl_wchar_device_clear(&pc.needle);
838
839 return result;
840 }
841
842 /*
843 * substr
844 */
845 struct collector_substr_data {
846 mbfl_convert_filter *next_filter;
847 size_t start;
848 size_t stop;
849 size_t output;
850 };
851
852 static int
collector_substr(int c,void * data)853 collector_substr(int c, void* data)
854 {
855 struct collector_substr_data *pc = (struct collector_substr_data*)data;
856
857 if (pc->output >= pc->stop) {
858 return -1;
859 }
860
861 if (pc->output >= pc->start) {
862 (*pc->next_filter->filter_function)(c, pc->next_filter);
863 }
864
865 pc->output++;
866
867 return 0;
868 }
869
870 mbfl_string *
mbfl_substr(mbfl_string * string,mbfl_string * result,size_t from,size_t length)871 mbfl_substr(
872 mbfl_string *string,
873 mbfl_string *result,
874 size_t from,
875 size_t length)
876 {
877 const mbfl_encoding *encoding = string->encoding;
878 size_t n, k, len, start, end;
879 unsigned m;
880 unsigned char *p, *w;
881
882 mbfl_string_init(result);
883 result->encoding = string->encoding;
884
885 if ((encoding->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4)) ||
886 encoding->mblen_table != NULL) {
887 len = string->len;
888 if (encoding->flag & MBFL_ENCTYPE_SBCS) {
889 start = from;
890 } else if (encoding->flag & MBFL_ENCTYPE_WCS2) {
891 start = from*2;
892 } else if (encoding->flag & MBFL_ENCTYPE_WCS4) {
893 start = from*4;
894 } else {
895 const unsigned char *mbtab = encoding->mblen_table;
896 start = 0;
897 n = 0;
898 k = 0;
899 p = string->val;
900 /* search start position */
901 while (k <= from) {
902 start = n;
903 if (n >= len) {
904 break;
905 }
906 m = mbtab[*p];
907 n += m;
908 p += m;
909 k++;
910 }
911 }
912
913 if (length == MBFL_SUBSTR_UNTIL_END) {
914 end = len;
915 } else if (encoding->flag & MBFL_ENCTYPE_SBCS) {
916 end = start + length;
917 } else if (encoding->flag & MBFL_ENCTYPE_WCS2) {
918 end = start + length*2;
919 } else if (encoding->flag & MBFL_ENCTYPE_WCS4) {
920 end = start + length*4;
921 } else {
922 const unsigned char *mbtab = encoding->mblen_table;
923 end = start;
924 n = start;
925 k = 0;
926 p = string->val + start;
927 /* detect end position */
928 while (k <= length) {
929 end = n;
930 if (n >= len) {
931 break;
932 }
933 m = mbtab[*p];
934 n += m;
935 p += m;
936 k++;
937 }
938 }
939
940 if (start > len) {
941 start = len;
942 }
943 if (end > len) {
944 end = len;
945 }
946 if (start > end) {
947 start = end;
948 }
949
950 /* allocate memory and copy */
951 n = end - start;
952 result->len = 0;
953 result->val = w = (unsigned char*)emalloc(n + 1);
954 result->len = n;
955 memcpy(w, string->val + start, n);
956 w[n] = '\0';
957 } else {
958 mbfl_memory_device device;
959 struct collector_substr_data pc;
960 mbfl_convert_filter *decoder;
961 mbfl_convert_filter *encoder;
962
963 if (length == MBFL_SUBSTR_UNTIL_END) {
964 length = mbfl_strlen(string) - from;
965 }
966
967 mbfl_memory_device_init(&device, length + 1, 0);
968 mbfl_string_init(result);
969 result->encoding = string->encoding;
970 /* output code filter */
971 decoder = mbfl_convert_filter_new(
972 &mbfl_encoding_wchar,
973 string->encoding,
974 mbfl_memory_device_output, 0, &device);
975 /* wchar filter */
976 encoder = mbfl_convert_filter_new(
977 string->encoding,
978 &mbfl_encoding_wchar,
979 collector_substr, 0, &pc);
980 if (decoder == NULL || encoder == NULL) {
981 mbfl_convert_filter_delete(encoder);
982 mbfl_convert_filter_delete(decoder);
983 return NULL;
984 }
985 pc.next_filter = decoder;
986 pc.start = from;
987 pc.stop = from + length;
988 pc.output = 0;
989
990 /* feed data */
991 p = string->val;
992 n = string->len;
993 if (p != NULL) {
994 while (n > 0) {
995 if ((*encoder->filter_function)(*p++, encoder) < 0) {
996 break;
997 }
998 n--;
999 }
1000 }
1001
1002 mbfl_convert_filter_flush(encoder);
1003 mbfl_convert_filter_flush(decoder);
1004 result = mbfl_memory_device_result(&device, result);
1005 mbfl_convert_filter_delete(encoder);
1006 mbfl_convert_filter_delete(decoder);
1007 }
1008
1009 return result;
1010 }
1011
1012 /*
1013 * strcut
1014 */
1015 mbfl_string *
mbfl_strcut(mbfl_string * string,mbfl_string * result,size_t from,size_t length)1016 mbfl_strcut(
1017 mbfl_string *string,
1018 mbfl_string *result,
1019 size_t from,
1020 size_t length)
1021 {
1022 const mbfl_encoding *encoding = string->encoding;
1023 mbfl_memory_device device;
1024
1025 if (from >= string->len) {
1026 from = string->len;
1027 }
1028
1029 mbfl_string_init(result);
1030 result->encoding = string->encoding;
1031
1032 if ((encoding->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4)) || encoding->mblen_table != NULL) {
1033 const unsigned char *start = NULL;
1034 const unsigned char *end = NULL;
1035 unsigned char *w;
1036 size_t sz;
1037
1038 if (encoding->flag & MBFL_ENCTYPE_WCS2) {
1039 from &= -2;
1040
1041 if (length >= string->len - from) {
1042 length = string->len - from;
1043 }
1044
1045 start = string->val + from;
1046 end = start + (length & -2);
1047 } else if (encoding->flag & MBFL_ENCTYPE_WCS4) {
1048 from &= -4;
1049
1050 if (length >= string->len - from) {
1051 length = string->len - from;
1052 }
1053
1054 start = string->val + from;
1055 end = start + (length & -4);
1056 } else if ((encoding->flag & MBFL_ENCTYPE_SBCS)) {
1057 if (length >= string->len - from) {
1058 length = string->len - from;
1059 }
1060
1061 start = string->val + from;
1062 end = start + length;
1063 } else if (encoding->mblen_table != NULL) {
1064 const unsigned char *mbtab = encoding->mblen_table;
1065 const unsigned char *p, *q;
1066 int m;
1067
1068 /* search start position */
1069 for (m = 0, p = string->val, q = p + from;
1070 p < q; p += (m = mbtab[*p]));
1071
1072 if (p > q) {
1073 p -= m;
1074 }
1075
1076 start = p;
1077
1078 /* search end position */
1079 if (length >= string->len - (start - string->val)) {
1080 end = string->val + string->len;
1081 } else {
1082 for (q = p + length; p < q; p += (m = mbtab[*p]));
1083
1084 if (p > q) {
1085 p -= m;
1086 }
1087 end = p;
1088 }
1089 } else {
1090 /* never reached */
1091 return NULL;
1092 }
1093
1094 /* allocate memory and copy string */
1095 sz = end - start;
1096 w = ecalloc(sz + 8, sizeof(unsigned char));
1097
1098 memcpy(w, start, sz);
1099 w[sz] = '\0';
1100 w[sz + 1] = '\0';
1101 w[sz + 2] = '\0';
1102 w[sz + 3] = '\0';
1103
1104 result->val = w;
1105 result->len = sz;
1106 } else {
1107 mbfl_convert_filter *encoder = NULL;
1108 mbfl_convert_filter *decoder = NULL;
1109 const unsigned char *p, *q, *r;
1110 struct {
1111 mbfl_convert_filter encoder;
1112 mbfl_convert_filter decoder;
1113 const unsigned char *p;
1114 size_t pos;
1115 } bk, _bk;
1116
1117 /* output code filter */
1118 if (!(decoder = mbfl_convert_filter_new(
1119 &mbfl_encoding_wchar,
1120 string->encoding,
1121 mbfl_memory_device_output, 0, &device))) {
1122 return NULL;
1123 }
1124
1125 /* wchar filter */
1126 if (!(encoder = mbfl_convert_filter_new(
1127 string->encoding,
1128 &mbfl_encoding_wchar,
1129 mbfl_filter_output_null,
1130 NULL, NULL))) {
1131 mbfl_convert_filter_delete(decoder);
1132 return NULL;
1133 }
1134
1135 mbfl_memory_device_init(&device, length + 8, 0);
1136
1137 p = string->val;
1138
1139 /* search start position */
1140 for (q = string->val + from; p < q; p++) {
1141 (*encoder->filter_function)(*p, encoder);
1142 }
1143
1144 /* switch the drain direction */
1145 encoder->output_function = (output_function_t)decoder->filter_function;
1146 encoder->flush_function = (flush_function_t)decoder->filter_flush;
1147 encoder->data = decoder;
1148
1149 q = string->val + string->len;
1150
1151 /* save the encoder, decoder state and the pointer */
1152 mbfl_convert_filter_copy(decoder, &_bk.decoder);
1153 mbfl_convert_filter_copy(encoder, &_bk.encoder);
1154 _bk.p = p;
1155 _bk.pos = device.pos;
1156
1157 if (length > q - p) {
1158 length = q - p;
1159 }
1160
1161 if (length >= 20) {
1162 /* output a little shorter than "length" */
1163 /* XXX: the constant "20" was determined purely on the heuristics. */
1164 for (r = p + length - 20; p < r; p++) {
1165 (*encoder->filter_function)(*p, encoder);
1166 }
1167
1168 /* if the offset of the resulting string exceeds the length,
1169 * then restore the state */
1170 if (device.pos > length) {
1171 p = _bk.p;
1172 device.pos = _bk.pos;
1173 if (decoder->filter_dtor)
1174 decoder->filter_dtor(decoder);
1175 if (encoder->filter_dtor)
1176 encoder->filter_dtor(encoder);
1177 mbfl_convert_filter_copy(&_bk.decoder, decoder);
1178 mbfl_convert_filter_copy(&_bk.encoder, encoder);
1179 bk = _bk;
1180 } else {
1181 /* save the encoder, decoder state and the pointer */
1182 mbfl_convert_filter_copy(decoder, &bk.decoder);
1183 mbfl_convert_filter_copy(encoder, &bk.encoder);
1184 bk.p = p;
1185 bk.pos = device.pos;
1186
1187 /* flush the stream */
1188 (*encoder->filter_flush)(encoder);
1189
1190 /* if the offset of the resulting string exceeds the length,
1191 * then restore the state */
1192 if (device.pos > length) {
1193 if (bk.decoder.filter_dtor)
1194 bk.decoder.filter_dtor(&bk.decoder);
1195 if (bk.encoder.filter_dtor)
1196 bk.encoder.filter_dtor(&bk.encoder);
1197
1198 p = _bk.p;
1199 device.pos = _bk.pos;
1200 if (decoder->filter_dtor)
1201 decoder->filter_dtor(decoder);
1202 if (encoder->filter_dtor)
1203 encoder->filter_dtor(encoder);
1204 mbfl_convert_filter_copy(&_bk.decoder, decoder);
1205 mbfl_convert_filter_copy(&_bk.encoder, encoder);
1206 bk = _bk;
1207 } else {
1208 if (_bk.decoder.filter_dtor)
1209 _bk.decoder.filter_dtor(&_bk.decoder);
1210 if (_bk.encoder.filter_dtor)
1211 _bk.encoder.filter_dtor(&_bk.encoder);
1212
1213 p = bk.p;
1214 device.pos = bk.pos;
1215 if (decoder->filter_dtor)
1216 decoder->filter_dtor(decoder);
1217 if (encoder->filter_dtor)
1218 encoder->filter_dtor(encoder);
1219 mbfl_convert_filter_copy(&bk.decoder, decoder);
1220 mbfl_convert_filter_copy(&bk.encoder, encoder);
1221 }
1222 }
1223 } else {
1224 bk = _bk;
1225 }
1226
1227 /* detect end position */
1228 while (p < q) {
1229 (*encoder->filter_function)(*p, encoder);
1230
1231 if (device.pos > length) {
1232 /* restore filter */
1233 p = bk.p;
1234 device.pos = bk.pos;
1235 if (decoder->filter_dtor)
1236 decoder->filter_dtor(decoder);
1237 if (encoder->filter_dtor)
1238 encoder->filter_dtor(encoder);
1239 mbfl_convert_filter_copy(&bk.decoder, decoder);
1240 mbfl_convert_filter_copy(&bk.encoder, encoder);
1241 break;
1242 }
1243
1244 p++;
1245
1246 /* backup current state */
1247 mbfl_convert_filter_copy(decoder, &_bk.decoder);
1248 mbfl_convert_filter_copy(encoder, &_bk.encoder);
1249 _bk.pos = device.pos;
1250 _bk.p = p;
1251
1252 (*encoder->filter_flush)(encoder);
1253
1254 if (device.pos > length) {
1255 if (_bk.decoder.filter_dtor)
1256 _bk.decoder.filter_dtor(&_bk.decoder);
1257 if (_bk.encoder.filter_dtor)
1258 _bk.encoder.filter_dtor(&_bk.encoder);
1259
1260 /* restore filter */
1261 p = bk.p;
1262 device.pos = bk.pos;
1263 if (decoder->filter_dtor)
1264 decoder->filter_dtor(decoder);
1265 if (encoder->filter_dtor)
1266 encoder->filter_dtor(encoder);
1267 mbfl_convert_filter_copy(&bk.decoder, decoder);
1268 mbfl_convert_filter_copy(&bk.encoder, encoder);
1269 break;
1270 }
1271
1272 if (bk.decoder.filter_dtor)
1273 bk.decoder.filter_dtor(&bk.decoder);
1274 if (bk.encoder.filter_dtor)
1275 bk.encoder.filter_dtor(&bk.encoder);
1276
1277 p = _bk.p;
1278 device.pos = _bk.pos;
1279 if (decoder->filter_dtor)
1280 decoder->filter_dtor(decoder);
1281 if (encoder->filter_dtor)
1282 encoder->filter_dtor(encoder);
1283 mbfl_convert_filter_copy(&_bk.decoder, decoder);
1284 mbfl_convert_filter_copy(&_bk.encoder, encoder);
1285
1286 bk = _bk;
1287 }
1288
1289 decoder->illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
1290 (*encoder->filter_flush)(encoder);
1291
1292 if (bk.decoder.filter_dtor)
1293 bk.decoder.filter_dtor(&bk.decoder);
1294 if (bk.encoder.filter_dtor)
1295 bk.encoder.filter_dtor(&bk.encoder);
1296
1297 result = mbfl_memory_device_result(&device, result);
1298
1299 mbfl_convert_filter_delete(encoder);
1300 mbfl_convert_filter_delete(decoder);
1301 }
1302
1303 return result;
1304 }
1305
1306
1307 /*
1308 * strwidth
1309 */
is_fullwidth(int c)1310 static size_t is_fullwidth(int c)
1311 {
1312 int i;
1313
1314 if (c < mbfl_eaw_table[0].begin) {
1315 return 0;
1316 }
1317
1318 for (i = 0; i < sizeof(mbfl_eaw_table) / sizeof(mbfl_eaw_table[0]); i++) {
1319 if (mbfl_eaw_table[i].begin <= c && c <= mbfl_eaw_table[i].end) {
1320 return 1;
1321 }
1322 }
1323
1324 return 0;
1325 }
1326
/*
 * Filter output callback used for width counting.
 *
 * data points at a size_t accumulator; it grows by 2 when is_fullwidth()
 * accepts c and by 1 otherwise.  Always returns 0.
 */
static int
filter_count_width(int c, void* data)
{
	size_t *total = (size_t *)data;
	size_t cells = 1;

	if (is_fullwidth(c)) {
		cells = 2;
	}
	*total += cells;

	return 0;
}
1333
1334 size_t
mbfl_strwidth(mbfl_string * string)1335 mbfl_strwidth(mbfl_string *string)
1336 {
1337 size_t len, n;
1338 unsigned char *p;
1339 mbfl_convert_filter *filter;
1340
1341 len = 0;
1342 if (string->len > 0 && string->val != NULL) {
1343 /* wchar filter */
1344 filter = mbfl_convert_filter_new(
1345 string->encoding,
1346 &mbfl_encoding_wchar,
1347 filter_count_width, 0, &len);
1348 if (filter == NULL) {
1349 mbfl_convert_filter_delete(filter);
1350 return -1;
1351 }
1352
1353 /* feed data */
1354 p = string->val;
1355 n = string->len;
1356 while (n > 0) {
1357 (*filter->filter_function)(*p++, filter);
1358 n--;
1359 }
1360
1361 mbfl_convert_filter_flush(filter);
1362 mbfl_convert_filter_delete(filter);
1363 }
1364
1365 return len;
1366 }
1367
1368
1369 /*
1370 * strimwidth
1371 */
/* Working state shared between mbfl_strimwidth() and collector_strimwidth(). */
struct collector_strimwidth_data {
	mbfl_convert_filter *decoder;        /* wchar -> string encoding, writes into device */
	mbfl_convert_filter *decoder_backup; /* receives a copy of decoder when the width limit is first exceeded */
	mbfl_memory_device device;           /* output buffer */
	size_t from;                         /* index of the first character that is emitted */
	size_t width;                        /* width limit compared against outwidth */
	size_t outwidth;                     /* width accumulated for characters at or after from */
	size_t outchar;                      /* number of characters seen so far */
	size_t endpos;                       /* device.pos recorded when the limit was first exceeded */
	int status;                          /* 0: within limit; incremented per over-limit character; 10: pass-through */
};
1383
1384 static int
collector_strimwidth(int c,void * data)1385 collector_strimwidth(int c, void* data)
1386 {
1387 struct collector_strimwidth_data *pc = (struct collector_strimwidth_data*)data;
1388
1389 switch (pc->status) {
1390 case 10:
1391 (*pc->decoder->filter_function)(c, pc->decoder);
1392 break;
1393 default:
1394 if (pc->outchar >= pc->from) {
1395 pc->outwidth += (is_fullwidth(c) ? 2: 1);
1396
1397 if (pc->outwidth > pc->width) {
1398 if (pc->status == 0) {
1399 pc->endpos = pc->device.pos;
1400 mbfl_convert_filter_copy(pc->decoder, pc->decoder_backup);
1401 }
1402 pc->status++;
1403 (*pc->decoder->filter_function)(c, pc->decoder);
1404 pc->outchar++;
1405 return -1;
1406 } else {
1407 (*pc->decoder->filter_function)(c, pc->decoder);
1408 }
1409 }
1410 pc->outchar++;
1411 break;
1412 }
1413
1414 return 0;
1415 }
1416
1417 mbfl_string *
mbfl_strimwidth(mbfl_string * string,mbfl_string * marker,mbfl_string * result,size_t from,size_t width)1418 mbfl_strimwidth(
1419 mbfl_string *string,
1420 mbfl_string *marker,
1421 mbfl_string *result,
1422 size_t from,
1423 size_t width)
1424 {
1425 struct collector_strimwidth_data pc;
1426 mbfl_convert_filter *encoder;
1427 size_t n, mkwidth;
1428 unsigned char *p;
1429
1430 if (string == NULL || result == NULL) {
1431 return NULL;
1432 }
1433 mbfl_string_init(result);
1434 result->encoding = string->encoding;
1435 mbfl_memory_device_init(&pc.device, MIN(string->len, width), 0);
1436
1437 /* output code filter */
1438 pc.decoder = mbfl_convert_filter_new(
1439 &mbfl_encoding_wchar,
1440 string->encoding,
1441 mbfl_memory_device_output, 0, &pc.device);
1442 pc.decoder_backup = mbfl_convert_filter_new(
1443 &mbfl_encoding_wchar,
1444 string->encoding,
1445 mbfl_memory_device_output, 0, &pc.device);
1446 /* wchar filter */
1447 encoder = mbfl_convert_filter_new(
1448 string->encoding,
1449 &mbfl_encoding_wchar,
1450 collector_strimwidth, 0, &pc);
1451 if (pc.decoder == NULL || pc.decoder_backup == NULL || encoder == NULL) {
1452 mbfl_convert_filter_delete(encoder);
1453 mbfl_convert_filter_delete(pc.decoder);
1454 mbfl_convert_filter_delete(pc.decoder_backup);
1455 return NULL;
1456 }
1457 mkwidth = 0;
1458 if (marker) {
1459 mkwidth = mbfl_strwidth(marker);
1460 }
1461 pc.from = from;
1462 pc.width = width - mkwidth;
1463 pc.outwidth = 0;
1464 pc.outchar = 0;
1465 pc.status = 0;
1466 pc.endpos = 0;
1467
1468 /* feed data */
1469 p = string->val;
1470 n = string->len;
1471 if (p != NULL) {
1472 while (n > 0) {
1473 n--;
1474 if ((*encoder->filter_function)(*p++, encoder) < 0) {
1475 break;
1476 }
1477 }
1478 mbfl_convert_filter_flush(encoder);
1479 if (pc.status != 0 && mkwidth > 0) {
1480 pc.width += mkwidth;
1481 if (n > 0) {
1482 while (n > 0) {
1483 if ((*encoder->filter_function)(*p++, encoder) < 0) {
1484 break;
1485 }
1486 n--;
1487 }
1488 mbfl_convert_filter_flush(encoder);
1489 } else if (pc.outwidth > pc.width) {
1490 pc.status++;
1491 }
1492 if (pc.status != 1) {
1493 pc.status = 10;
1494 pc.device.pos = pc.endpos;
1495 mbfl_convert_filter_copy(pc.decoder_backup, pc.decoder);
1496 mbfl_convert_filter_reset(encoder, marker->encoding, &mbfl_encoding_wchar);
1497 p = marker->val;
1498 n = marker->len;
1499 while (n > 0) {
1500 if ((*encoder->filter_function)(*p++, encoder) < 0) {
1501 break;
1502 }
1503 n--;
1504 }
1505 mbfl_convert_filter_flush(encoder);
1506 }
1507 } else if (pc.status != 0) {
1508 pc.device.pos = pc.endpos;
1509 mbfl_convert_filter_copy(pc.decoder_backup, pc.decoder);
1510 }
1511 mbfl_convert_filter_flush(pc.decoder);
1512 }
1513 result = mbfl_memory_device_result(&pc.device, result);
1514 mbfl_convert_filter_delete(encoder);
1515 mbfl_convert_filter_delete(pc.decoder);
1516 mbfl_convert_filter_delete(pc.decoder_backup);
1517
1518 return result;
1519 }
1520
1521 mbfl_string *
mbfl_ja_jp_hantozen(mbfl_string * string,mbfl_string * result,int mode)1522 mbfl_ja_jp_hantozen(
1523 mbfl_string *string,
1524 mbfl_string *result,
1525 int mode)
1526 {
1527 size_t n;
1528 unsigned char *p;
1529 mbfl_memory_device device;
1530 mbfl_convert_filter *decoder = NULL;
1531 mbfl_convert_filter *encoder = NULL;
1532 mbfl_convert_filter *tl_filter = NULL;
1533 mbfl_convert_filter *next_filter = NULL;
1534
1535 mbfl_memory_device_init(&device, string->len, 0);
1536 mbfl_string_init(result);
1537
1538 result->encoding = string->encoding;
1539
1540 decoder = mbfl_convert_filter_new(
1541 &mbfl_encoding_wchar,
1542 string->encoding,
1543 mbfl_memory_device_output, 0, &device);
1544 if (decoder == NULL) {
1545 goto out;
1546 }
1547 next_filter = decoder;
1548
1549 tl_filter = mbfl_convert_filter_new2(
1550 &vtbl_tl_jisx0201_jisx0208,
1551 (int(*)(int, void*))next_filter->filter_function,
1552 (flush_function_t)next_filter->filter_flush,
1553 next_filter);
1554 if (tl_filter == NULL) {
1555 goto out;
1556 }
1557
1558 tl_filter->opaque = (void*)((intptr_t)mode);
1559 next_filter = tl_filter;
1560
1561 encoder = mbfl_convert_filter_new(
1562 string->encoding,
1563 &mbfl_encoding_wchar,
1564 (int(*)(int, void*))next_filter->filter_function,
1565 (flush_function_t)next_filter->filter_flush,
1566 next_filter);
1567 if (encoder == NULL) {
1568 goto out;
1569 }
1570
1571 /* feed data */
1572 p = string->val;
1573 n = string->len;
1574 if (p != NULL) {
1575 while (n > 0) {
1576 if ((*encoder->filter_function)(*p++, encoder) < 0) {
1577 break;
1578 }
1579 n--;
1580 }
1581 }
1582
1583 mbfl_convert_filter_flush(encoder);
1584 result = mbfl_memory_device_result(&device, result);
1585 out:
1586 if (tl_filter != NULL) {
1587 mbfl_convert_filter_delete(tl_filter);
1588 }
1589
1590 if (decoder != NULL) {
1591 mbfl_convert_filter_delete(decoder);
1592 }
1593
1594 if (encoder != NULL) {
1595 mbfl_convert_filter_delete(encoder);
1596 }
1597
1598 return result;
1599 }
1600
1601
1602 /*
1603 * MIME header encode
1604 */
/* State of one MIME header encoding run (see mime_header_encoder_new()). */
struct mime_header_encoder_data {
	mbfl_convert_filter *conv1_filter;        /* input encoding -> wchar, outputs to mime_header_encoder_collector */
	mbfl_convert_filter *block_filter;        /* wchar -> wchar, outputs to mime_header_encoder_block_collector */
	mbfl_convert_filter *conv2_filter;        /* wchar -> output charset, piped into encod_filter */
	mbfl_convert_filter *conv2_filter_backup; /* scratch copy used to save/restore conv2_filter */
	mbfl_convert_filter *encod_filter;        /* output charset -> transfer encoding, writes into outdev */
	mbfl_convert_filter *encod_filter_backup; /* scratch copy used to save/restore encod_filter */
	mbfl_memory_device outdev;                /* encoded header being built */
	mbfl_memory_device tmpdev;                /* buffered plain characters not yet committed to outdev */
	int status1;                              /* mime_header_encoder_collector state: 0, 1, or 11 (inside an encoded word) */
	int status2;                              /* block collector state: 1 once the encoded word has been opened */
	size_t prevpos;                           /* outdev.pos saved before a trial encoding */
	size_t linehead;                          /* outdev.pos at which the current output line starts */
	size_t firstindent;                       /* extra columns added to line-length checks; zeroed after a fold */
	int encnamelen;                           /* length of encname */
	int lwsplen;                              /* length of lwsp */
	char encname[128];                        /* encoded-word prefix, e.g. "=?ISO-2022-JP?B?" */
	char lwsp[16];                            /* fold sequence: line break followed by a space */
};
1624
1625 static int
mime_header_encoder_block_collector(int c,void * data)1626 mime_header_encoder_block_collector(int c, void *data)
1627 {
1628 size_t n;
1629 struct mime_header_encoder_data *pe = (struct mime_header_encoder_data *)data;
1630
1631 switch (pe->status2) {
1632 case 1: /* encoded word */
1633 pe->prevpos = pe->outdev.pos;
1634 mbfl_convert_filter_copy(pe->conv2_filter, pe->conv2_filter_backup);
1635 mbfl_convert_filter_copy(pe->encod_filter, pe->encod_filter_backup);
1636 (*pe->conv2_filter->filter_function)(c, pe->conv2_filter);
1637 (*pe->conv2_filter->filter_flush)(pe->conv2_filter);
1638 (*pe->encod_filter->filter_flush)(pe->encod_filter);
1639 n = pe->outdev.pos - pe->linehead + pe->firstindent;
1640 pe->outdev.pos = pe->prevpos;
1641 mbfl_convert_filter_copy(pe->conv2_filter_backup, pe->conv2_filter);
1642 mbfl_convert_filter_copy(pe->encod_filter_backup, pe->encod_filter);
1643 if (n >= 74) {
1644 (*pe->conv2_filter->filter_flush)(pe->conv2_filter);
1645 (*pe->encod_filter->filter_flush)(pe->encod_filter);
1646 mbfl_memory_device_strncat(&pe->outdev, "\x3f\x3d", 2); /* ?= */
1647 mbfl_memory_device_strncat(&pe->outdev, pe->lwsp, pe->lwsplen);
1648 pe->linehead = pe->outdev.pos;
1649 pe->firstindent = 0;
1650 mbfl_memory_device_strncat(&pe->outdev, pe->encname, pe->encnamelen);
1651 c = (*pe->conv2_filter->filter_function)(c, pe->conv2_filter);
1652 } else {
1653 c = (*pe->conv2_filter->filter_function)(c, pe->conv2_filter);
1654 }
1655 break;
1656
1657 default:
1658 mbfl_memory_device_strncat(&pe->outdev, pe->encname, pe->encnamelen);
1659 c = (*pe->conv2_filter->filter_function)(c, pe->conv2_filter);
1660 pe->status2 = 1;
1661 break;
1662 }
1663
1664 return 0;
1665 }
1666
1667 static int
mime_header_encoder_collector(int c,void * data)1668 mime_header_encoder_collector(int c, void *data)
1669 {
1670 static int qp_table[256] = {
1671 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x00 */
1672 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x00 */
1673 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x20 */
1674 0, 0, 0, 0, 0, 0, 0 ,0, 0, 0, 0, 0, 0, 1, 0, 1, /* 0x10 */
1675 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x40 */
1676 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, /* 0x50 */
1677 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x60 */
1678 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, /* 0x70 */
1679 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x80 */
1680 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x90 */
1681 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xA0 */
1682 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xB0 */
1683 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xC0 */
1684 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xD0 */
1685 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xE0 */
1686 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 /* 0xF0 */
1687 };
1688
1689 size_t n;
1690 struct mime_header_encoder_data *pe = (struct mime_header_encoder_data *)data;
1691
1692 switch (pe->status1) {
1693 case 11: /* encoded word */
1694 (*pe->block_filter->filter_function)(c, pe->block_filter);
1695 break;
1696
1697 default: /* ASCII */
1698 if (c <= 0x00ff && !qp_table[(c & 0xff)]) { /* ordinary characters */
1699 mbfl_memory_device_output(c, &pe->tmpdev);
1700 pe->status1 = 1;
1701 } else if (pe->status1 == 0 && c == 0x20) { /* repeat SPACE */
1702 mbfl_memory_device_output(c, &pe->tmpdev);
1703 } else {
1704 if (pe->tmpdev.pos < 74 && c == 0x20) {
1705 n = pe->outdev.pos - pe->linehead + pe->tmpdev.pos + pe->firstindent;
1706 if (n > 74) {
1707 mbfl_memory_device_strncat(&pe->outdev, pe->lwsp, pe->lwsplen); /* LWSP */
1708 pe->linehead = pe->outdev.pos;
1709 pe->firstindent = 0;
1710 } else if (pe->outdev.pos > 0) {
1711 mbfl_memory_device_output(0x20, &pe->outdev);
1712 }
1713 mbfl_memory_device_devcat(&pe->outdev, &pe->tmpdev);
1714 mbfl_memory_device_reset(&pe->tmpdev);
1715 pe->status1 = 0;
1716 } else {
1717 n = pe->outdev.pos - pe->linehead + pe->encnamelen + pe->firstindent;
1718 if (n > 60) {
1719 mbfl_memory_device_strncat(&pe->outdev, pe->lwsp, pe->lwsplen); /* LWSP */
1720 pe->linehead = pe->outdev.pos;
1721 pe->firstindent = 0;
1722 } else if (pe->outdev.pos > 0) {
1723 mbfl_memory_device_output(0x20, &pe->outdev);
1724 }
1725 mbfl_convert_filter_devcat(pe->block_filter, &pe->tmpdev);
1726 mbfl_memory_device_reset(&pe->tmpdev);
1727 (*pe->block_filter->filter_function)(c, pe->block_filter);
1728 pe->status1 = 11;
1729 }
1730 }
1731 break;
1732 }
1733
1734 return 0;
1735 }
1736
1737 mbfl_string *
mime_header_encoder_result(struct mime_header_encoder_data * pe,mbfl_string * result)1738 mime_header_encoder_result(struct mime_header_encoder_data *pe, mbfl_string *result)
1739 {
1740 if (pe->status1 >= 10) {
1741 (*pe->conv2_filter->filter_flush)(pe->conv2_filter);
1742 (*pe->encod_filter->filter_flush)(pe->encod_filter);
1743 mbfl_memory_device_strncat(&pe->outdev, "\x3f\x3d", 2); /* ?= */
1744 } else if (pe->tmpdev.pos > 0) {
1745 if (pe->outdev.pos > 0) {
1746 if ((pe->outdev.pos - pe->linehead + pe->tmpdev.pos + pe->firstindent) > 74) {
1747 mbfl_memory_device_strncat(&pe->outdev, pe->lwsp, pe->lwsplen);
1748 } else {
1749 mbfl_memory_device_output(0x20, &pe->outdev);
1750 }
1751 }
1752 mbfl_memory_device_devcat(&pe->outdev, &pe->tmpdev);
1753 }
1754 mbfl_memory_device_reset(&pe->tmpdev);
1755 pe->prevpos = 0;
1756 pe->linehead = 0;
1757 pe->status1 = 0;
1758 pe->status2 = 0;
1759
1760 return mbfl_memory_device_result(&pe->outdev, result);
1761 }
1762
1763 struct mime_header_encoder_data*
mime_header_encoder_new(const mbfl_encoding * incode,const mbfl_encoding * outcode,const mbfl_encoding * transenc)1764 mime_header_encoder_new(
1765 const mbfl_encoding *incode,
1766 const mbfl_encoding *outcode,
1767 const mbfl_encoding *transenc)
1768 {
1769 size_t n;
1770 const char *s;
1771 struct mime_header_encoder_data *pe;
1772
1773 /* get output encoding and check MIME charset name */
1774 if (outcode->mime_name == NULL || outcode->mime_name[0] == '\0') {
1775 return NULL;
1776 }
1777
1778 pe = emalloc(sizeof(struct mime_header_encoder_data));
1779 mbfl_memory_device_init(&pe->outdev, 0, 0);
1780 mbfl_memory_device_init(&pe->tmpdev, 0, 0);
1781 pe->prevpos = 0;
1782 pe->linehead = 0;
1783 pe->firstindent = 0;
1784 pe->status1 = 0;
1785 pe->status2 = 0;
1786
1787 /* make the encoding description string exp. "=?ISO-2022-JP?B?" */
1788 n = 0;
1789 pe->encname[n++] = 0x3d;
1790 pe->encname[n++] = 0x3f;
1791 s = outcode->mime_name;
1792 while (*s) {
1793 pe->encname[n++] = *s++;
1794 }
1795 pe->encname[n++] = 0x3f;
1796 if (transenc->no_encoding == mbfl_no_encoding_qprint) {
1797 pe->encname[n++] = 0x51;
1798 } else {
1799 pe->encname[n++] = 0x42;
1800 transenc = &mbfl_encoding_base64;
1801 }
1802 pe->encname[n++] = 0x3f;
1803 pe->encname[n] = '\0';
1804 pe->encnamelen = n;
1805
1806 n = 0;
1807 pe->lwsp[n++] = 0x0d;
1808 pe->lwsp[n++] = 0x0a;
1809 pe->lwsp[n++] = 0x20;
1810 pe->lwsp[n] = '\0';
1811 pe->lwsplen = n;
1812
1813 /* transfer encode filter */
1814 pe->encod_filter = mbfl_convert_filter_new(outcode, transenc, mbfl_memory_device_output, 0, &(pe->outdev));
1815 pe->encod_filter_backup = mbfl_convert_filter_new(outcode, transenc, mbfl_memory_device_output, 0, &(pe->outdev));
1816
1817 /* Output code filter */
1818 pe->conv2_filter = mbfl_convert_filter_new(&mbfl_encoding_wchar, outcode, mbfl_filter_output_pipe, 0, pe->encod_filter);
1819 pe->conv2_filter_backup = mbfl_convert_filter_new(&mbfl_encoding_wchar, outcode, mbfl_filter_output_pipe, 0, pe->encod_filter);
1820
1821 /* encoded block filter */
1822 pe->block_filter = mbfl_convert_filter_new(&mbfl_encoding_wchar, &mbfl_encoding_wchar, mime_header_encoder_block_collector, 0, pe);
1823
1824 /* Input code filter */
1825 pe->conv1_filter = mbfl_convert_filter_new(incode, &mbfl_encoding_wchar, mime_header_encoder_collector, 0, pe);
1826
1827 if (pe->encod_filter == NULL ||
1828 pe->encod_filter_backup == NULL ||
1829 pe->conv2_filter == NULL ||
1830 pe->conv2_filter_backup == NULL ||
1831 pe->conv1_filter == NULL) {
1832 mime_header_encoder_delete(pe);
1833 return NULL;
1834 }
1835
1836 if (transenc->no_encoding == mbfl_no_encoding_qprint) {
1837 pe->encod_filter->status |= MBFL_QPRINT_STS_MIME_HEADER;
1838 pe->encod_filter_backup->status |= MBFL_QPRINT_STS_MIME_HEADER;
1839 } else {
1840 pe->encod_filter->status |= MBFL_BASE64_STS_MIME_HEADER;
1841 pe->encod_filter_backup->status |= MBFL_BASE64_STS_MIME_HEADER;
1842 }
1843
1844 return pe;
1845 }
1846
1847 void
mime_header_encoder_delete(struct mime_header_encoder_data * pe)1848 mime_header_encoder_delete(struct mime_header_encoder_data *pe)
1849 {
1850 if (pe) {
1851 mbfl_convert_filter_delete(pe->conv1_filter);
1852 mbfl_convert_filter_delete(pe->block_filter);
1853 mbfl_convert_filter_delete(pe->conv2_filter);
1854 mbfl_convert_filter_delete(pe->conv2_filter_backup);
1855 mbfl_convert_filter_delete(pe->encod_filter);
1856 mbfl_convert_filter_delete(pe->encod_filter_backup);
1857 mbfl_memory_device_clear(&pe->outdev);
1858 mbfl_memory_device_clear(&pe->tmpdev);
1859 efree((void*)pe);
1860 }
1861 }
1862
1863 mbfl_string *
mbfl_mime_header_encode(mbfl_string * string,mbfl_string * result,const mbfl_encoding * outcode,const mbfl_encoding * encoding,const char * linefeed,int indent)1864 mbfl_mime_header_encode(
1865 mbfl_string *string,
1866 mbfl_string *result,
1867 const mbfl_encoding *outcode,
1868 const mbfl_encoding *encoding,
1869 const char *linefeed,
1870 int indent)
1871 {
1872 size_t n;
1873 unsigned char *p;
1874 struct mime_header_encoder_data *pe;
1875
1876 mbfl_string_init(result);
1877 result->encoding = &mbfl_encoding_ascii;
1878
1879 pe = mime_header_encoder_new(string->encoding, outcode, encoding);
1880 if (pe == NULL) {
1881 return NULL;
1882 }
1883
1884 if (linefeed != NULL) {
1885 n = 0;
1886 while (*linefeed && n < 8) {
1887 pe->lwsp[n++] = *linefeed++;
1888 }
1889 pe->lwsp[n++] = 0x20;
1890 pe->lwsp[n] = '\0';
1891 pe->lwsplen = n;
1892 }
1893 if (indent > 0 && indent < 74) {
1894 pe->firstindent = indent;
1895 }
1896
1897 n = string->len;
1898 p = string->val;
1899 while (n > 0) {
1900 (*pe->conv1_filter->filter_function)(*p++, pe->conv1_filter);
1901 n--;
1902 }
1903
1904 result = mime_header_encoder_result(pe, result);
1905 mime_header_encoder_delete(pe);
1906
1907 return result;
1908 }
1909
1910
1911 /*
1912 * MIME header decode
1913 */
/* State of one MIME header decoding run (see mime_header_decoder_new()). */
struct mime_header_decoder_data {
	mbfl_convert_filter *deco_filter;  /* transfer decoding -> 8bit, piped into conv1_filter */
	mbfl_convert_filter *conv1_filter; /* current charset -> wchar, piped into conv2_filter */
	mbfl_convert_filter *conv2_filter; /* wchar -> outcode, writes into outdev */
	mbfl_memory_device outdev;         /* decoded output */
	mbfl_memory_device tmpdev;         /* characters held back while a possible encoded-word prefix is examined */
	size_t cspos;                      /* offset in tmpdev where the charset name starts */
	int status;                        /* state of mime_header_decoder_collector */
	const mbfl_encoding *encoding;     /* transfer encoding of the current encoded word */
	const mbfl_encoding *incode;       /* charset of the current encoded word */
	const mbfl_encoding *outcode;      /* encoding of the decoded output */
};
1926
/*
 * Character-by-character state machine that recognises encoded words of the
 * form "=?charset?B-or-Q?text?=" and decodes them; everything else is passed
 * to conv1_filter as-is.
 *
 * pd->status values:
 *   default  plain text
 *   1        '=' seen, waiting for '?'
 *   2        reading the charset name
 *   3        waiting for the transfer-encoding letter
 *   4        waiting for the '?' that follows the letter
 *   5        inside the encoded text
 *   6        '?' seen inside the encoded text, waiting for '='
 *   7        directly after an encoded word
 *   8        line break seen after an encoded word
 *   9        line break seen elsewhere
 *
 * Characters of a candidate prefix are held in tmpdev and replayed into
 * conv1_filter when the candidate turns out not to be an encoded word.
 *
 * Always returns 0.
 */
static int
mime_header_decoder_collector(int c, void* data)
{
	const mbfl_encoding *encoding;
	struct mime_header_decoder_data *pd = (struct mime_header_decoder_data*)data;

	switch (pd->status) {
	case 1:
		if (c == 0x3f) {		/* ? */
			mbfl_memory_device_output(c, &pd->tmpdev);
			/* the charset name starts right after "=?" */
			pd->cspos = pd->tmpdev.pos;
			pd->status = 2;
		} else {
			/* not an encoded word: replay what was held back */
			mbfl_convert_filter_devcat(pd->conv1_filter, &pd->tmpdev);
			mbfl_memory_device_reset(&pd->tmpdev);
			if (c == 0x3d) {		/* = */
				mbfl_memory_device_output(c, &pd->tmpdev);
			} else if (c == 0x0d || c == 0x0a) {	/* CR or LF */
				pd->status = 9;
			} else {
				(*pd->conv1_filter->filter_function)(c, pd->conv1_filter);
				pd->status = 0;
			}
		}
		break;
	case 2:		/* store charset string */
		if (c == 0x3f) {		/* ? */
			/* identify charset */
			/* terminate the name temporarily so it can be looked up */
			mbfl_memory_device_output('\0', &pd->tmpdev);
			encoding = mbfl_name2encoding((const char *)&pd->tmpdev.buffer[pd->cspos]);
			if (encoding != NULL) {
				pd->incode = encoding;
				pd->status = 3;
			}
			/* drop the temporary terminator again */
			mbfl_memory_device_unput(&pd->tmpdev);
			mbfl_memory_device_output(c, &pd->tmpdev);
		} else {
			mbfl_memory_device_output(c, &pd->tmpdev);
			if (pd->tmpdev.pos > 100) {		/* too long charset string */
				pd->status = 0;
			} else if (c == 0x0d || c == 0x0a) {	/* CR or LF */
				mbfl_memory_device_unput(&pd->tmpdev);
				pd->status = 9;
			}
			if (pd->status != 2) {
				/* gave up on this candidate: replay the held characters */
				mbfl_convert_filter_devcat(pd->conv1_filter, &pd->tmpdev);
				mbfl_memory_device_reset(&pd->tmpdev);
			}
		}
		break;
	case 3:		/* identify encoding */
		mbfl_memory_device_output(c, &pd->tmpdev);
		if (c == 0x42 || c == 0x62) {		/* 'B' or 'b' */
			pd->encoding = &mbfl_encoding_base64;
			pd->status = 4;
		} else if (c == 0x51 || c == 0x71) {	/* 'Q' or 'q' */
			pd->encoding = &mbfl_encoding_qprint;
			pd->status = 4;
		} else {
			if (c == 0x0d || c == 0x0a) {	/* CR or LF */
				mbfl_memory_device_unput(&pd->tmpdev);
				pd->status = 9;
			} else {
				pd->status = 0;
			}
			mbfl_convert_filter_devcat(pd->conv1_filter, &pd->tmpdev);
			mbfl_memory_device_reset(&pd->tmpdev);
		}
		break;
	case 4:		/* reset filter */
		mbfl_memory_device_output(c, &pd->tmpdev);
		if (c == 0x3f) {		/* ? */
			/* charset convert filter */
			mbfl_convert_filter_reset(pd->conv1_filter, pd->incode, &mbfl_encoding_wchar);
			/* decode filter */
			mbfl_convert_filter_reset(pd->deco_filter, pd->encoding, &mbfl_encoding_8bit);
			pd->status = 5;
		} else {
			if (c == 0x0d || c == 0x0a) {	/* CR or LF */
				mbfl_memory_device_unput(&pd->tmpdev);
				pd->status = 9;
			} else {
				pd->status = 0;
			}
			mbfl_convert_filter_devcat(pd->conv1_filter, &pd->tmpdev);
		}
		/* the prefix is either accepted (and dropped) or already replayed */
		mbfl_memory_device_reset(&pd->tmpdev);
		break;
	case 5:		/* encoded block */
		if (c == 0x3f) {		/* ? */
			pd->status = 6;
		} else {
			(*pd->deco_filter->filter_function)(c, pd->deco_filter);
		}
		break;
	case 6:		/* check end position */
		if (c == 0x3d) {		/* = */
			/* flush and reset filter */
			(*pd->deco_filter->filter_flush)(pd->deco_filter);
			(*pd->conv1_filter->filter_flush)(pd->conv1_filter);
			mbfl_convert_filter_reset(pd->conv1_filter, &mbfl_encoding_ascii, &mbfl_encoding_wchar);
			pd->status = 7;
		} else {
			/* the '?' was part of the encoded text after all */
			(*pd->deco_filter->filter_function)(0x3f, pd->deco_filter);
			if (c != 0x3f) {		/* ? */
				(*pd->deco_filter->filter_function)(c, pd->deco_filter);
				pd->status = 5;
			}
		}
		break;
	case 7:		/* after encoded block */
		if (c == 0x0d || c == 0x0a) {	/* CR LF */
			pd->status = 8;
		} else {
			mbfl_memory_device_output(c, &pd->tmpdev);
			if (c == 0x3d) {		/* = */
				pd->status = 1;
			} else if (c != 0x20 && c != 0x09) {		/* not space */
				mbfl_convert_filter_devcat(pd->conv1_filter, &pd->tmpdev);
				mbfl_memory_device_reset(&pd->tmpdev);
				pd->status = 0;
			}
		}
		break;
	case 8:		/* folding */
	case 9:		/* folding */
		/* line breaks and white space are skipped until other text arrives */
		if (c != 0x0d && c != 0x0a && c != 0x20 && c != 0x09) {
			if (c == 0x3d) {		/* = */
				if (pd->status == 8) {
					mbfl_memory_device_output(0x20, &pd->tmpdev);	/* SPACE */
				} else {
					(*pd->conv1_filter->filter_function)(0x20, pd->conv1_filter);
				}
				mbfl_memory_device_output(c, &pd->tmpdev);
				pd->status = 1;
			} else {
				mbfl_memory_device_output(0x20, &pd->tmpdev);
				mbfl_memory_device_output(c, &pd->tmpdev);
				mbfl_convert_filter_devcat(pd->conv1_filter, &pd->tmpdev);
				mbfl_memory_device_reset(&pd->tmpdev);
				pd->status = 0;
			}
		}
		break;
	default:		/* non encoded block */
		if (c == 0x0d || c == 0x0a) {	/* CR LF */
			pd->status = 9;
		} else if (c == 0x3d) {		/* = */
			mbfl_memory_device_output(c, &pd->tmpdev);
			pd->status = 1;
		} else {
			(*pd->conv1_filter->filter_function)(c, pd->conv1_filter);
		}
		break;
	}

	return 0;
}
2085
2086 mbfl_string *
mime_header_decoder_result(struct mime_header_decoder_data * pd,mbfl_string * result)2087 mime_header_decoder_result(struct mime_header_decoder_data *pd, mbfl_string *result)
2088 {
2089 switch (pd->status) {
2090 case 1:
2091 case 2:
2092 case 3:
2093 case 4:
2094 case 7:
2095 case 8:
2096 case 9:
2097 mbfl_convert_filter_devcat(pd->conv1_filter, &pd->tmpdev);
2098 break;
2099 case 5:
2100 case 6:
2101 (*pd->deco_filter->filter_flush)(pd->deco_filter);
2102 (*pd->conv1_filter->filter_flush)(pd->conv1_filter);
2103 break;
2104 }
2105 (*pd->conv2_filter->filter_flush)(pd->conv2_filter);
2106 mbfl_memory_device_reset(&pd->tmpdev);
2107 pd->status = 0;
2108
2109 return mbfl_memory_device_result(&pd->outdev, result);
2110 }
2111
2112 struct mime_header_decoder_data*
mime_header_decoder_new(const mbfl_encoding * outcode)2113 mime_header_decoder_new(const mbfl_encoding *outcode)
2114 {
2115 struct mime_header_decoder_data *pd = emalloc(sizeof(struct mime_header_decoder_data));
2116
2117 mbfl_memory_device_init(&pd->outdev, 0, 0);
2118 mbfl_memory_device_init(&pd->tmpdev, 0, 0);
2119 pd->cspos = 0;
2120 pd->status = 0;
2121 pd->encoding = &mbfl_encoding_8bit;
2122 pd->incode = &mbfl_encoding_ascii;
2123 pd->outcode = outcode;
2124 /* charset convert filter */
2125 pd->conv2_filter = mbfl_convert_filter_new(&mbfl_encoding_wchar, pd->outcode, mbfl_memory_device_output, 0, &pd->outdev);
2126 pd->conv1_filter = mbfl_convert_filter_new(pd->incode, &mbfl_encoding_wchar, mbfl_filter_output_pipe, 0, pd->conv2_filter);
2127 /* decode filter */
2128 pd->deco_filter = mbfl_convert_filter_new(pd->encoding, &mbfl_encoding_8bit, mbfl_filter_output_pipe, 0, pd->conv1_filter);
2129
2130 if (pd->conv1_filter == NULL || pd->conv2_filter == NULL || pd->deco_filter == NULL) {
2131 mime_header_decoder_delete(pd);
2132 return NULL;
2133 }
2134
2135 return pd;
2136 }
2137
2138 void
mime_header_decoder_delete(struct mime_header_decoder_data * pd)2139 mime_header_decoder_delete(struct mime_header_decoder_data *pd)
2140 {
2141 if (pd) {
2142 mbfl_convert_filter_delete(pd->conv2_filter);
2143 mbfl_convert_filter_delete(pd->conv1_filter);
2144 mbfl_convert_filter_delete(pd->deco_filter);
2145 mbfl_memory_device_clear(&pd->outdev);
2146 mbfl_memory_device_clear(&pd->tmpdev);
2147 efree((void*)pd);
2148 }
2149 }
2150
2151 mbfl_string *
mbfl_mime_header_decode(mbfl_string * string,mbfl_string * result,const mbfl_encoding * outcode)2152 mbfl_mime_header_decode(
2153 mbfl_string *string,
2154 mbfl_string *result,
2155 const mbfl_encoding *outcode)
2156 {
2157 size_t n;
2158 unsigned char *p;
2159 struct mime_header_decoder_data *pd;
2160
2161 mbfl_string_init(result);
2162 result->encoding = outcode;
2163
2164 pd = mime_header_decoder_new(outcode);
2165 if (pd == NULL) {
2166 return NULL;
2167 }
2168
2169 /* feed data */
2170 n = string->len;
2171 p = string->val;
2172 while (n > 0) {
2173 mime_header_decoder_collector(*p++, pd);
2174 n--;
2175 }
2176
2177 result = mime_header_decoder_result(pd, result);
2178 mime_header_decoder_delete(pd);
2179
2180 return result;
2181 }
2182
2183
2184
2185 /*
2186 * convert HTML numeric entity
2187 */
/* State shared by the HTML numeric entity encode/decode collectors. */
struct collector_htmlnumericentity_data {
	mbfl_convert_filter *decoder; /* downstream filter that receives the output characters */
	int status;                   /* parser state of the decode collector */
	int cache;                    /* numeric value accumulated so far by the decode collector */
	int digit;                    /* number of digits accumulated so far by the decode collector */
	int *convmap;                 /* mapsize entries of 4 ints each: range start, range end, offset, mask */
	int mapsize;                  /* number of 4-int entries in convmap */
};
2196
2197 static int
collector_encode_htmlnumericentity(int c,void * data)2198 collector_encode_htmlnumericentity(int c, void *data)
2199 {
2200 struct collector_htmlnumericentity_data *pc = (struct collector_htmlnumericentity_data *)data;
2201 int f, n, s, r, d, size, *mapelm;
2202
2203 size = pc->mapsize;
2204 f = 0;
2205 n = 0;
2206 while (n < size) {
2207 mapelm = &(pc->convmap[n*4]);
2208 if (c >= mapelm[0] && c <= mapelm[1]) {
2209 s = (c + mapelm[2]) & mapelm[3];
2210 if (s >= 0) {
2211 (*pc->decoder->filter_function)(0x26, pc->decoder); /* '&' */
2212 (*pc->decoder->filter_function)(0x23, pc->decoder); /* '#' */
2213 r = 100000000;
2214 s %= r;
2215 while (r > 0) {
2216 d = s/r;
2217 if (d || f) {
2218 f = 1;
2219 s %= r;
2220 (*pc->decoder->filter_function)(mbfl_hexchar_table[d], pc->decoder);
2221 }
2222 r /= 10;
2223 }
2224 if (!f) {
2225 f = 1;
2226 (*pc->decoder->filter_function)(mbfl_hexchar_table[0], pc->decoder);
2227 }
2228 (*pc->decoder->filter_function)(0x3b, pc->decoder); /* ';' */
2229 }
2230 }
2231 if (f) {
2232 break;
2233 }
2234 n++;
2235 }
2236 if (!f) {
2237 (*pc->decoder->filter_function)(c, pc->decoder);
2238 }
2239
2240 return 0;
2241 }
2242
2243 static int
collector_decode_htmlnumericentity(int c,void * data)2244 collector_decode_htmlnumericentity(int c, void *data)
2245 {
2246 struct collector_htmlnumericentity_data *pc = (struct collector_htmlnumericentity_data *)data;
2247 int f, n, s, r, d, size, *mapelm;
2248
2249 switch (pc->status) {
2250 case 1:
2251 if (c == 0x23) { /* '#' */
2252 pc->status = 2;
2253 } else {
2254 pc->status = 0;
2255 (*pc->decoder->filter_function)(0x26, pc->decoder); /* '&' */
2256 (*pc->decoder->filter_function)(c, pc->decoder);
2257 }
2258 break;
2259 case 2:
2260 if (c == 0x78) { /* 'x' */
2261 pc->status = 4;
2262 } else if (c >= 0x30 && c <= 0x39) { /* '0' - '9' */
2263 pc->cache = c - 0x30;
2264 pc->status = 3;
2265 pc->digit = 1;
2266 } else {
2267 pc->status = 0;
2268 (*pc->decoder->filter_function)(0x26, pc->decoder); /* '&' */
2269 (*pc->decoder->filter_function)(0x23, pc->decoder); /* '#' */
2270 (*pc->decoder->filter_function)(c, pc->decoder);
2271 }
2272 break;
2273 case 3:
2274 s = 0;
2275 f = 0;
2276 if (c >= 0x30 && c <= 0x39) { /* '0' - '9' */
2277 s = pc->cache;
2278 if (pc->digit > 9 || s > INT_MAX/10) {
2279 pc->status = 0;
2280 f = 1;
2281 } else {
2282 s = s*10 + (c - 0x30);
2283 pc->cache = s;
2284 pc->digit++;
2285 }
2286 } else {
2287 pc->status = 0;
2288 s = pc->cache;
2289 f = 1;
2290 n = 0;
2291 size = pc->mapsize;
2292 while (n < size) {
2293 mapelm = &(pc->convmap[n*4]);
2294 d = s - mapelm[2];
2295 if (d >= mapelm[0] && d <= mapelm[1]) {
2296 f = 0;
2297 (*pc->decoder->filter_function)(d, pc->decoder);
2298 if (c != 0x3b) { /* ';' */
2299 (*pc->decoder->filter_function)(c, pc->decoder);
2300 }
2301 break;
2302 }
2303 n++;
2304 }
2305 }
2306 if (f) {
2307 (*pc->decoder->filter_function)(0x26, pc->decoder); /* '&' */
2308 (*pc->decoder->filter_function)(0x23, pc->decoder); /* '#' */
2309 r = 1;
2310 n = pc->digit;
2311 while (n > 1) {
2312 r *= 10;
2313 n--;
2314 }
2315 while (r > 0) {
2316 d = s/r;
2317 s %= r;
2318 r /= 10;
2319 (*pc->decoder->filter_function)(mbfl_hexchar_table[d], pc->decoder);
2320 }
2321 (*pc->decoder->filter_function)(c, pc->decoder);
2322 }
2323 break;
2324 case 4:
2325 if (c >= 0x30 && c <= 0x39) { /* '0' - '9' */
2326 pc->cache = c - 0x30;
2327 pc->status = 5;
2328 pc->digit = 1;
2329 } else if (c >= 0x41 && c <= 0x46) { /* 'A' - 'F' */
2330 pc->cache = c - 0x41 + 10;
2331 pc->status = 5;
2332 pc->digit = 1;
2333 } else if (c >= 0x61 && c <= 0x66) { /* 'a' - 'f' */
2334 pc->cache = c - 0x61 + 10;
2335 pc->status = 5;
2336 pc->digit = 1;
2337 } else {
2338 pc->status = 0;
2339 (*pc->decoder->filter_function)(0x26, pc->decoder); /* '&' */
2340 (*pc->decoder->filter_function)(0x23, pc->decoder); /* '#' */
2341 (*pc->decoder->filter_function)(0x78, pc->decoder); /* 'x' */
2342 (*pc->decoder->filter_function)(c, pc->decoder);
2343 }
2344 break;
2345 case 5:
2346 s = 0;
2347 f = 0;
2348 if ((c >= 0x30 && c <= 0x39) ||
2349 (c >= 0x41 && c <= 0x46) ||
2350 (c >= 0x61 && c <= 0x66)) { /* '0' - '9' or 'a' - 'f' */
2351 if (pc->digit > 9) {
2352 pc->status = 0;
2353 s = pc->cache;
2354 f = 1;
2355 } else {
2356 if (c >= 0x30 && c <= 0x39) {
2357 s = pc->cache*16 + (c - 0x30);
2358 } else if (c >= 0x41 && c <= 0x46) {
2359 s = pc->cache*16 + (c - 0x41 + 10);
2360 } else {
2361 s = pc->cache*16 + (c - 0x61 + 10);
2362 }
2363 pc->cache = s;
2364 pc->digit++;
2365 }
2366 } else {
2367 pc->status = 0;
2368 s = pc->cache;
2369 f = 1;
2370 n = 0;
2371 size = pc->mapsize;
2372 while (n < size) {
2373 mapelm = &(pc->convmap[n*4]);
2374 d = s - mapelm[2];
2375 if (d >= mapelm[0] && d <= mapelm[1]) {
2376 f = 0;
2377 (*pc->decoder->filter_function)(d, pc->decoder);
2378 if (c != 0x3b) { /* ';' */
2379 (*pc->decoder->filter_function)(c, pc->decoder);
2380 }
2381 break;
2382 }
2383 n++;
2384 }
2385 }
2386 if (f) {
2387 (*pc->decoder->filter_function)(0x26, pc->decoder); /* '&' */
2388 (*pc->decoder->filter_function)(0x23, pc->decoder); /* '#' */
2389 (*pc->decoder->filter_function)(0x78, pc->decoder); /* 'x' */
2390 r = 1;
2391 n = pc->digit;
2392 while (n > 0) {
2393 r *= 16;
2394 n--;
2395 }
2396 s %= r;
2397 r /= 16;
2398 while (r > 0) {
2399 d = s/r;
2400 s %= r;
2401 r /= 16;
2402 (*pc->decoder->filter_function)(mbfl_hexchar_table[d], pc->decoder);
2403 }
2404 (*pc->decoder->filter_function)(c, pc->decoder);
2405 }
2406 break;
2407 default:
2408 if (c == 0x26) { /* '&' */
2409 pc->status = 1;
2410 } else {
2411 (*pc->decoder->filter_function)(c, pc->decoder);
2412 }
2413 break;
2414 }
2415
2416 return 0;
2417 }
2418
2419 static int
collector_encode_hex_htmlnumericentity(int c,void * data)2420 collector_encode_hex_htmlnumericentity(int c, void *data)
2421 {
2422 struct collector_htmlnumericentity_data *pc = (struct collector_htmlnumericentity_data *)data;
2423 int f, n, s, r, d, size, *mapelm;
2424
2425 size = pc->mapsize;
2426 f = 0;
2427 n = 0;
2428 while (n < size) {
2429 mapelm = &(pc->convmap[n*4]);
2430 if (c >= mapelm[0] && c <= mapelm[1]) {
2431 s = (c + mapelm[2]) & mapelm[3];
2432 if (s >= 0) {
2433 (*pc->decoder->filter_function)(0x26, pc->decoder); /* '&' */
2434 (*pc->decoder->filter_function)(0x23, pc->decoder); /* '#' */
2435 (*pc->decoder->filter_function)(0x78, pc->decoder); /* 'x' */
2436 r = 0x1000000;
2437 s %= r;
2438 while (r > 0) {
2439 d = s/r;
2440 if (d || f) {
2441 f = 1;
2442 s %= r;
2443 (*pc->decoder->filter_function)(mbfl_hexchar_table[d], pc->decoder);
2444 }
2445 r /= 16;
2446 }
2447 if (!f) {
2448 f = 1;
2449 (*pc->decoder->filter_function)(mbfl_hexchar_table[0], pc->decoder);
2450 }
2451 (*pc->decoder->filter_function)(0x3b, pc->decoder); /* ';' */
2452 }
2453 }
2454 if (f) {
2455 break;
2456 }
2457 n++;
2458 }
2459 if (!f) {
2460 (*pc->decoder->filter_function)(c, pc->decoder);
2461 }
2462
2463 return 0;
2464 }
2465
/*
 * Flush handler for the numeric entity decoder.
 *
 * If the input ended in the middle of an entity, the characters consumed
 * so far ("&", "&#", "&#" + decimal digits, "&#x", or "&#x" + hex digits)
 * are emitted as text to the downstream filter.  The decode state is then
 * reset.  Always returns 0.
 *
 * NOTE: `filter` is used as a struct collector_htmlnumericentity_data
 * pointer; mbfl_html_numeric_entity() registers this function through a
 * flush_function_t cast with the collector state as its data argument.
 */
int mbfl_filt_decode_htmlnumericentity_flush(mbfl_convert_filter *filter)
{
	struct collector_htmlnumericentity_data *pc = (struct collector_htmlnumericentity_data *)filter;
	unsigned int v, r;
	int n, d;

	switch (pc->status) {
	case 1: /* '&' */
		(*pc->decoder->filter_function)(0x26, pc->decoder);		/* '&' */
		break;
	case 2: /* '#' */
		(*pc->decoder->filter_function)(0x26, pc->decoder);		/* '&' */
		(*pc->decoder->filter_function)(0x23, pc->decoder);		/* '#' */
		break;
	case 3: /* '0'-'9' */
		(*pc->decoder->filter_function)(0x26, pc->decoder);		/* '&' */
		(*pc->decoder->filter_function)(0x23, pc->decoder);		/* '#' */

		/*
		 * Unsigned arithmetic plus "% 10" keeps the table index within
		 * 0..9 even if the cached value is out of the expected range.
		 */
		v = (unsigned int)pc->cache;
		r = 1;
		for (n = pc->digit; n > 1; n--) {
			r *= 10;
		}
		for (; r > 0; r /= 10) {
			d = (int)((v / r) % 10);
			(*pc->decoder->filter_function)(mbfl_hexchar_table[d], pc->decoder);
		}
		break;
	case 4: /* 'x' */
		(*pc->decoder->filter_function)(0x26, pc->decoder);		/* '&' */
		(*pc->decoder->filter_function)(0x23, pc->decoder);		/* '#' */
		(*pc->decoder->filter_function)(0x78, pc->decoder);		/* 'x' */
		break;
	case 5: /* '0'-'9','a'-'f' */
		(*pc->decoder->filter_function)(0x26, pc->decoder);		/* '&' */
		(*pc->decoder->filter_function)(0x23, pc->decoder);		/* '#' */
		(*pc->decoder->filter_function)(0x78, pc->decoder);		/* 'x' */

		/*
		 * Extract the digits by shifting.  Computing 16^digit in an int
		 * overflows once 8 or more digits were read, and dividing by the
		 * overflowed value is what the previous code did.
		 */
		v = (unsigned int)pc->cache;
		for (n = pc->digit - 1; n >= 0; n--) {
			/* positions beyond the width of v are emitted as '0' */
			d = (n < (int)(sizeof(v) * 2)) ? (int)((v >> (4 * n)) & 0xfu) : 0;
			(*pc->decoder->filter_function)(mbfl_hexchar_table[d], pc->decoder);
		}
		break;
	default:
		break;
	}

	pc->status = 0;
	pc->cache = 0;
	pc->digit = 0;

	return 0;
}
2536
2537
2538 mbfl_string *
mbfl_html_numeric_entity(mbfl_string * string,mbfl_string * result,int * convmap,int mapsize,int type)2539 mbfl_html_numeric_entity(
2540 mbfl_string *string,
2541 mbfl_string *result,
2542 int *convmap,
2543 int mapsize,
2544 int type)
2545 {
2546 struct collector_htmlnumericentity_data pc;
2547 mbfl_memory_device device;
2548 mbfl_convert_filter *encoder;
2549 size_t n;
2550 unsigned char *p;
2551
2552 if (string == NULL || result == NULL) {
2553 return NULL;
2554 }
2555 mbfl_string_init(result);
2556 result->encoding = string->encoding;
2557 mbfl_memory_device_init(&device, string->len, 0);
2558
2559 /* output code filter */
2560 pc.decoder = mbfl_convert_filter_new(
2561 &mbfl_encoding_wchar,
2562 string->encoding,
2563 mbfl_memory_device_output, 0, &device);
2564 /* wchar filter */
2565 if (type == 0) { /* decimal output */
2566 encoder = mbfl_convert_filter_new(
2567 string->encoding,
2568 &mbfl_encoding_wchar,
2569 collector_encode_htmlnumericentity, 0, &pc);
2570 } else if (type == 2) { /* hex output */
2571 encoder = mbfl_convert_filter_new(
2572 string->encoding,
2573 &mbfl_encoding_wchar,
2574 collector_encode_hex_htmlnumericentity, 0, &pc);
2575 } else { /* type == 1: decimal/hex input */
2576 encoder = mbfl_convert_filter_new(
2577 string->encoding,
2578 &mbfl_encoding_wchar,
2579 collector_decode_htmlnumericentity,
2580 (flush_function_t)mbfl_filt_decode_htmlnumericentity_flush, &pc);
2581 }
2582 if (pc.decoder == NULL || encoder == NULL) {
2583 mbfl_convert_filter_delete(encoder);
2584 mbfl_convert_filter_delete(pc.decoder);
2585 return NULL;
2586 }
2587 pc.status = 0;
2588 pc.cache = 0;
2589 pc.digit = 0;
2590 pc.convmap = convmap;
2591 pc.mapsize = mapsize;
2592
2593 /* feed data */
2594 p = string->val;
2595 n = string->len;
2596 if (p != NULL) {
2597 while (n > 0) {
2598 if ((*encoder->filter_function)(*p++, encoder) < 0) {
2599 break;
2600 }
2601 n--;
2602 }
2603 }
2604 mbfl_convert_filter_flush(encoder);
2605 mbfl_convert_filter_flush(pc.decoder);
2606 result = mbfl_memory_device_result(&device, result);
2607 mbfl_convert_filter_delete(encoder);
2608 mbfl_convert_filter_delete(pc.decoder);
2609
2610 return result;
2611 }
2612