1 /*
2 * "streamable kanji code filter and converter"
3 * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
4 *
5 * LICENSE NOTICES
6 *
7 * This file is part of "streamable kanji code filter and converter",
8 * which is distributed under the terms of GNU Lesser General Public
9 * License (version 2) as published by the Free Software Foundation.
10 *
11 * This software is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with "streamable kanji code filter and converter";
18 * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
19 * Suite 330, Boston, MA 02111-1307 USA
20 *
21 * The author of this file:
22 *
23 */
/*
 * The source code included in this file was separated from mbfilter.c
 * by moriyoshi koizumi <moriyoshi@php.net> on 4 dec 2002.
 *
 */
29
30 #ifdef HAVE_CONFIG_H
31 #include "config.h"
32 #endif
33
34 #include "mbfilter.h"
35 #include "mbfilter_utf7.h"
36
37 static int mbfl_filt_ident_utf7(int c, mbfl_identify_filter *filter);
38
/*
 * Base64 alphabet used when emitting the shifted (Modified Base64) parts of
 * UTF-7 output.  Entries are ASCII code values, indexed by 6-bit value:
 *   0-25 'A'-'Z', 26-51 'a'-'z', 52-61 '0'-'9', 62 '+', 63 '/'.
 * A single 0x00 entry follows the 64 alphabet entries.
 */
static const unsigned char mbfl_base64_table[] = {
	/*  0 -  7: 'A' .. 'H' */
	0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48,
	/*  8 - 15: 'I' .. 'P' */
	0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50,
	/* 16 - 23: 'Q' .. 'X' */
	0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58,
	/* 24 - 31: 'Y', 'Z', 'a' .. 'f' */
	0x59, 0x5a, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66,
	/* 32 - 39: 'g' .. 'n' */
	0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e,
	/* 40 - 47: 'o' .. 'v' */
	0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76,
	/* 48 - 55: 'w' .. 'z', '0' .. '3' */
	0x77, 0x78, 0x79, 0x7a, 0x30, 0x31, 0x32, 0x33,
	/* 56 - 63: '4' .. '9', '+', '/' */
	0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x2b, 0x2f,
	/* 64: '\0' */
	0x00
};
51
/* Alternative names accepted for UTF-7; the list is terminated by NULL. */
static const char *mbfl_encoding_utf7_aliases[] = {
	"utf7",
	NULL
};
53
54 const mbfl_encoding mbfl_encoding_utf7 = {
55 mbfl_no_encoding_utf7,
56 "UTF-7",
57 "UTF-7",
58 (const char *(*)[])&mbfl_encoding_utf7_aliases,
59 NULL,
60 MBFL_ENCTYPE_MBCS | MBFL_ENCTYPE_SHFTCODE | MBFL_ENCTYPE_GL_UNSAFE
61 };
62
63 const struct mbfl_identify_vtbl vtbl_identify_utf7 = {
64 mbfl_no_encoding_utf7,
65 mbfl_filt_ident_common_ctor,
66 mbfl_filt_ident_common_dtor,
67 mbfl_filt_ident_utf7
68 };
69
70 const struct mbfl_convert_vtbl vtbl_utf7_wchar = {
71 mbfl_no_encoding_utf7,
72 mbfl_no_encoding_wchar,
73 mbfl_filt_conv_common_ctor,
74 mbfl_filt_conv_common_dtor,
75 mbfl_filt_conv_utf7_wchar,
76 mbfl_filt_conv_common_flush
77 };
78
79 const struct mbfl_convert_vtbl vtbl_wchar_utf7 = {
80 mbfl_no_encoding_wchar,
81 mbfl_no_encoding_utf7,
82 mbfl_filt_conv_common_ctor,
83 mbfl_filt_conv_common_dtor,
84 mbfl_filt_conv_wchar_utf7,
85 mbfl_filt_conv_wchar_utf7_flush
86 };
87
88
/*
 * CK(statement): evaluate `statement` exactly once and, if its value is
 * negative, return -1 from the enclosing function.  Used to propagate output
 * failures out of the filter callbacks.
 */
#define CK(statement) \
	do { \
		if ((statement) < 0) { \
			return (-1); \
		} \
	} while (0)
90
91 /*
92 * UTF-7 => wchar
93 */
mbfl_filt_conv_utf7_wchar(int c,mbfl_convert_filter * filter)94 int mbfl_filt_conv_utf7_wchar(int c, mbfl_convert_filter *filter)
95 {
96 int s, n;
97
98 n = -1;
99 if (filter->status != 0) { /* Modified Base64 */
100 if (c >= 0x41 && c <= 0x5a) { /* A - Z */
101 n = c - 65;
102 } else if (c >= 0x61 && c <= 0x7a) { /* a - z */
103 n = c - 71;
104 } else if (c >= 0x30 && c <= 0x39) { /* 0 - 9 */
105 n = c + 4;
106 } else if (c == 0x2b) { /* '+' */
107 n = 62;
108 } else if (c == 0x2f) { /* '/' */
109 n = 63;
110 }
111 if (n < 0 || n > 63) {
112 if (c == 0x2d) {
113 if (filter->status == 1) { /* "+-" -> "+" */
114 CK((*filter->output_function)(0x2b, filter->data));
115 }
116 } else if (c >= 0 && c < 0x80) { /* ASCII exclude '-' */
117 CK((*filter->output_function)(c, filter->data));
118 } else { /* illegal character */
119 s = c & MBFL_WCSGROUP_MASK;
120 s |= MBFL_WCSGROUP_THROUGH;
121 CK((*filter->output_function)(s, filter->data));
122 }
123 filter->cache = 0;
124 filter->status = 0;
125 return c;
126 }
127 }
128
129 switch (filter->status) {
130 /* directly encoded characters */
131 case 0:
132 if (c == 0x2b) { /* '+' shift character */
133 filter->status = 1;
134 } else if (c >= 0 && c < 0x80) { /* ASCII */
135 CK((*filter->output_function)(c, filter->data));
136 } else { /* illegal character */
137 s = c & MBFL_WCSGROUP_MASK;
138 s |= MBFL_WCSGROUP_THROUGH;
139 CK((*filter->output_function)(s, filter->data));
140 }
141 break;
142
143 /* decode Modified Base64 */
144 case 1:
145 case 2:
146 filter->cache |= n << 10;
147 filter->status = 3;
148 break;
149 case 3:
150 filter->cache |= n << 4;
151 filter->status = 4;
152 break;
153 case 4:
154 s = ((n >> 2) & 0xf) | (filter->cache & 0xffff);
155 n = (n & 0x3) << 14;
156 filter->status = 5;
157 if (s >= 0xd800 && s < 0xdc00) {
158 s = (((s & 0x3ff) << 16) + 0x400000) | n;
159 filter->cache = s;
160 } else if (s >= 0xdc00 && s < 0xe000) {
161 s &= 0x3ff;
162 s |= (filter->cache & 0xfff0000) >> 6;
163 filter->cache = n;
164 if (s >= MBFL_WCSPLANE_SUPMIN && s < MBFL_WCSPLANE_SUPMAX) {
165 CK((*filter->output_function)(s, filter->data));
166 } else { /* illegal character */
167 s &= MBFL_WCSGROUP_MASK;
168 s |= MBFL_WCSGROUP_THROUGH;
169 CK((*filter->output_function)(s, filter->data));
170 }
171 } else {
172 filter->cache = n;
173 CK((*filter->output_function)(s, filter->data));
174 }
175 break;
176
177 case 5:
178 filter->cache |= n << 8;
179 filter->status = 6;
180 break;
181 case 6:
182 filter->cache |= n << 2;
183 filter->status = 7;
184 break;
185 case 7:
186 s = ((n >> 4) & 0x3) | (filter->cache & 0xffff);
187 n = (n & 0xf) << 12;
188 filter->status = 8;
189 if (s >= 0xd800 && s < 0xdc00) {
190 s = (((s & 0x3ff) << 16) + 0x400000) | n;
191 filter->cache = s;
192 } else if (s >= 0xdc00 && s < 0xe000) {
193 s &= 0x3ff;
194 s |= (filter->cache & 0xfff0000) >> 6;
195 filter->cache = n;
196 if (s >= MBFL_WCSPLANE_SUPMIN && s < MBFL_WCSPLANE_SUPMAX) {
197 CK((*filter->output_function)(s, filter->data));
198 } else { /* illegal character */
199 s &= MBFL_WCSGROUP_MASK;
200 s |= MBFL_WCSGROUP_THROUGH;
201 CK((*filter->output_function)(s, filter->data));
202 }
203 } else {
204 filter->cache = n;
205 CK((*filter->output_function)(s, filter->data));
206 }
207 break;
208
209 case 8:
210 filter->cache |= n << 6;
211 filter->status = 9;
212 break;
213 case 9:
214 s = n | (filter->cache & 0xffff);
215 filter->status = 2;
216 if (s >= 0xd800 && s < 0xdc00) {
217 s = (((s & 0x3ff) << 16) + 0x400000);
218 filter->cache = s;
219 } else if (s >= 0xdc00 && s < 0xe000) {
220 s &= 0x3ff;
221 s |= (filter->cache & 0xfff0000) >> 6;
222 filter->cache = 0;
223 if (s >= MBFL_WCSPLANE_SUPMIN && s < MBFL_WCSPLANE_SUPMAX) {
224 CK((*filter->output_function)(s, filter->data));
225 } else { /* illegal character */
226 s &= MBFL_WCSGROUP_MASK;
227 s |= MBFL_WCSGROUP_THROUGH;
228 CK((*filter->output_function)(s, filter->data));
229 }
230 } else {
231 filter->cache = 0;
232 CK((*filter->output_function)(s, filter->data));
233 }
234 break;
235
236 default:
237 filter->status = 0;
238 break;
239 }
240
241 return c;
242 }
243
244 /*
245 * wchar => UTF-7
246 */
mbfl_filt_conv_wchar_utf7(int c,mbfl_convert_filter * filter)247 int mbfl_filt_conv_wchar_utf7(int c, mbfl_convert_filter *filter)
248 {
249 int s, n;
250
251 n = 0;
252 if (c >= 0 && c < 0x80) { /* ASCII */
253 if (c >= 0x41 && c <= 0x5a) { /* A - Z */
254 n = 1;
255 } else if (c >= 0x61 && c <= 0x7a) { /* a - z */
256 n = 1;
257 } else if (c >= 0x30 && c <= 0x39) { /* 0 - 9 */
258 n = 1;
259 } else if (c == '\0') { /* '\0' */
260 n = 1;
261 } else if (c == 0x2f) { /* '/' */
262 n = 1;
263 } else if (c == 0x2d) { /* '-' */
264 n = 1;
265 } else if (c == 0x20) { /* SPACE */
266 n = 2;
267 } else if (c == 0x09) { /* HTAB */
268 n = 2;
269 } else if (c == 0x0d) { /* CR */
270 n = 2;
271 } else if (c == 0x0a) { /* LF */
272 n = 2;
273 } else if (c == 0x27) { /* "'" */
274 n = 2;
275 } else if (c == 0x28) { /* '(' */
276 n = 2;
277 } else if (c == 0x29) { /* ')' */
278 n = 2;
279 } else if (c == 0x2c) { /* ',' */
280 n = 2;
281 } else if (c == 0x2e) { /* '.' */
282 n = 2;
283 } else if (c == 0x3a) { /* ':' */
284 n = 2;
285 } else if (c == 0x3f) { /* '?' */
286 n = 2;
287 }
288 } else if (c >= 0 && c < MBFL_WCSPLANE_UCS2MAX) {
289 ;
290 } else if (c >= MBFL_WCSPLANE_SUPMIN && c < MBFL_WCSPLANE_SUPMAX) {
291 s = ((c >> 10) - 0x40) | 0xd800;
292 CK((*filter->filter_function)(s, filter));
293 s = (c & 0x3ff) | 0xdc00;
294 CK((*filter->filter_function)(s, filter));
295 return c;
296 } else {
297 if (filter->illegal_mode != MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
298 CK(mbfl_filt_conv_illegal_output(c, filter));
299 }
300 return c;
301 }
302
303 switch (filter->status) {
304 case 0:
305 if (n != 0) { /* directly encode characters */
306 CK((*filter->output_function)(c, filter->data));
307 } else { /* Modified Base64 */
308 CK((*filter->output_function)(0x2b, filter->data)); /* '+' */
309 filter->status++;
310 filter->cache = c;
311 }
312 break;
313
314 /* encode Modified Base64 */
315 case 1:
316 s = filter->cache;
317 CK((*filter->output_function)(mbfl_base64_table[(s >> 10) & 0x3f], filter->data));
318 CK((*filter->output_function)(mbfl_base64_table[(s >> 4) & 0x3f], filter->data));
319 if (n != 0) {
320 CK((*filter->output_function)(mbfl_base64_table[(s << 2) & 0x3c], filter->data));
321 if (n == 1) {
322 CK((*filter->output_function)(0x2d, filter->data)); /* '-' */
323 }
324 CK((*filter->output_function)(c, filter->data));
325 filter->status = 0;
326 } else {
327 filter->status++;
328 filter->cache = ((s & 0xf) << 16) | c;
329 }
330 break;
331
332 case 2:
333 s = filter->cache;
334 CK((*filter->output_function)(mbfl_base64_table[(s >> 14) & 0x3f], filter->data));
335 CK((*filter->output_function)(mbfl_base64_table[(s >> 8) & 0x3f], filter->data));
336 CK((*filter->output_function)(mbfl_base64_table[(s >> 2) & 0x3f], filter->data));
337 if (n != 0) {
338 CK((*filter->output_function)(mbfl_base64_table[(s << 4) & 0x30], filter->data));
339 if (n == 1) {
340 CK((*filter->output_function)(0x2d, filter->data)); /* '-' */
341 }
342 CK((*filter->output_function)(c, filter->data));
343 filter->status = 0;
344 } else {
345 filter->status++;
346 filter->cache = ((s & 0x3) << 16) | c;
347 }
348 break;
349
350 case 3:
351 s = filter->cache;
352 CK((*filter->output_function)(mbfl_base64_table[(s >> 12) & 0x3f], filter->data));
353 CK((*filter->output_function)(mbfl_base64_table[(s >> 6) & 0x3f], filter->data));
354 CK((*filter->output_function)(mbfl_base64_table[s & 0x3f], filter->data));
355 if (n != 0) {
356 if (n == 1) {
357 CK((*filter->output_function)(0x2d, filter->data)); /* '-' */
358 }
359 CK((*filter->output_function)(c, filter->data));
360 filter->status = 0;
361 } else {
362 filter->status = 1;
363 filter->cache = c;
364 }
365 break;
366
367 default:
368 filter->status = 0;
369 break;
370 }
371
372 return c;
373
374 }
375
/*
 * Flush the wchar => UTF-7 encoder.
 *
 * The filter state is reset first.  If a base64 run was open (status 1, 2 or
 * 3), the bits still held in the cache are written as base64 characters, with
 * the final partial sextet zero-padded, followed by a closing '-'.  Afterwards
 * the downstream flush function is called when one is set; its result is not
 * inspected.
 *
 * Returns 0, or -1 as soon as the output function reports a failure.
 */
int mbfl_filt_conv_wchar_utf7_flush(mbfl_convert_filter *filter)
{
	int pending[4];		/* base64 table indexes still to be written */
	int count = 0;
	int i;
	int status, cache;

	status = filter->status;
	cache = filter->cache;
	filter->status = 0;
	filter->cache = 0;

	/* flush fragments */
	switch (status) {
	case 1:		/* 16 bits pending */
		pending[count++] = (cache >> 10) & 0x3f;
		pending[count++] = (cache >> 4) & 0x3f;
		pending[count++] = (cache << 2) & 0x3c;
		break;

	case 2:		/* 4 + 16 bits pending */
		pending[count++] = (cache >> 14) & 0x3f;
		pending[count++] = (cache >> 8) & 0x3f;
		pending[count++] = (cache >> 2) & 0x3f;
		pending[count++] = (cache << 4) & 0x30;
		break;

	case 3:		/* 2 + 16 bits pending */
		pending[count++] = (cache >> 12) & 0x3f;
		pending[count++] = (cache >> 6) & 0x3f;
		pending[count++] = cache & 0x3f;
		break;

	default:	/* no base64 run open */
		break;
	}

	for (i = 0; i < count; i++) {
		CK((*filter->output_function)(mbfl_base64_table[pending[i]], filter->data));
	}
	if (count > 0) {
		CK((*filter->output_function)(0x2d, filter->data));		/* '-' */
	}

	if (filter->flush_function != NULL) {
		(*filter->flush_function)(filter->data);
	}

	return 0;
}
415
/*
 * Identify filter for UTF-7: examines one input character and sets
 * filter->flag to 1 once the input cannot be UTF-7.
 *
 * filter->status: 0 = directly encoded text, 1 = just after the '+' shift
 * character, 2 = inside a base64 run.
 *
 * A character is rejected when it is outside 0x00..0x7f, when it is '\' or
 * '~', or when a '+' is followed by something that is neither a base64
 * character nor '-'.  The '\' / '~' check is also applied to the character
 * that terminates a base64 run, because that character is itself directly
 * encoded text.
 *
 * Always returns c.
 */
static int mbfl_filt_ident_utf7(int c, mbfl_identify_filter *filter)
{
	int is_base64;

	switch (filter->status) {
	/* directly encoded characters */
	case 0:
		if (c == 0x2b) {		/* '+' shift character */
			filter->status = 1;
		} else if (c == 0x5c || c == 0x7e || c < 0 || c > 0x7f) {	/* illegal character */
			filter->flag = 1;	/* bad */
		}
		break;

	/* Modified Base64 */
	case 1:
	case 2:
		is_base64 = (c >= 0x41 && c <= 0x5a)	/* A - Z */
			|| (c >= 0x61 && c <= 0x7a)	/* a - z */
			|| (c >= 0x30 && c <= 0x39)	/* 0 - 9 */
			|| c == 0x2b			/* '+' */
			|| c == 0x2f;			/* '/' */

		if (is_base64) {
			filter->status = 2;
			break;
		}

		/* any other character ends the base64 run */
		if (filter->status == 1 && c != 0x2d) {
			/* '+' must be followed by base64 data or '-' */
			filter->flag = 1;	/* bad */
		} else if (c < 0 || c > 0x7f || c == 0x5c || c == 0x7e) {
			/* same rule as for directly encoded text in state 0 */
			filter->flag = 1;	/* bad */
		}
		filter->status = 0;
		break;

	default:
		filter->status = 0;
		break;
	}

	return c;
}
464