1 /*
2 * "streamable kanji code filter and converter"
3 * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
4 *
5 * LICENSE NOTICES
6 *
7 * This file is part of "streamable kanji code filter and converter",
8 * which is distributed under the terms of GNU Lesser General Public
9 * License (version 2) as published by the Free Software Foundation.
10 *
11 * This software is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with "streamable kanji code filter and converter";
18 * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
19 * Suite 330, Boston, MA 02111-1307 USA
20 *
21 * The author of this file:
22 *
23 */
24 /*
 * The source code included in this file was separated from mbfilter.c
26 * by moriyoshi koizumi <moriyoshi@php.net> on 4 dec 2002.
27 *
28 */
29
30 #include "mbfilter.h"
31 #include "mbfilter_utf16.h"
32
/* Forward declaration: the shared flush handler for the three UTF-16 decoders
 * is defined at the end of this file but is referenced by the decoder
 * conversion tables that come first. */
static int mbfl_filt_conv_utf16_wchar_flush(mbfl_convert_filter *filter);
34
/* NULL-terminated list of alternative names for the generic "UTF-16"
 * encoding; referenced from the mbfl_encoding_utf16 descriptor. */
static const char *mbfl_encoding_utf16_aliases[] = {"utf16", NULL};
36
/*
 * Descriptor for the generic "UTF-16" encoding (byte order is decided by the
 * decoder, see mbfl_filt_conv_utf16_wchar).
 *
 * The initializer is positional; the field layout is fixed by the
 * mbfl_encoding declaration, which is not part of this file. The values look
 * like: encoding id, name, MIME name, alias list, a table pointer (unused),
 * encoding-type flags, decoder table, encoder table, and one more unused
 * pointer -- NOTE(review): confirm the order against the struct declaration.
 */
const mbfl_encoding mbfl_encoding_utf16 = {
	mbfl_no_encoding_utf16,
	"UTF-16",
	"UTF-16",
	mbfl_encoding_utf16_aliases,
	NULL,
	MBFL_ENCTYPE_MWC2,
	&vtbl_utf16_wchar,
	&vtbl_wchar_utf16,
	NULL
};
48
/*
 * Descriptor for big-endian "UTF-16BE". No alias list is provided.
 *
 * The initializer is positional; the values look like: encoding id, name,
 * MIME name, alias list (none), a table pointer (unused), encoding-type
 * flags, decoder table, encoder table, and one more unused pointer --
 * NOTE(review): confirm the order against the mbfl_encoding declaration,
 * which is not part of this file.
 */
const mbfl_encoding mbfl_encoding_utf16be = {
	mbfl_no_encoding_utf16be,
	"UTF-16BE",
	"UTF-16BE",
	NULL,
	NULL,
	MBFL_ENCTYPE_MWC2,
	&vtbl_utf16be_wchar,
	&vtbl_wchar_utf16be,
	NULL
};
60
/*
 * Descriptor for little-endian "UTF-16LE". No alias list is provided.
 *
 * The initializer is positional; the values look like: encoding id, name,
 * MIME name, alias list (none), a table pointer (unused), encoding-type
 * flags, decoder table, encoder table, and one more unused pointer --
 * NOTE(review): confirm the order against the mbfl_encoding declaration,
 * which is not part of this file.
 */
const mbfl_encoding mbfl_encoding_utf16le = {
	mbfl_no_encoding_utf16le,
	"UTF-16LE",
	"UTF-16LE",
	NULL,
	NULL,
	MBFL_ENCTYPE_MWC2,
	&vtbl_utf16le_wchar,
	&vtbl_wchar_utf16le,
	NULL
};
72
/*
 * Conversion table: generic UTF-16 -> wchar.
 *
 * Uses the byte-order-detecting decoder as the per-byte filter and the shared
 * UTF-16 flush handler. The initializer is positional; the slots appear to be
 * source id, destination id, constructor, destructor (none), filter function,
 * flush function, and one unused slot -- NOTE(review): confirm against the
 * struct mbfl_convert_vtbl declaration, which is not part of this file.
 */
const struct mbfl_convert_vtbl vtbl_utf16_wchar = {
	mbfl_no_encoding_utf16,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_utf16_wchar,
	mbfl_filt_conv_utf16_wchar_flush,
	NULL,
};
82
/*
 * Conversion table: wchar -> generic UTF-16.
 *
 * Note that the filter function is the big-endian encoder, so output for the
 * generic "UTF-16" encoding is written big-endian; that encoder writes only
 * the code units themselves (it emits no byte order mark). The initializer is
 * positional; the slots appear to be source id, destination id, constructor,
 * destructor (none), filter function, flush function, and one unused slot --
 * NOTE(review): confirm against the struct mbfl_convert_vtbl declaration.
 */
const struct mbfl_convert_vtbl vtbl_wchar_utf16 = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_utf16,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_utf16be,
	mbfl_filt_conv_common_flush,
	NULL,
};
92
/*
 * Conversion table: UTF-16BE -> wchar.
 *
 * Uses the big-endian decoder and the shared UTF-16 flush handler. The
 * initializer is positional; the slots appear to be source id, destination
 * id, constructor, destructor (none), filter function, flush function, and
 * one unused slot -- NOTE(review): confirm against the struct
 * mbfl_convert_vtbl declaration, which is not part of this file.
 */
const struct mbfl_convert_vtbl vtbl_utf16be_wchar = {
	mbfl_no_encoding_utf16be,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_utf16be_wchar,
	mbfl_filt_conv_utf16_wchar_flush,
	NULL,
};
102
/*
 * Conversion table: wchar -> UTF-16BE.
 *
 * Uses the big-endian encoder and the common flush handler. The initializer
 * is positional; the slots appear to be source id, destination id,
 * constructor, destructor (none), filter function, flush function, and one
 * unused slot -- NOTE(review): confirm against the struct mbfl_convert_vtbl
 * declaration, which is not part of this file.
 */
const struct mbfl_convert_vtbl vtbl_wchar_utf16be = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_utf16be,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_utf16be,
	mbfl_filt_conv_common_flush,
	NULL,
};
112
/*
 * Conversion table: UTF-16LE -> wchar.
 *
 * Uses the little-endian decoder and the shared UTF-16 flush handler. The
 * initializer is positional; the slots appear to be source id, destination
 * id, constructor, destructor (none), filter function, flush function, and
 * one unused slot -- NOTE(review): confirm against the struct
 * mbfl_convert_vtbl declaration, which is not part of this file.
 */
const struct mbfl_convert_vtbl vtbl_utf16le_wchar = {
	mbfl_no_encoding_utf16le,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_utf16le_wchar,
	mbfl_filt_conv_utf16_wchar_flush,
	NULL,
};
122
/*
 * Conversion table: wchar -> UTF-16LE.
 *
 * Uses the little-endian encoder and the common flush handler. The
 * initializer is positional; the slots appear to be source id, destination
 * id, constructor, destructor (none), filter function, flush function, and
 * one unused slot -- NOTE(review): confirm against the struct
 * mbfl_convert_vtbl declaration, which is not part of this file.
 */
const struct mbfl_convert_vtbl vtbl_wchar_utf16le = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_utf16le,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_utf16le,
	mbfl_filt_conv_common_flush,
	NULL,
};
132
/*
 * CK ("check"): evaluate `statement` exactly once and, if its value is
 * negative, make the *enclosing function* return -1 immediately.
 * Only usable inside functions returning int. Wrapped in do/while(0) so it
 * behaves as a single statement after if/else.
 */
#define CK(statement) \
	do { \
		if ((statement) < 0) { \
			return (-1); \
		} \
	} while (0)
134
/*
 * Decode one byte of a UTF-16 stream whose byte order is not yet known.
 *
 * The first two bytes are read as a big-endian 16-bit unit:
 *   - 0xFFFE (a byte-swapped BOM): switch the filter to the little-endian
 *     decoder and emit nothing;
 *   - 0xFEFF (a big-endian BOM): switch to the big-endian decoder and emit
 *     nothing;
 *   - anything else: switch to the big-endian decoder and treat the unit as
 *     data (high surrogate is stashed, lone low surrogate is reported as
 *     MBFL_BAD_INPUT, any other value is emitted as-is).
 * After the first unit, filter->filter_function points at the chosen
 * endian-specific decoder, which is what handles the remaining bytes.
 *
 * c:      next input byte (only the low 8 bits are used)
 * filter: conversion filter holding the decoder state (status/cache)
 *
 * Returns 0 on success, -1 if the output callback reported an error.
 */
int mbfl_filt_conv_utf16_wchar(int c, mbfl_convert_filter *filter)
{
	/* Start with the assumption that the string is big-endian;
	 * If we find a little-endian BOM, then we will change that assumption */
	if (filter->status == 0) {
		filter->cache = c & 0xFF;
		filter->status = 1;
	} else {
		int n = (filter->cache << 8) | (c & 0xFF);

		/* The 16-bit unit is complete. Reset the state *before* any output is
		 * attempted, so that a failing output callback cannot leave status
		 * non-zero with a stale cache (the flush handler would then report a
		 * truncated input that never existed). */
		filter->cache = filter->status = 0;

		if (n == 0xFFFE) {
			/* Switch to little-endian mode */
			filter->filter_function = mbfl_filt_conv_utf16le_wchar;
		} else {
			filter->filter_function = mbfl_filt_conv_utf16be_wchar;
			if (n >= 0xD800 && n <= 0xDBFF) {
				/* High surrogate: keep its 10 data bits and hand over to the
				 * big-endian decoder in its "expecting low surrogate" state */
				filter->cache = n & 0x3FF;
				filter->status = 2;
			} else if (n >= 0xDC00 && n <= 0xDFFF) {
				/* This is wrong; second part of surrogate pair has come first */
				CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			} else if (n != 0xFEFF) {
				/* Ordinary BMP codepoint; a big-endian BOM is silently dropped */
				CK((*filter->output_function)(n, filter->data));
			}
		}
	}

	return 0;
}
166
/*
 * Decode one byte of a big-endian UTF-16 stream.
 *
 * filter->status tracks the position inside the current codepoint:
 *   0 - expecting the first (high) byte of a 16-bit unit
 *   1 - expecting the second (low) byte of that unit
 *   2 - a high surrogate was seen; expecting the first byte of the low one
 *   3 - expecting the second byte of the low surrogate
 * filter->cache holds the bytes/bits collected so far.
 *
 * Malformed sequences (a low surrogate without a preceding high surrogate,
 * or a high surrogate not followed by a low one) are reported by emitting
 * MBFL_BAD_INPUT.
 *
 * c:      next input byte (only the low 8 bits are used)
 * filter: conversion filter holding the decoder state
 *
 * Returns 0 on success, -1 if the output callback reported an error.
 *
 * The state for the *next* byte is always stored before the output callback
 * is invoked, so a failing callback cannot leave a stale status behind (which
 * the flush handler would otherwise misreport as truncated input).
 */
int mbfl_filt_conv_utf16be_wchar(int c, mbfl_convert_filter *filter)
{
	int n;

	switch (filter->status) {
	case 0: /* First byte */
		filter->cache = c & 0xFF;
		filter->status = 1;
		break;

	case 1: /* Second byte */
		n = (filter->cache << 8) | (c & 0xFF);
		if (n >= 0xD800 && n <= 0xDBFF) {
			filter->cache = n & 0x3FF; /* Pick out 10 data bits */
			filter->status = 2;
		} else if (n >= 0xDC00 && n <= 0xDFFF) {
			/* This is wrong; second part of surrogate pair has come first */
			filter->status = 0;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		} else {
			filter->status = 0;
			CK((*filter->output_function)(n, filter->data));
		}
		break;

	case 2: /* Second part of surrogate, first byte */
		/* cache now holds: [10 bits of high surrogate][first byte of next unit] */
		filter->cache = (filter->cache << 8) | (c & 0xFF);
		filter->status = 3;
		break;

	case 3: /* Second part of surrogate, second byte */
		n = ((filter->cache & 0xFF) << 8) | (c & 0xFF);
		if (n >= 0xD800 && n <= 0xDBFF) {
			/* Wrong; that's the first half of a surrogate pair, not the second.
			 * Report the orphaned first half and start over with this one. */
			filter->cache = n & 0x3FF;
			filter->status = 2;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		} else if (n >= 0xDC00 && n <= 0xDFFF) {
			/* Valid pair: bits 8..17 of cache are the high surrogate's 10 data
			 * bits; move them to bits 10..19 and add the low surrogate's bits */
			n = ((filter->cache & 0x3FF00) << 2) + (n & 0x3FF) + 0x10000;
			filter->status = 0;
			CK((*filter->output_function)(n, filter->data));
		} else {
			/* High surrogate followed by an ordinary codepoint: report the
			 * orphaned surrogate, then pass the ordinary codepoint through */
			filter->status = 0;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			CK((*filter->output_function)(n, filter->data));
		}
		break;
	}

	return 0;
}
217
/*
 * Encode one codepoint as big-endian UTF-16 (high byte first).
 *
 *   - 0 <= c < MBFL_WCSPLANE_UCS2MAX: written as a single 16-bit unit.
 *   - supplementary codepoints up to U+10FFFF: written as a surrogate pair
 *     (high surrogate 0xD800.., then low surrogate 0xDC00..).
 *   - anything else: handed to mbfl_filt_conv_illegal_output().
 *
 * No byte order mark is written.
 *
 * c:      codepoint to encode
 * filter: conversion filter providing the output callback
 *
 * Returns 0 on success, -1 if an output call reported an error.
 */
int mbfl_filt_conv_wchar_utf16be(int c, mbfl_convert_filter *filter)
{
	int n;

	if (c >= 0 && c < MBFL_WCSPLANE_UCS2MAX) {
		CK((*filter->output_function)((c >> 8) & 0xff, filter->data));
		CK((*filter->output_function)(c & 0xff, filter->data));
	} else if (c >= MBFL_WCSPLANE_SUPMIN && c < MBFL_WCSPLANE_SUPMAX && c <= 0x10FFFF) {
		/* A surrogate pair carries 20 bits, so U+10FFFF is the largest value
		 * UTF-16 can represent. The explicit bound matters if
		 * MBFL_WCSPLANE_SUPMAX is larger than 0x110000 (its value is defined
		 * outside this file): for such inputs the "high" unit computed below
		 * would be >= 0xDC00, i.e. not a high surrogate at all. */
		n = ((c >> 10) - 0x40) | 0xd800;
		CK((*filter->output_function)((n >> 8) & 0xff, filter->data));
		CK((*filter->output_function)(n & 0xff, filter->data));
		n = (c & 0x3ff) | 0xdc00;
		CK((*filter->output_function)((n >> 8) & 0xff, filter->data));
		CK((*filter->output_function)(n & 0xff, filter->data));
	} else {
		CK(mbfl_filt_conv_illegal_output(c, filter));
	}

	return 0;
}
238
/*
 * Decode one byte of a little-endian UTF-16 stream.
 *
 * filter->status tracks the position inside the current codepoint:
 *   0 - expecting the first (low) byte of a 16-bit unit
 *   1 - expecting the second (high) byte of that unit
 *   2 - a high surrogate was seen; expecting the low byte of the low one
 *   3 - expecting the high byte of the low surrogate
 * filter->cache holds the bytes/bits collected so far.
 *
 * Malformed sequences (a low surrogate without a preceding high surrogate,
 * or a high surrogate not followed by a low one) are reported by emitting
 * MBFL_BAD_INPUT.
 *
 * c:      next input byte
 * filter: conversion filter holding the decoder state
 *
 * Returns 0 on success, -1 if the output callback reported an error.
 *
 * The state for the *next* byte is always stored before the output callback
 * is invoked, so a failing callback cannot leave a stale status behind (which
 * the flush handler would otherwise misreport as truncated input).
 */
int mbfl_filt_conv_utf16le_wchar(int c, mbfl_convert_filter *filter)
{
	int n;

	switch (filter->status) {
	case 0:
		filter->cache = c & 0xff;
		filter->status = 1;
		break;

	case 1:
		if ((c & 0xfc) == 0xd8) {
			/* Looks like we have a surrogate pair here;
			 * cache becomes the 10 data bits of the high surrogate */
			filter->cache += ((c & 0x3) << 8);
			filter->status = 2;
		} else if ((c & 0xfc) == 0xdc) {
			/* This is wrong; the second part of the surrogate pair has come first */
			filter->status = 0;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		} else {
			filter->status = 0;
			CK((*filter->output_function)(filter->cache + ((c & 0xff) << 8), filter->data));
		}
		break;

	case 2:
		/* Shift the high surrogate's bits into their final position (bits
		 * 10..19) and park the low byte of the next unit in bits 0..7 */
		filter->cache = (filter->cache << 10) + (c & 0xff);
		filter->status = 3;
		break;

	case 3:
		n = (filter->cache & 0xFF) | ((c & 0xFF) << 8);
		if (n >= 0xD800 && n <= 0xDBFF) {
			/* We previously saw the first part of a surrogate pair and were
			 * expecting the second part; this is another first part */
			filter->cache = n & 0x3FF;
			filter->status = 2;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		} else if (n >= 0xDC00 && n <= 0xDFFF) {
			/* Valid pair: cache already holds bits 10..19 and 0..7; the two
			 * low bits of this byte supply bits 8..9 */
			n = filter->cache + ((c & 0x3) << 8) + 0x10000;
			filter->status = 0;
			CK((*filter->output_function)(n, filter->data));
		} else {
			/* The first part of a surrogate pair was followed by some other codepoint
			 * which is not part of a surrogate pair at all */
			filter->status = 0;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			CK((*filter->output_function)(n, filter->data));
		}
		break;
	}

	return 0;
}
293
/*
 * Encode one codepoint as little-endian UTF-16 (low byte first).
 *
 *   - 0 <= c < MBFL_WCSPLANE_UCS2MAX: written as a single 16-bit unit.
 *   - supplementary codepoints up to U+10FFFF: written as a surrogate pair
 *     (high surrogate 0xD800.., then low surrogate 0xDC00..), each unit with
 *     its low byte first.
 *   - anything else: handed to mbfl_filt_conv_illegal_output().
 *
 * No byte order mark is written.
 *
 * c:      codepoint to encode
 * filter: conversion filter providing the output callback
 *
 * Returns 0 on success, -1 if an output call reported an error.
 */
int mbfl_filt_conv_wchar_utf16le(int c, mbfl_convert_filter *filter)
{
	int n;

	if (c >= 0 && c < MBFL_WCSPLANE_UCS2MAX) {
		CK((*filter->output_function)(c & 0xff, filter->data));
		CK((*filter->output_function)((c >> 8) & 0xff, filter->data));
	} else if (c >= MBFL_WCSPLANE_SUPMIN && c < MBFL_WCSPLANE_SUPMAX && c <= 0x10FFFF) {
		/* A surrogate pair carries 20 bits, so U+10FFFF is the largest value
		 * UTF-16 can represent. The explicit bound matters if
		 * MBFL_WCSPLANE_SUPMAX is larger than 0x110000 (its value is defined
		 * outside this file): for such inputs the "high" unit computed below
		 * would be >= 0xDC00, i.e. not a high surrogate at all. */
		n = ((c >> 10) - 0x40) | 0xd800;
		CK((*filter->output_function)(n & 0xff, filter->data));
		CK((*filter->output_function)((n >> 8) & 0xff, filter->data));
		n = (c & 0x3ff) | 0xdc00;
		CK((*filter->output_function)(n & 0xff, filter->data));
		CK((*filter->output_function)((n >> 8) & 0xff, filter->data));
	} else {
		CK(mbfl_filt_conv_illegal_output(c, filter));
	}

	return 0;
}
314
/*
 * End-of-input handler shared by the UTF-16, UTF-16BE and UTF-16LE decoders.
 *
 * A non-zero status means the input stopped in the middle of a code unit or
 * surrogate pair; in that case the state is cleared and MBFL_BAD_INPUT is
 * emitted. Afterwards the downstream flush callback, if any, is invoked (its
 * return value is not inspected).
 *
 * Returns 0 on success, -1 if emitting the error marker failed; in the
 * latter case the downstream flush callback is not called.
 */
static int mbfl_filt_conv_utf16_wchar_flush(mbfl_convert_filter *filter)
{
	int pending = filter->status;

	if (pending) {
		int rc;

		/* Input string was truncated */
		filter->status = 0;
		rc = (*filter->output_function)(MBFL_BAD_INPUT, filter->data);
		if (rc < 0) {
			return (-1);
		}
	}

	if (filter->flush_function) {
		(*filter->flush_function)(filter->data);
	}

	return 0;
}
329