1 /*
2  * "streamable kanji code filter and converter"
3  * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
4  *
5  * LICENSE NOTICES
6  *
7  * This file is part of "streamable kanji code filter and converter",
8  * which is distributed under the terms of GNU Lesser General Public
9  * License (version 2) as published by the Free Software Foundation.
10  *
11  * This software is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with "streamable kanji code filter and converter";
18  * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
19  * Suite 330, Boston, MA  02111-1307  USA
20  *
21  * The author of this file:
22  *
23  */
24 /*
 * The source code included in this file was separated from mbfilter.c
26  * by moriyoshi koizumi <moriyoshi@php.net> on 4 dec 2002.
27  *
28  */
29 
30 #include "mbfilter.h"
31 #include "mbfilter_utf16.h"
32 
/* Forward declaration: the flush handler is defined at the end of this file
 * but is referenced by the decoder conversion tables further up. */
static int mbfl_filt_conv_utf16_wchar_flush(mbfl_convert_filter *filter);
34 
/* Alias list referenced by the mbfl_encoding_utf16 descriptor.
 * The list is terminated by a NULL entry. */
static const char *mbfl_encoding_utf16_aliases[] = {
	"utf16",
	NULL
};
36 
/* Encoding descriptor for "UTF-16" (no byte order in the name).
 * The initializer is positional; the meaning of each slot is fixed by the
 * mbfl_encoding type, which is declared outside this file. It carries the
 * alias list defined above and points at the vtbl_utf16_wchar and
 * vtbl_wchar_utf16 conversion tables. */
const mbfl_encoding mbfl_encoding_utf16 = {
	mbfl_no_encoding_utf16,
	"UTF-16",
	"UTF-16",
	mbfl_encoding_utf16_aliases,
	NULL,
	MBFL_ENCTYPE_MWC2,
	&vtbl_utf16_wchar,
	&vtbl_wchar_utf16,
	NULL
};
48 
/* Encoding descriptor for "UTF-16BE".
 * The initializer is positional; the meaning of each slot is fixed by the
 * mbfl_encoding type, which is declared outside this file. No alias list is
 * supplied (NULL); the descriptor points at the vtbl_utf16be_wchar and
 * vtbl_wchar_utf16be conversion tables. */
const mbfl_encoding mbfl_encoding_utf16be = {
	mbfl_no_encoding_utf16be,
	"UTF-16BE",
	"UTF-16BE",
	NULL,
	NULL,
	MBFL_ENCTYPE_MWC2,
	&vtbl_utf16be_wchar,
	&vtbl_wchar_utf16be,
	NULL
};
60 
/* Encoding descriptor for "UTF-16LE".
 * The initializer is positional; the meaning of each slot is fixed by the
 * mbfl_encoding type, which is declared outside this file. No alias list is
 * supplied (NULL); the descriptor points at the vtbl_utf16le_wchar and
 * vtbl_wchar_utf16le conversion tables. */
const mbfl_encoding mbfl_encoding_utf16le = {
	mbfl_no_encoding_utf16le,
	"UTF-16LE",
	"UTF-16LE",
	NULL,
	NULL,
	MBFL_ENCTYPE_MWC2,
	&vtbl_utf16le_wchar,
	&vtbl_wchar_utf16le,
	NULL
};
72 
/* Conversion table pairing mbfl_no_encoding_utf16 with mbfl_no_encoding_wchar.
 * Entries are positional; their meaning is fixed by struct mbfl_convert_vtbl,
 * which is declared outside this file. The handler installed here,
 * mbfl_filt_conv_utf16_wchar, examines the first two input bytes for a byte
 * order mark and then replaces itself with the big- or little-endian decoder.
 * mbfl_filt_conv_utf16_wchar_flush reports input that ended mid-character. */
const struct mbfl_convert_vtbl vtbl_utf16_wchar = {
	mbfl_no_encoding_utf16,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_utf16_wchar,
	mbfl_filt_conv_utf16_wchar_flush,
	NULL,
};
82 
/* Conversion table pairing mbfl_no_encoding_wchar with mbfl_no_encoding_utf16.
 * Entries are positional; their meaning is fixed by struct mbfl_convert_vtbl,
 * which is declared outside this file. Note that the handler is
 * mbfl_filt_conv_wchar_utf16be: output for plain "UTF-16" is produced by the
 * big-endian encoder, which does not write a byte order mark. */
const struct mbfl_convert_vtbl vtbl_wchar_utf16 = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_utf16,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_utf16be,
	mbfl_filt_conv_common_flush,
	NULL,
};
92 
/* Conversion table pairing mbfl_no_encoding_utf16be with
 * mbfl_no_encoding_wchar. Entries are positional; their meaning is fixed by
 * struct mbfl_convert_vtbl, which is declared outside this file. Bytes are
 * handled by mbfl_filt_conv_utf16be_wchar, and
 * mbfl_filt_conv_utf16_wchar_flush reports input that ended mid-character. */
const struct mbfl_convert_vtbl vtbl_utf16be_wchar = {
	mbfl_no_encoding_utf16be,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_utf16be_wchar,
	mbfl_filt_conv_utf16_wchar_flush,
	NULL,
};
102 
/* Conversion table pairing mbfl_no_encoding_wchar with
 * mbfl_no_encoding_utf16be. Entries are positional; their meaning is fixed by
 * struct mbfl_convert_vtbl, which is declared outside this file. Code points
 * are handled by mbfl_filt_conv_wchar_utf16be. */
const struct mbfl_convert_vtbl vtbl_wchar_utf16be = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_utf16be,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_utf16be,
	mbfl_filt_conv_common_flush,
	NULL,
};
112 
/* Conversion table pairing mbfl_no_encoding_utf16le with
 * mbfl_no_encoding_wchar. Entries are positional; their meaning is fixed by
 * struct mbfl_convert_vtbl, which is declared outside this file. Bytes are
 * handled by mbfl_filt_conv_utf16le_wchar, and
 * mbfl_filt_conv_utf16_wchar_flush reports input that ended mid-character. */
const struct mbfl_convert_vtbl vtbl_utf16le_wchar = {
	mbfl_no_encoding_utf16le,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_utf16le_wchar,
	mbfl_filt_conv_utf16_wchar_flush,
	NULL,
};
122 
/* Conversion table pairing mbfl_no_encoding_wchar with
 * mbfl_no_encoding_utf16le. Entries are positional; their meaning is fixed by
 * struct mbfl_convert_vtbl, which is declared outside this file. Code points
 * are handled by mbfl_filt_conv_wchar_utf16le. */
const struct mbfl_convert_vtbl vtbl_wchar_utf16le = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_utf16le,
	mbfl_filt_conv_common_ctor,
	NULL,
	mbfl_filt_conv_wchar_utf16le,
	mbfl_filt_conv_common_flush,
	NULL,
};
132 
/* Evaluate `statement` once and, if its value is negative, make the
 * enclosing function return -1 immediately. Wrapped in do/while(0) so it
 * behaves as a single statement. */
#define CK(statement) \
	do { \
		if ((statement) < 0) { \
			return -1; \
		} \
	} while (0)
134 
/*
 * Decode one byte of "UTF-16" input whose byte order has not been decided yet.
 *
 * The first two bytes are collected and read as a big-endian 16-bit unit:
 *   - 0xFFFE (a little-endian BOM read big-endian) switches the filter to
 *     mbfl_filt_conv_utf16le_wchar and is not emitted;
 *   - anything else switches the filter to mbfl_filt_conv_utf16be_wchar.
 *     A big-endian BOM (0xFEFF) is swallowed, a leading surrogate is stored
 *     for the big-endian decoder to complete, a trailing surrogate is
 *     reported as MBFL_BAD_INPUT, and every other unit is emitted as is.
 *
 * c      - next input byte; only the low 8 bits are used
 * filter - conversion filter whose status, cache and filter_function are
 *          updated
 *
 * Returns 0 on success, or -1 if the output function returned a negative
 * value.
 */
int mbfl_filt_conv_utf16_wchar(int c, mbfl_convert_filter *filter)
{
	int n;

	if (filter->status == 0) {
		/* First byte of the first code unit: just remember it */
		filter->cache = c & 0xFF;
		filter->status = 1;
		return 0;
	}

	n = (filter->cache << 8) | (c & 0xFF);

	/* Reset the state before anything is emitted. CK() returns early when the
	 * output function fails, and filter_function is about to be replaced; the
	 * new handler must never start from a half-consumed unit. */
	filter->cache = filter->status = 0;

	if (n == 0xFFFE) {
		/* Switch to little-endian mode */
		filter->filter_function = mbfl_filt_conv_utf16le_wchar;
		return 0;
	}

	/* No little-endian BOM: keep the big-endian assumption from here on */
	filter->filter_function = mbfl_filt_conv_utf16be_wchar;

	if (n >= 0xD800 && n <= 0xDBFF) {
		/* Leading surrogate: keep its 10 data bits; status 2 is the state in
		 * which the big-endian decoder expects the trailing surrogate */
		filter->cache = n & 0x3FF;
		filter->status = 2;
	} else if (n >= 0xDC00 && n <= 0xDFFF) {
		/* This is wrong; second part of surrogate pair has come first */
		CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
	} else if (n != 0xFEFF) {
		/* Ordinary code unit; a big-endian BOM (0xFEFF) is dropped silently */
		CK((*filter->output_function)(n, filter->data));
	}

	return 0;
}
166 
/*
 * Decode one byte of big-endian UTF-16 input.
 *
 * filter->status tracks the position inside the current character:
 *   0 - expecting the first byte of a code unit
 *   1 - expecting the second byte of a code unit
 *   2 - a leading surrogate was seen; expecting the first byte of the
 *       trailing surrogate (cache holds the 10 data bits of the leading one)
 *   3 - expecting the second byte of the trailing surrogate (cache holds the
 *       leading surrogate's data bits shifted left by 8, plus the first byte)
 *
 * Unpaired surrogates are reported by emitting MBFL_BAD_INPUT.
 *
 * The state for the next byte is always stored before anything is emitted,
 * so that an early return from CK() cannot leave the decoder stuck in the
 * middle of a unit that has already been consumed.
 *
 * c      - next input byte; only the low 8 bits are used
 * filter - conversion filter whose status and cache are updated
 *
 * Returns 0 on success, or -1 if the output function returned a negative
 * value.
 */
int mbfl_filt_conv_utf16be_wchar(int c, mbfl_convert_filter *filter)
{
	int n;

	switch (filter->status) {
	case 0: /* First byte */
		filter->cache = c & 0xFF;
		filter->status = 1;
		break;

	case 1: /* Second byte */
		n = (filter->cache << 8) | (c & 0xFF);
		if (n >= 0xD800 && n <= 0xDBFF) {
			filter->cache = n & 0x3FF; /* Pick out 10 data bits */
			filter->status = 2;
		} else if (n >= 0xDC00 && n <= 0xDFFF) {
			/* This is wrong; second part of surrogate pair has come first */
			filter->status = 0;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		} else {
			filter->status = 0;
			CK((*filter->output_function)(n, filter->data));
		}
		break;

	case 2: /* Second part of surrogate, first byte */
		filter->cache = (filter->cache << 8) | (c & 0xFF);
		filter->status = 3;
		break;

	case 3: /* Second part of surrogate, second byte */
		n = ((filter->cache & 0xFF) << 8) | (c & 0xFF);
		if (n >= 0xD800 && n <= 0xDBFF) {
			/* Wrong; that's the first half of a surrogate pair, not the second.
			 * Start over with this unit as the new leading surrogate. */
			filter->cache = n & 0x3FF;
			filter->status = 2;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		} else if (n >= 0xDC00 && n <= 0xDFFF) {
			/* Bits 8..17 of cache are the leading surrogate's data bits; move
			 * them to bits 10..19 and add the trailing surrogate's data bits */
			n = ((filter->cache & 0x3FF00) << 2) + (n & 0x3FF) + 0x10000;
			filter->status = 0;
			CK((*filter->output_function)(n, filter->data));
		} else {
			/* Leading surrogate followed by a non-surrogate unit: report the
			 * broken pair, then pass the ordinary unit through */
			filter->status = 0;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			CK((*filter->output_function)(n, filter->data));
		}
		break;
	}

	return 0;
}
217 
/*
 * Encode one code point as big-endian UTF-16 and pass the resulting bytes,
 * high byte first, to the output function.
 *
 * Non-negative values below MBFL_WCSPLANE_UCS2MAX are written as a single
 * 16-bit unit. Supplementary code points are written as a surrogate pair.
 * Everything else is handed to mbfl_filt_conv_illegal_output.
 *
 * c      - code point to encode
 * filter - conversion filter providing output_function and data
 *
 * Returns 0 on success, or -1 if an output call returned a negative value.
 */
int mbfl_filt_conv_wchar_utf16be(int c, mbfl_convert_filter *filter)
{
	int n;

	if (c >= 0 && c < MBFL_WCSPLANE_UCS2MAX) {
		CK((*filter->output_function)((c >> 8) & 0xff, filter->data));
		CK((*filter->output_function)(c & 0xff, filter->data));
	} else if (c >= MBFL_WCSPLANE_SUPMIN && c < MBFL_WCSPLANE_SUPMAX && c <= 0x10FFFF) {
		/* A surrogate pair carries 20 data bits, so it can only express code
		 * points up to 0x10FFFF. Above that, (c >> 10) - 0x40 reaches 0x400
		 * and the "leading" unit below would land in the trailing-surrogate
		 * range. The explicit bound protects against MBFL_WCSPLANE_SUPMAX
		 * being larger than 0x110000 (its value is defined outside this
		 * file). */

		/* Leading surrogate: upper 10 bits of (c - 0x10000) */
		n = ((c >> 10) - 0x40) | 0xd800;
		CK((*filter->output_function)((n >> 8) & 0xff, filter->data));
		CK((*filter->output_function)(n & 0xff, filter->data));
		/* Trailing surrogate: lower 10 bits */
		n = (c & 0x3ff) | 0xdc00;
		CK((*filter->output_function)((n >> 8) & 0xff, filter->data));
		CK((*filter->output_function)(n & 0xff, filter->data));
	} else {
		CK(mbfl_filt_conv_illegal_output(c, filter));
	}

	return 0;
}
238 
/*
 * Decode one byte of little-endian UTF-16 input.
 *
 * filter->status tracks the position inside the current character:
 *   0 - expecting the low byte of a code unit
 *   1 - expecting the high byte of a code unit (cache holds the low byte)
 *   2 - a leading surrogate was seen; expecting the low byte of the trailing
 *       surrogate (cache holds the 10 data bits of the leading one)
 *   3 - expecting the high byte of the trailing surrogate (cache holds the
 *       leading surrogate's data bits shifted left by 10, plus the low byte)
 *
 * Unpaired surrogates are reported by emitting MBFL_BAD_INPUT.
 *
 * The state for the next byte is always stored before anything is emitted,
 * so that an early return from CK() cannot leave the decoder stuck in the
 * middle of a unit that has already been consumed.
 *
 * c      - next input byte
 * filter - conversion filter whose status and cache are updated
 *
 * Returns 0 on success, or -1 if the output function returned a negative
 * value.
 */
int mbfl_filt_conv_utf16le_wchar(int c, mbfl_convert_filter *filter)
{
	int n;

	switch (filter->status) {
	case 0: /* Low byte of a code unit */
		filter->cache = c & 0xff;
		filter->status = 1;
		break;

	case 1: /* High byte of a code unit */
		if ((c & 0xfc) == 0xd8) {
			/* Looks like we have a surrogate pair here; add the two remaining
			 * data bits from the high byte to the low byte already cached */
			filter->cache += ((c & 0x3) << 8);
			filter->status = 2;
		} else if ((c & 0xfc) == 0xdc) {
			/* This is wrong; the second part of the surrogate pair has come first */
			filter->status = 0;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		} else {
			filter->status = 0;
			CK((*filter->output_function)(filter->cache + ((c & 0xff) << 8), filter->data));
		}
		break;

	case 2: /* Trailing surrogate, low byte */
		filter->cache = (filter->cache << 10) + (c & 0xff);
		filter->status = 3;
		break;

	case 3: /* Trailing surrogate, high byte */
		n = (filter->cache & 0xFF) | ((c & 0xFF) << 8);
		if (n >= 0xD800 && n <= 0xDBFF) {
			/* We previously saw the first part of a surrogate pair and were
			 * expecting the second part; this is another first part.
			 * Start over with this unit as the new leading surrogate. */
			filter->cache = n & 0x3FF;
			filter->status = 2;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
		} else if (n >= 0xDC00 && n <= 0xDFFF) {
			/* cache already holds the leading data bits (bits 10..19) and the
			 * trailing low byte; add the last two trailing data bits */
			n = filter->cache + ((c & 0x3) << 8) + 0x10000;
			filter->status = 0;
			CK((*filter->output_function)(n, filter->data));
		} else {
			/* The first part of a surrogate pair was followed by some other codepoint
			 * which is not part of a surrogate pair at all */
			filter->status = 0;
			CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
			CK((*filter->output_function)(n, filter->data));
		}
		break;
	}

	return 0;
}
293 
/*
 * Encode one code point as little-endian UTF-16 and pass the resulting
 * bytes, low byte first, to the output function.
 *
 * Non-negative values below MBFL_WCSPLANE_UCS2MAX are written as a single
 * 16-bit unit. Supplementary code points are written as a surrogate pair.
 * Everything else is handed to mbfl_filt_conv_illegal_output.
 *
 * c      - code point to encode
 * filter - conversion filter providing output_function and data
 *
 * Returns 0 on success, or -1 if an output call returned a negative value.
 */
int mbfl_filt_conv_wchar_utf16le(int c, mbfl_convert_filter *filter)
{
	int n;

	if (c >= 0 && c < MBFL_WCSPLANE_UCS2MAX) {
		CK((*filter->output_function)(c & 0xff, filter->data));
		CK((*filter->output_function)((c >> 8) & 0xff, filter->data));
	} else if (c >= MBFL_WCSPLANE_SUPMIN && c < MBFL_WCSPLANE_SUPMAX && c <= 0x10FFFF) {
		/* A surrogate pair carries 20 data bits, so it can only express code
		 * points up to 0x10FFFF. Above that, (c >> 10) - 0x40 reaches 0x400
		 * and the "leading" unit below would land in the trailing-surrogate
		 * range. The explicit bound protects against MBFL_WCSPLANE_SUPMAX
		 * being larger than 0x110000 (its value is defined outside this
		 * file). */

		/* Leading surrogate: upper 10 bits of (c - 0x10000) */
		n = ((c >> 10) - 0x40) | 0xd800;
		CK((*filter->output_function)(n & 0xff, filter->data));
		CK((*filter->output_function)((n >> 8) & 0xff, filter->data));
		/* Trailing surrogate: lower 10 bits */
		n = (c & 0x3ff) | 0xdc00;
		CK((*filter->output_function)(n & 0xff, filter->data));
		CK((*filter->output_function)((n >> 8) & 0xff, filter->data));
	} else {
		CK(mbfl_filt_conv_illegal_output(c, filter));
	}

	return 0;
}
314 
/*
 * Flush handler installed in the UTF-16, UTF-16BE and UTF-16LE decoder
 * tables of this file.
 *
 * A non-zero status at flush time means the input stopped part-way through
 * a character; the state is cleared and MBFL_BAD_INPUT is emitted in its
 * place. Afterwards the filter's own flush_function, if one is set, is
 * invoked; its return value is not inspected.
 *
 * Returns 0 on success, or -1 if emitting MBFL_BAD_INPUT returned a negative
 * value (in which case flush_function is not called).
 */
static int mbfl_filt_conv_utf16_wchar_flush(mbfl_convert_filter *filter)
{
	if (filter->status) {
		/* Truncated input: clear the state first, then report it */
		filter->status = 0;
		if (filter->output_function(MBFL_BAD_INPUT, filter->data) < 0) {
			return -1;
		}
	}

	if (!filter->flush_function) {
		return 0;
	}

	filter->flush_function(filter->data);
	return 0;
}
329