1 /*
2  * "streamable kanji code filter and converter"
3  * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
4  *
5  * LICENSE NOTICES
6  *
7  * This file is part of "streamable kanji code filter and converter",
8  * which is distributed under the terms of GNU Lesser General Public
9  * License (version 2) as published by the Free Software Foundation.
10  *
11  * This software is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with "streamable kanji code filter and converter";
18  * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
19  * Suite 330, Boston, MA  02111-1307  USA
20  *
21  * The author of this file:
22  *
23  */
/*
 * The source code included in this file was separated from mbfilter.c
 * by moriyoshi koizumi <moriyoshi@php.net> on 4 dec 2002.
 *
 */
29 
30 #ifdef HAVE_CONFIG_H
31 #include "config.h"
32 #endif
33 
34 #include "mbfilter.h"
35 #include "mbfilter_utf16.h"
36 
/* NULL-terminated list of alternative names accepted for "UTF-16". */
static const char *mbfl_encoding_utf16_aliases[] = {
	"utf16",
	NULL
};
38 
39 const mbfl_encoding mbfl_encoding_utf16 = {
40 	mbfl_no_encoding_utf16,
41 	"UTF-16",
42 	"UTF-16",
43 	(const char *(*)[])&mbfl_encoding_utf16_aliases,
44 	NULL,
45 	MBFL_ENCTYPE_MWC2BE,
46 	&vtbl_utf16_wchar,
47 	&vtbl_wchar_utf16
48 };
49 
50 const mbfl_encoding mbfl_encoding_utf16be = {
51 	mbfl_no_encoding_utf16be,
52 	"UTF-16BE",
53 	"UTF-16BE",
54 	NULL,
55 	NULL,
56 	MBFL_ENCTYPE_MWC2BE,
57 	&vtbl_utf16be_wchar,
58 	&vtbl_wchar_utf16be
59 };
60 
61 const mbfl_encoding mbfl_encoding_utf16le = {
62 	mbfl_no_encoding_utf16le,
63 	"UTF-16LE",
64 	"UTF-16LE",
65 	NULL,
66 	NULL,
67 	MBFL_ENCTYPE_MWC2LE,
68 	&vtbl_utf16le_wchar,
69 	&vtbl_wchar_utf16le
70 };
71 
72 const struct mbfl_convert_vtbl vtbl_utf16_wchar = {
73 	mbfl_no_encoding_utf16,
74 	mbfl_no_encoding_wchar,
75 	mbfl_filt_conv_common_ctor,
76 	mbfl_filt_conv_common_dtor,
77 	mbfl_filt_conv_utf16_wchar,
78 	mbfl_filt_conv_common_flush
79 };
80 
81 const struct mbfl_convert_vtbl vtbl_wchar_utf16 = {
82 	mbfl_no_encoding_wchar,
83 	mbfl_no_encoding_utf16,
84 	mbfl_filt_conv_common_ctor,
85 	mbfl_filt_conv_common_dtor,
86 	mbfl_filt_conv_wchar_utf16be,
87 	mbfl_filt_conv_common_flush
88 };
89 
90 const struct mbfl_convert_vtbl vtbl_utf16be_wchar = {
91 	mbfl_no_encoding_utf16be,
92 	mbfl_no_encoding_wchar,
93 	mbfl_filt_conv_common_ctor,
94 	mbfl_filt_conv_common_dtor,
95 	mbfl_filt_conv_utf16be_wchar,
96 	mbfl_filt_conv_common_flush
97 };
98 
99 const struct mbfl_convert_vtbl vtbl_wchar_utf16be = {
100 	mbfl_no_encoding_wchar,
101 	mbfl_no_encoding_utf16be,
102 	mbfl_filt_conv_common_ctor,
103 	mbfl_filt_conv_common_dtor,
104 	mbfl_filt_conv_wchar_utf16be,
105 	mbfl_filt_conv_common_flush
106 };
107 
108 const struct mbfl_convert_vtbl vtbl_utf16le_wchar = {
109 	mbfl_no_encoding_utf16le,
110 	mbfl_no_encoding_wchar,
111 	mbfl_filt_conv_common_ctor,
112 	mbfl_filt_conv_common_dtor,
113 	mbfl_filt_conv_utf16le_wchar,
114 	mbfl_filt_conv_common_flush
115 };
116 
117 const struct mbfl_convert_vtbl vtbl_wchar_utf16le = {
118 	mbfl_no_encoding_wchar,
119 	mbfl_no_encoding_utf16le,
120 	mbfl_filt_conv_common_ctor,
121 	mbfl_filt_conv_common_dtor,
122 	mbfl_filt_conv_wchar_utf16le,
123 	mbfl_filt_conv_common_flush
124 };
125 
/*
 * Evaluate `statement` once; if it yields a negative value, make the
 * enclosing function return -1 immediately.
 */
#define CK(statement) \
	do { \
		if ((statement) < 0) { \
			return (-1); \
		} \
	} while (0)
127 
128 /*
129  * UTF-16 => wchar
130  */
mbfl_filt_conv_utf16_wchar(int c,mbfl_convert_filter * filter)131 int mbfl_filt_conv_utf16_wchar(int c, mbfl_convert_filter *filter)
132 {
133 	int n, endian;
134 
135 	endian = filter->status & 0xff00;
136 	switch (filter->status & 0x0f) {
137 	case 0:
138 		if (endian) {
139 			n = c & 0xff;
140 		} else {
141 			n = (c & 0xff) << 8;
142 		}
143 		filter->cache |= n;
144 		filter->status++;
145 		break;
146 	default:
147 		if (endian) {
148 			n = (c & 0xff) << 8;
149 		} else {
150 			n = c & 0xff;
151 		}
152 		n |= filter->cache & 0xffff;
153 		filter->status &= ~0x0f;
154 		if (n >= 0xd800 && n < 0xdc00) {
155 			filter->cache = ((n & 0x3ff) << 16) + 0x400000;
156 		} else if (n >= 0xdc00 && n < 0xe000) {
157 			n &= 0x3ff;
158 			n |= (filter->cache & 0xfff0000) >> 6;
159 			filter->cache = 0;
160 			if (n >= MBFL_WCSPLANE_SUPMIN && n < MBFL_WCSPLANE_SUPMAX) {
161 				CK((*filter->output_function)(n, filter->data));
162 			} else {		/* illegal character */
163 				n &= MBFL_WCSGROUP_MASK;
164 				n |= MBFL_WCSGROUP_THROUGH;
165 				CK((*filter->output_function)(n, filter->data));
166 			}
167 		} else {
168 			int is_first = filter->status & 0x10;
169 			filter->cache = 0;
170 			filter->status |= 0x10;
171 			if (!is_first) {
172 				if (n == 0xfffe) {
173 					if (endian) {
174 						filter->status &= ~0x100;		/* big-endian */
175 					} else {
176 						filter->status |= 0x100;		/* little-endian */
177 					}
178 					break;
179 				} else if (n == 0xfeff) {
180 					break;
181 				}
182 			}
183 			CK((*filter->output_function)(n, filter->data));
184 		}
185 		break;
186 	}
187 
188 	return c;
189 }
190 
191 /*
192  * UTF-16BE => wchar
193  */
mbfl_filt_conv_utf16be_wchar(int c,mbfl_convert_filter * filter)194 int mbfl_filt_conv_utf16be_wchar(int c, mbfl_convert_filter *filter)
195 {
196 	int n;
197 
198 	switch (filter->status) {
199 	case 0:
200 		filter->status = 1;
201 		n = (c & 0xff) << 8;
202 		filter->cache |= n;
203 		break;
204 	default:
205 		filter->status = 0;
206 		n = (filter->cache & 0xff00) | (c & 0xff);
207 		if (n >= 0xd800 && n < 0xdc00) {
208 			filter->cache = ((n & 0x3ff) << 16) + 0x400000;
209 		} else if (n >= 0xdc00 && n < 0xe000) {
210 			n &= 0x3ff;
211 			n |= (filter->cache & 0xfff0000) >> 6;
212 			filter->cache = 0;
213 			if (n >= MBFL_WCSPLANE_SUPMIN && n < MBFL_WCSPLANE_SUPMAX) {
214 				CK((*filter->output_function)(n, filter->data));
215 			} else {		/* illegal character */
216 				n &= MBFL_WCSGROUP_MASK;
217 				n |= MBFL_WCSGROUP_THROUGH;
218 				CK((*filter->output_function)(n, filter->data));
219 			}
220 		} else {
221 			filter->cache = 0;
222 			CK((*filter->output_function)(n, filter->data));
223 		}
224 		break;
225 	}
226 
227 	return c;
228 }
229 
230 /*
231  * wchar => UTF-16BE
232  */
mbfl_filt_conv_wchar_utf16be(int c,mbfl_convert_filter * filter)233 int mbfl_filt_conv_wchar_utf16be(int c, mbfl_convert_filter *filter)
234 {
235 	int n;
236 
237 	if (c >= 0 && c < MBFL_WCSPLANE_UCS2MAX) {
238 		CK((*filter->output_function)((c >> 8) & 0xff, filter->data));
239 		CK((*filter->output_function)(c & 0xff, filter->data));
240 	} else if (c >= MBFL_WCSPLANE_SUPMIN && c < MBFL_WCSPLANE_SUPMAX) {
241 		n = ((c >> 10) - 0x40) | 0xd800;
242 		CK((*filter->output_function)((n >> 8) & 0xff, filter->data));
243 		CK((*filter->output_function)(n & 0xff, filter->data));
244 		n = (c & 0x3ff) | 0xdc00;
245 		CK((*filter->output_function)((n >> 8) & 0xff, filter->data));
246 		CK((*filter->output_function)(n & 0xff, filter->data));
247 	} else {
248 		CK(mbfl_filt_conv_illegal_output(c, filter));
249 	}
250 
251 	return c;
252 }
253 
254 /*
255  * UTF-16LE => wchar
256  */
mbfl_filt_conv_utf16le_wchar(int c,mbfl_convert_filter * filter)257 int mbfl_filt_conv_utf16le_wchar(int c, mbfl_convert_filter *filter)
258 {
259 	int n;
260 
261 	switch (filter->status) {
262 	case 0:
263 		filter->status = 1;
264 		n = c & 0xff;
265 		filter->cache |= n;
266 		break;
267 	default:
268 		filter->status = 0;
269 		n = (filter->cache & 0xff) | ((c & 0xff) << 8);
270 		if (n >= 0xd800 && n < 0xdc00) {
271 			filter->cache = ((n & 0x3ff) << 16) + 0x400000;
272 		} else if (n >= 0xdc00 && n < 0xe000) {
273 			n &= 0x3ff;
274 			n |= (filter->cache & 0xfff0000) >> 6;
275 			filter->cache = 0;
276 			if (n >= MBFL_WCSPLANE_SUPMIN && n < MBFL_WCSPLANE_SUPMAX) {
277 				CK((*filter->output_function)(n, filter->data));
278 			} else {		/* illegal character */
279 				n &= MBFL_WCSGROUP_MASK;
280 				n |= MBFL_WCSGROUP_THROUGH;
281 				CK((*filter->output_function)(n, filter->data));
282 			}
283 		} else {
284 			filter->cache = 0;
285 			CK((*filter->output_function)(n, filter->data));
286 		}
287 		break;
288 	}
289 
290 	return c;
291 }
292 
293 /*
294  * wchar => UTF-16LE
295  */
mbfl_filt_conv_wchar_utf16le(int c,mbfl_convert_filter * filter)296 int mbfl_filt_conv_wchar_utf16le(int c, mbfl_convert_filter *filter)
297 {
298 	int n;
299 
300 	if (c >= 0 && c < MBFL_WCSPLANE_UCS2MAX) {
301 		CK((*filter->output_function)(c & 0xff, filter->data));
302 		CK((*filter->output_function)((c >> 8) & 0xff, filter->data));
303 	} else if (c >= MBFL_WCSPLANE_SUPMIN && c < MBFL_WCSPLANE_SUPMAX) {
304 		n = ((c >> 10) - 0x40) | 0xd800;
305 		CK((*filter->output_function)(n & 0xff, filter->data));
306 		CK((*filter->output_function)((n >> 8) & 0xff, filter->data));
307 		n = (c & 0x3ff) | 0xdc00;
308 		CK((*filter->output_function)(n & 0xff, filter->data));
309 		CK((*filter->output_function)((n >> 8) & 0xff, filter->data));
310 	} else {
311 		CK(mbfl_filt_conv_illegal_output(c, filter));
312 	}
313 
314 	return c;
315 }
316