1 /*
2  * "streamable kanji code filter and converter"
3  * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
4  *
5  * LICENSE NOTICES
6  *
7  * This file is part of "streamable kanji code filter and converter",
8  * which is distributed under the terms of GNU Lesser General Public
9  * License (version 2) as published by the Free Software Foundation.
10  *
11  * This software is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with "streamable kanji code filter and converter";
18  * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
19  * Suite 330, Boston, MA  02111-1307  USA
20  *
21  * The author of this file:
22  *
23  */
24 /*
25  * The source code included in this files was separated from mbfilter.c
26  * by moriyoshi koizumi <moriyoshi@php.net> on 4 dec 2002.
27  *
28  */
29 
30 #ifdef HAVE_CONFIG_H
31 #include "config.h"
32 #endif
33 
34 #include "mbfilter.h"
35 #include "mbfilter_utf16.h"
36 
/* NULL-terminated alias list referenced by the mbfl_encoding_utf16 descriptor. */
static const char *mbfl_encoding_utf16_aliases[] = {"utf16", NULL};
38 
/*
 * Encoding descriptor for generic UTF-16.
 *
 * Input is decoded through vtbl_utf16_wchar and output is produced through
 * vtbl_wchar_utf16.  The mbfl_encoding layout is declared outside this file,
 * so the per-field notes below are inferred from the values -- confirm
 * against the struct declaration.
 */
const mbfl_encoding mbfl_encoding_utf16 = {
	mbfl_no_encoding_utf16,		/* encoding identifier */
	"UTF-16",			/* presumably the canonical name */
	"UTF-16",			/* presumably the MIME name */
	(const char *(*)[])&mbfl_encoding_utf16_aliases,	/* alias list: "utf16" */
	NULL,
	MBFL_ENCTYPE_MWC2BE,		/* same type flag as the UTF-16BE descriptor */
	&vtbl_utf16_wchar,		/* UTF-16 => wchar */
	&vtbl_wchar_utf16		/* wchar => UTF-16 */
};
49 
/*
 * Encoding descriptor for UTF-16BE (fixed big-endian byte order).
 *
 * No alias list is supplied.  Per-field notes are inferred from the values;
 * the mbfl_encoding layout is declared outside this file -- confirm there.
 */
const mbfl_encoding mbfl_encoding_utf16be = {
	mbfl_no_encoding_utf16be,	/* encoding identifier */
	"UTF-16BE",			/* presumably the canonical name */
	"UTF-16BE",			/* presumably the MIME name */
	NULL,				/* no aliases */
	NULL,
	MBFL_ENCTYPE_MWC2BE,
	&vtbl_utf16be_wchar,		/* UTF-16BE => wchar */
	&vtbl_wchar_utf16be		/* wchar => UTF-16BE */
};
60 
/*
 * Encoding descriptor for UTF-16LE (fixed little-endian byte order).
 *
 * No alias list is supplied.  Per-field notes are inferred from the values;
 * the mbfl_encoding layout is declared outside this file -- confirm there.
 */
const mbfl_encoding mbfl_encoding_utf16le = {
	mbfl_no_encoding_utf16le,	/* encoding identifier */
	"UTF-16LE",			/* presumably the canonical name */
	"UTF-16LE",			/* presumably the MIME name */
	NULL,				/* no aliases */
	NULL,
	MBFL_ENCTYPE_MWC2LE,
	&vtbl_utf16le_wchar,		/* UTF-16LE => wchar */
	&vtbl_wchar_utf16le		/* wchar => UTF-16LE */
};
71 
/*
 * Conversion table UTF-16 => wchar.
 * Uses the shared constructor, destructor and flush routines; the per-byte
 * work is done by mbfl_filt_conv_utf16_wchar, which honours a leading BOM.
 */
const struct mbfl_convert_vtbl vtbl_utf16_wchar = {
	mbfl_no_encoding_utf16,		/* source encoding */
	mbfl_no_encoding_wchar,		/* destination encoding */
	mbfl_filt_conv_common_ctor,
	mbfl_filt_conv_common_dtor,
	mbfl_filt_conv_utf16_wchar,
	mbfl_filt_conv_common_flush
};
80 
/*
 * Conversion table wchar => UTF-16.
 * Note that the big-endian writer mbfl_filt_conv_wchar_utf16be is reused
 * here, so plain "UTF-16" output is written high byte first; that writer
 * itself emits no byte order mark.
 */
const struct mbfl_convert_vtbl vtbl_wchar_utf16 = {
	mbfl_no_encoding_wchar,		/* source encoding */
	mbfl_no_encoding_utf16,		/* destination encoding */
	mbfl_filt_conv_common_ctor,
	mbfl_filt_conv_common_dtor,
	mbfl_filt_conv_wchar_utf16be,
	mbfl_filt_conv_common_flush
};
89 
/*
 * Conversion table UTF-16BE => wchar.
 * Shared constructor/destructor/flush; bytes are consumed by
 * mbfl_filt_conv_utf16be_wchar.
 */
const struct mbfl_convert_vtbl vtbl_utf16be_wchar = {
	mbfl_no_encoding_utf16be,	/* source encoding */
	mbfl_no_encoding_wchar,		/* destination encoding */
	mbfl_filt_conv_common_ctor,
	mbfl_filt_conv_common_dtor,
	mbfl_filt_conv_utf16be_wchar,
	mbfl_filt_conv_common_flush
};
98 
/*
 * Conversion table wchar => UTF-16BE.
 * Shared constructor/destructor/flush; characters are written by
 * mbfl_filt_conv_wchar_utf16be.
 */
const struct mbfl_convert_vtbl vtbl_wchar_utf16be = {
	mbfl_no_encoding_wchar,		/* source encoding */
	mbfl_no_encoding_utf16be,	/* destination encoding */
	mbfl_filt_conv_common_ctor,
	mbfl_filt_conv_common_dtor,
	mbfl_filt_conv_wchar_utf16be,
	mbfl_filt_conv_common_flush
};
107 
/*
 * Conversion table UTF-16LE => wchar.
 * Shared constructor/destructor/flush; bytes are consumed by
 * mbfl_filt_conv_utf16le_wchar.
 */
const struct mbfl_convert_vtbl vtbl_utf16le_wchar = {
	mbfl_no_encoding_utf16le,	/* source encoding */
	mbfl_no_encoding_wchar,		/* destination encoding */
	mbfl_filt_conv_common_ctor,
	mbfl_filt_conv_common_dtor,
	mbfl_filt_conv_utf16le_wchar,
	mbfl_filt_conv_common_flush
};
116 
/*
 * Conversion table wchar => UTF-16LE.
 * Shared constructor/destructor/flush; characters are written by
 * mbfl_filt_conv_wchar_utf16le.
 */
const struct mbfl_convert_vtbl vtbl_wchar_utf16le = {
	mbfl_no_encoding_wchar,		/* source encoding */
	mbfl_no_encoding_utf16le,	/* destination encoding */
	mbfl_filt_conv_common_ctor,
	mbfl_filt_conv_common_dtor,
	mbfl_filt_conv_wchar_utf16le,
	mbfl_filt_conv_common_flush
};
125 
/*
 * CK(expr): evaluate expr exactly once and, if the result is negative,
 * return -1 from the enclosing function.  Used to propagate failures of
 * the output callbacks.  Wrapped in do/while(0) so it behaves as a single
 * statement.
 */
#define CK(expr) \
	do { \
		if ((expr) < 0) { \
			return (-1); \
		} \
	} while (0)
127 
128 /*
129  * UTF-16 => wchar
130  */
mbfl_filt_conv_utf16_wchar(int c,mbfl_convert_filter * filter)131 int mbfl_filt_conv_utf16_wchar(int c, mbfl_convert_filter *filter)
132 {
133 	int n, endian;
134 
135 	endian = filter->status & 0xff00;
136 	switch (filter->status & 0x0f) {
137 	case 0:
138 		if (endian) {
139 			n = c & 0xff;
140 		} else {
141 			n = (c & 0xff) << 8;
142 		}
143 		filter->cache |= n;
144 		filter->status++;
145 		break;
146 	default:
147 		if (endian) {
148 			n = (c & 0xff) << 8;
149 		} else {
150 			n = c & 0xff;
151 		}
152 		n |= filter->cache & 0xffff;
153 		filter->status &= ~0x0f;
154 		if (n >= 0xd800 && n < 0xdc00) {
155 			filter->cache = ((n & 0x3ff) << 16) + 0x400000;
156 		} else if (n >= 0xdc00 && n < 0xe000) {
157 			n &= 0x3ff;
158 			n |= (filter->cache & 0xfff0000) >> 6;
159 			filter->cache = 0;
160 			if (n >= MBFL_WCSPLANE_SUPMIN && n < MBFL_WCSPLANE_SUPMAX) {
161 				CK((*filter->output_function)(n, filter->data));
162 			} else {		/* illegal character */
163 				n &= MBFL_WCSGROUP_MASK;
164 				n |= MBFL_WCSGROUP_THROUGH;
165 				CK((*filter->output_function)(n, filter->data));
166 			}
167 		} else {
168 			int is_first = filter->status & 0x10;
169 			filter->cache = 0;
170 			filter->status |= 0x10;
171 			if (!is_first) {
172 				if (n == 0xfffe) {
173 					if (endian) {
174 						filter->status &= ~0x100;		/* big-endian */
175 					} else {
176 						filter->status |= 0x100;		/* little-endian */
177 					}
178 					break;
179 				} else if (n == 0xfeff) {
180 					break;
181 				}
182 			}
183 			CK((*filter->output_function)(n, filter->data));
184 		}
185 		break;
186 	}
187 
188 	return c;
189 }
190 
191 /*
192  * UTF-16BE => wchar
193  */
mbfl_filt_conv_utf16be_wchar(int c,mbfl_convert_filter * filter)194 int mbfl_filt_conv_utf16be_wchar(int c, mbfl_convert_filter *filter)
195 {
196 	int n;
197 
198 	switch (filter->status) {
199 	case 0:
200 		filter->status = 1;
201 		n = (c & 0xff) << 8;
202 		filter->cache |= n;
203 		break;
204 	default:
205 		filter->status = 0;
206 		n = (filter->cache & 0xff00) | (c & 0xff);
207 		if (n >= 0xd800 && n < 0xdc00) {
208 			filter->cache = ((n & 0x3ff) << 16) + 0x400000;
209 		} else if (n >= 0xdc00 && n < 0xe000) {
210 			n &= 0x3ff;
211 			n |= (filter->cache & 0xfff0000) >> 6;
212 			filter->cache = 0;
213 			if (n >= MBFL_WCSPLANE_SUPMIN && n < MBFL_WCSPLANE_SUPMAX) {
214 				CK((*filter->output_function)(n, filter->data));
215 			} else {		/* illegal character */
216 				n &= MBFL_WCSGROUP_MASK;
217 				n |= MBFL_WCSGROUP_THROUGH;
218 				CK((*filter->output_function)(n, filter->data));
219 			}
220 		} else {
221 			filter->cache = 0;
222 			CK((*filter->output_function)(n, filter->data));
223 		}
224 		break;
225 	}
226 
227 	return c;
228 }
229 
230 /*
231  * wchar => UTF-16BE
232  */
mbfl_filt_conv_wchar_utf16be(int c,mbfl_convert_filter * filter)233 int mbfl_filt_conv_wchar_utf16be(int c, mbfl_convert_filter *filter)
234 {
235 	int n;
236 
237 	if (c >= 0 && c < MBFL_WCSPLANE_UCS2MAX) {
238 		CK((*filter->output_function)((c >> 8) & 0xff, filter->data));
239 		CK((*filter->output_function)(c & 0xff, filter->data));
240 	} else if (c >= MBFL_WCSPLANE_SUPMIN && c < MBFL_WCSPLANE_SUPMAX) {
241 		n = ((c >> 10) - 0x40) | 0xd800;
242 		CK((*filter->output_function)((n >> 8) & 0xff, filter->data));
243 		CK((*filter->output_function)(n & 0xff, filter->data));
244 		n = (c & 0x3ff) | 0xdc00;
245 		CK((*filter->output_function)((n >> 8) & 0xff, filter->data));
246 		CK((*filter->output_function)(n & 0xff, filter->data));
247 	} else {
248 		if (filter->illegal_mode != MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
249 			CK(mbfl_filt_conv_illegal_output(c, filter));
250 		}
251 	}
252 
253 	return c;
254 }
255 
256 /*
257  * UTF-16LE => wchar
258  */
mbfl_filt_conv_utf16le_wchar(int c,mbfl_convert_filter * filter)259 int mbfl_filt_conv_utf16le_wchar(int c, mbfl_convert_filter *filter)
260 {
261 	int n;
262 
263 	switch (filter->status) {
264 	case 0:
265 		filter->status = 1;
266 		n = c & 0xff;
267 		filter->cache |= n;
268 		break;
269 	default:
270 		filter->status = 0;
271 		n = (filter->cache & 0xff) | ((c & 0xff) << 8);
272 		if (n >= 0xd800 && n < 0xdc00) {
273 			filter->cache = ((n & 0x3ff) << 16) + 0x400000;
274 		} else if (n >= 0xdc00 && n < 0xe000) {
275 			n &= 0x3ff;
276 			n |= (filter->cache & 0xfff0000) >> 6;
277 			filter->cache = 0;
278 			if (n >= MBFL_WCSPLANE_SUPMIN && n < MBFL_WCSPLANE_SUPMAX) {
279 				CK((*filter->output_function)(n, filter->data));
280 			} else {		/* illegal character */
281 				n &= MBFL_WCSGROUP_MASK;
282 				n |= MBFL_WCSGROUP_THROUGH;
283 				CK((*filter->output_function)(n, filter->data));
284 			}
285 		} else {
286 			filter->cache = 0;
287 			CK((*filter->output_function)(n, filter->data));
288 		}
289 		break;
290 	}
291 
292 	return c;
293 }
294 
295 /*
296  * wchar => UTF-16LE
297  */
mbfl_filt_conv_wchar_utf16le(int c,mbfl_convert_filter * filter)298 int mbfl_filt_conv_wchar_utf16le(int c, mbfl_convert_filter *filter)
299 {
300 	int n;
301 
302 	if (c >= 0 && c < MBFL_WCSPLANE_UCS2MAX) {
303 		CK((*filter->output_function)(c & 0xff, filter->data));
304 		CK((*filter->output_function)((c >> 8) & 0xff, filter->data));
305 	} else if (c >= MBFL_WCSPLANE_SUPMIN && c < MBFL_WCSPLANE_SUPMAX) {
306 		n = ((c >> 10) - 0x40) | 0xd800;
307 		CK((*filter->output_function)(n & 0xff, filter->data));
308 		CK((*filter->output_function)((n >> 8) & 0xff, filter->data));
309 		n = (c & 0x3ff) | 0xdc00;
310 		CK((*filter->output_function)(n & 0xff, filter->data));
311 		CK((*filter->output_function)((n >> 8) & 0xff, filter->data));
312 	} else {
313 		if (filter->illegal_mode != MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
314 			CK(mbfl_filt_conv_illegal_output(c, filter));
315 		}
316 	}
317 
318 	return c;
319 }
320