1 /*
2  * "streamable kanji code filter and converter"
3  * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
4  *
5  * LICENSE NOTICES
6  *
7  * This file is part of "streamable kanji code filter and converter",
8  * which is distributed under the terms of GNU Lesser General Public
9  * License (version 2) as published by the Free Software Foundation.
10  *
11  * This software is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with "streamable kanji code filter and converter";
18  * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
19  * Suite 330, Boston, MA  02111-1307  USA
20  *
21  * The author of this file:
22  *
23  */
/*
 * The source code included in this file was separated from mbfilter.c
 * by Moriyoshi Koizumi <moriyoshi@php.net> on 20 Dec 2002. The file
 * mbfilter.c is included in this package.
 *
 */
30 
31 #include "libmbfl/config.h"
32 
33 #ifdef HAVE_STRINGS_H
34 	/* For strcasecmp */
35 	#include <strings.h>
36 #endif
37 
38 #include "mbfl_encoding.h"
39 #include "mbfilter_pass.h"
40 #include "mbfilter_8bit.h"
41 
42 #include "filters/mbfilter_base64.h"
43 #include "filters/mbfilter_cjk.h"
44 #include "filters/mbfilter_qprint.h"
45 #include "filters/mbfilter_uuencode.h"
46 #include "filters/mbfilter_7bit.h"
47 #include "filters/mbfilter_utf7.h"
48 #include "filters/mbfilter_utf7imap.h"
49 #include "filters/mbfilter_utf8.h"
50 #include "filters/mbfilter_utf16.h"
51 #include "filters/mbfilter_utf32.h"
52 #include "filters/mbfilter_ucs4.h"
53 #include "filters/mbfilter_ucs2.h"
54 #include "filters/mbfilter_htmlent.h"
55 #include "filters/mbfilter_singlebyte.h"
56 
57 #ifndef HAVE_STRCASECMP
58 #ifdef HAVE_STRICMP
59 #define strcasecmp stricmp
60 #endif
61 #endif
62 
63 
64 static const mbfl_encoding *mbfl_encoding_ptr_list[] = {
65 	&mbfl_encoding_base64,
66 	&mbfl_encoding_uuencode,
67 	&mbfl_encoding_html_ent,
68 	&mbfl_encoding_qprint,
69 	&mbfl_encoding_7bit,
70 	&mbfl_encoding_8bit,
71 	&mbfl_encoding_ucs4,
72 	&mbfl_encoding_ucs4be,
73 	&mbfl_encoding_ucs4le,
74 	&mbfl_encoding_ucs2,
75 	&mbfl_encoding_ucs2be,
76 	&mbfl_encoding_ucs2le,
77 	&mbfl_encoding_utf32,
78 	&mbfl_encoding_utf32be,
79 	&mbfl_encoding_utf32le,
80 	&mbfl_encoding_utf16,
81 	&mbfl_encoding_utf16be,
82 	&mbfl_encoding_utf16le,
83 	&mbfl_encoding_utf8,
84 	&mbfl_encoding_utf7,
85 	&mbfl_encoding_utf7imap,
86 	&mbfl_encoding_ascii,
87 	&mbfl_encoding_euc_jp,
88 	&mbfl_encoding_sjis,
89 	&mbfl_encoding_eucjp_win,
90 	&mbfl_encoding_eucjp2004,
91 	&mbfl_encoding_sjis_docomo,
92 	&mbfl_encoding_sjis_kddi,
93 	&mbfl_encoding_sjis_sb,
94 	&mbfl_encoding_sjis_mac,
95 	&mbfl_encoding_sjis2004,
96 	&mbfl_encoding_utf8_docomo,
97 	&mbfl_encoding_utf8_kddi_a,
98 	&mbfl_encoding_utf8_kddi_b,
99 	&mbfl_encoding_utf8_sb,
100 	&mbfl_encoding_cp932,
101 	&mbfl_encoding_sjiswin,
102 	&mbfl_encoding_cp51932,
103 	&mbfl_encoding_jis,
104 	&mbfl_encoding_2022jp,
105 	&mbfl_encoding_2022jpms,
106 	&mbfl_encoding_gb18030,
107 	&mbfl_encoding_gb18030_2022,
108 	&mbfl_encoding_cp1252,
109 	&mbfl_encoding_cp1254,
110 	&mbfl_encoding_8859_1,
111 	&mbfl_encoding_8859_2,
112 	&mbfl_encoding_8859_3,
113 	&mbfl_encoding_8859_4,
114 	&mbfl_encoding_8859_5,
115 	&mbfl_encoding_8859_6,
116 	&mbfl_encoding_8859_7,
117 	&mbfl_encoding_8859_8,
118 	&mbfl_encoding_8859_9,
119 	&mbfl_encoding_8859_10,
120 	&mbfl_encoding_8859_13,
121 	&mbfl_encoding_8859_14,
122 	&mbfl_encoding_8859_15,
123 	&mbfl_encoding_8859_16,
124 	&mbfl_encoding_euc_cn,
125 	&mbfl_encoding_cp936,
126 	&mbfl_encoding_hz,
127 	&mbfl_encoding_euc_tw,
128 	&mbfl_encoding_big5,
129 	&mbfl_encoding_cp950,
130 	&mbfl_encoding_euc_kr,
131 	&mbfl_encoding_uhc,
132 	&mbfl_encoding_2022kr,
133 	&mbfl_encoding_cp1251,
134 	&mbfl_encoding_cp866,
135 	&mbfl_encoding_koi8r,
136 	&mbfl_encoding_koi8u,
137 	&mbfl_encoding_armscii8,
138 	&mbfl_encoding_cp850,
139 	&mbfl_encoding_2022jp_2004,
140 	&mbfl_encoding_2022jp_kddi,
141 	&mbfl_encoding_cp50220,
142 	&mbfl_encoding_cp50221,
143 	&mbfl_encoding_cp50222,
144 	NULL
145 };
146 
/* The following perfect hash table was adapted from gperf output, and the hashing code was generated using gperf.
 * The table stores indices into the list above instead of the pointers themselves, which makes it lighter on the data cache.
 * You can use the generate_name_perfect_hash_table.php script to help generate the necessary lookup tables. */
150 
/*
 * Slot table of the name hash: element [key] is an index into
 * mbfl_encoding_ptr_list, or -1 when no encoding name hashes to that key.
 * The number in front of each row is the key of the row's first slot.
 */
static const int8_t mbfl_encoding_ptr_list_after_hashing[] = {
	/*   0 */ -1, -1, -1, -1, -1, -1, 66, -1, 73, -1,
	/*  10 */ 78, 61, 76, -1, 59, 46, 52, 54, 49, 57,
	/*  20 */ 69, 21, 50, 58, 75, 35,  9, 64, 48, 56,
	/*  30 */ 74, 47, 55, 40, 45, 53, 18, 39, 72, 60,
	/*  40 */ 23, 10, 30, 36, 67, 71, 37, 27, 77, 26,
	/*  50 */ 51, 12,  6, 11,  7, 29,  5, 24,  0,  2,
	/*  60 */ 13, 43, 31, 33, 38, 63,  8,  1, 15, -1,
	/*  70 */ 16, -1, 14,  3, 44, -1, 20, -1, 32, -1,
	/*  80 */ 68, 25, 17, 28, -1, -1, -1, 22, -1, -1,
	/*  90 */  4, -1, -1, 62, -1, -1, 34, -1, 41, -1,
	/* 100 */ -1, -1, 42, 70, 19, -1, -1, -1, 65
};
249 
/*
 * Hash an encoding name of `len` bytes to a lookup key (code derived from
 * gperf output).
 *
 * The key is `len` plus the association values of the bytes at positions
 * 6, 5, 4, 2 and 0 - each taken only when the length class reaches it - plus
 * the association value of the last byte, str[len - 1]. Upper- and lower-case
 * ASCII letters share the same association value. Bytes that are assigned 109
 * push the key to at least 109.
 *
 * The caller must make `str` readable for every position used here; in
 * particular the final byte str[len - 1] is always read.
 */
static unsigned int mbfl_name2encoding_perfect_hash(const char *str, size_t len)
{
	static const unsigned char asso_values[] =
	{
		/*   0 */ 109, 109, 109, 109, 109, 109, 109, 109, 109, 109,
		/*  10 */ 109, 109, 109, 109, 109, 109, 109, 109, 109, 109,
		/*  20 */ 109, 109, 109, 109, 109, 109, 109, 109, 109, 109,
		/*  30 */ 109, 109, 109, 109, 109, 109, 109, 109, 109, 109,
		/*  40 */ 109, 109, 109, 109, 109,   1, 109, 109,   1,  19,
		/*  50 */   0,  16,  13,   3,   7,  35,   1,  20, 109, 109,
		/*  60 */ 109, 109, 109, 109, 109,  16,   1,   0,  44,   6,
		/*  70 */  26,  53,   8,   0,  25,  32,  13,  12,   1,   0,
		/*  80 */  25,   0,  32,  18,  51,   3, 109,  15, 109, 109,
		/*  90 */   1, 109, 109, 109, 109, 109, 109,  16,   1,   0,
		/* 100 */  44,   6,  26,  53,   8,   0,  25,  32,  13,  12,
		/* 110 */   1,   0,  25,   0,  32,  18,  51,   3, 109,  15,
		/* 120 */ 109, 109,   1, 109, 109, 109, 109, 109, 109, 109,
		/* 130 */ 109, 109, 109, 109, 109, 109, 109, 109, 109, 109,
		/* 140 */ 109, 109, 109, 109, 109, 109, 109, 109, 109, 109,
		/* 150 */ 109, 109, 109, 109, 109, 109, 109, 109, 109, 109,
		/* 160 */ 109, 109, 109, 109, 109, 109, 109, 109, 109, 109,
		/* 170 */ 109, 109, 109, 109, 109, 109, 109, 109, 109, 109,
		/* 180 */ 109, 109, 109, 109, 109, 109, 109, 109, 109, 109,
		/* 190 */ 109, 109, 109, 109, 109, 109, 109, 109, 109, 109,
		/* 200 */ 109, 109, 109, 109, 109, 109, 109, 109, 109, 109,
		/* 210 */ 109, 109, 109, 109, 109, 109, 109, 109, 109, 109,
		/* 220 */ 109, 109, 109, 109, 109, 109, 109, 109, 109, 109,
		/* 230 */ 109, 109, 109, 109, 109, 109, 109, 109, 109, 109,
		/* 240 */ 109, 109, 109, 109, 109, 109, 109, 109, 109, 109,
		/* 250 */ 109, 109, 109, 109, 109, 109
	};

	/* The length, reduced to unsigned int, both seeds the key and selects
	 * which positions contribute. */
	const unsigned int length_class = (unsigned int)len;
	unsigned int hval = length_class;

	/* Classes 7 and above use every keyed position; class 0 is grouped with
	 * them, matching the catch-all arm of the generated code. */
	const int all_positions = (length_class == 0 || length_class > 6);

	if (all_positions) {
		hval += asso_values[(unsigned char)str[6]];
	}
	if (all_positions || length_class == 6) {
		hval += asso_values[(unsigned char)str[5]];
	}
	if (all_positions || length_class >= 5) {
		hval += asso_values[(unsigned char)str[4]];
	}
	if (all_positions || length_class >= 3) {
		hval += asso_values[(unsigned char)str[2]];
	}

	/* The first and the last byte contribute for every length class. */
	hval += asso_values[(unsigned char)str[0]];
	return hval + asso_values[(unsigned char)str[len - 1]];
}
305 
306 #define NAME_HASH_MIN_NAME_LENGTH 2
307 #define NAME_HASH_MAX_NAME_LENGTH 23
308 
mbfl_name2encoding(const char * name)309 const mbfl_encoding *mbfl_name2encoding(const char *name)
310 {
311 	return mbfl_name2encoding_ex(name, strlen(name));
312 }
313 
mbfl_name2encoding_ex(const char * name,size_t name_len)314 const mbfl_encoding *mbfl_name2encoding_ex(const char *name, size_t name_len)
315 {
316 	const mbfl_encoding *const *encoding;
317 
318 	/* Sanity check perfect hash for name.
319 	 * Never enable this in production, this is only a development-time sanity check! */
320 #if ZEND_DEBUG && 0
321 	for (encoding = mbfl_encoding_ptr_list; *encoding; encoding++) {
322 		size_t name_length = strlen((*encoding)->name);
323 		if (!(name_length <= NAME_HASH_MAX_NAME_LENGTH && name_length >= NAME_HASH_MIN_NAME_LENGTH)) {
324 			fprintf(stderr, "name length is not satisfying bound check: %zu %s\n", name_length, (*encoding)->name);
325 			abort();
326 		}
327 		unsigned int key = mbfl_name2encoding_perfect_hash((*encoding)->name, name_length);
328 		if (mbfl_encoding_ptr_list[mbfl_encoding_ptr_list_after_hashing[key]] != *encoding) {
329 			fprintf(stderr, "mbfl_name2encoding_perfect_hash: key %u %s mismatch\n", key, (*encoding)->name);
330 			abort();
331 		}
332 	}
333 #endif
334 
335 	/* Use perfect hash lookup for name */
336 	if (name_len <= NAME_HASH_MAX_NAME_LENGTH && name_len >= NAME_HASH_MIN_NAME_LENGTH) {
337 		unsigned int key = mbfl_name2encoding_perfect_hash(name, name_len);
338 		if (key < sizeof(mbfl_encoding_ptr_list_after_hashing) / sizeof(mbfl_encoding_ptr_list_after_hashing[0])) {
339 			int8_t offset = mbfl_encoding_ptr_list_after_hashing[key];
340 			if (offset >= 0) {
341 				encoding = mbfl_encoding_ptr_list + offset;
342 				if (strncasecmp((*encoding)->name, name, name_len) == 0) {
343 					return *encoding;
344 				}
345 			}
346 		}
347 	}
348 
349 	/* search MIME charset name */
350 	for (encoding = mbfl_encoding_ptr_list; *encoding; encoding++) {
351 		if ((*encoding)->mime_name) {
352 			if (strcasecmp((*encoding)->mime_name, name) == 0) {
353 				return *encoding;
354 			}
355 		}
356 	}
357 
358 	/* search aliases */
359 	for (encoding = mbfl_encoding_ptr_list; *encoding; encoding++) {
360 		if ((*encoding)->aliases) {
361 			for (const char **alias = (*encoding)->aliases; *alias; alias++) {
362 				if (strcasecmp(*alias, name) == 0) {
363 					return *encoding;
364 				}
365 			}
366 		}
367 	}
368 
369 	return NULL;
370 }
371 
mbfl_no2encoding(enum mbfl_no_encoding no_encoding)372 const mbfl_encoding *mbfl_no2encoding(enum mbfl_no_encoding no_encoding)
373 {
374 	const mbfl_encoding **encoding;
375 
376 	for (encoding = mbfl_encoding_ptr_list; *encoding; encoding++) {
377 		if ((*encoding)->no_encoding == no_encoding) {
378 			return *encoding;
379 		}
380 	}
381 
382 	return NULL;
383 }
384 
mbfl_no_encoding2name(enum mbfl_no_encoding no_encoding)385 const char *mbfl_no_encoding2name(enum mbfl_no_encoding no_encoding)
386 {
387 	const mbfl_encoding *encoding = mbfl_no2encoding(no_encoding);
388 	return encoding ? encoding->name : "";
389 }
390 
mbfl_get_supported_encodings(void)391 const mbfl_encoding **mbfl_get_supported_encodings(void)
392 {
393 	return mbfl_encoding_ptr_list;
394 }
395 
mbfl_encoding_preferred_mime_name(const mbfl_encoding * encoding)396 const char *mbfl_encoding_preferred_mime_name(const mbfl_encoding *encoding)
397 {
398 	if (encoding->mime_name && encoding->mime_name[0] != '\0') {
399 		return encoding->mime_name;
400 	}
401 	return NULL;
402 }
403