1 /*
2 * "streamable kanji code filter and converter"
3 * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
4 *
5 * LICENSE NOTICES
6 *
7 * This file is part of "streamable kanji code filter and converter",
8 * which is distributed under the terms of GNU Lesser General Public
9 * License (version 2) as published by the Free Software Foundation.
10 *
11 * This software is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with "streamable kanji code filter and converter";
18 * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
19 * Suite 330, Boston, MA 02111-1307 USA
20 *
21 * The author of this file:
22 *
23 */
24 /*
 * The source code included in this file was separated from mbfilter.c
 * by Moriyoshi Koizumi <moriyoshi@php.net> on 20 Dec 2002. The file
 * mbfilter.c is included in this package.
28 *
29 */
30
31 #include "libmbfl/config.h"
32
33 #ifdef HAVE_STRINGS_H
34 /* For strcasecmp */
35 #include <strings.h>
36 #endif
37
38 #include "mbfl_encoding.h"
39 #include "mbfilter_pass.h"
40 #include "mbfilter_8bit.h"
41
42 #include "filters/mbfilter_base64.h"
43 #include "filters/mbfilter_cjk.h"
44 #include "filters/mbfilter_qprint.h"
45 #include "filters/mbfilter_uuencode.h"
46 #include "filters/mbfilter_7bit.h"
47 #include "filters/mbfilter_utf7.h"
48 #include "filters/mbfilter_utf7imap.h"
49 #include "filters/mbfilter_utf8.h"
50 #include "filters/mbfilter_utf16.h"
51 #include "filters/mbfilter_utf32.h"
52 #include "filters/mbfilter_ucs4.h"
53 #include "filters/mbfilter_ucs2.h"
54 #include "filters/mbfilter_htmlent.h"
55 #include "filters/mbfilter_singlebyte.h"
56
57 #ifndef HAVE_STRCASECMP
58 #ifdef HAVE_STRICMP
59 #define strcasecmp stricmp
60 #endif
61 #endif
62
63
/* NULL-terminated list of the encodings known to this file.
 * The position of each entry is significant: mbfl_encoding_ptr_list_after_hashing
 * stores indices into this array, so adding, removing or reordering entries
 * means that table has to be regenerated as well. */
static const mbfl_encoding *mbfl_encoding_ptr_list[] = {
	&mbfl_encoding_base64,
	&mbfl_encoding_uuencode,
	&mbfl_encoding_html_ent,
	&mbfl_encoding_qprint,
	&mbfl_encoding_7bit,
	&mbfl_encoding_8bit,
	&mbfl_encoding_ucs4,
	&mbfl_encoding_ucs4be,
	&mbfl_encoding_ucs4le,
	&mbfl_encoding_ucs2,
	&mbfl_encoding_ucs2be,
	&mbfl_encoding_ucs2le,
	&mbfl_encoding_utf32,
	&mbfl_encoding_utf32be,
	&mbfl_encoding_utf32le,
	&mbfl_encoding_utf16,
	&mbfl_encoding_utf16be,
	&mbfl_encoding_utf16le,
	&mbfl_encoding_utf8,
	&mbfl_encoding_utf7,
	&mbfl_encoding_utf7imap,
	&mbfl_encoding_ascii,
	&mbfl_encoding_euc_jp,
	&mbfl_encoding_sjis,
	&mbfl_encoding_eucjp_win,
	&mbfl_encoding_eucjp2004,
	&mbfl_encoding_sjis_docomo,
	&mbfl_encoding_sjis_kddi,
	&mbfl_encoding_sjis_sb,
	&mbfl_encoding_sjis_mac,
	&mbfl_encoding_sjis2004,
	&mbfl_encoding_utf8_docomo,
	&mbfl_encoding_utf8_kddi_a,
	&mbfl_encoding_utf8_kddi_b,
	&mbfl_encoding_utf8_sb,
	&mbfl_encoding_cp932,
	&mbfl_encoding_sjiswin,
	&mbfl_encoding_cp51932,
	&mbfl_encoding_jis,
	&mbfl_encoding_2022jp,
	&mbfl_encoding_2022jpms,
	&mbfl_encoding_gb18030,
	&mbfl_encoding_gb18030_2022,
	&mbfl_encoding_cp1252,
	&mbfl_encoding_cp1254,
	&mbfl_encoding_8859_1,
	&mbfl_encoding_8859_2,
	&mbfl_encoding_8859_3,
	&mbfl_encoding_8859_4,
	&mbfl_encoding_8859_5,
	&mbfl_encoding_8859_6,
	&mbfl_encoding_8859_7,
	&mbfl_encoding_8859_8,
	&mbfl_encoding_8859_9,
	&mbfl_encoding_8859_10,
	&mbfl_encoding_8859_13,
	&mbfl_encoding_8859_14,
	&mbfl_encoding_8859_15,
	&mbfl_encoding_8859_16,
	&mbfl_encoding_euc_cn,
	&mbfl_encoding_cp936,
	&mbfl_encoding_hz,
	&mbfl_encoding_euc_tw,
	&mbfl_encoding_big5,
	&mbfl_encoding_cp950,
	&mbfl_encoding_euc_kr,
	&mbfl_encoding_uhc,
	&mbfl_encoding_2022kr,
	&mbfl_encoding_cp1251,
	&mbfl_encoding_cp866,
	&mbfl_encoding_koi8r,
	&mbfl_encoding_koi8u,
	&mbfl_encoding_armscii8,
	&mbfl_encoding_cp850,
	&mbfl_encoding_2022jp_2004,
	&mbfl_encoding_2022jp_kddi,
	&mbfl_encoding_cp50220,
	&mbfl_encoding_cp50221,
	&mbfl_encoding_cp50222,
	NULL
};
146
147 /* The following perfect hashing table was amended from gperf, and hashing code was generated using gperf.
148 * The table was amended to refer to the table above such that it is lighter for the data cache.
149 * You can use the generate_name_perfect_hash_table.php script to help generate the necessary lookup tables. */
150
/* Indexed by the key returned from mbfl_name2encoding_perfect_hash().
 * Each element is either an index into mbfl_encoding_ptr_list or -1 when no
 * encoding name hashes to that key. Keys at or beyond the end of this array
 * are possible, so lookups must bounds-check the key first. */
static const int8_t mbfl_encoding_ptr_list_after_hashing[] = {
	-1, -1, -1, -1,
	-1, -1,
	66,
	-1,
	73,
	-1,
	78,
	61,
	76,
	-1,
	59,
	46,
	52,
	54,
	49,
	57,
	69,
	21,
	50,
	58,
	75,
	35,
	9,
	64,
	48,
	56,
	74,
	47,
	55,
	40,
	45,
	53,
	18,
	39,
	72,
	60,
	23,
	10,
	30,
	36,
	67,
	71,
	37,
	27,
	77,
	26,
	51,
	12,
	6,
	11,
	7,
	29,
	5,
	24,
	0,
	2,
	13,
	43,
	31,
	33,
	38,
	63,
	8,
	1,
	15,
	-1,
	16,
	-1,
	14,
	3,
	44,
	-1,
	20,
	-1,
	32,
	-1,
	68,
	25,
	17,
	28,
	-1, -1, -1,
	22,
	-1, -1,
	4,
	-1, -1,
	62,
	-1, -1,
	34,
	-1,
	41,
	-1, -1, -1,
	42,
	70,
	19,
	-1, -1, -1,
	65
};
249
/* Hash function for encoding names (per the comment above the lookup table,
 * this code was generated with gperf).
 *
 * The key is the sum of `len`, the association values of selected bytes of
 * `str` (positions 6, 5, 4, 2 and 0, depending on how long the name is) and
 * the association value of the last byte.
 *
 * str: the name to hash; it does not need to be NUL-terminated, only `len`
 *      bytes are inspected.
 * len: number of bytes in `str`. It must be at least 1, because str[0] and
 *      str[len - 1] are read unconditionally; the callers in this file only
 *      pass lengths between NAME_HASH_MIN_NAME_LENGTH and
 *      NAME_HASH_MAX_NAME_LENGTH.
 *
 * Returns the hash key. The key is not guaranteed to be a valid index into
 * mbfl_encoding_ptr_list_after_hashing; the caller has to range-check it. */
static unsigned int mbfl_name2encoding_perfect_hash(const char *str, size_t len)
{
	/* Per-byte association values. Upper- and lowercase ASCII letters carry the
	 * same value, so the resulting key does not depend on letter case. */
	static const unsigned char asso_values[] =
	{
		109, 109, 109, 109, 109, 109, 109, 109, 109, 109,
		109, 109, 109, 109, 109, 109, 109, 109, 109, 109,
		109, 109, 109, 109, 109, 109, 109, 109, 109, 109,
		109, 109, 109, 109, 109, 109, 109, 109, 109, 109,
		109, 109, 109, 109, 109, 1, 109, 109, 1, 19,
		0, 16, 13, 3, 7, 35, 1, 20, 109, 109,
		109, 109, 109, 109, 109, 16, 1, 0, 44, 6,
		26, 53, 8, 0, 25, 32, 13, 12, 1, 0,
		25, 0, 32, 18, 51, 3, 109, 15, 109, 109,
		1, 109, 109, 109, 109, 109, 109, 16, 1, 0,
		44, 6, 26, 53, 8, 0, 25, 32, 13, 12,
		1, 0, 25, 0, 32, 18, 51, 3, 109, 15,
		109, 109, 1, 109, 109, 109, 109, 109, 109, 109,
		109, 109, 109, 109, 109, 109, 109, 109, 109, 109,
		109, 109, 109, 109, 109, 109, 109, 109, 109, 109,
		109, 109, 109, 109, 109, 109, 109, 109, 109, 109,
		109, 109, 109, 109, 109, 109, 109, 109, 109, 109,
		109, 109, 109, 109, 109, 109, 109, 109, 109, 109,
		109, 109, 109, 109, 109, 109, 109, 109, 109, 109,
		109, 109, 109, 109, 109, 109, 109, 109, 109, 109,
		109, 109, 109, 109, 109, 109, 109, 109, 109, 109,
		109, 109, 109, 109, 109, 109, 109, 109, 109, 109,
		109, 109, 109, 109, 109, 109, 109, 109, 109, 109,
		109, 109, 109, 109, 109, 109, 109, 109, 109, 109,
		109, 109, 109, 109, 109, 109, 109, 109, 109, 109,
		109, 109, 109, 109, 109, 109
	};
	unsigned int hval = len;

	/* Every case deliberately falls through to the ones below it, so a longer
	 * name contributes more byte positions to the key. `default` handles every
	 * length not listed as a case, i.e. 7 and above (and 0, which callers must
	 * not pass). */
	switch (hval)
	{
		default:
			hval += asso_values[(unsigned char)str[6]];
			ZEND_FALLTHROUGH;
		case 6:
			hval += asso_values[(unsigned char)str[5]];
			ZEND_FALLTHROUGH;
		case 5:
			hval += asso_values[(unsigned char)str[4]];
			ZEND_FALLTHROUGH;
		case 4:
		case 3:
			hval += asso_values[(unsigned char)str[2]];
			ZEND_FALLTHROUGH;
		case 2:
		case 1:
			hval += asso_values[(unsigned char)str[0]];
			break;
	}
	/* The last byte always takes part, regardless of length. */
	return hval + asso_values[(unsigned char)str[len - 1]];
}
305
306 #define NAME_HASH_MIN_NAME_LENGTH 2
307 #define NAME_HASH_MAX_NAME_LENGTH 23
308
mbfl_name2encoding(const char * name)309 const mbfl_encoding *mbfl_name2encoding(const char *name)
310 {
311 return mbfl_name2encoding_ex(name, strlen(name));
312 }
313
mbfl_name2encoding_ex(const char * name,size_t name_len)314 const mbfl_encoding *mbfl_name2encoding_ex(const char *name, size_t name_len)
315 {
316 const mbfl_encoding *const *encoding;
317
318 /* Sanity check perfect hash for name.
319 * Never enable this in production, this is only a development-time sanity check! */
320 #if ZEND_DEBUG && 0
321 for (encoding = mbfl_encoding_ptr_list; *encoding; encoding++) {
322 size_t name_length = strlen((*encoding)->name);
323 if (!(name_length <= NAME_HASH_MAX_NAME_LENGTH && name_length >= NAME_HASH_MIN_NAME_LENGTH)) {
324 fprintf(stderr, "name length is not satisfying bound check: %zu %s\n", name_length, (*encoding)->name);
325 abort();
326 }
327 unsigned int key = mbfl_name2encoding_perfect_hash((*encoding)->name, name_length);
328 if (mbfl_encoding_ptr_list[mbfl_encoding_ptr_list_after_hashing[key]] != *encoding) {
329 fprintf(stderr, "mbfl_name2encoding_perfect_hash: key %u %s mismatch\n", key, (*encoding)->name);
330 abort();
331 }
332 }
333 #endif
334
335 /* Use perfect hash lookup for name */
336 if (name_len <= NAME_HASH_MAX_NAME_LENGTH && name_len >= NAME_HASH_MIN_NAME_LENGTH) {
337 unsigned int key = mbfl_name2encoding_perfect_hash(name, name_len);
338 if (key < sizeof(mbfl_encoding_ptr_list_after_hashing) / sizeof(mbfl_encoding_ptr_list_after_hashing[0])) {
339 int8_t offset = mbfl_encoding_ptr_list_after_hashing[key];
340 if (offset >= 0) {
341 encoding = mbfl_encoding_ptr_list + offset;
342 if (strncasecmp((*encoding)->name, name, name_len) == 0) {
343 return *encoding;
344 }
345 }
346 }
347 }
348
349 /* search MIME charset name */
350 for (encoding = mbfl_encoding_ptr_list; *encoding; encoding++) {
351 if ((*encoding)->mime_name) {
352 if (strncasecmp((*encoding)->mime_name, name, name_len) == 0 && (*encoding)->mime_name[name_len] == '\0') {
353 return *encoding;
354 }
355 }
356 }
357
358 /* search aliases */
359 for (encoding = mbfl_encoding_ptr_list; *encoding; encoding++) {
360 if ((*encoding)->aliases) {
361 for (const char **alias = (*encoding)->aliases; *alias; alias++) {
362 if (strncasecmp(name, *alias, name_len) == 0 && (*alias)[name_len] == '\0') {
363 return *encoding;
364 }
365 }
366 }
367 }
368
369 return NULL;
370 }
371
mbfl_no2encoding(enum mbfl_no_encoding no_encoding)372 const mbfl_encoding *mbfl_no2encoding(enum mbfl_no_encoding no_encoding)
373 {
374 const mbfl_encoding **encoding;
375
376 for (encoding = mbfl_encoding_ptr_list; *encoding; encoding++) {
377 if ((*encoding)->no_encoding == no_encoding) {
378 return *encoding;
379 }
380 }
381
382 return NULL;
383 }
384
mbfl_no_encoding2name(enum mbfl_no_encoding no_encoding)385 const char *mbfl_no_encoding2name(enum mbfl_no_encoding no_encoding)
386 {
387 const mbfl_encoding *encoding = mbfl_no2encoding(no_encoding);
388 return encoding ? encoding->name : "";
389 }
390
/* Returns the list of supported encodings.
 * The result points directly at this file's static mbfl_encoding_ptr_list
 * array, which ends with a NULL entry; it is not a copy. */
const mbfl_encoding **mbfl_get_supported_encodings(void)
{
	return mbfl_encoding_ptr_list;
}
395
mbfl_encoding_preferred_mime_name(const mbfl_encoding * encoding)396 const char *mbfl_encoding_preferred_mime_name(const mbfl_encoding *encoding)
397 {
398 if (encoding->mime_name && encoding->mime_name[0] != '\0') {
399 return encoding->mime_name;
400 }
401 return NULL;
402 }
403