1 /*
2 +----------------------------------------------------------------------+
3 | PHP Version 5 |
4 +----------------------------------------------------------------------+
5 | Copyright (c) 1997-2013 The PHP Group |
6 +----------------------------------------------------------------------+
7 | This source file is subject to version 3.01 of the PHP license, |
8 | that is bundled with this package in the file LICENSE, and is |
9 | available through the world-wide-web at the following url: |
10 | http://www.php.net/license/3_01.txt |
11 | If you did not receive a copy of the PHP license and are unable to |
12 | obtain it through the world-wide-web, please send a note to |
13 | license@php.net so we can mail you a copy immediately. |
14 +----------------------------------------------------------------------+
15 | Authors: Rasmus Lerdorf <rasmus@php.net> |
   |          Jaakko Hyvätti <jaakko.hyvatti@iki.fi>                      |
17 | Wez Furlong <wez@thebrainroom.com> |
18 +----------------------------------------------------------------------+
19 */
20
21 /* $Id$ */
22
23 /*
24 * HTML entity resources:
25 *
26 * http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset2.asp
27 * http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset3.asp
28 * http://www.unicode.org/Public/MAPPINGS/OBSOLETE/UNI2SGML.TXT
29 *
30 * http://www.w3.org/TR/2002/REC-xhtml1-20020801/dtds.html#h-A2
31 *
32 */
33
34 #include "php.h"
35 #if PHP_WIN32
36 #include "config.w32.h"
37 #else
38 #include <php_config.h>
39 #endif
40 #include "html.h"
41 #include "php_string.h"
42 #include "SAPI.h"
43 #if HAVE_LOCALE_H
44 #include <locale.h>
45 #endif
46 #if HAVE_LANGINFO_H
47 #include <langinfo.h>
48 #endif
49
50 #if HAVE_MBSTRING
51 # include "ext/mbstring/mbstring.h"
52 ZEND_EXTERN_MODULE_GLOBALS(mbstring)
53 #endif
54
/* Identifiers for the character sets handled by this file.
 * cs_terminator is the first enumerator (value 0); it is used only as the
 * end-of-list marker of entity_map[] and never names a real charset. */
enum entity_charset { cs_terminator, cs_8859_1, cs_cp1252,
					  cs_8859_15, cs_utf_8, cs_big5, cs_gb2312,
					  cs_big5hkscs, cs_sjis, cs_eucjp, cs_koi8r,
					  cs_cp1251, cs_8859_5, cs_cp866, cs_macroman
					};
/* One slot of an entity table: the entity name without the leading '&' and
 * trailing ';' (users of the tables add those), or NULL when the character
 * at that position has no entity. */
typedef const char *const entity_table_t;
61
/* codepage 1252 is a Windows extension to iso-8859-1.
 * This table covers the extension area only: entity_map[] registers it for
 * bytes 0x80-0x9f, one slot per byte, NULL where there is no entity. */
static entity_table_t ent_cp_1252[] = {
	"euro", NULL, "sbquo", "fnof", "bdquo", "hellip", "dagger",
	"Dagger", "circ", "permil", "Scaron", "lsaquo", "OElig",
	NULL, NULL, NULL, NULL, "lsquo", "rsquo", "ldquo", "rdquo",
	"bull", "ndash", "mdash", "tilde", "trade", "scaron", "rsaquo",
	"oelig", NULL, NULL, "Yuml"
};
70
/* Named entities for character codes 0xa0-0xff, one slot per code.
 * entity_map[] shares this table between ISO-8859-1, cp1252, UTF-8 and the
 * Big5 / GB2312 / Big5-HKSCS / Shift_JIS / EUC-JP entries. */
static entity_table_t ent_iso_8859_1[] = {
	"nbsp", "iexcl", "cent", "pound", "curren", "yen", "brvbar",
	"sect", "uml", "copy", "ordf", "laquo", "not", "shy", "reg",
	"macr", "deg", "plusmn", "sup2", "sup3", "acute", "micro",
	"para", "middot", "cedil", "sup1", "ordm", "raquo", "frac14",
	"frac12", "frac34", "iquest", "Agrave", "Aacute", "Acirc",
	"Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave",
	"Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc",
	"Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde",
	"Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml",
	"Yacute", "THORN", "szlig", "agrave", "aacute", "acirc",
	"atilde", "auml", "aring", "aelig", "ccedil", "egrave",
	"eacute", "ecirc", "euml", "igrave", "iacute", "icirc",
	"iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde",
	"ouml", "divide", "oslash", "ugrave", "uacute", "ucirc",
	"uuml", "yacute", "thorn", "yuml"
};
88
/* Named entities for ISO-8859-15 bytes 0xa0-0xff, one slot per byte.
 * Same layout as ent_iso_8859_1 except for the positions that ISO-8859-15
 * reassigns (euro, Scaron/scaron, OElig/oelig, Yuml); the two caron-Z
 * positions have no entity name and are left NULL. */
static entity_table_t ent_iso_8859_15[] = {
	"nbsp", "iexcl", "cent", "pound", "euro", "yen", "Scaron",
	"sect", "scaron", "copy", "ordf", "laquo", "not", "shy", "reg",
	"macr", "deg", "plusmn", "sup2", "sup3", NULL, /* Zcaron */
	"micro", "para", "middot", NULL, /* zcaron */ "sup1", "ordm",
	"raquo", "OElig", "oelig", "Yuml", "iquest", "Agrave", "Aacute",
	"Acirc", "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave",
	"Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc",
	"Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde",
	"Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml",
	"Yacute", "THORN", "szlig", "agrave", "aacute", "acirc",
	"atilde", "auml", "aring", "aelig", "ccedil", "egrave",
	"eacute", "ecirc", "euml", "igrave", "iacute", "icirc",
	"iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde",
	"ouml", "divide", "oslash", "ugrave", "uacute", "ucirc",
	"uuml", "yacute", "thorn", "yuml"
};
106
/* UTF-8 only: entities for code points 338-402 (0x0152-0x0192), one slot
 * per code point. The numeric comments give the code point of the first
 * slot on the following line. */
static entity_table_t ent_uni_338_402[] = {
	/* 338 (0x0152) */
	"OElig", "oelig", NULL, NULL, NULL, NULL,
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	/* 352 (0x0160) */
	"Scaron", "scaron", NULL, NULL, NULL, NULL, NULL, NULL,
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	/* 376 (0x0178) */
	"Yuml", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	/* 400 (0x0190) */
	NULL, NULL, "fnof"
};
122
/* UTF-8 only: entities for code points 710-732 (spacing modifier letters),
 * one slot per code point; only "circ" (710) and "tilde" (732) are mapped. */
static entity_table_t ent_uni_spacing[] = {
	/* 710 */
	"circ",
	/* 711 - 730 */
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	/* 731 - 732 */
	NULL, "tilde"
};
132
/* UTF-8 only: Greek letter entities for code points 913-982, one slot per
 * code point. Slot 930 (between Rho and Sigma) is NULL; the lowercase run
 * starts at 945 and includes "sigmaf" (962) before "sigma" (963). */
static entity_table_t ent_uni_greek[] = {
	/* 913 */
	"Alpha", "Beta", "Gamma", "Delta", "Epsilon", "Zeta", "Eta", "Theta",
	"Iota", "Kappa", "Lambda", "Mu", "Nu", "Xi", "Omicron", "Pi", "Rho",
	NULL, "Sigma", "Tau", "Upsilon", "Phi", "Chi", "Psi", "Omega",
	/* 938 - 944 are not mapped */
	NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	"alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta",
	"iota", "kappa", "lambda", "mu", "nu", "xi", "omicron", "pi", "rho",
	"sigmaf", "sigma", "tau", "upsilon", "phi", "chi", "psi", "omega",
	/* 970 - 976 are not mapped */
	NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	"thetasym", "upsih",
	NULL, NULL, NULL,
	"piv"
};
149
/* UTF-8 only: general punctuation entities for code points 8194-8260
 * (0x2002-0x2044), one slot per code point, NULL where unmapped. */
static entity_table_t ent_uni_punct[] = {
	/* 8194 */
	"ensp", "emsp", NULL, NULL, NULL, NULL, NULL,
	"thinsp", NULL, NULL, "zwnj", "zwj", "lrm", "rlm",
	NULL, NULL, NULL, "ndash", "mdash", NULL, NULL, NULL,
	/* 8216 */
	"lsquo", "rsquo", "sbquo", NULL, "ldquo", "rdquo", "bdquo", NULL,
	"dagger", "Dagger", "bull", NULL, NULL, NULL, "hellip",
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "permil", NULL,
	/* 8242 */
	"prime", "Prime", NULL, NULL, NULL, NULL, NULL, "lsaquo", "rsaquo", NULL,
	NULL, NULL, "oline", NULL, NULL, NULL, NULL, NULL,
	"frasl"
};
164
/* UTF-8 only: single-slot table for code point 8364 (the euro sign). */
static entity_table_t ent_uni_euro[] = {
	"euro"
};
168
/* UTF-8 only: letterlike-symbol entities for code points 8465-8501, one
 * slot per code point. Each numeric comment gives the code point of the
 * entity that follows it. */
static entity_table_t ent_uni_8465_8501[] = {
	/* 8465 */
	"image", NULL, NULL, NULL, NULL, NULL, NULL,
	/* 8472 */
	"weierp", NULL, NULL, NULL,
	/* 8476 */
	"real", NULL, NULL, NULL, NULL, NULL,
	/* 8482 */
	"trade", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	/* 8501 */
	"alefsym",
};
182
/* UTF-8 only: arrow, mathematical-operator and technical-symbol entities
 * for code points 8592-9002 (0x2190-0x232a), one slot per code point.
 * Rows hold 8 slots; each numeric comment gives the code point of the
 * first slot of the row below it. */
static entity_table_t ent_uni_8592_9002[] = {
	/* 8592 (0x2190) */
	"larr", "uarr", "rarr", "darr", "harr", NULL, NULL, NULL,
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	/* 8608 (0x21a0) */
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	/* 8624 (0x21b0) */
	NULL, NULL, NULL, NULL, NULL, "crarr", NULL, NULL,
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	/* 8640 (0x21c0) */
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	/* 8656 (0x21d0) */
	"lArr", "uArr", "rArr", "dArr", "hArr", NULL, NULL, NULL,
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	/* 8672 (0x21e0) */
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	/* 8704 (0x2200) */
	"forall", NULL, "part", "exist", NULL, "empty", NULL, "nabla",
	"isin", "notin", NULL, "ni", NULL, NULL, NULL, "prod",
	/* 8720 (0x2210) */
	NULL, "sum", "minus", NULL, NULL, NULL, NULL, "lowast",
	NULL, NULL, "radic", NULL, NULL, "prop", "infin", NULL,
	/* 8736 (0x2220) */
	"ang", NULL, NULL, NULL, NULL, NULL, NULL, "and",
	"or", "cap", "cup", "int", NULL, NULL, NULL, NULL,
	/* 8752 (0x2230) */
	NULL, NULL, NULL, NULL, "there4", NULL, NULL, NULL,
	NULL, NULL, NULL, NULL, "sim", NULL, NULL, NULL,
	/* 8768 (0x2240) */
	NULL, NULL, NULL, NULL, NULL, "cong", NULL, NULL,
	"asymp", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	/* 8784 (0x2250) */
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	/* 8800 (0x2260) */
	"ne", "equiv", NULL, NULL, "le", "ge", NULL, NULL,
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	/* 8816 (0x2270) */
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	/* 8832 (0x2280) */
	NULL, NULL, "sub", "sup", "nsub", NULL, "sube", "supe",
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	/* 8848 (0x2290) */
	NULL, NULL, NULL, NULL, NULL, "oplus", NULL, "otimes",
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	/* 8864 (0x22a0) */
	NULL, NULL, NULL, NULL, NULL, "perp", NULL, NULL,
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	/* 8880 (0x22b0) */
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	/* 8896 (0x22c0) */
	NULL, NULL, NULL, NULL, NULL, "sdot", NULL, NULL,
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	/* 8912 (0x22d0) */
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	/* 8928 (0x22e0) */
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	/* 8944 (0x22f0) */
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	/* 8960 (0x2300) */
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	"lceil", "rceil", "lfloor", "rfloor", NULL, NULL, NULL, NULL,
	/* 8976 (0x2310) */
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	/* 8992 (0x2320) */
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	NULL, "lang", "rang"
};
262
/* UTF-8 only: single-slot table for code point 9674 (lozenge). */
static entity_table_t ent_uni_9674[] = {
	/* 9674 */
	"loz"
};
267
/* UTF-8 only: card-suit entities for code points 9824-9830, one slot per
 * code point (9825, 9826 and 9828 are unmapped). */
static entity_table_t ent_uni_9824_9830[] = {
	/* 9824 */
	"spades", NULL, NULL, "clubs", NULL, "hearts", "diams"
};
272
/* KOI8-R bytes 0xa3-0xff (range taken from entity_map[]), one slot per
 * byte. Entries are decimal numeric character references ("#NNNN") rather
 * than names; bytes without a mapping are NULL. */
static entity_table_t ent_koi8r[] = {
	"#1105", /* 0xa3: "jo" (small io) */
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	NULL, NULL, NULL, NULL, NULL, "#1025", /* 0xb3: "JO" (capital io) */
	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	/* 0xc0 onwards: the Cyrillic letters in KOI8-R byte order */
	"#1102", "#1072", "#1073", "#1094", "#1076", "#1077", "#1092",
	"#1075", "#1093", "#1080", "#1081", "#1082", "#1083", "#1084",
	"#1085", "#1086", "#1087", "#1103", "#1088", "#1089", "#1090",
	"#1091", "#1078", "#1074", "#1100", "#1099", "#1079", "#1096",
	"#1101", "#1097", "#1095", "#1098", "#1070", "#1040", "#1041",
	"#1062", "#1044", "#1045", "#1060", "#1043", "#1061", "#1048",
	"#1049", "#1050", "#1051", "#1052", "#1053", "#1054", "#1055",
	"#1071", "#1056", "#1057", "#1058", "#1059", "#1046", "#1042",
	"#1068", "#1067", "#1047", "#1064", "#1069", "#1065", "#1063",
	"#1066"
};
289
290 static entity_table_t ent_cp_1251[] = {
291 "#1026", "#1027", "#8218", "#1107", "#8222", "hellip", "dagger",
292 "Dagger", "euro", "permil", "#1033", "#8249", "#1034", "#1036",
293 "#1035", "#1039", "#1106", "#8216", "#8217", "#8219", "#8220",
294 "bull", "ndash", "mdash", NULL, "trade", "#1113", "#8250",
295 "#1114", "#1116", "#1115", "#1119", "nbsp", "#1038", "#1118",
296 "#1032", "curren", "#1168", "brvbar", "sect", "#1025", "copy",
297 "#1028", "laquo", "not", "shy", "reg", "#1031", "deg", "plusmn",
298 "#1030", "#1110", "#1169", "micro", "para", "middot", "#1105",
299 "#8470", "#1108", "raquo", "#1112", "#1029", "#1109", "#1111",
300 "#1040", "#1041", "#1042", "#1043", "#1044", "#1045", "#1046",
301 "#1047", "#1048", "#1049", "#1050", "#1051", "#1052", "#1053",
302 "#1054", "#1055", "#1056", "#1057", "#1058", "#1059", "#1060",
303 "#1061", "#1062", "#1063", "#1064", "#1065", "#1066", "#1067",
304 "#1068", "#1069", "#1070", "#1071", "#1072", "#1073", "#1074",
305 "#1075", "#1076", "#1077", "#1078", "#1079", "#1080", "#1081",
306 "#1082", "#1083", "#1084", "#1085", "#1086", "#1087", "#1088",
307 "#1089", "#1090", "#1091", "#1092", "#1093", "#1094", "#1095",
308 "#1096", "#1097", "#1098", "#1099", "#1100", "#1101", "#1102",
309 "#1103"
310 };
311
312 static entity_table_t ent_iso_8859_5[] = {
313 "#1056", "#1057", "#1058", "#1059", "#1060", "#1061", "#1062",
314 "#1063", "#1064", "#1065", "#1066", "#1067", "#1068", "#1069",
315 "#1070", "#1071", "#1072", "#1073", "#1074", "#1075", "#1076",
316 "#1077", "#1078", "#1079", "#1080", "#1081", "#1082", "#1083",
317 "#1084", "#1085", "#1086", "#1087", "#1088", "#1089", "#1090",
318 "#1091", "#1092", "#1093", "#1094", "#1095", "#1096", "#1097",
319 "#1098", "#1099", "#1100", "#1101", "#1102", "#1103", "#1104",
320 "#1105", "#1106", "#1107", "#1108", "#1109", "#1110", "#1111",
321 "#1112", "#1113", "#1114", "#1115", "#1116", "#1117", "#1118",
322 "#1119"
323 };
324
/* cp866 bytes 0xc0-0xff (range taken from entity_map[]), one slot per
 * byte, all given as decimal numeric character references. */
static entity_table_t ent_cp_866[] = {

	"#9492", "#9524", "#9516", "#9500", "#9472", "#9532", "#9566",
	"#9567", "#9562", "#9556", "#9577", "#9574", "#9568", "#9552",
	"#9580", "#9575", "#9576", "#9572", "#9573", "#9561", "#9560",
	"#9554", "#9555", "#9579", "#9578", "#9496", "#9484", "#9608",
	"#9604", "#9612", "#9616", "#9600", "#1088", "#1089", "#1090",
	"#1091", "#1092", "#1093", "#1094", "#1095", "#1096", "#1097",
	"#1098", "#1099", "#1100", "#1101", "#1102", "#1103", "#1025",
	"#1105", "#1028", "#1108", "#1031", "#1111", "#1038", "#1118",
	"#176", "#8729", "#183", "#8730", "#8470", "#164", "#9632",
	"#160"
};
338
/* MacRoman has a couple of low-ascii chars that need mapping too.
 * Vertical tab (ASCII 11) is often used to store line breaks inside
 * DB exports, this mapping changes it to a space.
 *
 * The table is registered in entity_map[] for bytes 0x0b-0xff, so slot 0 is
 * the vertical tab ("sp") and slot i describes byte 0x0b + i. Within the
 * ASCII range only '"' (0x22), '&' (0x26), '<' (0x3c) and '>' (0x3e) are
 * mapped; the high half starts with "Auml" at 0x80. */
static entity_table_t ent_macroman[] = {
	"sp", NULL, NULL, NULL,
	NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	NULL, NULL, NULL, NULL, NULL, "quot", NULL,
	NULL, NULL, "amp", NULL, NULL, NULL, NULL,
	NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	NULL, NULL, NULL, "lt", NULL, "gt", NULL,
	NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	NULL, "Auml", "Aring", "Ccedil", "Eacute", "Ntilde", "Ouml",
	"Uuml", "aacute", "agrave", "acirc", "auml", "atilde", "aring",
	"ccedil", "eacute", "egrave", "ecirc", "euml", "iacute", "igrave",
	"icirc", "iuml", "ntilde", "oacute", "ograve", "ocirc", "ouml",
	"otilde", "uacute", "ugrave", "ucirc", "uuml", "dagger", "deg",
	"cent", "pound", "sect", "bull", "para", "szlig", "reg",
	"copy", "trade", "acute", "uml", "ne", "AElig", "Oslash",
	"infin", "plusmn", "le", "ge", "yen", "micro", "part",
	"sum", "prod", "pi", "int", "ordf", "ordm", "Omega",
	"aelig", "oslash", "iquest", "iexcl", "not", "radic", "fnof",
	"asymp", "#8710", "laquo", "raquo", "hellip", "nbsp", "Agrave",
	"Atilde", "Otilde", "OElig", "oelig", "ndash", "mdash", "ldquo",
	"rdquo", "lsquo", "rsquo", "divide", "loz", "yuml", "Yuml",
	"frasl", "euro", "lsaquo", "rsaquo", "#xFB01", "#xFB02", "Dagger",
	"middot", "sbquo", "bdquo", "permil", "Acirc", "Ecirc", "Aacute",
	"Euml", "Egrave", "Iacute", "Icirc", "Iuml", "Igrave", "Oacute",
	"Ocirc", "#xF8FF", "Ograve", "Uacute", "Ucirc", "Ugrave", "#305",
	"circ", "tilde", "macr", "#728", "#729", "#730", "cedil",
	"#733", "#731", "#711"
};
380
/* Describes one contiguous run of character codes of a charset and the
 * entity table covering it. table[0] belongs to basechar and
 * table[endchar - basechar] to endchar (both bounds inclusive). */
struct html_entity_map {
	enum entity_charset charset;	/* charset identifier */
	unsigned int basechar;			/* char code at start of table */
	unsigned int endchar;			/* last char code in the table */
	entity_table_t *table;			/* the table of mappings */
};
387
/* Registry of all entity tables, keyed by charset. A charset may appear in
 * several rows (one per code range); users scan the whole array and pick
 * every row whose charset matches. The array ends with a row whose charset
 * is cs_terminator. */
static const struct html_entity_map entity_map[] = {
	{ cs_cp1252, 		0x80, 0x9f, ent_cp_1252 },
	{ cs_cp1252, 		0xa0, 0xff, ent_iso_8859_1 },
	{ cs_8859_1, 		0xa0, 0xff, ent_iso_8859_1 },
	{ cs_8859_15, 		0xa0, 0xff, ent_iso_8859_15 },
	{ cs_utf_8, 		0xa0, 0xff, ent_iso_8859_1 },
	{ cs_utf_8, 		338,  402,  ent_uni_338_402 },
	{ cs_utf_8, 		710,  732,  ent_uni_spacing },
	{ cs_utf_8, 		913,  982,  ent_uni_greek },
	{ cs_utf_8, 		8194, 8260, ent_uni_punct },
	{ cs_utf_8, 		8364, 8364, ent_uni_euro },
	{ cs_utf_8, 		8465, 8501, ent_uni_8465_8501 },
	{ cs_utf_8, 		8592, 9002, ent_uni_8592_9002 },
	{ cs_utf_8, 		9674, 9674, ent_uni_9674 },
	{ cs_utf_8, 		9824, 9830, ent_uni_9824_9830 },
	{ cs_big5, 			0xa0, 0xff, ent_iso_8859_1 },
	{ cs_gb2312, 		0xa0, 0xff, ent_iso_8859_1 },
	{ cs_big5hkscs, 	0xa0, 0xff, ent_iso_8859_1 },
	{ cs_sjis,			0xa0, 0xff, ent_iso_8859_1 },
	{ cs_eucjp,			0xa0, 0xff, ent_iso_8859_1 },
	{ cs_koi8r,		    0xa3, 0xff, ent_koi8r },
	{ cs_cp1251,		0x80, 0xff, ent_cp_1251 },
	{ cs_8859_5,		0xc0, 0xff, ent_iso_8859_5 },
	{ cs_cp866,		    0xc0, 0xff, ent_cp_866 },
	{ cs_macroman,		0x0b, 0xff, ent_macroman },
	{ cs_terminator }
};
415
/* Maps charset names (as given by callers, ini settings or the locale) to
 * entity_charset identifiers. determine_charset() compares names
 * case-insensitively and requires the lengths to be equal. Several aliases
 * may map to the same charset. The list ends with a NULL codeset. */
static const struct {
	const char *codeset;
	enum entity_charset charset;
} charset_map[] = {
	{ "ISO-8859-1", 	cs_8859_1 },
	{ "ISO8859-1",	 	cs_8859_1 },
	{ "ISO-8859-15", 	cs_8859_15 },
	{ "ISO8859-15", 	cs_8859_15 },
	{ "utf-8", 			cs_utf_8 },
	{ "cp1252", 		cs_cp1252 },
	{ "Windows-1252", 	cs_cp1252 },
	{ "1252",           cs_cp1252 },
	{ "BIG5",			cs_big5 },
	{ "950",            cs_big5 },
	{ "GB2312",			cs_gb2312 },
	{ "936",            cs_gb2312 },
	{ "BIG5-HKSCS",		cs_big5hkscs },
	{ "Shift_JIS",		cs_sjis },
	{ "SJIS",   		cs_sjis },
	{ "932",            cs_sjis },
	{ "EUCJP",   		cs_eucjp },
	{ "EUC-JP",   		cs_eucjp },
	{ "KOI8-R",         cs_koi8r },
	{ "koi8-ru",        cs_koi8r },
	{ "koi8r",          cs_koi8r },
	{ "cp1251",         cs_cp1251 },
	{ "Windows-1251",   cs_cp1251 },
	{ "win-1251",       cs_cp1251 },
	{ "iso8859-5",      cs_8859_5 },
	{ "iso-8859-5",     cs_8859_5 },
	{ "cp866",          cs_cp866 },
	{ "866",            cs_cp866 },
	{ "ibm866",         cs_cp866 },
	{ "MacRoman",       cs_macroman },
	{ NULL }
};
452
453 static const struct {
454 unsigned short charcode;
455 char *entity;
456 int entitylen;
457 int flags;
458 } basic_entities[] = {
459 { '"', """, 6, ENT_HTML_QUOTE_DOUBLE },
460 { '\'', "'", 6, ENT_HTML_QUOTE_SINGLE },
461 { '\'', "'", 5, ENT_HTML_QUOTE_SINGLE },
462 { '<', "<", 4, 0 },
463 { '>', ">", 4, 0 },
464 { 0, NULL, 0, 0 }
465 };
466
/* A character code together with its entity text stored inline (at most
 * 7 characters plus the terminating NUL) and the entity's length.
 * NOTE(review): not referenced in the portion of the file visible here;
 * presumably used by the decoding code further down - confirm. */
struct basic_entities_dec {
	unsigned short charcode;
	char entity[8];
	int entitylen;
};
472
/* Helper macros for get_next_char(). They are not self-contained: they read
 * and write that function's locals (pos, mbpos, mbspace, mbseq, this_char,
 * str_len) and out-parameters (newpos, mbseqlen, status), and they return
 * from the enclosing function. Each one expands to a single statement
 * (do { } while (0)) so it is safe in unbraced if/else bodies. */

/* Publish the results and return the decoded character: store the new read
 * offset, NUL-terminate the collected byte sequence and report its length. */
#define MB_RETURN do { \
		*newpos = pos; \
		mbseq[mbpos] = '\0'; \
		*mbseqlen = mbpos; \
		return this_char; \
	} while (0)

/* Append one byte to the collected sequence. One slot is always kept free
 * for the terminating NUL: if taking this byte would use the last slot, the
 * byte is dropped and the function returns immediately via MB_RETURN. */
#define MB_WRITE(mbchar) do { \
		mbspace--; \
		if (mbspace == 0) { \
			MB_RETURN; \
		} \
		mbseq[mbpos++] = (mbchar); \
	} while (0)

/* skip one byte and return */
#define MB_FAILURE(pos) do { \
		*newpos = (pos) + 1; \
		*status = FAILURE; \
		return 0; \
	} while (0)

/* Fail unless at least chars_need bytes are available starting at pos.
 * On failure the read offset is moved one byte past pos (or left at pos
 * when chars_need is less than 1), status is set to FAILURE and 0 is
 * returned. */
#define CHECK_LEN(pos, chars_need) do { \
		if ((str_len - (pos)) < (chars_need)) { \
			*newpos = ((chars_need) < 1) ? (pos) : (pos) + 1; \
			*status = FAILURE; \
			return 0; \
		} \
	} while (0)
507
508 /* {{{ get_next_char
509 */
get_next_char(enum entity_charset charset,unsigned char * str,int str_len,int * newpos,unsigned char * mbseq,int * mbseqlen,int * status)510 inline static unsigned int get_next_char(enum entity_charset charset,
511 unsigned char * str,
512 int str_len,
513 int * newpos,
514 unsigned char * mbseq,
515 int * mbseqlen,
516 int *status)
517 {
518 int pos = *newpos;
519 int mbpos = 0;
520 int mbspace = *mbseqlen;
521 unsigned int this_char = 0;
522 unsigned char next_char;
523
524 *status = SUCCESS;
525
526 if (mbspace <= 0) {
527 *mbseqlen = 0;
528 CHECK_LEN(pos, 1);
529 *newpos = pos + 1;
530 return str[pos];
531 }
532
533 switch (charset) {
534 case cs_utf_8:
535 {
536 unsigned char c;
537 CHECK_LEN(pos, 1);
538 c = str[pos];
539 if (c < 0x80) {
540 MB_WRITE(c);
541 this_char = c;
542 pos++;
543 } else if (c < 0xc2) {
544 MB_FAILURE(pos);
545 } else if (c < 0xe0) {
546 CHECK_LEN(pos, 2);
547 if (str[pos + 1] < 0x80 || str[pos + 1] > 0xbf) {
548 MB_FAILURE(pos);
549 }
550 this_char = ((c & 0x1f) << 6) | (str[pos + 1] & 0x3f);
551 if (this_char < 0x80) {
552 MB_FAILURE(pos);
553 }
554 MB_WRITE((unsigned char)c);
555 MB_WRITE((unsigned char)str[pos + 1]);
556 pos += 2;
557 } else if (c < 0xf0) {
558 CHECK_LEN(pos, 3);
559 if (str[pos + 1] < 0x80 || str[pos + 1] > 0xbf) {
560 MB_FAILURE(pos);
561 }
562 if (str[pos + 2] < 0x80 || str[pos + 2] > 0xbf) {
563 MB_FAILURE(pos);
564 }
565 this_char = ((c & 0x0f) << 12) | ((str[pos + 1] & 0x3f) << 6) | (str[pos + 2] & 0x3f);
566 if (this_char < 0x800) {
567 MB_FAILURE(pos);
568 } else if (this_char >= 0xd800 && this_char <= 0xdfff) {
569 MB_FAILURE(pos);
570 }
571 MB_WRITE((unsigned char)c);
572 MB_WRITE((unsigned char)str[pos + 1]);
573 MB_WRITE((unsigned char)str[pos + 2]);
574 pos += 3;
575 } else if (c < 0xf5) {
576 CHECK_LEN(pos, 4);
577 if (str[pos + 1] < 0x80 || str[pos + 1] > 0xbf) {
578 MB_FAILURE(pos);
579 }
580 if (str[pos + 2] < 0x80 || str[pos + 2] > 0xbf) {
581 MB_FAILURE(pos);
582 }
583 if (str[pos + 3] < 0x80 || str[pos + 3] > 0xbf) {
584 MB_FAILURE(pos);
585 }
586 this_char = ((c & 0x07) << 18) | ((str[pos + 1] & 0x3f) << 12) | ((str[pos + 2] & 0x3f) << 6) | (str[pos + 3] & 0x3f);
587 if (this_char < 0x10000 || this_char > 0x10FFFF) {
588 MB_FAILURE(pos);
589 }
590 MB_WRITE((unsigned char)c);
591 MB_WRITE((unsigned char)str[pos + 1]);
592 MB_WRITE((unsigned char)str[pos + 2]);
593 MB_WRITE((unsigned char)str[pos + 3]);
594 pos += 4;
595 } else {
596 MB_FAILURE(pos);
597 }
598 }
599 break;
600 case cs_big5:
601 case cs_gb2312:
602 case cs_big5hkscs:
603 {
604 CHECK_LEN(pos, 1);
605 this_char = str[pos++];
606 /* check if this is the first of a 2-byte sequence */
607 if (this_char >= 0x81 && this_char <= 0xfe) {
608 /* peek at the next char */
609 CHECK_LEN(pos, 1);
610 next_char = str[pos++];
611 if ((next_char >= 0x40 && next_char <= 0x7e) ||
612 (next_char >= 0xa1 && next_char <= 0xfe)) {
613 /* yes, this a wide char */
614 MB_WRITE(this_char);
615 MB_WRITE(next_char);
616 this_char = (this_char << 8) | next_char;
617 } else {
618 MB_FAILURE(pos);
619 }
620 } else {
621 MB_WRITE(this_char);
622 }
623 }
624 break;
625 case cs_sjis:
626 {
627 CHECK_LEN(pos, 1);
628 this_char = str[pos++];
629 /* check if this is the first of a 2-byte sequence */
630 if ((this_char >= 0x81 && this_char <= 0x9f) ||
631 (this_char >= 0xe0 && this_char <= 0xfc)) {
632 /* peek at the next char */
633 CHECK_LEN(pos, 1);
634 next_char = str[pos++];
635 if ((next_char >= 0x40 && next_char <= 0x7e) ||
636 (next_char >= 0x80 && next_char <= 0xfc))
637 {
638 /* yes, this a wide char */
639 MB_WRITE(this_char);
640 MB_WRITE(next_char);
641 this_char = (this_char << 8) | next_char;
642 } else {
643 MB_FAILURE(pos);
644 }
645 } else {
646 MB_WRITE(this_char);
647 }
648 break;
649 }
650 case cs_eucjp:
651 {
652 CHECK_LEN(pos, 1);
653 this_char = str[pos++];
654 /* check if this is the first of a multi-byte sequence */
655 if (this_char >= 0xa1 && this_char <= 0xfe) {
656 /* peek at the next char */
657 CHECK_LEN(pos, 1);
658 next_char = str[pos++];
659 if (next_char >= 0xa1 && next_char <= 0xfe) {
660 /* yes, this a jis kanji char */
661 MB_WRITE(this_char);
662 MB_WRITE(next_char);
663 this_char = (this_char << 8) | next_char;
664 } else {
665 MB_FAILURE(pos);
666 }
667 } else if (this_char == 0x8e) {
668 /* peek at the next char */
669 CHECK_LEN(pos, 1);
670 next_char = str[pos++];
671 if (next_char >= 0xa1 && next_char <= 0xdf) {
672 /* JIS X 0201 kana */
673 MB_WRITE(this_char);
674 MB_WRITE(next_char);
675 this_char = (this_char << 8) | next_char;
676 } else {
677 MB_FAILURE(pos);
678 }
679 } else if (this_char == 0x8f) {
680 /* peek at the next two char */
681 unsigned char next2_char;
682 CHECK_LEN(pos, 2);
683 next_char = str[pos];
684 next2_char = str[pos + 1];
685 pos += 2;
686 if ((next_char >= 0xa1 && next_char <= 0xfe) &&
687 (next2_char >= 0xa1 && next2_char <= 0xfe)) {
688 /* JIS X 0212 hojo-kanji */
689 MB_WRITE(this_char);
690 MB_WRITE(next_char);
691 MB_WRITE(next2_char);
692 this_char = (this_char << 16) | (next_char << 8) | next2_char;
693 } else {
694 MB_FAILURE(pos);
695 }
696 } else {
697 MB_WRITE(this_char);
698 }
699 break;
700 }
701 default:
702 /* single-byte charsets */
703 CHECK_LEN(pos, 1);
704 this_char = str[pos++];
705 MB_WRITE(this_char);
706 break;
707 }
708 MB_RETURN;
709 }
710 /* }}} */
711
712 /* {{{ entity_charset determine_charset
713 * returns the charset identifier based on current locale or a hint.
714 * defaults to iso-8859-1 */
determine_charset(char * charset_hint TSRMLS_DC)715 static enum entity_charset determine_charset(char *charset_hint TSRMLS_DC)
716 {
717 int i;
718 enum entity_charset charset = cs_8859_1;
719 int len = 0;
720 zval *uf_result = NULL;
721
722 /* Guarantee default behaviour for backwards compatibility */
723 if (charset_hint == NULL)
724 return cs_8859_1;
725
726 if ((len = strlen(charset_hint)) != 0) {
727 goto det_charset;
728 }
729 #if HAVE_MBSTRING
730 #if !defined(COMPILE_DL_MBSTRING)
731 /* XXX: Ugly things. Why don't we look for a more sophisticated way? */
732 switch (MBSTRG(current_internal_encoding)) {
733 case mbfl_no_encoding_8859_1:
734 return cs_8859_1;
735
736 case mbfl_no_encoding_utf8:
737 return cs_utf_8;
738
739 case mbfl_no_encoding_euc_jp:
740 case mbfl_no_encoding_eucjp_win:
741 return cs_eucjp;
742
743 case mbfl_no_encoding_sjis:
744 case mbfl_no_encoding_sjis_open:
745 case mbfl_no_encoding_cp932:
746 return cs_sjis;
747
748 case mbfl_no_encoding_cp1252:
749 return cs_cp1252;
750
751 case mbfl_no_encoding_8859_15:
752 return cs_8859_15;
753
754 case mbfl_no_encoding_big5:
755 return cs_big5;
756
757 case mbfl_no_encoding_euc_cn:
758 case mbfl_no_encoding_hz:
759 case mbfl_no_encoding_cp936:
760 return cs_gb2312;
761
762 case mbfl_no_encoding_koi8r:
763 return cs_koi8r;
764
765 case mbfl_no_encoding_cp866:
766 return cs_cp866;
767
768 case mbfl_no_encoding_cp1251:
769 return cs_cp1251;
770
771 case mbfl_no_encoding_8859_5:
772 return cs_8859_5;
773
774 default:
775 ;
776 }
777 #else
778 {
779 zval nm_mb_internal_encoding;
780
781 ZVAL_STRING(&nm_mb_internal_encoding, "mb_internal_encoding", 0);
782
783 if (call_user_function_ex(CG(function_table), NULL, &nm_mb_internal_encoding, &uf_result, 0, NULL, 1, NULL TSRMLS_CC) != FAILURE) {
784
785 charset_hint = Z_STRVAL_P(uf_result);
786 len = Z_STRLEN_P(uf_result);
787
788 if (charset_hint != NULL && len != 0) {
789 if (len == 4) { /* sizeof(none|auto|pass)-1 */
790 if (!memcmp("pass", charset_hint, sizeof("pass") - 1) ||
791 !memcmp("auto", charset_hint, sizeof("auto") - 1) ||
792 !memcmp("none", charset_hint, sizeof("none") - 1)) {
793
794 charset_hint = NULL;
795 len = 0;
796 }
797 } else {
798 /* Jump to det_charset only if mbstring isn't one of above eq pass, auto, none.
799 Otherwise try default_charset next */
800 goto det_charset;
801 }
802 }
803 }
804 }
805 #endif
806 #endif
807
808 charset_hint = SG(default_charset);
809 if (charset_hint != NULL && (len=strlen(charset_hint)) != 0) {
810 goto det_charset;
811 }
812
813 /* try to detect the charset for the locale */
814 #if HAVE_NL_LANGINFO && HAVE_LOCALE_H && defined(CODESET)
815 charset_hint = nl_langinfo(CODESET);
816 if (charset_hint != NULL && (len=strlen(charset_hint)) != 0) {
817 goto det_charset;
818 }
819 #endif
820
821 #if HAVE_LOCALE_H
822 /* try to figure out the charset from the locale */
823 {
824 char *localename;
825 char *dot, *at;
826
827 /* lang[_territory][.codeset][@modifier] */
828 localename = setlocale(LC_CTYPE, NULL);
829
830 dot = strchr(localename, '.');
831 if (dot) {
832 dot++;
833 /* locale specifies a codeset */
834 at = strchr(dot, '@');
835 if (at)
836 len = at - dot;
837 else
838 len = strlen(dot);
839 charset_hint = dot;
840 } else {
841 /* no explicit name; see if the name itself
842 * is the charset */
843 charset_hint = localename;
844 len = strlen(charset_hint);
845 }
846 }
847 #endif
848
849 det_charset:
850
851 if (charset_hint) {
852 int found = 0;
853
854 /* now walk the charset map and look for the codeset */
855 for (i = 0; charset_map[i].codeset; i++) {
856 if (len == strlen(charset_map[i].codeset) && strncasecmp(charset_hint, charset_map[i].codeset, len) == 0) {
857 charset = charset_map[i].charset;
858 found = 1;
859 break;
860 }
861 }
862 if (!found) {
863 php_error_docref(NULL TSRMLS_CC, E_WARNING, "charset `%s' not supported, assuming iso-8859-1",
864 charset_hint);
865 }
866 }
867 if (uf_result != NULL) {
868 zval_ptr_dtor(&uf_result);
869 }
870 return charset;
871 }
872 /* }}} */
873
/* {{{ php_utf32_utf8
 * Encodes the value k as a UTF-8 style byte sequence into buf and appends a
 * terminating NUL byte.
 *
 * buf  output buffer; up to 6 sequence bytes plus the NUL are written, so
 *      it must provide room for at least 7 bytes
 * k    value to encode; no range or surrogate validation is done, values
 *      of 0x200000 and above use the 5- and 6-byte forms
 *
 * Returns the number of sequence bytes written, not counting the NUL. */
size_t php_utf32_utf8(unsigned char *buf, unsigned k)
{
	size_t retval = 0;

	if (k < 0x80) {
		/* 0xxxxxxx */
		buf[0] = k;
		retval = 1;
	} else if (k < 0x800) {
		/* 110xxxxx 10xxxxxx */
		buf[0] = 0xc0 | (k >> 6);
		buf[1] = 0x80 | (k & 0x3f);
		retval = 2;
	} else if (k < 0x10000) {
		/* 1110xxxx 10xxxxxx 10xxxxxx */
		buf[0] = 0xe0 | (k >> 12);
		buf[1] = 0x80 | ((k >> 6) & 0x3f);
		buf[2] = 0x80 | (k & 0x3f);
		retval = 3;
	} else if (k < 0x200000) {
		/* 11110xxx + 3 continuation bytes */
		buf[0] = 0xf0 | (k >> 18);
		buf[1] = 0x80 | ((k >> 12) & 0x3f);
		buf[2] = 0x80 | ((k >> 6) & 0x3f);
		buf[3] = 0x80 | (k & 0x3f);
		retval = 4;
	} else if (k < 0x4000000) {
		/* 111110xx + 4 continuation bytes */
		buf[0] = 0xf8 | (k >> 24);
		buf[1] = 0x80 | ((k >> 18) & 0x3f);
		buf[2] = 0x80 | ((k >> 12) & 0x3f);
		buf[3] = 0x80 | ((k >> 6) & 0x3f);
		buf[4] = 0x80 | (k & 0x3f);
		retval = 5;
	} else {
		/* 1111110x + 5 continuation bytes */
		buf[0] = 0xfc | (k >> 30);
		buf[1] = 0x80 | ((k >> 24) & 0x3f);
		buf[2] = 0x80 | ((k >> 18) & 0x3f);
		buf[3] = 0x80 | ((k >> 12) & 0x3f);
		buf[4] = 0x80 | ((k >> 6) & 0x3f);
		buf[5] = 0x80 | (k & 0x3f);
		retval = 6;
	}
	buf[retval] = '\0';

	return retval;
}
/* }}} */
918
919 /* {{{ php_unescape_html_entities
920 */
php_unescape_html_entities(unsigned char * old,int oldlen,int * newlen,int all,int quote_style,char * hint_charset TSRMLS_DC)921 PHPAPI char *php_unescape_html_entities(unsigned char *old, int oldlen, int *newlen, int all, int quote_style, char *hint_charset TSRMLS_DC)
922 {
923 int retlen;
924 int j, k;
925 char *replaced, *ret, *p, *q, *lim, *next;
926 enum entity_charset charset = determine_charset(hint_charset TSRMLS_CC);
927 unsigned char replacement[15];
928 int replacement_len;
929
930 ret = estrndup(old, oldlen);
931 retlen = oldlen;
932 if (!retlen) {
933 goto empty_source;
934 }
935
936 if (all) {
937 /* look for a match in the maps for this charset */
938 for (j = 0; entity_map[j].charset != cs_terminator; j++) {
939 if (entity_map[j].charset != charset)
940 continue;
941
942 for (k = entity_map[j].basechar; k <= entity_map[j].endchar; k++) {
943 unsigned char entity[32];
944 int entity_length = 0;
945
946 if (entity_map[j].table[k - entity_map[j].basechar] == NULL)
947 continue;
948
949 entity_length = slprintf(entity, sizeof(entity), "&%s;", entity_map[j].table[k - entity_map[j].basechar]);
950 if (entity_length >= sizeof(entity)) {
951 continue;
952 }
953
954 /* When we have MBCS entities in the tables above, this will need to handle it */
955 replacement_len = 0;
956 switch (charset) {
957 case cs_8859_1:
958 case cs_cp1252:
959 case cs_8859_15:
960 case cs_cp1251:
961 case cs_8859_5:
962 case cs_cp866:
963 case cs_koi8r:
964 replacement[0] = k;
965 replacement[1] = '\0';
966 replacement_len = 1;
967 break;
968
969 case cs_big5:
970 case cs_gb2312:
971 case cs_big5hkscs:
972 case cs_sjis:
973 case cs_eucjp:
974 /* we cannot properly handle those multibyte encodings
975 * with php_str_to_str. skip it. */
976 continue;
977
978 case cs_utf_8:
979 replacement_len = php_utf32_utf8(replacement, k);
980 break;
981
982 default:
983 php_error_docref(NULL TSRMLS_CC, E_WARNING, "cannot yet handle MBCS!");
984 efree(ret);
985 return NULL;
986 }
987
988 if (php_memnstr(ret, entity, entity_length, ret+retlen)) {
989 replaced = php_str_to_str(ret, retlen, entity, entity_length, replacement, replacement_len, &retlen);
990 efree(ret);
991 ret = replaced;
992 }
993 }
994 }
995 }
996
997 for (j = 0; basic_entities[j].charcode != 0; j++) {
998
999 if (basic_entities[j].flags && (quote_style & basic_entities[j].flags) == 0)
1000 continue;
1001
1002 replacement[0] = (unsigned char)basic_entities[j].charcode;
1003 replacement[1] = '\0';
1004
1005 if (php_memnstr(ret, basic_entities[j].entity, basic_entities[j].entitylen, ret+retlen)) {
1006 replaced = php_str_to_str(ret, retlen, basic_entities[j].entity, basic_entities[j].entitylen, replacement, 1, &retlen);
1007 efree(ret);
1008 ret = replaced;
1009 }
1010 }
1011
1012 /* replace numeric entities & "&" */
1013 lim = ret + retlen;
1014 for (p = ret, q = ret; p < lim;) {
1015 int code;
1016
1017 if (p[0] == '&') {
1018 if (p + 2 < lim) {
1019 if (p[1] == '#') {
1020 int invalid_code = 0;
1021
1022 if (p[2] == 'x' || p[2] == 'X') {
1023 code = strtol(p + 3, &next, 16);
1024 } else {
1025 code = strtol(p + 2, &next, 10);
1026 }
1027
1028 if ((code == '\'' && !(quote_style & ENT_HTML_QUOTE_SINGLE)) ||
1029 (code == '"' && !(quote_style & ENT_HTML_QUOTE_DOUBLE))) {
1030 invalid_code = 1;
1031 }
1032
1033 if (next != NULL && *next == ';' && !invalid_code) {
1034 switch (charset) {
1035 case cs_utf_8:
1036 q += php_utf32_utf8(q, code);
1037 break;
1038
1039 case cs_8859_1:
1040 case cs_8859_5:
1041 case cs_8859_15:
1042 if ((code >= 0x80 && code < 0xa0) || code > 0xff) {
1043 invalid_code = 1;
1044 } else {
1045 *(q++) = code;
1046 }
1047 break;
1048
1049 case cs_cp1252:
1050 if (code > 0xff) {
1051 invalid_code = 1;
1052 } else {
1053 *(q++) = code;
1054 }
1055 break;
1056
1057 case cs_cp1251:
1058 case cs_cp866:
1059 case cs_big5:
1060 case cs_big5hkscs:
1061 case cs_sjis:
1062 case cs_eucjp:
1063 if (code >= 0x80) {
1064 invalid_code = 1;
1065 } else {
1066 *(q++) = code;
1067 }
1068 break;
1069
1070 case cs_gb2312:
1071 if (code >= 0x81) {
1072 invalid_code = 1;
1073 } else {
1074 *(q++) = code;
1075 }
1076 break;
1077
1078 default:
1079 /* for backwards compatilibity */
1080 invalid_code = 1;
1081 break;
1082 }
1083 if (invalid_code) {
1084 for (; p <= next; p++) {
1085 *(q++) = *p;
1086 }
1087 }
1088 p = next + 1;
1089 } else {
1090 *(q++) = *(p++);
1091 *(q++) = *(p++);
1092 }
1093 } else if (p + 4 < lim &&
1094 p[1] == 'a' && p[2] == 'm' &&p[3] == 'p' &&
1095 p[4] == ';') {
1096 *(q++) = '&';
1097 p += 5;
1098 } else {
1099 *(q++) = *(p++);
1100 *(q++) = *(p++);
1101 }
1102 } else {
1103 *(q++) = *(p++);
1104 }
1105 } else {
1106 *(q++) = *(p++);
1107 }
1108 }
1109 *q = '\0';
1110 retlen = (size_t)(q - ret);
1111 empty_source:
1112 *newlen = retlen;
1113 return ret;
1114 }
1115 /* }}} */
1116
php_escape_html_entities(unsigned char * old,int oldlen,int * newlen,int all,int quote_style,char * hint_charset TSRMLS_DC)1117 PHPAPI char *php_escape_html_entities(unsigned char *old, int oldlen, int *newlen, int all, int quote_style, char *hint_charset TSRMLS_DC)
1118 {
1119 return php_escape_html_entities_ex(old, oldlen, newlen, all, quote_style, hint_charset, 1 TSRMLS_CC);
1120 }
1121
1122
1123 /* {{{ php_escape_html_entities
1124 */
php_escape_html_entities_ex(unsigned char * old,int oldlen,int * newlen,int all,int quote_style,char * hint_charset,zend_bool double_encode TSRMLS_DC)1125 PHPAPI char *php_escape_html_entities_ex(unsigned char *old, int oldlen, int *newlen, int all, int quote_style, char *hint_charset, zend_bool double_encode TSRMLS_DC)
1126 {
1127 int i, j, maxlen, len;
1128 char *replaced;
1129 enum entity_charset charset = determine_charset(hint_charset TSRMLS_CC);
1130 int matches_map;
1131
1132 maxlen = 2 * oldlen;
1133 if (maxlen < 128)
1134 maxlen = 128;
1135 replaced = emalloc (maxlen);
1136 len = 0;
1137 i = 0;
1138 while (i < oldlen) {
1139 unsigned char mbsequence[16]; /* allow up to 15 characters in a multibyte sequence */
1140 int mbseqlen = sizeof(mbsequence);
1141 int status = SUCCESS;
1142 unsigned int this_char = get_next_char(charset, old, oldlen, &i, mbsequence, &mbseqlen, &status);
1143
1144 if(status == FAILURE) {
1145 /* invalid MB sequence */
1146 if (quote_style & ENT_HTML_IGNORE_ERRORS) {
1147 continue;
1148 }
1149 efree(replaced);
1150 if(!PG(display_errors)) {
1151 php_error_docref(NULL TSRMLS_CC, E_WARNING, "Invalid multibyte sequence in argument");
1152 }
1153 *newlen = 0;
1154 return STR_EMPTY_ALLOC();
1155 }
1156 matches_map = 0;
1157
1158 if (len + 16 > maxlen)
1159 replaced = erealloc (replaced, maxlen += 128);
1160
1161 if (all) {
1162 /* look for a match in the maps for this charset */
1163 unsigned char *rep = NULL;
1164
1165
1166 for (j = 0; entity_map[j].charset != cs_terminator; j++) {
1167 if (entity_map[j].charset == charset
1168 && this_char >= entity_map[j].basechar
1169 && this_char <= entity_map[j].endchar) {
1170 rep = (unsigned char*)entity_map[j].table[this_char - entity_map[j].basechar];
1171 if (rep == NULL) {
1172 /* there is no entity for this position; fall through and
1173 * just output the character itself */
1174 break;
1175 }
1176
1177 matches_map = 1;
1178 break;
1179 }
1180 }
1181
1182 if (matches_map) {
1183 int l = strlen(rep);
1184 /* increase the buffer size */
1185 if (len + 2 + l >= maxlen) {
1186 replaced = erealloc(replaced, maxlen += 128);
1187 }
1188
1189 replaced[len++] = '&';
1190 strlcpy(replaced + len, rep, maxlen);
1191 len += l;
1192 replaced[len++] = ';';
1193 }
1194 }
1195 if (!matches_map) {
1196 int is_basic = 0;
1197
1198 if (this_char == '&') {
1199 if (double_encode) {
1200 encode_amp:
1201 memcpy(replaced + len, "&", sizeof("&") - 1);
1202 len += sizeof("&") - 1;
1203 } else {
1204 char *e = memchr(old + i, ';', oldlen - i);
1205 char *s = old + i;
1206
1207 if (!e || (e - s) > 10) { /* minor optimization to avoid "entities" over 10 chars in length */
1208 goto encode_amp;
1209 } else {
1210 if (*s == '#') { /* numeric entities */
1211 s++;
1212 /* Hex (Z) */
1213 if (*s == 'x' || *s == 'X') {
1214 s++;
1215 while (s < e) {
1216 if (!isxdigit((int)*(unsigned char *)s++)) {
1217 goto encode_amp;
1218 }
1219 }
1220 /* Dec (Z)*/
1221 } else {
1222 while (s < e) {
1223 if (!isdigit((int)*(unsigned char *)s++)) {
1224 goto encode_amp;
1225 }
1226 }
1227 }
1228 } else { /* text entities */
1229 while (s < e) {
1230 if (!isalnum((int)*(unsigned char *)s++)) {
1231 goto encode_amp;
1232 }
1233 }
1234 }
1235 replaced[len++] = '&';
1236 }
1237 }
1238 is_basic = 1;
1239 } else {
1240 for (j = 0; basic_entities[j].charcode != 0; j++) {
1241 if ((basic_entities[j].charcode != this_char) ||
1242 (basic_entities[j].flags &&
1243 (quote_style & basic_entities[j].flags) == 0)) {
1244 continue;
1245 }
1246
1247 memcpy(replaced + len, basic_entities[j].entity, basic_entities[j].entitylen);
1248 len += basic_entities[j].entitylen;
1249
1250 is_basic = 1;
1251 break;
1252 }
1253 }
1254
1255 if (!is_basic) {
1256 /* a wide char without a named entity; pass through the original sequence */
1257 if (mbseqlen > 1) {
1258 memcpy(replaced + len, mbsequence, mbseqlen);
1259 len += mbseqlen;
1260 } else {
1261 replaced[len++] = (unsigned char)this_char;
1262 }
1263 }
1264 }
1265 }
1266 replaced[len] = '\0';
1267 *newlen = len;
1268
1269 return replaced;
1270
1271
1272 }
1273 /* }}} */
1274
1275 /* {{{ php_html_entities
1276 */
php_html_entities(INTERNAL_FUNCTION_PARAMETERS,int all)1277 static void php_html_entities(INTERNAL_FUNCTION_PARAMETERS, int all)
1278 {
1279 char *str, *hint_charset = NULL;
1280 int str_len, hint_charset_len = 0;
1281 int len;
1282 long quote_style = ENT_COMPAT;
1283 char *replaced;
1284 zend_bool double_encode = 1;
1285
1286 if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|ls!b", &str, &str_len, "e_style, &hint_charset, &hint_charset_len, &double_encode) == FAILURE) {
1287 return;
1288 }
1289
1290 replaced = php_escape_html_entities_ex(str, str_len, &len, all, quote_style, hint_charset, double_encode TSRMLS_CC);
1291 RETVAL_STRINGL(replaced, len, 0);
1292 }
1293 /* }}} */
1294
1295 #define HTML_SPECIALCHARS 0
1296 #define HTML_ENTITIES 1
1297
1298 /* {{{ register_html_constants
1299 */
register_html_constants(INIT_FUNC_ARGS)1300 void register_html_constants(INIT_FUNC_ARGS)
1301 {
1302 REGISTER_LONG_CONSTANT("HTML_SPECIALCHARS", HTML_SPECIALCHARS, CONST_PERSISTENT|CONST_CS);
1303 REGISTER_LONG_CONSTANT("HTML_ENTITIES", HTML_ENTITIES, CONST_PERSISTENT|CONST_CS);
1304 REGISTER_LONG_CONSTANT("ENT_COMPAT", ENT_COMPAT, CONST_PERSISTENT|CONST_CS);
1305 REGISTER_LONG_CONSTANT("ENT_QUOTES", ENT_QUOTES, CONST_PERSISTENT|CONST_CS);
1306 REGISTER_LONG_CONSTANT("ENT_NOQUOTES", ENT_NOQUOTES, CONST_PERSISTENT|CONST_CS);
1307 REGISTER_LONG_CONSTANT("ENT_IGNORE", ENT_IGNORE, CONST_PERSISTENT|CONST_CS);
1308 }
1309 /* }}} */
1310
1311 /* {{{ proto string htmlspecialchars(string string [, int quote_style[, string charset[, bool double_encode]]])
1312 Convert special characters to HTML entities */
PHP_FUNCTION(htmlspecialchars)1313 PHP_FUNCTION(htmlspecialchars)
1314 {
1315 php_html_entities(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0);
1316 }
1317 /* }}} */
1318
1319 /* {{{ proto string htmlspecialchars_decode(string string [, int quote_style])
1320 Convert special HTML entities back to characters */
PHP_FUNCTION(htmlspecialchars_decode)1321 PHP_FUNCTION(htmlspecialchars_decode)
1322 {
1323 char *str, *new_str, *e, *p;
1324 int len, j, i, new_len;
1325 long quote_style = ENT_COMPAT;
1326 struct basic_entities_dec basic_entities_dec[8];
1327
1328 if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|l", &str, &len, "e_style) == FAILURE) {
1329 return;
1330 }
1331
1332 new_str = estrndup(str, len);
1333 new_len = len;
1334 e = new_str + new_len;
1335
1336 if (!(p = memchr(new_str, '&', new_len))) {
1337 RETURN_STRINGL(new_str, new_len, 0);
1338 }
1339
1340 for (j = 0, i = 0; basic_entities[i].charcode != 0; i++) {
1341 if (basic_entities[i].flags && !(quote_style & basic_entities[i].flags)) {
1342 continue;
1343 }
1344 basic_entities_dec[j].charcode = basic_entities[i].charcode;
1345 memcpy(basic_entities_dec[j].entity, basic_entities[i].entity, basic_entities[i].entitylen + 1);
1346 basic_entities_dec[j].entitylen = basic_entities[i].entitylen;
1347 j++;
1348 }
1349 basic_entities_dec[j].charcode = '&';
1350 basic_entities_dec[j].entitylen = sizeof("&") - 1;
1351 memcpy(basic_entities_dec[j].entity, "&", sizeof("&"));
1352 i = j + 1;
1353
1354 do {
1355 int l = e - p;
1356
1357 for (j = 0; j < i; j++) {
1358 if (basic_entities_dec[j].entitylen > l) {
1359 continue;
1360 }
1361 if (!memcmp(p, basic_entities_dec[j].entity, basic_entities_dec[j].entitylen)) {
1362 int e_len = basic_entities_dec[j].entitylen - 1;
1363
1364 *p++ = basic_entities_dec[j].charcode;
1365 memmove(p, p + e_len, (e - p - e_len));
1366 e -= e_len;
1367 goto done;
1368 }
1369 }
1370 p++;
1371
1372 done:
1373 if (p >= e) {
1374 break;
1375 }
1376 } while ((p = memchr(p, '&', (e - p))));
1377
1378 new_len = e - new_str;
1379
1380 new_str[new_len] = '\0';
1381 RETURN_STRINGL(new_str, new_len, 0);
1382 }
1383 /* }}} */
1384
1385 /* {{{ proto string html_entity_decode(string string [, int quote_style][, string charset])
1386 Convert all HTML entities to their applicable characters */
PHP_FUNCTION(html_entity_decode)1387 PHP_FUNCTION(html_entity_decode)
1388 {
1389 char *str, *hint_charset = NULL;
1390 int str_len, hint_charset_len = 0, len;
1391 long quote_style = ENT_COMPAT;
1392 char *replaced;
1393
1394 if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|ls", &str, &str_len,
1395 "e_style, &hint_charset, &hint_charset_len) == FAILURE) {
1396 return;
1397 }
1398
1399 replaced = php_unescape_html_entities(str, str_len, &len, 1, quote_style, hint_charset TSRMLS_CC);
1400 if (replaced) {
1401 RETURN_STRINGL(replaced, len, 0);
1402 }
1403 RETURN_FALSE;
1404 }
1405 /* }}} */
1406
1407
1408 /* {{{ proto string htmlentities(string string [, int quote_style[, string charset[, bool double_encode]]])
1409 Convert all applicable characters to HTML entities */
PHP_FUNCTION(htmlentities)1410 PHP_FUNCTION(htmlentities)
1411 {
1412 php_html_entities(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1);
1413 }
1414 /* }}} */
1415
1416 /* {{{ proto array get_html_translation_table([int table [, int quote_style [, string charset_hint]]])
1417 Returns the internal translation table used by htmlspecialchars and htmlentities */
PHP_FUNCTION(get_html_translation_table)1418 PHP_FUNCTION(get_html_translation_table)
1419 {
1420 long which = HTML_SPECIALCHARS, quote_style = ENT_COMPAT;
1421 unsigned int i;
1422 int j;
1423 unsigned char ind[5]; /* max # of 8-bit code units (4; for UTF-8) + 1 for \0 */
1424 void *dummy;
1425 char *charset_hint = NULL;
1426 int charset_hint_len;
1427 enum entity_charset charset;
1428
1429 if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "|lls",
1430 &which, "e_style, &charset_hint, &charset_hint_len) == FAILURE) {
1431 return;
1432 }
1433
1434 charset = determine_charset(charset_hint TSRMLS_CC);
1435
1436 array_init(return_value);
1437
1438 switch (which) {
1439 case HTML_ENTITIES:
1440 for (j = 0; entity_map[j].charset != cs_terminator; j++) {
1441 if (entity_map[j].charset != charset)
1442 continue;
1443 for (i = 0; i <= entity_map[j].endchar - entity_map[j].basechar; i++) {
1444 char buffer[16];
1445 unsigned k;
1446 size_t written;
1447
1448 if (entity_map[j].table[i] == NULL)
1449 continue;
1450
1451 k = i + entity_map[j].basechar;
1452
1453 switch (charset) {
1454 case cs_utf_8:
1455 written = php_utf32_utf8(ind, k);
1456 ind[written] = '\0';
1457 break;
1458 case cs_big5:
1459 case cs_gb2312:
1460 case cs_big5hkscs:
1461 case cs_sjis:
1462 /* we have no mappings for these, but if we had... */
1463 /* break through */
1464 default: /* one byte */
1465 written = 1;
1466 ind[0] = (unsigned char)k;
1467 ind[1] = '\0';
1468 break;
1469 }
1470
1471 snprintf(buffer, sizeof(buffer), "&%s;", entity_map[j].table[i]);
1472 if (zend_hash_find(Z_ARRVAL_P(return_value), (const char*)ind, written+1, &dummy) == FAILURE) {
1473 /* in case of the single quote, which is repeated, the first one wins,
1474 * so don't replace the existint mapping */
1475 add_assoc_string(return_value, (const char*)ind, buffer, 1);
1476 }
1477 }
1478 }
1479 /* break thru */
1480
1481 case HTML_SPECIALCHARS:
1482 add_assoc_stringl(return_value, "&", "&", sizeof("&") - 1, 1);
1483 for (j = 0; basic_entities[j].charcode != 0; j++) {
1484 if (basic_entities[j].flags && (quote_style & basic_entities[j].flags) == 0)
1485 continue;
1486
1487 ind[0] = (unsigned char)basic_entities[j].charcode;
1488 ind[1] = '\0';
1489 if (zend_hash_find(Z_ARRVAL_P(return_value), (const char*)ind, 2, &dummy) == FAILURE) {
1490 add_assoc_stringl(return_value, ind, basic_entities[j].entity,
1491 basic_entities[j].entitylen, 1);
1492 }
1493 }
1494
1495 break;
1496 }
1497 }
1498 /* }}} */
1499
1500 /*
1501 * Local variables:
1502 * tab-width: 4
1503 * c-basic-offset: 4
1504 * End:
1505 * vim600: sw=4 ts=4 fdm=marker
1506 * vim<600: sw=4 ts=4
1507 */
1508