xref: /PHP-5.3/ext/standard/html.c (revision a2045ff3)
1 /*
2    +----------------------------------------------------------------------+
3    | PHP Version 5                                                        |
4    +----------------------------------------------------------------------+
5    | Copyright (c) 1997-2013 The PHP Group                                |
6    +----------------------------------------------------------------------+
7    | This source file is subject to version 3.01 of the PHP license,      |
8    | that is bundled with this package in the file LICENSE, and is        |
9    | available through the world-wide-web at the following url:           |
10    | http://www.php.net/license/3_01.txt                                  |
11    | If you did not receive a copy of the PHP license and are unable to   |
12    | obtain it through the world-wide-web, please send a note to          |
13    | license@php.net so we can mail you a copy immediately.               |
14    +----------------------------------------------------------------------+
15    | Authors: Rasmus Lerdorf <rasmus@php.net>                             |
16    |          Jaakko Hyvätti <jaakko.hyvatti@iki.fi>                      |
17    |          Wez Furlong <wez@thebrainroom.com>                          |
18    +----------------------------------------------------------------------+
19 */
20 
21 /* $Id$ */
22 
23 /*
24  * HTML entity resources:
25  *
26  * http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset2.asp
27  * http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset3.asp
28  * http://www.unicode.org/Public/MAPPINGS/OBSOLETE/UNI2SGML.TXT
29  *
30  * http://www.w3.org/TR/2002/REC-xhtml1-20020801/dtds.html#h-A2
31  *
32  */
33 
34 #include "php.h"
35 #if PHP_WIN32
36 #include "config.w32.h"
37 #else
38 #include <php_config.h>
39 #endif
40 #include "html.h"
41 #include "php_string.h"
42 #include "SAPI.h"
43 #if HAVE_LOCALE_H
44 #include <locale.h>
45 #endif
46 #if HAVE_LANGINFO_H
47 #include <langinfo.h>
48 #endif
49 
50 #if HAVE_MBSTRING
51 # include "ext/mbstring/mbstring.h"
52 ZEND_EXTERN_MODULE_GLOBALS(mbstring)
53 #endif
54 
/*
 * Identifiers for the character sets the entity tables know about.
 * cs_terminator is not a real charset: it marks the last row of entity_map.
 */
55 enum entity_charset { cs_terminator, cs_8859_1, cs_cp1252,
56 					  cs_8859_15, cs_utf_8, cs_big5, cs_gb2312,
57 					  cs_big5hkscs, cs_sjis, cs_eucjp, cs_koi8r,
58 					  cs_cp1251, cs_8859_5, cs_cp866, cs_macroman
59 					};
/*
 * Element type of the entity tables below: an immutable pointer to an
 * immutable string. The tables use NULL for positions that have no entity.
 */
60 typedef const char *const entity_table_t;
61 
62 /* codepage 1252 is a Windows extension to iso-8859-1. */
/*
 * Entity names for the cp1252-specific range; entity_map registers this
 * table for char codes 0x80..0x9f, so entry i belongs to code 0x80 + i.
 * Names are stored bare (no leading '&', no trailing ';'); NULL means the
 * position has no entity.
 */
63 static entity_table_t ent_cp_1252[] = {
64 	"euro", NULL, "sbquo", "fnof", "bdquo", "hellip", "dagger",
65 	"Dagger", "circ", "permil", "Scaron", "lsaquo", "OElig",
66 	NULL, NULL, NULL, NULL, "lsquo", "rsquo", "ldquo", "rdquo",
67 	"bull", "ndash", "mdash", "tilde", "trade", "scaron", "rsaquo",
68 	"oelig", NULL, NULL, "Yuml"
69 };
70 
/*
 * Entity names for the ISO-8859-1 upper half. entity_map registers this
 * table with base 0xa0 / end 0xff for cp1252, iso-8859-1, utf-8, big5,
 * gb2312, big5hkscs, sjis and euc-jp, so entry i belongs to code 0xa0 + i
 * ("nbsp" is 0xa0, "yuml" is 0xff).
 */
71 static entity_table_t ent_iso_8859_1[] = {
72 	"nbsp", "iexcl", "cent", "pound", "curren", "yen", "brvbar",
73 	"sect", "uml", "copy", "ordf", "laquo", "not", "shy", "reg",
74 	"macr", "deg", "plusmn", "sup2", "sup3", "acute", "micro",
75 	"para", "middot", "cedil", "sup1", "ordm", "raquo", "frac14",
76 	"frac12", "frac34", "iquest", "Agrave", "Aacute", "Acirc",
77 	"Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave",
78 	"Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc",
79 	"Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde",
80 	"Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml",
81 	"Yacute", "THORN", "szlig", "agrave", "aacute", "acirc",
82 	"atilde", "auml", "aring", "aelig", "ccedil", "egrave",
83 	"eacute", "ecirc", "euml", "igrave", "iacute", "icirc",
84 	"iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde",
85 	"ouml", "divide", "oslash", "ugrave", "uacute", "ucirc",
86 	"uuml", "yacute", "thorn", "yuml"
87 };
88 
/*
 * Entity names for ISO-8859-15; entity_map registers this table for char
 * codes 0xa0..0xff, so entry i belongs to code 0xa0 + i. It differs from
 * ent_iso_8859_1 in eight positions (euro, Scaron, scaron, OElig, oelig,
 * Yuml, and the two NULL slots marked Zcaron / zcaron below, which have no
 * entity name in this table).
 */
89 static entity_table_t ent_iso_8859_15[] = {
90 	"nbsp", "iexcl", "cent", "pound", "euro", "yen", "Scaron",
91 	"sect", "scaron", "copy", "ordf", "laquo", "not", "shy", "reg",
92 	"macr", "deg", "plusmn", "sup2", "sup3", NULL, /* Zcaron */
93 	"micro", "para", "middot", NULL, /* zcaron */ "sup1", "ordm",
94 	"raquo", "OElig", "oelig", "Yuml", "iquest", "Agrave", "Aacute",
95 	"Acirc", "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave",
96 	"Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc",
97 	"Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde",
98 	"Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml",
99 	"Yacute", "THORN", "szlig", "agrave", "aacute", "acirc",
100 	"atilde", "auml", "aring", "aelig", "ccedil", "egrave",
101 	"eacute", "ecirc", "euml", "igrave", "iacute", "icirc",
102 	"iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde",
103 	"ouml", "divide", "oslash", "ugrave", "uacute", "ucirc",
104 	"uuml", "yacute", "thorn", "yuml"
105 };
106 
107 static entity_table_t ent_uni_338_402[] = {
108 	/* 338 (0x0152) */
109 	"OElig", "oelig", NULL, NULL, NULL, NULL,
110 	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
111 	/* 352 (0x0160) */
112 	"Scaron", "scaron", NULL, NULL, NULL, NULL, NULL, NULL,
113 	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
114 	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
115 	/* 376 (0x0178) */
116 	"Yuml", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
117 	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
118 	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
119 	/* 400 (0x0190) */
120 	NULL, NULL, "fnof"
121 };
122 
123 static entity_table_t ent_uni_spacing[] = {
124 	/* 710 */
125 	"circ",
126 	/* 711 - 730 */
127 	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
128 	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
129 	/* 731 - 732 */
130 	NULL, "tilde"
131 };
132 
133 static entity_table_t ent_uni_greek[] = {
134 	/* 913 */
135 	"Alpha", "Beta", "Gamma", "Delta", "Epsilon", "Zeta", "Eta", "Theta",
136 	"Iota", "Kappa", "Lambda", "Mu", "Nu", "Xi", "Omicron", "Pi", "Rho",
137 	NULL, "Sigma", "Tau", "Upsilon", "Phi", "Chi", "Psi", "Omega",
138 	/* 938 - 944 are not mapped */
139 	NULL, NULL, NULL, NULL, NULL, NULL, NULL,
140 	"alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta",
141 	"iota", "kappa", "lambda", "mu", "nu", "xi", "omicron", "pi", "rho",
142 	"sigmaf", "sigma", "tau", "upsilon", "phi", "chi", "psi", "omega",
143 	/* 970 - 976 are not mapped */
144 	NULL, NULL, NULL, NULL, NULL, NULL, NULL,
145 	"thetasym", "upsih",
146 	NULL, NULL, NULL,
147 	"piv"
148 };
149 
150 static entity_table_t ent_uni_punct[] = {
151 	/* 8194 */
152 	"ensp", "emsp", NULL, NULL, NULL, NULL, NULL,
153 	"thinsp", NULL, NULL, "zwnj", "zwj", "lrm", "rlm",
154 	NULL, NULL, NULL, "ndash", "mdash", NULL, NULL, NULL,
155 	/* 8216 */
156 	"lsquo", "rsquo", "sbquo", NULL, "ldquo", "rdquo", "bdquo", NULL,
157 	"dagger", "Dagger", "bull", NULL, NULL, NULL, "hellip",
158 	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "permil", NULL,
159 	/* 8242 */
160 	"prime", "Prime", NULL, NULL, NULL, NULL, NULL, "lsaquo", "rsaquo", NULL,
161 	NULL, NULL, "oline", NULL, NULL, NULL, NULL, NULL,
162 	"frasl"
163 };
164 
165 static entity_table_t ent_uni_euro[] = {
166 	"euro"
167 };
168 
169 static entity_table_t ent_uni_8465_8501[] = {
170 	/* 8465 */
171 	"image", NULL, NULL, NULL, NULL, NULL, NULL,
172 	/* 8472 */
173 	"weierp", NULL, NULL, NULL,
174 	/* 8476 */
175 	"real", NULL, NULL, NULL, NULL, NULL,
176 	/* 8482 */
177 	"trade", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
178 	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
179 	/* 8501 */
180 	"alefsym",
181 };
182 
183 static entity_table_t ent_uni_8592_9002[] = {
184 	/* 8592 (0x2190) */
185 	"larr", "uarr", "rarr", "darr", "harr", NULL, NULL, NULL,
186 	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
187 	/* 8608 (0x21a0) */
188 	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
189 	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
190 	/* 8624 (0x21b0) */
191 	NULL, NULL, NULL, NULL, NULL, "crarr", NULL, NULL,
192 	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
193 	/* 8640 (0x21c0) */
194 	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
195 	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
196 	/* 8656 (0x21d0) */
197 	"lArr", "uArr", "rArr", "dArr", "hArr", NULL, NULL, NULL,
198 	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
199 	/* 8672 (0x21e0) */
200 	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
201 	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
202 	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
203 	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
204 	/* 8704 (0x2200) */
205 	"forall", NULL, "part", "exist", NULL, "empty", NULL, "nabla",
206 	"isin", "notin", NULL, "ni", NULL, NULL, NULL, "prod",
207 	/* 8720 (0x2210) */
208 	NULL, "sum", "minus", NULL, NULL, NULL, NULL, "lowast",
209 	NULL, NULL, "radic", NULL, NULL, "prop", "infin", NULL,
210 	/* 8736 (0x2220) */
211 	"ang", NULL, NULL, NULL, NULL, NULL, NULL, "and",
212 	"or", "cap", "cup", "int", NULL, NULL, NULL, NULL,
213 	/* 8752 (0x2230) */
214 	NULL, NULL, NULL, NULL, "there4", NULL, NULL, NULL,
215 	NULL, NULL, NULL, NULL, "sim", NULL, NULL, NULL,
216 	/* 8768 (0x2240) */
217 	NULL, NULL, NULL, NULL, NULL, "cong", NULL, NULL,
218 	"asymp", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
219 	/* 8784 (0x2250) */
220 	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
221 	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
222 	/* 8800 (0x2260) */
223 	"ne", "equiv", NULL, NULL, "le", "ge", NULL, NULL,
224 	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
225 	/* 8816 (0x2270) */
226 	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
227 	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
228 	/* 8832 (0x2280) */
229 	NULL, NULL, "sub", "sup", "nsub", NULL, "sube", "supe",
230 	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
231 	/* 8848 (0x2290) */
232 	NULL, NULL, NULL, NULL, NULL, "oplus", NULL, "otimes",
233 	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
234 	/* 8864 (0x22a0) */
235 	NULL, NULL, NULL, NULL, NULL, "perp", NULL, NULL,
236 	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
237 	/* 8880 (0x22b0) */
238 	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
239 	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
240 	/* 8896 (0x22c0) */
241 	NULL, NULL, NULL, NULL, NULL, "sdot", NULL, NULL,
242 	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
243 	/* 8912 (0x22d0) */
244 	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
245 	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
246 	/* 8928 (0x22e0) */
247 	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
248 	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
249 	/* 8944 (0x22f0) */
250 	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
251 	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
252 	/* 8960 (0x2300) */
253 	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
254 	"lceil", "rceil", "lfloor", "rfloor", NULL, NULL, NULL, NULL,
255 	/* 8976 (0x2310) */
256 	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
257 	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
258 	/* 8992 (0x2320) */
259 	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
260 	NULL, "lang", "rang"
261 };
262 
263 static entity_table_t ent_uni_9674[] = {
264 	/* 9674 */
265 	"loz"
266 };
267 
268 static entity_table_t ent_uni_9824_9830[] = {
269 	/* 9824 */
270 	"spades", NULL, NULL, "clubs", NULL, "hearts", "diams"
271 };
272 
/*
 * Numeric character references for KOI8-R; entity_map registers this table
 * for char codes 0xa3..0xff, so entry i belongs to code 0xa3 + i. Only
 * 0xa3, 0xb3 and the letters from 0xc0 upwards are mapped; everything in
 * between is NULL. References are stored bare ("#1105", not "&#1105;").
 */
273 static entity_table_t ent_koi8r[] = {
274 	"#1105", /* "jo "*/
275 	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
276 	NULL, NULL, NULL, NULL, NULL, "#1025", /* "JO" */
277 	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
278 	"#1102", "#1072", "#1073", "#1094", "#1076", "#1077", "#1092",
279 	"#1075", "#1093", "#1080", "#1081", "#1082", "#1083", "#1084",
280 	"#1085", "#1086", "#1087", "#1103", "#1088", "#1089", "#1090",
281 	"#1091", "#1078", "#1074", "#1100", "#1099", "#1079", "#1096",
282 	"#1101", "#1097", "#1095", "#1098", "#1070", "#1040", "#1041",
283 	"#1062", "#1044", "#1045", "#1060", "#1043", "#1061", "#1048",
284 	"#1049", "#1050", "#1051", "#1052", "#1053", "#1054", "#1055",
285 	"#1071", "#1056", "#1057", "#1058", "#1059", "#1046", "#1042",
286 	"#1068", "#1067", "#1047", "#1064", "#1069", "#1065", "#1063",
287 	"#1066"
288 };
289 
290 static entity_table_t ent_cp_1251[] = {
291 	"#1026", "#1027", "#8218", "#1107", "#8222", "hellip", "dagger",
292 	"Dagger", "euro", "permil", "#1033", "#8249", "#1034", "#1036",
293 	"#1035", "#1039", "#1106", "#8216", "#8217", "#8219", "#8220",
294 	"bull", "ndash", "mdash", NULL, "trade", "#1113", "#8250",
295 	"#1114", "#1116", "#1115", "#1119", "nbsp", "#1038", "#1118",
296 	"#1032", "curren", "#1168", "brvbar", "sect", "#1025", "copy",
297 	"#1028", "laquo", "not", "shy", "reg", "#1031", "deg", "plusmn",
298 	"#1030", "#1110", "#1169", "micro", "para", "middot", "#1105",
299 	"#8470", "#1108", "raquo", "#1112", "#1029", "#1109", "#1111",
300 	"#1040", "#1041", "#1042", "#1043", "#1044", "#1045", "#1046",
301 	"#1047", "#1048", "#1049", "#1050", "#1051", "#1052", "#1053",
302 	"#1054", "#1055", "#1056", "#1057", "#1058", "#1059", "#1060",
303 	"#1061", "#1062", "#1063", "#1064", "#1065", "#1066", "#1067",
304 	"#1068", "#1069", "#1070", "#1071", "#1072", "#1073", "#1074",
305 	"#1075", "#1076", "#1077", "#1078", "#1079", "#1080", "#1081",
306 	"#1082", "#1083", "#1084", "#1085", "#1086", "#1087", "#1088",
307 	"#1089", "#1090", "#1091", "#1092", "#1093", "#1094", "#1095",
308 	"#1096", "#1097", "#1098", "#1099", "#1100", "#1101", "#1102",
309 	"#1103"
310 };
311 
312 static entity_table_t ent_iso_8859_5[] = {
313 	"#1056", "#1057", "#1058", "#1059", "#1060", "#1061", "#1062",
314 	"#1063", "#1064", "#1065", "#1066", "#1067", "#1068", "#1069",
315 	"#1070", "#1071", "#1072", "#1073", "#1074", "#1075", "#1076",
316 	"#1077", "#1078", "#1079", "#1080", "#1081", "#1082", "#1083",
317 	"#1084", "#1085", "#1086", "#1087", "#1088", "#1089", "#1090",
318 	"#1091", "#1092", "#1093", "#1094", "#1095", "#1096", "#1097",
319 	"#1098", "#1099", "#1100", "#1101", "#1102", "#1103", "#1104",
320 	"#1105", "#1106", "#1107", "#1108", "#1109", "#1110", "#1111",
321 	"#1112", "#1113", "#1114", "#1115", "#1116", "#1117", "#1118",
322 	"#1119"
323 };
324 
/*
 * Numeric character references for cp866; entity_map registers this table
 * for char codes 0xc0..0xff, so entry i belongs to code 0xc0 + i.
 * References are stored bare ("#9492", not "&#9492;").
 */
325 static entity_table_t ent_cp_866[] = {
326 
327 	"#9492", "#9524", "#9516", "#9500", "#9472", "#9532", "#9566",
328 	"#9567", "#9562", "#9556", "#9577", "#9574", "#9568", "#9552",
329 	"#9580", "#9575", "#9576", "#9572", "#9573", "#9561", "#9560",
330 	"#9554", "#9555", "#9579", "#9578", "#9496", "#9484", "#9608",
331 	"#9604", "#9612", "#9616", "#9600", "#1088", "#1089", "#1090",
332 	"#1091", "#1092", "#1093", "#1094", "#1095", "#1096", "#1097",
333 	"#1098", "#1099", "#1100", "#1101", "#1102", "#1103", "#1025",
334 	"#1105", "#1028", "#1108", "#1031", "#1111", "#1038", "#1118",
335 	"#176", "#8729", "#183", "#8730", "#8470", "#164",  "#9632",
336 	"#160"
337 };
338 
339 /* MacRoman has a couple of low-ascii chars that need mapping too */
340 /* Vertical tab (ASCII 11) is often used to store line breaks inside */
341 /* DB exports, this mapping changes it to a space */
/*
 * entity_map registers this table for char codes 0x0b..0xff, so entry i
 * belongs to code 0x0b + i. Below 0x80 only 0x0b ("sp"), '"' ("quot"),
 * '&' ("amp"), '<' ("lt") and '>' ("gt") are mapped; the MacRoman-specific
 * characters start at 0x80 ("Auml").
 * NOTE(review): "sp" does not look like a standard HTML named entity --
 * confirm that emitting "&sp;" for a vertical tab is really intended.
 */
342 static entity_table_t ent_macroman[] = {
343 	"sp", NULL, NULL, NULL,
344 	NULL, NULL, NULL, NULL, NULL, NULL, NULL,
345 	NULL, NULL, NULL, NULL, NULL, NULL, NULL,
346 	NULL, NULL, NULL, NULL, NULL, "quot", NULL,
347 	NULL, NULL, "amp", NULL, NULL, NULL, NULL,
348 	NULL, NULL, NULL, NULL, NULL, NULL, NULL,
349 	NULL, NULL, NULL, NULL, NULL, NULL, NULL,
350 	NULL, NULL, NULL, "lt", NULL, "gt", NULL,
351 	NULL, NULL, NULL, NULL, NULL, NULL, NULL,
352 	NULL, NULL, NULL, NULL, NULL, NULL, NULL,
353 	NULL, NULL, NULL, NULL, NULL, NULL, NULL,
354 	NULL, NULL, NULL, NULL, NULL, NULL, NULL,
355 	NULL, NULL, NULL, NULL, NULL, NULL, NULL,
356 	NULL, NULL, NULL, NULL, NULL, NULL, NULL,
357 	NULL, NULL, NULL, NULL, NULL, NULL, NULL,
358 	NULL, NULL, NULL, NULL, NULL, NULL, NULL,
359 	NULL, NULL, NULL, NULL, NULL, NULL, NULL,
360 	NULL, "Auml", "Aring", "Ccedil", "Eacute", "Ntilde", "Ouml",
361 	"Uuml", "aacute", "agrave", "acirc", "auml", "atilde", "aring",
362 	"ccedil", "eacute", "egrave", "ecirc", "euml", "iacute", "igrave",
363 	"icirc", "iuml", "ntilde", "oacute", "ograve", "ocirc", "ouml",
364 	"otilde", "uacute", "ugrave", "ucirc", "uuml", "dagger", "deg",
365 	"cent", "pound", "sect", "bull", "para", "szlig", "reg",
366 	"copy", "trade", "acute", "uml", "ne", "AElig", "Oslash",
367 	"infin", "plusmn", "le", "ge", "yen", "micro", "part",
368 	"sum", "prod", "pi", "int", "ordf", "ordm", "Omega",
369 	"aelig", "oslash", "iquest", "iexcl", "not", "radic", "fnof",
370 	"asymp", "#8710", "laquo", "raquo", "hellip", "nbsp", "Agrave",
371 	"Atilde", "Otilde", "OElig", "oelig", "ndash", "mdash", "ldquo",
372 	"rdquo", "lsquo", "rsquo", "divide", "loz", "yuml", "Yuml",
373 	"frasl", "euro", "lsaquo", "rsaquo", "#xFB01", "#xFB02", "Dagger",
374 	"middot", "sbquo", "bdquo", "permil", "Acirc", "Ecirc", "Aacute",
375 	"Euml", "Egrave", "Iacute", "Icirc", "Iuml", "Igrave", "Oacute",
376 	"Ocirc", "#xF8FF", "Ograve", "Uacute", "Ucirc", "Ugrave", "#305",
377 	"circ", "tilde", "macr", "#728", "#729", "#730", "cedil",
378 	"#733", "#731", "#711"
379 };
380 
/*
 * One row of entity_map: ties a contiguous range of character codes in a
 * given charset to the table holding the entity strings for that range.
 * table[0] corresponds to basechar and the last entry to endchar.
 */
381 struct html_entity_map {
382 	enum entity_charset charset;	/* charset identifier */
383 	unsigned int basechar;			/* char code at start of table */
384 	unsigned int endchar;			/* last char code in the table */
385 	entity_table_t *table;			/* the table of mappings */
386 };
387 
/*
 * Master list of entity tables, one row per (charset, code range).
 * A charset may own several rows (cp1252 has two, utf-8 has ten); rows of
 * the same charset are adjacent. The list ends with a row whose charset is
 * cs_terminator (its remaining fields are zero-initialised).
 * Ranges are written in hex for the 8-bit charsets and in decimal for the
 * Unicode code point ranges.
 */
388 static const struct html_entity_map entity_map[] = {
389 	{ cs_cp1252, 		0x80, 0x9f, ent_cp_1252 },
390 	{ cs_cp1252, 		0xa0, 0xff, ent_iso_8859_1 },
391 	{ cs_8859_1, 		0xa0, 0xff, ent_iso_8859_1 },
392 	{ cs_8859_15, 		0xa0, 0xff, ent_iso_8859_15 },
393 	{ cs_utf_8, 		0xa0, 0xff, ent_iso_8859_1 },
394 	{ cs_utf_8, 		338,  402,  ent_uni_338_402 },
395 	{ cs_utf_8, 		710,  732,  ent_uni_spacing },
396 	{ cs_utf_8, 		913,  982,  ent_uni_greek },
397 	{ cs_utf_8, 		8194, 8260, ent_uni_punct },
398 	{ cs_utf_8, 		8364, 8364, ent_uni_euro },
399 	{ cs_utf_8, 		8465, 8501, ent_uni_8465_8501 },
400 	{ cs_utf_8, 		8592, 9002, ent_uni_8592_9002 },
401 	{ cs_utf_8, 		9674, 9674, ent_uni_9674 },
402 	{ cs_utf_8, 		9824, 9830, ent_uni_9824_9830 },
403 	{ cs_big5, 			0xa0, 0xff, ent_iso_8859_1 },
404 	{ cs_gb2312, 		0xa0, 0xff, ent_iso_8859_1 },
405 	{ cs_big5hkscs, 	0xa0, 0xff, ent_iso_8859_1 },
406  	{ cs_sjis,			0xa0, 0xff, ent_iso_8859_1 },
407  	{ cs_eucjp,			0xa0, 0xff, ent_iso_8859_1 },
408 	{ cs_koi8r,		    0xa3, 0xff, ent_koi8r },
409 	{ cs_cp1251,		0x80, 0xff, ent_cp_1251 },
410 	{ cs_8859_5,		0xc0, 0xff, ent_iso_8859_5 },
411 	{ cs_cp866,		    0xc0, 0xff, ent_cp_866 },
412 	{ cs_macroman,		0x0b, 0xff, ent_macroman },
413 	{ cs_terminator }
414 };
415 
/*
 * Maps charset names (including common aliases and bare Windows code page
 * numbers) to entity_charset identifiers. determine_charset() walks this
 * list in order and compares names case-insensitively with an exact length
 * match. The list ends with an entry whose codeset is NULL.
 */
416 static const struct {
417 	const char *codeset;
418 	enum entity_charset charset;
419 } charset_map[] = {
420 	{ "ISO-8859-1", 	cs_8859_1 },
421 	{ "ISO8859-1",	 	cs_8859_1 },
422 	{ "ISO-8859-15", 	cs_8859_15 },
423 	{ "ISO8859-15", 	cs_8859_15 },
424 	{ "utf-8", 			cs_utf_8 },
425 	{ "cp1252", 		cs_cp1252 },
426 	{ "Windows-1252", 	cs_cp1252 },
427 	{ "1252",           cs_cp1252 },
428 	{ "BIG5",			cs_big5 },
429 	{ "950",            cs_big5 },
430 	{ "GB2312",			cs_gb2312 },
431 	{ "936",            cs_gb2312 },
432 	{ "BIG5-HKSCS",		cs_big5hkscs },
433 	{ "Shift_JIS",		cs_sjis },
434 	{ "SJIS",   		cs_sjis },
435 	{ "932",            cs_sjis },
436 	{ "EUCJP",   		cs_eucjp },
437 	{ "EUC-JP",   		cs_eucjp },
438 	{ "KOI8-R",         cs_koi8r },
439 	{ "koi8-ru",        cs_koi8r },
440 	{ "koi8r",          cs_koi8r },
441 	{ "cp1251",         cs_cp1251 },
442 	{ "Windows-1251",   cs_cp1251 },
443 	{ "win-1251",       cs_cp1251 },
444 	{ "iso8859-5",      cs_8859_5 },
445 	{ "iso-8859-5",     cs_8859_5 },
446 	{ "cp866",          cs_cp866 },
447 	{ "866",            cs_cp866 },
448 	{ "ibm866",         cs_cp866 },
449 	{ "MacRoman",       cs_macroman },
450 	{ NULL }
451 };
452 
/*
 * The markup-significant characters and their complete entity strings
 * (these, unlike the tables above, include the '&' and ';').
 *   charcode   the literal character
 *   entity     its entity text
 *   entitylen  strlen(entity), precomputed
 *   flags      an ENT_HTML_QUOTE_* value for the two quote characters,
 *              0 for '<' and '>' (presumably "applies to every quote
 *              style" -- confirm against the code that reads this table)
 * The single quote appears twice, once as "&#039;" and once as "&#39;".
 * The list ends with an all-zero row (entity == NULL).
 */
453 static const struct {
454 	unsigned short charcode;
455 	char *entity;
456 	int entitylen;
457 	int flags;
458 } basic_entities[] = {
459 	{ '"',	"&quot;",	6,	ENT_HTML_QUOTE_DOUBLE },
460 	{ '\'',	"&#039;",	6,	ENT_HTML_QUOTE_SINGLE },
461 	{ '\'',	"&#39;",	5,	ENT_HTML_QUOTE_SINGLE },
462 	{ '<',	"&lt;",		4,	0 },
463 	{ '>',	"&gt;",		4,	0 },
464 	{ 0, NULL, 0, 0 }
465 };
466 
/*
 * A character code paired with an entity string held in a fixed 8-byte
 * buffer plus its length. Nothing in the visible part of the file fills or
 * reads this struct; judging by the name it is the decoding-side
 * counterpart of basic_entities -- confirm at the point of use.
 */
467 struct basic_entities_dec {
468 	unsigned short charcode;
469 	char entity[8];
470 	int entitylen;
471 };
472 
/*
 * Helper macros for get_next_char(). They are not self-contained: they read
 * and write that function's locals and parameters (pos, mbpos, mbspace,
 * mbseq, mbseqlen, newpos, status, str_len, this_char) and may return from
 * the enclosing function. Each one expands to a single statement and must
 * be followed by a semicolon.
 */

/* Finish successfully: publish the new position, NUL-terminate the captured
 * byte sequence, report its length and return the decoded character. */
#define MB_RETURN do { \
		*newpos = pos;       \
		mbseq[mbpos] = '\0'; \
		*mbseqlen = mbpos;   \
		return this_char;    \
	} while (0)

/* Append one raw byte to mbseq. One slot is always kept free for the
 * terminating NUL, so when the last free slot would be consumed the byte is
 * dropped and the function returns early through MB_RETURN. */
#define MB_WRITE(mbchar) do { \
		mbspace--;                 \
		if (mbspace == 0) {        \
			MB_RETURN;             \
		}                          \
		mbseq[mbpos++] = (mbchar); \
	} while (0)

/* skip one byte and return */
#define MB_FAILURE(pos) do { \
	*newpos = (pos) + 1; \
	*status = FAILURE; \
	return 0; \
} while (0)

/* Fail unless at least chars_need bytes remain at offset pos. On failure
 * the reported position is pos + 1, or pos itself when chars_need < 1. */
#define CHECK_LEN(pos, chars_need) do { \
	if ((str_len - (pos)) < (chars_need)) { \
		*newpos = ((chars_need) < 1) ? (pos) : (pos) + 1; \
		*status = FAILURE; \
		return 0; \
	} \
} while (0)
507 
508 /* {{{ get_next_char
509  */
get_next_char(enum entity_charset charset,unsigned char * str,int str_len,int * newpos,unsigned char * mbseq,int * mbseqlen,int * status)510 inline static unsigned int get_next_char(enum entity_charset charset,
511 		unsigned char * str,
512 		int str_len,
513 		int * newpos,
514 		unsigned char * mbseq,
515 		int * mbseqlen,
516 		int *status)
517 {
518 	int pos = *newpos;
519 	int mbpos = 0;
520 	int mbspace = *mbseqlen;
521 	unsigned int this_char = 0;
522 	unsigned char next_char;
523 
524 	*status = SUCCESS;
525 
526 	if (mbspace <= 0) {
527 		*mbseqlen = 0;
528 		CHECK_LEN(pos, 1);
529 		*newpos = pos + 1;
530 		return str[pos];
531 	}
532 
533 	switch (charset) {
534 		case cs_utf_8:
535 			{
536 				unsigned char c;
537 				CHECK_LEN(pos, 1);
538 				c = str[pos];
539 				if (c < 0x80) {
540 					MB_WRITE(c);
541 					this_char = c;
542 					pos++;
543 				} else if (c < 0xc2) {
544 					MB_FAILURE(pos);
545 				} else if (c < 0xe0) {
546 					CHECK_LEN(pos, 2);
547 					if (str[pos + 1] < 0x80 || str[pos + 1] > 0xbf) {
548 						MB_FAILURE(pos);
549 					}
550 					this_char = ((c & 0x1f) << 6) | (str[pos + 1] & 0x3f);
551 					if (this_char < 0x80) {
552 						MB_FAILURE(pos);
553 					}
554 					MB_WRITE((unsigned char)c);
555 					MB_WRITE((unsigned char)str[pos + 1]);
556 					pos += 2;
557 				} else if (c < 0xf0) {
558 					CHECK_LEN(pos, 3);
559 					if (str[pos + 1] < 0x80 || str[pos + 1] > 0xbf) {
560 						MB_FAILURE(pos);
561 					}
562 					if (str[pos + 2] < 0x80 || str[pos + 2] > 0xbf) {
563 						MB_FAILURE(pos);
564 					}
565 					this_char = ((c & 0x0f) << 12) | ((str[pos + 1] & 0x3f) << 6) | (str[pos + 2] & 0x3f);
566 					if (this_char < 0x800) {
567 						MB_FAILURE(pos);
568 					} else if (this_char >= 0xd800 && this_char <= 0xdfff) {
569 						MB_FAILURE(pos);
570 					}
571 					MB_WRITE((unsigned char)c);
572 					MB_WRITE((unsigned char)str[pos + 1]);
573 					MB_WRITE((unsigned char)str[pos + 2]);
574 					pos += 3;
575 				} else if (c < 0xf5) {
576 					CHECK_LEN(pos, 4);
577 					if (str[pos + 1] < 0x80 || str[pos + 1] > 0xbf) {
578 						MB_FAILURE(pos);
579 					}
580 					if (str[pos + 2] < 0x80 || str[pos + 2] > 0xbf) {
581 						MB_FAILURE(pos);
582 					}
583 					if (str[pos + 3] < 0x80 || str[pos + 3] > 0xbf) {
584 						MB_FAILURE(pos);
585 					}
586 					this_char = ((c & 0x07) << 18) | ((str[pos + 1] & 0x3f) << 12) | ((str[pos + 2] & 0x3f) << 6) | (str[pos + 3] & 0x3f);
587 					if (this_char < 0x10000 || this_char > 0x10FFFF) {
588 						MB_FAILURE(pos);
589 					}
590 					MB_WRITE((unsigned char)c);
591 					MB_WRITE((unsigned char)str[pos + 1]);
592 					MB_WRITE((unsigned char)str[pos + 2]);
593 					MB_WRITE((unsigned char)str[pos + 3]);
594 					pos += 4;
595 				} else {
596 					MB_FAILURE(pos);
597 				}
598 			}
599 			break;
600 		case cs_big5:
601 		case cs_gb2312:
602 		case cs_big5hkscs:
603 			{
604 				CHECK_LEN(pos, 1);
605 				this_char = str[pos++];
606 				/* check if this is the first of a 2-byte sequence */
607 				if (this_char >= 0x81 && this_char <= 0xfe) {
608 					/* peek at the next char */
609 					CHECK_LEN(pos, 1);
610 					next_char = str[pos++];
611 					if ((next_char >= 0x40 && next_char <= 0x7e) ||
612 							(next_char >= 0xa1 && next_char <= 0xfe)) {
613 						/* yes, this a wide char */
614 						MB_WRITE(this_char);
615 						MB_WRITE(next_char);
616 						this_char = (this_char << 8) | next_char;
617 					} else {
618 						MB_FAILURE(pos);
619 					}
620 				} else {
621 					MB_WRITE(this_char);
622 				}
623 			}
624 			break;
625 		case cs_sjis:
626 			{
627 				CHECK_LEN(pos, 1);
628 				this_char = str[pos++];
629 				/* check if this is the first of a 2-byte sequence */
630 				if ((this_char >= 0x81 && this_char <= 0x9f) ||
631 					(this_char >= 0xe0 && this_char <= 0xfc)) {
632 					/* peek at the next char */
633 					CHECK_LEN(pos, 1);
634 					next_char = str[pos++];
635 					if ((next_char >= 0x40 && next_char <= 0x7e) ||
636 						(next_char >= 0x80 && next_char <= 0xfc))
637 					{
638 						/* yes, this a wide char */
639 						MB_WRITE(this_char);
640 						MB_WRITE(next_char);
641 						this_char = (this_char << 8) | next_char;
642 					} else {
643 						MB_FAILURE(pos);
644 					}
645 				} else {
646 					MB_WRITE(this_char);
647 				}
648 				break;
649 			}
650 		case cs_eucjp:
651 			{
652 				CHECK_LEN(pos, 1);
653 				this_char = str[pos++];
654 				/* check if this is the first of a multi-byte sequence */
655 				if (this_char >= 0xa1 && this_char <= 0xfe) {
656 					/* peek at the next char */
657 					CHECK_LEN(pos, 1);
658 					next_char = str[pos++];
659 					if (next_char >= 0xa1 && next_char <= 0xfe) {
660 						/* yes, this a jis kanji char */
661 						MB_WRITE(this_char);
662 						MB_WRITE(next_char);
663 						this_char = (this_char << 8) | next_char;
664 					} else {
665 						MB_FAILURE(pos);
666 					}
667 				} else if (this_char == 0x8e) {
668 					/* peek at the next char */
669 					CHECK_LEN(pos, 1);
670 					next_char = str[pos++];
671 					if (next_char >= 0xa1 && next_char <= 0xdf) {
672 						/* JIS X 0201 kana */
673 						MB_WRITE(this_char);
674 						MB_WRITE(next_char);
675 						this_char = (this_char << 8) | next_char;
676 					} else {
677 						MB_FAILURE(pos);
678 					}
679 				} else if (this_char == 0x8f) {
680 					/* peek at the next two char */
681 					unsigned char next2_char;
682 					CHECK_LEN(pos, 2);
683 					next_char = str[pos];
684 					next2_char = str[pos + 1];
685 					pos += 2;
686 					if ((next_char >= 0xa1 && next_char <= 0xfe) &&
687 						(next2_char >= 0xa1 && next2_char <= 0xfe)) {
688 						/* JIS X 0212 hojo-kanji */
689 						MB_WRITE(this_char);
690 						MB_WRITE(next_char);
691 						MB_WRITE(next2_char);
692 						this_char = (this_char << 16) | (next_char << 8) | next2_char;
693 					} else {
694 						MB_FAILURE(pos);
695 					}
696 				} else {
697 					MB_WRITE(this_char);
698 				}
699 				break;
700 			}
701 		default:
702 			/* single-byte charsets */
703 			CHECK_LEN(pos, 1);
704 			this_char = str[pos++];
705 			MB_WRITE(this_char);
706 			break;
707 	}
708 	MB_RETURN;
709 }
710 /* }}} */
711 
712 /* {{{ entity_charset determine_charset
713  * returns the charset identifier based on current locale or a hint.
714  * defaults to iso-8859-1 */
determine_charset(char * charset_hint TSRMLS_DC)715 static enum entity_charset determine_charset(char *charset_hint TSRMLS_DC)
716 {
717 	int i;
718 	enum entity_charset charset = cs_8859_1;
719 	int len = 0;
720 	zval *uf_result = NULL;
721 
722 	/* Guarantee default behaviour for backwards compatibility */
723 	if (charset_hint == NULL)
724 		return cs_8859_1;
725 
726 	if ((len = strlen(charset_hint)) != 0) {
727 		goto det_charset;
728 	}
729 #if HAVE_MBSTRING
730 #if !defined(COMPILE_DL_MBSTRING)
731 	/* XXX: Ugly things. Why don't we look for a more sophisticated way? */
732 	switch (MBSTRG(current_internal_encoding)) {
733 		case mbfl_no_encoding_8859_1:
734 			return cs_8859_1;
735 
736 		case mbfl_no_encoding_utf8:
737 			return cs_utf_8;
738 
739 		case mbfl_no_encoding_euc_jp:
740 		case mbfl_no_encoding_eucjp_win:
741 			return cs_eucjp;
742 
743 		case mbfl_no_encoding_sjis:
744 		case mbfl_no_encoding_sjis_open:
745 		case mbfl_no_encoding_cp932:
746 			return cs_sjis;
747 
748 		case mbfl_no_encoding_cp1252:
749 			return cs_cp1252;
750 
751 		case mbfl_no_encoding_8859_15:
752 			return cs_8859_15;
753 
754 		case mbfl_no_encoding_big5:
755 			return cs_big5;
756 
757 		case mbfl_no_encoding_euc_cn:
758 		case mbfl_no_encoding_hz:
759 		case mbfl_no_encoding_cp936:
760 			return cs_gb2312;
761 
762 		case mbfl_no_encoding_koi8r:
763 			return cs_koi8r;
764 
765 		case mbfl_no_encoding_cp866:
766 			return cs_cp866;
767 
768 		case mbfl_no_encoding_cp1251:
769 			return cs_cp1251;
770 
771 		case mbfl_no_encoding_8859_5:
772 			return cs_8859_5;
773 
774 		default:
775 			;
776 	}
777 #else
778 	{
779 		zval nm_mb_internal_encoding;
780 
781 		ZVAL_STRING(&nm_mb_internal_encoding, "mb_internal_encoding", 0);
782 
783 		if (call_user_function_ex(CG(function_table), NULL, &nm_mb_internal_encoding, &uf_result, 0, NULL, 1, NULL TSRMLS_CC) != FAILURE) {
784 
785 			charset_hint = Z_STRVAL_P(uf_result);
786 			len = Z_STRLEN_P(uf_result);
787 
788 			if (charset_hint != NULL && len != 0) {
789 				if (len == 4) { /* sizeof(none|auto|pass)-1 */
790 					if (!memcmp("pass", charset_hint, sizeof("pass") - 1) ||
791 						!memcmp("auto", charset_hint, sizeof("auto") - 1) ||
792 						!memcmp("none", charset_hint, sizeof("none") - 1)) {
793 
794 						charset_hint = NULL;
795 						len = 0;
796 					}
797 				} else {
798 					/* Jump to det_charset only if mbstring isn't one of above eq pass, auto, none.
799 					   Otherwise try default_charset next */
800 					goto det_charset;
801 				}
802 			}
803 		}
804 	}
805 #endif
806 #endif
807 
808 	charset_hint = SG(default_charset);
809 	if (charset_hint != NULL && (len=strlen(charset_hint)) != 0) {
810 		goto det_charset;
811 	}
812 
813 	/* try to detect the charset for the locale */
814 #if HAVE_NL_LANGINFO && HAVE_LOCALE_H && defined(CODESET)
815 	charset_hint = nl_langinfo(CODESET);
816 	if (charset_hint != NULL && (len=strlen(charset_hint)) != 0) {
817 		goto det_charset;
818 	}
819 #endif
820 
821 #if HAVE_LOCALE_H
822 	/* try to figure out the charset from the locale */
823 	{
824 		char *localename;
825 		char *dot, *at;
826 
827 		/* lang[_territory][.codeset][@modifier] */
828 		localename = setlocale(LC_CTYPE, NULL);
829 
830 		dot = strchr(localename, '.');
831 		if (dot) {
832 			dot++;
833 			/* locale specifies a codeset */
834 			at = strchr(dot, '@');
835 			if (at)
836 				len = at - dot;
837 			else
838 				len = strlen(dot);
839 			charset_hint = dot;
840 		} else {
841 			/* no explicit name; see if the name itself
842 			 * is the charset */
843 			charset_hint = localename;
844 			len = strlen(charset_hint);
845 		}
846 	}
847 #endif
848 
849 det_charset:
850 
851 	if (charset_hint) {
852 		int found = 0;
853 
854 		/* now walk the charset map and look for the codeset */
855 		for (i = 0; charset_map[i].codeset; i++) {
856 			if (len == strlen(charset_map[i].codeset) && strncasecmp(charset_hint, charset_map[i].codeset, len) == 0) {
857 				charset = charset_map[i].charset;
858 				found = 1;
859 				break;
860 			}
861 		}
862 		if (!found) {
863 			php_error_docref(NULL TSRMLS_CC, E_WARNING, "charset `%s' not supported, assuming iso-8859-1",
864 					charset_hint);
865 		}
866 	}
867 	if (uf_result != NULL) {
868 		zval_ptr_dtor(&uf_result);
869 	}
870 	return charset;
871 }
872 /* }}} */
873 
/* {{{ php_utf32_utf8
 * Encode the code point k as UTF-8 into buf and NUL-terminate the result.
 *
 * The original (pre-RFC 3629) scheme with sequences of up to six bytes is
 * used, so every value up to 0x7FFFFFFF can be encoded. buf must have room
 * for the sequence plus the terminator: 7 bytes in general, 5 bytes when
 * k is known to be below 0x200000.
 *
 * Returns the number of bytes written, not counting the terminator.
 * Values above 0x7FFFFFFF have no UTF-8 representation; for those only the
 * terminator is written and 0 is returned.
 */
size_t php_utf32_utf8(unsigned char *buf, unsigned k)
{
	size_t retval = 0;

	if (k < 0x80) {
		buf[0] = k;
		retval = 1;
	} else if (k < 0x800) {
		buf[0] = 0xc0 | (k >> 6);
		buf[1] = 0x80 | (k & 0x3f);
		retval = 2;
	} else if (k < 0x10000) {
		buf[0] = 0xe0 | (k >> 12);
		buf[1] = 0x80 | ((k >> 6) & 0x3f);
		buf[2] = 0x80 | (k & 0x3f);
		retval = 3;
	} else if (k < 0x200000) {
		buf[0] = 0xf0 | (k >> 18);
		buf[1] = 0x80 | ((k >> 12) & 0x3f);
		buf[2] = 0x80 | ((k >> 6) & 0x3f);
		buf[3] = 0x80 | (k & 0x3f);
		retval = 4;
	} else if (k < 0x4000000) {
		buf[0] = 0xf8 | (k >> 24);
		buf[1] = 0x80 | ((k >> 18) & 0x3f);
		buf[2] = 0x80 | ((k >> 12) & 0x3f);
		buf[3] = 0x80 | ((k >> 6) & 0x3f);
		buf[4] = 0x80 | (k & 0x3f);
		retval = 5;
	} else if (k <= 0x7fffffff) {
		buf[0] = 0xfc | (k >> 30);
		buf[1] = 0x80 | ((k >> 24) & 0x3f);
		buf[2] = 0x80 | ((k >> 18) & 0x3f);
		buf[3] = 0x80 | ((k >> 12) & 0x3f);
		buf[4] = 0x80 | ((k >> 6) & 0x3f);
		buf[5] = 0x80 | (k & 0x3f);
		retval = 6;
	}
	/* else: bit 31 is set; a lead byte of 0xfe/0xff would be needed, which
	 * is never valid UTF-8, so emit an empty string instead */
	buf[retval] = '\0';

	return retval;
}
/* }}} */
918 
919 /* {{{ php_unescape_html_entities
920  */
php_unescape_html_entities(unsigned char * old,int oldlen,int * newlen,int all,int quote_style,char * hint_charset TSRMLS_DC)921 PHPAPI char *php_unescape_html_entities(unsigned char *old, int oldlen, int *newlen, int all, int quote_style, char *hint_charset TSRMLS_DC)
922 {
923 	int retlen;
924 	int j, k;
925 	char *replaced, *ret, *p, *q, *lim, *next;
926 	enum entity_charset charset = determine_charset(hint_charset TSRMLS_CC);
927 	unsigned char replacement[15];
928 	int replacement_len;
929 
930 	ret = estrndup(old, oldlen);
931 	retlen = oldlen;
932 	if (!retlen) {
933 		goto empty_source;
934 	}
935 
936 	if (all) {
937 		/* look for a match in the maps for this charset */
938 		for (j = 0; entity_map[j].charset != cs_terminator; j++) {
939 			if (entity_map[j].charset != charset)
940 				continue;
941 
942 			for (k = entity_map[j].basechar; k <= entity_map[j].endchar; k++) {
943 				unsigned char entity[32];
944 				int entity_length = 0;
945 
946 				if (entity_map[j].table[k - entity_map[j].basechar] == NULL)
947 					continue;
948 
949 				entity_length = slprintf(entity, sizeof(entity), "&%s;", entity_map[j].table[k - entity_map[j].basechar]);
950 				if (entity_length >= sizeof(entity)) {
951 					continue;
952 				}
953 
954 				/* When we have MBCS entities in the tables above, this will need to handle it */
955 				replacement_len = 0;
956 				switch (charset) {
957 					case cs_8859_1:
958 					case cs_cp1252:
959 					case cs_8859_15:
960 					case cs_cp1251:
961 					case cs_8859_5:
962 					case cs_cp866:
963 					case cs_koi8r:
964 						replacement[0] = k;
965 						replacement[1] = '\0';
966 						replacement_len = 1;
967 						break;
968 
969 					case cs_big5:
970 					case cs_gb2312:
971 					case cs_big5hkscs:
972 					case cs_sjis:
973 					case cs_eucjp:
974 						/* we cannot properly handle those multibyte encodings
975 						 * with php_str_to_str. skip it. */
976 						continue;
977 
978 					case cs_utf_8:
979 						replacement_len = php_utf32_utf8(replacement, k);
980 						break;
981 
982 					default:
983 						php_error_docref(NULL TSRMLS_CC, E_WARNING, "cannot yet handle MBCS!");
984 						efree(ret);
985 						return NULL;
986 				}
987 
988 				if (php_memnstr(ret, entity, entity_length, ret+retlen)) {
989 					replaced = php_str_to_str(ret, retlen, entity, entity_length, replacement, replacement_len, &retlen);
990 					efree(ret);
991 					ret = replaced;
992 				}
993 			}
994 		}
995 	}
996 
997 	for (j = 0; basic_entities[j].charcode != 0; j++) {
998 
999 		if (basic_entities[j].flags && (quote_style & basic_entities[j].flags) == 0)
1000 			continue;
1001 
1002 		replacement[0] = (unsigned char)basic_entities[j].charcode;
1003 		replacement[1] = '\0';
1004 
1005 		if (php_memnstr(ret, basic_entities[j].entity, basic_entities[j].entitylen, ret+retlen)) {
1006 			replaced = php_str_to_str(ret, retlen, basic_entities[j].entity, basic_entities[j].entitylen, replacement, 1, &retlen);
1007 			efree(ret);
1008 			ret = replaced;
1009 		}
1010 	}
1011 
1012 	/* replace numeric entities & "&amp;" */
1013 	lim = ret + retlen;
1014 	for (p = ret, q = ret; p < lim;) {
1015 		int code;
1016 
1017 		if (p[0] == '&') {
1018 			if (p + 2 < lim) {
1019 				if (p[1] == '#') {
1020 					int invalid_code = 0;
1021 
1022 					if (p[2] == 'x' || p[2] == 'X') {
1023 						code = strtol(p + 3, &next, 16);
1024 					} else {
1025 						code = strtol(p + 2, &next, 10);
1026 					}
1027 
1028 					if ((code == '\'' && !(quote_style & ENT_HTML_QUOTE_SINGLE)) ||
1029 						(code == '"' && !(quote_style & ENT_HTML_QUOTE_DOUBLE))) {
1030 						invalid_code = 1;
1031 					}
1032 
1033 					if (next != NULL && *next == ';' && !invalid_code) {
1034 						switch (charset) {
1035 							case cs_utf_8:
1036 								q += php_utf32_utf8(q, code);
1037 								break;
1038 
1039 							case cs_8859_1:
1040 							case cs_8859_5:
1041 							case cs_8859_15:
1042 								if ((code >= 0x80 && code < 0xa0) || code > 0xff) {
1043 									invalid_code = 1;
1044 								} else {
1045 									*(q++) = code;
1046 								}
1047 								break;
1048 
1049 							case cs_cp1252:
1050 								if (code > 0xff) {
1051 									invalid_code = 1;
1052 								} else {
1053 									*(q++) = code;
1054 								}
1055 								break;
1056 
1057 							case cs_cp1251:
1058 							case cs_cp866:
1059 							case cs_big5:
1060 							case cs_big5hkscs:
1061 							case cs_sjis:
1062 							case cs_eucjp:
1063 								if (code >= 0x80) {
1064 									invalid_code = 1;
1065 								} else {
1066 									*(q++) = code;
1067 								}
1068 								break;
1069 
1070 							case cs_gb2312:
1071 								if (code >= 0x81) {
1072 									invalid_code = 1;
1073 								} else {
1074 									*(q++) = code;
1075 								}
1076 								break;
1077 
1078 							default:
1079 								/* for backwards compatilibity */
1080 								invalid_code = 1;
1081 								break;
1082 						}
1083 						if (invalid_code) {
1084 							for (; p <= next; p++) {
1085 								*(q++) = *p;
1086 							}
1087 						}
1088 						p = next + 1;
1089 					} else {
1090 						*(q++) = *(p++);
1091 						*(q++) = *(p++);
1092 					}
1093 				} else if (p + 4 < lim &&
1094 							p[1] == 'a' && p[2] == 'm' &&p[3] == 'p' &&
1095 							p[4] == ';') {
1096 					*(q++) = '&';
1097 					p += 5;
1098 				} else {
1099 					*(q++) = *(p++);
1100 					*(q++) = *(p++);
1101 				}
1102 			} else {
1103 				*(q++) = *(p++);
1104 			}
1105 		} else {
1106 			*(q++) = *(p++);
1107 		}
1108 	}
1109 	*q = '\0';
1110 	retlen = (size_t)(q - ret);
1111 empty_source:
1112 	*newlen = retlen;
1113 	return ret;
1114 }
1115 /* }}} */
1116 
php_escape_html_entities(unsigned char * old,int oldlen,int * newlen,int all,int quote_style,char * hint_charset TSRMLS_DC)1117 PHPAPI char *php_escape_html_entities(unsigned char *old, int oldlen, int *newlen, int all, int quote_style, char *hint_charset TSRMLS_DC)
1118 {
1119 	return php_escape_html_entities_ex(old, oldlen, newlen, all, quote_style, hint_charset, 1 TSRMLS_CC);
1120 }
1121 
1122 
1123 /* {{{ php_escape_html_entities
1124  */
php_escape_html_entities_ex(unsigned char * old,int oldlen,int * newlen,int all,int quote_style,char * hint_charset,zend_bool double_encode TSRMLS_DC)1125 PHPAPI char *php_escape_html_entities_ex(unsigned char *old, int oldlen, int *newlen, int all, int quote_style, char *hint_charset, zend_bool double_encode TSRMLS_DC)
1126 {
1127 	int i, j, maxlen, len;
1128 	char *replaced;
1129 	enum entity_charset charset = determine_charset(hint_charset TSRMLS_CC);
1130 	int matches_map;
1131 
1132 	maxlen = 2 * oldlen;
1133 	if (maxlen < 128)
1134 		maxlen = 128;
1135 	replaced = emalloc (maxlen);
1136 	len = 0;
1137 	i = 0;
1138 	while (i < oldlen) {
1139 		unsigned char mbsequence[16];	/* allow up to 15 characters in a multibyte sequence */
1140 		int mbseqlen = sizeof(mbsequence);
1141 		int status = SUCCESS;
1142 		unsigned int this_char = get_next_char(charset, old, oldlen, &i, mbsequence, &mbseqlen, &status);
1143 
1144 		if(status == FAILURE) {
1145 			/* invalid MB sequence */
1146 			if (quote_style & ENT_HTML_IGNORE_ERRORS) {
1147 				continue;
1148 			}
1149 			efree(replaced);
1150 			if(!PG(display_errors)) {
1151 				php_error_docref(NULL TSRMLS_CC, E_WARNING, "Invalid multibyte sequence in argument");
1152 			}
1153 			*newlen = 0;
1154 			return STR_EMPTY_ALLOC();
1155 		}
1156 		matches_map = 0;
1157 
1158 		if (len + 16 > maxlen)
1159 			replaced = erealloc (replaced, maxlen += 128);
1160 
1161 		if (all) {
1162 			/* look for a match in the maps for this charset */
1163 			unsigned char *rep = NULL;
1164 
1165 
1166 			for (j = 0; entity_map[j].charset != cs_terminator; j++) {
1167 				if (entity_map[j].charset == charset
1168 						&& this_char >= entity_map[j].basechar
1169 						&& this_char <= entity_map[j].endchar) {
1170 					rep = (unsigned char*)entity_map[j].table[this_char - entity_map[j].basechar];
1171 					if (rep == NULL) {
1172 						/* there is no entity for this position; fall through and
1173 						 * just output the character itself */
1174 						break;
1175 					}
1176 
1177 					matches_map = 1;
1178 					break;
1179 				}
1180 			}
1181 
1182 			if (matches_map) {
1183 				int l = strlen(rep);
1184 				/* increase the buffer size */
1185 				if (len + 2 + l >= maxlen) {
1186 					replaced = erealloc(replaced, maxlen += 128);
1187 				}
1188 
1189 				replaced[len++] = '&';
1190 				strlcpy(replaced + len, rep, maxlen);
1191 				len += l;
1192 				replaced[len++] = ';';
1193 			}
1194 		}
1195 		if (!matches_map) {
1196 			int is_basic = 0;
1197 
1198 			if (this_char == '&') {
1199 				if (double_encode) {
1200 encode_amp:
1201 					memcpy(replaced + len, "&amp;", sizeof("&amp;") - 1);
1202 					len += sizeof("&amp;") - 1;
1203 				} else {
1204 					char *e = memchr(old + i, ';', oldlen - i);
1205 					char *s = old + i;
1206 
1207 					if (!e || (e - s) > 10) { /* minor optimization to avoid "entities" over 10 chars in length */
1208 						goto encode_amp;
1209 					} else {
1210 						if (*s == '#') { /* numeric entities */
1211 							s++;
1212 							/* Hex (&#x5A;) */
1213 							if (*s == 'x' || *s == 'X') {
1214 								s++;
1215 								while (s < e) {
1216 									if (!isxdigit((int)*(unsigned char *)s++)) {
1217 										goto encode_amp;
1218 									}
1219 								}
1220 							/* Dec (&#90;)*/
1221 							} else {
1222 								while (s < e) {
1223 									if (!isdigit((int)*(unsigned char *)s++)) {
1224 										goto encode_amp;
1225 									}
1226 								}
1227 							}
1228 						} else { /* text entities */
1229 							while (s < e) {
1230 								if (!isalnum((int)*(unsigned char *)s++)) {
1231 									goto encode_amp;
1232 								}
1233 							}
1234 						}
1235 						replaced[len++] = '&';
1236 					}
1237 				}
1238 				is_basic = 1;
1239 			} else {
1240 				for (j = 0; basic_entities[j].charcode != 0; j++) {
1241 					if ((basic_entities[j].charcode != this_char) ||
1242 							(basic_entities[j].flags &&
1243 							(quote_style & basic_entities[j].flags) == 0)) {
1244 						continue;
1245 					}
1246 
1247 					memcpy(replaced + len, basic_entities[j].entity, basic_entities[j].entitylen);
1248 					len += basic_entities[j].entitylen;
1249 
1250 					is_basic = 1;
1251 					break;
1252 				}
1253 			}
1254 
1255 			if (!is_basic) {
1256 				/* a wide char without a named entity; pass through the original sequence */
1257 				if (mbseqlen > 1) {
1258 					memcpy(replaced + len, mbsequence, mbseqlen);
1259 					len += mbseqlen;
1260 				} else {
1261 					replaced[len++] = (unsigned char)this_char;
1262 				}
1263 			}
1264 		}
1265 	}
1266 	replaced[len] = '\0';
1267 	*newlen = len;
1268 
1269 	return replaced;
1270 
1271 
1272 }
1273 /* }}} */
1274 
1275 /* {{{ php_html_entities
1276  */
php_html_entities(INTERNAL_FUNCTION_PARAMETERS,int all)1277 static void php_html_entities(INTERNAL_FUNCTION_PARAMETERS, int all)
1278 {
1279 	char *str, *hint_charset = NULL;
1280 	int str_len, hint_charset_len = 0;
1281 	int len;
1282 	long quote_style = ENT_COMPAT;
1283 	char *replaced;
1284 	zend_bool double_encode = 1;
1285 
1286 	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|ls!b", &str, &str_len, &quote_style, &hint_charset, &hint_charset_len, &double_encode) == FAILURE) {
1287 		return;
1288 	}
1289 
1290 	replaced = php_escape_html_entities_ex(str, str_len, &len, all, quote_style, hint_charset, double_encode TSRMLS_CC);
1291 	RETVAL_STRINGL(replaced, len, 0);
1292 }
1293 /* }}} */
1294 
1295 #define HTML_SPECIALCHARS 	0
1296 #define HTML_ENTITIES	 	1
1297 
1298 /* {{{ register_html_constants
1299  */
register_html_constants(INIT_FUNC_ARGS)1300 void register_html_constants(INIT_FUNC_ARGS)
1301 {
1302 	REGISTER_LONG_CONSTANT("HTML_SPECIALCHARS", HTML_SPECIALCHARS, CONST_PERSISTENT|CONST_CS);
1303 	REGISTER_LONG_CONSTANT("HTML_ENTITIES", HTML_ENTITIES, CONST_PERSISTENT|CONST_CS);
1304 	REGISTER_LONG_CONSTANT("ENT_COMPAT", ENT_COMPAT, CONST_PERSISTENT|CONST_CS);
1305 	REGISTER_LONG_CONSTANT("ENT_QUOTES", ENT_QUOTES, CONST_PERSISTENT|CONST_CS);
1306 	REGISTER_LONG_CONSTANT("ENT_NOQUOTES", ENT_NOQUOTES, CONST_PERSISTENT|CONST_CS);
1307 	REGISTER_LONG_CONSTANT("ENT_IGNORE", ENT_IGNORE, CONST_PERSISTENT|CONST_CS);
1308 }
1309 /* }}} */
1310 
1311 /* {{{ proto string htmlspecialchars(string string [, int quote_style[, string charset[, bool double_encode]]])
1312    Convert special characters to HTML entities */
PHP_FUNCTION(htmlspecialchars)1313 PHP_FUNCTION(htmlspecialchars)
1314 {
1315 	php_html_entities(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0);
1316 }
1317 /* }}} */
1318 
1319 /* {{{ proto string htmlspecialchars_decode(string string [, int quote_style])
1320    Convert special HTML entities back to characters */
PHP_FUNCTION(htmlspecialchars_decode)1321 PHP_FUNCTION(htmlspecialchars_decode)
1322 {
1323 	char *str, *new_str, *e, *p;
1324 	int len, j, i, new_len;
1325 	long quote_style = ENT_COMPAT;
1326 	struct basic_entities_dec basic_entities_dec[8];
1327 
1328 	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|l", &str, &len, &quote_style) == FAILURE) {
1329 		return;
1330 	}
1331 
1332 	new_str = estrndup(str, len);
1333 	new_len = len;
1334 	e = new_str + new_len;
1335 
1336 	if (!(p = memchr(new_str, '&', new_len))) {
1337 		RETURN_STRINGL(new_str, new_len, 0);
1338 	}
1339 
1340 	for (j = 0, i = 0; basic_entities[i].charcode != 0; i++) {
1341 		if (basic_entities[i].flags && !(quote_style & basic_entities[i].flags)) {
1342 			continue;
1343 		}
1344 		basic_entities_dec[j].charcode = basic_entities[i].charcode;
1345 		memcpy(basic_entities_dec[j].entity, basic_entities[i].entity, basic_entities[i].entitylen + 1);
1346 		basic_entities_dec[j].entitylen = basic_entities[i].entitylen;
1347 		j++;
1348 	}
1349 	basic_entities_dec[j].charcode = '&';
1350 	basic_entities_dec[j].entitylen = sizeof("&amp;") - 1;
1351 	memcpy(basic_entities_dec[j].entity, "&amp;", sizeof("&amp;"));
1352 	i = j + 1;
1353 
1354 	do {
1355 		int l = e - p;
1356 
1357 		for (j = 0; j < i; j++) {
1358 			if (basic_entities_dec[j].entitylen > l) {
1359 				continue;
1360 			}
1361 			if (!memcmp(p, basic_entities_dec[j].entity, basic_entities_dec[j].entitylen)) {
1362 				int e_len = basic_entities_dec[j].entitylen - 1;
1363 
1364 				*p++ = basic_entities_dec[j].charcode;
1365 				memmove(p, p + e_len, (e - p - e_len));
1366 				e -= e_len;
1367 				goto done;
1368 			}
1369 		}
1370 		p++;
1371 
1372 done:
1373 		if (p >= e) {
1374 			break;
1375 		}
1376 	} while ((p = memchr(p, '&', (e - p))));
1377 
1378 	new_len = e - new_str;
1379 
1380 	new_str[new_len] = '\0';
1381 	RETURN_STRINGL(new_str, new_len, 0);
1382 }
1383 /* }}} */
1384 
1385 /* {{{ proto string html_entity_decode(string string [, int quote_style][, string charset])
1386    Convert all HTML entities to their applicable characters */
PHP_FUNCTION(html_entity_decode)1387 PHP_FUNCTION(html_entity_decode)
1388 {
1389 	char *str, *hint_charset = NULL;
1390 	int str_len, hint_charset_len = 0, len;
1391 	long quote_style = ENT_COMPAT;
1392 	char *replaced;
1393 
1394 	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|ls", &str, &str_len,
1395 							  &quote_style, &hint_charset, &hint_charset_len) == FAILURE) {
1396 		return;
1397 	}
1398 
1399 	replaced = php_unescape_html_entities(str, str_len, &len, 1, quote_style, hint_charset TSRMLS_CC);
1400 	if (replaced) {
1401 		RETURN_STRINGL(replaced, len, 0);
1402 	}
1403 	RETURN_FALSE;
1404 }
1405 /* }}} */
1406 
1407 
1408 /* {{{ proto string htmlentities(string string [, int quote_style[, string charset[, bool double_encode]]])
1409    Convert all applicable characters to HTML entities */
PHP_FUNCTION(htmlentities)1410 PHP_FUNCTION(htmlentities)
1411 {
1412 	php_html_entities(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1);
1413 }
1414 /* }}} */
1415 
1416 /* {{{ proto array get_html_translation_table([int table [, int quote_style [, string charset_hint]]])
1417    Returns the internal translation table used by htmlspecialchars and htmlentities */
PHP_FUNCTION(get_html_translation_table)1418 PHP_FUNCTION(get_html_translation_table)
1419 {
1420 	long which = HTML_SPECIALCHARS, quote_style = ENT_COMPAT;
1421 	unsigned int i;
1422 	int j;
1423 	unsigned char ind[5]; /* max # of 8-bit code units (4; for UTF-8) + 1 for \0 */
1424 	void *dummy;
1425 	char *charset_hint = NULL;
1426 	int charset_hint_len;
1427 	enum entity_charset charset;
1428 
1429 	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "|lls",
1430 			&which, &quote_style, &charset_hint, &charset_hint_len) == FAILURE) {
1431 		return;
1432 	}
1433 
1434 	charset = determine_charset(charset_hint TSRMLS_CC);
1435 
1436 	array_init(return_value);
1437 
1438 	switch (which) {
1439 	case HTML_ENTITIES:
1440 		for (j = 0; entity_map[j].charset != cs_terminator; j++) {
1441 			if (entity_map[j].charset != charset)
1442 				continue;
1443 			for (i = 0; i <= entity_map[j].endchar - entity_map[j].basechar; i++) {
1444 				char buffer[16];
1445 				unsigned k;
1446 				size_t written;
1447 
1448 				if (entity_map[j].table[i] == NULL)
1449 					continue;
1450 
1451 				k = i + entity_map[j].basechar;
1452 
1453 				switch (charset) {
1454 				case cs_utf_8:
1455 					written = php_utf32_utf8(ind, k);
1456 					ind[written] = '\0';
1457 					break;
1458 				case cs_big5:
1459 				case cs_gb2312:
1460 				case cs_big5hkscs:
1461 				case cs_sjis:
1462 					/* we have no mappings for these, but if we had... */
1463 					/* break through */
1464 				default: /* one byte */
1465 					written = 1;
1466 					ind[0] = (unsigned char)k;
1467 					ind[1] = '\0';
1468 					break;
1469 				}
1470 
1471 				snprintf(buffer, sizeof(buffer), "&%s;", entity_map[j].table[i]);
1472 				if (zend_hash_find(Z_ARRVAL_P(return_value), (const char*)ind, written+1, &dummy) == FAILURE) {
1473 					/* in case of the single quote, which is repeated, the first one wins,
1474 						* so don't replace the existint mapping */
1475 					add_assoc_string(return_value, (const char*)ind, buffer, 1);
1476 				}
1477 			}
1478 		}
1479 		/* break thru */
1480 
1481 	case HTML_SPECIALCHARS:
1482 		add_assoc_stringl(return_value, "&", "&amp;", sizeof("&amp;") - 1, 1);
1483 		for (j = 0; basic_entities[j].charcode != 0; j++) {
1484 			if (basic_entities[j].flags && (quote_style & basic_entities[j].flags) == 0)
1485 				continue;
1486 
1487 			ind[0] = (unsigned char)basic_entities[j].charcode;
1488 			ind[1] = '\0';
1489 			if (zend_hash_find(Z_ARRVAL_P(return_value), (const char*)ind, 2, &dummy) == FAILURE) {
1490 				add_assoc_stringl(return_value, ind, basic_entities[j].entity,
1491 					basic_entities[j].entitylen, 1);
1492 			}
1493 		}
1494 
1495 		break;
1496 	}
1497 }
1498 /* }}} */
1499 
1500 /*
1501  * Local variables:
1502  * tab-width: 4
1503  * c-basic-offset: 4
1504  * End:
1505  * vim600: sw=4 ts=4 fdm=marker
1506  * vim<600: sw=4 ts=4
1507  */
1508