--TEST--
Test mb_decode_numericentity() function : Convert HTML entities to text
--EXTENSIONS--
mbstring
--FILE--
<?php

/**
 * Capture what var_dump() prints for a value and return it as a string.
 *
 * Output buffering is used so that nothing is written to the real output
 * while the dump is being taken.
 *
 * @param mixed $var Value to dump.
 * @return string The var_dump() text with leading/trailing whitespace trimmed.
 */
function varDumpToString($var)
{
    ob_start();
    var_dump($var);
    return trim(ob_get_clean());
}

/**
 * Decode $str with mb_decode_numericentity() and print a one-line verdict.
 *
 * The line has the form "<desc>: <dump of input> => <dump of result>",
 * followed by " (Good)" when the result is identical (===) to $expected and
 * by " (BAD; expected <dump of expected>)" otherwise.
 *
 * @param string $desc     Label printed at the start of the line.
 * @param string $str      Input passed to mb_decode_numericentity().
 * @param string $expected Result the decoder is expected to return.
 * @param array  $convmap  Conversion map passed to the decoder.
 * @param string $encoding Encoding name passed to the decoder.
 */
function test($desc, $str, $expected, $convmap, $encoding) {
    $result = mb_decode_numericentity($str, $convmap, $encoding);
    echo $desc, ": ", varDumpToString($str), " => ", varDumpToString($result);
    if ($result === $expected) {
        echo " (Good)\n";
    } else {
        echo " (BAD; expected ", varDumpToString($expected), ")\n";
    }
}

/**
 * Same check as test(), but prints input, result and expectation as hex.
 *
 * bin2hex() is used instead of a var_dump() rendering so that strings in
 * non-ASCII-compatible encodings produce readable, stable output.
 *
 * @param string $desc     Label printed at the start of the line.
 * @param string $str      Input passed to mb_decode_numericentity().
 * @param string $expected Result the decoder is expected to return.
 * @param array  $convmap  Conversion map passed to the decoder.
 * @param string $encoding Encoding name passed to the decoder.
 */
function testNonAscii($desc, $str, $expected, $convmap, $encoding) {
    $result = mb_decode_numericentity($str, $convmap, $encoding);
    echo $desc, ": ", bin2hex($str), " => ", bin2hex($result);
    if ($result === $expected) {
        echo " (Good)\n";
    } else {
        echo " (BAD; expected ", bin2hex($expected), ")\n";
    }
}

// Input fixtures made of decimal numeric entities:
//   $str1 - every codepoint from 161 to 255, in order
//   $str2 - a selection of codepoints from 402 up to 9830
//   $str3 - entities interleaved with plain ASCII letters
$str1 = '&#161;&#162;&#163;&#164;&#165;&#166;&#167;&#168;&#169;&#170;&#171;&#172;&#173;&#174;&#175;&#176;&#177;&#178;&#179;&#180;&#181;&#182;&#183;&#184;&#185;&#186;&#187;&#188;&#189;&#190;&#191;&#192;&#193;&#194;&#195;&#196;&#197;&#198;&#199;&#200;&#201;&#202;&#203;&#204;&#205;&#206;&#207;&#208;&#209;&#210;&#211;&#212;&#213;&#214;&#215;&#216;&#217;&#218;&#219;&#220;&#221;&#222;&#223;&#224;&#225;&#226;&#227;&#228;&#229;&#230;&#231;&#232;&#233;&#234;&#235;&#236;&#237;&#238;&#239;&#240;&#241;&#242;&#243;&#244;&#245;&#246;&#247;&#248;&#249;&#250;&#251;&#252;&#253;&#254;&#255;';
$str2 = '&#402;&#913;&#914;&#915;&#916;&#917;&#918;&#919;&#920;&#921;&#922;&#923;&#924;&#925;&#926;&#927;&#928;&#929;&#931;&#932;&#933;&#934;&#935;&#936;&#937;&#945;&#946;&#947;&#948;&#949;&#950;&#951;&#952;&#953;&#954;&#955;&#956;&#957;&#958;&#959;&#960;&#961;&#962;&#963;&#964;&#965;&#966;&#967;&#968;&#969;&#977;&#978;&#982;&#8226;&#8230;&#8242;&#8243;&#8254;&#8260;&#8472;&#8465;&#8476;&#8482;&#8501;&#8592;&#8593;&#8594;&#8595;&#8596;&#8629;&#8656;&#8657;&#8658;&#8659;&#8660;&#8704;&#8706;&#8707;&#8709;&#8711;&#8712;&#8713;&#8715;&#8719;&#8721;&#8722;&#8727;&#8730;&#8733;&#8734;&#8736;&#8743;&#8744;&#8745;&#8746;&#8747;&#8756;&#8764;&#8773;&#8776;&#8800;&#8801;&#8804;&#8805;&#8834;&#8835;&#8836;&#8838;&#8839;&#8853;&#8855;&#8869;&#8901;&#8968;&#8969;&#8970;&#8971;&#9001;&#9002;&#9674;&#9824;&#9827;&#9829;&#9830;';
$str3 = 'a&#338;b&#339;c&#352;d&#353;e&#8364;fg';

// Typical convmap, typical numeric entity-encoded string
$convmap = [0x0, 0x2FFFF, 0, 0xFFFF];
echo "1: " . mb_decode_numericentity($str1, $convmap, "UTF-8") . "\n";
echo "2: " . mb_decode_numericentity($str2, $convmap, "UTF-8") . "\n";
echo "3: " . mb_decode_numericentity($str3, $convmap, "UTF-8") . "\n";

// Numeric entities which are truncated at end of string (no terminating ';').
// No encoding argument is passed in cases 4-11e, so the function's default
// encoding is used.
echo "4: " . mb_decode_numericentity('&#1000000000', $convmap), "\n"; // Entity is too big
echo "5: " . mb_decode_numericentity('&#9000000000', $convmap), "\n"; // Entity is too big
echo "6: " . mb_decode_numericentity('&#10000000000', $convmap), "\n"; // Too many digits
echo "7: " . mb_decode_numericentity('&#100000000000', $convmap), "\n"; // Too many digits
echo "8: " . mb_decode_numericentity('&#000000000000', $convmap), "\n"; // Too many digits
echo "9: " . mb_decode_numericentity('&#00000000000', $convmap), "\n"; // Too many digits
// Results below are printed as hex because a decoded codepoint zero is not printable
echo "10: " . bin2hex(mb_decode_numericentity('&#0000000000', $convmap)), "\n"; // OK
echo "11: " . bin2hex(mb_decode_numericentity('&#000000000', $convmap)), "\n"; // OK
// Try with hex, not just decimal entities
echo "11b: " . bin2hex(mb_decode_numericentity('&#x0000000', $convmap)), "\n"; // OK
echo "11c: " . bin2hex(mb_decode_numericentity('&#x00000000', $convmap)), "\n"; // OK
echo "11d: " . bin2hex(mb_decode_numericentity('&#x10000', $convmap)), "\n"; // OK
echo "11e: " . mb_decode_numericentity('&#x000000000', $convmap), "\n"; // Too many digits

// Large decimal entity, converting from non-ASCII input encoding
echo "12: " . bin2hex(mb_decode_numericentity(mb_convert_encoding('&#12345678;', 'UCS-4', 'ASCII'), [0, 0x7FFFFFFF, 0, 0x7FFFFFFF], 'UCS-4')), "\n";

// Empty convmap; the input (which holds a named entity, not a numeric one)
// is expected to come back unchanged
$convmap = [];
echo "13: " . mb_decode_numericentity('f&ouml;o', $convmap, "UTF-8") . "\n";

// A convmap whose length is not a multiple of 4 must be rejected with a ValueError
$convmap = [0x0, 0x2FFFF, 0]; // 3 elements
try {
    echo "14: " . mb_decode_numericentity($str3, $convmap, "UTF-8") . "\n";
} catch (ValueError $ex) {
    echo "14: " . $ex->getMessage() . "\n";
}

// Entities for codepoint zero, decimal and hex; printed as hex
echo "15: " . bin2hex(mb_decode_numericentity('&#0;', [0, 1, 0, 0xFFFF], 'UTF-8')) . "\n";
echo "16: " . bin2hex(mb_decode_numericentity('&#x0;', [0, 1, 0, 0xFFFF], 'UTF-8')) . "\n";

// Weird convmap
$convmap = [
    0, 0, 0, 0,        // Only one codepoint, empty mask
    100, 50, 0, 0xFFFF // 'End' codepoint is before 'start' codepoint
];
echo "17: " . mb_decode_numericentity('föo', $convmap, "UTF-8") . "\n";

// Convmap with positive offset
$convmap = [0, 10, 1000, 0xFFFF];
echo "18: " . bin2hex(mb_decode_numericentity("&#1001;&#1002;&#1003;", $convmap, "UTF-8")) . "\n";
echo "19: " . bin2hex(mb_decode_numericentity("&#x3E9;&#x3EA;&#x3EB;", $convmap, "UTF-8")) . "\n";

// Decimal entity ended by a non-digit character rather than ';'
echo "20: " . mb_decode_numericentity("&#123a;", [0, 0xFFFF, 0, 0xFFFF]) . "\n";

// Maximum accepted digit counts: 10 for decimal entities, 8 for hex entities
test("10 digits for decimal entity", "&#0000000065;", "A", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
test("More than 10 digits for decimal entity", "&#00000000165;", "&#00000000165;", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');

test("8 digits for hex entity", "&#x00000041;", "A", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
test("More than 8 digits for hex entity", "&#x000000141;", "&#x000000141;", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');

test("Single &", "&", "&", [0, 0xFFFF, 0, 0xFFFF], "ASCII");

// An entity can come right after a preceding ampersand
test("Successive &", "&&#65,", "&A,", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');

// An entity can come right after a preceding &#
test("Successive &#", "&#&#x32;", "&#2", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
test("Successive &#x", "&#x&#x32;", "&#x2", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');

test("&#x only", "&#x;", "&#x;", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');

// The starting & of an entity can terminate a preceding entity
test("Successive &#65", "&#65&#65;", "AA", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
test("Successive hex entities", "&#x32&#x32;", "22", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');

// An entity can come right after an entity which is invalid because of being too long
test("Starting entity immediately after decimal entity which is too long", "&#10000000000&#65;", "&#10000000000A", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
test("Starting entity immediately after hex entity which is too long", "&#x111111111&#65;", "&#x111111111A", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');

// Same situation, but the first entity is valid and exactly at the maximum length.
// UCS-4BE is used so that the large codepoints can be represented in the output.
$ucs4_test1 = mb_convert_encoding("&#1000000000&#65;", 'UCS-4BE', 'ASCII');
testNonAscii("Starting entity immediately after valid decimal entity which is just within maximum length", $ucs4_test1, "\x3B\x9A\xCA\x00\x00\x00\x00A", [0, 0xFFFFFFFF, 0, 0xFFFFFFFF], 'UCS-4BE');
$ucs4_test2 = mb_convert_encoding("&#x11111111&#65;", 'UCS-4BE', 'ASCII');
testNonAscii("Starting entity immediately after valid hex entity which is just within maximum length", $ucs4_test2, "\x11\x11\x11\x11\x00\x00\x00A", [0, 0xFFFFFFFF, 0, 0xFFFFFFFF], 'UCS-4BE');

// The convmap starts at 0x1, so codepoint zero is outside it and &#0 / &#x0 stay undecoded
test("Starting entity immediately after invalid decimal entity", "&#0&#65;", "&#0A", [0x1, 0xFFFF, 0, 0xFFFF], 'ASCII');
test("Starting entity immediately after invalid hex entity", "&#x0&#65;", "&#x0A", [0x1, 0xFFFF, 0, 0xFFFF], 'ASCII');

test("Starting entity immediately after too-big decimal entity", "&#7001492542&#65;", "&#7001492542A", [0, 0xFFFFFFFF, 0, 0xFFFFFFFF], 'ASCII');

// If the numeric entity decodes to 0xFFFFFFFF, that should be passed through
// Originally, the new implementation of mb_decode_numericentity used -1 as a marker indicating
// that the entity could not be successfully decoded, so if the entity decoded successfully to
// 0xFFFFFFFF (-1), it would be treated as an invalid entity
test("Regression test (entity which decodes to 0xFFFFFFFF)", "&#xe;", "?", [0xFFFFFF86, 0xFFFFFFFF, 0xF, 0xFC015448], 'HZ');

// With the legacy conversion filters, a trailing & could be truncated by mb_decode_numericentity,
// because some text encodings did not properly invoke the next flush function in the chain
test("Regression test (truncation of successive & with JIS encoding)", "&&&", "&&&", [0x20FF37FF, 0x7202F569, 0xC4090023, 0xF160], "JIS");

// Previously, signed arithmetic was used on convmap entries
test("Regression test (convmap entries are now treated as unsigned)", "&#7,", "?,", [0x22FFFF11, 0xBF111189, 0x67726511, 0x1161E719], "ASCII");

// Try with '&', '&#', or '&#x' at the end of a buffer of wchars, with more input
// still left to process in the next buffer
// (mb_decode_numericentity splits its input into 'chunks' and processes it one
// chunk at a time)
// Padding lengths 0..255 slide the entity across every offset in that range.
// Nothing is printed on success; any mismatch aborts the script with a message.
$convmap = [0, 0xFFFF, 0, 0xFFFF];
for ($i = 0; $i < 256; $i++) {
    $padding = str_repeat("a", $i);
    // First try invalid decimal/hex entities
    if (mb_decode_numericentity($padding . "&#ZZZ", $convmap, 'UTF-8') !== $padding . "&#ZZZ") {
        die("&#ZZZ is broken when it spans two buffers!");
    }
    if (mb_decode_numericentity($padding . "&#xZZZ", $convmap, 'UTF-8') !== $padding . "&#xZZZ") {
        die("&#xZZZ is broken when it spans two buffers!");
    }
    // Now try valid decimal/hex entities
    if (mb_decode_numericentity($padding . "&#65", $convmap, 'UTF-8') !== $padding . "A") {
        die("&#65 is broken when it spans two buffers!");
    }
    if (mb_decode_numericentity($padding . "&#x41", $convmap, 'UTF-8') !== $padding . "A") {
        die("&#x41 is broken when it spans two buffers!");
    }
}

// Try huge entities, big enough to fill an entire buffer
// (12 or more leading zeroes exceed the digit limit, so the input must come back unchanged)
for ($i = 12; $i < 256; $i++) {
    $str = "&#" . str_repeat("0", $i) . "65";
    if (mb_decode_numericentity($str, $convmap, 'UTF-8') !== $str) {
        die("Decimal entity with huge number of digits broken");
    }

    $str = "&#x" . str_repeat("0", $i) . "41";
    if (mb_decode_numericentity($str, $convmap, 'UTF-8') !== $str) {
        die("Hexadecimal entity with huge number of digits broken");
    }
}

?>
--EXPECT--
1: ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ
2: ƒΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩαβγδεζηθικλμνξοπρςστυφχψωϑϒϖ•…′″‾⁄℘ℑℜ™ℵ←↑→↓↔↵⇐⇑⇒⇓⇔∀∂∃∅∇∈∉∋∏∑−∗√∝∞∠∧∨∩∪∫∴∼≅≈≠≡≤≥⊂⊃⊄⊆⊇⊕⊗⊥⋅⌈⌉⌊⌋〈〉◊♠♣♥♦
3: aŒbœcŠdše€fg
4: &#1000000000
5: &#9000000000
6: &#10000000000
7: &#100000000000
8: &#000000000000
9: &#00000000000
10: 00
11: 00
11b: 00
11c: 00
11d: f0908080
11e: &#x000000000
12: 00bc614e
13: f&ouml;o
14: mb_decode_numericentity(): Argument #2 ($map) must have a multiple of 4 elements
15: 00
16: 00
17: föo
18: 010203
19: 010203
20: {a;
10 digits for decimal entity: string(13) "&#0000000065;" => string(1) "A" (Good)
More than 10 digits for decimal entity: string(14) "&#00000000165;" => string(14) "&#00000000165;" (Good)
8 digits for hex entity: string(12) "&#x00000041;" => string(1) "A" (Good)
More than 8 digits for hex entity: string(13) "&#x000000141;" => string(13) "&#x000000141;" (Good)
Single &: string(1) "&" => string(1) "&" (Good)
Successive &: string(6) "&&#65," => string(3) "&A," (Good)
Successive &#: string(8) "&#&#x32;" => string(3) "&#2" (Good)
Successive &#x: string(9) "&#x&#x32;" => string(4) "&#x2" (Good)
&#x only: string(4) "&#x;" => string(4) "&#x;" (Good)
Successive &#65: string(9) "&#65&#65;" => string(2) "AA" (Good)
Successive hex entities: string(11) "&#x32&#x32;" => string(2) "22" (Good)
Starting entity immediately after decimal entity which is too long: string(18) "&#10000000000&#65;" => string(14) "&#10000000000A" (Good)
Starting entity immediately after hex entity which is too long: string(17) "&#x111111111&#65;" => string(13) "&#x111111111A" (Good)
Starting entity immediately after valid decimal entity which is just within maximum length: 000000260000002300000031000000300000003000000030000000300000003000000030000000300000003000000030000000260000002300000036000000350000003b => 3b9aca0000000041 (Good)
Starting entity immediately after valid hex entity which is just within maximum length: 0000002600000023000000780000003100000031000000310000003100000031000000310000003100000031000000260000002300000036000000350000003b => 1111111100000041 (Good)
Starting entity immediately after invalid decimal entity: string(8) "&#0&#65;" => string(4) "&#0A" (Good)
Starting entity immediately after invalid hex entity: string(9) "&#x0&#65;" => string(5) "&#x0A" (Good)
Starting entity immediately after too-big decimal entity: string(17) "&#7001492542&#65;" => string(13) "&#7001492542A" (Good)
Regression test (entity which decodes to 0xFFFFFFFF): string(5) "&#xe;" => string(1) "?" (Good)
Regression test (truncation of successive & with JIS encoding): string(3) "&&&" => string(3) "&&&" (Good)
Regression test (convmap entries are now treated as unsigned): string(4) "&#7," => string(2) "?," (Good)