1--TEST--
2Test mb_decode_numericentity() function : Convert HTML entities to text
3--EXTENSIONS--
4mbstring
5--FILE--
6<?php
7
8function varDumpToString($var)
9{
10    ob_start();
11    var_dump($var);
12    return trim(ob_get_clean());
13}
14
15function test($desc, $str, $expected, $convmap, $encoding) {
16    $result = mb_decode_numericentity($str, $convmap, $encoding);
17    echo $desc, ": ", varDumpToString($str), " => ", varDumpToString($result);
18    if ($result === $expected)
19        echo " (Good)\n";
20    else
21        echo " (BAD; expected ", varDumpToString($expected), ")\n";
22}
23
24function testNonAscii($desc, $str, $expected, $convmap, $encoding) {
25    $result = mb_decode_numericentity($str, $convmap, $encoding);
26    echo $desc, ": ", bin2hex($str), " => ", bin2hex($result);
27    if ($result === $expected)
28        echo " (Good)\n";
29    else
30        echo " (BAD; expected ", bin2hex($expected), ")\n";
31}
32
33$str1 = '&#161;&#162;&#163;&#164;&#165;&#166;&#167;&#168;&#169;&#170;&#171;&#172;&#173;&#174;&#175;&#176;&#177;&#178;&#179;&#180;&#181;&#182;&#183;&#184;&#185;&#186;&#187;&#188;&#189;&#190;&#191;&#192;&#193;&#194;&#195;&#196;&#197;&#198;&#199;&#200;&#201;&#202;&#203;&#204;&#205;&#206;&#207;&#208;&#209;&#210;&#211;&#212;&#213;&#214;&#215;&#216;&#217;&#218;&#219;&#220;&#221;&#222;&#223;&#224;&#225;&#226;&#227;&#228;&#229;&#230;&#231;&#232;&#233;&#234;&#235;&#236;&#237;&#238;&#239;&#240;&#241;&#242;&#243;&#244;&#245;&#246;&#247;&#248;&#249;&#250;&#251;&#252;&#253;&#254;&#255;';
34$str2 = '&#402;&#913;&#914;&#915;&#916;&#917;&#918;&#919;&#920;&#921;&#922;&#923;&#924;&#925;&#926;&#927;&#928;&#929;&#931;&#932;&#933;&#934;&#935;&#936;&#937;&#945;&#946;&#947;&#948;&#949;&#950;&#951;&#952;&#953;&#954;&#955;&#956;&#957;&#958;&#959;&#960;&#961;&#962;&#963;&#964;&#965;&#966;&#967;&#968;&#969;&#977;&#978;&#982;&#8226;&#8230;&#8242;&#8243;&#8254;&#8260;&#8472;&#8465;&#8476;&#8482;&#8501;&#8592;&#8593;&#8594;&#8595;&#8596;&#8629;&#8656;&#8657;&#8658;&#8659;&#8660;&#8704;&#8706;&#8707;&#8709;&#8711;&#8712;&#8713;&#8715;&#8719;&#8721;&#8722;&#8727;&#8730;&#8733;&#8734;&#8736;&#8743;&#8744;&#8745;&#8746;&#8747;&#8756;&#8764;&#8773;&#8776;&#8800;&#8801;&#8804;&#8805;&#8834;&#8835;&#8836;&#8838;&#8839;&#8853;&#8855;&#8869;&#8901;&#8968;&#8969;&#8970;&#8971;&#9001;&#9002;&#9674;&#9824;&#9827;&#9829;&#9830;';
35$str3 = 'a&#338;b&#339;c&#352;d&#353;e&#8364;fg';
36
37// Typical convmap, typical numeric entity-encoded string
38$convmap = array(0x0, 0x2FFFF, 0, 0xFFFF);
39echo "1: " . mb_decode_numericentity($str1, $convmap, "UTF-8") . "\n";
40echo "2: " . mb_decode_numericentity($str2, $convmap, "UTF-8") . "\n";
41echo "3: " . mb_decode_numericentity($str3, $convmap, "UTF-8") . "\n";
42
43// Numeric entities which are truncated at end of string
44echo "4: " . mb_decode_numericentity('&#1000000000', $convmap), "\n"; // Entity is too big
45echo "5: " . mb_decode_numericentity('&#9000000000', $convmap), "\n"; // Entity is too big
46echo "6: " . mb_decode_numericentity('&#10000000000', $convmap), "\n"; // Too many digits
47echo "7: " . mb_decode_numericentity('&#100000000000', $convmap), "\n"; // Too many digits
48echo "8: " . mb_decode_numericentity('&#000000000000', $convmap), "\n"; // Too many digits
49echo "9: " . mb_decode_numericentity('&#00000000000', $convmap), "\n"; // Too many digits
50echo "10: " . bin2hex(mb_decode_numericentity('&#0000000000', $convmap)), "\n"; // OK
51echo "11: " . bin2hex(mb_decode_numericentity('&#000000000', $convmap)), "\n"; // OK
52// Try with hex, not just decimal entities
53echo "11b: " . bin2hex(mb_decode_numericentity('&#x0000000', $convmap)), "\n"; // OK
54echo "11c: " . bin2hex(mb_decode_numericentity('&#x00000000', $convmap)), "\n"; // OK
55echo "11d: " . bin2hex(mb_decode_numericentity('&#x10000', $convmap)), "\n"; // OK
56echo "11e: " . mb_decode_numericentity('&#x000000000', $convmap), "\n"; // Too many digits
57
58// Large decimal entity, converting from non-ASCII input encoding
59echo "12: " . bin2hex(mb_decode_numericentity(mb_convert_encoding('&#12345678;', 'UCS-4', 'ASCII'), [0, 0x7FFFFFFF, 0, 0x7FFFFFFF], 'UCS-4')), "\n";
60
61$convmap = [];
62echo "13: " . mb_decode_numericentity('f&ouml;o', $convmap, "UTF-8") . "\n";
63
64echo "15: " . bin2hex(mb_decode_numericentity('&#0;', [0, 1, 0, 0xFFFF], 'UTF-8')) . "\n";
65echo "16: " . bin2hex(mb_decode_numericentity('&#x0;', [0, 1, 0, 0xFFFF], 'UTF-8')) . "\n";
66
67// Weird convmap
68$convmap = [
69    0, 0, 0, 0,        // Only one codepoint, empty mask
70    100, 50, 0, 0xFFFF // 'End' codepoint is before 'start' codepoint
71];
72echo "17: " . mb_decode_numericentity('föo', $convmap, "UTF-8") . "\n";
73
74// Convmap with positive offset
75$convmap = [0, 10, 1000, 0xFFFF];
76echo "18: " . bin2hex(mb_decode_numericentity("&#1001;&#1002;&#1003;", $convmap, "UTF-8")) . "\n";
77echo "19: " . bin2hex(mb_decode_numericentity("&#x3E9;&#x3EA;&#x3EB;", $convmap, "UTF-8")) . "\n";
78
79echo "20: " . mb_decode_numericentity("&#123a;", [0, 0xFFFF, 0, 0xFFFF]) . "\n";
80
81test("10 digits for decimal entity", "&#0000000065;", "A", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
82test("More than 10 digits for decimal entity", "&#00000000165;", "&#00000000165;", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
83
84test("8 digits for hex entity", "&#x00000041;", "A", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
85test("More than 8 digits for hex entity", "&#x000000141;", "&#x000000141;", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
86
87test("Single &", "&", "&", [0, 0xFFFF, 0, 0xFFFF], "ASCII");
88
89// An entity can come right after a preceding ampersand
90test("Successive &", "&&#65,", "&A,", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
91
92// An entity can come right after a preceding &#
93test("Successive &#", "&#&#x32;", "&#2", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
94test("Successive &#x", "&#x&#x32;", "&#x2", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
95
96test("&#x only", "&#x;", "&#x;", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
97
98// The starting & of an entity can terminate a preceding entity
99test("Successive &#65", "&#65&#65;", "AA", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
100test("Successive hex entities", "&#x32&#x32;", "22", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
101
102// An entity can come right after an entity which is invalid because of being too long
103test("Starting entity immediately after decimal entity which is too long", "&#10000000000&#65;", "&#10000000000A", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
104test("Starting entity immediately after hex entity which is too long", "&#x111111111&#65;", "&#x111111111A", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
105
106test("Starting entity immediately after invalid decimal entity", "&#0&#65;", "&#0A", [0x1, 0xFFFF, 0, 0xFFFF], 'ASCII');
107test("Starting entity immediately after invalid hex entity", "&#x0&#65;", "&#x0A", [0x1, 0xFFFF, 0, 0xFFFF], 'ASCII');
108
109// Try with '&', '&#', or '&#' at the end of a buffer of wchars, with more input
110// still left to process in the next buffer
111// (mb_decode_numericentity splits its input into 'chunks' and processes it one
112// chunk at a time)
113$convmap = [0, 0xFFFF, 0, 0xFFFF];
114for ($i = 0; $i < 256; $i++) {
115    $padding = str_repeat("a", $i);
116    // First try invalid decimal/hex entities
117    if (mb_decode_numericentity($padding . "&#ZZZ", $convmap, 'UTF-8') !== $padding . "&#ZZZ")
118        die("&#ZZZ is broken when it spans two buffers!");
119    if (mb_decode_numericentity($padding . "&#xZZZ", $convmap, 'UTF-8') !== $padding . "&#xZZZ")
120        die("&#xZZZ is broken when it spans two buffers!");
121    // Now try valid decimal/hex entities
122    if (mb_decode_numericentity($padding . "&#65", $convmap, 'UTF-8') !== $padding . "A")
123        die("&#65 is broken when it spans two buffers!");
124    if (mb_decode_numericentity($padding . "&#x41", $convmap, 'UTF-8') !== $padding . "A")
125        die("&#x41 is broken when it spans two buffers!");
126}
127
128// Try huge entities, big enough to fill an entire buffer
129for ($i = 12; $i < 256; $i++) {
130    $str = "&#" . str_repeat("0", $i) . "65";
131    if (mb_decode_numericentity($str, $convmap, 'UTF-8') !== $str)
132        die("Decimal entity with huge number of digits broken");
133
134    $str = "&#x" . str_repeat("0", $i) . "41";
135    if (mb_decode_numericentity($str, $convmap, 'UTF-8') !== $str)
136        die("Hexadecimal entity with huge number of digits broken");
137}
138
139?>
140--EXPECT--
1411: ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ
1422: ƒΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩαβγδεζηθικλμνξοπρςστυφχψωϑϒϖ•…′″‾⁄℘ℑℜ™ℵ←↑→↓↔↵⇐⇑⇒⇓⇔∀∂∃∅∇∈∉∋∏∑−∗√∝∞∠∧∨∩∪∫∴∼≅≈≠≡≤≥⊂⊃⊄⊆⊇⊕⊗⊥⋅⌈⌉⌊⌋〈〉◊♠♣♥♦
1433: aŒbœcŠdše€fg
1444: &#1000000000
1455: &#9000000000
1466: &#10000000000
1477: &#100000000000
1488: &#000000000000
1499: &#00000000000
15010: 00
15111: 00
15211b: 00
15311c: 00
15411d: f0908080
15511e: &#x000000000
15612: 00bc614e
15713: f&ouml;o
15815: 00
15916: 00
16017: föo
16118: 010203
16219: 010203
16320: {a;
16410 digits for decimal entity: string(13) "&#0000000065;" => string(1) "A" (Good)
165More than 10 digits for decimal entity: string(14) "&#00000000165;" => string(14) "&#00000000165;" (Good)
1668 digits for hex entity: string(12) "&#x00000041;" => string(1) "A" (Good)
167More than 8 digits for hex entity: string(13) "&#x000000141;" => string(13) "&#x000000141;" (Good)
168Single &: string(1) "&" => string(1) "&" (Good)
169Successive &: string(6) "&&#65," => string(3) "&A," (Good)
170Successive &#: string(8) "&#&#x32;" => string(3) "&#2" (Good)
171Successive &#x: string(9) "&#x&#x32;" => string(4) "&#x2" (Good)
172&#x only: string(4) "&#x;" => string(4) "&#x;" (Good)
173Successive &#65: string(9) "&#65&#65;" => string(2) "AA" (Good)
174Successive hex entities: string(11) "&#x32&#x32;" => string(2) "22" (Good)
175Starting entity immediately after decimal entity which is too long: string(18) "&#10000000000&#65;" => string(14) "&#10000000000A" (Good)
176Starting entity immediately after hex entity which is too long: string(17) "&#x111111111&#65;" => string(13) "&#x111111111A" (Good)
177Starting entity immediately after invalid decimal entity: string(8) "&#0&#65;" => string(4) "&#0A" (Good)
178Starting entity immediately after invalid hex entity: string(9) "&#x0&#65;" => string(5) "&#x0A" (Good)
179