--TEST--
Test mb_decode_numericentity() function : Convert HTML entities to text
--EXTENSIONS--
mbstring
--FILE--
<?php

// NOTE(review): several string literals in this file look as if their numeric
// entities were already decoded before this copy was produced. For example the
// "10 digits for decimal entity" case passes the literal "A", while the expected
// output section records string(13) for that same input. Confirm the literals
// against the canonical copy of this test before relying on it.

/**
 * Capture the output of var_dump() for a value and return it as a string.
 *
 * Output buffering is used so that nothing is printed by this helper itself;
 * surrounding whitespace (including var_dump's trailing newline) is trimmed.
 *
 * @param mixed $var Value to dump.
 * @return string Trimmed var_dump() text.
 */
function varDumpToString($var)
{
    ob_start();
    var_dump($var);
    return trim(ob_get_clean());
}

/**
 * Run one mb_decode_numericentity() case and print a one-line verdict.
 *
 * Prints "<desc>: <dump of input> => <dump of result>" followed by " (Good)"
 * when the result is identical (===) to $expected, otherwise
 * " (BAD; expected <dump of expected>)".
 *
 * @param string $desc     Label printed at the start of the line.
 * @param string $str      Input passed to mb_decode_numericentity().
 * @param string $expected Result the call must return.
 * @param array  $convmap  Conversion map passed through unchanged.
 * @param string $encoding Encoding name passed through unchanged.
 */
function test($desc, $str, $expected, $convmap, $encoding) {
    $result = mb_decode_numericentity($str, $convmap, $encoding);
    echo $desc, ": ", varDumpToString($str), " => ", varDumpToString($result);
    if ($result === $expected)
        echo " (Good)\n";
    else
        echo " (BAD; expected ", varDumpToString($expected), ")\n";
}

/**
 * Same check as test(), but input, result and expected value are printed with
 * bin2hex() instead of var_dump(), so byte strings in non-ASCII encodings stay
 * readable in the output.
 *
 * @param string $desc     Label printed at the start of the line.
 * @param string $str      Input passed to mb_decode_numericentity().
 * @param string $expected Result the call must return (compared with ===).
 * @param array  $convmap  Conversion map passed through unchanged.
 * @param string $encoding Encoding name passed through unchanged.
 */
function testNonAscii($desc, $str, $expected, $convmap, $encoding) {
    $result = mb_decode_numericentity($str, $convmap, $encoding);
    echo $desc, ": ", bin2hex($str), " => ", bin2hex($result);
    if ($result === $expected)
        echo " (Good)\n";
    else
        echo " (BAD; expected ", bin2hex($expected), ")\n";
}

$str1 = '¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ';
$str2 = 'ƒΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩαβγδεζηθικλμνξοπρςστυφχψωϑϒϖ•…′″‾⁄℘ℑℜ™ℵ←↑→↓↔↵⇐⇑⇒⇓⇔∀∂∃∅∇∈∉∋∏∑−∗√∝∞∠∧∨∩∪∫∴∼≅≈≠≡≤≥⊂⊃⊄⊆⊇⊕⊗⊥⋅⌈⌉⌊⌋〈〉◊♠♣♥♦';
$str3 = 'aŒbœcŠdše€fg';

// Typical convmap, typical numeric entity-encoded string
$convmap = array(0x0, 0x2FFFF, 0, 0xFFFF);
echo "1: " . mb_decode_numericentity($str1, $convmap, "UTF-8") . "\n";
echo "2: " . mb_decode_numericentity($str2, $convmap, "UTF-8") . "\n";
echo "3: " . mb_decode_numericentity($str3, $convmap, "UTF-8") . "\n";

// Numeric entities which are truncated at end of string
echo "4: " . mb_decode_numericentity('�', $convmap), "\n"; // Entity is too big
echo "5: " . mb_decode_numericentity('�', $convmap), "\n"; // Entity is too big
echo "6: " . mb_decode_numericentity('�', $convmap), "\n"; // Too many digits
echo "7: " . mb_decode_numericentity('�', $convmap), "\n"; // Too many digits
echo "8: " . mb_decode_numericentity('�', $convmap), "\n"; // Too many digits
echo "9: " . mb_decode_numericentity('�', $convmap), "\n"; // Too many digits
echo "10: " . bin2hex(mb_decode_numericentity('�', $convmap)), "\n"; // OK
echo "11: " . bin2hex(mb_decode_numericentity('�', $convmap)), "\n"; // OK
// Try with hex, not just decimal entities
echo "11b: " . bin2hex(mb_decode_numericentity('�', $convmap)), "\n"; // OK
echo "11c: " . bin2hex(mb_decode_numericentity('�', $convmap)), "\n"; // OK
echo "11d: " . bin2hex(mb_decode_numericentity('𐀀', $convmap)), "\n"; // OK
echo "11e: " . mb_decode_numericentity('�', $convmap), "\n"; // Too many digits

// Large decimal entity, converting from non-ASCII input encoding
echo "12: " . bin2hex(mb_decode_numericentity(mb_convert_encoding('�', 'UCS-4', 'ASCII'), [0, 0x7FFFFFFF, 0, 0x7FFFFFFF], 'UCS-4')), "\n";

// Empty convmap: the expected output shows the input coming back unchanged
$convmap = [];
echo "13: " . mb_decode_numericentity('föo', $convmap, "UTF-8") . "\n";

// A convmap whose length is not a multiple of 4 raises a ValueError; the
// exception message is printed so the expected output pins its wording
$convmap = array(0x0, 0x2FFFF, 0); // 3 elements
try {
    echo "14: " . mb_decode_numericentity($str3, $convmap, "UTF-8") . "\n";
} catch (ValueError $ex) {
    echo "14: " . $ex->getMessage()."\n";
}

echo "15: " . bin2hex(mb_decode_numericentity('�', [0, 1, 0, 0xFFFF], 'UTF-8')) . "\n";
echo "16: " . bin2hex(mb_decode_numericentity('�', [0, 1, 0, 0xFFFF], 'UTF-8')) . "\n";

// Weird convmap
$convmap = [
    0, 0, 0, 0, // Only one codepoint, empty mask
    100, 50, 0, 0xFFFF // 'End' codepoint is before 'start' codepoint
];
echo "17: " . mb_decode_numericentity('föo', $convmap, "UTF-8") . "\n";

// Convmap with positive offset
$convmap = [0, 10, 1000, 0xFFFF];
echo "18: " . bin2hex(mb_decode_numericentity("ϩϪϫ", $convmap, "UTF-8")) . "\n";
echo "19: " . bin2hex(mb_decode_numericentity("ϩϪϫ", $convmap, "UTF-8")) . "\n";

// No encoding argument is passed here, so the function's default applies
echo "20: " . mb_decode_numericentity("{a;", [0, 0xFFFF, 0, 0xFFFF]) . "\n";

test("10 digits for decimal entity", "A", "A", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
test("More than 10 digits for decimal entity", "¥", "¥", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');

test("8 digits for hex entity", "A", "A", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
test("More than 8 digits for hex entity", "Ł", "Ł", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');

test("Single &", "&", "&", [0, 0xFFFF, 0, 0xFFFF], "ASCII");

// An entity can come right after a preceding ampersand
test("Successive &", "&A,", "&A,", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');

// An entity can come right after a preceding &#
test("Successive &#", "&#2", "", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
test("Successive &#x", "&#x2", "", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');

test("&#x only", "&#x;", "&#x;", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');

// The starting & of an entity can terminate a preceding entity
test("Successive A", "AA", "AA", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
test("Successive hex entities", "22", "22", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');

// An entity can come right after an entity which is invalid because of being too long
test("Starting entity immediately after decimal entity which is too long", "�A", "�A", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
test("Starting entity immediately after hex entity which is too long", "�A", "�", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');

$ucs4_test1 = mb_convert_encoding("�A", 'UCS-4BE', 'ASCII');
testNonAscii("Starting entity immediately after valid decimal entity which is just within maximum length", $ucs4_test1, "\x3B\x9A\xCA\x00\x00\x00\x00A", [0, 0xFFFFFFFF, 0, 0xFFFFFFFF], 'UCS-4BE');
$ucs4_test2 = mb_convert_encoding("�A", 'UCS-4BE', 'ASCII');
testNonAscii("Starting entity immediately after valid hex entity which is just within maximum length", $ucs4_test2, "\x11\x11\x11\x11\x00\x00\x00A", [0, 0xFFFFFFFF, 0, 0xFFFFFFFF], 'UCS-4BE');

test("Starting entity immediately after invalid decimal entity", "�A", "�A", [0x1, 0xFFFF, 0, 0xFFFF], 'ASCII');
test("Starting entity immediately after invalid hex entity", "�A", "
", [0x1, 0xFFFF, 0, 0xFFFF], 'ASCII');

test("Starting entity immediately after too-big decimal entity", "�A", "�A", [0, 0xFFFFFFFF, 0, 0xFFFFFFFF], 'ASCII');

// If the numeric entity decodes to 0xFFFFFFFF, that should be passed through
// Originally, the new implementation of mb_decode_numericentity used -1 as a marker indicating
// that the entity could not be successfully decoded, so if the entity decoded successfully to
// 0xFFFFFFFF (-1), it would be treated as an invalid entity
test("Regression test (entity which decodes to 0xFFFFFFFF)", "", "?", [0xFFFFFF86, 0xFFFFFFFF, 0xF, 0xFC015448], 'HZ');

// With the legacy conversion filters, a trailing & could be truncated by mb_decode_numericentity,
// because some text encodings did not properly invoke the next flush function in the chain
test("Regression test (truncation of successive & with JIS encoding)", "&&&", "&&&", [0x20FF37FF, 0x7202F569, 0xC4090023, 0xF160], "JIS");

// Previously, signed arithmetic was used on convmap entries
test("Regression test (convmap entries are now treated as unsigned)", ",", "?,", [0x22FFFF11, 0xBF111189, 0x67726511, 0x1161E719], "ASCII");

// Try with '&', '&#', or '&#x' at the end of a buffer of wchars, with more input
// still left to process in the next buffer
// (mb_decode_numericentity splits its input into 'chunks' and processes it one
// chunk at a time)
$convmap = [0, 0xFFFF, 0, 0xFFFF];
// $i is the number of "a" characters placed before the entity, so each
// iteration moves the start of the entity one position further into the input
for ($i = 0; $i < 256; $i++) {
    $padding = str_repeat("a", $i);
    // First try invalid decimal/hex entities
    if (mb_decode_numericentity($padding . "&#ZZZ", $convmap, 'UTF-8') !== $padding . "&#ZZZ")
        die("&#ZZZ is broken when it spans two buffers!");
    if (mb_decode_numericentity($padding . "&#xZZZ", $convmap, 'UTF-8') !== $padding . "&#xZZZ")
        die("&#xZZZ is broken when it spans two buffers!");
    // Now try valid decimal/hex entities
    if (mb_decode_numericentity($padding . "A", $convmap, 'UTF-8') !== $padding . "A")
        die("A is broken when it spans two buffers!");
    if (mb_decode_numericentity($padding . "A", $convmap, 'UTF-8') !== $padding . "A")
        die("A is broken when it spans two buffers!");
}

// Try huge entities, big enough to fill an entire buffer
// $i is the number of leading zero digits (12 to 255); in every case the
// function must return the input string unchanged
for ($i = 12; $i < 256; $i++) {
    $str = "&#" . str_repeat("0", $i) . "65";
    if (mb_decode_numericentity($str, $convmap, 'UTF-8') !== $str)
        die("Decimal entity with huge number of digits broken");

    $str = "&#x" . str_repeat("0", $i) . "41";
    if (mb_decode_numericentity($str, $convmap, 'UTF-8') !== $str)
        die("Hexadecimal entity with huge number of digits broken");
}

?>
--EXPECT--
1: ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ
2: ƒΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩαβγδεζηθικλμνξοπρςστυφχψωϑϒϖ•…′″‾⁄℘ℑℜ™ℵ←↑→↓↔↵⇐⇑⇒⇓⇔∀∂∃∅∇∈∉∋∏∑−∗√∝∞∠∧∨∩∪∫∴∼≅≈≠≡≤≥⊂⊃⊄⊆⊇⊕⊗⊥⋅⌈⌉⌊⌋〈〉◊♠♣♥♦
3: aŒbœcŠdše€fg
4: �
5: �
6: �
7: �
8: �
9: �
10: 00
11: 00
11b: 00
11c: 00
11d: f0908080
11e: �
12: 00bc614e
13: föo
14: mb_decode_numericentity(): Argument #2 ($map) must have a multiple of 4 elements
15: 00
16: 00
17: föo
18: 010203
19: 010203
20: {a;
10 digits for decimal entity: string(13) "A" => string(1) "A" (Good)
More than 10 digits for decimal entity: string(14) "¥" => string(14) "¥" (Good)
8 digits for hex entity: string(12) "A" => string(1) "A" (Good)
More than 8 digits for hex entity: string(13) "Ł" => string(13) "Ł" (Good)
Single &: string(1) "&" => string(1) "&" (Good)
Successive &: string(6) "&A," => string(3) "&A," (Good)
Successive &#: string(8) "&#2" => string(3) "" (Good)
Successive &#x: string(9) "&#x2" => string(4) "" (Good)
&#x only: string(4) "&#x;" => string(4) "&#x;" (Good)
Successive A: string(9) "AA" => string(2) "AA" (Good)
Successive hex entities: string(11) "22" => string(2) "22" (Good)
Starting entity immediately after decimal entity which is too long: string(18) "�A" => string(14) "�A" (Good)
Starting entity immediately after hex entity which is too long: string(17) "�A" => string(13) "�" (Good)
Starting entity immediately after valid decimal entity which is just within maximum length: 000000260000002300000031000000300000003000000030000000300000003000000030000000300000003000000030000000260000002300000036000000350000003b => 3b9aca0000000041 (Good)
Starting entity immediately after valid hex entity which is just within maximum length: 0000002600000023000000780000003100000031000000310000003100000031000000310000003100000031000000260000002300000036000000350000003b => 1111111100000041 (Good)
Starting entity immediately after invalid decimal entity: string(8) "�A" => string(4) "�A" (Good)
Starting entity immediately after invalid hex entity: string(9) "�A" => string(5) "
" (Good)
Starting entity immediately after too-big decimal entity: string(17) "�A" => string(13) "�A" (Good)
Regression test (entity which decodes to 0xFFFFFFFF): string(5) "" => string(1) "?" (Good)
Regression test (truncation of successive & with JIS encoding): string(3) "&&&" => string(3) "&&&" (Good)
Regression test (convmap entries are now treated as unsigned): string(4) "," => string(2) "?," (Good)