--TEST-- Test mb_decode_numericentity() function : Convert HTML entities to text --EXTENSIONS-- mbstring --FILE-- ", varDumpToString($result); if ($result === $expected) echo " (Good)\n"; else echo " (BAD; expected ", varDumpToString($expected), ")\n"; } function testNonAscii($desc, $str, $expected, $convmap, $encoding) { $result = mb_decode_numericentity($str, $convmap, $encoding); echo $desc, ": ", bin2hex($str), " => ", bin2hex($result); if ($result === $expected) echo " (Good)\n"; else echo " (BAD; expected ", bin2hex($expected), ")\n"; } $str1 = '¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ'; $str2 = 'ƒΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩαβγδεζηθικλμνξοπρςστυφχψωϑϒϖ•…′″‾⁄℘ℑℜ™ℵ←↑→↓↔↵⇐⇑⇒⇓⇔∀∂∃∅∇∈∉∋∏∑−∗√∝∞∠∧∨∩∪∫∴∼≅≈≠≡≤≥⊂⊃⊄⊆⊇⊕⊗⊥⋅⌈⌉⌊⌋〈〉◊♠♣♥♦'; $str3 = 'aŒbœcŠdše€fg'; // Typical convmap, typical numeric entity-encoded string $convmap = array(0x0, 0x2FFFF, 0, 0xFFFF); echo "1: " . mb_decode_numericentity($str1, $convmap, "UTF-8") . "\n"; echo "2: " . mb_decode_numericentity($str2, $convmap, "UTF-8") . "\n"; echo "3: " . mb_decode_numericentity($str3, $convmap, "UTF-8") . "\n"; // Numeric entities which are truncated at end of string echo "4: " . mb_decode_numericentity('�', $convmap), "\n"; // Entity is too big echo "5: " . mb_decode_numericentity('�', $convmap), "\n"; // Entity is too big echo "6: " . mb_decode_numericentity('�', $convmap), "\n"; // Too many digits echo "7: " . mb_decode_numericentity('�', $convmap), "\n"; // Too many digits echo "8: " . mb_decode_numericentity('�', $convmap), "\n"; // Too many digits echo "9: " . mb_decode_numericentity('�', $convmap), "\n"; // Too many digits echo "10: " . bin2hex(mb_decode_numericentity('�', $convmap)), "\n"; // OK echo "11: " . bin2hex(mb_decode_numericentity('�', $convmap)), "\n"; // OK // Try with hex, not just decimal entities echo "11b: " . bin2hex(mb_decode_numericentity('�', $convmap)), "\n"; // OK echo "11c: " . bin2hex(mb_decode_numericentity('�', $convmap)), "\n"; // OK echo "11d: " . bin2hex(mb_decode_numericentity('𐀀', $convmap)), "\n"; // OK echo "11e: " . mb_decode_numericentity('�', $convmap), "\n"; // Too many digits // Large decimal entity, converting from non-ASCII input encoding echo "12: " . bin2hex(mb_decode_numericentity(mb_convert_encoding('�', 'UCS-4', 'ASCII'), [0, 0x7FFFFFFF, 0, 0x7FFFFFFF], 'UCS-4')), "\n"; $convmap = []; echo "13: " . mb_decode_numericentity('föo', $convmap, "UTF-8") . "\n"; $convmap = array(0x0, 0x2FFFF, 0); // 3 elements try { echo "14: " . mb_decode_numericentity($str3, $convmap, "UTF-8") . "\n"; } catch (ValueError $ex) { echo "14: " . $ex->getMessage()."\n"; } echo "15: " . bin2hex(mb_decode_numericentity('�', [0, 1, 0, 0xFFFF], 'UTF-8')) . "\n"; echo "16: " . bin2hex(mb_decode_numericentity('�', [0, 1, 0, 0xFFFF], 'UTF-8')) . "\n"; // Weird convmap $convmap = [ 0, 0, 0, 0, // Only one codepoint, empty mask 100, 50, 0, 0xFFFF // 'End' codepoint is before 'start' codepoint ]; echo "17: " . mb_decode_numericentity('föo', $convmap, "UTF-8") . "\n"; // Convmap with positive offset $convmap = [0, 10, 1000, 0xFFFF]; echo "18: " . bin2hex(mb_decode_numericentity("ϩϪϫ", $convmap, "UTF-8")) . "\n"; echo "19: " . bin2hex(mb_decode_numericentity("ϩϪϫ", $convmap, "UTF-8")) . "\n"; echo "20: " . mb_decode_numericentity("{a;", [0, 0xFFFF, 0, 0xFFFF]) . "\n"; test("10 digits for decimal entity", "A", "A", [0, 0xFFFF, 0, 0xFFFF], 'ASCII'); test("More than 10 digits for decimal entity", "¥", "¥", [0, 0xFFFF, 0, 0xFFFF], 'ASCII'); test("8 digits for hex entity", "A", "A", [0, 0xFFFF, 0, 0xFFFF], 'ASCII'); test("More than 8 digits for hex entity", "Ł", "Ł", [0, 0xFFFF, 0, 0xFFFF], 'ASCII'); test("Single &", "&", "&", [0, 0xFFFF, 0, 0xFFFF], "ASCII"); // An entity can come right after a preceding ampersand test("Successive &", "&A,", "&A,", [0, 0xFFFF, 0, 0xFFFF], 'ASCII'); // An entity can come right after a preceding &# test("Successive &#", "&#2", "", [0, 0xFFFF, 0, 0xFFFF], 'ASCII'); test("Successive &#x", "&#x2", "", [0, 0xFFFF, 0, 0xFFFF], 'ASCII'); test("&#x only", "&#x;", "&#x;", [0, 0xFFFF, 0, 0xFFFF], 'ASCII'); // The starting & of an entity can terminate a preceding entity test("Successive A", "AA", "AA", [0, 0xFFFF, 0, 0xFFFF], 'ASCII'); test("Successive hex entities", "22", "22", [0, 0xFFFF, 0, 0xFFFF], 'ASCII'); // An entity can come right after an entity which is invalid because of being too long test("Starting entity immediately after decimal entity which is too long", "�A", "�A", [0, 0xFFFF, 0, 0xFFFF], 'ASCII'); test("Starting entity immediately after hex entity which is too long", "�A", "�", [0, 0xFFFF, 0, 0xFFFF], 'ASCII'); $ucs4_test1 = mb_convert_encoding("�A", 'UCS-4BE', 'ASCII'); testNonAscii("Starting entity immediately after valid decimal entity which is just within maximum length", $ucs4_test1, "\x3B\x9A\xCA\x00\x00\x00\x00A", [0, 0xFFFFFFFF, 0, 0xFFFFFFFF], 'UCS-4BE'); $ucs4_test2 = mb_convert_encoding("�A", 'UCS-4BE', 'ASCII'); testNonAscii("Starting entity immediately after valid hex entity which is just within maximum length", $ucs4_test2, "\x11\x11\x11\x11\x00\x00\x00A", [0, 0xFFFFFFFF, 0, 0xFFFFFFFF], 'UCS-4BE'); test("Starting entity immediately after invalid decimal entity", "�A", "�A", [0x1, 0xFFFF, 0, 0xFFFF], 'ASCII'); test("Starting entity immediately after invalid hex entity", "�A", " ", [0x1, 0xFFFF, 0, 0xFFFF], 'ASCII'); test("Starting entity immediately after too-big decimal entity", "�A", "�A", [0, 0xFFFFFFFF, 0, 0xFFFFFFFF], 'ASCII'); // If the numeric entity decodes to 0xFFFFFFFF, that should be passed through // Originally, the new implementation of mb_decode_numericentity used -1 as a marker indicating // that the entity could not be successfully decoded, so if the entity decoded successfully to // 0xFFFFFFFF (-1), it would be treated as an invalid entity test("Regression test (entity which decodes to 0xFFFFFFFF)", "", "?", [0xFFFFFF86, 0xFFFFFFFF, 0xF, 0xFC015448], 'HZ'); // With the legacy conversion filters, a trailing & could be truncated by mb_decode_numericentity, // because some text encodings did not properly invoke the next flush function in the chain test("Regression test (truncation of successive & with JIS encoding)", "&&&", "&&&", [0x20FF37FF, 0x7202F569, 0xC4090023, 0xF160], "JIS"); // Previously, signed arithmetic was used on convmap entries test("Regression test (convmap entries are now treated as unsigned)", ",", "?,", [0x22FFFF11, 0xBF111189, 0x67726511, 0x1161E719], "ASCII"); // Try with '&', '&#', or '&#' at the end of a buffer of wchars, with more input // still left to process in the next buffer // (mb_decode_numericentity splits its input into 'chunks' and processes it one // chunk at a time) $convmap = [0, 0xFFFF, 0, 0xFFFF]; for ($i = 0; $i < 256; $i++) { $padding = str_repeat("a", $i); // First try invalid decimal/hex entities if (mb_decode_numericentity($padding . "&#ZZZ", $convmap, 'UTF-8') !== $padding . "&#ZZZ") die("&#ZZZ is broken when it spans two buffers!"); if (mb_decode_numericentity($padding . "&#xZZZ", $convmap, 'UTF-8') !== $padding . "&#xZZZ") die("&#xZZZ is broken when it spans two buffers!"); // Now try valid decimal/hex entities if (mb_decode_numericentity($padding . "A", $convmap, 'UTF-8') !== $padding . "A") die("A is broken when it spans two buffers!"); if (mb_decode_numericentity($padding . "A", $convmap, 'UTF-8') !== $padding . "A") die("A is broken when it spans two buffers!"); } // Try huge entities, big enough to fill an entire buffer for ($i = 12; $i < 256; $i++) { $str = "&#" . str_repeat("0", $i) . "65"; if (mb_decode_numericentity($str, $convmap, 'UTF-8') !== $str) die("Decimal entity with huge number of digits broken"); $str = "&#x" . str_repeat("0", $i) . "41"; if (mb_decode_numericentity($str, $convmap, 'UTF-8') !== $str) die("Hexadecimal entity with huge number of digits broken"); } ?> --EXPECT-- 1: ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ 2: ƒΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩαβγδεζηθικλμνξοπρςστυφχψωϑϒϖ•…′″‾⁄℘ℑℜ™ℵ←↑→↓↔↵⇐⇑⇒⇓⇔∀∂∃∅∇∈∉∋∏∑−∗√∝∞∠∧∨∩∪∫∴∼≅≈≠≡≤≥⊂⊃⊄⊆⊇⊕⊗⊥⋅⌈⌉⌊⌋〈〉◊♠♣♥♦ 3: aŒbœcŠdše€fg 4: � 5: � 6: � 7: � 8: � 9: � 10: 00 11: 00 11b: 00 11c: 00 11d: f0908080 11e: � 12: 00bc614e 13: föo 14: mb_decode_numericentity(): Argument #2 ($map) must have a multiple of 4 elements 15: 00 16: 00 17: föo 18: 010203 19: 010203 20: {a; 10 digits for decimal entity: string(13) "A" => string(1) "A" (Good) More than 10 digits for decimal entity: string(14) "¥" => string(14) "¥" (Good) 8 digits for hex entity: string(12) "A" => string(1) "A" (Good) More than 8 digits for hex entity: string(13) "Ł" => string(13) "Ł" (Good) Single &: string(1) "&" => string(1) "&" (Good) Successive &: string(6) "&A," => string(3) "&A," (Good) Successive &#: string(8) "&#2" => string(3) "" (Good) Successive &#x: string(9) "&#x2" => string(4) "" (Good) &#x only: string(4) "&#x;" => string(4) "&#x;" (Good) Successive A: string(9) "AA" => string(2) "AA" (Good) Successive hex entities: string(11) "22" => string(2) "22" (Good) Starting entity immediately after decimal entity which is too long: string(18) "�A" => string(14) "�A" (Good) Starting entity immediately after hex entity which is too long: string(17) "�A" => string(13) "�" (Good) Starting entity immediately after valid decimal entity which is just within maximum length: 000000260000002300000031000000300000003000000030000000300000003000000030000000300000003000000030000000260000002300000036000000350000003b => 3b9aca0000000041 (Good) Starting entity immediately after valid hex entity which is just within maximum length: 0000002600000023000000780000003100000031000000310000003100000031000000310000003100000031000000260000002300000036000000350000003b => 1111111100000041 (Good) Starting entity immediately after invalid decimal entity: string(8) "�A" => string(4) "�A" (Good) Starting entity immediately after invalid hex entity: string(9) "�A" => string(5) " " (Good) Starting entity immediately after too-big decimal entity: string(17) "�A" => string(13) "�A" (Good) Regression test (entity which decodes to 0xFFFFFFFF): string(5) "" => string(1) "?" (Good) Regression test (truncation of successive & with JIS encoding): string(3) "&&&" => string(3) "&&&" (Good) Regression test (convmap entries are now treated as unsigned): string(4) "," => string(2) "?," (Good)