--TEST--
Exhaustive test of UTF-8 text encoding (DoCoMo, KDDI, SoftBank variants)
--EXTENSIONS--
mbstring
--SKIPIF--
--FILE--
<?php
// NOTE(review): in the reviewed copy everything between "--FILE--" and the first
// str_repeat() value below was missing (apparently lost to markup stripping).
// The opening tag, the two setup statements, the "$badUTF8 = array(" head and the
// first array key are a reconstruction -- confirm them against the upstream test.

// testValidString(), testInvalidString() and convertInvalidString() are called
// below but are not defined in this file.
// NOTE(review): assumed to come from encoding_tests.inc next to this test -- confirm.
require_once __DIR__ . '/encoding_tests.inc';

// Every expected result in this file shows '%' (0x25) where invalid input was replaced.
// NOTE(review): assumed the substitution character is configured here -- confirm.
mb_substitute_character(0x25);

// Malformed UTF-8 byte sequences, each mapped to the UCS-4BE output expected
// when the sequence is converted.
$badUTF8 = array(
  // Codepoints above 0x10FFFF
  // NOTE(review): this first key was rebuilt from the "CP 0x110000" comment -- confirm
  "\xF4\x90\x80\x80" => str_repeat("\x00\x00\x00%", 4), // CP 0x110000
  "\xF7\x80\x80\x80" => str_repeat("\x00\x00\x00%", 4), // CP 0x1C0000
  "\xF7\xBF\xBF\xBF" => str_repeat("\x00\x00\x00%", 4), // CP 0x1FFFFF

  // Reserved range for UTF-16 surrogate pairs
  "\xED\xA0\x80" => str_repeat("\x00\x00\x00%", 3), // CP 0xD800
  "\xED\xAF\xBF" => str_repeat("\x00\x00\x00%", 3), // CP 0xDBFF
  "\xED\xBF\xBF" => str_repeat("\x00\x00\x00%", 3), // CP 0xDFFF

  // Truncated characters
  "\xDF" => "\x00\x00\x00%", // should have been 2-byte
  "\xEF\xBF" => "\x00\x00\x00%", // should have been 3-byte
  "\xF0\xBF\xBF" => "\x00\x00\x00%", // should have been 4-byte
  "\xF1\x96" => "\x00\x00\x00%",
  "\xF1\x96\x80" => "\x00\x00\x00%",
  "\xF2\x94" => "\x00\x00\x00%",
  "\xF2\x94\x80" => "\x00\x00\x00%",
  "\xF3\x94" => "\x00\x00\x00%",
  "\xF3\x94\x80" => "\x00\x00\x00%",
  "\xE0\x9F" => "\x00\x00\x00%\x00\x00\x00%",
  "\xED\xA6" => "\x00\x00\x00%\x00\x00\x00%",

  // Multi-byte characters which end too soon and go to ASCII
  "\xDFA" => "\x00\x00\x00%\x00\x00\x00A",
  "\xEF\xBFA" => "\x00\x00\x00%\x00\x00\x00A",
  "\xF0\xBFA" => "\x00\x00\x00%\x00\x00\x00A",
  "\xF0\xBF\xBFA" => "\x00\x00\x00%\x00\x00\x00A",

  // Multi-byte characters which end too soon and go to another MB char
  "\xDF\xDF\xBF" => "\x00\x00\x00%\x00\x00\x07\xFF",
  "\xEF\xBF\xDF\xBF" => "\x00\x00\x00%\x00\x00\x07\xFF",
  "\xF0\xBF\xBF\xDF\xBF" => "\x00\x00\x00%\x00\x00\x07\xFF",

  // Continuation bytes which appear outside of a MB char
  "\x80" => "\x00\x00\x00%",
  "A\x80" => "\x00\x00\x00A\x00\x00\x00%",
  "\xDF\xBF\x80" => "\x00\x00\x07\xFF\x00\x00\x00%",

  // Overlong code units
  // (Using more bytes than needed to encode a character)
  "\xC1\xBF" => str_repeat("\x00\x00\x00%", 2), // didn't need 2 bytes
  "\xE0\x9F\xBF" => str_repeat("\x00\x00\x00%", 3), // didn't need 3 bytes
  "\xF0\x8F\xBF\xBF" => str_repeat("\x00\x00\x00%", 4) // didn't need 4 bytes
);

/**
 * Render an integer as a big-endian byte string using the fewest bytes
 * (one to four) that can hold it.
 *
 * @param int $value Value to render
 * @return string One to four bytes, most significant byte first
 */
function intToString($value) {
  if ($value <= 0xFF) {
    return chr($value);
  }
  if ($value <= 0xFFFF) {
    return pack('n', $value);
  }
  if ($value <= 0xFFFFFF) {
    // pack() has no 3-byte format: emit the top byte, then the low 16 bits
    return chr($value >> 16) . pack('n', $value & 0xFFFF);
  }
  return pack('N', $value);
}

/**
 * Load a conversion table file into lookup arrays.
 *
 * A line matching "0x<hex>\t0x<hex>" gives a codepoint followed by its encoded
 * byte value; both directions of that mapping are recorded. Otherwise, a line on
 * which only the leading "0x<hex>" of the pattern "0x<hex>\tBAD" can be parsed
 * records that codepoint as invalid. Any other line is ignored.
 *
 * @param string $path    Path of the table file
 * @param array  $from    Receives encoded bytes => codepoint (packed as 4 big-endian bytes)
 * @param array  $to      Receives codepoint (packed as 4 big-endian bytes) => encoded bytes
 * @param array  $invalid Receives codepoint (packed as 4 big-endian bytes) => true
 */
function readUTF8ConversionTable($path, &$from, &$to, &$invalid) {
  $from = array();
  $to = array();
  $invalid = array();

  // The table is only read, so do not request write access ('r+' fails on a
  // read-only source tree)
  $fp = fopen($path, 'r');
  if ($fp === false) {
    die("Unable to open conversion table " . $path);
  }

  // Compare against false explicitly so that a falsy line such as "0" does not
  // end the loop early
  while (($line = fgets($fp, 256)) !== false) {
    if (sscanf($line, "0x%x\t0x%x", $codepoint, $char) === 2) {
      $codepoint = pack('N', $codepoint);
      $char = intToString($char);
      $from[$char] = $codepoint;
      $to[$codepoint] = $char;
    } else if (sscanf($line, "0x%x\tBAD", $codepoint) === 1) {
      $codepoint = pack('N', $codepoint);
      $invalid[$codepoint] = true;
    }
  }

  fclose($fp);
}

/**
 * Run the full set of checks for one mobile UTF-8 variant and print
 * "<encoding> OK" once they have all been executed.
 *
 * @param string $encoding Encoding name passed to the conversion helpers
 * @param string $filename Table file path relative to this directory (with leading slash)
 */
function testUTF8Variant($encoding, $filename) {
  readUTF8ConversionTable(__DIR__ . $filename, $toUnicode, $fromUnicode, $invalidCodepoints);

  // Test some plain, vanilla codepoints (to/from mobile encoding)
  testValidString("\x00\x00", "\x00", "UTF-16BE", $encoding);

  // Random codepoints without a variant-specific mapping must convert exactly
  // as they do for standard UTF-8
  for ($i = 0; $i < 1000; $i++) {
    $cp = pack('N', rand(1, 0x10FFFF));
    if (isset($fromUnicode[$cp])) {
      continue;
    }
    if (mb_convert_encoding($cp, $encoding, 'UTF-32BE') !== mb_convert_encoding($cp, 'UTF-8', 'UTF-32BE')) {
      die("Expected U+" . bin2hex($cp) . " to be the same in UTF-8 and " . $encoding);
    }
  }

  if ($encoding === 'UTF-8-Mobile#DOCOMO') {
    // In Docomo Shift-JIS, we have mappings for U+FEE16 up to U+FEE25 and
    // then U+FEE29-U+FEE2B, U+FEE2D-U+FEE33
    // These correspond to sequential Docomo SJIS codes, but in the middle there is
    // one emoji which converts to U+25EA (SQUARE WITH LOWER RIGHT DIAGONAL HALF BLACK)
    // However, when converting Unicode to Docomo vendor-specific encodings, we still
    // accept U+FEE26 and convert it to the same SQUARE WITH LOWER RIGHT DIAGONAL HALF BLACK emoji
    // So our mapping for U+FEE26 is not reversible
    // In the table loaded above, that entry is keyed by the bytes EE 9B 80
    unset($toUnicode["\xEE\x9B\x80"]);
    // Similar for U+FEE27, U+FEE28, U+FEE2C
    unset($toUnicode["\xEE\x9B\x81"]);
    unset($toUnicode["\xEE\x9B\x82"]);
    unset($toUnicode["\xEE\x9B\x86"]);
  }

  // Test all characters which are different in mobile encoding (from standard UTF-8)
  foreach ($toUnicode as $char => $cp) {
    testValidString($char, $cp, $encoding, 'UCS-4BE', false);
  }
  foreach ($fromUnicode as $cp => $char) {
    testValidString($cp, $char, 'UCS-4BE', $encoding, false);
  }
  foreach ($invalidCodepoints as $cp => $_) {
    convertInvalidString($cp, '%', 'UCS-4BE', $encoding);
  }

  // Try malformed UTF-8 sequences
  global $badUTF8;
  foreach ($badUTF8 as $invalidText => $expectedResult) {
    testInvalidString($invalidText, $expectedResult, $encoding, 'UCS-4BE');
  }

  echo "$encoding OK\n";
}

testUTF8Variant('UTF-8-Mobile#DOCOMO', '/data/UTF-8-DOCOMO.txt');
testUTF8Variant('UTF-8-Mobile#KDDI-A', '/data/UTF-8-KDDI-A.txt');
testUTF8Variant('UTF-8-Mobile#KDDI-B', '/data/UTF-8-KDDI-B.txt');
testUTF8Variant('UTF-8-Mobile#SOFTBANK', '/data/UTF-8-SOFTBANK.txt');

?>
--EXPECT--
UTF-8-Mobile#DOCOMO OK
UTF-8-Mobile#KDDI-A OK
UTF-8-Mobile#KDDI-B OK
UTF-8-Mobile#SOFTBANK OK