--TEST--
Exhaustive test of UTF-8 text encoding (DoCoMo, KDDI, SoftBank variants)
--EXTENSIONS--
mbstring
--SKIPIF--
<?php
if (getenv("SKIP_SLOW_TESTS")) die("skip slow test");
?>
--FILE--
<?php
srand(855); /* Make results consistent */
include('encoding_tests.inc');
mb_substitute_character(0x25); // '%'

// Malformed UTF-8 input => expected UCS-4BE output, where every rejected
// byte/sequence is replaced by the substitute character '%' set above
$badUTF8 = array(
  // Codepoints outside of valid 0-0x10FFFF range for Unicode
  "\xF4\x90\x80\x80" => str_repeat("\x00\x00\x00%", 4), // CP 0x110000
  "\xF7\x80\x80\x80" => str_repeat("\x00\x00\x00%", 4), // CP 0x1C0000
  "\xF7\xBF\xBF\xBF" => str_repeat("\x00\x00\x00%", 4), // CP 0x1FFFFF

  // Reserved range for UTF-16 surrogate pairs
  "\xED\xA0\x80" => str_repeat("\x00\x00\x00%", 3), // CP 0xD800
  "\xED\xAF\xBF" => str_repeat("\x00\x00\x00%", 3), // CP 0xDBFF
  "\xED\xBF\xBF" => str_repeat("\x00\x00\x00%", 3), // CP 0xDFFF

  // Truncated characters
  "\xDF" => "\x00\x00\x00%", // should have been 2-byte
  "\xEF\xBF" => "\x00\x00\x00%", // should have been 3-byte
  "\xF0\xBF\xBF" => "\x00\x00\x00%", // should have been 4-byte
  "\xF1\x96" => "\x00\x00\x00%",
  "\xF1\x96\x80" => "\x00\x00\x00%",
  "\xF2\x94" => "\x00\x00\x00%",
  "\xF2\x94\x80" => "\x00\x00\x00%",
  "\xF3\x94" => "\x00\x00\x00%",
  "\xF3\x94\x80" => "\x00\x00\x00%",
  "\xE0\x9F" => "\x00\x00\x00%\x00\x00\x00%",
  "\xED\xA6" => "\x00\x00\x00%\x00\x00\x00%",

  // Multi-byte characters which end too soon and go to ASCII
  "\xDFA" => "\x00\x00\x00%\x00\x00\x00A",
  "\xEF\xBFA" => "\x00\x00\x00%\x00\x00\x00A",
  "\xF0\xBFA" => "\x00\x00\x00%\x00\x00\x00A",
  "\xF0\xBF\xBFA" => "\x00\x00\x00%\x00\x00\x00A",

  // Multi-byte characters which end too soon and go to another MB char
  "\xDF\xDF\xBF" => "\x00\x00\x00%\x00\x00\x07\xFF",
  "\xEF\xBF\xDF\xBF" => "\x00\x00\x00%\x00\x00\x07\xFF",
  "\xF0\xBF\xBF\xDF\xBF" => "\x00\x00\x00%\x00\x00\x07\xFF",

  // Continuation bytes which appear outside of a MB char
  "\x80" => "\x00\x00\x00%",
  "A\x80" => "\x00\x00\x00A\x00\x00\x00%",
  "\xDF\xBF\x80" => "\x00\x00\x07\xFF\x00\x00\x00%",

  // Overlong code units
  // (Using more bytes than needed to encode a character)
  "\xC1\xBF" => str_repeat("\x00\x00\x00%", 2), // didn't need 2 bytes
  "\xE0\x9F\xBF" => str_repeat("\x00\x00\x00%", 3), // didn't need 3 bytes
  "\xF0\x8F\xBF\xBF" => str_repeat("\x00\x00\x00%", 4) // didn't need 4 bytes
);

/**
 * Convert an integer to the shortest big-endian byte string that holds it
 * (1 byte up to 0xFF, 2 up to 0xFFFF, 3 up to 0xFFFFFF, otherwise 4).
 *
 * @param int $value Value to serialize
 * @return string Big-endian bytes of $value
 */
function intToString($value) {
  if ($value <= 0xFF)
    return chr($value);
  else if ($value <= 0xFFFF)
    return pack('n', $value);
  else if ($value <= 0xFFFFFF)
    return chr($value >> 16) . pack('n', $value & 0xFFFF);
  else
    return pack('N', $value);
}

/**
 * Load a conversion table from a text file.
 *
 * Lines of the form "0x<hex>\t0x<hex>" define a mapping: the first number is
 * packed as a 32-bit big-endian codepoint, the second is turned into the
 * encoded byte sequence with intToString(). Lines of the form "0x<hex>\tBAD"
 * mark a codepoint as invalid. All other lines are ignored.
 *
 * @param string $path    Path of the table file
 * @param array  $from    Out: encoded bytes => packed codepoint
 * @param array  $to      Out: packed codepoint => encoded bytes
 * @param array  $invalid Out: packed codepoint => true
 */
function readUTF8ConversionTable($path, &$from, &$to, &$invalid) {
  $from = array();
  $to = array();
  $invalid = array();

  // The table is only read, so do not demand write access to the fixture
  $fp = fopen($path, 'r');
  if ($fp === false)
    die("Could not open conversion table " . $path);

  // Compare against false explicitly so that only EOF/error ends the loop
  while (($line = fgets($fp, 256)) !== false) {
    if (sscanf($line, "0x%x\t0x%x", $codepoint, $char) == 2) {
      $codepoint = pack('N', $codepoint);
      $char = intToString($char);
      $from[$char] = $codepoint;
      $to[$codepoint] = $char;
    } else if (sscanf($line, "0x%x\tBAD", $codepoint) == 1) {
      $codepoint = pack('N', $codepoint);
      $invalid[$codepoint] = true;
    }
  }

  fclose($fp);
}

/**
 * Run the full set of checks for one mobile UTF-8 variant and print
 * "<encoding> OK" when done.
 *
 * The checks rely on testValidString(), convertInvalidString() and
 * testInvalidString(), which are not defined in this file (presumably they
 * come from encoding_tests.inc).
 *
 * @param string $encoding Name of the mbstring encoding under test
 * @param string $filename Table file path, relative to __DIR__ (with leading '/')
 */
function testUTF8Variant($encoding, $filename) {
  readUTF8ConversionTable(__DIR__ . $filename, $toUnicode, $fromUnicode, $invalidCodepoints);

  // Test some plain, vanilla codepoints (to/from mobile encoding)
  testValidString("\x00\x00", "\x00", "UTF-16BE", $encoding);

  // Random codepoints without a special mapping in the table must convert
  // exactly as they do for standard UTF-8
  for ($i = 0; $i < 1000; $i++) {
    $cp = pack('N', rand(1, 0x10FFFF));
    if (isset($fromUnicode[$cp]))
      continue;
    if (mb_convert_encoding($cp, $encoding, 'UTF-32BE') !== mb_convert_encoding($cp, 'UTF-8', 'UTF-32BE'))
      die("Expected U+" . bin2hex($cp) . " to be the same in UTF-8 and " . $encoding);
  }

  if ($encoding === 'UTF-8-Mobile#DOCOMO') {
    // In Docomo Shift-JIS, we have mappings for U+FEE16 up to U+FEE25 and
    // then U+FEE29-U+FEE2B, U+FEE2D-U+FEE33
    // These correspond to sequential Docomo SJIS codes, but in the middle there is
    // one emoji which converts to U+25EA (SQUARE WITH LOWER RIGHT DIAGONAL HALF BLACK)

    // However, when converting Unicode to Docomo vendor-specific encodings, we still
    // accept U+FEE26 and convert it to the same SQUARE WITH LOWER RIGHT DIAGONAL HALF BLACK emoji
    // So our mapping for U+FEE26 is not reversible
    // Encoded as UTF-8, that's EE9B80
    unset($toUnicode["\xEE\x9B\x80"]);
    // Similar for U+FEE27, U+FEE28, U+FEE2C
    unset($toUnicode["\xEE\x9B\x81"]);
    unset($toUnicode["\xEE\x9B\x82"]);
    unset($toUnicode["\xEE\x9B\x86"]);
  }

  // Test all characters which are different in mobile encoding (from standard UTF-8)
  foreach ($toUnicode as $char => $cp)
    testValidString($char, $cp, $encoding, 'UCS-4BE', false);
  foreach ($fromUnicode as $cp => $char)
    testValidString($cp, $char, 'UCS-4BE', $encoding, false);
  foreach ($invalidCodepoints as $cp => $_)
    convertInvalidString($cp, '%', 'UCS-4BE', $encoding);

  // Try malformed UTF-8 sequences
  global $badUTF8;
  foreach ($badUTF8 as $invalidText => $expectedResult)
    testInvalidString($invalidText, $expectedResult, $encoding, 'UCS-4BE');

  echo "$encoding OK\n";
}

testUTF8Variant('UTF-8-Mobile#DOCOMO', '/data/UTF-8-DOCOMO.txt');
testUTF8Variant('UTF-8-Mobile#KDDI-A', '/data/UTF-8-KDDI-A.txt');
testUTF8Variant('UTF-8-Mobile#KDDI-B', '/data/UTF-8-KDDI-B.txt');
testUTF8Variant('UTF-8-Mobile#SOFTBANK', '/data/UTF-8-SOFTBANK.txt');

?>
--EXPECT--
UTF-8-Mobile#DOCOMO OK
UTF-8-Mobile#KDDI-A OK
UTF-8-Mobile#KDDI-B OK
UTF-8-Mobile#SOFTBANK OK