--TEST--
Exhaustive test of UTF-8 text encoding (DoCoMo, KDDI, SoftBank variants)
--EXTENSIONS--
mbstring
--SKIPIF--
<?php
if (getenv("SKIP_SLOW_TESTS")) die("skip slow test");
?>
--FILE--
<?php
srand(855); /* Make results consistent */
include('encoding_tests.inc');
mb_substitute_character(0x25); // '%'

// Malformed UTF-8 input => expected UCS-4BE output.
// Each "\x00\x00\x00%" in an expected value is one substitution character.
$badUTF8 = [
    // Codepoints outside of valid 0-0x10FFFF range for Unicode
    "\xF4\x90\x80\x80" => str_repeat("\x00\x00\x00%", 4), // CP 0x110000
    "\xF7\x80\x80\x80" => str_repeat("\x00\x00\x00%", 4), // CP 0x1C0000
    "\xF7\xBF\xBF\xBF" => str_repeat("\x00\x00\x00%", 4), // CP 0x1FFFFF

    // Reserved range for UTF-16 surrogate pairs
    "\xED\xA0\x80" => str_repeat("\x00\x00\x00%", 3), // CP 0xD800
    "\xED\xAF\xBF" => str_repeat("\x00\x00\x00%", 3), // CP 0xDBFF
    "\xED\xBF\xBF" => str_repeat("\x00\x00\x00%", 3), // CP 0xDFFF

    // Truncated characters
    "\xDF" => "\x00\x00\x00%", // should have been 2-byte
    "\xEF\xBF" => "\x00\x00\x00%", // should have been 3-byte
    "\xF0\xBF\xBF" => "\x00\x00\x00%", // should have been 4-byte

    // Multi-byte characters which end too soon and go to ASCII
    "\xDFA" => "\x00\x00\x00%\x00\x00\x00A",
    "\xEF\xBFA" => "\x00\x00\x00%\x00\x00\x00A",
    "\xF0\xBFA" => "\x00\x00\x00%\x00\x00\x00A",
    "\xF0\xBF\xBFA" => "\x00\x00\x00%\x00\x00\x00A",

    // Multi-byte characters which end too soon and go to another MB char
    "\xDF\xDF\xBF" => "\x00\x00\x00%\x00\x00\x07\xFF",
    "\xEF\xBF\xDF\xBF" => "\x00\x00\x00%\x00\x00\x07\xFF",
    "\xF0\xBF\xBF\xDF\xBF" => "\x00\x00\x00%\x00\x00\x07\xFF",

    // Continuation bytes which appear outside of a MB char
    "\x80" => "\x00\x00\x00%",
    "A\x80" => "\x00\x00\x00A\x00\x00\x00%",
    "\xDF\xBF\x80" => "\x00\x00\x07\xFF\x00\x00\x00%",

    // Overlong code units
    // (Using more bytes than needed to encode a character)
    "\xC1\xBF" => str_repeat("\x00\x00\x00%", 2), // didn't need 2 bytes
    "\xE0\x9F\xBF" => str_repeat("\x00\x00\x00%", 3), // didn't need 3 bytes
    "\xF0\x8F\xBF\xBF" => str_repeat("\x00\x00\x00%", 4), // didn't need 4 bytes
];

/**
 * Pack an integer into the shortest big-endian byte string that holds it
 * (1 byte up to 0xFF, 2 up to 0xFFFF, 3 up to 0xFFFFFF, otherwise 4).
 *
 * @param int $value Integer to pack.
 * @return string Big-endian bytes, 1 to 4 bytes long.
 */
function intToString($value) {
    if ($value <= 0xFF) {
        return chr($value);
    }
    if ($value <= 0xFFFF) {
        return pack('n', $value);
    }
    if ($value <= 0xFFFFFF) {
        // No pack() format for 24-bit values: emit the top byte, then the low 16 bits
        return chr($value >> 16) . pack('n', $value & 0xFFFF);
    }
    return pack('N', $value);
}

/**
 * Load a conversion table from a text file.
 *
 * Lines of the form "0x<codepoint>\t0x<char>" describe a valid mapping;
 * lines of the form "0x<codepoint>\tBAD" mark a codepoint as unconvertible.
 * Lines matching neither pattern are ignored.
 *
 * @param string $path    Path of the table file.
 * @param array  $from    Out: packed char bytes => codepoint as 4-byte big-endian string.
 * @param array  $to      Out: codepoint as 4-byte big-endian string => packed char bytes.
 * @param array  $invalid Out: codepoint as 4-byte big-endian string => true.
 */
function readUTF8ConversionTable($path, &$from, &$to, &$invalid) {
    $from = [];
    $to = [];
    $invalid = [];

    // The table is only read, so do not ask for write access
    $fp = fopen($path, 'r');
    if ($fp === false) {
        die("Could not open conversion table: $path");
    }

    // Compare against false explicitly so that a falsy line such as "0" does not end the loop
    while (($line = fgets($fp, 256)) !== false) {
        if (sscanf($line, "0x%x\t0x%x", $codepoint, $char) === 2) {
            $codepoint = pack('N', $codepoint);
            $char = intToString($char);
            $from[$char] = $codepoint;
            $to[$codepoint] = $char;
        } elseif (sscanf($line, "0x%x\tBAD", $codepoint) === 1) {
            $codepoint = pack('N', $codepoint);
            $invalid[$codepoint] = true;
        }
    }

    fclose($fp);
}

/**
 * Run the conversion checks for one mobile UTF-8 variant and print
 * "<encoding> OK" once all of them have been executed.
 *
 * testValidString(), testInvalidString() and convertInvalidString() are not
 * defined in this file; presumably they come from encoding_tests.inc.
 *
 * @param string $encoding Encoding name as accepted by mbstring.
 * @param string $filename Table file path, appended directly to __DIR__
 *                         (so it must start with a slash).
 */
function testUTF8Variant($encoding, $filename) {
    readUTF8ConversionTable(__DIR__ . $filename, $toUnicode, $fromUnicode, $invalidCodepoints);

    // Test some plain, vanilla codepoints (to/from mobile encoding)
    testValidString("\x00\x00", "\x00", "UTF-16BE", $encoding);

    // Random codepoints without a special mapping must convert exactly as in standard UTF-8
    for ($i = 0; $i < 1000; $i++) {
        $cp = pack('N', rand(1, 0x10FFFF));
        if (isset($fromUnicode[$cp])) {
            continue;
        }
        if (mb_convert_encoding($cp, $encoding, 'UTF-32BE') !== mb_convert_encoding($cp, 'UTF-8', 'UTF-32BE')) {
            die("Expected U+" . bin2hex($cp) . " to be the same in UTF-8 and " . $encoding);
        }
    }

    if ($encoding === 'UTF-8-Mobile#DOCOMO') {
        // In Docomo Shift-JIS, we have mappings for U+FEE16 up to U+FEE25 and
        // then U+FEE29-U+FEE2B, U+FEE2D-U+FEE33
        // These correspond to sequential Docomo SJIS codes, but in the middle there is
        // one emoji which converts to U+25EA (SQUARE WITH LOWER RIGHT DIAGONAL HALF BLACK)

        // However, when converting Unicode to Docomo vendor-specific encodings, we still
        // accept U+FEE26 and convert it to the same SQUARE WITH LOWER RIGHT DIAGONAL HALF BLACK emoji
        // So our mapping for U+FEE26 is not reversible
        // Encoded as UTF-8, that's EE9B80
        unset($toUnicode["\xEE\x9B\x80"]);
        // Similar for U+FEE27, U+FEE28, U+FEE2C
        unset($toUnicode["\xEE\x9B\x81"]);
        unset($toUnicode["\xEE\x9B\x82"]);
        unset($toUnicode["\xEE\x9B\x86"]);
    }

    // Test all characters which are different in mobile encoding (from standard UTF-8)
    foreach ($toUnicode as $char => $cp) {
        testValidString($char, $cp, $encoding, 'UCS-4BE', false);
    }
    foreach ($fromUnicode as $cp => $char) {
        testValidString($cp, $char, 'UCS-4BE', $encoding, false);
    }
    foreach ($invalidCodepoints as $cp => $_) {
        convertInvalidString($cp, '%', 'UCS-4BE', $encoding);
    }

    // Try malformed UTF-8 sequences
    global $badUTF8;
    foreach ($badUTF8 as $invalidText => $expectedResult) {
        testInvalidString($invalidText, $expectedResult, $encoding, 'UCS-4BE');
    }

    echo "$encoding OK\n";
}

testUTF8Variant('UTF-8-Mobile#DOCOMO', '/data/UTF-8-DOCOMO.txt');
testUTF8Variant('UTF-8-Mobile#KDDI-A', '/data/UTF-8-KDDI-A.txt');
testUTF8Variant('UTF-8-Mobile#KDDI-B', '/data/UTF-8-KDDI-B.txt');
testUTF8Variant('UTF-8-Mobile#SOFTBANK', '/data/UTF-8-SOFTBANK.txt');

?>
--EXPECT--
UTF-8-Mobile#DOCOMO OK
UTF-8-Mobile#KDDI-A OK
UTF-8-Mobile#KDDI-B OK
UTF-8-Mobile#SOFTBANK OK