1--TEST--
2Exhaustive test of UTF-8 text encoding (DoCoMo, KDDI, SoftBank variants)
3--EXTENSIONS--
4mbstring
5--SKIPIF--
6<?php
7if (getenv("SKIP_SLOW_TESTS")) die("skip slow test");
8?>
9--FILE--
10<?php
// Seed the PRNG with a fixed value so the rand() calls below are repeatable
srand(855);
include 'encoding_tests.inc';
// Use '%' (0x25) as the substitute for unconvertible/invalid input
mb_substitute_character(0x25);
14
// Malformed UTF-8 byte sequences (keys) paired with the output expected when
// they are converted to UCS-4BE with '%' as the substitution character (values)
$badUTF8 = [
  // --- Codepoints above the Unicode maximum of 0x10FFFF ---
  "\xF4\x90\x80\x80" => str_repeat("\x00\x00\x00%", 4), // would be 0x110000
  "\xF7\x80\x80\x80" => str_repeat("\x00\x00\x00%", 4), // would be 0x1C0000
  "\xF7\xBF\xBF\xBF" => str_repeat("\x00\x00\x00%", 4), // would be 0x1FFFFF

  // --- Codepoints reserved for UTF-16 surrogates ---
  "\xED\xA0\x80" => str_repeat("\x00\x00\x00%", 3),     // would be 0xD800
  "\xED\xAF\xBF" => str_repeat("\x00\x00\x00%", 3),     // would be 0xDBFF
  "\xED\xBF\xBF" => str_repeat("\x00\x00\x00%", 3),     // would be 0xDFFF

  // --- Input ends in the middle of a character ---
  "\xDF" => "\x00\x00\x00%",         // lead byte of a 2-byte character
  "\xEF\xBF" => "\x00\x00\x00%",     // first 2 bytes of a 3-byte character
  "\xF0\xBF\xBF" => "\x00\x00\x00%", // first 3 bytes of a 4-byte character

  // --- Incomplete multi-byte character followed by an ASCII byte ---
  "\xDFA" => "\x00\x00\x00%\x00\x00\x00A",
  "\xEF\xBFA" => "\x00\x00\x00%\x00\x00\x00A",
  "\xF0\xBFA" => "\x00\x00\x00%\x00\x00\x00A",
  "\xF0\xBF\xBFA" => "\x00\x00\x00%\x00\x00\x00A",

  // --- Incomplete multi-byte character followed by a complete one ---
  "\xDF\xDF\xBF" => "\x00\x00\x00%\x00\x00\x07\xFF",
  "\xEF\xBF\xDF\xBF" => "\x00\x00\x00%\x00\x00\x07\xFF",
  "\xF0\xBF\xBF\xDF\xBF" => "\x00\x00\x00%\x00\x00\x07\xFF",

  // --- Stray continuation bytes not preceded by a lead byte ---
  "\x80" => "\x00\x00\x00%",
  "A\x80" => "\x00\x00\x00A\x00\x00\x00%",
  "\xDF\xBF\x80" => "\x00\x00\x07\xFF\x00\x00\x00%",

  // --- Overlong forms (more bytes used than the codepoint requires) ---
  "\xC1\xBF" => str_repeat("\x00\x00\x00%", 2),         // fits in 1 byte
  "\xE0\x9F\xBF" => str_repeat("\x00\x00\x00%", 3),     // fits in 2 bytes
  "\xF0\x8F\xBF\xBF" => str_repeat("\x00\x00\x00%", 4), // fits in 3 bytes
];
53
/**
 * Turn an integer into a big-endian byte string of 1 to 4 bytes,
 * choosing the length from the magnitude of the value.
 *
 * @param int $value Integer to serialize
 * @return string 1 byte up to 0xFF, 2 up to 0xFFFF, 3 up to 0xFFFFFF, else 4
 */
function intToString($value) {
  return match (true) {
    $value <= 0xFF     => chr($value),
    $value <= 0xFFFF   => pack('n', $value),
    // No pack() code for 24 bits: emit the top byte, then the low 16 bits
    $value <= 0xFFFFFF => chr($value >> 16) . pack('n', $value & 0xFFFF),
    default            => pack('N', $value),
  };
}
64
/**
 * Load a conversion table from a text file into lookup arrays.
 *
 * Two kinds of lines are recognized; anything else is ignored:
 *   "0x<codepoint>\t0x<bytes>"  a mapping between a codepoint and its encoded bytes
 *   "0x<codepoint>\tBAD"        a codepoint recorded as invalid
 *
 * Codepoints are stored as 4-byte big-endian strings (pack('N')); encoded
 * bytes are stored as the string produced by intToString().
 *
 * @param string $path    File to read
 * @param array  $from    Receives [encoded bytes => packed codepoint]
 * @param array  $to      Receives [packed codepoint => encoded bytes]
 * @param array  $invalid Receives [packed codepoint => true]
 * @throws RuntimeException if the file cannot be opened for reading
 */
function readUTF8ConversionTable($path, &$from, &$to, &$invalid) {
    $from = array();
    $to   = array();
    $invalid = array();

    // The table is only read, so open it read-only; 'r+' would needlessly
    // fail on read-only files or file systems
    $fp = fopen($path, 'r');
    if ($fp === false) {
        throw new RuntimeException("Unable to open conversion table: $path");
    }

    try {
        // Compare against false explicitly so a falsy line such as "0"
        // cannot end the loop before EOF
        while (($line = fgets($fp, 256)) !== false) {
            if (sscanf($line, "0x%x\t0x%x", $codepoint, $char) === 2) {
                $codepoint = pack('N', $codepoint);
                $char = intToString($char);
                $from[$char] = $codepoint;
                $to[$codepoint] = $char;
            } else if (sscanf($line, "0x%x\tBAD", $codepoint) === 1) {
                $codepoint = pack('N', $codepoint);
                $invalid[$codepoint] = true;
            }
        }
    } finally {
        // Always release the handle, even if parsing throws
        fclose($fp);
    }
}
83
/**
 * Run the full set of checks for one mobile UTF-8 variant and print
 * "<encoding> OK" once they have all passed.
 *
 * @param string $encoding Name of the mbstring encoding under test
 * @param string $filename Conversion table path, relative to this directory
 */
function testUTF8Variant($encoding, $filename) {
    global $badUTF8;

    $tablePath = __DIR__ . $filename;
    readUTF8ConversionTable($tablePath, $toUnicode, $fromUnicode, $invalidCodepoints);

    // A null character must convert into the mobile encoding unchanged
    testValidString("\x00\x00", "\x00", "UTF-16BE", $encoding);

    // Random sample: any codepoint without an entry in the table must encode
    // exactly as it does in standard UTF-8
    $samples = 1000;
    while ($samples-- > 0) {
      $cp = pack('N', rand(1, 0x10FFFF));
      if (!isset($fromUnicode[$cp])) {
        $inVariant  = mb_convert_encoding($cp, $encoding, 'UTF-32BE');
        $inStandard = mb_convert_encoding($cp, 'UTF-8', 'UTF-32BE');
        if ($inVariant !== $inStandard)
          die("Expected U+" . bin2hex($cp) . " to be the same in UTF-8 and " . $encoding);
      }
    }

    if ($encoding === 'UTF-8-Mobile#DOCOMO') {
      // Docomo Shift-JIS maps U+FEE16..U+FEE25, U+FEE29..U+FEE2B and
      // U+FEE2D..U+FEE33 to consecutive Docomo SJIS codes, except that one
      // emoji in the middle of that run converts to U+25EA (SQUARE WITH LOWER
      // RIGHT DIAGONAL HALF BLACK) instead.
      //
      // In the other direction, U+FEE26 is still accepted and is converted to
      // that same emoji, so the U+FEE26 mapping cannot be round-tripped.
      // The same holds for U+FEE27, U+FEE28 and U+FEE2C.
      // The affected characters are, as UTF-8 bytes: EE9B80, EE9B81, EE9B82, EE9B86
      $oneWayOnly = ["\xEE\x9B\x80", "\xEE\x9B\x81", "\xEE\x9B\x82", "\xEE\x9B\x86"];
      foreach ($oneWayOnly as $bytes) {
        unset($toUnicode[$bytes]);
      }
    }

    // Every character that differs from standard UTF-8: mobile encoding -> UCS-4BE
    foreach ($toUnicode as $char => $cp) {
      testValidString($char, $cp, $encoding, 'UCS-4BE', false);
    }
    // ... and the reverse direction: UCS-4BE -> mobile encoding
    foreach ($fromUnicode as $cp => $char) {
      testValidString($cp, $char, 'UCS-4BE', $encoding, false);
    }
    // Codepoints marked BAD in the table must come out as the '%' substitute
    foreach ($invalidCodepoints as $cp => $_) {
      convertInvalidString($cp, '%', 'UCS-4BE', $encoding);
    }

    // Malformed UTF-8 byte sequences
    foreach ($badUTF8 as $invalidText => $expectedResult) {
      testInvalidString($invalidText, $expectedResult, $encoding, 'UCS-4BE');
    }

    echo "$encoding OK\n";
}
130
131testUTF8Variant('UTF-8-Mobile#DOCOMO', '/data/UTF-8-DOCOMO.txt');
132testUTF8Variant('UTF-8-Mobile#KDDI-A', '/data/UTF-8-KDDI-A.txt');
133testUTF8Variant('UTF-8-Mobile#KDDI-B', '/data/UTF-8-KDDI-B.txt');
134testUTF8Variant('UTF-8-Mobile#SOFTBANK', '/data/UTF-8-SOFTBANK.txt');
135
136?>
137--EXPECT--
138UTF-8-Mobile#DOCOMO OK
139UTF-8-Mobile#KDDI-A OK
140UTF-8-Mobile#KDDI-B OK
141UTF-8-Mobile#SOFTBANK OK
142