1--TEST--
2Exhaustive test of UTF-8 text encoding (DoCoMo, KDDI, SoftBank variants)
3--EXTENSIONS--
4mbstring
5--SKIPIF--
6<?php
7if (getenv("SKIP_SLOW_TESTS")) die("skip slow test");
8?>
9--FILE--
10<?php
// Fixed seed, so the random codepoint sample below is the same on every run
srand(855);
// Shared test helpers; presumably this is where testValidString(),
// testInvalidString() and convertInvalidString() come from
include 'encoding_tests.inc';
// Use '%' (0x25) as the substitute for anything that cannot be converted
mb_substitute_character(0x25);
14
// Malformed UTF-8 byte sequences mapped to the UCS-4BE output expected for each.
// Every "\x00\x00\x00%" in a value is one substitute character ('%').
$badUTF8 = [
  // Values above U+10FFFF, the highest codepoint Unicode allows
  "\xF4\x90\x80\x80" => str_repeat("\x00\x00\x00%", 4), // CP 0x110000
  "\xF7\x80\x80\x80" => str_repeat("\x00\x00\x00%", 4), // CP 0x1C0000
  "\xF7\xBF\xBF\xBF" => str_repeat("\x00\x00\x00%", 4), // CP 0x1FFFFF

  // Codepoints set aside for UTF-16 surrogates
  "\xED\xA0\x80" => str_repeat("\x00\x00\x00%", 3),     // CP 0xD800
  "\xED\xAF\xBF" => str_repeat("\x00\x00\x00%", 3),     // CP 0xDBFF
  "\xED\xBF\xBF" => str_repeat("\x00\x00\x00%", 3),     // CP 0xDFFF

  // Sequences cut off before their final byte
  "\xDF" => "\x00\x00\x00%",         // lead byte of a 2-byte character
  "\xEF\xBF" => "\x00\x00\x00%",     // first 2 bytes of a 3-byte character
  "\xF0\xBF\xBF" => "\x00\x00\x00%", // first 3 bytes of a 4-byte character
  "\xF1\x96" => "\x00\x00\x00%",
  "\xF1\x96\x80" => "\x00\x00\x00%",
  "\xF2\x94" => "\x00\x00\x00%",
  "\xF2\x94\x80" => "\x00\x00\x00%",
  "\xF3\x94" => "\x00\x00\x00%",
  "\xF3\x94\x80" => "\x00\x00\x00%",
  "\xE0\x9F" => str_repeat("\x00\x00\x00%", 2),
  "\xED\xA6" => str_repeat("\x00\x00\x00%", 2),

  // Incomplete multi-byte character followed directly by ASCII
  "\xDFA" => "\x00\x00\x00%\x00\x00\x00A",
  "\xEF\xBFA" => "\x00\x00\x00%\x00\x00\x00A",
  "\xF0\xBFA" => "\x00\x00\x00%\x00\x00\x00A",
  "\xF0\xBF\xBFA" => "\x00\x00\x00%\x00\x00\x00A",

  // Incomplete multi-byte character followed directly by a complete one
  "\xDF\xDF\xBF" => "\x00\x00\x00%\x00\x00\x07\xFF",
  "\xEF\xBF\xDF\xBF" => "\x00\x00\x00%\x00\x00\x07\xFF",
  "\xF0\xBF\xBF\xDF\xBF" => "\x00\x00\x00%\x00\x00\x07\xFF",

  // Stray continuation bytes with no lead byte before them
  "\x80" => "\x00\x00\x00%",
  "A\x80" => "\x00\x00\x00A\x00\x00\x00%",
  "\xDF\xBF\x80" => "\x00\x00\x07\xFF\x00\x00\x00%",

  // Overlong forms (more bytes than the encoded value requires)
  "\xC1\xBF" => str_repeat("\x00\x00\x00%", 2),         // fits in 1 byte
  "\xE0\x9F\xBF" => str_repeat("\x00\x00\x00%", 3),     // fits in 2 bytes
  "\xF0\x8F\xBF\xBF" => str_repeat("\x00\x00\x00%", 4), // fits in 3 bytes
];
61
/**
 * Render an integer as the shortest big-endian byte string that holds it.
 *
 * Values up to 0xFF give 1 byte, up to 0xFFFF give 2 bytes, up to 0xFFFFFF
 * give 3 bytes, and anything larger gives 4 bytes.
 */
function intToString($value) {
  if ($value <= 0xFF) {
    return chr($value);
  }

  if ($value <= 0xFFFF) {
    return pack('n', $value);
  }

  if ($value <= 0xFFFFFF) {
    // Pack as 32 bits, then drop the leading byte (always zero in this range)
    return substr(pack('N', $value), 1);
  }

  return pack('N', $value);
}
72
/**
 * Load a conversion table file into lookup arrays.
 *
 * Two kinds of line are recognised; anything else is ignored:
 *   "0x<codepoint>\t0x<char>"  - a mapping between a codepoint and an encoded character
 *   "0x<codepoint>\tBAD"       - a codepoint flagged as invalid
 *
 * Codepoints are stored as 4-byte big-endian strings (pack('N')); encoded
 * characters are stored as the byte string produced by intToString().
 *
 * @param string $path     Path of the table file to read
 * @param array  &$from    Receives encoded character => codepoint
 * @param array  &$to      Receives codepoint => encoded character
 * @param array  &$invalid Receives codepoint => true for every "BAD" line
 * @throws RuntimeException if the table file cannot be opened
 */
function readUTF8ConversionTable($path, &$from, &$to, &$invalid) {
    $from = array();
    $to   = array();
    $invalid = array();

    // The table is only ever read, so open it read-only; 'r+' would fail on
    // a read-only checkout for no benefit
    $fp = fopen($path, 'r');
    if ($fp === false) {
        throw new RuntimeException("Could not open conversion table: $path");
    }

    // Compare against false explicitly so that a line which happens to be
    // falsy (e.g. a final "0" without newline) does not end the loop early
    while (($line = fgets($fp, 256)) !== false) {
        if (sscanf($line, "0x%x\t0x%x", $codepoint, $char) === 2) {
            $codepoint = pack('N', $codepoint);
            $char = intToString($char);
            $from[$char] = $codepoint;
            $to[$codepoint] = $char;
        } else if (sscanf($line, "0x%x\tBAD", $codepoint) === 1) {
            $codepoint = pack('N', $codepoint);
            $invalid[$codepoint] = true;
        }
    }

    fclose($fp);
}
91
/**
 * Run every check for one mobile UTF-8 variant and print "<encoding> OK".
 *
 * $encoding is the mbstring encoding name under test. $filename is the
 * variant's conversion table, given as a path appended to this test's
 * directory (so it starts with a slash).
 */
function testUTF8Variant($encoding, $filename) {
    global $badUTF8;

    readUTF8ConversionTable(__DIR__ . $filename, $toUnicode, $fromUnicode, $invalidCodepoints);

    // Start with a plain, unremarkable codepoint (to/from the mobile encoding)
    testValidString("\x00\x00", "\x00", "UTF-16BE", $encoding);

    // Sample random codepoints; those without an entry in the table must
    // come out byte-for-byte the same as in standard UTF-8
    for ($round = 0; $round < 1000; $round++) {
        $cp = pack('N', rand(1, 0x10FFFF));
        if (!isset($fromUnicode[$cp])) {
            $asVariant = mb_convert_encoding($cp, $encoding, 'UTF-32BE');
            $asStandard = mb_convert_encoding($cp, 'UTF-8', 'UTF-32BE');
            if ($asVariant !== $asStandard) {
                die("Expected U+" . bin2hex($cp) . " to be the same in UTF-8 and " . $encoding);
            }
        }
    }

    if ($encoding === 'UTF-8-Mobile#DOCOMO') {
        // Docomo Shift-JIS has mappings for U+FEE16 through U+FEE25, then
        // U+FEE29-U+FEE2B and U+FEE2D-U+FEE33. Those match consecutive Docomo
        // SJIS codes, except that one emoji in the middle of the run converts
        // to U+25EA (SQUARE WITH LOWER RIGHT DIAGONAL HALF BLACK) instead.
        //
        // Going the other way (Unicode to a Docomo vendor-specific encoding),
        // U+FEE26 is still accepted and becomes that same emoji, which makes
        // the U+FEE26 mapping one-way only. Its table entry is keyed by the
        // bytes EE 9B 80; the same applies to U+FEE27, U+FEE28 and U+FEE2C
        // (the remaining three keys), so all four are dropped from the
        // character-to-Unicode checks.
        foreach (["\xEE\x9B\x80", "\xEE\x9B\x81", "\xEE\x9B\x82", "\xEE\x9B\x86"] as $oneWayChar) {
            unset($toUnicode[$oneWayChar]);
        }
    }

    // Check every character on which the mobile encoding differs from standard UTF-8
    foreach ($toUnicode as $char => $cp) {
        testValidString($char, $cp, $encoding, 'UCS-4BE', false);
    }
    foreach ($fromUnicode as $cp => $char) {
        testValidString($cp, $char, 'UCS-4BE', $encoding, false);
    }
    foreach ($invalidCodepoints as $cp => $_) {
        convertInvalidString($cp, '%', 'UCS-4BE', $encoding);
    }

    // Feed in the malformed UTF-8 sequences
    foreach ($badUTF8 as $invalidText => $expectedResult) {
        testInvalidString($invalidText, $expectedResult, $encoding, 'UCS-4BE');
    }

    echo "$encoding OK\n";
}
138
// Each mobile variant paired with its conversion table, run in this order
$variantTables = [
    'UTF-8-Mobile#DOCOMO'   => '/data/UTF-8-DOCOMO.txt',
    'UTF-8-Mobile#KDDI-A'   => '/data/UTF-8-KDDI-A.txt',
    'UTF-8-Mobile#KDDI-B'   => '/data/UTF-8-KDDI-B.txt',
    'UTF-8-Mobile#SOFTBANK' => '/data/UTF-8-SOFTBANK.txt',
];
foreach ($variantTables as $variantName => $tablePath) {
    testUTF8Variant($variantName, $tablePath);
}
143
144?>
145--EXPECT--
146UTF-8-Mobile#DOCOMO OK
147UTF-8-Mobile#KDDI-A OK
148UTF-8-Mobile#KDDI-B OK
149UTF-8-Mobile#SOFTBANK OK
150