1--TEST--
2Exhaustive test of ISO-2022-JP-KDDI text encoding
3--EXTENSIONS--
4mbstring
5--SKIPIF--
6<?php
7if (getenv("SKIP_SLOW_TESTS")) die("skip slow test");
8?>
9--FILE--
10<?php
/* Seed the generator with a fixed value so every run sees the same sequence */
srand(390);
include 'encoding_tests.inc';
/* 0x25 is '%'; the expected strings below use it as the substitution marker */
mb_substitute_character(0x25);
14
function shiftJISDecode($bytes) {
  /* Translate a 2-byte Shift-JIS code (CP932's default representation) into
   * its kuten code, returned as a 16-bit integer */
  $lead = ($bytes >> 8) & 0xFF;
  $trail = $bytes & 0xFF;

  /* Lead bytes come in two runs (from 0x81 and from 0xE0); fold them into a
   * single index. Each lead byte accounts for two consecutive rows. */
  $base = ($lead > 0x9F) ? (0xE0 - 31) : 0x81;
  $rowPair = ($lead - $base) << 1;

  /* Trail bytes above 0x9E select the second row of the pair */
  if ($trail > 0x9E) {
    return (($rowPair + 0x22) << 8) + ($trail - 0x9F + 0x21);
  }

  /* In the first row, trail bytes from 0x80 upward continue at cell offset 63 */
  $cell = ($trail > 0x7F) ? ($trail - 0x80 + 63) : ($trail - 0x40);
  return (($rowPair + 0x21) << 8) + ($cell + 0x21);
}
29
/* Read in the table of all characters in CP932
 * The table is only ever read, so it is opened read-only: 'r+' would also
 * require write permission and break the test on a read-only source tree */
$cp932Chars = array(); /* packed kuten code (2 bytes) -> UTF-32BE string */
$fp = fopen(realpath(__DIR__ . '/data/CP932.txt'), 'r');
while (($line = fgets($fp, 256)) !== false) {
  /* Lines starting with '#' are comments */
  if ($line[0] == '#')
    continue;

  if (sscanf($line, "0x%x\t0x%x", $bytes, $codepoint) == 2) {
    /* Skip single-byte entries; only 2-byte codes are stored in this table */
    if ($bytes < 256)
      continue;
    /* For ISO-2022-JP-KDDI, we only accept the first range of Microsoft
     * vendor extensions, in ku 13 */
    if ($bytes > 0xEAA4)
      continue;
    $cp932Chars[pack('n', shiftJISDecode($bytes))] = pack('N', $codepoint);
  }
}
fclose($fp);
47
/* Add KDDI-specific emoji to the CP932 characters
 * Here they are stored 22 ku (or 'rows') below the places where they are
 * mapped in the Shift-JIS representation of KDDI emoji
 * The data file is only read, so it is opened read-only rather than 'r+' */
$fp = fopen(realpath(__DIR__ . '/data/EmojiSources.txt'), 'r');
while (($line = fgets($fp, 256)) !== false) {
  /* Lines starting with '#' are comments */
  if ($line[0] == '#')
    continue;
  $fields = explode(';', rtrim($line));
  if (count($fields) >= 4) {
    /* First field: either two space-separated hex codepoints or a single one */
    if (sscanf($fields[0], "%x %x", $cp1, $cp2) == 2)
      $utf32 = pack('N', $cp1) . pack('N', $cp2);
    else
      $utf32 = pack('N', hexdec($fields[0]));

    /* Third field: hex Shift-JIS code for the emoji (may be empty) */
    if ($fields[2]) {
      $kuten = shiftJISDecode(hexdec($fields[2]));
      /* NOTE(review): $kuten >> 8 is the high byte produced by shiftJISDecode
       * (always >= 0x21), not a zero-based row number -- confirm that the
       * 106..112 bounds below select the intended rows */
      $ku = $kuten >> 8;
      if ($ku >= 106 && $ku <= 112)
        $cp932Chars[pack('n', $kuten - (22 * 0x100))] = $utf32;
    }
  }
}
fclose($fp);
70
/* Some CP932 characters are duplicate mappings of the same character; collect
 * them (keyed by packed kuten code) so they can be tested in one direction only */
$duplicateSjisCodes = [0x8790, 0x8791, 0x8792, 0x8795, 0x8796, 0x8797, 0x879A, 0x879B, 0x879C];
$nonInvertible = array();
foreach ($duplicateSjisCodes as $sjisCode) {
  $kutenKey = pack('n', shiftJISDecode($sjisCode));
  $nonInvertible[$kutenKey] = $cp932Chars[$kutenKey];
}
77
/* Read in table of all characters in JISX-0201 charset
 * The table is only read, so it is opened read-only: 'r+' would also require
 * write permission and break the test on a read-only source tree */
$jisx0201Chars = array(); /* JISX0201 -> UTF-32BE */
$fp = fopen(realpath(__DIR__ . '/data/JISX0201.txt'), 'r');
while (($line = fgets($fp, 256)) !== false) {
  /* Lines starting with '#' are comments */
  if ($line[0] == '#')
    continue;

  if (sscanf($line, "0x%x\t0x%x", $byte, $codepoint) == 2)
    $jisx0201Chars[chr($byte)] = pack('N', $codepoint);
}
fclose($fp);
88
/* Check that $from is a valid $encoding string which converts to the UTF-32BE
 * string $to. Unless $bothWays is false, also check that converting $to back
 * from UTF-32BE yields $from (after normalizing its escape sequences). */
function testValid($from, $to, $encoding, $bothWays = true) {
  identifyValidString($from, $encoding);
  convertValidString($from, $to, $encoding, 'UTF-32BE', false);

  if (!$bothWays)
    return;

  /* ASCII mode is the default, so a leading ESC ( B is redundant and is not
   * expected in the converted output */
  if (strncmp($from, "\x1B(B", 3) === 0)
    $from = substr($from, 3);

  /* A string which switches to any other charset is expected to switch back
   * to ASCII at the end */
  foreach (["\x1B\$B", "\x1B(I", "\x1B\$@", "\x1B\$(B"] as $charsetSwitch) {
    if (strpos($from, $charsetSwitch) !== false) {
      $from .= "\x1B(B";
      break;
    }
  }

  convertValidString($to, $from, 'UTF-32BE', $encoding, false);
}
105
/* Shorthand for testInvalidString() with UTF-32BE as the target encoding */
function testInvalid($from, $to, $encoding) {
  $targetEncoding = 'UTF-32BE';
  testInvalidString($from, $to, $encoding, $targetEncoding);
}
109
/* Every 7-bit byte except ESC maps to the same codepoint, whether it appears
 * bare, after ESC ( B, or after ESC ( J */
foreach (range(0x00, 0x7F) as $code) {
  if ($code == 0x1B)
    continue;
  $char = chr($code);
  $expected = "\x00\x00\x00" . $char;
  testValid($char, $expected, 'ISO-2022-JP-KDDI');
  foreach (["\x1B(B", "\x1B(J"] as $escape)
    testValid($escape . $char, $expected, 'ISO-2022-JP-KDDI', false);
}

/* Bytes with the high bit set are invalid in those same three contexts, apart
 * from 0xA1-0xDF, which we convert as JIS X 0201 kana */
foreach (range(0x80, 0xFF) as $code) {
  if ($code >= 0xA1 && $code <= 0xDF)
    continue;
  $char = chr($code);
  foreach (['', "\x1B(B", "\x1B(J"] as $escape)
    testInvalid($escape . $char, "\x00\x00\x00%", 'ISO-2022-JP-KDDI');
}

echo "ASCII support OK\n";
127
/* JIS X 0201 kana are the table entries with a 1 in the high bit
 * Each is tested both as a 7-bit byte after ESC ( I and as a bare 8-bit byte */
foreach ($jisx0201Chars as $jisx0201 => $utf32BE) {
  $code = ord($jisx0201);
  if ($code < 128)
    continue;
  testValid("\x1B(I" . chr($code - 128), $utf32BE, 'ISO-2022-JP-KDDI', false);
  testValid($jisx0201, $utf32BE, 'ISO-2022-JP-KDDI', false);
}

/* 8-bit bytes outside 0xA1-0xDF are invalid after ESC ( I and after ESC ( J */
foreach (range(0x80, 0xFF) as $code) {
  if ($code >= 0xA1 && $code <= 0xDF)
    continue;
  foreach (["\x1B(I", "\x1B(J"] as $escape)
    testInvalid($escape . chr($code), "\x00\x00\x00%", 'ISO-2022-JP-KDDI');
}

echo "JIS X 0201 support OK\n";
146
/* Start from the 2-byte characters, then add the single bytes: we allow
 * ASCII/JIS X 0201 characters to appear even in JIS X 0208 mode */
$validChars = $cp932Chars;
foreach (range(0x00, 0x7F) as $asciiByte)
  $validChars[chr($asciiByte)] = chr($asciiByte);
foreach (range(0xA1, 0xDF) as $kanaByte)
  $validChars[chr($kanaByte)] = $jisx0201Chars[chr($kanaByte)];

/* First bytes 0xE0-0xFC and 0x81-0x9F are each listed with a length of 2 */
$twoByteLeads = array_merge(range(0xE0, 0xFC), range(0x81, 0x9F));
$lenTable = array_fill_keys($twoByteLeads, 2);
findInvalidChars($validChars, $invalidChars, $truncatedChars, $lenTable);
155
/* The duplicate mappings are tested separately below, so take them out of the
 * main table first */
foreach (array_keys($nonInvertible) as $duplicateKey)
  unset($cp932Chars[$duplicateKey]);

/* Test the remaining characters in both directions, in randomly ordered
 * groups of 5-10 characters (fewer for the final group) */
$good = array_keys($cp932Chars);
shuffle($good);
while (count($good) > 0) {
  $from = '';
  $to = '';
  for ($remaining = min(rand(5,10), count($good)); $remaining > 0; $remaining--) {
    $goodChar = array_pop($good);
    $from .= $goodChar;
    $to .= $cp932Chars[$goodChar];
  }
  testValid("\x1B\$B" . $from, $to, 'ISO-2022-JP-KDDI');
}

/* Same again for the duplicate mappings, but only converting to Unicode */
$good = array_keys($nonInvertible);
shuffle($good);
while (count($good) > 0) {
  $from = '';
  $to = '';
  for ($remaining = min(rand(5,10), count($good)); $remaining > 0; $remaining--) {
    $goodChar = array_pop($good);
    $from .= $goodChar;
    $to .= $nonInvertible[$goodChar];
  }
  testValid("\x1B\$B" . $from, $to, 'ISO-2022-JP-KDDI', false);
}
184
foreach (array_keys($invalidChars) as $invalid) {
  $leadByte = ord($invalid[0]);
  $leadIsRejected = ($leadByte > 0x80 && $leadByte < 0xA0) || $leadByte >= 0xE0;
  /* When the first byte of a 2-byte character is itself rejected, it results in % being
   * sent to the output and then the second byte does something else. It is easier to
   * write the test if we only check with the 1st byte in that case. */
  $input = $leadIsRejected ? $invalid[0] : $invalid;
  testInvalidString("\x1B\$B" . $input, "\x00\x00\x00%", 'ISO-2022-JP-KDDI', 'UTF-32BE');
}
// Try Kanji which starts with a good byte, but the 2nd byte is junk
testInvalidString("\x1B\$B\x21\xFF", "%", 'ISO-2022-JP-KDDI', 'UTF-8');

/* Every truncated character is expected to give a single error marker */
foreach (array_keys($truncatedChars) as $truncated) {
  testInvalidString("\x1B\$B" . $truncated, "\x00\x00\x00%", 'ISO-2022-JP-KDDI', 'UTF-32BE');
}
201
/* Individual conversions, each checked in one direction only:
 * [input, expected output, source encoding, target encoding] */
$singleCases = [
  // Japan flag emoji
  ["\x1B\$B\x76\x27", "\x00\x01\xF1\xEF\x00\x01\xF1\xF5", 'ISO-2022-JP-KDDI', 'UTF-32BE'],
  // Phone key emoji
  ["\x00#\x20\xE3", "\x1B\$B\x71\x69\x1B(B", 'UTF-16BE', 'ISO-2022-JP-KDDI'],
  // Try ESC $ ( B escape sequence
  ["\x1B\$(B\x21\x21", "\x30\x00", 'ISO-2022-JP-KDDI', 'UTF-16BE'],
  // Switch from JISX 0208 Kanji to ASCII
  ["\x30\x00\x00A", "\x1B\$B\x21\x21\x1B(BA", "UTF-16BE", "ISO-2022-JP-KDDI"],
  // Switch from JISX 0208 Kanji to JISX 0201 Kana
  ["\x30\x00\xFF\x67", "\x1B\$B\x21\x21\x1B(I'\x1B(B", "UTF-16BE", "ISO-2022-JP-KDDI"],
  /* Convert Unicode flag emoji to ISO-2022-JP-KDDI proprietary flag emoji
   * I am not able to confirm that the kuten codes we are using for these proprietary emoji are the right ones
   * (There doesn't seem to be any publicly available reference, and I don't have a legacy KDDI device)
   *
   * However, the conversion does not work in the opposite direction; this is because of the test
   * `if (s >= (84 * 94) && s < (91 * 94))`, which the kuten code which we are using for flag emoji doesn't match
   * That test is inherited from the old implementation (from libmbfl), and I have no way to confirm that
   * changing it won't break anything */
  ["\x00\x01\xF1\xF0\x00\x01\xF1\xF7", "\x1B\$B\x70\x55\x1B(B", "UTF-32BE", "ISO-2022-JP-KDDI"],
];
foreach ($singleCases as [$input, $expected, $sourceEncoding, $targetEncoding])
  testValidString($input, $expected, $sourceEncoding, $targetEncoding, false);

echo "JIS X 0208 (with MS extensions) and KDDI emoji support OK\n";
223
/* Further one-way mappings from Unicode: U+00A5, U+203E and U+FF5E (given as
 * UTF-16BE) each convert to a single 2-byte character, shown here without
 * the surrounding escape sequences */
$extraMappings = [
  "\x00\xA5" => "!o",
  "\x20\x3E" => "!1",
  "\xFF\x5E" => "!A",
];
foreach ($extraMappings as $utf16 => $twoByteChar)
  testValidString($utf16, "\x1B\$B" . $twoByteChar . "\x1B(B", "UTF-16BE", "ISO-2022-JP-KDDI", false);

echo "Other mappings from Unicode -> ISO-2022-JP-KDDI OK\n";
229
testInvalidString("\x1B\$B\x7F\x7E", "%", 'ISO-2022-JP-KDDI', 'UTF-8');

// Test "long" illegal character markers
mb_substitute_character("long");
$badInputs = [
  "\xE0",
  // Invalid escapes:
  "\x1B",
  "\x1B.",
  "\x1B\$",
  "\x1B\$.",
  "\x1B(",
  "\x1B(.",
  "\x1B\$(X",
  // 0x9F does not start any 2-byte character
  "\x1B\$B\x9F",
];
foreach ($badInputs as $badInput)
  convertInvalidString($badInput, "%", "ISO-2022-JP-KDDI", "UTF-8");
/* In the other direction, U+E000 is expected to come out as the marker "U+E000" */
convertInvalidString("\xE0\x00", "U+E000", "UTF-16BE", "ISO-2022-JP-KDDI");

echo "Done!\n";
247?>
248--EXPECT--
249ASCII support OK
250JIS X 0201 support OK
251JIS X 0208 (with MS extensions) and KDDI emoji support OK
252Other mappings from Unicode -> ISO-2022-JP-KDDI OK
253Done!
254