1--TEST--
2Exhaustive test of ISO-2022-JP-MS text encoding
3--EXTENSIONS--
4mbstring
5--SKIPIF--
6<?php
/* Let the environment opt out of this slow, exhaustive test */
if (getenv("SKIP_SLOW_TESTS")) {
  die("skip slow test");
}
8?>
9--FILE--
10<?php
/* Fixed seed: shuffle() and rand() below then produce the same sequence on every run */
srand(444);
include 'encoding_tests.inc';
/* Use '%' (0x25) as the substitute character for unconvertible input */
mb_substitute_character(0x25);
14
function shiftJISDecode($bytes) {
  /* Translate CP932's default Shift-JIS representation (a 16-bit integer)
   * into the corresponding kuten code (also a 16-bit integer) */
  $lead = ($bytes >> 8) & 0xFF;
  $trail = $bytes & 0xFF;

  /* Lead bytes above 0x9F are rebased so that 0xE0 continues at index 31;
   * all others are counted from 0x81 */
  $base = ($lead > 0x9F) ? (0xE0 - 31) : 0x81;
  /* Every lead byte spans two consecutive rows */
  $rowPair = ($lead - $base) << 1;

  /* The trail byte picks which of the two rows is meant, and the cell within it */
  if ($trail > 0x9E) {
    $row = $rowPair + 0x22;
    $cell = $trail - 0x9F + 0x21;
  } elseif ($trail > 0x7F) {
    $row = $rowPair + 0x21;
    $cell = $trail - 0x80 + 63 + 0x21;
  } else {
    $row = $rowPair + 0x21;
    $cell = $trail - 0x40 + 0x21;
  }

  return ($row << 8) + $cell;
}
29
/* Read in the table of all characters in CP932 */
$cp932Chars = array(); /* kuten code (packed as 2 big-endian bytes) -> UTF-32BE string */
/* The table is only ever read, so open it read-only; 'r+' would needlessly
 * require write permission on the data file */
$fp = fopen(realpath(__DIR__ . '/data/CP932.txt'), 'r');
/* Compare against false explicitly so that only EOF or an error ends the loop */
while (($line = fgets($fp, 256)) !== false) {
  /* Skip comment lines */
  if ($line[0] == '#')
    continue;

  /* Data lines carry two tab-separated hex values: byte sequence, then codepoint */
  if (sscanf($line, "0x%x\t0x%x", $bytes, $codepoint) == 2) {
    /* Skip single-byte entries (values below 256) */
    if ($bytes < 256)
      continue;
    /* ISO-2022-JP-MS only uses the first two ranges of MS vendor extensions */
    if ($bytes >= 0xFA00)
      continue;
    $cp932Chars[pack('n', shiftJISDecode($bytes))] = pack('N', $codepoint);
  }
}
/* Release the handle; the variable is reused for another file later on */
fclose($fp);
46
/* Windows-932 has many cases where two different kuten codes map to the same
 * Unicode codepoints. The Shift-JIS codes listed here are moved into their
 * own table, keyed by kuten bytes like $cp932Chars. */
$duplicateSjisCodes = [
  0x8790, 0x8791, 0x8792, 0x8795, 0x8796,
  0x8797, 0x879A, 0x879B, 0x879C, 0xEEF9,
];
foreach ($duplicateSjisCodes as $sjis) {
  $kuten = pack('n', shiftJISDecode($sjis));
  $nonInvertible[$kuten] = $cp932Chars[$kuten];
}
53
/* Add User Defined codes (which use ESC $ ( ? escape sequence))
 * 20 rows of 94 cells each are mapped onto consecutive codepoints from U+E000 */
$udcChars = array();
$privateUse = 0xE000;
for ($row = 0; $row < 20; $row++) {
  for ($cell = 0; $cell < 94; $cell++) {
    /* Both row and cell are offset by 0x21 */
    $jis = (($row + 0x21) << 8) + ($cell + 0x21);
    $udcChars[pack('n', $jis)] = pack('N', $privateUse);
    $privateUse++;
  }
}
61
/* Read in table of all characters in JISX-0201 charset */
$jisx0201Chars = array(); /* JISX0201 byte -> UTF-32BE string */
/* The table is only ever read, so open it read-only; 'r+' would needlessly
 * require write permission on the data file */
$fp = fopen(realpath(__DIR__ . '/data/JISX0201.txt'), 'r');
/* Compare against false explicitly so that only EOF or an error ends the loop */
while (($line = fgets($fp, 256)) !== false) {
  /* Skip comment lines */
  if ($line[0] == '#')
    continue;

  /* Data lines carry two tab-separated hex values: byte, then codepoint */
  if (sscanf($line, "0x%x\t0x%x", $byte, $codepoint) == 2)
    $jisx0201Chars[chr($byte)] = pack('N', $codepoint);
}
fclose($fp);
72
/* Check that $from is accepted as $encoding and converts to the UTF-32BE
 * string $to. Unless $bothWays is false, also check the reverse conversion
 * from $to back to a normalized form of $from. */
function testValid($from, $to, $encoding, $bothWays = true) {
  identifyValidString($from, $encoding);
  convertValidString($from, $to, $encoding, 'UTF-32BE', false);

  if (!$bothWays)
    return;

  /* ESC ( B at the beginning is redundant, since ASCII mode is the default */
  if (substr($from, 0, 3) === "\x1B(B")
    $from = substr($from, 3);

  /* If the string switches to a different charset, it should switch back to
   * ASCII at the end */
  $charsetSwitches = ["\x1B\$B", "\x1B(I", "\x1B\$@", "\x1B\$(B", "\x1B\$(@", "\x1B\$(?"];
  $leavesAscii = false;
  foreach ($charsetSwitches as $escape) {
    if (strpos($from, $escape) !== false) {
      $leavesAscii = true;
      break;
    }
  }
  if ($leavesAscii)
    $from .= "\x1B(B";

  convertValidString($to, $from, 'UTF-32BE', $encoding, false);
}
89
/* Shorthand for testInvalidString() with 'UTF-32BE' fixed as the last argument */
function testInvalid($from, $to, $encoding) {
  $other = 'UTF-32BE';
  testInvalidString($from, $to, $encoding, $other);
}
93
/* Every 7-bit byte except ESC (0x1B) must pass through unchanged, with or
 * without an explicit ESC ( B or ESC ( J prefix */
foreach (range(0, 0x7F) as $code) {
  if ($code == 0x1B)
    continue;
  $char = chr($code);
  $utf32 = "\x00\x00\x00" . $char;
  testValid($char, $utf32, 'ISO-2022-JP-MS');
  testValid("\x1B(B" . $char, $utf32, 'ISO-2022-JP-MS', false);
  testValid("\x1B(J" . $char, $utf32, 'ISO-2022-JP-MS', false);
}

/* Bytes with the high bit set are invalid, apart from 0xA1-0xDF */
foreach (range(0x80, 0xFF) as $code) {
  // We convert single bytes from 0xA1-0xDF as JIS X 0201 kana
  if ($code < 0xA1 || $code > 0xDF) {
    $char = chr($code);
    testInvalid($char, "\x00\x00\x00%", 'ISO-2022-JP-MS');
    testInvalid("\x1B(B" . $char, "\x00\x00\x00%", 'ISO-2022-JP-MS');
    testInvalid("\x1B(J" . $char, "\x00\x00\x00%", 'ISO-2022-JP-MS');
  }
}

echo "ASCII support OK\n";
111
/* All valid JIS X 0201 characters
 * Those with a 1 in the high bit are JIS X 0201 kana; each is tested both as
 * a 7-bit byte after ESC ( I and as the bare 8-bit byte */
foreach ($jisx0201Chars as $jisx0201 => $utf32BE) {
  $code = ord($jisx0201);
  if ($code < 128)
    continue;
  $sevenBit = chr($code - 128);
  testValid("\x1B(I" . $sevenBit, $utf32BE, 'ISO-2022-JP-MS', false);
  testValid($jisx0201, $utf32BE, 'ISO-2022-JP-MS', false);
}

/* High-bit bytes outside 0xA1-0xDF are invalid after ESC ( I and ESC ( J */
foreach (range(0x80, 0xFF) as $code) {
  if ($code < 0xA1 || $code > 0xDF) {
    $char = chr($code);
    testInvalid("\x1B(I" . $char, "\x00\x00\x00%", 'ISO-2022-JP-MS');
    testInvalid("\x1B(J" . $char, "\x00\x00\x00%", 'ISO-2022-JP-MS');
  }
}

echo "JIS X 0201 support OK\n";
130
/* Run testValid() over every key of $validChars (source bytes => expected
 * UTF-32BE output), in shuffled order and concatenated into strings of
 * 5-10 characters (fewer for the final leftover), each preceded by $prefix */
function testAllValidCharsWithPrefix($validChars, $prefix, $bothWays) {
  $remaining = array_keys($validChars);
  shuffle($remaining);

  while (count($remaining) > 0) {
    $take = min(rand(5,10), count($remaining));
    $input = '';
    $expected = '';
    for ($n = 0; $n < $take; $n++) {
      $char = array_pop($remaining);
      $input .= $char;
      $expected .= $validChars[$char];
    }
    testValid($prefix . $input, $expected, 'ISO-2022-JP-MS', $bothWays);
  }
}
145
$validChars = $cp932Chars;
/* We allow ASCII/JIS X 0201 characters to appear even in JIS X 0208 mode */
foreach (range(0, 0x7F) as $code) {
  $validChars[chr($code)] = chr($code);
}
foreach (range(0xA1, 0xDF) as $code) {
  $validChars[chr($code)] = $jisx0201Chars[chr($code)];
}
/* First bytes 0xE0-0xFC and 0x81-0x9F are given length 2 in the table
 * handed to findInvalidChars() */
$highLeadBytes = array_fill_keys(range(0xE0, 0xFC), 2);
$lowLeadBytes = array_fill_keys(range(0x81, 0x9F), 2);
$lenTable = $highLeadBytes + $lowLeadBytes;
findInvalidChars($validChars, $invalidChars, $truncatedChars, $lenTable);

/* Non-invertible characters are tested separately, in one direction only */
foreach (array_keys($nonInvertible) as $kuten) {
  unset($cp932Chars[$kuten]);
}

testAllValidCharsWithPrefix($cp932Chars, "\x1B\$B", true);
testAllValidCharsWithPrefix($nonInvertible, "\x1B\$B", false);
160
foreach (array_keys($invalidChars) as $invalid) {
  $firstByte = ord($invalid[0]);
  /* The first byte of this 2-byte character will be rejected and result in % being sent
   * to the output. Then the second byte will do something else. It is easier to write the
   * test if we only check with the 1st byte. */
  $firstByteRejected = ($firstByte > 0x80 && $firstByte < 0xA0) || $firstByte >= 0xE0;
  $input = $firstByteRejected ? $invalid[0] : $invalid;
  testInvalidString("\x1B\$B" . $input, "\x00\x00\x00%", 'ISO-2022-JP-MS', 'UTF-32BE');
}
foreach (array_keys($truncatedChars) as $truncated) {
  testInvalidString("\x1B\$B" . $truncated, "\x00\x00\x00%", 'ISO-2022-JP-MS', 'UTF-32BE');
}

echo "JIS X 0208 (with MS extensions) support OK\n";
176
/* Same procedure as for JIS X 0208, but for the user-defined characters
 * selected with ESC $ ( ? */
$validChars = $udcChars;
foreach (range(0, 0x7F) as $code) {
  $validChars[chr($code)] = chr($code);
}
foreach (range(0xA1, 0xDF) as $code) {
  $validChars[chr($code)] = $jisx0201Chars[chr($code)];
}
$udcLenTable = array_fill_keys(range(0x21, 0x7F), 2);
findInvalidChars($validChars, $invalidChars, $truncatedChars, $udcLenTable);

testAllValidCharsWithPrefix($udcChars, "\x1B\$(?", true);

foreach (array_keys($invalidChars) as $invalid) {
  $firstByte = ord(substr($invalid, 0, 1));
  /* For these first bytes, only the first byte is fed to the converter */
  $firstByteOnly = ($firstByte > 0x80 && $firstByte < 0xA0) || $firstByte >= 0xE0;
  $input = $firstByteOnly ? $invalid[0] : $invalid;
  testInvalidString("\x1B\$(?" . $input, "\x00\x00\x00%", 'ISO-2022-JP-MS', 'UTF-32BE');
}
foreach (array_keys($truncatedChars) as $truncated) {
  testInvalidString("\x1B\$(?" . $truncated, "\x00\x00\x00%", 'ISO-2022-JP-MS', 'UTF-32BE');
}

echo "UDC support OK\n";
198
/* One-way mappings: UTF-16BE input => expected ISO-2022-JP-MS output */
$unicodeToJis = [
  ["\x00\xA5", "\x1B\$B!o\x1B(B"],
  ["\x20\x3E", "\x1B\$B!1\x1B(B"],
  ["\xFF\x5E", "\x1B\$B!A\x1B(B"],
];
foreach ($unicodeToJis as [$utf16, $jis]) {
  testValidString($utf16, $jis, "UTF-16BE", "ISO-2022-JP-MS", false);
}

echo "Other mappings from Unicode -> ISO-2022-JP-MS OK\n";

// Alternative escape sequences for 2-byte characters
foreach (["\x1B\$(B", "\x1B\$(@"] as $escape) {
  testValidString($escape . "\x21\x21", "\x30\x00", "ISO-2022-JP-MS", "UTF-16BE", false);
}

// Switching between different character types
$mixedUtf16 = "\x00a\x00b\x00c\xFF\x61\x00a\x00b\x00c";
$mixedJis = "abc\x1B(I\x21\x1B(Babc";
testValidString($mixedUtf16, $mixedJis, "UTF-16BE", "ISO-2022-JP-MS", false);
211
// Test "long" illegal character markers
mb_substitute_character("long");
$malformedInputs = [
  "\xE0",
  // Invalid escapes:
  "\x1B",
  "\x1B.",
  "\x1B\$",
  "\x1B\$.",
  "\x1B(",
  "\x1B(.",
  "\x1B\$(",
  "\x1B\$(X",
  "\x1B\$B\x9F", // 0x9F does not start any 2-byte character
];
foreach ($malformedInputs as $malformed) {
  convertInvalidString($malformed, "%", "ISO-2022-JP-MS", "UTF-8");
}

echo "Done!\n";
227?>
228--EXPECT--
229ASCII support OK
230JIS X 0201 support OK
231JIS X 0208 (with MS extensions) support OK
232UDC support OK
233Other mappings from Unicode -> ISO-2022-JP-MS OK
234Done!
235