1--TEST--
2Exhaustive test of CP50220, CP50221, and CP50222 encodings
3--EXTENSIONS--
4mbstring
5--SKIPIF--
6<?php
7if (getenv("SKIP_SLOW_TESTS")) die("skip slow test");
8?>
9--FILE--
10<?php
/* Pull in the shared conversion-test helpers (the helper functions called
 * below are not defined in this file, so presumably they live here) */
include 'encoding_tests.inc';
/* Use '%' (0x25) as the substitute character for unconvertible input */
mb_substitute_character(ord('%'));
13
/**
 * Translate a two-byte CP932 (Shift-JIS style) code into a JIS kuten code.
 *
 * A Shift-JIS lead byte only carries the upper 7 bits of the row number,
 * because 94 lead bytes were not available. The trail byte falls into one of
 * two ranges of 94 values, and which range it is supplies the row's low bit.
 *
 * @param int $bytes Two-byte code: lead byte in bits 8-15, trail byte in bits 0-7
 * @return int Kuten code: row byte in bits 8-15, cell byte in bits 0-7
 */
function shiftJISDecode($bytes) {
  $lead = ($bytes >> 8) & 0xFF;
  $trail = $bytes & 0xFF;

  /* Lead bytes above 0x9F are rebased so that 0xE0 becomes row pair 31;
   * all others are rebased so that 0x81 becomes row pair 0 */
  $pair = ($lead > 0x9F) ? $lead - (0xE0 - 31) : $lead - 0x81;
  $row = ($pair << 1) + 0x21;

  if ($trail > 0x9E) {
    /* Upper trail range: second row of the pair */
    return (($row + 1) << 8) + ($trail - 0x9F + 0x21);
  }

  /* Lower trail range: first row of the pair. Trail bytes from 0x80 up are
   * numbered from cell 63, i.e. 0x7F does not take up a cell of its own */
  $cell = ($trail > 0x7F) ? $trail - 0x80 + 63 : $trail - 0x40;
  return ($row << 8) + ($cell + 0x21);
}
33
/* Read in table of all characters in CP932 charset
 *
 * Each data line is expected to look like "0x<CP932 bytes>\t0x<codepoint>";
 * lines starting with '#' are comments. When several CP932 codes map to the
 * same codepoint, only the first one seen goes in $cp932Chars (tested in both
 * directions below); the others go in $nonInvertible (tested for decoding only). */
$cp932Chars = []; /* kuten code -> UTF-16BE */
$nonInvertible = []; /* kuten code -> UTF-16BE, duplicates of a codepoint already seen */
$fromUnicode = []; /* codepoint -> first CP932 code seen for it */

/* The table is only ever read, so open it read-only. Mode 'r+' would also
 * demand write permission and fail when the source tree is read-only. */
$fp = fopen(__DIR__ . '/data/CP932.txt', 'r');
if ($fp === false)
  die("Could not open data/CP932.txt");

/* Compare against false explicitly so that a falsy line (e.g. a final "0"
 * without a newline) cannot end the loop early */
while (($line = fgets($fp, 256)) !== false) {
  if ($line[0] == '#')
    continue;

  if (sscanf($line, "0x%x\t0x%x", $bytes, $codepoint) == 2) {
    /* Single-byte entries are skipped; only two-byte codes are converted to kuten */
    if ($bytes < 256)
      continue;

    $kutenBytes = pack('n', shiftJISDecode($bytes));
    if (isset($fromUnicode[$codepoint])) {
      $nonInvertible[$kutenBytes] = pack('n', $codepoint);
    } else {
      $cp932Chars[$kutenBytes] = pack('n', $codepoint);
      $fromUnicode[$codepoint] = $bytes;
    }
  }
}
fclose($fp);
56
/* Besides the characters in that table, a 'user' area is supported as well:
 * lead bytes 0xF0-0xF9 with trail bytes 0x40-0xFC (0x7F excluded), assigned in
 * order to the Unicode 'private' codepoints 0xE000-0xE757 */
$codepoint = 0xE000;
for ($lead = 0xF0; $lead <= 0xF9; $lead++) {
  for ($trail = 0x40; $trail <= 0xFC; $trail++) {
    if ($trail != 0x7F) {
      $cp932Chars[pack('n', shiftJISDecode(($lead << 8) | $trail))] = pack('n', $codepoint++);
    }
  }
}
68
/* Read in table of all characters in JISX-0201 charset
 *
 * Each data line is expected to look like "0x<byte>\t0x<codepoint>";
 * lines starting with '#' are comments. */
$jisx0201Chars = []; /* JISX0201 byte -> UTF-16BE */

/* The table is only ever read, so open it read-only. Mode 'r+' would also
 * demand write permission and fail when the source tree is read-only. */
$fp = fopen(__DIR__ . '/data/JISX0201.txt', 'r');
if ($fp === false)
  die("Could not open data/JISX0201.txt");

/* Compare against false explicitly so that a falsy line cannot end the loop early */
while (($line = fgets($fp, 256)) !== false) {
  if ($line[0] == '#')
    continue;

  if (sscanf($line, "0x%x\t0x%x", $byte, $codepoint) == 2)
    $jisx0201Chars[chr($byte)] = pack('n', $codepoint);
}
fclose($fp);
79
/* Read in table of all characters in JISX-0212 charset
 *
 * Each data line is expected to look like "0x<two bytes>\t0x<codepoint>";
 * lines starting with '#' are comments. */
$jisx0212Chars = []; /* JISX0212 two-byte code -> UTF-16BE */

/* The table is only ever read, so open it read-only. Mode 'r+' would also
 * demand write permission and fail when the source tree is read-only. */
$fp = fopen(__DIR__ . '/data/JISX0212.txt', 'r');
if ($fp === false)
  die("Could not open data/JISX0212.txt");

/* Compare against false explicitly so that a falsy line cannot end the loop early */
while (($line = fgets($fp, 256)) !== false) {
  if ($line[0] == '#')
    continue;

  if (sscanf($line, "0x%x\t0x%x", $bytes, $codepoint) == 2) {
    $jisx0212Chars[pack('n', $bytes)] = pack('n', $codepoint);
  }
}
fclose($fp);
91
/* When the CP932 charset is selected, our CP5022x <-> Unicode conversions
 * deviate from the Unicode Consortium's table for the kuten codes below;
 * patch the expected values accordingly */
foreach ([
  "\x21\x41" => "\x30\x1C", /* WAVE DASH instead of FULLWIDTH TILDE */
  "\x21\x42" => "\x20\x16", /* DOUBLE VERTICAL LINE instead of PARALLEL TO */
  "\x21\x5D" => "\x22\x12", /* MINUS SIGN instead of FULLWIDTH HYPHEN-MINUS */
  "\x21\x71" => "\x00\xA2", /* CENT SIGN instead of FULLWIDTH CENT SIGN */
  "\x21\x72" => "\x00\xA3", /* POUND SIGN instead of FULLWIDTH POUND SIGN */
  "\x22\x4C" => "\x00\xAC", /* NOT SIGN instead of FULLWIDTH NOT SIGN */
] as $kutenBytes => $replacement) {
  $cp932Chars[$kutenBytes] = $replacement;
}
100
/**
 * Check a valid string in one of the CP5022x encodings.
 *
 * $from is passed to identifyValidString() and then converted to UTF-16BE,
 * expecting $to. Unless $bothWays is false, $to is also converted back to
 * $encoding, expecting a normalised form of $from (see below).
 *
 * @param string $from     Bytes in $encoding
 * @param string $to       Expected UTF-16BE bytes
 * @param string $encoding 'CP50220', 'CP50221' or 'CP50222'
 * @param bool   $bothWays Whether to check the UTF-16BE -> $encoding direction too
 */
function testValid($from, $to, $encoding, $bothWays = true) {
  identifyValidString($from, $encoding);
  convertValidString($from, $to, $encoding, 'UTF-16BE', false);

  if (!$bothWays)
    return;

  /* Work out which bytes the reverse conversion is expected to produce */
  $expected = $from;

  /* A leading 0xF only switches to ASCII mode, which is the default anyway */
  if ($expected[0] === "\x0F")
    $expected = substr($expected, 1);

  /* Likewise, a leading ESC ( B is redundant because ASCII mode is the default */
  if (substr($expected, 0, 3) === "\x1B(B")
    $expected = substr($expected, 3);

  /* A string which switched to another charset should switch back to ASCII at the end */
  foreach (["\x1B\$B", "\x1B(J", "\x1B(I"] as $escape) {
    if (strpos($expected, $escape) !== false) {
      $expected .= "\x1B(B";
      break;
    }
  }

  /* For CP50222, a string opening with 0xE is expected to close with 0xF */
  if ($encoding === 'CP50222' && $expected[0] === "\x0E")
    $expected .= "\x0F";

  convertValidString($to, $expected, 'UTF-16BE', $encoding, false);
}
123
/**
 * Shorthand for testInvalidString() with UTF-16BE as the target encoding.
 *
 * @param string $from     Invalid bytes in $encoding
 * @param string $to       Expected UTF-16BE output (callers pass "\x00%" for one error marker)
 * @param string $encoding Source encoding
 */
function testInvalid($from, $to, $encoding) {
  testInvalidString(
    $from,
    $to,
    $encoding,
    'UTF-16BE'
  );
}
127
/* Every byte below 0x80 is plain ASCII, apart from the three control bytes
 * used for switching modes: SO (0x0E), SI (0x0F) and ESC (0x1B) */
for ($byte = 0; $byte < 0x80; $byte++) {
  if ($byte == 0xE || $byte == 0xF || $byte == 0x1B)
    continue;

  $ascii = chr($byte);
  $utf16 = "\x00" . $ascii;

  foreach (['CP50220', 'CP50221', 'CP50222'] as $encoding) {
    testValid($ascii, $utf16, $encoding);
  }
  /* Same again, with ASCII selected explicitly by ESC ( B */
  foreach (['CP50220', 'CP50221', 'CP50222'] as $encoding) {
    testValid("\x1B(B" . $ascii, $utf16, $encoding);
  }
  /* 0x0F is the SI ('Shift In') control code; checked for decoding only */
  testValid("\x0F" . $ascii, $utf16, 'CP50222', false);
}

/* Single bytes from 0x80 up are invalid, whether bare or preceded by ESC ( B
 * or 0x0F. The exception is 0xA1-0xDF, which we convert as JIS X 0201 kana. */
for ($byte = 0x80; $byte < 256; $byte++) {
  if ($byte >= 0xA1 && $byte <= 0xDF)
    continue;

  foreach (['', "\x1B(B", "\x0F"] as $prefix) {
    foreach (['CP50220', 'CP50221', 'CP50222'] as $encoding) {
      testInvalid($prefix . chr($byte), "\x00%", $encoding);
    }
  }
}

/* After a multibyte character, the output must switch back to ASCII */
foreach (['CP50221', 'CP50222'] as $encoding) {
  convertValidString("\x30\x00\x00a\x00b\x00c", "\x1B\$B\x21\x21\x1B(Babc", 'UTF-16BE', $encoding, false);
}

echo "ASCII support OK\n";
159
/* Every valid JIS X 0201 character.
 * Table bytes with the high bit set are JIS X 0201 kana; the rest are Latin. */
foreach ($jisx0201Chars as $jisx0201 => $utf16BE) {
  if (ord($jisx0201) < 128) { /* Latin */
    /* The reverse direction is only checked for codepoints above U+0080 */
    $bothWays = $utf16BE > "\x00\x80";
    foreach (['CP50220', 'CP50221', 'CP50222'] as $encoding) {
      testValid("\x1B(J" . $jisx0201, $utf16BE, $encoding, $bothWays);
    }
    continue;
  }

  /* Kana: the 7-bit form is the table byte with its high bit cleared */
  $kana = chr(ord($jisx0201) - 0x80);

  /* Selected by ESC ( I */
  testValid("\x1B(I" . $kana, $utf16BE, 'CP50221');
  /* Selected by 0x0E, which is the SO ('Shift Out') control code */
  testValid("\x1B(J\x0E" . $kana, $utf16BE, 'CP50222', false);
  testValid("\x0E" . $kana, $utf16BE, 'CP50222', false);
  /* The 8-bit form is accepted as input by all three encodings */
  foreach (['CP50220', 'CP50221', 'CP50222'] as $encoding) {
    testValid($jisx0201, $utf16BE, $encoding, false);
  }
  /* CP50222 output wraps the kana in 0x0E ... 0x0F */
  convertValidString($utf16BE, "\x0E" . $kana . "\x0F", 'UTF-16BE', 'CP50222', false);
}

/* Bytes from 0x80 up, other than 0xA1-0xDF, are invalid after ESC ( I and ESC ( J */
for ($byte = 0x80; $byte < 256; $byte++) {
  if ($byte >= 0xA1 && $byte <= 0xDF)
    continue;

  foreach (["\x1B(I", "\x1B(J"] as $escape) {
    foreach (['CP50220', 'CP50221', 'CP50222'] as $encoding) {
      testInvalid($escape . chr($byte), "\x00%", $encoding);
    }
  }
}

/* Moving on from JIS X 0201 kana to ASCII... */
convertValidString("\xFF\x61\x00A", "\x0E\x21\x0FA", 'UTF-16BE', 'CP50222', false);
/* ...to JIS X 0208... */
convertValidString("\xFF\x61\x22\x25", "\x0E\x21\x0F\x1B\$B\x21\x42\x1B(B", 'UTF-16BE', 'CP50222', false);
/* ...and to JIS X 0201 Latin (this call relies on convertValidString's default fifth argument) */
convertValidString("\xFF\x61\x20\x3E", "\x0E\x21\x0F\x1B(J\x7E\x1B(B", 'UTF-16BE', 'CP50222');

echo "JIS X 0201 support OK\n";
196
/* Every valid CP932 character, selected with ESC $ B, checked in both directions */
foreach ($cp932Chars as $cp932 => $utf16BE) {
  foreach (['CP50220', 'CP50221', 'CP50222'] as $encoding) {
    testValid("\x1B\$B" . $cp932, $utf16BE, $encoding);
  }
}
/* Codes whose codepoint is already claimed by another code: decoding only */
foreach ($nonInvertible as $cp932 => $utf16BE) {
  foreach (['CP50220', 'CP50221', 'CP50222'] as $encoding) {
    testValid("\x1B\$B" . $cp932, $utf16BE, $encoding, false);
  }
}

/* Some conversions are supported from Unicode -> CP5022x only, not the other way round */
$unicodeOnly = [
  "\x22\x25" => "\x21\x42",
  "\xFF\x0D" => "\x21\x5D",
  "\xFF\xE0" => "\x21\x71",
  "\xFF\xE1" => "\x21\x72",
  "\xFF\xE2" => "\x22\x4C",
];
foreach (['CP50220', 'CP50221', 'CP50222'] as $encoding) {
  foreach ($unicodeOnly as $utf16 => $kutenBytes) {
    convertValidString($utf16, "\x1B\$B" . $kutenBytes . "\x1B(B", 'UTF-16BE', $encoding, false);
  }
}

/* Every 2-byte sequence (first byte 0x21-0x97) that is in neither table is invalid */
for ($first = 0x21; $first <= 0x97; $first++) {
  for ($second = 0; $second < 256; $second++) {
    $testString = chr($first) . chr($second);
    if (isset($cp932Chars[$testString]) || isset($nonInvertible[$testString]))
      continue;

    foreach (['CP50220', 'CP50221', 'CP50222'] as $encoding) {
      testInvalid("\x1B\$B" . $testString, "\x00%", $encoding);
    }
  }
}

/* A 2-byte character cut off after its first byte is invalid as well */
for ($first = 0x21; $first <= 0x97; $first++) {
  foreach (['CP50220', 'CP50221', 'CP50222'] as $encoding) {
    testInvalid("\x1B\$B" . chr($first), "\x00%", $encoding);
  }
}

/* ESC $ ( B is an alternative escape sequence for selecting CP932 */
testValid("\x1B\$(B\x21\x21", "\x30\x00", 'CP50220', false);

echo "CP932 support OK\n";
241
/* Every character in the JIS X 0212 table, selected with ESC $ ( D; decoding only */
foreach ($jisx0212Chars as $jisx0212 => $utf16BE) {
  foreach (['CP50220', 'CP50221', 'CP50222'] as $encoding) {
    testValid("\x1B\$(D" . $jisx0212, $utf16BE, $encoding, false);
  }
}

/* Every 2-byte sequence (first byte 0x21-0x97) that is not in the table is invalid */
for ($first = 0x21; $first <= 0x97; $first++) {
  for ($second = 0; $second < 256; $second++) {
    $testString = chr($first) . chr($second);
    if (isset($jisx0212Chars[$testString]))
      continue;

    foreach (['CP50220', 'CP50221', 'CP50222'] as $encoding) {
      testInvalid("\x1B\$(D" . $testString, "\x00%", $encoding);
    }
  }
}

/* A 2-byte character cut off after its first byte is invalid as well */
for ($first = 0x21; $first <= 0x97; $first++) {
  foreach (['CP50220', 'CP50221', 'CP50222'] as $encoding) {
    testInvalid("\x1B\$(D" . chr($first), "\x00%", $encoding);
  }
}

echo "JIS X 0212 support OK\n";
266
/* Map from the Unicode codepoint of each halfwidth katakana (U+FF61-U+FF9F)
 * to the kuten code of the corresponding ordinary katakana, which is what
 * CP50220 is expected to emit for it */
$fullwidthKatakana = [
  /* Punctuation */
  0xFF61 => 0x2123, /* Ideographic full stop */
  0xFF62 => 0x2156, /* Left corner bracket */
  0xFF63 => 0x2157, /* Right corner bracket */
  0xFF64 => 0x2122, /* Ideographic comma */
  0xFF65 => 0x2126, /* Katakana middle dot */
  /* Wo and the small kana */
  0xFF66 => 0x2572, /* Wo */
  0xFF67 => 0x2521, /* Small A */
  0xFF68 => 0x2523, /* Small I */
  0xFF69 => 0x2525, /* Small U */
  0xFF6A => 0x2527, /* Small E */
  0xFF6B => 0x2529, /* Small O */
  0xFF6C => 0x2563, /* Small Ya */
  0xFF6D => 0x2565, /* Small Yu */
  0xFF6E => 0x2567, /* Small Yo */
  0xFF6F => 0x2543, /* Small Tsu */
  0xFF70 => 0x213C, /* Prolonged Sound Marker */
  /* Regular kana */
  0xFF71 => 0x2522, /* A */
  0xFF72 => 0x2524, /* I */
  0xFF73 => 0x2526, /* U */
  0xFF74 => 0x2528, /* E */
  0xFF75 => 0x252A, /* O */
  0xFF76 => 0x252B, /* Ka */
  0xFF77 => 0x252D, /* Ki */
  0xFF78 => 0x252F, /* Ku */
  0xFF79 => 0x2531, /* Ke */
  0xFF7A => 0x2533, /* Ko */
  0xFF7B => 0x2535, /* Sa */
  0xFF7C => 0x2537, /* Shi */
  0xFF7D => 0x2539, /* Su */
  0xFF7E => 0x253B, /* Se */
  0xFF7F => 0x253D, /* So */
  0xFF80 => 0x253F, /* Ta */
  0xFF81 => 0x2541, /* Chi */
  0xFF82 => 0x2544, /* Tsu */
  0xFF83 => 0x2546, /* Te */
  0xFF84 => 0x2548, /* To */
  0xFF85 => 0x254A, /* Na */
  0xFF86 => 0x254B, /* Ni */
  0xFF87 => 0x254C, /* Nu */
  0xFF88 => 0x254D, /* Ne */
  0xFF89 => 0x254E, /* No */
  0xFF8A => 0x254F, /* Ha */
  0xFF8B => 0x2552, /* Hi */
  0xFF8C => 0x2555, /* Fu */
  0xFF8D => 0x2558, /* He */
  0xFF8E => 0x255B, /* Ho */
  0xFF8F => 0x255E, /* Ma */
  0xFF90 => 0x255F, /* Mi */
  0xFF91 => 0x2560, /* Mu */
  0xFF92 => 0x2561, /* Me */
  0xFF93 => 0x2562, /* Mo */
  0xFF94 => 0x2564, /* Ya */
  0xFF95 => 0x2566, /* Yu */
  0xFF96 => 0x2568, /* Yo */
  0xFF97 => 0x2569, /* Ra */
  0xFF98 => 0x256A, /* Ri */
  0xFF99 => 0x256B, /* Ru */
  0xFF9A => 0x256C, /* Re */
  0xFF9B => 0x256D, /* Ro */
  0xFF9C => 0x256F, /* Wa */
  0xFF9D => 0x2573, /* N */
  /* Sound marks */
  0xFF9E => 0x212B, /* Voice Mark */
  0xFF9F => 0x212C, /* Semi-voice Mark */
];
foreach ($fullwidthKatakana as $cp => $kuten) {
  $halfwidthUtf16 = pack('n', $cp);
  $foldedOutput = "\x1B\$B" . pack('n', $kuten) . "\x1B(B";
  convertValidString($halfwidthUtf16, $foldedOutput, 'UTF-16BE', 'CP50220', false);
}

echo "Folding of fullwidth katakana for CP50220 OK\n";
338
/* "\xD8\x00" is U+D800 in UTF-16BE: a high surrogate with nothing following it */
foreach (['CP50220', 'CP50221', 'CP50222'] as $encoding) {
  testInvalidString("\xD8\x00", '%', 'UTF-16BE', $encoding);
}

echo "Invalid Unicode is flagged when converting to CP5022x\n";
344
// Exercise the "long" style of illegal character marker, first with a stray
// high byte, then with a 2-byte character cut short after ESC $ B
mb_substitute_character("long");
foreach (["\x80", "\x1B\$B1"] as $invalidInput) {
  foreach (["CP50220", "CP50221", "CP50222"] as $encoding) {
    convertInvalidString($invalidInput, "%", $encoding, "UTF-8");
  }
}

echo "Long error markers OK\n";
355
/* Incomplete or unrecognised escape sequences => expected UTF-8 output.
 * In each expectation the ESC byte has become the error marker; any bytes
 * after it come through as ordinary text. */
$badEscapes = [
  "\x1B"     => "%",
  "\x1BX"    => "%X",
  "\x1B("    => "%",
  "\x1B(X"   => "%(X",
  "\x1B\$"   => "%",
  "\x1B\$("  => "%",
  "\x1B\$X"  => "%\$X",
  "\x1B\$(X" => "%\$(X",
];
foreach (['CP50220', 'CP50221', 'CP50222'] as $encoding) {
  foreach ($badEscapes as $escapeInput => $markedOutput) {
    testInvalidString($escapeInput, $markedOutput, $encoding, "UTF-8");
  }
}

echo "Invalid escape sequences OK\n";
368
// Regression tests

// An unrecognised escape followed by an invalid byte: the two error markers
// must stay on either side of the 'C'
$converted = mb_convert_encoding("\x1BC\xF5", 'UTF-16BE', 'CP50221');
if ($converted !== "\x00%\x00C\x00%") {
  die("Bad");
}

// The CP50220 implementation used to eat trailing null bytes
$converted = mb_convert_encoding("ab\x00", 'UTF-16BE', 'CP50220');
if ($converted !== "\x00a\x00b\x00\x00") {
  die("Bad handling of trailing null byte (got " . bin2hex($converted) . ")");
}

// The CP50220 implementation used to reorder error markers with the
// characters following them
mb_substitute_character(0x3F);
$converted = mb_convert_encoding("\xff\xff\x00&", 'CP50220', 'UTF-16BE');
if ($converted !== '?&') {
  die("Bad handling of erroneous codepoint followed by good one (got " . bin2hex($converted) . ")");
}

// CP50220 can collapse two codepoints into a single kuten code in some cases;
// here halfwidth Ka (U+FF76) plus the voice mark (U+FF9E) become kuten 0x252C.
// Pad with 0 to 255 leading 'a's so that the pair is tried at many offsets,
// since this has to work even across a boundary between separately processed buffers.
$shouldCollapse = "\xFF\x76\xFF\x9E";
$expected = "\x1B\$B%,\x1B(B";
$utf16Padding = '';
$asciiPadding = '';
for ($count = 0; $count < 256; $count++) {
  convertValidString($utf16Padding . $shouldCollapse, $asciiPadding . $expected, 'UTF-16BE', 'CP50220', false);
  $utf16Padding .= "\x00a";
  $asciiPadding .= 'a';
}
392
393?>
394--EXPECT--
395ASCII support OK
396JIS X 0201 support OK
397CP932 support OK
398JIS X 0212 support OK
399Folding of fullwidth katakana for CP50220 OK
400Invalid Unicode is flagged when converting to CP5022x
401Long error markers OK
402Invalid escape sequences OK
403