1--TEST--
2Exhaustive test of EUC-JP-2004 encoding verification and conversion
3--EXTENSIONS--
4mbstring
5--SKIPIF--
<?php
/* This test is exhaustive; bail out with a "skip" message when the
 * SKIP_SLOW_TESTS environment variable is set to a truthy value */
if (getenv("SKIP_SLOW_TESTS")) {
  die("skip slow test");
}
?>
9--FILE--
10<?php
/* Fixed seed, so that anything drawing on the PRNG gives consistent results */
srand(200);

/* Shared test helpers; the functions called further down are presumably
 * defined in this file (it is not visible from here) */
include 'encoding_tests.inc';

/* Use '%' (0x25) as the substitute character */
mb_substitute_character(0x25);
14
/* Conversion tables built from the reference mapping file:
 *   $validChars:  EUC-JP-2004 byte string -> UTF-32BE byte string
 *   $fromUnicode: UTF-16BE byte string    -> EUC-JP-2004 byte string */
$validChars = array();
$fromUnicode = array();

/* The table is only ever read, so open it read-only. Mode 'r+' also asks
 * for write access, which makes fopen() fail if the file is not writable. */
$fp = fopen(__DIR__ . '/data/EUC-JP-2004.txt', 'r');
if ($fp === false)
  die("Could not open " . __DIR__ . "/data/EUC-JP-2004.txt\n");

/* Compare against false explicitly, so that a line which is falsy as a
 * string (such as "0") cannot end the loop before the end of the file */
while (($line = fgets($fp, 256)) !== false) {
  /* Lines starting with '#' are skipped */
  if ($line[0] === '#')
    continue;

  /* The second codepoint is optional (only two fields have to match),
   * so clear it before parsing each line */
  $codepoint2 = null;
  if (sscanf($line, "0x%x\tU+%x+%x", $bytes, $codepoint1, $codepoint2) >= 2) {
    /* Turn the numeric byte value into a 1-, 2- or 3-byte string */
    if ($bytes < 256)
      $eucjp = chr($bytes);
    else if ($bytes <= 0xFFFF)
      $eucjp = pack('n', $bytes);
    else
      $eucjp = chr($bytes >> 16) . pack('n', $bytes & 0xFFFF);

    if ($codepoint2) {
      /* Two-codepoint sequence: recorded for EUC-JP-2004 -> Unicode only */
      $validChars[$eucjp] = pack('NN', $codepoint1, $codepoint2);
    } else {
      $validChars[$eucjp] = pack('N', $codepoint1);
      /* The reverse table is keyed by a single 16-bit unit, so only
       * codepoints up to U+FFFF are entered into it */
      if ($codepoint1 <= 0xFFFF)
        $fromUnicode[pack('n', $codepoint1)] = $eucjp;
    }
  }
}

fclose($fp);
40
/* Adjustments applied on top of the tables loaded from the mapping file.
 *
 * EUC-JP-2004 -> UTF-32BE:
 *   0xA1B1 becomes U+FFE3 (FULLWIDTH MACRON), not U+203E (OVERLINE)
 *   0xA1EF becomes U+FFE5 (FULLWIDTH YEN SIGN), not U+00A5 (YEN SIGN) */
$decodeOverrides = array(
  "\xA1\xB1" => "\x00\x00\xFF\xE3",
  "\xA1\xEF" => "\x00\x00\xFF\xE5",
);
foreach ($decodeOverrides as $legacyBytes => $utf32)
  $validChars[$legacyBytes] = $utf32;

/* UTF-16BE -> EUC-JP-2004:
 *   The first two entries are the reverse of the overrides above.
 *   U+00A5 (YEN SIGN) becomes 0x5C. That byte is a backslash in ASCII, but
 *   many legacy Japanese text encodings, and the software using them,
 *   treated it as a yen sign instead.
 *   U+203E (OVERLINE) becomes 0x7E. That byte is a tilde in ASCII, but
 *   legacy Japanese software used it for an overline. */
$encodeOverrides = array(
  "\xFF\xE3" => "\xA1\xB1",
  "\xFF\xE5" => "\xA1\xEF",
  "\x00\xA5" => "\x5C",
  "\x20\x3E" => "\x7E",
);
foreach ($encodeOverrides as $utf16 => $legacyBytes)
  $fromUnicode[$utf16] = $legacyBytes;
57
/* The helper functions called here are not defined in this file; they are
 * presumably provided by encoding_tests.inc. The calls must stay in this
 * order, since $invalidChars is reused between the two directions. */

/* '%' in UTF-32BE, passed as the marker for the invalid/truncated input checks */
$utf32Subst = "\x00\x00\x00%";

/* EUC-JP-2004 -> UTF-32BE, valid input */
testAllValidChars($validChars, 'EUC-JP-2004', 'UTF-32BE');
echo "EUC-JP-2004 verification and conversion works for all valid characters\n";

/* EUC-JP-2004 -> UTF-32BE, invalid and truncated input
 * ($invalidChars and $truncated look like output parameters -- confirm
 * against the helper's signature) */
findInvalidChars($validChars, $invalidChars, $truncated);
testAllInvalidChars($invalidChars, $validChars, 'EUC-JP-2004', 'UTF-32BE', $utf32Subst);
testTruncatedChars($truncated, 'EUC-JP-2004', 'UTF-32BE', $utf32Subst);
echo "EUC-JP-2004 verification and conversion rejects all invalid characters\n";

/* UTF-16BE -> EUC-JP-2004, valid input */
testAllValidChars($fromUnicode, 'UTF-16BE', 'EUC-JP-2004', false);
echo "Unicode -> EUC-JP-2004 conversion works on all valid characters\n";

/* UTF-16BE -> EUC-JP-2004, invalid input. The table maps every byte value
 * 0x00..0xFF to 2; presumably this is the sequence length implied by each
 * leading byte -- TODO confirm against findInvalidChars() */
$lengthByFirstByte = array_fill_keys(range(0, 0xFF), 2);
findInvalidChars($fromUnicode, $invalidChars, $unused, $lengthByFirstByte);
convertAllInvalidChars($invalidChars, $fromUnicode, 'UTF-16BE', 'EUC-JP-2004', '%');
echo "Unicode -> EUC-JP-2004 conversion works on all invalid characters\n";
72
// Test "long" illegal character markers
mb_substitute_character("long");

// Both inputs go through the same EUC-JP-2004 -> UTF-8 check with "%" as the
// second argument (presumably the expected output -- confirm against
// convertInvalidString() in the helper file)
foreach (array("\x80", "\xFE\xFF") as $badInput) {
  convertInvalidString($badInput, "%", "EUC-JP-2004", "UTF-8");
}

echo "Done!\n";
79?>
80--EXPECT--
81EUC-JP-2004 verification and conversion works for all valid characters
82EUC-JP-2004 verification and conversion rejects all invalid characters
83Unicode -> EUC-JP-2004 conversion works on all valid characters
84Unicode -> EUC-JP-2004 conversion works on all invalid characters
85Done!
86