--TEST--
Exhaustive test of EUC-JP encoding verification and conversion
--EXTENSIONS--
mbstring
--SKIPIF--
<?php
/* This test walks every character in the conversion table, so allow it to
 * be skipped when the harness asks for slow tests to be left out */
if (getenv("SKIP_SLOW_TESTS")) die("skip slow test");
?>
--FILE--
<?php
srand(555); /* Make results consistent */
include('encoding_tests.inc');
mb_substitute_character(0x25); // '%'

/* Read in the table of all characters in EUC-JP.
 * $validChars is keyed by EUC-JP byte sequence and $fromUnicode by UTF-32BE
 * code unit, as the manual overrides below show.
 * NOTE(review): the trailing `true` presumably asks the helper for UTF-32BE
 * values; confirm against encoding_tests.inc */
readConversionTable(__DIR__ . '/data/EUC-JP.txt', $validChars, $fromUnicode, true);

/* The JIS X 0208 character set does not have a single, straightforward
 * mapping to the Unicode character set; the overrides below pin down the
 * choices this test expects */

/* Kuten code 0x2140 (EUC-JP 0xA1C0) is a backslash; this can be mapped to
 * U+005C for an ordinary backslash, or U+FF3C for a _fullwidth_ one.
 * We go with fullwidth */
$validChars["\xA1\xC0"] = "\x00\x00\xFF\x3C";
$fromUnicode["\x00\x00\xFF\x3C"] = "\xA1\xC0";

/* Unicode has both halfwidth and fullwidth NOT SIGN; convert both of them
 * to JIS X 0208 NOT SIGN */
$fromUnicode["\x00\x00\xFF\xE2"] = "\xA2\xCC";
/* Likewise for fullwidth and halfwidth POUND SIGN */
$fromUnicode["\x00\x00\xFF\xE1"] = "\xA1\xF2";
/* Likewise for fullwidth and halfwidth CENT SIGN */
$fromUnicode["\x00\x00\xFF\xE0"] = "\xA1\xF1";
/* Convert Unicode FULLWIDTH TILDE to JIS X 0208 WAVE DASH */
$fromUnicode["\x00\x00\xFF\x5E"] = "\xA1\xC1";
/* Convert Unicode FULLWIDTH HYPHEN-MINUS to JIS X 0208 MINUS SIGN */
$fromUnicode["\x00\x00\xFF\x0D"] = "\xA1\xDD";
/* Convert Unicode PARALLEL TO to JIS X 0208 DOUBLE VERTICAL LINE */
$fromUnicode["\x00\x00\x22\x25"] = "\xA1\xC2";

/* U+007E (tilde) can be represented in two different ways in EUC-JP: as the
 * single byte 0x7E, or as JIS X 0212 0x8FA2B7.
 * When converting Unicode to EUC-JP, use the simpler representation */
$fromUnicode["\x00\x00\x00\x7E"] = "\x7E";
/* Likewise with U+005C (backslash): use the single byte 0x5C */
$fromUnicode["\x00\x00\x00\x5C"] = "\x5C";

/* U+203E is OVERLINE; convert to FULLWIDTH MACRON */
$fromUnicode["\x00\x00\x20\x3E"] = "\xA1\xB1";

/* Lead bytes 0xA1-0xFE and 0x8E are mapped to 2, and 0x8F to 3.
 * NOTE(review): these look like the expected total byte length of a
 * character starting with that byte; confirm against findInvalidChars() */
$leadByteLengths = array_fill_keys(range(0xA1, 0xFE), 2) + [0x8E => 2, 0x8F => 3];
findInvalidChars($validChars, $invalidChars, $truncated, $leadByteLengths);

/* In the JIS X 0212 character set, kuten code 0x2237 (EUC-JP 0x8FA2B7)
 * is an ordinary tilde character.
 * This mapping is not reversible, because ASCII 0x7E also represents
 * the same character; so it is removed from the round-trip table here and
 * checked separately further down */
unset($validChars["\x8F\xA2\xB7"]);

testAllValidChars($validChars, 'EUC-JP', 'UTF-32BE');
echo "Encoding verification and conversion work for all valid characters\n";

/* "\x00\x00\x00%" is the substitute character '%' set above, in UTF-32BE */
testAllInvalidChars($invalidChars, $validChars, 'EUC-JP', 'UTF-32BE', "\x00\x00\x00%");
testTruncatedChars($truncated, 'EUC-JP', 'UTF-32BE', "\x00\x00\x00%");
echo "Encoding verification and conversion work for all invalid characters\n";

/* One-way check for the JIS X 0212 tilde removed from $validChars above.
 * NOTE(review): the trailing `false` presumably disables the reverse-direction
 * check; confirm against testValidString() */
testValidString("\x8F\xA2\xB7", "\x00\x00\x00~", 'EUC-JP', 'UTF-32BE', false);
echo "Irreversible mapping of 0x8FA2B7 follows JIS X 0212 correctly\n";

testAllValidChars($fromUnicode, 'UTF-32BE', 'EUC-JP', false);
echo "Unicode -> EUC-JP conversion works on all valid characters\n";

/* Every code point in U+0000..U+FFFF which has no entry in $fromUnicode is
 * collected as a character that cannot be converted to EUC-JP */
$invalidChars = [];
for ($cp = 0; $cp <= 0xFFFF; $cp++) {
    $char = pack('N', $cp); // 32-bit big-endian, i.e. one UTF-32BE code unit
    if (!isset($fromUnicode[$char])) {
        $invalidChars[$char] = true;
    }
}
convertAllInvalidChars($invalidChars, $fromUnicode, 'UTF-32BE', 'EUC-JP', '%');
echo "Unicode -> EUC-JP conversion works on all invalid characters\n";

// Test "long" illegal character markers
mb_substitute_character("long");
convertInvalidString("\x80", "%", "EUC-JP", "UTF-8");
convertInvalidString("\xFE\xFF", "%", "EUC-JP", "UTF-8");

echo "Done!\n";
?>
--EXPECT--
Encoding verification and conversion work for all valid characters
Encoding verification and conversion work for all invalid characters
Irreversible mapping of 0x8FA2B7 follows JIS X 0212 correctly
Unicode -> EUC-JP conversion works on all valid characters
Unicode -> EUC-JP conversion works on all invalid characters
Done!