--TEST--
Exhaustive test of Shift-JIS encoding verification and conversion
--EXTENSIONS--
mbstring
--SKIPIF--
<?php
/* This is an exhaustive (slow) test; skip it when SKIP_SLOW_TESTS is set
 * in the environment */
if (getenv("SKIP_SLOW_TESTS")) die("skip slow test");
?>
--FILE--
<?php
srand(999); /* Make results consistent */
include('encoding_tests.inc');
mb_substitute_character(0x25); // '%'

/* Read in the table of all characters in Shift-JIS */
readConversionTable(__DIR__ . '/data/SHIFTJIS.txt', $validChars, $fromUnicode);

/* Add identity mappings for the control bytes 0x00-0x1F in both directions
 * (the Unicode side is written as a 2-byte UTF-16BE code unit).
 * NOTE(review): presumably these are not supplied by the table file; confirm */
for ($i = 0; $i < 0x20; $i++) {
  $validChars[chr($i)] = "\x00" . chr($i);
  $fromUnicode["\x00" . chr($i)] = chr($i);
}

/* According to the relevant Japanese Industrial Standards Committee standards,
 * SJIS 0x5C is a Yen sign, and 0x7E is an overline.
 *
 * However, this conflicts with the implementation of SJIS in various legacy
 * software (notably Microsoft products), where SJIS 0x5C and 0x7E are taken
 * as equivalent to the same ASCII bytes.
 *
 * Prior to PHP 8.1, mbstring's implementation of SJIS handled these bytes
 * compatibly with Microsoft products. This was changed in PHP 8.1.0, in an
 * attempt to comply with the JISC specifications. However, after discussion
 * with various concerned Japanese developers, it seems that the historical
 * behavior was more useful in the majority of applications which process
 * SJIS-encoded text.
 *
 * This test therefore expects the ASCII-compatible behavior: 0x5C <-> U+005C
 * and 0x7E <-> U+007E. */
$validChars["\x5C"] = "\x00\x5C";
$validChars["\x7E"] = "\x00\x7E";
$fromUnicode["\x00\x5C"] = "\x5C";
$fromUnicode["\x00\x7E"] = "\x7E";

/* That means it does not make sense to convert U+203E (OVERLINE)
 * to 0x7E; convert it to JIS X 0208 FULLWIDTH MACRON instead */
$fromUnicode["\x20\x3E"] = "\x81\x50";
/* U+00AF is MACRON; convert that to FULLWIDTH MACRON as well */
$fromUnicode["\x00\xAF"] = "\x81\x50";
/* Since we are treating 0x5C as equivalent to U+005C, it does not
 * make sense to convert U+00A5 (YEN SIGN) to 0x5C
 * Convert it to JIS X 0208 FULLWIDTH YEN SIGN instead */
$fromUnicode["\x00\xA5"] = "\x81\x8F";

/* DEL character */
$validChars["\x7F"] = "\x00\x7F";
$fromUnicode["\x00\x7F"] = "\x7F";
/* Use fullwidth reverse solidus, not (halfwidth) backslash (0x5C) */
$validChars["\x81\x5F"] = "\xFF\x3C";
$fromUnicode["\xFF\x3C"] = "\x81\x5F";
/* Unicode has both halfwidth and fullwidth NOT SIGN; convert both of them
 * to JIS X 0208 NOT SIGN */
$fromUnicode["\xFF\xE2"] = "\x81\xCA";
/* Likewise for fullwidth and halfwidth POUND SIGN */
$fromUnicode["\xFF\xE1"] = "\x81\x92";
/* Likewise for fullwidth and halfwidth CENT SIGN */
$fromUnicode["\xFF\xE0"] = "\x81\x91";
/* Convert Unicode FULLWIDTH TILDE to JIS X 0208 WAVE DASH */
$fromUnicode["\xFF\x5E"] = "\x81\x60";
/* Convert Unicode FULLWIDTH HYPHEN-MINUS to JIS X 0208 MINUS SIGN */
$fromUnicode["\xFF\x0D"] = "\x81\x7C";
/* Convert Unicode PARALLEL TO to JIS X 0208 DOUBLE VERTICAL LINE */
$fromUnicode["\x22\x25"] = "\x81\x61";

/* Shift-JIS -> UTF-16BE, for every entry of $validChars */
testAllValidChars($validChars, 'Shift-JIS', 'UTF-16BE');
echo "SJIS verification and conversion works on all valid characters\n";

/* First bytes 0x81-0x9F and 0xE0-0xEF are passed with the value 2.
 * NOTE(review): presumably the byte length of a sequence starting with that
 * byte; confirm against findInvalidChars() in encoding_tests.inc.
 * "\x00%" is the substitution character '%' written as UTF-16BE. */
findInvalidChars($validChars, $invalidChars, $truncated,
  array_fill_keys(range(0x81, 0x9F), 2) + array_fill_keys(range(0xE0, 0xEF), 2));
testAllInvalidChars($invalidChars, $validChars, 'Shift-JIS', 'UTF-16BE', "\x00%");
testTruncatedChars($truncated, 'Shift-JIS', 'UTF-16BE', "\x00%");
echo "SJIS verification and conversion works on all invalid characters\n";

/* UTF-16BE -> Shift-JIS, for every entry of $fromUnicode.
 * NOTE(review): the trailing `false` presumably turns off a check in the
 * opposite direction, since several code points above share one SJIS
 * sequence; confirm against testAllValidChars() */
testAllValidChars($fromUnicode, 'UTF-16BE', 'Shift-JIS', false);
echo "Unicode -> SJIS conversion works on all valid characters\n";

/* Every possible first byte is passed with the value 2 here, in line with
 * the 2-byte UTF-16BE keys used in $fromUnicode */
findInvalidChars($fromUnicode, $invalidChars, $unused, array_fill_keys(range(0, 0xFF), 2));
convertAllInvalidChars($invalidChars, $fromUnicode, 'UTF-16BE', 'Shift-JIS', '%');
echo "Unicode -> SJIS conversion works on all invalid characters\n";

/* Spot-check FULLWIDTH TILDE -> WAVE DASH again, this time using the
 * encoding name 'SJIS' instead of 'Shift-JIS' */
testValidString("\xFF\x5E", "\x81\x60", 'UTF-16BE', 'SJIS', false);
echo "Other mappings from Unicode -> SJIS are OK\n";

// Test "long" illegal character markers
// Each of these invalid SJIS byte sequences is still expected to convert to
// a single '%'.
// NOTE(review): presumably "long" mode only affects how unconvertible code
// points are rendered, not invalid input bytes; confirm against mbstring
mb_substitute_character("long");
convertInvalidString("\x80", "%", "Shift-JIS", "UTF-8");
convertInvalidString("\x81\x20", "%", "Shift-JIS", "UTF-8");
convertInvalidString("\xEA\xA9", "%", "Shift-JIS", "UTF-8");

echo "Done!\n";
?>
--EXPECT--
SJIS verification and conversion works on all valid characters
SJIS verification and conversion works on all invalid characters
Unicode -> SJIS conversion works on all valid characters
Unicode -> SJIS conversion works on all invalid characters
Other mappings from Unicode -> SJIS are OK
Done!