1--TEST--
2Test of ASCII and JIS X 0201/0208/0212 support in ISO-2022-JP and JIS7/8 encodings
3--EXTENSIONS--
4mbstring
5--SKIPIF--
6<?php
// This test is exhaustive and therefore slow; let the environment opt out of it
if (getenv("SKIP_SLOW_TESTS")) {
	die("skip slow test");
}
8?>
9--FILE--
10<?php
include('encoding_tests.inc');
mb_substitute_character(0x25); // '%'

/* Read in table of all characters in JISX-0212 charset */
readConversionTable(__DIR__ . '/data/JISX0212.txt', $jisx0212Chars, $unused);

/* Read in table of all characters in JISX-0208 charset.
 * Each data line holds three tab-separated hex values (read here as the
 * Shift-JIS code, the JIS X 0208 code and the Unicode codepoint); lines
 * starting with '#' are skipped.
 * The file is only ever read, so it is opened read-only; asking for write
 * access would needlessly fail when the source tree is not writable */
$jisx0208Chars = []; /* JISX0208 -> UTF-16BE */
$jisx0208Path = __DIR__ . '/data/JISX0208.txt';
$fp = fopen($jisx0208Path, 'r');
if ($fp === false)
	die("Could not open $jisx0208Path\n");
/* Compare against false explicitly, so that only EOF (or a read error) ends
 * the loop, never a line which merely evaluates as falsy */
while (($line = fgets($fp, 256)) !== false) {
	if ($line[0] === '#')
		continue;

	if (sscanf($line, "0x%x\t0x%x\t0x%x", $shiftJIS, $jis0208Code, $unicodeCP) === 3) {
		$jisx0208Chars[pack('n', $jis0208Code)] = pack('n', $unicodeCP);
	}
}
fclose($fp);

/* Read in table of all characters in JISX-0201 charset */
readConversionTable(__DIR__ . '/data/JISX0201.txt', $jisx0201Chars, $unused);

/* The JIS X 0208 character set does not have a single, straightforward
 * mapping to the Unicode character set.
 * mbstring converts one character differently from the mappings in
 * data/JISX0208.txt, which comes from the Unicode Consortium */

/* 0x2140 is a backslash; this can be mapped to 0x005C for an ordinary
 * backslash, or 0xFF3C for a _fullwidth_ one */
$jisx0208Chars["\x21\x40"] = "\xFF\x3C";
40
/**
 * Run identifyValidString() on $from for $encoding and check, with
 * convertValidString(), that it converts to the UTF-16BE string $to.
 *
 * When $bothWays is true, the reverse conversion (UTF-16BE $to back into
 * $encoding) is checked as well, after dropping any redundant leading switch
 * to ASCII mode from $from.
 */
function testValid($from, $to, $encoding, $bothWays = true) {
	identifyValidString($from, $encoding);
	convertValidString($from, $to, $encoding, 'UTF-16BE', false);

	if (!$bothWays) {
		return;
	}

	/* A 0xF at the beginning of a JIS7 string is redundant; it switches
	 * to ASCII mode, but ASCII mode is the default */
	if ($from[0] === "\x0F") {
		$from = substr($from, 1);
	}
	/* ESC ( B at the beginning is redundant for the same reason */
	if (strncmp($from, "\x1B(B", 3) === 0) {
		$from = substr($from, 3);
	}
	convertValidString($to, $from, 'UTF-16BE', $encoding, false);
}
56
/**
 * Shorthand for testInvalidString() with UTF-16BE as the other encoding:
 * $from is the input in $encoding and $to is the expected UTF-16BE result.
 */
function testInvalid($from, $to, $encoding) {
	$targetEncoding = 'UTF-16BE';
	testInvalidString($from, $to, $encoding, $targetEncoding);
}
60
/* Every 7-bit byte is tested as ASCII, except for Shift Out (0xE),
 * Shift In (0xF) and ESC (0x1B) */
for ($asciiByte = 0; $asciiByte < 0x80; $asciiByte++) {
	if (in_array($asciiByte, [0xE, 0xF, 0x1B], true)) {
		continue;
	}

	$asciiChar = chr($asciiByte);
	$asciiAsUtf16 = "\x00" . $asciiChar;

	testValid($asciiChar, $asciiAsUtf16, 'JIS');
	/* 0xF is 'Shift In' code */
	convertValidString("\x0F" . $asciiChar, $asciiAsUtf16, 'JIS', 'UTF-16BE', false);
	testValid("\x1B(B" . $asciiChar, $asciiAsUtf16, 'JIS');
	testValid($asciiChar, $asciiAsUtf16, 'ISO-2022-JP');
	testValid("\x1B(B" . $asciiChar, $asciiAsUtf16, 'ISO-2022-JP');
}

/* Bytes with the high bit set are expected to be invalid in ASCII mode */
for ($highByte = 0x80; $highByte < 256; $highByte++) {
	// We convert single bytes from 0xA1-0xDF as JIS X 0201 kana
	if ($highByte < 0xA1 || $highByte > 0xDF) {
		$highChar = chr($highByte);
		testInvalid($highChar, "\x00%", 'JIS');
		testInvalid("\x0F" . $highChar, "\x00%", 'JIS');
		testInvalid("\x1B(B" . $highChar, "\x00%", 'JIS');
		testInvalid($highChar, "\x00%", 'ISO-2022-JP');
		testInvalid("\x1B(B" . $highChar, "\x00%", 'ISO-2022-JP');
	}
}

echo "ASCII support OK\n";
82
/* All valid JIS X 0201 characters
 * Those with a 1 in the high bit are JIS X 0201 kana; JIS7 encodes those
 * with a 0 in the high bit and treats them as a separate charset
 * (We don't test ISO-2022-JP here, as it does not support the JIS X 0201 charset) */
foreach ($jisx0201Chars as $jisx0201 => $utf16BE) {
	$jisx0201Code = ord($jisx0201);

	if ($jisx0201Code < 128) {
		/* The reverse conversion is only checked for characters whose
		 * UTF-16BE form compares greater than U+0080 */
		testValid("\x1B(J" . $jisx0201 . "\x1B(B", $utf16BE, 'JIS', $utf16BE > "\x00\x80");
		continue;
	}

	/* Kana: try the ESC ( I form, the Shift Out/Shift In form, and the raw 8-bit byte */
	$kana = chr($jisx0201Code - 128);
	testValid("\x1B(I" . $kana . "\x1B(B", $utf16BE, 'JIS', false);
	testValid("\x0E" . $kana . "\x0F", $utf16BE, 'JIS', false); /* 0xE is 'Shift Out' code */
	testValid($jisx0201, $utf16BE, 'JIS', false);
}

/* 8-bit bytes outside 0xA1-0xDF are expected to be invalid in both JIS X 0201 modes */
for ($highByte = 0x80; $highByte < 256; $highByte++) {
	if ($highByte < 0xA1 || $highByte > 0xDF) {
		$highChar = chr($highByte);
		testInvalid("\x1B(I" . $highChar . "\x1B(B", "\x00%", 'JIS');
		testInvalid("\x1B(J" . $highChar . "\x1B(B", "\x00%", 'JIS');
	}
}

echo "JIS X 0201 support OK\n";
106
/* All valid JISX0208 characters, wrapped in ESC $ B ... ESC ( B */
foreach ($jisx0208Chars as $jisx0208 => $utf16BE) {
	$kanjiSequence = "\x1B\$B" . $jisx0208 . "\x1B(B";
	testValid($kanjiSequence, $utf16BE, 'JIS');
	testValid($kanjiSequence, $utf16BE, 'ISO-2022-JP');
}

/* All invalid 2-byte JISX0208 characters: every first byte from 0x21-0x7E
 * combined with every second byte, skipping pairs present in the table */
for ($firstByte = 0x21; $firstByte <= 0x7E; $firstByte++) {
	for ($secondByte = 0; $secondByte < 256; $secondByte++) {
		$bytePair = chr($firstByte) . chr($secondByte);
		if (isset($jisx0208Chars[$bytePair])) {
			continue;
		}
		$badSequence = "\x1B\$B" . $bytePair . "\x1B(B";
		testInvalid($badSequence, "\x00%", 'JIS');
		testInvalid($badSequence, "\x00%", 'ISO-2022-JP');
	}
}

/* Try truncated JISX0208 characters (first byte only) */
for ($firstByte = 0x21; $firstByte <= 0x7E; $firstByte++) {
	$truncated = "\x1B\$B" . chr($firstByte);
	testInvalid($truncated, "\x00%", 'JIS');
	testInvalid($truncated, "\x00%", 'ISO-2022-JP');
}

/* Switch from Kanji to ASCII */
foreach (["JIS", "ISO-2022-JP"] as $kanjiEncoding) {
	testValidString("\x30\x00\x00A", "\x1B\$B\x21\x21\x1B(BA", "UTF-16BE", $kanjiEncoding, false);
}

echo "JIS X 0208 support OK\n";
135
/* JIS7 supports escape to switch to JIS X 0212 charset, but ISO-2022-JP does not */

/* All valid JISX0212 characters, wrapped in ESC $ ( D ... ESC ( B */
foreach ($jisx0212Chars as $jisx0212 => $utf16BE) {
	$jisx0212Sequence = "\x1B\$(D" . $jisx0212 . "\x1B(B";
	testValid($jisx0212Sequence, $utf16BE, 'JIS', false);
}

/* All invalid 2-byte JISX0212 characters: every first byte from 0x21-0x7E
 * combined with every second byte, skipping pairs present in the table */
for ($firstByte = 0x21; $firstByte <= 0x7E; $firstByte++) {
	for ($secondByte = 0; $secondByte < 256; $secondByte++) {
		$bytePair = chr($firstByte) . chr($secondByte);
		if (isset($jisx0212Chars[$bytePair])) {
			continue;
		}
		testInvalid("\x1B\$(D" . $bytePair . "\x1B(B", "\x00%", 'JIS');
	}
}

/* Try truncated JISX0212 characters (first byte only, then ESC ( B) */
for ($firstByte = 0x21; $firstByte <= 0x7E; $firstByte++) {
	$truncated = "\x1B\$(D" . chr($firstByte) . "\x1B(B";
	testInvalid($truncated, "\x00%\x00%", 'JIS');
}

testValidString("\x00\xA1", "\x1B\$(D\x22\x42\x1B(B", "UTF-16BE", "JIS", false);
// Check that ISO-2022-JP treats JISX 0212 chars as error
convertInvalidString("\x00\xA1", "%", "UTF-16BE", "ISO-2022-JP", false);

echo "JIS X 0212 support OK\n";
163
/* All possible escape sequences */
$validJisEscapes = ["\x1B\$@" => true, "\x1B\$B" => true, "\x1B\$(@" => true, "\x1B\$(B" => true, "\x1B\$(D" => true, "\x1B(B" => true, "\x1B(H" => true, "\x1B(J" => true, "\x1B(I" => true];
$validIso2022jpEscapes = ["\x1B\$@" => true, "\x1B\$B" => true, "\x1B(B" => true, "\x1B(J" => true];
$validEscapesByEncoding = [
	'JIS' => $validJisEscapes,
	'ISO-2022-JP' => $validIso2022jpEscapes,
];

/* Check one escape sequence (followed by ESC ( B) against each encoding in
 * turn: if the encoding lists it as valid, it must convert to an empty
 * string; otherwise it must be identified as invalid */
$checkEscapeSequence = function ($escapeSequence) use ($validEscapesByEncoding) {
	foreach ($validEscapesByEncoding as $encoding => $validEscapes) {
		if (isset($validEscapes[$escapeSequence])) {
			testValid($escapeSequence . "\x1B(B", "", $encoding, false);
		} else {
			identifyInvalidString($escapeSequence . "\x1B(B", $encoding);
		}
	}
};

/* Every 3-byte sequence starting with ESC */
for ($secondByte = 0; $secondByte <= 0xFF; $secondByte++) {
	for ($thirdByte = 0; $thirdByte <= 0xFF; $thirdByte++) {
		$escapeSequence = "\x1B" . chr($secondByte) . chr($thirdByte);
		/* ESC $ ( is the prefix of the 4-byte escapes, which are covered by the next loop */
		if ($escapeSequence !== "\x1B\$(") {
			$checkEscapeSequence($escapeSequence);
		}
	}
}

/* Every 4-byte sequence starting with ESC $ ( */
for ($finalByte = 0; $finalByte <= 0xFF; $finalByte++) {
	$checkEscapeSequence("\x1B\$(" . chr($finalByte));
}

/* Also try a bare ESC */
foreach (['JIS', 'ISO-2022-JP'] as $encoding) {
	identifyInvalidString("\x1B", $encoding);
}

/* Incomplete or unrecognized escapes, with the expected UTF-8 output for each */
$brokenEscapes = [
	["\x1B$", "%"],
	["\x1B(", "%"],
	["\x1B,", "%,"],
];
foreach ($brokenEscapes as [$brokenInput, $expectedOutput]) {
	foreach (["JIS", "ISO-2022-JP"] as $encoding) {
		convertInvalidString($brokenInput, $expectedOutput, $encoding, "UTF-8");
	}
}

echo "All escape sequences work as expected\n";
209
/* Extra UTF-16BE -> JIS / ISO-2022-JP mappings, as [UTF-16BE input, expected output] pairs */
$extraMappings = [
	["\x22\x25", "\x1B\$B!B\x1B(B"],
	["\xFF\x0D", "\x1B\$B!]\x1B(B"],
	["\xFF\xE0", "\x1B\$B!q\x1B(B"],
	["\xFF\xE1", "\x1B\$B!r\x1B(B"],
	["\xFF\xE2", "\x1B\$B\"L\x1B(B"],
	["\x00\xA5", "\x1B(J\x5C\x1B(B"],
];
foreach (['JIS', 'ISO-2022-JP'] as $encoding) {
	foreach ($extraMappings as [$utf16Input, $expectedBytes]) {
		testValidString($utf16Input, $expectedBytes, 'UTF-16BE', $encoding, false);
	}
}
/* This mapping is only checked for ISO-2022-JP */
testValidString("\x20\x3E", "\x1B\$B!1\x1B(B", 'UTF-16BE', 'ISO-2022-JP', false);

echo "Other mappings from Unicode -> ISO-2022-JP are OK\n";
222
// Single bytes in the JIS X 0201 kana range (0xA1-0xDF) can be used to encode
// kana in JIS8; a few samples between 0xA3 and 0xDF are checked here, each
// paired with its ESC ( I equivalent
$grInvoked = [
	"\xA3" => "\x1B(I\x23\x1B(B",
	"\xB1" => "\x1B(I\x31\x1B(B",
	"\xC2" => "\x1B(I\x42\x1B(B",
	"\xDF" => "\x1B(I\x5F\x1B(B"
];
foreach ($grInvoked as $gr => $jisx) {
	// JISX 0201 is used as the canonical form for outputting kana
	testValidString($gr, $jisx, 'JIS', 'JIS', false);

	// Both spellings must decode to the same UTF-16BE string
	$decodedFromGr = mb_convert_encoding($gr, 'UTF-16BE', 'JIS');
	$decodedFromEscape = mb_convert_encoding($jisx, 'UTF-16BE', 'JIS');
	if ($decodedFromGr !== $decodedFromEscape) {
		die("Equivalent GR byte and JISX 0201 sequence do not decode to the same codepoint");
	}
}

echo "GR-invoked kana support OK\n";
238
$targetEncodings = ["JIS", "ISO-2022-JP"];

// Check handling of BOM
foreach ($targetEncodings as $encoding) {
	convertInvalidString("\xFF\xFE", "%", "UTF-16BE", $encoding, false);
}

// Test "long" illegal character markers
mb_substitute_character("long");
$longMarkerCases = [
	["\xE0", "%"],
	["\x1B\$(X", "%\$(X"], // Invalid escape
	["\x1B\$B!", "%"], // Truncated character
];
foreach ($longMarkerCases as [$badInput, $expectedOutput]) {
	foreach ($targetEncodings as $encoding) {
		convertInvalidString($badInput, $expectedOutput, $encoding, "UTF-8");
	}
}

echo "Done!\n";
253?>
254--EXPECT--
255ASCII support OK
256JIS X 0201 support OK
257JIS X 0208 support OK
258JIS X 0212 support OK
259All escape sequences work as expected
260Other mappings from Unicode -> ISO-2022-JP are OK
261GR-invoked kana support OK
262Done!
263