1--TEST--
2Exhaustive test of ISO-2022-JP-2004 encoding verification and conversion
3--EXTENSIONS--
4mbstring
5--SKIPIF--
6<?php
/* This test is exhaustive and slow; let the environment opt out of it */
if (getenv("SKIP_SLOW_TESTS")) {
	die("skip slow test");
}
8?>
9--FILE--
10<?php
srand(111); /* Make results consistent */
include('encoding_tests.inc');
mb_substitute_character(0x25); // '%'

/* Read in table of all characters in JISX-0208 charset.
 * Each data line holds three tab-separated hex fields; only the second
 * (JIS X 0208 code) and third (Unicode codepoint) are used here. */
$jisx0208Chars = array(); /* JISX0208 -> UTF-16BE */
/* The table is only ever read, so open it read-only; 'r+' would needlessly
 * require write permission on the data file */
$fp = fopen(__DIR__ . '/data/JISX0208.txt', 'r');
while (($line = fgets($fp, 256)) !== false) {
	/* Lines starting with '#' are comments */
	if ($line[0] == '#')
		continue;

	if (sscanf($line, "0x%x\t0x%x\t0x%x", $shiftJIS, $jis0208Code, $unicodeCP) == 3) {
		$jisx0208Chars[pack('n', $jis0208Code)] = pack('n', $unicodeCP);
	}
}
fclose($fp);

/* The JIS X 0208 character set does not have a single, straightforward
 * mapping to the Unicode character set.
 * mbstring converts one character differently from the mappings in
 * data/JISX0208.txt, which comes from the Unicode Consortium. */

/* 0x2140 is a backslash; this can be mapped to 0x005C for an ordinary
 * backslash, or 0xFF3C for a _fullwidth_ one */
$jisx0208Chars["\x21\x40"] = "\xFF\x3C";

/* Single bytes from 0x0-0x20 are allowed (except ESC, 0x1B, which starts an
 * escape sequence) */
for ($i = 0; $i <= 0x20; $i++) {
	if ($i != 0x1B)
		$jisx0208Chars[chr($i)] = "\x00" . chr($i);
}
/* As is 0x7F */
$jisx0208Chars["\x7F"] = "\x00\x7F";
/* Now read table of JISX-0213:2004 plane 1 and JISX-0213:2000 plane 2 chars.
 * Each data line looks like "<type>-<hex bytes>\tU+<hex>[+<hex>]"; type 3 rows
 * go into the plane 1 table and type 4 rows into the plane 2 table. */
$jisx0213_2004_1Chars = array();
$jisx0213_2000_2Chars = array();
/* The table is only ever read, so open it read-only; 'r+' would needlessly
 * require write permission on the data file */
$fp = fopen(__DIR__ . '/data/ISO-2022-JP-2004-JISX0213.txt', 'r');
while (($line = fgets($fp, 256)) !== false) {
	/* Lines starting with '#' are comments */
	if ($line[0] == '#')
		continue;

	/* The second codepoint is optional; reset it so a value from a previous
	 * line cannot leak into this one */
	$cp2 = null;
	if (sscanf($line, "%d-%x\tU+%x+%x", $type, $bytes, $cp1, $cp2) >= 3) {
		/* Codepoints above the BMP need a surrogate pair in UTF-16BE */
		if ($cp1 <= 0xFFFF)
			$unicode = pack('n', $cp1);
		else
			$unicode = mb_convert_encoding(pack('N', $cp1), 'UTF-16BE', 'UTF-32BE');
		if ($cp2)
			$unicode .= pack('n', $cp2);

		if ($type == 3)
			$jisx0213_2004_1Chars[pack('n', $bytes)] = $unicode;
		else if ($type == 4)
			$jisx0213_2000_2Chars[pack('n', $bytes)] = $unicode;
	}
}
fclose($fp);

/* JISX 0213 plane 1 0x2131 is an overline; Unicode has a halfwidth overline
 * at 0x203E and a fullwidth overline at 0xFFE3
 * We'll use the fullwidth version when converting JISX 0213 to Unicode */
$jisx0213_2004_1Chars["\x21\x31"] = "\xFF\xE3";
/* Same deal with the Yen sign; use the fullwidth one */
$jisx0213_2004_1Chars["\x21\x6F"] = "\xFF\xE5";

/* Since JISX 0213 is an extension of JISX 0208, allow the same single-byte
 * chars (0x0-0x20 except ESC, plus 0x7F) in both plane tables */
for ($i = 0; $i <= 0x20; $i++) {
	if ($i != 0x1B) {
		$jisx0213_2004_1Chars[chr($i)] = "\x00" . chr($i);
		$jisx0213_2000_2Chars[chr($i)] = "\x00" . chr($i);
	}
}
$jisx0213_2004_1Chars["\x7F"] = "\x00\x7F";
$jisx0213_2000_2Chars["\x7F"] = "\x00\x7F";
87
/**
 * Run the "valid string" checks for an ISO-2022-JP-2004 input.
 *
 * Passes $from to identifyValidString() and then to convertValidString()
 * (ISO-2022-JP-2004 -> UTF-16BE, expecting $to). When $bothWays is true, the
 * reverse conversion (UTF-16BE -> ISO-2022-JP-2004) is checked as well,
 * against a normalized form of $from.
 */
function testValid($from, $to, $bothWays = true) {
	identifyValidString($from, 'ISO-2022-JP-2004');
	convertValidString($from, $to, 'ISO-2022-JP-2004', 'UTF-16BE', false);

	if (!$bothWays)
		return;

	/* Going in the opposite direction: a leading ESC ( B is redundant in
	 * ISO-2022-JP-2004 since ASCII mode is the default, so it is dropped
	 * from the expected output */
	$expected = $from;
	if (strncmp($expected, "\x1B(B", 3) === 0)
		$expected = substr($expected, 3);

	/* If the string switches to any non-ASCII charset, the expected output
	 * must switch back to ASCII at the end */
	foreach (["\x1B\$B", "\x1B\$(Q", "\x1B\$(P"] as $charsetEscape) {
		if (strpos($expected, $charsetEscape) !== false) {
			$expected .= "\x1B(B";
			break;
		}
	}

	convertValidString($to, $expected, 'UTF-16BE', 'ISO-2022-JP-2004', false);
}
106
/**
 * Thin wrapper around testInvalidString() with the source encoding fixed to
 * ISO-2022-JP-2004 and the target encoding fixed to UTF-16BE.
 */
function testInvalid($from, $to) {
	$sourceEncoding = 'ISO-2022-JP-2004';
	$targetEncoding = 'UTF-16BE';
	testInvalidString($from, $to, $sourceEncoding, $targetEncoding);
}
110
$asciiCodes = range(0x00, 0x7F);

/* Every ASCII byte except ESC, in the default (ASCII) mode */
foreach ($asciiCodes as $code) {
	if ($code != 0x1B) {
		$char = chr($code);
		testValid($char, "\x00" . $char);
	}
}

/* The same bytes again, this time preceded by an explicit ESC ( B */
foreach ($asciiCodes as $code) {
	if ($code != 0x1B) {
		$char = chr($code);
		testValid("\x1B(B" . $char, "\x00" . $char);
	}
}

echo "Encoding verification and conversion works for all ASCII characters\n";

/* An ESC byte on its own */
identifyInvalidString("\x1B", 'ISO-2022-JP-2004');

/* Every single byte with the high bit set (0x80-0xFF) */
foreach (range(0x80, 0xFF) as $code) {
	testInvalid(chr($code), "\x00%");
}

echo "Encoding verification and conversion rejects all invalid single bytes\n";
136
/* Every entry of the JISX0208 table, prefixed with ESC $ B.
 * JIS X 0213 is a superset of JIS X 0208, so JIS X 0208 is not used when
 * converting Unicode to ISO-2022-JP-2004; hence only one direction is
 * tested here */
foreach ($jisx0208Chars as $jisBytes => $expectedUtf16) {
	testValid("\x1B\$B" . $jisBytes, $expectedUtf16, false);
}

/* Single bytes in JISX0208 mode which are not in the table.
 * ESC is skipped, and so is 0x21-0x7E, the range used for the first byte of
 * a 2-byte character below */
for ($byte = 0; $byte <= 0xFF; $byte++) {
	$isEscape    = ($byte == 0x1B);
	$isFirstByte = ($byte >= 0x21 && $byte <= 0x7E);
	if ($isEscape || $isFirstByte)
		continue;

	$candidate = chr($byte);
	if (isset($jisx0208Chars[$candidate]))
		continue;

	testInvalid("\x1B\$B" . $candidate, "\x00%");
}

/* 2-byte sequences in JISX0208 mode which are not in the table */
for ($first = 0x21; $first <= 0x7E; $first++) {
	for ($second = 0; $second <= 0xFF; $second++) {
		$candidate = chr($first) . chr($second);
		if (isset($jisx0208Chars[$candidate]))
			continue;

		testInvalid("\x1B\$B" . $candidate, "\x00%");
	}
}

echo "Encoding verification and conversion work on JISX-0208 characters\n";
168
/* The plane 1 and plane 2 checks are identical apart from the escape
 * sequence, the character table and the progress message, so drive both
 * from one list (plane 1 first, then plane 2) */
$jisx0213Planes = [
	["\x1B$(Q", $jisx0213_2004_1Chars, "Encoding verification and conversion work on JISX-0213:2004 plane 1 characters\n"],
	["\x1B$(P", $jisx0213_2000_2Chars, "Encoding verification and conversion work on JISX-0213:2000 plane 2 characters\n"],
];

foreach ($jisx0213Planes as [$planeEscape, $planeChars, $progressMessage]) {
	/* All characters in this plane's table */
	foreach ($planeChars as $jisBytes => $expectedUtf16) {
		/* For single bytes (which map below "\x01\x00"), don't try
		 * conversion in both directions */
		$tryBothWays = $expectedUtf16 > "\x01\x00";
		testValid($planeEscape . $jisBytes, $expectedUtf16, $tryBothWays);
	}

	/* All 2-byte sequences which are not in this plane's table */
	for ($first = 0x21; $first <= 0x7E; $first++) {
		for ($second = 0; $second <= 0xFF; $second++) {
			$candidate = chr($first) . chr($second);
			if (isset($planeChars[$candidate]))
				continue;

			testInvalid($planeEscape . $candidate, "\x00%");
		}
	}

	echo $progressMessage;
}
204
/* Walk through every 3-byte sequence starting with ESC; the four below are
 * treated as valid, everything else must be identified as invalid */
$validEscapes = ["\x1B\$B" => true, "\x1B(B" => true, "\x1B$(Q" => true, "\x1B$(P" => true];
foreach (range(0x00, 0xFF) as $second) {
	foreach (range(0x00, 0xFF) as $third) {
		$escapeSequence = "\x1B" . chr($second) . chr($third);
		if (!array_key_exists($escapeSequence, $validEscapes)) {
			identifyInvalidString($escapeSequence, 'ISO-2022-JP-2004');
			continue;
		}
		/* A valid escape with nothing after it converts to an empty string */
		testValid($escapeSequence, "", false);
	}
}

echo "All escape sequences work as expected\n";

/* Escape sequences which stop short */
foreach (["\x1B$", "\x1B(", "\x1B$("] as $incompleteEscape) {
	identifyInvalidString($incompleteEscape, 'ISO-2022-JP-2004');
}

echo "All incomplete escape sequences are rejected\n";
225
/* Build one sample per charset (escape sequence + one character; the JIS
 * ones are picked at random from the first 1001 table entries), then check
 * every ordered pair of samples in a single string */
$ascii = "\x1B(Ba";
$jisx0208Keys = array_keys($jisx0208Chars);
$jisx0208 = "\x1B\$B" . $jisx0208Keys[rand(0,1000)];
$plane1Keys = array_keys($jisx0213_2004_1Chars);
$jisx0213_1 = "\x1B$(Q" . $plane1Keys[rand(0,1000)];
$plane2Keys = array_keys($jisx0213_2000_2Chars);
$jisx0213_2 = "\x1B$(P" . $plane2Keys[rand(0,1000)];

$differentCharsets = [$ascii, $jisx0208, $jisx0213_1, $jisx0213_2];
foreach ($differentCharsets as $first) {
	foreach ($differentCharsets as $second) {
		identifyValidString($first . $second, 'ISO-2022-JP-2004');
	}
}

/* Try redundant escape sequences (switching mode without including any
 * characters in the new mode); every pair should convert to an empty string */
$differentCharsets = ["\x1B(B", "\x1B\$B", "\x1B$(Q", "\x1B$(P"];
foreach ($differentCharsets as $first) {
	foreach ($differentCharsets as $second) {
		testValid($first . $second, "", false);
	}
}

echo "Combining multiple charsets in the same string works as expected\n";
252
/* Chop the final byte off each sample so the string ends in the middle of a
 * character: first JISX0208, then JISX0213 plane 1, then JISX0213 plane 2 */
foreach ([$jisx0208, $jisx0213_1, $jisx0213_2] as $completeSample) {
	testInvalid(substr($completeSample, 0, -1), "\x00%");
}

echo "Strings with truncated multi-byte characters are rejected\n";
263
/* We have tried converting all kinds of strings with single characters;
 * now try some random examples of strings with multiple characters */

/* Shuffled pools of table keys; characters are popped off as they are used,
 * so no character appears twice across the generated strings */
$jisx0208 = array_keys($jisx0208Chars);
shuffle($jisx0208);
$jisx0213_1 = array_keys($jisx0213_2004_1Chars);
shuffle($jisx0213_1);
$jisx0213_2 = array_keys($jisx0213_2000_2Chars);
shuffle($jisx0213_2);

for ($i = 0; $i < 100; $i++) {
	$size = rand(5,20);
	$testString = '';
	$convertsTo = '';

	/* Build a string from a random combination of characters in the supported
	 * character sets; each of the $size segments is an escape sequence
	 * followed by 0-10 characters from that charset */
	while ($size--) {
		/* There are four charsets, so draw from 0-3; drawing from 0-4 would
		 * make the final branch twice as likely as the others */
		$type  = rand(0,3);
		$chars = rand(0,10);
		if ($type == 0) { /* ASCII */
			$testString .= "\x1B(B";
			while ($chars--) {
				$ascii = chr(rand(0x20, 0x7E));
				$testString .= $ascii;
				$convertsTo .= "\x00" . $ascii;
			}
		} else if ($type == 1) { /* JIS X 0208 */
			$testString .= "\x1B\$B";
			/* Stop early if the pool runs dry, rather than popping null */
			while ($chars-- && !empty($jisx0208)) {
				$jis = array_pop($jisx0208);
				$testString .= $jis;
				$convertsTo .= $jisx0208Chars[$jis];
			}
		} else if ($type == 2) { /* JIS X 0213:2004 plane 1 */
			$testString .= "\x1B$(Q";
			while ($chars-- && !empty($jisx0213_1)) {
				$jis = array_pop($jisx0213_1);
				$testString .= $jis;
				$convertsTo .= $jisx0213_2004_1Chars[$jis];
			}
		} else { /* JIS X 0213:2000 plane 2 */
			$testString .= "\x1B$(P";
			while ($chars-- && !empty($jisx0213_2)) {
				$jis = array_pop($jisx0213_2);
				$testString .= $jis;
				$convertsTo .= $jisx0213_2000_2Chars[$jis];
			}
		}
	}

	/* Only ISO-2022-JP-2004 -> UTF-16BE is checked for these strings */
	testValid($testString, $convertsTo, false);
}
316
// Test "long" illegal character markers
mb_substitute_character("long");
$longMarkerInputs = [
	"\xE0",      // Single byte with the high bit set
	"\x1B\$(X",  // Invalid escape
	"\x1B\$B!",  // Truncated character
];
foreach ($longMarkerInputs as $badInput) {
	convertInvalidString($badInput, "%", "ISO-2022-JP-2004", "UTF-8");
}

echo "All done!\n";
324
325?>
326--EXPECT--
327Encoding verification and conversion works for all ASCII characters
328Encoding verification and conversion rejects all invalid single bytes
329Encoding verification and conversion work on JISX-0208 characters
330Encoding verification and conversion work on JISX-0213:2004 plane 1 characters
331Encoding verification and conversion work on JISX-0213:2000 plane 2 characters
332All escape sequences work as expected
333All incomplete escape sequences are rejected
334Combining multiple charsets in the same string works as expected
335Strings with truncated multi-byte characters are rejected
336All done!
337