xref: /PHP-8.1/ext/mbstring/tests/hz_encoding.phpt (revision 776296e1)
--TEST--
Exhaustive test of verification and conversion of HZ text
--EXTENSIONS--
mbstring
--SKIPIF--
<?php
// Slow test: bail out when SKIP_SLOW_TESTS is set to a truthy value in the environment
if (getenv("SKIP_SLOW_TESTS")) {
    die("skip slow test");
}
?>
--FILE--
<?php
// The helpers called below (testValidString(), testInvalidString(), ...) are not
// defined in this script; presumably they come from this include
include 'encoding_tests.inc';

// Fixed seed so that the shuffle() calls below behave the same on every run
srand(1000);

// Use '%' (0x25) as the substitution character
mb_substitute_character(ord('%'));

// Every 7-bit byte must pass through ASCII -> HZ unchanged
for ($byte = 0; $byte < 0x80; $byte++) {
    if ($byte === 0x7E) {
        continue; // '~' is special and is tested separately
    }
    $char = chr($byte);
    testValidString($char, $char, 'ASCII', 'HZ');
}
echo "Tested ASCII -> HZ\n";

// ...and the same bytes must pass through HZ -> ASCII unchanged
for ($byte = 0; $byte < 0x80; $byte++) {
    if ($byte === 0x7E) {
        continue; // '~' is special and is tested separately
    }
    $char = chr($byte);
    testValidString($char, $char, 'HZ', 'ASCII');
}
echo "Tested HZ -> ASCII\n";

// Outside of ~{ ~} escapes, any byte with the high bit set is invalid and must be
// replaced by the substitution character. The upper bound is inclusive so that
// 0xFF is covered as well.
for ($i = 0x80; $i <= 0xFF; $i++) {
    testInvalidString(chr($i), '%', 'HZ', 'ASCII');
}
echo "Tested non-ASCII bytes in ASCII mode\n";

// '~~' stands for one literal '~'
testValidString('~~', '~', 'HZ', 'ASCII');

// These escapes produce no output at all. The fifth argument is false here; it is
// the same slot that receives $reversible further down in this script.
foreach (["~\n", '~{~}', "~{~\n~}"] as $emptyEscape) {
    testValidString($emptyEscape, '', 'HZ', 'ASCII', false);
}
echo "Tested valid ~ escapes\n";

// Try every byte after '~', both in ASCII mode and in GB mode (inside ~{ ... ~}).
// Anything which does not form one of the valid escapes must yield the
// substitution character. The upper bound is inclusive so that 0xFF is tried too.
for ($i = 0; $i <= 0xFF; $i++) {
    if ($i === 0x0A) {
        continue; // "~\n" is a valid escape in both modes
    }
    $byte = chr($i);

    // ASCII mode: '~~' and '~{' are valid, everything else is not
    if ($i !== 0x7E && $i !== 0x7B) {
        testInvalidString('~' . $byte, '%', 'HZ', 'ASCII');
    }

    // GB mode: only '~}' is valid
    if ($i !== 0x7D) {
        testInvalidString('~{~' . $byte . '~}', '%', 'HZ', 'ASCII');
    }
}
echo "Tested all invalid ~ escapes\n";

// Load the GB2312 conversion table into $toUnicode and $fromUnicode
$tablePath = __DIR__ . '/data/GB2312.txt';
readConversionTable($tablePath, $toUnicode, $fromUnicode);

// NOTE(review): findInvalidChars() is defined elsewhere. Below, the keys of
// $invalid are used as invalid byte sequences and the keys of $truncated as
// truncated ones.
findInvalidChars($toUnicode, $invalid, $truncated);

// Per the original comment, two characters convert to Unicode 0x2225, so this one
// is marked as not reversible
$irreversible = ["\x21\x2C" => true];

// Test all good GB2312 characters within ~{ ~} escapes, one character per test
// string, in shuffled order
$goodChars = array_keys($toUnicode);
shuffle($goodChars);
while (count($goodChars) > 0) {
    $goodChar   = array_pop($goodChars);
    $reversible = !isset($irreversible[$goodChar]);
    $expected   = '' . $toUnicode[$goodChar];

    testValidString('~{' . $goodChar . '~}', $expected, 'HZ', 'UTF-16BE', $reversible);
}

// Test all invalid GB2312 characters within ~{ ~} escapes.
// '~' is removed from the set: escape sequences are tested separately.
unset($invalid["~"]);
$badChars = array_keys($invalid);
$goodChars = [];
while (count($badChars) > 0) {
    // Refill and reshuffle the pool of valid characters whenever it runs dry
    if (count($goodChars) === 0) {
        $goodChars = array_keys($toUnicode);
        shuffle($goodChars);
    }
    $goodChar = array_pop($goodChars);
    $badChar  = array_pop($badChars);

    // The invalid sequence must turn into '%' (as UTF-16BE), while the valid
    // character after it must still be converted normally
    testInvalidString(
        '~{' . $badChar . $goodChar . '~}',
        "\x00%" . $toUnicode[$goodChar],
        'HZ',
        'UTF-16BE'
    );
}

// Truncated sequences, left at the very end of the input after '~{', must be
// flagged as invalid too
foreach (array_keys($truncated) as $truncatedChar) {
    testInvalidString('~{' . $truncatedChar, "\x00%", 'HZ', 'UTF-16BE');
}

echo "Tested HZ -> UTF-16BE (for all GB2312 characters)\n";

// Recompute $invalid, this time from the Unicode side of the table. The last
// argument maps every byte value 0x00-0xFF to 2.
$byteLengths = array_fill_keys(range(0, 0xFF), 2);
findInvalidChars($fromUnicode, $invalid, $unused, $byteLengths);

// Although they do not appear in the Unicode -> GB2312 map, ASCII characters *are*
// valid to convert to HZ
foreach (range(0, 0x7F) as $asciiByte) {
    unset($invalid["\x00" . chr($asciiByte)]);
}

$badChars = array_keys($invalid);
$goodChars = [];
while (count($badChars) > 0) {
    // Refill and reshuffle the pool of convertible characters whenever it runs dry
    if (count($goodChars) === 0) {
        $goodChars = array_keys($fromUnicode);
        shuffle($goodChars);
    }
    $goodChar = array_pop($goodChars);
    $badChar  = array_pop($badChars);

    // Expect '%' for the unconvertible character, followed by the good character
    // wrapped in ~{ ~} escapes
    $expected = "%~{" . $fromUnicode[$goodChar] . "~}";

    convertInvalidString($badChar . $goodChar, $expected, 'UTF-16BE', 'HZ');
}

echo "Tested UTF-16BE -> HZ (for all GB2312 characters)\n";

// Test "long" illegal character markers: each of these invalid HZ inputs is
// expected to convert to '%' with the "long" substitution mode selected
mb_substitute_character("long");
foreach (["~A", "\x80", "~{\x22\x21"] as $badInput) {
    convertInvalidString($badInput, "%", "HZ", "UTF-8");
}

echo "Done!\n";
?>
--EXPECT--
Tested ASCII -> HZ
Tested HZ -> ASCII
Tested non-ASCII bytes in ASCII mode
Tested valid ~ escapes
Tested all invalid ~ escapes
Tested HZ -> UTF-16BE (for all GB2312 characters)
Tested UTF-16BE -> HZ (for all GB2312 characters)
Done!
