xref: /php-src/ext/mbstring/tests/hz_encoding.phpt (revision 9c3972fb)
1--TEST--
2Exhaustive test of verification and conversion of HZ text
3--EXTENSIONS--
4mbstring
5--SKIPIF--
6<?php
// This test is exhaustive and slow; bow out when the environment asks for
// slow tests to be skipped.
if (getenv("SKIP_SLOW_TESTS")) {
    die("skip slow test");
}
8?>
9--FILE--
10<?php
11include('encoding_tests.inc');
srand(1000); // Fixed seed, so the shuffles later in this test are repeatable
mb_substitute_character(0x25); // Invalid input is replaced by '%'

// Every 7-bit byte must pass through unchanged when converting ASCII to HZ.
// '~' (0x7E) is special in HZ and is exercised separately further down.
foreach (range(0x00, 0x7F) as $byte) {
    if ($byte === 0x7E) {
        continue;
    }
    $char = chr($byte);
    testValidString($char, $char, 'ASCII', 'HZ');
}
echo "Tested ASCII -> HZ\n";

// The same bytes must also pass through unchanged in the opposite direction.
foreach (range(0x00, 0x7F) as $byte) {
    if ($byte === 0x7E) {
        continue;
    }
    $char = chr($byte);
    testValidString($char, $char, 'HZ', 'ASCII');
}
echo "Tested HZ -> ASCII\n";
26
// HZ is a 7-bit encoding: outside of a ~{ ... ~} section, any byte with the
// high bit set is invalid and must come out as the substitution character.
// The upper bound is inclusive so that 0xFF is checked too (the loop used to
// stop at 0xFE, leaving the last byte value untested).
for ($i = 0x80; $i <= 0xFF; $i++) {
    testInvalidString(chr($i), '%', 'HZ', 'ASCII');
}
echo "Tested non-ASCII bytes in ASCII mode\n";
31
// Well-formed '~' sequences. Each row is the exact argument list handed to
// testValidString(): input, expected output, source encoding, target encoding
// and, where given, a final `false` (the same position receives $reversible
// further down in this file, so presumably it disables the round-trip check).
$validEscapes = [
    ['~~',      '~',  'HZ', 'ASCII'],
    ["~\n",     '',   'HZ', 'ASCII', false],
    ['~{~}',    '',   'HZ', 'ASCII', false],
    ["~{~\n~}", '',   'HZ', 'ASCII', false],
    ['~~?',     '~?', 'HZ', 'ASCII'],
];
foreach ($validEscapes as $arguments) {
    testValidString(...$arguments);
}
echo "Tested valid ~ escapes\n";
38
// Follow '~' with every possible byte value and check that anything which does
// not form a valid escape is rejected, both in ASCII mode and inside a
// ~{ ... ~} (GB) section. The upper bound is inclusive so that 0xFF is covered
// as well (the loop used to stop at 0xFE).
for ($i = 0; $i <= 0xFF; $i++) {
    if ($i == 0x0A) {
        // "~\n" is a valid escape in both modes; see the valid escape tests
        continue;
    }

    $byte = chr($i);

    // In ASCII mode, "~~" (0x7E) and "~{" (0x7B) are valid, so skip those
    if ($i != 0x7E && $i != 0x7B) {
        testInvalidString("~" . $byte, '%', 'HZ', 'ASCII');
    }

    // In GB mode, "~}" (0x7D) is valid, so skip it
    if ($i != 0x7D) {
        testInvalidString("~{~" . $byte . "~}", '%', 'HZ', 'ASCII');
    }
}
echo "Tested all invalid ~ escapes\n";
49
readConversionTable(__DIR__ . '/data/GB2312.txt', $toUnicode, $fromUnicode);

findInvalidChars($toUnicode, $invalid, $truncated);

// Two characters in the conversion table map to Unicode 0x2225, so "\x21\x2C"
// is not expected to survive a round trip back from Unicode
$irreversible = ["\x21\x2C" => true];

// Test all good GB2312 characters within ~{ ~} escapes, one character per
// test string, in shuffled order
$remaining = array_keys($toUnicode);
shuffle($remaining);
while ($remaining) {
    $gbChar       = array_pop($remaining);
    $isReversible = !isset($irreversible[$gbChar]);

    testValidString('~{' . $gbChar . '~}', $toUnicode[$gbChar], 'HZ', 'UTF-16BE', $isReversible);
}
74
// Test all invalid GB2312 characters within ~{ ~} escapes.
// Escape sequences are left out here; they are handled separately.
unset($invalid["~"]);

// Each bad sequence is followed by a randomly chosen good character, so the
// expected output is one '%' (as UTF-16BE) and then that character's mapping.
// The pool of good characters is reshuffled whenever it runs dry.
$goodPool = [];
foreach (array_reverse(array_keys($invalid)) as $badChar) {
    if (!$goodPool) {
        $goodPool = array_keys($toUnicode);
        shuffle($goodPool);
    }
    $validChar = array_pop($goodPool);

    testInvalidString(
        '~{' . $badChar . $validChar . '~}',
        "\x00%" . $toUnicode[$validChar],
        'HZ',
        'UTF-16BE'
    );
}
91
// Sequences reported as truncated by findInvalidChars(), placed at the very
// end of the input inside an unterminated ~{ section, must produce exactly
// one '%' (as UTF-16BE)
foreach (array_keys($truncated) as $cutShort) {
    testInvalidString('~{' . $cutShort, "\x00%", 'HZ', 'UTF-16BE');
}

echo "Tested HZ -> UTF-16BE (for all GB2312 characters)\n";
98
findInvalidChars($fromUnicode, $invalid, $unused, array_fill_keys(range(0,0xFF), 2));

// ASCII characters are absent from the Unicode -> GB2312 map, yet they *are*
// valid to convert to HZ, so take them off the list of invalid inputs
foreach (range(0x00, 0x7F) as $ascii) {
    unset($invalid["\x00" . chr($ascii)]);
}

// Pair each unconvertible code unit with a randomly chosen convertible one:
// the expected result is '%' followed by the good character wrapped in ~{ ~}.
// The pool of good characters is reshuffled whenever it runs dry.
$goodPool = [];
foreach (array_reverse(array_keys($invalid)) as $badChar) {
    if (!$goodPool) {
        $goodPool = array_keys($fromUnicode);
        shuffle($goodPool);
    }
    $validChar = array_pop($goodPool);

    convertInvalidString(
        $badChar . $validChar,
        "%~{" . $fromUnicode[$validChar] . "~}",
        'UTF-16BE',
        'HZ'
    );
}

echo "Tested UTF-16BE -> HZ (for all GB2312 characters)\n";
121
// Regression tests

// A GB2312 character followed by a NUL: the ~} must be emitted before the NUL
$converted = mb_convert_encoding("\x7A\xFA\x00\x00", 'HZ', 'UTF-16BE');
if ($converted !== "~{\x73\x43~}\x00") {
    die("Bad");
}

// A lone '~' at the end of the input must produce no output at all
$converted = mb_convert_encoding("~", 'UTF-16BE', 'HZ');
if ($converted !== "") {
    die("Bad");
}
127// There had once been a bug whereby the output buffer would be overrun by one byte.
128// It was found by fuzzing. Reproducing it required a string which was long enough
129// and had a GB2312 character at the end.
130$str = "\xD9\x96C\xA7\x1B\xF6\xD8\x86\x94\xB0\xA0\xE1\x9D\x8C\xF8G\xBBMk\xD2Y\tt\xF1\x96d\x17JA\xF9\xF8\xCF\xDC\xFE\x8E\x0E\xC1\x84\xDA\xDBM\xC1\x87\x1AZ\xD5\xA6)\xFF%2\\\xCC\x02\x16]Y\xF0\x00\xEA\xE8{)\x81\xD5VQZ\x12\xB5\xBC\x9A\x91\xA0x\x02\xBA\xF6c\xACo\x9BH\xB7qx\xF5\x0F\t\x15\xDByx\xBA[\xC9\xE8r\xCD*:\xBF\x10P\xF1>Q\x07\xEE\xE5\x80\xAD\xB9\xA2\x9B\xF6\xE1,\x82\xC6q\x94E\xD4\x0B\xC6\xBCQe=\xC3\xE0\xC8\xE0R\x97\x14q\x0C\x1A\x7F\xE1\xC4\xB8U\x8A\x86\x93\xB6/\x84\x95\x06\x91W\xB2\xB6\x1F!\t X\x1A\xD5\xD6\xDA<\x81ib\x9A\x1B3\xD3\xB7:\xE2QS\xD0\x91\x99[K\xF2E\xBBjoh_5\x15 \xA4\xCC\xB0\x7F\x06\xB3,\xB3\xA7u\xB9\x82\x00\xE2f$\x1C\x84NsP\xFAiPB{\x8D\xBA\xB3[\x88\xA9\xB1\xA2r\x86\xFF<\xFD\xFB\xF8\xD6\xABq\x00z\xFA\x87\x8C_\xD9N\xF2\xFA\xEA\xEA\xAA\xD7\xFA\xA2\xD4\x85/\xFC\xE1}\xF7\x9C\x86\xDD\x12@\xC3\xDA\nC\x1Di\xA9\xB0\xC3\xB3\x04\xB2\x1A\x07BA\x02\xED\x11\xA4\xDAz\x96\xB5\xD0!p\xE2\xAD\xEDI\xEF\xF7\\\x05d.p\x07\xC4\x8B\x952\xCDz\x90\x8C\xA6U\xDB\xC7\xF4\x94\xE9\x16X\xF1\xCC\xB13\x07a9\x86]\xF9k\xA9\x87E\xCB\x89\x9Fd\x0E\x81m\xC6c\xDA\x9C\xE9\xAF\x80.\xFAq\xD9\xAAd\x1DB\x1F\x854\xE8\x82v)A\xF3\xB4\x1D\xE5\xF0\xFFu\x0E\x0C\xC4q\xF0\xE7\xB4p\x86\xE6]9\xD9\xA5O\xBAw\x1B\x8D&]\x9D\xE2\x0F\xD2\xD5\x13AY#\x81\x90\xB2\xE8\xDA\xD2\xFC>\xA0\x9A\xBD\x0B\xCC\x08>\x1E\xD1\xFEgr1'$\xEE\xA2!\x8A\xBB>\x11j#Pz_!?\xA8\x15\xCF\xCB\x84\x86\xC1\xF78:\xDA\xBCE\xA7\x02SO\x8B\x81>\x96\xBD\xFD2\x84\xC5\xFC\x19\xE5\xF4\xEFp\xF08K\xBB\xAE-[}\xE1\xDB\x8A%6\xC7\xC9";
// Only the tail of the result matters here: it must end with the final GB2312
// character followed by the closing ~}
$tail = substr(mb_convert_encoding($str, 'HZ', 'CP51932'), -4);
if ($tail !== "\x45\x49~}") {
    die("Bad");
}
133
// Test "long" illegal character markers: with the substitution mode switched
// to "long", each of these malformed HZ inputs is still expected to convert
// to a plain "%"
mb_substitute_character("long");
foreach (["~A", "\x80", "~{\x22\x21"] as $malformed) {
    convertInvalidString($malformed, "%", "HZ", "UTF-8");
}

echo "Done!\n";
141?>
142--EXPECT--
143Tested ASCII -> HZ
144Tested HZ -> ASCII
145Tested non-ASCII bytes in ASCII mode
146Tested valid ~ escapes
147Tested all invalid ~ escapes
148Tested HZ -> UTF-16BE (for all GB2312 characters)
149Tested UTF-16BE -> HZ (for all GB2312 characters)
150Done!
151