1--TEST--
2Exhaustive test of verification and conversion of GB18030-2022 text
3--EXTENSIONS--
4mbstring
5--SKIPIF--
<?php
// The exhaustive mapping tables make this test slow; let the runner opt out.
if (getenv("SKIP_SLOW_TESTS")) {
    die("skip slow test");
}
// With 4-byte integers the mapping-table reader in --FILE-- falls back to
// ctype_xdigit() to parse codes that do not fit in a PHP integer.
if (PHP_INT_SIZE == 4 && !extension_loaded("ctype")) {
    die("skip needs ctype extension on 32-bit");
}
?>
10--FILE--
<?php
include('encoding_tests.inc');

// Fixed seed so any randomised checks give the same results on every run
srand(2323);
// Substitution character used for unconvertible input: '%'
mb_substitute_character(0x25);

// Hand-picked two-byte GB18030 codes (keys) paired with the UTF-16BE code unit
// each one is expected to convert to (values).
// NOTE(review): the variable name suggests these are the mappings that changed
// in the 2022 revision of the standard - confirm against the spec.
$updatedMappings = [
  // \xA6\xD9 .. \xA6\xF3  =>  U+FE10 .. U+FE19
  "\xA6\xD9" => "\xFE\x10",
  "\xA6\xDA" => "\xFE\x12",
  "\xA6\xDB" => "\xFE\x11",
  "\xA6\xDC" => "\xFE\x13",
  "\xA6\xDD" => "\xFE\x14",
  "\xA6\xDE" => "\xFE\x15",
  "\xA6\xDF" => "\xFE\x16",
  "\xA6\xEC" => "\xFE\x17",
  "\xA6\xED" => "\xFE\x18",
  "\xA6\xF3" => "\xFE\x19",

  // \xA8\xBC, \xA8\xBF and \xA9\x89 .. \xA9\x95
  "\xA8\xBC" => "\x1E\x3F",
  "\xA8\xBF" => "\x01\xF9",
  "\xA9\x89" => "\x30\x3E",
  "\xA9\x8A" => "\x2F\xF0",
  "\xA9\x8B" => "\x2F\xF1",
  "\xA9\x8C" => "\x2F\xF2",
  "\xA9\x8D" => "\x2F\xF3",
  "\xA9\x8E" => "\x2F\xF4",
  "\xA9\x8F" => "\x2F\xF5",
  "\xA9\x90" => "\x2F\xF6",
  "\xA9\x91" => "\x2F\xF7",
  "\xA9\x92" => "\x2F\xF8",
  "\xA9\x93" => "\x2F\xF9",
  "\xA9\x94" => "\x2F\xFA",
  "\xA9\x95" => "\x2F\xFB",

  // \xFE\x50 .. \xFE\x5F
  "\xFE\x50" => "\x2E\x81",
  "\xFE\x51" => "\xE8\x16",
  "\xFE\x52" => "\xE8\x17",
  "\xFE\x53" => "\xE8\x18",
  "\xFE\x54" => "\x2E\x84",
  "\xFE\x55" => "\x34\x73",
  "\xFE\x56" => "\x34\x47",
  "\xFE\x57" => "\x2E\x88",
  "\xFE\x58" => "\x2E\x8B",
  "\xFE\x59" => "\x9F\xB4",
  "\xFE\x5A" => "\x35\x9E",
  "\xFE\x5B" => "\x36\x1A",
  "\xFE\x5C" => "\x36\x0E",
  "\xFE\x5D" => "\x2E\x8C",
  "\xFE\x5E" => "\x2E\x97",
  "\xFE\x5F" => "\x39\x6E",

  // \xFE\x60 .. \xFE\x6F
  "\xFE\x60" => "\x39\x18",
  "\xFE\x61" => "\x9F\xB5",
  "\xFE\x62" => "\x39\xCF",
  "\xFE\x63" => "\x39\xDF",
  "\xFE\x64" => "\x3A\x73",
  "\xFE\x65" => "\x39\xD0",
  "\xFE\x66" => "\x9F\xB6",
  "\xFE\x67" => "\x9F\xB7",
  "\xFE\x68" => "\x3B\x4E",
  "\xFE\x69" => "\x3C\x6E",
  "\xFE\x6A" => "\x3C\xE0",
  "\xFE\x6B" => "\x2E\xA7",
  "\xFE\x6C" => "\xE8\x31",
  "\xFE\x6D" => "\x9F\xB8",
  "\xFE\x6E" => "\x2E\xAA",
  "\xFE\x6F" => "\x40\x56",

  // Remaining individual codes in the \xFE row
  "\xFE\x76" => "\xE8\x3B",
  "\xFE\x7E" => "\x9F\xB9",
  "\xFE\x90" => "\x9F\xBA",
  "\xFE\x91" => "\xE8\x55",
  "\xFE\xA0" => "\x9F\xBB",
];

// Check the table in both directions: decode first, then encode with the
// key/value roles swapped.
testAllValidChars($updatedMappings, 'GB18030-2022', 'UTF-16BE', false);
testAllValidChars(array_flip($updatedMappings), 'UTF-16BE', 'GB18030-2022', false);
85
// Sample of codepoints outside the BMP: UTF-32BE code unit (key) paired with
// the 4-byte GB18030 code it is expected to encode to (value).
// The first five entries are in plane 16 (U+10xxxx); the rest run in ascending
// order from plane 1 through plane 15. Every key appears exactly once.
$sampleSMP = [
  "\x00\x10\x03\x08" => "\xDE\x30\xE6\x36",
  "\x00\x10\x14\xEB" => "\xDE\x34\xB8\x35",
  "\x00\x10\x29\x76" => "\xDE\x38\xCE\x34",
  "\x00\x10\x40\x6E" => "\xDF\x33\xA4\x34",
  "\x00\x10\x78\x7B" => "\xE0\x34\xD5\x33",
  "\x00\x01\x25\x2A" => "\x90\x37\xC6\x34",
  "\x00\x01\x5B\xA4" => "\x91\x38\xCF\x30",
  "\x00\x01\x6D\x81" => "\x92\x32\xA0\x33",
  "\x00\x01\x7F\xB2" => "\x92\x35\xF8\x30",
  "\x00\x01\x89\x9B" => "\x92\x37\xF9\x37",
  "\x00\x01\x9E\x77" => "\x93\x32\x99\x37",
  "\x00\x02\x08\x9A" => "\x95\x33\xE0\x38",
  "\x00\x02\x1B\x00" => "\x95\x37\xBF\x38",
  "\x00\x02\x31\xBE" => "\x96\x32\x90\x30",
  "\x00\x02\x64\xD4" => "\x97\x32\xBF\x38",
  "\x00\x02\xA9\xA0" => "\x98\x36\xBD\x30",
  "\x00\x02\xBA\x38" => "\x98\x39\xEB\x38",
  "\x00\x03\x1C\x13" => "\x9A\x39\xDC\x39",
  "\x00\x03\x20\x6D" => "\x9B\x30\xCE\x33",
  "\x00\x03\x22\xA9" => "\x9B\x31\x89\x35",
  "\x00\x03\x39\xB3" => "\x9B\x35\xDF\x33",
  "\x00\x03\xA7\xF2" => "\x9D\x38\x93\x36",
  "\x00\x03\xDF\xFB" => "\x9E\x39\xC4\x31",
  "\x00\x04\x01\x69" => "\x9F\x36\xA9\x39",
  "\x00\x04\x23\x79" => "\xA0\x33\x9F\x39",
  "\x00\x04\x26\x52" => "\xA0\x33\xE8\x38",
  "\x00\x04\x38\xDB" => "\xA0\x37\xCB\x33",
  "\x00\x04\x46\x84" => "\xA1\x30\xAF\x30",
  "\x00\x04\x6C\x7C" => "\xA1\x38\x8B\x30",
  "\x00\x04\x78\x41" => "\xA2\x30\xBC\x33",
  "\x00\x04\x97\x32" => "\xA2\x36\xE0\x34",
  "\x00\x04\x9E\xCC" => "\xA2\x38\xA7\x30",
  "\x00\x04\xC5\xDB" => "\xA3\x36\x9E\x39",
  "\x00\x04\xF4\xE2" => "\xA4\x35\xE4\x38",
  "\x00\x05\x3B\xA6" => "\xA6\x30\x96\x34",
  "\x00\x05\x76\x53" => "\xA7\x32\x8C\x35",
  "\x00\x05\xEA\x9F" => "\xA9\x35\xDB\x37",
  "\x00\x06\x12\x29" => "\xAA\x33\xDF\x39",
  "\x00\x06\x1B\x9E" => "\xAA\x35\xD6\x30",
  "\x00\x06\x3B\x26" => "\xAB\x32\x8B\x32",
  "\x00\x06\x4C\xA8" => "\xAB\x35\xD1\x34",
  "\x00\x06\x63\x3E" => "\xAC\x30\x9D\x36",
  "\x00\x06\xB3\xA1" => "\xAD\x36\xC7\x35",
  "\x00\x07\x0A\x31" => "\xAF\x34\x93\x35",
  "\x00\x07\x22\xA7" => "\xAF\x39\x8F\x37",
  "\x00\x07\x79\xA3" => "\xB1\x36\xE4\x35",
  "\x00\x07\x88\xFA" => "\xB1\x39\xF3\x32",
  "\x00\x07\xCE\xCA" => "\xB3\x34\x8C\x34",
  "\x00\x07\xF8\xD2" => "\xB4\x32\xD0\x34",
  "\x00\x08\x20\xF6" => "\xB5\x30\xE4\x30",
  "\x00\x08\xAD\x05" => "\xB7\x39\x9F\x35",
  "\x00\x08\xEA\x7E" => "\xB9\x31\xDD\x32",
  "\x00\x08\xF0\xB8" => "\xB9\x32\xFE\x36",
  "\x00\x09\x14\x07" => "\xBA\x30\x96\x35",
  "\x00\x09\x41\xDD" => "\xBA\x39\xBD\x39",
  "\x00\x09\x42\xEF" => "\xBA\x39\xD9\x33",
  "\x00\x09\xBA\x2B" => "\xBD\x33\xF5\x37",
  "\x00\x0A\x26\x00" => "\xBF\x35\xEA\x32",
  "\x00\x0A\x36\xE9" => "\xBF\x39\xA3\x31",
  "\x00\x0A\x7A\x20" => "\xC1\x32\xF5\x38",
  "\x00\x0A\x9C\x93" => "\xC1\x39\xF5\x37",
  "\x00\x0A\xC0\xD7" => "\xC2\x37\xA6\x31",
  "\x00\x0A\xD8\x77" => "\xC3\x32\x8C\x39",
  "\x00\x0B\x1A\x9B" => "\xC4\x35\xC4\x31",
  "\x00\x0B\x4F\x27" => "\xC5\x36\x9B\x33",
  "\x00\x0B\x72\x6D" => "\xC6\x33\xB0\x33",
  "\x00\x0B\xEE\x23" => "\xC8\x38\xC1\x33",
  "\x00\x0B\xF0\xDF" => "\xC8\x39\x89\x33",
  "\x00\x0C\x0B\xE1" => "\xC9\x34\xC6\x37",
  "\x00\x0C\x4C\x98" => "\xCA\x37\xD9\x34",
  "\x00\x0C\x5F\x41" => "\xCB\x31\xBF\x31",
  "\x00\x0C\x63\xE4" => "\xCB\x32\xB7\x38",
  "\x00\x0C\x70\x0A" => "\xCB\x34\xF2\x38",
  "\x00\x0C\xAD\x6A" => "\xCC\x37\xB0\x30",
  "\x00\x0C\xCC\x03" => "\xCD\x33\xCB\x33",
  "\x00\x0C\xD5\x4C" => "\xCD\x35\xBD\x30",
  "\x00\x0C\xE6\x70" => "\xCD\x38\xF9\x38",
  "\x00\x0D\x1B\x6A" => "\xCE\x39\xDC\x30",
  "\x00\x0D\x55\xEE" => "\xD0\x31\xCE\x30",
  "\x00\x0D\xBB\xB1" => "\xD2\x32\xA5\x31",
  "\x00\x0D\xC0\x4F" => "\xD2\x33\x9D\x33",
  "\x00\x0D\xFA\x84" => "\xD3\x35\x87\x34",
  "\x00\x0E\x16\x71" => "\xD4\x30\xDC\x33",
  "\x00\x0E\x1E\x03" => "\xD4\x32\xA2\x31",
  "\x00\x0E\x20\xE8" => "\xD4\x32\xEC\x32",
  "\x00\x0E\x39\x6A" => "\xD4\x37\xE9\x36",
  "\x00\x0E\x6A\x95" => "\xD5\x37\xE8\x33",
  "\x00\x0E\x7E\xCD" => "\xD6\x31\xF5\x39",
  "\x00\x0E\x80\x69" => "\xD6\x32\xA1\x31",
  "\x00\x0E\x9A\x7F" => "\xD6\x37\xC6\x39",
  "\x00\x0E\xEE\x12" => "\xD8\x34\xC4\x34",
  "\x00\x0E\xFC\xA1" => "\xD8\x37\xBF\x31",
  "\x00\x0F\x29\xB0" => "\xD9\x36\xD2\x36",
  "\x00\x0F\x2A\x12" => "\xD9\x36\xDC\x34",
  "\x00\x0F\x6C\x8C" => "\xDB\x30\x9E\x32",
  "\x00\x0F\xAF\x04" => "\xDC\x33\xDD\x38",
  "\x00\x0F\xBE\x65" => "\xDC\x36\xED\x35",
  "\x00\x0F\xE5\x88" => "\xDD\x34\xE7\x34",
  "\x00\x0F\xE7\xB1" => "\xDD\x35\xA0\x37",
  "\x00\x0F\xF4\x27" => "\xDD\x37\xE3\x37"];
// Only the encoding direction (UTF-32BE -> GB18030-2022) is checked with this table
testAllValidChars($sampleSMP, 'UTF-32BE', 'GB18030-2022', false);
200
/**
 * Parse a GB18030-2022 mapping table into two lookup arrays.
 *
 * Every data line must begin with two whitespace-separated hexadecimal
 * fields: a Unicode codepoint, then the GB18030 byte sequence written as a
 * single hex number. Lines starting with '#' and lines which do not match
 * that shape are ignored.
 *
 * @param string $path  Mapping table file to read.
 * @param array  $from  Output, reset on entry: GB18030 byte string => packed codepoint.
 * @param array  $to    Output, reset on entry: packed codepoint => GB18030 byte string.
 * @param bool   $utf32 Pack codepoints as 4 big-endian bytes (UTF-32BE)
 *                      instead of 2 (UTF-16BE).
 *
 * @throws RuntimeException if the table cannot be opened for reading.
 */
function readGB18030_2022_ConversionTable($path, &$from, &$to, $utf32 = false) {
    $from = [];
    $to   = [];

    // The table is only ever read; asking for write access as well would make
    // the test fail on read-only source checkouts.
    $fp = fopen($path, 'r');
    if ($fp === false)
        throw new RuntimeException("Cannot open conversion table: $path");

    try {
        // Compare against false explicitly so that a falsy line (e.g. a final
        // "0" without a newline) does not end the loop early.
        while (($line = fgets($fp, 256)) !== false) {
            if ($line[0] == '#')
                continue;
            if (sscanf($line, "%x\t%x", $codepoint, $char) != 2)
                continue;

            $codepoint = $utf32 ? pack('N', $codepoint) : pack('n', $codepoint);
            if ($char == PHP_INT_MAX) {
                // We may be on a 32-bit machine and testing a text encoding with 4-byte codes
                // (which can't be represented in a PHP integer), so decode the
                // second field two hex digits at a time instead.
                $char = "";
                $length = strlen($line);
                for ($i = strpos($line, "\t") + 1; $i < $length; $i += 2) {
                    $substr = substr($line, $i, 2);
                    if (!ctype_xdigit($substr))
                        break;
                    $char .= chr(hexdec($substr));
                }
            } else {
                // Emit exactly as many bytes as the value needs:
                // hex codes must not have leading zero bytes
                if ($char <= 0xFF)
                    $char = chr($char);
                else if ($char <= 0xFFFF)
                    $char = pack('n', $char);
                else if ($char <= 0xFFFFFF)
                    $char = chr($char >> 16) . pack('n', $char & 0xFFFF);
                else
                    $char = pack('N', $char);
            }
            $from[$char] = $codepoint;
            $to[$codepoint] = $char;
        }
    } finally {
        // Always release the handle, even if parsing throws
        fclose($fp);
    }
}
237
// Load the BMP mapping table. $toUnicode ends up keyed by GB18030 byte
// string, $fromUnicode by UTF-16BE code unit.
readGB18030_2022_ConversionTable(
    __DIR__ . '/data/GB18030-2022MappingTableBMP.txt',
    $toUnicode,
    $fromUnicode
);

// Fill $invalid and $truncated from the set of valid codes (helper presumably
// provided by encoding_tests.inc). 4-byte codes get their own tests below.
findInvalidChars($toUnicode, $invalid, $truncated);
242
// Filter predicate: true when $gb cannot be (the beginning of) a 4-byte code.
// A 4-byte candidate has a first byte in 0x81-0x84 or 0x90-0xE3 and, when a
// second byte is present, that byte is an ASCII digit (0x30-0x39).
function notFourByteCode($gb) {
  $first = ord($gb);
  $inLowRange  = $first >= 0x81 && $first <= 0x84;
  $inHighRange = $first >= 0x90 && $first <= 0xE3;
  if (!$inLowRange && !$inHighRange)
    return true;

  // Only the first byte is available, and it looks like a 4-byte lead
  if (strlen($gb) <= 1)
    return false;

  $second = ord($gb[1]);
  return $second < 0x30 || $second > 0x39;
}
247
// Keep only the candidates which are not (prefixes of) 4-byte codes; the
// arrays are filtered on their keys, which hold the byte sequences.
$invalid   = array_filter($invalid, 'notFourByteCode', ARRAY_FILTER_USE_KEY);
$truncated = array_filter($truncated, 'notFourByteCode', ARRAY_FILTER_USE_KEY);

// Decoding direction. "\x00%" is the substitution character '%' as a
// UTF-16BE code unit.
testAllValidChars($toUnicode, 'GB18030-2022', 'UTF-16BE', false);
testAllInvalidChars($invalid, $toUnicode, 'GB18030-2022', 'UTF-16BE', "\x00%");
testTruncatedChars($truncated, 'GB18030-2022', 'UTF-16BE', "\x00%");

echo "Tested GB18030-2022 (BMP) -> UTF-16BE\n";
256
// Linear index of a 4-byte code, counted from "\x81\x30\x81\x30" (index 0).
// $byte4 is the most significant byte and $byte1 the least significant one;
// $byte4 and $byte2 are offset from 0x81, $byte3 and $byte1 from 0x30.
// NOTE(review): nothing visible in this file calls this helper; the "one
// random 4-byte code for each BMP range" test it was written for is not here.
function fourByteCodeIndex($byte4, $byte3, $byte2, $byte1) {
  // Mixed-radix accumulation with radices 10, 126 and 10
  $index = $byte4 - 0x81;
  $index = ($index * 10) + ($byte3 - 0x30);
  $index = ($index * 126) + ($byte2 - 0x81);
  $index = ($index * 10) + ($byte1 - 0x30);
  return $index;
}
261
// Build the 4-byte string for a linear index, where index 0 is
// "\x81\x30\x81\x30". The first and third bytes start at 0x81, the second
// and fourth at 0x30.
function fourByteCodeFromIndex($index) {
  $perSecondByte = 10 * 126;      // indexes covered by one second-byte value
  $perFirstByte  = 10 * 126 * 10; // indexes covered by one first-byte value

  $byte4 = 0x81 + intdiv($index, $perFirstByte);
  $rest  = $index % $perFirstByte;

  $byte3 = 0x30 + intdiv($rest, $perSecondByte);
  $rest  = $rest % $perSecondByte;

  $byte2 = 0x81 + intdiv($rest, 10);
  $byte1 = 0x30 + ($rest % 10);

  return chr($byte4) . chr($byte3) . chr($byte2) . chr($byte1);
}
274
// Invalid 4-byte codes in range for BMP, each paired with the UTF-32BE output
// expected for it (one or two '%' error markers)
$invalidInBmpRange = [
  "\x81\x30\x81\xFF" => "\x00\x00\x00%",
  "\x84\x31\xA4\x40" => "\x00\x00\x00%",
  "\x84\x31\xA5\x30" => "\x00\x00\x00%",
  "\x84\x32\x81\x30" => "\x00\x00\x00%",
  "\x85\x31\x81\x30" => "\x00\x00\x00%\x00\x00\x00%",
];
foreach ($invalidInBmpRange as $gbBytes => $expectedUtf32) {
  testInvalidString($gbBytes, $expectedUtf32, "GB18030-2022", "UTF-32BE");
}

// Valid 4-byte codes for other Unicode planes: U+10000 and U+10FFFF
$validOtherPlanes = [
  "\x90\x30\x81\x30" => "\x00\x01\x00\x00",
  "\xE3\x32\x9A\x35" => "\x00\x10\xFF\xFF",
];
foreach ($validOtherPlanes as $gbBytes => $expectedUtf32) {
  testValidString($gbBytes, $expectedUtf32, "GB18030-2022", "UTF-32BE");
}

// Invalid 4-byte codes for other Unicode planes; in the last entry the '0'
// after the error marker is expected to come through as a normal character
$invalidOtherPlanes = [
  "\x90\x30\x81\xFF" => "\x00\x00\x00%",
  "\xE3\x32\x9A\x36" => "\x00\x00\x00%",
  "\xE4\x30\x81\x35" => "\x00\x00\x00%\x00\x00\x00%",
  "\x90\x30\x80\x30" => "\x00\x00\x00%\x00\x00\x00\x30",
];
foreach ($invalidOtherPlanes as $gbBytes => $expectedUtf32) {
  testInvalidString($gbBytes, $expectedUtf32, "GB18030-2022", "UTF-32BE");
}

echo "Tested GB18030-2022 (SMP) <-> UTF-32BE\n";

// Encoding direction for every BMP entry of the mapping table
testAllValidChars($fromUnicode, 'UTF-16BE', 'GB18030-2022', false);
echo "Tested UTF-16BE -> GB18030-2022 (BMP)\n";

// A UTF-32BE unit far above U+10FFFF becomes '%'; the '#' after it is kept
convertInvalidString("\xAA\xB8\x2D\x38\x00\x00\x00#", "%#", "UTF-32BE", "GB18030-2022");

// Test "long" illegal character markers: invalid input bytes are still
// expected to come out as a single '%'
mb_substitute_character("long");
foreach (["\x81\x30\x81\xFF", "\xE3\x32\x9A\x36"] as $gbBytes) {
  convertInvalidString($gbBytes, "%", "GB18030-2022", "UTF-8");
}

echo "Done!\n";
?>
307--EXPECT--
308Tested GB18030-2022 (BMP) -> UTF-16BE
309Tested GB18030-2022 (SMP) <-> UTF-32BE
310Tested UTF-16BE -> GB18030-2022 (BMP)
311Done!
312