1--TEST--
2Exhaustive test of verification and conversion of CP936 text
3--EXTENSIONS--
4mbstring
5--SKIPIF--
6<?php
7if (getenv("SKIP_SLOW_TESTS")) die("skip slow test");
8?>
9--FILE--
10<?php
11include('encoding_tests.inc');
12
13srand(1000); // Make results consistent
14mb_substitute_character(0x25); // '%'
15readConversionTable(__DIR__ . '/data/CP936.txt', $toUnicode, $fromUnicode);
16
17// Unicode has more than one codepoint for a 'tilde' character
18// On output, we emit U+FF5E FULLWIDTH TILDE, but we accept U+223C as input
19$fromUnicode["\x22\x3c"] = "\xa1\xab";
20
21// Circle
22// On output, we emit U+00B0 RING OPERATOR, but we accept U+2218 as input
23$fromUnicode["\x22\x18"] = "\xa1\xe3";
24
25// Overline
26// On output, we emit U+FFE3 FULLWIDTH MACRON, but we accept U+203E as input
27$fromUnicode["\x20\x3e"] = "\xa3\xfe";
28
29// We support some CJK compatibility ideographs
30// Ref: https://en.wikipedia.org/wiki/CJK_Compatibility_Ideographs
31// These are accepted when converting Unicode -> CP936, but are not produced
32// when converting CP936 -> Unicode
33$fromUnicode["\xf9\x00"] = "\xd8\x4d";
34$fromUnicode["\xf9\x01"] = "\xb8\xfc";
35$fromUnicode["\xf9\x02"] = "\xdc\x87";
36$fromUnicode["\xf9\x03"] = "\xd9\x5a";
37$fromUnicode["\xf9\x04"] = "\xbb\xac";
38$fromUnicode["\xf9\x05"] = "\xb4\xae";
39$fromUnicode["\xf9\x06"] = "\xbe\xe4";
40$fromUnicode["\xf9\x07"] = "\xfd\x94";
41$fromUnicode["\xf9\x08"] = "\xfd\x94";
42$fromUnicode["\xf9\x09"] = "\xc6\xf5";
43$fromUnicode["\xf9\x0a"] = "\xbd\xf0";
44$fromUnicode["\xf9\x0b"] = "\xc0\xae";
45$fromUnicode["\xf9\x0c"] = "\xc4\xce";
46$fromUnicode["\xf9\x0d"] = "\x91\xd0";
47$fromUnicode["\xf9\x0e"] = "\xb0\x5d";
48$fromUnicode["\xf9\x0f"] = "\xc1\x5f";
49$fromUnicode["\xf9\x10"] = "\xcc\x7d";
50$fromUnicode["\xf9\x11"] = "\xc2\xdd";
51$fromUnicode["\xf9\x12"] = "\xc2\xe3";
52$fromUnicode["\xf9\x13"] = "\xdf\x89";
53$fromUnicode["\xf9\x14"] = "\x98\xb7";
54$fromUnicode["\xf9\x15"] = "\xc2\xe5";
55$fromUnicode["\xf9\x16"] = "\xc0\xd3";
56$fromUnicode["\xf9\x17"] = "\xe7\xf3";
57$fromUnicode["\xf9\x18"] = "\xc2\xe4";
58$fromUnicode["\xf9\x19"] = "\xc0\xd2";
59$fromUnicode["\xf9\x1a"] = "\xf1\x98";
60$fromUnicode["\xf9\x1b"] = "\x81\x79";
61$fromUnicode["\xf9\x1c"] = "\xc2\xd1";
62$fromUnicode["\xf9\x1d"] = "\x99\xda";
63$fromUnicode["\xf9\x1e"] = "\xa0\x80";
64$fromUnicode["\xf9\x1f"] = "\xcc\x6d";
65$fromUnicode["\xf9\x20"] = "\xfb\x5b";
66$fromUnicode["\xf9\x21"] = "\x8d\xb9";
67$fromUnicode["\xf9\x22"] = "\x9e\x45";
68$fromUnicode["\xf9\x23"] = "\xcb\x7b";
69$fromUnicode["\xf9\x24"] = "\xd2\x68";
70$fromUnicode["\xf9\x25"] = "\xc0\xad";
71$fromUnicode["\xf9\x26"] = "\xc5\x44";
72$fromUnicode["\xf9\x27"] = "\xcf\x9e";
73$fromUnicode["\xf9\x28"] = "\xc0\xc8";
74$fromUnicode["\xf9\x29"] = "\xc0\xca";
75$fromUnicode["\xf9\x2a"] = "\xc0\xcb";
76$fromUnicode["\xf9\x2b"] = "\xc0\xc7";
77$fromUnicode["\xf9\x2c"] = "\xfd\x9c";
78$fromUnicode["\xf9\x2d"] = "\x81\xed";
79$fromUnicode["\xf9\x2e"] = "\xc0\xe4";
80$fromUnicode["\xf9\x2f"] = "\x84\xda";
81$fromUnicode["\xf9\x30"] = "\x93\xef";
82$fromUnicode["\xf9\x31"] = "\x99\xa9";
83$fromUnicode["\xf9\x32"] = "\xa0\x74";
84$fromUnicode["\xf9\x33"] = "\xb1\x52";
85$fromUnicode["\xf9\x34"] = "\xc0\xcf";
86$fromUnicode["\xf9\x35"] = "\xcc\x4a";
87$fromUnicode["\xf9\x36"] = "\xcc\x94";
88$fromUnicode["\xf9\x37"] = "\xc2\xb7";
89$fromUnicode["\xf9\x38"] = "\xc2\xb6";
90$fromUnicode["\xf9\x39"] = "\xf4\x94";
91$fromUnicode["\xf9\x3a"] = "\xfa\x98";
92$fromUnicode["\xf9\x3b"] = "\xc2\xb5";
93$fromUnicode["\xf9\x3c"] = "\xb5\x93";
94$fromUnicode["\xf9\x3d"] = "\xbe\x47";
95$fromUnicode["\xf9\x3e"] = "\xc7\x8a";
96$fromUnicode["\xf9\x3f"] = "\xe4\x9b";
97$fromUnicode["\xf9\x40"] = "\xc2\xb9";
98$fromUnicode["\xf9\x41"] = "\xd5\x93";
99$fromUnicode["\xf9\x42"] = "\x89\xc5";
100$fromUnicode["\xf9\x43"] = "\xc5\xaa";
101$fromUnicode["\xf9\x44"] = "\xbb\x5c";
102$fromUnicode["\xf9\x45"] = "\xc3\x40";
103$fromUnicode["\xf9\x46"] = "\xc0\xce";
104$fromUnicode["\xf9\x47"] = "\xc0\xda";
105$fromUnicode["\xf9\x48"] = "\xd9\x54";
106$fromUnicode["\xf9\x49"] = "\xc0\xd7";
107$fromUnicode["\xf9\x4a"] = "\x89\xbe";
108$fromUnicode["\xf9\x4b"] = "\x8c\xd2";
109$fromUnicode["\xf9\x4c"] = "\x98\xc7";
110$fromUnicode["\xf9\x4d"] = "\x9c\x49";
111$fromUnicode["\xf9\x4e"] = "\xc2\xa9";
112$fromUnicode["\xf9\x4f"] = "\xc0\xdb";
113$fromUnicode["\xf9\x50"] = "\xbf\x7c";
114$fromUnicode["\xf9\x51"] = "\xc2\xaa";
115$fromUnicode["\xf9\x52"] = "\xc0\xd5";
116$fromUnicode["\xf9\x53"] = "\xc0\xdf";
117$fromUnicode["\xf9\x54"] = "\x84\x43";
118$fromUnicode["\xf9\x55"] = "\xc1\xe8";
119$fromUnicode["\xf9\x56"] = "\xb6\xa0";
120$fromUnicode["\xf9\x57"] = "\xbe\x63";
121$fromUnicode["\xf9\x58"] = "\xc1\xe2";
122$fromUnicode["\xf9\x59"] = "\xc1\xea";
123$fromUnicode["\xf9\x5a"] = "\xd7\x78";
124$fromUnicode["\xf9\x5b"] = "\x92\x82";
125$fromUnicode["\xf9\x5c"] = "\x98\xb7";
126$fromUnicode["\xf9\x5d"] = "\xd6\x5a";
127$fromUnicode["\xf9\x5e"] = "\xb5\xa4";
128$fromUnicode["\xf9\x5f"] = "\x8c\x8e";
129$fromUnicode["\xf9\x60"] = "\xc5\xad";
130$fromUnicode["\xf9\x61"] = "\xc2\xca";
131$fromUnicode["\xf9\x62"] = "\xae\x90";
132$fromUnicode["\xf9\x63"] = "\xb1\xb1";
133$fromUnicode["\xf9\x64"] = "\xb4\x91";
134$fromUnicode["\xf9\x65"] = "\xb1\xe3";
135$fromUnicode["\xf9\x66"] = "\x8f\xcd";
136$fromUnicode["\xf9\x67"] = "\xb2\xbb";
137$fromUnicode["\xf9\x68"] = "\xc3\xda";
138$fromUnicode["\xf9\x69"] = "\x94\xb5";
139$fromUnicode["\xf9\x6a"] = "\xcb\xf7";
140$fromUnicode["\xf9\x6b"] = "\x85\xa2";
141$fromUnicode["\xf9\x6c"] = "\xc8\xfb";
142$fromUnicode["\xf9\x6d"] = "\xca\xa1";
143$fromUnicode["\xf9\x6e"] = "\xc8\x7e";
144$fromUnicode["\xf9\x6f"] = "\xd5\x66";
145$fromUnicode["\xf9\x70"] = "\x9a\xa2";
146$fromUnicode["\xf9\x71"] = "\xb3\xbd";
147$fromUnicode["\xf9\x72"] = "\xc9\xf2";
148$fromUnicode["\xf9\x73"] = "\xca\xb0";
149$fromUnicode["\xf9\x74"] = "\xc8\xf4";
150$fromUnicode["\xf9\x75"] = "\xc2\xd3";
151$fromUnicode["\xf9\x76"] = "\xc2\xd4";
152$fromUnicode["\xf9\x77"] = "\xc1\xc1";
153$fromUnicode["\xf9\x78"] = "\x83\xc9";
154$fromUnicode["\xf9\x7a"] = "\xc1\xba";
155$fromUnicode["\xf9\x7b"] = "\xbc\x5a";
156$fromUnicode["\xf9\x7c"] = "\xc1\xbc";
157$fromUnicode["\xf9\x7d"] = "\xd5\x8f";
158$fromUnicode["\xf9\x7e"] = "\xc1\xbf";
159$fromUnicode["\xf9\x7f"] = "\x84\xee";
160$fromUnicode["\xf9\x80"] = "\x85\xce";
161$fromUnicode["\xf9\x81"] = "\xc5\xae";
162$fromUnicode["\xf9\x82"] = "\x8f\x5d";
163$fromUnicode["\xf9\x83"] = "\xc2\xc3";
164$fromUnicode["\xf9\x84"] = "\x9e\x56";
165$fromUnicode["\xf9\x85"] = "\xb5\x5a";
166$fromUnicode["\xf9\x86"] = "\xe9\x82";
167$fromUnicode["\xf9\x87"] = "\xf3\x50";
168$fromUnicode["\xf9\x88"] = "\xfb\x90";
169$fromUnicode["\xf9\x89"] = "\xc0\xe8";
170$fromUnicode["\xf9\x8a"] = "\xc1\xa6";
171$fromUnicode["\xf9\x8b"] = "\x95\xd1";
172$fromUnicode["\xf9\x8c"] = "\x9a\x76";
173$fromUnicode["\xf9\x8d"] = "\xde\x5d";
174$fromUnicode["\xf9\x8e"] = "\xc4\xea";
175$fromUnicode["\xf9\x8f"] = "\x91\x7a";
176$fromUnicode["\xf9\x90"] = "\x91\xd9";
177$fromUnicode["\xf9\x91"] = "\x93\xd3";
178$fromUnicode["\xf9\x92"] = "\x9d\x69";
179$fromUnicode["\xf9\x93"] = "\x9f\x92";
180$fromUnicode["\xf9\x94"] = "\xad\x49";
181$fromUnicode["\xf9\x95"] = "\xfd\x9e";
182$fromUnicode["\xf9\x96"] = "\xbe\x9a";
183$fromUnicode["\xf9\x97"] = "\xc2\x93";
184$fromUnicode["\xf9\x98"] = "\xdd\x82";
185$fromUnicode["\xf9\x99"] = "\xc9\x8f";
186$fromUnicode["\xf9\x9a"] = "\xdf\x42";
187$fromUnicode["\xf9\x9b"] = "\xe5\x80";
188$fromUnicode["\xf9\x9c"] = "\xc1\xd0";
189$fromUnicode["\xf9\x9d"] = "\xc1\xd3";
190$fromUnicode["\xf9\x9e"] = "\xd1\xca";
191$fromUnicode["\xf9\x9f"] = "\xc1\xd2";
192$fromUnicode["\xf9\xa0"] = "\xc1\xd1";
193$fromUnicode["\xf9\xa1"] = "\xd5\x66";
194$fromUnicode["\xf9\xa2"] = "\xc1\xae";
195$fromUnicode["\xf9\xa3"] = "\xc4\xee";
196$fromUnicode["\xf9\xa4"] = "\xc4\xed";
197$fromUnicode["\xf9\xa5"] = "\x9a\x9a";
198$fromUnicode["\xf9\xa6"] = "\xba\x9f";
199$fromUnicode["\xf9\xa7"] = "\xab\x43";
200$fromUnicode["\xf9\xa8"] = "\xc1\xee";
201$fromUnicode["\xf9\xa9"] = "\xe0\xf2";
202$fromUnicode["\xf9\xaa"] = "\x8c\x8e";
203$fromUnicode["\xf9\xab"] = "\x8e\x58";
204$fromUnicode["\xf9\xac"] = "\xc1\xaf";
205$fromUnicode["\xf9\xad"] = "\xc1\xe1";
206$fromUnicode["\xf9\xae"] = "\xac\x93";
207$fromUnicode["\xf9\xaf"] = "\xc1\xe7";
208$fromUnicode["\xf9\xb0"] = "\xf1\xf6";
209$fromUnicode["\xf9\xb1"] = "\xe2\x8f";
210$fromUnicode["\xf9\xb2"] = "\xc1\xe3";
211$fromUnicode["\xf9\xb3"] = "\xec\x60";
212$fromUnicode["\xf9\xb4"] = "\xee\x49";
213$fromUnicode["\xf9\xb5"] = "\xc0\xfd";
214$fromUnicode["\xf9\xb6"] = "\xb6\x59";
215$fromUnicode["\xf9\xb7"] = "\xf5\xb7";
216$fromUnicode["\xf9\xb8"] = "\xeb\x60";
217$fromUnicode["\xf9\xb9"] = "\x90\xba";
218$fromUnicode["\xf9\xba"] = "\xc1\xcb";
219$fromUnicode["\xf9\xbb"] = "\xc1\xc5";
220$fromUnicode["\xf9\xbc"] = "\xe5\xbc";
221$fromUnicode["\xf9\xbd"] = "\xc4\xf2";
222$fromUnicode["\xf9\xbe"] = "\xc1\xcf";
223$fromUnicode["\xf9\xbf"] = "\x98\xb7";
224$fromUnicode["\xf9\xc0"] = "\xc1\xc7";
225$fromUnicode["\xf9\xc1"] = "\xaf\x9f";
226$fromUnicode["\xf9\xc2"] = "\xde\xa4";
227$fromUnicode["\xf9\xc3"] = "\xdf\x7c";
228$fromUnicode["\xf9\xc4"] = "\xfd\x88";
229$fromUnicode["\xf9\xc5"] = "\x95\x9e";
230$fromUnicode["\xf9\xc6"] = "\xc8\xee";
231$fromUnicode["\xf9\xc7"] = "\x84\xa2";
232$fromUnicode["\xf9\xc8"] = "\x96\x83";
233$fromUnicode["\xf9\xc9"] = "\xc1\xf8";
234$fromUnicode["\xf9\xca"] = "\xc1\xf7";
235$fromUnicode["\xf9\xcb"] = "\xc1\xef";
236$fromUnicode["\xf9\xcc"] = "\xc1\xf0";
237$fromUnicode["\xf9\xcd"] = "\xc1\xf4";
238$fromUnicode["\xf9\xce"] = "\xc1\xf2";
239$fromUnicode["\xf9\xcf"] = "\xbc\x7e";
240$fromUnicode["\xf9\xd0"] = "\xee\x90";
241$fromUnicode["\xf9\xd1"] = "\xc1\xf9";
242$fromUnicode["\xf9\xd2"] = "\xc2\xbe";
243$fromUnicode["\xf9\xd3"] = "\xea\x91";
244$fromUnicode["\xf9\xd4"] = "\x82\x90";
245$fromUnicode["\xf9\xd5"] = "\x8d\x91";
246$fromUnicode["\xf9\xd6"] = "\x9c\x53";
247$fromUnicode["\xf9\xd7"] = "\xdd\x86";
248$fromUnicode["\xf9\xd8"] = "\xc2\xc9";
249$fromUnicode["\xf9\xd9"] = "\x90\xfc";
250$fromUnicode["\xf9\xda"] = "\xc0\xf5";
251$fromUnicode["\xf9\xdb"] = "\xc2\xca";
252$fromUnicode["\xf9\xdc"] = "\xc2\xa1";
253$fromUnicode["\xf9\xdd"] = "\xc0\xfb";
254$fromUnicode["\xf9\xde"] = "\xc0\xf4";
255$fromUnicode["\xf9\xdf"] = "\xc2\xc4";
256$fromUnicode["\xf9\xe0"] = "\xd2\xd7";
257$fromUnicode["\xf9\xe1"] = "\xc0\xee";
258$fromUnicode["\xf9\xe2"] = "\xc0\xe6";
259$fromUnicode["\xf9\xe3"] = "\xc4\xe0";
260$fromUnicode["\xf9\xe4"] = "\xc0\xed";
261$fromUnicode["\xf9\xe5"] = "\xc1\xa1";
262$fromUnicode["\xf9\xe6"] = "\xee\xbe";
263$fromUnicode["\xf9\xe8"] = "\xd1\x65";
264$fromUnicode["\xf9\xe9"] = "\xc0\xef";
265$fromUnicode["\xf9\xea"] = "\xeb\x78";
266$fromUnicode["\xf9\xeb"] = "\xc4\xe4";
267$fromUnicode["\xf9\xec"] = "\xc4\xe7";
268$fromUnicode["\xf9\xed"] = "\xc1\xdf";
269$fromUnicode["\xf9\xee"] = "\x9f\xfb";
270$fromUnicode["\xf9\xef"] = "\xad\x55";
271$fromUnicode["\xf9\xf0"] = "\xcc\x41";
272$fromUnicode["\xf9\xf2"] = "\xf7\x5b";
273$fromUnicode["\xf9\xf3"] = "\xf7\xeb";
274$fromUnicode["\xf9\xf4"] = "\xc1\xd6";
275$fromUnicode["\xf9\xf5"] = "\xc1\xdc";
276$fromUnicode["\xf9\xf6"] = "\xc5\x52";
277$fromUnicode["\xf9\xf7"] = "\xc1\xa2";
278$fromUnicode["\xf9\xf8"] = "\xf3\xd2";
279$fromUnicode["\xf9\xf9"] = "\xc1\xa3";
280$fromUnicode["\xf9\xfa"] = "\xa0\xee";
281$fromUnicode["\xf9\xfb"] = "\xd6\xcb";
282$fromUnicode["\xf9\xfc"] = "\xd7\x52";
283$fromUnicode["\xf9\xfd"] = "\xca\xb2";
284$fromUnicode["\xf9\xfe"] = "\xb2\xe8";
285$fromUnicode["\xf9\xff"] = "\xb4\xcc";
286$fromUnicode["\xfa\x00"] = "\xc7\xd0";
287$fromUnicode["\xfa\x01"] = "\xb6\xc8";
288$fromUnicode["\xfa\x02"] = "\xcd\xd8";
289$fromUnicode["\xfa\x03"] = "\xcc\xc7";
290$fromUnicode["\xfa\x04"] = "\xd5\xac";
291$fromUnicode["\xfa\x05"] = "\xb6\xb4";
292$fromUnicode["\xfa\x06"] = "\xb1\xa9";
293$fromUnicode["\xfa\x07"] = "\xdd\x97";
294$fromUnicode["\xfa\x08"] = "\xd0\xd0";
295$fromUnicode["\xfa\x09"] = "\xbd\xb5";
296$fromUnicode["\xfa\x0a"] = "\xd2\x8a";
297$fromUnicode["\xfa\x0b"] = "\xc0\xaa";
298$fromUnicode["\xfa\x10"] = "\x89\x56";
299$fromUnicode["\xfa\x12"] = "\xc7\xe7";
300$fromUnicode["\xfa\x15"] = "\x84\x44";
301$fromUnicode["\xfa\x16"] = "\xd8\x69";
302$fromUnicode["\xfa\x17"] = "\xd2\xe6";
303$fromUnicode["\xfa\x19"] = "\xc9\xf1";
304$fromUnicode["\xfa\x1a"] = "\xcf\xe9";
305$fromUnicode["\xfa\x1b"] = "\xb8\xa3";
306$fromUnicode["\xfa\x1c"] = "\xbe\xb8";
307$fromUnicode["\xfa\x1d"] = "\xbe\xab";
308$fromUnicode["\xfa\x1e"] = "\xd3\xf0";
309$fromUnicode["\xfa\x22"] = "\xd6\x54";
310$fromUnicode["\xfa\x25"] = "\xd2\xdd";
311$fromUnicode["\xfa\x26"] = "\xb6\xbc";
312$fromUnicode["\xfa\x2a"] = "\xef\x88";
313$fromUnicode["\xfa\x2b"] = "\xef\x95";
314$fromUnicode["\xfa\x2c"] = "\xf0\x5e";
315$fromUnicode["\xfa\x2d"] = "\xfa\x51";
316
317findInvalidChars($toUnicode, $invalid, $truncated);
318testAllValidChars($toUnicode, 'CP936', 'UTF-16BE');
319testAllInvalidChars($invalid, $toUnicode, 'CP936', 'UTF-16BE', "\x00%");
320testTruncatedChars($truncated, 'CP936', 'UTF-16BE', "\x00%");
321echo "Tested CP936 -> UTF-16BE\n";
322
323findInvalidChars($fromUnicode, $invalid, $unused, array_fill_keys(range(0,0xFF), 2));
324convertAllInvalidChars($invalid, $fromUnicode, 'UTF-16BE', 'CP936', '%');
325echo "Tested UTF-16BE -> CP936\n";
326
327// Test "long" illegal character markers
328mb_substitute_character("long");
329convertInvalidString("\x81\x20", "%", "CP936", "UTF-8");
330convertInvalidString("\x81\x7F", "%", "CP936", "UTF-8");
331convertInvalidString("\xFE\xFF", "%", "CP936", "UTF-8");
332
333echo "Done!\n";
334?>
335--EXPECT--
336Tested CP936 -> UTF-16BE
337Tested UTF-16BE -> CP936
338Done!
339