1--TEST--
2Torture test for UTF-{7,8,16,32}
3--EXTENSIONS--
4mbstring
5--SKIPIF--
6<?php
7if (getenv("SKIP_SLOW_TESTS")) die("skip slow test");
8?>
9--FILE--
10<?php
11srand(232); /* Make results consistent */
12mb_substitute_character(0x25); // '%'
13include('encoding_tests.inc');
14
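// Ranges of codepoints listed in UnicodeData.txt. Large blocks (e.g. CJK
// ideographs, Hangul syllables, surrogates, private use) appear below only as
// their first and last codepoints, not as one contiguous range.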
16$validRanges = [
17 [0x0, 0x377],
18 [0x37a, 0x37f],
19 [0x384, 0x38a],
20 [0x38c, 0x38c],
21 [0x38e, 0x3a1],
22 [0x3a3, 0x52f],
23 [0x531, 0x556],
24 [0x559, 0x58a],
25 [0x58d, 0x58f],
26 [0x591, 0x5c7],
27 [0x5d0, 0x5ea],
28 [0x5ef, 0x5f4],
29 [0x600, 0x61c],
30 [0x61e, 0x70d],
31 [0x70f, 0x74a],
32 [0x74d, 0x7b1],
33 [0x7c0, 0x7fa],
34 [0x7fd, 0x82d],
35 [0x830, 0x83e],
36 [0x840, 0x85b],
37 [0x85e, 0x85e],
38 [0x860, 0x86a],
39 [0x8a0, 0x8b4],
40 [0x8b6, 0x8c7],
41 [0x8d3, 0x983],
42 [0x985, 0x98c],
43 [0x98f, 0x990],
44 [0x993, 0x9a8],
45 [0x9aa, 0x9b0],
46 [0x9b2, 0x9b2],
47 [0x9b6, 0x9b9],
48 [0x9bc, 0x9c4],
49 [0x9c7, 0x9c8],
50 [0x9cb, 0x9ce],
51 [0x9d7, 0x9d7],
52 [0x9dc, 0x9dd],
53 [0x9df, 0x9e3],
54 [0x9e6, 0x9fe],
55 [0xa01, 0xa03],
56 [0xa05, 0xa0a],
57 [0xa0f, 0xa10],
58 [0xa13, 0xa28],
59 [0xa2a, 0xa30],
60 [0xa32, 0xa33],
61 [0xa35, 0xa36],
62 [0xa38, 0xa39],
63 [0xa3c, 0xa3c],
64 [0xa3e, 0xa42],
65 [0xa47, 0xa48],
66 [0xa4b, 0xa4d],
67 [0xa51, 0xa51],
68 [0xa59, 0xa5c],
69 [0xa5e, 0xa5e],
70 [0xa66, 0xa76],
71 [0xa81, 0xa83],
72 [0xa85, 0xa8d],
73 [0xa8f, 0xa91],
74 [0xa93, 0xaa8],
75 [0xaaa, 0xab0],
76 [0xab2, 0xab3],
77 [0xab5, 0xab9],
78 [0xabc, 0xac5],
79 [0xac7, 0xac9],
80 [0xacb, 0xacd],
81 [0xad0, 0xad0],
82 [0xae0, 0xae3],
83 [0xae6, 0xaf1],
84 [0xaf9, 0xaff],
85 [0xb01, 0xb03],
86 [0xb05, 0xb0c],
87 [0xb0f, 0xb10],
88 [0xb13, 0xb28],
89 [0xb2a, 0xb30],
90 [0xb32, 0xb33],
91 [0xb35, 0xb39],
92 [0xb3c, 0xb44],
93 [0xb47, 0xb48],
94 [0xb4b, 0xb4d],
95 [0xb55, 0xb57],
96 [0xb5c, 0xb5d],
97 [0xb5f, 0xb63],
98 [0xb66, 0xb77],
99 [0xb82, 0xb83],
100 [0xb85, 0xb8a],
101 [0xb8e, 0xb90],
102 [0xb92, 0xb95],
103 [0xb99, 0xb9a],
104 [0xb9c, 0xb9c],
105 [0xb9e, 0xb9f],
106 [0xba3, 0xba4],
107 [0xba8, 0xbaa],
108 [0xbae, 0xbb9],
109 [0xbbe, 0xbc2],
110 [0xbc6, 0xbc8],
111 [0xbca, 0xbcd],
112 [0xbd0, 0xbd0],
113 [0xbd7, 0xbd7],
114 [0xbe6, 0xbfa],
115 [0xc00, 0xc0c],
116 [0xc0e, 0xc10],
117 [0xc12, 0xc28],
118 [0xc2a, 0xc39],
119 [0xc3d, 0xc44],
120 [0xc46, 0xc48],
121 [0xc4a, 0xc4d],
122 [0xc55, 0xc56],
123 [0xc58, 0xc5a],
124 [0xc60, 0xc63],
125 [0xc66, 0xc6f],
126 [0xc77, 0xc8c],
127 [0xc8e, 0xc90],
128 [0xc92, 0xca8],
129 [0xcaa, 0xcb3],
130 [0xcb5, 0xcb9],
131 [0xcbc, 0xcc4],
132 [0xcc6, 0xcc8],
133 [0xcca, 0xccd],
134 [0xcd5, 0xcd6],
135 [0xcde, 0xcde],
136 [0xce0, 0xce3],
137 [0xce6, 0xcef],
138 [0xcf1, 0xcf2],
139 [0xd00, 0xd0c],
140 [0xd0e, 0xd10],
141 [0xd12, 0xd44],
142 [0xd46, 0xd48],
143 [0xd4a, 0xd4f],
144 [0xd54, 0xd63],
145 [0xd66, 0xd7f],
146 [0xd81, 0xd83],
147 [0xd85, 0xd96],
148 [0xd9a, 0xdb1],
149 [0xdb3, 0xdbb],
150 [0xdbd, 0xdbd],
151 [0xdc0, 0xdc6],
152 [0xdca, 0xdca],
153 [0xdcf, 0xdd4],
154 [0xdd6, 0xdd6],
155 [0xdd8, 0xddf],
156 [0xde6, 0xdef],
157 [0xdf2, 0xdf4],
158 [0xe01, 0xe3a],
159 [0xe3f, 0xe5b],
160 [0xe81, 0xe82],
161 [0xe84, 0xe84],
162 [0xe86, 0xe8a],
163 [0xe8c, 0xea3],
164 [0xea5, 0xea5],
165 [0xea7, 0xebd],
166 [0xec0, 0xec4],
167 [0xec6, 0xec6],
168 [0xec8, 0xecd],
169 [0xed0, 0xed9],
170 [0xedc, 0xedf],
171 [0xf00, 0xf47],
172 [0xf49, 0xf6c],
173 [0xf71, 0xf97],
174 [0xf99, 0xfbc],
175 [0xfbe, 0xfcc],
176 [0xfce, 0xfda],
177 [0x1000, 0x10c5],
178 [0x10c7, 0x10c7],
179 [0x10cd, 0x10cd],
180 [0x10d0, 0x1248],
181 [0x124a, 0x124d],
182 [0x1250, 0x1256],
183 [0x1258, 0x1258],
184 [0x125a, 0x125d],
185 [0x1260, 0x1288],
186 [0x128a, 0x128d],
187 [0x1290, 0x12b0],
188 [0x12b2, 0x12b5],
189 [0x12b8, 0x12be],
190 [0x12c0, 0x12c0],
191 [0x12c2, 0x12c5],
192 [0x12c8, 0x12d6],
193 [0x12d8, 0x1310],
194 [0x1312, 0x1315],
195 [0x1318, 0x135a],
196 [0x135d, 0x137c],
197 [0x1380, 0x1399],
198 [0x13a0, 0x13f5],
199 [0x13f8, 0x13fd],
200 [0x1400, 0x169c],
201 [0x16a0, 0x16f8],
202 [0x1700, 0x170c],
203 [0x170e, 0x1714],
204 [0x1720, 0x1736],
205 [0x1740, 0x1753],
206 [0x1760, 0x176c],
207 [0x176e, 0x1770],
208 [0x1772, 0x1773],
209 [0x1780, 0x17dd],
210 [0x17e0, 0x17e9],
211 [0x17f0, 0x17f9],
212 [0x1800, 0x180e],
213 [0x1810, 0x1819],
214 [0x1820, 0x1878],
215 [0x1880, 0x18aa],
216 [0x18b0, 0x18f5],
217 [0x1900, 0x191e],
218 [0x1920, 0x192b],
219 [0x1930, 0x193b],
220 [0x1940, 0x1940],
221 [0x1944, 0x196d],
222 [0x1970, 0x1974],
223 [0x1980, 0x19ab],
224 [0x19b0, 0x19c9],
225 [0x19d0, 0x19da],
226 [0x19de, 0x1a1b],
227 [0x1a1e, 0x1a5e],
228 [0x1a60, 0x1a7c],
229 [0x1a7f, 0x1a89],
230 [0x1a90, 0x1a99],
231 [0x1aa0, 0x1aad],
232 [0x1ab0, 0x1ac0],
233 [0x1b00, 0x1b4b],
234 [0x1b50, 0x1b7c],
235 [0x1b80, 0x1bf3],
236 [0x1bfc, 0x1c37],
237 [0x1c3b, 0x1c49],
238 [0x1c4d, 0x1c88],
239 [0x1c90, 0x1cba],
240 [0x1cbd, 0x1cc7],
241 [0x1cd0, 0x1cfa],
242 [0x1d00, 0x1df9],
243 [0x1dfb, 0x1f15],
244 [0x1f18, 0x1f1d],
245 [0x1f20, 0x1f45],
246 [0x1f48, 0x1f4d],
247 [0x1f50, 0x1f57],
248 [0x1f59, 0x1f59],
249 [0x1f5b, 0x1f5b],
250 [0x1f5d, 0x1f5d],
251 [0x1f5f, 0x1f7d],
252 [0x1f80, 0x1fb4],
253 [0x1fb6, 0x1fc4],
254 [0x1fc6, 0x1fd3],
255 [0x1fd6, 0x1fdb],
256 [0x1fdd, 0x1fef],
257 [0x1ff2, 0x1ff4],
258 [0x1ff6, 0x1ffe],
259 [0x2000, 0x2064],
260 [0x2066, 0x2071],
261 [0x2074, 0x208e],
262 [0x2090, 0x209c],
263 [0x20a0, 0x20bf],
264 [0x20d0, 0x20f0],
265 [0x2100, 0x218b],
266 [0x2190, 0x2426],
267 [0x2440, 0x244a],
268 [0x2460, 0x2b73],
269 [0x2b76, 0x2b95],
270 [0x2b97, 0x2c2e],
271 [0x2c30, 0x2c5e],
272 [0x2c60, 0x2cf3],
273 [0x2cf9, 0x2d25],
274 [0x2d27, 0x2d27],
275 [0x2d2d, 0x2d2d],
276 [0x2d30, 0x2d67],
277 [0x2d6f, 0x2d70],
278 [0x2d7f, 0x2d96],
279 [0x2da0, 0x2da6],
280 [0x2da8, 0x2dae],
281 [0x2db0, 0x2db6],
282 [0x2db8, 0x2dbe],
283 [0x2dc0, 0x2dc6],
284 [0x2dc8, 0x2dce],
285 [0x2dd0, 0x2dd6],
286 [0x2dd8, 0x2dde],
287 [0x2de0, 0x2e52],
288 [0x2e80, 0x2e99],
289 [0x2e9b, 0x2ef3],
290 [0x2f00, 0x2fd5],
291 [0x2ff0, 0x2ffb],
292 [0x3000, 0x303f],
293 [0x3041, 0x3096],
294 [0x3099, 0x30ff],
295 [0x3105, 0x312f],
296 [0x3131, 0x318e],
297 [0x3190, 0x31e3],
298 [0x31f0, 0x321e],
299 [0x3220, 0x3400],
300 [0x4dbf, 0x4e00],
301 [0x9ffc, 0x9ffc],
302 [0xa000, 0xa48c],
303 [0xa490, 0xa4c6],
304 [0xa4d0, 0xa62b],
305 [0xa640, 0xa6f7],
306 [0xa700, 0xa7bf],
307 [0xa7c2, 0xa7ca],
308 [0xa7f5, 0xa82c],
309 [0xa830, 0xa839],
310 [0xa840, 0xa877],
311 [0xa880, 0xa8c5],
312 [0xa8ce, 0xa8d9],
313 [0xa8e0, 0xa953],
314 [0xa95f, 0xa97c],
315 [0xa980, 0xa9cd],
316 [0xa9cf, 0xa9d9],
317 [0xa9de, 0xa9fe],
318 [0xaa00, 0xaa36],
319 [0xaa40, 0xaa4d],
320 [0xaa50, 0xaa59],
321 [0xaa5c, 0xaac2],
322 [0xaadb, 0xaaf6],
323 [0xab01, 0xab06],
324 [0xab09, 0xab0e],
325 [0xab11, 0xab16],
326 [0xab20, 0xab26],
327 [0xab28, 0xab2e],
328 [0xab30, 0xab6b],
329 [0xab70, 0xabed],
330 [0xabf0, 0xabf9],
331 [0xac00, 0xac00],
332 [0xd7a3, 0xd7a3],
333 [0xd7b0, 0xd7c6],
334 [0xd7cb, 0xd7fb],
335 [0xd800, 0xd800],
336 [0xdb7f, 0xdb80],
337 [0xdbff, 0xdc00],
338 [0xdfff, 0xe000],
339 [0xf8ff, 0xfa6d],
340 [0xfa70, 0xfad9],
341 [0xfb00, 0xfb06],
342 [0xfb13, 0xfb17],
343 [0xfb1d, 0xfb36],
344 [0xfb38, 0xfb3c],
345 [0xfb3e, 0xfb3e],
346 [0xfb40, 0xfb41],
347 [0xfb43, 0xfb44],
348 [0xfb46, 0xfbc1],
349 [0xfbd3, 0xfd3f],
350 [0xfd50, 0xfd8f],
351 [0xfd92, 0xfdc7],
352 [0xfdf0, 0xfdfd],
353 [0xfe00, 0xfe19],
354 [0xfe20, 0xfe52],
355 [0xfe54, 0xfe66],
356 [0xfe68, 0xfe6b],
357 [0xfe70, 0xfe74],
358 [0xfe76, 0xfefc],
359 [0xfeff, 0xfeff],
360 [0xff01, 0xffbe],
361 [0xffc2, 0xffc7],
362 [0xffca, 0xffcf],
363 [0xffd2, 0xffd7],
364 [0xffda, 0xffdc],
365 [0xffe0, 0xffe6],
366 [0xffe8, 0xffee],
367 [0xfff9, 0xfffd],
368 [0x10000, 0x1000b],
369 [0x1000d, 0x10026],
370 [0x10028, 0x1003a],
371 [0x1003c, 0x1003d],
372 [0x1003f, 0x1004d],
373 [0x10050, 0x1005d],
374 [0x10080, 0x100fa],
375 [0x10100, 0x10102],
376 [0x10107, 0x10133],
377 [0x10137, 0x1018e],
378 [0x10190, 0x1019c],
379 [0x101a0, 0x101a0],
380 [0x101d0, 0x101fd],
381 [0x10280, 0x1029c],
382 [0x102a0, 0x102d0],
383 [0x102e0, 0x102fb],
384 [0x10300, 0x10323],
385 [0x1032d, 0x1034a],
386 [0x10350, 0x1037a],
387 [0x10380, 0x1039d],
388 [0x1039f, 0x103c3],
389 [0x103c8, 0x103d5],
390 [0x10400, 0x1049d],
391 [0x104a0, 0x104a9],
392 [0x104b0, 0x104d3],
393 [0x104d8, 0x104fb],
394 [0x10500, 0x10527],
395 [0x10530, 0x10563],
396 [0x1056f, 0x1056f],
397 [0x10600, 0x10736],
398 [0x10740, 0x10755],
399 [0x10760, 0x10767],
400 [0x10800, 0x10805],
401 [0x10808, 0x10808],
402 [0x1080a, 0x10835],
403 [0x10837, 0x10838],
404 [0x1083c, 0x1083c],
405 [0x1083f, 0x10855],
406 [0x10857, 0x1089e],
407 [0x108a7, 0x108af],
408 [0x108e0, 0x108f2],
409 [0x108f4, 0x108f5],
410 [0x108fb, 0x1091b],
411 [0x1091f, 0x10939],
412 [0x1093f, 0x1093f],
413 [0x10980, 0x109b7],
414 [0x109bc, 0x109cf],
415 [0x109d2, 0x10a03],
416 [0x10a05, 0x10a06],
417 [0x10a0c, 0x10a13],
418 [0x10a15, 0x10a17],
419 [0x10a19, 0x10a35],
420 [0x10a38, 0x10a3a],
421 [0x10a3f, 0x10a48],
422 [0x10a50, 0x10a58],
423 [0x10a60, 0x10a9f],
424 [0x10ac0, 0x10ae6],
425 [0x10aeb, 0x10af6],
426 [0x10b00, 0x10b35],
427 [0x10b39, 0x10b55],
428 [0x10b58, 0x10b72],
429 [0x10b78, 0x10b91],
430 [0x10b99, 0x10b9c],
431 [0x10ba9, 0x10baf],
432 [0x10c00, 0x10c48],
433 [0x10c80, 0x10cb2],
434 [0x10cc0, 0x10cf2],
435 [0x10cfa, 0x10d27],
436 [0x10d30, 0x10d39],
437 [0x10e60, 0x10e7e],
438 [0x10e80, 0x10ea9],
439 [0x10eab, 0x10ead],
440 [0x10eb0, 0x10eb1],
441 [0x10f00, 0x10f27],
442 [0x10f30, 0x10f59],
443 [0x10fb0, 0x10fcb],
444 [0x10fe0, 0x10ff6],
445 [0x11000, 0x1104d],
446 [0x11052, 0x1106f],
447 [0x1107f, 0x110c1],
448 [0x110cd, 0x110cd],
449 [0x110d0, 0x110e8],
450 [0x110f0, 0x110f9],
451 [0x11100, 0x11134],
452 [0x11136, 0x11147],
453 [0x11150, 0x11176],
454 [0x11180, 0x111df],
455 [0x111e1, 0x111f4],
456 [0x11200, 0x11211],
457 [0x11213, 0x1123e],
458 [0x11280, 0x11286],
459 [0x11288, 0x11288],
460 [0x1128a, 0x1128d],
461 [0x1128f, 0x1129d],
462 [0x1129f, 0x112a9],
463 [0x112b0, 0x112ea],
464 [0x112f0, 0x112f9],
465 [0x11300, 0x11303],
466 [0x11305, 0x1130c],
467 [0x1130f, 0x11310],
468 [0x11313, 0x11328],
469 [0x1132a, 0x11330],
470 [0x11332, 0x11333],
471 [0x11335, 0x11339],
472 [0x1133b, 0x11344],
473 [0x11347, 0x11348],
474 [0x1134b, 0x1134d],
475 [0x11350, 0x11350],
476 [0x11357, 0x11357],
477 [0x1135d, 0x11363],
478 [0x11366, 0x1136c],
479 [0x11370, 0x11374],
480 [0x11400, 0x1145b],
481 [0x1145d, 0x11461],
482 [0x11480, 0x114c7],
483 [0x114d0, 0x114d9],
484 [0x11580, 0x115b5],
485 [0x115b8, 0x115dd],
486 [0x11600, 0x11644],
487 [0x11650, 0x11659],
488 [0x11660, 0x1166c],
489 [0x11680, 0x116b8],
490 [0x116c0, 0x116c9],
491 [0x11700, 0x1171a],
492 [0x1171d, 0x1172b],
493 [0x11730, 0x1173f],
494 [0x11800, 0x1183b],
495 [0x118a0, 0x118f2],
496 [0x118ff, 0x11906],
497 [0x11909, 0x11909],
498 [0x1190c, 0x11913],
499 [0x11915, 0x11916],
500 [0x11918, 0x11935],
501 [0x11937, 0x11938],
502 [0x1193b, 0x11946],
503 [0x11950, 0x11959],
504 [0x119a0, 0x119a7],
505 [0x119aa, 0x119d7],
506 [0x119da, 0x119e4],
507 [0x11a00, 0x11a47],
508 [0x11a50, 0x11aa2],
509 [0x11ac0, 0x11af8],
510 [0x11c00, 0x11c08],
511 [0x11c0a, 0x11c36],
512 [0x11c38, 0x11c45],
513 [0x11c50, 0x11c6c],
514 [0x11c70, 0x11c8f],
515 [0x11c92, 0x11ca7],
516 [0x11ca9, 0x11cb6],
517 [0x11d00, 0x11d06],
518 [0x11d08, 0x11d09],
519 [0x11d0b, 0x11d36],
520 [0x11d3a, 0x11d3a],
521 [0x11d3c, 0x11d3d],
522 [0x11d3f, 0x11d47],
523 [0x11d50, 0x11d59],
524 [0x11d60, 0x11d65],
525 [0x11d67, 0x11d68],
526 [0x11d6a, 0x11d8e],
527 [0x11d90, 0x11d91],
528 [0x11d93, 0x11d98],
529 [0x11da0, 0x11da9],
530 [0x11ee0, 0x11ef8],
531 [0x11fb0, 0x11fb0],
532 [0x11fc0, 0x11ff1],
533 [0x11fff, 0x12399],
534 [0x12400, 0x1246e],
535 [0x12470, 0x12474],
536 [0x12480, 0x12543],
537 [0x13000, 0x1342e],
538 [0x13430, 0x13438],
539 [0x14400, 0x14646],
540 [0x16800, 0x16a38],
541 [0x16a40, 0x16a5e],
542 [0x16a60, 0x16a69],
543 [0x16a6e, 0x16a6f],
544 [0x16ad0, 0x16aed],
545 [0x16af0, 0x16af5],
546 [0x16b00, 0x16b45],
547 [0x16b50, 0x16b59],
548 [0x16b5b, 0x16b61],
549 [0x16b63, 0x16b77],
550 [0x16b7d, 0x16b8f],
551 [0x16e40, 0x16e9a],
552 [0x16f00, 0x16f4a],
553 [0x16f4f, 0x16f87],
554 [0x16f8f, 0x16f9f],
555 [0x16fe0, 0x16fe4],
556 [0x16ff0, 0x16ff1],
557 [0x17000, 0x17000],
558 [0x187f7, 0x187f7],
559 [0x18800, 0x18cd5],
560 [0x18d00, 0x18d00],
561 [0x18d08, 0x18d08],
562 [0x1b000, 0x1b11e],
563 [0x1b150, 0x1b152],
564 [0x1b164, 0x1b167],
565 [0x1b170, 0x1b2fb],
566 [0x1bc00, 0x1bc6a],
567 [0x1bc70, 0x1bc7c],
568 [0x1bc80, 0x1bc88],
569 [0x1bc90, 0x1bc99],
570 [0x1bc9c, 0x1bca3],
571 [0x1d000, 0x1d0f5],
572 [0x1d100, 0x1d126],
573 [0x1d129, 0x1d1e8],
574 [0x1d200, 0x1d245],
575 [0x1d2e0, 0x1d2f3],
576 [0x1d300, 0x1d356],
577 [0x1d360, 0x1d378],
578 [0x1d400, 0x1d454],
579 [0x1d456, 0x1d49c],
580 [0x1d49e, 0x1d49f],
581 [0x1d4a2, 0x1d4a2],
582 [0x1d4a5, 0x1d4a6],
583 [0x1d4a9, 0x1d4ac],
584 [0x1d4ae, 0x1d4b9],
585 [0x1d4bb, 0x1d4bb],
586 [0x1d4bd, 0x1d4c3],
587 [0x1d4c5, 0x1d505],
588 [0x1d507, 0x1d50a],
589 [0x1d50d, 0x1d514],
590 [0x1d516, 0x1d51c],
591 [0x1d51e, 0x1d539],
592 [0x1d53b, 0x1d53e],
593 [0x1d540, 0x1d544],
594 [0x1d546, 0x1d546],
595 [0x1d54a, 0x1d550],
596 [0x1d552, 0x1d6a5],
597 [0x1d6a8, 0x1d7cb],
598 [0x1d7ce, 0x1da8b],
599 [0x1da9b, 0x1da9f],
600 [0x1daa1, 0x1daaf],
601 [0x1e000, 0x1e006],
602 [0x1e008, 0x1e018],
603 [0x1e01b, 0x1e021],
604 [0x1e023, 0x1e024],
605 [0x1e026, 0x1e02a],
606 [0x1e100, 0x1e12c],
607 [0x1e130, 0x1e13d],
608 [0x1e140, 0x1e149],
609 [0x1e14e, 0x1e14f],
610 [0x1e2c0, 0x1e2f9],
611 [0x1e2ff, 0x1e2ff],
612 [0x1e800, 0x1e8c4],
613 [0x1e8c7, 0x1e8d6],
614 [0x1e900, 0x1e94b],
615 [0x1e950, 0x1e959],
616 [0x1e95e, 0x1e95f],
617 [0x1ec71, 0x1ecb4],
618 [0x1ed01, 0x1ed3d],
619 [0x1ee00, 0x1ee03],
620 [0x1ee05, 0x1ee1f],
621 [0x1ee21, 0x1ee22],
622 [0x1ee24, 0x1ee24],
623 [0x1ee27, 0x1ee27],
624 [0x1ee29, 0x1ee32],
625 [0x1ee34, 0x1ee37],
626 [0x1ee39, 0x1ee39],
627 [0x1ee3b, 0x1ee3b],
628 [0x1ee42, 0x1ee42],
629 [0x1ee47, 0x1ee47],
630 [0x1ee49, 0x1ee49],
631 [0x1ee4b, 0x1ee4b],
632 [0x1ee4d, 0x1ee4f],
633 [0x1ee51, 0x1ee52],
634 [0x1ee54, 0x1ee54],
635 [0x1ee57, 0x1ee57],
636 [0x1ee59, 0x1ee59],
637 [0x1ee5b, 0x1ee5b],
638 [0x1ee5d, 0x1ee5d],
639 [0x1ee5f, 0x1ee5f],
640 [0x1ee61, 0x1ee62],
641 [0x1ee64, 0x1ee64],
642 [0x1ee67, 0x1ee6a],
643 [0x1ee6c, 0x1ee72],
644 [0x1ee74, 0x1ee77],
645 [0x1ee79, 0x1ee7c],
646 [0x1ee7e, 0x1ee7e],
647 [0x1ee80, 0x1ee89],
648 [0x1ee8b, 0x1ee9b],
649 [0x1eea1, 0x1eea3],
650 [0x1eea5, 0x1eea9],
651 [0x1eeab, 0x1eebb],
652 [0x1eef0, 0x1eef1],
653 [0x1f000, 0x1f02b],
654 [0x1f030, 0x1f093],
655 [0x1f0a0, 0x1f0ae],
656 [0x1f0b1, 0x1f0bf],
657 [0x1f0c1, 0x1f0cf],
658 [0x1f0d1, 0x1f0f5],
659 [0x1f100, 0x1f1ad],
660 [0x1f1e6, 0x1f202],
661 [0x1f210, 0x1f23b],
662 [0x1f240, 0x1f248],
663 [0x1f250, 0x1f251],
664 [0x1f260, 0x1f265],
665 [0x1f300, 0x1f6d7],
666 [0x1f6e0, 0x1f6ec],
667 [0x1f6f0, 0x1f6fc],
668 [0x1f700, 0x1f773],
669 [0x1f780, 0x1f7d8],
670 [0x1f7e0, 0x1f7eb],
671 [0x1f800, 0x1f80b],
672 [0x1f810, 0x1f847],
673 [0x1f850, 0x1f859],
674 [0x1f860, 0x1f887],
675 [0x1f890, 0x1f8ad],
676 [0x1f8b0, 0x1f8b1],
677 [0x1f900, 0x1f978],
678 [0x1f97a, 0x1f9cb],
679 [0x1f9cd, 0x1fa53],
680 [0x1fa60, 0x1fa6d],
681 [0x1fa70, 0x1fa74],
682 [0x1fa78, 0x1fa7a],
683 [0x1fa80, 0x1fa86],
684 [0x1fa90, 0x1faa8],
685 [0x1fab0, 0x1fab6],
686 [0x1fac0, 0x1fac2],
687 [0x1fad0, 0x1fad6],
688 [0x1fb00, 0x1fb92],
689 [0x1fb94, 0x1fbca],
690 [0x1fbf0, 0x1fbf9],
691 [0x20000, 0x20000],
692 [0x2a6dd, 0x2a6dd],
693 [0x2a700, 0x2a700],
694 [0x2b734, 0x2b734],
695 [0x2b740, 0x2b740],
696 [0x2b81d, 0x2b81d],
697 [0x2b820, 0x2b820],
698 [0x2cea1, 0x2cea1],
699 [0x2ceb0, 0x2ceb0],
700 [0x2ebe0, 0x2ebe0],
701 [0x2f800, 0x2fa1d],
702 [0x30000, 0x30000],
703 [0x3134a, 0x3134a],
704 [0xe0001, 0xe0001],
705 [0xe0020, 0xe007f],
706 [0xe0100, 0xe01ef],
707 [0xf0000, 0xf0000],
708 [0xffffd, 0xffffd],
709 [0x100000, 0x100000],
710 [0x10fffd, 0x10fffd]];
711
// Set of codepoints used as "known good" test input. Each key is the 4-byte
// big-endian (UTF-32BE) form of one codepoint; the values are unused.
$validCodepoints = [];

foreach ($validRanges as [$first, $last]) {
  $cp = $first;
  while ($cp <= $last) {
    // U+D800..U+DFFF is the range reserved for UTF-16 surrogates
    $isSurrogate = $cp >= 0xD800 && $cp <= 0xDFFF;
    // U+FEFF (the byte order mark) is left out as well; presumably because a
    // leading BOM gets special treatment by the UTF-16/UTF-32 decoders
    if (!$isSurrogate && $cp !== 0xFEFF) {
      $validCodepoints[pack('N', $cp)] = true;
    }
    $cp++;
  }
}
721
/**
 * Feeds every codepoint in the global $validCodepoints set through $encoding.
 *
 * The codepoints are shuffled and consumed in batches of 20-30 (the last batch
 * may be smaller). Each batch is converted from UTF-32BE to $encoding, and the
 * converted string is passed to testValidString() along with the UTF-32BE
 * original. The script dies if mb_convert_encoding() returns false.
 *
 * @param string $encoding Name of the encoding under test
 */
function testValidCodepoints($encoding) {
  global $validCodepoints;

  $remaining = array_keys($validCodepoints);
  shuffle($remaining);

  while ($remaining) {
    $batchSize = min(rand(20,30), count($remaining));
    // Cut the batch off the tail of the list; reversing it gives the same
    // order as popping the elements one at a time
    $batch = array_reverse(array_splice($remaining, -$batchSize));
    $utf32 = implode('', $batch);

    $converted = mb_convert_encoding($utf32, $encoding, 'UTF-32BE');
    if ($converted === false) {
      die("mb_convert_encoding failed to convert UTF-32BE to $encoding." .
          "\nString: " . bin2hex($utf32));
    }
    testValidString($converted, $utf32, $encoding, 'UTF-32BE');
  }
}
742
/**
 * Passes each invalid byte sequence, followed by one valid character, to
 * testInvalidString().
 *
 * A different valid character (drawn from the shuffled global
 * $validCodepoints set) is appended to every invalid sequence.
 *
 * @param array  $invalid  Map of invalid byte sequence (in $encoding) to the
 *                         UTF-32BE output expected for that sequence alone
 * @param string $encoding Name of the encoding under test
 */
function testInvalidCodepoints($invalid, $encoding) {
  global $validCodepoints;

  $pool = array_keys($validCodepoints);
  shuffle($pool);

  foreach ($invalid as $bad => $expected) {
    // $validChar is UTF-32BE; it must be in $encoding before it can follow $bad
    $validChar = array_pop($pool);
    $suffix    = mb_convert_encoding($validChar, $encoding, 'UTF-32BE');

    $input  = $bad . $suffix;
    $output = $expected . $validChar;
    testInvalidString($input, $output, $encoding, 'UTF-32BE');
  }
}
755
756echo "== UTF-8 ==\n";
757
758testValidCodepoints('UTF-8');
759
760testValidString('', '', 'UTF-8', 'UTF-32BE');
761
762$invalid = array(
763  // Codepoints outside of valid 0-0x10FFFF range for Unicode
764  "\xF4\x90\x80\x80" => str_repeat("\x00\x00\x00%", 4), // CP 0x110000
765  "\xF7\x80\x80\x80" => str_repeat("\x00\x00\x00%", 4), // CP 0x1C0000
766  "\xF7\xBF\xBF\xBF" => str_repeat("\x00\x00\x00%", 4), // CP 0x1FFFFF
767
768  // Reserved range for UTF-16 surrogate pairs
769  "\xED\xA0\x80" => str_repeat("\x00\x00\x00%", 3),     // CP 0xD800
770  "\xED\xAF\xBF" => str_repeat("\x00\x00\x00%", 3),     // CP 0xDBFF
771  "\xED\xBF\xBF" => str_repeat("\x00\x00\x00%", 3),     // CP 0xDFFF
772
773  // Truncated characters
774  "\xDF" => "\x00\x00\x00%",         // should have been 2-byte
775  "\xEF\xBF" => "\x00\x00\x00%",     // should have been 3-byte
776  "\xF0\xBF\xBF" => "\x00\x00\x00%", // should have been 4-byte
777  "\xF1\x96" => "\x00\x00\x00%",
778  "\xF1\x96\x80" => "\x00\x00\x00%",
779  "\xF2\x94" => "\x00\x00\x00%",
780  "\xF2\x94\x80" => "\x00\x00\x00%",
781  "\xF3\x94" => "\x00\x00\x00%",
782  "\xF3\x94\x80" => "\x00\x00\x00%",
783  "\xE0\x9F" => "\x00\x00\x00%\x00\x00\x00%",
784  "\xED\xA6" => "\x00\x00\x00%\x00\x00\x00%",
785
786  // Multi-byte characters which end too soon and go to ASCII
787  "\xDFA" => "\x00\x00\x00%\x00\x00\x00A",
788  "\xEF\xBFA" => "\x00\x00\x00%\x00\x00\x00A",
789  "\xF0\xBFA" => "\x00\x00\x00%\x00\x00\x00A",
790  "\xF0\xBF\xBFA" => "\x00\x00\x00%\x00\x00\x00A",
791
792  // Multi-byte characters which end too soon and go to another MB char
793  "\xDF\xDF\xBF" => "\x00\x00\x00%\x00\x00\x07\xFF",
794  "\xEF\xBF\xDF\xBF" => "\x00\x00\x00%\x00\x00\x07\xFF",
795  "\xF0\xBF\xBF\xDF\xBF" => "\x00\x00\x00%\x00\x00\x07\xFF",
796
797  // Multi-byte characters which end too soon and go to a junk byte
798  // (Which isn't even valid to start a new character)
799  "\xF0\xBF\xBF\xFF" => str_repeat("\x00\x00\x00%", 2),
800  "\xF0\xBF\xFF" => str_repeat("\x00\x00\x00%", 2),
801
802  // Continuation bytes which appear outside of a MB char
803  "\x80" => "\x00\x00\x00%",
804  "A\x80" => "\x00\x00\x00A\x00\x00\x00%",
805  "\xDF\xBF\x80" => "\x00\x00\x07\xFF\x00\x00\x00%",
806
807  // Overlong code units
808  // (Using more bytes than needed to encode a character)
809  "\xC1\xBF" => str_repeat("\x00\x00\x00%", 2),        // didn't need 2 bytes
810  "\xE0\x9F\xBF" => str_repeat("\x00\x00\x00%", 3),    // didn't need 3 bytes
811  "\xF0\x8F\xBF\xBF" => str_repeat("\x00\x00\x00%", 4) // didn't need 4 bytes
812);
813
814testInvalidCodepoints($invalid, 'UTF-8');
815
816// Regression test for bug in SSE2-based accelerated UTF-8 validation function
817$truncated16byte = [
818  "k\x08`\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xc6",
819  "k\x08`\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xef",
820  "k\x08`\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xef\xbf",
821  "k\x08`\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0",
822  "k\x08`\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0\xbf",
823  "k\x08`\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0\xbf\xbf"
824];
825foreach ($truncated16byte as $trunc) {
826  if (mb_check_encoding($trunc, 'UTF-8'))
827    die("UTF-8 validation was incorrect on 16-byte string with truncated multi-byte char at end");
828}
829
830echo "== UTF-16 ==\n";
831
832testValidCodepoints("UTF-16");
833testValidCodepoints("UTF-16LE");
834testValidCodepoints("UTF-16BE");
835
836testValidString('', '', 'UTF-16', 'UTF-32BE');
837testValidString('', '', 'UTF-16LE', 'UTF-32BE');
838testValidString('', '', 'UTF-16BE', 'UTF-32BE');
839
840$invalid = array(
841  // UTF-16 _cannot_ represent codepoints bigger than 0x10FFFF, so we're not
842  // worried about that. But there are plenty of other ways to mess up...
843
844  // Second half of surrogate pair comes first
845  "\xDC\x01\xD8\x02" => "\x00\x00\x00%\x00\x00\x00%",
846
847  // First half of surrogate pair not followed by second part
848  "\xD8\x01\x00A" => "\x00\x00\x00%\x00\x00\x00A",
849
850  // First half of surrogate pair at end of string
851  "\xD8\x01" => "\x00\x00\x00%",
852);
853
854testInvalidCodepoints($invalid, 'UTF-16');
855testInvalidCodepoints($invalid, 'UTF-16BE');
856
857// Truncated strings
858testInvalidString("\x00", "\x00\x00\x00%", 'UTF-16', 'UTF-32BE');
859testInvalidString("\x00A\x01", "\x00\x00\x00A\x00\x00\x00%", 'UTF-16', 'UTF-32BE');
860testInvalidString("\x00", "\x00\x00\x00%", 'UTF-16BE', 'UTF-32BE');
861testInvalidString("\x00A\x01", "\x00\x00\x00A\x00\x00\x00%", 'UTF-16BE', 'UTF-32BE');
862
863$invalid = array(
864  // Second half of surrogate pair comes first
865  "\x01\xDC\x02\xD8" => "\x00\x00\x00%\x00\x00\x00%",
866
867  // First half of surrogate pair not followed by second part
868  "\x01\xD8A\x00" => "\x00\x00\x00%\x00\x00\x00A",
869
870  // First half of surrogate pair at end of string
871  "\x01\xD8" => "\x00\x00\x00%",
872
873  // Two successive codepoints which are both the 1st part of a surrogate pair
874  "\x01\xD8\x02\xD8" => "\x00\x00\x00%\x00\x00\x00%"
875);
876
877testInvalidCodepoints($invalid, 'UTF-16LE');
878
879// Truncated
880testInvalidString("\x00", "\x00\x00\x00%", 'UTF-16LE', 'UTF-32BE');
881testInvalidString("A\x00\x01", "\x00\x00\x00A\x00\x00\x00%", 'UTF-16LE', 'UTF-32BE');
882
883// Test treatment of BOM
884testValidString("\xFE\xFF\x12\x34", "\x00\x00\x12\x34", 'UTF-16', 'UTF-32BE', false);
885testValidString("\xFF\xFE\x12\x34", "\x00\x00\x34\x12", 'UTF-16', 'UTF-32BE', false);
886
887// Test treatment of (illegal) codepoints between U+D800 and U+DFFF
888testValidString("\xD8\x00", "\xD8\x00", 'UCS-2BE', 'UTF-16BE', false);
889testValidString("\xDB\xFF", "\xDB\xFF", 'UCS-2BE', 'UTF-16BE', false);
890testValidString("\xDC\x00", "\xDC\x00", 'UCS-2BE', 'UTF-16BE', false);
891testValidString("\xD8\x00", "\x00\xD8", 'UCS-2BE', 'UTF-16LE', false);
892testValidString("\xDC\x00", "\x00\xDC", 'UCS-2BE', 'UTF-16LE', false);
893
894// Try codepoint over U+10FFFF
895convertInvalidString("\x00\x11\x56\x78", "\x00%", 'UCS-4BE', 'UTF-16BE');
896convertInvalidString("\x00\x11\x56\x78", "%\x00", 'UCS-4BE', 'UTF-16LE');
897
898// Regression tests for bugs with initial AVX2-accelerated implementation
899convertInvalidString(str_repeat("a\x00", 15) . "\x00\xD8\x00\xFC", str_repeat("\x00a", 15) . "\x00%\xFC\x00", 'UTF-16LE', 'UCS-2BE');
900convertInvalidString(str_repeat("\x00a", 15) . "\xD8\x00\xFC\x00", str_repeat("\x00a", 15) . "\x00%\xFC\x00", 'UTF-16BE', 'UCS-2BE');
901
902// This string caused an out-of-bounds read; it was found by a fuzzer
903$str = "\xdb\xdb\xdb#\xdb\xdb\xdf\xdb\xdf\xdb\xdb\x0b\xdb\x00\xdc\xdb\xdf\xdb\xdf\xdb\xda\x0b\xdb\x00\xdcY\xdf\x03\xdb\x03\xd9\xd9\xd8";
904convertInvalidString($str, "\x00\x25\x00\x25\xdb\xdb\xdf\xdb\x00\x25\x00\x25\xdb\x00\xdc\xdb\x00\x25\x00\x25\x00\x25\xdb\x00\xdc\x59\x00\x25\x00\x25\x00\x25\x00\x25", 'UTF-16BE', 'UTF-16BE');
905
906$str = "\xda\xda\xda\xda\xda\xda\xd9\xdb\xda\xda\xda\xda\xdd\xda\xda\xd9\xdb\xda\xda\xda\xda\xdd\xda\xdd\xd9\x0a\xda\xda\xda\xda\xdd\xda\xdd\xd9\xda\xda\xda\xda\xda\xda\xda\xda\xda\xd9\xdb\xda\xda\xda\xd9\xdb\xda\xda\xda\xda\xdd\xda\xda\xd9\xdb";
907convertInvalidString($str, "\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\xda\xda\xda\xdd\x25\x00\xd9\x0a\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00", 'UTF-16LE', 'UTF-16LE');
908
909echo "== UTF-32 ==\n";
910
911testValidCodepoints("UTF-32LE");
912testValidCodepoints("UTF-32BE");
913
914// Empty string
915testValidString('', '', 'UTF-32', 'UTF-32BE');
916testValidString('', '', 'UTF-32BE', 'UTF-32');
917testValidString('', '', 'UTF-32LE', 'UTF-32BE');
918
919$invalid = array(
920  // Codepoints which are too big
921  "\x00\x11\x00\x00" => "\x00\x00\x00%",
922  "\x80\x00\x00\x00" => "\x00\x00\x00%",
923  "\xff\xff\xfe\xff" => "\x00\x00\x00%",
924
925  // Surrogates
926  "\x00\x00\xd8\x00" => "\x00\x00\x00%",
927  "\x00\x00\xdb\xff" => "\x00\x00\x00%",
928  "\x00\x00\xdc\x00" => "\x00\x00\x00%",
929  "\x00\x00\xdf\xff" => "\x00\x00\x00%",
930);
931
932testInvalidCodepoints($invalid, 'UTF-32');
933testInvalidCodepoints($invalid, 'UTF-32BE');
934
// Truncated code units: a trailing fragment of 3, 2 or 1 bytes is expected to
// yield a single error marker, both for 'UTF-32' and for 'UTF-32BE' input.
// (The UTF-32BE lines previously repeated the 1-byte case three times, so the
// 3-byte and 2-byte fragments were never exercised for that encoding.)
testInvalidString("\x00\x01\x01", "\x00\x00\x00%", 'UTF-32', 'UTF-32BE');
testInvalidString("\x00\x01",     "\x00\x00\x00%", 'UTF-32', 'UTF-32BE');
testInvalidString("\x00",         "\x00\x00\x00%", 'UTF-32', 'UTF-32BE');
testInvalidString("\x00\x01\x01", "\x00\x00\x00%", 'UTF-32BE', 'UTF-32');
testInvalidString("\x00\x01",     "\x00\x00\x00%", 'UTF-32BE', 'UTF-32');
testInvalidString("\x00",         "\x00\x00\x00%", 'UTF-32BE', 'UTF-32');
942
943$invalid = array(
944  // Codepoints which are too big
945  "\x00\x00\x11\x00" => "\x00\x00\x00%",
946  "\x00\x00\x00\x80" => "\x00\x00\x00%",
947  "\xff\xfe\xff\xff" => "\x00\x00\x00%",
948
949  // Surrogates
950  "\x00\xd8\x00\x00" => "\x00\x00\x00%",
951  "\xff\xdb\x00\x00" => "\x00\x00\x00%",
952  "\x00\xdc\x00\x00" => "\x00\x00\x00%",
953  "\xff\xdf\x00\x00" => "\x00\x00\x00%",
954);
955
956testInvalidCodepoints($invalid, 'UTF-32LE');
957
958// Truncated code units
959testInvalidString("\x00\x01\x01", "\x00\x00\x00%", 'UTF-32LE', 'UTF-32BE');
960testInvalidString("\x00\x01",     "\x00\x00\x00%", 'UTF-32LE', 'UTF-32BE');
961testInvalidString("\x00",         "\x00\x00\x00%", 'UTF-32LE', 'UTF-32BE');
962
963// Test treatment of BOM
964testValidString("\x00\x00\xFE\xFF\x00\x00\x12\x34", "\x00\x00\x12\x34", 'UTF-32', 'UTF-32BE', false);
965testValidString("\xFF\xFE\x00\x00\x12\x34\x00\x00", "\x00\x00\x34\x12", 'UTF-32', 'UTF-32BE', false);
966
967// Test treatment of (illegal) codepoints between U+D800 and U+DFFF
968testValidString("\xD8\x00", "\x00\x00\xD8\x00", 'UCS-2BE', 'UTF-32BE', false);
969testValidString("\xDB\xFF", "\x00\x00\xDB\xFF", 'UCS-2BE', 'UTF-32BE', false);
970testValidString("\xDC\x00", "\x00\x00\xDC\x00", 'UCS-2BE', 'UTF-32BE', false);
971testValidString("\xD8\x00", "\x00\xD8\x00\x00", 'UCS-2BE', 'UTF-32LE', false);
972testValidString("\xDC\x00", "\x00\xDC\x00\x00", 'UCS-2BE', 'UTF-32LE', false);
973
974echo "== UTF-7 ==\n";
975
976testValidString('', '', 'UTF-7', 'UTF-32BE');
977
// 'Direct' characters: ASCII letters, digits and a few punctuation marks.
// Each one is checked as a single-character string.
$directChars = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
             . 'abcdefghijklmnopqrstuvwxyz'
             . '0123456789'
             . "'(),-./:?";
foreach (str_split($directChars) as $char) {
  testValidString($char, "\x00\x00\x00" . $char, 'UTF-7', 'UTF-32BE');
}
987
988// 'Optional direct' characters are Base64-encoded in mbstring's implementation
989
990// Whitespace
991foreach (str_split(" \t\r\n\x00") as $char)
992  testValidString($char, "\x00\x00\x00" . $char, 'UTF-7', 'UTF-32BE');
993
994// Encoding + as +-
995testValidString('+-', "\x00\x00\x00+", 'UTF-7', 'UTF-32BE', false);
996
997// UTF-16 + Base64 encoding
/**
 * Builds the body of a UTF-7 Base64 section for $str.
 *
 * $str is converted from $encoding to UTF-16BE and then Base64-encoded; the
 * '=' padding is dropped because Base64 in UTF-7 doesn't use it.
 *
 * @param string $str      Input text
 * @param string $encoding Encoding which $str is in
 * @return string Unpadded Base64 text (without the surrounding '+' and '-')
 */
function encode($str, $encoding) {
  $utf16 = mb_convert_encoding($str, 'UTF-16BE', $encoding);
  // '=' only ever occurs as trailing padding in Base64 output
  return rtrim(base64_encode($utf16), '=');
}
1002
// Decode U+0000..U+00FF, each wrapped in its own Base64 section.
// The final argument is false for the characters which were tested above in
// their unencoded form (letters, digits, "'(),-./:?" and whitespace/NUL), and
// true for everything else.
for ($byte = 0; $byte < 256; $byte++) {
  $char = chr($byte);

  $isDirect = ($byte >= ord('A') && $byte <= ord('Z'))
           || ($byte >= ord('a') && $byte <= ord('z'))
           || ($byte >= ord('0') && $byte <= ord('9'))
           || strpos("'(),-./:?\x00 \t\r\n", $char) !== false;

  testValidString('+' . encode("\x00" . $char, 'UTF-16BE') . '-', "\x00\x00\x00" . $char, 'UTF-7', 'UTF-32BE', !$isDirect);
}
1016
1017testValidString('+' . encode("\x12\x34", 'UTF-16BE') . '-', "\x00\x00\x12\x34", 'UTF-7', 'UTF-32BE');
1018testValidString('+' . encode("\x12\x34\x56\x78", 'UTF-16BE') . '-', "\x00\x00\x12\x34\x00\x00\x56\x78", 'UTF-7', 'UTF-32BE');
1019testValidString('+' . encode("\x12\x34\x56\x78\x00\x40", 'UTF-16BE') . '-', "\x00\x00\x12\x34\x00\x00\x56\x78\x00\x00\x00\x40", 'UTF-7', 'UTF-32BE');
1020testValidString('+' . encode("\xFF\xEE\xEE\xFF", 'UTF-16BE') . '-', "\x00\x00\xFF\xEE\x00\x00\xEE\xFF", 'UTF-7', 'UTF-32BE');
1021
1022// Surrogate pair
1023testValidString('+' . encode("\x00\x01\x04\x00", 'UTF-32BE') . '-', "\x00\x01\x04\x00", 'UTF-7', 'UTF-32BE');
1024testValidString('+' . encode("\x00\x00\x00A\x00\x01\x04\x00\x00\x00\x00B", 'UTF-32BE') . '-', "\x00\x00\x00A\x00\x01\x04\x00\x00\x00\x00B", 'UTF-7', 'UTF-32BE', false);
1025testValidString('+' . encode("\x00\x01\x04\x00\x00\x01\x04\x00", 'UTF-32BE') . '-', "\x00\x01\x04\x00\x00\x01\x04\x00", 'UTF-7', 'UTF-32BE');
1026
1027// Unterminated + section
1028// (This is not considered illegal)
1029testValidString('+' . encode('ABC', 'ASCII'), "\x00A\x00B\x00C", 'UTF-7', 'UTF-16BE', false);
1030
1031// + sections immediately after each other
1032// (This isn't illegal either)
1033testValidString('+' . encode('AB', 'ASCII') . '-+' . encode('CD', 'ASCII') . '-', "\x00A\x00B\x00C\x00D", 'UTF-7', 'UTF-16BE', false);
1034
1035// + sections not immediately after each other
1036// (Just trying to be exhaustive here)
1037testValidString('+' . encode('AB', 'ASCII') . '-!+' . encode('CD', 'ASCII') . '-', "\x00A\x00B\x00!\x00C\x00D", 'UTF-7', 'UTF-16BE', false);
1038
1039// + section terminated by a non-Base64 direct character which is NOT -
1040foreach (str_split(" \t\r\n'(),.:?!\"#$%&*;<=>@[]^_`{|}\x00") as $char) {
1041  testValidString('+' . encode("\x12\x34", 'UTF-16BE') . $char, "\x00\x00\x12\x34\x00\x00\x00" . $char, 'UTF-7', 'UTF-32BE', false);
1042}
1043
1044// Non-direct character followed by direct character
1045testValidString('%A', '+ACU-A', 'ASCII', 'UTF-7');
1046testValidString('%%A', '+ACUAJQ-A', 'ASCII', 'UTF-7');
1047testValidString('%%%A', '+ACUAJQAl-A', 'ASCII', 'UTF-7');
1048
1049// Now let's see how UTF-7 can go BAD...
1050
/**
 * Base64-encodes the bytes of $str as they are and drops the '=' padding.
 *
 * Unlike encode(), no conversion to UTF-16BE happens first, so the caller can
 * place arbitrary (including malformed) byte sequences in a Base64 section.
 *
 * @param string $str Raw bytes
 * @return string Unpadded Base64 text
 */
function rawEncode($str) {
  $base64 = base64_encode($str);
  // '=' only ever occurs as trailing padding in Base64 output
  return rtrim($base64, '=');
}
1054
1055// Totally bogus byte
1056testInvalidString("\xFF", "%", 'UTF-7', 'UTF-8');
// Totally bogus codepoint... '+ACU-' is '%' in UTF-7
1058testInvalidString("\x12\x34\x56\x78", "+ACU-", 'UTF-32BE', 'UTF-7');
1059
1060// First, messed up UTF16 in + section
1061// Second half of surrogate pair coming first
1062testInvalidString('+' . rawEncode("\xDC\x01\xD8\x02") . '-', "\x00\x00\x00%\x00\x00\x00%", 'UTF-7', 'UTF-32BE');
1063testInvalidString('+' . rawEncode("\x00.\xDC\x01\xD8\x02") . '-', "\x00\x00\x00.\x00\x00\x00%\x00\x00\x00%", 'UTF-7', 'UTF-32BE');
1064testInvalidString('+' . rawEncode("\x00.\x00.\xDC\x01\xD8\x02") . '-', "\x00\x00\x00.\x00\x00\x00.\x00\x00\x00%\x00\x00\x00%", 'UTF-7', 'UTF-32BE');
1065
1066// First half of surrogate pair not followed by second half
1067testInvalidString('+' . rawEncode("\xD8\x01\x00A") . '-', "\x00\x00\x00%\x00\x00\x00A", 'UTF-7', 'UTF-32BE');
1068testInvalidString('+' . rawEncode("\xD8\x01\xD9\x02") . '-', "\x00\x00\x00%\x00\x00\x00%", 'UTF-7', 'UTF-32BE');
1069testInvalidString('+' . rawEncode("\x00.\xD8\x01\x00A") . '-', "\x00\x00\x00.\x00\x00\x00%\x00\x00\x00A", 'UTF-7', 'UTF-32BE');
1070testInvalidString('+' . rawEncode("\x00.\xD8\x01\xD9\x02") . '-', "\x00\x00\x00.\x00\x00\x00%\x00\x00\x00%", 'UTF-7', 'UTF-32BE');
1071testInvalidString('+' . rawEncode("\x00.\x00.\xD8\x01\x00A") . '-', "\x00\x00\x00.\x00\x00\x00.\x00\x00\x00%\x00\x00\x00A", 'UTF-7', 'UTF-32BE');
1072testInvalidString('+' . rawEncode("\x00.\x00.\xD8\x01\xD9\x02") . '-', "\x00\x00\x00.\x00\x00\x00.\x00\x00\x00%\x00\x00\x00%", 'UTF-7', 'UTF-32BE');
1073
1074// First half of surrogate pair appearing at end of string
1075testInvalidString('+' . rawEncode("\xD8\x01") . '-', "\x00\x00\x00%", 'UTF-7', 'UTF-32BE');
1076testInvalidString('+' . rawEncode("\xD8\x01"), "\x00\x00\x00%", 'UTF-7', 'UTF-32BE');
1077testInvalidString("+999999uJ", "\xEF\x9F\x9F\xE7\xB7\xB7%", 'UTF-7', 'UTF-8');
1078testInvalidString("+999euJ", "\xEF\x9F\x9F\xE5\xBA\xB8%", "UTF-7", "UTF-8");
1079testInvalidString("+euJ", "\xE7\xAB\xA2%", "UTF-7", "UTF-8");
1080
1081// Truncated string
1082testInvalidString('+' . rawEncode("\x01") . '-', "\x00\x00\x00%", 'UTF-7', 'UTF-32BE');
1083testInvalidString('+l', "\x00\x00\x00%", 'UTF-7', 'UTF-32BE');
1084
1085// Base64 section should not have 4 ASCII characters; the first 3 can encode one
1086// UTF-16 character, so there is no need for the 4th
1087testInvalidString('+RR8I', "\xE4\x94\x9F%", 'UTF-7', 'UTF-8');
1088// Likewise with 7 characters
1089testInvalidString('+RR8IAAA', "\xE4\x94\x9F\xE0\xA0\x80%", 'UTF-7', 'UTF-8');
1090
1091// Similarly, it is useless for a Base64 section to only contain a single 'A'
1092// (which decodes to only zero bits)
1093testInvalidString("+A", "\x00\x00\x00%", 'UTF-7', 'UTF-32BE');
1094
1095// And then, messed up Base64 encoding
1096
1097// Bad padding on + section (not zeroes)
1098$encoded = encode("\x12\x34", 'UTF-16BE'); // 3 Base64 bytes, 2 bits of padding...
1099$corrupted = substr($encoded, 0, 2) . chr(ord($encoded[2]) + 1);
1100testInvalidString('+' . $corrupted . '-', "\x00\x00\x12\x34\x00\x00\x00%", 'UTF-7', 'UTF-32BE');
1101
1102// Characters which are not Base64 (and not even ASCII) appearing in Base64 section
1103testInvalidString("+\x80", "\x00\x00\x00%", 'UTF-7', 'UTF-32BE');
1104
1105// Try codepoint over U+10FFFF; '+ACU-' is the error marker '%'
1106convertInvalidString("\x12\x34\x56\x78", "+ACU-", 'UCS-4BE', 'UTF-7');
1107convertInvalidString("\x00\x11\x56\x78", "+ACU-", 'UCS-4BE', 'UTF-7');
1108
1109// If error marker character needs to be ASCII-encoded but is able to serve as an
1110// ending character for a Base64 section, no need to add an additional dash
1111mb_substitute_character(0x3F); // ?
1112convertInvalidString("\x1E\xBE", '+AB4?', 'UTF-7', 'UTF-7');
1113
1114echo "Done!\n";
1115
1116?>
1117--EXPECT--
1118== UTF-8 ==
1119== UTF-16 ==
1120== UTF-32 ==
1121== UTF-7 ==
1122Done!
1123