--TEST--
Torture test for UTF-{7,8,16,32}
--EXTENSIONS--
mbstring
--SKIPIF--
<?php
// This test walks a very large set of codepoints; allow it to be skipped.
if (getenv("SKIP_SLOW_TESTS")) {
    die("skip slow test");
}
?>
--FILE--
<?php
// Fixed seed so that every shuffle()/rand() below is reproducible.
srand(232);
// Invalid input is replaced by '%' (0x25) in converted output.
mb_substitute_character(0x25);
include 'encoding_tests.inc';

// All ranges of valid codepoints in UnicodeData.txt.
// Each entry is an inclusive [first, last] pair.
$validRanges = [
    [0x0, 0x377], [0x37a, 0x37f], [0x384, 0x38a], [0x38c, 0x38c],
    [0x38e, 0x3a1], [0x3a3, 0x52f], [0x531, 0x556], [0x559, 0x58a],
    [0x58d, 0x58f], [0x591, 0x5c7], [0x5d0, 0x5ea], [0x5ef, 0x5f4],
    [0x600, 0x61c], [0x61e, 0x70d], [0x70f, 0x74a], [0x74d, 0x7b1],
    [0x7c0, 0x7fa], [0x7fd, 0x82d], [0x830, 0x83e], [0x840, 0x85b],
    [0x85e, 0x85e], [0x860, 0x86a], [0x8a0, 0x8b4], [0x8b6, 0x8c7],
    [0x8d3, 0x983], [0x985, 0x98c], [0x98f, 0x990], [0x993, 0x9a8],
    [0x9aa, 0x9b0], [0x9b2, 0x9b2], [0x9b6, 0x9b9], [0x9bc, 0x9c4],
    [0x9c7, 0x9c8], [0x9cb, 0x9ce], [0x9d7, 0x9d7], [0x9dc, 0x9dd],
    [0x9df, 0x9e3], [0x9e6, 0x9fe], [0xa01, 0xa03], [0xa05, 0xa0a],
    [0xa0f, 0xa10], [0xa13, 0xa28], [0xa2a, 0xa30], [0xa32, 0xa33],
    [0xa35, 0xa36], [0xa38, 0xa39], [0xa3c, 0xa3c], [0xa3e, 0xa42],
    [0xa47, 0xa48], [0xa4b, 0xa4d], [0xa51, 0xa51], [0xa59, 0xa5c],
    [0xa5e, 0xa5e], [0xa66, 0xa76], [0xa81, 0xa83], [0xa85, 0xa8d],
    [0xa8f, 0xa91], [0xa93, 0xaa8], [0xaaa, 0xab0], [0xab2, 0xab3],
    [0xab5, 0xab9], [0xabc, 0xac5], [0xac7, 0xac9], [0xacb, 0xacd],
    [0xad0, 0xad0], [0xae0, 0xae3], [0xae6, 0xaf1], [0xaf9, 0xaff],
    [0xb01, 0xb03], [0xb05, 0xb0c], [0xb0f, 0xb10], [0xb13, 0xb28],
    [0xb2a, 0xb30], [0xb32, 0xb33], [0xb35, 0xb39], [0xb3c, 0xb44],
    [0xb47, 0xb48], [0xb4b, 0xb4d], [0xb55, 0xb57], [0xb5c, 0xb5d],
    [0xb5f, 0xb63], [0xb66, 0xb77], [0xb82, 0xb83], [0xb85, 0xb8a],
    [0xb8e, 0xb90], [0xb92, 0xb95], [0xb99, 0xb9a], [0xb9c, 0xb9c],
    [0xb9e, 0xb9f], [0xba3, 0xba4], [0xba8, 0xbaa], [0xbae, 0xbb9],
    [0xbbe, 0xbc2], [0xbc6, 0xbc8], [0xbca, 0xbcd], [0xbd0, 0xbd0],
    [0xbd7, 0xbd7], [0xbe6, 0xbfa], [0xc00, 0xc0c], [0xc0e, 0xc10],
    [0xc12, 0xc28], [0xc2a, 0xc39], [0xc3d, 0xc44], [0xc46, 0xc48],
    [0xc4a, 0xc4d], [0xc55, 0xc56], [0xc58, 0xc5a], [0xc60, 0xc63],
    [0xc66, 0xc6f], [0xc77, 0xc8c], [0xc8e, 0xc90], [0xc92, 0xca8],
    [0xcaa, 0xcb3], [0xcb5, 0xcb9], [0xcbc, 0xcc4], [0xcc6, 0xcc8],
    [0xcca, 0xccd], [0xcd5, 0xcd6], [0xcde, 0xcde], [0xce0, 0xce3],
    [0xce6, 0xcef], [0xcf1, 0xcf2], [0xd00, 0xd0c], [0xd0e, 0xd10],
    [0xd12, 0xd44], [0xd46, 0xd48], [0xd4a, 0xd4f], [0xd54, 0xd63],
    [0xd66, 0xd7f], [0xd81, 0xd83], [0xd85, 0xd96], [0xd9a, 0xdb1],
    [0xdb3, 0xdbb], [0xdbd, 0xdbd], [0xdc0, 0xdc6], [0xdca, 0xdca],
    [0xdcf, 0xdd4], [0xdd6, 0xdd6], [0xdd8, 0xddf], [0xde6, 0xdef],
    [0xdf2, 0xdf4], [0xe01, 0xe3a], [0xe3f, 0xe5b], [0xe81, 0xe82],
    [0xe84, 0xe84], [0xe86, 0xe8a], [0xe8c, 0xea3], [0xea5, 0xea5],
    [0xea7, 0xebd], [0xec0, 0xec4], [0xec6, 0xec6], [0xec8, 0xecd],
    [0xed0, 0xed9], [0xedc, 0xedf], [0xf00, 0xf47], [0xf49, 0xf6c],
    [0xf71, 0xf97], [0xf99, 0xfbc], [0xfbe, 0xfcc], [0xfce, 0xfda],
    [0x1000, 0x10c5], [0x10c7, 0x10c7], [0x10cd, 0x10cd], [0x10d0, 0x1248],
    [0x124a, 0x124d], [0x1250, 0x1256], [0x1258, 0x1258], [0x125a, 0x125d],
    [0x1260, 0x1288], [0x128a, 0x128d], [0x1290, 0x12b0], [0x12b2, 0x12b5],
    [0x12b8, 0x12be], [0x12c0, 0x12c0], [0x12c2, 0x12c5], [0x12c8, 0x12d6],
    [0x12d8, 0x1310], [0x1312, 0x1315], [0x1318, 0x135a], [0x135d, 0x137c],
    [0x1380, 0x1399], [0x13a0, 0x13f5], [0x13f8, 0x13fd], [0x1400, 0x169c],
    [0x16a0, 0x16f8], [0x1700, 0x170c], [0x170e, 0x1714], [0x1720, 0x1736],
    [0x1740, 0x1753], [0x1760, 0x176c], [0x176e, 0x1770], [0x1772, 0x1773],
    [0x1780, 0x17dd], [0x17e0, 0x17e9], [0x17f0, 0x17f9], [0x1800, 0x180e],
    [0x1810, 0x1819], [0x1820, 0x1878], [0x1880, 0x18aa], [0x18b0, 0x18f5],
    [0x1900, 0x191e], [0x1920, 0x192b], [0x1930, 0x193b], [0x1940, 0x1940],
    [0x1944, 0x196d], [0x1970, 0x1974], [0x1980, 0x19ab], [0x19b0, 0x19c9],
    [0x19d0, 0x19da], [0x19de, 0x1a1b], [0x1a1e, 0x1a5e], [0x1a60, 0x1a7c],
    [0x1a7f, 0x1a89], [0x1a90, 0x1a99], [0x1aa0, 0x1aad], [0x1ab0, 0x1ac0],
    [0x1b00, 0x1b4b], [0x1b50, 0x1b7c], [0x1b80, 0x1bf3], [0x1bfc, 0x1c37],
    [0x1c3b, 0x1c49], [0x1c4d, 0x1c88], [0x1c90, 0x1cba], [0x1cbd, 0x1cc7],
    [0x1cd0, 0x1cfa], [0x1d00, 0x1df9], [0x1dfb, 0x1f15], [0x1f18, 0x1f1d],
    [0x1f20, 0x1f45], [0x1f48, 0x1f4d], [0x1f50, 0x1f57], [0x1f59, 0x1f59],
    [0x1f5b, 0x1f5b], [0x1f5d, 0x1f5d], [0x1f5f, 0x1f7d], [0x1f80, 0x1fb4],
    [0x1fb6, 0x1fc4], [0x1fc6, 0x1fd3], [0x1fd6, 0x1fdb], [0x1fdd, 0x1fef],
    [0x1ff2, 0x1ff4], [0x1ff6, 0x1ffe], [0x2000, 0x2064], [0x2066, 0x2071],
    [0x2074, 0x208e], [0x2090, 0x209c], [0x20a0, 0x20bf], [0x20d0, 0x20f0],
    [0x2100, 0x218b], [0x2190, 0x2426], [0x2440, 0x244a], [0x2460, 0x2b73],
    [0x2b76, 0x2b95], [0x2b97, 0x2c2e], [0x2c30, 0x2c5e], [0x2c60, 0x2cf3],
    [0x2cf9, 0x2d25], [0x2d27, 0x2d27], [0x2d2d, 0x2d2d], [0x2d30, 0x2d67],
    [0x2d6f, 0x2d70], [0x2d7f, 0x2d96], [0x2da0, 0x2da6], [0x2da8, 0x2dae],
    [0x2db0, 0x2db6], [0x2db8, 0x2dbe], [0x2dc0, 0x2dc6], [0x2dc8, 0x2dce],
    [0x2dd0, 0x2dd6], [0x2dd8, 0x2dde], [0x2de0, 0x2e52], [0x2e80, 0x2e99],
    [0x2e9b, 0x2ef3], [0x2f00, 0x2fd5], [0x2ff0, 0x2ffb], [0x3000, 0x303f],
    [0x3041, 0x3096], [0x3099, 0x30ff], [0x3105, 0x312f], [0x3131, 0x318e],
    [0x3190, 0x31e3], [0x31f0, 0x321e], [0x3220, 0x3400], [0x4dbf, 0x4e00],
    [0x9ffc, 0x9ffc], [0xa000, 0xa48c], [0xa490, 0xa4c6], [0xa4d0, 0xa62b],
    [0xa640, 0xa6f7], [0xa700, 0xa7bf], [0xa7c2, 0xa7ca], [0xa7f5, 0xa82c],
    [0xa830, 0xa839], [0xa840, 0xa877], [0xa880, 0xa8c5], [0xa8ce, 0xa8d9],
    [0xa8e0, 0xa953], [0xa95f, 0xa97c], [0xa980, 0xa9cd], [0xa9cf, 0xa9d9],
    [0xa9de, 0xa9fe], [0xaa00, 0xaa36], [0xaa40, 0xaa4d], [0xaa50, 0xaa59],
    [0xaa5c, 0xaac2], [0xaadb, 0xaaf6], [0xab01, 0xab06], [0xab09, 0xab0e],
    [0xab11, 0xab16], [0xab20, 0xab26], [0xab28, 0xab2e], [0xab30, 0xab6b],
    [0xab70, 0xabed], [0xabf0, 0xabf9], [0xac00, 0xac00], [0xd7a3, 0xd7a3],
    [0xd7b0, 0xd7c6], [0xd7cb, 0xd7fb], [0xd800, 0xd800], [0xdb7f, 0xdb80],
    [0xdbff, 0xdc00], [0xdfff, 0xe000], [0xf8ff, 0xfa6d], [0xfa70, 0xfad9],
    [0xfb00, 0xfb06], [0xfb13, 0xfb17], [0xfb1d, 0xfb36], [0xfb38, 0xfb3c],
    [0xfb3e, 0xfb3e], [0xfb40, 0xfb41], [0xfb43, 0xfb44], [0xfb46, 0xfbc1],
    [0xfbd3, 0xfd3f], [0xfd50, 0xfd8f], [0xfd92, 0xfdc7], [0xfdf0, 0xfdfd],
    [0xfe00, 0xfe19], [0xfe20, 0xfe52], [0xfe54, 0xfe66], [0xfe68, 0xfe6b],
    [0xfe70, 0xfe74], [0xfe76, 0xfefc], [0xfeff, 0xfeff], [0xff01, 0xffbe],
    [0xffc2, 0xffc7], [0xffca, 0xffcf], [0xffd2, 0xffd7], [0xffda, 0xffdc],
    [0xffe0, 0xffe6], [0xffe8, 0xffee], [0xfff9, 0xfffd], [0x10000, 0x1000b],
    [0x1000d, 0x10026], [0x10028, 0x1003a], [0x1003c, 0x1003d], [0x1003f, 0x1004d],
    [0x10050, 0x1005d], [0x10080, 0x100fa], [0x10100, 0x10102], [0x10107, 0x10133],
    [0x10137, 0x1018e], [0x10190, 0x1019c], [0x101a0, 0x101a0], [0x101d0, 0x101fd],
    [0x10280, 0x1029c], [0x102a0, 0x102d0], [0x102e0, 0x102fb], [0x10300, 0x10323],
    [0x1032d, 0x1034a], [0x10350, 0x1037a], [0x10380, 0x1039d], [0x1039f, 0x103c3],
    [0x103c8, 0x103d5], [0x10400, 0x1049d], [0x104a0, 0x104a9], [0x104b0, 0x104d3],
    [0x104d8, 0x104fb], [0x10500, 0x10527], [0x10530, 0x10563], [0x1056f, 0x1056f],
    [0x10600, 0x10736], [0x10740, 0x10755], [0x10760, 0x10767], [0x10800, 0x10805],
    [0x10808, 0x10808], [0x1080a, 0x10835], [0x10837, 0x10838], [0x1083c, 0x1083c],
    [0x1083f, 0x10855], [0x10857, 0x1089e], [0x108a7, 0x108af], [0x108e0, 0x108f2],
    [0x108f4, 0x108f5], [0x108fb, 0x1091b], [0x1091f, 0x10939], [0x1093f, 0x1093f],
    [0x10980, 0x109b7], [0x109bc, 0x109cf], [0x109d2, 0x10a03], [0x10a05, 0x10a06],
    [0x10a0c, 0x10a13], [0x10a15, 0x10a17], [0x10a19, 0x10a35], [0x10a38, 0x10a3a],
    [0x10a3f, 0x10a48], [0x10a50, 0x10a58], [0x10a60, 0x10a9f], [0x10ac0, 0x10ae6],
    [0x10aeb, 0x10af6], [0x10b00, 0x10b35], [0x10b39, 0x10b55], [0x10b58, 0x10b72],
    [0x10b78, 0x10b91], [0x10b99, 0x10b9c], [0x10ba9, 0x10baf], [0x10c00, 0x10c48],
    [0x10c80, 0x10cb2], [0x10cc0, 0x10cf2], [0x10cfa, 0x10d27], [0x10d30, 0x10d39],
    [0x10e60, 0x10e7e], [0x10e80, 0x10ea9], [0x10eab, 0x10ead], [0x10eb0, 0x10eb1],
    [0x10f00, 0x10f27], [0x10f30, 0x10f59], [0x10fb0, 0x10fcb], [0x10fe0, 0x10ff6],
    [0x11000, 0x1104d], [0x11052, 0x1106f], [0x1107f, 0x110c1], [0x110cd, 0x110cd],
    [0x110d0, 0x110e8], [0x110f0, 0x110f9], [0x11100, 0x11134], [0x11136, 0x11147],
    [0x11150, 0x11176], [0x11180, 0x111df], [0x111e1, 0x111f4], [0x11200, 0x11211],
    [0x11213, 0x1123e], [0x11280, 0x11286], [0x11288, 0x11288], [0x1128a, 0x1128d],
    [0x1128f, 0x1129d], [0x1129f, 0x112a9], [0x112b0, 0x112ea], [0x112f0, 0x112f9],
    [0x11300, 0x11303], [0x11305, 0x1130c], [0x1130f, 0x11310], [0x11313, 0x11328],
    [0x1132a, 0x11330], [0x11332, 0x11333], [0x11335, 0x11339], [0x1133b, 0x11344],
    [0x11347, 0x11348], [0x1134b, 0x1134d], [0x11350, 0x11350], [0x11357, 0x11357],
    [0x1135d, 0x11363], [0x11366, 0x1136c], [0x11370, 0x11374], [0x11400, 0x1145b],
    [0x1145d, 0x11461], [0x11480, 0x114c7], [0x114d0, 0x114d9], [0x11580, 0x115b5],
    [0x115b8, 0x115dd], [0x11600, 0x11644], [0x11650, 0x11659], [0x11660, 0x1166c],
    [0x11680, 0x116b8], [0x116c0, 0x116c9], [0x11700, 0x1171a], [0x1171d, 0x1172b],
    [0x11730, 0x1173f], [0x11800, 0x1183b], [0x118a0, 0x118f2], [0x118ff, 0x11906],
    [0x11909, 0x11909], [0x1190c, 0x11913], [0x11915, 0x11916], [0x11918, 0x11935],
    [0x11937, 0x11938], [0x1193b, 0x11946], [0x11950, 0x11959], [0x119a0, 0x119a7],
    [0x119aa, 0x119d7], [0x119da, 0x119e4], [0x11a00, 0x11a47], [0x11a50, 0x11aa2],
    [0x11ac0, 0x11af8], [0x11c00, 0x11c08], [0x11c0a, 0x11c36], [0x11c38, 0x11c45],
    [0x11c50, 0x11c6c], [0x11c70, 0x11c8f], [0x11c92, 0x11ca7], [0x11ca9, 0x11cb6],
    [0x11d00, 0x11d06], [0x11d08, 0x11d09], [0x11d0b, 0x11d36], [0x11d3a, 0x11d3a],
    [0x11d3c, 0x11d3d], [0x11d3f, 0x11d47], [0x11d50, 0x11d59], [0x11d60, 0x11d65],
    [0x11d67, 0x11d68], [0x11d6a, 0x11d8e], [0x11d90, 0x11d91], [0x11d93, 0x11d98],
    [0x11da0, 0x11da9], [0x11ee0, 0x11ef8], [0x11fb0, 0x11fb0], [0x11fc0, 0x11ff1],
    [0x11fff, 0x12399], [0x12400, 0x1246e], [0x12470, 0x12474], [0x12480, 0x12543],
    [0x13000, 0x1342e], [0x13430, 0x13438], [0x14400, 0x14646], [0x16800, 0x16a38],
    [0x16a40, 0x16a5e], [0x16a60, 0x16a69], [0x16a6e, 0x16a6f], [0x16ad0, 0x16aed],
    [0x16af0, 0x16af5], [0x16b00, 0x16b45], [0x16b50, 0x16b59], [0x16b5b, 0x16b61],
    [0x16b63, 0x16b77], [0x16b7d, 0x16b8f], [0x16e40, 0x16e9a], [0x16f00, 0x16f4a],
    [0x16f4f, 0x16f87], [0x16f8f, 0x16f9f], [0x16fe0, 0x16fe4], [0x16ff0, 0x16ff1],
    [0x17000, 0x17000], [0x187f7, 0x187f7], [0x18800, 0x18cd5], [0x18d00, 0x18d00],
    [0x18d08, 0x18d08], [0x1b000, 0x1b11e], [0x1b150, 0x1b152], [0x1b164, 0x1b167],
    [0x1b170, 0x1b2fb], [0x1bc00, 0x1bc6a], [0x1bc70, 0x1bc7c], [0x1bc80, 0x1bc88],
    [0x1bc90, 0x1bc99], [0x1bc9c, 0x1bca3], [0x1d000, 0x1d0f5], [0x1d100, 0x1d126],
    [0x1d129, 0x1d1e8], [0x1d200, 0x1d245], [0x1d2e0, 0x1d2f3], [0x1d300, 0x1d356],
    [0x1d360, 0x1d378], [0x1d400, 0x1d454], [0x1d456, 0x1d49c], [0x1d49e, 0x1d49f],
    [0x1d4a2, 0x1d4a2], [0x1d4a5, 0x1d4a6], [0x1d4a9, 0x1d4ac], [0x1d4ae, 0x1d4b9],
    [0x1d4bb, 0x1d4bb], [0x1d4bd, 0x1d4c3], [0x1d4c5, 0x1d505], [0x1d507, 0x1d50a],
    [0x1d50d, 0x1d514], [0x1d516, 0x1d51c], [0x1d51e, 0x1d539], [0x1d53b, 0x1d53e],
    [0x1d540, 0x1d544], [0x1d546, 0x1d546], [0x1d54a, 0x1d550], [0x1d552, 0x1d6a5],
    [0x1d6a8, 0x1d7cb], [0x1d7ce, 0x1da8b], [0x1da9b, 0x1da9f], [0x1daa1, 0x1daaf],
    [0x1e000, 0x1e006], [0x1e008, 0x1e018], [0x1e01b, 0x1e021], [0x1e023, 0x1e024],
    [0x1e026, 0x1e02a], [0x1e100, 0x1e12c], [0x1e130, 0x1e13d], [0x1e140, 0x1e149],
    [0x1e14e, 0x1e14f], [0x1e2c0, 0x1e2f9], [0x1e2ff, 0x1e2ff], [0x1e800, 0x1e8c4],
    [0x1e8c7, 0x1e8d6], [0x1e900, 0x1e94b], [0x1e950, 0x1e959], [0x1e95e, 0x1e95f],
    [0x1ec71, 0x1ecb4], [0x1ed01, 0x1ed3d], [0x1ee00, 0x1ee03], [0x1ee05, 0x1ee1f],
    [0x1ee21, 0x1ee22], [0x1ee24, 0x1ee24], [0x1ee27, 0x1ee27], [0x1ee29, 0x1ee32],
    [0x1ee34, 0x1ee37], [0x1ee39, 0x1ee39], [0x1ee3b, 0x1ee3b], [0x1ee42, 0x1ee42],
    [0x1ee47, 0x1ee47], [0x1ee49, 0x1ee49], [0x1ee4b, 0x1ee4b], [0x1ee4d, 0x1ee4f],
    [0x1ee51, 0x1ee52], [0x1ee54, 0x1ee54], [0x1ee57, 0x1ee57], [0x1ee59, 0x1ee59],
    [0x1ee5b, 0x1ee5b], [0x1ee5d, 0x1ee5d], [0x1ee5f, 0x1ee5f], [0x1ee61, 0x1ee62],
    [0x1ee64, 0x1ee64], [0x1ee67, 0x1ee6a], [0x1ee6c, 0x1ee72], [0x1ee74, 0x1ee77],
    [0x1ee79, 0x1ee7c], [0x1ee7e, 0x1ee7e], [0x1ee80, 0x1ee89], [0x1ee8b, 0x1ee9b],
    [0x1eea1, 0x1eea3], [0x1eea5, 0x1eea9], [0x1eeab, 0x1eebb], [0x1eef0, 0x1eef1],
    [0x1f000, 0x1f02b], [0x1f030, 0x1f093], [0x1f0a0, 0x1f0ae], [0x1f0b1, 0x1f0bf],
    [0x1f0c1, 0x1f0cf], [0x1f0d1, 0x1f0f5], [0x1f100, 0x1f1ad], [0x1f1e6, 0x1f202],
    [0x1f210, 0x1f23b], [0x1f240, 0x1f248], [0x1f250, 0x1f251], [0x1f260, 0x1f265],
    [0x1f300, 0x1f6d7], [0x1f6e0, 0x1f6ec], [0x1f6f0, 0x1f6fc], [0x1f700, 0x1f773],
    [0x1f780, 0x1f7d8], [0x1f7e0, 0x1f7eb], [0x1f800, 0x1f80b], [0x1f810, 0x1f847],
    [0x1f850, 0x1f859], [0x1f860, 0x1f887], [0x1f890, 0x1f8ad], [0x1f8b0, 0x1f8b1],
    [0x1f900, 0x1f978], [0x1f97a, 0x1f9cb], [0x1f9cd, 0x1fa53], [0x1fa60, 0x1fa6d],
    [0x1fa70, 0x1fa74], [0x1fa78, 0x1fa7a], [0x1fa80, 0x1fa86], [0x1fa90, 0x1faa8],
    [0x1fab0, 0x1fab6], [0x1fac0, 0x1fac2], [0x1fad0, 0x1fad6], [0x1fb00, 0x1fb92],
    [0x1fb94, 0x1fbca], [0x1fbf0, 0x1fbf9], [0x20000, 0x20000], [0x2a6dd, 0x2a6dd],
    [0x2a700, 0x2a700], [0x2b734, 0x2b734], [0x2b740, 0x2b740], [0x2b81d, 0x2b81d],
    [0x2b820, 0x2b820], [0x2cea1, 0x2cea1], [0x2ceb0, 0x2ceb0], [0x2ebe0, 0x2ebe0],
    [0x2f800, 0x2fa1d], [0x30000, 0x30000], [0x3134a, 0x3134a], [0xe0001, 0xe0001],
    [0xe0020, 0xe007f], [0xe0100, 0xe01ef], [0xf0000, 0xf0000], [0xffffd, 0xffffd],
    [0x100000, 0x100000], [0x10fffd, 0x10fffd],
];

// in UTF-32BE
// Expand the inclusive ranges into a set keyed by each codepoint's 4-byte
// big-endian encoding (pack 'N'). Surrogates (U+D800..U+DFFF) and U+FEFF
// are left out of the set.
$validCodepoints = [];

foreach ($validRanges as [$first, $last]) {
    for ($cp = $first; $cp <= $last; $cp++) {
        $isSurrogate = $cp >= 0xD800 && $cp <= 0xDFFF;
        if ($isSurrogate || $cp === 0xFEFF) {
            continue;
        }
        $validCodepoints[pack('N', $cp)] = true;
    }
}

/**
 * Convert every valid codepoint from UTF-32BE to $encoding, in shuffled
 * chunks of 20-30 codepoints, and hand each result to testValidString().
 *
 * Dies with a hex dump of the input if mb_convert_encoding() returns false.
 *
 * @param string $encoding Target encoding name understood by mbstring.
 */
function testValidCodepoints($encoding) {
    global $validCodepoints;

    $pool = array_keys($validCodepoints);
    shuffle($pool);

    while (count($pool) > 0) {
        // The final chunk may be shorter than 20 codepoints.
        $chunkLength = min(rand(20,30), count($pool));
        $utf32 = '';
        for ($taken = 0; $taken < $chunkLength; $taken++) {
            $utf32 .= array_pop($pool);
        }

        $converted = mb_convert_encoding($utf32, $encoding, 'UTF-32BE');
        if ($converted === false) {
            die("mb_convert_encoding failed to convert UTF-32BE to $encoding." .
                "\nString: " . bin2hex($utf32));
        }
        testValidString($converted, $utf32, $encoding, 'UTF-32BE');
    }
}

/**
 * For each invalid byte sequence, append one randomly chosen valid codepoint
 * (encoded in $encoding) and check via testInvalidString() that the result
 * converts to the expected UTF-32BE output followed by that same codepoint.
 *
 * @param array  $invalid  Map of invalid input bytes => expected UTF-32BE output.
 * @param string $encoding Encoding the invalid input is written in.
 */
function testInvalidCodepoints($invalid, $encoding) {
    global $validCodepoints;

    $pool = array_keys($validCodepoints);
    shuffle($pool);

    foreach ($invalid as $badBytes => $expectedPrefix) {
        $trailer = array_pop($pool);
        $input = $badBytes . mb_convert_encoding($trailer, $encoding, 'UTF-32BE');
        testInvalidString($input, $expectedPrefix . $trailer, $encoding, 'UTF-32BE');
    }
}

echo "== UTF-8 ==\n";

testValidCodepoints('UTF-8');

testValidString('', '', 'UTF-8', 'UTF-32BE');

$invalid = [
    // Codepoints outside of valid 0-0x10FFFF range for Unicode
    "\xF4\x90\x80\x80" => str_repeat("\x00\x00\x00%", 4), // CP 0x110000
    "\xF7\x80\x80\x80" => str_repeat("\x00\x00\x00%", 4), // CP 0x1C0000
    "\xF7\xBF\xBF\xBF" => str_repeat("\x00\x00\x00%", 4), // CP 0x1FFFFF

    // Reserved range for UTF-16 surrogate pairs
    "\xED\xA0\x80" => str_repeat("\x00\x00\x00%", 3), // CP 0xD800
    "\xED\xAF\xBF" => str_repeat("\x00\x00\x00%", 3), // CP 0xDBFF
    "\xED\xBF\xBF" => str_repeat("\x00\x00\x00%", 3), // CP 0xDFFF

    // Truncated characters
    "\xDF" => "\x00\x00\x00%", // should have been 2-byte
    "\xEF\xBF" => "\x00\x00\x00%", // should have been 3-byte
    "\xF0\xBF\xBF" => "\x00\x00\x00%", // should have been 4-byte
    "\xF1\x96" => "\x00\x00\x00%",
    "\xF1\x96\x80" => "\x00\x00\x00%",
    "\xF2\x94" => "\x00\x00\x00%",
    "\xF2\x94\x80" => "\x00\x00\x00%",
    "\xF3\x94" => "\x00\x00\x00%",
    "\xF3\x94\x80" => "\x00\x00\x00%",
    "\xE0\x9F" => "\x00\x00\x00%\x00\x00\x00%",
    "\xED\xA6" => "\x00\x00\x00%\x00\x00\x00%",

    // Multi-byte characters which end too soon and go to ASCII
    "\xDFA" => "\x00\x00\x00%\x00\x00\x00A",
    "\xEF\xBFA" => "\x00\x00\x00%\x00\x00\x00A",
    "\xF0\xBFA" => "\x00\x00\x00%\x00\x00\x00A",
    "\xF0\xBF\xBFA" => "\x00\x00\x00%\x00\x00\x00A",

    // Multi-byte characters which end too soon and go to another MB char
    "\xDF\xDF\xBF" => "\x00\x00\x00%\x00\x00\x07\xFF",
    "\xEF\xBF\xDF\xBF" => "\x00\x00\x00%\x00\x00\x07\xFF",
    "\xF0\xBF\xBF\xDF\xBF" => "\x00\x00\x00%\x00\x00\x07\xFF",

    // Multi-byte characters which end too soon and go to a junk byte
    // (which isn't even valid to start a new character)
    "\xF0\xBF\xBF\xFF" => str_repeat("\x00\x00\x00%", 2),
    "\xF0\xBF\xFF" => str_repeat("\x00\x00\x00%", 2),

    // Continuation bytes which appear outside of a MB char
    "\x80" => "\x00\x00\x00%",
    "A\x80" => "\x00\x00\x00A\x00\x00\x00%",
    "\xDF\xBF\x80" => "\x00\x00\x07\xFF\x00\x00\x00%",

    // Overlong code units
    // (using more bytes than needed to encode a character)
    "\xC1\xBF" => str_repeat("\x00\x00\x00%", 2), // didn't need 2 bytes
    "\xE0\x9F\xBF" => str_repeat("\x00\x00\x00%", 3), // didn't need 3 bytes
    "\xF0\x8F\xBF\xBF" => str_repeat("\x00\x00\x00%", 4), // didn't need 4 bytes
];

testInvalidCodepoints($invalid, 'UTF-8');

// Regression test for bug in SSE2-based accelerated UTF-8 validation function.
// Every string is exactly 16 bytes long: a fixed 3-byte head, NUL filler, and
// a truncated multi-byte sequence at the very end.
$truncated16byte = array_map(
    function ($tail) {
        return "k\x08`" . str_repeat("\x00", 13 - strlen($tail)) . $tail;
    },
    ["\xc6", "\xef", "\xef\xbf", "\xf0", "\xf0\xbf", "\xf0\xbf\xbf"]
);
foreach ($truncated16byte as $trunc) {
    if (mb_check_encoding($trunc, 'UTF-8')) {
        die("UTF-8 validation was incorrect on 16-byte string with truncated multi-byte char at end");
    }
}

echo "== UTF-16 ==\n";

foreach (["UTF-16", "UTF-16LE", "UTF-16BE"] as $utf16Variant) {
    testValidCodepoints($utf16Variant);
}

foreach (['UTF-16', 'UTF-16LE', 'UTF-16BE'] as $utf16Variant) {
    testValidString('', '', $utf16Variant, 'UTF-32BE');
}

// UTF-16 _cannot_ represent codepoints bigger than 0x10FFFF, so we're not
// worried about that. But there are plenty of other ways to mess up...
$invalid = [
    // Second half of surrogate pair comes first
    "\xDC\x01\xD8\x02" => "\x00\x00\x00%\x00\x00\x00%",

    // First half of surrogate pair not followed by second part
    "\xD8\x01\x00A" => "\x00\x00\x00%\x00\x00\x00A",

    // First half of surrogate pair at end of string
    "\xD8\x01" => "\x00\x00\x00%",
];

foreach (['UTF-16', 'UTF-16BE'] as $bigEndianVariant) {
    testInvalidCodepoints($invalid, $bigEndianVariant);
}

// Truncated strings (odd number of bytes)
foreach (['UTF-16', 'UTF-16BE'] as $bigEndianVariant) {
    testInvalidString("\x00", "\x00\x00\x00%", $bigEndianVariant, 'UTF-32BE');
    testInvalidString("\x00A\x01", "\x00\x00\x00A\x00\x00\x00%", $bigEndianVariant, 'UTF-32BE');
}

// Little-endian versions of the malformed surrogate sequences
$invalid = [
    // Second half of surrogate pair comes first
    "\x01\xDC\x02\xD8" => "\x00\x00\x00%\x00\x00\x00%",

    // First half of surrogate pair not followed by second part
    "\x01\xD8A\x00" => "\x00\x00\x00%\x00\x00\x00A",

    // First half of surrogate pair at end of string
    "\x01\xD8" => "\x00\x00\x00%",

    // Two successive codepoints which are both the 1st part of a surrogate pair
    "\x01\xD8\x02\xD8" => "\x00\x00\x00%\x00\x00\x00%",
];

testInvalidCodepoints($invalid, 'UTF-16LE');

// Truncated
testInvalidString("\x00", "\x00\x00\x00%", 'UTF-16LE', 'UTF-32BE');
testInvalidString("A\x00\x01", "\x00\x00\x00A\x00\x00\x00%", 'UTF-16LE', 'UTF-32BE');

// Test treatment of BOM: the BOM selects the byte order and is not emitted
testValidString("\xFE\xFF\x12\x34", "\x00\x00\x12\x34", 'UTF-16', 'UTF-32BE', false);
testValidString("\xFF\xFE\x12\x34", "\x00\x00\x34\x12", 'UTF-16', 'UTF-32BE', false);

// Test treatment of (illegal) codepoints between U+D800 and U+DFFF:
// UCS-2BE input is expected to come out byte-for-byte identical as UTF-16BE
foreach (["\xD8\x00", "\xDB\xFF", "\xDC\x00"] as $loneSurrogate) {
    testValidString($loneSurrogate, $loneSurrogate, 'UCS-2BE', 'UTF-16BE', false);
}
// Lone surrogates in UCS-2BE input are expected to come out byte-swapped
// when the target is UTF-16LE
testValidString("\xD8\x00", "\x00\xD8", 'UCS-2BE', 'UTF-16LE', false);
testValidString("\xDC\x00", "\x00\xDC", 'UCS-2BE', 'UTF-16LE', false);

// Try codepoint over U+10FFFF
convertInvalidString("\x00\x11\x56\x78", "\x00%", 'UCS-4BE', 'UTF-16BE');
convertInvalidString("\x00\x11\x56\x78", "%\x00", 'UCS-4BE', 'UTF-16LE');

// Regression tests for bugs with initial AVX2-accelerated implementation
convertInvalidString(str_repeat("a\x00", 15) . "\x00\xD8\x00\xFC", str_repeat("\x00a", 15) . "\x00%\xFC\x00", 'UTF-16LE', 'UCS-2BE');
convertInvalidString(str_repeat("\x00a", 15) . "\xD8\x00\xFC\x00", str_repeat("\x00a", 15) . "\x00%\xFC\x00", 'UTF-16BE', 'UCS-2BE');

// This string caused an out-of-bounds read; it was found by a fuzzer
$str = "\xdb\xdb\xdb#\xdb\xdb\xdf\xdb\xdf\xdb\xdb\x0b\xdb\x00\xdc\xdb\xdf\xdb\xdf\xdb\xda\x0b\xdb\x00\xdcY\xdf\x03\xdb\x03\xd9\xd9\xd8";
convertInvalidString($str, "\x00\x25\x00\x25\xdb\xdb\xdf\xdb\x00\x25\x00\x25\xdb\x00\xdc\xdb\x00\x25\x00\x25\x00\x25\xdb\x00\xdc\x59\x00\x25\x00\x25\x00\x25\x00\x25", 'UTF-16BE', 'UTF-16BE');

$str = "\xda\xda\xda\xda\xda\xda\xd9\xdb\xda\xda\xda\xda\xdd\xda\xda\xd9\xdb\xda\xda\xda\xda\xdd\xda\xdd\xd9\x0a\xda\xda\xda\xda\xdd\xda\xdd\xd9\xda\xda\xda\xda\xda\xda\xda\xda\xda\xd9\xdb\xda\xda\xda\xd9\xdb\xda\xda\xda\xda\xdd\xda\xda\xd9\xdb";
convertInvalidString($str, "\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\xda\xda\xda\xdd\x25\x00\xd9\x0a\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00\x25\x00", 'UTF-16LE', 'UTF-16LE');

echo "== UTF-32 ==\n";

testValidCodepoints("UTF-32LE");
testValidCodepoints("UTF-32BE");

// Empty string
testValidString('', '', 'UTF-32', 'UTF-32BE');
testValidString('', '', 'UTF-32BE', 'UTF-32');
testValidString('', '', 'UTF-32LE', 'UTF-32BE');

// Big-endian code units which are not valid codepoints; each one is
// expected to become a single error marker
$invalid = array(
    // Codepoints which are too big
    "\x00\x11\x00\x00" => "\x00\x00\x00%",
    "\x80\x00\x00\x00" => "\x00\x00\x00%",
    "\xff\xff\xfe\xff" => "\x00\x00\x00%",

    // Surrogates
    "\x00\x00\xd8\x00" => "\x00\x00\x00%",
    "\x00\x00\xdb\xff" => "\x00\x00\x00%",
    "\x00\x00\xdc\x00" => "\x00\x00\x00%",
    "\x00\x00\xdf\xff" => "\x00\x00\x00%",
);

testInvalidCodepoints($invalid, 'UTF-32');
testInvalidCodepoints($invalid, 'UTF-32BE');

// Truncated code units: 3, 2 and 1 leftover byte(s), for both UTF-32 and
// UTF-32BE input. (The UTF-32BE group previously repeated the 1-byte case
// three times, so the 3- and 2-byte truncations were never exercised.)
testInvalidString("\x00\x01\x01", "\x00\x00\x00%", 'UTF-32', 'UTF-32BE');
testInvalidString("\x00\x01", "\x00\x00\x00%", 'UTF-32', 'UTF-32BE');
testInvalidString("\x00", "\x00\x00\x00%", 'UTF-32', 'UTF-32BE');
testInvalidString("\x00\x01\x01", "\x00\x00\x00%", 'UTF-32BE', 'UTF-32');
testInvalidString("\x00\x01", "\x00\x00\x00%", 'UTF-32BE', 'UTF-32');
testInvalidString("\x00", "\x00\x00\x00%", 'UTF-32BE', 'UTF-32');

// The same invalid code units, written little-endian
$invalid = array(
    // Codepoints which are too big
    "\x00\x00\x11\x00" => "\x00\x00\x00%",
    "\x00\x00\x00\x80" => "\x00\x00\x00%",
    "\xff\xfe\xff\xff" => "\x00\x00\x00%",

    // Surrogates
    "\x00\xd8\x00\x00" => "\x00\x00\x00%",
    "\xff\xdb\x00\x00" => "\x00\x00\x00%",
    "\x00\xdc\x00\x00" => "\x00\x00\x00%",
    "\xff\xdf\x00\x00" => "\x00\x00\x00%",
);

testInvalidCodepoints($invalid, 'UTF-32LE');

// Truncated code units
testInvalidString("\x00\x01\x01", "\x00\x00\x00%", 'UTF-32LE', 'UTF-32BE');
testInvalidString("\x00\x01", "\x00\x00\x00%", 'UTF-32LE', 'UTF-32BE');
testInvalidString("\x00", "\x00\x00\x00%", 'UTF-32LE', 'UTF-32BE');

// Test treatment of BOM: the BOM selects the byte order and is not emitted
testValidString("\x00\x00\xFE\xFF\x00\x00\x12\x34", "\x00\x00\x12\x34", 'UTF-32', 'UTF-32BE', false);
testValidString("\xFF\xFE\x00\x00\x12\x34\x00\x00", "\x00\x00\x34\x12", 'UTF-32', 'UTF-32BE', false);

// Test treatment of (illegal) codepoints between U+D800 and U+DFFF
testValidString("\xD8\x00", "\x00\x00\xD8\x00", 'UCS-2BE', 'UTF-32BE', false);
// Remaining lone-surrogate cases: UCS-2BE input widened to UTF-32BE / UTF-32LE
testValidString("\xDB\xFF", "\x00\x00\xDB\xFF", 'UCS-2BE', 'UTF-32BE', false);
testValidString("\xDC\x00", "\x00\x00\xDC\x00", 'UCS-2BE', 'UTF-32BE', false);
testValidString("\xD8\x00", "\x00\xD8\x00\x00", 'UCS-2BE', 'UTF-32LE', false);
testValidString("\xDC\x00", "\x00\xDC\x00\x00", 'UCS-2BE', 'UTF-32LE', false);

echo "== UTF-7 ==\n";

testValidString('', '', 'UTF-7', 'UTF-32BE');

// 'Direct' characters (letters, digits, a few punctuation marks), followed
// by whitespace and NUL: each is a single byte in UTF-7.
// 'Optional direct' characters are Base64-encoded in mbstring's implementation
// and therefore are not part of this list.
$singleByteChars = array_merge(
    array_map('chr', range(ord('A'), ord('Z'))),
    array_map('chr', range(ord('a'), ord('z'))),
    array_map('chr', range(ord('0'), ord('9'))),
    str_split("'(),-./:?"),
    str_split(" \t\r\n\x00")
);
foreach ($singleByteChars as $char) {
    testValidString($char, "\x00\x00\x00" . $char, 'UTF-7', 'UTF-32BE');
}

// Encoding + as +-
testValidString('+-', "\x00\x00\x00+", 'UTF-7', 'UTF-32BE', false);

/**
 * Convert $str from $encoding to UTF-16BE and Base64-encode the result
 * the way UTF-7 does, i.e. without '=' padding.
 */
function encode($str, $encoding) {
    $utf16 = mb_convert_encoding($str, 'UTF-16BE', $encoding);
    return rtrim(base64_encode($utf16), '=');
}

// Every byte value 0-255, written as a Base64 section. For characters which
// the tests above write as a single byte, the Base64 form is not the one
// expected back from a round trip, so those are checked one-way only.
for ($i = 0; $i < 256; $i++) {
    $char = chr($i);
    $hasSingleByteForm = ($i >= ord('A') && $i <= ord('Z'))
        || ($i >= ord('a') && $i <= ord('z'))
        || ($i >= ord('0') && $i <= ord('9'))
        || str_contains("'(),-./:?\x00 \t\r\n", $char);
    $reversible = !$hasSingleByteForm;

    testValidString('+' . encode("\x00" . $char, 'UTF-16BE') . '-', "\x00\x00\x00" . $char, 'UTF-7', 'UTF-32BE', $reversible);
}

testValidString('+' . encode("\x12\x34", 'UTF-16BE') . '-', "\x00\x00\x12\x34", 'UTF-7', 'UTF-32BE');
testValidString('+' . encode("\x12\x34\x56\x78", 'UTF-16BE') . '-', "\x00\x00\x12\x34\x00\x00\x56\x78", 'UTF-7', 'UTF-32BE');
testValidString('+' . encode("\x12\x34\x56\x78\x00\x40", 'UTF-16BE') . '-', "\x00\x00\x12\x34\x00\x00\x56\x78\x00\x00\x00\x40", 'UTF-7', 'UTF-32BE');
testValidString('+' . encode("\xFF\xEE\xEE\xFF", 'UTF-16BE') . '-', "\x00\x00\xFF\xEE\x00\x00\xEE\xFF", 'UTF-7', 'UTF-32BE');

// Surrogate pair: the UTF-32BE text is both the source for the Base64
// section and the expected decoded result
$utf32 = "\x00\x01\x04\x00";
testValidString('+' . encode($utf32, 'UTF-32BE') . '-', $utf32, 'UTF-7', 'UTF-32BE');
$utf32 = "\x00\x00\x00A\x00\x01\x04\x00\x00\x00\x00B";
testValidString('+' . encode($utf32, 'UTF-32BE') . '-', $utf32, 'UTF-7', 'UTF-32BE', false);
$utf32 = "\x00\x01\x04\x00\x00\x01\x04\x00";
testValidString('+' . encode($utf32, 'UTF-32BE') . '-', $utf32, 'UTF-7', 'UTF-32BE');

// Unterminated + section
// (This is not considered illegal)
testValidString('+' . encode('ABC', 'ASCII'), "\x00A\x00B\x00C", 'UTF-7', 'UTF-16BE', false);

// + sections immediately after each other
// (This isn't illegal either)
testValidString('+' . encode('AB', 'ASCII') . '-+' . encode('CD', 'ASCII') . '-', "\x00A\x00B\x00C\x00D", 'UTF-7', 'UTF-16BE', false);

// + sections not immediately after each other
// (Just trying to be exhaustive here)
testValidString('+' . encode('AB', 'ASCII') . '-!+' . encode('CD', 'ASCII') . '-', "\x00A\x00B\x00!\x00C\x00D", 'UTF-7', 'UTF-16BE', false);

// + section terminated by a non-Base64 direct character which is NOT -
foreach (str_split(" \t\r\n'(),.:?!\"#$%&*;<=>@[]^_`{|}\x00") as $terminator) {
    testValidString('+' . encode("\x12\x34", 'UTF-16BE') . $terminator, "\x00\x00\x12\x34\x00\x00\x00" . $terminator, 'UTF-7', 'UTF-32BE', false);
}

// Non-direct character followed by direct character
testValidString('%A', '+ACU-A', 'ASCII', 'UTF-7');
testValidString('%%A', '+ACUAJQ-A', 'ASCII', 'UTF-7');
testValidString('%%%A', '+ACUAJQAl-A', 'ASCII', 'UTF-7');

// Now let's see how UTF-7 can go BAD...

/**
 * Base64-encode raw bytes without '=' padding. Unlike encode(), no charset
 * conversion is done, so malformed UTF-16 can be smuggled into a + section.
 */
function rawEncode($str) {
    return rtrim(base64_encode($str), '=');
}

// Totally bogus byte
testInvalidString("\xFF", "%", 'UTF-7', 'UTF-8');
// Totally bogus codepoint... '+ACU-' is '%' in UTF-7
testInvalidString("\x12\x34\x56\x78", "+ACU-", 'UTF-32BE', 'UTF-7');

// First, messed up UTF-16 in + section.
// Each case is run with 0, 1 and 2 valid '.' characters in front of it.

// Second half of surrogate pair coming first
for ($dots = 0; $dots <= 2; $dots++) {
    $utf16Dots = str_repeat("\x00.", $dots);
    $utf32Dots = str_repeat("\x00\x00\x00.", $dots);
    testInvalidString('+' . rawEncode($utf16Dots . "\xDC\x01\xD8\x02") . '-', $utf32Dots . "\x00\x00\x00%\x00\x00\x00%", 'UTF-7', 'UTF-32BE');
}

// First half of surrogate pair not followed by second half
for ($dots = 0; $dots <= 2; $dots++) {
    $utf16Dots = str_repeat("\x00.", $dots);
    $utf32Dots = str_repeat("\x00\x00\x00.", $dots);
    testInvalidString('+' . rawEncode($utf16Dots . "\xD8\x01\x00A") . '-', $utf32Dots . "\x00\x00\x00%\x00\x00\x00A", 'UTF-7', 'UTF-32BE');
    testInvalidString('+' . rawEncode($utf16Dots . "\xD8\x01\xD9\x02") . '-', $utf32Dots . "\x00\x00\x00%\x00\x00\x00%", 'UTF-7', 'UTF-32BE');
}

// First half of surrogate pair appearing at end of string
testInvalidString('+' . rawEncode("\xD8\x01") . '-', "\x00\x00\x00%", 'UTF-7', 'UTF-32BE');
testInvalidString('+' . rawEncode("\xD8\x01"), "\x00\x00\x00%", 'UTF-7', 'UTF-32BE');
testInvalidString("+999999uJ", "\xEF\x9F\x9F\xE7\xB7\xB7%", 'UTF-7', 'UTF-8');
testInvalidString("+999euJ", "\xEF\x9F\x9F\xE5\xBA\xB8%", "UTF-7", "UTF-8");
testInvalidString("+euJ", "\xE7\xAB\xA2%", "UTF-7", "UTF-8");

// Truncated string
testInvalidString('+' . rawEncode("\x01") . '-', "\x00\x00\x00%", 'UTF-7', 'UTF-32BE');
testInvalidString('+l', "\x00\x00\x00%", 'UTF-7', 'UTF-32BE');

// Base64 section should not have 4 ASCII characters; the first 3 can encode one
// UTF-16 character, so there is no need for the 4th
testInvalidString('+RR8I', "\xE4\x94\x9F%", 'UTF-7', 'UTF-8');
// Likewise with 7 characters
testInvalidString('+RR8IAAA', "\xE4\x94\x9F\xE0\xA0\x80%", 'UTF-7', 'UTF-8');

// Similarly, it is useless for a Base64 section to only contain a single 'A'
// (which decodes to only zero bits)
testInvalidString("+A", "\x00\x00\x00%", 'UTF-7', 'UTF-32BE');

// And then, messed up Base64 encoding

// Bad padding on + section (not zeroes):
// 3 Base64 bytes carry 16 data bits plus 2 bits of padding, so bumping the
// last Base64 character by one sets a padding bit
$encoded = encode("\x12\x34", 'UTF-16BE');
$corrupted = substr($encoded, 0, 2) . chr(ord($encoded[2]) + 1);
testInvalidString('+' . $corrupted . '-', "\x00\x00\x12\x34\x00\x00\x00%", 'UTF-7', 'UTF-32BE');

// Characters which are not Base64 (and not even ASCII) appearing in Base64 section
testInvalidString("+\x80", "\x00\x00\x00%", 'UTF-7', 'UTF-32BE');

// Try codepoint over U+10FFFF; '+ACU-' is the error marker '%'
convertInvalidString("\x12\x34\x56\x78", "+ACU-", 'UCS-4BE', 'UTF-7');
convertInvalidString("\x00\x11\x56\x78", "+ACU-", 'UCS-4BE', 'UTF-7');

// If error marker character needs to be ASCII-encoded but is able to serve as an
// ending character for a Base64 section, no need to add an additional dash
mb_substitute_character(0x3F); // '?'
convertInvalidString("\x1E\xBE", '+AB4?', 'UTF-7', 'UTF-7');

echo "Done!\n";

?>
--EXPECT--
== UTF-8 ==
== UTF-16 ==
== UTF-32 ==
== UTF-7 ==
Done!