--TEST--
Unicode standard conformance test (ill-formed UTF sequences.)
--SKIPIF--
<?php extension_loaded('mbstring') or die('skip mbstring not available'); ?>
--FILE--
<?php
/**
 * Convert $str from $enc to UCS-4BE and check whether the result consists of
 * exactly $n U+FFFD code units (optionally preceded by a U+FEFF BOM).
 *
 * @param string $str      Raw input bytes to convert.
 * @param int    $n        Number of U+FFFD code units expected in the output.
 * @param string $enc      Source encoding name passed to mb_convert_encoding().
 * @param bool   $with_bom When true, the expected output starts with "0000feff".
 *
 * @return false|string false when the hex dump of the converted output equals
 *                      the expected pattern; otherwise the hex dump itself, so
 *                      that var_dump() shows what was actually produced.
 */
function chk_enc($str, $n, $enc = "UTF-8", $with_bom = false) {
    $src = bin2hex(mb_convert_encoding($str, "UCS-4BE", $enc));

    $dst = str_repeat("0000fffd", $n);
    if ($with_bom) {
        $dst = "0000feff" . $dst;
    }

    // Strict comparison: hex dumps such as "0000e000" look like numbers in
    // exponent notation, and loose `==` compares numeric strings numerically.
    return ($dst === $src) ? false : $src;
}

// Every ill-formed unit is replaced by U+FFFD, which is what chk_enc() counts.
mb_substitute_character(0xfffd);


echo "UTF-8 redundancy\n";
// Plain ASCII passes through; $n = 0 never matches, so the hex dump is printed.
var_dump(chk_enc("\x31\x32\x33", 0));
var_dump(chk_enc("\x41\x42\x43", 0));
// ASCII code points written as longer (overlong) sequences of 2 to 6 bytes.
var_dump(chk_enc("\xc0\xb1\xc0\xb2\xc0\xb3", 6));
var_dump(chk_enc("\xc1\x81\xc1\x82\xc1\x83", 6));
var_dump(chk_enc("\xe0\x80\xb1\xe0\x80\xb2\xe0\x80\xb3", 9));
var_dump(chk_enc("\xe0\x81\x81\xe0\x81\x82\xe0\x81\x83", 9));
var_dump(chk_enc("\xf0\x80\x80\xb1\xf0\x80\x80\xb2\xf0\x80\x80\xb3", 12));
var_dump(chk_enc("\xf0\x80\x81\x81\xf0\x80\x81\x82\xf0\x81\x83", 11));
var_dump(chk_enc("\xf8\x80\x80\x80\xb1\xf8\x80\x80\x80\xb2\xf8\x80\x80\x80\xb3", 15));
var_dump(chk_enc("\xf8\x80\x80\x81\x81\xf8\x80\x80\x81\x82\xf8\x80\x80\x81\x83", 15));
var_dump(chk_enc("\xfc\x80\x80\x80\x80\xb1\xfc\x80\x80\x80\x80\xb2\xfc\x80\x80\x80\x80\xb3", 18));
var_dump(chk_enc("\xfc\x80\x80\x80\x81\x81\xfc\x80\x80\x80\x81\x82\xfc\x80\x80\x80\x81\x83", 18));

// U+00A2, U+00A3, U+00A5: the 2-byte form is valid, the longer forms are not.
var_dump(chk_enc("\xc2\xa2\xc2\xa3\xc2\xa5", 0));
var_dump(chk_enc("\xe0\x82\xa2\xe0\x82\xa3\xe0\x82\xa5", 9));
var_dump(chk_enc("\xf0\x80\x82\xa2\xf0\x80\x82\xa3\xf0\x80\x82\xa5", 12));
var_dump(chk_enc("\xf8\x80\x80\x82\xa2\xf8\x80\x80\x82\xa3\xf8\x80\x80\x82\xa5", 15));
var_dump(chk_enc("\xfc\x80\x80\x80\x82\xa2\xfc\x80\x80\x80\x82\xa3\xfc\x80\x80\x80\x82\xa5", 18));

// Boundary values for each sequence length.
var_dump(chk_enc("\xc1\xbf", 2));
var_dump(chk_enc("\xc2\x80", 0));
var_dump(chk_enc("\xdf\xbf", 0));
var_dump(chk_enc("\xe0\x9f\xff", 3));
var_dump(chk_enc("\xe0\xa0\x80", 2));
var_dump(chk_enc("\xef\xbf\xbf", 0));
var_dump(chk_enc("\xf0\x8f\xbf\xbf", 4));
var_dump(chk_enc("\xf0\x90\x80\x80", 0));
var_dump(chk_enc("\xf7\xbf\xbf\xbf", 4));
var_dump(chk_enc("\xf8\x87\xbf\xbf\xbf", 5));
var_dump(chk_enc("\xf8\x88\x80\x80\x80", 5));
var_dump(chk_enc("\xfb\xbf\xbf\xbf\xbf", 5));
var_dump(chk_enc("\xfc\x83\xbf\xbf\xbf\xbf", 6));
var_dump(chk_enc("\xfc\x84\x80\x80\x80\x80", 6));
var_dump(chk_enc("\xfd\xaf\xbf\xbf\xbf\xbf", 6));
var_dump(chk_enc("\xfd\xbf\xbf\xbf\xbf\xbf", 6));

echo "UTF-8 and surrogates area\n";
// Walk U+D7FF..U+E000 inclusive: the surrogate range D800..DFFF plus one code
// point on either side. $cnt counts the inputs that came out as pure U+FFFD;
// $out collects the hex dump of every input that did not.
$out = '';
$cnt = 0;
for ($i = 0xd7ff; $i <= 0xe000; ++$i) {
    // Hand-assemble the 3-byte UTF-8 form of $i.
    $s = chk_enc(pack('C3', 0xe0 | ($i >> 12), 0x80 | (($i >> 6) & 0x3f), 0x80 | ($i & 0x3f)), 3);
    if ($s === false) {
        $cnt++;
    } else {
        $out .= $s;
    }
}
var_dump($cnt);
var_dump($out);

echo "UTF-32 code range\n";
// 0x110000 is the first value above U+10FFFF; 0x10FFFF is the last one below it.
var_dump(chk_enc("\x00\x11\x00\x00", 1, "UTF-32BE"));
var_dump(chk_enc("\x00\x10\xff\xff", 0, "UTF-32BE"));
var_dump(chk_enc("\x00\x00\x11\x00", 1, "UTF-32LE"));
var_dump(chk_enc("\xff\xff\x10\x00", 0, "UTF-32LE"));
var_dump(chk_enc("\x00\x11\x00\x00", 1, "UTF-32"));
var_dump(chk_enc("\x00\x10\xff\xff", 0, "UTF-32"));
// Same values behind a big-endian, then a little-endian, byte order mark.
var_dump(chk_enc("\x00\x00\xfe\xff\x00\x11\x00\x00", 0, "UTF-32"));
var_dump(chk_enc("\x00\x00\xfe\xff\x00\x10\xff\xff", 0, "UTF-32"));
var_dump(chk_enc("\xff\xfe\x00\x00\x00\x00\x11\x00", 0, "UTF-32"));
var_dump(chk_enc("\xff\xfe\x00\x00\xff\xff\x10\x00", 0, "UTF-32"));

echo "UTF-32 and surrogates area\n";
// Big-endian byte order, explicit "UTF-32BE".
$out = '';
$cnt = 0;
for ($i = 0xd7ff; $i <= 0xe000; ++$i) {
    $s = chk_enc(pack('C4', $i >> 24, ($i >> 16) & 0xff, ($i >> 8) & 0xff, $i & 0xff), 1, "UTF-32BE");
    if ($s === false) {
        $cnt++;
    } else {
        $out .= $s;
    }
}
var_dump($cnt);
var_dump($out);

// Little-endian byte order, explicit "UTF-32LE".
$out = '';
$cnt = 0;
for ($i = 0xd7ff; $i <= 0xe000; ++$i) {
    $s = chk_enc(pack('C4', $i & 0xff, ($i >> 8) & 0xff, ($i >> 16) & 0xff, ($i >> 24) & 0xff), 1, "UTF-32LE");
    if ($s === false) {
        $cnt++;
    } else {
        $out .= $s;
    }
}
var_dump($cnt);
var_dump($out);

// Big-endian bytes with the endianness-neutral "UTF-32" name and no BOM.
$out = '';
$cnt = 0;
for ($i = 0xd7ff; $i <= 0xe000; ++$i) {
    $s = chk_enc(pack('C4', $i >> 24, ($i >> 16) & 0xff, ($i >> 8) & 0xff, $i & 0xff), 1, "UTF-32");
    if ($s === false) {
        $cnt++;
    } else {
        $out .= $s;
    }
}
var_dump($cnt);
var_dump($out);

echo "UTF-32 and surrogates area with BOM\n";

// Big-endian BOM followed by big-endian bytes.
$out = '';
$cnt = 0;
for ($i = 0xd7ff; $i <= 0xe000; ++$i) {
    $s = chk_enc(
        "\x00\x00\xfe\xff" . pack('C4', $i >> 24, ($i >> 16) & 0xff, ($i >> 8) & 0xff, $i & 0xff),
        1,
        "UTF-32",
        true
    );
    if ($s === false) {
        $cnt++;
    } else {
        $out .= $s;
    }
}
var_dump($cnt);
// Drop the BOM from the collected dumps so only the code points are shown.
var_dump(str_replace("0000feff", "", $out));

// Little-endian BOM followed by little-endian bytes.
$out = '';
$cnt = 0;
for ($i = 0xd7ff; $i <= 0xe000; ++$i) {
    $s = chk_enc(
        "\xff\xfe\x00\x00" . pack('C4', $i & 0xff, ($i >> 8) & 0xff, ($i >> 16) & 0xff, ($i >> 24) & 0xff),
        1,
        "UTF-32",
        true
    );
    if ($s === false) {
        $cnt++;
    } else {
        $out .= $s;
    }
}
var_dump($cnt);
// Drop the BOM from the collected dumps so only the code points are shown.
var_dump(str_replace("0000feff", "", $out));

?>
--EXPECT--
UTF-8 redundancy
string(24) "000000310000003200000033"
string(24) "000000410000004200000043"
bool(false)
bool(false)
bool(false)
bool(false)
bool(false)
bool(false)
bool(false)
bool(false)
bool(false)
bool(false)
string(24) "000000a2000000a3000000a5"
bool(false)
bool(false)
bool(false)
bool(false)
bool(false)
string(8) "00000080"
string(8) "000007ff"
bool(false)
string(8) "00000800"
string(8) "0000ffff"
bool(false)
string(8) "00010000"
bool(false)
bool(false)
bool(false)
bool(false)
bool(false)
bool(false)
bool(false)
bool(false)
UTF-8 and surrogates area
int(2048)
string(16) "0000d7ff0000e000"
UTF-32 code range
bool(false)
string(8) "0010ffff"
bool(false)
string(8) "0010ffff"
bool(false)
string(8) "0010ffff"
string(16) "0000feff0000fffd"
string(16) "0000feff0010ffff"
string(16) "0000feff0000fffd"
string(16) "0000feff0010ffff"
UTF-32 and surrogates area
int(2048)
string(16) "0000d7ff0000e000"
int(2048)
string(16) "0000d7ff0000e000"
int(2048)
string(16) "0000d7ff0000e000"
UTF-32 and surrogates area with BOM
int(2048)
string(16) "0000d7ff0000e000"
int(2048)
string(16) "0000d7ff0000e000"