--TEST--
Unicode standard conformance test (ill-formed UTF sequences.)
--EXTENSIONS--
mbstring
--FILE--
<?php
/**
 * Converts $str from $enc to UCS-4BE and compares the hex dump of the result
 * with $n consecutive U+FFFD code units ("0000fffd").
 *
 * @param string $str Raw bytes to convert.
 * @param int    $n   Number of U+FFFD characters the conversion is expected to produce.
 * @param string $enc Source encoding name passed to mb_convert_encoding().
 * @return string|false false when the dump is exactly $n repetitions of "0000fffd"
 *                      (an empty dump for $n = 0); otherwise the hex dump itself.
 */
function chk_enc($str, $n, $enc = "UTF-8") {
    $src = bin2hex(mb_convert_encoding($str, "UCS-4BE", $enc));
    // Hex dumps such as "000001e3" are numeric strings, so keep the comparison
    // strict to rule out PHP's numeric-string juggling.
    return $src === str_repeat("0000fffd", $n) ? false : $src;
}

/**
 * Feeds every code point from U+D7FF through U+E000 to chk_enc() and dumps
 * first the number of inputs for which chk_enc() returned false, then the
 * concatenated hex dumps of all the others.
 *
 * @param callable $encode Maps an integer code point to its raw byte string.
 * @param int      $n      Expected number of U+FFFD characters per input.
 * @param string   $enc    Source encoding name handed to chk_enc().
 * @param string   $prefix Bytes prepended to every encoded code point (e.g. a BOM).
 */
function dump_surrogate_range($encode, $n, $enc, $prefix = '') {
    $passed = '';
    $replaced = 0;
    for ($cp = 0xd7ff; $cp <= 0xe000; ++$cp) {
        $res = chk_enc($prefix . $encode($cp), $n, $enc);
        if ($res === false) {
            $replaced++;
        } else {
            $passed .= $res;
        }
    }
    var_dump($replaced);
    var_dump($passed);
}

// Three-byte UTF-8 form of a code point, built without any validity check so
// that surrogate code points can be encoded too.
$utf8_3byte = static function ($cp) {
    return pack('C3', 0xe0 | ($cp >> 12), 0x80 | ($cp >> 6) & 0x3f, 0x80 | $cp & 0x3f);
};

// Big-endian 32-bit form of a code point.
$utf32be = static function ($cp) {
    return pack('C4', $cp >> 24, ($cp >> 16) & 0xff, ($cp >> 8) & 0xff, $cp & 0xff);
};

// Little-endian 32-bit form of a code point.
$utf32le = static function ($cp) {
    return pack('C4', $cp & 0xff, ($cp >> 8) & 0xff, ($cp >> 16) & 0xff, ($cp >> 24) & 0xff);
};

// chk_enc() matches against "0000fffd", so U+FFFD has to be the substitute character.
mb_substitute_character(0xfffd);


echo "UTF-8 redundancy\n";
var_dump(chk_enc("\x31\x32\x33", 0));
var_dump(chk_enc("\x41\x42\x43", 0));
var_dump(chk_enc("\xc0\xb1\xc0\xb2\xc0\xb3", 6));
var_dump(chk_enc("\xc1\x81\xc1\x82\xc1\x83", 6));
var_dump(chk_enc("\xe0\x80\xb1\xe0\x80\xb2\xe0\x80\xb3", 9));
var_dump(chk_enc("\xe0\x81\x81\xe0\x81\x82\xe0\x81\x83", 9));
var_dump(chk_enc("\xf0\x80\x80\xb1\xf0\x80\x80\xb2\xf0\x80\x80\xb3", 12));
// The third sequence below is only three bytes long, hence 11 rather than 12.
var_dump(chk_enc("\xf0\x80\x81\x81\xf0\x80\x81\x82\xf0\x81\x83", 11));
var_dump(chk_enc("\xf8\x80\x80\x80\xb1\xf8\x80\x80\x80\xb2\xf8\x80\x80\x80\xb3", 15));
var_dump(chk_enc("\xf8\x80\x80\x81\x81\xf8\x80\x80\x81\x82\xf8\x80\x80\x81\x83", 15));
var_dump(chk_enc("\xfc\x80\x80\x80\x80\xb1\xfc\x80\x80\x80\x80\xb2\xfc\x80\x80\x80\x80\xb3", 18));
var_dump(chk_enc("\xfc\x80\x80\x80\x81\x81\xfc\x80\x80\x80\x81\x82\xfc\x80\x80\x80\x81\x83", 18));

var_dump(chk_enc("\xc2\xa2\xc2\xa3\xc2\xa5", 0));
var_dump(chk_enc("\xe0\x82\xa2\xe0\x82\xa3\xe0\x82\xa5", 9));
var_dump(chk_enc("\xf0\x80\x82\xa2\xf0\x80\x82\xa3\xf0\x80\x82\xa5", 12));
var_dump(chk_enc("\xf8\x80\x80\x82\xa2\xf8\x80\x80\x82\xa3\xf8\x80\x80\x82\xa5", 15));
var_dump(chk_enc("\xfc\x80\x80\x80\x82\xa2\xfc\x80\x80\x80\x82\xa3\xfc\x80\x80\x80\x82\xa5", 18));

var_dump(chk_enc("\xc1\xbf", 2));
var_dump(chk_enc("\xc2\x80", 0));
var_dump(chk_enc("\xdf\xbf", 0));
var_dump(chk_enc("\xe0\x9f\xff", 3));
var_dump(chk_enc("\xe0\xa0\x80", 2));
var_dump(chk_enc("\xef\xbf\xbf", 0));
var_dump(chk_enc("\xf0\x8f\xbf\xbf", 4));
var_dump(chk_enc("\xf0\x90\x80\x80", 0));
var_dump(chk_enc("\xf7\xbf\xbf\xbf", 4));
var_dump(chk_enc("\xf8\x87\xbf\xbf\xbf", 5));
var_dump(chk_enc("\xf8\x88\x80\x80\x80", 5));
var_dump(chk_enc("\xfb\xbf\xbf\xbf\xbf", 5));
var_dump(chk_enc("\xfc\x83\xbf\xbf\xbf\xbf", 6));
var_dump(chk_enc("\xfc\x84\x80\x80\x80\x80", 6));
var_dump(chk_enc("\xfd\xaf\xbf\xbf\xbf\xbf", 6));
var_dump(chk_enc("\xfd\xbf\xbf\xbf\xbf\xbf", 6));

echo "UTF-8 and surrogates area\n";
dump_surrogate_range($utf8_3byte, 3, "UTF-8");

echo "UTF-32 code range\n";
var_dump(chk_enc("\x00\x11\x00\x00", 1, "UTF-32BE"));
var_dump(chk_enc("\x00\x10\xff\xff", 0, "UTF-32BE"));
var_dump(chk_enc("\x00\x00\x11\x00", 1, "UTF-32LE"));
var_dump(chk_enc("\xff\xff\x10\x00", 0, "UTF-32LE"));
var_dump(chk_enc("\x00\x11\x00\x00", 1, "UTF-32"));
var_dump(chk_enc("\x00\x10\xff\xff", 0, "UTF-32"));
var_dump(chk_enc("\x00\x00\xfe\xff\x00\x11\x00\x00", 0, "UTF-32"));
var_dump(chk_enc("\x00\x00\xfe\xff\x00\x10\xff\xff", 0, "UTF-32"));
var_dump(chk_enc("\xff\xfe\x00\x00\x00\x00\x11\x00", 0, "UTF-32"));
var_dump(chk_enc("\xff\xfe\x00\x00\xff\xff\x10\x00", 0, "UTF-32"));

echo "UTF-32 and surrogates area\n";
dump_surrogate_range($utf32be, 1, "UTF-32BE");

dump_surrogate_range($utf32le, 1, "UTF-32LE");

dump_surrogate_range($utf32be, 1, "UTF-32");

echo "UTF-32 and surrogates area with BOM\n";

dump_surrogate_range($utf32be, 1, "UTF-32", "\x00\x00\xfe\xff");

dump_surrogate_range($utf32le, 1, "UTF-32", "\xff\xfe\x00\x00");

?>
--EXPECT--
UTF-8 redundancy
string(24) "000000310000003200000033"
string(24) "000000410000004200000043"
bool(false)
bool(false)
bool(false)
bool(false)
bool(false)
bool(false)
bool(false)
bool(false)
bool(false)
bool(false)
string(24) "000000a2000000a3000000a5"
bool(false)
bool(false)
bool(false)
bool(false)
bool(false)
string(8) "00000080"
string(8) "000007ff"
bool(false)
string(8) "00000800"
string(8) "0000ffff"
bool(false)
string(8) "00010000"
bool(false)
bool(false)
bool(false)
bool(false)
bool(false)
bool(false)
bool(false)
bool(false)
UTF-8 and surrogates area
int(2048)
string(16) "0000d7ff0000e000"
UTF-32 code range
bool(false)
string(8) "0010ffff"
bool(false)
string(8) "0010ffff"
bool(false)
string(8) "0010ffff"
string(8) "0000fffd"
string(8) "0010ffff"
string(8) "0000fffd"
string(8) "0010ffff"
UTF-32 and surrogates area
int(2048)
string(16) "0000d7ff0000e000"
int(2048)
string(16) "0000d7ff0000e000"
int(2048)
string(16) "0000d7ff0000e000"
UTF-32 and surrogates area with BOM
int(2048)
string(16) "0000d7ff0000e000"
int(2048)
string(16) "0000d7ff0000e000"