--TEST--
Unicode standard conformance test (ill-formed UTF sequences.)
--EXTENSIONS--
mbstring
--FILE--
<?php
/**
 * Converts $str from $enc to UCS-4BE and compares the hex dump of the result
 * with $n consecutive copies of "0000fffd".
 *
 * @param string $str Raw input bytes.
 * @param int    $n   Number of "0000fffd" units the hex dump is compared against.
 * @param string $enc Source encoding name handed to mb_convert_encoding().
 *
 * @return string|false false when the hex dump is exactly $n copies of
 *                      "0000fffd" (the empty string for $n = 0); otherwise
 *                      the hex dump itself.
 */
function chk_enc($str, $n, $enc = "UTF-8") {
    $actual = bin2hex(mb_convert_encoding($str, "UCS-4BE", $enc));
    $expected = str_repeat("0000fffd", $n);

    // Strict comparison on purpose: hex dumps can be numeric strings
    // (e.g. "00000100" and "000001e2"), which `==` would compare as numbers.
    return $actual === $expected ? false : $actual;
}

/**
 * Runs chk_enc() on every value from 0xd7ff to 0xe000 inclusive and dumps
 * the number of values for which it returned false, followed by the
 * concatenation of every hex dump it returned instead.
 *
 * @param callable $build Maps an integer from the range to the byte string to convert.
 * @param int      $n     Passed through to chk_enc().
 * @param string   $enc   Passed through to chk_enc().
 */
function chk_surrogate_range($build, $n, $enc) {
    $matched = 0;
    $dumps = '';
    for ($cp = 0xd7ff; $cp <= 0xe000; ++$cp) {
        $result = chk_enc($build($cp), $n, $enc);
        if ($result === false) {
            $matched++;
        } else {
            $dumps .= $result;
        }
    }
    var_dump($matched);
    var_dump($dumps);
}

// Packs the value as three bytes: 0xe0 | top bits, then two 0x80 | 6-bit groups.
$utf8_3byte = static function ($cp) {
    return pack('C3', 0xe0 | ($cp >> 12), 0x80 | (($cp >> 6) & 0x3f), 0x80 | ($cp & 0x3f));
};

// Packs the value as four bytes, most significant byte first.
$utf32_be = static function ($cp) {
    return pack('C4', $cp >> 24, ($cp >> 16) & 0xff, ($cp >> 8) & 0xff, $cp & 0xff);
};

// Packs the value as four bytes, least significant byte first.
$utf32_le = static function ($cp) {
    return pack('C4', $cp & 0xff, ($cp >> 8) & 0xff, ($cp >> 16) & 0xff, ($cp >> 24) & 0xff);
};

// Use 0xfffd as the substitute character, matching the "0000fffd" units
// that chk_enc() compares against.
mb_substitute_character(0xfffd);

echo "UTF-8 redundancy\n";
var_dump(chk_enc("\x31\x32\x33", 0));
var_dump(chk_enc("\x41\x42\x43", 0));
var_dump(chk_enc("\xc0\xb1\xc0\xb2\xc0\xb3", 6));
var_dump(chk_enc("\xc1\x81\xc1\x82\xc1\x83", 6));
var_dump(chk_enc("\xe0\x80\xb1\xe0\x80\xb2\xe0\x80\xb3", 9));
var_dump(chk_enc("\xe0\x81\x81\xe0\x81\x82\xe0\x81\x83", 9));
var_dump(chk_enc("\xf0\x80\x80\xb1\xf0\x80\x80\xb2\xf0\x80\x80\xb3", 12));
var_dump(chk_enc("\xf0\x80\x81\x81\xf0\x80\x81\x82\xf0\x81\x83", 11));
var_dump(chk_enc("\xf8\x80\x80\x80\xb1\xf8\x80\x80\x80\xb2\xf8\x80\x80\x80\xb3", 15));
var_dump(chk_enc("\xf8\x80\x80\x81\x81\xf8\x80\x80\x81\x82\xf8\x80\x80\x81\x83", 15));
var_dump(chk_enc("\xfc\x80\x80\x80\x80\xb1\xfc\x80\x80\x80\x80\xb2\xfc\x80\x80\x80\x80\xb3", 18));
var_dump(chk_enc("\xfc\x80\x80\x80\x81\x81\xfc\x80\x80\x80\x81\x82\xfc\x80\x80\x80\x81\x83", 18));

var_dump(chk_enc("\xc2\xa2\xc2\xa3\xc2\xa5", 0));
var_dump(chk_enc("\xe0\x82\xa2\xe0\x82\xa3\xe0\x82\xa5", 9));
var_dump(chk_enc("\xf0\x80\x82\xa2\xf0\x80\x82\xa3\xf0\x80\x82\xa5", 12));
var_dump(chk_enc("\xf8\x80\x80\x82\xa2\xf8\x80\x80\x82\xa3\xf8\x80\x80\x82\xa5", 15));
var_dump(chk_enc("\xfc\x80\x80\x80\x82\xa2\xfc\x80\x80\x80\x82\xa3\xfc\x80\x80\x80\x82\xa5", 18));

var_dump(chk_enc("\xc1\xbf", 2));
var_dump(chk_enc("\xc2\x80", 0));
var_dump(chk_enc("\xdf\xbf", 0));
var_dump(chk_enc("\xe0\x9f\xff", 3));
var_dump(chk_enc("\xe0\xa0\x80", 2));
var_dump(chk_enc("\xef\xbf\xbf", 0));
var_dump(chk_enc("\xf0\x8f\xbf\xbf", 4));
var_dump(chk_enc("\xf0\x90\x80\x80", 0));
var_dump(chk_enc("\xf7\xbf\xbf\xbf", 4));
var_dump(chk_enc("\xf8\x87\xbf\xbf\xbf", 5));
var_dump(chk_enc("\xf8\x88\x80\x80\x80", 5));
var_dump(chk_enc("\xfb\xbf\xbf\xbf\xbf", 5));
var_dump(chk_enc("\xfc\x83\xbf\xbf\xbf\xbf", 6));
var_dump(chk_enc("\xfc\x84\x80\x80\x80\x80", 6));
var_dump(chk_enc("\xfd\xaf\xbf\xbf\xbf\xbf", 6));
var_dump(chk_enc("\xfd\xbf\xbf\xbf\xbf\xbf", 6));

echo "UTF-8 and surrogates area\n";
chk_surrogate_range($utf8_3byte, 3, "UTF-8");

echo "UTF-32 code range\n";
var_dump(chk_enc("\x00\x11\x00\x00", 1, "UTF-32BE"));
var_dump(chk_enc("\x00\x10\xff\xff", 0, "UTF-32BE"));
var_dump(chk_enc("\x00\x00\x11\x00", 1, "UTF-32LE"));
var_dump(chk_enc("\xff\xff\x10\x00", 0, "UTF-32LE"));
var_dump(chk_enc("\x00\x11\x00\x00", 1, "UTF-32"));
var_dump(chk_enc("\x00\x10\xff\xff", 0, "UTF-32"));
var_dump(chk_enc("\x00\x00\xfe\xff\x00\x11\x00\x00", 0, "UTF-32"));
var_dump(chk_enc("\x00\x00\xfe\xff\x00\x10\xff\xff", 0, "UTF-32"));
var_dump(chk_enc("\xff\xfe\x00\x00\x00\x00\x11\x00", 0, "UTF-32"));
var_dump(chk_enc("\xff\xfe\x00\x00\xff\xff\x10\x00", 0, "UTF-32"));

echo "UTF-32 and surrogates area\n";
chk_surrogate_range($utf32_be, 1, "UTF-32BE");
chk_surrogate_range($utf32_le, 1, "UTF-32LE");
chk_surrogate_range($utf32_be, 1, "UTF-32");

echo "UTF-32 and surrogates area with BOM\n";

// Same ranges again, each input prefixed with a four-byte mark
// (00 00 fe ff before the big-endian bytes, ff fe 00 00 before the
// little-endian bytes).
chk_surrogate_range(
    static function ($cp) use ($utf32_be) {
        return "\x00\x00\xfe\xff" . $utf32_be($cp);
    },
    1,
    "UTF-32"
);
chk_surrogate_range(
    static function ($cp) use ($utf32_le) {
        return "\xff\xfe\x00\x00" . $utf32_le($cp);
    },
    1,
    "UTF-32"
);

?>
--EXPECT--
UTF-8 redundancy
string(24) "000000310000003200000033"
string(24) "000000410000004200000043"
bool(false)
bool(false)
bool(false)
bool(false)
bool(false)
bool(false)
bool(false)
bool(false)
bool(false)
bool(false)
string(24) "000000a2000000a3000000a5"
bool(false)
bool(false)
bool(false)
bool(false)
bool(false)
string(8) "00000080"
string(8) "000007ff"
bool(false)
string(8) "00000800"
string(8) "0000ffff"
bool(false)
string(8) "00010000"
bool(false)
bool(false)
bool(false)
bool(false)
bool(false)
bool(false)
bool(false)
bool(false)
UTF-8 and surrogates area
int(2048)
string(16) "0000d7ff0000e000"
UTF-32 code range
bool(false)
string(8) "0010ffff"
bool(false)
string(8) "0010ffff"
bool(false)
string(8) "0010ffff"
string(8) "0000fffd"
string(8) "0010ffff"
string(8) "0000fffd"
string(8) "0010ffff"
UTF-32 and surrogates area
int(2048)
string(16) "0000d7ff0000e000"
int(2048)
string(16) "0000d7ff0000e000"
int(2048)
string(16) "0000d7ff0000e000"
UTF-32 and surrogates area with BOM
int(2048)
string(16) "0000d7ff0000e000"
int(2048)
string(16) "0000d7ff0000e000"