1--TEST--
2Unicode standard conformance test (ill-formed UTF sequences.)
3--EXTENSIONS--
4mbstring
5--FILE--
6<?php
/**
 * Convert $str from $enc to UCS-4BE and check whether the result consists of
 * exactly $n copies of 0000fffd (U+FFFD in UCS-4BE, written as hex).
 *
 * With $n = 0 the comparison target is the empty string, so any non-empty
 * conversion result is returned as its hex dump.
 *
 * @param string $str Raw input bytes to convert.
 * @param int    $n   Number of U+FFFD code points the whole output is compared against.
 * @param string $enc Source encoding name handed to mb_convert_encoding().
 *
 * @return string|false false when the output is exactly $n U+FFFD code points,
 *                      otherwise the lowercase hex dump of the UCS-4BE output.
 */
function chk_enc($str, $n, $enc = "UTF-8") {
    $src = bin2hex(mb_convert_encoding($str, "UCS-4BE", $enc));
    $dst = str_repeat("0000fffd", $n);

    // Strict comparison: both operands are strings of hex/decimal digits, and
    // loose == would compare numeric-looking strings as numbers.
    return $dst === $src ? false : $src;
}
16
// Use U+FFFD as the substitute character; chk_enc() compares against that value.
mb_substitute_character(0xfffd);

echo "UTF-8 redundancy\n";

// Each row: [input bytes, number of U+FFFD code points to compare against].
// The first two rows are plain ASCII; the remaining rows spell ASCII code
// points using 2- to 6-byte sequences.
$asciiForms = [
    ["\x31\x32\x33", 0],
    ["\x41\x42\x43", 0],
    ["\xc0\xb1\xc0\xb2\xc0\xb3", 6],
    ["\xc1\x81\xc1\x82\xc1\x83", 6],
    ["\xe0\x80\xb1\xe0\x80\xb2\xe0\x80\xb3", 9],
    ["\xe0\x81\x81\xe0\x81\x82\xe0\x81\x83", 9],
    ["\xf0\x80\x80\xb1\xf0\x80\x80\xb2\xf0\x80\x80\xb3", 12],
    // The final sequence here is only three bytes (\xf0\x81\x83), which is
    // presumably why the count is 11 rather than 12.
    ["\xf0\x80\x81\x81\xf0\x80\x81\x82\xf0\x81\x83", 11],
    ["\xf8\x80\x80\x80\xb1\xf8\x80\x80\x80\xb2\xf8\x80\x80\x80\xb3", 15],
    ["\xf8\x80\x80\x81\x81\xf8\x80\x80\x81\x82\xf8\x80\x80\x81\x83", 15],
    ["\xfc\x80\x80\x80\x80\xb1\xfc\x80\x80\x80\x80\xb2\xfc\x80\x80\x80\x80\xb3", 18],
    ["\xfc\x80\x80\x80\x81\x81\xfc\x80\x80\x80\x81\x82\xfc\x80\x80\x80\x81\x83", 18],
];
foreach ($asciiForms as [$bytes, $replacements]) {
    var_dump(chk_enc($bytes, $replacements));
}
33
// U+00A2, U+00A3 and U+00A5: first in their 2-byte form, then the same code
// points written with 3-, 4-, 5- and 6-byte sequences.
// Each row: [input bytes, number of U+FFFD code points to compare against].
$paddedForms = [
    ["\xc2\xa2\xc2\xa3\xc2\xa5", 0],
    ["\xe0\x82\xa2\xe0\x82\xa3\xe0\x82\xa5", 9],
    ["\xf0\x80\x82\xa2\xf0\x80\x82\xa3\xf0\x80\x82\xa5", 12],
    ["\xf8\x80\x80\x82\xa2\xf8\x80\x80\x82\xa3\xf8\x80\x80\x82\xa5", 15],
    ["\xfc\x80\x80\x80\x82\xa2\xfc\x80\x80\x80\x82\xa3\xfc\x80\x80\x80\x82\xa5", 18],
];
for ($k = 0, $total = count($paddedForms); $k < $total; ++$k) {
    var_dump(chk_enc($paddedForms[$k][0], $paddedForms[$k][1]));
}
39
// Sequences at the edges of the 2- to 6-byte lead ranges.
// Each row holds the positional arguments for chk_enc():
// [input bytes, number of U+FFFD code points to compare against].
$boundaryRows = [
    ["\xc1\xbf", 2],
    ["\xc2\x80", 0],
    ["\xdf\xbf", 0],
    ["\xe0\x9f\xff", 3],
    ["\xe0\xa0\x80", 2],
    ["\xef\xbf\xbf", 0],
    ["\xf0\x8f\xbf\xbf", 4],
    ["\xf0\x90\x80\x80", 0],
    ["\xf7\xbf\xbf\xbf", 4],
    ["\xf8\x87\xbf\xbf\xbf", 5],
    ["\xf8\x88\x80\x80\x80", 5],
    ["\xfb\xbf\xbf\xbf\xbf", 5],
    ["\xfc\x83\xbf\xbf\xbf\xbf", 6],
    ["\xfc\x84\x80\x80\x80\x80", 6],
    ["\xfd\xaf\xbf\xbf\xbf\xbf", 6],
    ["\xfd\xbf\xbf\xbf\xbf\xbf", 6],
];
foreach ($boundaryRows as $row) {
    var_dump(chk_enc(...$row));
}
56
echo "UTF-8 and surrogates area\n";

// Encode every value from 0xD7FF through 0xE000 as a 3-byte sequence and
// tally how many come back as exactly three U+FFFD code points.
$replaced = 0;
$decoded = '';
for ($cp = 0xd7ff; $cp <= 0xe000; ++$cp) {
    $lead = 0xe0 | ($cp >> 12);           // top 4 bits
    $middle = 0x80 | (($cp >> 6) & 0x3f); // next 6 bits
    $tail = 0x80 | ($cp & 0x3f);          // low 6 bits

    $hex = chk_enc(pack('C3', $lead, $middle, $tail), 3);
    if ($hex !== false) {
        // Not replaced: keep the hex dump of what was decoded.
        $decoded .= $hex;
        continue;
    }
    ++$replaced;
}
var_dump($replaced);
var_dump($decoded);
70
echo "UTF-32 code range\n";

// Each row: [input bytes, U+FFFD count to compare against, source encoding].
// The inputs alternate between 0x110000 and 0x10FFFF in each byte order.
$rangeRows = [
    ["\x00\x11\x00\x00", 1, "UTF-32BE"],
    ["\x00\x10\xff\xff", 0, "UTF-32BE"],
    ["\x00\x00\x11\x00", 1, "UTF-32LE"],
    ["\xff\xff\x10\x00", 0, "UTF-32LE"],
    // "UTF-32" with no byte order mark.
    ["\x00\x11\x00\x00", 1, "UTF-32"],
    ["\x00\x10\xff\xff", 0, "UTF-32"],
    // "UTF-32" input prefixed with 00 00 FE FF. These rows pass a count of 0,
    // so the 0x110000 row prints its hex dump rather than bool(false).
    ["\x00\x00\xfe\xff\x00\x11\x00\x00", 0, "UTF-32"],
    ["\x00\x00\xfe\xff\x00\x10\xff\xff", 0, "UTF-32"],
    // "UTF-32" input prefixed with FF FE 00 00, again with a count of 0.
    ["\xff\xfe\x00\x00\x00\x00\x11\x00", 0, "UTF-32"],
    ["\xff\xfe\x00\x00\xff\xff\x10\x00", 0, "UTF-32"],
];
foreach ($rangeRows as [$bytes, $replacements, $encoding]) {
    var_dump(chk_enc($bytes, $replacements, $encoding));
}
82
echo "UTF-32 and surrogates area\n";

// Feed 0xD7FF..0xE000 to the UTF-32BE decoder one value at a time and tally
// how many come back as a single U+FFFD.
$replaced = 0;
$decoded = '';
foreach (range(0xd7ff, 0xe000) as $cp) {
    // pack('N') emits the same four big-endian bytes as splitting $cp by hand.
    $hex = chk_enc(pack('N', $cp), 1, "UTF-32BE");
    if ($hex === false) {
        ++$replaced;
        continue;
    }
    $decoded .= $hex;
}
var_dump($replaced);
var_dump($decoded);
96
// Same sweep over 0xD7FF..0xE000, this time as little-endian bytes decoded
// with UTF-32LE.
$replaced = 0;
$decoded = '';
for ($cp = 0xd7ff; $cp <= 0xe000; $cp++) {
    // pack('V') emits the four bytes of $cp least-significant first.
    $hex = chk_enc(pack('V', $cp), 1, "UTF-32LE");
    if ($hex !== false) {
        $decoded .= $hex;
    } else {
        $replaced++;
    }
}
var_dump($replaced);
var_dump($decoded);
109
// Sweep 0xD7FF..0xE000 as big-endian bytes, decoded with plain "UTF-32"
// (no byte order mark in the input).
$replaced = 0;
$decoded = '';
$cp = 0xd7ff;
while ($cp <= 0xe000) {
    // pack('N') emits the four bytes of $cp most-significant first.
    $hex = chk_enc(pack('N', $cp), 1, "UTF-32");
    if ($hex === false) {
        $replaced += 1;
    } else {
        $decoded .= $hex;
    }
    ++$cp;
}
var_dump($replaced);
var_dump($decoded);
122
echo "UTF-32 and surrogates area with BOM\n";

// Sweep 0xD7FF..0xE000 as big-endian bytes, each prefixed with 00 00 FE FF,
// decoded with "UTF-32".
$bigEndianBom = "\x00\x00\xfe\xff";
$replaced = 0;
$decoded = '';
foreach (range(0xd7ff, 0xe000) as $cp) {
    // pack('N') emits the four bytes of $cp most-significant first.
    $hex = chk_enc($bigEndianBom . pack('N', $cp), 1, "UTF-32");
    if ($hex !== false) {
        $decoded .= $hex;
        continue;
    }
    $replaced++;
}
var_dump($replaced);
var_dump($decoded);
138
// Sweep 0xD7FF..0xE000 as little-endian bytes, each prefixed with FF FE 00 00,
// decoded with "UTF-32".
$littleEndianBom = "\xff\xfe\x00\x00";
$replaced = 0;
$decoded = '';
for ($cp = 0xd7ff; $cp <= 0xe000; ++$cp) {
    // pack('V') emits the four bytes of $cp least-significant first.
    $hex = chk_enc($littleEndianBom . pack('V', $cp), 1, "UTF-32");
    if ($hex === false) {
        ++$replaced;
        continue;
    }
    $decoded .= $hex;
}
var_dump($replaced);
var_dump($decoded);
152
153?>
154--EXPECT--
155UTF-8 redundancy
156string(24) "000000310000003200000033"
157string(24) "000000410000004200000043"
158bool(false)
159bool(false)
160bool(false)
161bool(false)
162bool(false)
163bool(false)
164bool(false)
165bool(false)
166bool(false)
167bool(false)
168string(24) "000000a2000000a3000000a5"
169bool(false)
170bool(false)
171bool(false)
172bool(false)
173bool(false)
174string(8) "00000080"
175string(8) "000007ff"
176bool(false)
177string(8) "00000800"
178string(8) "0000ffff"
179bool(false)
180string(8) "00010000"
181bool(false)
182bool(false)
183bool(false)
184bool(false)
185bool(false)
186bool(false)
187bool(false)
188bool(false)
189UTF-8 and surrogates area
190int(2048)
191string(16) "0000d7ff0000e000"
192UTF-32 code range
193bool(false)
194string(8) "0010ffff"
195bool(false)
196string(8) "0010ffff"
197bool(false)
198string(8) "0010ffff"
199string(8) "0000fffd"
200string(8) "0010ffff"
201string(8) "0000fffd"
202string(8) "0010ffff"
203UTF-32 and surrogates area
204int(2048)
205string(16) "0000d7ff0000e000"
206int(2048)
207string(16) "0000d7ff0000e000"
208int(2048)
209string(16) "0000d7ff0000e000"
210UTF-32 and surrogates area with BOM
211int(2048)
212string(16) "0000d7ff0000e000"
213int(2048)
214string(16) "0000d7ff0000e000"
215