1--TEST--
2Unicode standard conformance test (ill-formed UTF sequences.)
3--EXTENSIONS--
4mbstring
5--FILE--
6<?php
/**
 * Converts $str from $enc to UCS-4BE and checks whether the result is made up
 * of exactly $n U+FFFD replacement characters.
 *
 * @param string $str Raw input bytes to convert.
 * @param int    $n   Number of U+FFFD characters the converted output is compared against.
 * @param string $enc Source encoding name handed to mb_convert_encoding().
 *
 * @return string|false false when the output equals $n repetitions of U+FFFD;
 *                      otherwise the lowercase hex dump of the UCS-4BE output.
 */
function chk_enc($str, $n, $enc = "UTF-8") {
    $actual = bin2hex(mb_convert_encoding($str, "UCS-4BE", $enc));
    $expected = str_repeat("0000fffd", $n);

    // Strict comparison: both operands are hex strings, and loose == would
    // compare numeric-looking strings by value instead of byte-for-byte.
    return $actual === $expected ? false : $actual;
}
16
// Invalid input is replaced with U+FFFD, which is what chk_enc() counts.
mb_substitute_character(0xfffd);

// Every call passes the input bytes and the number of U+FFFD characters the
// conversion is compared against. chk_enc() returns false on an exact match,
// otherwise the hex of the converted output; valid inputs therefore pass 0
// and print their code points.
echo "UTF-8 redundancy\n";
// Plain ASCII, followed by the same characters in non-shortest (overlong)
// 2-, 3-, 4-, 5- and 6-byte forms.
var_dump(chk_enc("\x31\x32\x33", 0));
var_dump(chk_enc("\x41\x42\x43", 0));
var_dump(chk_enc("\xc0\xb1\xc0\xb2\xc0\xb3", 6));
var_dump(chk_enc("\xc1\x81\xc1\x82\xc1\x83", 6));
var_dump(chk_enc("\xe0\x80\xb1\xe0\x80\xb2\xe0\x80\xb3", 9));
var_dump(chk_enc("\xe0\x81\x81\xe0\x81\x82\xe0\x81\x83", 9));
var_dump(chk_enc("\xf0\x80\x80\xb1\xf0\x80\x80\xb2\xf0\x80\x80\xb3", 12));
// NOTE(review): the third sequence here is only three bytes long, hence 11
// rather than 12 — confirm this is intentional and not a dropped \x80.
var_dump(chk_enc("\xf0\x80\x81\x81\xf0\x80\x81\x82\xf0\x81\x83", 11));
var_dump(chk_enc("\xf8\x80\x80\x80\xb1\xf8\x80\x80\x80\xb2\xf8\x80\x80\x80\xb3", 15));
var_dump(chk_enc("\xf8\x80\x80\x81\x81\xf8\x80\x80\x81\x82\xf8\x80\x80\x81\x83", 15));
var_dump(chk_enc("\xfc\x80\x80\x80\x80\xb1\xfc\x80\x80\x80\x80\xb2\xfc\x80\x80\x80\x80\xb3", 18));
var_dump(chk_enc("\xfc\x80\x80\x80\x81\x81\xfc\x80\x80\x80\x81\x82\xfc\x80\x80\x80\x81\x83", 18));

// U+00A2, U+00A3, U+00A5 in their 2-byte form, then as overlong 3- to 6-byte forms.
var_dump(chk_enc("\xc2\xa2\xc2\xa3\xc2\xa5", 0));
var_dump(chk_enc("\xe0\x82\xa2\xe0\x82\xa3\xe0\x82\xa5", 9));
var_dump(chk_enc("\xf0\x80\x82\xa2\xf0\x80\x82\xa3\xf0\x80\x82\xa5", 12));
var_dump(chk_enc("\xf8\x80\x80\x82\xa2\xf8\x80\x80\x82\xa3\xf8\x80\x80\x82\xa5", 15));
var_dump(chk_enc("\xfc\x80\x80\x80\x82\xa2\xfc\x80\x80\x80\x82\xa3\xfc\x80\x80\x80\x82\xa5", 18));

// Values on either side of the boundary between sequence lengths.
var_dump(chk_enc("\xc1\xbf", 2));
var_dump(chk_enc("\xc2\x80", 0));
var_dump(chk_enc("\xdf\xbf", 0));
// NOTE(review): the last byte is 0xff, not a continuation byte such as 0xbf —
// confirm that is the intended input.
var_dump(chk_enc("\xe0\x9f\xff", 3));
// E0 A0 80 is a valid sequence (the expected output is 00000800), so it passes
// 0 like the other valid cases.
var_dump(chk_enc("\xe0\xa0\x80", 0));
var_dump(chk_enc("\xef\xbf\xbf", 0));
var_dump(chk_enc("\xf0\x8f\xbf\xbf", 4));
var_dump(chk_enc("\xf0\x90\x80\x80", 0));
var_dump(chk_enc("\xf7\xbf\xbf\xbf", 4));
var_dump(chk_enc("\xf8\x87\xbf\xbf\xbf", 5));
var_dump(chk_enc("\xf8\x88\x80\x80\x80", 5));
var_dump(chk_enc("\xfb\xbf\xbf\xbf\xbf", 5));
var_dump(chk_enc("\xfc\x83\xbf\xbf\xbf\xbf", 6));
var_dump(chk_enc("\xfc\x84\x80\x80\x80\x80", 6));
var_dump(chk_enc("\xfd\xaf\xbf\xbf\xbf\xbf", 6));
var_dump(chk_enc("\xfd\xbf\xbf\xbf\xbf\xbf", 6));
55
echo "UTF-8 and surrogates area\n";
// Encode every value from 0xD7FF through 0xE000 as a three-byte UTF-8
// sequence. Count the inputs that convert to exactly three U+FFFD and collect
// the hex output of the rest.
$rejected = 0;
$accepted = '';
foreach (range(0xd7ff, 0xe000) as $cp) {
    $lead = 0xe0 | ($cp >> 12);
    $middle = 0x80 | (($cp >> 6) & 0x3f);
    $trail = 0x80 | ($cp & 0x3f);

    $hex = chk_enc(pack('C3', $lead, $middle, $trail), 3);
    if ($hex !== false) {
        $accepted .= $hex;
        continue;
    }
    ++$rejected;
}
var_dump($rejected);
var_dump($accepted);
69
echo "UTF-32 code range\n";
// Each row is [input bytes, U+FFFD count to compare against, source encoding].
// 0x110000 is one past U+10FFFF. The last four rows start with a byte-order
// mark and pass 0, so chk_enc() returns the converted hex for them.
$utf32RangeCases = [
    ["\x00\x11\x00\x00", 1, "UTF-32BE"],
    ["\x00\x10\xff\xff", 0, "UTF-32BE"],
    ["\x00\x00\x11\x00", 1, "UTF-32LE"],
    ["\xff\xff\x10\x00", 0, "UTF-32LE"],
    ["\x00\x11\x00\x00", 1, "UTF-32"],
    ["\x00\x10\xff\xff", 0, "UTF-32"],
    ["\x00\x00\xfe\xff\x00\x11\x00\x00", 0, "UTF-32"],
    ["\x00\x00\xfe\xff\x00\x10\xff\xff", 0, "UTF-32"],
    ["\xff\xfe\x00\x00\x00\x00\x11\x00", 0, "UTF-32"],
    ["\xff\xfe\x00\x00\xff\xff\x10\x00", 0, "UTF-32"],
];
foreach ($utf32RangeCases as [$bytes, $replacements, $encoding]) {
    var_dump(chk_enc($bytes, $replacements, $encoding));
}
81
echo "UTF-32 and surrogates area\n";
// One sweep over 0xD7FF..0xE000 per [pack format, source encoding] pair.
// 'N' packs the value as four big-endian bytes and 'V' as four little-endian
// bytes. Each sweep prints how many inputs became a single U+FFFD, followed by
// the hex output of the remaining inputs.
$sweeps = [
    ['N', "UTF-32BE"],
    ['V', "UTF-32LE"],
    ['N', "UTF-32"],
];
foreach ($sweeps as [$packFormat, $encoding]) {
    $rejected = 0;
    $accepted = '';
    for ($cp = 0xd7ff; $cp <= 0xe000; $cp++) {
        $hex = chk_enc(pack($packFormat, $cp), 1, $encoding);
        if ($hex === false) {
            $rejected++;
        } else {
            $accepted .= $hex;
        }
    }
    var_dump($rejected);
    var_dump($accepted);
}
121
echo "UTF-32 and surrogates area with BOM\n";

// Same sweep over 0xD7FF..0xE000, decoded as "UTF-32" with a byte-order mark
// in front of each value. Each row pairs the BOM bytes with the pack format of
// matching byte order ('N' big-endian, 'V' little-endian).
$bomVariants = [
    ["\x00\x00\xfe\xff", 'N'],
    ["\xff\xfe\x00\x00", 'V'],
];
foreach ($bomVariants as [$bom, $packFormat]) {
    $rejected = 0;
    $accepted = '';
    for ($cp = 0xd7ff; $cp <= 0xe000; $cp++) {
        $hex = chk_enc($bom . pack($packFormat, $cp), 1, "UTF-32");
        if ($hex === false) {
            $rejected++;
        } else {
            $accepted .= $hex;
        }
    }
    var_dump($rejected);
    var_dump($accepted);
}
151
152?>
153--EXPECT--
154UTF-8 redundancy
155string(24) "000000310000003200000033"
156string(24) "000000410000004200000043"
157bool(false)
158bool(false)
159bool(false)
160bool(false)
161bool(false)
162bool(false)
163bool(false)
164bool(false)
165bool(false)
166bool(false)
167string(24) "000000a2000000a3000000a5"
168bool(false)
169bool(false)
170bool(false)
171bool(false)
172bool(false)
173string(8) "00000080"
174string(8) "000007ff"
175bool(false)
176string(8) "00000800"
177string(8) "0000ffff"
178bool(false)
179string(8) "00010000"
180bool(false)
181bool(false)
182bool(false)
183bool(false)
184bool(false)
185bool(false)
186bool(false)
187bool(false)
188UTF-8 and surrogates area
189int(2048)
190string(16) "0000d7ff0000e000"
191UTF-32 code range
192bool(false)
193string(8) "0010ffff"
194bool(false)
195string(8) "0010ffff"
196bool(false)
197string(8) "0010ffff"
198string(8) "0000fffd"
199string(8) "0010ffff"
200string(8) "0000fffd"
201string(8) "0010ffff"
202UTF-32 and surrogates area
203int(2048)
204string(16) "0000d7ff0000e000"
205int(2048)
206string(16) "0000d7ff0000e000"
207int(2048)
208string(16) "0000d7ff0000e000"
209UTF-32 and surrogates area with BOM
210int(2048)
211string(16) "0000d7ff0000e000"
212int(2048)
213string(16) "0000d7ff0000e000"
214