1--TEST--
Unicode standard conformance test (ill-formed UTF sequences)
3--SKIPIF--
<?php
// Skip the whole test when the mbstring extension is not loaded.
if (!extension_loaded('mbstring')) {
	die('skip mbstring not available');
}
?>
5--FILE--
6<?php
/**
 * Converts $str from $enc to UCS-4BE and compares the hex dump of the result
 * with an expected dump made of $n copies of "0000fffd" (the UCS-4BE form of
 * U+FFFD), optionally preceded by "0000feff" (U+FEFF).
 *
 * @param string $str      Raw input bytes to convert.
 * @param int    $n        Number of U+FFFD code units expected in the output.
 * @param string $enc      Source encoding name handed to mb_convert_encoding().
 * @param bool   $with_bom When true, the expected dump starts with U+FEFF.
 *
 * @return string|false false when the converted output equals the expected
 *                      dump exactly; otherwise the hex dump of the actual
 *                      converted output.
 */
function chk_enc($str, $n, $enc = "UTF-8", $with_bom = false) {
	$src = bin2hex(mb_convert_encoding($str, "UCS-4BE", $enc));
	$dst = ($with_bom ? "0000feff" : "") . str_repeat("0000fffd", $n);

	// Compare strictly: both operands are hex dumps, and "==" would compare
	// two numeric-looking dumps (e.g. "0000e000") as numbers instead of text.
	return ($dst === $src) ? false : $src;
}
19
// Use U+FFFD as the substitute character so that chk_enc() can count the
// "0000fffd" units in the converted output.
mb_substitute_character(0xfffd);

echo "UTF-8 redundancy\n";

// Each row is array(input bytes, U+FFFD count passed to chk_enc()).
// Rows are dumped in order, one var_dump() per row.
$utf8_rows = array(
	// ASCII '1','2','3' and 'A','B','C' in their one-byte form.
	array("\x31\x32\x33", 0),
	array("\x41\x42\x43", 0),
	// The same characters padded into two-byte sequences.
	array("\xc0\xb1\xc0\xb2\xc0\xb3", 6),
	array("\xc1\x81\xc1\x82\xc1\x83", 6),
	// ... three-byte sequences.
	array("\xe0\x80\xb1\xe0\x80\xb2\xe0\x80\xb3", 9),
	array("\xe0\x81\x81\xe0\x81\x82\xe0\x81\x83", 9),
	// ... four-byte sequences (the last sequence of the second row has only
	// three bytes, hence the count of 11).
	array("\xf0\x80\x80\xb1\xf0\x80\x80\xb2\xf0\x80\x80\xb3", 12),
	array("\xf0\x80\x81\x81\xf0\x80\x81\x82\xf0\x81\x83", 11),
	// ... five-byte sequences.
	array("\xf8\x80\x80\x80\xb1\xf8\x80\x80\x80\xb2\xf8\x80\x80\x80\xb3", 15),
	array("\xf8\x80\x80\x81\x81\xf8\x80\x80\x81\x82\xf8\x80\x80\x81\x83", 15),
	// ... six-byte sequences.
	array("\xfc\x80\x80\x80\x80\xb1\xfc\x80\x80\x80\x80\xb2\xfc\x80\x80\x80\x80\xb3", 18),
	array("\xfc\x80\x80\x80\x81\x81\xfc\x80\x80\x80\x81\x82\xfc\x80\x80\x80\x81\x83", 18),

	// U+00A2, U+00A3, U+00A5 in two-byte form, then padded into
	// three- to six-byte sequences.
	array("\xc2\xa2\xc2\xa3\xc2\xa5", 0),
	array("\xe0\x82\xa2\xe0\x82\xa3\xe0\x82\xa5", 9),
	array("\xf0\x80\x82\xa2\xf0\x80\x82\xa3\xf0\x80\x82\xa5", 12),
	array("\xf8\x80\x80\x82\xa2\xf8\x80\x80\x82\xa3\xf8\x80\x80\x82\xa5", 15),
	array("\xfc\x80\x80\x80\x82\xa2\xfc\x80\x80\x80\x82\xa3\xfc\x80\x80\x80\x82\xa5", 18),

	// Values on either side of each sequence-length boundary.
	array("\xc1\xbf", 2),
	array("\xc2\x80", 0),
	array("\xdf\xbf", 0),
	array("\xe0\x9f\xff", 3),
	array("\xe0\xa0\x80", 2),
	array("\xef\xbf\xbf", 0),
	array("\xf0\x8f\xbf\xbf", 4),
	array("\xf0\x90\x80\x80", 0),
	array("\xf7\xbf\xbf\xbf", 4),
	array("\xf8\x87\xbf\xbf\xbf", 5),
	array("\xf8\x88\x80\x80\x80", 5),
	array("\xfb\xbf\xbf\xbf\xbf", 5),
	array("\xfc\x83\xbf\xbf\xbf\xbf", 6),
	array("\xfc\x84\x80\x80\x80\x80", 6),
	array("\xfd\xaf\xbf\xbf\xbf\xbf", 6),
	array("\xfd\xbf\xbf\xbf\xbf\xbf", 6),
);
foreach ($utf8_rows as $row) {
	var_dump(chk_enc($row[0], $row[1]));
}
59
echo "UTF-8 and surrogates area\n";

// Encode every value from 0xd7ff through 0xe000 as a three-byte UTF-8
// sequence. A false result from chk_enc() (exactly three U+FFFD in the
// output) is counted; any other result is appended to the collected dump.
$rejected = 0;
$passed = '';
$cp = 0xd7ff;
while ($cp <= 0xe000) {
	$lead = chr(0xe0 | ($cp >> 12));
	$mid  = chr(0x80 | (($cp >> 6) & 0x3f));
	$tail = chr(0x80 | ($cp & 0x3f));

	$hex = chk_enc($lead . $mid . $tail, 3);
	if ($hex !== false) {
		$passed .= $hex;
	} else {
		$rejected++;
	}
	++$cp;
}
var_dump($rejected);
var_dump($passed);
73
echo "UTF-32 code range\n";

// Each row is array(input bytes, U+FFFD count, source encoding).
// 0x110000 and 0x10ffff are fed in big-endian and little-endian byte order,
// first without and then with a leading byte order mark.
$utf32_rows = array(
	array("\x00\x11\x00\x00", 1, "UTF-32BE"),
	array("\x00\x10\xff\xff", 0, "UTF-32BE"),
	array("\x00\x00\x11\x00", 1, "UTF-32LE"),
	array("\xff\xff\x10\x00", 0, "UTF-32LE"),
	array("\x00\x11\x00\x00", 1, "UTF-32"),
	array("\x00\x10\xff\xff", 0, "UTF-32"),
	array("\x00\x00\xfe\xff\x00\x11\x00\x00", 0, "UTF-32"),
	array("\x00\x00\xfe\xff\x00\x10\xff\xff", 0, "UTF-32"),
	array("\xff\xfe\x00\x00\x00\x00\x11\x00", 0, "UTF-32"),
	array("\xff\xfe\x00\x00\xff\xff\x10\x00", 0, "UTF-32"),
);
foreach ($utf32_rows as $row) {
	list($bytes, $expected_count, $encoding) = $row;
	var_dump(chk_enc($bytes, $expected_count, $encoding));
}
85
echo "UTF-32 and surrogates area\n";

// Sweep 0xd7ff..0xe000 once per encoding label. pack() format 'N' writes the
// value as four big-endian bytes and 'V' as four little-endian bytes; the
// plain "UTF-32" label is fed big-endian bytes without a byte order mark.
$variants = array(
	array("UTF-32BE", 'N'),
	array("UTF-32LE", 'V'),
	array("UTF-32", 'N'),
);
foreach ($variants as $variant) {
	list($encoding, $format) = $variant;

	$rejected = 0;
	$passed = '';
	for ($cp = 0xd7ff; $cp <= 0xe000; $cp++) {
		$hex = chk_enc(pack($format, $cp), 1, $encoding);
		if ($hex !== false) {
			// Not exactly one U+FFFD: keep the dump of what came out.
			$passed .= $hex;
			continue;
		}
		$rejected++;
	}
	var_dump($rejected);
	var_dump($passed);
}
125
echo "UTF-32 and surrogates area with BOM\n";

// Same sweep as above the BOM-less case, but every input is prefixed with a
// byte order mark and chk_enc() is told to expect U+FEFF in front of the
// single U+FFFD. Each row is array(BOM bytes, pack() format for the value):
// 'N' is big-endian, 'V' is little-endian.
$bom_variants = array(
	array("\x00\x00\xfe\xff", 'N'),
	array("\xff\xfe\x00\x00", 'V'),
);
foreach ($bom_variants as $variant) {
	list($bom, $format) = $variant;

	$rejected = 0;
	$passed = '';
	for ($cp = 0xd7ff; $cp <= 0xe000; $cp++) {
		$hex = chk_enc($bom . pack($format, $cp), 1, "UTF-32", true);
		if ($hex !== false) {
			$passed .= $hex;
			continue;
		}
		$rejected++;
	}
	var_dump($rejected);
	// Drop the U+FEFF units so only the converted code points are printed.
	var_dump(str_replace("0000feff", "", $passed));
}
155
156?>
157--EXPECT--
158UTF-8 redundancy
159string(24) "000000310000003200000033"
160string(24) "000000410000004200000043"
161bool(false)
162bool(false)
163bool(false)
164bool(false)
165bool(false)
166bool(false)
167bool(false)
168bool(false)
169bool(false)
170bool(false)
171string(24) "000000a2000000a3000000a5"
172bool(false)
173bool(false)
174bool(false)
175bool(false)
176bool(false)
177string(8) "00000080"
178string(8) "000007ff"
179bool(false)
180string(8) "00000800"
181string(8) "0000ffff"
182bool(false)
183string(8) "00010000"
184bool(false)
185bool(false)
186bool(false)
187bool(false)
188bool(false)
189bool(false)
190bool(false)
191bool(false)
192UTF-8 and surrogates area
193int(2048)
194string(16) "0000d7ff0000e000"
195UTF-32 code range
196bool(false)
197string(8) "0010ffff"
198bool(false)
199string(8) "0010ffff"
200bool(false)
201string(8) "0010ffff"
202string(16) "0000feff0000fffd"
203string(16) "0000feff0010ffff"
204string(16) "0000feff0000fffd"
205string(16) "0000feff0010ffff"
206UTF-32 and surrogates area
207int(2048)
208string(16) "0000d7ff0000e000"
209int(2048)
210string(16) "0000d7ff0000e000"
211int(2048)
212string(16) "0000d7ff0000e000"
213UTF-32 and surrogates area with BOM
214int(2048)
215string(16) "0000d7ff0000e000"
216int(2048)
217string(16) "0000d7ff0000e000"
218