1--TEST--
2Exhaustive test of mUTF-7 (IMAP) encoding verification and conversion
3--EXTENSIONS--
4mbstring
5--FILE--
6<?php
7include('encoding_tests.inc');
/* Render every conversion error as '%' (0x25); the expected strings below are written
 * with that marker */
mb_substitute_character(ord('%'));
9
/* Re-encode a UTF-8 string as UTF-16BE; the tests below Base64-encode the result to
 * build '&...-' sections */
function utf16BE($utf8) {
	$bigEndian = mb_convert_encoding($utf8, 'UTF-16BE', 'UTF-8');
	return $bigEndian;
}
13
/* Base64-encode $str in the "modified" form used by the tests below:
 * ',' stands in for '/', and the '=' padding is dropped */
function mBase64($str) {
	$standard = base64_encode($str);
	// base64_encode() only ever emits '=' as trailing padding, so trimming the
	// right-hand side removes every occurrence
	return rtrim(strtr($standard, '/', ','), '=');
}
17
/* Shorthand for testValidString() (not defined in this file; presumably provided by
 * encoding_tests.inc) with the encoding pair fixed to UTF7-IMAP and UTF-8.
 * $bothWays is handed through unchanged and defaults to true. */
function testValid($from, $to, $bothWays = true) {
	$encodingUnderTest = 'UTF7-IMAP';
	$referenceEncoding = 'UTF-8';
	testValidString($from, $to, $encodingUnderTest, $referenceEncoding, $bothWays);
}
/* Shorthand for testInvalidString() (not defined in this file; presumably provided by
 * encoding_tests.inc) with the encoding pair fixed to UTF7-IMAP and UTF-8 */
function testInvalid($from, $to) {
	$encodingUnderTest = 'UTF7-IMAP';
	$referenceEncoding = 'UTF-8';
	testInvalidString($from, $to, $encodingUnderTest, $referenceEncoding);
}
24
/* Nothing at all is still a valid string */
$nothing = "";
testValid($nothing, $nothing);
print "Identification passes on empty string... good start!\n";

/* Per the RFC, a 0x00 byte has to go into a Base64 section when encoding */
$rawNul = "\x00";
$encodedNul = "&AAA-";
testValidString($rawNul, $encodedNul, 'UTF-8', 'UTF7-IMAP');
print "Null byte converted correctly\n";
32
/* Each byte from 0x20 to 0x7E, with '&' left out, must be accepted as-is and
 * convert to itself */
foreach (range(0x20, 0x7E) as $codepoint) {
	if ($codepoint !== 0x26) { // skip '&'
		$literal = chr($codepoint);
		testValid($literal, $literal);
	}
}
echo "Testing all valid single-character ASCII strings... check!\n";
40
/* Raw bytes which may not appear literally: control bytes 0x00..0x1F, then 0x7F and
 * everything above it. Each one alone must be flagged and convert to the error marker. */
$rejectedBytes = array_merge(range(0x00, 0x1F), range(0x7F, 0xFF));
foreach ($rejectedBytes as $byte) {
	testInvalid(chr($byte), "%");
}
echo "Non-ASCII characters convert to illegal char marker... yes!\n";
47
/* '&' itself is allowed to travel inside a Base64 section
 * (checked in the decoding direction only) */
$ampersandInBase64 = "&" . mBase64(utf16BE("&")) . "-";
testValid($ampersandInBase64, "&", false);
echo "& can be Base64-encoded... yes!\n";

/* A '&' section which never reaches its closing '-' must not pass identification */
$unterminatedSections = [
	"&",
	"abc&",
	"&" . mBase64(utf16BE("ハムサンドイッチ")),
];
foreach ($unterminatedSections as $unterminated) {
	identifyInvalidString($unterminated, 'UTF7-IMAP');
}
echo "Testing unterminated & sections... yep!\n";
57
58/* Identification of null shifts (& immediately after -)
59 *
60 * This is illegal according to the spec for mUTF-7 (IMAP), but currently we are letting
61 * it pass... among other things, this makes it possible to concatenate UTF-7-IMAP
62 * strings naively without the concatenated strings being treated as 'invalid'
63 *
64 * If ever we want to enforce this part of the spec, uncomment the following test */
65/*
66identifyInvalidString("&" . mBase64(utf16BE("肉包子")) . "-&" . mBase64(utf16BE("冰淇淋")) . "-", 'UTF7-IMAP');
67echo "Testing consecutive & sections which should have been merged... yep!\n";
68*/
69
/* Printable ASCII (other than '&') wrapped in a Base64 section is an error:
 * every such input must convert to the mb_substitute_character marker */
foreach (range(0x20, 0x7E) as $codepoint) {
	if ($codepoint === 0x26) { // '&' is the one exception
		continue;
	}
	$needlesslyEncoded = "&" . mBase64(utf16BE(chr($codepoint))) . "-";
	testInvalid($needlesslyEncoded, "%");
}
echo "Testing ASCII characters which are Base64-encoded... great!\n";
78
/* '&-' is the escaped spelling of a literal '&' */
$ampersandDashCases = [
	["&-", "&"],
	["abc&-", "abc&"],
	["&-.&-", "&.&"],
];
foreach ($ampersandDashCases as [$encoded, $decoded]) {
	testValid($encoded, $decoded);
}
echo "Testing valid strings which use '&-' for '&'... good!\n";
84
/* '&' sections which contain a byte from outside the Base64 alphabet */

/* Six UTF-16 characters are 12 bytes, which Base64 turns into exactly 16 characters
 * with no padding; the sanity check below guards that assumption */
$sixCharsEncoded = mBase64(utf16BE("我是打酱油的"));
if (strlen($sixCharsEncoded) != 16)
	die("Erk!!");

/* Bytes which must NOT be tried: the modified Base64 alphabet itself, plus '-',
 * which would be read as the end of the Base64 section */
$skippedBytes = "0123456789"
	. "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
	. "abcdefghijklmnopqrstuvwxyz"
	. "+,"
	. "-";
$goodPrefix = "&" . substr($sixCharsEncoded, 0, 11);
for ($byte = 0; $byte < 256; $byte++) {
	$candidate = chr($byte);
	if (str_contains($skippedBytes, $candidate))
		continue;
	identifyInvalidString($goodPrefix . $candidate . "-", 'UTF7-IMAP');
}
echo "Identification fails when Base64 sections contain non-Base64 bytes... right!\n";
106
/* In how many ways can the UTF-16BE text inside a Base64 section be broken?
 * Let us count them... */

/* 1. The second half of a surrogate pair arrives before the first half, */
$surrogatePair = mb_convert_encoding("\x00\x01\x04\x00", 'UTF-16BE', 'UTF-32BE');
if (strlen($surrogatePair) != 4)
	die("Ouch!");
$firstHalf = substr($surrogatePair, 0, 2);
$secondHalf = substr($surrogatePair, 2, 2);
$swappedPair = $secondHalf . $firstHalf;
identifyInvalidString("&" . mBase64($swappedPair) . "-", 'UTF7-IMAP');
/* (or arrives all on its own) */
identifyInvalidString("&" . mBase64($secondHalf) . "-", 'UTF7-IMAP');

/* ...and that must be noticed no matter what valid text precedes it */
$bmpChar = "\x00\x01";
$astralChar = mb_convert_encoding("\x00\x01\x04\x01", 'UTF-16BE', 'UTF-32BE');
if (strlen($astralChar) != 4)
	die("That was supposed to be a surrogate pair");
$misplacedSecondHalves = [$swappedPair, $secondHalf];
$bmpLeadIns = [
	$bmpChar,
	$bmpChar . $bmpChar,
	$bmpChar . $bmpChar . $bmpChar,
];
$astralLeadIns = [
	$astralChar,
	$bmpChar . $astralChar,
	$bmpChar . $bmpChar . $astralChar,
];
foreach ([$bmpLeadIns, $astralLeadIns] as $leadIns) {
	foreach ($misplacedSecondHalves as $badTail) {
		foreach ($leadIns as $goodHead) {
			identifyInvalidString("&" . mBase64($goodHead . $badTail) . "-", 'UTF7-IMAP');
		}
	}
}

/* 2. The first half of a surrogate pair is followed by something which is not a
 *    second half; again, this must be caught at any position */
$brokenPair = $firstHalf . "\x00a";
foreach (["", $bmpChar, $bmpChar . $bmpChar, $astralChar] as $goodHead) {
	identifyInvalidString("&" . mBase64($goodHead . $brokenPair) . "-", 'UTF7-IMAP');
}

/* 3. The first half of a surrogate pair is the last thing in the string */
$danglingFirstHalfCases = [
	["&" . mBase64($firstHalf) . "-", "%"],
	["&" . mBase64($bmpChar . $firstHalf) . "-", "\x01%"],
	["&" . mBase64($bmpChar . $bmpChar . $firstHalf) . "-", "\x01\x01%"],
	/* ...possibly with the Base64 section left unterminated as well */
	["&" . mBase64($firstHalf), "%%"],
	["&" . mBase64($bmpChar . $firstHalf), "\x01%%"],
	/* NOTE: When the input stops dead right after a group of 3 Base64-encoded
	 * codepoints, the trailing first half of a surrogate pair is currently not
	 * checked for. Hence a single marker here: it reports the unterminated Base64
	 * section, and nothing reports the dangling first half. */
	["&" . mBase64($bmpChar . $bmpChar . $firstHalf), "\x01\x01%"],
];
foreach ($danglingFirstHalfCases as [$input, $expected]) {
	testInvalid($input, $expected);
}

/* 4. The decoded text has an odd number of bytes */
$oddLength = substr(utf16BE("ドーナツ"), 0, -1);
identifyInvalidString("&" . mBase64($oddLength) . "-", 'UTF7-IMAP');

/* Bonus round: the Base64 itself may fail to decode to a whole number of bytes,
 * which is the same as saying that its final character is not padded with zero
 * bits (as the RFC requires) */
$twoChars = utf16BE("☺⛑");
if (strlen($twoChars) != 4)
	die("No good");
$twoCharsEncoded = mBase64($twoChars);
if (strlen($twoCharsEncoded) != 6)
	die("Don't like that");
/* Swap the final Base64 character for ',', i.e. 63, whose lowest bit is set, so
 * the padding bits are no longer zero */
identifyInvalidString("&" . substr($twoCharsEncoded, 0, -1) . ",-", 'UTF7-IMAP');

echo "Identification fails when UTF-16 text is invalid... no sweat!\n";
181
/* OK, let's try valid Base64-encoded text now
 * (the sums in the comments below count UTF-16BE bytes per character) */

/* Fullwidth digits U+FF11..U+FF13. These must NOT be plain ASCII "123": printable ASCII
 * inside a Base64 section is an error (see the "ASCII characters which are Base64-encoded"
 * test above), and when encoding, ASCII would be emitted literally rather than in Base64.
 * Written as escapes so that Unicode normalization of this file cannot silently fold
 * them into ASCII digits. */
$fullwidthDigits = "\u{FF11}\u{FF12}\u{FF13}";

/* 2-byte char */
testValid("&" . mBase64(utf16BE("☺")) . "-", "☺");
/* 2 + 2 */
testValid("&" . mBase64(utf16BE("饺子")) . "-", "饺子");
/* 2 + 2 + 2 */
testValid("&" . mBase64(utf16BE($fullwidthDigits)) . "-", $fullwidthDigits);
/* 2 + 2 + 2 + 2 */
testValid("&" . mBase64(utf16BE("ᄚᄆᄇᄈ")) . "-", "ᄚᄆᄇᄈ");
/* 4 (one character outside the BMP, i.e. a surrogate pair in UTF-16) */
$longChar1 = mb_convert_encoding("\x00\x01\x04\x01", 'UTF-16BE', 'UTF-32BE');
$longChar2 = mb_convert_encoding("\x00\x01\x04\x01", 'UTF-8', 'UTF-32BE');
testValid("&" . mBase64($longChar1) . "-", $longChar2);
/* 2 + 4 */
testValid("&" . mBase64(utf16BE("饼") . $longChar1) . "-", "饼" . $longChar2);
/* 4 + 2 */
testValid("&" . mBase64($longChar1 . utf16BE("饼")) . "-", $longChar2 . "饼");
/* 2 + 4 + 2 */
testValid("&" . mBase64(utf16BE("☺") . $longChar1 . utf16BE("饼")) . "-", "☺" . $longChar2 . "饼");
/* 2 + 2 + 4 */
testValid("&" . mBase64(utf16BE("西瓜") . $longChar1) . "-", "西瓜" . $longChar2);
/* 2 + 2 + 4 + 2 */
testValid("&" . mBase64(utf16BE("西瓜") . $longChar1 . utf16BE("☺")) . "-", "西瓜" . $longChar2 . "☺");
/* 2 + 2 + 4 + 4 */
testValid("&" . mBase64(utf16BE("西瓜") . $longChar1 . $longChar1) . "-", "西瓜" . $longChar2 . $longChar2);
/* 2 + 2 + 2 + 4 */
testValid("&" . mBase64(utf16BE("西红柿") . $longChar1) . "-", "西红柿" . $longChar2);

/* Multiple sections of valid ASCII _and_ Base64-encoded text
 * (literal ASCII "123" first, then the fullwidth digits in Base64) */
testValid(
	"123&" . mBase64(utf16BE($fullwidthDigits)) . "-abc&" . mBase64(utf16BE("☺")) . "-.",
	"123" . $fullwidthDigits . "abc☺."
);

/* If a & character appears right after a non-ASCII character, we must first close the Base64
 * section and then emit &- */
testValidString("☺&", "&Jjo-&-", "UTF-8", "UTF7-IMAP", false);
testValidString("西瓜&", "&iX903A-&-", "UTF-8", "UTF7-IMAP", false);
testValidString("西红柿&", "&iX9+omf,-&-", "UTF-8", "UTF7-IMAP", false);

echo "Identification and conversion of valid text is working... perfect!\n";
221
// A codepoint beyond 0x10FFFF is not legal Unicode and cannot be encoded
$beyondUnicode = "\x00\x20\x00\x00";
convertInvalidString($beyondUnicode, "%", "UCS-4BE", "UTF7-IMAP");

// Switch to "long" illegal character markers and convert some bad input
mb_substitute_character("long");
$longMarkerCases = [
	["\x10", "%"],
	["\x80", "%"],
	// The & opens a Base-64 coded section, which is fine in itself... but the
	// section holds no data
	["abc&", "abc%"],
	// The first bad byte inside a Base-64 coded section throws us back into the
	// default mode, so whatever follows is taken literally
	["&**-", "%*-"],
];
foreach ($longMarkerCases as [$input, $expected]) {
	convertInvalidString($input, $expected, "UTF7-IMAP", "UTF-8");
}

// Base64 sections carrying one trailing character more than is needed
$trailingCharCases = [
	['&RR8I', "\xE4\x94\x9F%"],
	['&RR8IAAA', "\xE4\x94\x9F\xE0\xA0\x80%"],
];
foreach ($trailingCharCases as [$input, $expected]) {
	convertInvalidString($input, $expected, 'UTF7-IMAP', 'UTF-8');
}

// A Base64 section consisting of a lone 'A' is pointless
// (all it decodes to is zero bits)
convertInvalidString("&A", "\x00\x00\x00%", 'UTF7-IMAP', 'UTF-32BE');

echo "Done!\n";
241?>
242--EXPECT--
243Identification passes on empty string... good start!
244Null byte converted correctly
245Testing all valid single-character ASCII strings... check!
246Non-ASCII characters convert to illegal char marker... yes!
247& can be Base64-encoded... yes!
248Testing unterminated & sections... yep!
249Testing ASCII characters which are Base64-encoded... great!
250Testing valid strings which use '&-' for '&'... good!
251Identification fails when Base64 sections contain non-Base64 bytes... right!
252Identification fails when UTF-16 text is invalid... no sweat!
253Identification and conversion of valid text is working... perfect!
254Done!
255