--TEST--
Exhaustive test of mUTF-7 (IMAP) encoding verification and conversion
--EXTENSIONS--
mbstring
--FILE--
<?php
include('encoding_tests.inc');
mb_substitute_character(0x25); // '%'

/* Convert a UTF-8 string to UTF-16BE, which is the form in which text is
 * carried inside the Base64 sections built by the tests below */
function utf16BE($utf8) {
  return mb_convert_encoding($utf8, 'UTF-16BE', 'UTF-8');
}

/* 'Modified' Base64: standard Base64 with ',' in place of '/' and with the
 * '=' padding removed */
function mBase64($str) {
  return str_replace('=', '', str_replace('/', ',', base64_encode($str)));
}

/* Shorthand for testValidString() (from encoding_tests.inc) with the encodings fixed
 * to UTF7-IMAP -> UTF-8; $bothWays is forwarded unchanged (presumably it controls
 * whether the reverse conversion is checked too; see encoding_tests.inc) */
function testValid($from, $to, $bothWays = true) {
  testValidString($from, $to, 'UTF7-IMAP', 'UTF-8', $bothWays);
}
/* Shorthand for testInvalidString() (from encoding_tests.inc) with the encodings fixed
 * to UTF7-IMAP -> UTF-8 */
function testInvalid($from, $to) {
  testInvalidString($from, $to, 'UTF7-IMAP', 'UTF-8');
}

/* An empty string is valid */
testValid("", "");
echo "Identification passes on empty string... good start!\n";

/* RFC says that 0x00 should be Base64-encoded */
testValidString("\x00", "&AAA-", 'UTF-8', 'UTF7-IMAP');
echo "Null byte converted correctly\n";

/* Identification and conversion of ASCII characters (minus &) */
for ($i = 0x20; $i <= 0x7E; $i++) {
  if ($i == 0x26) // '&'
    continue;
  testValid(chr($i), chr($i));
}
echo "Testing all valid single-character ASCII strings... check!\n";

/* Identification and conversion of bytes outside the printable ASCII range:
 * control characters (0x00-0x1F), DEL (0x7F) and all 8-bit bytes (0x80-0xFF) */
for ($i = 0; $i < 0x20; $i++)
  testInvalid(chr($i), "%");
for ($i = 0x7F; $i < 256; $i++)
  testInvalid(chr($i), "%");
echo "Non-ASCII characters convert to illegal char marker... yes!\n";

/* Identification of '&' when Base-64 encoded */
testValid("&" . mBase64(utf16BE("&")) . "-", "&", false);
echo "& can be Base64-encoded... yes!\n";

/* Identification of unterminated & section */
identifyInvalidString("&", 'UTF7-IMAP');
identifyInvalidString("abc&", 'UTF7-IMAP');
identifyInvalidString("&" . mBase64(utf16BE("ハムサンドイッチ")), 'UTF7-IMAP');
echo "Testing unterminated & sections... yep!\n";

/* Identification of null shifts (& immediately after -)
 *
 * This is illegal according to the spec for mUTF-7 (IMAP), but currently we are letting
 * it pass... among other things, this makes it possible to concatenate UTF-7-IMAP
 * strings naively without the concatenated strings being treated as 'invalid'
 *
 * If ever we want to enforce this part of the spec, uncomment the following test */
/*
identifyInvalidString("&" . mBase64(utf16BE("肉包子")) . "-&" . mBase64(utf16BE("冰淇淋")) . "-", 'UTF7-IMAP');
echo "Testing consecutive & sections which should have been merged... yep!\n";
*/

/* Conversion of Base64-encoded ASCII characters (excluding &)
 * These should be treated as erroneous and mb_substitute_character should apply */
for ($i = 0x20; $i <= 0x7E; $i++) {
  if ($i == 0x26) // '&'
    continue;
  testInvalid("&" . mBase64(utf16BE(chr($i))) . "-", "%");
}
echo "Testing ASCII characters which are Base64-encoded... great!\n";

/* Conversion of & encoded as &- */
testValid("&-", "&");
testValid("abc&-", "abc&");
testValid("&-.&-", "&.&");
echo "Testing valid strings which use '&-' for '&'... good!\n";

/* Identification of & sections containing non-Base64 */

/* We'll use 6 character strings as a test, since 6 UTF-16 characters is just enough
 * to fit perfectly in Base64 encoding, with no padding */
$testString = mBase64(utf16BE("我是打酱油的"));
if (strlen($testString) != 16)
  die("Erk!!");
for ($i = 0; $i < 256; $i++) {
  if ($i >= 0x30 && $i <= 0x39) // '0'..'9'
    continue;
  if ($i >= 0x41 && $i <= 0x5A) // 'A'..'Z'
    continue;
  if ($i >= 0x61 && $i <= 0x7A) // 'a'..'z'
    continue;
  if ($i == 0x2B || $i == 0x2C) // '+' or ','
    continue;
  if ($i == 0x2D) // '-'... this will be interpreted as ending the Base64 section
    continue;
  identifyInvalidString("&" . substr($testString, 0, 11) . chr($i) . "-", 'UTF7-IMAP');
}
echo "Identification fails when Base64 sections contain non-Base64 bytes... right!\n";

/* Tell me, please, how many ways can UTF-16BE text get messed up?
 * Why, that's elementary... */

/* 1. The second half of a surrogate pair could come first, */
/* (U+10400 is the surrogate pair D801 DC00 in UTF-16BE) */
$testString = mb_convert_encoding("\x00\x01\x04\x00", 'UTF-16BE', 'UTF-32BE');
if (strlen($testString) != 4)
  die("Ouch!");
/* Swap the two halves, so that the low surrogate precedes the high one */
$testString = substr($testString, 2, 2) . substr($testString, 0, 2);
identifyInvalidString("&" . mBase64($testString) . "-", 'UTF7-IMAP');
/* (Or could appear by itself) */
$testString2 = substr($testString, 0, 2);
identifyInvalidString("&" . mBase64($testString2) . "-", 'UTF7-IMAP');

/* ...and we should detect this wherever it occurs */
/* Different numbers of leading 2-byte and 4-byte characters move the bad surrogate
 * to different offsets within the Base64 data */
$singleChar = "\x00\x01";
$doubleChar = mb_convert_encoding("\x00\x01\x04\x01", 'UTF-16BE', 'UTF-32BE');
if (strlen($doubleChar) != 4)
  die("That was supposed to be a surrogate pair");
identifyInvalidString("&" . mBase64($singleChar . $testString) . "-", 'UTF7-IMAP');
identifyInvalidString("&" . mBase64($singleChar . $singleChar . $testString) . "-", 'UTF7-IMAP');
identifyInvalidString("&" . mBase64($singleChar . $singleChar . $singleChar . $testString) . "-", 'UTF7-IMAP');
identifyInvalidString("&" . mBase64($singleChar . $testString2) . "-", 'UTF7-IMAP');
identifyInvalidString("&" . mBase64($singleChar . $singleChar . $testString2) . "-", 'UTF7-IMAP');
identifyInvalidString("&" . mBase64($singleChar . $singleChar . $singleChar . $testString2) . "-", 'UTF7-IMAP');
identifyInvalidString("&" . mBase64($doubleChar . $testString) . "-", 'UTF7-IMAP');
identifyInvalidString("&" . mBase64($singleChar . $doubleChar . $testString) . "-", 'UTF7-IMAP');
identifyInvalidString("&" . mBase64($singleChar . $singleChar . $doubleChar . $testString) . "-", 'UTF7-IMAP');
identifyInvalidString("&" . mBase64($doubleChar . $testString2) . "-", 'UTF7-IMAP');
identifyInvalidString("&" . mBase64($singleChar . $doubleChar . $testString2) . "-", 'UTF7-IMAP');
identifyInvalidString("&" . mBase64($singleChar . $singleChar . $doubleChar . $testString2) . "-", 'UTF7-IMAP');

/* 2. The first half of a surrogate pair might be followed by an invalid 2nd part, */
$testString = mb_convert_encoding("\x00\x01\x04\x00", 'UTF-16BE', 'UTF-32BE');
/* Keep the high surrogate, but follow it with UTF-16BE 'a' instead of a low surrogate */
$testString = substr($testString, 0, 2) . "\x00a";
identifyInvalidString("&" . mBase64($testString) . "-", 'UTF7-IMAP');

/* ...and we should also detect that wherever it occurs... */
identifyInvalidString("&" . mBase64($singleChar . $testString) . "-", 'UTF7-IMAP');
identifyInvalidString("&" . mBase64($singleChar . $singleChar . $testString) . "-", 'UTF7-IMAP');
identifyInvalidString("&" . mBase64($doubleChar . $testString) . "-", 'UTF7-IMAP');

/* 3. The first half of a surrogate pair could come at the end of the string, */
$testString = mb_convert_encoding("\x00\x01\x04\x00", 'UTF-16BE', 'UTF-32BE');
testInvalid("&" . mBase64(substr($testString, 0, 2)) . "-", "%");
testInvalid("&" . mBase64($singleChar . substr($testString, 0, 2)) . "-", "\x01%");
testInvalid("&" . mBase64($singleChar . $singleChar . substr($testString, 0, 2)) . "-", "\x01\x01%");
/* ...and the string could even be improperly terminated... */
testInvalid("&" . mBase64(substr($testString, 0, 2)), "%%");
testInvalid("&" . mBase64($singleChar . substr($testString, 0, 2)), "\x01%%");
/* NOTE: We currently don't check for trailing first half of surrogate pair when the string
 * abruptly ends after a group of 3 Base64-encoded codepoints... that's why we only emit one
 * error marker here for the incorrect termination of Base64 section and no error marker
 * for the trailing first half of surrogate pair */
testInvalid("&" . mBase64($singleChar . $singleChar . substr($testString, 0, 2)), "\x01\x01%");

/* 4. Or, it could have an odd number of bytes in it! */
$testString = utf16BE("ドーナツ");
$testString = substr($testString, 0, strlen($testString) - 1);
identifyInvalidString("&" . mBase64($testString) . "-", 'UTF7-IMAP');

/* And there is one bonus way to discombobulate your UTF-16BE when it is Base64-encoded...
 * The Base64 might not decode to an integral number of bytes
 * Or, equivalently... it might not be padded with zeroes (as the RFC requires) */
$testString = utf16BE("☺⛑");
if (strlen($testString) != 4)
  die("No good");
$encoded = mBase64($testString);
if (strlen($encoded) != 6)
  die("Don't like that");
/* Mess up the padding by replacing the last Base64 character with ',',
 * which represents 63 (a number with a 1 in the last bit) */
identifyInvalidString("&" . substr($encoded, 0, strlen($encoded) - 1) . ",-", 'UTF7-IMAP');

echo "Identification fails when UTF-16 text is invalid... no sweat!\n";

/* OK, let's try valid Base64-encoded text now */

/* 2-byte char */
testValid("&" . mBase64(utf16BE("☺")) . "-", "☺");
/* 2 + 2 */
testValid("&" . mBase64(utf16BE("饺子")) . "-", "饺子");
/* 2 + 2 + 2
 * These are FULLWIDTH digits (U+FF11..U+FF13), not ASCII '1', '2', '3'; Base64-encoded
 * printable ASCII is expected to be rejected (see the loop over 0x20..0x7E above) */
testValid("&" . mBase64(utf16BE("１２３")) . "-", "１２３");
/* 2 + 2 + 2 + 2 */
testValid("&" . mBase64(utf16BE("ᄚᄆᄇᄈ")) . "-", "ᄚᄆᄇᄈ");
/* 4 */
/* The same codepoint (U+10401) as UTF-16BE input and as expected UTF-8 output */
$longChar1 = mb_convert_encoding("\x00\x01\x04\x01", 'UTF-16BE', 'UTF-32BE');
$longChar2 = mb_convert_encoding("\x00\x01\x04\x01", 'UTF-8', 'UTF-32BE');
testValid("&" . mBase64($longChar1) . "-", $longChar2);
/* 2 + 4 */
testValid("&" . mBase64(utf16BE("饼") . $longChar1) . "-", "饼" . $longChar2);
/* 4 + 2 */
testValid("&" . mBase64($longChar1 . utf16BE("饼")) . "-", $longChar2 . "饼");
/* 2 + 4 + 2 */
testValid("&" . mBase64(utf16BE("☺") . $longChar1 . utf16BE("饼")) . "-", "☺" . $longChar2 . "饼");
/* 2 + 2 + 4 */
testValid("&" . mBase64(utf16BE("西瓜") . $longChar1) . "-", "西瓜" . $longChar2);
/* 2 + 2 + 4 + 2 */
testValid("&" . mBase64(utf16BE("西瓜") . $longChar1 . utf16BE("☺")) . "-", "西瓜" . $longChar2 . "☺");
/* 2 + 2 + 4 + 4 */
testValid("&" . mBase64(utf16BE("西瓜") . $longChar1 . $longChar1) . "-", "西瓜" . $longChar2 . $longChar2);
/* 2 + 2 + 2 + 4 */
testValid("&" . mBase64(utf16BE("西红柿") . $longChar1) . "-", "西红柿" . $longChar2);

/* Multiple sections of valid ASCII _and_ Base64-encoded text
 * (the literal parts are ASCII; the first Base64 section holds fullwidth digits) */
testValid("123&" . mBase64(utf16BE("１２３")) . "-abc&" . mBase64(utf16BE("☺")) . "-.", "123１２３abc☺.");

/* If a & character appears right after a non-ASCII character, we must first close the Base64
 * section and then emit &- */
testValidString("☺&", "&Jjo-&-", "UTF-8", "UTF7-IMAP", false);
testValidString("西瓜&", "&iX903A-&-", "UTF-8", "UTF7-IMAP", false);
testValidString("西红柿&", "&iX9+omf,-&-", "UTF-8", "UTF7-IMAP", false);

echo "Identification and conversion of valid text is working... perfect!\n";

// Try illegal Unicode codepoint (> 0x10FFFF)
convertInvalidString("\x00\x20\x00\x00", "%", "UCS-4BE", "UTF7-IMAP");

// Test "long" illegal character markers
mb_substitute_character("long");
convertInvalidString("\x10", "%", "UTF7-IMAP", "UTF-8");
convertInvalidString("\x80", "%", "UTF7-IMAP", "UTF-8");
// The & starts a Base-64 coded section, which is OK... but there's no data in it
convertInvalidString("abc&", "abc%", "UTF7-IMAP", "UTF-8");
// When we hit the first bad byte in a Base-64 coded section, it drops us back into
// the default mode, so the following characters are literal
convertInvalidString("&**-", "%*-", "UTF7-IMAP", "UTF-8");

// Try strings where Base64 has an extra trailing byte which is not needed
convertInvalidString('&RR8I', "\xE4\x94\x9F%", 'UTF7-IMAP', 'UTF-8');
convertInvalidString('&RR8IAAA', "\xE4\x94\x9F\xE0\xA0\x80%", 'UTF7-IMAP', 'UTF-8');

// It is useless for a Base64 section to only contain a single 'A'
// (which decodes to only zero bits)
convertInvalidString("&A", "\x00\x00\x00%", 'UTF7-IMAP', 'UTF-32BE');

echo "Done!\n";
?>
--EXPECT--
Identification passes on empty string... good start!
Null byte converted correctly
Testing all valid single-character ASCII strings... check!
Non-ASCII characters convert to illegal char marker... yes!
& can be Base64-encoded... yes!
Testing unterminated & sections... yep!
Testing ASCII characters which are Base64-encoded... great!
Testing valid strings which use '&-' for '&'... good!
Identification fails when Base64 sections contain non-Base64 bytes... right!
Identification fails when UTF-16 text is invalid... no sweat!
Identification and conversion of valid text is working... perfect!
Done!