--TEST--
mb_substr()
--EXTENSIONS--
mbstring
--FILE--
<?php
/*
 * Exercises mb_substr() across several multibyte encodings (EUC-JP, the SJIS
 * family, MacJapanese, ISO-2022-JP, GB18030, HZ, UTF-8, UTF-32, UTF-7) and
 * replays a few regression inputs.
 *
 * Output conventions used below:
 *  - results in legacy CJK encodings are printed through bin2hex() so that the
 *    expected output stays plain ASCII;
 *  - HZ and UTF-8 results are printed as-is;
 *  - UTF-32 and UTF-7 results are converted back to UTF-8 before printing.
 *
 * The "5:", "6:" and "7:" labels intentionally have no trailing space: those
 * cases produce an empty substring and the expected output lines are bare.
 */
ini_set('include_path','.');
include_once('common.inc');

// EUC-JP
$euc_jp = mb_convert_encoding('0123この文字列は日本語です。EUC-JPを使っています。日本語は面倒臭い。', 'EUC-JP', 'UTF-8');
// SJIS
// NOTE: the digits ５６７８９ are deliberately FULLWIDTH. The expected output for
// cases 3 and 4 contains their two-byte encodings (8254..8258); writing them as
// ASCII digits makes those cases fail.
$sjis = mb_convert_encoding('日本語テキストです。01234５６７８９。', 'SJIS', 'UTF-8');
// ISO-2022-JP
$iso2022jp = "\x1B\$B\x21\x21!r\x1B(BABC";
// GB-18030
$gb18030 = mb_convert_encoding('密码用户名密码名称名称', 'GB18030', 'UTF-8');
// HZ
$hz = "The next sentence is in GB.~{<:Ky2;S{#,NpJ)l6HK!#~}Bye.";
// UTF-8
$utf8 = "Greek: Σὲ γνωρίζω ἀπὸ τὴν κόψη Russian: Зарегистрируйтесь";
// UTF-32
$utf32 = mb_convert_encoding($utf8, 'UTF-32', 'UTF-8');
// UTF-7
$utf7 = mb_convert_encoding($utf8, 'UTF-7', 'UTF-8');

echo "EUC-JP:\n";
print "1: ". bin2hex(mb_substr($euc_jp, 10, 10, 'EUC-JP')) . "\n";
print "2: ". bin2hex(mb_substr($euc_jp, 0, 100, 'EUC-JP')) . "\n";

// A start offset of 100 must yield an empty string for this fixture.
$str = mb_substr($euc_jp, 100, 10, 'EUC-JP');
print ($str === "") ? "3 OK\n" : "BAD: " . bin2hex($str) . "\n";

// A start offset of -100 must still return a non-empty result for this fixture.
$str = mb_substr($euc_jp, -100, 10, 'EUC-JP');
print ($str !== "") ? "4 OK: " . bin2hex($str) . "\n" : "BAD: " . bin2hex($str) . "\n";

echo "SJIS:\n";
print "1: " . bin2hex(mb_substr($sjis, 0, 3, 'SJIS')) . "\n";
print "2: " . bin2hex(mb_substr($sjis, -1, null, 'SJIS')) . "\n";
print "3: " . bin2hex(mb_substr($sjis, -5, 3, 'SJIS')) . "\n";
print "4: " . bin2hex(mb_substr($sjis, 1, null, 'SJIS')) . "\n";
print "5:" . bin2hex(mb_substr($sjis, 10, 0, 'SJIS')) . "\n";
// The same input containing byte 0x80 is run through every SJIS variant below.
// Per the expected output, the SJIS variants emit 3f ('?') for that byte while
// MacJapanese emits byte 80.
echo "-- Testing illegal SJIS byte 0x80 --\n";
print bin2hex(mb_substr("\x80abc\x80\xA1", 3, 2, 'SJIS')) . "\n";
print bin2hex(mb_substr("\x80abc\x80\xA1", 0, 3, 'SJIS')) . "\n";

echo "SJIS-2004:\n";
print bin2hex(mb_substr("\x80abc\x80\xA1", 3, 2, 'SJIS-2004')) . "\n";
print bin2hex(mb_substr("\x80abc\x80\xA1", 0, 3, 'SJIS-2004')) . "\n";

echo "MacJapanese:\n";
print bin2hex(mb_substr("\x80abc\x80\xA1", 3, 2, 'MacJapanese')) . "\n";
print bin2hex(mb_substr("\x80abc\x80\xA1", 0, 3, 'MacJapanese')) . "\n";

echo "SJIS-Mobile#DOCOMO:\n";
print bin2hex(mb_substr("\x80abc\x80\xA1", 3, 2, 'SJIS-Mobile#DOCOMO')) . "\n";
print bin2hex(mb_substr("\x80abc\x80\xA1", 0, 3, 'SJIS-Mobile#DOCOMO')) . "\n";

echo "SJIS-Mobile#KDDI:\n";
print bin2hex(mb_substr("\x80abc\x80\xA1", 3, 2, 'SJIS-Mobile#KDDI')) . "\n";
print bin2hex(mb_substr("\x80abc\x80\xA1", 0, 3, 'SJIS-Mobile#KDDI')) . "\n";

echo "SJIS-Mobile#SoftBank:\n";
print bin2hex(mb_substr("\x80abc\x80\xA1", 3, 2, 'SJIS-Mobile#SoftBank')) . "\n";
print bin2hex(mb_substr("\x80abc\x80\xA1", 0, 3, 'SJIS-Mobile#SoftBank')) . "\n";

echo "-- Testing MacJapanese characters which map to 3-5 codepoints each --\n";

/* There are many characters in MacJapanese which map to sequences of several codepoints */
print bin2hex(mb_substr("abc\x85\xAB\x85\xAC\x85\xAD", 0, 3, 'MacJapanese')) . "\n";
print bin2hex(mb_substr("abc\x85\xAB\x85\xAC\x85\xAD", 3, 2, 'MacJapanese')) . "\n";
print bin2hex(mb_substr("abc\x85\xAB\x85\xAC\x85\xAD", -2, 1, 'MacJapanese')) . "\n";
print bin2hex(mb_substr("abc\x85\xBF\x85\xC0\x85\xC1", 0, 3, 'MacJapanese')) . "\n";
print bin2hex(mb_substr("abc\x85\xBF\x85\xC0\x85\xC1", 3, 2, 'MacJapanese')) . "\n";
print bin2hex(mb_substr("abc\x85\xBF\x85\xC0\x85\xC1", -2, 1, 'MacJapanese')) . "\n";

echo "ISO-2022-JP:\n";
print "1: " . bin2hex(mb_substr($iso2022jp, 0, 3, 'ISO-2022-JP')) . "\n";
print "2: " . bin2hex(mb_substr($iso2022jp, -1, null, 'ISO-2022-JP')) . "\n";
print "3: " . bin2hex(mb_substr($iso2022jp, -6, 3, 'ISO-2022-JP')) . "\n";
print "4: " . bin2hex(mb_substr($iso2022jp, -3, 2, 'ISO-2022-JP')) . "\n";
print "5: " . bin2hex(mb_substr($iso2022jp, 1, null, 'ISO-2022-JP')) . "\n";
print "6:" . bin2hex(mb_substr($iso2022jp, 10, 0, 'ISO-2022-JP')) . "\n";
print "7:" . bin2hex(mb_substr($iso2022jp, 100, 10, 'ISO-2022-JP')) . "\n";

echo "GB-18030:\n";
print "1: " . bin2hex(mb_substr($gb18030, 0, 3, 'GB-18030')) . "\n";
print "2: " . bin2hex(mb_substr($gb18030, -1, null, 'GB-18030')) . "\n";
print "3: " . bin2hex(mb_substr($gb18030, -5, 3, 'GB-18030')) . "\n";
print "4: " . bin2hex(mb_substr($gb18030, 1, null, 'GB-18030')) . "\n";
print "5:" . bin2hex(mb_substr($gb18030, 10, 0, 'GB-18030')) . "\n";

echo "HZ:\n";
print "1: " . mb_substr($hz, 0, 3, 'HZ') . "\n";
print "2: " . mb_substr($hz, -1, null, 'HZ') . "\n";
print "3: " . mb_substr($hz, -5, 3, 'HZ') . "\n";
print "4: " . mb_substr($hz, 1, null, 'HZ') . "\n";
print "5:" . mb_substr($hz, 10, 0, 'HZ') . "\n";

echo "UTF-8:\n";
print "1: " . mb_substr($utf8, 0, 3, 'UTF-8') . "\n";
print "2: " . mb_substr($utf8, -1, null, 'UTF-8') . "\n";
print "3: " . mb_substr($utf8, -5, 3, 'UTF-8') . "\n";
print "4: " . mb_substr($utf8, 1, null, 'UTF-8') . "\n";
print "5:" . mb_substr($utf8, 10, 0, 'UTF-8') . "\n";

echo "UTF-32:\n";
print "1: " . mb_convert_encoding(mb_substr($utf32, 0, 3, 'UTF-32'), 'UTF-8', 'UTF-32') . "\n";
print "2: " . mb_convert_encoding(mb_substr($utf32, -1, null, 'UTF-32'), 'UTF-8', 'UTF-32') . "\n";
print "3: " . mb_convert_encoding(mb_substr($utf32, -5, 3, 'UTF-32'), 'UTF-8', 'UTF-32') . "\n";
print "4: " . mb_convert_encoding(mb_substr($utf32, 1, null, 'UTF-32'), 'UTF-8', 'UTF-32') . "\n";
print "5:" . mb_convert_encoding(mb_substr($utf32, 10, 0, 'UTF-32'), 'UTF-8', 'UTF-32') . "\n";

echo "UTF-7:\n";
print "1: " . mb_convert_encoding(mb_substr($utf7, 0, 3, 'UTF-7'), 'UTF-8', 'UTF-7') . "\n";
print "2: " . mb_convert_encoding(mb_substr($utf7, -1, null, 'UTF-7'), 'UTF-8', 'UTF-7') . "\n";
print "3: " . mb_convert_encoding(mb_substr($utf7, -5, 3, 'UTF-7'), 'UTF-8', 'UTF-7') . "\n";
print "4: " . mb_convert_encoding(mb_substr($utf7, 1, null, 'UTF-7'), 'UTF-8', 'UTF-7') . "\n";
print "5:" . mb_convert_encoding(mb_substr($utf7, 10, 0, 'UTF-7'), 'UTF-8', 'UTF-7') . "\n";

echo "Testing agreement with mb_strpos on invalid UTF-8 string:\n";
/* Stefan Schiller pointed out that on invalid UTF-8 strings, character indices returned
 * by mb_strpos would not extract the desired part of the string when passed to mb_substr.
 * This is the test case which he provided: */
$data = "\xF0AAA<b>";
$pos = mb_strpos($data, "<", 0, "UTF-8");
$out = mb_substr($data, 0, $pos, "UTF-8");
print $out . "\n";

echo "Regression:\n";
/* During development, one >= comparison in mb_get_substr was wrongly written as >
 * This was caught by libFuzzer */
$str = "\xbd\xbd\xbd\xbd\xbd\xbd\xbd\xbe\xbd\xbd\xbd\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x89\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x00\x00\x00\x00\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8b\x8b\x8b\xbd\xbd\xbd\xbd\xbd\xbd\xbd\xbe\x01:O\xaa\xd3";
echo bin2hex(mb_substr($str, 0, 128, "JIS")), "\n";

/* Alex messed up when reimplementing mb_substr and, in cases where `from` is non-zero and
 * the number of characters to extract is more than 128, miscalculated where to end the substring
 * Thanks to Maurício Fauth for finding the issue */
var_dump(mb_substr('Lorem ipsum dolor sit amet, consectetur adipiscing elit. Vestibulum dapibus feugiat ex non cursus. Pellentesque vestibulum tellus sit lectus.', 19, -1));

?>
--EXPECT--
EUC-JP:
1: c6fccbdcb8eca4c7a4b9a1a34555432d
2: 30313233a4b3a4cecab8bbfacef3a4cfc6fccbdcb8eca4c7a4b9a1a34555432d4a50a4f2bbc8a4c3a4c6a4a4a4dea4b9a1a3c6fccbdcb8eca4cfccccc5ddbdada4a4a1a3
3 OK
4 OK: 30313233a4b3a4cecab8bbfacef3a4cf
SJIS:
1: 93fa967b8cea
2: 8142
3: 825582568257
4: 967b8cea8365834c8358836782c582b781423031323334825482558256825782588142
5:
-- Testing illegal SJIS byte 0x80 --
633f
3f6162
SJIS-2004:
633f
3f6162
MacJapanese:
6380
806162
SJIS-Mobile#DOCOMO:
633f
3f6162
SJIS-Mobile#KDDI:
633f
3f6162
SJIS-Mobile#SoftBank:
633f
3f6162
-- Testing MacJapanese characters which map to 3-5 codepoints each --
616263
3f3f
58
616263
3f3f
78
ISO-2022-JP:
1: 1b2442212121721b284241
2: 43
3: 1b2442212121721b284241
4: 4142
5: 1b244221721b2842414243
6:
7:
GB-18030:
1: c3dcc2ebd3c3
2: b3c6
3: c2ebc3fbb3c6
4: c2ebd3c3bba7c3fbc3dcc2ebc3fbb3c6c3fbb3c6
5:
HZ:
1: The
2: .
3: ~{!#~}By
4: he next sentence is in GB.~{<:Ky2;S{#,NpJ)l6HK!#~}Bye.
5:
UTF-8:
1: Gre
2: ь
3: йте
4: reek: Σὲ γνωρίζω ἀπὸ τὴν κόψη Russian: Зарегистрируйтесь
5:
UTF-32:
1: Gre
2: ь
3: йте
4: reek: Σὲ γνωρίζω ἀπὸ τὴν κόψη Russian: Зарегистрируйтесь
5:
UTF-7:
1: Gre
2: ь
3: йте
4: reek: Σὲ γνωρίζω ἀπὸ τὴν κόψη Russian: Зарегистрируйтесь
5:
Testing agreement with mb_strpos on invalid UTF-8 string:
?AAA
Regression:
1b28493d3d3d3d3d3d3d3e3d3d3d1b28423f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f000000003f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f1b28493d3d3d3d3d3d3d3e1b2842013a4f1b28492a1b2842
string(121) "it amet, consectetur adipiscing elit. Vestibulum dapibus feugiat ex non cursus. Pellentesque vestibulum tellus sit lectus"