1--TEST--
2Exhaustive test of Shift-JIS DoCoMo, KDDI, SoftBank encoding verification and conversion
3--EXTENSIONS--
4mbstring
5--SKIPIF--
6<?php
7if (getenv("SKIP_SLOW_TESTS")) die("skip slow test");
8?>
9--FILE--
10<?php
11srand(818); /* Make results consistent */
12include('encoding_tests.inc');
13mb_substitute_character(0x25); // '%'
14
15/* Read in the table of all characters in Windows-932
16 * (The SJIS-Mobile encodings all use MS extensions) */
17readConversionTable(__DIR__ . '/data/CP932.txt', $sjisChars, $fromUnicode, true);
18
/* Extra Unicode -> SJIS conversions, registered on top of whatever was read
 * from CP932.txt. Keys are codepoints as UTF-32BE bytes, values are the
 * SJIS bytes which the codepoint is expected to convert to */
$extraFromUnicode = [
  "\x00\x00\x30\x1C" => "\x81\x60", // U+301C WAVE DASH            -> 0x8160 WAVE DASH
  "\x00\x00\x22\x12" => "\x81\x7C", // U+2212 MINUS SIGN           -> 0x817C FULLWIDTH HYPHEN-MINUS
  "\x00\x00\x20\x3E" => "\x81\x50", // U+203E OVERLINE             -> 0x8150 FULLWIDTH MACRON
  "\x00\x00\x20\x16" => "\x81\x61", // U+2016 DOUBLE VERTICAL LINE -> 0x8161 PARALLEL TO
  "\x00\x00\x00\xAF" => "\x81\x50", // U+00AF MACRON               -> 0x8150 FULLWIDTH MACRON
  "\x00\x00\x00\xAC" => "\x81\xCA", // U+00AC NOT SIGN             -> 0x81CA FULLWIDTH NOT SIGN
  "\x00\x00\x00\xA5" => "\x81\x8F", // U+00A5 YEN SIGN             -> 0x818F FULLWIDTH YEN SIGN
  "\x00\x00\x00\xA3" => "\x81\x92", // U+00A3 POUND SIGN           -> 0x8192 FULLWIDTH POUND SIGN
  "\x00\x00\x00\xA2" => "\x81\x91", // U+00A2 CENT SIGN            -> 0x8191 FULLWIDTH CENT SIGN
];
foreach ($extraFromUnicode as $utf32Bytes => $sjisBytes) {
  $fromUnicode[$utf32Bytes] = $sjisBytes;
}
37
/* Besides the characters in that table, a 'user' area spanning 0xF040-0xF9FC
 * is supported. Its cells are assigned consecutive Unicode 'private'
 * codepoints, running from 0xE000 up to 0xE757 */
$privateUse = 0xE000;
foreach (range(0xF0, 0xF9) as $leadByte) {
  foreach (range(0x40, 0xFC) as $trailByte) {
    /* 0x7F is skipped; it does not consume a private-use codepoint */
    if ($trailByte !== 0x7F) {
      $sjisBytes = chr($leadByte) . chr($trailByte);
      $utf32Bytes = pack('N', $privateUse++);
      $sjisChars[$sjisBytes] = $utf32Bytes;
      $fromUnicode[$utf32Bytes] = $sjisBytes;
    }
  }
}
52
/* Collect every codepoint from U+0000 to U+FFFF which has no Unicode -> SJIS
 * mapping registered at this point (keyed by its UTF-32BE bytes) */
$invalidCodepoints = [];
foreach (range(0, 0xFFFF) as $codepointValue) {
  $utf32Bytes = pack('N', $codepointValue);
  if (isset($fromUnicode[$utf32Bytes])) {
    continue;
  }
  $invalidCodepoints[$utf32Bytes] = true;
}
59
/* Windows-932 contains many pairs of distinct kuten codes which share a
 * single Unicode codepoint
 *
 * All of 0xED00-0xEEFF is affected; the duplicates live in 0xFA00-0xFC4B.
 * Our CP932 implementation picks the F's when converting from Unicode,
 * whereas SJIS-Mobile picks the E's. So every mapped code in the F range is
 * recorded as non-invertible and its codepoint is dropped from $fromUnicode */
$nonInvertible = [];
foreach (range(0xFA00, 0xFC4B) as $sjisCode) {
  $sjisBytes = pack('n', $sjisCode);
  if (!isset($sjisChars[$sjisBytes])) {
    continue;
  }
  $mappedCodepoint = $sjisChars[$sjisBytes];
  $nonInvertible[$sjisBytes] = $mappedCodepoint;
  unset($fromUnicode[$mappedCodepoint]);
}

/* Other "collisions" */
$otherCollisions = [0x8790, 0x8791, 0x8792, 0x8795, 0x8796, 0x8797, 0x879A, 0x879B, 0x879C, 0xEEF9];
foreach ($otherCollisions as $sjisCode) {
  $sjisBytes = pack('n', $sjisCode);
  $mappedCodepoint = $sjisChars[$sjisBytes];
  $nonInvertible[$sjisBytes] = $mappedCodepoint;
  unset($fromUnicode[$mappedCodepoint]);
}
82
$nonInvertibleSoftbank = $nonInvertible;
$nonInvertibleDocomo   = $nonInvertible;

/* Now read table of vendor-specific emoji encodings
 * Each non-comment line holds ';'-separated fields: the Unicode codepoint(s)
 * in hex, followed by the DoCoMo, KDDI and SoftBank SJIS codes in hex (a
 * vendor field is empty when that vendor has no such emoji) */
$docomo = $sjisChars;
$kddi = $sjisChars;
$softbank = $sjisChars;
$sbEmoji = array();
$emojiSourcesPath = __DIR__ . '/data/EmojiSources.txt';
/* The file is only ever read, so open it read-only; asking for write access
 * would make the test fail on a read-only source tree */
$fp = fopen($emojiSourcesPath, 'r');
if ($fp === false)
  die("Could not open $emojiSourcesPath");
/* Compare against false explicitly so that a falsy line (such as a final "0"
 * without a newline) cannot end the loop early */
while (($line = fgets($fp, 256)) !== false) {
  if ($line[0] === '#')
    continue;
  $fields = explode(';', rtrim($line));
  if (count($fields) >= 4) {
    if (sscanf($fields[0], "%x %x", $cp1, $cp2) === 2) {
      /* Emoji which is represented by a sequence of two codepoints */
      $utf32 = pack('N', $cp1) . pack('N', $cp2);
    } else {
      $utf32 = pack('N', hexdec($fields[0]));
      /* This single codepoint is listed as an emoji source, so it is no
       * longer expected to be unconvertible */
      unset($invalidCodepoints[$utf32]);
    }

    if ($fields[1] !== '')
      $docomo[pack('n', hexdec($fields[1]))] = $utf32;
    if ($fields[2] !== '')
      $kddi[pack('n', hexdec($fields[2]))] = $utf32;
    if ($fields[3] !== '') {
      $bytes = pack('n', hexdec($fields[3]));
      $sbEmoji[$bytes] = $utf32;
      unset($nonInvertibleSoftbank[$bytes]);
    }
  }
}
fclose($fp);
115
/* Other, vendor-specific emoji which do not appear in EmojiSources.txt
 * Most of these don't exist in Unicode and have been mapped to 'private
 * area' codepoints
 *
 * Keys are the 2-byte DoCoMo SJIS codes; values are the expected codepoint
 * as UTF-32BE bytes. Codes which map to an ordinary Unicode character are
 * additionally recorded in $nonInvertibleDocomo; testSJISVariant() removes
 * those from the set of valid characters and tests them separately */
$docomo["\xF9\x4A"] = "\x00\x0F\xEE\x16"; // PIAS PI
$docomo["\xF9\x4B"] = "\x00\x0F\xEE\x17"; // PIAS A
$docomo["\xF9\x4C"] = "\x00\x0F\xEE\x18"; // INVERSE TICKET
$docomo["\xF9\x4D"] = "\x00\x0F\xEE\x19"; // KATAKANA ABBREVIATION FOR TICKET ("chi ke")
$docomo["\xF9\x4E"] = "\x00\x0F\xEE\x1A"; // RESERVE BY PHONE
$docomo["\xF9\x4F"] = "\x00\x0F\xEE\x1B"; // P CODE
$docomo["\xF9\x53"] = "\x00\x0F\xEE\x1C"; // MOVIES 2
$docomo["\xF9\x54"] = "\x00\x0F\xEE\x1D"; // PIAS PI INVERSE
$docomo["\xF9\x58"] = "\x00\x0F\xEE\x1E"; // PIAS PI CIRCLE
$docomo["\xF9\x59"] = "\x00\x0F\xEE\x1F"; // PIAS PI SQUARE
$docomo["\xF9\x5A"] = "\x00\x0F\xEE\x20"; // CHECK
$docomo["\xF9\x5F"] = "\x00\x0F\xEE\x21"; // F
$docomo["\xF9\x60"] = "\x00\x0F\xEE\x22"; // D
$docomo["\xF9\x61"] = "\x00\x0F\xEE\x23"; // S
$docomo["\xF9\x62"] = "\x00\x0F\xEE\x24"; // C
$docomo["\xF9\x63"] = "\x00\x0F\xEE\x25"; // R
$docomo["\xF9\x64"] = "\x00\x00\x25\xEA"; // SQUARE WITH LOWER RIGHT DIAGONAL HALF BLACK
$nonInvertibleDocomo["\xF9\x64"] = "\x00\x00\x25\xEA";
$docomo["\xF9\x65"] = "\x00\x00\x25\xA0"; // BLACK SQUARE
$nonInvertibleDocomo["\xF9\x65"] = "\x00\x00\x25\xA0";
$docomo["\xF9\x66"] = "\x00\x00\x25\xBF"; // DOWNWARD TRIANGLE
$nonInvertibleDocomo["\xF9\x66"] = "\x00\x00\x25\xBF";
/* TODO: test that FEE28 converts to F966, for backwards compatibility */
$docomo["\xF9\x67"] = "\x00\x0F\xEE\x29"; // QUADRUPLE DAGGER
$docomo["\xF9\x68"] = "\x00\x0F\xEE\x2A"; // TRIPLE DAGGER
$docomo["\xF9\x69"] = "\x00\x0F\xEE\x2B"; // DOUBLE DAGGER
$docomo["\xF9\x6A"] = "\x00\x00\x20\x20"; // DAGGER
$nonInvertibleDocomo["\xF9\x6A"] = "\x00\x00\x20\x20";
/* TODO: test that FEE2C converts to F96A, for backwards compatibility */
$docomo["\xF9\x6B"] = "\x00\x0F\xEE\x2D"; // I (meaning "inexpensive")
$docomo["\xF9\x6C"] = "\x00\x0F\xEE\x2E"; // M (meaning "moderate")
$docomo["\xF9\x6D"] = "\x00\x0F\xEE\x2F"; // E (meaning "expensive")
$docomo["\xF9\x6E"] = "\x00\x0F\xEE\x30"; // VE (meaning "very expensive")
$docomo["\xF9\x6F"] = "\x00\x0F\xEE\x31"; // SPHERE
$docomo["\xF9\x70"] = "\x00\x0F\xEE\x32"; // CREDIT CARDS NOT ACCEPTED
$docomo["\xF9\x71"] = "\x00\x0F\xEE\x33"; // CHECKBOX
$docomo["\xF9\x75"] = "\x00\x0F\xEE\x10"; // I-MODE
$docomo["\xF9\x76"] = "\x00\x0F\xEE\x11"; // I-MODE WITH FRAME
$docomo["\xF9\x78"] = "\x00\x0F\xEE\x12"; // PROVIDED BY DOCOMO
$docomo["\xF9\x79"] = "\x00\x0F\xEE\x13"; // DOCOMO POINT
$docomo["\xF9\x84"] = "\x00\x00\x27\xBF"; // FREE DIAL; mapped to DOUBLE CURLY LOOP
/* U+27BF is used as an emoji mapping above, so take it out of the set of
 * codepoints which are expected to be unconvertible */
unset($invalidCodepoints["\x00\x00\x27\xBF"]);
$docomo["\xF9\x86"] = "\x00\x0F\xE8\x2D"; // MOBILE Q
$docomo["\xF9\xB1"] = "\x00\x0F\xEE\x14"; // I-APPLI
$docomo["\xF9\xB2"] = "\x00\x0F\xEE\x15"; // I-APPLI WITH BORDER
164
/* KDDI-specific emoji which are added by hand
 * (key: 2-byte KDDI SJIS code, value: codepoint as UTF-32BE bytes) */
$kddi["\xF7\x94"] = "\x00\x0F\xEE\x40"; // EZ WEB
$kddi["\xF7\xCF"] = "\x00\x0F\xEE\x41"; // EZ PLUS
$kddi["\xF3\x70"] = "\x00\x0F\xEE\x42"; // EZ NAVIGATION
$kddi["\xF4\x78"] = "\x00\x0F\xEE\x43"; // EZ MOVIE
$kddi["\xF4\x86"] = "\x00\x0F\xEE\x44"; // CMAIL
$kddi["\xF4\x8E"] = "\x00\x0F\xEE\x45"; // JAVA (TM)
$kddi["\xF4\x8F"] = "\x00\x0F\xEE\x46"; // BREW
$kddi["\xF4\x90"] = "\x00\x0F\xEE\x47"; // EZ RING MUSIC
$kddi["\xF4\x91"] = "\x00\x0F\xEE\x48"; // EZ NAVI
$kddi["\xF4\x92"] = "\x00\x0F\xEE\x49"; // WIN
$kddi["\xF4\x93"] = "\x00\x0F\xEE\x4A"; // PREMIUM SIGN
$kddi["\xF7\x48"] = "\x00\x0F\xE8\x2D"; // MOBILE Q
$kddi["\xF7\xA3"] = "\x00\x0F\xE8\x3C"; // PDC ("personal digital cellular")
$kddi["\xF7\xD2"] = "\x00\x0F\xEB\x89"; // OPENWAVE

/* SoftBank-specific emoji which are added by hand; these are collected in
 * $sbEmoji first and only merged into $softbank further down */
$sbEmoji["\xF7\xB1"] = "\x00\x00\x27\xBF"; // FREE DIAL; mapped to DOUBLE CURLY LOOP
$sbEmoji["\xF7\xF4"] = "\x00\x0F\xEE\x77"; // J-PHONE SHOP
$sbEmoji["\xF7\xF5"] = "\x00\x0F\xEE\x78"; // SKY WEB
$sbEmoji["\xF7\xF6"] = "\x00\x0F\xEE\x79"; // SKY WALKER
$sbEmoji["\xF7\xF7"] = "\x00\x0F\xEE\x7A"; // SKY MELODY
$sbEmoji["\xF7\xF8"] = "\x00\x0F\xEE\x7B"; // J-PHONE 1
$sbEmoji["\xF7\xF9"] = "\x00\x0F\xEE\x7C"; // J-PHONE 2
$sbEmoji["\xF7\xFA"] = "\x00\x0F\xEE\x7D"; // J-PHONE 3
188
/* SoftBank-specific 'JSky1', 'JSky2', 'VODAFONE1', 'VODAFONE2', etc. emoji,
 * which are not supported by Unicode
 * 0xFBD8-0xFBDE are mapped to consecutive codepoints starting at 0xFEE70 */
for ($i = 0xFBD8; $i <= 0xFBDE; $i++) {
  $bytes = pack('n', $i);
  $sbEmoji[$bytes] = pack('N', 0xFEE70 + $i - 0xFBD8);
  unset($nonInvertibleSoftbank[$bytes]);
}
/* SoftBank-specific emoji for Shibuya department store */
$sbEmoji["\xFB\xAA"] = "\x00\x0F\xE4\xC5";
unset($nonInvertibleSoftbank["\xFB\xAA"]);

/* Overlay the emoji on top of the base table.
 * array_replace() is used rather than array_merge(), because array_merge()
 * renumbers integer keys, and PHP stores the single-byte keys "0".."9" as
 * integers. With array_merge() those entries would only keep the right key
 * as long as they happened to be the first integer keys, in ascending order */
$softbank = array_replace($softbank, $sbEmoji);
201
/* Softbank emoji have a second representation, apparently used by older
 * Softbank phones: a sequence introduced by ESC.
 * Six different ASCII characters may follow the ESC, and each of them
 * selects a different ku code. For each such character, the second table
 * gives the highest ten code which is tried when enumerating sequences */
$escCodeToKu = [
  'G' => 0x91,
  'E' => 0x8D,
  'F' => 0x8E,
  'O' => 0x92,
  'P' => 0x95,
  'Q' => 0x96,
];
$escCodeMaxTen = [
  'G' => 0x7A,
  'E' => 0x7A,
  'F' => 0x7A,
  'O' => 0x6D,
  'P' => 0x6C,
  'Q' => 0x5E,
];
209
/* Pack a ku/ten pair (each counted from 0x21) into a 2-byte Shift-JIS
 * sequence.
 *
 * Two consecutive ku share one lead byte: the first 31 pairs use lead bytes
 * from 0x81 upwards, later pairs continue from 0xE0. For the first ku of a
 * pair, the trail byte runs from 0x40 and jumps over 0x7F; for the second
 * ku of a pair, it runs from 0x9F.
 *
 * Returns the two bytes as a string */
function shiftJISEncode($ku, $ten) {
  $row  = $ku - 0x21;
  $cell = $ten - 0x21;

  $pairIndex = $row >> 1;
  $leadByte = ($pairIndex < 31) ? 0x81 + $pairIndex : 0xE0 + ($pairIndex - 31);

  if ($row & 1) {
    $trailByte = $cell + 0x9F;
  } elseif ($cell < 63) {
    $trailByte = $cell + 0x40;
  } else {
    $trailByte = $cell - 63 + 0x80;
  }

  return chr($leadByte) . chr($trailByte);
}
229
/* Walk over every ku/ten pair reachable through an ESC sequence. Where the
 * equivalent 2-byte SJIS code is a known Softbank emoji, register the ESC
 * sequence as another (non-invertible) spelling of the same codepoint(s) */
foreach ($escCodeToKu as $escChar => $kuCode) {
  $highestTen = $escCodeMaxTen[$escChar];
  for ($tenCode = 0x21; $tenCode <= $highestTen; $tenCode++) {
    $sjisCode = shiftJISEncode($kuCode, $tenCode);
    if (!isset($sbEmoji[$sjisCode])) {
      continue;
    }
    $escSequence = "\x1B\$" . $escChar . chr($tenCode);
    $emojiCodepoints = $softbank[$sjisCode];
    $softbank[$escSequence] = $emojiCodepoints;
    $nonInvertibleSoftbank[$escSequence] = $emojiCodepoints;
  }
}

/* Since ESC introduces the emoji escape sequences, a lone ESC byte is not a
 * valid character for Softbank */
unset($softbank["\x1B"]);
244
/* Run the whole battery of checks against one SJIS-Mobile variant.
 *
 * $validChars    - map of byte sequence in $encoding => UTF-32BE bytes
 * $nonInvertible - subset of byte sequences which are removed from
 *                  $validChars and passed to testAllValidChars() separately,
 *                  with `false` as the final argument
 * $encoding      - name of the encoding under test
 *
 * Prints one progress line after each of the first three groups of checks */
function testSJISVariant($validChars, $nonInvertible, $encoding) {
  global $fromUnicode, $invalidCodepoints, $escCodeToKu;

  /* Every lead byte in 0xE0-0xFC and 0x81-0x9F introduces a 2-byte character */
  $leadBytes = array_merge(range(0xE0, 0xFC), range(0x81, 0x9F));
  $lenTable = array_fill_keys($leadBytes, 2);
  findInvalidChars($validChars, $invalidChars, $truncated, $lenTable);

  foreach (array_keys($escCodeToKu) as $escChar) {
    $escPrefix = "\x1B\$" . $escChar;
    unset($invalidChars[$escPrefix . "\x0F"]);
    unset($truncated[$escPrefix]);
  }

  $escapes = [];
  foreach (array_keys($nonInvertible) as $bytes) {
    unset($validChars[$bytes]);
    if ("\x1B" === substr($bytes, 0, 1)) {
      $escapes[] = $bytes;
    }
  }
  /* A run of emoji written as ESC sequences is terminated by 0xF, so append
   * it to each such sequence. This must wait until `findInvalidChars` has
   * run, or that function would not have produced the desired results */
  foreach ($escapes as $escSequence) {
    $nonInvertible[$escSequence . "\x0F"] = $nonInvertible[$escSequence];
    unset($nonInvertible[$escSequence]);
  }

  testAllValidChars($validChars, $encoding, 'UTF-32BE');
  testAllValidChars($nonInvertible, $encoding, 'UTF-32BE', false);
  echo "$encoding verification and conversion works on all valid characters\n";

  /* '%' as UTF-32BE: what every invalid input is expected to convert to */
  $utf32Percent = "\x00\x00\x00%";
  testAllInvalidChars($invalidChars, $validChars, $encoding, 'UTF-32BE', $utf32Percent);
  testTruncatedChars($truncated, $encoding, 'UTF-32BE', $utf32Percent);
  echo "$encoding verification and conversion works on all invalid characters\n";

  convertAllInvalidChars($invalidCodepoints, $fromUnicode, 'UTF-32BE', $encoding, '%');
  echo "Unicode -> $encoding conversion works on all invalid codepoints\n";

  // Exercise "long" illegal character markers, then restore '%'
  mb_substitute_character("long");
  foreach (["\x80", "\x81\x20", "\xEA\xA9"] as $badInput) {
    convertInvalidString($badInput, "%", $encoding, "UTF-8");
  }
  mb_substitute_character(0x25); // '%'

  // The mobile SJIS variants all have dedicated characters for certain
  // national flags, whereas Unicode spells each flag as a sequence of _two_
  // Regional Indicator codepoints. One such codepoint on its own therefore
  // cannot be converted to SJIS and must be treated as an error, both...
  $regionalIndicator = "\x00\x01\xF1\xE9"; // U+1F1E9, REGIONAL INDICATOR SYMBOL LETTER D

  // ...when it is the very last codepoint of the string
  convertInvalidString($regionalIndicator, "%", "UTF-32BE", $encoding);

  // ...and when it is followed by some other codepoint
  convertInvalidString($regionalIndicator . "\x00\x00\x00A", "%A", "UTF-32BE", $encoding);
}
298
/* Run the full set of checks for each of the three vendor variants */
$variantRuns = [
  [$docomo,   $nonInvertibleDocomo,   'SJIS-Mobile#DOCOMO'],
  [$kddi,     $nonInvertible,         'SJIS-Mobile#KDDI'],
  [$softbank, $nonInvertibleSoftbank, 'SJIS-Mobile#SOFTBANK'],
];
foreach ($variantRuns as [$variantChars, $variantNonInvertible, $variantName]) {
  testSJISVariant($variantChars, $variantNonInvertible, $variantName);
}

// A special Softbank escape sequence is allowed to be the last thing in a string
foreach (['O', 'P', 'Q'] as $escChar) {
  convertValidString("\x1B\$" . $escChar, "", "SJIS-Mobile#SOFTBANK", "UTF-8", false);
}
// An escape sequence with an unknown selector character is invalid
convertInvalidString("\x1B\$X", "%", "SJIS-Mobile#SOFTBANK", "UTF-8", false);
// So is an escape sequence which is cut off before the selector character
convertInvalidString("\x1B\$", "%", "SJIS-Mobile#SOFTBANK", "UTF-8", false);
311
// Regression test: the output buffer used to be allocated too small when
// the input string was shorter than the output string
// (Per variant: the 2-byte sequence which each 0xA9 input byte becomes)
$expectedPerVariant = [
  'SJIS-Mobile#DOCOMO'   => "\xF9\xD6",
  'SJIS-Mobile#KDDI'     => "\xF7\x74",
  'SJIS-Mobile#SOFTBANK' => "\xF7\xEE",
];
foreach ($expectedPerVariant as $variantName => $expectedBytes) {
  convertValidString(str_repeat("\xA9", 4), str_repeat($expectedBytes, 4), '8bit', $variantName);
}

// Regression test: the old implementation used to drop digits (0-9) and
// hash (#) when they were the last character of the input string
foreach (str_split('0123456789') as $digit) {
  $text = "abc" . $digit;
  foreach (array_keys($expectedPerVariant) as $variantName) {
    convertValidString($text, $text, 'UTF-8', $variantName);
  }
}
325
// Regression test: at first, the new implementation also mishandled 0-9 or
// hash followed by U+20E3 (keycap modifier) when the 0-9 or hash was the
// very last wchar of one buffer and the keycap modifier was the first wchar
// of the following buffer. Try every amount of padding from 0 to 256 so that
// the pair lands on every possible position
$keycapZeroPerVariant = [
  'SJIS-Mobile#DOCOMO'   => "\xF9\x90",
  'SJIS-Mobile#KDDI'     => "\xF7\xC9",
  'SJIS-Mobile#SOFTBANK' => "\xF7\xC5",
];
for ($padLength = 0; $padLength <= 256; $padLength++) {
  $utf16Input = str_repeat("\x00a", $padLength) . "\x00\x30\x20\xE3";
  $asciiPadding = str_repeat('a', $padLength);
  foreach ($keycapZeroPerVariant as $variantName => $keycapBytes) {
    convertValidString($utf16Input, $asciiPadding . $keycapBytes, 'UTF-16BE', $variantName);
  }
}
335
// Regression test for 0-9 appearing at end of one buffer and U+20E3 (keycap
// modifier) NOT appearing at the beginning of the next
for ($zeroCount = 0; $zeroCount <= 256; $zeroCount++) {
  /* A run of '0' digits of every length from 0 to 256, as UTF-16BE input
   * and as the single-byte output which is expected */
  $utf16Zeros = str_repeat("\x000", $zeroCount);
  $asciiZeros = str_repeat('0', $zeroCount);
  foreach (['SJIS-Mobile#DOCOMO', 'SJIS-Mobile#KDDI', 'SJIS-Mobile#SOFTBANK'] as $variantName) {
    convertValidString($utf16Zeros, $asciiZeros, 'UTF-16BE', $variantName);
  }
}
343
// Regression test for not making enough space in output buffer when 0-9 appeared
// at the end of one buffer and was re-processed together with the next
// This crazy-looking string was found by a fuzzer
// The converted string is deliberately discarded: nothing is compared here, the
// call only has to complete without printing anything or crashing
$str = "\x04\xff\x930\x00\xffUTF7~'F\x00A\x00\xffA\x0018030@\x00[\x1b\$EEEEE\x5C\x80(8~\x00F\x00zgb-18030$\x008~\x00F\x00z-gb-18EUC_JP-2004\x00z-g0\x0018030\x00b-18030$\x008~\x00F\x00z-gb-18EUC_JP-2004\x00z-g0\x0018030\x00";
mb_convert_encoding($str, 'SJIS-Mobile#SOFTBANK', 'SJIS-Mobile#SOFTBANK');
349
350?>
351--EXPECT--
352SJIS-Mobile#DOCOMO verification and conversion works on all valid characters
353SJIS-Mobile#DOCOMO verification and conversion works on all invalid characters
354Unicode -> SJIS-Mobile#DOCOMO conversion works on all invalid codepoints
355SJIS-Mobile#KDDI verification and conversion works on all valid characters
356SJIS-Mobile#KDDI verification and conversion works on all invalid characters
357Unicode -> SJIS-Mobile#KDDI conversion works on all invalid codepoints
358SJIS-Mobile#SOFTBANK verification and conversion works on all valid characters
359SJIS-Mobile#SOFTBANK verification and conversion works on all invalid characters
360Unicode -> SJIS-Mobile#SOFTBANK conversion works on all invalid codepoints
361