1
/* gdkanji.c (Kanji code converter) */
/* written by Masahito Yamaga (ma@yama-ga.com) */
4
5 #include <stdio.h>
6 #include <stdlib.h>
7 #include <string.h>
8 #include "gd.h"
9 #include "gdhelpers.h"
10
11 #include <stdarg.h>
12 #if defined(HAVE_ICONV_H) || defined(HAVE_ICONV)
13 #include <iconv.h>
14 #ifdef HAVE_ERRNO_H
15 #include <errno.h>
16 #endif
17 #endif
18
/* A detected <iconv.h> header alone is enough to enable the iconv path. */
#if defined(HAVE_ICONV_H) && !defined(HAVE_ICONV)
#define HAVE_ICONV 1
#endif

/* Prefix placed in front of every diagnostic printed by this file. */
#define LIBNAME "any2eucjp()"

/*
 * On these DOS/Windows toolchains SJISPRE is switched on, which makes
 * DetectKanjiCode() resolve ambiguous input as SJIS instead of EUC.
 */
#if defined(__MSC__) || defined(__BORLANDC__) || defined(__TURBOC__) || defined(_Windows) || defined(MSDOS)
#ifndef SJISPRE
#define SJISPRE 1
#endif
#endif

/* Drop any TRUE/FALSE inherited from other headers and use plain 1/0. */
#undef TRUE
#undef FALSE
#define TRUE 1
#define FALSE 0

/* Encoding classes produced by DetectKanjiCode(). */
#define NEW 1			/* ESC $ B seen */
#define OLD 2			/* ESC $ @ seen */
#define ESCI 3			/* ESC ( I seen */
#define NEC 4			/* ESC K seen */
#define EUC 5
#define SJIS 6
#define EUCORSJIS 7		/* could be either EUC or SJIS */
#define ASCII 8			/* nothing but 7-bit text found so far */

/* Source encoding names handed to do_convert(). */
#define NEWJISSTR "JIS7"
#define OLDJISSTR "jis"
#define EUCSTR "eucJP"
#define SJISSTR "SJIS"

/* Control bytes. */
#define ESC 0x1B		/* 27: introduces a JIS escape sequence */
#define SS2 0x8E		/* 142: prefix byte tested for in the EUC stream */
/*
 * Print a printf-style trace line on stdout, prefixed with LIBNAME and
 * terminated by a newline.  Does nothing unless DEBUG is defined.
 */
static void
debug (const char *format, ...)
{
#ifdef DEBUG
  va_list ap;

  va_start (ap, format);
  printf ("%s: ", LIBNAME);
  vprintf (format, ap);
  putchar ('\n');
  va_end (ap);
#else
  (void) format;
#endif
}
71
72 static void
error(const char * format,...)73 error (const char *format,...)
74 {
75 va_list args;
76 char *tmp;
77
78 va_start(args, format);
79 vspprintf(&tmp, 0, format, args);
80 va_end(args);
81 php_error_docref(NULL, E_WARNING, "%s: %s", LIBNAME, tmp);
82 efree(tmp);
83 }
84
85 /* DetectKanjiCode() derived from DetectCodeType() by Ken Lunde. */
86
87 static int
DetectKanjiCode(unsigned char * str)88 DetectKanjiCode (unsigned char *str)
89 {
90 static int whatcode = ASCII;
91 int oldcode = ASCII;
92 int c, i;
93 char *lang = NULL;
94
95 c = '\1';
96 i = 0;
97
98 if (whatcode != EUCORSJIS && whatcode != ASCII)
99 {
100 oldcode = whatcode;
101 whatcode = ASCII;
102 }
103
104 while ((whatcode == EUCORSJIS || whatcode == ASCII) && c != '\0')
105 {
106 if ((c = str[i++]) != '\0')
107 {
108 if (c == ESC)
109 {
110 c = str[i++];
111 if (c == '$')
112 {
113 c = str[i++];
114 if (c == 'B')
115 whatcode = NEW;
116 else if (c == '@')
117 whatcode = OLD;
118 }
119 else if (c == '(')
120 {
121 c = str[i++];
122 if (c == 'I')
123 whatcode = ESCI;
124 }
125 else if (c == 'K')
126 whatcode = NEC;
127 }
128 else if ((c >= 129 && c <= 141) || (c >= 143 && c <= 159))
129 whatcode = SJIS;
130 else if (c == SS2)
131 {
132 c = str[i++];
133 if ((c >= 64 && c <= 126) || (c >= 128 && c <= 160) || (c >= 224 && c <= 252))
134 whatcode = SJIS;
135 else if (c >= 161 && c <= 223)
136 whatcode = EUCORSJIS;
137 }
138 else if (c >= 161 && c <= 223)
139 {
140 c = str[i++];
141 if (c >= 240 && c <= 254)
142 whatcode = EUC;
143 else if (c >= 161 && c <= 223)
144 whatcode = EUCORSJIS;
145 else if (c >= 224 && c <= 239)
146 {
147 whatcode = EUCORSJIS;
148 while (c >= 64 && c != '\0' && whatcode == EUCORSJIS)
149 {
150 if (c >= 129)
151 {
152 if (c <= 141 || (c >= 143 && c <= 159))
153 whatcode = SJIS;
154 else if (c >= 253 && c <= 254)
155 whatcode = EUC;
156 }
157 c = str[i++];
158 }
159 }
160 else if (c <= 159)
161 whatcode = SJIS;
162 }
163 else if (c >= 240 && c <= 254)
164 whatcode = EUC;
165 else if (c >= 224 && c <= 239)
166 {
167 c = str[i++];
168 if ((c >= 64 && c <= 126) || (c >= 128 && c <= 160))
169 whatcode = SJIS;
170 else if (c >= 253 && c <= 254)
171 whatcode = EUC;
172 else if (c >= 161 && c <= 252)
173 whatcode = EUCORSJIS;
174 }
175 }
176 }
177
178 #ifdef DEBUG
179 if (whatcode == ASCII)
180 debug ("Kanji code not included.");
181 else if (whatcode == EUCORSJIS)
182 debug ("Kanji code not detected.");
183 else
184 debug ("Kanji code detected at %d byte.", i);
185 #endif
186
187 if (whatcode == EUCORSJIS && oldcode != ASCII)
188 whatcode = oldcode;
189
190 if (whatcode == EUCORSJIS)
191 {
192 if (getenv ("LC_ALL"))
193 lang = getenv ("LC_ALL");
194 else if (getenv ("LC_CTYPE"))
195 lang = getenv ("LC_CTYPE");
196 else if (getenv ("LANG"))
197 lang = getenv ("LANG");
198
199 if (lang)
200 {
201 if (strcmp (lang, "ja_JP.SJIS") == 0 ||
202 #ifdef hpux
203 strcmp (lang, "japanese") == 0 ||
204 #endif
205 strcmp (lang, "ja_JP.mscode") == 0 ||
206 strcmp (lang, "ja_JP.PCK") == 0)
207 whatcode = SJIS;
208 else if (strncmp (lang, "ja", 2) == 0)
209 #ifdef SJISPRE
210 whatcode = SJIS;
211 #else
212 whatcode = EUC;
213 #endif
214 }
215 }
216
217 if (whatcode == EUCORSJIS)
218 #ifdef SJISPRE
219 whatcode = SJIS;
220 #else
221 whatcode = EUC;
222 #endif
223
224 return whatcode;
225 }
226
/*
 * SJIStoJIS() is sjis2jis() by Ken Lunde.
 *
 * Rewrites, in place, the two bytes of a Shift_JIS double-byte character
 * (*p1 = first byte, *p2 = second byte) as the corresponding JIS pair.
 * Only the low eight bits of each input are examined when choosing the
 * offsets.
 */
static void
SJIStoJIS (int *p1, int *p2)
{
  const unsigned char lead = *p1;
  const unsigned char trail = *p2;
  int row_offset = 176;
  int cell_offset = 126;
  int low_trail = 0;		/* 1 when the second byte is below 159 */

  if (lead < 160)
    row_offset = 112;

  if (trail < 159)
    {
      low_trail = 1;
      cell_offset = (trail > 127) ? 32 : 31;
    }

  *p1 = ((lead - row_offset) << 1) - low_trail;
  *p2 -= cell_offset;
}
241
/*
 * han2zen() was derived from han2zen() written by Ken Lunde.
 *
 * Converts a half-width katakana byte (161..223) in *p1 to the two bytes
 * of its full-width Shift_JIS form, stored back into *p1 / *p2.
 *
 * On entry *p2 may hold a following voiced mark (222) or semi-voiced
 * mark (223); when the kana in *p1 accepts that mark it is folded into
 * the returned character.  Any other *p2 value is ignored.
 *
 * If *p1 is not in 161..223 there is nothing to convert and both values
 * are left untouched (the table lookup would otherwise read out of
 * bounds).
 */

/* Kana that can take a voiced mark (dakuten). */
#define IS_DAKU(c) (((c) >= 182 && (c) <= 196) || ((c) >= 202 && (c) <= 206) || ((c) == 179))
/* Kana that can take a semi-voiced mark (handakuten). */
#define IS_HANDAKU(c) ((c) >= 202 && (c) <= 206)

static void
han2zen (int *p1, int *p2)
{
  /* Shift_JIS byte pair for each half-width code, indexed by (code - 161). */
  static const int mtable[][2] = {
    /* 161-165 */
    {129, 66}, {129, 117}, {129, 118}, {129, 65}, {129, 69},
    /* 166-175 */
    {131, 146}, {131, 64}, {131, 66}, {131, 68}, {131, 70},
    {131, 72}, {131, 131}, {131, 133}, {131, 135}, {131, 98},
    /* 176 */
    {129, 91},
    /* 177-181 */
    {131, 65}, {131, 67}, {131, 69}, {131, 71}, {131, 73},
    /* 182-186 */
    {131, 74}, {131, 76}, {131, 78}, {131, 80}, {131, 82},
    /* 187-191 */
    {131, 84}, {131, 86}, {131, 88}, {131, 90}, {131, 92},
    /* 192-196 */
    {131, 94}, {131, 96}, {131, 99}, {131, 101}, {131, 103},
    /* 197-201 */
    {131, 105}, {131, 106}, {131, 107}, {131, 108}, {131, 109},
    /* 202-206 */
    {131, 110}, {131, 113}, {131, 116}, {131, 119}, {131, 122},
    /* 207-211 */
    {131, 125}, {131, 126}, {131, 128}, {131, 129}, {131, 130},
    /* 212-214 */
    {131, 132}, {131, 134}, {131, 136},
    /* 215-219 */
    {131, 137}, {131, 138}, {131, 139}, {131, 140}, {131, 141},
    /* 220-221 */
    {131, 143}, {131, 147},
    /* 222-223 */
    {129, 74}, {129, 75}
  };
  const int first = 161;
  const int count = (int) (sizeof mtable / sizeof mtable[0]);
  const int c = *p1;
  int daku = 0;
  int handaku = 0;

  /* Reject anything the table does not cover. */
  if (c < first || c >= first + count)
    return;

  if (*p2 == 222 && IS_DAKU (c))
    daku = 1;			/* Daku-ten */
  else if (*p2 == 223 && IS_HANDAKU (c))
    handaku = 1;		/* Han-daku-ten */

  *p1 = mtable[c - first][0];
  *p2 = mtable[c - first][1];

  if (daku)
    {
      /* The voiced form directly follows the plain form in these ranges. */
      if ((*p2 >= 74 && *p2 <= 103) || (*p2 >= 110 && *p2 <= 122))
	(*p2)++;
      else if (*p2 == 131 || *p2 == 69)
	*p2 = 148;
    }
  else if (handaku && *p2 >= 110 && *p2 <= 122)
    {
      /* The semi-voiced form is two positions after the plain form. */
      (*p2) += 2;
    }
}
338
/* strcpy() for the unsigned char buffers used below: both arguments are
   cast to plain char pointers before the call. */
#define ustrcpy(dst, src) (strcpy ((char *) (dst), (const char *) (src)))
341
342 static void
do_convert(unsigned char * to,unsigned char * from,const char * code)343 do_convert (unsigned char *to, unsigned char *from, const char *code)
344 {
345 #ifdef HAVE_ICONV
346 iconv_t cd;
347 size_t from_len, to_len;
348
349 if ((cd = iconv_open (EUCSTR, code)) == (iconv_t) - 1)
350 {
351 error ("iconv_open() error");
352 #ifdef HAVE_ERRNO_H
353 if (errno == EINVAL)
354 error ("invalid code specification: \"%s\" or \"%s\"",
355 EUCSTR, code);
356 #endif
357 strcpy ((char *) to, (const char *) from);
358 return;
359 }
360
361 from_len = strlen ((const char *) from) + 1;
362 to_len = BUFSIZ;
363
364 if ((int) iconv(cd, (char **) &from, &from_len, (char **) &to, &to_len) == -1)
365 {
366 #ifdef HAVE_ERRNO_H
367 if (errno == EINVAL)
368 error ("invalid end of input string");
369 else if (errno == EILSEQ)
370 error ("invalid code in input string");
371 else if (errno == E2BIG)
372 error ("output buffer overflow at do_convert()");
373 else
374 #endif
375 error ("something happen");
376 strcpy ((char *) to, (const char *) from);
377 return;
378 }
379
380 if (iconv_close (cd) != 0)
381 {
382 error ("iconv_close() error");
383 }
384 #else
385 int p1, p2, i, j;
386 int jisx0208 = FALSE;
387 int hankaku = FALSE;
388
389 j = 0;
390 if (strcmp (code, NEWJISSTR) == 0 || strcmp (code, OLDJISSTR) == 0)
391 {
392 for (i = 0; from[i] != '\0' && j < BUFSIZ; i++)
393 {
394 if (from[i] == ESC)
395 {
396 i++;
397 if (from[i] == '$')
398 {
399 jisx0208 = TRUE;
400 hankaku = FALSE;
401 i++;
402 }
403 else if (from[i] == '(')
404 {
405 jisx0208 = FALSE;
406 i++;
407 if (from[i] == 'I') /* Hankaku Kana */
408 hankaku = TRUE;
409 else
410 hankaku = FALSE;
411 }
412 }
413 else
414 {
415 if (jisx0208)
416 to[j++] = from[i] + 128;
417 else if (hankaku)
418 {
419 to[j++] = SS2;
420 to[j++] = from[i] + 128;
421 }
422 else
423 to[j++] = from[i];
424 }
425 }
426 }
427 else if (strcmp (code, SJISSTR) == 0)
428 {
429 for (i = 0; from[i] != '\0' && j < BUFSIZ; i++)
430 {
431 p1 = from[i];
432 if (p1 < 127)
433 to[j++] = p1;
434 else if ((p1 >= 161) && (p1 <= 223))
435 { /* Hankaku Kana */
436 to[j++] = SS2;
437 to[j++] = p1;
438 }
439 else
440 {
441 p2 = from[++i];
442 SJIStoJIS (&p1, &p2);
443 to[j++] = p1 + 128;
444 to[j++] = p2 + 128;
445 }
446 }
447 }
448 else
449 {
450 error ("invalid code specification: \"%s\"", code);
451 return;
452 }
453
454 if (j >= BUFSIZ)
455 {
456 error ("output buffer overflow at do_convert()");
457 ustrcpy (to, from);
458 }
459 else
460 to[j] = '\0';
461 #endif /* HAVE_ICONV */
462 }
463
464 static int
do_check_and_conv(unsigned char * to,unsigned char * from)465 do_check_and_conv (unsigned char *to, unsigned char *from)
466 {
467 static unsigned char tmp[BUFSIZ];
468 int p1, p2, i, j;
469 int kanji = TRUE;
470
471 switch (DetectKanjiCode (from))
472 {
473 case NEW:
474 debug ("Kanji code is New JIS.");
475 do_convert (tmp, from, NEWJISSTR);
476 break;
477 case OLD:
478 debug ("Kanji code is Old JIS.");
479 do_convert (tmp, from, OLDJISSTR);
480 break;
481 case ESCI:
482 debug ("This string includes Hankaku-Kana (jisx0201) escape sequence [ESC] + ( + I.");
483 do_convert (tmp, from, NEWJISSTR);
484 break;
485 case NEC:
486 debug ("Kanji code is NEC Kanji.");
487 error ("cannot convert NEC Kanji.");
488 ustrcpy (tmp, from);
489 kanji = FALSE;
490 break;
491 case EUC:
492 debug ("Kanji code is EUC.");
493 ustrcpy (tmp, from);
494 break;
495 case SJIS:
496 debug ("Kanji code is SJIS.");
497 do_convert (tmp, from, SJISSTR);
498 break;
499 case EUCORSJIS:
500 debug ("Kanji code is EUC or SJIS.");
501 ustrcpy (tmp, from);
502 kanji = FALSE;
503 break;
504 case ASCII:
505 debug ("This is ASCII string.");
506 ustrcpy (tmp, from);
507 kanji = FALSE;
508 break;
509 default:
510 debug ("This string includes unknown code.");
511 ustrcpy (tmp, from);
512 kanji = FALSE;
513 break;
514 }
515
516 /* Hankaku Kana ---> Zenkaku Kana */
517 if (kanji)
518 {
519 j = 0;
520 for (i = 0; tmp[i] != '\0' && j < BUFSIZ; i++)
521 {
522 if (tmp[i] == SS2)
523 {
524 p1 = tmp[++i];
525 if (tmp[i + 1] == SS2)
526 {
527 p2 = tmp[i + 2];
528 if (p2 == 222 || p2 == 223)
529 i += 2;
530 else
531 p2 = 0;
532 }
533 else
534 p2 = 0;
535 han2zen (&p1, &p2);
536 SJIStoJIS (&p1, &p2);
537 to[j++] = p1 + 128;
538 to[j++] = p2 + 128;
539 }
540 else
541 to[j++] = tmp[i];
542 }
543
544 if (j >= BUFSIZ)
545 {
546 error ("output buffer overflow at Hankaku --> Zenkaku");
547 ustrcpy (to, tmp);
548 }
549 else
550 to[j] = '\0';
551 }
552 else
553 ustrcpy (to, tmp);
554
555 return kanji;
556 }
557
/*
 * Convert the NUL-terminated string `src` to EUC-JP and store the result
 * in `dest`, a buffer of `dest_max` bytes.
 *
 * Returns the value of do_check_and_conv() on success, or -1 after
 * reporting a warning when
 *   - `src` is BUFSIZ bytes or longer,
 *   - `dest_max` is larger than BUFSIZ, or
 *   - the converted text (with its terminator) does not fit in `dest`.
 * In the first two cases `dest` is not written.  In the last case `dest`
 * receives the unconverted `src` if that fits, otherwise an empty string
 * (nothing at all when dest_max is 0).
 *
 * Uses a static work buffer, so it is not reentrant.
 */
int
any2eucjp (unsigned char *dest, unsigned char *src, unsigned int dest_max)
{
  static unsigned char tmp_dest[BUFSIZ];
  size_t src_len;
  int ret;

  src_len = strlen ((const char *) src);
  if (src_len >= BUFSIZ)
    {
      error ("input string too large");
      return -1;
    }
  if (dest_max > BUFSIZ)
    {
      error ("invalid maximum size of destination\nit should be less than %d.", BUFSIZ);
      return -1;
    }

  ret = do_check_and_conv (tmp_dest, src);
  if (strlen ((const char *) tmp_dest) >= dest_max)
    {
      error ("output buffer overflow");
      /* Fall back to the raw input only when it actually fits; copying it
         unconditionally could write past the end of the caller's buffer. */
      if (src_len < dest_max)
	memcpy (dest, src, src_len + 1);
      else if (dest_max > 0)
	dest[0] = '\0';
      return -1;
    }

  strcpy ((char *) dest, (const char *) tmp_dest);
  return ret;
}
584
585 #if 0
/*
 * Return the length in bytes of `s` after conversion by any2eucjp().
 * Returns 0 if the temporary buffer cannot be allocated or if
 * any2eucjp() returns without writing anything.
 */
unsigned int
strwidth (unsigned char *s)
{
  unsigned char *t;
  unsigned int i;

  t = (unsigned char *) gdMalloc (BUFSIZ);
  if (t == NULL)
    return 0;

  /* any2eucjp() can return early without touching its destination, so
     make sure strlen() below always sees a terminated string. */
  t[0] = '\0';
  any2eucjp (t, s, BUFSIZ);
  i = (unsigned int) strlen ((const char *) t);
  gdFree (t);
  return i;
}
598
599 #ifdef DEBUG
/*
 * Debug driver: read one line from stdin, print its length before and
 * after conversion, then print the converted text.
 * Returns 0 on success, 1 if the output buffer cannot be allocated.
 */
int
main (void)
{
  unsigned char input[BUFSIZ];
  unsigned char *output;
  unsigned char *str;
  int c, i = 0;

  /* Stop at end of line, end of input, or when only the slot for the
     terminator is left in the buffer. */
  while (i < BUFSIZ - 1 && (c = fgetc (stdin)) != EOF && c != '\n')
    input[i++] = (unsigned char) c;
  input[i] = '\0';

  printf ("input : %u bytes\n", (unsigned int) strlen ((const char *) input));
  printf ("output: %u bytes\n", strwidth (input));

  output = (unsigned char *) gdMalloc (BUFSIZ);
  if (output == NULL)
    return 1;

  /* any2eucjp() may fail without writing to its destination. */
  output[0] = '\0';
  any2eucjp (output, input, BUFSIZ);
  str = output;
  while (*str != '\0')
    putchar (*(str++));
  putchar ('\n');
  gdFree (output);

  return 0;
}
625 #endif
626 #endif
627