1
2 /* gdkanji.c (Kanji code converter) */
3 /* written by Masahito Yamaga (ma@yama-ga.com) */
4
5 #include <stdio.h>
6 #include <stdlib.h>
7 #include <string.h>
8 #include "gd.h"
9 #include "gdhelpers.h"
10
11 #include <stdarg.h>
12 #if defined(HAVE_ICONV_H) || defined(HAVE_ICONV)
13 #include <iconv.h>
14 #ifdef HAVE_ERRNO_H
15 #include <errno.h>
16 #endif
17 #endif
18
19 #if defined(HAVE_ICONV_H) && !defined(HAVE_ICONV)
20 #define HAVE_ICONV 1
21 #endif
22
/* Prefix attached to every diagnostic emitted by this converter. */
#define LIBNAME "any2eucjp()"

/*
 * DOS/Windows tool chains: resolve an ambiguous EUC-or-SJIS guess in
 * favour of Shift_JIS unless the builder already defined SJISPRE.
 */
#if (defined(__MSC__) || defined(__BORLANDC__) || defined(__TURBOC__) \
     || defined(_Windows) || defined(MSDOS)) && !defined(SJISPRE)
#define SJISPRE 1
#endif

/* Use our own boolean constants whatever earlier headers may have defined. */
#undef TRUE
#undef FALSE
#define TRUE  1
#define FALSE 0

/* Encoding identifiers produced by DetectKanjiCode(). */
#define NEW       1	/* ESC $ B seen */
#define OLD       2	/* ESC $ @ seen */
#define ESCI      3	/* ESC ( I seen */
#define NEC       4	/* ESC K seen */
#define EUC       5
#define SJIS      6
#define EUCORSJIS 7	/* could be either EUC or SJIS */
#define ASCII     8	/* nothing but plain bytes found */

/* Character-set names handed to do_convert(). */
#define NEWJISSTR "JIS7"
#define OLDJISSTR "jis"
#define EUCSTR    "eucJP"
#define SJISSTR   "SJIS"

/* Control bytes: escape, and the single-shift byte that precedes kana. */
#define ESC 27
#define SS2 142
57
/*
 * Print a printf-style trace line on stdout, prefixed with LIBNAME and
 * followed by a newline.  Compiles to an empty function unless DEBUG is
 * defined.
 */
static void
debug (const char *format, ...)
{
#ifdef DEBUG
  va_list ap;

  printf ("%s: ", LIBNAME);
  va_start (ap, format);
  vprintf (format, ap);
  va_end (ap);
  printf ("\n");
#else
  (void) format;		/* nothing to report in non-debug builds */
#endif
}
71
72 static void
error(const char * format,...)73 error (const char *format,...)
74 {
75 va_list args;
76 char *tmp;
77 TSRMLS_FETCH();
78
79 va_start(args, format);
80 vspprintf(&tmp, 0, format, args);
81 va_end(args);
82 php_error_docref(NULL TSRMLS_CC, E_WARNING, "%s: %s", LIBNAME, tmp);
83 efree(tmp);
84 }
85
86 /* DetectKanjiCode() derived from DetectCodeType() by Ken Lunde. */
87
88 static int
DetectKanjiCode(unsigned char * str)89 DetectKanjiCode (unsigned char *str)
90 {
91 static int whatcode = ASCII;
92 int oldcode = ASCII;
93 int c, i;
94 char *lang = NULL;
95
96 c = '\1';
97 i = 0;
98
99 if (whatcode != EUCORSJIS && whatcode != ASCII)
100 {
101 oldcode = whatcode;
102 whatcode = ASCII;
103 }
104
105 while ((whatcode == EUCORSJIS || whatcode == ASCII) && c != '\0')
106 {
107 if ((c = str[i++]) != '\0')
108 {
109 if (c == ESC)
110 {
111 c = str[i++];
112 if (c == '$')
113 {
114 c = str[i++];
115 if (c == 'B')
116 whatcode = NEW;
117 else if (c == '@')
118 whatcode = OLD;
119 }
120 else if (c == '(')
121 {
122 c = str[i++];
123 if (c == 'I')
124 whatcode = ESCI;
125 }
126 else if (c == 'K')
127 whatcode = NEC;
128 }
129 else if ((c >= 129 && c <= 141) || (c >= 143 && c <= 159))
130 whatcode = SJIS;
131 else if (c == SS2)
132 {
133 c = str[i++];
134 if ((c >= 64 && c <= 126) || (c >= 128 && c <= 160) || (c >= 224 && c <= 252))
135 whatcode = SJIS;
136 else if (c >= 161 && c <= 223)
137 whatcode = EUCORSJIS;
138 }
139 else if (c >= 161 && c <= 223)
140 {
141 c = str[i++];
142 if (c >= 240 && c <= 254)
143 whatcode = EUC;
144 else if (c >= 161 && c <= 223)
145 whatcode = EUCORSJIS;
146 else if (c >= 224 && c <= 239)
147 {
148 whatcode = EUCORSJIS;
149 while (c >= 64 && c != '\0' && whatcode == EUCORSJIS)
150 {
151 if (c >= 129)
152 {
153 if (c <= 141 || (c >= 143 && c <= 159))
154 whatcode = SJIS;
155 else if (c >= 253 && c <= 254)
156 whatcode = EUC;
157 }
158 c = str[i++];
159 }
160 }
161 else if (c <= 159)
162 whatcode = SJIS;
163 }
164 else if (c >= 240 && c <= 254)
165 whatcode = EUC;
166 else if (c >= 224 && c <= 239)
167 {
168 c = str[i++];
169 if ((c >= 64 && c <= 126) || (c >= 128 && c <= 160))
170 whatcode = SJIS;
171 else if (c >= 253 && c <= 254)
172 whatcode = EUC;
173 else if (c >= 161 && c <= 252)
174 whatcode = EUCORSJIS;
175 }
176 }
177 }
178
179 #ifdef DEBUG
180 if (whatcode == ASCII)
181 debug ("Kanji code not included.");
182 else if (whatcode == EUCORSJIS)
183 debug ("Kanji code not detected.");
184 else
185 debug ("Kanji code detected at %d byte.", i);
186 #endif
187
188 if (whatcode == EUCORSJIS && oldcode != ASCII)
189 whatcode = oldcode;
190
191 if (whatcode == EUCORSJIS)
192 {
193 if (getenv ("LC_ALL"))
194 lang = getenv ("LC_ALL");
195 else if (getenv ("LC_CTYPE"))
196 lang = getenv ("LC_CTYPE");
197 else if (getenv ("LANG"))
198 lang = getenv ("LANG");
199
200 if (lang)
201 {
202 if (strcmp (lang, "ja_JP.SJIS") == 0 ||
203 #ifdef hpux
204 strcmp (lang, "japanese") == 0 ||
205 #endif
206 strcmp (lang, "ja_JP.mscode") == 0 ||
207 strcmp (lang, "ja_JP.PCK") == 0)
208 whatcode = SJIS;
209 else if (strncmp (lang, "ja", 2) == 0)
210 #ifdef SJISPRE
211 whatcode = SJIS;
212 #else
213 whatcode = EUC;
214 #endif
215 }
216 }
217
218 if (whatcode == EUCORSJIS)
219 #ifdef SJISPRE
220 whatcode = SJIS;
221 #else
222 whatcode = EUC;
223 #endif
224
225 return whatcode;
226 }
227
228 /* SJIStoJIS() is sjis2jis() by Ken Lunde. */
229
/*
 * Convert one double-byte Shift_JIS character to its 7-bit JIS form in
 * place.  *p1 holds the first byte and *p2 the second on entry; both are
 * overwritten with the JIS bytes.
 */
static void
SJIStoJIS (int *p1, int *p2)
{
  const unsigned char lead = (unsigned char) *p1;
  const unsigned char trail = (unsigned char) *p2;
  int row_base;
  int cell_shift;
  int odd_row;

  row_base = (lead < 160) ? 112 : 176;

  if (trail < 159)
    {
      /* First half of the SJIS row pair: maps to the odd JIS row. */
      odd_row = 1;
      cell_shift = (trail > 127) ? 32 : 31;
    }
  else
    {
      odd_row = 0;
      cell_shift = 126;
    }

  *p1 = ((lead - row_base) << 1) - odd_row;
  *p2 -= cell_shift;
}
242
243 /* han2zen() was derived from han2zen() written by Ken Lunde. */
244
/* Half-width kana that can take a dakuten (voiced mark, byte 222). */
#define IS_DAKU(c) (((c) >= 182 && (c) <= 196) || ((c) >= 202 && (c) <= 206) || ((c) == 179))
/* Half-width kana that can take a handakuten (semi-voiced mark, byte 223). */
#define IS_HANDAKU(c) ((c) >= 202 && (c) <= 206)

/*
 * Convert a half-width katakana byte to the corresponding full-width
 * Shift_JIS character.
 *
 *   *p1  in:  half-width kana byte (161..223)
 *        out: first byte of the full-width SJIS character
 *   *p2  in:  222 (dakuten) or 223 (handakuten) when that mark follows
 *             the kana, any other value otherwise
 *        out: second byte of the full-width SJIS character
 *
 * A following mark is folded into the result when the kana accepts it.
 * When *p1 is outside 161..223 both values are left untouched, because
 * the lookup table has no entry for such a byte.
 */
static void
han2zen (int *p1, int *p2)
{
  /* Full-width SJIS byte pairs, indexed by (half-width byte - 161). */
  static const int mtable[][2] = {
    {129, 66}, {129, 117}, {129, 118}, {129, 65}, {129, 69}, {131, 146},
    {131, 64}, {131, 66}, {131, 68}, {131, 70}, {131, 72}, {131, 131},
    {131, 133}, {131, 135}, {131, 98}, {129, 91}, {131, 65}, {131, 67},
    {131, 69}, {131, 71}, {131, 73}, {131, 74}, {131, 76}, {131, 78},
    {131, 80}, {131, 82}, {131, 84}, {131, 86}, {131, 88}, {131, 90},
    {131, 92}, {131, 94}, {131, 96}, {131, 99}, {131, 101}, {131, 103},
    {131, 105}, {131, 106}, {131, 107}, {131, 108}, {131, 109}, {131, 110},
    {131, 113}, {131, 116}, {131, 119}, {131, 122}, {131, 125}, {131, 126},
    {131, 128}, {131, 129}, {131, 130}, {131, 132}, {131, 134}, {131, 136},
    {131, 137}, {131, 138}, {131, 139}, {131, 140}, {131, 141}, {131, 143},
    {131, 147}, {129, 74}, {129, 75}
  };
  const int c = *p1;
  int daku;
  int handaku;

  /* Refuse bytes the table does not cover instead of reading outside it. */
  if (c < 161 || c > 223)
    return;

  daku = (*p2 == 222 && IS_DAKU (c));		/* Daku-ten */
  handaku = (!daku && *p2 == 223 && IS_HANDAKU (c));	/* Han-daku-ten */

  *p1 = mtable[c - 161][0];
  *p2 = mtable[c - 161][1];

  if (daku)
    {
      if ((*p2 >= 74 && *p2 <= 103) || (*p2 >= 110 && *p2 <= 122))
	(*p2)++;		/* voiced form is the next code point */
      else if (*p1 == 131 && *p2 == 69)
	*p2 = 148;		/* U + dakuten -> VU (SJIS 131,148) */
    }
  else if (handaku && *p2 >= 110 && *p2 <= 122)
    (*p2) += 2;			/* semi-voiced form is two code points on */
}
339
340 /* Recast strcpy to handle unsigned chars used below. */
341 #define ustrcpy(A,B) (strcpy((char*)(A),(const char*)(B)))
342
343 static void
do_convert(unsigned char * to,unsigned char * from,const char * code)344 do_convert (unsigned char *to, unsigned char *from, const char *code)
345 {
346 #ifdef HAVE_ICONV
347 iconv_t cd;
348 size_t from_len, to_len;
349
350 if ((cd = iconv_open (EUCSTR, code)) == (iconv_t) - 1)
351 {
352 error ("iconv_open() error");
353 #ifdef HAVE_ERRNO_H
354 if (errno == EINVAL)
355 error ("invalid code specification: \"%s\" or \"%s\"",
356 EUCSTR, code);
357 #endif
358 strcpy ((char *) to, (const char *) from);
359 return;
360 }
361
362 from_len = strlen ((const char *) from) + 1;
363 to_len = BUFSIZ;
364
365 if ((int) iconv(cd, (char **) &from, &from_len, (char **) &to, &to_len) == -1)
366 {
367 #ifdef HAVE_ERRNO_H
368 if (errno == EINVAL)
369 error ("invalid end of input string");
370 else if (errno == EILSEQ)
371 error ("invalid code in input string");
372 else if (errno == E2BIG)
373 error ("output buffer overflow at do_convert()");
374 else
375 #endif
376 error ("something happen");
377 strcpy ((char *) to, (const char *) from);
378 return;
379 }
380
381 if (iconv_close (cd) != 0)
382 {
383 error ("iconv_close() error");
384 }
385 #else
386 int p1, p2, i, j;
387 int jisx0208 = FALSE;
388 int hankaku = FALSE;
389
390 j = 0;
391 if (strcmp (code, NEWJISSTR) == 0 || strcmp (code, OLDJISSTR) == 0)
392 {
393 for (i = 0; from[i] != '\0' && j < BUFSIZ; i++)
394 {
395 if (from[i] == ESC)
396 {
397 i++;
398 if (from[i] == '$')
399 {
400 jisx0208 = TRUE;
401 hankaku = FALSE;
402 i++;
403 }
404 else if (from[i] == '(')
405 {
406 jisx0208 = FALSE;
407 i++;
408 if (from[i] == 'I') /* Hankaku Kana */
409 hankaku = TRUE;
410 else
411 hankaku = FALSE;
412 }
413 }
414 else
415 {
416 if (jisx0208)
417 to[j++] = from[i] + 128;
418 else if (hankaku)
419 {
420 to[j++] = SS2;
421 to[j++] = from[i] + 128;
422 }
423 else
424 to[j++] = from[i];
425 }
426 }
427 }
428 else if (strcmp (code, SJISSTR) == 0)
429 {
430 for (i = 0; from[i] != '\0' && j < BUFSIZ; i++)
431 {
432 p1 = from[i];
433 if (p1 < 127)
434 to[j++] = p1;
435 else if ((p1 >= 161) && (p1 <= 223))
436 { /* Hankaku Kana */
437 to[j++] = SS2;
438 to[j++] = p1;
439 }
440 else
441 {
442 p2 = from[++i];
443 SJIStoJIS (&p1, &p2);
444 to[j++] = p1 + 128;
445 to[j++] = p2 + 128;
446 }
447 }
448 }
449 else
450 {
451 error ("invalid code specification: \"%s\"", code);
452 return;
453 }
454
455 if (j >= BUFSIZ)
456 {
457 error ("output buffer overflow at do_convert()");
458 ustrcpy (to, from);
459 }
460 else
461 to[j] = '\0';
462 #endif /* HAVE_ICONV */
463 }
464
465 static int
do_check_and_conv(unsigned char * to,unsigned char * from)466 do_check_and_conv (unsigned char *to, unsigned char *from)
467 {
468 static unsigned char tmp[BUFSIZ];
469 int p1, p2, i, j;
470 int kanji = TRUE;
471
472 switch (DetectKanjiCode (from))
473 {
474 case NEW:
475 debug ("Kanji code is New JIS.");
476 do_convert (tmp, from, NEWJISSTR);
477 break;
478 case OLD:
479 debug ("Kanji code is Old JIS.");
480 do_convert (tmp, from, OLDJISSTR);
481 break;
482 case ESCI:
483 debug ("This string includes Hankaku-Kana (jisx0201) escape sequence [ESC] + ( + I.");
484 do_convert (tmp, from, NEWJISSTR);
485 break;
486 case NEC:
487 debug ("Kanji code is NEC Kanji.");
488 error ("cannot convert NEC Kanji.");
489 ustrcpy (tmp, from);
490 kanji = FALSE;
491 break;
492 case EUC:
493 debug ("Kanji code is EUC.");
494 ustrcpy (tmp, from);
495 break;
496 case SJIS:
497 debug ("Kanji code is SJIS.");
498 do_convert (tmp, from, SJISSTR);
499 break;
500 case EUCORSJIS:
501 debug ("Kanji code is EUC or SJIS.");
502 ustrcpy (tmp, from);
503 kanji = FALSE;
504 break;
505 case ASCII:
506 debug ("This is ASCII string.");
507 ustrcpy (tmp, from);
508 kanji = FALSE;
509 break;
510 default:
511 debug ("This string includes unknown code.");
512 ustrcpy (tmp, from);
513 kanji = FALSE;
514 break;
515 }
516
517 /* Hankaku Kana ---> Zenkaku Kana */
518 if (kanji)
519 {
520 j = 0;
521 for (i = 0; tmp[i] != '\0' && j < BUFSIZ; i++)
522 {
523 if (tmp[i] == SS2)
524 {
525 p1 = tmp[++i];
526 if (tmp[i + 1] == SS2)
527 {
528 p2 = tmp[i + 2];
529 if (p2 == 222 || p2 == 223)
530 i += 2;
531 else
532 p2 = 0;
533 }
534 else
535 p2 = 0;
536 han2zen (&p1, &p2);
537 SJIStoJIS (&p1, &p2);
538 to[j++] = p1 + 128;
539 to[j++] = p2 + 128;
540 }
541 else
542 to[j++] = tmp[i];
543 }
544
545 if (j >= BUFSIZ)
546 {
547 error ("output buffer overflow at Hankaku --> Zenkaku");
548 ustrcpy (to, tmp);
549 }
550 else
551 to[j] = '\0';
552 }
553 else
554 ustrcpy (to, tmp);
555
556 return kanji;
557 }
558
/*
 * Convert SRC (JIS, Shift_JIS, EUC-JP or ASCII, auto-detected) to EUC-JP
 * with full-width kana and store the NUL-terminated result in DEST.
 *
 *   dest      output buffer of at least DEST_MAX bytes
 *   src       NUL-terminated input, shorter than BUFSIZ bytes
 *   dest_max  capacity of DEST in bytes; may not exceed BUFSIZ
 *
 * Returns the value of do_check_and_conv() (TRUE when the text was
 * handled as Kanji, FALSE when it was copied through), or -1 on error.
 * When the converted text does not fit, DEST receives as much of SRC as
 * fits (always NUL-terminated when DEST_MAX > 0) and -1 is returned.
 * For the other errors DEST is left untouched.
 *
 * Uses a static work buffer.
 */
int
any2eucjp (unsigned char *dest, unsigned char *src, unsigned int dest_max)
{
  static unsigned char tmp_dest[BUFSIZ];
  size_t src_len;
  int ret;

  if (dest == NULL || src == NULL)
    {
      error ("NULL argument");
      return -1;
    }

  src_len = strlen ((const char *) src);
  if (src_len >= BUFSIZ)
    {
      error ("input string too large");
      return -1;
    }
  if (dest_max > BUFSIZ)
    {
      error ("invalid maximum size of destination\nit should be less than %d.", BUFSIZ);
      return -1;
    }
  ret = do_check_and_conv (tmp_dest, src);
  if (strlen ((const char *) tmp_dest) >= dest_max)
    {
      error ("output buffer overflow");
      /* Fall back to the unconverted input, but never write more than
         the caller's buffer can hold. */
      if (dest_max > 0)
	{
	  size_t n = src_len;

	  if (n >= dest_max)
	    n = (size_t) dest_max - 1;
	  memcpy (dest, src, n);
	  dest[n] = '\0';
	}
      return -1;
    }
  ustrcpy (dest, tmp_dest);
  return ret;
}
585
586 #if 0
/*
 * Return the length in bytes of S after conversion by any2eucjp().
 * Returns 0 when the work buffer cannot be allocated or when
 * any2eucjp() rejects the input without writing anything.
 */
unsigned int
strwidth (unsigned char *s)
{
  unsigned char *t;
  unsigned int i;

  t = (unsigned char *) gdMalloc (BUFSIZ);
  if (t == NULL)
    return 0;

  /* any2eucjp() can fail before touching the buffer; keep it a valid
     string so strlen() below is always defined. */
  t[0] = '\0';
  any2eucjp (t, s, BUFSIZ);
  i = (unsigned int) strlen ((const char *) t);
  gdFree (t);
  return i;
}
599
600 #ifdef DEBUG
/*
 * Debug driver: read one line from stdin, print its length before and
 * after conversion, then print the converted text.  Returns 0 on
 * success and 1 when the output buffer cannot be allocated.
 */
int
main (void)
{
  unsigned char input[BUFSIZ];
  unsigned char *output;
  unsigned char *str;
  int c, i = 0;

  /* Stop at newline or end of input, and leave room for the terminator. */
  while (i < BUFSIZ - 1 && (c = fgetc (stdin)) != EOF && c != '\n')
    input[i++] = (unsigned char) c;
  input[i] = '\0';

  printf ("input : %lu bytes\n",
	  (unsigned long) strlen ((const char *) input));
  printf ("output: %u bytes\n", strwidth (input));

  output = (unsigned char *) gdMalloc (BUFSIZ);
  if (output == NULL)
    return 1;
  output[0] = '\0';		/* any2eucjp() may fail without writing */
  any2eucjp (output, input, BUFSIZ);
  str = output;
  while (*str != '\0')
    putchar (*(str++));
  putchar ('\n');
  gdFree (output);

  return 0;
}
626 #endif
627 #endif
628