1
2 /* gdkanji.c (Kanji code converter) */
3 /* written by Masahito Yamaga (ma@yama-ga.com) */
4
5 #include <stdio.h>
6 #include <stdlib.h>
7 #include <string.h>
8 #include "gd.h"
9 #include "gdhelpers.h"
10
11 #include <stdarg.h>
12 #if defined(HAVE_ICONV_H) || defined(HAVE_ICONV)
13 #include <iconv.h>
14 #include <errno.h>
15 #endif
16
/* Seeing <iconv.h> is treated as proof that iconv() itself is usable. */
#if !defined(HAVE_ICONV) && defined(HAVE_ICONV_H)
#define HAVE_ICONV 1
#endif

/* Name prefixed to every diagnostic emitted from this file. */
#define LIBNAME "any2eucjp()"

/* DOS/Windows toolchains: prefer Shift_JIS when the input is ambiguous. */
#if defined(__MSC__) || defined(__BORLANDC__) || defined(__TURBOC__) || defined(_Windows) || defined(MSDOS)
#ifndef SJISPRE
#define SJISPRE 1
#endif
#endif

/* Use our own boolean constants regardless of what headers defined. */
#undef TRUE
#undef FALSE
#define TRUE 1
#define FALSE 0

/* Encoding tags returned by DetectKanjiCode(). */
enum
{
  NEW = 1,			/* JIS, introduced by ESC $ B */
  OLD = 2,			/* JIS, introduced by ESC $ @ */
  ESCI = 3,			/* ESC ( I sequence seen */
  NEC = 4,			/* ESC K sequence seen */
  EUC = 5,
  SJIS = 6,
  EUCORSJIS = 7,		/* 8-bit data seen, but still ambiguous */
  ASCII = 8			/* nothing but 7-bit data seen */
};

/* Encoding names handed to do_convert() (and on to iconv_open()). */
#define NEWJISSTR "JIS7"
#define OLDJISSTR "jis"
#define EUCSTR "eucJP"
#define SJISSTR "SJIS"

/* Control bytes. */
enum
{
  ESC = 27,			/* escape */
  SS2 = 142			/* prefix written before half-width kana bytes */
};
55
/*
 * Print a printf-style trace line to stdout, prefixed with LIBNAME and
 * followed by a newline.  Does nothing unless DEBUG is defined at build time.
 */
static void
debug (const char *format,...)
{
#ifndef DEBUG
  (void) format;
#else
  va_list ap;

  va_start (ap, format);
  fputs (LIBNAME, stdout);
  fputs (": ", stdout);
  vfprintf (stdout, format, ap);
  fputc ('\n', stdout);
  va_end (ap);
#endif
}
69
70 static void
error(const char * format,...)71 error (const char *format,...)
72 {
73 va_list args;
74 char *tmp;
75
76 va_start(args, format);
77 vspprintf(&tmp, 0, format, args);
78 va_end(args);
79 php_error_docref(NULL, E_WARNING, "%s: %s", LIBNAME, tmp);
80 efree(tmp);
81 }
82
83 /* DetectKanjiCode() derived from DetectCodeType() by Ken Lunde. */
84
85 static int
DetectKanjiCode(unsigned char * str)86 DetectKanjiCode (unsigned char *str)
87 {
88 static int whatcode = ASCII;
89 int oldcode = ASCII;
90 int c, i;
91 char *lang = NULL;
92
93 c = '\1';
94 i = 0;
95
96 if (whatcode != EUCORSJIS && whatcode != ASCII)
97 {
98 oldcode = whatcode;
99 whatcode = ASCII;
100 }
101
102 while ((whatcode == EUCORSJIS || whatcode == ASCII) && c != '\0')
103 {
104 if ((c = str[i++]) != '\0')
105 {
106 if (c == ESC)
107 {
108 c = str[i++];
109 if (c == '$')
110 {
111 c = str[i++];
112 if (c == 'B')
113 whatcode = NEW;
114 else if (c == '@')
115 whatcode = OLD;
116 }
117 else if (c == '(')
118 {
119 c = str[i++];
120 if (c == 'I')
121 whatcode = ESCI;
122 }
123 else if (c == 'K')
124 whatcode = NEC;
125 }
126 else if ((c >= 129 && c <= 141) || (c >= 143 && c <= 159))
127 whatcode = SJIS;
128 else if (c == SS2)
129 {
130 c = str[i++];
131 if ((c >= 64 && c <= 126) || (c >= 128 && c <= 160) || (c >= 224 && c <= 252))
132 whatcode = SJIS;
133 else if (c >= 161 && c <= 223)
134 whatcode = EUCORSJIS;
135 }
136 else if (c >= 161 && c <= 223)
137 {
138 c = str[i++];
139 if (c >= 240 && c <= 254)
140 whatcode = EUC;
141 else if (c >= 161 && c <= 223)
142 whatcode = EUCORSJIS;
143 else if (c >= 224 && c <= 239)
144 {
145 whatcode = EUCORSJIS;
146 while (c >= 64 && whatcode == EUCORSJIS)
147 {
148 if (c >= 129)
149 {
150 if (c <= 141 || (c >= 143 && c <= 159))
151 whatcode = SJIS;
152 else if (c >= 253 && c <= 254)
153 whatcode = EUC;
154 }
155 c = str[i++];
156 }
157 }
158 else if (c <= 159)
159 whatcode = SJIS;
160 }
161 else if (c >= 240 && c <= 254)
162 whatcode = EUC;
163 else if (c >= 224 && c <= 239)
164 {
165 c = str[i++];
166 if ((c >= 64 && c <= 126) || (c >= 128 && c <= 160))
167 whatcode = SJIS;
168 else if (c >= 253 && c <= 254)
169 whatcode = EUC;
170 else if (c >= 161 && c <= 252)
171 whatcode = EUCORSJIS;
172 }
173 }
174 }
175
176 #ifdef DEBUG
177 if (whatcode == ASCII)
178 debug ("Kanji code not included.");
179 else if (whatcode == EUCORSJIS)
180 debug ("Kanji code not detected.");
181 else
182 debug ("Kanji code detected at %d byte.", i);
183 #endif
184
185 if (whatcode == EUCORSJIS && oldcode != ASCII)
186 whatcode = oldcode;
187
188 if (whatcode == EUCORSJIS)
189 {
190 if (getenv ("LC_ALL"))
191 lang = getenv ("LC_ALL");
192 else if (getenv ("LC_CTYPE"))
193 lang = getenv ("LC_CTYPE");
194 else if (getenv ("LANG"))
195 lang = getenv ("LANG");
196
197 if (lang)
198 {
199 if (strcmp (lang, "ja_JP.SJIS") == 0 ||
200 #ifdef hpux
201 strcmp (lang, "japanese") == 0 ||
202 #endif
203 strcmp (lang, "ja_JP.mscode") == 0 ||
204 strcmp (lang, "ja_JP.PCK") == 0)
205 whatcode = SJIS;
206 else if (strncmp (lang, "ja", 2) == 0)
207 #ifdef SJISPRE
208 whatcode = SJIS;
209 #else
210 whatcode = EUC;
211 #endif
212 }
213 }
214
215 if (whatcode == EUCORSJIS)
216 #ifdef SJISPRE
217 whatcode = SJIS;
218 #else
219 whatcode = EUC;
220 #endif
221
222 return whatcode;
223 }
224
/* SJIStoJIS() is sjis2jis() by Ken Lunde. */

/*
 * Convert a two-byte Shift_JIS code to the corresponding JIS code, in place.
 *
 * p1: in = first byte, out = first byte of the JIS code
 * p2: in = second byte, out = second byte of the JIS code
 *
 * Only the low 8 bits of each input take part in the range decisions.
 */
static void
SJIStoJIS (int *p1, int *p2)
{
  const unsigned char lead = (unsigned char) *p1;
  const unsigned char trail = (unsigned char) *p2;
  int odd_row;
  int row_base;
  int cell_shift;

  /* Trail bytes below 159 belong to the first of the two rows per lead. */
  odd_row = (trail < 159) ? 1 : 0;

  /* Lead bytes come in two blocks with different bases. */
  row_base = (lead < 160) ? 112 : 176;

  if (odd_row)
    cell_shift = (trail > 127) ? 32 : 31;	/* 127 is skipped in the trail range */
  else
    cell_shift = 126;

  *p1 = ((lead - row_base) << 1) - odd_row;
  *p2 = *p2 - cell_shift;
}
239
/* han2zen() was derived from han2zen() written by Ken Lunde. */

/* Half-width kana bytes that can take a (han-)dakuten mark. */
#define IS_DAKU(c) (((c) >= 182 && (c) <= 196) || ((c) >= 202 && (c) <= 206) || ((c) == 179))
#define IS_HANDAKU(c) ((c) >= 202 && (c) <= 206)

/*
 * Replace a half-width kana byte by the two-byte Shift_JIS code of its
 * full-width counterpart, in place.
 *
 * p1: in = half-width kana byte (161..223), out = first SJIS byte
 * p2: in = 222 (dakuten) or 223 (handakuten) if such a mark follows the
 *          kana, anything else otherwise; out = second SJIS byte
 *
 * When the mark can combine with the kana, the code of the combined
 * full-width character is produced.  If *p1 is not in 161..223 both values
 * are left untouched; the lookup table has no entry for such bytes and
 * indexing it would read outside the array.
 */
static void
han2zen (int *p1, int *p2)
{
  /* Full-width SJIS codes for the bytes 161..223, in order. */
  static const unsigned char mtable[][2] =
  {
    {129, 66},
    {129, 117},
    {129, 118},
    {129, 65},
    {129, 69},
    {131, 146},
    {131, 64},
    {131, 66},
    {131, 68},
    {131, 70},
    {131, 72},
    {131, 131},
    {131, 133},
    {131, 135},
    {131, 98},
    {129, 91},
    {131, 65},
    {131, 67},
    {131, 69},
    {131, 71},
    {131, 73},
    {131, 74},
    {131, 76},
    {131, 78},
    {131, 80},
    {131, 82},
    {131, 84},
    {131, 86},
    {131, 88},
    {131, 90},
    {131, 92},
    {131, 94},
    {131, 96},
    {131, 99},
    {131, 101},
    {131, 103},
    {131, 105},
    {131, 106},
    {131, 107},
    {131, 108},
    {131, 109},
    {131, 110},
    {131, 113},
    {131, 116},
    {131, 119},
    {131, 122},
    {131, 125},
    {131, 126},
    {131, 128},
    {131, 129},
    {131, 130},
    {131, 132},
    {131, 134},
    {131, 136},
    {131, 137},
    {131, 138},
    {131, 139},
    {131, 140},
    {131, 141},
    {131, 143},
    {131, 147},
    {129, 74},
    {129, 75}
  };
  const int c = *p1;
  int daku = 0;
  int handaku = 0;

  /* Reject anything the table does not cover. */
  if (c < 161 || c > 223)
    return;

  if (*p2 == 222 && IS_DAKU (c))
    daku = 1;			/* Daku-ten */
  else if (*p2 == 223 && IS_HANDAKU (c))
    handaku = 1;		/* Han-daku-ten */

  *p1 = mtable[c - 161][0];
  *p2 = mtable[c - 161][1];

  if (daku)
    {
      /* The voiced form sits right after the plain one in these ranges. */
      if ((*p2 >= 74 && *p2 <= 103) || (*p2 >= 110 && *p2 <= 122))
	(*p2)++;
      else if (*p2 == 131 || *p2 == 69)
	*p2 = 148;
    }
  else if (handaku && *p2 >= 110 && *p2 <= 122)
    {
      /* The semi-voiced form sits two after the plain one. */
      (*p2) += 2;
    }
}
336
337 /* Recast strcpy to handle unsigned chars used below. */
338 #define ustrcpy(A,B) (strcpy((char*)(A),(const char*)(B)))
339
340 static void
do_convert(unsigned char * to,unsigned char * from,const char * code)341 do_convert (unsigned char *to, unsigned char *from, const char *code)
342 {
343 #ifdef HAVE_ICONV
344 iconv_t cd;
345 size_t from_len, to_len;
346
347 if ((cd = iconv_open (EUCSTR, code)) == (iconv_t) - 1)
348 {
349 error ("iconv_open() error");
350 if (errno == EINVAL)
351 error ("invalid code specification: \"%s\" or \"%s\"",
352 EUCSTR, code);
353 strcpy ((char *) to, (const char *) from);
354 return;
355 }
356
357 from_len = strlen ((const char *) from) + 1;
358 to_len = BUFSIZ;
359
360 if ((int) iconv(cd, (char **) &from, &from_len, (char **) &to, &to_len) == -1)
361 {
362 if (errno == EINVAL)
363 error ("invalid end of input string");
364 else if (errno == EILSEQ)
365 error ("invalid code in input string");
366 else if (errno == E2BIG)
367 error ("output buffer overflow at do_convert()");
368 else
369 error ("something happen");
370 strcpy ((char *) to, (const char *) from);
371 return;
372 }
373
374 if (iconv_close (cd) != 0)
375 {
376 error ("iconv_close() error");
377 }
378 #else
379 int p1, p2, i, j;
380 int jisx0208 = FALSE;
381 int hankaku = FALSE;
382
383 j = 0;
384 if (strcmp (code, NEWJISSTR) == 0 || strcmp (code, OLDJISSTR) == 0)
385 {
386 for (i = 0; from[i] != '\0' && j < BUFSIZ; i++)
387 {
388 if (from[i] == ESC)
389 {
390 i++;
391 if (from[i] == '$')
392 {
393 jisx0208 = TRUE;
394 hankaku = FALSE;
395 i++;
396 }
397 else if (from[i] == '(')
398 {
399 jisx0208 = FALSE;
400 i++;
401 if (from[i] == 'I') /* Hankaku Kana */
402 hankaku = TRUE;
403 else
404 hankaku = FALSE;
405 }
406 }
407 else
408 {
409 if (jisx0208)
410 to[j++] = from[i] + 128;
411 else if (hankaku)
412 {
413 to[j++] = SS2;
414 to[j++] = from[i] + 128;
415 }
416 else
417 to[j++] = from[i];
418 }
419 }
420 }
421 else if (strcmp (code, SJISSTR) == 0)
422 {
423 for (i = 0; from[i] != '\0' && j < BUFSIZ; i++)
424 {
425 p1 = from[i];
426 if (p1 < 127)
427 to[j++] = p1;
428 else if ((p1 >= 161) && (p1 <= 223))
429 { /* Hankaku Kana */
430 to[j++] = SS2;
431 to[j++] = p1;
432 }
433 else
434 {
435 p2 = from[++i];
436 SJIStoJIS (&p1, &p2);
437 to[j++] = p1 + 128;
438 to[j++] = p2 + 128;
439 }
440 }
441 }
442 else
443 {
444 error ("invalid code specification: \"%s\"", code);
445 return;
446 }
447
448 if (j >= BUFSIZ)
449 {
450 error ("output buffer overflow at do_convert()");
451 ustrcpy (to, from);
452 }
453 else
454 to[j] = '\0';
455 #endif /* HAVE_ICONV */
456 }
457
458 static int
do_check_and_conv(unsigned char * to,unsigned char * from)459 do_check_and_conv (unsigned char *to, unsigned char *from)
460 {
461 static unsigned char tmp[BUFSIZ];
462 int p1, p2, i, j;
463 int kanji = TRUE;
464
465 switch (DetectKanjiCode (from))
466 {
467 case NEW:
468 debug ("Kanji code is New JIS.");
469 do_convert (tmp, from, NEWJISSTR);
470 break;
471 case OLD:
472 debug ("Kanji code is Old JIS.");
473 do_convert (tmp, from, OLDJISSTR);
474 break;
475 case ESCI:
476 debug ("This string includes Hankaku-Kana (jisx0201) escape sequence [ESC] + ( + I.");
477 do_convert (tmp, from, NEWJISSTR);
478 break;
479 case NEC:
480 debug ("Kanji code is NEC Kanji.");
481 error ("cannot convert NEC Kanji.");
482 ustrcpy (tmp, from);
483 kanji = FALSE;
484 break;
485 case EUC:
486 debug ("Kanji code is EUC.");
487 ustrcpy (tmp, from);
488 break;
489 case SJIS:
490 debug ("Kanji code is SJIS.");
491 do_convert (tmp, from, SJISSTR);
492 break;
493 case EUCORSJIS:
494 debug ("Kanji code is EUC or SJIS.");
495 ustrcpy (tmp, from);
496 kanji = FALSE;
497 break;
498 case ASCII:
499 debug ("This is ASCII string.");
500 ustrcpy (tmp, from);
501 kanji = FALSE;
502 break;
503 default:
504 debug ("This string includes unknown code.");
505 ustrcpy (tmp, from);
506 kanji = FALSE;
507 break;
508 }
509
510 /* Hankaku Kana ---> Zenkaku Kana */
511 if (kanji)
512 {
513 j = 0;
514 for (i = 0; tmp[i] != '\0' && j < BUFSIZ; i++)
515 {
516 if (tmp[i] == SS2)
517 {
518 p1 = tmp[++i];
519 if (tmp[i + 1] == SS2)
520 {
521 p2 = tmp[i + 2];
522 if (p2 == 222 || p2 == 223)
523 i += 2;
524 else
525 p2 = 0;
526 }
527 else
528 p2 = 0;
529 han2zen (&p1, &p2);
530 SJIStoJIS (&p1, &p2);
531 to[j++] = p1 + 128;
532 to[j++] = p2 + 128;
533 }
534 else
535 to[j++] = tmp[i];
536 }
537
538 if (j >= BUFSIZ)
539 {
540 error ("output buffer overflow at Hankaku --> Zenkaku");
541 ustrcpy (to, tmp);
542 }
543 else
544 to[j] = '\0';
545 }
546 else
547 ustrcpy (to, tmp);
548
549 return kanji;
550 }
551
/*
 * Convert SRC (JIS, Shift_JIS, EUC-JP or ASCII, auto-detected) to EUC-JP.
 *
 * dest:     output buffer of at least DEST_MAX bytes
 * src:      NUL-terminated input, shorter than BUFSIZ
 * dest_max: size of DEST in bytes, at most BUFSIZ
 *
 * Returns the result of do_check_and_conv() (TRUE if a Japanese encoding
 * was converted, FALSE if the text was passed through), or -1 on error.
 * When the converted text does not fit, DEST receives as much of the
 * unconverted SRC as fits (always NUL-terminated when dest_max > 0) and -1
 * is returned.  For the other errors DEST is not written.
 *
 * Uses a static scratch buffer, so the function is not reentrant.
 */
int
any2eucjp (unsigned char *dest, unsigned char *src, unsigned int dest_max)
{
  static unsigned char tmp_dest[BUFSIZ];
  size_t src_len;
  int ret;

  src_len = strlen ((const char *) src);
  if (src_len >= BUFSIZ)
    {
      error ("input string too large");
      return -1;
    }
  if (dest_max > BUFSIZ)
    {
      error ("invalid maximum size of destination\nit should be less than %d.", BUFSIZ);
      return -1;
    }
  ret = do_check_and_conv (tmp_dest, src);
  if (strlen ((const char *) tmp_dest) >= dest_max)
    {
      error ("output buffer overflow");
      /* Fall back to the raw input, but never write beyond dest_max:
         SRC itself may be longer than the destination. */
      if (dest_max > 0)
	{
	  size_t n = (src_len < dest_max) ? src_len : (size_t) dest_max - 1;

	  memmove (dest, src, n);
	  dest[n] = '\0';
	}
      return -1;
    }
  ustrcpy (dest, tmp_dest);
  return ret;
}
578
#if 0
/*
 * Disabled helper and test driver, kept for reference.
 *
 * strwidth() returns the length in bytes of S after conversion to EUC-JP,
 * or 0 if no scratch buffer could be allocated.
 */
unsigned int
strwidth (unsigned char *s)
{
  unsigned char *t;
  unsigned int i;

  t = (unsigned char *) gdMalloc (BUFSIZ);
  if (t == NULL)
    return 0;
  t[0] = '\0';			/* any2eucjp() may fail before writing */
  any2eucjp (t, s, BUFSIZ);
  i = (unsigned int) strlen ((const char *) t);
  gdFree (t);
  return i;
}

#ifdef DEBUG
/* Read one line from stdin, print its size before/after conversion and
   the converted text. */
int
main (void)
{
  unsigned char input[BUFSIZ];
  unsigned char *output;
  unsigned char *str;
  int c, i = 0;

  /* Leave room for the terminator and stop at end of input. */
  while (i < BUFSIZ - 1 && (c = fgetc (stdin)) != EOF && c != '\n')
    input[i++] = (unsigned char) c;
  input[i] = '\0';

  printf ("input : %lu bytes\n", (unsigned long) strlen ((const char *) input));
  printf ("output: %u bytes\n", strwidth (input));

  output = (unsigned char *) gdMalloc (BUFSIZ);
  if (output == NULL)
    return 1;
  output[0] = '\0';
  any2eucjp (output, input, BUFSIZ);
  str = output;
  while (*str != '\0')
    putchar (*(str++));
  putchar ('\n');
  gdFree (output);

  return 0;
}
#endif
#endif
621