xref: /PHP-5.3/ext/gd/libgd/gdkanji.c (revision 36db28c8)
1 
2 /* gdkanji.c (Kanji code converter)                            */
3 /*                 written by Masahito Yamaga (ma@yama-ga.com) */
4 
5 #include <stdio.h>
6 #include <stdlib.h>
7 #include <string.h>
8 #include "gd.h"
9 #include "gdhelpers.h"
10 
11 #include <stdarg.h>
12 #if defined(HAVE_ICONV_H) || defined(HAVE_ICONV)
13 #include <iconv.h>
14 #ifdef HAVE_ERRNO_H
15 #include <errno.h>
16 #endif
17 #endif
18 
19 #if defined(HAVE_ICONV_H) && !defined(HAVE_ICONV)
20 #define HAVE_ICONV 1
21 #endif
22 
/* Name used as the prefix of every diagnostic emitted from this file. */
#define LIBNAME "any2eucjp()"

/* DOS/Windows toolchains prefer Shift_JIS when detection stays ambiguous,
   unless the build already defined SJISPRE itself. */
#if !defined(SJISPRE) && (defined(__MSC__) || defined(__BORLANDC__) || defined(__TURBOC__) || defined(_Windows) || defined(MSDOS))
#define SJISPRE 1
#endif

/* Use the local boolean values no matter what earlier headers defined
   (#undef on an undefined name is a no-op). */
#undef TRUE
#undef FALSE
#define TRUE  1
#define FALSE 0

/* Encoding identifiers returned by DetectKanjiCode(). */
#define NEW       1   /* New JIS            (ESC $ B) */
#define OLD       2   /* Old JIS            (ESC $ @) */
#define ESCI      3   /* Hankaku kana shift (ESC ( I) */
#define NEC       4   /* NEC Kanji          (ESC K)   */
#define EUC       5
#define SJIS      6
#define EUCORSJIS 7   /* ambiguous: could be EUC or SJIS */
#define ASCII     8   /* nothing but 7-bit text was seen */

/* Source-encoding names handed to do_convert(). */
#define NEWJISSTR "JIS7"
#define OLDJISSTR "jis"
#define EUCSTR    "eucJP"
#define SJISSTR   "SJIS"

/* Control bytes. */
#define ESC 0x1B   /* escape (27) */
#define SS2 0x8E   /* single shift 2 (142): prefixes a half-width kana byte in EUC */
57 
/*
 * Trace helper.  When DEBUG is defined it prints the LIBNAME prefix, the
 * printf-style message and a newline to stdout; otherwise the body is
 * compiled out and the call does nothing.
 */
static void
debug (const char *format, ...)
{
#ifdef DEBUG
  va_list ap;

  fprintf (stdout, "%s: ", LIBNAME);

  va_start (ap, format);
  vfprintf (stdout, format, ap);
  va_end (ap);

  fprintf (stdout, "\n");
#endif
}
71 
72 static void
error(const char * format,...)73 error (const char *format,...)
74 {
75 	va_list args;
76 	char *tmp;
77 	TSRMLS_FETCH();
78 
79 	va_start(args, format);
80 	vspprintf(&tmp, 0, format, args);
81 	va_end(args);
82 	php_error_docref(NULL TSRMLS_CC, E_WARNING, "%s: %s", LIBNAME, tmp);
83 	efree(tmp);
84 }
85 
86 /* DetectKanjiCode() derived from DetectCodeType() by Ken Lunde. */
87 
88 static int
DetectKanjiCode(unsigned char * str)89 DetectKanjiCode (unsigned char *str)
90 {
91   static int whatcode = ASCII;
92   int oldcode = ASCII;
93   int c, i;
94   char *lang = NULL;
95 
96   c = '\1';
97   i = 0;
98 
99   if (whatcode != EUCORSJIS && whatcode != ASCII)
100     {
101       oldcode = whatcode;
102       whatcode = ASCII;
103     }
104 
105   while ((whatcode == EUCORSJIS || whatcode == ASCII) && c != '\0')
106     {
107       if ((c = str[i++]) != '\0')
108 	{
109 	  if (c == ESC)
110 	    {
111 	      c = str[i++];
112 	      if (c == '$')
113 		{
114 		  c = str[i++];
115 		  if (c == 'B')
116 		    whatcode = NEW;
117 		  else if (c == '@')
118 		    whatcode = OLD;
119 		}
120 	      else if (c == '(')
121 		{
122 		  c = str[i++];
123 		  if (c == 'I')
124 		    whatcode = ESCI;
125 		}
126 	      else if (c == 'K')
127 		whatcode = NEC;
128 	    }
129 	  else if ((c >= 129 && c <= 141) || (c >= 143 && c <= 159))
130 	    whatcode = SJIS;
131 	  else if (c == SS2)
132 	    {
133 	      c = str[i++];
134 	      if ((c >= 64 && c <= 126) || (c >= 128 && c <= 160) || (c >= 224 && c <= 252))
135 		whatcode = SJIS;
136 	      else if (c >= 161 && c <= 223)
137 		whatcode = EUCORSJIS;
138 	    }
139 	  else if (c >= 161 && c <= 223)
140 	    {
141 	      c = str[i++];
142 	      if (c >= 240 && c <= 254)
143 		whatcode = EUC;
144 	      else if (c >= 161 && c <= 223)
145 		whatcode = EUCORSJIS;
146 	      else if (c >= 224 && c <= 239)
147 		{
148 		  whatcode = EUCORSJIS;
149 		  while (c >= 64 && c != '\0' && whatcode == EUCORSJIS)
150 		    {
151 		      if (c >= 129)
152 			{
153 			  if (c <= 141 || (c >= 143 && c <= 159))
154 			    whatcode = SJIS;
155 			  else if (c >= 253 && c <= 254)
156 			    whatcode = EUC;
157 			}
158 		      c = str[i++];
159 		    }
160 		}
161 	      else if (c <= 159)
162 		whatcode = SJIS;
163 	    }
164 	  else if (c >= 240 && c <= 254)
165 	    whatcode = EUC;
166 	  else if (c >= 224 && c <= 239)
167 	    {
168 	      c = str[i++];
169 	      if ((c >= 64 && c <= 126) || (c >= 128 && c <= 160))
170 		whatcode = SJIS;
171 	      else if (c >= 253 && c <= 254)
172 		whatcode = EUC;
173 	      else if (c >= 161 && c <= 252)
174 		whatcode = EUCORSJIS;
175 	    }
176 	}
177     }
178 
179 #ifdef DEBUG
180   if (whatcode == ASCII)
181     debug ("Kanji code not included.");
182   else if (whatcode == EUCORSJIS)
183     debug ("Kanji code not detected.");
184   else
185     debug ("Kanji code detected at %d byte.", i);
186 #endif
187 
188   if (whatcode == EUCORSJIS && oldcode != ASCII)
189     whatcode = oldcode;
190 
191   if (whatcode == EUCORSJIS)
192     {
193       if (getenv ("LC_ALL"))
194 	lang = getenv ("LC_ALL");
195       else if (getenv ("LC_CTYPE"))
196 	lang = getenv ("LC_CTYPE");
197       else if (getenv ("LANG"))
198 	lang = getenv ("LANG");
199 
200       if (lang)
201 	{
202 	  if (strcmp (lang, "ja_JP.SJIS") == 0 ||
203 #ifdef hpux
204 	      strcmp (lang, "japanese") == 0 ||
205 #endif
206 	      strcmp (lang, "ja_JP.mscode") == 0 ||
207 	      strcmp (lang, "ja_JP.PCK") == 0)
208 	    whatcode = SJIS;
209 	  else if (strncmp (lang, "ja", 2) == 0)
210 #ifdef SJISPRE
211 	    whatcode = SJIS;
212 #else
213 	    whatcode = EUC;
214 #endif
215 	}
216     }
217 
218   if (whatcode == EUCORSJIS)
219 #ifdef SJISPRE
220     whatcode = SJIS;
221 #else
222     whatcode = EUC;
223 #endif
224 
225   return whatcode;
226 }
227 
/* SJIStoJIS() is sjis2jis() by Ken Lunde. */

/*
 * Converts, in place, the Shift_JIS byte pair (*p1, *p2) into the matching
 * 7-bit JIS pair.  Only the low 8 bits of each input are examined when the
 * offsets are chosen.
 */
static void
SJIStoJIS (int *p1, int *p2)
{
  const unsigned char lead = (unsigned char) *p1;
  const unsigned char trail = (unsigned char) *p2;
  const int row_base = (lead < 160) ? 112 : 176;
  int row = (lead - row_base) << 1;
  int cell_shift;

  if (trail < 159)
    {
      /* First half of the SJIS row pair: odd JIS row, and the trail byte
         skips the unused 0x7F slot once it is above 127. */
      row -= 1;
      cell_shift = (trail > 127) ? 32 : 31;
    }
  else
    cell_shift = 126;

  *p1 = row;
  *p2 -= cell_shift;
}
242 
/* han2zen() was derived from han2zen() written by Ken Lunde. */

/* Half-width kana codes that can carry a dakuten (222) / handakuten (223). */
#define IS_DAKU(c) (((c) >= 182 && (c) <= 196) || ((c) >= 202 && (c) <= 206) || ((c) == 179))
#define IS_HANDAKU(c) ((c) >= 202 && (c) <= 206)

/*
 * Converts a half-width kana code into its full-width Shift_JIS byte pair.
 *
 * On entry *p1 is the half-width code (161..223) and *p2 is the code of the
 * following half-width voicing mark (222 dakuten, 223 handakuten) or any
 * other value when there is none.  On return *p1/*p2 hold the SJIS lead and
 * trail byte of the full-width character, with the voicing mark folded in
 * when the kana can carry it.
 *
 * If *p1 is outside 161..223 both values are left untouched; the lookup
 * table has no entry for such codes.
 */
static void
han2zen (int *p1, int *p2)
{
  /* SJIS {lead, trail} for half-width codes 161..223, in order. */
  static const int mtable[][2] =
  {
    {129, 66},
    {129, 117},
    {129, 118},
    {129, 65},
    {129, 69},
    {131, 146},
    {131, 64},
    {131, 66},
    {131, 68},
    {131, 70},
    {131, 72},
    {131, 131},
    {131, 133},
    {131, 135},
    {131, 98},
    {129, 91},
    {131, 65},
    {131, 67},
    {131, 69},
    {131, 71},
    {131, 73},
    {131, 74},
    {131, 76},
    {131, 78},
    {131, 80},
    {131, 82},
    {131, 84},
    {131, 86},
    {131, 88},
    {131, 90},
    {131, 92},
    {131, 94},
    {131, 96},
    {131, 99},
    {131, 101},
    {131, 103},
    {131, 105},
    {131, 106},
    {131, 107},
    {131, 108},
    {131, 109},
    {131, 110},
    {131, 113},
    {131, 116},
    {131, 119},
    {131, 122},
    {131, 125},
    {131, 126},
    {131, 128},
    {131, 129},
    {131, 130},
    {131, 132},
    {131, 134},
    {131, 136},
    {131, 137},
    {131, 138},
    {131, 139},
    {131, 140},
    {131, 141},
    {131, 143},
    {131, 147},
    {129, 74},
    {129, 75}
  };
  const int c = *p1;
  int daku;
  int handaku;

  /* Guard the table lookup: anything else would index out of bounds. */
  if (c < 161 || c > 223)
    return;

  daku = (*p2 == 222 && IS_DAKU (c));			/* Daku-ten */
  handaku = (!daku && *p2 == 223 && IS_HANDAKU (c));	/* Han-daku-ten */

  *p1 = mtable[c - 161][0];
  *p2 = mtable[c - 161][1];

  if (daku)
    {
      /* KA..TO and HA..HO: the voiced form is the next code point. */
      if ((*p2 >= 74 && *p2 <= 103) || (*p2 >= 110 && *p2 <= 122))
        (*p2)++;
      /* U (131,69) + dakuten is VU (131,148).  The lead byte has to be
         tested through p1; comparing p2 against both values never matched. */
      else if (*p1 == 131 && *p2 == 69)
        *p2 = 148;
    }
  else if (handaku && *p2 >= 110 && *p2 <= 122)
    (*p2) += 2;
}
339 
340 /* Recast strcpy to handle unsigned chars used below. */
341 #define ustrcpy(A,B) (strcpy((char*)(A),(const char*)(B)))
342 
343 static void
do_convert(unsigned char * to,unsigned char * from,const char * code)344 do_convert (unsigned char *to, unsigned char *from, const char *code)
345 {
346 #ifdef HAVE_ICONV
347   iconv_t cd;
348   size_t from_len, to_len;
349 
350   if ((cd = iconv_open (EUCSTR, code)) == (iconv_t) - 1)
351     {
352       error ("iconv_open() error");
353 #ifdef HAVE_ERRNO_H
354       if (errno == EINVAL)
355 	error ("invalid code specification: \"%s\" or \"%s\"",
356 	       EUCSTR, code);
357 #endif
358       strcpy ((char *) to, (const char *) from);
359       return;
360     }
361 
362   from_len = strlen ((const char *) from) + 1;
363   to_len = BUFSIZ;
364 
365   if ((int) iconv(cd, (char **) &from, &from_len, (char **) &to, &to_len) == -1)
366     {
367 #ifdef HAVE_ERRNO_H
368       if (errno == EINVAL)
369 	error ("invalid end of input string");
370       else if (errno == EILSEQ)
371 	error ("invalid code in input string");
372       else if (errno == E2BIG)
373 	error ("output buffer overflow at do_convert()");
374       else
375 #endif
376 	error ("something happen");
377       strcpy ((char *) to, (const char *) from);
378       return;
379     }
380 
381   if (iconv_close (cd) != 0)
382     {
383       error ("iconv_close() error");
384     }
385 #else
386   int p1, p2, i, j;
387   int jisx0208 = FALSE;
388   int hankaku = FALSE;
389 
390   j = 0;
391   if (strcmp (code, NEWJISSTR) == 0 || strcmp (code, OLDJISSTR) == 0)
392     {
393       for (i = 0; from[i] != '\0' && j < BUFSIZ; i++)
394 	{
395 	  if (from[i] == ESC)
396 	    {
397 	      i++;
398 	      if (from[i] == '$')
399 		{
400 		  jisx0208 = TRUE;
401 		  hankaku = FALSE;
402 		  i++;
403 		}
404 	      else if (from[i] == '(')
405 		{
406 		  jisx0208 = FALSE;
407 		  i++;
408 		  if (from[i] == 'I')	/* Hankaku Kana */
409 		    hankaku = TRUE;
410 		  else
411 		    hankaku = FALSE;
412 		}
413 	    }
414 	  else
415 	    {
416 	      if (jisx0208)
417 		to[j++] = from[i] + 128;
418 	      else if (hankaku)
419 		{
420 		  to[j++] = SS2;
421 		  to[j++] = from[i] + 128;
422 		}
423 	      else
424 		to[j++] = from[i];
425 	    }
426 	}
427     }
428   else if (strcmp (code, SJISSTR) == 0)
429     {
430       for (i = 0; from[i] != '\0' && j < BUFSIZ; i++)
431 	{
432 	  p1 = from[i];
433 	  if (p1 < 127)
434 	    to[j++] = p1;
435 	  else if ((p1 >= 161) && (p1 <= 223))
436 	    {			/* Hankaku Kana */
437 	      to[j++] = SS2;
438 	      to[j++] = p1;
439 	    }
440 	  else
441 	    {
442 	      p2 = from[++i];
443 	      SJIStoJIS (&p1, &p2);
444 	      to[j++] = p1 + 128;
445 	      to[j++] = p2 + 128;
446 	    }
447 	}
448     }
449   else
450     {
451       error ("invalid code specification: \"%s\"", code);
452       return;
453     }
454 
455   if (j >= BUFSIZ)
456     {
457       error ("output buffer overflow at do_convert()");
458       ustrcpy (to, from);
459     }
460   else
461     to[j] = '\0';
462 #endif /* HAVE_ICONV */
463 }
464 
465 static int
do_check_and_conv(unsigned char * to,unsigned char * from)466 do_check_and_conv (unsigned char *to, unsigned char *from)
467 {
468   static unsigned char tmp[BUFSIZ];
469   int p1, p2, i, j;
470   int kanji = TRUE;
471 
472   switch (DetectKanjiCode (from))
473     {
474     case NEW:
475       debug ("Kanji code is New JIS.");
476       do_convert (tmp, from, NEWJISSTR);
477       break;
478     case OLD:
479       debug ("Kanji code is Old JIS.");
480       do_convert (tmp, from, OLDJISSTR);
481       break;
482     case ESCI:
483       debug ("This string includes Hankaku-Kana (jisx0201) escape sequence [ESC] + ( + I.");
484       do_convert (tmp, from, NEWJISSTR);
485       break;
486     case NEC:
487       debug ("Kanji code is NEC Kanji.");
488       error ("cannot convert NEC Kanji.");
489       ustrcpy (tmp, from);
490       kanji = FALSE;
491       break;
492     case EUC:
493       debug ("Kanji code is EUC.");
494       ustrcpy (tmp, from);
495       break;
496     case SJIS:
497       debug ("Kanji code is SJIS.");
498       do_convert (tmp, from, SJISSTR);
499       break;
500     case EUCORSJIS:
501       debug ("Kanji code is EUC or SJIS.");
502       ustrcpy (tmp, from);
503       kanji = FALSE;
504       break;
505     case ASCII:
506       debug ("This is ASCII string.");
507       ustrcpy (tmp, from);
508       kanji = FALSE;
509       break;
510     default:
511       debug ("This string includes unknown code.");
512       ustrcpy (tmp, from);
513       kanji = FALSE;
514       break;
515     }
516 
517   /* Hankaku Kana ---> Zenkaku Kana */
518   if (kanji)
519     {
520       j = 0;
521       for (i = 0; tmp[i] != '\0' && j < BUFSIZ; i++)
522 	{
523 	  if (tmp[i] == SS2)
524 	    {
525 	      p1 = tmp[++i];
526 	      if (tmp[i + 1] == SS2)
527 		{
528 		  p2 = tmp[i + 2];
529 		  if (p2 == 222 || p2 == 223)
530 		    i += 2;
531 		  else
532 		    p2 = 0;
533 		}
534 	      else
535 		p2 = 0;
536 	      han2zen (&p1, &p2);
537 	      SJIStoJIS (&p1, &p2);
538 	      to[j++] = p1 + 128;
539 	      to[j++] = p2 + 128;
540 	    }
541 	  else
542 	    to[j++] = tmp[i];
543 	}
544 
545       if (j >= BUFSIZ)
546 	{
547 	  error ("output buffer overflow at Hankaku --> Zenkaku");
548 	  ustrcpy (to, tmp);
549 	}
550       else
551 	to[j] = '\0';
552     }
553   else
554     ustrcpy (to, tmp);
555 
556   return kanji;
557 }
558 
/*
 * Converts the NUL-terminated string SRC (JIS, Shift_JIS, EUC-JP or ASCII,
 * auto-detected) to EUC-JP and stores it in DEST, which has room for
 * DEST_MAX bytes including the terminator.
 *
 * Returns the result of do_check_and_conv() (TRUE if SRC was converted as
 * Japanese text, FALSE if it was copied through) or -1 on error:
 *   - SRC is BUFSIZ bytes or longer, or DEST_MAX exceeds BUFSIZ: DEST is
 *     left untouched;
 *   - the converted text does not fit in DEST_MAX bytes: DEST receives as
 *     much of the unconverted SRC as fits, always NUL-terminated (nothing
 *     is written when DEST_MAX is 0).
 *
 * The conversion goes through a function-static buffer, so the function is
 * not reentrant.
 */
int
any2eucjp (unsigned char *dest, unsigned char *src, unsigned int dest_max)
{
  static unsigned char tmp_dest[BUFSIZ];
  size_t src_len;
  int ret;

  src_len = strlen ((const char *) src);
  if (src_len >= BUFSIZ)
    {
      error ("input string too large");
      return -1;
    }
  if (dest_max > BUFSIZ)
    {
      error ("invalid maximum size of destination\nit should be less than %d.", BUFSIZ);
      return -1;
    }
  ret = do_check_and_conv (tmp_dest, src);
  if (strlen ((const char *) tmp_dest) >= dest_max)
    {
      error ("output buffer overflow");
      /* Fall back to the raw input, but never write more than DEST_MAX
         bytes: an unbounded strcpy here could overrun DEST. */
      if (dest_max > 0)
        {
          size_t n = (src_len < dest_max) ? src_len : (size_t) dest_max - 1;

          memcpy (dest, src, n);
          dest[n] = '\0';
        }
      return -1;
    }
  ustrcpy (dest, tmp_dest);
  return ret;
}
585 
586 #if 0
/*
 * Returns the length in bytes of S after conversion with any2eucjp(), or 0
 * when the work buffer cannot be allocated.
 */
unsigned int
strwidth (unsigned char *s)
{
  unsigned char *t;
  unsigned int i;

  t = (unsigned char *) gdMalloc (BUFSIZ);
  if (t == NULL)
    return 0;

  /* any2eucjp() leaves the buffer untouched when it rejects the input, so
     make sure strlen() below always sees a terminated string. */
  t[0] = '\0';
  any2eucjp (t, s, BUFSIZ);
  i = (unsigned int) strlen ((const char *) t);
  gdFree (t);
  return i;
}
599 
600 #ifdef DEBUG
/*
 * Debug driver: reads one line from stdin, prints its length before and
 * after conversion, then prints the converted text.  Returns 1 if the
 * output buffer cannot be allocated, 0 otherwise.
 */
int
main (void)
{
  unsigned char input[BUFSIZ];
  unsigned char *output;
  unsigned char *str;
  int c, i = 0;

  /* Stop at end of line, end of file, or one byte before the buffer is
     full so that the terminator always fits. */
  while (i < BUFSIZ - 1 && (c = fgetc (stdin)) != EOF && c != '\n')
    input[i++] = (unsigned char) c;
  input[i] = '\0';

  printf ("input : %lu bytes\n", (unsigned long) strlen ((const char *) input));
  printf ("output: %u bytes\n", strwidth (input));

  output = (unsigned char *) gdMalloc (BUFSIZ);
  if (output == NULL)
    return 1;

  /* Keep the buffer printable even if any2eucjp() bails out early. */
  output[0] = '\0';
  any2eucjp (output, input, BUFSIZ);
  str = output;
  while (*str != '\0')
    putchar (*(str++));
  putchar ('\n');
  gdFree (output);

  return 0;
}
626 #endif
627 #endif
628