xref: /PHP-7.0/ext/gd/libgd/gdkanji.c (revision 2c16b9cf)
1 
2 /* gdkanji.c (Kanji code converter)                            */
3 /*                 written by Masahito Yamaga (ma@yama-ga.com) */
4 
5 #include <stdio.h>
6 #include <stdlib.h>
7 #include <string.h>
8 #include "gd.h"
9 #include "gdhelpers.h"
10 
11 #include <stdarg.h>
12 #if defined(HAVE_ICONV_H) || defined(HAVE_ICONV)
13 #include <iconv.h>
14 #ifdef HAVE_ERRNO_H
15 #include <errno.h>
16 #endif
17 #endif
18 
19 #if defined(HAVE_ICONV_H) && !defined(HAVE_ICONV)
20 #define HAVE_ICONV 1
21 #endif
22 
/* Prefix attached to every diagnostic emitted from this file. */
#define LIBNAME "any2eucjp()"

/*
 * DOS/Windows toolchains resolve an ambiguous EUC-or-SJIS guess in favour
 * of Shift_JIS, unless the build has already defined SJISPRE itself.
 */
#if !defined(SJISPRE) && \
    (defined(__MSC__) || defined(__BORLANDC__) || defined(__TURBOC__) || \
     defined(_Windows) || defined(MSDOS))
#define SJISPRE 1
#endif

/*
 * Discard any TRUE/FALSE inherited from earlier headers before defining
 * our own; #undef on a name that is not a macro is simply ignored.
 */
#undef TRUE
#undef FALSE
#define TRUE  1
#define FALSE 0

/* Encoding identifiers used by DetectKanjiCode() and its callers. */
#define NEW       1	/* ESC $ B seen */
#define OLD       2	/* ESC $ @ seen */
#define ESCI      3	/* ESC ( I seen */
#define NEC       4	/* ESC K seen */
#define EUC       5
#define SJIS      6
#define EUCORSJIS 7	/* could be either; not decided yet */
#define ASCII     8	/* nothing but plain bytes seen so far */

/* Source encoding names handed to do_convert(); EUCSTR is the target. */
#define NEWJISSTR "JIS7"
#define OLDJISSTR "jis"
#define EUCSTR    "eucJP"
#define SJISSTR   "SJIS"

/* Control bytes: escape, and the single-shift-2 prefix compared against below. */
#define ESC 27
#define SS2 142
57 
/*
 * Trace helper.  When DEBUG is defined, prints "<LIBNAME>: <message>\n" to
 * stdout using printf-style formatting; otherwise it does nothing.
 */
static void
debug (const char *format, ...)
{
#ifdef DEBUG
  va_list ap;

  fputs (LIBNAME ": ", stdout);
  va_start (ap, format);
  vfprintf (stdout, format, ap);
  va_end (ap);
  fputc ('\n', stdout);
#else
  (void) format;
#endif
}
71 
72 static void
error(const char * format,...)73 error (const char *format,...)
74 {
75 	va_list args;
76 	char *tmp;
77 
78 	va_start(args, format);
79 	vspprintf(&tmp, 0, format, args);
80 	va_end(args);
81 	php_error_docref(NULL, E_WARNING, "%s: %s", LIBNAME, tmp);
82 	efree(tmp);
83 }
84 
85 /* DetectKanjiCode() derived from DetectCodeType() by Ken Lunde. */
86 
87 static int
DetectKanjiCode(unsigned char * str)88 DetectKanjiCode (unsigned char *str)
89 {
90   static int whatcode = ASCII;
91   int oldcode = ASCII;
92   int c, i;
93   char *lang = NULL;
94 
95   c = '\1';
96   i = 0;
97 
98   if (whatcode != EUCORSJIS && whatcode != ASCII)
99     {
100       oldcode = whatcode;
101       whatcode = ASCII;
102     }
103 
104   while ((whatcode == EUCORSJIS || whatcode == ASCII) && c != '\0')
105     {
106       if ((c = str[i++]) != '\0')
107 	{
108 	  if (c == ESC)
109 	    {
110 	      c = str[i++];
111 	      if (c == '$')
112 		{
113 		  c = str[i++];
114 		  if (c == 'B')
115 		    whatcode = NEW;
116 		  else if (c == '@')
117 		    whatcode = OLD;
118 		}
119 	      else if (c == '(')
120 		{
121 		  c = str[i++];
122 		  if (c == 'I')
123 		    whatcode = ESCI;
124 		}
125 	      else if (c == 'K')
126 		whatcode = NEC;
127 	    }
128 	  else if ((c >= 129 && c <= 141) || (c >= 143 && c <= 159))
129 	    whatcode = SJIS;
130 	  else if (c == SS2)
131 	    {
132 	      c = str[i++];
133 	      if ((c >= 64 && c <= 126) || (c >= 128 && c <= 160) || (c >= 224 && c <= 252))
134 		whatcode = SJIS;
135 	      else if (c >= 161 && c <= 223)
136 		whatcode = EUCORSJIS;
137 	    }
138 	  else if (c >= 161 && c <= 223)
139 	    {
140 	      c = str[i++];
141 	      if (c >= 240 && c <= 254)
142 		whatcode = EUC;
143 	      else if (c >= 161 && c <= 223)
144 		whatcode = EUCORSJIS;
145 	      else if (c >= 224 && c <= 239)
146 		{
147 		  whatcode = EUCORSJIS;
148 		  while (c >= 64 && c != '\0' && whatcode == EUCORSJIS)
149 		    {
150 		      if (c >= 129)
151 			{
152 			  if (c <= 141 || (c >= 143 && c <= 159))
153 			    whatcode = SJIS;
154 			  else if (c >= 253 && c <= 254)
155 			    whatcode = EUC;
156 			}
157 		      c = str[i++];
158 		    }
159 		}
160 	      else if (c <= 159)
161 		whatcode = SJIS;
162 	    }
163 	  else if (c >= 240 && c <= 254)
164 	    whatcode = EUC;
165 	  else if (c >= 224 && c <= 239)
166 	    {
167 	      c = str[i++];
168 	      if ((c >= 64 && c <= 126) || (c >= 128 && c <= 160))
169 		whatcode = SJIS;
170 	      else if (c >= 253 && c <= 254)
171 		whatcode = EUC;
172 	      else if (c >= 161 && c <= 252)
173 		whatcode = EUCORSJIS;
174 	    }
175 	}
176     }
177 
178 #ifdef DEBUG
179   if (whatcode == ASCII)
180     debug ("Kanji code not included.");
181   else if (whatcode == EUCORSJIS)
182     debug ("Kanji code not detected.");
183   else
184     debug ("Kanji code detected at %d byte.", i);
185 #endif
186 
187   if (whatcode == EUCORSJIS && oldcode != ASCII)
188     whatcode = oldcode;
189 
190   if (whatcode == EUCORSJIS)
191     {
192       if (getenv ("LC_ALL"))
193 	lang = getenv ("LC_ALL");
194       else if (getenv ("LC_CTYPE"))
195 	lang = getenv ("LC_CTYPE");
196       else if (getenv ("LANG"))
197 	lang = getenv ("LANG");
198 
199       if (lang)
200 	{
201 	  if (strcmp (lang, "ja_JP.SJIS") == 0 ||
202 #ifdef hpux
203 	      strcmp (lang, "japanese") == 0 ||
204 #endif
205 	      strcmp (lang, "ja_JP.mscode") == 0 ||
206 	      strcmp (lang, "ja_JP.PCK") == 0)
207 	    whatcode = SJIS;
208 	  else if (strncmp (lang, "ja", 2) == 0)
209 #ifdef SJISPRE
210 	    whatcode = SJIS;
211 #else
212 	    whatcode = EUC;
213 #endif
214 	}
215     }
216 
217   if (whatcode == EUCORSJIS)
218 #ifdef SJISPRE
219     whatcode = SJIS;
220 #else
221     whatcode = EUC;
222 #endif
223 
224   return whatcode;
225 }
226 
227 /* SJIStoJIS() is sjis2jis() by Ken Lunde. */
228 
/*
 * Converts one Shift_JIS double-byte character to its JIS X 0208 code, in
 * place.
 *
 * p1: in  - Shift_JIS lead byte  (only the low 8 bits are examined)
 *     out - JIS first byte
 * p2: in  - Shift_JIS trail byte (only the low 8 bits are examined)
 *     out - JIS second byte
 *
 * The row is computed with a multiplication rather than a left shift: for a
 * lead byte below the row offset (e.g. 0xA0, or a zero byte from malformed
 * input) the intermediate value is negative, and left-shifting a negative
 * int is undefined behaviour in C.  The numeric result for valid input is
 * unchanged.
 */
static void
SJIStoJIS (int *p1, int *p2)
{
  const unsigned char lead = (unsigned char) *p1;
  const unsigned char trail = (unsigned char) *p2;
  /* Trail bytes below 159 belong to the odd row of the pair. */
  const int odd_row = (trail < 159);
  const int row_offset = (lead < 160) ? 112 : 176;
  int cell_offset;

  if (odd_row)
    cell_offset = (trail > 127) ? 32 : 31;	/* 0x7F is skipped in SJIS */
  else
    cell_offset = 126;

  *p1 = (lead - row_offset) * 2 - odd_row;
  *p2 -= cell_offset;
}
241 
242 /* han2zen() was derived from han2zen() written by Ken Lunde. */
243 
/* Half-width katakana bytes that can take a voiced (dakuten) mark. */
#define IS_DAKU(c) (((c) >= 182 && (c) <= 196) || ((c) >= 202 && (c) <= 206) || ((c) == 179))
/* Half-width katakana bytes that can take a semi-voiced (handakuten) mark. */
#define IS_HANDAKU(c) ((c) >= 202 && (c) <= 206)

/*
 * Converts a half-width katakana byte to the Shift_JIS code of its
 * full-width form, in place.
 *
 * p1: in  - half-width katakana byte, 161..223
 *     out - Shift_JIS lead byte of the full-width character
 * p2: in  - 222 (dakuten) or 223 (handakuten) if such a mark follows the
 *           kana, any other value otherwise
 *     out - Shift_JIS trail byte; the mark is folded into the character
 *           when the kana can carry it
 *
 * If *p1 is outside 161..223 there is no table entry for it, so both values
 * are left untouched instead of indexing outside the table.
 */
static void
han2zen (int *p1, int *p2)
{
  /* Full-width Shift_JIS pairs, indexed by (half-width byte - 161). */
  static const unsigned char mtable[][2] = {
    {129, 66}, {129, 117}, {129, 118}, {129, 65},	/* 161..164 */
    {129, 69}, {131, 146}, {131, 64}, {131, 66},	/* 165..168 */
    {131, 68}, {131, 70}, {131, 72}, {131, 131},	/* 169..172 */
    {131, 133}, {131, 135}, {131, 98}, {129, 91},	/* 173..176 */
    {131, 65}, {131, 67}, {131, 69}, {131, 71},		/* 177..180 */
    {131, 73}, {131, 74}, {131, 76}, {131, 78},		/* 181..184 */
    {131, 80}, {131, 82}, {131, 84}, {131, 86},		/* 185..188 */
    {131, 88}, {131, 90}, {131, 92}, {131, 94},		/* 189..192 */
    {131, 96}, {131, 99}, {131, 101}, {131, 103},	/* 193..196 */
    {131, 105}, {131, 106}, {131, 107}, {131, 108},	/* 197..200 */
    {131, 109}, {131, 110}, {131, 113}, {131, 116},	/* 201..204 */
    {131, 119}, {131, 122}, {131, 125}, {131, 126},	/* 205..208 */
    {131, 128}, {131, 129}, {131, 130}, {131, 132},	/* 209..212 */
    {131, 134}, {131, 136}, {131, 137}, {131, 138},	/* 213..216 */
    {131, 139}, {131, 140}, {131, 141}, {131, 143},	/* 217..220 */
    {131, 147}, {129, 74}, {129, 75}			/* 221..223 */
  };
  const int entries = (int) (sizeof mtable / sizeof mtable[0]);
  const int kana = *p1;
  int daku;
  int handaku;

  /* Reject bytes with no table entry; the lookup below would be out of
     bounds for them. */
  if (kana < 161 || kana - 161 >= entries)
    return;

  daku = (*p2 == 222 && IS_DAKU (kana));		/* Daku-ten */
  handaku = (!daku && *p2 == 223 && IS_HANDAKU (kana));	/* Han-daku-ten */

  *p1 = mtable[kana - 161][0];
  *p2 = mtable[kana - 161][1];

  if (daku)
    {
      /* The voiced form is the next code, except for the two trail values
         that map to 148 instead. */
      if ((*p2 >= 74 && *p2 <= 103) || (*p2 >= 110 && *p2 <= 122))
	(*p2)++;
      else if (*p2 == 131 || *p2 == 69)
	*p2 = 148;
    }
  else if (handaku && *p2 >= 110 && *p2 <= 122)
    {
      /* The semi-voiced form sits two codes after the plain one. */
      (*p2) += 2;
    }
}
338 
339 /* Recast strcpy to handle unsigned chars used below. */
340 #define ustrcpy(A,B) (strcpy((char*)(A),(const char*)(B)))
341 
342 static void
do_convert(unsigned char * to,unsigned char * from,const char * code)343 do_convert (unsigned char *to, unsigned char *from, const char *code)
344 {
345 #ifdef HAVE_ICONV
346   iconv_t cd;
347   size_t from_len, to_len;
348 
349   if ((cd = iconv_open (EUCSTR, code)) == (iconv_t) - 1)
350     {
351       error ("iconv_open() error");
352 #ifdef HAVE_ERRNO_H
353       if (errno == EINVAL)
354 	error ("invalid code specification: \"%s\" or \"%s\"",
355 	       EUCSTR, code);
356 #endif
357       strcpy ((char *) to, (const char *) from);
358       return;
359     }
360 
361   from_len = strlen ((const char *) from) + 1;
362   to_len = BUFSIZ;
363 
364   if ((int) iconv(cd, (char **) &from, &from_len, (char **) &to, &to_len) == -1)
365     {
366 #ifdef HAVE_ERRNO_H
367       if (errno == EINVAL)
368 	error ("invalid end of input string");
369       else if (errno == EILSEQ)
370 	error ("invalid code in input string");
371       else if (errno == E2BIG)
372 	error ("output buffer overflow at do_convert()");
373       else
374 #endif
375 	error ("something happen");
376       strcpy ((char *) to, (const char *) from);
377       return;
378     }
379 
380   if (iconv_close (cd) != 0)
381     {
382       error ("iconv_close() error");
383     }
384 #else
385   int p1, p2, i, j;
386   int jisx0208 = FALSE;
387   int hankaku = FALSE;
388 
389   j = 0;
390   if (strcmp (code, NEWJISSTR) == 0 || strcmp (code, OLDJISSTR) == 0)
391     {
392       for (i = 0; from[i] != '\0' && j < BUFSIZ; i++)
393 	{
394 	  if (from[i] == ESC)
395 	    {
396 	      i++;
397 	      if (from[i] == '$')
398 		{
399 		  jisx0208 = TRUE;
400 		  hankaku = FALSE;
401 		  i++;
402 		}
403 	      else if (from[i] == '(')
404 		{
405 		  jisx0208 = FALSE;
406 		  i++;
407 		  if (from[i] == 'I')	/* Hankaku Kana */
408 		    hankaku = TRUE;
409 		  else
410 		    hankaku = FALSE;
411 		}
412 	    }
413 	  else
414 	    {
415 	      if (jisx0208)
416 		to[j++] = from[i] + 128;
417 	      else if (hankaku)
418 		{
419 		  to[j++] = SS2;
420 		  to[j++] = from[i] + 128;
421 		}
422 	      else
423 		to[j++] = from[i];
424 	    }
425 	}
426     }
427   else if (strcmp (code, SJISSTR) == 0)
428     {
429       for (i = 0; from[i] != '\0' && j < BUFSIZ; i++)
430 	{
431 	  p1 = from[i];
432 	  if (p1 < 127)
433 	    to[j++] = p1;
434 	  else if ((p1 >= 161) && (p1 <= 223))
435 	    {			/* Hankaku Kana */
436 	      to[j++] = SS2;
437 	      to[j++] = p1;
438 	    }
439 	  else
440 	    {
441 	      p2 = from[++i];
442 	      SJIStoJIS (&p1, &p2);
443 	      to[j++] = p1 + 128;
444 	      to[j++] = p2 + 128;
445 	    }
446 	}
447     }
448   else
449     {
450       error ("invalid code specification: \"%s\"", code);
451       return;
452     }
453 
454   if (j >= BUFSIZ)
455     {
456       error ("output buffer overflow at do_convert()");
457       ustrcpy (to, from);
458     }
459   else
460     to[j] = '\0';
461 #endif /* HAVE_ICONV */
462 }
463 
464 static int
do_check_and_conv(unsigned char * to,unsigned char * from)465 do_check_and_conv (unsigned char *to, unsigned char *from)
466 {
467   static unsigned char tmp[BUFSIZ];
468   int p1, p2, i, j;
469   int kanji = TRUE;
470 
471   switch (DetectKanjiCode (from))
472     {
473     case NEW:
474       debug ("Kanji code is New JIS.");
475       do_convert (tmp, from, NEWJISSTR);
476       break;
477     case OLD:
478       debug ("Kanji code is Old JIS.");
479       do_convert (tmp, from, OLDJISSTR);
480       break;
481     case ESCI:
482       debug ("This string includes Hankaku-Kana (jisx0201) escape sequence [ESC] + ( + I.");
483       do_convert (tmp, from, NEWJISSTR);
484       break;
485     case NEC:
486       debug ("Kanji code is NEC Kanji.");
487       error ("cannot convert NEC Kanji.");
488       ustrcpy (tmp, from);
489       kanji = FALSE;
490       break;
491     case EUC:
492       debug ("Kanji code is EUC.");
493       ustrcpy (tmp, from);
494       break;
495     case SJIS:
496       debug ("Kanji code is SJIS.");
497       do_convert (tmp, from, SJISSTR);
498       break;
499     case EUCORSJIS:
500       debug ("Kanji code is EUC or SJIS.");
501       ustrcpy (tmp, from);
502       kanji = FALSE;
503       break;
504     case ASCII:
505       debug ("This is ASCII string.");
506       ustrcpy (tmp, from);
507       kanji = FALSE;
508       break;
509     default:
510       debug ("This string includes unknown code.");
511       ustrcpy (tmp, from);
512       kanji = FALSE;
513       break;
514     }
515 
516   /* Hankaku Kana ---> Zenkaku Kana */
517   if (kanji)
518     {
519       j = 0;
520       for (i = 0; tmp[i] != '\0' && j < BUFSIZ; i++)
521 	{
522 	  if (tmp[i] == SS2)
523 	    {
524 	      p1 = tmp[++i];
525 	      if (tmp[i + 1] == SS2)
526 		{
527 		  p2 = tmp[i + 2];
528 		  if (p2 == 222 || p2 == 223)
529 		    i += 2;
530 		  else
531 		    p2 = 0;
532 		}
533 	      else
534 		p2 = 0;
535 	      han2zen (&p1, &p2);
536 	      SJIStoJIS (&p1, &p2);
537 	      to[j++] = p1 + 128;
538 	      to[j++] = p2 + 128;
539 	    }
540 	  else
541 	    to[j++] = tmp[i];
542 	}
543 
544       if (j >= BUFSIZ)
545 	{
546 	  error ("output buffer overflow at Hankaku --> Zenkaku");
547 	  ustrcpy (to, tmp);
548 	}
549       else
550 	to[j] = '\0';
551     }
552   else
553     ustrcpy (to, tmp);
554 
555   return kanji;
556 }
557 
/*
 * Converts the NUL-terminated string `src` (New/Old JIS, Shift_JIS, EUC-JP
 * or ASCII, auto-detected) to EUC-JP and stores it in `dest`.
 *
 * dest     - output buffer of at least dest_max bytes
 * src      - input string, must be shorter than BUFSIZ
 * dest_max - capacity of dest in bytes, at most BUFSIZ
 *
 * Returns the value of do_check_and_conv() on success (TRUE if the input
 * was treated as Japanese text, FALSE if it was copied through), or -1
 * after raising a warning when an argument is invalid or the result does
 * not fit.
 *
 * When the converted text does not fit, `dest` receives as much of the
 * unconverted `src` as fits, always NUL-terminated and never exceeding
 * dest_max bytes; the cut may fall inside a multi-byte character.  On the
 * other error returns `dest` is not written.
 *
 * Uses a static scratch buffer, so the function is not reentrant.
 */
int
any2eucjp (unsigned char *dest, unsigned char *src, unsigned int dest_max)
{
  static unsigned char tmp_dest[BUFSIZ];
  size_t len;
  int ret;

  if (dest == NULL || src == NULL)
    {
      error ("invalid NULL argument");
      return -1;
    }
  if (strlen ((const char *) src) >= BUFSIZ)
    {
      error ("input string too large");
      return -1;
    }
  if (dest_max > BUFSIZ)
    {
      error ("invalid maximum size of destination\nit should be less than %d.", BUFSIZ);
      return -1;
    }
  ret = do_check_and_conv (tmp_dest, src);
  if (strlen ((const char *) tmp_dest) >= dest_max)
    {
      error ("output buffer overflow");
      /* Fall back to the raw input, but never write past dest_max: src is
         only known to be shorter than BUFSIZ, not shorter than dest_max. */
      if (dest_max > 0)
	{
	  len = strlen ((const char *) src);
	  if (len >= dest_max)
	    len = dest_max - 1;
	  memcpy (dest, src, len);
	  dest[len] = '\0';
	}
      return -1;
    }
  ustrcpy (dest, tmp_dest);
  return ret;
}
584 
#if 0
/*
 * Disabled sample code: strwidth() reports the byte length of `s` after
 * conversion to EUC-JP, and main() is a small DEBUG-only driver that
 * converts one line read from stdin.
 */
unsigned int
strwidth (unsigned char *s)
{
  unsigned char *t;
  unsigned int i;

  t = (unsigned char *) gdMalloc (BUFSIZ);
  if (t == NULL)
    return 0;
  /* any2eucjp() can return without writing its output; start empty so the
     strlen() below never reads uninitialised memory. */
  t[0] = '\0';
  any2eucjp (t, s, BUFSIZ);
  i = (unsigned int) strlen ((const char *) t);
  gdFree (t);
  return i;
}

#ifdef DEBUG
int
main (void)
{
  unsigned char input[BUFSIZ];
  unsigned char *output;
  unsigned char *str;
  int c, i = 0;

  /* Leave room for the terminator and stop at end of input as well as at
     end of line. */
  while (i < BUFSIZ - 1 && (c = fgetc (stdin)) != EOF && c != '\n')
    input[i++] = (unsigned char) c;
  input[i] = '\0';

  printf ("input : %lu bytes\n", (unsigned long) strlen ((const char *) input));
  printf ("output: %u bytes\n", strwidth (input));

  output = (unsigned char *) gdMalloc (BUFSIZ);
  if (output == NULL)
    return 1;
  output[0] = '\0';
  any2eucjp (output, input, BUFSIZ);
  str = output;
  while (*str != '\0')
    putchar (*(str++));
  putchar ('\n');
  gdFree (output);

  return 0;
}
#endif
#endif
627