/* xref: /PHP-5.3/ext/mbstring/oniguruma/enc/gb18030.c (revision 3e1c0517) */
/**********************************************************************
  gb18030.c -  Oniguruma (regular expression library)
**********************************************************************/
/*-
 * Copyright (c) 2005  KUBO Takehiro <kubo AT jiubao DOT org>
 *                     K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
30 
31 #include "regenc.h"
32 
/*
 * DEBUG_GB18030(arg): trace hook for the GB18030 code in this file.
 *
 * `arg` is a complete, parenthesized printf argument list, e.g.
 *   DEBUG_GB18030(("state %d\n", state));
 *
 * With the condition below set to 1 the macro expands to nothing, so the
 * argument list is discarded and never evaluated.  Set the condition to 0
 * to expand every call to `printf arg` instead.
 */
#if 1
#define DEBUG_GB18030(arg)
#else
#define DEBUG_GB18030(arg) printf arg
#endif
38 
/*
 * Byte classes stored in GB18030_MAP.  The comment on each constant lists
 * the positions a byte of that class can occupy inside a character.
 * The values are spelled out because they are used as table indexes.
 */
enum {
  C1 = 0, /* one-byte char */
  C2 = 1, /* one-byte or second of two-byte char */
  C4 = 2, /* one-byte or second or fourth of four-byte char */
  CM = 3  /* first of two- or four-byte char, second of two-byte char,
             or (per the "CM C4 CM C4" handling below) third of a
             four-byte char */
};
45 
46 static const char GB18030_MAP[] = {
47   C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1,
48   C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1,
49   C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1,
50   C4, C4, C4, C4, C4, C4, C4, C4, C4, C4, C1, C1, C1, C1, C1, C1,
51   C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2,
52   C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2,
53   C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2,
54   C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C1,
55   C2, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
56   CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
57   CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
58   CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
59   CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
60   CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
61   CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
62   CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, C1
63 };
64 
65 static int
gb18030_mbc_enc_len(const UChar * p)66 gb18030_mbc_enc_len(const UChar* p)
67 {
68   if (GB18030_MAP[*p] != CM)
69     return 1;
70   p++;
71   if (GB18030_MAP[*p] == C4)
72     return 4;
73   if (GB18030_MAP[*p] == C1)
74     return 1; /* illegal sequence */
75   return 2;
76 }
77 
78 static OnigCodePoint
gb18030_mbc_to_code(const UChar * p,const UChar * end)79 gb18030_mbc_to_code(const UChar* p, const UChar* end)
80 {
81   return onigenc_mbn_mbc_to_code(ONIG_ENCODING_GB18030, p, end);
82 }
83 
84 static int
gb18030_code_to_mbc(OnigCodePoint code,UChar * buf)85 gb18030_code_to_mbc(OnigCodePoint code, UChar *buf)
86 {
87   return onigenc_mb4_code_to_mbc(ONIG_ENCODING_GB18030, code, buf);
88 }
89 
90 static int
gb18030_mbc_to_normalize(OnigAmbigType flag,const UChar ** pp,const UChar * end,UChar * lower)91 gb18030_mbc_to_normalize(OnigAmbigType flag, const UChar** pp, const UChar* end,
92                        UChar* lower)
93 {
94   return onigenc_mbn_mbc_to_normalize(ONIG_ENCODING_GB18030, flag,
95                                       pp, end, lower);
96 }
97 
98 static int
gb18030_is_mbc_ambiguous(OnigAmbigType flag,const UChar ** pp,const UChar * end)99 gb18030_is_mbc_ambiguous(OnigAmbigType flag, const UChar** pp, const UChar* end)
100 {
101   return onigenc_mbn_is_mbc_ambiguous(ONIG_ENCODING_GB18030, flag, pp, end);
102 }
103 
104 static int
gb18030_is_code_ctype(OnigCodePoint code,unsigned int ctype)105 gb18030_is_code_ctype(OnigCodePoint code, unsigned int ctype)
106 {
107   return onigenc_mb4_is_code_ctype(ONIG_ENCODING_GB18030, code, ctype);
108 }
109 
/*
 * States of the backward scan in gb18030_left_adjust_char_head().
 * Each name describes the run of byte classes seen so far, reading from
 * the scan position back towards the start of the buffer:
 *   one_X      a single byte of class X
 *   odd_/even_ parity of a repeated run (of CM bytes, or of byte pairs)
 *   CX         C2, C4 or CM
 * The numeric order of the constants is significant and must not change.
 */
enum state {
  S_START,
  S_one_C2,
  S_one_C4,
  S_one_CM,

  S_odd_CM_one_CX,
  S_even_CM_one_CX,

  /* CMC4 : pair of "CM C4" */
  S_one_CMC4,
  S_odd_CMC4,
  S_one_C4_odd_CMC4,
  S_even_CMC4,
  S_one_C4_even_CMC4,

  S_odd_CM_odd_CMC4,
  S_even_CM_odd_CMC4,

  S_odd_CM_even_CMC4,
  S_even_CM_even_CMC4,

  /* C4CM : pair of "C4 CM" */
  S_odd_C4CM,
  S_one_CM_odd_C4CM,
  S_even_C4CM,
  S_one_CM_even_C4CM,

  S_even_CM_odd_C4CM,
  S_odd_CM_odd_C4CM,
  S_even_CM_even_C4CM,
  S_odd_CM_even_C4CM
};
143 
144 static UChar*
gb18030_left_adjust_char_head(const UChar * start,const UChar * s)145 gb18030_left_adjust_char_head(const UChar* start, const UChar* s)
146 {
147   const UChar *p;
148   enum state state = S_START;
149 
150   DEBUG_GB18030(("----------------\n"));
151   for (p = s; p >= start; p--) {
152     DEBUG_GB18030(("state %d --(%02x)-->\n", state, *p));
153     switch (state) {
154     case S_START:
155       switch (GB18030_MAP[*p]) {
156       case C1:
157 	return (UChar *)s;
158       case C2:
159 	state = S_one_C2; /* C2 */
160 	break;
161       case C4:
162 	state = S_one_C4; /* C4 */
163 	break;
164       case CM:
165 	state = S_one_CM; /* CM */
166 	break;
167       }
168       break;
169     case S_one_C2: /* C2 */
170       switch (GB18030_MAP[*p]) {
171       case C1:
172       case C2:
173       case C4:
174 	return (UChar *)s;
175       case CM:
176 	state = S_odd_CM_one_CX; /* CM C2 */
177 	break;
178       }
179       break;
180     case S_one_C4: /* C4 */
181       switch (GB18030_MAP[*p]) {
182       case C1:
183       case C2:
184       case C4:
185 	return (UChar *)s;
186       case CM:
187 	state = S_one_CMC4;
188 	break;
189       }
190       break;
191     case S_one_CM: /* CM */
192       switch (GB18030_MAP[*p]) {
193       case C1:
194       case C2:
195 	return (UChar *)s;
196       case C4:
197 	state = S_odd_C4CM;
198 	break;
199       case CM:
200 	state = S_odd_CM_one_CX; /* CM CM */
201 	break;
202       }
203       break;
204 
205     case S_odd_CM_one_CX: /* CM C2 */ /* CM CM */ /* CM CM CM C4 */
206       switch (GB18030_MAP[*p]) {
207       case C1:
208       case C2:
209       case C4:
210 	return (UChar *)(s - 1);
211       case CM:
212 	state = S_even_CM_one_CX;
213 	break;
214       }
215       break;
216     case S_even_CM_one_CX: /* CM CM C2 */ /* CM CM CM */ /* CM CM C4 */
217       switch (GB18030_MAP[*p]) {
218       case C1:
219       case C2:
220       case C4:
221 	return (UChar *)s;
222       case CM:
223 	state = S_odd_CM_one_CX;
224 	break;
225       }
226       break;
227 
228     case S_one_CMC4: /* CM C4 */
229       switch (GB18030_MAP[*p]) {
230       case C1:
231       case C2:
232 	return (UChar *)(s - 1);
233       case C4:
234 	state = S_one_C4_odd_CMC4; /* C4 CM C4 */
235 	break;
236       case CM:
237 	state = S_even_CM_one_CX; /* CM CM C4 */
238 	break;
239       }
240       break;
241     case S_odd_CMC4: /* CM C4 CM C4 CM C4 */
242       switch (GB18030_MAP[*p]) {
243       case C1:
244       case C2:
245 	return (UChar *)(s - 1);
246       case C4:
247 	state = S_one_C4_odd_CMC4;
248 	break;
249       case CM:
250 	state = S_odd_CM_odd_CMC4;
251 	break;
252       }
253       break;
254     case S_one_C4_odd_CMC4: /* C4 CM C4 */
255       switch (GB18030_MAP[*p]) {
256       case C1:
257       case C2:
258       case C4:
259 	return (UChar *)(s - 1);
260       case CM:
261 	state = S_even_CMC4; /* CM C4 CM C4 */
262 	break;
263       }
264       break;
265     case S_even_CMC4: /* CM C4 CM C4 */
266       switch (GB18030_MAP[*p]) {
267       case C1:
268       case C2:
269 	return (UChar *)(s - 3);
270       case C4:
271 	state = S_one_C4_even_CMC4;
272 	break;
273       case CM:
274 	state = S_odd_CM_even_CMC4;
275 	break;
276       }
277       break;
278     case S_one_C4_even_CMC4: /* C4 CM C4 CM C4 */
279       switch (GB18030_MAP[*p]) {
280       case C1:
281       case C2:
282       case C4:
283 	return (UChar *)(s - 3);
284       case CM:
285 	state = S_odd_CMC4;
286 	break;
287       }
288       break;
289 
290     case S_odd_CM_odd_CMC4: /* CM CM C4 CM C4 CM C4 */
291       switch (GB18030_MAP[*p]) {
292       case C1:
293       case C2:
294       case C4:
295 	return (UChar *)(s - 3);
296       case CM:
297 	state = S_even_CM_odd_CMC4;
298 	break;
299       }
300       break;
301     case S_even_CM_odd_CMC4: /* CM CM CM C4 CM C4 CM C4 */
302       switch (GB18030_MAP[*p]) {
303       case C1:
304       case C2:
305       case C4:
306 	return (UChar *)(s - 1);
307       case CM:
308 	state = S_odd_CM_odd_CMC4;
309 	break;
310       }
311       break;
312 
313     case S_odd_CM_even_CMC4: /* CM CM C4 CM C4 */
314       switch (GB18030_MAP[*p]) {
315       case C1:
316       case C2:
317       case C4:
318 	return (UChar *)(s - 1);
319       case CM:
320 	state = S_even_CM_even_CMC4;
321 	break;
322       }
323       break;
324     case S_even_CM_even_CMC4: /* CM CM CM C4 CM C4 */
325       switch (GB18030_MAP[*p]) {
326       case C1:
327       case C2:
328       case C4:
329 	return (UChar *)(s - 3);
330       case CM:
331 	state = S_odd_CM_even_CMC4;
332 	break;
333       }
334       break;
335 
336     case S_odd_C4CM: /* C4 CM */  /* C4 CM C4 CM C4 CM*/
337       switch (GB18030_MAP[*p]) {
338       case C1:
339       case C2:
340       case C4:
341 	return (UChar *)s;
342       case CM:
343 	state = S_one_CM_odd_C4CM; /* CM C4 CM */
344 	break;
345       }
346       break;
347     case S_one_CM_odd_C4CM: /* CM C4 CM */ /* CM C4 CM C4 CM C4 CM */
348       switch (GB18030_MAP[*p]) {
349       case C1:
350       case C2:
351 	return (UChar *)(s - 2); /* |CM C4 CM */
352       case C4:
353 	state = S_even_C4CM;
354 	break;
355       case CM:
356 	state = S_even_CM_odd_C4CM;
357 	break;
358       }
359       break;
360     case S_even_C4CM: /* C4 CM C4 CM */
361       switch (GB18030_MAP[*p]) {
362       case C1:
363       case C2:
364       case C4:
365 	return (UChar *)(s - 2);  /* C4|CM C4 CM */
366       case CM:
367 	state = S_one_CM_even_C4CM;
368 	break;
369       }
370       break;
371     case S_one_CM_even_C4CM: /* CM C4 CM C4 CM */
372       switch (GB18030_MAP[*p]) {
373       case C1:
374       case C2:
375 	return (UChar *)(s - 0);  /*|CM C4 CM C4|CM */
376       case C4:
377 	state = S_odd_C4CM;
378 	break;
379       case CM:
380 	state = S_even_CM_even_C4CM;
381 	break;
382       }
383       break;
384 
385     case S_even_CM_odd_C4CM: /* CM CM C4 CM */
386       switch (GB18030_MAP[*p]) {
387       case C1:
388       case C2:
389       case C4:
390 	return (UChar *)(s - 0); /* |CM CM|C4|CM */
391       case CM:
392 	state = S_odd_CM_odd_C4CM;
393 	break;
394       }
395       break;
396     case S_odd_CM_odd_C4CM: /* CM CM CM C4 CM */
397       switch (GB18030_MAP[*p]) {
398       case C1:
399       case C2:
400       case C4:
401 	return (UChar *)(s - 2); /* |CM CM|CM C4 CM */
402       case CM:
403 	state = S_even_CM_odd_C4CM;
404 	break;
405       }
406       break;
407 
408     case S_even_CM_even_C4CM: /* CM CM C4 CM C4 CM */
409       switch (GB18030_MAP[*p]) {
410       case C1:
411       case C2:
412       case C4:
413 	return (UChar *)(s - 2); /* |CM CM|C4|CM C4 CM */
414       case CM:
415 	state = S_odd_CM_even_C4CM;
416 	break;
417       }
418       break;
419     case S_odd_CM_even_C4CM: /* CM CM CM C4 CM C4 CM */
420       switch (GB18030_MAP[*p]) {
421       case C1:
422       case C2:
423       case C4:
424 	return (UChar *)(s - 0);  /* |CM CM|CM C4 CM C4|CM */
425       case CM:
426 	state = S_even_CM_even_C4CM;
427 	break;
428       }
429       break;
430     }
431   }
432 
433   DEBUG_GB18030(("state %d\n", state));
434   switch (state) {
435   case S_START:             return (UChar *)(s - 0);
436   case S_one_C2:            return (UChar *)(s - 0);
437   case S_one_C4:            return (UChar *)(s - 0);
438   case S_one_CM:            return (UChar *)(s - 0);
439 
440   case S_odd_CM_one_CX:     return (UChar *)(s - 1);
441   case S_even_CM_one_CX:    return (UChar *)(s - 0);
442 
443   case S_one_CMC4:          return (UChar *)(s - 1);
444   case S_odd_CMC4:          return (UChar *)(s - 1);
445   case S_one_C4_odd_CMC4:   return (UChar *)(s - 1);
446   case S_even_CMC4:         return (UChar *)(s - 3);
447   case S_one_C4_even_CMC4:  return (UChar *)(s - 3);
448 
449   case S_odd_CM_odd_CMC4:   return (UChar *)(s - 3);
450   case S_even_CM_odd_CMC4:  return (UChar *)(s - 1);
451 
452   case S_odd_CM_even_CMC4:  return (UChar *)(s - 1);
453   case S_even_CM_even_CMC4: return (UChar *)(s - 3);
454 
455   case S_odd_C4CM:          return (UChar *)(s - 0);
456   case S_one_CM_odd_C4CM:   return (UChar *)(s - 2);
457   case S_even_C4CM:         return (UChar *)(s - 2);
458   case S_one_CM_even_C4CM:  return (UChar *)(s - 0);
459 
460   case S_even_CM_odd_C4CM:  return (UChar *)(s - 0);
461   case S_odd_CM_odd_C4CM:   return (UChar *)(s - 2);
462   case S_even_CM_even_C4CM: return (UChar *)(s - 2);
463   case S_odd_CM_even_C4CM:  return (UChar *)(s - 0);
464   }
465 
466   return (UChar* )s;  /* never come here. (escape warning) */
467 }
468 
469 static int
gb18030_is_allowed_reverse_match(const UChar * s,const UChar * end)470 gb18030_is_allowed_reverse_match(const UChar* s, const UChar* end)
471 {
472   return GB18030_MAP[*s] == C1 ? TRUE : FALSE;
473 }
474 
475 OnigEncodingType OnigEncodingGB18030 = {
476   gb18030_mbc_enc_len,
477   "GB18030",   /* name */
478   4,          /* max enc length */
479   1,          /* min enc length */
480   ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE,
481   {
482       (OnigCodePoint )'\\'                       /* esc */
483     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.'  */
484     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*'  */
485     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */
486     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */
487     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */
488   },
489   onigenc_is_mbc_newline_0x0a,
490   gb18030_mbc_to_code,
491   onigenc_mb4_code_to_mbclen,
492   gb18030_code_to_mbc,
493   gb18030_mbc_to_normalize,
494   gb18030_is_mbc_ambiguous,
495   onigenc_ascii_get_all_pair_ambig_codes,
496   onigenc_nothing_get_all_comp_ambig_codes,
497   gb18030_is_code_ctype,
498   onigenc_not_support_get_ctype_code_range,
499   gb18030_left_adjust_char_head,
500   gb18030_is_allowed_reverse_match
501 };
502