/* xref: /PHP-5.5/ext/mbstring/oniguruma/enc/gb18030.c (revision fe92d64a) */
1 /**********************************************************************
2   gb18030.c -  Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5  * Copyright (c) 2005-2007  KUBO Takehiro <kubo AT jiubao DOT org>
6  *                          K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  */
30 
31 #include "regenc.h"
32 
/*
 * Debug tracing switch for this file.
 *
 * DEBUG_GB18030 takes ONE argument: a complete, parenthesized printf
 * argument list, e.g. DEBUG_GB18030(("state %d\n", state)).
 * Tracing is compiled out by default; change the condition below to 1
 * to forward the argument list to printf.
 */
#if 0
#define DEBUG_GB18030(arg) printf arg
#else
#define DEBUG_GB18030(arg)
#endif
38 
/*
 * Byte classes stored in GB18030_MAP.  Each class names the positions a
 * byte value may take inside a GB18030 character.
 */
enum {
  C1 = 0, /* one-byte char */
  C2 = 1, /* one-byte char, or second byte of a two-byte char */
  C4 = 2, /* one-byte char, or second or fourth byte of a four-byte char */
  CM = 3  /* first byte of a two- or four-byte char,
             or second byte of a two-byte char */
};
45 
46 static const char GB18030_MAP[] = {
47   C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1,
48   C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1,
49   C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1,
50   C4, C4, C4, C4, C4, C4, C4, C4, C4, C4, C1, C1, C1, C1, C1, C1,
51   C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2,
52   C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2,
53   C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2,
54   C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C1,
55   C2, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
56   CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
57   CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
58   CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
59   CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
60   CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
61   CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
62   CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, C1
63 };
64 
65 static int
gb18030_mbc_enc_len(const UChar * p)66 gb18030_mbc_enc_len(const UChar* p)
67 {
68   if (GB18030_MAP[*p] != CM)
69     return 1;
70   p++;
71   if (GB18030_MAP[*p] == C4)
72     return 4;
73   if (GB18030_MAP[*p] == C1)
74     return 1; /* illegal sequence */
75   return 2;
76 }
77 
78 static OnigCodePoint
gb18030_mbc_to_code(const UChar * p,const UChar * end)79 gb18030_mbc_to_code(const UChar* p, const UChar* end)
80 {
81   return onigenc_mbn_mbc_to_code(ONIG_ENCODING_GB18030, p, end);
82 }
83 
84 static int
gb18030_code_to_mbc(OnigCodePoint code,UChar * buf)85 gb18030_code_to_mbc(OnigCodePoint code, UChar *buf)
86 {
87   return onigenc_mb4_code_to_mbc(ONIG_ENCODING_GB18030, code, buf);
88 }
89 
90 static int
gb18030_mbc_case_fold(OnigCaseFoldType flag,const UChar ** pp,const UChar * end,UChar * lower)91 gb18030_mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, const UChar* end,
92                       UChar* lower)
93 {
94   return onigenc_mbn_mbc_case_fold(ONIG_ENCODING_GB18030, flag,
95                                    pp, end, lower);
96 }
97 
#if 0
/*
 * Compiled out: wrapper around onigenc_mbn_is_mbc_ambiguous() for the
 * GB18030 encoding.
 */
static int
gb18030_is_mbc_ambiguous(OnigCaseFoldType flag,
                         const UChar** pp, const UChar* end)
{
  int ambiguous;

  ambiguous = onigenc_mbn_is_mbc_ambiguous(ONIG_ENCODING_GB18030, flag,
                                           pp, end);
  return ambiguous;
}
#endif
106 
107 static int
gb18030_is_code_ctype(OnigCodePoint code,unsigned int ctype)108 gb18030_is_code_ctype(OnigCodePoint code, unsigned int ctype)
109 {
110   return onigenc_mb4_is_code_ctype(ONIG_ENCODING_GB18030, code, ctype);
111 }
112 
/*
 * States of the backward scan done by gb18030_left_adjust_char_head().
 * A state name lists the byte classes seen so far, written in memory
 * order (lowest address first).  "CMC4" stands for the pair "CM C4" and
 * "C4CM" for the pair "C4 CM"; odd/even is the parity of how many such
 * pairs (or leading CM bytes) have been counted.
 */
enum state {
  S_START = 0,          /* nothing examined yet */
  S_one_C2,             /* C2 */
  S_one_C4,             /* C4 */
  S_one_CM,             /* CM */

  S_odd_CM_one_CX,      /* CM C2 */ /* CM CM */ /* CM CM CM C4 */
  S_even_CM_one_CX,     /* CM CM C2 */ /* CM CM CM */ /* CM CM C4 */

  /* CMC4 : pair of "CM C4" */
  S_one_CMC4,           /* CM C4 */
  S_odd_CMC4,           /* CM C4 CM C4 CM C4 */
  S_one_C4_odd_CMC4,    /* C4 CM C4 */
  S_even_CMC4,          /* CM C4 CM C4 */
  S_one_C4_even_CMC4,   /* C4 CM C4 CM C4 */

  S_odd_CM_odd_CMC4,    /* CM CM C4 CM C4 CM C4 */
  S_even_CM_odd_CMC4,   /* CM CM CM C4 CM C4 CM C4 */

  S_odd_CM_even_CMC4,   /* CM CM C4 CM C4 */
  S_even_CM_even_CMC4,  /* CM CM CM C4 CM C4 */

  /* C4CM : pair of "C4 CM" */
  S_odd_C4CM,           /* C4 CM */ /* C4 CM C4 CM C4 CM */
  S_one_CM_odd_C4CM,    /* CM C4 CM */ /* CM C4 CM C4 CM C4 CM */
  S_even_C4CM,          /* C4 CM C4 CM */
  S_one_CM_even_C4CM,   /* CM C4 CM C4 CM */

  S_even_CM_odd_C4CM,   /* CM CM C4 CM */
  S_odd_CM_odd_C4CM,    /* CM CM CM C4 CM */
  S_even_CM_even_C4CM,  /* CM CM C4 CM C4 CM */
  S_odd_CM_even_C4CM    /* CM CM CM C4 CM C4 CM */
};
146 
147 static UChar*
gb18030_left_adjust_char_head(const UChar * start,const UChar * s)148 gb18030_left_adjust_char_head(const UChar* start, const UChar* s)
149 {
150   const UChar *p;
151   enum state state = S_START;
152 
153   DEBUG_GB18030(("----------------\n"));
154   for (p = s; p >= start; p--) {
155     DEBUG_GB18030(("state %d --(%02x)-->\n", state, *p));
156     switch (state) {
157     case S_START:
158       switch (GB18030_MAP[*p]) {
159       case C1:
160 	return (UChar *)s;
161       case C2:
162 	state = S_one_C2; /* C2 */
163 	break;
164       case C4:
165 	state = S_one_C4; /* C4 */
166 	break;
167       case CM:
168 	state = S_one_CM; /* CM */
169 	break;
170       }
171       break;
172     case S_one_C2: /* C2 */
173       switch (GB18030_MAP[*p]) {
174       case C1:
175       case C2:
176       case C4:
177 	return (UChar *)s;
178       case CM:
179 	state = S_odd_CM_one_CX; /* CM C2 */
180 	break;
181       }
182       break;
183     case S_one_C4: /* C4 */
184       switch (GB18030_MAP[*p]) {
185       case C1:
186       case C2:
187       case C4:
188 	return (UChar *)s;
189       case CM:
190 	state = S_one_CMC4;
191 	break;
192       }
193       break;
194     case S_one_CM: /* CM */
195       switch (GB18030_MAP[*p]) {
196       case C1:
197       case C2:
198 	return (UChar *)s;
199       case C4:
200 	state = S_odd_C4CM;
201 	break;
202       case CM:
203 	state = S_odd_CM_one_CX; /* CM CM */
204 	break;
205       }
206       break;
207 
208     case S_odd_CM_one_CX: /* CM C2 */ /* CM CM */ /* CM CM CM C4 */
209       switch (GB18030_MAP[*p]) {
210       case C1:
211       case C2:
212       case C4:
213 	return (UChar *)(s - 1);
214       case CM:
215 	state = S_even_CM_one_CX;
216 	break;
217       }
218       break;
219     case S_even_CM_one_CX: /* CM CM C2 */ /* CM CM CM */ /* CM CM C4 */
220       switch (GB18030_MAP[*p]) {
221       case C1:
222       case C2:
223       case C4:
224 	return (UChar *)s;
225       case CM:
226 	state = S_odd_CM_one_CX;
227 	break;
228       }
229       break;
230 
231     case S_one_CMC4: /* CM C4 */
232       switch (GB18030_MAP[*p]) {
233       case C1:
234       case C2:
235 	return (UChar *)(s - 1);
236       case C4:
237 	state = S_one_C4_odd_CMC4; /* C4 CM C4 */
238 	break;
239       case CM:
240 	state = S_even_CM_one_CX; /* CM CM C4 */
241 	break;
242       }
243       break;
244     case S_odd_CMC4: /* CM C4 CM C4 CM C4 */
245       switch (GB18030_MAP[*p]) {
246       case C1:
247       case C2:
248 	return (UChar *)(s - 1);
249       case C4:
250 	state = S_one_C4_odd_CMC4;
251 	break;
252       case CM:
253 	state = S_odd_CM_odd_CMC4;
254 	break;
255       }
256       break;
257     case S_one_C4_odd_CMC4: /* C4 CM C4 */
258       switch (GB18030_MAP[*p]) {
259       case C1:
260       case C2:
261       case C4:
262 	return (UChar *)(s - 1);
263       case CM:
264 	state = S_even_CMC4; /* CM C4 CM C4 */
265 	break;
266       }
267       break;
268     case S_even_CMC4: /* CM C4 CM C4 */
269       switch (GB18030_MAP[*p]) {
270       case C1:
271       case C2:
272 	return (UChar *)(s - 3);
273       case C4:
274 	state = S_one_C4_even_CMC4;
275 	break;
276       case CM:
277 	state = S_odd_CM_even_CMC4;
278 	break;
279       }
280       break;
281     case S_one_C4_even_CMC4: /* C4 CM C4 CM C4 */
282       switch (GB18030_MAP[*p]) {
283       case C1:
284       case C2:
285       case C4:
286 	return (UChar *)(s - 3);
287       case CM:
288 	state = S_odd_CMC4;
289 	break;
290       }
291       break;
292 
293     case S_odd_CM_odd_CMC4: /* CM CM C4 CM C4 CM C4 */
294       switch (GB18030_MAP[*p]) {
295       case C1:
296       case C2:
297       case C4:
298 	return (UChar *)(s - 3);
299       case CM:
300 	state = S_even_CM_odd_CMC4;
301 	break;
302       }
303       break;
304     case S_even_CM_odd_CMC4: /* CM CM CM C4 CM C4 CM C4 */
305       switch (GB18030_MAP[*p]) {
306       case C1:
307       case C2:
308       case C4:
309 	return (UChar *)(s - 1);
310       case CM:
311 	state = S_odd_CM_odd_CMC4;
312 	break;
313       }
314       break;
315 
316     case S_odd_CM_even_CMC4: /* CM CM C4 CM C4 */
317       switch (GB18030_MAP[*p]) {
318       case C1:
319       case C2:
320       case C4:
321 	return (UChar *)(s - 1);
322       case CM:
323 	state = S_even_CM_even_CMC4;
324 	break;
325       }
326       break;
327     case S_even_CM_even_CMC4: /* CM CM CM C4 CM C4 */
328       switch (GB18030_MAP[*p]) {
329       case C1:
330       case C2:
331       case C4:
332 	return (UChar *)(s - 3);
333       case CM:
334 	state = S_odd_CM_even_CMC4;
335 	break;
336       }
337       break;
338 
339     case S_odd_C4CM: /* C4 CM */  /* C4 CM C4 CM C4 CM*/
340       switch (GB18030_MAP[*p]) {
341       case C1:
342       case C2:
343       case C4:
344 	return (UChar *)s;
345       case CM:
346 	state = S_one_CM_odd_C4CM; /* CM C4 CM */
347 	break;
348       }
349       break;
350     case S_one_CM_odd_C4CM: /* CM C4 CM */ /* CM C4 CM C4 CM C4 CM */
351       switch (GB18030_MAP[*p]) {
352       case C1:
353       case C2:
354 	return (UChar *)(s - 2); /* |CM C4 CM */
355       case C4:
356 	state = S_even_C4CM;
357 	break;
358       case CM:
359 	state = S_even_CM_odd_C4CM;
360 	break;
361       }
362       break;
363     case S_even_C4CM: /* C4 CM C4 CM */
364       switch (GB18030_MAP[*p]) {
365       case C1:
366       case C2:
367       case C4:
368 	return (UChar *)(s - 2);  /* C4|CM C4 CM */
369       case CM:
370 	state = S_one_CM_even_C4CM;
371 	break;
372       }
373       break;
374     case S_one_CM_even_C4CM: /* CM C4 CM C4 CM */
375       switch (GB18030_MAP[*p]) {
376       case C1:
377       case C2:
378 	return (UChar *)(s - 0);  /*|CM C4 CM C4|CM */
379       case C4:
380 	state = S_odd_C4CM;
381 	break;
382       case CM:
383 	state = S_even_CM_even_C4CM;
384 	break;
385       }
386       break;
387 
388     case S_even_CM_odd_C4CM: /* CM CM C4 CM */
389       switch (GB18030_MAP[*p]) {
390       case C1:
391       case C2:
392       case C4:
393 	return (UChar *)(s - 0); /* |CM CM|C4|CM */
394       case CM:
395 	state = S_odd_CM_odd_C4CM;
396 	break;
397       }
398       break;
399     case S_odd_CM_odd_C4CM: /* CM CM CM C4 CM */
400       switch (GB18030_MAP[*p]) {
401       case C1:
402       case C2:
403       case C4:
404 	return (UChar *)(s - 2); /* |CM CM|CM C4 CM */
405       case CM:
406 	state = S_even_CM_odd_C4CM;
407 	break;
408       }
409       break;
410 
411     case S_even_CM_even_C4CM: /* CM CM C4 CM C4 CM */
412       switch (GB18030_MAP[*p]) {
413       case C1:
414       case C2:
415       case C4:
416 	return (UChar *)(s - 2); /* |CM CM|C4|CM C4 CM */
417       case CM:
418 	state = S_odd_CM_even_C4CM;
419 	break;
420       }
421       break;
422     case S_odd_CM_even_C4CM: /* CM CM CM C4 CM C4 CM */
423       switch (GB18030_MAP[*p]) {
424       case C1:
425       case C2:
426       case C4:
427 	return (UChar *)(s - 0);  /* |CM CM|CM C4 CM C4|CM */
428       case CM:
429 	state = S_even_CM_even_C4CM;
430 	break;
431       }
432       break;
433     }
434   }
435 
436   DEBUG_GB18030(("state %d\n", state));
437   switch (state) {
438   case S_START:             return (UChar *)(s - 0);
439   case S_one_C2:            return (UChar *)(s - 0);
440   case S_one_C4:            return (UChar *)(s - 0);
441   case S_one_CM:            return (UChar *)(s - 0);
442 
443   case S_odd_CM_one_CX:     return (UChar *)(s - 1);
444   case S_even_CM_one_CX:    return (UChar *)(s - 0);
445 
446   case S_one_CMC4:          return (UChar *)(s - 1);
447   case S_odd_CMC4:          return (UChar *)(s - 1);
448   case S_one_C4_odd_CMC4:   return (UChar *)(s - 1);
449   case S_even_CMC4:         return (UChar *)(s - 3);
450   case S_one_C4_even_CMC4:  return (UChar *)(s - 3);
451 
452   case S_odd_CM_odd_CMC4:   return (UChar *)(s - 3);
453   case S_even_CM_odd_CMC4:  return (UChar *)(s - 1);
454 
455   case S_odd_CM_even_CMC4:  return (UChar *)(s - 1);
456   case S_even_CM_even_CMC4: return (UChar *)(s - 3);
457 
458   case S_odd_C4CM:          return (UChar *)(s - 0);
459   case S_one_CM_odd_C4CM:   return (UChar *)(s - 2);
460   case S_even_C4CM:         return (UChar *)(s - 2);
461   case S_one_CM_even_C4CM:  return (UChar *)(s - 0);
462 
463   case S_even_CM_odd_C4CM:  return (UChar *)(s - 0);
464   case S_odd_CM_odd_C4CM:   return (UChar *)(s - 2);
465   case S_even_CM_even_C4CM: return (UChar *)(s - 2);
466   case S_odd_CM_even_C4CM:  return (UChar *)(s - 0);
467   }
468 
469   return (UChar* )s;  /* never come here. (escape warning) */
470 }
471 
472 static int
gb18030_is_allowed_reverse_match(const UChar * s,const UChar * end ARG_UNUSED)473 gb18030_is_allowed_reverse_match(const UChar* s, const UChar* end ARG_UNUSED)
474 {
475   return GB18030_MAP[*s] == C1 ? TRUE : FALSE;
476 }
477 
478 OnigEncodingType OnigEncodingGB18030 = {
479   gb18030_mbc_enc_len,
480   "GB18030",   /* name */
481   4,          /* max enc length */
482   1,          /* min enc length */
483   onigenc_is_mbc_newline_0x0a,
484   gb18030_mbc_to_code,
485   onigenc_mb4_code_to_mbclen,
486   gb18030_code_to_mbc,
487   gb18030_mbc_case_fold,
488   onigenc_ascii_apply_all_case_fold,
489   onigenc_ascii_get_case_fold_codes_by_str,
490   onigenc_minimum_property_name_to_ctype,
491   gb18030_is_code_ctype,
492   onigenc_not_support_get_ctype_code_range,
493   gb18030_left_adjust_char_head,
494   gb18030_is_allowed_reverse_match
495 };
496