xref: /PHP-7.3/ext/mbstring/oniguruma/src/gb18030.c (revision 1979c5d1)
1 /**********************************************************************
2   gb18030.c -  Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5  * Copyright (c) 2005-2019  KUBO Takehiro <kubo AT jiubao DOT org>
6  *                          K.Kosako
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  */
30 
31 #include "regenc.h"
32 
/*
 * Debug trace hook.  DEBUG_GB18030((fmt, ...)) takes a parenthesized
 * printf-style argument list.  Tracing is compiled out by default (the
 * macro expands to nothing); change the condition below to 1 to send the
 * trace to printf.
 */
#if 0
#include <stdio.h>
#define DEBUG_GB18030(arg) printf arg
#else
#define DEBUG_GB18030(arg)
#endif
39 
/*
 * Byte classes stored in GB18030_MAP: the roles a byte value may play
 * inside a GB18030 byte sequence.
 */
enum {
  C1 = 0, /* only ever a one-byte char */
  C2,     /* a one-byte char, or the second byte of a two-byte char */
  C4,     /* a one-byte char, or the second or fourth byte of a four-byte char */
  CM      /* the first byte of a two- or four-byte char, or the second byte
             of a two-byte char */
};
46 
47 static const char GB18030_MAP[] = {
48   C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1,
49   C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1,
50   C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1,
51   C4, C4, C4, C4, C4, C4, C4, C4, C4, C4, C1, C1, C1, C1, C1, C1,
52   C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2,
53   C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2,
54   C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2,
55   C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C1,
56   C2, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
57   CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
58   CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
59   CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
60   CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
61   CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
62   CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
63   CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, C1
64 };
65 
66 static int
gb18030_mbc_enc_len(const UChar * p)67 gb18030_mbc_enc_len(const UChar* p)
68 {
69   if (GB18030_MAP[*p] != CM)
70     return 1;
71 
72   p++;
73   if (GB18030_MAP[*p] == C4)
74     return 4;
75 
76   return 2;
77 }
78 
79 static int
gb18030_code_to_mbclen(OnigCodePoint code)80 gb18030_code_to_mbclen(OnigCodePoint code)
81 {
82        if ((code & 0xff000000) != 0) return 4;
83   else if ((code &   0xff0000) != 0) return ONIGERR_INVALID_CODE_POINT_VALUE;
84   else if ((code &     0xff00) != 0) return 2;
85   else {
86     if (GB18030_MAP[(int )(code & 0xff)] == CM)
87       return ONIGERR_INVALID_CODE_POINT_VALUE;
88 
89     return 1;
90   }
91 }
92 
93 static int
is_valid_mbc_string(const UChar * p,const UChar * end)94 is_valid_mbc_string(const UChar* p, const UChar* end)
95 {
96   while (p < end) {
97     if (*p < 0x80) {
98       p++;
99     }
100     else if (*p == 0x80 || *p == 0xff) {
101       return FALSE;
102     }
103     else {
104       p++;
105       if (p >= end) return FALSE;
106       if (*p < 0x40) {
107         if (*p < 0x30 || *p > 0x39)
108           return FALSE;
109 
110         p++;
111         if (p >= end) return FALSE;
112         if (*p < 0x81 || *p == 0xff) return FALSE;
113 
114         p++;
115         if (p >= end) return FALSE;
116         if (*p < 0x30 || *p > 0x39)
117           return FALSE;
118 
119         p++;
120       }
121       else if (*p == 0x7f || *p == 0xff) {
122         return FALSE;
123       }
124       else {
125         p++;
126       }
127     }
128   }
129 
130   return TRUE;
131 }
132 
133 static OnigCodePoint
gb18030_mbc_to_code(const UChar * p,const UChar * end)134 gb18030_mbc_to_code(const UChar* p, const UChar* end)
135 {
136   return onigenc_mbn_mbc_to_code(ONIG_ENCODING_GB18030, p, end);
137 }
138 
139 static int
gb18030_code_to_mbc(OnigCodePoint code,UChar * buf)140 gb18030_code_to_mbc(OnigCodePoint code, UChar *buf)
141 {
142   return onigenc_mb4_code_to_mbc(ONIG_ENCODING_GB18030, code, buf);
143 }
144 
145 static int
gb18030_mbc_case_fold(OnigCaseFoldType flag,const UChar ** pp,const UChar * end,UChar * lower)146 gb18030_mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, const UChar* end,
147                       UChar* lower)
148 {
149   return onigenc_mbn_mbc_case_fold(ONIG_ENCODING_GB18030, flag,
150                                    pp, end, lower);
151 }
152 
153 static int
gb18030_is_code_ctype(OnigCodePoint code,unsigned int ctype)154 gb18030_is_code_ctype(OnigCodePoint code, unsigned int ctype)
155 {
156   return onigenc_mb4_is_code_ctype(ONIG_ENCODING_GB18030, code, ctype);
157 }
158 
159 enum state {
160   S_START,
161   S_one_C2,
162   S_one_C4,
163   S_one_CM,
164 
165   S_odd_CM_one_CX,
166   S_even_CM_one_CX,
167 
168   /* CMC4 : pair of "CM C4" */
169   S_one_CMC4,
170   S_odd_CMC4,
171   S_one_C4_odd_CMC4,
172   S_even_CMC4,
173   S_one_C4_even_CMC4,
174 
175   S_odd_CM_odd_CMC4,
176   S_even_CM_odd_CMC4,
177 
178   S_odd_CM_even_CMC4,
179   S_even_CM_even_CMC4,
180 
181   /* C4CM : pair of "C4 CM" */
182   S_odd_C4CM,
183   S_one_CM_odd_C4CM,
184   S_even_C4CM,
185   S_one_CM_even_C4CM,
186 
187   S_even_CM_odd_C4CM,
188   S_odd_CM_odd_C4CM,
189   S_even_CM_even_C4CM,
190   S_odd_CM_even_C4CM,
191 };
192 
193 static UChar*
gb18030_left_adjust_char_head(const UChar * start,const UChar * s)194 gb18030_left_adjust_char_head(const UChar* start, const UChar* s)
195 {
196   const UChar *p;
197   enum state state = S_START;
198 
199   DEBUG_GB18030(("----------------\n"));
200   for (p = s; p >= start; p--) {
201     DEBUG_GB18030(("state %d --(%02x)-->\n", state, *p));
202     switch (state) {
203     case S_START:
204       switch (GB18030_MAP[*p]) {
205       case C1:
206         return (UChar *)s;
207       case C2:
208         state = S_one_C2; /* C2 */
209         break;
210       case C4:
211         state = S_one_C4; /* C4 */
212         break;
213       case CM:
214         state = S_one_CM; /* CM */
215         break;
216       }
217       break;
218     case S_one_C2: /* C2 */
219       switch (GB18030_MAP[*p]) {
220       case C1:
221       case C2:
222       case C4:
223         return (UChar *)s;
224       case CM:
225         state = S_odd_CM_one_CX; /* CM C2 */
226         break;
227       }
228       break;
229     case S_one_C4: /* C4 */
230       switch (GB18030_MAP[*p]) {
231       case C1:
232       case C2:
233       case C4:
234         return (UChar *)s;
235       case CM:
236         state = S_one_CMC4;
237         break;
238       }
239       break;
240     case S_one_CM: /* CM */
241       switch (GB18030_MAP[*p]) {
242       case C1:
243       case C2:
244         return (UChar *)s;
245       case C4:
246         state = S_odd_C4CM;
247         break;
248       case CM:
249         state = S_odd_CM_one_CX; /* CM CM */
250         break;
251       }
252       break;
253 
254     case S_odd_CM_one_CX: /* CM C2 */ /* CM CM */ /* CM CM CM C4 */
255       switch (GB18030_MAP[*p]) {
256       case C1:
257       case C2:
258       case C4:
259         return (UChar *)(s - 1);
260       case CM:
261         state = S_even_CM_one_CX;
262         break;
263       }
264       break;
265     case S_even_CM_one_CX: /* CM CM C2 */ /* CM CM CM */ /* CM CM C4 */
266       switch (GB18030_MAP[*p]) {
267       case C1:
268       case C2:
269       case C4:
270         return (UChar *)s;
271       case CM:
272         state = S_odd_CM_one_CX;
273         break;
274       }
275       break;
276 
277     case S_one_CMC4: /* CM C4 */
278       switch (GB18030_MAP[*p]) {
279       case C1:
280       case C2:
281         return (UChar *)(s - 1);
282       case C4:
283         state = S_one_C4_odd_CMC4; /* C4 CM C4 */
284         break;
285       case CM:
286         state = S_even_CM_one_CX; /* CM CM C4 */
287         break;
288       }
289       break;
290     case S_odd_CMC4: /* CM C4 CM C4 CM C4 */
291       switch (GB18030_MAP[*p]) {
292       case C1:
293       case C2:
294         return (UChar *)(s - 1);
295       case C4:
296         state = S_one_C4_odd_CMC4;
297         break;
298       case CM:
299         state = S_odd_CM_odd_CMC4;
300         break;
301       }
302       break;
303     case S_one_C4_odd_CMC4: /* C4 CM C4 */
304       switch (GB18030_MAP[*p]) {
305       case C1:
306       case C2:
307       case C4:
308         return (UChar *)(s - 1);
309       case CM:
310         state = S_even_CMC4; /* CM C4 CM C4 */
311         break;
312       }
313       break;
314     case S_even_CMC4: /* CM C4 CM C4 */
315       switch (GB18030_MAP[*p]) {
316       case C1:
317       case C2:
318         return (UChar *)(s - 3);
319       case C4:
320         state = S_one_C4_even_CMC4;
321         break;
322       case CM:
323         state = S_odd_CM_even_CMC4;
324         break;
325       }
326       break;
327     case S_one_C4_even_CMC4: /* C4 CM C4 CM C4 */
328       switch (GB18030_MAP[*p]) {
329       case C1:
330       case C2:
331       case C4:
332         return (UChar *)(s - 3);
333       case CM:
334         state = S_odd_CMC4;
335         break;
336       }
337       break;
338 
339     case S_odd_CM_odd_CMC4: /* CM CM C4 CM C4 CM C4 */
340       switch (GB18030_MAP[*p]) {
341       case C1:
342       case C2:
343       case C4:
344         return (UChar *)(s - 3);
345       case CM:
346         state = S_even_CM_odd_CMC4;
347         break;
348       }
349       break;
350     case S_even_CM_odd_CMC4: /* CM CM CM C4 CM C4 CM C4 */
351       switch (GB18030_MAP[*p]) {
352       case C1:
353       case C2:
354       case C4:
355         return (UChar *)(s - 1);
356       case CM:
357         state = S_odd_CM_odd_CMC4;
358         break;
359       }
360       break;
361 
362     case S_odd_CM_even_CMC4: /* CM CM C4 CM C4 */
363       switch (GB18030_MAP[*p]) {
364       case C1:
365       case C2:
366       case C4:
367         return (UChar *)(s - 1);
368       case CM:
369         state = S_even_CM_even_CMC4;
370         break;
371       }
372       break;
373     case S_even_CM_even_CMC4: /* CM CM CM C4 CM C4 */
374       switch (GB18030_MAP[*p]) {
375       case C1:
376       case C2:
377       case C4:
378         return (UChar *)(s - 3);
379       case CM:
380         state = S_odd_CM_even_CMC4;
381         break;
382       }
383       break;
384 
385     case S_odd_C4CM: /* C4 CM */  /* C4 CM C4 CM C4 CM*/
386       switch (GB18030_MAP[*p]) {
387       case C1:
388       case C2:
389       case C4:
390         return (UChar *)s;
391       case CM:
392         state = S_one_CM_odd_C4CM; /* CM C4 CM */
393         break;
394       }
395       break;
396     case S_one_CM_odd_C4CM: /* CM C4 CM */ /* CM C4 CM C4 CM C4 CM */
397       switch (GB18030_MAP[*p]) {
398       case C1:
399       case C2:
400         return (UChar *)(s - 2); /* |CM C4 CM */
401       case C4:
402         state = S_even_C4CM;
403         break;
404       case CM:
405         state = S_even_CM_odd_C4CM;
406         break;
407       }
408       break;
409     case S_even_C4CM: /* C4 CM C4 CM */
410       switch (GB18030_MAP[*p]) {
411       case C1:
412       case C2:
413       case C4:
414         return (UChar *)(s - 2);  /* C4|CM C4 CM */
415       case CM:
416         state = S_one_CM_even_C4CM;
417         break;
418       }
419       break;
420     case S_one_CM_even_C4CM: /* CM C4 CM C4 CM */
421       switch (GB18030_MAP[*p]) {
422       case C1:
423       case C2:
424         return (UChar *)(s - 0);  /*|CM C4 CM C4|CM */
425       case C4:
426         state = S_odd_C4CM;
427         break;
428       case CM:
429         state = S_even_CM_even_C4CM;
430         break;
431       }
432       break;
433 
434     case S_even_CM_odd_C4CM: /* CM CM C4 CM */
435       switch (GB18030_MAP[*p]) {
436       case C1:
437       case C2:
438       case C4:
439         return (UChar *)(s - 0); /* |CM CM|C4|CM */
440       case CM:
441         state = S_odd_CM_odd_C4CM;
442         break;
443       }
444       break;
445     case S_odd_CM_odd_C4CM: /* CM CM CM C4 CM */
446       switch (GB18030_MAP[*p]) {
447       case C1:
448       case C2:
449       case C4:
450         return (UChar *)(s - 2); /* |CM CM|CM C4 CM */
451       case CM:
452         state = S_even_CM_odd_C4CM;
453         break;
454       }
455       break;
456 
457     case S_even_CM_even_C4CM: /* CM CM C4 CM C4 CM */
458       switch (GB18030_MAP[*p]) {
459       case C1:
460       case C2:
461       case C4:
462         return (UChar *)(s - 2); /* |CM CM|C4|CM C4 CM */
463       case CM:
464         state = S_odd_CM_even_C4CM;
465         break;
466       }
467       break;
468     case S_odd_CM_even_C4CM: /* CM CM CM C4 CM C4 CM */
469       switch (GB18030_MAP[*p]) {
470       case C1:
471       case C2:
472       case C4:
473         return (UChar *)(s - 0);  /* |CM CM|CM C4 CM C4|CM */
474       case CM:
475         state = S_even_CM_even_C4CM;
476         break;
477       }
478       break;
479     }
480   }
481 
482   DEBUG_GB18030(("state %d\n", state));
483   switch (state) {
484   case S_START:             return (UChar *)(s - 0);
485   case S_one_C2:            return (UChar *)(s - 0);
486   case S_one_C4:            return (UChar *)(s - 0);
487   case S_one_CM:            return (UChar *)(s - 0);
488 
489   case S_odd_CM_one_CX:     return (UChar *)(s - 1);
490   case S_even_CM_one_CX:    return (UChar *)(s - 0);
491 
492   case S_one_CMC4:          return (UChar *)(s - 1);
493   case S_odd_CMC4:          return (UChar *)(s - 1);
494   case S_one_C4_odd_CMC4:   return (UChar *)(s - 1);
495   case S_even_CMC4:         return (UChar *)(s - 3);
496   case S_one_C4_even_CMC4:  return (UChar *)(s - 3);
497 
498   case S_odd_CM_odd_CMC4:   return (UChar *)(s - 3);
499   case S_even_CM_odd_CMC4:  return (UChar *)(s - 1);
500 
501   case S_odd_CM_even_CMC4:  return (UChar *)(s - 1);
502   case S_even_CM_even_CMC4: return (UChar *)(s - 3);
503 
504   case S_odd_C4CM:          return (UChar *)(s - 0);
505   case S_one_CM_odd_C4CM:   return (UChar *)(s - 2);
506   case S_even_C4CM:         return (UChar *)(s - 2);
507   case S_one_CM_even_C4CM:  return (UChar *)(s - 0);
508 
509   case S_even_CM_odd_C4CM:  return (UChar *)(s - 0);
510   case S_odd_CM_odd_C4CM:   return (UChar *)(s - 2);
511   case S_even_CM_even_C4CM: return (UChar *)(s - 2);
512   case S_odd_CM_even_C4CM:  return (UChar *)(s - 0);
513   }
514 
515   return (UChar* )s;  /* never come here. (escape warning) */
516 }
517 
518 static int
gb18030_is_allowed_reverse_match(const UChar * s,const UChar * end ARG_UNUSED)519 gb18030_is_allowed_reverse_match(const UChar* s, const UChar* end ARG_UNUSED)
520 {
521   return GB18030_MAP[*s] == C1 ? TRUE : FALSE;
522 }
523 
524 OnigEncodingType OnigEncodingGB18030 = {
525   gb18030_mbc_enc_len,
526   "GB18030",   /* name */
527   4,          /* max enc length */
528   1,          /* min enc length */
529   onigenc_is_mbc_newline_0x0a,
530   gb18030_mbc_to_code,
531   gb18030_code_to_mbclen,
532   gb18030_code_to_mbc,
533   gb18030_mbc_case_fold,
534   onigenc_ascii_apply_all_case_fold,
535   onigenc_ascii_get_case_fold_codes_by_str,
536   onigenc_minimum_property_name_to_ctype,
537   gb18030_is_code_ctype,
538   onigenc_not_support_get_ctype_code_range,
539   gb18030_left_adjust_char_head,
540   gb18030_is_allowed_reverse_match,
541   NULL, /* init */
542   NULL, /* is_initialized */
543   is_valid_mbc_string,
544   ENC_FLAG_ASCII_COMPATIBLE|ENC_FLAG_SKIP_OFFSET_1,
545   0, 0
546 };
547