1 /**********************************************************************
2 gb18030.c - Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5 * Copyright (c) 2005-2019 KUBO Takehiro <kubo AT jiubao DOT org>
6 * K.Kosako
7 * All rights reserved.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 * SUCH DAMAGE.
29 */
30
31 #include "regenc.h"
32
/*
 * Debug tracing switch.
 * With "#if 1" DEBUG_GB18030(arg) expands to nothing.  Change the condition
 * to 0 to pull in <stdio.h> and make the macro expand to "printf arg".
 * Callers pass the complete printf argument list wrapped in an extra pair
 * of parentheses, e.g. DEBUG_GB18030(("state %d\n", state)).
 */
#if 1
#define DEBUG_GB18030(arg)
#else
#include <stdio.h>
#define DEBUG_GB18030(arg) printf arg
#endif
39
/*
 * Byte classes stored in GB18030_MAP.  A class says which positions inside
 * a GB18030 character a given byte value may occupy.
 */
enum {
  C1 = 0, /* can only be a one-byte char */
  C2 = 1, /* one-byte char, or second byte of a two-byte char */
  C4 = 2, /* one-byte char, or second/fourth byte of a four-byte char */
  CM = 3  /* first byte of a two- or four-byte char,
             or second byte of a two-byte char */
};
46
47 static const char GB18030_MAP[] = {
48 C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1,
49 C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1,
50 C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1,
51 C4, C4, C4, C4, C4, C4, C4, C4, C4, C4, C1, C1, C1, C1, C1, C1,
52 C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2,
53 C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2,
54 C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2,
55 C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C1,
56 C2, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
57 CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
58 CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
59 CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
60 CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
61 CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
62 CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
63 CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, C1
64 };
65
66 static int
gb18030_mbc_enc_len(const UChar * p)67 gb18030_mbc_enc_len(const UChar* p)
68 {
69 if (GB18030_MAP[*p] != CM)
70 return 1;
71
72 p++;
73 if (GB18030_MAP[*p] == C4)
74 return 4;
75
76 return 2;
77 }
78
79 static int
gb18030_code_to_mbclen(OnigCodePoint code)80 gb18030_code_to_mbclen(OnigCodePoint code)
81 {
82 if ((code & 0xff000000) != 0) return 4;
83 else if ((code & 0xff0000) != 0) return ONIGERR_INVALID_CODE_POINT_VALUE;
84 else if ((code & 0xff00) != 0) return 2;
85 else {
86 if (GB18030_MAP[(int )(code & 0xff)] == CM)
87 return ONIGERR_INVALID_CODE_POINT_VALUE;
88
89 return 1;
90 }
91 }
92
93 static int
is_valid_mbc_string(const UChar * p,const UChar * end)94 is_valid_mbc_string(const UChar* p, const UChar* end)
95 {
96 while (p < end) {
97 if (*p < 0x80) {
98 p++;
99 }
100 else if (*p == 0x80 || *p == 0xff) {
101 return FALSE;
102 }
103 else {
104 p++;
105 if (p >= end) return FALSE;
106 if (*p < 0x40) {
107 if (*p < 0x30 || *p > 0x39)
108 return FALSE;
109
110 p++;
111 if (p >= end) return FALSE;
112 if (*p < 0x81 || *p == 0xff) return FALSE;
113
114 p++;
115 if (p >= end) return FALSE;
116 if (*p < 0x30 || *p > 0x39)
117 return FALSE;
118
119 p++;
120 }
121 else if (*p == 0x7f || *p == 0xff) {
122 return FALSE;
123 }
124 else {
125 p++;
126 }
127 }
128 }
129
130 return TRUE;
131 }
132
133 static OnigCodePoint
gb18030_mbc_to_code(const UChar * p,const UChar * end)134 gb18030_mbc_to_code(const UChar* p, const UChar* end)
135 {
136 return onigenc_mbn_mbc_to_code(ONIG_ENCODING_GB18030, p, end);
137 }
138
139 static int
gb18030_code_to_mbc(OnigCodePoint code,UChar * buf)140 gb18030_code_to_mbc(OnigCodePoint code, UChar *buf)
141 {
142 return onigenc_mb4_code_to_mbc(ONIG_ENCODING_GB18030, code, buf);
143 }
144
145 static int
gb18030_mbc_case_fold(OnigCaseFoldType flag,const UChar ** pp,const UChar * end,UChar * lower)146 gb18030_mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, const UChar* end,
147 UChar* lower)
148 {
149 return onigenc_mbn_mbc_case_fold(ONIG_ENCODING_GB18030, flag,
150 pp, end, lower);
151 }
152
153 static int
gb18030_is_code_ctype(OnigCodePoint code,unsigned int ctype)154 gb18030_is_code_ctype(OnigCodePoint code, unsigned int ctype)
155 {
156 return onigenc_mb4_is_code_ctype(ONIG_ENCODING_GB18030, code, ctype);
157 }
158
/*
 * States of the backward scanner in gb18030_left_adjust_char_head().
 * A state name describes the GB18030_MAP classes of the bytes consumed so
 * far.  The example sequences in the comments are written in memory order
 * and end at the byte the scan started from.
 */
enum state {
  S_START = 0,           /* nothing consumed yet */
  S_one_C2,              /* C2 */
  S_one_C4,              /* C4 */
  S_one_CM,              /* CM */

  S_odd_CM_one_CX,       /* CM C2, CM CM, CM CM CM C4 */
  S_even_CM_one_CX,      /* CM CM C2, CM CM CM, CM CM C4 */

  /* CMC4 : pair of "CM C4" */
  S_one_CMC4,            /* CM C4 */
  S_odd_CMC4,            /* CM C4 CM C4 CM C4 */
  S_one_C4_odd_CMC4,     /* C4 CM C4 */
  S_even_CMC4,           /* CM C4 CM C4 */
  S_one_C4_even_CMC4,    /* C4 CM C4 CM C4 */

  S_odd_CM_odd_CMC4,     /* CM CM C4 CM C4 CM C4 */
  S_even_CM_odd_CMC4,    /* CM CM CM C4 CM C4 CM C4 */

  S_odd_CM_even_CMC4,    /* CM CM C4 CM C4 */
  S_even_CM_even_CMC4,   /* CM CM CM C4 CM C4 */

  /* C4CM : pair of "C4 CM" */
  S_odd_C4CM,            /* C4 CM, C4 CM C4 CM C4 CM */
  S_one_CM_odd_C4CM,     /* CM C4 CM, CM C4 CM C4 CM C4 CM */
  S_even_C4CM,           /* C4 CM C4 CM */
  S_one_CM_even_C4CM,    /* CM C4 CM C4 CM */

  S_even_CM_odd_C4CM,    /* CM CM C4 CM */
  S_odd_CM_odd_C4CM,     /* CM CM CM C4 CM */
  S_even_CM_even_C4CM,   /* CM CM C4 CM C4 CM */
  S_odd_CM_even_C4CM     /* CM CM CM C4 CM C4 CM */
};
192
193 static UChar*
gb18030_left_adjust_char_head(const UChar * start,const UChar * s)194 gb18030_left_adjust_char_head(const UChar* start, const UChar* s)
195 {
196 const UChar *p;
197 enum state state = S_START;
198
199 DEBUG_GB18030(("----------------\n"));
200 for (p = s; p >= start; p--) {
201 DEBUG_GB18030(("state %d --(%02x)-->\n", state, *p));
202 switch (state) {
203 case S_START:
204 switch (GB18030_MAP[*p]) {
205 case C1:
206 return (UChar *)s;
207 case C2:
208 state = S_one_C2; /* C2 */
209 break;
210 case C4:
211 state = S_one_C4; /* C4 */
212 break;
213 case CM:
214 state = S_one_CM; /* CM */
215 break;
216 }
217 break;
218 case S_one_C2: /* C2 */
219 switch (GB18030_MAP[*p]) {
220 case C1:
221 case C2:
222 case C4:
223 return (UChar *)s;
224 case CM:
225 state = S_odd_CM_one_CX; /* CM C2 */
226 break;
227 }
228 break;
229 case S_one_C4: /* C4 */
230 switch (GB18030_MAP[*p]) {
231 case C1:
232 case C2:
233 case C4:
234 return (UChar *)s;
235 case CM:
236 state = S_one_CMC4;
237 break;
238 }
239 break;
240 case S_one_CM: /* CM */
241 switch (GB18030_MAP[*p]) {
242 case C1:
243 case C2:
244 return (UChar *)s;
245 case C4:
246 state = S_odd_C4CM;
247 break;
248 case CM:
249 state = S_odd_CM_one_CX; /* CM CM */
250 break;
251 }
252 break;
253
254 case S_odd_CM_one_CX: /* CM C2 */ /* CM CM */ /* CM CM CM C4 */
255 switch (GB18030_MAP[*p]) {
256 case C1:
257 case C2:
258 case C4:
259 return (UChar *)(s - 1);
260 case CM:
261 state = S_even_CM_one_CX;
262 break;
263 }
264 break;
265 case S_even_CM_one_CX: /* CM CM C2 */ /* CM CM CM */ /* CM CM C4 */
266 switch (GB18030_MAP[*p]) {
267 case C1:
268 case C2:
269 case C4:
270 return (UChar *)s;
271 case CM:
272 state = S_odd_CM_one_CX;
273 break;
274 }
275 break;
276
277 case S_one_CMC4: /* CM C4 */
278 switch (GB18030_MAP[*p]) {
279 case C1:
280 case C2:
281 return (UChar *)(s - 1);
282 case C4:
283 state = S_one_C4_odd_CMC4; /* C4 CM C4 */
284 break;
285 case CM:
286 state = S_even_CM_one_CX; /* CM CM C4 */
287 break;
288 }
289 break;
290 case S_odd_CMC4: /* CM C4 CM C4 CM C4 */
291 switch (GB18030_MAP[*p]) {
292 case C1:
293 case C2:
294 return (UChar *)(s - 1);
295 case C4:
296 state = S_one_C4_odd_CMC4;
297 break;
298 case CM:
299 state = S_odd_CM_odd_CMC4;
300 break;
301 }
302 break;
303 case S_one_C4_odd_CMC4: /* C4 CM C4 */
304 switch (GB18030_MAP[*p]) {
305 case C1:
306 case C2:
307 case C4:
308 return (UChar *)(s - 1);
309 case CM:
310 state = S_even_CMC4; /* CM C4 CM C4 */
311 break;
312 }
313 break;
314 case S_even_CMC4: /* CM C4 CM C4 */
315 switch (GB18030_MAP[*p]) {
316 case C1:
317 case C2:
318 return (UChar *)(s - 3);
319 case C4:
320 state = S_one_C4_even_CMC4;
321 break;
322 case CM:
323 state = S_odd_CM_even_CMC4;
324 break;
325 }
326 break;
327 case S_one_C4_even_CMC4: /* C4 CM C4 CM C4 */
328 switch (GB18030_MAP[*p]) {
329 case C1:
330 case C2:
331 case C4:
332 return (UChar *)(s - 3);
333 case CM:
334 state = S_odd_CMC4;
335 break;
336 }
337 break;
338
339 case S_odd_CM_odd_CMC4: /* CM CM C4 CM C4 CM C4 */
340 switch (GB18030_MAP[*p]) {
341 case C1:
342 case C2:
343 case C4:
344 return (UChar *)(s - 3);
345 case CM:
346 state = S_even_CM_odd_CMC4;
347 break;
348 }
349 break;
350 case S_even_CM_odd_CMC4: /* CM CM CM C4 CM C4 CM C4 */
351 switch (GB18030_MAP[*p]) {
352 case C1:
353 case C2:
354 case C4:
355 return (UChar *)(s - 1);
356 case CM:
357 state = S_odd_CM_odd_CMC4;
358 break;
359 }
360 break;
361
362 case S_odd_CM_even_CMC4: /* CM CM C4 CM C4 */
363 switch (GB18030_MAP[*p]) {
364 case C1:
365 case C2:
366 case C4:
367 return (UChar *)(s - 1);
368 case CM:
369 state = S_even_CM_even_CMC4;
370 break;
371 }
372 break;
373 case S_even_CM_even_CMC4: /* CM CM CM C4 CM C4 */
374 switch (GB18030_MAP[*p]) {
375 case C1:
376 case C2:
377 case C4:
378 return (UChar *)(s - 3);
379 case CM:
380 state = S_odd_CM_even_CMC4;
381 break;
382 }
383 break;
384
385 case S_odd_C4CM: /* C4 CM */ /* C4 CM C4 CM C4 CM*/
386 switch (GB18030_MAP[*p]) {
387 case C1:
388 case C2:
389 case C4:
390 return (UChar *)s;
391 case CM:
392 state = S_one_CM_odd_C4CM; /* CM C4 CM */
393 break;
394 }
395 break;
396 case S_one_CM_odd_C4CM: /* CM C4 CM */ /* CM C4 CM C4 CM C4 CM */
397 switch (GB18030_MAP[*p]) {
398 case C1:
399 case C2:
400 return (UChar *)(s - 2); /* |CM C4 CM */
401 case C4:
402 state = S_even_C4CM;
403 break;
404 case CM:
405 state = S_even_CM_odd_C4CM;
406 break;
407 }
408 break;
409 case S_even_C4CM: /* C4 CM C4 CM */
410 switch (GB18030_MAP[*p]) {
411 case C1:
412 case C2:
413 case C4:
414 return (UChar *)(s - 2); /* C4|CM C4 CM */
415 case CM:
416 state = S_one_CM_even_C4CM;
417 break;
418 }
419 break;
420 case S_one_CM_even_C4CM: /* CM C4 CM C4 CM */
421 switch (GB18030_MAP[*p]) {
422 case C1:
423 case C2:
424 return (UChar *)(s - 0); /*|CM C4 CM C4|CM */
425 case C4:
426 state = S_odd_C4CM;
427 break;
428 case CM:
429 state = S_even_CM_even_C4CM;
430 break;
431 }
432 break;
433
434 case S_even_CM_odd_C4CM: /* CM CM C4 CM */
435 switch (GB18030_MAP[*p]) {
436 case C1:
437 case C2:
438 case C4:
439 return (UChar *)(s - 0); /* |CM CM|C4|CM */
440 case CM:
441 state = S_odd_CM_odd_C4CM;
442 break;
443 }
444 break;
445 case S_odd_CM_odd_C4CM: /* CM CM CM C4 CM */
446 switch (GB18030_MAP[*p]) {
447 case C1:
448 case C2:
449 case C4:
450 return (UChar *)(s - 2); /* |CM CM|CM C4 CM */
451 case CM:
452 state = S_even_CM_odd_C4CM;
453 break;
454 }
455 break;
456
457 case S_even_CM_even_C4CM: /* CM CM C4 CM C4 CM */
458 switch (GB18030_MAP[*p]) {
459 case C1:
460 case C2:
461 case C4:
462 return (UChar *)(s - 2); /* |CM CM|C4|CM C4 CM */
463 case CM:
464 state = S_odd_CM_even_C4CM;
465 break;
466 }
467 break;
468 case S_odd_CM_even_C4CM: /* CM CM CM C4 CM C4 CM */
469 switch (GB18030_MAP[*p]) {
470 case C1:
471 case C2:
472 case C4:
473 return (UChar *)(s - 0); /* |CM CM|CM C4 CM C4|CM */
474 case CM:
475 state = S_even_CM_even_C4CM;
476 break;
477 }
478 break;
479 }
480 }
481
482 DEBUG_GB18030(("state %d\n", state));
483 switch (state) {
484 case S_START: return (UChar *)(s - 0);
485 case S_one_C2: return (UChar *)(s - 0);
486 case S_one_C4: return (UChar *)(s - 0);
487 case S_one_CM: return (UChar *)(s - 0);
488
489 case S_odd_CM_one_CX: return (UChar *)(s - 1);
490 case S_even_CM_one_CX: return (UChar *)(s - 0);
491
492 case S_one_CMC4: return (UChar *)(s - 1);
493 case S_odd_CMC4: return (UChar *)(s - 1);
494 case S_one_C4_odd_CMC4: return (UChar *)(s - 1);
495 case S_even_CMC4: return (UChar *)(s - 3);
496 case S_one_C4_even_CMC4: return (UChar *)(s - 3);
497
498 case S_odd_CM_odd_CMC4: return (UChar *)(s - 3);
499 case S_even_CM_odd_CMC4: return (UChar *)(s - 1);
500
501 case S_odd_CM_even_CMC4: return (UChar *)(s - 1);
502 case S_even_CM_even_CMC4: return (UChar *)(s - 3);
503
504 case S_odd_C4CM: return (UChar *)(s - 0);
505 case S_one_CM_odd_C4CM: return (UChar *)(s - 2);
506 case S_even_C4CM: return (UChar *)(s - 2);
507 case S_one_CM_even_C4CM: return (UChar *)(s - 0);
508
509 case S_even_CM_odd_C4CM: return (UChar *)(s - 0);
510 case S_odd_CM_odd_C4CM: return (UChar *)(s - 2);
511 case S_even_CM_even_C4CM: return (UChar *)(s - 2);
512 case S_odd_CM_even_C4CM: return (UChar *)(s - 0);
513 }
514
515 return (UChar* )s; /* never come here. (escape warning) */
516 }
517
518 static int
gb18030_is_allowed_reverse_match(const UChar * s,const UChar * end ARG_UNUSED)519 gb18030_is_allowed_reverse_match(const UChar* s, const UChar* end ARG_UNUSED)
520 {
521 return GB18030_MAP[*s] == C1 ? TRUE : FALSE;
522 }
523
/*
 * Encoding descriptor for GB18030.
 * The initializer is positional, so the order of the entries must match
 * the member order of OnigEncodingType, which is declared outside this
 * file.  Entries named gb18030_* and is_valid_mbc_string are defined above;
 * the onigenc_* entries are shared helpers defined elsewhere.
 */
OnigEncodingType OnigEncodingGB18030 = {
  gb18030_mbc_enc_len,
  "GB18030", /* name */
  4, /* max enc length */
  1, /* min enc length */
  onigenc_is_mbc_newline_0x0a,
  gb18030_mbc_to_code,
  gb18030_code_to_mbclen,
  gb18030_code_to_mbc,
  gb18030_mbc_case_fold,
  onigenc_ascii_apply_all_case_fold,
  onigenc_ascii_get_case_fold_codes_by_str,
  onigenc_minimum_property_name_to_ctype,
  gb18030_is_code_ctype,
  onigenc_not_support_get_ctype_code_range,
  gb18030_left_adjust_char_head,
  gb18030_is_allowed_reverse_match,
  NULL, /* init */
  NULL, /* is_initialized */
  is_valid_mbc_string,
  ENC_FLAG_ASCII_COMPATIBLE|ENC_FLAG_SKIP_OFFSET_1,
  /* NOTE(review): the meaning of the two trailing zero members is not
     visible in this file -- confirm against OnigEncodingType. */
  0, 0
};
547