1 /**********************************************************************
2 gb18030.c - Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5 * Copyright (c) 2005-2016 KUBO Takehiro <kubo AT jiubao DOT org>
6 * K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
7 * All rights reserved.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 * SUCH DAMAGE.
29 */
30
31 #include "regenc.h"
32
/*
 * Trace hook used by the backward scanner in this file.
 * It expands to nothing by default; flip the condition below to route the
 * (double-parenthesised) argument list to printf.
 */
#if 0
#define DEBUG_GB18030(arg) printf arg
#else
#define DEBUG_GB18030(arg)
#endif
38
/*
 * Byte classes stored in GB18030_MAP.  A single byte value can play
 * several roles, so each class names the set of roles it may have.
 */
enum {
  C1 = 0, /* one-byte char */
  C2 = 1, /* one-byte or second of two-byte char */
  C4 = 2, /* one-byte or second or fourth of four-byte char */
  CM = 3  /* first of two- or four-byte char or second of two-byte char */
};
45
46 static const char GB18030_MAP[] = {
47 C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1,
48 C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1,
49 C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1,
50 C4, C4, C4, C4, C4, C4, C4, C4, C4, C4, C1, C1, C1, C1, C1, C1,
51 C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2,
52 C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2,
53 C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2,
54 C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C1,
55 C2, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
56 CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
57 CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
58 CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
59 CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
60 CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
61 CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
62 CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, C1
63 };
64
65 static int
gb18030_mbc_enc_len(const UChar * p)66 gb18030_mbc_enc_len(const UChar* p)
67 {
68 if (GB18030_MAP[*p] != CM)
69 return 1;
70 p++;
71 if (GB18030_MAP[*p] == C4)
72 return 4;
73 if (GB18030_MAP[*p] == C1)
74 return 1; /* illegal sequence */
75 return 2;
76 }
77
78 static int
is_valid_mbc_string(const UChar * p,const UChar * end)79 is_valid_mbc_string(const UChar* p, const UChar* end)
80 {
81 while (p < end) {
82 if (*p < 0x80) {
83 p++;
84 }
85 else if (*p == 0x80 || *p == 0xff) {
86 return FALSE;
87 }
88 else {
89 p++;
90 if (p >= end) return FALSE;
91 if (*p < 0x40) {
92 if (*p < 0x30 || *p > 0x39)
93 return FALSE;
94
95 p++;
96 if (p >= end) return FALSE;
97 if (*p < 0x81 || *p == 0xff) return FALSE;
98
99 p++;
100 if (p >= end) return FALSE;
101 if (*p < 0x30 || *p > 0x39)
102 return FALSE;
103
104 p++;
105 }
106 else if (*p == 0x7f || *p == 0xff) {
107 return FALSE;
108 }
109 else {
110 p++;
111 }
112 }
113 }
114
115 return TRUE;
116 }
117
118 static OnigCodePoint
gb18030_mbc_to_code(const UChar * p,const UChar * end)119 gb18030_mbc_to_code(const UChar* p, const UChar* end)
120 {
121 return onigenc_mbn_mbc_to_code(ONIG_ENCODING_GB18030, p, end);
122 }
123
124 static int
gb18030_code_to_mbc(OnigCodePoint code,UChar * buf)125 gb18030_code_to_mbc(OnigCodePoint code, UChar *buf)
126 {
127 return onigenc_mb4_code_to_mbc(ONIG_ENCODING_GB18030, code, buf);
128 }
129
130 static int
gb18030_mbc_case_fold(OnigCaseFoldType flag,const UChar ** pp,const UChar * end,UChar * lower)131 gb18030_mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, const UChar* end,
132 UChar* lower)
133 {
134 return onigenc_mbn_mbc_case_fold(ONIG_ENCODING_GB18030, flag,
135 pp, end, lower);
136 }
137
#if 0
/*
 * Compiled out.  Would forward to onigenc_mbn_is_mbc_ambiguous with the
 * GB18030 encoding; nothing in this file references it.
 */
static int
gb18030_is_mbc_ambiguous(OnigCaseFoldType flag,
                         const UChar** pp, const UChar* end)
{
  int r;

  r = onigenc_mbn_is_mbc_ambiguous(ONIG_ENCODING_GB18030, flag, pp, end);
  return r;
}
#endif
146
147 static int
gb18030_is_code_ctype(OnigCodePoint code,unsigned int ctype)148 gb18030_is_code_ctype(OnigCodePoint code, unsigned int ctype)
149 {
150 return onigenc_mb4_is_code_ctype(ONIG_ENCODING_GB18030, code, ctype);
151 }
152
/*
 * States of the backward scan in gb18030_left_adjust_char_head.
 *
 * Each name describes the byte classes consumed so far, written in memory
 * order (leftmost byte first).  "odd"/"even" give the parity of a run.
 *   CX   : a single trailing byte of class C2, C4 or CM
 *   CMC4 : pair of "CM C4"
 *   C4CM : pair of "C4 CM"
 * The order of the enumerators is significant: values run 0..22.
 */
enum state {
  S_START = 0,

  /* exactly one byte seen */
  S_one_C2,
  S_one_C4,
  S_one_CM,

  /* a run of CM bytes in front of one trailing byte */
  S_odd_CM_one_CX,
  S_even_CM_one_CX,

  /* runs of "CM C4" pairs, optionally with a dangling C4 in front */
  S_one_CMC4,
  S_odd_CMC4,
  S_one_C4_odd_CMC4,
  S_even_CMC4,
  S_one_C4_even_CMC4,

  /* a run of CM bytes in front of an odd number of "CM C4" pairs */
  S_odd_CM_odd_CMC4,
  S_even_CM_odd_CMC4,

  /* a run of CM bytes in front of an even number of "CM C4" pairs */
  S_odd_CM_even_CMC4,
  S_even_CM_even_CMC4,

  /* runs of "C4 CM" pairs, optionally with a dangling CM in front */
  S_odd_C4CM,
  S_one_CM_odd_C4CM,
  S_even_C4CM,
  S_one_CM_even_C4CM,

  /* a run of CM bytes in front of "C4 CM" pairs */
  S_even_CM_odd_C4CM,
  S_odd_CM_odd_C4CM,
  S_even_CM_even_C4CM,
  S_odd_CM_even_C4CM
};
186
187 static UChar*
gb18030_left_adjust_char_head(const UChar * start,const UChar * s)188 gb18030_left_adjust_char_head(const UChar* start, const UChar* s)
189 {
190 const UChar *p;
191 enum state state = S_START;
192
193 DEBUG_GB18030(("----------------\n"));
194 for (p = s; p >= start; p--) {
195 DEBUG_GB18030(("state %d --(%02x)-->\n", state, *p));
196 switch (state) {
197 case S_START:
198 switch (GB18030_MAP[*p]) {
199 case C1:
200 return (UChar *)s;
201 case C2:
202 state = S_one_C2; /* C2 */
203 break;
204 case C4:
205 state = S_one_C4; /* C4 */
206 break;
207 case CM:
208 state = S_one_CM; /* CM */
209 break;
210 }
211 break;
212 case S_one_C2: /* C2 */
213 switch (GB18030_MAP[*p]) {
214 case C1:
215 case C2:
216 case C4:
217 return (UChar *)s;
218 case CM:
219 state = S_odd_CM_one_CX; /* CM C2 */
220 break;
221 }
222 break;
223 case S_one_C4: /* C4 */
224 switch (GB18030_MAP[*p]) {
225 case C1:
226 case C2:
227 case C4:
228 return (UChar *)s;
229 case CM:
230 state = S_one_CMC4;
231 break;
232 }
233 break;
234 case S_one_CM: /* CM */
235 switch (GB18030_MAP[*p]) {
236 case C1:
237 case C2:
238 return (UChar *)s;
239 case C4:
240 state = S_odd_C4CM;
241 break;
242 case CM:
243 state = S_odd_CM_one_CX; /* CM CM */
244 break;
245 }
246 break;
247
248 case S_odd_CM_one_CX: /* CM C2 */ /* CM CM */ /* CM CM CM C4 */
249 switch (GB18030_MAP[*p]) {
250 case C1:
251 case C2:
252 case C4:
253 return (UChar *)(s - 1);
254 case CM:
255 state = S_even_CM_one_CX;
256 break;
257 }
258 break;
259 case S_even_CM_one_CX: /* CM CM C2 */ /* CM CM CM */ /* CM CM C4 */
260 switch (GB18030_MAP[*p]) {
261 case C1:
262 case C2:
263 case C4:
264 return (UChar *)s;
265 case CM:
266 state = S_odd_CM_one_CX;
267 break;
268 }
269 break;
270
271 case S_one_CMC4: /* CM C4 */
272 switch (GB18030_MAP[*p]) {
273 case C1:
274 case C2:
275 return (UChar *)(s - 1);
276 case C4:
277 state = S_one_C4_odd_CMC4; /* C4 CM C4 */
278 break;
279 case CM:
280 state = S_even_CM_one_CX; /* CM CM C4 */
281 break;
282 }
283 break;
284 case S_odd_CMC4: /* CM C4 CM C4 CM C4 */
285 switch (GB18030_MAP[*p]) {
286 case C1:
287 case C2:
288 return (UChar *)(s - 1);
289 case C4:
290 state = S_one_C4_odd_CMC4;
291 break;
292 case CM:
293 state = S_odd_CM_odd_CMC4;
294 break;
295 }
296 break;
297 case S_one_C4_odd_CMC4: /* C4 CM C4 */
298 switch (GB18030_MAP[*p]) {
299 case C1:
300 case C2:
301 case C4:
302 return (UChar *)(s - 1);
303 case CM:
304 state = S_even_CMC4; /* CM C4 CM C4 */
305 break;
306 }
307 break;
308 case S_even_CMC4: /* CM C4 CM C4 */
309 switch (GB18030_MAP[*p]) {
310 case C1:
311 case C2:
312 return (UChar *)(s - 3);
313 case C4:
314 state = S_one_C4_even_CMC4;
315 break;
316 case CM:
317 state = S_odd_CM_even_CMC4;
318 break;
319 }
320 break;
321 case S_one_C4_even_CMC4: /* C4 CM C4 CM C4 */
322 switch (GB18030_MAP[*p]) {
323 case C1:
324 case C2:
325 case C4:
326 return (UChar *)(s - 3);
327 case CM:
328 state = S_odd_CMC4;
329 break;
330 }
331 break;
332
333 case S_odd_CM_odd_CMC4: /* CM CM C4 CM C4 CM C4 */
334 switch (GB18030_MAP[*p]) {
335 case C1:
336 case C2:
337 case C4:
338 return (UChar *)(s - 3);
339 case CM:
340 state = S_even_CM_odd_CMC4;
341 break;
342 }
343 break;
344 case S_even_CM_odd_CMC4: /* CM CM CM C4 CM C4 CM C4 */
345 switch (GB18030_MAP[*p]) {
346 case C1:
347 case C2:
348 case C4:
349 return (UChar *)(s - 1);
350 case CM:
351 state = S_odd_CM_odd_CMC4;
352 break;
353 }
354 break;
355
356 case S_odd_CM_even_CMC4: /* CM CM C4 CM C4 */
357 switch (GB18030_MAP[*p]) {
358 case C1:
359 case C2:
360 case C4:
361 return (UChar *)(s - 1);
362 case CM:
363 state = S_even_CM_even_CMC4;
364 break;
365 }
366 break;
367 case S_even_CM_even_CMC4: /* CM CM CM C4 CM C4 */
368 switch (GB18030_MAP[*p]) {
369 case C1:
370 case C2:
371 case C4:
372 return (UChar *)(s - 3);
373 case CM:
374 state = S_odd_CM_even_CMC4;
375 break;
376 }
377 break;
378
379 case S_odd_C4CM: /* C4 CM */ /* C4 CM C4 CM C4 CM*/
380 switch (GB18030_MAP[*p]) {
381 case C1:
382 case C2:
383 case C4:
384 return (UChar *)s;
385 case CM:
386 state = S_one_CM_odd_C4CM; /* CM C4 CM */
387 break;
388 }
389 break;
390 case S_one_CM_odd_C4CM: /* CM C4 CM */ /* CM C4 CM C4 CM C4 CM */
391 switch (GB18030_MAP[*p]) {
392 case C1:
393 case C2:
394 return (UChar *)(s - 2); /* |CM C4 CM */
395 case C4:
396 state = S_even_C4CM;
397 break;
398 case CM:
399 state = S_even_CM_odd_C4CM;
400 break;
401 }
402 break;
403 case S_even_C4CM: /* C4 CM C4 CM */
404 switch (GB18030_MAP[*p]) {
405 case C1:
406 case C2:
407 case C4:
408 return (UChar *)(s - 2); /* C4|CM C4 CM */
409 case CM:
410 state = S_one_CM_even_C4CM;
411 break;
412 }
413 break;
414 case S_one_CM_even_C4CM: /* CM C4 CM C4 CM */
415 switch (GB18030_MAP[*p]) {
416 case C1:
417 case C2:
418 return (UChar *)(s - 0); /*|CM C4 CM C4|CM */
419 case C4:
420 state = S_odd_C4CM;
421 break;
422 case CM:
423 state = S_even_CM_even_C4CM;
424 break;
425 }
426 break;
427
428 case S_even_CM_odd_C4CM: /* CM CM C4 CM */
429 switch (GB18030_MAP[*p]) {
430 case C1:
431 case C2:
432 case C4:
433 return (UChar *)(s - 0); /* |CM CM|C4|CM */
434 case CM:
435 state = S_odd_CM_odd_C4CM;
436 break;
437 }
438 break;
439 case S_odd_CM_odd_C4CM: /* CM CM CM C4 CM */
440 switch (GB18030_MAP[*p]) {
441 case C1:
442 case C2:
443 case C4:
444 return (UChar *)(s - 2); /* |CM CM|CM C4 CM */
445 case CM:
446 state = S_even_CM_odd_C4CM;
447 break;
448 }
449 break;
450
451 case S_even_CM_even_C4CM: /* CM CM C4 CM C4 CM */
452 switch (GB18030_MAP[*p]) {
453 case C1:
454 case C2:
455 case C4:
456 return (UChar *)(s - 2); /* |CM CM|C4|CM C4 CM */
457 case CM:
458 state = S_odd_CM_even_C4CM;
459 break;
460 }
461 break;
462 case S_odd_CM_even_C4CM: /* CM CM CM C4 CM C4 CM */
463 switch (GB18030_MAP[*p]) {
464 case C1:
465 case C2:
466 case C4:
467 return (UChar *)(s - 0); /* |CM CM|CM C4 CM C4|CM */
468 case CM:
469 state = S_even_CM_even_C4CM;
470 break;
471 }
472 break;
473 }
474 }
475
476 DEBUG_GB18030(("state %d\n", state));
477 switch (state) {
478 case S_START: return (UChar *)(s - 0);
479 case S_one_C2: return (UChar *)(s - 0);
480 case S_one_C4: return (UChar *)(s - 0);
481 case S_one_CM: return (UChar *)(s - 0);
482
483 case S_odd_CM_one_CX: return (UChar *)(s - 1);
484 case S_even_CM_one_CX: return (UChar *)(s - 0);
485
486 case S_one_CMC4: return (UChar *)(s - 1);
487 case S_odd_CMC4: return (UChar *)(s - 1);
488 case S_one_C4_odd_CMC4: return (UChar *)(s - 1);
489 case S_even_CMC4: return (UChar *)(s - 3);
490 case S_one_C4_even_CMC4: return (UChar *)(s - 3);
491
492 case S_odd_CM_odd_CMC4: return (UChar *)(s - 3);
493 case S_even_CM_odd_CMC4: return (UChar *)(s - 1);
494
495 case S_odd_CM_even_CMC4: return (UChar *)(s - 1);
496 case S_even_CM_even_CMC4: return (UChar *)(s - 3);
497
498 case S_odd_C4CM: return (UChar *)(s - 0);
499 case S_one_CM_odd_C4CM: return (UChar *)(s - 2);
500 case S_even_C4CM: return (UChar *)(s - 2);
501 case S_one_CM_even_C4CM: return (UChar *)(s - 0);
502
503 case S_even_CM_odd_C4CM: return (UChar *)(s - 0);
504 case S_odd_CM_odd_C4CM: return (UChar *)(s - 2);
505 case S_even_CM_even_C4CM: return (UChar *)(s - 2);
506 case S_odd_CM_even_C4CM: return (UChar *)(s - 0);
507 }
508
509 return (UChar* )s; /* never come here. (escape warning) */
510 }
511
512 static int
gb18030_is_allowed_reverse_match(const UChar * s,const UChar * end ARG_UNUSED)513 gb18030_is_allowed_reverse_match(const UChar* s, const UChar* end ARG_UNUSED)
514 {
515 return GB18030_MAP[*s] == C1 ? TRUE : FALSE;
516 }
517
518 OnigEncodingType OnigEncodingGB18030 = {
519 gb18030_mbc_enc_len,
520 "GB18030", /* name */
521 4, /* max enc length */
522 1, /* min enc length */
523 onigenc_is_mbc_newline_0x0a,
524 gb18030_mbc_to_code,
525 onigenc_mb4_code_to_mbclen,
526 gb18030_code_to_mbc,
527 gb18030_mbc_case_fold,
528 onigenc_ascii_apply_all_case_fold,
529 onigenc_ascii_get_case_fold_codes_by_str,
530 onigenc_minimum_property_name_to_ctype,
531 gb18030_is_code_ctype,
532 onigenc_not_support_get_ctype_code_range,
533 gb18030_left_adjust_char_head,
534 gb18030_is_allowed_reverse_match,
535 NULL, /* init */
536 NULL, /* is_initialized */
537 is_valid_mbc_string
538 };
539