1 /**********************************************************************
2 gb18030.c - Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5 * Copyright (c) 2005 KUBO Takehiro <kubo AT jiubao DOT org>
6 * K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
7 * All rights reserved.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 * SUCH DAMAGE.
29 */
30
31 #include "regenc.h"
32
/*
 * Trace hook used by the backward scanner in this file.  Call sites wrap
 * the whole printf argument list in an extra pair of parentheses, e.g.
 *   DEBUG_GB18030(("state %d\n", state));
 * Tracing is compiled out: turn the "#if 0" below into "#if 1" to make
 * the hook expand to a printf call.
 */
#if 0
#define DEBUG_GB18030(arg) printf arg
#else
#define DEBUG_GB18030(arg)
#endif
38
/* Classes a single byte can fall into; these are the values stored in GB18030_MAP. */
enum {
  C1 = 0, /* one-byte char */
  C2 = 1, /* one-byte char, or second byte of a two-byte char */
  C4 = 2, /* one-byte char, or second or fourth byte of a four-byte char */
  CM = 3  /* first byte of a two- or four-byte char, or second byte of a two-byte char */
};
45
46 static const char GB18030_MAP[] = {
47 C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1,
48 C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1,
49 C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1,
50 C4, C4, C4, C4, C4, C4, C4, C4, C4, C4, C1, C1, C1, C1, C1, C1,
51 C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2,
52 C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2,
53 C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2,
54 C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C1,
55 C2, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
56 CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
57 CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
58 CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
59 CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
60 CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
61 CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
62 CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, C1
63 };
64
65 static int
gb18030_mbc_enc_len(const UChar * p)66 gb18030_mbc_enc_len(const UChar* p)
67 {
68 if (GB18030_MAP[*p] != CM)
69 return 1;
70 p++;
71 if (GB18030_MAP[*p] == C4)
72 return 4;
73 if (GB18030_MAP[*p] == C1)
74 return 1; /* illegal sequence */
75 return 2;
76 }
77
78 static OnigCodePoint
gb18030_mbc_to_code(const UChar * p,const UChar * end)79 gb18030_mbc_to_code(const UChar* p, const UChar* end)
80 {
81 return onigenc_mbn_mbc_to_code(ONIG_ENCODING_GB18030, p, end);
82 }
83
84 static int
gb18030_code_to_mbc(OnigCodePoint code,UChar * buf)85 gb18030_code_to_mbc(OnigCodePoint code, UChar *buf)
86 {
87 return onigenc_mb4_code_to_mbc(ONIG_ENCODING_GB18030, code, buf);
88 }
89
90 static int
gb18030_mbc_to_normalize(OnigAmbigType flag,const UChar ** pp,const UChar * end,UChar * lower)91 gb18030_mbc_to_normalize(OnigAmbigType flag, const UChar** pp, const UChar* end,
92 UChar* lower)
93 {
94 return onigenc_mbn_mbc_to_normalize(ONIG_ENCODING_GB18030, flag,
95 pp, end, lower);
96 }
97
98 static int
gb18030_is_mbc_ambiguous(OnigAmbigType flag,const UChar ** pp,const UChar * end)99 gb18030_is_mbc_ambiguous(OnigAmbigType flag, const UChar** pp, const UChar* end)
100 {
101 return onigenc_mbn_is_mbc_ambiguous(ONIG_ENCODING_GB18030, flag, pp, end);
102 }
103
104 static int
gb18030_is_code_ctype(OnigCodePoint code,unsigned int ctype)105 gb18030_is_code_ctype(OnigCodePoint code, unsigned int ctype)
106 {
107 return onigenc_mb4_is_code_ctype(ONIG_ENCODING_GB18030, code, ctype);
108 }
109
/*
 * States of the backward scan in gb18030_left_adjust_char_head().
 * Each name describes the byte classes consumed so far, written in
 * memory order (the rightmost class is the byte the scan started on).
 * "odd"/"even" count how many times the named unit has been seen.
 */
enum state {
  S_START = 0,           /* nothing consumed yet */
  S_one_C2,              /* C2 */
  S_one_C4,              /* C4 */
  S_one_CM,              /* CM */

  S_odd_CM_one_CX,       /* odd run of CM, then one trailing byte */
  S_even_CM_one_CX,      /* even run of CM, then one trailing byte */

  /* CMC4 : pair of "CM C4" */
  S_one_CMC4,            /* CM C4 */
  S_odd_CMC4,            /* odd number of CMC4 pairs (three or more) */
  S_one_C4_odd_CMC4,     /* C4, then an odd number of CMC4 pairs */
  S_even_CMC4,           /* even number of CMC4 pairs */
  S_one_C4_even_CMC4,    /* C4, then an even number of CMC4 pairs */

  S_odd_CM_odd_CMC4,     /* odd run of extra CM before an odd number of CMC4 pairs */
  S_even_CM_odd_CMC4,    /* even run of extra CM before an odd number of CMC4 pairs */

  S_odd_CM_even_CMC4,    /* odd run of extra CM before an even number of CMC4 pairs */
  S_even_CM_even_CMC4,   /* even run of extra CM before an even number of CMC4 pairs */

  /* C4CM : pair of "C4 CM" */
  S_odd_C4CM,            /* odd number of C4CM pairs */
  S_one_CM_odd_C4CM,     /* CM, then an odd number of C4CM pairs */
  S_even_C4CM,           /* even number of C4CM pairs */
  S_one_CM_even_C4CM,    /* CM, then an even number of C4CM pairs */

  S_even_CM_odd_C4CM,    /* even run of CM before an odd number of C4CM pairs */
  S_odd_CM_odd_C4CM,     /* odd run of CM before an odd number of C4CM pairs */
  S_even_CM_even_C4CM,   /* even run of CM before an even number of C4CM pairs */
  S_odd_CM_even_C4CM     /* odd run of CM before an even number of C4CM pairs */
};
143
144 static UChar*
gb18030_left_adjust_char_head(const UChar * start,const UChar * s)145 gb18030_left_adjust_char_head(const UChar* start, const UChar* s)
146 {
147 const UChar *p;
148 enum state state = S_START;
149
150 DEBUG_GB18030(("----------------\n"));
151 for (p = s; p >= start; p--) {
152 DEBUG_GB18030(("state %d --(%02x)-->\n", state, *p));
153 switch (state) {
154 case S_START:
155 switch (GB18030_MAP[*p]) {
156 case C1:
157 return (UChar *)s;
158 case C2:
159 state = S_one_C2; /* C2 */
160 break;
161 case C4:
162 state = S_one_C4; /* C4 */
163 break;
164 case CM:
165 state = S_one_CM; /* CM */
166 break;
167 }
168 break;
169 case S_one_C2: /* C2 */
170 switch (GB18030_MAP[*p]) {
171 case C1:
172 case C2:
173 case C4:
174 return (UChar *)s;
175 case CM:
176 state = S_odd_CM_one_CX; /* CM C2 */
177 break;
178 }
179 break;
180 case S_one_C4: /* C4 */
181 switch (GB18030_MAP[*p]) {
182 case C1:
183 case C2:
184 case C4:
185 return (UChar *)s;
186 case CM:
187 state = S_one_CMC4;
188 break;
189 }
190 break;
191 case S_one_CM: /* CM */
192 switch (GB18030_MAP[*p]) {
193 case C1:
194 case C2:
195 return (UChar *)s;
196 case C4:
197 state = S_odd_C4CM;
198 break;
199 case CM:
200 state = S_odd_CM_one_CX; /* CM CM */
201 break;
202 }
203 break;
204
205 case S_odd_CM_one_CX: /* CM C2 */ /* CM CM */ /* CM CM CM C4 */
206 switch (GB18030_MAP[*p]) {
207 case C1:
208 case C2:
209 case C4:
210 return (UChar *)(s - 1);
211 case CM:
212 state = S_even_CM_one_CX;
213 break;
214 }
215 break;
216 case S_even_CM_one_CX: /* CM CM C2 */ /* CM CM CM */ /* CM CM C4 */
217 switch (GB18030_MAP[*p]) {
218 case C1:
219 case C2:
220 case C4:
221 return (UChar *)s;
222 case CM:
223 state = S_odd_CM_one_CX;
224 break;
225 }
226 break;
227
228 case S_one_CMC4: /* CM C4 */
229 switch (GB18030_MAP[*p]) {
230 case C1:
231 case C2:
232 return (UChar *)(s - 1);
233 case C4:
234 state = S_one_C4_odd_CMC4; /* C4 CM C4 */
235 break;
236 case CM:
237 state = S_even_CM_one_CX; /* CM CM C4 */
238 break;
239 }
240 break;
241 case S_odd_CMC4: /* CM C4 CM C4 CM C4 */
242 switch (GB18030_MAP[*p]) {
243 case C1:
244 case C2:
245 return (UChar *)(s - 1);
246 case C4:
247 state = S_one_C4_odd_CMC4;
248 break;
249 case CM:
250 state = S_odd_CM_odd_CMC4;
251 break;
252 }
253 break;
254 case S_one_C4_odd_CMC4: /* C4 CM C4 */
255 switch (GB18030_MAP[*p]) {
256 case C1:
257 case C2:
258 case C4:
259 return (UChar *)(s - 1);
260 case CM:
261 state = S_even_CMC4; /* CM C4 CM C4 */
262 break;
263 }
264 break;
265 case S_even_CMC4: /* CM C4 CM C4 */
266 switch (GB18030_MAP[*p]) {
267 case C1:
268 case C2:
269 return (UChar *)(s - 3);
270 case C4:
271 state = S_one_C4_even_CMC4;
272 break;
273 case CM:
274 state = S_odd_CM_even_CMC4;
275 break;
276 }
277 break;
278 case S_one_C4_even_CMC4: /* C4 CM C4 CM C4 */
279 switch (GB18030_MAP[*p]) {
280 case C1:
281 case C2:
282 case C4:
283 return (UChar *)(s - 3);
284 case CM:
285 state = S_odd_CMC4;
286 break;
287 }
288 break;
289
290 case S_odd_CM_odd_CMC4: /* CM CM C4 CM C4 CM C4 */
291 switch (GB18030_MAP[*p]) {
292 case C1:
293 case C2:
294 case C4:
295 return (UChar *)(s - 3);
296 case CM:
297 state = S_even_CM_odd_CMC4;
298 break;
299 }
300 break;
301 case S_even_CM_odd_CMC4: /* CM CM CM C4 CM C4 CM C4 */
302 switch (GB18030_MAP[*p]) {
303 case C1:
304 case C2:
305 case C4:
306 return (UChar *)(s - 1);
307 case CM:
308 state = S_odd_CM_odd_CMC4;
309 break;
310 }
311 break;
312
313 case S_odd_CM_even_CMC4: /* CM CM C4 CM C4 */
314 switch (GB18030_MAP[*p]) {
315 case C1:
316 case C2:
317 case C4:
318 return (UChar *)(s - 1);
319 case CM:
320 state = S_even_CM_even_CMC4;
321 break;
322 }
323 break;
324 case S_even_CM_even_CMC4: /* CM CM CM C4 CM C4 */
325 switch (GB18030_MAP[*p]) {
326 case C1:
327 case C2:
328 case C4:
329 return (UChar *)(s - 3);
330 case CM:
331 state = S_odd_CM_even_CMC4;
332 break;
333 }
334 break;
335
336 case S_odd_C4CM: /* C4 CM */ /* C4 CM C4 CM C4 CM*/
337 switch (GB18030_MAP[*p]) {
338 case C1:
339 case C2:
340 case C4:
341 return (UChar *)s;
342 case CM:
343 state = S_one_CM_odd_C4CM; /* CM C4 CM */
344 break;
345 }
346 break;
347 case S_one_CM_odd_C4CM: /* CM C4 CM */ /* CM C4 CM C4 CM C4 CM */
348 switch (GB18030_MAP[*p]) {
349 case C1:
350 case C2:
351 return (UChar *)(s - 2); /* |CM C4 CM */
352 case C4:
353 state = S_even_C4CM;
354 break;
355 case CM:
356 state = S_even_CM_odd_C4CM;
357 break;
358 }
359 break;
360 case S_even_C4CM: /* C4 CM C4 CM */
361 switch (GB18030_MAP[*p]) {
362 case C1:
363 case C2:
364 case C4:
365 return (UChar *)(s - 2); /* C4|CM C4 CM */
366 case CM:
367 state = S_one_CM_even_C4CM;
368 break;
369 }
370 break;
371 case S_one_CM_even_C4CM: /* CM C4 CM C4 CM */
372 switch (GB18030_MAP[*p]) {
373 case C1:
374 case C2:
375 return (UChar *)(s - 0); /*|CM C4 CM C4|CM */
376 case C4:
377 state = S_odd_C4CM;
378 break;
379 case CM:
380 state = S_even_CM_even_C4CM;
381 break;
382 }
383 break;
384
385 case S_even_CM_odd_C4CM: /* CM CM C4 CM */
386 switch (GB18030_MAP[*p]) {
387 case C1:
388 case C2:
389 case C4:
390 return (UChar *)(s - 0); /* |CM CM|C4|CM */
391 case CM:
392 state = S_odd_CM_odd_C4CM;
393 break;
394 }
395 break;
396 case S_odd_CM_odd_C4CM: /* CM CM CM C4 CM */
397 switch (GB18030_MAP[*p]) {
398 case C1:
399 case C2:
400 case C4:
401 return (UChar *)(s - 2); /* |CM CM|CM C4 CM */
402 case CM:
403 state = S_even_CM_odd_C4CM;
404 break;
405 }
406 break;
407
408 case S_even_CM_even_C4CM: /* CM CM C4 CM C4 CM */
409 switch (GB18030_MAP[*p]) {
410 case C1:
411 case C2:
412 case C4:
413 return (UChar *)(s - 2); /* |CM CM|C4|CM C4 CM */
414 case CM:
415 state = S_odd_CM_even_C4CM;
416 break;
417 }
418 break;
419 case S_odd_CM_even_C4CM: /* CM CM CM C4 CM C4 CM */
420 switch (GB18030_MAP[*p]) {
421 case C1:
422 case C2:
423 case C4:
424 return (UChar *)(s - 0); /* |CM CM|CM C4 CM C4|CM */
425 case CM:
426 state = S_even_CM_even_C4CM;
427 break;
428 }
429 break;
430 }
431 }
432
433 DEBUG_GB18030(("state %d\n", state));
434 switch (state) {
435 case S_START: return (UChar *)(s - 0);
436 case S_one_C2: return (UChar *)(s - 0);
437 case S_one_C4: return (UChar *)(s - 0);
438 case S_one_CM: return (UChar *)(s - 0);
439
440 case S_odd_CM_one_CX: return (UChar *)(s - 1);
441 case S_even_CM_one_CX: return (UChar *)(s - 0);
442
443 case S_one_CMC4: return (UChar *)(s - 1);
444 case S_odd_CMC4: return (UChar *)(s - 1);
445 case S_one_C4_odd_CMC4: return (UChar *)(s - 1);
446 case S_even_CMC4: return (UChar *)(s - 3);
447 case S_one_C4_even_CMC4: return (UChar *)(s - 3);
448
449 case S_odd_CM_odd_CMC4: return (UChar *)(s - 3);
450 case S_even_CM_odd_CMC4: return (UChar *)(s - 1);
451
452 case S_odd_CM_even_CMC4: return (UChar *)(s - 1);
453 case S_even_CM_even_CMC4: return (UChar *)(s - 3);
454
455 case S_odd_C4CM: return (UChar *)(s - 0);
456 case S_one_CM_odd_C4CM: return (UChar *)(s - 2);
457 case S_even_C4CM: return (UChar *)(s - 2);
458 case S_one_CM_even_C4CM: return (UChar *)(s - 0);
459
460 case S_even_CM_odd_C4CM: return (UChar *)(s - 0);
461 case S_odd_CM_odd_C4CM: return (UChar *)(s - 2);
462 case S_even_CM_even_C4CM: return (UChar *)(s - 2);
463 case S_odd_CM_even_C4CM: return (UChar *)(s - 0);
464 }
465
466 return (UChar* )s; /* never come here. (escape warning) */
467 }
468
469 static int
gb18030_is_allowed_reverse_match(const UChar * s,const UChar * end)470 gb18030_is_allowed_reverse_match(const UChar* s, const UChar* end)
471 {
472 return GB18030_MAP[*s] == C1 ? TRUE : FALSE;
473 }
474
475 OnigEncodingType OnigEncodingGB18030 = {
476 gb18030_mbc_enc_len,
477 "GB18030", /* name */
478 4, /* max enc length */
479 1, /* min enc length */
480 ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE,
481 {
482 (OnigCodePoint )'\\' /* esc */
483 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */
484 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */
485 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */
486 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */
487 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */
488 },
489 onigenc_is_mbc_newline_0x0a,
490 gb18030_mbc_to_code,
491 onigenc_mb4_code_to_mbclen,
492 gb18030_code_to_mbc,
493 gb18030_mbc_to_normalize,
494 gb18030_is_mbc_ambiguous,
495 onigenc_ascii_get_all_pair_ambig_codes,
496 onigenc_nothing_get_all_comp_ambig_codes,
497 gb18030_is_code_ctype,
498 onigenc_not_support_get_ctype_code_range,
499 gb18030_left_adjust_char_head,
500 gb18030_is_allowed_reverse_match
501 };
502