1 /**********************************************************************
2 gb18030.c - Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5 * Copyright (c) 2005-2007 KUBO Takehiro <kubo AT jiubao DOT org>
6 * K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
7 * All rights reserved.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 * SUCH DAMAGE.
29 */
30
31 #include "regenc.h"
32
/*
 * Debug trace hook for this file.
 *
 * With the "#if 1" branch selected the macro expands to nothing, so the
 * argument list is never evaluated.  Switch the condition to 0 to route
 * the parenthesised argument list to printf; call sites pass the whole
 * printf argument list in an extra pair of parentheses.
 */
#if 1
#define DEBUG_GB18030(arg)
#else
#define DEBUG_GB18030(arg) printf arg
#endif
38
/*
 * Byte classes stored in GB18030_MAP.  Each class names the positions a
 * byte value may occupy inside a GB18030 sequence.
 */
enum {
  C1, /* one-byte char */
  C2, /* one-byte or second of two-byte char */
  C4, /* one-byte or second or fourth of four-byte char */
  CM  /* first of two- or four-byte char or second of two-byte char */
};
45
46 static const char GB18030_MAP[] = {
47 C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1,
48 C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1,
49 C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1,
50 C4, C4, C4, C4, C4, C4, C4, C4, C4, C4, C1, C1, C1, C1, C1, C1,
51 C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2,
52 C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2,
53 C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2,
54 C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C1,
55 C2, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
56 CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
57 CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
58 CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
59 CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
60 CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
61 CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
62 CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, C1
63 };
64
65 static int
gb18030_mbc_enc_len(const UChar * p)66 gb18030_mbc_enc_len(const UChar* p)
67 {
68 if (GB18030_MAP[*p] != CM)
69 return 1;
70 p++;
71 if (GB18030_MAP[*p] == C4)
72 return 4;
73 if (GB18030_MAP[*p] == C1)
74 return 1; /* illegal sequence */
75 return 2;
76 }
77
78 static OnigCodePoint
gb18030_mbc_to_code(const UChar * p,const UChar * end)79 gb18030_mbc_to_code(const UChar* p, const UChar* end)
80 {
81 return onigenc_mbn_mbc_to_code(ONIG_ENCODING_GB18030, p, end);
82 }
83
84 static int
gb18030_code_to_mbc(OnigCodePoint code,UChar * buf)85 gb18030_code_to_mbc(OnigCodePoint code, UChar *buf)
86 {
87 return onigenc_mb4_code_to_mbc(ONIG_ENCODING_GB18030, code, buf);
88 }
89
90 static int
gb18030_mbc_case_fold(OnigCaseFoldType flag,const UChar ** pp,const UChar * end,UChar * lower)91 gb18030_mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, const UChar* end,
92 UChar* lower)
93 {
94 return onigenc_mbn_mbc_case_fold(ONIG_ENCODING_GB18030, flag,
95 pp, end, lower);
96 }
97
#if 0
/*
 * Disabled: this wrapper is compiled out and is not referenced by the
 * OnigEncodingGB18030 table in this file.
 */
static int
gb18030_is_mbc_ambiguous(OnigCaseFoldType flag,
                         const UChar** pp, const UChar* end)
{
  return onigenc_mbn_is_mbc_ambiguous(ONIG_ENCODING_GB18030, flag, pp, end);
}
#endif
106
107 static int
gb18030_is_code_ctype(OnigCodePoint code,unsigned int ctype)108 gb18030_is_code_ctype(OnigCodePoint code, unsigned int ctype)
109 {
110 return onigenc_mb4_is_code_ctype(ONIG_ENCODING_GB18030, code, ctype);
111 }
112
/*
 * States of the backward scan in gb18030_left_adjust_char_head.
 *
 * Each name describes the byte classes seen so far, read from the
 * scan position towards s: "one" is a single byte of the named class,
 * "odd"/"even" is the parity of a run, CX stands for either C2 or CM.
 */
enum state {
  S_START,
  S_one_C2,
  S_one_C4,
  S_one_CM,

  S_odd_CM_one_CX,
  S_even_CM_one_CX,

  /* CMC4 : pair of "CM C4" */
  S_one_CMC4,
  S_odd_CMC4,
  S_one_C4_odd_CMC4,
  S_even_CMC4,
  S_one_C4_even_CMC4,

  S_odd_CM_odd_CMC4,
  S_even_CM_odd_CMC4,

  S_odd_CM_even_CMC4,
  S_even_CM_even_CMC4,

  /* C4CM : pair of "C4 CM" */
  S_odd_C4CM,
  S_one_CM_odd_C4CM,
  S_even_C4CM,
  S_one_CM_even_C4CM,

  S_even_CM_odd_C4CM,
  S_odd_CM_odd_C4CM,
  S_even_CM_even_C4CM,
  S_odd_CM_even_C4CM
};
146
147 static UChar*
gb18030_left_adjust_char_head(const UChar * start,const UChar * s)148 gb18030_left_adjust_char_head(const UChar* start, const UChar* s)
149 {
150 const UChar *p;
151 enum state state = S_START;
152
153 DEBUG_GB18030(("----------------\n"));
154 for (p = s; p >= start; p--) {
155 DEBUG_GB18030(("state %d --(%02x)-->\n", state, *p));
156 switch (state) {
157 case S_START:
158 switch (GB18030_MAP[*p]) {
159 case C1:
160 return (UChar *)s;
161 case C2:
162 state = S_one_C2; /* C2 */
163 break;
164 case C4:
165 state = S_one_C4; /* C4 */
166 break;
167 case CM:
168 state = S_one_CM; /* CM */
169 break;
170 }
171 break;
172 case S_one_C2: /* C2 */
173 switch (GB18030_MAP[*p]) {
174 case C1:
175 case C2:
176 case C4:
177 return (UChar *)s;
178 case CM:
179 state = S_odd_CM_one_CX; /* CM C2 */
180 break;
181 }
182 break;
183 case S_one_C4: /* C4 */
184 switch (GB18030_MAP[*p]) {
185 case C1:
186 case C2:
187 case C4:
188 return (UChar *)s;
189 case CM:
190 state = S_one_CMC4;
191 break;
192 }
193 break;
194 case S_one_CM: /* CM */
195 switch (GB18030_MAP[*p]) {
196 case C1:
197 case C2:
198 return (UChar *)s;
199 case C4:
200 state = S_odd_C4CM;
201 break;
202 case CM:
203 state = S_odd_CM_one_CX; /* CM CM */
204 break;
205 }
206 break;
207
208 case S_odd_CM_one_CX: /* CM C2 */ /* CM CM */ /* CM CM CM C4 */
209 switch (GB18030_MAP[*p]) {
210 case C1:
211 case C2:
212 case C4:
213 return (UChar *)(s - 1);
214 case CM:
215 state = S_even_CM_one_CX;
216 break;
217 }
218 break;
219 case S_even_CM_one_CX: /* CM CM C2 */ /* CM CM CM */ /* CM CM C4 */
220 switch (GB18030_MAP[*p]) {
221 case C1:
222 case C2:
223 case C4:
224 return (UChar *)s;
225 case CM:
226 state = S_odd_CM_one_CX;
227 break;
228 }
229 break;
230
231 case S_one_CMC4: /* CM C4 */
232 switch (GB18030_MAP[*p]) {
233 case C1:
234 case C2:
235 return (UChar *)(s - 1);
236 case C4:
237 state = S_one_C4_odd_CMC4; /* C4 CM C4 */
238 break;
239 case CM:
240 state = S_even_CM_one_CX; /* CM CM C4 */
241 break;
242 }
243 break;
244 case S_odd_CMC4: /* CM C4 CM C4 CM C4 */
245 switch (GB18030_MAP[*p]) {
246 case C1:
247 case C2:
248 return (UChar *)(s - 1);
249 case C4:
250 state = S_one_C4_odd_CMC4;
251 break;
252 case CM:
253 state = S_odd_CM_odd_CMC4;
254 break;
255 }
256 break;
257 case S_one_C4_odd_CMC4: /* C4 CM C4 */
258 switch (GB18030_MAP[*p]) {
259 case C1:
260 case C2:
261 case C4:
262 return (UChar *)(s - 1);
263 case CM:
264 state = S_even_CMC4; /* CM C4 CM C4 */
265 break;
266 }
267 break;
268 case S_even_CMC4: /* CM C4 CM C4 */
269 switch (GB18030_MAP[*p]) {
270 case C1:
271 case C2:
272 return (UChar *)(s - 3);
273 case C4:
274 state = S_one_C4_even_CMC4;
275 break;
276 case CM:
277 state = S_odd_CM_even_CMC4;
278 break;
279 }
280 break;
281 case S_one_C4_even_CMC4: /* C4 CM C4 CM C4 */
282 switch (GB18030_MAP[*p]) {
283 case C1:
284 case C2:
285 case C4:
286 return (UChar *)(s - 3);
287 case CM:
288 state = S_odd_CMC4;
289 break;
290 }
291 break;
292
293 case S_odd_CM_odd_CMC4: /* CM CM C4 CM C4 CM C4 */
294 switch (GB18030_MAP[*p]) {
295 case C1:
296 case C2:
297 case C4:
298 return (UChar *)(s - 3);
299 case CM:
300 state = S_even_CM_odd_CMC4;
301 break;
302 }
303 break;
304 case S_even_CM_odd_CMC4: /* CM CM CM C4 CM C4 CM C4 */
305 switch (GB18030_MAP[*p]) {
306 case C1:
307 case C2:
308 case C4:
309 return (UChar *)(s - 1);
310 case CM:
311 state = S_odd_CM_odd_CMC4;
312 break;
313 }
314 break;
315
316 case S_odd_CM_even_CMC4: /* CM CM C4 CM C4 */
317 switch (GB18030_MAP[*p]) {
318 case C1:
319 case C2:
320 case C4:
321 return (UChar *)(s - 1);
322 case CM:
323 state = S_even_CM_even_CMC4;
324 break;
325 }
326 break;
327 case S_even_CM_even_CMC4: /* CM CM CM C4 CM C4 */
328 switch (GB18030_MAP[*p]) {
329 case C1:
330 case C2:
331 case C4:
332 return (UChar *)(s - 3);
333 case CM:
334 state = S_odd_CM_even_CMC4;
335 break;
336 }
337 break;
338
339 case S_odd_C4CM: /* C4 CM */ /* C4 CM C4 CM C4 CM*/
340 switch (GB18030_MAP[*p]) {
341 case C1:
342 case C2:
343 case C4:
344 return (UChar *)s;
345 case CM:
346 state = S_one_CM_odd_C4CM; /* CM C4 CM */
347 break;
348 }
349 break;
350 case S_one_CM_odd_C4CM: /* CM C4 CM */ /* CM C4 CM C4 CM C4 CM */
351 switch (GB18030_MAP[*p]) {
352 case C1:
353 case C2:
354 return (UChar *)(s - 2); /* |CM C4 CM */
355 case C4:
356 state = S_even_C4CM;
357 break;
358 case CM:
359 state = S_even_CM_odd_C4CM;
360 break;
361 }
362 break;
363 case S_even_C4CM: /* C4 CM C4 CM */
364 switch (GB18030_MAP[*p]) {
365 case C1:
366 case C2:
367 case C4:
368 return (UChar *)(s - 2); /* C4|CM C4 CM */
369 case CM:
370 state = S_one_CM_even_C4CM;
371 break;
372 }
373 break;
374 case S_one_CM_even_C4CM: /* CM C4 CM C4 CM */
375 switch (GB18030_MAP[*p]) {
376 case C1:
377 case C2:
378 return (UChar *)(s - 0); /*|CM C4 CM C4|CM */
379 case C4:
380 state = S_odd_C4CM;
381 break;
382 case CM:
383 state = S_even_CM_even_C4CM;
384 break;
385 }
386 break;
387
388 case S_even_CM_odd_C4CM: /* CM CM C4 CM */
389 switch (GB18030_MAP[*p]) {
390 case C1:
391 case C2:
392 case C4:
393 return (UChar *)(s - 0); /* |CM CM|C4|CM */
394 case CM:
395 state = S_odd_CM_odd_C4CM;
396 break;
397 }
398 break;
399 case S_odd_CM_odd_C4CM: /* CM CM CM C4 CM */
400 switch (GB18030_MAP[*p]) {
401 case C1:
402 case C2:
403 case C4:
404 return (UChar *)(s - 2); /* |CM CM|CM C4 CM */
405 case CM:
406 state = S_even_CM_odd_C4CM;
407 break;
408 }
409 break;
410
411 case S_even_CM_even_C4CM: /* CM CM C4 CM C4 CM */
412 switch (GB18030_MAP[*p]) {
413 case C1:
414 case C2:
415 case C4:
416 return (UChar *)(s - 2); /* |CM CM|C4|CM C4 CM */
417 case CM:
418 state = S_odd_CM_even_C4CM;
419 break;
420 }
421 break;
422 case S_odd_CM_even_C4CM: /* CM CM CM C4 CM C4 CM */
423 switch (GB18030_MAP[*p]) {
424 case C1:
425 case C2:
426 case C4:
427 return (UChar *)(s - 0); /* |CM CM|CM C4 CM C4|CM */
428 case CM:
429 state = S_even_CM_even_C4CM;
430 break;
431 }
432 break;
433 }
434 }
435
436 DEBUG_GB18030(("state %d\n", state));
437 switch (state) {
438 case S_START: return (UChar *)(s - 0);
439 case S_one_C2: return (UChar *)(s - 0);
440 case S_one_C4: return (UChar *)(s - 0);
441 case S_one_CM: return (UChar *)(s - 0);
442
443 case S_odd_CM_one_CX: return (UChar *)(s - 1);
444 case S_even_CM_one_CX: return (UChar *)(s - 0);
445
446 case S_one_CMC4: return (UChar *)(s - 1);
447 case S_odd_CMC4: return (UChar *)(s - 1);
448 case S_one_C4_odd_CMC4: return (UChar *)(s - 1);
449 case S_even_CMC4: return (UChar *)(s - 3);
450 case S_one_C4_even_CMC4: return (UChar *)(s - 3);
451
452 case S_odd_CM_odd_CMC4: return (UChar *)(s - 3);
453 case S_even_CM_odd_CMC4: return (UChar *)(s - 1);
454
455 case S_odd_CM_even_CMC4: return (UChar *)(s - 1);
456 case S_even_CM_even_CMC4: return (UChar *)(s - 3);
457
458 case S_odd_C4CM: return (UChar *)(s - 0);
459 case S_one_CM_odd_C4CM: return (UChar *)(s - 2);
460 case S_even_C4CM: return (UChar *)(s - 2);
461 case S_one_CM_even_C4CM: return (UChar *)(s - 0);
462
463 case S_even_CM_odd_C4CM: return (UChar *)(s - 0);
464 case S_odd_CM_odd_C4CM: return (UChar *)(s - 2);
465 case S_even_CM_even_C4CM: return (UChar *)(s - 2);
466 case S_odd_CM_even_C4CM: return (UChar *)(s - 0);
467 }
468
469 return (UChar* )s; /* never come here. (escape warning) */
470 }
471
472 static int
gb18030_is_allowed_reverse_match(const UChar * s,const UChar * end ARG_UNUSED)473 gb18030_is_allowed_reverse_match(const UChar* s, const UChar* end ARG_UNUSED)
474 {
475 return GB18030_MAP[*s] == C1 ? TRUE : FALSE;
476 }
477
478 OnigEncodingType OnigEncodingGB18030 = {
479 gb18030_mbc_enc_len,
480 "GB18030", /* name */
481 4, /* max enc length */
482 1, /* min enc length */
483 onigenc_is_mbc_newline_0x0a,
484 gb18030_mbc_to_code,
485 onigenc_mb4_code_to_mbclen,
486 gb18030_code_to_mbc,
487 gb18030_mbc_case_fold,
488 onigenc_ascii_apply_all_case_fold,
489 onigenc_ascii_get_case_fold_codes_by_str,
490 onigenc_minimum_property_name_to_ctype,
491 gb18030_is_code_ctype,
492 onigenc_not_support_get_ctype_code_range,
493 gb18030_left_adjust_char_head,
494 gb18030_is_allowed_reverse_match
495 };
496