1 /**********************************************************************
2 euc_kr.c - Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5 * Copyright (c) 2002-2019 K.Kosako
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30 #include "regenc.h"
31
/*
 * Character byte-length table, indexed by the first byte of a character.
 * Entries 0x00-0xa0 and 0xff are 1; entries 0xa1-0xfe are 2.
 * Each row below covers 16 consecutive byte values.
 */
static const int EncLen_EUCKR[] = {
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x00 - 0x0f */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x10 - 0x1f */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x20 - 0x2f */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x30 - 0x3f */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x40 - 0x4f */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x50 - 0x5f */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x60 - 0x6f */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x70 - 0x7f */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x80 - 0x8f */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x90 - 0x9f */
  1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xa0 - 0xaf (0xa0 is 1) */
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xb0 - 0xbf */
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xc0 - 0xcf */
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xd0 - 0xdf */
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xe0 - 0xef */
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1  /* 0xf0 - 0xff (0xff is 1) */
};
50
51 static int
euckr_mbc_enc_len(const UChar * p)52 euckr_mbc_enc_len(const UChar* p)
53 {
54 return EncLen_EUCKR[*p];
55 }
56
57 static int
euckr_code_to_mbclen(OnigCodePoint code)58 euckr_code_to_mbclen(OnigCodePoint code)
59 {
60 if ((code & (~0xffff)) != 0) return ONIGERR_INVALID_CODE_POINT_VALUE;
61 if ((code & 0xff00) != 0) return 2;
62 if (EncLen_EUCKR[(int )(code & 0xff)] == 1) return 1;
63
64 return ONIGERR_INVALID_CODE_POINT_VALUE;
65 }
66
67 static int
is_valid_mbc_string(const UChar * p,const UChar * end)68 is_valid_mbc_string(const UChar* p, const UChar* end)
69 {
70 while (p < end) {
71 if (*p < 0x80) {
72 p++;
73 }
74 else if (*p < 0xa1) {
75 return FALSE;
76 }
77 else if (*p < 0xff) {
78 p++;
79 if (p >= end) return FALSE;
80 if (*p < 0xa1 || *p == 0xff)
81 return FALSE;
82 p++;
83 }
84 else
85 return FALSE;
86 }
87
88 return TRUE;
89 }
90
91 static OnigCodePoint
euckr_mbc_to_code(const UChar * p,const UChar * end)92 euckr_mbc_to_code(const UChar* p, const UChar* end)
93 {
94 return onigenc_mbn_mbc_to_code(ONIG_ENCODING_EUC_KR, p, end);
95 }
96
97 static int
euckr_code_to_mbc(OnigCodePoint code,UChar * buf)98 euckr_code_to_mbc(OnigCodePoint code, UChar *buf)
99 {
100 return onigenc_mb2_code_to_mbc(ONIG_ENCODING_EUC_KR, code, buf);
101 }
102
103 static int
euckr_mbc_case_fold(OnigCaseFoldType flag,const UChar ** pp,const UChar * end,UChar * lower)104 euckr_mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, const UChar* end,
105 UChar* lower)
106 {
107 return onigenc_mbn_mbc_case_fold(ONIG_ENCODING_EUC_KR, flag,
108 pp, end, lower);
109 }
110
111 static int
euckr_is_code_ctype(OnigCodePoint code,unsigned int ctype)112 euckr_is_code_ctype(OnigCodePoint code, unsigned int ctype)
113 {
114 return onigenc_mb2_is_code_ctype(ONIG_ENCODING_EUC_KR, code, ctype);
115 }
116
117 #define euckr_islead(c) ((c) < 0xa1 || (c) == 0xff)
118
119 static UChar*
euckr_left_adjust_char_head(const UChar * start,const UChar * s)120 euckr_left_adjust_char_head(const UChar* start, const UChar* s)
121 {
122 /* Assumed in this encoding,
123 mb-trail bytes don't mix with single bytes.
124 */
125 const UChar *p;
126 int len;
127
128 if (s <= start) return (UChar* )s;
129 p = s;
130
131 while (!euckr_islead(*p) && p > start) p--;
132 len = enclen(ONIG_ENCODING_EUC_KR, p);
133 if (p + len > s) return (UChar* )p;
134 p += len;
135 return (UChar* )(p + ((s - p) & ~1));
136 }
137
138 static int
euckr_is_allowed_reverse_match(const UChar * s,const UChar * end ARG_UNUSED)139 euckr_is_allowed_reverse_match(const UChar* s, const UChar* end ARG_UNUSED)
140 {
141 const UChar c = *s;
142 if (c <= 0x7e) return TRUE;
143 else return FALSE;
144 }
145
/*
 * Encoding descriptor for EUC-KR.
 * Fields are given positionally; they combine the euckr_* functions
 * defined in this file with shared onigenc_* helpers.
 */
OnigEncodingType OnigEncodingEUC_KR = {
  euckr_mbc_enc_len,
  "EUC-KR",   /* name */
  2,          /* max enc length */
  1,          /* min enc length */
  onigenc_is_mbc_newline_0x0a,
  euckr_mbc_to_code,
  euckr_code_to_mbclen,
  euckr_code_to_mbc,
  euckr_mbc_case_fold,
  onigenc_ascii_apply_all_case_fold,
  onigenc_ascii_get_case_fold_codes_by_str,
  onigenc_minimum_property_name_to_ctype,
  euckr_is_code_ctype,
  onigenc_not_support_get_ctype_code_range,
  euckr_left_adjust_char_head,
  euckr_is_allowed_reverse_match,
  NULL, /* init */
  NULL, /* is_initialized */
  is_valid_mbc_string,
  ENC_FLAG_ASCII_COMPATIBLE|ENC_FLAG_SKIP_OFFSET_1_OR_0,
  /* NOTE(review): the meaning of the last two fields is set by the
     OnigEncodingType declaration, which is not visible in this file. */
  0, 0
};
169
/*
 * Encoding descriptor for EUC-CN.
 * Same as OnigEncodingEUC_KR except for the name: every function slot
 * reuses the euckr_* implementation, and those functions pass
 * ONIG_ENCODING_EUC_KR to the shared helpers they call.
 */
OnigEncodingType OnigEncodingEUC_CN = {
  euckr_mbc_enc_len,
  "EUC-CN",   /* name */
  2,          /* max enc length */
  1,          /* min enc length */
  onigenc_is_mbc_newline_0x0a,
  euckr_mbc_to_code,
  euckr_code_to_mbclen,
  euckr_code_to_mbc,
  euckr_mbc_case_fold,
  onigenc_ascii_apply_all_case_fold,
  onigenc_ascii_get_case_fold_codes_by_str,
  onigenc_minimum_property_name_to_ctype,
  euckr_is_code_ctype,
  onigenc_not_support_get_ctype_code_range,
  euckr_left_adjust_char_head,
  euckr_is_allowed_reverse_match,
  NULL, /* init */
  NULL, /* is_initialized */
  is_valid_mbc_string,
  ENC_FLAG_ASCII_COMPATIBLE|ENC_FLAG_SKIP_OFFSET_1_OR_0,
  /* NOTE(review): the meaning of the last two fields is set by the
     OnigEncodingType declaration, which is not visible in this file. */
  0, 0
};
194