1 /**********************************************************************
2 sjis.c - Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5 * Copyright (c) 2002-2005 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30 #include "regenc.h"
31
/*
 * Byte length of a Shift_JIS character, indexed by its first byte.
 * Entries are 2 for 0x81-0x9F and 0xE0-0xFC, and 1 for every other byte.
 */
static const int EncLen_SJIS[256] = {
  /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x80 */ 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x90 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xA0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xB0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xC0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xD0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xE0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xF0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1
};
50
/*
 * Non-zero for byte values that may appear as the second byte of a
 * two-byte Shift_JIS character: 0x40-0x7E and 0x80-0xFC.
 */
static const char SJIS_CAN_BE_TRAIL_TABLE[256] = {
  /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
  /* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xA0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xB0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xC0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xD0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xE0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xF0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0
};
69
/*
 * Byte classification helpers backed by the lookup tables above.
 * SJIS_ISMB_FIRST is true when `byte` starts a multi-byte character;
 * SJIS_ISMB_TRAIL yields the trail-table entry (non-zero when `byte` may
 * be the second byte of a two-byte character).
 * Arguments and expansions are fully parenthesized so the macros stay
 * safe inside larger expressions.
 */
#define SJIS_ISMB_FIRST(byte)  (EncLen_SJIS[(byte)] > 1)
#define SJIS_ISMB_TRAIL(byte)  (SJIS_CAN_BE_TRAIL_TABLE[(byte)])
72
73 static int
sjis_mbc_enc_len(const UChar * p)74 sjis_mbc_enc_len(const UChar* p)
75 {
76 return EncLen_SJIS[*p];
77 }
78
79 static int
sjis_code_to_mbclen(OnigCodePoint code)80 sjis_code_to_mbclen(OnigCodePoint code)
81 {
82 if (code < 256) {
83 if (EncLen_SJIS[(int )code] == 1)
84 return 1;
85 else
86 return 0;
87 }
88 else if (code <= 0xffff) {
89 return 2;
90 }
91 else
92 return 0;
93 }
94
95 static OnigCodePoint
sjis_mbc_to_code(const UChar * p,const UChar * end)96 sjis_mbc_to_code(const UChar* p, const UChar* end)
97 {
98 int c, i, len;
99 OnigCodePoint n;
100
101 len = enc_len(ONIG_ENCODING_SJIS, p);
102 c = *p++;
103 n = c;
104 if (len == 1) return n;
105
106 for (i = 1; i < len; i++) {
107 if (p >= end) break;
108 c = *p++;
109 n <<= 8; n += c;
110 }
111 return n;
112 }
113
114 static int
sjis_code_to_mbc(OnigCodePoint code,UChar * buf)115 sjis_code_to_mbc(OnigCodePoint code, UChar *buf)
116 {
117 UChar *p = buf;
118
119 if ((code & 0xff00) != 0) *p++ = (UChar )(((code >> 8) & 0xff));
120 *p++ = (UChar )(code & 0xff);
121
122 #if 0
123 if (enc_len(ONIG_ENCODING_SJIS, buf) != (p - buf))
124 return REGERR_INVALID_WIDE_CHAR_VALUE;
125 #endif
126 return p - buf;
127 }
128
129 static int
sjis_mbc_to_normalize(OnigAmbigType flag,const UChar ** pp,const UChar * end,UChar * lower)130 sjis_mbc_to_normalize(OnigAmbigType flag,
131 const UChar** pp, const UChar* end, UChar* lower)
132 {
133 const UChar* p = *pp;
134
135 if (ONIGENC_IS_MBC_ASCII(p)) {
136 if ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0) {
137 *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
138 }
139 else {
140 *lower = *p;
141 }
142
143 (*pp)++;
144 return 1;
145 }
146 else {
147 int len = enc_len(ONIG_ENCODING_SJIS, p);
148
149 if (lower != p) {
150 int i;
151 for (i = 0; i < len; i++) {
152 *lower++ = *p++;
153 }
154 }
155 (*pp) += len;
156 return len; /* return byte length of converted char to lower */
157 }
158 }
159
160 static int
sjis_is_mbc_ambiguous(OnigAmbigType flag,const UChar ** pp,const UChar * end)161 sjis_is_mbc_ambiguous(OnigAmbigType flag, const UChar** pp, const UChar* end)
162 {
163 return onigenc_mbn_is_mbc_ambiguous(ONIG_ENCODING_SJIS, flag, pp, end);
164
165 }
166
167 static int
sjis_is_code_ctype(OnigCodePoint code,unsigned int ctype)168 sjis_is_code_ctype(OnigCodePoint code, unsigned int ctype)
169 {
170 if (code < 128)
171 return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
172 else {
173 if ((ctype & (ONIGENC_CTYPE_WORD |
174 ONIGENC_CTYPE_GRAPH | ONIGENC_CTYPE_PRINT)) != 0) {
175 return (sjis_code_to_mbclen(code) > 1 ? TRUE : FALSE);
176 }
177 }
178
179 return FALSE;
180 }
181
182 static UChar*
sjis_left_adjust_char_head(const UChar * start,const UChar * s)183 sjis_left_adjust_char_head(const UChar* start, const UChar* s)
184 {
185 const UChar *p;
186 int len;
187
188 if (s <= start) return (UChar* )s;
189 p = s;
190
191 if (SJIS_ISMB_TRAIL(*p)) {
192 while (p > start) {
193 if (! SJIS_ISMB_FIRST(*--p)) {
194 p++;
195 break;
196 }
197 }
198 }
199 len = enc_len(ONIG_ENCODING_SJIS, p);
200 if (p + len > s) return (UChar* )p;
201 p += len;
202 return (UChar* )(p + ((s - p) & ~1));
203 }
204
205 static int
sjis_is_allowed_reverse_match(const UChar * s,const UChar * end)206 sjis_is_allowed_reverse_match(const UChar* s, const UChar* end)
207 {
208 const UChar c = *s;
209 return (SJIS_ISMB_TRAIL(c) ? FALSE : TRUE);
210 }
211
212 OnigEncodingType OnigEncodingSJIS = {
213 sjis_mbc_enc_len,
214 "Shift_JIS", /* name */
215 2, /* max byte length */
216 1, /* min byte length */
217 ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE,
218 {
219 (OnigCodePoint )'\\' /* esc */
220 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */
221 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */
222 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */
223 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */
224 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */
225 },
226 onigenc_is_mbc_newline_0x0a,
227 sjis_mbc_to_code,
228 sjis_code_to_mbclen,
229 sjis_code_to_mbc,
230 sjis_mbc_to_normalize,
231 sjis_is_mbc_ambiguous,
232 onigenc_ascii_get_all_pair_ambig_codes,
233 onigenc_nothing_get_all_comp_ambig_codes,
234 sjis_is_code_ctype,
235 onigenc_not_support_get_ctype_code_range,
236 sjis_left_adjust_char_head,
237 sjis_is_allowed_reverse_match
238 };
239