1 /**********************************************************************
2 regenc.c - Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5 * Copyright (c) 2002-2016 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30 #include "regint.h"
31
/* Global default character encoding. It is returned by
   onigenc_get_default_encoding() and overwritten by
   onigenc_set_default_encoding(). */
OnigEncoding OnigEncDefaultCharEncoding = ONIG_ENCODING_INIT_DEFAULT;
33
/*
 * Encoding-layer initialization entry point.
 * Performs no work; always returns 0.
 */
extern int
onigenc_init(void)
{
  return 0;
}
39
40 extern int
onig_initialize_encoding(OnigEncoding enc)41 onig_initialize_encoding(OnigEncoding enc)
42 {
43 if (enc->init != 0 && (enc->is_initialized() == 0)) {
44 int r = (enc->init)();
45 return r;
46 }
47
48 return 0;
49 }
50
51 extern OnigEncoding
onigenc_get_default_encoding(void)52 onigenc_get_default_encoding(void)
53 {
54 return OnigEncDefaultCharEncoding;
55 }
56
57 extern int
onigenc_set_default_encoding(OnigEncoding enc)58 onigenc_set_default_encoding(OnigEncoding enc)
59 {
60 OnigEncDefaultCharEncoding = enc;
61 return 0;
62 }
63
64 extern UChar*
onigenc_get_right_adjust_char_head(OnigEncoding enc,const UChar * start,const UChar * s)65 onigenc_get_right_adjust_char_head(OnigEncoding enc, const UChar* start, const UChar* s)
66 {
67 UChar* p = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s);
68 if (p < s) {
69 p += enclen(enc, p);
70 }
71 return p;
72 }
73
74 extern UChar*
onigenc_get_right_adjust_char_head_with_prev(OnigEncoding enc,const UChar * start,const UChar * s,const UChar ** prev)75 onigenc_get_right_adjust_char_head_with_prev(OnigEncoding enc,
76 const UChar* start, const UChar* s, const UChar** prev)
77 {
78 UChar* p = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s);
79
80 if (p < s) {
81 if (prev) *prev = (const UChar* )p;
82 p += enclen(enc, p);
83 }
84 else {
85 if (prev) *prev = (const UChar* )NULL; /* Sorry */
86 }
87 return p;
88 }
89
90 extern UChar*
onigenc_get_prev_char_head(OnigEncoding enc,const UChar * start,const UChar * s)91 onigenc_get_prev_char_head(OnigEncoding enc, const UChar* start, const UChar* s)
92 {
93 if (s <= start)
94 return (UChar* )NULL;
95
96 return ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s - 1);
97 }
98
99 extern UChar*
onigenc_step_back(OnigEncoding enc,const UChar * start,const UChar * s,int n)100 onigenc_step_back(OnigEncoding enc, const UChar* start, const UChar* s, int n)
101 {
102 while (ONIG_IS_NOT_NULL(s) && n-- > 0) {
103 if (s <= start)
104 return (UChar* )NULL;
105
106 s = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s - 1);
107 }
108 return (UChar* )s;
109 }
110
/* Disabled (compiled out by "#if 0").
   Length of the character at p as reported by ONIGENC_MBC_ENC_LEN,
   limited to the number of bytes between p and end. */
#if 0
extern int
onigenc_mbc_enc_len_end(OnigEncoding enc, const UChar* p, const UChar* end)
{
  int len;
  int n;

  len = ONIGENC_MBC_ENC_LEN(enc, p);
  n = (int )(end - p);   /* bytes available before end */

  return (n < len ? n : len);
}
#endif
124
125 extern UChar*
onigenc_step(OnigEncoding enc,const UChar * p,const UChar * end,int n)126 onigenc_step(OnigEncoding enc, const UChar* p, const UChar* end, int n)
127 {
128 UChar* q = (UChar* )p;
129 while (n-- > 0) {
130 q += ONIGENC_MBC_ENC_LEN(enc, q);
131 }
132 return (q <= end ? q : NULL);
133 }
134
135 extern int
onigenc_strlen(OnigEncoding enc,const UChar * p,const UChar * end)136 onigenc_strlen(OnigEncoding enc, const UChar* p, const UChar* end)
137 {
138 int n = 0;
139 UChar* q = (UChar* )p;
140
141 while (q < end) {
142 q += ONIGENC_MBC_ENC_LEN(enc, q);
143 n++;
144 }
145 return n;
146 }
147
148 extern int
onigenc_strlen_null(OnigEncoding enc,const UChar * s)149 onigenc_strlen_null(OnigEncoding enc, const UChar* s)
150 {
151 int n = 0;
152 UChar* p = (UChar* )s;
153
154 while (1) {
155 if (*p == '\0') {
156 UChar* q;
157 int len = ONIGENC_MBC_MINLEN(enc);
158
159 if (len == 1) return n;
160 q = p + 1;
161 while (len > 1) {
162 if (*q != '\0') break;
163 q++;
164 len--;
165 }
166 if (len == 1) return n;
167 }
168 p += ONIGENC_MBC_ENC_LEN(enc, p);
169 n++;
170 }
171 }
172
173 extern int
onigenc_str_bytelen_null(OnigEncoding enc,const UChar * s)174 onigenc_str_bytelen_null(OnigEncoding enc, const UChar* s)
175 {
176 UChar* start = (UChar* )s;
177 UChar* p = (UChar* )s;
178
179 while (1) {
180 if (*p == '\0') {
181 UChar* q;
182 int len = ONIGENC_MBC_MINLEN(enc);
183
184 if (len == 1) return (int )(p - start);
185 q = p + 1;
186 while (len > 1) {
187 if (*q != '\0') break;
188 q++;
189 len--;
190 }
191 if (len == 1) return (int )(p - start);
192 }
193 p += ONIGENC_MBC_ENC_LEN(enc, p);
194 }
195 }
196
/* Byte-indexed ASCII lower-casing table (256 entries, octal escapes).
   Entries 0101-0132 ('A'-'Z') hold 0141-0172 ('a'-'z'); every other
   entry holds its own index. */
const UChar OnigEncAsciiToLowerCaseTable[] = {
  '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
  '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
  '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
  '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
  '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
  '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
  '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
  '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
  '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
  '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
  '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
  '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
  '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
  '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
  '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
  '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
  '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
  '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
  '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
  '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
  '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
  '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
  '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
  '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
  '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
  '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
  '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
  '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
  '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
  '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
  '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
  '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377',
};
231
/* Byte-indexed ASCII upper-casing table, compiled only when
   USE_UPPER_CASE_TABLE is defined.
   Entries 0141-0172 ('a'-'z') hold 0101-0132 ('A'-'Z'); every other
   entry holds its own index. */
#ifdef USE_UPPER_CASE_TABLE
const UChar OnigEncAsciiToUpperCaseTable[256] = {
  '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
  '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
  '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
  '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
  '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
  '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
  '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
  '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
  '\100', '\101', '\102', '\103', '\104', '\105', '\106', '\107',
  '\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117',
  '\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127',
  '\130', '\131', '\132', '\133', '\134', '\135', '\136', '\137',
  '\140', '\101', '\102', '\103', '\104', '\105', '\106', '\107',
  '\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117',
  '\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127',
  '\130', '\131', '\132', '\173', '\174', '\175', '\176', '\177',
  '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
  '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
  '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
  '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
  '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
  '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
  '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
  '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
  '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
  '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
  '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
  '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
  '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
  '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
  '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
  '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377',
};
#endif
268
/* Byte-indexed table of 16-bit character-type masks (256 entries).
   Entries for byte values 0x80-0xff are all zero.
   NOTE(review): the meaning of the individual bits is defined outside
   this file (presumably by the ONIGENC_CTYPE_* constants) -- confirm
   before editing any value. */
const unsigned short OnigEncAsciiCtypeTable[256] = {
  0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
  0x4008, 0x420c, 0x4209, 0x4208, 0x4208, 0x4208, 0x4008, 0x4008,
  0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
  0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
  0x4284, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
  0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
  0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0,
  0x78b0, 0x78b0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
  0x41a0, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x74a2,
  0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
  0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
  0x74a2, 0x74a2, 0x74a2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x51a0,
  0x41a0, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x70e2,
  0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
  0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
  0x70e2, 0x70e2, 0x70e2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x4008,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000
};
303
/* Byte-indexed ISO-8859-1 lower-casing table (256 entries).
   In addition to the ASCII mapping 0101-0132 -> 0141-0172, entries
   0300-0336 hold 0340-0376, except 0327 which holds itself.
   Entry 0337 also holds itself; every remaining entry holds its own
   index. */
const UChar OnigEncISO_8859_1_ToLowerCaseTable[256] = {
  '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
  '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
  '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
  '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
  '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
  '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
  '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
  '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
  '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
  '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
  '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
  '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
  '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
  '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
  '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
  '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
  '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
  '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
  '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
  '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
  '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
  '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
  '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
  '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
  '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
  '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
  '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\327',
  '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\337',
  '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
  '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
  '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
  '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377'
};
338
/* Byte-indexed ISO-8859-1 upper-casing table, compiled only when
   USE_UPPER_CASE_TABLE is defined.
   In addition to the ASCII mapping 0141-0172 -> 0101-0132, entries
   0340-0376 hold 0300-0336, except 0367 which holds itself.
   Entry 0377 also holds itself; every remaining entry holds its own
   index. */
#ifdef USE_UPPER_CASE_TABLE
const UChar OnigEncISO_8859_1_ToUpperCaseTable[256] = {
  '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
  '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
  '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
  '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
  '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
  '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
  '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
  '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
  '\100', '\101', '\102', '\103', '\104', '\105', '\106', '\107',
  '\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117',
  '\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127',
  '\130', '\131', '\132', '\133', '\134', '\135', '\136', '\137',
  '\140', '\101', '\102', '\103', '\104', '\105', '\106', '\107',
  '\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117',
  '\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127',
  '\130', '\131', '\132', '\173', '\174', '\175', '\176', '\177',
  '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
  '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
  '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
  '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
  '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
  '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
  '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
  '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
  '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
  '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
  '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
  '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
  '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
  '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
  '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\367',
  '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\377',
};
#endif
375
376 extern void
onigenc_set_default_caseconv_table(const UChar * table ARG_UNUSED)377 onigenc_set_default_caseconv_table(const UChar* table ARG_UNUSED)
378 {
379 /* nothing */
380 /* obsoleted. */
381 }
382
383 extern UChar*
onigenc_get_left_adjust_char_head(OnigEncoding enc,const UChar * start,const UChar * s)384 onigenc_get_left_adjust_char_head(OnigEncoding enc, const UChar* start, const UChar* s)
385 {
386 return ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s);
387 }
388
/* ASCII case pairs: each entry is { upper-case code, lower-case code }
   for 'A'..'Z' (0x41-0x5a) and 'a'..'z' (0x61-0x7a).
   onigenc_ascii_apply_all_case_fold() iterates over this table. */
const OnigPairCaseFoldCodes OnigAsciiLowerMap[] = {
  { 0x41, 0x61 },
  { 0x42, 0x62 },
  { 0x43, 0x63 },
  { 0x44, 0x64 },
  { 0x45, 0x65 },
  { 0x46, 0x66 },
  { 0x47, 0x67 },
  { 0x48, 0x68 },
  { 0x49, 0x69 },
  { 0x4a, 0x6a },
  { 0x4b, 0x6b },
  { 0x4c, 0x6c },
  { 0x4d, 0x6d },
  { 0x4e, 0x6e },
  { 0x4f, 0x6f },
  { 0x50, 0x70 },
  { 0x51, 0x71 },
  { 0x52, 0x72 },
  { 0x53, 0x73 },
  { 0x54, 0x74 },
  { 0x55, 0x75 },
  { 0x56, 0x76 },
  { 0x57, 0x77 },
  { 0x58, 0x78 },
  { 0x59, 0x79 },
  { 0x5a, 0x7a }
};
417
418 extern int
onigenc_ascii_apply_all_case_fold(OnigCaseFoldType flag ARG_UNUSED,OnigApplyAllCaseFoldFunc f,void * arg)419 onigenc_ascii_apply_all_case_fold(OnigCaseFoldType flag ARG_UNUSED,
420 OnigApplyAllCaseFoldFunc f, void* arg)
421 {
422 OnigCodePoint code;
423 int i, r;
424
425 for (i = 0;
426 i < (int )(sizeof(OnigAsciiLowerMap)/sizeof(OnigPairCaseFoldCodes));
427 i++) {
428 code = OnigAsciiLowerMap[i].to;
429 r = (*f)(OnigAsciiLowerMap[i].from, &code, 1, arg);
430 if (r != 0) return r;
431
432 code = OnigAsciiLowerMap[i].from;
433 r = (*f)(OnigAsciiLowerMap[i].to, &code, 1, arg);
434 if (r != 0) return r;
435 }
436
437 return 0;
438 }
439
440 extern int
onigenc_ascii_get_case_fold_codes_by_str(OnigCaseFoldType flag ARG_UNUSED,const OnigUChar * p,const OnigUChar * end ARG_UNUSED,OnigCaseFoldCodeItem items[])441 onigenc_ascii_get_case_fold_codes_by_str(OnigCaseFoldType flag ARG_UNUSED,
442 const OnigUChar* p, const OnigUChar* end ARG_UNUSED,
443 OnigCaseFoldCodeItem items[])
444 {
445 if (0x41 <= *p && *p <= 0x5a) {
446 items[0].byte_len = 1;
447 items[0].code_len = 1;
448 items[0].code[0] = (OnigCodePoint )(*p + 0x20);
449 return 1;
450 }
451 else if (0x61 <= *p && *p <= 0x7a) {
452 items[0].byte_len = 1;
453 items[0].code_len = 1;
454 items[0].code[0] = (OnigCodePoint )(*p - 0x20);
455 return 1;
456 }
457 else
458 return 0;
459 }
460
461 static int
ss_apply_all_case_fold(OnigCaseFoldType flag ARG_UNUSED,OnigApplyAllCaseFoldFunc f,void * arg)462 ss_apply_all_case_fold(OnigCaseFoldType flag ARG_UNUSED,
463 OnigApplyAllCaseFoldFunc f, void* arg)
464 {
465 static OnigCodePoint ss[] = { 0x73, 0x73 };
466
467 return (*f)((OnigCodePoint )0xdf, ss, 2, arg);
468 }
469
470 extern int
onigenc_apply_all_case_fold_with_map(int map_size,const OnigPairCaseFoldCodes map[],int ess_tsett_flag,OnigCaseFoldType flag,OnigApplyAllCaseFoldFunc f,void * arg)471 onigenc_apply_all_case_fold_with_map(int map_size,
472 const OnigPairCaseFoldCodes map[],
473 int ess_tsett_flag, OnigCaseFoldType flag,
474 OnigApplyAllCaseFoldFunc f, void* arg)
475 {
476 OnigCodePoint code;
477 int i, r;
478
479 r = onigenc_ascii_apply_all_case_fold(flag, f, arg);
480 if (r != 0) return r;
481
482 for (i = 0; i < map_size; i++) {
483 code = map[i].to;
484 r = (*f)(map[i].from, &code, 1, arg);
485 if (r != 0) return r;
486
487 code = map[i].from;
488 r = (*f)(map[i].to, &code, 1, arg);
489 if (r != 0) return r;
490 }
491
492 if (ess_tsett_flag != 0)
493 return ss_apply_all_case_fold(flag, f, arg);
494
495 return 0;
496 }
497
498 extern int
onigenc_get_case_fold_codes_by_str_with_map(int map_size,const OnigPairCaseFoldCodes map[],int ess_tsett_flag,OnigCaseFoldType flag ARG_UNUSED,const OnigUChar * p,const OnigUChar * end,OnigCaseFoldCodeItem items[])499 onigenc_get_case_fold_codes_by_str_with_map(int map_size,
500 const OnigPairCaseFoldCodes map[],
501 int ess_tsett_flag, OnigCaseFoldType flag ARG_UNUSED,
502 const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[])
503 {
504 if (0x41 <= *p && *p <= 0x5a) {
505 items[0].byte_len = 1;
506 items[0].code_len = 1;
507 items[0].code[0] = (OnigCodePoint )(*p + 0x20);
508 if (*p == 0x53 && ess_tsett_flag != 0 && end > p + 1
509 && (*(p+1) == 0x53 || *(p+1) == 0x73)) {
510 /* SS */
511 items[1].byte_len = 2;
512 items[1].code_len = 1;
513 items[1].code[0] = (OnigCodePoint )0xdf;
514 return 2;
515 }
516 else
517 return 1;
518 }
519 else if (0x61 <= *p && *p <= 0x7a) {
520 items[0].byte_len = 1;
521 items[0].code_len = 1;
522 items[0].code[0] = (OnigCodePoint )(*p - 0x20);
523 if (*p == 0x73 && ess_tsett_flag != 0 && end > p + 1
524 && (*(p+1) == 0x73 || *(p+1) == 0x53)) {
525 /* ss */
526 items[1].byte_len = 2;
527 items[1].code_len = 1;
528 items[1].code[0] = (OnigCodePoint )0xdf;
529 return 2;
530 }
531 else
532 return 1;
533 }
534 else if (*p == 0xdf && ess_tsett_flag != 0) {
535 items[0].byte_len = 1;
536 items[0].code_len = 2;
537 items[0].code[0] = (OnigCodePoint )'s';
538 items[0].code[1] = (OnigCodePoint )'s';
539
540 items[1].byte_len = 1;
541 items[1].code_len = 2;
542 items[1].code[0] = (OnigCodePoint )'S';
543 items[1].code[1] = (OnigCodePoint )'S';
544
545 items[2].byte_len = 1;
546 items[2].code_len = 2;
547 items[2].code[0] = (OnigCodePoint )'s';
548 items[2].code[1] = (OnigCodePoint )'S';
549
550 items[3].byte_len = 1;
551 items[3].code_len = 2;
552 items[3].code[0] = (OnigCodePoint )'S';
553 items[3].code[1] = (OnigCodePoint )'s';
554
555 return 4;
556 }
557 else {
558 int i;
559
560 for (i = 0; i < map_size; i++) {
561 if (*p == map[i].from) {
562 items[0].byte_len = 1;
563 items[0].code_len = 1;
564 items[0].code[0] = map[i].to;
565 return 1;
566 }
567 else if (*p == map[i].to) {
568 items[0].byte_len = 1;
569 items[0].code_len = 1;
570 items[0].code[0] = map[i].from;
571 return 1;
572 }
573 }
574 }
575
576 return 0;
577 }
578
579
580 extern int
onigenc_not_support_get_ctype_code_range(OnigCtype ctype ARG_UNUSED,OnigCodePoint * sb_out ARG_UNUSED,const OnigCodePoint * ranges[]ARG_UNUSED)581 onigenc_not_support_get_ctype_code_range(OnigCtype ctype ARG_UNUSED,
582 OnigCodePoint* sb_out ARG_UNUSED,
583 const OnigCodePoint* ranges[] ARG_UNUSED)
584 {
585 return ONIG_NO_SUPPORT_CONFIG;
586 }
587
588 extern int
onigenc_is_mbc_newline_0x0a(const UChar * p,const UChar * end)589 onigenc_is_mbc_newline_0x0a(const UChar* p, const UChar* end)
590 {
591 if (p < end) {
592 if (*p == 0x0a) return 1;
593 }
594 return 0;
595 }
596
597 /* for single byte encodings */
598 extern int
onigenc_ascii_mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED,const UChar ** p,const UChar * end ARG_UNUSED,UChar * lower)599 onigenc_ascii_mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED, const UChar** p,
600 const UChar*end ARG_UNUSED, UChar* lower)
601 {
602 *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(**p);
603
604 (*p)++;
605 return 1; /* return byte length of converted char to lower */
606 }
607
/* Disabled (compiled out by "#if 0").
   Advances *pp by one byte and returns the result of
   ONIGENC_IS_ASCII_CODE_CASE_AMBIG for the byte that was skipped. */
#if 0
extern int
onigenc_ascii_is_mbc_ambiguous(OnigCaseFoldType flag,
                               const UChar** pp, const UChar* end)
{
  const UChar* p = *pp;

  (*pp)++;
  return ONIGENC_IS_ASCII_CODE_CASE_AMBIG(*p);
}
#endif
619
620 extern int
onigenc_single_byte_mbc_enc_len(const UChar * p ARG_UNUSED)621 onigenc_single_byte_mbc_enc_len(const UChar* p ARG_UNUSED)
622 {
623 return 1;
624 }
625
626 extern OnigCodePoint
onigenc_single_byte_mbc_to_code(const UChar * p,const UChar * end ARG_UNUSED)627 onigenc_single_byte_mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED)
628 {
629 return (OnigCodePoint )(*p);
630 }
631
632 extern int
onigenc_single_byte_code_to_mbclen(OnigCodePoint code ARG_UNUSED)633 onigenc_single_byte_code_to_mbclen(OnigCodePoint code ARG_UNUSED)
634 {
635 return (code < 0x100 ? 1 : ONIGERR_INVALID_CODE_POINT_VALUE);
636 }
637
638 extern int
onigenc_single_byte_code_to_mbc(OnigCodePoint code,UChar * buf)639 onigenc_single_byte_code_to_mbc(OnigCodePoint code, UChar *buf)
640 {
641 *buf = (UChar )(code & 0xff);
642 return 1;
643 }
644
645 extern UChar*
onigenc_single_byte_left_adjust_char_head(const UChar * start ARG_UNUSED,const UChar * s)646 onigenc_single_byte_left_adjust_char_head(const UChar* start ARG_UNUSED,
647 const UChar* s)
648 {
649 return (UChar* )s;
650 }
651
652 extern int
onigenc_always_true_is_allowed_reverse_match(const UChar * s ARG_UNUSED,const UChar * end ARG_UNUSED)653 onigenc_always_true_is_allowed_reverse_match(const UChar* s ARG_UNUSED,
654 const UChar* end ARG_UNUSED)
655 {
656 return TRUE;
657 }
658
659 extern int
onigenc_always_false_is_allowed_reverse_match(const UChar * s ARG_UNUSED,const UChar * end ARG_UNUSED)660 onigenc_always_false_is_allowed_reverse_match(const UChar* s ARG_UNUSED,
661 const UChar* end ARG_UNUSED)
662 {
663 return FALSE;
664 }
665
666 extern int
onigenc_always_true_is_valid_mbc_string(const UChar * s ARG_UNUSED,const UChar * end ARG_UNUSED)667 onigenc_always_true_is_valid_mbc_string(const UChar* s ARG_UNUSED,
668 const UChar* end ARG_UNUSED)
669 {
670 return TRUE;
671 }
672
673 extern int
onigenc_length_check_is_valid_mbc_string(OnigEncoding enc,const UChar * p,const UChar * end)674 onigenc_length_check_is_valid_mbc_string(OnigEncoding enc,
675 const UChar* p, const UChar* end)
676 {
677 while (p < end) {
678 p += enclen(enc, p);
679 }
680
681 if (p != end)
682 return FALSE;
683 else
684 return TRUE;
685 }
686
687 extern int
onigenc_is_valid_mbc_string(OnigEncoding enc,const UChar * s,const UChar * end)688 onigenc_is_valid_mbc_string(OnigEncoding enc, const UChar* s, const UChar* end)
689 {
690 return ONIGENC_IS_VALID_MBC_STRING(enc, s, end);
691 }
692
693 extern OnigCodePoint
onigenc_mbn_mbc_to_code(OnigEncoding enc,const UChar * p,const UChar * end)694 onigenc_mbn_mbc_to_code(OnigEncoding enc, const UChar* p, const UChar* end)
695 {
696 int c, i, len;
697 OnigCodePoint n;
698
699 len = enclen(enc, p);
700 n = (OnigCodePoint )(*p++);
701 if (len == 1) return n;
702
703 for (i = 1; i < len; i++) {
704 if (p >= end) break;
705 c = *p++;
706 n <<= 8; n += c;
707 }
708 return n;
709 }
710
711 extern int
onigenc_mbn_mbc_case_fold(OnigEncoding enc,OnigCaseFoldType flag ARG_UNUSED,const UChar ** pp,const UChar * end ARG_UNUSED,UChar * lower)712 onigenc_mbn_mbc_case_fold(OnigEncoding enc, OnigCaseFoldType flag ARG_UNUSED,
713 const UChar** pp, const UChar* end ARG_UNUSED,
714 UChar* lower)
715 {
716 int len;
717 const UChar *p = *pp;
718
719 if (ONIGENC_IS_MBC_ASCII(p)) {
720 *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
721 (*pp)++;
722 return 1;
723 }
724 else {
725 int i;
726
727 len = enclen(enc, p);
728 for (i = 0; i < len; i++) {
729 *lower++ = *p++;
730 }
731 (*pp) += len;
732 return len; /* return byte length of converted to lower char */
733 }
734 }
735
/* Disabled (compiled out by "#if 0").
   For an ASCII character: advances *pp by one byte and returns
   ONIGENC_IS_ASCII_CODE_CASE_AMBIG for it. For any other character:
   advances *pp by enclen() bytes and returns FALSE. */
#if 0
extern int
onigenc_mbn_is_mbc_ambiguous(OnigEncoding enc, OnigCaseFoldType flag,
                             const UChar** pp, const UChar* end)
{
  const UChar* p = *pp;

  if (ONIGENC_IS_MBC_ASCII(p)) {
    (*pp)++;
    return ONIGENC_IS_ASCII_CODE_CASE_AMBIG(*p);
  }

  (*pp) += enclen(enc, p);
  return FALSE;
}
#endif
752
753 extern int
onigenc_mb2_code_to_mbclen(OnigCodePoint code)754 onigenc_mb2_code_to_mbclen(OnigCodePoint code)
755 {
756 if ((code & 0xff00) != 0) return 2;
757 else return 1;
758 }
759
760 extern int
onigenc_mb4_code_to_mbclen(OnigCodePoint code)761 onigenc_mb4_code_to_mbclen(OnigCodePoint code)
762 {
763 if ((code & 0xff000000) != 0) return 4;
764 else if ((code & 0xff0000) != 0) return 3;
765 else if ((code & 0xff00) != 0) return 2;
766 else return 1;
767 }
768
769 extern int
onigenc_mb2_code_to_mbc(OnigEncoding enc,OnigCodePoint code,UChar * buf)770 onigenc_mb2_code_to_mbc(OnigEncoding enc, OnigCodePoint code, UChar *buf)
771 {
772 UChar *p = buf;
773
774 if ((code & 0xff00) != 0) {
775 *p++ = (UChar )((code >> 8) & 0xff);
776 }
777 *p++ = (UChar )(code & 0xff);
778
779 #if 1
780 if (enclen(enc, buf) != (p - buf))
781 return ONIGERR_INVALID_CODE_POINT_VALUE;
782 #endif
783 return p - buf;
784 }
785
786 extern int
onigenc_mb4_code_to_mbc(OnigEncoding enc,OnigCodePoint code,UChar * buf)787 onigenc_mb4_code_to_mbc(OnigEncoding enc, OnigCodePoint code, UChar *buf)
788 {
789 UChar *p = buf;
790
791 if ((code & 0xff000000) != 0) {
792 *p++ = (UChar )((code >> 24) & 0xff);
793 }
794 if ((code & 0xff0000) != 0 || p != buf) {
795 *p++ = (UChar )((code >> 16) & 0xff);
796 }
797 if ((code & 0xff00) != 0 || p != buf) {
798 *p++ = (UChar )((code >> 8) & 0xff);
799 }
800 *p++ = (UChar )(code & 0xff);
801
802 #if 1
803 if (enclen(enc, buf) != (p - buf))
804 return ONIGERR_INVALID_CODE_POINT_VALUE;
805 #endif
806 return p - buf;
807 }
808
809 extern int
onigenc_minimum_property_name_to_ctype(OnigEncoding enc,UChar * p,UChar * end)810 onigenc_minimum_property_name_to_ctype(OnigEncoding enc, UChar* p, UChar* end)
811 {
812 static PosixBracketEntryType PBS[] = {
813 { (UChar* )"Alnum", ONIGENC_CTYPE_ALNUM, 5 },
814 { (UChar* )"Alpha", ONIGENC_CTYPE_ALPHA, 5 },
815 { (UChar* )"Blank", ONIGENC_CTYPE_BLANK, 5 },
816 { (UChar* )"Cntrl", ONIGENC_CTYPE_CNTRL, 5 },
817 { (UChar* )"Digit", ONIGENC_CTYPE_DIGIT, 5 },
818 { (UChar* )"Graph", ONIGENC_CTYPE_GRAPH, 5 },
819 { (UChar* )"Lower", ONIGENC_CTYPE_LOWER, 5 },
820 { (UChar* )"Print", ONIGENC_CTYPE_PRINT, 5 },
821 { (UChar* )"Punct", ONIGENC_CTYPE_PUNCT, 5 },
822 { (UChar* )"Space", ONIGENC_CTYPE_SPACE, 5 },
823 { (UChar* )"Upper", ONIGENC_CTYPE_UPPER, 5 },
824 { (UChar* )"XDigit", ONIGENC_CTYPE_XDIGIT, 6 },
825 { (UChar* )"ASCII", ONIGENC_CTYPE_ASCII, 5 },
826 { (UChar* )"Word", ONIGENC_CTYPE_WORD, 4 },
827 { (UChar* )NULL, -1, 0 }
828 };
829
830 PosixBracketEntryType *pb;
831 int len;
832
833 len = onigenc_strlen(enc, p, end);
834 for (pb = PBS; IS_NOT_NULL(pb->name); pb++) {
835 if (len == pb->len &&
836 onigenc_with_ascii_strncmp(enc, p, end, pb->name, pb->len) == 0)
837 return pb->ctype;
838 }
839
840 return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
841 }
842
843 extern int
onigenc_mb2_is_code_ctype(OnigEncoding enc,OnigCodePoint code,unsigned int ctype)844 onigenc_mb2_is_code_ctype(OnigEncoding enc, OnigCodePoint code,
845 unsigned int ctype)
846 {
847 if (code < 128)
848 return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
849 else {
850 if (CTYPE_IS_WORD_GRAPH_PRINT(ctype)) {
851 return (ONIGENC_CODE_TO_MBCLEN(enc, code) > 1 ? TRUE : FALSE);
852 }
853 }
854
855 return FALSE;
856 }
857
858 extern int
onigenc_mb4_is_code_ctype(OnigEncoding enc,OnigCodePoint code,unsigned int ctype)859 onigenc_mb4_is_code_ctype(OnigEncoding enc, OnigCodePoint code,
860 unsigned int ctype)
861 {
862 if (code < 128)
863 return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
864 else {
865 if (CTYPE_IS_WORD_GRAPH_PRINT(ctype)) {
866 return (ONIGENC_CODE_TO_MBCLEN(enc, code) > 1 ? TRUE : FALSE);
867 }
868 }
869
870 return FALSE;
871 }
872
873 extern int
onigenc_with_ascii_strncmp(OnigEncoding enc,const UChar * p,const UChar * end,const UChar * sascii,int n)874 onigenc_with_ascii_strncmp(OnigEncoding enc, const UChar* p, const UChar* end,
875 const UChar* sascii /* ascii */, int n)
876 {
877 int x, c;
878
879 while (n-- > 0) {
880 if (p >= end) return (int )(*sascii);
881
882 c = (int )ONIGENC_MBC_TO_CODE(enc, p, end);
883 x = *sascii - c;
884 if (x) return x;
885
886 sascii++;
887 p += enclen(enc, p);
888 }
889 return 0;
890 }
891
892 extern int
onig_codes_cmp(OnigCodePoint a[],OnigCodePoint b[],int n)893 onig_codes_cmp(OnigCodePoint a[], OnigCodePoint b[], int n)
894 {
895 int i;
896
897 for (i = 0; i < n; i++) {
898 if (a[i] != b[i])
899 return -1;
900 }
901
902 return 0;
903 }
904
905 extern int
onig_codes_byte_at(OnigCodePoint codes[],int at)906 onig_codes_byte_at(OnigCodePoint codes[], int at)
907 {
908 int index;
909 int b;
910 OnigCodePoint code;
911
912 index = at / 3;
913 b = at % 3;
914 code = codes[index];
915
916 return ((code >> ((2 - b) * 8)) & 0xff);
917 }
918