1 /**********************************************************************
2 regenc.c - Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5 * Copyright (c) 2002-2019 K.Kosako
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30 #include "regint.h"
31
/* Process-wide default character encoding.  Read by
   onigenc_get_default_encoding() and replaced by
   onigenc_set_default_encoding(). */
OnigEncoding OnigEncDefaultCharEncoding = ONIG_ENCODING_INIT_DEFAULT;

/* Number of slots in the table of initialized encodings. */
#define INITED_LIST_SIZE  20

/* Number of InitedList entries currently in use. */
static int InitedListNum;

/* Table of encodings registered through enc_inited_entry(). */
static struct {
  OnigEncoding enc;   /* encoding this entry describes */
  int inited;         /* set to 1 when the encoding is marked initialized */
} InitedList[INITED_LIST_SIZE];
42
43 static int
enc_inited_entry(OnigEncoding enc)44 enc_inited_entry(OnigEncoding enc)
45 {
46 int i;
47
48 for (i = 0; i < InitedListNum; i++) {
49 if (InitedList[i].enc == enc) {
50 InitedList[i].inited = 1;
51 return i;
52 }
53 }
54
55 i = InitedListNum;
56 if (i < INITED_LIST_SIZE - 1) {
57 InitedList[i].enc = enc;
58 InitedList[i].inited = 1;
59 InitedListNum++;
60 return i;
61 }
62
63 return -1;
64 }
65
66 static int
enc_is_inited(OnigEncoding enc)67 enc_is_inited(OnigEncoding enc)
68 {
69 int i;
70
71 for (i = 0; i < InitedListNum; i++) {
72 if (InitedList[i].enc == enc) {
73 return InitedList[i].inited;
74 }
75 }
76
77 return 0;
78 }
79
/* Non-zero once onigenc_init() has run; reset to 0 by onigenc_end(). */
static int OnigEncInited;

/*
 * Set the module-level "initialized" flag.  Calling it again while the flag
 * is already set does nothing.  Always returns 0.
 */
extern int
onigenc_init(void)
{
  if (OnigEncInited == 0) {
    OnigEncInited = 1;
  }

  return 0;
}
90
91 extern int
onigenc_end(void)92 onigenc_end(void)
93 {
94 int i;
95
96 for (i = 0; i < InitedListNum; i++) {
97 InitedList[i].enc = 0;
98 InitedList[i].inited = 0;
99 }
100 InitedListNum = 0;
101
102 OnigEncInited = 0;
103 return ONIG_NORMAL;
104 }
105
106 extern int
onig_initialize_encoding(OnigEncoding enc)107 onig_initialize_encoding(OnigEncoding enc)
108 {
109 int r;
110
111 if (enc != ONIG_ENCODING_ASCII &&
112 ONIGENC_IS_ASCII_COMPATIBLE_ENCODING(enc)) {
113 OnigEncoding ascii = ONIG_ENCODING_ASCII;
114 if (ascii->init != 0 && enc_is_inited(ascii) == 0) {
115 r = ascii->init();
116 if (r != ONIG_NORMAL) return r;
117 enc_inited_entry(ascii);
118 }
119 }
120
121 if (enc->init != 0 &&
122 enc_is_inited(enc) == 0) {
123 r = (enc->init)();
124 if (r == ONIG_NORMAL)
125 enc_inited_entry(enc);
126 return r;
127 }
128
129 return 0;
130 }
131
132 extern OnigEncoding
onigenc_get_default_encoding(void)133 onigenc_get_default_encoding(void)
134 {
135 return OnigEncDefaultCharEncoding;
136 }
137
/*
 * Replace the default encoding (OnigEncDefaultCharEncoding) with `enc`.
 * The argument is stored as given, without validation.  Always returns 0.
 */
extern int
onigenc_set_default_encoding(OnigEncoding enc)
{
  OnigEncDefaultCharEncoding = enc;
  return 0;
}
144
145 extern UChar*
onigenc_strdup(OnigEncoding enc,const UChar * s,const UChar * end)146 onigenc_strdup(OnigEncoding enc, const UChar* s, const UChar* end)
147 {
148 int slen, term_len, i;
149 UChar *r;
150
151 slen = (int )(end - s);
152 term_len = ONIGENC_MBC_MINLEN(enc);
153
154 r = (UChar* )xmalloc(slen + term_len);
155 CHECK_NULL_RETURN(r);
156 xmemcpy(r, s, slen);
157
158 for (i = 0; i < term_len; i++)
159 r[slen + i] = (UChar )0;
160
161 return r;
162 }
163
164 extern UChar*
onigenc_get_right_adjust_char_head(OnigEncoding enc,const UChar * start,const UChar * s)165 onigenc_get_right_adjust_char_head(OnigEncoding enc, const UChar* start, const UChar* s)
166 {
167 UChar* p = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s);
168 if (p < s) {
169 p += enclen(enc, p);
170 }
171 return p;
172 }
173
174 extern UChar*
onigenc_get_right_adjust_char_head_with_prev(OnigEncoding enc,const UChar * start,const UChar * s,const UChar ** prev)175 onigenc_get_right_adjust_char_head_with_prev(OnigEncoding enc,
176 const UChar* start, const UChar* s, const UChar** prev)
177 {
178 UChar* p = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s);
179
180 if (p < s) {
181 if (prev) *prev = (const UChar* )p;
182 p += enclen(enc, p);
183 }
184 else {
185 if (prev)
186 *prev = onigenc_get_prev_char_head(enc, start, p);
187 }
188 return p;
189 }
190
191 extern UChar*
onigenc_get_prev_char_head(OnigEncoding enc,const UChar * start,const UChar * s)192 onigenc_get_prev_char_head(OnigEncoding enc, const UChar* start, const UChar* s)
193 {
194 if (s <= start)
195 return (UChar* )NULL;
196
197 return ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s - 1);
198 }
199
200 extern UChar*
onigenc_step_back(OnigEncoding enc,const UChar * start,const UChar * s,int n)201 onigenc_step_back(OnigEncoding enc, const UChar* start, const UChar* s, int n)
202 {
203 while (ONIG_IS_NOT_NULL(s) && n-- > 0) {
204 if (s <= start)
205 return (UChar* )NULL;
206
207 s = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s - 1);
208 }
209 return (UChar* )s;
210 }
211
212 extern UChar*
onigenc_step(OnigEncoding enc,const UChar * p,const UChar * end,int n)213 onigenc_step(OnigEncoding enc, const UChar* p, const UChar* end, int n)
214 {
215 UChar* q = (UChar* )p;
216 while (n-- > 0) {
217 q += ONIGENC_MBC_ENC_LEN(enc, q);
218 }
219 return (q <= end ? q : NULL);
220 }
221
222 extern int
onigenc_strlen(OnigEncoding enc,const UChar * p,const UChar * end)223 onigenc_strlen(OnigEncoding enc, const UChar* p, const UChar* end)
224 {
225 int n = 0;
226 UChar* q = (UChar* )p;
227
228 while (q < end) {
229 q += ONIGENC_MBC_ENC_LEN(enc, q);
230 n++;
231 }
232 return n;
233 }
234
235 extern int
onigenc_strlen_null(OnigEncoding enc,const UChar * s)236 onigenc_strlen_null(OnigEncoding enc, const UChar* s)
237 {
238 int n = 0;
239 UChar* p = (UChar* )s;
240
241 while (1) {
242 if (*p == '\0') {
243 UChar* q;
244 int len = ONIGENC_MBC_MINLEN(enc);
245
246 if (len == 1) return n;
247 q = p + 1;
248 while (len > 1) {
249 if (*q != '\0') break;
250 q++;
251 len--;
252 }
253 if (len == 1) return n;
254 }
255 p += ONIGENC_MBC_ENC_LEN(enc, p);
256 n++;
257 }
258 }
259
260 extern int
onigenc_str_bytelen_null(OnigEncoding enc,const UChar * s)261 onigenc_str_bytelen_null(OnigEncoding enc, const UChar* s)
262 {
263 UChar* start = (UChar* )s;
264 UChar* p = (UChar* )s;
265
266 while (1) {
267 if (*p == '\0') {
268 UChar* q;
269 int len = ONIGENC_MBC_MINLEN(enc);
270
271 if (len == 1) return (int )(p - start);
272 q = p + 1;
273 while (len > 1) {
274 if (*q != '\0') break;
275 q++;
276 len--;
277 }
278 if (len == 1) return (int )(p - start);
279 }
280 p += ONIGENC_MBC_ENC_LEN(enc, p);
281 }
282 }
283
/*
 * Byte -> lower-case byte, indexed by byte value (256 entries).
 * Entries 0101-0132 ('A'-'Z') hold 0141-0172 ('a'-'z'); every other entry
 * holds its own index.
 */
const UChar OnigEncAsciiToLowerCaseTable[] = {
  '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
  '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
  '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
  '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
  '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
  '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
  '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
  '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
  '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
  '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
  '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
  '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
  '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
  '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
  '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
  '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
  '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
  '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
  '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
  '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
  '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
  '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
  '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
  '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
  '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
  '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
  '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
  '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
  '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
  '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
  '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
  '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377',
};
318
#ifdef USE_UPPER_CASE_TABLE
/*
 * Byte -> upper-case byte, indexed by byte value (256 entries); compiled
 * only when USE_UPPER_CASE_TABLE is defined.
 * Entries 0141-0172 ('a'-'z') hold 0101-0132 ('A'-'Z'); every other entry
 * holds its own index.
 */
const UChar OnigEncAsciiToUpperCaseTable[256] = {
  '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
  '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
  '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
  '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
  '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
  '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
  '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
  '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
  '\100', '\101', '\102', '\103', '\104', '\105', '\106', '\107',
  '\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117',
  '\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127',
  '\130', '\131', '\132', '\133', '\134', '\135', '\136', '\137',
  '\140', '\101', '\102', '\103', '\104', '\105', '\106', '\107',
  '\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117',
  '\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127',
  '\130', '\131', '\132', '\173', '\174', '\175', '\176', '\177',
  '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
  '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
  '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
  '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
  '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
  '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
  '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
  '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
  '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
  '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
  '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
  '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
  '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
  '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
  '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
  '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377',
};
#endif
355
/*
 * Character-type flag word per byte value (256 entries, indexed by byte
 * value).  Entries for 0x00-0x7f are non-zero flag words; entries for
 * 0x80-0xff are all zero.
 * NOTE(review): the meaning of the individual bits is defined by the ctype
 * macros elsewhere in the project -- confirm there before editing values.
 */
const unsigned short OnigEncAsciiCtypeTable[256] = {
  0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
  0x4008, 0x420c, 0x4209, 0x4208, 0x4208, 0x4208, 0x4008, 0x4008,
  0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
  0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
  0x4284, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
  0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
  0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0,
  0x78b0, 0x78b0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
  0x41a0, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x74a2,
  0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
  0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
  0x74a2, 0x74a2, 0x74a2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x51a0,
  0x41a0, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x70e2,
  0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
  0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
  0x70e2, 0x70e2, 0x70e2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x4008,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000
};
390
/*
 * ISO-8859-1 byte -> lower-case byte, indexed by byte value (256 entries).
 * Besides 'A'-'Z' -> 'a'-'z', entries 0300-0326 hold 0340-0366 and
 * entries 0330-0336 hold 0370-0376.  Entries 0327 and 0337, like all
 * remaining entries, hold their own index.
 */
const UChar OnigEncISO_8859_1_ToLowerCaseTable[256] = {
  '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
  '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
  '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
  '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
  '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
  '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
  '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
  '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
  '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
  '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
  '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
  '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
  '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
  '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
  '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
  '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
  '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
  '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
  '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
  '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
  '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
  '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
  '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
  '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
  '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
  '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
  '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\327',
  '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\337',
  '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
  '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
  '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
  '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377'
};
425
#ifdef USE_UPPER_CASE_TABLE
/*
 * ISO-8859-1 byte -> upper-case byte, indexed by byte value (256 entries);
 * compiled only when USE_UPPER_CASE_TABLE is defined.
 * Besides 'a'-'z' -> 'A'-'Z', entries 0340-0366 hold 0300-0326 and
 * entries 0370-0376 hold 0330-0336.  Entries 0367 and 0377, like all
 * remaining entries, hold their own index.
 */
const UChar OnigEncISO_8859_1_ToUpperCaseTable[256] = {
  '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
  '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
  '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
  '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
  '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
  '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
  '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
  '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
  '\100', '\101', '\102', '\103', '\104', '\105', '\106', '\107',
  '\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117',
  '\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127',
  '\130', '\131', '\132', '\133', '\134', '\135', '\136', '\137',
  '\140', '\101', '\102', '\103', '\104', '\105', '\106', '\107',
  '\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117',
  '\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127',
  '\130', '\131', '\132', '\173', '\174', '\175', '\176', '\177',
  '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
  '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
  '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
  '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
  '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
  '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
  '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
  '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
  '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
  '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
  '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
  '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
  '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
  '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
  '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\367',
  '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\377',
};
#endif
462
/*
 * Obsolete entry point: the function has an empty body and ignores
 * `table`.
 */
extern void
onigenc_set_default_caseconv_table(const UChar* table ARG_UNUSED)
{
  /* intentionally empty */
  /* obsolete; the argument is not used */
}
469
/*
 * Function form of the ONIGENC_LEFT_ADJUST_CHAR_HEAD macro: returns the
 * macro's result for (enc, start, s) unchanged.
 */
extern UChar*
onigenc_get_left_adjust_char_head(OnigEncoding enc, const UChar* start, const UChar* s)
{
  return ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s);
}
475
/*
 * Case-fold pairs for the 26 ASCII letters: each entry pairs an upper-case
 * code (0x41-0x5a) with the matching lower-case code (0x61-0x7a).
 * onigenc_ascii_apply_all_case_fold() reads the members as `from` / `to`.
 */
const OnigPairCaseFoldCodes OnigAsciiLowerMap[] = {
  { 0x41, 0x61 },
  { 0x42, 0x62 },
  { 0x43, 0x63 },
  { 0x44, 0x64 },
  { 0x45, 0x65 },
  { 0x46, 0x66 },
  { 0x47, 0x67 },
  { 0x48, 0x68 },
  { 0x49, 0x69 },
  { 0x4a, 0x6a },
  { 0x4b, 0x6b },
  { 0x4c, 0x6c },
  { 0x4d, 0x6d },
  { 0x4e, 0x6e },
  { 0x4f, 0x6f },
  { 0x50, 0x70 },
  { 0x51, 0x71 },
  { 0x52, 0x72 },
  { 0x53, 0x73 },
  { 0x54, 0x74 },
  { 0x55, 0x75 },
  { 0x56, 0x76 },
  { 0x57, 0x77 },
  { 0x58, 0x78 },
  { 0x59, 0x79 },
  { 0x5a, 0x7a }
};
504
505 extern int
onigenc_ascii_apply_all_case_fold(OnigCaseFoldType flag ARG_UNUSED,OnigApplyAllCaseFoldFunc f,void * arg)506 onigenc_ascii_apply_all_case_fold(OnigCaseFoldType flag ARG_UNUSED,
507 OnigApplyAllCaseFoldFunc f, void* arg)
508 {
509 OnigCodePoint code;
510 int i, r;
511
512 for (i = 0;
513 i < (int )(sizeof(OnigAsciiLowerMap)/sizeof(OnigPairCaseFoldCodes));
514 i++) {
515 code = OnigAsciiLowerMap[i].to;
516 r = (*f)(OnigAsciiLowerMap[i].from, &code, 1, arg);
517 if (r != 0) return r;
518
519 code = OnigAsciiLowerMap[i].from;
520 r = (*f)(OnigAsciiLowerMap[i].to, &code, 1, arg);
521 if (r != 0) return r;
522 }
523
524 return 0;
525 }
526
527 extern int
onigenc_ascii_get_case_fold_codes_by_str(OnigCaseFoldType flag ARG_UNUSED,const OnigUChar * p,const OnigUChar * end ARG_UNUSED,OnigCaseFoldCodeItem items[])528 onigenc_ascii_get_case_fold_codes_by_str(OnigCaseFoldType flag ARG_UNUSED,
529 const OnigUChar* p, const OnigUChar* end ARG_UNUSED,
530 OnigCaseFoldCodeItem items[])
531 {
532 if (0x41 <= *p && *p <= 0x5a) {
533 items[0].byte_len = 1;
534 items[0].code_len = 1;
535 items[0].code[0] = (OnigCodePoint )(*p + 0x20);
536 return 1;
537 }
538 else if (0x61 <= *p && *p <= 0x7a) {
539 items[0].byte_len = 1;
540 items[0].code_len = 1;
541 items[0].code[0] = (OnigCodePoint )(*p - 0x20);
542 return 1;
543 }
544 else
545 return 0;
546 }
547
548 static int
ss_apply_all_case_fold(OnigCaseFoldType flag ARG_UNUSED,OnigApplyAllCaseFoldFunc f,void * arg)549 ss_apply_all_case_fold(OnigCaseFoldType flag ARG_UNUSED,
550 OnigApplyAllCaseFoldFunc f, void* arg)
551 {
552 static OnigCodePoint ss[] = { 0x73, 0x73 };
553
554 return (*f)((OnigCodePoint )0xdf, ss, 2, arg);
555 }
556
557 extern int
onigenc_apply_all_case_fold_with_map(int map_size,const OnigPairCaseFoldCodes map[],int ess_tsett_flag,OnigCaseFoldType flag,OnigApplyAllCaseFoldFunc f,void * arg)558 onigenc_apply_all_case_fold_with_map(int map_size,
559 const OnigPairCaseFoldCodes map[],
560 int ess_tsett_flag, OnigCaseFoldType flag,
561 OnigApplyAllCaseFoldFunc f, void* arg)
562 {
563 OnigCodePoint code;
564 int i, r;
565
566 r = onigenc_ascii_apply_all_case_fold(flag, f, arg);
567 if (r != 0) return r;
568
569 for (i = 0; i < map_size; i++) {
570 code = map[i].to;
571 r = (*f)(map[i].from, &code, 1, arg);
572 if (r != 0) return r;
573
574 code = map[i].from;
575 r = (*f)(map[i].to, &code, 1, arg);
576 if (r != 0) return r;
577 }
578
579 if (ess_tsett_flag != 0)
580 return ss_apply_all_case_fold(flag, f, arg);
581
582 return 0;
583 }
584
585 extern int
onigenc_get_case_fold_codes_by_str_with_map(int map_size,const OnigPairCaseFoldCodes map[],int ess_tsett_flag,OnigCaseFoldType flag ARG_UNUSED,const OnigUChar * p,const OnigUChar * end,OnigCaseFoldCodeItem items[])586 onigenc_get_case_fold_codes_by_str_with_map(int map_size,
587 const OnigPairCaseFoldCodes map[],
588 int ess_tsett_flag, OnigCaseFoldType flag ARG_UNUSED,
589 const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[])
590 {
591 if (0x41 <= *p && *p <= 0x5a) {
592 items[0].byte_len = 1;
593 items[0].code_len = 1;
594 items[0].code[0] = (OnigCodePoint )(*p + 0x20);
595 if (*p == 0x53 && ess_tsett_flag != 0 && end > p + 1
596 && (*(p+1) == 0x53 || *(p+1) == 0x73)) {
597 /* SS */
598 items[1].byte_len = 2;
599 items[1].code_len = 1;
600 items[1].code[0] = (OnigCodePoint )0xdf;
601 return 2;
602 }
603 else
604 return 1;
605 }
606 else if (0x61 <= *p && *p <= 0x7a) {
607 items[0].byte_len = 1;
608 items[0].code_len = 1;
609 items[0].code[0] = (OnigCodePoint )(*p - 0x20);
610 if (*p == 0x73 && ess_tsett_flag != 0 && end > p + 1
611 && (*(p+1) == 0x73 || *(p+1) == 0x53)) {
612 /* ss */
613 items[1].byte_len = 2;
614 items[1].code_len = 1;
615 items[1].code[0] = (OnigCodePoint )0xdf;
616 return 2;
617 }
618 else
619 return 1;
620 }
621 else if (*p == 0xdf && ess_tsett_flag != 0) {
622 items[0].byte_len = 1;
623 items[0].code_len = 2;
624 items[0].code[0] = (OnigCodePoint )'s';
625 items[0].code[1] = (OnigCodePoint )'s';
626
627 items[1].byte_len = 1;
628 items[1].code_len = 2;
629 items[1].code[0] = (OnigCodePoint )'S';
630 items[1].code[1] = (OnigCodePoint )'S';
631
632 items[2].byte_len = 1;
633 items[2].code_len = 2;
634 items[2].code[0] = (OnigCodePoint )'s';
635 items[2].code[1] = (OnigCodePoint )'S';
636
637 items[3].byte_len = 1;
638 items[3].code_len = 2;
639 items[3].code[0] = (OnigCodePoint )'S';
640 items[3].code[1] = (OnigCodePoint )'s';
641
642 return 4;
643 }
644 else {
645 int i;
646
647 for (i = 0; i < map_size; i++) {
648 if (*p == map[i].from) {
649 items[0].byte_len = 1;
650 items[0].code_len = 1;
651 items[0].code[0] = map[i].to;
652 return 1;
653 }
654 else if (*p == map[i].to) {
655 items[0].byte_len = 1;
656 items[0].code_len = 1;
657 items[0].code[0] = map[i].from;
658 return 1;
659 }
660 }
661 }
662
663 return 0;
664 }
665
666
/*
 * Placeholder implementation that ignores all of its arguments and always
 * returns ONIG_NO_SUPPORT_CONFIG.
 */
extern int
onigenc_not_support_get_ctype_code_range(OnigCtype ctype ARG_UNUSED,
    OnigCodePoint* sb_out ARG_UNUSED,
    const OnigCodePoint* ranges[] ARG_UNUSED)
{
  return ONIG_NO_SUPPORT_CONFIG;
}
674
675 extern int
onigenc_is_mbc_newline_0x0a(const UChar * p,const UChar * end)676 onigenc_is_mbc_newline_0x0a(const UChar* p, const UChar* end)
677 {
678 if (p < end) {
679 if (*p == 0x0a) return 1;
680 }
681 return 0;
682 }
683
684 /* for single byte encodings */
685 extern int
onigenc_ascii_mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED,const UChar ** p,const UChar * end ARG_UNUSED,UChar * lower)686 onigenc_ascii_mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED, const UChar** p,
687 const UChar*end ARG_UNUSED, UChar* lower)
688 {
689 *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(**p);
690
691 (*p)++;
692 return 1; /* return byte length of converted char to lower */
693 }
694
/* Character length for single byte encodings: always 1; `p` is not read. */
extern int
onigenc_single_byte_mbc_enc_len(const UChar* p ARG_UNUSED)
{
  return 1;
}
700
/*
 * Return the byte at `p` converted to OnigCodePoint.  `end` is not used,
 * so the caller must make sure `p` is readable.
 */
extern OnigCodePoint
onigenc_single_byte_mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED)
{
  return (OnigCodePoint )(*p);
}
706
707 extern int
onigenc_single_byte_code_to_mbclen(OnigCodePoint code ARG_UNUSED)708 onigenc_single_byte_code_to_mbclen(OnigCodePoint code ARG_UNUSED)
709 {
710 return (code < 0x100 ? 1 : ONIGERR_INVALID_CODE_POINT_VALUE);
711 }
712
713 extern int
onigenc_single_byte_code_to_mbc(OnigCodePoint code,UChar * buf)714 onigenc_single_byte_code_to_mbc(OnigCodePoint code, UChar *buf)
715 {
716 *buf = (UChar )(code & 0xff);
717 return 1;
718 }
719
/*
 * Left adjustment for single byte encodings: returns `s` itself with the
 * const qualifier cast away.  `start` is not used.
 */
extern UChar*
onigenc_single_byte_left_adjust_char_head(const UChar* start ARG_UNUSED,
                                          const UChar* s)
{
  return (UChar* )s;
}
726
/* Reverse-match predicate that ignores its arguments and returns TRUE. */
extern int
onigenc_always_true_is_allowed_reverse_match(const UChar* s ARG_UNUSED,
                                             const UChar* end ARG_UNUSED)
{
  return TRUE;
}
733
/* Reverse-match predicate that ignores its arguments and returns FALSE. */
extern int
onigenc_always_false_is_allowed_reverse_match(const UChar* s ARG_UNUSED,
                                              const UChar* end ARG_UNUSED)
{
  return FALSE;
}
740
/* String validity check that ignores its arguments and returns TRUE. */
extern int
onigenc_always_true_is_valid_mbc_string(const UChar* s ARG_UNUSED,
                                        const UChar* end ARG_UNUSED)
{
  return TRUE;
}
747
748 extern int
onigenc_length_check_is_valid_mbc_string(OnigEncoding enc,const UChar * p,const UChar * end)749 onigenc_length_check_is_valid_mbc_string(OnigEncoding enc,
750 const UChar* p, const UChar* end)
751 {
752 while (p < end) {
753 p += enclen(enc, p);
754 }
755
756 if (p != end)
757 return FALSE;
758 else
759 return TRUE;
760 }
761
/*
 * Function form of the ONIGENC_IS_VALID_MBC_STRING macro: returns the
 * macro's result for (enc, s, end) unchanged.
 */
extern int
onigenc_is_valid_mbc_string(OnigEncoding enc, const UChar* s, const UChar* end)
{
  return ONIGENC_IS_VALID_MBC_STRING(enc, s, end);
}
767
768 extern OnigCodePoint
onigenc_mbn_mbc_to_code(OnigEncoding enc,const UChar * p,const UChar * end)769 onigenc_mbn_mbc_to_code(OnigEncoding enc, const UChar* p, const UChar* end)
770 {
771 int c, i, len;
772 OnigCodePoint n;
773
774 len = enclen(enc, p);
775 n = (OnigCodePoint )(*p++);
776 if (len == 1) return n;
777
778 for (i = 1; i < len; i++) {
779 if (p >= end) break;
780 c = *p++;
781 n <<= 8; n += c;
782 }
783 return n;
784 }
785
786 extern int
onigenc_mbn_mbc_case_fold(OnigEncoding enc,OnigCaseFoldType flag ARG_UNUSED,const UChar ** pp,const UChar * end ARG_UNUSED,UChar * lower)787 onigenc_mbn_mbc_case_fold(OnigEncoding enc, OnigCaseFoldType flag ARG_UNUSED,
788 const UChar** pp, const UChar* end ARG_UNUSED,
789 UChar* lower)
790 {
791 int len;
792 const UChar *p = *pp;
793
794 if (ONIGENC_IS_MBC_ASCII(p)) {
795 *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
796 (*pp)++;
797 return 1;
798 }
799 else {
800 int i;
801
802 len = enclen(enc, p);
803 for (i = 0; i < len; i++) {
804 *lower++ = *p++;
805 }
806 (*pp) += len;
807 return len; /* return byte length of converted to lower char */
808 }
809 }
810
811 extern int
onigenc_mb2_code_to_mbc(OnigEncoding enc,OnigCodePoint code,UChar * buf)812 onigenc_mb2_code_to_mbc(OnigEncoding enc, OnigCodePoint code, UChar *buf)
813 {
814 UChar *p = buf;
815
816 if ((code & 0xff00) != 0) {
817 *p++ = (UChar )((code >> 8) & 0xff);
818 }
819 *p++ = (UChar )(code & 0xff);
820
821 #if 1
822 if (enclen(enc, buf) != (p - buf))
823 return ONIGERR_INVALID_CODE_POINT_VALUE;
824 #endif
825 return (int )(p - buf);
826 }
827
828 extern int
onigenc_mb4_code_to_mbc(OnigEncoding enc,OnigCodePoint code,UChar * buf)829 onigenc_mb4_code_to_mbc(OnigEncoding enc, OnigCodePoint code, UChar *buf)
830 {
831 UChar *p = buf;
832
833 if ((code & 0xff000000) != 0) {
834 *p++ = (UChar )((code >> 24) & 0xff);
835 }
836 if ((code & 0xff0000) != 0 || p != buf) {
837 *p++ = (UChar )((code >> 16) & 0xff);
838 }
839 if ((code & 0xff00) != 0 || p != buf) {
840 *p++ = (UChar )((code >> 8) & 0xff);
841 }
842 *p++ = (UChar )(code & 0xff);
843
844 #if 1
845 if (enclen(enc, buf) != (p - buf))
846 return ONIGERR_INVALID_CODE_POINT_VALUE;
847 #endif
848 return (int )(p - buf);
849 }
850
851 extern int
onigenc_minimum_property_name_to_ctype(OnigEncoding enc,UChar * p,UChar * end)852 onigenc_minimum_property_name_to_ctype(OnigEncoding enc, UChar* p, UChar* end)
853 {
854 static PosixBracketEntryType PBS[] = {
855 { (UChar* )"Alnum", ONIGENC_CTYPE_ALNUM, 5 },
856 { (UChar* )"Alpha", ONIGENC_CTYPE_ALPHA, 5 },
857 { (UChar* )"Blank", ONIGENC_CTYPE_BLANK, 5 },
858 { (UChar* )"Cntrl", ONIGENC_CTYPE_CNTRL, 5 },
859 { (UChar* )"Digit", ONIGENC_CTYPE_DIGIT, 5 },
860 { (UChar* )"Graph", ONIGENC_CTYPE_GRAPH, 5 },
861 { (UChar* )"Lower", ONIGENC_CTYPE_LOWER, 5 },
862 { (UChar* )"Print", ONIGENC_CTYPE_PRINT, 5 },
863 { (UChar* )"Punct", ONIGENC_CTYPE_PUNCT, 5 },
864 { (UChar* )"Space", ONIGENC_CTYPE_SPACE, 5 },
865 { (UChar* )"Upper", ONIGENC_CTYPE_UPPER, 5 },
866 { (UChar* )"XDigit", ONIGENC_CTYPE_XDIGIT, 6 },
867 { (UChar* )"ASCII", ONIGENC_CTYPE_ASCII, 5 },
868 { (UChar* )"Word", ONIGENC_CTYPE_WORD, 4 },
869 { (UChar* )NULL, -1, 0 }
870 };
871
872 PosixBracketEntryType *pb;
873 int len;
874
875 len = onigenc_strlen(enc, p, end);
876 for (pb = PBS; IS_NOT_NULL(pb->name); pb++) {
877 if (len == pb->len &&
878 onigenc_with_ascii_strncmp(enc, p, end, pb->name, pb->len) == 0)
879 return pb->ctype;
880 }
881
882 return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
883 }
884
885 extern int
onigenc_is_mbc_word_ascii(OnigEncoding enc,UChar * s,const UChar * end)886 onigenc_is_mbc_word_ascii(OnigEncoding enc, UChar* s, const UChar* end)
887 {
888 OnigCodePoint code = ONIGENC_MBC_TO_CODE(enc, s, end);
889
890 if (code > 127) return 0;
891
892 return ONIGENC_IS_ASCII_CODE_WORD(code);
893 }
894
895 extern int
onigenc_mb2_is_code_ctype(OnigEncoding enc,OnigCodePoint code,unsigned int ctype)896 onigenc_mb2_is_code_ctype(OnigEncoding enc, OnigCodePoint code,
897 unsigned int ctype)
898 {
899 if (code < 128)
900 return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
901 else {
902 if (CTYPE_IS_WORD_GRAPH_PRINT(ctype)) {
903 return (ONIGENC_CODE_TO_MBCLEN(enc, code) > 1 ? TRUE : FALSE);
904 }
905 }
906
907 return FALSE;
908 }
909
910 extern int
onigenc_mb4_is_code_ctype(OnigEncoding enc,OnigCodePoint code,unsigned int ctype)911 onigenc_mb4_is_code_ctype(OnigEncoding enc, OnigCodePoint code,
912 unsigned int ctype)
913 {
914 if (code < 128)
915 return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
916 else {
917 if (CTYPE_IS_WORD_GRAPH_PRINT(ctype)) {
918 return (ONIGENC_CODE_TO_MBCLEN(enc, code) > 1 ? TRUE : FALSE);
919 }
920 }
921
922 return FALSE;
923 }
924
925 extern int
onigenc_with_ascii_strncmp(OnigEncoding enc,const UChar * p,const UChar * end,const UChar * sascii,int n)926 onigenc_with_ascii_strncmp(OnigEncoding enc, const UChar* p, const UChar* end,
927 const UChar* sascii /* ascii */, int n)
928 {
929 int x, c;
930
931 while (n-- > 0) {
932 if (p >= end) return (int )(*sascii);
933
934 c = (int )ONIGENC_MBC_TO_CODE(enc, p, end);
935 x = *sascii - c;
936 if (x) return x;
937
938 sascii++;
939 p += enclen(enc, p);
940 }
941 return 0;
942 }
943
944 extern int
onig_codes_cmp(OnigCodePoint a[],OnigCodePoint b[],int n)945 onig_codes_cmp(OnigCodePoint a[], OnigCodePoint b[], int n)
946 {
947 int i;
948
949 for (i = 0; i < n; i++) {
950 if (a[i] != b[i])
951 return -1;
952 }
953
954 return 0;
955 }
956
957 extern int
onig_codes_byte_at(OnigCodePoint codes[],int at)958 onig_codes_byte_at(OnigCodePoint codes[], int at)
959 {
960 int index;
961 int b;
962 OnigCodePoint code;
963
964 index = at / 3;
965 b = at % 3;
966 code = codes[index];
967
968 return ((code >> ((2 - b) * 8)) & 0xff);
969 }
970