1 /**********************************************************************
2 regenc.c - Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5 * Copyright (c) 2002-2007 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30 #include "regint.h"
31
/* Default encoding shared by the whole library.  It starts out as
   ONIG_ENCODING_INIT_DEFAULT and is read by onigenc_get_default_encoding()
   and overwritten by onigenc_set_default_encoding(). */
OnigEncoding OnigEncDefaultCharEncoding = ONIG_ENCODING_INIT_DEFAULT;
33
/* Encoding subsystem initializer.  There is nothing to set up, so the
   function unconditionally reports success (0). */
extern int
onigenc_init(void)
{
  const int status = 0;

  return status;
}
39
40 extern OnigEncoding
onigenc_get_default_encoding(void)41 onigenc_get_default_encoding(void)
42 {
43 return OnigEncDefaultCharEncoding;
44 }
45
46 extern int
onigenc_set_default_encoding(OnigEncoding enc)47 onigenc_set_default_encoding(OnigEncoding enc)
48 {
49 OnigEncDefaultCharEncoding = enc;
50 return 0;
51 }
52
53 extern UChar*
onigenc_get_right_adjust_char_head(OnigEncoding enc,const UChar * start,const UChar * s)54 onigenc_get_right_adjust_char_head(OnigEncoding enc, const UChar* start, const UChar* s)
55 {
56 UChar* p = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s);
57 if (p < s) {
58 p += enclen(enc, p);
59 }
60 return p;
61 }
62
63 extern UChar*
onigenc_get_right_adjust_char_head_with_prev(OnigEncoding enc,const UChar * start,const UChar * s,const UChar ** prev)64 onigenc_get_right_adjust_char_head_with_prev(OnigEncoding enc,
65 const UChar* start, const UChar* s, const UChar** prev)
66 {
67 UChar* p = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s);
68
69 if (p < s) {
70 if (prev) *prev = (const UChar* )p;
71 p += enclen(enc, p);
72 }
73 else {
74 if (prev) *prev = (const UChar* )NULL; /* Sorry */
75 }
76 return p;
77 }
78
79 extern UChar*
onigenc_get_prev_char_head(OnigEncoding enc,const UChar * start,const UChar * s)80 onigenc_get_prev_char_head(OnigEncoding enc, const UChar* start, const UChar* s)
81 {
82 if (s <= start)
83 return (UChar* )NULL;
84
85 return ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s - 1);
86 }
87
88 extern UChar*
onigenc_step_back(OnigEncoding enc,const UChar * start,const UChar * s,int n)89 onigenc_step_back(OnigEncoding enc, const UChar* start, const UChar* s, int n)
90 {
91 while (ONIG_IS_NOT_NULL(s) && n-- > 0) {
92 if (s <= start)
93 return (UChar* )NULL;
94
95 s = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s - 1);
96 }
97 return (UChar* )s;
98 }
99
100 extern UChar*
onigenc_step(OnigEncoding enc,const UChar * p,const UChar * end,int n)101 onigenc_step(OnigEncoding enc, const UChar* p, const UChar* end, int n)
102 {
103 UChar* q = (UChar* )p;
104 while (n-- > 0) {
105 q += ONIGENC_MBC_ENC_LEN(enc, q);
106 }
107 return (q <= end ? q : NULL);
108 }
109
110 extern int
onigenc_strlen(OnigEncoding enc,const UChar * p,const UChar * end)111 onigenc_strlen(OnigEncoding enc, const UChar* p, const UChar* end)
112 {
113 int n = 0;
114 UChar* q = (UChar* )p;
115
116 while (q < end) {
117 q += ONIGENC_MBC_ENC_LEN(enc, q);
118 n++;
119 }
120 return n;
121 }
122
123 extern int
onigenc_strlen_null(OnigEncoding enc,const UChar * s)124 onigenc_strlen_null(OnigEncoding enc, const UChar* s)
125 {
126 int n = 0;
127 UChar* p = (UChar* )s;
128
129 while (1) {
130 if (*p == '\0') {
131 UChar* q;
132 int len = ONIGENC_MBC_MINLEN(enc);
133
134 if (len == 1) return n;
135 q = p + 1;
136 while (len > 1) {
137 if (*q != '\0') break;
138 q++;
139 len--;
140 }
141 if (len == 1) return n;
142 }
143 p += ONIGENC_MBC_ENC_LEN(enc, p);
144 n++;
145 }
146 }
147
148 extern int
onigenc_str_bytelen_null(OnigEncoding enc,const UChar * s)149 onigenc_str_bytelen_null(OnigEncoding enc, const UChar* s)
150 {
151 UChar* start = (UChar* )s;
152 UChar* p = (UChar* )s;
153
154 while (1) {
155 if (*p == '\0') {
156 UChar* q;
157 int len = ONIGENC_MBC_MINLEN(enc);
158
159 if (len == 1) return (int )(p - start);
160 q = p + 1;
161 while (len > 1) {
162 if (*q != '\0') break;
163 q++;
164 len--;
165 }
166 if (len == 1) return (int )(p - start);
167 }
168 p += ONIGENC_MBC_ENC_LEN(enc, p);
169 }
170 }
171
/* ASCII lower-casing table indexed by byte value.  The initializer supplies
   256 entries (the array size is left implicit).  'A'..'Z' ('\101'..'\132')
   map to 'a'..'z' ('\141'..'\172'); every other byte maps to itself. */
const UChar OnigEncAsciiToLowerCaseTable[] = {
  '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
  '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
  '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
  '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
  '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
  '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
  '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
  '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
  '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
  '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
  '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
  '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
  '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
  '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
  '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
  '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
  '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
  '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
  '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
  '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
  '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
  '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
  '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
  '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
  '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
  '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
  '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
  '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
  '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
  '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
  '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
  '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377',
};
206
#ifdef USE_UPPER_CASE_TABLE
/* ASCII upper-casing table indexed by byte value; compiled only when
   USE_UPPER_CASE_TABLE is defined.  'a'..'z' ('\141'..'\172') map to
   'A'..'Z' ('\101'..'\132'); every other byte maps to itself. */
const UChar OnigEncAsciiToUpperCaseTable[256] = {
  '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
  '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
  '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
  '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
  '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
  '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
  '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
  '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
  '\100', '\101', '\102', '\103', '\104', '\105', '\106', '\107',
  '\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117',
  '\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127',
  '\130', '\131', '\132', '\133', '\134', '\135', '\136', '\137',
  '\140', '\101', '\102', '\103', '\104', '\105', '\106', '\107',
  '\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117',
  '\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127',
  '\130', '\131', '\132', '\173', '\174', '\175', '\176', '\177',
  '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
  '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
  '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
  '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
  '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
  '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
  '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
  '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
  '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
  '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
  '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
  '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
  '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
  '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
  '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
  '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377',
};
#endif
243
/* Character-class flag word for each byte value 0..255.  Entries for
   0x80..0xff are all zero.  The meaning of the individual bits is not
   defined in this file -- presumably each bit corresponds to one of the
   ONIGENC_CTYPE_* classes; confirm against the header that defines the
   ctype macros. */
const unsigned short OnigEncAsciiCtypeTable[256] = {
  0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
  0x4008, 0x420c, 0x4209, 0x4208, 0x4208, 0x4208, 0x4008, 0x4008,
  0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
  0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
  0x4284, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
  0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
  0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0,
  0x78b0, 0x78b0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
  0x41a0, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x74a2,
  0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
  0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
  0x74a2, 0x74a2, 0x74a2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x51a0,
  0x41a0, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x70e2,
  0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
  0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
  0x70e2, 0x70e2, 0x70e2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x4008,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000
};
278
/* ISO-8859-1 lower-casing table indexed by byte value.  In addition to the
   ASCII 'A'..'Z' -> 'a'..'z' mapping, bytes '\300'..'\336' map to
   '\340'..'\376', except '\327', which maps to itself.  '\337' and all
   remaining bytes map to themselves. */
const UChar OnigEncISO_8859_1_ToLowerCaseTable[256] = {
  '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
  '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
  '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
  '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
  '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
  '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
  '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
  '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
  '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
  '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
  '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
  '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
  '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
  '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
  '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
  '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
  '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
  '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
  '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
  '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
  '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
  '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
  '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
  '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
  '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
  '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
  '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\327',
  '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\337',
  '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
  '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
  '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
  '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377'
};
313
#ifdef USE_UPPER_CASE_TABLE
/* ISO-8859-1 upper-casing table indexed by byte value; compiled only when
   USE_UPPER_CASE_TABLE is defined.  In addition to the ASCII
   'a'..'z' -> 'A'..'Z' mapping, bytes '\340'..'\376' map to '\300'..'\336',
   except '\367', which maps to itself.  '\377' and all remaining bytes map
   to themselves. */
const UChar OnigEncISO_8859_1_ToUpperCaseTable[256] = {
  '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
  '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
  '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
  '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
  '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
  '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
  '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
  '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
  '\100', '\101', '\102', '\103', '\104', '\105', '\106', '\107',
  '\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117',
  '\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127',
  '\130', '\131', '\132', '\133', '\134', '\135', '\136', '\137',
  '\140', '\101', '\102', '\103', '\104', '\105', '\106', '\107',
  '\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117',
  '\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127',
  '\130', '\131', '\132', '\173', '\174', '\175', '\176', '\177',
  '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
  '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
  '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
  '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
  '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
  '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
  '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
  '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
  '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
  '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
  '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
  '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
  '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
  '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
  '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\367',
  '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\377',
};
#endif
350
351 extern void
onigenc_set_default_caseconv_table(const UChar * table ARG_UNUSED)352 onigenc_set_default_caseconv_table(const UChar* table ARG_UNUSED)
353 {
354 /* nothing */
355 /* obsoleted. */
356 }
357
358 extern UChar*
onigenc_get_left_adjust_char_head(OnigEncoding enc,const UChar * start,const UChar * s)359 onigenc_get_left_adjust_char_head(OnigEncoding enc, const UChar* start, const UChar* s)
360 {
361 return ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s);
362 }
363
/* Case pairs for the 26 ASCII letters: each entry couples an upper-case
   code (0x41..0x5a) with its lower-case counterpart (0x61..0x7a).
   onigenc_ascii_apply_all_case_fold() walks this table and reports every
   pair in both directions. */
const OnigPairCaseFoldCodes OnigAsciiLowerMap[] = {
  { 0x41, 0x61 },
  { 0x42, 0x62 },
  { 0x43, 0x63 },
  { 0x44, 0x64 },
  { 0x45, 0x65 },
  { 0x46, 0x66 },
  { 0x47, 0x67 },
  { 0x48, 0x68 },
  { 0x49, 0x69 },
  { 0x4a, 0x6a },
  { 0x4b, 0x6b },
  { 0x4c, 0x6c },
  { 0x4d, 0x6d },
  { 0x4e, 0x6e },
  { 0x4f, 0x6f },
  { 0x50, 0x70 },
  { 0x51, 0x71 },
  { 0x52, 0x72 },
  { 0x53, 0x73 },
  { 0x54, 0x74 },
  { 0x55, 0x75 },
  { 0x56, 0x76 },
  { 0x57, 0x77 },
  { 0x58, 0x78 },
  { 0x59, 0x79 },
  { 0x5a, 0x7a }
};
392
393 extern int
onigenc_ascii_apply_all_case_fold(OnigCaseFoldType flag ARG_UNUSED,OnigApplyAllCaseFoldFunc f,void * arg)394 onigenc_ascii_apply_all_case_fold(OnigCaseFoldType flag ARG_UNUSED,
395 OnigApplyAllCaseFoldFunc f, void* arg)
396 {
397 OnigCodePoint code;
398 int i, r;
399
400 for (i = 0;
401 i < (int )(sizeof(OnigAsciiLowerMap)/sizeof(OnigPairCaseFoldCodes));
402 i++) {
403 code = OnigAsciiLowerMap[i].to;
404 r = (*f)(OnigAsciiLowerMap[i].from, &code, 1, arg);
405 if (r != 0) return r;
406
407 code = OnigAsciiLowerMap[i].from;
408 r = (*f)(OnigAsciiLowerMap[i].to, &code, 1, arg);
409 if (r != 0) return r;
410 }
411
412 return 0;
413 }
414
415 extern int
onigenc_ascii_get_case_fold_codes_by_str(OnigCaseFoldType flag ARG_UNUSED,const OnigUChar * p,const OnigUChar * end ARG_UNUSED,OnigCaseFoldCodeItem items[])416 onigenc_ascii_get_case_fold_codes_by_str(OnigCaseFoldType flag ARG_UNUSED,
417 const OnigUChar* p, const OnigUChar* end ARG_UNUSED,
418 OnigCaseFoldCodeItem items[])
419 {
420 if (0x41 <= *p && *p <= 0x5a) {
421 items[0].byte_len = 1;
422 items[0].code_len = 1;
423 items[0].code[0] = (OnigCodePoint )(*p + 0x20);
424 return 1;
425 }
426 else if (0x61 <= *p && *p <= 0x7a) {
427 items[0].byte_len = 1;
428 items[0].code_len = 1;
429 items[0].code[0] = (OnigCodePoint )(*p - 0x20);
430 return 1;
431 }
432 else
433 return 0;
434 }
435
436 static int
ss_apply_all_case_fold(OnigCaseFoldType flag ARG_UNUSED,OnigApplyAllCaseFoldFunc f,void * arg)437 ss_apply_all_case_fold(OnigCaseFoldType flag ARG_UNUSED,
438 OnigApplyAllCaseFoldFunc f, void* arg)
439 {
440 static OnigCodePoint ss[] = { 0x73, 0x73 };
441
442 return (*f)((OnigCodePoint )0xdf, ss, 2, arg);
443 }
444
445 extern int
onigenc_apply_all_case_fold_with_map(int map_size,const OnigPairCaseFoldCodes map[],int ess_tsett_flag,OnigCaseFoldType flag,OnigApplyAllCaseFoldFunc f,void * arg)446 onigenc_apply_all_case_fold_with_map(int map_size,
447 const OnigPairCaseFoldCodes map[],
448 int ess_tsett_flag, OnigCaseFoldType flag,
449 OnigApplyAllCaseFoldFunc f, void* arg)
450 {
451 OnigCodePoint code;
452 int i, r;
453
454 r = onigenc_ascii_apply_all_case_fold(flag, f, arg);
455 if (r != 0) return r;
456
457 for (i = 0; i < map_size; i++) {
458 code = map[i].to;
459 r = (*f)(map[i].from, &code, 1, arg);
460 if (r != 0) return r;
461
462 code = map[i].from;
463 r = (*f)(map[i].to, &code, 1, arg);
464 if (r != 0) return r;
465 }
466
467 if (ess_tsett_flag != 0)
468 return ss_apply_all_case_fold(flag, f, arg);
469
470 return 0;
471 }
472
473 extern int
onigenc_get_case_fold_codes_by_str_with_map(int map_size,const OnigPairCaseFoldCodes map[],int ess_tsett_flag,OnigCaseFoldType flag ARG_UNUSED,const OnigUChar * p,const OnigUChar * end,OnigCaseFoldCodeItem items[])474 onigenc_get_case_fold_codes_by_str_with_map(int map_size,
475 const OnigPairCaseFoldCodes map[],
476 int ess_tsett_flag, OnigCaseFoldType flag ARG_UNUSED,
477 const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[])
478 {
479 if (0x41 <= *p && *p <= 0x5a) {
480 items[0].byte_len = 1;
481 items[0].code_len = 1;
482 items[0].code[0] = (OnigCodePoint )(*p + 0x20);
483 if (*p == 0x53 && ess_tsett_flag != 0 && end > p + 1
484 && (*(p+1) == 0x53 || *(p+1) == 0x73)) {
485 /* SS */
486 items[1].byte_len = 2;
487 items[1].code_len = 1;
488 items[1].code[0] = (OnigCodePoint )0xdf;
489 return 2;
490 }
491 else
492 return 1;
493 }
494 else if (0x61 <= *p && *p <= 0x7a) {
495 items[0].byte_len = 1;
496 items[0].code_len = 1;
497 items[0].code[0] = (OnigCodePoint )(*p - 0x20);
498 if (*p == 0x73 && ess_tsett_flag != 0 && end > p + 1
499 && (*(p+1) == 0x73 || *(p+1) == 0x53)) {
500 /* ss */
501 items[1].byte_len = 2;
502 items[1].code_len = 1;
503 items[1].code[0] = (OnigCodePoint )0xdf;
504 return 2;
505 }
506 else
507 return 1;
508 }
509 else if (*p == 0xdf && ess_tsett_flag != 0) {
510 items[0].byte_len = 1;
511 items[0].code_len = 2;
512 items[0].code[0] = (OnigCodePoint )'s';
513 items[0].code[1] = (OnigCodePoint )'s';
514
515 items[1].byte_len = 1;
516 items[1].code_len = 2;
517 items[1].code[0] = (OnigCodePoint )'S';
518 items[1].code[1] = (OnigCodePoint )'S';
519
520 items[2].byte_len = 1;
521 items[2].code_len = 2;
522 items[2].code[0] = (OnigCodePoint )'s';
523 items[2].code[1] = (OnigCodePoint )'S';
524
525 items[3].byte_len = 1;
526 items[3].code_len = 2;
527 items[3].code[0] = (OnigCodePoint )'S';
528 items[3].code[1] = (OnigCodePoint )'s';
529
530 return 4;
531 }
532 else {
533 int i;
534
535 for (i = 0; i < map_size; i++) {
536 if (*p == map[i].from) {
537 items[0].byte_len = 1;
538 items[0].code_len = 1;
539 items[0].code[0] = map[i].to;
540 return 1;
541 }
542 else if (*p == map[i].to) {
543 items[0].byte_len = 1;
544 items[0].code_len = 1;
545 items[0].code[0] = map[i].from;
546 return 1;
547 }
548 }
549 }
550
551 return 0;
552 }
553
554
555 extern int
onigenc_not_support_get_ctype_code_range(OnigCtype ctype ARG_UNUSED,OnigCodePoint * sb_out ARG_UNUSED,const OnigCodePoint * ranges[]ARG_UNUSED)556 onigenc_not_support_get_ctype_code_range(OnigCtype ctype ARG_UNUSED,
557 OnigCodePoint* sb_out ARG_UNUSED,
558 const OnigCodePoint* ranges[] ARG_UNUSED)
559 {
560 return ONIG_NO_SUPPORT_CONFIG;
561 }
562
563 extern int
onigenc_is_mbc_newline_0x0a(const UChar * p,const UChar * end)564 onigenc_is_mbc_newline_0x0a(const UChar* p, const UChar* end)
565 {
566 if (p < end) {
567 if (*p == 0x0a) return 1;
568 }
569 return 0;
570 }
571
572 /* for single byte encodings */
573 extern int
onigenc_ascii_mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED,const UChar ** p,const UChar * end ARG_UNUSED,UChar * lower)574 onigenc_ascii_mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED, const UChar** p,
575 const UChar*end ARG_UNUSED, UChar* lower)
576 {
577 *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(**p);
578
579 (*p)++;
580 return 1; /* return byte length of converted char to lower */
581 }
582
#if 0
/* Disabled: everything up to the matching #endif is compiled out.
   As written, it would advance *pp by one byte and return the result of
   ONIGENC_IS_ASCII_CODE_CASE_AMBIG for the byte that was skipped. */
extern int
onigenc_ascii_is_mbc_ambiguous(OnigCaseFoldType flag,
                               const UChar** pp, const UChar* end)
{
  const UChar* p = *pp;

  (*pp)++;
  return ONIGENC_IS_ASCII_CODE_CASE_AMBIG(*p);
}
#endif
594
595 extern int
onigenc_single_byte_mbc_enc_len(const UChar * p ARG_UNUSED)596 onigenc_single_byte_mbc_enc_len(const UChar* p ARG_UNUSED)
597 {
598 return 1;
599 }
600
601 extern OnigCodePoint
onigenc_single_byte_mbc_to_code(const UChar * p,const UChar * end ARG_UNUSED)602 onigenc_single_byte_mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED)
603 {
604 return (OnigCodePoint )(*p);
605 }
606
607 extern int
onigenc_single_byte_code_to_mbclen(OnigCodePoint code ARG_UNUSED)608 onigenc_single_byte_code_to_mbclen(OnigCodePoint code ARG_UNUSED)
609 {
610 return (code < 0x100 ? 1 : ONIGERR_INVALID_CODE_POINT_VALUE);
611 }
612
613 extern int
onigenc_single_byte_code_to_mbc(OnigCodePoint code,UChar * buf)614 onigenc_single_byte_code_to_mbc(OnigCodePoint code, UChar *buf)
615 {
616 *buf = (UChar )(code & 0xff);
617 return 1;
618 }
619
620 extern UChar*
onigenc_single_byte_left_adjust_char_head(const UChar * start ARG_UNUSED,const UChar * s)621 onigenc_single_byte_left_adjust_char_head(const UChar* start ARG_UNUSED,
622 const UChar* s)
623 {
624 return (UChar* )s;
625 }
626
627 extern int
onigenc_always_true_is_allowed_reverse_match(const UChar * s ARG_UNUSED,const UChar * end ARG_UNUSED)628 onigenc_always_true_is_allowed_reverse_match(const UChar* s ARG_UNUSED,
629 const UChar* end ARG_UNUSED)
630 {
631 return TRUE;
632 }
633
634 extern int
onigenc_always_false_is_allowed_reverse_match(const UChar * s ARG_UNUSED,const UChar * end ARG_UNUSED)635 onigenc_always_false_is_allowed_reverse_match(const UChar* s ARG_UNUSED,
636 const UChar* end ARG_UNUSED)
637 {
638 return FALSE;
639 }
640
641 extern OnigCodePoint
onigenc_mbn_mbc_to_code(OnigEncoding enc,const UChar * p,const UChar * end)642 onigenc_mbn_mbc_to_code(OnigEncoding enc, const UChar* p, const UChar* end)
643 {
644 int c, i, len;
645 OnigCodePoint n;
646
647 len = enclen(enc, p);
648 n = (OnigCodePoint )(*p++);
649 if (len == 1) return n;
650
651 for (i = 1; i < len; i++) {
652 if (p >= end) break;
653 c = *p++;
654 n <<= 8; n += c;
655 }
656 return n;
657 }
658
659 extern int
onigenc_mbn_mbc_case_fold(OnigEncoding enc,OnigCaseFoldType flag ARG_UNUSED,const UChar ** pp,const UChar * end ARG_UNUSED,UChar * lower)660 onigenc_mbn_mbc_case_fold(OnigEncoding enc, OnigCaseFoldType flag ARG_UNUSED,
661 const UChar** pp, const UChar* end ARG_UNUSED,
662 UChar* lower)
663 {
664 int len;
665 const UChar *p = *pp;
666
667 if (ONIGENC_IS_MBC_ASCII(p)) {
668 *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
669 (*pp)++;
670 return 1;
671 }
672 else {
673 int i;
674
675 len = enclen(enc, p);
676 for (i = 0; i < len; i++) {
677 *lower++ = *p++;
678 }
679 (*pp) += len;
680 return len; /* return byte length of converted to lower char */
681 }
682 }
683
#if 0
/* Disabled: everything up to the matching #endif is compiled out.
   As written, an ASCII character advances *pp by one byte and returns
   ONIGENC_IS_ASCII_CODE_CASE_AMBIG for it; any other character advances *pp
   by enclen(enc, p) and returns FALSE. */
extern int
onigenc_mbn_is_mbc_ambiguous(OnigEncoding enc, OnigCaseFoldType flag,
                             const UChar** pp, const UChar* end)
{
  const UChar* p = *pp;

  if (ONIGENC_IS_MBC_ASCII(p)) {
    (*pp)++;
    return ONIGENC_IS_ASCII_CODE_CASE_AMBIG(*p);
  }

  (*pp) += enclen(enc, p);
  return FALSE;
}
#endif
700
701 extern int
onigenc_mb2_code_to_mbclen(OnigCodePoint code)702 onigenc_mb2_code_to_mbclen(OnigCodePoint code)
703 {
704 if ((code & 0xff00) != 0) return 2;
705 else return 1;
706 }
707
708 extern int
onigenc_mb4_code_to_mbclen(OnigCodePoint code)709 onigenc_mb4_code_to_mbclen(OnigCodePoint code)
710 {
711 if ((code & 0xff000000) != 0) return 4;
712 else if ((code & 0xff0000) != 0) return 3;
713 else if ((code & 0xff00) != 0) return 2;
714 else return 1;
715 }
716
717 extern int
onigenc_mb2_code_to_mbc(OnigEncoding enc,OnigCodePoint code,UChar * buf)718 onigenc_mb2_code_to_mbc(OnigEncoding enc, OnigCodePoint code, UChar *buf)
719 {
720 UChar *p = buf;
721
722 if ((code & 0xff00) != 0) {
723 *p++ = (UChar )((code >> 8) & 0xff);
724 }
725 *p++ = (UChar )(code & 0xff);
726
727 #if 1
728 if (enclen(enc, buf) != (p - buf))
729 return ONIGERR_INVALID_CODE_POINT_VALUE;
730 #endif
731 return p - buf;
732 }
733
734 extern int
onigenc_mb4_code_to_mbc(OnigEncoding enc,OnigCodePoint code,UChar * buf)735 onigenc_mb4_code_to_mbc(OnigEncoding enc, OnigCodePoint code, UChar *buf)
736 {
737 UChar *p = buf;
738
739 if ((code & 0xff000000) != 0) {
740 *p++ = (UChar )((code >> 24) & 0xff);
741 }
742 if ((code & 0xff0000) != 0 || p != buf) {
743 *p++ = (UChar )((code >> 16) & 0xff);
744 }
745 if ((code & 0xff00) != 0 || p != buf) {
746 *p++ = (UChar )((code >> 8) & 0xff);
747 }
748 *p++ = (UChar )(code & 0xff);
749
750 #if 1
751 if (enclen(enc, buf) != (p - buf))
752 return ONIGERR_INVALID_CODE_POINT_VALUE;
753 #endif
754 return p - buf;
755 }
756
757 extern int
onigenc_minimum_property_name_to_ctype(OnigEncoding enc,UChar * p,UChar * end)758 onigenc_minimum_property_name_to_ctype(OnigEncoding enc, UChar* p, UChar* end)
759 {
760 static PosixBracketEntryType PBS[] = {
761 { (UChar* )"Alnum", ONIGENC_CTYPE_ALNUM, 5 },
762 { (UChar* )"Alpha", ONIGENC_CTYPE_ALPHA, 5 },
763 { (UChar* )"Blank", ONIGENC_CTYPE_BLANK, 5 },
764 { (UChar* )"Cntrl", ONIGENC_CTYPE_CNTRL, 5 },
765 { (UChar* )"Digit", ONIGENC_CTYPE_DIGIT, 5 },
766 { (UChar* )"Graph", ONIGENC_CTYPE_GRAPH, 5 },
767 { (UChar* )"Lower", ONIGENC_CTYPE_LOWER, 5 },
768 { (UChar* )"Print", ONIGENC_CTYPE_PRINT, 5 },
769 { (UChar* )"Punct", ONIGENC_CTYPE_PUNCT, 5 },
770 { (UChar* )"Space", ONIGENC_CTYPE_SPACE, 5 },
771 { (UChar* )"Upper", ONIGENC_CTYPE_UPPER, 5 },
772 { (UChar* )"XDigit", ONIGENC_CTYPE_XDIGIT, 6 },
773 { (UChar* )"ASCII", ONIGENC_CTYPE_ASCII, 5 },
774 { (UChar* )"Word", ONIGENC_CTYPE_WORD, 4 },
775 { (UChar* )NULL, -1, 0 }
776 };
777
778 PosixBracketEntryType *pb;
779 int len;
780
781 len = onigenc_strlen(enc, p, end);
782 for (pb = PBS; IS_NOT_NULL(pb->name); pb++) {
783 if (len == pb->len &&
784 onigenc_with_ascii_strncmp(enc, p, end, pb->name, pb->len) == 0)
785 return pb->ctype;
786 }
787
788 return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
789 }
790
791 extern int
onigenc_mb2_is_code_ctype(OnigEncoding enc,OnigCodePoint code,unsigned int ctype)792 onigenc_mb2_is_code_ctype(OnigEncoding enc, OnigCodePoint code,
793 unsigned int ctype)
794 {
795 if (code < 128)
796 return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
797 else {
798 if (CTYPE_IS_WORD_GRAPH_PRINT(ctype)) {
799 return (ONIGENC_CODE_TO_MBCLEN(enc, code) > 1 ? TRUE : FALSE);
800 }
801 }
802
803 return FALSE;
804 }
805
806 extern int
onigenc_mb4_is_code_ctype(OnigEncoding enc,OnigCodePoint code,unsigned int ctype)807 onigenc_mb4_is_code_ctype(OnigEncoding enc, OnigCodePoint code,
808 unsigned int ctype)
809 {
810 if (code < 128)
811 return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
812 else {
813 if (CTYPE_IS_WORD_GRAPH_PRINT(ctype)) {
814 return (ONIGENC_CODE_TO_MBCLEN(enc, code) > 1 ? TRUE : FALSE);
815 }
816 }
817
818 return FALSE;
819 }
820
821 extern int
onigenc_with_ascii_strncmp(OnigEncoding enc,const UChar * p,const UChar * end,const UChar * sascii,int n)822 onigenc_with_ascii_strncmp(OnigEncoding enc, const UChar* p, const UChar* end,
823 const UChar* sascii /* ascii */, int n)
824 {
825 int x, c;
826
827 while (n-- > 0) {
828 if (p >= end) return (int )(*sascii);
829
830 c = (int )ONIGENC_MBC_TO_CODE(enc, p, end);
831 x = *sascii - c;
832 if (x) return x;
833
834 sascii++;
835 p += enclen(enc, p);
836 }
837 return 0;
838 }
839
840 /* Property management */
841 static int
resize_property_list(int new_size,const OnigCodePoint *** plist,int * psize)842 resize_property_list(int new_size, const OnigCodePoint*** plist, int* psize)
843 {
844 int size;
845 const OnigCodePoint **list = *plist;
846
847 size = sizeof(OnigCodePoint*) * new_size;
848 if (IS_NULL(list)) {
849 list = (const OnigCodePoint** )xmalloc(size);
850 }
851 else {
852 list = (const OnigCodePoint** )xrealloc((void* )list, size);
853 }
854
855 if (IS_NULL(list)) return ONIGERR_MEMORY;
856
857 *plist = list;
858 *psize = new_size;
859
860 return 0;
861 }
862
863 extern int
onigenc_property_list_add_property(UChar * name,const OnigCodePoint * prop,hash_table_type ** table,const OnigCodePoint *** plist,int * pnum,int * psize)864 onigenc_property_list_add_property(UChar* name, const OnigCodePoint* prop,
865 hash_table_type **table, const OnigCodePoint*** plist, int *pnum,
866 int *psize)
867 {
868 #define PROP_INIT_SIZE 16
869
870 int r;
871
872 if (*psize <= *pnum) {
873 int new_size = (*psize == 0 ? PROP_INIT_SIZE : *psize * 2);
874 r = resize_property_list(new_size, plist, psize);
875 if (r != 0) return r;
876 }
877
878 (*plist)[*pnum] = prop;
879
880 if (ONIG_IS_NULL(*table)) {
881 *table = onig_st_init_strend_table_with_size(PROP_INIT_SIZE);
882 if (ONIG_IS_NULL(*table)) return ONIGERR_MEMORY;
883 }
884
885 *pnum = *pnum + 1;
886 onig_st_insert_strend(*table, name, name + strlen((char* )name),
887 (hash_data_type )(*pnum + ONIGENC_MAX_STD_CTYPE));
888 return 0;
889 }
890
891 extern int
onigenc_property_list_init(int (* f)(void))892 onigenc_property_list_init(int (*f)(void))
893 {
894 int r;
895
896 THREAD_ATOMIC_START;
897
898 r = f();
899
900 THREAD_ATOMIC_END;
901 return r;
902 }
903