xref: /PHP-5.5/ext/mbstring/oniguruma/regenc.c (revision fe92d64a)
1 /**********************************************************************
2   regenc.c -  Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5  * Copyright (c) 2002-2007  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp>
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include "regint.h"
31 
/* Default encoding: returned by onigenc_get_default_encoding() and
 * overwritten by onigenc_set_default_encoding(). */
32 OnigEncoding OnigEncDefaultCharEncoding = ONIG_ENCODING_INIT_DEFAULT;
33 
/*
 * Initialisation hook for the encoding layer.  Nothing needs to be set
 * up here, so the function always reports success (0).
 */
extern int
onigenc_init(void)
{
  const int status = 0;

  return status;
}
39 
40 extern OnigEncoding
onigenc_get_default_encoding(void)41 onigenc_get_default_encoding(void)
42 {
43   return OnigEncDefaultCharEncoding;
44 }
45 
46 extern int
onigenc_set_default_encoding(OnigEncoding enc)47 onigenc_set_default_encoding(OnigEncoding enc)
48 {
49   OnigEncDefaultCharEncoding = enc;
50   return 0;
51 }
52 
53 extern UChar*
onigenc_get_right_adjust_char_head(OnigEncoding enc,const UChar * start,const UChar * s)54 onigenc_get_right_adjust_char_head(OnigEncoding enc, const UChar* start, const UChar* s)
55 {
56   UChar* p = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s);
57   if (p < s) {
58     p += enclen(enc, p);
59   }
60   return p;
61 }
62 
63 extern UChar*
onigenc_get_right_adjust_char_head_with_prev(OnigEncoding enc,const UChar * start,const UChar * s,const UChar ** prev)64 onigenc_get_right_adjust_char_head_with_prev(OnigEncoding enc,
65 				   const UChar* start, const UChar* s, const UChar** prev)
66 {
67   UChar* p = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s);
68 
69   if (p < s) {
70     if (prev) *prev = (const UChar* )p;
71     p += enclen(enc, p);
72   }
73   else {
74     if (prev) *prev = (const UChar* )NULL; /* Sorry */
75   }
76   return p;
77 }
78 
79 extern UChar*
onigenc_get_prev_char_head(OnigEncoding enc,const UChar * start,const UChar * s)80 onigenc_get_prev_char_head(OnigEncoding enc, const UChar* start, const UChar* s)
81 {
82   if (s <= start)
83     return (UChar* )NULL;
84 
85   return ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s - 1);
86 }
87 
88 extern UChar*
onigenc_step_back(OnigEncoding enc,const UChar * start,const UChar * s,int n)89 onigenc_step_back(OnigEncoding enc, const UChar* start, const UChar* s, int n)
90 {
91   while (ONIG_IS_NOT_NULL(s) && n-- > 0) {
92     if (s <= start)
93       return (UChar* )NULL;
94 
95     s = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s - 1);
96   }
97   return (UChar* )s;
98 }
99 
100 extern UChar*
onigenc_step(OnigEncoding enc,const UChar * p,const UChar * end,int n)101 onigenc_step(OnigEncoding enc, const UChar* p, const UChar* end, int n)
102 {
103   UChar* q = (UChar* )p;
104   while (n-- > 0) {
105     q += ONIGENC_MBC_ENC_LEN(enc, q);
106   }
107   return (q <= end ? q : NULL);
108 }
109 
110 extern int
onigenc_strlen(OnigEncoding enc,const UChar * p,const UChar * end)111 onigenc_strlen(OnigEncoding enc, const UChar* p, const UChar* end)
112 {
113   int n = 0;
114   UChar* q = (UChar* )p;
115 
116   while (q < end) {
117     q += ONIGENC_MBC_ENC_LEN(enc, q);
118     n++;
119   }
120   return n;
121 }
122 
123 extern int
onigenc_strlen_null(OnigEncoding enc,const UChar * s)124 onigenc_strlen_null(OnigEncoding enc, const UChar* s)
125 {
126   int n = 0;
127   UChar* p = (UChar* )s;
128 
129   while (1) {
130     if (*p == '\0') {
131       UChar* q;
132       int len = ONIGENC_MBC_MINLEN(enc);
133 
134       if (len == 1) return n;
135       q = p + 1;
136       while (len > 1) {
137         if (*q != '\0') break;
138         q++;
139         len--;
140       }
141       if (len == 1) return n;
142     }
143     p += ONIGENC_MBC_ENC_LEN(enc, p);
144     n++;
145   }
146 }
147 
148 extern int
onigenc_str_bytelen_null(OnigEncoding enc,const UChar * s)149 onigenc_str_bytelen_null(OnigEncoding enc, const UChar* s)
150 {
151   UChar* start = (UChar* )s;
152   UChar* p = (UChar* )s;
153 
154   while (1) {
155     if (*p == '\0') {
156       UChar* q;
157       int len = ONIGENC_MBC_MINLEN(enc);
158 
159       if (len == 1) return (int )(p - start);
160       q = p + 1;
161       while (len > 1) {
162         if (*q != '\0') break;
163         q++;
164         len--;
165       }
166       if (len == 1) return (int )(p - start);
167     }
168     p += ONIGENC_MBC_ENC_LEN(enc, p);
169   }
170 }
171 
/*
 * Byte -> lower-case byte table for ASCII.  Indexes 0x41..0x5a ('A'..'Z')
 * map to 0x61..0x7a ('a'..'z'); every other index maps to itself.
 */
172 const UChar OnigEncAsciiToLowerCaseTable[] = {
173   '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
174   '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
175   '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
176   '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
177   '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
178   '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
179   '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
180   '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
181   '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
182   '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
183   '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
184   '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
185   '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
186   '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
187   '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
188   '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
189   '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
190   '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
191   '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
192   '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
193   '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
194   '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
195   '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
196   '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
197   '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
198   '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
199   '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
200   '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
201   '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
202   '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
203   '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
204   '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377',
205 };
206 
/*
 * Byte -> upper-case byte table for ASCII, compiled only when
 * USE_UPPER_CASE_TABLE is defined.  Indexes 0x61..0x7a ('a'..'z') map to
 * 0x41..0x5a ('A'..'Z'); every other index maps to itself.
 */
207 #ifdef USE_UPPER_CASE_TABLE
208 const UChar OnigEncAsciiToUpperCaseTable[256] = {
209   '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
210   '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
211   '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
212   '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
213   '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
214   '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
215   '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
216   '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
217   '\100', '\101', '\102', '\103', '\104', '\105', '\106', '\107',
218   '\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117',
219   '\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127',
220   '\130', '\131', '\132', '\133', '\134', '\135', '\136', '\137',
221   '\140', '\101', '\102', '\103', '\104', '\105', '\106', '\107',
222   '\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117',
223   '\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127',
224   '\130', '\131', '\132', '\173', '\174', '\175', '\176', '\177',
225   '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
226   '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
227   '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
228   '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
229   '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
230   '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
231   '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
232   '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
233   '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
234   '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
235   '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
236   '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
237   '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
238   '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
239   '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
240   '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377',
241 };
242 #endif
243 
/*
 * Per-byte character-class bit masks, indexed by byte value.  Only the
 * first 128 entries (ASCII) carry bits; entries 0x80..0xff are all zero.
 * NOTE(review): the meaning of the individual bits is defined outside
 * this file (presumably by the ONIGENC_CTYPE_* constants) -- confirm.
 */
244 const unsigned short OnigEncAsciiCtypeTable[256] = {
245   0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
246   0x4008, 0x420c, 0x4209, 0x4208, 0x4208, 0x4208, 0x4008, 0x4008,
247   0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
248   0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
249   0x4284, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
250   0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
251   0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0,
252   0x78b0, 0x78b0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
253   0x41a0, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x74a2,
254   0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
255   0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
256   0x74a2, 0x74a2, 0x74a2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x51a0,
257   0x41a0, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x70e2,
258   0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
259   0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
260   0x70e2, 0x70e2, 0x70e2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x4008,
261   0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
262   0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
263   0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
264   0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
265   0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
266   0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
267   0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
268   0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
269   0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
270   0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
271   0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
272   0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
273   0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
274   0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
275   0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
276   0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000
277 };
278 
/*
 * Byte -> lower-case byte table for ISO-8859-1.  In addition to the
 * ASCII mapping 0x41..0x5a -> 0x61..0x7a, indexes 0xc0..0xde map to
 * 0xe0..0xfe, except 0xd7 which maps to itself.  0xdf and every other
 * index also map to themselves.
 */
279 const UChar OnigEncISO_8859_1_ToLowerCaseTable[256] = {
280   '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
281   '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
282   '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
283   '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
284   '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
285   '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
286   '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
287   '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
288   '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
289   '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
290   '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
291   '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
292   '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
293   '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
294   '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
295   '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
296   '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
297   '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
298   '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
299   '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
300   '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
301   '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
302   '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
303   '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
304   '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
305   '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
306   '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\327',
307   '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\337',
308   '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
309   '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
310   '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
311   '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377'
312 };
313 
/*
 * Byte -> upper-case byte table for ISO-8859-1, compiled only when
 * USE_UPPER_CASE_TABLE is defined.  In addition to the ASCII mapping
 * 0x61..0x7a -> 0x41..0x5a, indexes 0xe0..0xfe map to 0xc0..0xde, except
 * 0xf7 which maps to itself.  0xff and every other index also map to
 * themselves.
 */
314 #ifdef USE_UPPER_CASE_TABLE
315 const UChar OnigEncISO_8859_1_ToUpperCaseTable[256] = {
316   '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
317   '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
318   '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
319   '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
320   '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
321   '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
322   '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
323   '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
324   '\100', '\101', '\102', '\103', '\104', '\105', '\106', '\107',
325   '\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117',
326   '\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127',
327   '\130', '\131', '\132', '\133', '\134', '\135', '\136', '\137',
328   '\140', '\101', '\102', '\103', '\104', '\105', '\106', '\107',
329   '\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117',
330   '\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127',
331   '\130', '\131', '\132', '\173', '\174', '\175', '\176', '\177',
332   '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
333   '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
334   '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
335   '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
336   '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
337   '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
338   '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
339   '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
340   '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
341   '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
342   '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
343   '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
344   '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
345   '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
346   '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\367',
347   '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\377',
348 };
349 #endif
350 
351 extern void
onigenc_set_default_caseconv_table(const UChar * table ARG_UNUSED)352 onigenc_set_default_caseconv_table(const UChar* table ARG_UNUSED)
353 {
354   /* nothing */
355   /* obsoleted. */
356 }
357 
358 extern UChar*
onigenc_get_left_adjust_char_head(OnigEncoding enc,const UChar * start,const UChar * s)359 onigenc_get_left_adjust_char_head(OnigEncoding enc, const UChar* start, const UChar* s)
360 {
361   return ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s);
362 }
363 
/*
 * Case-fold pairs for the 26 ASCII letters: each entry is
 * { upper-case code (0x41..0x5a), lower-case code (0x61..0x7a) }.
 * onigenc_ascii_apply_all_case_fold() reads the pair through its
 * `from` and `to` members.
 */
364 const OnigPairCaseFoldCodes OnigAsciiLowerMap[] = {
365   { 0x41, 0x61 },
366   { 0x42, 0x62 },
367   { 0x43, 0x63 },
368   { 0x44, 0x64 },
369   { 0x45, 0x65 },
370   { 0x46, 0x66 },
371   { 0x47, 0x67 },
372   { 0x48, 0x68 },
373   { 0x49, 0x69 },
374   { 0x4a, 0x6a },
375   { 0x4b, 0x6b },
376   { 0x4c, 0x6c },
377   { 0x4d, 0x6d },
378   { 0x4e, 0x6e },
379   { 0x4f, 0x6f },
380   { 0x50, 0x70 },
381   { 0x51, 0x71 },
382   { 0x52, 0x72 },
383   { 0x53, 0x73 },
384   { 0x54, 0x74 },
385   { 0x55, 0x75 },
386   { 0x56, 0x76 },
387   { 0x57, 0x77 },
388   { 0x58, 0x78 },
389   { 0x59, 0x79 },
390   { 0x5a, 0x7a }
391 };
392 
393 extern int
onigenc_ascii_apply_all_case_fold(OnigCaseFoldType flag ARG_UNUSED,OnigApplyAllCaseFoldFunc f,void * arg)394 onigenc_ascii_apply_all_case_fold(OnigCaseFoldType flag ARG_UNUSED,
395 				  OnigApplyAllCaseFoldFunc f, void* arg)
396 {
397   OnigCodePoint code;
398   int i, r;
399 
400   for (i = 0;
401        i < (int )(sizeof(OnigAsciiLowerMap)/sizeof(OnigPairCaseFoldCodes));
402        i++) {
403     code = OnigAsciiLowerMap[i].to;
404     r = (*f)(OnigAsciiLowerMap[i].from, &code, 1, arg);
405     if (r != 0) return r;
406 
407     code = OnigAsciiLowerMap[i].from;
408     r = (*f)(OnigAsciiLowerMap[i].to, &code, 1, arg);
409     if (r != 0) return r;
410   }
411 
412   return 0;
413 }
414 
415 extern int
onigenc_ascii_get_case_fold_codes_by_str(OnigCaseFoldType flag ARG_UNUSED,const OnigUChar * p,const OnigUChar * end ARG_UNUSED,OnigCaseFoldCodeItem items[])416 onigenc_ascii_get_case_fold_codes_by_str(OnigCaseFoldType flag ARG_UNUSED,
417 	 const OnigUChar* p, const OnigUChar* end ARG_UNUSED,
418 	 OnigCaseFoldCodeItem items[])
419 {
420   if (0x41 <= *p && *p <= 0x5a) {
421     items[0].byte_len = 1;
422     items[0].code_len = 1;
423     items[0].code[0] = (OnigCodePoint )(*p + 0x20);
424     return 1;
425   }
426   else if (0x61 <= *p && *p <= 0x7a) {
427     items[0].byte_len = 1;
428     items[0].code_len = 1;
429     items[0].code[0] = (OnigCodePoint )(*p - 0x20);
430     return 1;
431   }
432   else
433     return 0;
434 }
435 
436 static int
ss_apply_all_case_fold(OnigCaseFoldType flag ARG_UNUSED,OnigApplyAllCaseFoldFunc f,void * arg)437 ss_apply_all_case_fold(OnigCaseFoldType flag ARG_UNUSED,
438 		       OnigApplyAllCaseFoldFunc f, void* arg)
439 {
440   static OnigCodePoint ss[] = { 0x73, 0x73 };
441 
442   return (*f)((OnigCodePoint )0xdf, ss, 2, arg);
443 }
444 
445 extern int
onigenc_apply_all_case_fold_with_map(int map_size,const OnigPairCaseFoldCodes map[],int ess_tsett_flag,OnigCaseFoldType flag,OnigApplyAllCaseFoldFunc f,void * arg)446 onigenc_apply_all_case_fold_with_map(int map_size,
447     const OnigPairCaseFoldCodes map[],
448     int ess_tsett_flag, OnigCaseFoldType flag,
449     OnigApplyAllCaseFoldFunc f, void* arg)
450 {
451   OnigCodePoint code;
452   int i, r;
453 
454   r = onigenc_ascii_apply_all_case_fold(flag, f, arg);
455   if (r != 0) return r;
456 
457   for (i = 0; i < map_size; i++) {
458     code = map[i].to;
459     r = (*f)(map[i].from, &code, 1, arg);
460     if (r != 0) return r;
461 
462     code = map[i].from;
463     r = (*f)(map[i].to, &code, 1, arg);
464     if (r != 0) return r;
465   }
466 
467   if (ess_tsett_flag != 0)
468     return ss_apply_all_case_fold(flag, f, arg);
469 
470   return 0;
471 }
472 
473 extern int
onigenc_get_case_fold_codes_by_str_with_map(int map_size,const OnigPairCaseFoldCodes map[],int ess_tsett_flag,OnigCaseFoldType flag ARG_UNUSED,const OnigUChar * p,const OnigUChar * end,OnigCaseFoldCodeItem items[])474 onigenc_get_case_fold_codes_by_str_with_map(int map_size,
475     const OnigPairCaseFoldCodes map[],
476     int ess_tsett_flag, OnigCaseFoldType flag ARG_UNUSED,
477     const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[])
478 {
479   if (0x41 <= *p && *p <= 0x5a) {
480     items[0].byte_len = 1;
481     items[0].code_len = 1;
482     items[0].code[0] = (OnigCodePoint )(*p + 0x20);
483     if (*p == 0x53 && ess_tsett_flag != 0 && end > p + 1
484 	&& (*(p+1) == 0x53 || *(p+1) == 0x73)) {
485       /* SS */
486       items[1].byte_len = 2;
487       items[1].code_len = 1;
488       items[1].code[0] = (OnigCodePoint )0xdf;
489       return 2;
490     }
491     else
492       return 1;
493   }
494   else if (0x61 <= *p && *p <= 0x7a) {
495     items[0].byte_len = 1;
496     items[0].code_len = 1;
497     items[0].code[0] = (OnigCodePoint )(*p - 0x20);
498     if (*p == 0x73 && ess_tsett_flag != 0 && end > p + 1
499 	&& (*(p+1) == 0x73 || *(p+1) == 0x53)) {
500       /* ss */
501       items[1].byte_len = 2;
502       items[1].code_len = 1;
503       items[1].code[0] = (OnigCodePoint )0xdf;
504       return 2;
505     }
506     else
507       return 1;
508   }
509   else if (*p == 0xdf && ess_tsett_flag != 0) {
510     items[0].byte_len = 1;
511     items[0].code_len = 2;
512     items[0].code[0] = (OnigCodePoint )'s';
513     items[0].code[1] = (OnigCodePoint )'s';
514 
515     items[1].byte_len = 1;
516     items[1].code_len = 2;
517     items[1].code[0] = (OnigCodePoint )'S';
518     items[1].code[1] = (OnigCodePoint )'S';
519 
520     items[2].byte_len = 1;
521     items[2].code_len = 2;
522     items[2].code[0] = (OnigCodePoint )'s';
523     items[2].code[1] = (OnigCodePoint )'S';
524 
525     items[3].byte_len = 1;
526     items[3].code_len = 2;
527     items[3].code[0] = (OnigCodePoint )'S';
528     items[3].code[1] = (OnigCodePoint )'s';
529 
530     return 4;
531   }
532   else {
533     int i;
534 
535     for (i = 0; i < map_size; i++) {
536       if (*p == map[i].from) {
537 	items[0].byte_len = 1;
538 	items[0].code_len = 1;
539 	items[0].code[0] = map[i].to;
540 	return 1;
541       }
542       else if (*p == map[i].to) {
543 	items[0].byte_len = 1;
544 	items[0].code_len = 1;
545 	items[0].code[0] = map[i].from;
546 	return 1;
547       }
548     }
549   }
550 
551   return 0;
552 }
553 
554 
555 extern int
onigenc_not_support_get_ctype_code_range(OnigCtype ctype ARG_UNUSED,OnigCodePoint * sb_out ARG_UNUSED,const OnigCodePoint * ranges[]ARG_UNUSED)556 onigenc_not_support_get_ctype_code_range(OnigCtype ctype ARG_UNUSED,
557 	 OnigCodePoint* sb_out ARG_UNUSED,
558 	 const OnigCodePoint* ranges[] ARG_UNUSED)
559 {
560   return ONIG_NO_SUPPORT_CONFIG;
561 }
562 
563 extern int
onigenc_is_mbc_newline_0x0a(const UChar * p,const UChar * end)564 onigenc_is_mbc_newline_0x0a(const UChar* p, const UChar* end)
565 {
566   if (p < end) {
567     if (*p == 0x0a) return 1;
568   }
569   return 0;
570 }
571 
572 /* for single byte encodings */
573 extern int
onigenc_ascii_mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED,const UChar ** p,const UChar * end ARG_UNUSED,UChar * lower)574 onigenc_ascii_mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED, const UChar** p,
575 	    const UChar*end ARG_UNUSED, UChar* lower)
576 {
577   *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(**p);
578 
579   (*p)++;
580   return 1; /* return byte length of converted char to lower */
581 }
582 
/* Compiled out by "#if 0": single-byte ambiguity check, not part of the build. */
583 #if 0
584 extern int
585 onigenc_ascii_is_mbc_ambiguous(OnigCaseFoldType flag,
586 			       const UChar** pp, const UChar* end)
587 {
588   const UChar* p = *pp;
589 
590   (*pp)++;
591   return ONIGENC_IS_ASCII_CODE_CASE_AMBIG(*p);
592 }
593 #endif
594 
595 extern int
onigenc_single_byte_mbc_enc_len(const UChar * p ARG_UNUSED)596 onigenc_single_byte_mbc_enc_len(const UChar* p ARG_UNUSED)
597 {
598   return 1;
599 }
600 
601 extern OnigCodePoint
onigenc_single_byte_mbc_to_code(const UChar * p,const UChar * end ARG_UNUSED)602 onigenc_single_byte_mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED)
603 {
604   return (OnigCodePoint )(*p);
605 }
606 
607 extern int
onigenc_single_byte_code_to_mbclen(OnigCodePoint code ARG_UNUSED)608 onigenc_single_byte_code_to_mbclen(OnigCodePoint code ARG_UNUSED)
609 {
610   return (code < 0x100 ? 1 : ONIGERR_INVALID_CODE_POINT_VALUE);
611 }
612 
613 extern int
onigenc_single_byte_code_to_mbc(OnigCodePoint code,UChar * buf)614 onigenc_single_byte_code_to_mbc(OnigCodePoint code, UChar *buf)
615 {
616   *buf = (UChar )(code & 0xff);
617   return 1;
618 }
619 
620 extern UChar*
onigenc_single_byte_left_adjust_char_head(const UChar * start ARG_UNUSED,const UChar * s)621 onigenc_single_byte_left_adjust_char_head(const UChar* start ARG_UNUSED,
622 					  const UChar* s)
623 {
624   return (UChar* )s;
625 }
626 
627 extern int
onigenc_always_true_is_allowed_reverse_match(const UChar * s ARG_UNUSED,const UChar * end ARG_UNUSED)628 onigenc_always_true_is_allowed_reverse_match(const UChar* s   ARG_UNUSED,
629 					     const UChar* end ARG_UNUSED)
630 {
631   return TRUE;
632 }
633 
634 extern int
onigenc_always_false_is_allowed_reverse_match(const UChar * s ARG_UNUSED,const UChar * end ARG_UNUSED)635 onigenc_always_false_is_allowed_reverse_match(const UChar* s   ARG_UNUSED,
636 					      const UChar* end ARG_UNUSED)
637 {
638   return FALSE;
639 }
640 
641 extern OnigCodePoint
onigenc_mbn_mbc_to_code(OnigEncoding enc,const UChar * p,const UChar * end)642 onigenc_mbn_mbc_to_code(OnigEncoding enc, const UChar* p, const UChar* end)
643 {
644   int c, i, len;
645   OnigCodePoint n;
646 
647   len = enclen(enc, p);
648   n = (OnigCodePoint )(*p++);
649   if (len == 1) return n;
650 
651   for (i = 1; i < len; i++) {
652     if (p >= end) break;
653     c = *p++;
654     n <<= 8;  n += c;
655   }
656   return n;
657 }
658 
659 extern int
onigenc_mbn_mbc_case_fold(OnigEncoding enc,OnigCaseFoldType flag ARG_UNUSED,const UChar ** pp,const UChar * end ARG_UNUSED,UChar * lower)660 onigenc_mbn_mbc_case_fold(OnigEncoding enc, OnigCaseFoldType flag ARG_UNUSED,
661                           const UChar** pp, const UChar* end ARG_UNUSED,
662 			  UChar* lower)
663 {
664   int len;
665   const UChar *p = *pp;
666 
667   if (ONIGENC_IS_MBC_ASCII(p)) {
668     *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
669     (*pp)++;
670     return 1;
671   }
672   else {
673     int i;
674 
675     len = enclen(enc, p);
676     for (i = 0; i < len; i++) {
677       *lower++ = *p++;
678     }
679     (*pp) += len;
680     return len; /* return byte length of converted to lower char */
681   }
682 }
683 
/* Compiled out by "#if 0": multi-byte ambiguity check, not part of the build. */
684 #if 0
685 extern int
686 onigenc_mbn_is_mbc_ambiguous(OnigEncoding enc, OnigCaseFoldType flag,
687                              const UChar** pp, const UChar* end)
688 {
689   const UChar* p = *pp;
690 
691   if (ONIGENC_IS_MBC_ASCII(p)) {
692     (*pp)++;
693     return ONIGENC_IS_ASCII_CODE_CASE_AMBIG(*p);
694   }
695 
696   (*pp) += enclen(enc, p);
697   return FALSE;
698 }
699 #endif
700 
701 extern int
onigenc_mb2_code_to_mbclen(OnigCodePoint code)702 onigenc_mb2_code_to_mbclen(OnigCodePoint code)
703 {
704   if ((code & 0xff00) != 0) return 2;
705   else return 1;
706 }
707 
708 extern int
onigenc_mb4_code_to_mbclen(OnigCodePoint code)709 onigenc_mb4_code_to_mbclen(OnigCodePoint code)
710 {
711        if ((code & 0xff000000) != 0) return 4;
712   else if ((code & 0xff0000) != 0) return 3;
713   else if ((code & 0xff00) != 0) return 2;
714   else return 1;
715 }
716 
717 extern int
onigenc_mb2_code_to_mbc(OnigEncoding enc,OnigCodePoint code,UChar * buf)718 onigenc_mb2_code_to_mbc(OnigEncoding enc, OnigCodePoint code, UChar *buf)
719 {
720   UChar *p = buf;
721 
722   if ((code & 0xff00) != 0) {
723     *p++ = (UChar )((code >>  8) & 0xff);
724   }
725   *p++ = (UChar )(code & 0xff);
726 
727 #if 1
728   if (enclen(enc, buf) != (p - buf))
729     return ONIGERR_INVALID_CODE_POINT_VALUE;
730 #endif
731   return p - buf;
732 }
733 
734 extern int
onigenc_mb4_code_to_mbc(OnigEncoding enc,OnigCodePoint code,UChar * buf)735 onigenc_mb4_code_to_mbc(OnigEncoding enc, OnigCodePoint code, UChar *buf)
736 {
737   UChar *p = buf;
738 
739   if ((code & 0xff000000) != 0) {
740     *p++ = (UChar )((code >> 24) & 0xff);
741   }
742   if ((code & 0xff0000) != 0 || p != buf) {
743     *p++ = (UChar )((code >> 16) & 0xff);
744   }
745   if ((code & 0xff00) != 0 || p != buf) {
746     *p++ = (UChar )((code >> 8) & 0xff);
747   }
748   *p++ = (UChar )(code & 0xff);
749 
750 #if 1
751   if (enclen(enc, buf) != (p - buf))
752     return ONIGERR_INVALID_CODE_POINT_VALUE;
753 #endif
754   return p - buf;
755 }
756 
757 extern int
onigenc_minimum_property_name_to_ctype(OnigEncoding enc,UChar * p,UChar * end)758 onigenc_minimum_property_name_to_ctype(OnigEncoding enc, UChar* p, UChar* end)
759 {
760   static PosixBracketEntryType PBS[] = {
761     { (UChar* )"Alnum",  ONIGENC_CTYPE_ALNUM,  5 },
762     { (UChar* )"Alpha",  ONIGENC_CTYPE_ALPHA,  5 },
763     { (UChar* )"Blank",  ONIGENC_CTYPE_BLANK,  5 },
764     { (UChar* )"Cntrl",  ONIGENC_CTYPE_CNTRL,  5 },
765     { (UChar* )"Digit",  ONIGENC_CTYPE_DIGIT,  5 },
766     { (UChar* )"Graph",  ONIGENC_CTYPE_GRAPH,  5 },
767     { (UChar* )"Lower",  ONIGENC_CTYPE_LOWER,  5 },
768     { (UChar* )"Print",  ONIGENC_CTYPE_PRINT,  5 },
769     { (UChar* )"Punct",  ONIGENC_CTYPE_PUNCT,  5 },
770     { (UChar* )"Space",  ONIGENC_CTYPE_SPACE,  5 },
771     { (UChar* )"Upper",  ONIGENC_CTYPE_UPPER,  5 },
772     { (UChar* )"XDigit", ONIGENC_CTYPE_XDIGIT, 6 },
773     { (UChar* )"ASCII",  ONIGENC_CTYPE_ASCII,  5 },
774     { (UChar* )"Word",   ONIGENC_CTYPE_WORD,   4 },
775     { (UChar* )NULL, -1, 0 }
776   };
777 
778   PosixBracketEntryType *pb;
779   int len;
780 
781   len = onigenc_strlen(enc, p, end);
782   for (pb = PBS; IS_NOT_NULL(pb->name); pb++) {
783     if (len == pb->len &&
784         onigenc_with_ascii_strncmp(enc, p, end, pb->name, pb->len) == 0)
785       return pb->ctype;
786   }
787 
788   return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
789 }
790 
791 extern int
onigenc_mb2_is_code_ctype(OnigEncoding enc,OnigCodePoint code,unsigned int ctype)792 onigenc_mb2_is_code_ctype(OnigEncoding enc, OnigCodePoint code,
793 			  unsigned int ctype)
794 {
795   if (code < 128)
796     return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
797   else {
798     if (CTYPE_IS_WORD_GRAPH_PRINT(ctype)) {
799       return (ONIGENC_CODE_TO_MBCLEN(enc, code) > 1 ? TRUE : FALSE);
800     }
801   }
802 
803   return FALSE;
804 }
805 
806 extern int
onigenc_mb4_is_code_ctype(OnigEncoding enc,OnigCodePoint code,unsigned int ctype)807 onigenc_mb4_is_code_ctype(OnigEncoding enc, OnigCodePoint code,
808 			  unsigned int ctype)
809 {
810   if (code < 128)
811     return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
812   else {
813     if (CTYPE_IS_WORD_GRAPH_PRINT(ctype)) {
814       return (ONIGENC_CODE_TO_MBCLEN(enc, code) > 1 ? TRUE : FALSE);
815     }
816   }
817 
818   return FALSE;
819 }
820 
821 extern int
onigenc_with_ascii_strncmp(OnigEncoding enc,const UChar * p,const UChar * end,const UChar * sascii,int n)822 onigenc_with_ascii_strncmp(OnigEncoding enc, const UChar* p, const UChar* end,
823                            const UChar* sascii /* ascii */, int n)
824 {
825   int x, c;
826 
827   while (n-- > 0) {
828     if (p >= end) return (int )(*sascii);
829 
830     c = (int )ONIGENC_MBC_TO_CODE(enc, p, end);
831     x = *sascii - c;
832     if (x) return x;
833 
834     sascii++;
835     p += enclen(enc, p);
836   }
837   return 0;
838 }
839 
840 /* Property management */
841 static int
resize_property_list(int new_size,const OnigCodePoint *** plist,int * psize)842 resize_property_list(int new_size, const OnigCodePoint*** plist, int* psize)
843 {
844   int size;
845   const OnigCodePoint **list = *plist;
846 
847   size = sizeof(OnigCodePoint*) * new_size;
848   if (IS_NULL(list)) {
849     list = (const OnigCodePoint** )xmalloc(size);
850   }
851   else {
852     list = (const OnigCodePoint** )xrealloc((void* )list, size);
853   }
854 
855   if (IS_NULL(list)) return ONIGERR_MEMORY;
856 
857   *plist = list;
858   *psize = new_size;
859 
860   return 0;
861 }
862 
863 extern int
onigenc_property_list_add_property(UChar * name,const OnigCodePoint * prop,hash_table_type ** table,const OnigCodePoint *** plist,int * pnum,int * psize)864 onigenc_property_list_add_property(UChar* name, const OnigCodePoint* prop,
865      hash_table_type **table, const OnigCodePoint*** plist, int *pnum,
866      int *psize)
867 {
868 #define PROP_INIT_SIZE     16
869 
870   int r;
871 
872   if (*psize <= *pnum) {
873     int new_size = (*psize == 0 ? PROP_INIT_SIZE : *psize * 2);
874     r = resize_property_list(new_size, plist, psize);
875     if (r != 0) return r;
876   }
877 
878   (*plist)[*pnum] = prop;
879 
880   if (ONIG_IS_NULL(*table)) {
881     *table = onig_st_init_strend_table_with_size(PROP_INIT_SIZE);
882     if (ONIG_IS_NULL(*table)) return ONIGERR_MEMORY;
883   }
884 
885   *pnum = *pnum + 1;
886   onig_st_insert_strend(*table, name, name + strlen((char* )name),
887 			(hash_data_type )(*pnum + ONIGENC_MAX_STD_CTYPE));
888   return 0;
889 }
890 
891 extern int
onigenc_property_list_init(int (* f)(void))892 onigenc_property_list_init(int (*f)(void))
893 {
894   int r;
895 
896   THREAD_ATOMIC_START;
897 
898   r = f();
899 
900   THREAD_ATOMIC_END;
901   return r;
902 }
903