xref: /PHP-7.3/ext/mbstring/oniguruma/src/unicode.c (revision 1979c5d1)
1 /**********************************************************************
2   unicode.c -  Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5  * Copyright (c) 2002-2019  K.Kosako
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include "regint.h"
31 
/*
 * A (name, ctype) pair of short integers.
 * NOTE(review): `name` is presumably an offset into a property-name string
 * pool and `ctype` the property index it maps to -- confirm against the
 * generated property data that instantiates this struct.
 */
struct PoolPropertyNameCtype {
  short name;
  short ctype;
};
36 
/*
 * Non-zero when the table entry for `code` has the bit of `ctype` set.
 * `code` is used directly as the table index, so it must be below 256.
 */
#define ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE(code,ctype) \
  ((EncUNICODE_ISO_8859_1_CtypeTable[code] & CTYPE_TO_BIT(ctype)) != 0)

/*
 * Ctype bit masks for code points 0x00..0xFF, one entry per code point.
 * Each row below holds eight consecutive entries; the leading comment is
 * the index of the first entry in the row.
 */
static const unsigned short EncUNICODE_ISO_8859_1_CtypeTable[256] = {
  /* 0x00 */ 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
  /* 0x08 */ 0x4008, 0x428c, 0x4289, 0x4288, 0x4288, 0x4288, 0x4008, 0x4008,
  /* 0x10 */ 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
  /* 0x18 */ 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
  /* 0x20 */ 0x4284, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
  /* 0x28 */ 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
  /* 0x30 */ 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0,
  /* 0x38 */ 0x78b0, 0x78b0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
  /* 0x40 */ 0x41a0, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x74a2,
  /* 0x48 */ 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
  /* 0x50 */ 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
  /* 0x58 */ 0x74a2, 0x74a2, 0x74a2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x51a0,
  /* 0x60 */ 0x41a0, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x70e2,
  /* 0x68 */ 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
  /* 0x70 */ 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
  /* 0x78 */ 0x70e2, 0x70e2, 0x70e2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x4008,
  /* 0x80 */ 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0288, 0x0008, 0x0008,
  /* 0x88 */ 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
  /* 0x90 */ 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
  /* 0x98 */ 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
  /* 0xa0 */ 0x0284, 0x01a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0,
  /* 0xa8 */ 0x00a0, 0x00a0, 0x30e2, 0x01a0, 0x00a0, 0x00a8, 0x00a0, 0x00a0,
  /* 0xb0 */ 0x00a0, 0x00a0, 0x10a0, 0x10a0, 0x00a0, 0x30e2, 0x00a0, 0x01a0,
  /* 0xb8 */ 0x00a0, 0x10a0, 0x30e2, 0x01a0, 0x10a0, 0x10a0, 0x10a0, 0x01a0,
  /* 0xc0 */ 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2,
  /* 0xc8 */ 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2,
  /* 0xd0 */ 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x00a0,
  /* 0xd8 */ 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x30e2,
  /* 0xe0 */ 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2,
  /* 0xe8 */ 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2,
  /* 0xf0 */ 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x00a0,
  /* 0xf8 */ 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2
};
74 
75 #include "st.h"
76 
77 #include "unicode_fold_data.c"
78 
79 extern int
onigenc_unicode_mbc_case_fold(OnigEncoding enc,OnigCaseFoldType flag ARG_UNUSED,const UChar ** pp,const UChar * end,UChar * fold)80 onigenc_unicode_mbc_case_fold(OnigEncoding enc,
81     OnigCaseFoldType flag ARG_UNUSED, const UChar** pp, const UChar* end,
82     UChar* fold)
83 {
84   const struct ByUnfoldKey* buk;
85 
86   OnigCodePoint code;
87   int i, len, rlen;
88   const UChar *p = *pp;
89 
90   code = ONIGENC_MBC_TO_CODE(enc, p, end);
91   len = enclen(enc, p);
92   *pp += len;
93 
94 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
95   if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
96     if (code == 0x0130) {
97       return ONIGENC_CODE_TO_MBC(enc, 0x0069, fold);
98     }
99 #if 0
100     if (code == 0x0049) {
101       return ONIGENC_CODE_TO_MBC(enc, 0x0131, fold);
102     }
103 #endif
104   }
105 #endif
106 
107   buk = onigenc_unicode_unfold_key(code);
108   if (buk != 0) {
109     if (buk->fold_len == 1) {
110       return ONIGENC_CODE_TO_MBC(enc, *FOLDS1_FOLD(buk->index), fold);
111     }
112     else {
113       OnigCodePoint* addr;
114 
115       FOLDS_FOLD_ADDR_BUK(buk, addr);
116       rlen = 0;
117       for (i = 0; i < buk->fold_len; i++) {
118         OnigCodePoint c = addr[i];
119         len = ONIGENC_CODE_TO_MBC(enc, c, fold);
120         fold += len;
121         rlen += len;
122       }
123       return rlen;
124     }
125   }
126 
127   for (i = 0; i < len; i++) {
128     *fold++ = *p++;
129   }
130   return len;
131 }
132 
133 static int
apply_case_fold1(int from,int to,OnigApplyAllCaseFoldFunc f,void * arg)134 apply_case_fold1(int from, int to, OnigApplyAllCaseFoldFunc f, void* arg)
135 {
136   int i, j, k, n, r;
137 
138   for (i = from; i < to; ) {
139     OnigCodePoint fold = *FOLDS1_FOLD(i);
140     n = FOLDS1_UNFOLDS_NUM(i);
141     for (j = 0; j < n; j++) {
142       OnigCodePoint unfold = FOLDS1_UNFOLDS(i)[j];
143 
144       r = (*f)(fold, &unfold, 1, arg);
145       if (r != 0) return r;
146       r = (*f)(unfold, &fold, 1, arg);
147       if (r != 0) return r;
148 
149       for (k = 0; k < j; k++) {
150         OnigCodePoint unfold2 = FOLDS1_UNFOLDS(i)[k];
151         r = (*f)(unfold, &unfold2, 1, arg);
152         if (r != 0) return r;
153         r = (*f)(unfold2, &unfold, 1, arg);
154         if (r != 0) return r;
155       }
156     }
157 
158     i = FOLDS1_NEXT_INDEX(i);
159   }
160 
161   return 0;
162 }
163 
164 static int
apply_case_fold2(int from,int to,OnigApplyAllCaseFoldFunc f,void * arg)165 apply_case_fold2(int from, int to, OnigApplyAllCaseFoldFunc f, void* arg)
166 {
167   int i, j, k, n, r;
168 
169   for (i = from; i < to; ) {
170     OnigCodePoint* fold = FOLDS2_FOLD(i);
171     n = FOLDS2_UNFOLDS_NUM(i);
172     for (j = 0; j < n; j++) {
173       OnigCodePoint unfold = FOLDS2_UNFOLDS(i)[j];
174 
175       r = (*f)(unfold, fold, 2, arg);
176       if (r != 0) return r;
177 
178       for (k = 0; k < j; k++) {
179         OnigCodePoint unfold2 = FOLDS2_UNFOLDS(i)[k];
180         r = (*f)(unfold, &unfold2, 1, arg);
181         if (r != 0) return r;
182         r = (*f)(unfold2, &unfold, 1, arg);
183         if (r != 0) return r;
184       }
185     }
186 
187     i = FOLDS2_NEXT_INDEX(i);
188   }
189 
190   return 0;
191 }
192 
193 static int
apply_case_fold3(int from,int to,OnigApplyAllCaseFoldFunc f,void * arg)194 apply_case_fold3(int from, int to, OnigApplyAllCaseFoldFunc f, void* arg)
195 {
196   int i, j, k, n, r;
197 
198   for (i = from; i < to; ) {
199     OnigCodePoint* fold = FOLDS3_FOLD(i);
200     n = FOLDS3_UNFOLDS_NUM(i);
201     for (j = 0; j < n; j++) {
202       OnigCodePoint unfold = FOLDS3_UNFOLDS(i)[j];
203 
204       r = (*f)(unfold, fold, 3, arg);
205       if (r != 0) return r;
206 
207       for (k = 0; k < j; k++) {
208         OnigCodePoint unfold2 = FOLDS3_UNFOLDS(i)[k];
209         r = (*f)(unfold, &unfold2, 1, arg);
210         if (r != 0) return r;
211         r = (*f)(unfold2, &unfold, 1, arg);
212         if (r != 0) return r;
213       }
214     }
215 
216     i = FOLDS3_NEXT_INDEX(i);
217   }
218 
219   return 0;
220 }
221 
222 extern int
onigenc_unicode_apply_all_case_fold(OnigCaseFoldType flag,OnigApplyAllCaseFoldFunc f,void * arg)223 onigenc_unicode_apply_all_case_fold(OnigCaseFoldType flag,
224                                     OnigApplyAllCaseFoldFunc f, void* arg)
225 {
226   int r;
227 
228   r = apply_case_fold1(0, FOLDS1_NORMAL_END_INDEX, f, arg);
229   if (r != 0) return r;
230 
231 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
232   if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
233     code = 0x0131;
234     r = (*f)(0x0049, &code, 1, arg);
235     if (r != 0) return r;
236     code = 0x0049;
237     r = (*f)(0x0131, &code, 1, arg);
238     if (r != 0) return r;
239 
240     code = 0x0130;
241     r = (*f)(0x0069, &code, 1, arg);
242     if (r != 0) return r;
243     code = 0x0069;
244     r = (*f)(0x0130, &code, 1, arg);
245     if (r != 0) return r;
246   }
247   else {
248 #endif
249     r = apply_case_fold1(FOLDS1_NORMAL_END_INDEX, FOLDS1_END_INDEX, f, arg);
250     if (r != 0) return r;
251 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
252   }
253 #endif
254 
255   if ((flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) == 0)
256     return 0;
257 
258   r = apply_case_fold2(0, FOLDS2_NORMAL_END_INDEX, f, arg);
259   if (r != 0) return r;
260 
261 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
262   if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) == 0) {
263 #endif
264     r = apply_case_fold2(FOLDS2_NORMAL_END_INDEX, FOLDS2_END_INDEX, f, arg);
265     if (r != 0) return r;
266 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
267   }
268 #endif
269 
270   r = apply_case_fold3(0, FOLDS3_NORMAL_END_INDEX, f, arg);
271   if (r != 0) return r;
272 
273   return 0;
274 }
275 
276 extern int
onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc,OnigCaseFoldType flag,const OnigUChar * p,const OnigUChar * end,OnigCaseFoldCodeItem items[])277 onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc,
278     OnigCaseFoldType flag, const OnigUChar* p, const OnigUChar* end,
279     OnigCaseFoldCodeItem items[])
280 {
281   int n, m, i, j, k, len;
282   OnigCodePoint code, codes[3];
283   const struct ByUnfoldKey* buk;
284 
285   n = 0;
286 
287   code = ONIGENC_MBC_TO_CODE(enc, p, end);
288   len = enclen(enc, p);
289 
290 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
291   if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
292     if (code == 0x0049) {
293       items[0].byte_len = len;
294       items[0].code_len = 1;
295       items[0].code[0]  = 0x0131;
296       return 1;
297     }
298     else if (code == 0x0130) {
299       items[0].byte_len = len;
300       items[0].code_len = 1;
301       items[0].code[0]  = 0x0069;
302       return 1;
303     }
304     else if (code == 0x0131) {
305       items[0].byte_len = len;
306       items[0].code_len = 1;
307       items[0].code[0]  = 0x0049;
308       return 1;
309     }
310     else if (code == 0x0069) {
311       items[0].byte_len = len;
312       items[0].code_len = 1;
313       items[0].code[0]  = 0x0130;
314       return 1;
315     }
316   }
317 #endif
318 
319   buk = onigenc_unicode_unfold_key(code);
320   if (buk != 0) {
321     if (buk->fold_len == 1) {
322       int un;
323       items[0].byte_len = len;
324       items[0].code_len = 1;
325       items[0].code[0]  = *FOLDS1_FOLD(buk->index);
326       n++;
327 
328       un = FOLDS1_UNFOLDS_NUM(buk->index);
329       for (i = 0; i < un; i++) {
330         OnigCodePoint unfold = FOLDS1_UNFOLDS(buk->index)[i];
331         if (unfold != code) {
332           items[n].byte_len = len;
333           items[n].code_len = 1;
334           items[n].code[0]  = unfold;
335           n++;
336         }
337       }
338       code = items[0].code[0]; /* for multi-code to unfold search. */
339     }
340     else if ((flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) {
341       OnigCodePoint cs[3][4];
342       int fn, ncs[3];
343 
344       if (buk->fold_len == 2) {
345         m = FOLDS2_UNFOLDS_NUM(buk->index);
346         for (i = 0; i < m; i++) {
347           OnigCodePoint unfold = FOLDS2_UNFOLDS(buk->index)[i];
348           if (unfold == code) continue;
349 
350           items[n].byte_len = len;
351           items[n].code_len = 1;
352           items[n].code[0]  = unfold;
353           n++;
354         }
355 
356         for (fn = 0; fn < 2; fn++) {
357           int index;
358           cs[fn][0] = FOLDS2_FOLD(buk->index)[fn];
359           ncs[fn] = 1;
360           index = onigenc_unicode_fold1_key(&cs[fn][0]);
361           if (index >= 0) {
362             int m = FOLDS1_UNFOLDS_NUM(index);
363             for (i = 0; i < m; i++) {
364               cs[fn][i+1] = FOLDS1_UNFOLDS(index)[i];
365             }
366             ncs[fn] += m;
367           }
368         }
369 
370         for (i = 0; i < ncs[0]; i++) {
371           for (j = 0; j < ncs[1]; j++) {
372             items[n].byte_len = len;
373             items[n].code_len = 2;
374             items[n].code[0]  = cs[0][i];
375             items[n].code[1]  = cs[1][j];
376             n++;
377           }
378         }
379       }
380       else { /* fold_len == 3 */
381         m = FOLDS3_UNFOLDS_NUM(buk->index);
382         for (i = 0; i < m; i++) {
383           OnigCodePoint unfold = FOLDS3_UNFOLDS(buk->index)[i];
384           if (unfold == code) continue;
385 
386           items[n].byte_len = len;
387           items[n].code_len = 1;
388           items[n].code[0]  = unfold;
389           n++;
390         }
391 
392         for (fn = 0; fn < 3; fn++) {
393           int index;
394           cs[fn][0] = FOLDS3_FOLD(buk->index)[fn];
395           ncs[fn] = 1;
396           index = onigenc_unicode_fold1_key(&cs[fn][0]);
397           if (index >= 0) {
398             int m = FOLDS1_UNFOLDS_NUM(index);
399             for (i = 0; i < m; i++) {
400               cs[fn][i+1] = FOLDS1_UNFOLDS(index)[i];
401             }
402             ncs[fn] += m;
403           }
404         }
405 
406         for (i = 0; i < ncs[0]; i++) {
407           for (j = 0; j < ncs[1]; j++) {
408             for (k = 0; k < ncs[2]; k++) {
409               items[n].byte_len = len;
410               items[n].code_len = 3;
411               items[n].code[0]  = cs[0][i];
412               items[n].code[1]  = cs[1][j];
413               items[n].code[2]  = cs[2][k];
414               n++;
415             }
416           }
417         }
418       }
419 
420       /* multi char folded code is not head of another folded multi char */
421       return n;
422     }
423   }
424   else {
425     int index = onigenc_unicode_fold1_key(&code);
426     if (index >= 0) {
427       int m = FOLDS1_UNFOLDS_NUM(index);
428       for (i = 0; i < m; i++) {
429         items[n].byte_len = len;
430         items[n].code_len = 1;
431         items[n].code[0]  = FOLDS1_UNFOLDS(index)[i];
432         n++;
433       }
434     }
435   }
436 
437   if ((flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) == 0)
438     return n;
439 
440   p += len;
441   if (p < end) {
442     int clen;
443     int index;
444 
445     codes[0] = code;
446     code = ONIGENC_MBC_TO_CODE(enc, p, end);
447 
448     buk = onigenc_unicode_unfold_key(code);
449     if (buk != 0 && buk->fold_len == 1) {
450       codes[1] = *FOLDS1_FOLD(buk->index);
451     }
452     else
453       codes[1] = code;
454 
455     clen = enclen(enc, p);
456     len += clen;
457 
458     index = onigenc_unicode_fold2_key(codes);
459     if (index >= 0) {
460       m = FOLDS2_UNFOLDS_NUM(index);
461       for (i = 0; i < m; i++) {
462         items[n].byte_len = len;
463         items[n].code_len = 1;
464         items[n].code[0]  = FOLDS2_UNFOLDS(index)[i];
465         n++;
466       }
467     }
468 
469     p += clen;
470     if (p < end) {
471       code = ONIGENC_MBC_TO_CODE(enc, p, end);
472       buk = onigenc_unicode_unfold_key(code);
473       if (buk != 0 && buk->fold_len == 1) {
474         codes[2] = *FOLDS1_FOLD(buk->index);
475       }
476       else
477         codes[2] = code;
478 
479       clen = enclen(enc, p);
480       len += clen;
481 
482       index = onigenc_unicode_fold3_key(codes);
483       if (index >= 0) {
484         m = FOLDS3_UNFOLDS_NUM(index);
485         for (i = 0; i < m; i++) {
486           items[n].byte_len = len;
487           items[n].code_len = 1;
488           items[n].code[0]  = FOLDS3_UNFOLDS(index)[i];
489           n++;
490         }
491       }
492     }
493   }
494 
495   return n;
496 }
497 
498 #ifdef USE_UNICODE_PROPERTIES
499 #include "unicode_property_data.c"
500 #else
501 #include "unicode_property_data_posix.c"
502 #endif
503 
504 
505 #ifdef USE_UNICODE_WORD_BREAK
506 
/*
 * Word-break classes used by the WB rules further down.
 * WB_Any (0) is the value wb_get_type() yields for code points that fall
 * in no listed range.
 */
enum WB_TYPE {
  WB_Any                = 0,
  WB_ALetter            = 1,
  WB_CR                 = 2,
  WB_Double_Quote       = 3,
  WB_Extend             = 4,
  WB_ExtendNumLet       = 5,
  WB_Format             = 6,
  WB_Hebrew_Letter      = 7,
  WB_Katakana           = 8,
  WB_LF                 = 9,
  WB_MidLetter          = 10,
  WB_MidNum             = 11,
  WB_MidNumLet          = 12,
  WB_Newline            = 13,
  WB_Numeric            = 14,
  WB_Regional_Indicator = 15,
  WB_Single_Quote       = 16,
  WB_WSegSpace          = 17,
  WB_ZWJ                = 18,
};
528 
529 typedef struct {
530   OnigCodePoint start;
531   OnigCodePoint end;
532   enum WB_TYPE  type;
533 } WB_RANGE_TYPE;
534 
535 #include "unicode_wb_data.c"
536 
537 static enum WB_TYPE
wb_get_type(OnigCodePoint code)538 wb_get_type(OnigCodePoint code)
539 {
540   OnigCodePoint low, high, x;
541   enum WB_TYPE type;
542 
543   for (low = 0, high = (OnigCodePoint )WB_RANGE_NUM; low < high; ) {
544     x = (low + high) >> 1;
545     if (code > WB_RANGES[x].end)
546       low = x + 1;
547     else
548       high = x;
549   }
550 
551   type = (low < (OnigCodePoint )WB_RANGE_NUM &&
552           code >= WB_RANGES[low].start) ?
553     WB_RANGES[low].type : WB_Any;
554 
555   return type;
556 }
557 
558 #define IS_WB_IGNORE_TAIL(t)  ((t) == WB_Extend || (t) == WB_Format || (t) == WB_ZWJ)
559 #define IS_WB_AHLetter(t)     ((t) == WB_ALetter || (t) == WB_Hebrew_Letter)
560 #define IS_WB_MidNumLetQ(t)   ((t) == WB_MidNumLet || (t) == WB_Single_Quote)
561 
562 static int
wb_get_next_main_code(OnigEncoding enc,UChar * p,const UChar * end,OnigCodePoint * rcode,enum WB_TYPE * rtype)563 wb_get_next_main_code(OnigEncoding enc, UChar* p, const UChar* end,
564                       OnigCodePoint* rcode, enum WB_TYPE* rtype)
565 {
566   OnigCodePoint code;
567   enum WB_TYPE type;
568 
569   while (TRUE) {
570     p += enclen(enc, p);
571     if (p >= end) break;
572 
573     code = ONIGENC_MBC_TO_CODE(enc, p, end);
574     type = wb_get_type(code);
575     if (! IS_WB_IGNORE_TAIL(type)) {
576       *rcode = code;
577       *rtype = type;
578       return 1;
579     }
580   }
581 
582   return 0;
583 }
584 
585 extern int
onigenc_wb_is_break_position(OnigEncoding enc,UChar * p,UChar * prev,const UChar * start,const UChar * end)586 onigenc_wb_is_break_position(OnigEncoding enc, UChar* p, UChar* prev,
587                              const UChar* start, const UChar* end)
588 {
589   int r;
590   UChar* pp;
591   OnigCodePoint cfrom;
592   OnigCodePoint cfrom2;
593   OnigCodePoint cto;
594   OnigCodePoint cto2;
595   enum WB_TYPE from;
596   enum WB_TYPE from2;
597   enum WB_TYPE to;
598   enum WB_TYPE to2;
599 
600   /* WB1: sot / Any */
601   if (p == start) return TRUE;
602   /* WB2: Any / eot */
603   if (p == end)   return TRUE;
604 
605   if (IS_NULL(prev)) {
606     prev = onigenc_get_prev_char_head(enc, start, p);
607     if (IS_NULL(prev)) return TRUE;
608   }
609 
610   cfrom = ONIGENC_MBC_TO_CODE(enc, prev, end);
611   cto   = ONIGENC_MBC_TO_CODE(enc, p, end);
612 
613   from = wb_get_type(cfrom);
614   to   = wb_get_type(cto);
615 
616   /* short cut */
617   if (from == 0 && to == 0) goto WB999;
618 
619   /* WB3: CR + LF */
620   if (from == WB_CR && to == WB_LF) return FALSE;
621 
622   /* WB3a: (Newline|CR|LF) /  */
623   if (from == WB_Newline || from == WB_CR || from == WB_LF) return TRUE;
624   /* WB3b: / (Newline|CR|LF) */
625   if (to == WB_Newline || to == WB_CR || to == WB_LF) return TRUE;
626 
627   /* WB3c: ZWJ + {Extended_Pictographic} */
628   if (from == WB_ZWJ) {
629     if (onigenc_unicode_is_code_ctype(cto, PROP_INDEX_EXTENDEDPICTOGRAPHIC))
630       return FALSE;
631   }
632 
633   /* WB3d: WSegSpace + WSegSpace */
634   if (from == WB_WSegSpace && to == WB_WSegSpace) return FALSE;
635 
636   /* WB4:  X (Extend|Format|ZWJ)* -> X */
637   if (IS_WB_IGNORE_TAIL(to)) return FALSE;
638   if (IS_WB_IGNORE_TAIL(from)) {
639     while ((pp = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {
640       prev = pp;
641       cfrom = ONIGENC_MBC_TO_CODE(enc, prev, end);
642       from = wb_get_type(cfrom);
643       if (! IS_WB_IGNORE_TAIL(from))
644         break;
645     }
646   }
647 
648   if (IS_WB_AHLetter(from)) {
649     /* WB5: AHLetter + AHLetter */
650     if (IS_WB_AHLetter(to)) return FALSE;
651 
652     /* WB6: AHLetter + (MidLetter | MidNumLetQ) AHLetter */
653     if (to == WB_MidLetter || IS_WB_MidNumLetQ(to)) {
654       r = wb_get_next_main_code(enc, p, end, &cto2, &to2);
655       if (r == 1) {
656         if (IS_WB_AHLetter(to2)) return FALSE;
657       }
658     }
659   }
660 
661   /* WB7: AHLetter (MidLetter | MidNumLetQ) + AHLetter */
662   if (from == WB_MidLetter || IS_WB_MidNumLetQ(from)) {
663     if (IS_WB_AHLetter(to)) {
664       from2 = WB_Any;
665       while ((pp = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {
666         prev = pp;
667         cfrom2 = ONIGENC_MBC_TO_CODE(enc, prev, end);
668         from2 = wb_get_type(cfrom2);
669         if (! IS_WB_IGNORE_TAIL(from2))
670           break;
671       }
672 
673       if (IS_WB_AHLetter(from2)) return FALSE;
674     }
675   }
676 
677   if (from == WB_Hebrew_Letter) {
678     /* WB7a: Hebrew_Letter + Single_Quote */
679     if (to == WB_Single_Quote) return FALSE;
680 
681     /* WB7b: Hebrew_Letter + Double_Quote Hebrew_Letter */
682     if (to == WB_Double_Quote) {
683       r = wb_get_next_main_code(enc, p, end, &cto2, &to2);
684       if (r == 1) {
685         if (to2 == WB_Hebrew_Letter) return FALSE;
686       }
687     }
688   }
689 
690   /* WB7c: Hebrew_Letter Double_Quote + Hebrew_Letter */
691   if (from == WB_Double_Quote) {
692     if (to == WB_Hebrew_Letter) {
693       from2 = WB_Any;
694       while ((pp = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {
695         prev = pp;
696         cfrom2 = ONIGENC_MBC_TO_CODE(enc, prev, end);
697         from2 = wb_get_type(cfrom2);
698         if (! IS_WB_IGNORE_TAIL(from2))
699           break;
700       }
701 
702       if (from2 == WB_Hebrew_Letter) return FALSE;
703     }
704   }
705 
706   if (to == WB_Numeric) {
707     /* WB8: Numeric + Numeric */
708     if (from == WB_Numeric) return FALSE;
709 
710     /* WB9: AHLetter + Numeric */
711     if (IS_WB_AHLetter(from)) return FALSE;
712 
713     /* WB11: Numeric (MidNum | MidNumLetQ) + Numeric */
714     if (from == WB_MidNum || IS_WB_MidNumLetQ(from)) {
715       from2 = WB_Any;
716       while ((pp = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {
717         prev = pp;
718         cfrom2 = ONIGENC_MBC_TO_CODE(enc, prev, end);
719         from2 = wb_get_type(cfrom2);
720         if (! IS_WB_IGNORE_TAIL(from2))
721           break;
722       }
723 
724       if (from2 == WB_Numeric) return FALSE;
725     }
726   }
727 
728   if (from == WB_Numeric) {
729     /* WB10: Numeric + AHLetter */
730     if (IS_WB_AHLetter(to)) return FALSE;
731 
732     /* WB12: Numeric + (MidNum | MidNumLetQ) Numeric */
733     if (to == WB_MidNum || IS_WB_MidNumLetQ(to)) {
734       r = wb_get_next_main_code(enc, p, end, &cto2, &to2);
735       if (r == 1) {
736         if (to2 == WB_Numeric) return FALSE;
737       }
738     }
739   }
740 
741   /* WB13: Katakana + Katakana */
742   if (from == WB_Katakana && to == WB_Katakana) return FALSE;
743 
744   /* WB13a: (AHLetter | Numeric | Katakana | ExtendNumLet) + ExtendNumLet */
745   if (IS_WB_AHLetter(from) || from == WB_Numeric || from == WB_Katakana
746       || from == WB_ExtendNumLet) {
747     if (to == WB_ExtendNumLet) return FALSE;
748   }
749 
750   /* WB13b: ExtendNumLet + (AHLetter | Numeric | Katakana) */
751   if (from == WB_ExtendNumLet) {
752     if (IS_WB_AHLetter(to) || to == WB_Numeric || to == WB_Katakana)
753       return FALSE;
754   }
755 
756 
757   /* WB15:   sot (RI RI)* RI + RI */
758   /* WB16: [^RI] (RI RI)* RI + RI */
759   if (from == WB_Regional_Indicator && to == WB_Regional_Indicator) {
760     int n = 0;
761     while ((prev = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {
762       cfrom2 = ONIGENC_MBC_TO_CODE(enc, prev, end);
763       from2  = wb_get_type(cfrom2);
764       if (from2 != WB_Regional_Indicator)
765         break;
766 
767       n++;
768     }
769     if ((n % 2) == 0) return FALSE;
770   }
771 
772  WB999:
773   /* WB999: Any / Any */
774   return TRUE;
775 }
776 
777 #endif /* USE_UNICODE_WORD_BREAK */
778 
779 
780 #ifdef USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER
781 
/*
 * Result of the two-code-point grapheme-cluster check.
 * The two UNDEF values mean the pair alone cannot settle the question;
 * onigenc_egcb_is_break_position() resolves them by scanning backwards.
 */
enum EGCB_BREAK_TYPE {
  EGCB_NOT_BREAK         = 0,  /* no boundary between the pair        */
  EGCB_BREAK             = 1,  /* boundary between the pair           */
  EGCB_BREAK_UNDEF_GB11  = 2,  /* ZWJ + Extended_Pictographic (GB11)  */
  EGCB_BREAK_UNDEF_RI_RI = 3   /* two Regional_Indicators (GB12/GB13) */
};
788 
/*
 * Grapheme-cluster-break classes.
 * The numeric order matters: IS_CONTROL_CR_LF() relies on CR..Control
 * being the contiguous values 1..3, and IS_HANGUL() on every Hangul class
 * being >= EGCB_L.  Values 9..12 are left unused (obsoleted classes).
 */
enum EGCB_TYPE {
  EGCB_Other              = 0,
  EGCB_CR                 = 1,
  EGCB_LF                 = 2,
  EGCB_Control            = 3,
  EGCB_Extend             = 4,
  EGCB_Prepend            = 5,
  EGCB_Regional_Indicator = 6,
  EGCB_SpacingMark        = 7,
  EGCB_ZWJ                = 8,
#if 0
  /* obsoleted */
  EGCB_E_Base             = 9,
  EGCB_E_Base_GAZ         = 10,
  EGCB_E_Modifier         = 11,
  EGCB_Glue_After_Zwj     = 12,
#endif
  EGCB_L                  = 13,
  EGCB_LV                 = 14,
  EGCB_LVT                = 15,
  EGCB_T                  = 16,
  EGCB_V                  = 17
};
812 
813 typedef struct {
814   OnigCodePoint  start;
815   OnigCodePoint  end;
816   enum EGCB_TYPE type;
817 } EGCB_RANGE_TYPE;
818 
819 #include "unicode_egcb_data.c"
820 
821 static enum EGCB_TYPE
egcb_get_type(OnigCodePoint code)822 egcb_get_type(OnigCodePoint code)
823 {
824   OnigCodePoint low, high, x;
825   enum EGCB_TYPE type;
826 
827   for (low = 0, high = (OnigCodePoint )EGCB_RANGE_NUM; low < high; ) {
828     x = (low + high) >> 1;
829     if (code > EGCB_RANGES[x].end)
830       low = x + 1;
831     else
832       high = x;
833   }
834 
835   type = (low < (OnigCodePoint )EGCB_RANGE_NUM &&
836           code >= EGCB_RANGES[low].start) ?
837     EGCB_RANGES[low].type : EGCB_Other;
838 
839   return type;
840 }
841 
842 #define IS_CONTROL_CR_LF(code)   ((code) <= EGCB_Control && (code) >= EGCB_CR)
843 #define IS_HANGUL(code)          ((code) >= EGCB_L)
844 
845 /* GB1 and GB2 are outside of this function. */
846 static enum EGCB_BREAK_TYPE
unicode_egcb_is_break_2code(OnigCodePoint from_code,OnigCodePoint to_code)847 unicode_egcb_is_break_2code(OnigCodePoint from_code, OnigCodePoint to_code)
848 {
849   enum EGCB_TYPE from;
850   enum EGCB_TYPE to;
851 
852   from = egcb_get_type(from_code);
853   to   = egcb_get_type(to_code);
854 
855   /* short cut */
856   if (from == 0 && to == 0) goto GB999;
857 
858   /* GB3 */
859   if (from == EGCB_CR && to == EGCB_LF) return EGCB_NOT_BREAK;
860   /* GB4 */
861   if (IS_CONTROL_CR_LF(from)) return EGCB_BREAK;
862   /* GB5 */
863   if (IS_CONTROL_CR_LF(to)) return EGCB_BREAK;
864 
865   if (IS_HANGUL(from) && IS_HANGUL(to)) {
866     /* GB6 */
867     if (from == EGCB_L && to != EGCB_T) return EGCB_NOT_BREAK;
868     /* GB7 */
869     if ((from == EGCB_LV || from == EGCB_V)
870         && (to == EGCB_V || to == EGCB_T)) return EGCB_NOT_BREAK;
871 
872     /* GB8 */
873     if ((to == EGCB_T) && (from == EGCB_LVT || from == EGCB_T))
874       return EGCB_NOT_BREAK;
875 
876     goto GB999;
877   }
878 
879   /* GB9 */
880   if (to == EGCB_Extend || to == EGCB_ZWJ) return EGCB_NOT_BREAK;
881 
882   /* GB9a */
883   if (to == EGCB_SpacingMark) return EGCB_NOT_BREAK;
884   /* GB9b */
885   if (from == EGCB_Prepend) return EGCB_NOT_BREAK;
886 
887   /* GB10 removed */
888 
889   /* GB11 */
890   if (from == EGCB_ZWJ) {
891     if (onigenc_unicode_is_code_ctype(to_code, PROP_INDEX_EXTENDEDPICTOGRAPHIC))
892       return EGCB_BREAK_UNDEF_GB11;
893 
894     goto GB999;
895   }
896 
897   /* GB12, GB13 */
898   if (from == EGCB_Regional_Indicator && to == EGCB_Regional_Indicator) {
899     return EGCB_BREAK_UNDEF_RI_RI;
900   }
901 
902  GB999:
903   return EGCB_BREAK;
904 }
905 
906 #endif /* USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER */
907 
908 extern int
onigenc_egcb_is_break_position(OnigEncoding enc,UChar * p,UChar * prev,const UChar * start,const UChar * end)909 onigenc_egcb_is_break_position(OnigEncoding enc, UChar* p, UChar* prev,
910                                const UChar* start, const UChar* end)
911 {
912   OnigCodePoint from;
913   OnigCodePoint to;
914 #ifdef USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER
915   enum EGCB_BREAK_TYPE btype;
916   enum EGCB_TYPE type;
917 #endif
918 
919   /* GB1 and GB2 */
920   if (p == start) return 1;
921   if (p == end)   return 1;
922 
923   if (IS_NULL(prev)) {
924     prev = onigenc_get_prev_char_head(enc, start, p);
925     if (IS_NULL(prev)) return 1;
926   }
927 
928   from = ONIGENC_MBC_TO_CODE(enc, prev, end);
929   to   = ONIGENC_MBC_TO_CODE(enc, p, end);
930 
931 #ifdef USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER
932   if (! ONIGENC_IS_UNICODE_ENCODING(enc)) {
933     return from != 0x000d || to != 0x000a;
934   }
935 
936   btype = unicode_egcb_is_break_2code(from, to);
937   switch (btype) {
938   case EGCB_NOT_BREAK:
939     return 0;
940     break;
941   case EGCB_BREAK:
942     return 1;
943     break;
944 
945   case EGCB_BREAK_UNDEF_GB11:
946     while ((prev = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {
947       from = ONIGENC_MBC_TO_CODE(enc, prev, end);
948       if (onigenc_unicode_is_code_ctype(from, PROP_INDEX_EXTENDEDPICTOGRAPHIC))
949         return 0;
950 
951       type = egcb_get_type(from);
952       if (type != EGCB_Extend)
953         break;
954     }
955     break;
956 
957   case EGCB_BREAK_UNDEF_RI_RI:
958     {
959       int n = 0;
960       while ((prev = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {
961         from = ONIGENC_MBC_TO_CODE(enc, prev, end);
962         type = egcb_get_type(from);
963         if (type != EGCB_Regional_Indicator)
964           break;
965 
966         n++;
967       }
968       if ((n % 2) == 0) return 0;
969     }
970     break;
971   }
972 
973   return 1;
974 
975 #else
976   return from != 0x000d || to != 0x000a;
977 #endif /* USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER */
978 }
979 
980 
981 #define USER_DEFINED_PROPERTY_MAX_NUM  20
982 
983 typedef struct {
984   int ctype;
985   OnigCodePoint* ranges;
986 } UserDefinedPropertyValue;
987 
988 static int UserDefinedPropertyNum;
989 static UserDefinedPropertyValue
990 UserDefinedPropertyRanges[USER_DEFINED_PROPERTY_MAX_NUM];
991 static st_table* UserDefinedPropertyTable;
992 
993 extern int
onig_unicode_define_user_property(const char * name,OnigCodePoint * ranges)994 onig_unicode_define_user_property(const char* name, OnigCodePoint* ranges)
995 {
996   UserDefinedPropertyValue* e;
997   int r;
998   int i;
999   int n;
1000   int len;
1001   int c;
1002   char* s;
1003   UChar* uname;
1004 
1005   if (UserDefinedPropertyNum >= USER_DEFINED_PROPERTY_MAX_NUM)
1006     return ONIGERR_TOO_MANY_USER_DEFINED_OBJECTS;
1007 
1008   len = (int )strlen(name);
1009   if (len >= PROPERTY_NAME_MAX_SIZE)
1010     return ONIGERR_TOO_LONG_PROPERTY_NAME;
1011 
1012   s = (char* )xmalloc(len + 1);
1013   if (s == 0)
1014     return ONIGERR_MEMORY;
1015 
1016   uname = (UChar* )name;
1017   n = 0;
1018   for (i = 0; i < len; i++) {
1019     c = uname[i];
1020     if (c < 0x20 || c >= 0x80) {
1021       xfree(s);
1022       return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
1023     }
1024 
1025     if (c != ' ' && c != '-' && c != '_') {
1026       s[n] = c;
1027       n++;
1028     }
1029   }
1030   s[n] = '\0';
1031 
1032   if (UserDefinedPropertyTable == 0) {
1033     UserDefinedPropertyTable = onig_st_init_strend_table_with_size(10);
1034     if (IS_NULL(UserDefinedPropertyTable)) {
1035       xfree(s);
1036       return ONIGERR_MEMORY;
1037     }
1038   }
1039 
1040   e = UserDefinedPropertyRanges + UserDefinedPropertyNum;
1041   e->ctype = CODE_RANGES_NUM + UserDefinedPropertyNum;
1042   e->ranges = ranges;
1043   r = onig_st_insert_strend(UserDefinedPropertyTable,
1044                             (const UChar* )s, (const UChar* )s + n,
1045                             (hash_data_type )((void* )e));
1046   if (r < 0) return r;
1047 
1048   UserDefinedPropertyNum++;
1049   return 0;
1050 }
1051 
1052 extern int
onigenc_unicode_is_code_ctype(OnigCodePoint code,unsigned int ctype)1053 onigenc_unicode_is_code_ctype(OnigCodePoint code, unsigned int ctype)
1054 {
1055   if (
1056 #ifdef USE_UNICODE_PROPERTIES
1057       ctype <= ONIGENC_MAX_STD_CTYPE &&
1058 #endif
1059       code < 256) {
1060     return ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE(code, ctype);
1061   }
1062 
1063   if (ctype >= CODE_RANGES_NUM) {
1064     int index = ctype - CODE_RANGES_NUM;
1065     if (index < UserDefinedPropertyNum)
1066       return onig_is_in_code_range((UChar* )UserDefinedPropertyRanges[index].ranges, code);
1067     else
1068       return ONIGERR_TYPE_BUG;
1069   }
1070 
1071   return onig_is_in_code_range((UChar* )CodeRanges[ctype], code);
1072 }
1073 
1074 
1075 extern int
onigenc_unicode_ctype_code_range(OnigCtype ctype,const OnigCodePoint * ranges[])1076 onigenc_unicode_ctype_code_range(OnigCtype ctype, const OnigCodePoint* ranges[])
1077 {
1078   if (ctype >= CODE_RANGES_NUM) {
1079     int index = ctype - CODE_RANGES_NUM;
1080     if (index < UserDefinedPropertyNum) {
1081       *ranges = UserDefinedPropertyRanges[index].ranges;
1082       return 0;
1083     }
1084     else
1085       return ONIGERR_TYPE_BUG;
1086   }
1087 
1088   *ranges = CodeRanges[ctype];
1089   return 0;
1090 }
1091 
1092 extern int
onigenc_utf16_32_get_ctype_code_range(OnigCtype ctype,OnigCodePoint * sb_out,const OnigCodePoint * ranges[])1093 onigenc_utf16_32_get_ctype_code_range(OnigCtype ctype, OnigCodePoint* sb_out,
1094                                       const OnigCodePoint* ranges[])
1095 {
1096   *sb_out = 0x00;
1097   return onigenc_unicode_ctype_code_range(ctype, ranges);
1098 }
1099 
1100 extern int
onigenc_unicode_property_name_to_ctype(OnigEncoding enc,UChar * name,UChar * end)1101 onigenc_unicode_property_name_to_ctype(OnigEncoding enc, UChar* name, UChar* end)
1102 {
1103   int len;
1104   UChar *p;
1105   OnigCodePoint code;
1106   const struct PoolPropertyNameCtype* pc;
1107   char buf[PROPERTY_NAME_MAX_SIZE];
1108 
1109   p = name;
1110   len = 0;
1111   while (p < end) {
1112     code = ONIGENC_MBC_TO_CODE(enc, p, end);
1113     if (code >= 0x80)
1114       return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
1115 
1116     if (code != ' ' && code != '-' && code != '_') {
1117       buf[len++] = (char )code;
1118       if (len >= PROPERTY_NAME_MAX_SIZE)
1119         return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
1120     }
1121 
1122     p += enclen(enc, p);
1123   }
1124 
1125   buf[len] = 0;
1126 
1127   if (UserDefinedPropertyTable != 0) {
1128     UserDefinedPropertyValue* e;
1129     e = (UserDefinedPropertyValue* )NULL;
1130     onig_st_lookup_strend(UserDefinedPropertyTable,
1131                           (const UChar* )buf, (const UChar* )buf + len,
1132                           (hash_data_type* )((void* )(&e)));
1133     if (e != 0) {
1134       return e->ctype;
1135     }
1136   }
1137 
1138   pc = unicode_lookup_property_name(buf, len);
1139   if (pc != 0) {
1140     /* fprintf(stderr, "LOOKUP: %s: %d\n", buf, pc->ctype); */
1141 #ifndef USE_UNICODE_PROPERTIES
1142     if (pc->ctype > ONIGENC_MAX_STD_CTYPE)
1143       return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
1144 #endif
1145 
1146     return (int )pc->ctype;
1147   }
1148 
1149   return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
1150 }
1151