1 /**********************************************************************
2 unicode.c - Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5 * Copyright (c) 2002-2019 K.Kosako
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30 #include "regint.h"
31
/*
 * Element type returned (by pointer) from unicode_lookup_property_name(),
 * which onigenc_unicode_property_name_to_ctype() in this file calls; that
 * caller reads only the `ctype` member.
 */
struct PoolPropertyNameCtype {
  short int name;   /* presumably an offset into a property-name pool -- TODO confirm */
  short int ctype;  /* ctype value associated with the property name */
};
36
/*
 * Evaluates to 1 when the table entry for `code` has the bit
 * CTYPE_TO_BIT(ctype) set, and to 0 otherwise.
 * `code` is used directly as an index into a 256-entry table, so the caller
 * must guarantee code < 256.
 */
#define ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE(code,ctype) \
  ((EncUNICODE_ISO_8859_1_CtypeTable[code] & CTYPE_TO_BIT(ctype)) != 0)
39
/*
 * Bit-set table for code points 0..255: one unsigned short per code point,
 * eight entries per row.  It is read through
 * ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE(), which masks an entry with
 * CTYPE_TO_BIT(ctype).
 */
static const unsigned short EncUNICODE_ISO_8859_1_CtypeTable[256] = {
  /* 0x00 - 0x1f */
  0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
  0x4008, 0x428c, 0x4289, 0x4288, 0x4288, 0x4288, 0x4008, 0x4008,
  0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
  0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
  /* 0x20 - 0x3f */
  0x4284, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
  0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
  0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0,
  0x78b0, 0x78b0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
  /* 0x40 - 0x5f */
  0x41a0, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x74a2,
  0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
  0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
  0x74a2, 0x74a2, 0x74a2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x51a0,
  /* 0x60 - 0x7f */
  0x41a0, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x70e2,
  0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
  0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
  0x70e2, 0x70e2, 0x70e2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x4008,
  /* 0x80 - 0x9f */
  0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0288, 0x0008, 0x0008,
  0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
  0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
  0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
  /* 0xa0 - 0xbf */
  0x0284, 0x01a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0,
  0x00a0, 0x00a0, 0x30e2, 0x01a0, 0x00a0, 0x00a8, 0x00a0, 0x00a0,
  0x00a0, 0x00a0, 0x10a0, 0x10a0, 0x00a0, 0x30e2, 0x00a0, 0x01a0,
  0x00a0, 0x10a0, 0x30e2, 0x01a0, 0x10a0, 0x10a0, 0x10a0, 0x01a0,
  /* 0xc0 - 0xdf */
  0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2,
  0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2,
  0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x00a0,
  0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x30e2,
  /* 0xe0 - 0xff */
  0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2,
  0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2,
  0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x00a0,
  0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2
};
74
75 #include "st.h"
76
77 #include "unicode_fold_data.c"
78
79 extern int
onigenc_unicode_mbc_case_fold(OnigEncoding enc,OnigCaseFoldType flag ARG_UNUSED,const UChar ** pp,const UChar * end,UChar * fold)80 onigenc_unicode_mbc_case_fold(OnigEncoding enc,
81 OnigCaseFoldType flag ARG_UNUSED, const UChar** pp, const UChar* end,
82 UChar* fold)
83 {
84 const struct ByUnfoldKey* buk;
85
86 OnigCodePoint code;
87 int i, len, rlen;
88 const UChar *p = *pp;
89
90 code = ONIGENC_MBC_TO_CODE(enc, p, end);
91 len = enclen(enc, p);
92 *pp += len;
93
94 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
95 if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
96 if (code == 0x0130) {
97 return ONIGENC_CODE_TO_MBC(enc, 0x0069, fold);
98 }
99 #if 0
100 if (code == 0x0049) {
101 return ONIGENC_CODE_TO_MBC(enc, 0x0131, fold);
102 }
103 #endif
104 }
105 #endif
106
107 buk = onigenc_unicode_unfold_key(code);
108 if (buk != 0) {
109 if (buk->fold_len == 1) {
110 return ONIGENC_CODE_TO_MBC(enc, *FOLDS1_FOLD(buk->index), fold);
111 }
112 else {
113 OnigCodePoint* addr;
114
115 FOLDS_FOLD_ADDR_BUK(buk, addr);
116 rlen = 0;
117 for (i = 0; i < buk->fold_len; i++) {
118 OnigCodePoint c = addr[i];
119 len = ONIGENC_CODE_TO_MBC(enc, c, fold);
120 fold += len;
121 rlen += len;
122 }
123 return rlen;
124 }
125 }
126
127 for (i = 0; i < len; i++) {
128 *fold++ = *p++;
129 }
130 return len;
131 }
132
133 static int
apply_case_fold1(int from,int to,OnigApplyAllCaseFoldFunc f,void * arg)134 apply_case_fold1(int from, int to, OnigApplyAllCaseFoldFunc f, void* arg)
135 {
136 int i, j, k, n, r;
137
138 for (i = from; i < to; ) {
139 OnigCodePoint fold = *FOLDS1_FOLD(i);
140 n = FOLDS1_UNFOLDS_NUM(i);
141 for (j = 0; j < n; j++) {
142 OnigCodePoint unfold = FOLDS1_UNFOLDS(i)[j];
143
144 r = (*f)(fold, &unfold, 1, arg);
145 if (r != 0) return r;
146 r = (*f)(unfold, &fold, 1, arg);
147 if (r != 0) return r;
148
149 for (k = 0; k < j; k++) {
150 OnigCodePoint unfold2 = FOLDS1_UNFOLDS(i)[k];
151 r = (*f)(unfold, &unfold2, 1, arg);
152 if (r != 0) return r;
153 r = (*f)(unfold2, &unfold, 1, arg);
154 if (r != 0) return r;
155 }
156 }
157
158 i = FOLDS1_NEXT_INDEX(i);
159 }
160
161 return 0;
162 }
163
164 static int
apply_case_fold2(int from,int to,OnigApplyAllCaseFoldFunc f,void * arg)165 apply_case_fold2(int from, int to, OnigApplyAllCaseFoldFunc f, void* arg)
166 {
167 int i, j, k, n, r;
168
169 for (i = from; i < to; ) {
170 OnigCodePoint* fold = FOLDS2_FOLD(i);
171 n = FOLDS2_UNFOLDS_NUM(i);
172 for (j = 0; j < n; j++) {
173 OnigCodePoint unfold = FOLDS2_UNFOLDS(i)[j];
174
175 r = (*f)(unfold, fold, 2, arg);
176 if (r != 0) return r;
177
178 for (k = 0; k < j; k++) {
179 OnigCodePoint unfold2 = FOLDS2_UNFOLDS(i)[k];
180 r = (*f)(unfold, &unfold2, 1, arg);
181 if (r != 0) return r;
182 r = (*f)(unfold2, &unfold, 1, arg);
183 if (r != 0) return r;
184 }
185 }
186
187 i = FOLDS2_NEXT_INDEX(i);
188 }
189
190 return 0;
191 }
192
193 static int
apply_case_fold3(int from,int to,OnigApplyAllCaseFoldFunc f,void * arg)194 apply_case_fold3(int from, int to, OnigApplyAllCaseFoldFunc f, void* arg)
195 {
196 int i, j, k, n, r;
197
198 for (i = from; i < to; ) {
199 OnigCodePoint* fold = FOLDS3_FOLD(i);
200 n = FOLDS3_UNFOLDS_NUM(i);
201 for (j = 0; j < n; j++) {
202 OnigCodePoint unfold = FOLDS3_UNFOLDS(i)[j];
203
204 r = (*f)(unfold, fold, 3, arg);
205 if (r != 0) return r;
206
207 for (k = 0; k < j; k++) {
208 OnigCodePoint unfold2 = FOLDS3_UNFOLDS(i)[k];
209 r = (*f)(unfold, &unfold2, 1, arg);
210 if (r != 0) return r;
211 r = (*f)(unfold2, &unfold, 1, arg);
212 if (r != 0) return r;
213 }
214 }
215
216 i = FOLDS3_NEXT_INDEX(i);
217 }
218
219 return 0;
220 }
221
222 extern int
onigenc_unicode_apply_all_case_fold(OnigCaseFoldType flag,OnigApplyAllCaseFoldFunc f,void * arg)223 onigenc_unicode_apply_all_case_fold(OnigCaseFoldType flag,
224 OnigApplyAllCaseFoldFunc f, void* arg)
225 {
226 int r;
227
228 r = apply_case_fold1(0, FOLDS1_NORMAL_END_INDEX, f, arg);
229 if (r != 0) return r;
230
231 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
232 if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
233 code = 0x0131;
234 r = (*f)(0x0049, &code, 1, arg);
235 if (r != 0) return r;
236 code = 0x0049;
237 r = (*f)(0x0131, &code, 1, arg);
238 if (r != 0) return r;
239
240 code = 0x0130;
241 r = (*f)(0x0069, &code, 1, arg);
242 if (r != 0) return r;
243 code = 0x0069;
244 r = (*f)(0x0130, &code, 1, arg);
245 if (r != 0) return r;
246 }
247 else {
248 #endif
249 r = apply_case_fold1(FOLDS1_NORMAL_END_INDEX, FOLDS1_END_INDEX, f, arg);
250 if (r != 0) return r;
251 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
252 }
253 #endif
254
255 if ((flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) == 0)
256 return 0;
257
258 r = apply_case_fold2(0, FOLDS2_NORMAL_END_INDEX, f, arg);
259 if (r != 0) return r;
260
261 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
262 if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) == 0) {
263 #endif
264 r = apply_case_fold2(FOLDS2_NORMAL_END_INDEX, FOLDS2_END_INDEX, f, arg);
265 if (r != 0) return r;
266 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
267 }
268 #endif
269
270 r = apply_case_fold3(0, FOLDS3_NORMAL_END_INDEX, f, arg);
271 if (r != 0) return r;
272
273 return 0;
274 }
275
276 extern int
onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc,OnigCaseFoldType flag,const OnigUChar * p,const OnigUChar * end,OnigCaseFoldCodeItem items[])277 onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc,
278 OnigCaseFoldType flag, const OnigUChar* p, const OnigUChar* end,
279 OnigCaseFoldCodeItem items[])
280 {
281 int n, m, i, j, k, len;
282 OnigCodePoint code, codes[3];
283 const struct ByUnfoldKey* buk;
284
285 n = 0;
286
287 code = ONIGENC_MBC_TO_CODE(enc, p, end);
288 len = enclen(enc, p);
289
290 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
291 if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
292 if (code == 0x0049) {
293 items[0].byte_len = len;
294 items[0].code_len = 1;
295 items[0].code[0] = 0x0131;
296 return 1;
297 }
298 else if (code == 0x0130) {
299 items[0].byte_len = len;
300 items[0].code_len = 1;
301 items[0].code[0] = 0x0069;
302 return 1;
303 }
304 else if (code == 0x0131) {
305 items[0].byte_len = len;
306 items[0].code_len = 1;
307 items[0].code[0] = 0x0049;
308 return 1;
309 }
310 else if (code == 0x0069) {
311 items[0].byte_len = len;
312 items[0].code_len = 1;
313 items[0].code[0] = 0x0130;
314 return 1;
315 }
316 }
317 #endif
318
319 buk = onigenc_unicode_unfold_key(code);
320 if (buk != 0) {
321 if (buk->fold_len == 1) {
322 int un;
323 items[0].byte_len = len;
324 items[0].code_len = 1;
325 items[0].code[0] = *FOLDS1_FOLD(buk->index);
326 n++;
327
328 un = FOLDS1_UNFOLDS_NUM(buk->index);
329 for (i = 0; i < un; i++) {
330 OnigCodePoint unfold = FOLDS1_UNFOLDS(buk->index)[i];
331 if (unfold != code) {
332 items[n].byte_len = len;
333 items[n].code_len = 1;
334 items[n].code[0] = unfold;
335 n++;
336 }
337 }
338 code = items[0].code[0]; /* for multi-code to unfold search. */
339 }
340 else if ((flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) {
341 OnigCodePoint cs[3][4];
342 int fn, ncs[3];
343
344 if (buk->fold_len == 2) {
345 m = FOLDS2_UNFOLDS_NUM(buk->index);
346 for (i = 0; i < m; i++) {
347 OnigCodePoint unfold = FOLDS2_UNFOLDS(buk->index)[i];
348 if (unfold == code) continue;
349
350 items[n].byte_len = len;
351 items[n].code_len = 1;
352 items[n].code[0] = unfold;
353 n++;
354 }
355
356 for (fn = 0; fn < 2; fn++) {
357 int index;
358 cs[fn][0] = FOLDS2_FOLD(buk->index)[fn];
359 ncs[fn] = 1;
360 index = onigenc_unicode_fold1_key(&cs[fn][0]);
361 if (index >= 0) {
362 int m = FOLDS1_UNFOLDS_NUM(index);
363 for (i = 0; i < m; i++) {
364 cs[fn][i+1] = FOLDS1_UNFOLDS(index)[i];
365 }
366 ncs[fn] += m;
367 }
368 }
369
370 for (i = 0; i < ncs[0]; i++) {
371 for (j = 0; j < ncs[1]; j++) {
372 items[n].byte_len = len;
373 items[n].code_len = 2;
374 items[n].code[0] = cs[0][i];
375 items[n].code[1] = cs[1][j];
376 n++;
377 }
378 }
379 }
380 else { /* fold_len == 3 */
381 m = FOLDS3_UNFOLDS_NUM(buk->index);
382 for (i = 0; i < m; i++) {
383 OnigCodePoint unfold = FOLDS3_UNFOLDS(buk->index)[i];
384 if (unfold == code) continue;
385
386 items[n].byte_len = len;
387 items[n].code_len = 1;
388 items[n].code[0] = unfold;
389 n++;
390 }
391
392 for (fn = 0; fn < 3; fn++) {
393 int index;
394 cs[fn][0] = FOLDS3_FOLD(buk->index)[fn];
395 ncs[fn] = 1;
396 index = onigenc_unicode_fold1_key(&cs[fn][0]);
397 if (index >= 0) {
398 int m = FOLDS1_UNFOLDS_NUM(index);
399 for (i = 0; i < m; i++) {
400 cs[fn][i+1] = FOLDS1_UNFOLDS(index)[i];
401 }
402 ncs[fn] += m;
403 }
404 }
405
406 for (i = 0; i < ncs[0]; i++) {
407 for (j = 0; j < ncs[1]; j++) {
408 for (k = 0; k < ncs[2]; k++) {
409 items[n].byte_len = len;
410 items[n].code_len = 3;
411 items[n].code[0] = cs[0][i];
412 items[n].code[1] = cs[1][j];
413 items[n].code[2] = cs[2][k];
414 n++;
415 }
416 }
417 }
418 }
419
420 /* multi char folded code is not head of another folded multi char */
421 return n;
422 }
423 }
424 else {
425 int index = onigenc_unicode_fold1_key(&code);
426 if (index >= 0) {
427 int m = FOLDS1_UNFOLDS_NUM(index);
428 for (i = 0; i < m; i++) {
429 items[n].byte_len = len;
430 items[n].code_len = 1;
431 items[n].code[0] = FOLDS1_UNFOLDS(index)[i];
432 n++;
433 }
434 }
435 }
436
437 if ((flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) == 0)
438 return n;
439
440 p += len;
441 if (p < end) {
442 int clen;
443 int index;
444
445 codes[0] = code;
446 code = ONIGENC_MBC_TO_CODE(enc, p, end);
447
448 buk = onigenc_unicode_unfold_key(code);
449 if (buk != 0 && buk->fold_len == 1) {
450 codes[1] = *FOLDS1_FOLD(buk->index);
451 }
452 else
453 codes[1] = code;
454
455 clen = enclen(enc, p);
456 len += clen;
457
458 index = onigenc_unicode_fold2_key(codes);
459 if (index >= 0) {
460 m = FOLDS2_UNFOLDS_NUM(index);
461 for (i = 0; i < m; i++) {
462 items[n].byte_len = len;
463 items[n].code_len = 1;
464 items[n].code[0] = FOLDS2_UNFOLDS(index)[i];
465 n++;
466 }
467 }
468
469 p += clen;
470 if (p < end) {
471 code = ONIGENC_MBC_TO_CODE(enc, p, end);
472 buk = onigenc_unicode_unfold_key(code);
473 if (buk != 0 && buk->fold_len == 1) {
474 codes[2] = *FOLDS1_FOLD(buk->index);
475 }
476 else
477 codes[2] = code;
478
479 clen = enclen(enc, p);
480 len += clen;
481
482 index = onigenc_unicode_fold3_key(codes);
483 if (index >= 0) {
484 m = FOLDS3_UNFOLDS_NUM(index);
485 for (i = 0; i < m; i++) {
486 items[n].byte_len = len;
487 items[n].code_len = 1;
488 items[n].code[0] = FOLDS3_UNFOLDS(index)[i];
489 n++;
490 }
491 }
492 }
493 }
494
495 return n;
496 }
497
498 #ifdef USE_UNICODE_PROPERTIES
499 #include "unicode_property_data.c"
500 #else
501 #include "unicode_property_data_posix.c"
502 #endif
503
504
505 #ifdef USE_UNICODE_WORD_BREAK
506
/*
 * Word-break class of a code point, as stored in the WB_RANGES table.
 * WB_Any (0) is the value wb_get_type() returns for code points not covered
 * by any range; the "short cut" test in onigenc_wb_is_break_position()
 * compares against 0 and therefore relies on WB_Any being 0.
 */
enum WB_TYPE {
  WB_Any = 0,
  WB_ALetter,
  WB_CR,
  WB_Double_Quote,
  WB_Extend,
  WB_ExtendNumLet,
  WB_Format,
  WB_Hebrew_Letter,
  WB_Katakana,
  WB_LF,
  WB_MidLetter,
  WB_MidNum,
  WB_MidNumLet,
  WB_Newline,
  WB_Numeric,
  WB_Regional_Indicator,
  WB_Single_Quote,
  WB_WSegSpace,
  WB_ZWJ,
};
528
/*
 * One entry of the WB_RANGES table: the code points start..end (both
 * inclusive, as tested by wb_get_type()) share the word-break class `type`.
 */
typedef struct {
  OnigCodePoint start;
  OnigCodePoint end;
  enum WB_TYPE type;
} WB_RANGE_TYPE;
534
535 #include "unicode_wb_data.c"
536
537 static enum WB_TYPE
wb_get_type(OnigCodePoint code)538 wb_get_type(OnigCodePoint code)
539 {
540 OnigCodePoint low, high, x;
541 enum WB_TYPE type;
542
543 for (low = 0, high = (OnigCodePoint )WB_RANGE_NUM; low < high; ) {
544 x = (low + high) >> 1;
545 if (code > WB_RANGES[x].end)
546 low = x + 1;
547 else
548 high = x;
549 }
550
551 type = (low < (OnigCodePoint )WB_RANGE_NUM &&
552 code >= WB_RANGES[low].start) ?
553 WB_RANGES[low].type : WB_Any;
554
555 return type;
556 }
557
/* Classes skipped over when scanning for the neighbouring "main" character
   (the Extend | Format | ZWJ set of rule WB4). */
#define IS_WB_IGNORE_TAIL(t)  ((t) == WB_Extend || (t) == WB_Format || (t) == WB_ZWJ)
/* AHLetter = ALetter | Hebrew_Letter */
#define IS_WB_AHLetter(t)     ((t) == WB_ALetter || (t) == WB_Hebrew_Letter)
/* MidNumLetQ = MidNumLet | Single_Quote */
#define IS_WB_MidNumLetQ(t)   ((t) == WB_MidNumLet || (t) == WB_Single_Quote)
561
562 static int
wb_get_next_main_code(OnigEncoding enc,UChar * p,const UChar * end,OnigCodePoint * rcode,enum WB_TYPE * rtype)563 wb_get_next_main_code(OnigEncoding enc, UChar* p, const UChar* end,
564 OnigCodePoint* rcode, enum WB_TYPE* rtype)
565 {
566 OnigCodePoint code;
567 enum WB_TYPE type;
568
569 while (TRUE) {
570 p += enclen(enc, p);
571 if (p >= end) break;
572
573 code = ONIGENC_MBC_TO_CODE(enc, p, end);
574 type = wb_get_type(code);
575 if (! IS_WB_IGNORE_TAIL(type)) {
576 *rcode = code;
577 *rtype = type;
578 return 1;
579 }
580 }
581
582 return 0;
583 }
584
585 extern int
onigenc_wb_is_break_position(OnigEncoding enc,UChar * p,UChar * prev,const UChar * start,const UChar * end)586 onigenc_wb_is_break_position(OnigEncoding enc, UChar* p, UChar* prev,
587 const UChar* start, const UChar* end)
588 {
589 int r;
590 UChar* pp;
591 OnigCodePoint cfrom;
592 OnigCodePoint cfrom2;
593 OnigCodePoint cto;
594 OnigCodePoint cto2;
595 enum WB_TYPE from;
596 enum WB_TYPE from2;
597 enum WB_TYPE to;
598 enum WB_TYPE to2;
599
600 /* WB1: sot / Any */
601 if (p == start) return TRUE;
602 /* WB2: Any / eot */
603 if (p == end) return TRUE;
604
605 if (IS_NULL(prev)) {
606 prev = onigenc_get_prev_char_head(enc, start, p);
607 if (IS_NULL(prev)) return TRUE;
608 }
609
610 cfrom = ONIGENC_MBC_TO_CODE(enc, prev, end);
611 cto = ONIGENC_MBC_TO_CODE(enc, p, end);
612
613 from = wb_get_type(cfrom);
614 to = wb_get_type(cto);
615
616 /* short cut */
617 if (from == 0 && to == 0) goto WB999;
618
619 /* WB3: CR + LF */
620 if (from == WB_CR && to == WB_LF) return FALSE;
621
622 /* WB3a: (Newline|CR|LF) / */
623 if (from == WB_Newline || from == WB_CR || from == WB_LF) return TRUE;
624 /* WB3b: / (Newline|CR|LF) */
625 if (to == WB_Newline || to == WB_CR || to == WB_LF) return TRUE;
626
627 /* WB3c: ZWJ + {Extended_Pictographic} */
628 if (from == WB_ZWJ) {
629 if (onigenc_unicode_is_code_ctype(cto, PROP_INDEX_EXTENDEDPICTOGRAPHIC))
630 return FALSE;
631 }
632
633 /* WB3d: WSegSpace + WSegSpace */
634 if (from == WB_WSegSpace && to == WB_WSegSpace) return FALSE;
635
636 /* WB4: X (Extend|Format|ZWJ)* -> X */
637 if (IS_WB_IGNORE_TAIL(to)) return FALSE;
638 if (IS_WB_IGNORE_TAIL(from)) {
639 while ((pp = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {
640 prev = pp;
641 cfrom = ONIGENC_MBC_TO_CODE(enc, prev, end);
642 from = wb_get_type(cfrom);
643 if (! IS_WB_IGNORE_TAIL(from))
644 break;
645 }
646 }
647
648 if (IS_WB_AHLetter(from)) {
649 /* WB5: AHLetter + AHLetter */
650 if (IS_WB_AHLetter(to)) return FALSE;
651
652 /* WB6: AHLetter + (MidLetter | MidNumLetQ) AHLetter */
653 if (to == WB_MidLetter || IS_WB_MidNumLetQ(to)) {
654 r = wb_get_next_main_code(enc, p, end, &cto2, &to2);
655 if (r == 1) {
656 if (IS_WB_AHLetter(to2)) return FALSE;
657 }
658 }
659 }
660
661 /* WB7: AHLetter (MidLetter | MidNumLetQ) + AHLetter */
662 if (from == WB_MidLetter || IS_WB_MidNumLetQ(from)) {
663 if (IS_WB_AHLetter(to)) {
664 from2 = WB_Any;
665 while ((pp = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {
666 prev = pp;
667 cfrom2 = ONIGENC_MBC_TO_CODE(enc, prev, end);
668 from2 = wb_get_type(cfrom2);
669 if (! IS_WB_IGNORE_TAIL(from2))
670 break;
671 }
672
673 if (IS_WB_AHLetter(from2)) return FALSE;
674 }
675 }
676
677 if (from == WB_Hebrew_Letter) {
678 /* WB7a: Hebrew_Letter + Single_Quote */
679 if (to == WB_Single_Quote) return FALSE;
680
681 /* WB7b: Hebrew_Letter + Double_Quote Hebrew_Letter */
682 if (to == WB_Double_Quote) {
683 r = wb_get_next_main_code(enc, p, end, &cto2, &to2);
684 if (r == 1) {
685 if (to2 == WB_Hebrew_Letter) return FALSE;
686 }
687 }
688 }
689
690 /* WB7c: Hebrew_Letter Double_Quote + Hebrew_Letter */
691 if (from == WB_Double_Quote) {
692 if (to == WB_Hebrew_Letter) {
693 from2 = WB_Any;
694 while ((pp = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {
695 prev = pp;
696 cfrom2 = ONIGENC_MBC_TO_CODE(enc, prev, end);
697 from2 = wb_get_type(cfrom2);
698 if (! IS_WB_IGNORE_TAIL(from2))
699 break;
700 }
701
702 if (from2 == WB_Hebrew_Letter) return FALSE;
703 }
704 }
705
706 if (to == WB_Numeric) {
707 /* WB8: Numeric + Numeric */
708 if (from == WB_Numeric) return FALSE;
709
710 /* WB9: AHLetter + Numeric */
711 if (IS_WB_AHLetter(from)) return FALSE;
712
713 /* WB11: Numeric (MidNum | MidNumLetQ) + Numeric */
714 if (from == WB_MidNum || IS_WB_MidNumLetQ(from)) {
715 from2 = WB_Any;
716 while ((pp = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {
717 prev = pp;
718 cfrom2 = ONIGENC_MBC_TO_CODE(enc, prev, end);
719 from2 = wb_get_type(cfrom2);
720 if (! IS_WB_IGNORE_TAIL(from2))
721 break;
722 }
723
724 if (from2 == WB_Numeric) return FALSE;
725 }
726 }
727
728 if (from == WB_Numeric) {
729 /* WB10: Numeric + AHLetter */
730 if (IS_WB_AHLetter(to)) return FALSE;
731
732 /* WB12: Numeric + (MidNum | MidNumLetQ) Numeric */
733 if (to == WB_MidNum || IS_WB_MidNumLetQ(to)) {
734 r = wb_get_next_main_code(enc, p, end, &cto2, &to2);
735 if (r == 1) {
736 if (to2 == WB_Numeric) return FALSE;
737 }
738 }
739 }
740
741 /* WB13: Katakana + Katakana */
742 if (from == WB_Katakana && to == WB_Katakana) return FALSE;
743
744 /* WB13a: (AHLetter | Numeric | Katakana | ExtendNumLet) + ExtendNumLet */
745 if (IS_WB_AHLetter(from) || from == WB_Numeric || from == WB_Katakana
746 || from == WB_ExtendNumLet) {
747 if (to == WB_ExtendNumLet) return FALSE;
748 }
749
750 /* WB13b: ExtendNumLet + (AHLetter | Numeric | Katakana) */
751 if (from == WB_ExtendNumLet) {
752 if (IS_WB_AHLetter(to) || to == WB_Numeric || to == WB_Katakana)
753 return FALSE;
754 }
755
756
757 /* WB15: sot (RI RI)* RI + RI */
758 /* WB16: [^RI] (RI RI)* RI + RI */
759 if (from == WB_Regional_Indicator && to == WB_Regional_Indicator) {
760 int n = 0;
761 while ((prev = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {
762 cfrom2 = ONIGENC_MBC_TO_CODE(enc, prev, end);
763 from2 = wb_get_type(cfrom2);
764 if (from2 != WB_Regional_Indicator)
765 break;
766
767 n++;
768 }
769 if ((n % 2) == 0) return FALSE;
770 }
771
772 WB999:
773 /* WB999: Any / Any */
774 return TRUE;
775 }
776
777 #endif /* USE_UNICODE_WORD_BREAK */
778
779
780 #ifdef USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER
781
/*
 * Result of unicode_egcb_is_break_2code().  The two UNDEF values mean the
 * decision cannot be made from the two code points alone;
 * onigenc_egcb_is_break_position() resolves them by scanning backwards.
 */
enum EGCB_BREAK_TYPE {
  EGCB_NOT_BREAK = 0,
  EGCB_BREAK     = 1,
  EGCB_BREAK_UNDEF_GB11  = 2,  /* ZWJ followed by Extended_Pictographic */
  EGCB_BREAK_UNDEF_RI_RI = 3   /* Regional_Indicator pair */
};
788
/*
 * Grapheme-cluster-break class of a code point, as stored in EGCB_RANGES.
 * The numeric order matters: IS_CONTROL_CR_LF() tests the interval
 * EGCB_CR..EGCB_Control and IS_HANGUL() tests for values >= EGCB_L.
 * EGCB_Other (0) is what egcb_get_type() returns for uncovered code points.
 */
enum EGCB_TYPE {
  EGCB_Other   = 0,
  EGCB_CR      = 1,
  EGCB_LF      = 2,
  EGCB_Control = 3,
  EGCB_Extend  = 4,
  EGCB_Prepend = 5,
  EGCB_Regional_Indicator = 6,
  EGCB_SpacingMark = 7,
  EGCB_ZWJ         = 8,
#if 0
  /* obsoleted */
  EGCB_E_Base         = 9,
  EGCB_E_Base_GAZ     = 10,
  EGCB_E_Modifier     = 11,
  EGCB_Glue_After_Zwj = 12,
#endif
  EGCB_L   = 13,
  EGCB_LV  = 14,
  EGCB_LVT = 15,
  EGCB_T   = 16,
  EGCB_V   = 17
};
812
/*
 * One entry of the EGCB_RANGES table: the code points start..end (both
 * inclusive, as tested by egcb_get_type()) share the class `type`.
 */
typedef struct {
  OnigCodePoint start;
  OnigCodePoint end;
  enum EGCB_TYPE type;
} EGCB_RANGE_TYPE;
818
819 #include "unicode_egcb_data.c"
820
821 static enum EGCB_TYPE
egcb_get_type(OnigCodePoint code)822 egcb_get_type(OnigCodePoint code)
823 {
824 OnigCodePoint low, high, x;
825 enum EGCB_TYPE type;
826
827 for (low = 0, high = (OnigCodePoint )EGCB_RANGE_NUM; low < high; ) {
828 x = (low + high) >> 1;
829 if (code > EGCB_RANGES[x].end)
830 low = x + 1;
831 else
832 high = x;
833 }
834
835 type = (low < (OnigCodePoint )EGCB_RANGE_NUM &&
836 code >= EGCB_RANGES[low].start) ?
837 EGCB_RANGES[low].type : EGCB_Other;
838
839 return type;
840 }
841
/* Both macros take an enum EGCB_TYPE value (despite the parameter name
   `code`) and depend on the numeric order of that enum. */
/* true for EGCB_CR, EGCB_LF and EGCB_Control (values 1..3) */
#define IS_CONTROL_CR_LF(code)   ((code) <= EGCB_Control && (code) >= EGCB_CR)
/* true for EGCB_L, EGCB_LV, EGCB_LVT, EGCB_T and EGCB_V (values >= 13) */
#define IS_HANGUL(code)          ((code) >= EGCB_L)
844
845 /* GB1 and GB2 are outside of this function. */
846 static enum EGCB_BREAK_TYPE
unicode_egcb_is_break_2code(OnigCodePoint from_code,OnigCodePoint to_code)847 unicode_egcb_is_break_2code(OnigCodePoint from_code, OnigCodePoint to_code)
848 {
849 enum EGCB_TYPE from;
850 enum EGCB_TYPE to;
851
852 from = egcb_get_type(from_code);
853 to = egcb_get_type(to_code);
854
855 /* short cut */
856 if (from == 0 && to == 0) goto GB999;
857
858 /* GB3 */
859 if (from == EGCB_CR && to == EGCB_LF) return EGCB_NOT_BREAK;
860 /* GB4 */
861 if (IS_CONTROL_CR_LF(from)) return EGCB_BREAK;
862 /* GB5 */
863 if (IS_CONTROL_CR_LF(to)) return EGCB_BREAK;
864
865 if (IS_HANGUL(from) && IS_HANGUL(to)) {
866 /* GB6 */
867 if (from == EGCB_L && to != EGCB_T) return EGCB_NOT_BREAK;
868 /* GB7 */
869 if ((from == EGCB_LV || from == EGCB_V)
870 && (to == EGCB_V || to == EGCB_T)) return EGCB_NOT_BREAK;
871
872 /* GB8 */
873 if ((to == EGCB_T) && (from == EGCB_LVT || from == EGCB_T))
874 return EGCB_NOT_BREAK;
875
876 goto GB999;
877 }
878
879 /* GB9 */
880 if (to == EGCB_Extend || to == EGCB_ZWJ) return EGCB_NOT_BREAK;
881
882 /* GB9a */
883 if (to == EGCB_SpacingMark) return EGCB_NOT_BREAK;
884 /* GB9b */
885 if (from == EGCB_Prepend) return EGCB_NOT_BREAK;
886
887 /* GB10 removed */
888
889 /* GB11 */
890 if (from == EGCB_ZWJ) {
891 if (onigenc_unicode_is_code_ctype(to_code, PROP_INDEX_EXTENDEDPICTOGRAPHIC))
892 return EGCB_BREAK_UNDEF_GB11;
893
894 goto GB999;
895 }
896
897 /* GB12, GB13 */
898 if (from == EGCB_Regional_Indicator && to == EGCB_Regional_Indicator) {
899 return EGCB_BREAK_UNDEF_RI_RI;
900 }
901
902 GB999:
903 return EGCB_BREAK;
904 }
905
906 #endif /* USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER */
907
908 extern int
onigenc_egcb_is_break_position(OnigEncoding enc,UChar * p,UChar * prev,const UChar * start,const UChar * end)909 onigenc_egcb_is_break_position(OnigEncoding enc, UChar* p, UChar* prev,
910 const UChar* start, const UChar* end)
911 {
912 OnigCodePoint from;
913 OnigCodePoint to;
914 #ifdef USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER
915 enum EGCB_BREAK_TYPE btype;
916 enum EGCB_TYPE type;
917 #endif
918
919 /* GB1 and GB2 */
920 if (p == start) return 1;
921 if (p == end) return 1;
922
923 if (IS_NULL(prev)) {
924 prev = onigenc_get_prev_char_head(enc, start, p);
925 if (IS_NULL(prev)) return 1;
926 }
927
928 from = ONIGENC_MBC_TO_CODE(enc, prev, end);
929 to = ONIGENC_MBC_TO_CODE(enc, p, end);
930
931 #ifdef USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER
932 if (! ONIGENC_IS_UNICODE_ENCODING(enc)) {
933 return from != 0x000d || to != 0x000a;
934 }
935
936 btype = unicode_egcb_is_break_2code(from, to);
937 switch (btype) {
938 case EGCB_NOT_BREAK:
939 return 0;
940 break;
941 case EGCB_BREAK:
942 return 1;
943 break;
944
945 case EGCB_BREAK_UNDEF_GB11:
946 while ((prev = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {
947 from = ONIGENC_MBC_TO_CODE(enc, prev, end);
948 if (onigenc_unicode_is_code_ctype(from, PROP_INDEX_EXTENDEDPICTOGRAPHIC))
949 return 0;
950
951 type = egcb_get_type(from);
952 if (type != EGCB_Extend)
953 break;
954 }
955 break;
956
957 case EGCB_BREAK_UNDEF_RI_RI:
958 {
959 int n = 0;
960 while ((prev = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {
961 from = ONIGENC_MBC_TO_CODE(enc, prev, end);
962 type = egcb_get_type(from);
963 if (type != EGCB_Regional_Indicator)
964 break;
965
966 n++;
967 }
968 if ((n % 2) == 0) return 0;
969 }
970 break;
971 }
972
973 return 1;
974
975 #else
976 return from != 0x000d || to != 0x000a;
977 #endif /* USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER */
978 }
979
980
/* Upper limit on properties registered via onig_unicode_define_user_property(). */
#define USER_DEFINED_PROPERTY_MAX_NUM  20

/* One registered user property: the ctype number assigned to it and the
   caller-supplied range data (the pointer is stored, not copied). */
typedef struct {
  int ctype;
  OnigCodePoint* ranges;
} UserDefinedPropertyValue;

/* Number of slots of UserDefinedPropertyRanges currently in use. */
static int UserDefinedPropertyNum;
/* Slot i holds the property whose ctype is CODE_RANGES_NUM + i. */
static UserDefinedPropertyValue
UserDefinedPropertyRanges[USER_DEFINED_PROPERTY_MAX_NUM];
/* Normalized property name -> UserDefinedPropertyValue*; created on first
   registration. */
static st_table* UserDefinedPropertyTable;
992
993 extern int
onig_unicode_define_user_property(const char * name,OnigCodePoint * ranges)994 onig_unicode_define_user_property(const char* name, OnigCodePoint* ranges)
995 {
996 UserDefinedPropertyValue* e;
997 int r;
998 int i;
999 int n;
1000 int len;
1001 int c;
1002 char* s;
1003 UChar* uname;
1004
1005 if (UserDefinedPropertyNum >= USER_DEFINED_PROPERTY_MAX_NUM)
1006 return ONIGERR_TOO_MANY_USER_DEFINED_OBJECTS;
1007
1008 len = (int )strlen(name);
1009 if (len >= PROPERTY_NAME_MAX_SIZE)
1010 return ONIGERR_TOO_LONG_PROPERTY_NAME;
1011
1012 s = (char* )xmalloc(len + 1);
1013 if (s == 0)
1014 return ONIGERR_MEMORY;
1015
1016 uname = (UChar* )name;
1017 n = 0;
1018 for (i = 0; i < len; i++) {
1019 c = uname[i];
1020 if (c < 0x20 || c >= 0x80) {
1021 xfree(s);
1022 return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
1023 }
1024
1025 if (c != ' ' && c != '-' && c != '_') {
1026 s[n] = c;
1027 n++;
1028 }
1029 }
1030 s[n] = '\0';
1031
1032 if (UserDefinedPropertyTable == 0) {
1033 UserDefinedPropertyTable = onig_st_init_strend_table_with_size(10);
1034 if (IS_NULL(UserDefinedPropertyTable)) {
1035 xfree(s);
1036 return ONIGERR_MEMORY;
1037 }
1038 }
1039
1040 e = UserDefinedPropertyRanges + UserDefinedPropertyNum;
1041 e->ctype = CODE_RANGES_NUM + UserDefinedPropertyNum;
1042 e->ranges = ranges;
1043 r = onig_st_insert_strend(UserDefinedPropertyTable,
1044 (const UChar* )s, (const UChar* )s + n,
1045 (hash_data_type )((void* )e));
1046 if (r < 0) return r;
1047
1048 UserDefinedPropertyNum++;
1049 return 0;
1050 }
1051
1052 extern int
onigenc_unicode_is_code_ctype(OnigCodePoint code,unsigned int ctype)1053 onigenc_unicode_is_code_ctype(OnigCodePoint code, unsigned int ctype)
1054 {
1055 if (
1056 #ifdef USE_UNICODE_PROPERTIES
1057 ctype <= ONIGENC_MAX_STD_CTYPE &&
1058 #endif
1059 code < 256) {
1060 return ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE(code, ctype);
1061 }
1062
1063 if (ctype >= CODE_RANGES_NUM) {
1064 int index = ctype - CODE_RANGES_NUM;
1065 if (index < UserDefinedPropertyNum)
1066 return onig_is_in_code_range((UChar* )UserDefinedPropertyRanges[index].ranges, code);
1067 else
1068 return ONIGERR_TYPE_BUG;
1069 }
1070
1071 return onig_is_in_code_range((UChar* )CodeRanges[ctype], code);
1072 }
1073
1074
1075 extern int
onigenc_unicode_ctype_code_range(OnigCtype ctype,const OnigCodePoint * ranges[])1076 onigenc_unicode_ctype_code_range(OnigCtype ctype, const OnigCodePoint* ranges[])
1077 {
1078 if (ctype >= CODE_RANGES_NUM) {
1079 int index = ctype - CODE_RANGES_NUM;
1080 if (index < UserDefinedPropertyNum) {
1081 *ranges = UserDefinedPropertyRanges[index].ranges;
1082 return 0;
1083 }
1084 else
1085 return ONIGERR_TYPE_BUG;
1086 }
1087
1088 *ranges = CodeRanges[ctype];
1089 return 0;
1090 }
1091
1092 extern int
onigenc_utf16_32_get_ctype_code_range(OnigCtype ctype,OnigCodePoint * sb_out,const OnigCodePoint * ranges[])1093 onigenc_utf16_32_get_ctype_code_range(OnigCtype ctype, OnigCodePoint* sb_out,
1094 const OnigCodePoint* ranges[])
1095 {
1096 *sb_out = 0x00;
1097 return onigenc_unicode_ctype_code_range(ctype, ranges);
1098 }
1099
1100 extern int
onigenc_unicode_property_name_to_ctype(OnigEncoding enc,UChar * name,UChar * end)1101 onigenc_unicode_property_name_to_ctype(OnigEncoding enc, UChar* name, UChar* end)
1102 {
1103 int len;
1104 UChar *p;
1105 OnigCodePoint code;
1106 const struct PoolPropertyNameCtype* pc;
1107 char buf[PROPERTY_NAME_MAX_SIZE];
1108
1109 p = name;
1110 len = 0;
1111 while (p < end) {
1112 code = ONIGENC_MBC_TO_CODE(enc, p, end);
1113 if (code >= 0x80)
1114 return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
1115
1116 if (code != ' ' && code != '-' && code != '_') {
1117 buf[len++] = (char )code;
1118 if (len >= PROPERTY_NAME_MAX_SIZE)
1119 return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
1120 }
1121
1122 p += enclen(enc, p);
1123 }
1124
1125 buf[len] = 0;
1126
1127 if (UserDefinedPropertyTable != 0) {
1128 UserDefinedPropertyValue* e;
1129 e = (UserDefinedPropertyValue* )NULL;
1130 onig_st_lookup_strend(UserDefinedPropertyTable,
1131 (const UChar* )buf, (const UChar* )buf + len,
1132 (hash_data_type* )((void* )(&e)));
1133 if (e != 0) {
1134 return e->ctype;
1135 }
1136 }
1137
1138 pc = unicode_lookup_property_name(buf, len);
1139 if (pc != 0) {
1140 /* fprintf(stderr, "LOOKUP: %s: %d\n", buf, pc->ctype); */
1141 #ifndef USE_UNICODE_PROPERTIES
1142 if (pc->ctype > ONIGENC_MAX_STD_CTYPE)
1143 return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
1144 #endif
1145
1146 return (int )pc->ctype;
1147 }
1148
1149 return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
1150 }
1151