1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Original API code Copyright (c) 1997-2012 University of Cambridge
10 New API code Copyright (c) 2016-2018 University of Cambridge
11
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40
41 /* This module contains the function for checking a script run. */
42
43 #ifdef HAVE_CONFIG_H
44 #include "config.h"
45 #endif
46
47 #include "pcre2_internal.h"
48
49
50 /*************************************************
51 * Check script run *
52 *************************************************/
53
54 /* A script run is conceptually a sequence of characters all in the same
55 Unicode script. However, it isn't quite that simple. There are special rules
56 for scripts that are commonly used together, and also special rules for digits.
57 This function implements the appropriate checks, which is possible only when
58 PCRE2 is compiled with Unicode support. The function returns TRUE if there is
59 no Unicode support; however, it should never be called in that circumstance
60 because an error is given by pcre2_compile() if a script run is called for in a
61 version of PCRE2 compiled without Unicode support.
62
63 Arguments:
  ptr       points to the first character
  endptr    points after the last character
  utf       TRUE if in UTF mode
67
68 Returns: TRUE if this is a valid script run
69 */
70
/* Sentinel values for the "required script" state kept while checking a script
run. The same int variable also holds real script numbers, which are positive.
These dummy values must be less than the negation of the largest offset in the
PRIV(ucd_script_sets) vector, which is held in a 16-bit field in UCD records
(and is only likely to be a few hundred). */

#define SCRIPT_UNSET        (-99999)  /* No script requirement set yet */
#define SCRIPT_HANPENDING   (-99998)  /* Only Han seen so far; partner undecided */
#define SCRIPT_HANHIRAKATA  (-99997)  /* Han, Hiragana and Katakana accepted */
#define SCRIPT_HANBOPOMOFO  (-99996)  /* Han and Bopomofo accepted */
#define SCRIPT_HANHANGUL    (-99995)  /* Han and Hangul accepted */
#define SCRIPT_LIST         (-99994)  /* Requirement is a zero-terminated list */

/* Number of elements in the local vector that holds the intersection of
several script lists, including its terminating zero. */

#define INTERSECTION_LIST_SIZE 50
83
84 BOOL
PRIV(script_run)85 PRIV(script_run)(PCRE2_SPTR ptr, PCRE2_SPTR endptr, BOOL utf)
86 {
87 #ifdef SUPPORT_UNICODE
88 int require_script = SCRIPT_UNSET;
89 uint8_t intersection_list[INTERSECTION_LIST_SIZE];
90 const uint8_t *require_list = NULL;
91 uint32_t require_digitset = 0;
92 uint32_t c;
93
94 #if PCRE2_CODE_UNIT_WIDTH == 32
95 (void)utf; /* Avoid compiler warning */
96 #endif
97
98 /* Any string containing fewer than 2 characters is a valid script run. */
99
100 if (ptr >= endptr) return TRUE;
101 GETCHARINCTEST(c, ptr);
102 if (ptr >= endptr) return TRUE;
103
104 /* Scan strings of two or more characters, checking the Unicode characteristics
105 of each code point. We make use of the Script Extensions property. There is
106 special code for scripts that can be combined with characters from the Han
107 Chinese script. This may be used in conjunction with four other scripts in
108 these combinations:
109
110 . Han with Hiragana and Katakana is allowed (for Japanese).
111 . Han with Bopomofo is allowed (for Taiwanese Mandarin).
112 . Han with Hangul is allowed (for Korean).
113
114 If the first significant character's script is one of the four, the required
115 script type is immediately known. However, if the first significant
116 character's script is Han, we have to keep checking for a non-Han character.
117 Hence the SCRIPT_HANPENDING state. */
118
119 for (;;)
120 {
121 const ucd_record *ucd = GET_UCD(c);
122 int32_t scriptx = ucd->scriptx;
123
124 /* If the script extension is Unknown, the string is not a valid script run.
125 Such characters can only form script runs of length one. */
126
127 if (scriptx == ucp_Unknown) return FALSE;
128
129 /* A character whose script extension is Inherited is always accepted with
130 any script, and plays no further part in this testing. A character whose
131 script is Common is always accepted, but must still be tested for a digit
132 below. The scriptx value at this point is non-zero, because zero is
133 ucp_Unknown, tested for above. */
134
135 if (scriptx != ucp_Inherited)
136 {
137 if (scriptx != ucp_Common)
138 {
139 /* If the script extension value is positive, the character is not a mark
140 that can be used with many scripts. In the simple case we either set or
141 compare with the required script. However, handling the scripts that can
142 combine with Han are more complicated, as is the case when the previous
143 characters have been man-script marks. */
144
145 if (scriptx > 0)
146 {
147 switch(require_script)
148 {
149 /* Either the first significant character (require_script unset) or
150 after only Han characters. */
151
152 case SCRIPT_UNSET:
153 case SCRIPT_HANPENDING:
154 switch(scriptx)
155 {
156 case ucp_Han:
157 require_script = SCRIPT_HANPENDING;
158 break;
159
160 case ucp_Hiragana:
161 case ucp_Katakana:
162 require_script = SCRIPT_HANHIRAKATA;
163 break;
164
165 case ucp_Bopomofo:
166 require_script = SCRIPT_HANBOPOMOFO;
167 break;
168
169 case ucp_Hangul:
170 require_script = SCRIPT_HANHANGUL;
171 break;
172
173 /* Not a Han-related script. If expecting one, fail. Otherise set
174 the requirement to this script. */
175
176 default:
177 if (require_script == SCRIPT_HANPENDING) return FALSE;
178 require_script = scriptx;
179 break;
180 }
181 break;
182
183 /* Previously encountered one of the "with Han" scripts. Check that
184 this character is appropriate. */
185
186 case SCRIPT_HANHIRAKATA:
187 if (scriptx != ucp_Han && scriptx != ucp_Hiragana &&
188 scriptx != ucp_Katakana)
189 return FALSE;
190 break;
191
192 case SCRIPT_HANBOPOMOFO:
193 if (scriptx != ucp_Han && scriptx != ucp_Bopomofo) return FALSE;
194 break;
195
196 case SCRIPT_HANHANGUL:
197 if (scriptx != ucp_Han && scriptx != ucp_Hangul) return FALSE;
198 break;
199
200 /* We have a list of scripts to check that is derived from one or
201 more previous characters. This is either one of the lists in
202 ucd_script_sets[] (for one previous character) or the intersection of
203 several lists for multiple characters. */
204
205 case SCRIPT_LIST:
206 {
207 const uint8_t *list;
208 for (list = require_list; *list != 0; list++)
209 {
210 if (*list == scriptx) break;
211 }
212 if (*list == 0) return FALSE;
213 }
214
215 /* The rest of the string must be in this script, but we have to
216 allow for the Han complications. */
217
218 switch(scriptx)
219 {
220 case ucp_Han:
221 require_script = SCRIPT_HANPENDING;
222 break;
223
224 case ucp_Hiragana:
225 case ucp_Katakana:
226 require_script = SCRIPT_HANHIRAKATA;
227 break;
228
229 case ucp_Bopomofo:
230 require_script = SCRIPT_HANBOPOMOFO;
231 break;
232
233 case ucp_Hangul:
234 require_script = SCRIPT_HANHANGUL;
235 break;
236
237 default:
238 require_script = scriptx;
239 break;
240 }
241 break;
242
243 /* This is the easy case when a single script is required. */
244
245 default:
246 if (scriptx != require_script) return FALSE;
247 break;
248 }
249 } /* End of handing positive scriptx */
250
251 /* If scriptx is negative, this character is a mark-type character that
252 has a list of permitted scripts. */
253
254 else
255 {
256 uint32_t chspecial;
257 const uint8_t *clist, *rlist;
258 const uint8_t *list = PRIV(ucd_script_sets) - scriptx;
259
260 switch(require_script)
261 {
262 case SCRIPT_UNSET:
263 require_list = PRIV(ucd_script_sets) - scriptx;
264 require_script = SCRIPT_LIST;
265 break;
266
267 /* An inspection of the Unicode 11.0.0 files shows that there are the
268 following types of Script Extension list that involve the Han,
269 Bopomofo, Hiragana, Katakana, and Hangul scripts:
270
271 . Bopomofo + Han
272 . Han + Hiragana + Katakana
273 . Hiragana + Katakana
274 . Bopopmofo + Hangul + Han + Hiragana + Katakana
275
276 The following code tries to make sense of this. */
277
278 #define FOUND_BOPOMOFO 1
279 #define FOUND_HIRAGANA 2
280 #define FOUND_KATAKANA 4
281 #define FOUND_HANGUL 8
282
283 case SCRIPT_HANPENDING:
284 chspecial = 0;
285 for (; *list != 0; list++)
286 {
287 switch (*list)
288 {
289 case ucp_Bopomofo: chspecial |= FOUND_BOPOMOFO; break;
290 case ucp_Hiragana: chspecial |= FOUND_HIRAGANA; break;
291 case ucp_Katakana: chspecial |= FOUND_KATAKANA; break;
292 case ucp_Hangul: chspecial |= FOUND_HANGUL; break;
293 default: break;
294 }
295 }
296
297 if (chspecial == 0) return FALSE;
298
299 if (chspecial == FOUND_BOPOMOFO)
300 {
301 require_script = SCRIPT_HANBOPOMOFO;
302 }
303 else if (chspecial == (FOUND_HIRAGANA|FOUND_KATAKANA))
304 {
305 require_script = SCRIPT_HANHIRAKATA;
306 }
307
308 /* Otherwise it must be allowed with all of them, so remain in
309 the pending state. */
310
311 break;
312
313 case SCRIPT_HANHIRAKATA:
314 for (; *list != 0; list++)
315 {
316 if (*list == ucp_Hiragana || *list == ucp_Katakana) break;
317 }
318 if (*list == 0) return FALSE;
319 break;
320
321 case SCRIPT_HANBOPOMOFO:
322 for (; *list != 0; list++)
323 {
324 if (*list == ucp_Bopomofo) break;
325 }
326 if (*list == 0) return FALSE;
327 break;
328
329 case SCRIPT_HANHANGUL:
330 for (; *list != 0; list++)
331 {
332 if (*list == ucp_Hangul) break;
333 }
334 if (*list == 0) return FALSE;
335 break;
336
337 /* Previously encountered one or more characters that are allowed
338 with a list of scripts. Build the intersection of the required list
339 with this character's list in intersection_list[]. This code is
340 written so that it still works OK if the required list is already in
341 that vector. */
342
343 case SCRIPT_LIST:
344 {
345 int i = 0;
346 for (rlist = require_list; *rlist != 0; rlist++)
347 {
348 for (clist = list; *clist != 0; clist++)
349 {
350 if (*rlist == *clist)
351 {
352 intersection_list[i++] = *rlist;
353 break;
354 }
355 }
356 }
357 if (i == 0) return FALSE; /* No scripts in common */
358
359 /* If there's just one script in common, we can set it as the
360 unique required script. Otherwise, terminate the intersection list
361 and make it the required list. */
362
363 if (i == 1)
364 {
365 require_script = intersection_list[0];
366 }
367 else
368 {
369 intersection_list[i] = 0;
370 require_list = intersection_list;
371 }
372 }
373 break;
374
375 /* The previously set required script is a single script, not
376 Han-related. Check that it is in this character's list. */
377
378 default:
379 for (; *list != 0; list++)
380 {
381 if (*list == require_script) break;
382 }
383 if (*list == 0) return FALSE;
384 break;
385 }
386 } /* End of handling negative scriptx */
387 } /* End of checking non-Common character */
388
389 /* The character is in an acceptable script. We must now ensure that all
390 decimal digits in the string come from the same set. Some scripts (e.g.
391 Common, Arabic) have more than one set of decimal digits. This code does
392 not allow mixing sets, even within the same script. The vector called
393 PRIV(ucd_digit_sets)[] contains, in its first element, the number of
394 following elements, and then, in ascending order, the code points of the
395 '9' characters in every set of 10 digits. Each set is identified by the
396 offset in the vector of its '9' character. An initial check of the first
397 value picks up ASCII digits quickly. Otherwise, a binary chop is used. */
398
399 if (ucd->chartype == ucp_Nd)
400 {
401 uint32_t digitset;
402
403 if (c <= PRIV(ucd_digit_sets)[1]) digitset = 1; else
404 {
405 int mid;
406 int bot = 1;
407 int top = PRIV(ucd_digit_sets)[0];
408 for (;;)
409 {
410 if (top <= bot + 1) /* <= rather than == is paranoia */
411 {
412 digitset = top;
413 break;
414 }
415 mid = (top + bot) / 2;
416 if (c <= PRIV(ucd_digit_sets)[mid]) top = mid; else bot = mid;
417 }
418 }
419
420 /* A required value of 0 means "unset". */
421
422 if (require_digitset == 0) require_digitset = digitset;
423 else if (digitset != require_digitset) return FALSE;
424 } /* End digit handling */
425 } /* End checking non-Inherited character */
426
427 /* If we haven't yet got to the end, pick up the next character. */
428
429 if (ptr >= endptr) return TRUE;
430 GETCHARINCTEST(c, ptr);
431 } /* End checking loop */
432
433 #else /* NOT SUPPORT_UNICODE */
434 (void)ptr;
435 (void)endptr;
436 (void)utf;
437 return TRUE;
438 #endif /* SUPPORT_UNICODE */
439 }
440
441 /* End of pcre2_script_run.c */
442