1 /*************************************************
2 *      Perl-Compatible Regular Expressions       *
3 *************************************************/
4 
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7 
8                        Written by Philip Hazel
9      Original API code Copyright (c) 1997-2012 University of Cambridge
10           New API code Copyright (c) 2016-2018 University of Cambridge
11 
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15 
16     * Redistributions of source code must retain the above copyright notice,
17       this list of conditions and the following disclaimer.
18 
19     * Redistributions in binary form must reproduce the above copyright
20       notice, this list of conditions and the following disclaimer in the
21       documentation and/or other materials provided with the distribution.
22 
23     * Neither the name of the University of Cambridge nor the names of its
24       contributors may be used to endorse or promote products derived from
25       this software without specific prior written permission.
26 
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40 
41 /* This module contains the function for checking a script run. */
42 
43 #ifdef HAVE_CONFIG_H
44 #include "config.h"
45 #endif
46 
47 #include "pcre2_internal.h"
48 
49 
50 /*************************************************
51 *                Check script run                *
52 *************************************************/
53 
/* A script run is conceptually a sequence of characters all in the same
Unicode script. However, it isn't quite that simple. There are special rules
for scripts that are commonly used together, and also special rules for digits.
This function implements the appropriate checks, which is possible only when
PCRE2 is compiled with Unicode support. The function returns TRUE if there is
no Unicode support; however, it should never be called in that circumstance
because an error is given by pcre2_compile() if a script run is called for in a
version of PCRE2 compiled without Unicode support.

Arguments:
  ptr       points to the first character
  endptr    points after the last character
  utf       TRUE if in UTF mode

Returns:    TRUE if this is a valid script run
*/
70 
/* State values for the "required script" variable used while scanning a
string. That variable also holds genuine (positive) script numbers, so these
dummy values must be less than the negation of the largest offset in the
PRIV(ucd_script_sets) vector, which is held in a 16-bit field in UCD records
(and is only likely to be a few hundred). */

#define SCRIPT_UNSET        (-99999)  /* No significant character seen yet */
#define SCRIPT_HANPENDING   (-99998)  /* Only Han seen so far */
#define SCRIPT_HANHIRAKATA  (-99997)  /* Han with Hiragana and/or Katakana */
#define SCRIPT_HANBOPOMOFO  (-99996)  /* Han with Bopomofo */
#define SCRIPT_HANHANGUL    (-99995)  /* Han with Hangul */
#define SCRIPT_LIST         (-99994)  /* Requirement is a list of scripts */

/* Number of elements in the vector that holds the intersection of two
zero-terminated script lists; one element is needed for the terminator. */

#define INTERSECTION_LIST_SIZE 50
83 
84 BOOL
PRIV(script_run)85 PRIV(script_run)(PCRE2_SPTR ptr, PCRE2_SPTR endptr, BOOL utf)
86 {
87 #ifdef SUPPORT_UNICODE
88 int require_script = SCRIPT_UNSET;
89 uint8_t intersection_list[INTERSECTION_LIST_SIZE];
90 const uint8_t *require_list = NULL;
91 uint32_t require_digitset = 0;
92 uint32_t c;
93 
94 #if PCRE2_CODE_UNIT_WIDTH == 32
95 (void)utf;    /* Avoid compiler warning */
96 #endif
97 
98 /* Any string containing fewer than 2 characters is a valid script run. */
99 
100 if (ptr >= endptr) return TRUE;
101 GETCHARINCTEST(c, ptr);
102 if (ptr >= endptr) return TRUE;
103 
104 /* Scan strings of two or more characters, checking the Unicode characteristics
105 of each code point. We make use of the Script Extensions property. There is
106 special code for scripts that can be combined with characters from the Han
107 Chinese script. This may be used in conjunction with four other scripts in
108 these combinations:
109 
110 . Han with Hiragana and Katakana is allowed (for Japanese).
111 . Han with Bopomofo is allowed (for Taiwanese Mandarin).
112 . Han with Hangul is allowed (for Korean).
113 
114 If the first significant character's script is one of the four, the required
115 script type is immediately known. However, if the first significant
116 character's script is Han, we have to keep checking for a non-Han character.
117 Hence the SCRIPT_HANPENDING state. */
118 
119 for (;;)
120   {
121   const ucd_record *ucd = GET_UCD(c);
122   int32_t scriptx = ucd->scriptx;
123 
124   /* If the script extension is Unknown, the string is not a valid script run.
125   Such characters can only form script runs of length one. */
126 
127   if (scriptx == ucp_Unknown) return FALSE;
128 
129   /* A character whose script extension is Inherited is always accepted with
130   any script, and plays no further part in this testing. A character whose
131   script is Common is always accepted, but must still be tested for a digit
132   below. The scriptx value at this point is non-zero, because zero is
133   ucp_Unknown, tested for above. */
134 
135   if (scriptx != ucp_Inherited)
136     {
137     if (scriptx != ucp_Common)
138       {
139       /* If the script extension value is positive, the character is not a mark
140       that can be used with many scripts. In the simple case we either set or
141       compare with the required script. However, handling the scripts that can
142       combine with Han are more complicated, as is the case when the previous
143       characters have been man-script marks. */
144 
145       if (scriptx > 0)
146         {
147         switch(require_script)
148           {
149           /* Either the first significant character (require_script unset) or
150           after only Han characters. */
151 
152           case SCRIPT_UNSET:
153           case SCRIPT_HANPENDING:
154           switch(scriptx)
155             {
156             case ucp_Han:
157             require_script = SCRIPT_HANPENDING;
158             break;
159 
160             case ucp_Hiragana:
161             case ucp_Katakana:
162             require_script = SCRIPT_HANHIRAKATA;
163             break;
164 
165             case ucp_Bopomofo:
166             require_script = SCRIPT_HANBOPOMOFO;
167             break;
168 
169             case ucp_Hangul:
170             require_script = SCRIPT_HANHANGUL;
171             break;
172 
173             /* Not a Han-related script. If expecting one, fail. Otherise set
174             the requirement to this script. */
175 
176             default:
177             if (require_script == SCRIPT_HANPENDING) return FALSE;
178             require_script = scriptx;
179             break;
180             }
181           break;
182 
183           /* Previously encountered one of the "with Han" scripts. Check that
184           this character is appropriate. */
185 
186           case SCRIPT_HANHIRAKATA:
187           if (scriptx != ucp_Han && scriptx != ucp_Hiragana &&
188               scriptx != ucp_Katakana)
189             return FALSE;
190           break;
191 
192           case SCRIPT_HANBOPOMOFO:
193           if (scriptx != ucp_Han && scriptx != ucp_Bopomofo) return FALSE;
194           break;
195 
196           case SCRIPT_HANHANGUL:
197           if (scriptx != ucp_Han && scriptx != ucp_Hangul) return FALSE;
198           break;
199 
200           /* We have a list of scripts to check that is derived from one or
201           more previous characters. This is either one of the lists in
202           ucd_script_sets[] (for one previous character) or the intersection of
203           several lists for multiple characters. */
204 
205           case SCRIPT_LIST:
206             {
207             const uint8_t *list;
208             for (list = require_list; *list != 0; list++)
209               {
210               if (*list == scriptx) break;
211               }
212             if (*list == 0) return FALSE;
213             }
214 
215           /* The rest of the string must be in this script, but we have to
216           allow for the Han complications. */
217 
218           switch(scriptx)
219             {
220             case ucp_Han:
221             require_script = SCRIPT_HANPENDING;
222             break;
223 
224             case ucp_Hiragana:
225             case ucp_Katakana:
226             require_script = SCRIPT_HANHIRAKATA;
227             break;
228 
229             case ucp_Bopomofo:
230             require_script = SCRIPT_HANBOPOMOFO;
231             break;
232 
233             case ucp_Hangul:
234             require_script = SCRIPT_HANHANGUL;
235             break;
236 
237             default:
238             require_script = scriptx;
239             break;
240             }
241           break;
242 
243           /* This is the easy case when a single script is required. */
244 
245           default:
246           if (scriptx != require_script) return FALSE;
247           break;
248           }
249         }  /* End of handing positive scriptx */
250 
251       /* If scriptx is negative, this character is a mark-type character that
252       has a list of permitted scripts. */
253 
254       else
255         {
256         uint32_t chspecial;
257         const uint8_t *clist, *rlist;
258         const uint8_t *list = PRIV(ucd_script_sets) - scriptx;
259 
260         switch(require_script)
261           {
262           case SCRIPT_UNSET:
263           require_list = PRIV(ucd_script_sets) - scriptx;
264           require_script = SCRIPT_LIST;
265           break;
266 
267           /* An inspection of the Unicode 11.0.0 files shows that there are the
268           following types of Script Extension list that involve the Han,
269           Bopomofo, Hiragana, Katakana, and Hangul scripts:
270 
271           . Bopomofo + Han
272           . Han + Hiragana + Katakana
273           . Hiragana + Katakana
274           . Bopopmofo + Hangul + Han + Hiragana + Katakana
275 
276           The following code tries to make sense of this. */
277 
278 #define FOUND_BOPOMOFO 1
279 #define FOUND_HIRAGANA 2
280 #define FOUND_KATAKANA 4
281 #define FOUND_HANGUL   8
282 
283           case SCRIPT_HANPENDING:
284           chspecial = 0;
285           for (; *list != 0; list++)
286             {
287             switch (*list)
288               {
289               case ucp_Bopomofo: chspecial |= FOUND_BOPOMOFO; break;
290               case ucp_Hiragana: chspecial |= FOUND_HIRAGANA; break;
291               case ucp_Katakana: chspecial |= FOUND_KATAKANA; break;
292               case ucp_Hangul:   chspecial |= FOUND_HANGUL; break;
293               default: break;
294               }
295             }
296 
297            if (chspecial == 0) return FALSE;
298 
299            if (chspecial == FOUND_BOPOMOFO)
300              {
301              require_script = SCRIPT_HANBOPOMOFO;
302              }
303            else if (chspecial == (FOUND_HIRAGANA|FOUND_KATAKANA))
304              {
305              require_script = SCRIPT_HANHIRAKATA;
306              }
307 
308           /* Otherwise it must be allowed with all of them, so remain in
309           the pending state. */
310 
311           break;
312 
313           case SCRIPT_HANHIRAKATA:
314           for (; *list != 0; list++)
315             {
316             if (*list == ucp_Hiragana || *list == ucp_Katakana) break;
317             }
318           if (*list == 0) return FALSE;
319           break;
320 
321           case SCRIPT_HANBOPOMOFO:
322           for (; *list != 0; list++)
323             {
324             if (*list == ucp_Bopomofo) break;
325             }
326           if (*list == 0) return FALSE;
327           break;
328 
329           case SCRIPT_HANHANGUL:
330           for (; *list != 0; list++)
331             {
332             if (*list == ucp_Hangul) break;
333             }
334           if (*list == 0) return FALSE;
335           break;
336 
337           /* Previously encountered one or more characters that are allowed
338           with a list of scripts. Build the intersection of the required list
339           with this character's list in intersection_list[]. This code is
340           written so that it still works OK if the required list is already in
341           that vector. */
342 
343           case SCRIPT_LIST:
344             {
345             int i = 0;
346             for (rlist = require_list; *rlist != 0; rlist++)
347               {
348               for (clist = list; *clist != 0; clist++)
349                 {
350                 if (*rlist == *clist)
351                   {
352                   intersection_list[i++] = *rlist;
353                   break;
354                   }
355                 }
356               }
357             if (i == 0) return FALSE;  /* No scripts in common */
358 
359             /* If there's just one script in common, we can set it as the
360             unique required script. Otherwise, terminate the intersection list
361             and make it the required list. */
362 
363             if (i == 1)
364               {
365               require_script = intersection_list[0];
366               }
367             else
368               {
369               intersection_list[i] = 0;
370               require_list = intersection_list;
371               }
372             }
373           break;
374 
375           /* The previously set required script is a single script, not
376           Han-related. Check that it is in this character's list. */
377 
378           default:
379           for (; *list != 0; list++)
380             {
381             if (*list == require_script) break;
382             }
383           if (*list == 0) return FALSE;
384           break;
385           }
386         }  /* End of handling negative scriptx */
387       }    /* End of checking non-Common character */
388 
389     /* The character is in an acceptable script. We must now ensure that all
390     decimal digits in the string come from the same set. Some scripts (e.g.
391     Common, Arabic) have more than one set of decimal digits. This code does
392     not allow mixing sets, even within the same script. The vector called
393     PRIV(ucd_digit_sets)[] contains, in its first element, the number of
394     following elements, and then, in ascending order, the code points of the
395     '9' characters in every set of 10 digits. Each set is identified by the
396     offset in the vector of its '9' character. An initial check of the first
397     value picks up ASCII digits quickly. Otherwise, a binary chop is used. */
398 
399     if (ucd->chartype == ucp_Nd)
400       {
401       uint32_t digitset;
402 
403       if (c <= PRIV(ucd_digit_sets)[1]) digitset = 1; else
404         {
405         int mid;
406         int bot = 1;
407         int top = PRIV(ucd_digit_sets)[0];
408         for (;;)
409           {
410           if (top <= bot + 1)    /* <= rather than == is paranoia */
411             {
412             digitset = top;
413             break;
414             }
415           mid = (top + bot) / 2;
416           if (c <= PRIV(ucd_digit_sets)[mid]) top = mid; else bot = mid;
417           }
418         }
419 
420       /* A required value of 0 means "unset". */
421 
422       if (require_digitset == 0) require_digitset = digitset;
423         else if (digitset != require_digitset) return FALSE;
424       }   /* End digit handling */
425     }     /* End checking non-Inherited character */
426 
427   /* If we haven't yet got to the end, pick up the next character. */
428 
429   if (ptr >= endptr) return TRUE;
430   GETCHARINCTEST(c, ptr);
431   }  /* End checking loop */
432 
433 #else   /* NOT SUPPORT_UNICODE */
434 (void)ptr;
435 (void)endptr;
436 (void)utf;
437 return TRUE;
438 #endif  /* SUPPORT_UNICODE */
439 }
440 
441 /* End of pcre2_script_run.c */
442