1 /*************************************************
2 *      Perl-Compatible Regular Expressions       *
3 *************************************************/
4 
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7 
8                        Written by Philip Hazel
9      Original API code Copyright (c) 1997-2012 University of Cambridge
10           New API code Copyright (c) 2016-2021 University of Cambridge
11 
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15 
16     * Redistributions of source code must retain the above copyright notice,
17       this list of conditions and the following disclaimer.
18 
19     * Redistributions in binary form must reproduce the above copyright
20       notice, this list of conditions and the following disclaimer in the
21       documentation and/or other materials provided with the distribution.
22 
23     * Neither the name of the University of Cambridge nor the names of its
24       contributors may be used to endorse or promote products derived from
25       this software without specific prior written permission.
26 
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40 
41 /* This module contains the function for checking a script run. */
42 
43 #ifdef HAVE_CONFIG_H
44 #include "config.h"
45 #endif
46 
47 #include "pcre2_internal.h"
48 
49 
50 /*************************************************
51 *                Check script run                *
52 *************************************************/
53 
54 /* A script run is conceptually a sequence of characters all in the same
55 Unicode script. However, it isn't quite that simple. There are special rules
56 for scripts that are commonly used together, and also special rules for digits.
57 This function implements the appropriate checks, which is possible only when
58 PCRE2 is compiled with Unicode support. The function returns TRUE if there is
59 no Unicode support; however, it should never be called in that circumstance
60 because an error is given by pcre2_compile() if a script run is called for in a
61 version of PCRE2 compiled without Unicode support.
62 
63 Arguments:
  ptr       points to the first character
  endptr    points after the last character
66   utf       TRUE if in UTF mode
67 
68 Returns:    TRUE if this is a valid script run
69 */
70 
71 /* These are states in the checking process. */
72 
enum {
  SCRIPT_UNSET,         /* No requirement has been established yet */
  SCRIPT_MAP,           /* A bitmap holds the set of acceptable scripts */
  SCRIPT_HANPENDING,    /* Only Han characters have been seen so far */
  SCRIPT_HANHIRAKATA,   /* Expect Han, Hiragana, or Katakana */
  SCRIPT_HANBOPOMOFO,   /* Expect Han or Bopomofo */
  SCRIPT_HANHANGUL      /* Expect Han or Hangul */
};
80 
/* Sizes, counted in 32-bit words, of the two kinds of script bitmap used when
checking a script run. UCD_MAPSIZE is the size of the maps that only have bits
for scripts numbered up to ucp_Unknown; FULL_MAPSIZE is the size of a map that
has a bit for every script. */

#define UCD_MAPSIZE  (1 + ucp_Unknown/32)
#define FULL_MAPSIZE (1 + ucp_Script_Count/32)
83 
84 BOOL
PRIV(script_run)85 PRIV(script_run)(PCRE2_SPTR ptr, PCRE2_SPTR endptr, BOOL utf)
86 {
87 #ifdef SUPPORT_UNICODE
88 uint32_t require_state = SCRIPT_UNSET;
89 uint32_t require_map[FULL_MAPSIZE];
90 uint32_t map[FULL_MAPSIZE];
91 uint32_t require_digitset = 0;
92 uint32_t c;
93 
94 #if PCRE2_CODE_UNIT_WIDTH == 32
95 (void)utf;    /* Avoid compiler warning */
96 #endif
97 
98 /* Any string containing fewer than 2 characters is a valid script run. */
99 
100 if (ptr >= endptr) return TRUE;
101 GETCHARINCTEST(c, ptr);
102 if (ptr >= endptr) return TRUE;
103 
104 /* Initialize the require map. This is a full-size bitmap that has a bit for
105 every script, as opposed to the maps in ucd_script_sets, which only have bits
106 for scripts less than ucp_Unknown - those that appear in script extension
107 lists. */
108 
109 for (int i = 0; i < FULL_MAPSIZE; i++) require_map[i] = 0;
110 
111 /* Scan strings of two or more characters, checking the Unicode characteristics
112 of each code point. There is special code for scripts that can be combined with
113 characters from the Han Chinese script. This may be used in conjunction with
114 four other scripts in these combinations:
115 
116 . Han with Hiragana and Katakana is allowed (for Japanese).
117 . Han with Bopomofo is allowed (for Taiwanese Mandarin).
118 . Han with Hangul is allowed (for Korean).
119 
120 If the first significant character's script is one of the four, the required
121 script type is immediately known. However, if the first significant
122 character's script is Han, we have to keep checking for a non-Han character.
123 Hence the SCRIPT_HANPENDING state. */
124 
125 for (;;)
126   {
127   const ucd_record *ucd = GET_UCD(c);
128   uint32_t script = ucd->script;
129 
130   /* If the script is Unknown, the string is not a valid script run. Such
131   characters can only form script runs of length one (see test above). */
132 
133   if (script == ucp_Unknown) return FALSE;
134 
135   /* A character without any script extensions whose script is Inherited or
136   Common is always accepted with any script. If there are extensions, the
137   following processing happens for all scripts. */
138 
139   if (UCD_SCRIPTX_PROP(ucd) != 0 || (script != ucp_Inherited && script != ucp_Common))
140     {
141     BOOL OK;
142 
143     /* Set up a full-sized map for this character that can include bits for all
144     scripts. Copy the scriptx map for this character (which covers those
145     scripts that appear in script extension lists), set the remaining values to
146     zero, and then, except for Common or Inherited, add this script's bit to
147     the map. */
148 
149     memcpy(map, PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(ucd), UCD_MAPSIZE * sizeof(uint32_t));
150     memset(map + UCD_MAPSIZE, 0, (FULL_MAPSIZE - UCD_MAPSIZE) * sizeof(uint32_t));
151     if (script != ucp_Common && script != ucp_Inherited) MAPSET(map, script);
152 
153     /* Handle the different checking states */
154 
155     switch(require_state)
156       {
157       /* First significant character - it might follow Common or Inherited
158       characters that do not have any script extensions. */
159 
160       case SCRIPT_UNSET:
161       switch(script)
162         {
163         case ucp_Han:
164         require_state = SCRIPT_HANPENDING;
165         break;
166 
167         case ucp_Hiragana:
168         case ucp_Katakana:
169         require_state = SCRIPT_HANHIRAKATA;
170         break;
171 
172         case ucp_Bopomofo:
173         require_state = SCRIPT_HANBOPOMOFO;
174         break;
175 
176         case ucp_Hangul:
177         require_state = SCRIPT_HANHANGUL;
178         break;
179 
180         default:
181         memcpy(require_map, map, FULL_MAPSIZE * sizeof(uint32_t));
182         require_state = SCRIPT_MAP;
183         break;
184         }
185       break;
186 
187       /* The first significant character was Han. An inspection of the Unicode
188       11.0.0 files shows that there are the following types of Script Extension
189       list that involve the Han, Bopomofo, Hiragana, Katakana, and Hangul
190       scripts:
191 
192       . Bopomofo + Han
193       . Han + Hiragana + Katakana
194       . Hiragana + Katakana
195       . Bopopmofo + Hangul + Han + Hiragana + Katakana
196 
197       The following code tries to make sense of this. */
198 
199 #define FOUND_BOPOMOFO 1
200 #define FOUND_HIRAGANA 2
201 #define FOUND_KATAKANA 4
202 #define FOUND_HANGUL   8
203 
204       case SCRIPT_HANPENDING:
205       if (script != ucp_Han)   /* Another Han does nothing */
206         {
207         uint32_t chspecial = 0;
208 
209         if (MAPBIT(map, ucp_Bopomofo) != 0) chspecial |= FOUND_BOPOMOFO;
210         if (MAPBIT(map, ucp_Hiragana) != 0) chspecial |= FOUND_HIRAGANA;
211         if (MAPBIT(map, ucp_Katakana) != 0) chspecial |= FOUND_KATAKANA;
212         if (MAPBIT(map, ucp_Hangul) != 0)   chspecial |= FOUND_HANGUL;
213 
214         if (chspecial == 0) return FALSE;   /* Not allowed with Han */
215 
216         if (chspecial == FOUND_BOPOMOFO)
217           require_state = SCRIPT_HANBOPOMOFO;
218         else if (chspecial == (FOUND_HIRAGANA|FOUND_KATAKANA))
219           require_state = SCRIPT_HANHIRAKATA;
220 
221         /* Otherwise this character must be allowed with all of them, so remain
222         in the pending state. */
223         }
224       break;
225 
226       /* Previously encountered one of the "with Han" scripts. Check that
227       this character is appropriate. */
228 
229       case SCRIPT_HANHIRAKATA:
230       if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Hiragana) +
231           MAPBIT(map, ucp_Katakana) == 0) return FALSE;
232       break;
233 
234       case SCRIPT_HANBOPOMOFO:
235       if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Bopomofo) == 0) return FALSE;
236       break;
237 
238       case SCRIPT_HANHANGUL:
239       if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Hangul) == 0) return FALSE;
240       break;
241 
242       /* Previously encountered one or more characters that are allowed with a
243       list of scripts. */
244 
245       case SCRIPT_MAP:
246       OK = FALSE;
247 
248       for (int i = 0; i < FULL_MAPSIZE; i++)
249         {
250         if ((require_map[i] & map[i]) != 0)
251           {
252           OK = TRUE;
253           break;
254           }
255         }
256 
257       if (!OK) return FALSE;
258 
259       /* The rest of the string must be in this script, but we have to
260       allow for the Han complications. */
261 
262       switch(script)
263         {
264         case ucp_Han:
265         require_state = SCRIPT_HANPENDING;
266         break;
267 
268         case ucp_Hiragana:
269         case ucp_Katakana:
270         require_state = SCRIPT_HANHIRAKATA;
271         break;
272 
273         case ucp_Bopomofo:
274         require_state = SCRIPT_HANBOPOMOFO;
275         break;
276 
277         case ucp_Hangul:
278         require_state = SCRIPT_HANHANGUL;
279         break;
280 
281         /* Compute the intersection of the required list of scripts and the
282         allowed scripts for this character. */
283 
284         default:
285         for (int i = 0; i < FULL_MAPSIZE; i++) require_map[i] &= map[i];
286         break;
287         }
288 
289       break;
290       }
291     }   /* End checking character's script and extensions. */
292 
293   /* The character is in an acceptable script. We must now ensure that all
294   decimal digits in the string come from the same set. Some scripts (e.g.
295   Common, Arabic) have more than one set of decimal digits. This code does
296   not allow mixing sets, even within the same script. The vector called
297   PRIV(ucd_digit_sets)[] contains, in its first element, the number of
298   following elements, and then, in ascending order, the code points of the
299   '9' characters in every set of 10 digits. Each set is identified by the
300   offset in the vector of its '9' character. An initial check of the first
301   value picks up ASCII digits quickly. Otherwise, a binary chop is used. */
302 
303   if (ucd->chartype == ucp_Nd)
304     {
305     uint32_t digitset;
306 
307     if (c <= PRIV(ucd_digit_sets)[1]) digitset = 1; else
308       {
309       int mid;
310       int bot = 1;
311       int top = PRIV(ucd_digit_sets)[0];
312       for (;;)
313         {
314         if (top <= bot + 1)    /* <= rather than == is paranoia */
315           {
316           digitset = top;
317           break;
318           }
319         mid = (top + bot) / 2;
320         if (c <= PRIV(ucd_digit_sets)[mid]) top = mid; else bot = mid;
321         }
322       }
323 
324     /* A required value of 0 means "unset". */
325 
326     if (require_digitset == 0) require_digitset = digitset;
327       else if (digitset != require_digitset) return FALSE;
328     }   /* End digit handling */
329 
330   /* If we haven't yet got to the end, pick up the next character. */
331 
332   if (ptr >= endptr) return TRUE;
333   GETCHARINCTEST(c, ptr);
334   }  /* End checking loop */
335 
336 #else   /* NOT SUPPORT_UNICODE */
337 (void)ptr;
338 (void)endptr;
339 (void)utf;
340 return TRUE;
341 #endif  /* SUPPORT_UNICODE */
342 }
343 
344 /* End of pcre2_script_run.c */
345