xref: /php-src/ext/pcre/pcre2lib/pcre2_ucp.h (revision ae5beff6)
1 /*************************************************
2 *      Perl-Compatible Regular Expressions       *
3 *************************************************/
4 
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7 
8                        Written by Philip Hazel
9      Original API code Copyright (c) 1997-2012 University of Cambridge
10           New API code Copyright (c) 2016-2022 University of Cambridge
11 
12 This module is auto-generated from Unicode data files. DO NOT EDIT MANUALLY!
13 Instead, modify the maint/GenerateUcpHeader.py script and run it to generate
14 a new version of this code.
15 
16 -----------------------------------------------------------------------------
17 Redistribution and use in source and binary forms, with or without
18 modification, are permitted provided that the following conditions are met:
19 
20     * Redistributions of source code must retain the above copyright notice,
21       this list of conditions and the following disclaimer.
22 
23     * Redistributions in binary form must reproduce the above copyright
24       notice, this list of conditions and the following disclaimer in the
25       documentation and/or other materials provided with the distribution.
26 
27     * Neither the name of the University of Cambridge nor the names of its
28       contributors may be used to endorse or promote products derived from
29       this software without specific prior written permission.
30 
31 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
32 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
33 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
34 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
35 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
36 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
37 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
38 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
39 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
40 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
41 POSSIBILITY OF SUCH DAMAGE.
42 -----------------------------------------------------------------------------
43 */
44 
45 #ifndef PCRE2_UCP_H_IDEMPOTENT_GUARD
46 #define PCRE2_UCP_H_IDEMPOTENT_GUARD
47 
48 /* This file contains definitions of the Unicode property values that are
49 returned by the UCD access macros and used throughout PCRE2.
50 
51 IMPORTANT: The specific values of the first two enums (general and particular
52 character categories) are assumed by the table called catposstab in the file
53 pcre2_auto_possess.c. They are unlikely to change, but should be checked after
54 an update. */
55 
/* General character categories: one value per leading letter of the
two-letter particular categories (ucp_Cc, ucp_Ll, ...).

The numeric values are spelled out because the catposstab table in
pcre2_auto_possess.c assumes them; re-check that table if they ever change. */

enum {
  ucp_C = 0,   /* Groups Cc, Cf, Cn, Co, Cs */
  ucp_L = 1,   /* Letters */
  ucp_M = 2,   /* Marks */
  ucp_N = 3,   /* Numbers */
  ucp_P = 4,   /* Punctuation */
  ucp_S = 5,   /* Symbols */
  ucp_Z = 6    /* Separators */
};
67 
/* Particular (two-letter) character categories, grouped here by their
leading letter.

The numeric values are spelled out because the catposstab table in
pcre2_auto_possess.c assumes them; re-check that table if they ever change. */

enum {
  /* C* */
  ucp_Cc = 0,    /* Control */
  ucp_Cf = 1,    /* Format */
  ucp_Cn = 2,    /* Unassigned */
  ucp_Co = 3,    /* Private use */
  ucp_Cs = 4,    /* Surrogate */

  /* L* */
  ucp_Ll = 5,    /* Lower case letter */
  ucp_Lm = 6,    /* Modifier letter */
  ucp_Lo = 7,    /* Other letter */
  ucp_Lt = 8,    /* Title case letter */
  ucp_Lu = 9,    /* Upper case letter */

  /* M* */
  ucp_Mc = 10,   /* Spacing mark */
  ucp_Me = 11,   /* Enclosing mark */
  ucp_Mn = 12,   /* Non-spacing mark */

  /* N* */
  ucp_Nd = 13,   /* Decimal number */
  ucp_Nl = 14,   /* Letter number */
  ucp_No = 15,   /* Other number */

  /* P* */
  ucp_Pc = 16,   /* Connector punctuation */
  ucp_Pd = 17,   /* Dash punctuation */
  ucp_Pe = 18,   /* Close punctuation */
  ucp_Pf = 19,   /* Final punctuation */
  ucp_Pi = 20,   /* Initial punctuation */
  ucp_Po = 21,   /* Other punctuation */
  ucp_Ps = 22,   /* Open punctuation */

  /* S* */
  ucp_Sc = 23,   /* Currency symbol */
  ucp_Sk = 24,   /* Modifier symbol */
  ucp_Sm = 25,   /* Mathematical symbol */
  ucp_So = 26,   /* Other symbol */

  /* Z* */
  ucp_Zl = 27,   /* Line separator */
  ucp_Zp = 28,   /* Paragraph separator */
  ucp_Zs = 29    /* Space separator */
};
102 
/* Boolean (yes/no) properties. The number in front of each name is its
enumerator value. ucp_Bprop_Count is not a property: it has to remain the
final enumerator so that its value equals the number of properties above it. */

enum {
  /*  0 */ ucp_ASCII = 0,
  /*  1 */ ucp_ASCII_Hex_Digit,
  /*  2 */ ucp_Alphabetic,
  /*  3 */ ucp_Bidi_Control,
  /*  4 */ ucp_Bidi_Mirrored,
  /*  5 */ ucp_Case_Ignorable,
  /*  6 */ ucp_Cased,
  /*  7 */ ucp_Changes_When_Casefolded,
  /*  8 */ ucp_Changes_When_Casemapped,
  /*  9 */ ucp_Changes_When_Lowercased,
  /* 10 */ ucp_Changes_When_Titlecased,
  /* 11 */ ucp_Changes_When_Uppercased,
  /* 12 */ ucp_Dash,
  /* 13 */ ucp_Default_Ignorable_Code_Point,
  /* 14 */ ucp_Deprecated,
  /* 15 */ ucp_Diacritic,
  /* 16 */ ucp_Emoji,
  /* 17 */ ucp_Emoji_Component,
  /* 18 */ ucp_Emoji_Modifier,
  /* 19 */ ucp_Emoji_Modifier_Base,
  /* 20 */ ucp_Emoji_Presentation,
  /* 21 */ ucp_Extended_Pictographic,
  /* 22 */ ucp_Extender,
  /* 23 */ ucp_Grapheme_Base,
  /* 24 */ ucp_Grapheme_Extend,
  /* 25 */ ucp_Grapheme_Link,
  /* 26 */ ucp_Hex_Digit,
  /* 27 */ ucp_IDS_Binary_Operator,
  /* 28 */ ucp_IDS_Trinary_Operator,
  /* 29 */ ucp_ID_Continue,
  /* 30 */ ucp_ID_Start,
  /* 31 */ ucp_Ideographic,
  /* 32 */ ucp_Join_Control,
  /* 33 */ ucp_Logical_Order_Exception,
  /* 34 */ ucp_Lowercase,
  /* 35 */ ucp_Math,
  /* 36 */ ucp_Noncharacter_Code_Point,
  /* 37 */ ucp_Pattern_Syntax,
  /* 38 */ ucp_Pattern_White_Space,
  /* 39 */ ucp_Prepended_Concatenation_Mark,
  /* 40 */ ucp_Quotation_Mark,
  /* 41 */ ucp_Radical,
  /* 42 */ ucp_Regional_Indicator,
  /* 43 */ ucp_Sentence_Terminal,
  /* 44 */ ucp_Soft_Dotted,
  /* 45 */ ucp_Terminal_Punctuation,
  /* 46 */ ucp_Unified_Ideograph,
  /* 47 */ ucp_Uppercase,
  /* 48 */ ucp_Variation_Selector,
  /* 49 */ ucp_White_Space,
  /* 50 */ ucp_XID_Continue,
  /* 51 */ ucp_XID_Start,

  /* Sentinel (52): keep as the last enumerator */
  ucp_Bprop_Count
};
161 
/* Size of entries in ucd_boolprop_sets[].

NOTE(review): the table itself is not defined in this header. Presumably the
size is counted in table elements and each entry is a bit set with one bit per
Boolean property, in which case it must provide at least ucp_Bprop_Count bits.
Confirm the element width where the table is defined before changing this. */

#define ucd_boolprop_sets_item_size 2
165 
/* Bidi class values. The comment beside each enumerator gives the long form
of the class name that the abbreviation stands for. */

enum {
  ucp_bidiAL  = 0,    /* Arabic_Letter */
  ucp_bidiAN  = 1,    /* Arabic_Number */
  ucp_bidiB   = 2,    /* Paragraph_Separator */
  ucp_bidiBN  = 3,    /* Boundary_Neutral */
  ucp_bidiCS  = 4,    /* Common_Separator */
  ucp_bidiEN  = 5,    /* European_Number */
  ucp_bidiES  = 6,    /* European_Separator */
  ucp_bidiET  = 7,    /* European_Terminator */
  ucp_bidiFSI = 8,    /* First_Strong_Isolate */
  ucp_bidiL   = 9,    /* Left_To_Right */
  ucp_bidiLRE = 10,   /* Left_To_Right_Embedding */
  ucp_bidiLRI = 11,   /* Left_To_Right_Isolate */
  ucp_bidiLRO = 12,   /* Left_To_Right_Override */
  ucp_bidiNSM = 13,   /* Nonspacing_Mark */
  ucp_bidiON  = 14,   /* Other_Neutral */
  ucp_bidiPDF = 15,   /* Pop_Directional_Format */
  ucp_bidiPDI = 16,   /* Pop_Directional_Isolate */
  ucp_bidiR   = 17,   /* Right_To_Left */
  ucp_bidiRLE = 18,   /* Right_To_Left_Embedding */
  ucp_bidiRLI = 19,   /* Right_To_Left_Isolate */
  ucp_bidiRLO = 20,   /* Right_To_Left_Override */
  ucp_bidiS   = 21,   /* Segment_Separator */
  ucp_bidiWS  = 22    /* White_Space */
};
193 
/* Grapheme break property values, written with explicit numbers. The
Extended Pictographic value is the one taken from the emoji-data.txt file. */

enum {
  ucp_gbCR                    = 0,
  ucp_gbLF                    = 1,
  ucp_gbControl               = 2,
  ucp_gbExtend                = 3,
  ucp_gbPrepend               = 4,
  ucp_gbSpacingMark           = 5,
  ucp_gbL                     = 6,    /* Hangul syllable type L */
  ucp_gbV                     = 7,    /* Hangul syllable type V */
  ucp_gbT                     = 8,    /* Hangul syllable type T */
  ucp_gbLV                    = 9,    /* Hangul syllable type LV */
  ucp_gbLVT                   = 10,   /* Hangul syllable type LVT */
  ucp_gbRegional_Indicator    = 11,
  ucp_gbOther                 = 12,
  ucp_gbZWJ                   = 13,
  ucp_gbExtended_Pictographic = 14
};
214 
/* Script identifications, in two groups. The number in front of each name is
its enumerator value. ucp_Script_Count is not a script: it has to remain the
final enumerator so that its value equals the number of scripts above it. */

enum {
  /* Group 1: scripts that have characters in other scripts. */
  /*   0 */ ucp_Latin = 0,
  /*   1 */ ucp_Greek,
  /*   2 */ ucp_Cyrillic,
  /*   3 */ ucp_Arabic,
  /*   4 */ ucp_Syriac,
  /*   5 */ ucp_Thaana,
  /*   6 */ ucp_Devanagari,
  /*   7 */ ucp_Bengali,
  /*   8 */ ucp_Gurmukhi,
  /*   9 */ ucp_Gujarati,
  /*  10 */ ucp_Oriya,
  /*  11 */ ucp_Tamil,
  /*  12 */ ucp_Telugu,
  /*  13 */ ucp_Kannada,
  /*  14 */ ucp_Malayalam,
  /*  15 */ ucp_Sinhala,
  /*  16 */ ucp_Myanmar,
  /*  17 */ ucp_Georgian,
  /*  18 */ ucp_Hangul,
  /*  19 */ ucp_Mongolian,
  /*  20 */ ucp_Hiragana,
  /*  21 */ ucp_Katakana,
  /*  22 */ ucp_Bopomofo,
  /*  23 */ ucp_Han,
  /*  24 */ ucp_Yi,
  /*  25 */ ucp_Tagalog,
  /*  26 */ ucp_Hanunoo,
  /*  27 */ ucp_Buhid,
  /*  28 */ ucp_Tagbanwa,
  /*  29 */ ucp_Limbu,
  /*  30 */ ucp_Tai_Le,
  /*  31 */ ucp_Linear_B,
  /*  32 */ ucp_Cypriot,
  /*  33 */ ucp_Buginese,
  /*  34 */ ucp_Coptic,
  /*  35 */ ucp_Glagolitic,
  /*  36 */ ucp_Syloti_Nagri,
  /*  37 */ ucp_Phags_Pa,
  /*  38 */ ucp_Nko,
  /*  39 */ ucp_Kayah_Li,
  /*  40 */ ucp_Javanese,
  /*  41 */ ucp_Kaithi,
  /*  42 */ ucp_Mandaic,
  /*  43 */ ucp_Chakma,
  /*  44 */ ucp_Sharada,
  /*  45 */ ucp_Takri,
  /*  46 */ ucp_Duployan,
  /*  47 */ ucp_Grantha,
  /*  48 */ ucp_Khojki,
  /*  49 */ ucp_Linear_A,
  /*  50 */ ucp_Mahajani,
  /*  51 */ ucp_Manichaean,
  /*  52 */ ucp_Modi,
  /*  53 */ ucp_Old_Permic,
  /*  54 */ ucp_Psalter_Pahlavi,
  /*  55 */ ucp_Khudawadi,
  /*  56 */ ucp_Tirhuta,
  /*  57 */ ucp_Multani,
  /*  58 */ ucp_Adlam,
  /*  59 */ ucp_Masaram_Gondi,
  /*  60 */ ucp_Dogra,
  /*  61 */ ucp_Gunjala_Gondi,
  /*  62 */ ucp_Hanifi_Rohingya,
  /*  63 */ ucp_Sogdian,
  /*  64 */ ucp_Nandinagari,
  /*  65 */ ucp_Yezidi,
  /*  66 */ ucp_Cypro_Minoan,
  /*  67 */ ucp_Old_Uyghur,

  /* Group 2: scripts that have no characters in other scripts. */
  /*  68 */ ucp_Unknown,
  /*  69 */ ucp_Common,
  /*  70 */ ucp_Armenian,
  /*  71 */ ucp_Hebrew,
  /*  72 */ ucp_Thai,
  /*  73 */ ucp_Lao,
  /*  74 */ ucp_Tibetan,
  /*  75 */ ucp_Ethiopic,
  /*  76 */ ucp_Cherokee,
  /*  77 */ ucp_Canadian_Aboriginal,
  /*  78 */ ucp_Ogham,
  /*  79 */ ucp_Runic,
  /*  80 */ ucp_Khmer,
  /*  81 */ ucp_Old_Italic,
  /*  82 */ ucp_Gothic,
  /*  83 */ ucp_Deseret,
  /*  84 */ ucp_Inherited,
  /*  85 */ ucp_Ugaritic,
  /*  86 */ ucp_Shavian,
  /*  87 */ ucp_Osmanya,
  /*  88 */ ucp_Braille,
  /*  89 */ ucp_New_Tai_Lue,
  /*  90 */ ucp_Tifinagh,
  /*  91 */ ucp_Old_Persian,
  /*  92 */ ucp_Kharoshthi,
  /*  93 */ ucp_Balinese,
  /*  94 */ ucp_Cuneiform,
  /*  95 */ ucp_Phoenician,
  /*  96 */ ucp_Sundanese,
  /*  97 */ ucp_Lepcha,
  /*  98 */ ucp_Ol_Chiki,
  /*  99 */ ucp_Vai,
  /* 100 */ ucp_Saurashtra,
  /* 101 */ ucp_Rejang,
  /* 102 */ ucp_Lycian,
  /* 103 */ ucp_Carian,
  /* 104 */ ucp_Lydian,
  /* 105 */ ucp_Cham,
  /* 106 */ ucp_Tai_Tham,
  /* 107 */ ucp_Tai_Viet,
  /* 108 */ ucp_Avestan,
  /* 109 */ ucp_Egyptian_Hieroglyphs,
  /* 110 */ ucp_Samaritan,
  /* 111 */ ucp_Lisu,
  /* 112 */ ucp_Bamum,
  /* 113 */ ucp_Meetei_Mayek,
  /* 114 */ ucp_Imperial_Aramaic,
  /* 115 */ ucp_Old_South_Arabian,
  /* 116 */ ucp_Inscriptional_Parthian,
  /* 117 */ ucp_Inscriptional_Pahlavi,
  /* 118 */ ucp_Old_Turkic,
  /* 119 */ ucp_Batak,
  /* 120 */ ucp_Brahmi,
  /* 121 */ ucp_Meroitic_Cursive,
  /* 122 */ ucp_Meroitic_Hieroglyphs,
  /* 123 */ ucp_Miao,
  /* 124 */ ucp_Sora_Sompeng,
  /* 125 */ ucp_Caucasian_Albanian,
  /* 126 */ ucp_Bassa_Vah,
  /* 127 */ ucp_Elbasan,
  /* 128 */ ucp_Pahawh_Hmong,
  /* 129 */ ucp_Mende_Kikakui,
  /* 130 */ ucp_Mro,
  /* 131 */ ucp_Old_North_Arabian,
  /* 132 */ ucp_Nabataean,
  /* 133 */ ucp_Palmyrene,
  /* 134 */ ucp_Pau_Cin_Hau,
  /* 135 */ ucp_Siddham,
  /* 136 */ ucp_Warang_Citi,
  /* 137 */ ucp_Ahom,
  /* 138 */ ucp_Anatolian_Hieroglyphs,
  /* 139 */ ucp_Hatran,
  /* 140 */ ucp_Old_Hungarian,
  /* 141 */ ucp_SignWriting,
  /* 142 */ ucp_Bhaiksuki,
  /* 143 */ ucp_Marchen,
  /* 144 */ ucp_Newa,
  /* 145 */ ucp_Osage,
  /* 146 */ ucp_Tangut,
  /* 147 */ ucp_Nushu,
  /* 148 */ ucp_Soyombo,
  /* 149 */ ucp_Zanabazar_Square,
  /* 150 */ ucp_Makasar,
  /* 151 */ ucp_Medefaidrin,
  /* 152 */ ucp_Old_Sogdian,
  /* 153 */ ucp_Elymaic,
  /* 154 */ ucp_Nyiakeng_Puachue_Hmong,
  /* 155 */ ucp_Wancho,
  /* 156 */ ucp_Chorasmian,
  /* 157 */ ucp_Dives_Akuru,
  /* 158 */ ucp_Khitan_Small_Script,
  /* 159 */ ucp_Tangsa,
  /* 160 */ ucp_Toto,
  /* 161 */ ucp_Vithkuqi,
  /* 162 */ ucp_Kawi,
  /* 163 */ ucp_Nag_Mundari,

  /* Sentinel (164): keep as the last enumerator */
  ucp_Script_Count
};
389 
/* Size of entries in ucd_script_sets[].

NOTE(review): the table itself is not defined in this header. Presumably the
size is counted in table elements and each entry is a bit set indexed by
script number. Confirm the element width, and which scripts need a bit, where
the table is defined before changing this. */

#define ucd_script_sets_item_size 3
393 
394 #endif  /* PCRE2_UCP_H_IDEMPOTENT_GUARD */
395 
396 /* End of pcre2_ucp.h */
397