xref: /PHP-7.4/ext/pcre/pcre2lib/pcre2_ucp.h (revision 9f2d0395)
1 /*************************************************
2 *      Perl-Compatible Regular Expressions       *
3 *************************************************/
4 
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7 
8                        Written by Philip Hazel
9      Original API code Copyright (c) 1997-2012 University of Cambridge
10           New API code Copyright (c) 2016-2018 University of Cambridge
11 
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15 
16     * Redistributions of source code must retain the above copyright notice,
17       this list of conditions and the following disclaimer.
18 
19     * Redistributions in binary form must reproduce the above copyright
20       notice, this list of conditions and the following disclaimer in the
21       documentation and/or other materials provided with the distribution.
22 
23     * Neither the name of the University of Cambridge nor the names of its
24       contributors may be used to endorse or promote products derived from
25       this software without specific prior written permission.
26 
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40 
41 
42 #ifndef PCRE2_UCP_H_IDEMPOTENT_GUARD
43 #define PCRE2_UCP_H_IDEMPOTENT_GUARD
44 
/* This file contains definitions of the property values that are returned by
the UCD access macros. New values that are added for new releases of Unicode
should always be at the end of each enum, for backwards compatibility.

IMPORTANT: Note also that the specific numeric values of the enums have to be
the same as the values that are generated by the maint/MultiStage2.py script,
where the equivalent property descriptive names are listed in vectors.

ALSO: The specific values of the first two enums are assumed for the table
called catposstab in pcre2_compile.c. */
55 
/* These are the general character categories. The enumerators take the
consecutive values 0..6 in the order listed; that order must not change,
and any new value must be appended at the end. */

enum {
  ucp_C,     /* 0 Other */
  ucp_L,     /* 1 Letter */
  ucp_M,     /* 2 Mark */
  ucp_N,     /* 3 Number */
  ucp_P,     /* 4 Punctuation */
  ucp_S,     /* 5 Symbol */
  ucp_Z      /* 6 Separator */
};
67 
/* These are the particular character categories. The enumerators take the
consecutive values 0..29 in the order listed; that order must not change,
and any new value must be appended at the end. */

enum {
  /* Other */
  ucp_Cc,    /*  0 Control */
  ucp_Cf,    /*  1 Format */
  ucp_Cn,    /*  2 Unassigned */
  ucp_Co,    /*  3 Private use */
  ucp_Cs,    /*  4 Surrogate */
  /* Letter */
  ucp_Ll,    /*  5 Lower case letter */
  ucp_Lm,    /*  6 Modifier letter */
  ucp_Lo,    /*  7 Other letter */
  ucp_Lt,    /*  8 Title case letter */
  ucp_Lu,    /*  9 Upper case letter */
  /* Mark */
  ucp_Mc,    /* 10 Spacing mark */
  ucp_Me,    /* 11 Enclosing mark */
  ucp_Mn,    /* 12 Non-spacing mark */
  /* Number */
  ucp_Nd,    /* 13 Decimal number */
  ucp_Nl,    /* 14 Letter number */
  ucp_No,    /* 15 Other number */
  /* Punctuation */
  ucp_Pc,    /* 16 Connector punctuation */
  ucp_Pd,    /* 17 Dash punctuation */
  ucp_Pe,    /* 18 Close punctuation */
  ucp_Pf,    /* 19 Final punctuation */
  ucp_Pi,    /* 20 Initial punctuation */
  ucp_Po,    /* 21 Other punctuation */
  ucp_Ps,    /* 22 Open punctuation */
  /* Symbol */
  ucp_Sc,    /* 23 Currency symbol */
  ucp_Sk,    /* 24 Modifier symbol */
  ucp_Sm,    /* 25 Mathematical symbol */
  ucp_So,    /* 26 Other symbol */
  /* Separator */
  ucp_Zl,    /* 27 Line separator */
  ucp_Zp,    /* 28 Paragraph separator */
  ucp_Zs     /* 29 Space separator */
};
102 
/* These are grapheme break properties. The Extended Pictographic property
comes from the emoji-data.txt file. The number in each comment is the
enumerator's value; the order must not change, and any new value must be
appended at the end. */

enum {
  ucp_gbCR,                    /*  0 */
  ucp_gbLF,                    /*  1 */
  ucp_gbControl,               /*  2 */
  ucp_gbExtend,                /*  3 */
  ucp_gbPrepend,               /*  4 */
  ucp_gbSpacingMark,           /*  5 */
  ucp_gbL,                     /*  6 Hangul syllable type L */
  ucp_gbV,                     /*  7 Hangul syllable type V */
  ucp_gbT,                     /*  8 Hangul syllable type T */
  ucp_gbLV,                    /*  9 Hangul syllable type LV */
  ucp_gbLVT,                   /* 10 Hangul syllable type LVT */
  ucp_gbRegionalIndicator,     /* 11 */
  ucp_gbOther,                 /* 12 */
  ucp_gbZWJ,                   /* 13 */
  ucp_gbExtended_Pictographic  /* 14 */
};
123 
/* These are the script identifications. The enumerators take the consecutive
values 0..156 in the order listed. Scripts introduced by later Unicode
releases are appended in groups; the value range of each group is noted in
its heading comment. The order must not change, and any new script must be
appended at the end. */

enum {
  /* Original set (values 0..61) */
  ucp_Unknown,
  ucp_Arabic,
  ucp_Armenian,
  ucp_Bengali,
  ucp_Bopomofo,
  ucp_Braille,
  ucp_Buginese,
  ucp_Buhid,
  ucp_Canadian_Aboriginal,
  ucp_Cherokee,
  ucp_Common,
  ucp_Coptic,
  ucp_Cypriot,
  ucp_Cyrillic,
  ucp_Deseret,
  ucp_Devanagari,
  ucp_Ethiopic,
  ucp_Georgian,
  ucp_Glagolitic,
  ucp_Gothic,
  ucp_Greek,
  ucp_Gujarati,
  ucp_Gurmukhi,
  ucp_Han,
  ucp_Hangul,
  ucp_Hanunoo,
  ucp_Hebrew,
  ucp_Hiragana,
  ucp_Inherited,
  ucp_Kannada,
  ucp_Katakana,
  ucp_Kharoshthi,
  ucp_Khmer,
  ucp_Lao,
  ucp_Latin,
  ucp_Limbu,
  ucp_Linear_B,
  ucp_Malayalam,
  ucp_Mongolian,
  ucp_Myanmar,
  ucp_New_Tai_Lue,
  ucp_Ogham,
  ucp_Old_Italic,
  ucp_Old_Persian,
  ucp_Oriya,
  ucp_Osmanya,
  ucp_Runic,
  ucp_Shavian,
  ucp_Sinhala,
  ucp_Syloti_Nagri,
  ucp_Syriac,
  ucp_Tagalog,
  ucp_Tagbanwa,
  ucp_Tai_Le,
  ucp_Tamil,
  ucp_Telugu,
  ucp_Thaana,
  ucp_Thai,
  ucp_Tibetan,
  ucp_Tifinagh,
  ucp_Ugaritic,
  ucp_Yi,
  /* New for Unicode 5.0 (values 62..66) */
  ucp_Balinese,
  ucp_Cuneiform,
  ucp_Nko,
  ucp_Phags_Pa,
  ucp_Phoenician,
  /* New for Unicode 5.1 (values 67..77) */
  ucp_Carian,
  ucp_Cham,
  ucp_Kayah_Li,
  ucp_Lepcha,
  ucp_Lycian,
  ucp_Lydian,
  ucp_Ol_Chiki,
  ucp_Rejang,
  ucp_Saurashtra,
  ucp_Sundanese,
  ucp_Vai,
  /* New for Unicode 5.2 (values 78..92) */
  ucp_Avestan,
  ucp_Bamum,
  ucp_Egyptian_Hieroglyphs,
  ucp_Imperial_Aramaic,
  ucp_Inscriptional_Pahlavi,
  ucp_Inscriptional_Parthian,
  ucp_Javanese,
  ucp_Kaithi,
  ucp_Lisu,
  ucp_Meetei_Mayek,
  ucp_Old_South_Arabian,
  ucp_Old_Turkic,
  ucp_Samaritan,
  ucp_Tai_Tham,
  ucp_Tai_Viet,
  /* New for Unicode 6.0.0 (values 93..95) */
  ucp_Batak,
  ucp_Brahmi,
  ucp_Mandaic,
  /* New for Unicode 6.1.0 (values 96..102) */
  ucp_Chakma,
  ucp_Meroitic_Cursive,
  ucp_Meroitic_Hieroglyphs,
  ucp_Miao,
  ucp_Sharada,
  ucp_Sora_Sompeng,
  ucp_Takri,
  /* New for Unicode 7.0.0 (values 103..125) */
  ucp_Bassa_Vah,
  ucp_Caucasian_Albanian,
  ucp_Duployan,
  ucp_Elbasan,
  ucp_Grantha,
  ucp_Khojki,
  ucp_Khudawadi,
  ucp_Linear_A,
  ucp_Mahajani,
  ucp_Manichaean,
  ucp_Mende_Kikakui,
  ucp_Modi,
  ucp_Mro,
  ucp_Nabataean,
  ucp_Old_North_Arabian,
  ucp_Old_Permic,
  ucp_Pahawh_Hmong,
  ucp_Palmyrene,
  ucp_Psalter_Pahlavi,
  ucp_Pau_Cin_Hau,
  ucp_Siddham,
  ucp_Tirhuta,
  ucp_Warang_Citi,
  /* New for Unicode 8.0.0 (values 126..131) */
  ucp_Ahom,
  ucp_Anatolian_Hieroglyphs,
  ucp_Hatran,
  ucp_Multani,
  ucp_Old_Hungarian,
  ucp_SignWriting,
  /* New for Unicode 10.0.0 (no update since 8.0.0) (values 132..141) */
  ucp_Adlam,
  ucp_Bhaiksuki,
  ucp_Marchen,
  ucp_Newa,
  ucp_Osage,
  ucp_Tangut,
  ucp_Masaram_Gondi,
  ucp_Nushu,
  ucp_Soyombo,
  ucp_Zanabazar_Square,
  /* New for Unicode 11.0.0 (values 142..148) */
  ucp_Dogra,
  ucp_Gunjala_Gondi,
  ucp_Hanifi_Rohingya,
  ucp_Makasar,
  ucp_Medefaidrin,
  ucp_Old_Sogdian,
  ucp_Sogdian,
  /* New for Unicode 12.0.0 (values 149..152) */
  ucp_Elymaic,
  ucp_Nandinagari,
  ucp_Nyiakeng_Puachue_Hmong,
  ucp_Wancho,
  /* New for Unicode 13.0.0 (values 153..156) */
  ucp_Chorasmian,
  ucp_Dives_Akuru,
  ucp_Khitan_Small_Script,
  ucp_Yezidi
};
296 
297 #endif  /* PCRE2_UCP_H_IDEMPOTENT_GUARD */
298 
299 /* End of pcre2_ucp.h */
300