xref: /PHP-7.2/ext/mbstring/oniguruma/src/euc_tw.c (revision 0ae2f95b)
1 /**********************************************************************
2   euc_tw.c -  Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5  * Copyright (c) 2002-2016  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp>
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include "regenc.h"
31 
/*
 * Encoded length of a character, indexed by the value of its first byte.
 * Each row below covers sixteen consecutive byte values.
 */
static const int EncLen_EUCTW[256] = {
  /* 0x00 - 0x0f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x10 - 0x1f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x20 - 0x2f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x30 - 0x3f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x40 - 0x4f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x50 - 0x5f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x60 - 0x6f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x70 - 0x7f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x80 - 0x8f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, /* 0x8e => 4 */
  /* 0x90 - 0x9f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xa0 - 0xaf */ 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xb0 - 0xbf */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xc0 - 0xcf */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xd0 - 0xdf */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xe0 - 0xef */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xf0 - 0xff */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1
};
50 
51 static int
euctw_mbc_enc_len(const UChar * p)52 euctw_mbc_enc_len(const UChar* p)
53 {
54   return EncLen_EUCTW[*p];
55 }
56 
57 static int
is_valid_mbc_string(const UChar * p,const UChar * end)58 is_valid_mbc_string(const UChar* p, const UChar* end)
59 {
60   while (p < end) {
61     if (*p < 0x80) {
62       p++;
63     }
64     else if (*p < 0xa1) {
65       if (*p == 0x8e) {
66 	p++;
67 	if (p >= end) return FALSE;
68 	if (*p < 0xa1 || *p > 0xb0) return FALSE;
69 	p++;
70 	if (p >= end) return FALSE;
71 	if (*p < 0xa1 || *p == 0xff)
72 	  return FALSE;
73 	p++;
74 	if (p >= end) return FALSE;
75 	if (*p < 0xa1 || *p == 0xff)
76 	  return FALSE;
77 	p++;
78       }
79       else
80 	return FALSE;
81     }
82     else if (*p < 0xff) {
83       p++;
84       if (p >= end) return FALSE;
85       if (*p < 0xa1 || *p == 0xff)
86 	return FALSE;
87       p++;
88     }
89     else
90       return FALSE;
91   }
92 
93   return TRUE;
94 }
95 
96 static OnigCodePoint
euctw_mbc_to_code(const UChar * p,const UChar * end)97 euctw_mbc_to_code(const UChar* p, const UChar* end)
98 {
99   return onigenc_mbn_mbc_to_code(ONIG_ENCODING_EUC_TW, p, end);
100 }
101 
102 static int
euctw_code_to_mbc(OnigCodePoint code,UChar * buf)103 euctw_code_to_mbc(OnigCodePoint code, UChar *buf)
104 {
105   return onigenc_mb4_code_to_mbc(ONIG_ENCODING_EUC_TW, code, buf);
106 }
107 
108 static int
euctw_mbc_case_fold(OnigCaseFoldType flag,const UChar ** pp,const UChar * end,UChar * lower)109 euctw_mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, const UChar* end,
110                     UChar* lower)
111 {
112   return onigenc_mbn_mbc_case_fold(ONIG_ENCODING_EUC_TW, flag,
113                                    pp, end, lower);
114 }
115 
116 static int
euctw_is_code_ctype(OnigCodePoint code,unsigned int ctype)117 euctw_is_code_ctype(OnigCodePoint code, unsigned int ctype)
118 {
119   return onigenc_mb4_is_code_ctype(ONIG_ENCODING_EUC_TW, code, ctype);
120 }
121 
/*
 * Non-zero when byte c lies OUTSIDE the range 0xa1..0xfe, i.e. when c is
 * below 0xa1 or equal to 0xff.  Subtracting 0xa1 and narrowing to UChar
 * wraps smaller values around to large ones, so a single unsigned
 * comparison covers both ends of the range.  c is evaluated exactly once.
 */
#define euctw_islead(c)    (!((UChar )((c) - 0xa1) <= 0xfe - 0xa1))
123 
124 static UChar*
euctw_left_adjust_char_head(const UChar * start,const UChar * s)125 euctw_left_adjust_char_head(const UChar* start, const UChar* s)
126 {
127   /* Assumed in this encoding,
128      mb-trail bytes don't mix with single bytes.
129   */
130   const UChar *p;
131   int len;
132 
133   if (s <= start) return (UChar* )s;
134   p = s;
135 
136   while (!euctw_islead(*p) && p > start) p--;
137   len = enclen(ONIG_ENCODING_EUC_TW, p);
138   if (p + len > s) return (UChar* )p;
139   p += len;
140   return (UChar* )(p + ((s - p) & ~1));
141 }
142 
143 static int
euctw_is_allowed_reverse_match(const UChar * s,const UChar * end ARG_UNUSED)144 euctw_is_allowed_reverse_match(const UChar* s, const UChar* end ARG_UNUSED)
145 {
146   const UChar c = *s;
147   if (c <= 0x7e) return TRUE;
148   else           return FALSE;
149 }
150 
151 OnigEncodingType OnigEncodingEUC_TW = {
152   euctw_mbc_enc_len,
153   "EUC-TW",   /* name */
154   4,          /* max enc length */
155   1,          /* min enc length */
156   onigenc_is_mbc_newline_0x0a,
157   euctw_mbc_to_code,
158   onigenc_mb4_code_to_mbclen,
159   euctw_code_to_mbc,
160   euctw_mbc_case_fold,
161   onigenc_ascii_apply_all_case_fold,
162   onigenc_ascii_get_case_fold_codes_by_str,
163   onigenc_minimum_property_name_to_ctype,
164   euctw_is_code_ctype,
165   onigenc_not_support_get_ctype_code_range,
166   euctw_left_adjust_char_head,
167   euctw_is_allowed_reverse_match,
168   NULL, /* init */
169   NULL, /* is_initialized */
170   is_valid_mbc_string
171 };
172