/* xref: /PHP-7.3/ext/mbstring/oniguruma/src/euc_kr.c (revision 1979c5d1) */
/**********************************************************************
  euc_kr.c -  Oniguruma (regular expression library)
**********************************************************************/
/*-
 * Copyright (c) 2002-2019  K.Kosako
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "regenc.h"

/*
 * Encoded length of a character, indexed by its first byte.
 * Bytes 0x00..0xa0 and 0xff map to 1; bytes 0xa1..0xfe map to 2.
 * Each row below covers 16 consecutive byte values (row k is 0xk0..0xkf).
 */
static const int EncLen_EUCKR[] = {
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1
};

51 static int
euckr_mbc_enc_len(const UChar * p)52 euckr_mbc_enc_len(const UChar* p)
53 {
54   return EncLen_EUCKR[*p];
55 }
56 
57 static int
euckr_code_to_mbclen(OnigCodePoint code)58 euckr_code_to_mbclen(OnigCodePoint code)
59 {
60   if ((code & (~0xffff)) != 0) return ONIGERR_INVALID_CODE_POINT_VALUE;
61   if ((code &    0xff00) != 0) return 2;
62   if (EncLen_EUCKR[(int )(code & 0xff)] == 1) return 1;
63 
64   return ONIGERR_INVALID_CODE_POINT_VALUE;
65 }
66 
67 static int
is_valid_mbc_string(const UChar * p,const UChar * end)68 is_valid_mbc_string(const UChar* p, const UChar* end)
69 {
70   while (p < end) {
71     if (*p < 0x80) {
72       p++;
73     }
74     else if (*p < 0xa1) {
75       return FALSE;
76     }
77     else if (*p < 0xff) {
78       p++;
79       if (p >= end) return FALSE;
80       if (*p < 0xa1 || *p == 0xff)
81         return FALSE;
82       p++;
83     }
84     else
85       return FALSE;
86   }
87 
88   return TRUE;
89 }
90 
91 static OnigCodePoint
euckr_mbc_to_code(const UChar * p,const UChar * end)92 euckr_mbc_to_code(const UChar* p, const UChar* end)
93 {
94   return onigenc_mbn_mbc_to_code(ONIG_ENCODING_EUC_KR, p, end);
95 }
96 
97 static int
euckr_code_to_mbc(OnigCodePoint code,UChar * buf)98 euckr_code_to_mbc(OnigCodePoint code, UChar *buf)
99 {
100   return onigenc_mb2_code_to_mbc(ONIG_ENCODING_EUC_KR, code, buf);
101 }
102 
103 static int
euckr_mbc_case_fold(OnigCaseFoldType flag,const UChar ** pp,const UChar * end,UChar * lower)104 euckr_mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, const UChar* end,
105                     UChar* lower)
106 {
107   return onigenc_mbn_mbc_case_fold(ONIG_ENCODING_EUC_KR, flag,
108                                    pp, end, lower);
109 }
110 
111 static int
euckr_is_code_ctype(OnigCodePoint code,unsigned int ctype)112 euckr_is_code_ctype(OnigCodePoint code, unsigned int ctype)
113 {
114   return onigenc_mb2_is_code_ctype(ONIG_ENCODING_EUC_KR, code, ctype);
115 }
116 
/*
 * True when byte c is outside 0xa1..0xfe, i.e. below 0xa1 or equal to 0xff.
 * Note that, despite the name, this is the complement of the two-byte range:
 * the backward scan in euckr_left_adjust_char_head stops on such a byte.
 * The argument is evaluated twice, so it must have no side effects.
 */
#define euckr_islead(c)    ((c) < 0xa1 || (c) == 0xff)

119 static UChar*
euckr_left_adjust_char_head(const UChar * start,const UChar * s)120 euckr_left_adjust_char_head(const UChar* start, const UChar* s)
121 {
122   /* Assumed in this encoding,
123      mb-trail bytes don't mix with single bytes.
124   */
125   const UChar *p;
126   int len;
127 
128   if (s <= start) return (UChar* )s;
129   p = s;
130 
131   while (!euckr_islead(*p) && p > start) p--;
132   len = enclen(ONIG_ENCODING_EUC_KR, p);
133   if (p + len > s) return (UChar* )p;
134   p += len;
135   return (UChar* )(p + ((s - p) & ~1));
136 }
137 
138 static int
euckr_is_allowed_reverse_match(const UChar * s,const UChar * end ARG_UNUSED)139 euckr_is_allowed_reverse_match(const UChar* s, const UChar* end ARG_UNUSED)
140 {
141   const UChar c = *s;
142   if (c <= 0x7e) return TRUE;
143   else           return FALSE;
144 }
145 
146 OnigEncodingType OnigEncodingEUC_KR = {
147   euckr_mbc_enc_len,
148   "EUC-KR",   /* name */
149   2,          /* max enc length */
150   1,          /* min enc length */
151   onigenc_is_mbc_newline_0x0a,
152   euckr_mbc_to_code,
153   euckr_code_to_mbclen,
154   euckr_code_to_mbc,
155   euckr_mbc_case_fold,
156   onigenc_ascii_apply_all_case_fold,
157   onigenc_ascii_get_case_fold_codes_by_str,
158   onigenc_minimum_property_name_to_ctype,
159   euckr_is_code_ctype,
160   onigenc_not_support_get_ctype_code_range,
161   euckr_left_adjust_char_head,
162   euckr_is_allowed_reverse_match,
163   NULL, /* init */
164   NULL, /* is_initialized */
165   is_valid_mbc_string,
166   ENC_FLAG_ASCII_COMPATIBLE|ENC_FLAG_SKIP_OFFSET_1_OR_0,
167   0, 0
168 };
169 
170 /* Same with OnigEncodingEUC_KR except the name */
171 OnigEncodingType OnigEncodingEUC_CN = {
172   euckr_mbc_enc_len,
173   "EUC-CN",   /* name */
174   2,          /* max enc length */
175   1,          /* min enc length */
176   onigenc_is_mbc_newline_0x0a,
177   euckr_mbc_to_code,
178   euckr_code_to_mbclen,
179   euckr_code_to_mbc,
180   euckr_mbc_case_fold,
181   onigenc_ascii_apply_all_case_fold,
182   onigenc_ascii_get_case_fold_codes_by_str,
183   onigenc_minimum_property_name_to_ctype,
184   euckr_is_code_ctype,
185   onigenc_not_support_get_ctype_code_range,
186   euckr_left_adjust_char_head,
187   euckr_is_allowed_reverse_match,
188   NULL, /* init */
189   NULL, /* is_initialized */
190   is_valid_mbc_string,
191   ENC_FLAG_ASCII_COMPATIBLE|ENC_FLAG_SKIP_OFFSET_1_OR_0,
192   0, 0
193 };
194