xref: /PHP-5.3/ext/mbstring/oniguruma/enc/iso8859_1.c (revision 7aab46a2)
1 /**********************************************************************
2   iso8859_1.c -  Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5  * Copyright (c) 2002-2006  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp>
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include "regenc.h"
31 
/*
 * Non-zero when the EncISO_8859_1_CtypeTable entry for `code` has at least
 * one of the bits in `ctype` set.  `code` is used directly as the table
 * index, so the caller must ensure it is in the range 0..255.
 *
 * Both arguments are parenthesized so that a compound mask such as
 * (A | B) is applied as a whole: `&` binds tighter than `|`, and without
 * the parentheses the test would degrade to ((entry & A) | B) != 0.
 */
#define ENC_IS_ISO_8859_1_CTYPE(code,ctype) \
  ((EncISO_8859_1_CtypeTable[(code)] & (ctype)) != 0)
34 
/*
 * Character-type bit masks, one entry per byte value 0x00..0xff.
 * Entries are tested against ONIGENC_CTYPE_* masks by
 * ENC_IS_ISO_8859_1_CTYPE() and iso_8859_1_is_mbc_ambiguous().
 * The comment at the start of each row is the byte value of the
 * row's first entry; each row holds eight consecutive entries.
 */
static const unsigned short EncISO_8859_1_CtypeTable[256] = {
  /* 0x00 */ 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008,
  /* 0x08 */ 0x2008, 0x220c, 0x2209, 0x2208, 0x2208, 0x2208, 0x2008, 0x2008,
  /* 0x10 */ 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008,
  /* 0x18 */ 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008,
  /* 0x20 */ 0x2284, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0,
  /* 0x28 */ 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0,
  /* 0x30 */ 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0,
  /* 0x38 */ 0x38b0, 0x38b0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0,
  /* 0x40 */ 0x21a0, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x34a2,
  /* 0x48 */ 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2,
  /* 0x50 */ 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2,
  /* 0x58 */ 0x34a2, 0x34a2, 0x34a2, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x31a0,
  /* 0x60 */ 0x21a0, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x30e2,
  /* 0x68 */ 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2,
  /* 0x70 */ 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2,
  /* 0x78 */ 0x30e2, 0x30e2, 0x30e2, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x2008,
  /* 0x80 */ 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
  /* 0x88 */ 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
  /* 0x90 */ 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
  /* 0x98 */ 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
  /* 0xa0 */ 0x0284, 0x01a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0,
  /* 0xa8 */ 0x00a0, 0x00a0, 0x10e2, 0x01a0, 0x00a0, 0x01a0, 0x00a0, 0x00a0,
  /* 0xb0 */ 0x00a0, 0x00a0, 0x10a0, 0x10a0, 0x00a0, 0x10e2, 0x00a0, 0x01a0,
  /* 0xb8 */ 0x00a0, 0x10a0, 0x10e2, 0x01a0, 0x10a0, 0x10a0, 0x10a0, 0x01a0,
  /* 0xc0 */ 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2,
  /* 0xc8 */ 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2,
  /* 0xd0 */ 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x00a0,
  /* 0xd8 */ 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x10e2,
  /* 0xe0 */ 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2,
  /* 0xe8 */ 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2,
  /* 0xf0 */ 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x00a0,
  /* 0xf8 */ 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2
};
69 
70 static int
iso_8859_1_mbc_to_normalize(OnigAmbigType flag,const UChar ** pp,const UChar * end,UChar * lower)71 iso_8859_1_mbc_to_normalize(OnigAmbigType flag, const UChar** pp, const UChar* end, UChar* lower)
72 {
73   const UChar* p = *pp;
74 
75   if (((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 &&
76        ONIGENC_IS_MBC_ASCII(p)) ||
77       ((flag & ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) != 0 &&
78        !ONIGENC_IS_MBC_ASCII(p))) {
79     *lower = ONIGENC_ISO_8859_1_TO_LOWER_CASE(*p);
80   }
81   else {
82     *lower = *p;
83   }
84   (*pp)++;
85   return 1; /* return byte length of converted char to lower */
86 }
87 
88 static int
iso_8859_1_is_mbc_ambiguous(OnigAmbigType flag,const UChar ** pp,const UChar * end)89 iso_8859_1_is_mbc_ambiguous(OnigAmbigType flag,
90 			    const UChar** pp, const UChar* end)
91 {
92   const UChar* p = *pp;
93 
94   (*pp)++;
95   if (((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 &&
96        ONIGENC_IS_MBC_ASCII(p)) ||
97       ((flag & ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) != 0 &&
98        !ONIGENC_IS_MBC_ASCII(p))) {
99     int v = (EncISO_8859_1_CtypeTable[*p] &
100              (ONIGENC_CTYPE_UPPER | ONIGENC_CTYPE_LOWER));
101 
102     if ((v | ONIGENC_CTYPE_LOWER) != 0) {
103       /* 0xdf, 0xaa, 0xb5, 0xba are lower case letter, but can't convert. */
104       if (*p == 0xdf || (*p >= 0xaa && *p <= 0xba))
105         return FALSE;
106       else
107         return TRUE;
108     }
109 
110     return (v != 0 ? TRUE : FALSE);
111   }
112   return FALSE;
113 }
114 
115 static int
iso_8859_1_is_code_ctype(OnigCodePoint code,unsigned int ctype)116 iso_8859_1_is_code_ctype(OnigCodePoint code, unsigned int ctype)
117 {
118   if (code < 256)
119     return ENC_IS_ISO_8859_1_CTYPE(code, ctype);
120   else
121     return FALSE;
122 }
123 
124 OnigEncodingType OnigEncodingISO_8859_1 = {
125   onigenc_single_byte_mbc_enc_len,
126   "ISO-8859-1",  /* name */
127   1,             /* max enc length */
128   1,             /* min enc length */
129   (ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE |
130    ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE ),
131   {
132       (OnigCodePoint )'\\'                       /* esc */
133     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.'  */
134     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*'  */
135     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */
136     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */
137     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */
138   },
139   onigenc_is_mbc_newline_0x0a,
140   onigenc_single_byte_mbc_to_code,
141   onigenc_single_byte_code_to_mbclen,
142   onigenc_single_byte_code_to_mbc,
143   iso_8859_1_mbc_to_normalize,
144   iso_8859_1_is_mbc_ambiguous,
145   onigenc_iso_8859_1_get_all_pair_ambig_codes,
146   onigenc_ess_tsett_get_all_comp_ambig_codes,
147   iso_8859_1_is_code_ctype,
148   onigenc_not_support_get_ctype_code_range,
149   onigenc_single_byte_left_adjust_char_head,
150   onigenc_always_true_is_allowed_reverse_match
151 };
152