Ruby 1.9.3p551 (2014-11-13 revision 48407)
euc_tw.c
Go to the documentation of this file.
1 /**********************************************************************
2  euc_tw.c - Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5  * Copyright (c) 2002-2008 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  * notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  * notice, this list of conditions and the following disclaimer in the
15  * documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include "regenc.h"
31 
/*
 * Per-byte length table, indexed by a byte value 0x00..0xff.
 *
 *   0x8e        -> 4
 *   0xa1..0xfe  -> 2
 *   otherwise   -> 1
 */
static const int EncLen_EUCTW[0x100] = {
  /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1,
  /* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xa0 */ 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xb0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xc0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xd0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xe0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xf0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1
};
50 
/*
 * States of the byte-sequence automaton.  The two negative values are
 * terminal results; S0..S3 are also the row indices of trans[].
 */
typedef enum {
  FAILURE = -2,
  ACCEPT  = -1,
  S0 = 0,
  S1,
  S2,
  S3
} state_t;

/* Table-building shorthands; all of them are removed again below. */
#define A ACCEPT
#define F FAILURE
#define EUCTW_ROW(v)        v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v
#define EUCTW_ROW_F_HEAD(v) F, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v
#define EUCTW_ROW_F_TAIL(v) v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, F

/*
 * trans[state][byte] gives the next state, or ACCEPT / FAILURE.
 *
 *   S0: 0x00..0x7f accept; 0x8e -> S2; 0xa1..0xfe -> S1; the rest fail.
 *   S1: 0xa1..0xfe accept; the rest fail.
 *   S2: 0xa1..0xb0 -> S3; the rest fail.
 *   S3: 0xa1..0xfe -> S1; the rest fail.
 */
static const signed char trans[][0x100] = {
  { /* S0 */
    /* 0x00 */ EUCTW_ROW(A),
    /* 0x10 */ EUCTW_ROW(A),
    /* 0x20 */ EUCTW_ROW(A),
    /* 0x30 */ EUCTW_ROW(A),
    /* 0x40 */ EUCTW_ROW(A),
    /* 0x50 */ EUCTW_ROW(A),
    /* 0x60 */ EUCTW_ROW(A),
    /* 0x70 */ EUCTW_ROW(A),
    /* 0x80 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, S2, F,
    /* 0x90 */ EUCTW_ROW(F),
    /* 0xa0 */ EUCTW_ROW_F_HEAD(S1),
    /* 0xb0 */ EUCTW_ROW(S1),
    /* 0xc0 */ EUCTW_ROW(S1),
    /* 0xd0 */ EUCTW_ROW(S1),
    /* 0xe0 */ EUCTW_ROW(S1),
    /* 0xf0 */ EUCTW_ROW_F_TAIL(S1)
  },
  { /* S1 */
    /* 0x00 */ EUCTW_ROW(F),
    /* 0x10 */ EUCTW_ROW(F),
    /* 0x20 */ EUCTW_ROW(F),
    /* 0x30 */ EUCTW_ROW(F),
    /* 0x40 */ EUCTW_ROW(F),
    /* 0x50 */ EUCTW_ROW(F),
    /* 0x60 */ EUCTW_ROW(F),
    /* 0x70 */ EUCTW_ROW(F),
    /* 0x80 */ EUCTW_ROW(F),
    /* 0x90 */ EUCTW_ROW(F),
    /* 0xa0 */ EUCTW_ROW_F_HEAD(A),
    /* 0xb0 */ EUCTW_ROW(A),
    /* 0xc0 */ EUCTW_ROW(A),
    /* 0xd0 */ EUCTW_ROW(A),
    /* 0xe0 */ EUCTW_ROW(A),
    /* 0xf0 */ EUCTW_ROW_F_TAIL(A)
  },
  { /* S2 */
    /* 0x00 */ EUCTW_ROW(F),
    /* 0x10 */ EUCTW_ROW(F),
    /* 0x20 */ EUCTW_ROW(F),
    /* 0x30 */ EUCTW_ROW(F),
    /* 0x40 */ EUCTW_ROW(F),
    /* 0x50 */ EUCTW_ROW(F),
    /* 0x60 */ EUCTW_ROW(F),
    /* 0x70 */ EUCTW_ROW(F),
    /* 0x80 */ EUCTW_ROW(F),
    /* 0x90 */ EUCTW_ROW(F),
    /* 0xa0 */ EUCTW_ROW_F_HEAD(S3),
    /* 0xb0 */ S3, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0xc0 */ EUCTW_ROW(F),
    /* 0xd0 */ EUCTW_ROW(F),
    /* 0xe0 */ EUCTW_ROW(F),
    /* 0xf0 */ EUCTW_ROW(F)
  },
  { /* S3 */
    /* 0x00 */ EUCTW_ROW(F),
    /* 0x10 */ EUCTW_ROW(F),
    /* 0x20 */ EUCTW_ROW(F),
    /* 0x30 */ EUCTW_ROW(F),
    /* 0x40 */ EUCTW_ROW(F),
    /* 0x50 */ EUCTW_ROW(F),
    /* 0x60 */ EUCTW_ROW(F),
    /* 0x70 */ EUCTW_ROW(F),
    /* 0x80 */ EUCTW_ROW(F),
    /* 0x90 */ EUCTW_ROW(F),
    /* 0xa0 */ EUCTW_ROW_F_HEAD(S1),
    /* 0xb0 */ EUCTW_ROW(S1),
    /* 0xc0 */ EUCTW_ROW(S1),
    /* 0xd0 */ EUCTW_ROW(S1),
    /* 0xe0 */ EUCTW_ROW(S1),
    /* 0xf0 */ EUCTW_ROW_F_TAIL(S1)
  }
};
#undef EUCTW_ROW_F_TAIL
#undef EUCTW_ROW_F_HEAD
#undef EUCTW_ROW
#undef A
#undef F
130 
131 static int
132 euctw_mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc ARG_UNUSED)
133 {
134  int firstbyte = *p++;
135  state_t s = trans[0][firstbyte];
136 #define RETURN(n) \
137  return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(n) : \
138  ONIGENC_CONSTRUCT_MBCLEN_INVALID()
139  if (s < 0) RETURN(1);
140  if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_EUCTW[firstbyte]-1);
141  s = trans[s][*p++];
142  if (s < 0) RETURN(2);
143  if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(4-2);
144  s = trans[s][*p++];
145  if (s < 0) RETURN(3);
146  if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(4-3);
147  s = trans[s][*p++];
148  RETURN(4);
149 #undef RETURN
150 }
151 
152 static OnigCodePoint
153 euctw_mbc_to_code(const UChar* p, const UChar* end, OnigEncoding enc ARG_UNUSED)
154 {
155  return onigenc_mbn_mbc_to_code(enc, p, end);
156 }
157 
158 static int
160 {
161  return onigenc_mb4_code_to_mbc(enc, code, buf);
162 }
163 
164 static int
165 euctw_mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, const UChar* end,
166  UChar* lower, OnigEncoding enc)
167 {
168  return onigenc_mbn_mbc_case_fold(enc, flag,
169  pp, end, lower);
170 }
171 
172 static int
173 euctw_is_code_ctype(OnigCodePoint code, unsigned int ctype, OnigEncoding enc)
174 {
175  return onigenc_mb4_is_code_ctype(enc, code, ctype);
176 }
177 
178 #define euctw_islead(c) ((UChar )((c) - 0xa1) > 0xfe - 0xa1)
179 
180 static UChar*
181 euctw_left_adjust_char_head(const UChar* start, const UChar* s, const UChar* end, OnigEncoding enc)
182 {
183  /* Assumed in this encoding,
184  mb-trail bytes don't mix with single bytes.
185  */
186  const UChar *p;
187  int len;
188 
189  if (s <= start) return (UChar* )s;
190  p = s;
191 
192  while (!euctw_islead(*p) && p > start) p--;
193  len = enclen(enc, p, end);
194  if (p + len > s) return (UChar* )p;
195  p += len;
196  return (UChar* )(p + ((s - p) & ~1));
197 }
198 
199 static int
200 euctw_is_allowed_reverse_match(const UChar* s, const UChar* end ARG_UNUSED, OnigEncoding enc ARG_UNUSED)
201 {
202  const UChar c = *s;
203  if (c <= 0x7e) return TRUE;
204  else return FALSE;
205 }
206 
207 OnigEncodingDefine(euc_tw, EUC_TW) = {
209  "EUC-TW", /* name */
210  4, /* max enc length */
211  1, /* min enc length */
224 };
225 ENC_ALIAS("eucTW", "EUC-TW")
226