Ruby 1.9.3p551 (2014-11-13 revision 48407)
euc_kr.c
Go to the documentation of this file.
1 /**********************************************************************
2  euc_kr.c - Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5  * Copyright (c) 2002-2007 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  * notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  * notice, this list of conditions and the following disclaimer in the
15  * documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include "regenc.h"
31 
/*
 * Byte length of an EUC-KR character, indexed by the value of its first
 * byte.  First bytes 0xa1..0xfe announce a two-byte character; every other
 * value (including 0xa0 and 0xff) maps to 1.
 */
static const int EncLen_EUCKR[] = {
  /* 0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 1 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 2 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 3 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 4 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 5 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 6 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 7 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 8 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 9 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* a */ 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* b */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* c */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* d */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* e */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* f */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1
};
50 
/*
 * States of the byte-validation automaton.  Negative values are terminal
 * (FAILURE, ACCEPT); non-negative values index a row of trans[].
 */
typedef enum { FAILURE = -2, ACCEPT = -1, S0 = 0, S1 } state_t;
#define A ACCEPT
#define F FAILURE
/*
 * trans[state][byte] is the state reached after reading `byte` in `state`.
 *   S0 (first byte):  0x00..0x7f accept, 0xa1..0xfe go to S1 (entry 1),
 *                     everything else fails.
 *   S1 (second byte): 0xa1..0xfe accept, everything else fails.
 */
static const signed char trans[][0x100] = {
  { /* S0   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
    /* 0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 1 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 2 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 3 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 4 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 5 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 6 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 7 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* a */ F, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* c */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* d */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* e */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, F
  },
  { /* S1   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
    /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* a */ F, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* b */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* c */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* d */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* e */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* f */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, F
  }
};
#undef A
#undef F
94 
95 static int
96 euckr_mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc ARG_UNUSED)
97 {
98  int firstbyte = *p++;
99  state_t s = trans[0][firstbyte];
100 #define RETURN(n) \
101  return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(n) : \
102  ONIGENC_CONSTRUCT_MBCLEN_INVALID()
103  if (s < 0) RETURN(1);
104  if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_EUCKR[firstbyte]-1);
105  s = trans[s][*p++];
106  RETURN(2);
107 #undef RETURN
108 }
109 
110 static OnigCodePoint
111 euckr_mbc_to_code(const UChar* p, const UChar* end, OnigEncoding enc)
112 {
113  return onigenc_mbn_mbc_to_code(enc, p, end);
114 }
115 
116 static int
118 {
119  return onigenc_mb2_code_to_mbc(enc, code, buf);
120 }
121 
122 static int
123 euckr_mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, const UChar* end,
124  UChar* lower, OnigEncoding enc)
125 {
126  return onigenc_mbn_mbc_case_fold(enc, flag,
127  pp, end, lower);
128 }
129 
#if 0
/* Compiled out by the surrounding #if 0; kept for reference only. */
static int
euckr_is_mbc_ambiguous(OnigCaseFoldType flag,
                       const UChar** pp, const UChar* end, OnigEncoding enc)
{
  return onigenc_mbn_is_mbc_ambiguous(enc, flag, pp, end);
}
#endif
138 
139 static int
140 euckr_is_code_ctype(OnigCodePoint code, unsigned int ctype, OnigEncoding enc)
141 {
142  return onigenc_mb2_is_code_ctype(enc, code, ctype);
143 }
144 
/*
 * Non-zero when byte value c lies outside 0xa1..0xfe, i.e. c is below 0xa1
 * or equal to 0xff.  The argument is evaluated twice, so pass an
 * expression without side effects.
 */
#define euckr_islead(c)    ((c) < 0xa1 || (c) == 0xff)
146 
147 static UChar*
148 euckr_left_adjust_char_head(const UChar* start, const UChar* s, const UChar* end, OnigEncoding enc)
149 {
150  /* Assumed in this encoding,
151  mb-trail bytes don't mix with single bytes.
152  */
153  const UChar *p;
154  int len;
155 
156  if (s <= start) return (UChar* )s;
157  p = s;
158 
159  while (!euckr_islead(*p) && p > start) p--;
160  len = enclen(enc, p, end);
161  if (p + len > s) return (UChar* )p;
162  p += len;
163  return (UChar* )(p + ((s - p) & ~1));
164 }
165 
166 static int
167 euckr_is_allowed_reverse_match(const UChar* s, const UChar* end ARG_UNUSED, OnigEncoding enc ARG_UNUSED)
168 {
169  const UChar c = *s;
170  if (c <= 0x7e) return TRUE;
171  else return FALSE;
172 }
173 
174 OnigEncodingDefine(euc_kr, EUC_KR) = {
176  "EUC-KR", /* name */
177  2, /* max enc length */
178  1, /* min enc length */
191 };
192 ENC_ALIAS("eucKR", "EUC-KR")
193