Ruby 1.9.3p448 (2013-06-27 revision 41675)
utf_16le.c
Go to the documentation of this file.
1 /**********************************************************************
2  utf_16le.c - Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5  * Copyright (c) 2002-2008 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  * notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  * notice, this list of conditions and the following disclaimer in the
15  * documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include "regenc.h"
31 
/*
 * Surrogate tests on a single byte `c`.  Only the upper bits of the byte
 * are compared, so each macro accepts a range of byte values:
 *   UTF16_IS_SURROGATE_FIRST  : 0xd8..0xdb
 *   UTF16_IS_SURROGATE_SECOND : 0xdc..0xdf
 *   UTF16_IS_SURROGATE        : 0xd8..0xdf
 * The functions in this file apply them to the byte at offset 1 (or 3) of
 * a two-byte unit, i.e. the more significant byte in little-endian order.
 */
32 #define UTF16_IS_SURROGATE_FIRST(c) (((c) & 0xfc) == 0xd8)
33 #define UTF16_IS_SURROGATE_SECOND(c) (((c) & 0xfc) == 0xdc)
34 #define UTF16_IS_SURROGATE(c) (((c) & 0xf8) == 0xd8)
35 
/*
 * 256-entry table indexed by a byte value.  Entries 0xd8..0xdb hold 4 and
 * every other entry holds 2.  The only use visible in this file (inside the
 * disabled utf16le_is_mbc_ambiguous) indexes it with the byte at offset 1
 * of a character to obtain the number of bytes to advance.
 */
36 static const int EncLen_UTF16[] = {
37  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
38  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
39  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
40  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
41  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
42  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
43  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
44  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
45  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
46  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
47  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
48  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
49  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
50  2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2,
51  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
52  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
53 };
54 
/*
 * NOTE(review): this definition is incomplete in this listing.  The embedded
 * listing numbers skip 56, 62, 65, 69, 71 and 73, so the function name, the
 * leading parameters (`p` and `e` are used but their declarations are not
 * shown) and the statement following each condition are missing.
 *
 * What remains shows: `len` is the byte distance from p to e; the byte at
 * p[1] is classified with the UTF16_IS_SURROGATE* macros; when it is a
 * first-surrogate byte, `len < 4` is tested and p[3] is checked for a
 * second-surrogate byte.  Presumably this is the character-length callback
 * of the encoding -- confirm against the complete source before editing.
 */
55 static int
57  OnigEncoding enc ARG_UNUSED)
58 {
59  int len = (int)(e - p);
60  UChar byte;
61  if (len < 2)
63  byte = p[1];
64  if (!UTF16_IS_SURROGATE(byte)) {
66  }
67  if (UTF16_IS_SURROGATE_FIRST(byte)) {
68  if (len < 4)
70  if (UTF16_IS_SURROGATE_SECOND(p[3]))
72  }
74 }
75 
76 static int
77 utf16le_is_mbc_newline(const UChar* p, const UChar* end,
78  OnigEncoding enc ARG_UNUSED)
79 {
80  if (p + 1 < end) {
81  if (*p == 0x0a && *(p+1) == 0x00)
82  return 1;
83 #ifdef USE_UNICODE_ALL_LINE_TERMINATORS
84  if ((
85 #ifndef USE_CRNL_AS_LINE_TERMINATOR
86  *p == 0x0d ||
87 #endif
88  *p == 0x85) && *(p+1) == 0x00)
89  return 1;
90  if (*(p+1) == 0x20 && (*p == 0x29 || *p == 0x28))
91  return 1;
92 #endif
93  }
94  return 0;
95 }
96 
97 static OnigCodePoint
98 utf16le_mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED,
99  OnigEncoding enc ARG_UNUSED)
100 {
101  OnigCodePoint code;
102  UChar c0 = *p;
103  UChar c1 = *(p+1);
104 
105  if (UTF16_IS_SURROGATE_FIRST(c1)) {
106  code = ((((c1 << 8) + c0) & 0x03ff) << 10)
107  + (((p[3] << 8) + p[2]) & 0x03ff) + 0x10000;
108  }
109  else {
110  code = c1 * 256 + p[0];
111  }
112  return code;
113 }
114 
/*
 * NOTE(review): listing line 116 (the function name and the leading
 * parameter that declares `code`) is missing from this listing.
 *
 * The visible body returns 4 when `code` is greater than 0xffff and 2
 * otherwise.
 */
115 static int
117  OnigEncoding enc ARG_UNUSED)
118 {
119  return (code > 0xffff ? 4 : 2);
120 }
121 
/*
 * NOTE(review): listing line 123 (the function name and the parameters that
 * declare `code` and `buf`) is missing from this listing.
 *
 * The visible body writes `code` into `buf`, low byte first, and returns
 * the number of bytes written: 4 (two 16-bit units) when code > 0xffff,
 * otherwise 2.
 */
122 static int
124  OnigEncoding enc ARG_UNUSED)
125 {
126  UChar* p = buf;
127 
128  if (code > 0xffff) {
 /* 0xD7C0 == 0xD800 - (0x10000 >> 10), so this equals
    0xD800 + ((code - 0x10000) >> 10). */
129  unsigned int high = (code >> 10) + 0xD7C0;
 /* Low 10 bits of the code go into the second unit. */
130  unsigned int low = (code & 0x3FF) + 0xDC00;
131  *p++ = high & 0xFF;
132  *p++ = (high >> 8) & 0xFF;
133  *p++ = low & 0xFF;
134  *p++ = (low >> 8) & 0xFF;
135  return 4;
136  }
137  else {
 /* Single unit: only the low 16 bits of code are stored. */
138  *p++ = (UChar )(code & 0xff);
139  *p++ = (UChar )((code & 0xff00) >> 8);
140  return 2;
141  }
142 }
143 
/*
 * NOTE(review): listing line 145 (the function name and the parameter that
 * declares `flag`) is missing from this listing.
 *
 * Visible behaviour: when the first byte at *pp is an ASCII code and the
 * following byte is 0, the lower-cased byte followed by 0 is written to
 * `fold`, *pp is advanced by 2 and 2 is returned.  Everything else is
 * delegated to onigenc_unicode_mbc_case_fold().
 *
 * NOTE(review): *(p+1) is read without comparing against `end` -- confirm
 * that callers always supply at least two readable bytes.
 */
144 static int
146  const UChar** pp, const UChar* end, UChar* fold,
147  OnigEncoding enc)
148 {
149  const UChar* p = *pp;
150 
151  if (ONIGENC_IS_ASCII_CODE(*p) && *(p+1) == 0) {
152 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
153  if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
 /* With the Turkish/Azeri flag set, byte 0x49 folds to the byte pair
    0x31 0x01 (0x0131 stored low byte first) instead of the ASCII
    lower-case form. */
154  if (*p == 0x49) {
155  *fold++ = 0x31;
156  *fold = 0x01;
157  (*pp) += 2;
158  return 2;
159  }
160  }
161 #endif
162 
163  *fold++ = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
164  *fold = 0;
165  *pp += 2;
166  return 2;
167  }
168  else
169  return onigenc_unicode_mbc_case_fold(enc, flag, pp,
170  end, fold);
171 }
172 
173 #if 0
/*
 * Disabled code (enclosed in #if 0).
 *
 * Advances *pp by EncLen_UTF16[] of the byte at offset 1, then, only for
 * characters whose byte at offset 1 is 0, decides from the byte at offset 0
 * whether the character is case-ambiguous.  Returns TRUE or FALSE; all
 * characters with a non-zero byte at offset 1 yield FALSE.
 *
 * `end` is not referenced in the body.
 */
174 static int
175 utf16le_is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp,
176  const UChar* end)
177 {
178  const UChar* p = *pp;
179 
180  (*pp) += EncLen_UTF16[*(p+1)];
181 
182  if (*(p+1) == 0) {
183  int c, v;
184 
185  if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) {
186  return TRUE;
187  }
188 
189  c = *p;
190  v = ONIGENC_IS_UNICODE_ISO_8859_1_BIT_CTYPE(c,
191  (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER));
 /* NOTE(review): `v | BIT_CTYPE_LOWER` is non-zero whenever
    BIT_CTYPE_LOWER is non-zero, which would make this branch
    unconditional and the final `return (v != 0 ...)` unreachable.
    A bitwise AND may have been intended; check what the macro above
    returns before changing it. */
192  if ((v | BIT_CTYPE_LOWER) != 0) {
193  /* 0xaa, 0xb5, 0xba are lower case letter, but can't convert. */
194  if (c >= 0xaa && c <= 0xba)
195  return FALSE;
196  else
197  return TRUE;
198  }
199  return (v != 0 ? TRUE : FALSE);
200  }
201 
202  return FALSE;
203 }
204 #endif
205 
206 static UChar*
207 utf16le_left_adjust_char_head(const UChar* start, const UChar* s, const UChar* end,
208  OnigEncoding enc ARG_UNUSED)
209 {
210  if (s <= start) return (UChar* )s;
211 
212  if ((s - start) % 2 == 1) {
213  s--;
214  }
215 
216  if (UTF16_IS_SURROGATE_SECOND(*(s+1)) && s > start + 1)
217  s -= 2;
218 
219  return (UChar* )s;
220 }
221 
/*
 * NOTE(review): listing lines 223 (the function name and the parameter that
 * declares `flag`) and 228 (the start of the return statement, including
 * the callee) are missing from this listing.
 *
 * The visible fragment forwards flag, p, end and items as the trailing
 * arguments of a call whose target is not shown.
 */
222 static int
224  const OnigUChar* p, const OnigUChar* end,
225  OnigCaseFoldCodeItem items[],
226  OnigEncoding enc)
227 {
229  flag, p, end, items);
230 }
231 
/*
 * Encoding definition for "UTF-16LE": characters occupy at least 2 and at
 * most 4 bytes.
 *
 * NOTE(review): listing lines 233 and 237-248 (the remaining initializer
 * entries) are missing from this listing.
 */
232 OnigEncodingDefine(utf_16le, UTF_16LE) = {
234  "UTF-16LE", /* name */
235  4, /* max byte length */
236  2, /* min byte length */
249 };
250