00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030 #include "regenc.h"
00031
/*
 * Surrogate tests applied to the high-order byte of a UTF-16 code unit
 * (in this file that is the second byte of each little-endian unit).
 * Only the low eight bits of the argument take part in the comparison.
 */
/* Byte value 0xd8..0xdb: first (leading) half of a surrogate pair. */
#define UTF16_IS_SURROGATE_FIRST(c)    (0xd8 == ((c) & 0xfc))
/* Byte value 0xdc..0xdf: second (trailing) half of a surrogate pair. */
#define UTF16_IS_SURROGATE_SECOND(c)   (0xdc == ((c) & 0xfc))
/* Byte value 0xd8..0xdf: either half of a surrogate pair. */
#define UTF16_IS_SURROGATE(c)          (0xd8 == ((c) & 0xf8))
00035
/*
 * Character length in bytes, looked up by a single byte value: 4 for
 * 0xd8..0xdb and 2 for every other value.  The only use visible in this
 * file (inside disabled code) indexes it with the second byte of a
 * little-endian code unit.
 */
static const int EncLen_UTF16[] = {
  /* 0x00 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x10 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x20 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x30 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x40 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x50 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x60 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x70 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x80 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x90 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xa0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xb0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xc0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xd0 */ 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2,
  /* 0xe0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xf0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
};
00054
00055 static int
00056 utf16le_mbc_enc_len(const UChar* p, const OnigUChar* e,
00057 OnigEncoding enc ARG_UNUSED)
00058 {
00059 int len = e-p, byte;
00060 if (len < 2)
00061 return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(1);
00062 byte = p[1];
00063 if (!UTF16_IS_SURROGATE(byte)) {
00064 return ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(2);
00065 }
00066 if (UTF16_IS_SURROGATE_FIRST(byte)) {
00067 if (len < 4)
00068 return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(4-len);
00069 if (UTF16_IS_SURROGATE_SECOND(p[3]))
00070 return ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(4);
00071 }
00072 return ONIGENC_CONSTRUCT_MBCLEN_INVALID();
00073 }
00074
00075 static int
00076 utf16le_is_mbc_newline(const UChar* p, const UChar* end,
00077 OnigEncoding enc ARG_UNUSED)
00078 {
00079 if (p + 1 < end) {
00080 if (*p == 0x0a && *(p+1) == 0x00)
00081 return 1;
00082 #ifdef USE_UNICODE_ALL_LINE_TERMINATORS
00083 if ((
00084 #ifndef USE_CRNL_AS_LINE_TERMINATOR
00085 *p == 0x0d ||
00086 #endif
00087 *p == 0x85) && *(p+1) == 0x00)
00088 return 1;
00089 if (*(p+1) == 0x20 && (*p == 0x29 || *p == 0x28))
00090 return 1;
00091 #endif
00092 }
00093 return 0;
00094 }
00095
00096 static OnigCodePoint
00097 utf16le_mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED,
00098 OnigEncoding enc ARG_UNUSED)
00099 {
00100 OnigCodePoint code;
00101 UChar c0 = *p;
00102 UChar c1 = *(p+1);
00103
00104 if (UTF16_IS_SURROGATE_FIRST(c1)) {
00105 code = ((((c1 << 8) + c0) & 0x03ff) << 10)
00106 + (((p[3] << 8) + p[2]) & 0x03ff) + 0x10000;
00107 }
00108 else {
00109 code = c1 * 256 + p[0];
00110 }
00111 return code;
00112 }
00113
00114 static int
00115 utf16le_code_to_mbclen(OnigCodePoint code,
00116 OnigEncoding enc ARG_UNUSED)
00117 {
00118 return (code > 0xffff ? 4 : 2);
00119 }
00120
00121 static int
00122 utf16le_code_to_mbc(OnigCodePoint code, UChar *buf,
00123 OnigEncoding enc ARG_UNUSED)
00124 {
00125 UChar* p = buf;
00126
00127 if (code > 0xffff) {
00128 unsigned int high = (code >> 10) + 0xD7C0;
00129 unsigned int low = (code & 0x3FF) + 0xDC00;
00130 *p++ = high & 0xFF;
00131 *p++ = (high >> 8) & 0xFF;
00132 *p++ = low & 0xFF;
00133 *p++ = (low >> 8) & 0xFF;
00134 return 4;
00135 }
00136 else {
00137 *p++ = (UChar )(code & 0xff);
00138 *p++ = (UChar )((code & 0xff00) >> 8);
00139 return 2;
00140 }
00141 }
00142
00143 static int
00144 utf16le_mbc_case_fold(OnigCaseFoldType flag,
00145 const UChar** pp, const UChar* end, UChar* fold,
00146 OnigEncoding enc)
00147 {
00148 const UChar* p = *pp;
00149
00150 if (ONIGENC_IS_ASCII_CODE(*p) && *(p+1) == 0) {
00151 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
00152 if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
00153 if (*p == 0x49) {
00154 *fold++ = 0x31;
00155 *fold = 0x01;
00156 (*pp) += 2;
00157 return 2;
00158 }
00159 }
00160 #endif
00161
00162 *fold++ = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
00163 *fold = 0;
00164 *pp += 2;
00165 return 2;
00166 }
00167 else
00168 return onigenc_unicode_mbc_case_fold(enc, flag, pp,
00169 end, fold);
00170 }
00171
#if 0
/*
 * Disabled code (compiled out by the surrounding #if 0).
 *
 * Advances *pp by the length looked up in EncLen_UTF16 and reports whether
 * the character it started on is case-ambiguous.  Only code units whose
 * high byte is zero are examined; everything else yields FALSE.
 */
static int
utf16le_is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp,
                         const UChar* end)
{
  const UChar* p = *pp;

  (*pp) += EncLen_UTF16[*(p+1)];

  if (*(p+1) == 0) {
    int c, v;

    if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) {
      return TRUE;
    }

    c = *p;
    v = ONIGENC_IS_UNICODE_ISO_8859_1_BIT_CTYPE(c,
                       (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER));
    /* Test the LOWER bit itself.  The previous `(v | BIT_CTYPE_LOWER) != 0`
     * was always true, which made the final return unreachable. */
    if (ONIGENC_IS_UNICODE_ISO_8859_1_BIT_CTYPE(c, BIT_CTYPE_LOWER) != 0) {
      /* NOTE(review): lower-case bytes in 0xaa..0xba are reported as not
       * ambiguous; presumably they have no upper-case partner -- confirm. */
      if (c >= 0xaa && c <= 0xba)
        return FALSE;
      else
        return TRUE;
    }
    return (v != 0 ? TRUE : FALSE);
  }

  return FALSE;
}
#endif
00204
00205 static UChar*
00206 utf16le_left_adjust_char_head(const UChar* start, const UChar* s, const UChar* end,
00207 OnigEncoding enc ARG_UNUSED)
00208 {
00209 if (s <= start) return (UChar* )s;
00210
00211 if ((s - start) % 2 == 1) {
00212 s--;
00213 }
00214
00215 if (UTF16_IS_SURROGATE_SECOND(*(s+1)) && s > start + 1)
00216 s -= 2;
00217
00218 return (UChar* )s;
00219 }
00220
00221 static int
00222 utf16le_get_case_fold_codes_by_str(OnigCaseFoldType flag,
00223 const OnigUChar* p, const OnigUChar* end,
00224 OnigCaseFoldCodeItem items[],
00225 OnigEncoding enc)
00226 {
00227 return onigenc_unicode_get_case_fold_codes_by_str(enc,
00228 flag, p, end, items);
00229 }
00230
00231 OnigEncodingDefine(utf_16le, UTF_16LE) = {
00232 utf16le_mbc_enc_len,
00233 "UTF-16LE",
00234 4,
00235 2,
00236 utf16le_is_mbc_newline,
00237 utf16le_mbc_to_code,
00238 utf16le_code_to_mbclen,
00239 utf16le_code_to_mbc,
00240 utf16le_mbc_case_fold,
00241 onigenc_unicode_apply_all_case_fold,
00242 utf16le_get_case_fold_codes_by_str,
00243 onigenc_unicode_property_name_to_ctype,
00244 onigenc_unicode_is_code_ctype,
00245 onigenc_utf16_32_get_ctype_code_range,
00246 utf16le_left_adjust_char_head,
00247 onigenc_always_false_is_allowed_reverse_match
00248 };
00249