00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030 #include "regint.h"
00031
/*
 * Number of bytes in a character, looked up by the value of its first byte.
 * Entries 0x81-0x9f and 0xe0-0xfc are 2 (lead byte of a two-byte
 * character); every other entry is 1.
 *
 * The dimension is spelled out, as it is for SJIS_CAN_BE_TRAIL_TABLE, so the
 * compiler rejects an initializer that has more than 256 entries.
 */
static const int EncLen_SJIS[256] = {
  /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x80 */ 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x90 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xa0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xb0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xc0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xd0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xe0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xf0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1
};
00050
/*
 * Flag per byte value: 1 when the byte may appear as the second byte of a
 * two-byte character, 0 otherwise.
 * The 1 entries are 0x40-0x7e and 0x80-0xfc.
 */
static const char SJIS_CAN_BE_TRAIL_TABLE[256] = {
  /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
  /* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xa0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xb0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xc0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xd0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xe0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xf0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0
};
00069
/*
 * Byte classification helpers.  `byte` must be in 0..255 because it is used
 * directly as a table index.  The argument and the whole expansion are
 * parenthesized so the macros compose safely inside larger expressions.
 */
/* True when `byte` starts a character longer than one byte. */
#define SJIS_ISMB_FIRST(byte)  (EncLen_SJIS[(byte)] > 1)
/* Nonzero when `byte` may be the second byte of a two-byte character. */
#define SJIS_ISMB_TRAIL(byte)  (SJIS_CAN_BE_TRAIL_TABLE[(byte)])
00072
/*
 * States of the byte-sequence validator used by mbc_enc_len().
 * Negative values are terminal results; non-negative values select a row
 * of trans[].
 */
typedef enum {
  FAILURE = -2,
  ACCEPT  = -1,
  S0      = 0,
  S1
} state_t;

/* One-letter aliases keep the table readable; both are undefined below. */
#define A ACCEPT
#define F FAILURE

/*
 * trans[state][byte] gives the next state.
 *   row 0 (S0): classification of the first byte of a character
 *   row 1 (S1): classification of the byte that follows a lead byte
 */
static const signed char trans[][0x100] = {
  { /* S0 */
    /* 0x00 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x10 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x20 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x30 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x40 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x50 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x60 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x70 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x80 */ F, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xa0 */ F, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0xb0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0xc0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0xd0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0xe0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xf0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, F, F, F
  },
  { /* S1 */
    /* 0x00 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0x10 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0x20 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0x30 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0x40 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x50 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x60 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x70 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, F,
    /* 0x80 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x90 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0xa0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0xb0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0xc0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0xd0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0xe0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0xf0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, F, F, F
  }
};
#undef A
#undef F
00116
00117 static int
00118 mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc ARG_UNUSED)
00119 {
00120 int firstbyte = *p++;
00121 state_t s;
00122 s = trans[0][firstbyte];
00123 if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(1) :
00124 ONIGENC_CONSTRUCT_MBCLEN_INVALID();
00125 if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_SJIS[firstbyte]-1);
00126 s = trans[s][*p++];
00127 return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(2) :
00128 ONIGENC_CONSTRUCT_MBCLEN_INVALID();
00129 }
00130
00131 static int
00132 code_to_mbclen(OnigCodePoint code, OnigEncoding enc ARG_UNUSED)
00133 {
00134 if (code < 256) {
00135 if (EncLen_SJIS[(int )code] == 1)
00136 return 1;
00137 else
00138 return 0;
00139 }
00140 else if (code <= 0xffff) {
00141 return 2;
00142 }
00143 else
00144 return ONIGERR_INVALID_CODE_POINT_VALUE;
00145 }
00146
00147 static OnigCodePoint
00148 mbc_to_code(const UChar* p, const UChar* end, OnigEncoding enc)
00149 {
00150 int c, i, len;
00151 OnigCodePoint n;
00152
00153 len = enclen(enc, p, end);
00154 c = *p++;
00155 n = c;
00156 if (len == 1) return n;
00157
00158 for (i = 1; i < len; i++) {
00159 if (p >= end) break;
00160 c = *p++;
00161 n <<= 8; n += c;
00162 }
00163 return n;
00164 }
00165
00166 static int
00167 code_to_mbc(OnigCodePoint code, UChar *buf, OnigEncoding enc)
00168 {
00169 UChar *p = buf;
00170
00171 if ((code & 0xff00) != 0) *p++ = (UChar )(((code >> 8) & 0xff));
00172 *p++ = (UChar )(code & 0xff);
00173
00174 #if 0
00175 if (enclen(enc, buf) != (p - buf))
00176 return REGERR_INVALID_CODE_POINT_VALUE;
00177 #endif
00178 return p - buf;
00179 }
00180
00181 static int
00182 mbc_case_fold(OnigCaseFoldType flag,
00183 const UChar** pp, const UChar* end, UChar* lower,
00184 OnigEncoding enc)
00185 {
00186 const UChar* p = *pp;
00187
00188 if (ONIGENC_IS_MBC_ASCII(p)) {
00189 *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
00190 (*pp)++;
00191 return 1;
00192 }
00193 else {
00194 int i;
00195 int len = enclen(enc, p, end);
00196
00197 for (i = 0; i < len; i++) {
00198 *lower++ = *p++;
00199 }
00200 (*pp) += len;
00201 return len;
00202 }
00203 }
00204
/*
 * Compiled out by the surrounding #if 0.
 * The disabled body forwards to onigenc_mbn_is_mbc_ambiguous().
 * NOTE(review): `enc` is passed on but is not a parameter of this signature,
 * so the block would not compile if it were simply re-enabled.
 */
00205 #if 0
00206 static int
00207 is_mbc_ambiguous(OnigCaseFoldType flag,
00208 const UChar** pp, const UChar* end)
00209 {
00210 return onigenc_mbn_is_mbc_ambiguous(enc, flag, pp, end);
00211
00212 }
00213 #endif
00214
/*
 * Compiled out by the surrounding #if 0; a live is_code_ctype() with an
 * additional `enc` parameter is defined later in this file.
 * This disabled variant answers for code < 128 via
 * ONIGENC_IS_ASCII_CODE_CTYPE(), and for larger codes reports TRUE only when
 * the ctype satisfies CTYPE_IS_WORD_GRAPH_PRINT() and code_to_mbclen() > 1.
 * NOTE(review): code_to_mbclen() is called here with one argument, while the
 * definition in this file takes (code, enc).
 */
00215 #if 0
00216 static int
00217 is_code_ctype(OnigCodePoint code, unsigned int ctype)
00218 {
00219 if (code < 128)
00220 return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
00221 else {
00222 if (CTYPE_IS_WORD_GRAPH_PRINT(ctype)) {
00223 return (code_to_mbclen(code) > 1 ? TRUE : FALSE);
00224 }
00225 }
00226
00227 return FALSE;
00228 }
00229 #endif
00230
00231 static UChar*
00232 left_adjust_char_head(const UChar* start, const UChar* s, const UChar* end, OnigEncoding enc)
00233 {
00234 const UChar *p;
00235 int len;
00236
00237 if (s <= start) return (UChar* )s;
00238 p = s;
00239
00240 if (SJIS_ISMB_TRAIL(*p)) {
00241 while (p > start) {
00242 if (! SJIS_ISMB_FIRST(*--p)) {
00243 p++;
00244 break;
00245 }
00246 }
00247 }
00248 len = enclen(enc, p, end);
00249 if (p + len > s) return (UChar* )p;
00250 p += len;
00251 return (UChar* )(p + ((s - p) & ~1));
00252 }
00253
00254 static int
00255 is_allowed_reverse_match(const UChar* s, const UChar* end, OnigEncoding enc ARG_UNUSED)
00256 {
00257 const UChar c = *s;
00258 return (SJIS_ISMB_TRAIL(c) ? FALSE : TRUE);
00259 }
00260
00261
00262 static int PropertyInited = 0;
00263 static const OnigCodePoint** PropertyList;
00264 static int PropertyListNum;
00265 static int PropertyListSize;
00266 static hash_table_type* PropertyNameTable;
00267
00268 static const OnigCodePoint CR_Hiragana[] = {
00269 1,
00270 0x829f, 0x82f1
00271 };
00272
00273 static const OnigCodePoint CR_Katakana[] = {
00274 4,
00275 0x00a6, 0x00af,
00276 0x00b1, 0x00dd,
00277 0x8340, 0x837e,
00278 0x8380, 0x8396,
00279 };
00280
/*
 * Hands the "Hiragana" and "Katakana" range tables to
 * PROPERTY_LIST_ADD_PROP, then sets PropertyInited to 1 and returns r.
 * NOTE(review): r is never assigned in the visible text and the `end` label
 * has no visible goto; PROPERTY_LIST_ADD_PROP (defined elsewhere) presumably
 * assigns r and jumps to `end` on failure, which would skip the
 * PropertyInited assignment -- confirm against the macro definition.
 */
00281 static int
00282 init_property_list(void)
00283 {
00284 int r;
00285
00286 PROPERTY_LIST_ADD_PROP("Hiragana", CR_Hiragana);
00287 PROPERTY_LIST_ADD_PROP("Katakana", CR_Katakana);
00288 PropertyInited = 1;
00289
00290 end:
00291 return r;
00292 }
00293
00294 static int
00295 property_name_to_ctype(OnigEncoding enc, UChar* p, UChar* end)
00296 {
00297 hash_data_type ctype;
00298
00299 PROPERTY_LIST_INIT_CHECK;
00300
00301 if (onig_st_lookup_strend(PropertyNameTable, p, end, &ctype) == 0) {
00302 return onigenc_minimum_property_name_to_ctype(enc, p, end);
00303 }
00304
00305 return (int)ctype;
00306 }
00307
00308 static int
00309 is_code_ctype(OnigCodePoint code, unsigned int ctype, OnigEncoding enc)
00310 {
00311 if (ctype <= ONIGENC_MAX_STD_CTYPE) {
00312 if (code < 128)
00313 return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
00314 else {
00315 if (CTYPE_IS_WORD_GRAPH_PRINT(ctype)) {
00316 return TRUE;
00317 }
00318 }
00319 }
00320 else {
00321 PROPERTY_LIST_INIT_CHECK;
00322
00323 ctype -= (ONIGENC_MAX_STD_CTYPE + 1);
00324 if (ctype >= (unsigned int )PropertyListNum)
00325 return ONIGERR_TYPE_BUG;
00326
00327 return onig_is_in_code_range((UChar* )PropertyList[ctype], code);
00328 }
00329
00330 return FALSE;
00331 }
00332
00333 static int
00334 get_ctype_code_range(OnigCtype ctype, OnigCodePoint* sb_out,
00335 const OnigCodePoint* ranges[], OnigEncoding enc ARG_UNUSED)
00336 {
00337 if (ctype <= ONIGENC_MAX_STD_CTYPE) {
00338 return ONIG_NO_SUPPORT_CONFIG;
00339 }
00340 else {
00341 *sb_out = 0x80;
00342
00343 PROPERTY_LIST_INIT_CHECK;
00344
00345 ctype -= (ONIGENC_MAX_STD_CTYPE + 1);
00346 if (ctype >= (OnigCtype )PropertyListNum)
00347 return ONIGERR_TYPE_BUG;
00348
00349 *ranges = PropertyList[ctype];
00350 return 0;
00351 }
00352 }
00353
00354 OnigEncodingDefine(shift_jis, Shift_JIS) = {
00355 mbc_enc_len,
00356 "Shift_JIS",
00357 2,
00358 1,
00359 onigenc_is_mbc_newline_0x0a,
00360 mbc_to_code,
00361 code_to_mbclen,
00362 code_to_mbc,
00363 mbc_case_fold,
00364 onigenc_ascii_apply_all_case_fold,
00365 onigenc_ascii_get_case_fold_codes_by_str,
00366 property_name_to_ctype,
00367 is_code_ctype,
00368 get_ctype_code_range,
00369 left_adjust_char_head,
00370 is_allowed_reverse_match,
00371 0
00372 };
00373
00374
00375
00376
00377
00378
/*
 * Pairs the name "SJIS" with "Shift_JIS".
 * NOTE(review): ENC_ALIAS is defined outside this file; confirm how it is
 * consumed (macro expansion vs. textual scan) before reformatting this line.
 */
00379 ENC_ALIAS("SJIS", "Shift_JIS")
00380
00381
00382
00383
00384
00385
00386
00387
00388
/*
 * Declares "Windows-31J" in terms of "Shift_JIS", then pairs the names
 * "CP932" and "csWindows31J" with "Windows-31J".
 * NOTE(review): ENC_REPLICATE and ENC_ALIAS are defined outside this file;
 * presumably the former derives a new encoding from an existing one --
 * confirm against their definitions.
 */
00389 ENC_REPLICATE("Windows-31J", "Shift_JIS")
00390 ENC_ALIAS("CP932", "Windows-31J")
00391 ENC_ALIAS("csWindows31J", "Windows-31J")
00392
00393
00394
00395
00396
00397
/*
 * Declares "MacJapanese" in terms of "Shift_JIS", then pairs the name
 * "MacJapan" with "MacJapanese".
 * NOTE(review): ENC_REPLICATE and ENC_ALIAS are defined outside this file --
 * confirm their semantics there.
 */
00398 ENC_REPLICATE("MacJapanese", "Shift_JIS")
00399 ENC_ALIAS("MacJapan", "MacJapanese")
00400