• Main Page
  • Modules
  • Data Structures
  • Files
  • File List
  • Globals

enc/emacs_mule.c

Go to the documentation of this file.
00001 /**********************************************************************
00002   emacs_mule.c -  Oniguruma (regular expression library)
00003 **********************************************************************/
00004 /*-
00005  * Copyright (c) 2002-2007  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp>
00006  * All rights reserved.
00007  *
00008  * Redistribution and use in source and binary forms, with or without
00009  * modification, are permitted provided that the following conditions
00010  * are met:
00011  * 1. Redistributions of source code must retain the above copyright
00012  *    notice, this list of conditions and the following disclaimer.
00013  * 2. Redistributions in binary form must reproduce the above copyright
00014  *    notice, this list of conditions and the following disclaimer in the
00015  *    documentation and/or other materials provided with the distribution.
00016  *
00017  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
00018  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00019  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
00020  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
00021  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
00022  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
00023  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
00024  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00025  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
00026  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
00027  * SUCH DAMAGE.
00028  */
00029 
00030 #include "regint.h"
00031 
00032 
/* Non-zero when byte `c` is below 0x9e.  left_adjust_char_head() scans
   backwards until this holds to locate the first byte of a character. */
#define emacsmule_islead(c)    (((UChar )(c)) <= 0x9d)
00034 
00035 /*
00036     CHARACTER := ASCII_CHAR | MULTIBYTE_CHAR
00037     MULTIBYTE_CHAR := PRIMARY_CHAR_1 | PRIMARY_CHAR_2
00038                       | SECONDARY_CHAR_1 | SECONDARY_CHAR_2
00039     PRIMARY_CHAR_1   := LEADING_CODE_PRI C1
00040     PRIMARY_CHAR_2   := LEADING_CODE_PRI C1 C2
00041     SECONDARY_CHAR_1 := LEADING_CODE_SEC LEADING_CODE_EXT C1
00042     SECONDARY_CHAR_2 := LEADING_CODE_SEC LEADING_CODE_EXT C1 C2
00043     ASCII_CHAR := 0 | 1 | ... | 127
00044     LEADING_CODE_PRI := 129 | 130 | ... | 153
00045     LEADING_CODE_SEC := 154 | 155 | 156 | 157
00046     C1, C2, LEADING_CODE_EXT := 160 | 161 | ... | 255
00047  */
00048 
/*
 * Byte length associated with each possible first byte (index 0x00..0xff).
 * mbc_enc_len() subtracts the number of bytes already consumed from this
 * value to report how many bytes are still missing when the input ends in
 * the middle of a character.
 */
static const int EncLen_EmacsMule[] = {
  /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x80 */ 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x90 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 1, 1,
  /* 0xa0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xb0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xc0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xd0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xe0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xf0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
};
00067 
/*
 * States of the byte-sequence recognizer driven by mbc_enc_len().
 * Negative values are terminal: ACCEPT ends a well-formed character,
 * FAILURE rejects the sequence.  S0 is the start state.
 */
typedef enum { FAILURE = -2, ACCEPT = -1, S0 = 0, S1, S2, S3, S4, S5, S6 } state_t;
#define A ACCEPT
#define F FAILURE
/*
 * trans[state][byte] gives the next state.
 *
 *   S0  first byte:  0x00..0x7f accept (single byte), 0x81..0x8f -> S1,
 *                    0x90..0x99 -> S2, 0x9a -> S3, 0x9b -> S4,
 *                    0x9c -> S5, 0x9d -> S6, everything else fails.
 *   S1  one byte left:   0xa0..0xff accepts.
 *   S2  two bytes left:  0xa0..0xff -> S1.
 *   S3  extension byte after 0x9a: 0xa0..0xdf -> S1.
 *   S4  extension byte after 0x9b: 0xe0..0xef -> S1.
 *   S5  extension byte after 0x9c: 0xf0..0xf4 -> S2.
 *   S6  extension byte after 0x9d: 0xf5..0xfe -> S2.
 *
 * Each of the four lead bytes 0x9a..0x9d must enter its own extension
 * state; otherwise S3 would be unreachable and sequences starting with
 * 0x9a would be checked against the 0xe0..0xef range that belongs to 0x9b.
 */
static const signed char trans[][0x100] = {
  { /* S0   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
    /* 0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 1 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 2 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 3 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 4 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 5 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 6 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 7 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 8 */ F, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 9 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 4, 5, 6, F, F,
    /* a */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* b */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
  },
  { /* S1   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
    /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* a */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* b */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* c */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* d */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* e */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* f */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A
  },
  { /* S2   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
    /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* a */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* c */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* d */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* e */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
  },
  { /* S3   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
    /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* a */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* c */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* d */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
  },
  { /* S4   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
    /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* a */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* b */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* e */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
  },
  { /* S5   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
    /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* a */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* b */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* f */ 2, 2, 2, 2, 2, F, F, F, F, F, F, F, F, F, F, F
  },
  { /* S6   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
    /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* a */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* b */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* f */ F, F, F, F, F, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, F
  },
};
#undef A
#undef F
00202 
00203 static int
00204 mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc ARG_UNUSED)
00205 {
00206   int firstbyte = *p++;
00207   state_t s;
00208   s = trans[0][firstbyte];
00209   if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(1) :
00210                                   ONIGENC_CONSTRUCT_MBCLEN_INVALID();
00211   if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_EmacsMule[firstbyte]-1);
00212   s = trans[s][*p++];
00213   if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(2) :
00214                                   ONIGENC_CONSTRUCT_MBCLEN_INVALID();
00215   if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_EmacsMule[firstbyte]-2);
00216   s = trans[s][*p++];
00217   if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(3) :
00218                                   ONIGENC_CONSTRUCT_MBCLEN_INVALID();
00219   if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_EmacsMule[firstbyte]-3);
00220   s = trans[s][*p++];
00221   return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(4) :
00222                        ONIGENC_CONSTRUCT_MBCLEN_INVALID();
00223 }
00224 
00225 static OnigCodePoint
00226 mbc_to_code(const UChar* p, const UChar* end, OnigEncoding enc)
00227 {
00228   int c, i, len;
00229   OnigCodePoint n;
00230 
00231   len = enclen(enc, p, end);
00232   n = (OnigCodePoint )*p++;
00233   if (len == 1) return n;
00234 
00235   for (i = 1; i < len; i++) {
00236     if (p >= end) break;
00237     c = *p++;
00238     n <<= 8;  n += c;
00239   }
00240   return n;
00241 }
00242 
00243 static int
00244 code_to_mbclen(OnigCodePoint code, OnigEncoding enc ARG_UNUSED)
00245 {
00246   if (ONIGENC_IS_CODE_ASCII(code)) return 1;
00247   else if (code > 0xffffffff) return 0;
00248   else if ((code & 0xff000000) >= 0x80000000) return 4;
00249   else if ((code &   0xff0000) >= 0x800000) return 3;
00250   else if ((code &     0xff00) >= 0x8000) return 2;
00251   else
00252     return ONIGERR_INVALID_CODE_POINT_VALUE;
00253 }
00254 
00255 static int
00256 code_to_mbc(OnigCodePoint code, UChar *buf, OnigEncoding enc)
00257 {
00258   UChar *p = buf;
00259 
00260   if ((code & 0xff000000) != 0) *p++ = (UChar )(((code >> 24) & 0xff));
00261   if ((code &   0xff0000) != 0) *p++ = (UChar )(((code >> 16) & 0xff));
00262   if ((code &     0xff00) != 0) *p++ = (UChar )(((code >>  8) & 0xff));
00263   *p++ = (UChar )(code & 0xff);
00264 
00265   if (enclen(enc, buf, p) != (p - buf))
00266     return ONIGERR_INVALID_CODE_POINT_VALUE;
00267   return p - buf;
00268 }
00269 
00270 static int
00271 mbc_case_fold(OnigCaseFoldType flag,
00272               const UChar** pp, const UChar* end, UChar* lower,
00273               OnigEncoding enc)
00274 {
00275   int len;
00276   const UChar* p = *pp;
00277 
00278   if (ONIGENC_IS_MBC_ASCII(p)) {
00279     *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
00280     (*pp)++;
00281     return 1;
00282   }
00283   else {
00284     int i;
00285 
00286     len = mbc_enc_len(p, end, enc);
00287     for (i = 0; i < len; i++) {
00288       *lower++ = *p++;
00289     }
00290     (*pp) += len;
00291     return len; /* return byte length of converted char to lower */
00292   }
00293 }
00294 
00295 static UChar*
00296 left_adjust_char_head(const UChar* start, const UChar* s, const UChar* end, OnigEncoding enc)
00297 {
00298   const UChar *p;
00299 
00300   if (s <= start) return (UChar* )s;
00301   p = s;
00302 
00303   while (!emacsmule_islead(*p) && p > start) p--;
00304   return (UChar* )p;
00305 }
00306 
00307 static int
00308 is_code_ctype(OnigCodePoint code, unsigned int ctype, OnigEncoding enc ARG_UNUSED)
00309 {
00310   if (code < 128)
00311     return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
00312   else
00313     return (code_to_mbclen(code, enc) > 1 ? TRUE : FALSE);
00314 }
00315 
00316 /*
00317  * Name: Emacs-Mule
00318  * Link: http://www.m17n.org/mule/pricai96/mule.en.html
00319  */
00320 OnigEncodingDefine(emacs_mule, Emacs_Mule) = {
00321   mbc_enc_len,
00322   "Emacs-Mule",   /* name */
00323   4,          /* max enc length */
00324   1,          /* min enc length */
00325   onigenc_is_mbc_newline_0x0a,
00326   mbc_to_code,
00327   code_to_mbclen,
00328   code_to_mbc,
00329   mbc_case_fold,
00330   onigenc_ascii_apply_all_case_fold,
00331   onigenc_ascii_get_case_fold_codes_by_str,
00332   onigenc_minimum_property_name_to_ctype,
00333   is_code_ctype,
00334   onigenc_not_support_get_ctype_code_range,
00335   left_adjust_char_head,
00336   onigenc_always_true_is_allowed_reverse_match,
00337   0
00338 };
00339 
00340 ENC_REPLICATE("stateless-ISO-2022-JP", "Emacs-Mule")
00341 

Generated on Thu Sep 8 2011 03:48:05 for Ruby by  doxygen 1.7.1