• Main Page
  • Modules
  • Data Structures
  • Files
  • File List
  • Globals

enc/utf_8.c

Go to the documentation of this file.
00001 /**********************************************************************
00002   utf_8.c -  Oniguruma (regular expression library)
00003 **********************************************************************/
00004 /*-
00005  * Copyright (c) 2002-2007  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp>
00006  * All rights reserved.
00007  *
00008  * Redistribution and use in source and binary forms, with or without
00009  * modification, are permitted provided that the following conditions
00010  * are met:
00011  * 1. Redistributions of source code must retain the above copyright
00012  *    notice, this list of conditions and the following disclaimer.
00013  * 2. Redistributions in binary form must reproduce the above copyright
00014  *    notice, this list of conditions and the following disclaimer in the
00015  *    documentation and/or other materials provided with the distribution.
00016  *
00017  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
00018  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00019  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
00020  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
00021  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
00022  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
00023  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
00024  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00025  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
00026  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
00027  * SUCH DAMAGE.
00028  */
00029 
00030 #include "regenc.h"
00031 
/*
 * When defined, the stand-alone bytes 0xfe and 0xff are mapped to dedicated
 * "virtual" code points (see mbc_to_code, code_to_mbclen and code_to_mbc)
 * instead of being treated as ordinary single-byte values.
 */
#define USE_INVALID_CODE_SCHEME

#if defined(USE_INVALID_CODE_SCHEME)
/* virtual code point that stands for the invalid encoding byte 0xfe */
#define INVALID_CODE_FE   0xfffffffe
/* virtual code point that stands for the invalid encoding byte 0xff */
#define INVALID_CODE_FF   0xffffffff
/* NOTE(review): not referenced in the visible part of this file */
#define VALID_CODE_LIMIT  0x7fffffff
#endif

/*
 * True unless the top two bits of `byte` are 10, i.e. true for every byte
 * outside 0x80-0xbf (the range used for continuation bytes).
 */
#define utf8_islead(byte)  ((UChar )((byte) & 0xc0) != 0x80)
00042 
/*
 * Sequence length suggested by a first byte, indexed by that byte (256
 * entries).  Bytes 0x00-0xbf and 0xf5-0xff map to 1, 0xc0-0xdf to 2,
 * 0xe0-0xef to 3 and 0xf0-0xf4 to 4.  mbc_enc_len uses it only to report
 * how many bytes are still missing from a truncated sequence.
 */
#define ENCLEN_X4(n)   n, n, n, n
#define ENCLEN_X16(n)  ENCLEN_X4(n), ENCLEN_X4(n), ENCLEN_X4(n), ENCLEN_X4(n)
#define ENCLEN_X64(n)  ENCLEN_X16(n), ENCLEN_X16(n), ENCLEN_X16(n), ENCLEN_X16(n)
static const int EncLen_UTF8[] = {
  ENCLEN_X64(1), ENCLEN_X64(1), ENCLEN_X64(1),  /* 0x00-0xbf */
  ENCLEN_X16(2), ENCLEN_X16(2),                 /* 0xc0-0xdf */
  ENCLEN_X16(3),                                /* 0xe0-0xef */
  ENCLEN_X4(4), 4,                              /* 0xf0-0xf4 */
  1, 1, 1, ENCLEN_X4(1), ENCLEN_X4(1)           /* 0xf5-0xff */
};
#undef ENCLEN_X64
#undef ENCLEN_X16
#undef ENCLEN_X4
00061 
/*
 * States of the byte-wise automaton driven by the `trans` table below.
 * Negative values are terminal; non-negative values index a row of `trans`.
 */
typedef enum {
  FAILURE = -2,  /* the byte just read cannot occur at this position */
  ACCEPT  = -1,  /* the byte just read completes a character */
  S0 = 0,        /* start: expecting the first byte */
  S1,            /* any 0x80-0xbf completes the character */
  S2,            /* after 0xe0: only 0xa0-0xbf continues (to S1) */
  S3,            /* any 0x80-0xbf continues (to S1) */
  S4,            /* after 0xed: only 0x80-0x9f continues (to S1) */
  S5,            /* after 0xf0: only 0x90-0xbf continues (to S3) */
  S6,            /* after 0xf1-0xf3: any 0x80-0xbf continues (to S3) */
  S7             /* after 0xf4: only 0x80-0x8f continues (to S3) */
} state_t;

/*
 * trans[state][byte] is the state reached from `state` on reading `byte`.
 * Rows are written as runs; the trailing comment on each line gives the
 * byte range that the run covers.
 */
#define A ACCEPT
#define F FAILURE
#define TR16(v)   v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v
#define TR32(v)   TR16(v), TR16(v)
#define TR64(v)   TR32(v), TR32(v)
#define TR128(v)  TR64(v), TR64(v)
static const signed char trans[][0x100] = {
  { /* S0 */
    TR128(A),                            /* 00-7f */
    TR64(F),                             /* 80-bf */
    F, F,                                /* c0-c1 */
    S1, S1, S1, S1, S1, S1, S1,
    S1, S1, S1, S1, S1, S1, S1,          /* c2-cf */
    TR16(S1),                            /* d0-df */
    S2,                                  /* e0 */
    S3, S3, S3, S3, S3, S3,
    S3, S3, S3, S3, S3, S3,              /* e1-ec */
    S4,                                  /* ed */
    S3, S3,                              /* ee-ef */
    S5,                                  /* f0 */
    S6, S6, S6,                          /* f1-f3 */
    S7,                                  /* f4 */
    F, F, F, F, F, F, F, F, F, F, F      /* f5-ff */
  },
  { /* S1 */
    TR128(F),                            /* 00-7f */
    TR64(A),                             /* 80-bf */
    TR64(F)                              /* c0-ff */
  },
  { /* S2 */
    TR128(F), TR32(F),                   /* 00-9f */
    TR32(S1),                            /* a0-bf */
    TR64(F)                              /* c0-ff */
  },
  { /* S3 */
    TR128(F),                            /* 00-7f */
    TR64(S1),                            /* 80-bf */
    TR64(F)                              /* c0-ff */
  },
  { /* S4 */
    TR128(F),                            /* 00-7f */
    TR32(S1),                            /* 80-9f */
    TR32(F), TR64(F)                     /* a0-ff */
  },
  { /* S5 */
    TR128(F), TR16(F),                   /* 00-8f */
    TR16(S3), TR32(S3),                  /* 90-bf */
    TR64(F)                              /* c0-ff */
  },
  { /* S6 */
    TR128(F),                            /* 00-7f */
    TR64(S3),                            /* 80-bf */
    TR64(F)                              /* c0-ff */
  },
  { /* S7 */
    TR128(F),                            /* 00-7f */
    TR16(S3),                            /* 80-8f */
    TR16(F), TR32(F), TR64(F)            /* 90-ff */
  },
};
#undef TR128
#undef TR64
#undef TR32
#undef TR16
#undef A
#undef F
00218 
00219 static int
00220 mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc ARG_UNUSED)
00221 {
00222   int firstbyte = *p++;
00223   state_t s;
00224   s = trans[0][firstbyte];
00225   if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(1) :
00226                                   ONIGENC_CONSTRUCT_MBCLEN_INVALID();
00227 
00228   if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_UTF8[firstbyte]-1);
00229   s = trans[s][*p++];
00230   if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(2) :
00231                                   ONIGENC_CONSTRUCT_MBCLEN_INVALID();
00232 
00233   if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_UTF8[firstbyte]-2);
00234   s = trans[s][*p++];
00235   if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(3) :
00236                                   ONIGENC_CONSTRUCT_MBCLEN_INVALID();
00237 
00238   if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_UTF8[firstbyte]-3);
00239   s = trans[s][*p++];
00240   return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(4) :
00241                        ONIGENC_CONSTRUCT_MBCLEN_INVALID();
00242 }
00243 
00244 static int
00245 is_mbc_newline(const UChar* p, const UChar* end, OnigEncoding enc)
00246 {
00247   if (p < end) {
00248     if (*p == 0x0a) return 1;
00249 
00250 #ifdef USE_UNICODE_ALL_LINE_TERMINATORS
00251 #ifndef USE_CRNL_AS_LINE_TERMINATOR
00252     if (*p == 0x0d) return 1;
00253 #endif
00254     if (p + 1 < end) {
00255       if (*(p+1) == 0x85 && *p == 0xc2) /* U+0085 */
00256         return 1;
00257       if (p + 2 < end) {
00258         if ((*(p+2) == 0xa8 || *(p+2) == 0xa9)
00259             && *(p+1) == 0x80 && *p == 0xe2)  /* U+2028, U+2029 */
00260           return 1;
00261       }
00262     }
00263 #endif
00264   }
00265 
00266   return 0;
00267 }
00268 
00269 static OnigCodePoint
00270 mbc_to_code(const UChar* p, const UChar* end, OnigEncoding enc)
00271 {
00272   int c, len;
00273   OnigCodePoint n;
00274 
00275   len = enclen(enc, p, end);
00276   c = *p++;
00277   if (len > 1) {
00278     len--;
00279     n = c & ((1 << (6 - len)) - 1);
00280     while (len--) {
00281       c = *p++;
00282       n = (n << 6) | (c & ((1 << 6) - 1));
00283     }
00284     return n;
00285   }
00286   else {
00287 #ifdef USE_INVALID_CODE_SCHEME
00288     if (c > 0xfd) {
00289       return ((c == 0xfe) ? INVALID_CODE_FE : INVALID_CODE_FF);
00290     }
00291 #endif
00292     return (OnigCodePoint )c;
00293   }
00294 }
00295 
00296 static int
00297 code_to_mbclen(OnigCodePoint code, OnigEncoding enc ARG_UNUSED)
00298 {
00299   if      ((code & 0xffffff80) == 0) return 1;
00300   else if ((code & 0xfffff800) == 0) return 2;
00301   else if ((code & 0xffff0000) == 0) return 3;
00302   else if ((code & 0xffe00000) == 0) return 4;
00303   else if ((code & 0xfc000000) == 0) return 5;
00304   else if ((code & 0x80000000) == 0) return 6;
00305 #ifdef USE_INVALID_CODE_SCHEME
00306   else if (code == INVALID_CODE_FE) return 1;
00307   else if (code == INVALID_CODE_FF) return 1;
00308 #endif
00309   else
00310     return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
00311 }
00312 
00313 static int
00314 code_to_mbc(OnigCodePoint code, UChar *buf, OnigEncoding enc ARG_UNUSED)
00315 {
00316 #define UTF8_TRAILS(code, shift) (UChar )((((code) >> (shift)) & 0x3f) | 0x80)
00317 #define UTF8_TRAIL0(code)        (UChar )(((code) & 0x3f) | 0x80)
00318 
00319   if ((code & 0xffffff80) == 0) {
00320     *buf = (UChar )code;
00321     return 1;
00322   }
00323   else {
00324     UChar *p = buf;
00325 
00326     if ((code & 0xfffff800) == 0) {
00327       *p++ = (UChar )(((code>>6)& 0x1f) | 0xc0);
00328     }
00329     else if ((code & 0xffff0000) == 0) {
00330       *p++ = (UChar )(((code>>12) & 0x0f) | 0xe0);
00331       *p++ = UTF8_TRAILS(code, 6);
00332     }
00333     else if ((code & 0xffe00000) == 0) {
00334       *p++ = (UChar )(((code>>18) & 0x07) | 0xf0);
00335       *p++ = UTF8_TRAILS(code, 12);
00336       *p++ = UTF8_TRAILS(code,  6);
00337     }
00338     else if ((code & 0xfc000000) == 0) {
00339       *p++ = (UChar )(((code>>24) & 0x03) | 0xf8);
00340       *p++ = UTF8_TRAILS(code, 18);
00341       *p++ = UTF8_TRAILS(code, 12);
00342       *p++ = UTF8_TRAILS(code,  6);
00343     }
00344     else if ((code & 0x80000000) == 0) {
00345       *p++ = (UChar )(((code>>30) & 0x01) | 0xfc);
00346       *p++ = UTF8_TRAILS(code, 24);
00347       *p++ = UTF8_TRAILS(code, 18);
00348       *p++ = UTF8_TRAILS(code, 12);
00349       *p++ = UTF8_TRAILS(code,  6);
00350     }
00351 #ifdef USE_INVALID_CODE_SCHEME
00352     else if (code == INVALID_CODE_FE) {
00353       *p = 0xfe;
00354       return 1;
00355     }
00356     else if (code == INVALID_CODE_FF) {
00357       *p = 0xff;
00358       return 1;
00359     }
00360 #endif
00361     else {
00362       return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
00363     }
00364 
00365     *p++ = UTF8_TRAIL0(code);
00366     return (int)(p - buf);
00367   }
00368 }
00369 
00370 static int
00371 mbc_case_fold(OnigCaseFoldType flag, const UChar** pp,
00372                    const UChar* end, UChar* fold, OnigEncoding enc)
00373 {
00374   const UChar* p = *pp;
00375 
00376   if (ONIGENC_IS_MBC_ASCII(p)) {
00377 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
00378     if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
00379       if (*p == 0x49) {
00380         *fold++ = 0xc4;
00381         *fold   = 0xb1;
00382         (*pp)++;
00383         return 2;
00384       }
00385     }
00386 #endif
00387 
00388     *fold = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
00389     (*pp)++;
00390     return 1; /* return byte length of converted char to lower */
00391   }
00392   else {
00393     return onigenc_unicode_mbc_case_fold(enc, flag, pp, end, fold);
00394   }
00395 }
00396 
00397 
00398 static int
00399 get_ctype_code_range(OnigCtype ctype, OnigCodePoint *sb_out,
00400                           const OnigCodePoint* ranges[], OnigEncoding enc ARG_UNUSED)
00401 {
00402   *sb_out = 0x80;
00403   return onigenc_unicode_ctype_code_range(ctype, ranges);
00404 }
00405 
00406 
00407 static UChar*
00408 left_adjust_char_head(const UChar* start, const UChar* s, const UChar* end, OnigEncoding enc ARG_UNUSED)
00409 {
00410   const UChar *p;
00411 
00412   if (s <= start) return (UChar* )s;
00413   p = s;
00414 
00415   while (!utf8_islead(*p) && p > start) p--;
00416   return (UChar* )p;
00417 }
00418 
00419 static int
00420 get_case_fold_codes_by_str(OnigCaseFoldType flag,
00421     const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[],
00422     OnigEncoding enc)
00423 {
00424   return onigenc_unicode_get_case_fold_codes_by_str(enc, flag, p, end, items);
00425 }
00426 
00427 OnigEncodingDefine(utf_8, UTF_8) = {  /* UTF-8 encoding descriptor; entries are positional */
00428   mbc_enc_len,  /* classifies a byte sequence with the trans automaton (at most 4 bytes) */
00429   "UTF-8",     /* name */
00430   6,           /* max byte length (code_to_mbc can emit up to 6 bytes; mbc_enc_len accepts at most 4) */
00431   1,           /* min byte length */
00432   is_mbc_newline,  /* line-terminator test */
00433   mbc_to_code,  /* bytes -> code point */
00434   code_to_mbclen,  /* code point -> encoded length */
00435   code_to_mbc,  /* code point -> bytes */
00436   mbc_case_fold,  /* ASCII folded locally, the rest via onigenc_unicode_mbc_case_fold */
00437   onigenc_unicode_apply_all_case_fold,  /* shared Unicode helper, defined outside this file */
00438   get_case_fold_codes_by_str,  /* forwards to onigenc_unicode_get_case_fold_codes_by_str */
00439   onigenc_unicode_property_name_to_ctype,  /* shared Unicode helper, defined outside this file */
00440   onigenc_unicode_is_code_ctype,  /* shared Unicode helper, defined outside this file */
00441   get_ctype_code_range,  /* sets *sb_out = 0x80, then forwards to onigenc_unicode_ctype_code_range */
00442   left_adjust_char_head,  /* steps back over 0x80-0xbf bytes to a character head */
00443   onigenc_always_true_is_allowed_reverse_match  /* NOTE(review): presumably always allows reverse matching -- defined outside this file */
00444 };
00445 ENC_ALIAS("CP65001", "UTF-8")  /* NOTE(review): presumably registers "CP65001" as another name for "UTF-8"; macro defined outside this file */
00446 
00447 /*
00448  * Name: UTF8-MAC
00449  * Link: http://developer.apple.com/documentation/MacOSX/Conceptual/BPFileSystem/BPFileSystem.html
00450  * Link: http://developer.apple.com/qa/qa2001/qa1235.html
00451  * Link: http://developer.apple.com/jp/qa/qa2001/qa1235.html
00452  * Link: http://www.gnu.org/software/emacs/NEWS.23.2
00453  */
00454 ENC_REPLICATE("UTF8-MAC", "UTF-8")  /* NOTE(review): presumably declares "UTF8-MAC" as a separate encoding that reuses the UTF-8 implementation; macro defined outside this file */
00455 ENC_ALIAS("UTF-8-MAC", "UTF8-MAC")  /* NOTE(review): presumably an additional name for "UTF8-MAC" */
00456 ENC_ALIAS("UTF-8-HFS", "UTF8-MAC") /* Emacs 23.2 */
00457 
00458 

Generated on Thu Sep 8 2011 03:50:27 for Ruby by  doxygen 1.7.1