• Main Page
  • Modules
  • Data Structures
  • Files
  • File List
  • Globals

enc/iso_8859_1.c

Go to the documentation of this file.
00001 /**********************************************************************
00002   iso_8859_1.c -  Oniguruma (regular expression library)
00003 **********************************************************************/
00004 /*-
00005  * Copyright (c) 2002-2007  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp>
00006  * All rights reserved.
00007  *
00008  * Redistribution and use in source and binary forms, with or without
00009  * modification, are permitted provided that the following conditions
00010  * are met:
00011  * 1. Redistributions of source code must retain the above copyright
00012  *    notice, this list of conditions and the following disclaimer.
00013  * 2. Redistributions in binary form must reproduce the above copyright
00014  *    notice, this list of conditions and the following disclaimer in the
00015  *    documentation and/or other materials provided with the distribution.
00016  *
00017  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
00018  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00019  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
00020  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
00021  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
00022  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
00023  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
00024  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00025  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
00026  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
00027  * SUCH DAMAGE.
00028  */
00029 
00030 #include "regenc.h"
00031 
/*
 * Evaluates to 1 when the EncISO_8859_1_CtypeTable entry for `code` has the
 * bit selected by CTYPE_TO_BIT(ctype) set, and to 0 otherwise.
 * `code` is used directly as an index into the 256-entry table and is not
 * range-checked here, so callers must guarantee code < 256.
 */
00032 #define ENC_IS_ISO_8859_1_CTYPE(code,ctype) \
00033   ((EncISO_8859_1_CtypeTable[code] & CTYPE_TO_BIT(ctype)) != 0)
00034 
/*
 * Character-class bit masks, one 16-bit entry per byte value 0x00..0xff.
 * Entries are read through ENC_IS_ISO_8859_1_CTYPE, which ANDs an entry with
 * CTYPE_TO_BIT(ctype); the meaning of each bit is defined by CTYPE_TO_BIT and
 * the BIT_CTYPE_* constants, which live outside this file.
 * Each row below holds eight consecutive codes; the trailing comment gives
 * the index range of the row.
 */
00035 static const unsigned short EncISO_8859_1_CtypeTable[256] = {
00036   0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, /* 0x00-0x07 */
00037   0x4008, 0x420c, 0x4209, 0x4208, 0x4208, 0x4208, 0x4008, 0x4008, /* 0x08-0x0f */
00038   0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, /* 0x10-0x17 */
00039   0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, /* 0x18-0x1f */
00040   0x4284, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, /* 0x20-0x27 */
00041   0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, /* 0x28-0x2f */
00042   0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, /* 0x30-0x37 */
00043   0x78b0, 0x78b0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, /* 0x38-0x3f */
00044   0x41a0, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x74a2, /* 0x40-0x47 */
00045   0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, /* 0x48-0x4f */
00046   0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, /* 0x50-0x57 */
00047   0x74a2, 0x74a2, 0x74a2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x51a0, /* 0x58-0x5f */
00048   0x41a0, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x70e2, /* 0x60-0x67 */
00049   0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, /* 0x68-0x6f */
00050   0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, /* 0x70-0x77 */
00051   0x70e2, 0x70e2, 0x70e2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x4008, /* 0x78-0x7f */
00052   0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, /* 0x80-0x87 */
00053   0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, /* 0x88-0x8f */
00054   0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, /* 0x90-0x97 */
00055   0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, /* 0x98-0x9f */
00056   0x0284, 0x01a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, /* 0xa0-0xa7 */
00057   0x00a0, 0x00a0, 0x30e2, 0x01a0, 0x00a0, 0x01a0, 0x00a0, 0x00a0, /* 0xa8-0xaf */
00058   0x00a0, 0x00a0, 0x10a0, 0x10a0, 0x00a0, 0x30e2, 0x00a0, 0x01a0, /* 0xb0-0xb7 */
00059   0x00a0, 0x10a0, 0x30e2, 0x01a0, 0x10a0, 0x10a0, 0x10a0, 0x01a0, /* 0xb8-0xbf */
00060   0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, /* 0xc0-0xc7 */
00061   0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, /* 0xc8-0xcf */
00062   0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x00a0, /* 0xd0-0xd7 */
00063   0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x30e2, /* 0xd8-0xdf */
00064   0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, /* 0xe0-0xe7 */
00065   0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, /* 0xe8-0xef */
00066   0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x00a0, /* 0xf0-0xf7 */
00067   0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2  /* 0xf8-0xff */
00068 };
00069 
/*
 * Case-fold pairs for the non-ASCII letters of this encoding, handed to
 * onigenc_apply_all_case_fold_with_map() by apply_all_case_fold().
 * Every pair is { code, code + 0x20 }, covering 0xc0..0xde on the left and
 * 0xe0..0xfe on the right. There is no entry for 0xd7/0xf7 (the list jumps
 * from 0xd6 to 0xd8), and none for 0xdf or 0xff.
 */
00070 static const OnigPairCaseFoldCodes CaseFoldMap[] = {
00071   { 0xc0, 0xe0 },
00072   { 0xc1, 0xe1 },
00073   { 0xc2, 0xe2 },
00074   { 0xc3, 0xe3 },
00075   { 0xc4, 0xe4 },
00076   { 0xc5, 0xe5 },
00077   { 0xc6, 0xe6 },
00078   { 0xc7, 0xe7 },
00079   { 0xc8, 0xe8 },
00080   { 0xc9, 0xe9 },
00081   { 0xca, 0xea },
00082   { 0xcb, 0xeb },
00083   { 0xcc, 0xec },
00084   { 0xcd, 0xed },
00085   { 0xce, 0xee },
00086   { 0xcf, 0xef },
00087 
00088   { 0xd0, 0xf0 },
00089   { 0xd1, 0xf1 },
00090   { 0xd2, 0xf2 },
00091   { 0xd3, 0xf3 },
00092   { 0xd4, 0xf4 },
00093   { 0xd5, 0xf5 },
00094   { 0xd6, 0xf6 },
00095   { 0xd8, 0xf8 }, /* 0xd7 intentionally has no pair */
00096   { 0xd9, 0xf9 },
00097   { 0xda, 0xfa },
00098   { 0xdb, 0xfb },
00099   { 0xdc, 0xfc },
00100   { 0xdd, 0xfd },
00101   { 0xde, 0xfe }
00102 };
00103 
00104 static int
00105 apply_all_case_fold(OnigCaseFoldType flag,
00106                     OnigApplyAllCaseFoldFunc f, void* arg,
00107                     OnigEncoding enc ARG_UNUSED)
00108 {
00109   return onigenc_apply_all_case_fold_with_map(
00110             sizeof(CaseFoldMap)/sizeof(OnigPairCaseFoldCodes), CaseFoldMap, 1,
00111             flag, f, arg);
00112 }
00113 
00114 static int
00115 get_case_fold_codes_by_str(OnigCaseFoldType flag,
00116                            const OnigUChar* p, const OnigUChar* end,
00117                            OnigCaseFoldCodeItem items[],
00118                            OnigEncoding enc ARG_UNUSED)
00119 {
00120   if (0x41 <= *p && *p <= 0x5a) {
00121     items[0].byte_len = 1;
00122     items[0].code_len = 1;
00123     items[0].code[0] = (OnigCodePoint )(*p + 0x20);
00124     if (*p == 0x53 && end > p + 1
00125         && (*(p+1) == 0x53 || *(p+1) == 0x73)) { /* SS */
00126       items[1].byte_len = 2;
00127       items[1].code_len = 1;
00128       items[1].code[0] = (OnigCodePoint )0xdf;
00129       return 2;
00130     }
00131     else
00132       return 1;
00133   }
00134   else if (0x61 <= *p && *p <= 0x7a) {
00135     items[0].byte_len = 1;
00136     items[0].code_len = 1;
00137     items[0].code[0] = (OnigCodePoint )(*p - 0x20);
00138     if (*p == 0x73 && end > p + 1
00139         && (*(p+1) == 0x73 || *(p+1) == 0x53)) { /* ss */
00140       items[1].byte_len = 2;
00141       items[1].code_len = 1;
00142       items[1].code[0] = (OnigCodePoint )0xdf;
00143       return 2;
00144     }
00145     else
00146       return 1;
00147   }
00148   else if (0xc0 <= *p && *p <= 0xcf) {
00149     items[0].byte_len = 1;
00150     items[0].code_len = 1;
00151     items[0].code[0] = (OnigCodePoint )(*p + 0x20);
00152     return 1;
00153   }
00154   else if (0xd0 <= *p && *p <= 0xdf) {
00155     if (*p == 0xdf) {
00156       items[0].byte_len = 1;
00157       items[0].code_len = 2;
00158       items[0].code[0] = (OnigCodePoint )'s';
00159       items[0].code[1] = (OnigCodePoint )'s';
00160 
00161       items[1].byte_len = 1;
00162       items[1].code_len = 2;
00163       items[1].code[0] = (OnigCodePoint )'S';
00164       items[1].code[1] = (OnigCodePoint )'S';
00165 
00166       items[2].byte_len = 1;
00167       items[2].code_len = 2;
00168       items[2].code[0] = (OnigCodePoint )'s';
00169       items[2].code[1] = (OnigCodePoint )'S';
00170 
00171       items[3].byte_len = 1;
00172       items[3].code_len = 2;
00173       items[3].code[0] = (OnigCodePoint )'S';
00174       items[3].code[1] = (OnigCodePoint )'s';
00175 
00176       return 4;
00177     }
00178     else if (*p != 0xd7) {
00179       items[0].byte_len = 1;
00180       items[0].code_len = 1;
00181       items[0].code[0] = (OnigCodePoint )(*p + 0x20);
00182       return 1;
00183     }
00184   }
00185   else if (0xe0 <= *p && *p <= 0xef) {
00186     items[0].byte_len = 1;
00187     items[0].code_len = 1;
00188     items[0].code[0] = (OnigCodePoint )(*p - 0x20);
00189     return 1;
00190   }
00191   else if (0xf0 <= *p && *p <= 0xfe) {
00192     if (*p != 0xf7) {
00193       items[0].byte_len = 1;
00194       items[0].code_len = 1;
00195       items[0].code[0] = (OnigCodePoint )(*p - 0x20);
00196       return 1;
00197     }
00198   }
00199 
00200   return 0;
00201 }
00202 
00203 static int
00204 mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, const UChar* end ARG_UNUSED,
00205               UChar* lower, OnigEncoding enc ARG_UNUSED)
00206 {
00207   const UChar* p = *pp;
00208 
00209   if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) {
00210     *lower++ = 's';
00211     *lower   = 's';
00212     (*pp)++;
00213     return 2;
00214   }
00215 
00216   *lower = ONIGENC_ISO_8859_1_TO_LOWER_CASE(*p);
00217   (*pp)++;
00218   return 1;
00219 }
00220 
#if 0
/*
 * (Disabled code, kept for reference.)
 *
 * Report whether the character at *pp has a case counterpart, advancing *pp
 * by one byte in every case.
 *
 * Returns TRUE for 0xdf when `flag` has INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR
 * set. Otherwise the decision is taken from the UPPER/LOWER bits of
 * EncISO_8859_1_CtypeTable: upper-case letters yield TRUE, lower-case letters
 * yield TRUE unless they lie in 0xaa..0xba, and everything else yields FALSE.
 * `end` is not used.
 *
 * Fix: the lower-case test used `v | BIT_CTYPE_LOWER`, which is non-zero for
 * every input, so the final return was unreachable and every byte outside
 * 0xaa..0xba was reported as ambiguous. It now tests the bit with `&`.
 */
static int
is_mbc_ambiguous(OnigCaseFoldType flag,
                 const UChar** pp, const UChar* end)
{
  int v;
  const UChar* p = *pp;

  if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) {
    (*pp)++;
    return TRUE;
  }

  (*pp)++;
  v = (EncISO_8859_1_CtypeTable[*p] & (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER));
  if ((v & BIT_CTYPE_LOWER) != 0) {
    /* 0xdf, 0xaa, 0xb5, 0xba are lower case letter, but can't convert. */
    /* NOTE(review): the range test below only excludes 0xaa..0xba; 0xdf
       (without the multi-char flag) and 0xff still return TRUE here even
       though neither has a pair in CaseFoldMap -- confirm intent before
       re-enabling this function. */
    if (*p >= 0xaa && *p <= 0xba)
      return FALSE;
    else
      return TRUE;
  }

  return (v != 0 ? TRUE : FALSE);
}
#endif
00247 
00248 static int
00249 is_code_ctype(OnigCodePoint code, unsigned int ctype, OnigEncoding enc ARG_UNUSED)
00250 {
00251   if (code < 256)
00252     return ENC_IS_ISO_8859_1_CTYPE(code, ctype);
00253   else
00254     return FALSE;
00255 }
00256 
/*
 * Encoding definition for ISO-8859-1.
 * The initializer is positional. Both encoded lengths are 1, and four entries
 * (mbc_case_fold, apply_all_case_fold, get_case_fold_codes_by_str,
 * is_code_ctype) are the static functions defined in this file; every other
 * function entry is a shared onigenc_* helper declared elsewhere.
 * NOTE(review): the slot each position fills is fixed by the encoding struct
 * declared outside this file -- check that declaration before reordering.
 */
00257 OnigEncodingDefine(iso_8859_1, ISO_8859_1) = {
00258   onigenc_single_byte_mbc_enc_len,
00259   "ISO-8859-1",  /* name */
00260   1,             /* max enc length */
00261   1,             /* min enc length */
00262   onigenc_is_mbc_newline_0x0a,
00263   onigenc_single_byte_mbc_to_code,
00264   onigenc_single_byte_code_to_mbclen,
00265   onigenc_single_byte_code_to_mbc,
00266   mbc_case_fold,                 /* defined in this file */
00267   apply_all_case_fold,           /* defined in this file */
00268   get_case_fold_codes_by_str,    /* defined in this file */
00269   onigenc_minimum_property_name_to_ctype,
00270   is_code_ctype,                 /* defined in this file */
00271   onigenc_not_support_get_ctype_code_range,
00272   onigenc_single_byte_left_adjust_char_head,
00273   onigenc_always_true_is_allowed_reverse_match
00274 };
/* NOTE(review): presumably registers "ISO8859-1" as an alternate name for
   "ISO-8859-1"; ENC_ALIAS is defined outside this file -- confirm. */
00275 ENC_ALIAS("ISO8859-1", "ISO-8859-1")
00276 
00277 /*
00278  * Name: windows-1252
00279  * MIBenum: 2252
00280  * Link: http://www.iana.org/assignments/character-sets
00281  * Link: http://www.microsoft.com/globaldev/reference/sbcs/1252.mspx
00282  * Link: http://en.wikipedia.org/wiki/Windows-1252
00283  */
/* NOTE(review): presumably declares "Windows-1252" as a copy of the
   ISO-8859-1 definition above and "CP1252" as an alias of that copy; both
   macros are defined outside this file -- confirm. If so, Windows-1252 text
   is classified with the ISO-8859-1 table, whose entries for 0x80..0x9f are
   all 0x0008, although Windows-1252 assigns printable characters (including
   letters) in that range. */
00284 ENC_REPLICATE("Windows-1252", "ISO-8859-1")
00285 ENC_ALIAS("CP1252", "Windows-1252")
00286 

Generated on Thu Sep 8 2011 03:48:40 for Ruby by  doxygen 1.7.1