Open Chinese Convert  0.4.3
A project for conversion between Traditional and Simplified Chinese
/usr/src/RPM/BUILD/opencc-0.4.3/src/encoding.c
00001 /*
00002  * Open Chinese Convert
00003  *
00004  * Copyright 2010-2013 BYVoid <byvoid@byvoid.com>
00005  *
00006  * Licensed under the Apache License, Version 2.0 (the "License");
00007  * you may not use this file except in compliance with the License.
00008  * You may obtain a copy of the License at
00009  *
00010  *      http://www.apache.org/licenses/LICENSE-2.0
00011  *
00012  * Unless required by applicable law or agreed to in writing, software
00013  * distributed under the License is distributed on an "AS IS" BASIS,
00014  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  * See the License for the specific language governing permissions and
00016  * limitations under the License.
00017  */
00018 
00019 #include "encoding.h"
00020 #include "opencc.h"
00021 
/* Number of elements initially allocated for a conversion output buffer. */
#define INITIAL_BUFF_SIZE 1024
/* Value (0 or 1) of bit number `pos` of `byte`; bit 0 is the least
 * significant bit. */
#define GET_BIT(byte, pos) (((byte) >> (pos)) & 1)
/* Integer with the lowest `length` bits set. The argument is parenthesized so
 * that expressions such as BITMASK(a | b) expand as intended. */
#define BITMASK(length) ((1 << (length)) - 1)
00025 
00026 ucs4_t* utf8_to_ucs4(const char* utf8, size_t length) {
00027   if (length == 0) {
00028     length = (size_t)-1;
00029   }
00030   size_t i;
00031   for (i = 0; i < length && utf8[i] != '\0'; i++) {}
00032   length = i;
00033   size_t freesize = INITIAL_BUFF_SIZE;
00034   ucs4_t* ucs4 = (ucs4_t*)malloc(sizeof(ucs4_t) * freesize);
00035   ucs4_t* pucs4 = ucs4;
00036   for (i = 0; i < length; i++) {
00037     ucs4_t byte[4] = { 0 };
00038     if (GET_BIT(utf8[i], 7) == 0) {
00039       /* U-00000000 - U-0000007F */
00040       /* 0xxxxxxx */
00041       byte[0] = utf8[i] & BITMASK(7);
00042     } else if (GET_BIT(utf8[i], 5) == 0) {
00043       /* U-00000080 - U-000007FF */
00044       /* 110xxxxx 10xxxxxx */
00045       if (i + 1 >= length) {
00046         goto err;
00047       }
00048       byte[0] = (utf8[i + 1] & BITMASK(6)) +
00049                 ((utf8[i] & BITMASK(2)) << 6);
00050       byte[1] = (utf8[i] >> 2) & BITMASK(3);
00051       i += 1;
00052     } else if (GET_BIT(utf8[i], 4) == 0) {
00053       /* U-00000800 - U-0000FFFF */
00054       /* 1110xxxx 10xxxxxx 10xxxxxx */
00055       if (i + 2 >= length) {
00056         goto err;
00057       }
00058       byte[0] = (utf8[i + 2] & BITMASK(6)) +
00059                 ((utf8[i + 1] & BITMASK(2)) << 6);
00060       byte[1] = ((utf8[i + 1] >> 2) & BITMASK(4))
00061                 + ((utf8[i] & BITMASK(4)) << 4);
00062       i += 2;
00063     } else if (GET_BIT(utf8[i], 3) == 0) {
00064       /* U-00010000 - U-001FFFFF */
00065       /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
00066       if (i + 3 >= length) {
00067         goto err;
00068       }
00069       byte[0] = (utf8[i + 3] & BITMASK(6)) +
00070                 ((utf8[i + 2] & BITMASK(2)) << 6);
00071       byte[1] = ((utf8[i + 2] >> 2) & BITMASK(4)) +
00072                 ((utf8[i + 1] & BITMASK(4)) << 4);
00073       byte[2] = ((utf8[i + 1] >> 4) & BITMASK(2)) +
00074                 ((utf8[i] & BITMASK(3)) << 2);
00075       i += 3;
00076     } else if (GET_BIT(utf8[i], 2) == 0) {
00077       /* U-00200000 - U-03FFFFFF */
00078       /* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
00079       if (i + 4 >= length) {
00080         goto err;
00081       }
00082       byte[0] = (utf8[i + 4] & BITMASK(6)) +
00083                 ((utf8[i + 3] & BITMASK(2)) << 6);
00084       byte[1] = ((utf8[i + 3] >> 2) & BITMASK(4)) +
00085                 ((utf8[i + 2] & BITMASK(4)) << 4);
00086       byte[2] = ((utf8[i + 2] >> 4) & BITMASK(2)) +
00087                 ((utf8[i + 1] & BITMASK(6)) << 2);
00088       byte[3] = utf8[i] & BITMASK(2);
00089       i += 4;
00090     } else if (GET_BIT(utf8[i], 1) == 0) {
00091       /* U-04000000 - U-7FFFFFFF */
00092       /* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
00093       if (i + 5 >= length) {
00094         goto err;
00095       }
00096       byte[0] = (utf8[i + 5] & BITMASK(6)) +
00097                 ((utf8[i + 4] & BITMASK(2)) << 6);
00098       byte[1] = ((utf8[i + 4] >> 2) & BITMASK(4)) +
00099                 ((utf8[i + 3] & BITMASK(4)) << 4);
00100       byte[2] = ((utf8[i + 3] >> 4) & BITMASK(2)) +
00101                 ((utf8[i + 2] & BITMASK(6)) << 2);
00102       byte[3] = (utf8[i + 1] & BITMASK(6)) +
00103                 ((utf8[i] & BITMASK(1)) << 6);
00104       i += 5;
00105     } else {
00106       goto err;
00107     }
00108     if (freesize == 0) {
00109       freesize = pucs4 - ucs4;
00110       ucs4 = (ucs4_t*)realloc(ucs4, sizeof(ucs4_t) * (freesize + freesize));
00111       pucs4 = ucs4 + freesize;
00112     }
00113     *pucs4 = (byte[3] << 24) + (byte[2] << 16) + (byte[1] << 8) + byte[0];
00114     pucs4++;
00115     freesize--;
00116   }
00117   length = (pucs4 - ucs4 + 1);
00118   ucs4 = (ucs4_t*)realloc(ucs4, sizeof(ucs4_t) * length);
00119   ucs4[length - 1] = 0;
00120   return ucs4;
00121 
00122 err:
00123   free(ucs4);
00124   return (ucs4_t*)-1;
00125 }
00126 
00127 char* ucs4_to_utf8(const ucs4_t* ucs4, size_t length) {
00128   if (length == 0) {
00129     length = (size_t)-1;
00130   }
00131   size_t i;
00132   for (i = 0; i < length && ucs4[i] != 0; i++) {}
00133   length = i;
00134   size_t freesize = INITIAL_BUFF_SIZE;
00135   char* utf8 = (char*)malloc(sizeof(char) * freesize);
00136   char* putf8 = utf8;
00137   for (i = 0; i < length; i++) {
00138     if ((ssize_t)freesize - 6 <= 0) {
00139       freesize = putf8 - utf8;
00140       utf8 = (char*)realloc(utf8, sizeof(char) * (freesize + freesize));
00141       putf8 = utf8 + freesize;
00142     }
00143     ucs4_t c = ucs4[i];
00144     ucs4_t byte[4] = {
00145       (c >> 0) & BITMASK(8), (c >> 8) & BITMASK(8),
00146       (c >> 16) & BITMASK(8), (c >> 24) & BITMASK(8)
00147     };
00148     size_t delta = 0;
00149     if (c <= 0x7F) {
00150       /* U-00000000 - U-0000007F */
00151       /* 0xxxxxxx */
00152       putf8[0] = byte[0] & BITMASK(7);
00153       delta = 1;
00154     } else if (c <= 0x7FF) {
00155       /* U-00000080 - U-000007FF */
00156       /* 110xxxxx 10xxxxxx */
00157       putf8[1] = 0x80 + (byte[0] & BITMASK(6));
00158       putf8[0] = 0xC0 + ((byte[0] >> 6) & BITMASK(2)) +
00159                  ((byte[1] & BITMASK(3)) << 2);
00160       delta = 2;
00161     } else if (c <= 0xFFFF) {
00162       /* U-00000800 - U-0000FFFF */
00163       /* 1110xxxx 10xxxxxx 10xxxxxx */
00164       putf8[2] = 0x80 + (byte[0] & BITMASK(6));
00165       putf8[1] = 0x80 + ((byte[0] >> 6) & BITMASK(2)) +
00166                  ((byte[1] & BITMASK(4)) << 2);
00167       putf8[0] = 0xE0 + ((byte[1] >> 4) & BITMASK(4));
00168       delta = 3;
00169     } else if (c <= 0x1FFFFF) {
00170       /* U-00010000 - U-001FFFFF */
00171       /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
00172       putf8[3] = 0x80 + (byte[0] & BITMASK(6));
00173       putf8[2] = 0x80 + ((byte[0] >> 6) & BITMASK(2)) +
00174                  ((byte[1] & BITMASK(4)) << 2);
00175       putf8[1] = 0x80 + ((byte[1] >> 4) & BITMASK(4)) +
00176                  ((byte[2] & BITMASK(2)) << 4);
00177       putf8[0] = 0xF0 + ((byte[2] >> 2) & BITMASK(3));
00178       delta = 4;
00179     } else if (c <= 0x3FFFFFF) {
00180       /* U-00200000 - U-03FFFFFF */
00181       /* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
00182       putf8[4] = 0x80 + (byte[0] & BITMASK(6));
00183       putf8[3] = 0x80 + ((byte[0] >> 6) & BITMASK(2)) +
00184                  ((byte[1] & BITMASK(4)) << 2);
00185       putf8[2] = 0x80 + ((byte[1] >> 4) & BITMASK(4)) +
00186                  ((byte[2] & BITMASK(2)) << 4);
00187       putf8[1] = 0x80 + ((byte[2] >> 2) & BITMASK(6));
00188       putf8[0] = 0xF8 + (byte[3] & BITMASK(2));
00189       delta = 5;
00190     } else if (c <= 0x7FFFFFFF) {
00191       /* U-04000000 - U-7FFFFFFF */
00192       /* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
00193       putf8[5] = 0x80 + (byte[0] & BITMASK(6));
00194       putf8[4] = 0x80 + ((byte[0] >> 6) & BITMASK(2)) +
00195                  ((byte[1] & BITMASK(4)) << 2);
00196       putf8[3] = 0x80 + ((byte[1] >> 4) & BITMASK(4)) +
00197                  ((byte[2] & BITMASK(2)) << 4);
00198       putf8[2] = 0x80 + ((byte[2] >> 2) & BITMASK(6));
00199       putf8[1] = 0x80 + (byte[3] & BITMASK(6));
00200       putf8[0] = 0xFC + ((byte[3] >> 6) & BITMASK(1));
00201       delta = 6;
00202     } else {
00203       free(utf8);
00204       return (char*)-1;
00205     }
00206     putf8 += delta;
00207     freesize -= delta;
00208   }
00209   length = (putf8 - utf8 + 1);
00210   utf8 = (char*)realloc(utf8, sizeof(char) * length);
00211   utf8[length - 1] = '\0';
00212   return utf8;
00213 }
00214 
00215 size_t ucs4len(const ucs4_t* str) {
00216   const register ucs4_t* pstr = str;
00217   while (*pstr) {
00218     ++pstr;
00219   }
00220   return pstr - str;
00221 }
00222 
00223 int ucs4cmp(const ucs4_t* src, const ucs4_t* dst) {
00224   register int ret = 0;
00225   while (!(ret = *src - *dst) && *dst) {
00226     ++src, ++dst;
00227   }
00228   return ret;
00229 }
00230 
00231 void ucs4cpy(ucs4_t* dest, const ucs4_t* src) {
00232   while (*src) {
00233     *dest++ = *src++;
00234   }
00235   *dest = 0;
00236 }
00237 
00238 void ucs4ncpy(ucs4_t* dest, const ucs4_t* src, size_t len) {
00239   while (*src && len-- > 0) {
00240     *dest++ = *src++;
00241   }
00242 }
 All Data Structures Files Functions Variables Defines