Open Chinese Convert
0.4.3
A project for conversion between Traditional and Simplified Chinese
|
00001 /* 00002 * Open Chinese Convert 00003 * 00004 * Copyright 2010-2013 BYVoid <byvoid@byvoid.com> 00005 * 00006 * Licensed under the Apache License, Version 2.0 (the "License"); 00007 * you may not use this file except in compliance with the License. 00008 * You may obtain a copy of the License at 00009 * 00010 * http://www.apache.org/licenses/LICENSE-2.0 00011 * 00012 * Unless required by applicable law or agreed to in writing, software 00013 * distributed under the License is distributed on an "AS IS" BASIS, 00014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 * See the License for the specific language governing permissions and 00016 * limitations under the License. 00017 */ 00018 00019 #include "encoding.h" 00020 #include "opencc.h" 00021 00022 #define INITIAL_BUFF_SIZE 1024 00023 #define GET_BIT(byte, pos) (((byte) >> (pos))& 1) 00024 #define BITMASK(length) ((1 << length) - 1) 00025 00026 ucs4_t* utf8_to_ucs4(const char* utf8, size_t length) { 00027 if (length == 0) { 00028 length = (size_t)-1; 00029 } 00030 size_t i; 00031 for (i = 0; i < length && utf8[i] != '\0'; i++) {} 00032 length = i; 00033 size_t freesize = INITIAL_BUFF_SIZE; 00034 ucs4_t* ucs4 = (ucs4_t*)malloc(sizeof(ucs4_t) * freesize); 00035 ucs4_t* pucs4 = ucs4; 00036 for (i = 0; i < length; i++) { 00037 ucs4_t byte[4] = { 0 }; 00038 if (GET_BIT(utf8[i], 7) == 0) { 00039 /* U-00000000 - U-0000007F */ 00040 /* 0xxxxxxx */ 00041 byte[0] = utf8[i] & BITMASK(7); 00042 } else if (GET_BIT(utf8[i], 5) == 0) { 00043 /* U-00000080 - U-000007FF */ 00044 /* 110xxxxx 10xxxxxx */ 00045 if (i + 1 >= length) { 00046 goto err; 00047 } 00048 byte[0] = (utf8[i + 1] & BITMASK(6)) + 00049 ((utf8[i] & BITMASK(2)) << 6); 00050 byte[1] = (utf8[i] >> 2) & BITMASK(3); 00051 i += 1; 00052 } else if (GET_BIT(utf8[i], 4) == 0) { 00053 /* U-00000800 - U-0000FFFF */ 00054 /* 1110xxxx 10xxxxxx 10xxxxxx */ 00055 if (i + 2 >= length) { 00056 goto err; 00057 } 00058 byte[0] = (utf8[i + 2] & BITMASK(6)) + 00059 ((utf8[i + 1] & BITMASK(2)) << 6); 00060 byte[1] = ((utf8[i + 1] >> 2) & BITMASK(4)) 00061 + ((utf8[i] & BITMASK(4)) << 4); 00062 i += 2; 00063 } else if (GET_BIT(utf8[i], 3) == 0) { 00064 /* U-00010000 - U-001FFFFF */ 00065 /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ 00066 if (i + 3 >= length) { 00067 goto err; 00068 } 00069 byte[0] = (utf8[i + 3] & BITMASK(6)) + 00070 ((utf8[i + 2] & BITMASK(2)) << 6); 00071 byte[1] = ((utf8[i + 2] >> 2) & BITMASK(4)) + 00072 ((utf8[i + 1] & BITMASK(4)) << 4); 00073 byte[2] = ((utf8[i + 1] >> 4) & BITMASK(2)) + 00074 ((utf8[i] & BITMASK(3)) << 2); 00075 i += 3; 00076 } else if (GET_BIT(utf8[i], 2) == 0) { 00077 /* U-00200000 - U-03FFFFFF */ 00078 /* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */ 00079 if (i + 4 >= length) { 00080 goto err; 00081 } 00082 byte[0] = (utf8[i + 4] & BITMASK(6)) + 00083 ((utf8[i + 3] & BITMASK(2)) << 6); 00084 byte[1] = ((utf8[i + 3] >> 2) & BITMASK(4)) + 00085 ((utf8[i + 2] & BITMASK(4)) << 4); 00086 byte[2] = ((utf8[i + 2] >> 4) & BITMASK(2)) + 00087 ((utf8[i + 1] & BITMASK(6)) << 2); 00088 byte[3] = utf8[i] & BITMASK(2); 00089 i += 4; 00090 } else if (GET_BIT(utf8[i], 1) == 0) { 00091 /* U-04000000 - U-7FFFFFFF */ 00092 /* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */ 00093 if (i + 5 >= length) { 00094 goto err; 00095 } 00096 byte[0] = (utf8[i + 5] & BITMASK(6)) + 00097 ((utf8[i + 4] & BITMASK(2)) << 6); 00098 byte[1] = ((utf8[i + 4] >> 2) & BITMASK(4)) + 00099 ((utf8[i + 3] & BITMASK(4)) << 4); 00100 byte[2] = ((utf8[i + 3] >> 4) & BITMASK(2)) + 00101 ((utf8[i + 2] & BITMASK(6)) << 2); 00102 byte[3] = (utf8[i + 1] & BITMASK(6)) + 00103 ((utf8[i] & BITMASK(1)) << 6); 00104 i += 5; 00105 } else { 00106 goto err; 00107 } 00108 if (freesize == 0) { 00109 freesize = pucs4 - ucs4; 00110 ucs4 = (ucs4_t*)realloc(ucs4, sizeof(ucs4_t) * (freesize + freesize)); 00111 pucs4 = ucs4 + freesize; 00112 } 00113 *pucs4 = (byte[3] << 24) + (byte[2] << 16) + (byte[1] << 8) + byte[0]; 00114 pucs4++; 00115 freesize--; 00116 } 00117 length = (pucs4 - ucs4 + 1); 00118 ucs4 = (ucs4_t*)realloc(ucs4, sizeof(ucs4_t) * length); 00119 ucs4[length - 1] = 0; 00120 return ucs4; 00121 00122 err: 00123 free(ucs4); 00124 return (ucs4_t*)-1; 00125 } 00126 00127 char* ucs4_to_utf8(const ucs4_t* ucs4, size_t length) { 00128 if (length == 0) { 00129 length = (size_t)-1; 00130 } 00131 size_t i; 00132 for (i = 0; i < length && ucs4[i] != 0; i++) {} 00133 length = i; 00134 size_t freesize = INITIAL_BUFF_SIZE; 00135 char* utf8 = (char*)malloc(sizeof(char) * freesize); 00136 char* putf8 = utf8; 00137 for (i = 0; i < length; i++) { 00138 if ((ssize_t)freesize - 6 <= 0) { 00139 freesize = putf8 - utf8; 00140 utf8 = (char*)realloc(utf8, sizeof(char) * (freesize + freesize)); 00141 putf8 = utf8 + freesize; 00142 } 00143 ucs4_t c = ucs4[i]; 00144 ucs4_t byte[4] = { 00145 (c >> 0) & BITMASK(8), (c >> 8) & BITMASK(8), 00146 (c >> 16) & BITMASK(8), (c >> 24) & BITMASK(8) 00147 }; 00148 size_t delta = 0; 00149 if (c <= 0x7F) { 00150 /* U-00000000 - U-0000007F */ 00151 /* 0xxxxxxx */ 00152 putf8[0] = byte[0] & BITMASK(7); 00153 delta = 1; 00154 } else if (c <= 0x7FF) { 00155 /* U-00000080 - U-000007FF */ 00156 /* 110xxxxx 10xxxxxx */ 00157 putf8[1] = 0x80 + (byte[0] & BITMASK(6)); 00158 putf8[0] = 0xC0 + ((byte[0] >> 6) & BITMASK(2)) + 00159 ((byte[1] & BITMASK(3)) << 2); 00160 delta = 2; 00161 } else if (c <= 0xFFFF) { 00162 /* U-00000800 - U-0000FFFF */ 00163 /* 1110xxxx 10xxxxxx 10xxxxxx */ 00164 putf8[2] = 0x80 + (byte[0] & BITMASK(6)); 00165 putf8[1] = 0x80 + ((byte[0] >> 6) & BITMASK(2)) + 00166 ((byte[1] & BITMASK(4)) << 2); 00167 putf8[0] = 0xE0 + ((byte[1] >> 4) & BITMASK(4)); 00168 delta = 3; 00169 } else if (c <= 0x1FFFFF) { 00170 /* U-00010000 - U-001FFFFF */ 00171 /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ 00172 putf8[3] = 0x80 + (byte[0] & BITMASK(6)); 00173 putf8[2] = 0x80 + ((byte[0] >> 6) & BITMASK(2)) + 00174 ((byte[1] & BITMASK(4)) << 2); 00175 putf8[1] = 0x80 + ((byte[1] >> 4) & BITMASK(4)) + 00176 ((byte[2] & BITMASK(2)) << 4); 00177 putf8[0] = 0xF0 + ((byte[2] >> 2) & BITMASK(3)); 00178 delta = 4; 00179 } else if (c <= 0x3FFFFFF) { 00180 /* U-00200000 - U-03FFFFFF */ 00181 /* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */ 00182 putf8[4] = 0x80 + (byte[0] & BITMASK(6)); 00183 putf8[3] = 0x80 + ((byte[0] >> 6) & BITMASK(2)) + 00184 ((byte[1] & BITMASK(4)) << 2); 00185 putf8[2] = 0x80 + ((byte[1] >> 4) & BITMASK(4)) + 00186 ((byte[2] & BITMASK(2)) << 4); 00187 putf8[1] = 0x80 + ((byte[2] >> 2) & BITMASK(6)); 00188 putf8[0] = 0xF8 + (byte[3] & BITMASK(2)); 00189 delta = 5; 00190 } else if (c <= 0x7FFFFFFF) { 00191 /* U-04000000 - U-7FFFFFFF */ 00192 /* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */ 00193 putf8[5] = 0x80 + (byte[0] & BITMASK(6)); 00194 putf8[4] = 0x80 + ((byte[0] >> 6) & BITMASK(2)) + 00195 ((byte[1] & BITMASK(4)) << 2); 00196 putf8[3] = 0x80 + ((byte[1] >> 4) & BITMASK(4)) + 00197 ((byte[2] & BITMASK(2)) << 4); 00198 putf8[2] = 0x80 + ((byte[2] >> 2) & BITMASK(6)); 00199 putf8[1] = 0x80 + (byte[3] & BITMASK(6)); 00200 putf8[0] = 0xFC + ((byte[3] >> 6) & BITMASK(1)); 00201 delta = 6; 00202 } else { 00203 free(utf8); 00204 return (char*)-1; 00205 } 00206 putf8 += delta; 00207 freesize -= delta; 00208 } 00209 length = (putf8 - utf8 + 1); 00210 utf8 = (char*)realloc(utf8, sizeof(char) * length); 00211 utf8[length - 1] = '\0'; 00212 return utf8; 00213 } 00214 00215 size_t ucs4len(const ucs4_t* str) { 00216 const register ucs4_t* pstr = str; 00217 while (*pstr) { 00218 ++pstr; 00219 } 00220 return pstr - str; 00221 } 00222 00223 int ucs4cmp(const ucs4_t* src, const ucs4_t* dst) { 00224 register int ret = 0; 00225 while (!(ret = *src - *dst) && *dst) { 00226 ++src, ++dst; 00227 } 00228 return ret; 00229 } 00230 00231 void ucs4cpy(ucs4_t* dest, const ucs4_t* src) { 00232 while (*src) { 00233 *dest++ = *src++; 00234 } 00235 *dest = 0; 00236 } 00237 00238 void ucs4ncpy(ucs4_t* dest, const ucs4_t* src, size_t len) { 00239 while (*src && len-- > 0) { 00240 *dest++ = *src++; 00241 } 00242 }