Open Chinese Convert
0.4.3
A project for conversion between Traditional and Simplified Chinese
|
00001 /* 00002 * Open Chinese Convert 00003 * 00004 * Copyright 2010-2013 BYVoid <byvoid@byvoid.com> 00005 * 00006 * Licensed under the Apache License, Version 2.0 (the "License"); 00007 * you may not use this file except in compliance with the License. 00008 * You may obtain a copy of the License at 00009 * 00010 * http://www.apache.org/licenses/LICENSE-2.0 00011 * 00012 * Unless required by applicable law or agreed to in writing, software 00013 * distributed under the License is distributed on an "AS IS" BASIS, 00014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 * See the License for the specific language governing permissions and 00016 * limitations under the License. 00017 */ 00018 00019 #include "datrie.h" 00020 #include <fcntl.h> 00021 #include <unistd.h> 00022 00023 #ifdef __WIN32 00024 00025 /* Todo: Win32 mmap*/ 00026 #else /* ifdef __WIN32 */ 00027 # include <sys/mman.h> 00028 # define MMAP_ENABLED 00029 #endif /* ifdef __WIN32 */ 00030 00031 typedef enum { 00032 MEMORY_TYPE_MMAP, 00033 MEMORY_TYPE_ALLOCATE 00034 } memory_type; 00035 00036 typedef struct { 00037 const DatrieItem* dat; 00038 uint32_t dat_item_count; 00039 ucs4_t* lexicon; 00040 uint32_t lexicon_count; 00041 00042 ucs4_t*** lexicon_set; 00043 void* dic_memory; 00044 size_t dic_size; 00045 memory_type dic_memory_type; 00046 } DatrieDict; 00047 00048 static int load_allocate(DatrieDict* datrie_dictionary, int fd) { 00049 datrie_dictionary->dic_memory_type = MEMORY_TYPE_ALLOCATE; 00050 datrie_dictionary->dic_memory = malloc(datrie_dictionary->dic_size); 00051 00052 if (datrie_dictionary->dic_memory == NULL) { 00053 /* 內存申請失敗 */ 00054 return -1; 00055 } 00056 lseek(fd, 0, SEEK_SET); 00057 00058 if (read(fd, datrie_dictionary->dic_memory, 00059 datrie_dictionary->dic_size) == -1) { 00060 /* 讀取失敗 */ 00061 return -1; 00062 } 00063 return 0; 00064 } 00065 00066 static int load_mmap(DatrieDict* datrie_dictionary, int fd) { 00067 #ifdef MMAP_ENABLED 00068 datrie_dictionary->dic_memory_type = MEMORY_TYPE_MMAP; 00069 datrie_dictionary->dic_memory = mmap(NULL, 00070 datrie_dictionary->dic_size, 00071 PROT_READ, 00072 MAP_PRIVATE, 00073 fd, 00074 0); 00075 00076 if (datrie_dictionary->dic_memory == MAP_FAILED) { 00077 /* 內存映射創建失敗 */ 00078 datrie_dictionary->dic_memory = NULL; 00079 return -1; 00080 } 00081 return 0; 00082 00083 #else /* ifdef MMAP_ENABLED */ 00084 return -1; 00085 00086 #endif /* ifdef MMAP_ENABLED */ 00087 } 00088 00089 static int load_dict(DatrieDict* datrie_dictionary, FILE* fp) { 00090 int fd = fileno(fp); 00091 00092 fseek(fp, 0, SEEK_END); 00093 datrie_dictionary->dic_size = ftell(fp); 00094 00095 /* 首先嘗試mmap,如果失敗嘗試申請內存 */ 00096 if (load_mmap(datrie_dictionary, fd) == -1) { 00097 if (load_allocate(datrie_dictionary, fd) == -1) { 00098 return -1; 00099 } 00100 } 00101 00102 size_t header_len = strlen("OPENCCDATRIE"); 00103 00104 if (strncmp((const char*)datrie_dictionary->dic_memory, "OPENCCDATRIE", 00105 header_len) != 0) { 00106 return -1; 00107 } 00108 00109 size_t offset = 0; 00110 00111 offset += header_len * sizeof(char); 00112 00113 /* 詞彙表 */ 00114 uint32_t lexicon_length = 00115 *((uint32_t*)(datrie_dictionary->dic_memory + offset)); 00116 offset += sizeof(uint32_t); 00117 00118 datrie_dictionary->lexicon = (ucs4_t*)(datrie_dictionary->dic_memory + offset); 00119 offset += lexicon_length * sizeof(ucs4_t); 00120 00121 /* 詞彙索引表 */ 00122 uint32_t lexicon_index_length = 00123 *((uint32_t*)(datrie_dictionary->dic_memory + offset)); 00124 offset += sizeof(uint32_t); 00125 00126 uint32_t* lexicon_index = (uint32_t*)(datrie_dictionary->dic_memory + offset); 00127 offset += lexicon_index_length * sizeof(uint32_t); 00128 00129 datrie_dictionary->lexicon_count = 00130 *((uint32_t*)(datrie_dictionary->dic_memory + offset)); 00131 offset += sizeof(uint32_t); 00132 00133 datrie_dictionary->dat_item_count = 00134 *((uint32_t*)(datrie_dictionary->dic_memory + offset)); 00135 offset += sizeof(uint32_t); 00136 00137 datrie_dictionary->dat = 00138 (DatrieItem*)(datrie_dictionary->dic_memory + offset); 00139 00140 /* 構造索引表 */ 00141 datrie_dictionary->lexicon_set = (ucs4_t***)malloc( 00142 datrie_dictionary->lexicon_count * sizeof(ucs4_t * *)); 00143 size_t i, last = 0; 00144 00145 for (i = 0; i < datrie_dictionary->lexicon_count; i++) { 00146 size_t count, j; 00147 00148 for (j = last; j < lexicon_index_length; j++) { 00149 if (lexicon_index[j] == (uint32_t)-1) { 00150 break; 00151 } 00152 } 00153 count = j - last; 00154 00155 datrie_dictionary->lexicon_set[i] = 00156 (ucs4_t**)malloc((count + 1) * sizeof(ucs4_t*)); 00157 00158 for (j = 0; j < count; j++) { 00159 datrie_dictionary->lexicon_set[i][j] = 00160 datrie_dictionary->lexicon + lexicon_index[last + j]; 00161 } 00162 datrie_dictionary->lexicon_set[i][count] = NULL; 00163 last += j + 1; 00164 } 00165 00166 return 0; 00167 } 00168 00169 static int unload_dict(DatrieDict* datrie_dictionary) { 00170 if (datrie_dictionary->dic_memory != NULL) { 00171 size_t i; 00172 00173 for (i = 0; i < datrie_dictionary->lexicon_count; i++) { 00174 free(datrie_dictionary->lexicon_set[i]); 00175 } 00176 free(datrie_dictionary->lexicon_set); 00177 00178 if (MEMORY_TYPE_MMAP == datrie_dictionary->dic_memory_type) { 00179 #ifdef MMAP_ENABLED 00180 return munmap(datrie_dictionary->dic_memory, datrie_dictionary->dic_size); 00181 00182 #else /* ifdef MMAP_ENABLED */ 00183 debug_should_not_be_here(); 00184 #endif /* ifdef MMAP_ENABLED */ 00185 } else if (MEMORY_TYPE_ALLOCATE == datrie_dictionary->dic_memory_type) { 00186 free(datrie_dictionary->dic_memory); 00187 } else { 00188 return -1; 00189 } 00190 } 00191 return 0; 00192 } 00193 00194 Dict* dict_datrie_new(const char* filename) { 00195 DatrieDict* datrie_dictionary = (DatrieDict*)malloc( 00196 sizeof(DatrieDict)); 00197 00198 datrie_dictionary->dat = NULL; 00199 datrie_dictionary->lexicon = NULL; 00200 00201 FILE* fp = fopen(filename, "rb"); 00202 00203 if (load_dict(datrie_dictionary, fp) == -1) { 00204 dict_datrie_delete((Dict*)datrie_dictionary); 00205 return (Dict*)-1; 00206 } 00207 00208 fclose(fp); 00209 00210 return (Dict*)datrie_dictionary; 00211 } 00212 00213 int dict_datrie_delete(Dict* dict) { 00214 DatrieDict* datrie_dictionary = 00215 (DatrieDict*)dict; 00216 00217 if (unload_dict(datrie_dictionary) == -1) { 00218 free(datrie_dictionary); 00219 return -1; 00220 } 00221 00222 free(datrie_dictionary); 00223 return 0; 00224 } 00225 00226 int encode_char(ucs4_t ch) { 00227 return (int)ch; 00228 } 00229 00230 void datrie_match(const DatrieDict* datrie_dictionary, 00231 const ucs4_t* word, 00232 size_t* match_pos, 00233 size_t* id, 00234 size_t limit) { 00235 int i, p; 00236 00237 for (i = 0, p = 0; word[p] && (limit == 0 || (size_t)p < limit) && 00238 datrie_dictionary->dat[i].base != DATRIE_UNUSED; p++) { 00239 int k = encode_char(word[p]); 00240 int j = datrie_dictionary->dat[i].base + k; 00241 00242 if ((j < 0) || ((size_t)j >= datrie_dictionary->dat_item_count) || 00243 (datrie_dictionary->dat[j].parent != i)) { 00244 break; 00245 } 00246 i = j; 00247 } 00248 00249 if (match_pos) { 00250 *match_pos = p; 00251 } 00252 00253 if (id) { 00254 *id = i; 00255 } 00256 } 00257 00258 const ucs4_t* const* dict_datrie_match_longest(Dict* dict, 00259 const ucs4_t* word, 00260 size_t maxlen, 00261 size_t* match_length) { 00262 DatrieDict* datrie_dictionary = 00263 (DatrieDict*)dict; 00264 00265 size_t pos, item; 00266 00267 datrie_match(datrie_dictionary, word, &pos, &item, maxlen); 00268 00269 while (datrie_dictionary->dat[item].word == -1 && pos > 1) { 00270 datrie_match(datrie_dictionary, word, &pos, &item, pos - 1); 00271 } 00272 00273 if ((pos == 0) || (datrie_dictionary->dat[item].word == -1)) { 00274 if (match_length != NULL) { 00275 *match_length = 0; 00276 } 00277 return NULL; 00278 } 00279 00280 if (match_length != NULL) { 00281 *match_length = pos; 00282 } 00283 00284 return (const ucs4_t* const*) 00285 datrie_dictionary->lexicon_set[datrie_dictionary->dat[item].word]; 00286 } 00287 00288 size_t dict_datrie_get_all_match_lengths(Dict* dict, 00289 const ucs4_t* word, 00290 size_t* match_length) { 00291 DatrieDict* datrie_dictionary = 00292 (DatrieDict*)dict; 00293 00294 size_t rscnt = 0; 00295 00296 int i, p; 00297 00298 for (i = 0, p = 0; word[p] && datrie_dictionary->dat[i].base != DATRIE_UNUSED; 00299 p++) { 00300 int k = encode_char(word[p]); 00301 int j = datrie_dictionary->dat[i].base + k; 00302 00303 if ((j < 0) || ((size_t)j >= datrie_dictionary->dat_item_count) || 00304 (datrie_dictionary->dat[j].parent != i)) { 00305 break; 00306 } 00307 i = j; 00308 00309 if (datrie_dictionary->dat[i].word != -1) { 00310 match_length[rscnt++] = p + 1; 00311 } 00312 } 00313 00314 return rscnt; 00315 }