Open Chinese Convert  0.4.3
A project for conversion between Traditional and Simplified Chinese
/usr/src/RPM/BUILD/opencc-0.4.3/src/dictionary/datrie.c
00001 /*
00002  * Open Chinese Convert
00003  *
00004  * Copyright 2010-2013 BYVoid <byvoid@byvoid.com>
00005  *
00006  * Licensed under the Apache License, Version 2.0 (the "License");
00007  * you may not use this file except in compliance with the License.
00008  * You may obtain a copy of the License at
00009  *
00010  *      http://www.apache.org/licenses/LICENSE-2.0
00011  *
00012  * Unless required by applicable law or agreed to in writing, software
00013  * distributed under the License is distributed on an "AS IS" BASIS,
00014  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  * See the License for the specific language governing permissions and
00016  * limitations under the License.
00017  */
00018 
00019 #include "datrie.h"
00020 #include <fcntl.h>
00021 #include <unistd.h>
00022 
/*
 * Memory-mapped loading relies on the POSIX mmap() interface, which Windows
 * toolchains do not provide.  _WIN32 is the macro Windows compilers
 * predefine; the __WIN32 spelling is still honoured because this file
 * tested it previously.
 */
#if defined(_WIN32) || defined(__WIN32)

/* TODO: Win32 mmap */
#else /* defined(_WIN32) || defined(__WIN32) */
# include <sys/mman.h>
# define MMAP_ENABLED
#endif /* defined(_WIN32) || defined(__WIN32) */
00030 
/* How the raw dictionary image held in DatrieDict.dic_memory was obtained. */
typedef enum {
  /* The image is a mapping created with mmap(); release with munmap(). */
  MEMORY_TYPE_MMAP = 0,
  /* The image is a heap buffer created with malloc(); release with free(). */
  MEMORY_TYPE_ALLOCATE = 1
} memory_type;
00035 
00036 typedef struct {
00037   const DatrieItem* dat;
00038   uint32_t dat_item_count;
00039   ucs4_t* lexicon;
00040   uint32_t lexicon_count;
00041 
00042   ucs4_t*** lexicon_set;
00043   void* dic_memory;
00044   size_t dic_size;
00045   memory_type dic_memory_type;
00046 } DatrieDict;
00047 
00048 static int load_allocate(DatrieDict* datrie_dictionary, int fd) {
00049   datrie_dictionary->dic_memory_type = MEMORY_TYPE_ALLOCATE;
00050   datrie_dictionary->dic_memory = malloc(datrie_dictionary->dic_size);
00051 
00052   if (datrie_dictionary->dic_memory == NULL) {
00053     /* 內存申請失敗 */
00054     return -1;
00055   }
00056   lseek(fd, 0, SEEK_SET);
00057 
00058   if (read(fd, datrie_dictionary->dic_memory,
00059            datrie_dictionary->dic_size) == -1) {
00060     /* 讀取失敗 */
00061     return -1;
00062   }
00063   return 0;
00064 }
00065 
00066 static int load_mmap(DatrieDict* datrie_dictionary, int fd) {
00067 #ifdef MMAP_ENABLED
00068   datrie_dictionary->dic_memory_type = MEMORY_TYPE_MMAP;
00069   datrie_dictionary->dic_memory = mmap(NULL,
00070                                        datrie_dictionary->dic_size,
00071                                        PROT_READ,
00072                                        MAP_PRIVATE,
00073                                        fd,
00074                                        0);
00075 
00076   if (datrie_dictionary->dic_memory == MAP_FAILED) {
00077     /* 內存映射創建失敗 */
00078     datrie_dictionary->dic_memory = NULL;
00079     return -1;
00080   }
00081   return 0;
00082 
00083 #else /* ifdef MMAP_ENABLED */
00084   return -1;
00085 
00086 #endif /* ifdef MMAP_ENABLED */
00087 }
00088 
00089 static int load_dict(DatrieDict* datrie_dictionary, FILE* fp) {
00090   int fd = fileno(fp);
00091 
00092   fseek(fp, 0, SEEK_END);
00093   datrie_dictionary->dic_size = ftell(fp);
00094 
00095   /* 首先嘗試mmap,如果失敗嘗試申請內存 */
00096   if (load_mmap(datrie_dictionary, fd) == -1) {
00097     if (load_allocate(datrie_dictionary, fd) == -1) {
00098       return -1;
00099     }
00100   }
00101 
00102   size_t header_len = strlen("OPENCCDATRIE");
00103 
00104   if (strncmp((const char*)datrie_dictionary->dic_memory, "OPENCCDATRIE",
00105               header_len) != 0) {
00106     return -1;
00107   }
00108 
00109   size_t offset = 0;
00110 
00111   offset += header_len * sizeof(char);
00112 
00113   /* 詞彙表 */
00114   uint32_t lexicon_length =
00115     *((uint32_t*)(datrie_dictionary->dic_memory + offset));
00116   offset += sizeof(uint32_t);
00117 
00118   datrie_dictionary->lexicon = (ucs4_t*)(datrie_dictionary->dic_memory + offset);
00119   offset += lexicon_length * sizeof(ucs4_t);
00120 
00121   /* 詞彙索引表 */
00122   uint32_t lexicon_index_length =
00123     *((uint32_t*)(datrie_dictionary->dic_memory + offset));
00124   offset += sizeof(uint32_t);
00125 
00126   uint32_t* lexicon_index = (uint32_t*)(datrie_dictionary->dic_memory + offset);
00127   offset += lexicon_index_length * sizeof(uint32_t);
00128 
00129   datrie_dictionary->lexicon_count  =
00130     *((uint32_t*)(datrie_dictionary->dic_memory + offset));
00131   offset += sizeof(uint32_t);
00132 
00133   datrie_dictionary->dat_item_count =
00134     *((uint32_t*)(datrie_dictionary->dic_memory + offset));
00135   offset += sizeof(uint32_t);
00136 
00137   datrie_dictionary->dat =
00138     (DatrieItem*)(datrie_dictionary->dic_memory + offset);
00139 
00140   /* 構造索引表 */
00141   datrie_dictionary->lexicon_set = (ucs4_t***)malloc(
00142     datrie_dictionary->lexicon_count * sizeof(ucs4_t * *));
00143   size_t i, last = 0;
00144 
00145   for (i = 0; i < datrie_dictionary->lexicon_count; i++) {
00146     size_t count, j;
00147 
00148     for (j = last; j < lexicon_index_length; j++) {
00149       if (lexicon_index[j] == (uint32_t)-1) {
00150         break;
00151       }
00152     }
00153     count = j - last;
00154 
00155     datrie_dictionary->lexicon_set[i] =
00156       (ucs4_t**)malloc((count + 1) * sizeof(ucs4_t*));
00157 
00158     for (j = 0; j < count; j++) {
00159       datrie_dictionary->lexicon_set[i][j] =
00160         datrie_dictionary->lexicon + lexicon_index[last + j];
00161     }
00162     datrie_dictionary->lexicon_set[i][count] = NULL;
00163     last += j + 1;
00164   }
00165 
00166   return 0;
00167 }
00168 
00169 static int unload_dict(DatrieDict* datrie_dictionary) {
00170   if (datrie_dictionary->dic_memory != NULL) {
00171     size_t i;
00172 
00173     for (i = 0; i < datrie_dictionary->lexicon_count; i++) {
00174       free(datrie_dictionary->lexicon_set[i]);
00175     }
00176     free(datrie_dictionary->lexicon_set);
00177 
00178     if (MEMORY_TYPE_MMAP == datrie_dictionary->dic_memory_type) {
00179                 #ifdef MMAP_ENABLED
00180       return munmap(datrie_dictionary->dic_memory, datrie_dictionary->dic_size);
00181 
00182                 #else /* ifdef MMAP_ENABLED */
00183       debug_should_not_be_here();
00184                 #endif /* ifdef MMAP_ENABLED */
00185     } else if (MEMORY_TYPE_ALLOCATE == datrie_dictionary->dic_memory_type) {
00186       free(datrie_dictionary->dic_memory);
00187     } else {
00188       return -1;
00189     }
00190   }
00191   return 0;
00192 }
00193 
00194 Dict* dict_datrie_new(const char* filename) {
00195   DatrieDict* datrie_dictionary = (DatrieDict*)malloc(
00196     sizeof(DatrieDict));
00197 
00198   datrie_dictionary->dat = NULL;
00199   datrie_dictionary->lexicon = NULL;
00200 
00201   FILE* fp = fopen(filename, "rb");
00202 
00203   if (load_dict(datrie_dictionary, fp) == -1) {
00204     dict_datrie_delete((Dict*)datrie_dictionary);
00205     return (Dict*)-1;
00206   }
00207 
00208   fclose(fp);
00209 
00210   return (Dict*)datrie_dictionary;
00211 }
00212 
00213 int dict_datrie_delete(Dict* dict) {
00214   DatrieDict* datrie_dictionary =
00215     (DatrieDict*)dict;
00216 
00217   if (unload_dict(datrie_dictionary) == -1) {
00218     free(datrie_dictionary);
00219     return -1;
00220   }
00221 
00222   free(datrie_dictionary);
00223   return 0;
00224 }
00225 
00226 int encode_char(ucs4_t ch) {
00227   return (int)ch;
00228 }
00229 
00230 void datrie_match(const DatrieDict* datrie_dictionary,
00231                   const ucs4_t* word,
00232                   size_t* match_pos,
00233                   size_t* id,
00234                   size_t limit) {
00235   int i, p;
00236 
00237   for (i = 0, p = 0; word[p] && (limit == 0 || (size_t)p < limit) &&
00238        datrie_dictionary->dat[i].base != DATRIE_UNUSED; p++) {
00239     int k = encode_char(word[p]);
00240     int j = datrie_dictionary->dat[i].base + k;
00241 
00242     if ((j < 0) || ((size_t)j >= datrie_dictionary->dat_item_count) ||
00243         (datrie_dictionary->dat[j].parent != i)) {
00244       break;
00245     }
00246     i = j;
00247   }
00248 
00249   if (match_pos) {
00250     *match_pos = p;
00251   }
00252 
00253   if (id) {
00254     *id = i;
00255   }
00256 }
00257 
00258 const ucs4_t* const* dict_datrie_match_longest(Dict* dict,
00259                                                const ucs4_t* word,
00260                                                size_t maxlen,
00261                                                size_t* match_length) {
00262   DatrieDict* datrie_dictionary =
00263     (DatrieDict*)dict;
00264 
00265   size_t pos, item;
00266 
00267   datrie_match(datrie_dictionary, word, &pos, &item, maxlen);
00268 
00269   while (datrie_dictionary->dat[item].word == -1 && pos > 1) {
00270     datrie_match(datrie_dictionary, word, &pos, &item, pos - 1);
00271   }
00272 
00273   if ((pos == 0) || (datrie_dictionary->dat[item].word == -1)) {
00274     if (match_length != NULL) {
00275       *match_length = 0;
00276     }
00277     return NULL;
00278   }
00279 
00280   if (match_length != NULL) {
00281     *match_length = pos;
00282   }
00283 
00284   return (const ucs4_t* const*)
00285          datrie_dictionary->lexicon_set[datrie_dictionary->dat[item].word];
00286 }
00287 
00288 size_t dict_datrie_get_all_match_lengths(Dict* dict,
00289                                          const ucs4_t* word,
00290                                          size_t* match_length) {
00291   DatrieDict* datrie_dictionary =
00292     (DatrieDict*)dict;
00293 
00294   size_t rscnt = 0;
00295 
00296   int i, p;
00297 
00298   for (i = 0, p = 0; word[p] && datrie_dictionary->dat[i].base != DATRIE_UNUSED;
00299        p++) {
00300     int k = encode_char(word[p]);
00301     int j = datrie_dictionary->dat[i].base + k;
00302 
00303     if ((j < 0) || ((size_t)j >= datrie_dictionary->dat_item_count) ||
00304         (datrie_dictionary->dat[j].parent != i)) {
00305       break;
00306     }
00307     i = j;
00308 
00309     if (datrie_dictionary->dat[i].word != -1) {
00310       match_length[rscnt++] = p + 1;
00311     }
00312   }
00313 
00314   return rscnt;
00315 }
 All Data Structures Files Functions Variables Defines