Open Chinese Convert  0.4.3
A project for conversion between Traditional and Simplified Chinese
/usr/src/RPM/BUILD/opencc-0.4.3/src/dictionary/text.c
00001 /*
00002  * Open Chinese Convert
00003  *
00004  * Copyright 2010-2013 BYVoid <byvoid@byvoid.com>
00005  *
00006  * Licensed under the Apache License, Version 2.0 (the "License");
00007  * you may not use this file except in compliance with the License.
00008  * You may obtain a copy of the License at
00009  *
00010  *      http://www.apache.org/licenses/LICENSE-2.0
00011  *
00012  * Unless required by applicable law or agreed to in writing, software
00013  * distributed under the License is distributed on an "AS IS" BASIS,
00014  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  * See the License for the specific language governing permissions and
00016  * limitations under the License.
00017  */
00018 
00019 #include "../encoding.h"
00020 #include "text.h"
00021 
/* Initial capacity, in elements, of the growable arrays allocated in this file. */
#define INITIAL_DICTIONARY_SIZE 1024
/* Size in bytes of the buffer that receives one line of the dictionary file. */
#define ENTRY_BUFF_SIZE 128
/*
 * ENTRY_BUFF_SIZE expressed in units of sizeof(size_t).  The expansion is
 * parenthesised so it stays a single operand inside larger expressions
 * (e.g. `x / ENTRY_WBUFF_SIZE`).
 */
#define ENTRY_WBUFF_SIZE (ENTRY_BUFF_SIZE / sizeof(size_t))
00025 
00026 int qsort_entry_cmp(const void* a, const void* b) {
00027   return ucs4cmp(((TextEntry*)a)->key, ((TextEntry*)b)->key);
00028 }
00029 
00030 int parse_entry(const char* buff, TextEntry* entry_i) {
00031   size_t length;
00032   const char* pbuff;
00033 
00034   /* 解析鍵 */
00035   for (pbuff = buff; *pbuff != '\t' && *pbuff != '\0'; ++pbuff) {}
00036 
00037   if (*pbuff == '\0') {
00038     return -1;
00039   }
00040   length = pbuff - buff;
00041 
00042   ucs4_t* ucs4_buff;
00043   ucs4_buff = utf8_to_ucs4(buff, length);
00044 
00045   if (ucs4_buff == (ucs4_t*)-1) {
00046     return -1;
00047   }
00048   entry_i->key = (ucs4_t*)malloc((length + 1) * sizeof(ucs4_t));
00049   ucs4cpy(entry_i->key, ucs4_buff);
00050   free(ucs4_buff);
00051 
00052   /* 解析值 */
00053   size_t value_i, value_count = INITIAL_DICTIONARY_SIZE;
00054   entry_i->value = (ucs4_t**)malloc(value_count * sizeof(ucs4_t*));
00055 
00056   for (value_i = 0; *pbuff != '\0' && *pbuff != '\n'; ++value_i) {
00057     if (value_i >= value_count) {
00058       value_count += value_count;
00059       entry_i->value = (ucs4_t**)realloc(
00060         entry_i->value,
00061         value_count * sizeof(ucs4_t*)
00062         );
00063     }
00064 
00065     for (buff = ++pbuff;
00066          *pbuff != ' ' && *pbuff != '\0' && *pbuff != '\n' && *pbuff != '\r';
00067          ++pbuff) {}
00068     length = pbuff - buff;
00069     ucs4_buff = utf8_to_ucs4(buff, length);
00070 
00071     if (ucs4_buff == (ucs4_t*)-1) {
00072       /* 發生錯誤 回退內存申請 */
00073       ssize_t i;
00074 
00075       for (i = value_i - 1; i >= 0; --i) {
00076         free(entry_i->value[i]);
00077       }
00078       free(entry_i->value);
00079       free(entry_i->key);
00080       return -1;
00081     }
00082 
00083     entry_i->value[value_i] = (ucs4_t*)malloc((length + 1) * sizeof(ucs4_t));
00084     ucs4cpy(entry_i->value[value_i], ucs4_buff);
00085     free(ucs4_buff);
00086   }
00087 
00088   entry_i->value = (ucs4_t**)realloc(
00089     entry_i->value,
00090     value_count * sizeof(ucs4_t*)
00091     );
00092   entry_i->value[value_i] = NULL;
00093 
00094   return 0;
00095 }
00096 
00097 Dict* dict_text_new(const char* filename) {
00098   TextDict* text_dictionary;
00099 
00100   text_dictionary = (TextDict*)malloc(sizeof(TextDict));
00101   text_dictionary->entry_count = INITIAL_DICTIONARY_SIZE;
00102   text_dictionary->max_length = 0;
00103   text_dictionary->lexicon = (TextEntry*)malloc(
00104     sizeof(TextEntry) * text_dictionary->entry_count);
00105   text_dictionary->word_buff = NULL;
00106 
00107   static char buff[ENTRY_BUFF_SIZE];
00108 
00109   FILE* fp = fopen(filename, "r");
00110 
00111   if (fp == NULL) {
00112     dict_text_delete((Dict*)text_dictionary);
00113     return (Dict*)-1;
00114   }
00115   skip_utf8_bom(fp);
00116 
00117   size_t i = 0;
00118 
00119   while (fgets(buff, ENTRY_BUFF_SIZE, fp)) {
00120     if (i >= text_dictionary->entry_count) {
00121       text_dictionary->entry_count += text_dictionary->entry_count;
00122       text_dictionary->lexicon = (TextEntry*)realloc(
00123         text_dictionary->lexicon,
00124         sizeof(TextEntry) * text_dictionary->entry_count
00125         );
00126     }
00127 
00128     if (parse_entry(buff, text_dictionary->lexicon + i) == -1) {
00129       text_dictionary->entry_count = i;
00130       dict_text_delete((Dict*)text_dictionary);
00131       return (Dict*)-1;
00132     }
00133 
00134     size_t length = ucs4len(text_dictionary->lexicon[i].key);
00135 
00136     if (length > text_dictionary->max_length) {
00137       text_dictionary->max_length = length;
00138     }
00139 
00140     i++;
00141   }
00142 
00143   fclose(fp);
00144 
00145   text_dictionary->entry_count = i;
00146   text_dictionary->lexicon = (TextEntry*)realloc(
00147     text_dictionary->lexicon,
00148     sizeof(TextEntry) * text_dictionary->entry_count
00149     );
00150   text_dictionary->word_buff = (ucs4_t*)
00151                                malloc(sizeof(ucs4_t) *
00152                                       (text_dictionary->max_length + 1));
00153 
00154   qsort(text_dictionary->lexicon,
00155         text_dictionary->entry_count,
00156         sizeof(text_dictionary->lexicon[0]),
00157         qsort_entry_cmp
00158         );
00159 
00160   return (Dict*)text_dictionary;
00161 }
00162 
00163 void dict_text_delete(Dict* dict) {
00164   TextDict* text_dictionary = (TextDict*)dict;
00165 
00166   size_t i;
00167 
00168   for (i = 0; i < text_dictionary->entry_count; ++i) {
00169     free(text_dictionary->lexicon[i].key);
00170 
00171     ucs4_t** j;
00172 
00173     for (j = text_dictionary->lexicon[i].value; *j; ++j) {
00174       free(*j);
00175     }
00176     free(text_dictionary->lexicon[i].value);
00177   }
00178 
00179   free(text_dictionary->lexicon);
00180   free(text_dictionary->word_buff);
00181   free(text_dictionary);
00182 }
00183 
00184 const ucs4_t* const* dict_text_match_longest(Dict* dict,
00185                                              const ucs4_t* word,
00186                                              size_t maxlen,
00187                                              size_t* match_length) {
00188   TextDict* text_dictionary = (TextDict*)dict;
00189 
00190   if (text_dictionary->entry_count == 0) {
00191     return NULL;
00192   }
00193 
00194   if (maxlen == 0) {
00195     maxlen = ucs4len(word);
00196   }
00197   size_t len = text_dictionary->max_length;
00198 
00199   if (maxlen < len) {
00200     len = maxlen;
00201   }
00202 
00203   ucs4ncpy(text_dictionary->word_buff, word, len);
00204   text_dictionary->word_buff[len] = L'\0';
00205 
00206   TextEntry buff;
00207   buff.key = text_dictionary->word_buff;
00208 
00209   for (; len > 0; len--) {
00210     text_dictionary->word_buff[len] = L'\0';
00211     TextEntry* brs = (TextEntry*)bsearch(
00212       &buff,
00213       text_dictionary->lexicon,
00214       text_dictionary->entry_count,
00215       sizeof(text_dictionary->lexicon[0]),
00216       qsort_entry_cmp
00217       );
00218 
00219     if (brs != NULL) {
00220       if (match_length != NULL) {
00221         *match_length = len;
00222       }
00223       return (const ucs4_t* const*)brs->value;
00224     }
00225   }
00226 
00227   if (match_length != NULL) {
00228     *match_length = 0;
00229   }
00230   return NULL;
00231 }
00232 
00233 size_t dict_text_get_all_match_lengths(Dict* dict,
00234                                        const ucs4_t* word,
00235                                        size_t* match_length) {
00236   TextDict* text_dictionary = (TextDict*)dict;
00237 
00238   size_t rscnt = 0;
00239 
00240   if (text_dictionary->entry_count == 0) {
00241     return rscnt;
00242   }
00243 
00244   size_t length = ucs4len(word);
00245   size_t len = text_dictionary->max_length;
00246 
00247   if (length < len) {
00248     len = length;
00249   }
00250 
00251   ucs4ncpy(text_dictionary->word_buff, word, len);
00252   text_dictionary->word_buff[len] = L'\0';
00253 
00254   TextEntry buff;
00255   buff.key = text_dictionary->word_buff;
00256 
00257   for (; len > 0; len--) {
00258     text_dictionary->word_buff[len] = L'\0';
00259     TextEntry* brs = (TextEntry*)bsearch(
00260       &buff,
00261       text_dictionary->lexicon,
00262       text_dictionary->entry_count,
00263       sizeof(text_dictionary->lexicon[0]),
00264       qsort_entry_cmp
00265       );
00266 
00267     if (brs != NULL) {
00268       match_length[rscnt++] = len;
00269     }
00270   }
00271 
00272   return rscnt;
00273 }
00274 
00275 size_t dict_text_get_lexicon(Dict* dict, TextEntry* lexicon) {
00276   TextDict* text_dictionary = (TextDict*)dict;
00277 
00278   size_t i;
00279 
00280   for (i = 0; i < text_dictionary->entry_count; i++) {
00281     lexicon[i].key = text_dictionary->lexicon[i].key;
00282     lexicon[i].value = text_dictionary->lexicon[i].value;
00283   }
00284 
00285   return text_dictionary->entry_count;
00286 }
 All Data Structures Files Functions Variables Defines