Open Chinese Convert
0.4.3
A project for conversion between Traditional and Simplified Chinese
|
00001 /* 00002 * Open Chinese Convert 00003 * 00004 * Copyright 2010-2013 BYVoid <byvoid@byvoid.com> 00005 * 00006 * Licensed under the Apache License, Version 2.0 (the "License"); 00007 * you may not use this file except in compliance with the License. 00008 * You may obtain a copy of the License at 00009 * 00010 * http://www.apache.org/licenses/LICENSE-2.0 00011 * 00012 * Unless required by applicable law or agreed to in writing, software 00013 * distributed under the License is distributed on an "AS IS" BASIS, 00014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 * See the License for the specific language governing permissions and 00016 * limitations under the License. 00017 */ 00018 00019 #include "../encoding.h" 00020 #include "text.h" 00021 00022 #define INITIAL_DICTIONARY_SIZE 1024 00023 #define ENTRY_BUFF_SIZE 128 00024 #define ENTRY_WBUFF_SIZE ENTRY_BUFF_SIZE / sizeof(size_t) 00025 00026 int qsort_entry_cmp(const void* a, const void* b) { 00027 return ucs4cmp(((TextEntry*)a)->key, ((TextEntry*)b)->key); 00028 } 00029 00030 int parse_entry(const char* buff, TextEntry* entry_i) { 00031 size_t length; 00032 const char* pbuff; 00033 00034 /* 解析鍵 */ 00035 for (pbuff = buff; *pbuff != '\t' && *pbuff != '\0'; ++pbuff) {} 00036 00037 if (*pbuff == '\0') { 00038 return -1; 00039 } 00040 length = pbuff - buff; 00041 00042 ucs4_t* ucs4_buff; 00043 ucs4_buff = utf8_to_ucs4(buff, length); 00044 00045 if (ucs4_buff == (ucs4_t*)-1) { 00046 return -1; 00047 } 00048 entry_i->key = (ucs4_t*)malloc((length + 1) * sizeof(ucs4_t)); 00049 ucs4cpy(entry_i->key, ucs4_buff); 00050 free(ucs4_buff); 00051 00052 /* 解析值 */ 00053 size_t value_i, value_count = INITIAL_DICTIONARY_SIZE; 00054 entry_i->value = (ucs4_t**)malloc(value_count * sizeof(ucs4_t*)); 00055 00056 for (value_i = 0; *pbuff != '\0' && *pbuff != '\n'; ++value_i) { 00057 if (value_i >= value_count) { 00058 value_count += value_count; 00059 entry_i->value = (ucs4_t**)realloc( 00060 entry_i->value, 00061 value_count * sizeof(ucs4_t*) 00062 ); 00063 } 00064 00065 for (buff = ++pbuff; 00066 *pbuff != ' ' && *pbuff != '\0' && *pbuff != '\n' && *pbuff != '\r'; 00067 ++pbuff) {} 00068 length = pbuff - buff; 00069 ucs4_buff = utf8_to_ucs4(buff, length); 00070 00071 if (ucs4_buff == (ucs4_t*)-1) { 00072 /* 發生錯誤 回退內存申請 */ 00073 ssize_t i; 00074 00075 for (i = value_i - 1; i >= 0; --i) { 00076 free(entry_i->value[i]); 00077 } 00078 free(entry_i->value); 00079 free(entry_i->key); 00080 return -1; 00081 } 00082 00083 entry_i->value[value_i] = (ucs4_t*)malloc((length + 1) * sizeof(ucs4_t)); 00084 ucs4cpy(entry_i->value[value_i], ucs4_buff); 00085 free(ucs4_buff); 00086 } 00087 00088 entry_i->value = (ucs4_t**)realloc( 00089 entry_i->value, 00090 value_count * sizeof(ucs4_t*) 00091 ); 00092 entry_i->value[value_i] = NULL; 00093 00094 return 0; 00095 } 00096 00097 Dict* dict_text_new(const char* filename) { 00098 TextDict* text_dictionary; 00099 00100 text_dictionary = (TextDict*)malloc(sizeof(TextDict)); 00101 text_dictionary->entry_count = INITIAL_DICTIONARY_SIZE; 00102 text_dictionary->max_length = 0; 00103 text_dictionary->lexicon = (TextEntry*)malloc( 00104 sizeof(TextEntry) * text_dictionary->entry_count); 00105 text_dictionary->word_buff = NULL; 00106 00107 static char buff[ENTRY_BUFF_SIZE]; 00108 00109 FILE* fp = fopen(filename, "r"); 00110 00111 if (fp == NULL) { 00112 dict_text_delete((Dict*)text_dictionary); 00113 return (Dict*)-1; 00114 } 00115 skip_utf8_bom(fp); 00116 00117 size_t i = 0; 00118 00119 while (fgets(buff, ENTRY_BUFF_SIZE, fp)) { 00120 if (i >= text_dictionary->entry_count) { 00121 text_dictionary->entry_count += text_dictionary->entry_count; 00122 text_dictionary->lexicon = (TextEntry*)realloc( 00123 text_dictionary->lexicon, 00124 sizeof(TextEntry) * text_dictionary->entry_count 00125 ); 00126 } 00127 00128 if (parse_entry(buff, text_dictionary->lexicon + i) == -1) { 00129 text_dictionary->entry_count = i; 00130 dict_text_delete((Dict*)text_dictionary); 00131 return (Dict*)-1; 00132 } 00133 00134 size_t length = ucs4len(text_dictionary->lexicon[i].key); 00135 00136 if (length > text_dictionary->max_length) { 00137 text_dictionary->max_length = length; 00138 } 00139 00140 i++; 00141 } 00142 00143 fclose(fp); 00144 00145 text_dictionary->entry_count = i; 00146 text_dictionary->lexicon = (TextEntry*)realloc( 00147 text_dictionary->lexicon, 00148 sizeof(TextEntry) * text_dictionary->entry_count 00149 ); 00150 text_dictionary->word_buff = (ucs4_t*) 00151 malloc(sizeof(ucs4_t) * 00152 (text_dictionary->max_length + 1)); 00153 00154 qsort(text_dictionary->lexicon, 00155 text_dictionary->entry_count, 00156 sizeof(text_dictionary->lexicon[0]), 00157 qsort_entry_cmp 00158 ); 00159 00160 return (Dict*)text_dictionary; 00161 } 00162 00163 void dict_text_delete(Dict* dict) { 00164 TextDict* text_dictionary = (TextDict*)dict; 00165 00166 size_t i; 00167 00168 for (i = 0; i < text_dictionary->entry_count; ++i) { 00169 free(text_dictionary->lexicon[i].key); 00170 00171 ucs4_t** j; 00172 00173 for (j = text_dictionary->lexicon[i].value; *j; ++j) { 00174 free(*j); 00175 } 00176 free(text_dictionary->lexicon[i].value); 00177 } 00178 00179 free(text_dictionary->lexicon); 00180 free(text_dictionary->word_buff); 00181 free(text_dictionary); 00182 } 00183 00184 const ucs4_t* const* dict_text_match_longest(Dict* dict, 00185 const ucs4_t* word, 00186 size_t maxlen, 00187 size_t* match_length) { 00188 TextDict* text_dictionary = (TextDict*)dict; 00189 00190 if (text_dictionary->entry_count == 0) { 00191 return NULL; 00192 } 00193 00194 if (maxlen == 0) { 00195 maxlen = ucs4len(word); 00196 } 00197 size_t len = text_dictionary->max_length; 00198 00199 if (maxlen < len) { 00200 len = maxlen; 00201 } 00202 00203 ucs4ncpy(text_dictionary->word_buff, word, len); 00204 text_dictionary->word_buff[len] = L'\0'; 00205 00206 TextEntry buff; 00207 buff.key = text_dictionary->word_buff; 00208 00209 for (; len > 0; len--) { 00210 text_dictionary->word_buff[len] = L'\0'; 00211 TextEntry* brs = (TextEntry*)bsearch( 00212 &buff, 00213 text_dictionary->lexicon, 00214 text_dictionary->entry_count, 00215 sizeof(text_dictionary->lexicon[0]), 00216 qsort_entry_cmp 00217 ); 00218 00219 if (brs != NULL) { 00220 if (match_length != NULL) { 00221 *match_length = len; 00222 } 00223 return (const ucs4_t* const*)brs->value; 00224 } 00225 } 00226 00227 if (match_length != NULL) { 00228 *match_length = 0; 00229 } 00230 return NULL; 00231 } 00232 00233 size_t dict_text_get_all_match_lengths(Dict* dict, 00234 const ucs4_t* word, 00235 size_t* match_length) { 00236 TextDict* text_dictionary = (TextDict*)dict; 00237 00238 size_t rscnt = 0; 00239 00240 if (text_dictionary->entry_count == 0) { 00241 return rscnt; 00242 } 00243 00244 size_t length = ucs4len(word); 00245 size_t len = text_dictionary->max_length; 00246 00247 if (length < len) { 00248 len = length; 00249 } 00250 00251 ucs4ncpy(text_dictionary->word_buff, word, len); 00252 text_dictionary->word_buff[len] = L'\0'; 00253 00254 TextEntry buff; 00255 buff.key = text_dictionary->word_buff; 00256 00257 for (; len > 0; len--) { 00258 text_dictionary->word_buff[len] = L'\0'; 00259 TextEntry* brs = (TextEntry*)bsearch( 00260 &buff, 00261 text_dictionary->lexicon, 00262 text_dictionary->entry_count, 00263 sizeof(text_dictionary->lexicon[0]), 00264 qsort_entry_cmp 00265 ); 00266 00267 if (brs != NULL) { 00268 match_length[rscnt++] = len; 00269 } 00270 } 00271 00272 return rscnt; 00273 } 00274 00275 size_t dict_text_get_lexicon(Dict* dict, TextEntry* lexicon) { 00276 TextDict* text_dictionary = (TextDict*)dict; 00277 00278 size_t i; 00279 00280 for (i = 0; i < text_dictionary->entry_count; i++) { 00281 lexicon[i].key = text_dictionary->lexicon[i].key; 00282 lexicon[i].value = text_dictionary->lexicon[i].value; 00283 } 00284 00285 return text_dictionary->entry_count; 00286 }