Open Chinese Convert 0.4.3
A project for conversion between Traditional and Simplified Chinese
/*
 * Open Chinese Convert
 *
 * Copyright 2010-2013 BYVoid <byvoid@byvoid.com>
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "../dictionary/datrie.h"
#include "../dictionary/text.h"
#include "../dict_group.h"
#include "../encoding.h"
#include "../utils.h"
#include <locale.h>
#include <unistd.h>

#ifndef VERSION
#define VERSION ""
#endif

#define DATRIE_SIZE 1000000
#define DATRIE_WORD_MAX_COUNT 500000
#define DATRIE_WORD_MAX_LENGTH 32
#define BUFFER_SIZE 1024

typedef struct {
  uint32_t cursor;
  ucs4_t* pointer;
} Value;

typedef struct {
  ucs4_t* key;
  Value* value;
  size_t length;
  size_t value_count;
} Entry;

Entry lexicon[DATRIE_WORD_MAX_COUNT];
uint32_t lexicon_count, words_set_count;
int words_set[DATRIE_WORD_MAX_COUNT];
ucs4_t words_set_char[DATRIE_WORD_MAX_COUNT];
DatrieItem dat[DATRIE_SIZE];
uint32_t lexicon_index_length, lexicon_cursor_end;

/* Follow word through the double-array trie as far as possible.
 * On return, *match_pos holds the number of characters matched and
 * *id the index of the last node reached. */
void match_word(const DatrieItem* dat,
                const ucs4_t* word,
                int* match_pos,
                int* id,
                int limit) {
  int i, j, p;
  for (i = 0, p = 0;
       word[p] && (limit == 0 || p < limit) && dat[i].base != DATRIE_UNUSED;
       p++) {
    int k = encode_char(word[p]);
    j = dat[i].base + k;
    if ((j < 0) || (j > DATRIE_SIZE) || (dat[j].parent != i)) {
      break;
    }
    i = j;
  }
  if (match_pos) {
    *match_pos = p;
  }
  if (id) {
    *id = i;
  }
}

/* A slot is free while its parent field is still DATRIE_UNUSED. */
int unused(int i) {
  if ((i >= 0) && (i < DATRIE_SIZE)) {
    return dat[i].parent == DATRIE_UNUSED;
  }
  return 0;
}

/* Returns 1 if a is a prefix of b. */
int is_prefix(const ucs4_t* a, const ucs4_t* b) {
  const ucs4_t* p = a, * q = b;
  while (*p != 0) {
    if (*q == 0) {
      return 0;
    }
    if (*p != *q) {
      return 0;
    }
    p++;
    q++;
  }
  return 1;
}

/* Returns the index of the first lexicon entry whose key starts with str,
 * or -1 if there is none. */
int binary_search(const ucs4_t* str) {
  int a = 0, b = lexicon_count - 1, c;
  while (a + 1 < b) {
    c = (a + b) / 2;
    if (ucs4cmp(str, lexicon[c].key) <= 0) {
      b = c;
    } else {
      a = c + 1;
    }
  }
  if (is_prefix(str, lexicon[a].key) &&
      ((a == 0) || !is_prefix(str, lexicon[a - 1].key))) {
    return a;
  }
  if (is_prefix(str, lexicon[b].key) && !is_prefix(str, lexicon[b - 1].key)) {
    return b;
  }
  return -1;
}

/* Orders single ucs4_t characters for qsort(). */
int wcmp(const void* a, const void* b) {
  return *(const ucs4_t*)a < *(const ucs4_t*)b ? -1 : 1;
}

/* Collect every lexicon entry that extends the first p characters of word:
 * entry indices go into words_set, and the distinct next characters
 * (sorted) into words_set_char. */
void get_words_with_prefix(ucs4_t* word, int p) {
  int i;
  static ucs4_t buff[DATRIE_WORD_MAX_LENGTH];
  static ucs4_t words_set_char_buff[DATRIE_WORD_MAX_COUNT];

  for (i = 0; i < p; i++) {
    buff[i] = word[i];
  }
  buff[p] = 0;
  words_set_count = 0;
  for (i = binary_search(buff);
       (uint32_t)i < lexicon_count && is_prefix(buff, lexicon[i].key); i++) {
    if (ucs4cmp(buff, lexicon[i].key) == 0) {
      continue;
    }
    words_set_char_buff[words_set_count] = lexicon[i].key[p];
    words_set[words_set_count++] = i;
  }
  words_set_char_buff[words_set_count] = 0;
  qsort(words_set_char_buff, words_set_count, sizeof(words_set_char_buff[0]),
        wcmp);
  ucs4_t* wfp, * wp, last;
  for (last = 0, wfp = words_set_char_buff, wp = words_set_char; *wfp; wfp++) {
    if (*wfp != last) {
      last = *wfp;
      *wp = *wfp;
      wp++;
    }
  }
  *wp = 0;
}

/* True if every pending child character can be placed at offset delta. */
int words_space_available(int delta) {
  ucs4_t* wp;
  for (wp = words_set_char; *wp; wp++) {
    if (!unused(encode_char(*wp) + delta)) {
      return 0;
    }
  }
  return 1;
}

/* Seed the trie node for the first character of entry id. */
void insert_first_char(int id) {
  Entry* word = lexicon + id;
  int key = encode_char(word->key[0]);
  dat[key].base = DATRIE_UNUSED;
  dat[key].parent = 0;
  if (word->length == 1) {
    dat[key].word = id;
  }
}

/* Place the collected child nodes under parent at base offset delta. */
void insert_words(int delta, int parent, size_t word_len) {
  int i;
  for (i = 0; (uint32_t)i < words_set_count; i++) {
    int j = words_set[i];
    int k = encode_char(lexicon[j].key[word_len]) + delta;
    dat[k].parent = parent;
    if (lexicon[j].length == word_len + 1) {
      dat[k].word = j;
    }
  }
}

/* Insert lexicon entry id, searching for a base offset at which all
 * children of the currently matched prefix fit. */
void insert(int id) {
  static int space_min = 0;
  Entry* word = &lexicon[id];
  for (;;) {
    int p, i;
    match_word(dat, word->key, &p, &i, 0);
    if ((size_t)p == word->length) {
      return;
    }
    get_words_with_prefix(word->key, p);
    int delta = space_min - words_set_char[0];
    for (; delta < DATRIE_SIZE; delta++) {
      if (words_space_available(delta)) {
        break;
      }
    }
    if (delta == DATRIE_SIZE) {
      fprintf(stderr, "DATRIE_SIZE Not Enough!\n");
      exit(1);
    }
    insert_words(delta, i, p);
    dat[i].base = delta;
    while (!unused(space_min)) {
      space_min++;
    }
  }
}

/* Build the double-array trie from the sorted lexicon. */
void make(void) {
  size_t i;
  for (i = 1; i < DATRIE_SIZE; i++) {
    dat[i].parent = dat[i].base = DATRIE_UNUSED;
    dat[i].word = -1;
  }
  dat[0].parent = dat[0].base = 0;
  for (i = 0; i < lexicon_count; i++) {
    insert_first_char(i);
  }
  for (i = 0; i < lexicon_count; i++) {
    insert(i);
  }
}

/* Orders TextEntry items by key for qsort(). */
int cmp(const void* a, const void* b) {
  return ucs4cmp(((const TextEntry*)a)->key, ((const TextEntry*)b)->key);
}

/* Load the text dictionary, sort it by key and build the in-memory lexicon
 * together with cumulative cursors into the value table. */
void init(const char* filename) {
  DictGroup* DictGroup = dict_group_new(NULL);
  if (dict_group_load(DictGroup, filename,
                      OPENCC_DICTIONARY_TYPE_TEXT) == -1) {
    dictionary_perror("Dictionary loading error");
    fprintf(stderr, _("\n"));
    exit(1);
  }
  Dict* dict_abs = dict_group_get_dict(DictGroup, 0);
  if (dict_abs == (Dict*)-1) {
    dictionary_perror("Dictionary loading error");
    fprintf(stderr, _("\n"));
    exit(1);
  }
  static TextEntry tlexicon[DATRIE_WORD_MAX_COUNT];
  /* TODO add datrie support */
  Dict* dictionary = dict_abs->dict;
  lexicon_count = dict_text_get_lexicon(dictionary, tlexicon);
  qsort(tlexicon, lexicon_count, sizeof(tlexicon[0]), cmp);
  size_t i;
  size_t lexicon_cursor = 0;
  for (i = 0; i < lexicon_count; i++) {
    lexicon[i].key = tlexicon[i].key;
    lexicon[i].length = ucs4len(lexicon[i].key);
    size_t j;
    for (j = 0; tlexicon[i].value[j] != NULL; j++) {}
    lexicon[i].value_count = j;
    lexicon_index_length += lexicon[i].value_count + 1;
    lexicon[i].value = (Value*)malloc(lexicon[i].value_count * sizeof(Value));
    for (j = 0; j < lexicon[i].value_count; j++) {
      lexicon[i].value[j].cursor = lexicon_cursor;
      lexicon[i].value[j].pointer = tlexicon[i].value[j];
      lexicon_cursor += ucs4len(tlexicon[i].value[j]) + 1;
    }
  }
  lexicon_cursor_end = lexicon_cursor;
}

/* Serialize the value table, the index table and the trimmed double-array
 * into file_name. */
void output(const char* file_name) {
  FILE* fp = fopen(file_name, "wb");
  if (!fp) {
    fprintf(stderr, _("Can not write file: %s\n"), file_name);
    exit(1);
  }
  uint32_t i, item_count;
  for (i = DATRIE_SIZE - 1; i > 0; i--) {
    if (dat[i].parent != DATRIE_UNUSED) {
      break;
    }
  }
  item_count = i + 1;
  fwrite("OPENCCDATRIE", sizeof(char), strlen("OPENCCDATRIE"), fp);
  /* Length of the value table, in ucs4_t units */
  fwrite(&lexicon_cursor_end, sizeof(uint32_t), 1, fp);
  for (i = 0; i < lexicon_count; i++) {
    size_t j;
    for (j = 0; j < lexicon[i].value_count; j++) {
      fwrite(lexicon[i].value[j].pointer, sizeof(ucs4_t),
             ucs4len(lexicon[i].value[j].pointer) + 1, fp);
    }
  }
  /* Length of the lexicon index table */
  fwrite(&lexicon_index_length, sizeof(uint32_t), 1, fp);
  for (i = 0; i < lexicon_count; i++) {
    size_t j;
    for (j = 0; j < lexicon[i].value_count; j++) {
      fwrite(&lexicon[i].value[j].cursor, sizeof(uint32_t), 1, fp);
    }
    uint32_t dem = (uint32_t)-1;
    fwrite(&dem, sizeof(uint32_t), 1, fp); /* Separator */
  }
  fwrite(&lexicon_count, sizeof(uint32_t), 1, fp);
  fwrite(&item_count, sizeof(uint32_t), 1, fp);
  fwrite(dat, sizeof(dat[0]), item_count, fp);
  fclose(fp);
}

#ifdef DEBUG_WRITE_TEXT
/* Debug dump of the lexicon and the occupied double-array slots. */
void write_text_file() {
  FILE* fp;
  int i;
  fp = fopen("datrie.txt", "w");
  fprintf(fp, "%d\n", lexicon_count);
  for (i = 0; i < lexicon_count; i++) {
    char* buff = ucs4_to_utf8(lexicon[i].value, (size_t)-1);
    fprintf(fp, "%s\n", buff);
    free(buff);
  }
  for (i = 0; i < DATRIE_SIZE; i++) {
    if (dat[i].parent != DATRIE_UNUSED) {
      fprintf(fp, "%d %d %d %d\n", i, dat[i].base, dat[i].parent, dat[i].word);
    }
  }
  fclose(fp);
}

#endif /* ifdef DEBUG_WRITE_TEXT */

void show_version() {
  printf(_("\nOpen Chinese Convert (OpenCC) Dictionary Tool\nVersion %s\n\n"),
         VERSION);
}

void show_usage() {
  show_version();
  printf(_("Usage:\n"));
  printf(_("  opencc_dict -i input_file -o output_file\n\n"));
  printf(_("    -i input_file\n"));
  printf(_("      Read data from input_file.\n"));
  printf(_("    -o output_file\n"));
  printf(_("      Write converted data to output_file.\n"));
  printf(_("\n"));
  printf(_("\n"));
}

int main(int argc, char** argv) {
  static int oc;
  static char input_file[BUFFER_SIZE], output_file[BUFFER_SIZE];
  int input_file_specified = 0, output_file_specified = 0;

#ifdef ENABLE_GETTEXT
  setlocale(LC_ALL, "");
  bindtextdomain(PACKAGE_NAME, LOCALEDIR);
#endif /* ifdef ENABLE_GETTEXT */
  while ((oc = getopt(argc, argv, "vh-:i:o:")) != -1) {
    switch (oc) {
    case 'v':
      show_version();
      return 0;
    case 'h':
    case '?':
      show_usage();
      return 0;
    case '-':
      if (strcmp(optarg, "version") == 0) {
        show_version();
      } else if (strcmp(optarg, "help") == 0) {
        show_usage();
      } else {
        show_usage();
      }
      return 0;
    case 'i':
      strcpy(input_file, optarg);
      input_file_specified = 1;
      break;
    case 'o':
      strcpy(output_file, optarg);
      output_file_specified = 1;
      break;
    }
  }
  if (!input_file_specified) {
    fprintf(stderr, _("Please specify input file using -i.\n"));
    show_usage();
    return 1;
  }
  if (!output_file_specified) {
    fprintf(stderr, _("Please specify output file using -o.\n"));
    show_usage();
    return 1;
  }
  init(input_file);
  make();
  output(output_file);
#ifdef DEBUG_WRITE_TEXT
  write_text_file();
#endif /* ifdef DEBUG_WRITE_TEXT */
  return 0;
}
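
For reference, output() writes the literal tag "OPENCCDATRIE", then the value table length and the value strings, then the index table, and finally the trimmed double-array. The following stand-alone reader is a minimal illustrative sketch (not part of OpenCC) of decoding the leading header fields; it assumes ucs4_t is 4 bytes wide and that the file was produced by this tool.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(int argc, char** argv) {
  if (argc < 2) {
    fprintf(stderr, "usage: %s datrie_file\n", argv[0]);
    return 1;
  }
  FILE* fp = fopen(argv[1], "rb");
  if (!fp) {
    perror("fopen");
    return 1;
  }
  /* The magic tag is written without a terminating NUL, so read 12 bytes. */
  char magic[13] = {0};
  if (fread(magic, 1, 12, fp) != 12 || strcmp(magic, "OPENCCDATRIE") != 0) {
    fprintf(stderr, "not an OpenCC datrie file\n");
    fclose(fp);
    return 1;
  }
  uint32_t value_table_len = 0; /* lexicon_cursor_end, in ucs4_t units */
  uint32_t index_len = 0;       /* lexicon_index_length, in uint32 slots */
  if (fread(&value_table_len, sizeof(uint32_t), 1, fp) != 1 ||
      /* Skip the value table; assumes sizeof(ucs4_t) == 4. */
      fseek(fp, (long)value_table_len * 4, SEEK_CUR) != 0 ||
      fread(&index_len, sizeof(uint32_t), 1, fp) != 1) {
    fprintf(stderr, "truncated file\n");
    fclose(fp);
    return 1;
  }
  printf("value table: %u code units, index table: %u slots\n",
         value_table_len, index_len);
  fclose(fp);
  return 0;
}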