Open Chinese Convert  0.4.3
A project for conversion between Traditional and Simplified Chinese
/usr/src/RPM/BUILD/opencc-0.4.3/src/tools/opencc_dict.c
00001 /*
00002  * Open Chinese Convert
00003  *
00004  * Copyright 2010-2013 BYVoid <byvoid@byvoid.com>
00005  *
00006  * Licensed under the Apache License, Version 2.0 (the "License");
00007  * you may not use this file except in compliance with the License.
00008  * You may obtain a copy of the License at
00009  *
00010  *      http://www.apache.org/licenses/LICENSE-2.0
00011  *
00012  * Unless required by applicable law or agreed to in writing, software
00013  * distributed under the License is distributed on an "AS IS" BASIS,
00014  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  * See the License for the specific language governing permissions and
00016  * limitations under the License.
00017  */
00018 
00019 #include "../dictionary/datrie.h"
00020 #include "../dictionary/text.h"
00021 #include "../dict_group.h"
00022 #include "../encoding.h"
00023 #include "../utils.h"
00024 #include <locale.h>
00025 #include <unistd.h>
00026 
00027 #ifndef VERSION
00028 #define VERSION ""
00029 #endif
00030 
00031 #define DATRIE_SIZE 1000000
00032 #define DATRIE_WORD_MAX_COUNT 500000
00033 #define DATRIE_WORD_MAX_LENGTH 32
00034 #define BUFFER_SIZE 1024
00035 
00036 typedef struct {
00037   uint32_t cursor;
00038   ucs4_t* pointer;
00039 } Value;
00040 
00041 typedef struct {
00042   ucs4_t* key;
00043   Value* value;
00044   size_t length;
00045   size_t value_count;
00046 } Entry;
00047 
00048 Entry lexicon[DATRIE_WORD_MAX_COUNT];
00049 uint32_t lexicon_count, words_set_count;
00050 int words_set[DATRIE_WORD_MAX_COUNT];
00051 ucs4_t words_set_char[DATRIE_WORD_MAX_COUNT];
00052 DatrieItem dat[DATRIE_SIZE];
00053 uint32_t lexicon_index_length, lexicon_cursor_end;
00054 
00055 void match_word(const DatrieItem* dat,
00056                 const ucs4_t* word,
00057                 int* match_pos,
00058                 int* id,
00059                 int limit) {
00060   int i, j, p;
00061   for (i = 0, p = 0;
00062        word[p] && (limit == 0 || p < limit) && dat[i].base != DATRIE_UNUSED;
00063        p++) {
00064     int k = encode_char(word[p]);
00065     j = dat[i].base + k;
00066     if ((j < 0) || (j > DATRIE_SIZE) || (dat[j].parent != i)) {
00067       break;
00068     }
00069     i = j;
00070   }
00071   if (match_pos) {
00072     *match_pos = p;
00073   }
00074   if (id) {
00075     *id = i;
00076   }
00077 }
00078 
00079 int unused(int i) {
00080   if ((i >= 0) && (i < DATRIE_SIZE)) {
00081     return dat[i].parent == DATRIE_UNUSED;
00082   }
00083   return 0;
00084 }
00085 
00086 int is_prefix(const ucs4_t* a, const ucs4_t* b) {
00087   const ucs4_t* p = a, * q = b;
00088   while (*p != 0) {
00089     if (*q == 0) {
00090       return 0;
00091     }
00092     if (*p != *q) {
00093       return 0;
00094     }
00095     p++;
00096     q++;
00097   }
00098   return 1;
00099 }
00100 
00101 int binary_search(const ucs4_t* str) {
00102   int a = 0, b = lexicon_count - 1, c;
00103   while (a + 1 < b) {
00104     c = (a + b) / 2;
00105 
00106     if (ucs4cmp(str, lexicon[c].key) <= 0) {
00107       b = c;
00108     } else {
00109       a = c + 1;
00110     }
00111   }
00112   if (is_prefix(str,
00113                 lexicon[a].key) &&
00114       ((a == 0) || !is_prefix(str, lexicon[a - 1].key))) {
00115     return a;
00116   }
00117   if (is_prefix(str, lexicon[b].key) && !is_prefix(str, lexicon[b - 1].key)) {
00118     return b;
00119   }
00120   return -1;
00121 }
00122 
00123 int wcmp(const void* a, const void* b) {
00124   return *(const ucs4_t*)a < *(const ucs4_t*)b ? -1 : 1;
00125 }
00126 
00127 void get_words_with_prefix(ucs4_t* word, int p) {
00128   int i;
00129   static ucs4_t buff[DATRIE_WORD_MAX_LENGTH];
00130   static ucs4_t words_set_char_buff[DATRIE_WORD_MAX_COUNT];
00131 
00132   for (i = 0; i < p; i++) {
00133     buff[i] = word[i];
00134   }
00135   buff[p] = 0;
00136   words_set_count = 0;
00137   for (i = binary_search(buff);
00138        (uint32_t)i < lexicon_count && is_prefix(buff, lexicon[i].key); i++) {
00139     if (ucs4cmp(buff, lexicon[i].key) == 0) {
00140       continue;
00141     }
00142     words_set_char_buff[words_set_count] = lexicon[i].key[p];
00143     words_set[words_set_count++] = i;
00144   }
00145   words_set_char_buff[words_set_count] = 0;
00146   qsort(words_set_char_buff, words_set_count, sizeof(words_set_char_buff[0]),
00147         wcmp);
00148   ucs4_t* wfp, * wp, last;
00149   for (last = 0, wfp = words_set_char_buff, wp = words_set_char; *wfp; wfp++) {
00150     if (*wfp != last) {
00151       last = *wfp;
00152       *wp = *wfp;
00153       wp++;
00154     }
00155   }
00156   *wp = 0;
00157 }
00158 
00159 int words_space_available(int delta) {
00160   ucs4_t* wp;
00161   for (wp = words_set_char; *wp; wp++) {
00162     if (!unused(encode_char(*wp) + delta)) {
00163       return 0;
00164     }
00165   }
00166   return 1;
00167 }
00168 
00169 void insert_first_char(int id) {
00170   Entry* word = lexicon + id;
00171   int key = encode_char(word->key[0]);
00172   dat[key].base = DATRIE_UNUSED;
00173   dat[key].parent = 0;
00174   if (word->length == 1) {
00175     dat[key].word = (id);
00176   }
00177 }
00178 
00179 void insert_words(int delta, int parent, size_t word_len) {
00180   int i;
00181   for (i = 0; (uint32_t)i < words_set_count; i++) {
00182     int j = words_set[i];
00183     int k = encode_char(lexicon[j].key[word_len]) + delta;
00184     dat[k].parent = parent;
00185     if (lexicon[j].length == word_len + 1) {
00186       dat[k].word = (j);
00187     }
00188   }
00189 }
00190 
00191 void insert(int id) {
00192   static int space_min = 0;
00193   Entry* word = &lexicon[id];
00194   for (;;) {
00195     int p, i;
00196     match_word(dat, word->key, &p, &i, 0);
00197     if ((size_t)p == word->length) {
00198       return;
00199     }
00200     get_words_with_prefix(word->key, p);
00201     int delta;
00202     delta = space_min - words_set_char[0];
00203     for (; delta < DATRIE_SIZE; delta++) {
00204       if (words_space_available(delta)) {
00205         break;
00206       }
00207     }
00208     if (delta == DATRIE_SIZE) {
00209       fprintf(stderr, "DATRIE_SIZE Not Enough!\n");
00210       exit(1);
00211     }
00212     insert_words(delta, i, p);
00213     dat[i].base = delta;
00214     while (!unused(space_min)) {
00215       space_min++;
00216     }
00217   }
00218 }
00219 
00220 void make(void) {
00221   size_t i;
00222   for (i = 1; i < DATRIE_SIZE; i++) {
00223     dat[i].parent = dat[i].base = DATRIE_UNUSED;
00224     dat[i].word = -1;
00225   }
00226   dat[0].parent = dat[0].base = 0;
00227   for (i = 0; i < lexicon_count; i++) {
00228     insert_first_char(i);
00229   }
00230   for (i = 0; i < lexicon_count; i++) {
00231     insert(i);
00232   }
00233 }
00234 
00235 int cmp(const void* a, const void* b) {
00236   return ucs4cmp(((const TextEntry*)a)->key, ((const TextEntry*)b)->key);
00237 }
00238 
00239 void init(const char* filename) {
00240   DictGroup* DictGroup = dict_group_new(NULL);
00241   if (dict_group_load(DictGroup, filename,
00242                             OPENCC_DICTIONARY_TYPE_TEXT) == -1) {
00243     dictionary_perror("Dictionary loading error");
00244     fprintf(stderr, _("\n"));
00245     exit(1);
00246   }
00247   Dict* dict_abs = dict_group_get_dict(DictGroup, 0);
00248   if (dict_abs == (Dict*)-1) {
00249     dictionary_perror("Dictionary loading error");
00250     fprintf(stderr, _("\n"));
00251     exit(1);
00252   }
00253   static TextEntry tlexicon[DATRIE_WORD_MAX_COUNT];
00254   /* TODO add datrie support */
00255   Dict* dictionary = dict_abs->dict;
00256   lexicon_count = dict_text_get_lexicon(dictionary, tlexicon);
00257   qsort(tlexicon, lexicon_count, sizeof(tlexicon[0]), cmp);
00258   size_t i;
00259   size_t lexicon_cursor = 0;
00260   for (i = 0; i < lexicon_count; i++) {
00261     lexicon[i].key = tlexicon[i].key;
00262     lexicon[i].length = ucs4len(lexicon[i].key);
00263     size_t j;
00264     for (j = 0; tlexicon[i].value[j] != NULL; j++) {}
00265     lexicon[i].value_count = j;
00266     lexicon_index_length += lexicon[i].value_count + 1;
00267     lexicon[i].value = (Value*)malloc(lexicon[i].value_count * sizeof(Value));
00268     for (j = 0; j < lexicon[i].value_count; j++) {
00269       lexicon[i].value[j].cursor = lexicon_cursor;
00270       lexicon[i].value[j].pointer = tlexicon[i].value[j];
00271       lexicon_cursor += ucs4len(tlexicon[i].value[j]) + 1;
00272     }
00273   }
00274   lexicon_cursor_end = lexicon_cursor;
00275 }
00276 
00277 void output(const char* file_name) {
00278   FILE* fp = fopen(file_name, "wb");
00279   if (!fp) {
00280     fprintf(stderr, _("Can not write file: %s\n"), file_name);
00281     exit(1);
00282   }
00283   uint32_t i, item_count;
00284   for (i = DATRIE_SIZE - 1; i > 0; i--) {
00285     if (dat[i].parent != DATRIE_UNUSED) {
00286       break;
00287     }
00288   }
00289   item_count = i + 1;
00290   fwrite("OPENCCDATRIE", sizeof(char), strlen("OPENCCDATRIE"), fp);
00291   /* 詞彙表長度 */
00292   fwrite(&lexicon_cursor_end, sizeof(uint32_t), 1, fp);
00293   for (i = 0; i < lexicon_count; i++) {
00294     size_t j;
00295     for (j = 0; j < lexicon[i].value_count; j++) {
00296       fwrite(lexicon[i].value[j].pointer, sizeof(ucs4_t),
00297              ucs4len(lexicon[i].value[j].pointer) + 1, fp);
00298     }
00299   }
00300   /* 詞彙索引表長度 */
00301   fwrite(&lexicon_index_length, sizeof(uint32_t), 1, fp);
00302   for (i = 0; i < lexicon_count; i++) {
00303     size_t j;
00304     for (j = 0; j < lexicon[i].value_count; j++) {
00305       fwrite(&lexicon[i].value[j].cursor, sizeof(uint32_t), 1, fp);
00306     }
00307     uint32_t dem = (uint32_t)-1;
00308     fwrite(&dem, sizeof(uint32_t), 1, fp);             /* 分隔符 */
00309   }
00310   fwrite(&lexicon_count, sizeof(uint32_t), 1, fp);
00311   fwrite(&item_count, sizeof(uint32_t), 1, fp);
00312   fwrite(dat, sizeof(dat[0]), item_count, fp);
00313   fclose(fp);
00314 }
00315 
00316 #ifdef DEBUG_WRITE_TEXT
00317 void write_text_file() {
00318   FILE* fp;
00319   int i;
00320   fp = fopen("datrie.txt", "w");
00321   fprintf(fp, "%d\n", lexicon_count);
00322   for (i = 0; i < lexicon_count; i++) {
00323     char* buff = ucs4_to_utf8(lexicon[i].value, (size_t)-1);
00324     fprintf(fp, "%s\n", buff);
00325     free(buff);
00326   }
00327   for (i = 0; i < DATRIE_SIZE; i++) {
00328     if (dat[i].parent != DATRIE_UNUSED) {
00329       fprintf(fp, "%d %d %d %d\n", i, dat[i].base, dat[i].parent, dat[i].word);
00330     }
00331   }
00332   fclose(fp);
00333 }
00334 
00335 #endif /* ifdef DEBUG_WRITE_TEXT */
00336 
00337 void show_version() {
00338   printf(_("\nOpen Chinese Convert (OpenCC) Dictionary Tool\nVersion %s\n\n"),
00339          VERSION);
00340 }
00341 
/*
 * Prints the version banner followed by the command line help to stdout.
 *
 * The help lines take no arguments, so they are written with fputs() rather
 * than being used as printf() format strings: a translated message can then
 * never be interpreted as a format specification.
 */
void show_usage() {
  show_version();
  fputs(_("Usage:\n"), stdout);
  fputs(_("  opencc_dict -i input_file -o output_file\n\n"), stdout);
  fputs(_("    -i input_file\n"), stdout);
  fputs(_("      Read data from input_file.\n"), stdout);
  fputs(_("    -o output_file\n"), stdout);
  fputs(_("      Write converted data to output_file.\n"), stdout);
  fputs(_("\n"), stdout);
  fputs(_("\n"), stdout);
}
00353 
00354 int main(int argc, char** argv) {
00355   static int oc;
00356   static char input_file[BUFFER_SIZE], output_file[BUFFER_SIZE];
00357   int input_file_specified = 0, output_file_specified = 0;
00358 
00359 #ifdef ENABLE_GETTEXT
00360   setlocale(LC_ALL, "");
00361   bindtextdomain(PACKAGE_NAME, LOCALEDIR);
00362 #endif /* ifdef ENABLE_GETTEXT */
00363   while ((oc = getopt(argc, argv, "vh-:i:o:")) != -1) {
00364     switch (oc) {
00365     case 'v':
00366       show_version();
00367       return 0;
00368     case 'h':
00369     case '?':
00370       show_usage();
00371       return 0;
00372     case '-':
00373       if (strcmp(optarg, "version") == 0) {
00374         show_version();
00375       } else if (strcmp(optarg, "help") == 0) {
00376         show_usage();
00377       } else {
00378         show_usage();
00379       }
00380       return 0;
00381     case 'i':
00382       strcpy(input_file, optarg);
00383       input_file_specified = 1;
00384       break;
00385     case 'o':
00386       strcpy(output_file, optarg);
00387       output_file_specified = 1;
00388       break;
00389     }
00390   }
00391   if (!input_file_specified) {
00392     fprintf(stderr, _("Please specify input file using -i.\n"));
00393     show_usage();
00394     return 1;
00395   }
00396   if (!output_file_specified) {
00397     fprintf(stderr, _("Please specify output file using -o.\n"));
00398     show_usage();
00399     return 1;
00400   }
00401   init(input_file);
00402   make();
00403   output(output_file);
00404 #ifdef DEBUG_WRITE_TEXT
00405   write_text_file();
00406 #endif /* ifdef DEBUG_WRITE_TEXT */
00407   return 0;
00408 }
 All Data Structures Files Functions Variables Defines