Open Chinese Convert  0.4.3
A project for conversion between Traditional and Simplified Chinese
/usr/src/RPM/BUILD/opencc-0.4.3/src/converter.c
00001 /*
00002  * Open Chinese Convert
00003  *
00004  * Copyright 2010-2013 BYVoid <byvoid@byvoid.com>
00005  *
00006  * Licensed under the Apache License, Version 2.0 (the "License");
00007  * you may not use this file except in compliance with the License.
00008  * You may obtain a copy of the License at
00009  *
00010  *      http://www.apache.org/licenses/LICENSE-2.0
00011  *
00012  * Unless required by applicable law or agreed to in writing, software
00013  * distributed under the License is distributed on an "AS IS" BASIS,
00014  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  * See the License for the specific language governing permissions and
00016  * limitations under the License.
00017  */
00018 
00019 #include "common.h"
00020 #include "converter.h"
00021 #include "dict_group.h"
00022 #include "dict_chain.h"
00023 #include "encoding.h"
00024 
/* Code point written between listed candidates and after each segment in
 * segment-only mode. */
#define DELIMITER ' '
/* Identifiers of the available segmentation strategies. */
#define SEGMENT_MAXIMUM_LENGTH 0
#define SEGMENT_SHORTEST_PATH 1
/* Strategy compiled into this translation unit; the #if blocks below test it. */
#define SEGMENT_METHOD SEGMENT_SHORTEST_PATH
00029 
#if SEGMENT_METHOD == SEGMENT_SHORTEST_PATH
/* Buffer size requested for the scratch arrays when a converter is opened. */
# define OPENCC_SP_SEG_DEFAULT_BUFFER_SIZE 1024
/*
 * Scratch arrays used by the shortest-path segmenter (sp_seg). They are
 * allocated by sp_seg_set_buffer_size() and released by sp_seg_buffer_free().
 */
typedef struct {
  /* 1 once sp_seg_set_buffer_size() has run, 0 before that. */
  int initialized;
  /* Size the arrays were last allocated for. */
  size_t buffer_size;
  /* Receives the match lengths reported by the dictionary for one offset. */
  size_t* match_length;
  /* min_len[i]: fewest segments found that cover the first i code points. */
  size_t* min_len;
  /* parent[i]: start offset of the last segment on the best path ending at i. */
  size_t* parent;
  /* End offsets of the chosen segments, in input order. */
  size_t* path;
} SpsegData;
#endif
00041 
/* Last error recorded by this module; read through converter_errno(). */
static converter_error errnum = CONVERTER_ERROR_VOID;
00043 
00044 #if SEGMENT_METHOD == SEGMENT_SHORTEST_PATH
00045 static void sp_seg_buffer_free(SpsegData* ossb) {
00046   free(ossb->match_length);
00047   free(ossb->min_len);
00048   free(ossb->parent);
00049   free(ossb->path);
00050 }
00051 
00052 static void sp_seg_set_buffer_size(SpsegData* ossb, size_t buffer_size) {
00053   if (ossb->initialized == 1) {
00054     sp_seg_buffer_free(ossb);
00055   }
00056   ossb->buffer_size = buffer_size;
00057   ossb->match_length = (size_t*)malloc((buffer_size + 1) * sizeof(size_t));
00058   ossb->min_len = (size_t*)malloc(buffer_size * sizeof(size_t));
00059   ossb->parent = (size_t*)malloc(buffer_size * sizeof(size_t));
00060   ossb->path = (size_t*)malloc(buffer_size * sizeof(size_t));
00061   ossb->initialized = 1;
00062 }
00063 
00064 static size_t sp_seg(Converter* converter,
00065                      ucs4_t** inbuf,
00066                      size_t* inbuf_left,
00067                      ucs4_t** outbuf,
00068                      size_t* outbuf_left,
00069                      size_t length) {
00070   /* 最短路徑分詞 */
00071   /* 對長度爲1時特殊優化 */
00072   if (length == 1) {
00073     const ucs4_t* const* match_rs = dict_group_match_longest(
00074       converter->current_dict_group,
00075       *inbuf,
00076       1,
00077       NULL);
00078     size_t match_len = 1;
00079     if (converter->conversion_mode == OPENCC_CONVERSION_FAST) {
00080       if (match_rs == NULL) {
00081         **outbuf = **inbuf;
00082         (*outbuf)++, (*outbuf_left)--;
00083         (*inbuf)++, (*inbuf_left)--;
00084       } else {
00085         const ucs4_t* result = match_rs[0];
00086         /* 輸出緩衝區剩餘空間小於分詞長度 */
00087         if (ucs4len(result) > *outbuf_left) {
00088           errnum = CONVERTER_ERROR_OUTBUF;
00089           return (size_t)-1;
00090         }
00091         for (; *result; result++) {
00092           **outbuf = *result;
00093           (*outbuf)++, (*outbuf_left)--;
00094         }
00095         *inbuf += match_len;
00096         *inbuf_left -= match_len;
00097       }
00098     } else if (converter->conversion_mode ==
00099                OPENCC_CONVERSION_LIST_CANDIDATES) {
00100       if (match_rs == NULL) {
00101         **outbuf = **inbuf;
00102         (*outbuf)++, (*outbuf_left)--;
00103         (*inbuf)++, (*inbuf_left)--;
00104       } else {
00105         size_t i;
00106         for (i = 0; match_rs[i] != NULL; i++) {
00107           const ucs4_t* result = match_rs[i];
00108           int show_delimiter = match_rs[i + 1] != NULL ? 1 : 0;
00109           /* 輸出緩衝區剩餘空間小於分詞長度 */
00110           if (ucs4len(result) + show_delimiter > *outbuf_left) {
00111             errnum = CONVERTER_ERROR_OUTBUF;
00112             return (size_t)-1;
00113           }
00114           for (; *result; result++) {
00115             **outbuf = *result;
00116             (*outbuf)++, (*outbuf_left)--;
00117           }
00118           if (show_delimiter) {
00119             **outbuf = DELIMITER;
00120             (*outbuf)++, (*outbuf_left)--;
00121           }
00122         }
00123         *inbuf += match_len;
00124         *inbuf_left -= match_len;
00125       }
00126     } else if (converter->conversion_mode == OPENCC_CONVERSION_SEGMENT_ONLY) {
00127       if (match_rs == NULL) {
00128         **outbuf = **inbuf;
00129         (*outbuf)++, (*outbuf_left)--;
00130         (*inbuf)++, (*inbuf_left)--;
00131       } else {
00132         /* 輸出緩衝區剩餘空間小於分詞長度 */
00133         if (match_len + 1 > *outbuf_left) {
00134           errnum = CONVERTER_ERROR_OUTBUF;
00135           return (size_t)-1;
00136         }
00137         size_t i;
00138         for (i = 0; i < match_len; i++) {
00139           **outbuf = **inbuf;
00140           (*outbuf)++, (*outbuf_left)--;
00141           (*inbuf)++, (*inbuf_left)--;
00142         }
00143       }
00144       **outbuf = DELIMITER;
00145       (*outbuf)++, (*outbuf_left)--;
00146     } else {
00147       debug_should_not_be_here();
00148     }
00149     /* 必須保證有一個字符空間 */
00150     return match_len;
00151   }
00152 
00153   /* 設置緩衝區空間 */
00154   SpsegData* ossb = converter->data;
00155   size_t buffer_size_need = length + 1;
00156   if ((ossb->initialized == 0) || (ossb->buffer_size < buffer_size_need)) {
00157     sp_seg_set_buffer_size(ossb, buffer_size_need);
00158   }
00159   size_t i, j;
00160   for (i = 0; i <= length; i++) {
00161     ossb->min_len[i] = INFINITY_INT;
00162   }
00163   ossb->min_len[0] = ossb->parent[0] = 0;
00164   for (i = 0; i < length; i++) {
00165     /* 獲取所有匹配長度 */
00166     size_t match_count = dict_group_get_all_match_lengths(
00167       converter->current_dict_group,
00168       (*inbuf) + i,
00169       ossb->match_length
00170       );
00171     if (ossb->match_length[0] != 1) {
00172       ossb->match_length[match_count++] = 1;
00173     }
00174     /* 動態規劃求最短分割路徑 */
00175     for (j = 0; j < match_count; j++) {
00176       size_t k = ossb->match_length[j];
00177       ossb->match_length[j] = 0;
00178       if ((k > 1) && (ossb->min_len[i] + 1 <= ossb->min_len[i + k])) {
00179         ossb->min_len[i + k] = ossb->min_len[i] + 1;
00180         ossb->parent[i + k] = i;
00181       } else if ((k == 1) &&
00182                  (ossb->min_len[i] + 1 < ossb->min_len[i + k])) {
00183         ossb->min_len[i + k] = ossb->min_len[i] + 1;
00184         ossb->parent[i + k] = i;
00185       }
00186     }
00187   }
00188   /* 取得最短分割路徑 */
00189   for (i = length, j = ossb->min_len[length]; i != 0; i = ossb->parent[i]) {
00190     ossb->path[--j] = i;
00191   }
00192   size_t inbuf_left_start = *inbuf_left;
00193   size_t begin, end;
00194   /* 根據最短分割路徑轉換 */
00195   for (i = begin = 0; i < ossb->min_len[length]; i++) {
00196     end = ossb->path[i];
00197     size_t match_len;
00198     const ucs4_t* const* match_rs = dict_group_match_longest(
00199       converter->current_dict_group,
00200       *inbuf,
00201       end - begin,
00202       &match_len
00203       );
00204     if (match_rs == NULL) {
00205       **outbuf = **inbuf;
00206       (*outbuf)++, (*outbuf_left)--;
00207       (*inbuf)++, (*inbuf_left)--;
00208     } else {
00209       if (converter->conversion_mode == OPENCC_CONVERSION_FAST) {
00210         if (match_rs == NULL) {
00211           **outbuf = **inbuf;
00212           (*outbuf)++, (*outbuf_left)--;
00213           (*inbuf)++, (*inbuf_left)--;
00214         } else {
00215           const ucs4_t* result = match_rs[0];
00216           /* 輸出緩衝區剩餘空間小於分詞長度 */
00217           if (ucs4len(result) > *outbuf_left) {
00218             if (inbuf_left_start - *inbuf_left > 0) {
00219               break;
00220             }
00221             errnum = CONVERTER_ERROR_OUTBUF;
00222             return (size_t)-1;
00223           }
00224           for (; *result; result++) {
00225             **outbuf = *result;
00226             (*outbuf)++, (*outbuf_left)--;
00227           }
00228           *inbuf += match_len;
00229           *inbuf_left -= match_len;
00230         }
00231       } else if (converter->conversion_mode ==
00232                  OPENCC_CONVERSION_LIST_CANDIDATES) {
00233         if (match_rs == NULL) {
00234           **outbuf = **inbuf;
00235           (*outbuf)++, (*outbuf_left)--;
00236           (*inbuf)++, (*inbuf_left)--;
00237         } else {
00238           size_t i;
00239           for (i = 0; match_rs[i] != NULL; i++) {
00240             const ucs4_t* result = match_rs[i];
00241             int show_delimiter = match_rs[i + 1] != NULL ? 1 : 0;
00242             /* 輸出緩衝區剩餘空間小於分詞長度 */
00243             if (ucs4len(result) + show_delimiter > *outbuf_left) {
00244               if (inbuf_left_start - *inbuf_left > 0) {
00245                 break;
00246               }
00247               errnum = CONVERTER_ERROR_OUTBUF;
00248               return (size_t)-1;
00249             }
00250             for (; *result; result++) {
00251               **outbuf = *result;
00252               (*outbuf)++, (*outbuf_left)--;
00253             }
00254             if (show_delimiter) {
00255               **outbuf = DELIMITER;
00256               (*outbuf)++, (*outbuf_left)--;
00257             }
00258           }
00259           *inbuf += match_len;
00260           *inbuf_left -= match_len;
00261         }
00262       } else if (converter->conversion_mode == OPENCC_CONVERSION_SEGMENT_ONLY) {
00263         if (match_rs == NULL) {
00264           **outbuf = **inbuf;
00265           (*outbuf)++, (*outbuf_left)--;
00266           (*inbuf)++, (*inbuf_left)--;
00267         } else {
00268           /* 輸出緩衝區剩餘空間小於分詞長度 */
00269           if (match_len + 1 > *outbuf_left) {
00270             if (inbuf_left_start - *inbuf_left > 0) {
00271               break;
00272             }
00273             errnum = CONVERTER_ERROR_OUTBUF;
00274             return (size_t)-1;
00275           }
00276           size_t i;
00277           for (i = 0; i < match_len; i++) {
00278             **outbuf = **inbuf;
00279             (*outbuf)++, (*outbuf_left)--;
00280             (*inbuf)++, (*inbuf_left)--;
00281           }
00282         }
00283         **outbuf = DELIMITER;
00284         (*outbuf)++, (*outbuf_left)--;
00285       } else {
00286         debug_should_not_be_here();
00287       }
00288     }
00289     begin = end;
00290   }
00291   return inbuf_left_start - *inbuf_left;
00292 }
00293 
00294 static size_t segment(Converter* converter,
00295                       ucs4_t** inbuf,
00296                       size_t* inbuf_left,
00297                       ucs4_t** outbuf,
00298                       size_t* outbuf_left) {
00299   /* 歧義分割最短路徑分詞 */
00300   size_t i, start, bound;
00301   const ucs4_t* inbuf_start = *inbuf;
00302   size_t inbuf_left_start = *inbuf_left;
00303   size_t sp_seg_length;
00304   bound = 0;
00305   for (i = start = 0; inbuf_start[i] && *inbuf_left > 0 && *outbuf_left > 0;
00306        i++) {
00307     if ((i != 0) && (i == bound)) {
00308       /* 對歧義部分進行最短路徑分詞 */
00309       sp_seg_length = sp_seg(converter,
00310                              inbuf,
00311                              inbuf_left,
00312                              outbuf,
00313                              outbuf_left,
00314                              bound - start);
00315 
00316       if (sp_seg_length ==  (size_t)-1) {
00317         return (size_t)-1;
00318       }
00319       if (sp_seg_length == 0) {
00320         if (inbuf_left_start - *inbuf_left > 0) {
00321           return inbuf_left_start - *inbuf_left;
00322         }
00323         /* 空間不足 */
00324         errnum = CONVERTER_ERROR_OUTBUF;
00325         return (size_t)-1;
00326       }
00327       start = i;
00328     }
00329     size_t match_len;
00330     dict_group_match_longest(
00331       converter->current_dict_group,
00332       inbuf_start + i,
00333       0,
00334       &match_len
00335       );
00336     if (match_len == 0) {
00337       match_len = 1;
00338     }
00339     if (i + match_len > bound) {
00340       bound = i + match_len;
00341     }
00342   }
00343   if ((*inbuf_left > 0) && (*outbuf_left > 0)) {
00344     sp_seg_length = sp_seg(converter,
00345                            inbuf,
00346                            inbuf_left,
00347                            outbuf,
00348                            outbuf_left,
00349                            bound - start);
00350     if (sp_seg_length ==  (size_t)-1) {
00351       return (size_t)-1;
00352     }
00353     if (sp_seg_length == 0) {
00354       if (inbuf_left_start - *inbuf_left > 0) {
00355         return inbuf_left_start - *inbuf_left;
00356       }
00357       /* 空間不足 */
00358       errnum = CONVERTER_ERROR_OUTBUF;
00359       return (size_t)-1;
00360     }
00361   }
00362   if (converter->conversion_mode == OPENCC_CONVERSION_SEGMENT_ONLY) {
00363     (*outbuf)--;
00364     (*outbuf_left)++;
00365   }
00366   return inbuf_left_start - *inbuf_left;
00367 }
00368 
00369 #endif /* if SEGMENT_METHOD == SEGMENT_SHORTEST_PATH */
00370 
00371 #if SEGMENT_METHOD == SEGMENT_MAXIMUM_LENGTH
00372 static size_t segment(Converter* converter,
00373                       ucs4_t** inbuf,
00374                       size_t* inbuf_left,
00375                       ucs4_t** outbuf,
00376                       size_t* outbuf_left) {
00377   /* 正向最大分詞 */
00378   size_t inbuf_left_start = *inbuf_left;
00379   for (; **inbuf && *inbuf_left > 0 && *outbuf_left > 0;) {
00380     size_t match_len;
00381     const ucs4_t* const* match_rs = dict_group_match_longest(
00382       converter->current_dict_group,
00383       *inbuf,
00384       *inbuf_left,
00385       &match_len
00386       );
00387     if (converter->conversion_mode == OPENCC_CONVERSION_FAST) {
00388       if (match_rs == NULL) {
00389         **outbuf = **inbuf;
00390         (*outbuf)++, (*outbuf_left)--;
00391         (*inbuf)++, (*inbuf_left)--;
00392       } else {
00393         const ucs4_t* result = match_rs[0];
00394         /* 輸出緩衝區剩餘空間小於分詞長度 */
00395         if (ucs4len(result) > *outbuf_left) {
00396           if (inbuf_left_start - *inbuf_left > 0) {
00397             break;
00398           }
00399           errnum = CONVERTER_ERROR_OUTBUF;
00400           return (size_t)-1;
00401         }
00402         for (; *result; result++) {
00403           **outbuf = *result;
00404           (*outbuf)++, (*outbuf_left)--;
00405         }
00406         *inbuf += match_len;
00407         *inbuf_left -= match_len;
00408       }
00409     } else if (converter->conversion_mode ==
00410                OPENCC_CONVERSION_LIST_CANDIDATES) {
00411       if (match_rs == NULL) {
00412         **outbuf = **inbuf;
00413         (*outbuf)++, (*outbuf_left)--;
00414         (*inbuf)++, (*inbuf_left)--;
00415       } else {
00416         size_t i;
00417         for (i = 0; match_rs[i] != NULL; i++) {
00418           const ucs4_t* result = match_rs[i];
00419           int show_delimiter = match_rs[i + 1] != NULL ? 1 : 0;
00420           /* 輸出緩衝區剩餘空間小於分詞長度 */
00421           if (ucs4len(result) + show_delimiter > *outbuf_left) {
00422             if (inbuf_left_start - *inbuf_left > 0) {
00423               break;
00424             }
00425             errnum = CONVERTER_ERROR_OUTBUF;
00426             return (size_t)-1;
00427           }
00428           for (; *result; result++) {
00429             **outbuf = *result;
00430             (*outbuf)++, (*outbuf_left)--;
00431           }
00432           if (show_delimiter) {
00433             **outbuf = DELIMITER;
00434             (*outbuf)++, (*outbuf_left)--;
00435           }
00436         }
00437         *inbuf += match_len;
00438         *inbuf_left -= match_len;
00439       }
00440     } else if (converter->conversion_mode == OPENCC_CONVERSION_SEGMENT_ONLY) {
00441       if (match_rs == NULL) {
00442         **outbuf = **inbuf;
00443         (*outbuf)++, (*outbuf_left)--;
00444         (*inbuf)++, (*inbuf_left)--;
00445       } else {
00446         /* 輸出緩衝區剩餘空間小於分詞長度 */
00447         if (match_len + 1 > *outbuf_left) {
00448           if (inbuf_left_start - *inbuf_left > 0) {
00449             break;
00450           }
00451           errnum = CONVERTER_ERROR_OUTBUF;
00452           return (size_t)-1;
00453         }
00454         size_t i;
00455         for (i = 0; i < match_len; i++) {
00456           **outbuf = **inbuf;
00457           (*outbuf)++, (*outbuf_left)--;
00458           (*inbuf)++, (*inbuf_left)--;
00459         }
00460       }
00461       **outbuf = DELIMITER;
00462       (*outbuf)++, (*outbuf_left)--;
00463     } else {
00464       debug_should_not_be_here();
00465     }
00466   }
00467   if (converter->conversion_mode == OPENCC_CONVERSION_SEGMENT_ONLY) {
00468     (*outbuf)--;
00469     (*outbuf_left)++;
00470   }
00471   return inbuf_left_start - *inbuf_left;
00472 }
00473 
00474 #endif /* if SEGMENT_METHOD == SEGMENT_MAXIMUM_LENGTH */
00475 
00476 size_t converter_convert(Converter* converter,
00477                          ucs4_t** inbuf,
00478                          size_t* inbuf_left,
00479                          ucs4_t** outbuf,
00480                          size_t* outbuf_left) {
00481   if (converter->dict_chain == NULL) {
00482     errnum = CONVERTER_ERROR_NODICT;
00483     return (size_t)-1;
00484   }
00485   if (converter->dict_chain->count == 1) {
00486     /* 只有一個辭典,直接輸出 */
00487     return segment(converter,
00488                    inbuf,
00489                    inbuf_left,
00490                    outbuf,
00491                    outbuf_left);
00492   }
00493   // 啓用辭典轉換鏈
00494   size_t inbuf_size = *inbuf_left;
00495   size_t outbuf_size = *outbuf_left;
00496   size_t retval = (size_t)-1;
00497   size_t cinbuf_left, coutbuf_left;
00498   size_t coutbuf_delta = 0;
00499   size_t i, cur;
00500   ucs4_t* tmpbuf = (ucs4_t*)malloc(sizeof(ucs4_t) * outbuf_size);
00501   ucs4_t* orig_outbuf = *outbuf;
00502   ucs4_t* cinbuf, * coutbuf;
00503   cinbuf_left = inbuf_size;
00504   coutbuf_left = outbuf_size;
00505   cinbuf = *inbuf;
00506   coutbuf = tmpbuf;
00507   for (i = cur = 0; i < converter->dict_chain->count; ++i, cur = 1 - cur) {
00508     if (i > 0) {
00509       cinbuf_left = coutbuf_delta;
00510       coutbuf_left = outbuf_size;
00511 
00512       if (cur == 1) {
00513         cinbuf = tmpbuf;
00514         coutbuf = orig_outbuf;
00515       } else {
00516         cinbuf = orig_outbuf;
00517         coutbuf = tmpbuf;
00518       }
00519     }
00520     converter->current_dict_group = dict_chain_get_group(
00521       converter->dict_chain,
00522       i);
00523     size_t ret = segment(converter,
00524                         &cinbuf,
00525                         &cinbuf_left,
00526                         &coutbuf,
00527                         &coutbuf_left);
00528     if (ret == (size_t)-1) {
00529       free(tmpbuf);
00530       return (size_t)-1;
00531     }
00532     coutbuf_delta = outbuf_size - coutbuf_left;
00533     if (i == 0) {
00534       retval = ret;
00535       *inbuf = cinbuf;
00536       *inbuf_left = cinbuf_left;
00537     }
00538   }
00539   if (cur == 1) {
00540     // 結果在緩衝區
00541     memcpy(*outbuf, tmpbuf, coutbuf_delta * sizeof(ucs4_t));
00542   }
00543   *outbuf += coutbuf_delta;
00544   *outbuf_left = coutbuf_left;
00545   free(tmpbuf);
00546   return retval;
00547 }
00548 
00549 void converter_assign_dictionary(Converter* converter, DictChain* dict_chain) {
00550   converter->dict_chain = dict_chain;
00551   if (converter->dict_chain->count > 0) {
00552     converter->current_dict_group = dict_chain_get_group(
00553       converter->dict_chain,
00554       0);
00555   }
00556 }
00557 
00558 Converter* converter_open(void) {
00559   Converter* converter = (Converter*)malloc(sizeof(Converter));
00560   converter->dict_chain = NULL;
00561   converter->current_dict_group = NULL;
00562 #if SEGMENT_METHOD == SEGMENT_SHORTEST_PATH
00563   converter->data = (SpsegData*)malloc(sizeof(SpsegData));
00564   SpsegData* spseg_buffer = converter->data;
00565   spseg_buffer->initialized = 0;
00566   spseg_buffer->match_length = NULL;
00567   spseg_buffer->min_len = NULL;
00568   spseg_buffer->parent = NULL;
00569   spseg_buffer->path = NULL;
00570   sp_seg_set_buffer_size(spseg_buffer, OPENCC_SP_SEG_DEFAULT_BUFFER_SIZE);
00571 #endif /* if SEGMENT_METHOD == SEGMENT_SHORTEST_PATH */
00572   return converter;
00573 }
00574 
00575 void converter_close(Converter* converter) {
00576 #if SEGMENT_METHOD == SEGMENT_SHORTEST_PATH
00577   sp_seg_buffer_free(converter->data);
00578   free((SpsegData *)converter->data);
00579 #endif /* if SEGMENT_METHOD == SEGMENT_SHORTEST_PATH */
00580   free(converter);
00581 }
00582 
00583 void converter_set_conversion_mode(Converter* converter,
00584                                    opencc_conversion_mode conversion_mode) {
00585   converter->conversion_mode = conversion_mode;
00586 }
00587 
00588 converter_error converter_errno(void) {
00589   return errnum;
00590 }
00591 
00592 void converter_perror(const char* spec) {
00593   perr(spec);
00594   perr("\n");
00595   switch (errnum) {
00596   case CONVERTER_ERROR_VOID:
00597     break;
00598   case CONVERTER_ERROR_NODICT:
00599     perr(_("No dictionary loaded"));
00600     break;
00601   case CONVERTER_ERROR_OUTBUF:
00602     perr(_("Output buffer not enough for one segment"));
00603     break;
00604   default:
00605     perr(_("Unknown"));
00606   }
00607 }
 All Data Structures Files Functions Variables Defines