Open Chinese Convert
0.4.3
A project for conversion between Traditional and Simplified Chinese
|
00001 /* 00002 * Open Chinese Convert 00003 * 00004 * Copyright 2010-2013 BYVoid <byvoid@byvoid.com> 00005 * 00006 * Licensed under the Apache License, Version 2.0 (the "License"); 00007 * you may not use this file except in compliance with the License. 00008 * You may obtain a copy of the License at 00009 * 00010 * http://www.apache.org/licenses/LICENSE-2.0 00011 * 00012 * Unless required by applicable law or agreed to in writing, software 00013 * distributed under the License is distributed on an "AS IS" BASIS, 00014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 * See the License for the specific language governing permissions and 00016 * limitations under the License. 00017 */ 00018 00019 #include "common.h" 00020 #include "converter.h" 00021 #include "dict_group.h" 00022 #include "dict_chain.h" 00023 #include "encoding.h" 00024 00025 #define DELIMITER ' ' 00026 #define SEGMENT_MAXIMUM_LENGTH 0 00027 #define SEGMENT_SHORTEST_PATH 1 00028 #define SEGMENT_METHOD SEGMENT_SHORTEST_PATH 00029 00030 #if SEGMENT_METHOD == SEGMENT_SHORTEST_PATH 00031 # define OPENCC_SP_SEG_DEFAULT_BUFFER_SIZE 1024 00032 typedef struct { 00033 int initialized; 00034 size_t buffer_size; 00035 size_t* match_length; 00036 size_t* min_len; 00037 size_t* parent; 00038 size_t* path; 00039 } SpsegData; 00040 #endif 00041 00042 static converter_error errnum = CONVERTER_ERROR_VOID; 00043 00044 #if SEGMENT_METHOD == SEGMENT_SHORTEST_PATH 00045 static void sp_seg_buffer_free(SpsegData* ossb) { 00046 free(ossb->match_length); 00047 free(ossb->min_len); 00048 free(ossb->parent); 00049 free(ossb->path); 00050 } 00051 00052 static void sp_seg_set_buffer_size(SpsegData* ossb, size_t buffer_size) { 00053 if (ossb->initialized == 1) { 00054 sp_seg_buffer_free(ossb); 00055 } 00056 ossb->buffer_size = buffer_size; 00057 ossb->match_length = (size_t*)malloc((buffer_size + 1) * sizeof(size_t)); 00058 ossb->min_len = (size_t*)malloc(buffer_size * sizeof(size_t)); 00059 ossb->parent = (size_t*)malloc(buffer_size * sizeof(size_t)); 00060 ossb->path = (size_t*)malloc(buffer_size * sizeof(size_t)); 00061 ossb->initialized = 1; 00062 } 00063 00064 static size_t sp_seg(Converter* converter, 00065 ucs4_t** inbuf, 00066 size_t* inbuf_left, 00067 ucs4_t** outbuf, 00068 size_t* outbuf_left, 00069 size_t length) { 00070 /* 最短路徑分詞 */ 00071 /* 對長度爲1時特殊優化 */ 00072 if (length == 1) { 00073 const ucs4_t* const* match_rs = dict_group_match_longest( 00074 converter->current_dict_group, 00075 *inbuf, 00076 1, 00077 NULL); 00078 size_t match_len = 1; 00079 if (converter->conversion_mode == OPENCC_CONVERSION_FAST) { 00080 if (match_rs == NULL) { 00081 **outbuf = **inbuf; 00082 (*outbuf)++, (*outbuf_left)--; 00083 (*inbuf)++, (*inbuf_left)--; 00084 } else { 00085 const ucs4_t* result = match_rs[0]; 00086 /* 輸出緩衝區剩餘空間小於分詞長度 */ 00087 if (ucs4len(result) > *outbuf_left) { 00088 errnum = CONVERTER_ERROR_OUTBUF; 00089 return (size_t)-1; 00090 } 00091 for (; *result; result++) { 00092 **outbuf = *result; 00093 (*outbuf)++, (*outbuf_left)--; 00094 } 00095 *inbuf += match_len; 00096 *inbuf_left -= match_len; 00097 } 00098 } else if (converter->conversion_mode == 00099 OPENCC_CONVERSION_LIST_CANDIDATES) { 00100 if (match_rs == NULL) { 00101 **outbuf = **inbuf; 00102 (*outbuf)++, (*outbuf_left)--; 00103 (*inbuf)++, (*inbuf_left)--; 00104 } else { 00105 size_t i; 00106 for (i = 0; match_rs[i] != NULL; i++) { 00107 const ucs4_t* result = match_rs[i]; 00108 int show_delimiter = match_rs[i + 1] != NULL ? 1 : 0; 00109 /* 輸出緩衝區剩餘空間小於分詞長度 */ 00110 if (ucs4len(result) + show_delimiter > *outbuf_left) { 00111 errnum = CONVERTER_ERROR_OUTBUF; 00112 return (size_t)-1; 00113 } 00114 for (; *result; result++) { 00115 **outbuf = *result; 00116 (*outbuf)++, (*outbuf_left)--; 00117 } 00118 if (show_delimiter) { 00119 **outbuf = DELIMITER; 00120 (*outbuf)++, (*outbuf_left)--; 00121 } 00122 } 00123 *inbuf += match_len; 00124 *inbuf_left -= match_len; 00125 } 00126 } else if (converter->conversion_mode == OPENCC_CONVERSION_SEGMENT_ONLY) { 00127 if (match_rs == NULL) { 00128 **outbuf = **inbuf; 00129 (*outbuf)++, (*outbuf_left)--; 00130 (*inbuf)++, (*inbuf_left)--; 00131 } else { 00132 /* 輸出緩衝區剩餘空間小於分詞長度 */ 00133 if (match_len + 1 > *outbuf_left) { 00134 errnum = CONVERTER_ERROR_OUTBUF; 00135 return (size_t)-1; 00136 } 00137 size_t i; 00138 for (i = 0; i < match_len; i++) { 00139 **outbuf = **inbuf; 00140 (*outbuf)++, (*outbuf_left)--; 00141 (*inbuf)++, (*inbuf_left)--; 00142 } 00143 } 00144 **outbuf = DELIMITER; 00145 (*outbuf)++, (*outbuf_left)--; 00146 } else { 00147 debug_should_not_be_here(); 00148 } 00149 /* 必須保證有一個字符空間 */ 00150 return match_len; 00151 } 00152 00153 /* 設置緩衝區空間 */ 00154 SpsegData* ossb = converter->data; 00155 size_t buffer_size_need = length + 1; 00156 if ((ossb->initialized == 0) || (ossb->buffer_size < buffer_size_need)) { 00157 sp_seg_set_buffer_size(ossb, buffer_size_need); 00158 } 00159 size_t i, j; 00160 for (i = 0; i <= length; i++) { 00161 ossb->min_len[i] = INFINITY_INT; 00162 } 00163 ossb->min_len[0] = ossb->parent[0] = 0; 00164 for (i = 0; i < length; i++) { 00165 /* 獲取所有匹配長度 */ 00166 size_t match_count = dict_group_get_all_match_lengths( 00167 converter->current_dict_group, 00168 (*inbuf) + i, 00169 ossb->match_length 00170 ); 00171 if (ossb->match_length[0] != 1) { 00172 ossb->match_length[match_count++] = 1; 00173 } 00174 /* 動態規劃求最短分割路徑 */ 00175 for (j = 0; j < match_count; j++) { 00176 size_t k = ossb->match_length[j]; 00177 ossb->match_length[j] = 0; 00178 if ((k > 1) && (ossb->min_len[i] + 1 <= ossb->min_len[i + k])) { 00179 ossb->min_len[i + k] = ossb->min_len[i] + 1; 00180 ossb->parent[i + k] = i; 00181 } else if ((k == 1) && 00182 (ossb->min_len[i] + 1 < ossb->min_len[i + k])) { 00183 ossb->min_len[i + k] = ossb->min_len[i] + 1; 00184 ossb->parent[i + k] = i; 00185 } 00186 } 00187 } 00188 /* 取得最短分割路徑 */ 00189 for (i = length, j = ossb->min_len[length]; i != 0; i = ossb->parent[i]) { 00190 ossb->path[--j] = i; 00191 } 00192 size_t inbuf_left_start = *inbuf_left; 00193 size_t begin, end; 00194 /* 根據最短分割路徑轉換 */ 00195 for (i = begin = 0; i < ossb->min_len[length]; i++) { 00196 end = ossb->path[i]; 00197 size_t match_len; 00198 const ucs4_t* const* match_rs = dict_group_match_longest( 00199 converter->current_dict_group, 00200 *inbuf, 00201 end - begin, 00202 &match_len 00203 ); 00204 if (match_rs == NULL) { 00205 **outbuf = **inbuf; 00206 (*outbuf)++, (*outbuf_left)--; 00207 (*inbuf)++, (*inbuf_left)--; 00208 } else { 00209 if (converter->conversion_mode == OPENCC_CONVERSION_FAST) { 00210 if (match_rs == NULL) { 00211 **outbuf = **inbuf; 00212 (*outbuf)++, (*outbuf_left)--; 00213 (*inbuf)++, (*inbuf_left)--; 00214 } else { 00215 const ucs4_t* result = match_rs[0]; 00216 /* 輸出緩衝區剩餘空間小於分詞長度 */ 00217 if (ucs4len(result) > *outbuf_left) { 00218 if (inbuf_left_start - *inbuf_left > 0) { 00219 break; 00220 } 00221 errnum = CONVERTER_ERROR_OUTBUF; 00222 return (size_t)-1; 00223 } 00224 for (; *result; result++) { 00225 **outbuf = *result; 00226 (*outbuf)++, (*outbuf_left)--; 00227 } 00228 *inbuf += match_len; 00229 *inbuf_left -= match_len; 00230 } 00231 } else if (converter->conversion_mode == 00232 OPENCC_CONVERSION_LIST_CANDIDATES) { 00233 if (match_rs == NULL) { 00234 **outbuf = **inbuf; 00235 (*outbuf)++, (*outbuf_left)--; 00236 (*inbuf)++, (*inbuf_left)--; 00237 } else { 00238 size_t i; 00239 for (i = 0; match_rs[i] != NULL; i++) { 00240 const ucs4_t* result = match_rs[i]; 00241 int show_delimiter = match_rs[i + 1] != NULL ? 1 : 0; 00242 /* 輸出緩衝區剩餘空間小於分詞長度 */ 00243 if (ucs4len(result) + show_delimiter > *outbuf_left) { 00244 if (inbuf_left_start - *inbuf_left > 0) { 00245 break; 00246 } 00247 errnum = CONVERTER_ERROR_OUTBUF; 00248 return (size_t)-1; 00249 } 00250 for (; *result; result++) { 00251 **outbuf = *result; 00252 (*outbuf)++, (*outbuf_left)--; 00253 } 00254 if (show_delimiter) { 00255 **outbuf = DELIMITER; 00256 (*outbuf)++, (*outbuf_left)--; 00257 } 00258 } 00259 *inbuf += match_len; 00260 *inbuf_left -= match_len; 00261 } 00262 } else if (converter->conversion_mode == OPENCC_CONVERSION_SEGMENT_ONLY) { 00263 if (match_rs == NULL) { 00264 **outbuf = **inbuf; 00265 (*outbuf)++, (*outbuf_left)--; 00266 (*inbuf)++, (*inbuf_left)--; 00267 } else { 00268 /* 輸出緩衝區剩餘空間小於分詞長度 */ 00269 if (match_len + 1 > *outbuf_left) { 00270 if (inbuf_left_start - *inbuf_left > 0) { 00271 break; 00272 } 00273 errnum = CONVERTER_ERROR_OUTBUF; 00274 return (size_t)-1; 00275 } 00276 size_t i; 00277 for (i = 0; i < match_len; i++) { 00278 **outbuf = **inbuf; 00279 (*outbuf)++, (*outbuf_left)--; 00280 (*inbuf)++, (*inbuf_left)--; 00281 } 00282 } 00283 **outbuf = DELIMITER; 00284 (*outbuf)++, (*outbuf_left)--; 00285 } else { 00286 debug_should_not_be_here(); 00287 } 00288 } 00289 begin = end; 00290 } 00291 return inbuf_left_start - *inbuf_left; 00292 } 00293 00294 static size_t segment(Converter* converter, 00295 ucs4_t** inbuf, 00296 size_t* inbuf_left, 00297 ucs4_t** outbuf, 00298 size_t* outbuf_left) { 00299 /* 歧義分割最短路徑分詞 */ 00300 size_t i, start, bound; 00301 const ucs4_t* inbuf_start = *inbuf; 00302 size_t inbuf_left_start = *inbuf_left; 00303 size_t sp_seg_length; 00304 bound = 0; 00305 for (i = start = 0; inbuf_start[i] && *inbuf_left > 0 && *outbuf_left > 0; 00306 i++) { 00307 if ((i != 0) && (i == bound)) { 00308 /* 對歧義部分進行最短路徑分詞 */ 00309 sp_seg_length = sp_seg(converter, 00310 inbuf, 00311 inbuf_left, 00312 outbuf, 00313 outbuf_left, 00314 bound - start); 00315 00316 if (sp_seg_length == (size_t)-1) { 00317 return (size_t)-1; 00318 } 00319 if (sp_seg_length == 0) { 00320 if (inbuf_left_start - *inbuf_left > 0) { 00321 return inbuf_left_start - *inbuf_left; 00322 } 00323 /* 空間不足 */ 00324 errnum = CONVERTER_ERROR_OUTBUF; 00325 return (size_t)-1; 00326 } 00327 start = i; 00328 } 00329 size_t match_len; 00330 dict_group_match_longest( 00331 converter->current_dict_group, 00332 inbuf_start + i, 00333 0, 00334 &match_len 00335 ); 00336 if (match_len == 0) { 00337 match_len = 1; 00338 } 00339 if (i + match_len > bound) { 00340 bound = i + match_len; 00341 } 00342 } 00343 if ((*inbuf_left > 0) && (*outbuf_left > 0)) { 00344 sp_seg_length = sp_seg(converter, 00345 inbuf, 00346 inbuf_left, 00347 outbuf, 00348 outbuf_left, 00349 bound - start); 00350 if (sp_seg_length == (size_t)-1) { 00351 return (size_t)-1; 00352 } 00353 if (sp_seg_length == 0) { 00354 if (inbuf_left_start - *inbuf_left > 0) { 00355 return inbuf_left_start - *inbuf_left; 00356 } 00357 /* 空間不足 */ 00358 errnum = CONVERTER_ERROR_OUTBUF; 00359 return (size_t)-1; 00360 } 00361 } 00362 if (converter->conversion_mode == OPENCC_CONVERSION_SEGMENT_ONLY) { 00363 (*outbuf)--; 00364 (*outbuf_left)++; 00365 } 00366 return inbuf_left_start - *inbuf_left; 00367 } 00368 00369 #endif /* if SEGMENT_METHOD == SEGMENT_SHORTEST_PATH */ 00370 00371 #if SEGMENT_METHOD == SEGMENT_MAXIMUM_LENGTH 00372 static size_t segment(Converter* converter, 00373 ucs4_t** inbuf, 00374 size_t* inbuf_left, 00375 ucs4_t** outbuf, 00376 size_t* outbuf_left) { 00377 /* 正向最大分詞 */ 00378 size_t inbuf_left_start = *inbuf_left; 00379 for (; **inbuf && *inbuf_left > 0 && *outbuf_left > 0;) { 00380 size_t match_len; 00381 const ucs4_t* const* match_rs = dict_group_match_longest( 00382 converter->current_dict_group, 00383 *inbuf, 00384 *inbuf_left, 00385 &match_len 00386 ); 00387 if (converter->conversion_mode == OPENCC_CONVERSION_FAST) { 00388 if (match_rs == NULL) { 00389 **outbuf = **inbuf; 00390 (*outbuf)++, (*outbuf_left)--; 00391 (*inbuf)++, (*inbuf_left)--; 00392 } else { 00393 const ucs4_t* result = match_rs[0]; 00394 /* 輸出緩衝區剩餘空間小於分詞長度 */ 00395 if (ucs4len(result) > *outbuf_left) { 00396 if (inbuf_left_start - *inbuf_left > 0) { 00397 break; 00398 } 00399 errnum = CONVERTER_ERROR_OUTBUF; 00400 return (size_t)-1; 00401 } 00402 for (; *result; result++) { 00403 **outbuf = *result; 00404 (*outbuf)++, (*outbuf_left)--; 00405 } 00406 *inbuf += match_len; 00407 *inbuf_left -= match_len; 00408 } 00409 } else if (converter->conversion_mode == 00410 OPENCC_CONVERSION_LIST_CANDIDATES) { 00411 if (match_rs == NULL) { 00412 **outbuf = **inbuf; 00413 (*outbuf)++, (*outbuf_left)--; 00414 (*inbuf)++, (*inbuf_left)--; 00415 } else { 00416 size_t i; 00417 for (i = 0; match_rs[i] != NULL; i++) { 00418 const ucs4_t* result = match_rs[i]; 00419 int show_delimiter = match_rs[i + 1] != NULL ? 1 : 0; 00420 /* 輸出緩衝區剩餘空間小於分詞長度 */ 00421 if (ucs4len(result) + show_delimiter > *outbuf_left) { 00422 if (inbuf_left_start - *inbuf_left > 0) { 00423 break; 00424 } 00425 errnum = CONVERTER_ERROR_OUTBUF; 00426 return (size_t)-1; 00427 } 00428 for (; *result; result++) { 00429 **outbuf = *result; 00430 (*outbuf)++, (*outbuf_left)--; 00431 } 00432 if (show_delimiter) { 00433 **outbuf = DELIMITER; 00434 (*outbuf)++, (*outbuf_left)--; 00435 } 00436 } 00437 *inbuf += match_len; 00438 *inbuf_left -= match_len; 00439 } 00440 } else if (converter->conversion_mode == OPENCC_CONVERSION_SEGMENT_ONLY) { 00441 if (match_rs == NULL) { 00442 **outbuf = **inbuf; 00443 (*outbuf)++, (*outbuf_left)--; 00444 (*inbuf)++, (*inbuf_left)--; 00445 } else { 00446 /* 輸出緩衝區剩餘空間小於分詞長度 */ 00447 if (match_len + 1 > *outbuf_left) { 00448 if (inbuf_left_start - *inbuf_left > 0) { 00449 break; 00450 } 00451 errnum = CONVERTER_ERROR_OUTBUF; 00452 return (size_t)-1; 00453 } 00454 size_t i; 00455 for (i = 0; i < match_len; i++) { 00456 **outbuf = **inbuf; 00457 (*outbuf)++, (*outbuf_left)--; 00458 (*inbuf)++, (*inbuf_left)--; 00459 } 00460 } 00461 **outbuf = DELIMITER; 00462 (*outbuf)++, (*outbuf_left)--; 00463 } else { 00464 debug_should_not_be_here(); 00465 } 00466 } 00467 if (converter->conversion_mode == OPENCC_CONVERSION_SEGMENT_ONLY) { 00468 (*outbuf)--; 00469 (*outbuf_left)++; 00470 } 00471 return inbuf_left_start - *inbuf_left; 00472 } 00473 00474 #endif /* if SEGMENT_METHOD == SEGMENT_MAXIMUM_LENGTH */ 00475 00476 size_t converter_convert(Converter* converter, 00477 ucs4_t** inbuf, 00478 size_t* inbuf_left, 00479 ucs4_t** outbuf, 00480 size_t* outbuf_left) { 00481 if (converter->dict_chain == NULL) { 00482 errnum = CONVERTER_ERROR_NODICT; 00483 return (size_t)-1; 00484 } 00485 if (converter->dict_chain->count == 1) { 00486 /* 只有一個辭典,直接輸出 */ 00487 return segment(converter, 00488 inbuf, 00489 inbuf_left, 00490 outbuf, 00491 outbuf_left); 00492 } 00493 // 啓用辭典轉換鏈 00494 size_t inbuf_size = *inbuf_left; 00495 size_t outbuf_size = *outbuf_left; 00496 size_t retval = (size_t)-1; 00497 size_t cinbuf_left, coutbuf_left; 00498 size_t coutbuf_delta = 0; 00499 size_t i, cur; 00500 ucs4_t* tmpbuf = (ucs4_t*)malloc(sizeof(ucs4_t) * outbuf_size); 00501 ucs4_t* orig_outbuf = *outbuf; 00502 ucs4_t* cinbuf, * coutbuf; 00503 cinbuf_left = inbuf_size; 00504 coutbuf_left = outbuf_size; 00505 cinbuf = *inbuf; 00506 coutbuf = tmpbuf; 00507 for (i = cur = 0; i < converter->dict_chain->count; ++i, cur = 1 - cur) { 00508 if (i > 0) { 00509 cinbuf_left = coutbuf_delta; 00510 coutbuf_left = outbuf_size; 00511 00512 if (cur == 1) { 00513 cinbuf = tmpbuf; 00514 coutbuf = orig_outbuf; 00515 } else { 00516 cinbuf = orig_outbuf; 00517 coutbuf = tmpbuf; 00518 } 00519 } 00520 converter->current_dict_group = dict_chain_get_group( 00521 converter->dict_chain, 00522 i); 00523 size_t ret = segment(converter, 00524 &cinbuf, 00525 &cinbuf_left, 00526 &coutbuf, 00527 &coutbuf_left); 00528 if (ret == (size_t)-1) { 00529 free(tmpbuf); 00530 return (size_t)-1; 00531 } 00532 coutbuf_delta = outbuf_size - coutbuf_left; 00533 if (i == 0) { 00534 retval = ret; 00535 *inbuf = cinbuf; 00536 *inbuf_left = cinbuf_left; 00537 } 00538 } 00539 if (cur == 1) { 00540 // 結果在緩衝區 00541 memcpy(*outbuf, tmpbuf, coutbuf_delta * sizeof(ucs4_t)); 00542 } 00543 *outbuf += coutbuf_delta; 00544 *outbuf_left = coutbuf_left; 00545 free(tmpbuf); 00546 return retval; 00547 } 00548 00549 void converter_assign_dictionary(Converter* converter, DictChain* dict_chain) { 00550 converter->dict_chain = dict_chain; 00551 if (converter->dict_chain->count > 0) { 00552 converter->current_dict_group = dict_chain_get_group( 00553 converter->dict_chain, 00554 0); 00555 } 00556 } 00557 00558 Converter* converter_open(void) { 00559 Converter* converter = (Converter*)malloc(sizeof(Converter)); 00560 converter->dict_chain = NULL; 00561 converter->current_dict_group = NULL; 00562 #if SEGMENT_METHOD == SEGMENT_SHORTEST_PATH 00563 converter->data = (SpsegData*)malloc(sizeof(SpsegData)); 00564 SpsegData* spseg_buffer = converter->data; 00565 spseg_buffer->initialized = 0; 00566 spseg_buffer->match_length = NULL; 00567 spseg_buffer->min_len = NULL; 00568 spseg_buffer->parent = NULL; 00569 spseg_buffer->path = NULL; 00570 sp_seg_set_buffer_size(spseg_buffer, OPENCC_SP_SEG_DEFAULT_BUFFER_SIZE); 00571 #endif /* if SEGMENT_METHOD == SEGMENT_SHORTEST_PATH */ 00572 return converter; 00573 } 00574 00575 void converter_close(Converter* converter) { 00576 #if SEGMENT_METHOD == SEGMENT_SHORTEST_PATH 00577 sp_seg_buffer_free(converter->data); 00578 free((SpsegData *)converter->data); 00579 #endif /* if SEGMENT_METHOD == SEGMENT_SHORTEST_PATH */ 00580 free(converter); 00581 } 00582 00583 void converter_set_conversion_mode(Converter* converter, 00584 opencc_conversion_mode conversion_mode) { 00585 converter->conversion_mode = conversion_mode; 00586 } 00587 00588 converter_error converter_errno(void) { 00589 return errnum; 00590 } 00591 00592 void converter_perror(const char* spec) { 00593 perr(spec); 00594 perr("\n"); 00595 switch (errnum) { 00596 case CONVERTER_ERROR_VOID: 00597 break; 00598 case CONVERTER_ERROR_NODICT: 00599 perr(_("No dictionary loaded")); 00600 break; 00601 case CONVERTER_ERROR_OUTBUF: 00602 perr(_("Output buffer not enough for one segment")); 00603 break; 00604 default: 00605 perr(_("Unknown")); 00606 } 00607 }