Libcroco
|
/* -*- Mode: C; indent-tabs-mode: nil; c-basic-offset: 8 -*- */

/*
 * This file is part of The Croco Library
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2.1 of the GNU Lesser General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
 * USA
 *
 * Author: Dodji Seketeli
 * See COPYRIGHTS file for copyright information.
 */

#include "cr-utils.h"
#include "cr-string.h"

/**
 *@file:
 *Some misc utility functions used
 *in the libcroco.
 *Note that throughout this file I will
 *refer to the CSS SPECIFICATIONS DOCUMENTATION
 *written by the w3c guys. You can find that document
 *at http://www.w3.org/TR/REC-CSS2/ .
 */

/****************************
 *Encoding transformations and
 *encoding helpers
 ****************************/

/*
 *Here is the correspondence between the ucs-4 character codes
 *and their matching utf-8 encoding pattern as described by RFC 2279:
 *
 *UCS-4 range (hex.)
UTF-8 octet sequence (binary) 00047 *------------------ ----------------------------- 00048 *0000 0000-0000 007F 0xxxxxxx 00049 *0000 0080-0000 07FF 110xxxxx 10xxxxxx 00050 *0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx 00051 *0001 0000-001F FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 00052 *0020 0000-03FF FFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 00053 *0400 0000-7FFF FFFF 1111110x 10xxxxxx ... 10xxxxxx 00054 */ 00055 00056 /** 00057 *Given an utf8 string buffer, calculates 00058 *the length of this string if it was encoded 00059 *in ucs4. 00060 *@param a_in_start a pointer to the begining of 00061 *the input utf8 string. 00062 *@param a_in_end a pointre to the end of the input 00063 *utf8 string (points to the last byte of the buffer) 00064 *@param a_len out parameter the calculated length. 00065 *@return CR_OK upon succesfull completion, an error code 00066 *otherwise. 00067 */ 00068 enum CRStatus 00069 cr_utils_utf8_str_len_as_ucs4 (const guchar * a_in_start, 00070 const guchar * a_in_end, gulong * a_len) 00071 { 00072 guchar *byte_ptr = NULL; 00073 gint len = 0; 00074 00075 /* 00076 *to store the final decoded 00077 *unicode char 00078 */ 00079 guint c = 0; 00080 00081 g_return_val_if_fail (a_in_start && a_in_end && a_len, 00082 CR_BAD_PARAM_ERROR); 00083 *a_len = 0; 00084 00085 for (byte_ptr = (guchar *) a_in_start; 00086 byte_ptr <= a_in_end; byte_ptr++) { 00087 gint nb_bytes_2_decode = 0; 00088 00089 if (*byte_ptr <= 0x7F) { 00090 /* 00091 *7 bits long char 00092 *encoded over 1 byte: 00093 * 0xxx xxxx 00094 */ 00095 c = *byte_ptr; 00096 nb_bytes_2_decode = 1; 00097 00098 } else if ((*byte_ptr & 0xE0) == 0xC0) { 00099 /* 00100 *up to 11 bits long char. 
00101 *encoded over 2 bytes: 00102 *110x xxxx 10xx xxxx 00103 */ 00104 c = *byte_ptr & 0x1F; 00105 nb_bytes_2_decode = 2; 00106 00107 } else if ((*byte_ptr & 0xF0) == 0xE0) { 00108 /* 00109 *up to 16 bit long char 00110 *encoded over 3 bytes: 00111 *1110 xxxx 10xx xxxx 10xx xxxx 00112 */ 00113 c = *byte_ptr & 0x0F; 00114 nb_bytes_2_decode = 3; 00115 00116 } else if ((*byte_ptr & 0xF8) == 0xF0) { 00117 /* 00118 *up to 21 bits long char 00119 *encoded over 4 bytes: 00120 *1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx 00121 */ 00122 c = *byte_ptr & 0x7; 00123 nb_bytes_2_decode = 4; 00124 00125 } else if ((*byte_ptr & 0xFC) == 0xF8) { 00126 /* 00127 *up to 26 bits long char 00128 *encoded over 5 bytes. 00129 *1111 10xx 10xx xxxx 10xx xxxx 00130 *10xx xxxx 10xx xxxx 00131 */ 00132 c = *byte_ptr & 3; 00133 nb_bytes_2_decode = 5; 00134 00135 } else if ((*byte_ptr & 0xFE) == 0xFC) { 00136 /* 00137 *up to 31 bits long char 00138 *encoded over 6 bytes: 00139 *1111 110x 10xx xxxx 10xx xxxx 00140 *10xx xxxx 10xx xxxx 10xx xxxx 00141 */ 00142 c = *byte_ptr & 1; 00143 nb_bytes_2_decode = 6; 00144 00145 } else { 00146 /* 00147 *BAD ENCODING 00148 */ 00149 return CR_ENCODING_ERROR; 00150 } 00151 00152 /* 00153 *Go and decode the remaining byte(s) 00154 *(if any) to get the current character. 00155 */ 00156 for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) { 00157 /*decode the next byte */ 00158 byte_ptr++; 00159 00160 /*byte pattern must be: 10xx xxxx */ 00161 if ((*byte_ptr & 0xC0) != 0x80) { 00162 return CR_ENCODING_ERROR; 00163 } 00164 00165 c = (c << 6) | (*byte_ptr & 0x3F); 00166 } 00167 00168 len++; 00169 } 00170 00171 *a_len = len; 00172 00173 return CR_OK; 00174 } 00175 00176 /** 00177 *Given an ucs4 string, this function 00178 *returns the size (in bytes) this string 00179 *would have occupied if it was encoded in utf-8. 00180 *@param a_in_start a pointer to the beginning of the input 00181 *buffer. 00182 *@param a_in_end a pointer to the end of the input buffer. 
00183 *@param a_len out parameter. The computed length. 00184 *@return CR_OK upon successfull completion, an error code otherwise. 00185 */ 00186 enum CRStatus 00187 cr_utils_ucs4_str_len_as_utf8 (const guint32 * a_in_start, 00188 const guint32 * a_in_end, gulong * a_len) 00189 { 00190 gint len = 0; 00191 guint32 *char_ptr = NULL; 00192 00193 g_return_val_if_fail (a_in_start && a_in_end && a_len, 00194 CR_BAD_PARAM_ERROR); 00195 00196 for (char_ptr = (guint32 *) a_in_start; 00197 char_ptr <= a_in_end; char_ptr++) { 00198 if (*char_ptr <= 0x7F) { 00199 /*the utf-8 char would take 1 byte */ 00200 len += 1; 00201 } else if (*char_ptr <= 0x7FF) { 00202 /*the utf-8 char would take 2 bytes */ 00203 len += 2; 00204 } else if (*char_ptr <= 0xFFFF) { 00205 len += 3; 00206 } else if (*char_ptr <= 0x1FFFFF) { 00207 len += 4; 00208 } else if (*char_ptr <= 0x3FFFFFF) { 00209 len += 5; 00210 } else if (*char_ptr <= 0x7FFFFFFF) { 00211 len += 6; 00212 } 00213 } 00214 00215 *a_len = len; 00216 return CR_OK; 00217 } 00218 00219 /** 00220 *Given an ucsA string, this function 00221 *returns the size (in bytes) this string 00222 *would have occupied if it was encoded in utf-8. 00223 *@param a_in_start a pointer to the beginning of the input 00224 *buffer. 00225 *@param a_in_end a pointer to the end of the input buffer. 00226 *@param a_len out parameter. The computed length. 00227 *@return CR_OK upon successfull completion, an error code otherwise. 
00228 */ 00229 enum CRStatus 00230 cr_utils_ucs1_str_len_as_utf8 (const guchar * a_in_start, 00231 const guchar * a_in_end, gulong * a_len) 00232 { 00233 gint len = 0; 00234 guchar *char_ptr = NULL; 00235 00236 g_return_val_if_fail (a_in_start && a_in_end && a_len, 00237 CR_BAD_PARAM_ERROR); 00238 00239 for (char_ptr = (guchar *) a_in_start; 00240 char_ptr <= a_in_end; char_ptr++) { 00241 if (*char_ptr <= 0x7F) { 00242 /*the utf-8 char would take 1 byte */ 00243 len += 1; 00244 } else { 00245 /*the utf-8 char would take 2 bytes */ 00246 len += 2; 00247 } 00248 } 00249 00250 *a_len = len; 00251 return CR_OK; 00252 } 00253 00254 /** 00255 *Converts an utf8 buffer into an ucs4 buffer. 00256 * 00257 *@param a_in the input utf8 buffer to convert. 00258 *@param a_in_len in/out parameter. The size of the 00259 *input buffer to convert. After return, this parameter contains 00260 *the actual number of bytes consumed. 00261 *@param a_out the output converted ucs4 buffer. Must be allocated by 00262 *the caller. 00263 *@param a_out_len in/out parameter. The size of the output buffer. 00264 *If this size is actually smaller than the real needed size, the function 00265 *just converts what it can and returns a success status. After return, 00266 *this param points to the actual number of characters decoded. 00267 *@return CR_OK upon successfull completion, an error code otherwise. 
00268 */ 00269 enum CRStatus 00270 cr_utils_utf8_to_ucs4 (const guchar * a_in, 00271 gulong * a_in_len, guint32 * a_out, gulong * a_out_len) 00272 { 00273 gulong in_len = 0, 00274 out_len = 0, 00275 in_index = 0, 00276 out_index = 0; 00277 enum CRStatus status = CR_OK; 00278 00279 /* 00280 *to store the final decoded 00281 *unicode char 00282 */ 00283 guint c = 0; 00284 00285 g_return_val_if_fail (a_in && a_in_len 00286 && a_out && a_out_len, CR_BAD_PARAM_ERROR); 00287 00288 if (*a_in_len < 1) { 00289 status = CR_OK; 00290 goto end; 00291 } 00292 00293 in_len = *a_in_len; 00294 out_len = *a_out_len; 00295 00296 for (in_index = 0, out_index = 0; 00297 (in_index < in_len) && (out_index < out_len); 00298 in_index++, out_index++) { 00299 gint nb_bytes_2_decode = 0; 00300 00301 if (a_in[in_index] <= 0x7F) { 00302 /* 00303 *7 bits long char 00304 *encoded over 1 byte: 00305 * 0xxx xxxx 00306 */ 00307 c = a_in[in_index]; 00308 nb_bytes_2_decode = 1; 00309 00310 } else if ((a_in[in_index] & 0xE0) == 0xC0) { 00311 /* 00312 *up to 11 bits long char. 00313 *encoded over 2 bytes: 00314 *110x xxxx 10xx xxxx 00315 */ 00316 c = a_in[in_index] & 0x1F; 00317 nb_bytes_2_decode = 2; 00318 00319 } else if ((a_in[in_index] & 0xF0) == 0xE0) { 00320 /* 00321 *up to 16 bit long char 00322 *encoded over 3 bytes: 00323 *1110 xxxx 10xx xxxx 10xx xxxx 00324 */ 00325 c = a_in[in_index] & 0x0F; 00326 nb_bytes_2_decode = 3; 00327 00328 } else if ((a_in[in_index] & 0xF8) == 0xF0) { 00329 /* 00330 *up to 21 bits long char 00331 *encoded over 4 bytes: 00332 *1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx 00333 */ 00334 c = a_in[in_index] & 0x7; 00335 nb_bytes_2_decode = 4; 00336 00337 } else if ((a_in[in_index] & 0xFC) == 0xF8) { 00338 /* 00339 *up to 26 bits long char 00340 *encoded over 5 bytes. 
00341 *1111 10xx 10xx xxxx 10xx xxxx 00342 *10xx xxxx 10xx xxxx 00343 */ 00344 c = a_in[in_index] & 3; 00345 nb_bytes_2_decode = 5; 00346 00347 } else if ((a_in[in_index] & 0xFE) == 0xFC) { 00348 /* 00349 *up to 31 bits long char 00350 *encoded over 6 bytes: 00351 *1111 110x 10xx xxxx 10xx xxxx 00352 *10xx xxxx 10xx xxxx 10xx xxxx 00353 */ 00354 c = a_in[in_index] & 1; 00355 nb_bytes_2_decode = 6; 00356 00357 } else { 00358 /*BAD ENCODING */ 00359 goto end; 00360 } 00361 00362 /* 00363 *Go and decode the remaining byte(s) 00364 *(if any) to get the current character. 00365 */ 00366 for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) { 00367 /*decode the next byte */ 00368 in_index++; 00369 00370 /*byte pattern must be: 10xx xxxx */ 00371 if ((a_in[in_index] & 0xC0) != 0x80) { 00372 goto end; 00373 } 00374 00375 c = (c << 6) | (a_in[in_index] & 0x3F); 00376 } 00377 00378 /* 00379 *The decoded ucs4 char is now 00380 *in c. 00381 */ 00382 00383 /************************ 00384 *Some security tests 00385 ***********************/ 00386 00387 /*be sure c is a char */ 00388 if (c == 0xFFFF || c == 0xFFFE) 00389 goto end; 00390 00391 /*be sure c is inferior to the max ucs4 char value */ 00392 if (c > 0x10FFFF) 00393 goto end; 00394 00395 /* 00396 *c must be less than UTF16 "lower surrogate begin" 00397 *or higher than UTF16 "High surrogate end" 00398 */ 00399 if (c >= 0xD800 && c <= 0xDFFF) 00400 goto end; 00401 00402 /*Avoid characters that equals zero */ 00403 if (c == 0) 00404 goto end; 00405 00406 a_out[out_index] = c; 00407 } 00408 00409 end: 00410 *a_out_len = out_index + 1; 00411 *a_in_len = in_index + 1; 00412 00413 return status; 00414 } 00415 00416 /** 00417 *Reads a character from an utf8 buffer. 00418 *Actually decode the next character code (unicode character code) 00419 *and returns it. 00420 *@param a_in the starting address of the utf8 buffer. 00421 *@param a_in_len the length of the utf8 buffer. 00422 *@param a_out output parameter. The resulting read char. 
00423 *@param a_consumed the number of the bytes consumed to 00424 *decode the returned character code. 00425 *@return CR_OK upon successfull completion, an error code otherwise. 00426 */ 00427 enum CRStatus 00428 cr_utils_read_char_from_utf8_buf (const guchar * a_in, 00429 gulong a_in_len, 00430 guint32 * a_out, gulong * a_consumed) 00431 { 00432 gulong in_len = 0, 00433 in_index = 0, 00434 nb_bytes_2_decode = 0; 00435 enum CRStatus status = CR_OK; 00436 00437 /* 00438 *to store the final decoded 00439 *unicode char 00440 */ 00441 guint32 c = 0; 00442 00443 g_return_val_if_fail (a_in && a_out && a_out 00444 && a_consumed, CR_BAD_PARAM_ERROR); 00445 00446 if (a_in_len < 1) { 00447 status = CR_OK; 00448 goto end; 00449 } 00450 00451 in_len = a_in_len; 00452 00453 if (*a_in <= 0x7F) { 00454 /* 00455 *7 bits long char 00456 *encoded over 1 byte: 00457 * 0xxx xxxx 00458 */ 00459 c = *a_in; 00460 nb_bytes_2_decode = 1; 00461 00462 } else if ((*a_in & 0xE0) == 0xC0) { 00463 /* 00464 *up to 11 bits long char. 00465 *encoded over 2 bytes: 00466 *110x xxxx 10xx xxxx 00467 */ 00468 c = *a_in & 0x1F; 00469 nb_bytes_2_decode = 2; 00470 00471 } else if ((*a_in & 0xF0) == 0xE0) { 00472 /* 00473 *up to 16 bit long char 00474 *encoded over 3 bytes: 00475 *1110 xxxx 10xx xxxx 10xx xxxx 00476 */ 00477 c = *a_in & 0x0F; 00478 nb_bytes_2_decode = 3; 00479 00480 } else if ((*a_in & 0xF8) == 0xF0) { 00481 /* 00482 *up to 21 bits long char 00483 *encoded over 4 bytes: 00484 *1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx 00485 */ 00486 c = *a_in & 0x7; 00487 nb_bytes_2_decode = 4; 00488 00489 } else if ((*a_in & 0xFC) == 0xF8) { 00490 /* 00491 *up to 26 bits long char 00492 *encoded over 5 bytes. 
00493 *1111 10xx 10xx xxxx 10xx xxxx 00494 *10xx xxxx 10xx xxxx 00495 */ 00496 c = *a_in & 3; 00497 nb_bytes_2_decode = 5; 00498 00499 } else if ((*a_in & 0xFE) == 0xFC) { 00500 /* 00501 *up to 31 bits long char 00502 *encoded over 6 bytes: 00503 *1111 110x 10xx xxxx 10xx xxxx 00504 *10xx xxxx 10xx xxxx 10xx xxxx 00505 */ 00506 c = *a_in & 1; 00507 nb_bytes_2_decode = 6; 00508 00509 } else { 00510 /*BAD ENCODING */ 00511 goto end; 00512 } 00513 00514 if (nb_bytes_2_decode > a_in_len) { 00515 status = CR_END_OF_INPUT_ERROR; 00516 goto end; 00517 } 00518 00519 /* 00520 *Go and decode the remaining byte(s) 00521 *(if any) to get the current character. 00522 */ 00523 for (in_index = 1; in_index < nb_bytes_2_decode; in_index++) { 00524 /*byte pattern must be: 10xx xxxx */ 00525 if ((a_in[in_index] & 0xC0) != 0x80) { 00526 goto end; 00527 } 00528 00529 c = (c << 6) | (a_in[in_index] & 0x3F); 00530 } 00531 00532 /* 00533 *The decoded ucs4 char is now 00534 *in c. 00535 */ 00536 00537 /************************ 00538 *Some security tests 00539 ***********************/ 00540 00541 /*be sure c is a char */ 00542 if (c == 0xFFFF || c == 0xFFFE) 00543 goto end; 00544 00545 /*be sure c is inferior to the max ucs4 char value */ 00546 if (c > 0x10FFFF) 00547 goto end; 00548 00549 /* 00550 *c must be less than UTF16 "lower surrogate begin" 00551 *or higher than UTF16 "High surrogate end" 00552 */ 00553 if (c >= 0xD800 && c <= 0xDFFF) 00554 goto end; 00555 00556 /*Avoid characters that equals zero */ 00557 if (c == 0) 00558 goto end; 00559 00560 *a_out = c; 00561 00562 end: 00563 *a_consumed = nb_bytes_2_decode; 00564 00565 return status; 00566 } 00567 00568 /** 00569 * 00570 */ 00571 enum CRStatus 00572 cr_utils_utf8_str_len_as_ucs1 (const guchar * a_in_start, 00573 const guchar * a_in_end, gulong * a_len) 00574 { 00575 /* 00576 *Note: this function can be made shorter 00577 *but it considers all the cases of the utf8 encoding 00578 *to ease further extensions ... 
00579 */ 00580 00581 guchar *byte_ptr = NULL; 00582 gint len = 0; 00583 00584 /* 00585 *to store the final decoded 00586 *unicode char 00587 */ 00588 guint c = 0; 00589 00590 g_return_val_if_fail (a_in_start && a_in_end && a_len, 00591 CR_BAD_PARAM_ERROR); 00592 *a_len = 0; 00593 00594 for (byte_ptr = (guchar *) a_in_start; 00595 byte_ptr <= a_in_end; byte_ptr++) { 00596 gint nb_bytes_2_decode = 0; 00597 00598 if (*byte_ptr <= 0x7F) { 00599 /* 00600 *7 bits long char 00601 *encoded over 1 byte: 00602 * 0xxx xxxx 00603 */ 00604 c = *byte_ptr; 00605 nb_bytes_2_decode = 1; 00606 00607 } else if ((*byte_ptr & 0xE0) == 0xC0) { 00608 /* 00609 *up to 11 bits long char. 00610 *encoded over 2 bytes: 00611 *110x xxxx 10xx xxxx 00612 */ 00613 c = *byte_ptr & 0x1F; 00614 nb_bytes_2_decode = 2; 00615 00616 } else if ((*byte_ptr & 0xF0) == 0xE0) { 00617 /* 00618 *up to 16 bit long char 00619 *encoded over 3 bytes: 00620 *1110 xxxx 10xx xxxx 10xx xxxx 00621 */ 00622 c = *byte_ptr & 0x0F; 00623 nb_bytes_2_decode = 3; 00624 00625 } else if ((*byte_ptr & 0xF8) == 0xF0) { 00626 /* 00627 *up to 21 bits long char 00628 *encoded over 4 bytes: 00629 *1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx 00630 */ 00631 c = *byte_ptr & 0x7; 00632 nb_bytes_2_decode = 4; 00633 00634 } else if ((*byte_ptr & 0xFC) == 0xF8) { 00635 /* 00636 *up to 26 bits long char 00637 *encoded over 5 bytes. 00638 *1111 10xx 10xx xxxx 10xx xxxx 00639 *10xx xxxx 10xx xxxx 00640 */ 00641 c = *byte_ptr & 3; 00642 nb_bytes_2_decode = 5; 00643 00644 } else if ((*byte_ptr & 0xFE) == 0xFC) { 00645 /* 00646 *up to 31 bits long char 00647 *encoded over 6 bytes: 00648 *1111 110x 10xx xxxx 10xx xxxx 00649 *10xx xxxx 10xx xxxx 10xx xxxx 00650 */ 00651 c = *byte_ptr & 1; 00652 nb_bytes_2_decode = 6; 00653 00654 } else { 00655 /* 00656 *BAD ENCODING 00657 */ 00658 return CR_ENCODING_ERROR; 00659 } 00660 00661 /* 00662 *Go and decode the remaining byte(s) 00663 *(if any) to get the current character. 
00664 */ 00665 for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) { 00666 /*decode the next byte */ 00667 byte_ptr++; 00668 00669 /*byte pattern must be: 10xx xxxx */ 00670 if ((*byte_ptr & 0xC0) != 0x80) { 00671 return CR_ENCODING_ERROR; 00672 } 00673 00674 c = (c << 6) | (*byte_ptr & 0x3F); 00675 } 00676 00677 /* 00678 *The decoded ucs4 char is now 00679 *in c. 00680 */ 00681 00682 if (c <= 0xFF) { /*Add other conditions to support 00683 *other char sets (ucs2, ucs3, ucs4). 00684 */ 00685 len++; 00686 } else { 00687 /*the char is too long to fit 00688 *into the supposed charset len. 00689 */ 00690 return CR_ENCODING_ERROR; 00691 } 00692 } 00693 00694 *a_len = len; 00695 00696 return CR_OK; 00697 } 00698 00699 /** 00700 *Converts an utf8 string into an ucs4 string. 00701 *@param a_in the input string to convert. 00702 *@param a_in_len in/out parameter. The length of the input 00703 *string. After return, points to the actual number of bytes 00704 *consumed. This can be usefull to debug the input stream in case 00705 *of encoding error. 00706 *@param a_out out parameter. Points to the output string. It is allocated 00707 *by this function and must be freed by the caller. 00708 *@param a_out_len out parameter. The length of the output string. 00709 *@return CR_OK upon successfull completion, an error code otherwise. 
00710 * 00711 */ 00712 enum CRStatus 00713 cr_utils_utf8_str_to_ucs4 (const guchar * a_in, 00714 gulong * a_in_len, 00715 guint32 ** a_out, gulong * a_out_len) 00716 { 00717 enum CRStatus status = CR_OK; 00718 00719 g_return_val_if_fail (a_in && a_in_len 00720 && a_out && a_out_len, CR_BAD_PARAM_ERROR); 00721 00722 status = cr_utils_utf8_str_len_as_ucs4 (a_in, 00723 &a_in[*a_in_len - 1], 00724 a_out_len); 00725 00726 g_return_val_if_fail (status == CR_OK, status); 00727 00728 *a_out = g_malloc0 (*a_out_len * sizeof (guint32)); 00729 00730 status = cr_utils_utf8_to_ucs4 (a_in, a_in_len, *a_out, a_out_len); 00731 00732 return status; 00733 } 00734 00735 /** 00736 *Converts an ucs4 buffer into an utf8 buffer. 00737 * 00738 *@param a_in the input ucs4 buffer to convert. 00739 *@param a_in_len in/out parameter. The size of the 00740 *input buffer to convert. After return, this parameter contains 00741 *the actual number of characters consumed. 00742 *@param a_out the output converted utf8 buffer. Must be allocated by 00743 *the caller. 00744 *@param a_out_len in/out parameter. The size of the output buffer. 00745 *If this size is actually smaller than the real needed size, the function 00746 *just converts what it can and returns a success status. After return, 00747 *this param points to the actual number of bytes in the buffer. 00748 *@return CR_OK upon successfull completion, an error code otherwise. 
00749 */ 00750 enum CRStatus 00751 cr_utils_ucs4_to_utf8 (const guint32 * a_in, 00752 gulong * a_in_len, guchar * a_out, gulong * a_out_len) 00753 { 00754 gulong in_len = 0, 00755 in_index = 0, 00756 out_index = 0; 00757 enum CRStatus status = CR_OK; 00758 00759 g_return_val_if_fail (a_in && a_in_len && a_out && a_out_len, 00760 CR_BAD_PARAM_ERROR); 00761 00762 if (*a_in_len < 1) { 00763 status = CR_OK; 00764 goto end; 00765 } 00766 00767 in_len = *a_in_len; 00768 00769 for (in_index = 0; in_index < in_len; in_index++) { 00770 /* 00771 *FIXME: return whenever we encounter forbidden char values. 00772 */ 00773 00774 if (a_in[in_index] <= 0x7F) { 00775 a_out[out_index] = a_in[in_index]; 00776 out_index++; 00777 } else if (a_in[in_index] <= 0x7FF) { 00778 a_out[out_index] = (0xC0 | (a_in[in_index] >> 6)); 00779 a_out[out_index + 1] = 00780 (0x80 | (a_in[in_index] & 0x3F)); 00781 out_index += 2; 00782 } else if (a_in[in_index] <= 0xFFFF) { 00783 a_out[out_index] = (0xE0 | (a_in[in_index] >> 12)); 00784 a_out[out_index + 1] = 00785 (0x80 | ((a_in[in_index] >> 6) & 0x3F)); 00786 a_out[out_index + 2] = 00787 (0x80 | (a_in[in_index] & 0x3F)); 00788 out_index += 3; 00789 } else if (a_in[in_index] <= 0x1FFFFF) { 00790 a_out[out_index] = (0xF0 | (a_in[in_index] >> 18)); 00791 a_out[out_index + 1] 00792 = (0x80 | ((a_in[in_index] >> 12) & 0x3F)); 00793 a_out[out_index + 2] 00794 = (0x80 | ((a_in[in_index] >> 6) & 0x3F)); 00795 a_out[out_index + 3] 00796 = (0x80 | (a_in[in_index] & 0x3F)); 00797 out_index += 4; 00798 } else if (a_in[in_index] <= 0x3FFFFFF) { 00799 a_out[out_index] = (0xF8 | (a_in[in_index] >> 24)); 00800 a_out[out_index + 1] = 00801 (0x80 | (a_in[in_index] >> 18)); 00802 a_out[out_index + 2] 00803 = (0x80 | ((a_in[in_index] >> 12) & 0x3F)); 00804 a_out[out_index + 3] 00805 = (0x80 | ((a_in[in_index] >> 6) & 0x3F)); 00806 a_out[out_index + 4] 00807 = (0x80 | (a_in[in_index] & 0x3F)); 00808 out_index += 5; 00809 } else if (a_in[in_index] <= 0x7FFFFFFF) { 00810 
a_out[out_index] = (0xFC | (a_in[in_index] >> 30)); 00811 a_out[out_index + 1] = 00812 (0x80 | (a_in[in_index] >> 24)); 00813 a_out[out_index + 2] 00814 = (0x80 | ((a_in[in_index] >> 18) & 0x3F)); 00815 a_out[out_index + 3] 00816 = (0x80 | ((a_in[in_index] >> 12) & 0x3F)); 00817 a_out[out_index + 4] 00818 = (0x80 | ((a_in[in_index] >> 6) & 0x3F)); 00819 a_out[out_index + 4] 00820 = (0x80 | (a_in[in_index] & 0x3F)); 00821 out_index += 6; 00822 } else { 00823 status = CR_ENCODING_ERROR; 00824 goto end; 00825 } 00826 } /*end for */ 00827 00828 end: 00829 *a_in_len = in_index + 1; 00830 *a_out_len = out_index + 1; 00831 00832 return status; 00833 } 00834 00835 /** 00836 *Converts an ucs4 string into an utf8 string. 00837 *@param a_in the input string to convert. 00838 *@param a_in_len in/out parameter. The length of the input 00839 *string. After return, points to the actual number of characters 00840 *consumed. This can be usefull to debug the input string in case 00841 *of encoding error. 00842 *@param a_out out parameter. Points to the output string. It is allocated 00843 *by this function and must be freed by the caller. 00844 *@param a_out_len out parameter. The length (in bytes) of the output string. 00845 *@return CR_OK upon successfull completion, an error code otherwise. 00846 */ 00847 enum CRStatus 00848 cr_utils_ucs4_str_to_utf8 (const guint32 * a_in, 00849 gulong * a_in_len, 00850 guchar ** a_out, gulong * a_out_len) 00851 { 00852 enum CRStatus status = CR_OK; 00853 00854 g_return_val_if_fail (a_in && a_in_len && a_out 00855 && a_out_len, CR_BAD_PARAM_ERROR); 00856 00857 status = cr_utils_ucs4_str_len_as_utf8 (a_in, 00858 &a_in[*a_out_len - 1], 00859 a_out_len); 00860 00861 g_return_val_if_fail (status == CR_OK, status); 00862 00863 status = cr_utils_ucs4_to_utf8 (a_in, a_in_len, *a_out, a_out_len); 00864 00865 return status; 00866 } 00867 00868 /** 00869 *Converts an ucs1 buffer into an utf8 buffer. 
00870 *The caller must know the size of the resulting buffer and 00871 *allocate it prior to calling this function. 00872 * 00873 *@param a_in the input ucs1 buffer. 00874 * 00875 *@param a_in_len in/out parameter. The length of the input buffer. 00876 *After return, points to the number of bytes actually consumed even 00877 *in case of encoding error. 00878 * 00879 *@param a_out out parameter. The output utf8 converted buffer. 00880 * 00881 *@param a_out_len in/out parameter. The size of the output buffer. 00882 *If the output buffer size is shorter than the actual needed size, 00883 *this function just convert what it can. 00884 * 00885 *@return CR_OK upon successfull completion, an error code otherwise. 00886 * 00887 */ 00888 enum CRStatus 00889 cr_utils_ucs1_to_utf8 (const guchar * a_in, 00890 gulong * a_in_len, guchar * a_out, gulong * a_out_len) 00891 { 00892 gulong out_index = 0, 00893 in_index = 0, 00894 in_len = 0, 00895 out_len = 0; 00896 enum CRStatus status = CR_OK; 00897 00898 g_return_val_if_fail (a_in && a_in_len 00899 && a_out_len, 00900 CR_BAD_PARAM_ERROR); 00901 00902 if (*a_in_len == 0) { 00903 *a_out_len = 0 ; 00904 return CR_OK ; 00905 } 00906 g_return_val_if_fail (a_out, CR_BAD_PARAM_ERROR) ; 00907 00908 if (*a_in_len < 1) { 00909 status = CR_OK; 00910 goto end; 00911 } 00912 00913 in_len = *a_in_len; 00914 out_len = *a_out_len; 00915 00916 for (in_index = 0, out_index = 0; 00917 (in_index < in_len) && (out_index < out_len); in_index++) { 00918 /* 00919 *FIXME: return whenever we encounter forbidden char values. 
00920 */ 00921 00922 if (a_in[in_index] <= 0x7F) { 00923 a_out[out_index] = a_in[in_index]; 00924 out_index++; 00925 } else { 00926 a_out[out_index] = (0xC0 | (a_in[in_index] >> 6)); 00927 a_out[out_index + 1] = 00928 (0x80 | (a_in[in_index] & 0x3F)); 00929 out_index += 2; 00930 } 00931 } /*end for */ 00932 00933 end: 00934 *a_in_len = in_index; 00935 *a_out_len = out_index; 00936 00937 return CR_OK; 00938 } 00939 00940 /** 00941 *Converts an ucs1 string into an utf8 string. 00942 *@param a_in_start the beginning of the input string to convert. 00943 *@param a_in_end the end of the input string to convert. 00944 *@param a_out out parameter. The converted string. 00945 *@param a_out out parameter. The length of the converted string. 00946 *@return CR_OK upon successfull completion, an error code otherwise. 00947 * 00948 */ 00949 enum CRStatus 00950 cr_utils_ucs1_str_to_utf8 (const guchar * a_in, 00951 gulong * a_in_len, 00952 guchar ** a_out, gulong * a_out_len) 00953 { 00954 gulong in_len = 0, 00955 out_len = 0; 00956 enum CRStatus status = CR_OK; 00957 00958 g_return_val_if_fail (a_in && a_in_len && a_out 00959 && a_out_len, CR_BAD_PARAM_ERROR); 00960 00961 if (*a_in_len < 1) { 00962 *a_out_len = 0; 00963 *a_out = NULL; 00964 return CR_OK; 00965 } 00966 00967 status = cr_utils_ucs1_str_len_as_utf8 (a_in, &a_in[*a_in_len - 1], 00968 &out_len); 00969 00970 g_return_val_if_fail (status == CR_OK, status); 00971 00972 in_len = *a_in_len; 00973 00974 *a_out = g_malloc0 (out_len); 00975 00976 status = cr_utils_ucs1_to_utf8 (a_in, a_in_len, *a_out, &out_len); 00977 00978 *a_out_len = out_len; 00979 00980 return status; 00981 } 00982 00983 /** 00984 *Converts an utf8 buffer into an ucs1 buffer. 00985 *The caller must know the size of the resulting 00986 *converted buffer, and allocated it prior to calling this 00987 *function. 00988 * 00989 *@param a_in the input utf8 buffer to convert. 00990 * 00991 *@param a_in_len in/out parameter. The size of the input utf8 buffer. 
00992 *After return, points to the number of bytes consumed 00993 *by the function even in case of encoding error. 00994 * 00995 *@param a_out out parameter. Points to the resulting buffer. 00996 *Must be allocated by the caller. If the size of a_out is shorter 00997 *than its required size, this function converts what it can and return 00998 *a successfull status. 00999 * 01000 *@param a_out_len in/out parameter. The size of the output buffer. 01001 *After return, points to the number of bytes consumed even in case of 01002 *encoding error. 01003 * 01004 *@return CR_OK upon successfull completion, an error code otherwise. 01005 */ 01006 enum CRStatus 01007 cr_utils_utf8_to_ucs1 (const guchar * a_in, 01008 gulong * a_in_len, guchar * a_out, gulong * a_out_len) 01009 { 01010 gulong in_index = 0, 01011 out_index = 0, 01012 in_len = 0, 01013 out_len = 0; 01014 enum CRStatus status = CR_OK; 01015 01016 /* 01017 *to store the final decoded 01018 *unicode char 01019 */ 01020 guint32 c = 0; 01021 01022 g_return_val_if_fail (a_in && a_in_len 01023 && a_out && a_out_len, CR_BAD_PARAM_ERROR); 01024 01025 if (*a_in_len < 1) { 01026 status = CR_OK; 01027 goto end; 01028 } 01029 01030 in_len = *a_in_len; 01031 out_len = *a_out_len; 01032 01033 for (in_index = 0, out_index = 0; 01034 (in_index < in_len) && (out_index < out_len); 01035 in_index++, out_index++) { 01036 gint nb_bytes_2_decode = 0; 01037 01038 if (a_in[in_index] <= 0x7F) { 01039 /* 01040 *7 bits long char 01041 *encoded over 1 byte: 01042 * 0xxx xxxx 01043 */ 01044 c = a_in[in_index]; 01045 nb_bytes_2_decode = 1; 01046 01047 } else if ((a_in[in_index] & 0xE0) == 0xC0) { 01048 /* 01049 *up to 11 bits long char. 
01050 *encoded over 2 bytes: 01051 *110x xxxx 10xx xxxx 01052 */ 01053 c = a_in[in_index] & 0x1F; 01054 nb_bytes_2_decode = 2; 01055 01056 } else if ((a_in[in_index] & 0xF0) == 0xE0) { 01057 /* 01058 *up to 16 bit long char 01059 *encoded over 3 bytes: 01060 *1110 xxxx 10xx xxxx 10xx xxxx 01061 */ 01062 c = a_in[in_index] & 0x0F; 01063 nb_bytes_2_decode = 3; 01064 01065 } else if ((a_in[in_index] & 0xF8) == 0xF0) { 01066 /* 01067 *up to 21 bits long char 01068 *encoded over 4 bytes: 01069 *1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx 01070 */ 01071 c = a_in[in_index] & 0x7; 01072 nb_bytes_2_decode = 4; 01073 01074 } else if ((a_in[in_index] & 0xFC) == 0xF8) { 01075 /* 01076 *up to 26 bits long char 01077 *encoded over 5 bytes. 01078 *1111 10xx 10xx xxxx 10xx xxxx 01079 *10xx xxxx 10xx xxxx 01080 */ 01081 c = a_in[in_index] & 3; 01082 nb_bytes_2_decode = 5; 01083 01084 } else if ((a_in[in_index] & 0xFE) == 0xFC) { 01085 /* 01086 *up to 31 bits long char 01087 *encoded over 6 bytes: 01088 *1111 110x 10xx xxxx 10xx xxxx 01089 *10xx xxxx 10xx xxxx 10xx xxxx 01090 */ 01091 c = a_in[in_index] & 1; 01092 nb_bytes_2_decode = 6; 01093 01094 } else { 01095 /*BAD ENCODING */ 01096 status = CR_ENCODING_ERROR; 01097 goto end; 01098 } 01099 01100 /* 01101 *Go and decode the remaining byte(s) 01102 *(if any) to get the current character. 01103 */ 01104 if (in_index + nb_bytes_2_decode - 1 >= in_len) { 01105 status = CR_OK; 01106 goto end; 01107 } 01108 01109 for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) { 01110 /*decode the next byte */ 01111 in_index++; 01112 01113 /*byte pattern must be: 10xx xxxx */ 01114 if ((a_in[in_index] & 0xC0) != 0x80) { 01115 status = CR_ENCODING_ERROR; 01116 goto end; 01117 } 01118 01119 c = (c << 6) | (a_in[in_index] & 0x3F); 01120 } 01121 01122 /* 01123 *The decoded ucs4 char is now 01124 *in c. 
01125 */ 01126 01127 if (c > 0xFF) { 01128 status = CR_ENCODING_ERROR; 01129 goto end; 01130 } 01131 01132 a_out[out_index] = c; 01133 } 01134 01135 end: 01136 *a_out_len = out_index; 01137 *a_in_len = in_index; 01138 01139 return CR_OK; 01140 } 01141 01142 /** 01143 *Converts an utf8 buffer into an 01144 *ucs1 buffer. 01145 *@param a_in_start the start of the input buffer. 01146 *@param a_in_end the end of the input buffer. 01147 *@param a_out out parameter. The resulting converted ucs4 buffer. 01148 *Must be freed by the caller. 01149 *@param a_out_len out parameter. The length of the converted buffer. 01150 *@return CR_OK upon successfull completion, an error code otherwise. 01151 *Note that out parameters are valid if and only if this function 01152 *returns CR_OK. 01153 */ 01154 enum CRStatus 01155 cr_utils_utf8_str_to_ucs1 (const guchar * a_in, 01156 gulong * a_in_len, 01157 guchar ** a_out, gulong * a_out_len) 01158 { 01159 enum CRStatus status = CR_OK; 01160 01161 g_return_val_if_fail (a_in && a_in_len 01162 && a_out && a_out_len, CR_BAD_PARAM_ERROR); 01163 01164 if (*a_in_len < 1) { 01165 *a_out_len = 0; 01166 *a_out = NULL; 01167 return CR_OK; 01168 } 01169 01170 status = cr_utils_utf8_str_len_as_ucs4 (a_in, &a_in[*a_in_len - 1], 01171 a_out_len); 01172 01173 g_return_val_if_fail (status == CR_OK, status); 01174 01175 *a_out = g_malloc0 (*a_out_len * sizeof (guint32)); 01176 01177 status = cr_utils_utf8_to_ucs1 (a_in, a_in_len, *a_out, a_out_len); 01178 return status; 01179 } 01180 01181 /***************************************** 01182 *CSS basic types identification utilities 01183 *****************************************/ 01184 01185 /** 01186 *Returns TRUE if a_char is a white space as 01187 *defined in the css spec in chap 4.1.1. 01188 * 01189 *white-space ::= ' '| \t|\r|\n|\f 01190 * 01191 *@param a_char the character to test. 01192 *return TRUE if is a white space, false otherwise. 
01193 */ 01194 gboolean 01195 cr_utils_is_white_space (guint32 a_char) 01196 { 01197 switch (a_char) { 01198 case ' ': 01199 case '\t': 01200 case '\r': 01201 case '\n': 01202 case '\f': 01203 return TRUE; 01204 break; 01205 default: 01206 return FALSE; 01207 } 01208 } 01209 01210 /** 01211 *Returns true if the character is a newline 01212 *as defined in the css spec in the chap 4.1.1. 01213 * 01214 *nl ::= \n|\r\n|\r|\f 01215 * 01216 *@param a_char the character to test. 01217 *@return TRUE if the character is a newline, FALSE otherwise. 01218 */ 01219 gboolean 01220 cr_utils_is_newline (guint32 a_char) 01221 { 01222 switch (a_char) { 01223 case '\n': 01224 case '\r': 01225 case '\f': 01226 return TRUE; 01227 break; 01228 default: 01229 return FALSE; 01230 } 01231 } 01232 01233 /** 01234 *returns TRUE if the char is part of an hexa num char: 01235 *i.e hexa_char ::= [0-9A-F] 01236 */ 01237 gboolean 01238 cr_utils_is_hexa_char (guint32 a_char) 01239 { 01240 if ((a_char >= '0' && a_char <= '9') 01241 || (a_char >= 'A' && a_char <= 'F')) { 01242 return TRUE; 01243 } 01244 return FALSE; 01245 } 01246 01247 /** 01248 *Returns true if the character is a nonascii 01249 *character (as defined in the css spec chap 4.1.1): 01250 * 01251 *nonascii ::= [^\0-\177] 01252 * 01253 *@param a_char the character to test. 01254 *@return TRUE if the character is a nonascii char, 01255 *FALSE otherwise. 01256 */ 01257 gboolean 01258 cr_utils_is_nonascii (guint32 a_char) 01259 { 01260 if (a_char <= 177) { 01261 return FALSE; 01262 } 01263 01264 return TRUE; 01265 } 01266 01267 /** 01268 *Dumps a character a_nb times on a file. 01269 *@param a_char the char to dump 01270 *@param a_fp the destination file pointer 01271 *@param a_nb the number of times a_char is to be dumped. 
01272 */ 01273 void 01274 cr_utils_dump_n_chars (guchar a_char, FILE * a_fp, glong a_nb) 01275 { 01276 glong i = 0; 01277 01278 for (i = 0; i < a_nb; i++) { 01279 fprintf (a_fp, "%c", a_char); 01280 } 01281 } 01282 01283 void 01284 cr_utils_dump_n_chars2 (guchar a_char, GString * a_string, glong a_nb) 01285 { 01286 glong i = 0; 01287 01288 g_return_if_fail (a_string); 01289 01290 for (i = 0; i < a_nb; i++) { 01291 g_string_append_printf (a_string, "%c", a_char); 01292 } 01293 } 01294 01295 /** 01296 *Duplicates a list of GString instances. 01297 *@return the duplicated list of GString instances or NULL if 01298 *something bad happened. 01299 *@param a_list_of_strings the list of strings to be duplicated. 01300 */ 01301 GList * 01302 cr_utils_dup_glist_of_string (GList const * a_list_of_strings) 01303 { 01304 GList const *cur = NULL; 01305 GList *result = NULL; 01306 01307 g_return_val_if_fail (a_list_of_strings, NULL); 01308 01309 for (cur = a_list_of_strings; cur; cur = cur->next) { 01310 GString *str = NULL; 01311 01312 str = g_string_new_len (((GString *) cur->data)->str, 01313 ((GString *) cur->data)->len); 01314 if (str) 01315 result = g_list_append (result, str); 01316 } 01317 01318 return result; 01319 } 01320 01321 /** 01322 *Duplicate a GList where the GList::data is a CRString. 01323 *@param a_list_of_strings the list to duplicate 01324 *@return the duplicated list, or NULL if something bad 01325 *happened. 01326 */ 01327 GList * 01328 cr_utils_dup_glist_of_cr_string (GList const * a_list_of_strings) 01329 { 01330 GList const *cur = NULL; 01331 GList *result = NULL; 01332 01333 g_return_val_if_fail (a_list_of_strings, NULL); 01334 01335 for (cur = a_list_of_strings; cur; cur = cur->next) { 01336 CRString *str = NULL; 01337 01338 str = cr_string_dup ((CRString const *) cur->data) ; 01339 if (str) 01340 result = g_list_append (result, str); 01341 } 01342 01343 return result; 01344 }