cr-utils.c

Go to the documentation of this file.
00001 /* -*- Mode: C; indent-tabs-mode: nil; c-basic-offset: 8 -*- */
00002 
00003 /*
00004  * This file is part of The Croco Library
00005  *
00006  * This program is free software; you can redistribute it and/or
00007  * modify it under the terms of version 2.1 of the GNU Lesser General Public
00008  * License as published by the Free Software Foundation.
00009  *
00010  * This program is distributed in the hope that it will be useful,
00011  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00012  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00013  * GNU General Public License for more details.
00014  *
00015  * You should have received a copy of the GNU Lesser General Public License
00016  * along with this program; if not, write to the Free Software
00017  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
00018  * USA
00019  *
00020  * Author: Dodji Seketeli
00021  * See COPYRIGHTS file for copyright information.
00022  */
00023 
00024 #include "cr-utils.h"
00025 #include "cr-string.h"
00026 
00027 /**
00028  *@file:
00029  *Some misc utility functions used
00030  *in the libcroco.
00031  *Note that throughout this file I will
00032  *refer to the CSS SPECIFICATIONS DOCUMENTATION
00033  *written by the w3c guys. You can find that document
00034  *at http://www.w3.org/TR/REC-CSS2/ .
00035  */
00036 
00037 /****************************
00038  *Encoding transformations and
00039  *encoding helpers
00040  ****************************/
00041 
00042 /*
00043  *Here is the correspondence between the ucs-4 character codes
00044  *and their matching utf-8 encoding pattern as described by RFC 2279:
00045  *
00046  *UCS-4 range (hex.)    UTF-8 octet sequence (binary)
00047  *------------------    -----------------------------
00048  *0000 0000-0000 007F   0xxxxxxx
00049  *0000 0080-0000 07FF   110xxxxx 10xxxxxx
00050  *0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
00051  *0001 0000-001F FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
00052  *0020 0000-03FF FFFF   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
00053  *0400 0000-7FFF FFFF   1111110x 10xxxxxx ... 10xxxxxx
00054  */
00055 
00056 /**
00057  *Given an utf8 string buffer, calculates
00058  *the length of this string if it was encoded
00059  *in ucs4.
00060  *@param a_in_start a pointer to the begining of
00061  *the input utf8 string.
00062  *@param a_in_end a pointre to the end of the input
00063  *utf8 string (points to the last byte of the buffer)
00064  *@param a_len out parameter the calculated length.
00065  *@return CR_OK upon succesfull completion, an error code
00066  *otherwise.
00067  */
00068 enum CRStatus
00069 cr_utils_utf8_str_len_as_ucs4 (const guchar * a_in_start,
00070                                const guchar * a_in_end, gulong * a_len)
00071 {
00072         guchar *byte_ptr = NULL;
00073         gint len = 0;
00074 
00075         /*
00076          *to store the final decoded 
00077          *unicode char
00078          */
00079         guint c = 0;
00080 
00081         g_return_val_if_fail (a_in_start && a_in_end && a_len,
00082                               CR_BAD_PARAM_ERROR);
00083         *a_len = 0;
00084 
00085         for (byte_ptr = (guchar *) a_in_start;
00086              byte_ptr <= a_in_end; byte_ptr++) {
00087                 gint nb_bytes_2_decode = 0;
00088 
00089                 if (*byte_ptr <= 0x7F) {
00090                         /*
00091                          *7 bits long char
00092                          *encoded over 1 byte:
00093                          * 0xxx xxxx
00094                          */
00095                         c = *byte_ptr;
00096                         nb_bytes_2_decode = 1;
00097 
00098                 } else if ((*byte_ptr & 0xE0) == 0xC0) {
00099                         /*
00100                          *up to 11 bits long char.
00101                          *encoded over 2 bytes:
00102                          *110x xxxx  10xx xxxx
00103                          */
00104                         c = *byte_ptr & 0x1F;
00105                         nb_bytes_2_decode = 2;
00106 
00107                 } else if ((*byte_ptr & 0xF0) == 0xE0) {
00108                         /*
00109                          *up to 16 bit long char
00110                          *encoded over 3 bytes:
00111                          *1110 xxxx  10xx xxxx  10xx xxxx
00112                          */
00113                         c = *byte_ptr & 0x0F;
00114                         nb_bytes_2_decode = 3;
00115 
00116                 } else if ((*byte_ptr & 0xF8) == 0xF0) {
00117                         /*
00118                          *up to 21 bits long char
00119                          *encoded over 4 bytes:
00120                          *1111 0xxx  10xx xxxx  10xx xxxx  10xx xxxx
00121                          */
00122                         c = *byte_ptr & 0x7;
00123                         nb_bytes_2_decode = 4;
00124 
00125                 } else if ((*byte_ptr & 0xFC) == 0xF8) {
00126                         /*
00127                          *up to 26 bits long char
00128                          *encoded over 5 bytes.
00129                          *1111 10xx  10xx xxxx  10xx xxxx  
00130                          *10xx xxxx  10xx xxxx
00131                          */
00132                         c = *byte_ptr & 3;
00133                         nb_bytes_2_decode = 5;
00134 
00135                 } else if ((*byte_ptr & 0xFE) == 0xFC) {
00136                         /*
00137                          *up to 31 bits long char
00138                          *encoded over 6 bytes:
00139                          *1111 110x  10xx xxxx  10xx xxxx  
00140                          *10xx xxxx  10xx xxxx  10xx xxxx
00141                          */
00142                         c = *byte_ptr & 1;
00143                         nb_bytes_2_decode = 6;
00144 
00145                 } else {
00146                         /*
00147                          *BAD ENCODING
00148                          */
00149                         return CR_ENCODING_ERROR;
00150                 }
00151 
00152                 /*
00153                  *Go and decode the remaining byte(s)
00154                  *(if any) to get the current character.
00155                  */
00156                 for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) {
00157                         /*decode the next byte */
00158                         byte_ptr++;
00159 
00160                         /*byte pattern must be: 10xx xxxx */
00161                         if ((*byte_ptr & 0xC0) != 0x80) {
00162                                 return CR_ENCODING_ERROR;
00163                         }
00164 
00165                         c = (c << 6) | (*byte_ptr & 0x3F);
00166                 }
00167 
00168                 len++;
00169         }
00170 
00171         *a_len = len;
00172 
00173         return CR_OK;
00174 }
00175 
00176 /**
00177  *Given an ucs4 string, this function
00178  *returns the size (in bytes) this string
00179  *would have occupied if it was encoded in utf-8.
00180  *@param a_in_start a pointer to the beginning of the input
00181  *buffer.
00182  *@param a_in_end a pointer to the end of the input buffer.
00183  *@param a_len out parameter. The computed length.
00184  *@return CR_OK upon successfull completion, an error code otherwise.
00185  */
00186 enum CRStatus
00187 cr_utils_ucs4_str_len_as_utf8 (const guint32 * a_in_start,
00188                                const guint32 * a_in_end, gulong * a_len)
00189 {
00190         gint len = 0;
00191         guint32 *char_ptr = NULL;
00192 
00193         g_return_val_if_fail (a_in_start && a_in_end && a_len,
00194                               CR_BAD_PARAM_ERROR);
00195 
00196         for (char_ptr = (guint32 *) a_in_start;
00197              char_ptr <= a_in_end; char_ptr++) {
00198                 if (*char_ptr <= 0x7F) {
00199                         /*the utf-8 char would take 1 byte */
00200                         len += 1;
00201                 } else if (*char_ptr <= 0x7FF) {
00202                         /*the utf-8 char would take 2 bytes */
00203                         len += 2;
00204                 } else if (*char_ptr <= 0xFFFF) {
00205                         len += 3;
00206                 } else if (*char_ptr <= 0x1FFFFF) {
00207                         len += 4;
00208                 } else if (*char_ptr <= 0x3FFFFFF) {
00209                         len += 5;
00210                 } else if (*char_ptr <= 0x7FFFFFFF) {
00211                         len += 6;
00212                 }
00213         }
00214 
00215         *a_len = len;
00216         return CR_OK;
00217 }
00218 
00219 /**
00220  *Given an ucsA string, this function
00221  *returns the size (in bytes) this string
00222  *would have occupied if it was encoded in utf-8.
00223  *@param a_in_start a pointer to the beginning of the input
00224  *buffer.
00225  *@param a_in_end a pointer to the end of the input buffer.
00226  *@param a_len out parameter. The computed length.
00227  *@return CR_OK upon successfull completion, an error code otherwise.
00228  */
00229 enum CRStatus
00230 cr_utils_ucs1_str_len_as_utf8 (const guchar * a_in_start,
00231                                const guchar * a_in_end, gulong * a_len)
00232 {
00233         gint len = 0;
00234         guchar *char_ptr = NULL;
00235 
00236         g_return_val_if_fail (a_in_start && a_in_end && a_len,
00237                               CR_BAD_PARAM_ERROR);
00238 
00239         for (char_ptr = (guchar *) a_in_start;
00240              char_ptr <= a_in_end; char_ptr++) {
00241                 if (*char_ptr <= 0x7F) {
00242                         /*the utf-8 char would take 1 byte */
00243                         len += 1;
00244                 } else {
00245                         /*the utf-8 char would take 2 bytes */
00246                         len += 2;
00247                 }
00248         }
00249 
00250         *a_len = len;
00251         return CR_OK;
00252 }
00253 
00254 /**
00255  *Converts an utf8 buffer into an ucs4 buffer.
00256  *
00257  *@param a_in the input utf8 buffer to convert.
00258  *@param a_in_len in/out parameter. The size of the
00259  *input buffer to convert. After return, this parameter contains
00260  *the actual number of bytes consumed.
00261  *@param a_out the output converted ucs4 buffer. Must be allocated by
00262  *the caller.
00263  *@param a_out_len in/out parameter. The size of the output buffer.
00264  *If this size is actually smaller than the real needed size, the function
00265  *just converts what it can and returns a success status. After return,
00266  *this param points to the actual number of characters decoded.
00267  *@return CR_OK upon successfull completion, an error code otherwise.
00268  */
00269 enum CRStatus
00270 cr_utils_utf8_to_ucs4 (const guchar * a_in,
00271                        gulong * a_in_len, guint32 * a_out, gulong * a_out_len)
00272 {
00273         gulong in_len = 0,
00274                 out_len = 0,
00275                 in_index = 0,
00276                 out_index = 0;
00277         enum CRStatus status = CR_OK;
00278 
00279         /*
00280          *to store the final decoded 
00281          *unicode char
00282          */
00283         guint c = 0;
00284 
00285         g_return_val_if_fail (a_in && a_in_len
00286                               && a_out && a_out_len, CR_BAD_PARAM_ERROR);
00287 
00288         if (*a_in_len < 1) {
00289                 status = CR_OK;
00290                 goto end;
00291         }
00292 
00293         in_len = *a_in_len;
00294         out_len = *a_out_len;
00295 
00296         for (in_index = 0, out_index = 0;
00297              (in_index < in_len) && (out_index < out_len);
00298              in_index++, out_index++) {
00299                 gint nb_bytes_2_decode = 0;
00300 
00301                 if (a_in[in_index] <= 0x7F) {
00302                         /*
00303                          *7 bits long char
00304                          *encoded over 1 byte:
00305                          * 0xxx xxxx
00306                          */
00307                         c = a_in[in_index];
00308                         nb_bytes_2_decode = 1;
00309 
00310                 } else if ((a_in[in_index] & 0xE0) == 0xC0) {
00311                         /*
00312                          *up to 11 bits long char.
00313                          *encoded over 2 bytes:
00314                          *110x xxxx  10xx xxxx
00315                          */
00316                         c = a_in[in_index] & 0x1F;
00317                         nb_bytes_2_decode = 2;
00318 
00319                 } else if ((a_in[in_index] & 0xF0) == 0xE0) {
00320                         /*
00321                          *up to 16 bit long char
00322                          *encoded over 3 bytes:
00323                          *1110 xxxx  10xx xxxx  10xx xxxx
00324                          */
00325                         c = a_in[in_index] & 0x0F;
00326                         nb_bytes_2_decode = 3;
00327 
00328                 } else if ((a_in[in_index] & 0xF8) == 0xF0) {
00329                         /*
00330                          *up to 21 bits long char
00331                          *encoded over 4 bytes:
00332                          *1111 0xxx  10xx xxxx  10xx xxxx  10xx xxxx
00333                          */
00334                         c = a_in[in_index] & 0x7;
00335                         nb_bytes_2_decode = 4;
00336 
00337                 } else if ((a_in[in_index] & 0xFC) == 0xF8) {
00338                         /*
00339                          *up to 26 bits long char
00340                          *encoded over 5 bytes.
00341                          *1111 10xx  10xx xxxx  10xx xxxx  
00342                          *10xx xxxx  10xx xxxx
00343                          */
00344                         c = a_in[in_index] & 3;
00345                         nb_bytes_2_decode = 5;
00346 
00347                 } else if ((a_in[in_index] & 0xFE) == 0xFC) {
00348                         /*
00349                          *up to 31 bits long char
00350                          *encoded over 6 bytes:
00351                          *1111 110x  10xx xxxx  10xx xxxx  
00352                          *10xx xxxx  10xx xxxx  10xx xxxx
00353                          */
00354                         c = a_in[in_index] & 1;
00355                         nb_bytes_2_decode = 6;
00356 
00357                 } else {
00358                         /*BAD ENCODING */
00359                         goto end;
00360                 }
00361 
00362                 /*
00363                  *Go and decode the remaining byte(s)
00364                  *(if any) to get the current character.
00365                  */
00366                 for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) {
00367                         /*decode the next byte */
00368                         in_index++;
00369 
00370                         /*byte pattern must be: 10xx xxxx */
00371                         if ((a_in[in_index] & 0xC0) != 0x80) {
00372                                 goto end;
00373                         }
00374 
00375                         c = (c << 6) | (a_in[in_index] & 0x3F);
00376                 }
00377 
00378                 /*
00379                  *The decoded ucs4 char is now
00380                  *in c.
00381                  */
00382 
00383                 /************************
00384                  *Some security tests
00385                  ***********************/
00386 
00387                 /*be sure c is a char */
00388                 if (c == 0xFFFF || c == 0xFFFE)
00389                         goto end;
00390 
00391                 /*be sure c is inferior to the max ucs4 char value */
00392                 if (c > 0x10FFFF)
00393                         goto end;
00394 
00395                 /*
00396                  *c must be less than UTF16 "lower surrogate begin"
00397                  *or higher than UTF16 "High surrogate end"
00398                  */
00399                 if (c >= 0xD800 && c <= 0xDFFF)
00400                         goto end;
00401 
00402                 /*Avoid characters that equals zero */
00403                 if (c == 0)
00404                         goto end;
00405 
00406                 a_out[out_index] = c;
00407         }
00408 
00409       end:
00410         *a_out_len = out_index + 1;
00411         *a_in_len = in_index + 1;
00412 
00413         return status;
00414 }
00415 
00416 /**
00417  *Reads a character from an utf8 buffer.
00418  *Actually decode the next character code (unicode character code)
00419  *and returns it.
00420  *@param a_in the starting address of the utf8 buffer.
00421  *@param a_in_len the length of the utf8 buffer.
00422  *@param a_out output parameter. The resulting read char.
00423  *@param a_consumed the number of the bytes consumed to
00424  *decode the returned character code.
00425  *@return CR_OK upon successfull completion, an error code otherwise.
00426  */
00427 enum CRStatus
00428 cr_utils_read_char_from_utf8_buf (const guchar * a_in,
00429                                   gulong a_in_len,
00430                                   guint32 * a_out, gulong * a_consumed)
00431 {
00432         gulong in_len = 0,
00433                 in_index = 0,
00434                 nb_bytes_2_decode = 0;
00435         enum CRStatus status = CR_OK;
00436 
00437         /*
00438          *to store the final decoded 
00439          *unicode char
00440          */
00441         guint32 c = 0;
00442 
00443         g_return_val_if_fail (a_in && a_out && a_out
00444                               && a_consumed, CR_BAD_PARAM_ERROR);
00445 
00446         if (a_in_len < 1) {
00447                 status = CR_OK;
00448                 goto end;
00449         }
00450 
00451         in_len = a_in_len;
00452 
00453         if (*a_in <= 0x7F) {
00454                 /*
00455                  *7 bits long char
00456                  *encoded over 1 byte:
00457                  * 0xxx xxxx
00458                  */
00459                 c = *a_in;
00460                 nb_bytes_2_decode = 1;
00461 
00462         } else if ((*a_in & 0xE0) == 0xC0) {
00463                 /*
00464                  *up to 11 bits long char.
00465                  *encoded over 2 bytes:
00466                  *110x xxxx  10xx xxxx
00467                  */
00468                 c = *a_in & 0x1F;
00469                 nb_bytes_2_decode = 2;
00470 
00471         } else if ((*a_in & 0xF0) == 0xE0) {
00472                 /*
00473                  *up to 16 bit long char
00474                  *encoded over 3 bytes:
00475                  *1110 xxxx  10xx xxxx  10xx xxxx
00476                  */
00477                 c = *a_in & 0x0F;
00478                 nb_bytes_2_decode = 3;
00479 
00480         } else if ((*a_in & 0xF8) == 0xF0) {
00481                 /*
00482                  *up to 21 bits long char
00483                  *encoded over 4 bytes:
00484                  *1111 0xxx  10xx xxxx  10xx xxxx  10xx xxxx
00485                  */
00486                 c = *a_in & 0x7;
00487                 nb_bytes_2_decode = 4;
00488 
00489         } else if ((*a_in & 0xFC) == 0xF8) {
00490                 /*
00491                  *up to 26 bits long char
00492                  *encoded over 5 bytes.
00493                  *1111 10xx  10xx xxxx  10xx xxxx  
00494                  *10xx xxxx  10xx xxxx
00495                  */
00496                 c = *a_in & 3;
00497                 nb_bytes_2_decode = 5;
00498 
00499         } else if ((*a_in & 0xFE) == 0xFC) {
00500                 /*
00501                  *up to 31 bits long char
00502                  *encoded over 6 bytes:
00503                  *1111 110x  10xx xxxx  10xx xxxx  
00504                  *10xx xxxx  10xx xxxx  10xx xxxx
00505                  */
00506                 c = *a_in & 1;
00507                 nb_bytes_2_decode = 6;
00508 
00509         } else {
00510                 /*BAD ENCODING */
00511                 goto end;
00512         }
00513 
00514         if (nb_bytes_2_decode > a_in_len) {
00515                 status = CR_END_OF_INPUT_ERROR;
00516                 goto end;
00517         }
00518 
00519         /*
00520          *Go and decode the remaining byte(s)
00521          *(if any) to get the current character.
00522          */
00523         for (in_index = 1; in_index < nb_bytes_2_decode; in_index++) {
00524                 /*byte pattern must be: 10xx xxxx */
00525                 if ((a_in[in_index] & 0xC0) != 0x80) {
00526                         goto end;
00527                 }
00528 
00529                 c = (c << 6) | (a_in[in_index] & 0x3F);
00530         }
00531 
00532         /*
00533          *The decoded ucs4 char is now
00534          *in c.
00535          */
00536 
00537     /************************
00538      *Some security tests
00539      ***********************/
00540 
00541         /*be sure c is a char */
00542         if (c == 0xFFFF || c == 0xFFFE)
00543                 goto end;
00544 
00545         /*be sure c is inferior to the max ucs4 char value */
00546         if (c > 0x10FFFF)
00547                 goto end;
00548 
00549         /*
00550          *c must be less than UTF16 "lower surrogate begin"
00551          *or higher than UTF16 "High surrogate end"
00552          */
00553         if (c >= 0xD800 && c <= 0xDFFF)
00554                 goto end;
00555 
00556         /*Avoid characters that equals zero */
00557         if (c == 0)
00558                 goto end;
00559 
00560         *a_out = c;
00561 
00562       end:
00563         *a_consumed = nb_bytes_2_decode;
00564 
00565         return status;
00566 }
00567 
00568 /**
00569  *
00570  */
00571 enum CRStatus
00572 cr_utils_utf8_str_len_as_ucs1 (const guchar * a_in_start,
00573                                const guchar * a_in_end, gulong * a_len)
00574 {
00575         /*
00576          *Note: this function can be made shorter
00577          *but it considers all the cases of the utf8 encoding
00578          *to ease further extensions ...
00579          */
00580 
00581         guchar *byte_ptr = NULL;
00582         gint len = 0;
00583 
00584         /*
00585          *to store the final decoded 
00586          *unicode char
00587          */
00588         guint c = 0;
00589 
00590         g_return_val_if_fail (a_in_start && a_in_end && a_len,
00591                               CR_BAD_PARAM_ERROR);
00592         *a_len = 0;
00593 
00594         for (byte_ptr = (guchar *) a_in_start;
00595              byte_ptr <= a_in_end; byte_ptr++) {
00596                 gint nb_bytes_2_decode = 0;
00597 
00598                 if (*byte_ptr <= 0x7F) {
00599                         /*
00600                          *7 bits long char
00601                          *encoded over 1 byte:
00602                          * 0xxx xxxx
00603                          */
00604                         c = *byte_ptr;
00605                         nb_bytes_2_decode = 1;
00606 
00607                 } else if ((*byte_ptr & 0xE0) == 0xC0) {
00608                         /*
00609                          *up to 11 bits long char.
00610                          *encoded over 2 bytes:
00611                          *110x xxxx  10xx xxxx
00612                          */
00613                         c = *byte_ptr & 0x1F;
00614                         nb_bytes_2_decode = 2;
00615 
00616                 } else if ((*byte_ptr & 0xF0) == 0xE0) {
00617                         /*
00618                          *up to 16 bit long char
00619                          *encoded over 3 bytes:
00620                          *1110 xxxx  10xx xxxx  10xx xxxx
00621                          */
00622                         c = *byte_ptr & 0x0F;
00623                         nb_bytes_2_decode = 3;
00624 
00625                 } else if ((*byte_ptr & 0xF8) == 0xF0) {
00626                         /*
00627                          *up to 21 bits long char
00628                          *encoded over 4 bytes:
00629                          *1111 0xxx  10xx xxxx  10xx xxxx  10xx xxxx
00630                          */
00631                         c = *byte_ptr & 0x7;
00632                         nb_bytes_2_decode = 4;
00633 
00634                 } else if ((*byte_ptr & 0xFC) == 0xF8) {
00635                         /*
00636                          *up to 26 bits long char
00637                          *encoded over 5 bytes.
00638                          *1111 10xx  10xx xxxx  10xx xxxx  
00639                          *10xx xxxx  10xx xxxx
00640                          */
00641                         c = *byte_ptr & 3;
00642                         nb_bytes_2_decode = 5;
00643 
00644                 } else if ((*byte_ptr & 0xFE) == 0xFC) {
00645                         /*
00646                          *up to 31 bits long char
00647                          *encoded over 6 bytes:
00648                          *1111 110x  10xx xxxx  10xx xxxx  
00649                          *10xx xxxx  10xx xxxx  10xx xxxx
00650                          */
00651                         c = *byte_ptr & 1;
00652                         nb_bytes_2_decode = 6;
00653 
00654                 } else {
00655                         /*
00656                          *BAD ENCODING
00657                          */
00658                         return CR_ENCODING_ERROR;
00659                 }
00660 
00661                 /*
00662                  *Go and decode the remaining byte(s)
00663                  *(if any) to get the current character.
00664                  */
00665                 for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) {
00666                         /*decode the next byte */
00667                         byte_ptr++;
00668 
00669                         /*byte pattern must be: 10xx xxxx */
00670                         if ((*byte_ptr & 0xC0) != 0x80) {
00671                                 return CR_ENCODING_ERROR;
00672                         }
00673 
00674                         c = (c << 6) | (*byte_ptr & 0x3F);
00675                 }
00676 
00677                 /*
00678                  *The decoded ucs4 char is now
00679                  *in c.
00680                  */
00681 
00682                 if (c <= 0xFF) { /*Add other conditions to support
00683                                   *other char sets (ucs2, ucs3, ucs4).
00684                                   */
00685                         len++;
00686                 } else {
00687                         /*the char is too long to fit
00688                          *into the supposed charset len.
00689                          */
00690                         return CR_ENCODING_ERROR;
00691                 }
00692         }
00693 
00694         *a_len = len;
00695 
00696         return CR_OK;
00697 }
00698 
00699 /**
00700  *Converts an utf8 string into an ucs4 string.
00701  *@param a_in the input string to convert.
00702  *@param a_in_len in/out parameter. The length of the input
00703  *string. After return, points to the actual number of bytes
00704  *consumed. This can be usefull to debug the input stream in case
00705  *of encoding error.
00706  *@param a_out out parameter. Points to the output string. It is allocated 
00707  *by this function and must be freed by the caller.
00708  *@param a_out_len out parameter. The length of the output string.
00709  *@return CR_OK upon successfull completion, an error code otherwise.
00710  *
00711  */
00712 enum CRStatus
00713 cr_utils_utf8_str_to_ucs4 (const guchar * a_in,
00714                            gulong * a_in_len,
00715                            guint32 ** a_out, gulong * a_out_len)
00716 {
00717         enum CRStatus status = CR_OK;
00718 
00719         g_return_val_if_fail (a_in && a_in_len
00720                               && a_out && a_out_len, CR_BAD_PARAM_ERROR);
00721 
00722         status = cr_utils_utf8_str_len_as_ucs4 (a_in,
00723                                                 &a_in[*a_in_len - 1],
00724                                                 a_out_len);
00725 
00726         g_return_val_if_fail (status == CR_OK, status);
00727 
00728         *a_out = g_malloc0 (*a_out_len * sizeof (guint32));
00729 
00730         status = cr_utils_utf8_to_ucs4 (a_in, a_in_len, *a_out, a_out_len);
00731 
00732         return status;
00733 }
00734 
00735 /**
00736  *Converts an ucs4 buffer into an utf8 buffer.
00737  *
00738  *@param a_in the input ucs4 buffer to convert.
00739  *@param a_in_len in/out parameter. The size of the
00740  *input buffer to convert. After return, this parameter contains
00741  *the actual number of characters consumed.
00742  *@param a_out the output converted utf8 buffer. Must be allocated by
00743  *the caller.
00744  *@param a_out_len in/out parameter. The size of the output buffer.
00745  *If this size is actually smaller than the real needed size, the function
00746  *just converts what it can and returns a success status. After return,
00747  *this param points to the actual number of bytes in the buffer.
00748  *@return CR_OK upon successfull completion, an error code otherwise.
00749  */
00750 enum CRStatus
00751 cr_utils_ucs4_to_utf8 (const guint32 * a_in,
00752                        gulong * a_in_len, guchar * a_out, gulong * a_out_len)
00753 {
00754         gulong in_len = 0,
00755                 in_index = 0,
00756                 out_index = 0;
00757         enum CRStatus status = CR_OK;
00758 
00759         g_return_val_if_fail (a_in && a_in_len && a_out && a_out_len,
00760                               CR_BAD_PARAM_ERROR);
00761 
00762         if (*a_in_len < 1) {
00763                 status = CR_OK;
00764                 goto end;
00765         }
00766 
00767         in_len = *a_in_len;
00768 
00769         for (in_index = 0; in_index < in_len; in_index++) {
00770                 /*
00771                  *FIXME: return whenever we encounter forbidden char values.
00772                  */
00773 
00774                 if (a_in[in_index] <= 0x7F) {
00775                         a_out[out_index] = a_in[in_index];
00776                         out_index++;
00777                 } else if (a_in[in_index] <= 0x7FF) {
00778                         a_out[out_index] = (0xC0 | (a_in[in_index] >> 6));
00779                         a_out[out_index + 1] =
00780                                 (0x80 | (a_in[in_index] & 0x3F));
00781                         out_index += 2;
00782                 } else if (a_in[in_index] <= 0xFFFF) {
00783                         a_out[out_index] = (0xE0 | (a_in[in_index] >> 12));
00784                         a_out[out_index + 1] =
00785                                 (0x80 | ((a_in[in_index] >> 6) & 0x3F));
00786                         a_out[out_index + 2] =
00787                                 (0x80 | (a_in[in_index] & 0x3F));
00788                         out_index += 3;
00789                 } else if (a_in[in_index] <= 0x1FFFFF) {
00790                         a_out[out_index] = (0xF0 | (a_in[in_index] >> 18));
00791                         a_out[out_index + 1]
00792                                 = (0x80 | ((a_in[in_index] >> 12) & 0x3F));
00793                         a_out[out_index + 2]
00794                                 = (0x80 | ((a_in[in_index] >> 6) & 0x3F));
00795                         a_out[out_index + 3]
00796                                 = (0x80 | (a_in[in_index] & 0x3F));
00797                         out_index += 4;
00798                 } else if (a_in[in_index] <= 0x3FFFFFF) {
00799                         a_out[out_index] = (0xF8 | (a_in[in_index] >> 24));
00800                         a_out[out_index + 1] =
00801                                 (0x80 | (a_in[in_index] >> 18));
00802                         a_out[out_index + 2]
00803                                 = (0x80 | ((a_in[in_index] >> 12) & 0x3F));
00804                         a_out[out_index + 3]
00805                                 = (0x80 | ((a_in[in_index] >> 6) & 0x3F));
00806                         a_out[out_index + 4]
00807                                 = (0x80 | (a_in[in_index] & 0x3F));
00808                         out_index += 5;
00809                 } else if (a_in[in_index] <= 0x7FFFFFFF) {
00810                         a_out[out_index] = (0xFC | (a_in[in_index] >> 30));
00811                         a_out[out_index + 1] =
00812                                 (0x80 | (a_in[in_index] >> 24));
00813                         a_out[out_index + 2]
00814                                 = (0x80 | ((a_in[in_index] >> 18) & 0x3F));
00815                         a_out[out_index + 3]
00816                                 = (0x80 | ((a_in[in_index] >> 12) & 0x3F));
00817                         a_out[out_index + 4]
00818                                 = (0x80 | ((a_in[in_index] >> 6) & 0x3F));
00819                         a_out[out_index + 4]
00820                                 = (0x80 | (a_in[in_index] & 0x3F));
00821                         out_index += 6;
00822                 } else {
00823                         status = CR_ENCODING_ERROR;
00824                         goto end;
00825                 }
00826         }                       /*end for */
00827 
00828       end:
00829         *a_in_len = in_index + 1;
00830         *a_out_len = out_index + 1;
00831 
00832         return status;
00833 }
00834 
00835 /**
00836  *Converts an ucs4 string into an utf8 string.
00837  *@param a_in the input string to convert.
00838  *@param a_in_len in/out parameter. The length of the input
00839  *string. After return, points to the actual number of characters
00840  *consumed. This can be usefull to debug the input string in case
00841  *of encoding error.
00842  *@param a_out out parameter. Points to the output string. It is allocated 
00843  *by this function and must be freed by the caller.
00844  *@param a_out_len out parameter. The length (in bytes) of the output string.
00845  *@return CR_OK upon successfull completion, an error code otherwise.
00846  */
00847 enum CRStatus
00848 cr_utils_ucs4_str_to_utf8 (const guint32 * a_in,
00849                            gulong * a_in_len,
00850                            guchar ** a_out, gulong * a_out_len)
00851 {
00852         enum CRStatus status = CR_OK;
00853 
00854         g_return_val_if_fail (a_in && a_in_len && a_out
00855                               && a_out_len, CR_BAD_PARAM_ERROR);
00856 
00857         status = cr_utils_ucs4_str_len_as_utf8 (a_in,
00858                                                 &a_in[*a_out_len - 1],
00859                                                 a_out_len);
00860 
00861         g_return_val_if_fail (status == CR_OK, status);
00862 
00863         status = cr_utils_ucs4_to_utf8 (a_in, a_in_len, *a_out, a_out_len);
00864 
00865         return status;
00866 }
00867 
00868 /**
00869  *Converts an ucs1 buffer into an utf8 buffer.
00870  *The caller must know the size of the resulting buffer and
00871  *allocate it prior to calling this function.
00872  *
00873  *@param a_in the input ucs1 buffer.
00874  *
00875  *@param a_in_len in/out parameter. The length of the input buffer.
00876  *After return, points to the number of bytes actually consumed even
00877  *in case of encoding error.
00878  *
00879  *@param a_out out parameter. The output utf8 converted buffer.
00880  *
00881  *@param a_out_len in/out parameter. The size of the output buffer.
00882  *If the output buffer size is shorter than the actual needed size, 
00883  *this function just convert what it can.
00884  *
00885  *@return CR_OK upon successfull completion, an error code otherwise.
00886  *
00887  */
00888 enum CRStatus
00889 cr_utils_ucs1_to_utf8 (const guchar * a_in,
00890                        gulong * a_in_len, guchar * a_out, gulong * a_out_len)
00891 {
00892         gulong out_index = 0,
00893                 in_index = 0,
00894                 in_len = 0,
00895                 out_len = 0;
00896         enum CRStatus status = CR_OK;
00897 
00898         g_return_val_if_fail (a_in && a_in_len
00899                               && a_out_len, 
00900                               CR_BAD_PARAM_ERROR);
00901 
00902         if (*a_in_len == 0) {
00903                 *a_out_len = 0 ;
00904                 return CR_OK ;
00905         }
00906         g_return_val_if_fail (a_out, CR_BAD_PARAM_ERROR) ;
00907 
00908         if (*a_in_len < 1) {
00909                 status = CR_OK;
00910                 goto end;
00911         }
00912 
00913         in_len = *a_in_len;
00914         out_len = *a_out_len;
00915 
00916         for (in_index = 0, out_index = 0;
00917              (in_index < in_len) && (out_index < out_len); in_index++) {
00918                 /*
00919                  *FIXME: return whenever we encounter forbidden char values.
00920                  */
00921 
00922                 if (a_in[in_index] <= 0x7F) {
00923                         a_out[out_index] = a_in[in_index];
00924                         out_index++;
00925                 } else {
00926                         a_out[out_index] = (0xC0 | (a_in[in_index] >> 6));
00927                         a_out[out_index + 1] =
00928                                 (0x80 | (a_in[in_index] & 0x3F));
00929                         out_index += 2;
00930                 }
00931         }                       /*end for */
00932 
00933       end:
00934         *a_in_len = in_index;
00935         *a_out_len = out_index;
00936 
00937         return CR_OK;
00938 }
00939 
00940 /**
00941  *Converts an ucs1 string into an utf8 string.
00942  *@param a_in_start the beginning of the input string to convert.
00943  *@param a_in_end the end of the input string to convert.
00944  *@param a_out out parameter. The converted string.
00945  *@param a_out out parameter. The length of the converted string.
00946  *@return CR_OK upon successfull completion, an error code otherwise.
00947  *
00948  */
00949 enum CRStatus
00950 cr_utils_ucs1_str_to_utf8 (const guchar * a_in,
00951                            gulong * a_in_len,
00952                            guchar ** a_out, gulong * a_out_len)
00953 {
00954         gulong in_len = 0,
00955                 out_len = 0;
00956         enum CRStatus status = CR_OK;
00957 
00958         g_return_val_if_fail (a_in && a_in_len && a_out
00959                               && a_out_len, CR_BAD_PARAM_ERROR);
00960 
00961         if (*a_in_len < 1) {
00962                 *a_out_len = 0;
00963                 *a_out = NULL;
00964                 return CR_OK;
00965         }
00966 
00967         status = cr_utils_ucs1_str_len_as_utf8 (a_in, &a_in[*a_in_len - 1],
00968                                                 &out_len);
00969 
00970         g_return_val_if_fail (status == CR_OK, status);
00971 
00972         in_len = *a_in_len;
00973 
00974         *a_out = g_malloc0 (out_len);
00975 
00976         status = cr_utils_ucs1_to_utf8 (a_in, a_in_len, *a_out, &out_len);
00977 
00978         *a_out_len = out_len;
00979 
00980         return status;
00981 }
00982 
00983 /**
00984  *Converts an utf8 buffer into an ucs1 buffer.
00985  *The caller must know the size of the resulting
00986  *converted buffer, and allocated it prior to calling this
00987  *function.
00988  *
00989  *@param a_in the input utf8 buffer to convert.
00990  *
00991  *@param a_in_len in/out parameter. The size of the input utf8 buffer.
00992  *After return, points to the number of bytes consumed
00993  *by the function even in case of encoding error.
00994  *
00995  *@param a_out out parameter. Points to the resulting buffer.
00996  *Must be allocated by the caller. If the size of a_out is shorter
00997  *than its required size, this function converts what it can and return
00998  *a successfull status.
00999  *
01000  *@param a_out_len in/out parameter. The size of the output buffer.
01001  *After return, points to the number of bytes consumed even in case of
01002  *encoding error.
01003  *
01004  *@return CR_OK upon successfull completion, an error code otherwise.
01005  */
01006 enum CRStatus
01007 cr_utils_utf8_to_ucs1 (const guchar * a_in,
01008                        gulong * a_in_len, guchar * a_out, gulong * a_out_len)
01009 {
01010         gulong in_index = 0,
01011                 out_index = 0,
01012                 in_len = 0,
01013                 out_len = 0;
01014         enum CRStatus status = CR_OK;
01015 
01016         /*
01017          *to store the final decoded 
01018          *unicode char
01019          */
01020         guint32 c = 0;
01021 
01022         g_return_val_if_fail (a_in && a_in_len
01023                               && a_out && a_out_len, CR_BAD_PARAM_ERROR);
01024 
01025         if (*a_in_len < 1) {
01026                 status = CR_OK;
01027                 goto end;
01028         }
01029 
01030         in_len = *a_in_len;
01031         out_len = *a_out_len;
01032 
01033         for (in_index = 0, out_index = 0;
01034              (in_index < in_len) && (out_index < out_len);
01035              in_index++, out_index++) {
01036                 gint nb_bytes_2_decode = 0;
01037 
01038                 if (a_in[in_index] <= 0x7F) {
01039                         /*
01040                          *7 bits long char
01041                          *encoded over 1 byte:
01042                          * 0xxx xxxx
01043                          */
01044                         c = a_in[in_index];
01045                         nb_bytes_2_decode = 1;
01046 
01047                 } else if ((a_in[in_index] & 0xE0) == 0xC0) {
01048                         /*
01049                          *up to 11 bits long char.
01050                          *encoded over 2 bytes:
01051                          *110x xxxx  10xx xxxx
01052                          */
01053                         c = a_in[in_index] & 0x1F;
01054                         nb_bytes_2_decode = 2;
01055 
01056                 } else if ((a_in[in_index] & 0xF0) == 0xE0) {
01057                         /*
01058                          *up to 16 bit long char
01059                          *encoded over 3 bytes:
01060                          *1110 xxxx  10xx xxxx  10xx xxxx
01061                          */
01062                         c = a_in[in_index] & 0x0F;
01063                         nb_bytes_2_decode = 3;
01064 
01065                 } else if ((a_in[in_index] & 0xF8) == 0xF0) {
01066                         /*
01067                          *up to 21 bits long char
01068                          *encoded over 4 bytes:
01069                          *1111 0xxx  10xx xxxx  10xx xxxx  10xx xxxx
01070                          */
01071                         c = a_in[in_index] & 0x7;
01072                         nb_bytes_2_decode = 4;
01073 
01074                 } else if ((a_in[in_index] & 0xFC) == 0xF8) {
01075                         /*
01076                          *up to 26 bits long char
01077                          *encoded over 5 bytes.
01078                          *1111 10xx  10xx xxxx  10xx xxxx  
01079                          *10xx xxxx  10xx xxxx
01080                          */
01081                         c = a_in[in_index] & 3;
01082                         nb_bytes_2_decode = 5;
01083 
01084                 } else if ((a_in[in_index] & 0xFE) == 0xFC) {
01085                         /*
01086                          *up to 31 bits long char
01087                          *encoded over 6 bytes:
01088                          *1111 110x  10xx xxxx  10xx xxxx  
01089                          *10xx xxxx  10xx xxxx  10xx xxxx
01090                          */
01091                         c = a_in[in_index] & 1;
01092                         nb_bytes_2_decode = 6;
01093 
01094                 } else {
01095                         /*BAD ENCODING */
01096                         status = CR_ENCODING_ERROR;
01097                         goto end;
01098                 }
01099 
01100                 /*
01101                  *Go and decode the remaining byte(s)
01102                  *(if any) to get the current character.
01103                  */
01104                 if (in_index + nb_bytes_2_decode - 1 >= in_len) {
01105                         status = CR_OK;
01106                         goto end;
01107                 }
01108 
01109                 for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) {
01110                         /*decode the next byte */
01111                         in_index++;
01112 
01113                         /*byte pattern must be: 10xx xxxx */
01114                         if ((a_in[in_index] & 0xC0) != 0x80) {
01115                                 status = CR_ENCODING_ERROR;
01116                                 goto end;
01117                         }
01118 
01119                         c = (c << 6) | (a_in[in_index] & 0x3F);
01120                 }
01121 
01122                 /*
01123                  *The decoded ucs4 char is now
01124                  *in c.
01125                  */
01126 
01127                 if (c > 0xFF) {
01128                         status = CR_ENCODING_ERROR;
01129                         goto end;
01130                 }
01131 
01132                 a_out[out_index] = c;
01133         }
01134 
01135       end:
01136         *a_out_len = out_index;
01137         *a_in_len = in_index;
01138 
01139         return CR_OK;
01140 }
01141 
01142 /**
01143  *Converts an utf8 buffer into an
01144  *ucs1 buffer.
01145  *@param a_in_start the start of the input buffer.
01146  *@param a_in_end the end of the input buffer.
01147  *@param a_out out parameter. The resulting converted ucs4 buffer.
01148  *Must be freed by the caller.
01149  *@param a_out_len out parameter. The length of the converted buffer.
01150  *@return CR_OK upon successfull completion, an error code otherwise.
01151  *Note that out parameters are valid if and only if this function
01152  *returns CR_OK.
01153  */
01154 enum CRStatus
01155 cr_utils_utf8_str_to_ucs1 (const guchar * a_in,
01156                            gulong * a_in_len,
01157                            guchar ** a_out, gulong * a_out_len)
01158 {
01159         enum CRStatus status = CR_OK;
01160 
01161         g_return_val_if_fail (a_in && a_in_len
01162                               && a_out && a_out_len, CR_BAD_PARAM_ERROR);
01163 
01164         if (*a_in_len < 1) {
01165                 *a_out_len = 0;
01166                 *a_out = NULL;
01167                 return CR_OK;
01168         }
01169 
01170         status = cr_utils_utf8_str_len_as_ucs4 (a_in, &a_in[*a_in_len - 1],
01171                                                 a_out_len);
01172 
01173         g_return_val_if_fail (status == CR_OK, status);
01174 
01175         *a_out = g_malloc0 (*a_out_len * sizeof (guint32));
01176 
01177         status = cr_utils_utf8_to_ucs1 (a_in, a_in_len, *a_out, a_out_len);
01178         return status;
01179 }
01180 
01181 /*****************************************
01182  *CSS basic types identification utilities
01183  *****************************************/
01184 
01185 /**
01186  *Returns TRUE if a_char is a white space as
01187  *defined in the css spec in chap 4.1.1.
01188  *
01189  *white-space ::= ' '| \t|\r|\n|\f
01190  *
01191  *@param a_char the character to test.
01192  *return TRUE if is a white space, false otherwise.
01193  */
01194 gboolean
01195 cr_utils_is_white_space (guint32 a_char)
01196 {
01197         switch (a_char) {
01198         case ' ':
01199         case '\t':
01200         case '\r':
01201         case '\n':
01202         case '\f':
01203                 return TRUE;
01204                 break;
01205         default:
01206                 return FALSE;
01207         }
01208 }
01209 
01210 /**
01211  *Returns true if the character is a newline
01212  *as defined in the css spec in the chap 4.1.1.
01213  *
01214  *nl ::= \n|\r\n|\r|\f
01215  *
01216  *@param a_char the character to test.
01217  *@return TRUE if the character is a newline, FALSE otherwise.
01218  */
01219 gboolean
01220 cr_utils_is_newline (guint32 a_char)
01221 {
01222         switch (a_char) {
01223         case '\n':
01224         case '\r':
01225         case '\f':
01226                 return TRUE;
01227                 break;
01228         default:
01229                 return FALSE;
01230         }
01231 }
01232 
01233 /**
01234  *returns TRUE if the char is part of an hexa num char:
01235  *i.e hexa_char ::= [0-9A-F]
01236  */
01237 gboolean
01238 cr_utils_is_hexa_char (guint32 a_char)
01239 {
01240         if ((a_char >= '0' && a_char <= '9')
01241             || (a_char >= 'A' && a_char <= 'F')) {
01242                 return TRUE;
01243         }
01244         return FALSE;
01245 }
01246 
01247 /**
01248  *Returns true if the character is a nonascii
01249  *character (as defined in the css spec chap 4.1.1):
01250  *
01251  *nonascii ::= [^\0-\177]
01252  *
01253  *@param a_char the character to test.
01254  *@return TRUE if the character is a nonascii char,
01255  *FALSE otherwise.
01256  */
01257 gboolean
01258 cr_utils_is_nonascii (guint32 a_char)
01259 {
01260         if (a_char <= 177) {
01261                 return FALSE;
01262         }
01263 
01264         return TRUE;
01265 }
01266 
01267 /**
01268  *Dumps a character a_nb times on a file.
01269  *@param a_char the char to dump
01270  *@param a_fp the destination file pointer
01271  *@param a_nb the number of times a_char is to be dumped.
01272  */
01273 void
01274 cr_utils_dump_n_chars (guchar a_char, FILE * a_fp, glong a_nb)
01275 {
01276         glong i = 0;
01277 
01278         for (i = 0; i < a_nb; i++) {
01279                 fprintf (a_fp, "%c", a_char);
01280         }
01281 }
01282 
01283 void
01284 cr_utils_dump_n_chars2 (guchar a_char, GString * a_string, glong a_nb)
01285 {
01286         glong i = 0;
01287 
01288         g_return_if_fail (a_string);
01289 
01290         for (i = 0; i < a_nb; i++) {
01291                 g_string_append_printf (a_string, "%c", a_char);
01292         }
01293 }
01294 
01295 gdouble
01296 cr_utils_n_to_0_dot_n (glong a_n, glong decimal_places)
01297 {
01298         gdouble result = a_n;
01299 
01300         while (decimal_places > 0) {
01301                 result = result / 10;
01302                 decimal_places--;
01303         }
01304 
01305         return result;
01306 }
01307 
01308 /**
01309  *Duplicates a list of GString instances.
01310  *@return the duplicated list of GString instances or NULL if
01311  *something bad happened.
01312  *@param a_list_of_strings the list of strings to be duplicated.
01313  */
01314 GList *
01315 cr_utils_dup_glist_of_string (GList * a_list_of_strings)
01316 {
01317         GList *cur = NULL,
01318                 *result = NULL;
01319 
01320         g_return_val_if_fail (a_list_of_strings, NULL);
01321 
01322         for (cur = a_list_of_strings; cur; cur = cur->next) {
01323                 GString *str = NULL;
01324 
01325                 str = g_string_new_len (((GString *) cur->data)->str,
01326                                         ((GString *) cur->data)->len);
01327                 if (str)
01328                         result = g_list_append (result, str);
01329         }
01330 
01331         return result;
01332 }
01333 
01334 /**
01335  *Duplicate a GList where the GList::data is a CRString.
01336  *@param a_list_of_strings the list to duplicate
01337  *@return the duplicated list, or NULL if something bad
01338  *happened.
01339  */
01340 GList *
01341 cr_utils_dup_glist_of_cr_string (GList * a_list_of_strings)
01342 {
01343         GList *cur = NULL, *result = NULL;
01344 
01345         g_return_val_if_fail (a_list_of_strings, NULL);
01346 
01347         for (cur = a_list_of_strings; cur; cur = cur->next) {
01348                 CRString *str = NULL;
01349 
01350                 str = cr_string_dup ((CRString *) cur->data) ;
01351                 if (str)
01352                         result = g_list_append (result, str);
01353         }
01354 
01355         return result;
01356 }

Generated on Thu Mar 9 19:19:09 2006 for Libcroco by  doxygen 1.4.6