core.h

Go to the documentation of this file.
00001 // Copyright 2006 Nemanja Trifunovic
00002 
00003 /*
00004 Permission is hereby granted, free of charge, to any person or organization
00005 obtaining a copy of the software and accompanying documentation covered by
00006 this license (the "Software") to use, reproduce, display, distribute,
00007 execute, and transmit the Software, and to prepare derivative works of the
00008 Software, and to permit third-parties to whom the Software is furnished to
00009 do so, all subject to the following:
00010 
00011 The copyright notices in the Software and this entire statement, including
00012 the above license grant, this restriction and the following disclaimer,
00013 must be included in all copies of the Software, in whole or in part, and
00014 all derivative works of the Software, unless such copies or derivative
00015 works are solely in the form of machine-executable object code generated by
00016 a source language processor.
00017 
00018 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
00019 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
00020 FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
00021 SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
00022 FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
00023 ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
00024 DEALINGS IN THE SOFTWARE.
00025 */
00026 
00031 #ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
00032 #define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
00033 
00034 #include <boost/cstdint.hpp>
00035 
00036 #include <iterator>
00037 
00038 namespace utf8
00039 {
00040     // Fixed-width unsigned integer types (8, 16 and 32 bits) used throughout this header.
00041     // They alias the types from boost/cstdint.hpp; change them if your system needs different ones.
00042     // These typedefs have the same names as the ones from cstdint, or boost/cstdint
00043     typedef boost::uint8_t  uint8_t;
00044     typedef boost::uint16_t uint16_t;
00045     typedef boost::uint32_t uint32_t;
00046 
// Helper code - not intended to be directly called by the library users. May be changed at any time
namespace internal
{
    // Unicode constants
    // Leading (high) surrogates: 0xd800 - 0xdbff
    // Trailing (low) surrogates: 0xdc00 - 0xdfff
    const uint16_t LEAD_SURROGATE_MIN  = 0xd800u;
    const uint16_t LEAD_SURROGATE_MAX  = 0xdbffu;
    const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u;
    const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu;
    // The two offsets below are not referenced in this header; presumably they are
    // used by the UTF-16 conversion code elsewhere in the library.
    const uint16_t LEAD_OFFSET         = LEAD_SURROGATE_MIN - (0x10000 >> 10);
    const uint32_t SURROGATE_OFFSET    = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN;

    // Maximum valid value for a Unicode code point
    const uint32_t CODE_POINT_MAX      = 0x0010ffffu;

    // Returns the low 8 bits of oc as an unsigned octet.
    // Makes the code independent of the signedness of the caller's octet type (e.g. plain char).
    template<typename octet_type>
    inline uint8_t mask8(octet_type oc)
    {
        return static_cast<uint8_t>(0xff & oc);
    }

    // Returns the low 16 bits of oc as an unsigned 16-bit value.
    template<typename u16_type>
    inline uint16_t mask16(u16_type oc)
    {
        return static_cast<uint16_t>(0xffff & oc);
    }

    // Returns true if oc is a UTF-8 trail (continuation) octet, i.e. has the bit pattern 10xxxxxx.
    template<typename octet_type>
    inline bool is_trail(octet_type oc)
    {
        return ((mask8(oc) >> 6) == 0x2);
    }

    // Returns true if cp lies in the surrogate range 0xd800 - 0xdfff (lead or trail).
    template <typename u16>
    inline bool is_surrogate(u16 cp)
    {
        return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
    }

    // Returns true if cp is accepted as a code point by this library:
    // not above CODE_POINT_MAX, not a surrogate, and neither 0xfffe nor 0xffff.
    template <typename u32>
    inline bool is_code_point_valid(u32 cp)
    {
        return (cp <= CODE_POINT_MAX && !is_surrogate(cp) && cp != 0xfffe && cp != 0xffff);
    }

    // Returns the length in octets (1 - 4) of the UTF-8 sequence announced by the
    // lead octet *lead_it, or 0 if that octet cannot start a sequence
    // (a trail octet 10xxxxxx, or 11111xxx).
    // Only the lead octet is inspected; lead_it must be dereferenceable.
    template <typename octet_iterator>
    inline typename std::iterator_traits<octet_iterator>::difference_type
    sequence_length(octet_iterator lead_it)
    {
        uint8_t lead = mask8(*lead_it);
        if (lead < 0x80)
            return 1;
        else if ((lead >> 5) == 0x6)
            return 2;
        else if ((lead >> 4) == 0xe)
            return 3;
        else if ((lead >> 3) == 0x1e)
            return 4;
        else
            return 0;
    }

    // Result of validate_next:
    //   OK                  - a well-formed sequence was decoded
    //   NOT_ENOUGH_ROOM     - [it, end) is shorter than the length announced by the lead octet
    //   INVALID_LEAD        - the first octet cannot start a sequence
    //   INCOMPLETE_SEQUENCE - an octet that should be a trail octet is not one
    //   OVERLONG_SEQUENCE   - the code point was encoded with more octets than necessary
    //   INVALID_CODE_POINT  - the decoded value is rejected by is_code_point_valid
    enum utf_error {OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT};

    // Validates and decodes the UTF-8 sequence that starts at it.
    //
    //   it         - in/out: start of the sequence. On OK it is moved just past the
    //                sequence; on any error it is left unchanged.
    //   end        - end of the readable range.
    //   code_point - optional out: if not null, receives the decoded code point.
    //                It is written only when OK is returned.
    //
    // Returns OK or the error that was detected (see utf_error). When several errors
    // apply, the order of precedence is: NOT_ENOUGH_ROOM / INVALID_LEAD,
    // INCOMPLETE_SEQUENCE, INVALID_CODE_POINT, OVERLONG_SEQUENCE.
    //
    // Decoding is done on copies of the iterator, so forward iterators are sufficient,
    // it is never dereferenced when it == end, and no std::distance over the whole
    // remaining range is needed.
    template <typename octet_iterator>
    utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t* code_point)
    {
        // Nothing to read - and *it must not be evaluated
        if (it == end)
            return NOT_ENOUGH_ROOM;

        // Check the lead octet
        typedef typename std::iterator_traits<octet_iterator>::difference_type octet_difference_type;
        const octet_difference_type length = sequence_length(it);
        if (length == 0)
            return INVALID_LEAD;

        // Do we have enough memory? Walk at most length octets instead of measuring
        // the whole remaining range. On success one_past is the new position for it.
        octet_iterator one_past = it;
        for (octet_difference_type seen = 0; seen < length; ++seen, ++one_past)
            if (one_past == end)
                return NOT_ENOUGH_ROOM;

        octet_iterator cur = it;
        uint32_t cp = mask8(*cur);

        // Single octet (ASCII) sequences need no further checks
        if (length > 1) {
            // Keep only the payload bits of the lead octet:
            // 5 bits for 110xxxxx, 4 bits for 1110xxxx, 3 bits for 11110xxx
            cp &= 0xffu >> (length + 1);

            // Check trail octets and calculate the code point, 6 bits per trail octet
            for (octet_difference_type i = 1; i < length; ++i) {
                ++cur;
                if (!is_trail(*cur))
                    return INCOMPLETE_SEQUENCE;
                cp = (cp << 6) | (mask8(*cur) & 0x3f);
            }

            // Is the code point valid?
            if (!is_code_point_valid(cp))
                return INVALID_CODE_POINT;

            // Was the shortest possible encoding used?
            const octet_difference_type shortest =
                (cp < 0x80) ? 1 : (cp < 0x800) ? 2 : (cp < 0x10000) ? 3 : 4;
            if (length != shortest)
                return OVERLONG_SEQUENCE;
        }

        // Success: only now publish the results
        if (code_point)
            *code_point = cp;
        it = one_past;
        return OK;
    }

    // Same as the three argument overload, for callers that do not need the decoded code point.
    template <typename octet_iterator>
    inline utf_error validate_next(octet_iterator& it, octet_iterator end) {
        return validate_next(it, end, 0);
    }

} // namespace internal
00227 
00229 
00230     // Byte order mark
00231     const uint8_t bom[] = {0xef, 0xbb, 0xbf}; 
00232 
00233     template <typename octet_iterator>
00234     octet_iterator find_invalid(octet_iterator start, octet_iterator end)
00235     {
00236         octet_iterator result = start;
00237         while (result != end) {
00238             internal::utf_error err_code = internal::validate_next(result, end);
00239             if (err_code != internal::OK)
00240                 return result;
00241         }
00242         return result;
00243     }
00244 
00245     template <typename octet_iterator>
00246     inline bool is_valid(octet_iterator start, octet_iterator end)
00247     {
00248         return (find_invalid(start, end) == end);
00249     }
00250 
00251     template <typename octet_iterator>
00252     inline bool is_bom (octet_iterator it)
00253     {
00254         return (
00255             (internal::mask8(*it++)) == bom[0] &&
00256             (internal::mask8(*it++)) == bom[1] &&
00257             (internal::mask8(*it))   == bom[2]
00258            );
00259     }
00260 } // namespace utf8
00261 
00262 #endif // header guard
00263 
00264 

Generated on Sat Mar 26 07:08:37 2011 for GG by  doxygen 1.5.9