GG
|
// Copyright 2006 Nemanja Trifunovic

/*
Permission is hereby granted, free of charge, to any person or organization
obtaining a copy of the software and accompanying documentation covered by
this license (the "Software") to use, reproduce, display, distribute,
execute, and transmit the Software, and to prepare derivative works of the
Software, and to permit third-parties to whom the Software is furnished to
do so, all subject to the following:

The copyright notices in the Software and this entire statement, including
the above license grant, this restriction and the following disclaimer,
must be included in all copies of the Software, in whole or in part, and
all derivative works of the Software, unless such copies or derivative
works are solely in the form of machine-executable object code generated by
a source language processor.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*/

#ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
#define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731

// Boost's fixed-width integer typedefs remain the preferred source (they are
// what this header has always used). Only when the preprocessor can prove that
// <boost/cstdint.hpp> is not installed do we fall back to the C99 <stdint.h>
// typedefs, so the header also builds stand-alone.
#if defined(__has_include)
#  if !__has_include(<boost/cstdint.hpp>)
#    define UTF8_FOR_CPP_NO_BOOST_CSTDINT
#  endif
#endif

#ifdef UTF8_FOR_CPP_NO_BOOST_CSTDINT
#  include <stdint.h>
#else
#  include <boost/cstdint.hpp>
#endif

#include <iterator>

namespace utf8
{
    // The typedefs for 8-bit, 16-bit and 32-bit unsigned integers
    // You may need to change them to match your system.
    // These typedefs have the same names as ones from cstdint, or boost/cstdint
#ifdef UTF8_FOR_CPP_NO_BOOST_CSTDINT
    typedef ::uint8_t   uint8_t;
    typedef ::uint16_t  uint16_t;
    typedef ::uint32_t  uint32_t;
#else
    typedef boost::uint8_t   uint8_t;
    typedef boost::uint16_t  uint16_t;
    typedef boost::uint32_t  uint32_t;
#endif

// Helper code - not intended to be directly called by the library users. May be changed at any time
namespace internal
{
    // Unicode constants
    // Leading (high) surrogates: 0xd800 - 0xdbff
    // Trailing (low) surrogates: 0xdc00 - 0xdfff
    const uint16_t LEAD_SURROGATE_MIN  = 0xd800u;
    const uint16_t LEAD_SURROGATE_MAX  = 0xdbffu;
    const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u;
    const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu;
    const uint16_t LEAD_OFFSET         = LEAD_SURROGATE_MIN - (0x10000 >> 10);
    // The shift is done on a 32-bit unsigned value so that it can never overflow
    // a (promoted) signed int; the result intentionally wraps modulo 2^32.
    const uint32_t SURROGATE_OFFSET    = 0x10000u - (static_cast<uint32_t>(LEAD_SURROGATE_MIN) << 10) - TRAIL_SURROGATE_MIN;

    // Maximum valid value for a Unicode code point
    const uint32_t CODE_POINT_MAX      = 0x0010ffffu;

    // Returns the low 8 bits of oc as an unsigned octet. Needed because the
    // iterator's value type may be a signed char.
    template<typename octet_type>
    inline uint8_t mask8(octet_type oc)
    {
        return static_cast<uint8_t>(0xff & oc);
    }

    // Returns the low 16 bits of oc as an unsigned 16-bit value.
    template<typename u16_type>
    inline uint16_t mask16(u16_type oc)
    {
        return static_cast<uint16_t>(0xffff & oc);
    }

    // True when oc has the bit pattern 10xxxxxx, i.e. it is a UTF-8 trail octet.
    template<typename octet_type>
    inline bool is_trail(octet_type oc)
    {
        return ((mask8(oc) >> 6) == 0x2);
    }

    // True when cp lies in the surrogate range 0xd800 - 0xdfff (lead or trail).
    template <typename u16>
    inline bool is_surrogate(u16 cp)
    {
        return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
    }

    // True when cp is accepted by this library as a code point: not above
    // CODE_POINT_MAX, not a surrogate, and neither 0xfffe nor 0xffff.
    // NOTE(review): rejecting 0xfffe / 0xffff is stricter than plain UTF-8
    // well-formedness; it is kept because callers may rely on it.
    template <typename u32>
    inline bool is_code_point_valid(u32 cp)
    {
        return (cp <= CODE_POINT_MAX && !is_surrogate(cp) && cp != 0xfffe && cp != 0xffff);
    }

    // Returns the number of octets (1 - 4) announced by the lead octet that
    // lead_it points to, or 0 when that octet cannot start a sequence.
    // lead_it must be dereferenceable.
    template <typename octet_iterator>
    inline typename std::iterator_traits<octet_iterator>::difference_type
    sequence_length(octet_iterator lead_it)
    {
        uint8_t lead = mask8(*lead_it);
        if (lead < 0x80)
            return 1;
        else if ((lead >> 5) == 0x6)
            return 2;
        else if ((lead >> 4) == 0xe)
            return 3;
        else if ((lead >> 3) == 0x1e)
            return 4;
        else
            return 0;
    }

    enum utf_error {OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT};

    // Validates (and optionally decodes) the UTF-8 sequence starting at it.
    //
    //   it          in/out: on OK it is moved just past the sequence; on any
    //               error it is left on the first octet of the offending sequence.
    //   end         one past the last readable octet.
    //   code_point  optional output (may be null). Receives the decoded value on
    //               OK. It is also written when OVERLONG_SEQUENCE is returned;
    //               it is not written for the other errors.
    //
    // Returns:
    //   NOT_ENOUGH_ROOM      the range is empty, or holds fewer octets than the
    //                        lead octet announces
    //   INVALID_LEAD         the first octet cannot start a sequence
    //   INCOMPLETE_SEQUENCE  an expected trail octet is not of the form 10xxxxxx
    //   INVALID_CODE_POINT   the decoded value fails is_code_point_valid()
    //   OVERLONG_SEQUENCE    the value was encoded with more octets than needed
    //
    // The iterator is only ever copied, compared and incremented, so any
    // multi-pass (forward) iterator is sufficient.
    template <typename octet_iterator>
    utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t* code_point)
    {
        typedef typename std::iterator_traits<octet_iterator>::difference_type octet_difference_type;

        // Never dereference an exhausted range.
        if (it == end)
            return NOT_ENOUGH_ROOM;

        // Check the lead octet
        const uint8_t lead = mask8(*it);
        const octet_difference_type length = sequence_length(it);

        // "Shortcut" for ASCII characters
        if (length == 1) {
            if (code_point)
                *code_point = lead;
            ++it;
            return OK;
        }

        if (length == 0)
            return INVALID_LEAD;

        // Do we have enough memory? Probe at most `length` steps instead of
        // measuring the distance to `end`, which would be linear in the size of
        // the remaining input for non-random-access iterators.
        octet_iterator probe = it;
        for (octet_difference_type i = 0; i < length; ++i) {
            if (probe == end)
                return NOT_ENOUGH_ROOM;
            ++probe;
        }

        // Check trail octets and calculate the code point. Work on a copy so
        // that `it` keeps pointing at the lead octet if anything goes wrong.
        // The lead octet contributes 5, 4 or 3 payload bits for lengths 2, 3, 4.
        uint32_t cp = lead & ((length == 2) ? 0x1f : (length == 3) ? 0x0f : 0x07);
        octet_iterator last = it;
        for (octet_difference_type i = 1; i < length; ++i) {
            ++last;
            if (!is_trail(*last))
                return INCOMPLETE_SEQUENCE;
            cp = (cp << 6) | (mask8(*last) & 0x3f);
        }

        // Is the code point valid?
        if (!is_code_point_valid(cp))
            return INVALID_CODE_POINT;

        if (code_point)
            *code_point = cp;

        // Reject encodings that use more octets than the shortest form needs.
        const octet_difference_type shortest =
            (cp < 0x80) ? 1 : (cp < 0x800) ? 2 : (cp < 0x10000) ? 3 : 4;
        if (length != shortest)
            return OVERLONG_SEQUENCE;

        // Success: commit the new position, one past the last trail octet.
        it = last;
        ++it;
        return OK;
    }

    // Same as the three-argument overload, without retrieving the code point.
    template <typename octet_iterator>
    inline utf_error validate_next(octet_iterator& it, octet_iterator end) {
        return validate_next(it, end, 0);
    }

} // namespace internal

    // Byte order mark
    const uint8_t bom[] = {0xef, 0xbb, 0xbf};

    // Returns an iterator to the first octet of the first sequence in
    // [start, end) that validate_next() does not accept, or end when every
    // sequence is accepted.
    template <typename octet_iterator>
    octet_iterator find_invalid(octet_iterator start, octet_iterator end)
    {
        octet_iterator result = start;
        while (result != end) {
            internal::utf_error err_code = internal::validate_next(result, end);
            if (err_code != internal::OK)
                return result;
        }
        return result;
    }

    // True when find_invalid() finds nothing to object to in [start, end).
    template <typename octet_iterator>
    inline bool is_valid(octet_iterator start, octet_iterator end)
    {
        return (find_invalid(start, end) == end);
    }

    // True when [it, end) begins with the three-octet byte order mark.
    // Never reads at or past end, so it is safe on ranges shorter than three octets.
    template <typename octet_iterator>
    inline bool starts_with_bom (octet_iterator it, octet_iterator end)
    {
        return (
            ((it != end) && (internal::mask8(*it++)) == bom[0]) &&
            ((it != end) && (internal::mask8(*it++)) == bom[1]) &&
            ((it != end) && (internal::mask8(*it))   == bom[2])
           );
    }

    // True when the three octets starting at it are the byte order mark.
    // There is no end iterator: up to three octets are read, so the caller must
    // guarantee that they are readable (reading stops at the first mismatch).
    // Use starts_with_bom() when the length of the input is not known.
    template <typename octet_iterator>
    inline bool is_bom (octet_iterator it)
    {
        return (
            (internal::mask8(*it++)) == bom[0] &&
            (internal::mask8(*it++)) == bom[1] &&
            (internal::mask8(*it))   == bom[2]
           );
    }
} // namespace utf8

#endif // header guard