GG
|
00001 // Copyright 2006 Nemanja Trifunovic 00002 00003 /* 00004 Permission is hereby granted, free of charge, to any person or organization 00005 obtaining a copy of the software and accompanying documentation covered by 00006 this license (the "Software") to use, reproduce, display, distribute, 00007 execute, and transmit the Software, and to prepare derivative works of the 00008 Software, and to permit third-parties to whom the Software is furnished to 00009 do so, all subject to the following: 00010 00011 The copyright notices in the Software and this entire statement, including 00012 the above license grant, this restriction and the following disclaimer, 00013 must be included in all copies of the Software, in whole or in part, and 00014 all derivative works of the Software, unless such copies or derivative 00015 works are solely in the form of machine-executable object code generated by 00016 a source language processor. 00017 00018 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 00019 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 00020 FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT 00021 SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE 00022 FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, 00023 ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 00024 DEALINGS IN THE SOFTWARE. 00025 */ 00026 00031 #ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 00032 #define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 00033 00034 #include "core.h" 00035 #include <stdexcept> 00036 00037 namespace utf8 00038 { 00039 // Exceptions that may be thrown from the library functions. 00040 class invalid_code_point : public std::exception { 00041 uint32_t cp; 00042 public: 00043 invalid_code_point(uint32_t cp) : cp(cp) {} 00044 virtual const char* what() const throw() { return "Invalid code point"; } 00045 uint32_t code_point() const {return cp;} 00046 }; 00047 00048 class invalid_utf8 : public std::exception { 00049 uint8_t u8; 00050 public: 00051 invalid_utf8 (uint8_t u) : u8(u) {} 00052 virtual const char* what() const throw() { return "Invalid UTF-8"; } 00053 uint8_t utf8_octet() const {return u8;} 00054 }; 00055 00056 class invalid_utf16 : public std::exception { 00057 uint16_t u16; 00058 public: 00059 invalid_utf16 (uint16_t u) : u16(u) {} 00060 virtual const char* what() const throw() { return "Invalid UTF-16"; } 00061 uint16_t utf16_word() const {return u16;} 00062 }; 00063 00064 class not_enough_room : public std::exception { 00065 public: 00066 virtual const char* what() const throw() { return "Not enough space"; } 00067 }; 00068 00070 00071 template <typename octet_iterator, typename output_iterator> 00072 output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement) 00073 { 00074 while (start != end) { 00075 octet_iterator sequence_start = start; 00076 internal::utf_error err_code = internal::validate_next(start, end); 00077 switch (err_code) { 00078 case internal::OK : 00079 for (octet_iterator it = sequence_start; it != start; ++it) 00080 *out++ = *it; 00081 break; 00082 case internal::NOT_ENOUGH_ROOM: 00083 throw not_enough_room(); 00084 case internal::INVALID_LEAD: 00085 append (replacement, out); 00086 ++start; 00087 break; 00088 case internal::INCOMPLETE_SEQUENCE: 00089 case internal::OVERLONG_SEQUENCE: 00090 case internal::INVALID_CODE_POINT: 00091 append (replacement, out); 00092 ++start; 00093 // just one replacement mark for the sequence 00094 while (internal::is_trail(*start) && start != end) 00095 ++start; 00096 break; 00097 } 00098 } 00099 return out; 00100 } 00101 00102 template <typename octet_iterator, typename output_iterator> 00103 inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out) 00104 { 00105 static const uint32_t replacement_marker = internal::mask16(0xfffd); 00106 return replace_invalid(start, end, out, replacement_marker); 00107 } 00108 00109 template <typename octet_iterator> 00110 octet_iterator append(uint32_t cp, octet_iterator result) 00111 { 00112 if (!internal::is_code_point_valid(cp)) 00113 throw invalid_code_point(cp); 00114 00115 if (cp < 0x80) // one octet 00116 *(result++) = static_cast<uint8_t>(cp); 00117 else if (cp < 0x800) { // two octets 00118 *(result++) = static_cast<uint8_t>((cp >> 6) | 0xc0); 00119 *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80); 00120 } 00121 else if (cp < 0x10000) { // three octets 00122 *(result++) = static_cast<uint8_t>((cp >> 12) | 0xe0); 00123 *(result++) = static_cast<uint8_t>((cp >> 6) & 0x3f | 0x80); 00124 *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80); 00125 } 00126 else if (cp <= internal::CODE_POINT_MAX) { // four octets 00127 *(result++) = static_cast<uint8_t>((cp >> 18) | 0xf0); 00128 *(result++) = static_cast<uint8_t>((cp >> 12)& 0x3f | 0x80); 00129 *(result++) = static_cast<uint8_t>((cp >> 6) & 0x3f | 0x80); 00130 *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80); 00131 } 00132 else 00133 throw invalid_code_point(cp); 00134 00135 return result; 00136 } 00137 00138 template <typename octet_iterator> 00139 uint32_t next(octet_iterator& it, octet_iterator end) 00140 { 00141 uint32_t cp = 0; 00142 internal::utf_error err_code = internal::validate_next(it, end, &cp); 00143 switch (err_code) { 00144 case internal::OK : 00145 break; 00146 case internal::NOT_ENOUGH_ROOM : 00147 throw not_enough_room(); 00148 case internal::INVALID_LEAD : 00149 case internal::INCOMPLETE_SEQUENCE : 00150 case internal::OVERLONG_SEQUENCE : 00151 throw invalid_utf8(*it); 00152 case internal::INVALID_CODE_POINT : 00153 throw invalid_code_point(cp); 00154 } 00155 return cp; 00156 } 00157 00158 template <typename octet_iterator> 00159 uint32_t peek_next(octet_iterator it, octet_iterator end) 00160 { 00161 return next(it, end); 00162 } 00163 00164 template <typename octet_iterator> 00165 uint32_t prior(octet_iterator& it, octet_iterator start) 00166 { 00167 octet_iterator end = it; 00168 while (internal::is_trail(*(--it))) 00169 if (it < start) 00170 throw invalid_utf8(*it); // error - no lead byte in the sequence 00171 octet_iterator temp = it; 00172 return next(temp, end); 00173 } 00174 00176 template <typename octet_iterator> 00177 uint32_t previous(octet_iterator& it, octet_iterator pass_start) 00178 { 00179 octet_iterator end = it; 00180 while (internal::is_trail(*(--it))) 00181 if (it == pass_start) 00182 throw invalid_utf8(*it); // error - no lead byte in the sequence 00183 octet_iterator temp = it; 00184 return next(temp, end); 00185 } 00186 00187 template <typename octet_iterator, typename distance_type> 00188 void advance (octet_iterator& it, distance_type n, octet_iterator end) 00189 { 00190 for (distance_type i = 0; i < n; ++i) 00191 next(it, end); 00192 } 00193 00194 template <typename octet_iterator> 00195 typename std::iterator_traits<octet_iterator>::difference_type 00196 distance (octet_iterator first, octet_iterator last) 00197 { 00198 typename std::iterator_traits<octet_iterator>::difference_type dist; 00199 for (dist = 0; first < last; ++dist) 00200 next(first, last); 00201 return dist; 00202 } 00203 00204 template <typename u16bit_iterator, typename octet_iterator> 00205 octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) 00206 { 00207 while (start != end) { 00208 uint32_t cp = internal::mask16(*start++); 00209 // Take care of surrogate pairs first 00210 if (internal::is_surrogate(cp)) { 00211 if (start != end) { 00212 uint32_t trail_surrogate = internal::mask16(*start++); 00213 if (trail_surrogate >= internal::TRAIL_SURROGATE_MIN && trail_surrogate <= internal::TRAIL_SURROGATE_MAX) 00214 cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; 00215 else 00216 throw invalid_utf16(static_cast<uint16_t>(trail_surrogate)); 00217 } 00218 else 00219 throw invalid_utf16(static_cast<uint16_t>(*start)); 00220 00221 } 00222 result = append(cp, result); 00223 } 00224 return result; 00225 } 00226 00227 template <typename u16bit_iterator, typename octet_iterator> 00228 u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result) 00229 { 00230 while (start != end) { 00231 uint32_t cp = next(start, end); 00232 if (cp > 0xffff) { //make a surrogate pair 00233 *result++ = static_cast<uint16_t>((cp >> 10) + internal::LEAD_OFFSET); 00234 *result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); 00235 } 00236 else 00237 *result++ = static_cast<uint16_t>(cp); 00238 } 00239 return result; 00240 } 00241 00242 template <typename octet_iterator, typename u32bit_iterator> 00243 octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result) 00244 { 00245 while (start != end) 00246 result = append(*(start++), result); 00247 00248 return result; 00249 } 00250 00251 template <typename octet_iterator, typename u32bit_iterator> 00252 u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result) 00253 { 00254 while (start < end) 00255 (*result++) = next(start, end); 00256 00257 return result; 00258 } 00259 00260 // The iterator class 00261 template <typename octet_iterator> 00262 class iterator : public std::iterator <std::bidirectional_iterator_tag, uint32_t> { 00263 octet_iterator it; 00264 octet_iterator range_start; 00265 octet_iterator range_end; 00266 public: 00267 iterator () {}; 00268 explicit iterator (const octet_iterator& octet_it, 00269 const octet_iterator& range_start, 00270 const octet_iterator& range_end) : 00271 it(octet_it), range_start(range_start), range_end(range_end) 00272 { 00273 if (it < range_start || it > range_end) 00274 throw std::out_of_range("Invalid utf-8 iterator position"); 00275 } 00276 // the default "big three" are OK 00277 octet_iterator base () const { return it; } 00278 uint32_t operator * () const 00279 { 00280 octet_iterator temp = it; 00281 return next(temp, range_end); 00282 } 00283 bool operator == (const iterator& rhs) const 00284 { 00285 if (range_start != rhs.range_start || range_end != rhs.range_end) 00286 throw std::logic_error("Comparing utf-8 iterators defined with different ranges"); 00287 return (it == rhs.it); 00288 } 00289 bool operator != (const iterator& rhs) const 00290 { 00291 return !(operator == (rhs)); 00292 } 00293 iterator& operator ++ () 00294 { 00295 next(it, range_end); 00296 return *this; 00297 } 00298 iterator operator ++ (int) 00299 { 00300 iterator temp = *this; 00301 next(it, range_end); 00302 return temp; 00303 } 00304 iterator& operator -- () 00305 { 00306 prior(it, range_start); 00307 return *this; 00308 } 00309 iterator operator -- (int) 00310 { 00311 iterator temp = *this; 00312 prior(it, range_start); 00313 return temp; 00314 } 00315 }; // class iterator 00316 00317 // The wchar_t iterator class 00318 template <typename octet_iterator> 00319 class wchar_iterator : 00320 public std::iterator<std::bidirectional_iterator_tag, wchar_t> 00321 { 00322 octet_iterator it; 00323 octet_iterator range_start; 00324 octet_iterator range_end; 00325 public: 00326 wchar_iterator () {}; 00327 wchar_iterator (const octet_iterator& octet_it, 00328 const octet_iterator& range_start, 00329 const octet_iterator& range_end) : 00330 it(octet_it), range_start(range_start), range_end(range_end) 00331 { 00332 if (it < range_start || it > range_end) 00333 throw std::out_of_range("Invalid utf-8 iterator position"); 00334 } 00335 // the default "big three" are OK 00336 octet_iterator base () const { return it; } 00337 wchar_t operator * () const 00338 { 00339 octet_iterator temp = it; 00340 uint32_t retval = next(temp, range_end); 00341 assert(retval <= WCHAR_MAX); 00342 return retval; 00343 } 00344 bool operator == (const wchar_iterator& rhs) const 00345 { 00346 if (range_start != rhs.range_start || range_end != rhs.range_end) 00347 throw std::logic_error("Comparing utf-8 iterators defined with different ranges"); 00348 return (it == rhs.it); 00349 } 00350 bool operator != (const wchar_iterator& rhs) const 00351 { 00352 return !(operator == (rhs)); 00353 } 00354 wchar_iterator& operator ++ () 00355 { 00356 next(it, range_end); 00357 return *this; 00358 } 00359 wchar_iterator operator ++ (int) 00360 { 00361 wchar_iterator temp = *this; 00362 next(it, range_end); 00363 return temp; 00364 } 00365 wchar_iterator& operator -- () 00366 { 00367 prior(it, range_start); 00368 return *this; 00369 } 00370 wchar_iterator operator -- (int) 00371 { 00372 wchar_iterator temp = *this; 00373 prior(it, range_start); 00374 return temp; 00375 } 00376 }; 00377 00378 } // namespace utf8 00379 00380 #endif //header guard 00381 00382