00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00031 #ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
00032 #define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
00033
00034 #include "core.h"
00035 #include <stdexcept>
00036
00037 namespace utf8
00038 {
00039
/// Exception carrying a 32-bit value that was rejected as a code point.
class invalid_code_point : public std::exception {
public:
    /// Stores the offending value so that handlers can inspect it.
    invalid_code_point(uint32_t cp) : rejected_cp(cp)
    {
    }

    /// Fixed, human-readable description of the error.
    virtual const char* what() const throw()
    {
        return "Invalid code point";
    }

    /// The value passed to the constructor.
    uint32_t code_point() const
    {
        return rejected_cp;
    }

private:
    uint32_t rejected_cp;
};
00047
/// Exception carrying the octet at which UTF-8 decoding failed.
class invalid_utf8 : public std::exception {
public:
    /// Stores the offending octet so that handlers can inspect it.
    invalid_utf8 (uint8_t u) : bad_octet(u)
    {
    }

    /// Fixed, human-readable description of the error.
    virtual const char* what() const throw()
    {
        return "Invalid UTF-8";
    }

    /// The octet passed to the constructor.
    uint8_t utf8_octet() const
    {
        return bad_octet;
    }

private:
    uint8_t bad_octet;
};
00055
/// Exception carrying the 16-bit unit at which UTF-16 decoding failed.
class invalid_utf16 : public std::exception {
public:
    /// Stores the offending 16-bit unit so that handlers can inspect it.
    invalid_utf16 (uint16_t u) : bad_unit(u)
    {
    }

    /// Fixed, human-readable description of the error.
    virtual const char* what() const throw()
    {
        return "Invalid UTF-16";
    }

    /// The 16-bit unit passed to the constructor.
    uint16_t utf16_word() const
    {
        return bad_unit;
    }

private:
    uint16_t bad_unit;
};
00063
/// Exception thrown by the checked functions when the validator reports
/// NOT_ENOUGH_ROOM for the supplied range.
class not_enough_room : public std::exception {
public:
    /// Fixed, human-readable description of the error.
    virtual const char* what() const throw()
    {
        return "Not enough space";
    }
};
00068
00070
00071 template <typename octet_iterator, typename output_iterator>
00072 output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement)
00073 {
00074 while (start != end) {
00075 octet_iterator sequence_start = start;
00076 internal::utf_error err_code = internal::validate_next(start, end);
00077 switch (err_code) {
00078 case internal::OK :
00079 for (octet_iterator it = sequence_start; it != start; ++it)
00080 *out++ = *it;
00081 break;
00082 case internal::NOT_ENOUGH_ROOM:
00083 throw not_enough_room();
00084 case internal::INVALID_LEAD:
00085 append (replacement, out);
00086 ++start;
00087 break;
00088 case internal::INCOMPLETE_SEQUENCE:
00089 case internal::OVERLONG_SEQUENCE:
00090 case internal::INVALID_CODE_POINT:
00091 append (replacement, out);
00092 ++start;
00093
00094 while (internal::is_trail(*start) && start != end)
00095 ++start;
00096 break;
00097 }
00098 }
00099 return out;
00100 }
00101
00102 template <typename octet_iterator, typename output_iterator>
00103 inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out)
00104 {
00105 static const uint32_t replacement_marker = internal::mask16(0xfffd);
00106 return replace_invalid(start, end, out, replacement_marker);
00107 }
00108
00109 template <typename octet_iterator>
00110 octet_iterator append(uint32_t cp, octet_iterator result)
00111 {
00112 if (!internal::is_code_point_valid(cp))
00113 throw invalid_code_point(cp);
00114
00115 if (cp < 0x80)
00116 *(result++) = static_cast<uint8_t>(cp);
00117 else if (cp < 0x800) {
00118 *(result++) = static_cast<uint8_t>((cp >> 6) | 0xc0);
00119 *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
00120 }
00121 else if (cp < 0x10000) {
00122 *(result++) = static_cast<uint8_t>((cp >> 12) | 0xe0);
00123 *(result++) = static_cast<uint8_t>((cp >> 6) & 0x3f | 0x80);
00124 *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
00125 }
00126 else if (cp <= internal::CODE_POINT_MAX) {
00127 *(result++) = static_cast<uint8_t>((cp >> 18) | 0xf0);
00128 *(result++) = static_cast<uint8_t>((cp >> 12)& 0x3f | 0x80);
00129 *(result++) = static_cast<uint8_t>((cp >> 6) & 0x3f | 0x80);
00130 *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
00131 }
00132 else
00133 throw invalid_code_point(cp);
00134
00135 return result;
00136 }
00137
00138 template <typename octet_iterator>
00139 uint32_t next(octet_iterator& it, octet_iterator end)
00140 {
00141 uint32_t cp = 0;
00142 internal::utf_error err_code = internal::validate_next(it, end, &cp);
00143 switch (err_code) {
00144 case internal::OK :
00145 break;
00146 case internal::NOT_ENOUGH_ROOM :
00147 throw not_enough_room();
00148 case internal::INVALID_LEAD :
00149 case internal::INCOMPLETE_SEQUENCE :
00150 case internal::OVERLONG_SEQUENCE :
00151 throw invalid_utf8(*it);
00152 case internal::INVALID_CODE_POINT :
00153 throw invalid_code_point(cp);
00154 }
00155 return cp;
00156 }
00157
/// Returns the code point that next() decodes at `it`. The iterator is taken
/// by value, so the caller's iterator is left where it was.
///
/// Throws whatever next() throws.
template <typename octet_iterator>
uint32_t peek_next(octet_iterator it, octet_iterator end)
{
    const uint32_t code_point = next(it, end);
    return code_point;
}
00163
00164 template <typename octet_iterator>
00165 uint32_t prior(octet_iterator& it, octet_iterator start)
00166 {
00167 octet_iterator end = it;
00168 while (internal::is_trail(*(--it)))
00169 if (it < start)
00170 throw invalid_utf8(*it);
00171 octet_iterator temp = it;
00172 return next(temp, end);
00173 }
00174
00176 template <typename octet_iterator>
00177 uint32_t previous(octet_iterator& it, octet_iterator pass_start)
00178 {
00179 octet_iterator end = it;
00180 while (internal::is_trail(*(--it)))
00181 if (it == pass_start)
00182 throw invalid_utf8(*it);
00183 octet_iterator temp = it;
00184 return next(temp, end);
00185 }
00186
/// Calls next() on `it` `n` times; does nothing when `n` is not positive.
///
/// @param it   iterator to move forward, updated in place
/// @param n    number of next() calls to make
/// @param end  one past the last octet of the range
/// Throws whatever next() throws.
template <typename octet_iterator, typename distance_type>
void advance (octet_iterator& it, distance_type n, octet_iterator end)
{
    distance_type steps_taken = 0;
    while (steps_taken < n) {
        next(it, end);
        ++steps_taken;
    }
}
00193
/// Counts how many next() calls it takes to move `first` up to `last`.
///
/// The loop condition uses operator<, so the iterator type must support it.
/// Throws whatever next() throws.
template <typename octet_iterator>
typename std::iterator_traits<octet_iterator>::difference_type
distance (octet_iterator first, octet_iterator last)
{
    typedef typename std::iterator_traits<octet_iterator>::difference_type count_type;

    count_type code_points = 0;
    while (first < last) {
        next(first, last);
        ++code_points;
    }
    return code_points;
}
00203
00204 template <typename u16bit_iterator, typename octet_iterator>
00205 octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
00206 {
00207 while (start != end) {
00208 uint32_t cp = internal::mask16(*start++);
00209
00210 if (internal::is_surrogate(cp)) {
00211 if (start != end) {
00212 uint32_t trail_surrogate = internal::mask16(*start++);
00213 if (trail_surrogate >= internal::TRAIL_SURROGATE_MIN && trail_surrogate <= internal::TRAIL_SURROGATE_MAX)
00214 cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
00215 else
00216 throw invalid_utf16(static_cast<uint16_t>(trail_surrogate));
00217 }
00218 else
00219 throw invalid_utf16(static_cast<uint16_t>(*start));
00220
00221 }
00222 result = append(cp, result);
00223 }
00224 return result;
00225 }
00226
00227 template <typename u16bit_iterator, typename octet_iterator>
00228 u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
00229 {
00230 while (start != end) {
00231 uint32_t cp = next(start, end);
00232 if (cp > 0xffff) {
00233 *result++ = static_cast<uint16_t>((cp >> 10) + internal::LEAD_OFFSET);
00234 *result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
00235 }
00236 else
00237 *result++ = static_cast<uint16_t>(cp);
00238 }
00239 return result;
00240 }
00241
/// Encodes every 32-bit value in [start, end) as UTF-8 through `result`
/// by calling append() on each one in order.
///
/// @return `result` advanced past the octets that were written
/// Throws whatever append() throws.
template <typename octet_iterator, typename u32bit_iterator>
octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result)
{
    for (; start != end; ++start)
        result = append(*start, result);

    return result;
}
00250
/// Decodes the UTF-8 octets in [start, end) with next() and writes each
/// resulting code point through `result`.
///
/// The loop condition uses operator<, so the octet iterator type must support it.
/// @return `result` advanced past the code points that were written
/// Throws whatever next() throws.
template <typename octet_iterator, typename u32bit_iterator>
u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)
{
    while (start < end) {
        const uint32_t code_point = next(start, end);
        *result++ = code_point;
    }

    return result;
}
00259
00260
/// Bidirectional iterator adaptor that walks an octet range one code point
/// at a time, decoding with next() and prior().
///
/// Each instance remembers the range it was created for; comparing two
/// instances that were created for different ranges throws std::logic_error.
template <typename octet_iterator>
class iterator : public std::iterator <std::bidirectional_iterator_tag, uint32_t> {
    octet_iterator it;          // current position
    octet_iterator range_start; // lower bound handed to prior()
    octet_iterator range_end;   // upper bound handed to next()
    public:
    /// Value-initializes all three members so that default-constructed
    /// iterators can be copied and compared without reading indeterminate
    /// values.
    iterator () : it(), range_start(), range_end() {}

    /// @throws std::out_of_range  if `octet_it` lies outside
    ///                            [range_start, range_end]
    explicit iterator (const octet_iterator& octet_it,
                       const octet_iterator& range_start,
                       const octet_iterator& range_end) :
        it(octet_it), range_start(range_start), range_end(range_end)
    {
        if (it < range_start || it > range_end)
            throw std::out_of_range("Invalid utf-8 iterator position");
    }

    /// The underlying octet iterator at the current position.
    octet_iterator base () const { return it; }

    /// Decodes the code point at the current position without moving.
    uint32_t operator * () const
    {
        octet_iterator temp = it;
        return next(temp, range_end);
    }

    /// @throws std::logic_error  if the two iterators were created for
    ///                           different ranges
    bool operator == (const iterator& rhs) const
    {
        if (range_start != rhs.range_start || range_end != rhs.range_end)
            throw std::logic_error("Comparing utf-8 iterators defined with different ranges");
        return (it == rhs.it);
    }
    bool operator != (const iterator& rhs) const
    {
        return !(operator == (rhs));
    }

    /// Moves forward by one code point.
    iterator& operator ++ ()
    {
        next(it, range_end);
        return *this;
    }
    iterator operator ++ (int)
    {
        iterator temp = *this;
        next(it, range_end);
        return temp;
    }

    /// Moves backward by one code point.
    iterator& operator -- ()
    {
        prior(it, range_start);
        return *this;
    }
    iterator operator -- (int)
    {
        iterator temp = *this;
        prior(it, range_start);
        return temp;
    }
};
00316
00317
/// Bidirectional iterator adaptor that walks an octet range one code point
/// at a time and yields each code point as a wchar_t.
///
/// Each instance remembers the range it was created for; comparing two
/// instances that were created for different ranges throws std::logic_error.
template <typename octet_iterator>
class wchar_iterator :
    public std::iterator<std::bidirectional_iterator_tag, wchar_t>
{
    octet_iterator it;          // current position
    octet_iterator range_start; // lower bound handed to prior()
    octet_iterator range_end;   // upper bound handed to next()
    public:
    /// Value-initializes all three members so that default-constructed
    /// iterators can be copied and compared without reading indeterminate
    /// values.
    wchar_iterator () : it(), range_start(), range_end() {}

    /// @throws std::out_of_range  if `octet_it` lies outside
    ///                            [range_start, range_end]
    wchar_iterator (const octet_iterator& octet_it,
                    const octet_iterator& range_start,
                    const octet_iterator& range_end) :
        it(octet_it), range_start(range_start), range_end(range_end)
    {
        if (it < range_start || it > range_end)
            throw std::out_of_range("Invalid utf-8 iterator position");
    }

    /// The underlying octet iterator at the current position.
    octet_iterator base () const { return it; }

    /// Decodes the code point at the current position without moving.
    /// Asserts that the value does not exceed WCHAR_MAX.
    wchar_t operator * () const
    {
        octet_iterator temp = it;
        uint32_t retval = next(temp, range_end);
        assert(retval <= WCHAR_MAX);
        return static_cast<wchar_t>(retval);
    }

    /// @throws std::logic_error  if the two iterators were created for
    ///                           different ranges
    bool operator == (const wchar_iterator& rhs) const
    {
        if (range_start != rhs.range_start || range_end != rhs.range_end)
            throw std::logic_error("Comparing utf-8 iterators defined with different ranges");
        return (it == rhs.it);
    }
    bool operator != (const wchar_iterator& rhs) const
    {
        return !(operator == (rhs));
    }

    /// Moves forward by one code point.
    wchar_iterator& operator ++ ()
    {
        next(it, range_end);
        return *this;
    }
    wchar_iterator operator ++ (int)
    {
        wchar_iterator temp = *this;
        next(it, range_end);
        return temp;
    }

    /// Moves backward by one code point.
    wchar_iterator& operator -- ()
    {
        prior(it, range_start);
        return *this;
    }
    wchar_iterator operator -- (int)
    {
        wchar_iterator temp = *this;
        prior(it, range_start);
        return temp;
    }
};
00377
00378 }
00379
00380 #endif //header guard
00381
00382