GG

checked.h

Go to the documentation of this file.
00001 // Copyright 2006 Nemanja Trifunovic
00002 
00003 /*
00004 Permission is hereby granted, free of charge, to any person or organization
00005 obtaining a copy of the software and accompanying documentation covered by
00006 this license (the "Software") to use, reproduce, display, distribute,
00007 execute, and transmit the Software, and to prepare derivative works of the
00008 Software, and to permit third-parties to whom the Software is furnished to
00009 do so, all subject to the following:
00010 
00011 The copyright notices in the Software and this entire statement, including
00012 the above license grant, this restriction and the following disclaimer,
00013 must be included in all copies of the Software, in whole or in part, and
00014 all derivative works of the Software, unless such copies or derivative
00015 works are solely in the form of machine-executable object code generated by
00016 a source language processor.
00017 
00018 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
00019 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
00020 FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
00021 SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
00022 FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
00023 ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
00024 DEALINGS IN THE SOFTWARE.
00025 */
00026 
00031 #ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
00032 #define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
00033 
00034 #include "core.h"
00035 #include <stdexcept>
00036 
00037 namespace utf8
00038 {
00039     // Exceptions that may be thrown from the library functions.
// Exception carrying a value that the library rejected as a code point.
class invalid_code_point : public std::exception {
    uint32_t rejected_value;   // kept so handlers can report what was refused
public:
    // Stores the offending value `cp`.
    invalid_code_point(uint32_t cp)
        : rejected_value(cp)
    {
    }

    // Fixed description of the error category.
    virtual const char* what() const throw()
    {
        return "Invalid code point";
    }

    // Returns the value given to the constructor.
    uint32_t code_point() const
    {
        return rejected_value;
    }
};
00047 
// Exception carrying a single octet taken from a rejected UTF-8 sequence.
class invalid_utf8 : public std::exception {
    uint8_t rejected_octet;   // the octet handed to the constructor
public:
    // Stores the offending octet `u`.
    invalid_utf8 (uint8_t u)
        : rejected_octet(u)
    {
    }

    // Fixed description of the error category.
    virtual const char* what() const throw()
    {
        return "Invalid UTF-8";
    }

    // Returns the octet given to the constructor.
    uint8_t utf8_octet() const
    {
        return rejected_octet;
    }
};
00055 
// Exception carrying a single 16-bit unit taken from rejected UTF-16 input.
class invalid_utf16 : public std::exception {
    uint16_t rejected_word;   // the unit handed to the constructor
public:
    // Stores the offending 16-bit unit `u`.
    invalid_utf16 (uint16_t u)
        : rejected_word(u)
    {
    }

    // Fixed description of the error category.
    virtual const char* what() const throw()
    {
        return "Invalid UTF-16";
    }

    // Returns the unit given to the constructor.
    uint16_t utf16_word() const
    {
        return rejected_word;
    }
};
00063 
// Exception thrown by the checked functions when the validator reports
// NOT_ENOUGH_ROOM for the range it was given. Carries no payload.
class not_enough_room : public std::exception {
public:
    // Fixed description of the error category.
    virtual const char* what() const throw()
    {
        return "Not enough space";
    }
};
00068 
00070  
00071     template <typename octet_iterator, typename output_iterator>
00072     output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement)
00073     {
00074         while (start != end) {
00075             octet_iterator sequence_start = start;
00076             internal::utf_error err_code = internal::validate_next(start, end);
00077             switch (err_code) {
00078                 case internal::OK :
00079                     for (octet_iterator it = sequence_start; it != start; ++it)
00080                         *out++ = *it;
00081                     break;
00082                 case internal::NOT_ENOUGH_ROOM:
00083                     throw not_enough_room();
00084                 case internal::INVALID_LEAD:
00085                     append (replacement, out);
00086                     ++start;
00087                     break;
00088                 case internal::INCOMPLETE_SEQUENCE:
00089                 case internal::OVERLONG_SEQUENCE:
00090                 case internal::INVALID_CODE_POINT:
00091                     append (replacement, out);
00092                     ++start;
00093                     // just one replacement mark for the sequence
00094                     while (internal::is_trail(*start) && start != end)
00095                         ++start;
00096                     break;
00097             }
00098         }   
00099         return out;
00100     }
00101 
00102     template <typename octet_iterator, typename output_iterator>
00103     inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out)
00104     {
00105         static const uint32_t replacement_marker = internal::mask16(0xfffd);
00106         return replace_invalid(start, end, out, replacement_marker);
00107     }
00108 
00109     template <typename octet_iterator>
00110     octet_iterator append(uint32_t cp, octet_iterator result)
00111     {
00112         if (!internal::is_code_point_valid(cp)) 
00113             throw invalid_code_point(cp);
00114 
00115         if (cp < 0x80)                        // one octet
00116             *(result++) = static_cast<uint8_t>(cp);  
00117         else if (cp < 0x800) {                // two octets
00118             *(result++) = static_cast<uint8_t>((cp >> 6)            | 0xc0);
00119             *(result++) = static_cast<uint8_t>((cp & 0x3f)          | 0x80);
00120         }
00121         else if (cp < 0x10000) {              // three octets
00122             *(result++) = static_cast<uint8_t>((cp >> 12)           | 0xe0);
00123             *(result++) = static_cast<uint8_t>((cp >> 6) & 0x3f     | 0x80);
00124             *(result++) = static_cast<uint8_t>((cp & 0x3f)          | 0x80);
00125         }
00126         else if (cp <= internal::CODE_POINT_MAX) {      // four octets
00127             *(result++) = static_cast<uint8_t>((cp >> 18)           | 0xf0);
00128             *(result++) = static_cast<uint8_t>((cp >> 12)& 0x3f     | 0x80);
00129             *(result++) = static_cast<uint8_t>((cp >> 6) & 0x3f     | 0x80);
00130             *(result++) = static_cast<uint8_t>((cp & 0x3f)          | 0x80);
00131         }
00132         else
00133             throw invalid_code_point(cp);
00134 
00135         return result;
00136     }
00137 
00138     template <typename octet_iterator>
00139     uint32_t next(octet_iterator& it, octet_iterator end)
00140     {
00141         uint32_t cp = 0;
00142         internal::utf_error err_code = internal::validate_next(it, end, &cp);
00143         switch (err_code) {
00144             case internal::OK :
00145                 break;
00146             case internal::NOT_ENOUGH_ROOM :
00147                 throw not_enough_room();
00148             case internal::INVALID_LEAD :
00149             case internal::INCOMPLETE_SEQUENCE :
00150             case internal::OVERLONG_SEQUENCE :
00151                 throw invalid_utf8(*it);
00152             case internal::INVALID_CODE_POINT :
00153                 throw invalid_code_point(cp);
00154         }
00155         return cp;        
00156     }
00157 
// Returns the code point that next() produces for the sequence at `it`.
// `it` is taken by value, so the caller's iterator is left where it was.
//
//   it  - position of the sequence to decode
//   end - end of the octet range
//
// Throws whatever next() throws.
template <typename octet_iterator>
uint32_t peek_next(octet_iterator it, octet_iterator end)
{
    const uint32_t decoded = next(it, end);
    return decoded;
}
00163 
00164     template <typename octet_iterator>
00165     uint32_t prior(octet_iterator& it, octet_iterator start)
00166     {
00167         octet_iterator end = it;
00168         while (internal::is_trail(*(--it))) 
00169             if (it < start)
00170                 throw invalid_utf8(*it); // error - no lead byte in the sequence
00171         octet_iterator temp = it;
00172         return next(temp, end);
00173     }
00174 
00176     template <typename octet_iterator>
00177     uint32_t previous(octet_iterator& it, octet_iterator pass_start)
00178     {
00179         octet_iterator end = it;
00180         while (internal::is_trail(*(--it))) 
00181             if (it == pass_start)
00182                 throw invalid_utf8(*it); // error - no lead byte in the sequence
00183         octet_iterator temp = it;
00184         return next(temp, end);
00185     }
00186 
// Calls next() on `it` once for every count from 0 up to, but not including,
// `n`.
//
//   it  - in/out iterator moved forward by next()
//   n   - number of next() calls to make; no call is made unless 0 < n
//   end - end of the octet range, forwarded to next()
//
// Throws whatever next() throws.
template <typename octet_iterator, typename distance_type>
void advance (octet_iterator& it, distance_type n, octet_iterator end)
{
    distance_type steps_done = 0;
    while (steps_done < n) {
        next(it, end);
        ++steps_done;
    }
}
00193 
// Counts how many next() calls are needed to move `first` up to `last`.
//
//   first - start of the octet range
//   last  - end of the octet range
//
// Returns the number of next() calls made; 0 when `first < last` is false on
// entry. Throws whatever next() throws.
template <typename octet_iterator>
typename std::iterator_traits<octet_iterator>::difference_type
distance (octet_iterator first, octet_iterator last)
{
    typedef typename std::iterator_traits<octet_iterator>::difference_type count_type;

    count_type decoded = 0;
    while (first < last) {
        next(first, last);
        ++decoded;
    }
    return decoded;
}
00203 
00204     template <typename u16bit_iterator, typename octet_iterator>
00205     octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
00206     {       
00207         while (start != end) {
00208             uint32_t cp = internal::mask16(*start++);
00209             // Take care of surrogate pairs first
00210             if (internal::is_surrogate(cp)) {
00211                 if (start != end) {
00212                     uint32_t trail_surrogate = internal::mask16(*start++);
00213                     if (trail_surrogate >= internal::TRAIL_SURROGATE_MIN && trail_surrogate <= internal::TRAIL_SURROGATE_MAX)
00214                         cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;                    
00215                     else 
00216                         throw invalid_utf16(static_cast<uint16_t>(trail_surrogate));
00217                 }
00218                 else 
00219                     throw invalid_utf16(static_cast<uint16_t>(*start));
00220             
00221             }
00222             result = append(cp, result);
00223         }
00224         return result;        
00225     }
00226 
00227     template <typename u16bit_iterator, typename octet_iterator>
00228     u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
00229     {
00230         while (start != end) {
00231             uint32_t cp = next(start, end);
00232             if (cp > 0xffff) { //make a surrogate pair
00233                 *result++ = static_cast<uint16_t>((cp >> 10)   + internal::LEAD_OFFSET);
00234                 *result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
00235             }
00236             else
00237                 *result++ = static_cast<uint16_t>(cp);
00238         }
00239         return result;
00240     }
00241 
// Encodes every element of [start, end) as UTF-8 by passing it to append().
//
//   start, end - input range of code points
//   result     - output iterator receiving the octets
//
// Returns the output iterator advanced past the last octet written.
// Throws whatever append() throws.
template <typename octet_iterator, typename u32bit_iterator>
octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result)
{
    for (; start != end; ++start)
        result = append(*start, result);

    return result;
}
00250 
// Decodes the UTF-8 octets in [start, end) with next() and writes each code
// point through `result`.
//
//   start, end - input octet range; decoding continues while start < end
//   result     - output iterator receiving the code points
//
// Returns the output iterator advanced past the last code point written.
// Throws whatever next() throws.
template <typename octet_iterator, typename u32bit_iterator>
u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)
{
    while (start < end) {
        *result = next(start, end);
        ++result;
    }

    return result;
}
00259 
// Bidirectional iterator adaptor over a range of UTF-8 octets. Dereferencing
// yields the code point that next() decodes at the current position;
// increment and decrement move with next() and prior().
template <typename octet_iterator>
class iterator : public std::iterator <std::bidirectional_iterator_tag, uint32_t> {
    octet_iterator it;            // current position
    octet_iterator range_start;   // lower bound handed to prior()
    octet_iterator range_end;     // upper bound handed to next()

public:
    // Leaves the three underlying iterators default-initialized.
    iterator () {}

    // Binds the adaptor to position `octet_it` inside [range_start, range_end].
    // Throws std::out_of_range when the position lies outside those bounds.
    explicit iterator (const octet_iterator& octet_it,
                       const octet_iterator& range_start,
                       const octet_iterator& range_end) :
        it(octet_it), range_start(range_start), range_end(range_end)
    {
        const bool before_range = it < range_start;
        if (before_range || it > range_end)
            throw std::out_of_range("Invalid utf-8 iterator position");
    }

    // the default "big three" are OK

    // Returns the underlying octet iterator at the current position.
    octet_iterator base () const
    {
        return it;
    }

    // Decodes the code point at the current position without moving.
    uint32_t operator * () const
    {
        octet_iterator cursor = it;
        return next(cursor, range_end);
    }

    // Compares positions. Throws std::logic_error when the two iterators
    // were created over different ranges.
    bool operator == (const iterator& rhs) const
    {
        if (range_start != rhs.range_start)
            throw std::logic_error("Comparing utf-8 iterators defined with different ranges");
        if (range_end != rhs.range_end)
            throw std::logic_error("Comparing utf-8 iterators defined with different ranges");
        return it == rhs.it;
    }

    // Negation of operator ==, including its exception.
    bool operator != (const iterator& rhs) const
    {
        return !(*this == rhs);
    }

    // Pre-increment: moves forward with next().
    iterator& operator ++ ()
    {
        next(it, range_end);
        return *this;
    }

    // Post-increment: returns the position held before the move.
    iterator operator ++ (int)
    {
        iterator before = *this;
        ++(*this);
        return before;
    }

    // Pre-decrement: moves backward with prior().
    iterator& operator -- ()
    {
        prior(it, range_start);
        return *this;
    }

    // Post-decrement: returns the position held before the move.
    iterator operator -- (int)
    {
        iterator before = *this;
        --(*this);
        return before;
    }
}; // class iterator
00316 
// Bidirectional iterator adaptor over a range of UTF-8 octets whose value
// type is wchar_t. Dereferencing decodes with next() and converts the result
// to wchar_t; increment and decrement move with next() and prior().
//
// NOTE(review): assert and WCHAR_MAX are used below but this header only
// includes "core.h" and <stdexcept> - presumably they arrive through core.h;
// confirm.
template <typename octet_iterator>
class wchar_iterator :
    public std::iterator<std::bidirectional_iterator_tag, wchar_t>
{
    octet_iterator it;            // current position
    octet_iterator range_start;   // lower bound handed to prior()
    octet_iterator range_end;     // upper bound handed to next()

public:
    // Leaves the three underlying iterators default-initialized.
    wchar_iterator () {}

    // Binds the adaptor to position `octet_it` inside [range_start, range_end].
    // Throws std::out_of_range when the position lies outside those bounds.
    wchar_iterator (const octet_iterator& octet_it,
                    const octet_iterator& range_start,
                    const octet_iterator& range_end) :
        it(octet_it), range_start(range_start), range_end(range_end)
    {
        const bool before_range = it < range_start;
        if (before_range || it > range_end)
            throw std::out_of_range("Invalid utf-8 iterator position");
    }

    // the default "big three" are OK

    // Returns the underlying octet iterator at the current position.
    octet_iterator base () const
    {
        return it;
    }

    // Decodes the code point at the current position without moving and
    // returns it as wchar_t. The assert checks that the value does not
    // exceed WCHAR_MAX.
    wchar_t operator * () const
    {
        octet_iterator cursor = it;
        uint32_t decoded = next(cursor, range_end);
        assert(decoded <= WCHAR_MAX);
        return static_cast<wchar_t>(decoded);
    }

    // Compares positions. Throws std::logic_error when the two iterators
    // were created over different ranges.
    bool operator == (const wchar_iterator& rhs) const
    {
        if (range_start != rhs.range_start)
            throw std::logic_error("Comparing utf-8 iterators defined with different ranges");
        if (range_end != rhs.range_end)
            throw std::logic_error("Comparing utf-8 iterators defined with different ranges");
        return it == rhs.it;
    }

    // Negation of operator ==, including its exception.
    bool operator != (const wchar_iterator& rhs) const
    {
        return !(*this == rhs);
    }

    // Pre-increment: moves forward with next().
    wchar_iterator& operator ++ ()
    {
        next(it, range_end);
        return *this;
    }

    // Post-increment: returns the position held before the move.
    wchar_iterator operator ++ (int)
    {
        wchar_iterator before = *this;
        ++(*this);
        return before;
    }

    // Pre-decrement: moves backward with prior().
    wchar_iterator& operator -- ()
    {
        prior(it, range_start);
        return *this;
    }

    // Post-decrement: returns the position held before the move.
    wchar_iterator operator -- (int)
    {
        wchar_iterator before = *this;
        --(*this);
        return before;
    }
};
00377 
00378 } // namespace utf8
00379 
00380 #endif //header guard
00381 
00382