GG

unchecked.h

Go to the documentation of this file.
00001 // Copyright 2006 Nemanja Trifunovic
00002 
00003 /*
00004 Permission is hereby granted, free of charge, to any person or organization
00005 obtaining a copy of the software and accompanying documentation covered by
00006 this license (the "Software") to use, reproduce, display, distribute,
00007 execute, and transmit the Software, and to prepare derivative works of the
00008 Software, and to permit third-parties to whom the Software is furnished to
00009 do so, all subject to the following:
00010 
00011 The copyright notices in the Software and this entire statement, including
00012 the above license grant, this restriction and the following disclaimer,
00013 must be included in all copies of the Software, in whole or in part, and
00014 all derivative works of the Software, unless such copies or derivative
00015 works are solely in the form of machine-executable object code generated by
00016 a source language processor.
00017 
00018 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
00019 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
00020 FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
00021 SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
00022 FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
00023 ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
00024 DEALINGS IN THE SOFTWARE.
00025 */
00026 
00031 #ifndef UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
00032 #define UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
00033 
#include <cstddef>
#include <iterator>

#include "core.h"
00035 
00036 namespace utf8
00037 {
00038     namespace unchecked 
00039     {
00040         template <typename octet_iterator>
00041         octet_iterator append(uint32_t cp, octet_iterator result)
00042         {
00043             if (cp < 0x80)                        // one octet
00044                 *(result++) = static_cast<uint8_t>(cp);  
00045             else if (cp < 0x800) {                // two octets
00046                 *(result++) = static_cast<uint8_t>((cp >> 6)          | 0xc0);
00047                 *(result++) = static_cast<uint8_t>((cp & 0x3f)        | 0x80);
00048             }
00049             else if (cp < 0x10000) {              // three octets
00050                 *(result++) = static_cast<uint8_t>((cp >> 12)         | 0xe0);
00051                 *(result++) = static_cast<uint8_t>((cp >> 6) & 0x3f   | 0x80);
00052                 *(result++) = static_cast<uint8_t>((cp & 0x3f)        | 0x80);
00053             }
00054             else {                                // four octets
00055                 *(result++) = static_cast<uint8_t>((cp >> 18)         | 0xf0);
00056                 *(result++) = static_cast<uint8_t>((cp >> 12)& 0x3f   | 0x80);
00057                 *(result++) = static_cast<uint8_t>((cp >> 6) & 0x3f   | 0x80);
00058                 *(result++) = static_cast<uint8_t>((cp & 0x3f)        | 0x80);
00059             }
00060             return result;
00061         }
00062 
00063         template <typename octet_iterator>
00064         uint32_t next(octet_iterator& it)
00065         {
00066             uint32_t cp = internal::mask8(*it);
00067             typename std::iterator_traits<octet_iterator>::difference_type length = utf8::internal::sequence_length(it);
00068             switch (length) {
00069                 case 1:
00070                     break;
00071                 case 2:
00072                     it++;
00073                     cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f);
00074                     break;
00075                 case 3:
00076                     ++it; 
00077                     cp = ((cp << 12) & 0xffff) + ((internal::mask8(*it) << 6) & 0xfff);
00078                     ++it;
00079                     cp += (*it) & 0x3f;
00080                     break;
00081                 case 4:
00082                     ++it;
00083                     cp = ((cp << 18) & 0x1fffff) + ((internal::mask8(*it) << 12) & 0x3ffff);                
00084                     ++it;
00085                     cp += (internal::mask8(*it) << 6) & 0xfff;
00086                     ++it;
00087                     cp += (*it) & 0x3f; 
00088                     break;
00089             }
00090             ++it;
00091             return cp;        
00092         }
00093 
00094         template <typename octet_iterator>
00095         uint32_t peek_next(octet_iterator it)
00096         {
00097             return next(it);    
00098         }
00099 
00100         template <typename octet_iterator>
00101         uint32_t prior(octet_iterator& it)
00102         {
00103             while (internal::is_trail(*(--it))) ;
00104             octet_iterator temp = it;
00105             return next(temp);
00106         }
00107 
00108         // Deprecated in versions that include prior, but only for the sake of consistency (see utf8::previous)
00109         template <typename octet_iterator>
00110         inline uint32_t previous(octet_iterator& it)
00111         {
00112             return prior(it);
00113         }
00114 
00115         template <typename octet_iterator, typename distance_type>
00116         void advance (octet_iterator& it, distance_type n)
00117         {
00118             for (distance_type i = 0; i < n; ++i)
00119                 next(it);
00120         }
00121 
00122         template <typename octet_iterator>
00123         typename std::iterator_traits<octet_iterator>::difference_type
00124         distance (octet_iterator first, octet_iterator last)
00125         {
00126             typename std::iterator_traits<octet_iterator>::difference_type dist;
00127             for (dist = 0; first < last; ++dist) 
00128                 next(first);
00129             return dist;
00130         }
00131 
00132         template <typename u16bit_iterator, typename octet_iterator>
00133         octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
00134         {       
00135             while (start != end) {
00136                 uint32_t cp = internal::mask16(*start++);
00137             // Take care of surrogate pairs first
00138                 if (internal::is_surrogate(cp)) {
00139                     uint32_t trail_surrogate = internal::mask16(*start++);
00140                     cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
00141                 }
00142                 result = append(cp, result);
00143             }
00144             return result;         
00145         }
00146 
00147         template <typename u16bit_iterator, typename octet_iterator>
00148         u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
00149         {
00150             while (start != end) {
00151                 uint32_t cp = next(start);
00152                 if (cp > 0xffff) { //make a surrogate pair
00153                     *result++ = static_cast<uint16_t>((cp >> 10)   + internal::LEAD_OFFSET);
00154                     *result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
00155                 }
00156                 else
00157                     *result++ = static_cast<uint16_t>(cp);
00158             }
00159             return result;
00160         }
00161 
00162         template <typename octet_iterator, typename u32bit_iterator>
00163         octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result)
00164         {
00165             while (start != end)
00166                 result = append(*(start++), result);
00167 
00168             return result;
00169         }
00170 
00171         template <typename octet_iterator, typename u32bit_iterator>
00172         u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)
00173         {
00174             while (start < end)
00175                 (*result++) = next(start);
00176 
00177             return result;
00178         }
00179 
00180         // The iterator class
00181         template <typename octet_iterator>
00182           class iterator : public std::iterator <std::bidirectional_iterator_tag, uint32_t> { 
00183             octet_iterator it;
00184             public:
00185             iterator () {};
00186             explicit iterator (const octet_iterator& octet_it): it(octet_it) {}
00187             // the default "big three" are OK
00188             octet_iterator base () const { return it; }
00189             uint32_t operator * () const
00190             {
00191                 octet_iterator temp = it;
00192                 return next(temp);
00193             }
00194             bool operator == (const iterator& rhs) const 
00195             { 
00196                 return (it == rhs.it);
00197             }
00198             bool operator != (const iterator& rhs) const
00199             {
00200                 return !(operator == (rhs));
00201             }
00202             iterator& operator ++ () 
00203             {
00204                 std::advance(it, internal::sequence_length(it));
00205                 return *this;
00206             }
00207             iterator operator ++ (int)
00208             {
00209                 iterator temp = *this;
00210                 std::advance(it, internal::sequence_length(it));
00211                 return temp;
00212             }  
00213             iterator& operator -- ()
00214             {
00215                 prior(it);
00216                 return *this;
00217             }
00218             iterator operator -- (int)
00219             {
00220                 iterator temp = *this;
00221                 prior(it);
00222                 return temp;
00223             }
00224           }; // class iterator
00225 
00226     } // namespace utf8::unchecked
00227 } // namespace utf8 
00228 
00229 
00230 #endif // header guard
00231