GG
|
00001 // Copyright 2006 Nemanja Trifunovic 00002 00003 /* 00004 Permission is hereby granted, free of charge, to any person or organization 00005 obtaining a copy of the software and accompanying documentation covered by 00006 this license (the "Software") to use, reproduce, display, distribute, 00007 execute, and transmit the Software, and to prepare derivative works of the 00008 Software, and to permit third-parties to whom the Software is furnished to 00009 do so, all subject to the following: 00010 00011 The copyright notices in the Software and this entire statement, including 00012 the above license grant, this restriction and the following disclaimer, 00013 must be included in all copies of the Software, in whole or in part, and 00014 all derivative works of the Software, unless such copies or derivative 00015 works are solely in the form of machine-executable object code generated by 00016 a source language processor. 00017 00018 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 00019 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 00020 FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT 00021 SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE 00022 FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, 00023 ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 00024 DEALINGS IN THE SOFTWARE. 00025 */ 00026 00031 #ifndef UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 00032 #define UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 00033 00034 #include "core.h" 00035 00036 namespace utf8 00037 { 00038 namespace unchecked 00039 { 00040 template <typename octet_iterator> 00041 octet_iterator append(uint32_t cp, octet_iterator result) 00042 { 00043 if (cp < 0x80) // one octet 00044 *(result++) = static_cast<uint8_t>(cp); 00045 else if (cp < 0x800) { // two octets 00046 *(result++) = static_cast<uint8_t>((cp >> 6) | 0xc0); 00047 *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80); 00048 } 00049 else if (cp < 0x10000) { // three octets 00050 *(result++) = static_cast<uint8_t>((cp >> 12) | 0xe0); 00051 *(result++) = static_cast<uint8_t>((cp >> 6) & 0x3f | 0x80); 00052 *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80); 00053 } 00054 else { // four octets 00055 *(result++) = static_cast<uint8_t>((cp >> 18) | 0xf0); 00056 *(result++) = static_cast<uint8_t>((cp >> 12)& 0x3f | 0x80); 00057 *(result++) = static_cast<uint8_t>((cp >> 6) & 0x3f | 0x80); 00058 *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80); 00059 } 00060 return result; 00061 } 00062 00063 template <typename octet_iterator> 00064 uint32_t next(octet_iterator& it) 00065 { 00066 uint32_t cp = internal::mask8(*it); 00067 typename std::iterator_traits<octet_iterator>::difference_type length = utf8::internal::sequence_length(it); 00068 switch (length) { 00069 case 1: 00070 break; 00071 case 2: 00072 it++; 00073 cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f); 00074 break; 00075 case 3: 00076 ++it; 00077 cp = ((cp << 12) & 0xffff) + ((internal::mask8(*it) << 6) & 0xfff); 00078 ++it; 00079 cp += (*it) & 0x3f; 00080 break; 00081 case 4: 00082 ++it; 00083 cp = ((cp << 18) & 0x1fffff) + ((internal::mask8(*it) << 12) & 0x3ffff); 00084 ++it; 00085 cp += (internal::mask8(*it) << 6) & 0xfff; 00086 ++it; 00087 cp += (*it) & 0x3f; 00088 break; 00089 } 00090 ++it; 00091 return cp; 00092 } 00093 00094 template <typename octet_iterator> 00095 uint32_t peek_next(octet_iterator it) 00096 { 00097 return next(it); 00098 } 00099 00100 template <typename octet_iterator> 00101 uint32_t prior(octet_iterator& it) 00102 { 00103 while (internal::is_trail(*(--it))) ; 00104 octet_iterator temp = it; 00105 return next(temp); 00106 } 00107 00108 // Deprecated in versions that include prior, but only for the sake of consistency (see utf8::previous) 00109 template <typename octet_iterator> 00110 inline uint32_t previous(octet_iterator& it) 00111 { 00112 return prior(it); 00113 } 00114 00115 template <typename octet_iterator, typename distance_type> 00116 void advance (octet_iterator& it, distance_type n) 00117 { 00118 for (distance_type i = 0; i < n; ++i) 00119 next(it); 00120 } 00121 00122 template <typename octet_iterator> 00123 typename std::iterator_traits<octet_iterator>::difference_type 00124 distance (octet_iterator first, octet_iterator last) 00125 { 00126 typename std::iterator_traits<octet_iterator>::difference_type dist; 00127 for (dist = 0; first < last; ++dist) 00128 next(first); 00129 return dist; 00130 } 00131 00132 template <typename u16bit_iterator, typename octet_iterator> 00133 octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) 00134 { 00135 while (start != end) { 00136 uint32_t cp = internal::mask16(*start++); 00137 // Take care of surrogate pairs first 00138 if (internal::is_surrogate(cp)) { 00139 uint32_t trail_surrogate = internal::mask16(*start++); 00140 cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; 00141 } 00142 result = append(cp, result); 00143 } 00144 return result; 00145 } 00146 00147 template <typename u16bit_iterator, typename octet_iterator> 00148 u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result) 00149 { 00150 while (start != end) { 00151 uint32_t cp = next(start); 00152 if (cp > 0xffff) { //make a surrogate pair 00153 *result++ = static_cast<uint16_t>((cp >> 10) + internal::LEAD_OFFSET); 00154 *result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); 00155 } 00156 else 00157 *result++ = static_cast<uint16_t>(cp); 00158 } 00159 return result; 00160 } 00161 00162 template <typename octet_iterator, typename u32bit_iterator> 00163 octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result) 00164 { 00165 while (start != end) 00166 result = append(*(start++), result); 00167 00168 return result; 00169 } 00170 00171 template <typename octet_iterator, typename u32bit_iterator> 00172 u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result) 00173 { 00174 while (start < end) 00175 (*result++) = next(start); 00176 00177 return result; 00178 } 00179 00180 // The iterator class 00181 template <typename octet_iterator> 00182 class iterator : public std::iterator <std::bidirectional_iterator_tag, uint32_t> { 00183 octet_iterator it; 00184 public: 00185 iterator () {}; 00186 explicit iterator (const octet_iterator& octet_it): it(octet_it) {} 00187 // the default "big three" are OK 00188 octet_iterator base () const { return it; } 00189 uint32_t operator * () const 00190 { 00191 octet_iterator temp = it; 00192 return next(temp); 00193 } 00194 bool operator == (const iterator& rhs) const 00195 { 00196 return (it == rhs.it); 00197 } 00198 bool operator != (const iterator& rhs) const 00199 { 00200 return !(operator == (rhs)); 00201 } 00202 iterator& operator ++ () 00203 { 00204 std::advance(it, internal::sequence_length(it)); 00205 return *this; 00206 } 00207 iterator operator ++ (int) 00208 { 00209 iterator temp = *this; 00210 std::advance(it, internal::sequence_length(it)); 00211 return temp; 00212 } 00213 iterator& operator -- () 00214 { 00215 prior(it); 00216 return *this; 00217 } 00218 iterator operator -- (int) 00219 { 00220 iterator temp = *this; 00221 prior(it); 00222 return temp; 00223 } 00224 }; // class iterator 00225 00226 } // namespace utf8::unchecked 00227 } // namespace utf8 00228 00229 00230 #endif // header guard 00231