core.h
Go to the documentation of this file.00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00031 #ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
00032 #define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
00033
00034 #include <boost/cstdint.hpp>
00035
00036 #include <iterator>
00037
00038 namespace utf8
00039 {
00040
00041
00042
00043 typedef boost::uint8_t uint8_t;
00044 typedef boost::uint16_t uint16_t;
00045 typedef boost::uint32_t uint32_t;
00046
00047
00048 namespace internal
00049 {
00050
00051
00052
// Boundaries of the UTF-16 surrogate ranges. Code points inside
// [LEAD_SURROGATE_MIN, TRAIL_SURROGATE_MAX] are rejected by is_surrogate().
const uint16_t LEAD_SURROGATE_MIN  = 0xd800u;
const uint16_t LEAD_SURROGATE_MAX  = 0xdbffu;
const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u;
const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu;

// Pre-computed offsets derived from the surrogate boundaries
// (presumably consumed by the UTF-16 conversion code outside this file).
// LEAD_OFFSET evaluates to 0xd7c0; SURROGATE_OFFSET wraps modulo 2^32
// to 0xfca02400.
const uint16_t LEAD_OFFSET      = LEAD_SURROGATE_MIN - (0x10000 >> 10);
const uint32_t SURROGATE_OFFSET = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN;

// Largest value accepted as a code point by is_code_point_valid().
const uint32_t CODE_POINT_MAX = 0x0010ffffu;
00062
// Returns the low 8 bits of `oc` as an unsigned octet.
// Going through this helper makes octet comparisons independent of
// whether the iterator's value type (e.g. plain char) is signed.
template<typename octet_type>
inline uint8_t mask8(octet_type oc)
{
    return static_cast<uint8_t>(oc & 0xff);
}
// Returns the low 16 bits of `oc` as an unsigned 16-bit value.
template<typename u16_type>
inline uint16_t mask16(u16_type oc)
{
    return static_cast<uint16_t>(oc & 0xffff);
}
// True when `oc` is a UTF-8 continuation (trail) octet, i.e. its two
// most significant bits are `10`.
template<typename octet_type>
inline bool is_trail(octet_type oc)
{
    return (mask8(oc) & 0xc0) == 0x80;
}
00078
00079 template <typename u16>
00080 inline bool is_surrogate(u16 cp)
00081 {
00082 return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
00083 }
00084
00085 template <typename u32>
00086 inline bool is_code_point_valid(u32 cp)
00087 {
00088 return (cp <= CODE_POINT_MAX && !is_surrogate(cp) && cp != 0xfffe && cp != 0xffff);
00089 }
00090
// Returns the total number of octets (1-4) announced by the lead octet
// that `lead_it` points at, or 0 when that octet cannot start a
// sequence. Only the lead octet is read.
template <typename octet_iterator>
inline typename std::iterator_traits<octet_iterator>::difference_type
sequence_length(octet_iterator lead_it)
{
    const uint8_t first = mask8(*lead_it);

    if (first < 0x80)               // 0xxxxxxx
        return 1;
    if ((first & 0xe0) == 0xc0)     // 110xxxxx
        return 2;
    if ((first & 0xf0) == 0xe0)     // 1110xxxx
        return 3;
    if ((first & 0xf8) == 0xf0)     // 11110xxx
        return 4;

    return 0;                       // trail octet or 11111xxx
}
00107
// Result codes produced by validate_next().
enum utf_error {
    OK,                   // sequence decoded and accepted
    NOT_ENOUGH_ROOM,      // fewer octets remain than the lead octet announces
    INVALID_LEAD,         // first octet cannot start a sequence
    INCOMPLETE_SEQUENCE,  // an expected trail octet is not a trail octet
    OVERLONG_SEQUENCE,    // code point encoded with more octets than needed
    INVALID_CODE_POINT    // decoded value rejected by is_code_point_valid()
};
00109
00110 template <typename octet_iterator>
00111 utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t* code_point)
00112 {
00113 uint32_t cp = mask8(*it);
00114
00115 typedef typename std::iterator_traits<octet_iterator>::difference_type octet_difference_type;
00116 octet_difference_type length = sequence_length(it);
00117
00118
00119 if (length == 1) {
00120 if (end - it > 0) {
00121 if (code_point)
00122 *code_point = cp;
00123 ++it;
00124 return OK;
00125 }
00126 else
00127 return NOT_ENOUGH_ROOM;
00128 }
00129
00130
00131 if (std::distance(it, end) < length)
00132 return NOT_ENOUGH_ROOM;
00133
00134
00135 switch (length) {
00136 case 0:
00137 return INVALID_LEAD;
00138 break;
00139 case 2:
00140 if (is_trail(*(++it))) {
00141 cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f);
00142 }
00143 else {
00144 --it;
00145 return INCOMPLETE_SEQUENCE;
00146 }
00147 break;
00148 case 3:
00149 if (is_trail(*(++it))) {
00150 cp = ((cp << 12) & 0xffff) + ((mask8(*it) << 6) & 0xfff);
00151 if (is_trail(*(++it))) {
00152 cp += (*it) & 0x3f;
00153 }
00154 else {
00155 std::advance(it, -2);
00156 return INCOMPLETE_SEQUENCE;
00157 }
00158 }
00159 else {
00160 --it;
00161 return INCOMPLETE_SEQUENCE;
00162 }
00163 break;
00164 case 4:
00165 if (is_trail(*(++it))) {
00166 cp = ((cp << 18) & 0x1fffff) + ((mask8(*it) << 12) & 0x3ffff);
00167 if (is_trail(*(++it))) {
00168 cp += (mask8(*it) << 6) & 0xfff;
00169 if (is_trail(*(++it))) {
00170 cp += (*it) & 0x3f;
00171 }
00172 else {
00173 std::advance(it, -3);
00174 return INCOMPLETE_SEQUENCE;
00175 }
00176 }
00177 else {
00178 std::advance(it, -2);
00179 return INCOMPLETE_SEQUENCE;
00180 }
00181 }
00182 else {
00183 --it;
00184 return INCOMPLETE_SEQUENCE;
00185 }
00186 break;
00187 }
00188
00189 if (!is_code_point_valid(cp)) {
00190 for (octet_difference_type i = 0; i < length - 1; ++i)
00191 --it;
00192 return INVALID_CODE_POINT;
00193 }
00194
00195 if (code_point)
00196 *code_point = cp;
00197
00198 if (cp < 0x80) {
00199 if (length != 1) {
00200 std::advance(it, -(length-1));
00201 return OVERLONG_SEQUENCE;
00202 }
00203 }
00204 else if (cp < 0x800) {
00205 if (length != 2) {
00206 std::advance(it, -(length-1));
00207 return OVERLONG_SEQUENCE;
00208 }
00209 }
00210 else if (cp < 0x10000) {
00211 if (length != 3) {
00212 std::advance(it, -(length-1));
00213 return OVERLONG_SEQUENCE;
00214 }
00215 }
00216
00217 ++it;
00218 return OK;
00219 }
00220
00221 template <typename octet_iterator>
00222 inline utf_error validate_next(octet_iterator& it, octet_iterator end) {
00223 return validate_next(it, end, 0);
00224 }
00225
00226 }
00227
00229
00230
// The three octets compared against by is_bom(): EF BB BF
// (the UTF-8 encoding of U+FEFF).
const uint8_t bom[] = {
    0xef,
    0xbb,
    0xbf
};
00232
00233 template <typename octet_iterator>
00234 octet_iterator find_invalid(octet_iterator start, octet_iterator end)
00235 {
00236 octet_iterator result = start;
00237 while (result != end) {
00238 internal::utf_error err_code = internal::validate_next(result, end);
00239 if (err_code != internal::OK)
00240 return result;
00241 }
00242 return result;
00243 }
00244
// True when find_invalid() finds no rejected sequence in [start, end).
template <typename octet_iterator>
inline bool is_valid(octet_iterator start, octet_iterator end)
{
    const octet_iterator first_bad = find_invalid(start, end);
    return first_bad == end;
}
00250
00251 template <typename octet_iterator>
00252 inline bool is_bom (octet_iterator it)
00253 {
00254 return (
00255 (internal::mask8(*it++)) == bom[0] &&
00256 (internal::mask8(*it++)) == bom[1] &&
00257 (internal::mask8(*it)) == bom[2]
00258 );
00259 }
00260 }
00261
00262 #endif // header guard
00263
00264