Go to the documentation of this file.00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013 #include "utf8_codecvt.hpp"
00014
00015 #include <boost/detail/utf8_codecvt_facet.hpp>
00016
00017 #include <cstdlib>
00018 #include <cassert>
00019
00020 #include <boost/limits.hpp>
00021 #include <boost/config.hpp>
00022
00023
00024
00025
00026
00027
00028 #ifndef BOOST_NO_STD_WSTRING
00029
00030 BOOST_UTF8_BEGIN_NAMESPACE
00031
00033
00034
00035
00036 std::codecvt_base::result utf8_codecvt_facet::do_in(
00037 std::mbstate_t& ,
00038 const char * from,
00039 const char * from_end,
00040 const char * & from_next,
00041 wchar_t * to,
00042 wchar_t * to_end,
00043 wchar_t * & to_next
00044 ) const {
00045
00046
00047
00048
00049
00050
00051
00052
00053 while (from != from_end && to != to_end) {
00054
00055
00056 if (invalid_leading_octet(*from)){
00057 from_next = from;
00058 to_next = to;
00059 return std::codecvt_base::error;
00060 }
00061
00062
00063
00064 const int cont_octet_count = get_cont_octet_count(*from);
00065 const wchar_t octet1_modifier_table[] = {
00066 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
00067 };
00068
00069
00070
00071 wchar_t ucs_result =
00072 (unsigned char)(*from++) - octet1_modifier_table[cont_octet_count];
00073
00074
00075
00076
00077
00078 int i = 0;
00079 while(i != cont_octet_count && from != from_end) {
00080
00081
00082 if (invalid_continuing_octet(*from)) {
00083 from_next = from;
00084 to_next = to;
00085 return std::codecvt_base::error;
00086 }
00087
00088 ucs_result *= (1 << 6);
00089
00090
00091
00092 ucs_result += (unsigned char)(*from++) - 0x80;
00093 ++i;
00094 }
00095
00096
00097 if (from == from_end && i != cont_octet_count) {
00098
00099 from_next = from - (i+1);
00100 to_next = to;
00101 return std::codecvt_base::partial;
00102 }
00103 *to++ = ucs_result;
00104 }
00105 from_next = from;
00106 to_next = to;
00107
00108
00109 if(from == from_end) return std::codecvt_base::ok;
00110 else return std::codecvt_base::partial;
00111 }
00112
00113 std::codecvt_base::result utf8_codecvt_facet::do_out(
00114 std::mbstate_t& ,
00115 const wchar_t * from,
00116 const wchar_t * from_end,
00117 const wchar_t * & from_next,
00118 char * to,
00119 char * to_end,
00120 char * & to_next
00121 ) const
00122 {
00123
00124 const wchar_t octet1_modifier_table[] = {
00125 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
00126 };
00127
00128 wchar_t max_wchar = (std::numeric_limits<wchar_t>::max)();
00129 while (from != from_end && to != to_end) {
00130
00131
00132 if (*from > max_wchar) {
00133 from_next = from;
00134 to_next = to;
00135 return std::codecvt_base::error;
00136 }
00137
00138 int cont_octet_count = get_cont_octet_out_count(*from);
00139
00140
00141 int shift_exponent = (cont_octet_count) * 6;
00142
00143
00144 *to++ = static_cast<char>(octet1_modifier_table[cont_octet_count] +
00145 (unsigned char)(*from / (1 << shift_exponent)));
00146
00147
00148
00149
00150
00151
00152 int i = 0;
00153 while (i != cont_octet_count && to != to_end) {
00154 shift_exponent -= 6;
00155 *to++ = static_cast<char>(0x80 + ((*from / (1 << shift_exponent)) % (1 << 6)));
00156 ++i;
00157 }
00158
00159 if(to == to_end && i != cont_octet_count) {
00160 from_next = from;
00161 to_next = to - (i+1);
00162 return std::codecvt_base::partial;
00163 }
00164 *from++;
00165 }
00166 from_next = from;
00167 to_next = to;
00168
00169 if(from == from_end) return std::codecvt_base::ok;
00170 else return std::codecvt_base::partial;
00171 }
00172
00173
00174
00175 int utf8_codecvt_facet::do_length(
00176 BOOST_CODECVT_DO_LENGTH_CONST std::mbstate_t &,
00177 const char * from,
00178 const char * from_end,
00179 std::size_t max_limit
00180 #if BOOST_WORKAROUND(__IBMCPP__, BOOST_TESTED_AT(600))
00181 ) const throw()
00182 #else
00183 ) const
00184 #endif
00185 {
00186
00187
00188
00189
00190
00191
00192
00193
00194
00195 int last_octet_count=0;
00196 std::size_t char_count = 0;
00197 const char* from_next = from;
00198
00199 while (from_next+last_octet_count <= from_end && char_count <= max_limit) {
00200 from_next += last_octet_count;
00201 last_octet_count = (get_octet_count(*from_next));
00202 ++char_count;
00203 }
00204 return static_cast<int>(from_next-from_end);
00205 }
00206
00207 unsigned int utf8_codecvt_facet::get_octet_count(
00208 unsigned char lead_octet
00209 ){
00210
00211 if (lead_octet <= 0x7f) return 1;
00212
00213
00214
00215
00216 if (0xc0 <= lead_octet && lead_octet <= 0xdf) return 2;
00217 else if (0xe0 <= lead_octet && lead_octet <= 0xef) return 3;
00218 else if (0xf0 <= lead_octet && lead_octet <= 0xf7) return 4;
00219 else if (0xf8 <= lead_octet && lead_octet <= 0xfb) return 5;
00220 else return 6;
00221 }
00222 BOOST_UTF8_END_NAMESPACE
00223
namespace {
    // Number of continuation octets needed to encode 'word' in UTF-8.
    // The template argument is the size of wchar_t in bytes and is used only
    // to select an implementation.
    //
    // Primary template: never reports more than two continuation octets.
    template<std::size_t s>
    int get_cont_octet_out_count_impl(wchar_t word){
        return (word < 0x80)  ? 0
             : (word < 0x800) ? 1
             : 2;
    }

    // Specialization selected for a 4-byte wchar_t: reports up to five
    // continuation octets.
    template<>
    int get_cont_octet_out_count_impl<4>(wchar_t word){
        // Exclusive upper bounds of the values that fit with 0, 1, 2, 3 and 4
        // continuation octets respectively.
        static const int exclusive_limits[] = {
            0x80, 0x800, 0x10000, 0x200000, 0x4000000
        };
        const int limit_count =
            static_cast<int>(sizeof(exclusive_limits) / sizeof(exclusive_limits[0]));

        int cont_octets = 0;
        while (cont_octets != limit_count && !(word < exclusive_limits[cont_octets])) {
            ++cont_octets;
        }
        return cont_octets;
    }

}
00260
00261 BOOST_UTF8_BEGIN_NAMESPACE
00262
00263
00264 int utf8_codecvt_facet::get_cont_octet_out_count(
00265 wchar_t word
00266 ) const {
00267 return get_cont_octet_out_count_impl<sizeof(wchar_t)>(word);
00268 }
00269 BOOST_UTF8_END_NAMESPACE
00270
00271 #endif