c++-gtk-utils
|
00001 /* Copyright (C) 2005 to 2011 Chris Vine 00002 00003 The library comprised in this file or of which this file is part is 00004 distributed by Chris Vine under the GNU Lesser General Public 00005 License as follows: 00006 00007 This library is free software; you can redistribute it and/or 00008 modify it under the terms of the GNU Lesser General Public License 00009 as published by the Free Software Foundation; either version 2.1 of 00010 the License, or (at your option) any later version. 00011 00012 This library is distributed in the hope that it will be useful, but 00013 WITHOUT ANY WARRANTY; without even the implied warranty of 00014 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00015 Lesser General Public License, version 2.1, for more details. 00016 00017 You should have received a copy of the GNU Lesser General Public 00018 License, version 2.1, along with this library (see the file LGPL.TXT 00019 which came with this source code package in the src/utils sub-directory); 00020 if not, write to the Free Software Foundation, Inc., 00021 59 Temple Place - Suite 330, Boston, MA, 02111-1307, USA. 00022 00023 However, it is not intended that the object code of a program whose 00024 source code instantiates a template from this file or uses macros or 00025 inline functions (of any length) should by reason only of that 00026 instantiation or use be subject to the restrictions of use in the GNU 00027 Lesser General Public License. With that in mind, the words "and 00028 macros, inline functions and instantiations of templates (of any 00029 length)" shall be treated as substituted for the words "and small 00030 macros and small inline functions (ten lines or less in length)" in 00031 the fourth paragraph of section 5 of that licence. 
This does not 00032 affect any other reason why object code may be subject to the 00033 restrictions in that licence (nor for the avoidance of doubt does it 00034 affect the application of section 2 of that licence to modifications 00035 of the source code in this file). 00036 00037 */ 00038 00039 #ifndef CGU_CONVERT_H 00040 #define CGU_CONVERT_H 00041 00042 #include <string> 00043 #include <iterator> 00044 #include <exception> 00045 00046 #include <glib.h> 00047 00048 #include <c++-gtk-utils/shared_handle.h> 00049 #include <c++-gtk-utils/cgu_config.h> 00050 00051 namespace Cgu { 00052 00053 /** 00054 * @file convert.h 00055 * @brief This file contains functions for converting between 00056 * character sets. 00057 * 00058 * \#include <c++-gtk-utils/convert.h> 00059 * 00060 * This file contains functions for converting between character sets. 00061 * If you want these functions to work, you will generally have needed 00062 * to have set the locale in the relevant program with either 00063 * <em>std::locale::global(std::locale(""))</em> (from the C++ 00064 * standard library) or <em>setlocale(LC_ALL,"")</em> (from the C 00065 * standard library). 00066 */ 00067 00068 /** 00069 * @namespace Cgu::Utf8 00070 * @brief This namespace contains utilities relevant to the use of 00071 * UTF-8 in programs. 00072 * 00073 * \#include <c++-gtk-utils/convert.h> (for conversion and validation 00074 * functions) 00075 * 00076 * \#include <c++-gtk-utils/reassembler.h> (for Reassembler class) 00077 * @sa convert.h reassembler.h 00078 * 00079 * This namespace contains utilities relevant to the use of UTF-8 in 00080 * programs. If you want these functions to work, you will generally 00081 * have needed to have set the locale in the relevant program with 00082 * either <em>std::locale::global(std::locale(""))</em> (from the C++ 00083 * standard library) or <em>setlocale(LC_ALL,"")</em> (from the C standard 00084 * library). 
00085 */ 00086 00087 namespace Utf8 { 00088 00089 class ConversionError: public std::exception { 00090 GcharSharedHandle message; 00091 public: 00092 virtual const char* what() const throw() {return (const char*)message.get();} 00093 ConversionError(const char* msg): 00094 message(g_strdup_printf("Utf8::ConversionError: %s", msg)) {} 00095 ConversionError(GError* error): 00096 message(g_strdup_printf("Utf8::ConversionError: %s", error->message)) {} 00097 ~ConversionError() throw() {} 00098 }; 00099 00100 /** 00101 * Converts text from UTF-8 to the system's Unicode wide character 00102 * representation, which will be UTF-32/UCS-4 for systems with a wide 00103 * character size of 4 (almost all unix-like systems), and UTF-16 for 00104 * systems with a wide character size of 2. 00105 * @param input Text in valid UTF-8 format. 00106 * @return The input text converted to UTF-32 or UTF-16. 00107 * @exception Cgu::Utf8::ConversionError This exception will be thrown 00108 * if conversion fails because the input string is not in valid UTF-8 00109 * format or the system does not support wide character Unicode 00110 * strings. 00111 * @exception std::bad_alloc This function might throw std::bad_alloc 00112 * if memory is exhausted and the system throws in that case. 00113 */ 00114 std::wstring uniwide_from_utf8(const std::string& input); 00115 00116 /** 00117 * Converts text from the system's Unicode wide character 00118 * representation, which will be UTF-32/UCS-4 for systems with a wide 00119 * character size of 4 (almost all unix-like systems) and UTF-16 for 00120 * systems with a wide character size of 2, to narrow character UTF-8 00121 * format. 00122 * @param input Text in valid UTF-32 or UTF-16 format. 00123 * @return The input text converted to UTF-8. 
00124 * @exception Cgu::Utf8::ConversionError This exception will be thrown 00125 * if conversion fails because the input string is not in valid 00126 * UTF-32/UCS-4 or UTF-16 format or the system does not support wide 00127 * character Unicode strings. 00128 * @exception std::bad_alloc This function might throw std::bad_alloc 00129 * if memory is exhausted and the system throws in that case. 00130 */ 00131 std::string uniwide_to_utf8(const std::wstring& input); 00132 00133 /** 00134 * Converts text from UTF-8 to UTF-32/UCS-4. 00135 * @param input Text in valid UTF-8 format. 00136 * @return The input text converted to UTF-32. 00137 * @exception Cgu::Utf8::ConversionError This exception will be thrown 00138 * if conversion fails because the input string is not in valid UTF-8 00139 * format or the system does not support wide character Unicode 00140 * strings. 00141 * @exception std::bad_alloc This function might throw std::bad_alloc 00142 * if memory is exhausted and the system throws in that case. 00143 */ 00144 std::u32string utf32_from_utf8(const std::string& input); 00145 00146 /** 00147 * Converts text from UTF-32/UCS-4 to narrow character UTF-8 format. 00148 * @param input Text in valid UTF-32 format. 00149 * @return The input text converted to UTF-8. 00150 * @exception Cgu::Utf8::ConversionError This exception will be thrown 00151 * if conversion fails because the input string is not in valid 00152 * UTF-32/UCS-4 format or the system does not support wide character 00153 * Unicode strings. 00154 * @exception std::bad_alloc This function might throw std::bad_alloc 00155 * if memory is exhausted and the system throws in that case. 00156 */ 00157 std::string utf32_to_utf8(const std::u32string& input); 00158 00159 /** 00160 * Converts text from UTF-8 to UTF-16. 00161 * @param input Text in valid UTF-8 format. 00162 * @return The input text converted to UTF-16. 
00163 * @exception Cgu::Utf8::ConversionError This exception will be thrown 00164 * if conversion fails because the input string is not in valid UTF-8 00165 * format or the system does not support wide character Unicode 00166 * strings. 00167 * @exception std::bad_alloc This function might throw std::bad_alloc 00168 * if memory is exhausted and the system throws in that case. 00169 */ 00170 std::u16string utf16_from_utf8(const std::string& input); 00171 00172 /** 00173 * Converts text from UTF-16 to narrow character UTF-8 format. 00174 * @param input Text in valid UTF-16 format. 00175 * @return The input text converted to UTF-8. 00176 * @exception Cgu::Utf8::ConversionError This exception will be thrown 00177 * if conversion fails because the input string is not in valid UTF-16 00178 * format or the system does not support wide character Unicode 00179 * strings. 00180 * @exception std::bad_alloc This function might throw std::bad_alloc 00181 * if memory is exhausted and the system throws in that case. 00182 */ 00183 std::string utf16_to_utf8(const std::u16string& input); 00184 00185 /** 00186 * Converts text from UTF-8 to the system's wide character locale 00187 * representation. For this function to work correctly, the system's 00188 * installed iconv() must support conversion to a generic wchar_t 00189 * target, but in POSIX whether it does so is implementation defined 00190 * (GNU's C library implementation does). For most unix-like systems 00191 * the wide character representation will be Unicode (UCS-4/UTF-32 or 00192 * UTF-16), and where that is the case use the uniwide_from_utf8() 00193 * function instead, which will not rely on the generic target being 00194 * available. 00195 * @param input Text in valid UTF-8 format. 00196 * @return The input text converted to the system's wide character 00197 * locale representation. 
00198 * @exception Cgu::Utf8::ConversionError This exception will be thrown 00199 * if conversion fails because the input string is not in valid UTF-8 00200 * format, or cannot be converted to the system's wide character 00201 * locale representation (eg because the input characters cannot be 00202 * represented by that encoding, or the system's installed iconv() 00203 * function does not support conversion to a generic wchar_t target). 00204 * @exception std::bad_alloc This function might throw std::bad_alloc 00205 * if memory is exhausted and the system throws in that case. 00206 */ 00207 00208 std::wstring wide_from_utf8(const std::string& input); 00209 00210 /** 00211 * Converts text from the system's wide character locale 00212 * representation to UTF-8. For this function to work correctly, the 00213 * system's installed iconv() must support conversion from a generic 00214 * wchar_t target, but in POSIX whether it does so is implementation 00215 * defined (GNU's C library implementation does). For most unix-like 00216 * systems the wide character representation will be Unicode 00217 * (UCS-4/UTF-32 or UTF-16), and where that is the case use the 00218 * uniwide_to_utf8() function instead, which will not rely on the 00219 * generic target being available. 00220 * @param input Text in a valid wide character locale format. 00221 * @return The input text converted to UTF-8. 00222 * @exception Cgu::Utf8::ConversionError This exception will be thrown 00223 * if conversion fails because the input string is not in a valid wide 00224 * character locale format, or cannot be converted to UTF-8 (eg 00225 * because the system's installed iconv() function does not support 00226 * conversion from a generic wchar_t target). 00227 * @exception std::bad_alloc This function might throw std::bad_alloc 00228 * if memory is exhausted and the system throws in that case. 
00229 */ 00230 std::string wide_to_utf8(const std::wstring& input); 00231 00232 /** 00233 * Converts text from UTF-8 to the system's filename encoding. 00234 * @param input Text in valid UTF-8 format. 00235 * @return The input text converted to filename encoding. 00236 * @exception Cgu::Utf8::ConversionError This exception will be thrown 00237 * if conversion fails because the input string is not in valid UTF-8 00238 * format, or cannot be converted to filename encoding (eg because the 00239 * input characters cannot be represented by that encoding). 00240 * @exception std::bad_alloc This function might throw std::bad_alloc 00241 * if memory is exhausted and the system throws in that case. 00242 * @note glib takes the system's filename encoding from the 00243 * environmental variables G_FILENAME_ENCODING and G_BROKEN_FILENAMES. 00244 * If G_BROKEN_FILENAMES is set to 1 and G_FILENAME_ENCODING is not 00245 * set, it will be assumed that the filename encoding is the same as 00246 * the locale encoding. If G_FILENAME_ENCODING is set, then 00247 * G_BROKEN_FILENAMES is ignored, and filename encoding is taken from 00248 * the value held by G_FILENAME_ENCODING. 00249 */ 00250 std::string filename_from_utf8(const std::string& input); 00251 00252 /** 00253 * Converts text from the system's filename encoding to UTF-8. 00254 * @param input Text in valid filename encoding. 00255 * @return The input text converted to UTF-8. 00256 * @exception Cgu::Utf8::ConversionError This exception will be thrown 00257 * if conversion fails because the input string is not in valid 00258 * filename encoding. 00259 * @exception std::bad_alloc This function might throw std::bad_alloc 00260 * if memory is exhausted and the system throws in that case. 00261 * @note glib takes the system's filename encoding from the 00262 * environmental variables G_FILENAME_ENCODING and G_BROKEN_FILENAMES. 
00263 * If G_BROKEN_FILENAMES is set to 1 and G_FILENAME_ENCODING is not 00264 * set, it will be assumed that the filename encoding is the same as 00265 * the locale encoding. If G_FILENAME_ENCODING is set, then 00266 * G_BROKEN_FILENAMES is ignored, and filename encoding is taken from 00267 * the value held by G_FILENAME_ENCODING. 00268 */ 00269 std::string filename_to_utf8(const std::string& input); 00270 00271 /** 00272 * Converts text from UTF-8 to the system's locale encoding. 00273 * @param input Text in valid UTF-8 format. 00274 * @return The input text converted to locale encoding. 00275 * @exception Cgu::Utf8::ConversionError This exception will be thrown 00276 * if conversion fails because the input string is not in valid UTF-8 00277 * format, or cannot be converted to locale encoding (eg because the 00278 * input characters cannot be represented by that encoding). 00279 * @exception std::bad_alloc This function might throw std::bad_alloc 00280 * if memory is exhausted and the system throws in that case. 00281 */ 00282 std::string locale_from_utf8(const std::string& input); 00283 00284 /** 00285 * Converts text from the system's locale encoding to UTF-8. 00286 * @param input Text in valid locale encoding. 00287 * @return The input text converted to UTF-8. 00288 * @exception Cgu::Utf8::ConversionError This exception will be thrown 00289 * if conversion fails because the input string is not in valid locale 00290 * encoding. 00291 * @exception std::bad_alloc This function might throw std::bad_alloc 00292 * if memory is exhausted and the system throws in that case. 00293 */ 00294 std::string locale_to_utf8(const std::string& input); 00295 00296 /** 00297 * Indicates whether the input text comprises valid UTF-8. 00298 * @param text The text to be tested. 00299 * @return true if the input text is in valid UTF-8 format, otherwise 00300 * false. 
00301 * @exception std::bad_alloc This function might throw std::bad_alloc 00302 * if std::string::data() might throw when memory is exhausted. 00303 * @note \#include <c++-gtk-utils/convert.h> for this function. 00304 */ 00305 inline bool validate(const std::string& text) { 00306 return g_utf8_validate(text.data(), text.size(), 0); 00307 } 00308 00309 /************** Iterator class **************/ 00310 00311 /** 00312 * @class Iterator convert.h c++-gtk-utils/convert.h 00313 * @brief A class which will iterate through a std::string object by 00314 * reference to unicode characters rather than by bytes. 00315 * @sa Cgu::Utf8::ReverseIterator 00316 * 00317 * The Cgu::Utf8::Iterator class does the same as 00318 * std::string::const_iterator, except that when iterating through a 00319 * std::string object using the ++ and -- postfix and prefix 00320 * operators, it iterates by increments of whole unicode code points 00321 * rather than by reference to bytes. In addition, the dereferencing 00322 * operator returns the whole unicode code point (a UCS-4 gunichar 00323 * type) rather than a char type. 00324 * 00325 * Where, as in practically all unix-like systems, sizeof(wchar_t) == 00326 * 4, then the gunichar return value of the dereferencing operator can 00327 * be converted by a simple static_cast to the wchar_t type. So far 00328 * as displaying individual code points is concerned however, it 00329 * should be noted that because unicode allows combining characters, a 00330 * unicode code point may not contain the whole representation of a 00331 * character as displayed. This effect can be dealt with for all 00332 * characters capable of representation by Level 1 unicode (ie by 00333 * precomposed characters) using g_utf8_normalize() before iterating. 
00334 * There will still however be some non-European scripts, in 00335 * particular some Chinese/Japanese/Korean ideograms, where 00336 * description of the ideogram requires more than one code point to be 00337 * finally resolved. For these, printing individual code points 00338 * sequentially one by one directly to a display (say with std::wcout) 00339 * may or may not have the desired result, depending on how the 00340 * display device (eg console) deals with that case. 00341 * 00342 * A Cgu::Utf8::Iterator only allows reading from and not writing to 00343 * the std::string object being iterated through. This is because in 00344 * UTF-8 the representation of any one unicode code point will require 00345 * between 1 and 6 bytes: accordingly modifying a UTF-8 string may 00346 * change its length (in bytes) even though the number of unicode 00347 * characters stays the same. For the same reason, this iterator is a 00348 * bidirectional iterator but not a random access iterator. 00349 * 00350 * The std::string object concerned should contain valid UTF-8 text. 00351 * If necessary, this should be checked with Cgu::Utf8::validate() 00352 * first. In addition, before use, the Cgu::Utf8::Iterator object 00353 * must be initialized by a std::string::const_iterator or 00354 * std::string::iterator object pointing to the first byte of a valid 00355 * UTF-8 character in the string (or by another Cgu::Utf8::Iterator 00356 * object or by a Cgu::Utf8::ReverseIterator object), and iteration 00357 * will begin at the point of initialization: therefore, assuming the 00358 * string contains valid UTF-8 text, passing std::string::begin() to a 00359 * Cgu::Utf8::Iterator object will always be safe. Initialization by 00360 * std::string::end() is also valid if the first iteration is 00361 * backwards with the -- operator. This initialization can be done 00362 * either in the constructor or by assignment. 
Comparison operators 00363 * ==, !=, <, <=, > and >= are provided enabling the position of 00364 * Cgu::Utf8::Iterator objects to be compared with each other or with 00365 * std::string::const_iterator and std::string::iterator objects. 00366 * 00367 * This is an example: 00368 * @code 00369 * using namespace Cgu; 00370 * 00371 * std::wstring wide_str(L"ßøǿón"); 00372 * std::string narrow_str(Utf8::uniwide_to_utf8(wide_str)); 00373 * 00374 * Utf8::Iterator iter; 00375 * for (iter = narrow_str.begin(); 00376 * iter != narrow_str.end(); 00377 * ++iter) 00378 * std::wcout << static_cast<wchar_t>(*iter) << std::endl; 00379 * @endcode 00380 * 00381 * This class assumes in using g_utf8_next_char(), g_utf8_prev_char() 00382 * and g_utf8_get_char() that the std::string object keeps its 00383 * internal string in contiguous storage. This is required by the 00384 * C++11 standard, but not formally by C++98/C++03. However, known 00385 * implementations of std::string in fact store the string 00386 * contiguously. 00387 */ 00388 00389 class ReverseIterator; 00390 00391 class Iterator { 00392 public: 00393 typedef gunichar value_type; 00394 typedef gunichar reference; // read only 00395 typedef void pointer; // read only 00396 typedef std::string::difference_type difference_type; 00397 typedef std::bidirectional_iterator_tag iterator_category; 00398 00399 private: 00400 std::string::const_iterator pos; 00401 public: 00402 00403 /** 00404 * Increments the iterator so that it moves from the beginning of the 00405 * current UTF-8 character to the beginning of the next UTF-8 00406 * character. It is a prefix operator. It will not throw. 00407 * @return A reference to the iterator in its new position. 00408 */ 00409 Iterator& operator++(); 00410 00411 /** 00412 * Increments the iterator so that it moves from the beginning of the 00413 * current UTF-8 character to the beginning of the next UTF-8 00414 * character. It is a postfix operator. 
It will not throw provided 00415 * that copy constructing and assigning a std::string::const_iterator 00416 * object does not throw, as it will not in any sane implementation. 00417 * @return A copy of the iterator in its former position. 00418 */ 00419 Iterator operator++(int); 00420 00421 /** 00422 * Decrements the iterator so that it moves from the beginning of the 00423 * current UTF-8 character to the beginning of the previous UTF-8 00424 * character. It is a prefix operator. It will not throw. 00425 * @return A reference to the iterator in its new position. 00426 */ 00427 Iterator& operator--(); 00428 00429 /** 00430 * Decrements the iterator so that it moves from the beginning of the 00431 * current UTF-8 character to the beginning of the previous UTF-8 00432 * character. It is a postfix operator. It will not throw provided 00433 * that copy constructing and assigning a std::string::const_iterator 00434 * object does not throw, as it will not in any sane implementation. 00435 * @return A copy of the iterator in its former position. 00436 */ 00437 Iterator operator--(int); 00438 00439 /** 00440 * Assigns a std::string::const_iterator object to this object. It 00441 * should point to the beginning of a UTF-8 character (eg 00442 * std::string::begin()) or to std::string::end(). It will not throw 00443 * provided assigning a std::string::const_iterator object does not 00444 * throw, as it will not in any sane implementation. 00445 * @param iter The std::string::const_iterator. 00446 * @return A reference to this Cgu::Utf8::Iterator object after 00447 * assignment. 00448 */ 00449 Iterator& operator=(const std::string::const_iterator& iter) {pos = iter; return *this;} 00450 00451 /** 00452 * Assigns a std::string::iterator object to this object. It should 00453 * point to the beginning of a UTF-8 character (eg 00454 * std::string::begin()) or to std::string::end(). 
It will not throw 00455 * provided assigning a std::string::const_iterator object does not 00456 * throw, as it will not in any sane implementation. 00457 * @param iter The std::string::iterator. 00458 * @return A reference to this Cgu::Utf8::Iterator object after 00459 * assignment. 00460 */ 00461 Iterator& operator=(const std::string::iterator& iter) {pos = iter; return *this;} 00462 00463 /** 00464 * Assigns a Cgu::Utf8::Iterator object to this object. It will not 00465 * throw provided assigning a std::string::const_iterator object does 00466 * not throw, as it will not in any sane implementation. 00467 * @param iter The iterator. 00468 * @return A reference to this Cgu::Utf8::Iterator object after 00469 * assignment. 00470 */ 00471 Iterator& operator=(const Iterator& iter) {pos = iter.pos; return *this;} 00472 00473 /** 00474 * Assigns a Cgu::Utf8::ReverseIterator object to this object, so that 00475 * this iterator adopts the same physical position (but the logical 00476 * position will be offset to the following UTF-8 character). It will 00477 * not throw provided assigning a std::string::const_iterator object 00478 * does not throw, as it will not in any sane implementation. 00479 * @param iter The iterator. 00480 * @return A reference to this Cgu::Utf8::Iterator object after 00481 * assignment. 00482 */ 00483 Iterator& operator=(const ReverseIterator& iter); 00484 00485 /** 00486 * The dereference operator. 00487 * @return A 32-bit gunichar object containing the whole unicode code 00488 * point which is currently represented by this iterator. It will not 00489 * throw. 00490 */ 00491 Iterator::value_type operator*() const {return g_utf8_get_char(&(*pos));} 00492 00493 /** 00494 * @return The current underlying std::string::const_iterator kept by 00495 * this iterator. Once this iterator has been correctly initialized, 00496 * that will point to the beginning of the UTF-8 character currently 00497 * represented by this iterator or to std::string::end(). 
It will not 00498 * throw provided assigning a std::string::const_iterator object does 00499 * not throw, as it will not in any sane implementation. 00500 */ 00501 std::string::const_iterator base() const {return pos;} 00502 00503 /** 00504 * Constructs this iterator and initialises it with a 00505 * std::string::const_iterator object. It should point to the 00506 * beginning of a UTF-8 character (eg std::string::begin()) or to 00507 * std::string::end(). It will not throw provided that copy 00508 * constructing a std::string::const_iterator object does not throw, 00509 * as it will not in any sane implementation. This is a type 00510 * conversion constructor (it is not marked explicit) so that it can 00511 * be used with Cgu::Utf8::Iterator comparison operators to compare 00512 * the position of Cgu::Utf8::Iterator with 00513 * std::string::const_iterator objects. 00514 * @param iter The std::string::const_iterator. 00515 */ 00516 Iterator(const std::string::const_iterator& iter): pos(iter) {} 00517 00518 /** 00519 * Constructs this iterator and initialises it with a 00520 * std::string::iterator object. It should point to the beginning of 00521 * a UTF-8 character (eg std::string::begin()) or to 00522 * std::string::end(). It will not throw provided that copy 00523 * constructing a std::string::const_iterator object does not throw, 00524 * as it will not in any sane implementation. This is a type 00525 * conversion constructor (it is not marked explicit) so that it can 00526 * be used with Cgu::Utf8::Iterator comparison operators to compare 00527 * the position of Cgu::Utf8::Iterator with std::string::iterator 00528 * objects. 00529 * @param iter The std::string::iterator. 00530 */ 00531 Iterator(const std::string::iterator& iter): pos(iter) {} 00532 00533 /** 00534 * Constructs this iterator and initialises it with another 00535 * Cgu::Utf8::Iterator object. 
It will not throw provided that copy 00536 * constructing a std::string::const_iterator object does not throw, 00537 * as it will not in any sane implementation. 00538 * @param iter The iterator. 00539 */ 00540 Iterator(const Iterator& iter): pos(iter.pos) {} 00541 00542 /** 00543 * Constructs this iterator and initialises it with a 00544 * Cgu::Utf8::ReverseIterator object, so that this iterator adopts the 00545 * same physical position (but the logical position will be offset to 00546 * the following UTF-8 character). It will not throw provided that 00547 * copy constructing a std::string::const_iterator object does not 00548 * throw, as it will not in any sane implementation. 00549 * @param iter The iterator. 00550 */ 00551 explicit Iterator(const ReverseIterator& iter); 00552 00553 /** 00554 * The default constructor will not throw. 00555 */ 00556 Iterator() {} 00557 00558 /* Only has effect if --with-glib-memory-slices-compat or 00559 * --with-glib-memory-slices-no-compat option picked */ 00560 CGU_GLIB_MEMORY_SLICES_FUNCS 00561 }; 00562 00563 inline Iterator& Iterator::operator++() { 00564 const std::string::value_type* tmp = &(*pos); 00565 // using g_utf8_next_char is safe even when pos points to the last character - 00566 // that macro calls up the g_utf8_skip look-up table rather than attempting to 00567 // read the following character, so we can safely iterate to std::string::end() 00568 pos += g_utf8_next_char(tmp) - tmp; 00569 return *this; 00570 } 00571 00572 inline Iterator Iterator::operator++(int) { 00573 Iterator tmp{*this}; 00574 ++(*this); 00575 return tmp; 00576 } 00577 00578 inline Iterator& Iterator::operator--() { 00579 // we might be iterating from std::string::end() so we need 00580 // to decrement before dereferencing and then increment again 00581 const std::string::value_type* tmp = &(*(pos-1)); 00582 ++tmp; 00583 pos -= tmp - g_utf8_prev_char(tmp); 00584 return *this; 00585 } 00586 00587 inline Iterator Iterator::operator--(int) { 00588 
Iterator tmp{*this}; 00589 --(*this); 00590 return tmp; 00591 } 00592 00593 /** 00594 * The comparison operators will not throw provided assigning a 00595 * std::string::const_iterator object does not throw, as it will not 00596 * in any sane implementation. 00597 */ 00598 inline bool operator==(const Iterator& iter1, const Iterator& iter2) { 00599 return (iter1.base() == iter2.base()); 00600 } 00601 00602 /** 00603 * The comparison operators will not throw provided assigning a 00604 * std::string::const_iterator object does not throw, as it will not 00605 * in any sane implementation. 00606 */ 00607 inline bool operator!=(const Iterator& iter1, const Iterator& iter2) { 00608 return (iter1.base() != iter2.base()); 00609 } 00610 00611 /** 00612 * The comparison operators will not throw provided assigning a 00613 * std::string::const_iterator object does not throw, as it will not 00614 * in any sane implementation. 00615 */ 00616 inline bool operator<(const Iterator& iter1, const Iterator& iter2) { 00617 return (iter1.base() < iter2.base()); 00618 } 00619 00620 /** 00621 * The comparison operators will not throw provided assigning a 00622 * std::string::const_iterator object does not throw, as it will not 00623 * in any sane implementation. 00624 */ 00625 inline bool operator<=(const Iterator& iter1, const Iterator& iter2) { 00626 return (iter1.base() <= iter2.base()); 00627 } 00628 00629 /** 00630 * The comparison operators will not throw provided assigning a 00631 * std::string::const_iterator object does not throw, as it will not 00632 * in any sane implementation. 00633 */ 00634 inline bool operator>(const Iterator& iter1, const Iterator& iter2) { 00635 return (iter1.base() > iter2.base()); 00636 } 00637 00638 /** 00639 * The comparison operators will not throw provided assigning a 00640 * std::string::const_iterator object does not throw, as it will not 00641 * in any sane implementation. 
00642 */ 00643 inline bool operator>=(const Iterator& iter1, const Iterator& iter2) { 00644 return (iter1.base() >= iter2.base()); 00645 } 00646 00647 /************** ReverseIterator class **************/ 00648 00649 /** 00650 * @class ReverseIterator convert.h c++-gtk-utils/convert.h 00651 * @brief A class which will iterate in reverse through a std::string 00652 * object by reference to unicode characters rather than by bytes. 00653 * @sa Cgu::Utf8::Iterator 00654 * 00655 * The Cgu::Utf8::ReverseIterator class does the same as 00656 * std::string::const_reverse_iterator, except that when iterating 00657 * through a std::string object using the ++ and -- postfix and prefix 00658 * operators, it iterates by increments of whole unicode code points 00659 * rather than by reference to bytes. In addition, the dereferencing 00660 * operator returns the whole unicode code point (a UCS-4 gunichar 00661 * type) rather than a char type. 00662 * 00663 * Before use, the Cgu::Utf8::ReverseIterator object must be 00664 * initialized by a std::string::const_reverse_iterator or 00665 * std::string::reverse_iterator object representing the first byte of 00666 * a valid UTF-8 character in the string (or by another 00667 * Cgu::Utf8::ReverseIterator object or by a Cgu::Utf8::Iterator 00668 * object): so assuming the string contains valid UTF-8 text, it is 00669 * always valid to initialise a Cgu::Utf8::ReverseIterator with 00670 * std::string::rbegin(). Initialization by std::string::rend() is 00671 * also valid if the first interation is backwards with the -- 00672 * operator. This initialization can be done either in the 00673 * constructor or by assignment. Comparison operators ==, !=, <, <=, 00674 * > and >= are provided enabling the position of 00675 * Cgu::Utf8::ReverseIterator objects to be compared with each other 00676 * or with std::string::const_reverse_iterator and 00677 * std::string::reverse_iterator objects. 
00678 * 00679 * This is an example: 00680 * @code 00681 * using namespace Cgu; 00682 * 00683 * std::wstring wide_str(L"ßøǿón"); 00684 * std::string narrow_str(Utf8::uniwide_to_utf8(wide_str)); 00685 * 00686 * Utf8::ReverseIterator iter; 00687 * for (iter = narrow_str.rbegin(); 00688 * iter != narrow_str.rend(); 00689 * ++iter) 00690 * std::wcout << static_cast<wchar_t>(*iter) << std::endl; 00691 * @endcode 00692 * 00693 * For further information on its use, see the Utf8::Iterator 00694 * documentation. 00695 */ 00696 00697 class ReverseIterator { 00698 public: 00699 typedef gunichar value_type; 00700 typedef gunichar reference; // read only 00701 typedef void pointer; // read only 00702 typedef std::string::difference_type difference_type; 00703 typedef std::bidirectional_iterator_tag iterator_category; 00704 00705 private: 00706 std::string::const_iterator pos; 00707 // we use cache to make iterating and then dereferencing more efficient 00708 mutable std::string::const_iterator cache; 00709 public: 00710 00711 /** 00712 * Increments the iterator in the reverse direction so that it moves 00713 * from the beginning of the current UTF-8 character to the beginning 00714 * of the previous UTF-8 character in the std::string object 00715 * concerned. It is a prefix operator. It will not throw provided 00716 * assigning a std::string::const_iterator object does not throw, as 00717 * it will not in any sane implementation. 00718 * @return A reference to the iterator in its new position 00719 */ 00720 ReverseIterator& operator++(); 00721 00722 /** 00723 * Increments the iterator in the reverse direction so that it moves 00724 * from the beginning of the current UTF-8 character to the beginning 00725 * of the previous UTF-8 character in the std::string object 00726 * concerned. It is a postfix operator. 
It will not throw provided 00727 * that copy constructing and assigning a std::string::const_iterator 00728 * object does not throw, as it will not in any sane implementation. 00729 * @return A copy of the iterator in its former position 00730 */ 00731 ReverseIterator operator++(int); 00732 00733 /** 00734 * Decrements the iterator in the reverse direction so that it moves 00735 * from the beginning of the current UTF-8 character to the beginning 00736 * of the following UTF-8 character in the std::string object 00737 * concerned. It is a prefix operator. It will not throw provided 00738 * assigning a std::string::const_iterator object does not throw, as 00739 * it will not in any sane implementation. 00740 * @return A reference to the iterator in its new position 00741 */ 00742 ReverseIterator& operator--(); 00743 00744 /** 00745 * Decrements the iterator in the reverse direction so that it moves 00746 * from the beginning of the current UTF-8 character to the beginning 00747 * of the following UTF-8 character in the std::string object 00748 * concerned. It is a postfix operator. It will not throw provided 00749 * that copy constructing and assigning a std::string::const_iterator 00750 * object does not throw, as it will not in any sane implementation. 00751 * @return A copy of the iterator in its former position 00752 */ 00753 ReverseIterator operator--(int); 00754 00755 /** 00756 * Assigns a std::string::const_reverse_iterator object to this 00757 * object. It should represent the beginning of a UTF-8 character (eg 00758 * std::string::rbegin()) or comprise std::string::rend(). It will 00759 * not throw provided assigning a std::string::const_iterator object 00760 * does not throw, as it will not in any sane implementation. 00761 * @param iter The const_reverse_iterator. 00762 * @return A reference to this Cgu::Utf8::ReverseIterator object after 00763 * assignment. 
00764 */ 00765 ReverseIterator& operator=(const std::string::const_reverse_iterator& iter) {pos = iter.base(); cache = pos; return *this;} 00766 00767 /** 00768 * Assigns a std::string::reverse_iterator object to this object. It 00769 * should represent the beginning of a UTF-8 character (eg 00770 * std::string::rbegin()) or comprise std::string::rend(). It will 00771 * not throw provided assigning a std::string::const_iterator object 00772 * does not throw, as it will not in any sane implementation. 00773 * @param iter The reverse_iterator. 00774 * @return A reference to this Cgu::Utf8::ReverseIterator object after 00775 * assignment. 00776 */ 00777 ReverseIterator& operator=(const std::string::reverse_iterator& iter) {pos = iter.base(); cache = pos; return *this;} 00778 00779 /** 00780 * Assigns a Cgu::Utf8::ReverseIterator object to this object. It 00781 * will not throw provided assigning a std::string::const_iterator 00782 * object does not throw, as it will not in any sane implementation. 00783 * @param iter The iterator. 00784 * @return A reference to this Cgu::Utf8::ReverseIterator object after 00785 * assignment. 00786 */ 00787 ReverseIterator& operator=(const ReverseIterator& iter) {pos = iter.pos; cache = iter.cache; return *this;} 00788 00789 /** 00790 * Assigns a Cgu::Utf8::Iterator object to this object, so that this 00791 * iterator adopts the same physical position (but the logical 00792 * position will be offset to the previous UTF-8 character in the 00793 * std::string object concerned). It will not throw provided 00794 * assigning a std::string::const_iterator object does not throw, as 00795 * it will not in any sane implementation. 00796 * @param iter The iterator. 00797 * @return A reference to this Cgu::Utf8::ReverseIterator object after 00798 * assignment. 00799 */ 00800 ReverseIterator& operator=(const Iterator& iter) {pos = iter.base(); cache = pos; return *this;} 00801 00802 /** 00803 * The dereference operator. 
00804 * @return A 32-bit gunichar object containing the whole unicode code 00805 * point which is currently represented by this iterator. It will not 00806 * throw. 00807 */ 00808 ReverseIterator::value_type operator*() const; 00809 00810 /** 00811 * @return The current underlying std::string::const_iterator kept by 00812 * this iterator. Once this iterator has been correctly initialized, 00813 * that will point to the beginning of the UTF-8 character after the 00814 * one currently represented by this iterator or to 00815 * std::string::end(). It will not throw provided assigning a 00816 * std::string::const_iterator object does not throw, as it will not 00817 * in any sane implementation. 00818 */ 00819 std::string::const_iterator base() const {return pos;} 00820 00821 /** 00822 * Constructs this iterator and initialises it with a 00823 * std::string::const_reverse_iterator object. It should represent 00824 * the beginning of a UTF-8 character (eg std::string::rbegin()) or 00825 * comprise std::string::rend(). It will not throw provided that copy 00826 * constructing a std::string::const_iterator object does not throw, 00827 * as it will not in any sane implementation. This is a type 00828 * conversion constructor (it is not marked explicit) so that it can 00829 * be used with Cgu::Utf8::ReverseIterator comparison operators to 00830 * compare the position of Cgu::Utf8::ReverseIterator with 00831 * std::string::const_reverse_iterator objects. 00832 * @param iter The const_reverse_iterator. 00833 */ 00834 ReverseIterator(const std::string::const_reverse_iterator& iter): pos(iter.base()), cache(pos) {} 00835 00836 /** 00837 * Constructs this iterator and initialises it with a 00838 * std::string::reverse_iterator object. It should represent the 00839 * beginning of a UTF-8 character (eg std::string::rbegin()) or 00840 * comprise std::string::rend(). 
It will not throw provided that copy 00841 * constructing a std::string::const_iterator object does not throw, 00842 * as it will not in any sane implementation. This is a type 00843 * conversion constructor (it is not marked explicit) so that it can 00844 * be used with Cgu::Utf8::ReverseIterator comparison operators to 00845 * compare the position of Cgu::Utf8::ReverseIterator with 00846 * std::string::reverse_iterator objects. 00847 * @param iter The reverse_iterator. 00848 */ 00849 ReverseIterator(const std::string::reverse_iterator& iter): pos(iter.base()), cache(pos) {} 00850 00851 /** 00852 * Constructs this iterator and initialises it with another 00853 * Cgu::Utf8::ReverseIterator object. It will not throw provided that 00854 * copy constructing a std::string::const_iterator object does not 00855 * throw, as it will not in any sane implementation. 00856 * @param iter The iterator. 00857 */ 00858 ReverseIterator(const ReverseIterator& iter): pos(iter.pos), cache(iter.cache) {} 00859 00860 /** 00861 * Constructs this iterator and initialises it with a 00862 * Cgu::Utf8::Iterator object, so that this iterator adopts the same 00863 * physical position (but the logical position will be offset to the 00864 * previous UTF-8 character in the std::string object concerned). It 00865 * will not throw provided that copy constructing a 00866 * std::string::const_iterator object does not throw, as it will not 00867 * in any sane implementation. 00868 * @param iter The iterator. 00869 */ 00870 explicit ReverseIterator(const Iterator& iter): pos(iter.base()), cache(pos) {} 00871 00872 /** 00873 * The default constructor will not throw. 
00874 */ 00875 ReverseIterator() {} 00876 00877 /* Only has effect if --with-glib-memory-slices-compat or 00878 * --with-glib-memory-slices-no-compat option picked */ 00879 CGU_GLIB_MEMORY_SLICES_FUNCS 00880 }; 00881 00882 inline ReverseIterator& ReverseIterator::operator++() { 00883 00884 if (pos > cache) pos = cache; 00885 00886 else { 00887 // we might be iterating from std::string::end()/std::string::rbegin() so 00888 // we need to decrement before dereferencing and then increment again 00889 const std::string::value_type* tmp = &(*(pos-1)); 00890 ++tmp; 00891 pos -= tmp - g_utf8_prev_char(tmp); 00892 } 00893 return *this; 00894 } 00895 00896 inline ReverseIterator ReverseIterator::operator++(int) { 00897 ReverseIterator tmp{*this}; 00898 ++(*this); 00899 return tmp; 00900 } 00901 00902 inline ReverseIterator& ReverseIterator::operator--() { 00903 cache = pos; 00904 const std::string::value_type* tmp = &(*pos); 00905 // using g_utf8_next_char is safe even when pos points to the first character - 00906 // that macro calls up the g_utf8_skip look-up table rather than attempting to 00907 // read the following character, so we can safely iterate to std::string::rbegin() 00908 pos += g_utf8_next_char(tmp) - tmp; 00909 return *this; 00910 } 00911 00912 inline ReverseIterator ReverseIterator::operator--(int) { 00913 ReverseIterator tmp{*this}; 00914 --(*this); 00915 return tmp; 00916 } 00917 00918 inline ReverseIterator::value_type ReverseIterator::operator*() const { 00919 Iterator tmp{*this}; 00920 --tmp; 00921 cache = tmp.base(); 00922 return g_utf8_get_char(&(*(tmp.base()))); 00923 } 00924 00925 /** 00926 * The comparison operators will not throw provided assigning a 00927 * std::string::const_iterator object does not throw, as it will not 00928 * in any sane implementation. 
00929 */ 00930 inline bool operator==(const ReverseIterator& iter1, const ReverseIterator& iter2) { 00931 return (iter1.base() == iter2.base()); 00932 } 00933 00934 /** 00935 * The comparison operators will not throw provided assigning a 00936 * std::string::const_iterator object does not throw, as it will not 00937 * in any sane implementation. 00938 */ 00939 inline bool operator!=(const ReverseIterator& iter1, const ReverseIterator& iter2) { 00940 return (iter1.base() != iter2.base()); 00941 } 00942 00943 /** 00944 * The comparison operators will not throw provided assigning a 00945 * std::string::const_iterator object does not throw, as it will not 00946 * in any sane implementation. Ordering is viewed from the 00947 * perspective of the logical operation (reverse iteration), so that 00948 * for example an iterator at position std::string::rbegin() is less 00949 * than an iterator at position std::string::rend(). 00950 */ 00951 inline bool operator<(const ReverseIterator& iter1, const ReverseIterator& iter2) { 00952 return (iter1.base() > iter2.base()); 00953 } 00954 00955 /** 00956 * The comparison operators will not throw provided assigning a 00957 * std::string::const_iterator object does not throw, as it will not 00958 * in any sane implementation. Ordering is viewed from the 00959 * perspective of the logical operation (reverse iteration), so that 00960 * for example an iterator at position std::string::rbegin() is less 00961 * than an iterator at position std::string::rend(). 00962 */ 00963 inline bool operator<=(const ReverseIterator& iter1, const ReverseIterator& iter2) { 00964 return (iter1.base() >= iter2.base()); 00965 } 00966 00967 /** 00968 * The comparison operators will not throw provided assigning a 00969 * std::string::const_iterator object does not throw, as it will not 00970 * in any sane implementation. 
Ordering is viewed from the 00971 * perspective of the logical operation (reverse iteration), so that 00972 * for example an iterator at position std::string::rbegin() is less 00973 * than an iterator at position std::string::rend(). 00974 */ 00975 inline bool operator>(const ReverseIterator& iter1, const ReverseIterator& iter2) { 00976 return (iter1.base() < iter2.base()); 00977 } 00978 00979 /** 00980 * The comparison operators will not throw provided assigning a 00981 * std::string::const_iterator object does not throw, as it will not 00982 * in any sane implementation. Ordering is viewed from the 00983 * perspective of the logical operation (reverse iteration), so that 00984 * for example an iterator at position std::string::rbegin() is less 00985 * than an iterator at position std::string::rend(). 00986 */ 00987 inline bool operator>=(const ReverseIterator& iter1, const ReverseIterator& iter2) { 00988 return (iter1.base() <= iter2.base()); 00989 } 00990 00991 /*** Iterator class methods which require ReverseIterator as a complete type ***/ 00992 00993 inline Iterator& Iterator::operator=(const ReverseIterator& iter) { 00994 pos = iter.base(); 00995 return *this; 00996 } 00997 00998 inline Iterator::Iterator(const ReverseIterator& iter): pos(iter.base()) {} 00999 01000 } // namespace Utf8 01001 01002 } // namespace Cgu 01003 01004 #endif