ucommon
|
00001 // Copyright (C) 2009-2014 David Sugar, Tycho Softworks. 00002 // 00003 // This file is part of GNU uCommon C++. 00004 // 00005 // GNU uCommon C++ is free software: you can redistribute it and/or modify 00006 // it under the terms of the GNU Lesser General Public License as published 00007 // by the Free Software Foundation, either version 3 of the License, or 00008 // (at your option) any later version. 00009 // 00010 // GNU uCommon C++ is distributed in the hope that it will be useful, 00011 // but WITHOUT ANY WARRANTY; without even the implied warranty of 00012 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00013 // GNU Lesser General Public License for more details. 00014 // 00015 // You should have received a copy of the GNU Lesser General Public License 00016 // along with GNU uCommon C++. If not, see <http://www.gnu.org/licenses/>. 00017 00032 #ifndef _UCOMMON_UNICODE_H_ 00033 #define _UCOMMON_UNICODE_H_ 00034 00035 #ifndef _UCOMMON_STRING_H_ 00036 #include <ucommon/string.h> 00037 #endif 00038 00039 namespace ucommon { 00040 00045 typedef int32_t ucs4_t; 00046 00050 typedef int16_t ucs2_t; 00051 00055 typedef void *unicode_t; 00056 00062 class __EXPORT utf8 00063 { 00064 public: 00068 static const unsigned ucsize; 00069 00073 static const char *nil; 00074 00080 static unsigned size(const char *codepoint); 00081 00087 static size_t count(const char *string); 00088 00095 static char *offset(char *string, ssize_t position); 00096 00102 static ucs4_t codepoint(const char *encoded); 00103 00109 static size_t chars(const unicode_t string); 00110 00116 static size_t chars(ucs4_t character); 00117 00124 static size_t unpack(const unicode_t string, CharacterProtocol& buffer); 00125 00133 static size_t pack(unicode_t unicode, CharacterProtocol& buffer, size_t size); 00134 00138 static ucs4_t *udup(const char *string); 00139 00143 static ucs2_t *wdup(const char *string); 00144 00152 static const char *find(const char *string, ucs4_t character, size_t start = 0); 00153 00161 static const char *rfind(const char *string, ucs4_t character, size_t end = (size_t)-1l); 00162 00169 static unsigned ccount(const char *string, ucs4_t character); 00170 00176 static ucs4_t get(CharacterProtocol& buffer); 00177 00184 static ucs4_t put(ucs4_t character, CharacterProtocol& buffer); 00185 }; 00186 00193 class __EXPORT UString : public String, public utf8 00194 { 00195 protected: 00199 UString(); 00200 00205 UString(strsize_t size); 00206 00211 UString(const unicode_t text); 00212 00219 UString(const char *text, strsize_t size); 00220 00227 UString(const unicode_t *text, const unicode_t *end); 00228 00234 UString(const UString& existing); 00235 00240 virtual ~UString(); 00241 00248 UString get(strsize_t codepoint, strsize_t size = 0) const; 00249 00256 size_t get(unicode_t unicode, size_t size) const; 00257 00262 void set(const unicode_t unicode); 00263 00268 void add(const unicode_t unicode); 00269 00275 ucs4_t at(int position) const; 00276 00283 inline size_t operator()(unicode_t unicode, size_t size) const 00284 {return get(unicode, size);} 00285 00292 UString operator()(int codepoint, strsize_t size) const; 00293 00299 inline UString left(strsize_t size) const 00300 {return operator()(0, size);} 00301 00307 inline UString right(strsize_t offset) const 00308 {return operator()(-((int)offset), 0);} 00309 00316 inline UString copy(strsize_t offset, strsize_t size) const 00317 {return operator()((int)offset, size);} 00318 00324 void cut(strsize_t offset, strsize_t size = 0); 00325 00332 void paste(strsize_t offset, const char *text, strsize_t size = 0); 00333 00341 const char *operator()(int offset) const; 00342 00348 inline ucs4_t operator[](int position) const 00349 {return UString::at(position);} 00350 00355 inline strsize_t count(void) const 00356 {return utf8::count(str->text);} 00357 00363 unsigned ccount(ucs4_t character) const; 00364 00371 const char *find(ucs4_t character, strsize_t start = 0) const; 00372 00379 const char *rfind(ucs4_t character, strsize_t end = npos) const; 00380 }; 00381 00387 class __EXPORT utf8_pointer 00388 { 00389 protected: 00390 uint8_t *text; 00391 00392 public: 00396 utf8_pointer(); 00397 00402 utf8_pointer(const char *string); 00403 00408 utf8_pointer(const utf8_pointer& copy); 00409 00414 utf8_pointer& operator ++(); 00415 00420 utf8_pointer& operator --(); 00421 00427 utf8_pointer& operator +=(long offset); 00428 00434 utf8_pointer& operator -=(long offset); 00435 00441 utf8_pointer operator+(long offset) const; 00442 00448 utf8_pointer operator-(long offset) const; 00449 00454 inline operator bool() const 00455 {return text != NULL;} 00456 00461 inline bool operator!() const 00462 {return text == NULL;} 00463 00469 ucs4_t operator[](long codepoint) const; 00470 00476 utf8_pointer& operator=(const char *string); 00477 00481 void inc(void); 00482 00486 void dec(void); 00487 00493 inline bool operator==(const char *string) const 00494 {return (const char *)text == string;} 00495 00501 inline bool operator!=(const char *string) const 00502 {return (const char *)text != string;} 00503 00508 inline ucs4_t operator*() const 00509 {return utf8::codepoint((const char *)text);} 00510 00515 inline char *c_str(void) const 00516 {return (char *)text;} 00517 00522 inline operator char*() const 00523 {return (char *)text;} 00524 00529 inline size_t len(void) const 00530 {return utf8::count((const char *)text);} 00531 }; 00532 00533 inline ucs4_t *strudup(const char *string) 00534 {return utf8::udup(string);} 00535 00536 inline ucs2_t *strwdup(const char *string) 00537 {return utf8::wdup(string);} 00538 00539 __EXPORT unicode_t unidup(const char *string); 00540 00541 template<> 00542 inline void dupfree<ucs2_t*>(ucs2_t *string) 00543 {::free(string);} 00544 00545 template<> 00546 inline void dupfree<ucs4_t*>(ucs4_t *string) 00547 {::free(string);} 00548 00549 template<> 00550 inline void dupfree<unicode_t>(unicode_t string) 00551 {::free(string);} 00552 00556 typedef UString ustring_t; 00557 00561 typedef utf8_pointer utf8_t; 00562 00563 } // namespace ucommon 00564 00565 #endif