// UCommon
00001 // Copyright (C) 2009-2014 David Sugar, Tycho Softworks. 00002 // Copyright (C) 2015 Cherokees of Idaho. 00003 // 00004 // This file is part of GNU uCommon C++. 00005 // 00006 // GNU uCommon C++ is free software: you can redistribute it and/or modify 00007 // it under the terms of the GNU Lesser General Public License as published 00008 // by the Free Software Foundation, either version 3 of the License, or 00009 // (at your option) any later version. 00010 // 00011 // GNU uCommon C++ is distributed in the hope that it will be useful, 00012 // but WITHOUT ANY WARRANTY; without even the implied warranty of 00013 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00014 // GNU Lesser General Public License for more details. 00015 // 00016 // You should have received a copy of the GNU Lesser General Public License 00017 // along with GNU uCommon C++. If not, see <http://www.gnu.org/licenses/>. 00018 00033 #ifndef _UCOMMON_UNICODE_H_ 00034 #define _UCOMMON_UNICODE_H_ 00035 00036 #ifndef _UCOMMON_STRING_H_ 00037 #include <ucommon/string.h> 00038 #endif 00039 00040 #ifdef nil 00041 #undef nil 00042 #endif 00043 00044 namespace ucommon { 00045 00050 typedef int32_t ucs4_t; 00051 00055 typedef int16_t ucs2_t; 00056 00060 typedef void *unicode_t; 00061 00067 class __EXPORT utf8 00068 { 00069 public: 00073 static const unsigned ucsize; 00074 00078 static const char *nil; 00079 00085 static unsigned size(const char *codepoint); 00086 00092 static size_t count(const char *string); 00093 00100 static char *offset(char *string, ssize_t position); 00101 00107 static ucs4_t codepoint(const char *encoded); 00108 00114 static size_t chars(const unicode_t string); 00115 00121 static size_t chars(ucs4_t character); 00122 00129 static size_t unpack(const unicode_t string, CharacterProtocol& buffer); 00130 00138 static size_t pack(unicode_t unicode, CharacterProtocol& buffer, size_t size); 00139 00143 static ucs4_t *udup(const char *string); 00144 00148 static ucs2_t 
*wdup(const char *string); 00149 00157 static const char *find(const char *string, ucs4_t character, size_t start = 0); 00158 00166 static const char *rfind(const char *string, ucs4_t character, size_t end = (size_t)-1l); 00167 00174 static unsigned ccount(const char *string, ucs4_t character); 00175 00181 static ucs4_t get(CharacterProtocol& buffer); 00182 00189 static ucs4_t put(ucs4_t character, CharacterProtocol& buffer); 00190 }; 00191 00198 class __EXPORT UString : public String, public utf8 00199 { 00200 protected: 00204 UString(); 00205 00210 UString(strsize_t size); 00211 00216 UString(const unicode_t text); 00217 00224 UString(const char *text, strsize_t size); 00225 00232 UString(const unicode_t *text, const unicode_t *end); 00233 00239 UString(const UString& existing); 00240 00245 virtual ~UString(); 00246 00253 UString get(strsize_t codepoint, strsize_t size = 0) const; 00254 00261 size_t get(unicode_t unicode, size_t size) const; 00262 00267 void set(const unicode_t unicode); 00268 00273 void add(const unicode_t unicode); 00274 00280 ucs4_t at(int position) const; 00281 00288 inline size_t operator()(unicode_t unicode, size_t size) const 00289 {return get(unicode, size);} 00290 00297 UString operator()(int codepoint, strsize_t size) const; 00298 00304 inline UString left(strsize_t size) const 00305 {return operator()(0, size);} 00306 00312 inline UString right(strsize_t offset) const 00313 {return operator()(-((int)offset), 0);} 00314 00321 inline UString copy(strsize_t offset, strsize_t size) const 00322 {return operator()((int)offset, size);} 00323 00329 void cut(strsize_t offset, strsize_t size = 0); 00330 00337 void paste(strsize_t offset, const char *text, strsize_t size = 0); 00338 00346 const char *operator()(int offset) const; 00347 00353 inline ucs4_t operator[](int position) const 00354 {return UString::at(position);} 00355 00360 inline strsize_t count(void) const 00361 {return utf8::count(str->text);} 00362 00368 unsigned ccount(ucs4_t 
character) const; 00369 00376 const char *find(ucs4_t character, strsize_t start = 0) const; 00377 00384 const char *rfind(ucs4_t character, strsize_t end = npos) const; 00385 }; 00386 00392 class __EXPORT utf8_pointer 00393 { 00394 protected: 00395 uint8_t *text; 00396 00397 public: 00401 utf8_pointer(); 00402 00407 utf8_pointer(const char *string); 00408 00413 utf8_pointer(const utf8_pointer& copy); 00414 00419 utf8_pointer& operator ++(); 00420 00425 utf8_pointer& operator --(); 00426 00432 utf8_pointer& operator +=(long offset); 00433 00439 utf8_pointer& operator -=(long offset); 00440 00446 utf8_pointer operator+(long offset) const; 00447 00453 utf8_pointer operator-(long offset) const; 00454 00459 inline operator bool() const 00460 {return text != NULL;} 00461 00466 inline bool operator!() const 00467 {return text == NULL;} 00468 00474 ucs4_t operator[](long codepoint) const; 00475 00481 utf8_pointer& operator=(const char *string); 00482 00486 void inc(void); 00487 00491 void dec(void); 00492 00498 inline bool operator==(const char *string) const 00499 {return (const char *)text == string;} 00500 00506 inline bool operator!=(const char *string) const 00507 {return (const char *)text != string;} 00508 00513 inline ucs4_t operator*() const 00514 {return utf8::codepoint((const char *)text);} 00515 00520 inline char *c_str(void) const 00521 {return (char *)text;} 00522 00527 inline operator char*() const 00528 {return (char *)text;} 00529 00534 inline size_t len(void) const 00535 {return utf8::count((const char *)text);} 00536 }; 00537 00538 inline ucs4_t *strudup(const char *string) 00539 {return utf8::udup(string);} 00540 00541 inline ucs2_t *strwdup(const char *string) 00542 {return utf8::wdup(string);} 00543 00544 __EXPORT unicode_t unidup(const char *string); 00545 00546 template<> 00547 inline void dupfree<ucs2_t*>(ucs2_t *string) 00548 {::free(string);} 00549 00550 template<> 00551 inline void dupfree<ucs4_t*>(ucs4_t *string) 00552 {::free(string);} 00553 
00554 template<> 00555 inline void dupfree<unicode_t>(unicode_t string) 00556 {::free(string);} 00557 00561 typedef UString ustring_t; 00562 00566 typedef utf8_pointer utf8_t; 00567 00568 } // namespace ucommon 00569 00570 #endif