ucommon
ucommon/unicode.h
Go to the documentation of this file.
00001 // Copyright (C) 2009-2014 David Sugar, Tycho Softworks.
00002 //
00003 // This file is part of GNU uCommon C++.
00004 //
00005 // GNU uCommon C++ is free software: you can redistribute it and/or modify
00006 // it under the terms of the GNU Lesser General Public License as published
00007 // by the Free Software Foundation, either version 3 of the License, or
00008 // (at your option) any later version.
00009 //
00010 // GNU uCommon C++ is distributed in the hope that it will be useful,
00011 // but WITHOUT ANY WARRANTY; without even the implied warranty of
00012 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00013 // GNU Lesser General Public License for more details.
00014 //
00015 // You should have received a copy of the GNU Lesser General Public License
00016 // along with GNU uCommon C++.  If not, see <http://www.gnu.org/licenses/>.
00017 
00032 #ifndef _UCOMMON_UNICODE_H_
00033 #define _UCOMMON_UNICODE_H_
00034 
00035 #ifndef _UCOMMON_STRING_H_
00036 #include <ucommon/string.h>
00037 #endif
00038 
00039 namespace ucommon {
00040 
// Signed 32-bit integer alias. Presumably holds one UCS-4 code point per
// value (inferred from the name only -- confirm against the implementation).
00045 typedef int32_t ucs4_t;
00046 
// Signed 16-bit integer alias. Presumably holds one UCS-2 code unit per
// value (inferred from the name only -- confirm against the implementation).
00050 typedef int16_t ucs2_t;
00051 
// Untyped pointer alias used wherever this header passes a "unicode" buffer.
// The width of the elements behind the pointer is not expressed in the type.
// NOTE(review): presumably the element width is what utf8::ucsize reports --
// confirm against the implementation.
00055 typedef void *unicode_t;
00056 
// Static-only helper class for UTF-8 encoded C strings and the ucs4_t /
// ucs2_t / unicode_t types. Every member is static and is only declared
// here; the definitions live outside this header. Remarks that go beyond
// what the signatures show are inferred from names, are marked "presumably",
// and should be confirmed against the implementation.
00062 class __EXPORT utf8
00063 {
00064 public:
          // Unsigned constant whose value is set elsewhere. Presumably the
          // byte size of one character behind a unicode_t buffer.
00068     static const unsigned ucsize;
00069 
          // Static pointer to const char; the pointer itself is not const.
          // Its value is set elsewhere.
00073     static const char *nil;
00074 
          // Takes a pointer to encoded text and returns an unsigned value;
          // presumably the byte length of the code point starting there.
00080     static unsigned size(const char *codepoint);
00081 
          // Takes a C string and returns a size_t; presumably the number of
          // code points rather than bytes. Called by UString::count() and
          // utf8_pointer::len() in this header.
00087     static size_t count(const char *string);
00088 
          // Returns a char* for a signed position within `string`. The signed
          // parameter type suggests negative positions are meaningful --
          // confirm.
00095     static char *offset(char *string, ssize_t position);
00096 
          // Returns a ucs4_t for the encoded text at `encoded`; presumably
          // the decoded code point. Called by utf8_pointer::operator*().
00102     static ucs4_t codepoint(const char *encoded);
00103 
          // Overload taking a whole unicode_t buffer; presumably the number
          // of chars needed to hold it as UTF-8.
00109     static size_t chars(const unicode_t string);
00110 
          // Overload taking a single ucs4_t value; presumably the number of
          // chars needed to hold that one character as UTF-8.
00116     static size_t chars(ucs4_t character);
00117 
          // Takes a unicode_t buffer and a CharacterProtocol (declared
          // elsewhere) and returns a size_t. Presumably writes `string` to
          // `buffer` as UTF-8 -- confirm direction and return meaning.
00124     static size_t unpack(const unicode_t string, CharacterProtocol& buffer);
00125 
          // Takes a unicode_t buffer, a CharacterProtocol and a size, and
          // returns a size_t. Presumably the inverse of unpack(), filling
          // `unicode` with at most `size` characters -- confirm.
00133     static size_t pack(unicode_t unicode, CharacterProtocol& buffer, size_t size);
00134 
          // Returns a ucs4_t* built from a C string. Ownership of the result
          // is not visible here; NOTE(review): presumably heap-allocated and
          // released by the caller -- confirm.
00138     static ucs4_t *udup(const char *string);
00139 
          // Returns a ucs2_t* built from a C string; same ownership caveat
          // as udup().
00143     static ucs2_t *wdup(const char *string);
00144 
          // Returns a pointer into const text for a ucs4_t character;
          // `start` defaults to 0. Presumably a forward search.
00152     static const char *find(const char *string, ucs4_t character, size_t start = 0);
00153 
          // Counterpart of find(); `end` defaults to (size_t)-1l, which is
          // the largest size_t value. Presumably a backward search.
00161     static const char *rfind(const char *string, ucs4_t character, size_t end = (size_t)-1l);
00162 
          // Returns an unsigned value for a string and a ucs4_t character;
          // presumably the number of occurrences of that character.
00169     static unsigned ccount(const char *string, ucs4_t character);
00170 
          // Returns a ucs4_t obtained through a CharacterProtocol; presumably
          // reads and decodes one UTF-8 character from `buffer`.
00176     static ucs4_t get(CharacterProtocol& buffer);
00177 
          // Takes a ucs4_t and a CharacterProtocol and returns a ucs4_t;
          // presumably writes `character` to `buffer` as UTF-8. The meaning
          // of the return value is not visible here.
00184     static ucs4_t put(ucs4_t character, CharacterProtocol& buffer);
00185 };
00186 
// String subclass that also inherits the utf8 static helpers. Several
// members (get, count, ccount, find, rfind) reuse names that utf8 also
// declares, which is why the inline bodies below qualify the utf8 calls.
// NOTE(review): every member shown follows the protected: label and no
// public: label appears in this class as listed, so the constructors and
// operations are only reachable from within the class hierarchy -- confirm
// this is intended.
00193 class __EXPORT UString : public String, public utf8
00194 {
00195 protected:
          // Default constructor; declared only.
00199     UString();
00200 
          // Constructs from a strsize_t; presumably a capacity to reserve.
00205     UString(strsize_t size);
00206 
          // Constructs from a unicode_t (void*) buffer.
00211     UString(const unicode_t text);
00212 
          // Constructs from a C string plus a strsize_t size.
00219     UString(const char *text, strsize_t size);
00220 
          // Constructs from a pair of pointers to unicode_t; presumably a
          // [text, end) range.
00227     UString(const unicode_t *text, const unicode_t *end);
00228 
          // Copy constructor.
00234     UString(const UString& existing);
00235 
          // Virtual destructor.
00240     virtual ~UString();
00241 
          // Returns a UString by value for a code point position and a size
          // that defaults to 0. Presumably a substring counted in code
          // points; the meaning of size 0 is not visible here.
00248     UString get(strsize_t codepoint, strsize_t size = 0) const;
00249 
          // Overload taking a unicode_t buffer and a size, returning size_t.
          // Presumably fills `unicode` from this string -- confirm.
00256     size_t get(unicode_t unicode, size_t size) const;
00257 
          // Takes a unicode_t buffer; presumably replaces the content.
00262     void set(const unicode_t unicode);
00263 
          // Takes a unicode_t buffer; presumably appends to the content.
00268     void add(const unicode_t unicode);
00269 
          // Returns a ucs4_t for a signed position. Called by operator[].
00275     ucs4_t at(int position) const;
00276 
          // Call-operator spelling of get(unicode, size): forwards both
          // arguments and returns the result unchanged.
00283     inline size_t operator()(unicode_t unicode, size_t size) const
00284         {return get(unicode, size);}
00285 
          // Returns a UString by value for a signed code point position and
          // a size. Declared only; left(), right() and copy() all forward
          // to this overload.
00292     UString operator()(int codepoint, strsize_t size) const;
00293 
          // Forwards to operator()(int, strsize_t) with position 0 and the
          // given size.
00299     inline UString left(strsize_t size) const
00300         {return operator()(0, size);}
00301 
          // Forwards to operator()(int, strsize_t) with the negated offset
          // (after a cast to int) and size 0.
00307     inline UString right(strsize_t offset) const
00308         {return operator()(-((int)offset), 0);}
00309 
          // Forwards to operator()(int, strsize_t) with the offset cast to
          // int and the given size.
00316     inline UString copy(strsize_t offset, strsize_t size) const
00317         {return operator()((int)offset, size);}
00318 
          // Takes an offset and a size that defaults to 0; presumably
          // removes that span from the string.
00324     void cut(strsize_t offset, strsize_t size = 0);
00325 
          // Takes an offset, a C string and a size that defaults to 0;
          // presumably inserts `text` at `offset`.
00332     void paste(strsize_t offset, const char *text, strsize_t size = 0);
00333 
          // Single-argument call operator: returns a pointer to const char
          // for a signed offset.
00341     const char *operator()(int offset) const;
00342 
          // Subscript spelling of at(): forwards `position` through an
          // explicitly qualified UString::at call.
00348     inline ucs4_t operator[](int position) const
00349         {return UString::at(position);}
00350 
          // Passes str->text to utf8::count and returns the result, which is
          // converted from size_t to strsize_t. `str` is not declared in
          // this header (presumably inherited from String).
          // NOTE(review): str is dereferenced without a null check --
          // confirm it can never be null when this is called.
00355     inline strsize_t count(void) const
00356         {return utf8::count(str->text);}
00357 
          // Member counterpart of utf8::ccount without the string argument.
00363     unsigned ccount(ucs4_t character) const;
00364 
          // Member counterpart of utf8::find; `start` defaults to 0.
00371     const char *find(ucs4_t character, strsize_t start = 0) const;
00372 
          // Member counterpart of utf8::rfind; `end` defaults to npos,
          // which is not declared in this header.
00379     const char *rfind(ucs4_t character, strsize_t end = npos) const;
00380 };
00381 
// Pointer-like wrapper around a single uint8_t* member (`text`). The inline
// members below hand that pointer to the utf8 static helpers; the remaining
// members are declared only, and remarks on them marked "presumably" are
// inferred from the signatures.
00387 class __EXPORT utf8_pointer
00388 {
00389 protected:
          // The wrapped address; compared against NULL by operator bool
          // and operator!.
00390     uint8_t *text;
00391 
00392 public:
          // Default constructor; declared only.
00396     utf8_pointer();
00397 
          // Constructs from a C string pointer.
00402     utf8_pointer(const char *string);
00403 
          // Copy constructor.
00408     utf8_pointer(const utf8_pointer& copy);
00409 
          // Prefix increment (no dummy int parameter); returns a reference.
          // Presumably advances by one code point -- confirm.
00414     utf8_pointer& operator ++();
00415 
          // Prefix decrement; returns a reference. Presumably steps back by
          // one code point -- confirm.
00420     utf8_pointer& operator --();
00421 
          // Compound add with a signed long offset; returns a reference.
00427     utf8_pointer& operator +=(long offset);
00428 
          // Compound subtract with a signed long offset; returns a reference.
00434     utf8_pointer& operator -=(long offset);
00435 
          // Returns a new utf8_pointer by value; the object itself is const.
00441     utf8_pointer operator+(long offset) const;
00442 
          // Returns a new utf8_pointer by value; the object itself is const.
00448     utf8_pointer operator-(long offset) const;
00449 
          // True when the wrapped pointer is not NULL.
00454     inline operator bool() const
00455         {return text != NULL;}
00456 
          // True when the wrapped pointer is NULL.
00461     inline bool operator!() const
00462         {return text == NULL;}
00463 
          // Returns a ucs4_t for a signed long index; presumably the code
          // point at that index relative to the current position.
00469     ucs4_t operator[](long codepoint) const;
00470 
          // Assignment from a C string pointer; returns a reference.
00476     utf8_pointer& operator=(const char *string);
00477 
          // Named, void-returning step; presumably the same movement as
          // operator ++().
00481     void inc(void);
00482 
          // Named, void-returning step; presumably the same movement as
          // operator --().
00486     void dec(void);
00487 
          // Compares the wrapped address with `string`. This is pointer
          // identity, not a comparison of the characters.
00493     inline bool operator==(const char *string) const
00494         {return (const char *)text == string;}
00495 
          // Negation of the address comparison above.
00501     inline bool operator!=(const char *string) const
00502         {return (const char *)text != string;}
00503 
          // Passes the wrapped pointer to utf8::codepoint and returns the
          // resulting ucs4_t.
00508     inline  ucs4_t operator*() const
00509         {return utf8::codepoint((const char *)text);}
00510 
          // Returns the wrapped pointer cast to char*. The result is a
          // mutable pointer even though the member function is const.
00515     inline char *c_str(void) const
00516         {return (char *)text;}
00517 
          // Implicit conversion to char*; same cast as c_str().
00522     inline operator char*() const
00523         {return (char *)text;}
00524 
          // Passes the wrapped pointer to utf8::count and returns the
          // resulting size_t.
00529     inline size_t len(void) const
00530         {return utf8::count((const char *)text);}
00531 };
00532 
// Free-function shorthand: forwards `string` to utf8::udup and returns the
// resulting ucs4_t* unchanged.
00533 inline ucs4_t *strudup(const char *string)
00534     {return utf8::udup(string);}
00535 
// Free-function shorthand: forwards `string` to utf8::wdup and returns the
// resulting ucs2_t* unchanged.
00536 inline ucs2_t *strwdup(const char *string)
00537     {return utf8::wdup(string);}
00538 
// Exported free function, declared only: takes a C string and returns a
// unicode_t (an untyped pointer). Presumably a newly allocated copy of the
// string in the width that unicode_t stands for -- confirm allocation and
// ownership against the implementation.
00539 __EXPORT unicode_t unidup(const char *string);
00540 
// Explicit specialization of the dupfree template for ucs2_t*; the primary
// template is not declared in this header. Releases `string` with the
// global ::free.
00541 template<>
00542 inline void dupfree<ucs2_t*>(ucs2_t *string)
00543     {::free(string);}
00544 
// Explicit specialization of the dupfree template for ucs4_t*; the primary
// template is not declared in this header. Releases `string` with the
// global ::free.
00545 template<>
00546 inline void dupfree<ucs4_t*>(ucs4_t *string)
00547     {::free(string);}
00548 
// Explicit specialization of the dupfree template for unicode_t (void*);
// the primary template is not declared in this header. Releases `string`
// with the global ::free.
00549 template<>
00550 inline void dupfree<unicode_t>(unicode_t string)
00551     {::free(string);}
00552 
// Alternate, lower-case type name for the UString class.
00556 typedef UString ustring_t;
00557 
// Short type name for the utf8_pointer class (not for the utf8 helper class).
00561 typedef utf8_pointer utf8_t;
00562 
00563 } // namespace ucommon
00564 
00565 #endif