ucommon
unicode.h
Go to the documentation of this file.
1 // Copyright (C) 2009-2014 David Sugar, Tycho Softworks.
2 //
3 // This file is part of GNU uCommon C++.
4 //
5 // GNU uCommon C++ is free software: you can redistribute it and/or modify
6 // it under the terms of the GNU Lesser General Public License as published
7 // by the Free Software Foundation, either version 3 of the License, or
8 // (at your option) any later version.
9 //
10 // GNU uCommon C++ is distributed in the hope that it will be useful,
11 // but WITHOUT ANY WARRANTY; without even the implied warranty of
12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 // GNU Lesser General Public License for more details.
14 //
15 // You should have received a copy of the GNU Lesser General Public License
16 // along with GNU uCommon C++. If not, see <http://www.gnu.org/licenses/>.
17 
32 #ifndef _UCOMMON_UNICODE_H_
33 #define _UCOMMON_UNICODE_H_
34 
35 #ifndef _UCOMMON_STRING_H_
36 #include <ucommon/string.h>
37 #endif
38 
39 #ifdef nil
40 #undef nil
41 #endif
42 
43 namespace ucommon {
44 
/**
 * 32 bit unicode character code.
 * Declared as a signed 32 bit integer (int32_t).
 */
49 typedef int32_t ucs4_t;
50 
/**
 * 16 bit unicode character code.
 * Declared as a signed 16 bit integer (int16_t).
 */
54 typedef int16_t ucs2_t;
55 
/**
 * Resolves issues where wchar_t is not defined.
 * This is an untyped pointer (void *); the width of the character codes it
 * refers to is not encoded in the type.
 */
59 typedef void *unicode_t;
60 
/**
 * A core class of utf8 encoded string functions.
 * Every member declared here is static; only declarations are visible in
 * this header, the definitions live elsewhere.
 */
66 class __EXPORT utf8
67 {
68 public:
    /** Size of "unicode_t" character codes, may not be ucs4_t size. */
72  static const unsigned ucsize;
73 
    /** A convenient NULL pointer value. */
77  static const char *nil;
78 
    /**
     * NOTE(review): presumably the number of bytes in the utf8 sequence that
     * starts at codepoint -- confirm against the implementation.
     */
84  static unsigned size(const char *codepoint);
85 
    /** Count utf8 encoded ucs4 codepoints in string. */
91  static size_t count(const char *string);
92 
    /**
     * NOTE(review): presumably returns a pointer to the codepoint at the given
     * position inside string. position is signed (ssize_t); how negative
     * values are treated is not visible here -- TODO confirm.
     */
99  static char *offset(char *string, ssize_t position);
100 
    /** Convert a utf8 encoded codepoint to a ucs4 character value. */
106  static ucs4_t codepoint(const char *encoded);
107 
    /**
     * NOTE(review): presumably the number of utf8 bytes needed to encode the
     * given unicode string -- confirm against the implementation.
     */
113  static size_t chars(const unicode_t string);
114 
    /**
     * NOTE(review): presumably the number of utf8 bytes needed to encode a
     * single ucs4 character -- confirm against the implementation.
     */
120  static size_t chars(ucs4_t character);
121 
    /**
     * NOTE(review): presumably converts a unicode string and writes it through
     * the character protocol; meaning of the returned size_t is not visible
     * here -- TODO confirm.
     */
128  static size_t unpack(const unicode_t string, CharacterProtocol& buffer);
129 
    /**
     * NOTE(review): presumably reads from the character protocol into the
     * unicode buffer, limited by size; units of size (bytes vs. characters)
     * are not visible here -- TODO confirm.
     */
137  static size_t pack(unicode_t unicode, CharacterProtocol& buffer, size_t size);
138 
    /** Dup a utf8 string into a ucs4_t string. */
142  static ucs4_t *udup(const char *string);
143 
    /** Dup a utf8 string into a ucs2_t representation. */
147  static ucs2_t *wdup(const char *string);
148 
    /**
     * NOTE(review): presumably searches string for character, beginning at
     * start (defaults to 0) -- confirm units of start and the not-found result.
     */
156  static const char *find(const char *string, ucs4_t character, size_t start = 0);
157 
    /**
     * NOTE(review): presumably the reverse counterpart of find(). The default
     * for end is (size_t)-1l, i.e. the largest size_t value.
     */
165  static const char *rfind(const char *string, ucs4_t character, size_t end = (size_t)-1l);
166 
    /**
     * NOTE(review): presumably counts occurrences of character in string --
     * confirm against the implementation.
     */
173  static unsigned ccount(const char *string, ucs4_t character);
174 
    /**
     * NOTE(review): presumably reads one ucs4 character from the character
     * protocol; end-of-input/error value is not visible here -- TODO confirm.
     */
180  static ucs4_t get(CharacterProtocol& buffer);
181 
    /**
     * NOTE(review): presumably writes one ucs4 character to the character
     * protocol; meaning of the returned ucs4_t is not visible here -- TODO confirm.
     */
188  static ucs4_t put(ucs4_t character, CharacterProtocol& buffer);
189 };
190 
/**
 * A copy-on-write utf8 string class that operates by reference count.
 * Derives publicly from both String and utf8.
 *
 * NOTE(review): `protected:` is the only access label in this class, so every
 * constructor, the destructor and every method below is inaccessible to
 * outside callers -- confirm whether a `public:` label is missing.
 */
197 class __EXPORT UString : public String, public utf8
198 {
199 protected:
    /** Default constructor. */
203  UString();
204 
    /**
     * NOTE(review): presumably creates a string with a buffer of the given
     * size -- confirm against the implementation.
     */
209  UString(strsize_t size);
210 
    /** Construct from a unicode_t text value. */
215  UString(const unicode_t text);
216 
    /**
     * Construct from a character string and a size.
     * NOTE(review): whether size is a byte or a codepoint limit is not visible
     * here -- TODO confirm.
     */
223  UString(const char *text, strsize_t size);
224 
    /**
     * Construct from a pair of unicode_t pointers.
     * NOTE(review): presumably a [text, end) range -- confirm whether end is
     * inclusive.
     */
231  UString(const unicode_t *text, const unicode_t *end);
232 
    /** Copy constructor. */
238  UString(const UString& existing);
239 
    /** Virtual destructor. */
244  virtual ~UString();
245 
    /**
     * NOTE(review): presumably extracts a sub-string starting at codepoint;
     * size defaults to 0, whose meaning is not visible here -- TODO confirm.
     */
252  UString get(strsize_t codepoint, strsize_t size = 0) const;
253 
    /**
     * Two-argument form that operator()(unicode_t, size_t) forwards to.
     * NOTE(review): presumably fills unicode with up to size units and returns
     * the amount written -- confirm.
     */
260  size_t get(unicode_t unicode, size_t size) const;
261 
    /** NOTE(review): presumably replaces the content from unicode -- confirm. */
266  void set(const unicode_t unicode);
267 
    /** NOTE(review): presumably appends unicode to the content -- confirm. */
272  void add(const unicode_t unicode);
273 
    /** Return unicode character found at a specific codepoint in the string. */
279  ucs4_t at(int position) const;
280 
    /**
     * Extract a unicode byte sequence from utf8 object.
     * Forwards both arguments unchanged to get(unicode, size).
     */
287  inline size_t operator()(unicode_t unicode, size_t size) const
288  {return get(unicode, size);}
289 
    /**
     * Sub-string operator used by left(), right() and copy().
     * codepoint is signed; right() passes a negative value.
     */
296  UString operator()(int codepoint, strsize_t size) const;
297 
    /**
     * Convenience method for left of string.
     * Calls operator()(0, size).
     */
303  inline UString left(strsize_t size) const
304  {return operator()(0, size);}
305 
    /**
     * Convenience method for right of string.
     * offset is converted to int and negated, so operator() receives a
     * negative codepoint index together with a size of 0.
     */
311  inline UString right(strsize_t offset) const
312  {return operator()(-((int)offset), 0);}
313 
    /**
     * Convenience method for substring extraction.
     * Calls operator()((int)offset, size).
     */
320  inline UString copy(strsize_t offset, strsize_t size) const
321  {return operator()((int)offset, size);}
322 
    /**
     * NOTE(review): presumably removes text starting at offset; size defaults
     * to 0, whose meaning is not visible here -- TODO confirm.
     */
328  void cut(strsize_t offset, strsize_t size = 0);
329 
    /**
     * NOTE(review): presumably inserts text at offset; size defaults to 0,
     * whose meaning is not visible here -- TODO confirm.
     */
336  void paste(strsize_t offset, const char *text, strsize_t size = 0);
337 
    /**
     * NOTE(review): presumably returns a pointer into the utf8 text at the
     * given offset -- confirm units of offset and the out-of-range result.
     */
345  const char *operator()(int offset) const;
346 
    /**
     * Reference a unicode character in string object by array offset.
     * Calls UString::at(position) with an explicit class qualifier.
     */
352  inline ucs4_t operator[](int position) const
353  {return UString::at(position);}
354 
    /**
     * Count codepoints in current string.
     * Passes str->text to utf8::count(); str is not declared in this class
     * (presumably inherited from String) and is dereferenced without a check.
     */
359  inline strsize_t count(void) const
360  {return utf8::count(str->text);}
361 
    /**
     * NOTE(review): presumably counts occurrences of character in this string
     * -- confirm against the implementation.
     */
367  unsigned ccount(ucs4_t character) const;
368 
    /**
     * NOTE(review): presumably searches this string for character beginning at
     * start (defaults to 0) -- confirm.
     */
375  const char *find(ucs4_t character, strsize_t start = 0) const;
376 
    /**
     * NOTE(review): presumably the reverse counterpart of find(). end defaults
     * to npos, which is not declared in this class (presumably inherited).
     */
383  const char *rfind(ucs4_t character, strsize_t end = npos) const;
384 };
385 
/**
 * Pointer to utf8 encoded character data.
 * Wraps a single uint8_t pointer and declares pointer-like operators on it.
 */
391 class __EXPORT utf8_pointer
392 {
393 protected:
    /* The wrapped pointer; NULL is treated as "invalid" by operator bool / operator!. */
394  uint8_t *text;
395 
396 public:
    /** Default constructor. */
400  utf8_pointer();
401 
    /** Construct from a character string pointer. */
406  utf8_pointer(const char *string);
407 
    /*
     * NOTE(review): the embedded line numbers jump from 407 to 413 with two
     * consecutive blank lines; a declaration may have been dropped from this
     * listing -- verify against the real header.
     */
413 
    /** Pre-increment. NOTE(review): presumably advances by one codepoint -- confirm. */
418  utf8_pointer& operator ++();
419 
    /** Pre-decrement. NOTE(review): presumably steps back by one codepoint -- confirm. */
424  utf8_pointer& operator --();
425 
    /** NOTE(review): presumably advances by offset codepoints -- confirm. */
431  utf8_pointer& operator +=(long offset);
432 
    /** NOTE(review): presumably steps back by offset codepoints -- confirm. */
438  utf8_pointer& operator -=(long offset);
439 
    /** Returns a new utf8_pointer by value; this object is not modified (const). */
445  utf8_pointer operator+(long offset) const;
446 
    /** Returns a new utf8_pointer by value; this object is not modified (const). */
452  utf8_pointer operator-(long offset) const;
453 
    /** True when the wrapped pointer is not NULL. */
458  inline operator bool() const
459  {return text != NULL;}
460 
    /** Check if text is an invalid pointer (true when it is NULL). */
465  inline bool operator!() const
466  {return text == NULL;}
467 
    /**
     * NOTE(review): presumably returns the ucs4 character at the given
     * codepoint index relative to the pointer -- confirm.
     */
473  ucs4_t operator[](long codepoint) const;
474 
    /** Assign from a character string pointer. */
480  utf8_pointer& operator=(const char *string);
481 
    /** NOTE(review): presumably the named form of operator++ -- confirm. */
485  void inc(void);
486 
    /** NOTE(review): presumably the named form of operator-- -- confirm. */
490  void dec(void);
491 
    /**
     * Check if pointer equals another string.
     * Compares the pointer addresses only, not the string contents.
     */
497  inline bool operator==(const char *string) const
498  {return (const char *)text == string;}
499 
    /**
     * Check if pointer does not equal another string.
     * Compares the pointer addresses only, not the string contents.
     */
505  inline bool operator!=(const char *string) const
506  {return (const char *)text != string;}
507 
    /**
     * Get unicode character pointed to by pointer.
     * Passes text to utf8::codepoint() without a NULL check.
     */
512  inline ucs4_t operator*() const
513  {return utf8::codepoint((const char *)text);}
514 
    /** Get c string we point to (text cast to char *). */
519  inline char *c_str(void) const
520  {return (char *)text;}
521 
    /** Implicit conversion to char *; returns the same value as c_str(). */
526  inline operator char*() const
527  {return (char *)text;}
528 
    /**
     * Get length of null terminated utf8 string in codepoints.
     * Passes text to utf8::count() without a NULL check.
     */
533  inline size_t len(void) const
534  {return utf8::count((const char *)text);}
535 };
536 
/**
 * Free-function shorthand for utf8::udup(): dup a utf8 string into a
 * ucs4_t string.
 * NOTE(review): presumably the result is to be released with dupfree()/free
 * -- confirm how utf8::udup allocates.
 */
537 inline ucs4_t *strudup(const char *string)
538  {return utf8::udup(string);}
539 
/**
 * Free-function shorthand for utf8::wdup(): dup a utf8 string into a
 * ucs2_t representation.
 * NOTE(review): presumably the result is to be released with dupfree()/free
 * -- confirm how utf8::wdup allocates.
 */
540 inline ucs2_t *strwdup(const char *string)
541  {return utf8::wdup(string);}
542 
/**
 * Exported declaration only; the definition is not in this header.
 * NOTE(review): presumably duplicates a utf8 string into a unicode_t buffer
 * -- confirm against the implementation.
 */
543 __EXPORT unicode_t unidup(const char *string);
544 
/**
 * Explicit specialization of dupfree for ucs2_t pointers; hands the pointer
 * to the global ::free().
 * The primary dupfree template is not declared in this header (presumably it
 * comes from ucommon/string.h).
 */
545 template<>
546 inline void dupfree<ucs2_t*>(ucs2_t *string)
547  {::free(string);}
548 
/**
 * Explicit specialization of dupfree for ucs4_t pointers; hands the pointer
 * to the global ::free().
 * The primary dupfree template is not declared in this header (presumably it
 * comes from ucommon/string.h).
 */
549 template<>
550 inline void dupfree<ucs4_t*>(ucs4_t *string)
551  {::free(string);}
552 
/**
 * Explicit specialization of dupfree for unicode_t (a void * typedef); hands
 * the pointer to the global ::free().
 * The primary dupfree template is not declared in this header (presumably it
 * comes from ucommon/string.h).
 */
553 template<>
554 inline void dupfree<unicode_t>(unicode_t string)
555  {::free(string);}
556 
560 typedef UString ustring_t;
561 
565 typedef utf8_pointer utf8_t;
566 
567 } // namespace ucommon
568 
569 #endif
bool operator==(const char *string) const
Check if pointer equals another string.
Definition: unicode.h:497
utf8_pointer utf8_t
Convenience type for utf8_pointer strings.
Definition: unicode.h:565
static ucs4_t codepoint(const char *encoded)
Convert a utf8 encoded codepoint to a ucs4 character value.
strsize_t count(void) const
Count codepoints in current string.
Definition: unicode.h:359
size_t len(void) const
Get length of null terminated utf8 string in codepoints.
Definition: unicode.h:533
A core class of utf8 encoded string functions.
Definition: unicode.h:66
int32_t ucs4_t
32 bit unicode character code.
Definition: unicode.h:49
A copy-on-write string class that operates by reference count.
Definition: string.h:82
Common namespace for all ucommon objects.
Definition: access.h:46
UString left(strsize_t size) const
Convenience method for left of string.
Definition: unicode.h:303
size_t operator()(unicode_t unicode, size_t size) const
Extract a unicode byte sequence from utf8 object.
Definition: unicode.h:287
static const char * nil
A convenient NULL pointer value.
Definition: unicode.h:77
UString right(strsize_t offset) const
Convenience method for right of string.
Definition: unicode.h:311
ucs4_t operator*() const
Get unicode character pointed to by pointer.
Definition: unicode.h:512
ucs4_t at(int position) const
Return unicode character found at a specific codepoint in the string.
UString copy(strsize_t offset, strsize_t size) const
Convenience method for substring extraction.
Definition: unicode.h:320
void start(JoinableThread *thread, int priority=0)
Convenience function to start a joinable thread.
Definition: thread.h:1870
bool operator!() const
Check if text is an invalid pointer.
Definition: unicode.h:465
A common string class and character string support functions.
static ucs2_t * wdup(const char *string)
Dup a utf8 string into a ucs2_t representation.
void * unicode_t
Resolves issues where wchar_t is not defined.
Definition: unicode.h:59
A copy-on-write utf8 string class that operates by reference count.
Definition: unicode.h:197
static ucs4_t * udup(const char *string)
Dup a utf8 string into a ucs4_t string.
unsigned short strsize_t
A convenience type for the size of strings.
Definition: string.h:70
ObjectProtocol * copy(ObjectProtocol *object)
Convenience function to access object copy.
Definition: object.h:479
static const unsigned ucsize
Size of "unicode_t" character codes, may not be ucs4_t size.
Definition: unicode.h:72
bool operator!=(const char *string) const
Check if pointer does not equal another string.
Definition: unicode.h:505
ucs4_t operator[](int position) const
Reference a unicode character in string object by array offset.
Definition: unicode.h:352
Pointer to utf8 encoded character data.
Definition: unicode.h:391
UString ustring_t
Convenience type for utf8 encoded strings.
Definition: unicode.h:560
static size_t count(const char *string)
Count utf8 encoded ucs4 codepoints in string.
char * c_str(void) const
Get c string we point to.
Definition: unicode.h:519
Common character processing protocol.
Definition: protocols.h:174
int16_t ucs2_t
16 bit unicode character code.
Definition: unicode.h:54