00001
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023 #ifndef XAPIAN_INCLUDED_UNICODE_H
00024 #define XAPIAN_INCLUDED_UNICODE_H
00025
00026 #include <xapian/visibility.h>
00027
00028 #include <string>
00029
00030 namespace Xapian {
00031
/** Forward iterator over a UTF-8 encoded byte sequence.
 *
 *  The iterator tracks a [p, end) byte range.  Once the whole range has been
 *  consumed (or if the range was empty to begin with) p is set to NULL, so
 *  every exhausted iterator compares equal to a default-constructed one.
 */
class XAPIAN_VISIBILITY_DEFAULT Utf8Iterator {
    /// Current position in the byte sequence, or NULL when at the end.
    const unsigned char *p;

    /// One past the last byte of the sequence.
    const unsigned char *end;

    /** Cached length in bytes of the sequence starting at p.
     *
     *  0 means "not calculated yet"; it is filled in lazily, which is why
     *  the member is mutable.
     */
    mutable unsigned seqlen;

    /// Work out the length of the sequence at p (defined out of line).
    void calculate_sequence_length() const;

    /// Helper defined out of line.
    unsigned get_char() const;

    /// Build an iterator from its raw parts (used by the post-increment).
    Utf8Iterator(const unsigned char *p_, const unsigned char *end_,
                 unsigned seqlen_)
        : p(p_), end(end_), seqlen(seqlen_) { }

  public:
    /** Return a pointer to the current position in the raw bytes.
     *
     *  When the iterator is at the end this is the end of the sequence.
     */
    const char * raw() const {
        return reinterpret_cast<const char *>(p ? p : end);
    }

    /// Return the number of bytes left in the sequence (0 at the end).
    size_t left() const { return p ? end - p : 0; }

    /** Point the iterator at a new byte sequence.
     *
     *  @param p_   Start of the bytes to iterate over.
     *  @param len  Number of bytes in the sequence.
     *
     *  If @a len is 0 the iterator becomes an end iterator.  All members
     *  are set in that case too, so raw() reports @a p_ rather than an
     *  uninitialised or stale pointer.
     */
    void assign(const char *p_, size_t len) {
        const unsigned char *start =
            reinterpret_cast<const unsigned char*>(p_);
        seqlen = 0;
        if (len) {
            p = start;
            end = start + len;
        } else {
            // Empty input: become an end iterator, but still record where
            // the (empty) sequence ends so raw() is well defined.
            p = NULL;
            end = start;
        }
    }

    /** Point the iterator at the contents of a std::string.
     *
     *  The string's storage is referenced, not copied.
     */
    void assign(const std::string &s) { assign(s.data(), s.size()); }

    /** Create an iterator from a pointer to character data.
     *
     *  Defined out of line.
     */
    explicit Utf8Iterator(const char *p_);

    /** Create an iterator over @a len bytes starting at @a p_. */
    Utf8Iterator(const char *p_, size_t len) { assign(p_, len); }

    /** Create an iterator over the contents of @a s.
     *
     *  Deliberately not explicit, so a std::string converts implicitly.
     */
    Utf8Iterator(const std::string &s) { assign(s.data(), s.size()); }

    /** Create an end iterator.
     *
     *  It compares equal to any iterator which has reached its end.
     */
    Utf8Iterator() : p(NULL), end(NULL), seqlen(0) { }

    /// Return the value at the current position (defined out of line).
    unsigned operator*() const;

    /** Post-increment: advance past the current sequence.
     *
     *  @return  An iterator at the position before the advance.
     */
    Utf8Iterator operator++(int) {
        // The length must be known before we can step over the sequence.
        if (seqlen == 0) calculate_sequence_length();
        const Utf8Iterator previous(p, end, seqlen);
        p += seqlen;
        // Reaching the end is signalled by a NULL position.
        if (p == end) p = NULL;
        seqlen = 0;
        return previous;
    }

    /// Pre-increment: advance past the current sequence.
    Utf8Iterator & operator++() {
        this->operator++(0);
        return *this;
    }

    /** Equality test.
     *
     *  Only the current positions are compared.
     */
    bool operator==(const Utf8Iterator &other) const { return p == other.p; }

    /// Inequality test (see operator==).
    bool operator!=(const Utf8Iterator &other) const { return p != other.p; }

    // Member types expected of a standard input iterator.
    typedef std::input_iterator_tag iterator_category;
    typedef unsigned value_type;
    typedef size_t difference_type;
    typedef const unsigned * pointer;
    typedef const unsigned & reference;
};
00180
00181 namespace Unicode {
00182
/** Character categories.
 *
 *  The numeric values are significant: the helpers in this header use them
 *  as bit positions in category masks and extract them from the low 5 bits
 *  of the packed character info, so they are spelled out explicitly here
 *  and must stay in the range 0..31.
 */
typedef enum {
    UNASSIGNED = 0,
    UPPERCASE_LETTER = 1,
    LOWERCASE_LETTER = 2,
    TITLECASE_LETTER = 3,
    MODIFIER_LETTER = 4,
    OTHER_LETTER = 5,
    NON_SPACING_MARK = 6,
    ENCLOSING_MARK = 7,
    COMBINING_SPACING_MARK = 8,
    DECIMAL_DIGIT_NUMBER = 9,
    LETTER_NUMBER = 10,
    OTHER_NUMBER = 11,
    SPACE_SEPARATOR = 12,
    LINE_SEPARATOR = 13,
    PARAGRAPH_SEPARATOR = 14,
    CONTROL = 15,
    FORMAT = 16,
    PRIVATE_USE = 17,
    SURROGATE = 18,
    CONNECTOR_PUNCTUATION = 19,
    DASH_PUNCTUATION = 20,
    OPEN_PUNCTUATION = 21,
    CLOSE_PUNCTUATION = 22,
    INITIAL_QUOTE_PUNCTUATION = 23,
    FINAL_QUOTE_PUNCTUATION = 24,
    OTHER_PUNCTUATION = 25,
    MATH_SYMBOL = 26,
    CURRENCY_SYMBOL = 27,
    MODIFIER_SYMBOL = 28,
    OTHER_SYMBOL = 29
} category;
00216
00217 namespace Internal {
00218
00219
00220
00221
00222
00223 XAPIAN_VISIBILITY_DEFAULT
00224 int get_character_info(unsigned ch);
00225
/** Extract the case type from packed character info.
 *
 *  The case type is stored in bits 5-7 of @a info, so the result is in the
 *  range 0..7.
 */
inline int get_case_type(int info) {
    const int CASE_TYPE_MASK = 0xe0;
    const int CASE_TYPE_SHIFT = 5;
    const int case_bits = info & CASE_TYPE_MASK;
    return case_bits >> CASE_TYPE_SHIFT;
}
00228
00230 inline category get_category(int info) { return static_cast<category>(info & 0x1f); }
00231
/** Extract the signed delta from packed character info.
 *
 *  The delta is everything above the low 15 bits of @a info, i.e. the value
 *  of @a info shifted right by 15 with sign extension.
 */
inline int get_delta(int info) {
    if (info < 0) {
        // Right-shifting a negative value is not guaranteed to sign-extend
        // on every compiler, so shift the (non-negative) complement instead
        // and complement the result back.
        const int complemented = ~info;
        return ~(complemented >> 15);
    }
    return info >> 15;
}
00241 }
00242
00252 XAPIAN_VISIBILITY_DEFAULT
00253 unsigned nonascii_to_utf8(unsigned ch, char * buf);
00254
00262 inline unsigned to_utf8(unsigned ch, char *buf) {
00263 if (ch < 128) {
00264 *buf = static_cast<unsigned char>(ch);
00265 return 1;
00266 }
00267 return Xapian::Unicode::nonascii_to_utf8(ch, buf);
00268 }
00269
00273 inline void append_utf8(std::string &s, unsigned ch) {
00274 char buf[4];
00275 s.append(buf, to_utf8(ch, buf));
00276 }
00277
00279 inline category get_category(unsigned ch) {
00280
00281 if (ch >= 0x110000) return Xapian::Unicode::UNASSIGNED;
00282 return Internal::get_category(Internal::get_character_info(ch));
00283 }
00284
00286 inline bool is_wordchar(unsigned ch) {
00287 const unsigned int WORDCHAR_MASK =
00288 (1 << Xapian::Unicode::UPPERCASE_LETTER) |
00289 (1 << Xapian::Unicode::LOWERCASE_LETTER) |
00290 (1 << Xapian::Unicode::TITLECASE_LETTER) |
00291 (1 << Xapian::Unicode::MODIFIER_LETTER) |
00292 (1 << Xapian::Unicode::OTHER_LETTER) |
00293 (1 << Xapian::Unicode::DECIMAL_DIGIT_NUMBER) |
00294 (1 << Xapian::Unicode::LETTER_NUMBER) |
00295 (1 << Xapian::Unicode::OTHER_NUMBER) |
00296 (1 << Xapian::Unicode::CONNECTOR_PUNCTUATION);
00297 return ((WORDCHAR_MASK >> get_category(ch)) & 1);
00298 }
00299
00301 inline bool is_whitespace(unsigned ch) {
00302 const unsigned int WHITESPACE_MASK =
00303 (1 << Xapian::Unicode::CONTROL) |
00304 (1 << Xapian::Unicode::SPACE_SEPARATOR) |
00305 (1 << Xapian::Unicode::LINE_SEPARATOR) |
00306 (1 << Xapian::Unicode::PARAGRAPH_SEPARATOR);
00307 return ((WHITESPACE_MASK >> get_category(ch)) & 1);
00308 }
00309
00311 inline bool is_currency(unsigned ch) {
00312 return (get_category(ch) == Xapian::Unicode::CURRENCY_SYMBOL);
00313 }
00314
00316 inline unsigned tolower(unsigned ch) {
00317 int info;
00318
00319 if (ch >= 0x110000 || !(Internal::get_case_type((info = Xapian::Unicode::Internal::get_character_info(ch))) & 2))
00320 return ch;
00321 return ch + Internal::get_delta(info);
00322 }
00323
00325 inline std::string
00326 tolower(const std::string &term)
00327 {
00328 std::string result;
00329 result.reserve(term.size());
00330 for (Utf8Iterator i(term); i != Utf8Iterator(); ++i) {
00331 append_utf8(result, tolower(*i));
00332 }
00333 return result;
00334 }
00335
00336 }
00337
00338 }
00339
00340 #endif // XAPIAN_INCLUDED_UNICODE_H