ICU 49.1.1  49.1.1
uniset.h
Go to the documentation of this file.
00001 /*
00002 ***************************************************************************
00003 * Copyright (C) 1999-2011, International Business Machines Corporation
00004 * and others. All Rights Reserved.
00005 ***************************************************************************
00006 *   Date        Name        Description
00007 *   10/20/99    alan        Creation.
00008 ***************************************************************************
00009 */
00010 
00011 #ifndef UNICODESET_H
00012 #define UNICODESET_H
00013 
00014 #include "unicode/unifilt.h"
00015 #include "unicode/unistr.h"
00016 #include "unicode/uset.h"
00017 
00023 U_NAMESPACE_BEGIN
00024 
00025 class BMPSet;                // type of the UnicodeSet::bmpSet member (held by pointer)
00026 class ParsePosition;         // passed by reference to the pattern constructor / applyPattern overloads
00027 class RBBIRuleScanner;       // declared a friend of UnicodeSet
00028 class SymbolTable;           // passed as const pointer to the pattern-parsing overloads
00029 class UnicodeSetStringSpan;  // type of the UnicodeSet::stringSpan member (held by pointer)
00030 class UVector;               // type of the UnicodeSet::strings member (held by pointer)
00031 class RuleCharacterIterator; // passed by reference to the private pattern-parsing helpers
00032 
00273 class U_COMMON_API UnicodeSet : public UnicodeFilter {
          // Set type derived from UnicodeFilter. Its state, declared just below,
          // is a UChar32 array (`list`) plus a UVector (`strings`).
          // NOTE(review): the gaps in the embedded listing numbers presumably mark
          // documentation comments that this listing leaves out; the notes added
          // here describe only what the declarations and the inline definitions
          // that follow the class show.
00274 
          // The members below precede any access specifier, so they are private
          // (the default for `class`).
00275     int32_t len; // length of list used; 0 <= len <= capacity
00276     int32_t capacity; // capacity of list
00277     UChar32* list; // MUST be terminated with HIGH
00278     BMPSet *bmpSet; // The set is frozen iff either bmpSet or stringSpan is not NULL.
00279     UChar32* buffer; // internal buffer, may be NULL
00280     int32_t bufferCapacity; // capacity of buffer
          // NOTE(review): presumably the length of `pat` below -- confirm in the implementation.
00281     int32_t patLen;
00282 
          // NOTE(review): presumably the pattern text managed by setPattern() and
          // releasePattern(), both declared at the end of this class -- confirm.
00292     UChar *pat;
00293     UVector* strings; // maintained in sorted order
          // Second half of the frozen test: isFrozen() checks this pointer and bmpSet.
00294     UnicodeSetStringSpan *stringSpan;
00295 
00296 private:
00297     enum { // constants
00298         kIsBogus = 1       // This set is bogus (i.e. not valid)
00299     };
00300     uint8_t fFlags;         // Bit flag (see constants above)
00301 public:
          // Inline; the definition after the class returns (fFlags & kIsBogus).
00311     inline UBool isBogus(void) const;
00312     
          // NOTE(review): presumably the operation that sets kIsBogus in fFlags;
          // the implementation is not part of this header.
00329     void setToBogus();
00330 
00331 public:
00332 
          // Bounds of the value range: 0 and 0x10ffff (the highest Unicode code point).
00333     enum {
00338         MIN_VALUE = 0,
00339 
00344         MAX_VALUE = 0x10ffff
00345     };
00346 
00347     //----------------------------------------------------------------
00348     // Constructors &c
00349     //----------------------------------------------------------------
00350 
00351 public:
00352 
          // Default construction, construction from a [start, end] pair, and
          // construction from a pattern string. Every pattern-based form takes a
          // UErrorCode& out-parameter.
00357     UnicodeSet();
00358 
00367     UnicodeSet(UChar32 start, UChar32 end);
00368 
00377     UnicodeSet(const UnicodeString& pattern,
00378                UErrorCode& status);
00379 
          // This overload is compiled out when U_HIDE_INTERNAL_API is defined.
00380 #ifndef U_HIDE_INTERNAL_API
00381 
00393     UnicodeSet(const UnicodeString& pattern,
00394                uint32_t options,
00395                const SymbolTable* symbols,
00396                UErrorCode& status);
00397 #endif  /* U_HIDE_INTERNAL_API */
00398 
00412     UnicodeSet(const UnicodeString& pattern, ParsePosition& pos,
00413                uint32_t options,
00414                const SymbolTable* symbols,
00415                UErrorCode& status);
00416 
          // Copy construction, virtual destruction and copy assignment.
00421     UnicodeSet(const UnicodeSet& o);
00422 
00427     virtual ~UnicodeSet();
00428 
00434     UnicodeSet& operator=(const UnicodeSet& o);
00435 
00447     virtual UBool operator==(const UnicodeSet& o) const;
00448 
          // Inline; defined after the class as the negation of operator==.
00454     UBool operator!=(const UnicodeSet& o) const;
00455 
00465     virtual UnicodeFunctor* clone() const;
00466 
00474     virtual int32_t hashCode(void) const;
00475 
          // Conversions between UnicodeSet* and the USet* handle type. All four
          // are inline and are defined after the class as plain reinterpret_casts
          // of the pointer (no copy is made).
00484     inline static UnicodeSet *fromUSet(USet *uset);
00485 
00494     inline static const UnicodeSet *fromUSet(const USet *uset);
00495     
00503     inline USet *toUSet();
00504 
00505 
00513     inline const USet * toUSet() const;
00514 
00515 
00516     //----------------------------------------------------------------
00517     // Freezable API
00518     //----------------------------------------------------------------
00519 
          // Inline; defined after the class: true when bmpSet or stringSpan is non-NULL.
00528     inline UBool isFrozen() const;
00529 
          // Both of the following return a UnicodeFunctor* rather than a UnicodeSet*.
00543     UnicodeFunctor *freeze();
00544 
00553     UnicodeFunctor *cloneAsThawed() const;
00554 
00555     //----------------------------------------------------------------
00556     // Public API
00557     //----------------------------------------------------------------
00558 
00569     UnicodeSet& set(UChar32 start, UChar32 end);
00570 
00576     static UBool resemblesPattern(const UnicodeString& pattern,
00577                                   int32_t pos);
00578 
          // applyPattern overloads mirror the pattern constructors above: same
          // parameter lists, returning UnicodeSet&.
00591     UnicodeSet& applyPattern(const UnicodeString& pattern,
00592                              UErrorCode& status);
00593 
00594 #ifndef U_HIDE_INTERNAL_API
00595 
00611     UnicodeSet& applyPattern(const UnicodeString& pattern,
00612                              uint32_t options,
00613                              const SymbolTable* symbols,
00614                              UErrorCode& status);
00615 #endif  /* U_HIDE_INTERNAL_API */
00616 
00648     UnicodeSet& applyPattern(const UnicodeString& pattern,
00649                              ParsePosition& pos,
00650                              uint32_t options,
00651                              const SymbolTable* symbols,
00652                              UErrorCode& status);
00653 
          // Writes into the caller-supplied `result` and returns a UnicodeString&;
          // escapeUnprintable defaults to FALSE.
00667     virtual UnicodeString& toPattern(UnicodeString& result,
00668                              UBool escapeUnprintable = FALSE) const;
00669 
00692     UnicodeSet& applyIntPropertyValue(UProperty prop,
00693                                       int32_t value,
00694                                       UErrorCode& ec);
00695 
00725     UnicodeSet& applyPropertyAlias(const UnicodeString& prop,
00726                                    const UnicodeString& value,
00727                                    UErrorCode& ec);
00728 
00737     virtual int32_t size(void) const;
00738 
00745     virtual UBool isEmpty(void) const;
00746 
          // Query overloads: each of contains / containsAll / containsNone /
          // containsSome is offered for several argument kinds (single value,
          // [start, end] pair, UnicodeSet, UnicodeString) as declared below.
00754     virtual UBool contains(UChar32 c) const;
00755 
00764     virtual UBool contains(UChar32 start, UChar32 end) const;
00765 
00773     UBool contains(const UnicodeString& s) const;
00774 
00782     virtual UBool containsAll(const UnicodeSet& c) const;
00783 
00791     UBool containsAll(const UnicodeString& s) const;
00792 
00801     UBool containsNone(UChar32 start, UChar32 end) const;
00802 
00810     UBool containsNone(const UnicodeSet& c) const;
00811 
00819     UBool containsNone(const UnicodeString& s) const;
00820 
          // The three containsSome overloads are inline; each is defined after the
          // class as the negation of the containsNone overload with the same arguments.
00829     inline UBool containsSome(UChar32 start, UChar32 end) const;
00830 
00838     inline UBool containsSome(const UnicodeSet& s) const;
00839 
00847     inline UBool containsSome(const UnicodeString& s) const;
00848 
          // span / spanBack over a raw UChar buffer with an explicit length.
00867     int32_t span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const;
00868 
          // Inline UnicodeString overload; the definition after the class clamps
          // `start` to [0, s.length()] and forwards to the UChar* overload.
00881     inline int32_t span(const UnicodeString &s, int32_t start, USetSpanCondition spanCondition) const;
00882 
00900     int32_t spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const;
00901 
          // Inline UnicodeString overload; the definition after the class clamps
          // `limit` to [0, s.length()] and forwards to the UChar* overload.
00915     inline int32_t spanBack(const UnicodeString &s, int32_t limit, USetSpanCondition spanCondition) const;
00916 
          // Counterparts of span / spanBack taking a `const char*` buffer.
00935     int32_t spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const;
00936 
00954     int32_t spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const;
00955 
          // Note that `offset` is taken by non-const reference.
00960     virtual UMatchDegree matches(const Replaceable& text,
00961                          int32_t& offset,
00962                          int32_t limit,
00963                          UBool incremental);
00964 
00965 private:
00988     static int32_t matchRest(const Replaceable& text,
00989                              int32_t start, int32_t limit,
00990                              const UnicodeString& s);
00991 
01001     int32_t findCodePoint(UChar32 c) const;
01002 
01003 public:
01004 
01012     virtual void addMatchSetTo(UnicodeSet& toUnionTo) const;
01013 
01022     int32_t indexOf(UChar32 c) const;
01023 
01033     UChar32 charAt(int32_t index) const;
01034 
          // Mutators from here on return UnicodeSet&.
          // NOTE(review): presumably *this, to allow call chaining -- confirm in the .cpp.
01049     virtual UnicodeSet& add(UChar32 start, UChar32 end);
01050 
01058     UnicodeSet& add(UChar32 c);
01059 
01071     UnicodeSet& add(const UnicodeString& s);
01072 
01073  private:
01079     static int32_t getSingleCP(const UnicodeString& s);
01080 
01081     void _add(const UnicodeString& s);
01082 
01083  public:
01092     UnicodeSet& addAll(const UnicodeString& s);
01093 
01102     UnicodeSet& retainAll(const UnicodeString& s);
01103 
01112     UnicodeSet& complementAll(const UnicodeString& s);
01113 
01122     UnicodeSet& removeAll(const UnicodeString& s);
01123 
          // Static factories returning a raw UnicodeSet*.
          // NOTE(review): ownership of the returned pointer is not stated in this
          // listing -- presumably the caller's; confirm.
01132     static UnicodeSet* U_EXPORT2 createFrom(const UnicodeString& s);
01133 
01134 
01142     static UnicodeSet* U_EXPORT2 createFromAll(const UnicodeString& s);
01143 
01157     virtual UnicodeSet& retain(UChar32 start, UChar32 end);
01158 
01159 
01165     UnicodeSet& retain(UChar32 c);
01166 
01180     virtual UnicodeSet& remove(UChar32 start, UChar32 end);
01181 
01189     UnicodeSet& remove(UChar32 c);
01190 
01200     UnicodeSet& remove(const UnicodeString& s);
01201 
01209     virtual UnicodeSet& complement(void);
01210 
01225     virtual UnicodeSet& complement(UChar32 start, UChar32 end);
01226 
01234     UnicodeSet& complement(UChar32 c);
01235 
01246     UnicodeSet& complement(const UnicodeString& s);
01247 
          // Set-with-set forms of the operations above.
01260     virtual UnicodeSet& addAll(const UnicodeSet& c);
01261 
01273     virtual UnicodeSet& retainAll(const UnicodeSet& c);
01274 
01286     virtual UnicodeSet& removeAll(const UnicodeSet& c);
01287 
01298     virtual UnicodeSet& complementAll(const UnicodeSet& c);
01299 
01306     virtual UnicodeSet& clear(void);
01307 
01333     UnicodeSet& closeOver(int32_t attribute);
01334 
01341     virtual UnicodeSet &removeAllStrings();
01342 
          // Index-based access to ranges.
01350     virtual int32_t getRangeCount(void) const;
01351 
01359     virtual UChar32 getRangeStart(int32_t index) const;
01360 
01368     virtual UChar32 getRangeEnd(int32_t index) const;
01369 
          // Writes into a caller-supplied uint16_t buffer of destCapacity elements.
01418     int32_t serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const;
01419 
01426     virtual UnicodeSet& compact();
01427 
01439     static UClassID U_EXPORT2 getStaticClassID(void);
01440 
01449     virtual UClassID getDynamicClassID(void) const;
01450 
01451 private:
01452 
01453     // Private API for the USet API
01454 
          // USetAccess is a friend, so it can reach the two private accessors below.
01455     friend class USetAccess;
01456 
01457     int32_t getStringCount() const;
01458 
01459     const UnicodeString* getString(int32_t index) const;
01460 
01461     //----------------------------------------------------------------
01462     // RuleBasedTransliterator support
01463     //----------------------------------------------------------------
01464 
01465 private:
01466 
01472     virtual UBool matchesIndexValue(uint8_t v) const;
01473 
01474 private:
01475     friend class RBBIRuleScanner;
01476 
01477     //----------------------------------------------------------------
01478     // Implementation: Clone as thawed (see ICU4J Freezable)
01479     //----------------------------------------------------------------
01480 
          // The unnamed UBool parameter only distinguishes this overload from the
          // public copy constructor.
01481     UnicodeSet(const UnicodeSet& o, UBool /* asThawed */);
01482 
01483     //----------------------------------------------------------------
01484     // Implementation: Pattern parsing
01485     //----------------------------------------------------------------
01486 
01487     void applyPatternIgnoreSpace(const UnicodeString& pattern,
01488                                  ParsePosition& pos,
01489                                  const SymbolTable* symbols,
01490                                  UErrorCode& status);
01491 
          // `caseClosure` is a pointer to a UnicodeSet member function with the
          // same signature as closeOver(int32_t).
01492     void applyPattern(RuleCharacterIterator& chars,
01493                       const SymbolTable* symbols,
01494                       UnicodeString& rebuiltPat,
01495                       uint32_t options,
01496                       UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute),
01497                       UErrorCode& ec);
01498 
01499     //----------------------------------------------------------------
01500     // Implementation: Utility methods
01501     //----------------------------------------------------------------
01502 
          // NOTE(review): presumably these grow `list` / `buffer` respectively and
          // report allocation failure through `ec` -- confirm in the .cpp.
01503     void ensureCapacity(int32_t newLen, UErrorCode& ec);
01504 
01505     void ensureBufferCapacity(int32_t newLen, UErrorCode& ec);
01506 
01507     void swapBuffers(void);
01508 
01509     UBool allocateStrings(UErrorCode &status);
01510 
01511     UnicodeString& _toPattern(UnicodeString& result,
01512                               UBool escapeUnprintable) const;
01513 
01514     UnicodeString& _generatePattern(UnicodeString& result,
01515                                     UBool escapeUnprintable) const;
01516 
01517     static void _appendToPat(UnicodeString& buf, const UnicodeString& s, UBool escapeUnprintable);
01518 
01519     static void _appendToPat(UnicodeString& buf, UChar32 c, UBool escapeUnprintable);
01520 
01521     //----------------------------------------------------------------
01522     // Implementation: Fundamental operators
01523     //----------------------------------------------------------------
01524 
          // Private array-based overloads: each takes a raw UChar32 array, its
          // length and an int8_t `polarity`.
01525     void exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity);
01526 
01527     void add(const UChar32* other, int32_t otherLen, int8_t polarity);
01528 
01529     void retain(const UChar32* other, int32_t otherLen, int8_t polarity);
01530 
01536     static UBool resemblesPropertyPattern(const UnicodeString& pattern,
01537                                           int32_t pos);
01538 
01539     static UBool resemblesPropertyPattern(RuleCharacterIterator& chars,
01540                                           int32_t iterOpts);
01541 
01581     UnicodeSet& applyPropertyPattern(const UnicodeString& pattern,
01582                                      ParsePosition& ppos,
01583                                      UErrorCode &ec);
01584 
01585     void applyPropertyPattern(RuleCharacterIterator& chars,
01586                               UnicodeString& rebuiltPat,
01587                               UErrorCode& ec);
01588 
01589     static const UnicodeSet* getInclusions(int32_t src, UErrorCode &status);
01590 
          // Callback type used by applyFilter(): receives a code point and an
          // opaque context pointer and returns a UBool.
01595     typedef UBool (*Filter)(UChar32 codePoint, void* context);
01596 
01606     void applyFilter(Filter filter,
01607                      void* context,
01608                      int32_t src,
01609                      UErrorCode &status);
01610 
01614     void setPattern(const UnicodeString& newPat);
01618     void releasePattern();
01619 
01620     friend class UnicodeSetIterator;
01621 };
01622 
01623 
01624 
01625 inline UBool UnicodeSet::operator!=(const UnicodeSet& o) const {
          // Inequality is the negation of operator== (declared virtual in the class),
          // so the two operators cannot disagree.
01626     return static_cast<UBool>(!this->operator==(o));
01627 }
01628 
01629 inline UBool UnicodeSet::isFrozen() const {
          // Frozen unless both bmpSet and stringSpan are still NULL, i.e. frozen
          // as soon as either pointer is set.
01630     return static_cast<UBool>(!(bmpSet==NULL && stringSpan==NULL));
01631 }
01632 
01633 inline UBool UnicodeSet::containsSome(UChar32 start, UChar32 end) const {
          // Defined as the exact complement of containsNone(start, end).
01634     return static_cast<UBool>(!this->containsNone(start, end));
01635 }
01636 
01637 inline UBool UnicodeSet::containsSome(const UnicodeSet& s) const {
          // Defined as the exact complement of containsNone(const UnicodeSet&).
01638     return static_cast<UBool>(!this->containsNone(s));
01639 }
01640 
01641 inline UBool UnicodeSet::containsSome(const UnicodeString& s) const {
          // Defined as the exact complement of containsNone(const UnicodeString&).
01642     return static_cast<UBool>(!this->containsNone(s));
01643 }
01644 
01645 inline UBool UnicodeSet::isBogus() const {
          // Masks fFlags with the kIsBogus bit; non-zero exactly when that bit is set.
01646     return static_cast<UBool>(kIsBogus & fFlags);
01647 }
01648 
01649 inline UnicodeSet *UnicodeSet::fromUSet(USet *uset) {
          // Reinterprets the USet* handle as a UnicodeSet*; the pointer value is
          // passed through unchanged and nothing is copied or checked.
          // NOTE(review): presumably only valid for handles that really refer to a
          // UnicodeSet object -- confirm against the USet C API contract.
01650     return reinterpret_cast<UnicodeSet *>(uset);
01651 }
01652 
01653 inline const UnicodeSet *UnicodeSet::fromUSet(const USet *uset) {
          // Const overload: reinterprets a const USet* handle as a const UnicodeSet*
          // without copying or checking; constness is preserved.
01654     return reinterpret_cast<const UnicodeSet *>(uset);
01655 }
01656 
01657 inline USet *UnicodeSet::toUSet() {
          // Exposes this object as a USet* handle by reinterpreting `this`;
          // no new object is created.
01658     return reinterpret_cast<USet *>(this);
01659 }
01660 
01661 inline const USet *UnicodeSet::toUSet() const {
          // Const overload: exposes this object as a const USet* handle by
          // reinterpreting `this`; no new object is created.
01662     return reinterpret_cast<const USet *>(this);
01663 }
01664 
01665 inline int32_t UnicodeSet::span(const UnicodeString &s, int32_t start, USetSpanCondition spanCondition) const {
          // UnicodeString convenience overload of span().
          // `start` is first pinned into [0, s.length()]; the UChar* overload is
          // then run on the tail of the string beginning at that position, and the
          // pinned start is added back so the result is an offset from the
          // beginning of s rather than from the tail.
01666     const int32_t textLength=s.length();
01667     const int32_t from=(start<0) ? 0 : ((start>textLength) ? textLength : start);
01668     const UChar *const text=s.getBuffer();
01669     return from+span(text+from, textLength-from, spanCondition);
01670 }
01674 
01675 inline int32_t UnicodeSet::spanBack(const UnicodeString &s, int32_t limit, USetSpanCondition spanCondition) const {
          // UnicodeString convenience overload of spanBack().
          // `limit` is pinned into [0, s.length()] and handed to the UChar*
          // overload as the length of the buffer to examine; that overload's
          // result is returned unchanged.
01676     const int32_t textLength=s.length();
01677     const int32_t end=(limit<0) ? 0 : ((limit>textLength) ? textLength : limit);
01678     return spanBack(s.getBuffer(), end, spanCondition);
01679 }
01684 
01685 U_NAMESPACE_END
01686 
01687 #endif