ICU 4.6
4.6
|
00001 /* 00002 ********************************************************************** 00003 * Copyright (C) 2002-2010, International Business Machines 00004 * Corporation and others. All Rights Reserved. 00005 ********************************************************************** 00006 * file name: regex.h 00007 * encoding: US-ASCII 00008 * indentation:4 00009 * 00010 * created on: 2002oct22 00011 * created by: Andy Heninger 00012 * 00013 * ICU Regular Expressions, API for C++ 00014 */ 00015 00016 #ifndef REGEX_H 00017 #define REGEX_H 00018 00019 //#define REGEX_DEBUG 00020 00045 #include "unicode/utypes.h" 00046 00047 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 00048 00049 #include "unicode/uobject.h" 00050 #include "unicode/unistr.h" 00051 #include "unicode/utext.h" 00052 #include "unicode/parseerr.h" 00053 00054 #include "unicode/uregex.h" 00055 00056 U_NAMESPACE_BEGIN 00057 00058 00059 // Forward Declarations... 00060 00061 class RegexMatcher; 00062 class RegexPattern; 00063 class UVector; 00064 class UVector32; 00065 class UVector64; 00066 class UnicodeSet; 00067 struct REStackFrame; 00068 struct Regex8BitSet; 00069 class RuleBasedBreakIterator; 00070 class RegexCImpl; 00071 00072 00073 00074 00079 #ifdef REGEX_DEBUG 00080 U_INTERNAL void U_EXPORT2 00081 RegexPatternDump(const RegexPattern *pat); 00082 #else 00083 #undef RegexPatternDump 00084 #define RegexPatternDump(pat) 00085 #endif 00086 00087 00088 00100 class U_I18N_API RegexPattern: public UObject { 00101 public: 00102 00110 RegexPattern(); 00111 00118 RegexPattern(const RegexPattern &source); 00119 00125 virtual ~RegexPattern(); 00126 00135 UBool operator==(const RegexPattern& that) const; 00136 00145 inline UBool operator!=(const RegexPattern& that) const {return ! operator ==(that);}; 00146 00152 RegexPattern &operator =(const RegexPattern &source); 00153 00161 virtual RegexPattern *clone() const; 00162 00163 00188 static RegexPattern * U_EXPORT2 compile( const UnicodeString ®ex, 00189 UParseError &pe, 00190 UErrorCode &status); 00191 00192 00219 static RegexPattern * U_EXPORT2 compile( UText *regex, 00220 UParseError &pe, 00221 UErrorCode &status); 00222 00247 static RegexPattern * U_EXPORT2 compile( const UnicodeString ®ex, 00248 uint32_t flags, 00249 UParseError &pe, 00250 UErrorCode &status); 00251 00252 00279 static RegexPattern * U_EXPORT2 compile( UText *regex, 00280 uint32_t flags, 00281 UParseError &pe, 00282 UErrorCode &status); 00283 00284 00307 static RegexPattern * U_EXPORT2 compile( const UnicodeString ®ex, 00308 uint32_t flags, 00309 UErrorCode &status); 00310 00311 00336 static RegexPattern * U_EXPORT2 compile( UText *regex, 00337 uint32_t flags, 00338 UErrorCode &status); 00339 00340 00346 virtual uint32_t flags() const; 00347 00365 virtual RegexMatcher *matcher(const UnicodeString &input, 00366 UErrorCode &status) const; 00367 00368 00373 enum PatternIsUTextFlag { PATTERN_IS_UTEXT }; 00374 00394 virtual RegexMatcher *matcher(UText *input, 00395 PatternIsUTextFlag flag, 00396 UErrorCode &status) const; 00397 00398 private: 00412 RegexMatcher *matcher(const UChar *input, 00413 UErrorCode &status) const; 00414 public: 00415 00416 00428 virtual RegexMatcher *matcher(UErrorCode &status) const; 00429 00430 00445 static UBool U_EXPORT2 matches(const UnicodeString ®ex, 00446 const UnicodeString &input, 00447 UParseError &pe, 00448 UErrorCode &status); 00449 00450 00465 static UBool U_EXPORT2 matches(UText *regex, 00466 UText *input, 00467 UParseError &pe, 00468 UErrorCode &status); 00469 00470 00479 virtual UnicodeString pattern() const; 00480 00481 00492 virtual UText *patternText(UErrorCode &status) const; 00493 00494 00520 virtual int32_t split(const UnicodeString &input, 00521 UnicodeString dest[], 00522 int32_t destCapacity, 00523 UErrorCode &status) const; 00524 00525 00551 virtual int32_t split(UText *input, 00552 UText *dest[], 00553 int32_t destCapacity, 00554 UErrorCode &status) const; 00555 00556 00562 virtual UClassID getDynamicClassID() const; 00563 00569 static UClassID U_EXPORT2 getStaticClassID(); 00570 00571 private: 00572 // 00573 // Implementation Data 00574 // 00575 UText *fPattern; // The original pattern string. 00576 UnicodeString *fPatternString; // The original pattern UncodeString if relevant 00577 uint32_t fFlags; // The flags used when compiling the pattern. 00578 // 00579 UVector64 *fCompiledPat; // The compiled pattern p-code. 00580 UnicodeString fLiteralText; // Any literal string data from the pattern, 00581 // after un-escaping, for use during the match. 00582 00583 UVector *fSets; // Any UnicodeSets referenced from the pattern. 00584 Regex8BitSet *fSets8; // (and fast sets for latin-1 range.) 00585 00586 00587 UErrorCode fDeferredStatus; // status if some prior error has left this 00588 // RegexPattern in an unusable state. 00589 00590 int32_t fMinMatchLen; // Minimum Match Length. All matches will have length 00591 // >= this value. For some patterns, this calculated 00592 // value may be less than the true shortest 00593 // possible match. 00594 00595 int32_t fFrameSize; // Size of a state stack frame in the 00596 // execution engine. 00597 00598 int32_t fDataSize; // The size of the data needed by the pattern that 00599 // does not go on the state stack, but has just 00600 // a single copy per matcher. 00601 00602 UVector32 *fGroupMap; // Map from capture group number to position of 00603 // the group's variables in the matcher stack frame. 00604 00605 int32_t fMaxCaptureDigits; 00606 00607 UnicodeSet **fStaticSets; // Ptr to static (shared) sets for predefined 00608 // regex character classes, e.g. Word. 00609 00610 Regex8BitSet *fStaticSets8; // Ptr to the static (shared) latin-1 only 00611 // sets for predefined regex classes. 00612 00613 int32_t fStartType; // Info on how a match must start. 00614 int32_t fInitialStringIdx; // 00615 int32_t fInitialStringLen; 00616 UnicodeSet *fInitialChars; 00617 UChar32 fInitialChar; 00618 Regex8BitSet *fInitialChars8; 00619 UBool fNeedsAltInput; 00620 00621 friend class RegexCompile; 00622 friend class RegexMatcher; 00623 friend class RegexCImpl; 00624 00625 // 00626 // Implementation Methods 00627 // 00628 void init(); // Common initialization, for use by constructors. 00629 void zap(); // Common cleanup 00630 #ifdef REGEX_DEBUG 00631 void dumpOp(int32_t index) const; 00632 friend void U_EXPORT2 RegexPatternDump(const RegexPattern *); 00633 #endif 00634 00635 }; 00636 00637 00638 00648 class U_I18N_API RegexMatcher: public UObject { 00649 public: 00650 00665 RegexMatcher(const UnicodeString ®exp, uint32_t flags, UErrorCode &status); 00666 00682 RegexMatcher(UText *regexp, uint32_t flags, UErrorCode &status); 00683 00705 RegexMatcher(const UnicodeString ®exp, const UnicodeString &input, 00706 uint32_t flags, UErrorCode &status); 00707 00729 RegexMatcher(UText *regexp, UText *input, 00730 uint32_t flags, UErrorCode &status); 00731 00732 private: 00746 RegexMatcher(const UnicodeString ®exp, const UChar *input, 00747 uint32_t flags, UErrorCode &status); 00748 public: 00749 00750 00756 virtual ~RegexMatcher(); 00757 00758 00765 virtual UBool matches(UErrorCode &status); 00766 00767 00778 virtual UBool matches(int64_t startIndex, UErrorCode &status); 00779 00780 00794 virtual UBool lookingAt(UErrorCode &status); 00795 00796 00810 virtual UBool lookingAt(int64_t startIndex, UErrorCode &status); 00811 00812 00825 virtual UBool find(); 00826 00827 00837 virtual UBool find(int64_t start, UErrorCode &status); 00838 00839 00849 virtual UnicodeString group(UErrorCode &status) const; 00850 00851 00864 virtual UnicodeString group(int32_t groupNum, UErrorCode &status) const; 00865 00866 00872 virtual int32_t groupCount() const; 00873 00874 00889 virtual UText *group(UText *dest, int64_t &group_len, UErrorCode &status) const; 00890 00894 virtual UText *group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const; 00895 00911 virtual UText *group(int32_t groupNum, UText *dest, UErrorCode &status) const; 00912 00913 00921 virtual int32_t start(UErrorCode &status) const; 00922 00926 virtual int64_t start64(UErrorCode &status) const; 00927 00928 00942 virtual int32_t start(int32_t group, UErrorCode &status) const; 00943 00947 virtual int64_t start64(int32_t group, UErrorCode &status) const; 00948 00949 00962 virtual int32_t end(UErrorCode &status) const; 00963 00967 virtual int64_t end64(UErrorCode &status) const; 00968 00969 00986 virtual int32_t end(int32_t group, UErrorCode &status) const; 00987 00991 virtual int64_t end64(int32_t group, UErrorCode &status) const; 00992 00993 01002 virtual RegexMatcher &reset(); 01003 01004 01020 virtual RegexMatcher &reset(int64_t index, UErrorCode &status); 01021 01022 01040 virtual RegexMatcher &reset(const UnicodeString &input); 01041 01042 01056 virtual RegexMatcher &reset(UText *input); 01057 01058 private: 01072 RegexMatcher &reset(const UChar *input); 01073 public: 01074 01082 virtual const UnicodeString &input() const; 01083 01092 virtual UText *inputText() const; 01093 01103 virtual UText *getInput(UText *dest, UErrorCode &status) const; 01104 01105 01124 virtual RegexMatcher ®ion(int64_t start, int64_t limit, UErrorCode &status); 01125 01135 virtual RegexMatcher ®ion(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status); 01136 01145 virtual int32_t regionStart() const; 01146 01150 virtual int64_t regionStart64() const; 01151 01152 01161 virtual int32_t regionEnd() const; 01162 01166 virtual int64_t regionEnd64() const; 01167 01176 virtual UBool hasTransparentBounds() const; 01177 01196 virtual RegexMatcher &useTransparentBounds(UBool b); 01197 01198 01206 virtual UBool hasAnchoringBounds() const; 01207 01208 01221 virtual RegexMatcher &useAnchoringBounds(UBool b); 01222 01223 01236 virtual UBool hitEnd() const; 01237 01247 virtual UBool requireEnd() const; 01248 01249 01255 virtual const RegexPattern &pattern() const; 01256 01257 01274 virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &status); 01275 01276 01297 virtual UText *replaceAll(UText *replacement, UText *dest, UErrorCode &status); 01298 01299 01320 virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &status); 01321 01322 01347 virtual UText *replaceFirst(UText *replacement, UText *dest, UErrorCode &status); 01348 01349 01377 virtual RegexMatcher &appendReplacement(UnicodeString &dest, 01378 const UnicodeString &replacement, UErrorCode &status); 01379 01380 01408 virtual RegexMatcher &appendReplacement(UText *dest, 01409 UText *replacement, UErrorCode &status); 01410 01411 01422 virtual UnicodeString &appendTail(UnicodeString &dest); 01423 01424 01437 virtual UText *appendTail(UText *dest, UErrorCode &status); 01438 01439 01463 virtual int32_t split(const UnicodeString &input, 01464 UnicodeString dest[], 01465 int32_t destCapacity, 01466 UErrorCode &status); 01467 01468 01492 virtual int32_t split(UText *input, 01493 UText *dest[], 01494 int32_t destCapacity, 01495 UErrorCode &status); 01496 01518 virtual void setTimeLimit(int32_t limit, UErrorCode &status); 01519 01526 virtual int32_t getTimeLimit() const; 01527 01549 virtual void setStackLimit(int32_t limit, UErrorCode &status); 01550 01558 virtual int32_t getStackLimit() const; 01559 01560 01574 virtual void setMatchCallback(URegexMatchCallback *callback, 01575 const void *context, 01576 UErrorCode &status); 01577 01578 01589 virtual void getMatchCallback(URegexMatchCallback *&callback, 01590 const void *&context, 01591 UErrorCode &status); 01592 01593 01607 virtual void setFindProgressCallback(URegexFindProgressCallback *callback, 01608 const void *context, 01609 UErrorCode &status); 01610 01611 01622 virtual void getFindProgressCallback(URegexFindProgressCallback *&callback, 01623 const void *&context, 01624 UErrorCode &status); 01625 01626 01632 void setTrace(UBool state); 01633 01634 01640 static UClassID U_EXPORT2 getStaticClassID(); 01641 01647 virtual UClassID getDynamicClassID() const; 01648 01649 private: 01650 // Constructors and other object boilerplate are private. 01651 // Instances of RegexMatcher can not be assigned, copied, cloned, etc. 01652 RegexMatcher(); // default constructor not implemented 01653 RegexMatcher(const RegexPattern *pat); 01654 RegexMatcher(const RegexMatcher &other); 01655 RegexMatcher &operator =(const RegexMatcher &rhs); 01656 void init(UErrorCode &status); // Common initialization 01657 void init2(UText *t, UErrorCode &e); // Common initialization, part 2. 01658 01659 friend class RegexPattern; 01660 friend class RegexCImpl; 01661 public: 01663 void resetPreserveRegion(); // Reset matcher state, but preserve any region. 01664 private: 01665 01666 // 01667 // MatchAt This is the internal interface to the match engine itself. 01668 // Match status comes back in matcher member variables. 01669 // 01670 void MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status); 01671 inline void backTrack(int64_t &inputIdx, int32_t &patIdx); 01672 UBool isWordBoundary(int64_t pos); // perform Perl-like \b test 01673 UBool isUWordBoundary(int64_t pos); // perform RBBI based \b test 01674 REStackFrame *resetStack(); 01675 inline REStackFrame *StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status); 01676 void IncrementTime(UErrorCode &status); 01677 UBool ReportFindProgress(int64_t matchIndex, UErrorCode &status); 01678 01679 int64_t appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const; 01680 01681 UBool findUsingChunk(); 01682 void MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status); 01683 UBool isChunkWordBoundary(int32_t pos); 01684 01685 const RegexPattern *fPattern; 01686 RegexPattern *fPatternOwned; // Non-NULL if this matcher owns the pattern, and 01687 // should delete it when through. 01688 01689 const UnicodeString *fInput; // The string being matched. Only used for input() 01690 UText *fInputText; // The text being matched. Is never NULL. 01691 UText *fAltInputText; // A shallow copy of the text being matched. 01692 // Only created if the pattern contains backreferences. 01693 int64_t fInputLength; // Full length of the input text. 01694 int32_t fFrameSize; // The size of a frame in the backtrack stack. 01695 01696 int64_t fRegionStart; // Start of the input region, default = 0. 01697 int64_t fRegionLimit; // End of input region, default to input.length. 01698 01699 int64_t fAnchorStart; // Region bounds for anchoring operations (^ or $). 01700 int64_t fAnchorLimit; // See useAnchoringBounds 01701 01702 int64_t fLookStart; // Region bounds for look-ahead/behind and 01703 int64_t fLookLimit; // and other boundary tests. See 01704 // useTransparentBounds 01705 01706 int64_t fActiveStart; // Currently active bounds for matching. 01707 int64_t fActiveLimit; // Usually is the same as region, but 01708 // is changed to fLookStart/Limit when 01709 // entering look around regions. 01710 01711 UBool fTransparentBounds; // True if using transparent bounds. 01712 UBool fAnchoringBounds; // True if using anchoring bounds. 01713 01714 UBool fMatch; // True if the last attempted match was successful. 01715 int64_t fMatchStart; // Position of the start of the most recent match 01716 int64_t fMatchEnd; // First position after the end of the most recent match 01717 // Zero if no previous match, even when a region 01718 // is active. 01719 int64_t fLastMatchEnd; // First position after the end of the previous match, 01720 // or -1 if there was no previous match. 01721 int64_t fAppendPosition; // First position after the end of the previous 01722 // appendReplacement(). As described by the 01723 // JavaDoc for Java Matcher, where it is called 01724 // "append position" 01725 UBool fHitEnd; // True if the last match touched the end of input. 01726 UBool fRequireEnd; // True if the last match required end-of-input 01727 // (matched $ or Z) 01728 01729 UVector64 *fStack; 01730 REStackFrame *fFrame; // After finding a match, the last active stack frame, 01731 // which will contain the capture group results. 01732 // NOT valid while match engine is running. 01733 01734 int64_t *fData; // Data area for use by the compiled pattern. 01735 int64_t fSmallData[8]; // Use this for data if it's enough. 01736 01737 int32_t fTimeLimit; // Max time (in arbitrary steps) to let the 01738 // match engine run. Zero for unlimited. 01739 01740 int32_t fTime; // Match time, accumulates while matching. 01741 int32_t fTickCounter; // Low bits counter for time. Counts down StateSaves. 01742 // Kept separately from fTime to keep as much 01743 // code as possible out of the inline 01744 // StateSave function. 01745 01746 int32_t fStackLimit; // Maximum memory size to use for the backtrack 01747 // stack, in bytes. Zero for unlimited. 01748 01749 URegexMatchCallback *fCallbackFn; // Pointer to match progress callback funct. 01750 // NULL if there is no callback. 01751 const void *fCallbackContext; // User Context ptr for callback function. 01752 01753 URegexFindProgressCallback *fFindProgressCallbackFn; // Pointer to match progress callback funct. 01754 // NULL if there is no callback. 01755 const void *fFindProgressCallbackContext; // User Context ptr for callback function. 01756 01757 01758 UBool fInputUniStrMaybeMutable; // Set when fInputText wraps a UnicodeString that may be mutable - compatibility. 01759 01760 UBool fTraceDebug; // Set true for debug tracing of match engine. 01761 01762 UErrorCode fDeferredStatus; // Save error state that cannot be immediately 01763 // reported, or that permanently disables this matcher. 01764 01765 RuleBasedBreakIterator *fWordBreakItr; 01766 01767 01768 }; 01769 01770 U_NAMESPACE_END 01771 #endif // UCONFIG_NO_REGULAR_EXPRESSIONS 01772 #endif