00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016 #ifndef REGEX_H
00017 #define REGEX_H
00018
00019
00039 #include "unicode/utypes.h"
00040
00041 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
00042
00043 #include "unicode/uobject.h"
00044 #include "unicode/unistr.h"
00045 #include "unicode/parseerr.h"
00046
00047 U_NAMESPACE_BEGIN
00048
00049
00050
00051
// Forward declarations. Within this header these types are only named through
// pointers, references or friend declarations, so their complete definitions
// are not needed here.
class   RegexMatcher;
class   UVector;
class   UVector32;
class   UnicodeSet;
struct  REStackFrame;
struct  Regex8BitSet;
class   RuleBasedBreakIterator;
00059
00060
00061
/**
 * Option constants for regular expression patterns.
 *
 * Every constant is a distinct single bit, so they are presumably meant to be
 * OR-ed together and passed as the `uint32_t flags` argument of
 * RegexPattern::compile() and the RegexMatcher constructors.
 *
 * NOTE(review): the per-flag descriptions below follow the usual meaning of
 * these option names in the ICU regular expression documentation; the
 * implementation is not visible from this header, so confirm them against the
 * ICU release in use.
 */
enum {
    /** Presumably requests canonical-equivalence matching - TODO confirm it is implemented. */
    UREGEX_CANON_EQ         = 128,

    /** Presumably enables case-insensitive matching. */
    UREGEX_CASE_INSENSITIVE = 2,

    /** Presumably allows white space and #comments inside a pattern. */
    UREGEX_COMMENTS         = 4,

    /** Presumably lets '.' match line terminators as well. */
    UREGEX_DOTALL           = 32,

    /** Presumably makes '^' and '$' match at line boundaries inside the input. */
    UREGEX_MULTILINE        = 8,

    /** Presumably selects Unicode word-boundary rules for \b (see RegexMatcher::isUWordBoundary). */
    UREGEX_UWORD            = 256
};
00095
00096
00097
00109 class U_I18N_API RegexPattern: public UObject {
00110 public:
00111
00119 RegexPattern();
00120
00126 RegexPattern(const RegexPattern &source);
00127
00133 virtual ~RegexPattern();
00134
00143 UBool operator==(const RegexPattern& that) const;
00144
00153 inline UBool operator!=(const RegexPattern& that) const {return ! operator ==(that);};
00154
00160 RegexPattern &operator =(const RegexPattern &source);
00161
00169 virtual RegexPattern *clone() const;
00170
00171
00192 static RegexPattern *compile( const UnicodeString ®ex,
00193 UParseError &pe,
00194 UErrorCode &status);
00195
00216 static RegexPattern *compile( const UnicodeString ®ex,
00217 uint32_t flags,
00218 UParseError &pe,
00219 UErrorCode &status);
00220
00221
00240 static RegexPattern *compile( const UnicodeString ®ex,
00241 uint32_t flags,
00242 UErrorCode &status);
00243
00244
00250 virtual uint32_t flags() const;
00251
00264 virtual RegexMatcher *matcher(const UnicodeString &input,
00265 UErrorCode &status) const;
00266
00267
00279 virtual RegexMatcher *matcher(UErrorCode &status) const;
00280
00281
00296 static UBool matches(const UnicodeString ®ex,
00297 const UnicodeString &input,
00298 UParseError &pe,
00299 UErrorCode &status);
00300
00301
00306 virtual UnicodeString pattern() const;
00307
00308
00334 virtual int32_t split(const UnicodeString &input,
00335 UnicodeString dest[],
00336 int32_t destCapacity,
00337 UErrorCode &status) const;
00338
00339
00340
00345 void dump() const;
00346
00352 virtual UClassID getDynamicClassID() const;
00353
00359 static UClassID getStaticClassID();
00360
00361 private:
00362
00363
00364
00365 UnicodeString fPattern;
00366 uint32_t fFlags;
00367
00368 UVector32 *fCompiledPat;
00369 UnicodeString fLiteralText;
00370
00371
00372 UVector *fSets;
00373 Regex8BitSet *fSets8;
00374
00375
00376 UErrorCode fDeferredStatus;
00377
00378
00379 int32_t fMinMatchLen;
00380
00381
00382
00383
00384 int32_t fFrameSize;
00385
00386
00387 int32_t fDataSize;
00388
00389
00390
00391 UVector32 *fGroupMap;
00392
00393
00394 int32_t fMaxCaptureDigits;
00395
00396 UnicodeSet **fStaticSets;
00397
00398
00399 Regex8BitSet *fStaticSets8;
00400
00401
00402 int32_t fStartType;
00403 int32_t fInitialStringIdx;
00404 int32_t fInitialStringLen;
00405 UnicodeSet *fInitialChars;
00406 UChar32 fInitialChar;
00407 Regex8BitSet *fInitialChars8;
00408
00409 friend class RegexCompile;
00410 friend class RegexMatcher;
00411
00412
00413
00414
00415 void init();
00416 void zap();
00417 void dumpOp(int32_t index) const;
00418
00419
00420 };
00421
00422
00423
00424
00425
00426
00427
00428
00429
00439 class U_I18N_API RegexMatcher: public UObject {
00440 public:
00441
00456 RegexMatcher(const UnicodeString ®exp, uint32_t flags, UErrorCode &status);
00457
00473 RegexMatcher(const UnicodeString ®exp, const UnicodeString &input,
00474 uint32_t flags, UErrorCode &status);
00475
00476
00482 virtual ~RegexMatcher();
00483
00484
00491 virtual UBool matches(UErrorCode &status);
00492
00501 virtual UBool matches(int32_t startIndex, UErrorCode &status);
00502
00503
00504
00505
00518 virtual UBool lookingAt(UErrorCode &status);
00519
00520
00534 virtual UBool lookingAt(int32_t startIndex, UErrorCode &status);
00535
00548 virtual UBool find();
00549
00550
00560 virtual UBool find(int32_t start, UErrorCode &status);
00561
00562
00572 virtual UnicodeString group(UErrorCode &status) const;
00573
00574
00587 virtual UnicodeString group(int32_t groupNum, UErrorCode &status) const;
00588
00589
00595 virtual int32_t groupCount() const;
00596
00597
00605 virtual int32_t start(UErrorCode &status) const;
00606
00607
00621 virtual int32_t start(int group, UErrorCode &status) const;
00622
00623
00633 virtual int32_t end(UErrorCode &status) const;
00634
00635
00649 virtual int32_t end(int group, UErrorCode &status) const;
00650
00651
00661 virtual UBool touchedEnd();
00662
00663
00672 virtual RegexMatcher &reset();
00673
00674
00684 virtual RegexMatcher &reset(int32_t index, UErrorCode &status);
00685
00686
00694 virtual RegexMatcher &reset(const UnicodeString &input);
00695
00696
00703 virtual const UnicodeString &input() const;
00704
00705
00711 virtual const RegexPattern &pattern() const;
00712
00713
00730 virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &status);
00731
00732
00753 virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &status);
00754
00782 virtual RegexMatcher &appendReplacement(UnicodeString &dest,
00783 const UnicodeString &replacement, UErrorCode &status);
00784
00785
00796 virtual UnicodeString &appendTail(UnicodeString &dest);
00797
00798
00799
00824 virtual int32_t split(const UnicodeString &input,
00825 UnicodeString dest[],
00826 int32_t destCapacity,
00827 UErrorCode &status);
00828
00829
00830
00836 void setTrace(UBool state);
00837
00838
00844 static UClassID getStaticClassID();
00845
00851 virtual UClassID getDynamicClassID() const;
00852
00853 private:
00854
00855
00856 RegexMatcher();
00857 RegexMatcher(const RegexPattern *pat);
00858 RegexMatcher(const RegexMatcher &other);
00859 RegexMatcher &operator =(const RegexMatcher &rhs);
00860 friend class RegexPattern;
00861
00862
00863
00864
00865
00866
00867 void MatchAt(int32_t startIdx, UErrorCode &status);
00868 inline void backTrack(int32_t &inputIdx, int32_t &patIdx);
00869 UBool isWordBoundary(int32_t pos);
00870 UBool isUWordBoundary(int32_t pos);
00871 REStackFrame *resetStack();
00872 inline REStackFrame *StateSave(REStackFrame *fp, int32_t savePatIdx,
00873 int32_t frameSize, UErrorCode &status);
00874
00875
00876 const RegexPattern *fPattern;
00877 RegexPattern *fPatternOwned;
00878
00879 const UnicodeString *fInput;
00880
00881 UBool fMatch;
00882 int32_t fMatchStart;
00883 int32_t fMatchEnd;
00884 int32_t fLastMatchEnd;
00885
00886 UVector32 *fStack;
00887 REStackFrame *fFrame;
00888
00889
00890
00891 int32_t *fData;
00892 int32_t fSmallData[8];
00893
00894 UBool fTraceDebug;
00895
00896 UErrorCode fDeferredStatus;
00897
00898
00899 UBool fTouchedEnd;
00900
00901
00902 RuleBasedBreakIterator *fWordBreakItr;
00903
00904 };
00905
00906 U_NAMESPACE_END
00907 #endif // UCONFIG_NO_REGULAR_EXPRESSIONS
00908 #endif