ICU 53.1  53.1
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
regex.h
Go to the documentation of this file.
1 /*
2 **********************************************************************
3 * Copyright (C) 2002-2013, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * file name: regex.h
7 * encoding: US-ASCII
8 * indentation:4
9 *
10 * created on: 2002oct22
11 * created by: Andy Heninger
12 *
13 * ICU Regular Expressions, API for C++
14 */
15 
16 #ifndef REGEX_H
17 #define REGEX_H
18 
19 //#define REGEX_DEBUG
20 
45 #include "unicode/utypes.h"
46 
47 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
48 
49 #include "unicode/uobject.h"
50 #include "unicode/unistr.h"
51 #include "unicode/utext.h"
52 #include "unicode/parseerr.h"
53 
54 #include "unicode/uregex.h"
55 
56 // Forward Declarations
57 
59 
60 struct Regex8BitSet;
61 class RegexCImpl;
62 class RegexMatcher;
63 class RegexPattern;
64 struct REStackFrame;
65 class RuleBasedBreakIterator;
66 class UnicodeSet;
67 class UVector;
68 class UVector32;
69 class UVector64;
70 
71 
84 public:
85 
93  RegexPattern();
94 
101  RegexPattern(const RegexPattern &source);
102 
108  virtual ~RegexPattern();
109 
118  UBool operator==(const RegexPattern& that) const;
119 
128  inline UBool operator!=(const RegexPattern& that) const {return ! operator ==(that);} // Inequality: the logical negation of operator==(that).
129 
135  RegexPattern &operator =(const RegexPattern &source);
136 
144  virtual RegexPattern *clone() const;
145 
146 
171  static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
172  UParseError &pe,
173  UErrorCode &status);
174 
201  static RegexPattern * U_EXPORT2 compile( UText *regex,
202  UParseError &pe,
203  UErrorCode &status);
204 
229  static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
230  uint32_t flags,
231  UParseError &pe,
232  UErrorCode &status);
233 
260  static RegexPattern * U_EXPORT2 compile( UText *regex,
261  uint32_t flags,
262  UParseError &pe,
263  UErrorCode &status);
264 
287  static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
288  uint32_t flags,
289  UErrorCode &status);
290 
315  static RegexPattern * U_EXPORT2 compile( UText *regex,
316  uint32_t flags,
317  UErrorCode &status);
318 
324  virtual uint32_t flags() const;
325 
343  virtual RegexMatcher *matcher(const UnicodeString &input,
344  UErrorCode &status) const;
345 
346 private:
359  RegexMatcher *matcher(const UChar *input,
360  UErrorCode &status) const;
361 public:
362 
363 
375  virtual RegexMatcher *matcher(UErrorCode &status) const;
376 
377 
392  static UBool U_EXPORT2 matches(const UnicodeString &regex,
393  const UnicodeString &input,
394  UParseError &pe,
395  UErrorCode &status);
396 
411  static UBool U_EXPORT2 matches(UText *regex,
412  UText *input,
413  UParseError &pe,
414  UErrorCode &status);
415 
424  virtual UnicodeString pattern() const;
425 
426 
437  virtual UText *patternText(UErrorCode &status) const;
438 
439 
478  virtual int32_t split(const UnicodeString &input,
479  UnicodeString dest[],
480  int32_t destCapacity,
481  UErrorCode &status) const;
482 
483 
522  virtual int32_t split(UText *input,
523  UText *dest[],
524  int32_t destCapacity,
525  UErrorCode &status) const;
526 
527 
533  virtual UClassID getDynamicClassID() const;
534 
540  static UClassID U_EXPORT2 getStaticClassID();
541 
542 private:
543  //
544  // Implementation Data
545  //
546  UText *fPattern; // The original pattern string.
547  UnicodeString *fPatternString; // The original pattern UnicodeString if relevant
548  uint32_t fFlags; // The flags used when compiling the pattern.
549  //
550  UVector64 *fCompiledPat; // The compiled pattern p-code.
551  UnicodeString fLiteralText; // Any literal string data from the pattern,
552  // after un-escaping, for use during the match.
553 
554  UVector *fSets; // Any UnicodeSets referenced from the pattern.
555  Regex8BitSet *fSets8; // (and fast sets for latin-1 range.)
556 
557 
558  UErrorCode fDeferredStatus; // status if some prior error has left this
559  // RegexPattern in an unusable state.
560 
561  int32_t fMinMatchLen; // Minimum Match Length. All matches will have length
562  // >= this value. For some patterns, this calculated
563  // value may be less than the true shortest
564  // possible match.
565 
566  int32_t fFrameSize; // Size of a state stack frame in the
567  // execution engine.
568 
569  int32_t fDataSize; // The size of the data needed by the pattern that
570  // does not go on the state stack, but has just
571  // a single copy per matcher.
572 
573  UVector32 *fGroupMap; // Map from capture group number to position of
574  // the group's variables in the matcher stack frame.
575 
576  int32_t fMaxCaptureDigits;
577 
578  UnicodeSet **fStaticSets; // Ptr to static (shared) sets for predefined
579  // regex character classes, e.g. Word.
580 
581  Regex8BitSet *fStaticSets8; // Ptr to the static (shared) latin-1 only
582  // sets for predefined regex classes.
583 
584  int32_t fStartType; // Info on how a match must start.
585  int32_t fInitialStringIdx; //
586  int32_t fInitialStringLen;
587  UnicodeSet *fInitialChars;
588  UChar32 fInitialChar;
589  Regex8BitSet *fInitialChars8;
590  UBool fNeedsAltInput;
591 
592  friend class RegexCompile;
593  friend class RegexMatcher;
594  friend class RegexCImpl;
595 
596  //
597  // Implementation Methods
598  //
599  void init(); // Common initialization, for use by constructors.
600  void zap(); // Common cleanup
601 
602  void dumpOp(int32_t index) const;
603 
604  public:
605 #ifndef U_HIDE_INTERNAL_API
606 
610  void dumpPattern() const;
611 #endif
612 };
613 
614 
615 
626 public:
627 
642  RegexMatcher(const UnicodeString &regexp, uint32_t flags, UErrorCode &status);
643 
659  RegexMatcher(UText *regexp, uint32_t flags, UErrorCode &status);
660 
682  RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
683  uint32_t flags, UErrorCode &status);
684 
706  RegexMatcher(UText *regexp, UText *input,
707  uint32_t flags, UErrorCode &status);
708 
709 private:
722  RegexMatcher(const UnicodeString &regexp, const UChar *input,
723  uint32_t flags, UErrorCode &status);
724 public:
725 
726 
732  virtual ~RegexMatcher();
733 
734 
741  virtual UBool matches(UErrorCode &status);
742 
743 
754  virtual UBool matches(int64_t startIndex, UErrorCode &status);
755 
756 
770  virtual UBool lookingAt(UErrorCode &status);
771 
772 
786  virtual UBool lookingAt(int64_t startIndex, UErrorCode &status);
787 
788 
801  virtual UBool find();
802 
803 
813  virtual UBool find(int64_t start, UErrorCode &status);
814 
815 
825  virtual UnicodeString group(UErrorCode &status) const;
826 
827 
840  virtual UnicodeString group(int32_t groupNum, UErrorCode &status) const;
841 
842 
848  virtual int32_t groupCount() const;
849 
850 
865  virtual UText *group(UText *dest, int64_t &group_len, UErrorCode &status) const;
866 
882  virtual UText *group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const;
883 
899  virtual UText *group(int32_t groupNum, UText *dest, UErrorCode &status) const;
900 
901 
909  virtual int32_t start(UErrorCode &status) const;
910 
918  virtual int64_t start64(UErrorCode &status) const;
919 
920 
934  virtual int32_t start(int32_t group, UErrorCode &status) const;
935 
949  virtual int64_t start64(int32_t group, UErrorCode &status) const;
950 
951 
965  virtual int32_t end(UErrorCode &status) const;
966 
980  virtual int64_t end64(UErrorCode &status) const;
981 
982 
1000  virtual int32_t end(int32_t group, UErrorCode &status) const;
1001 
1019  virtual int64_t end64(int32_t group, UErrorCode &status) const;
1020 
1021 
1030  virtual RegexMatcher &reset();
1031 
1032 
1048  virtual RegexMatcher &reset(int64_t index, UErrorCode &status);
1049 
1050 
1068  virtual RegexMatcher &reset(const UnicodeString &input);
1069 
1070 
1084  virtual RegexMatcher &reset(UText *input);
1085 
1086 
1111  virtual RegexMatcher &refreshInputText(UText *input, UErrorCode &status);
1112 
1113 private:
1126  RegexMatcher &reset(const UChar *input);
1127 public:
1128 
1136  virtual const UnicodeString &input() const;
1137 
1146  virtual UText *inputText() const;
1147 
1158  virtual UText *getInput(UText *dest, UErrorCode &status) const;
1159 
1160 
1179  virtual RegexMatcher &region(int64_t start, int64_t limit, UErrorCode &status);
1180 
1192  virtual RegexMatcher &region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status);
1193 
1202  virtual int32_t regionStart() const;
1203 
1212  virtual int64_t regionStart64() const;
1213 
1214 
1223  virtual int32_t regionEnd() const;
1224 
1233  virtual int64_t regionEnd64() const;
1234 
1243  virtual UBool hasTransparentBounds() const;
1244 
1263  virtual RegexMatcher &useTransparentBounds(UBool b);
1264 
1265 
1273  virtual UBool hasAnchoringBounds() const;
1274 
1275 
1288  virtual RegexMatcher &useAnchoringBounds(UBool b);
1289 
1290 
1303  virtual UBool hitEnd() const;
1304 
1314  virtual UBool requireEnd() const;
1315 
1316 
1322  virtual const RegexPattern &pattern() const;
1323 
1324 
1341  virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &status);
1342 
1343 
1364  virtual UText *replaceAll(UText *replacement, UText *dest, UErrorCode &status);
1365 
1366 
1387  virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &status);
1388 
1389 
1414  virtual UText *replaceFirst(UText *replacement, UText *dest, UErrorCode &status);
1415 
1416 
1444  virtual RegexMatcher &appendReplacement(UnicodeString &dest,
1445  const UnicodeString &replacement, UErrorCode &status);
1446 
1447 
1475  virtual RegexMatcher &appendReplacement(UText *dest,
1476  UText *replacement, UErrorCode &status);
1477 
1478 
1489  virtual UnicodeString &appendTail(UnicodeString &dest);
1490 
1491 
1505  virtual UText *appendTail(UText *dest, UErrorCode &status);
1506 
1507 
1531  virtual int32_t split(const UnicodeString &input,
1532  UnicodeString dest[],
1533  int32_t destCapacity,
1534  UErrorCode &status);
1535 
1536 
1560  virtual int32_t split(UText *input,
1561  UText *dest[],
1562  int32_t destCapacity,
1563  UErrorCode &status);
1564 
1586  virtual void setTimeLimit(int32_t limit, UErrorCode &status);
1587 
1594  virtual int32_t getTimeLimit() const;
1595 
1617  virtual void setStackLimit(int32_t limit, UErrorCode &status);
1618 
1626  virtual int32_t getStackLimit() const;
1627 
1628 
1642  virtual void setMatchCallback(URegexMatchCallback *callback,
1643  const void *context,
1644  UErrorCode &status);
1645 
1646 
1657  virtual void getMatchCallback(URegexMatchCallback *&callback,
1658  const void *&context,
1659  UErrorCode &status);
1660 
1661 
1675  virtual void setFindProgressCallback(URegexFindProgressCallback *callback,
1676  const void *context,
1677  UErrorCode &status);
1678 
1679 
1690  virtual void getFindProgressCallback(URegexFindProgressCallback *&callback,
1691  const void *&context,
1692  UErrorCode &status);
1693 
1694 #ifndef U_HIDE_INTERNAL_API
1695 
1700  void setTrace(UBool state);
1701 #endif /* U_HIDE_INTERNAL_API */
1702 
1708  static UClassID U_EXPORT2 getStaticClassID();
1709 
1715  virtual UClassID getDynamicClassID() const;
1716 
1717 private:
1718  // Constructors and other object boilerplate are private.
1719  // Instances of RegexMatcher can not be assigned, copied, cloned, etc.
1720  RegexMatcher(); // default constructor not implemented
1721  RegexMatcher(const RegexPattern *pat);
1722  RegexMatcher(const RegexMatcher &other);
1723  RegexMatcher &operator =(const RegexMatcher &rhs);
1724  void init(UErrorCode &status); // Common initialization
1725  void init2(UText *t, UErrorCode &e); // Common initialization, part 2.
1726 
1727  friend class RegexPattern;
1728  friend class RegexCImpl;
1729 public:
1730 #ifndef U_HIDE_INTERNAL_API
1731 
1732  void resetPreserveRegion(); // Reset matcher state, but preserve any region.
1733 #endif /* U_HIDE_INTERNAL_API */
1734 private:
1735 
1736  //
1737  // MatchAt This is the internal interface to the match engine itself.
1738  // Match status comes back in matcher member variables.
1739  //
1740  void MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status);
1741  inline void backTrack(int64_t &inputIdx, int32_t &patIdx);
1742  UBool isWordBoundary(int64_t pos); // perform Perl-like \b test
1743  UBool isUWordBoundary(int64_t pos); // perform RBBI based \b test
1744  REStackFrame *resetStack();
1745  inline REStackFrame *StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status);
1746  void IncrementTime(UErrorCode &status);
1747  UBool ReportFindProgress(int64_t matchIndex, UErrorCode &status);
1748 
1749  int64_t appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const;
1750 
1751  UBool findUsingChunk();
1752  void MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status);
1753  UBool isChunkWordBoundary(int32_t pos);
1754 
1755  const RegexPattern *fPattern;
1756  RegexPattern *fPatternOwned; // Non-NULL if this matcher owns the pattern, and
1757  // should delete it when through.
1758 
1759  const UnicodeString *fInput; // The string being matched. Only used for input()
1760  UText *fInputText; // The text being matched. Is never NULL.
1761  UText *fAltInputText; // A shallow copy of the text being matched.
1762  // Only created if the pattern contains backreferences.
1763  int64_t fInputLength; // Full length of the input text.
1764  int32_t fFrameSize; // The size of a frame in the backtrack stack.
1765 
1766  int64_t fRegionStart; // Start of the input region, default = 0.
1767  int64_t fRegionLimit; // End of input region, default to input.length.
1768 
1769  int64_t fAnchorStart; // Region bounds for anchoring operations (^ or $).
1770  int64_t fAnchorLimit; // See useAnchoringBounds
1771 
1772  int64_t fLookStart; // Region bounds for look-ahead/behind and
1773  int64_t fLookLimit; // other boundary tests. See
1774  // useTransparentBounds
1775 
1776  int64_t fActiveStart; // Currently active bounds for matching.
1777  int64_t fActiveLimit; // Usually is the same as region, but
1778  // is changed to fLookStart/Limit when
1779  // entering look around regions.
1780 
1781  UBool fTransparentBounds; // True if using transparent bounds.
1782  UBool fAnchoringBounds; // True if using anchoring bounds.
1783 
1784  UBool fMatch; // True if the last attempted match was successful.
1785  int64_t fMatchStart; // Position of the start of the most recent match
1786  int64_t fMatchEnd; // First position after the end of the most recent match
1787  // Zero if no previous match, even when a region
1788  // is active.
1789  int64_t fLastMatchEnd; // First position after the end of the previous match,
1790  // or -1 if there was no previous match.
1791  int64_t fAppendPosition; // First position after the end of the previous
1792  // appendReplacement(). As described by the
1793  // JavaDoc for Java Matcher, where it is called
1794  // "append position"
1795  UBool fHitEnd; // True if the last match touched the end of input.
1796  UBool fRequireEnd; // True if the last match required end-of-input
1797  // (matched $ or Z)
1798 
1799  UVector64 *fStack;
1800  REStackFrame *fFrame; // After finding a match, the last active stack frame,
1801  // which will contain the capture group results.
1802  // NOT valid while match engine is running.
1803 
1804  int64_t *fData; // Data area for use by the compiled pattern.
1805  int64_t fSmallData[8]; // Use this for data if it's enough.
1806 
1807  int32_t fTimeLimit; // Max time (in arbitrary steps) to let the
1808  // match engine run. Zero for unlimited.
1809 
1810  int32_t fTime; // Match time, accumulates while matching.
1811  int32_t fTickCounter; // Low bits counter for time. Counts down StateSaves.
1812  // Kept separately from fTime to keep as much
1813  // code as possible out of the inline
1814  // StateSave function.
1815 
1816  int32_t fStackLimit; // Maximum memory size to use for the backtrack
1817  // stack, in bytes. Zero for unlimited.
1818 
1819  URegexMatchCallback *fCallbackFn; // Pointer to match progress callback funct.
1820  // NULL if there is no callback.
1821  const void *fCallbackContext; // User Context ptr for callback function.
1822 
1823  URegexFindProgressCallback *fFindProgressCallbackFn; // Pointer to find progress callback funct.
1824  // NULL if there is no callback.
1825  const void *fFindProgressCallbackContext; // User Context ptr for callback function.
1826 
1827 
1828  UBool fInputUniStrMaybeMutable; // Set when fInputText wraps a UnicodeString that may be mutable - compatibility.
1829 
1830  UBool fTraceDebug; // Set true for debug tracing of match engine.
1831 
1832  UErrorCode fDeferredStatus; // Save error state that cannot be immediately
1833  // reported, or that permanently disables this matcher.
1834 
1835  RuleBasedBreakIterator *fWordBreakItr;
1836 };
1837 
1839 #endif // UCONFIG_NO_REGULAR_EXPRESSIONS
1840 #endif
virtual UClassID getDynamicClassID() const
ICU4C "poor man's RTTI", returns a UClassID for the actual ICU class.
C++ API: Unicode String.
U_EXPORT UBool operator==(const StringPiece &x, const StringPiece &y)
Global operator == for StringPiece.
void * UClassID
UClassID is used to identify classes without using the compiler's RTTI.
Definition: uobject.h:91
Class RegexPattern represents a compiled regular expression.
Definition: regex.h:83
UBool URegexFindProgressCallback(const void *context, int64_t matchIndex)
Function pointer for a regular expression find callback function.
Definition: uregex.h:1550
C API: Abstract Unicode Text API.
class RegexMatcher bundles together a regular expression pattern and input text to which the expression can be applied.
Definition: regex.h:625
#define U_I18N_API
Set to export library symbols from inside the i18n library, and to import them from outside...
Definition: utypes.h:358
#define U_NAMESPACE_BEGIN
This is used to begin a declaration of a public ICU C++ API.
Definition: uversion.h:129
C API: Regular Expressions.
int32_t UChar32
Define UChar32 as a type for single Unicode code points.
Definition: umachine.h:298
A mutable set of Unicode characters and multicharacter strings.
Definition: uniset.h:276
C++ API: Common ICU base class UObject.
uint16_t UChar
Define UChar to be UCHAR_TYPE, if that is #defined (for example, to char16_t), or wchar_t if that is ...
Definition: umachine.h:278
#define U_NAMESPACE_END
This is used to end a declaration of a public ICU C++ API.
Definition: uversion.h:130
UBool URegexMatchCallback(const void *context, int32_t steps)
Function pointer for a regular expression matching callback function.
Definition: uregex.h:1476
C API: Parse Error Information.
UErrorCode
Error code to replace exception handling, so that the code is compatible with all C++ compilers...
Definition: utypes.h:476
UBool operator!=(const RegexPattern &that) const
Comparison operator.
Definition: regex.h:128
UText struct.
Definition: utext.h:1343
A subclass of BreakIterator whose behavior is specified using a list of rules.
Definition: rbbi.h:65
A UParseError struct is used to return detailed information about parsing errors.
Definition: parseerr.h:56
Basic definitions for ICU, for both C and C++ APIs.
UnicodeString is a string class that stores Unicode characters directly and provides similar function...
Definition: unistr.h:245
UObject is the common ICU "boilerplate" class.
Definition: uobject.h:221
int8_t UBool
The ICU boolean type.
Definition: umachine.h:200