00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016 #ifndef REGEX_H
00017 #define REGEX_H
00018
00019
00020
00045 #include "unicode/utypes.h"
00046
00047 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
00048
00049 #include "unicode/uobject.h"
00050 #include "unicode/unistr.h"
00051 #include "unicode/utext.h"
00052 #include "unicode/parseerr.h"
00053
00054 #include "unicode/uregex.h"
00055
00056
00057
00058 U_NAMESPACE_BEGIN
00059
// Forward declarations of types that this header only uses by pointer,
// reference or friend declaration; their full definitions are not needed here.
00060 struct Regex8BitSet;
00061 class RegexCImpl;
00062 class RegexMatcher;
00063 class RegexPattern;
00064 struct REStackFrame;
00065 class RuleBasedBreakIterator;
00066 class UnicodeSet;
00067 class UVector;
00068 class UVector32;
00069 class UVector64;
00070
// Debug-only pattern dump hook.
// With REGEX_DEBUG defined, RegexPatternDump is declared as a real function
// taking the pattern to dump. Otherwise any earlier macro of that name is
// removed and RegexPatternDump(pat) is defined as an empty macro, so calls to
// it expand to nothing.
00075 #ifdef REGEX_DEBUG
00076 U_INTERNAL void U_EXPORT2
00077 RegexPatternDump(const RegexPattern *pat);
00078 #else
00079 #undef RegexPatternDump
00080 #define RegexPatternDump(pat)
00081 #endif
00082
00083
00084
/**
 * RegexPattern: a regular expression pattern object.
 *
 * The public interface consists of:
 *   - static compile() factories that build a RegexPattern from a
 *     UnicodeString or UText pattern,
 *   - static matches() convenience functions taking a pattern and an input,
 *   - matcher() factories that return a RegexMatcher for this pattern,
 *   - accessors for the pattern text and flags, and split().
 *
 * RegexCompile, RegexMatcher and RegexCImpl are friends and therefore have
 * access to the private state declared at the end of the class.
 *
 * NOTE(review): remarks marked "presumably" are inferred from names and
 * signatures only; confirm them against the ICU API documentation.
 */
00096 class U_I18N_API RegexPattern: public UObject {
00097 public:
00098
    /** Default constructor. */
00106 RegexPattern();
00107
    /** Copy constructor; builds a new pattern object from `source`. */
00114 RegexPattern(const RegexPattern &source);
00115
    /** Virtual destructor. */
00121 virtual ~RegexPattern();
00122
    /** Equality comparison between two pattern objects. */
00131 UBool operator==(const RegexPattern& that) const;
00132
    /** Inequality; defined inline as the negation of operator==. */
00141 inline UBool operator!=(const RegexPattern& that) const {return ! operator ==(that);}
00142
    /** Assignment operator; returns a reference to a RegexPattern. */
00148 RegexPattern &operator =(const RegexPattern &source);
00149
    /**
     * Returns a pointer to a RegexPattern; presumably a newly created copy
     * of this object that the caller must delete - confirm.
     */
00157 virtual RegexPattern *clone() const;
00158
00159
    /**
     * Builds a RegexPattern from a pattern given as a UnicodeString.
     * @param regex  The regular expression text.
     * @param pe     Receives parse-error details (UParseError).
     * @param status ICU error code, in/out.
     * @return A RegexPattern pointer; presumably owned by the caller and
     *         possibly NULL on failure - confirm.
     */
00184 static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
00185 UParseError &pe,
00186 UErrorCode &status);
00187
    /** As above, with the pattern supplied as a UText. */
00214 static RegexPattern * U_EXPORT2 compile( UText *regex,
00215 UParseError &pe,
00216 UErrorCode &status);
00217
    /**
     * Builds a RegexPattern from a UnicodeString pattern with a flags word.
     * @param flags  uint32_t flags value; presumably a bit set of the
     *               URegexpFlag constants from uregex.h - confirm.
     */
00242 static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
00243 uint32_t flags,
00244 UParseError &pe,
00245 UErrorCode &status);
00246
    /** UText variant of the flags + parse-error overload. */
00273 static RegexPattern * U_EXPORT2 compile( UText *regex,
00274 uint32_t flags,
00275 UParseError &pe,
00276 UErrorCode &status);
00277
    /** UnicodeString pattern with flags, without a UParseError out-parameter. */
00300 static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
00301 uint32_t flags,
00302 UErrorCode &status);
00303
    /** UText pattern with flags, without a UParseError out-parameter. */
00328 static RegexPattern * U_EXPORT2 compile( UText *regex,
00329 uint32_t flags,
00330 UErrorCode &status);
00331
    /** Returns a uint32_t flags value for this pattern. */
00337 virtual uint32_t flags() const;
00338
    /**
     * Returns a RegexMatcher pointer for the given input string.
     * @param input  The text to be matched.
     * @param status ICU error code, in/out.
     */
00356 virtual RegexMatcher *matcher(const UnicodeString &input,
00357 UErrorCode &status) const;
00358
00359 private:
    /**
     * Private overload taking a raw UChar pointer. Being private, it cannot
     * be called from outside the class and its friends; presumably it exists
     * so that passing a UChar* is a compile error rather than an implicit
     * conversion to a temporary UnicodeString - confirm.
     */
00373 RegexMatcher *matcher(const UChar *input,
00374 UErrorCode &status) const;
00375 public:
00376
00377
    /** Returns a RegexMatcher pointer for this pattern; no input is supplied. */
00389 virtual RegexMatcher *matcher(UErrorCode &status) const;
00390
00391
    /**
     * Convenience function taking a pattern and an input as UnicodeStrings
     * and returning a UBool; presumably TRUE when the whole input matches.
     */
00406 static UBool U_EXPORT2 matches(const UnicodeString &regex,
00407 const UnicodeString &input,
00408 UParseError &pe,
00409 UErrorCode &status);
00410
    /** UText variant of the static matches() convenience function. */
00425 static UBool U_EXPORT2 matches(UText *regex,
00426 UText *input,
00427 UParseError &pe,
00428 UErrorCode &status);
00429
    /** Returns the pattern as a UnicodeString, by value. */
00438 virtual UnicodeString pattern() const;
00439
00440
    /** Returns a UText pointer for the pattern text. */
00451 virtual UText *patternText(UErrorCode &status) const;
00452
00453
    /**
     * Splits a UnicodeString into the caller-supplied array `dest`.
     * @param input        The string to split.
     * @param dest         Output array of UnicodeString.
     * @param destCapacity Number of elements available in `dest`.
     * @param status       ICU error code, in/out.
     * @return An int32_t; presumably the number of fields written - confirm.
     */
00492 virtual int32_t split(const UnicodeString &input,
00493 UnicodeString dest[],
00494 int32_t destCapacity,
00495 UErrorCode &status) const;
00496
00497
    /** UText variant of split(); `dest` is an array of UText pointers. */
00536 virtual int32_t split(UText *input,
00537 UText *dest[],
00538 int32_t destCapacity,
00539 UErrorCode &status) const;
00540
00541
    /** Class ID of this object, for ICU's UObject type identification. */
00547 virtual UClassID getDynamicClassID() const;
00548
    /** Class ID of the RegexPattern class. */
00554 static UClassID U_EXPORT2 getStaticClassID();
00555
00556 private:
00557
00558
00559
    // Implementation state.
    // NOTE(review): the per-field remarks below are inferred from the field
    // names and types only; verify against the regex compiler/matcher sources.

    // Pattern source text, as a UText and as a UnicodeString, plus its flags.
00560 UText *fPattern;
00561 UnicodeString *fPatternString;
00562 uint32_t fFlags;
00563
    // Presumably the compiled form of the pattern and its literal string data.
00564 UVector64 *fCompiledPat;
00565 UnicodeString fLiteralText;
00566
00567
    // Presumably the sets referenced by the pattern, with a parallel
    // Regex8BitSet array.
00568 UVector *fSets;
00569 Regex8BitSet *fSets8;
00570
00571
    // Stored error code; presumably a failure recorded earlier and reported
    // later through a UErrorCode parameter.
00572 UErrorCode fDeferredStatus;
00573
00574
    // Presumably the minimum length of any match of this pattern.
00575 int32_t fMinMatchLen;
00576
00577
00578
00579
    // Size values; RegexMatcher declares an fFrameSize of its own.
00580 int32_t fFrameSize;
00581
00582
00583 int32_t fDataSize;
00584
00585
00586
    // Presumably maps capture group numbers to matcher state positions.
00587 UVector32 *fGroupMap;
00588
00589
00590 int32_t fMaxCaptureDigits;
00591
    // Presumably pointers to shared, predefined sets.
00592 UnicodeSet **fStaticSets;
00593
00594
00595 Regex8BitSet *fStaticSets8;
00596
00597
    // Presumably information about how a match may begin (start type,
    // initial string / character / character set).
00598 int32_t fStartType;
00599 int32_t fInitialStringIdx;
00600 int32_t fInitialStringLen;
00601 UnicodeSet *fInitialChars;
00602 UChar32 fInitialChar;
00603 Regex8BitSet *fInitialChars8;
00604 UBool fNeedsAltInput;
00605
    // Classes granted access to the private members above.
00606 friend class RegexCompile;
00607 friend class RegexMatcher;
00608 friend class RegexCImpl;
00609
00610
00611
00612
    // Private helpers; presumably init() sets up and zap() tears down the
    // private state.
00613 void init();
00614 void zap();
    // Debug-only helpers, present only when REGEX_DEBUG is defined.
00615 #ifdef REGEX_DEBUG
00616 void dumpOp(int32_t index) const;
00617 friend void U_EXPORT2 RegexPatternDump(const RegexPattern *);
00618 #endif
00619
00620 };
00621
00622
00623
00633 class U_I18N_API RegexMatcher: public UObject {
00634 public:
00635
00650 RegexMatcher(const UnicodeString ®exp, uint32_t flags, UErrorCode &status);
00651
00667 RegexMatcher(UText *regexp, uint32_t flags, UErrorCode &status);
00668
00690 RegexMatcher(const UnicodeString ®exp, const UnicodeString &input,
00691 uint32_t flags, UErrorCode &status);
00692
00714 RegexMatcher(UText *regexp, UText *input,
00715 uint32_t flags, UErrorCode &status);
00716
00717 private:
00731 RegexMatcher(const UnicodeString ®exp, const UChar *input,
00732 uint32_t flags, UErrorCode &status);
00733 public:
00734
00735
00741 virtual ~RegexMatcher();
00742
00743
00750 virtual UBool matches(UErrorCode &status);
00751
00752
00763 virtual UBool matches(int64_t startIndex, UErrorCode &status);
00764
00765
00779 virtual UBool lookingAt(UErrorCode &status);
00780
00781
00795 virtual UBool lookingAt(int64_t startIndex, UErrorCode &status);
00796
00797
00810 virtual UBool find();
00811
00812
00822 virtual UBool find(int64_t start, UErrorCode &status);
00823
00824
00834 virtual UnicodeString group(UErrorCode &status) const;
00835
00836
00849 virtual UnicodeString group(int32_t groupNum, UErrorCode &status) const;
00850
00851
00857 virtual int32_t groupCount() const;
00858
00859
00874 virtual UText *group(UText *dest, int64_t &group_len, UErrorCode &status) const;
00875
00891 virtual UText *group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const;
00892
00908 virtual UText *group(int32_t groupNum, UText *dest, UErrorCode &status) const;
00909
00910
00918 virtual int32_t start(UErrorCode &status) const;
00919
00927 virtual int64_t start64(UErrorCode &status) const;
00928
00929
00943 virtual int32_t start(int32_t group, UErrorCode &status) const;
00944
00958 virtual int64_t start64(int32_t group, UErrorCode &status) const;
00959
00960
00974 virtual int32_t end(UErrorCode &status) const;
00975
00989 virtual int64_t end64(UErrorCode &status) const;
00990
00991
01009 virtual int32_t end(int32_t group, UErrorCode &status) const;
01010
01028 virtual int64_t end64(int32_t group, UErrorCode &status) const;
01029
01030
01039 virtual RegexMatcher &reset();
01040
01041
01057 virtual RegexMatcher &reset(int64_t index, UErrorCode &status);
01058
01059
01077 virtual RegexMatcher &reset(const UnicodeString &input);
01078
01079
01093 virtual RegexMatcher &reset(UText *input);
01094
01095
01120 virtual RegexMatcher &refreshInputText(UText *input, UErrorCode &status);
01121
01122 private:
01136 RegexMatcher &reset(const UChar *input);
01137 public:
01138
01146 virtual const UnicodeString &input() const;
01147
01156 virtual UText *inputText() const;
01157
01168 virtual UText *getInput(UText *dest, UErrorCode &status) const;
01169
01170
01189 virtual RegexMatcher ®ion(int64_t start, int64_t limit, UErrorCode &status);
01190
01202 virtual RegexMatcher ®ion(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status);
01203
01212 virtual int32_t regionStart() const;
01213
01222 virtual int64_t regionStart64() const;
01223
01224
01233 virtual int32_t regionEnd() const;
01234
01243 virtual int64_t regionEnd64() const;
01244
01253 virtual UBool hasTransparentBounds() const;
01254
01273 virtual RegexMatcher &useTransparentBounds(UBool b);
01274
01275
01283 virtual UBool hasAnchoringBounds() const;
01284
01285
01298 virtual RegexMatcher &useAnchoringBounds(UBool b);
01299
01300
01313 virtual UBool hitEnd() const;
01314
01324 virtual UBool requireEnd() const;
01325
01326
01332 virtual const RegexPattern &pattern() const;
01333
01334
01351 virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &status);
01352
01353
01374 virtual UText *replaceAll(UText *replacement, UText *dest, UErrorCode &status);
01375
01376
01397 virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &status);
01398
01399
01424 virtual UText *replaceFirst(UText *replacement, UText *dest, UErrorCode &status);
01425
01426
01454 virtual RegexMatcher &appendReplacement(UnicodeString &dest,
01455 const UnicodeString &replacement, UErrorCode &status);
01456
01457
01485 virtual RegexMatcher &appendReplacement(UText *dest,
01486 UText *replacement, UErrorCode &status);
01487
01488
01499 virtual UnicodeString &appendTail(UnicodeString &dest);
01500
01501
01515 virtual UText *appendTail(UText *dest, UErrorCode &status);
01516
01517
01541 virtual int32_t split(const UnicodeString &input,
01542 UnicodeString dest[],
01543 int32_t destCapacity,
01544 UErrorCode &status);
01545
01546
01570 virtual int32_t split(UText *input,
01571 UText *dest[],
01572 int32_t destCapacity,
01573 UErrorCode &status);
01574
01596 virtual void setTimeLimit(int32_t limit, UErrorCode &status);
01597
01604 virtual int32_t getTimeLimit() const;
01605
01627 virtual void setStackLimit(int32_t limit, UErrorCode &status);
01628
01636 virtual int32_t getStackLimit() const;
01637
01638
01652 virtual void setMatchCallback(URegexMatchCallback *callback,
01653 const void *context,
01654 UErrorCode &status);
01655
01656
01667 virtual void getMatchCallback(URegexMatchCallback *&callback,
01668 const void *&context,
01669 UErrorCode &status);
01670
01671
01685 virtual void setFindProgressCallback(URegexFindProgressCallback *callback,
01686 const void *context,
01687 UErrorCode &status);
01688
01689
01700 virtual void getFindProgressCallback(URegexFindProgressCallback *&callback,
01701 const void *&context,
01702 UErrorCode &status);
01703
01704 #ifndef U_HIDE_INTERNAL_API
01705
01710 void setTrace(UBool state);
01711 #endif
01712
01718 static UClassID U_EXPORT2 getStaticClassID();
01719
01725 virtual UClassID getDynamicClassID() const;
01726
01727 private:
01728
01729
01730 RegexMatcher();
01731 RegexMatcher(const RegexPattern *pat);
01732 RegexMatcher(const RegexMatcher &other);
01733 RegexMatcher &operator =(const RegexMatcher &rhs);
01734 void init(UErrorCode &status);
01735 void init2(UText *t, UErrorCode &e);
01736
01737 friend class RegexPattern;
01738 friend class RegexCImpl;
01739 public:
01740 #ifndef U_HIDE_INTERNAL_API
01741
01742 void resetPreserveRegion();
01743 #endif
01744 private:
01745
01746
01747
01748
01749
01750 void MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status);
01751 inline void backTrack(int64_t &inputIdx, int32_t &patIdx);
01752 UBool isWordBoundary(int64_t pos);
01753 UBool isUWordBoundary(int64_t pos);
01754 REStackFrame *resetStack();
01755 inline REStackFrame *StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status);
01756 void IncrementTime(UErrorCode &status);
01757 UBool ReportFindProgress(int64_t matchIndex, UErrorCode &status);
01758
01759 int64_t appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const;
01760
01761 UBool findUsingChunk();
01762 void MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status);
01763 UBool isChunkWordBoundary(int32_t pos);
01764
01765 const RegexPattern *fPattern;
01766 RegexPattern *fPatternOwned;
01767
01768
01769 const UnicodeString *fInput;
01770 UText *fInputText;
01771 UText *fAltInputText;
01772
01773 int64_t fInputLength;
01774 int32_t fFrameSize;
01775
01776 int64_t fRegionStart;
01777 int64_t fRegionLimit;
01778
01779 int64_t fAnchorStart;
01780 int64_t fAnchorLimit;
01781
01782 int64_t fLookStart;
01783 int64_t fLookLimit;
01784
01785
01786 int64_t fActiveStart;
01787 int64_t fActiveLimit;
01788
01789
01790
01791 UBool fTransparentBounds;
01792 UBool fAnchoringBounds;
01793
01794 UBool fMatch;
01795 int64_t fMatchStart;
01796 int64_t fMatchEnd;
01797
01798
01799 int64_t fLastMatchEnd;
01800
01801 int64_t fAppendPosition;
01802
01803
01804
01805 UBool fHitEnd;
01806 UBool fRequireEnd;
01807
01808
01809 UVector64 *fStack;
01810 REStackFrame *fFrame;
01811
01812
01813
01814 int64_t *fData;
01815 int64_t fSmallData[8];
01816
01817 int32_t fTimeLimit;
01818
01819
01820 int32_t fTime;
01821 int32_t fTickCounter;
01822
01823
01824
01825
01826 int32_t fStackLimit;
01827
01828
01829 URegexMatchCallback *fCallbackFn;
01830
01831 const void *fCallbackContext;
01832
01833 URegexFindProgressCallback *fFindProgressCallbackFn;
01834
01835 const void *fFindProgressCallbackContext;
01836
01837
01838 UBool fInputUniStrMaybeMutable;
01839
01840 UBool fTraceDebug;
01841
01842 UErrorCode fDeferredStatus;
01843
01844
01845 RuleBasedBreakIterator *fWordBreakItr;
01846 };
01847
01848 U_NAMESPACE_END
01849 #endif // UCONFIG_NO_REGULAR_EXPRESSIONS
01850 #endif