regex.h

Go to the documentation of this file.
00001 /*
00002 **********************************************************************
00003 *   Copyright (C) 2002-2012, International Business Machines
00004 *   Corporation and others.  All Rights Reserved.
00005 **********************************************************************
00006 *   file name:  regex.h
00007 *   encoding:   US-ASCII
00008 *   indentation:4
00009 *
00010 *   created on: 2002oct22
00011 *   created by: Andy Heninger
00012 *
00013 *   ICU Regular Expressions, API for C++
00014 */
00015 
00016 #ifndef REGEX_H
00017 #define REGEX_H
00018 
00019 //#define REGEX_DEBUG
00020 
00045 #include "unicode/utypes.h"
00046 
00047 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
00048 
00049 #include "unicode/uobject.h"
00050 #include "unicode/unistr.h"
00051 #include "unicode/utext.h"
00052 #include "unicode/parseerr.h"
00053 
00054 #include "unicode/uregex.h"
00055 
00056 // Forward Declarations
00057 
00058 U_NAMESPACE_BEGIN
00059 
      // Only the names are introduced here.  Within this header these types are
      // used solely through pointers, references, or friend declarations, so a
      // complete definition is not required.  RegexPattern and RegexMatcher are
      // fully defined further down in this file.
00060 struct Regex8BitSet;
00061 class  RegexCImpl;
00062 class  RegexMatcher;
00063 class  RegexPattern;
00064 struct REStackFrame;
00065 class  RuleBasedBreakIterator;
00066 class  UnicodeSet;
00067 class  UVector;
00068 class  UVector32;
00069 class  UVector64;
00070 
// RegexPatternDump: when REGEX_DEBUG is defined this is declared as a real
// function taking the pattern to dump.  Otherwise any previous definition is
// removed and it becomes a function-like macro with an empty expansion, so
// call sites need no #ifdef of their own.
00075 #ifdef REGEX_DEBUG
00076 U_INTERNAL void U_EXPORT2
00077     RegexPatternDump(const RegexPattern *pat);
00078 #else
00079     #undef RegexPatternDump
00080     #define RegexPatternDump(pat)
00081 #endif
00082 
00083 
00084 
// RegexPattern: the compiled form of a regular expression (see fCompiledPat
// in the implementation data below).  Objects are produced by the static
// compile() overloads; RegexMatcher objects for a pattern are produced by the
// matcher() overloads.  RegexCompile, RegexMatcher and RegexCImpl are friends
// and access the private implementation data directly.
00096 class U_I18N_API RegexPattern: public UObject {
00097 public:
00098 
          // Construction, destruction, comparison, assignment and cloning.
00106     RegexPattern();
00107 
00114     RegexPattern(const RegexPattern &source);
00115 
00121     virtual ~RegexPattern();
00122 
00131     UBool           operator==(const RegexPattern& that) const;
00132 
          // Defined inline as the logical negation of operator==.
00141     inline UBool    operator!=(const RegexPattern& that) const {return ! operator ==(that);}
00142 
00148     RegexPattern  &operator =(const RegexPattern &source);
00149 
00157     virtual RegexPattern  *clone() const;
00158 
00159 
          // Static factory functions.  The overloads accept the pattern either as
          // a UnicodeString or as a UText, optionally with a flags word and/or a
          // UParseError reference.  Every overload takes a UErrorCode reference.
00184     static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
00185         UParseError          &pe,
00186         UErrorCode           &status);
00187 
00214     static RegexPattern * U_EXPORT2 compile( UText *regex,
00215         UParseError          &pe,
00216         UErrorCode           &status);
00217 
00242     static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
00243         uint32_t             flags,
00244         UParseError          &pe,
00245         UErrorCode           &status);
00246 
00273     static RegexPattern * U_EXPORT2 compile( UText *regex,
00274         uint32_t             flags,
00275         UParseError          &pe,
00276         UErrorCode           &status);
00277 
00300     static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
00301         uint32_t             flags,
00302         UErrorCode           &status);
00303 
00328     static RegexPattern * U_EXPORT2 compile( UText *regex,
00329         uint32_t             flags,
00330         UErrorCode           &status);
00331 
          // Accessor for the flags word (see fFlags below).
00337     virtual uint32_t flags() const;
00338 
          // Matcher factories: with a UnicodeString input, or with no input.
00356     virtual RegexMatcher *matcher(const UnicodeString &input,
00357         UErrorCode          &status) const;
00358         
00359 private:
          // Declared private, so callers outside the class and its friends cannot
          // select this overload.
          // NOTE(review): presumably this exists to stop a UChar* argument from
          // being implicitly converted to a temporary UnicodeString and binding to
          // the public overload above -- confirm against the implementation.
00373     RegexMatcher *matcher(const UChar *input,
00374         UErrorCode          &status) const;
00375 public:
00376 
00377 
00389     virtual RegexMatcher *matcher(UErrorCode  &status) const;
00390 
00391 
          // Static convenience functions taking both the pattern text and the
          // input text, in UnicodeString and UText flavors.
00406     static UBool U_EXPORT2 matches(const UnicodeString   &regex,
00407         const UnicodeString   &input,
00408               UParseError     &pe,
00409               UErrorCode      &status);
00410 
00425     static UBool U_EXPORT2 matches(UText *regex,
00426         UText           *input,
00427         UParseError     &pe,
00428         UErrorCode      &status);
00429 
          // Access to the pattern source, as a UnicodeString or as a UText.
00438     virtual UnicodeString pattern() const;
00439     
00440     
00451     virtual UText *patternText(UErrorCode      &status) const;
00452 
00453 
          // split(): the caller supplies the destination array and its capacity;
          // an int32_t is returned.  UnicodeString and UText flavors.
00492     virtual int32_t  split(const UnicodeString &input,
00493         UnicodeString    dest[],
00494         int32_t          destCapacity,
00495         UErrorCode       &status) const;
00496 
00497 
00536     virtual int32_t  split(UText *input,
00537         UText            *dest[],
00538         int32_t          destCapacity,
00539         UErrorCode       &status) const;
00540 
00541 
          // Class ID functions (dynamic and static forms).
00547     virtual UClassID getDynamicClassID() const;
00548 
00554     static UClassID U_EXPORT2 getStaticClassID();
00555 
00556 private:
00557     //
00558     //  Implementation Data
00559     //
00560     UText          *fPattern;      // The original pattern string.
00561     UnicodeString  *fPatternString; // The original pattern UnicodeString if relevant
00562     uint32_t        fFlags;        // The flags used when compiling the pattern.
00563                                    //
00564     UVector64       *fCompiledPat; // The compiled pattern p-code.
00565     UnicodeString   fLiteralText;  // Any literal string data from the pattern,
00566                                    //   after un-escaping, for use during the match.
00567 
00568     UVector         *fSets;        // Any UnicodeSets referenced from the pattern.
00569     Regex8BitSet    *fSets8;       //      (and fast sets for latin-1 range.)
00570 
00571 
00572     UErrorCode      fDeferredStatus; // status if some prior error has left this
00573                                    //  RegexPattern in an unusable state.
00574 
00575     int32_t         fMinMatchLen;  // Minimum Match Length.  All matches will have length
00576                                    //   >= this value.  For some patterns, this calculated
00577                                    //   value may be less than the true shortest
00578                                    //   possible match.
00579     
00580     int32_t         fFrameSize;    // Size of a state stack frame in the
00581                                    //   execution engine.
00582 
00583     int32_t         fDataSize;     // The size of the data needed by the pattern that
00584                                    //   does not go on the state stack, but has just
00585                                    //   a single copy per matcher.
00586 
00587     UVector32       *fGroupMap;    // Map from capture group number to position of
00588                                    //   the group's variables in the matcher stack frame.
00589 
00590     int32_t         fMaxCaptureDigits;
00591 
00592     UnicodeSet     **fStaticSets;  // Ptr to static (shared) sets for predefined
00593                                    //   regex character classes, e.g. Word.
00594 
00595     Regex8BitSet   *fStaticSets8;  // Ptr to the static (shared) latin-1 only
00596                                    //  sets for predefined regex classes.
00597 
00598     int32_t         fStartType;    // Info on how a match must start.
00599     int32_t         fInitialStringIdx;     //
00600     int32_t         fInitialStringLen;
00601     UnicodeSet     *fInitialChars;
00602     UChar32         fInitialChar;
00603     Regex8BitSet   *fInitialChars8;
00604     UBool           fNeedsAltInput;
00605 
00606     friend class RegexCompile;
00607     friend class RegexMatcher;
00608     friend class RegexCImpl;
00609 
00610     //
00611     //  Implementation Methods
00612     //
00613     void        init();            // Common initialization, for use by constructors.
00614     void        zap();             // Common cleanup
      // Debug-only members: present only when REGEX_DEBUG is defined.
00615 #ifdef REGEX_DEBUG
00616     void        dumpOp(int32_t index) const;
00617     friend     void U_EXPORT2 RegexPatternDump(const RegexPattern *);
00618 #endif
00619 
00620 };
00621 
00622 
00623 
// RegexMatcher: holds a pointer to a RegexPattern (fPattern), the text being
// matched (fInputText), the region / anchoring / look-around bounds, and the
// state of the most recent match (fMatch, fMatchStart, fMatchEnd, ...).
// The default constructor, copy constructor and assignment operator are
// declared private, so instances cannot be copied or assigned by clients.
00633 class U_I18N_API RegexMatcher: public UObject {
00634 public:
00635 
          // Public constructors: pattern text (UnicodeString or UText) plus flags,
          // with or without the input text.
00650     RegexMatcher(const UnicodeString &regexp, uint32_t flags, UErrorCode &status);
00651 
00667     RegexMatcher(UText *regexp, uint32_t flags, UErrorCode &status);
00668 
00690     RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
00691         uint32_t flags, UErrorCode &status);
00692 
00714     RegexMatcher(UText *regexp, UText *input,
00715         uint32_t flags, UErrorCode &status);
00716 
00717 private:
          // Declared private, so clients cannot select this overload.
          // NOTE(review): presumably this exists to stop a UChar* input from being
          // implicitly converted to a temporary UnicodeString -- confirm against
          // the implementation.
00731     RegexMatcher(const UnicodeString &regexp, const UChar *input,
00732         uint32_t flags, UErrorCode &status);
00733 public:
00734 
00735 
00741     virtual ~RegexMatcher();
00742 
00743 
          // Match operations.  matches() and lookingAt() each have a form with
          // and without an explicit 64-bit start index.  Note that the
          // no-argument find() is the only one that takes no UErrorCode.
00750     virtual UBool matches(UErrorCode &status);
00751 
00752 
00763     virtual UBool matches(int64_t startIndex, UErrorCode &status);
00764 
00765 
00779     virtual UBool lookingAt(UErrorCode &status);
00780 
00781 
00795     virtual UBool lookingAt(int64_t startIndex, UErrorCode &status);
00796 
00797 
00810     virtual UBool find();
00811 
00812 
00822     virtual UBool find(int64_t start, UErrorCode &status);
00823 
00824 
          // Capture group access, returning either a UnicodeString or a UText.
00834     virtual UnicodeString group(UErrorCode &status) const;
00835 
00836 
00849     virtual UnicodeString group(int32_t groupNum, UErrorCode &status) const;
00850 
00851 
00857     virtual int32_t groupCount() const;
00858 
00859 
00874     virtual UText *group(UText *dest, int64_t &group_len, UErrorCode &status) const; 
00875 
00891     virtual UText *group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const;
00892 
00908     virtual UText *group(int32_t groupNum, UText *dest, UErrorCode &status) const;
00909 
00910 
          // Match position accessors.  Each comes in an int32_t form and an
          // int64_t form (the "64" suffix), with and without a group number.
00918     virtual int32_t start(UErrorCode &status) const;
00919 
00927     virtual int64_t start64(UErrorCode &status) const;
00928 
00929 
00943     virtual int32_t start(int32_t group, UErrorCode &status) const;
00944 
00958     virtual int64_t start64(int32_t group, UErrorCode &status) const;
00959 
00960 
00974     virtual int32_t end(UErrorCode &status) const;
00975 
00989     virtual int64_t end64(UErrorCode &status) const;
00990 
00991 
01009     virtual int32_t end(int32_t group, UErrorCode &status) const;
01010 
01028     virtual int64_t end64(int32_t group, UErrorCode &status) const;
01029 
01030 
          // reset() overloads: no argument, an index, or new input (UnicodeString
          // or UText).  Each returns a reference to a RegexMatcher.
01039     virtual RegexMatcher &reset();
01040 
01041 
01057     virtual RegexMatcher &reset(int64_t index, UErrorCode &status);
01058 
01059 
01077     virtual RegexMatcher &reset(const UnicodeString &input);
01078 
01079 
01093     virtual RegexMatcher &reset(UText *input);
01094 
01095 
01120     virtual RegexMatcher &refreshInputText(UText *input, UErrorCode &status);
01121 
01122 private:
          // Declared private, so clients cannot call reset() with a UChar*.
          // NOTE(review): presumably this guards against an implicit conversion
          // to a temporary UnicodeString -- confirm against the implementation.
01136     RegexMatcher &reset(const UChar *input);
01137 public:
01138 
          // Access to the input, as a UnicodeString reference or as a UText.
01146     virtual const UnicodeString &input() const;
01147     
01156     virtual UText *inputText() const;
01157     
01168     virtual UText *getInput(UText *dest, UErrorCode &status) const;
01169     
01170 
          // Region and bounds control (see fRegionStart/fRegionLimit,
          // fTransparentBounds and fAnchoringBounds below).
01189      virtual RegexMatcher &region(int64_t start, int64_t limit, UErrorCode &status);
01190 
01202      virtual RegexMatcher &region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status);
01203 
01212      virtual int32_t regionStart() const;
01213 
01222      virtual int64_t regionStart64() const;
01223 
01224 
01233       virtual int32_t regionEnd() const;
01234 
01243       virtual int64_t regionEnd64() const;
01244 
01253       virtual UBool hasTransparentBounds() const;
01254 
01273       virtual RegexMatcher &useTransparentBounds(UBool b);
01275      
01283       virtual UBool hasAnchoringBounds() const;
01284 
01285 
01298       virtual RegexMatcher &useAnchoringBounds(UBool b);
01299 
01300 
01313       virtual UBool hitEnd() const;
01314 
01324       virtual UBool requireEnd() const;
01325 
01326 
01332     virtual const RegexPattern &pattern() const;
01333 
01334 
          // Replacement operations, in UnicodeString and UText flavors.
01351     virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &status);
01352 
01353 
01374     virtual UText *replaceAll(UText *replacement, UText *dest, UErrorCode &status);
01375     
01376 
01397     virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &status);
01398     
01399 
01424     virtual UText *replaceFirst(UText *replacement, UText *dest, UErrorCode &status);
01425     
01426     
01454     virtual RegexMatcher &appendReplacement(UnicodeString &dest,
01455         const UnicodeString &replacement, UErrorCode &status);
01456     
01457     
01485     virtual RegexMatcher &appendReplacement(UText *dest,
01486         UText *replacement, UErrorCode &status);
01487 
01488 
01499     virtual UnicodeString &appendTail(UnicodeString &dest);
01500 
01501 
01515     virtual UText *appendTail(UText *dest, UErrorCode &status);
01516 
01517 
          // split(): same parameter lists as RegexPattern::split(), but these
          // member functions are not const.
01541     virtual int32_t  split(const UnicodeString &input,
01542         UnicodeString    dest[],
01543         int32_t          destCapacity,
01544         UErrorCode       &status);
01545 
01546 
01570     virtual int32_t  split(UText *input,
01571         UText           *dest[],
01572         int32_t          destCapacity,
01573         UErrorCode       &status);
01574     
          // Resource limits (see fTimeLimit and fStackLimit below).
01596     virtual void setTimeLimit(int32_t limit, UErrorCode &status);
01597 
01604     virtual int32_t getTimeLimit() const;
01605 
01627     virtual void setStackLimit(int32_t  limit, UErrorCode &status);
01628     
01636     virtual int32_t  getStackLimit() const;
01637 
01638 
          // Callback registration and retrieval.  The getters hand back the
          // callback pointer and context through reference parameters.
01652     virtual void setMatchCallback(URegexMatchCallback     *callback,
01653                                   const void              *context,
01654                                   UErrorCode              &status);
01655 
01656 
01667     virtual void getMatchCallback(URegexMatchCallback     *&callback,
01668                                   const void              *&context,
01669                                   UErrorCode              &status);
01670 
01671 
01685     virtual void setFindProgressCallback(URegexFindProgressCallback      *callback,
01686                                               const void                              *context,
01687                                               UErrorCode                              &status);
01688 
01689 
01700     virtual void getFindProgressCallback(URegexFindProgressCallback      *&callback,
01701                                               const void                      *&context,
01702                                               UErrorCode                      &status);
01703 
      // Hidden when U_HIDE_INTERNAL_API is defined.
01704 #ifndef U_HIDE_INTERNAL_API
01705 
01710     void setTrace(UBool state);
01711 #endif  /* U_HIDE_INTERNAL_API */
01712 
01718     static UClassID U_EXPORT2 getStaticClassID();
01719 
01725     virtual UClassID getDynamicClassID() const;
01726 
01727 private:
01728     // Constructors and other object boilerplate are private.
01729     // Instances of RegexMatcher can not be assigned, copied, cloned, etc.
01730     RegexMatcher();                  // default constructor not implemented
01731     RegexMatcher(const RegexPattern *pat);
01732     RegexMatcher(const RegexMatcher &other);
01733     RegexMatcher &operator =(const RegexMatcher &rhs);
01734     void init(UErrorCode &status);                      // Common initialization
01735     void init2(UText *t, UErrorCode &e);  // Common initialization, part 2.
01736 
01737     friend class RegexPattern;
01738     friend class RegexCImpl;
01739 public:
      // Public, but hidden when U_HIDE_INTERNAL_API is defined.
01740 #ifndef U_HIDE_INTERNAL_API
01741 
01742     void resetPreserveRegion();  // Reset matcher state, but preserve any region.
01743 #endif  /* U_HIDE_INTERNAL_API */
01744 private:
01745 
01746     //
01747     //  MatchAt   This is the internal interface to the match engine itself.
01748     //            Match status comes back in matcher member variables.
01749     //
01750     void                 MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status);
01751     inline void          backTrack(int64_t &inputIdx, int32_t &patIdx);
01752     UBool                isWordBoundary(int64_t pos);         // perform Perl-like  \b test
01753     UBool                isUWordBoundary(int64_t pos);        // perform RBBI based \b test
01754     REStackFrame        *resetStack();
01755     inline REStackFrame *StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status);
01756     void                 IncrementTime(UErrorCode &status);
01757     UBool                ReportFindProgress(int64_t matchIndex, UErrorCode &status);
01758     
01759     int64_t              appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const;
01760     
          // "Chunk" counterparts of find / MatchAt / isWordBoundary.  Note that
          // these take int32_t positions where the functions above take int64_t.
01761     UBool                findUsingChunk();
01762     void                 MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status);
01763     UBool                isChunkWordBoundary(int32_t pos);
01764 
01765     const RegexPattern  *fPattern;
01766     RegexPattern        *fPatternOwned;    // Non-NULL if this matcher owns the pattern, and
01767                                            //   should delete it when through.
01768 
01769     const UnicodeString *fInput;           // The string being matched. Only used for input()
01770     UText               *fInputText;       // The text being matched. Is never NULL.
01771     UText               *fAltInputText;    // A shallow copy of the text being matched.
01772                                            //   Only created if the pattern contains backreferences.
01773     int64_t              fInputLength;     // Full length of the input text.
01774     int32_t              fFrameSize;       // The size of a frame in the backtrack stack.
01775     
01776     int64_t              fRegionStart;     // Start of the input region, default = 0.
01777     int64_t              fRegionLimit;     // End of input region, default to input.length.
01778     
01779     int64_t              fAnchorStart;     // Region bounds for anchoring operations (^ or $).
01780     int64_t              fAnchorLimit;     //   See useAnchoringBounds
01781     
01782     int64_t              fLookStart;       // Region bounds for look-ahead/behind and
01783     int64_t              fLookLimit;       //   other boundary tests.  See
01784                                            //   useTransparentBounds
01785 
01786     int64_t              fActiveStart;     // Currently active bounds for matching.
01787     int64_t              fActiveLimit;     //   Usually is the same as region, but
01788                                            //   is changed to fLookStart/Limit when
01789                                            //   entering look around regions.
01790 
01791     UBool                fTransparentBounds;  // True if using transparent bounds.
01792     UBool                fAnchoringBounds; // True if using anchoring bounds.
01793 
01794     UBool                fMatch;           // True if the last attempted match was successful.
01795     int64_t              fMatchStart;      // Position of the start of the most recent match
01796     int64_t              fMatchEnd;        // First position after the end of the most recent match
01797                                            //   Zero if no previous match, even when a region
01798                                            //   is active.
01799     int64_t              fLastMatchEnd;    // First position after the end of the previous match,
01800                                            //   or -1 if there was no previous match.
01801     int64_t              fAppendPosition;  // First position after the end of the previous
01802                                            //   appendReplacement().  As described by the
01803                                            //   JavaDoc for Java Matcher, where it is called 
01804                                            //   "append position"
01805     UBool                fHitEnd;          // True if the last match touched the end of input.
01806     UBool                fRequireEnd;      // True if the last match required end-of-input
01807                                            //    (matched $ or Z)
01808 
01809     UVector64           *fStack;
01810     REStackFrame        *fFrame;           // After finding a match, the last active stack frame,
01811                                            //   which will contain the capture group results.
01812                                            //   NOT valid while match engine is running.
01813 
01814     int64_t             *fData;            // Data area for use by the compiled pattern.
01815     int64_t             fSmallData[8];     //   Use this for data if it's enough.
01816 
01817     int32_t             fTimeLimit;        // Max time (in arbitrary steps) to let the
01818                                            //   match engine run.  Zero for unlimited.
01819     
01820     int32_t             fTime;             // Match time, accumulates while matching.
01821     int32_t             fTickCounter;      // Low bits counter for time.  Counts down StateSaves.
01822                                            //   Kept separately from fTime to keep as much
01823                                            //   code as possible out of the inline
01824                                            //   StateSave function.
01825 
01826     int32_t             fStackLimit;       // Maximum memory size to use for the backtrack
01827                                            //   stack, in bytes.  Zero for unlimited.
01828 
01829     URegexMatchCallback *fCallbackFn;       // Pointer to match progress callback function.
01830                                            //   NULL if there is no callback.
01831     const void         *fCallbackContext;  // User Context ptr for callback function.
01832 
01833     URegexFindProgressCallback  *fFindProgressCallbackFn;  // Pointer to find progress callback function.
01834                                                            //   NULL if there is no callback.
01835     const void         *fFindProgressCallbackContext;      // User Context ptr for callback function.
01836 
01837 
01838     UBool               fInputUniStrMaybeMutable;  // Set when fInputText wraps a UnicodeString that may be mutable - compatibility.
01839 
01840     UBool               fTraceDebug;       // Set true for debug tracing of match engine.
01841 
01842     UErrorCode          fDeferredStatus;   // Save error state that cannot be immediately
01843                                            //   reported, or that permanently disables this matcher.
01844 
01845     RuleBasedBreakIterator  *fWordBreakItr;
01846 };
01847 
01848 U_NAMESPACE_END
01849 #endif  // UCONFIG_NO_REGULAR_EXPRESSIONS
01850 #endif

Generated on 27 Oct 2013 for ICU 50.1.2 by  doxygen 1.4.7