00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016 #ifndef REGEX_H
00017 #define REGEX_H
00018
00019
00020
00045 #include "unicode/utypes.h"
00046
00047 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
00048
00049 #include "unicode/uobject.h"
00050 #include "unicode/unistr.h"
00051 #include "unicode/utext.h"
00052 #include "unicode/parseerr.h"
00053
00054 #include "unicode/uregex.h"
00055
00056 U_NAMESPACE_BEGIN
00057
00058
00059
00060 // Forward declarations: the classes below only hold pointers/references to (or befriend) these types, so complete definitions are not needed in this header.
00061 class RegexMatcher;
00062 class RegexPattern;
00063 class UVector;
00064 class UVector32;
00065 class UVector64;
00066 class UnicodeSet;
00067 struct REStackFrame;
00068 struct Regex8BitSet;
00069 class RuleBasedBreakIterator;
00070 class RegexCImpl;
00071
00072
00073
00074 // Debug hook: with REGEX_DEBUG defined, RegexPatternDump() is declared as a real function; otherwise it is (re)defined as a macro that expands to nothing, so calls to it disappear from the build.
00079 #ifdef REGEX_DEBUG
00080 U_INTERNAL void U_EXPORT2
00081 RegexPatternDump(const RegexPattern *pat);
00082 #else
00083 #undef RegexPatternDump
00084 #define RegexPatternDump(pat)
00085 #endif
00086
00087
00088
00100 class U_I18N_API RegexPattern: public UObject {
00101 public:
00102
00110 RegexPattern();
00111
00118 RegexPattern(const RegexPattern &source);
00119
00125 virtual ~RegexPattern();
00126
00135 UBool operator==(const RegexPattern& that) const;
00136
00145 inline UBool operator!=(const RegexPattern& that) const {return ! operator ==(that);};
00146
00152 RegexPattern &operator =(const RegexPattern &source);
00153
00161 virtual RegexPattern *clone() const;
00162
00163
00188 static RegexPattern * U_EXPORT2 compile( const UnicodeString ®ex,
00189 UParseError &pe,
00190 UErrorCode &status);
00191
00192
00219 static RegexPattern * U_EXPORT2 compile( UText *regex,
00220 UParseError &pe,
00221 UErrorCode &status);
00222
00247 static RegexPattern * U_EXPORT2 compile( const UnicodeString ®ex,
00248 uint32_t flags,
00249 UParseError &pe,
00250 UErrorCode &status);
00251
00252
00279 static RegexPattern * U_EXPORT2 compile( UText *regex,
00280 uint32_t flags,
00281 UParseError &pe,
00282 UErrorCode &status);
00283
00284
00307 static RegexPattern * U_EXPORT2 compile( const UnicodeString ®ex,
00308 uint32_t flags,
00309 UErrorCode &status);
00310
00311
00336 static RegexPattern * U_EXPORT2 compile( UText *regex,
00337 uint32_t flags,
00338 UErrorCode &status);
00339
00340
00346 virtual uint32_t flags() const;
00347
00365 virtual RegexMatcher *matcher(const UnicodeString &input,
00366 UErrorCode &status) const;
00367
00368
00373 enum PatternIsUTextFlag { PATTERN_IS_UTEXT };
00374
00394 virtual RegexMatcher *matcher(UText *input,
00395 PatternIsUTextFlag flag,
00396 UErrorCode &status) const;
00397
00398 private:
00412 RegexMatcher *matcher(const UChar *input,
00413 UErrorCode &status) const;
00414 public:
00415
00416
00428 virtual RegexMatcher *matcher(UErrorCode &status) const;
00429
00430
00445 static UBool U_EXPORT2 matches(const UnicodeString ®ex,
00446 const UnicodeString &input,
00447 UParseError &pe,
00448 UErrorCode &status);
00449
00450
00465 static UBool U_EXPORT2 matches(UText *regex,
00466 UText *input,
00467 UParseError &pe,
00468 UErrorCode &status);
00469
00470
00479 virtual UnicodeString pattern() const;
00480
00481
00492 virtual UText *patternText() const;
00493
00494
00520 virtual int32_t split(const UnicodeString &input,
00521 UnicodeString dest[],
00522 int32_t destCapacity,
00523 UErrorCode &status) const;
00524
00525
00551 virtual int32_t split(UText *input,
00552 UText *dest[],
00553 int32_t destCapacity,
00554 UErrorCode &status) const;
00555
00556
00562 virtual UClassID getDynamicClassID() const;
00563
00569 static UClassID U_EXPORT2 getStaticClassID();
00570
00571 private:
00572
00573
00574
00575 UText *fPattern;
00576 UnicodeString *fPatternString;
00577 uint32_t fFlags;
00578
00579 UVector64 *fCompiledPat;
00580 UnicodeString fLiteralText;
00581
00582
00583 UVector *fSets;
00584 Regex8BitSet *fSets8;
00585
00586
00587 UErrorCode fDeferredStatus;
00588
00589
00590 int32_t fMinMatchLen;
00591
00592
00593
00594
00595 int32_t fFrameSize;
00596
00597
00598 int32_t fDataSize;
00599
00600
00601
00602 UVector32 *fGroupMap;
00603
00604
00605 int32_t fMaxCaptureDigits;
00606
00607 UnicodeSet **fStaticSets;
00608
00609
00610 Regex8BitSet *fStaticSets8;
00611
00612
00613 int32_t fStartType;
00614 int32_t fInitialStringIdx;
00615 int32_t fInitialStringLen;
00616 UnicodeSet *fInitialChars;
00617 UChar32 fInitialChar;
00618 Regex8BitSet *fInitialChars8;
00619 UBool fNeedsAltInput;
00620
00621 friend class RegexCompile;
00622 friend class RegexMatcher;
00623 friend class RegexCImpl;
00624
00625
00626
00627
00628 void init();
00629 void zap();
00630 #ifdef REGEX_DEBUG
00631 void dumpOp(int32_t index) const;
00632 friend void U_EXPORT2 RegexPatternDump(const RegexPattern *);
00633 #endif
00634
00635 };
00636
00637
00638
00645 // RegexMatcher: performs match operations on an input text using a RegexPattern.
00646 // It can be constructed directly from pattern text, or obtained from RegexPattern::matcher().
00647 // NOTE(review): only declarations are visible in this header; the per-member summaries below follow the signatures and names and should be confirmed against the ICU documentation.
00648 class U_I18N_API RegexMatcher: public UObject {
00649 public:
00650 // Construct from pattern text `regexp` and option `flags`; no input is supplied. Errors are reported through `status`.
00665 RegexMatcher(const UnicodeString &regexp, uint32_t flags, UErrorCode &status);
00666 // UText variant of the constructor above.
00682 RegexMatcher(UText *regexp, uint32_t flags, UErrorCode &status);
00683 // Construct from pattern text and an input string. NOTE(review): `input` is taken by reference; whether it must outlive the matcher is not visible here -- confirm.
00705 RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
00706 uint32_t flags, UErrorCode &status);
00707 // UText variant of the pattern-plus-input constructor.
00729 RegexMatcher(UText *regexp, UText *input,
00730 uint32_t flags, UErrorCode &status);
00731 // Private overload for raw UChar pointer input: being private, it is not callable by client code. NOTE(review): presumably deliberate, to stop a `const UChar *` from binding to the UnicodeString overload through a temporary -- confirm.
00732 private:
00746 RegexMatcher(const UnicodeString &regexp, const UChar *input,
00747 uint32_t flags, UErrorCode &status);
00748 public:
00749
00750 // Virtual destructor.
00756 virtual ~RegexMatcher();
00757
00758 // Match test returning UBool. NOTE(review): presumably requires the whole input (region) to match -- confirm.
00765 virtual UBool matches(UErrorCode &status);
00766
00767 // Overload of matches() taking a start index into the input.
00778 virtual UBool matches(int32_t startIndex, UErrorCode &status);
00779
00780 // Match test returning UBool. NOTE(review): presumably anchored at the start but not required to consume all input -- confirm.
00794 virtual UBool lookingAt(UErrorCode &status);
00795
00796 // Overload of lookingAt() taking a start index into the input.
00810 virtual UBool lookingAt(int32_t startIndex, UErrorCode &status);
00811
00812 // Search for a match; note that this overload takes no UErrorCode parameter.
00825 virtual UBool find();
00826
00827 // Overload of find() taking a start position and a UErrorCode.
00837 virtual UBool find(int32_t start, UErrorCode &status);
00838
00839 // Returns match text as a UnicodeString, by value. NOTE(review): presumably the text of the most recent match -- confirm.
00849 virtual UnicodeString group(UErrorCode &status) const;
00850
00851 // Single-value tag enum; used as a parameter type by the UText overload of group() declared next.
00856 enum MatcherDestIsUTextFlag { MATCHER_DEST_IS_UTEXT };
00857 // UText variant of group(status); takes a caller-supplied `dest` and returns a UText pointer.
00873 virtual UText *group(UText *dest, MatcherDestIsUTextFlag flag, UErrorCode &status) const;
00874
00875 // Returns the text for capture group number `groupNum`, by value.
00888 virtual UnicodeString group(int32_t groupNum, UErrorCode &status) const;
00889
00890 // UText variant of group(groupNum, status), with caller-supplied `dest`.
00906 virtual UText *group(int32_t groupNum, UText *dest, UErrorCode &status) const;
00907
00908 // Returns a count of groups as int32_t.
00914 virtual int32_t groupCount() const;
00915
00916 // Start index (int32_t) of a match. NOTE(review): presumably of the most recent match -- confirm.
00924 virtual int32_t start(UErrorCode &status) const;
00925
00926 // Start index for capture group `group`.
00940 virtual int32_t start(int32_t group, UErrorCode &status) const;
00941
00942 // End index (int32_t) of a match; counterpart of start(status).
00952 virtual int32_t end(UErrorCode &status) const;
00953
00954 // End index for capture group `group`.
00968 virtual int32_t end(int32_t group, UErrorCode &status) const;
00969
00970 // Reset this matcher; returns *this by reference so calls can be chained.
00979 virtual RegexMatcher &reset();
00980
00981 // Reset variant taking an index into the input.
00997 virtual RegexMatcher &reset(int32_t index, UErrorCode &status);
00998
00999 // Reset variant supplying a new input string. NOTE(review): `input` is taken by reference; lifetime requirements are not visible here -- confirm.
01017 virtual RegexMatcher &reset(const UnicodeString &input);
01018
01019 // Reset variant supplying a new UText input.
01033 virtual RegexMatcher &reset(UText *input);
01034 // Private overload for raw UChar pointer input, not callable by client code (same pattern as the private constructor above).
01035 private:
01049 RegexMatcher &reset(const UChar *input);
01050 public:
01051 // Returns the input as a const UnicodeString reference.
01059 virtual const UnicodeString &input() const;
01060 // Returns the input as a UText pointer. NOTE(review): ownership of the returned UText is not visible here -- confirm.
01069 virtual UText *inputText() const;
01070 // Variant of inputText() taking a caller-supplied `dest`.
01080 virtual UText *getInput(UText *dest) const;
01081
01082 // Set the region [start, limit) of the input that this matcher operates on; returns *this.
01101 virtual RegexMatcher &region(int32_t start, int32_t limit, UErrorCode &status);
01102
01103 // Accessor for the region start index.
01112 virtual int32_t regionStart() const;
01113
01114 // Accessor for the region end index.
01123 virtual int32_t regionEnd() const;
01124 // Query for the setting changed by useTransparentBounds().
01133 virtual UBool hasTransparentBounds() const;
01134 // Set the transparent-bounds option to `b`; returns *this.
01153 virtual RegexMatcher &useTransparentBounds(UBool b);
01154
01155 // Query for the setting changed by useAnchoringBounds().
01163 virtual UBool hasAnchoringBounds() const;
01164
01165 // Set the anchoring-bounds option to `b`; returns *this.
01178 virtual RegexMatcher &useAnchoringBounds(UBool b);
01179
01180 // UBool status query. NOTE(review): presumably whether the last match operation reached the end of the input -- confirm.
01193 virtual UBool hitEnd() const;
01194 // UBool status query. NOTE(review): presumably whether more input could invalidate the last match -- confirm.
01204 virtual UBool requireEnd() const;
01205
01206 // Returns the RegexPattern used by this matcher, as a const reference.
01212 virtual const RegexPattern &pattern() const;
01213
01214 // Replacement producing a new UnicodeString (returned by value) from `replacement`. NOTE(review): presumably replaces every match in the input -- confirm.
01231 virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &status);
01232
01233 // UText variant of replaceAll(), with caller-supplied `dest`.
01254 virtual UText *replaceAll(UText *replacement, UText *dest, UErrorCode &status);
01255
01256 // Replacement producing a new UnicodeString (returned by value). NOTE(review): presumably replaces only the first match -- confirm.
01277 virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &status);
01278
01279 // UText variant of replaceFirst(), with caller-supplied `dest`.
01304 virtual UText *replaceFirst(UText *replacement, UText *dest, UErrorCode &status);
01305
01306 // Append-style replacement step writing into the caller's `dest`; returns *this. Pairs with appendTail().
01334 virtual RegexMatcher &appendReplacement(UnicodeString &dest,
01335 const UnicodeString &replacement, UErrorCode &status);
01336
01337 // UText variant of appendReplacement().
01365 virtual RegexMatcher &appendReplacement(UText *dest,
01366 UText *replacement, UErrorCode &status);
01367
01368 // Append the tail of the input to `dest` and return a reference to `dest`'s type. NOTE(review): presumably the text after the last appendReplacement() -- confirm.
01379 virtual UnicodeString &appendTail(UnicodeString &dest);
01380
01381 // UText variant of appendTail().
01394 virtual UText *appendTail(UText *dest);
01395
01396 // Split `input` into the caller-supplied array `dest`, which holds `destCapacity` elements. Unlike RegexPattern::split(), this member is non-const.
01420 virtual int32_t split(const UnicodeString &input,
01421 UnicodeString dest[],
01422 int32_t destCapacity,
01423 UErrorCode &status);
01424
01425 // UText variant of split(); `dest` is an array of `destCapacity` UText pointers.
01449 virtual int32_t split(UText *input,
01450 UText *dest[],
01451 int32_t destCapacity,
01452 UErrorCode &status);
01453 // Set the time limit for match operations. NOTE(review): the unit of `limit` is not visible in this header -- confirm.
01475 virtual void setTimeLimit(int32_t limit, UErrorCode &status);
01476 // Accessor for the time limit.
01483 virtual int32_t getTimeLimit() const;
01484 // Set the stack limit for match operations. NOTE(review): the unit of `limit` is not visible in this header -- confirm.
01506 virtual void setStackLimit(int32_t limit, UErrorCode &status);
01507 // Accessor for the stack limit.
01515 virtual int32_t getStackLimit() const;
01516
01517 // Install a match callback function together with an opaque `context` pointer.
01531 virtual void setMatchCallback(URegexMatchCallback *callback,
01532 const void *context,
01533 UErrorCode &status);
01534
01535 // Retrieve the callback and context; both are returned through the reference parameters.
01546 virtual void getMatchCallback(URegexMatchCallback *&callback,
01547 const void *&context,
01548 UErrorCode &status);
01549
01550 // Set the trace state (non-virtual; cf. the fTraceDebug field below).
01556 void setTrace(UBool state);
01557
01558 // Class-ID accessor for this class.
01564 static UClassID U_EXPORT2 getStaticClassID();
01565 // Class-ID accessor for the dynamic type of this object.
01571 virtual UClassID getDynamicClassID() const;
01572
01573 private:
01574
01575 // Private constructors and assignment: default construction, construction from a RegexPattern pointer, copying and assignment are not available to client code.
01576 RegexMatcher();
01577 RegexMatcher(const RegexPattern *pat);
01578 RegexMatcher(const RegexMatcher &other);
01579 RegexMatcher &operator =(const RegexMatcher &rhs);
01580 void init(UErrorCode &status);
01581 void init2(UText *t, UErrorCode &e);
01582 // These classes get direct access to the private members.
01583 friend class RegexPattern;
01584 friend class RegexCImpl;
01585 public:
01587 void resetPreserveRegion(); // NOTE(review): public but placed between private sections; presumably intended for internal use only -- confirm.
01588 private:
01589
01590
01591
01592
01593 // Private match-engine helpers; their definitions live in the implementation file.
01594 void MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status);
01595 inline void backTrack(int64_t &inputIdx, int32_t &patIdx);
01596 UBool isWordBoundary(int64_t pos);
01597 UBool isUWordBoundary(int64_t pos);
01598 REStackFrame *resetStack();
01599 inline REStackFrame *StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status);
01600 void IncrementTime(UErrorCode &status);
01601
01602 int64_t appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const;
01603 // "Chunk" counterparts of find/MatchAt/isWordBoundary; note that they take int32_t rather than int64_t positions.
01604 UBool findUsingChunk();
01605 void MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status);
01606 UBool isChunkWordBoundary(int32_t pos);
01607 // Implementation data. NOTE(review): the per-field notes are inferred from names and types only; confirm against the implementation.
01608 const RegexPattern *fPattern; // the pattern in use (cf. pattern())
01609 RegexPattern *fPatternOwned; // presumably non-NULL only when this matcher owns its pattern
01610
01611
01612 const UnicodeString *fInput;
01613 UText *fInputText;
01614 UText *fAltInputText;
01615
01616 int64_t fInputLength;
01617 int32_t fFrameSize;
01618
01619 int64_t fRegionStart;
01620 int64_t fRegionLimit;
01621
01622 int64_t fAnchorStart;
01623 int64_t fAnchorLimit;
01624
01625 int64_t fLookStart;
01626 int64_t fLookLimit;
01627
01628
01629 int64_t fActiveStart;
01630 int64_t fActiveLimit;
01631
01632
01633
01634 UBool fTransparentBounds; // cf. hasTransparentBounds()
01635 UBool fAnchoringBounds; // cf. hasAnchoringBounds()
01636
01637 UBool fMatch;
01638 int64_t fMatchStart;
01639 int64_t fMatchEnd;
01640
01641
01642 int64_t fLastMatchEnd;
01643
01644 int64_t fAppendPosition;
01645
01646
01647
01648 UBool fHitEnd; // cf. hitEnd()
01649 UBool fRequireEnd; // cf. requireEnd()
01650
01651
01652 UVector64 *fStack;
01653 REStackFrame *fFrame;
01654
01655
01656
01657 int64_t *fData;
01658 int64_t fSmallData[8]; // fixed in-object array of 8 int64_t; presumably what fData points at when little data is needed
01659
01660 int32_t fTimeLimit; // cf. getTimeLimit()
01661
01662
01663 int32_t fTime;
01664 int32_t fTickCounter;
01665
01666
01667
01668
01669 int32_t fStackLimit; // cf. getStackLimit()
01670
01671
01672 URegexMatchCallback *fCallbackFn; // cf. setMatchCallback()
01673
01674 const void *fCallbackContext;
01675
01676 UBool fInputUniStrMaybeMutable;
01677
01678 UBool fTraceDebug; // cf. setTrace()
01679
01680 UErrorCode fDeferredStatus; // an error code stored in the object itself
01681
01682
01683 RuleBasedBreakIterator *fWordBreakItr;
01684
01685
01686 };
01687
01688 U_NAMESPACE_END
01689 #endif // UCONFIG_NO_REGULAR_EXPRESSIONS
01690 #endif