/* ********************************************************************** * Copyright (C) 1999-2000 IBM and others. All rights reserved. ********************************************************************** * Date Name Description * 03/22/2000 helena Creation. ********************************************************************** */ #ifndef STRSRCH_H #define STRSRCH_H #include "unicode/utypes.h" #include "unicode/unistr.h" #include "unicode/chariter.h" #include "unicode/tblcoll.h" #include "unicode/brkiter.h" #include "srchiter.h" class SearchIterator; /** * StringSearch is a SearchIterator that provides * language-sensitive text searching based on the comparison rules defined * in a {@link RuleBasedCollator} object. * Instances of StringSearch function as iterators * maintain a current position and scan over text returning the index of * characters where the pattern occurs and the length of each match. *

* StringSearch uses a version of the fast Boyer-Moore search * algorithm that has been adapted to work with the large character set of * Unicode. See "Efficient Text Searching in Java", to be published in * Java Report in February, 1999, for further information on the algorithm. *

* Consult the SearchIterator documentation for information on * and examples of how to use instances of this class to implement text * searching. SearchIterator provides all of the necessary * API; this class only provides constructors and internal implementation * methods. * * @see SearchIterator * @see RuleBasedCollator * * @author Laura Werner * @version 1.0 */ class StringSearch : public SearchIterator { public: /** * Construct a StringSearch object using a specific collator and set * of boundary-detection rules. *

* @param pat The text for which this object will search. * * @param target The text in which to search for the pattern. * * @param coll A RuleBasedCollator object which defines the * language-sensitive comparison rules used to determine * whether text in the pattern and target matches. * * @param breaker A BreakIterator object used to constrain the matches * that are found. Matches whose start and end indices * in the target text are not boundaries as determined * by the BreakIterator are ignored. If this behavior * is not desired, null can be passed in instead. */ StringSearch(const UnicodeString& pat, CharacterIterator* target, RuleBasedCollator* coll, BreakIterator* breaker, UErrorCode& status); /** * Construct a StringSearch object using a specific collator. *

* @param pattern The text for which this object will search. * * @param target The text in which to search for the pattern. * * @param collator A RuleBasedCollator object which defines the * language-sensitive comparison rules used to determine * whether text in the pattern and target matches. */ StringSearch(const UnicodeString& pattern, CharacterIterator* target, RuleBasedCollator* collator, UErrorCode& status); /** * copy constructor */ StringSearch(const StringSearch& that); /** * Construct a StringSearch object using the collator and * character boundary detection rules for a given locale *

* @param pattern The text for which this object will search. * * @param target The text in which to search for the pattern. * * @param loc The locale whose collation and break-detection rules * should be used. * * @exception ClassCastException thrown if the collator for the specified * locale is not a RuleBasedCollator. */ StringSearch(const UnicodeString& pattern, CharacterIterator* target, const Locale& loc, UErrorCode& status); /** * Construct a StringSearch object using the collator for the default * locale *

* @param pattern The text for which this object will search. * * @param target The text in which to search for the pattern. * * @param collator A RuleBasedCollator object which defines the * language-sensitive comparison rules used to determine * whether text in the pattern and target matches. */ StringSearch(const UnicodeString& pattern, const UnicodeString& target, UErrorCode& status); virtual ~StringSearch(void); /** * Assignment operator. Sets this iterator to have the same behavior, * and iterate over the same text, as the one passed in. */ StringSearch& operator=(const StringSearch& that); /** * Equality operator. Returns TRUE if both BreakIterators are of the * same class, have the same behavior, and iterate over the same text. */ virtual UBool operator==(const SearchIterator& that) const; /** * Not-equal operator. If operator== returns TRUE, this returns FALSE, * and vice versa. */ UBool operator!=(const SearchIterator& that) const; /** * Returns a newly-constructed RuleBasedBreakIterator with the same * behavior, and iterating over the same text, as this one. */ virtual SearchIterator* clone(void) const; //------------------------------------------------------------------- // Getters and Setters //------------------------------------------------------------------- /** * Sets this object's strength property. The strength determines the * minimum level of difference considered significant during a * search. Generally, {@link Collator#TERTIARY} and * {@link Collator#IDENTICAL} indicate that all differences are * considered significant, {@link Collator#SECONDARY} indicates * that upper/lower case distinctions should be ignored, and * {@link Collator#PRIMARY} indicates that both case and accents * should be ignored. However, the exact meanings of these constants * are determined by individual Collator objects. *

* @see Collator#PRIMARY * @see Collator#SECONDARY * @see Collator#TERTIARY * @see Collator#IDENTICAL */ void setStrength(Collator::ECollationStrength newStrength, UErrorCode& status); /** * Returns this object's strength property, which indicates what level * of differences are considered significant during a search. *

* @see #setStrength */ Collator::ECollationStrength getStrength(void) const; /** * Set the collator to be used for this string search. Also changes * the search strength to match that of the new collator. *

* This method causes internal data such as Boyer-Moore shift tables * to be recalculated, but the iterator's position is unchanged. *

* @see #getCollator */ void setCollator(const RuleBasedCollator* coll, UErrorCode& status); /** * Return the RuleBasedCollator being used for this string search. */ const RuleBasedCollator& getCollator() const; /** * Set the pattern for which to search. * This method causes internal data such as Boyer-Moore shift tables * to be recalculated, but the iterator's position is unchanged. */ void setPattern(const UnicodeString& pat, UErrorCode& status); /** * Returns the pattern for which this object is searching. */ const UnicodeString& getPattern() const; /** * Set the target text which should be searched and resets the * iterator's position to point before the start of the new text. * This method is useful if you want to re-use an iterator to * search for the same pattern within a different body of text. */ virtual void setTarget(const UnicodeString& newText); /** * Set the target text which should be searched and resets the * iterator's position to point before the start of the target text. * This method is useful if you want to re-use an iterator to * search for the same pattern within a different body of text. * * @see #getTarget */ virtual void adoptTarget(CharacterIterator* iterator); /** Reset iterator */ virtual void reset(void); /** * Returns a unique class ID POLYMORPHICALLY. Pure virtual override. * This method is to implement a simple version of RTTI, since not all * C++ compilers support genuine RTTI. Polymorphic operator==() and * clone() methods call this method. * * @return The class ID for this object. All objects of a * given class have the same class ID. Objects of * other classes have different class IDs. */ inline virtual UClassID getDynamicClassID(void) const; /** * Returns the class ID for this class. This is useful only for * comparing to a return value from getDynamicClassID(). For example: * * Base* polymorphic_pointer = createPolymorphicObject(); * if (polymorphic_pointer->getDynamicClassID() == * Derived::getStaticClassID()) ... * * @return The class ID for all objects of this class. */ inline static UClassID getStaticClassID(void); protected: //------------------------------------------------------------------- // Privates //------------------------------------------------------------------- /** * Search forward for matching text, starting at a given location. * Clients should not call this method directly; instead they should call * {@link SearchIterator#next}. *

* If a match is found, this method returns the index at which the match * starts and calls {@link SearchIterator#setMatchLength} * with the number of characters in the target * text that make up the match. If no match is found, the method returns * DONE and does not call setMatchLength. *

* @param start The index in the target text at which the search starts. * * @return The index at which the matched text in the target starts, or DONE * if no match was found. *

* @see SearchIterator#next * @see SearchIterator#DONE */ virtual int32_t handleNext(int32_t start, UErrorCode& status); /** * Search backward for matching text ,starting at a given location. * Clients should not call this method directly; instead they should call * SearchIterator.previous(), which this method overrides. *

* If a match is found, this method returns the index at which the match * starts and calls {@link SearchIterator#setMatchLength} * with the number of characters in the target * text that make up the match. If no match is found, the method returns * DONE and does not call setMatchLength. *

* @param start The index in the target text at which the search starts. * * @return The index at which the matched text in the target starts, or DONE * if no match was found. *

* @see SearchIterator#previous * @see SearchIterator#DONE */ virtual int32_t handlePrev(int32_t start, UErrorCode& status); private: /** * Return a bitmask that will select only the portions of a collation * element that are significant at the given strength level. */ static int32_t getMask(Collator::ECollationStrength strength); void initialize(UErrorCode& status); /** * Method used by StringSearch to determine how far to the right to * shift the pattern during a Boyer-Moore search. * * @param curValue The current value in the target text * @param curIndex The index in the pattern at which we failed to match * curValue in the target text. */ int32_t getShift( int32_t curValue, int32_t curIndex ) const; /** * Method used by StringSearch to determine how far to the left to * shift the pattern during a reverse Boyer-Moore search. * * @param curValue The current value in the target text * @param curIndex The index in the pattern at which we failed to match * curValue in the target text. */ int32_t getBackShift( int32_t curValue, int32_t curIndex ) const; /** * Hash a collation element from its full size (32 bits) down into a * value that can be used as an index into the shift tables. Right * now we do a modulus by the size of the hash table. * * TODO: At some point I should experiment to see whether a slightly * more complicated hash function gives us a better distribution * on multilingual text. I doubt it will have much effect on * performance, though. */ static int32_t hash(int32_t order); //------------------------------------------------------------------------ // Private Data // CollationElementIterator *iter; RuleBasedCollator *collator; /* HSYS ? Why? Changes to this will not affect collator. no changes to the comparsion result */ Collator::ECollationStrength strength; //------------------------------------------------------------------------ // Everything from here on down is the data used to represent the // Boyer-Moore shift tables and the code that generates and manipulates // them. // int32_t *valueList; int32_t valueListLen; int32_t shiftTable[256]; int32_t backShiftTable[256]; UnicodeString pattern; // The pattern string int32_t normLen; // num. of collation elements in pattern. int32_t minLen; // Min of composed, decomposed versions int32_t maxLen; // Max CollationElementIterator *it; // to be removed private: /* to be removed */ void dumpTables(); /** * Class ID */ static char fgClassID; }; inline UBool StringSearch::operator!=(const SearchIterator& that) const { return !operator==(that); } inline UClassID StringSearch::getDynamicClassID(void) const { return StringSearch::getStaticClassID(); } inline UClassID StringSearch::getStaticClassID(void) { return (UClassID)(&fgClassID); } #endif