/*------------------------------------------------------------------------------
* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
*
* Distributable under the terms of either the Apache License (Version 2.0) or
* the GNU Lesser General Public License, as specified in the COPYING file.
------------------------------------------------------------------------------*/
#include "CLucene/StdHeader.h"
#include "FuzzyQuery.h"

#ifndef NO_FUZZY_QUERY

CL_NS_USE(index)
CL_NS_USE(util)
CL_NS_DEF(search)

/**
 * Constructor for enumeration of all terms from specified reader which share a prefix of
 * length prefixLength with term and which have a fuzzy similarity >
 * minSimilarity.
 *
 * The search term is split into two parts: the first prefixLength characters
 * (stored in prefix, compared exactly) and the remainder (stored in text,
 * compared by edit distance in termCompare).
 *
 * @param reader Delivers terms.
 * @param term Pattern term.
 * @param minSimilarity Minimum required similarity for terms from the reader. Default value is 0.5f.
 * @param prefixLength Length of required common prefix. Default value is 0.
 *        A value of 0, or one that is not smaller than the term length, disables the prefix.
 * @throws IOException
 */
FuzzyTermEnum::FuzzyTermEnum(const IndexReader* reader, Term* term, float_t minSimilarity, size_t prefixLength):
    distance(0),
    _endEnum(false),
    prefix(LUCENE_BLANK_STRING),
    prefixLength(0),
    minimumSimilarity(minSimilarity)
{
    //todo: check, used to pass (reader,term)
    //Func - Constructor
    //Pre  - reader contains a valid reference to an IndexReader
    //       term != NULL
    //Post - The instance has been created

    CND_PRECONDITION(term != NULL,"term is NULL");

    //NOTE(review): minimumSimilarity == 1.0f makes this a division by zero;
    //FuzzyQuery accepts 1.0f, so callers should be aware of it.
    scale_factor = 1.0f / (1.0f - minimumSimilarity);
    searchTerm = _CL_POINTER(term);

    //Initialize e to NULL, it is allocated lazily by editDistance
    e = NULL;
    eWidth = 0;
    eHeight = 0;

    const TCHAR* fullText = term->text();
    const size_t fullTextLen = term->textLength();

    if (prefixLength > 0 && prefixLength < fullTextLen){
        //Keep an owned copy of the exact-match prefix
        this->prefixLength = prefixLength;

        prefix = _CL_NEWARRAY(TCHAR,prefixLength+1);
        _tcsncpy(prefix,fullText,prefixLength);
        prefix[prefixLength]='\0';
    }

    //text holds the part of the search term AFTER the prefix, because
    //termCompare measures it against the part of each candidate term after
    //the prefix. (Previously text was truncated to the prefix itself, so the
    //prefix was being compared with the candidates' suffixes.)
    text = STRDUP_TtoT(fullText + this->prefixLength);
    textLen = fullTextLen - this->prefixLength;

    //Set the enumeration, positioned at the first term that can share the prefix
    Term* trm = _CLNEW Term(term->field(), prefix,false);
    setEnum(reader->terms(trm));
    _CLDECDELETE(trm);
}

FuzzyTermEnum::~FuzzyTermEnum(){
    //Func - Destructor
    //Pre  - true
    //Post - FuzzyTermEnum has been destroyed

    //Close the enumeration
    close();
}

bool FuzzyTermEnum::endEnum() {
    //Func - Returns the fact if the current term in the enumeration has reached the end
    //Pre  - true
    //Post - The boolean value of endEnum has been returned

    return _endEnum;
}

void FuzzyTermEnum::close(){
    //Func - Close the enumeration
    //Pre  - true
    //Post - The enumeration has been closed

    FilteredTermEnum::close();

    //Finalize the searchTerm
    _CLDECDELETE(searchTerm);
    //Destroy e
    _CLDELETE_ARRAY(e);

    _CLDELETE_CARRAY(text);

    //prefix is heap allocated only when a prefix is in use (the constructor
    //sets prefixLength and allocates prefix together). Do not decide this by
    //comparing the pointer with LUCENE_BLANK_STRING: two occurrences of the
    //same literal are not guaranteed to share an address, and a mismatch
    //would delete[] a string literal.
    if ( prefixLength > 0 ){
        _CLDELETE_CARRAY(prefix);
    }
}

bool FuzzyTermEnum::termCompare(Term* term) {
    //Func - Compares term with the searchTerm using the Levenshtein distance.
    //Pre  - term is NULL or term points to a Term
    //Post - if pre(term) is NULL then false is returned otherwise
    //       true is returned if term is in the same field, starts with the
    //       required prefix and its similarity is bigger than minimumSimilarity.
    //       If the field or the prefix does not match, the end of the
    //       enumeration is flagged and false is returned.

    if (term == NULL){
        return false;  //Note that endEnum is not set to true!
    }

    const TCHAR* termText = term->text();
    size_t termTextLen = term->textLength();

    //Check if the field name of searchTerm of term match
    //(we can use == because fields are interned)
    if ( searchTerm->field() == term->field() &&
        (prefixLength==0 || _tcsncmp(termText,prefix,prefixLength)==0 )) {

        //The prefix matched, so termTextLen >= prefixLength here
        const TCHAR* target = termText+prefixLength;
        size_t targetLen = termTextLen-prefixLength;

        //Calculate the Levenshtein distance
        int32_t dist = editDistance(text, target, textLen, targetLen);

        if (textLen == 0 || targetLen == 0){
            //min(textLen, targetLen) is 0, so the formula below would divide
            //by zero. editDistance returns the other length in this case, so
            //dist == 0 means both remainders are empty, i.e. identical.
            distance = (dist == 0) ? 1.0f : 0.0f;
            return dist == 0 && distance > minimumSimilarity;
        }

        distance = 1 - ((float_t)dist / (float_t)min(textLen, targetLen));
        return (distance > minimumSimilarity);
    }
    _endEnum = true;
    return false;
}

float_t FuzzyTermEnum::difference() {
    //Func - Returns the difference between the distance and the fuzzy threshold
    //       multiplied by the scale factor
    //Pre  - true
    //Post - The difference is returned

    return (float_t)((distance - minimumSimilarity) * scale_factor );
}

/** Finds and returns the smallest of three integers (file-local helper of editDistance). */
static inline int32_t minOfThree(const int32_t a, const int32_t b, const int32_t c) {
    const int32_t ab = (a < b) ? a : b;
    return (ab < c) ? ab : c;
}

int32_t FuzzyTermEnum::editDistance(const TCHAR* s, const TCHAR* t, const int32_t n, const int32_t m) {
    //Func - Calculates the Levenshtein distance also known as edit distance is a measure of similarity
    //       between two strings where the distance is measured as the number of character
    //       deletions, insertions or substitutions required to transform one string to
    //       the other string.
    //Pre  - s != NULL and contains the source string
    //       t != NULL and contains the target string
    //       n >= 0 and contains the length of the source string
    //       m >= 0 and contains the length of the target string
    //Post - The distance has been returned

    CND_PRECONDITION(s != NULL, "s is NULL");
    CND_PRECONDITION(t != NULL, "t is NULL");
    CND_PRECONDITION(n >= 0," n is a negative number");
    CND_PRECONDITION(m >= 0," m is a negative number");

    int32_t i;     // iterates through s
    int32_t j;     // iterates through t
    TCHAR s_i;     // ith character of s

    //An empty string is transformed by inserting every character of the other one
    if (n == 0)
        return m;
    if (m == 0)
        return n;

    //Check if the array must be reallocated because it is too small or does not exist.
    //e is an (n+1) x (m+1) matrix stored as e[i + j*eWidth] and is reused between calls.
    if (e == NULL || eWidth <= n || eHeight <= m) {
        //Delete e if possible
        _CLDELETE_ARRAY(e);
        //resize e
        eWidth  = max(eWidth, n+1);
        eHeight = max(eHeight, m+1);
        e = _CL_NEWARRAY(int32_t,eWidth*eHeight);
    }

    CND_CONDITION(e != NULL,"e is NULL");

    // init matrix e: distance from the empty string to every prefix of s and of t
    for (i = 0; i <= n; i++){
        e[i + (0*eWidth)] = i;
    }
    for (j = 0; j <= m; j++){
        e[0 + (j*eWidth)] = j;
    }

    // start computing edit distance
    for (i = 1; i <= n; i++) {
        s_i = s[i - 1];
        for (j = 1; j <= m; j++) {
            const int32_t left = e[i + (j*eWidth) - 1];       // e[i-1][j]
            const int32_t up   = e[i + ((j-1)*eWidth)];       // e[i][j-1]
            const int32_t diag = e[i + ((j-1)*eWidth) - 1];   // e[i-1][j-1]

            if (s_i != t[j-1]){
                // characters differ: every move, substitution included, costs 1
                e[i + (j*eWidth)] = minOfThree(left, up, diag) + 1;
            }else{
                // characters match: the diagonal move is free
                e[i + (j*eWidth)] = minOfThree(left+1, up+1, diag);
            }
        }
    }

    // we got the result!
    return e[n + ((m)*eWidth)];
}

/**
 * Create a new FuzzyQuery that will match terms with a similarity
 * of at least minimumSimilarity to term.
 * If a prefixLength > 0 is specified, a common prefix
 * of that length is also required.
 *
 * @param term the term to search for
 * @param minimumSimilarity a value between 0 and 1 to set the required similarity
 *  between the query term and the matching terms. For example, for a
 *  minimumSimilarity of 0.5 a term of the same length
 *  as the query term is considered similar to the query term if the edit distance
 *  between both terms is less than length(term)*0.5
 * @param prefixLength length of common (non-fuzzy) prefix
 * @throws IllegalArgumentException if minimumSimilarity is > 1 or < 0
 * or if prefixLength >= term.textLength().
 */
FuzzyQuery::FuzzyQuery(Term* term, float_t minimumSimilarity, size_t prefixLength):
    MultiTermQuery(term)
{
    //Func - Constructor
    //Pre  - term != NULL
    //Post - The instance has been created

    CND_PRECONDITION(term != NULL,"term is NULL");

    if (minimumSimilarity > 1.0f)
        _CLTHROWA(CL_ERR_IllegalArgument,"minimumSimilarity > 1");
    else if (minimumSimilarity < 0.0f)
        _CLTHROWA(CL_ERR_IllegalArgument,"minimumSimilarity < 0");

    this->minimumSimilarity = minimumSimilarity;

    if(prefixLength >= term->textLength())
        _CLTHROWA(CL_ERR_IllegalArgument,"prefixLength >= term.textLength()");
    this->prefixLength = prefixLength;
}

float_t FuzzyQuery::defaultMinSimilarity = 0.5f;

FuzzyQuery::~FuzzyQuery(){
    //Func - Destructor
    //Pre  - true
    //Post - Instance has been destroyed
}

TCHAR* FuzzyQuery::toString(const TCHAR* field) const{
    //Func - Returns the query string
    //Pre  - field != NULL
    //Post - The query string has been returned: the MultiTermQuery string
    //       followed by "~" and the minimum similarity

    CND_PRECONDITION(field != NULL,"field is NULL");

    StringBuffer buffer;
    //The string returned by the base class is released here once appended
    const TCHAR* b = MultiTermQuery::toString(field);

    buffer.append ( b );
    _CLDELETE_CARRAY(b);
    buffer.append( _T("~") );

    buffer.appendFloat(minimumSimilarity,1); //todo: how many digits?

    return buffer.toString();
}

const TCHAR* FuzzyQuery::getQueryName() const{
    //Func - Returns the name of the query
    //Pre  - true
    //post - The string FuzzyQuery has been returned

    return getClassName();
}

const TCHAR* FuzzyQuery::getClassName(){
    //Func - Returns the name of the query
    //Pre  - true
    //post - The string FuzzyQuery has been returned

    return _T("FuzzyQuery");
}

/**
 * Returns the minimum similarity that is required for this query to match.
 * @return float value between 0.0 and 1.0
 */
float_t FuzzyQuery::getMinSimilarity() const {
    return minimumSimilarity;
}

FuzzyQuery::FuzzyQuery(const FuzzyQuery& clone):
    MultiTermQuery(clone)
{
    //Func - Copy constructor
    //Pre  - clone is a valid FuzzyQuery
    //Post - The instance has the same minimum similarity and prefix length as clone

    this->minimumSimilarity = clone.getMinSimilarity();
    this->prefixLength = clone.getPrefixLength();

    //if(prefixLength < 0)
    //    _CLTHROWA(CL_ERR_IllegalArgument,"prefixLength < 0");
    //else
    //getTerm(false): the term is only inspected here, same accessor form as getEnum().
    //NOTE(review): presumably the default getTerm() hands out an extra reference
    //that would never be released - confirm against MultiTermQuery.
    if(prefixLength >= clone.getTerm(false)->textLength())
        _CLTHROWA(CL_ERR_IllegalArgument,"prefixLength >= term.textLength()");
}

Query* FuzzyQuery::clone() const{
    //Func - Returns a newly allocated copy of this query
    return _CLNEW FuzzyQuery(*this);
}

size_t FuzzyQuery::hashCode() const{
    //Func - Combines boost, term, minimum similarity and prefix length into a hash value

    //todo: we should give the query a seeding value... but
    //need to do it for all hashcode functions

    //getTerm(false): the term is only inspected here, same accessor form as getEnum().
    //NOTE(review): presumably the default getTerm() hands out an extra reference
    //that would never be released - confirm against MultiTermQuery.
    size_t val = Similarity::floatToByte(getBoost()) ^ getTerm(false)->hashCode();
    val ^= Similarity::floatToByte(this->getMinSimilarity());
    val ^= this->getPrefixLength();
    return val;
}

bool FuzzyQuery::equals(Query* other) const{
    //Func - Returns true if other is a FuzzyQuery with the same boost,
    //       minimum similarity, prefix length and term
    //Pre  - other != NULL

    if (!(other->instanceOf(FuzzyQuery::getClassName())))
        return false;

    FuzzyQuery* fq = (FuzzyQuery*)other;
    //getTerm(false): both terms are only inspected, same accessor form as getEnum().
    return (this->getBoost() == fq->getBoost())
        && this->getMinSimilarity() == fq->getMinSimilarity()
        && this->getPrefixLength() == fq->getPrefixLength()
        && getTerm(false)->equals(fq->getTerm(false));
}

/**
 * Returns the prefix length, i.e. the number of characters at the start
 * of a term that must be identical (not fuzzy) to the query term if the query
 * is to match that term.
 */
size_t FuzzyQuery::getPrefixLength() const {
    return prefixLength;
}

FilteredTermEnum* FuzzyQuery::getEnum(IndexReader* reader){
    //Func - Creates the term enumeration that delivers the fuzzy matches for this query
    //Post - A newly allocated FuzzyTermEnum has been returned

    Term* term = getTerm(false);
    FuzzyTermEnum* ret = _CLNEW FuzzyTermEnum(reader, term, minimumSimilarity, prefixLength);
    return ret;
}

CL_NS_END
#endif