/*------------------------------------------------------------------------------
* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
*
* Distributable under the terms of either the Apache License (Version 2.0) or
* the GNU Lesser General Public License, as specified in the COPYING file.
------------------------------------------------------------------------------*/
#include "CLucene/StdHeader.h"
#include "FuzzyQuery.h"
#ifndef NO_FUZZY_QUERY
CL_NS_USE(index)
CL_NS_USE(util)
CL_NS_DEF(search)
/**
* Constructor for enumeration of all terms from specified reader which share
* a prefix of length prefixLength with term and which have a fuzzy
* similarity > minSimilarity.
*
* @param reader Delivers terms.
* @param term Pattern term.
* @param minSimilarity Minimum required similarity for terms from the reader. Default value is 0.5f.
* @param prefixLength Length of required common prefix. Default value is 0.
* @throws IOException
*/
FuzzyTermEnum::FuzzyTermEnum(const IndexReader* reader, Term* term, float_t minSimilarity, size_t prefixLength):
    distance(0),
    _endEnum(false),
    prefix(LUCENE_BLANK_STRING),
    prefixLength(0),
    minimumSimilarity(minSimilarity)
{ //todo: check, used to pass (reader,term)
    //Func - Constructor. Splits the pattern term into an exact prefix and a
    //       fuzzy remainder, then hands reader->terms(Term(field, prefix))
    //       to setEnum().
    //Pre  - reader contains a valid reference to an IndexReader
    //       term != NULL
    //Post - The instance has been created:
    //       prefix/prefixLength hold the part that must match exactly,
    //       text/textLen hold the part of the pattern that follows it.
    CND_PRECONDITION(term != NULL,"term is NULL");

    //NOTE(review): minimumSimilarity == 1 makes this divisor zero, and the
    //FuzzyQuery constructor in this file only rejects values > 1 - confirm
    //whether that input needs explicit handling.
    scale_factor = 1.0f / (1.0f - minimumSimilarity);
    searchTerm = _CL_POINTER(term);

    //The edit distance matrix is allocated on demand by editDistance()
    e = NULL;
    eWidth = 0;
    eHeight = 0;

    textLen = term->textLength();

    //A prefix is only honoured when it is non-empty and strictly shorter than
    //the pattern; otherwise the members keep their initial values
    //(blank prefix, prefixLength 0) and the whole pattern is matched fuzzily.
    if(prefixLength > 0 && prefixLength < textLen){
        this->prefixLength = prefixLength;
        prefix = _CL_NEWARRAY(TCHAR,prefixLength+1);
        _tcsncpy(prefix,term->text(),prefixLength);
        prefix[prefixLength]='\0';
        textLen -= prefixLength;
    }

    //termCompare() measures the edit distance against the part of every
    //candidate term that follows the prefix, so text has to be the part of
    //the pattern that follows the prefix too - not the prefix itself.
    const TCHAR* remainder = term->text() + this->prefixLength;
    text = STRDUP_TtoT(remainder);

    //Set the enumeration
    Term* trm = _CLNEW Term(term->field(), prefix,false);
    setEnum(reader->terms(trm));
    _CLDECDELETE(trm);
}
FuzzyTermEnum::~FuzzyTermEnum(){
    //Func - Destructor
    //Pre  - true
    //Post - close() has been invoked, so everything it releases is gone

    close();
}
bool FuzzyTermEnum::endEnum() {
    //Func - Reports whether the enumeration has been marked as finished
    //       (termCompare() sets the flag once a term no longer matches the
    //       field or the prefix)
    //Pre  - true
    //Post - The current value of _endEnum has been returned

    return _endEnum;
}
void FuzzyTermEnum::close(){
    //Func - Closes the enumeration and releases what this instance holds
    //Pre  - true
    //Post - The base enumeration has been closed and searchTerm, e, text and
    //       (when the constructor allocated one) prefix have been released

    //The base class goes first
    FilteredTermEnum::close();

    //Release the pattern term taken with _CL_POINTER in the constructor
    _CLDECDELETE(searchTerm);

    //Free the edit distance matrix and the pattern text
    _CLDELETE_ARRAY(e);
    _CLDELETE_CARRAY(text);

    //prefix still equals LUCENE_BLANK_STRING when no prefix was allocated
    if ( prefix != LUCENE_BLANK_STRING ) {
        _CLDELETE_CARRAY(prefix);
    }
}
bool FuzzyTermEnum::termCompare(Term* term) {
    //Func - Compares term with the searchTerm using the Levenshtein distance.
    //Pre  - term is NULL or term points to a Term
    //Post - false is returned when term is NULL (endEnum is left untouched).
    //       When term has another field or does not start with the prefix,
    //       _endEnum is set and false is returned.
    //       Otherwise distance holds
    //         1 - editDistance / min(length of pattern, length of candidate)
    //       (both lengths measured after the prefix) and true is returned
    //       iff distance > minimumSimilarity.

    if (term == NULL){
        return false; //Note that endEnum is not set to true!
    }

    const TCHAR* termText = term->text();
    size_t termTextLen = term->textLength();

    //Check if the field name of searchTerm of term match
    //(we can use == because fields are interned)
    if ( searchTerm->field() == term->field() &&
        (prefixLength==0 || _tcsncmp(termText,prefix,prefixLength)==0 )) {

        //Only what follows the common prefix takes part in the fuzzy match
        const TCHAR* target = termText+prefixLength;
        size_t targetLen = termTextLen-prefixLength;

        //With an empty side the normalising length below would be zero.
        //The unguarded division produced NaN or -infinity, neither of which
        //ever compares greater than minimumSimilarity, so such a term is
        //rejected here without dividing by zero.
        if (textLen == 0 || targetLen == 0) {
            distance = 0;
            return false;
        }

        //Calculate the Levenshtein distance
        int32_t dist = editDistance(text, target, textLen, targetLen);
        distance = 1 - ((float_t)dist / (float_t)min(textLen, targetLen));
        return (distance > minimumSimilarity);
    }

    //Different field or prefix: nothing further in the enumeration can match
    _endEnum = true;
    return false;
}
float_t FuzzyTermEnum::difference() {
    //Func - Returns how far the last computed distance lies above
    //       minimumSimilarity, multiplied by scale_factor
    //Pre  - true
    //Post - (distance - minimumSimilarity) * scale_factor has been returned

    return static_cast<float_t>( (distance - minimumSimilarity) * scale_factor );
}
/** Finds and returns the smallest of three integers. */
static inline int32_t minOfThree(const int32_t a, const int32_t b, const int32_t c) {
    const int32_t ab = (a < b) ? a : b;
    return (ab < c) ? ab : c;
}

int32_t FuzzyTermEnum::editDistance(const TCHAR* s, const TCHAR* t, const int32_t n, const int32_t m) {
    //Func - Calculates the Levenshtein distance, also known as edit distance: a measure of
    //       similarity between two strings where the distance is the number of character
    //       deletions, insertions or substitutions required to transform one string into
    //       the other string.
    //Pre  - s != NULL and contains the source string
    //       t != NULL and contains the target string
    //       n >= 0 and contains the length of the source string
    //       m >= 0 and contains the length of the target string
    //Post - The distance has been returned. The work matrix e (with eWidth and
    //       eHeight) may have been reallocated; it is kept for later calls.

    CND_PRECONDITION(s != NULL, "s is NULL");
    CND_PRECONDITION(t != NULL, "t is NULL");
    CND_PRECONDITION(n >= 0," n is a negative number");
    CND_PRECONDITION(m >= 0," m is a negative number");

    //Transforming to or from an empty string costs one edit per character
    if (n == 0)
        return m;
    if (m == 0)
        return n;

    //Check if the array must be reallocated because it is too small or does not exist.
    //The matrix needs (n+1) columns and (m+1) rows and never shrinks.
    if (e == NULL || eWidth <= n || eHeight <= m) {
        //Delete e if possible
        _CLDELETE_ARRAY(e);
        //resize e
        eWidth = max(eWidth, n+1);
        eHeight = max(eHeight, m+1);
        e = _CL_NEWARRAY(int32_t,eWidth*eHeight);
    }
    CND_CONDITION(e != NULL,"e is NULL");

    int32_t i; // iterates through s
    int32_t j; // iterates through t

    //Cell (i,j) lives at e[i + j*eWidth] and holds the distance between the
    //first i characters of s and the first j characters of t.

    // init matrix e: distances against the empty string
    for (i = 0; i <= n; i++){
        e[i] = i;
    }
    for (j = 0; j <= m; j++){
        e[j*eWidth] = j;
    }

    // start computing edit distance
    for (i = 1; i <= n; i++) {
        const TCHAR s_i = s[i - 1]; // ith character of s
        for (j = 1; j <= m; j++) {
            const int32_t left     = e[(i-1) + (j*eWidth)];
            const int32_t above    = e[i + ((j-1)*eWidth)];
            const int32_t diagonal = e[(i-1) + ((j-1)*eWidth)];
            //a substitution is free when the two characters are equal
            const int32_t substitution = (s_i != t[j-1]) ? 1 : 0;
            e[i + (j*eWidth)] = minOfThree(left + 1, above + 1, diagonal + substitution);
        }
    }

    // we got the result!
    return e[n + (m*eWidth)];
}
/**
* Create a new FuzzyQuery that will match terms with a similarity
* of at least minimumSimilarity to term.
* If a prefixLength > 0 is specified, a common prefix
* of that length is also required.
*
* @param term the term to search for
* @param minimumSimilarity a value between 0 and 1 to set the required similarity
* between the query term and the matching terms. For example, for a
* minimumSimilarity of 0.5 a term of the same length
* as the query term is considered similar to the query term if the edit distance
* between both terms is less than length(term)*0.5
* @param prefixLength length of common (non-fuzzy) prefix
* @throws IllegalArgumentException if minimumSimilarity is > 1 or < 0
* or if prefixLength >= term.textLength().
*/
FuzzyQuery::FuzzyQuery(Term* term, float_t minimumSimilarity, size_t prefixLength):
    MultiTermQuery(term)
{
    //Func - Constructor; validates and stores the similarity threshold and
    //       the length of the exact prefix
    //Pre  - term != NULL
    //Post - The instance has been created, or CL_ERR_IllegalArgument has
    //       been thrown for an argument outside its accepted range

    CND_PRECONDITION(term != NULL,"term is NULL");

    //NOTE(review): exactly 1.0 passes this check, while the FuzzyTermEnum
    //constructor in this file divides by (1 - minimumSimilarity) - confirm
    //whether >= 1 should be rejected instead.
    if (minimumSimilarity > 1.0f) {
        _CLTHROWA(CL_ERR_IllegalArgument,"minimumSimilarity > 1");
    }
    if (minimumSimilarity < 0.0f) {
        _CLTHROWA(CL_ERR_IllegalArgument,"minimumSimilarity < 0");
    }
    this->minimumSimilarity = minimumSimilarity;

    //The exact prefix has to be strictly shorter than the term
    if (prefixLength >= term->textLength()) {
        _CLTHROWA(CL_ERR_IllegalArgument,"prefixLength >= term.textLength()");
    }
    this->prefixLength = prefixLength;
}
// Definition of the static member FuzzyQuery::defaultMinSimilarity, initialised to 0.5f.
float_t FuzzyQuery::defaultMinSimilarity = 0.5f;
FuzzyQuery::~FuzzyQuery(){
    //Func - Destructor
    //Pre  - true
    //Post - Instance has been destroyed; this class releases nothing itself
}
TCHAR* FuzzyQuery::toString(const TCHAR* field) const{
    //Func - Builds the query string: the text produced by
    //       MultiTermQuery::toString(field), then "~", then minimumSimilarity
    //Pre  - field != NULL
    //Post - The string obtained from StringBuffer::toString() has been returned

    CND_PRECONDITION(field != NULL,"field is NULL");

    StringBuffer out;

    //The base class text is appended and then freed here
    const TCHAR* baseText = MultiTermQuery::toString(field);
    out.append ( baseText );
    _CLDELETE_CARRAY(baseText);

    out.append( _T("~") );
    out.appendFloat(minimumSimilarity,1); //todo: how many digits?

    return out.toString();
}
const TCHAR* FuzzyQuery::getQueryName() const{
    //Func - Returns the name of the query
    //Pre  - true
    //Post - The value of getClassName() has been returned

    return getClassName();
}
const TCHAR* FuzzyQuery::getClassName(){
    //Func - Returns the name of this query class
    //Pre  - true
    //Post - The string "FuzzyQuery" has been returned

    return _T("FuzzyQuery");
}
/**
* Returns the minimum similarity that a term needs in order to match this
* query.
* @return the stored minimumSimilarity; the Term-taking constructor only
* accepts values from 0.0 to 1.0
*/
float_t FuzzyQuery::getMinSimilarity() const {
    return minimumSimilarity;
}
FuzzyQuery::FuzzyQuery(const FuzzyQuery& clone):
    MultiTermQuery(clone)
{
    //Func - Copy constructor
    //Pre  - true
    //Post - minimumSimilarity and prefixLength have been taken over from
    //       clone, or CL_ERR_IllegalArgument has been thrown when the prefix
    //       is not shorter than clone's term

    this->minimumSimilarity = clone.getMinSimilarity();
    this->prefixLength = clone.getPrefixLength();

    //prefixLength is unsigned, so only the upper bound is checked
    //(a "prefixLength < 0" check used to be sketched here, commented out)
    if (prefixLength >= clone.getTerm()->textLength()) {
        _CLTHROWA(CL_ERR_IllegalArgument,"prefixLength >= term.textLength()");
    }
}
Query* FuzzyQuery::clone() const{
    //Func - Creates a copy of this query through the copy constructor
    //Pre  - true
    //Post - A newly allocated FuzzyQuery has been returned

    Query* copy = _CLNEW FuzzyQuery(*this);
    return copy;
}
size_t FuzzyQuery::hashCode() const{
    //Func - Combines boost, term, minimum similarity and prefix length into
    //       one hash value by XOR
    //Pre  - true
    //Post - The hash value has been returned

    //todo: we should give the query a seeding value... but
    //need to do it for all hascode functions

    //NOTE(review): getTerm() is called with its default argument here while
    //getEnum() passes false explicitly - confirm the default does not hand
    //out a reference that is never released.
    const size_t boostAndTerm = Similarity::floatToByte(getBoost()) ^ getTerm()->hashCode();
    const size_t similarityBits = Similarity::floatToByte(this->getMinSimilarity());

    return boostAndTerm ^ similarityBits ^ this->getPrefixLength();
}
bool FuzzyQuery::equals(Query* other) const{
    //Func - Tests whether other is a FuzzyQuery with the same boost, minimum
    //       similarity, prefix length and term
    //Pre  - other != NULL
    //Post - true has been returned iff all four properties are equal

    //Anything that is not a FuzzyQuery can never be equal
    if (!(other->instanceOf(FuzzyQuery::getClassName())))
        return false;

    FuzzyQuery* that = (FuzzyQuery*)other;

    //Cheap scalar comparisons first, the term comparison last
    if (this->getBoost() != that->getBoost())
        return false;
    if (this->getMinSimilarity() != that->getMinSimilarity())
        return false;
    if (this->getPrefixLength() != that->getPrefixLength())
        return false;

    return getTerm()->equals(that->getTerm());
}
/**
* Returns the prefix length: the number of leading characters of a term
* that have to be identical (not fuzzy) to the query term for the term to
* be matched by this query.
*/
size_t FuzzyQuery::getPrefixLength() const {
    return prefixLength;
}
FilteredTermEnum* FuzzyQuery::getEnum(IndexReader* reader){
    //Func - Creates the term enumeration for this query on reader
    //Pre  - reader is passed on to the FuzzyTermEnum constructor unchecked
    //Post - A newly allocated FuzzyTermEnum, built from this query's term,
    //       minimumSimilarity and prefixLength, has been returned

    return _CLNEW FuzzyTermEnum(reader, getTerm(false), minimumSimilarity, prefixLength);
}
CL_NS_END
#endif