/*------------------------------------------------------------------------------
* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
*
* Distributable under the terms of either the Apache License (Version 2.0) or
* the GNU Lesser General Public License, as specified in the COPYING file.
------------------------------------------------------------------------------*/
#include "CLucene/StdHeader.h"
#include "QueryParserBase.h"

#include "CLucene/search/TermQuery.h"
#include "CLucene/search/PhraseQuery.h"
#include "CLucene/search/RangeQuery.h"
#include "CLucene/search/FuzzyQuery.h"
#include "CLucene/search/WildcardQuery.h"
#include "CLucene/search/PrefixQuery.h"

CL_NS_USE(search)
CL_NS_USE(util)
CL_NS_USE(analysis)
CL_NS_USE(index)

CL_NS_DEF(queryParser)

QueryParserBase::QueryParserBase(Analyzer* analyzer){
//Func - Constructor
//Pre  - true
//Post - instance has been created with PhraseSlop = 0, the OR default
//       operator and lowercasing of expanded terms switched on.
//       The analyzer pointer is stored as-is; this file never deletes it.

	this->analyzer = analyzer;
	this->defaultOperator = OR_OPERATOR;
	this->phraseSlop = 0;
	this->lowercaseExpandedTerms = true;
}

QueryParserBase::~QueryParserBase(){
//Func - Destructor
//Pre  - true
//Post - The instance has been destroyed
}

void QueryParserBase::discardEscapeChar(TCHAR* source) const{
//Func - Removes escaping backslashes from source, in place.
//       A backslash followed by another character is dropped and that
//       character is kept literally (so an escaped backslash survives as a
//       single backslash). A lone trailing backslash is kept.
//Pre  - source != NULL and is NUL terminated
//Post - source holds the unescaped text and is NUL terminated

	// Single-pass compaction. This replaces a _tcscpy() between overlapping
	// buffers, whose behaviour is undefined, and avoids the repeated shifting.
	TCHAR* out = source;
	for (const TCHAR* in = source; *in != 0; ++in) {
		if (*in == '\\' && in[1] != 0)
			++in; // skip the backslash, keep the escaped character
		*out++ = *in;
	}
	*out = 0;
}

void QueryParserBase::AddClause(std::vector<BooleanClause*>& clauses, int32_t conj, int32_t mods, Query* q){
//Func - Adds the next parsed clause.
//Pre  - clauses holds the clauses parsed so far
//       conj is the conjunction that introduced this clause (CONJ_AND, CONJ_OR, ...)
//       mods is the modifier that introduced this clause (MOD_NOT, MOD_REQ, ...)
//       q may be NULL (the term might have been filtered away by the analyzer)
//Post - The flags of the preceding clause have been adjusted according to
//       conj and, when q != NULL, a new BooleanClause wrapping q has been
//       appended to clauses.
// NOTE(review): the element type of clauses was lost in this copy of the
// file; BooleanClause* is inferred from how the elements are used below.

	bool required, prohibited;

	const uint32_t nPreviousClauses = static_cast<uint32_t>(clauses.size());

	// If this term is introduced by AND, make the preceding term required,
	// unless it's already prohibited.
	if (nPreviousClauses > 0 && conj == CONJ_AND) {
		BooleanClause* c = clauses[nPreviousClauses-1];
		if (!c->prohibited)
			c->required = true;
	}

	if (nPreviousClauses > 0 && defaultOperator == AND_OPERATOR && conj == CONJ_OR) {
		// If this term is introduced by OR, make the preceding term optional,
		// unless it's prohibited (that means we leave -a OR b but +a OR b-->a OR b)
		// notice if the input is a OR b, first term is parsed as required; without
		// this modification a OR b would parse as +a OR b
		BooleanClause* c = clauses[nPreviousClauses-1];
		if (!c->prohibited)
			c->required = false;
	}

	// We might have been passed a NULL query; the term might have been
	// filtered away by the analyzer.
	if (q == NULL)
		return;

	if (defaultOperator == OR_OPERATOR) {
		// We set REQUIRED if we're introduced by AND or +; PROHIBITED if
		// introduced by NOT or -; make sure not to set both.
		prohibited = (mods == MOD_NOT);
		required = (mods == MOD_REQ);
		if (conj == CONJ_AND && !prohibited) {
			required = true;
		}
	} else {
		// We set PROHIBITED if we're introduced by NOT or -; We set REQUIRED
		// if not PROHIBITED and not introduced by OR
		prohibited = (mods == MOD_NOT);
		required = (!prohibited && conj != CONJ_OR);
	}

	if ( required && prohibited )
		throwParserException( _T("Clause cannot be both required and prohibited"), ' ',0,0);

	clauses.push_back(_CLNEW BooleanClause(q,true, required, prohibited));
}

void QueryParserBase::throwParserException(const TCHAR* message, TCHAR ch, int32_t col, int32_t line )
{
//Func - Formats message with ch, col and line and throws it as a
//       CL_ERR_Parse error.
//Pre  - message != NULL and is a printf-style format string that consumes
//       at most the arguments ch, col and line (in that order)
//Post - An exception has been thrown; this function does not return normally

	TCHAR msg[1024];
	msg[0] = 0; // keep the buffer defined if formatting fails outright
	_sntprintf(msg,1024,message,ch,col,line);
	// Some _sntprintf implementations do not terminate the buffer when the
	// output is truncated, so terminate explicitly.
	msg[1023] = 0;
	_CLTHROWT (CL_ERR_Parse, msg );
}

Query* QueryParserBase::GetFieldQuery(const TCHAR* field, TCHAR* queryText, int32_t slop){
//Func - Same as GetFieldQuery(field, queryText), but when the result is a
//       PhraseQuery its slop is set to the given value.
//Pre  - see GetFieldQuery(field, queryText)
//Post - The query (possibly NULL) has been returned

	Query* ret = GetFieldQuery(field,queryText);
	if ( ret && ret->getQueryName() == PhraseQuery::getClassName() )
		static_cast<PhraseQuery*>(ret)->setSlop(slop);

	return ret;
}

Query* QueryParserBase::GetFieldQuery(const TCHAR* field, TCHAR* queryText){
//Func - Returns a query for the specified field.
//       Use the analyzer to get all the tokens, and then build a TermQuery,
//       PhraseQuery, or nothing based on the term count
//Pre  - field != NULL
//       analyzer contains a valid reference to an Analyzer
//       queryText != NULL and contains the query
//Post - A query instance has been returned for the specified field:
//         no tokens                            -> NULL
//         one token                            -> TermQuery
//         several tokens, distinct positions   -> PhraseQuery (slop = phraseSlop)
//         several tokens sharing one position  -> BooleanQuery of optional TermQuerys
//       Tokens sharing a position across more than one position throw
//       CL_ERR_UnsupportedOperation (MultiPhraseQuery is not implemented).

	CND_PRECONDITION(field != NULL, "field is NULL");
	CND_PRECONDITION(queryText != NULL, "queryText is NULL");

	//Instantiate a stringReader for queryText
	StringReader reader(queryText);
	TokenStream* source = analyzer->tokenStream(field, &reader);
	CND_CONDITION(source != NULL,"source is NULL");

	StringArrayConstWithDeletor v;

	Token t;
	int positionCount = 0;
	bool severalTokensAtSamePosition = false;

	//Get the tokens from the source
	try{
		while (source->next(&t)){
			v.push_back(STRDUP_TtoT(t.termText()));

			if (t.getPositionIncrement() != 0)
				positionCount += t.getPositionIncrement();
			else
				severalTokensAtSamePosition = true;
		}
	}catch(CLuceneError& err){
		// An IO error is treated as the end of the token stream; anything
		// else is passed on.
		if ( err.number() != CL_ERR_IO ) {
			_CLDELETE(source); // do not leak the stream on the error path
			throw;             // rethrow the original exception object
		}
	}
	_CLDELETE(source);

	//Check if there are any tokens retrieved
	if (v.size() == 0)
		return NULL;

	if (v.size() == 1){
		Term* term = _CLNEW Term(field, v[0]);
		Query* ret = _CLNEW TermQuery( term );
		_CLDECDELETE(term);
		return ret;
	}

	if (severalTokensAtSamePosition) {
		if (positionCount != 1) {
			_CLTHROWA(CL_ERR_UnsupportedOperation, "MultiPhraseQuery NOT Implemented");
		}

		// no phrase query:
		BooleanQuery* q = _CLNEW BooleanQuery; //todo: disableCoord=true here, but not implemented in BooleanQuery
		StringArrayConst::iterator itr = v.begin();
		while ( itr != v.end() ){
			Term* term = _CLNEW Term(field, *itr);
			q->add(_CLNEW TermQuery(term),true, false,false);//should occur...
			_CLDECDELETE(term);
			++itr;
		}
		return q;
	}

	PhraseQuery* q = _CLNEW PhraseQuery;
	q->setSlop(phraseSlop);

	StringArrayConst::iterator itr = v.begin();
	while ( itr != v.end() ){
		const TCHAR* data = *itr;
		Term* term = _CLNEW Term(field, data);
		q->add(term);
		_CLDECDELETE(term);
		++itr;
	}
	return q;
}

void QueryParserBase::setLowercaseExpandedTerms(bool lowercaseExpandedTerms){
//Func - Sets whether prefix, fuzzy, wildcard and range terms are lowercased
	this->lowercaseExpandedTerms = lowercaseExpandedTerms;
}

bool QueryParserBase::getLowercaseExpandedTerms() const {
//Func - Returns the flag set by setLowercaseExpandedTerms
	return lowercaseExpandedTerms;
}

void QueryParserBase::setDefaultOperator(int oper){
//Func - Sets the default operator (compared against OR_OPERATOR and
//       AND_OPERATOR in AddClause)
	this->defaultOperator=oper;
}

int QueryParserBase::getDefaultOperator() const{
//Func - Returns the operator set by setDefaultOperator
	return defaultOperator;
}

Query* QueryParserBase::ParseRangeQuery(const TCHAR* field, TCHAR* queryText, bool inclusive)
{
//Func - Tokenizes queryText with the analyzer, takes the first token as the
//       lower bound and the next token that is not the literal "TO" as the
//       upper bound, unescapes both and hands them to GetRangeQuery.
//Pre  - field and queryText are valid strings; analyzer is valid
//Post - The result of GetRangeQuery has been returned. Either bound is
//       passed as NULL when the analyzer did not produce a token for it.

	//todo: this must be fixed, [-1--5] (-1 to -5) should yield a result, but won't parse properly
	//because it uses an analyser, should split it up differently...

	// Use the analyzer to get all the tokens.  There should be 1 or 2.
	StringReader reader(queryText);
	TokenStream* source = analyzer->tokenStream(field, &reader);

	TCHAR* terms[2];
	terms[0]=NULL;terms[1]=NULL;
	Token t;
	bool tret=true;
	bool from=true;
	while(tret)
	{
		try{
			tret = source->next(&t);
		}catch (CLuceneError& err){
			// An IO error is treated as the end of the token stream.
			if ( err.number() == CL_ERR_IO )
				tret=false;
			else{
				// Release everything acquired so far before passing the
				// original exception on.
				_CLDELETE_CARRAY(terms[0]);
				_CLDELETE_CARRAY(terms[1]);
				_CLDELETE(source);
				throw;
			}
		}
		if (tret)
		{
			// Skip the "TO" separator between the two bounds.
			if ( !from && _tcscmp(t.termText(),_T("TO"))==0 )
				continue;

			TCHAR* tmp = STRDUP_TtoT(t.termText());
			discardEscapeChar(tmp);
			terms[from? 0 : 1] = tmp;

			if (from)
				from = false;
			else
				break;
		}
	}
	// The bounds have been copied out of the stream; it is no longer needed.
	_CLDELETE(source);

	Query* ret = NULL;
	try{
		ret = GetRangeQuery(field, terms[0], terms[1],inclusive);
	}catch(...){
		_CLDELETE_CARRAY(terms[0]);
		_CLDELETE_CARRAY(terms[1]);
		throw;
	}

	_CLDELETE_CARRAY(terms[0]);
	_CLDELETE_CARRAY(terms[1]);

	return ret;
}

Query* QueryParserBase::GetPrefixQuery(const TCHAR* field, TCHAR* termStr){
//Func - Builds a PrefixQuery for termStr in field.
//Pre  - field != NULL and field contains the name of the field that the query will use
//       termStr != NULL and is the token to use for building term for the query
//       (WITH or WITHOUT a trailing '*' character!)
//       termStr is lowercased in place when lowercaseExpandedTerms is set
//Post - A PrefixQuery instance has been returned

	CND_PRECONDITION(field != NULL,"field is NULL");
	CND_PRECONDITION(termStr != NULL,"termStr is NULL");

	if ( lowercaseExpandedTerms )
		_tcslwr(termStr);

	Term* t = _CLNEW Term(field, termStr);
	CND_CONDITION(t != NULL,"Could not allocate memory for term t");

	Query *q = _CLNEW PrefixQuery(t);
	CND_CONDITION(q != NULL,"Could not allocate memory for PrefixQuery q");

	_CLDECDELETE(t);
	return q;
}

Query* QueryParserBase::GetFuzzyQuery(const TCHAR* field, TCHAR* termStr){
//Func - Factory method for generating a query (similar to GetPrefixQuery). Called when parser parses
//       an input term token that has the fuzzy suffix (~) appended.
//Pre  - field != NULL and field contains the name of the field that the query will use
//       termStr != NULL and is the token to use for building term for the query
//       termStr is lowercased in place when lowercaseExpandedTerms is set
//Post - A FuzzyQuery instance has been returned

	CND_PRECONDITION(field != NULL,"field is NULL");
	CND_PRECONDITION(termStr != NULL,"termStr is NULL");

	if ( lowercaseExpandedTerms )
		_tcslwr(termStr);

	Term* t = _CLNEW Term(field, termStr);
	CND_CONDITION(t != NULL,"Could not allocate memory for term t");

	Query *q = _CLNEW FuzzyQuery(t);
	CND_CONDITION(q != NULL,"Could not allocate memory for FuzzyQuery q");

	_CLDECDELETE(t);
	return q;
}

Query* QueryParserBase::GetWildcardQuery(const TCHAR* field, TCHAR* termStr){
//Func - Builds a WildcardQuery for termStr in field.
//Pre  - field != NULL, termStr != NULL
//       termStr is lowercased in place when lowercaseExpandedTerms is set
//Post - A WildcardQuery instance has been returned

	CND_PRECONDITION(field != NULL,"field is NULL");
	CND_PRECONDITION(termStr != NULL,"termStr is NULL");

	if ( lowercaseExpandedTerms )
		_tcslwr(termStr);

	Term* t = _CLNEW Term(field, termStr);
	CND_CONDITION(t != NULL,"Could not allocate memory for term t");

	Query* q = _CLNEW WildcardQuery(t);
	_CLDECDELETE(t);

	return q;
}

Query* QueryParserBase::GetBooleanQuery(std::vector<BooleanClause*>& clauses){
//Func - Combines the given clauses into one BooleanQuery.
//Pre  - true
//Post - NULL has been returned when clauses is empty, otherwise a
//       BooleanQuery to which every clause has been added.
// NOTE(review): the loop header and the template argument were damaged in
// this copy of the file; they are reconstructed from the surviving
// "uint32_t i=0" / "add(clauses[i])" fragments - confirm against the header.

	if ( clauses.size() == 0 )
		return NULL;

	BooleanQuery* query = _CLNEW BooleanQuery();
	//Condition check to see if query has been allocated properly
	CND_CONDITION(query != NULL, "No memory could be allocated for query");

	//iterate through all the clauses
	for( uint32_t i=0;i<clauses.size();i++ ){
		query->add(clauses[i]);
	}
	return query;
}

CL_NS(search)::Query* QueryParserBase::GetRangeQuery(const TCHAR* field, TCHAR* part1, TCHAR* part2, bool inclusive){
//Func - Builds a RangeQuery on field from part1 (lower bound) to part2
//       (upper bound).
//Pre  - field != NULL
//       part1 and part2 are the bounds; either may be NULL when
//       ParseRangeQuery found fewer than two tokens. Non-NULL bounds are
//       lowercased in place when lowercaseExpandedTerms is set.
//Post - NULL has been returned when both bounds are missing, otherwise a
//       RangeQuery instance.

	//todo: does jlucene handle rangequeries differently? if we are using
	//a certain type of analyser, the terms may be filtered out, which
	//is not necessarily what we want.

	// Nothing to build a range from.
	if (part1 == NULL && part2 == NULL)
		return NULL;

	if (lowercaseExpandedTerms) {
		if (part1 != NULL) _tcslwr(part1);
		if (part2 != NULL) _tcslwr(part2);
	}

	//todo: should see if we can parse the strings as dates... currently we leave that up to the end-developer...
	// A missing bound is passed on as a NULL term rather than building a
	// Term from a NULL string.
	// NOTE(review): assumes RangeQuery accepts a single NULL bound - confirm.
	Term* t1 = NULL;
	Term* t2 = NULL;
	if (part1 != NULL) t1 = _CLNEW Term(field,part1);
	if (part2 != NULL) t2 = _CLNEW Term(field,part2);

	Query* ret = _CLNEW RangeQuery(t1, t2, inclusive);

	if (t1 != NULL) { _CLDECDELETE(t1); }
	if (t2 != NULL) { _CLDECDELETE(t2); }

	return ret;
}

CL_NS_END