/*------------------------------------------------------------------------------ * Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team * * Distributable under the terms of either the Apache License (Version 2.0) or * the GNU Lesser General Public License, as specified in the COPYING file. ------------------------------------------------------------------------------*/ #ifndef _lucene_analysis_Analyzers_ #define _lucene_analysis_Analyzers_ #if defined(_LUCENE_PRAGMA_ONCE) # pragma once #endif #include "CLucene/util/Reader.h" #include "AnalysisHeader.h" #include "CLucene/util/Misc.h" CL_NS_DEF(analysis) class CharTokenizer:public Tokenizer { private: int32_t offset, bufferIndex, dataLen; TCHAR buffer[LUCENE_MAX_WORD_LEN+1]; const TCHAR* ioBuffer; protected: // Returns true iff a character should be included in a token. This // tokenizer generates as tokens adjacent sequences of characters which // satisfy this predicate. Characters for which this is false are used to // define token boundaries and are not included in tokens. virtual bool isTokenChar(const TCHAR c) const = 0; // Called on each token character to normalize it before it is added to the // token. The default implementation does nothing. Subclasses may use this // to, e.g., lowercase tokens. virtual TCHAR normalize(const TCHAR c) const; public: CharTokenizer(CL_NS(util)::Reader* in); virtual ~CharTokenizer(){ } // Returns the next token in the stream, or null at EOS. // *** This is not a pointer. Use of it must deleted. bool next(Token* token); }; class LetterTokenizer:public CharTokenizer { public: // Construct a new LetterTokenizer. LetterTokenizer(CL_NS(util)::Reader* in): CharTokenizer(in) {} ~LetterTokenizer(){} protected: // Collects only characters which satisfy // {@link Character#isLetter(TCHAR)}. bool isTokenChar(const TCHAR c) const; }; // LowerCaseTokenizer performs the function of LetterTokenizer // and LowerCaseFilter together. 
It divides text at non-letters and converts // them to lower case. While it is functionally equivalent to the combination // of LetterTokenizer and LowerCaseFilter, there is a performance advantage // to doing the two tasks at once, hence this (redundant) implementation. //
// Note: this does a decent job for most European languages, but does a terrible
// job for some Asian languages, where words are not separated by spaces.
class LowerCaseTokenizer:public LetterTokenizer {
public:
// Construct a new LowerCaseTokenizer.
LowerCaseTokenizer(CL_NS(util)::Reader* in):
LetterTokenizer(in) {}
~LowerCaseTokenizer(){}
protected:
// Collects only characters which satisfy
// {@link Character#isLetter(TCHAR)}.
TCHAR normalize(const TCHAR chr) const;
};
/**
* A WhitespaceTokenizer divides text at whitespace: tokens are maximal runs
* of adjacent non-whitespace characters.
*/
class WhitespaceTokenizer: public CharTokenizer {
public:
    /** Construct a new WhitespaceTokenizer reading from the given Reader. */
    WhitespaceTokenizer(CL_NS(util)::Reader* in):CharTokenizer(in) {}
    ~WhitespaceTokenizer(){}
protected:
    /**
    * Accepts only characters which do NOT satisfy
    * {@link Character#isWhitespace(TCHAR)}.
    */
    bool isTokenChar(const TCHAR c) const;
};
/** An Analyzer whose token stream is a WhitespaceTokenizer over the input. */
class WhitespaceAnalyzer: public Analyzer {
public:
    ~WhitespaceAnalyzer(){}

    /** Builds the token stream for the given field's content. */
    TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
};
/**
* A basic Analyzer. Its tokenStream implementation lives in the .cpp;
* NOTE(review): by Lucene convention SimpleAnalyzer tokenizes with a
* LowerCaseTokenizer — confirm against the implementation file.
*/
class SimpleAnalyzer: public Analyzer {
public:
    ~SimpleAnalyzer(){}

    /** Builds the token stream for the given field's content. */
    TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
};
/**
* A TokenFilter that normalizes token text to lower case.
*
* @version $Id: Analyzers.h 2209 2006-06-15 14:12:41Z ustramooner $
*/
class LowerCaseFilter: public TokenFilter {
public:
    /**
    * @param in the TokenStream to read tokens from
    * @param deleteTokenStream passed through to TokenFilter; presumably
    *        transfers ownership of `in` to this filter — see TokenFilter.
    */
    LowerCaseFilter(TokenStream* in, bool deleteTokenStream):TokenFilter(in,deleteTokenStream) {}
    ~LowerCaseFilter(){}

    /** Fills in the next token, lower-cased; returns false at end of stream. */
    bool next(Token* token);
};
// Removes stop words from a token stream.
class StopFilter: public TokenFilter {
private:
//bvk: I found this to work faster with a non-hash table; the number of items
//in the stop table is not likely to make hashing worthwhile.
CL_NS(util)::CLSetList Example usage:
*
* In this example, StandardAnalyzer will be used for all fields except "firstname"
* and "lastname", for which KeywordAnalyzer will be used.
*
* A PerFieldAnalyzerWrapper can be used like any other analyzer, for both indexing
* and query parsing.
*/
class PerFieldAnalyzerWrapper : public Analyzer {
private:
Analyzer* defaultAnalyzer;
CL_NS(util)::CLHashMap
* For instance, 'à' will be replaced by 'a'.
*
*/
/**
* A TokenFilter that replaces accented characters from the ISO Latin 1
* character set by their unaccented equivalents.
* For instance, 'à' will be replaced by 'a'.
*/
class ISOLatin1AccentFilter: public TokenFilter {
public:
    /**
    * @param input the TokenStream to read tokens from
    * @param deleteTs passed through to TokenFilter; presumably transfers
    *        ownership of `input` to this filter — see TokenFilter.
    */
    ISOLatin1AccentFilter(TokenStream* input, bool deleteTs):
        TokenFilter(input,deleteTs) {}

    /**
    * Fills in the next token with accented characters replaced by their
    * unaccented equivalents; returns false at end of stream.
    */
    bool next(Token* token);
};
CL_NS_END
#endif
/* NOTE(review): the lines below are displaced fragments of the
 * PerFieldAnalyzerWrapper usage example (its doc comment was mangled earlier
 * in this file). Preserved here, commented out, pending reconstruction of
 * that class's documentation:
 *
 * PerFieldAnalyzerWrapper aWrapper =
 * new PerFieldAnalyzerWrapper(new StandardAnalyzer());
 * aWrapper.addAnalyzer("firstname", new KeywordAnalyzer());
 * aWrapper.addAnalyzer("lastname", new KeywordAnalyzer());
 *
 *
 *
 */