/*------------------------------------------------------------------------------ * Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team * * Distributable under the terms of either the Apache License (Version 2.0) or * the GNU Lesser General Public License, as specified in the COPYING file. ------------------------------------------------------------------------------*/ #ifndef _lucene_analysis_Analyzers_ #define _lucene_analysis_Analyzers_ #if defined(_LUCENE_PRAGMA_ONCE) # pragma once #endif #include "CLucene/util/Reader.h" #include "AnalysisHeader.h" #include "CLucene/util/Misc.h" CL_NS_DEF(analysis) /** An abstract base class for simple, character-oriented tokenizers.*/ class CharTokenizer:public Tokenizer { private: int32_t offset, bufferIndex, dataLen; TCHAR buffer[LUCENE_MAX_WORD_LEN+1]; const TCHAR* ioBuffer; protected: /** Returns true iff a character should be included in a token. This * tokenizer generates as tokens adjacent sequences of characters which * satisfy this predicate. Characters for which this is false are used to * define token boundaries and are not included in tokens. */ virtual bool isTokenChar(const TCHAR c) const = 0; /** Called on each token character to normalize it before it is added to the * token. The default implementation does nothing. Subclasses may use this * to, e.g., lowercase tokens. */ virtual TCHAR normalize(const TCHAR c) const; public: CharTokenizer(CL_NS(util)::Reader* in); virtual ~CharTokenizer(){ } bool next(Token* token); }; /** A LetterTokenizer is a tokenizer that divides text at non-letters. That's to say, it defines tokens as maximal strings of adjacent letters, as defined by java.lang.Character.isLetter() predicate. Note: this does a decent job for most European languages, but does a terrible job for some Asian languages, where words are not separated by spaces. */ class LetterTokenizer:public CharTokenizer { public: // Construct a new LetterTokenizer. 
LetterTokenizer(CL_NS(util)::Reader* in): CharTokenizer(in) {} ~LetterTokenizer(){} protected: /** Collects only characters which satisfy _istalpha.*/ bool isTokenChar(const TCHAR c) const; }; /** * LowerCaseTokenizer performs the function of LetterTokenizer * and LowerCaseFilter together. It divides text at non-letters and converts * them to lower case. While it is functionally equivalent to the combination * of LetterTokenizer and LowerCaseFilter, there is a performance advantage * to doing the two tasks at once, hence this (redundant) implementation. *
* Note: this does a decent job for most European languages, but does a terrible
* job for some Asian languages, where words are not separated by spaces.
*/
class LowerCaseTokenizer:public LetterTokenizer {
public:
/** Construct a new LowerCaseTokenizer. */
LowerCaseTokenizer(CL_NS(util)::Reader* in):
LetterTokenizer(in) {}
~LowerCaseTokenizer(){}
protected:
/** Collects only characters which satisfy _totlower. */
TCHAR normalize(const TCHAR chr) const;
};
/** A WhitespaceTokenizer divides text at whitespace: each maximal run of
 *  non-whitespace characters forms one token. */
class WhitespaceTokenizer: public CharTokenizer {
public:
	/** Construct a new WhitespaceTokenizer over the given reader. */
	WhitespaceTokenizer(CL_NS(util)::Reader* in):CharTokenizer(in) {}
	~WhitespaceTokenizer(){}
protected:
	/** Collects only characters which do NOT satisfy _istspace. */
	bool isTokenChar(const TCHAR c) const;
};
/** An Analyzer that uses WhitespaceTokenizer. */
class WhitespaceAnalyzer: public Analyzer {
public:
	/** Returns a token stream over reader's content for the given field
	 *  (presumably a WhitespaceTokenizer — the implementation is in the .cpp;
	 *  fieldName's role, if any, should be confirmed there). */
	TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
	/** Marked virtual so deleting through an Analyzer* is safe — matches the
	 *  style of KeywordAnalyzer elsewhere in this header. */
	virtual ~WhitespaceAnalyzer(){}
};
/** An Analyzer that filters LetterTokenizer with LowerCaseFilter. */
class SimpleAnalyzer: public Analyzer {
public:
	/** Returns a token stream over reader's content for the given field
	 *  (implementation in the .cpp; see the class comment for the intended
	 *  tokenizer/filter chain). */
	TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
	/** Marked virtual so deleting through an Analyzer* is safe — matches the
	 *  style of KeywordAnalyzer elsewhere in this header. */
	virtual ~SimpleAnalyzer(){}
};
/**
* Normalizes token text to lower case.
*/
class LowerCaseFilter: public TokenFilter {
public:
LowerCaseFilter(TokenStream* in, bool deleteTokenStream):TokenFilter(in,deleteTokenStream) {}
~LowerCaseFilter(){}
bool next(Token* token);
};
/**
 * Removes stop words from a token stream.
 */
class StopFilter: public TokenFilter {
private:
//bvk: i found this to work faster with a non-hash table. the number of items
//in the stop table is not like to make it worth having hashing.
// NOTE(review): the text from here to the PerFieldAnalyzerWrapper member list
// below is corrupted — CLSetList's template arguments, the remainder of the
// StopFilter class (constructors, next(), closing brace), and the opening of
// the PerFieldAnalyzerWrapper documentation block have been lost, and doc
// text is fused directly onto this code line. Restore this section from the
// upstream CLucene sources; it cannot compile as-is.
CL_NS(util)::CLSetList Example usage:
 *
 * In this example, StandardAnalyzer will be used for all fields except "firstname"
 * and "lastname", for which KeywordAnalyzer will be used.
 *
 * A PerFieldAnalyzerWrapper can be used like any other analyzer, for both indexing
 * and query parsing.
 */
class PerFieldAnalyzerWrapper : public Analyzer {
private:
Analyzer* defaultAnalyzer;
// NOTE(review): truncated — CLHashMap's template arguments and the rest of
// this class (addAnalyzer/tokenStream declarations and the closing brace)
// are missing from this copy of the file. Restore from upstream CLucene.
CL_NS(util)::CLHashMap
/**
 * Replaces accented characters by their unaccented equivalents.
 * For instance, 'à' will be replaced by 'a'.
 */
class ISOLatin1AccentFilter: public TokenFilter {
public:
	/** Wraps the given stream; deleteTs presumably transfers ownership of it
	 *  to this filter (TokenFilter's contract — confirm in AnalysisHeader.h). */
	ISOLatin1AccentFilter(TokenStream* input, bool deleteTs): TokenFilter(input,deleteTs) {}
	/** Fetches the next token, replacing accented characters in its text by
	 *  their unaccented equivalents. */
	bool next(Token* token);
};
/**
 * Emits the entire input as a single token.
 */
class KeywordTokenizer: public Tokenizer {
private:
	LUCENE_STATIC_CONSTANT(int, DEFAULT_BUFFER_SIZE = 256);
	bool done;       // set once the single token has been emitted — presumably; confirm in the .cpp
	int bufferSize;
public:
	/** Construct over the given reader; a negative bufferSize presumably
	 *  selects DEFAULT_BUFFER_SIZE (confirm in the .cpp). */
	KeywordTokenizer(CL_NS(util)::Reader* input, int bufferSize=-1);
	virtual ~KeywordTokenizer();
	/** Returns the whole input as one token on the first call. */
	bool next(Token* token);
};
/**
 * "Tokenizes" the entire stream as a single token — useful for data such as
 * zip codes, ids, and some product names.
 */
class KeywordAnalyzer: public Analyzer {
public:
	/** Returns a token stream emitting reader's whole content as one token
	 *  (presumably a KeywordTokenizer — implementation in the .cpp). */
	TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
	virtual ~KeywordAnalyzer(){}
};
/**
* Removes words that are too long and too short from the stream.
*
*/
class LengthFilter: public TokenFilter {
private:
int _min;
int _max;
public:
/**
* Build a filter that removes words that are too long or too
* short from the text.
*/
LengthFilter(TokenStream* in, int _min, int _max);
/**
* Returns the next input Token whose termText() is the right len
*/
bool next(Token* token);
};
CL_NS_END
#endif
/* NOTE(review): the lines below are comment fragments displaced past #endif by
 * whatever produced this copy of the file — they are the example-usage text
 * belonging to the PerFieldAnalyzerWrapper documentation block above. Wrapped
 * in a comment here so the file stays parseable; re-attach them upstream.
 *
 * PerFieldAnalyzerWrapper aWrapper =
 *   new PerFieldAnalyzerWrapper(new StandardAnalyzer());
 * aWrapper.addAnalyzer("firstname", new KeywordAnalyzer());
 * aWrapper.addAnalyzer("lastname", new KeywordAnalyzer());
 */