/*------------------------------------------------------------------------------
* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
*
* Distributable under the terms of either the Apache License (Version 2.0) or
* the GNU Lesser General Public License, as specified in the COPYING file.
------------------------------------------------------------------------------*/
#ifndef _lucene_analysis_Analyzers_
#define _lucene_analysis_Analyzers_

#if defined(_LUCENE_PRAGMA_ONCE)
# pragma once
#endif

#include "CLucene/util/Reader.h"
#include "AnalysisHeader.h"
#include "CLucene/util/Misc.h"

CL_NS_DEF(analysis)

/** An abstract base class for simple, character-oriented tokenizers.
* Subclasses decide which characters belong inside a token (isTokenChar)
* and may transform each accepted character (normalize); next() assembles
* maximal runs of token characters into Tokens. */
class CharTokenizer:public Tokenizer {
private:
	// Stream bookkeeping: absolute offset of the next char in the input,
	// current index into ioBuffer, and the number of valid chars in ioBuffer.
	int32_t offset, bufferIndex, dataLen;
	// Accumulates the text of the token being built
	// (at most LUCENE_MAX_WORD_LEN chars plus a terminator slot).
	TCHAR buffer[LUCENE_MAX_WORD_LEN+1];
	// Window of raw characters obtained from the Reader; not owned here.
	const TCHAR* ioBuffer;
protected:
	/** Returns true iff a character should be included in a token. This
	* tokenizer generates as tokens adjacent sequences of characters which
	* satisfy this predicate. Characters for which this is false are used to
	* define token boundaries and are not included in tokens. */
	virtual bool isTokenChar(const TCHAR c) const = 0;

	/** Called on each token character to normalize it before it is added to the
	* token. The default implementation does nothing. Subclasses may use this
	* to, e.g., lowercase tokens. */
	virtual TCHAR normalize(const TCHAR c) const;
public:
	CharTokenizer(CL_NS(util)::Reader* in);
	virtual ~CharTokenizer(){ }

	/** Fills token with the next token of the stream; returns false at end. */
	bool next(Token* token);
};

/** A LetterTokenizer is a tokenizer that divides text at non-letters. That's to
* say, it defines tokens as maximal strings of adjacent letters, as defined by
* java.lang.Character.isLetter() predicate.
*
* Note: this does a decent job for most European languages, but does a terrible
* job for some Asian languages, where words are not separated by spaces. */
class LetterTokenizer:public CharTokenizer {
public:
	/** Construct a new LetterTokenizer. */
	LetterTokenizer(CL_NS(util)::Reader* in): CharTokenizer(in) {}

	~LetterTokenizer(){}
protected:
	/** Collects only characters which satisfy _istalpha. */
	bool isTokenChar(const TCHAR c) const;
};

/**
* LowerCaseTokenizer performs the function of LetterTokenizer
* and LowerCaseFilter together. It divides text at non-letters and converts
* them to lower case. While it is functionally equivalent to the combination
* of LetterTokenizer and LowerCaseFilter, there is a performance advantage
* to doing the two tasks at once, hence this (redundant) implementation.
*
* Note: this does a decent job for most European languages, but does a terrible
* job for some Asian languages, where words are not separated by spaces. */
class LowerCaseTokenizer:public LetterTokenizer {
public:
	/** Construct a new LowerCaseTokenizer. */
	LowerCaseTokenizer(CL_NS(util)::Reader* in): LetterTokenizer(in) {}

	~LowerCaseTokenizer(){}
protected:
	/** Collects only characters which satisfy _totlower. */
	TCHAR normalize(const TCHAR chr) const;
};

/** A WhitespaceTokenizer is a tokenizer that divides text at whitespace.
* Adjacent sequences of non-Whitespace characters form tokens. */
class WhitespaceTokenizer: public CharTokenizer {
public:
	/** Construct a new WhitespaceTokenizer. */
	WhitespaceTokenizer(CL_NS(util)::Reader* in):CharTokenizer(in) {}

	~WhitespaceTokenizer(){}
protected:
	/** Collects only characters which do not satisfy _istspace. */
	bool isTokenChar(const TCHAR c) const;
};

/** An Analyzer that uses WhitespaceTokenizer. */
class WhitespaceAnalyzer: public Analyzer {
public:
	TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
	~WhitespaceAnalyzer(){}
};

/** An Analyzer that filters LetterTokenizer with LowerCaseFilter. */
class SimpleAnalyzer: public Analyzer {
public:
	TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
	~SimpleAnalyzer(){}
};

/**
* Normalizes token text to lower case.
*/
class LowerCaseFilter: public TokenFilter {
public:
	LowerCaseFilter(TokenStream* in, bool deleteTokenStream):TokenFilter(in,deleteTokenStream) {}
	~LowerCaseFilter(){}

	/** Lowercases the next token's text in place; returns false at end. */
	bool next(Token* token);
};

/**
* Removes stop words from a token stream.
*/
class StopFilter: public TokenFilter {
private:
	//bvk: i found this to work faster with a non-hash table. the number of items
	//in the stop table is not likely to make it worth having hashing.
	// NOTE(review): the template argument list of CLSetList appears to have
	// been stripped from this header (markup-eating extraction swallowed the
	// `<...>`); upstream CLucene declares this member with an element type of
	// const TCHAR* — restore the full template arguments from upstream.
	CL_NS(util)::CLSetList* table;
public:
	// Constructs a filter which removes words from the input
	// TokenStream that are named in the array of words.
	StopFilter(TokenStream* in, bool deleteTokenStream, const TCHAR** stopWords);

	~StopFilter(){}

	/** Constructs a filter which removes words from the input
	* TokenStream that are named in the CLSetList.
	* Does not take ownership of stopTable. */
	StopFilter(TokenStream* in, bool deleteTokenStream, CL_NS(util)::CLSetList* stopTable):
		TokenFilter(in, deleteTokenStream),
		table(stopTable) {}

	/**
	* Builds a Hashtable from an array of stop words, appropriate for passing
	* into the StopFilter constructor. This permits this table construction to
	* be cached once when an Analyzer is constructed.
	* Note: the stopWords list must be a static list because the strings are not copied.
	*/
	static void fillStopTable(CL_NS(util)::CLSetList* stopTable, const TCHAR** stopWords);

	/**
	* Returns the next input Token whose termText() is not a stop word.
	*/
	bool next(Token* token);
};

/** Filters LetterTokenizer with LowerCaseFilter and StopFilter. */
class StopAnalyzer: public Analyzer {
	// NOTE(review): CLSetList's template arguments are stripped here too —
	// presumably the same const TCHAR* set as in StopFilter; confirm against
	// upstream CLucene before compiling.
	CL_NS(util)::CLSetList stopTable;
public:
	/** Builds an analyzer which removes words in ENGLISH_STOP_WORDS. */
	StopAnalyzer();
	~StopAnalyzer();

	/** Builds an analyzer which removes words in the provided array. */
	StopAnalyzer( const TCHAR** stopWords );

	/** Filters LowerCaseTokenizer with StopFilter. */
	TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);

	/** An array containing some common English words that are not usually useful
	for searching. */
	static const TCHAR* ENGLISH_STOP_WORDS[];
};

/**
* This analyzer is used to facilitate scenarios where different
* fields require different analysis techniques. Use {@link #addAnalyzer}
* to add a non-default analyzer on a field name basis.
*
* <p>Example usage:
*
* <pre>
*   PerFieldAnalyzerWrapper aWrapper =
*      new PerFieldAnalyzerWrapper(new StandardAnalyzer());
*   aWrapper.addAnalyzer("firstname", new KeywordAnalyzer());
*   aWrapper.addAnalyzer("lastname", new KeywordAnalyzer());
* </pre>
*
* <p>In this example, StandardAnalyzer will be used for all fields except "firstname"
* and "lastname", for which KeywordAnalyzer will be used.
*
* <p>A PerFieldAnalyzerWrapper can be used like any other analyzer, for both indexing
* and query parsing.
*/
class PerFieldAnalyzerWrapper : public Analyzer {
private:
	// Fallback analyzer for any field without a registered override.
	Analyzer* defaultAnalyzer;
	// NOTE(review): the template argument list of CLHashMap was stripped from
	// this header (the orphaned '>' before the member name is the tail of a
	// nested template list). Upstream CLucene maps field name -> Analyzer*
	// here; restore the full parameter list from upstream before compiling.
	CL_NS(util)::CLHashMap > analyzerMap;
public:
	/**
	* Constructs with default analyzer.
	*
	* @param defaultAnalyzer Any fields not specifically
	* defined to use a different analyzer will use the one provided here.
	*/
	PerFieldAnalyzerWrapper(Analyzer* defaultAnalyzer);
	~PerFieldAnalyzerWrapper();

	/**
	* Defines an analyzer to use for the specified field.
	*
	* @param fieldName field name requiring a non-default analyzer
	* @param analyzer non-default analyzer to use for field
	*/
	void addAnalyzer(const TCHAR* fieldName, Analyzer* analyzer);

	TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
};

/**
* A filter that replaces accented characters in the ISO Latin 1 character set
* (ISO-8859-1) by their unaccented equivalent. The case will not be altered.
*
* <p>For instance, 'à' will be replaced by 'a'.
*/
class ISOLatin1AccentFilter: public TokenFilter {
public:
	ISOLatin1AccentFilter(TokenStream* input, bool deleteTs):
		TokenFilter(input,deleteTs)
	{
	}

	/**
	* To replace accented characters in a String by unaccented equivalents.
	*/
	bool next(Token* token);
};

/**
* Emits the entire input as a single token.
*/
class KeywordTokenizer: public Tokenizer {
private:
	// Default size, in characters, of the internal read buffer.
	LUCENE_STATIC_CONSTANT(int, DEFAULT_BUFFER_SIZE = 256);
	// Set once the single token has been emitted.
	bool done;
	int bufferSize;
public:
	// bufferSize=-1 presumably selects DEFAULT_BUFFER_SIZE — confirm in the .cpp.
	KeywordTokenizer(CL_NS(util)::Reader* input, int bufferSize=-1);
	virtual ~KeywordTokenizer();
	bool next(Token* token);
};

/**
* "Tokenizes" the entire stream as a single token. This is useful
* for data like zip codes, ids, and some product names.
*/
class KeywordAnalyzer: public Analyzer {
public:
	TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
	virtual ~KeywordAnalyzer(){}
};

/**
* Removes words that are too long and too short from the stream.
*/
class LengthFilter: public TokenFilter {
private:
	int _min; // shortest token length passed through (bounds' inclusivity — confirm in the .cpp)
	int _max; // longest token length passed through
public:
	/**
	* Build a filter that removes words that are too long or too
	* short from the text.
	*/
	LengthFilter(TokenStream* in, int _min, int _max);

	/**
	* Returns the next input Token whose termText() is the right len
	*/
	bool next(Token* token);
};

CL_NS_END
#endif