/*------------------------------------------------------------------------------ * Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team * * Distributable under the terms of either the Apache License (Version 2.0) or * the GNU Lesser General Public License, as specified in the COPYING file. ------------------------------------------------------------------------------*/ #ifndef _lucene_index_termvector_h #define _lucene_index_termvector_h #if defined(_LUCENE_PRAGMA_ONCE) # pragma once #endif #include "FieldInfos.h" CL_NS_DEF(index) //predefine classes struct TermVectorOffsetInfo; class TermPositionVector; /** Provides access to stored term vector of * a document field. */ class TermFreqVector:LUCENE_BASE { public: virtual ~TermFreqVector(){ } /** * * @return The field this vector is associated with. * */ virtual const TCHAR* getField() = 0; /** * @return The number of terms in the term vector. */ virtual int32_t size() = 0; /** * @return An Array of term texts in ascending order. */ virtual const TCHAR** getTerms() = 0; /** Array of term frequencies. Locations of the array correspond one to one * to the terms in the array obtained from getTerms * method. Each location in the array contains the number of times this * term occurs in the document or the document field. * * The size of the returned array is size() * @memory Returning a pointer to internal data. Do not delete. */ virtual const Array* getTermFrequencies() = 0; /** Return an index in the term numbers array returned from * getTerms at which the term with the specified * term appears. If this term does not appear in the array, * return -1. */ virtual int32_t indexOf(const TCHAR* term) = 0; /** Just like indexOf(int32_t) but searches for a number of terms * at the same time. Returns an array that has the same size as the number * of terms searched for, each slot containing the result of searching for * that term number. * * @param terms array containing terms to look for * @param start index in the array where the list of terms starts * @param len the number of terms in the list */ virtual void indexesOf(const TCHAR** terms, const int32_t start, const int32_t len, Array& ret) = 0; /** Solve the diamond inheritence problem by providing a reinterpret function. * No dynamic casting is required and no RTTI data is needed to do this */ virtual TermPositionVector* __asTermPositionVector()=0; }; /** * Writer works by opening a document and then opening the fields within the document and then * writing out the vectors for each field. * * Rough usage: * for each document { writer.openDocument(); for each field on the document { writer.openField(field); for all of the terms { writer.addTerm(...) } writer.closeField } writer.closeDocument() } */ class TermVectorsWriter:LUCENE_BASE { private: class TVField:LUCENE_BASE { public: int32_t number; int64_t tvfPointer; int32_t length; // number of distinct term positions bool storePositions; bool storeOffsets; TVField(int32_t number, bool storePos, bool storeOff): tvfPointer(0),length(0){ this->number = number; this->storePositions = storePos; this->storeOffsets = storeOff; } ~TVField(){} }; class TVTerm:LUCENE_BASE { const TCHAR* termText; int32_t termTextLen; //textlen cache public: TVTerm(); ~TVTerm(); int32_t freq; Array* positions; Array* offsets; const TCHAR* getTermText() const; size_t getTermTextLen(); void setTermText(const TCHAR* val); }; CL_NS(store)::IndexOutput* tvx, *tvd, *tvf; CL_NS(util)::CLVector > fields; CL_NS(util)::CLVector > terms; FieldInfos* fieldInfos; TVField* currentField; int64_t currentDocPointer; void addTermInternal(const TCHAR* termText, const int32_t freq, Array* positions, Array* offsets); void writeField(); void writeDoc(); void openField(int32_t fieldNumber, bool storePositionWithTermVector, bool storeOffsetWithTermVector); public: LUCENE_STATIC_CONSTANT(int32_t, FORMAT_VERSION = 2); //The size in bytes that the FORMAT_VERSION will take up at the beginning of each file LUCENE_STATIC_CONSTANT(int32_t, FORMAT_SIZE = 4); LUCENE_STATIC_CONSTANT(uint8_t, STORE_POSITIONS_WITH_TERMVECTOR = 0x1); LUCENE_STATIC_CONSTANT(uint8_t, STORE_OFFSET_WITH_TERMVECTOR = 0x2); static const char* LUCENE_TVX_EXTENSION; static const char* LUCENE_TVD_EXTENSION; static const char* LUCENE_TVF_EXTENSION; TermVectorsWriter(CL_NS(store)::Directory* directory, const char* segment, FieldInfos* fieldInfos); ~TermVectorsWriter(); void openDocument(); void closeDocument(); /** Close all streams. */ void close(); bool isDocumentOpen() const; /** Start processing a field. This can be followed by a number of calls to * addTerm, and a final call to closeField to indicate the end of * processing of this field. If a field was previously open, it is * closed automatically. */ void openField(const TCHAR* field); /** Finished processing current field. This should be followed by a call to * openField before future calls to addTerm. */ void closeField(); /** Return true if a field is currently open. */ bool isFieldOpen() const; /** * Add a complete document specified by all its term vectors. If document has no * term vectors, add value for tvx. * * @param vectors * @throws IOException */ void addAllDocVectors(Array& vectors); /** Add term to the field's term vector. Field must already be open. * Terms should be added in * increasing order of terms, one call per unique termNum. ProxPointer * is a pointer into the TermPosition file (prx). Freq is the number of * times this term appears in this field, in this document. * @throws IllegalStateException if document or field is not open */ void addTerm(const TCHAR* termText, int32_t freq, Array* positions = NULL, Array* offsets = NULL); }; /** */ class SegmentTermVector: public virtual TermFreqVector { private: const TCHAR* field; TCHAR** terms; int32_t termsLen; //cache Array* termFreqs; int32_t binarySearch(TCHAR** a, const int32_t arraylen, const TCHAR* key) const; public: //note: termFreqs must be the same length as terms SegmentTermVector(const TCHAR* field, TCHAR** terms, Array* termFreqs); virtual ~SegmentTermVector(); /** * * @return The number of the field this vector is associated with */ const TCHAR* getField(); TCHAR* toString() const; int32_t size(); const TCHAR** getTerms(); const Array* getTermFrequencies(); int32_t indexOf(const TCHAR* termText); void indexesOf(const TCHAR** termNumbers, const int32_t start, const int32_t len, Array& ret); virtual TermPositionVector* __asTermPositionVector(); }; /** * @version $Id: */ class TermVectorsReader:LUCENE_BASE { private: FieldInfos* fieldInfos; CL_NS(store)::IndexInput* tvx; CL_NS(store)::IndexInput* tvd; CL_NS(store)::IndexInput* tvf; int64_t _size; int32_t tvdFormat; int32_t tvfFormat; int32_t checkValidFormat(CL_NS(store)::IndexInput* in); void readTermVectors(const TCHAR** fields, const int64_t* tvfPointers, const int32_t len, Array& _return); /** * * @param field The field to read in * @param tvfPointer The pointer within the tvf file where we should start reading * @return The TermVector located at that position * @throws IOException */ SegmentTermVector* readTermVector(const TCHAR* field, const int64_t tvfPointer); int64_t size(); DEFINE_MUTEX(THIS_LOCK) TermVectorsReader(const TermVectorsReader& copy); public: TermVectorsReader(CL_NS(store)::Directory* d, const char* segment, FieldInfos* fieldInfos); ~TermVectorsReader(); void close(); TermVectorsReader* clone() const; /** * Retrieve the term vector for the given document and field * @param docNum The document number to retrieve the vector for * @param field The field within the document to retrieve * @return The TermFreqVector for the document and field or null if there is no termVector for this field. * @throws IOException if there is an error reading the term vector files */ TermFreqVector* get(const int32_t docNum, const TCHAR* field); /** * Return all term vectors stored for this document or null if the could not be read in. * * @param docNum The document number to retrieve the vector for * @return All term frequency vectors * @throws IOException if there is an error reading the term vector files */ bool get(int32_t docNum, Array& result); }; struct TermVectorOffsetInfo { int startOffset; int endOffset; public: static Array EMPTY_OFFSET_INFO; TermVectorOffsetInfo(); ~TermVectorOffsetInfo(); TermVectorOffsetInfo(int32_t startOffset, int32_t endOffset); int32_t getEndOffset() const; void setEndOffset(int32_t endOffset); int32_t getStartOffset() const; void setStartOffset(int32_t startOffset); bool equals(TermVectorOffsetInfo* o); size_t hashCode() const; }; /** Extends TermFreqVector to provide additional information about * positions in which each of the terms is found. A TermPositionVector not necessarily * contains both positions and offsets, but at least one of these arrays exists. */ class TermPositionVector: public virtual TermFreqVector { public: /** Returns an array of positions in which the term is found. * Terms are identified by the index at which its number appears in the * term String array obtained from the indexOf method. * May return null if positions have not been stored. */ virtual Array* getTermPositions(int32_t index) = 0; /** * Returns an array of TermVectorOffsetInfo in which the term is found. * May return null if offsets have not been stored. * * @see org.apache.lucene.analysis.Token * * @param index The position in the array to get the offsets from * @return An array of TermVectorOffsetInfo objects or the empty list */ virtual Array* getOffsets(int32_t index) = 0; virtual ~TermPositionVector(){ } }; class SegmentTermPositionVector: public SegmentTermVector, public TermPositionVector { protected: Array< Array >* positions; Array< Array >* offsets; static Array EMPTY_TERM_POS; public: SegmentTermPositionVector(const TCHAR* field, TCHAR** terms, Array* termFreqs, Array< Array >* positions, Array< Array >* offsets); ~SegmentTermPositionVector(); /** * Returns an array of TermVectorOffsetInfo in which the term is found. * * @param index The position in the array to get the offsets from * @return An array of TermVectorOffsetInfo objects or the empty list * @see org.apache.lucene.analysis.Token */ Array* getOffsets(int32_t index); /** * Returns an array of positions in which the term is found. * Terms are identified by the index at which its number appears in the * term String array obtained from the indexOf method. */ Array* getTermPositions(int32_t index); const TCHAR* getField(){ return SegmentTermVector::getField(); } TCHAR* toString() const{ return SegmentTermVector::toString(); } int32_t size(){ return SegmentTermVector::size(); } const TCHAR** getTerms(){ return SegmentTermVector::getTerms(); } const Array* getTermFrequencies(){ return SegmentTermVector::getTermFrequencies(); } int32_t indexOf(const TCHAR* termText){ return SegmentTermVector::indexOf(termText); } void indexesOf(const TCHAR** termNumbers, const int32_t start, const int32_t len, Array& ret) { SegmentTermVector::indexesOf(termNumbers, start, len, ret); } virtual TermPositionVector* __asTermPositionVector(); }; CL_NS_END #endif