/*------------------------------------------------------------------------------
* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
*
* Distributable under the terms of either the Apache License (Version 2.0) or
* the GNU Lesser General Public License, as specified in the COPYING file.
------------------------------------------------------------------------------*/
#ifndef _lucene_index_termvector_h
#define _lucene_index_termvector_h
#if defined(_LUCENE_PRAGMA_ONCE)
# pragma once
#endif
#include "FieldInfos.h"
CL_NS_DEF(index)
/** Provides access to stored term vector of
* a document field.
*/
class TermFreqVector:LUCENE_BASE {
public:
virtual ~TermFreqVector(){
}
/**
*
* @return The field this vector is associated with.
*
*/
virtual const TCHAR* getField() = 0;
/**
* @return The number of terms in the term vector.
*/
virtual int32_t size() = 0;
/**
* @return An Array of term texts in ascending order.
*/
virtual const TCHAR** getTerms() = 0;
/** Array of term frequencies. Locations of the array correspond one to one
* to the terms in the array obtained from getTerms
* method. Each location in the array contains the number of times this
* term occurs in the document or the document field.
*
* The size of the returned array is size()
* @memory Returning a pointer to internal data. Do not delete.
*/
virtual const int32_t* getTermFrequencies() = 0;
/** Return an index in the term numbers array returned from
* getTerms
at which the term with the specified
* term
appears. If this term does not appear in the array,
* return -1.
*/
virtual int32_t indexOf(const TCHAR* term) = 0;
/** Just like indexOf(int32_t)
but searches for a number of terms
* at the same time. Returns an array that has the same size as the number
* of terms searched for, each slot containing the result of searching for
* that term number.
*
* @param terms array containing terms to look for
* @param start index in the array where the list of terms starts
* @param len the number of terms in the list
*/
virtual const int32_t* indexesOf(const TCHAR** terms, const int32_t start, const int32_t len) = 0;
};
/**
* Writer works by opening a document and then opening the fields within the document and then
* writing out the vectors for each field.
*
* Rough usage:
*
for each document
{
writer.openDocument();
for each field on the document
{
writer.openField(field);
for all of the terms
{
writer.addTerm(...)
}
writer.closeField
}
writer.closeDocument()
}
*/
class TermVectorsWriter:LUCENE_BASE {
private:
class TVField:LUCENE_BASE {
public:
int32_t number;
int64_t tvfPointer;
int32_t length; // number of distinct term positions
TVField(int32_t number): tvfPointer(0),length(0) {
this->number = number;
}
~TVField(){}
};
class TVTerm:LUCENE_BASE {
const TCHAR* termText;
int32_t termTextLen; //textlen cache
public:
int32_t freq;
const TCHAR* getTermText() const;
size_t getTermTextLen();
void setTermText(const TCHAR* val);
TVTerm();
~TVTerm();
};
#define LUCENE_TVX_EXTENSION ".tvx"
#define LUCENE_TVD_EXTENSION ".tvd"
#define LUCENE_TVF_EXTENSION ".tvf"
CL_NS(store)::IndexOutput* tvx, *tvd, *tvf;
CL_NS(util)::CLVector > fields;
CL_NS(util)::CLVector > terms;
FieldInfos* fieldInfos;
TVField* currentField;
int64_t currentDocPointer;
void addTermInternal(const TCHAR* termText, const int32_t freq);
void writeField();
void writeDoc();
public:
LUCENE_STATIC_CONSTANT(int32_t, FORMAT_VERSION = 1);
//The size in bytes that the FORMAT_VERSION will take up at the beginning of each file
LUCENE_STATIC_CONSTANT(int32_t, FORMAT_SIZE = 4);
//TODO: Figure out how to write with or w/o position information and read back in
/** Create term vectors writer for the specified segment in specified
* directory. A new TermVectorsWriter should be created for each
* segment. The parameter maxFields
indicates how many total
* fields are found in this document. Not all of these fields may require
* termvectors to be stored, so the number of calls to
* openField
is less or equal to this number.
*/
TermVectorsWriter(CL_NS(store)::Directory* directory, const char* segment,
FieldInfos* fieldInfos);
~TermVectorsWriter();
void openDocument();
void closeDocument();
/** Close all streams. */
void close();
bool isDocumentOpen() const;
/** Start processing a field. This can be followed by a number of calls to
* addTerm, and a final call to closeField to indicate the end of
* processing of this field. If a field was previously open, it is
* closed automatically.
*/
void openField(const TCHAR* field);
/** Finished processing current field. This should be followed by a call to
* openField before future calls to addTerm.
*/
void closeField();
/** Return true if a field is currently open. */
bool isFieldOpen() const;
/** Add specified vectors to the document.
*/
void addVectors(TermFreqVector** vectors);
/** Add term to the field's term vector. Field must already be open
* of NullPointerException is thrown. Terms should be added in
* increasing order of terms, one call per unique termNum. ProxPointer
* is a pointer into the TermPosition file (prx). Freq is the number of
* times this term appears in this field, in this document.
*/
void addTerm(const TCHAR* termText, int32_t freq);
/** Add specified vector to the document. Document must be open but no field
* should be open or exception is thrown. The same document can have addTerm
* and addVectors
calls mixed, however a given field must either be
* populated with addTerm
or with addVector
. *
*/
void addTermFreqVector(TermFreqVector* vectr);
void addTermFreqVectorInternal(TermFreqVector* vectr);
};
/**
*/
class SegmentTermVector: public TermFreqVector {
private:
const TCHAR* field;
const TCHAR** terms;
int32_t termsLen; //cache
int32_t* termFreqs;
int32_t binarySearch(const TCHAR** a, const int32_t arraylen, const TCHAR* key) const;
public:
SegmentTermVector(const TCHAR* field, const TCHAR** terms, int32_t* termFreqs);
~SegmentTermVector();
/**
*
* @return The number of the field this vector is associated with
*/
const TCHAR* getField();
TCHAR* toString() const;
int32_t size();
const TCHAR** getTerms();
const int32_t* getTermFrequencies();
int32_t indexOf(const TCHAR* termText);
const int32_t* indexesOf(const TCHAR** termNumbers, const int32_t start, const int32_t len);
};
/** TODO: relax synchro!
* Look at Java Lucene issue #30736 ([PATCH] to remove synchronized code from TermVectorsReader)
*/
class TermVectorsReader:LUCENE_BASE {
private:
FieldInfos* fieldInfos;
CL_NS(store)::IndexInput* tvx;
CL_NS(store)::IndexInput* tvd;
CL_NS(store)::IndexInput* tvf;
int64_t _size;
void checkValidFormat(CL_NS(store)::IndexInput* in);
SegmentTermVector** readTermVectors(const TCHAR** fields, const int64_t* tvfPointers, const int32_t len);
SegmentTermVector* readTermVector(const TCHAR* field, const int64_t tvfPointer);
int64_t size();
DEFINE_MUTEX(THIS_LOCK)
public:
TermVectorsReader(CL_NS(store)::Directory* d, const char* segment, FieldInfos* fieldInfos);
~TermVectorsReader();
void close();
TermFreqVector* get(const int32_t docNum, const TCHAR* field);
TermFreqVector** get(int32_t docNum);
};
class TermVectorOffsetInfo: LUCENE_BASE {
static TermVectorOffsetInfo** _EMPTY_OFFSET_INFO;
int startOffset;
int endOffset;
public:
static TermVectorOffsetInfo** EMPTY_OFFSET_INFO();
TermVectorOffsetInfo();
~TermVectorOffsetInfo();
TermVectorOffsetInfo(int32_t startOffset, int32_t endOffset);
int32_t getEndOffset() const;
void setEndOffset(int32_t endOffset);
int32_t getStartOffset() const;
void setStartOffset(int32_t startOffset);
bool equals(TermVectorOffsetInfo* o);
size_t hashCode() const;
};
/** Extends TermFreqVector
to provide additional information about
* positions in which each of the terms is found.
*/
class TermPositionVector: public TermFreqVector {
public:
/** Returns an array of positions in which the term is found.
* Terms are identified by the index at which its number appears in the
* term String array obtained from the indexOf
method.
* May return null if positions have not been stored.
*/
virtual int32_t* getTermPositions(int32_t index) = 0;
/**
* Returns an array of TermVectorOffsetInfo in which the term is found.
* May return null if offsets have not been stored.
*
* @see org.apache.lucene.analysis.Token
*
* @param index The position in the array to get the offsets from
* @return An array of TermVectorOffsetInfo objects or the empty list
*/
virtual TermVectorOffsetInfo** getOffsets(int32_t index) = 0;
virtual ~TermPositionVector(){
}
};
CL_NS_END
#endif