/*------------------------------------------------------------------------------
* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
*
* Distributable under the terms of either the Apache License (Version 2.0) or
* the GNU Lesser General Public License, as specified in the COPYING file.
------------------------------------------------------------------------------*/
#include "CLucene/StdHeader.h"
#include "TermInfosReader.h"

#include "Term.h"
#include "Terms.h"
#include "SegmentTermEnum.h"
#include "CLucene/store/Directory.h"
#include "FieldInfos.h"
#include "TermInfo.h"
#include "TermInfosWriter.h"
#include "CLucene/util/Misc.h"

CL_NS_USE(store)
CL_NS_USE(util)
CL_NS_DEF(index)

TermInfosReader::TermInfosReader(Directory* dir, const char* seg, FieldInfos* fis):
    directory (dir),fieldInfos (fis)
{
//Func - Constructor.
//       Opens the TermInfos file (.tis) and the Term Info Index file (.tii)
//       of a segment and wraps each of them in a SegmentTermEnum.
//Pre  - dir is a reference to a valid Directory
//       fis contains a valid reference to a FieldInfos instance
//       seg != NULL and contains the name of the segment
//Post - An instance has been created and both files of the segment named seg
//       have been opened. (Remember a segment is nothing more than an
//       independently readable index)

    CND_PRECONDITION(seg != NULL, "seg is NULL");

    //Initialize the name of the segment
    segment = seg;

    //The term index is loaded lazily by ensureIndexIsRead(), so there are no
    //indexTerms, indexInfos or indexPointers yet
    indexTerms    = NULL;
    indexInfos    = NULL;
    indexPointers = NULL;

    //Create the file names of the Term Info file and the Term Info Index file
    char* tisFile = Misc::segmentname(segment,".tis");
    char* tiiFile = Misc::segmentname(segment,".tii");

    //Create the enumerators: origEnum reads the .tis file, indexEnum the .tii file
    origEnum  = _CLNEW SegmentTermEnum( directory->openInput( tisFile ), fieldInfos, false);
    indexEnum = _CLNEW SegmentTermEnum( directory->openInput( tiiFile ), fieldInfos, true);

    //Check if the enumerators point to valid instances
    CND_CONDITION(origEnum != NULL, "No memory could be allocated for orig enumerator");
    CND_CONDITION(indexEnum != NULL, "No memory could be allocated for index enumerator");

    //The file names are no longer needed
    _CLDELETE_CaARRAY(tisFile);
    _CLDELETE_CaARRAY(tiiFile);

    //Get the size of the enumeration and store it in _size
    _size = origEnum->size;
}

TermInfosReader::~TermInfosReader(){
//Func - Destructor
//Pre  - true
//Post - The instance has been destroyed

    //Close the TermInfosReader to be absolutely sure that the enumerators have
    //been closed and the arrays indexTerms, indexPointers and indexInfos and
    //their elements have been destroyed
    close();
}

void TermInfosReader::close() {
//Func - Close the enumeration of TermInfos
//Pre  - true
//Post - The enumerators have been closed and deleted and the arrays
//       indexTerms, indexInfos and indexPointers have been destroyed

    //NOTE(review): in the reviewed copy of this file the text between
    //"for ( int32_t i=0; i" and "close();" was lost. Everything from the loop
    //condition down to "origEnum->close()" has been reconstructed from the
    //surrounding code (the arrays allocated in ensureIndexIsRead() and the
    //symmetric indexEnum branch below). Please compare against the upstream
    //revision before merging.

    //Check if indexTerms and indexInfos exist
    if (indexTerms && indexInfos){
#ifdef _DEBUG
        //NOTE(review): reconstructed debug-only check and message - confirm.
        //Each index term is expected to be referenced only by this reader.
        for ( int32_t i=0; i<indexTermsLength; ++i ){
            CND_PRECONDITION(indexTerms[i].__cl_refcount==1,"TermInfosReader term was references more than internally");
        }
#endif
        //Delete the arrays
        _CLDELETE_ARRAY(indexTerms);
        _CLDELETE_ARRAY(indexInfos);
    }

    //Delete the array of index pointers
    _CLDELETE_ARRAY(indexPointers);

    if (origEnum != NULL){
        origEnum->close();

        //Get a pointer to the IndexInput used by the enumeration but
        //instantiated in the constructor by directory->openInput( tisFile )
        IndexInput *is = origEnum->input;

        //Delete the enumerator
        _CLDELETE(origEnum);

        //Delete the IndexInput
        _CLDELETE(is);
    }

    if (indexEnum != NULL){
        indexEnum->close();

        //Get a pointer to the IndexInput used by the enumeration but
        //instantiated in the constructor by directory->openInput( tiiFile )
        IndexInput *is = indexEnum->input;

        //Delete the enumerator
        _CLDELETE(indexEnum);

        //Delete the IndexInput
        _CLDELETE(is);
    }
}

int64_t TermInfosReader::size() const{
//Func - Return the size of the enumeration of TermInfos
//Pre  - true
//Post - size has been returned

    return _size;
}

Term* TermInfosReader::get(const int32_t position) {
//Func - Returns the nth term in the set
//Pre  - position >= 0
//Post - The n-th term in the set has been returned, or NULL if the set is
//       empty or position lies past the last term

    //Check if the size is 0 because then there are no terms
    if (_size == 0)
        return NULL;

    SegmentTermEnum* enumerator = getEnum();

    if (
        enumerator != NULL                  //an enumeration exists
        && enumerator->term(false) != NULL  //term is at or past current
        && position >= enumerator->position
        && position < (enumerator->position + enumerator->indexInterval)
       )
    {
        return scanEnum(position);          // can avoid seek
    }

    //seekEnum() requires indexTerms, indexInfos and indexPointers to be
    //non-NULL, so make sure the term index has been loaded first
    ensureIndexIsRead();

    //random-access: must seek
    seekEnum(position / enumerator->indexInterval);

    //Get the Term at position
    return scanEnum(position);
}

//todo: currently there is no way of cleaning up a thread, if the thread ends.
//we are stuck with the terminfosreader of that thread. Hopefully this won't
//be too big a problem... solutions anyone?
SegmentTermEnum* TermInfosReader::getEnum(){ SegmentTermEnum* termEnum = enumerators.get(); if (termEnum == NULL){ termEnum = terms(); enumerators.set(termEnum); } return termEnum; } TermInfo* TermInfosReader::get(const Term* term){ //Func - Returns a TermInfo for a term //Pre - term holds a valid reference to term //Post - if term can be found its TermInfo has been returned otherwise NULL //If the size of the enumeration is 0 then no Terms have been read if (_size == 0) return NULL; ensureIndexIsRead(); // optimize sequential access: first try scanning cached enum w/o seeking SegmentTermEnum* enumerator = getEnum(); // optimize sequential access: first try scanning cached enumerator w/o seeking //if if ( //the current term of the enumeration enumerator is not at the end AND enumerator->term(false) != NULL && ( //there exists a previous current called prev and term is positioned after this prev OR ( enumerator->prev != NULL && term->compareTo(enumerator->prev) > 0) || //term is positioned at the same position as the current of enumerator or at a higher position term->compareTo(enumerator->term(false)) >= 0 ) ) { //Calculate the offset for the position int32_t _enumOffset = (int32_t)(enumerator->position/enumerator->indexInterval)+1; // but before end of block if ( //the length of indexTerms (the number of terms in enumerator) equals //_enum_offset OR indexTermsLength == _enumOffset || //term is positioned in front of term found at _enumOffset in indexTerms term->compareTo(&indexTerms[_enumOffset]) < 0){ //no need to seek, retrieve the TermInfo for term return scanEnum(term); } } //Reposition current term in the enumeration seekEnum(getIndexOffset(term)); //Return the TermInfo for term return scanEnum(term); } int64_t TermInfosReader::getPosition(const Term* term) { //Func - Returns the position of a Term in the set //Pre - term holds a valid reference to a Term // enumerator != NULL //Post - If term was found then its position is returned otherwise -1 //if the 
enumeration is empty then return -1 if (_size == 0) return -1; ensureIndexIsRead(); //Retrieve the indexOffset for term int32_t indexOffset = getIndexOffset(term); seekEnum(indexOffset); SegmentTermEnum* enumerator = getEnum(); while(term->compareTo(enumerator->term(false)) > 0 && enumerator->next()) {} if ( term->equals(enumerator->term(false)) ){ return enumerator->position; }else return -1; } SegmentTermEnum* TermInfosReader::terms(const Term* term) { //Func - Returns an enumeration of terms starting at or after the named term. // If term is null then enumerator is set to the beginning //Pre - term holds a valid reference to a Term // enumerator != NULL //Post - An enumeration of terms starting at or after the named term has been returned SegmentTermEnum* enumerator = NULL; if ( term != NULL ){ //Seek enumerator to term; delete the new TermInfo that's returned. TermInfo* ti = get(term); _CLDELETE(ti); enumerator = getEnum(); }else enumerator = origEnum; //Clone the entire enumeration SegmentTermEnum* cln = enumerator->clone(); //Check if cln points to a valid instance CND_CONDITION(cln != NULL,"cln is NULL"); return cln; } void TermInfosReader::ensureIndexIsRead() { //Func - Reads the term info index file or .tti file. // This file contains every IndexInterval-th entry from the .tis file, // along with its location in the "tis" file. This is designed to be read entirely // into memory and used to provide random access to the "tis" file. 
//Pre - indexTerms = NULL // indexInfos = NULL // indexPointers = NULL //Post - The term info index file has been read into memory SCOPED_LOCK_MUTEX(THIS_LOCK) if ( indexTerms != NULL ) return; try { indexTermsLength = (size_t)indexEnum->size; //Instantiate an block of Term's,so that each one doesn't have to be new'd indexTerms = _CL_NEWARRAY(Term,indexTermsLength); CND_CONDITION(indexTerms != NULL,"No memory could be allocated for indexTerms");//Check if is indexTerms is a valid array //Instantiate an big block of TermInfo's, so that each one doesn't have to be new'd indexInfos = _CL_NEWARRAY(TermInfo,indexTermsLength); CND_CONDITION(indexInfos != NULL,"No memory could be allocated for indexInfos"); //Check if is indexInfos is a valid array //Instantiate an array indexPointers that contains pointers to the term info index file indexPointers = _CL_NEWARRAY(int64_t,indexTermsLength); CND_CONDITION(indexPointers != NULL,"No memory could be allocated for indexPointers");//Check if is indexPointers is a valid array //Iterate through the terms of indexEnum for (int32_t i = 0; indexEnum->next(); ++i){ indexTerms[i].set(indexEnum->term(false),indexEnum->term(false)->text()); indexEnum->getTermInfo(&indexInfos[i]); indexPointers[i] = indexEnum->indexPointer; } }_CLFINALLY( indexEnum->close(); //Close and delete the IndexInput is. The close is done by the destructor. _CLDELETE( indexEnum->input ); _CLDELETE( indexEnum ); ); } int32_t TermInfosReader::getIndexOffset(const Term* term){ //Func - Returns the offset of the greatest index entry which is less than or equal to term. 
//Pre - term holds a reference to a valid term // indexTerms != NULL //Post - The new offset has been returned //Check if is indexTerms is a valid array CND_PRECONDITION(indexTerms != NULL,"indexTerms is NULL"); int32_t lo = 0; int32_t hi = indexTermsLength - 1; int32_t mid; int32_t delta; while (hi >= lo) { //Start in the middle betwee hi and lo mid = (lo + hi) >> 1; //Check if is indexTerms[mid] is a valid instance of Term CND_PRECONDITION(&indexTerms[mid] != NULL,"indexTerms[mid] is NULL"); CND_PRECONDITION(mid < indexTermsLength,"mid >= indexTermsLength"); //Determine if term is before mid or after mid delta = term->compareTo(&indexTerms[mid]); if (delta < 0){ //Calculate the new hi hi = mid - 1; }else if (delta > 0){ //Calculate the new lo lo = mid + 1; }else{ //term has been found so return its position return mid; } } // the new starting offset return hi; } void TermInfosReader::seekEnum(const int32_t indexOffset) { //Func - Reposition the current Term and TermInfo to indexOffset //Pre - indexOffset >= 0 // indexTerms != NULL // indexInfos != NULL // indexPointers != NULL //Post - The current Term and Terminfo have been repositioned to indexOffset CND_PRECONDITION(indexOffset >= 0, "indexOffset contains a negative number"); CND_PRECONDITION(indexTerms != NULL, "indexTerms is NULL"); CND_PRECONDITION(indexInfos != NULL, "indexInfos is NULL"); CND_PRECONDITION(indexPointers != NULL, "indexPointers is NULL"); SegmentTermEnum* enumerator = getEnum(); enumerator->seek( indexPointers[indexOffset], (indexOffset * enumerator->indexInterval) - 1, &indexTerms[indexOffset], &indexInfos[indexOffset] ); } TermInfo* TermInfosReader::scanEnum(const Term* term) { //Func - Scans the Enumeration of terms for term and returns the corresponding TermInfo instance if found. // The search is started from the current term. 
//Pre - term contains a valid reference to a Term // enumerator != NULL //Post - if term has been found the corresponding TermInfo has been returned otherwise NULL // has been returned SegmentTermEnum* enumerator = getEnum(); enumerator->scanTo(term); //Check if the at the position the Term term can be found if (enumerator->term(false) != NULL && term->equals(enumerator->term(false)) ){ //Return the TermInfo instance about term return enumerator->getTermInfo(); }else{ //term was not found so no TermInfo can be returned return NULL; } } Term* TermInfosReader::scanEnum(const int32_t position) { //Func - Scans the enumeration to the requested position and returns the // Term located at that position //Pre - position > = 0 // enumerator != NULL //Post - The Term at the requested position has been returned SegmentTermEnum* enumerator = getEnum(); //As long the position of the enumeration enumerator is smaller than the requested one while(enumerator->position < position){ //Move the current of enumerator to the next if (!enumerator->next()){ //If there is no next it means that the requested position was to big return NULL; } } //Return the Term a the requested position return enumerator->term(); } CL_NS_END