/*------------------------------------------------------------------------------ * Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team * * Distributable under the terms of either the Apache License (Version 2.0) or * the GNU Lesser General Public License, as specified in the COPYING file. ------------------------------------------------------------------------------*/ #ifndef _lucene_search_Similarity_ #define _lucene_search_Similarity_ #if defined(_LUCENE_PRAGMA_ONCE) # pragma once #endif #include "CLucene/index/Term.h" CL_NS_DEF(search) class Searcher;//save including the searchheader.h class DefaultSimilarity; /** Expert: Scoring API. *
Subclasses implement search scoring.
 *
 * The score of query q for document d is defined in terms of these methods
 * as follows:
 *
 *   score(q,d) = ( Σ over t in q of
 *                    {@link #tf(int32_t) tf}(t in d)
 *                  * {@link #idf(Term,Searcher) idf}(t)
 *                  * {@link Field#getBoost getBoost}(t.field in d)
 *                  * {@link #lengthNorm(TCHAR*,int32_t) lengthNorm}(t.field in d) )
 *                * {@link #coord(int32_t,int32_t) coord}(q,d)
 *                * {@link #queryNorm(float_t) queryNorm}(q)
 *
This is initially an instance of {@link DefaultSimilarity}. * * @see Searcher#setSimilarity(Similarity) * @see IndexWriter#setSimilarity(Similarity) */ static Similarity* getDefault(); /** Encodes a normalization factor for storage in an index. * *
The encoding uses a five-bit exponent and three-bit mantissa, thus * representing values from around 7x10^9 to 2x10^-9 with about one * significant decimal digit of accuracy. Zero is also represented. * Negative numbers are rounded up to zero. Values too large to represent * are rounded down to the largest representable value. Positive values too * small to represent are rounded up to the smallest positive representable * value. * * @see Field#setBoost(float_t) */ static uint8_t encodeNorm(float_t f); /** Decodes a normalization factor stored in an index. * @see #encodeNorm(float_t) */ static float_t decodeNorm(uint8_t b); static uint8_t floatToByte(float_t f); static float_t byteToFloat(uint8_t b); /** Computes a score factor for a phrase. * *
The default implementation sums the {@link #idf(Term,Searcher)} factor
* for each term in the phrase.
*
* @param terms the terms in the phrase
* @param searcher the document collection being searched
* @return a score factor for the phrase
*/
float_t idf(CL_NS(util)::CLVector The default implementation is: Terms and phrases repeated in a document indicate the topic of the
* document, so implementations of this method usually return larger values
* when The default implementation calls {@link #tf(float_t)}.
*
* @param freq the frequency of a term within a document
* @return a score factor based on a term's within-document frequency
*/
/** Integer-frequency convenience overload: converts the count to float_t
 * and hands it to the float_t overload of tf.
 *
 * @param freq the frequency of a term within a document
 * @return whatever the float_t overload returns for the converted frequency
 */
inline float_t tf(int32_t freq){
    const float_t asFloat = static_cast<float_t>(freq);
    return tf(asFloat);
}
/** Computes the normalization value for a field given the total number of
 * terms contained in a field. These values, together with field boosts, are
 * stored in an index and multiplied into scores for hits on each field by the
 * search code.
 *
 * Matches in longer fields are less precise, so implementations of this
 * method usually return smaller values when numTokens is large, and larger
 * values when numTokens is small.
 *
 * Note that these values are computed under {@link
 * IndexWriter#addDocument(Document)} and stored then using
 * {@link #encodeNorm(float_t)}. Thus they have limited precision, and
 * documents must be re-indexed if this method is altered.
 *
 * @param fieldName the name of the field
 * @param numTokens the total number of tokens contained in fields named
 * fieldName of doc.
 * @return a normalization factor for hits on this field of this document
 *
 * @see Field#setBoost(float_t)
 */
virtual float_t lengthNorm(const TCHAR* fieldName, int32_t numTokens) = 0;
/** Computes the normalization value for a query given the sum of the squared
 * weights of each of the query terms. This value is then multiplied into the
 * weight of each query term.
 *
 * This does not affect ranking, but rather just attempts to make scores
 * from different queries comparable.
 *
 * Pure virtual: every concrete subclass has to supply its own formula.
 *
 * @param sumOfSquaredWeights the sum of the squares of query term weights
 * @return a normalization factor for query weights
 */
virtual float_t queryNorm(float_t sumOfSquaredWeights) = 0;
/** Computes the amount of a sloppy phrase match, based on an edit distance.
 * This value is summed for each sloppy phrase match in a document to form
 * the frequency that is passed to {@link #tf(float_t)}.
 *
 * A phrase match with a small edit distance to a document passage more
 * closely matches the document, so implementations of this method usually
 * return larger values when the edit distance is small and smaller values
 * when it is large.
 *
 * Pure virtual: every concrete subclass has to supply its own formula.
 *
 * @param distance the edit distance of this sloppy phrase match
 * @return the frequency increment for this match
 * @see PhraseQuery#setSlop(int32_t)
 */
virtual float_t sloppyFreq(int32_t distance) = 0;
/* NOTE(review): the paragraph below describes tf(float_t), the overload that
 * tf(int32_t) forwards to. No declaration of tf(float_t) is present at this
 * point in the class -- confirm where it is (or should be) declared.
 *
 * Computes a score factor based on a term or phrase's frequency in a
 * document. This value is multiplied by the {@link #idf(Term, Searcher)}
 * factor for each term in the query and these products are then summed to
 * form the initial score for a document.
 *
 * Terms and phrases repeated in a document indicate the topic of the
 * document, so implementations of this method usually return larger values
 * when freq is large, and smaller values when freq is small.
 */
/** Computes a score factor based on a term's document frequency (the number
 * of documents which contain the term). This value is multiplied by the
 * {@link #tf(int32_t)} factor for each term in the query and these products
 * are then summed to form the initial score for a document.
 *
 * Terms that occur in fewer documents are better indicators of topic, so
 * implementations of this method usually return larger values for rare terms,
 * and smaller values for common terms.
 *
 * @param docFreq the number of documents which contain the term
 * @param numDocs the total number of documents in the collection
 * @return a score factor based on the term's document frequency
 */
virtual float_t idf(int32_t docFreq, int32_t numDocs) = 0;
/** Computes a score factor based on the fraction of all query terms that a
 * document contains. This value is multiplied into scores.
 *
 * The presence of a large portion of the query terms indicates a better
 * match with the query, so implementations of this method usually return
 * larger values when the ratio between these parameters is large and smaller
 * values when the ratio between them is small.
 *
 * Pure virtual: every concrete subclass has to supply its own formula.
 *
 * @param overlap the number of query terms matched in the document
 * @param maxOverlap the total number of terms in the query
 * @return a score factor based on term overlap with the query
 */
virtual float_t coord(int32_t overlap, int32_t maxOverlap) = 0;
};
/** Expert: Default scoring implementation.
 *
 * Supplies the formulas for the scoring hooks of Similarity; each member
 * below documents the formula it is implemented as.
 */
class DefaultSimilarity: public Similarity {
public:
    DefaultSimilarity();
    ~DefaultSimilarity();

    /** Returns a score factor for a single term. Implemented as
     *   return idf(searcher.docFreq(term), searcher.maxDoc());
     *
     * Note that {@link Searcher#maxDoc()} is used instead of
     * {@link IndexReader#numDocs()} because it is proportional to
     * {@link Searcher#docFreq(Term)}, i.e., when one is inaccurate,
     * so is the other, and in the same direction.
     *
     * @param term the term in question
     * @param searcher the document collection being searched
     * @return a score factor for the term
     */
    float_t idf(CL_NS(index)::Term* term, Searcher* searcher);

    /** Implemented as 1/sqrt(numTerms), i.e. the documented value is smaller
     * when numTerms is large, and larger when numTerms is small.
     *
     * @param fieldName the name of the field
     * @param numTerms the number of terms counted for the field
     * @return a normalization factor for hits on this field
     */
    float_t lengthNorm(const TCHAR* fieldName, int32_t numTerms);

    /** Implemented as 1/sqrt(sumOfSquaredWeights).
     *
     * @param sumOfSquaredWeights the sum of the squares of query term weights
     * @return a normalization factor for query weights
     */
    float_t queryNorm(float_t sumOfSquaredWeights);

    /** Implemented as sqrt(freq).
     *
     * Computes a score factor based on a term or phrase's frequency in a
     * document. This value is multiplied by the {@link #idf(Term, Searcher)}
     * factor for each term in the query and these products are then summed to
     * form the initial score for a document. The result is larger when freq
     * is large, and smaller when freq is small.
     *
     * This is the only declaration of tf(float_t) in this class: a member
     * function may be declared just once per class, and a pure-virtual form
     * would make this concrete implementation abstract.
     *
     * NOTE(review): declared inline, but no definition is visible in this
     * header -- confirm a definition is reachable wherever it is called.
     *
     * @param freq the frequency of a term within a document
     * @return a score factor based on a term's within-document frequency
     */
    inline float_t tf(float_t freq);

    /** Implemented as 1 / (distance + 1).
     *
     * @param distance the edit distance of a sloppy phrase match
     * @return the frequency increment for this match
     */
    float_t sloppyFreq(int32_t distance);

    /** Implemented as log(numDocs/(docFreq+1)) + 1.
     *
     * Computes a score factor based on a term's document frequency (the
     * number of documents which contain the term). This value is multiplied
     * by the {@link #tf(int32_t)} factor for each term in the query and these
     * products are then summed to form the initial score for a document.
     *
     * @param docFreq the number of documents which contain the term
     * @param numDocs the total number of documents in the collection
     * @return a score factor based on the term's document frequency
     */
    float_t idf(int32_t docFreq, int32_t numDocs);

    /** Implemented as overlap / maxOverlap.
     *
     * @param overlap the number of query terms matched in the document
     * @param maxOverlap the total number of terms in the query
     * @return a score factor based on term overlap with the query
     */
    float_t coord(int32_t overlap, int32_t maxOverlap);
};
CL_NS_END
#endif