[sword-svn] r3267 - in trunk: . examples/cmdline src/modules
scribe at crosswire.org
scribe at crosswire.org
Wed Oct 8 12:43:28 MST 2014
Author: scribe
Date: 2014-10-08 12:43:28 -0700 (Wed, 08 Oct 2014)
New Revision: 3267
Modified:
trunk/configure.ac
trunk/examples/cmdline/search.cpp
trunk/src/modules/swmodule.cpp
Log:
Added initial support for Xapian search engine
Modified: trunk/configure.ac
===================================================================
--- trunk/configure.ac 2014-10-08 15:39:57 UTC (rev 3266)
+++ trunk/configure.ac 2014-10-08 19:43:28 UTC (rev 3267)
@@ -69,6 +69,8 @@
# AC_HELP_STRING([--with-lucene],[include lucene support for searching (default=no)]),,with_lucene=no)
AC_ARG_WITH([internalregex],
AS_HELP_STRING([--with-internalregex], [Compile using SWORDs internal copy of regex]))
+AC_ARG_WITH(xapian,
+ AC_HELP_STRING([--with-xapian],[use xapian search engine (default=yes)]),,with_xapian=yes)
# ---------------------------------------------------------------------
@@ -304,6 +306,23 @@
fi
fi
+if test x$with_xapian = xyes; then
+ AC_LANG_CPLUSPLUS
+ AC_CHECK_LIB(xapian,main,,with_xapian="no")
+else
+ with_xapian="no"
+fi
+
+if test x$with_xapian = xyes; then
+ AM_CFLAGS="$AM_CFLAGS -DUSEXAPIAN"
+ AM_CXXFLAGS="$AM_CXXFLAGS -DUSEXAPIAN"
+ if test x$with_clucene = xno; then
+ with_clucene="no"
+ else
+ with_clucene="$with_clucene; but using XAPIAN instead"
+ fi
+fi
+
AC_CHECK_FUNCS(vsnprintf, [have_vsnprintf="yes"])
# ---------------------------------------------------------------------
@@ -425,6 +444,7 @@
echo " INTERNAL FTPLIB: $with_internalftplib"
echo " INTERNAL REGEX: $with_internalregex"
echo " CLUCENE: $with_clucene"
+echo " XAPIAN: $with_xapian"
echo
echo
Modified: trunk/examples/cmdline/search.cpp
===================================================================
--- trunk/examples/cmdline/search.cpp 2014-10-08 15:39:57 UTC (rev 3266)
+++ trunk/examples/cmdline/search.cpp 2014-10-08 19:43:28 UTC (rev 3267)
@@ -44,7 +44,7 @@
* -4 - Lucene
*/
-char SEARCH_TYPE=-2;
+char SEARCH_TYPE=-4;
char printed = 0;
void percentUpdate(char percent, void *userData) {
Modified: trunk/src/modules/swmodule.cpp
===================================================================
--- trunk/src/modules/swmodule.cpp 2014-10-08 15:39:57 UTC (rev 3266)
+++ trunk/src/modules/swmodule.cpp 2014-10-08 19:43:28 UTC (rev 3267)
@@ -48,7 +48,9 @@
#include <regex.h> // GNU
#endif
-#ifdef USELUCENE
+#if defined USEXAPIAN
+#include <xapian.h>
+#elif defined USELUCENE
#include <CLucene.h>
//Lucence includes
@@ -380,16 +382,22 @@
SWBuf term = istr;
bool includeComponents = false; // for entryAttrib e.g., /Lemma.1/
-#ifdef USELUCENE
SWBuf target = getConfigEntry("AbsoluteDataPath");
if (!target.endsWith("/") && !target.endsWith("\\")) {
target.append('/');
}
+#if defined USEXAPIAN
+ target.append("xapian");
+#elif defined USELUCENE
target.append("lucene");
#endif
if (justCheckIfSupported) {
*justCheckIfSupported = (searchType >= -3);
-#ifdef USELUCENE
+#if defined USEXAPIAN
+ if ((searchType == -4) && (FileMgr::existsDir(target))) {
+ *justCheckIfSupported = true;
+ }
+#elif defined USELUCENE
if ((searchType == -4) && (IndexReader::indexExists(target.c_str()))) {
*justCheckIfSupported = true;
}
@@ -460,8 +468,23 @@
(*percent)(++perc, percentUserData);
-#ifdef USELUCENE
- if (searchType == -4) { // lucene
+#if defined USEXAPIAN || defined USELUCENE
+ (*percent)(10, percentUserData);
+ if (searchType == -4) { // indexed search
+#if defined USEXAPIAN
+ SWTRY {
+ Xapian::Database database(target.c_str());
+ Xapian::QueryParser queryParser;
+ queryParser.set_stemmer(Xapian::Stem("en"));
+ queryParser.set_stemming_strategy(queryParser.STEM_SOME);
+ queryParser.add_prefix("content", "C");
+ queryParser.add_prefix("lemma", "L");
+ queryParser.add_prefix("morph", "M");
+ queryParser.add_prefix("prox", "P");
+ queryParser.add_prefix("proxlem", "PL");
+ queryParser.add_prefix("proxmorph", "PM");
+
+#elif defined USELUCENE
lucene::index::IndexReader *ir = 0;
lucene::search::IndexSearcher *is = 0;
@@ -470,22 +493,44 @@
SWTRY {
ir = IndexReader::open(target);
is = new IndexSearcher(ir);
- (*percent)(10, percentUserData);
-
const TCHAR *stopWords[] = { 0 };
standard::StandardAnalyzer analyzer(stopWords);
+#endif
+
+ // parse the query
+#if defined USEXAPIAN
+ Xapian::Query q = queryParser.parse_query(istr);
+ Xapian::Enquire enquire = Xapian::Enquire(database);
+#elif defined USELUCENE
q = QueryParser::parse((wchar_t *)utf8ToWChar(istr).getRawData(), _T("content"), &analyzer);
+#endif
(*percent)(20, percentUserData);
+
+ // perform the search
+#if defined USEXAPIAN
+ enquire.set_query(q);
+ Xapian::MSet h = enquire.get_mset(0, 99999);
+#elif defined USELUCENE
h = is->search(q);
+#endif
(*percent)(80, percentUserData);
// iterate thru each good module position that meets the search
bool checkBounds = getKey()->isBoundSet();
+#if defined USEXAPIAN
+ Xapian::MSetIterator i;
+ for (i = h.begin(); i != h.end(); ++i) {
+// cout << "Document ID " << *i << "\t";
+ __u64 score = i.get_percent();
+ Xapian::Document doc = i.get_document();
+ *resultKey = doc.get_data().c_str();
+#elif defined USELUCENE
for (unsigned long i = 0; i < (unsigned long)h->length(); i++) {
Document &doc = h->doc(i);
-
// set a temporary verse key to this module position
*resultKey = wcharToUTF8(doc.get(_T("key"))); //TODO Does a key always accept utf8?
+ __u64 score = (__u64)((__u32)(h->score(i)*100));
+#endif
// check to see if it sets ok (within our bounds) and if not, skip
if (checkBounds) {
@@ -495,14 +540,19 @@
}
}
listKey << *resultKey;
- listKey.getElement()->userData = (__u64)((__u32)(h->score(i)*100));
+ listKey.getElement()->userData = score;
}
(*percent)(98, percentUserData);
}
SWCATCH (...) {
+#if defined USEXAPIAN
+#elif defined USELUCENE
q = 0;
+#endif
// invalid clucene query
}
+#if defined USEXAPIAN
+#elif defined USELUCENE
delete h;
delete q;
@@ -510,6 +560,7 @@
if (ir) {
ir->close();
}
+#endif
}
#endif
@@ -1011,12 +1062,17 @@
signed char SWModule::createSearchFramework(void (*percent)(char, void *), void *percentUserData) {
-#ifdef USELUCENE
+#if defined USELUCENE || defined USEXAPIAN
SWBuf target = getConfigEntry("AbsoluteDataPath");
if (!target.endsWith("/") && !target.endsWith("\\")) {
target.append('/');
}
+#if defined USEXAPIAN
+ target.append("xapian");
+#elif defined USELUCENE
+ const int MAX_CONV_SIZE = 1024 * 1024;
target.append("lucene");
+#endif
int status = FileMgr::createParent(target+"/dummy");
if (status) return -1;
@@ -1025,7 +1081,6 @@
SWKey textkey;
SWBuf c;
- const int MAX_CONV_SIZE = 1024 * 1024;
// turn all filters to default values
StringList filterSettings;
@@ -1059,6 +1114,15 @@
setKey(*searchKey);
}
+ bool includeKeyInSearch = getConfig().has("SearchOption", "IncludeKeyInSearch");
+
+ // lets create or open our search index
+#if defined USEXAPIAN
+ Xapian::WritableDatabase database(target.c_str(), Xapian::DB_CREATE_OR_OPEN);
+ Xapian::TermGenerator termGenerator;
+ termGenerator.set_stemmer(Xapian::Stem("en"));
+
+#elif defined USELUCENE
RAMDirectory *ramDir = 0;
IndexWriter *coreWriter = 0;
IndexWriter *fsWriter = 0;
@@ -1066,11 +1130,11 @@
const TCHAR *stopWords[] = { 0 };
standard::StandardAnalyzer *an = new standard::StandardAnalyzer(stopWords);
- bool includeKeyInSearch = getConfig().has("SearchOption", "IncludeKeyInSearch");
ramDir = new RAMDirectory();
coreWriter = new IndexWriter(ramDir, an, true);
coreWriter->setMaxFieldLength(MAX_CONV_SIZE);
+#endif
@@ -1127,7 +1191,12 @@
bool good = false;
// start out entry
+#if defined USEXAPIAN
+ Xapian::Document doc;
+ termGenerator.set_document(doc);
+#elif defined USELUCENE
Document *doc = new Document();
+#endif
// get "key" field
SWBuf keyText = (vkcheck) ? vkcheck->getOSISRef() : getKeyText();
if (content && *content) {
@@ -1173,7 +1242,11 @@
}
}
+#if defined USEXAPIAN
+ doc.set_data(keyText.c_str());
+#elif defined USELUCENE
doc->add(*_CLNEW Field(_T("key"), (wchar_t *)utf8ToWChar(keyText).getRawData(), Field::STORE_YES | Field::INDEX_UNTOKENIZED));
+#endif
if (includeKeyInSearch) {
c = keyText;
@@ -1182,11 +1255,21 @@
content = c.c_str();
}
+#if defined USEXAPIAN
+ termGenerator.index_text(content);
+ termGenerator.index_text(content, 1, "C");
+#elif defined USELUCENE
doc->add(*_CLNEW Field(_T("content"), (wchar_t *)utf8ToWChar(content).getRawData(), Field::STORE_NO | Field::INDEX_TOKENIZED));
+#endif
if (strong.length() > 0) {
+#if defined USEXAPIAN
+ termGenerator.index_text(strong.c_str(), 1, "L");
+ termGenerator.index_text(morph.c_str(), 1, "M");
+#elif defined USELUCENE
doc->add(*_CLNEW Field(_T("lemma"), (wchar_t *)utf8ToWChar(strong).getRawData(), Field::STORE_NO | Field::INDEX_TOKENIZED));
doc->add(*_CLNEW Field(_T("morph"), (wchar_t *)utf8ToWChar(morph).getRawData(), Field::STORE_NO | Field::INDEX_TOKENIZED));
+#endif
//printf("setting fields (%s).\ncontent: %s\nlemma: %s\n", (const char *)*key, content, strong.c_str());
}
@@ -1331,20 +1414,39 @@
if (proxBuf.length() > 0) {
+#if defined USEXAPIAN
+ termGenerator.index_text(proxBuf.c_str(), 1, "P");
+#elif defined USELUCENE
doc->add(*_CLNEW Field(_T("prox"), (wchar_t *)utf8ToWChar(proxBuf).getRawData(), Field::STORE_NO | Field::INDEX_TOKENIZED));
+#endif
good = true;
}
if (proxLem.length() > 0) {
+#if defined USEXAPIAN
+ termGenerator.index_text(proxLem.c_str(), 1, "PL");
+ termGenerator.index_text(proxMorph.c_str(), 1, "PM");
+#elif defined USELUCENE
doc->add(*_CLNEW Field(_T("proxlem"), (wchar_t *)utf8ToWChar(proxLem).getRawData(), Field::STORE_NO | Field::INDEX_TOKENIZED) );
doc->add(*_CLNEW Field(_T("proxmorph"), (wchar_t *)utf8ToWChar(proxMorph).getRawData(), Field::STORE_NO | Field::INDEX_TOKENIZED) );
+#endif
good = true;
}
if (good) {
//printf("writing (%s).\n", (const char *)*key);
//fflush(stdout);
+#if defined USEXAPIAN
+ SWBuf idTerm;
+ idTerm.setFormatted("Q%ld", key->getIndex());
+ doc.add_boolean_term(idTerm.c_str());
+ database.replace_document(idTerm.c_str(), doc);
+#elif defined USELUCENE
coreWriter->addDocument(doc);
+#endif
}
+#if defined USEXAPIAN
+#elif defined USELUCENE
delete doc;
+#endif
(*this)++;
err = popError();
@@ -1352,6 +1454,8 @@
// Optimizing automatically happens with the call to addIndexes
//coreWriter->optimize();
+#if defined USEXAPIAN
+#elif defined USELUCENE
coreWriter->close();
#ifdef CLUCENE2
@@ -1386,6 +1490,7 @@
delete coreWriter;
delete fsWriter;
delete an;
+#endif
// reposition module back to where it was before we were called
setKey(*saveKey);
More information about the sword-cvs
mailing list