[sword-svn] r2098 - in trunk: . include src/modules
scribe at www.crosswire.org
scribe at www.crosswire.org
Sun Oct 7 11:57:08 MST 2007
Author: scribe
Date: 2007-10-07 11:57:07 -0700 (Sun, 07 Oct 2007)
New Revision: 2098
Modified:
trunk/ChangeLog
trunk/include/stringmgr.h
trunk/include/swbuf.h
trunk/src/modules/swmodule.cpp
Log:
Added UTF-8 processing for case insensitive unindexed
searching
Modified: trunk/ChangeLog
===================================================================
--- trunk/ChangeLog 2007-10-07 06:27:34 UTC (rev 2097)
+++ trunk/ChangeLog 2007-10-07 18:57:07 UTC (rev 2098)
@@ -12,6 +12,8 @@
(Deji Akingunola <dakingun at gmail dot com>)
Modified OSISLemma to only strip lemma parts with
lemma.* prefix or no prefix
+ Added UTF-8 processing for case insensitive unindexed
+ searching
13-Sep-2007 Troy A. Griffitts <scribe at crosswire.org>
Added InstallMgr::getModuleStatus to return a list
Modified: trunk/include/stringmgr.h
===================================================================
--- trunk/include/stringmgr.h 2007-10-07 06:27:34 UTC (rev 2097)
+++ trunk/include/stringmgr.h 2007-10-07 18:57:07 UTC (rev 2098)
@@ -98,6 +98,13 @@
return StringMgr::getSystemStringMgr()->upperUTF8(t, max);
}
+/**
+ * Converts an SWBuf filled with UTF-8 to upper case
+ *
+ * @param b SWBuf to change to upper case
+ *
+ * @return b for convenience
+ */
inline SWBuf &toupperstr(SWBuf &b) {
char *utf8 = 0;
stdstr(&utf8, b.c_str(), 2);
Modified: trunk/include/swbuf.h
===================================================================
--- trunk/include/swbuf.h 2007-10-07 06:27:34 UTC (rev 2097)
+++ trunk/include/swbuf.h 2007-10-07 18:57:07 UTC (rev 2098)
@@ -375,7 +375,7 @@
* @param separator to use (e.g. ':')
* @return prefix if separator character found; otherwise, null and leaves buffer unmodified
*/
- inline const char *stripPrefix(char separator) { const char *m = strchr(buf, ':'); if (m) { int len = m-buf; char *hold = new char[len]; memcpy(hold, buf, len); *this << (len+1); memcpy(end+1, hold, len); delete [] hold; end[len+1] = 0; } return (m) ? end+1 : 0; } // safe. we know we don't actually realloc and shrink buffer when shifting, so we can place our return val at end.
+ inline const char *stripPrefix(char separator) { const char *m = strchr(buf, separator); if (m) { int len = m-buf; char *hold = new char[len]; memcpy(hold, buf, len); *this << (len+1); memcpy(end+1, hold, len); delete [] hold; end[len+1] = 0; } return (m) ? end+1 : 0; } // safe. we know we don't actually realloc and shrink buffer when shifting, so we can place our return val at end.
// this could be nicer, like replacing a contiguous series of target bytes with single replacement; offering replacement const char *
/**
Modified: trunk/src/modules/swmodule.cpp
===================================================================
--- trunk/src/modules/swmodule.cpp 2007-10-07 06:27:34 UTC (rev 2097)
+++ trunk/src/modules/swmodule.cpp 2007-10-07 18:57:07 UTC (rev 2098)
@@ -13,6 +13,7 @@
#include <treekeyidx.h> // KLUDGE for Search
#include <swoptfilter.h>
#include <filemgr.h>
+#include <stringmgr.h>
#ifndef _MSC_VER
#include <iostream>
#endif
@@ -403,6 +404,7 @@
ListKey &SWModule::search(const char *istr, int searchType, int flags, SWKey *scope, bool *justCheckIfSupported, void (*percent)(char, void *), void *percentUserData) {
listKey.ClearList();
+ SWBuf term = istr;
#ifdef USELUCENE
SWBuf target = getConfigEntry("AbsoluteDataPath");
@@ -425,9 +427,7 @@
SWKey *searchKey = 0;
SWKey *resultKey = CreateKey();
regex_t preg;
- char **words = 0;
- char *wordBuf = 0;
- int wordCount = 0;
+ vector<SWBuf> words;
const char *sres;
terminateSearch = false;
char perc = 1;
@@ -564,51 +564,49 @@
}
#endif
+ // some pre-loop processing
+ switch (searchType) {
+ // phrase
+ case -1:
+ // let's see if we're told to ignore case. If so, then we'll toupperstr our term
+ if ((flags & REG_ICASE) == REG_ICASE) toupperstr(term);
+ break;
+
// multi-word
- if (searchType == -2) {
- wordBuf = (char *)calloc(sizeof(char), strlen(istr) + 1);
- strcpy(wordBuf, istr);
- words = (char **)calloc(sizeof(char *), 10);
- int allocWords = 10;
- words[wordCount] = strtok(wordBuf, " ");
- while (words[wordCount]) {
- wordCount++;
- if (wordCount == allocWords) {
- allocWords+=10;
- words = (char **)realloc(words, sizeof(char *)*allocWords);
+ case -2:
+ // let's break the term down into our words vector
+ while (1) {
+ const char *word = term.stripPrefix(' ');
+ if (!word) {
+ words.push_back(term);
+ break;
}
- words[wordCount] = strtok(NULL, " ");
+ words.push_back(word);
}
- }
+ if ((flags & REG_ICASE) == REG_ICASE) {
+ for (unsigned int i = 0; i < words.size(); i++) {
+ toupperstr(words[i]);
+ }
+ }
+ break;
// entry attributes
- if (searchType == -3) {
- wordBuf = (char *)calloc(sizeof(char), strlen(istr) + 1);
- char *checkSlash = wordBuf;
- strcpy(wordBuf, istr);
- words = (char **)calloc(sizeof(char *), 10);
- int allocWords = 10;
- while (*checkSlash == '/')
- words[wordCount++] = checkSlash++;
- words[wordCount] = strtok(wordBuf, "/");
- while (words[wordCount]) {
- wordCount++;
- if (wordCount == allocWords) {
- allocWords+=10;
- words = (char **)realloc(words, sizeof(char *)*allocWords);
+ case -3:
+ // let's break the attribute segs down. We'll reuse our words vector for each segment
+ while (1) {
+ const char *word = term.stripPrefix('/');
+ if (!word) {
+ words.push_back(term);
+ break;
}
- checkSlash = words[wordCount-1] + (strlen(words[wordCount-1]))+1;
- while (*checkSlash == '/')
- words[wordCount++] = checkSlash++;
- words[wordCount] = strtok(NULL, "/");
+ words.push_back(word);
}
- for (int i = 0; i < wordCount; i++) {
- if (words[i][0] == '/')
- words[i][0] = 0;
- }
+ break;
}
+
+ // our main loop to iterate the module and find the stuff
perc = 5;
(*percent)(perc, percentUserData);
@@ -645,111 +643,120 @@
}
// phrase
- else if (searchType == -1) {
- sres = ((flags & REG_ICASE) == REG_ICASE) ? stristr(StripText(), istr) : strstr(StripText(), istr);
- if (sres) { //it's also in the StripText(), so we have a valid search result item now
- *resultKey = *getKey();
- listKey << *resultKey;
- }
- }
+ else {
+ SWBuf textBuf;
+ switch (searchType) {
- // multiword
- else if (searchType == -2) {
- int loopCount = 0;
- int foundWords = 0;
- do {
- const char* textBuf = ((loopCount == 0)&&(!specialStrips)) ? getRawEntry() : StripText();
- foundWords = 0;
+ // phrase
+ case -1:
+ textBuf = StripText();
+ if ((flags & REG_ICASE) == REG_ICASE) toupperstr(textBuf);
+ sres = strstr(textBuf.c_str(), term.c_str());
+ if (sres) { //it's also in the StripText(), so we have a valid search result item now
+ *resultKey = *getKey();
+ listKey << *resultKey;
+ }
+ break;
+
+ // multiword
+ case -2: { // enclose our allocations
+ int loopCount = 0;
+ unsigned int foundWords = 0;
+ do {
+ textBuf = ((loopCount == 0)&&(!specialStrips)) ? getRawEntry() : StripText();
+ foundWords = 0;
+
+ for (unsigned int i = 0; i < words.size(); i++) {
+ if ((flags & REG_ICASE) == REG_ICASE) toupperstr(textBuf);
+ sres = strstr(textBuf.c_str(), words[i].c_str());
+ if (!sres) {
+ break; //for loop
+ }
+ foundWords++;
+ }
+
+ loopCount++;
+ } while ( (loopCount < 2) && (foundWords == words.size()));
- for (int i = 0; i < wordCount; ++i) {
- sres = ((flags & REG_ICASE) == REG_ICASE) ? stristr(textBuf, words[i]) : strstr(textBuf, words[i]);
- if (!sres) {
- break; //for loop
- }
- ++foundWords;
+ if ((loopCount == 2) && (foundWords == words.size())) { //we found the right words in both raw and stripped text, which means it's a valid result item
+ *resultKey = *getKey();
+ listKey << *resultKey;
}
-
- ++loopCount;
- } while ( (loopCount < 2) && (foundWords == wordCount));
-
- if ((loopCount == 2) && (foundWords == wordCount)) { //we found the right words in both raw and stripped text, which means it's a valid result item
- *resultKey = *getKey();
- listKey << *resultKey;
- }
- }
+ } break;
- // entry attributes
- else if (searchType == -3) {
- RenderText(); // force parse
- AttributeTypeList &entryAttribs = getEntryAttributes();
- AttributeTypeList::iterator i1Start, i1End;
- AttributeList::iterator i2Start, i2End;
- AttributeValue::iterator i3Start, i3End;
+ // entry attributes
+ case -3:
+ RenderText(); // force parse
+ AttributeTypeList &entryAttribs = getEntryAttributes();
+ AttributeTypeList::iterator i1Start, i1End;
+ AttributeList::iterator i2Start, i2End;
+ AttributeValue::iterator i3Start, i3End;
- if ((words[0]) && (words[0][0])) {
- i1Start = entryAttribs.find(words[0]);
- i1End = i1Start;
- if (i1End != entryAttribs.end())
- i1End++;
- }
- else {
- i1Start = entryAttribs.begin();
- i1End = entryAttribs.end();
- }
- for (;i1Start != i1End; i1Start++) {
- if ((words[1]) && (words[1][0])) {
- i2Start = i1Start->second.find(words[1]);
- i2End = i2Start;
- if (i2End != i1Start->second.end())
- i2End++;
+ if ((words.size()) && (words[0].length())) {
+ i1Start = entryAttribs.find(words[0]);
+ i1End = i1Start;
+ if (i1End != entryAttribs.end())
+ i1End++;
}
else {
- i2Start = i1Start->second.begin();
- i2End = i1Start->second.end();
+ i1Start = entryAttribs.begin();
+ i1End = entryAttribs.end();
}
- for (;i2Start != i2End; i2Start++) {
- if ((words[2]) && (words[2][0])) {
- i3Start = i2Start->second.find(words[2]);
- i3End = i3Start;
- if (i3End != i2Start->second.end())
- i3End++;
+ for (;i1Start != i1End; i1Start++) {
+ if ((words.size()>1) && (words[1].length())) {
+ i2Start = i1Start->second.find(words[1]);
+ i2End = i2Start;
+ if (i2End != i1Start->second.end())
+ i2End++;
}
else {
- i3Start = i2Start->second.begin();
- i3End = i2Start->second.end();
+ i2Start = i1Start->second.begin();
+ i2End = i1Start->second.end();
}
- for (;i3Start != i3End; i3Start++) {
- if (flags & SEARCHFLAG_MATCHWHOLEENTRY) {
- bool found = !(((flags & REG_ICASE) == REG_ICASE) ? stricmp(i3Start->second.c_str(), words[3]) : strcmp(i3Start->second.c_str(), words[3]));
- sres = (found) ? i3Start->second.c_str() : 0;
+ for (;i2Start != i2End; i2Start++) {
+ if ((words.size()>2) && (words[2].length())) {
+ i3Start = i2Start->second.find(words[2]);
+ i3End = i3Start;
+ if (i3End != i2Start->second.end())
+ i3End++;
}
else {
- sres = ((flags & REG_ICASE) == REG_ICASE) ? stristr(i3Start->second.c_str(), words[3]) : strstr(i3Start->second.c_str(), words[3]);
+ i3Start = i2Start->second.begin();
+ i3End = i2Start->second.end();
}
- if (sres) {
- *resultKey = *getKey();
- listKey << *resultKey;
+ for (;i3Start != i3End; i3Start++) {
+ if ((words.size()>3) && (words[3].length())) {
+ if (flags & SEARCHFLAG_MATCHWHOLEENTRY) {
+ bool found = !(((flags & REG_ICASE) == REG_ICASE) ? stricmp(i3Start->second.c_str(), words[3]) : strcmp(i3Start->second.c_str(), words[3]));
+ sres = (found) ? i3Start->second.c_str() : 0;
+ }
+ else {
+ sres = ((flags & REG_ICASE) == REG_ICASE) ? stristr(i3Start->second.c_str(), words[3]) : strstr(i3Start->second.c_str(), words[3]);
+ }
+ if (sres) {
+ *resultKey = *getKey();
+ listKey << *resultKey;
+ break;
+ }
+ }
+ }
+ if (i3Start != i3End)
break;
- }
}
- if (i3Start != i3End)
+ if (i2Start != i2End)
break;
}
- if (i2Start != i2End)
- break;
- }
+ break;
+ } // end switch
}
(*this)++;
}
+
+ // cleanup work
if (searchType >= 0)
regfree(&preg);
- if (searchType == -2) {
- free(words);
- free(wordBuf);
- }
-
setKey(*saveKey);
if (!saveKey->Persist())
@@ -776,7 +783,7 @@
* ENT: buf - buf to massage instead of this modules current text
* len - max len of buf
*
- * RET: this module's text at specified key location massaged by Strip filters
+ * RET: this module's text at current key location massaged by Strip filters
*/
const char *SWModule::StripText(const char *buf, int len) {
@@ -789,7 +796,7 @@
*
* ENT: buf - buffer to Render instead of current module position
*
- * RET: this module's text at specified key location massaged by RenderText filters
+ * RET: this module's text at current key location massaged by RenderText filters
*/
const char *SWModule::RenderText(const char *buf, int len, bool render) {
@@ -830,7 +837,7 @@
*
* ENT: tmpKey - key to use to grab text
*
- * RET: this module's text at specified key location massaged by RenderFilers
+ * RET: this module's text at current key location massaged by RenderFilters
*/
const char *SWModule::RenderText(SWKey *tmpKey) {
More information about the sword-cvs
mailing list