[jsword-svn] r1318 - in trunk/jsword/src/main/java/org/crosswire/jsword: book book/sword index/lucene
dmsmith at www.crosswire.org
dmsmith at www.crosswire.org
Sun May 6 08:36:36 MST 2007
Author: dmsmith
Date: 2007-05-06 08:36:35 -0700 (Sun, 06 May 2007)
New Revision: 1318
Added:
trunk/jsword/src/main/java/org/crosswire/jsword/index/lucene/LuceneAnalyzer.java
Modified:
trunk/jsword/src/main/java/org/crosswire/jsword/book/BookData.java
trunk/jsword/src/main/java/org/crosswire/jsword/book/FeatureType.java
trunk/jsword/src/main/java/org/crosswire/jsword/book/OSISUtil.java
trunk/jsword/src/main/java/org/crosswire/jsword/book/sword/SwordBookMetaData.java
trunk/jsword/src/main/java/org/crosswire/jsword/index/lucene/LuceneIndex.java
Log:
Added the ability to index Strong's Numbers and cross references.
Modified: trunk/jsword/src/main/java/org/crosswire/jsword/book/BookData.java
===================================================================
--- trunk/jsword/src/main/java/org/crosswire/jsword/book/BookData.java 2007-05-06 15:24:35 UTC (rev 1317)
+++ trunk/jsword/src/main/java/org/crosswire/jsword/book/BookData.java 2007-05-06 15:36:35 UTC (rev 1318)
@@ -88,6 +88,24 @@
}
/**
+ * Return just the Strong's numbers.
+ * @return The Book's Strong's numbers as a space separated string.
+ */
+ public String getStrongsNumbers()
+ {
+ return OSISUtil.getStrongsNumbers(getOsis());
+ }
+
+ /**
+ * Return just the scripture references in the book.
+ * @return The Book's scripture references
+ */
+ public String getReferences()
+ {
+ return OSISUtil.getReferences(getOsis());
+ }
+
+ /**
* Check that a BibleData is valid.
* Currently, this does nothing, and isn't used. it was broken when we used
* JAXB, however it wasn't much use then becuase JAXB did a lot to keep the
Modified: trunk/jsword/src/main/java/org/crosswire/jsword/book/FeatureType.java
===================================================================
--- trunk/jsword/src/main/java/org/crosswire/jsword/book/FeatureType.java 2007-05-06 15:24:35 UTC (rev 1317)
+++ trunk/jsword/src/main/java/org/crosswire/jsword/book/FeatureType.java 2007-05-06 15:36:35 UTC (rev 1318)
@@ -52,7 +52,6 @@
*/
public static final FeatureType HEBREW_PARSE = new FeatureType("HebrewParse"); //$NON-NLS-1$
-
/**
* The book is one of Daily Devotions.
*/
@@ -64,8 +63,38 @@
public static final FeatureType GLOSSARY = new FeatureType("Glossary"); //$NON-NLS-1$
/**
- * @param name The name of the BookCategory
+ * The book contains Strong's Numbers
*/
+ public static final FeatureType STRONGS_NUMBERS = new FeatureType("StrongsNumbers"); //$NON-NLS-1$
+
+ /**
+ * The book contains footnotes
+ */
+ public static final FeatureType FOOTNOTES = new FeatureType("Footnotes"); //$NON-NLS-1$
+
+ /**
+ * The book contains Scripture cross references
+ */
+ public static final FeatureType SCRIPTURE_REFERENCES = new FeatureType("Scripref"); //$NON-NLS-1$
+
+ /**
+ * The book marks the Words of Christ
+ */
+ public static final FeatureType WORDS_OF_CHRIST = new FeatureType("RedLetterText"); //$NON-NLS-1$
+
+ /**
+ * The book contains Morphology info
+ */
+ public static final FeatureType MORPHOLOGY = new FeatureType("Morph"); //$NON-NLS-1$
+
+ /**
+ * The book contains Headings
+ */
+ public static final FeatureType HEADINGS = new FeatureType("Headings"); //$NON-NLS-1$
+
+ /**
+ * @param name The name of the FeatureType
+ */
private FeatureType(String name)
{
this.name = name;
@@ -106,7 +135,7 @@
}
/**
- * The name of the BookCategory
+ * The name of the FeatureType
*/
private String name;
@@ -127,6 +156,12 @@
HEBREW_PARSE,
DAILY_DEVOTIONS,
GLOSSARY,
+ STRONGS_NUMBERS,
+ FOOTNOTES,
+ SCRIPTURE_REFERENCES,
+ WORDS_OF_CHRIST,
+ MORPHOLOGY,
+ HEADINGS,
};
/**
Modified: trunk/jsword/src/main/java/org/crosswire/jsword/book/OSISUtil.java
===================================================================
--- trunk/jsword/src/main/java/org/crosswire/jsword/book/OSISUtil.java 2007-05-06 15:24:35 UTC (rev 1317)
+++ trunk/jsword/src/main/java/org/crosswire/jsword/book/OSISUtil.java 2007-05-06 15:36:35 UTC (rev 1318)
@@ -28,9 +28,15 @@
import java.util.Iterator;
import java.util.List;
import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
import org.crosswire.common.util.Logger;
+import org.crosswire.jsword.passage.Key;
+import org.crosswire.jsword.passage.KeyFactory;
+import org.crosswire.jsword.passage.NoSuchKeyException;
import org.crosswire.jsword.passage.NoSuchVerseException;
+import org.crosswire.jsword.passage.PassageKeyFactory;
import org.crosswire.jsword.passage.Verse;
import org.crosswire.jsword.passage.VerseFactory;
import org.jdom.Content;
@@ -617,6 +623,86 @@
return buffer.toString().trim();
}
+ /**
+ * A space separated string containing Strong's numbers.
+ * @return The Strong's numbers in the text
+ */
+ public static String getStrongsNumbers(Element root)
+ {
+ StringBuffer buffer = new StringBuffer();
+
+ Iterator contentIter = getDeepContent(root, OSISUtil.OSIS_ELEMENT_W).iterator();
+ while (contentIter.hasNext())
+ {
+ Element ele = (Element) contentIter.next();
+ String attr = ele.getAttributeValue(OSISUtil.ATTRIBUTE_W_LEMMA);
+ if (attr != null)
+ {
+ if (buffer.length() > 0)
+ {
+ buffer.append(' ');
+ }
+
+ buffer.append(attr);
+ }
+ }
+
+ String lemmas = buffer.toString();
+
+ // Clear out the buffer for re-use
+ int len = buffer.length();
+ if (len > 0)
+ {
+ buffer.delete(0, len);
+ }
+
+ Matcher matcher = strongsNumberPattern.matcher(lemmas);
+ while (matcher.find())
+ {
+ String strongType = matcher.group(1);
+ String strongsNum = matcher.group(2);
+ if (buffer.length() > 0)
+ {
+ buffer.append(' ');
+ }
+ buffer.append(strongType);
+ buffer.append(strongsNum);
+ }
+
+ return buffer.toString().trim();
+ }
+
+ /**
+ * A space separated string containing the osisIDs from the reference elements.
+ * @return The references in the text
+ */
+ public static String getReferences(Element root)
+ {
+ KeyFactory keyf = PassageKeyFactory.instance();
+ Key collector = keyf.createEmptyKeyList();
+
+ Iterator contentIter = getDeepContent(root, OSISUtil.OSIS_ELEMENT_REFERENCE).iterator();
+ while (contentIter.hasNext())
+ {
+ Element ele = (Element) contentIter.next();
+ String attr = ele.getAttributeValue(OSISUtil.OSIS_ATTR_REF);
+ if (attr != null)
+ {
+ try
+ {
+ Key key = keyf.getKey(attr);
+ collector.addAll(key);
+ }
+ catch (NoSuchKeyException e)
+ {
+ log.warn("Unable to parse: " + attr, e); //$NON-NLS-1$
+ }
+ }
+ }
+
+ return collector.getOsisID();
+ }
+
private static void getCanonicalContent(String sName, String sID, Iterator iter, StringBuffer buffer)
{
Object data = null;
@@ -697,6 +783,7 @@
throw new BookException(Msg.OSIS_BADID, ex, new Object[] { osisid });
}
}
+
// So we just walk up the tree trying to find a verse
Parent parent = ele.getParent();
if (parent instanceof Element)
@@ -742,11 +829,17 @@
reply.add(start);
}
+ Object data = null;
+ Element ele = null;
Iterator contentIter = start.getContent().iterator();
while (contentIter.hasNext())
{
- Element ele = (Element) contentIter.next();
- recurseDeepContent(ele, name, reply);
+ data = contentIter.next();
+ if (data instanceof Element)
+ {
+ ele = (Element) data;
+ recurseDeepContent(ele, name, reply);
+ }
}
}
@@ -785,4 +878,7 @@
recurseElement(sub, buffer);
}
}
+
+ private static String strongsNumber = "strong:([GH])0*([0-9]+)"; //$NON-NLS-1$
+ private static Pattern strongsNumberPattern = Pattern.compile(strongsNumber);
}
Modified: trunk/jsword/src/main/java/org/crosswire/jsword/book/sword/SwordBookMetaData.java
===================================================================
--- trunk/jsword/src/main/java/org/crosswire/jsword/book/sword/SwordBookMetaData.java 2007-05-06 15:24:35 UTC (rev 1317)
+++ trunk/jsword/src/main/java/org/crosswire/jsword/book/sword/SwordBookMetaData.java 2007-05-06 15:36:35 UTC (rev 1318)
@@ -263,8 +263,22 @@
*/
/* @Override */
public boolean hasFeature(FeatureType feature)
- {
- return cet.match(ConfigEntryType.FEATURE, feature.toString());
+ {
+ if (cet.match(ConfigEntryType.FEATURE, feature.toString()))
+ {
+ return true;
+ }
+ // Many "features" are GlobalOptionFilters, which in the Sword C++ API
+ // indicate a class to use for filtering.
+ // These mostly have the source type prepended to the feature
+ StringBuffer buffer = new StringBuffer(getProperty(ConfigEntryType.SOURCE_TYPE));
+ buffer.append(feature);
+ if (cet.match(ConfigEntryType.GLOBAL_OPTION_FILTER, buffer.toString()))
+ {
+ return true;
+ }
+ // But some do not
+ return cet.match(ConfigEntryType.GLOBAL_OPTION_FILTER, feature.toString());
}
private void buildProperties()
Added: trunk/jsword/src/main/java/org/crosswire/jsword/index/lucene/LuceneAnalyzer.java
===================================================================
--- trunk/jsword/src/main/java/org/crosswire/jsword/index/lucene/LuceneAnalyzer.java (rev 0)
+++ trunk/jsword/src/main/java/org/crosswire/jsword/index/lucene/LuceneAnalyzer.java 2007-05-06 15:36:35 UTC (rev 1318)
@@ -0,0 +1,45 @@
+package org.crosswire.jsword.index.lucene;
+
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.KeywordAnalyzer;
+import org.apache.lucene.analysis.SimpleAnalyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.WhitespaceAnalyzer;
+
+public class LuceneAnalyzer extends Analyzer
+{
+
+ public LuceneAnalyzer()
+ {
+ }
+
+ public TokenStream tokenStream(String fieldName, Reader reader)
+ {
+ // do not tokenize keys
+ if (LuceneIndex.FIELD_KEY.equals(fieldName))
+ {
+ return KEYWORD.tokenStream(fieldName, reader);
+ }
+ // Split Strong's Numbers on whitespace
+ else if (LuceneIndex.FIELD_STRONG.equals(fieldName))
+ {
+ return WHITESPACE.tokenStream(fieldName, reader);
+ }
+ // Split xrefs on whitespace
+ else if (LuceneIndex.FIELD_XREF.equals(fieldName))
+ {
+ return WHITESPACE.tokenStream(fieldName, reader);
+ }
+ // just use the standard tokenizer
+ else
+ {
+ return SIMPLE.tokenStream(fieldName, reader);
+ }
+ }
+
+ private static final Analyzer KEYWORD = new KeywordAnalyzer();
+ private static final Analyzer WHITESPACE = new WhitespaceAnalyzer();
+ private static final Analyzer SIMPLE = new SimpleAnalyzer();
+}
Modified: trunk/jsword/src/main/java/org/crosswire/jsword/index/lucene/LuceneIndex.java
===================================================================
--- trunk/jsword/src/main/java/org/crosswire/jsword/index/lucene/LuceneIndex.java 2007-05-06 15:24:35 UTC (rev 1317)
+++ trunk/jsword/src/main/java/org/crosswire/jsword/index/lucene/LuceneIndex.java 2007-05-06 15:36:35 UTC (rev 1318)
@@ -30,7 +30,6 @@
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
@@ -53,6 +52,7 @@
import org.crosswire.jsword.book.Book;
import org.crosswire.jsword.book.BookData;
import org.crosswire.jsword.book.BookException;
+import org.crosswire.jsword.book.FeatureType;
import org.crosswire.jsword.index.AbstractIndex;
import org.crosswire.jsword.index.IndexStatus;
import org.crosswire.jsword.index.search.SearchModifier;
@@ -114,7 +114,7 @@
Progress job = JobManager.createJob(Msg.INDEX_START.toString(), Thread.currentThread(), false);
IndexStatus finalStatus = IndexStatus.UNDONE;
- Analyzer analyzer = new SimpleAnalyzer();
+ Analyzer analyzer = new LuceneAnalyzer();
List errors = new ArrayList();
File tempPath = new File(path + '.' + IndexStatus.CREATING.toString());
@@ -202,7 +202,7 @@
try
{
- Analyzer analyzer = new SimpleAnalyzer();
+ Analyzer analyzer = new LuceneAnalyzer();
QueryParser parser = new QueryParser(LuceneIndex.FIELD_BODY, analyzer);
Query query = parser.parse(search);
Hits hits = searcher.search(query);
@@ -216,7 +216,7 @@
results = tally;
for (int i = 0; i < hits.length(); i++)
{
- Key key = VerseFactory.fromString(hits.doc(i).get(LuceneIndex.FIELD_NAME));
+ Key key = VerseFactory.fromString(hits.doc(i).get(LuceneIndex.FIELD_KEY));
// PassageTally understands a score of 0 as the verse not participating
int score = (int) (hits.score(i) * 100 + 1);
tally.add(key, score);
@@ -238,7 +238,7 @@
}
for (int i = 0; i < hits.length(); i++)
{
- Key key = VerseFactory.fromString(hits.doc(i).get(LuceneIndex.FIELD_NAME));
+ Key key = VerseFactory.fromString(hits.doc(i).get(LuceneIndex.FIELD_KEY));
results.addAll(key);
}
if (passage != null)
@@ -339,10 +339,15 @@
*/
private void generateSearchIndexImpl(Progress job, List errors, IndexWriter writer, Key key, int count) throws BookException, IOException
{
+ boolean hasStrongs = book.getBookMetaData().hasFeature(FeatureType.STRONGS_NUMBERS);
+ boolean hasXRefs = book.getBookMetaData().hasFeature(FeatureType.SCRIPTURE_REFERENCES);
+
String oldRootName = ""; //$NON-NLS-1$
int percent = 0;
String rootName = ""; //$NON-NLS-1$
String text = ""; //$NON-NLS-1$
+ String strongs = ""; //$NON-NLS-1$
+ String xrefs = ""; //$NON-NLS-1$
BookData data = null;
Key subkey = null;
Document doc = null;
@@ -369,13 +374,36 @@
}
text = data.getCanonicalText();
+ if (hasStrongs)
+ {
+ strongs = data.getStrongsNumbers();
+ }
// Do the actual indexing
if (text != null && text.length() > 0)
{
doc = new Document();
- doc.add(new Field(FIELD_NAME, subkey.getOsisRef(), Field.Store.YES, Field.Index.NO));
+ doc.add(new Field(FIELD_KEY, subkey.getOsisRef(), Field.Store.YES, Field.Index.UN_TOKENIZED));
doc.add(new Field(FIELD_BODY, new StringReader(text)));
+
+ if (hasStrongs)
+ {
+ strongs = data.getStrongsNumbers();
+ if (strongs != null && strongs.length() > 0)
+ {
+ doc.add(new Field(FIELD_STRONG, strongs, Field.Store.NO, Field.Index.TOKENIZED));
+ }
+ }
+
+ if (hasXRefs)
+ {
+ xrefs = data.getReferences();
+ if (xrefs != null && xrefs.length() > 0)
+ {
+ doc.add(new Field(FIELD_XREF, xrefs, Field.Store.NO, Field.Index.TOKENIZED));
+ }
+ }
+
writer.addDocument(doc);
}
@@ -423,7 +451,7 @@
/**
* The Lucene field for the osisID
*/
- protected static final String FIELD_NAME = "key"; //$NON-NLS-1$
+ protected static final String FIELD_KEY = "key"; //$NON-NLS-1$
/**
* The Lucene field for the text contents
@@ -436,6 +464,16 @@
protected static final String FIELD_STRONG = "strong"; //$NON-NLS-1$
/**
+ * The Lucene field for cross references
+ */
+ protected static final String FIELD_XREF = "xref"; //$NON-NLS-1$
+
+ /**
+ * The Lucene field for notes
+ */
+ protected static final String FIELD_NOTES = "note"; //$NON-NLS-1$
+
+ /**
* The Book that we are indexing
*/
protected Book book;
More information about the jsword-svn
mailing list