[jsword-svn]
jsword/java/jsword/org/crosswire/jsword/book/search/ser s
jswordcvs at crosswire.org
jswordcvs at crosswire.org
Wed Sep 29 15:21:25 MST 2004
Update of /cvs/jsword/jsword/java/jsword/org/crosswire/jsword/book/search/ser
In directory www.crosswire.org:/tmp/cvs-serv8429/java/jsword/org/crosswire/jsword/book/search/ser
Modified Files:
Msg.java Msg.properties
Added Files:
SerIndex.java
Removed Files:
SerSearchEngine.java
Log Message:
Fixes for [JS-7] and [JS-6]
Lots of search work and re-factoring
--- SerSearchEngine.java DELETED ---
Index: Msg.properties
===================================================================
RCS file: /cvs/jsword/jsword/java/jsword/org/crosswire/jsword/book/search/ser/Msg.properties,v
retrieving revision 1.2
retrieving revision 1.3
diff -C2 -d -r1.2 -r1.3
*** Msg.properties 27 Jul 2004 21:42:38 -0000 1.2
--- Msg.properties 29 Sep 2004 22:21:23 -0000 1.3
***************
*** 6,14 ****
# It should have no spaces or other punctuation (e.g. _, -, ', ...)
! SerSearchEngine.SearchFail=Could not start search engine
! SerSearchEngine.RepeatedReadError=Too many errors while reading data.
! SerSearchEngine.WriteError=Write Error.
! SerSearchEngine.FindingWords=Finding Words ({0})
! SerSearchEngine.WritingWords=Writing Words ({0})
! SerSearchEngine.Saving=Saving Index
! SerSearchEngine.Interrupted=Interrupted while creating index.
\ No newline at end of file
--- 6,13 ----
# It should have no spaces or other punctuation (e.g. _, -, ', ...)
! SerIndex.RepeatedReadError=Too many errors while reading data.
! SerIndex.WriteError=Write Error.
! SerIndex.FindingWords=Finding Words ({0})
! SerIndex.WritingWords=Writing Words ({0})
! SerIndex.Saving=Saving Index
! SerIndex.Interrupted=Interrupted while creating index.
\ No newline at end of file
--- NEW FILE: SerIndex.java ---
package org.crosswire.jsword.book.search.ser;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.io.RandomAccessFile;
import java.net.URL;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;
import org.crosswire.common.activate.Activatable;
import org.crosswire.common.activate.Activator;
import org.crosswire.common.activate.Lock;
import org.crosswire.common.progress.Job;
import org.crosswire.common.util.FileUtil;
import org.crosswire.common.util.Logger;
import org.crosswire.common.util.NetUtil;
import org.crosswire.common.util.Reporter;
import org.crosswire.jsword.book.Book;
import org.crosswire.jsword.book.BookData;
import org.crosswire.jsword.book.BookException;
import org.crosswire.jsword.book.SentanceUtil;
import org.crosswire.jsword.book.search.Index;
import org.crosswire.jsword.book.search.IndexManager;
import org.crosswire.jsword.passage.BibleInfo;
import org.crosswire.jsword.passage.Key;
import org.crosswire.jsword.passage.KeyUtil;
import org.crosswire.jsword.passage.NoSuchKeyException;
import org.crosswire.jsword.passage.Passage;
import org.crosswire.jsword.passage.PassageKeyFactory;
import org.crosswire.jsword.passage.Verse;
import org.crosswire.jsword.util.Project;
/**
* A search engine - This is a stepping stone on the way to allowing use of
* Lucene in place of our search engine.
*
* <p><table border='1' cellPadding='3' cellSpacing='0'>
* <tr><td bgColor='white' class='TableRowColor'><font size='-7'>
*
* Distribution Licence:<br />
* JSword is free software; you can redistribute it
* and/or modify it under the terms of the GNU General Public License,
* version 2 as published by the Free Software Foundation.<br />
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.<br />
* The License is available on the internet
* <a href='http://www.gnu.org/copyleft/gpl.html'>here</a>, or by writing to:
* Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
* MA 02111-1307, USA<br />
* The copyright to this program is held by its authors.
* </font></td></tr></table>
* @see gnu.gpl.Licence
* @author Joe Walker [joe at eireneh dot com]
* @version $Id: SerIndex.java,v 1.1 2004/09/29 22:21:23 joe Exp $
*/
public class SerIndex implements Index, Activatable
{
    /**
     * Bind this index to a book and work out where its files are kept.
     * The location is a scratch area named after the book's driver name and
     * initials.
     * @param newbook The book that this index covers
     * @throws BookException If the scratch area can not be obtained
     */
    public void init(Book newbook) throws BookException
    {
        this.book = newbook;

        try
        {
            String driverName = book.getBookMetaData().getDriverName();
            String bookName = book.getBookMetaData().getInitials();

            assert driverName != null;
            assert bookName != null;

            url = Project.instance().getTempScratchSpace(driverName + "-" + bookName, false); //$NON-NLS-1$
        }
        catch (IOException ex)
        {
            // Chain the cause so the underlying I/O failure is not lost
            throw new BookException(Msg.WRITE_ERROR, ex);
        }
    }

    /**
     * Convert a name into a Key by delegating to the book being indexed.
     * @see org.crosswire.jsword.book.search.Index#getKey(java.lang.String)
     */
    public Key getKey(String name) throws NoSuchKeyException
    {
        return book.getKey(name);
    }

    /**
     * Remove the index. Currently this only makes sure that we are active;
     * nothing is deleted.
     */
    public void delete()
    {
        checkActive();

        // LATER(joe): write delete()
    }

    /**
     * Find all the indexed words that begin with the given prefix.
     * @param word The prefix to look for (matched in lower case)
     * @return A view of the matching words in the index
     */
    public Collection getStartsWith(String word)
    {
        checkActive();

        word = word.toLowerCase();

        // The range is [prefix, prefix + U+FFFF). Using the largest char as
        // the upper bound means no word is dropped just because the character
        // following the prefix has a high code unit.
        SortedMap submap = datamap.subMap(word, word + Character.MAX_VALUE);
        return submap.keySet();
    }

    /**
     * Look up the references recorded against a word.
     * @param word The word to search for (matched in lower case)
     * @return The references, or an empty key list if the word is null, is
     * not in the index, or could not be read from the data file
     */
    public Key findWord(String word)
    {
        checkActive();

        if (word == null)
        {
            return book.createEmptyKeyList();
        }

        Section section = (Section) datamap.get(word.toLowerCase());
        if (section == null)
        {
            return book.createEmptyKeyList();
        }

        try
        {
            // Read blob. readFully() keeps reading until the buffer is full
            // and throws if the file ends first, so a legitimately short
            // single read is no longer mistaken for a failure.
            byte[] blob = new byte[section.length];
            dataRaf.seek(section.offset);
            dataRaf.readFully(blob);

            // De-serialize
            return PassageKeyFactory.fromBinaryRepresentation(blob);
        }
        catch (Exception ex)
        {
            log.warn("Search failed on:"); //$NON-NLS-1$
            log.warn("  word=" + word); //$NON-NLS-1$
            log.warn("  offset=" + section.offset); //$NON-NLS-1$
            log.warn("  length=" + section.length); //$NON-NLS-1$
            Reporter.informUser(this, ex);

            return book.createEmptyKeyList();
        }
    }

    /**
     * Is there a usable index on disk?
     * @return false while an index is being generated, otherwise whether the
     * index file exists
     */
    public boolean isIndexed()
    {
        if (generating)
        {
            return false;
        }

        URL indexIn = NetUtil.lengthenURL(url, FILE_INDEX);
        return NetUtil.isFile(indexIn);
    }

    /**
     * Build the index: read every key in the book collecting the words used,
     * write the references for each word to the data file and then save the
     * word index.
     * @param job The job through which progress is reported
     * @throws BookException If the index can not be written or if the thread
     * is interrupted before the index is complete
     */
    public void generateSearchIndex(Job job) throws BookException
    {
        // Let isIndexed() know that whatever is on disk is in flux
        generating = true;
        try
        {
            // create a word/passage hashmap
            Map matchmap = new HashMap();
            generateSearchIndexImpl(job, book.getGlobalKeyList(), matchmap);

            // Saving a half read book would leave a truncated index that
            // looks complete, so give up instead.
            failIfInterrupted();

            openDataForWriting();
            writeWords(job, matchmap);
            saveIndex(job);
        }
        finally
        {
            generating = false;
        }
    }

    /**
     * Open the data file for writing, discarding any in-memory index left
     * over from a previous data file.
     */
    private void openDataForWriting() throws BookException
    {
        try
        {
            NetUtil.makeDirectory(url);
            URL dataUrl = NetUtil.lengthenURL(url, FILE_DATA);

            // Entries describing an earlier data file are about to be invalid
            closeDataFile();
            datamap.clear();

            dataRaf = new RandomAccessFile(NetUtil.getAsFile(dataUrl), FileUtil.MODE_WRITE);
        }
        catch (IOException ex)
        {
            throw new BookException(Msg.WRITE_ERROR, ex);
        }
    }

    /**
     * Write the references for every word to the data file, reporting
     * progress as we go.
     */
    private void writeWords(Job job, Map matchmap) throws BookException
    {
        // For the progress listener
        int count = 0;
        int words = matchmap.size();

        for (Iterator it = matchmap.entrySet().iterator(); it.hasNext(); )
        {
            Map.Entry entry = (Map.Entry) it.next();
            String word = (String) entry.getKey();
            Key match = (Key) entry.getValue();

            recordFoundPassage(word, match);

            // The write phase takes PERCENT_WRITE of the bar, in proportion
            // to the number of words done. words is never 0 inside the loop.
            int percent = PERCENT_READ + PERCENT_WRITE * count++ / words;
            job.setProgress(percent, Msg.WRITING_WORDS.toString(word));

            // This could take a long time ...
            Thread.yield();
            failIfInterrupted();
        }
    }

    /**
     * Save the word index (one "word:offset:length" line per word).
     */
    private void saveIndex(Job job) throws BookException
    {
        try
        {
            job.setProgress(PERCENT_READ + PERCENT_WRITE, Msg.SAVING.toString());

            // Save the ascii Passage index
            URL indexurl = NetUtil.lengthenURL(url, FILE_INDEX);
            PrintWriter indexout = new PrintWriter(NetUtil.getOutputStream(indexurl));
            try
            {
                for (Iterator it = datamap.entrySet().iterator(); it.hasNext(); )
                {
                    Map.Entry entry = (Map.Entry) it.next();
                    String word = (String) entry.getKey();
                    Section section = (Section) entry.getValue();
                    indexout.println(word + ":" + section.offset + ":" + section.length); //$NON-NLS-1$ //$NON-NLS-2$
                }
            }
            finally
            {
                indexout.close();
            }

            // PrintWriter never throws on a failed write, it only records it
            if (indexout.checkError())
            {
                throw new BookException(Msg.WRITE_ERROR);
            }
        }
        catch (IOException ex)
        {
            throw new BookException(Msg.WRITE_ERROR, ex);
        }
    }

    /**
     * Abandon index generation if the current thread has been interrupted.
     * The interrupted status of the thread is left set.
     */
    private void failIfInterrupted() throws BookException
    {
        if (Thread.currentThread().isInterrupted())
        {
            throw new BookException(Msg.INTERRUPTED);
        }
    }

    /**
     * Dig down into a Key indexing as we go.
     * Keys that can have children are recursed into, the text of any other
     * key is split into words and the key is added to the entry for each word.
     * Returns early, leaving the map incomplete, if the thread is interrupted.
     * @param job The job through which progress is reported
     * @param key The key whose members are to be indexed
     * @param matchmap The map of words to the keys in which they were found
     */
    private void generateSearchIndexImpl(Job job, Key key, Map matchmap) throws BookException
    {
        // loop through all the verses
        int percent = 0;
        for (Iterator it = key.iterator(); it.hasNext(); )
        {
            Key sublist = (Key) it.next();
            if (sublist.canHaveChildren())
            {
                generateSearchIndexImpl(job, sublist, matchmap);
            }
            else
            {
                BookData data = book.getData(sublist);
                String text = data.getPlainText();

                String[] words = SentanceUtil.getWords(text);
                for (int i = 0; i < words.length; i++)
                {
                    // ensure there is a Passage for this word in the word/passage hashmap
                    Key matches = (Key) matchmap.get(words[i]);
                    if (matches == null)
                    {
                        matches = book.createEmptyKeyList();
                        matchmap.put(words[i], matches);
                    }

                    // add this verse to this words passage
                    matches.addAll(sublist);
                }

                // report progress
                if (sublist instanceof Passage)
                {
                    Verse verse = KeyUtil.getVerse(sublist);
                    percent = PERCENT_READ * verse.getOrdinal() / BibleInfo.versesInBible();
                }

                job.setProgress(percent, Msg.FINDING_WORDS.toString(sublist.getName()));

                // This could take a long time ...
                Thread.yield();
            }

            // Checked after the recursive call too, so that an interrupt
            // unwinds the whole recursion rather than just the innermost loop.
            if (Thread.currentThread().isInterrupted())
            {
                return;
            }
        }
    }

    /**
     * Add to the main index data the references against this word
     * @param word The word to write, nothing is done if this is null
     * @param key The references to the word
     * @throws BookException If the references can not be serialized or written
     */
    private void recordFoundPassage(String word, Key key) throws BookException
    {
        if (word == null)
        {
            return;
        }

        try
        {
            Passage ref = KeyUtil.getPassage(key);
            byte[] buffer = PassageKeyFactory.toBinaryRepresentation(ref);

            // The section starts wherever the file pointer is before the write
            Section section = new Section(dataRaf.getFilePointer(), buffer.length);

            dataRaf.write(buffer);
            datamap.put(word.toLowerCase(), section);
        }
        catch (Exception ex)
        {
            throw new BookException(Msg.WRITE_ERROR, ex);
        }
    }

    /**
     * Make the index ready for use. If there is an index on disk then the data
     * file is opened and the word index is loaded, otherwise the IndexManager
     * is asked to create an index. Read failures are logged and not thrown.
     * @param lock The activation lock (unused here)
     */
    public final void activate(Lock lock)
    {
        // Load the ascii Passage index
        if (isIndexed())
        {
            BufferedReader indexIn = null;
            try
            {
                URL dataUrl = NetUtil.lengthenURL(url, FILE_DATA);

                // Do not leak a handle that is still open from earlier use
                closeDataFile();
                dataRaf = new RandomAccessFile(NetUtil.getAsFile(dataUrl), FileUtil.MODE_READ);

                URL indexUrl = NetUtil.lengthenURL(url, FILE_INDEX);
                indexIn = new BufferedReader(new InputStreamReader(indexUrl.openStream()));

                for (String line = indexIn.readLine(); line != null; line = indexIn.readLine())
                {
                    loadIndexLine(line);
                }
            }
            catch (IOException ex)
            {
                log.error("Read failed on indexin", ex); //$NON-NLS-1$
            }
            finally
            {
                if (indexIn != null)
                {
                    try
                    {
                        indexIn.close();
                    }
                    catch (IOException ex)
                    {
                        log.warn("Failed to close " + FILE_INDEX + ": " + ex); //$NON-NLS-1$ //$NON-NLS-2$
                    }
                }
            }
        }
        else
        {
            IndexManager.instance().createIndex(this);
        }

        active = true;
    }

    /**
     * Parse one "word:offset:length" line of the index file and add it to the
     * in-memory index. Lines that can not be parsed are logged and skipped.
     */
    private void loadIndexLine(String line)
    {
        // Work backwards from the end: the two numbers can never contain a
        // colon, so the last two colons are always the field separators.
        int colon2 = line.lastIndexOf(':');
        int colon1 = colon2 > 0 ? line.lastIndexOf(':', colon2 - 1) : -1;
        if (colon1 < 0)
        {
            log.warn("Skipping malformed index line: " + line); //$NON-NLS-1$
            return;
        }

        try
        {
            String word = line.substring(0, colon1);
            long offset = Long.parseLong(line.substring(colon1 + 1, colon2));
            int length = Integer.parseInt(line.substring(colon2 + 1));

            datamap.put(word, new Section(offset, length));
        }
        catch (NumberFormatException ex)
        {
            log.error("NumberFormatException reading line: " + line, ex); //$NON-NLS-1$
        }
    }

    /**
     * Release the in-memory index and close the data file.
     * @param lock The activation lock (unused here)
     */
    public final void deactivate(Lock lock)
    {
        datamap.clear();
        closeDataFile();

        active = false;
    }

    /**
     * Close the data file if it is open. A failure to close is logged and
     * otherwise ignored because there is nothing more that can be done.
     */
    private void closeDataFile()
    {
        if (dataRaf != null)
        {
            try
            {
                dataRaf.close();
            }
            catch (IOException ex)
            {
                log.warn("Failed to close " + FILE_DATA + ": " + ex); //$NON-NLS-1$ //$NON-NLS-2$
            }

            dataRaf = null;
        }
    }

    /**
     * Helper method so we can quickly activate ourselves on access
     */
    private final void checkActive()
    {
        if (!active)
        {
            Activator.activate(this);
        }
    }

    /**
     * Are we active
     */
    private boolean active = false;

    /**
     * Are we in the middle of generating an index?
     * Set for the duration of generateSearchIndex() and read by isIndexed().
     * NOTE(review): volatile on the assumption that generation runs on a
     * different thread from the callers of isIndexed() - confirm.
     */
    private volatile boolean generating = false;

    /**
     * The name of the data file
     */
    private static final String FILE_DATA = "ref.data"; //$NON-NLS-1$

    /**
     * The name of the index file
     */
    private static final String FILE_INDEX = "ref.index"; //$NON-NLS-1$

    /**
     * The Bible we are indexing
     */
    protected Book book;

    /**
     * The directory to which to write the index
     */
    private URL url;

    /**
     * The passages random access file
     */
    private RandomAccessFile dataRaf;

    /**
     * The sorted map of (lower case) words to Sections of the passages file
     */
    private SortedMap datamap = new TreeMap();

    /**
     * The log stream
     */
    private static final Logger log = Logger.getLogger(SerIndex.class);

    /**
     * The percentages of the progress bar taken up by the different parts
     */
    private static final int PERCENT_READ = 60;
    private static final int PERCENT_WRITE = 39;
    // private static final int PERCENT_INDEX = 1;

    /**
     * A simple class to hold an offset and length into the passages random
     * access file
     */
    public static class Section
    {
        /**
         * @param offset Where in the passages file the data starts
         * @param length The number of bytes of data
         */
        protected Section(long offset, int length)
        {
            this.offset = offset;
            this.length = length;
        }

        protected long offset;
        protected int length;
    }
}
Index: Msg.java
===================================================================
RCS file: /cvs/jsword/jsword/java/jsword/org/crosswire/jsword/book/search/ser/Msg.java,v
retrieving revision 1.13
retrieving revision 1.14
diff -C2 -d -r1.13 -r1.14
*** Msg.java 27 Jul 2004 21:42:38 -0000 1.13
--- Msg.java 29 Sep 2004 22:21:23 -0000 1.14
***************
*** 29,39 ****
class Msg extends MsgBase
{
! static final Msg SEARCH_FAIL = new Msg("SerSearchEngine.SearchFail"); //$NON-NLS-1$
! static final Msg REPEATED_READ_ERROR = new Msg("SerSearchEngine.RepeatedReadError"); //$NON-NLS-1$
! static final Msg WRITE_ERROR = new Msg("SerSearchEngine.WriteError"); //$NON-NLS-1$
! static final Msg FINDING_WORDS = new Msg("SerSearchEngine.FindingWords"); //$NON-NLS-1$
! static final Msg WRITING_WORDS = new Msg("SerSearchEngine.WritingWords"); //$NON-NLS-1$
! static final Msg SAVING = new Msg("SerSearchEngine.Saving"); //$NON-NLS-1$
! static final Msg INTERRUPTED = new Msg("SerSearchEngine.Interrupted"); //$NON-NLS-1$
/**
--- 29,38 ----
class Msg extends MsgBase
{
! static final Msg REPEATED_READ_ERROR = new Msg("SerIndex.RepeatedReadError"); //$NON-NLS-1$
! static final Msg WRITE_ERROR = new Msg("SerIndex.WriteError"); //$NON-NLS-1$
! static final Msg FINDING_WORDS = new Msg("SerIndex.FindingWords"); //$NON-NLS-1$
! static final Msg WRITING_WORDS = new Msg("SerIndex.WritingWords"); //$NON-NLS-1$
! static final Msg SAVING = new Msg("SerIndex.Saving"); //$NON-NLS-1$
! static final Msg INTERRUPTED = new Msg("SerIndex.Interrupted"); //$NON-NLS-1$
/**
More information about the jsword-svn
mailing list