[jsword-svn] jsword/java/jsword/org/crosswire/jsword/book s
jswordcvs at crosswire.org
jswordcvs at crosswire.org
Sun May 1 18:29:34 MST 2005
Update of /cvs/jsword/jsword/java/jsword/org/crosswire/jsword/book
In directory www.crosswire.org:/tmp/cvs-serv26301/java/jsword/org/crosswire/jsword/book
Modified Files:
BookData.java OSISUtil.java
Log Message:
Fixed the bug where notes were being indexed by adding getVerseText as a replacement for getPlainText. The latter is still needed for non-bibles.
Also cleaned up checkstyle reports and added/corrected javadoc.
Index: OSISUtil.java
===================================================================
RCS file: /cvs/jsword/jsword/java/jsword/org/crosswire/jsword/book/OSISUtil.java,v
retrieving revision 1.13
retrieving revision 1.14
diff -C2 -d -r1.13 -r1.14
*** OSISUtil.java 5 Apr 2005 00:33:40 -0000 1.13
--- OSISUtil.java 2 May 2005 01:29:32 -0000 1.14
***************
*** 2,8 ****
--- 2,11 ----
import java.util.ArrayList;
+ import java.util.Arrays;
import java.util.Collection;
+ import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
+ import java.util.Set;
import org.crosswire.common.util.Logger;
***************
*** 131,136 ****
public static final String ATTRIBUTE_VERSE_OSISID = "osisID"; //$NON-NLS-1$
public static final String ATTRIBUTE_DIV_OSISID = "osisID"; //$NON-NLS-1$
public static final String ATTRIBUTE_W_LEMMA = "lemma"; //$NON-NLS-1$
! public static final String ATTRIBUTE_HI_TYPE = "rend"; //$NON-NLS-1$
public static final String ATTRIBUTE_SEG_TYPE = "type"; //$NON-NLS-1$
public static final String ATTRIBUTE_REFERENCE_OSISREF = "osisRef"; //$NON-NLS-1$
--- 134,141 ----
public static final String ATTRIBUTE_VERSE_OSISID = "osisID"; //$NON-NLS-1$
public static final String ATTRIBUTE_DIV_OSISID = "osisID"; //$NON-NLS-1$
+ public static final String OSIS_ATTR_SID = "sID"; //$NON-NLS-1$
+ public static final String OSIS_ATTR_EID = "eID"; //$NON-NLS-1$
public static final String ATTRIBUTE_W_LEMMA = "lemma"; //$NON-NLS-1$
! public static final String ATTRIBUTE_HI_TYPE = "type"; //$NON-NLS-1$
public static final String ATTRIBUTE_SEG_TYPE = "type"; //$NON-NLS-1$
public static final String ATTRIBUTE_REFERENCE_OSISREF = "osisRef"; //$NON-NLS-1$
***************
*** 146,149 ****
--- 151,159 ----
private static final String OSISID_PREFIX_BIBLE = "Bible."; //$NON-NLS-1$
+ private static final Set EXTRA_BIBLICAL_ELEMENTS = new HashSet(Arrays.asList(new String[]
+ {
+ OSIS_ELEMENT_NOTE,
+ }));
+
/**
* The log stream
***************
*** 379,387 ****
/**
* A simplified plain text version of the data in this Element with all
* the markup stripped out.
* @return The Bible text without markup
*/
! public static String getPlainText(Element ele)
{
StringBuffer buffer = new StringBuffer();
--- 389,536 ----
/**
+ * Get the verse text from an osis document consisting of a single verse.
+ * The document is assumed to be valid OSIS2.0 XML. While xml valid
+ * is rigidly defined as meaning that an xml parser can validate the document,
+ * it does not mean that the document is valid OSIS. This is a semantic
+ * problem that is not validated. This method assumes that the
+ * root element is also semantically valid.
+ *
+ * <p>This means that the top level element's tagname is osis.
+ * This can contain either a osisText or an osisCorpus.
+ * If it is an osisCorpus, then it contains an osisText.
+ * However, as a simplification, since JSword constructs
+ * the whole doc for the fragment, osisCorpus can be ignored.
+ * <p>The osisText element contains a div element that is either
+ * a container or a milestone. Again, JSword is providing the
+ * div element and it will be provided as a container. It is this div
+ * that "contains" the verse element.</p>
+ * <p>The verse element may either be
+ * a container or a milestone. Sword OSIS modules differ in whether
+ * they provide the verse element. Most do not. The few that do are
+ * using the container model, but it has been proposed that milestones
+ * are the best practice.</p>
+ *
+ * <p>The verse may contain elements that are not a part of the
+ * original text. These are things such as notes.</p>
+ *
+ * <p>Milestones require special handling. Beginning milestones
+ * elements have an sID attribute, while ending milestones have
+ * an eID with the same value as the opening. So everything between
+ * the start and the corresponding end is the content of the element.
+ * Also, for a given element, say div, they have to be properly nested
+ * as if they were container elements.</p>
+ *
+ * @param root the whole osis document.
+ * @return The Bible text without markup
+ */
+ public static String getVerseText(Element root)
+ {
+ StringBuffer buffer = new StringBuffer();
+
+ Element osisText = root.getChild(OSISUtil.OSIS_ELEMENT_OSISTEXT);
+ Element div = osisText.getChild(OSISUtil.OSIS_ELEMENT_DIV);
+
+ Iterator dit = div.getContent().iterator();
+ String sid = null;
+ Object data = null;
+ Element ele = null;
+ while (dit.hasNext())
+ {
+ data = dit.next();
+ if (data instanceof Element)
+ {
+ ele = (Element) data;
+ if (ele.getName().equals(OSISUtil.OSIS_ELEMENT_VERSE))
+ {
+ sid = ele.getAttributeValue(OSISUtil.OSIS_ATTR_SID);
+ if (sid != null)
+ {
+ getVerseContent(dit, buffer);
+ }
+ else
+ {
+ getVerseContent(ele.getContent().iterator(), buffer);
+ }
+ }
+ }
+ else if (data instanceof Text)
+ {
+ buffer.append(((Text) data).getText());
+ }
+ }
+
+ return buffer.toString().trim();
+ }
+
+ /**
* A simplified plain text version of the data in this Element with all
* the markup stripped out.
* @return The Bible text without markup
*/
! public static String getPlainText(Element root)
! {
! StringBuffer buffer = new StringBuffer();
!
! Element osisText = root.getChild(OSISUtil.OSIS_ELEMENT_OSISTEXT);
! List divs = osisText.getChildren(OSISUtil.OSIS_ELEMENT_DIV);
!
! for (Iterator oit = divs.iterator(); oit.hasNext(); )
! {
! Element div = (Element) oit.next();
!
! Iterator dit = div.getContent().iterator();
! while (dit.hasNext())
! {
! Object data = dit.next();
! if (data instanceof Element)
! {
! Element ele = (Element) data;
! if (ele.getName().equals(OSISUtil.OSIS_ELEMENT_VERSE))
! {
! String txt = OSISUtil.getTextContent((Element) data);
! buffer.append(txt);
! }
! }
! }
! }
!
! return buffer.toString().trim();
! }
!
! private static void getVerseContent(Iterator iter, StringBuffer buffer)
! {
! Object data = null;
! Element ele = null;
! String eleName = null;
! while (iter.hasNext())
! {
! data = iter.next();
! if (data instanceof Element)
! {
! ele = (Element) data;
! // If the verse is done then quit.
! // This should be a verse eID=, that matches sID, but it does not matter.
! // Since this gets the text of one verse, any verse element that follows
! // is the end of the previous verse.
! eleName = ele.getName();
! if (eleName.equals(OSISUtil.OSIS_ELEMENT_VERSE))
! {
! break;
! }
!
! // Ignore extra-biblical text
! if (!EXTRA_BIBLICAL_ELEMENTS.contains(eleName))
! {
! OSISUtil.getVerseContent(ele.getContent().iterator(), buffer);
! }
! }
! else if (data instanceof Text)
! {
! buffer.append(((Text) data).getText());
! }
! }
! }
!
! private static String getTextContent(Element ele)
{
StringBuffer buffer = new StringBuffer();
Index: BookData.java
===================================================================
RCS file: /cvs/jsword/jsword/java/jsword/org/crosswire/jsword/book/BookData.java,v
retrieving revision 1.10
retrieving revision 1.11
diff -C2 -d -r1.10 -r1.11
*** BookData.java 6 Mar 2005 20:21:47 -0000 1.10
--- BookData.java 2 May 2005 01:29:32 -0000 1.11
***************
*** 1,7 ****
package org.crosswire.jsword.book;
- import java.util.Iterator;
- import java.util.List;
-
import org.crosswire.common.xml.JDOMSAXEventProvider;
import org.crosswire.common.xml.SAXEventProvider;
--- 1,4 ----
***************
*** 69,104 ****
/**
! * A simplified plain text version of the data in this document with all
! * the markup stripped out.
* @return The Bible text without markup
*/
! public String getPlainText()
{
! StringBuffer buffer = new StringBuffer();
!
! Element osisText = getOsis().getChild(OSISUtil.OSIS_ELEMENT_OSISTEXT);
! List divs = osisText.getChildren(OSISUtil.OSIS_ELEMENT_DIV);
!
! for (Iterator oit = divs.iterator(); oit.hasNext(); )
! {
! Element div = (Element) oit.next();
!
! Iterator dit = div.getContent().iterator();
! while (dit.hasNext())
! {
! Object data = dit.next();
! if (data instanceof Element)
! {
! Element ele = (Element) data;
! if (ele.getName().equals(OSISUtil.OSIS_ELEMENT_VERSE))
! {
! String txt = OSISUtil.getPlainText((Element) data);
! buffer.append(txt);
! }
! }
! }
! }
! return buffer.toString().trim();
}
--- 66,86 ----
/**
! * Return the verse's text without any extra-biblical material.
* @return The Bible text without markup
*/
! public String getVerseText()
{
! return OSISUtil.getVerseText(getOsis());
! }
! /**
! * A simplified plain text version of the data in this document with all
! * the markup stripped out. This is not as simple as it seems.
! * TODO(DMS): push this into OSISUtil
! * @return The text without markup
! */
! public String getPlainText()
! {
! return OSISUtil.getPlainText(getOsis());
}
More information about the jsword-svn
mailing list