[jsword-devel] GBF tag support for Hebrew and Greek...
Joe Walker
jsword-devel@crosswire.org
Sun, 01 Jun 2003 07:36:13 +0100
Hi,
I've checked it into CVS.
I took the liberty of pressing CTRL+SHIFT+F to reformat and I renamed
ITag to Tag, just to keep everything looking similar. Hope that's not a
problem.
It seems to work fine from what I can see. I start a new job tomorrow so
I hope to have more time on the train to get some more coding done.
Cheers,
Joe.
Joe Walker wrote:
> Hi Jacky,
>
> Thanks for the patch. I'm just back from holiday and I hope to apply it
> very soon.
>
> Joe.
>
>
> On Sat, 2003-05-24 at 06:52, Jacky Cheung wrote:
>
>>Hi,
>>
>>Please find the patch for GBF tag support for Heb. and Greek attached.
>>There is a huge change in the GBFFilter class. Thanks.
>>
>>Best regards,
>>Jacky
>>
>>P.S.
>>Now, the GUI can show hyperlinks for tagged words but cannot link to the
>>lexicon. Furthermore, the translated output of the text does not look good
>>enough. I know that somebody has written an XSL file to convert OSIS
>>XML to HTML.
>>
>>
>>______________________________________________________________________
>>
>>Index: jsword/java/jsword/org/crosswire/jsword/book/data/GBFFilter.java
>>===================================================================
>>RCS file: /cvs/jsword/jsword/java/jsword/org/crosswire/jsword/book/data/GBFFilter.java,v
>>retrieving revision 1.7
>>diff -u -r1.7 GBFFilter.java
>>--- jsword/java/jsword/org/crosswire/jsword/book/data/GBFFilter.java 17 May 2003 15:00:14 -0000 1.7
>>+++ jsword/java/jsword/org/crosswire/jsword/book/data/GBFFilter.java 24 May 2003 05:45:31 -0000
>>@@ -3,7 +3,7 @@
>>
>> import java.util.ArrayList;
>> import java.util.Iterator;
>>-import java.util.LinkedList;
>>+import java.util.Stack;
>> import java.util.List;
>>
>> import javax.xml.bind.Element;
>>@@ -13,6 +13,7 @@
>> import org.crosswire.common.util.LogicError;
>> import org.crosswire.jsword.osis.Note;
>> import org.crosswire.jsword.osis.Seg;
>>+import org.crosswire.jsword.osis.W;
>>
>> /**
>> * Filter to convert GBF data to OSIS format.
>>@@ -47,8 +48,17 @@
>> {
>> try
>> {
>>- List tokens = tokenize(plain);
>>- parseTokens(ele, tokens);
>>+ Stack stack = new Stack();
>>+ stack.push(ele);
>>+
>>+ TagGenerator generator = new TagGenerator(plain);
>>+ ITag tag = generator.getNextTag();
>>+ while (tag != null) {
>>+ tag.updateOsisStack(stack);
>>+ tag = generator.getNextTag();
>>+ }
>>+
>>+ stack.pop();
>> }
>> catch (JAXBException ex)
>> {
>>@@ -56,241 +66,367 @@
>> }
>> }
>>
>>- /**
>>- * Go through a list of tokens and add them to the listener
>>- */
>>- public void parseTokens(Element ele, List tokens) throws JAXBException, DataException
>>- {
>>- LinkedList stack = new LinkedList();
>>- stack.addFirst(ele);
>>-
>>- // For notes
>>- int marker = 1;
>>-
>>- // go through the token working out what to do with them all
>>- for (Iterator it = tokens.iterator(); it.hasNext();)
>>- {
>>- Object token = it.next();
>>- if (token instanceof String)
>>+ private static class TagGenerator {
>>+ public TagGenerator(String plain) {
>>+ int lastIndex = plain.length() - 1;
>>+ if (lastIndex >= 0 && plain.charAt(lastIndex) == ((char) 13))
>> {
>>- Element current = (Element) stack.getFirst();
>>- List list = JAXBUtil.getList(current);
>>- list.add((String) token);
>>+ plain = plain.substring(0, lastIndex);
>> }
>>- else if (token instanceof Tag)
>>- {
>>- Tag tag = (Tag) token;
>>+ remains = plain;
>>+ }
>>+
>>+ /**
>>+ * Get Next tags in the string
>>+ */
>>+ public ITag getNextTag() {
>>+ if (retval.isEmpty()) {
>>+ if (remains == null)
>>+ return null;
>>+ parseNextTag();
>>+ }
>>+ return (ITag) retval.remove(0);
>>+ }
>>
>>- // skip over the rest of the footnote
>>- if (tag.equals(FOOTNOTE_START))
>>- {
>>- List footnote = getTokensUntil(it, FOOTNOTE_STOP);
>>- String content = filterText(footnote);
>>-
>>- // This could be a marker or it could be the body of the note
>>- // We tell which by string length. <= 1 is a marker which we
>>- // ignore for simplicity
>>- if (content.length() > 1)
>>+ private void parseNextTag() {
>>+ if (remains == null)
>>+ {
>>+ return;
>>+ }
>>+
>>+ int ltpos = remains.indexOf('<');
>>+ int gtpos = remains.indexOf('>');
>>+
>>+ if (ltpos == -1 && gtpos == -1) {
>>+ // no more tags to decode
>>+ retval.add(new TextTag(remains));
>>+ remains = null;
>>+ return;
>>+ }
>>+
>>+ // check that we don't have unmatched tags
>>+ if (ltpos == -1 || gtpos == -1) {
>>+ log.warn("ignoring unmatched '<' or '>' in gbf: "+remains);
>>+ retval.add(new TextTag(remains));
>>+ remains = null;
>>+ return;
>>+ }
>>+
>>+ // check that the tags are in a sensible order
>>+ if (ltpos > gtpos) {
>>+ log.warn("ignoring unmatched '<' or '>' in gbf: "+remains);
>>+ retval.add(new TextTag(remains));
>>+ remains = null;
>>+ return;
>>+ }
>>+
>>+ // generate tags
>>+ String start = remains.substring(0, ltpos);
>>+ int strLen = start.length();
>>+ if (strLen > 0) {
>>+ int beginIndex = 0;
>>+ boolean inSepStr = isSeperator(start.charAt(0));
>>+ // split words from seperators...
>>+ // e.g., "a b c? e g." -> "a b c", "? ", "e g."
>>+ // "a b c<tag> e g." -> "a b c", tag, " ", "e g."
>>+ for(int i=1; inSepStr && i<strLen; i++) {
>>+ char currentChar = start.charAt(i);
>>+ if (!isSeperator(currentChar))
>> {
>>- Note note = JAXBUtil.factory().createNote();
>>- note.setN(""+(marker++));
>>- note.getContent().add(content);
>>- Element current = (Element) stack.getFirst();
>>-
>>- List list = JAXBUtil.getList(current);
>>- list.add(note);
>>+ retval.add(new TextTag(start.substring(beginIndex, i)));
>>+ beginIndex = i;
>>+ inSepStr = false;
>> }
>> }
>>- else if (tag.equals(PARAGRAPH))
>>- {
>>- // ignore paragraph markers
>>- }
>>- else if (tag.equals(ITALICS_START))
>>- {
>>- Seg seg = JAXBUtil.factory().createSeg();
>>- Element current = (Element) stack.getFirst();
>>-
>>- List list = JAXBUtil.getList(current);
>>- list.add(seg);
>>-
>>- stack.addFirst(seg);
>>- }
>>- else if (tag.equals(ITALICS_STOP))
>>- {
>>- Object top = stack.removeFirst();
>>-
>>- // Check that we are properly tree structured
>>- if (!(top instanceof Seg))
>>- {
>>- throw new LogicError();
>>- }
>>- }
>>- else
>>- {
>>- // unknown tags
>>- log.warn("Ignoring tag of "+tag.getTag());
>>+ if (beginIndex < strLen) {
>>+ retval.add(new TextTag(start.substring(beginIndex)));
>> }
>> }
>>- else
>>- {
>>- throw new DataException(Msg.GBF_BADTOKEN, new Object[] { token });
>>+
>>+ String tag = remains.substring(ltpos+1, gtpos);
>>+ if (tag.length() > 0) {
>>+ retval.add(createTag(tag));
>> }
>>+
>>+ remains = remains.substring(gtpos+1);
>> }
>>-
>>- stack.removeFirst();
>>+
>>+ private boolean isSeperator(char c) {
>>+ final String seperators = " ,:;.?!";
>>+ return seperators.indexOf(c) >= 0;
>>+ }
>>+
>>+ private ITag createTag(String tag) {
>>+ if (tag.equals("RB")) {
>>+ return new TextWithEmbeddedFootnote();
>>+ }
>>+ if (tag.equals("RF")) {
>>+ return new FootnoteStartTag();
>>+ }
>>+ if (tag.equals("Rf")) {
>>+ return new FootnoteEndTag();
>>+ }
>>+ if (tag.equals("FI")) {
>>+ return new ItalicStartTag();
>>+ }
>>+ if (tag.equals("Fi")) {
>>+ return new ItalicEndTag();
>>+ }
>>+ if (tag.equals("CM")) {
>>+ return new ParagraphTag();
>>+ }
>>+ if (tag.startsWith("WT")) {
>>+ return new StrongsMorphRefTag(tag);
>>+ }
>>+ if (tag.startsWith("WH") || tag.startsWith("WG")) {
>>+ return new StrongsWordRefTag(tag);
>>+ }
>>+ return new UnknownTag(tag);
>>+ }
>>+ private String remains;
>>+ private List retval = new ArrayList();
>> }
>>
>> /**
>>- * Strip all the Tags from a List and return just the text
>>+ * GBF Tag interface
>>+ *
>>+ * Now the number of supported tags are small.
>>+ * If the number become large, refactor...
>>+ * 1. refactor ITag to public abstract class GBFTag
>>+ * 2. move createTag() to GBFTag
>>+ * 3. move tag classes to GBFTag.java so that adding tags updates only GBFTag.java
>>+ *
>>+ * On adding new tags, implements new tag classes and update createTag()
>> */
>>- private String filterText(List list)
>>+ private static interface ITag
>> {
>>- StringBuffer buffer = new StringBuffer();
>>-
>>- // go through the token working out what to do with them all
>>- for (Iterator it = list.iterator(); it.hasNext();)
>>+ /**
>>+ * Sub-classes should implement this method to generate OSIS Object
>>+ */
>>+ public void updateOsisStack(Stack osisStack) throws JAXBException;
>>+ }
>>+
>>+ /**
>>+ * Tag syntax: <RB>Words<RF>note<Rf>
>>+ */
>>+ private static class TextWithEmbeddedFootnote implements ITag
>>+ {
>>+ public void updateOsisStack(Stack stack) throws JAXBException
>> {
>>- Object token = it.next();
>>- if (token instanceof String)
>>- {
>>- buffer.append((String) token);
>>- }
>>+ Note note = JAXBUtil.factory().createNote();
>>+ note.setNoteType("x-StudyNote");
>>+ Element current = (Element) stack.peek();
>>+
>>+ List list = JAXBUtil.getList(current);
>>+ list.add(note);
>>+ stack.push(note);
>> }
>>-
>>- return buffer.toString();
>> }
>>-
>>+
>> /**
>>- * Get a list for the footnote
>>+ * Tag syntax: <RF>note<Rf>
>> */
>>- private List getTokensUntil(Iterator it, Tag end) throws JAXBException
>>+ private static class FootnoteStartTag implements ITag
>> {
>>- // take tokens off the list until end of list or FOOTNOTE_END
>>- List ignored = new ArrayList();
>>-
>>- while (true)
>>+ public void updateOsisStack(Stack stack) throws JAXBException
>> {
>>- if (!it.hasNext())
>>- {
>>- break;
>>- }
>>-
>>- Object token = it.next();
>>- if (token instanceof String)
>>+ Element current = (Element) stack.peek();
>>+ if (!(current instanceof Note))
>> {
>>- ignored.add(token);
>>- }
>>- else if (token instanceof Tag)
>>- {
>>- Tag tag = (Tag) token;
>>- if (tag.equals(end))
>>- {
>>- break;
>>- }
>>- else
>>- {
>>- ignored.add(token);
>>- }
>>- }
>>- else
>>- {
>>- throw new JAXBException("Failed to parse: "+token);
>>+ Note note = JAXBUtil.factory().createNote();
>>+ note.setNoteType("x-StudyNote");
>>+
>>+ List list = JAXBUtil.getList(current);
>>+ list.add(note);
>>+ stack.push(note);
>> }
>> }
>>-
>>- return ignored;
>> }
>>
>> /**
>>- * Create a list of strings and tags
>>- * @param plain
>>- * @return List
>>+ * Tag syntax: <RF>note<Rf>
>> */
>>- private List tokenize(String plain)
>>+ private static class FootnoteEndTag implements ITag
>> {
>>- List retval = new ArrayList();
>>- String remains = plain;
>>-
>>- while (true)
>>+ public void updateOsisStack(Stack stack) throws JAXBException
>> {
>>- int ltpos = remains.indexOf('<');
>>- int gtpos = remains.indexOf('>');
>>+ Note note = (Note) stack.pop();
>>+ List list = JAXBUtil.getList(note);
>>
>>- if (ltpos == -1 && gtpos == -1)
>>- {
>>- // no more tags to decode
>>- retval.add(remains);
>>- break;
>>+ if (list.size() < 1) {
>>+ JAXBUtil.getList((Element)stack.peek()).remove(note);
>> }
>>+ }
>>+ }
>>
>>- // check that we don't have unmatched tags
>>- if (ltpos == -1 || gtpos == -1)
>>- {
>>- log.warn("ignoring unmatched '<' or '>' in gbf: "+remains);
>>- retval.add(remains);
>>- break;
>>- }
>>-
>>- // check that the tags are in a sensible order
>>- if (ltpos > gtpos)
>>- {
>>- log.warn("ignoring unmatched '<' or '>' in gbf: "+remains);
>>- retval.add(remains);
>>- break;
>>- }
>>+ /**
>>+ * Tag syntax: <FI>note<Fi>
>>+ */
>>+ private static class ItalicStartTag implements ITag
>>+ {
>>+ public void updateOsisStack(Stack stack) throws JAXBException
>>+ {
>>+ // remarked, for the XSL does not present it correctly
>>+ // The XSL should translate it to <I>...</I> but now it translated
>>+ // to <div>...</div>
>>+ /*
>>+ Seg seg = JAXBUtil.factory().createSeg();
>>+ Element current = (Element) stack.peek();
>>
>>- String start = remains.substring(0, ltpos);
>>- retval.add(start);
>>+ List list = JAXBUtil.getList(current);
>>+ list.add(seg);
>>
>>- String tag = remains.substring(ltpos+1, gtpos);
>>- retval.add(new Tag(tag));
>>-
>>- remains = remains.substring(gtpos+1);
>>+ stack.push(seg);
>>+ */
>> }
>>-
>>- return retval;
>> }
>>
>> /**
>>- * A GBF Tag
>>+ * Tag syntax: <FI>note<Fi>
>> */
>>- private static class Tag
>>+ private static class ItalicEndTag implements ITag
>> {
>>- public Tag(String tag)
>>+ public void updateOsisStack(Stack stack)
>> {
>>- this.tag = tag;
>>+ // remarked, for the XSL does not translate it correctly
>>+ // stack.pop();
>> }
>>+ }
>>
>>- public String getTag()
>>+ /**
>>+ * Tag syntax: Words<CM>
>>+ */
>>+ private static class ParagraphTag implements ITag
>>+ {
>>+ public void updateOsisStack(Stack stack) {
>>+ JAXBUtil.getList((Element)stack.peek()).add(Character.toString('¶'));
>>+ }
>>+ }
>>+
>>+ /**
>>+ * Tag syntax: word<WHxxxx> or word<WGxxxx>
>>+ */
>>+ private static class StrongsWordRefTag implements ITag
>>+ {
>>+ public StrongsWordRefTag(String tagName)
>> {
>>- return tag;
>>+ tag = tagName.trim();
>> }
>>-
>>- public boolean equals(Object obj)
>>+ public void updateOsisStack(Stack stack) throws JAXBException
>> {
>>- if (obj == null)
>>- return false;
>>-
>>- if (obj.getClass() != this.getClass())
>>- return false;
>>-
>>- Tag that = (Tag) obj;
>>- return this.tag.equals(that.tag);
>>+ Element ele = (Element) stack.peek();
>>+ List list = JAXBUtil.getList(ele);
>>+ if (list.isEmpty())
>>+ {
>>+ log.error("Source has problem for tag <" + tag + ">.");
>>+ return;
>>+ }
>>+ int lastIndex = list.size() - 1;
>>+ Object prevObj = list.get(lastIndex);
>>+ W word = null;
>>+ if (prevObj instanceof String)
>>+ {
>>+ word = JAXBUtil.factory().createW();
>>+ word.getContent().add(prevObj);
>>+ list.set(lastIndex, word);
>>+ }
>>+ else if (prevObj instanceof W) {
>>+ word = (W) prevObj;
>>+ }
>>+ else {
>>+ log.error("Source has problem for tag <" + tag + ">.");
>>+ return;
>>+ }
>>+ String existingLemma = word.getLemma();
>>+ StringBuffer newLemma = new StringBuffer();
>>+ if (existingLemma != null && existingLemma.length() > 0) {
>>+ newLemma.append(existingLemma).append('|');
>>+ }
>>+ newLemma.append("x-Strongs:").append(tag.substring(2));
>>+ word.setLemma(newLemma.toString());
>> }
>>-
>>- public int hashCode()
>>+ private String tag;
>>+ }
>>+
>>+ /**
>>+ * Tag syntax: word<WTxxxx>
>>+ */
>>+ private static class StrongsMorphRefTag implements ITag
>>+ {
>>+ public StrongsMorphRefTag(String tagName)
>> {
>>- return tag.hashCode();
>>+ tag = tagName.trim();
>>+ }
>>+ public void updateOsisStack(Stack stack) throws JAXBException
>>+ {
>>+ Element ele = (Element) stack.peek();
>>+ List list = JAXBUtil.getList(ele);
>>+ if (list.isEmpty())
>>+ {
>>+ log.error("Source has problem for tag <" + tag + ">.");
>>+ return;
>>+ }
>>+ int lastIndex = list.size() - 1;
>>+ Object prevObj = list.get(lastIndex);
>>+ W word = null;
>>+ if (prevObj instanceof String)
>>+ {
>>+ word = JAXBUtil.factory().createW();
>>+ word.getContent().add(prevObj);
>>+ list.set(lastIndex, word);
>>+ }
>>+ else if (prevObj instanceof W) {
>>+ word = (W) prevObj;
>>+ }
>>+ else {
>>+ log.error("Source has problem for tag <" + tag + ">.");
>>+ return;
>>+ }
>>+ String existingMorph = word.getMorph();
>>+ StringBuffer newMorph = new StringBuffer();
>>+ if (existingMorph != null && existingMorph.length() > 0) {
>>+ newMorph.append(existingMorph).append('|');
>>+ }
>>+ newMorph.append("x-StrongsMorph:T").append(tag.substring(2));
>>+ word.setMorph(newMorph.toString());
>> }
>>-
>> private String tag;
>> }
>>
>>- private static final Tag PARAGRAPH = new Tag("CM");
>>- private static final Tag FOOTNOTE_START = new Tag("RF");
>>- private static final Tag FOOTNOTE_STOP = new Tag("Rf");
>>- private static final Tag ITALICS_START = new Tag("FI");
>>- private static final Tag ITALICS_STOP = new Tag("Fi");
>>+ /**
>>+ * Represent a trunc of bible text without any tags
>>+ */
>>+ private static class TextTag implements ITag
>>+ {
>>+ public TextTag(String textData)
>>+ {
>>+ text = textData;
>>+ }
>>+ public void updateOsisStack(Stack stack) throws JAXBException
>>+ {
>>+ Element ele = (Element) stack.peek();
>>+ List list = JAXBUtil.getList(ele);
>>+ list.add(text);
>>+ }
>>+ private String text;
>>+ }
>>+
>>+ /**
>>+ * Unknown Tag. Either not supported tag or tag not defined in GBF specification
>>+ */
>>+ private static class UnknownTag implements ITag
>>+ {
>>+ public UnknownTag(String tagName)
>>+ {
>>+ tag = tagName;
>>+ }
>>+ public void updateOsisStack(Stack stack)
>>+ {
>>+ // unknown tags
>>+ log.warn("Ignoring tag of "+ tag);
>>+ }
>>+ private String tag;
>>+ }
>>
>> /**
>> * The log stream
>>Index: jsword/java/jsword/org/crosswire/jsword/book/data/JAXBUtil.java
>>===================================================================
>>RCS file: /cvs/jsword/jsword/java/jsword/org/crosswire/jsword/book/data/JAXBUtil.java,v
>>retrieving revision 1.3
>>diff -u -r1.3 JAXBUtil.java
>>--- jsword/java/jsword/org/crosswire/jsword/book/data/JAXBUtil.java 17 May 2003 16:11:52 -0000 1.3
>>+++ jsword/java/jsword/org/crosswire/jsword/book/data/JAXBUtil.java 24 May 2003 05:45:32 -0000
>>@@ -12,9 +12,11 @@
>> import org.crosswire.common.util.Logger;
>> import org.crosswire.common.util.LogicError;
>> import org.crosswire.jsword.osis.Div;
>>+import org.crosswire.jsword.osis.Note;
>> import org.crosswire.jsword.osis.ObjectFactory;
>> import org.crosswire.jsword.osis.Seg;
>> import org.crosswire.jsword.osis.Verse;
>>+import org.crosswire.jsword.osis.W;
>> import org.crosswire.jsword.util.Project;
>>
>> /**
>>@@ -148,6 +150,14 @@
>> else if (current instanceof Div)
>> {
>> return ((Div) current).getContent();
>>+ }
>>+ else if (current instanceof Note)
>>+ {
>>+ return ((Note) current).getContent();
>>+ }
>>+ else if (current instanceof W)
>>+ {
>>+ return ((W) current).getContent();
>> }
>>
>> log.error("unknown element: "+current.getClass().getName());
>>
>
>
> _______________________________________________
> jsword-devel mailing list
> jsword-devel@crosswire.org
> http://www.crosswire.org/mailman/listinfo/jsword-devel