[jsword-devel] GBF tag support for Hebrew and Greek...
Joe Walker
jsword-devel@crosswire.org
28 May 2003 19:09:59 +0100
Hi Jacky,
Thanks for the patch. I'm just back from holiday and I hope to apply it
very soon.
Joe.
On Sat, 2003-05-24 at 06:52, Jacky Cheung wrote:
> Hi,
>
> Please find the patch for GBF tag support for Heb. and Greek attached.
> There is a huge change in the GBFFilter class. Thanks.
>
> Best regards,
> Jacky
>
> P.S.
> Now, the GUI can show hyperlinks for tagged words but cannot link to the
> lexicon. Furthermore, the translated output of the text does not look good
> enough. I know that somebody has written an XSL file to convert OSIS
> XML to HTML.
>
>
> ______________________________________________________________________
>
> Index: jsword/java/jsword/org/crosswire/jsword/book/data/GBFFilter.java
> ===================================================================
> RCS file: /cvs/jsword/jsword/java/jsword/org/crosswire/jsword/book/data/GBFFilter.java,v
> retrieving revision 1.7
> diff -u -r1.7 GBFFilter.java
> --- jsword/java/jsword/org/crosswire/jsword/book/data/GBFFilter.java 17 May 2003 15:00:14 -0000 1.7
> +++ jsword/java/jsword/org/crosswire/jsword/book/data/GBFFilter.java 24 May 2003 05:45:31 -0000
> @@ -3,7 +3,7 @@
>
> import java.util.ArrayList;
> import java.util.Iterator;
> -import java.util.LinkedList;
> +import java.util.Stack;
> import java.util.List;
>
> import javax.xml.bind.Element;
> @@ -13,6 +13,7 @@
> import org.crosswire.common.util.LogicError;
> import org.crosswire.jsword.osis.Note;
> import org.crosswire.jsword.osis.Seg;
> +import org.crosswire.jsword.osis.W;
>
> /**
> * Filter to convert GBF data to OSIS format.
> @@ -47,8 +48,17 @@
> {
> try
> {
> - List tokens = tokenize(plain);
> - parseTokens(ele, tokens);
> + Stack stack = new Stack();
> + stack.push(ele);
> +
> + TagGenerator generator = new TagGenerator(plain);
> + ITag tag = generator.getNextTag();
> + while (tag != null) {
> + tag.updateOsisStack(stack);
> + tag = generator.getNextTag();
> + }
> +
> + stack.pop();
> }
> catch (JAXBException ex)
> {
> @@ -56,241 +66,367 @@
> }
> }
>
> - /**
> - * Go through a list of tokens and add them to the listener
> - */
> - public void parseTokens(Element ele, List tokens) throws JAXBException, DataException
> - {
> - LinkedList stack = new LinkedList();
> - stack.addFirst(ele);
> -
> - // For notes
> - int marker = 1;
> -
> - // go through the token working out what to do with them all
> - for (Iterator it = tokens.iterator(); it.hasNext();)
> - {
> - Object token = it.next();
> - if (token instanceof String)
> + private static class TagGenerator {
> + public TagGenerator(String plain) {
> + int lastIndex = plain.length() - 1;
> + if (lastIndex >= 0 && plain.charAt(lastIndex) == ((char) 13))
> {
> - Element current = (Element) stack.getFirst();
> - List list = JAXBUtil.getList(current);
> - list.add((String) token);
> + plain = plain.substring(0, lastIndex);
> }
> - else if (token instanceof Tag)
> - {
> - Tag tag = (Tag) token;
> + remains = plain;
> + }
> +
> + /**
> + * Get Next tags in the string
> + */
> + public ITag getNextTag() {
> + if (retval.isEmpty()) {
> + if (remains == null)
> + return null;
> + parseNextTag();
> + }
> + return (ITag) retval.remove(0);
> + }
>
> - // skip over the rest of the footnote
> - if (tag.equals(FOOTNOTE_START))
> - {
> - List footnote = getTokensUntil(it, FOOTNOTE_STOP);
> - String content = filterText(footnote);
> -
> - // This could be a marker or it could be the body of the note
> - // We tell which by string length. <= 1 is a marker which we
> - // ignore for simplicity
> - if (content.length() > 1)
> + private void parseNextTag() {
> + if (remains == null)
> + {
> + return;
> + }
> +
> + int ltpos = remains.indexOf('<');
> + int gtpos = remains.indexOf('>');
> +
> + if (ltpos == -1 && gtpos == -1) {
> + // no more tags to decode
> + retval.add(new TextTag(remains));
> + remains = null;
> + return;
> + }
> +
> + // check that we don't have unmatched tags
> + if (ltpos == -1 || gtpos == -1) {
> + log.warn("ignoring unmatched '<' or '>' in gbf: "+remains);
> + retval.add(new TextTag(remains));
> + remains = null;
> + return;
> + }
> +
> + // check that the tags are in a sensible order
> + if (ltpos > gtpos) {
> + log.warn("ignoring unmatched '<' or '>' in gbf: "+remains);
> + retval.add(new TextTag(remains));
> + remains = null;
> + return;
> + }
> +
> + // generate tags
> + String start = remains.substring(0, ltpos);
> + int strLen = start.length();
> + if (strLen > 0) {
> + int beginIndex = 0;
> + boolean inSepStr = isSeperator(start.charAt(0));
> + // split words from seperators...
> + // e.g., "a b c? e g." -> "a b c", "? ", "e g."
> + // "a b c<tag> e g." -> "a b c", tag, " ", "e g."
> + for(int i=1; inSepStr && i<strLen; i++) {
> + char currentChar = start.charAt(i);
> + if (!isSeperator(currentChar))
> {
> - Note note = JAXBUtil.factory().createNote();
> - note.setN(""+(marker++));
> - note.getContent().add(content);
> - Element current = (Element) stack.getFirst();
> -
> - List list = JAXBUtil.getList(current);
> - list.add(note);
> + retval.add(new TextTag(start.substring(beginIndex, i)));
> + beginIndex = i;
> + inSepStr = false;
> }
> }
> - else if (tag.equals(PARAGRAPH))
> - {
> - // ignore paragraph markers
> - }
> - else if (tag.equals(ITALICS_START))
> - {
> - Seg seg = JAXBUtil.factory().createSeg();
> - Element current = (Element) stack.getFirst();
> -
> - List list = JAXBUtil.getList(current);
> - list.add(seg);
> -
> - stack.addFirst(seg);
> - }
> - else if (tag.equals(ITALICS_STOP))
> - {
> - Object top = stack.removeFirst();
> -
> - // Check that we are properly tree structured
> - if (!(top instanceof Seg))
> - {
> - throw new LogicError();
> - }
> - }
> - else
> - {
> - // unknown tags
> - log.warn("Ignoring tag of "+tag.getTag());
> + if (beginIndex < strLen) {
> + retval.add(new TextTag(start.substring(beginIndex)));
> }
> }
> - else
> - {
> - throw new DataException(Msg.GBF_BADTOKEN, new Object[] { token });
> +
> + String tag = remains.substring(ltpos+1, gtpos);
> + if (tag.length() > 0) {
> + retval.add(createTag(tag));
> }
> +
> + remains = remains.substring(gtpos+1);
> }
> -
> - stack.removeFirst();
> +
> + private boolean isSeperator(char c) {
> + final String seperators = " ,:;.?!";
> + return seperators.indexOf(c) >= 0;
> + }
> +
> + private ITag createTag(String tag) {
> + if (tag.equals("RB")) {
> + return new TextWithEmbeddedFootnote();
> + }
> + if (tag.equals("RF")) {
> + return new FootnoteStartTag();
> + }
> + if (tag.equals("Rf")) {
> + return new FootnoteEndTag();
> + }
> + if (tag.equals("FI")) {
> + return new ItalicStartTag();
> + }
> + if (tag.equals("Fi")) {
> + return new ItalicEndTag();
> + }
> + if (tag.equals("CM")) {
> + return new ParagraphTag();
> + }
> + if (tag.startsWith("WT")) {
> + return new StrongsMorphRefTag(tag);
> + }
> + if (tag.startsWith("WH") || tag.startsWith("WG")) {
> + return new StrongsWordRefTag(tag);
> + }
> + return new UnknownTag(tag);
> + }
> + private String remains;
> + private List retval = new ArrayList();
> }
>
> /**
> - * Strip all the Tags from a List and return just the text
> + * GBF Tag interface
> + *
> + * Now the number of supported tags are small.
> + * If the number become large, refactor...
> + * 1. refactor ITag to public abstract class GBFTag
> + * 2. move createTag() to GBFTag
> + * 3. move tag classes to GBFTag.java so that adding tags updates only GBFTag.java
> + *
> + * On adding new tags, implements new tag classes and update createTag()
> */
> - private String filterText(List list)
> + private static interface ITag
> {
> - StringBuffer buffer = new StringBuffer();
> -
> - // go through the token working out what to do with them all
> - for (Iterator it = list.iterator(); it.hasNext();)
> + /**
> + * Sub-classes should implement this method to generate OSIS Object
> + */
> + public void updateOsisStack(Stack osisStack) throws JAXBException;
> + }
> +
> + /**
> + * Tag syntax: <RB>Words<RF>note<Rf>
> + */
> + private static class TextWithEmbeddedFootnote implements ITag
> + {
> + public void updateOsisStack(Stack stack) throws JAXBException
> {
> - Object token = it.next();
> - if (token instanceof String)
> - {
> - buffer.append((String) token);
> - }
> + Note note = JAXBUtil.factory().createNote();
> + note.setNoteType("x-StudyNote");
> + Element current = (Element) stack.peek();
> +
> + List list = JAXBUtil.getList(current);
> + list.add(note);
> + stack.push(note);
> }
> -
> - return buffer.toString();
> }
> -
> +
> /**
> - * Get a list for the footnote
> + * Tag syntax: <RF>note<Rf>
> */
> - private List getTokensUntil(Iterator it, Tag end) throws JAXBException
> + private static class FootnoteStartTag implements ITag
> {
> - // take tokens off the list until end of list or FOOTNOTE_END
> - List ignored = new ArrayList();
> -
> - while (true)
> + public void updateOsisStack(Stack stack) throws JAXBException
> {
> - if (!it.hasNext())
> - {
> - break;
> - }
> -
> - Object token = it.next();
> - if (token instanceof String)
> + Element current = (Element) stack.peek();
> + if (!(current instanceof Note))
> {
> - ignored.add(token);
> - }
> - else if (token instanceof Tag)
> - {
> - Tag tag = (Tag) token;
> - if (tag.equals(end))
> - {
> - break;
> - }
> - else
> - {
> - ignored.add(token);
> - }
> - }
> - else
> - {
> - throw new JAXBException("Failed to parse: "+token);
> + Note note = JAXBUtil.factory().createNote();
> + note.setNoteType("x-StudyNote");
> +
> + List list = JAXBUtil.getList(current);
> + list.add(note);
> + stack.push(note);
> }
> }
> -
> - return ignored;
> }
>
> /**
> - * Create a list of strings and tags
> - * @param plain
> - * @return List
> + * Tag syntax: <RF>note<Rf>
> */
> - private List tokenize(String plain)
> + private static class FootnoteEndTag implements ITag
> {
> - List retval = new ArrayList();
> - String remains = plain;
> -
> - while (true)
> + public void updateOsisStack(Stack stack) throws JAXBException
> {
> - int ltpos = remains.indexOf('<');
> - int gtpos = remains.indexOf('>');
> + Note note = (Note) stack.pop();
> + List list = JAXBUtil.getList(note);
>
> - if (ltpos == -1 && gtpos == -1)
> - {
> - // no more tags to decode
> - retval.add(remains);
> - break;
> + if (list.size() < 1) {
> + JAXBUtil.getList((Element)stack.peek()).remove(note);
> }
> + }
> + }
>
> - // check that we don't have unmatched tags
> - if (ltpos == -1 || gtpos == -1)
> - {
> - log.warn("ignoring unmatched '<' or '>' in gbf: "+remains);
> - retval.add(remains);
> - break;
> - }
> -
> - // check that the tags are in a sensible order
> - if (ltpos > gtpos)
> - {
> - log.warn("ignoring unmatched '<' or '>' in gbf: "+remains);
> - retval.add(remains);
> - break;
> - }
> + /**
> + * Tag syntax: <FI>note<Fi>
> + */
> + private static class ItalicStartTag implements ITag
> + {
> + public void updateOsisStack(Stack stack) throws JAXBException
> + {
> + // remarked, for the XSL does not present it correctly
> + // The XSL should translate it to <I>...</I> but now it translated
> + // to <div>...</div>
> + /*
> + Seg seg = JAXBUtil.factory().createSeg();
> + Element current = (Element) stack.peek();
>
> - String start = remains.substring(0, ltpos);
> - retval.add(start);
> + List list = JAXBUtil.getList(current);
> + list.add(seg);
>
> - String tag = remains.substring(ltpos+1, gtpos);
> - retval.add(new Tag(tag));
> -
> - remains = remains.substring(gtpos+1);
> + stack.push(seg);
> + */
> }
> -
> - return retval;
> }
>
> /**
> - * A GBF Tag
> + * Tag syntax: <FI>note<Fi>
> */
> - private static class Tag
> + private static class ItalicEndTag implements ITag
> {
> - public Tag(String tag)
> + public void updateOsisStack(Stack stack)
> {
> - this.tag = tag;
> + // remarked, for the XSL does not translate it correctly
> + // stack.pop();
> }
> + }
>
> - public String getTag()
> + /**
> + * Tag syntax: Words<CM>
> + */
> + private static class ParagraphTag implements ITag
> + {
> + public void updateOsisStack(Stack stack) {
> + JAXBUtil.getList((Element)stack.peek()).add(Character.toString('¶'));
> + }
> + }
> +
> + /**
> + * Tag syntax: word<WHxxxx> or word<WGxxxx>
> + */
> + private static class StrongsWordRefTag implements ITag
> + {
> + public StrongsWordRefTag(String tagName)
> {
> - return tag;
> + tag = tagName.trim();
> }
> -
> - public boolean equals(Object obj)
> + public void updateOsisStack(Stack stack) throws JAXBException
> {
> - if (obj == null)
> - return false;
> -
> - if (obj.getClass() != this.getClass())
> - return false;
> -
> - Tag that = (Tag) obj;
> - return this.tag.equals(that.tag);
> + Element ele = (Element) stack.peek();
> + List list = JAXBUtil.getList(ele);
> + if (list.isEmpty())
> + {
> + log.error("Source has problem for tag <" + tag + ">.");
> + return;
> + }
> + int lastIndex = list.size() - 1;
> + Object prevObj = list.get(lastIndex);
> + W word = null;
> + if (prevObj instanceof String)
> + {
> + word = JAXBUtil.factory().createW();
> + word.getContent().add(prevObj);
> + list.set(lastIndex, word);
> + }
> + else if (prevObj instanceof W) {
> + word = (W) prevObj;
> + }
> + else {
> + log.error("Source has problem for tag <" + tag + ">.");
> + return;
> + }
> + String existingLemma = word.getLemma();
> + StringBuffer newLemma = new StringBuffer();
> + if (existingLemma != null && existingLemma.length() > 0) {
> + newLemma.append(existingLemma).append('|');
> + }
> + newLemma.append("x-Strongs:").append(tag.substring(2));
> + word.setLemma(newLemma.toString());
> }
> -
> - public int hashCode()
> + private String tag;
> + }
> +
> + /**
> + * Tag syntax: word<WTxxxx>
> + */
> + private static class StrongsMorphRefTag implements ITag
> + {
> + public StrongsMorphRefTag(String tagName)
> {
> - return tag.hashCode();
> + tag = tagName.trim();
> + }
> + public void updateOsisStack(Stack stack) throws JAXBException
> + {
> + Element ele = (Element) stack.peek();
> + List list = JAXBUtil.getList(ele);
> + if (list.isEmpty())
> + {
> + log.error("Source has problem for tag <" + tag + ">.");
> + return;
> + }
> + int lastIndex = list.size() - 1;
> + Object prevObj = list.get(lastIndex);
> + W word = null;
> + if (prevObj instanceof String)
> + {
> + word = JAXBUtil.factory().createW();
> + word.getContent().add(prevObj);
> + list.set(lastIndex, word);
> + }
> + else if (prevObj instanceof W) {
> + word = (W) prevObj;
> + }
> + else {
> + log.error("Source has problem for tag <" + tag + ">.");
> + return;
> + }
> + String existingMorph = word.getMorph();
> + StringBuffer newMorph = new StringBuffer();
> + if (existingMorph != null && existingMorph.length() > 0) {
> + newMorph.append(existingMorph).append('|');
> + }
> + newMorph.append("x-StrongsMorph:T").append(tag.substring(2));
> + word.setMorph(newMorph.toString());
> }
> -
> private String tag;
> }
>
> - private static final Tag PARAGRAPH = new Tag("CM");
> - private static final Tag FOOTNOTE_START = new Tag("RF");
> - private static final Tag FOOTNOTE_STOP = new Tag("Rf");
> - private static final Tag ITALICS_START = new Tag("FI");
> - private static final Tag ITALICS_STOP = new Tag("Fi");
> + /**
> + * Represent a trunc of bible text without any tags
> + */
> + private static class TextTag implements ITag
> + {
> + public TextTag(String textData)
> + {
> + text = textData;
> + }
> + public void updateOsisStack(Stack stack) throws JAXBException
> + {
> + Element ele = (Element) stack.peek();
> + List list = JAXBUtil.getList(ele);
> + list.add(text);
> + }
> + private String text;
> + }
> +
> + /**
> + * Unknown Tag. Either not supported tag or tag not defined in GBF specification
> + */
> + private static class UnknownTag implements ITag
> + {
> + public UnknownTag(String tagName)
> + {
> + tag = tagName;
> + }
> + public void updateOsisStack(Stack stack)
> + {
> + // unknown tags
> + log.warn("Ignoring tag of "+ tag);
> + }
> + private String tag;
> + }
>
> /**
> * The log stream
> Index: jsword/java/jsword/org/crosswire/jsword/book/data/JAXBUtil.java
> ===================================================================
> RCS file: /cvs/jsword/jsword/java/jsword/org/crosswire/jsword/book/data/JAXBUtil.java,v
> retrieving revision 1.3
> diff -u -r1.3 JAXBUtil.java
> --- jsword/java/jsword/org/crosswire/jsword/book/data/JAXBUtil.java 17 May 2003 16:11:52 -0000 1.3
> +++ jsword/java/jsword/org/crosswire/jsword/book/data/JAXBUtil.java 24 May 2003 05:45:32 -0000
> @@ -12,9 +12,11 @@
> import org.crosswire.common.util.Logger;
> import org.crosswire.common.util.LogicError;
> import org.crosswire.jsword.osis.Div;
> +import org.crosswire.jsword.osis.Note;
> import org.crosswire.jsword.osis.ObjectFactory;
> import org.crosswire.jsword.osis.Seg;
> import org.crosswire.jsword.osis.Verse;
> +import org.crosswire.jsword.osis.W;
> import org.crosswire.jsword.util.Project;
>
> /**
> @@ -148,6 +150,14 @@
> else if (current instanceof Div)
> {
> return ((Div) current).getContent();
> + }
> + else if (current instanceof Note)
> + {
> + return ((Note) current).getContent();
> + }
> + else if (current instanceof W)
> + {
> + return ((W) current).getContent();
> }
>
> log.error("unknown element: "+current.getClass().getName());
>