[jsword-devel] GBF tag support for Hebrew and Greek...
Jacky Cheung
jsword-devel@crosswire.org
Sun, 01 Jun 2003 18:56:32 +0800
Yes. It should work fine, as ITag is not used elsewhere. I hope you
have a good start at your new job. Unfortunately, I will be
allocated to a tough task, and I worry I may not have time to work on
jsword :( May the Lord bless you.
Jacky
Joe Walker wrote:
>
> Hi,
>
> I've checked it into CVS.
> I took the liberty of pressing CTRL+SHIFT+F to reformat and I renamed
> ITag to Tag, just to keep everything looking similar. Hope that's not
> a problem.
>
> It seems to work fine from what I can see. I start a new job tomorrow
> so I hope to have more time on the train to get some more coding done.
> Cheers,
>
> Joe.
>
> Joe Walker wrote:
>
>> Hi Jacky,
>>
>> Thanks for the patch. I'm just back from holiday and I hope to apply it
>> very soon.
>>
>> Joe.
>>
>>
>> On Sat, 2003-05-24 at 06:52, Jacky Cheung wrote:
>>
>>> Hi,
>>>
>>> Please find the patch for GBF tag support for Heb. and Greek
>>> attached. There is a huge change in the GBFFilter class. Thanks.
>>>
>>> Best regards,
>>> Jacky
>>>
>>> P.S.
>>> Now the GUI can show a hyperlink for a tagged word but cannot link to
>>> the lexicon. Furthermore, the translated output of the text does not
>>> look good enough. I know that somebody has written an XSL file to
>>> convert OSIS XML to HTML.
>>>
>>>
>>> ______________________________________________________________________
>>>
>>> Index: jsword/java/jsword/org/crosswire/jsword/book/data/GBFFilter.java
>>> ===================================================================
>>> RCS file:
>>> /cvs/jsword/jsword/java/jsword/org/crosswire/jsword/book/data/GBFFilter.java,v
>>>
>>> retrieving revision 1.7
>>> diff -u -r1.7 GBFFilter.java
>>> ---
>>> jsword/java/jsword/org/crosswire/jsword/book/data/GBFFilter.java
>>> 17 May 2003 15:00:14 -0000 1.7
>>> +++
>>> jsword/java/jsword/org/crosswire/jsword/book/data/GBFFilter.java
>>> 24 May 2003 05:45:31 -0000
>>> @@ -3,7 +3,7 @@
>>>
>>> import java.util.ArrayList;
>>> import java.util.Iterator;
>>> -import java.util.LinkedList;
>>> +import java.util.Stack;
>>> import java.util.List;
>>>
>>> import javax.xml.bind.Element;
>>> @@ -13,6 +13,7 @@
>>> import org.crosswire.common.util.LogicError;
>>> import org.crosswire.jsword.osis.Note;
>>> import org.crosswire.jsword.osis.Seg;
>>> +import org.crosswire.jsword.osis.W;
>>>
>>> /**
>>> * Filter to convert GBF data to OSIS format.
>>> @@ -47,8 +48,17 @@
>>> {
>>> try
>>> {
>>> - List tokens = tokenize(plain);
>>> - parseTokens(ele, tokens);
>>> + Stack stack = new Stack();
>>> + stack.push(ele);
>>> + + TagGenerator generator = new
>>> TagGenerator(plain);
>>> + ITag tag = generator.getNextTag();
>>> + while (tag != null) {
>>> + tag.updateOsisStack(stack);
>>> + tag = generator.getNextTag();
>>> + }
>>> + + stack.pop();
>>> }
>>> catch (JAXBException ex)
>>> {
>>> @@ -56,241 +66,367 @@
>>> }
>>> }
>>>
>>> - /**
>>> - * Go through a list of tokens and add them to the listener
>>> - */
>>> - public void parseTokens(Element ele, List tokens) throws
>>> JAXBException, DataException
>>> - {
>>> - LinkedList stack = new LinkedList();
>>> - stack.addFirst(ele);
>>> -
>>> - // For notes
>>> - int marker = 1;
>>> -
>>> - // go through the token working out what to do with them all
>>> - for (Iterator it = tokens.iterator(); it.hasNext();)
>>> - {
>>> - Object token = it.next();
>>> - if (token instanceof String)
>>> + private static class TagGenerator {
>>> + public TagGenerator(String plain) {
>>> + int lastIndex = plain.length() - 1;
>>> + if (lastIndex >= 0 && plain.charAt(lastIndex) ==
>>> ((char) 13))
>>> {
>>> - Element current = (Element) stack.getFirst();
>>> - List list = JAXBUtil.getList(current);
>>> - list.add((String) token);
>>> + plain = plain.substring(0, lastIndex);
>>> }
>>> - else if (token instanceof Tag)
>>> - {
>>> - Tag tag = (Tag) token;
>>> + remains = plain;
>>> + }
>>> + + /**
>>> + * Get Next tags in the string
>>> + */
>>> + public ITag getNextTag() {
>>> + if (retval.isEmpty()) {
>>> + if (remains == null)
>>> + return null;
>>> + parseNextTag();
>>> + }
>>> + return (ITag) retval.remove(0);
>>> + }
>>> - // skip over the rest of the footnote
>>> - if (tag.equals(FOOTNOTE_START))
>>> - {
>>> - List footnote = getTokensUntil(it, FOOTNOTE_STOP);
>>> - String content = filterText(footnote);
>>> - - // This could be a marker
>>> or it could be the body of the note
>>> - // We tell which by string length. <= 1 is a
>>> marker which we
>>> - // ignore for simplicity
>>> - if (content.length() > 1)
>>> + private void parseNextTag() {
>>> + if (remains == null)
>>> + {
>>> + return;
>>> + }
>>> + + int ltpos = remains.indexOf('<');
>>> + int gtpos = remains.indexOf('>');
>>> + + if (ltpos == -1 && gtpos == -1) {
>>> + // no more tags to decode
>>> + retval.add(new TextTag(remains));
>>> + remains = null;
>>> + return;
>>> + }
>>> + + // check that we don't have unmatched tags
>>> + if (ltpos == -1 || gtpos == -1) {
>>> + log.warn("ignoring unmatched '<' or '>' in gbf:
>>> "+remains);
>>> + retval.add(new TextTag(remains));
>>> + remains = null;
>>> + return;
>>> + }
>>> + + // check that the tags are in a sensible
>>> order
>>> + if (ltpos > gtpos) {
>>> + log.warn("ignoring unmatched '<' or '>' in gbf:
>>> "+remains);
>>> + retval.add(new TextTag(remains));
>>> + remains = null;
>>> + return;
>>> + }
>>> + + // generate tags
>>> + String start = remains.substring(0, ltpos);
>>> + int strLen = start.length();
>>> + if (strLen > 0) {
>>> + int beginIndex = 0;
>>> + boolean inSepStr = isSeperator(start.charAt(0));
>>> + // split words from seperators...
>>> + // e.g., "a b c? e g." -> "a b c", "? ", "e g."
>>> + // "a b c<tag> e g." -> "a b c", tag, " ", "e
>>> g."
>>> + for(int i=1; inSepStr && i<strLen; i++) {
>>> + char currentChar = start.charAt(i);
>>> + if (!isSeperator(currentChar))
>>> {
>>> - Note note = JAXBUtil.factory().createNote();
>>> - note.setN(""+(marker++));
>>> - note.getContent().add(content);
>>> - Element current = (Element) stack.getFirst();
>>> - - List list =
>>> JAXBUtil.getList(current); - list.add(note);
>>> + retval.add(new
>>> TextTag(start.substring(beginIndex, i)));
>>> + beginIndex = i;
>>> + inSepStr = false;
>>> }
>>> }
>>> - else if (tag.equals(PARAGRAPH))
>>> - {
>>> - // ignore paragraph markers
>>> - } - else if
>>> (tag.equals(ITALICS_START))
>>> - {
>>> - Seg seg = JAXBUtil.factory().createSeg();
>>> - Element current = (Element) stack.getFirst();
>>> - - List list =
>>> JAXBUtil.getList(current); - list.add(seg);
>>> -
>>> - stack.addFirst(seg);
>>> - } - else if
>>> (tag.equals(ITALICS_STOP))
>>> - {
>>> - Object top = stack.removeFirst();
>>> - - // Check that we are
>>> properly tree structured
>>> - if (!(top instanceof Seg))
>>> - {
>>> - throw new LogicError();
>>> - }
>>> - } - else
>>> - {
>>> - // unknown tags
>>> - log.warn("Ignoring tag of "+tag.getTag());
>>> + if (beginIndex < strLen) {
>>> + retval.add(new
>>> TextTag(start.substring(beginIndex)));
>>> }
>>> }
>>> - else
>>> - {
>>> - throw new DataException(Msg.GBF_BADTOKEN, new
>>> Object[] { token });
>>> + + String tag = remains.substring(ltpos+1,
>>> gtpos);
>>> + if (tag.length() > 0) {
>>> + retval.add(createTag(tag));
>>> }
>>> + + remains = remains.substring(gtpos+1);
>>> }
>>> -
>>> - stack.removeFirst();
>>> + + private boolean isSeperator(char c) {
>>> + final String seperators = " ,:;.?!";
>>> + return seperators.indexOf(c) >= 0;
>>> + }
>>> + + private ITag createTag(String tag) {
>>> + if (tag.equals("RB")) {
>>> + return new TextWithEmbeddedFootnote();
>>> + }
>>> + if (tag.equals("RF")) {
>>> + return new FootnoteStartTag();
>>> + }
>>> + if (tag.equals("Rf")) {
>>> + return new FootnoteEndTag();
>>> + }
>>> + if (tag.equals("FI")) {
>>> + return new ItalicStartTag();
>>> + }
>>> + if (tag.equals("Fi")) {
>>> + return new ItalicEndTag();
>>> + }
>>> + if (tag.equals("CM")) {
>>> + return new ParagraphTag();
>>> + }
>>> + if (tag.startsWith("WT")) {
>>> + return new StrongsMorphRefTag(tag);
>>> + }
>>> + if (tag.startsWith("WH") || tag.startsWith("WG")) {
>>> + return new StrongsWordRefTag(tag);
>>> + }
>>> + return new UnknownTag(tag);
>>> + }
>>> + private String remains;
>>> + private List retval = new ArrayList();
>>> }
>>>
>>> /**
>>> - * Strip all the Tags from a List and return just the text
>>> + * GBF Tag interface
>>> + *
>>> + * Now the number of supported tags are small.
>>> + * If the number become large, refactor...
>>> + * 1. refactor ITag to public abstract class GBFTag
>>> + * 2. move createTag() to GBFTag
>>> + * 3. move tag classes to GBFTag.java so that adding tags
>>> updates only GBFTag.java
>>> + *
>>> + * On adding new tags, implements new tag classes and update
>>> createTag()
>>> */
>>> - private String filterText(List list)
>>> + private static interface ITag
>>> {
>>> - StringBuffer buffer = new StringBuffer();
>>> -
>>> - // go through the token working out what to do with them all
>>> - for (Iterator it = list.iterator(); it.hasNext();)
>>> + /**
>>> + * Sub-classes should implement this method to generate
>>> OSIS Object
>>> + */
>>> + public void updateOsisStack(Stack osisStack) throws
>>> JAXBException;
>>> + }
>>> + + /**
>>> + * Tag syntax: <RB>Words<RF>note<Rf>
>>> + */
>>> + private static class TextWithEmbeddedFootnote implements ITag
>>> + {
>>> + public void updateOsisStack(Stack stack) throws JAXBException
>>> {
>>> - Object token = it.next();
>>> - if (token instanceof String)
>>> - {
>>> - buffer.append((String) token);
>>> - }
>>> + Note note = JAXBUtil.factory().createNote();
>>> + note.setNoteType("x-StudyNote");
>>> + Element current = (Element) stack.peek();
>>> + + List list = JAXBUtil.getList(current);
>>> + list.add(note);
>>> + stack.push(note);
>>> }
>>> - - return buffer.toString();
>>> }
>>> -
>>> + /**
>>> - * Get a list for the footnote
>>> + * Tag syntax: <RF>note<Rf>
>>> */
>>> - private List getTokensUntil(Iterator it, Tag end) throws
>>> JAXBException
>>> + private static class FootnoteStartTag implements ITag
>>> {
>>> - // take tokens off the list until end of list or FOOTNOTE_END
>>> - List ignored = new ArrayList();
>>> -
>>> - while (true)
>>> + public void updateOsisStack(Stack stack) throws JAXBException
>>> {
>>> - if (!it.hasNext())
>>> - {
>>> - break;
>>> - }
>>> - - Object token = it.next();
>>> - if (token instanceof String)
>>> + Element current = (Element) stack.peek();
>>> + if (!(current instanceof Note))
>>> {
>>> - ignored.add(token);
>>> - }
>>> - else if (token instanceof Tag)
>>> - {
>>> - Tag tag = (Tag) token;
>>> - if (tag.equals(end))
>>> - {
>>> - break;
>>> - }
>>> - else
>>> - {
>>> - ignored.add(token);
>>> - }
>>> - }
>>> - else
>>> - {
>>> - throw new JAXBException("Failed to parse: "+token);
>>> + Note note = JAXBUtil.factory().createNote();
>>> + note.setNoteType("x-StudyNote");
>>> + + List list =
>>> JAXBUtil.getList(current);
>>> + list.add(note);
>>> + stack.push(note);
>>> }
>>> }
>>> -
>>> - return ignored;
>>> }
>>> /**
>>> - * Create a list of strings and tags
>>> - * @param plain
>>> - * @return List
>>> + * Tag syntax: <RF>note<Rf>
>>> */
>>> - private List tokenize(String plain)
>>> + private static class FootnoteEndTag implements ITag
>>> {
>>> - List retval = new ArrayList();
>>> - String remains = plain;
>>> -
>>> - while (true)
>>> + public void updateOsisStack(Stack stack) throws JAXBException
>>> {
>>> - int ltpos = remains.indexOf('<');
>>> - int gtpos = remains.indexOf('>');
>>> + Note note = (Note) stack.pop();
>>> + List list = JAXBUtil.getList(note);
>>>
>>> - if (ltpos == -1 && gtpos == -1)
>>> - {
>>> - // no more tags to decode
>>> - retval.add(remains);
>>> - break;
>>> + if (list.size() < 1) {
>>> + JAXBUtil.getList((Element)stack.peek()).remove(note);
>>> }
>>> + }
>>> + }
>>>
>>> - // check that we don't have unmatched tags
>>> - if (ltpos == -1 || gtpos == -1)
>>> - {
>>> - log.warn("ignoring unmatched '<' or '>' in gbf:
>>> "+remains);
>>> - retval.add(remains);
>>> - break;
>>> - }
>>> - - // check that the tags are in a sensible
>>> order
>>> - if (ltpos > gtpos)
>>> - {
>>> - log.warn("ignoring unmatched '<' or '>' in gbf:
>>> "+remains);
>>> - retval.add(remains);
>>> - break;
>>> - }
>>> + /**
>>> + * Tag syntax: <FI>note<Fi>
>>> + */
>>> + private static class ItalicStartTag implements ITag
>>> + {
>>> + public void updateOsisStack(Stack stack) throws JAXBException
>>> + {
>>> + // remarked, for the XSL does not present it correctly
>>> + // The XSL should translate it to <I>...</I> but now it
>>> translated
>>> + // to <div>...</div>
>>> + /*
>>> + Seg seg = JAXBUtil.factory().createSeg();
>>> + Element current = (Element) stack.peek();
>>>
>>> - String start = remains.substring(0, ltpos);
>>> - retval.add(start);
>>> + List list = JAXBUtil.getList(current); +
>>> list.add(seg);
>>>
>>> - String tag = remains.substring(ltpos+1, gtpos);
>>> - retval.add(new Tag(tag));
>>> - - remains = remains.substring(gtpos+1);
>>> + stack.push(seg);
>>> + */
>>> }
>>> -
>>> - return retval;
>>> }
>>>
>>> /**
>>> - * A GBF Tag
>>> + * Tag syntax: <FI>note<Fi>
>>> */
>>> - private static class Tag
>>> + private static class ItalicEndTag implements ITag
>>> {
>>> - public Tag(String tag)
>>> + public void updateOsisStack(Stack stack)
>>> {
>>> - this.tag = tag;
>>> + // remarked, for the XSL does not translate it correctly
>>> + // stack.pop();
>>> }
>>> + }
>>>
>>> - public String getTag()
>>> + /**
>>> + * Tag syntax: Words<CM>
>>> + */
>>> + private static class ParagraphTag implements ITag
>>> + {
>>> + public void updateOsisStack(Stack stack) {
>>> +
>>> JAXBUtil.getList((Element)stack.peek()).add(Character.toString('¶'));
>>> + }
>>> + }
>>> + + /**
>>> + * Tag syntax: word<WHxxxx> or word<WGxxxx>
>>> + */
>>> + private static class StrongsWordRefTag implements ITag
>>> + {
>>> + public StrongsWordRefTag(String tagName)
>>> {
>>> - return tag;
>>> + tag = tagName.trim();
>>> }
>>> -
>>> - public boolean equals(Object obj)
>>> + public void updateOsisStack(Stack stack) throws JAXBException
>>> {
>>> - if (obj == null)
>>> - return false;
>>> -
>>> - if (obj.getClass() != this.getClass())
>>> - return false;
>>> -
>>> - Tag that = (Tag) obj;
>>> - return this.tag.equals(that.tag);
>>> + Element ele = (Element) stack.peek();
>>> + List list = JAXBUtil.getList(ele);
>>> + if (list.isEmpty())
>>> + {
>>> + log.error("Source has problem for tag <" + tag +
>>> ">.");
>>> + return;
>>> + }
>>> + int lastIndex = list.size() - 1;
>>> + Object prevObj = list.get(lastIndex);
>>> + W word = null;
>>> + if (prevObj instanceof String)
>>> + {
>>> + word = JAXBUtil.factory().createW();
>>> + word.getContent().add(prevObj);
>>> + list.set(lastIndex, word);
>>> + }
>>> + else if (prevObj instanceof W) {
>>> + word = (W) prevObj;
>>> + }
>>> + else {
>>> + log.error("Source has problem for tag <" + tag +
>>> ">.");
>>> + return;
>>> + }
>>> + String existingLemma = word.getLemma();
>>> + StringBuffer newLemma = new StringBuffer();
>>> + if (existingLemma != null && existingLemma.length() > 0) {
>>> + newLemma.append(existingLemma).append('|');
>>> + }
>>> + newLemma.append("x-Strongs:").append(tag.substring(2));
>>> + word.setLemma(newLemma.toString());
>>> }
>>> - - public int hashCode()
>>> + private String tag;
>>> + }
>>> + + /**
>>> + * Tag syntax: word<WTxxxx>
>>> + */
>>> + private static class StrongsMorphRefTag implements ITag
>>> + {
>>> + public StrongsMorphRefTag(String tagName)
>>> {
>>> - return tag.hashCode();
>>> + tag = tagName.trim();
>>> + }
>>> + public void updateOsisStack(Stack stack) throws JAXBException
>>> + {
>>> + Element ele = (Element) stack.peek();
>>> + List list = JAXBUtil.getList(ele);
>>> + if (list.isEmpty())
>>> + {
>>> + log.error("Source has problem for tag <" + tag +
>>> ">.");
>>> + return;
>>> + }
>>> + int lastIndex = list.size() - 1;
>>> + Object prevObj = list.get(lastIndex);
>>> + W word = null;
>>> + if (prevObj instanceof String)
>>> + {
>>> + word = JAXBUtil.factory().createW();
>>> + word.getContent().add(prevObj);
>>> + list.set(lastIndex, word);
>>> + }
>>> + else if (prevObj instanceof W) {
>>> + word = (W) prevObj;
>>> + }
>>> + else {
>>> + log.error("Source has problem for tag <" + tag +
>>> ">.");
>>> + return;
>>> + }
>>> + String existingMorph = word.getMorph();
>>> + StringBuffer newMorph = new StringBuffer();
>>> + if (existingMorph != null && existingMorph.length() > 0) {
>>> + newMorph.append(existingMorph).append('|');
>>> + }
>>> +
>>> newMorph.append("x-StrongsMorph:T").append(tag.substring(2));
>>> + word.setMorph(newMorph.toString());
>>> }
>>> -
>>> private String tag;
>>> }
>>> - private static final Tag PARAGRAPH = new Tag("CM");
>>> - private static final Tag FOOTNOTE_START = new Tag("RF");
>>> - private static final Tag FOOTNOTE_STOP = new Tag("Rf");
>>> - private static final Tag ITALICS_START = new Tag("FI");
>>> - private static final Tag ITALICS_STOP = new Tag("Fi");
>>> + /**
>>> + * Represent a trunc of bible text without any tags
>>> + */
>>> + private static class TextTag implements ITag
>>> + {
>>> + public TextTag(String textData)
>>> + {
>>> + text = textData;
>>> + }
>>> + public void updateOsisStack(Stack stack) throws JAXBException
>>> + {
>>> + Element ele = (Element) stack.peek();
>>> + List list = JAXBUtil.getList(ele);
>>> + list.add(text);
>>> + }
>>> + private String text;
>>> + }
>>> + + /**
>>> + * Unknown Tag. Either not supported tag or tag not defined in
>>> GBF specification
>>> + */
>>> + private static class UnknownTag implements ITag
>>> + {
>>> + public UnknownTag(String tagName)
>>> + {
>>> + tag = tagName;
>>> + }
>>> + public void updateOsisStack(Stack stack)
>>> + {
>>> + // unknown tags
>>> + log.warn("Ignoring tag of "+ tag);
>>> + }
>>> + private String tag;
>>> + }
>>>
>>> /**
>>> * The log stream
>>> Index: jsword/java/jsword/org/crosswire/jsword/book/data/JAXBUtil.java
>>> ===================================================================
>>> RCS file:
>>> /cvs/jsword/jsword/java/jsword/org/crosswire/jsword/book/data/JAXBUtil.java,v
>>>
>>> retrieving revision 1.3
>>> diff -u -r1.3 JAXBUtil.java
>>> ---
>>> jsword/java/jsword/org/crosswire/jsword/book/data/JAXBUtil.java
>>> 17 May 2003 16:11:52 -0000 1.3
>>> +++
>>> jsword/java/jsword/org/crosswire/jsword/book/data/JAXBUtil.java
>>> 24 May 2003 05:45:32 -0000
>>> @@ -12,9 +12,11 @@
>>> import org.crosswire.common.util.Logger;
>>> import org.crosswire.common.util.LogicError;
>>> import org.crosswire.jsword.osis.Div;
>>> +import org.crosswire.jsword.osis.Note;
>>> import org.crosswire.jsword.osis.ObjectFactory;
>>> import org.crosswire.jsword.osis.Seg;
>>> import org.crosswire.jsword.osis.Verse;
>>> +import org.crosswire.jsword.osis.W;
>>> import org.crosswire.jsword.util.Project;
>>>
>>> /**
>>> @@ -148,6 +150,14 @@
>>> else if (current instanceof Div)
>>> {
>>> return ((Div) current).getContent();
>>> + }
>>> + else if (current instanceof Note)
>>> + {
>>> + return ((Note) current).getContent();
>>> + }
>>> + else if (current instanceof W)
>>> + {
>>> + return ((W) current).getContent();
>>> }
>>> log.error("unknown element:
>>> "+current.getClass().getName());
>>>
>>
>>
>> _______________________________________________
>> jsword-devel mailing list
>> jsword-devel@crosswire.org
>> http://www.crosswire.org/mailman/listinfo/jsword-devel
>
>
> _______________________________________________
> jsword-devel mailing list
> jsword-devel@crosswire.org
> http://www.crosswire.org/mailman/listinfo/jsword-devel
>