[jsword-devel] GBF tag support for Hebrew and Greek...

Joe Walker jsword-devel@crosswire.org
Sun, 01 Jun 2003 07:36:13 +0100


Hi,

I've checked it into CVS.
I took the liberty of pressing CTRL+SHIFT+F to reformat and I renamed 
ITag to Tag, just to keep everything looking similar. Hope that's not a 
problem.

It seems to work fine from what I can see. I start a new job tomorrow so 
I hope to have more time on the train to get some more coding done.
Cheers,

Joe.

Joe Walker wrote:

> Hi Jacky,
> 
> Thanks for the patch. I'm just back from holiday and I hope to apply it
> very soon.
> 
> Joe.
> 
> 
> On Sat, 2003-05-24 at 06:52, Jacky Cheung wrote:
> 
>>Hi,
>>
>>Please find the patch for GBF tag support for Heb. and Greek attached. 
>>There is a huge change in the GBFFilter class.  Thanks.
>>
>>Best regards,
>>Jacky
>>
>>P.S.
>>Now, the GUI can show hyperlinks for tagged words but cannot link to 
>>the lexicon. Furthermore, the translated output of the text does not look 
>>good enough. I know that somebody has written an XSL file to convert OSIS 
>>XML to HTML.
>>
>>
>>______________________________________________________________________
>>
>>Index: jsword/java/jsword/org/crosswire/jsword/book/data/GBFFilter.java
>>===================================================================
>>RCS file: /cvs/jsword/jsword/java/jsword/org/crosswire/jsword/book/data/GBFFilter.java,v
>>retrieving revision 1.7
>>diff -u -r1.7 GBFFilter.java
>>--- jsword/java/jsword/org/crosswire/jsword/book/data/GBFFilter.java	17 May 2003 15:00:14 -0000	1.7
>>+++ jsword/java/jsword/org/crosswire/jsword/book/data/GBFFilter.java	24 May 2003 05:45:31 -0000
>>@@ -3,7 +3,7 @@
>> 
>> import java.util.ArrayList;
>> import java.util.Iterator;
>>-import java.util.LinkedList;
>>+import java.util.Stack;
>> import java.util.List;
>> 
>> import javax.xml.bind.Element;
>>@@ -13,6 +13,7 @@
>> import org.crosswire.common.util.LogicError;
>> import org.crosswire.jsword.osis.Note;
>> import org.crosswire.jsword.osis.Seg;
>>+import org.crosswire.jsword.osis.W;
>> 
>> /**
>>  * Filter to convert GBF data to OSIS format.
>>@@ -47,8 +48,17 @@
>>     {
>>         try
>>         {
>>-            List tokens = tokenize(plain);
>>-            parseTokens(ele, tokens);
>>+            Stack stack = new Stack();
>>+            stack.push(ele);
>>+            
>>+            TagGenerator generator = new TagGenerator(plain);
>>+            ITag tag = generator.getNextTag();
>>+            while (tag != null) {
>>+                tag.updateOsisStack(stack);
>>+                tag = generator.getNextTag();
>>+            }
>>+            
>>+            stack.pop();
>>         }
>>         catch (JAXBException ex)
>>         {
>>@@ -56,241 +66,367 @@
>>         }
>>     }
>> 
>>-    /**
>>-     * Go through a list of tokens and add them to the listener
>>-     */
>>-    public void parseTokens(Element ele, List tokens) throws JAXBException, DataException
>>-    {
>>-        LinkedList stack = new LinkedList();
>>-        stack.addFirst(ele);
>>-
>>-        // For notes
>>-        int marker = 1;
>>-
>>-        // go through the token working out what to do with them all
>>-        for (Iterator it = tokens.iterator(); it.hasNext();)
>>-        {
>>-            Object token = it.next();
>>-            if (token instanceof String)
>>+    private static class TagGenerator {
>>+        public TagGenerator(String plain) {
>>+            int lastIndex = plain.length() - 1;
>>+            if (lastIndex >= 0 && plain.charAt(lastIndex) == ((char) 13))
>>             {
>>-                Element current = (Element) stack.getFirst();
>>-                List list = JAXBUtil.getList(current); 
>>-                list.add((String) token);
>>+                plain = plain.substring(0, lastIndex);
>>             }
>>-            else if (token instanceof Tag)
>>-            {
>>-                Tag tag = (Tag) token;
>>+            remains = plain;
>>+        }
>>+        
>>+        /**
>>+         * Get Next tags in the string
>>+         */
>>+        public ITag getNextTag() {
>>+            if (retval.isEmpty()) {
>>+                if (remains == null)
>>+                    return null;
>>+                parseNextTag();
>>+            }
>>+            return (ITag) retval.remove(0);
>>+        }
>>         
>>-                // skip over the rest of the footnote
>>-                if (tag.equals(FOOTNOTE_START))
>>-                {
>>-                    List footnote = getTokensUntil(it, FOOTNOTE_STOP);
>>-                    String content = filterText(footnote);
>>-                    
>>-                    // This could be a marker or it could be the body of the note
>>-                    // We tell which by string length. <= 1 is a marker which we
>>-                    // ignore for simplicity
>>-                    if (content.length() > 1)
>>+        private void parseNextTag() {
>>+            if (remains == null)
>>+            {
>>+                return;
>>+            }
>>+            
>>+            int ltpos = remains.indexOf('<');
>>+            int gtpos = remains.indexOf('>');
>>+            
>>+            if (ltpos == -1 && gtpos == -1) {
>>+                // no more tags to decode
>>+                retval.add(new TextTag(remains));
>>+                remains = null;
>>+                return;
>>+            }
>>+            
>>+            // check that we don't have unmatched tags
>>+            if (ltpos == -1 || gtpos == -1) {
>>+                log.warn("ignoring unmatched '<' or '>' in gbf: "+remains);
>>+                retval.add(new TextTag(remains));
>>+                remains = null;
>>+                return;
>>+            }
>>+            
>>+            // check that the tags are in a sensible order
>>+            if (ltpos > gtpos) {
>>+                log.warn("ignoring unmatched '<' or '>' in gbf: "+remains);
>>+                retval.add(new TextTag(remains));
>>+                remains = null;
>>+                return;
>>+            }
>>+            
>>+            // generate tags
>>+            String start = remains.substring(0, ltpos);
>>+            int strLen = start.length();
>>+            if (strLen > 0) {
>>+                int beginIndex = 0;
>>+                boolean inSepStr = isSeperator(start.charAt(0));
>>+                // split words from seperators...
>>+                // e.g., "a b c? e g." -> "a b c", "? ", "e g."
>>+                //       "a b c<tag> e g." -> "a b c", tag, " ", "e g."
>>+                for(int i=1; inSepStr && i<strLen; i++) {
>>+                    char currentChar = start.charAt(i);
>>+                    if (!isSeperator(currentChar))
>>                     {
>>-                        Note note = JAXBUtil.factory().createNote();
>>-                        note.setN(""+(marker++));
>>-                        note.getContent().add(content);
>>-                        Element current = (Element) stack.getFirst();
>>-                        
>>-                        List list = JAXBUtil.getList(current); 
>>-                        list.add(note);
>>+                        retval.add(new TextTag(start.substring(beginIndex, i)));
>>+                        beginIndex = i;
>>+                        inSepStr = false;
>>                     }
>>                 }
>>-                else if (tag.equals(PARAGRAPH))
>>-                {
>>-                    // ignore paragraph markers
>>-                }                
>>-                else if (tag.equals(ITALICS_START))
>>-                {
>>-                    Seg seg = JAXBUtil.factory().createSeg();
>>-                    Element current = (Element) stack.getFirst();
>>-                    
>>-                    List list = JAXBUtil.getList(current); 
>>-                    list.add(seg);
>>-
>>-                    stack.addFirst(seg);
>>-                }                
>>-                else if (tag.equals(ITALICS_STOP))
>>-                {
>>-                    Object top = stack.removeFirst();
>>-                    
>>-                    // Check that we are properly tree structured
>>-                    if (!(top instanceof Seg))
>>-                    {
>>-                        throw new LogicError();
>>-                    }
>>-                }                
>>-                else
>>-                {
>>-                    // unknown tags
>>-                    log.warn("Ignoring tag of "+tag.getTag());
>>+                if (beginIndex < strLen) {
>>+                    retval.add(new TextTag(start.substring(beginIndex)));
>>                 }
>>             }
>>-            else
>>-            {
>>-                throw new DataException(Msg.GBF_BADTOKEN, new Object[] { token });
>>+            
>>+            String tag = remains.substring(ltpos+1, gtpos);
>>+            if (tag.length() > 0) {
>>+                retval.add(createTag(tag));
>>             }
>>+            
>>+            remains = remains.substring(gtpos+1);
>>         }
>>-
>>-        stack.removeFirst();
>>+        
>>+        private boolean isSeperator(char c) {
>>+            final String seperators = " ,:;.?!";
>>+            return seperators.indexOf(c) >= 0;
>>+        }
>>+        
>>+        private ITag createTag(String tag) {
>>+            if (tag.equals("RB")) {
>>+                return new TextWithEmbeddedFootnote();
>>+            }
>>+            if (tag.equals("RF")) {
>>+                return new FootnoteStartTag();
>>+            }
>>+            if (tag.equals("Rf")) {
>>+                return new FootnoteEndTag();
>>+            }
>>+            if (tag.equals("FI")) {
>>+                return new ItalicStartTag();
>>+            }
>>+            if (tag.equals("Fi")) {
>>+                return new ItalicEndTag();
>>+            }
>>+            if (tag.equals("CM")) {
>>+                return new ParagraphTag();
>>+            }
>>+            if (tag.startsWith("WT")) {
>>+                return new StrongsMorphRefTag(tag);
>>+            }
>>+            if (tag.startsWith("WH") || tag.startsWith("WG")) {
>>+                return new StrongsWordRefTag(tag);
>>+            }
>>+            return new UnknownTag(tag);
>>+        }
>>+        private String remains;
>>+        private List retval = new ArrayList();
>>     }
>> 
>>     /**
>>-     * Strip all the Tags from a List and return just the text
>>+     * GBF Tag interface
>>+     *
>>+     * Now the number of supported tags are small.
>>+     * If the number become large, refactor...
>>+     * 1. refactor ITag to public abstract class GBFTag
>>+     * 2. move createTag() to GBFTag
>>+     * 3. move tag classes to GBFTag.java so that adding tags updates only GBFTag.java
>>+     *
>>+     * On adding new tags, implements new tag classes and update createTag()
>>      */
>>-    private String filterText(List list)
>>+    private static interface ITag
>>     {
>>-        StringBuffer buffer = new StringBuffer();
>>-
>>-        // go through the token working out what to do with them all
>>-        for (Iterator it = list.iterator(); it.hasNext();)
>>+        /**
>>+         * Sub-classes should implement this method to generate OSIS Object
>>+         */
>>+        public void updateOsisStack(Stack osisStack) throws JAXBException;
>>+    }
>>+    
>>+    /**
>>+     * Tag syntax: <RB>Words<RF>note<Rf>
>>+     */
>>+    private static class TextWithEmbeddedFootnote implements ITag
>>+    {
>>+        public void updateOsisStack(Stack stack) throws JAXBException
>>         {
>>-            Object token = it.next();
>>-            if (token instanceof String)
>>-            {
>>-                buffer.append((String) token);
>>-            }
>>+            Note note = JAXBUtil.factory().createNote();
>>+            note.setNoteType("x-StudyNote");
>>+            Element current = (Element) stack.peek();
>>+            
>>+            List list = JAXBUtil.getList(current);
>>+            list.add(note);
>>+            stack.push(note);
>>         }
>>-        
>>-        return buffer.toString();
>>     }
>>-
>>+    
>>     /**
>>-     * Get a list for the footnote
>>+     * Tag syntax: <RF>note<Rf>
>>      */
>>-    private List getTokensUntil(Iterator it, Tag end) throws JAXBException
>>+    private static class FootnoteStartTag implements ITag
>>     {
>>-        // take tokens off the list until end of list or FOOTNOTE_END
>>-        List ignored = new ArrayList();
>>-
>>-        while (true)
>>+        public void updateOsisStack(Stack stack) throws JAXBException
>>         {
>>-            if (!it.hasNext())
>>-            {
>>-                break;
>>-            }
>>-        
>>-            Object token = it.next();
>>-            if (token instanceof String)
>>+            Element current = (Element) stack.peek();
>>+            if (!(current instanceof Note))
>>             {
>>-                ignored.add(token);
>>-            }
>>-            else if (token instanceof Tag)
>>-            {
>>-                Tag tag = (Tag) token;
>>-                if (tag.equals(end))
>>-                {
>>-                    break;
>>-                }
>>-                else
>>-                {
>>-                    ignored.add(token);
>>-                }
>>-            }
>>-            else
>>-            {
>>-                throw new JAXBException("Failed to parse: "+token);
>>+                Note note = JAXBUtil.factory().createNote();
>>+                note.setNoteType("x-StudyNote");
>>+                
>>+                List list = JAXBUtil.getList(current);
>>+                list.add(note);
>>+                stack.push(note);
>>             }
>>         }
>>-
>>-        return ignored;
>>     }
>>     
>>     /**
>>-     * Create a list of strings and tags
>>-     * @param plain
>>-     * @return List
>>+     * Tag syntax: <RF>note<Rf>
>>      */
>>-    private List tokenize(String plain)
>>+    private static class FootnoteEndTag implements ITag
>>     {
>>-        List retval = new ArrayList();
>>-        String remains = plain;
>>-
>>-        while (true)
>>+        public void updateOsisStack(Stack stack) throws JAXBException
>>         {
>>-            int ltpos = remains.indexOf('<');
>>-            int gtpos = remains.indexOf('>');
>>+            Note note = (Note) stack.pop();
>>+            List list = JAXBUtil.getList(note);
>> 
>>-            if (ltpos == -1 && gtpos == -1)
>>-            {
>>-                // no more tags to decode
>>-                retval.add(remains);
>>-                break;
>>+            if (list.size() < 1) {
>>+                JAXBUtil.getList((Element)stack.peek()).remove(note);
>>             }
>>+        }
>>+    }
>> 
>>-            // check that we don't have unmatched tags
>>-            if (ltpos == -1 || gtpos == -1)
>>-            {
>>-                log.warn("ignoring unmatched '<' or '>' in gbf: "+remains);
>>-                retval.add(remains);
>>-                break;
>>-            }
>>-            
>>-            // check that the tags are in a sensible order
>>-            if (ltpos > gtpos)
>>-            {
>>-                log.warn("ignoring unmatched '<' or '>' in gbf: "+remains);
>>-                retval.add(remains);
>>-                break;
>>-            }
>>+    /**
>>+     * Tag syntax: <FI>note<Fi>
>>+     */
>>+    private static class ItalicStartTag implements ITag
>>+    {
>>+        public void updateOsisStack(Stack stack) throws JAXBException
>>+        {
>>+            // remarked, for the XSL does not present it correctly
>>+            // The XSL should translate it to <I>...</I> but now it translated
>>+            //  to <div>...</div>
>>+            /*
>>+            Seg seg = JAXBUtil.factory().createSeg();
>>+            Element current = (Element) stack.peek();
>> 
>>-            String start = remains.substring(0, ltpos);
>>-            retval.add(start);
>>+            List list = JAXBUtil.getList(current); 
>>+            list.add(seg);
>> 
>>-            String tag = remains.substring(ltpos+1, gtpos);
>>-            retval.add(new Tag(tag));
>>-            
>>-            remains = remains.substring(gtpos+1);
>>+            stack.push(seg);
>>+             */
>>         }
>>-
>>-        return retval;
>>     }
>> 
>>     /**
>>-     * A GBF Tag
>>+     * Tag syntax: <FI>note<Fi>
>>      */
>>-    private static class Tag
>>+    private static class ItalicEndTag implements ITag
>>     {
>>-        public Tag(String tag)
>>+        public void updateOsisStack(Stack stack)
>>         {
>>-            this.tag = tag;
>>+            // remarked, for the XSL does not translate it correctly
>>+            // stack.pop();
>>         }
>>+    }
>> 
>>-        public String getTag()
>>+    /**
>>+     * Tag syntax: Words<CM>
>>+     */
>>+    private static class ParagraphTag implements ITag
>>+    {
>>+        public void updateOsisStack(Stack stack) {
>>+            JAXBUtil.getList((Element)stack.peek()).add(Character.toString('¶'));
>>+        }
>>+    }
>>+    
>>+    /**
>>+     * Tag syntax: word<WHxxxx> or word<WGxxxx>
>>+     */
>>+    private static class StrongsWordRefTag implements ITag
>>+    {
>>+        public StrongsWordRefTag(String tagName)
>>         {
>>-            return tag;
>>+            tag = tagName.trim();
>>         }
>>-
>>-        public boolean equals(Object obj)
>>+        public void updateOsisStack(Stack stack) throws JAXBException
>>         {
>>-            if (obj == null)
>>-                return false;
>>-
>>-            if (obj.getClass() != this.getClass())
>>-                return false;
>>-
>>-            Tag that = (Tag) obj;
>>-            return this.tag.equals(that.tag);
>>+            Element ele = (Element) stack.peek();
>>+            List list = JAXBUtil.getList(ele);
>>+            if (list.isEmpty())
>>+            {
>>+                log.error("Source has problem for tag <" + tag + ">.");
>>+                return;
>>+            }
>>+            int lastIndex = list.size() - 1;
>>+            Object prevObj = list.get(lastIndex);
>>+            W word = null;
>>+            if (prevObj instanceof String)
>>+            {
>>+                word = JAXBUtil.factory().createW();
>>+                word.getContent().add(prevObj);
>>+                list.set(lastIndex, word);
>>+            }
>>+            else if (prevObj instanceof W) {
>>+                word = (W) prevObj;
>>+            }
>>+            else {
>>+                log.error("Source has problem for tag <" + tag + ">.");
>>+                return;
>>+            }
>>+            String existingLemma = word.getLemma();
>>+            StringBuffer newLemma = new StringBuffer();
>>+            if (existingLemma != null && existingLemma.length() > 0) {
>>+                newLemma.append(existingLemma).append('|');
>>+            }
>>+            newLemma.append("x-Strongs:").append(tag.substring(2));
>>+            word.setLemma(newLemma.toString());
>>         }
>>-        
>>-        public int hashCode()
>>+        private String tag;
>>+    }
>>+    
>>+    /**
>>+     * Tag syntax: word<WTxxxx>
>>+     */
>>+    private static class StrongsMorphRefTag implements ITag
>>+    {
>>+        public StrongsMorphRefTag(String tagName)
>>         {
>>-            return tag.hashCode();
>>+            tag = tagName.trim();
>>+        }
>>+        public void updateOsisStack(Stack stack) throws JAXBException
>>+        {
>>+            Element ele = (Element) stack.peek();
>>+            List list = JAXBUtil.getList(ele);
>>+            if (list.isEmpty())
>>+            {
>>+                log.error("Source has problem for tag <" + tag + ">.");
>>+                return;
>>+            }
>>+            int lastIndex = list.size() - 1;
>>+            Object prevObj = list.get(lastIndex);
>>+            W word = null;
>>+            if (prevObj instanceof String)
>>+            {
>>+                word = JAXBUtil.factory().createW();
>>+                word.getContent().add(prevObj);
>>+                list.set(lastIndex, word);
>>+            }
>>+            else if (prevObj instanceof W) {
>>+                word = (W) prevObj;
>>+            }
>>+            else {
>>+                log.error("Source has problem for tag <" + tag + ">.");
>>+                return;
>>+            }
>>+            String existingMorph = word.getMorph();
>>+            StringBuffer newMorph = new StringBuffer();
>>+            if (existingMorph != null && existingMorph.length() > 0) {
>>+                newMorph.append(existingMorph).append('|');
>>+            }
>>+            newMorph.append("x-StrongsMorph:T").append(tag.substring(2));
>>+            word.setMorph(newMorph.toString());
>>         }
>>-
>>         private String tag;
>>     }
>>     
>>-    private static final Tag PARAGRAPH = new Tag("CM");
>>-    private static final Tag FOOTNOTE_START = new Tag("RF");
>>-    private static final Tag FOOTNOTE_STOP = new Tag("Rf");
>>-    private static final Tag ITALICS_START = new Tag("FI");
>>-    private static final Tag ITALICS_STOP = new Tag("Fi");
>>+    /**
>>+     * Represent a trunc of bible text without any tags
>>+     */
>>+    private static class TextTag implements ITag
>>+    {
>>+        public TextTag(String textData)
>>+        {
>>+            text = textData;
>>+        }
>>+        public void updateOsisStack(Stack stack) throws JAXBException
>>+        {
>>+            Element ele = (Element) stack.peek();
>>+            List list = JAXBUtil.getList(ele);
>>+            list.add(text);
>>+        }
>>+        private String text;
>>+    }
>>+    
>>+    /**
>>+     * Unknown Tag. Either not supported tag or tag not defined in GBF specification
>>+     */
>>+    private static class UnknownTag implements ITag
>>+    {
>>+        public UnknownTag(String tagName)
>>+        {
>>+            tag = tagName;
>>+        }
>>+        public void updateOsisStack(Stack stack)
>>+        {
>>+            // unknown tags
>>+            log.warn("Ignoring tag of "+ tag);
>>+        }
>>+        private String tag;
>>+    }
>> 
>>     /**
>>      * The log stream
>>Index: jsword/java/jsword/org/crosswire/jsword/book/data/JAXBUtil.java
>>===================================================================
>>RCS file: /cvs/jsword/jsword/java/jsword/org/crosswire/jsword/book/data/JAXBUtil.java,v
>>retrieving revision 1.3
>>diff -u -r1.3 JAXBUtil.java
>>--- jsword/java/jsword/org/crosswire/jsword/book/data/JAXBUtil.java	17 May 2003 16:11:52 -0000	1.3
>>+++ jsword/java/jsword/org/crosswire/jsword/book/data/JAXBUtil.java	24 May 2003 05:45:32 -0000
>>@@ -12,9 +12,11 @@
>> import org.crosswire.common.util.Logger;
>> import org.crosswire.common.util.LogicError;
>> import org.crosswire.jsword.osis.Div;
>>+import org.crosswire.jsword.osis.Note;
>> import org.crosswire.jsword.osis.ObjectFactory;
>> import org.crosswire.jsword.osis.Seg;
>> import org.crosswire.jsword.osis.Verse;
>>+import org.crosswire.jsword.osis.W;
>> import org.crosswire.jsword.util.Project;
>> 
>> /**
>>@@ -148,6 +150,14 @@
>>         else if (current instanceof Div)
>>         {
>>             return ((Div) current).getContent();
>>+        }
>>+        else if (current instanceof Note)
>>+        {
>>+            return ((Note) current).getContent();
>>+        }
>>+        else if (current instanceof W)
>>+        {
>>+            return ((W) current).getContent();
>>         }
>>         
>>         log.error("unknown element: "+current.getClass().getName());
>>
> 
> 
> _______________________________________________
> jsword-devel mailing list
> jsword-devel@crosswire.org
> http://www.crosswire.org/mailman/listinfo/jsword-devel