[jsword-devel] GBF tag support for Hebrew and Greek...

Joe Walker jsword-devel@crosswire.org
28 May 2003 19:09:59 +0100


Hi Jacky,

Thanks for the patch. I'm just back from holiday and I hope to apply it
very soon.

Joe.


On Sat, 2003-05-24 at 06:52, Jacky Cheung wrote:
> Hi,
> 
> Please find the patch for GBF tag support for Heb. and Greek attached. 
> There is a huge change in the GBFFilter class.  Thanks.
> 
> Best regards,
> Jacky
> 
> P.S.
> Now, the GUI can show hyperlinks for tagged words but cannot link to the 
> lexicon. Furthermore, the translated output of the text does not look good 
> enough. I know that somebody has written an XSL file to convert OSIS 
> XML to HTML.
> 
> 
> ______________________________________________________________________
> 
> Index: jsword/java/jsword/org/crosswire/jsword/book/data/GBFFilter.java
> ===================================================================
> RCS file: /cvs/jsword/jsword/java/jsword/org/crosswire/jsword/book/data/GBFFilter.java,v
> retrieving revision 1.7
> diff -u -r1.7 GBFFilter.java
> --- jsword/java/jsword/org/crosswire/jsword/book/data/GBFFilter.java	17 May 2003 15:00:14 -0000	1.7
> +++ jsword/java/jsword/org/crosswire/jsword/book/data/GBFFilter.java	24 May 2003 05:45:31 -0000
> @@ -3,7 +3,7 @@
>  
>  import java.util.ArrayList;
>  import java.util.Iterator;
> -import java.util.LinkedList;
> +import java.util.Stack;
>  import java.util.List;
>  
>  import javax.xml.bind.Element;
> @@ -13,6 +13,7 @@
>  import org.crosswire.common.util.LogicError;
>  import org.crosswire.jsword.osis.Note;
>  import org.crosswire.jsword.osis.Seg;
> +import org.crosswire.jsword.osis.W;
>  
>  /**
>   * Filter to convert GBF data to OSIS format.
> @@ -47,8 +48,17 @@
>      {
>          try
>          {
> -            List tokens = tokenize(plain);
> -            parseTokens(ele, tokens);
> +            Stack stack = new Stack();
> +            stack.push(ele);
> +            
> +            TagGenerator generator = new TagGenerator(plain);
> +            ITag tag = generator.getNextTag();
> +            while (tag != null) {
> +                tag.updateOsisStack(stack);
> +                tag = generator.getNextTag();
> +            }
> +            
> +            stack.pop();
>          }
>          catch (JAXBException ex)
>          {
> @@ -56,241 +66,367 @@
>          }
>      }
>  
> -    /**
> -     * Go through a list of tokens and add them to the listener
> -     */
> -    public void parseTokens(Element ele, List tokens) throws JAXBException, DataException
> -    {
> -        LinkedList stack = new LinkedList();
> -        stack.addFirst(ele);
> -
> -        // For notes
> -        int marker = 1;
> -
> -        // go through the token working out what to do with them all
> -        for (Iterator it = tokens.iterator(); it.hasNext();)
> -        {
> -            Object token = it.next();
> -            if (token instanceof String)
> +    private static class TagGenerator {
> +        public TagGenerator(String plain) {
> +            int lastIndex = plain.length() - 1;
> +            if (lastIndex >= 0 && plain.charAt(lastIndex) == ((char) 13))
>              {
> -                Element current = (Element) stack.getFirst();
> -                List list = JAXBUtil.getList(current); 
> -                list.add((String) token);
> +                plain = plain.substring(0, lastIndex);
>              }
> -            else if (token instanceof Tag)
> -            {
> -                Tag tag = (Tag) token;
> +            remains = plain;
> +        }
> +        
> +        /**
> +         * Get Next tags in the string
> +         */
> +        public ITag getNextTag() {
> +            if (retval.isEmpty()) {
> +                if (remains == null)
> +                    return null;
> +                parseNextTag();
> +            }
> +            return (ITag) retval.remove(0);
> +        }
>          
> -                // skip over the rest of the footnote
> -                if (tag.equals(FOOTNOTE_START))
> -                {
> -                    List footnote = getTokensUntil(it, FOOTNOTE_STOP);
> -                    String content = filterText(footnote);
> -                    
> -                    // This could be a marker or it could be the body of the note
> -                    // We tell which by string length. <= 1 is a marker which we
> -                    // ignore for simplicity
> -                    if (content.length() > 1)
> +        private void parseNextTag() {
> +            if (remains == null)
> +            {
> +                return;
> +            }
> +            
> +            int ltpos = remains.indexOf('<');
> +            int gtpos = remains.indexOf('>');
> +            
> +            if (ltpos == -1 && gtpos == -1) {
> +                // no more tags to decode
> +                retval.add(new TextTag(remains));
> +                remains = null;
> +                return;
> +            }
> +            
> +            // check that we don't have unmatched tags
> +            if (ltpos == -1 || gtpos == -1) {
> +                log.warn("ignoring unmatched '<' or '>' in gbf: "+remains);
> +                retval.add(new TextTag(remains));
> +                remains = null;
> +                return;
> +            }
> +            
> +            // check that the tags are in a sensible order
> +            if (ltpos > gtpos) {
> +                log.warn("ignoring unmatched '<' or '>' in gbf: "+remains);
> +                retval.add(new TextTag(remains));
> +                remains = null;
> +                return;
> +            }
> +            
> +            // generate tags
> +            String start = remains.substring(0, ltpos);
> +            int strLen = start.length();
> +            if (strLen > 0) {
> +                int beginIndex = 0;
> +                boolean inSepStr = isSeperator(start.charAt(0));
> +                // split words from seperators...
> +                // e.g., "a b c? e g." -> "a b c", "? ", "e g."
> +                //       "a b c<tag> e g." -> "a b c", tag, " ", "e g."
> +                for(int i=1; inSepStr && i<strLen; i++) {
> +                    char currentChar = start.charAt(i);
> +                    if (!isSeperator(currentChar))
>                      {
> -                        Note note = JAXBUtil.factory().createNote();
> -                        note.setN(""+(marker++));
> -                        note.getContent().add(content);
> -                        Element current = (Element) stack.getFirst();
> -                        
> -                        List list = JAXBUtil.getList(current); 
> -                        list.add(note);
> +                        retval.add(new TextTag(start.substring(beginIndex, i)));
> +                        beginIndex = i;
> +                        inSepStr = false;
>                      }
>                  }
> -                else if (tag.equals(PARAGRAPH))
> -                {
> -                    // ignore paragraph markers
> -                }                
> -                else if (tag.equals(ITALICS_START))
> -                {
> -                    Seg seg = JAXBUtil.factory().createSeg();
> -                    Element current = (Element) stack.getFirst();
> -                    
> -                    List list = JAXBUtil.getList(current); 
> -                    list.add(seg);
> -
> -                    stack.addFirst(seg);
> -                }                
> -                else if (tag.equals(ITALICS_STOP))
> -                {
> -                    Object top = stack.removeFirst();
> -                    
> -                    // Check that we are properly tree structured
> -                    if (!(top instanceof Seg))
> -                    {
> -                        throw new LogicError();
> -                    }
> -                }                
> -                else
> -                {
> -                    // unknown tags
> -                    log.warn("Ignoring tag of "+tag.getTag());
> +                if (beginIndex < strLen) {
> +                    retval.add(new TextTag(start.substring(beginIndex)));
>                  }
>              }
> -            else
> -            {
> -                throw new DataException(Msg.GBF_BADTOKEN, new Object[] { token });
> +            
> +            String tag = remains.substring(ltpos+1, gtpos);
> +            if (tag.length() > 0) {
> +                retval.add(createTag(tag));
>              }
> +            
> +            remains = remains.substring(gtpos+1);
>          }
> -
> -        stack.removeFirst();
> +        
> +        private boolean isSeperator(char c) {
> +            final String seperators = " ,:;.?!";
> +            return seperators.indexOf(c) >= 0;
> +        }
> +        
> +        private ITag createTag(String tag) {
> +            if (tag.equals("RB")) {
> +                return new TextWithEmbeddedFootnote();
> +            }
> +            if (tag.equals("RF")) {
> +                return new FootnoteStartTag();
> +            }
> +            if (tag.equals("Rf")) {
> +                return new FootnoteEndTag();
> +            }
> +            if (tag.equals("FI")) {
> +                return new ItalicStartTag();
> +            }
> +            if (tag.equals("Fi")) {
> +                return new ItalicEndTag();
> +            }
> +            if (tag.equals("CM")) {
> +                return new ParagraphTag();
> +            }
> +            if (tag.startsWith("WT")) {
> +                return new StrongsMorphRefTag(tag);
> +            }
> +            if (tag.startsWith("WH") || tag.startsWith("WG")) {
> +                return new StrongsWordRefTag(tag);
> +            }
> +            return new UnknownTag(tag);
> +        }
> +        private String remains;
> +        private List retval = new ArrayList();
>      }
>  
>      /**
> -     * Strip all the Tags from a List and return just the text
> +     * GBF Tag interface
> +     *
> +     * At present the number of supported tags is small.
> +     * If the number becomes large, refactor as follows:
> +     * 1. refactor ITag to public abstract class GBFTag
> +     * 2. move createTag() to GBFTag
> +     * 3. move tag classes to GBFTag.java so that adding tags updates only GBFTag.java
> +     *
> +     * When adding a new tag, implement a new tag class and update createTag()
>       */
> -    private String filterText(List list)
> +    private static interface ITag
>      {
> -        StringBuffer buffer = new StringBuffer();
> -
> -        // go through the token working out what to do with them all
> -        for (Iterator it = list.iterator(); it.hasNext();)
> +        /**
> +         * Sub-classes should implement this method to generate OSIS Object
> +         */
> +        public void updateOsisStack(Stack osisStack) throws JAXBException;
> +    }
> +    
> +    /**
> +     * Tag syntax: <RB>Words<RF>note<Rf>
> +     */
> +    private static class TextWithEmbeddedFootnote implements ITag
> +    {
> +        public void updateOsisStack(Stack stack) throws JAXBException
>          {
> -            Object token = it.next();
> -            if (token instanceof String)
> -            {
> -                buffer.append((String) token);
> -            }
> +            Note note = JAXBUtil.factory().createNote();
> +            note.setNoteType("x-StudyNote");
> +            Element current = (Element) stack.peek();
> +            
> +            List list = JAXBUtil.getList(current);
> +            list.add(note);
> +            stack.push(note);
>          }
> -        
> -        return buffer.toString();
>      }
> -
> +    
>      /**
> -     * Get a list for the footnote
> +     * Tag syntax: <RF>note<Rf>
>       */
> -    private List getTokensUntil(Iterator it, Tag end) throws JAXBException
> +    private static class FootnoteStartTag implements ITag
>      {
> -        // take tokens off the list until end of list or FOOTNOTE_END
> -        List ignored = new ArrayList();
> -
> -        while (true)
> +        public void updateOsisStack(Stack stack) throws JAXBException
>          {
> -            if (!it.hasNext())
> -            {
> -                break;
> -            }
> -        
> -            Object token = it.next();
> -            if (token instanceof String)
> +            Element current = (Element) stack.peek();
> +            if (!(current instanceof Note))
>              {
> -                ignored.add(token);
> -            }
> -            else if (token instanceof Tag)
> -            {
> -                Tag tag = (Tag) token;
> -                if (tag.equals(end))
> -                {
> -                    break;
> -                }
> -                else
> -                {
> -                    ignored.add(token);
> -                }
> -            }
> -            else
> -            {
> -                throw new JAXBException("Failed to parse: "+token);
> +                Note note = JAXBUtil.factory().createNote();
> +                note.setNoteType("x-StudyNote");
> +                
> +                List list = JAXBUtil.getList(current);
> +                list.add(note);
> +                stack.push(note);
>              }
>          }
> -
> -        return ignored;
>      }
>      
>      /**
> -     * Create a list of strings and tags
> -     * @param plain
> -     * @return List
> +     * Tag syntax: <RF>note<Rf>
>       */
> -    private List tokenize(String plain)
> +    private static class FootnoteEndTag implements ITag
>      {
> -        List retval = new ArrayList();
> -        String remains = plain;
> -
> -        while (true)
> +        public void updateOsisStack(Stack stack) throws JAXBException
>          {
> -            int ltpos = remains.indexOf('<');
> -            int gtpos = remains.indexOf('>');
> +            Note note = (Note) stack.pop();
> +            List list = JAXBUtil.getList(note);
>  
> -            if (ltpos == -1 && gtpos == -1)
> -            {
> -                // no more tags to decode
> -                retval.add(remains);
> -                break;
> +            if (list.size() < 1) {
> +                JAXBUtil.getList((Element)stack.peek()).remove(note);
>              }
> +        }
> +    }
>  
> -            // check that we don't have unmatched tags
> -            if (ltpos == -1 || gtpos == -1)
> -            {
> -                log.warn("ignoring unmatched '<' or '>' in gbf: "+remains);
> -                retval.add(remains);
> -                break;
> -            }
> -            
> -            // check that the tags are in a sensible order
> -            if (ltpos > gtpos)
> -            {
> -                log.warn("ignoring unmatched '<' or '>' in gbf: "+remains);
> -                retval.add(remains);
> -                break;
> -            }
> +    /**
> +     * Tag syntax: <FI>note<Fi>
> +     */
> +    private static class ItalicStartTag implements ITag
> +    {
> +        public void updateOsisStack(Stack stack) throws JAXBException
> +        {
> +            // remarked, for the XSL does not present it correctly
> +            // The XSL should translate it to <I>...</I> but now it translated
> +            //  to <div>...</div>
> +            /*
> +            Seg seg = JAXBUtil.factory().createSeg();
> +            Element current = (Element) stack.peek();
>  
> -            String start = remains.substring(0, ltpos);
> -            retval.add(start);
> +            List list = JAXBUtil.getList(current); 
> +            list.add(seg);
>  
> -            String tag = remains.substring(ltpos+1, gtpos);
> -            retval.add(new Tag(tag));
> -            
> -            remains = remains.substring(gtpos+1);
> +            stack.push(seg);
> +             */
>          }
> -
> -        return retval;
>      }
>  
>      /**
> -     * A GBF Tag
> +     * Tag syntax: <FI>note<Fi>
>       */
> -    private static class Tag
> +    private static class ItalicEndTag implements ITag
>      {
> -        public Tag(String tag)
> +        public void updateOsisStack(Stack stack)
>          {
> -            this.tag = tag;
> +            // remarked, for the XSL does not translate it correctly
> +            // stack.pop();
>          }
> +    }
>  
> -        public String getTag()
> +    /**
> +     * Tag syntax: Words<CM>
> +     */
> +    private static class ParagraphTag implements ITag
> +    {
> +        public void updateOsisStack(Stack stack) {
> +            JAXBUtil.getList((Element)stack.peek()).add(Character.toString('¶'));
> +        }
> +    }
> +    
> +    /**
> +     * Tag syntax: word<WHxxxx> or word<WGxxxx>
> +     */
> +    private static class StrongsWordRefTag implements ITag
> +    {
> +        public StrongsWordRefTag(String tagName)
>          {
> -            return tag;
> +            tag = tagName.trim();
>          }
> -
> -        public boolean equals(Object obj)
> +        public void updateOsisStack(Stack stack) throws JAXBException
>          {
> -            if (obj == null)
> -                return false;
> -
> -            if (obj.getClass() != this.getClass())
> -                return false;
> -
> -            Tag that = (Tag) obj;
> -            return this.tag.equals(that.tag);
> +            Element ele = (Element) stack.peek();
> +            List list = JAXBUtil.getList(ele);
> +            if (list.isEmpty())
> +            {
> +                log.error("Source has problem for tag <" + tag + ">.");
> +                return;
> +            }
> +            int lastIndex = list.size() - 1;
> +            Object prevObj = list.get(lastIndex);
> +            W word = null;
> +            if (prevObj instanceof String)
> +            {
> +                word = JAXBUtil.factory().createW();
> +                word.getContent().add(prevObj);
> +                list.set(lastIndex, word);
> +            }
> +            else if (prevObj instanceof W) {
> +                word = (W) prevObj;
> +            }
> +            else {
> +                log.error("Source has problem for tag <" + tag + ">.");
> +                return;
> +            }
> +            String existingLemma = word.getLemma();
> +            StringBuffer newLemma = new StringBuffer();
> +            if (existingLemma != null && existingLemma.length() > 0) {
> +                newLemma.append(existingLemma).append('|');
> +            }
> +            newLemma.append("x-Strongs:").append(tag.substring(2));
> +            word.setLemma(newLemma.toString());
>          }
> -        
> -        public int hashCode()
> +        private String tag;
> +    }
> +    
> +    /**
> +     * Tag syntax: word<WTxxxx>
> +     */
> +    private static class StrongsMorphRefTag implements ITag
> +    {
> +        public StrongsMorphRefTag(String tagName)
>          {
> -            return tag.hashCode();
> +            tag = tagName.trim();
> +        }
> +        public void updateOsisStack(Stack stack) throws JAXBException
> +        {
> +            Element ele = (Element) stack.peek();
> +            List list = JAXBUtil.getList(ele);
> +            if (list.isEmpty())
> +            {
> +                log.error("Source has problem for tag <" + tag + ">.");
> +                return;
> +            }
> +            int lastIndex = list.size() - 1;
> +            Object prevObj = list.get(lastIndex);
> +            W word = null;
> +            if (prevObj instanceof String)
> +            {
> +                word = JAXBUtil.factory().createW();
> +                word.getContent().add(prevObj);
> +                list.set(lastIndex, word);
> +            }
> +            else if (prevObj instanceof W) {
> +                word = (W) prevObj;
> +            }
> +            else {
> +                log.error("Source has problem for tag <" + tag + ">.");
> +                return;
> +            }
> +            String existingMorph = word.getMorph();
> +            StringBuffer newMorph = new StringBuffer();
> +            if (existingMorph != null && existingMorph.length() > 0) {
> +                newMorph.append(existingMorph).append('|');
> +            }
> +            newMorph.append("x-StrongsMorph:T").append(tag.substring(2));
> +            word.setMorph(newMorph.toString());
>          }
> -
>          private String tag;
>      }
>      
> -    private static final Tag PARAGRAPH = new Tag("CM");
> -    private static final Tag FOOTNOTE_START = new Tag("RF");
> -    private static final Tag FOOTNOTE_STOP = new Tag("Rf");
> -    private static final Tag ITALICS_START = new Tag("FI");
> -    private static final Tag ITALICS_STOP = new Tag("Fi");
> +    /**
> +     * Represent a chunk of Bible text without any tags
> +     */
> +    private static class TextTag implements ITag
> +    {
> +        public TextTag(String textData)
> +        {
> +            text = textData;
> +        }
> +        public void updateOsisStack(Stack stack) throws JAXBException
> +        {
> +            Element ele = (Element) stack.peek();
> +            List list = JAXBUtil.getList(ele);
> +            list.add(text);
> +        }
> +        private String text;
> +    }
> +    
> +    /**
> +     * Unknown Tag. Either not supported tag or tag not defined in GBF specification
> +     */
> +    private static class UnknownTag implements ITag
> +    {
> +        public UnknownTag(String tagName)
> +        {
> +            tag = tagName;
> +        }
> +        public void updateOsisStack(Stack stack)
> +        {
> +            // unknown tags
> +            log.warn("Ignoring tag of "+ tag);
> +        }
> +        private String tag;
> +    }
>  
>      /**
>       * The log stream
> Index: jsword/java/jsword/org/crosswire/jsword/book/data/JAXBUtil.java
> ===================================================================
> RCS file: /cvs/jsword/jsword/java/jsword/org/crosswire/jsword/book/data/JAXBUtil.java,v
> retrieving revision 1.3
> diff -u -r1.3 JAXBUtil.java
> --- jsword/java/jsword/org/crosswire/jsword/book/data/JAXBUtil.java	17 May 2003 16:11:52 -0000	1.3
> +++ jsword/java/jsword/org/crosswire/jsword/book/data/JAXBUtil.java	24 May 2003 05:45:32 -0000
> @@ -12,9 +12,11 @@
>  import org.crosswire.common.util.Logger;
>  import org.crosswire.common.util.LogicError;
>  import org.crosswire.jsword.osis.Div;
> +import org.crosswire.jsword.osis.Note;
>  import org.crosswire.jsword.osis.ObjectFactory;
>  import org.crosswire.jsword.osis.Seg;
>  import org.crosswire.jsword.osis.Verse;
> +import org.crosswire.jsword.osis.W;
>  import org.crosswire.jsword.util.Project;
>  
>  /**
> @@ -148,6 +150,14 @@
>          else if (current instanceof Div)
>          {
>              return ((Div) current).getContent();
> +        }
> +        else if (current instanceof Note)
> +        {
> +            return ((Note) current).getContent();
> +        }
> +        else if (current instanceof W)
> +        {
> +            return ((W) current).getContent();
>          }
>          
>          log.error("unknown element: "+current.getClass().getName());
>