[jsword-svn] r1934 - in trunk: common/src/main/java/org/crosswire/common/xml jsword/src/main/java/org/crosswire/jsword/book/filter/thml
dmsmith at crosswire.org
dmsmith at crosswire.org
Tue Feb 24 04:39:08 MST 2009
Author: dmsmith
Date: 2009-02-24 04:39:08 -0700 (Tue, 24 Feb 2009)
New Revision: 1934
Modified:
trunk/common/src/main/java/org/crosswire/common/xml/XMLUtil.java
trunk/jsword/src/main/java/org/crosswire/jsword/book/filter/thml/THMLFilter.java
Log:
Allow ThML to contain unclosed <br>, <img> and <hr> tags
Modified: trunk/common/src/main/java/org/crosswire/common/xml/XMLUtil.java
===================================================================
--- trunk/common/src/main/java/org/crosswire/common/xml/XMLUtil.java 2009-02-24 03:13:27 UTC (rev 1933)
+++ trunk/common/src/main/java/org/crosswire/common/xml/XMLUtil.java 2009-02-24 11:39:08 UTC (rev 1934)
@@ -253,6 +253,23 @@
}
/**
+ * Rewrites every unclosed {@code <br>}, {@code <hr>} and {@code <img>} tag as a self-closed tag
+ * (e.g. {@code <br>} becomes {@code <br/>}, attributes are kept) so XML parsing does not fail on them.
+ *
+ * @param broken the string to be cleaned, may be null
+ * @return the cleaned string, or null if {@code broken} is null
+ */
+ public static String closeEmptyTags(String broken)
+ {
+ if (broken == null)
+ {
+ return null;
+ }
+
+ return openHTMLTagPattern.matcher(broken).replaceAll("<$1$2/>"); //$NON-NLS-1$
+ }
+
+ /**
* XML parse failed, so we can try getting rid of all the tags and having
* another go. We define a tag to start at a < and end at the end of the
* next word (where a word is what comes in between spaces) that does not
@@ -489,4 +506,9 @@
* Valid are: #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
*/
private static Pattern invalidCharacterPattern = Pattern.compile("[^\t\r\n\u0020-\uD7FF\uE000-\uFFFD]"); //$NON-NLS-1$
+
+ /**
+ * Pattern that matches {@code <br>}, {@code <hr>} and {@code <img>} tags that do not already end in {@code />}.
+ */
+ private static Pattern openHTMLTagPattern = Pattern.compile("<(img|hr|br)([^>]*)(?<!/)>"); //$NON-NLS-1$
}
Modified: trunk/jsword/src/main/java/org/crosswire/jsword/book/filter/thml/THMLFilter.java
===================================================================
--- trunk/jsword/src/main/java/org/crosswire/jsword/book/filter/thml/THMLFilter.java 2009-02-24 03:13:27 UTC (rev 1933)
+++ trunk/jsword/src/main/java/org/crosswire/jsword/book/filter/thml/THMLFilter.java 2009-02-24 11:39:08 UTC (rev 1934)
@@ -124,6 +124,11 @@
String clean = XMLUtil.cleanAllCharacters(plain);
Element ele = parse(book, key, clean, "cleaning text"); //$NON-NLS-1$
+ if (ele == null)
+ {
+ ele = parse(book, key, XMLUtil.closeEmptyTags(clean), "closing empty tags"); //$NON-NLS-1$
+ }
+
if (ele == null)
{
ele = cleanTags(book, key, clean);
More information about the jsword-svn mailing list