1   /**
2    * Distribution License:
3    * JSword is free software; you can redistribute it and/or modify it under
4    * the terms of the GNU Lesser General Public License, version 2.1 or later
5    * as published by the Free Software Foundation. This program is distributed
6    * in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even
7    * the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
8    * See the GNU Lesser General Public License for more details.
9    *
10   * The License is available on the internet at:
11   *      http://www.gnu.org/copyleft/lgpl.html
12   * or by writing to:
13   *      Free Software Foundation, Inc.
14   *      59 Temple Place - Suite 330
15   *      Boston, MA 02111-1307, USA
16   *
17   * © CrossWire Bible Society, 2005 - 2016
18   *
19   */
20  package org.crosswire.common.xml;
21  
22  import java.io.IOException;
23  import java.io.InputStream;
24  import java.util.ArrayList;
25  import java.util.Collections;
26  import java.util.HashSet;
27  import java.util.List;
28  import java.util.Set;
29  import java.util.regex.Matcher;
30  import java.util.regex.Pattern;
31  
32  import org.crosswire.common.util.FileUtil;
33  import org.crosswire.common.util.PropertyMap;
34  import org.crosswire.common.util.ResourceUtil;
35  import org.jdom2.Document;
36  import org.jdom2.JDOMException;
37  import org.jdom2.input.SAXBuilder;
38  import org.jdom2.input.sax.XMLReaders;
39  import org.slf4j.Logger;
40  import org.slf4j.LoggerFactory;
41  import org.xml.sax.Attributes;
42  import org.xml.sax.ContentHandler;
43  import org.xml.sax.SAXException;
44  
45  /**
46   * Utilities for working with SAX XML parsing.
47   * 
48   * @see gnu.lgpl.License The GNU Lesser General Public License for details.
49   * @author Joe Walker
50   * @author DM Smith
51   */
52  public final class XMLUtil {
53      /**
54       * Prevent instantiation
55       */
56      private XMLUtil() {
57      }
58  
59      /**
60       * Get and load an XML file from the classpath and a few other places into a
61       * JDOM Document object.
62       * 
63       * @param subject
64       *            The name of the desired resource (without any extension)
65       * @return The requested resource
66       * @throws IOException
67       *             if there is a problem reading the file
68       * @throws JDOMException
69       *             If the resource is not valid XML
70       */
71      public static Document getDocument(String subject) throws JDOMException, IOException {
72          String resource = subject + FileUtil.EXTENSION_XML;
73          InputStream in = ResourceUtil.getResourceAsStream(resource);
74  
75          log.debug("Loading {}.xml from classpath: [OK]", subject);
76          // With JDom 1.x this passed true
77          SAXBuilder builder = new SAXBuilder(XMLReaders.DTDVALIDATING);
78          return builder.build(in);
79      }
80  
81      /**
82       * Serialize a SAXEventProvider into an XML String
83       * 
84       * @param provider
85       *            The source of SAX events
86       * @return a serialized string
87       * @throws SAXException 
88       */
89      public static String writeToString(SAXEventProvider provider) throws SAXException {
90          ContentHandler ser = new PrettySerializingContentHandler();
91          provider.provideSAXEvents(ser);
92          return ser.toString();
93      }
94  
95      /**
96       * Get the full name of the attribute, including the namespace if any.
97       * 
98       * @param attrs
99       *            the collection of attributes
100      * @param index
101      *            the index of the desired attribute
102      * @return the requested attribute
103      */
104     public static String getAttributeName(Attributes attrs, int index) {
105         String qName = attrs.getQName(index);
106         if (qName != null) {
107             return qName;
108         }
109         return attrs.getLocalName(index);
110     }
111 
112     /**
113      * Show the attributes of an element as debug
114      * @param attrs 
115      */
116     public static void debugSAXAttributes(Attributes attrs) {
117         for (int i = 0; i < attrs.getLength(); i++) {
118             log.debug("attr[{}]: {}={}", Integer.toString(i), attrs.getQName(i), attrs.getValue(i));
119         }
120     }
121 
122     /**
123      * Normalizes the given string
124      * @param s 
125      * @return the escaped string
126      */
127     public static String escape(String s) {
128         if (s == null) {
129             return s;
130         }
131         int len = s.length();
132         StringBuilder str = new StringBuilder(len);
133 
134         for (int i = 0; i < len; i++) {
135             char ch = s.charAt(i);
136             switch (ch) {
137             case '<':
138                 str.append("&lt;");
139                 break;
140 
141             case '>':
142                 str.append("&gt;");
143                 break;
144 
145             case '&':
146                 str.append("&amp;");
147                 break;
148 
149             case '"':
150                 str.append("&quot;");
151                 break;
152 
153             default:
154                 str.append(ch);
155             }
156         }
157 
158         return str.toString();
159     }
160 
161     /**
162      * For each entity in the input that is not allowed in XML, replace the
163      * entity with its unicode equivalent or remove it. For each instance of a
164      * bare &, replace it with &amp;<br>
165      * XML only allows 4 entities: &amp;amp;, &amp;quot;, &amp;lt; and &amp;gt;.
166      * 
167      * @param broken
168      *            the string to handle entities
169      * @return the string with entities appropriately fixed up
170      */
171     public static String cleanAllEntities(String broken) {
172         if (broken == null) {
173             return null;
174         }
175 
176         String working = broken;
177         int cleanfrom = 0;
178 
179         while (true) {
180             int amp = working.indexOf('&', cleanfrom);
181 
182             // If there are no more amps then we are done
183             if (amp == -1) {
184                 break;
185             }
186 
187             // Skip references of the kind &#ddd;
188             if (validCharacterEntityPattern.matcher(working.substring(amp)).find()) {
189                 cleanfrom = working.indexOf(';', amp) + 1;
190                 continue;
191             }
192 
193             int i = amp + 1;
194             while (true) {
195                 // if we are at the end of the string then just escape the '&';
196                 if (i >= working.length()) {
197                     // String entity = working.substring(amp);
198                     // String replace = guessEntity(entity);
199                     // DataPolice.report("replacing unterminated entity: '" +
200                     // entity + "' with: '" + replace + "'");
201 
202                     return working.substring(0, amp) + "&amp;" + working.substring(amp + 1);
203                 }
204 
205                 // if we have come to a ; then we have an entity
206                 // If it is something that xml can't handle then replace it.
207                 char c = working.charAt(i);
208                 if (c == ';') {
209                     String entity = working.substring(amp, i + 1);
210                     String replace = handleEntity(entity);
211                     // log.warn("replacing entity: '{}' with: '{}'", entity, replace);
212 
213                     working = working.substring(0, amp) + replace + working.substring(i + 1);
214                     break;
215                 }
216 
217                 // Did we end an entity without finding a closing ;
218                 // Then treat it as an '&' that needs to be replaced with &amp;
219                 if (!Character.isLetterOrDigit(c)) {
220                     // String entity = working.substring(amp, i);
221                     // String replace = "&amp;" + working.substring(amp + 1, i);
222                     // log.warn("replacing invalid entity: '{}' with: '{}': {}", entity, replace, broken);
223 
224                     working = working.substring(0, amp) + "&amp;" + working.substring(amp + 1);
225                     amp = i + 4; // account for the 4 extra characters
226                     break;
227                 }
228 
229                 i++;
230             }
231 
232             cleanfrom = amp + 1;
233         }
234 
235         return working;
236     }
237 
238     /**
239      * Remove all invalid characters in the input, replacing them with a space. XML has stringent
240      * requirements as to which characters are or are not allowed. The set of
241      * allowable characters are:<br>
242      * #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]<br>
243      * Note: Java handles to ￿
244      * 
245      * @param broken
246      *            the string to be cleaned
247      * @return the cleaned string
248      */
249     public static String cleanAllCharacters(String broken) {
250         return invalidCharacterPattern.matcher(broken).replaceAll(" ");
251     }
252 
253     /**
254      * Strip all closing tags from the end of the XML fragment, and then
255      * re-close all tags that are open at the end of the string.
256      * 
257      * @param broken
258      *            the string to be cleaned.
259      * @return cleaned string, or {@code null} if the string could not be
260      *         cleaned due to more broken XML
261      */
262     public static String recloseTags(String broken) {
263         String result = broken;
264         // remove closing tags from the end
265         while (result.matches(".*</[a-zA-Z]+>[ \t\r\n]*")) {
266             result = result.substring(0, result.lastIndexOf('<'));
267         }
268         // close tags again
269         List<String> openTags = new ArrayList<String>();
270         Matcher m = Pattern.compile("</?[a-zA-Z]+").matcher(result);
271         boolean lTagFound = false;
272         boolean lgTagFound = false;
273         while (m.find()) {
274             String match = m.group();
275             if (match.startsWith("</")) {
276                 if (openTags.size() == 0 && "</l".equals(match) && !lTagFound) {
277                     return recloseTags("<l>" + broken);
278                 }
279                 if (openTags.size() == 0 && "</lg".equals(match) && !lgTagFound) {
280                     return recloseTags("<lg>" + broken);
281                 }
282                 if (openTags.size() == 0) {
283                     return null;
284                 }
285                 String lastTag = openTags.remove(openTags.size() - 1);
286                 if (!("</" + lastTag).equals(match)) {
287                     return null;
288                 }
289             } else {
290                 int closePos = result.indexOf('>', m.end());
291                 if (closePos == -1) {
292                     return null;
293                 }
294                 while (Character.isWhitespace(result.charAt(closePos - 1))) {
295                     --closePos;
296                 }
297                 if (result.charAt(closePos - 1) != '/') {
298                     if ("<l".equals(match)) {
299                         lTagFound = true;
300                     }
301                     if ("<lg".equals(match)) {
302                         lgTagFound = true;
303                     }
304                     openTags.add(match.substring(1));
305                 }
306             }
307         }
308         Collections.reverse(openTags);
309         for (String openTag : openTags) {
310             result += "</" + openTag + ">";
311         }
312         return result;
313     }
314 
315     /**
316      * Common HTML tags such as &lt;br&gt;,&lt;hr&gt; and &lt;img&gt; may be
317      * left open causing XML parsing to fail. This method closes these tags.
318      * 
319      * @param broken
320      *            the string to be cleaned
321      * @return the cleaned string
322      */
323     public static String closeEmptyTags(String broken) {
324         if (broken == null) {
325             return null;
326         }
327 
328         return openHTMLTagPattern.matcher(broken).replaceAll("<$1$2/>");
329     }
330 
331     /**
332      * XML parse failed, so we can try getting rid of all the tags and having
333      * another go. We define a tag to start at a &lt; and end at the end of the
334      * next word (where a word is what comes in between spaces) that does not
335      * contain an = sign, or at a >, whichever is earlier.
336      * @param broken 
337      * @return the string without any tags
338      */
339     public static String cleanAllTags(String broken) {
340         if (broken == null) {
341             return null;
342         }
343 
344         String working = broken;
345 
346         allTags: while (true) {
347             int lt = working.indexOf('<');
348 
349             // If there are no more amps then we are done
350             if (lt == -1) {
351                 break allTags;
352             }
353 
354             // loop to find the end of this tag
355             int i = lt;
356             int startattr = -1;
357 
358             singletag: while (true) {
359                 i++;
360 
361                 // the tag can't exist past the end of the string
362                 if (i >= working.length()) {
363                     // go back one so we can safely chop
364                     i--;
365                     break singletag;
366                 }
367 
368                 char c = working.charAt(i);
369 
370                 // normal end of tag
371                 if (c == '>') {
372                     break singletag;
373                 }
374 
375                 // we declare end-of-tag if this 'word' is not an attribute
376                 if (c == ' ') {
377                     if (startattr == -1) {
378                         // NOTE(joe): should we skip over consecutive spaces?
379                         startattr = i;
380                     } else {
381                         // so we've already had a space indicating start of
382                         // attribute, so this must be the beginning of the next
383                         // NOTE(joe): no - spaces can exist in attr values
384                         String value = working.substring(startattr, i);
385                         if (value.indexOf('=') == -1) {
386                             // this 'attribute' does not contain an equals so
387                             // we call it a word and end the parse
388                             break singletag;
389                         }
390                     }
391                 }
392             }
393 
394             // So we have the end of the tag, delete it, but leave a space in it's place
395             // DataPolice.report("discarding tag: " + working.substring(lt, i + 1));
396             working = working.substring(0, lt) + " " + working.substring(i + 1);
397         }
398 
399         return working;
400     }
401 
402     /**
403      * Replace entity with its unicode equivalent, if it is not a valid XML
404      * entity. Otherwise strip it out. XML only allows 4 entities: &amp;amp;,
405      * &amp;quot;, &amp;lt; and &amp;gt;.
406      * 
407      * @param entity
408      *            the entity to be replaced
409      * @return the substitution for the entity, either itself, the unicode
410      *         equivalent or an empty string.
411      */
412     private static String handleEntity(String entity) {
413         if (goodEntities.contains(entity)) {
414             return entity;
415         }
416 
417         String replace = badEntities.get(entity);
418         if (replace != null) {
419             return replace;
420         }
421 
422         // replace unknown entities with a space
423         return " ";
424     }
425 
426     // Map entities to their unicode equivalent
427     private static Set<String> goodEntities = new HashSet<String>();
428     private static PropertyMap badEntities = new PropertyMap();
429     static {
430         // pre-defined XML entities
431         goodEntities.add("&quot;"); // quotation mark
432         goodEntities.add("&amp;"); // ampersand
433         goodEntities.add("&lt;"); // less-than sign
434         goodEntities.add("&gt;"); // greater-than sign
435 
436         // misc entities
437         badEntities.put("&euro;", "\u20AC"); // euro
438         badEntities.put("&lsquo;", "\u2018"); // left single quotation mark
439         badEntities.put("&rsquo;", "\u2019"); // right single quotation mark
440 
441         // Latin 1 entities
442         badEntities.put("&nbsp;", "\u00A0"); // no-break space
443         badEntities.put("&iexcl;", "\u00A1"); // inverted exclamation mark
444         badEntities.put("&cent;", "\u00A2"); // cent sign
445         badEntities.put("&pound;", "\u00A3"); // pound sign
446         badEntities.put("&curren;", "\u00A4"); // currency sign
447         badEntities.put("&yen;", "\u00A5"); // yen sign
448         badEntities.put("&brvbar;", "\u00A6"); // broken vertical bar
449         badEntities.put("&sect;", "\u00A7"); // section sign
450         badEntities.put("&uml;", "\u00A8"); // diaeresis
451         badEntities.put("&copy;", "\u00A9"); // copyright sign
452         badEntities.put("&ordf;", "\u00AA"); // feminine ordinal indicator
453         badEntities.put("&laquo;", "\u00AB"); // left-pointing double angle quotation mark
454         badEntities.put("&not;", "\u00AC"); // not sign
455         badEntities.put("&shy;", "\u00AD"); // soft hyphen
456         badEntities.put("&reg;", "\u00AE"); // registered sign
457         badEntities.put("&macr;", "\u00AF"); // macron
458         badEntities.put("&deg;", "\u00B0"); // degree sign
459         badEntities.put("&plusmn;", "\u00B1"); // plus-minus sign
460         badEntities.put("&sup2;", "\u00B2"); // superscript two
461         badEntities.put("&sup3;", "\u00B3"); // superscript three
462         badEntities.put("&acute;", "\u00B4"); // acute accent
463         badEntities.put("&micro;", "\u00B5"); // micro sign
464         badEntities.put("&para;", "\u00B6"); // pilcrow sign
465         badEntities.put("&middot;", "\u00B7"); // middle dot
466         badEntities.put("&cedil;", "\u00B8"); // cedilla
467         badEntities.put("&sup1;", "\u00B9"); // superscript one
468         badEntities.put("&ordm;", "\u00BA"); // masculine ordinal indicator
469         badEntities.put("&raquo;", "\u00BB"); // right-pointing double angle quotation mark
470         badEntities.put("&frac14;", "\u00BC"); // vulgar fraction one quarter
471         badEntities.put("&frac12;", "\u00BD"); // vulgar fraction one half
472         badEntities.put("&frac34;", "\u00BE"); // vulgar fraction three quarters
473         badEntities.put("&iquest;", "\u00BF"); // inverted question mark
474         badEntities.put("&Agrave;", "\u00C0"); // latin capital letter A with grave
475         badEntities.put("&Aacute;", "\u00C1"); // latin capital letter A with acute
476         badEntities.put("&Acirc;", "\u00C2"); // latin capital letter A with circumflex
477         badEntities.put("&Atilde;", "\u00C3"); // latin capital letter A with tilde
478         badEntities.put("&Auml;", "\u00C4"); // latin capital letter A with diaeresis
479         badEntities.put("&Aring;", "\u00C5"); // latin capital letter A with ring above
480         badEntities.put("&AElig;", "\u00C6"); // latin capital letter AE
481         badEntities.put("&Ccedil;", "\u00C7"); // latin capital letter C with cedilla
482         badEntities.put("&Egrave;", "\u00C8"); // latin capital letter E with grave
483         badEntities.put("&Eacute;", "\u00C9"); // latin capital letter E with acute
484         badEntities.put("&Ecirc;", "\u00CA"); // latin capital letter E with circumflex
485         badEntities.put("&Euml;", "\u00CB"); // latin capital letter E with diaeresis
486         badEntities.put("&Igrave;", "\u00CC"); // latin capital letter I with grave
487         badEntities.put("&Iacute;", "\u00CD"); // latin capital letter I with acute
488         badEntities.put("&Icirc;", "\u00CE"); // latin capital letter I with circumflex
489         badEntities.put("&Iuml;", "\u00CF"); // latin capital letter I with diaeresis
490         badEntities.put("&ETH;", "\u00D0"); // latin capital letter ETH
491         badEntities.put("&Ntilde;", "\u00D1"); // latin capital letter N with tilde
492         badEntities.put("&Ograve;", "\u00D2"); // latin capital letter O with grave
493         badEntities.put("&Oacute;", "\u00D3"); // latin capital letter O with acute
494         badEntities.put("&Ocirc;", "\u00D4"); // latin capital letter O with circumflex
495         badEntities.put("&Otilde;", "\u00D5"); // latin capital letter O with tilde
496         badEntities.put("&Ouml;", "\u00D6"); // latin capital letter O with diaeresis
497         badEntities.put("&times;", "\u00D7"); // multiplication sign
498         badEntities.put("&Oslash;", "\u00D8"); // latin capital letter O with stroke
499         badEntities.put("&Ugrave;", "\u00D9"); // latin capital letter U with grave
500         badEntities.put("&Uacute;", "\u00DA"); // latin capital letter U with acute
501         badEntities.put("&Ucirc;", "\u00DB"); // latin capital letter U with circumflex
502         badEntities.put("&Uuml;", "\u00DC"); // latin capital letter U with diaeresis
503         badEntities.put("&Yacute;", "\u00DD"); // latin capital letter Y with acute
504         badEntities.put("&THORN;", "\u00DE"); // latin capital letter THORN
505         badEntities.put("&szlig;", "\u00DF"); // latin small letter sharp s
506         badEntities.put("&agrave;", "\u00E0"); // latin small letter a with grave
507         badEntities.put("&aacute;", "\u00E1"); // latin small letter a with acute
508         badEntities.put("&acirc;", "\u00E2"); // latin small letter a with circumflex
509         badEntities.put("&atilde;", "\u00E3"); // latin small letter a with tilde
510         badEntities.put("&auml;", "\u00E4"); // latin small letter a with diaeresis
511         badEntities.put("&aring;", "\u00E5"); // latin small letter a with ring above
512         badEntities.put("&aelig;", "\u00E6"); // latin small letter ae
513         badEntities.put("&ccedil;", "\u00E7"); // latin small letter c with cedilla
514         badEntities.put("&egrave;", "\u00E8"); // latin small letter e with grave
515         badEntities.put("&eacute;", "\u00E9"); // latin small letter e with acute
516         badEntities.put("&ecirc;", "\u00EA"); // latin small letter e with circumflex
517         badEntities.put("&euml;", "\u00EB"); // latin small letter e with diaeresis
518         badEntities.put("&igrave;", "\u00EC"); // latin small letter i with grave
519         badEntities.put("&iacute;", "\u00ED"); // latin small letter i with acute
520         badEntities.put("&icirc;", "\u00EE"); // latin small letter i with circumflex
521         badEntities.put("&iuml;", "\u00EF"); // latin small letter i with diaeresis
522         badEntities.put("&eth;", "\u00F0"); // latin small letter eth
523         badEntities.put("&ntilde;", "\u00F1"); // latin small letter n with tilde
524         badEntities.put("&ograve;", "\u00F2"); // latin small letter o with grave
525         badEntities.put("&oacute;", "\u00F3"); // latin small letter o with acute
526         badEntities.put("&ocirc;", "\u00F4"); // latin small letter o with circumflex
527         badEntities.put("&otilde;", "\u00F5"); // latin small letter o with tilde
528         badEntities.put("&ouml;", "\u00F6"); // latin small letter o with diaeresis
529         badEntities.put("&divide;", "\u00F7"); // division sign
530         badEntities.put("&oslash;", "\u00F8"); // latin small letter o with stroke
531         badEntities.put("&ugrave;", "\u00F9"); // latin small letter u with grave
532         badEntities.put("&uacute;", "\u00FA"); // latin small letter u with acute
533         badEntities.put("&ucirc;", "\u00FB"); // latin small letter u with circumflex
534         badEntities.put("&uuml;", "\u00FC"); // latin small letter u with diaeresis
535         badEntities.put("&yacute;", "\u00FD"); // latin small letter y with acute
536         badEntities.put("&thorn;", "\u00FE"); // latin small letter thorn
537         badEntities.put("&yuml;", "\u00FF"); // latin small letter y with diaeresis
538     }
539 
540     /**
541      * Pattern for numeric entities.
542      */
543     private static Pattern validCharacterEntityPattern = Pattern.compile("^&#x?\\d{2,4};");
544 
545     /**
546      * Pattern that negates the allowable XML 4 byte unicode characters. Valid
547      * are: #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] |
548      * [#x10000-#x10FFFF]
549      */
550     private static Pattern invalidCharacterPattern = Pattern.compile("[^\t\r\n\u0020-\uD7FF\uE000-\uFFFD]");
551 
552     /**
553      * Pattern that matches open &lt;br&gt;,&lt;hr&gt; and &lt;img&gt; tags.
554      */
555     private static Pattern openHTMLTagPattern = Pattern.compile("<(img|hr|br)([^>]*)(?<!/)>");
556 
557     /**
558      * The log stream
559      */
560     private static final Logger log = LoggerFactory.getLogger(XMLUtil.class);
561 }
562