[sword-svn] r374 - trunk/modules/python

chrislit at crosswire.org chrislit at crosswire.org
Fri Aug 10 05:55:17 MST 2012


Author: chrislit
Date: 2012-08-10 05:55:16 -0700 (Fri, 10 Aug 2012)
New Revision: 374

Modified:
   trunk/modules/python/usfm2osis.py
Log:
print unhandled tags on a file-by-file basis (in debug mode)
attempted to fix raw strings as appropriate


Modified: trunk/modules/python/usfm2osis.py
===================================================================
--- trunk/modules/python/usfm2osis.py	2012-08-10 11:30:42 UTC (rev 373)
+++ trunk/modules/python/usfm2osis.py	2012-08-10 12:55:16 UTC (rev 374)
@@ -255,11 +255,11 @@
 
     def cvtPreprocess(osis, relaxedConformance):
         # lines should never start with non-tags
-        osis = re.sub(r'\n\s*([^\\\s])', r' \1', osis)  # TODO: test this
+        osis = re.sub('\n\s*([^\\\s])', r' \1', osis)  # TODO: test this
         # convert CR to LF
-        osis = osis.replace(r'\r', r'\n')
+        osis = osis.replace('\r', '\n')
         # lines should never end with whitespace (other than \n)
-        osis = re.sub(r'\s+\n', r'\n', osis)
+        osis = re.sub('\s+\n', '\n', osis)
         # XML-encode as necessary
         osis = osis.replace('&', '&amp;')
         osis = osis.replace('<', '&lt;')
@@ -310,17 +310,17 @@
         """
         global loc2osisBk, osis2locBk
         # \id_<CODE>_(Name of file, Book name, Language, Last edited, Date etc.)
-        osis = re.sub(r'\\id\s+([A-Z0-9]{3})\b\s*([^\\\n]*?)\n(.*)(?=\\id|$)', lambda m: u'﷐<div type="book" osisID="' + bookDict[m.group(1)] + '">\n' + (('<!-- id comment - ' + m.group(2) + ' -->\n') if m.group(2) else '') +  m.group(3) + u'</div type="book">﷐\n' , osis, flags=re.DOTALL)
+        osis = re.sub(r'\\id\s+([A-Z0-9]{3})\b\s*([^\\'+'\n'+']*?)'+'\n'+r'(.*)(?=\\id|$)', lambda m: u'﷐<div type="book" osisID="' + bookDict[m.group(1)] + '">\n' + (('<!-- id comment - ' + m.group(2) + ' -->\n') if m.group(2) else '') +  m.group(3) + u'</div type="book">﷐\n' , osis, flags=re.DOTALL)#@
         # keep a copy of the OSIS book abbreviation for below (\toc3 processing) to store for mapping localized book names to/from OSIS
         osisBook = re.search(r'\\id\s+([A-Z0-9]{3})', osis)
         if osisBook:
             osisBook = bookDict[osisBook.group(1)]
 
         # \ide_<ENCODING>
-        osis = re.sub(r'\\ide\b.*\n', r'', osis) # delete, since this was handled above
+        osis = re.sub(r'\\ide\b.*'+'\n', '', osis) # delete, since this was handled above
 
         # \sts_<STATUS CODE>
-        osis = re.sub(r'\\sts\b\s+(.+)\s*\n', r'<milestone type="x-usfm-sts" n="\1"/>\n', osis)
+        osis = re.sub(r'\\sts\b\s+(.+)\s*'+'\n', r'<milestone type="x-usfm-sts" n="\1"/>'+'\n', osis)
 
         # \rem_text...
         osis = re.sub(r'\\rem\b\s+(.+)', r'<!-- rem - \1 -->', osis)
@@ -330,23 +330,23 @@
             osis = re.sub(r'\\restore\b\s+(.+)', r'<!-- restore - \1 -->', osis)
 
         # \h#_text...
-        osis = re.sub(r'\\h\b\s+(.+)\s*\n', r'<title type="runningHead">\1</title>\n', osis)
-        osis = re.sub(r'\\h(\d)\b\s+(.+)\s*\n', r'<title type="runningHead" n="\1">\2</title>\n', osis)
+        osis = re.sub(r'\\h\b\s+(.+)\s*'+'\n', r'<title type="runningHead">\1</title>'+'\n', osis)
+        osis = re.sub(r'\\h(\d)\b\s+(.+)\s*'+'\n', r'<title type="runningHead" n="\1">\2</title>'+'\n', osis)
 
         # \toc1_text...
-        osis = re.sub(r'\\toc1\b\s+(.+)\s*\n', r'<milestone type="x-usfm-toc1" n="\1"/>\n', osis)
+        osis = re.sub(r'\\toc1\b\s+(.+)\s*'+'\n', r'<milestone type="x-usfm-toc1" n="\1"/>'+'\n', osis)
 
         # \toc2_text...
-        osis = re.sub(r'\\toc2\b\s+(.+)\s*\n', r'<milestone type="x-utfm-toc2" n="\1"/>\n', osis)
+        osis = re.sub(r'\\toc2\b\s+(.+)\s*'+'\n', r'<milestone type="x-utfm-toc2" n="\1"/>'+'\n', osis)
 
         # \toc3_text...
-        locBook = re.search(r'\\toc3\b\s+(.+)\s*\n', osis)
+        locBook = re.search(r'\\toc3\b\s+(.+)\s*'+'\n', osis)
         if locBook:
             locBook = locBook.group(1)
             if osisBook:
                 osis2locBk[osisBook]=locBook
                 loc2osisBk[locBook]=osisBook
-        osis = re.sub(r'\\toc3\b\s+(.+)\s*\n', lambda m: r'<milestone type="x-usfm-toc3" n="\1"/>\n', osis)
+        osis = re.sub(r'\\toc3\b\s+(.+)\s*'+'\n', lambda m: r'<milestone type="x-usfm-toc3" n="\1"/>'+'\n', osis)
 
         return osis
 
@@ -357,7 +357,7 @@
         supported: \imt#, \is#, \ip, \ipi, \im, \imi, \ipq, \imq, \ipr, \iq#, \ib, \ili#, \iot, \io#, \ior...\ior*, \iex, \iqt...\iqt*, \imte, \ie
         """
         # \imt#_text...
-        osis = re.sub(r'\\imt(\d?)\s+(.+)', lambda m: r'<title ' + (r'level="'+m.group(1)+r'" ' if m.group(1) else r'') + r'type="main" subType="x-introduction">' + m.group(2) + r'</title>', osis)
+        osis = re.sub(r'\\imt(\d?)\s+(.+)', lambda m: '<title ' + ('level="'+m.group(1)+'" ' if m.group(1) else '') + 'type="main" subType="x-introduction">' + m.group(2) + '</title>', osis)
 
         # \is#_text...
         osis = re.sub(r'\\is1?\s+(.+)', lambda m: u'﷚<div type="section" subType="x-introduction"><title>' + m.group(1) + '</title>', osis)
@@ -388,10 +388,10 @@
         osis = re.sub(r'\\iq(\d)\b\s*(.*?)(?=(['+u'﷐﷑﷓﷔'+r']|\\i?q[\d\s]|<l\b|<lb\b|<title\b))', r'<l level="\1" subType="x-introduction">\2</l>', osis, flags=re.DOTALL)
 
         # \ib
-        osis = re.sub(r'\\ib\b\s?', r'<lb type="x-p"/>', osis)
+        osis = re.sub(r'\\ib\b\s?', '<lb type="x-p"/>', osis)
         osis = osis.replace('\n</l>', '</l>\n')
         osis = re.sub(u'(<l [^﷐﷑﷓﷔]+</l>)', r'<lg>\1</lg>', osis, flags=re.DOTALL)
-        osis = re.sub('(<lg>.+?</lg>)', lambda m: m.group(1).replace(r'<lb type="x-p"/>', r'</lg><lg>'), osis, flags=re.DOTALL) # re-handle \b that occurs within <lg>
+        osis = re.sub('(<lg>.+?</lg>)', lambda m: m.group(1).replace('<lb type="x-p"/>', '</lg><lg>'), osis, flags=re.DOTALL) # re-handle \b that occurs within <lg>
 
         # \ili#_text...
         osis = re.sub(r'\\ili\b\s*(.*?)(?=(['+u'﷐﷑﷓﷔'+r']|\\ili[\d\s]|<lb\b|<title\b))', r'<item type="x-indent-1" subType="x-introduction">\1</item>', osis, flags=re.DOTALL)
@@ -406,7 +406,7 @@
         osis = re.sub(r'\\iot\b\s*(.*?)(?=(['+u'﷐﷑﷓﷔'+r']|\\io[t\d\s]|<lb\b|<title\b))', r'<item type="head">\1</item type="head">', osis, flags=re.DOTALL)
         osis = osis.replace('\n</item>', '</item>\n')
         osis = re.sub(u'(<item [^﷐﷑﷓﷔]+</item>)', r'<div type="outline"><list>\1</list></div>', osis, flags=re.DOTALL)
-        osis = re.sub(r'item type="head"', r'head', osis)
+        osis = re.sub('item type="head"', 'head', osis)
 
         # \ior_text...\ior*
         osis = re.sub(r'\\ior\b\s+(.+?)\\ior\*', r'<reference>\1</reference>', osis, flags=re.DOTALL)
@@ -418,10 +418,10 @@
         osis = re.sub(r'\\iqt\s+(.+?)\\iqt\*', r'<q subType="x-introduction">\1</q>', osis, flags=re.DOTALL)
 
         # \imte#_text...
-        osis = re.sub(r'\\imte(\d?)\b\s+(.+)', lambda m: r'<title ' + (r'level="'+m.group(1)+r'" ' if m.group(1) else r'') + r'type="main" subType="x-introduction-end">' + m.group(2) + r'</title>', osis)
+        osis = re.sub(r'\\imte(\d?)\b\s+(.+)', lambda m: '<title ' + ('level="'+m.group(1)+'" ' if m.group(1) else '') + 'type="main" subType="x-introduction-end">' + m.group(2) + '</title>', osis)
 
         # \ie
-        osis = re.sub(r'\\ie\b\s*', r'<milestone type="x-usfm-ie"/>', osis)
+        osis = re.sub(r'\\ie\b\s*', '<milestone type="x-usfm-ie"/>', osis)
 
         return osis
 
@@ -462,22 +462,22 @@
         osis = re.sub(u'(﷞[^﷕﷐﷖﷗﷘﷙﷚﷛﷜﷝﷞]+)', r'\1'+u'</div>﷞\n', osis, re.DOTALL)
 
         # \sr_text...
-        osis = re.sub(r'\\sr\s+(.+)', u'﷔<title type="scope"><reference>'+r'\1</reference></title>', osis)
+        osis = re.sub(r'\\sr\s+(.+)', ur'﷔<title type="scope"><reference>\1</reference></title>', osis)
         # \r_text...
-        osis = re.sub(r'\\r\s+(.+)', u'﷔<title type="parallel"><reference type="parallel">'+r'\1</reference></title>', osis)
+        osis = re.sub(r'\\r\s+(.+)', ur'﷔<title type="parallel"><reference type="parallel">\1</reference></title>', osis)
         # \rq_text...\rq*
-        osis = re.sub(r'\\rq\s+(.+?)\\rq\*', u'<reference type="source">'+r'\1</reference>', osis, flags=re.DOTALL)
+        osis = re.sub(r'\\rq\s+(.+?)\\rq\*', ur'<reference type="source">\1</reference>', osis, flags=re.DOTALL)
 
         # \d_text...
-        osis = re.sub(r'\\d\s+(.+)', u'﷔<title canonical="true" type="psalm">'+r'\1</title>', osis)
+        osis = re.sub(r'\\d\s+(.+)', ur'﷔<title canonical="true" type="psalm">\1</title>', osis)
 
         # \sp_text...
         osis = re.sub(r'\\sp\s+(.+)', r'<speaker>\1</speaker>', osis)
 
         # \mt#_text...
-        osis = re.sub(r'\\mt(\d?)\s+(.+)', lambda m: r'<title ' + (r'level="'+m.group(1)+r'" ' if m.group(1) else r'') + r'type="main">' + m.group(2) + r'</title>', osis)
+        osis = re.sub(r'\\mt(\d?)\s+(.+)', lambda m: '<title ' + ('level="'+m.group(1)+'" ' if m.group(1) else '') + 'type="main">' + m.group(2) + '</title>', osis)
         # \mte#_text...
-        osis = re.sub(r'\\mte(\d?)\s+(.+)', lambda m: r'<title ' + (r'level="'+m.group(1)+r'" ' if m.group(1) else r'') + r'type="main" subType="x-end">' + m.group(2) + r'</title>', osis)
+        osis = re.sub(r'\\mte(\d?)\s+(.+)', lambda m: '<title ' + ('level="'+m.group(1)+'" ' if m.group(1) else '') + 'type="main" subType="x-end">' + m.group(2) + '</title>', osis)
 
         return osis
 
@@ -498,7 +498,7 @@
             if cp:
                 ctext = re.sub(r'\\cp\s+(.+?)\\cp*', '', ctext, flags=re.DOTALL)
                 cp = cp.group(1)
-                ctext = re.sub(r'"\$BOOK\$\.([^"\.]+)"', r'"$BOOK$.'+ca+'"', ctext)
+                ctext = re.sub(r'"\$BOOK\$\.([^"\.]+)"', '"$BOOK$.'+ca+'"', ctext)
             ca = re.search(r'\\ca\s+(.+?)\\ca\*', ctext)
             if ca:
                 ctext = re.sub(r'\\ca\s+(.+?)\\ca*', '', ctext, flags=re.DOTALL)
@@ -514,7 +514,7 @@
         osis = re.sub(r'\\cd\b\s+(.+)', u'﷔<title type="x-description">'+r'\1</title>', osis)
 
         # \v_#
-        osis = re.sub(r'\\v\s+([^\s]+)\b\s*(.+?)(?=(\\v\s+|</div type="book"|<chapter eID))', lambda m: u'﷒<verse osisID="$BOOK$.$CHAP$.' + m.group(1) + r'" sID="$BOOK$.$CHAP$.' + m.group(1) + r'"/>' + m.group(2) +  r'<verse eID="$BOOK$.$CHAP$.' + m.group(1) + u'"/>﷒\n', osis, flags=re.DOTALL)
+        osis = re.sub(r'\\v\s+([^\s]+)\b\s*(.+?)(?=(\\v\s+|</div type="book"|<chapter eID))', lambda m: u'﷒<verse osisID="$BOOK$.$CHAP$.' + m.group(1) + '" sID="$BOOK$.$CHAP$.' + m.group(1) + '"/>' + m.group(2) +  '<verse eID="$BOOK$.$CHAP$.' + m.group(1) + u'"/>﷒\n', osis, flags=re.DOTALL)
 
         # \vp_#\vp*
         # \va_#\va*
@@ -524,7 +524,7 @@
             if vp:
                 vtext = re.sub(r'\\vp\s+(.+?)\\vp*', '', vtext, flags=re.DOTALL)
                 vp = vp.group(1)
-                vtext = re.sub(r'"\$BOOK\$\.\$CHAP\$\.([^"\.]+)"', r'"$BOOK$.$CHAP$.'+va+'"', vtext)
+                vtext = re.sub(r'"\$BOOK\$\.\$CHAP\$\.([^"\.]+)"', '"$BOOK$.$CHAP$.'+va+'"', vtext)
             va = re.search(r'\\va\s+(.+?)\\va\*', vtext)
             if va:
                 vtext = re.sub(r'\\va\s+(.+?)\\va*', '', vtext, flags=re.DOTALL)
@@ -570,7 +570,7 @@
         osis = re.sub(u'(<item [^﷐﷑﷓﷔]+</item>)', r'<list>\1</list>', osis, flags=re.DOTALL)
 
         # \b
-        osis = re.sub(r'\\b\b\s?', r'<lb type="x-p"/>', osis)
+        osis = re.sub(r'\\b\b\s?', '<lb type="x-p"/>', osis)
 
         if relaxedConformance:
             # TODO: \phi: DEP: Paragraph text, indented with hanging indent
@@ -598,13 +598,13 @@
         # \qc_text...
         # \qm#(_text...)
         qType = {'qr':'x-right', 'qc':'x-center', 'qm':'x-embedded" level="1', 'qm1':'x-embedded" level="1', 'qm2':'x-embedded" level="2', 'qm3':'x-embedded" level="3', 'qm4':'x-embedded" level="4', 'qm5':'x-embedded" level="5'}
-        osis = re.sub(r'\\(qr|qc|qm\d)\b\s*(.*?)(?=(['+u'﷐﷑﷓﷔'+r']|\\q[\d\s]|<l\b|<lb\b|<title\b))', lambda m: r'<l type="' + qType[m.group(1)] + '">' + m.group(2) + '</l>', osis, flags=re.DOTALL)
+        osis = re.sub(r'\\(qr|qc|qm\d)\b\s*(.*?)(?=(['+u'﷐﷑﷓﷔'+r']|\\q[\d\s]|<l\b|<lb\b|<title\b))', lambda m: '<l type="' + qType[m.group(1)] + '">' + m.group(2) + '</l>', osis, flags=re.DOTALL)
 
         osis = osis.replace('\n</l>', '</l>\n')
         osis = re.sub(u'(<l [^﷐﷑﷓﷔]+</l>)', r'<lg>\1</lg>', osis, flags=re.DOTALL)
 
         # \b
-        osis = re.sub('(<lg>.+?</lg>)', lambda m: m.group(1).replace(r'<lb type="x-p"/>', r'</lg><lg>'), osis, flags=re.DOTALL) # re-handle \b that occurs within <lg>
+        osis = re.sub('(<lg>.+?</lg>)', lambda m: m.group(1).replace('<lb type="x-p"/>', '</lg><lg>'), osis, flags=re.DOTALL) # re-handle \b that occurs within <lg>
 
         # \qa_text...
         osis = re.sub(r'\\qa\s+(.+)', u'﷔<title type="acrostic">'+r'\1</title>', osis)
@@ -628,7 +628,7 @@
         # \tc#_text...
         # \tcr#_text...
         tType = {'th':' role="label"', 'thr':' role="label" type="x-right"', 'tc':'', 'tcr':' type="x-right'}
-        osis = re.sub(r'\\(thr?|tcr?)\d*\b\s*(.*?)(?=(\\t[hc]|</row))', lambda m: r'<cell' + tType[m.group(1)] + '>' + m.group(2) + '</cell>', osis, flags=re.DOTALL)
+        osis = re.sub(r'\\(thr?|tcr?)\d*\b\s*(.*?)(?=(\\t[hc]|</row))', lambda m: '<cell' + tType[m.group(1)] + '>' + m.group(2) + '</cell>', osis, flags=re.DOTALL)
 
         return osis
 
@@ -646,7 +646,7 @@
         note = re.sub(r'\\fqa\b\s(.+?)(?=(\\f|'+u'﷟))', u'﷟'+r'<rdg type="alternate">\1</rdg>', note)
 
         # \ft_
-        note = re.sub(r'\\ft\s', r'', note)
+        note = re.sub(r'\\ft\s', '', note)
 
         # \fr_##SEP##
         note = re.sub(r'\\fr\b\s(.+?)(?=(\\f|'+u'﷟))', u'﷟'+r'<reference>\1</reference>', note)
@@ -665,14 +665,14 @@
         note = re.sub(r'\\fv\b\s(.+?)(?=(\\f|'+u'﷟))', u'﷟'+r'<hi type="super">\1</hi>', note)
 
         if relaxedConformance:
-            note = note.replace(r'\fq*', r'')
-            note = note.replace(r'\fqa*', r'')
-            note = note.replace(r'\ft*', r'')
-            note = note.replace(r'\fr*', r'')
-            note = note.replace(r'\fk*', r'')
-            note = note.replace(r'\fl*', r'')
-            note = note.replace(r'\fp*', r'')
-            note = note.replace(r'\fv*', r'')
+            note = note.replace(r'\fq*', '')
+            note = note.replace(r'\fqa*', '')
+            note = note.replace(r'\ft*', '')
+            note = note.replace(r'\fr*', '')
+            note = note.replace(r'\fk*', '')
+            note = note.replace(r'\fl*', '')
+            note = note.replace(r'\fp*', '')
+            note = note.replace(r'\fv*', '')
 
         note = note.replace(u'﷟', '')
         return note
@@ -684,10 +684,10 @@
         supported:\f...\f*, \fe...\fe*, \fr, \fk, \fq, \fqa, \fl, \fp, \fv, \ft, \fdc...\fdc*, \fm...\fm*
         """
         # \f_+_...\f*
-        osis = re.sub(r'\\f\s+([^\s\\]+)?\s*(.+?)\s*\\f\*', lambda m: r'<note' + ((' n=""') if (m.group(1) == u'-') else ('' if (m.group(1) == '+') else (' n="' + m.group(1) + '"'))) + ' placement="foot">' + m.group(2) + u'﷟</note>', osis, flags=re.DOTALL)
+        osis = re.sub(r'\\f\s+([^\s\\]+)?\s*(.+?)\s*\\f\*', lambda m: '<note' + ((' n=""') if (m.group(1) == u'-') else ('' if (m.group(1) == '+') else (' n="' + m.group(1) + '"'))) + ' placement="foot">' + m.group(2) + u'﷟</note>', osis, flags=re.DOTALL)
 
         # \fe_+_...\fe*
-        osis = re.sub(r'\\fe\s+([^\s\\]+?)\s*(.+?)\s*\\fe\*', lambda m: r'<note' + ((' n=""') if (m.group(1) == u'-') else ('' if (m.group(1) == '+') else (' n="' + m.group(1) + '"'))) + ' placement="end">' + m.group(2) + u'﷟</note>', osis, flags=re.DOTALL)
+        osis = re.sub(r'\\fe\s+([^\s\\]+?)\s*(.+?)\s*\\fe\*', lambda m: '<note' + ((' n=""') if (m.group(1) == u'-') else ('' if (m.group(1) == '+') else (' n="' + m.group(1) + '"'))) + ' placement="end">' + m.group(2) + u'﷟</note>', osis, flags=re.DOTALL)
 
         osis = re.sub(r'(<note\b[^>]*?>.*?</note>)', lambda m: processNote(m.group(1)), osis, flags=re.DOTALL)
 
@@ -713,7 +713,7 @@
         note = re.sub(r'\\xq\b\s(.+?)(?=(\\x|'+u'﷟))', u'﷟'+r'<catchWord>\1</catchWord>', note)
 
         # \xt_
-        note = re.sub(r'\\xt\s', r'', note)
+        note = re.sub(r'\\xt\s', '', note)
 
         if relaxedConformance:
             # TODO: \xtSee..\xtSee*: Concordance and Names Index markup for an alternate entry target reference.
@@ -728,10 +728,10 @@
         note = re.sub(r'\\xk\b\s(.+?)(?=(\\x|'+u'﷟))', u'﷟'+r'<catchWord>\1</catchWord>', note)
 
         if relaxedConformance:
-            note = note.replace(r'\xq*', r'')
-            note = note.replace(r'\xt*', r'')
-            note = note.replace(r'\xo*', r'')
-            note = note.replace(r'\xk*', r'')
+            note = note.replace(r'\xq*', '')
+            note = note.replace(r'\xt*', '')
+            note = note.replace(r'\xo*', '')
+            note = note.replace(r'\xk*', '')
 
         note = note.replace(u'﷟', '')
         return note
@@ -743,7 +743,7 @@
         supported: \\x...\\x*, \\xo, \\xk, \\xq, \\xt, \\xot...\\xot*, \\xnt...\\xnt*, \\xdc...\\xdc*
         """
         # \x_+_...\x*
-        osis = re.sub(r'\\x\s+([^\s]+?)\s+(.+?)\s*\\x\*', lambda m: r'<note' + ((' n=""') if (m.group(1) == u'-') else ('' if (m.group(1) == '+') else (' n="' + m.group(1) + '"'))) + ' type="crossReference"><reference>' + m.group(2) + u'</reference>﷟</note>', osis, flags=re.DOTALL)
+        osis = re.sub(r'\\x\s+([^\s]+?)\s+(.+?)\s*\\x\*', lambda m: '<note' + ((' n=""') if (m.group(1) == u'-') else ('' if (m.group(1) == '+') else (' n="' + m.group(1) + '"'))) + ' type="crossReference"><reference>' + m.group(2) + u'</reference>﷟</note>', osis, flags=re.DOTALL)
 
         osis = re.sub(r'(<note [^>]*?type="crossReference"[^>]*>.*?</note>)', lambda m: processXref(m.group(1)), osis, flags=re.DOTALL)
 
@@ -920,7 +920,7 @@
             periph += '">\n' +  contents + '</div>\n'
             return periph
 
-        osis = re.sub(r'\\periph\s+([^\n]+)\s*\n(.+?)(?=(</div type="book">|\\periph\s+))', tagPeriph, osis, flags=re.DOTALL)
+        osis = re.sub(r'\\periph\s+([^'+'\n'+r']+)\s*'+'\n'+r'(.+?)(?=(</div type="book">|\\periph\s+))', tagPeriph, osis, flags=re.DOTALL)
 
         return osis
 
@@ -931,15 +931,15 @@
         supported: \ef...\ef*, \ex...\ex*, \esb...\esbe, \cat
         """
         # \ef...\ef*
-        osis = re.sub(r'\\ef\s+([^\s\\]+?)\s*(.+?)\s*\\ef\*', lambda m: r'<note' + ((' n=""') if (m.group(1) == u'-') else ('' if (m.group(1) == '+') else (' n="' + m.group(1) + '"'))) + ' type="study">' + m.group(2) + u'﷟</note>', osis, flags=re.DOTALL)
+        osis = re.sub(r'\\ef\s+([^\s\\]+?)\s*(.+?)\s*\\ef\*', lambda m: '<note' + ((' n=""') if (m.group(1) == u'-') else ('' if (m.group(1) == '+') else (' n="' + m.group(1) + '"'))) + ' type="study">' + m.group(2) + u'﷟</note>', osis, flags=re.DOTALL)
         osis = re.sub(r'(<note\b[^>]*?>.*?</note>)', lambda m: processNote(m.group(1)), osis, flags=re.DOTALL)
 
         # \ex...\ex*
-        osis = re.sub(r'\\ex\s+([^\s]+?)\s+(.+?)\s*\\ex\*', lambda m: r'<note' + ((' n=""') if (m.group(1) == u'-') else ('' if (m.group(1) == '+') else (' n="' + m.group(1) + '"'))) + ' type="crossReference" subType="x-study"><reference>' + m.group(2) + u'</reference>﷟</note>', osis, flags=re.DOTALL)
+        osis = re.sub(r'\\ex\s+([^\s]+?)\s+(.+?)\s*\\ex\*', lambda m: '<note' + ((' n=""') if (m.group(1) == u'-') else ('' if (m.group(1) == '+') else (' n="' + m.group(1) + '"'))) + ' type="crossReference" subType="x-study"><reference>' + m.group(2) + u'</reference>﷟</note>', osis, flags=re.DOTALL)
         osis = re.sub(r'(<note [^>]*?type="crossReference"[^>]*>.*?</note>)', lambda m: processXref(m.group(1)), osis, flags=re.DOTALL)
 
         # \esb...\esbex  # TODO: this likely needs to go much earlier in the process
-        osis = re.sub(r'\\esb\b\s*(.+?)\\esbe\b\s*', '﷕<div type="x-sidebar">\1</div>﷕\n', osis, flags=re.DOTALL)
+        osis = re.sub(r'\\esb\b\s*(.+?)\\esbe\b\s*', ur'﷕<div type="x-sidebar">\1</div>﷕'+'\n', osis, flags=re.DOTALL)
 
         # \cat_<TAG>\cat*
         osis = re.sub(r'\\cat\b\s+(.+?)\\cat\*', r'<index index="category" level1="\1"/>', osis)
@@ -1009,27 +1009,27 @@
 
     def osisReorderAndCleanup(osis):
         # assorted re-orderings
-        osis = re.sub(u'(﷓<chapter eID=.+?\n)(<verse eID=.+?>﷒)\n?', r'\2\n\1', osis)
+        osis = re.sub(u'(﷓<chapter eID=.+?\n)(<verse eID=.+?>﷒)\n?', r'\2'+'\n'+r'\1', osis)
         osis = re.sub(u'([﷕﷖﷗﷘﷙]</div>)([^﷕﷖﷗﷘﷙]*<chapter eID.+?>)', r'\2\1', osis)
-        osis = re.sub(u'(﷓</p>\n?﷓<p>)\n?(<verse eID=.+?>﷒)\n?', r'\2\n\1\n', osis)
-        osis = re.sub(u'\n(<verse eID=.+?>﷒)', r'\1\n', osis)
+        osis = re.sub(u'(﷓</p>\n?﷓<p>)\n?(<verse eID=.+?>﷒)\n?', r'\2'+'\n'+r'\1'+'\n', osis)
+        osis = re.sub(u'\n(<verse eID=.+?>﷒)', r'\1'+'\n', osis)
         osis = re.sub(u'\n*(<l.+?>)(<verse eID=.+?>[﷒\n]*<verse osisID=.+?>)', r'\2\1', osis)
 
         # delete attributes from end tags (since they are invalid)
         osis = re.sub(r'(</[^\s>]+) [^>]*>', r'\1>', osis)
-        osis = osis.replace(r'<lb type="x-p"/>', r'<lb/>')
+        osis = osis.replace('<lb type="x-p"/>', '<lb/>')
         # delete Unicode tags
         for c in u'﷐﷑﷒﷓﷔﷕﷖﷗﷘﷙﷚﷛﷜﷝﷞﷟':
             osis = osis.replace(c, '')
 
         for endBlock in ['p', 'div', 'note', 'l', 'lg', 'chapter', 'verse']:
-            osis = re.sub(r' +</'+endBlock+r'>', r'</'+endBlock+r'>', osis)
-            osis = re.sub(r' +<'+endBlock+r'( eID=[^/>]+/>)', r'</'+endBlock+r'\1', osis)
-        osis = re.sub(r' +((</[^>]+>)+) *', r'\1 ', osis)
+            osis = re.sub(' +</'+endBlock+'>', '</'+endBlock+r'>', osis)
+            osis = re.sub(' +<'+endBlock+'( eID=[^/>]+/>)', '</'+endBlock+r'\1', osis)
+        osis = re.sub(' +((</[^>]+>)+) *', r'\1 ', osis)
 
         # strip extra spaces & newlines
-        osis = re.sub(r'  +', r' ', osis)
-        osis = re.sub(r' ?\n\n+', r'\n', osis)
+        osis = re.sub('  +', ' ', osis)
+        osis = re.sub(' ?\n\n+', '\n', osis)
         return osis
 
 
@@ -1040,7 +1040,7 @@
         encoding = 'utf-8'
         osis = codecs.open(sFile, 'r', encoding).read().strip() + '\n'
         # \ide_<ENCODING>
-        encoding = re.search(r'\\ide\s+(.+)\n', osis)
+        encoding = re.search(r'\\ide\s+(.+)'+'\n', osis)
         if encoding:
             encoding = encoding.group(1).lower().strip()
             if encoding != 'utf-8':
@@ -1078,6 +1078,11 @@
     for sb in specialBooks:
         osis = osis.replace('<div type="book" osisID="' + sb  + '">', '<div type="' + sb.lower() + '">')
 
+    if DEBUG:
+        localUnhandledTags = set(re.findall(r'(\\[^\s\*]+?\b\*?)', osis))
+        if localUnhandledTags:
+            print('Unhandled USFM tags in ' + sFile + ': ' + ', '.join(localUnhandledTags) + ' (' + str(len(localUnhandledTags)) + ' total)')
+
     return osis
 
 
@@ -1091,10 +1096,6 @@
     if verbose:
         print text
 
-def printUnhandled():
-    global relaxedConformance
-
-
 def printUsage():
     print('usfm2osis.py -- USFM ' + USFMversion + ' to OSIS ' + OSISversion + ' converter version ' + scriptVersion)
     print('                Revision: ' + rev + ' (' + date + ')')
@@ -1164,13 +1165,13 @@
         verbose = False
 
     if '-d' in sys.argv:
-        debugMode = True
+        DEBUG = True
         inputFilesIdx += 1
         num_processes = 1
         num_jobs = 1
         verbose = True
     else:
-        debugMode = False
+        DEBUG = False
 
     if '-h' in sys.argv or '--help' in sys.argv or len(sys.argv) < 3:
         printUsage()
@@ -1249,6 +1250,7 @@
         for doc in sorted(usfmDocList, key=keynat):
             unhandledTags |= set(re.findall(r'(\\[^\s\*]+?\b\*?)', osisSegment[doc]))
             OSISfile.write(osisSegment[doc])
+
         writeOSISFooter(OSISfile)
 
         if unhandledTags:




More information about the sword-cvs mailing list