[sword-svn] r374 - trunk/modules/python
chrislit at crosswire.org
chrislit at crosswire.org
Fri Aug 10 05:55:17 MST 2012
Author: chrislit
Date: 2012-08-10 05:55:16 -0700 (Fri, 10 Aug 2012)
New Revision: 374
Modified:
trunk/modules/python/usfm2osis.py
Log:
print unhandled tags on a file-by-file basis (in debug mode)
attempted to fix raw strings as appropriate
Modified: trunk/modules/python/usfm2osis.py
===================================================================
--- trunk/modules/python/usfm2osis.py 2012-08-10 11:30:42 UTC (rev 373)
+++ trunk/modules/python/usfm2osis.py 2012-08-10 12:55:16 UTC (rev 374)
@@ -255,11 +255,11 @@
def cvtPreprocess(osis, relaxedConformance):
# lines should never start with non-tags
- osis = re.sub(r'\n\s*([^\\\s])', r' \1', osis) # TODO: test this
+ osis = re.sub('\n\s*([^\\\s])', r' \1', osis) # TODO: test this
# convert CR to LF
- osis = osis.replace(r'\r', r'\n')
+ osis = osis.replace('\r', '\n')
# lines should never end with whitespace (other than \n)
- osis = re.sub(r'\s+\n', r'\n', osis)
+ osis = re.sub('\s+\n', '\n', osis)
# XML-encode as necessary
osis = osis.replace('&', '&amp;')
osis = osis.replace('<', '&lt;')
@@ -310,17 +310,17 @@
"""
global loc2osisBk, osis2locBk
# \id_<CODE>_(Name of file, Book name, Language, Last edited, Date etc.)
- osis = re.sub(r'\\id\s+([A-Z0-9]{3})\b\s*([^\\\n]*?)\n(.*)(?=\\id|$)', lambda m: u'<div type="book" osisID="' + bookDict[m.group(1)] + '">\n' + (('<!-- id comment - ' + m.group(2) + ' -->\n') if m.group(2) else '') + m.group(3) + u'</div type="book">\n' , osis, flags=re.DOTALL)
+ osis = re.sub(r'\\id\s+([A-Z0-9]{3})\b\s*([^\\'+'\n'+']*?)'+'\n'+r'(.*)(?=\\id|$)', lambda m: u'<div type="book" osisID="' + bookDict[m.group(1)] + '">\n' + (('<!-- id comment - ' + m.group(2) + ' -->\n') if m.group(2) else '') + m.group(3) + u'</div type="book">\n' , osis, flags=re.DOTALL)#@
# keep a copy of the OSIS book abbreviation for below (\toc3 processing) to store for mapping localized book names to/from OSIS
osisBook = re.search(r'\\id\s+([A-Z0-9]{3})', osis)
if osisBook:
osisBook = bookDict[osisBook.group(1)]
# \ide_<ENCODING>
- osis = re.sub(r'\\ide\b.*\n', r'', osis) # delete, since this was handled above
+ osis = re.sub(r'\\ide\b.*'+'\n', '', osis) # delete, since this was handled above
# \sts_<STATUS CODE>
- osis = re.sub(r'\\sts\b\s+(.+)\s*\n', r'<milestone type="x-usfm-sts" n="\1"/>\n', osis)
+ osis = re.sub(r'\\sts\b\s+(.+)\s*'+'\n', r'<milestone type="x-usfm-sts" n="\1"/>'+'\n', osis)
# \rem_text...
osis = re.sub(r'\\rem\b\s+(.+)', r'<!-- rem - \1 -->', osis)
@@ -330,23 +330,23 @@
osis = re.sub(r'\\restore\b\s+(.+)', r'<!-- restore - \1 -->', osis)
# \h#_text...
- osis = re.sub(r'\\h\b\s+(.+)\s*\n', r'<title type="runningHead">\1</title>\n', osis)
- osis = re.sub(r'\\h(\d)\b\s+(.+)\s*\n', r'<title type="runningHead" n="\1">\2</title>\n', osis)
+ osis = re.sub(r'\\h\b\s+(.+)\s*'+'\n', r'<title type="runningHead">\1</title>'+'\n', osis)
+ osis = re.sub(r'\\h(\d)\b\s+(.+)\s*'+'\n', r'<title type="runningHead" n="\1">\2</title>'+'\n', osis)
# \toc1_text...
- osis = re.sub(r'\\toc1\b\s+(.+)\s*\n', r'<milestone type="x-usfm-toc1" n="\1"/>\n', osis)
+ osis = re.sub(r'\\toc1\b\s+(.+)\s*'+'\n', r'<milestone type="x-usfm-toc1" n="\1"/>'+'\n', osis)
# \toc2_text...
- osis = re.sub(r'\\toc2\b\s+(.+)\s*\n', r'<milestone type="x-utfm-toc2" n="\1"/>\n', osis)
+ osis = re.sub(r'\\toc2\b\s+(.+)\s*'+'\n', r'<milestone type="x-utfm-toc2" n="\1"/>'+'\n', osis)
# \toc3_text...
- locBook = re.search(r'\\toc3\b\s+(.+)\s*\n', osis)
+ locBook = re.search(r'\\toc3\b\s+(.+)\s*'+'\n', osis)
if locBook:
locBook = locBook.group(1)
if osisBook:
osis2locBk[osisBook]=locBook
loc2osisBk[locBook]=osisBook
- osis = re.sub(r'\\toc3\b\s+(.+)\s*\n', lambda m: r'<milestone type="x-usfm-toc3" n="\1"/>\n', osis)
+ osis = re.sub(r'\\toc3\b\s+(.+)\s*'+'\n', lambda m: r'<milestone type="x-usfm-toc3" n="\1"/>'+'\n', osis)
return osis
@@ -357,7 +357,7 @@
supported: \imt#, \is#, \ip, \ipi, \im, \imi, \ipq, \imq, \ipr, \iq#, \ib, \ili#, \iot, \io#, \ior...\ior*, \iex, \iqt...\iqt*, \imte, \ie
"""
# \imt#_text...
- osis = re.sub(r'\\imt(\d?)\s+(.+)', lambda m: r'<title ' + (r'level="'+m.group(1)+r'" ' if m.group(1) else r'') + r'type="main" subType="x-introduction">' + m.group(2) + r'</title>', osis)
+ osis = re.sub(r'\\imt(\d?)\s+(.+)', lambda m: '<title ' + ('level="'+m.group(1)+'" ' if m.group(1) else '') + 'type="main" subType="x-introduction">' + m.group(2) + '</title>', osis)
# \is#_text...
osis = re.sub(r'\\is1?\s+(.+)', lambda m: u'<div type="section" subType="x-introduction"><title>' + m.group(1) + '</title>', osis)
@@ -388,10 +388,10 @@
osis = re.sub(r'\\iq(\d)\b\s*(.*?)(?=(['+u''+r']|\\i?q[\d\s]|<l\b|<lb\b|<title\b))', r'<l level="\1" subType="x-introduction">\2</l>', osis, flags=re.DOTALL)
# \ib
- osis = re.sub(r'\\ib\b\s?', r'<lb type="x-p"/>', osis)
+ osis = re.sub(r'\\ib\b\s?', '<lb type="x-p"/>', osis)
osis = osis.replace('\n</l>', '</l>\n')
osis = re.sub(u'(<l [^]+</l>)', r'<lg>\1</lg>', osis, flags=re.DOTALL)
- osis = re.sub('(<lg>.+?</lg>)', lambda m: m.group(1).replace(r'<lb type="x-p"/>', r'</lg><lg>'), osis, flags=re.DOTALL) # re-handle \b that occurs within <lg>
+ osis = re.sub('(<lg>.+?</lg>)', lambda m: m.group(1).replace('<lb type="x-p"/>', '</lg><lg>'), osis, flags=re.DOTALL) # re-handle \b that occurs within <lg>
# \ili#_text...
osis = re.sub(r'\\ili\b\s*(.*?)(?=(['+u''+r']|\\ili[\d\s]|<lb\b|<title\b))', r'<item type="x-indent-1" subType="x-introduction">\1</item>', osis, flags=re.DOTALL)
@@ -406,7 +406,7 @@
osis = re.sub(r'\\iot\b\s*(.*?)(?=(['+u''+r']|\\io[t\d\s]|<lb\b|<title\b))', r'<item type="head">\1</item type="head">', osis, flags=re.DOTALL)
osis = osis.replace('\n</item>', '</item>\n')
osis = re.sub(u'(<item [^]+</item>)', r'<div type="outline"><list>\1</list></div>', osis, flags=re.DOTALL)
- osis = re.sub(r'item type="head"', r'head', osis)
+ osis = re.sub('item type="head"', 'head', osis)
# \ior_text...\ior*
osis = re.sub(r'\\ior\b\s+(.+?)\\ior\*', r'<reference>\1</reference>', osis, flags=re.DOTALL)
@@ -418,10 +418,10 @@
osis = re.sub(r'\\iqt\s+(.+?)\\iqt\*', r'<q subType="x-introduction">\1</q>', osis, flags=re.DOTALL)
# \imte#_text...
- osis = re.sub(r'\\imte(\d?)\b\s+(.+)', lambda m: r'<title ' + (r'level="'+m.group(1)+r'" ' if m.group(1) else r'') + r'type="main" subType="x-introduction-end">' + m.group(2) + r'</title>', osis)
+ osis = re.sub(r'\\imte(\d?)\b\s+(.+)', lambda m: '<title ' + ('level="'+m.group(1)+'" ' if m.group(1) else '') + 'type="main" subType="x-introduction-end">' + m.group(2) + '</title>', osis)
# \ie
- osis = re.sub(r'\\ie\b\s*', r'<milestone type="x-usfm-ie"/>', osis)
+ osis = re.sub(r'\\ie\b\s*', '<milestone type="x-usfm-ie"/>', osis)
return osis
@@ -462,22 +462,22 @@
osis = re.sub(u'([^]+)', r'\1'+u'</div>\n', osis, re.DOTALL)
# \sr_text...
- osis = re.sub(r'\\sr\s+(.+)', u'<title type="scope"><reference>'+r'\1</reference></title>', osis)
+ osis = re.sub(r'\\sr\s+(.+)', ur'<title type="scope"><reference>\1</reference></title>', osis)
# \r_text...
- osis = re.sub(r'\\r\s+(.+)', u'<title type="parallel"><reference type="parallel">'+r'\1</reference></title>', osis)
+ osis = re.sub(r'\\r\s+(.+)', ur'<title type="parallel"><reference type="parallel">\1</reference></title>', osis)
# \rq_text...\rq*
- osis = re.sub(r'\\rq\s+(.+?)\\rq\*', u'<reference type="source">'+r'\1</reference>', osis, flags=re.DOTALL)
+ osis = re.sub(r'\\rq\s+(.+?)\\rq\*', ur'<reference type="source">\1</reference>', osis, flags=re.DOTALL)
# \d_text...
- osis = re.sub(r'\\d\s+(.+)', u'<title canonical="true" type="psalm">'+r'\1</title>', osis)
+ osis = re.sub(r'\\d\s+(.+)', ur'<title canonical="true" type="psalm">\1</title>', osis)
# \sp_text...
osis = re.sub(r'\\sp\s+(.+)', r'<speaker>\1</speaker>', osis)
# \mt#_text...
- osis = re.sub(r'\\mt(\d?)\s+(.+)', lambda m: r'<title ' + (r'level="'+m.group(1)+r'" ' if m.group(1) else r'') + r'type="main">' + m.group(2) + r'</title>', osis)
+ osis = re.sub(r'\\mt(\d?)\s+(.+)', lambda m: '<title ' + ('level="'+m.group(1)+'" ' if m.group(1) else '') + 'type="main">' + m.group(2) + '</title>', osis)
# \mte#_text...
- osis = re.sub(r'\\mte(\d?)\s+(.+)', lambda m: r'<title ' + (r'level="'+m.group(1)+r'" ' if m.group(1) else r'') + r'type="main" subType="x-end">' + m.group(2) + r'</title>', osis)
+ osis = re.sub(r'\\mte(\d?)\s+(.+)', lambda m: '<title ' + ('level="'+m.group(1)+'" ' if m.group(1) else '') + 'type="main" subType="x-end">' + m.group(2) + '</title>', osis)
return osis
@@ -498,7 +498,7 @@
if cp:
ctext = re.sub(r'\\cp\s+(.+?)\\cp*', '', ctext, flags=re.DOTALL)
cp = cp.group(1)
- ctext = re.sub(r'"\$BOOK\$\.([^"\.]+)"', r'"$BOOK$.'+ca+'"', ctext)
+ ctext = re.sub(r'"\$BOOK\$\.([^"\.]+)"', '"$BOOK$.'+ca+'"', ctext)
ca = re.search(r'\\ca\s+(.+?)\\ca\*', ctext)
if ca:
ctext = re.sub(r'\\ca\s+(.+?)\\ca*', '', ctext, flags=re.DOTALL)
@@ -514,7 +514,7 @@
osis = re.sub(r'\\cd\b\s+(.+)', u'<title type="x-description">'+r'\1</title>', osis)
# \v_#
- osis = re.sub(r'\\v\s+([^\s]+)\b\s*(.+?)(?=(\\v\s+|</div type="book"|<chapter eID))', lambda m: u'<verse osisID="$BOOK$.$CHAP$.' + m.group(1) + r'" sID="$BOOK$.$CHAP$.' + m.group(1) + r'"/>' + m.group(2) + r'<verse eID="$BOOK$.$CHAP$.' + m.group(1) + u'"/>\n', osis, flags=re.DOTALL)
+ osis = re.sub(r'\\v\s+([^\s]+)\b\s*(.+?)(?=(\\v\s+|</div type="book"|<chapter eID))', lambda m: u'<verse osisID="$BOOK$.$CHAP$.' + m.group(1) + '" sID="$BOOK$.$CHAP$.' + m.group(1) + '"/>' + m.group(2) + '<verse eID="$BOOK$.$CHAP$.' + m.group(1) + u'"/>\n', osis, flags=re.DOTALL)
# \vp_#\vp*
# \va_#\va*
@@ -524,7 +524,7 @@
if vp:
vtext = re.sub(r'\\vp\s+(.+?)\\vp*', '', vtext, flags=re.DOTALL)
vp = vp.group(1)
- vtext = re.sub(r'"\$BOOK\$\.\$CHAP\$\.([^"\.]+)"', r'"$BOOK$.$CHAP$.'+va+'"', vtext)
+ vtext = re.sub(r'"\$BOOK\$\.\$CHAP\$\.([^"\.]+)"', '"$BOOK$.$CHAP$.'+va+'"', vtext)
va = re.search(r'\\va\s+(.+?)\\va\*', vtext)
if va:
vtext = re.sub(r'\\va\s+(.+?)\\va*', '', vtext, flags=re.DOTALL)
@@ -570,7 +570,7 @@
osis = re.sub(u'(<item [^]+</item>)', r'<list>\1</list>', osis, flags=re.DOTALL)
# \b
- osis = re.sub(r'\\b\b\s?', r'<lb type="x-p"/>', osis)
+ osis = re.sub(r'\\b\b\s?', '<lb type="x-p"/>', osis)
if relaxedConformance:
# TODO: \phi: DEP: Paragraph text, indented with hanging indent
@@ -598,13 +598,13 @@
# \qc_text...
# \qm#(_text...)
qType = {'qr':'x-right', 'qc':'x-center', 'qm':'x-embedded" level="1', 'qm1':'x-embedded" level="1', 'qm2':'x-embedded" level="2', 'qm3':'x-embedded" level="3', 'qm4':'x-embedded" level="4', 'qm5':'x-embedded" level="5'}
- osis = re.sub(r'\\(qr|qc|qm\d)\b\s*(.*?)(?=(['+u''+r']|\\q[\d\s]|<l\b|<lb\b|<title\b))', lambda m: r'<l type="' + qType[m.group(1)] + '">' + m.group(2) + '</l>', osis, flags=re.DOTALL)
+ osis = re.sub(r'\\(qr|qc|qm\d)\b\s*(.*?)(?=(['+u''+r']|\\q[\d\s]|<l\b|<lb\b|<title\b))', lambda m: '<l type="' + qType[m.group(1)] + '">' + m.group(2) + '</l>', osis, flags=re.DOTALL)
osis = osis.replace('\n</l>', '</l>\n')
osis = re.sub(u'(<l [^]+</l>)', r'<lg>\1</lg>', osis, flags=re.DOTALL)
# \b
- osis = re.sub('(<lg>.+?</lg>)', lambda m: m.group(1).replace(r'<lb type="x-p"/>', r'</lg><lg>'), osis, flags=re.DOTALL) # re-handle \b that occurs within <lg>
+ osis = re.sub('(<lg>.+?</lg>)', lambda m: m.group(1).replace('<lb type="x-p"/>', '</lg><lg>'), osis, flags=re.DOTALL) # re-handle \b that occurs within <lg>
# \qa_text...
osis = re.sub(r'\\qa\s+(.+)', u'<title type="acrostic">'+r'\1</title>', osis)
@@ -628,7 +628,7 @@
# \tc#_text...
# \tcr#_text...
tType = {'th':' role="label"', 'thr':' role="label" type="x-right"', 'tc':'', 'tcr':' type="x-right'}
- osis = re.sub(r'\\(thr?|tcr?)\d*\b\s*(.*?)(?=(\\t[hc]|</row))', lambda m: r'<cell' + tType[m.group(1)] + '>' + m.group(2) + '</cell>', osis, flags=re.DOTALL)
+ osis = re.sub(r'\\(thr?|tcr?)\d*\b\s*(.*?)(?=(\\t[hc]|</row))', lambda m: '<cell' + tType[m.group(1)] + '>' + m.group(2) + '</cell>', osis, flags=re.DOTALL)
return osis
@@ -646,7 +646,7 @@
note = re.sub(r'\\fqa\b\s(.+?)(?=(\\f|'+u'))', u''+r'<rdg type="alternate">\1</rdg>', note)
# \ft_
- note = re.sub(r'\\ft\s', r'', note)
+ note = re.sub(r'\\ft\s', '', note)
# \fr_##SEP##
note = re.sub(r'\\fr\b\s(.+?)(?=(\\f|'+u'))', u''+r'<reference>\1</reference>', note)
@@ -665,14 +665,14 @@
note = re.sub(r'\\fv\b\s(.+?)(?=(\\f|'+u'))', u''+r'<hi type="super">\1</hi>', note)
if relaxedConformance:
- note = note.replace(r'\fq*', r'')
- note = note.replace(r'\fqa*', r'')
- note = note.replace(r'\ft*', r'')
- note = note.replace(r'\fr*', r'')
- note = note.replace(r'\fk*', r'')
- note = note.replace(r'\fl*', r'')
- note = note.replace(r'\fp*', r'')
- note = note.replace(r'\fv*', r'')
+ note = note.replace(r'\fq*', '')
+ note = note.replace(r'\fqa*', '')
+ note = note.replace(r'\ft*', '')
+ note = note.replace(r'\fr*', '')
+ note = note.replace(r'\fk*', '')
+ note = note.replace(r'\fl*', '')
+ note = note.replace(r'\fp*', '')
+ note = note.replace(r'\fv*', '')
note = note.replace(u'', '')
return note
@@ -684,10 +684,10 @@
supported:\f...\f*, \fe...\fe*, \fr, \fk, \fq, \fqa, \fl, \fp, \fv, \ft, \fdc...\fdc*, \fm...\fm*
"""
# \f_+_...\f*
- osis = re.sub(r'\\f\s+([^\s\\]+)?\s*(.+?)\s*\\f\*', lambda m: r'<note' + ((' n=""') if (m.group(1) == u'-') else ('' if (m.group(1) == '+') else (' n="' + m.group(1) + '"'))) + ' placement="foot">' + m.group(2) + u'</note>', osis, flags=re.DOTALL)
+ osis = re.sub(r'\\f\s+([^\s\\]+)?\s*(.+?)\s*\\f\*', lambda m: '<note' + ((' n=""') if (m.group(1) == u'-') else ('' if (m.group(1) == '+') else (' n="' + m.group(1) + '"'))) + ' placement="foot">' + m.group(2) + u'</note>', osis, flags=re.DOTALL)
# \fe_+_...\fe*
- osis = re.sub(r'\\fe\s+([^\s\\]+?)\s*(.+?)\s*\\fe\*', lambda m: r'<note' + ((' n=""') if (m.group(1) == u'-') else ('' if (m.group(1) == '+') else (' n="' + m.group(1) + '"'))) + ' placement="end">' + m.group(2) + u'</note>', osis, flags=re.DOTALL)
+ osis = re.sub(r'\\fe\s+([^\s\\]+?)\s*(.+?)\s*\\fe\*', lambda m: '<note' + ((' n=""') if (m.group(1) == u'-') else ('' if (m.group(1) == '+') else (' n="' + m.group(1) + '"'))) + ' placement="end">' + m.group(2) + u'</note>', osis, flags=re.DOTALL)
osis = re.sub(r'(<note\b[^>]*?>.*?</note>)', lambda m: processNote(m.group(1)), osis, flags=re.DOTALL)
@@ -713,7 +713,7 @@
note = re.sub(r'\\xq\b\s(.+?)(?=(\\x|'+u'))', u''+r'<catchWord>\1</catchWord>', note)
# \xt_
- note = re.sub(r'\\xt\s', r'', note)
+ note = re.sub(r'\\xt\s', '', note)
if relaxedConformance:
# TODO: \xtSee..\xtSee*: Concordance and Names Index markup for an alternate entry target reference.
@@ -728,10 +728,10 @@
note = re.sub(r'\\xk\b\s(.+?)(?=(\\x|'+u'))', u''+r'<catchWord>\1</catchWord>', note)
if relaxedConformance:
- note = note.replace(r'\xq*', r'')
- note = note.replace(r'\xt*', r'')
- note = note.replace(r'\xo*', r'')
- note = note.replace(r'\xk*', r'')
+ note = note.replace(r'\xq*', '')
+ note = note.replace(r'\xt*', '')
+ note = note.replace(r'\xo*', '')
+ note = note.replace(r'\xk*', '')
note = note.replace(u'', '')
return note
@@ -743,7 +743,7 @@
supported: \\x...\\x*, \\xo, \\xk, \\xq, \\xt, \\xot...\\xot*, \\xnt...\\xnt*, \\xdc...\\xdc*
"""
# \x_+_...\x*
- osis = re.sub(r'\\x\s+([^\s]+?)\s+(.+?)\s*\\x\*', lambda m: r'<note' + ((' n=""') if (m.group(1) == u'-') else ('' if (m.group(1) == '+') else (' n="' + m.group(1) + '"'))) + ' type="crossReference"><reference>' + m.group(2) + u'</reference></note>', osis, flags=re.DOTALL)
+ osis = re.sub(r'\\x\s+([^\s]+?)\s+(.+?)\s*\\x\*', lambda m: '<note' + ((' n=""') if (m.group(1) == u'-') else ('' if (m.group(1) == '+') else (' n="' + m.group(1) + '"'))) + ' type="crossReference"><reference>' + m.group(2) + u'</reference></note>', osis, flags=re.DOTALL)
osis = re.sub(r'(<note [^>]*?type="crossReference"[^>]*>.*?</note>)', lambda m: processXref(m.group(1)), osis, flags=re.DOTALL)
@@ -920,7 +920,7 @@
periph += '">\n' + contents + '</div>\n'
return periph
- osis = re.sub(r'\\periph\s+([^\n]+)\s*\n(.+?)(?=(</div type="book">|\\periph\s+))', tagPeriph, osis, flags=re.DOTALL)
+ osis = re.sub(r'\\periph\s+([^'+'\n'+r']+)\s*'+'\n'+r'(.+?)(?=(</div type="book">|\\periph\s+))', tagPeriph, osis, flags=re.DOTALL)
return osis
@@ -931,15 +931,15 @@
supported: \ef...\ef*, \ex...\ex*, \esb...\esbe, \cat
"""
# \ef...\ef*
- osis = re.sub(r'\\ef\s+([^\s\\]+?)\s*(.+?)\s*\\ef\*', lambda m: r'<note' + ((' n=""') if (m.group(1) == u'-') else ('' if (m.group(1) == '+') else (' n="' + m.group(1) + '"'))) + ' type="study">' + m.group(2) + u'</note>', osis, flags=re.DOTALL)
+ osis = re.sub(r'\\ef\s+([^\s\\]+?)\s*(.+?)\s*\\ef\*', lambda m: '<note' + ((' n=""') if (m.group(1) == u'-') else ('' if (m.group(1) == '+') else (' n="' + m.group(1) + '"'))) + ' type="study">' + m.group(2) + u'</note>', osis, flags=re.DOTALL)
osis = re.sub(r'(<note\b[^>]*?>.*?</note>)', lambda m: processNote(m.group(1)), osis, flags=re.DOTALL)
# \ex...\ex*
- osis = re.sub(r'\\ex\s+([^\s]+?)\s+(.+?)\s*\\ex\*', lambda m: r'<note' + ((' n=""') if (m.group(1) == u'-') else ('' if (m.group(1) == '+') else (' n="' + m.group(1) + '"'))) + ' type="crossReference" subType="x-study"><reference>' + m.group(2) + u'</reference></note>', osis, flags=re.DOTALL)
+ osis = re.sub(r'\\ex\s+([^\s]+?)\s+(.+?)\s*\\ex\*', lambda m: '<note' + ((' n=""') if (m.group(1) == u'-') else ('' if (m.group(1) == '+') else (' n="' + m.group(1) + '"'))) + ' type="crossReference" subType="x-study"><reference>' + m.group(2) + u'</reference></note>', osis, flags=re.DOTALL)
osis = re.sub(r'(<note [^>]*?type="crossReference"[^>]*>.*?</note>)', lambda m: processXref(m.group(1)), osis, flags=re.DOTALL)
# \esb...\esbex # TODO: this likely needs to go much earlier in the process
- osis = re.sub(r'\\esb\b\s*(.+?)\\esbe\b\s*', '<div type="x-sidebar">\1</div>\n', osis, flags=re.DOTALL)
+ osis = re.sub(r'\\esb\b\s*(.+?)\\esbe\b\s*', ur'<div type="x-sidebar">\1</div>'+'\n', osis, flags=re.DOTALL)
# \cat_<TAG>\cat*
osis = re.sub(r'\\cat\b\s+(.+?)\\cat\*', r'<index index="category" level1="\1"/>', osis)
@@ -1009,27 +1009,27 @@
def osisReorderAndCleanup(osis):
# assorted re-orderings
- osis = re.sub(u'(<chapter eID=.+?\n)(<verse eID=.+?>)\n?', r'\2\n\1', osis)
+ osis = re.sub(u'(<chapter eID=.+?\n)(<verse eID=.+?>)\n?', r'\2'+'\n'+r'\1', osis)
osis = re.sub(u'([]</div>)([^]*<chapter eID.+?>)', r'\2\1', osis)
- osis = re.sub(u'(</p>\n?<p>)\n?(<verse eID=.+?>)\n?', r'\2\n\1\n', osis)
- osis = re.sub(u'\n(<verse eID=.+?>)', r'\1\n', osis)
+ osis = re.sub(u'(</p>\n?<p>)\n?(<verse eID=.+?>)\n?', r'\2'+'\n'+r'\1'+'\n', osis)
+ osis = re.sub(u'\n(<verse eID=.+?>)', r'\1'+'\n', osis)
osis = re.sub(u'\n*(<l.+?>)(<verse eID=.+?>[\n]*<verse osisID=.+?>)', r'\2\1', osis)
# delete attributes from end tags (since they are invalid)
osis = re.sub(r'(</[^\s>]+) [^>]*>', r'\1>', osis)
- osis = osis.replace(r'<lb type="x-p"/>', r'<lb/>')
+ osis = osis.replace('<lb type="x-p"/>', '<lb/>')
# delete Unicode tags
for c in u'':
osis = osis.replace(c, '')
for endBlock in ['p', 'div', 'note', 'l', 'lg', 'chapter', 'verse']:
- osis = re.sub(r' +</'+endBlock+r'>', r'</'+endBlock+r'>', osis)
- osis = re.sub(r' +<'+endBlock+r'( eID=[^/>]+/>)', r'</'+endBlock+r'\1', osis)
- osis = re.sub(r' +((</[^>]+>)+) *', r'\1 ', osis)
+ osis = re.sub(' +</'+endBlock+'>', '</'+endBlock+r'>', osis)
+ osis = re.sub(' +<'+endBlock+'( eID=[^/>]+/>)', '</'+endBlock+r'\1', osis)
+ osis = re.sub(' +((</[^>]+>)+) *', r'\1 ', osis)
# strip extra spaces & newlines
- osis = re.sub(r' +', r' ', osis)
- osis = re.sub(r' ?\n\n+', r'\n', osis)
+ osis = re.sub(' +', ' ', osis)
+ osis = re.sub(' ?\n\n+', '\n', osis)
return osis
@@ -1040,7 +1040,7 @@
encoding = 'utf-8'
osis = codecs.open(sFile, 'r', encoding).read().strip() + '\n'
# \ide_<ENCODING>
- encoding = re.search(r'\\ide\s+(.+)\n', osis)
+ encoding = re.search(r'\\ide\s+(.+)'+'\n', osis)
if encoding:
encoding = encoding.group(1).lower().strip()
if encoding != 'utf-8':
@@ -1078,6 +1078,11 @@
for sb in specialBooks:
osis = osis.replace('<div type="book" osisID="' + sb + '">', '<div type="' + sb.lower() + '">')
+ if DEBUG:
+ localUnhandledTags = set(re.findall(r'(\\[^\s\*]+?\b\*?)', osis))
+ if localUnhandledTags:
+ print('Unhandled USFM tags in ' + sFile + ': ' + ', '.join(localUnhandledTags) + ' (' + str(len(localUnhandledTags)) + ' total)')
+
return osis
@@ -1091,10 +1096,6 @@
if verbose:
print text
-def printUnhandled():
- global relaxedConformance
-
-
def printUsage():
print('usfm2osis.py -- USFM ' + USFMversion + ' to OSIS ' + OSISversion + ' converter version ' + scriptVersion)
print(' Revision: ' + rev + ' (' + date + ')')
@@ -1164,13 +1165,13 @@
verbose = False
if '-d' in sys.argv:
- debugMode = True
+ DEBUG = True
inputFilesIdx += 1
num_processes = 1
num_jobs = 1
verbose = True
else:
- debugMode = False
+ DEBUG = False
if '-h' in sys.argv or '--help' in sys.argv or len(sys.argv) < 3:
printUsage()
@@ -1249,6 +1250,7 @@
for doc in sorted(usfmDocList, key=keynat):
unhandledTags |= set(re.findall(r'(\\[^\s\*]+?\b\*?)', osisSegment[doc]))
OSISfile.write(osisSegment[doc])
+
writeOSISFooter(OSISfile)
if unhandledTags:
More information about the sword-cvs
mailing list