[sword-svn] r418 - trunk/modules/python
chrislit at crosswire.org
chrislit at crosswire.org
Sat Jun 8 08:20:51 MST 2013
Author: chrislit
Date: 2013-06-08 08:20:50 -0700 (Sat, 08 Jun 2013)
New Revision: 418
Modified:
trunk/modules/python/usfm2osis.py
Log:
fixed a bunch of bugs to address MODTOOLS-42 and other issues revealed by the TUREV Bible
Modified: trunk/modules/python/usfm2osis.py
===================================================================
--- trunk/modules/python/usfm2osis.py 2013-06-01 04:44:57 UTC (rev 417)
+++ trunk/modules/python/usfm2osis.py 2013-06-08 15:20:50 UTC (rev 418)
@@ -57,7 +57,7 @@
# check Python2/3 compatibility
### Key to non-characters:
-# Used : \uFDD0\uFDD1\uFDD2\uFDD3\uFDD4\uFDD5\uFDD6\uFDD7\uFDD8\uFDD9\uFDDA\uFDDB\uFDDC\uFDDD\uFDDE\uFDDF\uFDE0\uFDE1
+# Used : \uFDD0\uFDD1\uFDD2\uFDD3\uFDD4\uFDD5\uFDD6\uFDD7\uFDD8\uFDD9\uFDDA\uFDDB\uFDDC\uFDDD\uFDDE\uFDDF\uFDE0\uFDE1\uFDE2\uFDE3\uFDE4\uFDE5\uFDE6
# Unused : \uFDE7\uFDE8\uFDE9\uFDEA\uFDEB\uFDEC\uFDED\uFDEE\uFDEF
# \uFDD0 book
# \uFDD1 chapter
@@ -477,19 +477,19 @@
osis = re.sub(r'\\imte(\d?)\b\s+(.+)', lambda m: '<title ' + ('level="'+m.group(1)+'" ' if m.group(1) else '') + 'type="main" subType="x-introduction-end">' + m.group(2) + '</title>', osis)
# \is#_text...
- osis = re.sub(r'\\is1?\s+(.+)', lambda m: '\uFDDA<div type="section" subType="x-introduction"><title>' + m.group(1) + '</title>', osis)
- osis = re.sub('(\uFDDA[^\uFDD5\uFDD0\uFDD6\uFDD7\uFDD8\uFDD9\uFDDA]+)', r'\1'+'</div>\uFDDA\n', osis, flags=re.DOTALL)
- osis = re.sub(r'\\is2\s+(.+)', lambda m: '\uFDDB<div type="subSection" subType="x-introduction"><title>' + m.group(1) + '</title>', osis)
- osis = re.sub('(\uFDDB[^\uFDD5\uFDD0\uFDD6\uFDD7\uFDD8\uFDD9\uFDDA\uFDDB]+)', r'\1'+'</div>\uFDDB\n', osis, flags=re.DOTALL)
- osis = re.sub(r'\\is3\s+(.+)', lambda m: '\uFDDC<div type="x-subSubSection" subType="x-introduction"><title>' + m.group(1) + '</title>', osis)
- osis = re.sub('(\uFDDC[^\uFDD5\uFDD0\uFDD6\uFDD7\uFDD8\uFDD9\uFDDA\uFDDB\uFDDC]+)', r'\1'+'</div>\uFDDC\n', osis, flags=re.DOTALL)
- osis = re.sub(r'\\is4\s+(.+)', lambda m: '\uFDDD<div type="x-subSubSubSection" subType="x-introduction"><title>' + m.group(1) + '</title>', osis)
- osis = re.sub('(\uFDDD[^\uFDD5\uFDD0\uFDD6\uFDD7\uFDD8\uFDD9\uFDDA\uFDDB\uFDDC\uFDDD]+)', r'\1'+'</div>\uFDDD\n', osis, flags=re.DOTALL)
- osis = re.sub(r'\\is5\s+(.+)', lambda m: '\uFDDE<div type="x-subSubSubSubSection" subType="x-introduction"><title>' + m.group(1) + '</title>', osis)
- osis = re.sub('(\uFDDE[^\uFDD5\uFDD0\uFDD6\uFDD7\uFDD8\uFDD9\uFDDA\uFDDB\uFDDC\uFDDD\uFDDE]+)', r'\1'+'</div>\uFDDE\n', osis, flags=re.DOTALL)
+ osis = re.sub(r'\\is1?\s+(.+)', lambda m: '\uFDE2<div type="section" subType="x-introduction"><title>' + m.group(1) + '</title>', osis)
+ osis = re.sub('(\uFDE2<div type="section" subType="x-introduction">[^\uFDE2]+)(?!\\c\b)', r'\1'+'</div>\uFDE2\n', osis, flags=re.DOTALL)
+ osis = re.sub(r'\\is2\s+(.+)', lambda m: '\uFDE3<div type="subSection" subType="x-introduction"><title>' + m.group(1) + '</title>', osis)
+ osis = re.sub('(\uFDE3<div type="subSection" subType="x-introduction">[^\uFDE2\uFDE3]+)(?!\\c\b)', r'\1'+'</div>\uFDE3\n', osis, flags=re.DOTALL)
+ osis = re.sub(r'\\is3\s+(.+)', lambda m: '\uFDE4<div type="x-subSubSection" subType="x-introduction"><title>' + m.group(1) + '</title>', osis)
+ osis = re.sub('(\uFDE4<div type="subSubSection" subType="x-introduction">[^\uFDE2\uFDE3\uFDE4]+)(?!\\c\b)', r'\1'+'</div>\uFDE4\n', osis, flags=re.DOTALL)
+ osis = re.sub(r'\\is4\s+(.+)', lambda m: '\uFDE5<div type="x-subSubSubSection" subType="x-introduction"><title>' + m.group(1) + '</title>', osis)
+ osis = re.sub('(\uFDE5<div type="subSubSubSection" subType="x-introduction">[^\uFDE2\uFDE3\uFDE4\uFDE5]+)(?!\\c\b)', r'\1'+'</div>\uFDE5\n', osis, flags=re.DOTALL)
+ osis = re.sub(r'\\is5\s+(.+)', lambda m: '\uFDE6<div type="x-subSubSubSubSection" subType="x-introduction"><title>' + m.group(1) + '</title>', osis)
+ osis = re.sub('(\uFDE6<div type="subSubSubSubSection" subType="x-introduction">[^\uFDE2\uFDE3\uFDE4\uFDE5\uFDE6]+?)(?!\\c\b)', r'\1'+'</div>\uFDE6\n', osis, flags=re.DOTALL)
# \ip_text...
- osis = re.sub(r'\\ip\s+(.*?)(?=(\\(i?m|i?p|nb|lit|cls|tr|io|iq|i?li|iex?|s)|<chapter eID|</?div\b|<(p|closer)\b))', lambda m: '\uFDD3<p subType="x-introduction">\n' + m.group(1) + '\uFDD3</p>\n', osis, flags=re.DOTALL)
+ osis = re.sub(r'\\ip\s+(.*?)(?=(\\(i?m|i?p|lit|cls|tr|io|iq|i?li|iex?|s|c)\b|</?div\b|<(p|closer)\b))', lambda m: '\uFDD3<p subType="x-introduction">\n' + m.group(1) + '\uFDD3</p>\n', osis, flags=re.DOTALL)
# \ipi_text...
# \im_text...
@@ -498,17 +498,17 @@
# \imq_text...
# \ipr_text...
pType = {'ipi':'x-indented', 'im':'x-noindent', 'imi':'x-noindent-indented', 'ipq':'x-quote', 'imq':'x-noindent-quote', 'ipr':'x-right'}
- osis = re.sub(r'\\(ipi|im|ipq|imq|ipr)\s+(.*?)(?=(\\(i?m|i?p|nb|lit|cls|tr|io|iq|i?li|iex?|s)|<chapter eID|</?div\b|<(p|closer)\b))', lambda m: '\uFDD3<p type="' + pType[m.group(1)] + '" subType="x-introduction">\n' + m.group(2) + '\uFDD3</p>\n', osis, flags=re.DOTALL)
+ osis = re.sub(r'\\(ipi|im|ipq|imq|ipr)\s+(.*?)(?=(\\(i?m|i?p|lit|cls|tr|io|iq|i?li|iex?|s|c)\b|</?div\b|<(p|closer)\b))', lambda m: '\uFDD3<p type="' + pType[m.group(1)] + '" subType="x-introduction">\n' + m.group(2) + '\uFDD3</p>\n', osis, flags=re.DOTALL)
# \iq#_text...
- osis = re.sub(r'\\iq\b\s*(.*?)(?=(['+'\uFDD0\uFDD1\uFDD3\uFDD4'+r']|\\iq[\d\s]|\\fig|<l\b|<lb\b|<title\b))', r'<l level="1" subType="x-introduction">\1</l>', osis, flags=re.DOTALL)
- osis = re.sub(r'\\iq(\d)\b\s*(.*?)(?=(['+'\uFDD0\uFDD1\uFDD3\uFDD4'+r']|\\iq[\d\s]|\\fig|<l\b|<lb\b|<title\b))', r'<l level="\1" subType="x-introduction">\2</l>', osis, flags=re.DOTALL)
+ osis = re.sub(r'\\iq\b\s*(.*?)(?=(['+'\uFDD0\uFDD1\uFDD3\uFDD4'+r']|\\(iq[\d\s]|fig|q\d?|b)\b|<title\b))', r'<l level="1" subType="x-introduction">\1</l>', osis, flags=re.DOTALL)
+ osis = re.sub(r'\\iq(\d)\b\s*(.*?)(?=(['+'\uFDD0\uFDD1\uFDD3\uFDD4'+r']|\\(iq[\d\s]|fig|q\d?|b)\b|<title\b))', r'<l level="\1" subType="x-introduction">\2</l>', osis, flags=re.DOTALL)
# \ib
osis = re.sub(r'\\ib\b\s?', '<lb type="x-p"/>', osis)
osis = osis.replace('\n</l>', '</l>\n')
- osis = re.sub('(<l [^\uFDD0\uFDD1\uFDD3\uFDD4]+</l>)', r'<lg>\1</lg>', osis, flags=re.DOTALL)
- osis = re.sub('(<lg>.+?</lg>)', lambda m: m.group(1).replace('<lb type="x-p"/>', '</lg><lg>'), osis, flags=re.DOTALL) # re-handle \b that occurs within <lg>
+ #osis = re.sub('(<l [^\uFDD0\uFDD1\uFDD3\uFDD4]+</l>)', r'<lg>\1</lg>', osis, flags=re.DOTALL)
+ #osis = re.sub('(<lg>.+?</lg>)', lambda m: m.group(1).replace('<lb type="x-p"/>', '</lg><lg>'), osis, flags=re.DOTALL) # re-handle \b that occurs within <lg>
# \ili#_text...
osis = re.sub(r'\\ili\b\s*(.*?)(?=(['+'\uFDD0\uFDD1\uFDD3\uFDD4'+r']|\\ili[\d\s]|<lb\b|<title\b|<item\b))', '<item type="x-indent-1" subType="x-introduction">\uFDE0'+r'\1'+'\uFDE0</item>', osis, flags=re.DOTALL)
@@ -684,9 +684,12 @@
relaxedConformance -- Boolean value indicating whether to process non-standard & deprecated USFM tags.
"""
+ paragraphregex = 'pc|pr|m|pmo|pm|pmc|pmr|pi|pi1|pi2|pi3|pi4|pi5|mi|nb'
+ if relaxedConformance:
+ paragraphregex += '|phi|ps|psi|p1|p2|p3|p4|p5'
# \p(_text...)
- osis = re.sub(r'\\p\s+(.*?)(?=(\\(i?m|i?p|nb|lit|cls|tr)|<chapter eID|</?div\b|<(p|closer)\b))', lambda m: '\uFDD3<p>\n' + m.group(1) + '\uFDD3</p>\n', osis, flags=re.DOTALL)
+ osis = re.sub(r'\\p\s+(.*?)(?=(\\(i?m|i?p|lit|cls|tr|p|'+paragraphregex+r')\b|<chapter eID|</?div\b|<(p|closer)\b))', lambda m: '\uFDD3<p>\n' + m.group(1) + '\uFDD3</p>\n', osis, flags=re.DOTALL)
# \pc(_text...)
# \pr(_text...)
@@ -703,22 +706,19 @@
# \psi # deprecated
# \p# # deprecated
pType = {'pc':'x-center', 'pr':'x-right', 'm':'x-noindent', 'pmo':'x-embedded-opening', 'pm':'x-embedded', 'pmc':'x-embedded-closing', 'pmr':'x-right', 'pi':'x-indented-1', 'pi1':'x-indented-1', 'pi2':'x-indented-2', 'pi3':'x-indented-3', 'pi4':'x-indented-4', 'pi5':'x-indented-5', 'mi':'x-noindent-indented', 'nb':'x-nobreak', 'phi':'x-indented-hanging', 'ps':'x-nobreakNext', 'psi':'x-nobreakNext-indented', 'p1':'x-level-1', 'p2':'x-level-2', 'p3':'x-level-3', 'p4':'x-level-4', 'p5':'x-level-5'}
- paragraphregex = 'pc|pr|m|pmo|pm|pmc|pmr|pi|pi1|pi2|pi3|pi4|pi5|mi|nb'
- if relaxedConformance:
- paragraphregex += '|phi|ps|psi|p1|p2|p3|p4|p5'
- osis = re.sub(r'\\('+paragraphregex+r')\s+(.*?)(?=(\\(i?m|i?p|nb|lit|cls|tr)|<chapter eID|</?div\b|<(p|closer)\b))', lambda m: '\uFDD3<p type="' + pType[m.group(1)] + '">\n' + m.group(2) + '\uFDD3</p>\n', osis, flags=re.DOTALL)
+ osis = re.sub(r'\\('+paragraphregex+r')\s+(.*?)(?=(\\(i?m|i?p|lit|cls|tr|'+paragraphregex+r')\b|<chapter eID|</?div\b|<(p|closer)\b))', lambda m: '\uFDD3<p type="' + pType[m.group(1)] + '">\n' + m.group(2) + '\uFDD3</p>\n', osis, flags=re.DOTALL)
# \cls_text...
- osis = re.sub(r'\\m\s+(.+?)(?=(\\(i?m|i?p|nb|lit|cls|tr)|<chapter eID|</?div\b|<(p|closer)\b))', lambda m: '\uFDD3<closer>' + m.group(1) + '\uFDD3</closer>\n', osis, flags=re.DOTALL)
+ osis = re.sub(r'\\m\s+(.+?)(?=(\\(i?m|i?p|lit|cls|tr)\b|<chapter eID|</?div\b|<(p|closer)\b))', lambda m: '\uFDD3<closer>' + m.group(1) + '\uFDD3</closer>\n', osis, flags=re.DOTALL)
# \ph#(_text...)
# \li#(_text...)
osis = re.sub(r'\\ph\b\s*', r'\\li ', osis)
osis = re.sub(r'\\ph(\d)\b\s*', r'\\li\1 ', osis)
- osis = re.sub(r'\\li\b\s*(.*?)(?=(['+'\uFDD0\uFDD1\uFDD3\uFDD4'+r']|\\li[\d\s]|<lb\b|<title\b|<item\b))', r'<item type="x-indent-1">\1</item>', osis, flags=re.DOTALL)
- osis = re.sub(r'\\li(\d)\b\s*(.*?)(?=(['+'\uFDD0\uFDD1\uFDD3\uFDD4'+r']|\\li[\d\s]|<lb\b|<title\b|<item\b))', r'<item type="x-indent-\1">\2</item>', osis, flags=re.DOTALL)
+ osis = re.sub(r'\\li\b\s*(.*?)(?=(['+'\uFDD0\uFDD1\uFDD3\uFDD4\uFDE0\uFDE1\uFDD5\uFDD6\uFDD7\uFDD8\uFDD9\uFDDA\uFDDB\uFDDC\uFDDD\uFDDE'+r']|\\li[\d\s]|<lb\b|<title\b|<item\b|</?div\b|</?chapter\b))', r'<item type="x-indent-1">\1</item>', osis, flags=re.DOTALL)
+ osis = re.sub(r'\\li(\d)\b\s*(.*?)(?=(['+'\uFDD0\uFDD1\uFDD3\uFDD4\uFDE0\uFDE1\uFDD5\uFDD6\uFDD7\uFDD8\uFDD9\uFDDA\uFDDB\uFDDC\uFDDD\uFDDE'+r']|\\li[\d\s]|<lb\b|<title\b|<item\b|</?div\b|</?chapter\b))', r'<item type="x-indent-\1">\2</item>', osis, flags=re.DOTALL)
osis = osis.replace('\n</item>', '</item>\n')
- osis = re.sub('(<item [^\uFDD0\uFDD1\uFDD3\uFDD4\uFDE0\uFDE1]+</item>)', '\uFDD3<list>'+r'\1'+'</list>\uFDD3', osis, flags=re.DOTALL)
+ osis = re.sub('(<item [^\uFDD0\uFDD1\uFDD3\uFDD4\uFDE0\uFDE1\uFDD5\uFDD6\uFDD7\uFDD8\uFDD9\uFDDA\uFDDB\uFDDC\uFDDD\uFDDE]+</item>)', '\uFDD3<list>'+r'\1'+'</list>\uFDD3', osis, flags=re.DOTALL)
# \b
osis = re.sub(r'\\b\b\s?', '<lb type="x-p"/>', osis)
@@ -737,6 +737,12 @@
"""
+ # \qa_text...
+ osis = re.sub(r'\\qa\s+(.+)', '\uFDD4<title type="acrostic">'+r'\1</title>', osis)
+
+ # \qac_text...\qac*
+ osis = re.sub(r'\\qac\s+(.+?)\\qac\*', r'<hi type="acrostic">\1</hi>', osis, flags=re.DOTALL)
+
# \qs_(Selah)\qs*
osis = re.sub(r'\\qs\b\s(.+?)\\qs\*', r'<l type="selah">\1</l>', osis, flags=re.DOTALL)
@@ -756,12 +762,6 @@
# \b
osis = re.sub('(<lg>.+?</lg>)', lambda m: m.group(1).replace('<lb type="x-p"/>', '</lg><lg>'), osis, flags=re.DOTALL) # re-handle \b that occurs within <lg>
- # \qa_text...
- osis = re.sub(r'\\qa\s+(.+)', '\uFDD4<title type="acrostic">'+r'\1</title>', osis)
-
- # \qac_text...\qac*
- osis = re.sub(r'\\qac\s+(.+?)\\qac\*', r'<hi type="acrostic">\1</hi>', osis, flags=re.DOTALL)
-
return osis
@@ -786,6 +786,8 @@
tType = {'th':' role="label"', 'thr':' role="label" type="x-right"', 'tc':'', 'tcr':' type="x-right'}
osis = re.sub(r'\\(thr?|tcr?)\d*\b\s*(.*?)(?=(\\t[hc]|</row))', lambda m: '<cell' + tType[m.group(1)] + '>' + m.group(2) + '</cell>', osis, flags=re.DOTALL)
+ osis = re.sub(r'(<row>.*?</row>)(?=(['+'\uFDD0\uFDD1\uFDD3\uFDD4'+r']|\\tr\s|<lb\b|<title\b))', r'<table>\1</table>', osis, flags=re.DOTALL)
+
return osis
@@ -966,7 +968,7 @@
osis = re.sub(r'\\k\s+(.+?)\\k\*', r'<seg type="keyword">\1</seg>', osis, flags=re.DOTALL)
# \lit
- osis = re.sub(r'\\lit\s+(.*?)(?=(\\(i?m|i?p|nb|lit|cls|tr)|<chapter eID|</?div\b|<(p|closer)\b))', lambda m: '\uFDD3<p type="x-liturgical">\n' + m.group(1) + '\uFDD3</p>\n', osis, flags=re.DOTALL)
+ osis = re.sub(r'\\lit\s+(.*?)(?=(\\(i?m|i?p|nb|lit|cls|tr)\b|<chapter eID|</?div\b|<(p|closer)\b))', lambda m: '\uFDD3<p type="x-liturgical">\n' + m.group(1) + '\uFDD3</p>\n', osis, flags=re.DOTALL)
# \dc_...\dc* # TODO: Find an example---should this really be transChange?
osis = re.sub(r'\\dc\b\s*(.+?)\\dc\*', r'<transChange type="added" editions="dc">\1</transChange>', osis, flags=re.DOTALL)
@@ -1277,10 +1279,12 @@
osis = re.sub('(\uFDD3</p>\n?\uFDD3<p>)\n?(<verse eID=.+?>\uFDD2)\n?', r'\2'+'\n'+r'\1'+'\n', osis)
osis = re.sub('\n(<verse eID=.+?>\uFDD2)', r'\1'+'\n', osis)
osis = re.sub('\n*(<l.+?>)(<verse eID=.+?>[\uFDD2\n]*<verse osisID=.+?>)', r'\2\1', osis)
+ osis = re.sub('(</l>)(<note .+?</note>)', r'\2\1', osis)
# delete attributes from end tags (since they are invalid)
osis = re.sub(r'(</[^\s>]+) [^>]*>', r'\1>', osis)
osis = osis.replace('<lb type="x-p"/>', '<lb/>')
+
# delete Unicode non-characters
for c in '\uFDD0\uFDD1\uFDD2\uFDD3\uFDD4\uFDD5\uFDD6\uFDD7\uFDD8\uFDD9\uFDDA\uFDDB\uFDDC\uFDDD\uFDDE\uFDDF\uFDE0\uFDE1\uFDE2\uFDE3\uFDE4\uFDE5\uFDE6\uFDE7\uFDE8\uFDE9\uFDEA\uFDEB\uFDEC\uFDED\uFDEE\uFDEF':
osis = osis.replace(c, '')
@@ -1337,8 +1341,8 @@
osis = cvtStudyBibleContent(osis, relaxedConformance)
osis = cvtPrivateUseExtensions(osis, relaxedConformance)
- #osis = processOsisIDs(osis)
- #osis = osisReorderAndCleanup(osis)
+ osis = processOsisIDs(osis)
+ osis = osisReorderAndCleanup(osis)
# change type on special books
for sb in specialBooks:
More information about the sword-cvs
mailing list