[sword-svn] r376 - trunk/modules/python
chrislit at crosswire.org
chrislit at crosswire.org
Fri Aug 10 12:34:38 MST 2012
Author: chrislit
Date: 2012-08-10 12:34:38 -0700 (Fri, 10 Aug 2012)
New Revision: 376
Modified:
trunk/modules/python/usfm2osis.py
Log:
completed handling of non-USFM tags from style sheet & deprecated tags
Modified: trunk/modules/python/usfm2osis.py
===================================================================
--- trunk/modules/python/usfm2osis.py 2012-08-10 17:09:14 UTC (rev 375)
+++ trunk/modules/python/usfm2osis.py 2012-08-10 19:34:38 UTC (rev 376)
@@ -75,6 +75,8 @@
# is4
# is5
+# sections
+
import sys, codecs, re
from encodings.aliases import aliases
import multiprocessing, Queue
@@ -272,6 +274,8 @@
osis = osis.replace('<', '&lt;')
osis = osis.replace('>', '&gt;')
+ #osis = re.sub('\n'+r'(\\[^\s]+\b\*)', r' \1', osis)
+
return osis
@@ -561,8 +565,15 @@
# \pi#(_Sample text...)
# \mi(_text...)
# \nb
- pType = {'pc':'x-center', 'pr':'x-right', 'm':'x-noindent', 'pmo':'x-embedded-opening', 'pm':'x-embedded', 'pmc':'x-embedded-closing', 'pmr':'x-right', 'pi':'x-indented-1', 'pi1':'x-indented-1', 'pi2':'x-indented-2', 'pi3':'x-indented-3', 'pi4':'x-indented-4', 'pi5':'x-indented-5', 'mi':'x-noindent-indented', 'nb':'x-nobreak'}
- osis = re.sub(r'\\(pc|pr|m|pmo|pm|pmc|pmr|pi|pi1|pi2|pi3|pi4|pi5|mi|nb)\s+(.*?)(?=(\\(i?m|i?p|nb|lit|cls|tr)|<chapter eID|</?div\b|<(p|closer)\b))', lambda m: u'<p type="' + pType[m.group(1)] + '">\n' + m.group(2) + u'</p>\n', osis, flags=re.DOTALL)
+ # \phi # deprecated
+ # \ps # deprecated
+ # \psi # deprecated
+ # \p# # deprecated
+ pType = {'pc':'x-center', 'pr':'x-right', 'm':'x-noindent', 'pmo':'x-embedded-opening', 'pm':'x-embedded', 'pmc':'x-embedded-closing', 'pmr':'x-right', 'pi':'x-indented-1', 'pi1':'x-indented-1', 'pi2':'x-indented-2', 'pi3':'x-indented-3', 'pi4':'x-indented-4', 'pi5':'x-indented-5', 'mi':'x-noindent-indented', 'nb':'x-nobreak', 'phi':'x-indented-hanging', 'ps':'x-nobreakNext', 'psi':'x-nobreakNext-indented', 'p1':'x-level-1', 'p2':'x-level-2', 'p3':'x-level-3', 'p4':'x-level-4', 'p5':'x-level-5'}
+ paragraphregex = 'pc|pr|m|pmo|pm|pmc|pmr|pi|pi1|pi2|pi3|pi4|pi5|mi|nb'
+ if relaxedConformance:
+ paragraphregex += '|phi|ps|psi|p1|p2|p3|p4|p5'
+ osis = re.sub(r'\\('+paragraphregex+r')\s+(.*?)(?=(\\(i?m|i?p|nb|lit|cls|tr)|<chapter eID|</?div\b|<(p|closer)\b))', lambda m: u'<p type="' + pType[m.group(1)] + '">\n' + m.group(2) + u'</p>\n', osis, flags=re.DOTALL)
# \cls_text...
osis = re.sub(r'\\m\s+(.+?)(?=(\\(i?m|i?p|nb|lit|cls|tr)|<chapter eID|</?div\b|<(p|closer)\b))', lambda m: u'<closer>' + m.group(1) + u'</closer>\n', osis, flags=re.DOTALL)
@@ -579,13 +590,6 @@
# \b
osis = re.sub(r'\\b\b\s?', '<lb type="x-p"/>', osis)
- if relaxedConformance:
- # TODO: \phi: DEP: Paragraph text, indented with hanging indent
- # TODO: \ps: DEP: Paragraph text, no break with next paragraph text at chapter boundary
- # TODO: \psi: DEP: Paragraph text, indented, with no break with next paragraph text (at chapter boundary)
- # TODO: \p#: Front or back matter text paragraph, level # (if multiple levels)
- pass
-
return osis
@@ -598,17 +602,17 @@
osis = re.sub(r'\\qs\b\s(.+?)\\qs\*', r'<l type="selah">\1</l>', osis, flags=re.DOTALL)
# \q#(_text...)
- osis = re.sub(r'\\q\b\s*(.*?)(?=(['+u''+r']|\\q[\d\s]|\\fig|<l\b|<lb\b|<title\b))', r'<l level="1">\1</l>', osis, flags=re.DOTALL)
- osis = re.sub(r'\\q(\d)\b\s*(.*?)(?=(['+u''+r']|\\q[\d\s]|\\fig|<l\b|<lb\b|<title\b))', r'<l level="\1">\2</l>', osis, flags=re.DOTALL)
+ osis = re.sub(r'\\q\b\s*(.*?)(?=(['+u''+r']|\\q[\d\s]|\\fig|<l\b|<lb\b|<title\b|<list\b|</?div\b))', r'<l level="1">\1</l>', osis, flags=re.DOTALL)
+ osis = re.sub(r'\\q(\d)\b\s*(.*?)(?=(['+u''+r']|\\q[\d\s]|\\fig|<l\b|<lb\b|<title\b|<list\b|</?div\b))', r'<l level="\1">\2</l>', osis, flags=re.DOTALL)
# \qr_text...
# \qc_text...
# \qm#(_text...)
qType = {'qr':'x-right', 'qc':'x-center', 'qm':'x-embedded" level="1', 'qm1':'x-embedded" level="1', 'qm2':'x-embedded" level="2', 'qm3':'x-embedded" level="3', 'qm4':'x-embedded" level="4', 'qm5':'x-embedded" level="5'}
- osis = re.sub(r'\\(qr|qc|qm\d)\b\s*(.*?)(?=(['+u''+r']|\\q[\d\s]|\\fig|<l\b|<lb\b|<title\b))', lambda m: '<l type="' + qType[m.group(1)] + '">' + m.group(2) + '</l>', osis, flags=re.DOTALL)
+ osis = re.sub(r'\\(qr|qc|qm\d)\b\s*(.*?)(?=(['+u''+r']|\\q[\d\s]|\\fig|<l\b|<lb\b|<title\b|<list\b|</?div\b))', lambda m: '<l type="' + qType[m.group(1)] + '">' + m.group(2) + '</l>', osis, flags=re.DOTALL)
osis = osis.replace('\n</l>', '</l>\n')
- osis = re.sub(u'(<l [^]+</l>)', r'<lg>\1</lg>', osis, flags=re.DOTALL)
+ osis = re.sub(u'(<l [^]+</l>)', r'<lg>\1</lg>', osis, flags=re.DOTALL)
# \b
osis = re.sub('(<lg>.+?</lg>)', lambda m: m.group(1).replace('<lb type="x-p"/>', '</lg><lg>'), osis, flags=re.DOTALL) # re-handle \b that occurs within <lg>
@@ -729,9 +733,11 @@
note = re.sub(r'\\xt\b\s(.+?)(?=(\\x|'+u'))', u''+r'<reference>\1</reference>', note)
if relaxedConformance:
- # TODO: \xtSee..\xtSee*: Concordance and Names Index markup for an alternate entry target reference.
- # TODO: \xtSeeAlso...\xtSeeAlso: Concordance and Names Index markup for an additional entry target reference.
- pass
+ # TODO: move this to a concordance/index-specific section?
+ # \xtSee..\xtSee*: Concordance and Names Index markup for an alternate entry target reference.
+ note = re.sub(r'\\xtSee\b\s(.+?)\\xtSee\b\*', u''+r'<reference osisRef="\1">See: \1</reference>', note)
+ # \xtSeeAlso...\xtSeeAlso*: Concordance and Names Index markup for an additional entry target reference.
+ note = re.sub(r'\\xtSeeAlso\b\s(.+?)\\xtSeeAlso\b\*', u''+r'<reference osisRef="\1">See also: \1</reference>', note)
if relaxedConformance:
note = note.replace(r'\xq*', '')
@@ -804,9 +810,15 @@
osis = re.sub(r'\\sls\b\s*(.+?)\\sls\*', r'<foreign>/1</foreign>', osis, flags=re.DOTALL) # find a better mapping than <foreign>?
if relaxedConformance:
- # TODO: \addpn...\addpn*: For chinese words to be dot underline & underline
- # TODO: \k#: Concordance main entry text or keyword, level #
- pass
+ # \addpn...\addpn*
+ osis = re.sub(r'\\addpn\s+(.+?)\\addpn\*', r'<hi type="x-dotUndeline">\1</hi>', osis, flags=re.DOTALL)
+ # \k# # TODO: unsure of this tag's purpose
+ osis = re.sub(r'\\k1\s+(.+?)\\k1\*', r'<seg type="keyword" n="1">\1</seg>', osis, flags=re.DOTALL)
+ osis = re.sub(r'\\k2\s+(.+?)\\k2\*', r'<seg type="keyword" n="2">\1</seg>', osis, flags=re.DOTALL)
+ osis = re.sub(r'\\k3\s+(.+?)\\k3\*', r'<seg type="keyword" n="3">\1</seg>', osis, flags=re.DOTALL)
+ osis = re.sub(r'\\k4\s+(.+?)\\k4\*', r'<seg type="keyword" n="4">\1</seg>', osis, flags=re.DOTALL)
+ osis = re.sub(r'\\k5\s+(.+?)\\k5\*', r'<seg type="keyword" n="5">\1</seg>', osis, flags=re.DOTALL)
+
return osis
@@ -902,8 +914,8 @@
osis = re.sub(r'\\wh\s+(.+?)(\s*)\\wh\*', r'\1<index index="Hebrew" level1="\1"/>\2', osis, flags=re.DOTALL)
if relaxedConformance:
- # TODO: \wr...\wr*: OBS: Auxiliary - Wordlist/Glossary Reference
- pass
+ # \wr...\wr*
+ osis = re.sub(r'\\wr\s+(.+?)(\s*)\\wr\*', r'\1<index index="Reference" level1="\1"/>\2', osis, flags=re.DOTALL)
return osis
@@ -959,15 +971,18 @@
supported: \z<Extension>
We can't really know what these mean, but will preserve them as <milestone/> elements.
"""
- if relaxedConformance:
- # publishing assistant markers
- # \zpa-xb...\zpa-xb* : \periph Book
- # \zpa-xc...\zpa-xc* : \periph Chapter
- # \zpa-xv...\zpa-xv* : \periph Verse
- # \zpa-xd...\zpa-xd* : \periph Description
- pass
+ # publishing assistant markers
+ # \zpa-xb...\zpa-xb* : \periph Book
+ # \zpa-xc...\zpa-xc* : \periph Chapter
+ # \zpa-xv...\zpa-xv* : \periph Verse
+ # \zpa-xd...\zpa-xd* : \periph Description
+ # TODO: Decide how these should actually be encoded. In lieu of that,
+ # these can all be handled by the default \z Namespace handlers:
- # \z
+ # \z{X}...\z{X}*
+ osis = re.sub(r'\z([^\s]+)\s(.+?)(\z\1\*)', r'<seg type="x-\1">\2</seg>', osis, flags=re.DOTALL)
+
+ # \z{X}
osis = re.sub(r'\\z([^\s]+)', r'<milestone type="x-usfm-z-\1"/>', osis)
return osis
More information about the sword-cvs
mailing list