[sword-svn] r371 - trunk/modules/python
chrislit at crosswire.org
chrislit at crosswire.org
Thu Aug 9 22:35:45 MST 2012
Author: chrislit
Date: 2012-08-09 22:35:45 -0700 (Thu, 09 Aug 2012)
New Revision: 371
Modified:
trunk/modules/python/usfm2osis.py
Log:
bug fixes
improved handling of \ide, fixed backoff to utf-8, added warning message
added more endtag-stripping for -r relaxed mode
added -d debug mode
Modified: trunk/modules/python/usfm2osis.py
===================================================================
--- trunk/modules/python/usfm2osis.py 2012-08-10 01:21:19 UTC (rev 370)
+++ trunk/modules/python/usfm2osis.py 2012-08-10 05:35:45 UTC (rev 371)
@@ -315,8 +315,8 @@
def cvtIntroductions(osis, relaxedConformance):
"""
Introductions
- supported: \imt#, \is#, \ip, \ipi, \im, \imi, \ipq, \imq, \ipr, \iq#, \ib, \ili, \iot, \io#, \ior...\ior*, \iex, \iqt...\iqt*, \imte, \ie
- NB: tags are generally 'supported' to the degree that their non-introduction equivalents are supported
+ supported:
+ unsupported: \imt#, \is#, \ip, \ipi, \im, \imi, \ipq, \imq, \ipr, \iq#, \ib, \ili, \iot, \io#, \ior...\ior*, \iex, \iqt...\iqt*, \imte, \ie
"""
# \imt#
# \is#
@@ -350,7 +350,7 @@
Titles, Headings, and Labels
supported: \mt#, \mte#, \ms#, \mr, \s#, \sr, \r, \rq...\rq*, \d, \sp
"""
- # \ms#_text... ###TESTED### ##NB: supports only \ms1 to \ms3
+ # \ms#_text... ###TESTED###
osis = re.sub(r'\\ms1?\s+(.+)', lambda m: u'<div type="majorSection"><title>' + m.group(1) + '</title>', osis)
osis = re.sub(u'([^]+)', r'\1'+u'</div>\n', osis, re.DOTALL)
osis = re.sub(r'\\ms2\s+(.+)', lambda m: u'<div type="majorSection" n="2"><title>' + m.group(1) + '</title>', osis)
@@ -365,7 +365,7 @@
# \mr_text...
osis = re.sub(r'\\mr\s+(.+)', u'<title type="scope"><reference>'+r'\1</reference></title>', osis)
- # \s#_text... ###TESTED### ##NB: supports only \s1 to \s3
+ # \s#_text... ###TESTED###
osis = re.sub(r'\\s1?\s+(.+)', lambda m: u'<div type="section"><title>' + m.group(1) + '</title>', osis)
osis = re.sub(u'([^]+)', r'\1'+u'</div>\n', osis, re.DOTALL)
if relaxedConformance:
@@ -577,9 +577,14 @@
note = re.sub(r'\\fv\b\s(.+?)(?=(\\f|'+u'))', u''+r'<hi type="super">\1</hi>', note)
if relaxedConformance:
- note = note.replace(r'\ft*', r'')
note = note.replace(r'\fq*', r'')
note = note.replace(r'\fqa*', r'')
+ note = note.replace(r'\ft*', r'')
+ note = note.replace(r'\fr*', r'')
+ note = note.replace(r'\fk*', r'')
+ note = note.replace(r'\fl*', r'')
+ note = note.replace(r'\fp*', r'')
+ note = note.replace(r'\fv*', r'')
note = note.replace(u'', '')
return note
@@ -629,8 +634,10 @@
note = re.sub(r'\\xk\b\s(.+?)(?=(\\x|'+u'))', u''+r'<catchWord>\1</catchWord>', note)
if relaxedConformance:
+ note = note.replace(r'\xq*', r'')
note = note.replace(r'\xt*', r'')
- note = note.replace(r'\xq*', r'')
+ note = note.replace(r'\xo*', r'')
+ note = note.replace(r'\xk*', r'')
note = note.replace(u'', '')
return note
@@ -749,23 +756,23 @@
"""
# \fig DESC|FILE|SIZE|LOC|COPY|CAP|REF\fig*
def makeFigure(matchObject):
- fig_desc,fig_file,fig_size,fig_loc,fig_copy,fig_cap,fig_ref = matchObject
+ fig_desc,fig_file,fig_size,fig_loc,fig_copy,fig_cap,fig_ref = matchObject.groups()
figure = '<figure'
if fig_file:
- figure += ' src="' + matchObject.group('fig_file') + '"'
+ figure += ' src="' + fig_file + '"'
if fig_size:
- figure += ' size="' + matchObject.group('fig_size') + '"'
+ figure += ' size="' + fig_size + '"'
if fig_copy:
- figure += ' rights="' + matchObject.group('fig_copy') + '"'
+ figure += ' rights="' + fig_copy + '"'
""" TODO: implement parsing in osisParse(Bible reference string)
if fig_ref:
- figure += ' annotateRef="' + osisParse(matchObject.group('fig_ref')) + '"'
+ figure += ' annotateRef="' + osisParse(fig_ref) + '"'
"""
figure += '>\n'
if fig_cap:
- figure += '<caption>' + matchObject.group('fig_cap') + '</caption>\n'
+ figure += '<caption>' + fig_cap + '</caption>\n'
if fig_ref:
- figure += '<reference>' + matchObject.group('fig_ref') + '</reference>\n'
+ figure += '<reference>' + fig_ref + '</reference>\n'
if fig_desc:
figure += '<!-- fig DESC - ' + fig_desc + ' -->\n'
if fig_loc:
@@ -923,12 +930,13 @@
# \ide_<ENCODING>
encoding = re.search(r'\\ide\s+(.+)\n', osis)
if encoding:
- encoding = encoding.group(1).lower()
+ encoding = encoding.group(1).lower().strip()
if encoding != 'utf-8':
if encoding in aliases:
osis = codecs.open(sFile, 'r', encoding).read().strip() + '\n'
else:
- print('Encoding unknown, processing as UTF-8.')
+ print('WARNING: Encoding "' + encoding + '" unknown, processing ' + sFile + ' as UTF-8.')
+ encoding = 'utf-8'
# call individual conversion processors in series
@@ -981,6 +989,7 @@
print('')
print('Usage: usfm2osis.py <osisWork> [OPTION] ... <USFM filename|wildcard> ...')
print('')
+ print(' -d debug mode (single-threaded, verbose output')
print(' -e ENCODING input encoding override (default is to read the USFM file\'s')
print(' \\ide value or assume UTF-8 encoding in its absence)')
print(' -h, --help print this usage information')
@@ -1042,6 +1051,15 @@
else:
verbose = False
+ if '-d' in sys.argv:
+ debugMode = True
+ inputFilesIdx += 1
+ num_processes = 1
+ num_jobs = 1
+ verbose = True
+ else:
+ debugMode = False
+
if '-h' in sys.argv or '--help' in sys.argv or len(sys.argv) < 3:
printUsage()
else:
More information about the sword-cvs mailing list