[sword-svn] r371 - trunk/modules/python

Thu Aug 9 22:35:45 MST 2012

Author: chrislit
Date: 2012-08-09 22:35:45 -0700 (Thu, 09 Aug 2012)
New Revision: 371

Modified:
   trunk/modules/python/usfm2osis.py
Log:
bug fixes
improved handling of \ide, fixed fallback to UTF-8 for unknown encodings, added warning message
added more end-tag stripping for -r relaxed mode
added -d debug mode


Modified: trunk/modules/python/usfm2osis.py
===================================================================

--- trunk/modules/python/usfm2osis.py	2012-08-10 01:21:19 UTC (rev 370)
+++ trunk/modules/python/usfm2osis.py	2012-08-10 05:35:45 UTC (rev 371)
@@ -315,8 +315,8 @@
     def cvtIntroductions(osis, relaxedConformance):
         """
         Introductions
-        supported: \imt#, \is#, \ip, \ipi, \im, \imi, \ipq, \imq, \ipr, \iq#, \ib, \ili, \iot, \io#, \ior...\ior*, \iex, \iqt...\iqt*, \imte, \ie
-        NB: tags are generally 'supported' to the degree that their non-introduction equivalents are supported
+        supported:
+        unsupported: \imt#, \is#, \ip, \ipi, \im, \imi, \ipq, \imq, \ipr, \iq#, \ib, \ili, \iot, \io#, \ior...\ior*, \iex, \iqt...\iqt*, \imte, \ie
         """
         # \imt#
         # \is#
@@ -350,7 +350,7 @@
         Titles, Headings, and Labels
         supported: \mt#, \mte#, \ms#, \mr, \s#, \sr, \r, \rq...\rq*, \d, \sp
         """
-        # \ms#_text...  ###TESTED###  ##NB: supports only \ms1 to \ms3
+        # \ms#_text...  ###TESTED###
         osis = re.sub(r'\\ms1?\s+(.+)', lambda m: u'﷕<div type="majorSection"><title>' + m.group(1) + '</title>', osis)
         osis = re.sub(u'(﷕[^﷕﷐]+)', r'\1'+u'</div>﷕\n', osis, re.DOTALL)
         osis = re.sub(r'\\ms2\s+(.+)', lambda m: u'﷖<div type="majorSection" n="2"><title>' + m.group(1) + '</title>', osis)
@@ -365,7 +365,7 @@
         # \mr_text...
         osis = re.sub(r'\\mr\s+(.+)', u'﷔<title type="scope"><reference>'+r'\1</reference></title>', osis)
 
-        # \s#_text...  ###TESTED###  ##NB: supports only \s1 to \s3
+        # \s#_text...  ###TESTED###
         osis = re.sub(r'\\s1?\s+(.+)', lambda m: u'﷚<div type="section"><title>' + m.group(1) + '</title>', osis)
         osis = re.sub(u'(﷚[^﷕﷐﷖﷗﷘﷙﷚]+)', r'\1'+u'</div>﷚\n', osis, re.DOTALL)
         if relaxedConformance:
@@ -577,9 +577,14 @@
         note = re.sub(r'\\fv\b\s(.+?)(?=(\\f|'+u'﷟))', u'﷟'+r'<hi type="super">\1</hi>', note)
 
         if relaxedConformance:
-            note = note.replace(r'\ft*', r'')
             note = note.replace(r'\fq*', r'')
             note = note.replace(r'\fqa*', r'')
+            note = note.replace(r'\ft*', r'')
+            note = note.replace(r'\fr*', r'')
+            note = note.replace(r'\fk*', r'')
+            note = note.replace(r'\fl*', r'')
+            note = note.replace(r'\fp*', r'')
+            note = note.replace(r'\fv*', r'')
 
         note = note.replace(u'﷟', '')
         return note
@@ -629,8 +634,10 @@
         note = re.sub(r'\\xk\b\s(.+?)(?=(\\x|'+u'﷟))', u'﷟'+r'<catchWord>\1</catchWord>', note)
 
         if relaxedConformance:
+            note = note.replace(r'\xq*', r'')
             note = note.replace(r'\xt*', r'')
-            note = note.replace(r'\xq*', r'')
+            note = note.replace(r'\xo*', r'')
+            note = note.replace(r'\xk*', r'')
 
         note = note.replace(u'﷟', '')
         return note
@@ -749,23 +756,23 @@
         """
         # \fig DESC|FILE|SIZE|LOC|COPY|CAP|REF\fig*
         def makeFigure(matchObject):
-            fig_desc,fig_file,fig_size,fig_loc,fig_copy,fig_cap,fig_ref = matchObject
+            fig_desc,fig_file,fig_size,fig_loc,fig_copy,fig_cap,fig_ref = matchObject.groups()
             figure = '<figure'
             if  fig_file:
-                figure += ' src="' + matchObject.group('fig_file') + '"'
+                figure += ' src="' + fig_file + '"'
             if fig_size:
-                figure += ' size="' + matchObject.group('fig_size') + '"'
+                figure += ' size="' + fig_size + '"'
             if fig_copy:
-                figure += ' rights="' + matchObject.group('fig_copy') + '"'
+                figure += ' rights="' + fig_copy + '"'
             """ TODO: implement parsing in osisParse(Bible reference string)
             if fig_ref:
-                figure += ' annotateRef="' + osisParse(matchObject.group('fig_ref')) + '"'
+                figure += ' annotateRef="' + osisParse(fig_ref) + '"'
             """
             figure += '>\n'
             if fig_cap:
-                figure += '<caption>' + matchObject.group('fig_cap') + '</caption>\n'
+                figure += '<caption>' + fig_cap + '</caption>\n'
             if fig_ref:
-                figure += '<reference>' + matchObject.group('fig_ref') + '</reference>\n'
+                figure += '<reference>' + fig_ref + '</reference>\n'
             if fig_desc:
                 figure += '<!-- fig DESC - ' + fig_desc + ' -->\n'
             if fig_loc:
@@ -923,12 +930,13 @@
         # \ide_<ENCODING>
         encoding = re.search(r'\\ide\s+(.+)\n', osis)
         if encoding:
-            encoding = encoding.group(1).lower()
+            encoding = encoding.group(1).lower().strip()
             if encoding != 'utf-8':
                 if encoding in aliases:
                     osis = codecs.open(sFile, 'r', encoding).read().strip() + '\n'
                 else:
-                    print('Encoding unknown, processing as UTF-8.')
+                    print('WARNING: Encoding "' + encoding + '" unknown, processing ' + sFile + ' as UTF-8.')
+                    encoding = 'utf-8'
 
 
     # call individual conversion processors in series
@@ -981,6 +989,7 @@
     print('')
     print('Usage: usfm2osis.py <osisWork> [OPTION] ...  <USFM filename|wildcard> ...')
     print('')
+    print('  -d               debug mode (single-threaded, verbose output')
     print('  -e ENCODING      input encoding override (default is to read the USFM file\'s')
     print('                     \\ide value or assume UTF-8 encoding in its absence)')
     print('  -h, --help       print this usage information')
@@ -1042,6 +1051,15 @@
     else:
         verbose = False
 
+    if '-d' in sys.argv:
+        debugMode = True
+        inputFilesIdx += 1
+        num_processes = 1
+        num_jobs = 1
+        verbose = True
+    else:
+        debugMode = False
+
     if '-h' in sys.argv or '--help' in sys.argv or len(sys.argv) < 3:
         printUsage()
     else: