[sword-svn] r397 - trunk/modules/python

Sun Aug 26 15:25:30 MST 2012

Author: chrislit
Date: 2012-08-26 15:25:30 -0700 (Sun, 26 Aug 2012)
New Revision: 397

Modified:
   trunk/modules/python/usfm2osis.py
Log:
added a scan of the USFM to capture id & toc3 fields for a global variable, allowing book sorting to work correctly
made book sorting a one-time event that applies immediately after this scan and now applies to the processing stage, so that printed feedback about processing is an indicator of the eventual output book order


Modified: trunk/modules/python/usfm2osis.py
===================================================================

--- trunk/modules/python/usfm2osis.py	2012-08-26 09:28:04 UTC (rev 396)
+++ trunk/modules/python/usfm2osis.py	2012-08-26 22:25:30 UTC (rev 397)
@@ -319,18 +319,13 @@
 """
 
 def keycanon(filename):
-    if filename2osis:
-        return canonicalOrder.index(filename2osis[filename])
-    else:
-        return keynat(filename)
+    global filename2osis
+    return canonicalOrder.index(filename2osis[filename])
 
 def keyusfm(filename):
-    if filename2osis:
-        return usfmNumericOrder.index(filename2osis[filename])
-    else:
-        return keynat(filename)
+    return usfmNumericOrder.index(filename2osis[filename])
 
-def convertToOSIS(sFile):
+def convertToOsis(sFile):
     global encoding
     global relaxedConformance
 
@@ -393,14 +388,9 @@
         Identification
         supported: \id, \ide, \sts, \rem, \h, \toc1, \toc2, \toc3
         """
-        global loc2osisBk, osis2locBk, filename2osis
+
         # \id_<CODE>_(Name of file, Book name, Language, Last edited, Date etc.)
         osis = re.sub(r'\\id\s+([A-Z0-9]{3})\b\s*([^\\'+'\n'+']*?)'+'\n'+r'(.*)(?=\\id|$)', lambda m: u'﷐<div type="book" osisID="' + bookDict[m.group(1)] + '">\n' + (('<!-- id comment - ' + m.group(2) + ' -->\n') if m.group(2) else '') +  m.group(3) + u'</div type="book">﷐\n' , osis, flags=re.DOTALL)
-        # keep a copy of the OSIS book abbreviation for below (\toc3 processing) to store for mapping localized book names to/from OSIS
-        osisBook = re.search(r'\\id\s+([A-Z0-9]{3})', osis)
-        if osisBook:
-            osisBook = bookDict[osisBook.group(1)]
-            filename2osis[filename] = osisBook
 
         # \ide_<ENCODING>
         osis = re.sub(r'\\ide\b.*'+'\n', '', osis) # delete, since this was handled above
@@ -426,12 +416,6 @@
         osis = re.sub(r'\\toc2\b\s+(.+)\s*'+'\n', r'<milestone type="x-usfm-toc2" n="\1"/>'+'\n', osis)
 
         # \toc3_text...
-        locBook = re.search(r'\\toc3\b\s+(.+)\s*'+'\n', osis)
-        if locBook:
-            locBook = locBook.group(1)
-            if osisBook:
-                osis2locBk[osisBook]=locBook
-                loc2osisBk[locBook]=osisBook
         osis = re.sub(r'\\toc3\b\s+(.+)\s*'+'\n', lambda m: r'<milestone type="x-usfm-toc3" n="\1"/>'+'\n', osis)
 
         return osis
@@ -1173,6 +1157,40 @@
 
     return osis
 
+def readIdentifiersFromOsis(filename):
+    global encoding
+    global loc2osisBk, osis2locBk, filename2osis
+
+    ### Processing starts here
+    if encoding:
+        osis = codecs.open(filename, 'r', encoding).read().strip() + '\n'
+    else:
+        encoding = 'utf-8'
+        osis = codecs.open(filename, 'r', encoding).read().strip() + '\n'
+        # \ide_<ENCODING>
+        encoding = re.search(r'\\ide\s+(.+)'+'\n', osis)
+        if encoding:
+            encoding = encoding.group(1).lower().strip()
+            if encoding != 'utf-8':
+                if encoding in aliases:
+                    osis = codecs.open(filename, 'r', encoding).read().strip() + '\n'
+                else:
+                    #print(('WARNING: Encoding "' + encoding + '" unknown, processing ' + filename + ' as UTF-8.'))
+                    encoding = 'utf-8'
+
+    # keep a copy of the OSIS book abbreviation for below (\toc3 processing) to store for mapping localized book names to/from OSIS
+    osisBook = re.search(r'\\id\s+([A-Z0-9]+)', osis)
+    if osisBook:
+        osisBook = bookDict[osisBook.group(1)]
+        filename2osis[filename] = osisBook
+
+    locBook = re.search(r'\\toc3\b\s+(.+)\s*'+'\n', osis)
+    if locBook:
+        locBook = locBook.group(1)
+        if osisBook:
+            osis2locBk[osisBook]=locBook
+            loc2osisBk[locBook]=osisBook
+
 def verbosePrint(text):
     if verbose:
         print(text)
@@ -1222,7 +1240,7 @@
                 break
 
             # the actual processing
-            osis = convertToOSIS(job)
+            osis = convertToOsis(job)
 
             # store the result
             self.result_queue.put((job,osis))
@@ -1315,12 +1333,14 @@
 
         usfmDocList = sys.argv[inputFilesIdx:]
 
-        osisDoc = '<osis xmlns="http://www.bibletechnologies.net/2003/OSIS/namespace" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.bibletechnologies.net/2003/OSIS/namespace http://www.bibletechnologies.net/osisCore.'+osisVersion+'.xsd">\n<osisText osisRefWork="Bible" xml:lang="und" osisIDWork="' + osisWork + '">\n<header>\n<work osisWork="' + osisWork + '"/>\n</header>\n'
+        for filename in usfmDocList:
+            readIdentifiersFromOsis(filename)
+        usfmDocList = sorted(usfmDocList, key=sortKey)
 
         # run
         # load up work queue
         work_queue = multiprocessing.Queue()
-        for job in sorted(usfmDocList, key=sortKey):
+        for job in usfmDocList:
             work_queue.put(job)
 
         # create a queue to pass to workers to store the results
@@ -1338,8 +1358,11 @@
             osisSegment[k]=v
 
         
+        verbosePrint('Assembling OSIS document...')
+        osisDoc = '<osis xmlns="http://www.bibletechnologies.net/2003/OSIS/namespace" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.bibletechnologies.net/2003/OSIS/namespace http://www.bibletechnologies.net/osisCore.'+osisVersion+'.xsd">\n<osisText osisRefWork="Bible" xml:lang="und" osisIDWork="' + osisWork + '">\n<header>\n<work osisWork="' + osisWork + '"/>\n</header>\n'
+
         unhandledTags = set()
-        for doc in sorted(usfmDocList, key=sortKey):
+        for doc in usfmDocList:
             unhandledTags |= set(re.findall(r'(\\[^\s\*]+?\b\*?)', osisSegment[doc]))
             osisDoc += osisSegment[doc]