[sword-svn] r360 - in trunk/modules: . python
chrislit at crosswire.org
chrislit at crosswire.org
Sat Aug 4 04:10:28 MST 2012
Author: chrislit
Date: 2012-08-04 04:10:27 -0700 (Sat, 04 Aug 2012)
New Revision: 360
Added:
trunk/modules/python/
trunk/modules/python/usfm2osis.py
trunk/modules/python/usfmtags.py
Log:
Initial commits of usfmtags.py & usfm2osis.py
Added: trunk/modules/python/usfm2osis.py
===================================================================
--- trunk/modules/python/usfm2osis.py (rev 0)
+++ trunk/modules/python/usfm2osis.py 2012-08-04 11:10:27 UTC (rev 360)
@@ -0,0 +1,1018 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+date = '$Date: 2012-03-09 01:23:40 -0800 (Fri, 09 Mar 2012) $'
+rev = '$Rev: 355 $'
+
+USFMversion = '2.35' # http://ubs-icap.org/chm/usfm/2.35/index.html
+OSISversion = '2.1.1' # http://www.bibletechnologies.net/osisCore.2.1.1.xsd
+scriptVersion = '0.5'
+
+
+# usfm2osis.py
+# Copyright 2012 by the CrossWire Bible Society <http://www.crosswire.org/>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# The full text of the GNU General Public License is available at:
+# <http://www.gnu.org/licenses/gpl-3.0.txt>.
+
+
+### Guidelines & objectives:
+# Target Python 2.7+ (but not 3)
+# Use no non-default libraries (this may change in the future)
+# Don't use SWORD bindings (this will probably change to allow *optional* use of bindings, if installed)
+# Achieve full coverage of USFM according to UBS spec:
+# <http://paratext.ubs-translations.org/about/usfm>
+# Employ best-practice conformant OSIS
+# Employ modularity (functions rather than a big long script)
+# Employ the same command-line syntax as usfm2osis.pl
+# Use & abuse Unicode tags (http://unicode.org/charts/PDF/UE0000.pdf) to simplify Regex processing
+
+### Roadmap:
+# 0.5 initial commit, including full coverage of core USFM tags
+# 0.6 test suite incorporating all USFM examples from UBS ICAP and other complex cases
+# 0.x more clean-up & re-ordering to correctly encapsulate milestones within appropriate containers
+# 0.x clean-up code: make fully OO? docstrings?
+# 1.0 feature complete for release & production use
+# 1.x xreffix.pl-functionality (osisParse(ref)), requiring SWORD bindings
+# 1.x SWORD-mode output?
+# 1.x IMP output?
+# 1.x SWORD module output?, requiring SWORD bindings
+
+
+import sys, codecs, re
+from encodings.aliases import aliases
+import multiprocessing, Queue
+
+# Reduce the SVN keyword strings assigned above to bare values: drop the '$' delimiters,
+# then slice out the ISO date (characters 6-15, e.g. '2012-03-09') and the text after 'Rev: ' (e.g. '355').
+date = date.replace('$', '').strip()[6:16]
+rev = rev.replace('$', '').strip()[5:]
+
+# Maps three-character USFM book codes (as they appear after \id) to the book identifiers
+# written into osisID attributes by cvtIdentification.
+bookDict = {
+ ### Known USFM Book codes from Paratext
+ # OT
+ 'GEN':'Gen', 'EXO':'Exod', 'LEV':'Lev', 'NUM':'Num', 'DEU':'Deut', 'JOS':'Josh', 'JDG':'Judg', 'RUT':'Ruth',
+ '1SA':'1Sam', '2SA':'2Sam', '1KI':'1Kgs', '2KI':'2Kgs', '1CH':'1Chr', '2CH':'2Chr', 'EZR':'Ezra', 'NEH':'Neh',
+ 'EST':'Esth', 'JOB':'Job', 'PSA':'Ps', 'PRO':'Prov', 'ECC':'Eccl', 'SNG':'Song', 'ISA':'Isa', 'JER':'Jer',
+ 'LAM':'Lam', 'EZK':'Ezek', 'DAN':'Dan', 'HOS':'Hos', 'JOL':'Joel', 'AMO':'Amos', 'OBA':'Obad', 'JON':'Jonah',
+ 'MIC':'Mic', 'NAM':'Nah', 'HAB':'Hab', 'ZEP':'Zeph', 'HAG':'Hag', 'ZEC':'Zech', 'MAL':'Mal',
+ # NT
+ 'MAT':'Matt', 'MRK':'Mark', 'LUK':'Luke', 'JHN':'John', 'ACT':'Acts', 'ROM':'Rom', '1CO':'1Cor', '2CO':'2Cor',
+ 'GAL':'Gal', 'EPH':'Eph', 'PHP':'Phil', 'COL':'Col', '1TH':'1Thess', '2TH':'2Thess', '1TI':'1Tim', '2TI':'2Tim',
+ 'TIT':'Titus', 'PHM':'Phlm', 'HEB':'Heb', 'JAS':'Jas', '1PE':'1Pet', '2PE':'2Pet', '1JN':'1John', '2JN':'2John',
+ '3JN':'3John', 'JUD':'Jude', 'REV':'Rev',
+ # DC - Catholic
+ 'TOB':'Tob', 'JDT':'Jdt', 'ESG':'EsthGr', 'WIS':'Wis', 'SIR':'Sir', 'BAR':'Bar', 'LJE':'EpJer', 'S3Y':'PrAzar',
+ 'SUS':'Sus', 'BEL':'Bel', '1MA':'1Macc', '2MA':'2Macc',
+ # DC - Eastern Orthodox
+ '3MA':'3Macc', '4MA':'4Macc', '1ES':'1Esd', '2ES':'2Esd', 'MAN':'PrMan', 'PS2':'Ps151',
+ # Rahlfs' LXX
+ 'ODA':'Odes', 'PSS':'PssSol', 'JSA':'JoshA', 'JDB':'JudgB', 'TBS':'TobS', 'SST':'SusTh', 'DNT':'DanTh',
+ 'BLT':'BelTh',
+ # Esdrae
+ '4ES':'4Ezra', '5ES':'5Ezra', '6ES':'6Ezra',
+ # Additional non-biblical books
+ 'XXA':'XXA', 'XXB':'XXB', 'XXC':'XXC', 'XXD':'XXD', 'XXE':'XXE', 'XXF':'XXF', 'XXG':'XXG',
+ ###
+
+ ### Proposed Additions <http://lc.bfbs.org.uk/e107_files/downloads/canonicalissuesinparatext.pdf>
+ # Inconsistency with Esther
+ 'DAG':'DanGr',
+ # Alternate Psalms
+ 'PSB':'Ps',
+ # Ethiopic
+ 'JUB':'Jub', 'ENO':'1En', 'REP':'Reproof', # == Tegsas
+ '1MQ':'1Meq', '2MQ':'2Meq', '3MQ':'3Meq', '4BA':'4Bar',
+ # Syriac
+ '2BA':'2Bar', 'LBA':'EpBar', 'PS3':'5ApocSyrPss',
+ # Vulgate
+ 'LAO':'EpLao', 'PSO':'PrSol', 'PJE':'PrJer',
+ # Armenian
+ 'WSI':'WSir', 'COP':'CorCorr', '3CO':'3Cor', 'EUT':'PrEut', 'DOJ':'DJohn',
+ # Apostolic Fathers
+ # NOTE(review): 'LBA' is also defined under "Syriac" above; in a dict literal the later entry wins,
+ # so 'LBA' resolves to 'Barn' and the 'EpBar' mapping is unreachable - confirm the intended codes.
+ '1CL':'1Clem', '2CL':'2Clem', 'SHE':'Herm', 'LBA':'Barn', 'DID':'Did',
+ ###
+
+ # Proposed replacements <http://lc.bfbs.org.uk/e107_files/downloads/canonicalissuesinparatext.pdf>
+ 'ODE':'Odes', 'EZA':'4Ezra', '5EZ':'5Ezra', '6EZ':'6Ezra',
+
+ # Additional biblical books
+ 'ADE':'AddEsth',
+
+ # Peripheral books
+ 'FRT':'FRONT', 'INT':'INTRODUCTION', 'BAK':'BACK', 'CNC':'CONCORDANCE', 'GLO':'GLOSSARY',
+ 'TDX':'INDEX', 'NDX':'GAZETTEER', 'OTH':'X-OTHER'
+ }
+
+# The bookDict values that belong to the "Peripheral books" entries rather than to scripture books.
+specialBooks = ['FRONT', 'INTRODUCTION', 'BACK', 'CONCORDANCE', 'GLOSSARY', 'INDEX', 'GAZETTEER', 'X-OTHER']
+
+# Maps a \periph title to the <div type="..."> value used by cvtPeripherals.
+peripherals = {
+ 'Title Page':'titlePage', 'Half Title Page':'x-halfTitlePage', 'Promotional Page':'x-promotionalPage',
+ 'Imprimatur':'imprimatur', 'Publication Data':'publicationData', 'Foreward':'x-foreward', 'Preface':'preface',
+ 'Table of Contents':'tableofContents', 'Alphabetical Contents':'x-alphabeticalContents',
+ 'Table of Abbreviations':'x-tableofAbbreviations', 'Chronology':'x-chronology',
+ 'Weights and Measures':'x-weightsAndMeasures', 'Map Index':'x-mapIndex',
+ 'NT Quotes from LXX':'x-ntQuotesFromLXX'
+ }
+
+# Maps a \periph introduction title to the suffix of the subType="x-..." value used by cvtPeripherals.
+introPeripherals = {
+ 'Bible Introduction':'bible', 'Old Testament Introduction':'oldTestament',
+ 'Pentateuch Introduction':'pentateuch', 'History Introduction':'history', 'Poetry Introduction':'poetry',
+ 'Prophecy Introduction':'prophecy', 'New Testament Introduction':'newTestament',
+ 'Gospels Introduction':'gospels', 'Acts Introduction':'acts', 'Epistles Introduction':'epistles',
+ 'Revelation Introduction':'revelation', 'Deuterocanon Introduction':'deuterocanon'
+ }
+
+# Book-name maps filled in by cvtIdentification from \toc3 (OSIS book -> localized name and the reverse).
+osis2locBk = dict()
+loc2osisBk = dict()
+# bool() evaluates to False, so verbose output starts disabled.
+verbose = bool()
+
+"""
+BEGIN PSF-licensed segment
+"""
+"""
+keynat from http://code.activestate.com/recipes/285264-natural-string-sorting/
+"""
+def keynat(string):
+    r'''Build a natural-order sort key for use with sort() and sorted().
+
+    Each run of decimal digits is folded into one integer and every other
+    character is lower-cased, so the numeric parts compare by value and the
+    text parts compare without regard to case. No regular expressions or
+    exceptions are involved.
+
+    >>> items = ('Z', 'a', '10th', '1st', '9')
+    >>> sorted(items)
+    ['10th', '1st', '9', 'Z', 'a']
+    >>> sorted(items, key=keynat)
+    ['1st', '9', '10th', 'a', 'Z']
+    '''
+    key = []
+    for ch in string:
+        if not ch.isdigit():
+            key.append(ch.lower())
+            continue
+        digit = int(ch)
+        if key and type(key[-1]) == type(1):
+            # still inside a digit run: shift the accumulated number left one place
+            key[-1] = key[-1] * 10 + digit
+        else:
+            key.append(digit)
+    return key
+"""
+END PSF-licensed segment
+"""
+
+def convertToOSIS(sFile):
+ global encoding
+ global relaxedConformance
+
+ verbosePrint('Processing: ' + sFile)
+
+ def cvtPreprocess(osis, relaxedConformance):
+     """
+     Normalize raw USFM text before any marker is converted.
+
+     Line endings are unified to LF, text lines that do not start with a
+     marker are folded into the preceding line, trailing whitespace is
+     removed from every line, and the XML special characters are escaped.
+     relaxedConformance is accepted for a uniform converter signature and
+     is not used here. Returns the normalized string.
+     """
+     # convert CR to LF (must be real control characters, and must happen before the
+     # line joining below, which only recognizes LF)
+     osis = osis.replace('\r', '\n')
+     # lines should never start with non-tags
+     osis = re.sub(r'\n\s*([^\\\s])', r' \1', osis) # TODO: test this
+     # lines should never end with whitespace (other than \n)
+     osis = re.sub(r'\s+\n', r'\n', osis)
+     # XML-encode as necessary ('&' first, so the entities produced below are not re-escaped)
+     osis = osis.replace('&', '&amp;')
+     osis = osis.replace('<', '&lt;')
+     osis = osis.replace('>', '&gt;')
+
+     return osis
+
+
+ def cvtIdentification(osis, relaxedConformance):
+     r"""
+     Identification
+     supported: \id, \ide, \sts, \rem, \h, \toc1, \toc2, \toc3
+
+     Wraps the text following \id in a book <div>, turns the remaining
+     identification markers into comments, titles or milestones, and records
+     the \toc3 name in the osis2locBk / loc2osisBk maps.
+     relaxedConformance is accepted for a uniform converter signature and is
+     not used here. Raises KeyError when an \id code is missing from bookDict.
+     """
+     global loc2osisBk, osis2locBk
+     # keep a copy of the OSIS book abbreviation for below (\toc3 processing) to store for mapping localized book names to/from OSIS
+     # (looked up before \id is rewritten; afterwards the marker is no longer present to be found)
+     osisBook = re.search(r'\\id\s+([A-Z0-9]{3})', osis)
+     if osisBook:
+         osisBook = bookDict[osisBook.group(1)]
+
+     # \id_<CODE>_(Name of file, Book name, Language, Last edited, Date etc.) ###TESTED###
+     # the attribute on the end tag is a temporary anchor for later passes
+     osis = re.sub(r'\\id\s+([A-Z0-9]{3})\b\s*([^\\\n]*?)\n(.*)(?=\\id|$)', lambda m: u'<div type="book" osisID="' + bookDict[m.group(1)] + '">\n' + (('<!-- id comment - ' + m.group(2) + ' -->\n') if m.group(2) else '') + m.group(3) + u'</div type="book">\n', osis, flags=re.DOTALL)
+
+     # \ide_<ENCODING> ###TESTED###
+     osis = re.sub(r'\\ide\b.*\n', r'', osis) # delete, since this was handled above
+
+     # \sts_<STATUS CODE>
+     osis = re.sub(r'\\sts\b\s+(.+)\s*\n', r'<milestone type="x-sts" n="\1"/>\n', osis)
+
+     # \rem_text... ###TESTED###
+     osis = re.sub(r'\\rem\b\s+(.+)', r'<!-- rem - \1 -->', osis)
+
+     # \h#_text... ###TESTED###
+     osis = re.sub(r'\\h\b\s+(.+)\s*\n', r'<title type="runningHead">\1</title>\n', osis)
+
+     # \toc1_text...
+     osis = re.sub(r'\\toc1\b\s+(.+)\s*\n', r'<milestone type="x-toc1" n="\1"/>\n', osis)
+
+     # \toc2_text...
+     osis = re.sub(r'\\toc2\b\s+(.+)\s*\n', r'<milestone type="x-toc2" n="\1"/>\n', osis)
+
+     # \toc3_text...
+     locBook = re.search(r'\\toc3\b\s+(.+)\s*\n', osis)
+     if locBook:
+         locBook = locBook.group(1)
+         if osisBook:
+             osis2locBk[osisBook]=locBook
+             loc2osisBk[locBook]=osisBook
+     # plain template (not a function) so that \1 and \n are expanded by re.sub
+     osis = re.sub(r'\\toc3\b\s+(.+)\s*\n', r'<milestone type="x-toc3" n="\1"/>\n', osis)
+
+     return osis
+
+
+ def cvtIntroductions(osis, relaxedConformance):
+ """
+ Introductions
+ supported: \imt#, \is#, \ip, \ipi, \im, \imi, \ipq, \imq, \ipr, \iq#, \ib, \ili, \iot, \io#, \ior...\ior*, \iex, \iqt...\iqt*, \imte, \ie
+ NB: tags are 'supported' to the degree that their non-introduction equivalents are supported
+ """
+ # relaxedConformance is not used in this converter.
+ # \imt#
+ # \is# ###TESTED###
+ # \ip ###TESTED###
+ # \ipi
+ # \im
+ # \imi
+ # \ipq
+ # \imq
+ # \ipr
+ # \iq#
+ # \ib
+ # \ili
+ # \iot
+ # \io#
+ # \ior...\ior*
+ # \iex
+ # \iqt...\iqt*
+ # \imte
+ # \ie
+ # encapsulate introduction elements in a <div>
+ # The wrapped span starts at an introduction marker and stops before the next line whose marker begins with c, s, m, p or d.
+ # NOTE(review): in 'or*' and 'qt*' the asterisk is an unescaped quantifier (zero or more 'r' / 't'), not a literal '*';
+ # 's|d\d+' also sits where the docstring lists \is# - confirm both against the intended marker list.
+ osis = re.sub(r'(\\i(mt|mt\d+|s|d\d+|p|pi|m|mi|pq|mq|pr|q|q\d+|b|li|ot|o|o\d+|or|or*|ex|qt|qt*|mte|e)\b.+?)(?=\n\\(c|s|m|p|d))', u'<div type="introduction">'+r'\1'+u'</div>\n', osis, flags=re.DOTALL)
+ # map all introduction elements to their non-introduction equivalents
+ # (drops the leading 'i' from each marker so the later converters handle them)
+ for e in [r'mt', r'mt\d+', r's', r'd\d+', r'p', r'pi', r'm', r'mi', r'pq', r'mq', r'pr', r'q', r'q\d+', r'b', r'li', r'ot', r'o', r'o\d+', r'or', r'or*', r'ex', r'qt', r'qt*', r'mte', r'e']:
+ osis = re.sub(r'\\i('+e+r')\b', r'\\\1', osis)
+ return osis
+
+
+ def cvtTitles(osis, relaxedConformance):
+ """
+ Titles, Headings, and Labels
+ supported: \mt#, \mte#, \ms#, \mr, \s#, \sr, \r, \rq...\rq*, \d, \sp
+ """
+ # Each \ms# / \s# heading opens a <div> with a <title>; the statement that follows it is meant to append the closing </div>.
+ # NOTE(review): the character classes in the u'([^]+)' patterns are empty in this copy of the source; they presumably
+ # held Unicode tag marker characters that were lost - restore them from the repository before running this code.
+ # NOTE(review): in those same calls re.DOTALL is the fourth positional argument of re.sub, which is 'count', not 'flags'.
+ # \ms#_text... ###TESTED### ##NB: supports only \ms1 to \ms3
+ osis = re.sub(r'\\ms1?\s+(.+)', lambda m: u'<div type="majorSection"><title>' + m.group(1) + '</title>', osis)
+ osis = re.sub(u'([^]+)', r'\1'+u'</div>\n', osis, re.DOTALL)
+ osis = re.sub(r'\\ms2\s+(.+)', lambda m: u'<div type="majorSection" n="2"><title>' + m.group(1) + '</title>', osis)
+ osis = re.sub(u'([^]+)', r'\1'+u'</div>\n', osis, re.DOTALL)
+ osis = re.sub(r'\\ms3\s+(.+)', lambda m: u'<div type="majorSection" n="3"><title>' + m.group(1) + '</title>', osis)
+ osis = re.sub(u'([^]+)', r'\1'+u'</div>\n', osis, re.DOTALL)
+
+ # \mr_text...
+ osis = re.sub(r'\\mr\s+(.+)', u'<title type="scope"><reference>'+r'\1</reference></title>', osis)
+
+ # \s#_text... ###TESTED### ##NB: supports only \s1 to \s3
+ osis = re.sub(r'\\s1?\s+(.+)', lambda m: u'<div type="section"><title>' + m.group(1) + '</title>', osis)
+ osis = re.sub(u'([^]+)', r'\1'+u'</div>\n', osis, re.DOTALL)
+ # in relaxed mode the non-standard \ss and \sss markers are first rewritten to \s2 and \s3
+ if relaxedConformance:
+ osis = re.sub(r'\\ss\s+', r'\\s2 ', osis)
+ osis = re.sub(r'\\sss\s+', r'\\s3 ', osis)
+ osis = re.sub(r'\\s2\s+(.+)', lambda m: u'<div type="subsection"><title>' + m.group(1) + '</title>', osis)
+ osis = re.sub(u'([^]+)', r'\1'+u'</div>\n', osis, re.DOTALL)
+ osis = re.sub(r'\\s3\s+(.+)', lambda m: u'<div type="x-subSubSection"><title>' + m.group(1) + '</title>', osis)
+ osis = re.sub(u'([^]+)', r'\1'+u'</div>\n', osis, re.DOTALL)
+
+ # \sr_text...
+ osis = re.sub(r'\\sr\s+(.+)', u'<title type="scope"><reference>'+r'\1</reference></title>', osis)
+ # \r_text...
+ osis = re.sub(r'\\r\s+(.+)', u'<title type="parallel"><reference type="parallel">'+r'\1</reference></title>', osis)
+ # \rq_text...\rq*
+ osis = re.sub(r'\\rq\s+(.+?)\\rq\*', u'<reference type="source">'+r'\1</reference>', osis, flags=re.DOTALL)
+
+ # \d_text... ###TESTED###
+ osis = re.sub(r'\\d\s+(.+)', u'<title canonical="true" type="psalm">'+r'\1</title>', osis)
+
+ # \sp_text... ###TESTED###
+ osis = re.sub(r'\\sp\s+(.+)', r'<speaker>\1</speaker>', osis)
+
+ # \mt#_text... ###TESTED###
+ # a level attribute is emitted only when the marker carries a digit
+ osis = re.sub(r'\\mt(\d?)\s+(.+)', lambda m: r'<title ' + (r'level="'+m.group(1)+r'" ' if m.group(1) else r'') + r'type="main">' + m.group(2) + r'</title>', osis)
+ # \mte#_text...
+ osis = re.sub(r'\\mte(\d?)\s+(.+)', lambda m: r'<title ' + (r'level="'+m.group(1)+r'" ' if m.group(1) else r'') + r'type="main" subType="x-end">' + m.group(2) + r'</title>', osis)
+
+ return osis
+
+
+ def cvtChaptersAndVerses(osis, relaxedConformance):
+     r"""
+     Chapters and Verses
+     supported: \c, \ca...\ca*, \cl, \cp, \cd, \v, \va...\va*, \vp...\vp*
+
+     Emits milestone <chapter>/<verse> start and end tags whose IDs still
+     contain the $BOOK$ and $CHAP$ placeholders. A published number (\cp, \vp)
+     replaces the number in every ID of its chapter/verse; an alternate number
+     (\ca, \va) is appended to the osisID as an additional ID.
+     relaxedConformance is accepted for a uniform converter signature and is
+     not used here. Returns the converted string.
+     """
+     # \c_# ###TESTED###
+     osis = re.sub(r'\\c\s+([^\s]+)\b(.+?)(?=(\\c\s+|</div type="book"))', lambda m: u'<chapter osisID="$BOOK$.' + m.group(1) + r'" sID="$BOOK$.' + m.group(1) + '"/>' + m.group(2) + u'<chapter eID="$BOOK$.' + m.group(1) + u'"/>\n', osis, flags=re.DOTALL)
+
+     # \cp_#
+     # \ca_#\ca*
+     def replaceChapterNumber(matchObj):
+         ctext = matchObj.group(1)
+         # \cp has no end marker: its value runs up to the next whitespace or marker
+         cpPattern = r'\\cp\s+(.+?)(?=(\\|\s))'
+         cp = re.search(cpPattern, ctext)
+         if cp:
+             ctext = re.sub(cpPattern, '', ctext, flags=re.DOTALL)
+             cp = cp.group(1)
+             # function replacement keeps the published number literal (no template escapes)
+             ctext = re.sub(r'"\$BOOK\$\.([^"\.]+)"', lambda m: '"$BOOK$.' + cp + '"', ctext)
+         caPattern = r'\\ca\s+(.+?)\\ca\*'
+         ca = re.search(caPattern, ctext)
+         if ca:
+             ctext = re.sub(caPattern, '', ctext, flags=re.DOTALL)
+             ca = ca.group(1)
+             ctext = re.sub(r'(osisID="\$BOOK\$\.[^"\.]+)"', lambda m: m.group(1) + ' $BOOK$.' + ca + '"', ctext)
+         return ctext
+     osis = re.sub(r'(<chapter [^<]+sID[^<]+/>.+?<chapter eID[^>]+/>)', replaceChapterNumber, osis, flags=re.DOTALL)
+
+     # \cl_
+     osis = re.sub(r'\\cl\s+(.+)', u'<title>'+r'\1</title>', osis)
+
+     # \cd_# <--This # seems to be an error
+     osis = re.sub(r'\\cd\b\s+(.+)', u'<title type="x-description">'+r'\1</title>', osis)
+
+     # \v_# ###TESTED###
+     osis = re.sub(r'\\v\s+([^\s]+)\b\s*(.+?)(?=(\\v\s+|</div type="book"|<chapter eID))', lambda m: u'<verse osisID="$BOOK$.$CHAP$.' + m.group(1) + r'" sID="$BOOK$.$CHAP$.' + m.group(1) + r'"/>' + m.group(2) + r'<verse eID="$BOOK$.$CHAP$.' + m.group(1) + u'"/>\n', osis, flags=re.DOTALL)
+
+     # \vp_#\vp*
+     # \va_#\va*
+     def replaceVerseNumber(matchObj):
+         vtext = matchObj.group(1)
+         vpPattern = r'\\vp\s+(.+?)\\vp\*'
+         vp = re.search(vpPattern, vtext)
+         if vp:
+             vtext = re.sub(vpPattern, '', vtext, flags=re.DOTALL)
+             vp = vp.group(1)
+             vtext = re.sub(r'"\$BOOK\$\.\$CHAP\$\.([^"\.]+)"', lambda m: '"$BOOK$.$CHAP$.' + vp + '"', vtext)
+         vaPattern = r'\\va\s+(.+?)\\va\*'
+         va = re.search(vaPattern, vtext)
+         if va:
+             vtext = re.sub(vaPattern, '', vtext, flags=re.DOTALL)
+             va = va.group(1)
+             vtext = re.sub(r'(osisID="\$BOOK\$\.\$CHAP\$\.[^"\.]+)"', lambda m: m.group(1) + ' $BOOK$.$CHAP$.' + va + '"', vtext)
+         return vtext
+     osis = re.sub(r'(<verse [^<]+sID[^<]+/>.+?<verse eID[^>]+/>)', replaceVerseNumber, osis, flags=re.DOTALL)
+
+     return osis
+
+
+ def cvtParagraphs(osis, relaxedConformance):
+ """
+ Paragraphs
+ supported: \p, \m, \pmo, \pm, \pmc, \pmr, \pi#, \mi, \nb, \cls, \li#, \pc, \pr, \ph#, \b
+ """
+ # relaxedConformance is not used in this converter.
+ # A paragraph runs from its marker up to the next paragraph-level marker (one beginning with m, p, nb, lit, cls or tr),
+ # a chapter end, a <div> boundary, or an already converted <p>/<closer>.
+ # \p(_text...) ###TESTED###
+ osis = re.sub(r'\\p\s+(.*?)(?=(\\(m|p|nb|lit|cls|tr)|<chapter eID|</?div\b|<(p|closer)\b))', lambda m: u'<p>\n' + m.group(1) + u'</p>\n', osis, flags=re.DOTALL)
+
+ # \pc(_text...)
+ # \pr(_text...)
+ # \m(_text...) ###TESTED###
+ # \pmo(_text...)
+ # \pm(_text...)
+ # \pmc(_text...)
+ # \pmr_text... # deprecated: map to same as \pr
+ # \pi#(_Sample text...)
+ # \mi(_text...)
+ # \nb ###TESTED###
+ # pType maps each marker to the value of the <p type="..."> attribute
+ pType = {'pc':'x-center', 'pr':'x-right', 'm':'x-noindent', 'pmo':'x-embedded-opening', 'pm':'x-embedded', 'pmc':'x-embedded-closing', 'pmr':'x-right', 'pi':'x-indented-1', 'pi1':'x-indented-1', 'pi2':'x-indented-2', 'pi3':'x-indented-3', 'pi4':'x-indented-4', 'pi5':'x-indented-5', 'mi':'x-noindent-indented', 'nb':'x-nobreak'}
+ osis = re.sub(r'\\(pc|pr|m|pmo|pm|pmc|pmr|pi|pi1|pi2|pi3|pi4|pi5|mi|nb)\s+(.*?)(?=(\\(m|p|nb|lit|cls|tr)|<chapter eID|</?div\b|<(p|closer)\b))', lambda m: u'<p type="' + pType[m.group(1)] + '">\n' + m.group(2) + u'</p>\n', osis, flags=re.DOTALL)
+
+ # \cls_text...
+ # NOTE(review): the pattern below matches \m, not \cls, although it produces <closer>; \m has already been
+ # consumed by the statement above, so \cls appears to be left unconverted - confirm and change to \\cls.
+ osis = re.sub(r'\\m\s+(.+?)(?=(\\(m|p|nb|lit|cls|tr)|<chapter eID|</?div\b|<(p|closer)\b))', lambda m: u'<closer>' + m.group(1) + u'</closer>\n', osis, flags=re.DOTALL)
+
+ # \ph#(_text...)
+ # \li#(_text...) ###TESTED###
+ # \ph and \ph# are first rewritten to \li and \li#, then each list entry becomes an <item>
+ # NOTE(review): the u'' literals and the '[^]' character class below are empty in this copy of the source; they
+ # presumably held Unicode tag marker characters that were lost - restore them from the repository.
+ osis = re.sub(r'\\ph\b\s*', r'\\li ', osis)
+ osis = re.sub(r'\\ph(\d+)\b\s*', r'\\li\1 ', osis)
+ osis = re.sub(r'\\li\b\s*(.*?)(?=(['+u''+r']|\\li[\d\s]|<lb\b|<title\b))', r'<item type="x-indent-1">\1</item>', osis, flags=re.DOTALL)
+ osis = re.sub(r'\\li(\d+)\b\s*(.*?)(?=(['+u''+r']|\\li[\d\s]|<lb\b|<title\b))', r'<item type="x-indent-\1">\2</item>', osis, flags=re.DOTALL)
+ osis = osis.replace('\n</item>', '</item>\n')
+ osis = re.sub(u'(<item [^]+</item>)', r'<list>\1</list>', osis, flags=re.DOTALL)
+
+ # \b ###TESTED###
+ # type="p" is a temporary marker; cvtPoetry and osisReorderAndCleanup look for it
+ osis = re.sub(r'\\b\b\s?', r'<lb type="p"/>', osis)
+
+ return osis
+
+
+ def cvtPoetry(osis, relaxedConformance):
+ """
+ Poetry
+ supported: \q#, \qr, \qc, \qs...\qs*, \qa, \qac...\qac*, \qm#, \b
+ """
+ # relaxedConformance is not used in this converter.
+ # NOTE(review): the u'' literals and the '[^]' character class below are empty in this copy of the source; they
+ # presumably held Unicode tag marker characters that were lost - restore them from the repository.
+ # \qs_(Selah)\qs*
+ osis = re.sub(r'\\qs\b\s(.+?)\\qs\*', r'<l type="selah">\1</l>', osis, flags=re.DOTALL)
+
+ # \q#(_text...) ###TESTED###
+ # a bare \q is treated as level 1; \q# carries its digit into the level attribute
+ osis = re.sub(r'\\q\b\s*(.*?)(?=(['+u''+r']|\\q[\d\s]|<l\b|<lb\b|<title\b))', r'<l level="1">\1</l>', osis, flags=re.DOTALL)
+ osis = re.sub(r'\\q(\d+)\b\s*(.*?)(?=(['+u''+r']|\\q[\d\s]|<l\b|<lb\b|<title\b))', r'<l level="\1">\2</l>', osis, flags=re.DOTALL)
+
+ # \qr_text...
+ # \qc_text...
+ # \qm#(_text...)
+ # the qm values deliberately contain '" level="N' so that one lookup yields both the type and the level attribute
+ # NOTE(review): the pattern requires at least one digit after qm, so the bare 'qm' entry is never used,
+ # and a level above 5 would raise KeyError - confirm the intended range.
+ qType = {'qr':'x-right', 'qc':'x-center', 'qm':'x-embedded" level="1', 'qm1':'x-embedded" level="1', 'qm2':'x-embedded" level="2', 'qm3':'x-embedded" level="3', 'qm4':'x-embedded" level="4', 'qm5':'x-embedded" level="5'}
+ osis = re.sub(r'\\(qr|qc|qm\d+)\b\s*(.*?)(?=(['+u''+r']|\\q[\d\s]|<l\b|<lb\b|<title\b))', lambda m: r'<l type="' + qType[m.group(1)] + '">' + m.group(2) + '</l>', osis, flags=re.DOTALL)
+
+ osis = osis.replace('\n</l>', '</l>\n')
+ osis = re.sub(u'(<l [^]+</l>)', r'<lg>\1</lg>', osis, flags=re.DOTALL)
+
+ # \b ###TESTED###
+ osis = re.sub('(<lg>.+?</lg>)', lambda m: m.group(1).replace(r'<lb type="p"/>', r'</lg><lg>'), osis, flags=re.DOTALL) # re-handle \b that occurs within <lg>
+
+ # \qa_text...
+ osis = re.sub(r'\\qa\s+(.+)', u'<title type="acrostic">'+r'\1</title>', osis)
+
+ # \qac_text...\qac*
+ osis = re.sub(r'\\qac\s+(.+?)\\qac\*', r'<hi type="acrostic">\1</hi>', osis, flags=re.DOTALL)
+
+ return osis
+
+
+ def cvtTables(osis, relaxedConformance):
+ """
+ Tables
+ supported: \tr, \th#, \thr#, \tc#, \tcr#
+ """
+ # relaxedConformance is not used in this converter.
+ # \tr_
+ # NOTE(review): the u'' literal below is empty in this copy of the source; it presumably held a Unicode tag
+ # marker character that was lost - restore it from the repository.
+ osis = re.sub(r'\\tr\b\s*(.*?)(?=(['+u''+r']|\\tr\s|<lb\b|<title\b))', r'<row>\1</row>', osis, flags=re.DOTALL)
+
+ # \th#_text...
+ # \thr#_text...
+ # \tc#_text...
+ # \tcr#_text...
+ # tType maps the marker (column digits are matched but discarded) to the attributes of the <cell> element
+ # NOTE(review): the 'tcr' value has no closing double quote, so the emitted tag reads <cell type="x-right> - confirm and fix.
+ tType = {'th':' role="label"', 'thr':' role="label" type="x-right"', 'tc':'', 'tcr':' type="x-right'}
+ osis = re.sub(r'\\(thr?|tcr?)\d*\b\s*(.*?)(?=(\\t[hc]|</row))', lambda m: r'<cell' + tType[m.group(1)] + '>' + m.group(2) + '</cell>', osis, flags=re.DOTALL)
+
+ return osis
+
+
+ def processNote(note):
+ # Converts the footnote-internal markers inside one already wrapped <note>...</note> string and returns it.
+ # relaxedConformance is not a parameter here; it is read from outside this function.
+ # NOTE(review): the u'' literals below are empty in this copy of the source; they presumably held Unicode tag
+ # marker characters (inserted before each converted element and removed again at the end) that were lost -
+ # restore them from the repository.
+ note = note.replace('\n', ' ')
+
+ # \fdc_refs...\fdc*
+ note = re.sub(r'\\fdc\b\s(.+?)\\fdc\b\*', r'<seg editions="dc">\1</seg>', note)
+
+ # \fq_ ###TESTED###
+ note = re.sub(r'\\fq\b\s(.+?)(?=(\\f|'+u'))', u''+r'<catchWord>\1</catchWord>', note)
+
+ # \fqa_ ###TESTED###
+ note = re.sub(r'\\fqa\b\s(.+?)(?=(\\f|'+u'))', u''+r'<rdg type="alternate">\1</rdg>', note)
+
+ # \ft_ ###TESTED###
+ note = re.sub(r'\\ft\s', r'', note)
+
+ # \fr_##SEP##
+ note = re.sub(r'\\fr\b\s(.+?)(?=(\\f|'+u'))', u''+r'<reference>\1</reference>', note)
+
+ # \fk_
+ note = re.sub(r'\\fk\b\s(.+?)(?=(\\f|'+u'))', u''+r'<catchWord>\1</catchWord>', note)
+
+ # \fl_
+ note = re.sub(r'\\fl\b\s(.+?)(?=(\\f|'+u'))', u''+r'<label>\1</label>', note)
+
+ # \fp_
+ # once any \fp paragraph exists, the text between the <note> start tag and the first <p> is wrapped in its own <p>
+ note = re.sub(r'\\fp\b\s(.+?)(?=(\\fp|$))', r'<p>\1</p>', note)
+ note = re.sub(r'(<note\b[^>]*?>)(.*?)<p>', r'\1<p>\2</p><p>', note)
+
+ # \fv_
+ note = re.sub(r'\\fv\b\s(.+?)(?=(\\f|'+u'))', u''+r'<hi type="super">\1</hi>', note)
+
+ # in relaxed mode, stray end markers for \ft, \fq and \fqa are silently dropped
+ if relaxedConformance:
+ note = note.replace(r'\ft*', r'')
+ note = note.replace(r'\fq*', r'')
+ note = note.replace(r'\fqa*', r'')
+
+ note = note.replace(u'', '')
+ return note
+
+
+ def cvtFootnotes(osis, relaxedConformance):
+     r"""
+     Footnotes
+     supported:\f...\f*, \fe...\fe*, \fr, \fk, \fq, \fqa, \fl, \fp, \fv, \ft, \fdc...\fdc*, \fm...\fm*
+
+     Wraps \f and \fe spans in <note> elements, hands every <note> to
+     processNote for its inner markers, and converts \fm to superscript.
+     relaxedConformance is accepted for a uniform converter signature; it is
+     not passed on by this function. Returns the converted string.
+     """
+     def callerAttribute(caller):
+         # '-' suppresses the caller; '+' asks for automatic callers; anything else is a literal caller.
+         # A missing caller (optional group not matched) is treated like '+' instead of raising TypeError.
+         if caller == u'-':
+             return ' n=""'
+         if caller is None or caller == '+':
+             return ''
+         return ' n="' + caller + '"'
+
+     # \f_+_...\f* ###TESTED###
+     osis = re.sub(r'\\f\s+([^\s\\]+)?\s*(.+?)\s*\\f\*', lambda m: r'<note' + callerAttribute(m.group(1)) + ' placement="foot">' + m.group(2) + u'</note>', osis, flags=re.DOTALL)
+
+     # \fe_+_...\fe*
+     osis = re.sub(r'\\fe\s+([^\s\\]+?)\s*(.+?)\s*\\fe\*', lambda m: r'<note' + callerAttribute(m.group(1)) + ' placement="end">' + m.group(2) + u'</note>', osis, flags=re.DOTALL)
+
+     osis = re.sub(r'(<note\b[^>]*?>.*?</note>)', lambda m: processNote(m.group(1)), osis, flags=re.DOTALL)
+
+     # \fm_...\fm*
+     osis = re.sub(r'\\fm\b\s(.+?)\\fm\*', r'<hi type="super">\1</hi>', osis)
+
+     return osis
+
+
+ def processXref(note):
+ # Converts the cross-reference-internal markers inside one already wrapped <note>...</note> string and returns it.
+ # relaxedConformance is not a parameter here; it is read from outside this function.
+ # NOTE(review): the u'' literals below are empty in this copy of the source; they presumably held Unicode tag
+ # marker characters (inserted before each converted element and removed again at the end) that were lost -
+ # restore them from the repository.
+ note = note.replace('\n', ' ')
+
+ # \xot_refs...\xot*
+ note = re.sub(r'\\xot\b\s(.+?)\\xot\b\*', u''+r'<seg editions="ot">\1</seg>', note)
+
+ # \xnt_refs...\xnt*
+ note = re.sub(r'\\xnt\b\s(.+?)\\xnt\b\*', u''+r'<seg editions="nt">\1</seg>', note)
+
+ # \xdc_refs...\xdc*
+ note = re.sub(r'\\xdc\b\s(.+?)\\xdc\b\*', u''+r'<seg editions="dc">\1</seg>', note)
+
+ # \xq_
+ note = re.sub(r'\\xq\b\s(.+?)(?=(\\x|'+u'))', u''+r'<catchWord>\1</catchWord>', note)
+
+ # \xt_ ###TESTED###
+ note = re.sub(r'\\xt\s', r'', note)
+
+ # \xo_##SEP##
+ note = re.sub(r'\\xo\b\s(.+?)(?=(\\x|'+u'))', u''+r'<reference>\1</reference>', note)
+
+ # \xk_
+ note = re.sub(r'\\xk\b\s(.+?)(?=(\\x|'+u'))', u''+r'<catchWord>\1</catchWord>', note)
+
+ # in relaxed mode, stray end markers for \xt and \xq are silently dropped
+ if relaxedConformance:
+ note = note.replace(r'\xt*', r'')
+ note = note.replace(r'\xq*', r'')
+
+ note = note.replace(u'', '')
+ return note
+
+
+ def cvtCrossReferences(osis, relaxedConformance):
+     """
+     Cross References
+     supported: \\x...\\x*, \\xo, \\xk, \\xq, \\xt, \\xot...\\xot*, \\xnt...\\xnt*, \\xdc...\\xdc*
+
+     Each \\x span becomes a crossReference <note> holding a <reference>;
+     the note is then passed to processXref for its inner markers.
+     """
+     def wrapXref(m):
+         caller = m.group(1)
+         # '-' means no caller, '+' means automatic callers, anything else is used verbatim
+         if caller == u'-':
+             nAttr = ' n=""'
+         elif caller == '+':
+             nAttr = ''
+         else:
+             nAttr = ' n="' + caller + '"'
+         return r'<note' + nAttr + ' type="crossReference"><reference>' + m.group(2) + u'</reference></note>'
+
+     def convertInner(m):
+         return processXref(m.group(1))
+
+     # \x_+_...\x* ###TESTED###
+     wrapped = re.sub(r'\\x\s+([^\s]+?)\s+(.+?)\s*\\x\*', wrapXref, osis, flags=re.DOTALL)
+
+     return re.sub(r'(<note [^>]*?type="crossReference"[^>]*>.*?</note>)', convertInner, wrapped, flags=re.DOTALL)
+
+
+ """
+ Special Text and Character Styles
+ """
+ def cvtSpecialText(osis, relaxedConformance):
+     r"""
+     Special Text
+     supported: \add...\add*, \bk...\bk*, \dc...\dc*, \k...\k*, \lit, \nd...\nd*, \ord...\ord*, \pn...\pn*, \qt...\qt*, \sig...\sig*, \sls...\sls*, \tl...\tl*, \wj...\wj*
+
+     Each paired marker is replaced by the corresponding OSIS inline element;
+     \lit becomes a liturgical paragraph. relaxedConformance is accepted for a
+     uniform converter signature and is not used here. Returns the converted string.
+     """
+     # \add_...\add* ###TESTED###
+     osis = re.sub(r'\\add\s+(.+?)\\add\*', r'<transChange type="added">\1</transChange>', osis, flags=re.DOTALL)
+
+     # \wj_...\wj* ###TESTED###
+     osis = re.sub(r'\\wj\s+(.+?)\\wj\*', r'<q who="Jesus" marker="">\1</q>', osis, flags=re.DOTALL)
+
+     # \nd_...\nd*
+     osis = re.sub(r'\\nd\s+(.+?)\\nd\*', r'<divineName>\1</divineName>', osis, flags=re.DOTALL)
+
+     # \pn_...\pn*
+     osis = re.sub(r'\\pn\s+(.+?)\\pn\*', r'<name>\1</name>', osis, flags=re.DOTALL)
+
+     # \qt_...\qt*
+     osis = re.sub(r'\\qt\s+(.+?)\\qt\*', r'<seg type="otPassage">\1</seg>', osis, flags=re.DOTALL)
+
+     # \sig_...\sig*
+     osis = re.sub(r'\\sig\s+(.+?)\\sig\*', r'<signed>\1</signed>', osis, flags=re.DOTALL)
+
+     # \ord_...\ord*
+     osis = re.sub(r'\\ord\s+(.+?)\\ord\*', r'<hi type="super">\1</hi>', osis, flags=re.DOTALL) # semantic incongruity (ordinal -> superscript)
+
+     # \tl_...\tl*
+     osis = re.sub(r'\\tl\s+(.+?)\\tl\*', r'<foreign>\1</foreign>', osis, flags=re.DOTALL)
+
+     # \bk_...\bk* ###TESTED###
+     osis = re.sub(r'\\bk\s+(.+?)\\bk\*', r'<name type="x-workTitle">\1</name>', osis, flags=re.DOTALL)
+
+     # \k_...\k* ###TESTED###
+     osis = re.sub(r'\\k\s+(.+?)\\k\*', r'<seg type="keyword">\1</seg>', osis, flags=re.DOTALL)
+
+     # \lit
+     osis = re.sub(r'\\lit\s+(.*?)(?=(\\(m|p|nb|lit|cls|tr)|<chapter eID|</?div\b|<(p|closer)\b))', lambda m: u'<p type="x-liturgical">\n' + m.group(1) + u'</p>\n', osis, flags=re.DOTALL)
+
+     # \dc_...\dc* #### TODO: Find an example---should this really be transChange?
+     osis = re.sub(r'\\dc\b\s*(.+?)\\dc\*', r'<transChange type="added" editions="dc">\1</transChange>', osis, flags=re.DOTALL)
+
+     # \sls_...\sls*
+     # the backreference must be \1; '/1' would discard the marked text and emit the two characters '/1'
+     osis = re.sub(r'\\sls\b\s*(.+?)\\sls\*', r'<foreign>\1</foreign>', osis, flags=re.DOTALL) # find a better mapping than <foreign>?
+
+     return osis
+
+
+ def cvtCharacterStyling(osis, relaxedConformance):
+     r"""
+     Character Styling
+     supported: \em...\em*, \bd...\bd*, \it...\it*, \bdit...\bdit*, \no...\no*, \sc...\sc*
+
+     Every paired styling marker is rewritten as an OSIS <hi> element
+     (two nested ones for bold-italic).
+     """
+     # (pattern, replacement) pairs, applied in this order
+     stylings = (
+         (r'\\em\s+(.+?)\\em\*', r'<hi type="emphasis">\1</hi>'), # \em_...\em*
+         (r'\\bd\s+(.+?)\\bd\*', r'<hi type="bold">\1</hi>'), # \bd_...\bd*
+         (r'\\it\s+(.+?)\\it\*', r'<hi type="italic">\1</hi>'), # \it_...\it* ###TESTED###
+         (r'\\bdit\s+(.+?)\\bdit\*', r'<hi type="bold"><hi type="italic">\1</hi></hi>'), # \bdit_...\bdit*
+         (r'\\no\s+(.+?)\\no\*', r'<hi type="normal">\1</hi>'), # \no_...\no*
+         (r'\\sc\s+(.+?)\\sc\*', r'<hi type="small-caps">\1</hi>'), # \sc_...\sc*
+     )
+     converted = osis
+     for pattern, markup in stylings:
+         converted = re.sub(pattern, markup, converted, flags=re.DOTALL)
+     return converted
+
+
+ def cvtSpacingAndBreaks(osis, relaxedConformance):
+     r"""
+     Spacing and Breaks
+     supported: ~, //, \pb
+
+     '~' becomes a no-break space (U+00A0), '//' is removed, and \pb becomes
+     a page-break milestone. relaxedConformance is accepted for a uniform
+     converter signature and is not used here. Returns the converted string.
+     """
+     # ~
+     # \u needs exactly four hex digits and a unicode literal to denote U+00A0
+     osis = osis.replace('~', u'\u00A0')
+
+     # //
+     # NOTE(review): the replacement is empty in this copy of the source - confirm whether a marker or element was intended
+     osis = osis.replace('//', '')
+
+     # \pb
+     osis = re.sub(r'\\pb\s*', '<milestone type="pb"/>\n', osis, flags=re.DOTALL)
+
+     return osis
+
+
+ def cvtSpecialFeatures(osis, relaxedConformance):
+     r"""
+     Special Features
+     supported: \fig...\fig*, \ndx...\ndx*, \pro...\pro*, \w...\w*, \wg...\wg*, \wh...\wh*
+
+     Figures become <figure> elements, pronunciation info becomes an xlit
+     attribute on the preceding word, and the word-list markers keep their
+     text and add an <index> entry after it. relaxedConformance is accepted
+     for a uniform converter signature and is not used here.
+     Returns the converted string.
+     """
+     # \fig DESC|FILE|SIZE|LOC|COPY|CAP|REF\fig*
+     def makeFigure(matchObject):
+         # the pattern below has exactly seven unnamed groups, one per field
+         fig_desc,fig_file,fig_size,fig_loc,fig_copy,fig_cap,fig_ref = matchObject.groups()
+         figure = '<figure'
+         if fig_file:
+             figure += ' src="' + fig_file + '"'
+         if fig_size:
+             figure += ' size="' + fig_size + '"'
+         if fig_copy:
+             figure += ' rights="' + fig_copy + '"'
+         # TODO: implement parsing in osisParse(Bible reference string)
+         # if fig_ref:
+         #     figure += ' annotateRef="' + osisParse(fig_ref) + '"'
+         figure += '>\n'
+         if fig_cap:
+             figure += '<caption>' + fig_cap + '</caption>\n'
+         if fig_ref:
+             figure += '<reference>' + fig_ref + '</reference>\n'
+         # DESC and LOC have no attribute to go to, so they are kept as comments
+         if fig_desc:
+             figure += '<!-- fig DESC - ' + fig_desc + ' -->\n'
+         if fig_loc:
+             figure += '<!-- fig LOC - ' + fig_loc + ' -->\n'
+         figure += '</figure>'
+         return figure
+     osis = re.sub(r'\\fig\b\s+([^\|]*)\s*\|([^\|]*)\s*\|([^\|]*)\s*\|([^\|]*)\s*\|([^\|]*)\s*\|([^\|]*)\s*\|([^\\]*)\s*\\fig\*', makeFigure, osis)
+
+     # \ndx_...\ndx*
+     osis = re.sub(r'\\ndx\s+(.+?)(\s*)\\ndx\*', r'\1<index index="Index" level1="\1"/>\2', osis, flags=re.DOTALL)
+
+     # \pro_...\pro*
+     osis = re.sub(r'([^\s]+)(\s*)\\pro\s+(.+?)(\s*)\\pro\*', r'<w xlit="\3">\1</w>\2\4', osis, flags=re.DOTALL)
+
+     # \w_...\w*
+     osis = re.sub(r'\\w\s+(.+?)(\s*)\\w\*', r'\1<index index="Glossary" level1="\1"/>\2', osis, flags=re.DOTALL)
+
+     # \wg_...\wg*
+     osis = re.sub(r'\\wg\s+(.+?)(\s*)\\wg\*', r'\1<index index="Greek" level1="\1"/>\2', osis, flags=re.DOTALL)
+
+     # \wh_...\wh*
+     osis = re.sub(r'\\wh\s+(.+?)(\s*)\\wh\*', r'\1<index index="Hebrew" level1="\1"/>\2', osis, flags=re.DOTALL)
+
+     return osis
+
+
+ def cvtPeripherals(osis, relaxedConformance):
+     r"""
+     Peripherals
+     supported: \periph
+
+     Each \periph section (title line plus everything up to the next \periph
+     or the end of the book) is wrapped in a <div> whose type is looked up in
+     the peripherals / introPeripherals tables; unknown titles get
+     type="x-unknown". relaxedConformance is accepted for a uniform converter
+     signature and is not used here. Returns the converted string.
+     """
+     # \periph
+     def tagPeriph(matchObject):
+         # group 3 is the lookahead's own capture and is not needed here
+         periphType = matchObject.group(1).strip()
+         contents = matchObject.group(2)
+         periph = '<div type="'
+         if periphType in peripherals:
+             periph += peripherals[periphType]
+         elif periphType in introPeripherals:
+             periph += 'introduction" subType="x-' + introPeripherals[periphType]
+         else:
+             periph += 'x-unknown'
+         periph += '">\n' + contents + '</div>\n'
+         return periph
+     osis = re.sub(r'\\periph\s+([^\n]+)\s*\n(.+?)(?=(</div type="book">|\\periph\s+))', tagPeriph, osis, flags=re.DOTALL)
+
+     return osis
+
+
+ def cvtStudyBibleContent(osis, relaxedConformance):
+     r"""
+     Study Bible Content
+     supported: \ef...\ef*, \ex...\ex*, \esb...\esbe, \cat
+
+     Extended footnotes and cross references become study <note> elements
+     (then processed by processNote / processXref), sidebars become
+     <div type="x-sidebar">, and \cat becomes a category <index>.
+     relaxedConformance is accepted for a uniform converter signature; it is
+     not passed on by this function. Returns the converted string.
+     """
+     def callerAttribute(caller):
+         # '-' suppresses the caller; '+' asks for automatic callers; anything else is a literal caller
+         if caller == u'-':
+             return ' n=""'
+         if caller == '+':
+             return ''
+         return ' n="' + caller + '"'
+
+     # \ef...\ef*
+     osis = re.sub(r'\\ef\s+([^\s\\]+?)\s*(.+?)\s*\\ef\*', lambda m: r'<note' + callerAttribute(m.group(1)) + ' type="study">' + m.group(2) + u'</note>', osis, flags=re.DOTALL)
+     osis = re.sub(r'(<note\b[^>]*?>.*?</note>)', lambda m: processNote(m.group(1)), osis, flags=re.DOTALL)
+
+     # \ex...\ex*
+     osis = re.sub(r'\\ex\s+([^\s]+?)\s+(.+?)\s*\\ex\*', lambda m: r'<note' + callerAttribute(m.group(1)) + ' type="crossReference" subType="x-study"><reference>' + m.group(2) + u'</reference></note>', osis, flags=re.DOTALL)
+     osis = re.sub(r'(<note [^>]*?type="crossReference"[^>]*>.*?</note>)', lambda m: processXref(m.group(1)), osis, flags=re.DOTALL)
+
+     # \esb...\esbex ### TODO: this likely needs to go much earlier in the process
+     # raw template: in a plain string literal '\1' is the control character \x01, not a backreference
+     osis = re.sub(r'\\esb\b\s*(.+?)\\esbe\b\s*', r'<div type="x-sidebar">\1</div>\n', osis, flags=re.DOTALL)
+
+     # \cat_<TAG>\cat*
+     osis = re.sub(r'\\cat\b\s+(.+?)\\cat\*', r'<index index="category" level1="\1"/>', osis)
+
+     return osis
+
+
+ def cvtPrivateUseExtensions(osis, relaxedConformance):
+     r"""
+     \z namespace
+     supported: \z<Extension>
+     The meaning of these private-use markers is unknown to us, so each one is
+     kept as a <milestone/> element whose type is "x-z-" plus the marker's
+     remaining non-whitespace characters.
+     """
+     # \z
+     privateMarker = re.compile(r'\\z([^\s]+)')
+     return privateMarker.sub(r'<milestone type="x-z-\1"/>', osis)
+
+
+ def processOsisIDs(osis):
+ # Finalizes the IDs written by the converters: expands verse ranges and series into space-separated lists,
+ # then replaces the $BOOK$ and $CHAP$ placeholders with the actual book and chapter values. Returns the string.
+ # expand verse ranges, series
+ def expandRange(vRange):
+ # 'a-b' -> one placeholder ID per verse from a to b inclusive
+ vRange = re.findall(r'\d+', vRange)
+ osisID = list()
+ for n in range(int(vRange[0]), int(vRange[1])+1):
+ osisID.append('$BOOK$.$CHAP$.'+str(n))
+ return ' '.join(osisID)
+ osis = re.sub(r'\$BOOK\$\.\$CHAP\$\.(\d+-\d+)"', lambda m: expandRange(m.group(1))+'"', osis)
+
+ def expandSeries(vSeries):
+ # 'a,b,c' -> one placeholder ID per listed verse
+ vSeries = re.findall(r'\d+', vSeries)
+ osisID = list()
+ for n in vSeries:
+ osisID.append('$BOOK$.$CHAP$.'+str(n))
+ return ' '.join(osisID)
+ osis = re.sub(r'\$BOOK\$\.\$CHAP\$\.(\d+(,\d+)+)"', lambda m: expandSeries(m.group(1))+'"', osis)
+
+
+ # fill in book & chapter values
+ # NOTE(review): both split() separators below are empty strings in this copy of the source (str.split rejects an
+ # empty separator); they presumably held the Unicode tag marker characters placed before each book and chapter
+ # start - restore them from the repository.
+ bookChunks = osis.split(u'')
+ osis = ''
+ for bc in bookChunks:
+ # a chunk without a book <div> keeps its placeholders untouched
+ bookValue = re.search(r'<div type="book" osisID="([^"]+?)"', bc)
+ if bookValue:
+ bookValue = bookValue.group(1)
+ bc = bc.replace('$BOOK$', bookValue)
+ chapChunks = bc.split(u'')
+ newbc = ''
+ for cc in chapChunks:
+ # the chapter value is whatever follows the first '.' in the chapter's osisID
+ chapValue = re.search(r'<chapter osisID="[^\."]+\.([^"]+)', cc)
+ if chapValue:
+ chapValue = chapValue.group(1)
+ cc = cc.replace('$CHAP$', chapValue)
+ newbc += cc
+ bc = newbc
+ osis += bc
+ return osis
+
+
+ def osisReorderAndCleanup(osis):  # Reorder some adjacent end/start tags, strip attributes from end tags, and normalise spaces and newlines; returns the cleaned text.
+ # assorted re-orderings
+ osis = re.sub(u'(<chapter eID=.+?\n)(<verse eID=.+?>)\n?', r'\2\n\1', osis)  # put a verse end milestone ahead of the chapter end line that preceded it
+ osis = re.sub(u'([]</div>)([^]*<chapter eID.+?>)', r'\2\1', osis)  # NOTE(review): both character classes are empty in this copy (characters apparently lost), so this is not the pattern the author wrote - restore from the repository
+ osis = re.sub(u'(</p>\n?<p>)\n?(<verse eID=.+?>)\n?', r'\2\n\1\n', osis)  # move a verse end milestone in front of the paragraph break it followed
+ osis = re.sub(u'\n(<verse eID=.+?>)', r'\1\n', osis)  # move the newline from before a verse end milestone to after it
+ osis = re.sub(u'\n*(<l.+?>)(<verse eID=.+?>[\n]*<verse osisID=.+?>)', r'\2\1', osis)  # place the verse end/start pair ahead of the tag matched by <l...>
+
+ # delete attributes from end tags (since they are invalid)
+ osis = re.sub(r'(</[^\s>]+) [^>]*>', r'\1>', osis)
+ osis = osis.replace(r'<lb type="p"/>', r'<lb/>')
+ # delete Unicode tags
+ for c in u'':  # NOTE(review): the string literal is empty in this copy, so the loop body never runs - the marker characters to delete were presumably lost
+ osis = osis.replace(c, '')
+
+ for endBlock in ['p', 'div', 'note', 'l', 'lg', 'chapter', 'verse']:
+ osis = re.sub(r' +</'+endBlock+r'>', r'</'+endBlock+r'>', osis)  # drop spaces directly before the end tag
+ osis = re.sub(r' +<'+endBlock+r'( eID=[^/>]+/>)', r'</'+endBlock+r'\1', osis)  # NOTE(review): the pattern matches '<tag eID=.../>' but the replacement starts with '</' - looks unintended, confirm
+ osis = re.sub(r' +((</[^>]+>)+) *', r'\1 ', osis)  # spaces before a run of end tags are moved to a single space after the run
+
+ # strip extra spaces & newlines
+ osis = re.sub(r' +', r' ', osis)
+ osis = re.sub(r' ?\n\n+', r'\n', osis)
+ return osis
+
+
+ ### Processing starts here
+ if encoding:
+ osis = codecs.open(sFile, 'r', encoding).read().strip() + '\n'
+ else:
+ encoding = 'utf-8'
+ osis = codecs.open(sFile, 'r', encoding).read().strip() + '\n'
+ # \ide_<ENCODING>
+ encoding = re.search(r'\\ide\s+(.+)\n', osis)
+ if encoding:
+ encoding = encoding.group(1).lower()
+ if encoding != 'utf-8':
+ if encoding in aliases:
+ osis = codecs.open(sFile, 'r', encoding).read().strip() + '\n'
+ else:
+ print('Encoding unknown, processing as UTF-8.')
+
+
+ # call individual conversion processors in series
+ osis = cvtPreprocess(osis, relaxedConformance)
+ osis = cvtIdentification(osis, relaxedConformance)
+ osis = cvtIntroductions(osis, relaxedConformance)
+ osis = cvtTitles(osis, relaxedConformance)
+ osis = cvtChaptersAndVerses(osis, relaxedConformance)
+ osis = cvtParagraphs(osis, relaxedConformance)
+ osis = cvtPoetry(osis, relaxedConformance)
+ osis = cvtTables(osis, relaxedConformance)
+ osis = cvtFootnotes(osis, relaxedConformance)
+ osis = cvtCrossReferences(osis, relaxedConformance)
+ osis = cvtSpecialText(osis, relaxedConformance)
+ osis = cvtCharacterStyling(osis, relaxedConformance)
+ osis = cvtSpacingAndBreaks(osis, relaxedConformance)
+ osis = cvtSpecialFeatures(osis, relaxedConformance)
+ osis = cvtPeripherals(osis, relaxedConformance)
+ osis = cvtStudyBibleContent(osis, relaxedConformance)
+ osis = cvtPrivateUseExtensions(osis, relaxedConformance)
+
+ osis = processOsisIDs(osis)
+ osis = osisReorderAndCleanup(osis)
+
+ # change type on special books
+ for sb in specialBooks:
+ osis = osis.replace('<div type="book" osisID="' + sb + '">', '<div type="' + sb.lower() + '">')
+
+ return osis
+
+
+
def writeOSISHeader(oFile, workID, lang='en'):
    """Write the XML declaration and the opening OSIS elements to oFile.

    The output opens <osis> and <osisText> (left open for the body text that
    follows) and contains a complete <header> with a single <work> element.

    Parameters:
        oFile: object with a write() method that receives the header text.
        workID: used for both osisIDWork and the <work> osisWork attribute.
        lang: value of the xml:lang attribute (default 'en').
    """
    namespace = 'http://www.bibletechnologies.net/2003/OSIS/namespace'
    # The schema location is derived from the module-level OSISversion.
    schema = 'http://www.bibletechnologies.net/osisCore.' + OSISversion + '.xsd'
    pieces = [
        '<?xml version="1.0" encoding="UTF-8"?>\n',
        '<osis xmlns="' + namespace + '"',
        ' xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"',
        ' xsi:schemaLocation="' + namespace + ' ' + schema + '">\n',
        '<osisText osisRefWork="Bible" xml:lang="' + lang + '" osisIDWork="' + workID + '">\n',
        '<header>\n',
        '<work osisWork="' + workID + '"/>\n',
        '</header>\n',
    ]
    # Emit everything in a single write, as one string.
    oFile.write(''.join(pieces))
+
def writeOSISFooter(oFile):
    """Write the closing </osisText> and </osis> tags, one per line, to oFile."""
    closing_tags = ('</osisText>', '</osis>')
    # Built as one string so the file object receives a single write.
    oFile.write(''.join(tag + '\n' for tag in closing_tags))
+
def verbosePrint(text):
    """Print text only when verbose feedback (-v) was requested.

    The module-level ``verbose`` flag is assigned only by the command-line
    driver at the bottom of the file, so it is looked up defensively: when it
    has never been set, the function stays silent instead of raising
    NameError.

    Parameters:
        text: the message to print.
    """
    if globals().get('verbose', False):
        # Single-argument print(...) behaves the same as a statement and as a
        # function, matching the print(...) calls used elsewhere in the file.
        print(text)
+
def printUnhandled():
    """Placeholder: declares the module-level relaxedConformance and does nothing else."""
    global relaxedConformance
    return None
+
+
def printUsage():
    """Print the command-line help text and terminate the program.

    The banner and revision line are built from the module-level
    USFMversion, OSISversion, scriptVersion, rev and date values. The list of
    supported encodings is sent through verbosePrint, so it is subject to the
    verbose setting.

    Raises:
        SystemExit: always, once the text has been printed.
    """
    print('usfm2osis.py -- USFM ' + USFMversion + ' to OSIS ' + OSISversion + ' converter version ' + scriptVersion)
    print(' Revision: ' + rev + ' (' + date + ')')
    print('')
    print('Usage: usfm2osis.py <osisWork> [OPTION] ... <USFM filename|wildcard> ...')
    print('')
    print(' -e ENCODING input encoding override (default is to read the USFM file\'s')
    print(' \\ide value or assume UTF-8 encoding in its absence)')
    print(' -h, --help print this usage information')
    print(' -o FILENAME output filename (default is: <osisWork>.osis.xml)')
    print(' -r enable relaxed markup processing (for non-standard USFM)')
    print(' -v verbose feedback')
    print('')
    print('As an example, if you want to generate the osisWork <Bible.KJV> and your USFM')
    print(' are located in the ./KJV folder, enter:')
    print(' python usfm2osis.py Bible.KJV ./KJV/*.usfm')
    verbosePrint('')
    verbosePrint('Supported encodings: ' + ', '.join(aliases))
    # sys.exit() rather than exit(): the latter is a helper injected by the
    # site module for interactive sessions and is not meant for programs.
    sys.exit()
+
class Worker(multiprocessing.Process):
    """Process that converts queued USFM files until the work queue is empty.

    Each job taken from work_queue is passed to convertToOSIS and the pair
    (job, converted text) is put on result_queue.
    """

    # Seconds to wait for a job before concluding that the work queue is
    # drained. A multiprocessing queue can report Empty for a brief moment
    # after items were put on it, so a non-blocking get could make a worker
    # quit while jobs are still pending.
    QUEUE_TIMEOUT = 0.1

    def __init__(self, work_queue, result_queue):
        """Store the job queue and the result queue shared with the parent."""
        # base class initialization
        multiprocessing.Process.__init__(self)

        # job management stuff
        self.work_queue = work_queue
        self.result_queue = result_queue
        self.kill_received = False

    def run(self):
        """Convert jobs one at a time until none is left or kill_received is set."""
        while not self.kill_received:
            # get a task
            try:
                job = self.work_queue.get(True, self.QUEUE_TIMEOUT)
            except Queue.Empty:
                break

            # the actual processing
            osis = convertToOSIS(job)

            # store the result
            self.result_queue.put((job, osis))
+
+
if __name__ == "__main__":
    # Command-line driver: parse the options, convert every USFM file in
    # parallel worker processes, and write the combined OSIS document.
    # encoding, relaxedConformance and verbose are plain module-level
    # assignments here; no global statement is needed at module scope.

    num_processes = multiprocessing.cpu_count()
    num_jobs = num_processes

    encoding = ''
    relaxedConformance = False
    inputFilesIdx = 2 # This marks the point in the sys.argv array, after which all values represent USFM files to be converted.

    if '-v' in sys.argv:
        verbose = True
        inputFilesIdx += 1
    else:
        verbose = False

    if '-h' in sys.argv or '--help' in sys.argv or len(sys.argv) < 3:
        printUsage()
    else:
        OSISwork = sys.argv[1]

        if '-o' in sys.argv:
            i = sys.argv.index('-o')+1
            if len(sys.argv) < i+1:
                printUsage()
            OSISfileName = sys.argv[i]
            inputFilesIdx += 2 # increment 2, reflecting 2 args for -o
        else:
            OSISfileName = OSISwork + '.osis.xml'

        if '-e' in sys.argv:
            i = sys.argv.index('-e')+1
            if len(sys.argv) < i+1:
                printUsage()
            encoding = sys.argv[i]
            inputFilesIdx += 2 # increment 2, reflecting 2 args for -e

        if '-r' in sys.argv:
            relaxedConformance = True
            inputFilesIdx += 1

        usfmDocList = sys.argv[inputFilesIdx:]
        sortedDocs = sorted(usfmDocList, key=keynat)

        OSISfile = codecs.open(OSISfileName, 'w', 'utf-8')
        try:
            writeOSISHeader(OSISfile, OSISwork)

            # run
            # load up work queue
            work_queue = multiprocessing.Queue()
            for job in sortedDocs:
                work_queue.put(job)

            # create a queue to pass to workers to store the results
            result_queue = multiprocessing.Queue()

            # spawn workers
            workers = []
            for i in range(num_processes):
                worker = Worker(work_queue, result_queue)
                worker.start()
                workers.append(worker)

            # collect the results off the queue
            osisSegment = dict()
            for i in usfmDocList:
                k, v = result_queue.get()
                osisSegment[k] = v

            # Join only after every result has been read: a process that has
            # put items on a queue may not terminate until they are consumed.
            for worker in workers:
                worker.join()

            unhandledTags = set()
            for doc in sortedDocs:
                unhandledTags |= set(re.findall(r'(\\[^\s\*]+?\b\*?)', osisSegment[doc]))
                OSISfile.write(osisSegment[doc])
            writeOSISFooter(OSISfile)
        finally:
            # Always flush and release the output file, even on failure.
            OSISfile.close()

        if unhandledTags:
            if verbose:
                print('')
            print('Unhandled USFM tags: ' + ', '.join(sorted(unhandledTags)) + ' (' + str(len(unhandledTags)) + ' total)')
            if not relaxedConformance:
                print('Consider using the -r option for relaxed markup processing.')
Property changes on: trunk/modules/python/usfm2osis.py
___________________________________________________________________
Added: svn:executable
+ *
Added: trunk/modules/python/usfmtags.py
===================================================================
--- trunk/modules/python/usfmtags.py (rev 0)
+++ trunk/modules/python/usfmtags.py 2012-08-04 11:10:27 UTC (rev 360)
@@ -0,0 +1,68 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
date = '$Date: 2012-03-09 01:23:40 -0800 (Fri, 09 Mar 2012) $'
rev = '$Rev: 355 $'

USFMversion = '2.35' # http://ubs-icap.org/chm/usfm/2.35/index.html

# usfmtags.py version 1.0
# Copyright 2012 by the CrossWire Bible Society <http://www.crosswire.org/>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# The full text of the GNU General Public License is available at:
# <http://www.gnu.org/licenses/gpl-3.0.txt>.

import re, sys, codecs

# Reduce the SVN keyword expansions to their bare values:
# date keeps the ten-character YYYY-MM-DD part, rev keeps the number.
date = date.replace('$', '').strip()[6:16]
rev = rev.replace('$', '').strip()[5:]

# Tags that are valid exactly as written. Kept as a set, like digitTags,
# because it is only ever used for membership tests.
simpleTags = set(['\\id', '\\ide', '\\sts', '\\rem', '\\h', '\\toc1', '\\toc2', '\\toc3', '\\ip', '\\ipi', '\\im', '\\imi', '\\ipq', '\\imq', '\\ipr', '\\ib', '\\ili', '\\iot', '\\ior', '\\ior*', '\\iex', '\\iqt', '\\iqt*', '\\imte', '\\ie', '\\mr', '\\sr', '\\r', '\\rq', '\\rq*', '\\d', '\\sp', '\\c', '\\ca', '\\ca*', '\\cl', '\\cp', '\\cd', '\\v', '\\va', '\\va*', '\\vp', '\\vp*', '\\p', '\\m', '\\pmo', '\\pm', '\\pmc', '\\pmr', '\\mi', '\\nb', '\\cls', '\\pc', '\\pr', '\\b', '\\qr', '\\qc', '\\qs', '\\qs*', '\\qa', '\\qac', '\\qac*', '\\tr', '\\f', '\\f*', '\\fe', '\\fe*', '\\fr', '\\fk', '\\fq', '\\fqa', '\\fl', '\\fp', '\\fv', '\\ft', '\\fdc', '\\fdc*', '\\fm', '\\fm*', '\\x', '\\x*', '\\xo', '\\xk', '\\xq', '\\xt', '\\xot', '\\xot*', '\\xnt', '\\xnt*', '\\xdc', '\\xdc*', '\\add', '\\add*', '\\bk', '\\bk*', '\\dc', '\\dc*', '\\k', '\\k*', '\\lit', '\\nd', '\\nd*', '\\ord', '\\ord*', '\\pn', '\\pn*', '\\qt', '\\qt*', '\\sig', '\\sig*', '\\sls', '\\sls*', '\\tl', '\\tl*', '\\wj', '\\wj*', '\\em', '\\em*', '\\bd', '\\bd*', '\\it', '\\it*', '\\bdit', '\\bdit*', '\\no', '\\no*', '\\sc', '\\sc*', '\\pb', '\\fig', '\\fig*', '\\ndx', '\\ndx*', '\\pro', '\\pro*', '\\w', '\\w*', '\\wg', '\\wg*', '\\wh', '\\wh*', '\\periph', '\\ef', '\\ef*', '\\ex', '\\ex*', '\\esb', '\\esbe', '\\cat', '\\z'])
# Tags that may carry a trailing level number (e.g. \q1, \mt2).
digitTags = set(['\\imt', '\\is', '\\iq', '\\io', '\\mt', '\\mte', '\\ms', '\\s', '\\pi', '\\li', '\\ph', '\\q', '\\qm', '\\th', '\\thr', '\\tc', '\\tcr'])
+
def main(argv):
    """Scan USFM files and print the tags found, split into known and unknown.

    Parameters:
        argv: argument list in sys.argv form; argv[1:] are the paths of the
            USFM files to scan, read as UTF-8. If '-h' or '--help' is present
            or no file is given, the usage text is printed instead.

    A tag counts as known when it is listed in simpleTags, or when it is
    listed in digitTags once any trailing digits are removed. Both lists are
    printed in sorted order.
    """
    if '-h' in argv or '--help' in argv or len(argv) < 2:
        printUsage()  # prints the help text and exits
        return

    tagSet = set()
    for doc in argv[1:]:
        # The with-block closes each input file as soon as it has been read.
        with codecs.open(doc, 'r', 'utf-8') as usfmFile:
            text = usfmFile.read()
        tagSet.update(re.findall(r'(\\[a-zA-Z0-9]+\b\*?)', text))

    knownSet = set()
    unknownSet = set()
    for tag in tagSet:
        if tag in simpleTags or tag.rstrip('1234567890') in digitTags:
            knownSet.add(tag)
        else:
            unknownSet.add(tag)

    # Single-argument print(...) works as a statement and as a function.
    print('Known USFM Tags: ' + ', '.join(sorted(knownSet)))
    print('Unrecognized USFM Tags: ' + ', '.join(sorted(unknownSet)))
+
+
+
def printUsage():
    """Print the usage text for usfmtags.py and terminate the program.

    The text includes the module-level rev, date and USFMversion values.

    Raises:
        SystemExit: always, once the text has been printed.
    """
    # Single-argument print(...) works as a statement and as a function.
    print('usfmtags.py <USFM filenames|wildcard>')
    print(' Revision: ' + rev + ' (' + date + ')')
    print('')
    print(' This utility will scan USFM files and print two lists of all unique tags in them.')
    print(' The first list identifies all valid tags, identified in the USFM ' + USFMversion + ' spec.')
    print(' The second list identifies tags unknown to that spec.')
    # sys.exit() rather than exit(): the latter is a helper injected by the
    # site module for interactive sessions and is not meant for programs.
    sys.exit()
+
+if __name__ == "__main__":
+ main(sys.argv)
More information about the sword-cvs
mailing list