#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# confmaker.py - Provides an initial conf file for a new module by analyzing
# the related OSIS xml file.
#
# The programme searches for relevant tags and creates the GlobalOptionFilter
# entries and other relevant conf entries. This is a port to Python from the
# previous confmaker.pl Perl script we were using. It fixes detection of
# diacritics and OSISMorphSegmentation (GlobalOptionFilters) and adds support
# for genbook and modules with large entries > 64Kb.

# Copyright (C) 2020 CrossWire Bible Society
# Author: kris & domcox

# This file is part of Sword Modules

# Sword Modules is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# Sword Modules is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with Sword Modules. If not, see <https://www.gnu.org/licenses/>.

# Created: 2021-01-08
#
# Revision:
# 2021-01-16 domcox
#   Changed language library from iso-639 to langtags
# 2023-07-30 domcox
#   Full rewrite using ElementTree XML parsing module

# TODO:
# - EntrySize for verses that do not use milestone elements
# - EntrySize for book titles & introduction
# - Implement GlobalOptionFilter=OSISReferenceLinks

import argparse
import sys
import time
import xml.etree.ElementTree as ET
from datetime import date
from pathlib import Path

# The two third-party libraries are imported defensively: a missing library is
# recorded as None here and reported by _require_libraries(), which main()
# calls before doing anything else. Importing this file therefore has no side
# effects, while running it as a script still stops with the same messages.
try:
    import langtags
except ImportError:
    langtags = None

try:
    import Sword
except ImportError:
    Sword = None

# Variables
Version = '2.0'

# List of V11n and relative SWORD Minimum version
versification = {
    'KJV': '1.5.9',
    'KJVA': '1.6.0',
    'NRSV': '1.6.0',
    'NRSVA': '1.6.0',
    'MT': '1.6.0',
    'Leningrad': '1.6.0',
    'Synodal': '1.6.1',
    'Vulg': '1.6.1',
    'Luther': '1.6.1',
    'German': '1.6.1',
    'Catholic': '1.6.2',
    'Catholic2': '1.6.2',
    'LXX': '1.7.2',
    'Orthodox': '1.7.2',
    'SynodalProt': '1.7.2',
    'DarbyFr': '1.8.0',
    'Segond': '1.8.0',
    'Calvin': '1.8.0',
}

# XML names used while scanning the OSIS document
OSIS_NS = '{http://www.bibletechnologies.net/2003/OSIS/namespace}'
XML_LANG = '{http://www.w3.org/XML/1998/namespace}lang'

# Accepted osisRefWork values -> (ModDrv prefix, DataPath prefix)
MODULE_LAYOUT = {
    'bible': ('zText', './modules/texts/ztext'),
    'commentary': ('zCom', './modules/comments/zcom'),
    'genbook': ('RawGenBook', './modules/genbook/rawgenbook'),
}

# OSIS GlobalOptionFilter names, in the order they are written to the conf
OSIS_FILTERS = (
    'OSISFootnotes',
    'OSISHeadings',
    'OSISScripref',
    'OSISRedLetterWords',
    'OSISVariants',
    'OSISMorphSegmentation',
    'OSISLemma',
    'OSISStrongs',
    'OSISGlosses',
    'OSISMorph',
    'OSISEnum',
    'OSISXlit',
)

# SWORD diacritic options -> languages for which the option is tested
DIACRITIC_LANGUAGES = {
    'Arabic Vowel Points': ('ar',),
    'Greek Accents': ('grc',),
    'Hebrew Cantillation': ('he', 'hbo'),
    'Hebrew Vowel Points': ('he', 'hbo'),
}

# (SWORD diacritic option, GlobalOptionFilter written when it applies),
# in the order the entries appear in the conf
DIACRITIC_FILTERS = (
    ('Hebrew Vowel Points', 'UTF8HebrewPoints'),
    ('Arabic Vowel Points', 'UTF8ArabicPoints'),
    ('Hebrew Cantillation', 'UTF8Cantillation'),
    ('Greek Accents', 'UTF8GreekAccents'),
)

# Entries larger than this need the big-entry ("4") module drivers
BIG_ENTRY_THRESHOLD = 64000

# Entry size assumed when the user forces big entries with "-s 4"
FORCED_BIG_ENTRY_SIZE = 65536

# Conf entries emitted when no conf.in file is supplied
DEFAULT_ENTRIES = (
    'DistributionLicense=Copyrighted',
    'Description=This is a new module',
    'Version=1.0',
    'History_1.0=First release',
)


# Functions
def die(msg):
    """
    Show an error message on stderr then exit with status 1.
    """
    print('ERROR! ' + msg, file=sys.stderr)
    sys.exit(1)


def _require_libraries():
    """
    Exit with an installation hint if langtags or SWORD could not be imported.
    """
    if langtags is None:
        sys.stderr.write("You do not have the Python langtags library installed. Please install it (pip install langtags).\n")
        sys.exit(1)
    if Sword is None:
        sys.stderr.write("You do not have the SWORD library installed. Please install it.\n")
        sys.exit(1)


def get_parameters():
    """
    Parse the command-line options.

    Returns a dict containing the parameter values, keyed by option name
    ('osis', 'outfile', 'v11n', 'size', 'infile').
    """
    # Creating parser
    description = '''
    provides a conf file for a module by analysing the given OSIS XML file.
    Optionally include extra elements from a conf.in file. This option will
    be removed in a future version.
    '''
    parser = argparse.ArgumentParser(description=description)

    # Adding arguments
    parser.add_argument('osis', help='name of the OSIS XML file')
    parser.add_argument("-o", "--outfile", help="name of generated conf file, (default to screen)")
    parser.add_argument("-v", "--v11n", default='KJV', help="versification schema, (default: KJV)")
    parser.add_argument("-s", "--size", default='2', help="set -s 4 for modules with large entries > 64Kb, (default -s 2)")
    parser.add_argument("-i", "--infile", help="conf.in file containing extra elements to include, (default none)")

    # Parsing arguments
    args = parser.parse_args()
    return (vars(args))


def check_parameters(params):
    """
    Check command arguments.

    Exits through die() when the OSIS file or the conf.in file does not
    exist, when the versification schema is unknown, or when the entry size
    is not an integer. Returns True otherwise.
    """
    # Check OSIS file value
    osisfile = params['osis']
    if not Path(osisfile).is_file():
        die(f"File '{osisfile}' does not exist.")

    # Check conf.in file value
    infile = params['infile']
    if infile and not Path(infile).is_file():
        die(f"File '{infile}' does not exist.")

    # Check versification schema
    v11n = params['v11n']
    if v11n not in versification:
        die(f"'{v11n}': Unknown versification schema.")

    # Check entry size: it is converted with int() later on, so reject
    # anything that would otherwise end in a traceback
    size = params.get('size', '2')
    try:
        int(size)
    except (TypeError, ValueError):
        die(f"'{size}': Invalid entry size, expected 2 or 4.")

    return (True)


def get_language(lang):
    """
    Search BCP-47 Languages Database for the given lang.

    Returns the language description on a single line; exits through die()
    when langtags cannot resolve the tag.
    """
    try:
        tag = langtags.Tag(lang)
    except Exception:
        # The exception types raised by langtags are not visible from this
        # file, hence the broad (but no longer bare) handler.
        die(f"Language '{lang}' not found in BCP 47 Languages Database")

    # Sometimes language description is multiline -> remove '\n'
    return (tag.language.description.replace('\n', ' '))


def is_diacritic(xml_file, lang, diacritic):
    '''
    Search for 'diacritic' in OSIS File.

    xml_file is the path of the OSIS document, lang the module language tag
    and diacritic the name of a SWORD diacritic option (see
    DIACRITIC_LANGUAGES). Returns True when applying the option to the bare
    text of the document changes it, False otherwise.
    '''
    # Don't search OSIS targetting other languages than Hebrew, Greek, Arabic
    if lang not in ('ar', 'grc', 'he', 'hbo'):
        return False

    # A known diacritic is only tested against its own language(s)
    languages = DIACRITIC_LANGUAGES.get(diacritic)
    if languages is not None and lang not in languages:
        return False

    # Grab the base SWORD manager
    mgr = Sword.SWMgr()
    for option in DIACRITIC_LANGUAGES:
        mgr.setGlobalOption(option, "Off")

    # Parse XML
    xml_root = ET.parse(xml_file).getroot()

    # Remove all tags and keep bare text only, make 2 sets
    strip_text = ET.tostring(xml_root, encoding='unicode', method='text')
    ref_text = Sword.SWBuf(strip_text)
    mod_text = Sword.SWBuf(strip_text)

    # Apply filter on 1 text
    mgr.filterText(diacritic, mod_text)

    # Compare original bare text and filtered one:
    # True if the filter has made changes to the text, False otherwise
    return (ref_text.c_str() != mod_text.c_str())


def _detect_filters(tag, node, found):
    """
    Add to the set 'found' the OSIS filter names implied by one element.

    tag is the element name without the OSIS namespace ('' for elements of
    any other namespace) and node the element itself.
    """
    if tag == 'note':
        found.add('OSISFootnotes')
    elif tag == 'title':
        found.add('OSISHeadings')
    elif tag == 'reference':
        found.add('OSISScripref')
    elif tag == 'q':
        found.add('OSISRedLetterWords')
    elif tag == 'seg':
        seg_type = node.get('type')
        if seg_type is not None:
            if 'x-variant' in seg_type:
                found.add('OSISVariants')
            if 'morph:' in seg_type:
                found.add('OSISMorphSegmentation')
    elif tag == 'w':
        lemma = node.get('lemma')
        if lemma is not None:
            found.add('OSISLemma')
            if 'strong' in lemma:
                found.add('OSISStrongs')
        attributes = node.keys()
        if 'gloss' in attributes:
            found.add('OSISGlosses')
        if 'morph' in attributes:
            found.add('OSISMorph')
        if 'n' in attributes:
            found.add('OSISEnum')
        if 'xlit' in attributes:
            found.add('OSISXlit')


def _scan_osis(osis_path, entry_size):
    """
    Stream through the OSIS file and collect what the conf file needs.

    entry_size seeds the largest verse size seen. Returns a dict with the
    keys 'name', 'type', 'lang' (taken from the first osisText element; all
    empty strings when there is none), 'filters' (set of OSIS filter names),
    'images', 'no_paragraphs' and 'entry_size'. Exits through die() when
    the osisText attributes are missing or invalid.
    """
    name = ''
    mod_type = ''
    lang = ''
    filters = set()
    images = False
    no_paragraphs = True
    in_header = False
    in_chapter = False
    in_verse = False
    verse_len = 0   # characters collected for the verse being read

    for event, node in ET.iterparse(osis_path, events=("start", "end")):
        tag = node.tag[len(OSIS_NS):] if node.tag.startswith(OSIS_NS) else ''

        # OsisText content (first osisText element only)
        if not name and tag == 'osisText':
            # Get osisIDWork
            name = node.get('osisIDWork')
            if not name:
                die('FATAL: osisIDWork is empty.')
            # Get osisRefWork
            ref_work = node.get('osisRefWork')
            mod_type = (ref_work or '').lower()
            if mod_type not in MODULE_LAYOUT:
                die(f"FATAL: Invalid attribute osisRefWork: {ref_work}")
            # Get Language
            lang = node.get(XML_LANG)
            if not lang:
                die('FATAL: Missing lang element')

        # Track the header and chapter sections
        if tag == 'header':
            in_header = (event == "start")
        elif tag == 'chapter':
            attributes = node.keys()
            if 'sID' in attributes:
                # Milestone chapter: stays open until the matching eID,
                # whichever event of this empty element is being handled
                in_chapter = True
            elif 'eID' in attributes:
                in_chapter = False
            else:
                # Container chapter
                in_chapter = (event == "start")

        # Nothing inside the header contributes to the conf elements
        if in_header:
            continue

        # GlobalOptionFilters and images
        _detect_filters(tag, node, filters)
        if tag == 'figure':
            images = True

        # Search only inside Chapters
        if not in_chapter:
            continue

        if tag == 'p':
            no_paragraphs = False
        elif tag == 'verse':
            # Entries length - Get verse max size (milestone verses only)
            attributes = node.keys()
            if 'sID' in attributes:
                in_verse = True
                verse_len = 0
            if 'eID' in attributes:
                in_verse = False
                entry_size = max(entry_size, verse_len)

        # Entries length - Get verse text.
        # NOTE(review): this runs on both "start" and "end" events, as it
        # always did, and iterparse does not guarantee text/tail on "start",
        # so the figure is an approximation of the verse size in characters.
        if in_verse:
            verse_len += len(node.text or '') + len(node.tail or '')

    return {
        'name': name,
        'type': mod_type,
        'lang': lang,
        'filters': filters,
        'images': images,
        'no_paragraphs': no_paragraphs,
        'entry_size': entry_size,
    }


def _build_elements(scan, args):
    """
    Turn the result of _scan_osis() into the ordered list of conf lines.
    """
    name = scan['name']
    mod_type = scan['type']
    lang = scan['lang']
    filters = scan['filters']

    # Set Name
    elements = ["[" + name + "]"]

    # Derive module name
    module = name.lower()

    # Set big entry option for entries greater than 64K bytes
    big = '4' if scan['entry_size'] > BIG_ENTRY_THRESHOLD else ''

    # Set ModDrv and DataPath
    # NOTE(review): the big-entry suffix is also applied to RawGenBook, as
    # before; confirm that SWORD provides such a driver.
    driver, base_path = MODULE_LAYOUT[mod_type]
    data_path = base_path + big + "/" + module + "/"
    if mod_type == 'genbook':
        # genbook paths end with the file name prefix, not with a directory
        data_path += module
    elements.append("ModDrv=" + driver + big)
    elements.append("DataPath=" + data_path)

    # Set Compression
    if mod_type in ('bible', 'commentary'):
        elements.append('CompressType=ZIP')

    # Set misc. elements
    elements.append('BlockType=BOOK')
    elements.append('Encoding=UTF-8')
    elements.append('SourceType=OSIS')
    elements.append('OSISVersion=2.1.1')
    elements.append('SwordVersionDate=' + str(date.today()))

    # Set Lang
    elements.append('Lang=' + lang)

    # Set GlobalOptionFilters
    for filter_name in OSIS_FILTERS:
        if filter_name in filters:
            elements.append('GlobalOptionFilter=' + filter_name)

    # Set Diacritics
    for option, filter_name in DIACRITIC_FILTERS:
        if is_diacritic(args['osis'], lang, option):
            elements.append('GlobalOptionFilter=' + filter_name)

    # Set Features
    if 'OSISStrongs' in filters:
        elements.append('Feature=StrongsNumbers')
    if scan['images']:
        elements.append('Feature=Images')
    if scan['no_paragraphs']:
        elements.append('Feature=NoParagraphs')

    # Set LCSH (the language is looked up, and validated, for every type)
    language = get_language(lang)
    if mod_type != 'genbook':
        elements.append('LCSH=' + mod_type.capitalize() + '.' + language)

    # Set Sword Minimum Version
    elements.append('MinimumVersion=' + versification[args['v11n']])

    # Set Versification
    if mod_type != 'genbook':
        elements.append('Versification=' + args['v11n'])

    return elements


def osis2conf_parser(args):
    '''
    This function parses the OSIS file, searches for specific tags and
    creates the relevant conf elements that will be used to build the
    conf file.

    args is the parameter dict returned by get_parameters(); the keys
    'osis', 'size' and 'v11n' are read. Returns the list of conf lines.
    Exits through die() when the document is not well-formed XML, has no
    osisText element, or carries invalid osisText attributes.
    '''
    # Big entry requested on the command line: start from a size that is
    # already above the threshold
    seed_size = FORCED_BIG_ENTRY_SIZE if int(args['size']) > 2 else 0

    try:
        scan = _scan_osis(args['osis'], seed_size)
    except ET.ParseError as err:
        die(f"'{args['osis']}' is not well-formed XML: {err}")

    if not scan['name']:
        die('FATAL: No osisText element found.')

    elements = _build_elements(scan, args)

    # Diagnostic only: sent to stderr so that it never ends up inside a conf
    # printed on the screen
    print('EntrySize=', scan['entry_size'], file=sys.stderr)
    return elements


def main():
    '''
    Main function: read the command line, analyse the OSIS file and write
    the conf file (or print it when no output file is given).
    '''
    # Stop right away when a required library is missing
    _require_libraries()

    # Start benchmark
    start_time = time.perf_counter()

    # Read CLI params
    params = get_parameters()
    check_parameters(params)

    # Parse OSIS
    lines = list(osis2conf_parser(params))

    # Include conf.in file if it exists, otherwise generate default values
    infile = params['infile']
    if infile:
        with open(infile, 'r', encoding='utf-8', newline='\n') as conf_in:
            lines.extend(line.rstrip() for line in conf_in)
    else:
        lines.extend(DEFAULT_ENTRIES)

    # Generate conf file
    outfile = params['outfile']
    if outfile:
        # Written in one go, as UTF-8 to agree with the Encoding entry
        with open(outfile, 'w', encoding='utf-8') as conf_out:
            conf_out.write('\n'.join(lines) + '\n')
    else:
        # Default to screen
        for line in lines:
            print(line)

    # Benchmark results
    end_time = time.perf_counter()
    total_time = round(end_time - start_time, 1)
    print(f'-- Module Config generated in {total_time} s')


if __name__ == '__main__':
    main()