[sword-svn] r554 - trunk/migratetags

scribe at crosswire.org scribe at crosswire.org
Thu Apr 13 04:31:02 EDT 2023


Author: scribe
Date: 2023-04-13 04:31:02 -0400 (Thu, 13 Apr 2023)
New Revision: 554

Modified:
   trunk/migratetags/migratetags.cpp
Log:
updated migratetags to allow a TEI xml file as input


Modified: trunk/migratetags/migratetags.cpp
===================================================================
--- trunk/migratetags/migratetags.cpp	2022-09-18 17:05:59 UTC (rev 553)
+++ trunk/migratetags/migratetags.cpp	2023-04-13 08:31:02 UTC (rev 554)
@@ -2,6 +2,7 @@
 #include <utf8greekaccents.h>
 #include <swmgr.h>
 #include <utilxml.h>
+#include <filemgr.h>
 #include <swbuf.h>
 #include <swconfig.h>
 #include <swmodule.h>
@@ -22,9 +23,11 @@
 // hard code your from and to modules here or pass them on the command line with -
 SWBuf strongsSourceModuleName = "WHNU";
 SWBuf targetModuleName = "NA28FromImp";
+SWBuf targetTEIFile = "";
 
 
 const char *ignoreSeries = "⸆¹⸆²⸆⸇᾿˸¹˸²˸³˸·¹²⟦–ʹ°¹°²⸋¹⸋²⸋⸌¹⸌°*[];⸀¹⸀²⸀³⸁⸀◆⟧ ⸂¹⸂²⸄⸂⸅⸃⸉¹⸈⸉⸊  ";
+//const char *ignoreSeries = "";
 
 typedef vector<unsigned long> BibMap;
 
@@ -48,6 +51,7 @@
 	fprintf(stderr, "\nusage: %s [options]\n", progName);
 	fprintf(stderr, "  -ss <moduleName>\t provide the Strong's source module name\n");
 	fprintf(stderr, "  -t  <moduleName>\t provide the target module name\n");
+	fprintf(stderr, "  -tei <filename>\t provide the target tei filename\n");
 	fprintf(stderr, "  -e  <exception file>\t provide an ini-style .conf file with overriding tag exceptions.\n");
 	fprintf(stderr, "  -fa\t\t\t filter accents: remove Greek accents from final text\n");
 	fprintf(stderr, "  -fc\t\t\t filter critical apparatus markers from final text\n");
@@ -58,7 +62,123 @@
 }
 
 
+SWModule *targetMod = 0;
+bool getNextVerse(VerseKey *targetModKey, SWBuf *targetModText) {
+	static int z = 0;
+	static bool finished = false;
+	if (++z == 1) {
+		((VerseKey *)targetMod->getKey())->setIntros(true);
+		targetMod->getKey()->setText("mat0.0");
+	}
 
+	// assert our source is in good condition to give us more data
+	if (finished) return false;
+
+	// grab our raw, fully marked up TargetMod text for this verse
+	(*targetModText) = targetMod->getRawEntryBuf();
+	(*targetModKey) = (*(targetMod->getKey()));
+
+	// clear any error from retrieving text
+	targetMod->popError();
+	(*targetMod)++;
+	finished = targetMod->popError();
+
+	return true;
+}
+
+FileDesc *targetInput = 0;
+bool getNextVerseTEI(VerseKey *targetModKey, SWBuf *targetModText) {
+	static bool finished = false;
+	static bool fileEnd = false;
+	static SWBuf line = "";
+
+	XMLTag lastAB("</ab>");
+	(*targetModText) = "";
+
+	while (!fileEnd || line.size()) {
+		if (!line.size()) {
+			fileEnd = !FileMgr::getLine(targetInput, line);
+			if (!fileEnd) line.append("\n");
+		}
+		int offset = line.indexOf("<ab ");
+		int endOffset = line.indexOf("</ab>");
+		if (offset < 0) offset = endOffset;
+		else if (endOffset > -1 && endOffset < offset) offset = endOffset;
+		if (offset > -1) {
+			targetModText->append(line, offset);
+			line << offset;
+			int end = line.indexOf(">");
+			if (end > -1) {
+				SWBuf abText = "";
+				abText.append(line, end+1);
+				XMLTag ab(abText);
+				targetModText->append(abText);
+				line << (end+1);
+				if (ab.isEndTag()) {
+					break;
+				}
+				lastAB = ab;
+			}
+		}
+		else {
+			targetModText->append(line);
+			line = "";
+		}
+	}
+
+	// assert our source is in good condition to give us more data
+	if (fileEnd && !line.size()) return false;
+
+	// grab our raw, fully marked up TargetMod text for this verse
+	if (lastAB.isEndTag()) {
+		// we are just returning interverse material so targetModKey is out of bounds
+		// just set to any error
+		targetModKey->setError(-99);
+	}
+	else {
+		// <ab xml:id="V-B33K1V2-33-MATT" n="2">
+		SWBuf id = lastAB.getAttribute("xml:id");
+		SWBuf bkv = "";
+		SWBuf bookName = "";
+		SWBuf bookNum = "";
+		SWBuf chapter = "";
+		SWBuf verse = "";
+		SWBuf segment = id.stripPrefix('-');
+		if (!segment.size()) bkv = id;
+		if (!bkv.size() && !segment.startsWith("B")) {
+			segment = id.stripPrefix('-');
+		}
+		else if (!bkv.size()) bkv = segment;
+		if (!bkv.size() && !segment.startsWith("B")) {
+			segment = id.stripPrefix('-');
+		}
+		else if (!bkv.size()) bkv = segment;
+		// if we have more segments, find the last segment
+		// because this is likely the bookName
+		if (bkv.size() && id.size() && id != bkv) {
+			id.stripPrefix('-');
+			id.stripPrefix('-');
+			id.stripPrefix('-');
+			id.stripPrefix('-');
+			bookName = id;
+		}
+		if (bkv.size()) {
+			bkv << 1;
+			bookNum = bkv.stripPrefix('K');
+			chapter = bkv.stripPrefix('V');
+			verse = bkv;
+
+			SWBuf osisID = (bookName.size() ? bookName : bookNum);
+			osisID.appendFormatted(".%s.%s", chapter.c_str(), verse.c_str());
+			(*targetModKey) = osisID;
+		}
+
+	}
+
+	return true;
+}
+
+
 int main(int argc, char **argv) {
 	const char *progName   = argv[0];
 	for (int i = 1; i < argc; ++i) {
@@ -83,6 +203,12 @@
 			}
 			else usage(progName, "-t argument requires a module name.");
 		}
+		else if (!strcmp(argv[i], "-tei")) {
+			if ((i + 1) < argc) {
+				targetTEIFile = argv[++i];
+			}
+			else usage(progName, "-tei argument requires a tei filename.");
+		}
 		else if (!strcmp(argv[i], "-e")) {
 			if (i+1 < argc) {
 				optionExceptionFile.push_back(argv[++i]);
@@ -94,13 +220,25 @@
 
 	SWMgr lib;
 	lib.setGlobalOption("Textual Variants", "Secondary Reading");
-	SWModule *m = lib.getModule(targetModuleName);
-	if (!m) {
-		cerr << "\nERROR: couldn't find target module: " << targetModuleName << ".\n";
-		if (argc < 2) usage(progName, "Use -t to supply target module name");
-		exit(1);
+	SWModule *m = 0;
+	if (targetTEIFile.size()) {
+		targetInput = FileMgr::getSystemFileMgr()->open(targetTEIFile, FileMgr::RDONLY);
+		if (!targetInput || targetInput->getFd() < 1) {
+			cerr << "\nERROR: couldn't open tei file: " << targetTEIFile << ".\n";
+			usage(progName, "Use -tei <filename> to supply tei filename");
+			exit(1);
+		}
 	}
-	SWModule &targetMod = *m;
+	else {
+		m = lib.getModule(targetModuleName);
+		if (!m) {
+			cerr << "\nERROR: couldn't find target module: " << targetModuleName << ".\n";
+			if (argc < 2) usage(progName, "Use -t to supply target module name");
+			exit(1);
+		}
+		targetMod = m;
+	}
+
 	m = lib.getModule(strongsSourceModuleName.c_str());
 	if (!m) {
 		cerr << "\nERROR: couldn't find Strong's source module: " << strongsSourceModuleName.c_str() << ".\n";
@@ -115,15 +253,17 @@
 		else (*exceptionFile) += SWConfig(fileName);
 	}
 
-	// we'll do the whole Bible eventually, but let's just get one verse
-	// working well.
-	((VerseKey *)targetMod.getKey())->setIntros(true);
-	targetMod.getKey()->setText("mat0.0");		// let's try this verse
-	int z = 0;
-	for (;
-//!z &&
-!targetMod.popError(); targetMod++) {
-	z++;
+	VerseKey *targetModKey = (VerseKey *)(targetInput ? fromMod.createKey() : targetMod->createKey());
+	targetModKey->setIntros(true);
+	SWBuf targetModText;
+	while ((targetInput ? getNextVerseTEI(targetModKey, &targetModText) : getNextVerse(targetModKey, &targetModText))) {
+		if (targetModKey->getError()) {
+			cout << targetModText;
+			cout << endl;
+			continue;
+		}
+		// we'll do the whole Bible eventually, but let's just get one verse
+		// working well.
 
 		// XML word tags which should be placed in this verse (start tag)
 		// eg., <w lemma=...>
@@ -177,11 +317,13 @@
 		bibMap.clear();
 		wTags.clear();
 
-		fromMod.setKey(targetMod.getKey());
-		cout << "$$$ " << targetMod.getKeyText() << endl;
+		fromMod.setKey(targetModKey);
+		if (!targetTEIFile.size()) {
+			cout << "$$$ " << targetModKey->getText() << endl;
+		}
 
 if (optionDebug) {
-		cout << "\nProcessing Verse: " << targetMod.getKeyText() << endl;
+		cout << "\nProcessing Verse: " << targetModKey->getText() << endl;
 		cout << "---------------------" << endl;
 
 		cout << "\nOur strongsSourceModule Markup" << endl;
@@ -192,7 +334,7 @@
 
 
 		// grab our raw, fully marked up TargetMod text for this verse
-		SWBuf orig = targetMod.getRawEntryBuf();
+		SWBuf orig = targetModText;
 
 
 if (optionDebug) {
@@ -268,7 +410,7 @@
 
 		// ok, now that we have our targetWordTags magically populated
 		// let's do the grunt work of inserting the <w> and </w> tags
-		insertWordTags((VerseKey *)targetMod.getKey(), newTargetModMarkup, bibMap, wTags, targetWordTags, wordTags, targetWordStarts, targetWordEnds);
+		insertWordTags((VerseKey *)targetModKey, newTargetModMarkup, bibMap, wTags, targetWordTags, wordTags, targetWordStarts, targetWordEnds);
 
 
 if (optionDebug) {
@@ -281,13 +423,13 @@
 		for (int i = 0; i < targetWords.size(); ++i) {
 			if (targetWordTags[i] == -1 && !strstr(ignoreSeries, targetWords[i])) {
 				if (!warned) {
-					cerr << "*** Error: didn't match all words: " << targetMod.getKeyText() << endl;
+					cerr << "*** Error: didn't match all words: " << targetModKey->getText() << endl;
 					cerr << strongsSourceModuleName.c_str() << ":";
 					for (int j = 0; j < fromWords.size(); ++j) {
 						cerr << " " << fromWords[j];
 					}
 					cerr << endl;
-					cerr << targetModuleName << ":";
+					cerr << (targetTEIFile.size() ? targetTEIFile : targetModuleName) << ":";
 					for (int j = 0; j < targetWords.size(); ++j) {
 						cerr << " " << targetWords[j];
 					}
@@ -303,8 +445,8 @@
 }
 		}
 		if (warned) {
-			cerr << "\n" << targetModuleName << " Tags:\n";
-			VerseKey *vk = (VerseKey *)targetMod.getKey();
+			cerr << "\n" << (targetTEIFile.size() ? targetTEIFile : targetModuleName) << " Tags:\n";
+			VerseKey *vk = (VerseKey *)targetModKey;
 			for (int j = 0; j < targetWords.size(); ++j) {
 				if (!strstr(ignoreSeries, targetWords[j])) {
 					cerr << targetWords[j] << "\t\t " << vk->getOSISRef() << "." << j << "=" << (targetWordTags[j] > -1 ? (const char *)wordTags[targetWordTags[j]] : (targetWordTags[j] == -2 ? "{Using Exception}" : "")) << endl;
@@ -325,7 +467,10 @@
 		cout << "\nAND... Here's our final output" << endl;
 		cout << "---------------------" << endl;
 }
-		cout << newTargetModMarkup << endl;
+		cout << newTargetModMarkup;
+		if (!targetTEIFile.size()) {
+			cout << endl;
+		}
 if (optionDebug) {
 		cout << endl;
 }



More information about the sword-cvs mailing list