[sword-svn] r554 - trunk/migratetags
scribe at crosswire.org
scribe at crosswire.org
Thu Apr 13 04:31:02 EDT 2023
Author: scribe
Date: 2023-04-13 04:31:02 -0400 (Thu, 13 Apr 2023)
New Revision: 554
Modified:
trunk/migratetags/migratetags.cpp
Log:
updated migratetags to allow a TEI xml file as input
Modified: trunk/migratetags/migratetags.cpp
===================================================================
--- trunk/migratetags/migratetags.cpp 2022-09-18 17:05:59 UTC (rev 553)
+++ trunk/migratetags/migratetags.cpp 2023-04-13 08:31:02 UTC (rev 554)
@@ -2,6 +2,7 @@
#include <utf8greekaccents.h>
#include <swmgr.h>
#include <utilxml.h>
+#include <filemgr.h>
#include <swbuf.h>
#include <swconfig.h>
#include <swmodule.h>
@@ -22,9 +23,11 @@
// hard code your from and to modules here or pass them on the command line with -
SWBuf strongsSourceModuleName = "WHNU";
SWBuf targetModuleName = "NA28FromImp";
+SWBuf targetTEIFile = "";
const char *ignoreSeries = "⸆¹⸆²⸆⸇᾿˸¹˸²˸³˸·¹²⟦–ʹ°¹°²⸋¹⸋²⸋⸌¹⸌°*[];⸀¹⸀²⸀³⸁⸀◆⟧ ⸂¹⸂²⸄⸂⸅⸃⸉¹⸈⸉⸊ ";
+//const char *ignoreSeries = "";
typedef vector<unsigned long> BibMap;
@@ -48,6 +51,7 @@
fprintf(stderr, "\nusage: %s [options]\n", progName);
fprintf(stderr, " -ss <moduleName>\t provide the Strong's source module name\n");
fprintf(stderr, " -t <moduleName>\t provide the target module name\n");
+ fprintf(stderr, " -tei <filename>\t provide the target tei filename\n");
fprintf(stderr, " -e <exception file>\t provide an ini-style .conf file with overriding tag exceptions.\n");
fprintf(stderr, " -fa\t\t\t filter accents: remove Greek accents from final text\n");
fprintf(stderr, " -fc\t\t\t filter critical apparatus markers from final text\n");
@@ -58,7 +62,123 @@
}
+SWModule *targetMod = 0; // module read by getNextVerse(); main() assigns it only when no TEI file was given
+bool getNextVerse(VerseKey *targetModKey, SWBuf *targetModText) { // copies targetMod's current key and raw entry into the out-params, then advances the module; returns false once an earlier advance has reported an error
+ static int z = 0; // call counter, only used to recognise the very first call
+ static bool finished = false; // remembers across calls that advancing the module reported an error
+ if (++z == 1) { // first call only: position the module before iterating
+ ((VerseKey *)targetMod->getKey())->setIntros(true); // presumably lets the key address intro (0) positions -- confirm against VerseKey docs
+ targetMod->getKey()->setText("mat0.0"); // starting position for the iteration
+ }
+ // assert our source is in good condition to give us more data
+ if (finished) return false;
+
+ // grab our raw, fully marked up TargetMod text for this verse
+ (*targetModText) = targetMod->getRawEntryBuf();
+ (*targetModKey) = (*(targetMod->getKey())); // snapshot the key before the module is advanced below
+
+ // clear any error from retrieving text
+ targetMod->popError();
+ (*targetMod)++;
+ finished = targetMod->popError(); // a non-zero error after advancing makes the next call return false
+
+ return true;
+}
+
+FileDesc *targetInput = 0; // TEI input file; main() opens it when a -tei filename was supplied, otherwise it stays 0
+bool getNextVerseTEI(VerseKey *targetModKey, SWBuf *targetModText) { // fills *targetModText with input up to and including the next closing </ab> tag and derives *targetModKey from the last opening <ab ...> tag seen in that chunk; returns false when the input is exhausted
+ static bool finished = false; // NOTE(review): not referenced anywhere else in this function
+ static bool fileEnd = false; // set once FileMgr::getLine reports no more data
+ static SWBuf line = ""; // unconsumed remainder of the current input line; static so it carries over to the next call
+
+ XMLTag lastAB("</ab>"); // placeholder end tag; still an end tag below means no opening <ab ...> was seen during this call
+ (*targetModText) = "";
+
+ while (!fileEnd || line.size()) { // continue while the file has data or a leftover line remainder is pending
+ if (!line.size()) { // only read a new line once the previous one is fully consumed
+ fileEnd = !FileMgr::getLine(targetInput, line);
+ if (!fileEnd) line.append("\n"); // put a newline back; presumably getLine strips it -- confirm
+ }
+ int offset = line.indexOf("<ab ");
+ int endOffset = line.indexOf("</ab>");
+ if (offset < 0) offset = endOffset; // no opening tag on this line: fall back to the closing tag position (may also be -1)
+ else if (endOffset > -1 && endOffset < offset) offset = endOffset; // both present: use whichever comes first
+ if (offset > -1) {
+ targetModText->append(line, offset); // copy the text that precedes the tag
+ line << offset; // presumably drops the first `offset` characters from line (SWBuf shift) -- confirm
+ int end = line.indexOf(">"); // NOTE(review): if the tag's '>' is not on this line nothing is consumed and the loop appears to repeat on the same line forever -- confirm
+ if (end > -1) {
+ SWBuf abText = "";
+ abText.append(line, end+1); // the complete tag text, including the closing '>'
+ XMLTag ab(abText);
+ targetModText->append(abText);
+ line << (end+1); // consume the tag; the rest of the line is kept for the next iteration or call
+ if (ab.isEndTag()) { // a closing </ab> ends the chunk returned by this call
+ break;
+ }
+ lastAB = ab; // remember the most recent opening <ab ...> tag for key extraction below
+ }
+ }
+ else {
+ targetModText->append(line); // no ab tag on this line: the whole line belongs to the current chunk
+ line = "";
+ }
+ }
+
+ // assert our source is in good condition to give us more data
+ if (fileEnd && !line.size()) return false; // NOTE(review): text gathered after the final </ab> is still in *targetModText here, but the caller's loop only prints on a true return -- confirm the tail is not meant to be emitted
+
+ // work out the key for this chunk from the last opening <ab ...> tag, if there was one
+ if (lastAB.isEndTag()) {
+ // we are just returning interverse material so targetModKey is out of bounds
+ // just set to any error
+ targetModKey->setError(-99); // main() checks getError() and echoes such chunks unchanged
+ }
+ else {
+ // <ab xml:id="V-B33K1V2-33-MATT" n="2">
+ SWBuf id = lastAB.getAttribute("xml:id");
+ SWBuf bkv = ""; // the B<book>K<chapter>V<verse> token, e.g. "B33K1V2" in the sample above
+ SWBuf bookName = "";
+ SWBuf bookNum = "";
+ SWBuf chapter = "";
+ SWBuf verse = "";
+ SWBuf segment = id.stripPrefix('-'); // presumably removes and returns the text before the first '-' ("V" for the sample) -- confirm against SWBuf docs
+ if (!segment.size()) bkv = id; // no segment obtained: treat the whole id as the B..K..V.. token
+ if (!bkv.size() && !segment.startsWith("B")) { // first segment is not the B.. token: try the next one
+ segment = id.stripPrefix('-');
+ }
+ else if (!bkv.size()) bkv = segment;
+ if (!bkv.size() && !segment.startsWith("B")) { // second segment is not it either: try a third
+ segment = id.stripPrefix('-');
+ }
+ else if (!bkv.size()) bkv = segment; // NOTE(review): a third segment fetched just above is never assigned to bkv -- confirm ids never need it
+ // if we have more segments, find the last segment
+ // because this is likely the bookName
+ if (bkv.size() && id.size() && id != bkv) {
+ id.stripPrefix('-'); // return values deliberately ignored: each call only discards a leading segment
+ id.stripPrefix('-');
+ id.stripPrefix('-');
+ id.stripPrefix('-');
+ bookName = id; // whatever remains ("MATT" for the sample, assuming stripPrefix is a no-op once no '-' is left -- confirm)
+ }
+ if (bkv.size()) { // when no B.. token was found the key is left untouched
+ bkv << 1; // presumably drops the leading 'B' -- confirm
+ bookNum = bkv.stripPrefix('K'); // text before 'K' ("33" for the sample)
+ chapter = bkv.stripPrefix('V'); // text between 'K' and 'V' ("1" for the sample)
+ verse = bkv; // remainder after 'V' ("2" for the sample)
+
+ SWBuf osisID = (bookName.size() ? bookName : bookNum); // prefer the book name, fall back to the book number
+ osisID.appendFormatted(".%s.%s", chapter.c_str(), verse.c_str()); // e.g. "MATT.1.2" for the sample
+ (*targetModKey) = osisID;
+ }
+
+ }
+
+ return true;
+}
+
+
int main(int argc, char **argv) {
const char *progName = argv[0];
for (int i = 1; i < argc; ++i) {
@@ -83,6 +203,12 @@
}
else usage(progName, "-t argument requires a module name.");
}
+ else if (!strcmp(argv[i], "-tei")) {
+ if ((i + 1) < argc) {
+ targetTEIFile = argv[++i];
+ }
+ else usage(progName, "-tei argument requires a tei filename.");
+ }
else if (!strcmp(argv[i], "-e")) {
if (i+1 < argc) {
optionExceptionFile.push_back(argv[++i]);
@@ -94,13 +220,25 @@
SWMgr lib;
lib.setGlobalOption("Textual Variants", "Secondary Reading");
- SWModule *m = lib.getModule(targetModuleName);
- if (!m) {
- cerr << "\nERROR: couldn't find target module: " << targetModuleName << ".\n";
- if (argc < 2) usage(progName, "Use -t to supply target module name");
- exit(1);
+ SWModule *m = 0;
+ if (targetTEIFile.size()) {
+ targetInput = FileMgr::getSystemFileMgr()->open(targetTEIFile, FileMgr::RDONLY);
+ if (!targetInput || targetInput->getFd() < 1) {
+ cerr << "\nERROR: couldn't open tei file: " << targetTEIFile << ".\n";
+ usage(progName, "Use -tei <filename> to supply tei filename");
+ exit(1);
+ }
}
- SWModule &targetMod = *m;
+ else {
+ m = lib.getModule(targetModuleName);
+ if (!m) {
+ cerr << "\nERROR: couldn't find target module: " << targetModuleName << ".\n";
+ if (argc < 2) usage(progName, "Use -t to supply target module name");
+ exit(1);
+ }
+ targetMod = m;
+ }
+
m = lib.getModule(strongsSourceModuleName.c_str());
if (!m) {
cerr << "\nERROR: couldn't find Strong's source module: " << strongsSourceModuleName.c_str() << ".\n";
@@ -115,15 +253,17 @@
else (*exceptionFile) += SWConfig(fileName);
}
- // we'll do the whole Bible eventually, but let's just get one verse
- // working well.
- ((VerseKey *)targetMod.getKey())->setIntros(true);
- targetMod.getKey()->setText("mat0.0"); // let's try this verse
- int z = 0;
- for (;
-//!z &&
-!targetMod.popError(); targetMod++) {
- z++;
+ VerseKey *targetModKey = (VerseKey *)(targetInput ? fromMod.createKey() : targetMod->createKey());
+ targetModKey->setIntros(true);
+ SWBuf targetModText;
+ while ((targetInput ? getNextVerseTEI(targetModKey, &targetModText) : getNextVerse(targetModKey, &targetModText))) {
+ if (targetModKey->getError()) {
+ cout << targetModText;
+ cout << endl;
+ continue;
+ }
+ // we'll do the whole Bible eventually, but let's just get one verse
+ // working well.
// XML word tags which should be placed in this verse (start tag)
// eg., <w lemma=...>
@@ -177,11 +317,13 @@
bibMap.clear();
wTags.clear();
- fromMod.setKey(targetMod.getKey());
- cout << "$$$ " << targetMod.getKeyText() << endl;
+ fromMod.setKey(targetModKey);
+ if (!targetTEIFile.size()) {
+ cout << "$$$ " << targetModKey->getText() << endl;
+ }
if (optionDebug) {
- cout << "\nProcessing Verse: " << targetMod.getKeyText() << endl;
+ cout << "\nProcessing Verse: " << targetModKey->getText() << endl;
cout << "---------------------" << endl;
cout << "\nOur strongsSourceModule Markup" << endl;
@@ -192,7 +334,7 @@
// grab our raw, fully marked up TargetMod text for this verse
- SWBuf orig = targetMod.getRawEntryBuf();
+ SWBuf orig = targetModText;
if (optionDebug) {
@@ -268,7 +410,7 @@
// ok, now that we have our targetWordTags magically populated
// let's do the grunt work of inserting the <w> and </w> tags
- insertWordTags((VerseKey *)targetMod.getKey(), newTargetModMarkup, bibMap, wTags, targetWordTags, wordTags, targetWordStarts, targetWordEnds);
+ insertWordTags((VerseKey *)targetModKey, newTargetModMarkup, bibMap, wTags, targetWordTags, wordTags, targetWordStarts, targetWordEnds);
if (optionDebug) {
@@ -281,13 +423,13 @@
for (int i = 0; i < targetWords.size(); ++i) {
if (targetWordTags[i] == -1 && !strstr(ignoreSeries, targetWords[i])) {
if (!warned) {
- cerr << "*** Error: didn't match all words: " << targetMod.getKeyText() << endl;
+ cerr << "*** Error: didn't match all words: " << targetModKey->getText() << endl;
cerr << strongsSourceModuleName.c_str() << ":";
for (int j = 0; j < fromWords.size(); ++j) {
cerr << " " << fromWords[j];
}
cerr << endl;
- cerr << targetModuleName << ":";
+ cerr << (targetTEIFile.size() ? targetTEIFile : targetModuleName) << ":";
for (int j = 0; j < targetWords.size(); ++j) {
cerr << " " << targetWords[j];
}
@@ -303,8 +445,8 @@
}
}
if (warned) {
- cerr << "\n" << targetModuleName << " Tags:\n";
- VerseKey *vk = (VerseKey *)targetMod.getKey();
+ cerr << "\n" << (targetTEIFile.size() ? targetTEIFile : targetModuleName) << " Tags:\n";
+ VerseKey *vk = (VerseKey *)targetModKey;
for (int j = 0; j < targetWords.size(); ++j) {
if (!strstr(ignoreSeries, targetWords[j])) {
cerr << targetWords[j] << "\t\t " << vk->getOSISRef() << "." << j << "=" << (targetWordTags[j] > -1 ? (const char *)wordTags[targetWordTags[j]] : (targetWordTags[j] == -2 ? "{Using Exception}" : "")) << endl;
@@ -325,7 +467,10 @@
cout << "\nAND... Here's our final output" << endl;
cout << "---------------------" << endl;
}
- cout << newTargetModMarkup << endl;
+ cout << newTargetModMarkup;
+ if (!targetTEIFile.size()) {
+ cout << endl;
+ }
if (optionDebug) {
cout << endl;
}
More information about the sword-cvs
mailing list