#include <iostream>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <vector>
#include <map>

#include <swbuf.h>
#include <swconfig.h>
#include <swmgr.h>
#include <swmodule.h>
#include <versekey.h>
#include <utilxml.h>
#include <utilstr.h>
#include <utf8greekaccents.h>

using namespace sword;
using namespace std;

#include "matchers/matcher.h"

// select your matcher here
#include "matchers/gntmatcher.h"

Matcher *matcher = new GNTMatcher();

const char *targetModuleName = "NA28raw";
SWBuf strongsSourceModuleName = "WHNU";
const char *ignoreSeries = "⸆¹⸆²⸆⸇᾿˸¹˸²˸³˸·¹²⟦–ʹ°¹°²⸋¹⸋²⸋⸌¹⸌°*[];⸀¹⸀²⸀³⸁⸀◆⟧ ⸂¹⸂²⸄⸂⸅⸃⸉¹⸈⸉⸊ ";

typedef vector<int> BibMap;

void insert(SWBuf addText, SWBuf &out, int bibPos, BibMap &bibMap, BibMap &wTags, bool after = false);
SWBuf findCanonicalBibleText(SWBuf orig, BibMap &bibMap, BibMap &wTags);
SWBuf buildWordMaps(const SWBuf &markupBuf, const BibMap &bibMap, vector<SWBuf> &targetWords, vector<int> &targetWordStarts, vector<int> &targetWordEnds);
void pullFromModData(SWModule &fromMod, vector<XMLTag> &wordTags, vector<SWBuf> &fromWords, vector<int> &fromWordTags);
void insertWordTags(VerseKey *vk, SWBuf &markupBuf, BibMap &bibMap, BibMap &wTags, vector<int> &targetWordTags, const vector<XMLTag> &wordTags, const vector<int> &targetWordStarts, const vector<int> &targetWordEnds);

// app options
bool optionFilterAccents = false;
bool optionFilterAppCrit = false;
bool optionDebug = false;
vector<SWBuf> optionExceptionFile;
SWConfig *exceptionFile = 0;

void usage(const char *progName, const char *error = 0) {
    if (error) fprintf(stderr, "\n%s: %s\n", progName, error);
    fprintf(stderr, "\n=== migratetags (Revision $Rev$) Migrate word morphology from one module to another.\n");
    fprintf(stderr, "\nusage: %s [options]\n", progName);
    fprintf(stderr, "  -v\t\t\t verbose: print lots of information while processing\n");
    fprintf(stderr, "  -fa\t\t\t filter accents: remove Greek accents from final text\n");
    fprintf(stderr, "  -fc\t\t\t filter critical apparatus markers from final text\n");
    fprintf(stderr, "  -ss <module>\t\t pull Strong's and morphology tags from <module> (default: WHNU)\n");
    fprintf(stderr, "  -e <file>\t\t provide an ini-style .conf file with overriding tag exceptions.\n");
    fprintf(stderr, "\n\n");
    exit(-1);
}

int main(int argc, char **argv) {

    const char *progName = argv[0];

    for (int i = 1; i < argc; ++i) {
        if (!strcmp(argv[i], "-v")) {
            optionDebug = true;
        }
        else if (!strcmp(argv[i], "-fa")) {
            optionFilterAccents = true;
        }
        else if (!strcmp(argv[i], "-fc")) {
            optionFilterAppCrit = true;
        }
        else if (!strcmp(argv[i], "-ss")) {
            if ((i + 1) < argc) {
                strongsSourceModuleName = argv[++i];
            }
            else usage(progName, "-ss argument requires a module name.");
        }
        else if (!strcmp(argv[i], "-e")) {
            if (i + 1 < argc) {
                optionExceptionFile.push_back(argv[++i]);
            }
            else usage(progName, "-e argument requires a file name.");
        }
        else usage(progName, (((SWBuf)"Unknown argument: ") + argv[i]).c_str());
    }

    SWMgr lib;
    lib.setGlobalOption("Textual Variants", "Secondary Reading");

    SWModule *m = lib.getModule(targetModuleName);
    if (!m) {
        cerr << "couldn't find target module: " << targetModuleName << ".\n";
        exit(1);
    }
    SWModule &targetMod = *m;

    m = lib.getModule(strongsSourceModuleName.c_str());
    if (!m) {
        cerr << "couldn't find source module: " << strongsSourceModuleName.c_str() << ".\n";
        exit(1);
    }
    SWModule &fromMod = *m;

    for (int i = 0; i < optionExceptionFile.size(); ++i) {
        SWBuf fileName = optionExceptionFile[i];
        if (!i) exceptionFile = new SWConfig(fileName);
        else (*exceptionFile) += SWConfig(fileName);
    }

    // we'll do the whole Bible eventually, but let's just get one verse
    // working well.
    ((VerseKey *)targetMod.getKey())->setIntros(true);
    targetMod.getKey()->setText("mat0.0");    // let's try this verse
    int z = 0;
    for (;
//        !z &&
            !targetMod.popError();
            targetMod++) {
        z++;

        // XML word tags which should be placed in this verse (start tag)
        // eg., <w lemma="strong:G..." morph="robinson:...">
        // pulled from FromMod
        vector<XMLTag> wordTags;

        // Just the raw canonical Bible text of this verse with no tags
        // eg., "In the beginning God created the heavens and the earth."
        SWBuf justTargetModBibleText = "";

        // a mapping for each character in justTargetModBibleText to the real location
        // in our out buffer.  This allows us to insert our <w> and </w>
        // tags in the correct place amongst the fully marked up
        // TargetMod out buffer.  This work is all done in the insert() method
        // below
        BibMap bibMap;
        BibMap wTags;

        // justTargetModBibleText (above) broken down into separate words
        // ie. all words in the TargetMod from this verse
        // eg. [0] = "In"; [1] = "the"; [2] = "beginning"; ...
        vector<SWBuf> targetWords;

        // where each corresponding targetWords[x] starts in justTargetModBibleText
        // eg. for "In the beginning..."
        // [0] = 0; [1] = 3; [2] = 7; ...
        // Needed to pass to the insert method so we know where
        // to insert the start <w> tag
        vector<int> targetWordStarts;

        // same as targetWordStarts, but the end of each word
        // eg. [0] = 1; [1] = 5; [2] = 15
        // Needed to pass to the insert method so we know where
        // to insert the end </w> tag
        vector<int> targetWordEnds;

        // This is the doozy.  This maps each TargetMod word to the correct
        // wordTags entry.
        vector<int> targetWordTags;

        // Equivalent to targetWords above, but for the FromMod.
        // Useful for helping determine matches to TargetMod words
        vector<SWBuf> fromWords;

        // Equivalent to targetWordTags which we need to produce,
        // but this one is produced for us from the FromMod data.
        // If we can match a fromWords[x] entry, then we can assign
        // targetWordTags[ourMatch] = fromWordTags[x]
        vector<int> fromWordTags;

        bibMap.clear();
        wTags.clear();

        fromMod.setKey(targetMod.getKey());

        cout << "$$$ " << targetMod.getKeyText() << endl;

        if (optionDebug) {
            cout << "\nProcessing Verse: " << targetMod.getKeyText() << endl;
            cout << "---------------------" << endl;
            cout << "\nOur FromMod Verse Markup" << endl;
            cout << "---------------------" << endl;
            cout << fromMod.getRawEntry() << endl;
            cout << "---------------------" << endl;
        }

        // grab our raw, fully marked up TargetMod text for this verse
        SWBuf orig = targetMod.getRawEntryBuf();

        if (optionDebug) {
            cout << "\nOur Original TargetMod Markup" << endl;
            cout << "---------------------" << endl;
            cout << orig << endl;
            cout << "---------------------" << endl;
        }

        if (optionFilterAppCrit) {
            SWBuf o = orig;
            const unsigned char *from = (unsigned char *)o.c_str();
            orig = "";
            while (*from) {
                SW_u32 ch = getUniCharFromUTF8(&from, true);
                // if ch is bad, then convert to replacement char
                if (!ch) ch = 0xFFFD;
                SWBuf checkChar;
                getUTF8FromUniChar(ch, &checkChar);
                // drop any critical apparatus marker found in ignoreSeries
                if (checkChar != " " && strstr(ignoreSeries, checkChar.c_str())) continue;
                orig.append(checkChar);
            }
        }

        // let's find where just the canonical text is amongst
        // all our markup

        // newTargetModMarkup will eventually hold our updated markup with
        // the new <w> tags, but we'll start here by setting it to
        // the processed original markup.
        // on return, bibMap will be populated with each character
        // and the corresponding location into newTargetModMarkup where
        // the character resides.
        SWBuf newTargetModMarkup = findCanonicalBibleText(orig, bibMap, wTags);

        if (optionDebug) {
            cout << "\nOur Original TargetMod Markup After XMLTag-ifying" << endl;
            cout << "---------------------" << endl;
            cout << newTargetModMarkup << endl;
            cout << "---------------------" << endl;
            cout << "\nOur bibMap" << endl;
            cout << "---------------------" << endl;
            for (BibMap::iterator it = bibMap.begin(); it != bibMap.end(); ++it) {
                cout << *it << " ";
            }
            cout << "\n---------------------" << endl;
        }

        // let's populate our TargetMod word data and fill in our
        // justTargetModBibleText buffer
        justTargetModBibleText = buildWordMaps(newTargetModMarkup, bibMap, targetWords, targetWordStarts, targetWordEnds);

        if (optionDebug) {
            cout << "\nJust TargetMod Bible Text" << endl;
            cout << "---------------------" << endl;
            cout << justTargetModBibleText << endl;
            cout << "---------------------" << endl;
        }

        // ok, now let's grab out the groovy data from the FromMod module
        pullFromModData(fromMod, wordTags, fromWords, fromWordTags);

        //
        // ok, here's the real work.
        //
        // This method needs to guess which TargetMod words match which FromMod
        // words and then point them to their same original language
        // word tag by populating targetWordTags
        //
        matcher->matchWords(targetWordTags, targetWords, fromWords, fromWordTags);

        // ok, now that we have our targetWordTags magically populated
        // let's do the grunt work of inserting the <w> and </w> tags
        insertWordTags((VerseKey *)targetMod.getKey(), newTargetModMarkup, bibMap, wTags, targetWordTags, wordTags, targetWordStarts, targetWordEnds);

        if (optionDebug) {
            cout << "\nHere's how you mapped things..." << endl;
            cout << "---------------------" << endl;
            cout << "Total wordTags: " << wordTags.size() << endl;
            cout << "\nTargetMod Words: " << endl;
        }

        bool warned = false;
        for (int i = 0; i < targetWords.size(); ++i) {
            if (targetWordTags[i] == -1 && !strstr(ignoreSeries, targetWords[i])) {
                if (!warned) {
                    cerr << "*** Error: didn't match all words: " << targetMod.getKeyText() << endl;
                    cerr << strongsSourceModuleName.c_str() << ":";
                    for (int j = 0; j < fromWords.size(); ++j) {
                        cerr << " " << fromWords[j];
                    }
                    cerr << endl;
                    cerr << targetModuleName << ":";
                    for (int j = 0; j < targetWords.size(); ++j) {
                        cerr << " " << targetWords[j];
                    }
                    cerr << endl;
                    cerr << endl;
                    cerr << "Unmatched Words:" << endl;
                    warned = true;
                }
                cerr << " " << i << ": " << targetWords[i] << " (" << matcher->sanitizeWord(targetWords[i]) << ")" << endl;
            }
            if (optionDebug) {
                cout << targetWords[i] << " : " << targetWordTags[i] << " => "
                     << (targetWordTags[i] > -1 ? (const char *)wordTags[targetWordTags[i]] : "") << endl;
            }
        }

        if (warned) {
            cerr << "\n" << targetModuleName << " Tags:\n";
            VerseKey *vk = (VerseKey *)targetMod.getKey();
            for (int j = 0; j < targetWords.size(); ++j) {
                if (!strstr(ignoreSeries, targetWords[j])) {
                    cerr << targetWords[j] << "\t\t " << vk->getOSISRef() << "." << j << "="
                         << (targetWordTags[j] > -1 ? (const char *)wordTags[targetWordTags[j]] : (targetWordTags[j] == -2 ? "{Using Exception}" : "")) << endl;
                }
            }
            cerr << "---------------------" << endl;
        }

        if (optionFilterAccents) {
            UTF8GreekAccents filter;
            filter.setOptionValue("off");
            filter.processText(newTargetModMarkup);
        }

        if (optionDebug) {
            cout << "---------------------" << endl;
            cout << "\nAND... Here's your final output" << endl;
            cout << "---------------------" << endl;
        }
        cout << newTargetModMarkup << endl;
        if (optionDebug) {
            cout << endl;
        }
    }

    delete exceptionFile;

    return 0;
}


// builds up bibMap to contain only characters of Biblical text
// and each character's corresponding real location in our output
// buffer (returned value)
SWBuf findCanonicalBibleText(SWBuf orig, BibMap &bibMap, BibMap &wTags) {
    SWBuf out = "";
    SWBuf tag = "";
    int tagLevel = 0;
    int wTag = -1;
    int inTag = 0;
    SWBuf lastElementText = "";
    for (int i = 0; i < orig.length(); ++i) {
        if (orig[i] == '<') {
            inTag = true;
        }
        else if (orig[i] == '>') {
            inTag = false;
            XMLTag t = tag.c_str();
            bool skipTag = false;
            if (!t.isEmpty()) {
                if (t.isEndTag()) {
                    // clear out empty w tags
                    if (t.getName() && !strcmp("w", t.getName())) {
                        if (!lastElementText.size()) {
                            out.setSize(wTag);
                            if (out.endsWith(' ')) { // && i < (orig.length() - 1) && orig[i+1] == ' ') {
                                out.setSize(out.size() - 1);
                                bibMap.pop_back();
                                wTags.pop_back();
                            }
                            skipTag = true;
                        }
                    }
                    tagLevel--;
                    wTag = -1;
                }
                else {
                    lastElementText = "";
                    tagLevel++;
                    wTag = (t.getName() && !strcmp("w", t.getName())) ? out.size() : -1;
                }
            }
            if (!skipTag) out += t;
            tag = "";
        }
        else if (inTag) {
            tag += orig[i];
        }
        else {
            // for texts without <w> tags
            // if (!tagLevel || wTag != -1) {
            if (wTag != -1 || orig[i] == ' ') {
                bibMap.push_back(out.size());
                wTags.push_back(wTag);
            }
            out += orig[i];
            lastElementText += orig[i];
        }
    }
    return out;
}


// Inserts addText into the out buffer and adjusts Bible character pointers accordingly
//
void insert(SWBuf addText, SWBuf &out, int bibPos, BibMap &bibMap, BibMap &wTags, bool after) {

    int to = 0;
    if (!after && wTags[bibPos] != -1) {
        // this word is already wrapped in a <w ...> start tag in the target
        // markup, so splice our attributes into that existing tag
        to = wTags[bibPos] + 2;    // just past the existing "<w"
        addText--;                 // discard the '>'
        addText << 2;              // discard the '<w'
    }
    else if (after && wTags[bibPos] != -1) {
        // the existing <w> element already supplies a closing tag
        return;
    }
    else {
        to = bibMap[bibPos] + ((after) ? 1 : 0);
    }

    out.insert(to, addText);

    // bump every mapped Bible-text position (and open <w> tag position)
    // falling at or after the insertion point by the inserted length
    for (BibMap::iterator it = bibMap.begin(); it != bibMap.end(); ++it) {
        if (*it >= to) (*it) += (int)addText.length();
    }
    for (BibMap::iterator it = wTags.begin(); it != wTags.end(); ++it) {
        if (*it >= to) (*it) += (int)addText.length();
    }
}


// breaks the canonical Bible text (addressed through bibMap) into words,
// recording where each word starts and ends; returns just the Bible text
SWBuf buildWordMaps(const SWBuf &markupBuf, const BibMap &bibMap, vector<SWBuf> &targetWords, vector<int> &targetWordStarts, vector<int> &targetWordEnds) {
    SWBuf bibWord = "";
    SWBuf fromWord = "";
    SWBuf bibText = "";
    for (BibMap::const_iterator it = bibMap.begin(); it != bibMap.end(); it++) {
/*
        char *b1 = markupBuf.getRawData()+*it;
        char *b2 = b1;
        __u32 uc = getUniCharFromUTF8(&b2);
        bool wordBreak = false;
        if (uc) {
            SWBuf u8c;
            u8c.append(b1, b2-b1);
            if (strstr(ignoreSeries, u8c.getRawData()))
        }
*/
        char c = markupBuf[*it];
        if (c != ' ' && c != '.' && c != ';' && c != ',') {
            if (!bibWord.length()) targetWordStarts.push_back(bibText.length());
            bibWord += c;
        }
        else {
            if (bibWord.length()) {
                targetWordEnds.push_back(bibText.length()-1);
                targetWords.push_back(bibWord);
                bibWord = "";
            }
        }
        bibText += c;
    }
    if (bibWord.length()) {
        targetWordEnds.push_back(bibText.length()-1);
        targetWords.push_back(bibWord);
    }
    return bibText;
}


void pullFromModData(SWModule &fromMod, vector<XMLTag> &wordTags, vector<SWBuf> &fromWords, vector<int> &fromWordTags) {
    fromMod.renderText();    // be sure FromMod has processed entry attributes
    AttributeList &words = fromMod.getEntryAttributes()["Word"];
    SWBuf fromWord = "";
    SWBuf bibWord = "";
    for (AttributeList::iterator it = words.begin(); it != words.end(); it++) {

        // this is our new XMLTag.
        // attributes will be added below
        XMLTag w("w");

        // this only gives us word count, not if we have multiple entries per word
        // don't use as loop
        int parts = atoi(it->second["PartCount"]);
        SWBuf lemma = "";
        SWBuf morph = "";
        bool found = true;
        for (int i = 1; found; ++i) {
            found = false;
            SWBuf key = "";
            key = SWBuf().setFormatted("Lemma.%d", i);
            AttributeValue::iterator li = it->second.find(key);
            if (i == 1 && li == it->second.end()) li = it->second.find("Lemma");
            if (li != it->second.end()) {
                found = true;
                if (i > 1) lemma += " ";
                key = SWBuf().setFormatted("LemmaClass.%d", i);
                AttributeValue::iterator lci = it->second.find(key);
                if (i == 1 && lci == it->second.end()) lci = it->second.find("LemmaClass");
                if (lci != it->second.end()) {
                    lemma += lci->second + ":";
                }
                lemma += li->second;
            }
            key = SWBuf().setFormatted("Morph.%d", i);
            li = it->second.find(key);
            if (i == 1 && li == it->second.end()) li = it->second.find("Morph");
            if (li != it->second.end()) {
                found = true;
                if (i > 1) morph += " ";
                key = SWBuf().setFormatted("MorphClass.%d", i);
                AttributeValue::iterator lci = it->second.find(key);
                if (i == 1 && lci == it->second.end()) lci = it->second.find("MorphClass");
                if (lci != it->second.end()) {
                    morph += lci->second + ":";
                }
                morph += li->second;
            }
            // TODO: add src tags and maybe other attributes
        }
        if (lemma.length()) w.setAttribute("lemma", lemma);
        if (morph.length()) w.setAttribute("morph", morph);

        fromWord = it->second["Text"];
        bibWord = "";
        for (int j = 0; j < fromWord.length(); ++j) {
            char c = fromWord[j];
            if (c != ' ' && c != '.' && c != ';' && c != ',') {
                bibWord += c;
            }
            else {
                if (bibWord.length()) {
                    fromWords.push_back(bibWord);
                    fromWordTags.push_back(wordTags.size());
                    bibWord = "";
                }
            }
        }
        if (bibWord.length()) {
            fromWords.push_back(bibWord);
            fromWordTags.push_back(wordTags.size());
        }
        wordTags.push_back(w);
    }
}


void insertWordTags(VerseKey *vk, SWBuf &markupBuf, BibMap &bibMap, BibMap &wTags, vector<int> &targetWordTags, const vector<XMLTag> &wordTags, const vector<int> &targetWordStarts, const vector<int> &targetWordEnds) {
    // TODO: this method needs some work,
    // like putting multiple consecutive words
    // together in one tag

    ConfigEntMap exceptions;
    if (exceptionFile) {
        exceptions = exceptionFile->getSection("exceptions");
    }
    for (int i = 0; i < targetWordTags.size(); ++i) {
        SWBuf wordTag = "";
        if (targetWordTags[i] > -1) {
            wordTag = wordTags[targetWordTags[i]];
        }
        if (exceptionFile) {
            SWBuf key;
            key.setFormatted("%s.%d", vk->getOSISRef(), i);
            ConfigEntMap::const_iterator it = exceptions.find(key);
            if (it != exceptions.end()) {
                targetWordTags[i] = -2;    // note that we are using an exception, not a mapping, not unset (-1)
                wordTag = it->second;
            }
        }
        if (wordTag.length()) {
            insert((const char *)wordTag, markupBuf, targetWordStarts[i], bibMap, wTags);
            insert("</w>", markupBuf, targetWordEnds[i], bibMap, wTags, true);
        }
    }
}
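
// For reference, insertWordTags() above reads overrides from the "exceptions"
// section of the -e .conf file, keyed by OSIS reference plus zero-based word
// index, with the desired <w ...> start tag as the value (an empty value
// leaves that word untagged and marks it as handled by an exception).
// A minimal sketch of such a file, with purely illustrative references and
// lemma/morph values:
//
//   [exceptions]
//   Matt.1.1.0=<w lemma="strong:G0976" morph="robinson:N-NSF">
//   Matt.1.1.5=
//
// The key format mirrors key.setFormatted("%s.%d", vk->getOSISRef(), i) above.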