[sword-svn] r532 - in trunk/migratetags: . matchers
scribe at crosswire.org
scribe at crosswire.org
Sun Apr 14 15:36:24 MST 2019
Author: scribe
Date: 2019-04-14 15:36:24 -0700 (Sun, 14 Apr 2019)
New Revision: 532
Added:
trunk/migratetags/matchers/
trunk/migratetags/matchers/defaultmatcher.h
trunk/migratetags/matchers/matcher.h
trunk/migratetags/migratetags.cpp
Removed:
trunk/migratetags/esvtag.cpp
Modified:
trunk/migratetags/Makefile
Log:
Generalized migratetags and extracted matcher logic
Modified: trunk/migratetags/Makefile
===================================================================
--- trunk/migratetags/Makefile 2018-12-22 09:01:18 UTC (rev 531)
+++ trunk/migratetags/Makefile 2019-04-14 22:36:24 UTC (rev 532)
@@ -1,5 +1,6 @@
-TARGETS= esvtag
+TARGETS= migratetags
+
all: $(TARGETS)
clean:
Deleted: trunk/migratetags/esvtag.cpp
===================================================================
--- trunk/migratetags/esvtag.cpp 2018-12-22 09:01:18 UTC (rev 531)
+++ trunk/migratetags/esvtag.cpp 2019-04-14 22:36:24 UTC (rev 532)
@@ -1,389 +0,0 @@
-#include <versekey.h>
-#include <swmgr.h>
-#include <utilxml.h>
-#include <swbuf.h>
-#include <swmodule.h>
-#include <iostream>
-#include <vector>
-
-using namespace sword;
-using namespace std;
-
-typedef vector<unsigned long> BibMap;
-
-void insert(const SWBuf &addText, SWBuf &out, int bibPos, BibMap &bibMap, bool after = false);
-int compare(const SWBuf &s1, const SWBuf &s2);
-
-SWBuf findCanonicalBibleText(SWBuf orig, BibMap &bibMap);
-SWBuf buildWordMaps(const SWBuf &markupBuf, const BibMap &bibMap, vector<SWBuf> &esvWords, vector<int> &esvWordStarts, vector<int> &esvWordEnds);
-void pullKJVData(SWModule &kjv, vector<XMLTag>&wordTags, vector<SWBuf> &kjvWords, vector<int> &kjvWordTags);
-void insertWordTags(SWBuf &markupBuf, BibMap &bibMap, const vector<int> &esvWordTags, const vector<XMLTag> &wordTags, const vector<int> &esvWordStarts, const vector<int> &esvWordEnds);
-
-
-//
-// This is where the magic happens
-//
-// we must point each esv word to an XMLTag
-//
-// when the magic is done, and your guess is made
-// populate esvWordTags with the integer offset
-// into wordTags for which XMLTag you think it should
-// be.
-//
-void matchWords(vector<int> &esvWordTags, const vector<SWBuf> &esvWords, const vector<SWBuf> &kjvWords, const vector<int> &kjvWordTags) {
-
- // initialize our results to all -1 so we can pop around and set
- // words as we find them, and know which ones we haven't yet set
- for (int i = 0; i < esvWords.size(); i++) esvWordTags.push_back(-1);
-
-
- // poor effort attempt
- int j = 0;
- for (int i = 0; i < esvWords.size(); i++) {
- while (true) {
- int match = compare(esvWords[i], kjvWords[j]);
- // if we have a better than 75% match of sequencial characters
- // then we'll say we have a match
- if (match > 75) {
- esvWordTags[i] = kjvWordTags[j++];
- break;
- }
- // TOTRY: maybe check one word before and after?
- //
- // be creative!
- //
- }
- }
-}
-
-
-int main(int argc, char **argv) {
- VerseKey vk;
- SWMgr lib;
- SWModule &esv = *lib.getModule("ESV");
- SWModule &kjv = *lib.getModule("KJV");
-
- // we'll do the whole Bible eventually, but let's just get one verse
- // working well.
- esv.setKey("gen1.1"); // lets try this verse
-// for (esv = TOP; !esv.Error(); esv++) {
-
- // XML word tags which should be placed in this verse (start tag)
- // eg., <w lemma=...>
- // pulled from KJV
- vector<XMLTag> wordTags;
-
- // Just the raw canonical Bible text of this verse with no tags
- // eg., "In the beginning God created the heavens and the earth."
- SWBuf justESVBibleText = "";
-
- // a mapping for each character in justESVBibleText to the real location
- // in our out buffer. This allows us to insert our <w> and </w>
- // tags in the correct place amongst the fully marked up
- // ESV out buffer. This work is all done in the insert() method
- // above
- BibMap bibMap;
-
- // justESVBibleText (above) broken down into separate words
- // ie. all words in the ESV from this verse
- // eg. [0] = "In"; [1] = "the"; [2] = "beginning"; ...
- vector<SWBuf> esvWords;
-
- // where each corresponding esvWords[x] starts in justESVBibleText
- // eg. for "In the beginning..."
- // [0] = 0; [1] = 3; [2] = 7; ...
- // Needed to pass to insert method so we know where
- // to insert the <w> start tag
- vector<int> esvWordStarts;
-
- // same as esvWordStarts, but the end of each word
- // eg. [0] = 1; [1] = 5; [2] = 15
- // Needed to pass to insert method so we know where
- // to insert the </w> end tag
- vector<int> esvWordEnds;
-
- // This is the doozy. This maps each ESV word to the correct
- // wordTags entry.
- vector<int> esvWordTags;
-
- // Equivalent to esvWords above, but for the KJV.
- // Useful for helping determine matches to ESV words
- vector<SWBuf> kjvWords;
-
- // Equivalent to esvWordTag which we need to produce,
- // but this one is produced for us from the KJV data
- // If we can match a kjvWords[x] entry, then we can assign
- // esvWorkTags[ourMatch] = kjvWordTags[x]
- vector<int> kjvWordTags;
-
- bibMap.clear();
-
- kjv.setKey(esv.getKey());
-
- cout << "\nProcessing Verse: " << esv.getKeyText() << endl;
- cout << "---------------------" << endl;
-
- cout << "\nOur KJV Verse Markup" << endl;
- cout << "---------------------" << endl;
- cout << kjv.getRawEntry() << endl;
- cout << "---------------------" << endl;
-
-
- // grab our raw, fully marked up ESV text for this verse
- SWBuf orig = esv.getRawEntryBuf();
-
- cout << "\nOur Original ESV Markup" << endl;
- cout << "---------------------" << endl;
- cout << orig << endl;
- cout << "---------------------" << endl;
-
- // let's find where just the canonical text is amongst
- // all our markup
- // newESVMarkup will eventually hold our updated markup with
- // the new <w> tags, but we'll start here by setting it to
- // the processed original markup.
- // on return, bibMap will be populated with each character
- // and the corresponding location into newESVMarkup where
- // the character resides.
- SWBuf newESVMarkup = findCanonicalBibleText(orig, bibMap);
-
- cout << "\nOur Original ESV Markup After XMLTag-ifying" << endl;
- cout << "---------------------" << endl;
- cout << newESVMarkup << endl;
- cout << "---------------------" << endl;
-
- // let's populate or ESV word data and fill in our
- // justESVBibleText buffer
- justESVBibleText = buildWordMaps(newESVMarkup, bibMap, esvWords, esvWordStarts, esvWordEnds);
-
- cout << "\nJust ESV Bible Text" << endl;
- cout << "---------------------" << endl;
- cout << justESVBibleText << endl;
- cout << "---------------------" << endl;
-
-
- // ok, now lets grab out the groovy data from the KJV module
- pullKJVData(kjv, wordTags, kjvWords, kjvWordTags);
-
-
- //
- // ok, here's the real work.
- //
- // This method needs to guess which ESV words match which KJV
- // words and then point them to their same original language
- // word tag by populating esvWordTags
- //
- matchWords(esvWordTags, esvWords, kjvWords, kjvWordTags);
-
- // ok, now that we have our esvWordTags magically populated
- // let's do the grunt work of inserting the <w> and </w> tags
- insertWordTags(newESVMarkup, bibMap, esvWordTags, wordTags, esvWordStarts, esvWordEnds);
-
-
- cout << "\nHere's how you mapped things..." << endl;
- cout << "---------------------" << endl;
- cout << "Total wordTags: " << wordTags.size() << endl;
- cout << "\nESV Words: " << endl;
- for (int i = 0; i < esvWords.size(); i++) {
- cout << esvWords[i] << " : " << esvWordTags[i] << " => " << wordTags[esvWordTags[i]] << endl;
- }
- cout << "---------------------" << endl;
-
- cout << "\nAND... Here's your final output" << endl;
- cout << "---------------------" << endl;
- cout << newESVMarkup << endl;
- cout << endl;
-// }
- return 0;
-}
-
-
-// builds up bibMap to contain only characters of Biblical text
-// and each character's corresponding real location in our output
-// buffer (returned value)
-SWBuf findCanonicalBibleText(SWBuf orig, BibMap &bibMap) {
- SWBuf out = "";
- SWBuf tag = "";
- int tagLevel = 0;
- int inTag = 0;
- for (int i = 0; i < orig.length(); i++) {
- if (orig[i] == '<') {
- inTag = true;
- }
- else if (orig[i] == '>') {
- inTag = false;
- XMLTag t = tag.c_str();
- if (!t.isEmpty()) {
- if (t.isEndTag()) {
- tagLevel--;
- }
- else {
- tagLevel++;
- }
- }
- out += t;
- tag = "";
- }
- else if (inTag) {
- tag += orig[i];
- }
- else {
- if (!tagLevel) {
- bibMap.push_back(out.size());
- }
- out += orig[i];
- }
- }
- return out;
-}
-
-
-// Inserts addText into out buffer and adjusts Bible character pointers accordingly
-//
-void insert(const SWBuf &addText, SWBuf &out, int bibPos, BibMap &bibMap, bool after) {
- out.insert(bibMap[bibPos]+((after)?1:0), addText);
- for (int i = bibPos+((after)?1:0); i < bibMap.size(); i++) {
- bibMap[i] += addText.length();
- }
-}
-
-
-// Compares 2 words and tries to give a percentage assurance of a match
-// TODO: could use more smarts here
-//
-int compare(const SWBuf &s1, const SWBuf &s2) {
- int retVal = 0;
- SWBuf largest = (s1.length() > s2.length()) ? s1 : s2;
- SWBuf smallest = (s1.length() > s2.length()) ? s2 : s1;
- int matches = 0;
- int j = 0;
- for (int i = 0; i < smallest.length() && j < largest.length(); i++) {
- while (j < largest.length()) {
- if (smallest[i] == largest[j++]) {
- matches++;
- break;
- }
- }
- }
- return (((float)matches) / largest.length()) * 100;
-}
-
-
-SWBuf buildWordMaps(const SWBuf &markupBuf, const BibMap &bibMap, vector<SWBuf> &esvWords, vector<int> &esvWordStarts, vector<int> &esvWordEnds) {
- SWBuf bibWord = "";
- SWBuf kjvWord = "";
- SWBuf bibText = "";
- for (BibMap::const_iterator it = bibMap.begin(); it != bibMap.end(); it++) {
- char c = markupBuf[*it];
- if ((c >= 'a' && c <='z') ||
- (c >= 'A' && c <='Z')
- ) {
- if (!bibWord.length()) esvWordStarts.push_back(bibText.length());
- bibWord += c;
- }
- else {
- if (bibWord.length()) {
- esvWordEnds.push_back(bibText.length()-1);
- esvWords.push_back(bibWord);
- bibWord = "";
- }
- }
- bibText += c;
- }
- if (bibWord.length()) {
- esvWordEnds.push_back(bibText.length()-1);
- esvWords.push_back(bibWord);
- }
- return bibText;
-}
-
-
-void pullKJVData(SWModule &kjv, vector<XMLTag>&wordTags, vector<SWBuf> &kjvWords, vector<int> &kjvWordTags) {
- kjv.RenderText(); // be sure KJV has processed entry attributes
- AttributeList &words = kjv.getEntryAttributes()["Word"];
- SWBuf kjvWord = "";
- SWBuf bibWord = "";
- for (AttributeList::iterator it = words.begin(); it != words.end(); it++) {
- // this is our new <w> XMLTag.
- // attributes will be added below
- XMLTag w("w");
- int parts = atoi(it->second["PartCount"]);
- SWBuf lemma = "";
- SWBuf morph = "";
- for (int i = 1; i <= parts; i++) {
- SWBuf key = "";
- key = (parts == 1) ? "Lemma" : SWBuf().setFormatted("Lemma.%d", i);
- AttributeValue::iterator li = it->second.find(key);
- if (li != it->second.end()) {
- if (i > 1) lemma += " ";
- key = (parts == 1) ? "LemmaClass" : SWBuf().setFormatted("LemmaClass.%d", i);
- AttributeValue::iterator lci = it->second.find(key);
- if (lci != it->second.end()) {
- lemma += lci->second + ":";
- }
- lemma += li->second;
- }
- key = (parts == 1) ? "Morph" : SWBuf().setFormatted("Morph.%d", i);
- li = it->second.find(key);
- // silly. sometimes morph counts don't equal lemma counts
- if (i == 1 && parts != 1 && li == it->second.end()) {
- li = it->second.find("Morph");
- }
- if (li != it->second.end()) {
- if (i > 1) morph += " ";
- key = (parts == 1) ? "MorphClass" : SWBuf().setFormatted("MorphClass.%d", i);
- AttributeValue::iterator lci = it->second.find(key);
- // silly. sometimes morph counts don't equal lemma counts
- if (i == 1 && parts != 1 && lci == it->second.end()) {
- lci = it->second.find("MorphClass");
- }
- if (lci != it->second.end()) {
- morph += lci->second + ":";
- }
- morph += li->second;
- }
- // TODO: add src tags and maybe other attributes
- }
-
- if (lemma.length()) w.setAttribute("lemma", lemma);
- if (morph.length()) w.setAttribute("morph", morph);
-
-
- kjvWord = it->second["Text"];
- bibWord = "";
- for (int j = 0; j < kjvWord.length(); j++) {
- char c = kjvWord[j];
- if ((c >= 'a' && c <='z') ||
- (c >= 'A' && c <='Z')
- ) {
- bibWord += c;
- }
- else {
- if (bibWord.length()) {
- kjvWords.push_back(bibWord);
- kjvWordTags.push_back(wordTags.size());
- bibWord = "";
- }
- }
- }
- if (bibWord.length()) {
- kjvWords.push_back(bibWord);
- kjvWordTags.push_back(wordTags.size());
- }
-
- wordTags.push_back(w);
- }
-}
-
-
-void insertWordTags(SWBuf &markupBuf, BibMap &bibMap, const vector<int> &esvWordTags, const vector<XMLTag> &wordTags, const vector<int> &esvWordStarts, const vector<int> &esvWordEnds) {
- // TODO: this method needs some work,
- // like putting multiple consecutive words
- // together in one tag
- for (int i = 0; i < esvWordTags.size(); i++) {
- if (esvWordTags[i] > -1) {
- insert((const char *)wordTags[esvWordTags[i]], markupBuf, esvWordStarts[i], bibMap);
- insert("</w>", markupBuf, esvWordEnds[i], bibMap, true);
- }
- }
-}
Added: trunk/migratetags/matchers/defaultmatcher.h
===================================================================
--- trunk/migratetags/matchers/defaultmatcher.h (rev 0)
+++ trunk/migratetags/matchers/defaultmatcher.h 2019-04-14 22:36:24 UTC (rev 532)
@@ -0,0 +1,80 @@
+#include "matcher.h"
+
+#ifndef defaultmatcher_h
+#define defaultmatcher_h
+
+// Default Matcher implementation.
+//
+// Scores word pairs with a simple in-order character overlap and gives each
+// target word the tag of the first not-yet-used source word which scores
+// above a fixed threshold.
+class DefaultMatcher : public Matcher {
+public:
+
+// Compares 2 words and tries to give a percentage assurance of a match
+//
+// Works on copies of s1 and s2 which have had Greek accents removed
+// (UTF8GreekAccents with its option set to "off") and have been uppercased,
+// so neither accents nor case influence the result.
+//
+// The score is the number of bytes of the shorter word which can be found,
+// in order, in the longer word, expressed as a percentage of the longer
+// word's length. Two empty words score 0.
+//
+// TODO: could use more smarts here
+//
+virtual int compare(const SWBuf &s1, const SWBuf &s2) {
+    SWBuf t1 = s1;
+    SWBuf t2 = s2;
+    UTF8GreekAccents filter;
+    filter.setOptionValue("off");
+
+    // remove greek accents
+    filter.processText(t1);
+    filter.processText(t2);
+
+    // change to uppercase to match
+    StringMgr::getSystemStringMgr()->upperUTF8(t1.getRawData());
+    StringMgr::getSystemStringMgr()->upperUTF8(t2.getRawData());
+
+    SWBuf largest = (t1.length() > t2.length()) ? t1 : t2;
+    SWBuf smallest = (t1.length() > t2.length()) ? t2 : t1;
+
+    // both words empty: nothing to compare, and dividing by the
+    // length below would be 0/0
+    if (!largest.length()) return 0;
+
+    int matches = 0;
+    unsigned long j = 0;
+    for (unsigned long i = 0; i < smallest.length() && j < largest.length(); i++) {
+        // scan forward in the longer word for the next occurrence of
+        // smallest[i]; j never moves backward, so order is respected
+        while (j < largest.length()) {
+            if (smallest[i] == largest[j++]) {
+                matches++;
+                break;
+            }
+        }
+    }
+    return (int)((((float)matches) / largest.length()) * 100);
+}
+//
+// This is where the magic happens
+//
+// we must point each targetMod word to an XMLTag
+//
+// when the magic is done, and your guess is made
+// populate targetWordTags with the integer offset
+// into wordTags for which XMLTag you think it should
+// be.
+//
+// One entry is appended to targetWordTags per target word; it stays -1
+// when no source word matched. fromWordTags is taken by value on purpose:
+// the local copy is used to mark source words as already consumed.
+//
+
+virtual void matchWords(vector<int> &targetWordTags, const vector<SWBuf> &targetWords, const vector<SWBuf> &fromWords, vector<int> fromWordTags) {
+
+    // initialize our results to all -1 so we can pop around and set
+    // words as we find them, and know which ones we haven't yet set
+    for (size_t i = 0; i < targetWords.size(); i++) targetWordTags.push_back(-1);
+
+
+    // poor effort attempt
+    for (size_t i = 0; i < targetWords.size(); ++i) {
+        // fromWords and fromWordTags are expected to be parallel; bound by
+        // both so a short tag list can never be read out of range
+        for (size_t j = 0; j < fromWords.size() && j < fromWordTags.size(); ++j) {
+            // already handed to an earlier target word
+            if (fromWordTags[j] == -1) continue;
+            int match = compare(targetWords[i], fromWords[j]);
+            // if we have a better than XX% match of sequential characters
+            // then we'll say we have a match
+            if (match > 49) {
+                targetWordTags[i] = fromWordTags[j];
+                fromWordTags[j] = -1;
+                break;
+            }
+            // TOTRY: maybe check one word before and after?
+            //
+            // be creative!
+            //
+        }
+    }
+}
+};
+#endif
Added: trunk/migratetags/matchers/matcher.h
===================================================================
--- trunk/migratetags/matchers/matcher.h (rev 0)
+++ trunk/migratetags/matchers/matcher.h 2019-04-14 22:36:24 UTC (rev 532)
@@ -0,0 +1,25 @@
+#ifndef matcher_h
+#define matcher_h
+
+// Interface for the word matching strategies used by migratetags.
+//
+// NOTE: this header includes nothing itself; SWBuf and vector must already
+// be visible, unqualified, at the point where it is included.
+class Matcher {
+public:
+
+// virtual so that a concrete matcher can be destroyed safely through a
+// Matcher pointer
+virtual ~Matcher() {}
+
+// Compares 2 words and tries to give a percentage assurance of a match
+// TODO: could use more smarts here
+//
+virtual int compare(const SWBuf &s1, const SWBuf &s2) = 0;
+
+// This is where the magic happens
+//
+// we must point each targetMod word to an XMLTag
+//
+// when the magic is done, and your guess is made
+// populate targetWordTags with the integer offset
+// into wordTags for which XMLTag you think it should
+// be.
+//
+// fromWordTags is passed by value, so an implementation may modify its
+// copy freely.
+//
+virtual void matchWords(vector<int> &targetWordTags, const vector<SWBuf> &targetWords, const vector<SWBuf> &fromWords, vector<int> fromWordTags) = 0;
+
+
+};
+#endif
Added: trunk/migratetags/migratetags.cpp
===================================================================
--- trunk/migratetags/migratetags.cpp (rev 0)
+++ trunk/migratetags/migratetags.cpp 2019-04-14 22:36:24 UTC (rev 532)
@@ -0,0 +1,467 @@
+#include <versekey.h>
+#include <utf8greekaccents.h>
+#include <swmgr.h>
+#include <utilxml.h>
+#include <swbuf.h>
+#include <swmodule.h>
+#include <stringmgr.h>
+#include <iostream>
+#include <vector>
+
+using namespace sword;
+using namespace std;
+
+#include "matchers/matcher.h"
+#include "matchers/defaultmatcher.h"
+
+// select your matcher here
+// (the instance is created once at start-up and is never deleted)
+Matcher *matcher = new DefaultMatcher();
+// name of the module which receives the <w> tags (looked up in main)
+const char *targetModuleName="NA28";
+// name of the module the lemma/morph data is pulled from (looked up in main)
+const char *strongsSourceModuleName="WHNU";
+
+
+const char *ignoreSeries = "⸆¹⸆²⸆⸇᾿˸¹˸²˸³˸·¹²⟦–ʹ°¹°²⸋¹⸋²⸋⸌¹⸌°*[];⸀¹⸀²⸀³⸁⸀◆⟧ ⸂¹⸂²⸄⸂⸅⸃⸉¹⸈⸉⸊ ";
+
+// List of offsets into a markup buffer. Used both for the position of each
+// recorded text character (bibMap) and, in parallel, for the position of the
+// <w> start tag recorded for that character (wTags; -1 is stored, converted
+// to unsigned long, when there is none).
+typedef vector<unsigned long> BibMap;
+
+// Inserts addText into out at the place bibPos maps to and shifts the
+// recorded offsets which follow; 'after' inserts behind the character.
+void insert(SWBuf addText, SWBuf &out, int bibPos, BibMap &bibMap, BibMap &wTags, bool after = false);
+
+// Rebuilds the markup and fills bibMap / wTags for the text characters found.
+SWBuf findCanonicalBibleText(SWBuf orig, BibMap &bibMap, BibMap &wTags);
+// Splits the recorded text into words with their start and end positions.
+SWBuf buildWordMaps(const SWBuf &markupBuf, const BibMap &bibMap, vector<SWBuf> &targetWords, vector<int> &targetWordStarts, vector<int> &targetWordEnds);
+// Collects <w> tags and the words they cover from the source module entry.
+void pullFromModData(SWModule &fromMod, vector<XMLTag>&wordTags, vector<SWBuf> &fromWords, vector<int> &fromWordTags);
+// Writes the matched <w> ... </w> tags into the markup buffer.
+void insertWordTags(SWBuf &markupBuf, BibMap &bibMap, BibMap &wTags, const vector<int> &targetWordTags, const vector<XMLTag> &wordTags, const vector<int> &targetWordStarts, const vector<int> &targetWordEnds);
+
+// app options (all off by default; switched on from the command line in main)
+// -fa: run the target text through UTF8GreekAccents before processing
+bool optionFilterAccents = false;
+// -fc: drop every non-space character listed in ignoreSeries from the target text
+bool optionFilterAppCrit = false;
+// -v: print the intermediate buffers and the word mapping for each verse
+bool optionDebug = false;
+
+void usage(const char *progName, const char *error = 0) {
+ if (error) fprintf(stderr, "\n%s: %s\n", progName, error);
+ fprintf(stderr, "\n=== migratetags (Revision $Rev$) Migrate word morphology from one module to another.\n");
+ fprintf(stderr, "\nusage: %s [options]\n", progName);
+ fprintf(stderr, " -v\t\t\t verbose: print lots of information while processing\n");
+ fprintf(stderr, " -fa\t\t\t filter accents: remove Greek accents from final text\n");
+ fprintf(stderr, "\n\n");
+ exit(-1);
+}
+
+
+
+// Program entry point.
+//
+// Parses the -v / -fa / -fc switches (anything else goes to usage(), which
+// exits), looks up the target and source modules by the names configured at
+// the top of the file (exit(1) if either is missing), then walks the target
+// module verse by verse starting at "mat1.1". For every verse the target
+// markup is read, optionally filtered, split into words, matched against the
+// words of the same verse in the source module, and printed to stdout with
+// the matched <w> tags inserted. Returns 0 when the walk ends.
+int main(int argc, char **argv) {
+ const char *progName = argv[0];
+ for (int i = 1; i < argc; i++) {
+ if (!strcmp(argv[i], "-v")) {
+ optionDebug = true;
+ }
+ else if (!strcmp(argv[i], "-fa")) {
+ optionFilterAccents = true;
+ }
+ else if (!strcmp(argv[i], "-fc")) {
+ optionFilterAppCrit = true;
+ }
+ else usage(progName, (((SWBuf)"Unknown argument: ")+ argv[i]).c_str());
+ }
+ VerseKey vk;
+ SWMgr lib;
+ lib.setGlobalOption("Textual Variants", "Secondary Reading");
+ SWModule *m = lib.getModule(targetModuleName);
+ if (!m) {
+ cerr << "couldn't find target module: " << targetModuleName << ".\n";
+ exit(1);
+ }
+ SWModule &targetMod = *m;
+ m = lib.getModule(strongsSourceModuleName);
+ if (!m) {
+ cerr << "couldn't find source module: " << strongsSourceModuleName << ".\n";
+ exit(1);
+ }
+ SWModule &fromMod = *m;
+
+ // start at this verse and keep stepping forward until the module
+ // reports an error. To work on the first verse only, re-enable the
+ // commented-out "!z &&" guard in the loop condition below.
+ targetMod.setKey("mat1.1"); // first verse to process
+ int z = 0;
+ for (;
+//!z &&
+!targetMod.popError(); targetMod++) {
+ z++;
+
+ // XML word tags which should be placed in this verse (start tag)
+ // eg., <w lemma=...>
+ // pulled from FromMod
+ vector<XMLTag> wordTags;
+
+ // Just the raw canonical Bible text of this verse with no tags
+ // eg., "In the beginning God created the heavens and the earth."
+ SWBuf justTargetModBibleText = "";
+
+ // a mapping for each character in justTargetModBibleText to the real location
+ // in our out buffer. This allows us to insert our <w> and </w>
+ // tags in the correct place amongst the fully marked up
+ // TargetMod out buffer. This work is all done in the insert() method
+ // above
+ BibMap bibMap;
+ // parallel to bibMap: for each recorded character, the location in the
+ // out buffer of the <w> start tag recorded for it, or -1 if none
+ BibMap wTags;
+
+ // justTargetModBibleText (above) broken down into separate words
+ // ie. all words in the TargetMod from this verse
+ // eg. [0] = "In"; [1] = "the"; [2] = "beginning"; ...
+ vector<SWBuf> targetWords;
+
+ // where each corresponding targetWords[x] starts in justTargetModBibleText
+ // eg. for "In the beginning..."
+ // [0] = 0; [1] = 3; [2] = 7; ...
+ // Needed to pass to insert method so we know where
+ // to insert the <w> start tag
+ vector<int> targetWordStarts;
+
+ // same as targetWordStarts, but the end of each word
+ // eg. [0] = 1; [1] = 5; [2] = 15
+ // Needed to pass to insert method so we know where
+ // to insert the </w> end tag
+ vector<int> targetWordEnds;
+
+ // This is the doozy. This maps each TargetMod word to the correct
+ // wordTags entry.
+ vector<int> targetWordTags;
+
+ // Equivalent to targetWords above, but for the FromMod.
+ // Useful for helping determine matches to TargetMod words
+ vector<SWBuf> fromWords;
+
+ // Equivalent to targetWordTags which we need to produce,
+ // but this one is produced for us from the FromMod data
+ // If we can match a fromWords[x] entry, then we can assign
+ // targetWordTags[ourMatch] = fromWordTags[x]
+ vector<int> fromWordTags;
+
+ bibMap.clear();
+ wTags.clear();
+
+ fromMod.setKey(targetMod.getKey());
+ // verse marker line; printed for every verse, debug or not
+ cout << "$$$ " << targetMod.getKeyText() << endl;
+
+if (optionDebug) {
+ cout << "\nProcessing Verse: " << targetMod.getKeyText() << endl;
+ cout << "---------------------" << endl;
+
+ cout << "\nOur FromMod Verse Markup" << endl;
+ cout << "---------------------" << endl;
+ cout << fromMod.getRawEntry() << endl;
+ cout << "---------------------" << endl;
+}
+
+
+ // grab our raw, fully marked up TargetMod text for this verse
+ SWBuf orig = targetMod.getRawEntryBuf();
+
+ if (optionFilterAccents) {
+ UTF8GreekAccents filter;
+ filter.setOptionValue("off");
+ filter.processText(orig);
+ }
+
+ // rebuild orig one Unicode character at a time, leaving out every
+ // character (other than a plain space) whose UTF-8 form occurs
+ // somewhere in ignoreSeries
+ if (optionFilterAppCrit) {
+ SWBuf o = orig;
+ const unsigned char* from = (unsigned char*)o.c_str();
+ orig = "";
+ while (*from) {
+ __u32 ch = getUniCharFromUTF8(&from, true);
+ // if ch is bad, then convert to replacement char
+ if (!ch) ch = 0xFFFD;
+ SWBuf checkChar;
+ getUTF8FromUniChar(ch, &checkChar);
+ if (checkChar != " " && strstr(ignoreSeries, checkChar.c_str())) continue;
+ orig.append(checkChar);
+ }
+ }
+
+if (optionDebug) {
+ cout << "\nOur Original TargetMod Markup" << endl;
+ cout << "---------------------" << endl;
+ cout << orig << endl;
+ cout << "---------------------" << endl;
+}
+
+ // let's find where just the canonical text is amongst
+ // all our markup
+ // newTargetModMarkup will eventually hold our updated markup with
+ // the new <w> tags, but we'll start here by setting it to
+ // the processed original markup.
+ // on return, bibMap will be populated with each character
+ // and the corresponding location into newTargetModMarkup where
+ // the character resides.
+ SWBuf newTargetModMarkup = findCanonicalBibleText(orig, bibMap, wTags);
+
+if (optionDebug) {
+ cout << "\nOur Original TargetMod Markup After XMLTag-ifying" << endl;
+ cout << "---------------------" << endl;
+ cout << newTargetModMarkup << endl;
+ cout << "---------------------" << endl;
+
+ cout << "\nOur bibMap" << endl;
+ cout << "---------------------" << endl;
+ for (BibMap::iterator it = bibMap.begin(); it != bibMap.end(); ++it) {
+ cout << *it << " ";
+ }
+ cout << "\n---------------------" << endl;
+}
+
+ // let's populate our TargetMod word data and fill in our
+ // justTargetModBibleText buffer
+ justTargetModBibleText = buildWordMaps(newTargetModMarkup, bibMap, targetWords, targetWordStarts, targetWordEnds);
+
+if (optionDebug) {
+ cout << "\nJust TargetMod Bible Text" << endl;
+ cout << "---------------------" << endl;
+ cout << justTargetModBibleText << endl;
+ cout << "---------------------" << endl;
+}
+
+
+ // ok, now lets grab out the groovy data from the FromMod module
+ pullFromModData(fromMod, wordTags, fromWords, fromWordTags);
+
+
+ //
+ // ok, here's the real work.
+ //
+ // This method needs to guess which TargetMod words match which FromMod
+ // words and then point them to their same original language
+ // word tag by populating targetWordTags
+ //
+ matcher->matchWords(targetWordTags, targetWords, fromWords, fromWordTags);
+
+
+ // ok, now that we have our targetWordTags magically populated
+ // let's do the grunt work of inserting the <w> and </w> tags
+ insertWordTags(newTargetModMarkup, bibMap, wTags, targetWordTags, wordTags, targetWordStarts, targetWordEnds);
+
+
+if (optionDebug) {
+ cout << "\nHere's how you mapped things..." << endl;
+ cout << "---------------------" << endl;
+ cout << "Total wordTags: " << wordTags.size() << endl;
+ cout << "\nTargetMod Words: " << endl;
+}
+ // complain on stderr, once per verse, if any word was left without a
+ // tag; words which occur as a substring of ignoreSeries don't count
+ bool warned = false;
+ for (int i = 0; i < targetWords.size(); i++) {
+ if (targetWordTags[i] == -1 && !strstr(ignoreSeries, targetWords[i])) {
+ if (!warned) cerr << "*** Error: didn't match all words: " << targetMod.getKeyText() << endl;
+ warned = true;
+ }
+if (optionDebug) {
+ cout << targetWords[i] << " : " << targetWordTags[i] << " => " << (targetWordTags[i] != -1 ? wordTags[targetWordTags[i]] : "") << endl;
+}
+ }
+if (optionDebug) {
+ cout << "---------------------" << endl;
+
+ cout << "\nAND... Here's your final output" << endl;
+ cout << "---------------------" << endl;
+}
+ // the finished markup for this verse; always printed
+ cout << newTargetModMarkup << endl;
+if (optionDebug) {
+ cout << endl;
+}
+ }
+ return 0;
+}
+
+
+// Copies orig into a new buffer, passing every <...> tag through XMLTag,
+// and records where the text to be matched sits in that buffer.
+//
+// While copying, the function remembers the most recent tag for which
+// XMLTag::isEmpty() is false: if that was a start tag named "w" it keeps the
+// tag's offset in the output, otherwise (end tag, or a start tag with another
+// name) it keeps -1.
+//
+// A text character is recorded when that remembered value is not -1, or when
+// the character is a space. For each recorded character
+//   bibMap gets the character's offset in the returned buffer
+//   wTags  gets the remembered <w> offset (or -1)
+//
+// Returns the rebuilt markup buffer.
+SWBuf findCanonicalBibleText(SWBuf orig, BibMap &bibMap, BibMap &wTags) {
+    SWBuf result = "";
+    SWBuf pendingTag = "";
+    // only needed by the alternative condition kept in the comment below
+    int tagLevel = 0;
+    int openW = -1;
+    bool insideTag = false;
+
+    for (int pos = 0; pos < orig.length(); pos++) {
+        const char c = orig[pos];
+
+        if (c == '<') {
+            insideTag = true;
+            continue;
+        }
+
+        if (c == '>') {
+            insideTag = false;
+            XMLTag t = pendingTag.c_str();
+            if (!t.isEmpty()) {
+                if (t.isEndTag()) {
+                    tagLevel--;
+                    openW = -1;
+                }
+                else {
+                    tagLevel++;
+                    if (t.getName() && !strcmp("w", t.getName())) {
+                        // the tag is appended just below, so this is
+                        // where it will start in the output
+                        openW = result.size();
+                    }
+                    else {
+                        openW = -1;
+                    }
+                }
+            }
+            result += t;
+            pendingTag = "";
+            continue;
+        }
+
+        if (insideTag) {
+            pendingTag += c;
+            continue;
+        }
+
+        // plain text character
+// for texts without <w> tags
+//      if (!tagLevel || openW != -1) {
+        if (openW != -1 || c == ' ') {
+            bibMap.push_back(result.size());
+            wTags.push_back(openW);
+        }
+        result += c;
+    }
+    return result;
+}
+
+
+// Puts addText into out at the place recorded for text character bibPos and
+// moves the recorded offsets which follow it.
+//
+// When a <w> start tag is recorded for the character (wTags[bibPos] != -1):
+//   - a start insert (after == false) is merged into that tag: the last
+//     character and the first two characters of addText are dropped and the
+//     remainder is placed two characters past the start of the recorded tag
+//   - an end insert (after == true) does nothing
+// Otherwise addText goes in front of the character, or directly behind it
+// when 'after' is set.
+//
+void insert(SWBuf addText, SWBuf &out, int bibPos, BibMap &bibMap, BibMap &wTags, bool after) {
+    const bool hasW = (wTags[bibPos] != -1);
+
+    // closing tag requested for a word which already sits in a <w>
+    if (after && hasW) return;
+
+    const int shiftFrom = bibPos + (after ? 1 : 0);
+    int target = 0;
+    if (hasW) {
+        target = wTags[bibPos] + 2;
+        addText--;      // discard the '>'
+        addText << 2;   // discard the '<w'
+    }
+    else {
+        target = bibMap[bibPos] + (after ? 1 : 0);
+    }
+
+    out.insert(target, addText);
+    for (int n = shiftFrom; n < bibMap.size(); n++) {
+        bibMap[n] += addText.length();
+        if (wTags[n] != -1) wTags[n] += addText.length();
+    }
+}
+
+
+
+// Splits the recorded text characters into words.
+//
+// Walks bibMap in order and reads each recorded character from markupBuf.
+// A space, '.', ';' or ',' ends the current word; any other character
+// extends it. For every word found
+//   targetWords      gets the word
+//   targetWordStarts gets the index of its first character
+//   targetWordEnds   gets the index of its last character
+// where the indexes count recorded characters, i.e. they are positions in
+// the returned text.
+//
+// Returns all recorded characters, separators included, as one buffer.
+//
+// TODO: an unfinished idea was to decode whole UTF-8 characters here
+// (getUniCharFromUTF8) and look them up in ignoreSeries to decide on word
+// breaks; for now the split is done byte by byte.
+SWBuf buildWordMaps(const SWBuf &markupBuf, const BibMap &bibMap, vector<SWBuf> &targetWords, vector<int> &targetWordStarts, vector<int> &targetWordEnds) {
+    SWBuf current = "";
+    SWBuf text = "";
+
+    for (BibMap::size_type n = 0; n < bibMap.size(); ++n) {
+        const char c = markupBuf[bibMap[n]];
+        const bool separator = (c == ' ' || c == '.' || c == ';' || c == ',');
+
+        if (!separator) {
+            // first character of a new word
+            if (!current.length()) targetWordStarts.push_back(text.length());
+            current += c;
+        }
+        else if (current.length()) {
+            // the word ended on the previous character
+            targetWordEnds.push_back(text.length()-1);
+            targetWords.push_back(current);
+            current = "";
+        }
+        text += c;
+    }
+
+    // a word which runs up to the end of the text
+    if (current.length()) {
+        targetWordEnds.push_back(text.length()-1);
+        targetWords.push_back(current);
+    }
+    return text;
+}
+
+
+// Collects the <w> tag data, and the words it covers, from fromMod's
+// current entry.
+//
+// For each item in the entry's "Word" attribute list one XMLTag named "w" is
+// appended to wordTags. Its "lemma" and "morph" attributes are assembled from
+// the item's Lemma/LemmaClass and Morph/MorphClass values (keys carry a ".N"
+// suffix when PartCount is not 1) and are only set when non-empty.
+// The item's "Text" value is split on space, '.', ';' and ','; every
+// resulting word is appended to fromWords, and the index of the item's tag in
+// wordTags is appended to fromWordTags, so the two vectors stay parallel.
+void pullFromModData(SWModule &fromMod, vector<XMLTag>&wordTags, vector<SWBuf> &fromWords, vector<int> &fromWordTags) {
+ fromMod.renderText(); // be sure FromMod has processed entry attributes
+ AttributeList &words = fromMod.getEntryAttributes()["Word"];
+ SWBuf fromWord = "";
+ SWBuf bibWord = "";
+ for (AttributeList::iterator it = words.begin(); it != words.end(); it++) {
+ // this is our new <w> XMLTag.
+ // attributes will be added below
+ XMLTag w("w");
+ int parts = atoi(it->second["PartCount"]);
+ SWBuf lemma = "";
+ SWBuf morph = "";
+ // parts are numbered from 1; multiple parts are joined with a space
+ for (int i = 1; i <= parts; i++) {
+ SWBuf key = "";
+ key = (parts == 1) ? "Lemma" : SWBuf().setFormatted("Lemma.%d", i);
+ AttributeValue::iterator li = it->second.find(key);
+ if (li != it->second.end()) {
+ if (i > 1) lemma += " ";
+ key = (parts == 1) ? "LemmaClass" : SWBuf().setFormatted("LemmaClass.%d", i);
+ AttributeValue::iterator lci = it->second.find(key);
+ // the class, when present, is written as a "class:" prefix
+ if (lci != it->second.end()) {
+ lemma += lci->second + ":";
+ }
+ lemma += li->second;
+ }
+ key = (parts == 1) ? "Morph" : SWBuf().setFormatted("Morph.%d", i);
+ li = it->second.find(key);
+ // silly. sometimes morph counts don't equal lemma counts
+ if (i == 1 && parts != 1 && li == it->second.end()) {
+ li = it->second.find("Morph");
+ }
+ if (li != it->second.end()) {
+ if (i > 1) morph += " ";
+ key = (parts == 1) ? "MorphClass" : SWBuf().setFormatted("MorphClass.%d", i);
+ AttributeValue::iterator lci = it->second.find(key);
+ // silly. sometimes morph counts don't equal lemma counts
+ if (i == 1 && parts != 1 && lci == it->second.end()) {
+ lci = it->second.find("MorphClass");
+ }
+ if (lci != it->second.end()) {
+ morph += lci->second + ":";
+ }
+ morph += li->second;
+ }
+ // TODO: add src tags and maybe other attributes
+ }
+
+ if (lemma.length()) w.setAttribute("lemma", lemma);
+ if (morph.length()) w.setAttribute("morph", morph);
+
+
+ // split the covered text into words; wordTags.size() is the index
+ // this item's tag will have once it is pushed at the end of the loop
+ fromWord = it->second["Text"];
+ bibWord = "";
+ for (int j = 0; j < fromWord.length(); j++) {
+ char c = fromWord[j];
+// if (!strchr(ignoreSeries, c)) {
+ if (c != ' ' && c != '.' && c != ';' && c != ',') {
+ bibWord += c;
+ }
+ else {
+ if (bibWord.length()) {
+ fromWords.push_back(bibWord);
+ fromWordTags.push_back(wordTags.size());
+ bibWord = "";
+ }
+ }
+ }
+ // last word of the text, if it was not followed by a separator
+ if (bibWord.length()) {
+ fromWords.push_back(bibWord);
+ fromWordTags.push_back(wordTags.size());
+ }
+
+ wordTags.push_back(w);
+ }
+}
+
+
+// Writes the matched word tags into markupBuf.
+//
+// For every target word whose entry in targetWordTags is not negative, the
+// text form of wordTags[that entry] is handed to insert() at the word's start
+// position, and "</w>" is handed to insert() behind the word's end position.
+// Unmatched words are left alone.
+//
+// TODO: this method needs some work,
+// like putting multiple consecutive words
+// together in one tag
+void insertWordTags(SWBuf &markupBuf, BibMap &bibMap, BibMap &wTags, const vector<int> &targetWordTags, const vector<XMLTag> &wordTags, const vector<int> &targetWordStarts, const vector<int> &targetWordEnds) {
+    for (int word = 0; word < targetWordTags.size(); word++) {
+        const int tagIndex = targetWordTags[word];
+        if (tagIndex < 0) continue;     // no match for this word
+
+        insert((const char *)wordTags[tagIndex], markupBuf, targetWordStarts[word], bibMap, wTags);
+        insert("</w>", markupBuf, targetWordEnds[word], bibMap, wTags, true);
+    }
+}
More information about the sword-cvs
mailing list