[sword-svn] r533 - in trunk/migratetags: . matchers
scribe at crosswire.org
scribe at crosswire.org
Tue May 5 20:50:10 MST 2020
Author: scribe
Date: 2020-05-05 20:50:10 -0700 (Tue, 05 May 2020)
New Revision: 533
Added:
trunk/migratetags/matchers/gntmatcher.h
Modified:
trunk/migratetags/matchers/defaultmatcher.h
trunk/migratetags/matchers/matcher.h
trunk/migratetags/migratetags.cpp
Log:
Updated from NA28 strongs migration effort. Added GNTMatcher
Modified: trunk/migratetags/matchers/defaultmatcher.h
===================================================================
--- trunk/migratetags/matchers/defaultmatcher.h 2019-04-14 22:36:24 UTC (rev 532)
+++ trunk/migratetags/matchers/defaultmatcher.h 2020-05-06 03:50:10 UTC (rev 533)
@@ -6,23 +6,16 @@
class DefaultMatcher : public Matcher {
public:
+ DefaultMatcher() {
+ }
+
// Compares 2 words and tries to give a percentage assurance of a match
// TODO: could use more smarts here
//
virtual int compare(const SWBuf &s1, const SWBuf &s2) {
- SWBuf t1 = s1;
- SWBuf t2 = s2;
- UTF8GreekAccents filter;
- filter.setOptionValue("off");
+ SWBuf t1 = sanitizeWord(s1);
+ SWBuf t2 = sanitizeWord(s2);
- // remove greek accents
- filter.processText(t1);
- filter.processText(t2);
-
- // change to uppercase to match
- StringMgr::getSystemStringMgr()->upperUTF8(t1.getRawData());
- StringMgr::getSystemStringMgr()->upperUTF8(t2.getRawData());
-
int retVal = 0;
SWBuf largest = (t1.length() > t2.length()) ? t1 : t2;
SWBuf smallest = (t1.length() > t2.length()) ? t2 : t1;
@@ -38,6 +31,7 @@
}
return (((float)matches) / largest.length()) * 100;
}
+
//
// This is where the magic happens
//
@@ -76,5 +70,10 @@
}
}
}
+virtual SWBuf sanitizeWord(const SWBuf &word) { // default sanitizer: works on a copy, so word itself is left untouched
+ SWBuf t1 = word;
+ t1.toUpper(); // the only normalization done here; compare() feeds both of its inputs through this method
+ return t1;
+}
};
#endif
Added: trunk/migratetags/matchers/gntmatcher.h
===================================================================
--- trunk/migratetags/matchers/gntmatcher.h (rev 0)
+++ trunk/migratetags/matchers/gntmatcher.h 2020-05-06 03:50:10 UTC (rev 533)
@@ -0,0 +1,114 @@
+#include "matcher.h"
+#include <utf8greekaccents.h>
+
+#ifndef gntmatcher_h
+#define gntmatcher_h
+
+class GNTMatcher : public Matcher { // Matcher whose word comparison runs on sanitizeWord() output (accent filter, toUpper, "[]" handling)
+ UTF8GreekAccents sanitizeGreekAccentFilter;
+public:
+
+ GNTMatcher() : sanitizeGreekAccentFilter() {
+ sanitizeGreekAccentFilter.setOptionValue("off"); // configured once here; sanitizeWord() relies on it to remove accents
+ }
+
+// Compares 2 words and tries to give a percentage (0-100) assurance of a match
+// TODO: could use more smarts here
+// Score = chars of the shorter sanitized word found, in order, in the longer one, divided by the longer word's length.
+virtual int compare(const SWBuf &s1, const SWBuf &s2) {
+ SWBuf t1 = sanitizeWord(s1);
+ SWBuf t2 = sanitizeWord(s2);
+
+ int retVal = 0; // never read in this function
+ SWBuf largest = (t1.length() > t2.length()) ? t1 : t2;
+ SWBuf smallest = (t1.length() > t2.length()) ? t2 : t1;
+ int matches = 0;
+ int j = 0; // scan position in largest; only ever moves forward, so matches must occur in order
+ for (int i = 0; i < smallest.length() && j < largest.length(); i++) {
+ while (j < largest.length()) {
+ if (smallest[i] == largest[j++]) {
+ matches++;
+ break;
+ }
+ }
+ }
+ return (((float)matches) / largest.length()) * 100; // NOTE(review): largest.length() is 0 when both sanitized words are empty -- confirm that case cannot reach here
+}
+
+//
+// This is where the magic happens
+//
+// we must point each targetMod word to an XMLTag
+// (one entry is appended to targetWordTags per target word; -1 means "no match found")
+// when the magic is done, and your guess is made
+// populate targetWordTags with the integer offset
+// into wordTags for which XMLTag you think it should
+// be.
+// fromWordTags is taken by value: entries are set to -1 below once claimed, without touching the caller's vector.
+
+virtual void matchWords(vector<int> &targetWordTags, const vector<SWBuf> &targetWords, const vector<SWBuf> &fromWords, vector<int> fromWordTags) {
+
+ // initialize our results to all -1 so we can pop around and set
+ // words as we find them, and know which ones we haven't yet set
+ for (int i = 0; i < targetWords.size(); i++) targetWordTags.push_back(-1);
+
+
+ // poor effort attempt: each target word takes the first still-unclaimed from-word that scores as a match
+ int j = 0; // not read: the loop body below declares its own j, which shadows this one
+ for (int i = 0; i < targetWords.size(); ++i) {
+ SWBuf w1 = targetWords[i];
+ int j = 0;
+ for (; j < fromWords.size(); ++j) {
+ if (fromWordTags[j] == -1) continue;
+
+ SWBuf w2 = fromWords[j];
+ int match = compare(w1, w2);
+ // compare() returns an int percentage, so "> 99" accepts only a score of 100,
+ // i.e. the two sanitized words are identical
+ if (match > 99) {
+ targetWordTags[i] = fromWordTags[j];
+ fromWordTags[j] = -1;
+ break;
+ }
+ }
+ // didn't match (inner loop ran to the end without hitting break)
+ if (j == fromWords.size()) {
+ // TOTRY: maybe check one word before and after?
+ //
+ // be creative!
+ //
+
+ // let's see if we have common misses, regularize and recheck
+ SWBuf w1Orig = w1;
+ if (w1 == "ἀλλ" || w1 == "Ἀλλ") w1 = "αλλα";
+
+ if (w1 != w1Orig) {
+ for (int j = 0; j < fromWords.size(); ++j) {
+ if (fromWordTags[j] == -1) continue;
+
+ SWBuf w2 = fromWords[j];
+ int match = compare(w1, w2);
+ // same acceptance rule as the first pass: only a score of 100
+ // (identical sanitized words) counts as a match
+ if (match > 99) {
+ targetWordTags[i] = fromWordTags[j];
+ fromWordTags[j] = -1;
+ break;
+ }
+ }
+ }
+ }
+ }
+}
+
+virtual SWBuf sanitizeWord(const SWBuf &word) {
+ SWBuf t1 = word;
+ // remove greek accents
+ sanitizeGreekAccentFilter.processText(t1);
+ t1.toUpper();
+ t1.replaceBytes("[]", 0); // NOTE(review): presumably drops '[' and ']' -- confirm what SWBuf::replaceBytes does with a 0 replacement byte
+ return t1;
+}
+
+};
+#endif
Modified: trunk/migratetags/matchers/matcher.h
===================================================================
--- trunk/migratetags/matchers/matcher.h 2019-04-14 22:36:24 UTC (rev 532)
+++ trunk/migratetags/matchers/matcher.h 2020-05-06 03:50:10 UTC (rev 533)
@@ -20,6 +20,8 @@
//
virtual void matchWords(vector<int> &targetWordTags, const vector<SWBuf> &targetWords, const vector<SWBuf> &fromWords, vector<int> fromWordTags) = 0;
+// sanitize word for comparing (e.g., toUpper, strip accents, etc)
+virtual SWBuf sanitizeWord(const SWBuf &word) = 0;
};
#endif
Modified: trunk/migratetags/migratetags.cpp
===================================================================
--- trunk/migratetags/migratetags.cpp 2019-04-14 22:36:24 UTC (rev 532)
+++ trunk/migratetags/migratetags.cpp 2020-05-06 03:50:10 UTC (rev 533)
@@ -50,7 +50,7 @@
int main(int argc, char **argv) {
const char *progName = argv[0];
- for (int i = 1; i < argc; i++) {
+ for (int i = 1; i < argc; ++i) {
if (!strcmp(argv[i], "-v")) {
optionDebug = true;
}
@@ -62,7 +62,7 @@
}
else usage(progName, (((SWBuf)"Unknown argument: ")+ argv[i]).c_str());
}
- VerseKey vk;
+
SWMgr lib;
lib.setGlobalOption("Textual Variants", "Secondary Reading");
SWModule *m = lib.getModule(targetModuleName);
@@ -246,15 +246,40 @@
cout << "\nTargetMod Words: " << endl;
}
bool warned = false;
- for (int i = 0; i < targetWords.size(); i++) {
+ for (int i = 0; i < targetWords.size(); ++i) {
if (targetWordTags[i] == -1 && !strstr(ignoreSeries, targetWords[i])) {
- if (!warned) cerr << "*** Error: didn't match all words: " << targetMod.getKeyText() << endl;
- warned = true;
+ if (!warned) {
+ cerr << "*** Error: didn't match all words: " << targetMod.getKeyText() << endl;
+ cerr << strongsSourceModuleName << ":";
+ for (int j = 0; j < fromWords.size(); ++j) {
+ cerr << " " << fromWords[j];
+ }
+ cerr << endl;
+ cerr << targetModuleName << ":";
+ for (int j = 0; j < targetWords.size(); ++j) {
+ cerr << " " << targetWords[j];
+ }
+ cerr << endl;
+ cerr << endl;
+ cerr << "Unmatched Words:" << endl;
+ warned = true;
+ }
+ cerr << " " << i << ": " << targetWords[i] << " (" << matcher->sanitizeWord(targetWords[i]) << ")" << endl;
}
if (optionDebug) {
cout << targetWords[i] << " : " << targetWordTags[i] << " => " << (targetWordTags[i] != -1 ? wordTags[targetWordTags[i]] : "") << endl;
}
}
+ if (warned) {
+ cerr << "\n" << targetModuleName << " Tags:\n";
+ VerseKey *vk = (VerseKey *)targetMod.getKey();
+ for (int j = 0; j < targetWords.size(); ++j) {
+ if (!strstr(ignoreSeries, targetWords[j])) {
+ cerr << targetWords[j] << "\t\t " << vk->getOSISRef() << "." << j << "=" << (targetWordTags[j] != -1 ? wordTags[targetWordTags[j]] : "") << endl;
+ }
+ }
+ cerr << "---------------------" << endl;
+ }
if (optionDebug) {
cout << "---------------------" << endl;
@@ -279,7 +304,7 @@
int tagLevel = 0;
int wTag = -1;
int inTag = 0;
- for (int i = 0; i < orig.length(); i++) {
+ for (int i = 0; i < orig.length(); ++i) {
if (orig[i] == '<') {
inTag = true;
}
@@ -330,7 +355,7 @@
}
if (!after || wTags[bibPos] == -1) {
out.insert(to, addText);
- for (int i = bibPos+((after)?1:0); i < bibMap.size(); i++) {
+ for (int i = bibPos+((after)?1:0); i < bibMap.size(); ++i) {
bibMap[i] += addText.length();
if (wTags[i] != -1) wTags[i] += addText.length();
}
@@ -338,7 +363,6 @@
}
-
SWBuf buildWordMaps(const SWBuf &markupBuf, const BibMap &bibMap, vector<SWBuf> &targetWords, vector<int> &targetWordStarts, vector<int> &targetWordEnds) {
SWBuf bibWord = "";
SWBuf fromWord = "";
@@ -386,36 +410,38 @@
// this is our new <w> XMLTag.
// attributes will be added below
XMLTag w("w");
+ // this only gives us word count, not if we have multiple entries per word
+ // don't use as loop
int parts = atoi(it->second["PartCount"]);
SWBuf lemma = "";
SWBuf morph = "";
- for (int i = 1; i <= parts; i++) {
+ bool found = true;
+ for (int i = 1; found; ++i) {
+ found = false;
SWBuf key = "";
- key = (parts == 1) ? "Lemma" : SWBuf().setFormatted("Lemma.%d", i);
+ key = SWBuf().setFormatted("Lemma.%d", i);
AttributeValue::iterator li = it->second.find(key);
+ if (i == 1 && li == it->second.end()) li = it->second.find("Lemma");
if (li != it->second.end()) {
+ found = true;
if (i > 1) lemma += " ";
- key = (parts == 1) ? "LemmaClass" : SWBuf().setFormatted("LemmaClass.%d", i);
+ key = SWBuf().setFormatted("LemmaClass.%d", i);
AttributeValue::iterator lci = it->second.find(key);
+ if (i == 1 && lci == it->second.end()) lci = it->second.find("LemmaClass");
if (lci != it->second.end()) {
lemma += lci->second + ":";
}
lemma += li->second;
}
- key = (parts == 1) ? "Morph" : SWBuf().setFormatted("Morph.%d", i);
+ key = SWBuf().setFormatted("Morph.%d", i);
li = it->second.find(key);
- // silly. sometimes morph counts don't equal lemma counts
- if (i == 1 && parts != 1 && li == it->second.end()) {
- li = it->second.find("Morph");
- }
+ if (i == 1 && li == it->second.end()) li = it->second.find("Morph");
if (li != it->second.end()) {
+ found = true;
if (i > 1) morph += " ";
- key = (parts == 1) ? "MorphClass" : SWBuf().setFormatted("MorphClass.%d", i);
+ key = SWBuf().setFormatted("MorphClass.%d", i);
AttributeValue::iterator lci = it->second.find(key);
- // silly. sometimes morph counts don't equal lemma counts
- if (i == 1 && parts != 1 && lci == it->second.end()) {
- lci = it->second.find("MorphClass");
- }
+ if (i == 1 && lci == it->second.end()) lci = it->second.find("MorphClass");
if (lci != it->second.end()) {
morph += lci->second + ":";
}
@@ -430,9 +456,8 @@
fromWord = it->second["Text"];
bibWord = "";
- for (int j = 0; j < fromWord.length(); j++) {
+ for (int j = 0; j < fromWord.length(); ++j) {
char c = fromWord[j];
-// if (!strchr(ignoreSeries, c)) {
if (c != ' ' && c != '.' && c != ';' && c != ',') {
bibWord += c;
}
@@ -458,7 +483,7 @@
// TODO: this method needs some work,
// like putting multiple consecutive words
// together in one tag
- for (int i = 0; i < targetWordTags.size(); i++) {
+ for (int i = 0; i < targetWordTags.size(); ++i) {
if (targetWordTags[i] > -1) {
insert((const char *)wordTags[targetWordTags[i]], markupBuf, targetWordStarts[i], bibMap, wTags);
insert("</w>", markupBuf, targetWordEnds[i], bibMap, wTags, true);
More information about the sword-cvs
mailing list