#include "matcher.h"
#include <utf8greekaccents.h>

#ifndef gntmatcher_h
#define gntmatcher_h

/**
 * Matcher for Greek New Testament words.
 *
 * Words are normalized by sanitizeWord() (accent filter applied, upper-cased,
 * '[' and ']' bytes replaced with 0) before being compared, and two words are
 * only treated as the same word when compare() scores them above matchFloor.
 */
class GNTMatcher : public Matcher {
	UTF8GreekAccents sanitizeGreekAccentFilter;

	// A compare() score must be strictly greater than this to count as a
	// match; with a 0-100 scale that means only a score of 100 is accepted.
	static const int matchFloor = 99;

	/**
	 * Looks through fromWords, in order, for the first entry whose tag has
	 * not been consumed yet (tag != -1) and which compare() scores above
	 * matchFloor against word.
	 *
	 * On success the entry's tag is stored in targetTag and the entry is
	 * marked consumed (its tag is set to -1) so it cannot be handed out again.
	 *
	 * @param word          the target word to find a partner for
	 * @param targetTag     receives the tag of the matched entry; untouched on failure
	 * @param fromWords     candidate source words
	 * @param fromWordTags  tags parallel to fromWords; -1 means "skip this entry"
	 * @return true if an entry was claimed, false otherwise
	 */
	bool claimFirstMatch(const SWBuf &word, int &targetTag, const vector<SWBuf> &fromWords, vector<int> &fromWordTags) {
		for (unsigned long j = 0; j < fromWords.size(); ++j) {
			if (fromWordTags[j] == -1) continue;
			if (compare(word, fromWords[j]) > matchFloor) {
				targetTag = fromWordTags[j];
				fromWordTags[j] = -1;
				return true;
			}
		}
		return false;
	}

public:

	GNTMatcher() : sanitizeGreekAccentFilter() {
		sanitizeGreekAccentFilter.setOptionValue("off");
	}

	/**
	 * Compares two words and gives a percentage assurance of a match.
	 *
	 * Both words are sanitized first.  The bytes of the shorter result are
	 * then looked up, in order, in the longer result (each search resumes
	 * where the previous one stopped), and the number of bytes found is
	 * reported as a percentage of the longer result's length, rounded down.
	 *
	 * TODO: could use more smarts here
	 *
	 * @param s1  first word
	 * @param s2  second word
	 * @return 0..100; 0 when both words are empty after sanitizing, because
	 *         there is nothing to base a match on (and it avoids a 0/0)
	 */
	virtual int compare(const SWBuf &s1, const SWBuf &s2) {
		SWBuf t1 = sanitizeWord(s1);
		SWBuf t2 = sanitizeWord(s2);

		// On equal lengths t2 plays the "largest" role, as before.
		const bool firstIsLonger = t1.length() > t2.length();
		SWBuf &largest  = firstIsLonger ? t1 : t2;
		SWBuf &smallest = firstIsLonger ? t2 : t1;

		const unsigned long largestLen = largest.length();
		if (!largestLen) return 0;

		unsigned long matches = 0;
		unsigned long j = 0;
		for (unsigned long i = 0; i < smallest.length() && j < largestLen; ++i) {
			// advance through the longer word until this byte is found
			while (j < largestLen) {
				if (smallest[i] == largest[j++]) {
					++matches;
					break;
				}
			}
		}

		// Integer arithmetic gives the exact floor of the percentage;
		// going through float could truncate to one less than that.
		return (int)((matches * 100) / largestLen);
	}

	//
	// This is where the magic happens
	//
	// we must point each targetMod word to an XMLTag
	//
	// when the magic is done, and your guess is made
	// populate targetWordTags with the integer offset
	// into wordTags for which XMLTag you think it should
	// be.
	//
	// One entry is appended to targetWordTags per target word: the tag of
	// the source word it was matched with, or -1 if no match was found.
	// The entries are then addressed by index starting at 0, so this
	// assumes targetWordTags is empty on entry -- TODO confirm with callers.
	//
	// fromWordTags is taken by value on purpose: it is used as a scratch
	// list in which consumed tags are overwritten with -1.
	//
	virtual void matchWords(vector<int> &targetWordTags, const vector<SWBuf> &targetWords, const vector<SWBuf> &fromWords, vector<int> fromWordTags) {

		// initialize our results to all -1 so we can pop around and set
		// words as we find them, and know which ones we haven't yet set
		for (unsigned long i = 0; i < targetWords.size(); ++i) targetWordTags.push_back(-1);

		// poor effort attempt
		for (unsigned long i = 0; i < targetWords.size(); ++i) {
			SWBuf word = targetWords[i];

			if (claimFirstMatch(word, targetWordTags[i], fromWords, fromWordTags)) continue;

			// didn't match

			// TOTRY: maybe check one word before and after?
			//
			// be creative!
			//

			// let's see if we have common misses, regularize and recheck
			if (word == "ἀλλ" || word == "Ἀλλ") {
				claimFirstMatch(SWBuf("αλλα"), targetWordTags[i], fromWords, fromWordTags);
			}
		}
	}

	/**
	 * Produces the normalized form of a word used for comparison.
	 *
	 * A copy of the word is run through the Greek accent filter (configured
	 * with option value "off" in the constructor), upper-cased, and has the
	 * bytes '[' and ']' replaced with 0 via SWBuf::replaceBytes -- presumably
	 * this strips the brackets; confirm against swbuf.h.
	 *
	 * @param word  the word to normalize; not modified
	 * @return the normalized copy
	 */
	virtual SWBuf sanitizeWord(const SWBuf &word) {
		SWBuf t1 = word;
		// remove greek accents
		sanitizeGreekAccentFilter.processText(t1);
		t1.toUpper();
		t1.replaceBytes("[]", 0);
		return t1;
	}
};

#endif