[sword-svn] r470 - in trunk: . migratetags
scribe at crosswire.org
scribe at crosswire.org
Wed Oct 23 07:33:38 MST 2013
Author: scribe
Date: 2013-10-23 07:33:38 -0700 (Wed, 23 Oct 2013)
New Revision: 470
Added:
trunk/migratetags/
trunk/migratetags/Makefile
trunk/migratetags/esvtag.cpp
Log:
committed first cut at tag migration tool to move <w> tags from one module to another
Added: trunk/migratetags/Makefile
===================================================================
--- trunk/migratetags/Makefile (rev 0)
+++ trunk/migratetags/Makefile 2013-10-23 14:33:38 UTC (rev 470)
@@ -0,0 +1,11 @@
+# Programs built from a single .cpp file of the same name
+TARGETS= esvtag
+
+# 'all' and 'clean' name actions, not files; never treat a stray file
+# called 'all' or 'clean' as an up-to-date target
+.PHONY: all clean
+
+all: $(TARGETS)
+
+# -f keeps 'make clean' from failing when nothing has been built yet
+clean:
+	rm -f $(TARGETS)
+
+# Suffix rule: build an executable directly from its .cpp source,
+# taking compiler and linker flags for SWORD from pkg-config
+.cpp:
+	g++ -g `pkg-config --cflags sword` $< -o $@ `pkg-config --libs sword`
+
+
Added: trunk/migratetags/esvtag.cpp
===================================================================
--- trunk/migratetags/esvtag.cpp (rev 0)
+++ trunk/migratetags/esvtag.cpp 2013-10-23 14:33:38 UTC (rev 470)
@@ -0,0 +1,389 @@
+#include <versekey.h>
+#include <swmgr.h>
+#include <utilxml.h>
+#include <swbuf.h>
+#include <swmodule.h>
+#include <iostream>
+#include <vector>
+
+using namespace sword;
+using namespace std;
+
+typedef vector<unsigned long> BibMap;
+
+void insert(const SWBuf &addText, SWBuf &out, int bibPos, BibMap &bibMap, bool after = false);
+int compare(const SWBuf &s1, const SWBuf &s2);
+
+SWBuf findCanonicalBibleText(SWBuf orig, BibMap &bibMap);
+SWBuf buildWordMaps(const SWBuf &markupBuf, const BibMap &bibMap, vector<SWBuf> &esvWords, vector<int> &esvWordStarts, vector<int> &esvWordEnds);
+void pullKJVData(SWModule &kjv, vector<XMLTag>&wordTags, vector<SWBuf> &kjvWords, vector<int> &kjvWordTags);
+void insertWordTags(SWBuf &markupBuf, BibMap &bibMap, const vector<int> &esvWordTags, const vector<XMLTag> &wordTags, const vector<int> &esvWordStarts, const vector<int> &esvWordEnds);
+
+
+//
+// This is where the magic happens
+//
+// We must point each ESV word to an XMLTag.
+//
+// On return esvWordTags holds exactly one entry per ESV word: the
+// integer offset into wordTags (copied from kjvWordTags) of the XMLTag
+// we guess belongs to that word, or -1 when no KJV word matched.
+// Callers must treat -1 as "no tag" and never use it as an index.
+//
+// esvWordTags - [out] reset, then filled with esvWords.size() entries
+// esvWords    - words of the ESV verse, in verse order
+// kjvWords    - words of the KJV verse, in verse order
+// kjvWordTags - for each kjvWords[x], its offset into wordTags
+//
+void matchWords(vector<int> &esvWordTags, const vector<SWBuf> &esvWords, const vector<SWBuf> &kjvWords, const vector<int> &kjvWordTags) {
+
+	// initialize our results to all -1 so we can pop around and set
+	// words as we find them, and know which ones we haven't yet set
+	esvWordTags.assign(esvWords.size(), -1);
+
+	// kjvWords and kjvWordTags are read in parallel; never look past
+	// the end of the shorter one
+	const size_t kjvCount = (kjvWords.size() < kjvWordTags.size()) ? kjvWords.size() : kjvWordTags.size();
+
+	// poor effort attempt:
+	// walk both verses in order; for each ESV word scan forward through
+	// the KJV words which have not been consumed yet and take the first
+	// one which looks close enough
+	size_t nextKJV = 0;
+	for (size_t i = 0; i < esvWords.size(); i++) {
+		for (size_t j = nextKJV; j < kjvCount; j++) {
+			// if we have a better than 75% match of sequential characters
+			// then we'll say we have a match
+			if (compare(esvWords[i], kjvWords[j]) > 75) {
+				esvWordTags[i] = kjvWordTags[j];
+				// later ESV words may only use later KJV words
+				nextKJV = j + 1;
+				break;
+			}
+		}
+		// no match: this word keeps its -1 and nextKJV stays put, so
+		// the following ESV word gets to try the same KJV words
+		//
+		// TOTRY: maybe check one word before and after?
+		//
+		// be creative!
+		//
+	}
+}
+
+
+// Driver: migrates the <w> tags of one KJV verse onto the same ESV
+// verse and prints every intermediate step to stdout.
+//
+// Returns 0 on success, 1 when the ESV or KJV module is not installed.
+int main(int argc, char **argv) {
+	SWMgr lib;
+
+	// getModule() hands back a null pointer for a module which is not
+	// installed; check before turning the pointers into references
+	SWModule *esvModule = lib.getModule("ESV");
+	SWModule *kjvModule = lib.getModule("KJV");
+	if (!esvModule || !kjvModule) {
+		cerr << "Both the ESV and the KJV module must be installed." << endl;
+		return 1;
+	}
+	SWModule &esv = *esvModule;
+	SWModule &kjv = *kjvModule;
+
+	// we'll do the whole Bible eventually, but let's just get one verse
+	// working well.
+	esv.setKey("gen1.1");		// lets try this verse
+//	for (esv = TOP; !esv.Error(); esv++) {
+
+	// XML word tags which should be placed in this verse (start tag)
+	// eg., <w lemma=...>
+	// pulled from KJV
+	vector<XMLTag> wordTags;
+
+	// Just the raw canonical Bible text of this verse with no tags
+	// eg., "In the beginning God created the heavens and the earth."
+	SWBuf justESVBibleText = "";
+
+	// a mapping for each character in justESVBibleText to the real location
+	// in our out buffer.  This allows us to insert our <w> and </w>
+	// tags in the correct place amongst the fully marked up
+	// ESV out buffer.  This work is all done in the insert() method
+	// below
+	BibMap bibMap;
+
+	// justESVBibleText (above) broken down into separate words
+	// ie. all words in the ESV from this verse
+	// eg. [0] = "In"; [1] = "the"; [2] = "beginning"; ...
+	vector<SWBuf> esvWords;
+
+	// where each corresponding esvWords[x] starts in justESVBibleText
+	// eg. for "In the beginning..."
+	// [0] = 0; [1] = 3; [2] = 7; ...
+	// Needed to pass to insert method so we know where
+	// to insert the <w> start tag
+	vector<int> esvWordStarts;
+
+	// same as esvWordStarts, but the end of each word
+	// eg. [0] = 1; [1] = 5; [2] = 15
+	// Needed to pass to insert method so we know where
+	// to insert the </w> end tag
+	vector<int> esvWordEnds;
+
+	// This is the doozy.  This maps each ESV word to the correct
+	// wordTags entry (or -1 when no entry was found for the word).
+	vector<int> esvWordTags;
+
+	// Equivalent to esvWords above, but for the KJV.
+	// Useful for helping determine matches to ESV words
+	vector<SWBuf> kjvWords;
+
+	// Equivalent to esvWordTags which we need to produce,
+	// but this one is produced for us from the KJV data
+	// If we can match a kjvWords[x] entry, then we can assign
+	// esvWordTags[ourMatch] = kjvWordTags[x]
+	vector<int> kjvWordTags;
+
+	bibMap.clear();
+
+	kjv.setKey(esv.getKey());
+
+	cout << "\nProcessing Verse: " << esv.getKeyText() << endl;
+	cout << "---------------------" << endl;
+
+	cout << "\nOur KJV Verse Markup" << endl;
+	cout << "---------------------" << endl;
+	cout << kjv.getRawEntry() << endl;
+	cout << "---------------------" << endl;
+
+
+	// grab our raw, fully marked up ESV text for this verse
+	SWBuf orig = esv.getRawEntryBuf();
+
+	cout << "\nOur Original ESV Markup" << endl;
+	cout << "---------------------" << endl;
+	cout << orig << endl;
+	cout << "---------------------" << endl;
+
+	// let's find where just the canonical text is amongst
+	// all our markup
+	// newESVMarkup will eventually hold our updated markup with
+	// the new <w> tags, but we'll start here by setting it to
+	// the processed original markup.
+	// on return, bibMap will be populated with each character
+	// and the corresponding location into newESVMarkup where
+	// the character resides.
+	SWBuf newESVMarkup = findCanonicalBibleText(orig, bibMap);
+
+	cout << "\nOur Original ESV Markup After XMLTag-ifying" << endl;
+	cout << "---------------------" << endl;
+	cout << newESVMarkup << endl;
+	cout << "---------------------" << endl;
+
+	// let's populate our ESV word data and fill in our
+	// justESVBibleText buffer
+	justESVBibleText = buildWordMaps(newESVMarkup, bibMap, esvWords, esvWordStarts, esvWordEnds);
+
+	cout << "\nJust ESV Bible Text" << endl;
+	cout << "---------------------" << endl;
+	cout << justESVBibleText << endl;
+	cout << "---------------------" << endl;
+
+
+	// ok, now lets grab out the groovy data from the KJV module
+	pullKJVData(kjv, wordTags, kjvWords, kjvWordTags);
+
+
+	//
+	// ok, here's the real work.
+	//
+	// This method needs to guess which ESV words match which KJV
+	// words and then point them to their same original language
+	// word tag by populating esvWordTags
+	//
+	matchWords(esvWordTags, esvWords, kjvWords, kjvWordTags);
+
+	// ok, now that we have our esvWordTags magically populated
+	// let's do the grunt work of inserting the <w> and </w> tags
+	insertWordTags(newESVMarkup, bibMap, esvWordTags, wordTags, esvWordStarts, esvWordEnds);
+
+
+	cout << "\nHere's how you mapped things..." << endl;
+	cout << "---------------------" << endl;
+	cout << "Total wordTags: " << wordTags.size() << endl;
+	cout << "\nESV Words: " << endl;
+	for (size_t i = 0; i < esvWords.size(); i++) {
+		// -1 marks a word without a tag; it must not be used as an
+		// index into wordTags
+		const int tagIndex = (i < esvWordTags.size()) ? esvWordTags[i] : -1;
+		cout << esvWords[i] << " : " << tagIndex << " => ";
+		if (tagIndex >= 0 && static_cast<size_t>(tagIndex) < wordTags.size()) {
+			cout << wordTags[tagIndex];
+		}
+		else {
+			cout << "(no tag)";
+		}
+		cout << endl;
+	}
+	cout << "---------------------" << endl;
+
+	cout << "\nAND... Here's your final output" << endl;
+	cout << "---------------------" << endl;
+	cout << newESVMarkup << endl;
+	cout << endl;
+//	}
+	return 0;
+}
+
+
+// Walks the marked up verse once and rebuilds it in a fresh buffer:
+// everything between '<' and '>' is run through XMLTag and re-emitted,
+// every other character is copied as is.
+//
+// While doing so a depth counter follows the tags: an end tag lowers
+// it, any other tag which XMLTag does not report as empty raises it.
+// For each character copied while the counter is zero, the
+// character's offset in the returned buffer is appended to bibMap.
+//
+// orig   - raw markup of the verse
+// bibMap - [out] offsets of the Biblical text characters
+// returns the rebuilt markup the offsets in bibMap refer to
+SWBuf findCanonicalBibleText(SWBuf orig, BibMap &bibMap) {
+	SWBuf rebuilt = "";
+	SWBuf pendingTag = "";		// text collected since the last '<'
+	int depth = 0;
+	bool collecting = false;	// true between '<' and '>'
+
+	for (int pos = 0; pos < orig.length(); ++pos) {
+		const char c = orig[pos];
+
+		if (c == '<') {
+			collecting = true;
+			continue;
+		}
+
+		if (c == '>') {
+			collecting = false;
+			XMLTag parsed = pendingTag.c_str();
+			if (!parsed.isEmpty()) {
+				depth += parsed.isEndTag() ? -1 : 1;
+			}
+			rebuilt += parsed;
+			pendingTag = "";
+			continue;
+		}
+
+		if (collecting) {
+			pendingTag += c;
+			continue;
+		}
+
+		// plain character: remember where it lands if we are
+		// outside of all tags, then copy it
+		if (depth == 0) {
+			bibMap.push_back(rebuilt.size());
+		}
+		rebuilt += c;
+	}
+	return rebuilt;
+}
+
+
+// Splices addText into out next to Bible character number bibPos and
+// moves the recorded offsets of all Bible characters which now sit
+// behind the new text.
+//
+// after == false: the text is placed directly in front of the
+//                 character; its own offset is moved as well
+// after == true : the text is placed directly behind the character;
+//                 only the offsets of later characters are moved
+//
+void insert(const SWBuf &addText, SWBuf &out, int bibPos, BibMap &bibMap, bool after) {
+	const int skip = after ? 1 : 0;
+	out.insert(bibMap[bibPos] + skip, addText);
+
+	int shiftFrom = bibPos + skip;
+	while (shiftFrom < bibMap.size()) {
+		bibMap[shiftFrom] += addText.length();
+		++shiftFrom;
+	}
+}
+
+
+// Compares 2 words and tries to give a percentage assurance of a match
+//
+// The characters of the shorter word are looked up, in order, in the
+// longer word; each character of the longer word is used at most
+// once.  The result is the number of characters found as a whole
+// percentage (rounded down) of the longer word's length.
+// The comparison is case sensitive.
+//
+// Returns 0..100; 0 when both words are empty, since there is nothing
+// to base a match on (and nothing to divide by).
+//
+// TODO: could use more smarts here
+//
+int compare(const SWBuf &s1, const SWBuf &s2) {
+	// bind references instead of copying both words on every call
+	const bool firstIsLonger = s1.length() > s2.length();
+	const SWBuf &largest  = firstIsLonger ? s1 : s2;
+	const SWBuf &smallest = firstIsLonger ? s2 : s1;
+
+	const unsigned long largeLen = largest.length();
+	const unsigned long smallLen = smallest.length();
+	if (!largeLen) return 0;
+
+	const char *large = largest.c_str();
+	const char *small = smallest.c_str();
+
+	unsigned long matches = 0;
+	unsigned long j = 0;
+	for (unsigned long i = 0; i < smallLen && j < largeLen; i++) {
+		// j only ever moves forward, so matches keep their order
+		while (j < largeLen) {
+			if (small[i] == large[j++]) {
+				matches++;
+				break;
+			}
+		}
+	}
+
+	// integer math: float rounding must not turn an exact 76% into 75
+	return static_cast<int>((matches * 100) / largeLen);
+}
+
+
+// Collects the characters bibMap points to in markupBuf and cuts them
+// into words, a word being an unbroken run of the letters a-z / A-Z.
+//
+// markupBuf     - markup buffer the offsets in bibMap refer to
+// bibMap        - offset into markupBuf of each Bible text character
+// esvWords      - [out] receives each word found, in order
+// esvWordStarts - [out] index, in the returned text, of each word's
+//                 first letter
+// esvWordEnds   - [out] index, in the returned text, of each word's
+//                 last letter
+// returns all characters bibMap points to, in order
+SWBuf buildWordMaps(const SWBuf &markupBuf, const BibMap &bibMap, vector<SWBuf> &esvWords, vector<int> &esvWordStarts, vector<int> &esvWordEnds) {
+	SWBuf plainText = "";
+	SWBuf word = "";	// letters of the word we are currently in
+
+	for (BibMap::size_type n = 0; n < bibMap.size(); ++n) {
+		const char ch = markupBuf[bibMap[n]];
+		const bool isLetter = (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
+
+		if (isLetter) {
+			// first letter of a new word: ch has not been added to
+			// plainText yet, so its length is the letter's index
+			if (!word.length()) esvWordStarts.push_back(plainText.length());
+			word += ch;
+		}
+		else if (word.length()) {
+			// a non-letter closes the word; the previous character
+			// was its last letter
+			esvWordEnds.push_back(plainText.length() - 1);
+			esvWords.push_back(word);
+			word = "";
+		}
+		plainText += ch;
+	}
+
+	// a word running up to the very end of the text
+	if (word.length()) {
+		esvWordEnds.push_back(plainText.length() - 1);
+		esvWords.push_back(word);
+	}
+	return plainText;
+}
+
+
+// Name under which part number 'part' of a word attribute is looked
+// up: the bare base name when the word has exactly one part,
+// otherwise "<base>.<part>"
+static SWBuf wordAttrKey(const char *base, int parts, int part) {
+	if (parts == 1) return base;
+	return SWBuf().setFormatted("%s.%d", base, part);
+}
+
+
+// Reads the "Word" entry attributes of the KJV's current entry and
+// turns each of them into one <w> start tag plus the words it covers.
+//
+// kjv         - KJV module, already positioned on the wanted verse
+// wordTags    - [out] gets one <w> XMLTag appended per "Word" attribute
+// kjvWords    - [out] gets the letter-only (a-z / A-Z) words found in
+//               each attribute's "Text" appended
+// kjvWordTags - [out] for each word appended to kjvWords, the offset
+//               into wordTags of the tag it came from
+void pullKJVData(SWModule &kjv, vector<XMLTag>&wordTags, vector<SWBuf> &kjvWords, vector<int> &kjvWordTags) {
+	kjv.RenderText();	// be sure KJV has processed entry attributes
+	AttributeList &words = kjv.getEntryAttributes()["Word"];
+
+	for (AttributeList::iterator word = words.begin(); word != words.end(); ++word) {
+		AttributeValue &attrs = word->second;
+
+		// this is our new <w> XMLTag.
+		// attributes will be added below
+		XMLTag w("w");
+
+		const int parts = atoi(attrs["PartCount"]);
+		SWBuf lemma = "";
+		SWBuf morph = "";
+
+		for (int part = 1; part <= parts; ++part) {
+			// lemma of this part, prefixed with "<class>:" when a
+			// class is recorded for it
+			AttributeValue::iterator value = attrs.find(wordAttrKey("Lemma", parts, part));
+			if (value != attrs.end()) {
+				if (part > 1) lemma += " ";
+				AttributeValue::iterator cls = attrs.find(wordAttrKey("LemmaClass", parts, part));
+				if (cls != attrs.end()) {
+					lemma += cls->second + ":";
+				}
+				lemma += value->second;
+			}
+
+			// morphology of this part, built the same way
+			value = attrs.find(wordAttrKey("Morph", parts, part));
+			// silly.  sometimes morph counts don't equal lemma counts
+			if (value == attrs.end() && part == 1 && parts != 1) {
+				value = attrs.find("Morph");
+			}
+			if (value != attrs.end()) {
+				if (part > 1) morph += " ";
+				AttributeValue::iterator cls = attrs.find(wordAttrKey("MorphClass", parts, part));
+				// silly.  sometimes morph counts don't equal lemma counts
+				if (cls == attrs.end() && part == 1 && parts != 1) {
+					cls = attrs.find("MorphClass");
+				}
+				if (cls != attrs.end()) {
+					morph += cls->second + ":";
+				}
+				morph += value->second;
+			}
+			// TODO: add src tags and maybe other attributes
+		}
+
+		if (lemma.length()) w.setAttribute("lemma", lemma);
+		if (morph.length()) w.setAttribute("morph", morph);
+
+		// every word cut from this attribute's text points at the tag
+		// we are about to append to wordTags
+		const int tagIndex = wordTags.size();
+		SWBuf text = attrs["Text"];
+		SWBuf current = "";
+		const int textLen = text.length();
+		// one step past the end, so the end of the text closes a
+		// pending word the same way a non-letter does
+		for (int pos = 0; pos <= textLen; ++pos) {
+			const char c = (pos < textLen) ? text[pos] : '\0';
+			const bool isLetter = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
+			if (isLetter) {
+				current += c;
+			}
+			else if (current.length()) {
+				kjvWords.push_back(current);
+				kjvWordTags.push_back(tagIndex);
+				current = "";
+			}
+		}
+
+		wordTags.push_back(w);
+	}
+}
+
+
+// Wraps every ESV word which has been given a tag into that tag:
+// the <w ...> start tag goes in front of the word's first letter and
+// "</w>" behind its last letter.  Words whose esvWordTags entry is
+// not greater than -1 are left alone.
+//
+// markupBuf     - [in/out] markup receiving the tags
+// bibMap        - [in/out] Bible character offsets into markupBuf;
+//                 kept in step with every insertion
+// esvWordTags   - per ESV word, offset into wordTags
+// wordTags      - the start tags to choose from
+// esvWordStarts - per ESV word, Bible character index of its first letter
+// esvWordEnds   - per ESV word, Bible character index of its last letter
+void insertWordTags(SWBuf &markupBuf, BibMap &bibMap, const vector<int> &esvWordTags, const vector<XMLTag> &wordTags, const vector<int> &esvWordStarts, const vector<int> &esvWordEnds) {
+	// TODO: this method needs some work,
+	// like putting multiple consecutive words
+	// together in one tag
+	for (vector<int>::size_type word = 0; word < esvWordTags.size(); ++word) {
+		const int tagIndex = esvWordTags[word];
+		if (!(tagIndex > -1)) continue;
+
+		const SWBuf startTag = static_cast<const char *>(wordTags[tagIndex]);
+		insert(startTag, markupBuf, esvWordStarts[word], bibMap);
+		insert("</w>", markupBuf, esvWordEnds[word], bibMap, true);
+	}
+}
More information about the sword-cvs
mailing list