[sword-svn] r2358 - trunk/utilities

dmsmith at crosswire.org dmsmith at crosswire.org
Tue Apr 28 09:06:08 MST 2009


Author: dmsmith
Date: 2009-04-28 09:06:08 -0700 (Tue, 28 Apr 2009)
New Revision: 2358

Modified:
   trunk/utilities/osis2mod.cpp
Log:
osis2mod: changed preverse title to milestoned preverse div. all tags included in module.

Modified: trunk/utilities/osis2mod.cpp
===================================================================
--- trunk/utilities/osis2mod.cpp	2009-04-27 19:03:47 UTC (rev 2357)
+++ trunk/utilities/osis2mod.cpp	2009-04-28 16:06:08 UTC (rev 2358)
@@ -44,6 +44,7 @@
 #include <latin1utf8.h>
 #endif
 
+// Debug for everything else
 //#define DEBUG
 
 // Debug for simple transformation stack
@@ -61,12 +62,12 @@
 // Debug for titles
 //#define DEBUG_TITLE
 
+// Debug for interverse material
+//#define DEBUG_INTERVERSE
+
 // Debug for re-v11n
 //#define DEBUG_REV11N
 
-//Include all tags starting with the first div in the module
-//#define INCLUDE_TAGS
-
 #ifndef NO_SWORD_NAMESPACE
 using namespace sword;
 #endif
@@ -206,7 +207,7 @@
 		if (utf8State > 0) {
 			SWBuf before = text;
 			normalizer.processText(text, (SWKey *)2);  // note the hack of 2 to mimic a real key. TODO: remove all hacks
-			if (before != activeVerseText) {
+			if (before != text) {
 				normalized++;
 			}
 		}
@@ -574,12 +575,9 @@
 	// Flags indicating whether we are processing the content of a verse
 	static bool               inVerse         = false;
 
-	// Used to remember titles that need to be handle specially
-	static SWBuf              header          = "";
-	static SWBuf              lastTitle       = "";
-	static int                titleOffset     = -1;
-	static bool               inTitle         = false;
-	static int                titleDepth      = 0;
+	// Flag indicating whether we are processing content that is to be prepended to a verse
+	static bool               inPreVerse      = false;
+	static int                genID           = 1;
 
 	// Flag indicating whether we are in "Words of Christ"
 	static bool               inWOC           = false;
@@ -609,52 +607,7 @@
 	bool                      isEndTag        = token.isEndTag() || token.getAttribute("eID");
 	const char               *typeAttr        = token.getAttribute("type");
 
-	//Titles are treated specially.
-	// If the title has an attribute type of "main" or "chapter"
-	// it belongs to its <div> or <chapter> and is treated as part of its heading
-	// Otherwise if it a title in a chapter before the first the first verse it
-	// is put into the verse as a preverse title.
-	if (!inVerse) {
-		if (!token.isEmpty() &&
-		    !isEndTag &&
-		    titleDepth == 0 &&
-		    (!strcmp(tokenName, "title")) &&
-		    (!typeAttr || (strcmp(typeAttr, "main") && strcmp(typeAttr, "chapter")))) {
-			titleOffset = text.length(); //start of the title tag
-			lastTitle = "";
-			inTitle = true;
-			tagStack.push(token);
-#ifdef DEBUG_STACK
-			cout << currentOsisID << ": push (" << tagStack.size() << ") " << token.getName() << endl;
-#endif
-			titleDepth = tagStack.size();
-			return false;
-		}
-
-		// Check titleDepth since titles can be nested. Don't want to quit too early.
-		else if (inTitle && isEndTag && tagDepth == titleDepth && (!strcmp(tokenName, "title"))) {
-			lastTitle.append(text.c_str() + titleOffset); //<title ...> up to the end </title>
-			lastTitle.append(token); //</title>
-
-#ifdef DEBUG_TITLE
-			cout << currentOsisID << ":" << endl;
-			cout << "\tlastTitle:      " << lastTitle.c_str() << endl;
-			cout << "\ttext-lastTitle: " << text.c_str()+titleOffset << endl;
-			cout << "\ttext:	   " << text.c_str() << endl;
-#endif
-			inTitle = false;
-			titleDepth = 0;
-#ifdef DEBUG_STACK
-			cout << currentOsisID << ": pop(" << tagStack.size() << ") " << tagStack.top().getName() << endl;
-#endif
-			tagStack.pop();
-			return false; // don't add </title> to the text itself
-		}
-	}
-
-
-//-- START TAG -------------------------------------------------------------------------
-
+	// process start tags
 	if (!isEndTag) {
 
 		// Remember non-empty start tags
@@ -666,24 +619,32 @@
 		}
 
 		// throw away everything up to the first div
-		if (!firstDiv && !strcmp(tokenName, "div")) {
-			firstDiv = true;
+		if (!firstDiv) {
+			if (!strcmp(tokenName, "div")) {
 #ifdef DEBUG
-			cout << "Found first div and pitching prior material: " << text << endl;
+				cout << "Found first div and pitching prior material: " << text << endl;
 #endif
-			text     = "";
+				// TODO: Save off the content to use it to suggest the module's conf.
+				firstDiv = true;
+				text     = "";
+			}
+			else {
+				// Collect the content so it can be used to suggest the module's conf.
+				return false;
+			}
 		}
 
-		//-- WITH OSIS ID      -------------------------------------------------------------------------
-		//--   OR ANNOTATE REF -------------------------------------------------------------------------
+		//-- WITH osisID OR annotateRef -------------------------------------------------------------------------
+		// Handle Book, Chapter, and Verse (or commentary equivalent)
 		if (token.getAttribute("osisID") || token.getAttribute("annotateRef")) {
 
 			// BOOK START, <div type="book" ...>
 			if ((!strcmp(tokenName, "div")) && (typeAttr && !strcmp(typeAttr, "book"))) {
-				inVerse = false;
 				if (inBookHeader || inChapterHeader) {	// this one should never happen, but just in case
 #ifdef DEBUG_TITLE
-					cout << currentOsisID << ": HEADING ";
+					cout << currentOsisID << ": OOPS HEADING " << endl;
+					cout << "inChapterHeader = " << inChapterHeader << endl;
+					cout << "inBookHeader = " << inBookHeader << endl;
 #endif
 					currentVerse.Testament(0);
 					currentVerse.Book(0);
@@ -691,40 +652,32 @@
 					currentVerse.Verse(0);
 					writeEntry(text);
 				}
-				// Initializing a temporary and copying that because there were problems with setting the text directly
-				VerseKey t;
-				t.setVersificationSystem(currentVerse.getVersificationSystem());
-				t.AutoNormalize(0);
-				t.Headings(1);
-				t = token.getAttribute("osisID");
-				currentVerse = t;
+				currentVerse = token.getAttribute("osisID");
 				currentVerse.Chapter(0);
 				currentVerse.Verse(0);
 				strcpy(currentOsisID, currentVerse.getOSISRef());
-				inBookHeader = true;
+
+				inVerse         = false;
+				inPreVerse      = false;
+				inBookHeader    = true;
 				inChapterHeader = false;
-				lastTitle = "";
-				bookDepth = tagStack.size();
-				chapterDepth = 0;
-				verseDepth = 0;
 
+				bookDepth       = tagStack.size();
+				chapterDepth    = 0;
+				verseDepth      = 0;
+
 				inCanonicalOSISBook = isOSISAbbrev(token.getAttribute("osisID"));
 #ifdef DEBUG
 				cout << "Current book is " << currentVerse << (!inCanonicalOSISBook ? "not in versification, ignoring" : "") << endl;
 #endif
 
-#ifndef INCLUDE_TAGS
-				return true;
-#else
 				return false;
-#endif
 			}
 
 			// CHAPTER START, <div type="chapter" ...> or <chapter ...>
 			if (((!strcmp(tokenName, "div")) && (typeAttr && !strcmp(typeAttr, "chapter"))) ||
 			     (!strcmp(tokenName, "chapter"))
 			   ) {
-				inVerse = false;
 				if (inBookHeader) {
 #ifdef DEBUG_TITLE
 					cout << currentOsisID << ": BOOK HEADING "<< text.c_str() << endl;
@@ -732,31 +685,22 @@
 					writeEntry(text);
 				}
 
-				// I don't know why, but I cannot do the following,
-				// as it does not change the content of VerseKey!
-				// currentVerse = token.getAttribute("osisID");
-				VerseKey t;
-				t.setVersificationSystem(currentVerse.getVersificationSystem());
-				t.AutoNormalize(0);
-				t.Headings(1);
-				t = token.getAttribute("osisID");
-				currentVerse = t;
+				currentVerse = token.getAttribute("osisID");
 				currentVerse.Verse(0);
 #ifdef DEBUG
 				cout << "Current chapter is " << currentVerse << " (" << token.getAttribute("osisID") << ")" << endl;
 #endif
 				strcpy(currentOsisID, currentVerse.getOSISRef());
-				inBookHeader = false;
+
+				inVerse         = false;
+				inPreVerse      = false;
+				inBookHeader    = false;
 				inChapterHeader = true;
-				lastTitle = "";
-				chapterDepth = tagStack.size();
-				verseDepth = 0;
 
-#ifndef INCLUDE_TAGS
-				return true;
-#else
+				chapterDepth    = tagStack.size();
+				verseDepth      = 0;
+
 				return false;
-#endif
 			}
 
 			// VERSE, <verse ...> OR COMMENTARY START, <div annotateType="xxx" ...>
@@ -765,36 +709,10 @@
 #ifdef DEBUG
 				cout << "Entering verse" << endl;
 #endif
-				inVerse = true;
 				if (inChapterHeader) {
 					SWBuf heading = text;
+					text = "";
 
-					//make sure we don't insert the preverse title which belongs to the first verse of this chapter!
-					// Did we have a preverse title?
-					if (lastTitle.length()) {
-						//Was the preVerse title in the header (error if not)?
-						const char* header = heading.c_str();
-						const char* preVerse = strstr(header, lastTitle);
-						if (preVerse) {
-							if (preVerse == header) {
-								heading = ""; // do nothing
-							}
-							else {
-								// remove everything before the title from the beginning.
-								text = preVerse;
-								// Remove text from the end of the header.
-								heading.setSize(preVerse - header);
-							}
-						}
-						else {
-							cout << currentOsisID << ": Warning: Bug in code. Could not find title." << endl;
-						}
-					}
-					else {
-						text = "";
-					}
-
-
 					if (heading.length()) {
 #ifdef DEBUG_TITLE
 						cout << currentOsisID << ": CHAPTER HEADING "<< heading.c_str() << endl;
@@ -805,11 +723,21 @@
 					inChapterHeader = false;
 				}
 
+				// Did we have pre-verse material that needs to be marked?
+				if (inPreVerse) {
+					char genBuf[200];
+					sprintf(genBuf, "<div type=\"x-milestone\" subType=\"x-preverse\" eID=\"pv%d\"/>", genID++);
+					text.append(genBuf);
+				}
+
+				// Get osisID for verse or annotateRef for commentary
 				SWBuf keyVal = token.getAttribute(strcmp(tokenName, "verse") ? "annotateRef" : "osisID");
-				// The osisID or annotateRef can be more than a single verse
-				// The first or only one is the currentVerse
+
+				// Massage the key into a form that ParseVerseList can accept
 				prepareSWVerseKey(keyVal);
 
+				// The osisID or annotateRef can be more than a single verse
+				// The first or only one is the currentVerse
 				// Use the last verse seen (i.e. the currentVerse) as the basis for recovering from bad parsing.
 				// This should never happen if the references are valid OSIS references
 				ListKey verseKeys = currentVerse.ParseVerseList(keyVal, currentVerse, true);
@@ -833,22 +761,27 @@
 
 				strcpy(currentOsisID, currentVerse.getOSISRef());
 #ifdef DEBUG
-				cout << "Current verse is " << currentVerse << endl;
+				cout << "New current verse is " << currentVerse << endl;
 				cout << "osisID/annotateRef is adjusted to: " << keyVal << endl;
 #endif
 
-				verseDepth = tagStack.size();
+				inVerse         = true;
+				inPreVerse      = false;
+				inBookHeader    = false;
+				inChapterHeader = false;
+				verseDepth      = tagStack.size();
 
-#ifdef INCLUDE_TAGS
 				text.append(token);
-#endif
+
 				if (inWOC) {
 					text.append(wocTag);
 				}
 				return true;
 			}
-		}
+		} // done with Handle Book, Chapter, and Verse (or commentary equivalent)
 
+		// Now consider everything else.
+
 		// Handle WOC quotes.
 		// Note this requires transformBSP to make them into milestones
 		// Otherwise have to do it here
@@ -868,38 +801,60 @@
 			return false;
 		}
 
-		// Handle stuff between the verses
-		// Whitespace producing empty tokens are appended to prior entry
-		// Also the quote
-		// This is a hack to get ESV to work
-		// Don't write if there is not a valid osisID yet.
-		if (!inTitle && !inVerse && token.isEmpty() && strcmp(currentOsisID, "N/A")) {
-			if ((!strcmp(tokenName, "div") && (!typeAttr || strcmp(typeAttr, "paragraph"))) ||
-			    !strcmp(tokenName, "q")   ||
-			    !strcmp(tokenName, "l")   ||
-			    (!strcmp(tokenName, "lb") 
-				 	// If these were paragraphs, don't wrest them from their
-					// rightful place in the preverse text
-				 	&& strcmp("x-begin-paragraph", token.getAttribute("type"))
-					&& strcmp("x-end-paragraph", token.getAttribute("type"))
-				) ||
-			    !strcmp(tokenName, "lg")
-			   ) {
-#ifdef DEBUG
-				cout << currentOsisID << ": appending interverse start token " << token << ":" << text.c_str() << endl;
-#endif
-				SWBuf tmp = token.toString();
-				writeEntry(tmp);
-				return true;
+		// Have we found the start of pre-verse material?
+		// Pre-verse material follows the following rules
+		// 1) Between the opening of a book and the first chapter, all the material is handled as an introduction to the book.
+		// 2) Between the opening of a chapter and the first verse, the material is split between the introduction of the chapter
+		//    and the first verse of the chapter.
+		//    A <div> with a type other than section will be taken as a chapter introduction.
+		//    A <title> of type acrostic, psalm or no type, will be taken as a title for the verse.
+		//    A <title> of type main or chapter will be seen as a chapter title.
+		// 3) Between verses, the material is split between the prior verse and the next verse.
+		//    Basically, while end and empty tags are found, they belong to the prior verse.
+		//    Once a begin tag is found, it belongs to the next verse.
+		// If the title has an attribute type of "main" or "chapter"
+		// it belongs to its <div> or <chapter> and is treated as part of its heading
+		// Otherwise, if it is a title in a chapter before the first verse, it
+		// is put into the verse as a preverse title.
+
+		if (!inPreVerse && !inBookHeader) {
+			if (inChapterHeader) {
+				// Determine when we are no longer in a chapter heading, but in pre-verse material:
+				// If we see one of the following:
+				// 	a section div
+				// 	a title that is not main or chapter
+				if ((!strcmp(tokenName, "div") && (typeAttr && !strcmp(typeAttr, "section"))) ||
+				    (!strcmp(tokenName, "title") && (!typeAttr || (strcmp(typeAttr, "main") && strcmp(typeAttr, "chapter"))))
+				   ) {
+					// Since we have found the boundary, we need to write out the chapter heading
+					writeEntry(text);
+					// And we are no longer in the chapter heading
+					inChapterHeader = false;
+					// But rather, we are now in pre-verse material
+					inPreVerse      = true;
+				}
 			}
-#ifdef DEBUG
+			else if (!inVerse) {
+				inPreVerse = true;
+			}
+
+			if (inPreVerse) {
+				char genBuf[200];
+				sprintf(genBuf, "<div type=\"x-milestone\" subType=\"x-preverse\" sID=\"pv%d\"/>", genID);
+				text.append(genBuf);
+			}
+		}
+
+#ifdef DEBUG_INTERVERSE
+		if (!inVerse && !inBookHeader && !inChapterHeader) {
 			cout << currentOsisID << ": interverse start token " << token << ":" << text.c_str() << endl;
-#endif
 		}
-	}
+#endif
 
-//-- EMPTY and END TAG ---------------------------------------------------------------------------------------------
+		return false;
+	} // Done with processing start and empty tags
 
+	// Process end tags
 	else {
 
 		if (tagStack.empty()) {
@@ -907,6 +862,7 @@
 			exit(1);
 		}
 
+		// Note: empty end tags have the eID attribute
 		if (!token.isEmpty()) {
 			XMLTag topToken = tagStack.top();
 			tagDepth = tagStack.size();
@@ -922,64 +878,32 @@
 			}
 		}
 
+		// We haven't seen the first div so there is nothing to do.
+		if (!firstDiv) {
+			// Collect the content so it can be used to suggest the module's conf.
+			return false;
+		}
+
 		// VERSE and COMMENTARY END
 		if (!strcmp(tokenName, "verse") || (inVerse && !strcmp(tokenName, "div"))) {
-			inVerse = false;
 
 			if (tagDepth != verseDepth) {
 				cout << "Warning verse " << currentOsisID << " is not well formed:(" << verseDepth << "," << tagDepth << ")" << endl;
 			}
 
-			if (lastTitle.length()) {
-				const char* end = strchr(lastTitle, '>');
-#ifdef DEBUG_TITLE
-				cout << currentOsisID << ":" << endl;
-				cout << "\t" << lastTitle << endl;
-	 			cout << "\tlength=" << int(end+1 - lastTitle.c_str()) << ", tag:" << lastTitle.c_str() << endl;
-#endif
-
-				SWBuf titleTagText;
-				titleTagText.append(lastTitle.c_str(), end+1 - lastTitle.c_str());
-#ifdef DEBUG_TITLE
-				cout << currentOsisID << ": tagText: " << titleTagText.c_str() << endl;;
-#endif
-
-				XMLTag titleTag(titleTagText);
-				titleTag.setAttribute("type", "section");
-				titleTag.setAttribute("subType", "x-preverse");
-
-				//we insert the title into the text again - make sure to remove the old title text
-				const char* pos = strstr(text, lastTitle);
-				if (pos) {
-					SWBuf temp;
-					temp.append(text, pos-text.c_str());
-					temp.append(pos+lastTitle.length());
-					text = temp;
-				}
-
-				//if a title was already inserted at the beginning insert this one after that first title
-				int titlePos = 0;
-				if (!strncmp(text.c_str(),"<title ",7)) {
-					const char* tmp = strstr(text.c_str(), "</title>");
-					if (tmp) {
-						titlePos = (tmp-text.c_str()) + 8;
-					}
-				}
-				text.insert(titlePos, end+1);
-				text.insert(titlePos, titleTag);
-			}
-
 			// If we are in WOC then we need to terminate the <q who="Jesus" marker=""> that was added earlier in the verse.
 			if (inWOC) {
 				text.append("</q>");
 			}
-#ifdef INCLUDE_TAGS
+
 			text.append(token);
-#endif
+
 			writeEntry(text);
 
-			lastTitle = "";
-			verseDepth = 0;
+			inVerse     = false;
+			inPreVerse  = false;
+			verseDepth  = 0;
+
 			return true;
 		}
 		
@@ -1020,60 +944,57 @@
 			return false;
 		}
 
-		if (!inTitle && !inVerse && !inBookHeader && !inChapterHeader) {
+		// Look for the end of document, book and chapter
+		// Also for material that goes with last entry
+		if (!inVerse && !inBookHeader && !inChapterHeader) {
 			// Is this the end of a chapter.
 			if (tagDepth == chapterDepth && (!strcmp(tokenName, "div") || !strcmp(tokenName, "chapter"))) {
-#ifdef INCLUDE_TAGS
 				text.append(token);
 				writeEntry(text);
-#endif
 				chapterDepth = 0;
-				verseDepth = 0;
+				verseDepth   = 0;
 				return true;
 			}
 
 			// Is it the end of a book
 			if (tagDepth == bookDepth && (!strcmp(tokenName, "div"))) {
-#ifdef INCLUDE_TAGS
 				text.append(token);
 				writeEntry(text);
-#endif
-				bookDepth = 0;
+				bookDepth    = 0;
 				chapterDepth = 0;
-				verseDepth = 0;
+				verseDepth   = 0;
 				return true;
 			}
 
-			// OTHER MISC END TAGS WHEN !INVERSE
-			// Test that is between verses, or after the last is appended to the preceeding verse.
-			if (!strcmp(tokenName, "div") ||
-			    !strcmp(tokenName, "chapter") ||
-			    !strcmp(tokenName, "q")   ||
-			    !strcmp(tokenName, "l")   ||
-			    !strcmp(tokenName, "lb")  ||
-			    !strcmp(tokenName, "lg")
-			   ) {
+			// Do not include the end of an osis document
+			if (!strcmp(tokenName, "osisText") || !strcmp(tokenName, "osis")) {
+				bookDepth    = 0;
+				chapterDepth = 0;
+				verseDepth   = 0;
+				text         = "";
+				return true;
+			}
+
+			// When we are not inPreVerse, the interverse tags get appended to the preceding verse.
+			if (!inPreVerse) {
 				text.append(token);
 				writeEntry(text);
-#ifdef DEBUG
+#ifdef DEBUG_INTERVERSE
 				cout << currentOsisID << ": appending interverse end tag: " << tokenName << "(" << tagDepth << "," << chapterDepth << "," << bookDepth << ")" << endl;
 #endif
 				return true;
 			}
 
-			// Is it the end of an osis document
-			if (!strcmp(tokenName, "osisText") || !strcmp(tokenName, "osis")) {
-				bookDepth = 0;
-				chapterDepth = 0;
-				verseDepth = 0;
-				text = "";
-				return true;
-			}
-#ifdef DEBUG
+#ifdef DEBUG_INTERVERSE
 			cout << currentOsisID << ": interverse end tag: " << tokenName << "(" << tagDepth << "," << chapterDepth << "," << bookDepth << ")" << endl;
 #endif
+			return false;
+
 		}
-	}
+
+		return false;
+	} // done with Processing end tags
+
 	return false;
 }
 




More information about the sword-cvs mailing list