[sword-svn] r3912 - trunk/utilities

dmsmith at crosswire.org dmsmith at crosswire.org
Wed Aug 6 15:57:57 EDT 2025


Author: dmsmith
Date: 2025-08-06 15:57:56 -0400 (Wed, 06 Aug 2025)
New Revision: 3912

Modified:
   trunk/utilities/osis2mod.cpp
Log:
MODTOOLS-17 To osis2mod, added conversion of hex and decimal numeric entities to UTF-8, with special handling of <, >, &, ', and ".

Also:
  * Fixed a bug in hex numeric entities which defined &xHHHH; rather than &#xHHHH;.
  * Added entity sanity check of maximum length of 32.
  * Refactored entity handling into handleEntities and comment handling into handleComments.
  * Changed t_entitytype and t_commentstate into class enums EntityType and CommentState.
  * Added -d 1024 for entity and comment parsing.

 Note: The coding allows for 0 padding of the numeric entities.


Modified: trunk/utilities/osis2mod.cpp
===================================================================
--- trunk/utilities/osis2mod.cpp	2025-07-29 21:19:48 UTC (rev 3911)
+++ trunk/utilities/osis2mod.cpp	2025-08-06 19:57:56 UTC (rev 3912)
@@ -75,17 +75,18 @@
 
 //using namespace std;
 
-int       debug            =   0; // mask of debug flags
-const int DEBUG_WRITE      =   1; // writing to module
-const int DEBUG_VERSE      =   2; // verse start and end
-const int DEBUG_QUOTE      =   4; // quotes, especially Words of Christ (WOC)
-const int DEBUG_TITLE      =   8; // titles
-const int DEBUG_INTERVERSE =  16; // inter-verse material
-const int DEBUG_XFORM      =  32; // transformations
-const int DEBUG_REV11N     =  64; // versification
-const int DEBUG_REF        = 128; // parsing of osisID and osisRef
-const int DEBUG_STACK      = 256; // cleanup of references
-const int DEBUG_OTHER      = 512; // ins and outs of books, chapters and verses
+int       debug            =    0; // mask of debug flags
+const int DEBUG_WRITE      =    1; // writing to module
+const int DEBUG_VERSE      =    2; // verse start and end
+const int DEBUG_QUOTE      =    4; // quotes, especially Words of Christ (WOC)
+const int DEBUG_TITLE      =    8; // titles
+const int DEBUG_INTERVERSE =   16; // inter-verse material
+const int DEBUG_XFORM      =   32; // transformations
+const int DEBUG_REV11N     =   64; // versification
+const int DEBUG_REF        =  128; // parsing of osisID and osisRef
+const int DEBUG_STACK      =  256; // cleanup of references
+const int DEBUG_OTHER      =  512; // ins and outs of books, chapters and verses
+const int DEBUG_PARSE      = 1024; // parsing of numeric and character entities.
 
 // Exit codes
 const int EXIT_BAD_ARG     =   1; // Bad parameter given for program
@@ -1539,6 +1540,7 @@
 		fprintf(stderr, "\t\t\t\t\t128 - parsing of osisID and osisRef\n");
 		fprintf(stderr, "\t\t\t\t\t256 - internal stack\n");
 		fprintf(stderr, "\t\t\t\t\t512 - miscellaneous\n");
+		fprintf(stderr, "\t\t\t\t\t1024 - parsing of numeric and character entities and comments.\n");
 		fprintf(stderr, "\t\t\t\t This argument can be used more than once. (Or\n");
 		fprintf(stderr, "\t\t\t\t the flags may be added together.)\n");
 	}
@@ -1550,25 +1552,502 @@
 	exit(EXIT_BAD_ARG);
 }
 
+// Maximum length for an entity (including & and ;), sufficient for valid XML/HTML entities
+constexpr size_t MAX_ENTITY_LENGTH = 32;
+
+// Enum for entity types
+enum class EntityType { START, NUM_HASH, NUM_DEC, NUM_HEX, CHAR, ERR };
+
+enum class CommentState {
+	START,         // Not in a comment or have seen '<'
+	SLAM,          // Seen '<!'
+	DASH1,         // Seen '<!-'
+	COMMENT,       // Having seen '<--' inside comment content
+	END_DASH1,     // Seen '-' in comment
+	END_DASH2      // Seen '--' in comment
+};
+
+/**
+ * @brief Handles XML comment parsing for a single character at a time.
+ * @param c The current character to process.
+ * @param currentOsisID The current OSIS ID for error reporting.
+ * @param incomment Whether currently inside a comment.
+ * @param commentstate The current comment parsing state.
+ * @param token The token buffer to append characters during comment start.
+ * @return true if the character is consumed (continue loop), false otherwise.
+ */
+bool handleComment(unsigned char c, const char* currentOsisID, bool& intoken, bool& incomment, CommentState& commentstate, SWBuf& token) {
+	if (!incomment) {
+		switch (commentstate) {
+		case CommentState::START:
+			if (c == '!') {
+				if (debug & DEBUG_PARSE) {
+					std::cout << identifyMsg("DEBUG", "COMMENTS") << "Found <!" << std::endl;
+				}
+				commentstate = CommentState::SLAM;
+				token.append((char)c);
+				return true;
+			}
+			return false;
+
+		case CommentState::SLAM:
+			if (c == '-') {
+				if (debug & DEBUG_PARSE) {
+					std::cout << identifyMsg("DEBUG", "COMMENTS") << "Found <!-" << std::endl;
+				}
+				commentstate = CommentState::DASH1;
+				token.append((char)c);
+				return true;
+			}
+			commentstate = CommentState::START;
+			return false;
+
+		case CommentState::DASH1:
+			if (c == '-') { // having seen the second dash we are in the comment
+				if (debug & DEBUG_PARSE) {
+					std::cout << identifyMsg("DEBUG", "COMMENTS") << "Found <!-- Now in comment." << std::endl;
+				}
+				incomment = true;
+				commentstate = CommentState::COMMENT;
+				token.append((char)c);
+				if (debug & DEBUG_PARSE) {
+					std::cout << identifyMsg("DEBUG", "COMMENTS") << "In comment" << std::endl;
+				}
+				return true;
+			}
+			commentstate = CommentState::START;
+			return false;
+
+		default:
+			std::cout << identifyMsg("FATAL", "COMMENTS") << "Unknown commentstate on comment start: " << (int) commentstate << std::endl;
+			exit(EXIT_BAD_NESTING);
+		}
+	}
+	else {
+		switch (commentstate) {
+		case CommentState::COMMENT:
+			if (c == '-') {
+				if (debug & DEBUG_PARSE) {
+					std::cout << identifyMsg("DEBUG", "COMMENTS") << "Found - in comment." << std::endl;
+				}
+				commentstate = CommentState::END_DASH1;
+				return true;
+			}
+			// Ignore the character
+			return true;
+
+		case CommentState::END_DASH1:
+			if (c == '-') {
+				if (debug & DEBUG_PARSE) {
+					std::cout << identifyMsg("DEBUG", "COMMENTS") << "Found -- in comment." << std::endl;
+				}
+				commentstate = CommentState::END_DASH2;
+				return true;
+			}
+			// Ignore the character
+			commentstate = CommentState::COMMENT;
+			return true;
+
+		case CommentState::END_DASH2:
+			if (c == '>') { // having seen the --> we are done and return to the original state
+				if (debug & DEBUG_PARSE) {
+					std::cout << identifyMsg("DEBUG", "COMMENTS") << "Found --> comment ended." << std::endl;
+				}
+				intoken = false;
+				incomment = false;
+				commentstate = CommentState::START;
+				if (debug & DEBUG_PARSE) {
+					std::cout << identifyMsg("DEBUG", "COMMENTS") << "Out of comment" << std::endl;
+				}
+				return true;
+			}
+			// Ignore the character
+			commentstate = CommentState::COMMENT;
+			return true;
+
+		default:
+			std::cout << identifyMsg("FATAL", "COMMENTS") << "Unknown commentstate on comment end: " << (int) commentstate << std::endl;
+			exit(EXIT_BAD_NESTING);
+		}
+	}
+	return false; // Should never reach here
+}
+
+/**
+ * \brief Handles ' and " entities, converting them to plain characters or keeping them based on attribute context.
+ *
+ * This function processes `'` and `"` entities, replacing them with `'` or `"` respectively when outside attributes
+ * or when used in attributes with non-matching quote characters (e.g., `'` in a double-quoted attribute). It logs
+ * appropriate warning messages and updates the entityToken with the converted value.
+ *
+ * \param entityToken [in/out] The entity string (e.g., "'" or """) to process; modified to the converted value (e.g., "'" or "'").
+ * \param currentOsisID [in] The OSIS ID for context in warning messages.
+ * \param msgPrefix [in] Pre-formatted message prefix for logging (includes level, type, and OSIS ID).
+ * \param inattribute [in] True if the entity is within an attribute value, false otherwise.
+ * \param attrQuoteChar [in] The quote character (' or ") used in the attribute, or '\0' if not applicable.
+ * \param debug [in] Debug flags from osis2mod; logs if (debug & DEBUG_PARSE) is set.
+ *
+ * \note Logs warnings to std::cout if (debug & DEBUG_PARSE).
+ * \note Thread-safe as it does not modify shared state beyond std::cout.
+ */
+void handleQuoteEntity(SWBuf& entityToken, const char* currentOsisID, SWBuf& msgPrefix, bool inattribute, char attrQuoteChar) {
+	if (entityToken == "'") {
+		if (!inattribute) {
+			if (debug & DEBUG_PARSE) {
+				std::cout << msgPrefix
+					  << "' unnecessary outside attributes. Replacing with '."
+					  << std::endl;
+			}
+			entityToken = "'";
+		} else if (attrQuoteChar == '"') {
+			if (debug & DEBUG_PARSE) {
+				std::cout << msgPrefix
+					  << "' unnecessary in double-quoted attributes. Replacing with '."
+					  << std::endl;
+			}
+			entityToken = "'";
+		} else if (attrQuoteChar == '\'') {
+			if (debug & DEBUG_PARSE) {
+				std::cout << msgPrefix
+					  << "' only needed in single-quoted attributes. Consider double quotes." 
+					  << std::endl;
+			}
+		} else {
+			if (debug & DEBUG_PARSE) {
+				std::cout << identifyMsg("ERROR", "PARSE", currentOsisID)
+					  << "Invalid attrQuoteChar: "
+				 	  << attrQuoteChar
+					  << std::endl;
+			}
+		}
+	} else if (entityToken == """) {
+		if (!inattribute) {
+			if (debug & DEBUG_PARSE) {
+				std::cout << msgPrefix
+					  << "" unnecessary outside attributes. Replacing with \"." 
+					  << std::endl;
+			}
+			entityToken = "\"";
+		} else if (attrQuoteChar == '\'') {
+			if (debug & DEBUG_PARSE) {
+				std::cout << msgPrefix
+					  << "" unnecessary in single-quoted attributes. Replacing with \"." 
+					  << std::endl;
+			}
+			entityToken = "\"";
+		} else if (attrQuoteChar == '"') {
+			if (debug & DEBUG_PARSE) {
+				std::cout << msgPrefix
+			  		  << "" only needed in double-quoted attributes. Consider single quotes."
+					  << std::endl;
+			}
+		} else {
+			if (debug & DEBUG_PARSE) {
+				std::cout << identifyMsg("ERROR", "PARSE", currentOsisID)
+					  << "Invalid attrQuoteChar: " 
+					  << attrQuoteChar 
+					  << std::endl;
+			}
+		}
+	}
+}
+
+/**
+ * \brief Converts a validated Unicode code point to its UTF-8 representation.
+ *
+ * This function takes a pre-parsed Unicode code point (1 to 0x10FFFF) and converts it to its UTF-8 encoded form,
+ * storing the result in entityToken. It handles single-byte, two-byte, three-byte, and four-byte UTF-8 sequences
+ * based on the code point value. The original entity string is provided for diagnostic logging.
+ *
+ * \param entityToken [in/out] The original entity string (e.g., "A") for logging; modified to contain the UTF-8 encoded character(s).
+ * \param codepoint [in] The Unicode code point (1 to 0x10FFFF) to convert to UTF-8.
+ * \param msgPrefix [in] Pre-formatted message prefix for logging (includes level, type, and OSIS ID).
+ *
+ * \return Always returns true, as the codepoint is assumed to be valid.
+ *
+ * \note The codepoint must be pre-validated (1 to 0x10FFFF) by the caller to avoid undefined behavior.
+ * \note Logs conversion details to std::cout if (debug & DEBUG_PARSE).
+ * \note Thread-safe as it does not modify shared state beyond std::cout.
+ */
+void convertNumericEntityToUTF8(SWBuf& entityToken, long codepoint, SWBuf& msgPrefix) {
+	// Save original entity for logging
+	SWBuf originalEntity = entityToken;
+
+	// Convert to UTF-8
+	if (codepoint <= 0x7F) {
+		entityToken.setSize(1);
+		entityToken[0] = static_cast<char>(codepoint);
+	} else if (codepoint <= 0x7FF) {
+		entityToken.setSize(2);
+		entityToken[0] = 0xC0 | (codepoint >> 6);
+		entityToken[1] = 0x80 | (codepoint & 0x3F);
+	} else if (codepoint <= 0xFFFF) {
+		entityToken.setSize(3);
+		entityToken[0] = 0xE0 | (codepoint >> 12);
+		entityToken[1] = 0x80 | ((codepoint >> 6) & 0x3F);
+		entityToken[2] = 0x80 | (codepoint & 0x3F);
+	} else {
+		entityToken.setSize(4);
+		entityToken[0] = 0xF0 | (codepoint >> 18);
+		entityToken[1] = 0x80 | ((codepoint >> 12) & 0x3F);
+		entityToken[2] = 0x80 | ((codepoint >> 6) & 0x3F);
+		entityToken[3] = 0x80 | (codepoint & 0x3F);
+	}
+	if (debug & DEBUG_PARSE) {
+		std::cout << msgPrefix
+			  << "Converted numeric entity "
+			  << originalEntity
+			  << " to UTF-8 character "
+			  << entityToken
+			  << std::endl;
+	}
+}
+
+/**
+ * \brief Parses and processes XML/HTML entities in a character stream using a finite state automaton.
+ *
+ * This function processes a single character in the context of an XML/HTML entity, maintaining a finite state automaton
+ * to track entity parsing states (START, NUM_HASH, NUM_DEC, NUM_HEX, CHAR, ERR). It handles named entities (e.g., &),
+ * numeric entities (e.g., A, &#x41;), and malformed entities. Special numeric entities (e.g., & to &) are
+ * converted to named entities, while others are converted to UTF-8. The function updates the entityToken and appends
+ * results to either the text or token buffer based on intoken. Malformed entities are replaced with & followed by
+ * the invalid sequence.
+ *
+ * \param curChar [in] The current character to process.
+ * \param inentity [in/out] True if currently parsing an entity (starts with '&'), false otherwise.
+ * \param inWhitespace [in/out] True if the parser is in a whitespace sequence, reset when an entity starts.
+ * \param entitytype [in/out] The current state of the entity parser (START, NUM_HASH, NUM_DEC, NUM_HEX, CHAR, ERR).
+ * \param entityToken [in/out] The current entity being built (e.g., "&" or "A"); modified with the converted value.
+ * \param token [in/out] Buffer for entity output if intoken is true (e.g., within a tag).
+ * \param text [in/out] Buffer for entity output if intoken is false (e.g., plain text).
+ * \param intoken [in] True if the entity is within a token (e.g., tag name or attribute), false for plain text.
+ * \param inattribute [in] True if the entity is within an attribute value, false otherwise.
+ * \param attrQuoteChar [in] The quote character (' or ") used in the attribute, or '\0' if not applicable.
+ * \param currentOsisID [in] The OSIS ID for context in warning messages.
+ *
+ * \return True if the character was consumed by the entity parser, false otherwise.
+ *
+ * \note Logs warnings and errors to std::cout if (debug & DEBUG_PARSE).
+ * \note Thread-safe as long as inentity, inWhitespace, entitytype, entityToken, token, and text are not shared across threads without synchronization.
+ * \note Uses SWBuf::operator<< for shifting entityToken in error cases.
+ * \note Throws std::runtime_error for invalid entitytype values.
+ */
+bool handleEntity(char curChar, bool& inentity, bool& inWhitespace, EntityType& entitytype,
+		SWBuf& entityToken, SWBuf& token, SWBuf& text, bool intoken,
+		bool inattribute, char attrQuoteChar, const char* currentOsisID) {
+	if (!inentity && curChar != '&') {
+		return false; // Fast-path for non-entity characters
+	}
+	if (!inentity && curChar == '&') {
+		inentity = true;
+		inWhitespace = false;
+		entitytype = EntityType::START;
+		entityToken = "&";
+		return true;
+	}
+	if (inentity) {
+		if (entityToken.length() >= MAX_ENTITY_LENGTH) {
+			inentity = false;
+			entitytype = EntityType::ERR;
+			if (debug & DEBUG_PARSE) {
+				auto msgPrefix = identifyMsg("WARNING", "PARSE", currentOsisID);
+				std::cout << msgPrefix
+					  << "Entity length exceeds maximum ("
+					  << MAX_ENTITY_LENGTH
+					  << " characters), treating as malformed: "
+					  << entityToken
+					  << std::endl;
+			}
+		} else if (curChar == ';') {
+			inentity = false;
+		}
+		if (entitytype != EntityType::ERR) {
+			entityToken.append(curChar);
+		}
+		if (inentity) {
+			switch (entitytype) {
+			case EntityType::START:
+				if (curChar == '#') {
+					entitytype = EntityType::NUM_HASH;
+				} else if (std::isalnum(curChar)) {
+					entitytype = EntityType::CHAR;
+				} else {
+					inentity = false;
+					entitytype = EntityType::ERR;
+				}
+				break;
+			case EntityType::NUM_HASH:
+				if (curChar == 'x' || curChar == 'X') {
+					entitytype = EntityType::NUM_HEX;
+				} else if (std::isdigit(curChar)) {
+					entitytype = EntityType::NUM_DEC;
+				} else {
+					inentity = false;
+					entitytype = EntityType::ERR;
+				}
+				break;
+			case EntityType::NUM_DEC:
+				if (!std::isdigit(curChar)) {
+					inentity = false;
+					entitytype = EntityType::ERR;
+				}
+				break;
+			case EntityType::NUM_HEX:
+				if (!std::isxdigit(curChar)) {
+					inentity = false;
+					entitytype = EntityType::ERR;
+				}
+				break;
+			case EntityType::CHAR:
+				if (!std::isalnum(curChar)) {
+					inentity = false;
+					entitytype = EntityType::ERR;
+				}
+				break;
+			default:
+				std::cout << identifyMsg("FATAL", "PARSE") << "Unknown EntityType: " << (int) entitytype << std::endl;
+				exit(EXIT_BAD_NESTING);
+			}
+			return true;
+		}
+		if (!inentity) {
+			auto msgPrefix = identifyMsg("WARNING", "PARSE", currentOsisID);
+			// Handle numeric entities before switch
+			if (entitytype == EntityType::NUM_DEC || entitytype == EntityType::NUM_HEX) {
+				const char* p = entityToken.c_str();
+				p += 2; // Skip &#
+				int base = 10;
+				if (*p == 'x' || *p == 'X') {
+					base = 16;
+					++p;
+				}
+				char* end = nullptr;
+				errno = 0;
+				long codepoint = strtol(p, &end, base);
+				bool isValid = end && *end == ';' && codepoint > 0 && codepoint <= 0x10FFFF && errno != ERANGE;
+				if (isValid) {
+					switch (codepoint) {
+					case 38: // & -> &
+						if (debug & DEBUG_PARSE) {
+							std::cout << msgPrefix
+								  << "Converted numeric entity "
+								  << entityToken
+								  << " to named entity &"
+								  << std::endl;
+						}
+						entityToken = "&";
+						entitytype = EntityType::CHAR;
+						break;
+					case 60: // < -> <
+						if (debug & DEBUG_PARSE) {
+							std::cout << msgPrefix
+								  << "Converted numeric entity "
+								  << entityToken
+								  << " to named entity <"
+								  << std::endl;
+						}
+						entityToken = "<";
+						entitytype = EntityType::CHAR;
+						break;
+					case 62: // > -> >
+						if (debug & DEBUG_PARSE) {
+							std::cout << msgPrefix
+								  << "Converted numeric entity "
+								  << entityToken
+								  << " to named entity >"
+								  << std::endl;
+						}
+						entityToken = ">";
+						entitytype = EntityType::CHAR;
+						break;
+					case 34: // " -> "
+						if (debug & DEBUG_PARSE) {
+							std::cout << msgPrefix
+								  << "Converted numeric entity "
+								  << entityToken
+								  << " to named entity ""
+								  << std::endl;
+						}
+						entityToken = """;
+						entitytype = EntityType::CHAR;
+						break;
+					case 39: // ' -> '
+						if (debug & DEBUG_PARSE) {
+							std::cout << msgPrefix
+								  << "Converted numeric entity "
+								  << entityToken
+								  << " to named entity '"
+								  << std::endl;
+						}
+						entityToken = "'";
+						entitytype = EntityType::CHAR;
+						break;
+					default:
+						// Non-special codepoints go to UTF-8 conversion
+						break;
+					}
+				} else {
+					if (debug & DEBUG_PARSE) {
+						std::cout << msgPrefix
+							  << "Invalid numeric entity, codepoint out of range or malformed: " 
+							  << entityToken
+							  << std::endl;
+					}
+					entitytype = EntityType::ERR;
+				}
+				// Handle non-special valid codepoints
+				if (entitytype == EntityType::NUM_DEC || entitytype == EntityType::NUM_HEX) {
+					convertNumericEntityToUTF8(entityToken, codepoint, msgPrefix);
+				}
+			}
+			switch (entitytype) {
+			case EntityType::ERR:
+				entityToken << 1;
+				if (debug & DEBUG_PARSE) {
+					std::cout << msgPrefix
+						  << "Malformed entity, replacing with &" 
+						  << entityToken
+						  << std::endl;
+				}
+				(intoken ? token : text).append("&").append(entityToken);
+				break;
+			case EntityType::NUM_HEX:
+			case EntityType::NUM_DEC:
+				(intoken ? token : text).append(entityToken);
+				break;
+			case EntityType::CHAR:
+				if (entityToken != "&" && entityToken != "<" && 
+				    entityToken != ">" && entityToken != """ && 
+				    entityToken != "'") {
+					if (debug & DEBUG_PARSE) {
+						std::cout << msgPrefix 
+							  << "XML only supports &, <, >, ", ', found " 
+							  << entityToken
+							  << std::endl;
+					}
+					(intoken ? token : text).append(entityToken);
+				} else if (entityToken == "'" || entityToken == """) {
+					handleQuoteEntity(entityToken, currentOsisID, msgPrefix, inattribute, attrQuoteChar);
+					(intoken ? token : text).append(entityToken);
+				} else {
+					(intoken ? token : text).append(entityToken);
+				}
+				break;
+			default:
+				(intoken ? token : text).append(entityToken);
+				break;
+			}
+			if (curChar == ';') {
+				return true;
+			}
+		}
+	}
+		
+	return false;
+}
+
 void processOSIS(std::istream& infile) {
-	typedef enum {
-		CS_NOT_IN_COMMENT,            // or seen starting "<"
-		CS_SEEN_STARTING_EXCLAMATION,
-		CS_SEEN_STARTING_HYPHEN,
-		CS_IN_COMMENT,
-		CS_SEEN_ENDING_HYPHEN,
-		CS_SEEN_SECOND_ENDING_HYPHEN,
-		CS_SEEN_ENDING_GREATER_THAN
-	} t_commentstate;
 
-	typedef enum {
-		ET_NUM,
-		ET_HEX,
-		ET_CHAR,
-		ET_NONE,
-		ET_ERR
-	} t_entitytype;
-
 	activeOsisID[0] = '\0';
 
 	strcpy(currentOsisID,"N/A");
@@ -1584,7 +2063,7 @@
 	SWBuf token;
 	SWBuf text;
 	bool incomment = false;
-	t_commentstate commentstate = CS_NOT_IN_COMMENT;
+	CommentState commentstate = CommentState::START;
 	bool intoken = false;
 	bool inWhitespace = false;
 	bool seeingSpace = false;
@@ -1591,7 +2070,7 @@
 	unsigned char curChar = '\0';
 	SWBuf entityToken;
 	bool inentity = false;
-	t_entitytype entitytype = ET_NONE;
+	EntityType entitytype = EntityType::START;
 	unsigned char attrQuoteChar = '\0';
 	bool inattribute = false;
 
@@ -1618,18 +2097,6 @@
 		}
 		charPos++;
 
-		// Look for entities:
-		// These are of the form &#dddd;, &xHHHH; or &llll;
-		// where dddd is a sequence of digits
-		//       HHHH is a sequence of [A-Fa-f0-9]
-		//       llll is amp, lt, gt, quot or apos
-		//            but we will look for a sequence of [A-Za-z0-9]
-		// All but &, <, >, ", ' will produce a WARNING
-		// In the future:
-		//    &#dddd; and &xHHHH; should be converted to UTF-8,
-		//        with a WARNING if the text is not UTF-8
-		//    &llll; other than the xml standard 5 should produce a WARNING
-
 		// For entity diagnostics track whether the text is an attribute value
 		if (inattribute && (curChar == '\'' || curChar == '"')) {
 			if (attrQuoteChar == curChar) {
@@ -1640,186 +2107,16 @@
 				attrQuoteChar = curChar;
 			}
 		}
+
 		if (intoken && curChar == '=') {
 			inattribute = true;
 			attrQuoteChar = '\0';
 		}
 
-		if (!inentity && curChar == '&') {
-			inentity = true;
-			inWhitespace = false;
-			entitytype = ET_NONE;
-			entityToken = "&";
-			continue;
+		if (handleEntity(curChar, inentity, inWhitespace, entitytype, entityToken, token, text, intoken, inattribute, attrQuoteChar, currentOsisID)) {
+			continue; // Character consumed, move to next
 		}
 
-		if (inentity) {
-			if (curChar == ';') {
-				inentity = false;
-			}
-			else {
-				switch (entitytype) {
-				    case ET_NONE:
-					// A hex entity cannot start with X in XML, but it can in HTML
-					// Allow for it here and complain later
-					if (curChar == 'x' || curChar == 'X') {
-						entitytype = ET_HEX;
-					}
-					else
-					if (curChar == '#') {
-						entitytype = ET_NUM;
-					}
-					else
-					if ((curChar >= 'A' && curChar <= 'Z') ||
-					    (curChar >= 'a' && curChar <= 'z') ||
-					    (curChar >= '0' && curChar <= '9')) {
-						entitytype = ET_CHAR;
-					}
-					else {
-						inentity = false;
-						entitytype = ET_ERR;
-					}
-					break;
-
-				    case ET_NUM :
-					if (!(curChar >= '0' && curChar <= '9')) {
-						inentity = false;
-						entitytype = ET_ERR;
-					}
-					break;
-				    case ET_HEX :
-					if ((curChar >= 'G' && curChar <= 'Z') ||
-					    (curChar >= 'g' && curChar <= 'z')) {
-						// Starts out as a HEX entity, but it isn't one
-						entitytype = ET_CHAR;
-					}
-					else
-					if (!((curChar >= 'A' && curChar <= 'F') ||
-					      (curChar >= 'a' && curChar <= 'f') ||
-					      (curChar >= '0' && curChar <= '9'))) {
-						inentity = false;
-						entitytype = ET_ERR;
-					}
-					break;
-				    case ET_CHAR :
-					if (!((curChar >= 'A' && curChar <= 'Z') ||
-					      (curChar >= 'a' && curChar <= 'z') ||
-					      (curChar >= '0' && curChar <= '9'))) {
-						inentity = false;
-						entitytype = ET_ERR;
-					}
-					break;
-				    default:
-					std::cout << identifyMsg("FATAL", "ENTITY", currentOsisID) << "Unknown entitytype on entity end: " << entitytype << std::endl;
-					exit(EXIT_BAD_NESTING);
-				}
-			}
-
-			if (entitytype != ET_ERR) {
-				entityToken.append((char) curChar);
-			}
-
-			// It is an entity, perhaps invalid, if curChar is ';', error otherwise
-			// Test to see if we now have an entity or a failure
-			// It may not be a valid entity.
-			if (!inentity) {
-				switch (entitytype) {
-				    case ET_ERR :
-					// Remove the leading &
-					entityToken << 1;
-					std::cout << identifyMsg("WARNING", "PARSE", currentOsisID) << "malformed entity, replacing &" << entityToken << " with &" << entityToken << std::endl;
-					if (intoken) {
-						token.append("&");
-						token.append(entityToken);
-					}
-					else {
-						text.append("&");
-						text.append(entityToken);
-					}
-					break;
-				    case ET_HEX :
-					if (entityToken[1] != 'x') {
-						std::cout << identifyMsg("WARNING", "PARSE", currentOsisID) << "HEX entity must begin with &x, found " << entityToken << std::endl;
-					}
-					else {
-						std::cout << identifyMsg("WARNING", "PARSE", currentOsisID) << "SWORD does not search HEX entities, found " << entityToken << std::endl;
-					}
-					break;
-				    case ET_CHAR :
-					if (strcmp(entityToken, "&")  &&
-				            strcmp(entityToken, "<")   &&
-				            strcmp(entityToken, ">")   &&
-				            strcmp(entityToken, """) &&
-				            strcmp(entityToken, "'")) {
-						std::cout << identifyMsg("WARNING", "PARSE", currentOsisID) << "XML only supports 5 Character entities &, <, >, " and ', found " << entityToken << std::endl;
-					}
-					else
-					if (!strcmp(entityToken, "'")) {
-						std::cout << identifyMsg("WARNING", "PARSE", currentOsisID) << "While valid for XML, XHTML does not support '." << std::endl;
-						if (!inattribute) {
-							std::cout << identifyMsg("WARNING", "PARSE", currentOsisID) << "' is unnecessary outside of attribute values. Replacing with '. " << std::endl;
-							entityToken = "'";
-						}
-						else {
-							switch (attrQuoteChar) {
-							    case '"' :
-								std::cout << identifyMsg("WARNING", "PARSE", currentOsisID) << "' is unnecessary inside double quoted attribute values. Replacing with '. " << std::endl;
-								entityToken = "'";
-								break;
-							    case '\'' :
-								std::cout << identifyMsg("WARNING", "PARSE", currentOsisID) << "' is only needed within single quoted attribute values. Considering using double quoted attribute and replacing with '." << std::endl;
-								break;
-							}
-						}
-					}
-					else
-					if (!strcmp(entityToken, """)) {
-						std::cout << identifyMsg("WARNING", "PARSE", currentOsisID) << "While valid for XML, " is only needed within double quoted attribute values" << std::endl;
-						if (!inattribute) {
-							std::cout << identifyMsg("WARNING", "PARSE", currentOsisID) << "" is unnecessary outside of attribute values. Replace with \"." << std::endl;
-							entityToken = "\"";
-						}
-						else {
-							switch (attrQuoteChar) {
-							    case '"' :
-								std::cout << identifyMsg("WARNING", "PARSE", currentOsisID) << "" is only needed within double quoted attribute values. Considering using single quoted attribute and replacing with \"." << std::endl;
-								break;
-							    case '\'' :
-								std::cout << identifyMsg("WARNING", "PARSE", currentOsisID) << "" is unnecessary inside single quoted attribute values. Replace with \"." << std::endl;
-								entityToken = "\"";
-								break;
-							}
-						}
-					}
-					break;
-				    case ET_NUM :
-					std::cout << identifyMsg("WARNING", "PARSE", currentOsisID) << "SWORD does not search numeric entities, found " << entityToken << std::endl;
-					break;
-				    case ET_NONE :
-				    default:
-					break;
-				}
-
-				// Put the entity into the stream.
-				if (intoken) {
-					token.append(entityToken);
-				}
-				else {
-					text.append(entityToken);
-				}
-
-				if (curChar == ';') {
-					// The character was handled, so go get the next one.
-					continue;
-				}
-			}
-			else {
-				// The character was handled, so go get the next one.
-				continue;
-			}
-		}
-
-
 		if (!intoken && curChar == '<') {
 			intoken = true;
 			token = "<";
@@ -1830,90 +2127,13 @@
 
 		// Handle XML comments starting with "<!--", ending with "-->"
 		if (intoken && !incomment) {
-			switch (commentstate) {
-				case CS_NOT_IN_COMMENT :
-					if (curChar == '!') {
-						commentstate = CS_SEEN_STARTING_EXCLAMATION;
-						token.append((char) curChar);
-						continue;
-					} else {
-						break;
-					}
-
-				case CS_SEEN_STARTING_EXCLAMATION :
-					if (curChar == '-') {
-						commentstate = CS_SEEN_STARTING_HYPHEN;
-						token.append((char) curChar);
-						continue;
-					} else {
-						commentstate = CS_NOT_IN_COMMENT;
-						break;
-					}
-
-				case CS_SEEN_STARTING_HYPHEN :
-					if (curChar == '-') {
-						incomment = true;
-						commentstate = CS_IN_COMMENT;
-						token.append((char) curChar);
-
-						if (debug & DEBUG_OTHER) {
-							std::cout << identifyMsg("DEBUG", "COMMENTS") << "In comment" << std::endl;
-						}
-
-						continue;
-					} else {
-						commentstate = CS_NOT_IN_COMMENT;
-						break;
-					}
-
-				default:
-					std::cout << identifyMsg("FATAL", "COMMENTS") << "Unknown commentstate on comment start: " << commentstate << std::endl;
-					exit(EXIT_BAD_NESTING);
+			if (handleComment(curChar, currentOsisID, intoken, incomment, commentstate, token)) {
+				continue; // Character consumed, move to next
 			}
 		}
 
-		if (incomment) {
-			switch (commentstate) {
-				case CS_IN_COMMENT:
-					if (curChar == '-') {
-						commentstate = CS_SEEN_ENDING_HYPHEN;
-						continue;
-					} else {
-						// ignore the character
-						continue;
-					}
-
-				case CS_SEEN_ENDING_HYPHEN :
-					if (curChar == '-') {
-						commentstate = CS_SEEN_SECOND_ENDING_HYPHEN;
-						continue;
-					} else {
-						// ignore character
-						commentstate = CS_IN_COMMENT;
-						continue;
-					}
-
-				case CS_SEEN_SECOND_ENDING_HYPHEN :
-					if (curChar == '>') {
-						intoken = false;
-						incomment = false;
-						commentstate = CS_NOT_IN_COMMENT;
-
-						if (debug & DEBUG_OTHER) {
-							std::cout << identifyMsg("DEBUG", "COMMENTS") << "Out of comment" << std::endl;
-						}
-
-						continue;
-					} else {
-						// ignore character
-						commentstate = CS_IN_COMMENT;
-						continue;
-					}
-
-				default:
-					std::cout << identifyMsg("FATAL", "COMMENTS") << "Unknown commentstate on comment end: " << commentstate << std::endl;
-					exit(EXIT_BAD_NESTING);
-			}
+		if (incomment && handleComment(curChar, currentOsisID, intoken, incomment, commentstate, token)) {
+			continue; // Character consumed, move to next
 		}
 
 		// Outside of tokens merge adjacent whitespace
@@ -1953,9 +2173,9 @@
 		}
 		else {
 			switch (curChar) {
-				case '>' : std::cout << identifyMsg("WARNING", "PARSE", currentOsisID) << "> should be >" << std::endl; text.append(">"); break;
-				case '<' : std::cout << identifyMsg("WARNING", "PARSE", currentOsisID) << "< should be <" << std::endl; text.append("<"); break;
-				default  : text.append((char) curChar); break;
+			case '>' : std::cout << identifyMsg("WARNING", "PARSE", currentOsisID) << "> should be >" << std::endl; text.append(">"); break;
+			case '<' : std::cout << identifyMsg("WARNING", "PARSE", currentOsisID) << "< should be <" << std::endl; text.append("<"); break;
+			default  : text.append((char) curChar); break;
 			}
 		}
 	}



More information about the sword-cvs mailing list