[sword-svn] r3401 - trunk/utilities
dmsmith at crosswire.org
dmsmith at crosswire.org
Sat Feb 6 10:12:20 MST 2016
Author: dmsmith
Date: 2016-02-06 10:12:20 -0700 (Sat, 06 Feb 2016)
New Revision: 3401
Modified:
trunk/utilities/osis2mod.cpp
Log:
Added entity handling. div type='colophon' now not changed to milestones.
Modified: trunk/utilities/osis2mod.cpp
===================================================================
--- trunk/utilities/osis2mod.cpp 2016-02-06 16:41:53 UTC (rev 3400)
+++ trunk/utilities/osis2mod.cpp 2016-02-06 17:12:20 UTC (rev 3401)
@@ -1213,6 +1213,7 @@
static std::stack<XMLTag> bspTagStack;
static int sID = 1;
char buf[11];
+ SWBuf typeAttr = t.getAttribute("type");
// Support simplification transformations
if (t.isEmpty()) {
@@ -1240,9 +1241,10 @@
// abbr When would this ever cross a boundary?
// seg as it is used for a divineName hack
// foreign so that it can be easily italicized
+ // div type="colophon" so that it can be treated as a block
else if (tagName == "chapter" ||
tagName == "closer" ||
- tagName == "div" ||
+ (tagName == "div" && typeAttr != "colophon") ||
tagName == "l" ||
tagName == "lg" ||
tagName == "q" ||
@@ -1272,11 +1274,13 @@
}
bspTagStack.pop();
+ SWBuf topTypeAttr = topToken.getAttribute("type");
// Look for the milestoneable container tags handled above.
+ // Have to treat div type="colophon" differently
if (tagName == "chapter" ||
tagName == "closer" ||
- tagName == "div" ||
+ (tagName == "div" && topTypeAttr != "colophon") ||
tagName == "l" ||
tagName == "lg" ||
tagName == "p" ||
@@ -1429,6 +1433,14 @@
CS_SEEN_ENDING_GREATER_THAN
} t_commentstate;
+ typedef enum {
+ ET_NUM,
+ ET_HEX,
+ ET_CHAR,
+ ET_NONE,
+ ET_ERR
+ } t_entitytype;
+
activeOsisID[0] = '\0';
strcpy(currentOsisID,"N/A");
@@ -1449,6 +1461,13 @@
bool inWhitespace = false;
bool seeingSpace = false;
unsigned char curChar = '\0';
+ SWBuf entityToken;
+ bool inentity = false;
+ t_entitytype entitytype = ET_NONE;
+ unsigned char attrQuoteChar = '\0';
+ bool inattribute = false;
+ unsigned int linePos = 1;
+ unsigned int charPos = 0;
while (infile.good()) {
@@ -1465,16 +1484,221 @@
// Does a SWORD module actually require this?
if (curChar == '\n') {
curChar = ' ';
+ charPos = 0;
+ linePos++;
}
+ charPos++;
+ // Look for entities:
+ // These are of the form &#dddd;, &xHHHH; or &llll;
+ // where dddd is a sequence of digits
+ // HHHH is a sequence of [A-Fa-f0-9]
+ // llll is amp, lt, gt, quot or apos
+ // but we will look for a sequence of [A-Za-z0-9]
+ // All but &amp;, &lt;, &gt;, &quot;, &apos; will produce a WARNING
+ // In the future:
+ // &#dddd; and &xHHHH; should be converted to UTF-8,
+ // with a WARNING if the text is not UTF-8
+ // &llll; other than the xml standard 5 should produce a WARNING
+
+ // For entity diagnostics track whether the text is an attribute value
+ if (inattribute && (curChar == '\'' || curChar == '"')) {
+ if (attrQuoteChar == curChar) {
+ inattribute = false;
+ attrQuoteChar = '\0';
+ }
+ else {
+ attrQuoteChar = curChar;
+ }
+ }
+ if (intoken && curChar == '=') {
+ inattribute = true;
+ attrQuoteChar = '\0';
+ }
+
+ if (!inentity && curChar == '&') {
+ inentity = true;
+ entitytype = ET_NONE;
+ entityToken = "&";
+ continue;
+ }
+
+ if (inentity) {
+ if (curChar == ';') {
+ inentity = false;
+ }
+ else {
+ switch (entitytype) {
+ case ET_NONE:
+ // A hex entity cannot start with X in XML, but it can in HTML
+ // Allow for it here and complain later
+ if (curChar == 'x' || curChar == 'X') {
+ entitytype = ET_HEX;
+ }
+ else
+ if (curChar == '#') {
+ entitytype = ET_NUM;
+ }
+ else
+ if ((curChar >= 'A' && curChar <= 'Z') ||
+ (curChar >= 'a' && curChar <= 'z') ||
+ (curChar >= '0' && curChar <= '9')) {
+ entitytype = ET_CHAR;
+ }
+ else {
+ inentity = false;
+ entitytype = ET_ERR;
+ }
+ break;
+
+ case ET_NUM :
+ if (!(curChar >= '0' && curChar <= '9')) {
+ inentity = false;
+ entitytype = ET_ERR;
+ }
+ break;
+ case ET_HEX :
+ if ((curChar >= 'G' && curChar <= 'Z') ||
+ (curChar >= 'g' && curChar <= 'z')) {
+ // Starts out as a HEX entity, but it isn't one
+ entitytype = ET_CHAR;
+ }
+ else
+ if (!((curChar >= 'A' && curChar <= 'F') ||
+ (curChar >= 'a' && curChar <= 'f') ||
+ (curChar >= '0' && curChar <= '9'))) {
+ inentity = false;
+ entitytype = ET_ERR;
+ }
+ break;
+ case ET_CHAR :
+ if (!((curChar >= 'A' && curChar <= 'Z') ||
+ (curChar >= 'a' && curChar <= 'z') ||
+ (curChar >= '0' && curChar <= '9'))) {
+ inentity = false;
+ entitytype = ET_ERR;
+ }
+ break;
+ default:
+ cout << "FATAL(ENTITY): unknown entitytype on entity end: " << entitytype << endl;
+ exit(EXIT_BAD_NESTING);
+ }
+ }
+
+ if (entitytype != ET_ERR) {
+ entityToken.append((char) curChar);
+ }
+
+ // It is an entity, perhaps invalid, if curChar is ';', error otherwise
+ // Test to see if we now have an entity or a failure
+ // It may not be a valid entity.
+ if (!inentity) {
+ switch (entitytype) {
+ case ET_ERR :
+ // Remove the leading &
+ entityToken << 1;
+ cout << "WARNING(PARSE): malformed entity, replacing &" << entityToken << " with &amp;" << entityToken << endl;
+ if (intoken) {
+ token.append("&amp;");
+ token.append(entityToken);
+ }
+ else {
+ text.append("&amp;");
+ text.append(entityToken);
+ }
+ break;
+ case ET_HEX :
+ if (entityToken[1] != 'x') {
+ cout << "WARNING(PARSE): HEX entity must begin with &x, found " << entityToken << endl;
+ }
+ else {
+ cout << "WARNING(PARSE): SWORD does not search HEX entities, found " << entityToken << endl;
+ }
+ break;
+ case ET_CHAR :
+ if (strcmp(entityToken, "&amp;") &&
+ strcmp(entityToken, "&lt;") &&
+ strcmp(entityToken, "&gt;") &&
+ strcmp(entityToken, "&quot;") &&
+ strcmp(entityToken, "&apos;")) {
+ cout << "WARNING(PARSE): XML only supports 5 Character entities &amp;, &lt;, &gt;, &quot; and &apos;, found " << entityToken << endl;
+ }
+ else
+ if (!strcmp(entityToken, "&apos;")) {
+ cout << "WARNING(PARSE): While valid for XML, XHTML does not support &apos;." << endl;
+ if (!inattribute) {
+ cout << "WARNING(PARSE): &apos; is unnecessary outside of attribute values. Replacing with '. " << endl;
+ entityToken = "'";
+ }
+ else {
+ switch (attrQuoteChar) {
+ case '"' :
+ cout << "WARNING(PARSE): &apos; is unnecessary inside double quoted attribute values. Replacing with '. " << endl;
+ entityToken = "'";
+ break;
+ case '\'' :
+ cout << "WARNING(PARSE): &apos; is only needed within single quoted attribute values. Considering using double quoted attribute and replacing with '." << endl;
+ break;
+ }
+ }
+ }
+ else
+ if (!strcmp(entityToken, "&quot;")) {
+ cout << "WARNING(PARSE): While valid for XML, &quot; is only needed within double quoted attribute values" << endl;
+ if (!inattribute) {
+ cout << "WARNING(PARSE): &quot; is unnecessary outside of attribute values. Replace with \"." << endl;
+ entityToken = "\"";
+ }
+ else {
+ switch (attrQuoteChar) {
+ case '"' :
+ cout << "WARNING(PARSE): &quot; is only needed within double quoted attribute values. Considering using single quoted attribute and replacing with \"." << endl;
+ break;
+ case '\'' :
+ cout << "WARNING(PARSE): &quot; is unnecessary inside single quoted attribute values. Replace with \"." << endl;
+ entityToken = "\"";
+ break;
+ }
+ }
+ }
+ break;
+ case ET_NUM :
+ cout << "WARNING(PARSE): SWORD does not search numeric entities, found " << entityToken << endl;
+ break;
+ case ET_NONE :
+ default:
+ break;
+ }
+
+ // Put the entity into the stream.
+ if (intoken) {
+ token.append(entityToken);
+ }
+ else {
+ text.append(entityToken);
+ }
+
+ if (curChar == ';') {
+ // The character was handled, so go get the next one.
+ continue;
+ }
+ }
+ else {
+ // The character was handled, so go get the next one.
+ continue;
+ }
+ }
+
+
if (!intoken && curChar == '<') {
intoken = true;
token = "<";
+ inattribute = false;
+ attrQuoteChar = '\0';
continue;
}
// Handle XML comments starting with "<!--", ending with "-->"
-
if (intoken && !incomment) {
switch (commentstate) {
case CS_NOT_IN_COMMENT :
More information about the sword-cvs
mailing list