of type other than main, chapter or sub, will be taken as a title for the verse. // Once one of these conditions is met, the division between chapter introduction and pre-verse is set. // 3) Between verses, the material is split between the prior verse and the next verse. // Basically, while end and empty tags are found, they belong to the prior verse. // Once a begin tag is found, it belongs to the next verse. if (!inPreVerse && !inBookIntro) { if (inChapterIntro) { // Determine when we are no longer in a chapter heading, but in pre-verse material: // If we see one of the following: // a section div // a title that is not main, chapter or sub or unclassified (no type attribute) if ((tokenName == "div" && typeAttr == "section") || (tokenName == "title" && typeAttr.length() != 0 && typeAttr != "main" && typeAttr != "chapter" && typeAttr != "sub") ) { if (debug & DEBUG_TITLE) { cout << "DEBUG(TITLE): " << currentOsisID << ": Done looking for chapter introduction" << endl; } if (text.length()) { if (debug & DEBUG_TITLE) { cout << "DEBUG(TITLE): " << currentOsisID << ": CHAPTER INTRO "<< text << endl; } // Since we have found the boundary, we need to write out the chapter heading writeEntry(text); } // And we are no longer in the chapter heading inChapterIntro = false; // But rather, we are now in pre-verse material inPreVerse = true; } } else if (!inVerse && inChapter) { inPreVerse = true; } if (inPreVerse) { char genBuf[200]; sprintf(genBuf, "<div type=\"x-milestone\" subType=\"x-preverse\" sID=\"pv%d\"/>", genID); text.append(genBuf); } } if (debug & DEBUG_INTERVERSE) { if (!inVerse && !inBookIntro && !inChapterIntro) { cout << "DEBUG(INTERVERSE): " << currentOsisID << ": interverse start token " << token << ":" << text.c_str() << endl; } } return false; } // Done with procesing start and empty tags // Process end tags else { if (tagStack.empty()) { cout << "FATAL(NESTING): " << currentOsisID << ": tag expected" << endl; exit(EXIT_BAD_NESTING); } // Note: empty end tags have the eID attribute if (!token.isEmpty()) { XMLTag topToken = tagStack.top(); tagDepth = tagStack.size(); if (debug & DEBUG_STACK) { cout << "DEBUG(STACK): " << currentOsisID << ": pop(" << tagDepth << ") " << topToken.getName() << endl; } tagStack.pop(); if (tokenName != topToken.getName()) { cout << "FATAL(NESTING): " << currentOsisID << ": Expected " << topToken.getName() << " found " << tokenName << endl; // exit(EXIT_BAD_NESTING); // (OSK) I'm sure this validity check is a good idea, but there's a but somewhere that's killing the converter here. // So I'm disabling this line. Unvalidated OSIS files shouldn't be run through the converter anyway. // (DM) This has nothing to do with well-form or valid. It checks milestoned elements for proper nesting. } } // We haven't seen the first div outside the header so there is little to do. if (!firstDiv) { if (tokenName == "header") { headerEnded = true; if (debug & DEBUG_OTHER) { cout << "DEBUG(FOUND): End of header found" << endl; } } // Collect the content so it can be used to suggest the module's conf. return false; } // VERSE and COMMENTARY END if ((tokenName == "verse") || (tokenName == "div" && eidAttr == sidVerse) ) { if (tagDepth != verseDepth) { cout << "WARNING(NESTING): verse " << currentOsisID << " is not well formed:(" << verseDepth << "," << tagDepth << ")" << endl; } // If we are in WOC then we need to terminate the <q who="Jesus" marker=""> that was added earlier in the verse. if (inWOC) { text.append("</q>"); } // Include the token if it is not a verse if (tokenName != "verse") { text.append(token); } else if (debug & DEBUG_VERSE) { // transform the verse into a milestone XMLTag t = "<milestone resp=\"v\" />"; // copy all the attributes of the verse element to the milestone StringList attrNames = token.getAttributeNames(); for (StringList::iterator loop = attrNames.begin(); loop != attrNames.end(); loop++) { const char* attr = (*loop).c_str(); t.setAttribute(attr, token.getAttribute(attr)); } text.append(t); } writeEntry(text); inVerse = false; inPreVerse = false; verseDepth = 0; return true; } // Handle WOC quotes. // Note this requires transformBSP to make them into milestones // Otherwise have to manage it here if (tokenName == "q") { XMLTag topToken = quoteStack.top(); if (debug & DEBUG_QUOTE) { cout << "DEBUG(QUOTE): " << currentOsisID << ": quote pop(" << quoteStack.size() << ") " << topToken << " -- " << token << endl; } quoteStack.pop(); // If we have found an end tag for a <q who="Jesus"> then we are done with the WOC // and we need to terminate the <q who="Jesus" marker=""> that was added earlier in the verse. if (token.getAttribute("who") && !strcmp(token.getAttribute("who"), "Jesus")) { if (debug & DEBUG_QUOTE) { cout << "DEBUG(QUOTE): " << currentOsisID << ": (" << quoteStack.size() << ") " << topToken << " -- " << token << endl; } inWOC = false; const char *sID = topToken.getAttribute("sID"); const char *eID = token.getAttribute("eID"); if (!sID) { sID = ""; } if (!eID) { eID = ""; } if (strcmp(sID, eID)) { cout << "ERROR(NESTING): improper nesting " << currentOsisID << ": matching (sID,eID) not found. Looking at (" << sID << "," << eID << ")" << endl; } // Output the quotation mark if appropriate, inside the WOC. // If there is no marker attribute, let the SWORD engine manufacture one. // If there is a marker attribute and it has content, then output that. // If the marker attribute is present and empty, then there is nothing to do. // And have it within the WOC markup if (!token.getAttribute("marker") || token.getAttribute("marker")[0]) { token.setAttribute("who", 0); // remove the who="Jesus" text.append(token); } // Now close the WOC text.append("</q>"); return true; } return false; } // Look for the end of document, book and chapter // Also for material that goes with last entry if (!inVerse && !inBookIntro && !inChapterIntro) { // Is this the end of a chapter. if ((tokenName == "chapter") || (tokenName == "div" && eidAttr == sidChapter) ) { text.append(token); writeEntry(text); inChapter = false; sidChapter = ""; chapterDepth = 0; verseDepth = 0; return true; } // Is it the end of a book if (tokenName == "div" && eidAttr == sidBook) { text.append(token); writeEntry(text); bookDepth = 0; chapterDepth = 0; verseDepth = 0; return true; } // Do not include the end of an osis document if (tokenName == "osisText" || tokenName == "osis") { bookDepth = 0; chapterDepth = 0; verseDepth = 0; text = ""; return true; } // When we are not inPreVerse, the interverse tags get appended to the preceeding verse. if (!inPreVerse) { text.append(token); writeEntry(text); if (debug & DEBUG_INTERVERSE) { cout << "DEBUG(INTERVERSE): " << currentOsisID << ": appending interverse end tag: " << tokenName << "(" << tagDepth << "," << chapterDepth << "," << bookDepth << ")" << endl; } return true; } if (debug & DEBUG_INTERVERSE) { cout << "DEBUG(INTERVERSE): " << currentOsisID << ": interverse end tag: " << tokenName << "(" << tagDepth << "," << chapterDepth << "," << bookDepth << ")" << endl; } return false; } return false; } // done with Processing end tags return false; } /** * Support normalizations necessary for a SWORD module. * OSIS allows for document structure (Book, Section, Paragraph or BSP) * to overlap Bible versification (Book, Chapter, Verse). * Most SWORD applications need to display verses in isolation or in HTML table cells, * requiring each stored entry (i.e. verses) to be well-formed xml. * This routine normalizes container elements which could cross verse boundaries into milestones. * For most of these OSIS elements, there is a milestone form. However, p is not milestoneable. * For this reason, p is transformed into div elements with type x-p. * param t the tag to transform * return the transformed tag or the original one */ XMLTag transformBSP(XMLTag t) { static std::stack<XMLTag> bspTagStack; static int sID = 1; char buf[11]; // Support simplification transformations if (t.isEmpty()) { if (debug & DEBUG_XFORM) { cout << "DEBUG(XFORM): " << currentOsisID << ": xform empty " << t << endl; } return t; } SWBuf tagName = t.getName(); if (!t.isEndTag()) { // Transform <p> into <div type="x-p"> and milestone it if (tagName == "p") { t.setText("<div type=\"x-p\" />"); sprintf(buf, "gen%d", sID++); t.setAttribute("sID", buf); } // Transform <tag> into <tag sID="">, where tag is a milestoneable element. // The following containers are milestoneable. // abbr, closer, div, foreign, l, lg, salute, signed, speech // Leaving out: // abbr When would this ever cross a boundary? // seg as it is used for a divineName hack // foreign so that it can be easily italicized else if (tagName == "chapter" || tagName == "closer" || tagName == "div" || tagName == "l" || tagName == "lg" || tagName == "q" || tagName == "salute" || tagName == "signed" || tagName == "speech" || tagName == "verse" ) { t.setEmpty(true); sprintf(buf, "gen%d", sID++); t.setAttribute("sID", buf); } bspTagStack.push(t); if (debug & DEBUG_XFORM) { cout << "DEBUG(XFORM): " << currentOsisID << ": xform push (" << bspTagStack.size() << ") " << t << " (tagname=" << tagName << ")" << endl; XMLTag topToken = bspTagStack.top(); cout << "DEBUG(XFORM): " << currentOsisID << ": xform top(" << bspTagStack.size() << ") " << topToken << endl; } } else { if (!bspTagStack.empty()) { XMLTag topToken = bspTagStack.top(); if (debug & DEBUG_XFORM) { cout << "DEBUG(XFORM): " << currentOsisID << ": xform pop(" << bspTagStack.size() << ") " << topToken << endl; } bspTagStack.pop(); // Look for the milestoneable container tags handled above. if (tagName == "chapter" || tagName == "closer" || tagName == "div" || tagName == "l" || tagName == "lg" || tagName == "p" || tagName == "q" || tagName == "salute" || tagName == "signed" || tagName == "speech" || tagName == "verse" ) { // make this a clone of the start tag with sID changed to eID // Note: in the case of </p> the topToken is a <div type="x-p"> t = topToken; t.setAttribute("eID", t.getAttribute("sID")); t.setAttribute("sID", 0); } } else { cout << "FATAL(TAGSTACK): " << currentOsisID << ": closing tag without opening tag" << endl; } } return t; } /** * Write out all links in the module. * Waiting is necessary because writeEntry might ultimately append * text to a verse moving it's offset in the data file. * While we are minimizing it by postponing the write until we have * gathered the next verse, the following scenario is happening: * A module is using linked verses and has some verses that are not * in the chosen versification. If the out-of-canon verse happens following * a linked verse, the out-of-canon verse is appended to the prior * verse. Care has to be taken that the linked verses all point to * the first of the set. */ void writeLinks() { // Link all the verses VerseKey destKey; destKey.setVersificationSystem(currentVerse.getVersificationSystem()); destKey.setAutoNormalize(0); destKey.setIntros(1); VerseKey linkKey; linkKey.setVersificationSystem(currentVerse.getVersificationSystem()); linkKey.setAutoNormalize(0); linkKey.setIntros(1); for (unsigned int i = 0; i < linkedVerses.size(); i++) { // The verseKeys is a list of verses // where the first is the real verse // and the others link to it. ListKey verseKeys = linkedVerses[i]; verseKeys.setPosition(TOP); destKey = verseKeys.getElement(); verseKeys.increment(1); while (!verseKeys.popError()) { linkKey = verseKeys.getElement(); linkToEntry(linkKey, destKey); verseKeys.increment(1); } } } void usage(const char *app, const char *error = 0, const bool verboseHelp = false) { if (error) fprintf(stderr, "\n%s: %s\n", app, error); fprintf(stderr, "OSIS Bible/commentary module creation tool for The SWORD Project\n"); fprintf(stderr, "\nusage: %s <output/path> <osisDoc> [OPTIONS]\n", app); fprintf(stderr, " <output/path>\t\t an existing folder that the module will be written\n"); fprintf(stderr, " <osisDoc>\t\t path to the validated OSIS document, or '-' to\n"); fprintf(stderr, "\t\t\t\t read from standard input\n"); fprintf(stderr, " -a\t\t\t augment module if exists (default is to create new)\n"); fprintf(stderr, " -z\t\t\t use ZIP compression (default no compression)\n"); fprintf(stderr, " -Z\t\t\t use LZSS compression (default no compression)\n"); fprintf(stderr, " -b <2|3|4>\t\t compression block size (default 4):\n"); fprintf(stderr, "\t\t\t\t 2 - verse; 3 - chapter; 4 - book\n"); fprintf(stderr, " -c <cipher_key>\t encipher module using supplied key\n"); fprintf(stderr, "\t\t\t\t (default no enciphering)\n"); #ifdef _ICU_ fprintf(stderr, " -N\t\t\t do not convert UTF-8 or normalize UTF-8 to NFC\n"); if (verboseHelp) { fprintf(stderr, "\t\t\t\t (default is to convert to UTF-8, if needed,\n"); fprintf(stderr, "\t\t\t\t and then normalize to NFC)\n"); fprintf(stderr, "\t\t\t\t Note: UTF-8 texts should be normalized to NFC.\n"); } #endif fprintf(stderr, " -s <2|4>\t\t bytes used to store entry size (default is 2).\n"); if (verboseHelp) { fprintf(stderr, "\t\t\t\t Note: useful for commentaries with very large\n"); fprintf(stderr, "\t\t\t\t entries in uncompressed modules\n"); fprintf(stderr, "\t\t\t\t (2 bytes to store size equal 65535 characters)\n"); } fprintf(stderr, " -v <v11n>\t\t specify a versification scheme to use (default is KJV)\n"); fprintf(stderr, "\t\t\t\t Note: The following are valid values for v11n:"); VersificationMgr *vmgr = VersificationMgr::getSystemVersificationMgr(); StringList av11n = vmgr->getVersificationSystems(); for (StringList::iterator loop = av11n.begin(); loop != av11n.end(); loop++) { if ((distance(av11n.begin(), loop) % 3) == 0) { fprintf(stderr, "\n\t\t\t\t %-12s", (*loop).c_str()); } else { fprintf(stderr, "\t%-12s", (*loop).c_str()); } } fprintf(stderr, "\n"); if (verboseHelp) { fprintf(stderr, " -d <flags>\t\t turn on debugging (default is 0)\n"); fprintf(stderr, "\t\t\t\t Note: This flag may change in the future.\n"); fprintf(stderr, "\t\t\t\t Flags: The following are valid values:\n"); fprintf(stderr, "\t\t\t\t\t0 - no debugging\n"); fprintf(stderr, "\t\t\t\t\t1 - writes to module, very verbose\n"); fprintf(stderr, "\t\t\t\t\t2 - verse start and end\n"); fprintf(stderr, "\t\t\t\t\t4 - quotes, esp. Words of Christ\n"); fprintf(stderr, "\t\t\t\t\t8 - titles\n"); fprintf(stderr, "\t\t\t\t\t16 - inter-verse material\n"); fprintf(stderr, "\t\t\t\t\t32 - BSP to BCV transformations\n"); fprintf(stderr, "\t\t\t\t\t64 - v11n exceptions\n"); fprintf(stderr, "\t\t\t\t\t128 - parsing of osisID and osisRef\n"); fprintf(stderr, "\t\t\t\t\t256 - internal stack\n"); fprintf(stderr, "\t\t\t\t\t512 - miscellaneous\n"); fprintf(stderr, "\t\t\t\t This argument can be used more than once. (Or\n"); fprintf(stderr, "\t\t\t\t the flags may be added together.)\n"); } fprintf(stderr, " -h \t\t\t print verbose usage text\n"); fprintf(stderr, "\n"); fprintf(stderr, "See http://www.crosswire.org/wiki/osis2mod for more details.\n"); fprintf(stderr, "\n"); exit(EXIT_BAD_ARG); } void processOSIS(istream& infile) { typedef enum { CS_NOT_IN_COMMENT, // or seen starting "<" CS_SEEN_STARTING_EXCLAMATION, CS_SEEN_STARTING_HYPHEN, CS_IN_COMMENT, CS_SEEN_ENDING_HYPHEN, CS_SEEN_SECOND_ENDING_HYPHEN, CS_SEEN_ENDING_GREATER_THAN } t_commentstate; activeOsisID[0] = '\0'; strcpy(currentOsisID,"N/A"); currentVerse.setVersificationSystem(v11n); currentVerse.setAutoNormalize(false); currentVerse.setIntros(true); // turn on mod/testmnt/book/chap headings currentVerse.setPersist(true); module->setKey(currentVerse); module->setPosition(TOP); SWBuf token; SWBuf text; bool incomment = false; t_commentstate commentstate = CS_NOT_IN_COMMENT; bool intoken = false; bool inWhitespace = false; bool seeingSpace = false; unsigned char curChar = '\0'; while (infile.good()) { int possibleChar = infile.get(); // skip the character if it is bad. infile.good() will catch the problem if (possibleChar == -1) { continue; } curChar = (unsigned char) possibleChar; // All newlines are simply whitespace // Does a SWORD module actually require this? if (curChar == '\n') { curChar = ' '; } if (!intoken && curChar == '<') { intoken = true; token = "<"; continue; } // Handle XML comments starting with "" if (intoken && !incomment) { switch (commentstate) { case CS_NOT_IN_COMMENT : if (curChar == '!') { commentstate = CS_SEEN_STARTING_EXCLAMATION; token.append((char) curChar); continue; } else { break; } case CS_SEEN_STARTING_EXCLAMATION : if (curChar == '-') { commentstate = CS_SEEN_STARTING_HYPHEN; token.append((char) curChar); continue; } else { commentstate = CS_NOT_IN_COMMENT; break; } case CS_SEEN_STARTING_HYPHEN : if (curChar == '-') { incomment = true; commentstate = CS_IN_COMMENT; token.append((char) curChar); if (debug & DEBUG_OTHER) { cout << "DEBUG(COMMENTS): in comment" << endl; } continue; } else { commentstate = CS_NOT_IN_COMMENT; break; } default: cout << "FATAL(COMMENTS): unknown commentstate on comment start: " << commentstate << endl; exit(EXIT_BAD_NESTING); } } if (incomment) { switch (commentstate) { case CS_IN_COMMENT: if (curChar == '-') { commentstate = CS_SEEN_ENDING_HYPHEN; continue; } else { // ignore the character continue; } case CS_SEEN_ENDING_HYPHEN : if (curChar == '-') { commentstate = CS_SEEN_SECOND_ENDING_HYPHEN; continue; } else { // ignore character commentstate = CS_IN_COMMENT; continue; } case CS_SEEN_SECOND_ENDING_HYPHEN : if (curChar == '>') { intoken = false; incomment = false; commentstate = CS_NOT_IN_COMMENT; if (debug & DEBUG_OTHER) { cout << "DEBUG(COMMENTS): out of comment" << endl; } continue; } else { // ignore character commentstate = CS_IN_COMMENT; continue; } default: cout << "FATAL(COMMENTS): unknown commentstate on comment end: " << commentstate << endl; exit(EXIT_BAD_NESTING); } } // Outside of tokens merge adjacent whitespace if (!intoken) { seeingSpace = isspace(curChar)!=0; if (seeingSpace) { if (inWhitespace) { continue; } // convert all whitespace to blanks curChar = ' '; } inWhitespace = seeingSpace; } if (intoken && curChar == '>') { intoken = false; inWhitespace = false; token.append('>'); // take this isalpha if out to check for bugs in text if (isalpha(token[1]) || (((token[1] == '/') || (token[1] == '?')) && isalpha(token[2]))) { //cout << "Handle:" << token.c_str() << endl; XMLTag t = transformBSP(token.c_str()); if (!handleToken(text, t)) { text.append(t); } } else { cout << "WARNING(PARSE): malformed token: " << token << endl; } continue; } if (intoken) { token.append((char) curChar); } else { switch (curChar) { case '>' : text.append(">"); break; case '<' : text.append("<"); break; default : text.append((char) curChar); break; } } } // Force the last entry from the text buffer. text = ""; writeEntry(text, true); writeLinks(); #ifdef _ICU_ if (converted) fprintf(stderr, "osis2mod converted %d verses to UTF-8\n", converted); if (normalized) fprintf(stderr, "osis2mod normalized %d verses to NFC\n", normalized); #endif } int main(int argc, char **argv) { fprintf(stderr, "You are running osis2mod: $Rev: 3314 $\n"); if (argc > 1) { for (int i = 1; i < argc; i++) { if (!strcmp(argv[i], "-h") || !strcmp(argv[i], "--help")) { usage(*argv, "", true); } } } // Let's test our command line arguments if (argc < 3) { usage(*argv); } // variables for arguments, holding defaults const char* program = argv[0]; const char* path = argv[1]; const char* osisDoc = argv[2]; int append = 0; SWBuf compType = ""; bool isCommentary = false; int iType = 4; int entrySize = 0; SWBuf cipherKey = ""; SWCompress *compressor = 0; for (int i = 3; i < argc; i++) { if (!strcmp(argv[i], "-a")) { append = 1; } else if (!strcmp(argv[i], "-z")) { if (compType.size()) usage(*argv, "Cannot specify both -z and -Z"); if (entrySize) usage(*argv, "Cannot specify both -z and -s"); compType = "ZIP"; } else if (!strcmp(argv[i], "-Z")) { if (compType.size()) usage(*argv, "Cannot specify both -z and -Z"); if (entrySize) usage(*argv, "Cannot specify both -Z and -s"); compType = "LZSS"; } else if (!strcmp(argv[i], "-b")) { if (i+1 < argc) { iType = atoi(argv[++i]); if ((iType >= 2) && (iType <= 4)) continue; } usage(*argv, "-b requires one of <2|3|4>"); } else if (!strcmp(argv[i], "-N")) { normalize = false; } else if (!strcmp(argv[i], "-c")) { if (i+1 < argc) cipherKey = argv[++i]; else usage(*argv, "-c requires <cipher_key>"); } else if (!strcmp(argv[i], "-v")) { if (i+1 < argc) v11n = argv[++i]; else usage(*argv, "-v requires <v11n>"); } else if (!strcmp(argv[i], "-s")) { if (compType.size()) usage(*argv, "Cannot specify -s and -z or -Z"); if (i+1 < argc) { entrySize = atoi(argv[++i]); if (entrySize == 2 || entrySize == 4) { continue; } } usage(*argv, "-s requires one of <2|4>"); } else if (!strcmp(argv[i], "-C")) { isCommentary = true; } else if (!strcmp(argv[i], "-d")) { if (i+1 < argc) debug |= atoi(argv[++i]); else usage(*argv, "-d requires <flags>"); } else usage(*argv, (((SWBuf)"Unknown argument: ")+ argv[i]).c_str()); } if (isCommentary) isCommentary = true; // avoid unused warning for now if (compType == "ZIP") { #ifndef EXCLUDEZLIB compressor = new ZipCompress(); #else usage(*argv, "ERROR: SWORD library not compiled with ZIP compression support.\n\tBe sure libzip is available when compiling SWORD library"); #endif } else if (compType == "LZSS") { compressor = new LZSSCompress(); } #ifndef _ICU_ if (normalize) { normalize = false; cout << "WARNING(UTF8): " << program << " is not compiled with support for ICU. Assuming -N." << endl; } #endif if (debug & DEBUG_OTHER) { cout << "DEBUG(ARGS):\n\tpath: " << path << "\n\tosisDoc: " << osisDoc << "\n\tcreate: " << append << "\n\tcompressType: " << compType << "\n\tblockType: " << iType << "\n\tcipherKey: " << cipherKey.c_str() << "\n\tnormalize: " << normalize << endl; } if (!append) { // == 0 then create module // Try to initialize a default set of datafiles and indicies at our // datapath location passed to us from the user. if (compressor) { if (zText::createModule(path, iType, v11n)) { fprintf(stderr, "ERROR: %s: couldn't create module at path: %s \n", program, path); exit(EXIT_NO_CREATE); } } else if (entrySize == 4) { if (RawText4::createModule(path, v11n)) { fprintf(stderr, "ERROR: %s: couldn't create module at path: %s \n", program, path); exit(EXIT_NO_CREATE); } } else { if (RawText::createModule(path, v11n)) { fprintf(stderr, "ERROR: %s: couldn't create module at path: %s \n", program, path); exit(EXIT_NO_CREATE); } } } // Do some initialization stuff if (compressor) { // Create a compressed text module allowing very large entries // Taking defaults except for first, fourth, fifth and last argument module = new zText( path, // ipath 0, // iname 0, // idesc iType, // iblockType compressor, // icomp 0, // idisp ENC_UNKNOWN, // enc DIRECTION_LTR, // dir FMT_UNKNOWN, // markup 0, // lang v11n // versification ); } else if (entrySize == 4) { // Create a raw text module allowing very large entries // Taking defaults except for first and last argument module = new RawText4( path, // ipath 0, // iname 0, // idesc 0, // idisp ENC_UNKNOWN, // encoding DIRECTION_LTR, // dir FMT_UNKNOWN, // markup 0, // ilang v11n // versification ); } else { // Create a raw text module allowing reasonable sized entries // Taking defaults except for first and last argument module = new RawText( path, // ipath 0, // iname 0, // idesc 0, // idisp ENC_UNKNOWN, // encoding DIRECTION_LTR, // dir FMT_UNKNOWN, // markup 0, // ilang v11n // versification ); } SWFilter *cipherFilter = 0; if (cipherKey.length()) { fprintf(stderr, "Adding cipher filter with phrase: %s\n", cipherKey.c_str() ); cipherFilter = new CipherFilter(cipherKey.c_str()); module->addRawFilter(cipherFilter); } if (!module->isWritable()) { fprintf(stderr, "The module is not writable. Writing text to it will not work.\nExiting.\n" ); exit(EXIT_NO_WRITE); } // Either read from std::cin (aka stdin), when the argument is a '-' // or from a specified file. if (!strcmp(osisDoc, "-")) { processOSIS(cin); } else { // Let's see if we can open our input file ifstream infile(osisDoc); if (infile.fail()) { fprintf(stderr, "ERROR: %s: couldn't open input file: %s \n", program, osisDoc); exit(EXIT_NO_READ); } processOSIS(infile); infile.close(); } delete module; if (cipherFilter) delete cipherFilter; fprintf(stderr, "SUCCESS: %s: has finished its work and will now rest\n", program); exit(0); // success }

if (tokenName == "div" && typeAttr == "book") { if (inBookIntro || inChapterIntro) { // this one should never happen, but just in case if (debug & DEBUG_TITLE) { cout << "DEBUG(TITLE): " << currentOsisID << ": OOPS INTRO " << endl; cout << "\tinChapterIntro = " << inChapterIntro << endl; cout << "\tinBookIntro = " << inBookIntro << endl; } currentVerse.setTestament(0); currentVerse.setBook(0); currentVerse.setChapter(0); currentVerse.setVerse(0); writeEntry(text); } currentVerse = token.getAttribute("osisID"); currentVerse.setChapter(0); currentVerse.setVerse(0); strcpy(currentOsisID, currentVerse.getOSISRef()); sidBook = token.getAttribute("sID"); inChapter = false; inVerse = false; inPreVerse = false; inBookIntro = true; inChapterIntro = false; if (debug & DEBUG_TITLE) { cout << "DEBUG(TITLE): " << currentOsisID << ": Looking for book introduction" << endl; } bookDepth = tagStack.size(); chapterDepth = 0; verseDepth = 0; inCanonicalOSISBook = isOSISAbbrev(token.getAttribute("osisID")); if (!inCanonicalOSISBook) { cout << "WARNING(V11N): New book is " << token.getAttribute("osisID") << " and is not in " << v11n << " versification, ignoring" << endl; } else if (debug & DEBUG_OTHER) { cout << "DEBUG(FOUND): New book is " << currentVerse.getOSISRef() << endl; } return false; } // CHAPTER START, or

if ((tokenName == "chapter") || (tokenName == "div" && typeAttr == "chapter") ) { if (inBookIntro) { if (debug & DEBUG_TITLE) { cout << "DEBUG(TITLE): " << currentOsisID << ": BOOK INTRO "<< text << endl; } writeEntry(text); } currentVerse = token.getAttribute("osisID"); currentVerse.setVerse(0); if (debug & DEBUG_OTHER) { cout << "DEBUG(FOUND): Current chapter is " << currentVerse.getOSISRef() << " (" << token.getAttribute("osisID") << ")" << endl; } strcpy(currentOsisID, currentVerse.getOSISRef()); sidChapter = token.getAttribute("sID"); inChapter = true; inVerse = false; inPreVerse = false; inBookIntro = false; inChapterIntro = true; if (debug & DEBUG_TITLE) { cout << "DEBUG(TITLE): " << currentOsisID << ": Looking for chapter introduction" << endl; } chapterDepth = tagStack.size(); verseDepth = 0; return false; } // VERSE, OR COMMENTARY START,

if ((tokenName == "verse") || (tokenName == "div" && token.getAttribute("annotateType")) ) { if (debug & DEBUG_OTHER) { cout << "DEBUG(FOUND): Entering verse" << endl; } if (inChapterIntro) { if (debug & DEBUG_TITLE) { cout << "DEBUG(TITLE): " << currentOsisID << ": Done looking for chapter introduction" << endl; } if (text.length()) { if (debug & DEBUG_TITLE) { cout << "DEBUG(TITLE): " << currentOsisID << ": CHAPTER INTRO "<< text << endl; } writeEntry(text); } } // Did we have pre-verse material that needs to be marked? if (inPreVerse) { char genBuf[200]; sprintf(genBuf, "

", genID++); text.append(genBuf); } // Get osisID for verse or annotateRef for commentary SWBuf keyVal = token.getAttribute(tokenName == "verse" ? "osisID" : "annotateRef"); // Massage the key into a form that parseVerseList can accept prepareSWVerseKey(keyVal); // The osisID or annotateRef can be more than a single verse // The first or only one is the currentVerse // Use the last verse seen (i.e. the currentVerse) as the basis for recovering from bad parsing. // This should never happen if the references are valid OSIS references ListKey verseKeys = currentVerse.parseVerseList(keyVal, currentVerse, true); int memberKeyCount = verseKeys.getCount(); if (memberKeyCount) { currentVerse = verseKeys.getElement(0); // See if this osisID or annotateRef refers to more than one verse. // If it does, save it until all verses have been seen. // At that point we will output links. // This can be done by incrementing, which will produce an error // if there is only one verse. if (memberKeyCount > 1) { verseKeys.setPosition(TOP); verseKeys.increment(1); if (!verseKeys.popError()) { cout << "DEBUG(LINK): " << currentVerse.getOSISRef() << endl; linkedVerses.push_back(verseKeys); } } } else { cout << "ERROR(REF): Invalid osisID/annotateRef: " << token.getAttribute((tokenName == "verse") ? "osisID" : "annotateRef") << endl; } strcpy(currentOsisID, currentVerse.getOSISRef()); if (debug & DEBUG_OTHER) { cout << "DEBUG(FOUND): New current verse is " << currentVerse.getOSISRef() << endl; cout << "DEBUG(FOUND): osisID/annotateRef is adjusted to: " << keyVal << endl; } sidVerse = token.getAttribute("sID"); inVerse = true; inPreVerse = false; inBookIntro = false; inChapterIntro = false; verseDepth = tagStack.size(); // Include the token if it is not a verse if (tokenName != "verse") { text.append(token); } else if (debug & DEBUG_VERSE) { // transform the verse into a milestone XMLTag t = ""; // copy all the attributes of the verse element to the milestone StringList attrNames = token.getAttributeNames(); for (StringList::iterator loop = attrNames.begin(); loop != attrNames.end(); loop++) { const char* attr = (*loop).c_str(); t.setAttribute(attr, token.getAttribute(attr)); } text.append(t); } if (inWOC) { text.append(wocTag); } return true; } } // done with Handle Book, Chapter, and Verse (or commentary equivalent) // Now consider everything else. // "majorSection" is code for the Book 1-5 of Psalms if (tokenName == "div" && typeAttr == "majorSection") { if (inBookIntro) { if (debug & DEBUG_TITLE) { cout << "DEBUG(TITLE): " << currentOsisID << ": BOOK INTRO "<< text << endl; } writeEntry(text); } if (debug & DEBUG_OTHER) { cout << "DEBUG(FOUND): majorSection found " << currentVerse.getOSISRef() << endl; } strcpy(currentOsisID, currentVerse.getOSISRef()); inChapter = false; inVerse = false; inPreVerse = false; inBookIntro = false; inChapterIntro = true; if (debug & DEBUG_TITLE) { cout << "DEBUG(TITLE): " << currentOsisID << ": Looking for chapter introduction" << endl; } verseDepth = 0; return false; } // Handle WOC quotes. // Note this requires transformBSP to make them into milestones // Otherwise have to do it here if (tokenName == "q") { quoteStack.push(token); if (debug & DEBUG_QUOTE) { cout << "DEBUG(QUOTE): " << currentOsisID << ": quote top(" << quoteStack.size() << ") " << token << endl; } if (token.getAttribute("who") && !strcmp(token.getAttribute("who"), "Jesus")) { inWOC = true; // Output per verse WOC markup. text.append(wocTag); // Output the quotation mark if appropriate, inside the WOC. // If there is no marker attribute, let the SWORD engine manufacture one. // If there is a marker attribute and it has content, then output that. // If the marker attribute is present and empty, then there is nothing to do. // And have it within the WOC markup if (!token.getAttribute("marker") || token.getAttribute("marker")[0]) { token.setAttribute("who", 0); // remove the who="Jesus" text.append(token); } return true; } return false; } // Have we found the start of pre-verse material? // Pre-verse material follows the following rules // 1) Between the opening of a book and the first chapter, all the material is handled as an introduction to the book. // 2) Between the opening of a chapter and the first verse, the material is split between the introduction of the chapter // and the first verse of the chapter. // A

with a type of section will be taken as surrounding verses. // A of type other than main, chapter or sub, will be taken as a title for the verse. // Once one of these conditions is met, the division between chapter introduction and pre-verse is set. // 3) Between verses, the material is split between the prior verse and the next verse. // Basically, while end and empty tags are found, they belong to the prior verse. // Once a begin tag is found, it belongs to the next verse. if (!inPreVerse && !inBookIntro) { if (inChapterIntro) { // Determine when we are no longer in a chapter heading, but in pre-verse material: // If we see one of the following: // a section div // a title that is not main, chapter or sub or unclassified (no type attribute) if ((tokenName == "div" && typeAttr == "section") || (tokenName == "title" && typeAttr.length() != 0 && typeAttr != "main" && typeAttr != "chapter" && typeAttr != "sub") ) { if (debug & DEBUG_TITLE) { cout << "DEBUG(TITLE): " << currentOsisID << ": Done looking for chapter introduction" << endl; } if (text.length()) { if (debug & DEBUG_TITLE) { cout << "DEBUG(TITLE): " << currentOsisID << ": CHAPTER INTRO "<< text << endl; } // Since we have found the boundary, we need to write out the chapter heading writeEntry(text); } // And we are no longer in the chapter heading inChapterIntro = false; // But rather, we are now in pre-verse material inPreVerse = true; } } else if (!inVerse && inChapter) { inPreVerse = true; } if (inPreVerse) { char genBuf[200]; sprintf(genBuf, "<div type=\"x-milestone\" subType=\"x-preverse\" sID=\"pv%d\"/>", genID); text.append(genBuf); } } if (debug & DEBUG_INTERVERSE) { if (!inVerse && !inBookIntro && !inChapterIntro) { cout << "DEBUG(INTERVERSE): " << currentOsisID << ": interverse start token " << token << ":" << text.c_str() << endl; } } return false; } // Done with procesing start and empty tags // Process end tags else { if (tagStack.empty()) { cout << "FATAL(NESTING): " << currentOsisID << ": tag expected" << endl; exit(EXIT_BAD_NESTING); } // Note: empty end tags have the eID attribute if (!token.isEmpty()) { XMLTag topToken = tagStack.top(); tagDepth = tagStack.size(); if (debug & DEBUG_STACK) { cout << "DEBUG(STACK): " << currentOsisID << ": pop(" << tagDepth << ") " << topToken.getName() << endl; } tagStack.pop(); if (tokenName != topToken.getName()) { cout << "FATAL(NESTING): " << currentOsisID << ": Expected " << topToken.getName() << " found " << tokenName << endl; // exit(EXIT_BAD_NESTING); // (OSK) I'm sure this validity check is a good idea, but there's a but somewhere that's killing the converter here. // So I'm disabling this line. Unvalidated OSIS files shouldn't be run through the converter anyway. // (DM) This has nothing to do with well-form or valid. It checks milestoned elements for proper nesting. } } // We haven't seen the first div outside the header so there is little to do. if (!firstDiv) { if (tokenName == "header") { headerEnded = true; if (debug & DEBUG_OTHER) { cout << "DEBUG(FOUND): End of header found" << endl; } } // Collect the content so it can be used to suggest the module's conf. return false; } // VERSE and COMMENTARY END if ((tokenName == "verse") || (tokenName == "div" && eidAttr == sidVerse) ) { if (tagDepth != verseDepth) { cout << "WARNING(NESTING): verse " << currentOsisID << " is not well formed:(" << verseDepth << "," << tagDepth << ")" << endl; } // If we are in WOC then we need to terminate the <q who="Jesus" marker=""> that was added earlier in the verse. if (inWOC) { text.append("</q>"); } // Include the token if it is not a verse if (tokenName != "verse") { text.append(token); } else if (debug & DEBUG_VERSE) { // transform the verse into a milestone XMLTag t = "<milestone resp=\"v\" />"; // copy all the attributes of the verse element to the milestone StringList attrNames = token.getAttributeNames(); for (StringList::iterator loop = attrNames.begin(); loop != attrNames.end(); loop++) { const char* attr = (*loop).c_str(); t.setAttribute(attr, token.getAttribute(attr)); } text.append(t); } writeEntry(text); inVerse = false; inPreVerse = false; verseDepth = 0; return true; } // Handle WOC quotes. // Note this requires transformBSP to make them into milestones // Otherwise have to manage it here if (tokenName == "q") { XMLTag topToken = quoteStack.top(); if (debug & DEBUG_QUOTE) { cout << "DEBUG(QUOTE): " << currentOsisID << ": quote pop(" << quoteStack.size() << ") " << topToken << " -- " << token << endl; } quoteStack.pop(); // If we have found an end tag for a <q who="Jesus"> then we are done with the WOC // and we need to terminate the <q who="Jesus" marker=""> that was added earlier in the verse. if (token.getAttribute("who") && !strcmp(token.getAttribute("who"), "Jesus")) { if (debug & DEBUG_QUOTE) { cout << "DEBUG(QUOTE): " << currentOsisID << ": (" << quoteStack.size() << ") " << topToken << " -- " << token << endl; } inWOC = false; const char *sID = topToken.getAttribute("sID"); const char *eID = token.getAttribute("eID"); if (!sID) { sID = ""; } if (!eID) { eID = ""; } if (strcmp(sID, eID)) { cout << "ERROR(NESTING): improper nesting " << currentOsisID << ": matching (sID,eID) not found. Looking at (" << sID << "," << eID << ")" << endl; } // Output the quotation mark if appropriate, inside the WOC. // If there is no marker attribute, let the SWORD engine manufacture one. // If there is a marker attribute and it has content, then output that. // If the marker attribute is present and empty, then there is nothing to do. // And have it within the WOC markup if (!token.getAttribute("marker") || token.getAttribute("marker")[0]) { token.setAttribute("who", 0); // remove the who="Jesus" text.append(token); } // Now close the WOC text.append("</q>"); return true; } return false; } // Look for the end of document, book and chapter // Also for material that goes with last entry if (!inVerse && !inBookIntro && !inChapterIntro) { // Is this the end of a chapter. if ((tokenName == "chapter") || (tokenName == "div" && eidAttr == sidChapter) ) { text.append(token); writeEntry(text); inChapter = false; sidChapter = ""; chapterDepth = 0; verseDepth = 0; return true; } // Is it the end of a book if (tokenName == "div" && eidAttr == sidBook) { text.append(token); writeEntry(text); bookDepth = 0; chapterDepth = 0; verseDepth = 0; return true; } // Do not include the end of an osis document if (tokenName == "osisText" || tokenName == "osis") { bookDepth = 0; chapterDepth = 0; verseDepth = 0; text = ""; return true; } // When we are not inPreVerse, the interverse tags get appended to the preceeding verse. if (!inPreVerse) { text.append(token); writeEntry(text); if (debug & DEBUG_INTERVERSE) { cout << "DEBUG(INTERVERSE): " << currentOsisID << ": appending interverse end tag: " << tokenName << "(" << tagDepth << "," << chapterDepth << "," << bookDepth << ")" << endl; } return true; } if (debug & DEBUG_INTERVERSE) { cout << "DEBUG(INTERVERSE): " << currentOsisID << ": interverse end tag: " << tokenName << "(" << tagDepth << "," << chapterDepth << "," << bookDepth << ")" << endl; } return false; } return false; } // done with Processing end tags return false; } /** * Support normalizations necessary for a SWORD module. * OSIS allows for document structure (Book, Section, Paragraph or BSP) * to overlap Bible versification (Book, Chapter, Verse). * Most SWORD applications need to display verses in isolation or in HTML table cells, * requiring each stored entry (i.e. verses) to be well-formed xml. * This routine normalizes container elements which could cross verse boundaries into milestones. * For most of these OSIS elements, there is a milestone form. However, p is not milestoneable. * For this reason, p is transformed into div elements with type x-p. * param t the tag to transform * return the transformed tag or the original one */ XMLTag transformBSP(XMLTag t) { static std::stack<XMLTag> bspTagStack; static int sID = 1; char buf[11]; // Support simplification transformations if (t.isEmpty()) { if (debug & DEBUG_XFORM) { cout << "DEBUG(XFORM): " << currentOsisID << ": xform empty " << t << endl; } return t; } SWBuf tagName = t.getName(); if (!t.isEndTag()) { // Transform <p> into <div type="x-p"> and milestone it if (tagName == "p") { t.setText("<div type=\"x-p\" />"); sprintf(buf, "gen%d", sID++); t.setAttribute("sID", buf); } // Transform <tag> into <tag sID="">, where tag is a milestoneable element. // The following containers are milestoneable. // abbr, closer, div, foreign, l, lg, salute, signed, speech // Leaving out: // abbr When would this ever cross a boundary? // seg as it is used for a divineName hack // foreign so that it can be easily italicized else if (tagName == "chapter" || tagName == "closer" || tagName == "div" || tagName == "l" || tagName == "lg" || tagName == "q" || tagName == "salute" || tagName == "signed" || tagName == "speech" || tagName == "verse" ) { t.setEmpty(true); sprintf(buf, "gen%d", sID++); t.setAttribute("sID", buf); } bspTagStack.push(t); if (debug & DEBUG_XFORM) { cout << "DEBUG(XFORM): " << currentOsisID << ": xform push (" << bspTagStack.size() << ") " << t << " (tagname=" << tagName << ")" << endl; XMLTag topToken = bspTagStack.top(); cout << "DEBUG(XFORM): " << currentOsisID << ": xform top(" << bspTagStack.size() << ") " << topToken << endl; } } else { if (!bspTagStack.empty()) { XMLTag topToken = bspTagStack.top(); if (debug & DEBUG_XFORM) { cout << "DEBUG(XFORM): " << currentOsisID << ": xform pop(" << bspTagStack.size() << ") " << topToken << endl; } bspTagStack.pop(); // Look for the milestoneable container tags handled above. if (tagName == "chapter" || tagName == "closer" || tagName == "div" || tagName == "l" || tagName == "lg" || tagName == "p" || tagName == "q" || tagName == "salute" || tagName == "signed" || tagName == "speech" || tagName == "verse" ) { // make this a clone of the start tag with sID changed to eID // Note: in the case of </p> the topToken is a <div type="x-p"> t = topToken; t.setAttribute("eID", t.getAttribute("sID")); t.setAttribute("sID", 0); } } else { cout << "FATAL(TAGSTACK): " << currentOsisID << ": closing tag without opening tag" << endl; } } return t; } /** * Write out all links in the module. * Waiting is necessary because writeEntry might ultimately append * text to a verse moving it's offset in the data file. * While we are minimizing it by postponing the write until we have * gathered the next verse, the following scenario is happening: * A module is using linked verses and has some verses that are not * in the chosen versification. If the out-of-canon verse happens following * a linked verse, the out-of-canon verse is appended to the prior * verse. Care has to be taken that the linked verses all point to * the first of the set. */ void writeLinks() { // Link all the verses VerseKey destKey; destKey.setVersificationSystem(currentVerse.getVersificationSystem()); destKey.setAutoNormalize(0); destKey.setIntros(1); VerseKey linkKey; linkKey.setVersificationSystem(currentVerse.getVersificationSystem()); linkKey.setAutoNormalize(0); linkKey.setIntros(1); for (unsigned int i = 0; i < linkedVerses.size(); i++) { // The verseKeys is a list of verses // where the first is the real verse // and the others link to it. ListKey verseKeys = linkedVerses[i]; verseKeys.setPosition(TOP); destKey = verseKeys.getElement(); verseKeys.increment(1); while (!verseKeys.popError()) { linkKey = verseKeys.getElement(); linkToEntry(linkKey, destKey); verseKeys.increment(1); } } } void usage(const char *app, const char *error = 0, const bool verboseHelp = false) { if (error) fprintf(stderr, "\n%s: %s\n", app, error); fprintf(stderr, "OSIS Bible/commentary module creation tool for The SWORD Project\n"); fprintf(stderr, "\nusage: %s <output/path> <osisDoc> [OPTIONS]\n", app); fprintf(stderr, " <output/path>\t\t an existing folder that the module will be written\n"); fprintf(stderr, " <osisDoc>\t\t path to the validated OSIS document, or '-' to\n"); fprintf(stderr, "\t\t\t\t read from standard input\n"); fprintf(stderr, " -a\t\t\t augment module if exists (default is to create new)\n"); fprintf(stderr, " -z\t\t\t use ZIP compression (default no compression)\n"); fprintf(stderr, " -Z\t\t\t use LZSS compression (default no compression)\n"); fprintf(stderr, " -b <2|3|4>\t\t compression block size (default 4):\n"); fprintf(stderr, "\t\t\t\t 2 - verse; 3 - chapter; 4 - book\n"); fprintf(stderr, " -c <cipher_key>\t encipher module using supplied key\n"); fprintf(stderr, "\t\t\t\t (default no enciphering)\n"); #ifdef _ICU_ fprintf(stderr, " -N\t\t\t do not convert UTF-8 or normalize UTF-8 to NFC\n"); if (verboseHelp) { fprintf(stderr, "\t\t\t\t (default is to convert to UTF-8, if needed,\n"); fprintf(stderr, "\t\t\t\t and then normalize to NFC)\n"); fprintf(stderr, "\t\t\t\t Note: UTF-8 texts should be normalized to NFC.\n"); } #endif fprintf(stderr, " -s <2|4>\t\t bytes used to store entry size (default is 2).\n"); if (verboseHelp) { fprintf(stderr, "\t\t\t\t Note: useful for commentaries with very large\n"); fprintf(stderr, "\t\t\t\t entries in uncompressed modules\n"); fprintf(stderr, "\t\t\t\t (2 bytes to store size equal 65535 characters)\n"); } fprintf(stderr, " -v <v11n>\t\t specify a versification scheme to use (default is KJV)\n"); fprintf(stderr, "\t\t\t\t Note: The following are valid values for v11n:"); VersificationMgr *vmgr = VersificationMgr::getSystemVersificationMgr(); StringList av11n = vmgr->getVersificationSystems(); for (StringList::iterator loop = av11n.begin(); loop != av11n.end(); loop++) { if ((distance(av11n.begin(), loop) % 3) == 0) { fprintf(stderr, "\n\t\t\t\t %-12s", (*loop).c_str()); } else { fprintf(stderr, "\t%-12s", (*loop).c_str()); } } fprintf(stderr, "\n"); if (verboseHelp) { fprintf(stderr, " -d <flags>\t\t turn on debugging (default is 0)\n"); fprintf(stderr, "\t\t\t\t Note: This flag may change in the future.\n"); fprintf(stderr, "\t\t\t\t Flags: The following are valid values:\n"); fprintf(stderr, "\t\t\t\t\t0 - no debugging\n"); fprintf(stderr, "\t\t\t\t\t1 - writes to module, very verbose\n"); fprintf(stderr, "\t\t\t\t\t2 - verse start and end\n"); fprintf(stderr, "\t\t\t\t\t4 - quotes, esp. Words of Christ\n"); fprintf(stderr, "\t\t\t\t\t8 - titles\n"); fprintf(stderr, "\t\t\t\t\t16 - inter-verse material\n"); fprintf(stderr, "\t\t\t\t\t32 - BSP to BCV transformations\n"); fprintf(stderr, "\t\t\t\t\t64 - v11n exceptions\n"); fprintf(stderr, "\t\t\t\t\t128 - parsing of osisID and osisRef\n"); fprintf(stderr, "\t\t\t\t\t256 - internal stack\n"); fprintf(stderr, "\t\t\t\t\t512 - miscellaneous\n"); fprintf(stderr, "\t\t\t\t This argument can be used more than once. (Or\n"); fprintf(stderr, "\t\t\t\t the flags may be added together.)\n"); } fprintf(stderr, " -h \t\t\t print verbose usage text\n"); fprintf(stderr, "\n"); fprintf(stderr, "See http://www.crosswire.org/wiki/osis2mod for more details.\n"); fprintf(stderr, "\n"); exit(EXIT_BAD_ARG); } void processOSIS(istream& infile) { typedef enum { CS_NOT_IN_COMMENT, // or seen starting "<" CS_SEEN_STARTING_EXCLAMATION, CS_SEEN_STARTING_HYPHEN, CS_IN_COMMENT, CS_SEEN_ENDING_HYPHEN, CS_SEEN_SECOND_ENDING_HYPHEN, CS_SEEN_ENDING_GREATER_THAN } t_commentstate; activeOsisID[0] = '\0'; strcpy(currentOsisID,"N/A"); currentVerse.setVersificationSystem(v11n); currentVerse.setAutoNormalize(false); currentVerse.setIntros(true); // turn on mod/testmnt/book/chap headings currentVerse.setPersist(true); module->setKey(currentVerse); module->setPosition(TOP); SWBuf token; SWBuf text; bool incomment = false; t_commentstate commentstate = CS_NOT_IN_COMMENT; bool intoken = false; bool inWhitespace = false; bool seeingSpace = false; unsigned char curChar = '\0'; while (infile.good()) { int possibleChar = infile.get(); // skip the character if it is bad. infile.good() will catch the problem if (possibleChar == -1) { continue; } curChar = (unsigned char) possibleChar; // All newlines are simply whitespace // Does a SWORD module actually require this? if (curChar == '\n') { curChar = ' '; } if (!intoken && curChar == '<') { intoken = true; token = "<"; continue; } // Handle XML comments starting with "" if (intoken && !incomment) { switch (commentstate) { case CS_NOT_IN_COMMENT : if (curChar == '!') { commentstate = CS_SEEN_STARTING_EXCLAMATION; token.append((char) curChar); continue; } else { break; } case CS_SEEN_STARTING_EXCLAMATION : if (curChar == '-') { commentstate = CS_SEEN_STARTING_HYPHEN; token.append((char) curChar); continue; } else { commentstate = CS_NOT_IN_COMMENT; break; } case CS_SEEN_STARTING_HYPHEN : if (curChar == '-') { incomment = true; commentstate = CS_IN_COMMENT; token.append((char) curChar); if (debug & DEBUG_OTHER) { cout << "DEBUG(COMMENTS): in comment" << endl; } continue; } else { commentstate = CS_NOT_IN_COMMENT; break; } default: cout << "FATAL(COMMENTS): unknown commentstate on comment start: " << commentstate << endl; exit(EXIT_BAD_NESTING); } } if (incomment) { switch (commentstate) { case CS_IN_COMMENT: if (curChar == '-') { commentstate = CS_SEEN_ENDING_HYPHEN; continue; } else { // ignore the character continue; } case CS_SEEN_ENDING_HYPHEN : if (curChar == '-') { commentstate = CS_SEEN_SECOND_ENDING_HYPHEN; continue; } else { // ignore character commentstate = CS_IN_COMMENT; continue; } case CS_SEEN_SECOND_ENDING_HYPHEN : if (curChar == '>') { intoken = false; incomment = false; commentstate = CS_NOT_IN_COMMENT; if (debug & DEBUG_OTHER) { cout << "DEBUG(COMMENTS): out of comment" << endl; } continue; } else { // ignore character commentstate = CS_IN_COMMENT; continue; } default: cout << "FATAL(COMMENTS): unknown commentstate on comment end: " << commentstate << endl; exit(EXIT_BAD_NESTING); } } // Outside of tokens merge adjacent whitespace if (!intoken) { seeingSpace = isspace(curChar)!=0; if (seeingSpace) { if (inWhitespace) { continue; } // convert all whitespace to blanks curChar = ' '; } inWhitespace = seeingSpace; } if (intoken && curChar == '>') { intoken = false; inWhitespace = false; token.append('>'); // take this isalpha if out to check for bugs in text if (isalpha(token[1]) || (((token[1] == '/') || (token[1] == '?')) && isalpha(token[2]))) { //cout << "Handle:" << token.c_str() << endl; XMLTag t = transformBSP(token.c_str()); if (!handleToken(text, t)) { text.append(t); } } else { cout << "WARNING(PARSE): malformed token: " << token << endl; } continue; } if (intoken) { token.append((char) curChar); } else { switch (curChar) { case '>' : text.append(">"); break; case '<' : text.append("<"); break; default : text.append((char) curChar); break; } } } // Force the last entry from the text buffer. text = ""; writeEntry(text, true); writeLinks(); #ifdef _ICU_ if (converted) fprintf(stderr, "osis2mod converted %d verses to UTF-8\n", converted); if (normalized) fprintf(stderr, "osis2mod normalized %d verses to NFC\n", normalized); #endif } int main(int argc, char **argv) { fprintf(stderr, "You are running osis2mod: $Rev: 3314 $\n"); if (argc > 1) { for (int i = 1; i < argc; i++) { if (!strcmp(argv[i], "-h") || !strcmp(argv[i], "--help")) { usage(*argv, "", true); } } } // Let's test our command line arguments if (argc < 3) { usage(*argv); } // variables for arguments, holding defaults const char* program = argv[0]; const char* path = argv[1]; const char* osisDoc = argv[2]; int append = 0; SWBuf compType = ""; bool isCommentary = false; int iType = 4; int entrySize = 0; SWBuf cipherKey = ""; SWCompress *compressor = 0; for (int i = 3; i < argc; i++) { if (!strcmp(argv[i], "-a")) { append = 1; } else if (!strcmp(argv[i], "-z")) { if (compType.size()) usage(*argv, "Cannot specify both -z and -Z"); if (entrySize) usage(*argv, "Cannot specify both -z and -s"); compType = "ZIP"; } else if (!strcmp(argv[i], "-Z")) { if (compType.size()) usage(*argv, "Cannot specify both -z and -Z"); if (entrySize) usage(*argv, "Cannot specify both -Z and -s"); compType = "LZSS"; } else if (!strcmp(argv[i], "-b")) { if (i+1 < argc) { iType = atoi(argv[++i]); if ((iType >= 2) && (iType <= 4)) continue; } usage(*argv, "-b requires one of <2|3|4>"); } else if (!strcmp(argv[i], "-N")) { normalize = false; } else if (!strcmp(argv[i], "-c")) { if (i+1 < argc) cipherKey = argv[++i]; else usage(*argv, "-c requires <cipher_key>"); } else if (!strcmp(argv[i], "-v")) { if (i+1 < argc) v11n = argv[++i]; else usage(*argv, "-v requires <v11n>"); } else if (!strcmp(argv[i], "-s")) { if (compType.size()) usage(*argv, "Cannot specify -s and -z or -Z"); if (i+1 < argc) { entrySize = atoi(argv[++i]); if (entrySize == 2 || entrySize == 4) { continue; } } usage(*argv, "-s requires one of <2|4>"); } else if (!strcmp(argv[i], "-C")) { isCommentary = true; } else if (!strcmp(argv[i], "-d")) { if (i+1 < argc) debug |= atoi(argv[++i]); else usage(*argv, "-d requires <flags>"); } else usage(*argv, (((SWBuf)"Unknown argument: ")+ argv[i]).c_str()); } if (isCommentary) isCommentary = true; // avoid unused warning for now if (compType == "ZIP") { #ifndef EXCLUDEZLIB compressor = new ZipCompress(); #else usage(*argv, "ERROR: SWORD library not compiled with ZIP compression support.\n\tBe sure libzip is available when compiling SWORD library"); #endif } else if (compType == "LZSS") { compressor = new LZSSCompress(); } #ifndef _ICU_ if (normalize) { normalize = false; cout << "WARNING(UTF8): " << program << " is not compiled with support for ICU. Assuming -N." << endl; } #endif if (debug & DEBUG_OTHER) { cout << "DEBUG(ARGS):\n\tpath: " << path << "\n\tosisDoc: " << osisDoc << "\n\tcreate: " << append << "\n\tcompressType: " << compType << "\n\tblockType: " << iType << "\n\tcipherKey: " << cipherKey.c_str() << "\n\tnormalize: " << normalize << endl; } if (!append) { // == 0 then create module // Try to initialize a default set of datafiles and indicies at our // datapath location passed to us from the user. if (compressor) { if (zText::createModule(path, iType, v11n)) { fprintf(stderr, "ERROR: %s: couldn't create module at path: %s \n", program, path); exit(EXIT_NO_CREATE); } } else if (entrySize == 4) { if (RawText4::createModule(path, v11n)) { fprintf(stderr, "ERROR: %s: couldn't create module at path: %s \n", program, path); exit(EXIT_NO_CREATE); } } else { if (RawText::createModule(path, v11n)) { fprintf(stderr, "ERROR: %s: couldn't create module at path: %s \n", program, path); exit(EXIT_NO_CREATE); } } } // Do some initialization stuff if (compressor) { // Create a compressed text module allowing very large entries // Taking defaults except for first, fourth, fifth and last argument module = new zText( path, // ipath 0, // iname 0, // idesc iType, // iblockType compressor, // icomp 0, // idisp ENC_UNKNOWN, // enc DIRECTION_LTR, // dir FMT_UNKNOWN, // markup 0, // lang v11n // versification ); } else if (entrySize == 4) { // Create a raw text module allowing very large entries // Taking defaults except for first and last argument module = new RawText4( path, // ipath 0, // iname 0, // idesc 0, // idisp ENC_UNKNOWN, // encoding DIRECTION_LTR, // dir FMT_UNKNOWN, // markup 0, // ilang v11n // versification ); } else { // Create a raw text module allowing reasonable sized entries // Taking defaults except for first and last argument module = new RawText( path, // ipath 0, // iname 0, // idesc 0, // idisp ENC_UNKNOWN, // encoding DIRECTION_LTR, // dir FMT_UNKNOWN, // markup 0, // ilang v11n // versification ); } SWFilter *cipherFilter = 0; if (cipherKey.length()) { fprintf(stderr, "Adding cipher filter with phrase: %s\n", cipherKey.c_str() ); cipherFilter = new CipherFilter(cipherKey.c_str()); module->addRawFilter(cipherFilter); } if (!module->isWritable()) { fprintf(stderr, "The module is not writable. Writing text to it will not work.\nExiting.\n" ); exit(EXIT_NO_WRITE); } // Either read from std::cin (aka stdin), when the argument is a '-' // or from a specified file. if (!strcmp(osisDoc, "-")) { processOSIS(cin); } else { // Let's see if we can open our input file ifstream infile(osisDoc); if (infile.fail()) { fprintf(stderr, "ERROR: %s: couldn't open input file: %s \n", program, osisDoc); exit(EXIT_NO_READ); } processOSIS(infile); infile.close(); } delete module; if (cipherFilter) delete cipherFilter; fprintf(stderr, "SUCCESS: %s: has finished its work and will now rest\n", program); exit(0); // success }