[sword-cvs] sword/utilities osis2mod.cpp,NONE,1.1 Makefile.am,1.15,1.16
sword@www.crosswire.org
sword@www.crosswire.org
Mon, 26 May 2003 11:57:30 -0700
Update of /usr/local/cvsroot/sword/utilities
In directory www:/tmp/cvs-serv26339/utilities
Modified Files:
Makefile.am
Added Files:
osis2mod.cpp
Log Message:
Added first cut of osis2mod using the ESV as a
basic template. Works with VerseKey mods
only and doesn't capture all data.
--- NEW FILE: osis2mod.cpp ---
#include <ctype.h>
#include <stdio.h>
#include <fcntl.h>
#include <errno.h>
#include <stdlib.h>
#ifndef __GNUC__
#include <io.h>
#else
#include <unistd.h>
#endif
#include <swmgr.h>
#include <rawtext.h>
#include <iostream>
#include <swbuf.h>
#include <utilxml.h>
#ifndef O_BINARY
#define O_BINARY 0
#endif
#ifndef NO_SWORD_NAMESPACE
using namespace sword;
#endif
using namespace std;
/**
 * Reads the next line from a seekable file descriptor.
 *
 * Leading carriage returns, spaces and tabs are skipped, the line is copied
 * into a freshly allocated NUL-terminated buffer, and trailing CR / LF /
 * space / tab characters are trimmed from it.  The file position is left
 * just past the line feed that ended the line (or at end of file).
 *
 * @param fd   open descriptor; it must support lseek(), because the line is
 *             first scanned for its length and then re-read in one piece
 * @param buf  in/out: any buffer left from a previous call is delete[]'d and
 *             replaced; the caller owns (and must delete[]) the final buffer
 * @return non-zero when nothing more could be read (end of file or a read
 *         error with no pending data), zero when *buf holds a line.  A last
 *         line that is not terminated by a line feed is still returned with
 *         zero; the following call then reports the end of input.
 */
char readline(int fd, char **buf) {
	char ch = 0;
	int len = 0;

	// release whatever a previous call handed out (delete[] of null is a no-op)
	delete [] *buf;
	*buf = 0;

	// offset of the first character that will be part of the returned line
	long start = lseek(fd, 0, SEEK_CUR);

	// clean up any preceding white space
	while ((len = read(fd, &ch, 1)) == 1) {
		if ((ch != 13) && (ch != ' ') && (ch != '\t'))
			break;
		start++;
	}

	// scan forward to the line feed; stops early on end of file or error
	while ((len == 1) && (ch != 10))
		len = read(fd, &ch, 1);

	// len is still 1 only if the scan stopped on a line feed
	const bool terminated = (len == 1);

	const long end = lseek(fd, 0, SEEK_CUR);
	long size = end - start;
	if (terminated)
		size--;				// the line feed is not part of the text
	if (size < 0)
		size = 0;			// defensive: lseek() failure

	*buf = new char [ size + 1 ];
	long kept = 0;
	if (size > 0) {
		// go back and fetch the whole line in one read
		lseek(fd, start, SEEK_SET);
		kept = read(fd, *buf, size);
		if (kept < 0)
			kept = 0;
		// continue after the terminating char on the next call
		lseek(fd, end, SEEK_SET);

		// clean up any trailing junk on buf (the first char is never junk)
		while (kept > 1) {
			const char last = (*buf)[kept - 1];
			if ((last != 10) && (last != 13) && (last != ' ') && (last != '\t'))
				break;
			kept--;
		}
	}
	(*buf)[kept] = 0;

	// only report "done" once there is no data left to hand back
	return ((len != 1) && (size == 0)) ? 1 : 0;
}
bool isKJVRef(const char *buf) {
VerseKey vk, test;
vk.AutoNormalize(0);
vk.Headings(1); // turn on mod/testmnt/book/chap headings
vk.Persist(1);
// lets do some tests on the verse --------------
vk = buf;
test = buf;
if (vk.Testament() && vk.Book() && vk.Chapter() && vk.Verse()) { // if we're not a heading
// std::cerr << (const char*)vk << " == " << (const char*)test << std::endl;
return (vk == test);
}
else return true; // no check if we're a heading... Probably bad.
}
/**
 * Dumps one entry to standard output: a "Verse: " line carrying the key,
 * then a "TEXT: " line carrying the text, followed by a blank line.
 *
 * @param key   key the entry belongs to
 * @param text  collected content of the entry
 */
void writeEntry(VerseKey &key, SWBuf &text) {
	std::cout << "Verse: " << key << "\n"
	          << "TEXT: " << text << "\n\n";
}
bool handleToken(SWBuf &text, XMLTag token) {
static VerseKey currentVerse;
static bool inHeader = false;
static SWBuf headerType = "";
static SWBuf header = "";
static SWBuf lastTitle = "";
static int titleOffset = -1;
currentVerse.Headings(0);
currentVerse.AutoNormalize(0);
if ((!strcmp(token.getName(), "title")) && (!token.isEndTag())) {
titleOffset = text.length();
return false;
}
if ((!strcmp(token.getName(), "title")) && (token.isEndTag())) {
lastTitle = (text.c_str() + titleOffset);
lastTitle += token;
return false;
}
if ((!strcmp(token.getName(), "div")) && (!token.isEndTag()) && (token.getAttribute("osisID"))) {
if (!strcmp(token.getAttribute("type"), "book")) {
if (inHeader) { // this one should never happen, but just in case
cout << "HEADING ";
writeEntry(currentVerse, text);
inHeader = false;
}
currentVerse = token.getAttribute("osisID");
currentVerse.Chapter(0);
currentVerse.Verse(0);
inHeader = true;
headerType = "book";
lastTitle = "";
text = "";
}
if (!strcmp(token.getAttribute("type"), "chapter")) {
if (inHeader) {
cout << "HEADING ";
writeEntry(currentVerse, text);
inHeader = false;
}
currentVerse = token.getAttribute("osisID");
currentVerse.Verse(0);
inHeader = true;
headerType = "chap";
lastTitle = "";
text = "";
}
}
if ((!strcmp(token.getName(), "verse")) && (!token.isEndTag())) {
if (inHeader) {
cout << "HEADING ";
writeEntry(currentVerse, text);
inHeader = false;
}
currentVerse = token.getAttribute("osisID");
text = "";
return true;
}
if ((!strcmp(token.getName(), "verse")) && (token.isEndTag())) {
if (lastTitle.length()) {
SWBuf titleHead = lastTitle;
char *end = strchr(lastTitle.getRawData(), '>');
titleHead.setSize((end - lastTitle.getRawData())+1);
XMLTag titleTag(titleHead);
titleTag.setAttribute("type", "section");
titleTag.setAttribute("subtype", "x-preverse");
text = SWBuf(titleTag) + SWBuf(end+1) + text;
}
writeEntry(currentVerse, text);
lastTitle = "";
text = "";
return true;
}
return false;
}
/**
 * osis2mod entry point.
 *
 * Usage: osis2mod <path/to/mod/files> <osisDoc> [0|1 - create|augment module]
 *
 * Reads the OSIS document line by line, splits it into tags and text and
 * feeds every tag to handleToken(); tags it does not consume are kept as
 * part of the running text.
 *
 * Exit codes: -1 bad usage, -2 input file cannot be opened,
 *             -3 module creation failed.
 */
int main(int argc, char **argv) {

	// Let's test our command line arguments
	if (argc < 3) {
		fprintf(stderr, "usage: %s <path/to/mod/files> <osisDoc> [0|1 - create|augment module]\n\n", argv[0]);
		exit(-1);
	}

	if ((argc>3)&&(strcmp(argv[3], "1"))) {	// != 1 then create module
		// Try to initialize a default set of datafiles and indices at our
		// datapath location passed to us from the user.
		if (RawText::createModule(argv[1])) {
			fprintf(stderr, "error: %s: couldn't create module at path: %s \n", argv[0], argv[1]);
			exit(-3);
		}
		// NOTE(review): the program stops right after creating the module,
		// without importing the document -- confirm this is intended.
		exit(0);
	}

	// Let's see if we can open our input file
	int fd = open(argv[2], O_RDONLY|O_BINARY);
	if (fd < 0) {
		fprintf(stderr, "error: %s: couldn't open input file: %s \n", argv[0], argv[2]);
		exit(-2);
	}

	// Do some initialization stuff
	char *buffer = 0;
	RawText mod(argv[1]);	// open our datapath with our RawText driver.
	VerseKey vk;
	vk.AutoNormalize(0);
	vk.Headings(1);	// turn on mod/testmnt/book/chap headings
	vk.Persist(1);

	mod.setKey(vk);
	mod = TOP;

	char *from;
	SWBuf token;
	SWBuf text;
	bool intoken = false;

	while (!readline(fd, &buffer)) {
		for (from = buffer; *from; from++) {
			if (*from == '<') {
				intoken = true;
				token = "<";
				continue;
			}

			// A '>' only closes a tag when one is open; a literal '>'
			// in text content falls through and is kept as text.
			if ((*from == '>') && intoken) {
				intoken = false;
				token += ">";
				// tags handleToken() does not consume stay in the text
				if (!handleToken(text, token.c_str())) {
					text += token;
				}
				continue;
			}

			if (intoken)
				token += *from;
			else	text += *from;
		}
	}

	// clear up our buffer that readline might have allocated
	if (buffer)
		delete [] buffer;

	// done with the input document
	close(fd);

	return 0;
}
/*
string verseText = "";
// chapter number
if (!strncmp("$$$ ", buffer, 4)) {
buffer[7] = 0;
chapter = atoi(buffer+4);
continue;
}
// header
if (!strncmp("<TD COLSPAN=4 VALIGN=TOP><FONT SIZE=2><B>", buffer, 41)) {
char *end = strstr(buffer+41, "</B>");
*end = 0;
header = buffer+41;
continue;
}
// verse number
if (!strncmp("<TD VALIGN=TOP ALIGN=RIGHT WIDTH=12><FONT SIZE=2 COLOR=RED><B><SUP>", buffer, 67)) {
char *end = strstr(buffer+67, "</SUP>");
*end = 0;
verse = atoi(buffer+67);
continue;
}
// Actual verse data
if (!strncmp("<TD VALIGN=TOP><FONT SIZE=2>", buffer, 28)) {
char *end = strstr(buffer+28, "</FONT>");
*end = 0;
}
// extra
else {
continue;
}
verseText = buffer + 28;
if (header.length()) {
verseText = "<title type=\"section\" subtype=\"x-preverse\">" + header + "</title>" + verseText;
header = "";
}
string vsbuf = argv[3];
sprintf(tmpBuf, "%i", chapter);
vsbuf += ((string)" ") + tmpBuf;
sprintf(tmpBuf, "%i", verse);
vsbuf += ((string)":") + tmpBuf;
vk = vsbuf.c_str();
if (vk.Error()) {
std::cerr << "Error parsing key: " << vsbuf << "\n";
exit(-5);
}
string orig = mod.getRawEntry();
if (!isKJVRef(vsbuf.c_str())) {
VerseKey origVK = vk;
// This block is functioning improperly -- problem with AutoNormalize???
// do {
// vk--;
// }
// while (!vk.Error() && !isKJVRef(vk));
//hack to replace above:
successive++;
vk -= successive;
orig = mod.getRawEntry();
std::cerr << "Not a valid KJV ref: " << origVK << "\n";
std::cerr << "appending to ref: " << vk << "\n";
orig += " [ (";
orig += origVK;
orig += ") ";
orig += verseText;
orig += " ] ";
verseText = orig.c_str();
}
else {
successive = 0;
}
if (orig.length() > 1)
std::cerr << "Warning, overwriting verse: " << vk << std::endl;
// ------------- End verse tests -----------------
std::cout << "adding "<< vk << "\n";
mod << verseText.c_str(); // save text to module at current position
}
*/
Index: Makefile.am
===================================================================
RCS file: /usr/local/cvsroot/sword/utilities/Makefile.am,v
retrieving revision 1.15
retrieving revision 1.16
diff -C2 -d -r1.15 -r1.16
*** Makefile.am 13 Feb 2003 22:27:53 -0000 1.15
--- Makefile.am 26 May 2003 18:57:28 -0000 1.16
***************
*** 5,9 ****
noinst_PROGRAMS = cipherraw ciphertest ciphertest2 lexdump mkfastmod \
mod2vpl vpl2mod stepdump step2vpl mod2zmod gbfidx modwrite addvs addld emptyvss \
! txt2sword addgb imp2gbs imp2ld imp2vs mod2imp thml2gbs mod2osis xml2gbs
if ICU
--- 5,9 ----
noinst_PROGRAMS = cipherraw ciphertest ciphertest2 lexdump mkfastmod \
mod2vpl vpl2mod stepdump step2vpl mod2zmod gbfidx modwrite addvs addld emptyvss \
! txt2sword addgb imp2gbs imp2ld imp2vs mod2imp thml2gbs mod2osis xml2gbs osis2mod
if ICU
***************
*** 43,44 ****
--- 43,45 ----
mod2osis_SOURCES = mod2osis.cpp
xml2gbs_SOURCES = xml2gbs.cpp
+ osis2mod_SOURCES = osis2mod.cpp