The SWORD Project  1.9.0.svnversion
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
xml2gbs.cpp
Go to the documentation of this file.
1 /******************************************************************************
2  *
3  * xml2gbs.cpp - Importer for GenBooks formatted as OSIS, ThML, or TEI
4  *
5  * $Id: xml2gbs.cpp 3063 2014-03-04 13:04:11Z chrislit $
6  *
7  * Copyright 2003-2012 CrossWire Bible Society (http://www.crosswire.org)
8  * CrossWire Bible Society
9  * P. O. Box 2528
10  * Tempe, AZ 85280-2528
11  *
12  * This program is free software; you can redistribute it and/or modify it
13  * under the terms of the GNU General Public License as published by the
14  * Free Software Foundation version 2.
15  *
16  * This program is distributed in the hope that it will be useful, but
17  * WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19  * General Public License for more details.
20  *
21  */
22 
23 #ifdef _MSC_VER
24  #pragma warning( disable: 4251 )
25 #endif
26 
27 #include <ctype.h>
28 #include <stdio.h>
29 #include <fcntl.h>
30 #include <errno.h>
31 #include <stdlib.h>
32 
33 #include <entriesblk.h>
34 #include <iostream>
35 #include <string>
36 #include <fstream>
37 #include <treekeyidx.h>
38 #include <rawgenbook.h>
39 
40 
41 #ifndef NO_SWORD_NAMESPACE
42 using sword::TreeKeyIdx;
43 using sword::RawGenBook;
44 using sword::SWKey;
45 #endif
46 
47 //#define DEBUG
48 
49 
51 
52 #define HELPTEXT "xml2gbs 1.0 OSIS/ThML/TEI General Book module creation tool for the SWORD Project\n usage:\n xml2gbs [-l] [-i] [-fT|-fO|-fE] <filename> [modname]\n -l uses long div names in ThML files\n -i exports to IMP format instead of creating a module\n -fO, -fT, and -fE will set the importer to expect OSIS, ThML, or TEI format respectively\n (otherwise it attempts to autodetect)\n"
53 
54 unsigned char detectFormat(char* filename) {
55 
56  unsigned char format = F_AUTODETECT;
57 
58  std::ifstream infile(filename);
59  std::string entbuffer;
60 
61  if (!infile.is_open()) {
62  std::cerr << HELPTEXT;
63  std::cerr << std::endl << std::endl << "Could not open file \"" << filename << "\"" << std::endl;
64  }
65  else {
66  while (std::getline(infile, entbuffer) && format == F_AUTODETECT) {
67  if (strstr(entbuffer.c_str(), "<osis")) {
68  format = F_OSIS;
69  }
70  else if (strstr(entbuffer.c_str(), "<ThML")) {
71  format = F_THML;
72  }
73  else if (strstr(entbuffer.c_str(), "<TEI")) {
74  format = F_TEI;
75  }
76  }
77  infile.close();
78  }
79 
80  return format;
81 }
82 
83 int processXML(const char* filename, char* modname, bool longnames, bool exportfile, unsigned char format) {
84  signed long i = 0;
85  char* strtmp;
86  std::string entbuffer;
87 
88 #ifdef DEBUG
89  printf ("%s :%s :%d :%d :%d\n\n", filename, modname, longnames, exportfile, format);
90 #endif
91 
92  std::ifstream infile(filename);
93  if (!infile.is_open()) {
94  std::cerr << HELPTEXT;
95  std::cerr << std::endl << std::endl << "Could not open file \"" << filename << "\"" << std::endl;
96  return -1;
97  }
98  std::ofstream outfile;
99  if (exportfile) {
100  strcat (modname, ".imp");
101  outfile.open(modname);
102  }
103 
104  TreeKeyIdx * treeKey;
105  RawGenBook * book = NULL;
106 
107  std::string divs[32];
108 
109  int level = 0;
110  std::string keybuffer = "";
111  std::string keybuffer2;
112  std::string n;
113  std::string type;
114  std::string title;
115  unsigned long entrysize = 0;
116  unsigned long keysize = 0;
117  bool closer = false;
118 
119  if (!exportfile) {
120  // Do some initialization stuff
121  TreeKeyIdx::create(modname);
122  treeKey = new TreeKeyIdx(modname);
123  RawGenBook::createModule(modname);
124  delete treeKey;
125  book = new RawGenBook(modname);
126  }
127 
128 #ifdef DEBUG
129 // TreeKeyIdx root = *((TreeKeyIdx *)((SWKey *)(*book)));
130 #endif
131 
132  int c;
133  while ((c = infile.get()) != EOF) {
134  if (c == '<') {
135  {
136  keybuffer = "";
137  while ((c = infile.get()) != '>')
138  keybuffer += c;
139  keybuffer += c;
140  }
141 
142  if (keybuffer.length()) {
143  if (((format == F_OSIS) && ((!strncmp(keybuffer.c_str(), "/div>", 5)) || (!strncmp(keybuffer.c_str(), "/verse>", 7)) || (!strncmp(keybuffer.c_str(), "/chapter>", 9)))) ||
144  ((format == F_THML) && ((!strncmp(keybuffer.c_str(), "/div", 4)) && (keybuffer[4] > '0' && keybuffer[4] < '7')))) {
145  if (!closer) {
146  keysize = 0;
147  keybuffer2 = "";
148  for (i = 0; i < level; i++) {
149  keybuffer2 += '/';
150  keysize++;
151  keybuffer2 += divs[i];
152  keysize += divs[i].length();
153  std::cout << keybuffer2 << std::endl;
154  }
155 
156  if (level) {
157  std::cout << keybuffer2 << std::endl;
158  if (exportfile) {
159  outfile << "$$$" << keybuffer2 << std::endl << entbuffer << std::endl;
160  }
161  else {
162  book->setKey(keybuffer2.c_str());
163  book->setEntry(entbuffer.c_str(), entrysize); // save text to module at current position
164  }
165  }
166  }
167  level--;
168  entbuffer = "";
169  entrysize = 0;
170 
171  closer = true;
172  }
173  else if (((format == F_OSIS) && !((!strncmp(keybuffer.c_str(), "div>", 4) || !strncmp(keybuffer.c_str(), "div ", 4)) || (!strncmp(keybuffer.c_str(), "verse>", 6) || !strncmp(keybuffer.c_str(), "verse ", 6)) || (!strncmp(keybuffer.c_str(), "chapter>", 8) || !strncmp(keybuffer.c_str(), "chapter ", 8)))) ||
174  ((format == F_THML) && !((!strncmp(keybuffer.c_str(), "div", 3)) && (keybuffer[3] > '0' && keybuffer[3] < '7')))) {
175  entbuffer += '<';
176  entrysize++;
177  entrysize += keybuffer.length();
178  entbuffer += keybuffer;
179  }
180  else {
181  //we have a divN...
182  if (!closer) {
183  keysize = 0;
184  keybuffer2= "";
185  for (i = 0; i < level; i++) {
186  keybuffer2 += '/';
187  keysize++;
188  keybuffer2 += divs[i];
189  keysize += divs[i].length();
190  std::cout << keybuffer2 << std::endl;
191  }
192 
193  if (level) {
194  std::cout << keybuffer2 << std::endl;
195  if (exportfile) {
196  outfile << "$$$" << keybuffer2 << std::endl << entbuffer << std::endl;
197  }
198  else {
199  book->setKey(keybuffer2.c_str());
200  book->setEntry(entbuffer.c_str(), entrysize); // save text to module at current position
201  }
202  }
203  }
204 
205  entbuffer= "";
206  entrysize = 0;
207 
208  level++;
209  keysize = keybuffer.length()-1;
210 
211  type = "";
212  n = "";
213  title = "";
214 
215  if (format == F_OSIS && longnames == false) {
216  strtmp = (char*)strstr(keybuffer.c_str(), "osisID=\"");
217  if (strtmp) {
218  strtmp += 8;
219  for (;*strtmp != '\"'; strtmp++) {
220  if (*strtmp == 10) {
221  title += ' ';
222  }
223  else if (*strtmp == '.') {
224  title = "";
225  }
226  else if (*strtmp != 13) {
227  title += *strtmp;
228  }
229  }
230  }
231  keybuffer = title;
232  }
233  else {
234  strtmp = (char*)strstr(keybuffer.c_str(), "type=\"");
235  if (strtmp) {
236  strtmp += 6;
237  for (;*strtmp != '\"'; strtmp++) {
238  if (*strtmp == 10) {
239  type+= ' ';
240  }
241  else if (*strtmp != 13) {
242  type+= *strtmp;
243  }
244  }
245  }
246 
247  strtmp = (char*)strstr(keybuffer.c_str(), "n=\"");
248  if (strtmp) {
249  strtmp += 3;
250  for (;*strtmp != '\"'; strtmp++) {
251  if (*strtmp == 10) {
252  n += ' ';
253  }
254  else if (*strtmp != 13) {
255  n += *strtmp;
256  }
257  }
258  }
259 
260  if (format == F_OSIS) {
261  strtmp = (char*)strstr(keybuffer.c_str(), "title=\"");
262  if (strtmp) {
263  strtmp += 7;
264  for (;*strtmp != '\"'; strtmp++) {
265  if (*strtmp == 10) {
266  title += ' ';
267  }
268  else if (*strtmp != 13) {
269  title += *strtmp;
270  }
271  }
272  }
273  }
274  else if (format == F_THML) {
275  strtmp = (char*)strstr(keybuffer.c_str(), "title=\"");
276  if (strtmp) {
277  strtmp += 7;
278  for (;*strtmp != '\"'; strtmp++) {
279  if (*strtmp == 10) {
280  title += ' ';
281  }
282  else if (*strtmp != 13) {
283  title += *strtmp;
284  }
285  }
286  }
287  }
288 
289  keybuffer = type;
290  if (keybuffer.length() && n.length())
291  keybuffer += " ";
292  keybuffer += n;
293 
294  if (longnames && keybuffer.length())
295  keybuffer += ": ";
296  if (longnames || !keybuffer.length())
297  keybuffer += title;
298  }
299  divs[level-1] = keybuffer;
300 
301  closer = false;
302  }
303  }
304  }
305  else if (c != 13) {
306  entbuffer += c;
307  entrysize++;
308  }
309  }
310 
311 #ifdef DEBUG
312 // printTree(root, treeKey);
313 #endif
314 
315 // delete book; //causes nasty-bad errors upon execution
316  return 0;
317 }
318 
319 int main(int argc, char **argv) {
320  int i = 0;
321 
322  char modname[256];
323  *modname = 0;
324  char filename[256];
325  *filename = 0;
326 
327  bool longnames = false;
328  bool exportfile = false;
329  unsigned char format = F_AUTODETECT;
330 
331  if (argc > 2) {
332  for (i = 1; i < argc; i++) {
333  if (argv[i][0] == '-') {
334  switch (argv[i][1]) {
335  case 'l':
336  longnames = true;
337  continue;
338  case 'i':
339  exportfile = true;
340  continue;
341  case 'f':
342  if (argv[i][2] == 'O') {
343  format = F_OSIS;
344  }
345  else if (argv[i][2] == 'T') {
346  format = F_OSIS;
347  }
348  else {
349  format = F_AUTODETECT;
350  }
351  continue;
352  }
353  }
354  else if (*filename == 0) {
355  strncpy (filename, argv[i], 200);
356  }
357  else if (*modname == 0) {
358  strncpy (modname, argv[i], 200);
359  }
360  }
361  }
362  else if (argc > 1) {
363  strncpy (filename, argv[1], 200);
364  }
365 
366  if (!*filename) {
367  std::cerr << HELPTEXT << std::endl;
368  return -1;
369  }
370  else {
371  if (!*modname) {
372  for (i = 0; (i < 256) && (filename[i]) && (filename[i] != '.'); i++) {
373  modname[i] = filename[i];
374  }
375  modname[i] = 0;
376  }
377 
378  format = (format == F_AUTODETECT) ? detectFormat(filename) : format;
379  if (format == F_AUTODETECT) {
380  fprintf(stderr, HELPTEXT);
381  fprintf(stderr, "\n\nCould not detect file format for file \"%s\", please specify.\n", filename);
382  return -1;
383  }
384 
385  int retCode = processXML (filename, modname, longnames, exportfile, format);
386 
387  return retCode;
388  }
389 }
390 
391 
392 
static char createModule(const char *ipath)
Definition: rawgenbook.cpp:191
virtual char setKey(const SWKey *ikey)
Definition: swmodule.cpp:298
int main(int argc, char **argv)
Definition: addcomment.cpp:32
static signed char create(const char *path)
Definition: treekeyidx.cpp:290
return NULL
Definition: regex.c:7953
unsigned char detectFormat(char *filename)
Definition: xml2gbs.cpp:54
virtual void setEntry(const char *inbuf, long len=-1)
Definition: rawgenbook.cpp:134
XML_FORMATS
Definition: xml2gbs.cpp:50
const char * string
Definition: regex.c:5014
int processXML(const char *filename, char *modname, bool longnames, bool exportfile, unsigned char format)
Definition: xml2gbs.cpp:83
#define HELPTEXT
Definition: xml2gbs.cpp:52