[sword-devel] parser
David White
sword-devel@crosswire.org
10 Mar 2002 20:09:26 +1100
--=-CLQZ4YxA0cpeCpQAJ1Kp
Content-Type: text/plain
Content-Transfer-Encoding: 7bit
--=-CLQZ4YxA0cpeCpQAJ1Kp
Content-Disposition: attachment; filename=parse_verse.cpp
Content-Transfer-Encoding: quoted-printable
Content-Type: text/x-c; charset=ISO-8859-1
#include <algorithm>
#include <cctype>
#include <functional>
#include <iterator>
#include <sstream>
#include <string>
#include <vector>
using std::string;
//verse: one scripture reference. All three components are stored as
//text, exactly as they were handed to the constructor.
struct verse {
    string book_name;
    string chapter_num;
    string verse_num;

    //b, c and v are copied into book_name, chapter_num and verse_num
    //respectively
    verse(string b, string c, string v)
        : book_name(b),
          chapter_num(c),
          verse_num(v)
    {
    }
};
//range: a pair of verses giving a lower and an upper bound
struct range {
    verse lower;
    verse upper;

    //a range made from a single verse: both bounds are copies of v
    range(const verse& v)
        : lower(v),
          upper(v)
    {
    }

    //a range with separately supplied bounds: l becomes lower, u upper
    range(const verse& l, const verse& u)
        : lower(l),
          upper(u)
    {
    }
};
//tokenize_refs: this function takes a reference string 'ref', tokenizes
//it, and appends the tokens to 'res' (anything already in 'res' is left
//untouched). Tokenizing rules are as follows:
//- whitespace separates into different tokens, but whitespace itself
//  is never included in a token. The characters . and : are treated
//  like whitespace
//- a non-number followed by a number is separated into different tokens
//  (but not a number followed by a non-number)
//- the characters ,;- are always placed in tokens of their own
//  (and thus cause separation on either side)
//
// e.g. "1 Peter1:5-8" -> "1","Peter","1","5","-","8"

//The <cctype> classifiers require a value representable as unsigned
//char; handing them a plain char that happens to be negative (any
//non-ASCII byte where char is signed) is undefined behaviour, so every
//call goes through one of these wrappers.
static bool ref_char_is_graph(char c)
{
    return std::isgraph(static_cast<unsigned char>(c)) != 0;
}

static bool ref_char_is_digit(char c)
{
    return std::isdigit(static_cast<unsigned char>(c)) != 0;
}

//true if c is one of the separator characters ",;:-. "
static bool ref_char_is_separator(char c)
{
    static const string sep = ",;:-. ";
    return sep.find(c) != string::npos;
}

//true if c terminates an ordinary token: a separator character, or any
//whitespace character (tab, newline, ...) so that whitespace can never
//end up inside a token
static bool ref_char_ends_token(char c)
{
    return ref_char_is_separator(c) ||
           std::isspace(static_cast<unsigned char>(c)) != 0;
}

void tokenize_refs(const string& ref, std::vector<string>& res)
{
    //things with no semantic meaning, which should be ignored
    static const string skip = ".:";

    const string::const_iterator stop = ref.end();
    string::const_iterator pos = ref.begin();

    //one token (or one ignored separator) is consumed per iteration;
    //a loop is used so long inputs need neither deep recursion nor a
    //copy of the remaining text for every token
    for(;;) {
        //find the first printable, non-space character
        const string::const_iterator first =
            std::find_if(pos, stop, ref_char_is_graph);

        //if we didn't find anything, we are done
        if(first == stop)
            return;

        //if this is a separator character, add it as a token unless it
        //is one of the ignored ones, then carry on after it (we know
        //that this character is not a space from above)
        if(ref_char_is_separator(*first)) {
            if(skip.find(*first) == string::npos)
                res.push_back(string(1, *first));
            pos = first + 1;
            continue;
        }

        //find the next separator or whitespace, or the end of the
        //string. *first is neither, so end is always beyond first
        const string::const_iterator end =
            std::find_if(first, stop, ref_char_ends_token);

        //we still haven't satisfied the rule that if a non-number is
        //followed by a number, we have to treat that as a boundary.
        //Visit each digit after the first character, and stop at the
        //first one that is preceded by a non-digit
        string::const_iterator last =
            std::find_if(first + 1, end, ref_char_is_digit);
        while(last != end && ref_char_is_digit(*(last - 1)))
            last = std::find_if(last + 1, end, ref_char_is_digit);

        //add this token, and continue with whatever follows it
        res.push_back(string(first, last));
        pos = last;
    }
}
//is_separator: returns true if the token 'str' is exactly equal to one
//of the entries in the table below: the punctuation tokens , ; : - or
//one of the spellings "v", "ver", "V", "VER", "Ver" (presumably
//abbreviations of "verse"). The comparison is case-sensitive, so any
//other capitalisation (e.g. "vER") is not a separator.
bool is_separator(const string& str) {
    static const string sep[] = {",",";",":","-","v","ver","V","VER","Ver"};
    static const string* const begin = sep;
    //one past the last table entry
    static const string* const end = sep + sizeof(sep)/sizeof(*sep);
    return std::find(begin,end,str) != end;
}
//isalnum requires a value representable as unsigned char; passing a
//negative plain char (a non-ASCII byte where char is signed) is
//undefined behaviour, hence this wrapper
static bool entity_char_is_alnum(char c)
{
    return std::isalnum(static_cast<unsigned char>(c)) != 0;
}

//is_entity: returns true if the token 'str' contains at least one
//letter or digit and is not a separator token (see is_separator)
bool is_entity(const string& str) {
    return std::find_if(str.begin(),str.end(),entity_char_is_alnum) != str.end() &&
           !is_separator(str);
}
//is_roman: returns true if 'str' is non-empty and consists solely of
//the characters i, v, x, l in either case. Only the character set is
//checked, not whether the characters form a well-formed numeral.
//An empty string is not a numeral, so it yields false.
bool is_roman(const string& str) {
    return !str.empty() &&
           str.find_first_not_of("ivxlIVXL") == string::npos;
}
//is_number: returns true if 'str' is non-empty and consists solely of
//the decimal digits 0-9. An empty string is not a number, so it
//yields false.
bool is_number(const string& str) {
    return !str.empty() &&
           str.find_first_not_of("0123456789") == string::npos;
}
//is_chapter_verse: returns true if the token 'str' can serve as a
//chapter or verse value, i.e. is_number or is_roman accepts it
bool is_chapter_verse(const string& str) {
    //decimal form is tried first; the roman form is only consulted
    //when that fails
    if(is_number(str))
        return true;
    return is_roman(str);
}
//isalpha requires a value representable as unsigned char; passing a
//negative plain char (a non-ASCII byte where char is signed) is
//undefined behaviour, hence this wrapper
static bool word_char_is_alpha(char c)
{
    return std::isalpha(static_cast<unsigned char>(c)) != 0;
}

//is_word: returns true if the token 'str' contains at least one letter
//and is neither accepted by is_roman nor a separator token (see
//is_separator)
bool is_word(const string& str) {
    return std::find_if(str.begin(),str.end(),word_char_is_alpha) != str.end() &&
           !is_roman(str) && !is_separator(str);
}
void get_verses(std::vector<string>::const_iterator start,
std::vector<string>::const_iterator end,
verse& default_ref,
std::vector<std::pair<bool,verse> >& res)
{ =09
typedef std::vector<string> token_list;
typedef token_list::const_iterator token_itor;
token_itor first =3D std::find_if(start,end,is_entity);
if(first =3D=3D end)
return;
//find the end of this reference portion
static const string ref_sep[] =3D {";",",","-"};
static const int nref_sep =3D sizeof(ref_sep)/sizeof(*ref_sep);
token_itor last =3D std::find_first_of(first,end,ref_sep,ref_sep+nref_sep)=
;
//try to find a book name. If we find a word, we assume everything
//before it is part of it (e.g. 1 Peter)
const token_itor word =3D std::find_if(first,last,is_word);
const token_itor end_book =3D
(word !=3D last) ? std::find_if(word+1,last,is_chapter_verse) : first=
;
=09
//ok, [first,end_book) now holds the book, we now want to find
//two numbers - the chapter and verse
token_itor chap =3D std::find_if(end_book,last,is_chapter_verse);
token_itor vers =3D std::find_if(chap+1,last,is_chapter_verse);
//if there are more tokens before the separator, we might as well
//leave the rest, and attempt to parse it, it might hold another
//reference
if(vers < last)
last =3D vers+1;
=09
//form the book, by joining the book tokens together, separate with spaces
std::ostringstream book_stream;
std::copy(first,end_book,std::ostream_iterator<string>(book_stream," "));
string book_name =3D book_stream.str();
if(book_name.empty()) {
book_name =3D default_ref.book_name;
//if the verse could not be found, it should take priority over
//the chapter in terms of finding a match, so swap them
//if however, we don't have a default for the verse, it means
//we are in a construction like John 3-8, and we are now
//parsing the '8', in which case we shouldn't swap
if(vers =3D=3D last && default_ref.verse_num !=3D "*")
std::swap(chap,vers);
} else {
book_name.resize(book_name.size()-1); //cut off extra space at end
default_ref.chapter_num =3D "*";
default_ref.verse_num =3D "*";
}
=09
//work out the chapter and verse, use default values if they
//are not available
const string chapter_num =3D chap !=3D last ?*chap:default_ref.chapter_num=
;
const string verse_num =3D vers !=3D last ?*vers:default_ref.verse_num;
=09
//this is a range if the first value was a '-'
const bool is_range =3D (*start =3D=3D "-");
=09
verse new_verse(book_name,chapter_num,verse_num);
res.push_back(std::make_pair(is_range,new_verse));
get_verses(last,end,new_verse,res);
}
#include <iostream>
int main()
{
char buf[500];
for(;;) {
std::cin.getline(buf,500);
const string input(buf);
std::vector<string> tokens;
tokenize_refs(input,tokens);
std::vector<std::pair<bool,verse> > verses;
verse v("Genesis","1","1");
get_verses(tokens.begin(),tokens.end(),v,verses);
=09
std::cout << "\"" << input << "\" -> ";
for(std::vector<std::pair<bool,verse> >::const_iterator i =3D verses.begi=
n(); i !=3D verses.end(); ++i) {
std::cout << (i->first ? "-":",") << "\""
<< i->second.book_name << "|"
<< i->second.chapter_num << "|"
<< i->second.verse_num << "\"";
}
std::cout << std::endl;
}
}
--=-CLQZ4YxA0cpeCpQAJ1Kp--