[sword-devel] parser
David White
sword-devel@crosswire.org
10 Mar 2002 20:29:55 +1100
Oops, please ignore that email; it was sent by mistake.
On Sun, 2002-03-10 at 20:09, David White wrote:
> #include <algorithm>
> #include <cctype>
> #include <functional>
> #include <iterator>
> #include <sstream>
> #include <string>
> #include <vector>
>
> using std::string;
>
// A single scripture reference: book, chapter and verse, all kept as text.
// Chapter and verse are strings rather than integers because they may hold
// roman numerals or the "*" placeholder that get_verses() stores when a
// part was not given.
struct verse {
    // The parts are taken by const reference so that each string is copied
    // exactly once (into the member) rather than once into a by-value
    // parameter and then a second time into the member.
    verse(const std::string& b, const std::string& c, const std::string& v)
        : book_name(b), chapter_num(c), verse_num(v) {}

    std::string book_name;
    std::string chapter_num;
    std::string verse_num;
};
>
> struct range {
> range(const verse& v) : lower(v), upper(v) {}
> range(const verse& l, const verse& u) : lower(l), upper(u) {}
> verse lower, upper;
> };
>
// Character classifiers used by tokenize_refs. The <cctype> functions have
// undefined behaviour when handed a negative value, which is what a plain
// char holds for any non-ASCII byte on platforms where char is signed, so
// the argument is converted to unsigned char first.
static bool ref_char_is_space(char c)
{
    return std::isspace(static_cast<unsigned char>(c)) != 0;
}

static bool ref_char_is_digit(char c)
{
    return std::isdigit(static_cast<unsigned char>(c)) != 0;
}

//tokenize_refs: this function takes a reference string and tokenizes
//it, appending the tokens to 'res' in the order they appear (anything
//already in 'res' is left in place). Tokenizing rules are as follows:
//- whitespace (space, tab, newline, ...) separates tokens, but whitespace
//  itself is never included in a token
//- the characters . and : also separate tokens and are dropped, just
//  like whitespace
//- a non-number followed by a number is separated into different tokens
//  (but not a number followed by a non-number)
//- the characters ,;- are always placed in tokens of their own
//  (and thus cause separation on either side)
//
// e.g. "1 Peter1:5-8" -> "1","Peter","1","5","-","8"
//
//The string is scanned in a single pass with a loop, so the work is
//linear in the input length and the stack depth does not grow with the
//number of tokens.
void tokenize_refs(const std::string& ref, std::vector<std::string>& res)
{
    //punctuation that always ends the current token
    static const std::string punct = ",;:-.";
    //the subset of 'punct' that has no semantic meaning and is discarded
    static const std::string dropped = ".:";

    const std::string::const_iterator stop = ref.end();
    std::string::const_iterator pos = ref.begin();

    for(;;) {
        //skip whitespace in front of the next token
        while(pos != stop && ref_char_is_space(*pos))
            ++pos;

        //nothing but whitespace left: we are done
        if(pos == stop)
            return;

        //a punctuation character is a token by itself, unless it is
        //one of the characters we silently drop
        if(punct.find(*pos) != std::string::npos) {
            if(dropped.find(*pos) == std::string::npos)
                res.push_back(std::string(1,*pos));
            ++pos;
            continue;
        }

        //find the next punctuation or whitespace character, or the end
        //of the string; *pos is neither, so the run is never empty
        std::string::const_iterator tok_end = pos + 1;
        while(tok_end != stop && !ref_char_is_space(*tok_end) &&
              punct.find(*tok_end) == std::string::npos)
            ++tok_end;

        //we still haven't satisfied the rule that a non-number followed
        //by a number is a boundary: stop at the first digit in the run
        //that is preceded by a non-digit
        std::string::const_iterator cut = pos + 1;
        while(cut != tok_end &&
              !(ref_char_is_digit(*cut) && !ref_char_is_digit(*(cut-1))))
            ++cut;

        //add this token and carry on with whatever follows it
        res.push_back(std::string(pos,cut));
        pos = cut;
    }
}
>
// Reports whether a token is exactly one of the separator spellings:
// the punctuation marks , ; : - or one of v, ver, V, VER, Ver.
// The comparison is case-sensitive and matches the whole token.
bool is_separator(const std::string& str) {
    static const char* const markers[] = {
        ",", ";", ":", "-", "v", "ver", "V", "VER", "Ver"
    };
    static const std::size_t marker_count =
        sizeof(markers) / sizeof(markers[0]);

    for (std::size_t i = 0; i != marker_count; ++i) {
        if (str == markers[i])
            return true;
    }
    return false;
}
>
> bool is_entity(const string& str) {
> return std::find_if(str.begin(),str.end(),isalnum) != str.end() &&
> !is_separator(str);
> }
>
// Reports whether a token consists solely of the roman-numeral letters
// I, V, X and L (either case). Only the character set is checked; the
// order of the letters is not validated, so e.g. "VX" is accepted.
// An empty string is not a numeral and yields false.
bool is_roman(const std::string& str) {
    if (str.empty())
        return false;
    return str.find_first_not_of("ivxlIVXL") == std::string::npos;
}
>
// Reports whether a token is a non-empty run of the decimal digits 0-9.
// Signs, spaces and any other characters make the result false, as does
// an empty string.
bool is_number(const std::string& str) {
    if (str.empty())
        return false;
    return str.find_first_not_of("0123456789") == std::string::npos;
}
>
> bool is_chapter_verse(const string& str) {
> return is_number(str) || is_roman(str);
> }
>
> bool is_word(const string& str) {
> return std::find_if(str.begin(),str.end(),isalpha) != str.end() &&
> !is_roman(str) && !is_separator(str);
> }
>
> void get_verses(std::vector<string>::const_iterator start,
> std::vector<string>::const_iterator end,
> verse& default_ref,
> std::vector<std::pair<bool,verse> >& res)
> {
> typedef std::vector<string> token_list;
> typedef token_list::const_iterator token_itor;
> token_itor first = std::find_if(start,end,is_entity);
> if(first == end)
> return;
>
> //find the end of this reference portion
> static const string ref_sep[] = {";",",","-"};
> static const int nref_sep = sizeof(ref_sep)/sizeof(*ref_sep);
> token_itor last = std::find_first_of(first,end,ref_sep,ref_sep+nref_sep);
>
> //try to find a book name. If we find a word, we assume everything
> //before it is part of it (e.g. 1 Peter)
> const token_itor word = std::find_if(first,last,is_word);
> const token_itor end_book =
> (word != last) ? std::find_if(word+1,last,is_chapter_verse) : first;
>
>
> //ok, [first,end_book) now holds the book, we now want to find
> //two numbers - the chapter and verse
> token_itor chap = std::find_if(end_book,last,is_chapter_verse);
> token_itor vers = std::find_if(chap+1,last,is_chapter_verse);
>
> //if there are more tokens before the separator, we might as well
> //leave the rest, and attempt to parse it, it might hold another
> //reference
> if(vers < last)
> last = vers+1;
>
> //form the book, by joining the book tokens together, separate with spaces
> std::ostringstream book_stream;
> std::copy(first,end_book,std::ostream_iterator<string>(book_stream," "));
> string book_name = book_stream.str();
> if(book_name.empty()) {
> book_name = default_ref.book_name;
>
> //if the verse could not be found, it should take priority over
> //the chapter in terms of finding a match, so swap them
> //if however, we don't have a default for the verse, it means
> //we are in a construction like John 3-8, and we are now
> //parsing the '8', in which case we shouldn't swap
> if(vers == last && default_ref.verse_num != "*")
> std::swap(chap,vers);
> } else {
> book_name.resize(book_name.size()-1); //cut off extra space at end
> default_ref.chapter_num = "*";
> default_ref.verse_num = "*";
> }
>
> //work out the chapter and verse, use default values if they
> //are not available
> const string chapter_num = chap != last ?*chap:default_ref.chapter_num;
> const string verse_num = vers != last ?*vers:default_ref.verse_num;
>
> //this is a range if the first value was a '-'
> const bool is_range = (*start == "-");
>
> verse new_verse(book_name,chapter_num,verse_num);
> res.push_back(std::make_pair(is_range,new_verse));
> get_verses(last,end,new_verse,res);
> }
>
> #include <iostream>
>
> int main()
> {
> char buf[500];
> for(;;) {
> std::cin.getline(buf,500);
> const string input(buf);
> std::vector<string> tokens;
> tokenize_refs(input,tokens);
> std::vector<std::pair<bool,verse> > verses;
> verse v("Genesis","1","1");
> get_verses(tokens.begin(),tokens.end(),v,verses);
>
>
> std::cout << "\"" << input << "\" -> ";
> for(std::vector<std::pair<bool,verse> >::const_iterator i = verses.begin(); i != verses.end(); ++i) {
> std::cout << (i->first ? "-":",") << "\""
> << i->second.book_name << "|"
> << i->second.chapter_num << "|"
> << i->second.verse_num << "\"";
> }
> std::cout << std::endl;
> }
> }