[sword-devel] parser

David White sword-devel@crosswire.org
10 Mar 2002 20:09:26 +1100


--=-CLQZ4YxA0cpeCpQAJ1Kp
Content-Type: text/plain
Content-Transfer-Encoding: 7bit



--=-CLQZ4YxA0cpeCpQAJ1Kp
Content-Disposition: attachment; filename=parse_verse.cpp
Content-Transfer-Encoding: quoted-printable
Content-Type: text/x-c; charset=ISO-8859-1

#include <algorithm>
#include <cctype>
#include <functional>
#include <iterator>
#include <sstream>
#include <string>
#include <vector>

using std::string;

//verse: a single reference, stored as three strings: the book name,
//the chapter and the verse. The constructor copies its arguments into
//the members unchanged.
struct verse {
	string book_name;
	string chapter_num;
	string verse_num;

	verse(string b, string c, string v)
			: book_name(b),
			  chapter_num(c),
			  verse_num(v)
	{}
};

//range: a pair of verses, lower and upper.
//- built from one verse, both ends are copies of that verse
//- built from two verses, the first becomes lower and the second upper
struct range {
	verse lower;
	verse upper;

	range(const verse& v)
			: lower(v), upper(v)
	{}

	range(const verse& l, const verse& u)
			: lower(l), upper(u)
	{}
};

//tokenize_refs: this function takes a reference string, tokenizes it,
//and appends the tokens to res (res is not cleared first).
//Tokenizing rules are as follows:
//- whitespace separates into different tokens, but whitespace itself
//  is never included in a token. The characters . and : are treated
//  like whitespace
//- a non-number followed by a number is separated into different tokens
//  (but not a number followed by a non-number)
//- the characters ,;- are always placed in tokens of their own
//  (and thus cause separation on either side)
//
//  e.g. "1 Peter1:5-8" -> "1","Peter","1","5","-","8"
//
//"Whitespace" means any character for which isspace() is true, so a
//tab ends a token just like a space does. Characters are converted to
//unsigned char before being handed to the <cctype> functions, because
//calling those with a negative value is undefined.
//
//The string is scanned in a single loop; no substrings are copied
//other than the tokens themselves.
void tokenize_refs(const string& ref, std::vector<string>& res)
{
	//the list of possible separators (the space is redundant with the
	//isspace() tests below, but harmless)
	static const string sep = ",;:-. ";

	//separators with no semantic meaning, which should be ignored
	static const string skip = ".:";

	const string::const_iterator stop = ref.end();
	string::const_iterator pos = ref.begin();

	for(;;) {
		//find the first non-space character
		while(pos != stop &&
		      std::isspace(static_cast<unsigned char>(*pos)))
			++pos;

		//if we didn't find anything, we are done
		if(pos == stop)
			return;

		//if this is a separator character, add it as a token (unless
		//it is one of the ignored ones), and carry on after it
		if(sep.find(*pos) != string::npos) {
			if(skip.find(*pos) == string::npos)
				res.push_back(string(1,*pos));
			++pos;
			continue;
		}

		//find the next separator or whitespace, or the end of the string
		string::const_iterator end = pos + 1;
		while(end != stop &&
		      !std::isspace(static_cast<unsigned char>(*end)) &&
		      sep.find(*end) == string::npos)
			++end;

		//we still haven't satisfied the rule that if a non-number is
		//followed by a number, we have to treat that as a boundary.
		//Stop at the first digit that is preceded by a non-digit
		string::const_iterator last = pos + 1;
		while(last != end &&
		      !(std::isdigit(static_cast<unsigned char>(*last)) &&
		        !std::isdigit(static_cast<unsigned char>(*(last-1)))))
			++last;

		//add this token, and continue with the remaining text
		res.push_back(string(pos,last));
		pos = last;
	}
}

//is_separator: returns true if str is exactly equal to one of the
//separator tokens: the punctuation , ; : - or one of the words
//v, ver, V, VER, Ver (presumably abbreviations of "verse").
//The comparison is case-sensitive and matches whole strings only,
//so "vER" and ",," are not separators.
bool is_separator(const string& str) {
	static const string sep[] = {",",";",":","-","v","ver","V","VER","Ver"};
	static const string* const begin = sep;
	static const string* const end = sep + sizeof(sep)/sizeof(*sep);
	return std::find(begin,end,str) != end;
}

//is_entity: returns true if str contains at least one alphanumeric
//character and is not a separator token (see is_separator).
//Each character is converted to unsigned char before being passed to
//isalnum, because calling it with a negative value is undefined.
bool is_entity(const string& str) {
	for(string::const_iterator i = str.begin(); i != str.end(); ++i) {
		if(std::isalnum(static_cast<unsigned char>(*i)))
			return !is_separator(str);
	}
	return false;
}

//is_roman: returns true if str is non-empty and consists only of the
//letters i, v, x and l in either case. No check is made that the
//letters form a well-formed numeral, so e.g. "Lv" and "iiii" are
//accepted. The empty string is rejected.
bool is_roman(const string& str) {
	return !str.empty() &&
	       str.find_first_not_of("ivxlIVXL") == string::npos;
}

//is_number: returns true if str is non-empty and consists only of the
//decimal digits 0-9. Signs, spaces and decimal points are not
//accepted, and the empty string is rejected.
bool is_number(const string& str) {
	return !str.empty() &&
	       str.find_first_not_of("0123456789") == string::npos;
}

//is_chapter_verse: returns true if str can stand for a chapter or a
//verse, i.e. it is accepted by is_number or, failing that, by is_roman.
bool is_chapter_verse(const string& str) {
	if(is_number(str))
		return true;
	return is_roman(str);
}

//is_word: returns true if str contains at least one alphabetic
//character, and is neither accepted by is_roman nor a separator token
//(see is_separator).
//Each character is converted to unsigned char before being passed to
//isalpha, because calling it with a negative value is undefined.
bool is_word(const string& str) {
	if(is_roman(str) || is_separator(str))
		return false;

	for(string::const_iterator i = str.begin(); i != str.end(); ++i) {
		if(std::isalpha(static_cast<unsigned char>(*i)))
			return true;
	}
	return false;
}

void get_verses(std::vector<string>::const_iterator start,
				std::vector<string>::const_iterator end,
				verse& default_ref,
				std::vector<std::pair<bool,verse> >& res)
{	=09
	typedef std::vector<string> token_list;
	typedef token_list::const_iterator token_itor;
	token_itor first =3D std::find_if(start,end,is_entity);
	if(first =3D=3D end)
		return;

	//find the end of this reference portion
	static const string ref_sep[] =3D {";",",","-"};
	static const int nref_sep =3D sizeof(ref_sep)/sizeof(*ref_sep);
	token_itor last =3D std::find_first_of(first,end,ref_sep,ref_sep+nref_sep)=
;

	//try to find a book name. If we find a word, we assume everything
	//before it is part of it (e.g. 1 Peter)
	const token_itor word =3D std::find_if(first,last,is_word);
	const token_itor end_book =3D
	     (word !=3D last) ? std::find_if(word+1,last,is_chapter_verse) : first=
;
=09

	//ok, [first,end_book) now holds the book, we now want to find
	//two numbers - the chapter and verse
	token_itor chap =3D std::find_if(end_book,last,is_chapter_verse);
	token_itor vers =3D std::find_if(chap+1,last,is_chapter_verse);

	//if there are more tokens before the separator, we might as well
	//leave the rest, and attempt to parse it, it might hold another
	//reference
	if(vers < last)
		last =3D vers+1;
=09
	//form the book, by joining the book tokens together, separate with spaces
	std::ostringstream book_stream;
	std::copy(first,end_book,std::ostream_iterator<string>(book_stream," "));
	string book_name =3D book_stream.str();
	if(book_name.empty()) {
		book_name =3D default_ref.book_name;

		//if the verse could not be found, it should take priority over
		//the chapter in terms of finding a match, so swap them
		//if however, we don't have a default for the verse, it means
		//we are in a construction like John 3-8, and we are now
		//parsing the '8', in which case we shouldn't swap
		if(vers =3D=3D last && default_ref.verse_num !=3D "*")
			std::swap(chap,vers);
	} else {
		book_name.resize(book_name.size()-1); //cut off extra space at end
		default_ref.chapter_num =3D "*";
		default_ref.verse_num =3D "*";
	}
=09
	//work out the chapter and verse, use default values if they
	//are not available
	const string chapter_num =3D chap !=3D last ?*chap:default_ref.chapter_num=
;
	const string verse_num =3D vers !=3D last ?*vers:default_ref.verse_num;
=09
	//this is a range if the first value was a '-'
	const bool is_range =3D (*start =3D=3D "-");
=09
	verse new_verse(book_name,chapter_num,verse_num);
	res.push_back(std::make_pair(is_range,new_verse));
	get_verses(last,end,new_verse,res);
}

#include <iostream>

int main()
{
	char buf[500];
	for(;;) {
		std::cin.getline(buf,500);
		const string input(buf);
		std::vector<string> tokens;
		tokenize_refs(input,tokens);
		std::vector<std::pair<bool,verse> > verses;
		verse v("Genesis","1","1");
		get_verses(tokens.begin(),tokens.end(),v,verses);

	=09
		std::cout << "\"" << input << "\" -> ";
		for(std::vector<std::pair<bool,verse> >::const_iterator i =3D verses.begi=
n(); i !=3D verses.end(); ++i) {
			std::cout << (i->first ? "-":",") << "\""
					  << i->second.book_name << "|"
					  << i->second.chapter_num << "|"
					  << i->second.verse_num << "\"";
		}
		std::cout << std::endl;
	}
}

--=-CLQZ4YxA0cpeCpQAJ1Kp--