/******************************************************************************
 * flash.cpp - Automation of flashcards generation
 *
 * Copyright 2007 CrossWire Bible Society (http://www.crosswire.org)
 *	CrossWire Bible Society
 *	P. O. Box 2528
 *	Tempe, AZ  85280-2528
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the
 * Free Software Foundation version 2.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * Contributors:
 *	Lyndon Drake
 *	Troy A. Griffitts
 */

// NOTE(review): every #include directive below has no header name; the
// <...> operands appear to have been lost.  Restore them from the canonical
// copy of this file -- they cannot be reconstructed from what is visible here.
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

using namespace sword;
using namespace std;

namespace {
	// Inclusive bounds of the UTF-16 code unit range that processSequences()
	// accepts as part of a sequence (U+0370..U+03FF).
	const int GREEK_START = 0x370;
	const int GREEK_END   = 0x3FF;
};

// used to hold a KJV translation phrase for a greek/hebrew word
// and any greek/hebrew words combined to make this KJV phrase
// e.g.
hO QEOS = QEOS: [+ hO ]: God class Phrase { public: Phrase() : phrase("") {} SWBuf phrase; vector with; inline bool operator ==(const Phrase &other) const { return !compare(other); } inline bool operator !=(const Phrase &other) const { return compare(other); } inline bool operator > (const Phrase &other) const { return compare(other) > 0; } inline bool operator < (const Phrase &other) const { return compare(other) < 0; } inline bool operator <=(const Phrase &other) const { return compare(other) <= 0; } inline bool operator >=(const Phrase &other) const { return compare(other) >= 0; } int compare(const Phrase &right) const { int c = phrase.compare(right.phrase); if (c) return c; vector::const_iterator lit = with.begin(); vector::const_iterator rit = right.with.begin(); while (lit != with.end() && rit != right.with.end()) { c = lit->compare(*rit); if (c) return c; lit++; rit++; } if (lit != with.end()) return 1; if (rit != right.with.end()) return -1; return 0; } }; // KJV phrases and their occurance frequency typedef map KJVPhrases; // primary result class class Word { public: Word() : utf8("") , strong("") , freq(0) , def("") {} // lexical form of this word in utf8 greek/hebrew SWBuf utf8; vector utf16; // strongs number for this word (e.g. 
G3588) SWBuf strong; // frequency of occurance in the iterated text int freq; // definition pulled from short strongs def SWBuf def; // kjv translation phrases and their frequencies KJVPhrases kjvFreq; }; string itoa(int v) { stringstream str; str << v; return str.str(); } bool compareFreq(const Word &w1, const Word &w2) { return w1.freq > w2.freq; } bool compareSeqLenFreq(const Word &w1, const Word &w2) { if (w1.utf16.size() != w2.utf16.size()) { return (w1.utf16.size() > w2.utf16.size()); } return w1.freq > w2.freq; } bool compareKJVFreq(const KJVPhrases::const_iterator &i1, const KJVPhrases::const_iterator &i2) { return i1->second > i2->second; } // sort and pretty up all the KJV phrases for a word into a nice output buffer SWBuf prettyKJVFreq(KJVPhrases in) { SWBuf retVal; vector sorted; for (KJVPhrases::const_iterator it = in.begin(); it != in.end(); it++) { // combine cap words with lowercase, if exists Phrase k = it->first; if (k.phrase.size() && toupper(k.phrase[0]) == k.phrase[0] && k.phrase != "God" && k.phrase != "Lord") { k.phrase[0] = tolower(k.phrase[0]); if (k != it->first) { KJVPhrases::iterator i = in.find(k); if (i != in.end()) { i->second += it->second; // don't include us in the list cuz we added our freq to another continue; } } } sorted.push_back(it); } sort(sorted.begin(), sorted.end(), compareKJVFreq); for (vector::const_iterator it = sorted.begin(); it != sorted.end(); it++) { if (retVal.size()) retVal += "; "; // prepend 'with other strongs' if present if ((*it)->first.with.size()) { retVal += "[+"; for (int i = 0; i < (*it)->first.with.size(); i++) { retVal.appendFormatted(" %s", (*it)->first.with[i].c_str()); } retVal += " ] "; } retVal.appendFormatted("%s (%d)", (*it)->first.phrase.c_str(), (*it)->second); } return retVal; } // take utf8 text and spit out equiv. 
text substituting escaped codes for multibyte chars // java .properties files wants this format (flashcard .flash lessons use this format) SWBuf escapedUTF8(SWBuf inText) { static UTF8UTF16 convert; convert.processText(inText); SWBuf retBuf; for (unsigned short *i = (unsigned short *)inText.getRawData(); *i; i++) { if (*i < 128) { retBuf += (char)*i; } else { retBuf.appendFormatted("\\u%.4x", *i); // change hex alpha values to upper case for (int i = retBuf.size()-1; i > retBuf.size() - 4; i--) { retBuf[i] = toupper(retBuf[i]); } } } return retBuf; } SWBuf toUTF8(const vector &utf16) { static UTF16UTF8 convert; SWBuf retVal; retVal.size((utf16.size()+1)*2); unsigned short *i = (unsigned short *)retVal.getRawData(); int j; for (j = 0; j < utf16.size(); j++) { i[j] = utf16[j]; } i[j] = 0; convert.processText(retVal); return retVal; } // output a simple CSV ('|' separated really) format for importing into OOo or excel void outputCSV(const vector &seqList) { for (vector::const_iterator it = seqList.begin(); it != seqList.end(); it++) { const Word &w = (*it); // cout << w->freq << "|" << escapedUTF8(w->utf8).c_str() << "|" << w->strong << "|" << prettyKJVFreq(w->kjvFreq).c_str() << "\n"; cout << w.freq << "," << toUTF8(w.utf16).c_str() << "," << w.utf16.size() << "\n"; } std::cout << std::endl; } void outputHTML(const vector &seqList) { for (vector::const_iterator it = seqList.begin(); it != seqList.end(); it++) { const Word &w = (*it); // cout << w->freq << "|" << escapedUTF8(w->utf8).c_str() << "|" << w->strong << "|" << prettyKJVFreq(w->kjvFreq).c_str() << "\n"; cout << "" << w.freq << "" << toUTF8(w.utf16).c_str() << "\n"; } std::cout << std::endl; } void outputXML(const vector &seqList) { for (vector::const_iterator it = seqList.begin(); it != seqList.end(); it++) { const Word &w = (*it); // cout << w->freq << "|" << escapedUTF8(w->utf8).c_str() << "|" << w->strong << "|" << prettyKJVFreq(w->kjvFreq).c_str() << "\n"; cout << "" << w.freq << ""; cout << "" << 
toUTF8(w.utf16).c_str() << ""; cout << "" << w.utf16.size() << "\n"; } std::cout << std::endl; } /** * output our flashcard .flash file format * * seqList - duh * outputDir - directory path where to write files, e.g. "./hebFreq" * kjvFreq - if true, process KJV translation frequencies and use these as * the word answers; otherwise, use short strongs defs. * maxPerLesson - maximum number of words per lesson * */ void outputFlash(const vector &seqList, const char *outputDir = ".", bool kjvFreq = true, int maxPerLesson = 25) { ThMLPlain strip; ofstream ofile; int wordCount = 0; int lessonNumber = 0; int startFreq = 0; int lastFreq = 0; vector::const_iterator it = seqList.begin(); while (it != seqList.end()) { const Word &w = (*it); if (!wordCount) { SWBuf fname = outputDir; fname += "/lesson"; fname.appendFormatted("%d", lessonNumber); fname += ".flash"; ofile.open(fname); startFreq = w.freq; } SWBuf word = w.utf8; word.trim(); SWBuf answers = ""; answers.trim(); // if we want answers as KJV phrases if (kjvFreq) { answers = prettyKJVFreq(w.kjvFreq); if (answers.size() > 200) answers.size(200); } // if we would rather have short strongs else { answers = w.def; strip.processText(answers); // remove html tags answers.replaceBytes("\n\r", ' '); // remove newlines } // be sure we have both a word and an answer if (word.size() && answers.size()) { ofile << "word" << wordCount << "=" << escapedUTF8(word) << "\n"; ofile << "answers" << wordCount << "=" << answers << "\n"; lastFreq = w.freq; wordCount++; } it++; if (it == seqList.end() || wordCount >= maxPerLesson) { // close lesson SWBuf lessonTitle = ""; lessonTitle.appendFormatted("lessonTitle=%.3d Freqs. %d-%d\n", lessonNumber, startFreq, lastFreq); ofile << lessonTitle; ofile << "wordCount=" << wordCount << "\n"; ofile.close(); wordCount = 0; lessonNumber++; } } } /** * do the work * * range - the range of verses to process (e.g. 
"gen-mal") * addAll - if we should add all words in our lexicon for the testaments * included in the range even if they don't exist in the text * (useful for generating complete OT or NT strongs word lists) * */ vector processSequences(const char *range, int seqLength) { SWMgr manager; manager.setGlobalOption("Greek Accents", "Off"); UTF8UTF16 toUTF16; map, Word> seqList; SWModule *tmpBible = manager.getModule("WHNU"); if (!tmpBible) { cerr << "Unable to locate WHNU module" << endl; exit(1); } SWModule &bible = *tmpBible; VerseKey parser; ListKey r = parser.ParseVerseList(range, 0, true); r.Persist(true); bible.setKey(r); for (bible = TOP; !bible.Error(); bible++) { bible.RenderText(); // force an entry lookup to resolve key to something in the index SWBuf text = bible.StripText(); toUTF16.processText(text); for (unsigned short *i = (unsigned short *)text.getRawData(); *i; i++) { vector seq; int j; for (j = 0; ((j < seqLength) && (i[j] >= GREEK_START) && (i[j] <= GREEK_END)); j++) { seq.push_back(i[j]); } if (seq.size() == seqLength) { seqList[seq].freq++; } else { if (!i[j]) { // we don't need to process the rest of this text as all remaining seq lengths will fail break; } } } } vector sorted; for (map, Word>::iterator it = seqList.begin(); it != seqList.end(); it++) { // pull utf16 key from map and populate Word it->second.utf16 = it->first; // put only word in sorted container sorted.push_back(it->second); } sort(sorted.begin(), sorted.end(), compareFreq); return sorted; } int main(int argc, char **argv) { int minLength = 1; int maxLength = 3; char *range = "mat-rev"; int order = 1; int format = 1; if (argc > 1) minLength = atoi(argv[1]); if (argc > 2) maxLength = atoi(argv[2]); if (argc > 3) range = argv[3]; if (argc > 4) order = atoi(argv[4]); if (argc > 5) format = atoi(argv[5]); vector results; for (int i = minLength; i <= maxLength; i++) { vector pass = processSequences(range, i); results.insert(results.end(), pass.begin(), pass.end()); } if (order == 1) { 
sort(results.begin(), results.end(), compareFreq); } else { sort(results.begin(), results.end(), compareSeqLenFreq); } if (format == 1) { outputCSV(results); } else if (format == 2) { outputHTML(results); } else { outputXML(results); } return 0; }