[sword-svn] r2295 - in trunk: include src/modules/filters src/utilfuns tests utilities
scribe at crosswire.org
scribe at crosswire.org
Sun Mar 29 10:11:27 MST 2009
Author: scribe
Date: 2009-03-29 10:11:27 -0700 (Sun, 29 Mar 2009)
New Revision: 2295
Modified:
trunk/include/filemgr.h
trunk/include/utilstr.h
trunk/src/modules/filters/osishtmlhref.cpp
trunk/src/modules/filters/utf8utf16.cpp
trunk/src/utilfuns/utilstr.cpp
trunk/tests/filtertest.cpp
trunk/utilities/imp2vs.cpp
Log:
extracted UTF8 codepoint logic from utf8->utf16 filter
into utilstr
Applied Ben Morgan's patch to handle multibyte divine name data
Modified: trunk/include/filemgr.h
===================================================================
--- trunk/include/filemgr.h 2009-03-29 14:48:47 UTC (rev 2294)
+++ trunk/include/filemgr.h 2009-03-29 17:11:27 UTC (rev 2295)
@@ -143,7 +143,7 @@
* Will only close the file if it was created by this FileMgr object.
* @param file The file to close.
*/
- void close(FileDesc * file);
+ void close(FileDesc *file);
/** Cacher methods overridden
*/
Modified: trunk/include/utilstr.h
===================================================================
--- trunk/include/utilstr.h 2009-03-29 14:48:47 UTC (rev 2294)
+++ trunk/include/utilstr.h 2009-03-29 17:11:27 UTC (rev 2295)
@@ -23,6 +23,7 @@
#define UTILSTR_H
#include <defs.h>
+#include <sysdata.h>
SWORD_NAMESPACE_START
@@ -42,5 +43,18 @@
extern const unsigned char SW_toupper_array[256];
#define SW_toupper(c) SW_toupper_array[(unsigned char)c]
+/******************************************************************************
+ * getUniCharFromUTF8 - retrieves the next Unicode codepoint from a UTF8 string
+ * and increments buf to start of next codepoint
+ *
+ * ENT: buf - address of a utf8 buffer
+ *
+ * RET: buf - incremented past last byte used in computing the current codepoint
+ * unicode codepoint value (0 with buf incremented indicates an invalid UTF8 byte)
+ */
+
+__u32 getUniCharFromUTF8(const unsigned char **buf);
+
+
SWORD_NAMESPACE_END
#endif
Modified: trunk/src/modules/filters/osishtmlhref.cpp
===================================================================
--- trunk/src/modules/filters/osishtmlhref.cpp 2009-03-29 14:48:47 UTC (rev 2294)
+++ trunk/src/modules/filters/osishtmlhref.cpp 2009-03-29 17:11:27 UTC (rev 2295)
@@ -432,6 +432,16 @@
if (lastText.size()) {
toupperstr(lastText);
scratch.setFormatted("%c<font size=\"-1\">%s</font>", lastText[0], lastText.c_str()+1);
+
+ const unsigned char *tmpBuf = (const unsigned char *)lastText.c_str();
+ getUniCharFromUTF8(&tmpBuf);
+ int char_length = (tmpBuf - (const unsigned char *)lastText.c_str());
+ scratch.setFormatted("%.*s<font size=\"-1\">%s</font>",
+ char_length,
+ lastText.c_str(),
+ lastText.c_str() + char_length
+ );
+
outText(scratch.c_str(), buf, u);
}
}
Modified: trunk/src/modules/filters/utf8utf16.cpp
===================================================================
--- trunk/src/modules/filters/utf8utf16.cpp 2009-03-29 14:48:47 UTC (rev 2294)
+++ trunk/src/modules/filters/utf8utf16.cpp 2009-03-29 17:11:27 UTC (rev 2295)
@@ -23,7 +23,9 @@
#include <stdlib.h>
#include <stdio.h>
+#include <sysdata.h>
#include <utf8utf16.h>
+#include <utilstr.h>
#include <swbuf.h>
SWORD_NAMESPACE_START
@@ -31,61 +33,36 @@
UTF8UTF16::UTF8UTF16() {
}
+
char UTF8UTF16::processText(SWBuf &text, const SWKey *key, const SWModule *module) {
const unsigned char *from;
- unsigned long ch;
- signed short utf16;
- unsigned char from2[7];
-
SWBuf orig = text;
from = (const unsigned char *)orig.c_str();
// -------------------------------
- for (text = ""; *from; from++) {
- ch = 0;
- //case: ANSI
- if ((*from & 128) != 128) {
+ text = "";
+ while (*from) {
+
+ __u32 ch = getUniCharFromUTF8(&from);
+
+ if (!ch) continue; // invalid char
+
+ if (ch < 0x10000) {
text.setSize(text.size()+2);
- *((unsigned short *)(text.getRawData()+(text.size()-2))) = (unsigned short)*from;
- continue;
+ *((__u16 *)(text.getRawData()+(text.size()-2))) = (__u16)ch;
}
- //case: Invalid UTF-8 (illegal continuing byte in initial position)
- if ((*from & 128) && ((*from & 64) != 64)) {
- continue;
+ else {
+ __u16 utf16;
+ utf16 = (__s16)((ch - 0x10000) / 0x400 + 0xD800);
+ text.setSize(text.size()+4);
+ *((__u16 *)(text.getRawData()+(text.size()-4))) = utf16;
+ utf16 = (__s16)((ch - 0x10000) % 0x400 + 0xDC00);
+ *((__u16 *)(text.getRawData()+(text.size()-2))) = utf16;
}
- //case: 2+ byte codepoint
- from2[0] = *from;
- from2[0] <<= 1;
- int subsequent;
- for (subsequent = 1; (from2[0] & 128) && (subsequent < 7); subsequent++) {
- from2[0] <<= 1;
- from2[subsequent] = from[subsequent];
- from2[subsequent] &= 63;
- ch <<= 6;
- ch |= from2[subsequent];
- }
- subsequent--;
- from2[0] <<= 1;
- char significantFirstBits = 8 - (2+subsequent);
-
- ch |= (((short)from2[0]) << (((6*subsequent)+significantFirstBits)-8));
- from += subsequent;
- if (ch < 0x10000) {
- text.setSize(text.size()+2);
- *((unsigned short *)(text.getRawData()+(text.size()-2))) = (unsigned short)ch;
- }
- else {
- utf16 = (signed short)((ch - 0x10000) / 0x400 + 0xD800);
- text.setSize(text.size()+2);
- *((unsigned short *)(text.getRawData()+(text.size()-2))) = (unsigned short)utf16;
- utf16 = (signed short)((ch - 0x10000) % 0x400 + 0xDC00);
- text.setSize(text.size()+2);
- *((unsigned short *)(text.getRawData()+(text.size()-2))) = (unsigned short)utf16;
- }
}
text.setSize(text.size()+2);
- *((unsigned short *)(text.getRawData()+(text.size()-2))) = (unsigned short)0;
+ *((__u16 *)(text.getRawData()+(text.size()-2))) = (__u16)0;
return 0;
Modified: trunk/src/utilfuns/utilstr.cpp
===================================================================
--- trunk/src/utilfuns/utilstr.cpp 2009-03-29 14:48:47 UTC (rev 2294)
+++ trunk/src/utilfuns/utilstr.cpp 2009-03-29 17:11:27 UTC (rev 2295)
@@ -19,20 +19,9 @@
#include <ctype.h>
#include <string.h>
-#include <localemgr.h>
+#include <sysdata.h>
-#ifdef _ICU_
-#include <unicode/utypes.h>
-#include <unicode/ucnv.h>
-#include <unicode/ustring.h>
-#include <unicode/uchar.h>
-
-#include <unicode/unistr.h>
-#include <unicode/translit.h>
-
-#endif
-
SWORD_NAMESPACE_START
const unsigned char SW_toupper_array[256] =
@@ -192,68 +181,58 @@
#endif
}
+
/******************************************************************************
- * toupperstr - converts a string to uppercase string
+ * getUniCharFromUTF8 - retrieves the next Unicode codepoint from a UTF8 string
+ * and increments buf to start of next codepoint
*
- * ENT: target - string to convert
+ * ENT: buf - address of a utf8 buffer
*
- * RET: target
+ * RET: buf - incremented past last byte used in computing the current codepoint
+ * unicode codepoint value (0 with buf incremented indicates an invalid UTF8 byte)
*/
-// char *toupperstr(char *buf) {
-// char *ret = buf;
-//
-// /*if (StringHelper::getSystemStringHelper()) {
-// StringHelper::getSystemStringHelper()->upperStringLatin1( ret );
-// }
-// else*/ {
-// while (*buf) {
-// *buf++ = SW_toupper(*buf);
-// }
-// // }
-// return ret;
-// }
+__u32 getUniCharFromUTF8(const unsigned char **buf) {
+ __u32 ch = 0;
+ unsigned char multibuf[7];
+ //case: We're at the end
+ if (!(**buf)) {
+ return ch;
+ }
-/******************************************************************************
- * toupperstr - converts a string to uppercase string
- *
- * ENT: target - string to convert
- *
- * RET: target
- */
+ //case: ANSI
+ if (!(**buf & 128)) {
+ ch = **buf;
+ (*buf)++;
+ return ch;
+ }
-// char *toupperstr_utf8(char *buf, unsigned int max) {
-// char *ret = buf;
-//
-// /* if (StringHelper::getSystemStringHelper()) {
-// StringHelper::getSystemStringHelper()->upperStringUtf8( ret );
-// return ret;
-// }*/
-//
-// #ifndef _ICU_
-// // try to decide if it's worth trying to toupper. Do we have more
-// // characters that are probably lower latin than not?
-// long performOp = 0;
-// for (const char *ch = buf; *ch; ch++)
-// performOp += (*ch > 0) ? 1 : -1;
-//
-// if (performOp > 0) {
-// while (*buf)
-// *buf = SW_toupper(*buf++);
-// }
-// #else
-// if (!max)
-// max = strlen(ret);
-// UErrorCode err = U_ZERO_ERROR;
-// UConverter *conv = ucnv_open("UTF-8", &err);
-// UnicodeString str(buf, -1, conv, err);
-// UnicodeString ustr = str.toUpper();
-// ustr.extract(ret, max, conv, err);
-// ucnv_close(conv);
-// #endif
-//
-// return ret;
-// }
+ //case: Invalid UTF-8 (illegal continuing byte in initial position)
+ if ((**buf & 128) && (!(**buf & 64))) {
+ (*buf)++;
+ return ch;
+ }
+ //case: 2+ byte codepoint
+ multibuf[0] = **buf;
+ multibuf[0] <<= 1;
+ int subsequent;
+ for (subsequent = 1; (multibuf[0] & 128) && (subsequent < 7); subsequent++) {
+ multibuf[0] <<= 1;
+ multibuf[subsequent] = (*buf)[subsequent];
+ multibuf[subsequent] &= 63;
+ ch <<= 6;
+ ch |= multibuf[subsequent];
+ }
+ subsequent--;
+ multibuf[0] <<= 1;
+ char significantFirstBits = 8 - (2+subsequent);
+
+ ch |= (((__s16)multibuf[0]) << (((6*subsequent)+significantFirstBits)-8));
+ *buf += (subsequent+1);
+ return ch;
+}
+
+
SWORD_NAMESPACE_END
Modified: trunk/tests/filtertest.cpp
===================================================================
--- trunk/tests/filtertest.cpp 2009-03-29 14:48:47 UTC (rev 2294)
+++ trunk/tests/filtertest.cpp 2009-03-29 17:11:27 UTC (rev 2295)
@@ -17,7 +17,9 @@
#include <iostream>
#include <swbuf.h>
+#include <filemgr.h>
#include <papyriplain.h>
+#include <utf8utf16.h>
//#include <swmgr.h>
#ifndef NO_SWORD_NAMESPACE
using namespace sword;
@@ -26,15 +28,42 @@
int main(int argc, char **argv) {
-// SWMgr mgr;
-// SWModule *module = mgr.getModule("KJV");
- PapyriPlain filter;
- SWBuf buf;
- buf = "This is t<e>xt which has papy-\nri markings in it.\n L[et's be] sure it gets--\n cleaned up well for s(earching)";
- std::cout << "Original:\n\n" << buf << "\n\n-------\n\n";
- filter.processText(buf);
-// filter.processText(buf, module->getKey(), module);
- std::cout << buf << "\n\n+++++++\n";
+ UTF8UTF16 filter;
+// PapyriPlain filter;
+//
+ FileDesc *fd = (argc > 1) ? FileMgr::getSystemFileMgr()->open(argv[1], FileMgr::RDONLY) : 0;
+ SWBuf lineBuffer = "This is t<e>xt which has papy-\nri markings in it.\n L[et's be] sure it gets--\n cleaned up well for s(earching)";
+
+ std::cout << "Original:\n\n";
+
+ while (!fd || FileMgr::getLine(fd, lineBuffer)) {
+ cout << lineBuffer << "\n";
+ if (!fd) break;
+ }
+
+ cout << "\n\n-------\n\n";
+
+ if (fd) {
+ FileMgr::getSystemFileMgr()->close(fd);
+ fd = FileMgr::getSystemFileMgr()->open(argv[1], FileMgr::RDONLY);
+ }
+
+ while (!fd || FileMgr::getLine(fd, lineBuffer)) {
+ filter.processText(lineBuffer);
+ for (unsigned int i = 0; i < lineBuffer.size(); i++) {
+ printf("%c", lineBuffer[i]);
+ }
+ cout << "\n";
+ if (!fd) break;
+ }
+
+ std::cout << "\n\n+++++++\n";
+
+ if (fd) {
+ FileMgr::getSystemFileMgr()->close(fd);
+ }
+
return 0;
}
+
Modified: trunk/utilities/imp2vs.cpp
===================================================================
--- trunk/utilities/imp2vs.cpp 2009-03-29 14:48:47 UTC (rev 2294)
+++ trunk/utilities/imp2vs.cpp 2009-03-29 17:11:27 UTC (rev 2295)
@@ -131,6 +131,8 @@
}
writeEntry(currentKey, currentEntry, module);
+ FileMgr::getSystemFileMgr()->close(fd);
+
delete vkey;
return 0;
More information about the sword-cvs
mailing list