[sword-svn] r3494 - in trunk: . include src/mgr src/modules/filters src/utilfuns tests
scribe at crosswire.org
scribe at crosswire.org
Mon Sep 11 04:39:51 MST 2017
Author: scribe
Date: 2017-09-11 04:39:50 -0700 (Mon, 11 Sep 2017)
New Revision: 3494
Modified:
trunk/configure.ac
trunk/include/filemgr.h
trunk/include/swbuf.h
trunk/include/utilstr.h
trunk/src/mgr/filemgr.cpp
trunk/src/modules/filters/rtfhtml.cpp
trunk/src/utilfuns/swbuf.cpp
trunk/src/utilfuns/utilstr.cpp
trunk/tests/utf8norm.cpp
Log:
Improved UTF-8 handling. Optimization pass before release
Modified: trunk/configure.ac
===================================================================
--- trunk/configure.ac 2017-09-11 11:38:52 UTC (rev 3493)
+++ trunk/configure.ac 2017-09-11 11:39:50 UTC (rev 3494)
@@ -115,8 +115,8 @@
AM_CXXFLAGS="-O0 -Wall -Werror -Woverloaded-virtual"
fi
else
- AM_CFLAGS="-O3"
- AM_CXXFLAGS="-O3"
+ AM_CFLAGS="-Ofast"
+ AM_CXXFLAGS="-Ofast"
fi
AM_CFLAGS="$AM_CFLAGS -fPIC"
Modified: trunk/include/filemgr.h
===================================================================
--- trunk/include/filemgr.h 2017-09-11 11:38:52 UTC (rev 3493)
+++ trunk/include/filemgr.h 2017-09-11 11:39:50 UTC (rev 3494)
@@ -32,7 +32,7 @@
SWORD_NAMESPACE_START
-class SWDLLEXPORT FileMgr;
+class SWDLLEXPORT FileDesc;
struct SWDLLEXPORT DirEntry {
public:
@@ -40,44 +40,7 @@
unsigned long size;
bool isDirectory;
};
-/**
-* This class represents one file. It works with the FileMgr object.
-*/
-class SWDLLEXPORT FileDesc {
- friend class FileMgr;
-
- long offset;
- int fd; // -77 closed;
- FileMgr *parent;
- FileDesc *next;
-
- FileDesc(FileMgr * parent, const char *path, int mode, int perms, bool tryDowngrade);
- virtual ~FileDesc();
-
-public:
- /** @return File handle.
- */
- int getFd();
-
- long seek(long offset, int whence);
- long read(void *buf, long count);
- long write(const void *buf, long count);
-
- /** Path to file.
- */
- char *path;
- /** File access mode.
- */
- int mode;
- /** File permissions.
- */
- int perms;
- /**
- */
- bool tryDowngrade;
-};
-
/**
* This class is used to make file access operations easier.
* It keeps a list of all open files internally and closes them
@@ -186,6 +149,51 @@
};
+/**
+* This class represents one file. It works with the FileMgr object.
+*
+* The constructor and destructor are private and FileMgr is a friend, so
+* instances can only be created and destroyed from within FileMgr.
+* The inline getFd() below calls parent->sysOpen(), which requires the
+* complete FileMgr type to be visible before this class is defined.
+*/
+class SWDLLEXPORT FileDesc {
+ friend class FileMgr;
+
+ long offset;
+ int fd; // -77 closed;
+ FileMgr *parent; // manager used by getFd() to (re)open this file
+ FileDesc *next; // NOTE(review): presumably the link in FileMgr's internal list of files -- confirm
+
+ FileDesc(FileMgr * parent, const char *path, int mode, int perms, bool tryDowngrade);
+ virtual ~FileDesc();
+
+public:
+ /** @return File handle.
+ * NOTE: magic file descriptor -77 = closed to avoid os limits
+ * If fd currently holds -77, the file is opened through
+ * parent->sysOpen(this) and the result is stored in fd before
+ * it is returned; otherwise the stored fd is returned as is.
+ */
+ inline int getFd() {
+ if (fd == -77)
+ fd = parent->sysOpen(this);
+// if ((fd < -1) && (fd != -77)) // kludge to handle ce
+// return 777;
+ return fd;
+ }
+
+ long seek(long offset, int whence);
+ long read(void *buf, long count);
+ long write(const void *buf, long count);
+
+ /** Path to file.
+ */
+ char *path;
+ /** File access mode.
+ */
+ int mode;
+ /** File permissions.
+ */
+ int perms;
+ /**
+ * NOTE(review): undocumented in the original. Presumably tells
+ * FileMgr whether it may retry the open with reduced access when
+ * the requested mode fails -- confirm against FileMgr::sysOpen().
+ */
+ bool tryDowngrade;
+};
+
+
SWORD_NAMESPACE_END
#endif
Modified: trunk/include/swbuf.h
===================================================================
--- trunk/include/swbuf.h 2017-09-11 11:38:52 UTC (rev 3493)
+++ trunk/include/swbuf.h 2017-09-11 11:39:50 UTC (rev 3494)
@@ -90,28 +90,41 @@
init(0);
}
- /**
- * SWBuf Constructor - Creates an SWBuf initialized
+ /******************************************************************************
+ * SWBuf Constructor - Creates an empty SWBuf object or an SWBuf initialized
* to a value from a const char *
- *
- */
- SWBuf(const char *initVal, unsigned long initSize = 0);
-// SWBuf(unsigned long initSize);
+ *
+ */
+ inline SWBuf(const char *initVal, unsigned long initSize = 0) {
+     init(initSize);
+     // a null initVal leaves the freshly initialized buffer untouched
+     if (!initVal)
+         return;
+     set(initVal);
+ }
- /**
+ /******************************************************************************
* SWBuf Constructor - Creates an SWBuf initialized
- * to a value from a char
+ * to a value from another SWBuf
*
*/
- SWBuf(char initVal, unsigned long initSize = 0);
+ inline SWBuf(const SWBuf &other, unsigned long initSize = 0) {
+     // set up storage as requested by initSize first ...
+     init(initSize);
+     // ... then take over the value held by the other buffer
+     set(other);
+ }
- /**
+ /******************************************************************************
* SWBuf Constructor - Creates an SWBuf initialized
- * to a value from another SWBuf
+ * to a value from a char
*
*/
- SWBuf(const SWBuf &other, unsigned long initSize = 0);
+ inline SWBuf(char initVal, unsigned long initSize = 0) {
+     // one extra byte is requested on top of initSize for the character itself
+     init(initSize+1);
+     buf[0] = initVal;
+     buf[1] = 0;
+     // the value is exactly one character long
+     end = buf + 1;
+ }
+// SWBuf(unsigned long initSize);
+
/******************************************************************************
* SWBuf Destructor - Cleans up instance of SWBuf
*/
@@ -220,7 +233,13 @@
* SWBuf::setSize - Size this buffer to a specific length.
* @param len The new size of the buffer. One byte for the null will be added.
*/
- void setSize(unsigned long len);
+ inline void setSize(unsigned long len) {
+     // room for len bytes plus the terminating null
+     assureSize(len+1);
+     // Current length at full width: the previous (unsigned) cast cut it to
+     // the width of unsigned int before comparing it with an unsigned long.
+     const unsigned long cur = (unsigned long)(end - buf);
+     // growing: pad the newly exposed bytes with fillByte
+     if (cur < len)
+         memset(end, fillByte, len - cur);
+     end = buf + len;
+     *end = 0;
+ }
/**
* SWBuf::resize - Resize this buffer to a specific length.
* @param len The new size of the buffer. One byte for the null will be added.
@@ -233,7 +252,17 @@
* @param str Append this.
* @param max Append only max chars.
*/
- SWBuf &append(const char *str, long max = -1);
+ inline SWBuf &append(const char *str, long max = -1) {
+     // A null source appends nothing. Without this guard strlen() or the
+     // copy loop below would dereference the null pointer.
+     if (!str)
+         return *this;
+     // a negative max means "the whole string"
+     if (max < 0)
+         max = strlen(str);
+     // reserve max bytes plus the terminating null
+     assureMore(max+1);
+     // copy at most max bytes, stopping early at the source's terminator
+     for (; max && *str; --max)
+         *end++ = *str++;
+     *end = 0;
+     return *this;
+ }
/**
* SWBuf::append - appends a value to the current value of this SWBuf
Modified: trunk/include/utilstr.h
===================================================================
--- trunk/include/utilstr.h 2017-09-11 11:38:52 UTC (rev 3493)
+++ trunk/include/utilstr.h 2017-09-11 11:39:50 UTC (rev 3494)
@@ -29,9 +29,29 @@
SWORD_NAMESPACE_START
-/** stdstr - clone a string
-*/
-SWDLLEXPORT char *stdstr (char **iistr, const char *istr, unsigned int memPadFactor = 1);
+
+/******************************************************************************
+ * stdstr - clones a string
+ *
+ * ENT: ipstr - pointer to a string pointer to set; any string it currently
+ *              points to is released with delete []
+ *      istr - string to copy into a new buffer stored in *ipstr
+ *              0 - release the current string and set *ipstr to 0
+ *      memPadFactor - multiplier applied to the allocation size
+ *              (strlen(istr) + 1); only strlen(istr) + 1 bytes are
+ *              initialized.  0 is treated as 1.
+ *
+ * RET: *ipstr
+ */
+
+inline char *stdstr(char **ipstr, const char *istr, unsigned int memPadFactor = 1) {
+    char *copy = 0;
+    if (istr) {
+        const size_t len = strlen(istr) + 1;
+        // never allocate less than the bytes copied below
+        copy = new char [ len * (memPadFactor ? memPadFactor : 1) ];
+        memcpy(copy, istr, len);
+    }
+    // Release the old buffer only after the copy has been made: istr may
+    // point into *ipstr (e.g. stdstr(&s, s)), and freeing first would read
+    // released memory.  delete [] on a null pointer is a no-op.
+    delete [] *ipstr;
+    *ipstr = copy;
+    return *ipstr;
+}
+
SWDLLEXPORT char *strstrip (char *istr);
SWDLLEXPORT const char *stristr (const char *s1, const char *s2);
SWDLLEXPORT int strnicmp(const char *s1, const char *s2, int len);
@@ -55,9 +75,82 @@
* unicode codepoint value (0 with buf incremented is invalid UTF8 byte)
*/
-__u32 getUniCharFromUTF8(const unsigned char **buf);
+/******************************************************************************
+ * getUniCharFromUTF8 - retrieves the next Unicode codepoint from a UTF8 string
+ *      and increments buf to start of next codepoint
+ *
+ * ENT: buf - address of a utf8 buffer
+ *      skipValidation - when true, the post-decode checks (sequence length,
+ *              range, overlong forms, surrogates) are skipped and whatever
+ *              the bits decode to is returned
+ *
+ * RET: buf - incremented past last byte used in computing the current codepoint
+ *      unicode codepoint value; 0 with buf incremented means an invalid UTF8
+ *      sequence was skipped, 0 with buf unchanged means end of string
+ */
+inline __u32 getUniCharFromUTF8(const unsigned char **buf, bool skipValidation = false) {
+    const unsigned char lead = **buf;
+
+    //case: We're at the end -- nothing is consumed
+    if (!lead) {
+        return 0;
+    }
+
+    //case: ANSI (single byte, high bit clear)
+    if (!(lead & 128)) {
+        (*buf)++;
+        return lead;
+    }
+
+    //case: Invalid UTF-8 (illegal continuing byte in initial position)
+    if ((lead >> 6) == 2) {
+        (*buf)++;
+        return 0;
+    }
+
+    //case: 2+ byte codepoint
+    // the number of continuation bytes is the count of 1 bits that follow
+    // the top bit of the lead byte
+    int subsequent;
+    if ((lead & 32) == 0) { subsequent = 1; }
+    else if ((lead & 16) == 0) { subsequent = 2; }
+    else if ((lead & 8) == 0) { subsequent = 3; }
+    else if ((lead & 4) == 0) { subsequent = 4; }
+    else if ((lead & 2) == 0) { subsequent = 5; }
+    else if ((lead & 1) == 0) { subsequent = 6; }
+    else subsequent = 7; // lead byte 0xFF: never valid, rejected below unless skipValidation
+
+    // payload bits of the lead byte
+    __u32 ch = lead & (0xFF>>(subsequent + 1));
+
+    for (int i = 1; i <= subsequent; ++i) {
+        // subsequent byte did not begin with 10XXXXXX
+        // move our buffer to here and error out
+        // this also catches our null if we hit the string terminator
+        if (((*buf)[i] >> 6) != 2) {
+            *buf += i;
+            return 0;
+        }
+        ch <<= 6;
+        ch |= (*buf)[i] & 63;
+    }
+    *buf += (subsequent+1);
+
+    if (!skipValidation) {
+        // the spec allows no more than 4 bytes per sequence
+        if (subsequent > 3) ch = 0;
+        // beyond the last Unicode codepoint
+        else if (ch > 0x10FFFF) ch = 0;
+        // UTF-16 surrogates are not characters and must not appear in UTF-8
+        else if (ch >= 0xD800 && ch <= 0xDFFF) ch = 0;
+        // overlong forms: values which could be represented in fewer bytes
+        // (subsequent is at least 1 here, so anything below 0x80 is overlong)
+        else if (ch < 0x80) ch = 0;
+        else if (ch < 0x800 && subsequent > 1) ch = 0;
+        else if (ch < 0x10000 && subsequent > 2) ch = 0;
+    }
+
+    return ch;
+}
+
+
/******************************************************************************
* getUTF8FromUniChar - retrieves a UTF8 string from a
* Unicode codepoint
@@ -66,11 +159,95 @@
*
* RET: buf - a UTF8 string which consists of the proper UTF8 sequence of
* bytes for the given Unicode codepoint
+ * NOTE: for speed and thread safety, this method now requires a buffer
+ * to work with
*/
-SWBuf getUTF8FromUniChar(__u32 uchar);
+inline SWBuf *getUTF8FromUniChar(__u32 uchar, SWBuf *appendTo) {
+    // Only Unicode scalar values can be written as UTF-8.  Values beyond
+    // U+10FFFF and the UTF-16 surrogate range U+D800..U+DFFF are replaced
+    // by U+FFFD (REPLACEMENT CHARACTER) so the output is always well formed.
+    if (uchar > 0x10FFFF || (uchar >= 0xD800 && uchar <= 0xDFFF)) uchar = 0xFFFD;
+
+    // the encoded bytes are written in place at the current end of appendTo
+    const unsigned long base = appendTo->size();
+    const int bytes = uchar < 0x80 ? 1 : uchar < 0x800 ? 2 : uchar < 0x10000 ? 3 : 4;
+    appendTo->setSize(base+bytes);
+
+    // Fill from the last byte backwards: each continuation byte carries the
+    // low 6 bits, the lead byte carries the remaining bits under its marker.
+    // 5- and 6-byte forms are deliberately not produced; they cannot occur
+    // once uchar is limited to U+10FFFF.
+    switch (bytes) {
+    case 1:
+        (*appendTo)[base  ] = (unsigned char)uchar;
+        break;
+    case 2:
+        (*appendTo)[base+1] = (unsigned char)(0x80 | (uchar & 0x3f));
+        uchar >>= 6;
+        (*appendTo)[base  ] = (unsigned char)(0xc0 | (uchar & 0x1f));
+        break;
+    case 3:
+        (*appendTo)[base+2] = (unsigned char)(0x80 | (uchar & 0x3f));
+        uchar >>= 6;
+        (*appendTo)[base+1] = (unsigned char)(0x80 | (uchar & 0x3f));
+        uchar >>= 6;
+        (*appendTo)[base  ] = (unsigned char)(0xe0 | (uchar & 0x0f));
+        break;
+    case 4:
+        (*appendTo)[base+3] = (unsigned char)(0x80 | (uchar & 0x3f));
+        uchar >>= 6;
+        (*appendTo)[base+2] = (unsigned char)(0x80 | (uchar & 0x3f));
+        uchar >>= 6;
+        (*appendTo)[base+1] = (unsigned char)(0x80 | (uchar & 0x3f));
+        uchar >>= 6;
+        (*appendTo)[base  ] = (unsigned char)(0xf0 | (uchar & 0x07));
+        break;
+    }
+    // returned for call chaining; it is the same pointer that was passed in
+    return appendTo;
+}
+
+
/******************************************************************************
* assureValidUTF8 - iterates the supplied UTF-8 buffer and checks for validity
* replacing invalid bytes if necessary and returning a
Modified: trunk/src/mgr/filemgr.cpp
===================================================================
--- trunk/src/mgr/filemgr.cpp 2017-09-11 11:38:52 UTC (rev 3493)
+++ trunk/src/mgr/filemgr.cpp 2017-09-11 11:39:50 UTC (rev 3494)
@@ -131,15 +131,6 @@
}
-int FileDesc::getFd() {
- if (fd == -77)
- fd = parent->sysOpen(this);
-// if ((fd < -1) && (fd != -77)) // kludge to hand ce
-// return 777;
- return fd;
-}
-
-
long FileDesc::seek(long offset, int whence) {
return lseek(getFd(), offset, whence);
}
Modified: trunk/src/modules/filters/rtfhtml.cpp
===================================================================
--- trunk/src/modules/filters/rtfhtml.cpp 2017-09-11 11:38:52 UTC (rev 3493)
+++ trunk/src/modules/filters/rtfhtml.cpp 2017-09-11 11:39:50 UTC (rev 3494)
@@ -55,7 +55,7 @@
num.append(from, end-from);
__s16 n = atoi(num.c_str());
__u32 u = (__u16)n;
- text.append(getUTF8FromUniChar(u));
+ getUTF8FromUniChar(u, &text);
from += (end-from);
continue;
}
Modified: trunk/src/utilfuns/swbuf.cpp
===================================================================
--- trunk/src/utilfuns/swbuf.cpp 2017-09-11 11:38:52 UTC (rev 3493)
+++ trunk/src/utilfuns/swbuf.cpp 2017-09-11 11:39:50 UTC (rev 3494)
@@ -32,39 +32,7 @@
char *SWBuf::nullStr = (char *)"";
-/******************************************************************************
-* SWBuf Constructor - Creates an empty SWBuf object or an SWBuf initialized
-* to a value from a const char *
-*
-*/
-SWBuf::SWBuf(const char *initVal, unsigned long initSize) {
- init(initSize);
- if (initVal)
- set(initVal);
-}
-/******************************************************************************
-* SWBuf Constructor - Creates an SWBuf initialized
-* to a value from another SWBuf
-*
-*/
-SWBuf::SWBuf(const SWBuf &other, unsigned long initSize) {
- init(initSize);
- set(other);
-}
-
-/******************************************************************************
-* SWBuf Constructor - Creates an SWBuf initialized
-* to a value from a char
-*
-*/
-SWBuf::SWBuf(char initVal, unsigned long initSize) {
- init(initSize+1);
- *buf = initVal;
- end = buf+1;
- *end = 0;
-}
-
/*
SWBuf::SWBuf(unsigned long initSize) {
init(initSize);
@@ -97,33 +65,6 @@
}
/******************************************************************************
-* SWBuf::append - appends a value to the current value of this SWBuf
-*
-*/
-SWBuf &SWBuf::append(const char *str, long max) {
-// if (!str) //A null string was passed
-// return;
- if (max < 0)
- max = strlen(str);
- assureMore(max+1);
- for (;((max)&&(*str));max--)
- *end++ = *str++;
- *end = 0;
- return *this;
-}
-
-/******************************************************************************
-* SWBuf::setSize - Size this buffer to a specific length
-*/
-void SWBuf::setSize(unsigned long len) {
- assureSize(len+1);
- if ((unsigned)(end - buf) < len)
- memset(end, fillByte, len - (end-buf));
- end = buf + len;
- *end = 0;
-}
-
-/******************************************************************************
* SWBuf::appendFormatted - appends formatted strings to the current value of this SWBuf
* WARNING: This function can only write at most
* JUNKBUFSIZE to the string per call.
Modified: trunk/src/utilfuns/utilstr.cpp
===================================================================
--- trunk/src/utilfuns/utilstr.cpp 2017-09-11 11:38:52 UTC (rev 3493)
+++ trunk/src/utilfuns/utilstr.cpp 2017-09-11 11:39:50 UTC (rev 3494)
@@ -67,29 +67,7 @@
};
-/******************************************************************************
- * stdstr - clones a string
- *
- * ENT: ipstr - pointer to a string pointer to set if necessary
- * istr - string to set to *ipstr
- * 0 - only get
- *
- * RET: *ipstr
- */
-char *stdstr(char **ipstr, const char *istr, unsigned int memPadFactor) {
- if (*ipstr)
- delete [] *ipstr;
- if (istr) {
- int len = (int)strlen(istr) + 1;
- *ipstr = new char [ len * memPadFactor ];
- memcpy(*ipstr, istr, len);
- }
- else *ipstr = 0;
- return *ipstr;
-}
-
-
/******************************************************************************
* strstrip - Removes leading and trailing spaces from a string
*
@@ -187,163 +165,6 @@
}
-/******************************************************************************
- * getUniCharFromUTF8 - retrieves the next Unicode codepoint from a UTF8 string
- * and increments buf to start of next codepoint
- *
- * ENT: buf - address of a utf8 buffer
- *
- * RET: buf - incremented past last byte used in computing the current codepoint
- * unicode codepoint value (0 with buf incremented is invalid UTF8 byte
- */
-
-__u32 getUniCharFromUTF8(const unsigned char **buf) {
- __u32 ch = 0;
- unsigned char multibuf[7];
-
- //case: We're at the end
- if (!(**buf)) {
- return ch;
- }
-
- //case: ANSI
- if (!(**buf & 128)) {
- ch = **buf;
- (*buf)++;
- return ch;
- }
-
- //case: Invalid UTF-8 (illegal continuing byte in initial position)
- if ((**buf & 128) && (!(**buf & 64))) {
- (*buf)++;
- return ch;
- }
-
- //case: 2+ byte codepoint
- multibuf[0] = **buf;
- multibuf[0] <<= 1;
- int subsequent;
- for (subsequent = 1; (multibuf[0] & 128) && (subsequent < 7); subsequent++) {
- multibuf[0] <<= 1;
- multibuf[subsequent] = (*buf)[subsequent];
- multibuf[subsequent] &= 63;
- // subsequent byte did not begin with 10XXXXXX
- // move our buffer to here and error out
- if (((*buf)[subsequent] - multibuf[subsequent]) != 128) {
- *buf += subsequent;
- return 0;
- }
- ch <<= 6;
- ch |= multibuf[subsequent];
- }
- subsequent--;
- multibuf[0] <<= 1;
- char significantFirstBits = 8 - (2+subsequent);
-
- ch |= (((__s16)multibuf[0]) << (((6*subsequent)+significantFirstBits)-8));
- *buf += (subsequent+1);
- return ch;
-}
-
-
-SWBuf getUTF8FromUniChar(__u32 uchar) {
- SWBuf retVal("", 7);
- unsigned int i;
-
- if (uchar < 0x80) {
- retVal.append((unsigned char)uchar);
- retVal.setSize(1);
- }
- else if (uchar < 0x800) {
- retVal.setSize(2);
- i = uchar & 0x3f;
- retVal[1] = (unsigned char)(0x80 | i);
- uchar >>= 6;
-
- i = uchar & 0x1f;
- retVal[0] = (unsigned char)(0xc0 | i);
- }
- else if (uchar < 0x10000) {
- retVal.setSize(3);
- i = uchar & 0x3f;
- retVal[2] = (unsigned char)(0x80 | i);
- uchar >>= 6;
-
- i = uchar & 0x3f;
- retVal[1] = (unsigned char)(0x80 | i);
- uchar >>= 6;
-
- i = uchar & 0x0f;
- retVal[0] = (unsigned char)(0xe0 | i);
- }
- else if (uchar < 0x200000) {
- retVal.setSize(4);
- i = uchar & 0x3f;
- retVal[3] = (unsigned char)(0x80 | i);
- uchar >>= 6;
-
- i = uchar & 0x3f;
- retVal[2] = (unsigned char)(0x80 | i);
- uchar >>= 6;
-
- i = uchar & 0x3f;
- retVal[1] = (unsigned char)(0x80 | i);
- uchar >>= 6;
-
- i = uchar & 0x07;
- retVal[0] = (unsigned char)(0xf0 | i);
- }
- else if (uchar < 0x4000000) {
- retVal.setSize(5);
- i = uchar & 0x3f;
- retVal[4] = (unsigned char)(0x80 | i);
- uchar >>= 6;
-
- i = uchar & 0x3f;
- retVal[3] = (unsigned char)(0x80 | i);
- uchar >>= 6;
-
- i = uchar & 0x3f;
- retVal[2] = (unsigned char)(0x80 | i);
- uchar >>= 6;
-
- i = uchar & 0x3f;
- retVal[1] = (unsigned char)(0x80 | i);
- uchar >>= 6;
-
- i = uchar & 0x03;
- retVal[0] = (unsigned char)(0xf8 | i);
- }
- else if (uchar < 0x80000000) {
- retVal.setSize(6);
- i = uchar & 0x3f;
- retVal[5] = (unsigned char)(0x80 | i);
- uchar >>= 6;
-
- i = uchar & 0x3f;
- retVal[4] = (unsigned char)(0x80 | i);
- uchar >>= 6;
-
- i = uchar & 0x3f;
- retVal[3] = (unsigned char)(0x80 | i);
- uchar >>= 6;
-
- i = uchar & 0x3f;
- retVal[2] = (unsigned char)(0x80 | i);
- uchar >>= 6;
-
- i = uchar & 0x3f;
- retVal[1] = (unsigned char)(0x80 | i);
- uchar >>= 6;
-
- i = uchar & 0x01;
- retVal[0] = (unsigned char)(0xfc | i);
- }
-
- return retVal;
-}
-
-
SWBuf assureValidUTF8(const char *buf) {
SWBuf myCopy = buf;
@@ -404,7 +225,7 @@
SWBuf utf8Buf;
while (*buf) {
- utf8Buf.append(getUTF8FromUniChar(*buf++));
+ getUTF8FromUniChar(*buf++, &utf8Buf);
}
return utf8Buf;
}
Modified: trunk/tests/utf8norm.cpp
===================================================================
--- trunk/tests/utf8norm.cpp 2017-09-11 11:38:52 UTC (rev 3493)
+++ trunk/tests/utf8norm.cpp 2017-09-11 11:39:50 UTC (rev 3494)
@@ -23,16 +23,62 @@
#include <iostream>
#include <utilstr.h>
#include <swbuf.h>
+#if !defined(__GNUC__) && !defined(_WIN32_WCE)
+#include <io.h>
+#include <direct.h>
+#else
+#include <unistd.h>
+#endif
+#include <utf8greekaccents.h>
using namespace sword;
using namespace std;
int main(int argc, char **argv) {
- const char *buf = (argc > 1) ? argv[1] : "Description=German Unrevidierte Luther Übersetzung von 1545";
+ const char *buf = (argc > 1 && argv[1][0] != '-') ? argv[1] : 0; // "Description=German Unrevidierte Luther Übersetzung von 1545";
- SWBuf fixed = assureValidUTF8(buf);
+ if (buf) { // argument mode: validate argv[1] and print it before and after
+ SWBuf fixed = assureValidUTF8(buf);
- cout << "input / processed:\n" << buf << "\n" << fixed << endl;
+ cout << "input / processed:\n" << buf << "\n" << fixed << endl;
+ }
+ else { // stdin mode: no argument given, or argv[1] starts with '-'
+ SWOptionFilter *filter = 0;
+ if (argc > 1 && !strcmp(argv[1], "-ga")) filter = new UTF8GreekAccents(); // "-ga" selects the Greek accents filter
+ if (filter && filter->isBoolean()) filter->setOptionValue("Off");
+ int repeat = 1;
+ if (argc > 2) repeat = atoi(argv[2]); // optional argv[2]: how many times to run the filter
+ SWBuf contents = "";
+ char chunk[255];
+ int count = 254;
+ while (count > 0) { // keep reading stdin until read() returns 0 or a negative value
+ count = read(STDIN_FILENO, chunk, 254);
+ if (count > 0) {
+ chunk[count] = 0; // terminate the chunk so it can be appended as a C string
+ contents.append(chunk);
+ }
+ }
+ SWBuf filteredContents = contents;
+ if (filter) {
+ for (int i = 0; i < repeat; ++i) {
+ filteredContents = contents; // every pass starts again from the unfiltered input
+ filter->processText(filteredContents);
+ }
+ }
+ const unsigned char *c = (const unsigned char *)filteredContents.getRawData();
+ // UTF-32 BOM
+ __u32 ch = 0xfeff;
+// write(STDOUT_FILENO, &ch, 4);
+ while (c && *c) { // decode each codepoint and write it back out re-encoded
+ ch = getUniCharFromUTF8(&c);
+// ch = __swswap32(ch);
+ if (!ch) ch = 0xFFFD; // a 0 result from the decoder is written as U+FFFD
+ SWBuf c8;
+ getUTF8FromUniChar(ch, &c8);
+ write(STDOUT_FILENO, c8.getRawData(), c8.length());
+ }
+ delete filter;
+ }
return 0;
}
More information about the sword-cvs
mailing list