[sword-svn] r3494 - in trunk: . include src/mgr src/modules/filters src/utilfuns tests

scribe at crosswire.org scribe at crosswire.org
Mon Sep 11 04:39:51 MST 2017


Author: scribe
Date: 2017-09-11 04:39:50 -0700 (Mon, 11 Sep 2017)
New Revision: 3494

Modified:
   trunk/configure.ac
   trunk/include/filemgr.h
   trunk/include/swbuf.h
   trunk/include/utilstr.h
   trunk/src/mgr/filemgr.cpp
   trunk/src/modules/filters/rtfhtml.cpp
   trunk/src/utilfuns/swbuf.cpp
   trunk/src/utilfuns/utilstr.cpp
   trunk/tests/utf8norm.cpp
Log:
Improved UTF-8 handling.  Optimization pass before release

Modified: trunk/configure.ac
===================================================================
--- trunk/configure.ac	2017-09-11 11:38:52 UTC (rev 3493)
+++ trunk/configure.ac	2017-09-11 11:39:50 UTC (rev 3494)
@@ -115,8 +115,8 @@
     AM_CXXFLAGS="-O0 -Wall -Werror -Woverloaded-virtual"
   fi
 else
-  AM_CFLAGS="-O3"
-  AM_CXXFLAGS="-O3"
+  AM_CFLAGS="-Ofast"
+  AM_CXXFLAGS="-Ofast"
 fi
 
 AM_CFLAGS="$AM_CFLAGS -fPIC"

Modified: trunk/include/filemgr.h
===================================================================
--- trunk/include/filemgr.h	2017-09-11 11:38:52 UTC (rev 3493)
+++ trunk/include/filemgr.h	2017-09-11 11:39:50 UTC (rev 3494)
@@ -32,7 +32,7 @@
 
 SWORD_NAMESPACE_START
 
-class SWDLLEXPORT FileMgr;
+class SWDLLEXPORT FileDesc;
 
 struct SWDLLEXPORT DirEntry {
 public:
@@ -40,44 +40,7 @@
 	unsigned long size;
 	bool isDirectory;
 };
-/**
-* This class represents one file. It works with the FileMgr object.
-*/
-class SWDLLEXPORT FileDesc {
 
-	friend class FileMgr;
-
-	long offset;
-	int fd;			// -77 closed;
-	FileMgr *parent;
-	FileDesc *next;
-
-	FileDesc(FileMgr * parent, const char *path, int mode, int perms, bool tryDowngrade);
-	virtual ~FileDesc();
-
-public:
-	/** @return File handle.
-	*/
-	int getFd();
-
-	long seek(long offset, int whence);
-	long read(void *buf, long count);
-	long write(const void *buf, long count);
-
-	/** Path to file.
-	*/
-	char *path;
-	/** File access mode.
-	*/
-	int mode;
-	/** File permissions.
-	*/
-	int perms;
-	/**
-	*/
-	bool tryDowngrade;
-};
-
 /**
 *	This class ist used make file access operations easier.
 * It keeps a list of all open files internally and closes them
@@ -186,6 +149,51 @@
 
 };
 
+/**
+* This class represents one file. It works with the FileMgr object (a friend), the only outside code that can reach the private constructor and destructor.
+*/
+class SWDLLEXPORT FileDesc {
 
+	friend class FileMgr;
+
+	long offset;
+	int fd;			// -77 = closed sentinel; getFd() reopens through parent when it sees it
+	FileMgr *parent;
+	FileDesc *next;
+
+	FileDesc(FileMgr * parent, const char *path, int mode, int perms, bool tryDowngrade);
+	virtual ~FileDesc();
+
+public:
+	/** @return File handle; when fd holds -77 it is first replaced by parent->sysOpen(this).
+	 * NOTE: magic file descriptor -77 = closed to avoid os limits
+	*/
+	inline int getFd() {
+		if (fd == -77)
+			fd = parent->sysOpen(this);
+//		if ((fd < -1) && (fd != -77))  // kludge to handle ce
+//			return 777;
+		return fd;
+	}
+
+	long seek(long offset, int whence);
+	long read(void *buf, long count);
+	long write(const void *buf, long count);
+
+	/** Path to file.
+	*/
+	char *path;
+	/** File access mode.
+	*/
+	int mode;
+	/** File permissions.
+	*/
+	int perms;
+	/** NOTE(review): presumably the tryDowngrade value given to the constructor; its effect is not visible here -- confirm in filemgr.cpp.
+	*/
+	bool tryDowngrade;
+};
+
+
 SWORD_NAMESPACE_END
 #endif

Modified: trunk/include/swbuf.h
===================================================================
--- trunk/include/swbuf.h	2017-09-11 11:38:52 UTC (rev 3493)
+++ trunk/include/swbuf.h	2017-09-11 11:39:50 UTC (rev 3494)
@@ -90,28 +90,41 @@
 		init(0);
 	}
 
-	/**
-	* SWBuf Constructor - Creates an SWBuf initialized
+	/******************************************************************************
+	* SWBuf Constructor - Creates an empty SWBuf object or an SWBuf initialized
 	* 		to a value from a const char *
- 	*
- 	*/
-	SWBuf(const char *initVal, unsigned long initSize = 0);
-//	SWBuf(unsigned long initSize);
+	*
+	*/
+	inline SWBuf(const char *initVal, unsigned long initSize = 0) {
+		init(initSize);
+		if (initVal)	// a null initVal skips set(), leaving the buffer exactly as init() set it up
+			set(initVal);
+	}
 
-	/**
+	/******************************************************************************
 	* SWBuf Constructor - Creates an SWBuf initialized
-	* 		to a value from a char
+	* 		to a value from another SWBuf
 	*
 	*/
-	SWBuf(char initVal, unsigned long initSize = 0);
+	inline SWBuf(const SWBuf &other, unsigned long initSize = 0) {
+		init(initSize);
+		set(other);	// set() is defined elsewhere; presumably copies other's contents into this buffer
+	}
 
-	/**
+	/******************************************************************************
 	* SWBuf Constructor - Creates an SWBuf initialized
-	* 		to a value from another SWBuf
+	* 		to a value from a char
 	*
 	*/
-	SWBuf(const SWBuf &other, unsigned long initSize = 0);
+	inline SWBuf(char initVal, unsigned long initSize = 0) {
+		init(initSize+1);	// one more than requested; presumably room for initVal -- init() is not shown here
+		*buf = initVal;
+		end = buf+1;	// content is exactly one character long
+		*end = 0;	// keep the buffer NUL-terminated
+	}
+//	SWBuf(unsigned long initSize);
 
+
 	/******************************************************************************
 	* SWBuf Destructor - Cleans up instance of SWBuf
 	*/
@@ -220,7 +233,13 @@
 	* SWBuf::setSize - Size this buffer to a specific length.
 	* @param len The new size of the buffer. One byte for the null will be added.
 	*/
-	void setSize(unsigned long len);
+	inline void setSize(unsigned long len) {
+		assureSize(len+1);	// defined elsewhere; presumably guarantees capacity for len bytes plus the NUL
+		if ((unsigned)(end - buf) < len)	// growing: pad the new tail with fillByte. NOTE(review): cast narrows to unsigned int while len is unsigned long
+			memset(end, fillByte, len - (end-buf));
+		end = buf + len;	// when shrinking this simply moves the end marker back
+		*end = 0;
+	}
 	/**
 	* SWBuf::resize - Resize this buffer to a specific length.
 	* @param len The new size of the buffer. One byte for the null will be added.
@@ -233,7 +252,17 @@
 	* @param str Append this.
 	* @param max Append only max chars.
 	*/
-	SWBuf &append(const char *str, long max = -1);
+	inline SWBuf &append(const char *str, long max = -1) {
+	//	if (!str) //A null string was passed
+	//		return;
+		if (max < 0)	// negative max means "whole string"; str must be non-null because the guard above is disabled
+			max = strlen(str);
+		assureMore(max+1);
+		for (;((max)&&(*str));max--)	// copies at most max chars and stops early at str's NUL
+			*end++ = *str++;
+		*end = 0;
+		return *this;
+	}
 
 	/**
 	* SWBuf::append - appends a value to the current value of this SWBuf

Modified: trunk/include/utilstr.h
===================================================================
--- trunk/include/utilstr.h	2017-09-11 11:38:52 UTC (rev 3493)
+++ trunk/include/utilstr.h	2017-09-11 11:39:50 UTC (rev 3494)
@@ -29,9 +29,29 @@
 
 SWORD_NAMESPACE_START
 
-/** stdstr - clone a string
-*/
-SWDLLEXPORT char *stdstr (char **iistr, const char *istr, unsigned int memPadFactor = 1);
+
+/******************************************************************************
+ * stdstr - clones a string
+ *
+ * ENT:	ipstr	- address of the string pointer to replace; its old buffer is freed
+ *	istr	- string to copy into *ipstr (may alias or point into *ipstr)
+ *			0 - free *ipstr and set it to 0
+ *	memPadFactor - buffer holds (strlen(istr)+1) * memPadFactor chars; 0 is treated as 1
+ * RET:	*ipstr
+ */
+
+inline char *stdstr(char **ipstr, const char *istr, unsigned int memPadFactor = 1) {
+	char *fresh = 0;	// the copy is built BEFORE the old buffer is freed, so istr may alias *ipstr
+	if (istr) {
+		size_t len = strlen(istr) + 1;	// include the terminating NUL; size_t avoids truncating long strings
+		fresh = new char [ len * (memPadFactor ? memPadFactor : 1) ];	// a 0 factor would under-allocate
+		memcpy(fresh, istr, len);
+	}
+	delete [] *ipstr;	// deleting a null pointer is a no-op, so no test is needed
+	*ipstr = fresh;
+	return *ipstr;
+}
+
 SWDLLEXPORT char *strstrip (char *istr);
 SWDLLEXPORT const char *stristr (const char *s1, const char *s2);
 SWDLLEXPORT int strnicmp(const char *s1, const char *s2, int len);
@@ -55,9 +75,82 @@
  * 		unicode codepoint value (0 with buf incremented is invalid UTF8 byte
  */
 
-__u32 getUniCharFromUTF8(const unsigned char **buf);
 
+/******************************************************************************
+ * getUniCharFromUTF8 - retrieves the next Unicode codepoint from a UTF8 string
+ * 					and increments buf to start of next codepoint
+ *
+ * ENT:	buf - address of a utf8 buffer
+ *	skipValidation - true skips the final length / range / overlong checks
+ * RET:	buf - incremented past last byte used in computing the current codepoint
+ * 		unicode codepoint value (0 with buf incremented is invalid UTF8 byte; 0 with buf unchanged is end of string)
+ */
 
+inline __u32 getUniCharFromUTF8(const unsigned char **buf, bool skipValidation = false) {
+	__u32 ch = 0;
+
+	//case: We're at the end (buf is left untouched)
+	if (!(**buf)) {
+		return ch;
+	}
+
+	//case: single byte with the high bit clear (7-bit ASCII)
+	if (!(**buf & 128)) {
+		ch = **buf;
+		(*buf)++;
+		return ch;
+	}
+
+	//case: Invalid UTF-8 (illegal continuing byte in initial position)
+	if ((**buf >> 6) == 2) {
+		(*buf)++;
+		return ch;
+	}
+
+
+	//case: 2+ byte codepoint; count the continuation bytes announced by the lead byte
+	int subsequent = 1;
+	if ((**buf & 32) == 0) { subsequent = 1; }
+	else if ((**buf & 16) == 0) { subsequent = 2; }
+	else if ((**buf &  8) == 0) { subsequent = 3; }
+	else if ((**buf &  4) == 0) { subsequent = 4; }
+	else if ((**buf &  2) == 0) { subsequent = 5; }
+	else if ((**buf &  1) == 0) { subsequent = 6; }
+	else subsequent = 7; // only reached for lead byte 0xFF; zeroed below unless skipValidation
+
+	ch = **buf & (0xFF>>(subsequent + 1));	// keep only the payload bits of the lead byte
+
+	for (int i = 1; i <= subsequent; ++i) {
+		// subsequent byte did not begin with 10XXXXXX
+		// move our buffer to here and error out
+		// this also catches our null if we hit the string terminator
+		if (((*buf)[i] >> 6) != 2) {
+			*buf += i;
+			return 0;
+		}
+		ch <<= 6;
+		ch |= (*buf)[i] & 63;
+	}
+	*buf += (subsequent+1);
+
+	if (!skipValidation) {
+		// the spec allows no more than 4 bytes per sequence (lead + 3 continuation bytes)
+		if (subsequent > 3) ch = 0;
+		// the spec also limits UTF-8 to 21 bits of payload
+		if (ch > 0x1FFFFF) ch = 0;
+		// This would be out of Unicode bounds
+		if (ch > 0x10FFFF) ch = 0;	// NOTE(review): surrogates U+D800..U+DFFF are not rejected here
+		// these would be values which could be represented in less bytes
+		if (ch < 0x80 && subsequent > 0) ch = 0;
+		if (ch < 0x800 && subsequent > 1) ch = 0;
+		if (ch < 0x10000 && subsequent > 2) ch = 0;
+		if (ch < 0x200000 && subsequent > 3) ch = 0;	// no effect: subsequent > 3 already zeroed ch above
+	}
+
+	return ch;
+}
+
+
 /******************************************************************************
  * getUTF8FromUniChar - retrieves us UTF8 string from a
  * 					Unicode codepoint
@@ -66,11 +159,95 @@
  *
  * RET:	buf - a UTF8 string which consists of the proper UTF8 sequence of
  * 				bytes for the given Unicode codepoint
+ * NOTE: for speed and thread safety, this method now requires a buffer
+ * 		to work with
  */
 
-SWBuf getUTF8FromUniChar(__u32 uchar);
+inline SWBuf *getUTF8FromUniChar(__u32 uchar, SWBuf *appendTo) {
+	unsigned long base = appendTo->size();	// index where the new sequence starts; appendTo is dereferenced unchecked
 
+	// This would be out of Unicode bounds; U+FFFD is encoded in its place
+	if (uchar > 0x10FFFF) uchar = 0xFFFD;
+	char bytes = uchar < 0x80 ? 1 : uchar < 0x800 ? 2 : uchar < 0x10000 ? 3 : 4;	// encoded length
+	appendTo->setSize(base+bytes);	// grow first, then fill the new bytes in place from last to first
+	switch (bytes) {
+	case 1:
+		(*appendTo)[base  ] = (unsigned char)uchar;
+		break;
+	case 2:
+		(*appendTo)[base+1] = (unsigned char)(0x80 | (uchar & 0x3f));
+		uchar >>= 6;
+		(*appendTo)[base  ] = (unsigned char)(0xc0 | (uchar & 0x1f));
+		break;
+	case 3:
+		(*appendTo)[base+2] = (unsigned char)(0x80 | (uchar & 0x3f));
+		uchar >>= 6;
+		(*appendTo)[base+1] = (unsigned char)(0x80 | (uchar & 0x3f));
+		uchar >>= 6;
+		(*appendTo)[base  ] = (unsigned char)(0xe0 | (uchar & 0x0f));
+		break;
+	case 4:
+		(*appendTo)[base+3] = (unsigned char)(0x80 | (uchar & 0x3f));
+		uchar >>= 6;
+		(*appendTo)[base+2] = (unsigned char)(0x80 | (uchar & 0x3f));
+		uchar >>= 6;
+		(*appendTo)[base+1] = (unsigned char)(0x80 | (uchar & 0x3f));
+		uchar >>= 6;
+		(*appendTo)[base  ] = (unsigned char)(0xf0 | (uchar & 0x07));
+		break;
+	}
+/* disabled: 5- and 6-byte forms (values above 0x10FFFF are replaced by U+FFFD before encoding)
+	else if (uchar < 0x4000000) {
+		appendTo->setSize(base+5);
+		i = uchar & 0x3f;
+		(*appendTo)[base+4] = (unsigned char)(0x80 | i);
+		uchar >>= 6;
 
+		i = uchar & 0x3f;
+		(*appendTo)[base+3] = (unsigned char)(0x80 | i);
+		uchar >>= 6;
+
+		i = uchar & 0x3f;
+		(*appendTo)[base+2] = (unsigned char)(0x80 | i);
+		uchar >>= 6;
+
+		i = uchar & 0x3f;
+		(*appendTo)[base+1] = (unsigned char)(0x80 | i);
+		uchar >>= 6;
+
+		i = uchar & 0x03;
+		(*appendTo)[base] = (unsigned char)(0xf8 | i);
+	}
+	else if (uchar < 0x80000000) {
+		appendTo->setSize(base+6);
+		i = uchar & 0x3f;
+		(*appendTo)[base+5] = (unsigned char)(0x80 | i);
+		uchar >>= 6;
+
+		i = uchar & 0x3f;
+		(*appendTo)[base+4] = (unsigned char)(0x80 | i);
+		uchar >>= 6;
+
+		i = uchar & 0x3f;
+		(*appendTo)[base+3] = (unsigned char)(0x80 | i);
+		uchar >>= 6;
+
+		i = uchar & 0x3f;
+		(*appendTo)[base+2] = (unsigned char)(0x80 | i);
+		uchar >>= 6;
+
+		i = uchar & 0x3f;
+		(*appendTo)[base+1] = (unsigned char)(0x80 | i);
+		uchar >>= 6;
+
+		i = uchar & 0x01;
+		(*appendTo)[base] = (unsigned char)(0xfc | i);
+	}
+*/
+	return appendTo;
+}
+
+
 /******************************************************************************
  * assureValidUTF8 - iterates the supplied UTF-8 buffer and checks for validity
  * 					replacing invalid bytes if necessary and returning a

Modified: trunk/src/mgr/filemgr.cpp
===================================================================
--- trunk/src/mgr/filemgr.cpp	2017-09-11 11:38:52 UTC (rev 3493)
+++ trunk/src/mgr/filemgr.cpp	2017-09-11 11:39:50 UTC (rev 3494)
@@ -131,15 +131,6 @@
 }
 
 
-int FileDesc::getFd() {
-	if (fd == -77)
-		fd = parent->sysOpen(this);
-//	if ((fd < -1) && (fd != -77))  // kludge to hand ce
-//		return 777;
-	return fd;
-}
-
-
 long FileDesc::seek(long offset, int whence) {
 	return lseek(getFd(), offset, whence);
 }

Modified: trunk/src/modules/filters/rtfhtml.cpp
===================================================================
--- trunk/src/modules/filters/rtfhtml.cpp	2017-09-11 11:38:52 UTC (rev 3493)
+++ trunk/src/modules/filters/rtfhtml.cpp	2017-09-11 11:39:50 UTC (rev 3494)
@@ -55,7 +55,7 @@
 				num.append(from, end-from);
 				__s16 n = atoi(num.c_str());
 				__u32 u = (__u16)n;
-				text.append(getUTF8FromUniChar(u));
+				getUTF8FromUniChar(u, &text);
 				from += (end-from);
 				continue;
 			}

Modified: trunk/src/utilfuns/swbuf.cpp
===================================================================
--- trunk/src/utilfuns/swbuf.cpp	2017-09-11 11:38:52 UTC (rev 3493)
+++ trunk/src/utilfuns/swbuf.cpp	2017-09-11 11:39:50 UTC (rev 3494)
@@ -32,39 +32,7 @@
 
 char *SWBuf::nullStr = (char *)"";
 
-/******************************************************************************
-* SWBuf Constructor - Creates an empty SWBuf object or an SWBuf initialized
-* 		to a value from a const char *
-*
-*/
-SWBuf::SWBuf(const char *initVal, unsigned long initSize) {
-	init(initSize);
-	if (initVal)
-		set(initVal);
-}
 
-/******************************************************************************
-* SWBuf Constructor - Creates an SWBuf initialized
-* 		to a value from another SWBuf
-*
-*/
-SWBuf::SWBuf(const SWBuf &other, unsigned long initSize) {
-	init(initSize);
-	set(other);
-}
-
-/******************************************************************************
-* SWBuf Constructor - Creates an SWBuf initialized
-* 		to a value from a char
-*
-*/
-SWBuf::SWBuf(char initVal, unsigned long initSize) {
-	init(initSize+1);
-	*buf = initVal;
-	end = buf+1;
-	*end = 0;
-}
-
 /*
 SWBuf::SWBuf(unsigned long initSize) {
 	init(initSize);
@@ -97,33 +65,6 @@
 }
 
 /******************************************************************************
-* SWBuf::append - appends a value to the current value of this SWBuf
-* 
-*/
-SWBuf &SWBuf::append(const char *str, long max) {
-//	if (!str) //A null string was passed
-//		return;
-	if (max < 0)
-		max = strlen(str);
-	assureMore(max+1);
-	for (;((max)&&(*str));max--)
-		*end++ = *str++;
-	*end = 0;
-	return *this;
-}
-
-/******************************************************************************
-* SWBuf::setSize - Size this buffer to a specific length
-*/
-void SWBuf::setSize(unsigned long len) {
-	assureSize(len+1);
-	if ((unsigned)(end - buf) < len)
-		memset(end, fillByte, len - (end-buf));
-	end = buf + len;
-	*end = 0;
-}
-
-/******************************************************************************
 * SWBuf::appendFormatted - appends formatted strings to the current value of this SWBuf
 * WARNING: This function can only write at most
 * JUNKBUFSIZE to the string per call.

Modified: trunk/src/utilfuns/utilstr.cpp
===================================================================
--- trunk/src/utilfuns/utilstr.cpp	2017-09-11 11:38:52 UTC (rev 3493)
+++ trunk/src/utilfuns/utilstr.cpp	2017-09-11 11:39:50 UTC (rev 3494)
@@ -67,29 +67,7 @@
 };
 
 
-/******************************************************************************
- * stdstr - clones a string
- *
- * ENT:	ipstr	- pointer to a string pointer to set if necessary
- *	istr	- string to set to *ipstr
- *			0 - only get
- *
- * RET:	*ipstr
- */
 
-char *stdstr(char **ipstr, const char *istr, unsigned int memPadFactor) {
-	if (*ipstr)
-		delete [] *ipstr;
-	if (istr) {
-		int len = (int)strlen(istr) + 1;
-		*ipstr = new char [ len * memPadFactor ];
-		memcpy(*ipstr, istr, len);
-	}
-	else *ipstr = 0;
-	return *ipstr;
-}
-
-
 /******************************************************************************
  * strstrip - Removes leading and trailing spaces from a string
  *
@@ -187,163 +165,6 @@
 }
 
 
-/******************************************************************************
- * getUniCharFromUTF8 - retrieves the next Unicode codepoint from a UTF8 string
- * 					and increments buf to start of next codepoint
- *
- * ENT:	buf - address of a utf8 buffer
- *
- * RET:	buf - incremented past last byte used in computing the current codepoint
- * 		unicode codepoint value (0 with buf incremented is invalid UTF8 byte
- */
-
-__u32 getUniCharFromUTF8(const unsigned char **buf) {
-	__u32 ch = 0;
-	unsigned char multibuf[7];
-
-	//case: We're at the end
-	if (!(**buf)) {
-		return ch;
-	}
-
-	//case: ANSI
-	if (!(**buf & 128)) {
-		ch = **buf;
-		(*buf)++;
-		return ch;
-	}
-
-	//case: Invalid UTF-8 (illegal continuing byte in initial position)
-	if ((**buf & 128) && (!(**buf & 64))) {
-		(*buf)++;
-		return ch;
-	}
-
-	//case: 2+ byte codepoint
-	multibuf[0] = **buf;
-	multibuf[0] <<= 1;
-	int subsequent;
-	for (subsequent = 1; (multibuf[0] & 128) && (subsequent < 7); subsequent++) {
-		multibuf[0] <<= 1;
-		multibuf[subsequent] = (*buf)[subsequent];
-		multibuf[subsequent] &= 63;
-		// subsequent byte did not begin with 10XXXXXX
-		// move our buffer to here and error out
-		if (((*buf)[subsequent] - multibuf[subsequent]) != 128) {
-			*buf += subsequent;
-			return 0;
-		}
-		ch <<= 6;
-		ch |= multibuf[subsequent];
-	}
-	subsequent--;
-	multibuf[0] <<= 1;
-	char significantFirstBits = 8 - (2+subsequent);
-	
-	ch |= (((__s16)multibuf[0]) << (((6*subsequent)+significantFirstBits)-8));
-	*buf += (subsequent+1);
-	return ch;
-}
-
-
-SWBuf getUTF8FromUniChar(__u32 uchar) {
-	SWBuf retVal("", 7);
-	unsigned int i;
-
-	if (uchar < 0x80) {
-		retVal.append((unsigned char)uchar);
-		retVal.setSize(1);
-	}
-	else if (uchar < 0x800) {
-		retVal.setSize(2);
-		i = uchar & 0x3f;
-		retVal[1] = (unsigned char)(0x80 | i);
-		uchar >>= 6;
-
-		i = uchar & 0x1f;
-		retVal[0] = (unsigned char)(0xc0 | i);
-	}
-	else if (uchar < 0x10000) {
-		retVal.setSize(3);
-		i = uchar & 0x3f;
-		retVal[2] = (unsigned char)(0x80 | i);
-		uchar >>= 6;
-
-		i = uchar & 0x3f;
-		retVal[1] = (unsigned char)(0x80 | i);
-		uchar >>= 6;
-
-		i = uchar & 0x0f;
-		retVal[0] = (unsigned char)(0xe0 | i);
-	}
-	else if (uchar < 0x200000) {
-		retVal.setSize(4);
-		i = uchar & 0x3f;
-		retVal[3] = (unsigned char)(0x80 | i);
-		uchar >>= 6;
-
-		i = uchar & 0x3f;
-		retVal[2] = (unsigned char)(0x80 | i);
-		uchar >>= 6;
-
-		i = uchar & 0x3f;
-		retVal[1] = (unsigned char)(0x80 | i);
-		uchar >>= 6;
-
-		i = uchar & 0x07;
-		retVal[0] = (unsigned char)(0xf0 | i);
-	}
-	else if (uchar < 0x4000000) {
-		retVal.setSize(5);
-		i = uchar & 0x3f;
-		retVal[4] = (unsigned char)(0x80 | i);
-		uchar >>= 6;
-
-		i = uchar & 0x3f;
-		retVal[3] = (unsigned char)(0x80 | i);
-		uchar >>= 6;
-
-		i = uchar & 0x3f;
-		retVal[2] = (unsigned char)(0x80 | i);
-		uchar >>= 6;
-
-		i = uchar & 0x3f;
-		retVal[1] = (unsigned char)(0x80 | i);
-		uchar >>= 6;
-
-		i = uchar & 0x03;
-		retVal[0] = (unsigned char)(0xf8 | i);
-	}
-	else if (uchar < 0x80000000) {
-		retVal.setSize(6);
-		i = uchar & 0x3f;
-		retVal[5] = (unsigned char)(0x80 | i);
-		uchar >>= 6;
-
-		i = uchar & 0x3f;
-		retVal[4] = (unsigned char)(0x80 | i);
-		uchar >>= 6;
-
-		i = uchar & 0x3f;
-		retVal[3] = (unsigned char)(0x80 | i);
-		uchar >>= 6;
-
-		i = uchar & 0x3f;
-		retVal[2] = (unsigned char)(0x80 | i);
-		uchar >>= 6;
-
-		i = uchar & 0x3f;
-		retVal[1] = (unsigned char)(0x80 | i);
-		uchar >>= 6;
-
-		i = uchar & 0x01;
-		retVal[0] = (unsigned char)(0xfc | i);
-	}
-
-	return retVal;
-}
-
-
 SWBuf assureValidUTF8(const char *buf) {
 
 	SWBuf myCopy = buf;
@@ -404,7 +225,7 @@
 
 	SWBuf utf8Buf;
 	while (*buf) {
-		utf8Buf.append(getUTF8FromUniChar(*buf++));
+		getUTF8FromUniChar(*buf++, &utf8Buf);
 	}
 	return utf8Buf;
 }

Modified: trunk/tests/utf8norm.cpp
===================================================================
--- trunk/tests/utf8norm.cpp	2017-09-11 11:38:52 UTC (rev 3493)
+++ trunk/tests/utf8norm.cpp	2017-09-11 11:39:50 UTC (rev 3494)
@@ -23,16 +23,62 @@
 #include <iostream>
 #include <utilstr.h>
 #include <swbuf.h>
+#if !defined(__GNUC__) && !defined(_WIN32_WCE)
+#include <io.h>
+#include <direct.h>
+#else
+#include <unistd.h>
+#endif
+#include <utf8greekaccents.h>
 
 using namespace sword;
 using namespace std;
 
 int main(int argc, char **argv) {
-	const char *buf = (argc > 1) ? argv[1] : "Description=German Unrevidierte Luther Übersetzung von 1545";
+	const char *buf = (argc > 1 && argv[1][0] != '-') ? argv[1] : 0; // "Description=German Unrevidierte Luther Übersetzung von 1545";
 
-	SWBuf fixed = assureValidUTF8(buf);
+	if (buf) {
+		SWBuf fixed = assureValidUTF8(buf);
 
-	cout << "input / processed:\n" << buf << "\n" << fixed << endl;
+		cout << "input / processed:\n" << buf << "\n" << fixed << endl;
+	}
+	else {
+		SWOptionFilter *filter = 0;	// stays 0 unless the first argument is exactly "-ga"
+		if (argc > 1 && !strcmp(argv[1], "-ga")) filter = new UTF8GreekAccents();
+		if (filter && filter->isBoolean()) filter->setOptionValue("Off");
+		int repeat = 1;	// number of filter passes; overridden by argv[2]
+		if (argc > 2) repeat = atoi(argv[2]);
+		SWBuf contents = "";
+		char chunk[255];
+		int count = 254;
+		while (count > 0) {	// read stdin until read() returns 0 or a negative value
+			count = read(STDIN_FILENO, chunk, 254);
+			if (count > 0) {
+				chunk[count] = 0;	// NUL-terminate; append() treats the chunk as a C string, so a NUL byte in the input cuts the chunk short
+				contents.append(chunk);
+			}
+		}
+		SWBuf filteredContents = contents;
+		if (filter) {
+			for (int i = 0; i < repeat; ++i) {
+				filteredContents = contents;	// every pass starts again from the unfiltered input
+				filter->processText(filteredContents);
+			}
+		}
+		const unsigned char *c = (const unsigned char *)filteredContents.getRawData();
+		// UTF-32 BOM
+		__u32 ch = 0xfeff;
+//		write(STDOUT_FILENO, &ch, 4);
+		while (c && *c) {
+			ch = getUniCharFromUTF8(&c);
+//			ch = __swswap32(ch);
+			if (!ch) ch = 0xFFFD;	// a 0 result marks an invalid sequence; emit U+FFFD in its place
+			SWBuf c8;
+		        getUTF8FromUniChar(ch, &c8);
+			write(STDOUT_FILENO, c8.getRawData(), c8.length());	// return value of write() is not checked
+		}
+		delete filter;
+	}
 
 	return 0;
 }




More information about the sword-cvs mailing list