[sword-svn] r3496 - in trunk: src/modules/filters tests/testsuite

Mon Sep 11 04:41:08 MST 2017

Author: scribe
Date: 2017-09-11 04:41:08 -0700 (Mon, 11 Sep 2017)
New Revision: 3496

Added:
   trunk/tests/testsuite/README
   trunk/tests/testsuite/UTF-8-test.txt
   trunk/tests/testsuite/greekaccents.good
   trunk/tests/testsuite/greekaccents.sh
   trunk/tests/testsuite/greekaccents.txt
   trunk/tests/testsuite/utf8basic.good
   trunk/tests/testsuite/utf8basic.sh
Modified:
   trunk/src/modules/filters/utf8greekaccents.cpp
Log:
updated Greek Accents filter to be more sane and safe.  Added tests for UTF8 and Greek accents

Modified: trunk/src/modules/filters/utf8greekaccents.cpp
===================================================================

--- trunk/src/modules/filters/utf8greekaccents.cpp	2017-09-11 11:40:19 UTC (rev 3495)
+++ trunk/src/modules/filters/utf8greekaccents.cpp	2017-09-11 11:41:08 UTC (rev 3496)
@@ -22,8 +22,10 @@
  */
 
 #include <stdlib.h>
+#include <map>
 #include <stdio.h>
 #include <utf8greekaccents.h>
+#include <utilstr.h>
 
 
 #ifdef _ICU_
@@ -31,6 +33,7 @@
 sword::UTF8NFKD decompose;
 #endif
 
+using std::map;
 
 SWORD_NAMESPACE_START
 
@@ -44,6 +47,297 @@
 		static const StringList oVals(&choices[0], &choices[2]);
 		return &oVals;
 	}
+
+	std::map<__u32, SWBuf> converters;
+	class converters_init {
+	public:
+		converters_init() {
+			SWBuf myBuf = "";
+			// first just remove combining marks and stand-alone accent/punctuation characters
+			converters[0x2019] = "";	// RIGHT SINGLE QUOTATION MARK
+			converters[0x1FBF] = "";	// GREEK PSILI
+			converters[0x2CFF] = "";	// COPTIC MORPHOLOGICAL DIVIDER
+			converters[0xFE24] = "";	// COMBINING MACRON LEFT HALF
+			converters[0xFE25] = "";	// COMBINING MACRON RIGHT HALF
+			converters[0xFE26] = "";	// COMBINING CONJOINING MACRON
+			converters[0x0300] = "";	// COMBINING GRAVE ACCENT
+			converters[0x0301] = "";	// COMBINING ACUTE ACCENT
+			converters[0x0302] = "";	// COMBINING CIRCUMFLEX ACCENT
+			converters[0x0308] = "";	// COMBINING DIAERESIS
+			converters[0x0313] = "";	// COMBINING COMMA ABOVE
+			converters[0x0314] = "";	// COMBINING REVERSED COMMA ABOVE
+			converters[0x037A] = "";	// GREEK YPOGEGRAMMENI
+			converters[0x0342] = "";	// COMBINING GREEK PERISPOMENI
+			// Now convert pre-composed characters to their alphabetic bases, discarding the accents
+			// Greek
+			// UPPER case
+			converters[0x0386] = *getUTF8FromUniChar(0x0391, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER ALPHA WITH TONOS
+			converters[0x0388] = *getUTF8FromUniChar(0x0395, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER EPSILON WITH TONOS
+			converters[0x0389] = *getUTF8FromUniChar(0x0397, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER ETA WITH TONOS
+			converters[0x038A] = *getUTF8FromUniChar(0x0399, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER IOTA WITH TONOS
+			converters[0x03AA] = *getUTF8FromUniChar(0x0399, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER IOTA WITH DIALYTIKA
+			converters[0x038C] = *getUTF8FromUniChar(0x039F, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER OMICRON WITH TONOS
+			converters[0x038E] = *getUTF8FromUniChar(0x03A5, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER UPSILON WITH TONOS
+			converters[0x03AB] = *getUTF8FromUniChar(0x03A5, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER UPSILON WITH DIALYTIKA
+			converters[0x038F] = *getUTF8FromUniChar(0x03A9, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER OMEGA WITH TONOS
+
+			// lower case
+			converters[0x03AC] = *getUTF8FromUniChar(0x03B1, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER ALPHA WITH TONOS
+			converters[0x03AD] = *getUTF8FromUniChar(0x03B5, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER EPSILON WITH TONOS
+			converters[0x03AE] = *getUTF8FromUniChar(0x03B7, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER ETA WITH TONOS
+			converters[0x03AF] = *getUTF8FromUniChar(0x03B9, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER IOTA WITH TONOS
+			converters[0x03CA] = *getUTF8FromUniChar(0x03B9, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER IOTA WITH DIALYTIKA
+			converters[0x03CC] = *getUTF8FromUniChar(0x03BF, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER OMICRON WITH TONOS
+			converters[0x03CD] = *getUTF8FromUniChar(0x03C5, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER UPSILON WITH TONOS
+			converters[0x03CB] = *getUTF8FromUniChar(0x03C5, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER UPSILON WITH DIALYTIKA
+			converters[0x03CE] = *getUTF8FromUniChar(0x03C9, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER OMEGA WITH TONOS
+
+			// Extended Greek
+			// UPPER case
+			converters[0x1F08] = *getUTF8FromUniChar(0x0391, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER ALPHA WITH PSILI
+			converters[0x1F09] = *getUTF8FromUniChar(0x0391, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER ALPHA WITH DASIA
+			converters[0x1F0A] = *getUTF8FromUniChar(0x0391, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER ALPHA WITH PSILI AND VARIA
+			converters[0x1F0B] = *getUTF8FromUniChar(0x0391, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER ALPHA WITH DASIA AND VARIA
+			converters[0x1F0C] = *getUTF8FromUniChar(0x0391, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER ALPHA WITH PSILI AND OXIA
+			converters[0x1F0D] = *getUTF8FromUniChar(0x0391, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER ALPHA WITH DASIA AND OXIA
+			converters[0x1F0E] = *getUTF8FromUniChar(0x0391, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER ALPHA WITH PSILI AND PERISPOMENI
+			converters[0x1F0F] = *getUTF8FromUniChar(0x0391, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI
+			converters[0x1F88] = *getUTF8FromUniChar(0x0391, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER ALPHA WITH PSILI AND PROSGEGRAMMENI
+			converters[0x1F89] = *getUTF8FromUniChar(0x0391, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER ALPHA WITH DASIA AND PROSGEGRAMMENI
+			converters[0x1F8A] = *getUTF8FromUniChar(0x0391, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER ALPHA WITH PSILI AND VARIA AND PROSGEGRAMMENI
+			converters[0x1F8B] = *getUTF8FromUniChar(0x0391, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER ALPHA WITH DASIA AND VARIA AND PROSGEGRAMMENI
+			converters[0x1F8C] = *getUTF8FromUniChar(0x0391, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER ALPHA WITH PSILI AND OXIA AND PROSGEGRAMMENI
+			converters[0x1F8D] = *getUTF8FromUniChar(0x0391, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER ALPHA WITH DASIA AND OXIA AND PROSGEGRAMMENI
+			converters[0x1F8E] = *getUTF8FromUniChar(0x0391, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER ALPHA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI
+			converters[0x1F8F] = *getUTF8FromUniChar(0x0391, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI
+			converters[0x1FB8] = *getUTF8FromUniChar(0x0391, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER ALPHA WITH VRACHY
+			converters[0x1FB9] = *getUTF8FromUniChar(0x0391, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER ALPHA WITH MACRON
+			converters[0x1FBA] = *getUTF8FromUniChar(0x0391, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER ALPHA WITH VARIA
+			converters[0x1FBB] = *getUTF8FromUniChar(0x0391, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER ALPHA WITH OXIA
+			converters[0x1FBC] = *getUTF8FromUniChar(0x0391, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI
+			
+			converters[0x1F18] = *getUTF8FromUniChar(0x0395, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER EPSILON WITH PSILI
+			converters[0x1F19] = *getUTF8FromUniChar(0x0395, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER EPSILON WITH DASIA
+			converters[0x1F1A] = *getUTF8FromUniChar(0x0395, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER EPSILON WITH PSILI AND VARIA
+			converters[0x1F1B] = *getUTF8FromUniChar(0x0395, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER EPSILON WITH DASIA AND VARIA
+			converters[0x1F1C] = *getUTF8FromUniChar(0x0395, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER EPSILON WITH PSILI AND OXIA
+			converters[0x1F1D] = *getUTF8FromUniChar(0x0395, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER EPSILON WITH DASIA AND OXIA
+			converters[0x1FC8] = *getUTF8FromUniChar(0x0395, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER EPSILON WITH VARIA
+			converters[0x1FC9] = *getUTF8FromUniChar(0x0395, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER EPSILON WITH OXIA
+
+			converters[0x1F28] = *getUTF8FromUniChar(0x0397, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER ETA WITH PSILI
+			converters[0x1F29] = *getUTF8FromUniChar(0x0397, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER ETA WITH DASIA
+			converters[0x1F2A] = *getUTF8FromUniChar(0x0397, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER ETA WITH PSILI AND VARIA
+			converters[0x1F2B] = *getUTF8FromUniChar(0x0397, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER ETA WITH DASIA AND VARIA
+			converters[0x1F2C] = *getUTF8FromUniChar(0x0397, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER ETA WITH PSILI AND OXIA
+			converters[0x1F2D] = *getUTF8FromUniChar(0x0397, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER ETA WITH DASIA AND OXIA
+			converters[0x1F2E] = *getUTF8FromUniChar(0x0397, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER ETA WITH PSILI AND PERISPOMENI
+			converters[0x1F2F] = *getUTF8FromUniChar(0x0397, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER ETA WITH DASIA AND PERISPOMENI
+			converters[0x1F98] = *getUTF8FromUniChar(0x0397, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER ETA WITH PSILI AND PROSGEGRAMMENI
+			converters[0x1F99] = *getUTF8FromUniChar(0x0397, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER ETA WITH DASIA AND PROSGEGRAMMENI
+			converters[0x1F9A] = *getUTF8FromUniChar(0x0397, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER ETA WITH PSILI AND VARIA AND PROSGEGRAMMENI
+			converters[0x1F9B] = *getUTF8FromUniChar(0x0397, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER ETA WITH DASIA AND VARIA AND PROSGEGRAMMENI
+			converters[0x1F9C] = *getUTF8FromUniChar(0x0397, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER ETA WITH PSILI AND OXIA AND PROSGEGRAMMENI
+			converters[0x1F9D] = *getUTF8FromUniChar(0x0397, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER ETA WITH DASIA AND OXIA AND PROSGEGRAMMENI
+			converters[0x1F9E] = *getUTF8FromUniChar(0x0397, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER ETA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI
+			converters[0x1F9F] = *getUTF8FromUniChar(0x0397, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER ETA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI
+			converters[0x1FCA] = *getUTF8FromUniChar(0x0397, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER ETA WITH VARIA
+			converters[0x1FCB] = *getUTF8FromUniChar(0x0397, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER ETA WITH OXIA
+			converters[0x1FCC] = *getUTF8FromUniChar(0x0397, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI
+
+			converters[0x1F38] = *getUTF8FromUniChar(0x0399, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER IOTA WITH PSILI
+			converters[0x1F39] = *getUTF8FromUniChar(0x0399, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER IOTA WITH DASIA
+			converters[0x1F3A] = *getUTF8FromUniChar(0x0399, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER IOTA WITH PSILI AND VARIA
+			converters[0x1F3B] = *getUTF8FromUniChar(0x0399, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER IOTA WITH DASIA AND VARIA
+			converters[0x1F3C] = *getUTF8FromUniChar(0x0399, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER IOTA WITH PSILI AND OXIA
+			converters[0x1F3D] = *getUTF8FromUniChar(0x0399, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER IOTA WITH DASIA AND OXIA
+			converters[0x1F3E] = *getUTF8FromUniChar(0x0399, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER IOTA WITH PSILI AND PERISPOMENI
+			converters[0x1F3F] = *getUTF8FromUniChar(0x0399, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER IOTA WITH DASIA AND PERISPOMENI
+			converters[0x1FD8] = *getUTF8FromUniChar(0x0399, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER IOTA WITH VRACHY
+			converters[0x1FD9] = *getUTF8FromUniChar(0x0399, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER IOTA WITH MACRON
+			converters[0x1FDA] = *getUTF8FromUniChar(0x0399, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER IOTA WITH VARIA
+			converters[0x1FDB] = *getUTF8FromUniChar(0x0399, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER IOTA WITH OXIA
+
+			converters[0x1F48] = *getUTF8FromUniChar(0x039F, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER OMICRON WITH PSILI
+			converters[0x1F49] = *getUTF8FromUniChar(0x039F, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER OMICRON WITH DASIA
+			converters[0x1F4A] = *getUTF8FromUniChar(0x039F, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER OMICRON WITH PSILI AND VARIA
+			converters[0x1F4B] = *getUTF8FromUniChar(0x039F, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER OMICRON WITH DASIA AND VARIA
+			converters[0x1F4C] = *getUTF8FromUniChar(0x039F, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER OMICRON WITH PSILI AND OXIA
+			converters[0x1F4D] = *getUTF8FromUniChar(0x039F, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER OMICRON WITH DASIA AND OXIA
+			converters[0x1FF8] = *getUTF8FromUniChar(0x039F, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER OMICRON WITH VARIA
+			converters[0x1FF9] = *getUTF8FromUniChar(0x039F, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER OMICRON WITH OXIA
+
+			converters[0x1F59] = *getUTF8FromUniChar(0x03A5, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER UPSILON WITH DASIA
+			converters[0x1F5A] = *getUTF8FromUniChar(0x03A5, &myBuf); myBuf.setSize(0);	// unused? (U+1F5A is unassigned in Unicode)
+			converters[0x1F5B] = *getUTF8FromUniChar(0x03A5, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER UPSILON WITH DASIA AND VARIA
+			converters[0x1F5C] = *getUTF8FromUniChar(0x03A5, &myBuf); myBuf.setSize(0);	// unused? (U+1F5C is unassigned in Unicode)
+			converters[0x1F5D] = *getUTF8FromUniChar(0x03A5, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER UPSILON WITH DASIA AND OXIA
+			converters[0x1F5E] = *getUTF8FromUniChar(0x03A5, &myBuf); myBuf.setSize(0);	// unused? (U+1F5E is unassigned in Unicode)
+			converters[0x1F5F] = *getUTF8FromUniChar(0x03A5, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER UPSILON WITH DASIA AND PERISPOMENI
+			converters[0x1FE8] = *getUTF8FromUniChar(0x03A5, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER UPSILON WITH VRACHY
+			converters[0x1FE9] = *getUTF8FromUniChar(0x03A5, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER UPSILON WITH MACRON
+			converters[0x1FEA] = *getUTF8FromUniChar(0x03A5, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER UPSILON WITH VARIA
+			converters[0x1FEB] = *getUTF8FromUniChar(0x03A5, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER UPSILON WITH OXIA
+
+			converters[0x1F68] = *getUTF8FromUniChar(0x03A9, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER OMEGA WITH PSILI
+			converters[0x1F69] = *getUTF8FromUniChar(0x03A9, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER OMEGA WITH DASIA
+			converters[0x1F6A] = *getUTF8FromUniChar(0x03A9, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER OMEGA WITH PSILI AND VARIA
+			converters[0x1F6B] = *getUTF8FromUniChar(0x03A9, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER OMEGA WITH DASIA AND VARIA
+			converters[0x1F6C] = *getUTF8FromUniChar(0x03A9, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER OMEGA WITH PSILI AND OXIA
+			converters[0x1F6D] = *getUTF8FromUniChar(0x03A9, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER OMEGA WITH DASIA AND OXIA
+			converters[0x1F6E] = *getUTF8FromUniChar(0x03A9, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER OMEGA WITH PSILI AND PERISPOMENI
+			converters[0x1F6F] = *getUTF8FromUniChar(0x03A9, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI
+			converters[0x1FA8] = *getUTF8FromUniChar(0x03A9, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER OMEGA WITH PSILI AND PROSGEGRAMMENI
+			converters[0x1FA9] = *getUTF8FromUniChar(0x03A9, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER OMEGA WITH DASIA AND PROSGEGRAMMENI
+			converters[0x1FAA] = *getUTF8FromUniChar(0x03A9, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER OMEGA WITH PSILI AND VARIA AND PROSGEGRAMMENI
+			converters[0x1FAB] = *getUTF8FromUniChar(0x03A9, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER OMEGA WITH DASIA AND VARIA AND PROSGEGRAMMENI
+			converters[0x1FAC] = *getUTF8FromUniChar(0x03A9, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER OMEGA WITH PSILI AND OXIA AND PROSGEGRAMMENI
+			converters[0x1FAD] = *getUTF8FromUniChar(0x03A9, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER OMEGA WITH DASIA AND OXIA AND PROSGEGRAMMENI
+			converters[0x1FAE] = *getUTF8FromUniChar(0x03A9, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER OMEGA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI
+			converters[0x1FAF] = *getUTF8FromUniChar(0x03A9, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI
+			converters[0x1FFA] = *getUTF8FromUniChar(0x03A9, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER OMEGA WITH VARIA
+			converters[0x1FFB] = *getUTF8FromUniChar(0x03A9, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER OMEGA WITH OXIA
+			converters[0x1FFC] = *getUTF8FromUniChar(0x03A9, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI
+
+			converters[0x1FEC] = *getUTF8FromUniChar(0x03A1, &myBuf); myBuf.setSize(0);	// GREEK CAPITAL LETTER RHO WITH DASIA
+
+			// lower case
+			//alpha
+			converters[0x1F00] = *getUTF8FromUniChar(0x03B1, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER ALPHA WITH PSILI
+			converters[0x1F01] = *getUTF8FromUniChar(0x03B1, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER ALPHA WITH DASIA
+			converters[0x1F02] = *getUTF8FromUniChar(0x03B1, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER ALPHA WITH PSILI AND VARIA
+			converters[0x1F03] = *getUTF8FromUniChar(0x03B1, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER ALPHA WITH DASIA AND VARIA
+			converters[0x1F04] = *getUTF8FromUniChar(0x03B1, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER ALPHA WITH PSILI AND OXIA
+			converters[0x1F05] = *getUTF8FromUniChar(0x03B1, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER ALPHA WITH DASIA AND OXIA
+			converters[0x1F06] = *getUTF8FromUniChar(0x03B1, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER ALPHA WITH PSILI AND PERISPOMENI
+			converters[0x1F07] = *getUTF8FromUniChar(0x03B1, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER ALPHA WITH DASIA AND PERISPOMENI
+			converters[0x1F80] = *getUTF8FromUniChar(0x03B1, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI
+			converters[0x1F81] = *getUTF8FromUniChar(0x03B1, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER ALPHA WITH DASIA AND YPOGEGRAMMENI
+			converters[0x1F82] = *getUTF8FromUniChar(0x03B1, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER ALPHA WITH PSILI AND VARIA AND YPOGEGRAMMENI
+			converters[0x1F83] = *getUTF8FromUniChar(0x03B1, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER ALPHA WITH DASIA AND VARIA AND YPOGEGRAMMENI
+			converters[0x1F84] = *getUTF8FromUniChar(0x03B1, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER ALPHA WITH PSILI AND OXIA AND YPOGEGRAMMENI
+			converters[0x1F85] = *getUTF8FromUniChar(0x03B1, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER ALPHA WITH DASIA AND OXIA AND YPOGEGRAMMENI
+			converters[0x1F86] = *getUTF8FromUniChar(0x03B1, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER ALPHA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI
+			converters[0x1F87] = *getUTF8FromUniChar(0x03B1, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER ALPHA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI
+			converters[0x1F70] = *getUTF8FromUniChar(0x03B1, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER ALPHA WITH VARIA
+			converters[0x1F71] = *getUTF8FromUniChar(0x03B1, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER ALPHA WITH OXIA
+			converters[0x1FB0] = *getUTF8FromUniChar(0x03B1, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER ALPHA WITH VRACHY
+			converters[0x1FB1] = *getUTF8FromUniChar(0x03B1, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER ALPHA WITH MACRON
+			converters[0x1FB2] = *getUTF8FromUniChar(0x03B1, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER ALPHA WITH VARIA AND YPOGEGRAMMENI
+			converters[0x1FB3] = *getUTF8FromUniChar(0x03B1, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER ALPHA WITH YPOGEGRAMMENI
+			converters[0x1FB4] = *getUTF8FromUniChar(0x03B1, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER ALPHA WITH OXIA AND YPOGEGRAMMENI
+			converters[0x1FB5] = *getUTF8FromUniChar(0x03B1, &myBuf); myBuf.setSize(0);	// unused?
+			converters[0x1FB6] = *getUTF8FromUniChar(0x03B1, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER ALPHA WITH PERISPOMENI
+			converters[0x1FB7] = *getUTF8FromUniChar(0x03B1, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI
+
+			converters[0x1F10] = *getUTF8FromUniChar(0x03B5, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER EPSILON WITH PSILI
+			converters[0x1F11] = *getUTF8FromUniChar(0x03B5, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER EPSILON WITH DASIA
+			converters[0x1F12] = *getUTF8FromUniChar(0x03B5, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER EPSILON WITH PSILI AND VARIA
+			converters[0x1F13] = *getUTF8FromUniChar(0x03B5, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER EPSILON WITH DASIA AND VARIA
+			converters[0x1F14] = *getUTF8FromUniChar(0x03B5, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER EPSILON WITH PSILI AND OXIA
+			converters[0x1F15] = *getUTF8FromUniChar(0x03B5, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER EPSILON WITH DASIA AND OXIA
+			converters[0x1F72] = *getUTF8FromUniChar(0x03B5, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER EPSILON WITH VARIA
+			converters[0x1F73] = *getUTF8FromUniChar(0x03B5, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER EPSILON WITH OXIA
+
+			converters[0x1F90] = *getUTF8FromUniChar(0x03B7, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER ETA WITH PSILI AND YPOGEGRAMMENI
+			converters[0x1F91] = *getUTF8FromUniChar(0x03B7, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER ETA WITH DASIA AND YPOGEGRAMMENI
+			converters[0x1F92] = *getUTF8FromUniChar(0x03B7, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER ETA WITH PSILI AND VARIA AND YPOGEGRAMMENI
+			converters[0x1F93] = *getUTF8FromUniChar(0x03B7, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER ETA WITH DASIA AND VARIA AND YPOGEGRAMMENI
+			converters[0x1F94] = *getUTF8FromUniChar(0x03B7, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER ETA WITH PSILI AND OXIA AND YPOGEGRAMMENI
+			converters[0x1F95] = *getUTF8FromUniChar(0x03B7, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER ETA WITH DASIA AND OXIA AND YPOGEGRAMMENI
+			converters[0x1F96] = *getUTF8FromUniChar(0x03B7, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER ETA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI
+			converters[0x1F97] = *getUTF8FromUniChar(0x03B7, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER ETA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI
+			converters[0x1F20] = *getUTF8FromUniChar(0x03B7, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER ETA WITH PSILI
+			converters[0x1F21] = *getUTF8FromUniChar(0x03B7, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER ETA WITH DASIA
+			converters[0x1F22] = *getUTF8FromUniChar(0x03B7, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER ETA WITH PSILI AND VARIA
+			converters[0x1F23] = *getUTF8FromUniChar(0x03B7, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER ETA WITH DASIA AND VARIA
+			converters[0x1F24] = *getUTF8FromUniChar(0x03B7, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER ETA WITH PSILI AND OXIA
+			converters[0x1F25] = *getUTF8FromUniChar(0x03B7, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER ETA WITH DASIA AND OXIA
+			converters[0x1F26] = *getUTF8FromUniChar(0x03B7, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER ETA WITH PSILI AND PERISPOMENI
+			converters[0x1F27] = *getUTF8FromUniChar(0x03B7, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER ETA WITH DASIA AND PERISPOMENI
+			converters[0x1FC2] = *getUTF8FromUniChar(0x03B7, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER ETA WITH VARIA AND YPOGEGRAMMENI
+			converters[0x1FC3] = *getUTF8FromUniChar(0x03B7, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER ETA WITH YPOGEGRAMMENI
+			converters[0x1FC4] = *getUTF8FromUniChar(0x03B7, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER ETA WITH OXIA AND YPOGEGRAMMENI
+			converters[0x1FC5] = *getUTF8FromUniChar(0x03B7, &myBuf); myBuf.setSize(0);	// unused?
+			converters[0x1FC6] = *getUTF8FromUniChar(0x03B7, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER ETA WITH PERISPOMENI
+			converters[0x1FC7] = *getUTF8FromUniChar(0x03B7, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI
+			converters[0x1F74] = *getUTF8FromUniChar(0x03B7, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER ETA WITH VARIA
+			converters[0x1F75] = *getUTF8FromUniChar(0x03B7, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER ETA WITH OXIA
+
+			converters[0x1F30] = *getUTF8FromUniChar(0x03B9, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER IOTA WITH PSILI
+			converters[0x1F31] = *getUTF8FromUniChar(0x03B9, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER IOTA WITH DASIA
+			converters[0x1F32] = *getUTF8FromUniChar(0x03B9, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER IOTA WITH PSILI AND VARIA
+			converters[0x1F33] = *getUTF8FromUniChar(0x03B9, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER IOTA WITH DASIA AND VARIA
+			converters[0x1F34] = *getUTF8FromUniChar(0x03B9, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER IOTA WITH PSILI AND OXIA
+			converters[0x1F35] = *getUTF8FromUniChar(0x03B9, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER IOTA WITH DASIA AND OXIA
+			converters[0x1F36] = *getUTF8FromUniChar(0x03B9, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER IOTA WITH PSILI AND PERISPOMENI
+			converters[0x1F37] = *getUTF8FromUniChar(0x03B9, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER IOTA WITH DASIA AND PERISPOMENI
+			converters[0x1F76] = *getUTF8FromUniChar(0x03B9, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER IOTA WITH VARIA
+			converters[0x1F77] = *getUTF8FromUniChar(0x03B9, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER IOTA WITH OXIA
+			converters[0x1FD0] = *getUTF8FromUniChar(0x03B9, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER IOTA WITH VRACHY
+			converters[0x1FD1] = *getUTF8FromUniChar(0x03B9, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER IOTA WITH MACRON
+			converters[0x1FD2] = *getUTF8FromUniChar(0x03B9, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER IOTA WITH DIALYTIKA AND VARIA
+			converters[0x1FD3] = *getUTF8FromUniChar(0x03B9, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
+			converters[0x1FD4] = *getUTF8FromUniChar(0x03B9, &myBuf); myBuf.setSize(0);	// unused?
+			converters[0x1FD5] = *getUTF8FromUniChar(0x03B9, &myBuf); myBuf.setSize(0);	// unused?
+			converters[0x1FD6] = *getUTF8FromUniChar(0x03B9, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER IOTA WITH PERISPOMENI
+			converters[0x1FD7] = *getUTF8FromUniChar(0x03B9, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER IOTA WITH DIALYTIKA AND PERISPOMENI
+
+			converters[0x1F40] = *getUTF8FromUniChar(0x03BF, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER OMICRON WITH PSILI
+			converters[0x1F41] = *getUTF8FromUniChar(0x03BF, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER OMICRON WITH DASIA
+			converters[0x1F42] = *getUTF8FromUniChar(0x03BF, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER OMICRON WITH PSILI AND VARIA
+			converters[0x1F43] = *getUTF8FromUniChar(0x03BF, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER OMICRON WITH DASIA AND VARIA
+			converters[0x1F44] = *getUTF8FromUniChar(0x03BF, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER OMICRON WITH PSILI AND OXIA
+			converters[0x1F45] = *getUTF8FromUniChar(0x03BF, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER OMICRON WITH DASIA AND OXIA
+			converters[0x1F78] = *getUTF8FromUniChar(0x03BF, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER OMICRON WITH VARIA
+			converters[0x1F79] = *getUTF8FromUniChar(0x03BF, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER OMICRON WITH OXIA
+
+			converters[0x1F50] = *getUTF8FromUniChar(0x03C5, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER UPSILON WITH PSILI
+			converters[0x1F51] = *getUTF8FromUniChar(0x03C5, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER UPSILON WITH DASIA
+			converters[0x1F52] = *getUTF8FromUniChar(0x03C5, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER UPSILON WITH PSILI AND VARIA
+			converters[0x1F53] = *getUTF8FromUniChar(0x03C5, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER UPSILON WITH DASIA AND VARIA
+			converters[0x1F54] = *getUTF8FromUniChar(0x03C5, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER UPSILON WITH PSILI AND OXIA
+			converters[0x1F55] = *getUTF8FromUniChar(0x03C5, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER UPSILON WITH DASIA AND OXIA
+			converters[0x1F56] = *getUTF8FromUniChar(0x03C5, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER UPSILON WITH PSILI AND PERISPOMENI
+			converters[0x1F57] = *getUTF8FromUniChar(0x03C5, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER UPSILON WITH DASIA AND PERISPOMENI
+			converters[0x1F7A] = *getUTF8FromUniChar(0x03C5, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER UPSILON WITH VARIA
+			converters[0x1F7B] = *getUTF8FromUniChar(0x03C5, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER UPSILON WITH OXIA
+			converters[0x1FE0] = *getUTF8FromUniChar(0x03C5, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER UPSILON WITH VRACHY
+			converters[0x1FE1] = *getUTF8FromUniChar(0x03C5, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER UPSILON WITH MACRON
+			converters[0x1FE2] = *getUTF8FromUniChar(0x03C5, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND VARIA
+			converters[0x1FE3] = *getUTF8FromUniChar(0x03C5, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
+			converters[0x1FE6] = *getUTF8FromUniChar(0x03C5, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER UPSILON WITH PERISPOMENI
+			converters[0x1FE7] = *getUTF8FromUniChar(0x03C5, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND PERISPOMENI
+
+			converters[0x1F60] = *getUTF8FromUniChar(0x03C9, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER OMEGA WITH PSILI
+			converters[0x1F61] = *getUTF8FromUniChar(0x03C9, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER OMEGA WITH DASIA
+			converters[0x1F62] = *getUTF8FromUniChar(0x03C9, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER OMEGA WITH PSILI AND VARIA
+			converters[0x1F63] = *getUTF8FromUniChar(0x03C9, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER OMEGA WITH DASIA AND VARIA
+			converters[0x1F64] = *getUTF8FromUniChar(0x03C9, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER OMEGA WITH PSILI AND OXIA
+			converters[0x1F65] = *getUTF8FromUniChar(0x03C9, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER OMEGA WITH DASIA AND OXIA
+			converters[0x1F66] = *getUTF8FromUniChar(0x03C9, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER OMEGA WITH PSILI AND PERISPOMENI
+			converters[0x1F67] = *getUTF8FromUniChar(0x03C9, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER OMEGA WITH DASIA AND PERISPOMENI
+			converters[0x1F7C] = *getUTF8FromUniChar(0x03C9, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER OMEGA WITH VARIA
+			converters[0x1F7D] = *getUTF8FromUniChar(0x03C9, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER OMEGA WITH OXIA
+			converters[0x1FA0] = *getUTF8FromUniChar(0x03C9, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER OMEGA WITH PSILI AND YPOGEGRAMMENI
+			converters[0x1FA1] = *getUTF8FromUniChar(0x03C9, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER OMEGA WITH DASIA AND YPOGEGRAMMENI
+			converters[0x1FA2] = *getUTF8FromUniChar(0x03C9, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER OMEGA WITH PSILI AND VARIA AND YPOGEGRAMMENI
+			converters[0x1FA3] = *getUTF8FromUniChar(0x03C9, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER OMEGA WITH DASIA AND VARIA AND YPOGEGRAMMENI
+			converters[0x1FA4] = *getUTF8FromUniChar(0x03C9, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER OMEGA WITH PSILI AND OXIA AND YPOGEGRAMMENI
+			converters[0x1FA5] = *getUTF8FromUniChar(0x03C9, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER OMEGA WITH DASIA AND OXIA AND YPOGEGRAMMENI
+			converters[0x1FA6] = *getUTF8FromUniChar(0x03C9, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER OMEGA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI
+			converters[0x1FA7] = *getUTF8FromUniChar(0x03C9, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER OMEGA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI
+			converters[0x1FF2] = *getUTF8FromUniChar(0x03C9, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER OMEGA WITH VARIA AND YPOGEGRAMMENI
+			converters[0x1FF3] = *getUTF8FromUniChar(0x03C9, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER OMEGA WITH YPOGEGRAMMENI
+			converters[0x1FF4] = *getUTF8FromUniChar(0x03C9, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER OMEGA WITH OXIA AND YPOGEGRAMMENI
+			converters[0x1FF5] = *getUTF8FromUniChar(0x03C9, &myBuf); myBuf.setSize(0);	// unused?
+			converters[0x1FF6] = *getUTF8FromUniChar(0x03C9, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER OMEGA WITH PERISPOMENI
+			converters[0x1FF7] = *getUTF8FromUniChar(0x03C9, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI
+
+			converters[0x1FE4] = *getUTF8FromUniChar(0x03C1, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER RHO WITH PSILI
+			converters[0x1FE5] = *getUTF8FromUniChar(0x03C1, &myBuf); myBuf.setSize(0);	// GREEK SMALL LETTER RHO WITH DASIA
+		}
+	} __converters_init;
 }
 
 
@@ -57,229 +351,24 @@
 char UTF8GreekAccents::processText(SWBuf &text, const SWKey *key, const SWModule *module) {
 
 	if (!option) { //we don't want greek accents
-    		//unsigned char *to, *from;
-		//to = (unsigned char*)text;
-		//for (from = (unsigned char*)text; *from; from++) {
-#ifdef _ICU_
-		decompose.processText(text, (SWKey *)2);  // note the hack of 2 to mimic a real key. TODO: remove all hacks
-#endif
-		
 		SWBuf orig = text;
 		const unsigned char* from = (unsigned char*)orig.c_str();
-		for (text = ""; *from; from++) {		
-			//first just remove combining characters
-			if (*from == 0xE2 && *(from + 1) == 0x80 && *(from + 2) == 0x99) {
-				from += 2;
-			}
-			else if (*from == 0xCC && *(from + 1)) {
-				if (*(from + 1) == 0x80 || *(from + 1) == 0x81 || *(from + 1) == 0x82 || *(from + 1) == 0x88 || *(from + 1) == 0x93 || *(from + 1) == 0x94) {
-					from++;
-				}
-			}
-			else if (*from == 0xCD && (*(from + 1) == 0xBA || *(from + 1) == 0x82)) {
-				from++;
-			}
-			//now converted pre-composed characters to their alphabetic bases, discarding the accents
+		text = "";
+		map<__u32, SWBuf>::const_iterator it = converters.end();
+		while (*from) {		
+			__u32 ch = getUniCharFromUTF8(&from, true);
+			// if ch is bad, then convert to replacement char
+			if (!ch) ch = 0xFFFD;
 
-			//Greek
-			//capital alpha
-			else if ((*from == 0xCE && *(from + 1) == 0x86)) {
-				text += 0xCE;
-				text += 0x91;
-				from++;
+			it = converters.find(ch);
+			if (it == converters.end()) {
+				getUTF8FromUniChar(ch, &text);
 			}
-			//capital epsilon
-			else if ((*from == 0xCE && *(from + 1) == 0x88)) {
-				text += 0xCE;
-				text += 0x95;
-				from++;
-			}
-			//capital eta
-			else if ((*from == 0xCE && *(from + 1) == 0x89)) {
-				text += 0xCE;
-				text += 0x97;
-				from++;
-			}
-			//capital iota
-			else if ((*from == 0xCE && (*(from + 1) == 0x8A || *(from + 1) == 0xAA))) {
-				text += 0xCE;
-				text += 0x99;
-				from++;
-			}
-			//capital omicron
-			else if ((*from == 0xCE && *(from + 1) == 0x8C)) {
-				text += 0xCE;
-				text += 0x9F;
-				from++;
-			}
-			//capital upsilon
-			else if ((*from == 0xCE && (*(from + 1) == 0x8E || *(from + 1) == 0xAB))) {
-				text += 0xCE;
-				text += 0xA5;
-				from++;
-			}
-			//capital omega
-			else if ((*from == 0xCE && *(from + 1) == 0x8F)) {
-				text += 0xCE;
-				text += 0xA9;
-				from++;
-			}
-
-			//alpha
-			else if ((*from == 0xCE && *(from + 1) == 0xAC)) {
-				text += 0xCE;
-				text += 0xB1;
-				from++;
-			}
-			//epsilon
-			else if ((*from == 0xCE && *(from + 1) == 0xAD)) {
-				text += 0xCE;
-				text += 0xB5;
-				from++;
-			}
-			//eta
-			else if ((*from == 0xCE && *(from + 1) == 0xAE)) {
-				text += 0xCE;
-				text += 0xB7;
-				from++;
-			}
-			//iota
-			else if ((*from == 0xCE && *(from + 1) == 0xAF) || (*from == 0xCF && *(from + 1) == 0x8A)) {
-				text += 0xCE;
-				text += 0xB9;
-				from++;
-			}
-			//omicron
-			else if ((*from == 0xCF && *(from + 1) == 0x8C)) {
-				text += 0xCE;
-				text += 0xBF;
-				from++;
-			}
-			//upsilon
-			else if ((*from == 0xCE && *(from + 1) == 0x88) || (*from == 0xCF && (*(from + 1) == 0x8B || *(from + 1) == 0x8D))) {
-				text += 0xCF;
-				text += 0x85;
-				from++;
-			}
-			//omega
-			else if ((*from == 0xCF && *(from + 1) == 0x8E)) {
-				text += 0xCF;
-				text += 0x89;
-				from++;
-			}
-
-			//Extended Greek
-			//capital alpha
-			else if (*from == 0xE1 && (((*(from + 1) == 0xBC || *(from + 1) == 0xBE) && *(from + 2) >= 0x88 && *(from + 2) <= 0x8F) || (*(from + 1) == 0xBE && *(from + 2) >= 0xB8 && *(from + 2) <= 0xBC))) {
-				text += 0xCE;
-				text += 0x91;
-				from+=2;
-			}
-			//capital epsilon
-			else if (*from == 0xE1 && ((*(from + 1) == 0xBC && *(from + 2) >= 0x98 && *(from + 2) <= 0x9D) || (*(from + 1) == 0xBF && (*(from + 2) == 0x88 || *(from + 2) == 0x89)))) {
-				text += 0xCE;
-				text += 0x95;
-				from+=2;
-			}
-			//capital eta
-			else if (*from == 0xE1 && ((*(from + 1) == 0xBC && *(from + 2) >= 0xA8 && *(from + 2) <= 0xAF) || (*(from + 1) == 0xBE && *(from + 2) >= 0x98 && *(from + 2) <= 0x9F) || (*(from + 1) == 0xBF && *(from + 2) >= 0x8A && *(from + 2) <= 0x8C))) {
-				text += 0xCE;
-				text += 0x97;
-				from+=2;
-			}
-			//capital iota
-			else if (*from == 0xE1 && ((*(from + 1) == 0xBC && *(from + 2) >= 0xB8 && *(from + 2) <= 0xBF) || (*(from + 1) == 0xBF && *(from + 2) >= 0x98 && *(from + 2) <= 0x9B))) {
-				text += 0xCE;
-				text += 0x99;
-				from+=2;
-			}
-			//capital omicron
-			else if (*from == 0xE1 && (((*(from + 1) == 0xBD && *(from + 2) >= 0x88 && *(from + 2) <= 0x8D)) || ((*(from + 1) == 0xBF && (*(from + 2) == 0xB8 || *(from + 2) == 0xB9))))) {
-				text += 0xCE;
-				text += 0x9F;
-				from+=2;
-			}
-			//capital upsilon
-			else if (*from == 0xE1 && ((*(from + 1) == 0xBD && *(from + 2) >= 0x99 && *(from + 2) <= 0x9F) || (*(from + 1) == 0xBF && *(from + 2) >= 0xA8 && *(from + 2) <= 0xAB))) {
-				text += 0xCE;
-				text += 0xA5;
-				from+=2;
-			}
-			//capital omega
-			else if (*from == 0xE1 && (((*(from + 1) == 0xBD || *(from + 1) == 0xBE) && *(from + 2) >= 0xA8 && *(from + 2) <= 0xAF) || (*(from + 1) == 0xBF && *(from + 2) >= 0xBA && *(from + 2) <= 0xBC))) {
-				text += 0xCE;
-				text += 0xA9;
-				from+=2;
-			}
-			//capital rho
-			else if (*from == 0xE1 && *(from + 1) == 0xBF && *(from + 2) == 0xAC) {
-				text += 0xCE;
-				text += 0xA1;
-				from+=2;
-			}
-
-			//alpha
-			else if (*from == 0xE1 && (
-                            ((*(from + 1) == 0xBC || *(from + 1) == 0xBE) && *(from + 2) >= 0x80 && *(from + 2) <= 0x87)
-                         || (*(from + 1) == 0xBD && (*(from + 2) == 0xB0 || *(from + 2) == 0xB1))
-                         || (*(from + 1) == 0xBE && *(from + 2) >= 0xB0 && *(from + 2) <= 0xB7))) {
-				text += 0xCE;
-				text += 0xB1;
-				from+=2;
-			}
-			//epsilon
-			else if (*from == 0xE1 && ((*(from + 1) == 0xBC && *(from + 2) >= 0x90 && *(from + 2) <= 0x95) || (*(from + 1) == 0xBD && (*(from + 2) == 0xB2 || *(from + 2) == 0xB3)))) {
-				text += 0xCE;
-				text += 0xB5;
-				from+=2;
-			}
-			//eta
-			else if (*from == 0xE1 && ((*(from + 1) == 0xBE && *(from + 2) >= 0x90 && *(from + 2) <= 0x97) || (*(from + 1) == 0xBC && *(from + 2) >= 0xA0 && *(from + 2) <= 0xA7) || (*(from + 1) == 0xBF && *(from + 2) >= 0x82 && *(from + 2) <= 0x87) || (*(from + 1) == 0xBD && (*(from + 2) == 0xB4 || *(from + 2) == 0xB5)))) {
-				text += 0xCE;
-				text += 0xB7;
-				from+=2;
-			}
-			//iota
-			else if (*from == 0xE1 && ((*(from + 1) == 0xBC && *(from + 2) >= 0xB0 && *(from + 2) <= 0xB7) || (*(from + 1) == 0xBD && (*(from + 2) == 0xB6 || *(from + 2) == 0xB7)) || (*(from + 1) == 0xBF && *(from + 2) >= 0x90 && *(from + 2) <= 0x97))) {
-				text += 0xCE;
-				text += 0xB9;
-				from+=2;
-			}
-			//omicron
-			else if (*from == 0xE1 && (*(from + 1) == 0xBD && ((*(from + 2) >= 0x80 && *(from + 2) <= 0x85) || (*(from + 2) == 0xB8 || *(from + 2) == 0xB9)))) {
-				text += 0xCE;
-				text += 0xBF;
-				from+=2;
-			}
-			//upsilon
-			else if (*from == 0xE1 && ((*(from + 1) == 0xBD && ((*(from + 2) >= 0x90 && *(from + 2) <= 0x97) || *(from + 2) == 0xBA || *(from + 2) == 0xBB)) || (*(from + 1) == 0xBF && ((*(from + 2) >= 0xA0 && *(from + 2) <= 0xA3) || *(from + 2) == 0xA6 || *(from + 2) == 0xA7)))) {
-				text += 0xCF;
-				text += 0x85;
-				from+=2;
-			}
-			//omega
-			else if (*from == 0xE1 && ((*(from + 1) == 0xBD && ((*(from + 2) >= 0xA0 && *(from + 2) <= 0xA7) || (*(from + 2) == 0xBC || *(from + 2) == 0xBD))) || (*(from + 1) == 0xBE && (*(from + 2) >= 0xA0 && *(from + 2) <= 0xA7)) || (*(from + 1) == 0xBF && *(from + 2) >= 0xB2 && *(from + 2) <= 0xB7))) {
-				text += 0xCF;
-				text += 0x89;
-				from+=2;
-			}
-			//rho
-			else if (*from == 0xE1 && *(from + 1) == 0xBF && (*(from + 2) == 0xA4 && *(from + 2) == 0xA5)) {
-				text += 0xCF;
-				text += 0x81;
-				from+=2;
-			}
-			else { //no characters we filter
-				text += *from;
-			}
+			else text.append((const char *)it->second, it->second.size());	// save a strlen, since we know our size
 		}
 	}
 	return 0;
 }
 
 
-
-
-
-
 SWORD_NAMESPACE_END

Added: trunk/tests/testsuite/README
===================================================================
--- trunk/tests/testsuite/README	                        (rev 0)
+++ trunk/tests/testsuite/README	2017-09-11 11:41:08 UTC (rev 3496)
@@ -0,0 +1,24 @@
+All tests are represented by a <test>.sh / <test>.good file pair.
+
+To run a test:
+
+./runtest.sh test
+
+This will run test.sh > test.try and compare test.try to test.good and report any differences (failures)
+
+To run all tests:
+
+./runall.sh
+
+===================================
+
+To create a new test, do whatever you want in your new mytest.sh file:
+call executables, do anything you'd like, and output the results that
+matter for a good test.
+
+When all is running fine, output your .good file with:
+
+./mytest.sh > mytest.good
+
+That's it.  Simple right?  :)  So make more unit tests!
+

Added: trunk/tests/testsuite/UTF-8-test.txt
===================================================================
--- trunk/tests/testsuite/UTF-8-test.txt	                        (rev 0)
+++ trunk/tests/testsuite/UTF-8-test.txt	2017-09-11 11:41:08 UTC (rev 3496)
@@ -0,0 +1,300 @@
+UTF-8 decoder capability and stress test
+----------------------------------------
+
+Markus Kuhn <http://www.cl.cam.ac.uk/~mgk25/> - 2015-08-28 - CC BY 4.0
+
+This test file can help you examine, how your UTF-8 decoder handles
+various types of correct, malformed, or otherwise interesting UTF-8
+sequences. This file is not meant to be a conformance test. It does
+not prescribe any particular outcome. Therefore, there is no way to
+"pass" or "fail" this test file, even though the text does suggest a
+preferable decoder behaviour at some places. Its aim is, instead, to
+help you think about, and test, the behaviour of your UTF-8 decoder on a
+systematic collection of unusual inputs. Experience so far suggests
+that most first-time authors of UTF-8 decoders find at least one
+serious problem in their decoder using this file.
+
+The test lines below cover boundary conditions, malformed UTF-8
+sequences, as well as correctly encoded UTF-8 sequences of Unicode code
+points that should never occur in a correct UTF-8 file.
+
+According to ISO 10646-1:2000, sections D.7 and 2.3c, a device
+receiving UTF-8 shall interpret a "malformed sequence in the same way
+that it interprets a character that is outside the adopted subset" and
+"characters that are not within the adopted subset shall be indicated
+to the user" by a receiving device. One commonly used approach in
+UTF-8 decoders is to replace any malformed UTF-8 sequence by a
+replacement character (U+FFFD), which looks a bit like an inverted
+question mark, or a similar symbol. It might be a good idea to
+visually distinguish a malformed UTF-8 sequence from a correctly
+encoded Unicode character that is just not available in the current
+font but otherwise fully legal, even though ISO 10646-1 doesn't
+mandate this. In any case, just ignoring malformed sequences or
+unavailable characters does not conform to ISO 10646, will make
+debugging more difficult, and can lead to user confusion.
+
+Please check, whether a malformed UTF-8 sequence is (1) represented at
+all, (2) represented by exactly one single replacement character (or
+equivalent signal), and (3) the following quotation mark after an
+illegal UTF-8 sequence is correctly displayed, i.e. proper
+resynchronization takes place immediately after any malformed
+sequence. This file says "THE END" in the last line, so if you don't
+see that, your decoder crashed somehow before, which should always be
+cause for concern.
+
+All lines in this file are exactly 79 characters long (plus the line
+feed). In addition, all lines end with "|", except for the two test
+lines 2.1.1 and 2.2.1, which contain non-printable ASCII controls
+U+0000 and U+007F. If you display this file with a fixed-width font,
+these "|" characters should all line up in column 79 (right margin).
+This allows you to test quickly, whether your UTF-8 decoder finds the
+correct number of characters in every line, that is whether each
+malformed sequences is replaced by a single replacement character.
+
+Note that, as an alternative to the notion of malformed sequence used
+here, it is also a perfectly acceptable (and in some situations even
+preferable) solution to represent each individual byte of a malformed
+sequence with a replacement character. If you follow this strategy in
+your decoder, then please ignore the "|" column.
+
+
+Here come the tests:                                                          |
+                                                                              |
+1  Some correct UTF-8 text                                                    |
+                                                                              |
+You should see the Greek word 'kosme':       "κόσμε"                          |
+                                                                              |
+2  Boundary condition test cases                                              |
+                                                                              |
+2.1  First possible sequence of a certain length                              |
+                                                                              |
+2.1.1  1 byte  (U-00000000):        "^@" // SWORD: removed. we don't support null mid-string, <- that's a literal <caret at>
+2.1.2  2 bytes (U-00000080):        "Â€"                                       |
+2.1.3  3 bytes (U-00000800):        "à €"                                       |
+2.1.4  4 bytes (U-00010000):        "ð€€"                                       |
+2.1.5  5 bytes (U-00200000):        "øˆ€€€"                                       |
+2.1.6  6 bytes (U-04000000):        "ü„€€€€"                                       |
+                                                                              |
+2.2  Last possible sequence of a certain length                               |
+                                                                              |
+2.2.1  1 byte  (U-0000007F):        ""                                        
+2.2.2  2 bytes (U-000007FF):        "ß¿"                                       |
+2.2.3  3 bytes (U-0000FFFF):        "ï¿¿"                                       |
+2.2.4  4 bytes (U-001FFFFF):        "÷¿¿¿"                                       |
+2.2.5  5 bytes (U-03FFFFFF):        "û¿¿¿¿"                                       |
+2.2.6  6 bytes (U-7FFFFFFF):        "ý¿¿¿¿¿"                                       |
+                                                                              |
+2.3  Other boundary conditions                                                |
+                                                                              |
+2.3.1  U-0000D7FF = ed 9f bf = "íŸ¿"                                            |
+2.3.2  U-0000E000 = ee 80 80 = "î€€"                                            |
+2.3.3  U-0000FFFD = ef bf bd = "ï¿½"                                            |
+2.3.4  U-0010FFFF = f4 8f bf bf = "ô¿¿"                                         |
+2.3.5  U-00110000 = f4 90 80 80 = "ô€€"                                         |
+                                                                              |
+3  Malformed sequences                                                        |
+                                                                              |
+3.1  Unexpected continuation bytes                                            |
+                                                                              |
+Each unexpected continuation byte should be separately signalled as a         |
+malformed sequence of its own.                                                |
+                                                                              |
+3.1.1  First continuation byte 0x80: "€"                                      |
+3.1.2  Last  continuation byte 0xbf: "¿"                                      |
+                                                                              |
+3.1.3  2 continuation bytes: "€¿"                                             |
+3.1.4  3 continuation bytes: "€¿€"                                            |
+3.1.5  4 continuation bytes: "€¿€¿"                                           |
+3.1.6  5 continuation bytes: "€¿€¿€"                                          |
+3.1.7  6 continuation bytes: "€¿€¿€¿"                                         |
+3.1.8  7 continuation bytes: "€¿€¿€¿€"                                        |
+                                                                              |
+3.1.9  Sequence of all 64 possible continuation bytes (0x80-0xbf):            |
+                                                                              |
+   "€‚ƒ„…†‡ˆ‰Š‹ŒŽ                                                          |
+    ‘’“”•–—˜™š›œžŸ                                                          |
+     ¡¢£¤¥¦§¨©ª«¬®¯                                                          |
+    °±²³´µ¶·¸¹º»¼½¾¿"                                                         |
+                                                                              |
+3.2  Lonely start characters                                                  |
+                                                                              |
+3.2.1  All 32 first bytes of 2-byte sequences (0xc0-0xdf),                    |
+       each followed by a space character:                                    |
+                                                                              |
+   "À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï                                           |
+    Ð Ñ Ò Ó Ô Õ Ö × Ø Ù Ú Û Ü Ý Þ ß "                                         |
+                                                                              |
+3.2.2  All 16 first bytes of 3-byte sequences (0xe0-0xef),                    |
+       each followed by a space character:                                    |
+                                                                              |
+   "à á â ã ä å æ ç è é ê ë ì í î ï "                                         |
+                                                                              |
+3.2.3  All 8 first bytes of 4-byte sequences (0xf0-0xf7),                     |
+       each followed by a space character:                                    |
+                                                                              |
+   "ð ñ ò ó ô õ ö ÷ "                                                         |
+                                                                              |
+3.2.4  All 4 first bytes of 5-byte sequences (0xf8-0xfb),                     |
+       each followed by a space character:                                    |
+                                                                              |
+   "ø ù ú û "                                                                 |
+                                                                              |
+3.2.5  All 2 first bytes of 6-byte sequences (0xfc-0xfd),                     |
+       each followed by a space character:                                    |
+                                                                              |
+   "ü ý "                                                                     |
+                                                                              |
+3.3  Sequences with last continuation byte missing                            |
+                                                                              |
+All bytes of an incomplete sequence should be signalled as a single           |
+malformed sequence, i.e., you should see only a single replacement            |
+character in each of the next 10 tests. (Characters as in section 2)          |
+                                                                              |
+3.3.1  2-byte sequence with last byte missing (U+0000):     "À"               |
+3.3.2  3-byte sequence with last byte missing (U+0000):     "à€"               |
+3.3.3  4-byte sequence with last byte missing (U+0000):     "ð€€"               |
+3.3.4  5-byte sequence with last byte missing (U+0000):     "ø€€€"               |
+3.3.5  6-byte sequence with last byte missing (U+0000):     "ü€€€€"               |
+3.3.6  2-byte sequence with last byte missing (U-000007FF): "ß"               |
+3.3.7  3-byte sequence with last byte missing (U-0000FFFF): "ï¿"               |
+3.3.8  4-byte sequence with last byte missing (U-001FFFFF): "÷¿¿"               |
+3.3.9  5-byte sequence with last byte missing (U-03FFFFFF): "û¿¿¿"               |
+3.3.10 6-byte sequence with last byte missing (U-7FFFFFFF): "ý¿¿¿¿"               |
+                                                                              |
+3.4  Concatenation of incomplete sequences                                    |
+                                                                              |
+All the 10 sequences of 3.3 concatenated, you should see 10 malformed         |
+sequences being signalled:                                                    |
+                                                                              |
+   "Àà€ð€€ø€€€ü€€€€ßï¿÷¿¿û¿¿¿ý¿¿¿¿"                                                               |
+                                                                              |
+3.5  Impossible bytes                                                         |
+                                                                              |
+The following two bytes cannot appear in a correct UTF-8 string               |
+                                                                              |
+3.5.1  fe = "þ"                                                               |
+3.5.2  ff = "ÿ"                                                               |
+3.5.3  fe fe ff ff = "þþÿÿ"                                                   |
+                                                                              |
+4  Overlong sequences                                                         |
+                                                                              |
+The following sequences are not malformed according to the letter of          |
+the Unicode 2.0 standard. However, they are longer then necessary and         |
+a correct UTF-8 encoder is not allowed to produce them. A "safe UTF-8         |
+decoder" should reject them just like malformed sequences for two             |
+reasons: (1) It helps to debug applications if overlong sequences are         |
+not treated as valid representations of characters, because this helps        |
+to spot problems more quickly. (2) Overlong sequences provide                 |
+alternative representations of characters, that could maliciously be          |
+used to bypass filters that check only for ASCII characters. For              |
+instance, a 2-byte encoded line feed (LF) would not be caught by a            |
+line counter that counts only 0x0a bytes, but it would still be               |
+processed as a line feed by an unsafe UTF-8 decoder later in the              |
+pipeline. From a security point of view, ASCII compatibility of UTF-8         |
+sequences means also, that ASCII characters are *only* allowed to be          |
+represented by ASCII bytes in the range 0x00-0x7f. To ensure this             |
+aspect of ASCII compatibility, use only "safe UTF-8 decoders" that            |
+reject overlong UTF-8 sequences for which a shorter encoding exists.          |
+                                                                              |
+4.1  Examples of an overlong ASCII character                                  |
+                                                                              |
+With a safe UTF-8 decoder, all of the following five overlong                 |
+representations of the ASCII character slash ("/") should be rejected         |
+like a malformed UTF-8 sequence, for instance by substituting it with         |
+a replacement character. If you see a slash below, you do not have a          |
+safe UTF-8 decoder!                                                           |
+                                                                              |
+4.1.1 U+002F = c0 af             = "À¯"                                        |
+4.1.2 U+002F = e0 80 af          = "à€¯"                                        |
+4.1.3 U+002F = f0 80 80 af       = "ð€€¯"                                        |
+4.1.4 U+002F = f8 80 80 80 af    = "ø€€€¯"                                        |
+4.1.5 U+002F = fc 80 80 80 80 af = "ü€€€€¯"                                        |
+                                                                              |
+4.2  Maximum overlong sequences                                               |
+                                                                              |
+Below you see the highest Unicode value that is still resulting in an         |
+overlong sequence if represented with the given number of bytes. This         |
+is a boundary test for safe UTF-8 decoders. All five characters should        |
+be rejected like malformed UTF-8 sequences.                                   |
+                                                                              |
+4.2.1  U-0000007F = c1 bf             = "Á¿"                                   |
+4.2.2  U-000007FF = e0 9f bf          = "àŸ¿"                                   |
+4.2.3  U-0000FFFF = f0 8f bf bf       = "ð¿¿"                                   |
+4.2.4  U-001FFFFF = f8 87 bf bf bf    = "ø‡¿¿¿"                                   |
+4.2.5  U-03FFFFFF = fc 83 bf bf bf bf = "üƒ¿¿¿¿"                                   |
+                                                                              |
+4.3  Overlong representation of the NUL character                             |
+                                                                              |
+The following five sequences should also be rejected like malformed           |
+UTF-8 sequences and should not be treated like the ASCII NUL                  |
+character.                                                                    |
+                                                                              |
+4.3.1  U+0000 = c0 80             = "À€"                                       |
+4.3.2  U+0000 = e0 80 80          = "à€€"                                       |
+4.3.3  U+0000 = f0 80 80 80       = "ð€€€"                                       |
+4.3.4  U+0000 = f8 80 80 80 80    = "ø€€€€"                                       |
+4.3.5  U+0000 = fc 80 80 80 80 80 = "ü€€€€€"                                       |
+                                                                              |
+5  Illegal code positions                                                     |
+                                                                              |
+The following UTF-8 sequences should be rejected like malformed               |
+sequences, because they never represent valid ISO 10646 characters and        |
+a UTF-8 decoder that accepts them might introduce security problems           |
+comparable to overlong UTF-8 sequences.                                       |
+                                                                              |
+5.1 Single UTF-16 surrogates                                                  |
+                                                                              |
+5.1.1  U+D800 = ed a0 80 = "í €"                                                |
+5.1.2  U+DB7F = ed ad bf = "í¿"                                                |
+5.1.3  U+DB80 = ed ae 80 = "í®€"                                                |
+5.1.4  U+DBFF = ed af bf = "í¯¿"                                                |
+5.1.5  U+DC00 = ed b0 80 = "í°€"                                                |
+5.1.6  U+DF80 = ed be 80 = "í¾€"                                                |
+5.1.7  U+DFFF = ed bf bf = "í¿¿"                                                |
+                                                                              |
+5.2 Paired UTF-16 surrogates                                                  |
+                                                                              |
+5.2.1  U+D800 U+DC00 = ed a0 80 ed b0 80 = "í €í°€"                               |
+5.2.2  U+D800 U+DFFF = ed a0 80 ed bf bf = "í €í¿¿"                               |
+5.2.3  U+DB7F U+DC00 = ed ad bf ed b0 80 = "í¿í°€"                               |
+5.2.4  U+DB7F U+DFFF = ed ad bf ed bf bf = "í¿í¿¿"                               |
+5.2.5  U+DB80 U+DC00 = ed ae 80 ed b0 80 = "í®€í°€"                               |
+5.2.6  U+DB80 U+DFFF = ed ae 80 ed bf bf = "í®€í¿¿"                               |
+5.2.7  U+DBFF U+DC00 = ed af bf ed b0 80 = "í¯¿í°€"                               |
+5.2.8  U+DBFF U+DFFF = ed af bf ed bf bf = "í¯¿í¿¿"                               |
+                                                                              |
+5.3 Noncharacter code positions                                               |
+                                                                              |
+The following "noncharacters" are "reserved for internal use" by              |
+applications, and according to older versions of the Unicode Standard         |
+"should never be interchanged". Unicode Corrigendum #9 dropped the            |
+latter restriction. Nevertheless, their presence in incoming UTF-8 data       |
+can remain a potential security risk, depending on what use is made of        |
+these codes subsequently. Examples of such internal use:                      |
+                                                                              |
+ - Some file APIs with 16-bit characters may use the integer value -1         |
+   = U+FFFF to signal an end-of-file (EOF) or error condition.                |
+                                                                              |
+ - In some UTF-16 receivers, code point U+FFFE might trigger a                |
+   byte-swap operation (to convert between UTF-16LE and UTF-16BE).            |
+                                                                              |
+With such internal use of noncharacters, it may be desirable and safer        |
+to block those code points in UTF-8 decoders, as they should never            |
+occur legitimately in incoming UTF-8 data, and could trigger unsafe           |
+behaviour in subsequent processing.                                           |
+                                                                              |
+Particularly problematic noncharacters in 16-bit applications:                |
+                                                                              |
+5.3.1  U+FFFE = ef bf be = "ï¿¾"                                                |
+5.3.2  U+FFFF = ef bf bf = "ï¿¿"                                                |
+                                                                              |
+Other noncharacters:                                                          |
+                                                                              |
+5.3.3  U+FDD0 .. U+FDEF = "ï·ï·‘ï·’ï·“ï·”ï·•ï·–ï·—ï·˜ï·™ï·šï·›ï·œï·ï·žï·Ÿï· ï·¡ï·¢ï·£ï·¤ï·¥ï·¦ï·§ï·¨ï·©ï·ªï·«ï·¬ï·ï·®ï·¯"|
+                                                                              |
+5.3.4  U+nFFFE U+nFFFF (for n = 1..10)                                        |
+                                                                              |
+       "ðŸ¿¾ðŸ¿¿ð¯¿¾ð¯¿¿ð¿¿¾ð¿¿¿ñ¿¾ñ¿¿ñŸ¿¾ñŸ¿¿ñ¯¿¾ñ¯¿¿ñ¿¿¾ñ¿¿¿ò¿¾ò¿¿                                    |
+        òŸ¿¾òŸ¿¿ò¯¿¾ò¯¿¿ò¿¿¾ò¿¿¿ó¿¾ó¿¿óŸ¿¾óŸ¿¿ó¯¿¾ó¯¿¿ó¿¿¾ó¿¿¿ô¿¾ô¿¿"                                   |
+                                                                              |
+THE END                                                                       |

Added: trunk/tests/testsuite/greekaccents.good
===================================================================
--- trunk/tests/testsuite/greekaccents.good	                        (rev 0)
+++ trunk/tests/testsuite/greekaccents.good	2017-09-11 11:41:08 UTC (rev 3496)
@@ -0,0 +1,7 @@
+Και καθως Μωυσης υψωσεν τον οφιν εν τη ερημω, ουτως υψωθηναι δει τον υιον του ανθρωπου,
+ινα πας ο πιστευων ⸂εν αυτω⸃ ⸆ εχη ζωην αιωνιον.
+ουτως γαρ ηγαπησεν ο θεος τον κοσμον, ωστε τον υιον ⸆ τον μονογενη εδωκεν, ινα πας ο πιστευων εις αυτον μη αποληται αλλ εχη ζωην αιωνιον.
+ου γαρ απεστειλεν ο θεος τον υιον ⸆ εις τον κοσμον ινα κρινη τον κοσμον, αλλ ινα σωθη ο κοσμος δι αυτου.
+ο πιστευων εις αυτον ου κρινεται· ο °δε μη πιστευων ηδη κεκριται, οτι μη πεπιστευκεν εις το ονομα του μονογενους υιου του θεου.
+αυτη δε εστιν η κρισις οτι °το φως εληλυθεν εις τον κοσμον και ⸉ηγαπησαν οι ανθρωποι μαλλον το σκοτος⸊ η το φως· ην γαρ ⸉¹αυτων πονηρα⸊ τα εργα.
+

Added: trunk/tests/testsuite/greekaccents.sh
===================================================================
--- trunk/tests/testsuite/greekaccents.sh	                        (rev 0)
+++ trunk/tests/testsuite/greekaccents.sh	2017-09-11 11:41:08 UTC (rev 3496)
@@ -0,0 +1,8 @@
+#!/bin/sh
+
+# The last parameter to utf8norm is an iteration count, which can be used for
+# speed testing.  With it set to 999999, my results on a Dell Precision 5510 were:
+# real	0m8.952s
+# user	0m8.939s
+# sys	0m0.004s
+../utf8norm -ga 999 < greekaccents.txt


Property changes on: trunk/tests/testsuite/greekaccents.sh
___________________________________________________________________
Added: svn:executable
   + *

Added: trunk/tests/testsuite/greekaccents.txt
===================================================================
--- trunk/tests/testsuite/greekaccents.txt	                        (rev 0)
+++ trunk/tests/testsuite/greekaccents.txt	2017-09-11 11:41:08 UTC (rev 3496)
@@ -0,0 +1,7 @@
+Καὶ καθὼς Μωϋσῆς ὕψωσεν τὸν ὄφιν ἐν τῇ ἐρήμῳ, οὕτως ὑψωθῆναι δεῖ τὸν υἱὸν τοῦ ἀνθρώπου,
+ἵνα πᾶς ὁ πιστεύων ⸂ἐν αὐτῷ⸃ ⸆ ἔχῃ ζωὴν αἰώνιον.
+οὕτως γὰρ ἠγάπησεν ὁ θεὸς τὸν κόσμον, ὥστε τὸν υἱὸν ⸆ τὸν μονογενῆ ἔδωκεν, ἵνα πᾶς ὁ πιστεύων εἰς αὐτὸν μὴ ἀπόληται ἀλλ᾿ ἔχῃ ζωὴν αἰώνιον.
+οὐ γὰρ ἀπέστειλεν ὁ θεὸς τὸν υἱὸν ⸆ εἰς τὸν κόσμον ἵνα κρίνῃ τὸν κόσμον, ἀλλ᾿ ἵνα σωθῇ ὁ κόσμος δι᾿ αὐτοῦ.
+ὁ πιστεύων εἰς αὐτὸν οὐ κρίνεται· ὁ °δὲ μὴ πιστεύων ἤδη κέκριται, ὅτι μὴ πεπίστευκεν εἰς τὸ ὄνομα τοῦ μονογενοῦς υἱοῦ τοῦ θεοῦ.
+αὕτη δέ ἐστιν ἡ κρίσις ὅτι °τὸ φῶς ἐλήλυθεν εἰς τὸν κόσμον καὶ ⸉ἠγάπησαν οἱ ἄνθρωποι μᾶλλον τὸ σκότος⸊ ἢ τὸ φῶς· ἦν γὰρ ⸉¹αὐτῶν πονηρὰ⸊ τὰ ἔργα.
+

Added: trunk/tests/testsuite/utf8basic.good
===================================================================
--- trunk/tests/testsuite/utf8basic.good	                        (rev 0)
+++ trunk/tests/testsuite/utf8basic.good	2017-09-11 11:41:08 UTC (rev 3496)
@@ -0,0 +1,300 @@
+UTF-8 decoder capability and stress test
+----------------------------------------
+
+Markus Kuhn <http://www.cl.cam.ac.uk/~mgk25/> - 2015-08-28 - CC BY 4.0
+
+This test file can help you examine, how your UTF-8 decoder handles
+various types of correct, malformed, or otherwise interesting UTF-8
+sequences. This file is not meant to be a conformance test. It does
+not prescribe any particular outcome. Therefore, there is no way to
+"pass" or "fail" this test file, even though the text does suggest a
+preferable decoder behaviour at some places. Its aim is, instead, to
+help you think about, and test, the behaviour of your UTF-8 decoder on a
+systematic collection of unusual inputs. Experience so far suggests
+that most first-time authors of UTF-8 decoders find at least one
+serious problem in their decoder using this file.
+
+The test lines below cover boundary conditions, malformed UTF-8
+sequences, as well as correctly encoded UTF-8 sequences of Unicode code
+points that should never occur in a correct UTF-8 file.
+
+According to ISO 10646-1:2000, sections D.7 and 2.3c, a device
+receiving UTF-8 shall interpret a "malformed sequence in the same way
+that it interprets a character that is outside the adopted subset" and
+"characters that are not within the adopted subset shall be indicated
+to the user" by a receiving device. One commonly used approach in
+UTF-8 decoders is to replace any malformed UTF-8 sequence by a
+replacement character (U+FFFD), which looks a bit like an inverted
+question mark, or a similar symbol. It might be a good idea to
+visually distinguish a malformed UTF-8 sequence from a correctly
+encoded Unicode character that is just not available in the current
+font but otherwise fully legal, even though ISO 10646-1 doesn't
+mandate this. In any case, just ignoring malformed sequences or
+unavailable characters does not conform to ISO 10646, will make
+debugging more difficult, and can lead to user confusion.
+
+Please check, whether a malformed UTF-8 sequence is (1) represented at
+all, (2) represented by exactly one single replacement character (or
+equivalent signal), and (3) the following quotation mark after an
+illegal UTF-8 sequence is correctly displayed, i.e. proper
+resynchronization takes place immediately after any malformed
+sequence. This file says "THE END" in the last line, so if you don't
+see that, your decoder crashed somehow before, which should always be
+cause for concern.
+
+All lines in this file are exactly 79 characters long (plus the line
+feed). In addition, all lines end with "|", except for the two test
+lines 2.1.1 and 2.2.1, which contain non-printable ASCII controls
+U+0000 and U+007F. If you display this file with a fixed-width font,
+these "|" characters should all line up in column 79 (right margin).
+This allows you to test quickly, whether your UTF-8 decoder finds the
+correct number of characters in every line, that is whether each
+malformed sequences is replaced by a single replacement character.
+
+Note that, as an alternative to the notion of malformed sequence used
+here, it is also a perfectly acceptable (and in some situations even
+preferable) solution to represent each individual byte of a malformed
+sequence with a replacement character. If you follow this strategy in
+your decoder, then please ignore the "|" column.
+
+
+Here come the tests:                                                          |
+                                                                              |
+1  Some correct UTF-8 text                                                    |
+                                                                              |
+You should see the Greek word 'kosme':       "κόσμε"                          |
+                                                                              |
+2  Boundary condition test cases                                              |
+                                                                              |
+2.1  First possible sequence of a certain length                              |
+                                                                              |
+2.1.1  1 byte  (U-00000000):        "^@" // SWORD: removed. we don't support null mid-string, <- that's a literal <caret at>
+2.1.2  2 bytes (U-00000080):        "Â€"                                       |
+2.1.3  3 bytes (U-00000800):        "à €"                                       |
+2.1.4  4 bytes (U-00010000):        "ð€€"                                       |
+2.1.5  5 bytes (U-00200000):        "ï¿½"                                       |
+2.1.6  6 bytes (U-04000000):        "ï¿½"                                       |
+                                                                              |
+2.2  Last possible sequence of a certain length                               |
+                                                                              |
+2.2.1  1 byte  (U-0000007F):        ""                                        
+2.2.2  2 bytes (U-000007FF):        "ß¿"                                       |
+2.2.3  3 bytes (U-0000FFFF):        "ï¿¿"                                       |
+2.2.4  4 bytes (U-001FFFFF):        "ï¿½"                                       |
+2.2.5  5 bytes (U-03FFFFFF):        "ï¿½"                                       |
+2.2.6  6 bytes (U-7FFFFFFF):        "ï¿½"                                       |
+                                                                              |
+2.3  Other boundary conditions                                                |
+                                                                              |
+2.3.1  U-0000D7FF = ed 9f bf = "íŸ¿"                                            |
+2.3.2  U-0000E000 = ee 80 80 = "î€€"                                            |
+2.3.3  U-0000FFFD = ef bf bd = "ï¿½"                                            |
+2.3.4  U-0010FFFF = f4 8f bf bf = "ô¿¿"                                         |
+2.3.5  U-00110000 = f4 90 80 80 = "ï¿½"                                         |
+                                                                              |
+3  Malformed sequences                                                        |
+                                                                              |
+3.1  Unexpected continuation bytes                                            |
+                                                                              |
+Each unexpected continuation byte should be separately signalled as a         |
+malformed sequence of its own.                                                |
+                                                                              |
+3.1.1  First continuation byte 0x80: "ï¿½"                                      |
+3.1.2  Last  continuation byte 0xbf: "ï¿½"                                      |
+                                                                              |
+3.1.3  2 continuation bytes: "ï¿½ï¿½"                                             |
+3.1.4  3 continuation bytes: "ï¿½ï¿½ï¿½"                                            |
+3.1.5  4 continuation bytes: "ï¿½ï¿½ï¿½ï¿½"                                           |
+3.1.6  5 continuation bytes: "ï¿½ï¿½ï¿½ï¿½ï¿½"                                          |
+3.1.7  6 continuation bytes: "ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½"                                         |
+3.1.8  7 continuation bytes: "ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½"                                        |
+                                                                              |
+3.1.9  Sequence of all 64 possible continuation bytes (0x80-0xbf):            |
+                                                                              |
+   "ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½                                                          |
+    ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½                                                          |
+    ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½                                                          |
+    ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½"                                                         |
+                                                                              |
+3.2  Lonely start characters                                                  |
+                                                                              |
+3.2.1  All 32 first bytes of 2-byte sequences (0xc0-0xdf),                    |
+       each followed by a space character:                                    |
+                                                                              |
+   "ï¿½ ï¿½ ï¿½ ï¿½ ï¿½ ï¿½ ï¿½ ï¿½ ï¿½ ï¿½ ï¿½ ï¿½ ï¿½ ï¿½ ï¿½ ï¿½                                           |
+    ï¿½ ï¿½ ï¿½ ï¿½ ï¿½ ï¿½ ï¿½ ï¿½ ï¿½ ï¿½ ï¿½ ï¿½ ï¿½ ï¿½ ï¿½ ï¿½ "                                         |
+                                                                              |
+3.2.2  All 16 first bytes of 3-byte sequences (0xe0-0xef),                    |
+       each followed by a space character:                                    |
+                                                                              |
+   "ï¿½ ï¿½ ï¿½ ï¿½ ï¿½ ï¿½ ï¿½ ï¿½ ï¿½ ï¿½ ï¿½ ï¿½ ï¿½ ï¿½ ï¿½ ï¿½ "                                         |
+                                                                              |
+3.2.3  All 8 first bytes of 4-byte sequences (0xf0-0xf7),                     |
+       each followed by a space character:                                    |
+                                                                              |
+   "ï¿½ ï¿½ ï¿½ ï¿½ ï¿½ ï¿½ ï¿½ ï¿½ "                                                         |
+                                                                              |
+3.2.4  All 4 first bytes of 5-byte sequences (0xf8-0xfb),                     |
+       each followed by a space character:                                    |
+                                                                              |
+   "ï¿½ ï¿½ ï¿½ ï¿½ "                                                                 |
+                                                                              |
+3.2.5  All 2 first bytes of 6-byte sequences (0xfc-0xfd),                     |
+       each followed by a space character:                                    |
+                                                                              |
+   "ï¿½ ï¿½ "                                                                     |
+                                                                              |
+3.3  Sequences with last continuation byte missing                            |
+                                                                              |
+All bytes of an incomplete sequence should be signalled as a single           |
+malformed sequence, i.e., you should see only a single replacement            |
+character in each of the next 10 tests. (Characters as in section 2)          |
+                                                                              |
+3.3.1  2-byte sequence with last byte missing (U+0000):     "ï¿½"               |
+3.3.2  3-byte sequence with last byte missing (U+0000):     "ï¿½"               |
+3.3.3  4-byte sequence with last byte missing (U+0000):     "ï¿½"               |
+3.3.4  5-byte sequence with last byte missing (U+0000):     "ï¿½"               |
+3.3.5  6-byte sequence with last byte missing (U+0000):     "ï¿½"               |
+3.3.6  2-byte sequence with last byte missing (U-000007FF): "ï¿½"               |
+3.3.7  3-byte sequence with last byte missing (U-0000FFFF): "ï¿½"               |
+3.3.8  4-byte sequence with last byte missing (U-001FFFFF): "ï¿½"               |
+3.3.9  5-byte sequence with last byte missing (U-03FFFFFF): "ï¿½"               |
+3.3.10 6-byte sequence with last byte missing (U-7FFFFFFF): "ï¿½"               |
+                                                                              |
+3.4  Concatenation of incomplete sequences                                    |
+                                                                              |
+All the 10 sequences of 3.3 concatenated, you should see 10 malformed         |
+sequences being signalled:                                                    |
+                                                                              |
+   "ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½"                                                               |
+                                                                              |
+3.5  Impossible bytes                                                         |
+                                                                              |
+The following two bytes cannot appear in a correct UTF-8 string               |
+                                                                              |
+3.5.1  fe = "ï¿½"                                                               |
+3.5.2  ff = "ï¿½"                                                               |
+3.5.3  fe fe ff ff = "ï¿½ï¿½ï¿½ï¿½"                                                   |
+                                                                              |
+4  Overlong sequences                                                         |
+                                                                              |
+The following sequences are not malformed according to the letter of          |
+the Unicode 2.0 standard. However, they are longer then necessary and         |
+a correct UTF-8 encoder is not allowed to produce them. A "safe UTF-8         |
+decoder" should reject them just like malformed sequences for two             |
+reasons: (1) It helps to debug applications if overlong sequences are         |
+not treated as valid representations of characters, because this helps        |
+to spot problems more quickly. (2) Overlong sequences provide                 |
+alternative representations of characters, that could maliciously be          |
+used to bypass filters that check only for ASCII characters. For              |
+instance, a 2-byte encoded line feed (LF) would not be caught by a            |
+line counter that counts only 0x0a bytes, but it would still be               |
+processed as a line feed by an unsafe UTF-8 decoder later in the              |
+pipeline. From a security point of view, ASCII compatibility of UTF-8         |
+sequences means also, that ASCII characters are *only* allowed to be          |
+represented by ASCII bytes in the range 0x00-0x7f. To ensure this             |
+aspect of ASCII compatibility, use only "safe UTF-8 decoders" that            |
+reject overlong UTF-8 sequences for which a shorter encoding exists.          |
+                                                                              |
+4.1  Examples of an overlong ASCII character                                  |
+                                                                              |
+With a safe UTF-8 decoder, all of the following five overlong                 |
+representations of the ASCII character slash ("/") should be rejected         |
+like a malformed UTF-8 sequence, for instance by substituting it with         |
+a replacement character. If you see a slash below, you do not have a          |
+safe UTF-8 decoder!                                                           |
+                                                                              |
+4.1.1 U+002F = c0 af             = "ï¿½"                                        |
+4.1.2 U+002F = e0 80 af          = "ï¿½"                                        |
+4.1.3 U+002F = f0 80 80 af       = "ï¿½"                                        |
+4.1.4 U+002F = f8 80 80 80 af    = "ï¿½"                                        |
+4.1.5 U+002F = fc 80 80 80 80 af = "ï¿½"                                        |
+                                                                              |
+4.2  Maximum overlong sequences                                               |
+                                                                              |
+Below you see the highest Unicode value that is still resulting in an         |
+overlong sequence if represented with the given number of bytes. This         |
+is a boundary test for safe UTF-8 decoders. All five characters should        |
+be rejected like malformed UTF-8 sequences.                                   |
+                                                                              |
+4.2.1  U-0000007F = c1 bf             = "ï¿½"                                   |
+4.2.2  U-000007FF = e0 9f bf          = "ï¿½"                                   |
+4.2.3  U-0000FFFF = f0 8f bf bf       = "ï¿½"                                   |
+4.2.4  U-001FFFFF = f8 87 bf bf bf    = "ï¿½"                                   |
+4.2.5  U-03FFFFFF = fc 83 bf bf bf bf = "ï¿½"                                   |
+                                                                              |
+4.3  Overlong representation of the NUL character                             |
+                                                                              |
+The following five sequences should also be rejected like malformed           |
+UTF-8 sequences and should not be treated like the ASCII NUL                  |
+character.                                                                    |
+                                                                              |
+4.3.1  U+0000 = c0 80             = "ï¿½"                                       |
+4.3.2  U+0000 = e0 80 80          = "ï¿½"                                       |
+4.3.3  U+0000 = f0 80 80 80       = "ï¿½"                                       |
+4.3.4  U+0000 = f8 80 80 80 80    = "ï¿½"                                       |
+4.3.5  U+0000 = fc 80 80 80 80 80 = "ï¿½"                                       |
+                                                                              |
+5  Illegal code positions                                                     |
+                                                                              |
+The following UTF-8 sequences should be rejected like malformed               |
+sequences, because they never represent valid ISO 10646 characters and        |
+a UTF-8 decoder that accepts them might introduce security problems           |
+comparable to overlong UTF-8 sequences.                                       |
+                                                                              |
+5.1 Single UTF-16 surrogates                                                  |
+                                                                              |
+5.1.1  U+D800 = ed a0 80 = "í €"                                                |
+5.1.2  U+DB7F = ed ad bf = "í¿"                                                |
+5.1.3  U+DB80 = ed ae 80 = "í®€"                                                |
+5.1.4  U+DBFF = ed af bf = "í¯¿"                                                |
+5.1.5  U+DC00 = ed b0 80 = "í°€"                                                |
+5.1.6  U+DF80 = ed be 80 = "í¾€"                                                |
+5.1.7  U+DFFF = ed bf bf = "í¿¿"                                                |
+                                                                              |
+5.2 Paired UTF-16 surrogates                                                  |
+                                                                              |
+5.2.1  U+D800 U+DC00 = ed a0 80 ed b0 80 = "í €í°€"                               |
+5.2.2  U+D800 U+DFFF = ed a0 80 ed bf bf = "í €í¿¿"                               |
+5.2.3  U+DB7F U+DC00 = ed ad bf ed b0 80 = "í¿í°€"                               |
+5.2.4  U+DB7F U+DFFF = ed ad bf ed bf bf = "í¿í¿¿"                               |
+5.2.5  U+DB80 U+DC00 = ed ae 80 ed b0 80 = "í®€í°€"                               |
+5.2.6  U+DB80 U+DFFF = ed ae 80 ed bf bf = "í®€í¿¿"                               |
+5.2.7  U+DBFF U+DC00 = ed af bf ed b0 80 = "í¯¿í°€"                               |
+5.2.8  U+DBFF U+DFFF = ed af bf ed bf bf = "í¯¿í¿¿"                               |
+                                                                              |
+5.3 Noncharacter code positions                                               |
+                                                                              |
+The following "noncharacters" are "reserved for internal use" by              |
+applications, and according to older versions of the Unicode Standard         |
+"should never be interchanged". Unicode Corrigendum #9 dropped the            |
+latter restriction. Nevertheless, their presence in incoming UTF-8 data       |
+can remain a potential security risk, depending on what use is made of        |
+these codes subsequently. Examples of such internal use:                      |
+                                                                              |
+ - Some file APIs with 16-bit characters may use the integer value -1         |
+   = U+FFFF to signal an end-of-file (EOF) or error condition.                |
+                                                                              |
+ - In some UTF-16 receivers, code point U+FFFE might trigger a                |
+   byte-swap operation (to convert between UTF-16LE and UTF-16BE).            |
+                                                                              |
+With such internal use of noncharacters, it may be desirable and safer        |
+to block those code points in UTF-8 decoders, as they should never            |
+occur legitimately in incoming UTF-8 data, and could trigger unsafe           |
+behaviour in subsequent processing.                                           |
+                                                                              |
+Particularly problematic noncharacters in 16-bit applications:                |
+                                                                              |
+5.3.1  U+FFFE = ef bf be = "ï¿¾"                                                |
+5.3.2  U+FFFF = ef bf bf = "ï¿¿"                                                |
+                                                                              |
+Other noncharacters:                                                          |
+                                                                              |
+5.3.3  U+FDD0 .. U+FDEF = "ï·ï·‘ï·’ï·“ï·”ï·•ï·–ï·—ï·˜ï·™ï·šï·›ï·œï·ï·žï·Ÿï· ï·¡ï·¢ï·£ï·¤ï·¥ï·¦ï·§ï·¨ï·©ï·ªï·«ï·¬ï·ï·®ï·¯"|
+                                                                              |
+5.3.4  U+nFFFE U+nFFFF (for n = 1..10)                                        |
+                                                                              |
+       "ðŸ¿¾ðŸ¿¿ð¯¿¾ð¯¿¿ð¿¿¾ð¿¿¿ñ¿¾ñ¿¿ñŸ¿¾ñŸ¿¿ñ¯¿¾ñ¯¿¿ñ¿¿¾ñ¿¿¿ò¿¾ò¿¿                                    |
+        òŸ¿¾òŸ¿¿ò¯¿¾ò¯¿¿ò¿¿¾ò¿¿¿ó¿¾ó¿¿óŸ¿¾óŸ¿¿ó¯¿¾ó¯¿¿ó¿¿¾ó¿¿¿ô¿¾ô¿¿"                                   |
+                                                                              |
+THE END                                                                       |

Added: trunk/tests/testsuite/utf8basic.sh
===================================================================
--- trunk/tests/testsuite/utf8basic.sh	                        (rev 0)
+++ trunk/tests/testsuite/utf8basic.sh	2017-09-11 11:41:08 UTC (rev 3496)
@@ -0,0 +1,10 @@
+#!/bin/sh
+
+# utf8basic.good originally generated with:
+# uconv --from-code UTF-8 --to-code UTF-8 --from-callback substitute UTF-8-test.txt > utf8basic.good
+# but modified to ignore UTF-16 surrogates, which are apparently illegal.  We return multiple replacement
+# characters there, but the spec apparently says we are only supposed to return 1 per UTF-16 surrogate.
+# There are comments in the spec about "security vulnerability", but we always check if we're at the
+# end of our buffer before continuing processing each byte (shouldn't all decoders do this?), so there
+# shouldn't be a problem.  Ignoring the UTF-16 non-conformance for now.
+../utf8norm < UTF-8-test.txt


Property changes on: trunk/tests/testsuite/utf8basic.sh
___________________________________________________________________
Added: svn:executable
   + *