[sword-svn] r3496 - in trunk: src/modules/filters tests/testsuite
scribe at crosswire.org
scribe at crosswire.org
Mon Sep 11 04:41:08 MST 2017
Author: scribe
Date: 2017-09-11 04:41:08 -0700 (Mon, 11 Sep 2017)
New Revision: 3496
Added:
trunk/tests/testsuite/README
trunk/tests/testsuite/UTF-8-test.txt
trunk/tests/testsuite/greekaccents.good
trunk/tests/testsuite/greekaccents.sh
trunk/tests/testsuite/greekaccents.txt
trunk/tests/testsuite/utf8basic.good
trunk/tests/testsuite/utf8basic.sh
Modified:
trunk/src/modules/filters/utf8greekaccents.cpp
Log:
updated Greek Accents filter to be more sane and safe. Added tests for UTF8 and Greek accents
Modified: trunk/src/modules/filters/utf8greekaccents.cpp
===================================================================
--- trunk/src/modules/filters/utf8greekaccents.cpp 2017-09-11 11:40:19 UTC (rev 3495)
+++ trunk/src/modules/filters/utf8greekaccents.cpp 2017-09-11 11:41:08 UTC (rev 3496)
@@ -22,8 +22,10 @@
*/
#include <stdlib.h>
+#include <map>
#include <stdio.h>
#include <utf8greekaccents.h>
+#include <utilstr.h>
#ifdef _ICU_
@@ -31,6 +33,7 @@
sword::UTF8NFKD decompose;
#endif
+using std::map;
SWORD_NAMESPACE_START
@@ -44,6 +47,297 @@
static const StringList oVals(&choices[0], &choices[2]);
return &oVals;
}
+
+ std::map<__u32, SWBuf> converters; // Unicode code point -> replacement text; entries set to "" below are the characters to be removed
+ class converters_init { // helper whose constructor fills the 'converters' table
+ public:
+ converters_init() {
+ SWBuf myBuf = ""; // scratch buffer handed to getUTF8FromUniChar; emptied with setSize(0) after every use
+ // first, map combining marks and a few standalone accent/punctuation characters to "" (i.e. remove them)
+ converters[0x2019] = ""; // RIGHT SINGLE QUOTATION MARK
+ converters[0x1FBF] = ""; // GREEK PSILI
+ converters[0x2CFF] = ""; // COPTIC MORPHOLOGICAL DIVIDER
+ converters[0xFE24] = ""; // COMBINING MACRON LEFT HALF
+ converters[0xFE25] = ""; // COMBINING MACRON RIGHT HALF
+ converters[0xFE26] = ""; // COMBINING CONJOINING MACRON
+ converters[0x0300] = ""; // COMBINING GRAVE ACCENT
+ converters[0x0301] = ""; // COMBINING ACUTE ACCENT
+ converters[0x0302] = ""; // COMBINING CIRCUMFLEX ACCENT
+ converters[0x0308] = ""; // COMBINING DIAERESIS
+ converters[0x0313] = ""; // COMBINING COMMA ABOVE
+ converters[0x0314] = ""; // COMBINING REVERSED COMMA ABOVE
+ converters[0x037A] = ""; // GREEK YPOGEGRAMMENI
+ converters[0x0342] = ""; // COMBINING GREEK PERISPOMENI
+ // Now convert pre-composed characters to their alphabetic bases, discarding the accents
+ // Greek
+ // UPPER case
+ converters[0x0386] = *getUTF8FromUniChar(0x0391, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER ALPHA WITH TONOS
+ converters[0x0388] = *getUTF8FromUniChar(0x0395, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER EPSILON WITH TONOS
+ converters[0x0389] = *getUTF8FromUniChar(0x0397, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER ETA WITH TONOS
+ converters[0x038A] = *getUTF8FromUniChar(0x0399, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER IOTA WITH TONOS
+ converters[0x03AA] = *getUTF8FromUniChar(0x0399, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER IOTA WITH DIALYTIKA
+ converters[0x038C] = *getUTF8FromUniChar(0x039F, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER OMICRON WITH TONOS
+ converters[0x038E] = *getUTF8FromUniChar(0x03A5, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER UPSILON WITH TONOS
+ converters[0x03AB] = *getUTF8FromUniChar(0x03A5, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER UPSILON WITH DIALYTIKA
+ converters[0x038F] = *getUTF8FromUniChar(0x03A9, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER OMEGA WITH TONOS
+
+ // lower case
+ converters[0x03AC] = *getUTF8FromUniChar(0x03B1, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER ALPHA WITH TONOS
+ converters[0x03AD] = *getUTF8FromUniChar(0x03B5, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER EPSILON WITH TONOS
+ converters[0x03AE] = *getUTF8FromUniChar(0x03B7, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER ETA WITH TONOS
+ converters[0x03AF] = *getUTF8FromUniChar(0x03B9, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER IOTA WITH TONOS
+ converters[0x03CA] = *getUTF8FromUniChar(0x03B9, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER IOTA WITH DIALYTIKA
+ converters[0x03CC] = *getUTF8FromUniChar(0x03BF, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER OMICRON WITH TONOS
+ converters[0x03CD] = *getUTF8FromUniChar(0x03C5, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER UPSILON WITH TONOS
+ converters[0x03CB] = *getUTF8FromUniChar(0x03C5, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER UPSILON WITH DIALYTIKA
+ converters[0x03CE] = *getUTF8FromUniChar(0x03C9, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER OMEGA WITH TONOS
+
+ // Extended Greek
+ // UPPER case
+ converters[0x1F08] = *getUTF8FromUniChar(0x0391, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER ALPHA WITH PSILI
+ converters[0x1F09] = *getUTF8FromUniChar(0x0391, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER ALPHA WITH DASIA
+ converters[0x1F0A] = *getUTF8FromUniChar(0x0391, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER ALPHA WITH PSILI AND VARIA
+ converters[0x1F0B] = *getUTF8FromUniChar(0x0391, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER ALPHA WITH DASIA AND VARIA
+ converters[0x1F0C] = *getUTF8FromUniChar(0x0391, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER ALPHA WITH PSILI AND OXIA
+ converters[0x1F0D] = *getUTF8FromUniChar(0x0391, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER ALPHA WITH DASIA AND OXIA
+ converters[0x1F0E] = *getUTF8FromUniChar(0x0391, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER ALPHA WITH PSILI AND PERISPOMENI
+ converters[0x1F0F] = *getUTF8FromUniChar(0x0391, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI
+ converters[0x1F88] = *getUTF8FromUniChar(0x0391, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER ALPHA WITH PSILI AND PROSGEGRAMMENI
+ converters[0x1F89] = *getUTF8FromUniChar(0x0391, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER ALPHA WITH DASIA AND PROSGEGRAMMENI
+ converters[0x1F8A] = *getUTF8FromUniChar(0x0391, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER ALPHA WITH PSILI AND VARIA AND PROSGEGRAMMENI
+ converters[0x1F8B] = *getUTF8FromUniChar(0x0391, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER ALPHA WITH DASIA AND VARIA AND PROSGEGRAMMENI
+ converters[0x1F8C] = *getUTF8FromUniChar(0x0391, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER ALPHA WITH PSILI AND OXIA AND PROSGEGRAMMENI
+ converters[0x1F8D] = *getUTF8FromUniChar(0x0391, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER ALPHA WITH DASIA AND OXIA AND PROSGEGRAMMENI
+ converters[0x1F8E] = *getUTF8FromUniChar(0x0391, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER ALPHA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI
+ converters[0x1F8F] = *getUTF8FromUniChar(0x0391, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI
+ converters[0x1FB8] = *getUTF8FromUniChar(0x0391, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER ALPHA WITH VRACHY
+ converters[0x1FB9] = *getUTF8FromUniChar(0x0391, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER ALPHA WITH MACRON
+ converters[0x1FBA] = *getUTF8FromUniChar(0x0391, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER ALPHA WITH VARIA
+ converters[0x1FBB] = *getUTF8FromUniChar(0x0391, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER ALPHA WITH OXIA
+ converters[0x1FBC] = *getUTF8FromUniChar(0x0391, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI
+
+ converters[0x1F18] = *getUTF8FromUniChar(0x0395, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER EPSILON WITH PSILI
+ converters[0x1F19] = *getUTF8FromUniChar(0x0395, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER EPSILON WITH DASIA
+ converters[0x1F1A] = *getUTF8FromUniChar(0x0395, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER EPSILON WITH PSILI AND VARIA
+ converters[0x1F1B] = *getUTF8FromUniChar(0x0395, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER EPSILON WITH DASIA AND VARIA
+ converters[0x1F1C] = *getUTF8FromUniChar(0x0395, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER EPSILON WITH PSILI AND OXIA
+ converters[0x1F1D] = *getUTF8FromUniChar(0x0395, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER EPSILON WITH DASIA AND OXIA
+ converters[0x1FC8] = *getUTF8FromUniChar(0x0395, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER EPSILON WITH VARIA
+ converters[0x1FC9] = *getUTF8FromUniChar(0x0395, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER EPSILON WITH OXIA
+
+ converters[0x1F28] = *getUTF8FromUniChar(0x0397, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER ETA WITH PSILI
+ converters[0x1F29] = *getUTF8FromUniChar(0x0397, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER ETA WITH DASIA
+ converters[0x1F2A] = *getUTF8FromUniChar(0x0397, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER ETA WITH PSILI AND VARIA
+ converters[0x1F2B] = *getUTF8FromUniChar(0x0397, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER ETA WITH DASIA AND VARIA
+ converters[0x1F2C] = *getUTF8FromUniChar(0x0397, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER ETA WITH PSILI AND OXIA
+ converters[0x1F2D] = *getUTF8FromUniChar(0x0397, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER ETA WITH DASIA AND OXIA
+ converters[0x1F2E] = *getUTF8FromUniChar(0x0397, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER ETA WITH PSILI AND PERISPOMENI
+ converters[0x1F2F] = *getUTF8FromUniChar(0x0397, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER ETA WITH DASIA AND PERISPOMENI
+ converters[0x1F98] = *getUTF8FromUniChar(0x0397, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER ETA WITH PSILI AND PROSGEGRAMMENI
+ converters[0x1F99] = *getUTF8FromUniChar(0x0397, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER ETA WITH DASIA AND PROSGEGRAMMENI
+ converters[0x1F9A] = *getUTF8FromUniChar(0x0397, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER ETA WITH PSILI AND VARIA AND PROSGEGRAMMENI
+ converters[0x1F9B] = *getUTF8FromUniChar(0x0397, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER ETA WITH DASIA AND VARIA AND PROSGEGRAMMENI
+ converters[0x1F9C] = *getUTF8FromUniChar(0x0397, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER ETA WITH PSILI AND OXIA AND PROSGEGRAMMENI
+ converters[0x1F9D] = *getUTF8FromUniChar(0x0397, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER ETA WITH DASIA AND OXIA AND PROSGEGRAMMENI
+ converters[0x1F9E] = *getUTF8FromUniChar(0x0397, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER ETA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI
+ converters[0x1F9F] = *getUTF8FromUniChar(0x0397, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER ETA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI
+ converters[0x1FCA] = *getUTF8FromUniChar(0x0397, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER ETA WITH VARIA
+ converters[0x1FCB] = *getUTF8FromUniChar(0x0397, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER ETA WITH OXIA
+ converters[0x1FCC] = *getUTF8FromUniChar(0x0397, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI
+
+ converters[0x1F38] = *getUTF8FromUniChar(0x0399, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER IOTA WITH PSILI
+ converters[0x1F39] = *getUTF8FromUniChar(0x0399, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER IOTA WITH DASIA
+ converters[0x1F3A] = *getUTF8FromUniChar(0x0399, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER IOTA WITH PSILI AND VARIA
+ converters[0x1F3B] = *getUTF8FromUniChar(0x0399, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER IOTA WITH DASIA AND VARIA
+ converters[0x1F3C] = *getUTF8FromUniChar(0x0399, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER IOTA WITH PSILI AND OXIA
+ converters[0x1F3D] = *getUTF8FromUniChar(0x0399, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER IOTA WITH DASIA AND OXIA
+ converters[0x1F3E] = *getUTF8FromUniChar(0x0399, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER IOTA WITH PSILI AND PERISPOMENI
+ converters[0x1F3F] = *getUTF8FromUniChar(0x0399, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER IOTA WITH DASIA AND PERISPOMENI
+ converters[0x1FD8] = *getUTF8FromUniChar(0x0399, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER IOTA WITH VRACHY
+ converters[0x1FD9] = *getUTF8FromUniChar(0x0399, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER IOTA WITH MACRON
+ converters[0x1FDA] = *getUTF8FromUniChar(0x0399, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER IOTA WITH VARIA
+ converters[0x1FDB] = *getUTF8FromUniChar(0x0399, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER IOTA WITH OXIA
+
+ converters[0x1F48] = *getUTF8FromUniChar(0x039F, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER OMICRON WITH PSILI
+ converters[0x1F49] = *getUTF8FromUniChar(0x039F, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER OMICRON WITH DASIA
+ converters[0x1F4A] = *getUTF8FromUniChar(0x039F, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER OMICRON WITH PSILI AND VARIA
+ converters[0x1F4B] = *getUTF8FromUniChar(0x039F, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER OMICRON WITH DASIA AND VARIA
+ converters[0x1F4C] = *getUTF8FromUniChar(0x039F, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER OMICRON WITH PSILI AND OXIA
+ converters[0x1F4D] = *getUTF8FromUniChar(0x039F, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER OMICRON WITH DASIA AND OXIA
+ converters[0x1FF8] = *getUTF8FromUniChar(0x039F, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER OMICRON WITH VARIA
+ converters[0x1FF9] = *getUTF8FromUniChar(0x039F, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER OMICRON WITH OXIA
+
+ converters[0x1F59] = *getUTF8FromUniChar(0x03A5, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER UPSILON WITH DASIA
+ converters[0x1F5A] = *getUTF8FromUniChar(0x03A5, &myBuf); myBuf.setSize(0); // NOTE(review): U+1F5A appears to be unassigned in Unicode (no CAPITAL UPSILON WITH PSILI AND VARIA) -- confirm against the code chart
+ converters[0x1F5B] = *getUTF8FromUniChar(0x03A5, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER UPSILON WITH DASIA AND VARIA
+ converters[0x1F5C] = *getUTF8FromUniChar(0x03A5, &myBuf); myBuf.setSize(0); // NOTE(review): U+1F5C appears to be unassigned in Unicode (no CAPITAL UPSILON WITH PSILI AND OXIA) -- confirm against the code chart
+ converters[0x1F5D] = *getUTF8FromUniChar(0x03A5, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER UPSILON WITH DASIA AND OXIA
+ converters[0x1F5E] = *getUTF8FromUniChar(0x03A5, &myBuf); myBuf.setSize(0); // NOTE(review): U+1F5E appears to be unassigned in Unicode (no CAPITAL UPSILON WITH PSILI AND PERISPOMENI) -- confirm against the code chart
+ converters[0x1F5F] = *getUTF8FromUniChar(0x03A5, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER UPSILON WITH DASIA AND PERISPOMENI
+ converters[0x1FE8] = *getUTF8FromUniChar(0x03A5, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER UPSILON WITH VRACHY
+ converters[0x1FE9] = *getUTF8FromUniChar(0x03A5, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER UPSILON WITH MACRON
+ converters[0x1FEA] = *getUTF8FromUniChar(0x03A5, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER UPSILON WITH VARIA
+ converters[0x1FEB] = *getUTF8FromUniChar(0x03A5, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER UPSILON WITH OXIA
+
+ converters[0x1F68] = *getUTF8FromUniChar(0x03A9, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER OMEGA WITH PSILI
+ converters[0x1F69] = *getUTF8FromUniChar(0x03A9, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER OMEGA WITH DASIA
+ converters[0x1F6A] = *getUTF8FromUniChar(0x03A9, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER OMEGA WITH PSILI AND VARIA
+ converters[0x1F6B] = *getUTF8FromUniChar(0x03A9, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER OMEGA WITH DASIA AND VARIA
+ converters[0x1F6C] = *getUTF8FromUniChar(0x03A9, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER OMEGA WITH PSILI AND OXIA
+ converters[0x1F6D] = *getUTF8FromUniChar(0x03A9, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER OMEGA WITH DASIA AND OXIA
+ converters[0x1F6E] = *getUTF8FromUniChar(0x03A9, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER OMEGA WITH PSILI AND PERISPOMENI
+ converters[0x1F6F] = *getUTF8FromUniChar(0x03A9, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI
+ converters[0x1FA8] = *getUTF8FromUniChar(0x03A9, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER OMEGA WITH PSILI AND PROSGEGRAMMENI
+ converters[0x1FA9] = *getUTF8FromUniChar(0x03A9, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER OMEGA WITH DASIA AND PROSGEGRAMMENI
+ converters[0x1FAA] = *getUTF8FromUniChar(0x03A9, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER OMEGA WITH PSILI AND VARIA AND PROSGEGRAMMENI
+ converters[0x1FAB] = *getUTF8FromUniChar(0x03A9, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER OMEGA WITH DASIA AND VARIA AND PROSGEGRAMMENI
+ converters[0x1FAC] = *getUTF8FromUniChar(0x03A9, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER OMEGA WITH PSILI AND OXIA AND PROSGEGRAMMENI
+ converters[0x1FAD] = *getUTF8FromUniChar(0x03A9, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER OMEGA WITH DASIA AND OXIA AND PROSGEGRAMMENI
+ converters[0x1FAE] = *getUTF8FromUniChar(0x03A9, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER OMEGA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI
+ converters[0x1FAF] = *getUTF8FromUniChar(0x03A9, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI
+ converters[0x1FFA] = *getUTF8FromUniChar(0x03A9, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER OMEGA WITH VARIA
+ converters[0x1FFB] = *getUTF8FromUniChar(0x03A9, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER OMEGA WITH OXIA
+ converters[0x1FFC] = *getUTF8FromUniChar(0x03A9, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI
+
+ converters[0x1FEC] = *getUTF8FromUniChar(0x03A1, &myBuf); myBuf.setSize(0); // GREEK CAPITAL LETTER RHO WITH DASIA
+
+ // lower case
+ //alpha
+ converters[0x1F00] = *getUTF8FromUniChar(0x03B1, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER ALPHA WITH PSILI
+ converters[0x1F01] = *getUTF8FromUniChar(0x03B1, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER ALPHA WITH DASIA
+ converters[0x1F02] = *getUTF8FromUniChar(0x03B1, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER ALPHA WITH PSILI AND VARIA
+ converters[0x1F03] = *getUTF8FromUniChar(0x03B1, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER ALPHA WITH DASIA AND VARIA
+ converters[0x1F04] = *getUTF8FromUniChar(0x03B1, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER ALPHA WITH PSILI AND OXIA
+ converters[0x1F05] = *getUTF8FromUniChar(0x03B1, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER ALPHA WITH DASIA AND OXIA
+ converters[0x1F06] = *getUTF8FromUniChar(0x03B1, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER ALPHA WITH PSILI AND PERISPOMENI
+ converters[0x1F07] = *getUTF8FromUniChar(0x03B1, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER ALPHA WITH DASIA AND PERISPOMENI
+ converters[0x1F80] = *getUTF8FromUniChar(0x03B1, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI
+ converters[0x1F81] = *getUTF8FromUniChar(0x03B1, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER ALPHA WITH DASIA AND YPOGEGRAMMENI
+ converters[0x1F82] = *getUTF8FromUniChar(0x03B1, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER ALPHA WITH PSILI AND VARIA AND YPOGEGRAMMENI
+ converters[0x1F83] = *getUTF8FromUniChar(0x03B1, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER ALPHA WITH DASIA AND VARIA AND YPOGEGRAMMENI
+ converters[0x1F84] = *getUTF8FromUniChar(0x03B1, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER ALPHA WITH PSILI AND OXIA AND YPOGEGRAMMENI
+ converters[0x1F85] = *getUTF8FromUniChar(0x03B1, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER ALPHA WITH DASIA AND OXIA AND YPOGEGRAMMENI
+ converters[0x1F86] = *getUTF8FromUniChar(0x03B1, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER ALPHA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI
+ converters[0x1F87] = *getUTF8FromUniChar(0x03B1, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER ALPHA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI
+ converters[0x1F70] = *getUTF8FromUniChar(0x03B1, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER ALPHA WITH VARIA
+ converters[0x1F71] = *getUTF8FromUniChar(0x03B1, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER ALPHA WITH OXIA
+ converters[0x1FB0] = *getUTF8FromUniChar(0x03B1, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER ALPHA WITH VRACHY
+ converters[0x1FB1] = *getUTF8FromUniChar(0x03B1, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER ALPHA WITH MACRON
+ converters[0x1FB2] = *getUTF8FromUniChar(0x03B1, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER ALPHA WITH VARIA AND YPOGEGRAMMENI
+ converters[0x1FB3] = *getUTF8FromUniChar(0x03B1, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER ALPHA WITH YPOGEGRAMMENI
+ converters[0x1FB4] = *getUTF8FromUniChar(0x03B1, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER ALPHA WITH OXIA AND YPOGEGRAMMENI
+ converters[0x1FB5] = *getUTF8FromUniChar(0x03B1, &myBuf); myBuf.setSize(0); // unassigned code point in the Greek Extended block (gap between U+1FB4 and U+1FB6)
+ converters[0x1FB6] = *getUTF8FromUniChar(0x03B1, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER ALPHA WITH PERISPOMENI
+ converters[0x1FB7] = *getUTF8FromUniChar(0x03B1, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI
+
+ converters[0x1F10] = *getUTF8FromUniChar(0x03B5, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER EPSILON WITH PSILI
+ converters[0x1F11] = *getUTF8FromUniChar(0x03B5, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER EPSILON WITH DASIA
+ converters[0x1F12] = *getUTF8FromUniChar(0x03B5, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER EPSILON WITH PSILI AND VARIA
+ converters[0x1F13] = *getUTF8FromUniChar(0x03B5, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER EPSILON WITH DASIA AND VARIA
+ converters[0x1F14] = *getUTF8FromUniChar(0x03B5, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER EPSILON WITH PSILI AND OXIA
+ converters[0x1F15] = *getUTF8FromUniChar(0x03B5, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER EPSILON WITH DASIA AND OXIA
+ converters[0x1F72] = *getUTF8FromUniChar(0x03B5, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER EPSILON WITH VARIA
+ converters[0x1F73] = *getUTF8FromUniChar(0x03B5, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER EPSILON WITH OXIA
+
+ converters[0x1F90] = *getUTF8FromUniChar(0x03B7, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER ETA WITH PSILI AND YPOGEGRAMMENI
+ converters[0x1F91] = *getUTF8FromUniChar(0x03B7, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER ETA WITH DASIA AND YPOGEGRAMMENI
+ converters[0x1F92] = *getUTF8FromUniChar(0x03B7, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER ETA WITH PSILI AND VARIA AND YPOGEGRAMMENI
+ converters[0x1F93] = *getUTF8FromUniChar(0x03B7, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER ETA WITH DASIA AND VARIA AND YPOGEGRAMMENI
+ converters[0x1F94] = *getUTF8FromUniChar(0x03B7, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER ETA WITH PSILI AND OXIA AND YPOGEGRAMMENI
+ converters[0x1F95] = *getUTF8FromUniChar(0x03B7, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER ETA WITH DASIA AND OXIA AND YPOGEGRAMMENI
+ converters[0x1F96] = *getUTF8FromUniChar(0x03B7, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER ETA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI
+ converters[0x1F97] = *getUTF8FromUniChar(0x03B7, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER ETA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI
+ converters[0x1F20] = *getUTF8FromUniChar(0x03B7, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER ETA WITH PSILI
+ converters[0x1F21] = *getUTF8FromUniChar(0x03B7, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER ETA WITH DASIA
+ converters[0x1F22] = *getUTF8FromUniChar(0x03B7, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER ETA WITH PSILI AND VARIA
+ converters[0x1F23] = *getUTF8FromUniChar(0x03B7, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER ETA WITH DASIA AND VARIA
+ converters[0x1F24] = *getUTF8FromUniChar(0x03B7, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER ETA WITH PSILI AND OXIA
+ converters[0x1F25] = *getUTF8FromUniChar(0x03B7, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER ETA WITH DASIA AND OXIA
+ converters[0x1F26] = *getUTF8FromUniChar(0x03B7, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER ETA WITH PSILI AND PERISPOMENI
+ converters[0x1F27] = *getUTF8FromUniChar(0x03B7, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER ETA WITH DASIA AND PERISPOMENI
+ converters[0x1FC2] = *getUTF8FromUniChar(0x03B7, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER ETA WITH VARIA AND YPOGEGRAMMENI
+ converters[0x1FC3] = *getUTF8FromUniChar(0x03B7, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER ETA WITH YPOGEGRAMMENI
+ converters[0x1FC4] = *getUTF8FromUniChar(0x03B7, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER ETA WITH OXIA AND YPOGEGRAMMENI
+ converters[0x1FC5] = *getUTF8FromUniChar(0x03B7, &myBuf); myBuf.setSize(0); // unassigned code point in the Greek Extended block (gap between U+1FC4 and U+1FC6)
+ converters[0x1FC6] = *getUTF8FromUniChar(0x03B7, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER ETA WITH PERISPOMENI
+ converters[0x1FC7] = *getUTF8FromUniChar(0x03B7, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI
+ converters[0x1F74] = *getUTF8FromUniChar(0x03B7, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER ETA WITH VARIA
+ converters[0x1F75] = *getUTF8FromUniChar(0x03B7, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER ETA WITH OXIA
+
+ converters[0x1F30] = *getUTF8FromUniChar(0x03B9, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER IOTA WITH PSILI
+ converters[0x1F31] = *getUTF8FromUniChar(0x03B9, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER IOTA WITH DASIA
+ converters[0x1F32] = *getUTF8FromUniChar(0x03B9, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER IOTA WITH PSILI AND VARIA
+ converters[0x1F33] = *getUTF8FromUniChar(0x03B9, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER IOTA WITH DASIA AND VARIA
+ converters[0x1F34] = *getUTF8FromUniChar(0x03B9, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER IOTA WITH PSILI AND OXIA
+ converters[0x1F35] = *getUTF8FromUniChar(0x03B9, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER IOTA WITH DASIA AND OXIA
+ converters[0x1F36] = *getUTF8FromUniChar(0x03B9, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER IOTA WITH PSILI AND PERISPOMENI
+ converters[0x1F37] = *getUTF8FromUniChar(0x03B9, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER IOTA WITH DASIA AND PERISPOMENI
+ converters[0x1F76] = *getUTF8FromUniChar(0x03B9, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER IOTA WITH VARIA
+ converters[0x1F77] = *getUTF8FromUniChar(0x03B9, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER IOTA WITH OXIA
+ converters[0x1FD0] = *getUTF8FromUniChar(0x03B9, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER IOTA WITH VRACHY
+ converters[0x1FD1] = *getUTF8FromUniChar(0x03B9, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER IOTA WITH MACRON
+ converters[0x1FD2] = *getUTF8FromUniChar(0x03B9, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER IOTA WITH DIALYTIKA AND VARIA
+ converters[0x1FD3] = *getUTF8FromUniChar(0x03B9, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
+ converters[0x1FD4] = *getUTF8FromUniChar(0x03B9, &myBuf); myBuf.setSize(0); // unassigned code point in the Greek Extended block (gap between U+1FD3 and U+1FD6)
+ converters[0x1FD5] = *getUTF8FromUniChar(0x03B9, &myBuf); myBuf.setSize(0); // unassigned code point in the Greek Extended block (gap between U+1FD3 and U+1FD6)
+ converters[0x1FD6] = *getUTF8FromUniChar(0x03B9, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER IOTA WITH PERISPOMENI
+ converters[0x1FD7] = *getUTF8FromUniChar(0x03B9, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER IOTA WITH DIALYTIKA AND PERISPOMENI
+
+ converters[0x1F40] = *getUTF8FromUniChar(0x03BF, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER OMICRON WITH PSILI
+ converters[0x1F41] = *getUTF8FromUniChar(0x03BF, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER OMICRON WITH DASIA
+ converters[0x1F42] = *getUTF8FromUniChar(0x03BF, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER OMICRON WITH PSILI AND VARIA
+ converters[0x1F43] = *getUTF8FromUniChar(0x03BF, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER OMICRON WITH DASIA AND VARIA
+ converters[0x1F44] = *getUTF8FromUniChar(0x03BF, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER OMICRON WITH PSILI AND OXIA
+ converters[0x1F45] = *getUTF8FromUniChar(0x03BF, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER OMICRON WITH DASIA AND OXIA
+ converters[0x1F78] = *getUTF8FromUniChar(0x03BF, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER OMICRON WITH VARIA
+ converters[0x1F79] = *getUTF8FromUniChar(0x03BF, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER OMICRON WITH OXIA
+
+ converters[0x1F50] = *getUTF8FromUniChar(0x03C5, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER UPSILON WITH PSILI
+ converters[0x1F51] = *getUTF8FromUniChar(0x03C5, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER UPSILON WITH DASIA
+ converters[0x1F52] = *getUTF8FromUniChar(0x03C5, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER UPSILON WITH PSILI AND VARIA
+ converters[0x1F53] = *getUTF8FromUniChar(0x03C5, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER UPSILON WITH DASIA AND VARIA
+ converters[0x1F54] = *getUTF8FromUniChar(0x03C5, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER UPSILON WITH PSILI AND OXIA
+ converters[0x1F55] = *getUTF8FromUniChar(0x03C5, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER UPSILON WITH DASIA AND OXIA
+ converters[0x1F56] = *getUTF8FromUniChar(0x03C5, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER UPSILON WITH PSILI AND PERISPOMENI
+ converters[0x1F57] = *getUTF8FromUniChar(0x03C5, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER UPSILON WITH DASIA AND PERISPOMENI
+ converters[0x1F7A] = *getUTF8FromUniChar(0x03C5, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER UPSILON WITH VARIA
+ converters[0x1F7B] = *getUTF8FromUniChar(0x03C5, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER UPSILON WITH OXIA
+ converters[0x1FE0] = *getUTF8FromUniChar(0x03C5, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER UPSILON WITH VRACHY
+ converters[0x1FE1] = *getUTF8FromUniChar(0x03C5, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER UPSILON WITH MACRON
+ converters[0x1FE2] = *getUTF8FromUniChar(0x03C5, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND VARIA
+ converters[0x1FE3] = *getUTF8FromUniChar(0x03C5, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
+ converters[0x1FE6] = *getUTF8FromUniChar(0x03C5, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER UPSILON WITH PERISPOMENI
+ converters[0x1FE7] = *getUTF8FromUniChar(0x03C5, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND PERISPOMENI
+
+ converters[0x1F60] = *getUTF8FromUniChar(0x03C9, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER OMEGA WITH PSILI
+ converters[0x1F61] = *getUTF8FromUniChar(0x03C9, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER OMEGA WITH DASIA
+ converters[0x1F62] = *getUTF8FromUniChar(0x03C9, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER OMEGA WITH PSILI AND VARIA
+ converters[0x1F63] = *getUTF8FromUniChar(0x03C9, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER OMEGA WITH DASIA AND VARIA
+ converters[0x1F64] = *getUTF8FromUniChar(0x03C9, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER OMEGA WITH PSILI AND OXIA
+ converters[0x1F65] = *getUTF8FromUniChar(0x03C9, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER OMEGA WITH DASIA AND OXIA
+ converters[0x1F66] = *getUTF8FromUniChar(0x03C9, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER OMEGA WITH PSILI AND PERISPOMENI
+ converters[0x1F67] = *getUTF8FromUniChar(0x03C9, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER OMEGA WITH DASIA AND PERISPOMENI
+ converters[0x1F7C] = *getUTF8FromUniChar(0x03C9, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER OMEGA WITH VARIA
+ converters[0x1F7D] = *getUTF8FromUniChar(0x03C9, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER OMEGA WITH OXIA
+ converters[0x1FA0] = *getUTF8FromUniChar(0x03C9, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER OMEGA WITH PSILI AND YPOGEGRAMMENI
+ converters[0x1FA1] = *getUTF8FromUniChar(0x03C9, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER OMEGA WITH DASIA AND YPOGEGRAMMENI
+ converters[0x1FA2] = *getUTF8FromUniChar(0x03C9, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER OMEGA WITH PSILI AND VARIA AND YPOGEGRAMMENI
+ converters[0x1FA3] = *getUTF8FromUniChar(0x03C9, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER OMEGA WITH DASIA AND VARIA AND YPOGEGRAMMENI
+ converters[0x1FA4] = *getUTF8FromUniChar(0x03C9, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER OMEGA WITH PSILI AND OXIA AND YPOGEGRAMMENI
+ converters[0x1FA5] = *getUTF8FromUniChar(0x03C9, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER OMEGA WITH DASIA AND OXIA AND YPOGEGRAMMENI
+ converters[0x1FA6] = *getUTF8FromUniChar(0x03C9, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER OMEGA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI
+ converters[0x1FA7] = *getUTF8FromUniChar(0x03C9, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER OMEGA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI
+ converters[0x1FF2] = *getUTF8FromUniChar(0x03C9, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER OMEGA WITH VARIA AND YPOGEGRAMMENI
+ converters[0x1FF3] = *getUTF8FromUniChar(0x03C9, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER OMEGA WITH YPOGEGRAMMENI
+ converters[0x1FF4] = *getUTF8FromUniChar(0x03C9, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER OMEGA WITH OXIA AND YPOGEGRAMMENI
+ converters[0x1FF5] = *getUTF8FromUniChar(0x03C9, &myBuf); myBuf.setSize(0); // unassigned code point in the Greek Extended block (gap between U+1FF4 and U+1FF6)
+ converters[0x1FF6] = *getUTF8FromUniChar(0x03C9, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER OMEGA WITH PERISPOMENI
+ converters[0x1FF7] = *getUTF8FromUniChar(0x03C9, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI
+
+ converters[0x1FE4] = *getUTF8FromUniChar(0x03C1, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER RHO WITH PSILI
+ converters[0x1FE5] = *getUTF8FromUniChar(0x03C1, &myBuf); myBuf.setSize(0); // GREEK SMALL LETTER RHO WITH DASIA
+ }
+ } __converters_init; // the single instance; constructing it runs the constructor above and fills 'converters'
}
@@ -57,229 +351,24 @@
char UTF8GreekAccents::processText(SWBuf &text, const SWKey *key, const SWModule *module) {
if (!option) { //we don't want greek accents
- //unsigned char *to, *from;
- //to = (unsigned char*)text;
- //for (from = (unsigned char*)text; *from; from++) {
-#ifdef _ICU_
- decompose.processText(text, (SWKey *)2); // note the hack of 2 to mimic a real key. TODO: remove all hacks
-#endif
-
SWBuf orig = text;
const unsigned char* from = (unsigned char*)orig.c_str();
- for (text = ""; *from; from++) {
- //first just remove combining characters
- if (*from == 0xE2 && *(from + 1) == 0x80 && *(from + 2) == 0x99) {
- from += 2;
- }
- else if (*from == 0xCC && *(from + 1)) {
- if (*(from + 1) == 0x80 || *(from + 1) == 0x81 || *(from + 1) == 0x82 || *(from + 1) == 0x88 || *(from + 1) == 0x93 || *(from + 1) == 0x94) {
- from++;
- }
- }
- else if (*from == 0xCD && (*(from + 1) == 0xBA || *(from + 1) == 0x82)) {
- from++;
- }
- //now converted pre-composed characters to their alphabetic bases, discarding the accents
+ text = "";
+ map<__u32, SWBuf>::const_iterator it = converters.end();
+ while (*from) {
+ __u32 ch = getUniCharFromUTF8(&from, true);
+ // if ch is bad, then convert to replacement char
+ if (!ch) ch = 0xFFFD;
- //Greek
- //capital alpha
- else if ((*from == 0xCE && *(from + 1) == 0x86)) {
- text += 0xCE;
- text += 0x91;
- from++;
+ it = converters.find(ch);
+ if (it == converters.end()) {
+ getUTF8FromUniChar(ch, &text);
}
- //capital epsilon
- else if ((*from == 0xCE && *(from + 1) == 0x88)) {
- text += 0xCE;
- text += 0x95;
- from++;
- }
- //capital eta
- else if ((*from == 0xCE && *(from + 1) == 0x89)) {
- text += 0xCE;
- text += 0x97;
- from++;
- }
- //capital iota
- else if ((*from == 0xCE && (*(from + 1) == 0x8A || *(from + 1) == 0xAA))) {
- text += 0xCE;
- text += 0x99;
- from++;
- }
- //capital omicron
- else if ((*from == 0xCE && *(from + 1) == 0x8C)) {
- text += 0xCE;
- text += 0x9F;
- from++;
- }
- //capital upsilon
- else if ((*from == 0xCE && (*(from + 1) == 0x8E || *(from + 1) == 0xAB))) {
- text += 0xCE;
- text += 0xA5;
- from++;
- }
- //capital omega
- else if ((*from == 0xCE && *(from + 1) == 0x8F)) {
- text += 0xCE;
- text += 0xA9;
- from++;
- }
-
- //alpha
- else if ((*from == 0xCE && *(from + 1) == 0xAC)) {
- text += 0xCE;
- text += 0xB1;
- from++;
- }
- //epsilon
- else if ((*from == 0xCE && *(from + 1) == 0xAD)) {
- text += 0xCE;
- text += 0xB5;
- from++;
- }
- //eta
- else if ((*from == 0xCE && *(from + 1) == 0xAE)) {
- text += 0xCE;
- text += 0xB7;
- from++;
- }
- //iota
- else if ((*from == 0xCE && *(from + 1) == 0xAF) || (*from == 0xCF && *(from + 1) == 0x8A)) {
- text += 0xCE;
- text += 0xB9;
- from++;
- }
- //omicron
- else if ((*from == 0xCF && *(from + 1) == 0x8C)) {
- text += 0xCE;
- text += 0xBF;
- from++;
- }
- //upsilon
- else if ((*from == 0xCE && *(from + 1) == 0x88) || (*from == 0xCF && (*(from + 1) == 0x8B || *(from + 1) == 0x8D))) {
- text += 0xCF;
- text += 0x85;
- from++;
- }
- //omega
- else if ((*from == 0xCF && *(from + 1) == 0x8E)) {
- text += 0xCF;
- text += 0x89;
- from++;
- }
-
- //Extended Greek
- //capital alpha
- else if (*from == 0xE1 && (((*(from + 1) == 0xBC || *(from + 1) == 0xBE) && *(from + 2) >= 0x88 && *(from + 2) <= 0x8F) || (*(from + 1) == 0xBE && *(from + 2) >= 0xB8 && *(from + 2) <= 0xBC))) {
- text += 0xCE;
- text += 0x91;
- from+=2;
- }
- //capital epsilon
- else if (*from == 0xE1 && ((*(from + 1) == 0xBC && *(from + 2) >= 0x98 && *(from + 2) <= 0x9D) || (*(from + 1) == 0xBF && (*(from + 2) == 0x88 || *(from + 2) == 0x89)))) {
- text += 0xCE;
- text += 0x95;
- from+=2;
- }
- //capital eta
- else if (*from == 0xE1 && ((*(from + 1) == 0xBC && *(from + 2) >= 0xA8 && *(from + 2) <= 0xAF) || (*(from + 1) == 0xBE && *(from + 2) >= 0x98 && *(from + 2) <= 0x9F) || (*(from + 1) == 0xBF && *(from + 2) >= 0x8A && *(from + 2) <= 0x8C))) {
- text += 0xCE;
- text += 0x97;
- from+=2;
- }
- //capital iota
- else if (*from == 0xE1 && ((*(from + 1) == 0xBC && *(from + 2) >= 0xB8 && *(from + 2) <= 0xBF) || (*(from + 1) == 0xBF && *(from + 2) >= 0x98 && *(from + 2) <= 0x9B))) {
- text += 0xCE;
- text += 0x99;
- from+=2;
- }
- //capital omicron
- else if (*from == 0xE1 && (((*(from + 1) == 0xBD && *(from + 2) >= 0x88 && *(from + 2) <= 0x8D)) || ((*(from + 1) == 0xBF && (*(from + 2) == 0xB8 || *(from + 2) == 0xB9))))) {
- text += 0xCE;
- text += 0x9F;
- from+=2;
- }
- //capital upsilon
- else if (*from == 0xE1 && ((*(from + 1) == 0xBD && *(from + 2) >= 0x99 && *(from + 2) <= 0x9F) || (*(from + 1) == 0xBF && *(from + 2) >= 0xA8 && *(from + 2) <= 0xAB))) {
- text += 0xCE;
- text += 0xA5;
- from+=2;
- }
- //capital omega
- else if (*from == 0xE1 && (((*(from + 1) == 0xBD || *(from + 1) == 0xBE) && *(from + 2) >= 0xA8 && *(from + 2) <= 0xAF) || (*(from + 1) == 0xBF && *(from + 2) >= 0xBA && *(from + 2) <= 0xBC))) {
- text += 0xCE;
- text += 0xA9;
- from+=2;
- }
- //capital rho
- else if (*from == 0xE1 && *(from + 1) == 0xBF && *(from + 2) == 0xAC) {
- text += 0xCE;
- text += 0xA1;
- from+=2;
- }
-
- //alpha
- else if (*from == 0xE1 && (
- ((*(from + 1) == 0xBC || *(from + 1) == 0xBE) && *(from + 2) >= 0x80 && *(from + 2) <= 0x87)
- || (*(from + 1) == 0xBD && (*(from + 2) == 0xB0 || *(from + 2) == 0xB1))
- || (*(from + 1) == 0xBE && *(from + 2) >= 0xB0 && *(from + 2) <= 0xB7))) {
- text += 0xCE;
- text += 0xB1;
- from+=2;
- }
- //epsilon
- else if (*from == 0xE1 && ((*(from + 1) == 0xBC && *(from + 2) >= 0x90 && *(from + 2) <= 0x95) || (*(from + 1) == 0xBD && (*(from + 2) == 0xB2 || *(from + 2) == 0xB3)))) {
- text += 0xCE;
- text += 0xB5;
- from+=2;
- }
- //eta
- else if (*from == 0xE1 && ((*(from + 1) == 0xBE && *(from + 2) >= 0x90 && *(from + 2) <= 0x97) || (*(from + 1) == 0xBC && *(from + 2) >= 0xA0 && *(from + 2) <= 0xA7) || (*(from + 1) == 0xBF && *(from + 2) >= 0x82 && *(from + 2) <= 0x87) || (*(from + 1) == 0xBD && (*(from + 2) == 0xB4 || *(from + 2) == 0xB5)))) {
- text += 0xCE;
- text += 0xB7;
- from+=2;
- }
- //iota
- else if (*from == 0xE1 && ((*(from + 1) == 0xBC && *(from + 2) >= 0xB0 && *(from + 2) <= 0xB7) || (*(from + 1) == 0xBD && (*(from + 2) == 0xB6 || *(from + 2) == 0xB7)) || (*(from + 1) == 0xBF && *(from + 2) >= 0x90 && *(from + 2) <= 0x97))) {
- text += 0xCE;
- text += 0xB9;
- from+=2;
- }
- //omicron
- else if (*from == 0xE1 && (*(from + 1) == 0xBD && ((*(from + 2) >= 0x80 && *(from + 2) <= 0x85) || (*(from + 2) == 0xB8 || *(from + 2) == 0xB9)))) {
- text += 0xCE;
- text += 0xBF;
- from+=2;
- }
- //upsilon
- else if (*from == 0xE1 && ((*(from + 1) == 0xBD && ((*(from + 2) >= 0x90 && *(from + 2) <= 0x97) || *(from + 2) == 0xBA || *(from + 2) == 0xBB)) || (*(from + 1) == 0xBF && ((*(from + 2) >= 0xA0 && *(from + 2) <= 0xA3) || *(from + 2) == 0xA6 || *(from + 2) == 0xA7)))) {
- text += 0xCF;
- text += 0x85;
- from+=2;
- }
- //omega
- else if (*from == 0xE1 && ((*(from + 1) == 0xBD && ((*(from + 2) >= 0xA0 && *(from + 2) <= 0xA7) || (*(from + 2) == 0xBC || *(from + 2) == 0xBD))) || (*(from + 1) == 0xBE && (*(from + 2) >= 0xA0 && *(from + 2) <= 0xA7)) || (*(from + 1) == 0xBF && *(from + 2) >= 0xB2 && *(from + 2) <= 0xB7))) {
- text += 0xCF;
- text += 0x89;
- from+=2;
- }
- //rho
- else if (*from == 0xE1 && *(from + 1) == 0xBF && (*(from + 2) == 0xA4 && *(from + 2) == 0xA5)) {
- text += 0xCF;
- text += 0x81;
- from+=2;
- }
- else { //no characters we filter
- text += *from;
- }
+ else text.append((const char *)it->second, it->second.size()); // save a strlen, since we know our size
}
}
return 0;
}
-
-
-
-
SWORD_NAMESPACE_END
Added: trunk/tests/testsuite/README
===================================================================
--- trunk/tests/testsuite/README (rev 0)
+++ trunk/tests/testsuite/README 2017-09-11 11:41:08 UTC (rev 3496)
@@ -0,0 +1,24 @@
+All tests are represented by a <test>.sh / <test>.good file pair.
+
+To run a test:
+
+./runtest.sh test
+
+This will run test.sh > test.try, compare test.try to test.good, and report any differences (failures).
+
+To run all tests:
+
+./runall.sh
+
+===================================
+
+To create a new test, do whatever you want in your new mytest.sh file:
+call executables or do anything else you'd like, and output the results
+that matter for a good test.
+
+When all is running fine, output your .good file with:
+
+./mytest.sh > mytest.good
+
+That's it. Simple, right? :) So make more unit tests!
+
Added: trunk/tests/testsuite/UTF-8-test.txt
===================================================================
--- trunk/tests/testsuite/UTF-8-test.txt (rev 0)
+++ trunk/tests/testsuite/UTF-8-test.txt 2017-09-11 11:41:08 UTC (rev 3496)
@@ -0,0 +1,300 @@
+UTF-8 decoder capability and stress test
+----------------------------------------
+
+Markus Kuhn <http://www.cl.cam.ac.uk/~mgk25/> - 2015-08-28 - CC BY 4.0
+
+This test file can help you examine, how your UTF-8 decoder handles
+various types of correct, malformed, or otherwise interesting UTF-8
+sequences. This file is not meant to be a conformance test. It does
+not prescribe any particular outcome. Therefore, there is no way to
+"pass" or "fail" this test file, even though the text does suggest a
+preferable decoder behaviour at some places. Its aim is, instead, to
+help you think about, and test, the behaviour of your UTF-8 decoder on a
+systematic collection of unusual inputs. Experience so far suggests
+that most first-time authors of UTF-8 decoders find at least one
+serious problem in their decoder using this file.
+
+The test lines below cover boundary conditions, malformed UTF-8
+sequences, as well as correctly encoded UTF-8 sequences of Unicode code
+points that should never occur in a correct UTF-8 file.
+
+According to ISO 10646-1:2000, sections D.7 and 2.3c, a device
+receiving UTF-8 shall interpret a "malformed sequence in the same way
+that it interprets a character that is outside the adopted subset" and
+"characters that are not within the adopted subset shall be indicated
+to the user" by a receiving device. One commonly used approach in
+UTF-8 decoders is to replace any malformed UTF-8 sequence by a
+replacement character (U+FFFD), which looks a bit like an inverted
+question mark, or a similar symbol. It might be a good idea to
+visually distinguish a malformed UTF-8 sequence from a correctly
+encoded Unicode character that is just not available in the current
+font but otherwise fully legal, even though ISO 10646-1 doesn't
+mandate this. In any case, just ignoring malformed sequences or
+unavailable characters does not conform to ISO 10646, will make
+debugging more difficult, and can lead to user confusion.
+
+Please check, whether a malformed UTF-8 sequence is (1) represented at
+all, (2) represented by exactly one single replacement character (or
+equivalent signal), and (3) the following quotation mark after an
+illegal UTF-8 sequence is correctly displayed, i.e. proper
+resynchronization takes place immediately after any malformed
+sequence. This file says "THE END" in the last line, so if you don't
+see that, your decoder crashed somehow before, which should always be
+cause for concern.
+
+All lines in this file are exactly 79 characters long (plus the line
+feed). In addition, all lines end with "|", except for the two test
+lines 2.1.1 and 2.2.1, which contain non-printable ASCII controls
+U+0000 and U+007F. If you display this file with a fixed-width font,
+these "|" characters should all line up in column 79 (right margin).
+This allows you to test quickly, whether your UTF-8 decoder finds the
+correct number of characters in every line, that is whether each
+malformed sequences is replaced by a single replacement character.
+
+Note that, as an alternative to the notion of malformed sequence used
+here, it is also a perfectly acceptable (and in some situations even
+preferable) solution to represent each individual byte of a malformed
+sequence with a replacement character. If you follow this strategy in
+your decoder, then please ignore the "|" column.
+
+
+Here come the tests: |
+ |
+1 Some correct UTF-8 text |
+ |
+You should see the Greek word 'kosme': "κόÏμε" |
+ |
+2 Boundary condition test cases |
+ |
+2.1 First possible sequence of a certain length |
+ |
+2.1.1 1 byte (U-00000000): "^@" // SWORD: removed. we don't support null mid-string, <- that's a literal <caret at>
+2.1.2 2 bytes (U-00000080): "Â" |
+2.1.3 3 bytes (U-00000800): "à " |
+2.1.4 4 bytes (U-00010000): "ð" |
+2.1.5 5 bytes (U-00200000): "ø" |
+2.1.6 6 bytes (U-04000000): "ü" |
+ |
+2.2 Last possible sequence of a certain length |
+ |
+2.2.1 1 byte (U-0000007F): ""
+2.2.2 2 bytes (U-000007FF): "ß¿" |
+2.2.3 3 bytes (U-0000FFFF): "ï¿¿" |
+2.2.4 4 bytes (U-001FFFFF): "÷¿¿¿" |
+2.2.5 5 bytes (U-03FFFFFF): "û¿¿¿¿" |
+2.2.6 6 bytes (U-7FFFFFFF): "ý¿¿¿¿¿" |
+ |
+2.3 Other boundary conditions |
+ |
+2.3.1 U-0000D7FF = ed 9f bf = "í¿" |
+2.3.2 U-0000E000 = ee 80 80 = "î" |
+2.3.3 U-0000FFFD = ef bf bd = "�" |
+2.3.4 U-0010FFFF = f4 8f bf bf = "ô¿¿" |
+2.3.5 U-00110000 = f4 90 80 80 = "ô" |
+ |
+3 Malformed sequences |
+ |
+3.1 Unexpected continuation bytes |
+ |
+Each unexpected continuation byte should be separately signalled as a |
+malformed sequence of its own. |
+ |
+3.1.1 First continuation byte 0x80: "" |
+3.1.2 Last continuation byte 0xbf: "¿" |
+ |
+3.1.3 2 continuation bytes: "¿" |
+3.1.4 3 continuation bytes: "¿" |
+3.1.5 4 continuation bytes: "¿¿" |
+3.1.6 5 continuation bytes: "¿¿" |
+3.1.7 6 continuation bytes: "¿¿¿" |
+3.1.8 7 continuation bytes: "¿¿¿" |
+ |
+3.1.9 Sequence of all 64 possible continuation bytes (0x80-0xbf): |
+ |
+ "
|
+ |
+ ¡¢£¤¥¦§¨©ª«¬®¯ |
+ °±²³´µ¶·¸¹º»¼½¾¿" |
+ |
+3.2 Lonely start characters |
+ |
+3.2.1 All 32 first bytes of 2-byte sequences (0xc0-0xdf), |
+ each followed by a space character: |
+ |
+ "À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï |
+ Ð Ñ Ò Ó Ô Õ Ö × Ø Ù Ú Û Ü Ý Þ ß " |
+ |
+3.2.2 All 16 first bytes of 3-byte sequences (0xe0-0xef), |
+ each followed by a space character: |
+ |
+ "à á â ã ä å æ ç è é ê ë ì í î ï " |
+ |
+3.2.3 All 8 first bytes of 4-byte sequences (0xf0-0xf7), |
+ each followed by a space character: |
+ |
+ "ð ñ ò ó ô õ ö ÷ " |
+ |
+3.2.4 All 4 first bytes of 5-byte sequences (0xf8-0xfb), |
+ each followed by a space character: |
+ |
+ "ø ù ú û " |
+ |
+3.2.5 All 2 first bytes of 6-byte sequences (0xfc-0xfd), |
+ each followed by a space character: |
+ |
+ "ü ý " |
+ |
+3.3 Sequences with last continuation byte missing |
+ |
+All bytes of an incomplete sequence should be signalled as a single |
+malformed sequence, i.e., you should see only a single replacement |
+character in each of the next 10 tests. (Characters as in section 2) |
+ |
+3.3.1 2-byte sequence with last byte missing (U+0000): "À" |
+3.3.2 3-byte sequence with last byte missing (U+0000): "à" |
+3.3.3 4-byte sequence with last byte missing (U+0000): "ð" |
+3.3.4 5-byte sequence with last byte missing (U+0000): "ø" |
+3.3.5 6-byte sequence with last byte missing (U+0000): "ü" |
+3.3.6 2-byte sequence with last byte missing (U-000007FF): "ß" |
+3.3.7 3-byte sequence with last byte missing (U-0000FFFF): "ï¿" |
+3.3.8 4-byte sequence with last byte missing (U-001FFFFF): "÷¿¿" |
+3.3.9 5-byte sequence with last byte missing (U-03FFFFFF): "û¿¿¿" |
+3.3.10 6-byte sequence with last byte missing (U-7FFFFFFF): "ý¿¿¿¿" |
+ |
+3.4 Concatenation of incomplete sequences |
+ |
+All the 10 sequences of 3.3 concatenated, you should see 10 malformed |
+sequences being signalled: |
+ |
+ "Ààðøüßï¿÷¿¿û¿¿¿ý¿¿¿¿" |
+ |
+3.5 Impossible bytes |
+ |
+The following two bytes cannot appear in a correct UTF-8 string |
+ |
+3.5.1 fe = "þ" |
+3.5.2 ff = "ÿ" |
+3.5.3 fe fe ff ff = "þþÿÿ" |
+ |
+4 Overlong sequences |
+ |
+The following sequences are not malformed according to the letter of |
+the Unicode 2.0 standard. However, they are longer then necessary and |
+a correct UTF-8 encoder is not allowed to produce them. A "safe UTF-8 |
+decoder" should reject them just like malformed sequences for two |
+reasons: (1) It helps to debug applications if overlong sequences are |
+not treated as valid representations of characters, because this helps |
+to spot problems more quickly. (2) Overlong sequences provide |
+alternative representations of characters, that could maliciously be |
+used to bypass filters that check only for ASCII characters. For |
+instance, a 2-byte encoded line feed (LF) would not be caught by a |
+line counter that counts only 0x0a bytes, but it would still be |
+processed as a line feed by an unsafe UTF-8 decoder later in the |
+pipeline. From a security point of view, ASCII compatibility of UTF-8 |
+sequences means also, that ASCII characters are *only* allowed to be |
+represented by ASCII bytes in the range 0x00-0x7f. To ensure this |
+aspect of ASCII compatibility, use only "safe UTF-8 decoders" that |
+reject overlong UTF-8 sequences for which a shorter encoding exists. |
+ |
+4.1 Examples of an overlong ASCII character |
+ |
+With a safe UTF-8 decoder, all of the following five overlong |
+representations of the ASCII character slash ("/") should be rejected |
+like a malformed UTF-8 sequence, for instance by substituting it with |
+a replacement character. If you see a slash below, you do not have a |
+safe UTF-8 decoder! |
+ |
+4.1.1 U+002F = c0 af = "À¯" |
+4.1.2 U+002F = e0 80 af = "à¯" |
+4.1.3 U+002F = f0 80 80 af = "ð¯" |
+4.1.4 U+002F = f8 80 80 80 af = "ø¯" |
+4.1.5 U+002F = fc 80 80 80 80 af = "ü¯" |
+ |
+4.2 Maximum overlong sequences |
+ |
+Below you see the highest Unicode value that is still resulting in an |
+overlong sequence if represented with the given number of bytes. This |
+is a boundary test for safe UTF-8 decoders. All five characters should |
+be rejected like malformed UTF-8 sequences. |
+ |
+4.2.1 U-0000007F = c1 bf = "Á¿" |
+4.2.2 U-000007FF = e0 9f bf = "à¿" |
+4.2.3 U-0000FFFF = f0 8f bf bf = "ð¿¿" |
+4.2.4 U-001FFFFF = f8 87 bf bf bf = "ø¿¿¿" |
+4.2.5 U-03FFFFFF = fc 83 bf bf bf bf = "ü¿¿¿¿" |
+ |
+4.3 Overlong representation of the NUL character |
+ |
+The following five sequences should also be rejected like malformed |
+UTF-8 sequences and should not be treated like the ASCII NUL |
+character. |
+ |
+4.3.1 U+0000 = c0 80 = "À" |
+4.3.2 U+0000 = e0 80 80 = "à" |
+4.3.3 U+0000 = f0 80 80 80 = "ð" |
+4.3.4 U+0000 = f8 80 80 80 80 = "ø" |
+4.3.5 U+0000 = fc 80 80 80 80 80 = "ü" |
+ |
+5 Illegal code positions |
+ |
+The following UTF-8 sequences should be rejected like malformed |
+sequences, because they never represent valid ISO 10646 characters and |
+a UTF-8 decoder that accepts them might introduce security problems |
+comparable to overlong UTF-8 sequences. |
+ |
+5.1 Single UTF-16 surrogates |
+ |
+5.1.1 U+D800 = ed a0 80 = "í " |
+5.1.2 U+DB7F = ed ad bf = "í¿" |
+5.1.3 U+DB80 = ed ae 80 = "í®" |
+5.1.4 U+DBFF = ed af bf = "í¯¿" |
+5.1.5 U+DC00 = ed b0 80 = "í°" |
+5.1.6 U+DF80 = ed be 80 = "í¾" |
+5.1.7 U+DFFF = ed bf bf = "í¿¿" |
+ |
+5.2 Paired UTF-16 surrogates |
+ |
+5.2.1 U+D800 U+DC00 = ed a0 80 ed b0 80 = "í í°" |
+5.2.2 U+D800 U+DFFF = ed a0 80 ed bf bf = "í í¿¿" |
+5.2.3 U+DB7F U+DC00 = ed ad bf ed b0 80 = "í¿í°" |
+5.2.4 U+DB7F U+DFFF = ed ad bf ed bf bf = "í¿í¿¿" |
+5.2.5 U+DB80 U+DC00 = ed ae 80 ed b0 80 = "í®í°" |
+5.2.6 U+DB80 U+DFFF = ed ae 80 ed bf bf = "í®í¿¿" |
+5.2.7 U+DBFF U+DC00 = ed af bf ed b0 80 = "í¯¿í°" |
+5.2.8 U+DBFF U+DFFF = ed af bf ed bf bf = "􏿿" |
+ |
+5.3 Noncharacter code positions |
+ |
+The following "noncharacters" are "reserved for internal use" by |
+applications, and according to older versions of the Unicode Standard |
+"should never be interchanged". Unicode Corrigendum #9 dropped the |
+latter restriction. Nevertheless, their presence in incoming UTF-8 data |
+can remain a potential security risk, depending on what use is made of |
+these codes subsequently. Examples of such internal use: |
+ |
+ - Some file APIs with 16-bit characters may use the integer value -1 |
+ = U+FFFF to signal an end-of-file (EOF) or error condition. |
+ |
+ - In some UTF-16 receivers, code point U+FFFE might trigger a |
+ byte-swap operation (to convert between UTF-16LE and UTF-16BE). |
+ |
+With such internal use of noncharacters, it may be desirable and safer |
+to block those code points in UTF-8 decoders, as they should never |
+occur legitimately in incoming UTF-8 data, and could trigger unsafe |
+behaviour in subsequent processing. |
+ |
+Particularly problematic noncharacters in 16-bit applications: |
+ |
+5.3.1 U+FFFE = ef bf be = "￾" |
+5.3.2 U+FFFF = ef bf bf = "ï¿¿" |
+ |
+Other noncharacters: |
+ |
+5.3.3 U+FDD0 .. U+FDEF = "ï·ï·ï·ï·ï·ï·ï·ï·ï·ï·ï·ï·ï·ï·ï·ï·ï· ﷡﷢﷣﷤﷥﷦﷧﷨﷩﷪﷫﷬ï·ï·®ï·¯"|
+ |
+5.3.4 U+nFFFE U+nFFFF (for n = 1..10) |
+ |
+ "ð¿¾ð¿¿ð¯¿¾ð¯¿¿ð¿¿¾ð¿¿¿ñ¿¾ñ¿¿ñ¿¾ñ¿¿ñ¯¿¾ñ¯¿¿ñ¿¿¾ñ¿¿¿ò¿¾ò¿¿ |
+ ò¿¾ò¿¿ò¯¿¾ò¯¿¿ò¿¿¾ò¿¿¿ó¿¾ó¿¿ó¿¾ó¿¿ó¯¿¾ó¯¿¿ó¿¿¾ó¿¿¿ô¿¾ô¿¿" |
+ |
+THE END |
Added: trunk/tests/testsuite/greekaccents.good
===================================================================
--- trunk/tests/testsuite/greekaccents.good (rev 0)
+++ trunk/tests/testsuite/greekaccents.good 2017-09-11 11:41:08 UTC (rev 3496)
@@ -0,0 +1,7 @@
+Îαι καθÏÏ ÎÏÏ
ÏÎ·Ï Ï
ÏÏÏεν Ïον οÏιν εν Ïη εÏημÏ, οÏ
ÏÏÏ Ï
ÏÏθηναι δει Ïον Ï
ιον ÏοÏ
ανθÏÏÏοÏ
,
+ινα ÏÎ±Ï Î¿ ÏιÏÏεÏ
Ïν â¸ÎµÎ½ αÏ
ÏÏ⸠⸠εÏη ζÏην αιÏνιον.
+οÏ
ÏÏÏ Î³Î±Ï Î·Î³Î±ÏηÏεν ο Î¸ÎµÎ¿Ï Ïον κοÏμον, ÏÏÏε Ïον Ï
ιον ⸠Ïον μονογενη εδÏκεν, ινα ÏÎ±Ï Î¿ ÏιÏÏεÏ
Ïν ÎµÎ¹Ï Î±Ï
Ïον μη αÏοληÏαι αλλ εÏη ζÏην αιÏνιον.
+οÏ
Î³Î±Ï Î±ÏεÏÏειλεν ο Î¸ÎµÎ¿Ï Ïον Ï
ιον â¸ ÎµÎ¹Ï Ïον κοÏμον ινα κÏινη Ïον κοÏμον, αλλ ινα ÏÏθη ο κοÏÎ¼Î¿Ï Î´Î¹ αÏ
ÏοÏ
.
+ο ÏιÏÏεÏ
Ïν ÎµÎ¹Ï Î±Ï
Ïον οÏ
κÏινεÏαι· ο °δε μη ÏιÏÏεÏ
Ïν ηδη κεκÏιÏαι, οÏι μη ÏεÏιÏÏεÏ
κεν ÎµÎ¹Ï Ïο ονομα ÏοÏ
μονογενοÏ
Ï Ï
ιοÏ
ÏοÏ
θεοÏ
.
+αÏ
Ïη δε εÏÏιν η κÏιÏÎ¹Ï Î¿Ïι °Ïο ÏÏÏ ÎµÎ»Î·Î»Ï
θεν ÎµÎ¹Ï Ïον κοÏμον και â¸Î·Î³Î±ÏηÏαν οι ανθÏÏÏοι μαλλον Ïο ÏκοÏοÏ⸠η Ïο ÏÏÏ· ην Î³Î±Ï â¸Â¹Î±Ï
ÏÏν ÏονηÏα⸠Ïα εÏγα.
+
Added: trunk/tests/testsuite/greekaccents.sh
===================================================================
--- trunk/tests/testsuite/greekaccents.sh (rev 0)
+++ trunk/tests/testsuite/greekaccents.sh 2017-09-11 11:41:08 UTC (rev 3496)
@@ -0,0 +1,8 @@
+#!/bin/sh
+
+# The last parameter is an iteration count that can be raised for speed
+# testing. With it set to 999999, timings on a Dell Precision 5510 were:
+# real 0m8.952s
+# user 0m8.939s
+# sys 0m0.004s
+../utf8norm -ga 999 < greekaccents.txt
Property changes on: trunk/tests/testsuite/greekaccents.sh
___________________________________________________________________
Added: svn:executable
+ *
Added: trunk/tests/testsuite/greekaccents.txt
===================================================================
--- trunk/tests/testsuite/greekaccents.txt (rev 0)
+++ trunk/tests/testsuite/greekaccents.txt 2017-09-11 11:41:08 UTC (rev 3496)
@@ -0,0 +1,7 @@
+Îαὶ ÎºÎ±Î¸á½¼Ï ÎÏÏÏá¿Ï á½ÏÏÏεν Ïὸν á½Ïιν á¼Î½ Ïá¿ á¼Ïήμῳ, οá½ÏÏÏ á½ÏÏθá¿Î½Î±Î¹ δεῠÏὸν Ï
ἱὸν Ïοῦ á¼Î½Î¸Ïá½½ÏοÏ
,
+ἵνα Ïá¾¶Ï á½ ÏιÏÏεύÏν â¸á¼Î½ αá½Ïῷ⸠⸠á¼ÏῠζÏὴν αἰώνιον.
+οá½ÏÏÏ Î³á½°Ï á¼ Î³Î¬ÏηÏεν á½ Î¸Îµá½¸Ï Ïὸν κόÏμον, á½¥ÏÏε Ïὸν Ï
ἱὸν ⸠Ïὸν μονογενῠá¼Î´Ïκεν, ἵνα Ïá¾¶Ï á½ ÏιÏÏεύÏν Îµá¼°Ï Î±á½Ïὸν μὴ á¼ÏόληÏαι á¼Î»Î»á¾¿ á¼ÏῠζÏὴν αἰώνιον.
+Î¿á½ Î³á½°Ï á¼Ïá½³ÏÏειλεν á½ Î¸Îµá½¸Ï Ïὸν Ï
ἱὸν â¸ Îµá¼°Ï Ïὸν κόÏμον ἵνα κÏίνῠÏὸν κόÏμον, á¼Î»Î»á¾¿ ἵνα ÏÏθῠὠκόÏÎ¼Î¿Ï Î´Î¹á¾¿ αá½Ïοῦ.
+á½ ÏιÏÏεύÏν Îµá¼°Ï Î±á½Ïὸν οὠκÏίνεÏαι· ὠ°δὲ μὴ ÏιÏÏεύÏν ἤδη κέκÏιÏαι, á½
Ïι μὴ ÏεÏá½·ÏÏεÏ
κεν Îµá¼°Ï Ïὸ á½Î½Î¿Î¼Î± Ïοῦ Î¼Î¿Î½Î¿Î³ÎµÎ½Î¿á¿¦Ï Ï
ἱοῦ Ïοῦ θεοῦ.
+αá½Ïη δέ á¼ÏÏιν ἡ κÏá½·ÏÎ¹Ï á½
Ïι °Ïὸ Ïá¿¶Ï á¼Î»Î®Î»Ï
θεν Îµá¼°Ï Ïὸν κόÏμον καὶ â¸á¼ γάÏηÏαν οἱ á¼Î½Î¸ÏÏÏοι μᾶλλον Ïὸ ÏκόÏοÏ⸠ἢ Ïὸ ÏῶÏ· ἦν Î³á½°Ï â¸Â¹Î±á½Ïῶν ÏονηÏὰ⸠Ïá½° á¼Ïγα.
+
Added: trunk/tests/testsuite/utf8basic.good
===================================================================
--- trunk/tests/testsuite/utf8basic.good (rev 0)
+++ trunk/tests/testsuite/utf8basic.good 2017-09-11 11:41:08 UTC (rev 3496)
@@ -0,0 +1,300 @@
+UTF-8 decoder capability and stress test
+----------------------------------------
+
+Markus Kuhn <http://www.cl.cam.ac.uk/~mgk25/> - 2015-08-28 - CC BY 4.0
+
+This test file can help you examine, how your UTF-8 decoder handles
+various types of correct, malformed, or otherwise interesting UTF-8
+sequences. This file is not meant to be a conformance test. It does
+not prescribe any particular outcome. Therefore, there is no way to
+"pass" or "fail" this test file, even though the text does suggest a
+preferable decoder behaviour at some places. Its aim is, instead, to
+help you think about, and test, the behaviour of your UTF-8 decoder on a
+systematic collection of unusual inputs. Experience so far suggests
+that most first-time authors of UTF-8 decoders find at least one
+serious problem in their decoder using this file.
+
+The test lines below cover boundary conditions, malformed UTF-8
+sequences, as well as correctly encoded UTF-8 sequences of Unicode code
+points that should never occur in a correct UTF-8 file.
+
+According to ISO 10646-1:2000, sections D.7 and 2.3c, a device
+receiving UTF-8 shall interpret a "malformed sequence in the same way
+that it interprets a character that is outside the adopted subset" and
+"characters that are not within the adopted subset shall be indicated
+to the user" by a receiving device. One commonly used approach in
+UTF-8 decoders is to replace any malformed UTF-8 sequence by a
+replacement character (U+FFFD), which looks a bit like an inverted
+question mark, or a similar symbol. It might be a good idea to
+visually distinguish a malformed UTF-8 sequence from a correctly
+encoded Unicode character that is just not available in the current
+font but otherwise fully legal, even though ISO 10646-1 doesn't
+mandate this. In any case, just ignoring malformed sequences or
+unavailable characters does not conform to ISO 10646, will make
+debugging more difficult, and can lead to user confusion.
+
+Please check, whether a malformed UTF-8 sequence is (1) represented at
+all, (2) represented by exactly one single replacement character (or
+equivalent signal), and (3) the following quotation mark after an
+illegal UTF-8 sequence is correctly displayed, i.e. proper
+resynchronization takes place immediately after any malformed
+sequence. This file says "THE END" in the last line, so if you don't
+see that, your decoder crashed somehow before, which should always be
+cause for concern.
+
+All lines in this file are exactly 79 characters long (plus the line
+feed). In addition, all lines end with "|", except for the two test
+lines 2.1.1 and 2.2.1, which contain non-printable ASCII controls
+U+0000 and U+007F. If you display this file with a fixed-width font,
+these "|" characters should all line up in column 79 (right margin).
+This allows you to test quickly, whether your UTF-8 decoder finds the
+correct number of characters in every line, that is whether each
+malformed sequences is replaced by a single replacement character.
+
+Note that, as an alternative to the notion of malformed sequence used
+here, it is also a perfectly acceptable (and in some situations even
+preferable) solution to represent each individual byte of a malformed
+sequence with a replacement character. If you follow this strategy in
+your decoder, then please ignore the "|" column.
+
+
+Here come the tests: |
+ |
+1 Some correct UTF-8 text |
+ |
+You should see the Greek word 'kosme': "κόÏμε" |
+ |
+2 Boundary condition test cases |
+ |
+2.1 First possible sequence of a certain length |
+ |
+2.1.1 1 byte (U-00000000): "^@" // SWORD: removed. we don't support null mid-string, <- that's a literal <caret at>
+2.1.2 2 bytes (U-00000080): "Â" |
+2.1.3 3 bytes (U-00000800): "à " |
+2.1.4 4 bytes (U-00010000): "ð" |
+2.1.5 5 bytes (U-00200000): "�" |
+2.1.6 6 bytes (U-04000000): "�" |
+ |
+2.2 Last possible sequence of a certain length |
+ |
+2.2.1 1 byte (U-0000007F): ""
+2.2.2 2 bytes (U-000007FF): "ß¿" |
+2.2.3 3 bytes (U-0000FFFF): "ï¿¿" |
+2.2.4 4 bytes (U-001FFFFF): "�" |
+2.2.5 5 bytes (U-03FFFFFF): "�" |
+2.2.6 6 bytes (U-7FFFFFFF): "�" |
+ |
+2.3 Other boundary conditions |
+ |
+2.3.1 U-0000D7FF = ed 9f bf = "í¿" |
+2.3.2 U-0000E000 = ee 80 80 = "î" |
+2.3.3 U-0000FFFD = ef bf bd = "�" |
+2.3.4 U-0010FFFF = f4 8f bf bf = "ô¿¿" |
+2.3.5 U-00110000 = f4 90 80 80 = "�" |
+ |
+3 Malformed sequences |
+ |
+3.1 Unexpected continuation bytes |
+ |
+Each unexpected continuation byte should be separately signalled as a |
+malformed sequence of its own. |
+ |
+3.1.1 First continuation byte 0x80: "�" |
+3.1.2 Last continuation byte 0xbf: "�" |
+ |
+3.1.3 2 continuation bytes: "��" |
+3.1.4 3 continuation bytes: "���" |
+3.1.5 4 continuation bytes: "����" |
+3.1.6 5 continuation bytes: "�����" |
+3.1.7 6 continuation bytes: "������" |
+3.1.8 7 continuation bytes: "�������" |
+ |
+3.1.9 Sequence of all 64 possible continuation bytes (0x80-0xbf): |
+ |
+ "���������������� |
+ ���������������� |
+ ���������������� |
+ ����������������" |
+ |
+3.2 Lonely start characters |
+ |
+3.2.1 All 32 first bytes of 2-byte sequences (0xc0-0xdf), |
+ each followed by a space character: |
+ |
+ "� � � � � � � � � � � � � � � � |
+ � � � � � � � � � � � � � � � � " |
+ |
+3.2.2 All 16 first bytes of 3-byte sequences (0xe0-0xef), |
+ each followed by a space character: |
+ |
+ "� � � � � � � � � � � � � � � � " |
+ |
+3.2.3 All 8 first bytes of 4-byte sequences (0xf0-0xf7), |
+ each followed by a space character: |
+ |
+ "� � � � � � � � " |
+ |
+3.2.4 All 4 first bytes of 5-byte sequences (0xf8-0xfb), |
+ each followed by a space character: |
+ |
+ "� � � � " |
+ |
+3.2.5 All 2 first bytes of 6-byte sequences (0xfc-0xfd), |
+ each followed by a space character: |
+ |
+ "� � " |
+ |
+3.3 Sequences with last continuation byte missing |
+ |
+All bytes of an incomplete sequence should be signalled as a single |
+malformed sequence, i.e., you should see only a single replacement |
+character in each of the next 10 tests. (Characters as in section 2) |
+ |
+3.3.1 2-byte sequence with last byte missing (U+0000): "�" |
+3.3.2 3-byte sequence with last byte missing (U+0000): "�" |
+3.3.3 4-byte sequence with last byte missing (U+0000): "�" |
+3.3.4 5-byte sequence with last byte missing (U+0000): "�" |
+3.3.5 6-byte sequence with last byte missing (U+0000): "�" |
+3.3.6 2-byte sequence with last byte missing (U-000007FF): "�" |
+3.3.7 3-byte sequence with last byte missing (U-0000FFFF): "�" |
+3.3.8 4-byte sequence with last byte missing (U-001FFFFF): "�" |
+3.3.9 5-byte sequence with last byte missing (U-03FFFFFF): "�" |
+3.3.10 6-byte sequence with last byte missing (U-7FFFFFFF): "�" |
+ |
+3.4 Concatenation of incomplete sequences |
+ |
+All the 10 sequences of 3.3 concatenated, you should see 10 malformed |
+sequences being signalled: |
+ |
+ "����������" |
+ |
+3.5 Impossible bytes |
+ |
+The following two bytes cannot appear in a correct UTF-8 string |
+ |
+3.5.1 fe = "�" |
+3.5.2 ff = "�" |
+3.5.3 fe fe ff ff = "����" |
+ |
+4 Overlong sequences |
+ |
+The following sequences are not malformed according to the letter of |
+the Unicode 2.0 standard. However, they are longer then necessary and |
+a correct UTF-8 encoder is not allowed to produce them. A "safe UTF-8 |
+decoder" should reject them just like malformed sequences for two |
+reasons: (1) It helps to debug applications if overlong sequences are |
+not treated as valid representations of characters, because this helps |
+to spot problems more quickly. (2) Overlong sequences provide |
+alternative representations of characters, that could maliciously be |
+used to bypass filters that check only for ASCII characters. For |
+instance, a 2-byte encoded line feed (LF) would not be caught by a |
+line counter that counts only 0x0a bytes, but it would still be |
+processed as a line feed by an unsafe UTF-8 decoder later in the |
+pipeline. From a security point of view, ASCII compatibility of UTF-8 |
+sequences means also, that ASCII characters are *only* allowed to be |
+represented by ASCII bytes in the range 0x00-0x7f. To ensure this |
+aspect of ASCII compatibility, use only "safe UTF-8 decoders" that |
+reject overlong UTF-8 sequences for which a shorter encoding exists. |
+ |
+4.1 Examples of an overlong ASCII character |
+ |
+With a safe UTF-8 decoder, all of the following five overlong |
+representations of the ASCII character slash ("/") should be rejected |
+like a malformed UTF-8 sequence, for instance by substituting it with |
+a replacement character. If you see a slash below, you do not have a |
+safe UTF-8 decoder! |
+ |
+4.1.1 U+002F = c0 af = "�" |
+4.1.2 U+002F = e0 80 af = "�" |
+4.1.3 U+002F = f0 80 80 af = "�" |
+4.1.4 U+002F = f8 80 80 80 af = "�" |
+4.1.5 U+002F = fc 80 80 80 80 af = "�" |
+ |
+4.2 Maximum overlong sequences |
+ |
+Below you see the highest Unicode value that is still resulting in an |
+overlong sequence if represented with the given number of bytes. This |
+is a boundary test for safe UTF-8 decoders. All five characters should |
+be rejected like malformed UTF-8 sequences. |
+ |
+4.2.1 U-0000007F = c1 bf = "�" |
+4.2.2 U-000007FF = e0 9f bf = "�" |
+4.2.3 U-0000FFFF = f0 8f bf bf = "�" |
+4.2.4 U-001FFFFF = f8 87 bf bf bf = "�" |
+4.2.5 U-03FFFFFF = fc 83 bf bf bf bf = "�" |
+ |
+4.3 Overlong representation of the NUL character |
+ |
+The following five sequences should also be rejected like malformed |
+UTF-8 sequences and should not be treated like the ASCII NUL |
+character. |
+ |
+4.3.1 U+0000 = c0 80 = "�" |
+4.3.2 U+0000 = e0 80 80 = "�" |
+4.3.3 U+0000 = f0 80 80 80 = "�" |
+4.3.4 U+0000 = f8 80 80 80 80 = "�" |
+4.3.5 U+0000 = fc 80 80 80 80 80 = "�" |
+ |
+5 Illegal code positions |
+ |
+The following UTF-8 sequences should be rejected like malformed |
+sequences, because they never represent valid ISO 10646 characters and |
+a UTF-8 decoder that accepts them might introduce security problems |
+comparable to overlong UTF-8 sequences. |
+ |
+5.1 Single UTF-16 surrogates |
+ |
+5.1.1 U+D800 = ed a0 80 = "í " |
+5.1.2 U+DB7F = ed ad bf = "í¿" |
+5.1.3 U+DB80 = ed ae 80 = "í®" |
+5.1.4 U+DBFF = ed af bf = "í¯¿" |
+5.1.5 U+DC00 = ed b0 80 = "í°" |
+5.1.6 U+DF80 = ed be 80 = "í¾" |
+5.1.7 U+DFFF = ed bf bf = "í¿¿" |
+ |
+5.2 Paired UTF-16 surrogates |
+ |
+5.2.1 U+D800 U+DC00 = ed a0 80 ed b0 80 = "í í°" |
+5.2.2 U+D800 U+DFFF = ed a0 80 ed bf bf = "í í¿¿" |
+5.2.3 U+DB7F U+DC00 = ed ad bf ed b0 80 = "í¿í°" |
+5.2.4 U+DB7F U+DFFF = ed ad bf ed bf bf = "í¿í¿¿" |
+5.2.5 U+DB80 U+DC00 = ed ae 80 ed b0 80 = "í®í°" |
+5.2.6 U+DB80 U+DFFF = ed ae 80 ed bf bf = "í®í¿¿" |
+5.2.7 U+DBFF U+DC00 = ed af bf ed b0 80 = "í¯¿í°" |
+5.2.8 U+DBFF U+DFFF = ed af bf ed bf bf = "􏿿" |
+ |
+5.3 Noncharacter code positions |
+ |
+The following "noncharacters" are "reserved for internal use" by |
+applications, and according to older versions of the Unicode Standard |
+"should never be interchanged". Unicode Corrigendum #9 dropped the |
+latter restriction. Nevertheless, their presence in incoming UTF-8 data |
+can remain a potential security risk, depending on what use is made of |
+these codes subsequently. Examples of such internal use: |
+ |
+ - Some file APIs with 16-bit characters may use the integer value -1 |
+ = U+FFFF to signal an end-of-file (EOF) or error condition. |
+ |
+ - In some UTF-16 receivers, code point U+FFFE might trigger a |
+ byte-swap operation (to convert between UTF-16LE and UTF-16BE). |
+ |
+With such internal use of noncharacters, it may be desirable and safer |
+to block those code points in UTF-8 decoders, as they should never |
+occur legitimately in incoming UTF-8 data, and could trigger unsafe |
+behaviour in subsequent processing. |
+ |
+Particularly problematic noncharacters in 16-bit applications: |
+ |
+5.3.1 U+FFFE = ef bf be = "￾" |
+5.3.2 U+FFFF = ef bf bf = "ï¿¿" |
+ |
+Other noncharacters: |
+ |
+5.3.3 U+FDD0 .. U+FDEF = "ï·ï·ï·ï·ï·ï·ï·ï·ï·ï·ï·ï·ï·ï·ï·ï·ï· ﷡﷢﷣﷤﷥﷦﷧﷨﷩﷪﷫﷬ï·ï·®ï·¯"|
+ |
+5.3.4 U+nFFFE U+nFFFF (for n = 1..10) |
+ |
+ "ð¿¾ð¿¿ð¯¿¾ð¯¿¿ð¿¿¾ð¿¿¿ñ¿¾ñ¿¿ñ¿¾ñ¿¿ñ¯¿¾ñ¯¿¿ñ¿¿¾ñ¿¿¿ò¿¾ò¿¿ |
+ ò¿¾ò¿¿ò¯¿¾ò¯¿¿ò¿¿¾ò¿¿¿ó¿¾ó¿¿ó¿¾ó¿¿ó¯¿¾ó¯¿¿ó¿¿¾ó¿¿¿ô¿¾ô¿¿" |
+ |
+THE END |
Added: trunk/tests/testsuite/utf8basic.sh
===================================================================
--- trunk/tests/testsuite/utf8basic.sh (rev 0)
+++ trunk/tests/testsuite/utf8basic.sh 2017-09-11 11:41:08 UTC (rev 3496)
@@ -0,0 +1,10 @@
+#!/bin/sh
+
+# utf8basic.good was originally generated with:
+#   uconv --from-code UTF-8 --to-code UTF-8 --from-callback substitute UTF-8-test.txt > utf8basic.good
+# and then modified to ignore UTF-16 surrogates, which are apparently illegal. We return multiple
+# replacement characters there, but the spec apparently says we should return only 1 per UTF-16 surrogate.
+# The spec has comments about a "security vulnerability", but we always check whether we're at the end
+# of our buffer before processing each byte (shouldn't all decoders do this?), so there shouldn't be a
+# problem. Ignoring the UTF-16 non-conformance for now.
+../utf8norm < UTF-8-test.txt
Property changes on: trunk/tests/testsuite/utf8basic.sh
___________________________________________________________________
Added: svn:executable
+ *
More information about the sword-cvs
mailing list