[sword-svn] r2075 - trunk/src/mgr
scribe at www.crosswire.org
scribe at www.crosswire.org
Tue Sep 4 00:10:14 MST 2007
Author: scribe
Date: 2007-09-04 00:10:14 -0700 (Tue, 04 Sep 2007)
New Revision: 2075
Modified:
trunk/src/mgr/stringmgr.cpp
Log:
added DM's isValidUTF8 method for better determining if we have latin1 or
UTF8 text.
Modified: trunk/src/mgr/stringmgr.cpp
===================================================================
--- trunk/src/mgr/stringmgr.cpp 2007-09-04 03:02:55 UTC (rev 2074)
+++ trunk/src/mgr/stringmgr.cpp 2007-09-04 07:10:14 UTC (rev 2075)
@@ -48,7 +48,71 @@
~__staticsystemStringMgr() { if (StringMgr::systemStringMgr) delete StringMgr::systemStringMgr; StringMgr::systemStringMgr = 0; }
} _staticsystemStringMgr;
+/**
+ * Determine whether the string contains a valid unicode sequence. The following table gives the pattern of a valid UTF-8 character.
+ * Unicode Range           1st       2nd       3rd       4th       5th       6th
+ * U-00000000 - U-0000007F 0nnnnnnn
+ * U-00000080 - U-000007FF 110nnnnn  10nnnnnn
+ * U-00000800 - U-0000FFFF 1110nnnn  10nnnnnn  10nnnnnn
+ * U-00010000 - U-001FFFFF 11110nnn  10nnnnnn  10nnnnnn  10nnnnnn
+ * U-00200000 - U-03FFFFFF 111110nn  10nnnnnn  10nnnnnn  10nnnnnn  10nnnnnn
+ * U-04000000 - U-7FFFFFFF 1111110n  10nnnnnn  10nnnnnn  10nnnnnn  10nnnnnn  10nnnnnn
+ * Note:
+ * The latest UTF-8 RFC allows for a max of 4 bytes. Earlier allowed 6.
+ * The number of bits of the leading byte before the first 0 is the total number of bytes
+ * The "n" are the bits of the unicode codepoint.
+ *
+ * This routine does not check to see if the code point is in the range. It could.
+ *
+ * @param txt the text to check
+ * @return 1 if all high order characters form a valid unicode sequence
+ * -1 if there are no high order characters
+ * 0 if there are high order characters that do not form a valid unicode sequence
+ * @author DM Smith [dmsmith555 at yahoo dot com]
+ */
+int isValidUTF8(unsigned char *txt) {
+	unsigned int countUTF8 = 0;
+	// A null pointer holds no high order characters at all.
+	if (!txt) {
+		return -1;
+	}
+
+	for (unsigned char *p = txt; *p; ++p) {
+		// Plain 7-bit ASCII needs no further inspection.
+		if (!(*p & 0x80)) {
+			continue;
+		}
+
+		// Count the leading 1 bits: this is the total number of bytes
+		// in the sequence, lead byte included.
+		unsigned char parts = 0;
+		for (unsigned char i = *p; i & 0x80; i <<= 1) {
+			parts++;
+		}
+
+		// 10nnnnnn is a stray continuation byte; 0xFE and 0xFF (7 or 8
+		// leading 1 bits) do not start any sequence in the table above.
+		if (parts == 1 || parts > 6) {
+			return 0;
+		}
+
+		// Each following byte must match 10nnnnnn. Advance the pointer
+		// (not the byte it points to) and mind that != binds tighter than &.
+		// A terminating NUL fails this test too, which catches truncation.
+		while (--parts) {
+			if ((*++p & 0xc0) != 0x80) {
+				return 0;
+			}
+		}
+		countUTF8++;
+	}
+
+	// At this point it is either UTF-8 or ascii
+	return countUTF8 ? 1 : -1;
+}
+
+
#ifdef _ICU_
//here comes our ICUStringMgr reimplementation
@@ -111,15 +175,33 @@
}
-/** Converts the param to an upper case Utf8 string
-* @param The text encoded in utf8 which should be turned into an upper case string
-*/
+/**
+ * This is a fallback method. It should never be called.
+ * If UTF8 support is desired, then a UTF8 StringMgr needs
+ * to be used.
+ *
+ * Here we just do our best.
+ *
+ * Converts the param to an upper case UTF8 string
+ * @param t - The text encoded in utf8 which should be turned into an upper case string
+ *
+ */
char *StringMgr::upperUTF8(char *t, unsigned int maxlen) const {
// try to decide if it's worth trying to toupper. Do we have more
// characters which are probably lower latin than not?
+ // we still don't use isValidUTF8 optimally. what if we have 1 unicode
+ // character in the string? should we not try to upper any of the string?
+ // dunno. Best solution is to upper all other characters. Don't have
+ // time to write that before release.
long performOp = 0;
- for (const char *ch = t; *ch; ch++)
- performOp += (*ch > 0) ? 1 : -1;
+ if (!isValidUTF8((unsigned char *)t)) {
+ performOp = 1;
+ }
+ else {
+ for (const char *ch = t; *ch; ch++) {
+ performOp += (*ch > 0) ? 1 : -1;
+ }
+ }
if (performOp > 0) {
return upperLatin1(t);
@@ -128,10 +210,12 @@
return t;
}
-/** Converts the param to an uppercase latin1 string
-* @param The text encoded in latin1 which should be turned into an upper case string
-*/
-char* StringMgr::upperLatin1(char* buf, unsigned int maxlen) const {
+
+/**
+ * Converts the param to an uppercase latin1 string
+ * @param The text encoded in latin1 which should be turned into an upper case string
+ */
+char *StringMgr::upperLatin1(char *buf, unsigned int maxlen) const {
if (!buf)
return 0;
More information about the sword-cvs
mailing list