[sword-svn] r2141 - trunk/utilities
dmsmith at www.crosswire.org
dmsmith at www.crosswire.org
Fri Feb 29 13:01:01 MST 2008
Author: dmsmith
Date: 2008-02-29 13:01:00 -0700 (Fri, 29 Feb 2008)
New Revision: 2141
Modified:
trunk/utilities/osis2mod.cpp
Log:
added utf-8 detection and utf-8 conversion to osis2mod, making that the default behavior
Modified: trunk/utilities/osis2mod.cpp
===================================================================
--- trunk/utilities/osis2mod.cpp 2008-02-29 19:07:53 UTC (rev 2140)
+++ trunk/utilities/osis2mod.cpp 2008-02-29 20:01:00 UTC (rev 2141)
@@ -25,6 +25,7 @@
#ifdef _ICU_
#include <utf8nfc.h>
+#include <latin1utf8.h>
#endif
//#define DEBUG
@@ -46,6 +47,10 @@
#ifdef _ICU_
UTF8NFC normalizer;
+int normalized = 0;
+
+Latin1UTF8 converter;
+int converted = 0;
#endif
SWText *module = 0;
@@ -77,6 +82,80 @@
return match;
}
+
+/**
+ * Determine whether a NUL-terminated string is structurally valid UTF-8.
+ * The following table gives the bit pattern of a valid UTF-8 character.
+ *   Unicode Range             1st      2nd      3rd      4th
+ *   U-00000000 - U-0000007F   0nnnnnnn
+ *   U-00000080 - U-000007FF   110nnnnn 10nnnnnn
+ *   U-00000800 - U-0000FFFF   1110nnnn 10nnnnnn 10nnnnnn
+ *   U-00010000 - U-001FFFFF   11110nnn 10nnnnnn 10nnnnnn 10nnnnnn
+ * Note:
+ *   1. The latest UTF-8 RFC allows for a max of 4 bytes.
+ *      Earlier versions allowed 6.
+ *   2. The number of 1 bits in the leading byte before the first 0
+ *      is the total number of bytes in the character.
+ *   3. The "n" are the bits of the unicode codepoint.
+ * Only these bit patterns are checked: overlong forms, surrogates and
+ * code points above U+10FFFF are NOT rejected. txt must not be NULL.
+ *
+ * param txt the NUL-terminated text to check
+ * return 1 if all high order characters form a valid unicode sequence
+ *        -1 if there are no high order characters.
+ *           Note: this is also a valid unicode sequence
+ *        0 if there are high order characters that do not form
+ *          a valid unicode sequence
+ * author DM Smith
+ */
+int detectUTF8(const char *txt) {
+ unsigned int countUTF8 = 0;
+ int count = 0;
+
+ // View the bytes as unsigned so masking and shifting behave predictably
+ const unsigned char *p = (const unsigned char*) txt;
+ while (*p) {
+ // High order bit set means this byte is not 7-bit ASCII
+ if (*p & 0x80) {
+ // Count the leading 1 bits of this byte by shifting a copy left
+ // until the top bit is clear. For a lead byte that count is the
+ // total number of bytes in the encoded character.
+ unsigned char i = *p;
+ for (count = 0; i & 0x80; count++) {
+ i <<= 1;
+ }
+
+ // Validate count:
+ // Count 0: cannot happen here, the high order bit is known to be set
+ // Count 1: is a pattern of 10nnnnnn, a continuation byte,
+ // which does not signal the start of a unicode character
+ // Count 5 to 8: 111110nn, 1111110n and 11111110 and 11111111
+ // are not legal starts, either
+ if (count < 2 || count > 4) return 0;
+
+ // Expect (count - 1) following bytes of the pattern 10nnnnnn.
+ // --count is tested first, so p stops on the last continuation byte.
+ while (--count && *++p) {
+ // Each following byte must be 10nnnnnn,
+ // so compare only the top 2 bits.
+ if ((0xc0 & *p) != 0x80) return 0;
+ }
+
+ // Nonzero count here means the NUL arrived mid-character: not UTF-8
+ if (count) return 0;
+
+ // We have a structurally valid multi-byte character, so count it
+ countUTF8++;
+ }
+
+ // Step past the ASCII byte or the last continuation byte just checked
+ p++;
+ }
+
+ // No invalid sequence was found: 1 if any multi-byte character was seen, else -1
+ return countUTF8 ? 1 : -1;
+}
+
// This routine converts an osisID or osisRef into one that SWORD can parse into a verse list
// An osisRef is made up of:
// a single osisID
@@ -260,8 +339,33 @@
}
#ifdef _ICU_
+ int utf8State = detectUTF8(activeVerseText.c_str());
if (normalize) {
- normalizer.processText(activeVerseText, (SWKey *)2); // note the hack of 2 to mimic a real key. TODO: remove all hacks
+ // Don't need to normalize text that is ASCII
+ // But assume other non-UTF-8 text is Latin1 (cp1252) and convert it to UTF-8
+ if (!utf8State) {
+ cout << "Warning: " << activeOsisID << ": Converting to UTF-8 (" << activeVerseText << ")" << endl;
+ converter.processText(activeVerseText, (SWKey *)2); // note the hack of 2 to mimic a real key. TODO: remove all hacks
+ converted++;
+
+ // Prepare for double check. This probably can be removed.
+ // But for now we are running the check again.
+ // This is to determine whether we need to normalize output of the conversion.
+ utf8State = detectUTF8(activeVerseText.c_str());
+ }
+
+ // Double check. This probably can be removed.
+ if (!utf8State) {
+ cout << "Error: " << activeOsisID << ": Converting to UTF-8 (" << activeVerseText << ")" << endl;
+ }
+
+ if (utf8State > 0) {
+ SWBuf before = activeVerseText;
+ normalizer.processText(activeVerseText, (SWKey *)2); // note the hack of 2 to mimic a real key. TODO: remove all hacks
+ if (before != activeVerseText) {
+ normalized++;
+ }
+ }
}
#endif
@@ -778,8 +882,8 @@
fprintf(stderr, "\t\t\t\t 2 - verse; 3 - chapter; 4 - book\n");
fprintf(stderr, " -c <cipher_key>\t encipher module using supplied key\n");
fprintf(stderr, "\t\t\t\t (default no enciphering)\n");
- fprintf(stderr, " -N\t\t\t Do not normalize UTF-8 to NFC\n");
- fprintf(stderr, "\t\t\t\t Note: assumes text is UTF-8\n");
+ fprintf(stderr, " -N\t\t\t Do not convert UTF-8 or normalize UTF-8 to NFC\n");
+ fprintf(stderr, "\t\t\t\t (default is to convert to UTF-8, if needed, and then normalize to NFC");
fprintf(stderr, "\t\t\t\t Note: all UTF-8 texts should be normalized to NFC\n");
exit(-1);
}
@@ -977,5 +1081,8 @@
if (cipherFilter)
delete cipherFilter;
infile.close();
+
+ if (converted) fprintf(stderr, "osis2mod converted %d verses to UTF-8\n", converted);
+ if (normalized) fprintf(stderr, "osis2mod normalized %d verses to NFC\n", normalized);
}
More information about the sword-cvs
mailing list