[sword-svn] r3082 - in trunk: include src/modules/filters
chrislit at crosswire.org
chrislit at crosswire.org
Wed Mar 5 14:49:39 MST 2014
Author: chrislit
Date: 2014-03-05 14:49:39 -0700 (Wed, 05 Mar 2014)
New Revision: 3082
Modified:
trunk/include/scsuutf8.h
trunk/src/modules/filters/scsuutf8.cpp
Log:
first pass attempt at SCSU to UTF-8 conversion
Modified: trunk/include/scsuutf8.h
===================================================================
--- trunk/include/scsuutf8.h 2014-03-05 19:52:08 UTC (rev 3081)
+++ trunk/include/scsuutf8.h 2014-03-05 21:49:39 UTC (rev 3082)
@@ -25,16 +25,31 @@
#include <swfilter.h>
+#ifdef _ICU_
+#include <unicode/utypes.h>
+#include <unicode/ucnv.h>
+#include <unicode/uchar.h>
+#endif
+
SWORD_NAMESPACE_START
/** This filter converts SCSU compressed (encoded) text to UTF-8
*/
class SWDLLEXPORT SCSUUTF8 : public SWFilter {
- unsigned long c, d;
- unsigned char* UTF8Output(unsigned long, unsigned char* text);
+private:
+#ifdef _ICU_
+ UConverter* scsuConv;
+ UConverter* utf8Conv;
+ UErrorCode err;
+#else
+ // without ICU, we'll attempt to use Roman Czyborra's SCSU decoder code
+ unsigned long c, d;
+ unsigned char* UTF8Output(unsigned long, unsigned char* text);
+#endif
public:
SCSUUTF8();
+ ~SCSUUTF8();
virtual char processText(SWBuf &text, const SWKey *key = 0, const SWModule *module = 0);
};
Modified: trunk/src/modules/filters/scsuutf8.cpp
===================================================================
--- trunk/src/modules/filters/scsuutf8.cpp 2014-03-05 19:52:08 UTC (rev 3081)
+++ trunk/src/modules/filters/scsuutf8.cpp 2014-03-05 21:49:39 UTC (rev 3082)
@@ -32,214 +32,241 @@
* in http://www.unicode.org/unicode/reports/tr6.html
*/
-#include <stdlib.h>
-#include <stdio.h>
-#include <swmodule.h>
-
#include <scsuutf8.h>
+#include <swbuf.h>
-
SWORD_NAMESPACE_START
SCSUUTF8::SCSUUTF8() {
+#ifdef _ICU_
+ // initialize SCSU converter
+ scsuConv = ucnv_open("SCSU", &err);
+ // initialize UTF-8 converter
+ utf8Conv = ucnv_open("UTF-8", &err);
+#endif
}
+SCSUUTF8::~SCSUUTF8() {
+#ifdef _ICU_
+ ucnv_close(scsuConv);
+ ucnv_close(utf8Conv);
+#endif
+}
+
+#ifndef _ICU_
unsigned char* SCSUUTF8::UTF8Output(unsigned long uchar, unsigned char* text)
{
- /* join UTF-16 surrogates without any pairing sanity checks */
+ /* join UTF-16 surrogates without any pairing sanity checks */
- static int d;
+ static int d;
- if (uchar >= 0xd800 && uchar <= 0xdbff) { d = uchar & 0x3f; return text; }
- if (uchar >= 0xdc00 && uchar <= 0xdfff) { uchar = uchar + 0x2400 + d * 0x400; }
+ if (uchar >= 0xd800 && uchar <= 0xdbff) { d = uchar & 0x3f; return text; }
+ if (uchar >= 0xdc00 && uchar <= 0xdfff) { uchar = uchar + 0x2400 + d * 0x400; }
- /* output one character as UTF-8 multibyte sequence */
+ /* output one character as UTF-8 multibyte sequence */
- if (uchar < 0x80) {
- *text++ = c;
- }
- else if (uchar < 0x800) {
- *text++ = 0xc0 | uchar >> 6;
- *text++ = 0x80 | (uchar & 0x3f);
- }
- else if (uchar < 0x10000) {
- *text++ = 0xe0 | uchar >> 12;
- *text++ = 0x80 | (uchar >> 6 & 0x3f);
- *text++ = 0x80 | (uchar & 0x3f);
- }
- else if (uchar < 0x200000) {
- *text++ = 0xf0 | uchar >> 18;
- *text++ = 0x80 | (uchar >> 12 & 0x3f);
- *text++ = 0x80 | (uchar >> 6 & 0x3f);
- *text++ = 0x80 | (uchar & 0x3f);
- }
-
- return text;
+ if (uchar < 0x80) {
+ *text++ = c;
+ }
+ else if (uchar < 0x800) {
+ *text++ = 0xc0 | uchar >> 6;
+ *text++ = 0x80 | (uchar & 0x3f);
+ }
+ else if (uchar < 0x10000) {
+ *text++ = 0xe0 | uchar >> 12;
+ *text++ = 0x80 | (uchar >> 6 & 0x3f);
+ *text++ = 0x80 | (uchar & 0x3f);
+ }
+ else if (uchar < 0x200000) {
+ *text++ = 0xf0 | uchar >> 18;
+ *text++ = 0x80 | (uchar >> 12 & 0x3f);
+ *text++ = 0x80 | (uchar >> 6 & 0x3f);
+ *text++ = 0x80 | (uchar & 0x3f);
+ }
+
+ return text;
}
+#endif
+char SCSUUTF8::processText(SWBuf &text, const SWKey *key, const SWModule *module) {
+#ifdef _ICU_
-char SCSUUTF8::processText(SWBuf &text, const SWKey *key, const SWModule *module) {
-/*
- unsigned char *to, *from;
- unsigned long buflen = len * FILTERPAD;
- char active = 0, mode = 0;
- if ((unsigned long)key < 2) // hack, we're en(1)/de(0)ciphering
+ if ((unsigned long)key < 2) // hack, we're en(1)/de(0)ciphering
return -1;
- static unsigned short start[8] = {0x0000,0x0080,0x0100,0x0300,0x2000,0x2080,0x2100,0x3000};
- static unsigned short slide[8] = {0x0080,0x00C0,0x0400,0x0600,0x0900,0x3040,0x30A0,0xFF00};
- static unsigned short win[256] = {
- 0x0000, 0x0080, 0x0100, 0x0180, 0x0200, 0x0280, 0x0300, 0x0380,
- 0x0400, 0x0480, 0x0500, 0x0580, 0x0600, 0x0680, 0x0700, 0x0780,
- 0x0800, 0x0880, 0x0900, 0x0980, 0x0A00, 0x0A80, 0x0B00, 0x0B80,
- 0x0C00, 0x0C80, 0x0D00, 0x0D80, 0x0E00, 0x0E80, 0x0F00, 0x0F80,
- 0x1000, 0x1080, 0x1100, 0x1180, 0x1200, 0x1280, 0x1300, 0x1380,
- 0x1400, 0x1480, 0x1500, 0x1580, 0x1600, 0x1680, 0x1700, 0x1780,
- 0x1800, 0x1880, 0x1900, 0x1980, 0x1A00, 0x1A80, 0x1B00, 0x1B80,
- 0x1C00, 0x1C80, 0x1D00, 0x1D80, 0x1E00, 0x1E80, 0x1F00, 0x1F80,
- 0x2000, 0x2080, 0x2100, 0x2180, 0x2200, 0x2280, 0x2300, 0x2380,
- 0x2400, 0x2480, 0x2500, 0x2580, 0x2600, 0x2680, 0x2700, 0x2780,
- 0x2800, 0x2880, 0x2900, 0x2980, 0x2A00, 0x2A80, 0x2B00, 0x2B80,
- 0x2C00, 0x2C80, 0x2D00, 0x2D80, 0x2E00, 0x2E80, 0x2F00, 0x2F80,
- 0x3000, 0x3080, 0x3100, 0x3180, 0x3200, 0x3280, 0x3300, 0x3800,
- 0xE000, 0xE080, 0xE100, 0xE180, 0xE200, 0xE280, 0xE300, 0xE380,
- 0xE400, 0xE480, 0xE500, 0xE580, 0xE600, 0xE680, 0xE700, 0xE780,
- 0xE800, 0xE880, 0xE900, 0xE980, 0xEA00, 0xEA80, 0xEB00, 0xEB80,
- 0xEC00, 0xEC80, 0xED00, 0xED80, 0xEE00, 0xEE80, 0xEF00, 0xEF80,
- 0xF000, 0xF080, 0xF100, 0xF180, 0xF200, 0xF280, 0xF300, 0xF380,
- 0xF400, 0xF480, 0xF500, 0xF580, 0xF600, 0xF680, 0xF700, 0xF780,
- 0xF800, 0xF880, 0xF900, 0xF980, 0xFA00, 0xFA80, 0xFB00, 0xFB80,
- 0xFC00, 0xFC80, 0xFD00, 0xFD80, 0xFE00, 0xFE80, 0xFF00, 0xFF80,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x00C0, 0x0250, 0x0370, 0x0530, 0x3040, 0x30A0, 0xFF60
- };
+ err = U_ZERO_ERROR;
+ UnicodeString utf16Text(text.getRawData(), text.length(), scsuConv, err);
+ err = U_ZERO_ERROR;
+ int32_t len = utf16Text.extract(text.getRawData(), text.size(), utf8Conv, err);
+ if (len > text.size()+1) {
+ text.setSize(len+1);
+ int32_t len = utf16Text.extract(text.getRawData(), text.size(), scsuConv, err);
+ }
- if (!len)
- return 0;
+#else
- memmove(&text[buflen - len], text, len);
- from = (unsigned char*)&text[buflen - len];
- to = (unsigned char *)text;
-
- // -------------------------------
-
- for (int i = 0; i < len;) {
-
-
- if (i >= len) break;
- c = from[i++];
-
- if (c >= 0x80)
- {
- to = UTF8Output (c - 0x80 + slide[active], to);
- }
- else if (c >= 0x20 && c <= 0x7F)
- {
- to = UTF8Output (c, to);
- }
- else if (c == 0x0 || c == 0x9 || c == 0xA || c == 0xC || c == 0xD)
- {
- to = UTF8Output (c, to);
- }
- else if (c >= 0x1 && c <= 0x8) // SQn
- {
+ unsigned char *to, *from;
+ unsigned long buflen = len * FILTERPAD;
+ char active = 0, mode = 0;
+ if ((unsigned long)key < 2) // hack, we're en(1)/de(0)ciphering
+ return -1;
+
+ static unsigned short start[8] = {0x0000,0x0080,0x0100,0x0300,0x2000,0x2080,0x2100,0x3000};
+ static unsigned short slide[8] = {0x0080,0x00C0,0x0400,0x0600,0x0900,0x3040,0x30A0,0xFF00};
+ static unsigned short win[256] = {
+ 0x0000, 0x0080, 0x0100, 0x0180, 0x0200, 0x0280, 0x0300, 0x0380,
+ 0x0400, 0x0480, 0x0500, 0x0580, 0x0600, 0x0680, 0x0700, 0x0780,
+ 0x0800, 0x0880, 0x0900, 0x0980, 0x0A00, 0x0A80, 0x0B00, 0x0B80,
+ 0x0C00, 0x0C80, 0x0D00, 0x0D80, 0x0E00, 0x0E80, 0x0F00, 0x0F80,
+ 0x1000, 0x1080, 0x1100, 0x1180, 0x1200, 0x1280, 0x1300, 0x1380,
+ 0x1400, 0x1480, 0x1500, 0x1580, 0x1600, 0x1680, 0x1700, 0x1780,
+ 0x1800, 0x1880, 0x1900, 0x1980, 0x1A00, 0x1A80, 0x1B00, 0x1B80,
+ 0x1C00, 0x1C80, 0x1D00, 0x1D80, 0x1E00, 0x1E80, 0x1F00, 0x1F80,
+ 0x2000, 0x2080, 0x2100, 0x2180, 0x2200, 0x2280, 0x2300, 0x2380,
+ 0x2400, 0x2480, 0x2500, 0x2580, 0x2600, 0x2680, 0x2700, 0x2780,
+ 0x2800, 0x2880, 0x2900, 0x2980, 0x2A00, 0x2A80, 0x2B00, 0x2B80,
+ 0x2C00, 0x2C80, 0x2D00, 0x2D80, 0x2E00, 0x2E80, 0x2F00, 0x2F80,
+ 0x3000, 0x3080, 0x3100, 0x3180, 0x3200, 0x3280, 0x3300, 0x3800,
+ 0xE000, 0xE080, 0xE100, 0xE180, 0xE200, 0xE280, 0xE300, 0xE380,
+ 0xE400, 0xE480, 0xE500, 0xE580, 0xE600, 0xE680, 0xE700, 0xE780,
+ 0xE800, 0xE880, 0xE900, 0xE980, 0xEA00, 0xEA80, 0xEB00, 0xEB80,
+ 0xEC00, 0xEC80, 0xED00, 0xED80, 0xEE00, 0xEE80, 0xEF00, 0xEF80,
+ 0xF000, 0xF080, 0xF100, 0xF180, 0xF200, 0xF280, 0xF300, 0xF380,
+ 0xF400, 0xF480, 0xF500, 0xF580, 0xF600, 0xF680, 0xF700, 0xF780,
+ 0xF800, 0xF880, 0xF900, 0xF980, 0xFA00, 0xFA80, 0xFB00, 0xFB80,
+ 0xFC00, 0xFC80, 0xFD00, 0xFD80, 0xFE00, 0xFE80, 0xFF00, 0xFF80,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x00C0, 0x0250, 0x0370, 0x0530, 0x3040, 0x30A0, 0xFF60
+ };
+
+ if (!len)
+ return 0;
+
+ memmove(&text[buflen - len], text, len);
+ from = (unsigned char*)&text[buflen - len];
+ to = (unsigned char *)text;
+
+ // -------------------------------
+
+ for (int i = 0; i < len;) {
+
+
if (i >= len) break;
- d = from[i++]; // single quote
-
- to = UTF8Output (d < 0x80 ? d + start [c - 0x1] :
- d - 0x80 + slide [c - 0x1], to);
- }
- else if (c >= 0x10 && c <= 0x17) // SCn
- {
- active = c - 0x10; // change window
- }
- else if (c >= 0x18 && c <= 0x1F) // SDn
- {
- active = c - 0x18; // define window
- if (i >= len) break;
- slide [active] = win [from[i++]];
- }
- else if (c == 0xB) // SDX
- {
- if (i >= len) break;
c = from[i++];
-
- if (i >= len) break;
- d = from[i++];
-
- slide [active = c>>5] = 0x10000 + (((c & 0x1F) << 8 | d) << 7);
- }
- else if (c == 0xE) // SQU
- {
- if (i >= len) break;
- c = from[i++]; // SQU
-
- if (i >= len) break;
- to = UTF8Output (c << 8 | from[i++], to);
+
+ if (c >= 0x80)
+ {
+ to = UTF8Output (c - 0x80 + slide[active], to);
}
- else if (c == 0xF) // SCU
- {
- mode = 1; // change to Unicode mode
-
- while (mode)
- {
- if (i >= len) break;
- c = from[i++];
-
- if (c <= 0xDF || c >= 0xF3)
+ else if (c >= 0x20 && c <= 0x7F)
{
- if (i >= len) break;
- to = UTF8Output (c << 8 | from[i++], to);
+ to = UTF8Output (c, to);
}
- else if (c == 0xF0) // UQU
+ else if (c == 0x0 || c == 0x9 || c == 0xA || c == 0xC || c == 0xD)
{
- if (i >= len) break;
- c = from[i++];
-
- if (i >= len) break;
- to = UTF8Output (c << 8 | from[i++], to);
+ to = UTF8Output (c, to);
}
- else if (c >= 0xE0 && c <= 0xE7) // UCn
+ else if (c >= 0x1 && c <= 0x8) // SQn
{
- active = c - 0xE0; mode = 0;
+ if (i >= len) break;
+ d = from[i++]; // single quote
+
+ to = UTF8Output (d < 0x80 ? d + start [c - 0x1] :
+ d - 0x80 + slide [c - 0x1], to);
}
- else if (c >= 0xE8 && c <= 0xEF) // UDn
+ else if (c >= 0x10 && c <= 0x17) // SCn
{
- if (i >= len) break;
- slide [active=c-0xE8] = win [from[i++]]; mode = 0;
+ active = c - 0x10; // change window
}
- else if (c == 0xF1) // UDX
+ else if (c >= 0x18 && c <= 0x1F) // SDn
{
- if (i >= len) break;
- c = from[i++];
-
- if (i >= len) break;
- d = from[i++];
-
- slide [active = c>>5] =
- 0x10000 + (((c & 0x1F) << 8 | d) << 7); mode = 0;
+ active = c - 0x18; // define window
+ if (i >= len) break;
+ slide [active] = win [from[i++]];
}
- }
+ else if (c == 0xB) // SDX
+ {
+ if (i >= len) break;
+ c = from[i++];
+
+ if (i >= len) break;
+ d = from[i++];
+
+ slide [active = c>>5] = 0x10000 + (((c & 0x1F) << 8 | d) << 7);
+ }
+ else if (c == 0xE) // SQU
+ {
+ if (i >= len) break;
+ c = from[i++]; // SQU
+
+ if (i >= len) break;
+ to = UTF8Output (c << 8 | from[i++], to);
+ }
+ else if (c == 0xF) // SCU
+ {
+ mode = 1; // change to Unicode mode
+
+ while (mode)
+ {
+ if (i >= len) break;
+ c = from[i++];
+
+ if (c <= 0xDF || c >= 0xF3)
+ {
+ if (i >= len) break;
+ to = UTF8Output (c << 8 | from[i++], to);
+ }
+ else if (c == 0xF0) // UQU
+ {
+ if (i >= len) break;
+ c = from[i++];
+
+ if (i >= len) break;
+ to = UTF8Output (c << 8 | from[i++], to);
+ }
+ else if (c >= 0xE0 && c <= 0xE7) // UCn
+ {
+ active = c - 0xE0; mode = 0;
+ }
+ else if (c >= 0xE8 && c <= 0xEF) // UDn
+ {
+ if (i >= len) break;
+ slide [active=c-0xE8] = win [from[i++]]; mode = 0;
+ }
+ else if (c == 0xF1) // UDX
+ {
+ if (i >= len) break;
+ c = from[i++];
+
+ if (i >= len) break;
+ d = from[i++];
+
+ slide [active = c>>5] =
+ 0x10000 + (((c & 0x1F) << 8 | d) << 7); mode = 0;
+ }
+ }
+ }
+
+
}
+
+ *to++ = 0;
+ *to = 0;
-
- }
-
- *to++ = 0;
- *to = 0;
-*/
- return 0;
+#endif
+
+ return 0;
}
More information about the sword-cvs
mailing list