[sword-svn] r2846 - in trunk: cmake include lib/vcppmake/vc8 src/mgr src/modules/filters

chrislit at crosswire.org chrislit at crosswire.org
Sat Jun 29 07:48:48 MST 2013


Author: chrislit
Date: 2013-06-29 07:48:47 -0700 (Sat, 29 Jun 2013)
New Revision: 2846

Added:
   trunk/include/scsuutf8.h
   trunk/src/modules/filters/scsuutf8.cpp
Modified:
   trunk/cmake/sources.cmake
   trunk/include/Makefile.am
   trunk/include/defs.h
   trunk/include/encfiltmgr.h
   trunk/include/swmodule.h
   trunk/lib/vcppmake/vc8/libsword.vcxproj
   trunk/src/mgr/encfiltmgr.cpp
   trunk/src/mgr/swmgr.cpp
   trunk/src/modules/filters/Makefile.am
Log:
added SCSUUTF8 back to the library after evaluating SCSU 'compression' on a few texts (This does not yet work and merely returns the SCSU-code to pre-r2256 status.)

Modified: trunk/cmake/sources.cmake
===================================================================
--- trunk/cmake/sources.cmake	2013-06-29 12:08:05 UTC (rev 2845)
+++ trunk/cmake/sources.cmake	2013-06-29 14:48:47 UTC (rev 2846)
@@ -126,6 +126,8 @@
 	src/modules/filters/utf16utf8.cpp
 	src/modules/filters/utf8html.cpp
 	src/modules/filters/utf8latin1.cpp
+	src/modules/filters/unicodertf.cpp
+	src/modules/filters/scsuutf8.cpp
 
 	src/modules/filters/utf8cantillation.cpp
 	src/modules/filters/utf8hebrewpoints.cpp
@@ -136,7 +138,6 @@
 
 	src/modules/filters/rtfhtml.cpp
 	src/modules/filters/greeklexattribs.cpp
-	src/modules/filters/unicodertf.cpp
 	src/modules/filters/papyriplain.cpp
 
 	src/modules/genbook/swgenbook.cpp
@@ -315,6 +316,7 @@
 	include/roman.h
 	include/rtfhtml.h
 	include/sapphire.h
+	include/scsuutf8.h
 	include/strkey.h
 	include/swbasicfilter.h
 	include/swbuf.h

Modified: trunk/include/Makefile.am
===================================================================
--- trunk/include/Makefile.am	2013-06-29 12:08:05 UTC (rev 2845)
+++ trunk/include/Makefile.am	2013-06-29 14:48:47 UTC (rev 2846)
@@ -87,6 +87,7 @@
 pkginclude_HEADERS += $(swincludedir)/roman.h
 pkginclude_HEADERS += $(swincludedir)/rtfhtml.h
 pkginclude_HEADERS += $(swincludedir)/sapphire.h
+pkginclude_HEADERS += $(swincludedir)/scsuutf8.h
 pkginclude_HEADERS += $(swincludedir)/strkey.h
 pkginclude_HEADERS += $(swincludedir)/swbasicfilter.h
 pkginclude_HEADERS += $(swincludedir)/swbuf.h

Modified: trunk/include/defs.h
===================================================================
--- trunk/include/defs.h	2013-06-29 12:08:05 UTC (rev 2845)
+++ trunk/include/defs.h	2013-06-29 14:48:47 UTC (rev 2846)
@@ -156,7 +156,7 @@
 
 enum {DIRECTION_LTR = 0, DIRECTION_RTL, DIRECTION_BIDI};
 enum {FMT_UNKNOWN = 0, FMT_PLAIN, FMT_THML, FMT_GBF, FMT_HTML, FMT_HTMLHREF, FMT_RTF, FMT_OSIS, FMT_WEBIF, FMT_TEI, FMT_XHTML};
-enum {ENC_UNKNOWN = 0, ENC_LATIN1, ENC_UTF8, ENC_UTF16, ENC_RTF, ENC_HTML};
+enum {ENC_UNKNOWN = 0, ENC_LATIN1, ENC_UTF8, ENC_SCSU, ENC_UTF16, ENC_RTF, ENC_HTML};
 
 SWORD_NAMESPACE_END
 #endif //SWORDDEFS_H

Modified: trunk/include/encfiltmgr.h
===================================================================
--- trunk/include/encfiltmgr.h	2013-06-29 12:08:05 UTC (rev 2845)
+++ trunk/include/encfiltmgr.h	2013-06-29 14:48:47 UTC (rev 2846)
@@ -40,6 +40,7 @@
 
 protected:
         SWFilter *latin1utf8;
+	SWFilter *scsuutf8;
         SWFilter *targetenc;
 
 

Added: trunk/include/scsuutf8.h
===================================================================
--- trunk/include/scsuutf8.h	                        (rev 0)
+++ trunk/include/scsuutf8.h	2013-06-29 14:48:47 UTC (rev 2846)
@@ -0,0 +1,42 @@
+/******************************************************************************
+ *
+ *  scsuutf8.h - SWFilter descendant to convert a SCSU character to UTF-8
+ *
+ * $Id$
+ *
+ * Copyright 2001-2013 CrossWire Bible Society (http://www.crosswire.org)
+ *	CrossWire Bible Society
+ *	P. O. Box 2528
+ *	Tempe, AZ  85280-2528
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ */
+
+#ifndef SCSUUTF8_H
+#define SCSUUTF8_H
+
+#include <swfilter.h>
+
+SWORD_NAMESPACE_START
+
+/** This filter converts SCSU compressed (encoded) text to UTF-8.
+ *  NOTE(review): processText() is a stub in this revision (decoder body is commented out). */
+class SWDLLEXPORT SCSUUTF8 : public SWFilter {
+  unsigned long c, d;  // decoder scratch state; presumably the current input byte and its operand byte -- confirm when the decoder is re-enabled
+  unsigned char* UTF8Output(unsigned long, unsigned char* text);  // writes one character as UTF-8 at text; returns the advanced pointer
+  
+public:
+	SCSUUTF8();
+	virtual char processText(SWBuf &text, const SWKey *key = 0, const SWModule *module = 0);
+};
+
+SWORD_NAMESPACE_END
+#endif


Property changes on: trunk/include/scsuutf8.h
___________________________________________________________________
Added: svn:keywords
   + Author Date Id
Added: svn:eol-style
   + native

Modified: trunk/include/swmodule.h
===================================================================
--- trunk/include/swmodule.h	2013-06-29 12:08:05 UTC (rev 2845)
+++ trunk/include/swmodule.h	2013-06-29 14:48:47 UTC (rev 2846)
@@ -185,7 +185,7 @@
 	/**
 	 * @return  True if this module is encoded in Unicode, otherwise returns false.
 	 */
-	virtual bool isUnicode() const { return (encoding == (char)ENC_UTF8); }
+	virtual bool isUnicode() const { return (encoding == (char)ENC_UTF8 || encoding == (char)ENC_SCSU); }
 
 	// These methods are useful for modules that come from a standard SWORD install (most do).
 	// SWMgr will call setConfig.  The user may use getConfig and getConfigEntry (if they

Modified: trunk/lib/vcppmake/vc8/libsword.vcxproj
===================================================================
--- trunk/lib/vcppmake/vc8/libsword.vcxproj	2013-06-29 12:08:05 UTC (rev 2845)
+++ trunk/lib/vcppmake/vc8/libsword.vcxproj	2013-06-29 14:48:47 UTC (rev 2846)
@@ -1,4 +1,4 @@
-<?xml version="1.0" encoding="utf-8"?>
+<?xml version="1.0" encoding="utf-8"?>
 <Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
   <ItemGroup Label="ProjectConfigurations">
     <ProjectConfiguration Include="Debug with ICU|Win32">
@@ -257,6 +257,7 @@
     <ClCompile Include="..\..\..\src\modules\filters\osisenum.cpp" />
     <ClCompile Include="..\..\..\src\modules\filters\osisglosses.cpp" />
     <ClCompile Include="..\..\..\src\modules\filters\osisxlit.cpp" />
+    <ClCompile Include="..\..\..\src\modules\filters\scsuutf8.cpp" />
     <ClCompile Include="..\..\..\src\utilfuns\win32\dirent.cpp" />
     <ClCompile Include="..\..\..\src\modules\tests\echomod.cpp" />
     <ClCompile Include="..\..\..\src\mgr\encfiltmgr.cpp" />
@@ -499,6 +500,7 @@
     <ClInclude Include="..\..\..\include\roman.h" />
     <ClInclude Include="..\..\..\include\rtfhtml.h" />
     <ClInclude Include="..\..\..\include\sapphire.h" />
+    <ClInclude Include="..\..\..\include\scsuutf8.h" />
     <ClInclude Include="..\..\..\include\stringmgr.h" />
     <ClInclude Include="..\..\..\include\strkey.h" />
     <ClInclude Include="..\..\..\include\swbasicfilter.h" />

Modified: trunk/src/mgr/encfiltmgr.cpp
===================================================================
--- trunk/src/mgr/encfiltmgr.cpp	2013-06-29 12:08:05 UTC (rev 2845)
+++ trunk/src/mgr/encfiltmgr.cpp	2013-06-29 14:48:47 UTC (rev 2846)
@@ -25,6 +25,7 @@
 #include <encfiltmgr.h>
 #include <utilstr.h>
 
+#include <scsuutf8.h>
 #include <latin1utf8.h>
 
 #include <unicodertf.h>
@@ -47,6 +48,7 @@
 EncodingFilterMgr::EncodingFilterMgr (char enc)
 		   : SWFilterMgr() {
 
+        scsuutf8 = new SCSUUTF8();
         latin1utf8 = new Latin1UTF8();
 
         encoding = enc;
@@ -73,6 +75,8 @@
  * EncodingFilterMgr Destructor - Cleans up instance of EncodingFilterMgr
  */
 EncodingFilterMgr::~EncodingFilterMgr() {
+        if (scsuutf8)
+                delete scsuutf8;
         if (latin1utf8)
                 delete latin1utf8;
         if (targetenc)
@@ -87,6 +91,9 @@
 	if (!encoding.length() || !stricmp(encoding.c_str(), "Latin-1")) {
                 module->addRawFilter(latin1utf8);
 	}
+	else if (!stricmp(encoding.c_str(), "SCSU")) {
+		module->AddRawFilter(scsuutf8);
+	}
 }
 
 void EncodingFilterMgr::AddEncodingFilters(SWModule *module, ConfigEntMap &section) {

Modified: trunk/src/mgr/swmgr.cpp
===================================================================
--- trunk/src/mgr/swmgr.cpp	2013-06-29 12:08:05 UTC (rev 2845)
+++ trunk/src/mgr/swmgr.cpp	2013-06-29 14:48:47 UTC (rev 2846)
@@ -895,7 +895,9 @@
 	else
 		markup = FMT_GBF;
 
-	if (!stricmp(encoding.c_str(), "UTF-8")) {
+	if (!stricmp(encoding.c_str(), "SCSU"))
+		enc = ENC_SCSU;
+	else if (!stricmp(encoding.c_str(), "UTF-8")) {
 		enc = ENC_UTF8;
 	}
 	else enc = ENC_LATIN1;

Modified: trunk/src/modules/filters/Makefile.am
===================================================================
--- trunk/src/modules/filters/Makefile.am	2013-06-29 12:08:05 UTC (rev 2845)
+++ trunk/src/modules/filters/Makefile.am	2013-06-29 14:48:47 UTC (rev 2846)
@@ -67,6 +67,8 @@
 libsword_la_SOURCES += $(filtersdir)/utf16utf8.cpp
 libsword_la_SOURCES += $(filtersdir)/utf8html.cpp
 libsword_la_SOURCES += $(filtersdir)/utf8latin1.cpp
+libsword_la_SOURCES += $(filtersdir)/unicodertf.cpp
+libsword_la_SOURCES += $(filtersdir)/scsuutf8.cpp
 
 libsword_la_SOURCES += $(filtersdir)/utf8cantillation.cpp
 libsword_la_SOURCES += $(filtersdir)/utf8hebrewpoints.cpp
@@ -77,7 +79,6 @@
 
 PLFIL = $(filtersdir)/rtfhtml.cpp
 PLFIL += $(filtersdir)/greeklexattribs.cpp
-PLFIL += $(filtersdir)/unicodertf.cpp
 PLFIL += $(filtersdir)/papyriplain.cpp
 
 

Added: trunk/src/modules/filters/scsuutf8.cpp
===================================================================
--- trunk/src/modules/filters/scsuutf8.cpp	                        (rev 0)
+++ trunk/src/modules/filters/scsuutf8.cpp	2013-06-29 14:48:47 UTC (rev 2846)
@@ -0,0 +1,242 @@
+/******************************************************************************
+ *
+ *  scsuutf8.cpp -	SWFilter descendant to convert a SCSU character to
+ *			UTF-8
+ *
+ * $Id$
+ *
+ * Copyright 2001-2013 CrossWire Bible Society (http://www.crosswire.org)
+ *	CrossWire Bible Society
+ *	P. O. Box 2528
+ *	Tempe, AZ  85280-2528
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ */
+
+/* This class is based on:
+ * http://czyborra.com/scsu/scsu.c written by Roman Czyborra at dds.nl
+ * on Andrea's balcony in North Amsterdam on 1998-08-04
+ * Thanks to Richard Verhoeven <rcb5 at win.tue.nl> for his suggestion
+ * to correct the haphazard "if" after UQU to "else if" on 1998-10-01
+ * 
+ * This is a deflator to UTF-8 output for input compressed in SCSU,
+ * the (Reuters) Standard Compression Scheme for Unicode as described
+ * in http://www.unicode.org/unicode/reports/tr6.html
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <swmodule.h>
+
+#include <scsuutf8.h>
+
+SWORD_NAMESPACE_START
+
+/* Zero the decoder scratch members so no code path can read them uninitialized. */
+SCSUUTF8::SCSUUTF8() : c(0), d(0) {}
+
+
+/* Writes uchar at text as a UTF-8 sequence (1-4 bytes) and returns the advanced
+ * write pointer. uchar may be a UTF-16 code unit: a high surrogate is only
+ * remembered (nothing is written) and is combined with the next low surrogate.
+ * Values >= 0x200000 write nothing. Caller must provide room for 4 bytes. */
+unsigned char* SCSUUTF8::UTF8Output(unsigned long uchar, unsigned char* text)
+{
+  /* join UTF-16 surrogates without any pairing sanity checks */
+  static unsigned long lead;  /* low 10 bits of the pending high surrogate */
+  if (uchar >= 0xd800 && uchar <= 0xdbff) { lead = uchar & 0x3ff; return text; }
+  if (uchar >= 0xdc00 && uchar <= 0xdfff) { uchar = uchar + 0x2400 + lead * 0x400; }
+
+  /* output one character as UTF-8 multibyte sequence */
+  if (uchar < 0x80) {
+    *text++ = (unsigned char)uchar;  /* emit the argument, not the member c */
+  }
+  else if (uchar < 0x800) {
+    *text++ = (unsigned char)(0xc0 | (uchar >> 6));
+    *text++ = (unsigned char)(0x80 | (uchar & 0x3f));
+  }
+  else if (uchar < 0x10000) {
+    *text++ = (unsigned char)(0xe0 | (uchar >> 12));
+    *text++ = (unsigned char)(0x80 | (uchar >> 6 & 0x3f));
+    *text++ = (unsigned char)(0x80 | (uchar & 0x3f));
+  }
+  else if (uchar < 0x200000) {
+    *text++ = (unsigned char)(0xf0 | (uchar >> 18));
+    *text++ = (unsigned char)(0x80 | (uchar >> 12 & 0x3f));
+    *text++ = (unsigned char)(0x80 | (uchar >> 6 & 0x3f));
+    *text++ = (unsigned char)(0x80 | (uchar & 0x3f));
+  }
+  return text;
+}
+
+char SCSUUTF8::processText(SWBuf &text, const SWKey *key, const SWModule *module) {
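+/* DISABLED: unported scsu.c decoder (uses undeclared 'len' and treats 'text' as a raw buffer); until it is adapted to SWBuf this filter just returns 0.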
+  unsigned char *to, *from;
+  unsigned long buflen = len * FILTERPAD;
+  char active = 0, mode = 0;
+	 if ((unsigned long)key < 2)	// hack, we're en(1)/de(0)ciphering
+		return -1;
+
+  static unsigned short start[8] = {0x0000,0x0080,0x0100,0x0300,0x2000,0x2080,0x2100,0x3000};
+  static unsigned short slide[8] = {0x0080,0x00C0,0x0400,0x0600,0x0900,0x3040,0x30A0,0xFF00};
+  static unsigned short win[256]   = {
+    0x0000, 0x0080, 0x0100, 0x0180, 0x0200, 0x0280, 0x0300, 0x0380,
+    0x0400, 0x0480, 0x0500, 0x0580, 0x0600, 0x0680, 0x0700, 0x0780,
+    0x0800, 0x0880, 0x0900, 0x0980, 0x0A00, 0x0A80, 0x0B00, 0x0B80,
+    0x0C00, 0x0C80, 0x0D00, 0x0D80, 0x0E00, 0x0E80, 0x0F00, 0x0F80,
+    0x1000, 0x1080, 0x1100, 0x1180, 0x1200, 0x1280, 0x1300, 0x1380,
+    0x1400, 0x1480, 0x1500, 0x1580, 0x1600, 0x1680, 0x1700, 0x1780,
+    0x1800, 0x1880, 0x1900, 0x1980, 0x1A00, 0x1A80, 0x1B00, 0x1B80,
+    0x1C00, 0x1C80, 0x1D00, 0x1D80, 0x1E00, 0x1E80, 0x1F00, 0x1F80,
+    0x2000, 0x2080, 0x2100, 0x2180, 0x2200, 0x2280, 0x2300, 0x2380,
+    0x2400, 0x2480, 0x2500, 0x2580, 0x2600, 0x2680, 0x2700, 0x2780,
+    0x2800, 0x2880, 0x2900, 0x2980, 0x2A00, 0x2A80, 0x2B00, 0x2B80,
+    0x2C00, 0x2C80, 0x2D00, 0x2D80, 0x2E00, 0x2E80, 0x2F00, 0x2F80,
+    0x3000, 0x3080, 0x3100, 0x3180, 0x3200, 0x3280, 0x3300, 0x3800,
+    0xE000, 0xE080, 0xE100, 0xE180, 0xE200, 0xE280, 0xE300, 0xE380,
+    0xE400, 0xE480, 0xE500, 0xE580, 0xE600, 0xE680, 0xE700, 0xE780,
+    0xE800, 0xE880, 0xE900, 0xE980, 0xEA00, 0xEA80, 0xEB00, 0xEB80,
+    0xEC00, 0xEC80, 0xED00, 0xED80, 0xEE00, 0xEE80, 0xEF00, 0xEF80,
+    0xF000, 0xF080, 0xF100, 0xF180, 0xF200, 0xF280, 0xF300, 0xF380,
+    0xF400, 0xF480, 0xF500, 0xF580, 0xF600, 0xF680, 0xF700, 0xF780,
+    0xF800, 0xF880, 0xF900, 0xF980, 0xFA00, 0xFA80, 0xFB00, 0xFB80,
+    0xFC00, 0xFC80, 0xFD00, 0xFD80, 0xFE00, 0xFE80, 0xFF00, 0xFF80,
+    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+    0x0000, 0x00C0, 0x0250, 0x0370, 0x0530, 0x3040, 0x30A0, 0xFF60
+  };
+
+  if (!len)
+	   return 0;
+
+  memmove(&text[buflen - len], text, len);
+  from = (unsigned char*)&text[buflen - len];
+  to = (unsigned char *)text;
+
+  // -------------------------------
+
+  for (int i = 0; i < len;) {
+
+
+	 if (i >= len) break;
+	 c = from[i++];
+
+	 if (c >= 0x80)
+	{
+	  to = UTF8Output (c - 0x80 + slide[active], to);
+	}
+	 else if (c >= 0x20 && c <= 0x7F)
+	{
+	  to = UTF8Output (c, to);
+	}
+	 else if (c == 0x0 || c == 0x9 || c == 0xA || c == 0xC || c == 0xD)
+	{
+	  to = UTF8Output (c, to);
+	}
+	 else if (c >= 0x1 && c <= 0x8) // SQn
+	{
+		if (i >= len) break;
+	  d = from[i++]; // single quote
+
+	  to = UTF8Output (d < 0x80 ? d + start [c - 0x1] :
+		  d - 0x80 + slide [c - 0x1], to);
+	}
+	 else if (c >= 0x10 && c <= 0x17) // SCn
+	{
+	  active = c - 0x10; // change window
+	}
+	 else if (c >= 0x18 && c <= 0x1F) // SDn
+	{
+	  active = c - 0x18;  // define window
+		if (i >= len) break;
+	  slide [active] = win [from[i++]];
+	}
+	 else if (c == 0xB) // SDX
+	{
+		if (i >= len) break;
+		c = from[i++];
+
+		if (i >= len) break;
+		d = from[i++];
+
+	  slide [active = c>>5] = 0x10000 + (((c & 0x1F) << 8 | d) << 7);
+	}
+	 else if (c == 0xE) // SQU
+	{
+		if (i >= len) break;
+	  c = from[i++]; // SQU
+
+		if (i >= len) break;
+		to = UTF8Output (c << 8 | from[i++], to);
+		}
+	 else if (c == 0xF) // SCU
+	{
+	  mode = 1; // change to Unicode mode
+
+	  while (mode)
+	    {
+		    if (i >= len) break;
+		 c = from[i++];
+
+		 if (c <= 0xDF || c >= 0xF3)
+		{
+			   if (i >= len) break;
+		  to = UTF8Output (c << 8 | from[i++], to);
+		}
+		 else if (c == 0xF0) // UQU
+		{
+			   if (i >= len) break;
+		  c = from[i++];
+
+			   if (i >= len) break;
+			   to = UTF8Output (c << 8 | from[i++], to);
+		}
+		 else if (c >= 0xE0 && c <= 0xE7) // UCn
+		{
+		  active = c - 0xE0; mode = 0;
+		}
+		 else if (c >= 0xE8 && c <= 0xEF) // UDn
+		{
+			   if (i >= len) break;
+		  slide [active=c-0xE8] = win [from[i++]]; mode = 0;
+		}
+		 else if (c == 0xF1) // UDX
+		{
+			   if (i >= len) break;
+		  c = from[i++];
+
+			   if (i >= len) break;
+			   d = from[i++];
+
+		  slide [active = c>>5] =
+		    0x10000 + (((c & 0x1F) << 8 | d) << 7); mode = 0;
+		}
+	    }
+	}
+
+
+  }
+
+  *to++ = 0;
+  *to = 0;
+*/
+  return 0;
+}
+
+SWORD_NAMESPACE_END


Property changes on: trunk/src/modules/filters/scsuutf8.cpp
___________________________________________________________________
Added: svn:keywords
   + Author Date Id
Added: svn:eol-style
   + native




More information about the sword-cvs mailing list