[sword-svn] r3125 - trunk/utilities
chrislit at crosswire.org
chrislit at crosswire.org
Fri Mar 14 04:31:58 MST 2014
Author: chrislit
Date: 2014-03-14 04:31:58 -0700 (Fri, 14 Mar 2014)
New Revision: 3125
Modified:
trunk/utilities/osis2mod.cpp
Log:
This should address cases where the output encoding cannot simply be concatenated, and may slightly improve SCSU encoding compression
Modified: trunk/utilities/osis2mod.cpp
===================================================================
--- trunk/utilities/osis2mod.cpp 2014-03-14 10:57:10 UTC (rev 3124)
+++ trunk/utilities/osis2mod.cpp 2014-03-14 11:31:58 UTC (rev 3125)
@@ -60,8 +60,10 @@
#include <utf8nfc.h>
#include <latin1utf8.h>
#include <utf8scsu.h>
+#include <scsuutf8.h>
#endif
#include <utf8utf16.h>
+#include <utf16utf8.h>
#ifndef NO_SWORD_NAMESPACE
using namespace sword;
@@ -92,7 +94,8 @@
UTF8NFC normalizer;
Latin1UTF8 converter;
#endif
-SWFilter* outputConverter;
+SWFilter* outputEncoder;
+SWFilter* outputDecoder;
int normalized = 0;
int converted = 0;
@@ -539,20 +542,26 @@
}
// If the desired output encoding is non-UTF-8, convert to that encoding
- if (outputConverter) {
- outputConverter->processText(activeVerseText, (SWKey *)2); // note the hack of 2 to mimic a real key. TODO: remove all hacks
+ if (outputEncoder) {
+ outputEncoder->processText(activeVerseText, (SWKey *)2); // note the hack of 2 to mimic a real key. TODO: remove all hacks
}
// If the entry already exists, then append this entry to the text.
// This is for verses that are outside the chosen versification. They are appended to the prior verse.
// The space should not be needed if we retained verse tags.
- // TODO: in the case of SCSU output, very slightly better compression might be
- // achieved by decoding the currentText & activeVerseText, concatenating them,
- // and re-encoding them as SCSU
SWBuf currentText = module->getRawEntry();
if (currentText.length()) {
cout << "INFO(WRITE): Appending entry: " << currentVerse.getOSISRef() << ": " << activeVerseText << endl;
+
+ // If we have a non-UTF-8 encoding, decode both texts back to UTF-8 before concatenating, then re-encode the result
+ if (outputDecoder) {
+ outputDecoder->processText(activeVerseText, (SWKey *)2);
+ outputDecoder->processText(currentText, (SWKey *)2);
+ }
activeVerseText = currentText + " " + activeVerseText;
+ if (outputEncoder) {
+ outputEncoder->processText(activeVerseText, (SWKey *)2); // re-encode the combined UTF-8 text; this must be the encoder, not the decoder
+ }
}
if (debug & DEBUG_WRITE) {
@@ -1637,12 +1646,23 @@
else if (!strcmp(argv[i], "-e")) {
if (i+1 < argc) {
switch (argv[++i][0]) {
- case '1': outputConverter = NULL; break; // leave as UTF-8
- case '2': outputConverter = new UTF8UTF16(); break;
+ case '1': // leave as UTF-8
+ outputEncoder = NULL;
+ outputDecoder = NULL;
+ break;
+ case '2':
+ outputEncoder = new UTF8UTF16();
+ outputDecoder = new UTF16UTF8();
+ break;
#ifdef _ICU_
- case 's': outputConverter = new UTF8SCSU(); break;
+ case 's':
+ outputEncoder = new UTF8SCSU();
+ outputDecoder = new SCSUUTF8();
+ break;
#endif
- default: outputConverter = NULL;
+ default:
+ outputEncoder = NULL;
+ outputDecoder = NULL;
}
}
}
@@ -1830,8 +1850,10 @@
delete module;
if (cipherFilter)
delete cipherFilter;
- if (outputConverter)
- delete outputConverter;
+ if (outputEncoder)
+ delete outputEncoder;
+ if (outputDecoder)
+ delete outputDecoder;
fprintf(stderr, "SUCCESS: %s: has finished its work and will now rest\n", program);
exit(0); // success
More information about the sword-cvs
mailing list