[sword-devel] patch

Martin Gruner sword-devel@crosswire.org
Sun, 18 Jan 2004 16:30:36 +0100


--Boundary-00=_caqCAALvtLUnOD9
Content-Type: text/plain;
  charset="us-ascii"
Content-Transfer-Encoding: 7bit
Content-Disposition: inline

Hi,

I tried to work out how UTF-8 locales can work in sword. I noticed that there
are two upper-casing functions in sword: toupperstr(), which only works on
Latin-1, and toupperstr_utf8(), which can use ICU.
I therefore created a patch to handle UTF-8 consistently in sword by
removing toupperstr() (it is commented out for now) and patching
toupperstr_utf8() a little. Please look through it and see if this is OK.

It should work; my only problem is that ICU does not perform toUpper()
correctly on my system right now, and I am not sure why: it leaves the string
unchanged. Chris, can you help me here? Does it work for you?

Thanks for all feedback.

Martin

--Boundary-00=_caqCAALvtLUnOD9
Content-Type: text/x-diff;
  charset="us-ascii";
  name="toupper.patch"
Content-Transfer-Encoding: 7bit
Content-Disposition: attachment;
	filename="toupper.patch"

Index: debian/rules
===================================================================
RCS file: /cvs/core/sword/debian/rules,v
retrieving revision 1.7
diff -u -3 -p -u -r1.7 rules
--- debian/rules	17 Jan 2004 21:21:13 -0000	1.7
+++ debian/rules	18 Jan 2004 15:26:04 -0000
@@ -34,7 +34,7 @@ configure-stamp:
 	chmod 755 configure
 	./configure $(confflags) --prefix=/usr --mandir=\$${prefix}/share/man \
 		--infodir=\$${prefix}/share/info --with-zlib \
-		--sysconfdir=/etc --enable-shared --without-icu \
+		--sysconfdir=/etc --enable-shared --with-icu \
 		--without-lucene
 	touch configure.stamp
 
Index: include/utilstr.h
===================================================================
RCS file: /cvs/core/sword/include/utilstr.h,v
retrieving revision 1.11
diff -u -3 -p -u -r1.11 utilstr.h
--- include/utilstr.h	22 Jun 2003 23:50:23 -0000	1.11
+++ include/utilstr.h	18 Jan 2004 15:26:04 -0000
@@ -33,7 +33,7 @@ char *strstrip (char *istr);
 const char *stristr (const char *s1, const char *s2);
 const char strnicmp(const char *s1, const char *s2, int len);
 unsigned int strlenw(const char *s1);
-char *toupperstr(char *buf);
+//char *toupperstr(char *buf);
 char *toupperstr_utf8(char *buf, unsigned int max = 0);
 
 /*
Index: src/keys/versekey.cpp
===================================================================
RCS file: /cvs/core/sword/src/keys/versekey.cpp,v
retrieving revision 1.58
diff -u -3 -p -u -r1.58 versekey.cpp
--- src/keys/versekey.cpp	27 Jun 2003 01:41:07 -0000	1.58
+++ src/keys/versekey.cpp	18 Jan 2004 15:26:05 -0000
@@ -324,7 +324,7 @@ int VerseKey::getBookAbbrev(const char *
 		stdstr(&abbr, iabbr);
 		strstrip(abbr);
 		if (!i)
-			toupperstr(abbr);
+			toupperstr_utf8(abbr);
 		abLen = strlen(abbr);
 
 		if (abLen) {
Index: src/modules/filters/swbasicfilter.cpp
===================================================================
RCS file: /cvs/core/sword/src/modules/filters/swbasicfilter.cpp,v
retrieving revision 1.33
diff -u -3 -p -u -r1.33 swbasicfilter.cpp
--- src/modules/filters/swbasicfilter.cpp	24 Oct 2003 02:43:46 -0000	1.33
+++ src/modules/filters/swbasicfilter.cpp	18 Jan 2004 15:26:05 -0000
@@ -93,7 +93,7 @@ void SWBasicFilter::addTokenSubstitute(c
 
 	if (!tokenCaseSensitive) {
 		stdstr(&buf, findString);
-		toupperstr(buf);
+		toupperstr_utf8(buf);
 		tokenSubMap[buf] = replaceString;
 		delete [] buf;
 	}
@@ -114,7 +114,7 @@ void SWBasicFilter::addEscapeStringSubst
 
 	if (!escStringCaseSensitive) {
 		stdstr(&buf, findString);
-		toupperstr(buf);
+		toupperstr_utf8(buf);
 		escSubMap.insert(DualStringMap::value_type(buf, replaceString));
 		delete [] buf;
 	}
@@ -135,7 +135,7 @@ bool SWBasicFilter::substituteToken(SWBu
 	if (!tokenCaseSensitive) {
 	        char *tmp = 0;
 		stdstr(&tmp, token);
-		toupperstr(tmp);
+		toupperstr_utf8(tmp);
 		it = tokenSubMap.find(tmp);
 		delete [] tmp;
 	} else
@@ -155,7 +155,7 @@ bool SWBasicFilter::substituteEscapeStri
 	if (!escStringCaseSensitive) {
 	        char *tmp = 0;
 		stdstr(&tmp, escString);
-		toupperstr(tmp);
+		toupperstr_utf8(tmp);
 		it = escSubMap.find(tmp);
 		delete [] tmp;
 	} else 
Index: src/modules/texts/rawtext/rawtext.cpp
===================================================================
RCS file: /cvs/core/sword/src/modules/texts/rawtext/rawtext.cpp,v
retrieving revision 1.69
diff -u -3 -p -u -r1.69 rawtext.cpp
--- src/modules/texts/rawtext/rawtext.cpp	17 Jan 2004 04:33:25 -0000	1.69
+++ src/modules/texts/rawtext/rawtext.cpp	18 Jan 2004 15:26:06 -0000
@@ -282,7 +282,7 @@ signed char RawText::createSearchFramewo
 		while (word) {
 
 			// make word upper case
-			toupperstr(word);
+			toupperstr_utf8(word);
 
 			// lookup word in dictionary (or make entry in dictionary
 			// for this word) and add this module position (index) to
@@ -519,7 +519,7 @@ ListKey &RawText::search(const char *ist
 
 			// toupper our copy of search string
 			stdstr(&wordBuf, istr);
-			toupperstr(wordBuf);
+			toupperstr_utf8(wordBuf);
 
 			// get list of individual words
 			words = (char **)calloc(sizeof(char *), 10);
Index: src/utilfuns/utilstr.cpp
===================================================================
RCS file: /cvs/core/sword/src/utilfuns/utilstr.cpp,v
retrieving revision 1.25
diff -u -3 -p -u -r1.25 utilstr.cpp
--- src/utilfuns/utilstr.cpp	27 Jun 2003 02:21:05 -0000	1.25
+++ src/utilfuns/utilstr.cpp	18 Jan 2004 15:26:06 -0000
@@ -1,6 +1,7 @@
 #include <utilstr.h>
 #include <ctype.h>
 #include <string.h>
+#include <iostream>
 
 #ifdef _ICU_
 #include <unicode/utypes.h>
@@ -147,26 +148,29 @@ unsigned int strlenw(const char *s1) {
 }
 
 
-/******************************************************************************
- * toupperstr - converts a string to uppercase string
- *
- * ENT:	target - string to convert
- *
- * RET:	target
- */
-
-char *toupperstr(char *buf) {
-	char *ret = buf;
-
-	while (*buf)
-		*buf = SW_toupper(*buf++);
-
-	return ret;
-}
+///******************************************************************************
+// * toupperstr - converts a string to uppercase string
+// *
+// * ENT:	target - string to convert
+// *
+// * RET:	target
+// */
+//
+//char *toupperstr(char *buf) {
+//	char *ret = buf;
+//
+//	while (*buf)
+//		*buf = SW_toupper(*buf++);
+//
+//	return ret;
+//}
 
 
 /******************************************************************************
- * toupperstr - converts a string to uppercase string
+ * toupperstr_utf8 - converts a string to uppercase string
+ * If ICU support is enabled in sword, this function will use it to do the work.
+ * If ICU support is not enabled, this function will ONLY work correctly with
+ * Latin-1 data!
  *
  * ENT:	target - string to convert
  *
@@ -179,23 +183,26 @@ char *toupperstr_utf8(char *buf, unsigne
 #ifndef _ICU_
 	// try to decide if it's worth trying to toupper.  Do we have more
 	// characters that are probably lower latin than not?
-	long performOp = 0;
-	for (const char *ch = buf; *ch; ch++)
-		performOp += (*ch > 0) ? 1 : -1;
 
-	if (performOp) {
+//mgruner: WHAT IS THIS CODE FOR? TOUPPER IS SUPPOSED TO ALWAYS WORK...
+//	long performOp = 0;
+//	for (const char *ch = buf; *ch; ch++)
+//		performOp += (*ch > 0) ? 1 : -1;
+//
+//	if (performOp) {
 		while (*buf)
 			*buf = SW_toupper(*buf++);
-	}
+//	}
 #else
 	if (!max)
 		max = strlen(ret);
-		UErrorCode err = U_ZERO_ERROR;
-		UConverter *conv = ucnv_open("UTF-8", &err);
-		UnicodeString str(buf, -1, conv, err);
-		UnicodeString ustr = str.toUpper();
-		ustr.extract(ret, max, conv, err);
-		ucnv_close(conv);
+
+	UErrorCode err = U_ZERO_ERROR;
+	UConverter *conv = ucnv_open("UTF-8", &err);
+	UnicodeString str(buf, -1, conv, err);
+	UnicodeString ustr = str.toUpper();
+	ustr.extract(ret, max, conv, err);
+	ucnv_close(conv);
 #endif
 
 	return ret;

--Boundary-00=_caqCAALvtLUnOD9--