[sword-devel] swi18n :)

Sun, 18 Jan 2004 11:57:44 -0700

The IRC discussion of this topic has been logged on the wiki for all who are interested.

http://www.crosswire.org/ucgi-bin/twiki/view/Main/SWI18n

Martin Gruner wrote:
> Hi,
> 
> I tried to work out how UTF-8 locales can work in sword. I noticed that there 
> are two uppercasing functions in sword: toupperstr(), which only works on 
> Latin-1, and toupperstr_utf8(), which can use ICU.
> Therefore I created a patch to handle UTF-8 consistently in sword by 
> removing toupperstr() and patching toupperstr_utf8() a little. Please look 
> through it and see if this is OK. 
> 
> It should work; the only problem is that ICU does not perform toUpper() 
> correctly on my system right now, and I am not sure why. It leaves the string 
> unchanged. Chris, can you help me here? Does it work for you?
> 
> Thanks for all feedback.
> 
> Martin
> 
> 
> ------------------------------------------------------------------------
> 
> Index: debian/rules
> ===================================================================
> RCS file: /cvs/core/sword/debian/rules,v
> retrieving revision 1.7
> diff -u -3 -p -u -r1.7 rules
> --- debian/rules	17 Jan 2004 21:21:13 -0000	1.7
> +++ debian/rules	18 Jan 2004 15:26:04 -0000
> @@ -34,7 +34,7 @@ configure-stamp:
>  	chmod 755 configure
>  	./configure $(confflags) --prefix=/usr --mandir=\$${prefix}/share/man \
>  		--infodir=\$${prefix}/share/info --with-zlib \
> -		--sysconfdir=/etc --enable-shared --without-icu \
> +		--sysconfdir=/etc --enable-shared --with-icu \
>  		--without-lucene
>  	touch configure.stamp
>  
> Index: include/utilstr.h
> ===================================================================
> RCS file: /cvs/core/sword/include/utilstr.h,v
> retrieving revision 1.11
> diff -u -3 -p -u -r1.11 utilstr.h
> --- include/utilstr.h	22 Jun 2003 23:50:23 -0000	1.11
> +++ include/utilstr.h	18 Jan 2004 15:26:04 -0000
> @@ -33,7 +33,7 @@ char *strstrip (char *istr);
>  const char *stristr (const char *s1, const char *s2);
>  const char strnicmp(const char *s1, const char *s2, int len);
>  unsigned int strlenw(const char *s1);
> -char *toupperstr(char *buf);
> +//char *toupperstr(char *buf);
>  char *toupperstr_utf8(char *buf, unsigned int max = 0);
>  
>  /*
> Index: src/keys/versekey.cpp
> ===================================================================
> RCS file: /cvs/core/sword/src/keys/versekey.cpp,v
> retrieving revision 1.58
> diff -u -3 -p -u -r1.58 versekey.cpp
> --- src/keys/versekey.cpp	27 Jun 2003 01:41:07 -0000	1.58
> +++ src/keys/versekey.cpp	18 Jan 2004 15:26:05 -0000
> @@ -324,7 +324,7 @@ int VerseKey::getBookAbbrev(const char *
>  		stdstr(&abbr, iabbr);
>  		strstrip(abbr);
>  		if (!i)
> -			toupperstr(abbr);
> +			toupperstr_utf8(abbr);
>  		abLen = strlen(abbr);
>  
>  		if (abLen) {
> Index: src/modules/filters/swbasicfilter.cpp
> ===================================================================
> RCS file: /cvs/core/sword/src/modules/filters/swbasicfilter.cpp,v
> retrieving revision 1.33
> diff -u -3 -p -u -r1.33 swbasicfilter.cpp
> --- src/modules/filters/swbasicfilter.cpp	24 Oct 2003 02:43:46 -0000	1.33
> +++ src/modules/filters/swbasicfilter.cpp	18 Jan 2004 15:26:05 -0000
> @@ -93,7 +93,7 @@ void SWBasicFilter::addTokenSubstitute(c
>  
>  	if (!tokenCaseSensitive) {
>  		stdstr(&buf, findString);
> -		toupperstr(buf);
> +		toupperstr_utf8(buf);
>  		tokenSubMap[buf] = replaceString;
>  		delete [] buf;
>  	}
> @@ -114,7 +114,7 @@ void SWBasicFilter::addEscapeStringSubst
>  
>  	if (!escStringCaseSensitive) {
>  		stdstr(&buf, findString);
> -		toupperstr(buf);
> +		toupperstr_utf8(buf);
>  		escSubMap.insert(DualStringMap::value_type(buf, replaceString));
>  		delete [] buf;
>  	}
> @@ -135,7 +135,7 @@ bool SWBasicFilter::substituteToken(SWBu
>  	if (!tokenCaseSensitive) {
>  	        char *tmp = 0;
>  		stdstr(&tmp, token);
> -		toupperstr(tmp);
> +		toupperstr_utf8(tmp);
>  		it = tokenSubMap.find(tmp);
>  		delete [] tmp;
>  	} else
> @@ -155,7 +155,7 @@ bool SWBasicFilter::substituteEscapeStri
>  	if (!escStringCaseSensitive) {
>  	        char *tmp = 0;
>  		stdstr(&tmp, escString);
> -		toupperstr(tmp);
> +		toupperstr_utf8(tmp);
>  		it = escSubMap.find(tmp);
>  		delete [] tmp;
>  	} else 
> Index: src/modules/texts/rawtext/rawtext.cpp
> ===================================================================
> RCS file: /cvs/core/sword/src/modules/texts/rawtext/rawtext.cpp,v
> retrieving revision 1.69
> diff -u -3 -p -u -r1.69 rawtext.cpp
> --- src/modules/texts/rawtext/rawtext.cpp	17 Jan 2004 04:33:25 -0000	1.69
> +++ src/modules/texts/rawtext/rawtext.cpp	18 Jan 2004 15:26:06 -0000
> @@ -282,7 +282,7 @@ signed char RawText::createSearchFramewo
>  		while (word) {
>  
>  			// make word upper case
> -			toupperstr(word);
> +			toupperstr_utf8(word);
>  
>  			// lookup word in dictionary (or make entry in dictionary
>  			// for this word) and add this module position (index) to
> @@ -519,7 +519,7 @@ ListKey &RawText::search(const char *ist
>  
>  			// toupper our copy of search string
>  			stdstr(&wordBuf, istr);
> -			toupperstr(wordBuf);
> +			toupperstr_utf8(wordBuf);
>  
>  			// get list of individual words
>  			words = (char **)calloc(sizeof(char *), 10);
> Index: src/utilfuns/utilstr.cpp
> ===================================================================
> RCS file: /cvs/core/sword/src/utilfuns/utilstr.cpp,v
> retrieving revision 1.25
> diff -u -3 -p -u -r1.25 utilstr.cpp
> --- src/utilfuns/utilstr.cpp	27 Jun 2003 02:21:05 -0000	1.25
> +++ src/utilfuns/utilstr.cpp	18 Jan 2004 15:26:06 -0000
> @@ -1,6 +1,7 @@
>  #include <utilstr.h>
>  #include <ctype.h>
>  #include <string.h>
> +#include <iostream>
>  
>  #ifdef _ICU_
>  #include <unicode/utypes.h>
> @@ -147,26 +148,29 @@ unsigned int strlenw(const char *s1) {
>  }
>  
>  
> -/******************************************************************************
> - * toupperstr - converts a string to uppercase string
> - *
> - * ENT:	target - string to convert
> - *
> - * RET:	target
> - */
> -
> -char *toupperstr(char *buf) {
> -	char *ret = buf;
> -
> -	while (*buf)
> -		*buf = SW_toupper(*buf++);
> -
> -	return ret;
> -}
> +///******************************************************************************
> +// * toupperstr - converts a string to uppercase string
> +// *
> +// * ENT:	target - string to convert
> +// *
> +// * RET:	target
> +// */
> +//
> +//char *toupperstr(char *buf) {
> +//	char *ret = buf;
> +//
> +//	while (*buf)
> +//		*buf = SW_toupper(*buf++);
> +//
> +//	return ret;
> +//}
>  
>  
>  /******************************************************************************
> - * toupperstr - converts a string to uppercase string
> + * toupperstr_utf8 - converts a string to uppercase string
> + * If ICU support is enabled in sword, this function will use it to do the work.
> + * If ICU support is not enabled, this function will ONLY work correctly with
> + * Latin-1 data!
>   *
>   * ENT:	target - string to convert
>   *
> @@ -179,23 +183,26 @@ char *toupperstr_utf8(char *buf, unsigne
>  #ifndef _ICU_
>  	// try to decide if it's worth trying to toupper.  Do we have more
>  	// characters that are probably lower latin than not?
> -	long performOp = 0;
> -	for (const char *ch = buf; *ch; ch++)
> -		performOp += (*ch > 0) ? 1 : -1;
>  
> -	if (performOp) {
> +//mgruner: WHAT IS THIS CODE FOR? TOUPPER IS SUPPOSED TO ALWAYS WORK...
> +//	long performOp = 0;
> +//	for (const char *ch = buf; *ch; ch++)
> +//		performOp += (*ch > 0) ? 1 : -1;
> +//
> +//	if (performOp) {
>  		while (*buf)
>  			*buf = SW_toupper(*buf++);
> -	}
> +//	}
>  #else
>  	if (!max)
>  		max = strlen(ret);
> -		UErrorCode err = U_ZERO_ERROR;
> -		UConverter *conv = ucnv_open("UTF-8", &err);
> -		UnicodeString str(buf, -1, conv, err);
> -		UnicodeString ustr = str.toUpper();
> -		ustr.extract(ret, max, conv, err);
> -		ucnv_close(conv);
> +
> +	UErrorCode err = U_ZERO_ERROR;
> +	UConverter *conv = ucnv_open("UTF-8", &err);
> +	UnicodeString str(buf, -1, conv, err);
> +	UnicodeString ustr = str.toUpper();
> +	ustr.extract(ret, max, conv, err);
> +	ucnv_close(conv);
>  #endif
>  
>  	return ret;