[sword-devel] swi18n :)
Troy A. Griffitts
sword-devel@crosswire.org
Sun, 18 Jan 2004 11:57:44 -0700
Discussion of this topic on IRC has been logged on the wiki for all interested.
http://www.crosswire.org/ucgi-bin/twiki/view/Main/SWI18n
Martin Gruner wrote:
> Hi,
>
> I tried to work out how UTF-8 locales can work in sword. I noticed that there
> are two uppercasing functions in sword, toupperstr() and toupperstr_utf8();
> the first only works on Latin-1, while the second can use ICU.
> Therefore I created a patch to handle UTF-8 consistently in sword by deleting
> toupperstr() and patching toupperstr_utf8() a little. Please look through it
> and see if this is ok.
>
> It should work; my only problem is that ICU does not perform toUpper()
> correctly on my system right now, and I am not sure why. It leaves the string
> unchanged. Chris, can you help me here? Does it work for you?
>
> Thanks for all feedback.
>
> Martin
>
>
> ------------------------------------------------------------------------
>
> Index: debian/rules
> ===================================================================
> RCS file: /cvs/core/sword/debian/rules,v
> retrieving revision 1.7
> diff -u -3 -p -u -r1.7 rules
> --- debian/rules 17 Jan 2004 21:21:13 -0000 1.7
> +++ debian/rules 18 Jan 2004 15:26:04 -0000
> @@ -34,7 +34,7 @@ configure-stamp:
> chmod 755 configure
> ./configure $(confflags) --prefix=/usr --mandir=\$${prefix}/share/man \
> --infodir=\$${prefix}/share/info --with-zlib \
> - --sysconfdir=/etc --enable-shared --without-icu \
> + --sysconfdir=/etc --enable-shared --with-icu \
> --without-lucene
> touch configure.stamp
>
> Index: include/utilstr.h
> ===================================================================
> RCS file: /cvs/core/sword/include/utilstr.h,v
> retrieving revision 1.11
> diff -u -3 -p -u -r1.11 utilstr.h
> --- include/utilstr.h 22 Jun 2003 23:50:23 -0000 1.11
> +++ include/utilstr.h 18 Jan 2004 15:26:04 -0000
> @@ -33,7 +33,7 @@ char *strstrip (char *istr);
> const char *stristr (const char *s1, const char *s2);
> const char strnicmp(const char *s1, const char *s2, int len);
> unsigned int strlenw(const char *s1);
> -char *toupperstr(char *buf);
> +//char *toupperstr(char *buf);
> char *toupperstr_utf8(char *buf, unsigned int max = 0);
>
> /*
> Index: src/keys/versekey.cpp
> ===================================================================
> RCS file: /cvs/core/sword/src/keys/versekey.cpp,v
> retrieving revision 1.58
> diff -u -3 -p -u -r1.58 versekey.cpp
> --- src/keys/versekey.cpp 27 Jun 2003 01:41:07 -0000 1.58
> +++ src/keys/versekey.cpp 18 Jan 2004 15:26:05 -0000
> @@ -324,7 +324,7 @@ int VerseKey::getBookAbbrev(const char *
> stdstr(&abbr, iabbr);
> strstrip(abbr);
> if (!i)
> - toupperstr(abbr);
> + toupperstr_utf8(abbr);
> abLen = strlen(abbr);
>
> if (abLen) {
> Index: src/modules/filters/swbasicfilter.cpp
> ===================================================================
> RCS file: /cvs/core/sword/src/modules/filters/swbasicfilter.cpp,v
> retrieving revision 1.33
> diff -u -3 -p -u -r1.33 swbasicfilter.cpp
> --- src/modules/filters/swbasicfilter.cpp 24 Oct 2003 02:43:46 -0000 1.33
> +++ src/modules/filters/swbasicfilter.cpp 18 Jan 2004 15:26:05 -0000
> @@ -93,7 +93,7 @@ void SWBasicFilter::addTokenSubstitute(c
>
> if (!tokenCaseSensitive) {
> stdstr(&buf, findString);
> - toupperstr(buf);
> + toupperstr_utf8(buf);
> tokenSubMap[buf] = replaceString;
> delete [] buf;
> }
> @@ -114,7 +114,7 @@ void SWBasicFilter::addEscapeStringSubst
>
> if (!escStringCaseSensitive) {
> stdstr(&buf, findString);
> - toupperstr(buf);
> + toupperstr_utf8(buf);
> escSubMap.insert(DualStringMap::value_type(buf, replaceString));
> delete [] buf;
> }
> @@ -135,7 +135,7 @@ bool SWBasicFilter::substituteToken(SWBu
> if (!tokenCaseSensitive) {
> char *tmp = 0;
> stdstr(&tmp, token);
> - toupperstr(tmp);
> + toupperstr_utf8(tmp);
> it = tokenSubMap.find(tmp);
> delete [] tmp;
> } else
> @@ -155,7 +155,7 @@ bool SWBasicFilter::substituteEscapeStri
> if (!escStringCaseSensitive) {
> char *tmp = 0;
> stdstr(&tmp, escString);
> - toupperstr(tmp);
> + toupperstr_utf8(tmp);
> it = escSubMap.find(tmp);
> delete [] tmp;
> } else
> Index: src/modules/texts/rawtext/rawtext.cpp
> ===================================================================
> RCS file: /cvs/core/sword/src/modules/texts/rawtext/rawtext.cpp,v
> retrieving revision 1.69
> diff -u -3 -p -u -r1.69 rawtext.cpp
> --- src/modules/texts/rawtext/rawtext.cpp 17 Jan 2004 04:33:25 -0000 1.69
> +++ src/modules/texts/rawtext/rawtext.cpp 18 Jan 2004 15:26:06 -0000
> @@ -282,7 +282,7 @@ signed char RawText::createSearchFramewo
> while (word) {
>
> // make word upper case
> - toupperstr(word);
> + toupperstr_utf8(word);
>
> // lookup word in dictionary (or make entry in dictionary
> // for this word) and add this module position (index) to
> @@ -519,7 +519,7 @@ ListKey &RawText::search(const char *ist
>
> // toupper our copy of search string
> stdstr(&wordBuf, istr);
> - toupperstr(wordBuf);
> + toupperstr_utf8(wordBuf);
>
> // get list of individual words
> words = (char **)calloc(sizeof(char *), 10);
> Index: src/utilfuns/utilstr.cpp
> ===================================================================
> RCS file: /cvs/core/sword/src/utilfuns/utilstr.cpp,v
> retrieving revision 1.25
> diff -u -3 -p -u -r1.25 utilstr.cpp
> --- src/utilfuns/utilstr.cpp 27 Jun 2003 02:21:05 -0000 1.25
> +++ src/utilfuns/utilstr.cpp 18 Jan 2004 15:26:06 -0000
> @@ -1,6 +1,7 @@
> #include <utilstr.h>
> #include <ctype.h>
> #include <string.h>
> +#include <iostream>
>
> #ifdef _ICU_
> #include <unicode/utypes.h>
> @@ -147,26 +148,29 @@ unsigned int strlenw(const char *s1) {
> }
>
>
> -/******************************************************************************
> - * toupperstr - converts a string to uppercase string
> - *
> - * ENT: target - string to convert
> - *
> - * RET: target
> - */
> -
> -char *toupperstr(char *buf) {
> - char *ret = buf;
> -
> - while (*buf)
> - *buf = SW_toupper(*buf++);
> -
> - return ret;
> -}
> +///******************************************************************************
> +// * toupperstr - converts a string to uppercase string
> +// *
> +// * ENT: target - string to convert
> +// *
> +// * RET: target
> +// */
> +//
> +//char *toupperstr(char *buf) {
> +// char *ret = buf;
> +//
> +// while (*buf)
> +// *buf = SW_toupper(*buf++);
> +//
> +// return ret;
> +//}
>
>
> /******************************************************************************
> - * toupperstr - converts a string to uppercase string
> + * toupperstr_utf8 - converts a string to uppercase string
> + * If ICU support is enabled in sword, this function will use it to do the work.
> + * If ICU support is not enabled, this function will ONLY work correctly with
> + * Latin-1 data!
> *
> * ENT: target - string to convert
> *
> @@ -179,23 +183,26 @@ char *toupperstr_utf8(char *buf, unsigne
> #ifndef _ICU_
> // try to decide if it's worth trying to toupper. Do we have more
> // characters that are probably lower latin than not?
> - long performOp = 0;
> - for (const char *ch = buf; *ch; ch++)
> - performOp += (*ch > 0) ? 1 : -1;
>
> - if (performOp) {
> +//mgruner: WHAT IS THIS CODE FOR? TOUPPER IS SUPPOSED TO ALWAYS WORK...
> +// long performOp = 0;
> +// for (const char *ch = buf; *ch; ch++)
> +// performOp += (*ch > 0) ? 1 : -1;
> +//
> +// if (performOp) {
> while (*buf)
> *buf = SW_toupper(*buf++);
> - }
> +// }
> #else
> if (!max)
> max = strlen(ret);
> - UErrorCode err = U_ZERO_ERROR;
> - UConverter *conv = ucnv_open("UTF-8", &err);
> - UnicodeString str(buf, -1, conv, err);
> - UnicodeString ustr = str.toUpper();
> - ustr.extract(ret, max, conv, err);
> - ucnv_close(conv);
> +
> + UErrorCode err = U_ZERO_ERROR;
> + UConverter *conv = ucnv_open("UTF-8", &err);
> + UnicodeString str(buf, -1, conv, err);
> + UnicodeString ustr = str.toUpper();
> + ustr.extract(ret, max, conv, err);
> + ucnv_close(conv);
> #endif
>
> return ret;