[sword-devel] MacSword 1.1.3

Thu Nov 25 19:53:21 MST 2004

Hey guys,
	What's the status on ICU and StringMgr?  I thought that if ICU was 
compiled into the library, it provided a subclass: ICUStringMgr that 
worked with UTF8.   This would mean that Will doesn't have to do 
anything if he already includes ICU support.

	-Troy.


Joachim Ansorg wrote:
> Hi,
> 
> it's simple - I hope :)
> 
> 
>>So all I have to do is replace char* StringMgr::upperUTF8(char* t,
>>const unsigned int maxlen)?
> 
> 
> Reimplement upperUTF8 and then tell Sword to use an instance of your 
> reimplementation to handle Unicode/Latin1 Strings with 
> StringMgr::setSystemStringMgr.
> 
> 
>>If so, then what is maxlen for? Is it expecting t to be overwritten or
>>just a maximum buffer allocated? I assume that sword will dealloc any
>>buffer I return.
> 
> 
> If maxlen is > 0, uppercase only maxlen chars. It expects t to be overwritten. No 
> buffers are allocated within upperUTF8. I attached the BTStringMgr we use in 
> BibleTime.
> I advise checking whether a string contains Unicode chars before uppercasing 
> the chars using Unicode. Checking first is a lot faster than not checking.
> 
> I hope that helps. And yes, we need better documentation :)
> 
> If you return true in supportsUnicode then LocaleMgr will only load locales 
> which are in UTF-8, so you can be sure that all verse keys are in UTF-8.
> 
> Let me know if you need help,
> Joachim
> 
> 
> ------------------------------------------------------------------------
> 
> //
> // C++ Implementation: btstringmgr
> //
> // Description: 
> //
> //
> // Author: The BibleTime team <info at bibletime.info>, (C) 2004
> //
> // Copyright: See COPYING file that comes with this distribution
> //
> //
> 
> #include "btstringmgr.h"
> 
> //System includes
> #include <ctype.h>
> 
> char* BTStringMgr::upperUTF8(char* text, const unsigned int maxlen) {
> 	const int max = (maxlen>0) ? maxlen : strlen(text);
> 	
> 	if (isUtf8(text)) {
> 		strncpy(text, (const char*)QString::fromUtf8(text).upper().utf8(), max);
> 	
> 		return text;
> 	}
> 	else {
> 		char* ret = text;	
> 		while (*text) {
> 			*text = toupper(*text);
> 			text++;
> 		}
> 		
> 		return ret;
> 	}
> 
> 	return text;
> }
> 
> char* BTStringMgr::upperLatin1(char* text) {
> 	char* ret = text;	
> 	
> 	while (*text) {
> 		*text++ = toupper(*text);
> 	}
> 	
> 	return ret;
> }
> 
> const bool BTStringMgr::supportsUnicode() const {
> 	return true;
> }
> 
> const bool BTStringMgr::isUtf8(const char *buf) {
>   int i, n;
>   register unsigned char c;
>   bool gotone = false;
> 
> #define F 0   /* character never appears in text */
> #define T 1   /* character appears in plain ASCII text */
> #define I 2   /* character appears in ISO-8859 text */
> #define X 3   /* character appears in non-ISO extended ASCII (Mac, IBM PC) */
> 
>   static const unsigned char text_chars[256] = {
>         /*                  BEL BS HT LF    FF CR    */
>         F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F,  /* 0x0X */
>         /*                              ESC          */
>         F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F,  /* 0x1X */
>         T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x2X */
>         T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x3X */
>         T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x4X */
>         T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x5X */
>         T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x6X */
>         T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F,  /* 0x7X */
>         /*            NEL                            */
>         X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X,  /* 0x8X */
>         X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,  /* 0x9X */
>         I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xaX */
>         I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xbX */
>         I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xcX */
>         I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xdX */
>         I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xeX */
>         I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I   /* 0xfX */
>   };
> 
>   /* *ulen = 0; */
>   for (i = 0; (c = buf[i]); i++) {
>     if ((c & 0x80) == 0) {        /* 0xxxxxxx is plain ASCII */
>       /*
>        * Even if the whole file is valid UTF-8 sequences,
>        * still reject it if it uses weird control characters.
>        */
> 
>       if (text_chars[c] != T)
>         return false;
> 
>     } else if ((c & 0x40) == 0) { /* 10xxxxxx never 1st byte */
>       return false;
>     } else {                           /* 11xxxxxx begins UTF-8 */
>       int following;
> 
>     if ((c & 0x20) == 0) {             /* 110xxxxx */
>       following = 1;
>     } else if ((c & 0x10) == 0) {      /* 1110xxxx */
>       following = 2;
>     } else if ((c & 0x08) == 0) {      /* 11110xxx */
>       following = 3;
>     } else if ((c & 0x04) == 0) {      /* 111110xx */
>       following = 4;
>     } else if ((c & 0x02) == 0) {      /* 1111110x */
>       following = 5;
>     } else
>       return false;
> 
>       for (n = 0; n < following; n++) {
>         i++;
>         if (!(c = buf[i]))
>           goto done;
> 
>         if ((c & 0x80) == 0 || (c & 0x40))
>           return false;
>       }
>       gotone = true;
>     }
>   }
> done:
>   return gotone;   /* don't claim it's UTF-8 if it's all 7-bit */
> }
> 
> #undef F
> #undef T
> #undef I
> #undef X
> 
> 
> ------------------------------------------------------------------------
> 
> //
> // C++ Interface: btstringmgr
> //
> // Description: 
> //
> //
> // Author: The BibleTime team <info at bibletime.info>, (C) 2004
> //
> // Copyright: See COPYING file that comes with this distribution
> //
> //
> #ifndef BTSTRINGMGR_H
> #define BTSTRINGMGR_H
> 
> //Sword includes
> #include <stringmgr.h>
> 
> //Qt includes
> #include <qstring.h>
> 
> using namespace sword;
> 
> class BTStringMgr : public StringMgr {
> public:
> 	/** Converts the param to an upper case Utf8 string
> 	* @param The text encoded in utf8 which should be turned into an upper case string
> 	*/	
> 	virtual char* upperUTF8(char*, const unsigned int maxlen = 0);
> 	
> 	/** Converts the param to an uppercase latin1 string
> 	* @param The text encoded in latin1 which should be turned into an upper case string
> 	*/	
> 	virtual char* upperLatin1(char*);
> 
> protected:
> 	virtual const bool supportsUnicode() const;
> 	
> 	/** CODE TAKEN FROM KDELIBS 3.2
> 	* This function checks whether a string is utf8 or not.
> 	*
> 	* It was taken from kdelibs so we do not depend on KDE 3.2.
> 	*/
> 	const bool isUtf8(const char *buf);
> };
> 
> #endif
> 
> 
> ------------------------------------------------------------------------
> 
> _______________________________________________
> sword-devel mailing list
> sword-devel at crosswire.org
> http://www.crosswire.org/mailman/listinfo/sword-devel