[sword-cvs] icu-sword/source/common/unicode .cvsignore,1.2,1.3 brkiter.h,NONE,1.1 caniter.h,NONE,1.1 chariter.h,1.3,1.4 dbbi.h,NONE,1.1 docmain.h,1.3,1.4 locid.h,1.3,1.4 normlzr.h,1.4,1.5 parseerr.h,NONE,1.1 parsepos.h,NONE,1.1 platform.h.in,1.4,1.5 pmacos.h,1.2,1.3 pos400.h,1.2,1.3 putil.h,1.3,1.4 pwin32.h,1.4,1.5 rbbi.h,NONE,1.1 rep.h,1.3,1.4 resbund.h,1.3,1.4 schriter.h,1.3,1.4 strenum.h,NONE,1.1 ubidi.h,1.3,1.4 ubrk.h,NONE,1.1 ucat.h,NONE,1.1 uchar.h,1.4,1.5 uchriter.h,1.3,1.4 uclean.h,1.3,1.4 ucnv.h,1.3,1.4 ucnv_cb.h,1.3,1.4 ucnv_err.h,1.3,1.4 uconfig.h,NONE,1.1 udata.h,1.3,1.4 uenum.h,NONE,1.1 uidna.h,NONE,1.1 uiter.h,NONE,1.1 uloc.h,1.3,1.4 umachine.h,1.3,1.4 umisc.h,1.2,1.3 unifilt.h,NONE,1.1 unifunct.h,NONE,1.1 unimatch.h,NONE,1.1 uniset.h,NONE,1.1 unistr.h,1.4,1.5 unorm.h,1.3,1.4 uobject.h,NONE,1.1 urename.h,1.4,1.5 urep.h,1.3,1.4 ures.h,1.4,1.5 uscript.h,1.4,1.5 uset.h,NONE,1.1 usetiter.h,NONE,1.1 ushape.h,1.3,1.4 ustring.h,1.3,1.4 utf.h,1.3,1.4 utf16.h,1.2,1.3 utf32.h,1.2,1.3 utf8.h,1.3,1.4 utf_old.h,NONE,1.1 utypes.h,1.8,1.9 uversion.h,1.4,1.5
sword@www.crosswire.org
sword@www.crosswire.org
Tue, 9 Sep 2003 19:43:16 -0700
- Previous message: [sword-cvs] icu-sword/source/config .cvsignore,1.2,1.3 Makefile.inc.in,1.5,1.6 icu-config-bottom,NONE,1.1 icu-config-top,NONE,1.1 icu-config.1.in,NONE,1.1 make2sh.sed,NONE,1.1 mh-aix,1.4,1.5 mh-aix-va,1.4,1.5 mh-alpha-linux-cc,1.4,1.5 mh-alpha-linux-gcc,1.4,1.5 mh-alpha-osf,1.2,1.3 mh-bsd-gcc,1.4,1.5 mh-cygwin,1.4,1.5 mh-cygwin-msvc,NONE,1.1 mh-darwin,1.4,1.5 mh-hpux-acc,1.4,1.5 mh-hpux-cc,1.4,1.5 mh-hpux-gcc,NONE,1.1 mh-irix,1.4,1.5 mh-linux,1.4,1.5 mh-os390,1.4,1.5 mh-os400,1.4,1.5 mh-ptx,1.4,1.5 mh-qnx,NONE,1.1 mh-solaris,1.4,1.5 mh-solaris-gcc,1.4,1.5 mh-unknown,NONE,1.1 test-icu-config.sh,NONE,1.1
- Next message: [sword-cvs] icu-sword/source/test/hdrtst Makefile,1.2,1.3 cxxfiles.txt,NONE,1.1 dfiles.txt,NONE,1.1
- Messages sorted by:
[ date ]
[ thread ]
[ subject ]
[ author ]
Update of /usr/local/cvsroot/icu-sword/source/common/unicode
In directory www:/tmp/cvs-serv19862/source/common/unicode
Added Files:
.cvsignore brkiter.h caniter.h chariter.h dbbi.h docmain.h
locid.h normlzr.h parseerr.h parsepos.h platform.h.in pmacos.h
pos400.h putil.h pwin32.h rbbi.h rep.h resbund.h schriter.h
strenum.h ubidi.h ubrk.h ucat.h uchar.h uchriter.h uclean.h
ucnv.h ucnv_cb.h ucnv_err.h uconfig.h udata.h uenum.h uidna.h
uiter.h uloc.h umachine.h umisc.h unifilt.h unifunct.h
unimatch.h uniset.h unistr.h unorm.h uobject.h urename.h
urep.h ures.h uscript.h uset.h usetiter.h ushape.h ustring.h
utf.h utf16.h utf32.h utf8.h utf_old.h utypes.h uversion.h
Log Message:
ICU 2.6 commit
--- NEW FILE: brkiter.h ---
/*
********************************************************************************
* Copyright (C) 1997-2003, International Business Machines
* Corporation and others. All Rights Reserved.
********************************************************************************
*
* File brkiter.h
*
* Modification History:
*
* Date Name Description
* 02/18/97 aliu Added typedef for TextCount. Made DONE const.
* 05/07/97 aliu Fixed DLL declaration.
* 07/09/97 jfitz Renamed BreakIterator and interface synced with JDK
* 08/11/98 helena Sync-up JDK1.2.
* 01/13/2000 helena Added UErrorCode parameter to createXXXInstance methods.
********************************************************************************
*/
#ifndef BRKITER_H
#define BRKITER_H
#include "unicode/utypes.h"
#if UCONFIG_NO_BREAK_ITERATION
U_NAMESPACE_BEGIN
/*
* Allow the declaration of APIs with pointers to BreakIterator
* even when break iteration is removed from the build.
*/
class BreakIterator;
U_NAMESPACE_END
#else
#include "unicode/uobject.h"
#include "unicode/unistr.h"
#include "unicode/chariter.h"
#include "unicode/locid.h"
#include "unicode/ubrk.h"
#include "unicode/strenum.h"
U_NAMESPACE_BEGIN
/**
* Untyped key identifying a registered BreakIterator. A value of this type
* is returned by BreakIterator::registerInstance() and is passed back to
* BreakIterator::unregister().
*/
typedef const void* URegistryKey;
/**
* The BreakIterator class implements methods for finding the location
* of boundaries in text. BreakIterator is an abstract base class.
* Instances of BreakIterator maintain a current position and scan over
* text returning the index of characters where boundaries occur.
* <P>
* Line boundary analysis determines where a text string can be broken
* when line-wrapping. The mechanism correctly handles punctuation and
* hyphenated words.
* <P>
* Sentence boundary analysis allows selection with correct
* interpretation of periods within numbers and abbreviations, and
* trailing punctuation marks such as quotation marks and parentheses.
* <P>
* Word boundary analysis is used by search and replace functions, as
* well as within text editing applications that allow the user to
* select words with a double click. Word selection provides correct
* interpretation of punctuation marks within and following
* words. Characters that are not part of a word, such as symbols or
* punctuation marks, have word-breaks on both sides.
* <P>
* Character boundary analysis allows users to interact with
* characters as they expect to, for example, when moving the cursor
* through a text string. Character boundary analysis provides correct
* navigation through character strings, regardless of how the
* character is stored. For example, an accented character might be
* stored as a base character and a diacritical mark. What users
* consider to be a character can differ between languages.
* <P>
* This is the interface for all text boundaries.
* <P>
* Examples:
* <P>
* Helper function to output text
* <pre>
* \code
* void printTextRange( BreakIterator& iterator, int32_t start, int32_t end )
* {
* UnicodeString textBuffer, temp;
* CharacterIterator *strIter = iterator.getText().clone();
* strIter->getText(temp);
* cout << " " << start << " " << end << " |"
* << temp.extractBetween(start, end, textBuffer)
* << "|" << endl;
* delete strIter;
* }
* \endcode
* </pre>
* Print each element in order:
* <pre>
* \code
* void printEachForward( BreakIterator& boundary)
* {
* int32_t start = boundary.first();
* for (int32_t end = boundary.next();
* end != BreakIterator::DONE;
* start = end, end = boundary.next())
* {
* printTextRange( boundary, start, end );
* }
* }
* \endcode
* </pre>
* Print each element in reverse order:
* <pre>
* \code
* void printEachBackward( BreakIterator& boundary)
* {
* int32_t end = boundary.last();
* for (int32_t start = boundary.previous();
* start != BreakIterator::DONE;
* end = start, start = boundary.previous())
* {
* printTextRange( boundary, start, end );
* }
* }
* \endcode
* </pre>
* Print first element
* <pre>
* \code
* void printFirst(BreakIterator& boundary)
* {
* int32_t start = boundary.first();
* int32_t end = boundary.next();
* printTextRange( boundary, start, end );
* }
* \endcode
* </pre>
* Print last element
* <pre>
* \code
* void printLast(BreakIterator& boundary)
* {
* int32_t end = boundary.last();
* int32_t start = boundary.previous();
* printTextRange( boundary, start, end );
* }
* \endcode
* </pre>
* Print the element at a specified position
* <pre>
* \code
* void printAt(BreakIterator &boundary, int32_t pos )
* {
* int32_t end = boundary.following(pos);
* int32_t start = boundary.previous();
* printTextRange( boundary, start, end );
* }
* \endcode
* </pre>
* Creating and using text boundaries
* <pre>
* \code
* void BreakIterator_Example( void )
* {
* BreakIterator* boundary;
* UErrorCode status = U_ZERO_ERROR;
* UnicodeString stringToExamine("Aaa bbb ccc. Ddd eee fff.");
* cout << "Examining: " << stringToExamine << endl;
*
* //print each sentence in forward and reverse order
* boundary = BreakIterator::createSentenceInstance( Locale::US, status );
* boundary->setText(stringToExamine);
* cout << "----- forward: -----------" << endl;
* printEachForward(*boundary);
* cout << "----- backward: ----------" << endl;
* printEachBackward(*boundary);
* delete boundary;
*
* //print each word in order
* boundary = BreakIterator::createWordInstance( Locale::getDefault(), status );
* boundary->setText(stringToExamine);
* cout << "----- forward: -----------" << endl;
* printEachForward(*boundary);
* //print first element
* cout << "----- first: -------------" << endl;
* printFirst(*boundary);
* //print last element
* cout << "----- last: --------------" << endl;
* printLast(*boundary);
* //print word at charpos 10
* cout << "----- at pos 10: ---------" << endl;
* printAt(*boundary, 10 );
*
* delete boundary;
* }
* \endcode
* </pre>
*/
class U_COMMON_API BreakIterator : public UObject {
public:
/**
* destructor
* @stable ICU 2.0
*/
virtual ~BreakIterator();
/**
* Return true if another object is semantically equal to this
* one. The other object should be an instance of the same subclass of
* BreakIterator. Objects of different subclasses are considered
* unequal.
* <P>
* Return true if this BreakIterator is at the same position in the
* same text, and is the same class and type (word, line, etc.) of
* BreakIterator, as the argument. Text is considered the same if
* it contains the same characters, it need not be the same
* object, and styles are not considered.
* @return true if the argument is semantically equal to this object
* @stable ICU 2.0
*/
virtual UBool operator==(const BreakIterator&) const = 0;
/**
* Returns the complement of the result of operator==
* @param rhs The BreakIterator to be compared for inequality
* @return the complement of the result of operator==
* @stable ICU 2.0
*/
UBool operator!=(const BreakIterator& rhs) const { return !operator==(rhs); }
/**
* Return a polymorphic copy of this object. This is an abstract
* method which subclasses implement.
* @return a polymorphic copy of this object
* @stable ICU 2.0
*/
virtual BreakIterator* clone(void) const = 0;
/**
* Return a polymorphic class ID for this object. Different subclasses
* will return distinct unequal values.
* @return the class ID of the actual subclass
* @stable ICU 2.0
*/
virtual UClassID getDynamicClassID(void) const = 0;
/**
* Return a CharacterIterator over the text being analyzed.
* Changing the state of the returned iterator can have undefined consequences
* on the operation of the break iterator. If you need to change it, clone it first.
* @return a const reference to the CharacterIterator over the text being analyzed
* @stable ICU 2.0
*/
virtual const CharacterIterator& getText(void) const = 0;
/**
* Change the text over which this operates. The text boundary is
* reset to the start.
* @param text The UnicodeString used to change the text.
* @stable ICU 2.0
*/
virtual void setText(const UnicodeString &text) = 0;
/**
* Change the text over which this operates. The text boundary is
* reset to the start.
* @param it The CharacterIterator used to change the text.
* @stable ICU 2.0
*/
virtual void adoptText(CharacterIterator* it) = 0;
/**
* DONE is returned by previous() and next() after all valid
* boundaries have been returned.
* @stable ICU 2.0
*/
static const int32_t DONE;
/**
* Return the index of the first character in the text being scanned.
* @return the index of the first character in the text being scanned
* @stable ICU 2.0
*/
virtual int32_t first(void) = 0;
/**
* Return the index immediately BEYOND the last character in the text being scanned.
* @return the index immediately beyond the last character in the text being scanned
* @stable ICU 2.0
*/
virtual int32_t last(void) = 0;
/**
* Return the boundary preceding the current boundary.
* @return The character index of the previous text boundary or DONE if all
* boundaries have been returned.
* @stable ICU 2.0
*/
virtual int32_t previous(void) = 0;
/**
* Return the boundary following the current boundary.
* @return The character index of the next text boundary or DONE if all
* boundaries have been returned.
* @stable ICU 2.0
*/
virtual int32_t next(void) = 0;
/**
* Return the character index of the current iterator position within the text.
* @return The boundary most recently returned.
* @stable ICU 2.0
*/
virtual int32_t current(void) const = 0;
/**
* Return the first boundary following the specified offset.
* The value returned is always greater than the offset or
* the value BreakIterator::DONE
* @param offset the offset to begin scanning.
* @return The first boundary after the specified offset.
* @stable ICU 2.0
*/
virtual int32_t following(int32_t offset) = 0;
/**
* Return the first boundary preceding the specified offset.
* The value returned is always smaller than the offset or
* the value BreakIterator::DONE
* @param offset the offset to begin scanning.
* @return The first boundary before the specified offset.
* @stable ICU 2.0
*/
virtual int32_t preceding(int32_t offset) = 0;
/**
* Return true if the specified position is a boundary position.
* As a side effect, the current position of the iterator is set
* to the first boundary position at or following the specified offset.
* @param offset the offset to check.
* @return True if "offset" is a boundary position.
* @stable ICU 2.0
*/
virtual UBool isBoundary(int32_t offset) = 0;
/**
* Return the nth boundary from the current boundary
* @param n which boundary to return. A value of 0
* does nothing. Negative values move to previous boundaries
* and positive values move to later boundaries.
* @return The index of the nth boundary from the current position, or
* DONE if there are fewer than |n| boundaries in the specified direction.
* @stable ICU 2.0
*/
virtual int32_t next(int32_t n) = 0;
/**
* Create BreakIterator for word-breaks using the given locale.
* Returns an instance of a BreakIterator implementing word breaks.
* WordBreak is useful for word selection (ex. double click)
* @param where the locale.
* @param status the error code
* @return A BreakIterator for word-breaks. The UErrorCode& status
* parameter is used to return status information to the user.
* To check whether the construction succeeded or not, you should check
* the value of U_SUCCESS(status). If you wish more detailed information, you
* can check for informational error results which still indicate success.
* U_USING_FALLBACK_WARNING indicates that a fall back locale was used. For
* example, 'de_CH' was requested, but nothing was found there, so 'de' was
* used. U_USING_DEFAULT_WARNING indicates that the default locale data was
* used; neither the requested locale nor any of its fall back locales
* could be found.
* The caller owns the returned object and is responsible for deleting it.
* @stable ICU 2.0
*/
static BreakIterator* createWordInstance(const Locale& where,
UErrorCode& status);
/**
* Create BreakIterator for line-breaks using specified locale.
* Returns an instance of a BreakIterator implementing line breaks. Line
* breaks are logically possible line breaks, actual line breaks are
* usually determined based on display width.
* LineBreak is useful for word wrapping text.
* @param where the locale.
* @param status The error code.
* @return A BreakIterator for line-breaks. The UErrorCode& status
* parameter is used to return status information to the user.
* To check whether the construction succeeded or not, you should check
* the value of U_SUCCESS(status). If you wish more detailed information, you
* can check for informational error results which still indicate success.
* U_USING_FALLBACK_WARNING indicates that a fall back locale was used. For
* example, 'de_CH' was requested, but nothing was found there, so 'de' was
* used. U_USING_DEFAULT_WARNING indicates that the default locale data was
* used; neither the requested locale nor any of its fall back locales
* could be found.
* The caller owns the returned object and is responsible for deleting it.
* @stable ICU 2.0
*/
static BreakIterator* createLineInstance(const Locale& where,
UErrorCode& status);
/**
* Create BreakIterator for character-breaks using specified locale
* Returns an instance of a BreakIterator implementing character breaks.
* Character breaks are boundaries of combining character sequences.
* @param where the locale.
* @param status The error code.
* @return A BreakIterator for character-breaks. The UErrorCode& status
* parameter is used to return status information to the user.
* To check whether the construction succeeded or not, you should check
* the value of U_SUCCESS(status). If you wish more detailed information, you
* can check for informational error results which still indicate success.
* U_USING_FALLBACK_WARNING indicates that a fall back locale was used. For
* example, 'de_CH' was requested, but nothing was found there, so 'de' was
* used. U_USING_DEFAULT_WARNING indicates that the default locale data was
* used; neither the requested locale nor any of its fall back locales
* could be found.
* The caller owns the returned object and is responsible for deleting it.
* @stable ICU 2.0
*/
static BreakIterator* createCharacterInstance(const Locale& where,
UErrorCode& status);
/**
* Create BreakIterator for sentence-breaks using specified locale
* Returns an instance of a BreakIterator implementing sentence breaks.
* @param where the locale.
* @param status The error code.
* @return A BreakIterator for sentence-breaks. The UErrorCode& status
* parameter is used to return status information to the user.
* To check whether the construction succeeded or not, you should check
* the value of U_SUCCESS(status). If you wish more detailed information, you
* can check for informational error results which still indicate success.
* U_USING_FALLBACK_WARNING indicates that a fall back locale was used. For
* example, 'de_CH' was requested, but nothing was found there, so 'de' was
* used. U_USING_DEFAULT_WARNING indicates that the default locale data was
* used; neither the requested locale nor any of its fall back locales
* could be found.
* The caller owns the returned object and is responsible for deleting it.
* @stable ICU 2.0
*/
static BreakIterator* createSentenceInstance(const Locale& where,
UErrorCode& status);
/**
* Create BreakIterator for title-casing breaks using the specified locale
* Returns an instance of a BreakIterator implementing title breaks.
* The iterator returned locates title boundaries as described for
* Unicode 3.2 only. For Unicode 4.0 and above title boundary iteration,
* please use the word boundary iterator; see createWordInstance().
*
* @param where the locale.
* @param status The error code.
* @return A BreakIterator for title-breaks. The UErrorCode& status
* parameter is used to return status information to the user.
* To check whether the construction succeeded or not, you should check
* the value of U_SUCCESS(status). If you wish more detailed information, you
* can check for informational error results which still indicate success.
* U_USING_FALLBACK_WARNING indicates that a fall back locale was used. For
* example, 'de_CH' was requested, but nothing was found there, so 'de' was
* used. U_USING_DEFAULT_WARNING indicates that the default locale data was
* used; neither the requested locale nor any of its fall back locales
* could be found.
* The caller owns the returned object and is responsible for deleting it.
* @stable ICU 2.1
*/
static BreakIterator* createTitleInstance(const Locale& where,
UErrorCode& status);
/**
* Get the set of Locales for which TextBoundaries are installed.
* <p><b>Note:</b> this will not return locales added through the register
* call.</p>
* @param count the output parameter of number of elements in the locale list
* @return available locales
* @stable ICU 2.0
*/
static const Locale* getAvailableLocales(int32_t& count);
/**
* Get name of the object for the desired Locale, in the desired language.
* @param objectLocale must be from getAvailableLocales.
* @param displayLocale specifies the desired locale for output.
* @param name the fill-in parameter of the return value
* Uses best match.
* @return user-displayable name
* @stable ICU 2.0
*/
static UnicodeString& getDisplayName(const Locale& objectLocale,
const Locale& displayLocale,
UnicodeString& name);
/**
* Get name of the object for the desired Locale, in the language of the
* default locale.
* @param objectLocale must be from getAvailableLocales
* @param name the fill-in parameter of the return value
* @return user-displayable name
* @stable ICU 2.0
*/
static UnicodeString& getDisplayName(const Locale& objectLocale,
UnicodeString& name);
/**
* Thread safe client-buffer-based cloning operation
* Do NOT call delete on a safeclone, since 'new' is not used to create it.
* @param stackBuffer user allocated space for the new clone. If NULL new memory will be allocated.
* If buffer is not large enough, new memory will be allocated.
* @param BufferSize reference to size of allocated space.
* If BufferSize == 0, a sufficient size for use in cloning will
* be returned ('pre-flighting')
* If BufferSize is not enough for a stack-based safe clone,
* new memory will be allocated.
* @param status to indicate whether the operation went on smoothly or there were errors
* An informational status value, U_SAFECLONE_ALLOCATED_ERROR, is used if any allocations were
* necessary.
* @return pointer to the new clone
*
* @stable ICU 2.0
*/
virtual BreakIterator * createBufferClone(void *stackBuffer,
int32_t &BufferSize,
UErrorCode &status) = 0;
/**
* Determine whether the BreakIterator was created in user memory by
* createBufferClone(), and thus should not be deleted. Such objects
* must be closed by an explicit call to the destructor (not delete).
* @return the value of the fBufferClone flag
* @stable ICU 2.0
*/
inline UBool isBufferClone(void);
/**
* Register a new break iterator of the indicated kind, to use in the given locale.
* The break iterator will be adopted. Clones of the iterator will be returned
* if a request for a break iterator of the given kind matches or falls back to
* this locale.
* @param toAdopt the BreakIterator instance to be adopted
* @param locale the Locale for which this instance is to be registered
* @param kind the type of iterator for which this instance is to be registered
* @param status the in/out status code, no special meanings are assigned
* @return a registry key that can be used to unregister this instance
* @draft ICU 2.4
*/
static URegistryKey registerInstance(BreakIterator* toAdopt, const Locale& locale, UBreakIteratorType kind, UErrorCode& status);
/**
* Unregister a previously-registered BreakIterator using the key returned from the
* register call. Key becomes invalid after a successful call and should not be used again.
* The BreakIterator corresponding to the key will be deleted.
* @param key the registry key returned by a previous call to registerInstance
* @param status the in/out status code, no special meanings are assigned
* @return TRUE if the iterator for the key was successfully unregistered
* @draft ICU 2.4
*/
static UBool unregister(URegistryKey key, UErrorCode& status);
/**
* Return a StringEnumeration over the locales available at the time of the call,
* including registered locales.
* @return a StringEnumeration over the locales available at the time of the call
* @draft ICU 2.4
*/
static StringEnumeration* getAvailableLocales(void);
private:
// Internal factory helpers. Their implementations are not part of this
// header; they are reachable only from this class and from the two friend
// classes declared below.
static BreakIterator* makeCharacterInstance(const Locale& loc, UErrorCode& status);
static BreakIterator* makeWordInstance(const Locale& loc, UErrorCode& status);
static BreakIterator* makeLineInstance(const Locale& loc, UErrorCode& status);
static BreakIterator* makeSentenceInstance(const Locale& loc, UErrorCode& status);
static BreakIterator* makeTitleInstance(const Locale& loc, UErrorCode& status);
static BreakIterator* createInstance(const Locale& loc, UBreakIteratorType kind, UErrorCode& status);
static BreakIterator* makeInstance(const Locale& loc, int32_t kind, UErrorCode& status);
friend class ICUBreakIteratorFactory;
friend class ICUBreakIteratorService;
protected:
/** Default constructor, available to subclasses only. @internal */
BreakIterator();
/** Flag reported by isBufferClone(). @internal */
UBool fBufferClone;
/**
* Copy constructor, available to subclasses only. Note that fBufferClone is
* initialized to FALSE here rather than copied from the other object.
* @internal
*/
BreakIterator (const BreakIterator &other) : UObject(other), fBufferClone(FALSE) {}
private:
/**
* The assignment operator has no real implementation: it copies nothing
* and simply returns *this.
* It's provided to make the compiler happy. Do not call.
*/
BreakIterator& operator=(const BreakIterator&) { return *this; }
};
// Accessor for the fBufferClone flag of this iterator.
inline UBool BreakIterator::isBufferClone() { return this->fBufferClone; }
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
#endif // BRKITER_H
//eof
--- NEW FILE: caniter.h ---
/*
*******************************************************************************
* Copyright (C) 1996-2003, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
#ifndef CANITER_H
#define CANITER_H
#include "unicode/utypes.h"
#if !UCONFIG_NO_NORMALIZATION
#include "unicode/uobject.h"
#include "unicode/unistr.h"
/**
* Compile-time option: should permutation skip characters with combining
* class zero? The value should be either TRUE or FALSE. It defaults to TRUE
* unless CANITER_SKIP_ZEROES is already defined when this header is included.
* @draft ICU 2.4
*/
#ifndef CANITER_SKIP_ZEROES
#define CANITER_SKIP_ZEROES TRUE
#endif
U_NAMESPACE_BEGIN
class Hashtable;
/**
* This class allows one to iterate through all the strings that are canonically equivalent to a given
* string. For example, here are some sample results:
Results for: {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}
1: \u0041\u030A\u0064\u0307\u0327
= {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}
2: \u0041\u030A\u0064\u0327\u0307
= {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE}
3: \u0041\u030A\u1E0B\u0327
= {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA}
4: \u0041\u030A\u1E11\u0307
= {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE}
5: \u00C5\u0064\u0307\u0327
= {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}
6: \u00C5\u0064\u0327\u0307
= {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE}
7: \u00C5\u1E0B\u0327
= {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA}
8: \u00C5\u1E11\u0307
= {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE}
9: \u212B\u0064\u0307\u0327
= {ANGSTROM SIGN}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}
10: \u212B\u0064\u0327\u0307
= {ANGSTROM SIGN}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE}
11: \u212B\u1E0B\u0327
= {ANGSTROM SIGN}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA}
12: \u212B\u1E11\u0307
= {ANGSTROM SIGN}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE}
*<br>Note: the code is intended for use with small strings, and is not suitable for larger ones,
* since it has not been optimized for that situation.
* Note, CanonicalIterator is not intended to be subclassed.
* @author M. Davis
* @author C++ port by V. Weinstein
* @draft ICU 2.4
*/
class U_COMMON_API CanonicalIterator : public UObject {
public:
/**
* Construct a CanonicalIterator object
* @param source string to get results for
* @param status Fill-in parameter which receives the status of this operation.
* @draft ICU 2.4
*/
CanonicalIterator(const UnicodeString &source, UErrorCode &status);
/** Destructor
* Cleans pieces
* @draft ICU 2.4
*/
~CanonicalIterator();
/**
* Gets the NFD form of the current source we are iterating over.
* @return gets the source: NOTE: it is the NFD form of source
* @draft ICU 2.4
*/
UnicodeString getSource();
/**
* Resets the iterator so that one can start again from the beginning.
* @draft ICU 2.4
*/
void reset();
/**
* Get the next canonically equivalent string.
* <br><b>Warning: The strings are not guaranteed to be in any particular order.</b>
* @return the next string that is canonically equivalent. A bogus string is returned when
* the iteration is done.
* @draft ICU 2.4
*/
UnicodeString next();
/**
* Set a new source for this iterator. Allows object reuse.
* @param newSource the source string to iterate against. This allows the same iterator to be used
* while changing the source string, saving object creation.
* @param status Fill-in parameter which receives the status of this operation.
* @draft ICU 2.4
*/
void setSource(const UnicodeString &newSource, UErrorCode &status);
/**
* Dumb recursive implementation of permutation.
* TODO: optimize
* @param source the string to find permutations for
* @param skipZeros whether to skip zeros (see CANITER_SKIP_ZEROES)
* @param result the results in a set.
* @param status Fill-in parameter which receives the status of this operation.
* @internal
*/
static void permute(UnicodeString &source, UBool skipZeros, Hashtable *result, UErrorCode &status);
/**
* ICU "poor man's RTTI", returns a UClassID for the actual class.
*
* @draft ICU 2.2
*/
virtual inline UClassID getDynamicClassID() const;
/**
* ICU "poor man's RTTI", returns a UClassID for this class.
*
* @draft ICU 2.2
*/
static inline UClassID getStaticClassID();
private:
// ===================== PRIVATES ==============================
// Default constructor; private, so it is not available to users of the class.
CanonicalIterator();
/**
* Copy constructor. Private for now.
* @internal
*/
CanonicalIterator(const CanonicalIterator& other);
/**
* Assignment operator. Private for now.
* @internal
*/
CanonicalIterator& operator=(const CanonicalIterator& other);
// fields
UnicodeString source;
UBool done;
// 2 dimensional array holds the pieces of the string with
// their different canonically equivalent representations
UnicodeString **pieces;
// NOTE(review): presumably pieces_length is the number of rows in pieces and
// pieces_lengths[i] the number of entries in row i -- confirm against the
// implementation file.
int32_t pieces_length;
int32_t *pieces_lengths;
// current is used in iterating to combine pieces
int32_t *current;
int32_t current_length;
// transient fields
UnicodeString buffer;
// we have a segment, in NFD. Find all the strings that are canonically equivalent to it.
UnicodeString *getEquivalents(const UnicodeString &segment, int32_t &result_len, UErrorCode &status); //private String[] getEquivalents(String segment)
//Set getEquivalents2(String segment);
Hashtable *getEquivalents2(const UChar *segment, int32_t segLen, UErrorCode &status);
//Hashtable *getEquivalents2(const UnicodeString &segment, int32_t segLen, UErrorCode &status);
/**
* See if the decomposition of comp is at segment starting at segmentPos
* (with canonical rearrangement!)
* If so, take the remainder, and return the equivalents
*/
//Set extract(int comp, String segment, int segmentPos, StringBuffer buffer);
Hashtable *extract(UChar32 comp, const UChar *segment, int32_t segLen, int32_t segmentPos, UErrorCode &status);
//Hashtable *extract(UChar32 comp, const UnicodeString &segment, int32_t segLen, int32_t segmentPos, UErrorCode &status);
// Per the destructor's documentation ("Cleans pieces"), this is the cleanup
// routine for the pieces data; the implementation is not in this header.
void cleanPieces();
/**
* The address of this static class variable serves as this class's ID
* for ICU "poor man's RTTI".
*/
static const char fgClassID;
};
// The class ID is the address of the static member fgClassID.
inline UClassID CanonicalIterator::getStaticClassID()
{
    const char *classTag = &fgClassID;
    return (UClassID)classTag;
}
// Dynamic class ID: forwards to the static class ID of CanonicalIterator.
inline UClassID CanonicalIterator::getDynamicClassID() const
{
    return getStaticClassID();
}
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_NORMALIZATION */
#endif
--- NEW FILE: dbbi.h ---
/*
**********************************************************************
* Copyright (C) 1999-2003 IBM Corp. All rights reserved.
**********************************************************************
* Date Name Description
* 12/1/99 rgillam Complete port from Java.
* 01/13/2000 helena Added UErrorCode to ctors.
**********************************************************************
*/
#ifndef DBBI_H
#define DBBI_H
#include "unicode/rbbi.h"
#if !UCONFIG_NO_BREAK_ITERATION
U_NAMESPACE_BEGIN
/* forward declaration */
class DictionaryBasedBreakIteratorTables;
/**
* A subclass of RuleBasedBreakIterator that adds the ability to use a dictionary
* to further subdivide ranges of text beyond what is possible using just the
* state-table-based algorithm. This is necessary, for example, to handle
* word and line breaking in Thai, which doesn't use spaces between words. The
* state-table-based algorithm used by RuleBasedBreakIterator is used to divide
* up text as far as possible, and then contiguous ranges of letters are
* repeatedly compared against a list of known words (i.e., the dictionary)
* to divide them up into words.
*
* <p>Applications do not normally need to include this header.</p>
*
* <p>This class will probably be deprecated in a future release of ICU, and replaced
* with a more flexible and capable dictionary based break iterator. This change
* should be invisible to applications, because creation and use of instances of
* DictionaryBasedBreakIterator is through the factories and abstract
* API on class BreakIterator, which will remain stable.</p>
*
* <p>This class is not intended to be subclassed.</p>
*
*
* DictionaryBasedBreakIterator uses the same rule language as RuleBasedBreakIterator,
* but adds one more special substitution name: &lt;dictionary&gt;. This substitution
* name is used to identify characters in words in the dictionary. The idea is that
* if the iterator passes over a chunk of text that includes two or more characters
* in a row that are included in &lt;dictionary&gt;, it goes back through that range and
* derives additional break positions (if possible) using the dictionary.
*
* DictionaryBasedBreakIterator is also constructed with the filename of a dictionary
* file. It follows a prescribed search path to locate the dictionary (right now,
* it looks for it in /com/ibm/text/resources in each directory in the classpath,
* and won't find it in JAR files, but this location is likely to change). The
* dictionary file is in a serialized binary format. We have a very primitive (and
* slow) BuildDictionaryFile utility for creating dictionary files, but aren't
* currently making it public. Contact us for help.
* <p>
* <b> NOTE </b> The DictionaryBasedBreakIterator class is still under development. The
* APIs are not in stable condition yet.
*/
class U_COMMON_API DictionaryBasedBreakIterator : public RuleBasedBreakIterator {
private:
/**
* when a range of characters is divided up using the dictionary, the break
* positions that are discovered are stored here, preventing us from having
* to use either the dictionary or the state table again until the iterator
* leaves this range of text
*/
int32_t* cachedBreakPositions;
/**
* The number of elements in cachedBreakPositions
*/
int32_t numCachedBreakPositions;
/**
* if cachedBreakPositions is not null, this indicates which item in the
* cache the current iteration position refers to
*/
int32_t positionInCache;
/**
* NOTE(review): presumably the dictionary tables consulted by this iterator.
* The type is only forward-declared in this header -- confirm against the
* implementation.
*/
DictionaryBasedBreakIteratorTables *fTables;
/**
* Class ID. Its address is what getStaticClassID() returns.
*/
static const char fgClassID;
/**=======================================================================
* Create a dictionary based break boundary detection iterator.
* This constructor is private; BreakIterator and
* DictionaryBasedBreakIteratorTables are declared as friends of this class.
* @param tablesImage The location for the dictionary to be loaded into memory
* @param dictionaryFilename The name of the dictionary file
* @param status the error code status. It is used to return status
* information to the user. To check whether the construction succeeded
* or not, you should check the value of U_SUCCESS(status). If you wish
* more detailed information, you can check for informational error
* results which still indicate success. For example,
* U_FILE_ACCESS_ERROR will be returned if the file does not exist.
======================================================================= */
DictionaryBasedBreakIterator(UDataMemory* tablesImage, const char* dictionaryFilename, UErrorCode& status);
public:
//=======================================================================
// boilerplate
//=======================================================================
/**
* Destructor
* @stable ICU 2.0
*/
virtual ~DictionaryBasedBreakIterator();
/**
* Default constructor. Creates an "empty" break iterator.
* Such an iterator can subsequently be assigned to.
* @stable ICU 2.0
*/
DictionaryBasedBreakIterator();
/**
* Copy constructor.
* @param other The DictionaryBasedBreakIterator to be copied.
* @stable ICU 2.0
*/
DictionaryBasedBreakIterator(const DictionaryBasedBreakIterator &other);
/**
* Assignment operator.
* @param that The object to be copied.
* @return the newly set DictionaryBasedBreakIterator.
* @stable ICU 2.0
*/
DictionaryBasedBreakIterator& operator=(const DictionaryBasedBreakIterator& that);
/**
* Returns a newly-constructed break iterator with the same
* behavior, and iterating over the same text, as this one.
* @return Pointer to the newly-constructed break iterator.
* @stable ICU 2.0
*/
virtual BreakIterator* clone(void) const;
//=======================================================================
// BreakIterator overrides
//=======================================================================
/**
* Advances the iterator backwards, to the last boundary preceding this one.
* @return The position of the last boundary position preceding this one.
* @stable ICU 2.0
*/
virtual int32_t previous(void);
/**
* Sets the iterator to refer to the first boundary position following
* the specified position.
* @param offset The position from which to begin searching for a break position.
* @return The position of the first break after the current position.
* @stable ICU 2.0
*/
virtual int32_t following(int32_t offset);
/**
* Sets the iterator to refer to the last boundary position before the
* specified position.
* @param offset The position to begin searching for a break from.
* @return The position of the last boundary before the starting position.
* @stable ICU 2.0
*/
virtual int32_t preceding(int32_t offset);
/**
* Returns a unique class ID POLYMORPHICALLY. Pure virtual override.
* This method is to implement a simple version of RTTI, since not all
* C++ compilers support genuine RTTI. Polymorphic operator==() and
* clone() methods call this method.
*
* @return The class ID for this object. All objects of a
* given class have the same class ID. Objects of
* other classes have different class IDs.
* @stable ICU 2.0
*/
virtual UClassID getDynamicClassID(void) const;
/**
* Returns the class ID for this class. This is useful only for
* comparing to a return value from getDynamicClassID(). For example:
*
* Base* polymorphic_pointer = createPolymorphicObject();
* if (polymorphic_pointer->getDynamicClassID() ==
* Derived::getStaticClassID()) ...
*
* @return The class ID for all objects of this class.
* @stable ICU 2.0
*/
static inline UClassID getStaticClassID(void);
protected:
//=======================================================================
// implementation
//=======================================================================
/**
* This method is the actual implementation of the next() method. All iteration
* vectors through here. This method initializes the state machine to state 1
* and advances through the text character by character until we reach the end
* of the text or the state machine transitions to state 0. We update our return
* value every time the state machine passes through a possible end state.
* @return NOTE(review): presumably the offset of the next boundary -- the
* return value is not documented here; confirm against the implementation.
* @internal
*/
virtual int32_t handleNext(void);
/**
* removes the cache of break positions (usually in response to a change in
* position of some sort)
* @internal
*/
virtual void reset(void);
/**
* init Initialize a dbbi. Common routine for use by constructors.
* @internal
*/
void init();
/**
* Creates a clone of this break iterator, using caller-supplied memory
* when it is large enough.
* @param stackBuffer user allocated space for the new clone. If NULL new memory will be allocated.
* If buffer is not large enough, new memory will be allocated.
* @param BufferSize reference to size of allocated space.
* If BufferSize == 0, a sufficient size for use in cloning will
* be returned ('pre-flighting')
* If BufferSize is not enough for a stack-based safe clone,
* new memory will be allocated.
* @param status to indicate whether the operation went on smoothly or there were errors
* An informational status value, U_SAFECLONE_ALLOCATED_ERROR, is used if any allocations were
* necessary.
* @return pointer to the new clone
* @internal
*/
virtual BreakIterator * createBufferClone(void *stackBuffer,
int32_t &BufferSize,
UErrorCode &status);
private:
/**
* This is the function that actually implements the dictionary-based
* algorithm. Given the endpoints of a range of text, it uses the
* dictionary to determine the positions of any boundaries in this
* range. It stores all the boundary positions it discovers in
* cachedBreakPositions so that we only have to do this work once
* for each time we enter the range.
* @param startPos The start position of a range of text
* @param endPos The end position of a range of text
* @param status The error code status
*/
void divideUpDictionaryRange(int32_t startPos, int32_t endPos, UErrorCode &status);
/*
* HSYS : Please revisit with Rich, the data-loading ctor of the DBBI class is
* currently marked as private, so the classes that need it are friends.
*/
friend class DictionaryBasedBreakIteratorTables;
friend class BreakIterator;
};
/**
 * Returns the class ID for DictionaryBasedBreakIterator: the address of the
 * static member fgClassID, cast to UClassID.
 */
inline UClassID
DictionaryBasedBreakIterator::getStaticClassID(void)
{
    const char *classMarker = &fgClassID;
    return (UClassID)classMarker;
}
/**
 * Returns the class ID of this object by forwarding to
 * DictionaryBasedBreakIterator::getStaticClassID().
 */
inline UClassID
DictionaryBasedBreakIterator::getDynamicClassID(void) const
{
    UClassID classID = DictionaryBasedBreakIterator::getStaticClassID();
    return classID;
}
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
#endif
--- NEW FILE: parseerr.h ---
/*
**********************************************************************
* Copyright (C) 1999-2000, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
* 03/14/00 aliu Creation.
* 06/27/00 aliu Change from C++ class to C struct
**********************************************************************
*/
#ifndef PARSEERR_H
#define PARSEERR_H
#include "unicode/utypes.h"
/**
* The capacity, in UChars, of the context strings in UParseError
* (the preContext and postContext arrays are declared with this size).
* Those strings are documented as null-terminated, so the terminator
* has to fit within this capacity.
* @stable ICU 2.0
*/
enum { U_PARSE_CONTEXT_LEN = 16 };
/**
* A UParseError struct is used to return detailed information about
* parsing errors. It is used by ICU parsing engines that parse long
* rules, patterns, or programs, where the text being parsed is long
* enough that more information than a UErrorCode is needed to
* localize the error.
*
* <p>The code field is an integer error code specific to each parsing
* engine, but globally unique. See the engine header file for
* possible values. The line, offset, and context fields are
* optional; parsing engines may choose not to use them.
*
* <p>Examples of engines which use UParseError (or may use it in the
* future) are RuleBasedTransliterator and RuleBasedBreakIterator.
*
* @stable ICU 2.0
*/
typedef struct UParseError {
/*
* NOTE: the "code" field described here is currently disabled; its
* declaration below is commented out, so the struct has no such member.
*
* An integer indicating the type of error. If no error was
* encountered, the parse engine sets this to zero, and the
* other fields' values should be ignored.
*
* <p>Each parse engine should use a range of codes from
* 0xNNNN0001 to 0xNNNNFFFF, where NNNN is a 16-bit integer
* between 0x0001 and 0xFFFF unique to each parse engine.
* Parse engines should define the enum PARSE_ERROR_BASE
* to be 0xNNNN0000.
*/
/*int32_t code; */
/**
* The line on which the error occurred. If the parse engine
* is not using this field, it should set it to zero. Otherwise
* it should be a positive integer. The default value of this field
* is -1. It will be set to 0 if the code populating this struct is not
* using line numbers.
* @stable ICU 2.0
*/
int32_t line;
/**
* The character offset to the error. If the line field is
* being used, then this offset is from the start of the line.
* If the line field is not being used, then this offset is from
* the start of the text. The default value of this field
* is -1. It will be set to an appropriate value by the code that
* populates the struct.
* @stable ICU 2.0
*/
int32_t offset;
/**
* Textual context before the error. Null-terminated.
* May be the empty string if not implemented by parser.
* Capacity is U_PARSE_CONTEXT_LEN UChars.
* @stable ICU 2.0
*/
UChar preContext[U_PARSE_CONTEXT_LEN];
/**
* Textual context after the error. Null-terminated.
* May be the empty string if not implemented by parser.
* Capacity is U_PARSE_CONTEXT_LEN UChars.
* @stable ICU 2.0
*/
UChar postContext[U_PARSE_CONTEXT_LEN];
} UParseError;
#endif
--- NEW FILE: parsepos.h ---
/*
* Copyright (C) {1997-2003}, International Business Machines Corporation and others. All Rights Reserved.
*******************************************************************************
*
* File PARSEPOS.H
*
* Modification History:
*
* Date Name Description
* 07/09/97 helena Converted from java.
* 07/17/98 stephen Added errorIndex support.
* 05/11/99 stephen Cleaned up.
*******************************************************************************
*/
#ifndef PARSEPOS_H
#define PARSEPOS_H
#include "unicode/utypes.h"
#include "unicode/uobject.h"
U_NAMESPACE_BEGIN
/**
* <code>ParsePosition</code> is a simple class used by <code>Format</code>
* and its subclasses to keep track of the current position during parsing.
* The <code>parseObject</code> method in the various <code>Format</code>
* classes requires a <code>ParsePosition</code> object as an argument.
*
* <p>
* By design, as you parse through a string with different formats,
* you can use the same <code>ParsePosition</code>, since the index parameter
* records the current position.
*
* The ParsePosition class is not suitable for subclassing.
*
* @version 1.3 10/30/97
* @author Mark Davis, Helena Shih
* @see java.text.Format
*/
class U_COMMON_API ParsePosition : public UObject {
public:
    /**
     * Default constructor. The parse index starts at 0 and no error is
     * recorded (the error index is -1).
     * @stable ICU 2.0
     */
    ParsePosition()
        : UObject(),
          index(0),
          errorIndex(-1)
    {}

    /**
     * Creates a ParsePosition that starts at the given index, with no
     * error recorded (the error index is -1).
     * @param newIndex the initial text offset.
     * @stable ICU 2.0
     */
    ParsePosition(int32_t newIndex)
        : UObject(),
          index(newIndex),
          errorIndex(-1)
    {}

    /**
     * Copy constructor. Copies both the index and the error index.
     * @param copy the object to be copied from.
     * @stable ICU 2.0
     */
    ParsePosition(const ParsePosition& copy)
        : UObject(copy),
          index(copy.index),
          errorIndex(copy.errorIndex)
    {}

    /**
     * Destructor
     * @stable ICU 2.0
     */
    ~ParsePosition() {}

    /**
     * Assignment operator. Copies both the index and the error index.
     * @param copy the object to be copied from.
     * @return a reference to this object.
     * @stable ICU 2.0
     */
    ParsePosition& operator=(const ParsePosition& copy);

    /**
     * Equality operator.
     * @param that the object to compare with.
     * @return TRUE if the two parse positions are equal, FALSE otherwise.
     * @stable ICU 2.0
     */
    UBool operator==(const ParsePosition& that) const;

    /**
     * Inequality operator.
     * @param that the object to compare with.
     * @return TRUE if the two parse positions are not equal, FALSE otherwise.
     * @stable ICU 2.0
     */
    UBool operator!=(const ParsePosition& that) const;

    /**
     * Retrieve the current parse position. On input to a parse method, this
     * is the index of the character at which parsing will begin; on output, it
     * is the index of the character following the last character parsed.
     * @return the current index.
     * @stable ICU 2.0
     */
    int32_t getIndex(void) const;

    /**
     * Set the current parse position.
     * @param index the new index.
     * @stable ICU 2.0
     */
    void setIndex(int32_t index);

    /**
     * Set the index at which a parse error occurred. Formatters
     * should set this before returning an error code from their
     * parseObject method. The default value is -1 if this is not
     * set.
     * @param ei the index at which the error occurred.
     * @stable ICU 2.0
     */
    void setErrorIndex(int32_t ei);

    /**
     * Retrieve the index at which an error occurred, or -1 if the
     * error index has not been set.
     * @return the error index.
     * @stable ICU 2.0
     */
    int32_t getErrorIndex(void) const;

    /**
     * ICU "poor man's RTTI", returns a UClassID for the actual class.
     *
     * @draft ICU 2.2
     */
    virtual inline UClassID getDynamicClassID() const;

    /**
     * ICU "poor man's RTTI", returns a UClassID for this class.
     *
     * @draft ICU 2.2
     */
    static inline UClassID getStaticClassID();

private:
    /**
     * Input: the place you start parsing.
     * <br>Output: position where the parse stopped.
     * This is designed to be used serially,
     * with each call setting index up for the next one.
     */
    int32_t index;

    /**
     * The index at which a parse error occurred; -1 when none is recorded.
     */
    int32_t errorIndex;

    /**
     * The address of this static class variable serves as this class's ID
     * for ICU "poor man's RTTI".
     */
    static const char fgClassID;
};
/**
 * Returns the class ID for ParsePosition: the address of the static member
 * fgClassID, cast to UClassID.
 */
inline UClassID
ParsePosition::getStaticClassID()
{
    const char *classMarker = &fgClassID;
    return (UClassID)classMarker;
}
/**
 * Returns the class ID of this object by forwarding to
 * ParsePosition::getStaticClassID().
 */
inline UClassID
ParsePosition::getDynamicClassID() const
{
    UClassID classID = getStaticClassID();
    return classID;
}
/**
 * Copies the index and the error index from another ParsePosition.
 * @param copy the object to copy from.
 * @return a reference to this object.
 */
inline ParsePosition&
ParsePosition::operator=(const ParsePosition& copy)
{
    // The two fields are independent, so the copy order does not matter.
    this->errorIndex = copy.errorIndex;
    this->index = copy.index;
    return *this;
}
/**
 * Compares two parse positions field by field.
 * @param copy the object to compare with.
 * @return TRUE when both the index and the error index match, FALSE otherwise.
 */
inline UBool
ParsePosition::operator==(const ParsePosition& copy) const
{
    if (index == copy.index && errorIndex == copy.errorIndex) {
        return TRUE;
    }
    return FALSE;
}
/**
 * Negation of operator==.
 * @param copy the object to compare with.
 * @return TRUE when the two parse positions differ, FALSE otherwise.
 */
inline UBool
ParsePosition::operator!=(const ParsePosition& copy) const
{
    return !(*this == copy);
}
/**
 * Returns the stored parse index.
 */
inline int32_t
ParsePosition::getIndex() const
{
    return this->index;
}
/**
 * Stores a new parse index.
 * @param offset the new index.
 */
inline void
ParsePosition::setIndex(int32_t offset)
{
    index = offset;
}
/**
 * Returns the stored error index.
 */
inline int32_t
ParsePosition::getErrorIndex() const
{
    return this->errorIndex;
}
/**
 * Stores the index at which a parse error occurred.
 * @param ei the error index.
 */
inline void
ParsePosition::setErrorIndex(int32_t ei)
{
    errorIndex = ei;
}
U_NAMESPACE_END
#endif
--- NEW FILE: rbbi.h ---
/*
***************************************************************************
* Copyright (C) 1999-2003 International Business Machines Corporation *
* and others. All rights reserved. *
***************************************************************************
**********************************************************************
* Date Name Description
* 10/22/99 alan Creation.
* 11/11/99 rgillam Complete port from Java.
**********************************************************************
*/
#ifndef RBBI_H
#define RBBI_H
#include "unicode/utypes.h"
#if !UCONFIG_NO_BREAK_ITERATION
#include "unicode/brkiter.h"
#include "unicode/udata.h"
#include "unicode/parseerr.h"
struct UTrie;
U_NAMESPACE_BEGIN
struct RBBIDataHeader;
class RuleBasedBreakIteratorTables;
class BreakIterator;
class RBBIDataWrapper;
/**
* A subclass of BreakIterator whose behavior is specified using a list of rules.
* <p>Instances of this class are most commonly created by the factory methods of
* BreakIterator::createWordInstance(), BreakIterator::createLineInstance(), etc.,
* and then used via the abstract API in class BreakIterator</p>
*
* <p>See the ICU User Guide for information on Break Iterator Rules.</p>
*
* <p>This class is not intended to be subclassed. (Class DictionaryBasedBreakIterator
* is a subclass, but that relationship is effectively internal to the ICU
* implementation. The subclassing interface to RuleBasedBreakIterator is
* not part of the ICU API, and may not remain stable.)</p>
*
*/
class U_COMMON_API RuleBasedBreakIterator : public BreakIterator {
protected:
/**
* The character iterator through which this BreakIterator accesses the text
* @internal
*/
CharacterIterator* fText;
/**
* The rule data for this BreakIterator instance
* @internal
*/
RBBIDataWrapper *fData;
/** @internal */
UTrie *fCharMappings;
/** Rule {tag} value for the most recent match.
* @internal
*/
int32_t fLastBreakTag;
/**
* Rule tag value valid flag.
* Some iterator operations don't intrinsically set the correct tag value.
* This flag lets us lazily compute the value if we are ever asked for it.
* @internal
*/
UBool fLastBreakTagValid;
/**
* Counter for the number of characters encountered with the "dictionary"
* flag set. Normal RBBI iterators don't use it, although the code
* for updating it is live. Dictionary Based break iterators (a subclass
* of us) access this field directly.
* @internal
*/
uint32_t fDictionaryCharCount;
/**
* Debugging flag. Trace operation of state machine when true.
* @internal
*/
static UBool fTrace;
private:
/**
* Class ID
*/
static const char fgClassID;
protected:
//=======================================================================
// constructors
//=======================================================================
/**
* This constructor uses the udata interface to create a BreakIterator
* whose internal tables live in a memory-mapped file. "image" is a pointer
* to the beginning of that file.
* @internal
*/
RuleBasedBreakIterator(UDataMemory* image, UErrorCode &status);
/**
* Constructor from a flattened set of RBBI data in malloced memory.
* RulesBasedBreakIterators built from a custom set of rules
* are created via this constructor; the rules are compiled
* into memory, then the break iterator is constructed here.
*
* The break iterator adopts the memory, and will
* free it when done.
* @internal
*/
RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status);
friend class RBBIRuleBuilder; /** @internal */
friend class BreakIterator;
public:
/** Default constructor. Creates an empty shell of an iterator, with no
* rules or text to iterate over. Object can subsequently be assigned to.
* @draft ICU 2.2
*/
RuleBasedBreakIterator();
/**
* Copy constructor. Will produce a break iterator with the same behavior,
* and which iterates over the same text, as the one passed in.
* @param that The RuleBasedBreakIterator passed to be copied
* @stable ICU 2.0
*/
RuleBasedBreakIterator(const RuleBasedBreakIterator& that);
/**
* Construct a RuleBasedBreakIterator from a set of rules supplied as a string.
* @param rules The break rules to be used.
* @param parseError In the event of a syntax error in the rules, provides the location
* within the rules of the problem.
* @param status Information on any errors encountered.
* @draft ICU 2.2
*/
RuleBasedBreakIterator( const UnicodeString &rules,
UParseError &parseError,
UErrorCode &status);
/**
* Destructor
* @stable ICU 2.0
*/
virtual ~RuleBasedBreakIterator();
/**
* Assignment operator. Sets this iterator to have the same behavior,
* and iterate over the same text, as the one passed in.
* @param that The RuleBasedBreakItertor passed in
* @return the newly created RuleBasedBreakIterator
* @stable ICU 2.0
*/
RuleBasedBreakIterator& operator=(const RuleBasedBreakIterator& that);
/**
* Equality operator. Returns TRUE if both BreakIterators are of the
* same class, have the same behavior, and iterate over the same text.
* @param that The BreakIterator to be compared for equality
* @Return TRUE if both BreakIterators are of the
* same class, have the same behavior, and iterate over the same text.
* @stable ICU 2.0
*/
virtual UBool operator==(const BreakIterator& that) const;
/**
* Not-equal operator. If operator== returns TRUE, this returns FALSE,
* and vice versa.
* @param that The BreakIterator to be compared for inequality
* @return TRUE if both BreakIterators are not same.
* @stable ICU 2.0
*/
UBool operator!=(const BreakIterator& that) const;
/**
* Returns a newly-constructed RuleBasedBreakIterator with the same
* behavior, and iterating over the same text, as this one.
* Differs from the copy constructor in that it is polymorphic, and
* will correctly clone (copy) a derived class.
* clone() is thread safe. Multiple threads may simultaeneously
* clone the same source break iterator.
* @stable ICU 2.0
*/
virtual BreakIterator* clone() const;
/**
* Compute a hash code for this BreakIterator
* @return A hash code
* @stable ICU 2.0
*/
virtual int32_t hashCode(void) const;
/**
* Returns the description used to create this iterator
* @return the description used to create this iterator
* @stable ICU 2.0
*/
virtual const UnicodeString& getRules(void) const;
//=======================================================================
// BreakIterator overrides
//=======================================================================
/**
* Return a CharacterIterator over the text being analyzed. This version
* of this method returns the actual CharacterIterator we're using internally.
* Changing the state of this iterator can have undefined consequences. If
* you need to change it, clone it first.
* @return An iterator over the text being analyzed.
* @stable ICU 2.0
*/
virtual const CharacterIterator& getText(void) const;
/**
* Set the iterator to analyze a new piece of text. This function resets
* the current iteration position to the beginning of the text.
* @param newText An iterator over the text to analyze. The BreakIterator
* takes ownership of the character iterator. The caller MUST NOT delete it!
* @stable ICU 2.0
*/
virtual void adoptText(CharacterIterator* newText);
/**
* Set the iterator to analyze a new piece of text. This function resets
* the current iteration position to the beginning of the text.
* @param newText The text to analyze.
* @stable ICU 2.0
*/
virtual void setText(const UnicodeString& newText);
/**
* Sets the current iteration position to the beginning of the text.
* (i.e., the CharacterIterator's starting offset).
* @return The offset of the beginning of the text.
* @stable ICU 2.0
*/
virtual int32_t first(void);
/**
* Sets the current iteration position to the end of the text.
* (i.e., the CharacterIterator's ending offset).
* @return The text's past-the-end offset.
* @stable ICU 2.0
*/
virtual int32_t last(void);
/**
* Advances the iterator either forward or backward the specified number of steps.
* Negative values move backward, and positive values move forward. This is
* equivalent to repeatedly calling next() or previous().
* @param n The number of steps to move. The sign indicates the direction
* (negative is backwards, and positive is forwards).
* @return The character offset of the boundary position n boundaries away from
* the current one.
* @stable ICU 2.0
*/
virtual int32_t next(int32_t n);
/**
* Advances the iterator to the next boundary position.
* @return The position of the first boundary after this one.
* @stable ICU 2.0
*/
virtual int32_t next(void);
/**
* Moves the iterator backwards, to the last boundary preceding this one.
* @return The position of the last boundary position preceding this one.
* @stable ICU 2.0
*/
virtual int32_t previous(void);
/**
* Sets the iterator to refer to the first boundary position following
* the specified position.
* @param offset The position from which to begin searching for a break position.
* @return The position of the first break after the current position.
* @stable ICU 2.0
*/
virtual int32_t following(int32_t offset);
/**
* Sets the iterator to refer to the last boundary position before the
* specified position.
* @param offset The position to begin searching for a break from.
* @return The position of the last boundary before the starting position.
* @stable ICU 2.0
*/
virtual int32_t preceding(int32_t offset);
/**
* Returns true if the specfied position is a boundary position. As a side
* effect, leaves the iterator pointing to the first boundary position at
* or after "offset".
* @param offset the offset to check.
* @return True if "offset" is a boundary position.
* @stable ICU 2.0
*/
virtual UBool isBoundary(int32_t offset);
/**
* Returns the current iteration position.
* @return The current iteration position.
* @stable ICU 2.0
*/
virtual int32_t current(void) const;
/**
* Return the status tag from the break rule that determined the most recently
* returned break position. The values appear in the rule source
* within brackets, {123}, for example. For rules that do not specify a
* status, a default value of 0 is returned.
* <p>
* Of the standard types of ICU break iterators, only the word break
* iterator provides status values. The values are defined in
* <code>enum UWordBreak</code>, and allow distinguishing between words
* that contain alphabetic letters, "words" that appear to be numbers,
* punctuation and spaces, words containing ideographic characters, and
* more. Call <code>getRuleStatus</code> after obtaining a boundary
* position from <code>next()<code>, <code>previous()</code>, or
* any other break iterator functions that returns a boundary position.
* <p>
* @return the status from the break rule that determined the most recently
* returned break position.
*
* @see UWordBreak
* @draft ICU 2.2
*/
virtual int32_t getRuleStatus() const;
/**
* Returns a unique class ID POLYMORPHICALLY. Pure virtual override.
* This method is to implement a simple version of RTTI, since not all
* C++ compilers support genuine RTTI. Polymorphic operator==() and
* clone() methods call this method.
*
* @return The class ID for this object. All objects of a
* given class have the same class ID. Objects of
* other classes have different class IDs.
* @stable ICU 2.0
*/
inline virtual UClassID getDynamicClassID(void) const;
/**
* Returns the class ID for this class. This is useful only for
* comparing to a return value from getDynamicClassID(). For example:
*
* Base* polymorphic_pointer = createPolymorphicObject();
* if (polymorphic_pointer->getDynamicClassID() ==
* Derived::getStaticClassID()) ...
*
* @return The class ID for all objects of this class.
* @stable ICU 2.0
*/
inline static UClassID getStaticClassID(void);
/*
* Create a clone (copy) of this break iterator in memory provided
* by the caller. The idea is to increase performance by avoiding
* a storage allocation. Use of this functoin is NOT RECOMMENDED.
* Performance gains are minimal, and correct buffer management is
* tricky. Use clone() instead.
*
* @param stackBuffer The pointer to the memory into which the cloned object
* should be placed. If NULL, allocate heap memory
* for the cloned object.
* @param BufferSize The size of the buffer. If zero, return the required
* buffer size, but do not clone the object. If the
* size was too small (but not zero), allocate heap
* storage for the cloned object.
*
* @param status Error status. U_SAFECLONE_ALLOCATED_WARNING will be
* returned if the the provided buffer was too small, and
* the clone was therefore put on the heap.
*
* @return Pointer to the clone object. This may differ from the stackBuffer
* address if the byte alignment of the stack buffer was not suitable
* or if the stackBuffer was too small to hold the clone.
* @stable ICU 2.0
*/
virtual BreakIterator * createBufferClone(void *stackBuffer,
int32_t &BufferSize,
UErrorCode &status);
/**
* Return the binary form of compiled break rules,
* which can then be used to create a new break iterator at some
* time in the future. Creating a break iterator from pre-compiled rules
* is much faster than building one from the source form of the
* break rules.
*
* The binary data is can only be used with the same version of ICU
* and on the same platform type (processor endian-ness)
*
* @param length Returns the length of the binary data. (Out paramter.)
*
* @return A pointer to the binary (compiled) rule data. The storage
* belongs to the RulesBasedBreakIterator object, not the
* caller, and must not be modified or deleted.
* @internal
*/
virtual const uint8_t *getBinaryRules(uint32_t &length);
protected:
//=======================================================================
// implementation
//=======================================================================
/**
* This method is the actual implementation of the next() method. All iteration
* vectors through here. This method initializes the state machine to state 1
* and advances through the text character by character until we reach the end
* of the text or the state machine transitions to state 0. We update our return
* value every time the state machine passes through a possible end state.
* @internal
*/
virtual int32_t handleNext(void);
/**
* This method backs the iterator back up to a "safe position" in the text.
* This is a position that we know, without any context, must be a break position.
* The various calling methods then iterate forward from this safe position to
* the appropriate position to return. (For more information, see the description
* of buildBackwardsStateTable() in RuleBasedBreakIterator.Builder.)
* @internal
*/
virtual int32_t handlePrevious(void);
/**
* Dumps caches and performs other actions associated with a complete change
* in text or iteration position. This function is a no-op in RuleBasedBreakIterator,
* but subclasses can and do override it.
* @internal
*/
virtual void reset(void);
/**
* Return true if the category lookup for this char
* indicates that it is in the set of dictionary lookup chars.
* This function is intended for use by dictionary based break iterators.
* The single (unnamed) UChar32 argument is the character to look up.
* @return true if the category lookup for this char
* indicates that it is in the set of dictionary lookup chars.
* @internal
*/
virtual UBool isDictionaryChar(UChar32);
/**
* Common initialization function, used by constructors and bufferClone.
* (Also used by DictionaryBasedBreakIterator::createBufferClone().)
* @internal
*/
void init();
};
//----------------------------------------------------------------------------------
//
// Inline Functions Definitions ...
//
//----------------------------------------------------------------------------------
/**
 * Inequality test, defined as the logical negation of operator==.
 * @param that The break iterator to compare against.
 * @return The negated result of operator==(that).
 */
inline UBool RuleBasedBreakIterator::operator!=(const BreakIterator& that) const {
    const UBool same = this->operator==(that);
    return !same;
}
/**
 * Class ID shared by all RuleBasedBreakIterator objects: the address of the
 * static member fgClassID, converted to UClassID.
 * @return The class ID for this class.
 */
inline UClassID RuleBasedBreakIterator::getStaticClassID(void) {
    UClassID classID = (UClassID)(&fgClassID);
    return classID;
}
/**
 * Class ID of this object; for RuleBasedBreakIterator this is exactly the
 * value reported by RuleBasedBreakIterator::getStaticClassID().
 * @return The class ID for this object.
 */
inline UClassID RuleBasedBreakIterator::getDynamicClassID(void) const {
    UClassID classID = RuleBasedBreakIterator::getStaticClassID();
    return classID;
}
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
#endif
--- NEW FILE: strenum.h ---
/*
*******************************************************************************
*
* Copyright (C) 2002, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
*/
#ifndef STRENUM_H
#define STRENUM_H
#include "unicode/uobject.h"
U_NAMESPACE_BEGIN
class UnicodeString;
/**
* Base class for 'pure' C++ implementations of uenum api. Adds a
* method that returns the next UnicodeString since in C++ this can
* be a common storage format for strings.
*
* <p>The model is that the enumeration is over strings maintained by
* a 'service.' At any point, the service might change, invalidating
* the enumerator (though this is expected to be rare). The iterator
* returns an error if this has occurred. Lack of the error is no
* guarantee that the service didn't change immediately after the
* call, so the returned string still might not be 'valid' on
* subsequent use.</p>
*
* <p>Strings may take the form of const char*, const UChar*, or const
* UnicodeString*. The type you get is determined by the variant of
* 'next' that you call. In general the StringEnumeration is
* optimized for one of these types, but all StringEnumerations can
* return all types. Returned strings are each terminated with a NUL.
* Depending on the service data, they might also include embedded NUL
* characters, so API is provided to optionally return the true
* length, counting the embedded NULs but not counting the terminating
* NUL.</p>
*
* <p>The pointers returned by next, unext, and snext become invalid
* upon any subsequent call to the enumeration's destructor, next,
* unext, snext, or reset.</p>
*
* @draft ICU 2.4
*/
class U_COMMON_API StringEnumeration : public UObject {
public:
/**
* Destructor.
* @draft ICU 2.4
*/
virtual ~StringEnumeration();
/**
* <p>Return the number of elements that the iterator traverses. If
* the iterator is out of sync with its service, status is set to
* U_ENUM_OUT_OF_SYNC_ERROR, and the return value is zero.</p>
*
* <p>The return value will not change except possibly as a result of
* a subsequent call to reset, or if the iterator becomes out of sync.</p>
*
* <p>This is a convenience function. It can end up being very
* expensive as all the items might have to be pre-fetched
* (depending on the storage format of the data being
* traversed).</p>
*
* @param status the error code.
* @return number of elements in the iterator.
*
* @draft ICU 2.4 */
virtual int32_t count(UErrorCode& status) const = 0;
/**
* <p>Returns the next element as a NUL-terminated char*. If there
* are no more elements, returns NULL. If the resultLength pointer
* is not NULL, the length of the string (not counting the
* terminating NUL) is returned at that address. If an error
* status is returned, the value at resultLength is undefined.</p>
*
* <p>The returned pointer is owned by this iterator and must not be
* deleted by the caller. The pointer is valid until the next call
* to next, unext, snext, reset, or the enumerator's destructor.</p>
*
* <p>If the iterator is out of sync with its service, status is set
* to U_ENUM_OUT_OF_SYNC_ERROR and NULL is returned.</p>
*
* <p>If the native service string is a UChar* string, it is
* converted to char* with the invariant converter. If the
* conversion fails (because a character cannot be converted) then
* status is set to U_INVARIANT_CONVERSION_ERROR and the return
* value is undefined (though not NULL).</p>
*
* @param resultLength a pointer to receive the length, can be NULL.
* @param status the error code.
* @return a pointer to the string, or NULL.
*
* @draft ICU 2.4
*/
virtual const char* next(int32_t *resultLength, UErrorCode& status) = 0;
/**
* <p>Returns the next element as a NUL-terminated UChar*. If there
* are no more elements, returns NULL. If the resultLength pointer
* is not NULL, the length of the string (not counting the
* terminating NUL) is returned at that address. If an error
* status is returned, the value at resultLength is undefined.</p>
*
* <p>The returned pointer is owned by this iterator and must not be
* deleted by the caller. The pointer is valid until the next call
* to next, unext, snext, reset, or the enumerator's destructor.</p>
*
* <p>If the iterator is out of sync with its service, status is set
* to U_ENUM_OUT_OF_SYNC_ERROR and NULL is returned.</p>
*
* @param resultLength a pointer to receive the length, can be NULL.
* @param status the error code.
* @return a pointer to the string, or NULL.
*
* @draft ICU 2.4
*/
virtual const UChar* unext(int32_t *resultLength, UErrorCode& status) = 0;
/**
* <p>Returns the next element as a UnicodeString*. If there are no
* more elements, returns NULL.</p>
*
* <p>The returned pointer is owned by this iterator and must not be
* deleted by the caller. The pointer is valid until the next call
* to next, unext, snext, reset, or the enumerator's destructor.</p>
*
* <p>If the iterator is out of sync with its service, status is set
* to U_ENUM_OUT_OF_SYNC_ERROR and NULL is returned.</p>
*
* @param status the error code.
* @return a pointer to the string, or NULL.
*
* @draft ICU 2.4
*/
virtual const UnicodeString* snext(UErrorCode& status) = 0;
/**
* <p>Resets the iterator. This re-establishes sync with the
* service and rewinds the iterator to start at the first
* element.</p>
*
* <p>Previous pointers returned by next, unext, or snext become
* invalid, and the value returned by count might change.</p>
*
* @param status the error code.
*
* @draft ICU 2.4
*/
virtual void reset(UErrorCode& status) = 0;
};
/* Inline definition of the virtual destructor; the body is intentionally empty. */
inline StringEnumeration::~StringEnumeration() {}
U_NAMESPACE_END
/* STRENUM_H */
#endif
--- NEW FILE: ubrk.h ---
/*
* Copyright (C) 1996-2003, International Business Machines Corporation and others. All Rights Reserved.
*****************************************************************************************
*/
#ifndef UBRK_H
#define UBRK_H
#include "unicode/utypes.h"
/**
* A text-break iterator.
* For usage in C programs.
*/
#ifndef UBRK_TYPEDEF_UBREAK_ITERATOR
# define UBRK_TYPEDEF_UBREAK_ITERATOR
/**
* Opaque type representing an ICU Break iterator object.
* @stable ICU 2.0
*/
typedef void UBreakIterator;
#endif
#if !UCONFIG_NO_BREAK_ITERATION
#include "unicode/parseerr.h"
/**
* \file
* \brief C API: BreakIterator
*
* <h2> BreakIterator C API </h2>
*
* The BreakIterator C API defines methods for finding the location
* of boundaries in text. A pointer to a UBreakIterator maintains a
* current position and scans over text, returning the index of characters
* where boundaries occur.
* <P>
* Line boundary analysis determines where a text string can be broken
* when line-wrapping. The mechanism correctly handles punctuation and
* hyphenated words.
* <P>
* Sentence boundary analysis allows selection with correct
* interpretation of periods within numbers and abbreviations, and
* trailing punctuation marks such as quotation marks and parentheses.
* <P>
* Word boundary analysis is used by search and replace functions, as
* well as within text editing applications that allow the user to
* select words with a double click. Word selection provides correct
* interpretation of punctuation marks within and following
* words. Characters that are not part of a word, such as symbols or
* punctuation marks, have word-breaks on both sides.
* <P>
* Character boundary analysis allows users to interact with
* characters as they expect to, for example, when moving the cursor
* through a text string. Character boundary analysis provides correct
* navigation through character strings, regardless of how the
* character is stored. For example, an accented character might be
* stored as a base character and a diacritical mark. What users
* consider to be a character can differ between languages.
* <P>
* Title boundary analysis locates all positions,
* typically starts of words, that should be set to Title Case
* when title casing the text.
* <P>
*
* This is the interface for all text boundaries.
* <P>
* Examples:
* <P>
* Helper function to output text
* <pre>
* \code
* void printTextRange(UChar* str, int32_t start, int32_t end ) {
* UChar* result;
* UChar* temp;
* char* res;
* temp=(UChar*)malloc(sizeof(UChar) * ((u_strlen(str)-start)+1));
* result=(UChar*)malloc(sizeof(UChar) * ((end-start)+1));
* u_strcpy(temp, &str[start]);
* u_strncpy(result, temp, end-start);
* result[end-start]=0;
* res=(char*)malloc(sizeof(char) * (u_strlen(result)+1));
* u_austrcpy(res, result);
* printf("%s\n", res);
* free(res);
* free(result);
* free(temp);
* }
* \endcode
* </pre>
* Print each element in order:
* <pre>
* \code
* void printEachForward( UBreakIterator* boundary, UChar* str) {
* int32_t end;
* int32_t start = ubrk_first(boundary);
* for (end = ubrk_next(boundary); end != UBRK_DONE; start = end, end = ubrk_next(boundary)) {
* printTextRange(str, start, end );
* }
* }
* \endcode
* </pre>
* Print each element in reverse order:
* <pre>
* \code
* void printEachBackward( UBreakIterator* boundary, UChar* str) {
* int32_t start;
* int32_t end = ubrk_last(boundary);
* for (start = ubrk_previous(boundary); start != UBRK_DONE; end = start, start =ubrk_previous(boundary)) {
* printTextRange( str, start, end );
* }
* }
* \endcode
* </pre>
* Print first element
* <pre>
* \code
* void printFirst(UBreakIterator* boundary, UChar* str) {
* int32_t end;
* int32_t start = ubrk_first(boundary);
* end = ubrk_next(boundary);
* printTextRange( str, start, end );
* }
* \endcode
* </pre>
* Print last element
* <pre>
* \code
* void printLast(UBreakIterator* boundary, UChar* str) {
* int32_t start;
* int32_t end = ubrk_last(boundary);
* start = ubrk_previous(boundary);
* printTextRange(str, start, end );
* }
* \endcode
* </pre>
* Print the element at a specified position
* <pre>
* \code
* void printAt(UBreakIterator* boundary, int32_t pos , UChar* str) {
* int32_t start;
* int32_t end = ubrk_following(boundary, pos);
* start = ubrk_previous(boundary);
* printTextRange(str, start, end );
* }
* \endcode
* </pre>
* Creating and using text boundaries
* <pre>
* \code
* void BreakIterator_Example( void ) {
* UBreakIterator* boundary;
* UChar *stringToExamine;
* UErrorCode status = U_ZERO_ERROR;
* stringToExamine=(UChar*)malloc(sizeof(UChar) * (strlen("Aaa bbb ccc. Ddd eee fff.")+1) );
* u_uastrcpy(stringToExamine, "Aaa bbb ccc. Ddd eee fff.");
* printf("Examining: Aaa bbb ccc. Ddd eee fff.\n");
*
* //print each sentence in forward and reverse order
* boundary = ubrk_open(UBRK_SENTENCE, "en_us", stringToExamine, u_strlen(stringToExamine), &status);
* printf("----- forward: -----------\n");
* printEachForward(boundary, stringToExamine);
* printf("----- backward: ----------\n");
* printEachBackward(boundary, stringToExamine);
* ubrk_close(boundary);
*
* //print each word in order
* boundary = ubrk_open(UBRK_WORD, "en_us", stringToExamine, u_strlen(stringToExamine), &status);
* printf("----- forward: -----------\n");
* printEachForward(boundary, stringToExamine);
* printf("----- backward: ----------\n");
* printEachBackward(boundary, stringToExamine);
* //print first element
* printf("----- first: -------------\n");
* printFirst(boundary, stringToExamine);
* //print last element
* printf("----- last: --------------\n");
* printLast(boundary, stringToExamine);
* //print word at charpos 10
* printf("----- at pos 10: ---------\n");
* printAt(boundary, 10 , stringToExamine);
*
* ubrk_close(boundary);
* }
* \endcode
* </pre>
*/
/**
 * Kinds of text boundary that a break iterator can locate.
 * @stable ICU 2.0
 */
typedef enum UBreakIteratorType {
    /** Character breaks. @stable ICU 2.0 */
    UBRK_CHARACTER = 0,
    /** Word breaks. @stable ICU 2.0 */
    UBRK_WORD = 1,
    /** Line breaks. @stable ICU 2.0 */
    UBRK_LINE = 2,
    /** Sentence breaks. @stable ICU 2.0 */
    UBRK_SENTENCE = 3,
    /**
     * Title Case breaks.
     * An iterator created with this type locates title boundaries as
     * described for Unicode 3.2 only. For title boundary iteration under
     * Unicode 4.0 and above, use a word boundary iterator instead.
     * @draft ICU 2.2
     */
    UBRK_TITLE = 4
} UBreakIteratorType;
/**
 * Sentinel returned by the iteration functions once every text boundary
 * has been returned.
 * @stable ICU 2.0
 */
#define UBRK_DONE ((int32_t)(-1))
/**
 * Tag values for word breaks, as returned by getRuleStatus().
 *
 * Every category of word owns a range of values rather than a single value,
 * leaving room for finer subdivisions in future releases. Applications
 * should therefore test whether a tag falls inside a category's range
 * instead of comparing against one individual value. Each category's
 * *_LIMIT equals the lower bound of the next category, so the limit is
 * presumably exclusive: LOWER <= tag < LIMIT.
 * @draft ICU 2.2
 */
typedef enum UWordBreak {
    /** Lower bound for "words" that fit no other category;
     *  includes spaces and most punctuation. */
    UBRK_WORD_NONE         = 0,
    /** Upper bound for uncategorized words (100). */
    UBRK_WORD_NONE_LIMIT   = 100,

    /** Lower bound for words that appear to be numbers (100). */
    UBRK_WORD_NUMBER       = UBRK_WORD_NONE_LIMIT,
    /** Upper bound for words that appear to be numbers (200). */
    UBRK_WORD_NUMBER_LIMIT = 200,

    /** Lower bound for words that contain letters, excluding hiragana,
     *  katakana or ideographic characters (200). */
    UBRK_WORD_LETTER       = UBRK_WORD_NUMBER_LIMIT,
    /** Upper bound for words containing letters (300). */
    UBRK_WORD_LETTER_LIMIT = 300,

    /** Lower bound for words containing kana characters (300). */
    UBRK_WORD_KANA         = UBRK_WORD_LETTER_LIMIT,
    /** Upper bound for words containing kana characters (400). */
    UBRK_WORD_KANA_LIMIT   = 400,

    /** Lower bound for words containing ideographic characters (400). */
    UBRK_WORD_IDEO         = UBRK_WORD_KANA_LIMIT,
    /** Upper bound for words containing ideographic characters (500). */
    UBRK_WORD_IDEO_LIMIT   = 500
} UWordBreak;
/**
* Open a new UBreakIterator for locating text boundaries for a specified locale.
* A UBreakIterator may be used for detecting character, line, word,
* sentence, and title breaks in text.
* @param type The type of UBreakIterator to open: one of UBRK_CHARACTER, UBRK_WORD,
* UBRK_LINE, UBRK_SENTENCE, UBRK_TITLE
* @param locale The locale specifying the text-breaking conventions.
* @param text The text to be iterated over.
* @param textLength The number of characters in text, or -1 if null-terminated.
* @param status A UErrorCode to receive any errors.
* @return A UBreakIterator for the specified locale.
* @see ubrk_openRules
* @stable ICU 2.0
*/
U_CAPI UBreakIterator* U_EXPORT2
ubrk_open(UBreakIteratorType type,
const char *locale,
const UChar *text,
int32_t textLength,
UErrorCode *status);
/**
* Open a new UBreakIterator for locating text boundaries using specified breaking rules.
* The rule syntax is ... (TBD)
* @param rules A set of rules specifying the text breaking conventions.
* @param rulesLength The number of characters in rules, or -1 if null-terminated.
* @param text The text to be iterated over. May be null, in which case ubrk_setText() is
* used to specify the text to be iterated.
* @param textLength The number of characters in text, or -1 if null-terminated.
* @param parseErr Receives position and context information for any syntax errors
* detected while parsing the rules.
* @param status A UErrorCode to receive any errors.
* @return A UBreakIterator for the specified rules.
* @see ubrk_open
* @draft ICU 2.2
*/
U_CAPI UBreakIterator* U_EXPORT2
ubrk_openRules(const UChar *rules,
int32_t rulesLength,
const UChar *text,
int32_t textLength,
UParseError *parseErr,
UErrorCode *status);
/**
* Thread safe cloning operation
* @param bi iterator to be cloned
* @param stackBuffer user allocated space for the new clone. If NULL new memory will be allocated.
* If buffer is not large enough, new memory will be allocated.
* Clients can use the U_BRK_SAFECLONE_BUFFERSIZE. This will probably be enough to avoid memory allocations.
* @param pBufferSize pointer to size of allocated space.
* If *pBufferSize == 0, a sufficient size for use in cloning will
* be returned ('pre-flighting')
* If *pBufferSize is not enough for a stack-based safe clone,
* new memory will be allocated.
* @param status to indicate whether the operation went on smoothly or there were errors
* An informational status value, U_SAFECLONE_ALLOCATED_WARNING, is used if any allocations were necessary.
* @return pointer to the new clone
* @stable ICU 2.0
*/
U_CAPI UBreakIterator * U_EXPORT2
ubrk_safeClone(
const UBreakIterator *bi,
void *stackBuffer,
int32_t *pBufferSize,
UErrorCode *status);
/**
* A recommended size (in bytes) for the memory buffer to be passed to ubrk_safeClone().
* @stable ICU 2.0
*/
#define U_BRK_SAFECLONE_BUFFERSIZE 512
/**
* Close a UBreakIterator.
* Once closed, a UBreakIterator may no longer be used.
* @param bi The break iterator to close.
* @stable ICU 2.0
*/
U_CAPI void U_EXPORT2
ubrk_close(UBreakIterator *bi);
/**
* Sets an existing iterator to point to a new piece of text
* @param bi The iterator to use
* @param text The text to be set
* @param textLength The length of the text
* @param status The error code
* @stable ICU 2.0
*/
U_CAPI void U_EXPORT2
ubrk_setText(UBreakIterator* bi,
const UChar* text,
int32_t textLength,
UErrorCode* status);
/**
* Determine the most recently-returned text boundary.
*
* @param bi The break iterator to use.
* @return The character index most recently returned by \ref ubrk_next, \ref ubrk_previous,
* \ref ubrk_first, or \ref ubrk_last.
* @stable ICU 2.0
*/
U_CAPI int32_t U_EXPORT2
ubrk_current(const UBreakIterator *bi);
/**
* Determine the text boundary following the current text boundary.
*
* @param bi The break iterator to use.
* @return The character index of the next text boundary, or UBRK_DONE
* if all text boundaries have been returned.
* @see ubrk_previous
* @stable ICU 2.0
*/
U_CAPI int32_t U_EXPORT2
ubrk_next(UBreakIterator *bi);
/**
* Determine the text boundary preceding the current text boundary.
*
* @param bi The break iterator to use.
* @return The character index of the preceding text boundary, or UBRK_DONE
* if all text boundaries have been returned.
* @see ubrk_next
* @stable ICU 2.0
*/
U_CAPI int32_t U_EXPORT2
ubrk_previous(UBreakIterator *bi);
/**
* Determine the index of the first character in the text being scanned.
* This is not always the same as index 0 of the text.
* @param bi The break iterator to use.
* @return The character index of the first character in the text being scanned.
* @see ubrk_last
* @stable ICU 2.0
*/
U_CAPI int32_t U_EXPORT2
ubrk_first(UBreakIterator *bi);
/**
* Determine the index immediately <EM>beyond</EM> the last character in the text being
* scanned.
* This is not the same as the last character.
* @param bi The break iterator to use.
* @return The character offset immediately <EM>beyond</EM> the last character in the
* text being scanned.
* @see ubrk_first
* @stable ICU 2.0
*/
U_CAPI int32_t U_EXPORT2
ubrk_last(UBreakIterator *bi);
/**
* Determine the text boundary preceding the specified offset.
* The value returned is always smaller than offset, or UBRK_DONE.
* @param bi The break iterator to use.
* @param offset The offset to begin scanning.
* @return The text boundary preceding offset, or UBRK_DONE.
* @see ubrk_following
* @stable ICU 2.0
*/
U_CAPI int32_t U_EXPORT2
ubrk_preceding(UBreakIterator *bi,
int32_t offset);
/**
* Determine the text boundary following the specified offset.
* The value returned is always greater than offset, or UBRK_DONE.
* @param bi The break iterator to use.
* @param offset The offset to begin scanning.
* @return The text boundary following offset, or UBRK_DONE.
* @see ubrk_preceding
* @stable ICU 2.0
*/
U_CAPI int32_t U_EXPORT2
ubrk_following(UBreakIterator *bi,
int32_t offset);
/**
* Get a locale for which text breaking information is available.
* A UBreakIterator in a locale returned by this function will perform the correct
* text breaking for the locale.
* @param index The index of the desired locale.
* @return A locale for which text breaking information is available, or 0 if none.
* @see ubrk_countAvailable
* @stable ICU 2.0
*/
U_CAPI const char* U_EXPORT2
ubrk_getAvailable(int32_t index);
/**
* Determine how many locales have text breaking information available.
* This function is most useful for determining the loop ending condition for
* calls to \ref ubrk_getAvailable.
* @return The number of locales for which text breaking information is available.
* @see ubrk_getAvailable
* @stable ICU 2.0
*/
U_CAPI int32_t U_EXPORT2
ubrk_countAvailable(void);
/**
* Returns true if the specified position is a boundary position. As a side
* effect, leaves the iterator pointing to the first boundary position at
* or after "offset".
* @param bi The break iterator to use.
* @param offset the offset to check.
* @return True if "offset" is a boundary position.
* @stable ICU 2.0
*/
U_CAPI UBool U_EXPORT2
ubrk_isBoundary(UBreakIterator *bi, int32_t offset);
/**
* Return the status from the break rule that determined the most recently
* returned break position. The values appear in the rule source
* within brackets, {123}, for example. For rules that do not specify a
* status, a default value of 0 is returned.
* <p>
* For word break iterators, the possible values are defined in enum UWordBreak.
* @param bi The break iterator to use.
* @return The status of the rule that determined the most recently returned
* break position, or 0 if that rule does not specify a status.
* @draft ICU 2.2
*/
U_CAPI int32_t U_EXPORT2
ubrk_getRuleStatus(UBreakIterator *bi);
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
#endif
--- NEW FILE: ucat.h ---
/*
**********************************************************************
* Copyright (c) 2003, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Author: Alan Liu
* Created: March 19 2003
* Since: ICU 2.6
**********************************************************************
*/
#ifndef UCAT_H
#define UCAT_H
#include "unicode/utypes.h"
#include "unicode/ures.h"
/**
* \file
* \brief C API: Message Catalog Wrappers
*
* This C API provides look-alike functions that deliberately resemble
* the POSIX catopen, catclose, and catgets functions. The underlying
* implementation is in terms of ICU resource bundles, rather than
* POSIX message catalogs.
*
* The ICU resource bundles obey standard ICU inheritance policies.
* To facilitate this, sets and messages are flattened into one tier.
* This is done by creating resource bundle keys of the form
* <set_num>%<msg_num> where set_num is the set number and msg_num is
* the message number, formatted as decimal strings.
*
* Example: Consider a message catalog containing two sets:
*
* Set 1: Message 4 = "Good morning."
* Message 5 = "Good afternoon."
* Message 7 = "Good evening."
* Message 8 = "Good night."
* Set 4: Message 14 = "Please "
* Message 19 = "Thank you."
* Message 20 = "Sincerely,"
*
* The ICU resource bundle source file, assuming it is named
* "greet.txt", would look like this:
*
* greet
* {
* 1%4 { "Good morning." }
* 1%5 { "Good afternoon." }
* 1%7 { "Good evening." }
* 1%8 { "Good night." }
*
* 4%14 { "Please " }
* 4%19 { "Thank you." }
* 4%20 { "Sincerely," }
* }
*
* The catgets function is commonly used in combination with functions
* like printf and strftime. ICU components like message format can
* be used instead, although they use a different format syntax.
* There is an unsupported ICU package, ustdio, that provides some of
* the POSIX-style formatting API.
*/
U_CDECL_BEGIN
/**
* An ICU message catalog descriptor, analogous to nl_catd.
*
* @draft ICU 2.6
*/
typedef UResourceBundle* u_nl_catd;
/**
* Open and return an ICU message catalog descriptor. The descriptor
* may be passed to u_catgets() to retrieve localized strings.
*
* @param name string containing the full path pointing to the
* directory where the resources reside followed by the package name
* e.g. "/usr/resource/my_app/resources/guimessages" on a Unix system.
* If NULL, ICU default data files will be used.
*
* Unlike POSIX, environment variables are not interpolated within the
* name.
*
* @param locale the locale for which we want to open the resource. If
* NULL, the default ICU locale will be used (see uloc_getDefault). If
* strlen(locale) == 0, the root locale will be used.
*
* @param ec input/output error code. Upon output,
* U_USING_FALLBACK_WARNING indicates that a fallback locale was
* used. For example, 'de_CH' was requested, but nothing was found
* there, so 'de' was used. U_USING_DEFAULT_WARNING indicates that the
* default locale data or root locale data was used; neither the
* requested locale nor any of its fallback locales were found.
*
* @return a message catalog descriptor that may be passed to
* u_catgets(). If the ec parameter indicates success, then the caller
* is responsible for calling u_catclose() to close the message
* catalog. If the ec parameter indicates failure, then NULL will be
* returned.
*
* @draft ICU 2.6
*/
U_CAPI u_nl_catd U_EXPORT2
u_catopen(const char* name, const char* locale, UErrorCode* ec);
/**
* Close an ICU message catalog, given its descriptor.
*
* @param catd a message catalog descriptor to be closed. May be NULL,
* in which case no action is taken.
*
* @draft ICU 2.6
*/
U_CAPI void U_EXPORT2
u_catclose(u_nl_catd catd);
/**
* Retrieve a localized string from an ICU message catalog.
*
* @param catd a message catalog descriptor returned by u_catopen.
*
* @param set_num the message catalog set number. Sets need not be
* numbered consecutively.
*
* @param msg_num the message catalog message number within the
* set. Messages need not be numbered consecutively.
*
* @param s the default string. This is returned if the string
* specified by the set_num and msg_num is not found. It must be
* zero-terminated.
*
* @param len fill-in parameter to receive the length of the result.
* May be NULL, in which case it is ignored.
*
* @param ec input/output error code. May be U_USING_FALLBACK_WARNING
* or U_USING_DEFAULT_WARNING. U_MISSING_RESOURCE_ERROR indicates that
* the set_num/msg_num tuple does not specify a valid message string
* in this catalog.
*
* @return a pointer to a zero-terminated UChar array which lives in
* an internal buffer area, typically a memory mapped/DLL file. The
* caller must NOT delete this pointer. If the call is unsuccessful
* for any reason, then s is returned. This includes the situation in
* which ec indicates a failing error code upon entry to this
* function.
*
* @draft ICU 2.6
*/
U_CAPI const UChar* U_EXPORT2
u_catgets(u_nl_catd catd, int32_t set_num, int32_t msg_num,
const UChar* s,
int32_t* len, UErrorCode* ec);
U_CDECL_END
#endif /*UCAT_H*/
/*eof*/
--- NEW FILE: uconfig.h ---
/*
**********************************************************************
* Copyright (C) 2002-2003, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* file name: uconfig.h
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2002sep19
* created by: Markus W. Scherer
*/
#ifndef __UCONFIG_H__
#define __UCONFIG_H__
/*!
* \file
* \brief Switches for excluding parts of ICU library code modules.
*
* Allows building partial, smaller libraries for special purposes.
* By default, all modules are built.
* The switches are fairly coarse, controlling large modules.
* Basic services cannot be turned off.
*
* @draft ICU 2.4
*/
/**
 * \def UCONFIG_ONLY_COLLATION
 * When non-zero, this switch turns off the modules that collation does
 * not need.
 *
 * Legacy conversion is left enabled, because ICU needs it for the default
 * converter on EBCDIC platforms. For a collation-only build that does not
 * target EBCDIC, #define UCONFIG_NO_LEGACY_CONVERSION 1 in addition.
 *
 * @draft ICU 2.4
 */
#if !defined(UCONFIG_ONLY_COLLATION)
#   define UCONFIG_ONLY_COLLATION 0
#endif

#if UCONFIG_ONLY_COLLATION
    /* "only collation" together with "no collation" cannot be satisfied */
#   if UCONFIG_NO_COLLATION
#       error Contradictory collation switches in uconfig.h.
#   endif

    /* common library */
#   define UCONFIG_NO_BREAK_ITERATION 1
#   define UCONFIG_NO_IDNA 1

    /* i18n library */
#   define UCONFIG_NO_FORMATTING 1
#   define UCONFIG_NO_TRANSLITERATION 1
#   define UCONFIG_NO_REGULAR_EXPRESSIONS 1
#endif
/* common library switches -------------------------------------------------- */
/**
 * \def UCONFIG_NO_LEGACY_CONVERSION
 * When non-zero, this switch turns off every converter except
 * - Unicode charsets (UTF-7/8/16/32, CESU-8, SCSU, BOCU-1)
 * - US-ASCII
 * - ISO-8859-1
 *
 * Legacy conversion cannot be turned off on EBCDIC platforms,
 * because they need the ibm-37 or ibm-1047 default converters.
 *
 * @draft ICU 2.4
 */
#if !defined(UCONFIG_NO_LEGACY_CONVERSION)
#   define UCONFIG_NO_LEGACY_CONVERSION 0
#endif
/**
 * \def UCONFIG_NO_NORMALIZATION
 * When non-zero, this switch turns off normalization.
 * Doing so also turns off several services that depend on it,
 * for example collation and IDNA.
 *
 * @draft ICU 2.6
 */
#if !defined(UCONFIG_NO_NORMALIZATION)
#   define UCONFIG_NO_NORMALIZATION 0
#elif UCONFIG_NO_NORMALIZATION
    /* collation is switched off below, so "only collation" is a contradiction */
#   if UCONFIG_ONLY_COLLATION
#       error Contradictory collation switches in uconfig.h.
#   endif

    /* common library */
#   define UCONFIG_NO_IDNA 1

    /* i18n library */
#   define UCONFIG_NO_COLLATION 1
#   define UCONFIG_NO_TRANSLITERATION 1
#endif
/**
 * \def UCONFIG_NO_BREAK_ITERATION
 * When non-zero, this switch turns off break iteration.
 * Defaults to 0 unless defined beforehand.
 *
 * @draft ICU 2.4
 */
#if !defined(UCONFIG_NO_BREAK_ITERATION)
#   define UCONFIG_NO_BREAK_ITERATION 0
#endif
/**
 * \def UCONFIG_NO_IDNA
 * When non-zero, this switch turns off IDNA.
 * Defaults to 0 unless defined beforehand.
 *
 * @draft ICU 2.6
 */
#if !defined(UCONFIG_NO_IDNA)
#   define UCONFIG_NO_IDNA 0
#endif
/* i18n library switches ---------------------------------------------------- */
/**
* \def UCONFIG_NO_COLLATION
* This switch turns off collation and collation-based string search.
*
* @draft ICU 2.4
*/
#ifndef UCONFIG_NO_COLLATION
# define UCONFIG_NO_COLLATION 0
#endif
/**
 * \def UCONFIG_NO_FORMATTING
 * When non-zero, this switch turns off formatting and
 * calendar/timezone services.
 * Defaults to 0 unless defined beforehand.
 *
 * @draft ICU 2.4
 */
#if !defined(UCONFIG_NO_FORMATTING)
#   define UCONFIG_NO_FORMATTING 0
#endif
/**
 * \def UCONFIG_NO_TRANSLITERATION
 * When non-zero, this switch turns off transliteration.
 * Defaults to 0 unless defined beforehand.
 *
 * @draft ICU 2.4
 */
#if !defined(UCONFIG_NO_TRANSLITERATION)
#   define UCONFIG_NO_TRANSLITERATION 0
#endif
/**
* \def UCONFIG_NO_REGULAR_EXPRESSIONS
* This switch turns off regular expressions.
*
* @draft ICU 2.4
*/
#ifndef UCONFIG_NO_REGULAR_EXPRESSIONS
# define UCONFIG_NO_REGULAR_EXPRESSIONS 0
#endif
#endif
--- NEW FILE: uenum.h ---
/*
*******************************************************************************
*
* Copyright (C) 2002, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: uenum.h
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:2
*
* created on: 2002jul08
* created by: Vladimir Weinstein
*/
#ifndef __UENUM_H
#define __UENUM_H
#include "unicode/utypes.h"
/**
* An enumeration object.
* For usage in C programs.
* @draft ICU 2.2
*/
struct UEnumeration;
/** structure representing an enumeration object instance @draft ICU 2.2 */
typedef struct UEnumeration UEnumeration;
/**
* Disposes of resources in use by the iterator. If en is NULL,
* does nothing. After this call, any char* or UChar* pointer
* returned by uenum_unext() or uenum_next() is invalid.
* @param en UEnumeration structure pointer
* @draft ICU 2.2
*/
U_CAPI void U_EXPORT2
uenum_close(UEnumeration* en);
/**
* Returns the number of elements that the iterator traverses. If
* the iterator is out-of-sync with its service, status is set to
* U_ENUM_OUT_OF_SYNC_ERROR.
* This is a convenience function. It can end up being very
* expensive as all the items might have to be pre-fetched (depending
* on the type of data being traversed). Use with caution and only
* when necessary.
* @param en UEnumeration structure pointer
* @param status error code, can be U_ENUM_OUT_OF_SYNC_ERROR if the
* iterator is out of sync.
* @return number of elements in the iterator
* @draft ICU 2.2
*/
U_CAPI int32_t U_EXPORT2
uenum_count(UEnumeration* en, UErrorCode* status);
/**
* Returns the next element in the iterator's list. If there are
* no more elements, returns NULL. If the iterator is out-of-sync
* with its service, status is set to U_ENUM_OUT_OF_SYNC_ERROR and
* NULL is returned. If the native service string is a char* string,
* it is converted to UChar* with the invariant converter.
* The result is terminated by (UChar)0.
* @param en the iterator object
* @param resultLength pointer to receive the length of the result
* (not including the terminating \0).
* If the pointer is NULL it is ignored.
* @param status the error code, set to U_ENUM_OUT_OF_SYNC_ERROR if
* the iterator is out of sync with its service.
* @return a pointer to the string. The string will be
* zero-terminated. The return pointer is owned by this iterator
* and must not be deleted by the caller. The pointer is valid
* until the next call to any uenum_... method, including
* uenum_next() or uenum_unext(). When all strings have been
* traversed, returns NULL.
* @draft ICU 2.2
*/
U_CAPI const UChar* U_EXPORT2
uenum_unext(UEnumeration* en,
int32_t* resultLength,
UErrorCode* status);
/**
* Returns the next element in the iterator's list. If there are
* no more elements, returns NULL. If the iterator is out-of-sync
* with its service, status is set to U_ENUM_OUT_OF_SYNC_ERROR and
* NULL is returned. If the native service string is a UChar*
* string, it is converted to char* with the invariant converter.
* The result is terminated by (char)0. If the conversion fails
* (because a character cannot be converted) then status is set to
* U_INVARIANT_CONVERSION_ERROR and the return value is undefined
* (but non-NULL).
* @param en the iterator object
* @param resultLength pointer to receive the length of the result
* (not including the terminating \0).
* If the pointer is NULL it is ignored.
* @param status the error code, set to U_ENUM_OUT_OF_SYNC_ERROR if
* the iterator is out of sync with its service. Set to
* U_INVARIANT_CONVERSION_ERROR if the underlying native string is
* UChar* and conversion to char* with the invariant converter
* fails. This error pertains only to current string, so iteration
* might be able to continue successfully.
* @return a pointer to the string. The string will be
* zero-terminated. The return pointer is owned by this iterator
* and must not be deleted by the caller. The pointer is valid
* until the next call to any uenum_... method, including
* uenum_next() or uenum_unext(). When all strings have been
* traversed, returns NULL.
* @draft ICU 2.2
*/
U_CAPI const char* U_EXPORT2
uenum_next(UEnumeration* en,
int32_t* resultLength,
UErrorCode* status);
/**
* Resets the iterator to the current list of service IDs. This
* re-establishes sync with the service and rewinds the iterator
* to start at the first element.
* @param en the iterator object
* @param status the error code, set to U_ENUM_OUT_OF_SYNC_ERROR if
* the iterator is out of sync with its service.
* @draft ICU 2.2
*/
U_CAPI void U_EXPORT2
uenum_reset(UEnumeration* en, UErrorCode* status);
#endif
--- NEW FILE: uidna.h ---
/*
*******************************************************************************
*
* Copyright (C) 2003, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: uidna.h
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2003feb1
* created by: Ram Viswanadha
*/
#ifndef __UIDNA_H__
#define __UIDNA_H__
#include "unicode/utypes.h"
#if !UCONFIG_NO_IDNA
#include "unicode/parseerr.h"
/**
*\file
* UIDNA API implements the IDNA protocol as defined in the IDNA RFC
* (http://www.ietf.org/rfc/rfc3490.txt).
* The RFC defines 2 operations: ToASCII and ToUnicode. Domain labels
* containing non-ASCII code points are required to be processed by
* ToASCII operation before passing it to resolver libraries. Domain names
* that are obtained from resolver libraries are required to be processed by
* ToUnicode operation before displaying the domain name to the user.
* IDNA requires that implementations process input strings with Nameprep
* (http://www.ietf.org/rfc/rfc3491.txt),
* which is a profile of Stringprep (http://www.ietf.org/rfc/rfc3454.txt),
* and then with Punycode (http://www.ietf.org/rfc/rfc3492.txt).
* Implementations of IDNA MUST fully implement Nameprep and Punycode;
* neither Nameprep nor Punycode are optional.
* The input and output of ToASCII and ToUnicode operations are Unicode
* and are designed to be chainable, i.e., applying ToASCII or ToUnicode operations
* multiple times to an input string will yield the same result as applying the operation
* once.
* ToUnicode(ToUnicode(ToUnicode...(ToUnicode(string)))) == ToUnicode(string)
* ToASCII(ToASCII(ToASCII...(ToASCII(string)))) == ToASCII(string).
*\end_file
*/
/**
* Default option: prohibits processing of unassigned code points in the input
* and does not check whether the input conforms to STD-3 ASCII rules.
*
* @see uidna_toASCII uidna_toUnicode
* @draft ICU 2.6
*/
#define UIDNA_DEFAULT 0x0000
/**
* Option to allow processing of unassigned codepoints in the input
*
* @see uidna_toASCII uidna_toUnicode
* @draft ICU 2.6
*/
#define UIDNA_ALLOW_UNASSIGNED 0x0001
/**
* Option to check if input conforms to STD-3 ASCII rules
*
* @see uidna_toASCII uidna_toUnicode
* @draft ICU 2.6
*/
#define UIDNA_USE_STD3_RULES 0x0002
/**
* This function implements the ToASCII operation as defined in the IDNA RFC.
* This operation is done on <b>single labels</b> before sending them to something that expects
* ASCII names. A label is an individual part of a domain name. Labels are usually
* separated by dots; e.g., "www.example.com" is composed of 3 labels:
* "www", "example", and "com".
*
*
* @param src Input UChar array containing label in Unicode.
* @param srcLength Number of UChars in src, or -1 if NUL-terminated.
* @param dest Output UChar array with ASCII (ACE encoded) label.
* @param destCapacity Size of dest.
* @param options A bit set of options:
*
* - UIDNA_DEFAULT Use default options, i.e., do not process unassigned code points
* and do not use STD3 ASCII rules.
* If unassigned code points are found the operation fails with
* U_UNASSIGNED_CODE_POINT_FOUND error code.
*
* - UIDNA_ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations.
* If this option is set, the unassigned code points in the input
* are treated as normal Unicode code points.
*
* - UIDNA_USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions.
* If this option is set and the input does not satisfy STD3 rules,
* the operation will fail with U_IDNA_STD3_ASCII_RULES_ERROR.
*
* @param parseError Pointer to UParseError struct to receive information on position
* of error if an error is encountered. Can be NULL.
* @param status ICU in/out error code parameter.
* U_INVALID_CHAR_FOUND if src contains
* unmatched single surrogates.
* U_INDEX_OUTOFBOUNDS_ERROR if src contains
* too many code points.
* U_BUFFER_OVERFLOW_ERROR if destCapacity is not enough.
* @return Number of ASCII characters converted.
* @draft ICU 2.6
*/
U_CAPI int32_t U_EXPORT2
uidna_toASCII(const UChar* src, int32_t srcLength,
UChar* dest, int32_t destCapacity,
int32_t options,
UParseError* parseError,
UErrorCode* status);
/**
* This function implements the ToUnicode operation as defined in the IDNA RFC.
* This operation is done on <b>single labels</b> before sending them to something that expects
* Unicode names. A label is an individual part of a domain name. Labels are usually
* separated by dots; e.g., "www.example.com" is composed of 3 labels:
* "www", "example", and "com".
*
* @param src Input UChar array containing ASCII (ACE encoded) label.
* @param srcLength Number of UChars in src, or -1 if NUL-terminated.
* @param dest Output Converted UChar array containing Unicode equivalent of label.
* @param destCapacity Size of dest.
* @param options A bit set of options:
*
* - UIDNA_DEFAULT Use default options, i.e., do not process unassigned code points
* and do not use STD3 ASCII rules.
* If unassigned code points are found the operation fails with
* U_UNASSIGNED_CODE_POINT_FOUND error code.
*
* - UIDNA_ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations.
* If this option is set, the unassigned code points in the input
* are treated as normal Unicode code points. <b> Note: </b> This option is
* required on toUnicode operation because the RFC mandates
* verification of decoded ACE input by applying toASCII and comparing
* its output with source.
*
*
*
* - UIDNA_USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions.
* If this option is set and the input does not satisfy STD3 rules,
* the operation will fail with U_IDNA_STD3_ASCII_RULES_ERROR.
*
* @param parseError Pointer to UParseError struct to receive information on position
* of error if an error is encountered. Can be NULL.
* @param status ICU in/out error code parameter.
* U_INVALID_CHAR_FOUND if src contains
* unmatched single surrogates.
* U_INDEX_OUTOFBOUNDS_ERROR if src contains
* too many code points.
* U_BUFFER_OVERFLOW_ERROR if destCapacity is not enough.
* @return Number of Unicode characters converted.
* @draft ICU 2.6
*/
U_CAPI int32_t U_EXPORT2
uidna_toUnicode(const UChar* src, int32_t srcLength,
UChar* dest, int32_t destCapacity,
int32_t options,
UParseError* parseError,
UErrorCode* status);
/**
* Convenience function that implements the IDNToASCII operation as defined in the IDNA RFC.
* This operation is done on complete domain names, e.g., "www.example.com".
* It is important to note that this operation can fail. If it fails, then the input
* domain name cannot be used as an Internationalized Domain Name and the application
* should have methods defined to deal with the failure.
*
* <b>Note:</b> IDNA RFC specifies that a conformant application should divide a domain name
* into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each,
* and then convert. This function does not offer that level of granularity. The options once
* set will apply to all labels in the domain name.
*
* @param src Input UChar array containing IDN in Unicode.
* @param srcLength Number of UChars in src, or -1 if NUL-terminated.
* @param dest Output UChar array with ASCII (ACE encoded) IDN.
* @param destCapacity Size of dest.
* @param options A bit set of options:
*
* - UIDNA_DEFAULT Use default options, i.e., do not process unassigned code points
* and do not use STD3 ASCII rules.
* If unassigned code points are found the operation fails with
* U_UNASSIGNED_CODE_POINT_FOUND error code.
*
* - UIDNA_ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations.
* If this option is set, the unassigned code points in the input
* are treated as normal Unicode code points.
*
* - UIDNA_USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions.
* If this option is set and the input does not satisfy STD3 rules,
* the operation will fail with U_IDNA_STD3_ASCII_RULES_ERROR.
*
* @param parseError Pointer to UParseError struct to receive information on position
* of error if an error is encountered. Can be NULL.
* @param status ICU in/out error code parameter.
* U_INVALID_CHAR_FOUND if src contains
* unmatched single surrogates.
* U_INDEX_OUTOFBOUNDS_ERROR if src contains
* too many code points.
* U_BUFFER_OVERFLOW_ERROR if destCapacity is not enough.
* @return Number of ASCII characters converted.
* @draft ICU 2.6
*/
U_CAPI int32_t U_EXPORT2
uidna_IDNToASCII( const UChar* src, int32_t srcLength,
UChar* dest, int32_t destCapacity,
int32_t options,
UParseError* parseError,
UErrorCode* status);
/**
* Convenience function that implements the IDNToUnicode operation as defined in the IDNA RFC.
* This operation is done on complete domain names, e.g., "www.example.com".
*
* <b>Note:</b> IDNA RFC specifies that a conformant application should divide a domain name
* into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each,
* and then convert. This function does not offer that level of granularity. The options once
* set will apply to all labels in the domain name.
*
* @param src Input UChar array containing IDN in ASCII (ACE encoded) form.
* @param srcLength Number of UChars in src, or -1 if NUL-terminated.
* @param dest Output UChar array containing Unicode equivalent of source IDN.
* @param destCapacity Size of dest.
* @param options A bit set of options:
*
* - UIDNA_DEFAULT Use default options, i.e., do not process unassigned code points
* and do not use STD3 ASCII rules.
* If unassigned code points are found the operation fails with
* U_UNASSIGNED_CODE_POINT_FOUND error code.
*
* - UIDNA_ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations.
* If this option is set, the unassigned code points in the input
* are treated as normal Unicode code points.
*
* - UIDNA_USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions.
* If this option is set and the input does not satisfy STD3 rules,
* the operation will fail with U_IDNA_STD3_ASCII_RULES_ERROR.
*
* @param parseError Pointer to UParseError struct to receive information on position
* of error if an error is encountered. Can be NULL.
* @param status ICU in/out error code parameter.
* U_INVALID_CHAR_FOUND if src contains
* unmatched single surrogates.
* U_INDEX_OUTOFBOUNDS_ERROR if src contains
* too many code points.
* U_BUFFER_OVERFLOW_ERROR if destCapacity is not enough.
* @return Number of Unicode characters converted.
* @draft ICU 2.6
*/
U_CAPI int32_t U_EXPORT2
uidna_IDNToUnicode( const UChar* src, int32_t srcLength,
UChar* dest, int32_t destCapacity,
int32_t options,
UParseError* parseError,
UErrorCode* status);
/**
* Compare two IDN strings for equivalence.
* This function splits the domain names into labels and compares them.
* According to IDN RFC, whenever two labels are compared, they are
* considered equal if and only if their ASCII forms (obtained by
* applying toASCII) match using a case-insensitive ASCII comparison.
* Two domain names are considered a match if and only if all labels
* match regardless of whether label separators match.
*
* @param s1 First source string.
* @param length1 Length of first source string, or -1 if NUL-terminated.
*
* @param s2 Second source string.
* @param length2 Length of second source string, or -1 if NUL-terminated.
* @param options A bit set of options:
*
* - UIDNA_DEFAULT Use default options, i.e., do not process unassigned code points
* and do not use STD3 ASCII rules.
* If unassigned code points are found the operation fails with
* U_UNASSIGNED_CODE_POINT_FOUND error code.
*
* - UIDNA_ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations.
* If this option is set, the unassigned code points in the input
* are treated as normal Unicode code points.
*
* - UIDNA_USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions.
* If this option is set and the input does not satisfy STD3 rules,
* the operation will fail with U_IDNA_STD3_ASCII_RULES_ERROR.
*
* @param status ICU error code in/out parameter.
* Must fulfill U_SUCCESS before the function call.
* @return <0 or 0 or >0 as usual for string comparisons
* @draft ICU 2.6
*/
U_CAPI int32_t U_EXPORT2
uidna_compare( const UChar *s1, int32_t length1,
const UChar *s2, int32_t length2,
int32_t options,
UErrorCode* status);
#endif /* #if !UCONFIG_NO_IDNA */
#endif
--- NEW FILE: uiter.h ---
/*
*******************************************************************************
*
* Copyright (C) 2002-2003, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: uiter.h
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2002jan18
* created by: Markus W. Scherer
*/
#ifndef __UITER_H__
#define __UITER_H__
/**
* \file
* \brief C API: Unicode Character Iteration
*
* @see UCharIterator
*/
#include "unicode/utypes.h"
#ifdef XP_CPLUSPLUS
U_NAMESPACE_BEGIN
class CharacterIterator;
class Replaceable;
U_NAMESPACE_END
#endif
U_CDECL_BEGIN
struct UCharIterator;
typedef struct UCharIterator UCharIterator; /**< C typedef for struct UCharIterator. @stable ICU 2.1 */
/**
* Origin constants for UCharIterator.getIndex() and UCharIterator.move().
* Each constant names the reference point for the requested index or for a
* relative move: the start (UITER_START), current (UITER_CURRENT) or
* limit (UITER_LIMIT) index, index 0 (UITER_ZERO), or the length (UITER_LENGTH).
* @see UCharIteratorGetIndex
* @see UCharIteratorMove
* @see UCharIterator
* @stable ICU 2.1
*/
typedef enum UCharIteratorOrigin {
UITER_START, UITER_CURRENT, UITER_LIMIT, UITER_ZERO, UITER_LENGTH
} UCharIteratorOrigin;
/** Constants for UCharIterator. @draft ICU 2.6 */
enum {
/**
* Constant value that may be returned by UCharIteratorMove
* indicating that the final UTF-16 index is not known, but that the move succeeded.
* This can occur when moving relative to limit or length, or
* when moving relative to the current index after a setState()
* when the current UTF-16 index is not known.
*
* It would be very inefficient to have to count from the beginning of the text
* just to get the current/limit/length index after moving relative to it.
* The actual index can be determined with getIndex(UITER_CURRENT)
* which will count the UChars if necessary.
*
* @draft ICU 2.6
*/
UITER_UNKNOWN_INDEX=-2
};
/**
* Constant for UCharIterator getState() indicating an error or
* an unknown state.
* Returned by uiter_getState()/UCharIteratorGetState
* when an error occurs.
* Also, some UCharIterator implementations may not be able to return
* a valid state for each position. This will be clearly documented
* for each such iterator (none of the public ones here).
*
* @draft ICU 2.6
*/
#define UITER_NO_STATE ((uint32_t)0xffffffff)
/**
* Function type declaration for UCharIterator.getIndex().
*
* Gets the current position, or the start or limit of the
* iteration range.
*
* This function may perform slowly for UITER_CURRENT after setState() was called,
* or for UITER_LENGTH, because an iterator implementation may have to count
* UChars if the underlying storage is not UTF-16.
*
* @param iter the UCharIterator structure ("this pointer")
* @param origin get the 0, start, limit, length, or current index
* @return the requested index, or U_SENTINEL in an error condition
*
* @see UCharIteratorOrigin
* @see UCharIterator
* @stable ICU 2.1
*/
typedef int32_t U_CALLCONV
UCharIteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin);
/**
* Function type declaration for UCharIterator.move().
*
* Use iter->move(iter, index, UITER_ZERO) like CharacterIterator::setIndex(index).
*
* Moves the current position relative to the start or limit of the
* iteration range, or relative to the current position itself.
* The movement is expressed in numbers of code units forward
* or backward by specifying a positive or negative delta.
* Out of bounds movement will be pinned to the start or limit.
*
* This function may perform slowly for moving relative to UITER_LENGTH
* because an iterator implementation may have to count the rest of the
* UChars if the native storage is not UTF-16.
*
* When moving relative to the limit or length, or
* relative to the current position after setState() was called,
* move() may return UITER_UNKNOWN_INDEX (-2) to avoid an inefficient
* determination of the actual UTF-16 index.
* The actual index can be determined with getIndex(UITER_CURRENT)
* which will count the UChars if necessary.
* See UITER_UNKNOWN_INDEX for details.
*
* @param iter the UCharIterator structure ("this pointer")
* @param delta can be positive, zero, or negative
* @param origin move relative to the 0, start, limit, length, or current index
* @return the new index, or U_SENTINEL on an error condition,
* or UITER_UNKNOWN_INDEX when the index is not known.
*
* @see UCharIteratorOrigin
* @see UCharIterator
* @see UITER_UNKNOWN_INDEX
* @stable ICU 2.1
*/
typedef int32_t U_CALLCONV
UCharIteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin);
/**
* Function type declaration for UCharIterator.hasNext().
*
* Check if current() and next() can still
* return another code unit.
*
* @param iter the UCharIterator structure ("this pointer")
* @return boolean value for whether current() and next() can still return another code unit
*
* @see UCharIterator
* @stable ICU 2.1
*/
typedef UBool U_CALLCONV
UCharIteratorHasNext(UCharIterator *iter);
/**
* Function type declaration for UCharIterator.hasPrevious().
*
* Check if previous() can still return another code unit.
*
* @param iter the UCharIterator structure ("this pointer")
* @return boolean value for whether previous() can still return another code unit
*
* @see UCharIterator
* @stable ICU 2.1
*/
typedef UBool U_CALLCONV
UCharIteratorHasPrevious(UCharIterator *iter);
/**
* Function type declaration for UCharIterator.current().
*
* Return the code unit at the current position,
* or U_SENTINEL if there is none (index is at the limit).
*
* @param iter the UCharIterator structure ("this pointer")
* @return the current code unit
*
* @see UCharIterator
* @stable ICU 2.1
*/
typedef UChar32 U_CALLCONV
UCharIteratorCurrent(UCharIterator *iter);
/**
* Function type declaration for UCharIterator.next().
*
* Return the code unit at the current index and increment
* the index (post-increment, like s[i++]),
* or return U_SENTINEL if there is none (index is at the limit).
*
* @param iter the UCharIterator structure ("this pointer")
* @return the current code unit (and post-increment the current index)
*
* @see UCharIterator
* @stable ICU 2.1
*/
typedef UChar32 U_CALLCONV
UCharIteratorNext(UCharIterator *iter);
/**
* Function type declaration for UCharIterator.previous().
*
* Decrement the index and return the code unit from there
* (pre-decrement, like s[--i]),
* or return U_SENTINEL if there is none (index is at the start).
*
* @param iter the UCharIterator structure ("this pointer")
* @return the previous code unit (after pre-decrementing the current index)
*
* @see UCharIterator
* @stable ICU 2.1
*/
typedef UChar32 U_CALLCONV
UCharIteratorPrevious(UCharIterator *iter);
/**
* Function type declaration for UCharIterator.reservedFn().
* Reserved for future use.
*
* @param iter the UCharIterator structure ("this pointer")
* @param something some integer argument
* @return some integer
*
* @see UCharIterator
* @stable ICU 2.1
*/
typedef int32_t U_CALLCONV
UCharIteratorReserved(UCharIterator *iter, int32_t something);
/**
* Function type declaration for UCharIterator.getState().
*
* Get the "state" of the iterator in the form of a single 32-bit word.
* It is recommended that the state value be calculated to be as small as
* is feasible. For strings with limited lengths, fewer than 32 bits may
* be sufficient.
*
* This is used together with setState()/UCharIteratorSetState
* to save and restore the iterator position more efficiently than with
* getIndex()/move().
*
* With some UCharIterator implementations (e.g., UTF-8),
* getting and setting the UTF-16 index with existing functions
* (getIndex(UITER_CURRENT) followed by move(pos, UITER_ZERO)) is possible but
* relatively slow because the iterator has to "walk" from a known index
* to the requested one.
* This takes more time the farther it needs to go.
*
* An opaque state value allows an iterator implementation to provide
* an internal index (UTF-8: the source byte array index) for
* fast, constant-time restoration.
*
* After calling setState(), a getIndex(UITER_CURRENT) may be slow because
* the UTF-16 index may not be restored as well, but the iterator can deliver
* the correct text contents and move relative to the current position
* without performance degradation.
*
* Some UCharIterator implementations may not be able to return
* a valid state for each position, in which case they return UITER_NO_STATE instead.
* This will be clearly documented for each such iterator (none of the public ones here).
*
* @param iter the UCharIterator structure ("this pointer")
* @return the state word
*
* @see UCharIterator
* @see UCharIteratorSetState
* @see UITER_NO_STATE
* @draft ICU 2.6
*/
typedef uint32_t U_CALLCONV
UCharIteratorGetState(const UCharIterator *iter);
/**
* Function type declaration for UCharIterator.setState().
*
* Restore the "state" of the iterator using a state word from a getState() call.
* The iterator object need not be the same one as for which getState() was called,
* but it must be of the same type (set up using the same uiter_setXYZ function)
* and it must iterate over the same string
* (binary identical regardless of memory address).
* For more about the state word see UCharIteratorGetState.
*
* After calling setState(), a getIndex(UITER_CURRENT) may be slow because
* the UTF-16 index may not be restored as well, but the iterator can deliver
* the correct text contents and move relative to the current position
* without performance degradation.
*
* @param iter the UCharIterator structure ("this pointer")
* @param state the state word from a getState() call
* on a same-type, same-string iterator
* @param pErrorCode Must be a valid pointer to an error code value,
* which must not indicate a failure before the function call.
*
* @see UCharIterator
* @see UCharIteratorGetState
* @draft ICU 2.6
*/
typedef void U_CALLCONV
UCharIteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode);
/**
* C API for code unit iteration.
* This can be used as a C wrapper around
* CharacterIterator, Replaceable, or implemented using simple strings, etc.
*
* There are two roles for using UCharIterator:
*
* A "provider" sets the necessary function pointers and controls the "protected"
* fields of the UCharIterator structure. A "provider" passes a UCharIterator
* into C APIs that need a UCharIterator as an abstract, flexible string interface.
*
* Implementations of such C APIs are "callers" of UCharIterator functions;
* they only use the "public" function pointers and never access the "protected"
* fields directly.
*
* UCharIterator functions return code unit values 0..0xffff,
* or U_SENTINEL if the iteration bounds are reached.
*
* @stable ICU 2.1
*/
struct UCharIterator {
/**
* (protected) Pointer to string or wrapped object or similar.
* Not used by caller.
* @stable ICU 2.1
*/
const void *context;
/**
* (protected) Length of string or similar.
* Not used by caller.
* @stable ICU 2.1
*/
int32_t length;
/**
* (protected) Start index or similar.
* Not used by caller.
* @stable ICU 2.1
*/
int32_t start;
/**
* (protected) Current index or similar.
* Not used by caller.
* @stable ICU 2.1
*/
int32_t index;
/**
* (protected) Limit index or similar.
* Not used by caller.
* @stable ICU 2.1
*/
int32_t limit;
/**
* (protected) Used by UTF-8 iterators and possibly others.
* @stable ICU 2.1
*/
int32_t reservedField;
/**
* (public) Returns the current position or the
* start or limit index of the iteration range.
*
* @see UCharIteratorGetIndex
* @stable ICU 2.1
*/
UCharIteratorGetIndex *getIndex;
/**
* (public) Moves the current position relative to the start or limit of the
* iteration range, or relative to the current position itself.
* The movement is expressed in numbers of code units forward
* or backward by specifying a positive or negative delta.
*
* @see UCharIteratorMove
* @stable ICU 2.1
*/
UCharIteratorMove *move;
/**
* (public) Check if current() and next() can still
* return another code unit.
*
* @see UCharIteratorHasNext
* @stable ICU 2.1
*/
UCharIteratorHasNext *hasNext;
/**
* (public) Check if previous() can still return another code unit.
*
* @see UCharIteratorHasPrevious
* @stable ICU 2.1
*/
UCharIteratorHasPrevious *hasPrevious;
/**
* (public) Return the code unit at the current position,
* or U_SENTINEL if there is none (index is at the limit).
*
* @see UCharIteratorCurrent
* @stable ICU 2.1
*/
UCharIteratorCurrent *current;
/**
* (public) Return the code unit at the current index and increment
* the index (post-increment, like s[i++]),
* or return U_SENTINEL if there is none (index is at the limit).
*
* @see UCharIteratorNext
* @stable ICU 2.1
*/
UCharIteratorNext *next;
/**
* (public) Decrement the index and return the code unit from there
* (pre-decrement, like s[--i]),
* or return U_SENTINEL if there is none (index is at the start).
*
* @see UCharIteratorPrevious
* @stable ICU 2.1
*/
UCharIteratorPrevious *previous;
/**
* (public) Reserved for future use. Currently NULL.
*
* @see UCharIteratorReserved
* @stable ICU 2.1
*/
UCharIteratorReserved *reservedFn;
/**
* (public) Return the state of the iterator, to be restored later with setState().
* This function pointer is NULL if the iterator does not implement it.
*
* @see UCharIteratorGetState
* @draft ICU 2.6
*/
UCharIteratorGetState *getState;
/**
* (public) Restore the iterator state from the state word from a call
* to getState().
* This function pointer is NULL if the iterator does not implement it.
*
* @see UCharIteratorSetState
* @draft ICU 2.6
*/
UCharIteratorSetState *setState;
};
/**
* Helper function for UCharIterator to get the code point
* at the current index.
*
* Return the code point that includes the code unit at the current position,
* or U_SENTINEL if there is none (index is at the limit).
* If the current code unit is a lead or trail surrogate,
* then the following or preceding surrogate is used to form
* the code point value.
*
* @param iter the UCharIterator structure ("this pointer")
* @return the current code point
*
* @see UCharIterator
* @see U16_GET
* @see UnicodeString::char32At()
* @stable ICU 2.1
*/
U_CAPI UChar32 U_EXPORT2
uiter_current32(UCharIterator *iter);
/**
* Helper function for UCharIterator to get the next code point.
*
* Return the code point at the current index and increment
* the index (post-increment, like s[i++]),
* or return U_SENTINEL if there is none (index is at the limit).
*
* @param iter the UCharIterator structure ("this pointer")
* @return the current code point (and post-increment the current index)
*
* @see UCharIterator
* @see U16_NEXT
* @stable ICU 2.1
*/
U_CAPI UChar32 U_EXPORT2
uiter_next32(UCharIterator *iter);
/**
* Helper function for UCharIterator to get the previous code point.
*
* Decrement the index and return the code point from there
* (pre-decrement, like s[--i]),
* or return U_SENTINEL if there is none (index is at the start).
*
* @param iter the UCharIterator structure ("this pointer")
* @return the previous code point (after pre-decrementing the current index)
*
* @see UCharIterator
* @see U16_PREV
* @stable ICU 2.1
*/
U_CAPI UChar32 U_EXPORT2
uiter_previous32(UCharIterator *iter);
/**
* Get the "state" of the iterator in the form of a single 32-bit word.
* This is a convenience function that calls iter->getState(iter)
* if iter->getState is not NULL;
* if it is NULL or any other error occurs, then UITER_NO_STATE is returned.
*
* Some UCharIterator implementations may not be able to return
* a valid state for each position, in which case they return UITER_NO_STATE instead.
* This will be clearly documented for each such iterator (none of the public ones here).
*
* @param iter the UCharIterator structure ("this pointer")
* @return the state word
*
* @see UCharIterator
* @see UCharIteratorGetState
* @see UITER_NO_STATE
* @draft ICU 2.6
*/
U_CAPI uint32_t U_EXPORT2
uiter_getState(const UCharIterator *iter);
/**
* Restore the "state" of the iterator using a state word from a getState() call.
* This is a convenience function that calls iter->setState(iter, state, pErrorCode)
* if iter->setState is not NULL; if it is NULL, then U_UNSUPPORTED_ERROR is set.
*
* @param iter the UCharIterator structure ("this pointer")
* @param state the state word from a getState() call
* on a same-type, same-string iterator
* @param pErrorCode Must be a valid pointer to an error code value,
* which must not indicate a failure before the function call.
*
* @see UCharIterator
* @see UCharIteratorSetState
* @draft ICU 2.6
*/
U_CAPI void U_EXPORT2
uiter_setState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode);
/**
* Set up a UCharIterator to iterate over a string.
*
* Sets the UCharIterator function pointers for iteration over the string s
* with iteration boundaries start=index=0 and length=limit=string length.
* The "provider" may set the start, index, and limit values at any time
* within the range 0..length.
* The length field will be ignored.
*
* The string pointer s is set into UCharIterator.context without copying
* or reallocating the string contents.
*
* getState() simply returns the current index.
* move() will always return the final index.
*
* @param iter UCharIterator structure to be set for iteration
* @param s String to iterate over
* @param length Length of s, or -1 if NUL-terminated
*
* @see UCharIterator
* @stable ICU 2.1
*/
U_CAPI void U_EXPORT2
uiter_setString(UCharIterator *iter, const UChar *s, int32_t length);
/**
* Set up a UCharIterator to iterate over a UTF-16BE string
* (byte vector with a big-endian pair of bytes per UChar).
*
* Everything works just like with a normal UChar iterator (uiter_setString),
* except that UChars are assembled from byte pairs,
* and that the length argument here indicates an even number of bytes.
*
* getState() simply returns the current index.
* move() will always return the final index.
*
* @param iter UCharIterator structure to be set for iteration
* @param s UTF-16BE string to iterate over
* @param length Length of s as an even number of bytes, or -1 if NUL-terminated
* (NUL means pair of 0 bytes at even index from s)
*
* @see UCharIterator
* @see uiter_setString
* @draft ICU 2.6
*/
U_CAPI void U_EXPORT2
uiter_setUTF16BE(UCharIterator *iter, const char *s, int32_t length);
/**
* Set up a UCharIterator to iterate over a UTF-8 string.
*
* Sets the UCharIterator function pointers for iteration over the UTF-8 string s
* with UTF-8 iteration boundaries 0 and length.
* The implementation counts the UTF-16 index on the fly and
* lazily evaluates the UTF-16 length of the text.
*
* The start field is used as the UTF-8 offset, the limit field as the UTF-8 length.
* When the reservedField is not 0, then it contains a supplementary code point
* and the UTF-16 index is between the two corresponding surrogates.
* At that point, the UTF-8 index is behind that code point.
*
* The UTF-8 string pointer s is set into UCharIterator.context without copying
* or reallocating the string contents.
*
* getState() returns a state value consisting of
* - the current UTF-8 source byte index (bits 31..1)
* - a flag (bit 0) that indicates whether the UChar position is in the middle
* of a surrogate pair
* (from a 4-byte UTF-8 sequence for the corresponding supplementary code point)
*
* getState() cannot also encode the UTF-16 index in the state value.
* move(relative to limit or length), or
* move(relative to current) after setState(), may return UITER_UNKNOWN_INDEX.
*
* @param iter UCharIterator structure to be set for iteration
* @param s UTF-8 string to iterate over
* @param length Length of s in bytes, or -1 if NUL-terminated
*
* @see UCharIterator
* @draft ICU 2.6
*/
U_CAPI void U_EXPORT2
uiter_setUTF8(UCharIterator *iter, const char *s, int32_t length);
#ifdef XP_CPLUSPLUS
/**
* Set up a UCharIterator to wrap around a C++ CharacterIterator.
*
* Sets the UCharIterator function pointers for iteration using the
* CharacterIterator charIter.
*
* The CharacterIterator pointer charIter is set into UCharIterator.context
* without copying or cloning the CharacterIterator object.
* The other "protected" UCharIterator fields are set to 0 and will be ignored.
* The iteration index and boundaries are controlled by the CharacterIterator.
*
* getState() simply returns the current index.
* move() will always return the final index.
*
* @param iter UCharIterator structure to be set for iteration
* @param charIter CharacterIterator to wrap
*
* @see UCharIterator
* @stable ICU 2.1
*/
U_CAPI void U_EXPORT2
uiter_setCharacterIterator(UCharIterator *iter, CharacterIterator *charIter);
/**
* Set up a UCharIterator to iterate over a C++ Replaceable.
*
* Sets the UCharIterator function pointers for iteration over the
* Replaceable rep with iteration boundaries start=index=0 and
* length=limit=rep->length().
* The "provider" may set the start, index, and limit values at any time
* within the range 0..length=rep->length().
* The length field will be ignored.
*
* The Replaceable pointer rep is set into UCharIterator.context without copying
* or cloning/reallocating the Replaceable object.
*
* getState() simply returns the current index.
* move() will always return the final index.
*
* @param iter UCharIterator structure to be set for iteration
* @param rep Replaceable to iterate over
*
* @see UCharIterator
* @stable ICU 2.1
*/
U_CAPI void U_EXPORT2
uiter_setReplaceable(UCharIterator *iter, const Replaceable *rep);
#endif
U_CDECL_END
#endif
--- NEW FILE: unifilt.h ---
/*
* Copyright (C) 1999, International Business Machines Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
* 11/17/99 aliu Creation.
**********************************************************************
*/
#ifndef UNIFILT_H
#define UNIFILT_H
#include "unicode/unifunct.h"
#include "unicode/unimatch.h"
U_NAMESPACE_BEGIN
/**
* U_ETHER is used to represent character values for positions outside
* a range. For example, transliterator uses this to represent
* characters outside the range contextStart..contextLimit-1. This
* allows explicit matching by rules and UnicodeSets of text outside a
* defined range.
*/
#define U_ETHER ((UChar)0xFFFF)
/**
* <code>UnicodeFilter</code> defines a protocol for selecting a
* subset of the full range (U+0000 to U+10FFFF) of Unicode characters.
* Currently, filters are used in conjunction with classes like {@link
* Transliterator} to only process selected characters through a
* transformation.
*
* <p>Note: UnicodeFilter currently stubs out two pure virtual methods
* of its base class, UnicodeMatcher. These methods are toPattern()
* and matchesIndexValue(). This is done so that filter classes that
* are not actually used as matchers -- specifically, those in the
* UnicodeFilterLogic component, and those in tests -- can continue to
* work without defining these methods. As long as a filter is not
* used in an RBT during real transliteration, these methods will not
* be called. However, this breaks the UnicodeMatcher base class
* protocol, and it is not a correct solution.
*
* <p>In the future we may revisit the UnicodeMatcher / UnicodeFilter
* hierarchy and either redesign it, or simply remove the stubs in
* UnicodeFilter and force subclasses to implement the full
* UnicodeMatcher protocol.
*
* @see UnicodeFilterLogic
* @stable ICU 2.0
*/
class U_COMMON_API UnicodeFilter : public UnicodeFunctor, public UnicodeMatcher {
public:
/**
* Destructor
* @stable ICU 2.0
*/
virtual ~UnicodeFilter();
/**
* Returns <tt>true</tt> for characters that are in the selected
* subset. In other words, if a character is <b>to be
* filtered</b>, then <tt>contains()</tt> returns
* <b><tt>false</tt></b>.
*
* Pure virtual: every concrete filter must supply this selection test.
*
* @param c the character (code point) to test
* @return <tt>true</tt> if c is in the selected subset,
* <tt>false</tt> if c is filtered out
* @stable ICU 2.0
*/
virtual UBool contains(UChar32 c) const = 0;
/**
* UnicodeFunctor API. Cast 'this' to a UnicodeMatcher* pointer
* and return the pointer.
* @draft ICU 2.4
*/
virtual UnicodeMatcher* toMatcher() const;
/**
* Implement UnicodeMatcher API.
*
* Declared with the same signature as the pure virtual
* UnicodeMatcher::matches(); the parameters and the return value have
* the meaning documented on that method. Only the declaration appears
* here; the definition is out of line.
* @draft ICU 2.4
*/
virtual UMatchDegree matches(const Replaceable& text,
int32_t& offset,
int32_t limit,
UBool incremental);
/**
* UnicodeFunctor API. Nothing to do.
*
* Empty inline override of the pure virtual UnicodeFunctor::setData();
* the argument is ignored.
* @draft ICU 2.4
*/
virtual void setData(const TransliterationRuleData*) {}
/**
* ICU "poor man's RTTI", returns a UClassID for the actual class.
*
* Pure virtual at this level, so concrete subclasses must implement it.
*
* @draft ICU 2.2
*/
virtual inline UClassID getDynamicClassID() const = 0;
/**
* ICU "poor man's RTTI", returns a UClassID for this class.
*
* The ID is the address of the private static member fgClassID.
*
* @draft ICU 2.2
*/
static inline UClassID getStaticClassID() { return (UClassID)&fgClassID; }
protected:
/**
* Default constructor. Protected, so it is reachable only from
* subclasses (the class is abstract in any case: contains() and
* getDynamicClassID() are pure virtual).
* @stable ICU 2.0
*/
UnicodeFilter();
private:
/**
* The address of this static class variable serves as this class's ID
* for ICU "poor man's RTTI".
*/
static const char fgClassID;
};
/* The constructor and the destructor are both empty inline definitions. */
inline UnicodeFilter::UnicodeFilter() {}
inline UnicodeFilter::~UnicodeFilter() {}
U_NAMESPACE_END
#endif
--- NEW FILE: unifunct.h ---
/*
**********************************************************************
* Copyright (c) 2002-2003, International Business Machines Corporation
* and others. All Rights Reserved.
**********************************************************************
* Date Name Description
* 01/14/2002 aliu Creation.
**********************************************************************
*/
#ifndef UNIFUNCT_H
#define UNIFUNCT_H
#include "unicode/utypes.h"
#include "unicode/uobject.h"
U_NAMESPACE_BEGIN
class UnicodeMatcher;
class UnicodeReplacer;
class TransliterationRuleData;
/**
* <code>UnicodeFunctor</code> is an abstract base class for objects
* that perform match and/or replace operations on Unicode strings.
* @author Alan Liu
* @draft ICU 2.4
*/
class U_COMMON_API UnicodeFunctor : public UObject {
public:
/**
* Destructor
* @draft ICU 2.4
*/
virtual ~UnicodeFunctor();
/**
* Return a copy of this object. All UnicodeFunctor objects
* have to support cloning in order to allow classes using
* UnicodeFunctor to implement cloning.
*
* Pure virtual: each concrete subclass supplies its own copy logic.
* The return type is the base-class pointer UnicodeFunctor*.
* @draft ICU 2.4
*/
virtual UnicodeFunctor* clone() const = 0;
/**
* Cast 'this' to a UnicodeMatcher* pointer and return the
* pointer, or null if this is not a UnicodeMatcher*. Subclasses
* that mix in UnicodeMatcher as a base class must override this.
* This protocol is required because a pointer to a UnicodeFunctor
* cannot be cast to a pointer to a UnicodeMatcher, since
* UnicodeMatcher is a mixin that does not derive from
* UnicodeFunctor.
* @draft ICU 2.4
*/
virtual UnicodeMatcher* toMatcher() const;
/**
* Cast 'this' to a UnicodeReplacer* pointer and return the
* pointer, or null if this is not a UnicodeReplacer*. Subclasses
* that mix in UnicodeReplacer as a base class must override this.
* This protocol is required because a pointer to a UnicodeFunctor
* cannot be cast to a pointer to a UnicodeReplacer, since
* UnicodeReplacer is a mixin that does not derive from
* UnicodeFunctor.
* @draft ICU 2.4
*/
virtual UnicodeReplacer* toReplacer() const;
/**
* Return the class ID for this class. This is useful only for
* comparing to a return value from getDynamicClassID(). For example:
* <pre>
* . Base* polymorphic_pointer = createPolymorphicObject();
* . if (polymorphic_pointer->getDynamicClassID() ==
* . Derived::getStaticClassID()) ...
* </pre>
* The ID is the address of the private static member fgClassID.
* @return The class ID for all objects of this class.
* @stable ICU 2.0
*/
static UClassID getStaticClassID(void) { return (UClassID)&fgClassID; }
/**
* Returns a unique class ID <b>polymorphically</b>. This method
* is to implement a simple version of RTTI, since not all C++
* compilers support genuine RTTI. Polymorphic operator==() and
* clone() methods call this method.
*
* <p>Concrete subclasses of UnicodeFunctor that wish clients to
* be able to identify them should implement getDynamicClassID()
* and also a static method and data member:
*
* <pre>
* static UClassID getStaticClassID() { return (UClassID)&fgClassID; }
* static char fgClassID;
* </pre>
*
* Subclasses that do not implement this method will have a
* dynamic class ID of UnicodeFunctor::getStaticClassID().
*
* NOTE(review): the method is declared pure virtual (= 0) below, so a
* concrete subclass cannot omit it; the preceding sentence looks stale
* and should be confirmed or removed.
*
* @return The class ID for this object. All objects of a given
* class have the same class ID. Objects of other classes have
* different class IDs.
* @draft ICU 2.4
*/
virtual UClassID getDynamicClassID(void) const = 0;
/**
* Set the data object associated with this functor. The data
* object provides context for functor-to-standin mapping. This
* method is required when assigning a functor to a different data
* object. This function MAY GO AWAY later if the architecture is
* changed to pass data object pointers through the API.
*
* Pure virtual: every concrete subclass must provide it, even if only
* as an empty body.
* @internal ICU 2.1
*/
virtual void setData(const TransliterationRuleData*) = 0;
protected:
/**
* Default constructor. Protected, so it is reachable only from
* subclasses.
* @stable ICU 2.0
*/
UnicodeFunctor();
private:
/**
* Class identifier for subclasses of UnicodeFunctor that do not
* define their class (anonymous subclasses).
* Its address is what getStaticClassID() returns.
*/
static const char fgClassID;
};
/* The constructor and the destructor are both empty inline definitions. */
inline UnicodeFunctor::UnicodeFunctor() {}
inline UnicodeFunctor::~UnicodeFunctor() {}
U_NAMESPACE_END
#endif
--- NEW FILE: unimatch.h ---
/*
* Copyright (C) 2001, International Business Machines Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
* 07/18/01 aliu Creation.
**********************************************************************
*/
#ifndef UNIMATCH_H
#define UNIMATCH_H
#include "unicode/utypes.h"
U_NAMESPACE_BEGIN
class Replaceable;
class UnicodeString;
class UnicodeSet;
/**
* Constants returned by <code>UnicodeMatcher::matches()</code>
* indicating the degree of match.
* @draft ICU 2.4
*/
enum UMatchDegree {
/**
* Constant returned by <code>matches()</code> indicating a
* mismatch between the text and this matcher. The text contains
* a character which does not match, or the text does not contain
* all desired characters for a non-incremental match.
* No initializer is given, so as the first enumerator its value is 0.
* @draft ICU 2.4
*/
U_MISMATCH,
/**
* Constant returned by <code>matches()</code> indicating a
* partial match between the text and this matcher. This value is
* only returned for incremental match operations. All characters
* of the text match, but more characters are required for a
* complete match. Alternatively, for variable-length matchers,
* all characters of the text match, and if more characters were
* supplied at limit, they might also match.
* Implicit value 1.
* @draft ICU 2.4
*/
U_PARTIAL_MATCH,
/**
* Constant returned by <code>matches()</code> indicating a
* complete match between the text and this matcher. For an
* incremental variable-length match, this value is returned if
* the given text matches, and it is known that additional
* characters would not alter the extent of the match.
* Implicit value 2.
* @draft ICU 2.4
*/
U_MATCH
};
/**
* <code>UnicodeMatcher</code> defines a protocol for objects that can
* match a range of characters in a Replaceable string.
* @draft ICU 2.4
*/
class U_COMMON_API UnicodeMatcher /* not : public UObject because this is an interface/mixin class */ {
public:
/**
* Destructor.
* Virtual with an empty inline body. The ';' after the body is
* redundant but harmless.
* @draft ICU 2.4
*/
virtual inline ~UnicodeMatcher() {};
/**
* Return a UMatchDegree value indicating the degree of match for
* the given text at the given offset. Zero, one, or more
* characters may be matched.
*
* Matching in the forward direction is indicated by limit >
* offset. Characters from offset forwards to limit-1 will be
* considered for matching.
*
* Matching in the reverse direction is indicated by limit <
* offset. Characters from offset backwards to limit+1 will be
* considered for matching.
*
* If limit == offset then the only match possible is a zero
* character match (which subclasses may implement if desired).
*
* As a side effect, advance the offset parameter to the limit of
* the matched substring. In the forward direction, this will be
* the index of the last matched character plus one. In the
* reverse direction, this will be the index of the last matched
* character minus one.
*
* <p>Note: This method is not const because some classes may
* modify their state as the result of a match.
*
* @param text the text to be matched
* @param offset on input, the index into text at which to begin
* matching. On output, the limit of the matched text. The
* number of matched characters is the output value of offset
* minus the input value. Offset should always point to the
* HIGH SURROGATE (leading code unit) of a pair of surrogates,
* both on entry and upon return.
* @param limit the limit index of text to be matched. Greater
* than offset for a forward direction match, less than offset for
* a backward direction match. The last character to be
* considered for matching will be text.charAt(limit-1) in the
* forward direction or text.charAt(limit+1) in the backward
* direction.
* @param incremental if TRUE, then assume further characters may
* be inserted at limit and check for partial matching. Otherwise
* assume the text as given is complete.
* @return a match degree value indicating a full match, a partial
* match, or a mismatch. If incremental is FALSE then
* U_PARTIAL_MATCH should never be returned.
* @draft ICU 2.4
*/
virtual UMatchDegree matches(const Replaceable& text,
int32_t& offset,
int32_t limit,
UBool incremental) = 0;
/**
* Returns a string representation of this matcher. If the result of
* calling this function is passed to the appropriate parser, it
* will produce another matcher that is equal to this one.
* @param result the string to receive the pattern. Previous
* contents will be deleted.
* @param escapeUnprintable if TRUE then convert unprintable
* character to their hex escape representations, \uxxxx or
* \Uxxxxxxxx. Unprintable characters are those other than
* U+000A, U+0020..U+007E. Defaults to FALSE.
* @return a UnicodeString reference; presumably the result argument
* itself -- confirm against the implementations
* @draft ICU 2.4
*/
virtual UnicodeString& toPattern(UnicodeString& result,
UBool escapeUnprintable = FALSE) const = 0;
/**
* Returns TRUE if this matcher will match a character c, where c
* & 0xFF == v, at offset, in the forward direction (with limit >
* offset). This is used by <tt>RuleBasedTransliterator</tt> for
* indexing.
* @param v the index value, i.e. the low 8 bits (c & 0xFF) of a
* candidate character c
* @return TRUE if the matcher will match such a character
* @draft ICU 2.4
*/
virtual UBool matchesIndexValue(uint8_t v) const = 0;
/**
* Union the set of all characters that may be matched by this object
* into the given set.
* @param toUnionTo the set into which to union the source characters
* @draft ICU 2.4
*/
virtual void addMatchSetTo(UnicodeSet& toUnionTo) const = 0;
};
U_NAMESPACE_END
#endif
--- NEW FILE: uniset.h ---
/*
**********************************************************************
* Copyright (C) 1999-2003, International Business Machines Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
* 10/20/99 alan Creation.
**********************************************************************
*/
#ifndef UNICODESET_H
#define UNICODESET_H
#include "unicode/unifilt.h"
#include "unicode/utypes.h"
#include "unicode/unistr.h"
#include "unicode/uchar.h"
#include "unicode/uset.h"
U_NAMESPACE_BEGIN
[...1320 lines suppressed...]
/**
 * Inequality test: the logical negation of operator==() applied to the
 * same operand.
 */
inline UBool UnicodeSet::operator!=(const UnicodeSet& o) const {
    if (operator==(o)) {
        return FALSE;
    }
    return TRUE;
}
/**
 * Range overload: the logical negation of containsNone(start, end).
 */
inline UBool UnicodeSet::containsSome(UChar32 start, UChar32 end) const {
    if (containsNone(start, end)) {
        return FALSE;
    }
    return TRUE;
}
/**
 * Set overload: the logical negation of containsNone(s).
 */
inline UBool UnicodeSet::containsSome(const UnicodeSet& s) const {
    if (containsNone(s)) {
        return FALSE;
    }
    return TRUE;
}
/**
 * String overload: the logical negation of containsNone(s).
 */
inline UBool UnicodeSet::containsSome(const UnicodeString& s) const {
    if (containsNone(s)) {
        return FALSE;
    }
    return TRUE;
}
U_NAMESPACE_END
#endif
--- NEW FILE: uobject.h ---
/*
******************************************************************************
*
* Copyright (C) 2002-2003, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
* file name: uobject.h
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2002jun26
* created by: Markus W. Scherer
*/
#ifndef __UOBJECT_H__
#define __UOBJECT_H__
#include "unicode/utypes.h"
U_NAMESPACE_BEGIN
/**
* \file
* \brief C++ API: Common ICU base class UObject.
*/
/** U_OVERRIDE_CXX_ALLOCATION - Define this to override operator new and
* delete in UMemory. Enabled by default for ICU.
*
* Enabling forces all allocation of ICU object types to use ICU's
* memory allocation. On Windows, this allows the ICU DLL to be used by
* applications that statically link the C Runtime library, meaning that
* the app and ICU will be using different heaps.
*
* @draft ICU 2.2
*/
#ifndef U_OVERRIDE_CXX_ALLOCATION
#define U_OVERRIDE_CXX_ALLOCATION 1
#endif
/** U_HAVE_PLACEMENT_NEW - Define this to define the placement new and
* delete in UMemory for STL.
*
* @draft ICU 2.6
*/
#ifndef U_HAVE_PLACEMENT_NEW
#define U_HAVE_PLACEMENT_NEW 1
#endif
/**
* UMemory is the common ICU base class.
* All other ICU C++ classes are derived from UMemory (starting with ICU 2.4).
*
* This is primarily to make it possible and simple to override the
* C++ memory management by adding new/delete operators to this base class.
*
* To override ALL ICU memory management, including that from plain C code,
* replace the allocation functions declared in cmemory.h
*
* UMemory does not contain any virtual functions.
* Common "boilerplate" functions are defined in UObject.
*
* @draft ICU 2.4
*/
class U_COMMON_API UMemory {
public:
/* The allocation operators below are declared only when
 * U_OVERRIDE_CXX_ALLOCATION is non-zero. */
#if U_OVERRIDE_CXX_ALLOCATION
/**
* Override for ICU4C C++ memory management.
* simple, non-class types are allocated using the macros in common/cmemory.h
* (uprv_malloc(), uprv_free(), uprv_realloc());
* they or something else could be used here to implement C++ new/delete
* for ICU4C C++ classes
* @param size number of bytes requested for the object
* @draft ICU 2.4
*/
static void *operator new(size_t size);
/**
* Override for ICU4C C++ memory management.
* See new().
* @param size number of bytes requested for the array
* @draft ICU 2.4
*/
static void *operator new[](size_t size);
/**
* Override for ICU4C C++ memory management.
* simple, non-class types are allocated using the macros in common/cmemory.h
* (uprv_malloc(), uprv_free(), uprv_realloc());
* they or something else could be used here to implement C++ new/delete
* for ICU4C C++ classes
* @param p pointer to the storage being released
* @draft ICU 2.4
*/
static void operator delete(void *p);
/**
* Override for ICU4C C++ memory management.
* See delete().
* @param p pointer to the array storage being released
* @draft ICU 2.4
*/
static void operator delete[](void *p);
/* The placement forms are declared only when U_HAVE_PLACEMENT_NEW is
 * also non-zero. */
#if U_HAVE_PLACEMENT_NEW
/**
* Override for ICU4C C++ memory management for STL.
* See new().
* Placement form: allocates nothing and returns ptr unchanged; the
* size argument is unnamed and unused.
* @param ptr caller-supplied storage in which the object is constructed
* @return ptr
* @draft ICU 2.6
*/
static inline void * operator new(size_t, void *ptr) { return ptr; }
/**
* Override for ICU4C C++ memory management for STL.
* See delete().
* Placement form matching the placement new above: the body is empty,
* so nothing is released.
* @draft ICU 2.6
*/
static inline void operator delete(void *, void *) {}
#endif /* U_HAVE_PLACEMENT_NEW */
#endif /* U_OVERRIDE_CXX_ALLOCATION */
/*
* Assignment operator not declared. The compiler will provide one
* which does nothing since this class does not contain any data members.
* API/code coverage may show the assignment operator as present and
* untested - ignore.
* Subclasses need this assignment operator if they use compiler-provided
* assignment operators of their own. An alternative to not declaring one
* here would be to declare and empty-implement a protected or public one.
UMemory &UMemory::operator=(const UMemory &);
*/
};
/**
* UObject is the common ICU "boilerplate" class.
* UObject inherits UMemory (starting with ICU 2.4),
* and all other public ICU C++ classes
* are derived from UObject (starting with ICU 2.2).
*
* UObject contains common virtual functions like for ICU's "poor man's RTTI".
* It does not contain default implementations of virtual methods
* like getDynamicClassID to allow derived classes such as Format
* to declare these as pure virtual.
*
* The clone() function is not available in UObject because it is not
* implemented by all ICU classes.
* Many ICU services provide a clone() function for their class trees,
* defined on the service's C++ base class, and all subclasses within that
* service class tree return a pointer to the service base class
* (which itself is a subclass of UObject).
* This is because some compilers do not support covariant (same-as-this)
* return types; cast to the appropriate subclass if necessary.
*
* @draft ICU 2.2
*/
class U_COMMON_API UObject : public UMemory {
public:
/**
* Destructor.
* Virtual, with an empty inline body.
*
* @draft ICU 2.2
*/
virtual inline ~UObject() {}
/**
* ICU4C "poor man's RTTI", returns a UClassID for the actual ICU class.
* Pure virtual, which is what makes UObject abstract.
*
* @draft ICU 2.2
*/
virtual inline UClassID getDynamicClassID() const = 0;
protected:
// the following functions are protected to prevent instantiation and
// direct use of UObject itself
// default constructor
// commented out because UObject is abstract (see getDynamicClassID)
// inline UObject() {}
// copy constructor
// commented out because UObject is abstract (see getDynamicClassID)
// inline UObject(const UObject &other) {}
// Everything up to the matching #endif is compiled only for ICU versions
// later than 2.6 (major > 2, or major == 2 and minor > 6).
#if U_ICU_VERSION_MAJOR_NUM>2 || (U_ICU_VERSION_MAJOR_NUM==2 && U_ICU_VERSION_MINOR_NUM>6)
// TODO post ICU 2.4 (This comment inserted in 2.2)
// some or all of the following "boilerplate" functions may be made public
// in a future ICU4C release when all subclasses implement them
// assignment operator
// (not virtual, see "Taligent's Guide to Designing Programs" pp.73..74)
// commented out because the implementation is the same as a compiler's default
// UObject &operator=(const UObject &other) { return *this; }
// comparison operators
// operator== defaults to object identity: it compares the addresses of the
// two objects. It is virtual, so subclasses can override it.
virtual inline UBool operator==(const UObject &other) const { return this==&other; }
// operator!= is non-virtual and is simply the negation of operator==.
inline UBool operator!=(const UObject &other) const { return !operator==(other); }
// clone() commented out from the base class:
// some compilers do not support co-variant return types
// (i.e., subclasses would have to return UObject * as well, instead of SubClass *)
// see also UObject class documentation.
// virtual UObject *clone() const;
#endif
/*
* Assignment operator not declared. The compiler will provide one
* which does nothing since this class does not contain any data members.
* API/code coverage may show the assignment operator as present and
* untested - ignore.
* Subclasses need this assignment operator if they use compiler-provided
* assignment operators of their own. An alternative to not declaring one
* here would be to declare and empty-implement a protected or public one.
UObject &UObject::operator=(const UObject &);
*/
};
U_NAMESPACE_END
#endif
--- NEW FILE: uset.h ---
/*
*******************************************************************************
*
* Copyright (C) 2002-2003, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: uset.h
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2002mar07
* created by: Markus W. Scherer
*
* C version of UnicodeSet.
*/
/**
* \file
* \brief C API: Unicode Set
*
* <p>This is a C wrapper around the C++ UnicodeSet class.</p>
*/
#ifndef __USET_H__
#define __USET_H__
#include "unicode/utypes.h"
#ifndef UCNV_H
struct USet;
/**
* A UnicodeSet. Use the uset_* API to manipulate. Create with
* uset_open*, and destroy with uset_close.
* @draft ICU 2.4
*/
typedef struct USet USet;
#endif
/**
* Bitmask values to be passed to the UnicodeSet constructor or
* applyPattern() taking an option parameter.
* @draft
*/
enum {
/**
* Ignore white space within patterns unless quoted or escaped.
* @draft
*/
USET_IGNORE_SPACE = 1,
/**
* Enable case insensitive matching. E.g., "[ab]" with this flag
* will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will
* match all except 'a', 'A', 'b', and 'B'.
* @draft
*/
USET_CASE_INSENSITIVE = 2,
/**
* Bitmask for UnicodeSet::closeOver() indicating letter case.
* This may be ORed together with other selectors.
* Has the same numeric value (2) as USET_CASE_INSENSITIVE.
* @internal
*/
USET_CASE = 2,
/**
* Enough for any single-code point set
* Unlike the constants above this is not an option bit: it is the
* element count of USerializedSet.staticArray.
* @internal
*/
USET_SERIALIZED_STATIC_ARRAY_CAPACITY=8
};
/**
* A serialized form of a Unicode set. Limited manipulations are
* possible directly on a serialized set. See below.
* @draft ICU 2.4
*/
typedef struct USerializedSet {
/**
* The serialized Unicode Set.
* Read-only through this pointer (pointer to const uint16_t).
* NOTE(review): presumably points either at staticArray or at
* caller-owned storage -- confirm against the uset_* serialization code.
* @draft ICU 2.4
*/
const uint16_t *array;
/**
* The length of the array that contains BMP characters.
* @draft ICU 2.4
*/
int32_t bmpLength;
/**
* The total length of the array.
* @draft ICU 2.4
*/
int32_t length;
/**
* A small buffer for the array to reduce memory allocations.
* Holds USET_SERIALIZED_STATIC_ARRAY_CAPACITY uint16_t elements.
* @draft ICU 2.4
*/
uint16_t staticArray[USET_SERIALIZED_STATIC_ARRAY_CAPACITY];
} USerializedSet;
/*********************************************************************
* USet API
*********************************************************************/
/**
* Creates a USet object that contains the range of characters
* start..end, inclusive.
* @param start first character of the range, inclusive
* @param end last character of the range, inclusive
* @return a newly created USet. The caller must call uset_close() on
* it when done.
* @draft ICU 2.4
*/
U_CAPI USet* U_EXPORT2
uset_open(UChar32 start, UChar32 end);
/**
* Creates a set from the given pattern. See the UnicodeSet class
* description for the syntax of the pattern language.
* @param pattern a string specifying what characters are in the set
* @param patternLength the length of the pattern, or -1 if null
* terminated
* @param ec the error code
* @return a USet created from the pattern. The caller must call
* uset_close() on it when done.
* @draft ICU 2.4
*/
U_CAPI USet* U_EXPORT2
uset_openPattern(const UChar* pattern, int32_t patternLength,
UErrorCode* ec);
/**
* Creates a set from the given pattern, applying the given options.
* See the UnicodeSet class description for the syntax of the pattern
* language.
* @param pattern a string specifying what characters are in the set
* @param patternLength the length of the pattern, or -1 if null
* terminated
* @param options bitmask for options to apply to the pattern.
* Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
* @param ec the error code
* @return a USet created from the pattern. The caller must call
* uset_close() on it when done.
* @draft ICU 2.4
*/
U_CAPI USet* U_EXPORT2
uset_openPatternOptions(const UChar* pattern, int32_t patternLength,
uint32_t options,
UErrorCode* ec);
/**
* Disposes of the storage used by a USet object. This function should
* be called exactly once for objects returned by uset_open(),
* uset_openPattern() or uset_openPatternOptions().
* @param set the object to dispose of
* @draft ICU 2.4
*/
U_CAPI void U_EXPORT2
uset_close(USet* set);
/**
* Returns a string representation of this set. If the result of
* calling this function is passed to a uset_openPattern(), it
* will produce another set that is equal to this one.
* @param set the set
* @param result the string to receive the rules, may be NULL
* @param resultCapacity the capacity of result, may be 0 if result is NULL
* @param escapeUnprintable if TRUE then convert unprintable
* characters to their hex escape representations, \uxxxx or
* \Uxxxxxxxx. Unprintable characters are those other than
* U+000A, U+0020..U+007E.
* @param ec error code.
* @return length of string, possibly larger than resultCapacity
* @draft ICU 2.4
*/
U_CAPI int32_t U_EXPORT2
uset_toPattern(const USet* set,
UChar* result, int32_t resultCapacity,
UBool escapeUnprintable,
UErrorCode* ec);
/**
* Adds the given character to the given USet. After this call,
* uset_contains(set, c) will return TRUE.
* @param set the object to which to add the character
* @param c the character to add
* @draft ICU 2.4
*/
U_CAPI void U_EXPORT2
uset_add(USet* set, UChar32 c);
/**
* Adds all of the elements in the specified set to this set if
* they're not already present. This operation effectively
* modifies this set so that its value is the <i>union</i> of the two
* sets. The behavior of this operation is unspecified if the specified
* collection is modified while the operation is in progress.
*
* @param set the object to which to add the set
* @param additionalSet the source set whose elements are to be added to this set.
* @draft ICU 2.6
*/
U_CAPI void U_EXPORT2
uset_addAll(USet* set, const USet *additionalSet);
/**
* Adds the given range of characters to the given USet. After this call,
* uset_containsRange(set, start, end) will return TRUE.
* @param set the object to which to add the range
* @param start the first character of the range to add, inclusive
* @param end the last character of the range to add, inclusive
* @draft ICU 2.2
*/
U_CAPI void U_EXPORT2
uset_addRange(USet* set, UChar32 start, UChar32 end);
/**
* Adds the given string to the given USet. After this call,
* uset_containsString(set, str, strLen) will return TRUE.
* @param set the object to which to add the string
* @param str the string to add
* @param strLen the length of the string or -1 if null terminated.
* @draft ICU 2.4
*/
U_CAPI void U_EXPORT2
uset_addString(USet* set, const UChar* str, int32_t strLen);
/**
* Removes the given character from the given USet. After this call,
* uset_contains(set, c) will return FALSE.
* @param set the object from which to remove the character
* @param c the character to remove
* @draft ICU 2.4
*/
U_CAPI void U_EXPORT2
uset_remove(USet* set, UChar32 c);
/**
* Removes the given range of characters from the given USet. After this call,
* uset_containsRange(set, start, end) will return FALSE.
* @param set the object from which to remove the range
* @param start the first character of the range to remove, inclusive
* @param end the last character of the range to remove, inclusive
* @draft ICU 2.2
*/
U_CAPI void U_EXPORT2
uset_removeRange(USet* set, UChar32 start, UChar32 end);
/**
* Removes the given string from the given USet. After this call,
* uset_containsString(set, str, strLen) will return FALSE.
* @param set the object from which to remove the string
* @param str the string to remove
* @param strLen the length of the string or -1 if null terminated.
* @draft ICU 2.4
*/
U_CAPI void U_EXPORT2
uset_removeString(USet* set, const UChar* str, int32_t strLen);
/**
* Inverts this set. This operation modifies this set so that
* its value is its complement. This operation does not affect
* the multicharacter strings, if any.
* @param set the set
* @draft ICU 2.4
*/
U_CAPI void U_EXPORT2
uset_complement(USet* set);
/**
* Removes all of the elements from this set. This set will be
* empty after this call returns.
* @param set the set
* @draft ICU 2.4
*/
U_CAPI void U_EXPORT2
uset_clear(USet* set);
/**
* Returns TRUE if the given USet contains no characters and no
* strings.
* @param set the set
* @return true if set is empty
* @draft ICU 2.4
*/
U_CAPI UBool U_EXPORT2
uset_isEmpty(const USet* set);
/**
* Returns TRUE if the given USet contains the given character.
* @param set the set
* @param c The codepoint to check for within the set
* @return true if set contains c
* @draft ICU 2.4
*/
U_CAPI UBool U_EXPORT2
uset_contains(const USet* set, UChar32 c);
/**
* Returns TRUE if the given USet contains all characters c
* where start <= c && c <= end.
* @param set the set
* @param start the first character of the range to test, inclusive
* @param end the last character of the range to test, inclusive
* @return TRUE if set contains the range
* @draft ICU 2.2
*/
U_CAPI UBool U_EXPORT2
uset_containsRange(const USet* set, UChar32 start, UChar32 end);
/**
* Returns TRUE if the given USet contains the given string.
* @param set the set
* @param str the string
* @param strLen the length of the string or -1 if null terminated.
* @return true if set contains str
* @draft ICU 2.4
*/
U_CAPI UBool U_EXPORT2
uset_containsString(const USet* set, const UChar* str, int32_t strLen);
/**
* Returns the number of characters and strings contained in the given
* USet.
* @param set the set
* @return a non-negative integer counting the characters and strings
* contained in set
* @draft ICU 2.4
*/
U_CAPI int32_t U_EXPORT2
uset_size(const USet* set);
/**
* Returns the number of items in this set. An item is either a range
* of characters or a single multicharacter string.
* @param set the set
* @return a non-negative integer counting the character ranges
* and/or strings contained in set
* @draft ICU 2.4
*/
U_CAPI int32_t U_EXPORT2
uset_getItemCount(const USet* set);
/**
* Returns an item of this set. An item is either a range of
* characters or a single multicharacter string.
* @param set the set
* @param itemIndex a non-negative integer in the range 0..
* uset_getItemCount(set)-1
* @param start pointer to variable to receive first character
* in range, inclusive
* @param end pointer to variable to receive last character in range,
* inclusive
* @param str buffer to receive the string, may be NULL
* @param strCapacity capacity of str, or 0 if str is NULL
* @param ec error code
* @return the length of the string (>= 2), or 0 if the item is a
* range, in which case it is the range *start..*end, or -1 if
* itemIndex is out of range
* @draft ICU 2.4
*/
U_CAPI int32_t U_EXPORT2
uset_getItem(const USet* set, int32_t itemIndex,
UChar32* start, UChar32* end,
UChar* str, int32_t strCapacity,
UErrorCode* ec);
/*********************************************************************
* Serialized set API
*********************************************************************/
/**
* Serializes this set into an array of 16-bit integers. Serialization
* (currently) only records the characters in the set; multicharacter
* strings are ignored.
*
* The array
* has the following format (each line is one 16-bit integer):
*
* length = (n+2*m) | (m!=0?0x8000:0)
* bmpLength = n; present if m!=0
* bmp[0]
* bmp[1]
* ...
* bmp[n-1]
* supp-high[0]
* supp-low[0]
* supp-high[1]
* supp-low[1]
* ...
* supp-high[m-1]
* supp-low[m-1]
*
* The array starts with a header. After the header are n bmp
* code points, then m supplementary code points. Either n or m
* or both may be zero. n+2*m is always <= 0x7FFF.
*
* If there are no supplementary characters (if m==0) then the
* header is one 16-bit integer, 'length', with value n.
*
* If there are supplementary characters (if m!=0) then the header
* is two 16-bit integers. The first, 'length', has value
* (n+2*m)|0x8000. The second, 'bmpLength', has value n.
*
* After the header the code points are stored in ascending order.
* Supplementary code points are stored as most significant 16
* bits followed by least significant 16 bits.
*
* @param set the set
* @param dest pointer to buffer of destCapacity 16-bit integers.
* May be NULL only if destCapacity is zero.
* @param destCapacity size of dest, or zero. Must not be negative.
* @param pErrorCode pointer to the error code. Will be set to
* U_INDEX_OUTOFBOUNDS_ERROR if n+2*m > 0x7FFF. Will be set to
* U_BUFFER_OVERFLOW_ERROR if n+2*m+(m!=0?2:1) > destCapacity.
* @return the total length of the serialized format, including
* the header, that is, n+2*m+(m!=0?2:1), or 0 on error other
* than U_BUFFER_OVERFLOW_ERROR.
* @draft ICU 2.4
*/
U_CAPI int32_t U_EXPORT2
uset_serialize(const USet* set, uint16_t* dest, int32_t destCapacity, UErrorCode* pErrorCode);
/**
* Given a serialized array, fill in the given serialized set object.
* @param fillSet pointer to result
* @param src pointer to start of array
* @param srcLength length of array
* @return true if the given array is valid, otherwise false
* @draft ICU 2.4
*/
U_CAPI UBool U_EXPORT2
uset_getSerializedSet(USerializedSet* fillSet, const uint16_t* src, int32_t srcLength);
/**
* Set the USerializedSet to contain the given character (and nothing
* else).
* @param fillSet pointer to result
* @param c The codepoint to set
* @draft ICU 2.4
*/
U_CAPI void U_EXPORT2
uset_setSerializedToOne(USerializedSet* fillSet, UChar32 c);
/**
* Returns TRUE if the given USerializedSet contains the given
* character.
* @param set the serialized set
* @param c The codepoint to check for within the set
* @return true if set contains c
* @draft ICU 2.4
*/
U_CAPI UBool U_EXPORT2
uset_serializedContains(const USerializedSet* set, UChar32 c);
/**
* Returns the number of disjoint ranges of characters contained in
* the given serialized set. Ignores any strings contained in the
* set.
* @param set the serialized set
* @return a non-negative integer counting the character ranges
* contained in set
* @draft ICU 2.4
*/
U_CAPI int32_t U_EXPORT2
uset_getSerializedRangeCount(const USerializedSet* set);
/**
* Returns a range of characters contained in the given serialized
* set.
* @param set the serialized set
* @param rangeIndex a non-negative integer in the range 0..
* uset_getSerializedRangeCount(set)-1
* @param pStart pointer to variable to receive first character
* in range, inclusive
* @param pEnd pointer to variable to receive last character in range,
* inclusive
* @return true if rangeIndex is valid, otherwise false
* @draft ICU 2.4
*/
U_CAPI UBool U_EXPORT2
uset_getSerializedRange(const USerializedSet* set, int32_t rangeIndex,
UChar32* pStart, UChar32* pEnd);
#endif
--- NEW FILE: usetiter.h ---
/*
**********************************************************************
* Copyright (c) 2002-2003, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* $Source: /usr/local/cvsroot/icu-sword/source/common/unicode/usetiter.h,v $
**********************************************************************
*/
#ifndef USETITER_H
#define USETITER_H
#include "unicode/utypes.h"
#include "unicode/uobject.h"
#include "unicode/unistr.h"
U_NAMESPACE_BEGIN
class UnicodeSet;
class UnicodeString;
/**
* UnicodeSetIterator iterates over the contents of a UnicodeSet. It
* iterates over either code points or code point ranges. After all
* code points or ranges have been returned, it returns the
* multicharacter strings of the UnicodeSet, if any.
*
* <p>To iterate over code points, use a loop like this:
* <pre>
* UnicodeSetIterator it(set);
* while (it.next()) {
*     if (it.isString()) {
*         processString(it.getString());
*     } else {
*         processCodepoint(it.getCodepoint());
*     }
* }
* </pre>
*
* <p>To iterate over code point ranges, use a loop like this:
* <pre>
* UnicodeSetIterator it(set);
* while (it.nextRange()) {
*     if (it.isString()) {
*         processString(it.getString());
*     } else {
*         processCodepointRange(it.getCodepoint(), it.getCodepointEnd());
*     }
* }
* </pre>
* @author M. Davis
* @draft ICU 2.2
*/
class U_COMMON_API UnicodeSetIterator : public UObject {
protected:
/**
* Value of <tt>codepoint</tt> if the iterator points to a string.
* If <tt>codepoint == IS_STRING</tt>, then examine
* <tt>string</tt> for the current iteration result.
* @draft ICU 2.4
*/
enum { IS_STRING = -1 };
/**
* Current code point, or the special value <tt>IS_STRING</tt>, if
* the iterator points to a string.
* @draft ICU 2.4
*/
UChar32 codepoint;
/**
* When iterating over ranges using <tt>nextRange()</tt>,
* <tt>codepointEnd</tt> contains the inclusive end of the
* iteration range, if <tt>codepoint != IS_STRING</tt>. If
* iterating over code points using <tt>next()</tt>, or if
* <tt>codepoint == IS_STRING</tt>, then the value of
* <tt>codepointEnd</tt> is undefined.
* @draft ICU 2.4
*/
UChar32 codepointEnd;
/**
* If <tt>codepoint == IS_STRING</tt>, then <tt>string</tt> points
* to the current string. If <tt>codepoint != IS_STRING</tt>, the
* value of <tt>string</tt> is undefined.
* @draft ICU 2.4
*/
const UnicodeString* string;
public:
/**
* Create an iterator over the given set. The iterator is valid
* only so long as <tt>set</tt> is valid.
* @param set set to iterate over
* @draft ICU 2.4
*/
UnicodeSetIterator(const UnicodeSet& set);
/**
* Create an iterator over nothing. <tt>next()</tt> and
* <tt>nextRange()</tt> return false. This is a convenience
* constructor allowing the target to be set later.
* @draft ICU 2.4
*/
UnicodeSetIterator();
/**
* Destructor.
* @draft ICU 2.4
*/
virtual ~UnicodeSetIterator();
/**
* Returns true if the current element is a string. If so, the
* caller can retrieve it with <tt>getString()</tt>. If this
* method returns false, the current element is a code point or
* code point range, depending on whether <tt>next()</tt> or
* <tt>nextRange()</tt> was called, and the caller can retrieve it
* with <tt>getCodepoint()</tt> and, for a range,
* <tt>getCodepointEnd()</tt>.
* @draft ICU 2.4
*/
inline UBool isString() const;
/**
* Returns the current code point, if <tt>isString()</tt> returned
* false. Otherwise returns an undefined result.
* @draft ICU 2.4
*/
inline UChar32 getCodepoint() const;
/**
* Returns the end of the current code point range, if
* <tt>isString()</tt> returned false and <tt>nextRange()</tt> was
* called. Otherwise returns an undefined result.
* @draft ICU 2.4
*/
inline UChar32 getCodepointEnd() const;
/**
* Returns the current string, if <tt>isString()</tt> returned
* true. Otherwise returns an undefined result.
* @draft ICU 2.4
*/
inline const UnicodeString& getString() const;
/**
* Returns the next element in the set, either a single code point
* or a string. If there are no more elements in the set, return
* false. If <tt>codepoint == IS_STRING</tt>, the value is a
* string in the <tt>string</tt> field. Otherwise the value is a
* single code point in the <tt>codepoint</tt> field.
*
* <p>The order of iteration is all code points in sorted order,
* followed by all strings in sorted order. <tt>codepointEnd</tt> is
* undefined after calling this method. <tt>string</tt> is
* undefined unless <tt>codepoint == IS_STRING</tt>. Do not mix
* calls to <tt>next()</tt> and <tt>nextRange()</tt> without
* calling <tt>reset()</tt> between them. The results of doing so
* are undefined.
*
* @return true if there was another element in the set and this
* object contains the element.
* @draft ICU 2.4
*/
UBool next();
/**
* Returns the next element in the set, either a code point range
* or a string. If there are no more elements in the set, return
* false. If <tt>codepoint == IS_STRING</tt>, the value is a
* string in the <tt>string</tt> field. Otherwise the value is a
* range of one or more code points from <tt>codepoint</tt> to
* <tt>codepointEnd</tt> inclusive.
*
* <p>The order of iteration is all code point ranges in sorted
* order, followed by all strings in sorted order. Ranges are
* disjoint and non-contiguous. <tt>string</tt> is undefined
* unless <tt>codepoint == IS_STRING</tt>. Do not mix calls to
* <tt>next()</tt> and <tt>nextRange()</tt> without calling
* <tt>reset()</tt> between them. The results of doing so are
* undefined.
*
* @return true if there was another element in the set and this
* object contains the element.
* @draft ICU 2.4
*/
UBool nextRange();
/**
* Sets this iterator to visit the elements of the given set and
* resets it to the start of that set. The iterator is valid only
* so long as <tt>set</tt> is valid.
* @param set the set to iterate over.
* @draft ICU 2.4
*/
void reset(const UnicodeSet& set);
/**
* Resets this iterator to the start of the set.
* @draft ICU 2.4
*/
void reset();
/**
* ICU "poor man's RTTI", returns a UClassID for the actual class.
*
* @draft ICU 2.2
*/
virtual inline UClassID getDynamicClassID() const;
/**
* ICU "poor man's RTTI", returns a UClassID for this class.
*
* @draft ICU 2.2
*/
static inline UClassID getStaticClassID();
// ======================= PRIVATES ===========================
protected:
// endElement and nextElement are really UChar32's, but we keep
// them as signed int32_t's so we can do comparisons with
// endElement set to -1. Leave them as int32_t's.
/** The set
* @draft ICU 2.4
*/
const UnicodeSet* set;
/** End range
* @draft ICU 2.4
*/
int32_t endRange;
/** Range
* @draft ICU 2.4
*/
int32_t range;
/** End element
* @draft ICU 2.4
*/
int32_t endElement;
/** Next element
* @draft ICU 2.4
*/
int32_t nextElement;
//UBool abbreviated;
/** Next string
* @draft ICU 2.4
*/
int32_t nextString;
/** String count
* @draft ICU 2.4
*/
int32_t stringCount;
/** Copy constructor. Disallowed.
* @draft ICU 2.4
*/
UnicodeSetIterator(const UnicodeSetIterator&); // disallow
/** Assignment operator. Disallowed.
* @draft ICU 2.4
*/
UnicodeSetIterator& operator=(const UnicodeSetIterator&); // disallow
/** Load range
* @draft ICU 2.4
*/
virtual void loadRange(int32_t range);
private:
/**
* The address of this static class variable serves as this class's ID
* for ICU "poor man's RTTI".
*/
static const char fgClassID;
};
// Class ID for "poor man's RTTI": the address of the static fgClassID member.
inline UClassID
UnicodeSetIterator::getStaticClassID()
{
    const char *classIDAddress = &fgClassID;
    return (UClassID)classIDAddress;
}
// Virtual counterpart of getStaticClassID(): reports this class's own ID.
inline UClassID
UnicodeSetIterator::getDynamicClassID() const
{
    return getStaticClassID();
}
// True when codepoint holds the IS_STRING marker, i.e. the current
// element is a string rather than a code point or range.
inline UBool UnicodeSetIterator::isString() const {
    const UChar32 stringMarker = (UChar32)IS_STRING;
    return codepoint == stringMarker;
}
// Plain accessor for the codepoint field.
inline UChar32 UnicodeSetIterator::getCodepoint() const {
    return this->codepoint;
}
// Plain accessor for the codepointEnd field.
inline UChar32 UnicodeSetIterator::getCodepointEnd() const {
    return this->codepointEnd;
}
// Dereferences the string field; no check is made that it is set.
inline const UnicodeString& UnicodeSetIterator::getString() const {
    const UnicodeString& current = *string;
    return current;
}
U_NAMESPACE_END
#endif
--- NEW FILE: utf_old.h ---
/*
*******************************************************************************
*
* Copyright (C) 2002-2003, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: utf_old.h
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2002sep21
* created by: Markus W. Scherer
*/
/**
* \file
* The macros in utf_old.h are all deprecated and their use discouraged.
[...1121 lines suppressed...]
*/
#define UTF_BACK_N(s, start, i, n) U16_BACK_N(s, start, i, n)
/**
* Take the random-access index i and adjust it so that it points beyond
* a code point. The input index points beyond any code unit
* of a code point and is moved to point beyond the last code unit of the same
* code point. i is never decremented.
* In other words, if i points to a trail surrogate that is preceded by a matching
* lead surrogate, then i is incremented. Otherwise it is not modified.
* This can be used to start an iteration with UTF_PREV_CHAR() from a random index.
* Same as UTF16_SET_CHAR_LIMIT.
* This macro is a direct alias: it expands to
* U16_SET_CP_LIMIT(s, start, i, length) with the same arguments.
* \pre start<i<=length
* \post start<i<=length
*
* @deprecated ICU 2.4. Renamed to U16_SET_CP_LIMIT, see utf_old.h.
*/
#define UTF_SET_CHAR_LIMIT(s, start, i, length) U16_SET_CP_LIMIT(s, start, i, length)
#endif
- Previous message: [sword-cvs] icu-sword/source/config .cvsignore,1.2,1.3 Makefile.inc.in,1.5,1.6 icu-config-bottom,NONE,1.1 icu-config-top,NONE,1.1 icu-config.1.in,NONE,1.1 make2sh.sed,NONE,1.1 mh-aix,1.4,1.5 mh-aix-va,1.4,1.5 mh-alpha-linux-cc,1.4,1.5 mh-alpha-linux-gcc,1.4,1.5 mh-alpha-osf,1.2,1.3 mh-bsd-gcc,1.4,1.5 mh-cygwin,1.4,1.5 mh-cygwin-msvc,NONE,1.1 mh-darwin,1.4,1.5 mh-hpux-acc,1.4,1.5 mh-hpux-cc,1.4,1.5 mh-hpux-gcc,NONE,1.1 mh-irix,1.4,1.5 mh-linux,1.4,1.5 mh-os390,1.4,1.5 mh-os400,1.4,1.5 mh-ptx,1.4,1.5 mh-qnx,NONE,1.1 mh-solaris,1.4,1.5 mh-solaris-gcc,1.4,1.5 mh-unknown,NONE,1.1 test-icu-config.sh,NONE,1.1
- Next message: [sword-cvs] icu-sword/source/test/hdrtst Makefile,1.2,1.3 cxxfiles.txt,NONE,1.1 dfiles.txt,NONE,1.1
- Messages sorted by:
[ date ]
[ thread ]
[ subject ]
[ author ]