[sword-cvs] icu-sword/source/common .cvsignore,1.2,1.3 Makefile.in,1.3,1.4 brkdict.cpp,NONE,1.1 brkdict.h,NONE,1.1 brkiter.cpp,NONE,1.1 caniter.cpp,NONE,1.1 chariter.cpp,1.2,1.3 charstr.h,1.2,1.3 cmemory.c,NONE,1.1 cmemory.h,1.2,1.3 common.dsp,1.3,1.4 common.rc,1.2,1.3 common.vcproj,NONE,1.1 cpputils.h,1.3,1.4 cstring.c,1.3,1.4 cstring.h,1.5,1.6 cwchar.c,1.3,1.4 cwchar.h,1.3,1.4 dbbi.cpp,NONE,1.1 dbbi_tbl.cpp,NONE,1.1 dbbi_tbl.h,NONE,1.1 digitlst.cpp,1.2,1.3 digitlst.h,1.2,1.3 filestrm.c,1.3,1.4 filestrm.h,1.3,1.4 hash.h,1.2,1.3 icucfg.h.in,1.2,1.3 iculserv.cpp,NONE,1.1 iculserv.h,NONE,1.1 icunotif.cpp,NONE,1.1 icunotif.h,NONE,1.1 icuserv.cpp,NONE,1.1 icuserv.h,NONE,1.1 locid.cpp,1.3,1.4 locmap.c,1.3,1.4 locmap.h,1.3,1.4 mutex.cpp,1.2,1.3 mutex.h,1.2,1.3 nameprep.cpp,NONE,1.1 nameprep.h,NONE,1.1 normlzr.cpp,1.4,1.5 propname.cpp,NONE,1.1 propname.h,NONE,1.1 punycode.c,NONE,1.1 punycode.h,NONE,1.1 putil.c,1.5,1.6 rbbi.cpp,NONE,1.1 rbbicst.pl,NONE,1.1 rbbidata.cpp,NONE,1.1 rbbidata.h,NONE,1.1 rbbinode.cpp,NONE,1.1 rbbinode.h,NONE,1.1 rbbirb.cpp,NONE,1.1 rbbirb.h,NONE,1.1 rbbirpt.h,NONE,1.1 rbbirpt.txt,NONE,1.1 rbbiscan.cpp,NONE,1.1 rbbiscan.h,NONE,1.1 rbbisetb.cpp,NONE,1.1 rbbisetb.h,NONE,1.1 rbbistbl.cpp,NONE,1.1 rbbitblb.cpp,NONE,1.1 rbbitblb.h,NONE,1.1 resbund.cpp,1.2,1.3 schriter.cpp,1.2,1.3 sprpimpl.h,NONE,1.1 strprep.cpp,NONE,1.1 strprep.h,NONE,1.1 symtable.h,NONE,1.1 uassert.h,NONE,1.1 ubidi.c,1.2,1.3 ubidiimp.h,1.2,1.3 ubidiln.c,1.2,1.3 ubidiwrt.c,1.2,1.3 ubrk.cpp,NONE,1.1 ucat.c,NONE,1.1 uchar.c,1.4,1.5 uchriter.cpp,1.3,1.4 ucln.h,1.3,1.4 ucln_cmn.c,1.3,1.4 ucln_cmn.h,1.2,1.3 ucmndata.c,1.2,1.3 ucmndata.h,1.2,1.3 ucmp8.c,1.3,1.4 ucmp8.h,1.2,1.3 ucnv.c,1.3,1.4 ucnv2022.c,1.4,1.5 ucnv_bld.c,1.3,1.4 ucnv_bld.h,1.3,1.4 ucnv_cb.c,1.3,1.4 ucnv_cnv.c,1.2,1.3 ucnv_cnv.h,1.2,1.3 ucnv_err.c,1.3,1.4 ucnv_imp.h,1.2,1.3 ucnv_io.c,1.4,1.5 ucnv_io.h,1.2,1.3 ucnv_lmb.c,1.2,1.3 ucnv_u16.c,NONE,1.1 ucnv_u32.c,NONE,1.1 ucnv_u7.c,NONE,1.1 ucnv_u8.c,NONE,1.1 ucnvbocu.c,NONE,1.1 
ucnvhz.c,1.3,1.4 ucnvisci.c,1.3,1.4 ucnvlat1.c,1.2,1
sword@www.crosswire.org
sword@www.crosswire.org
Tue, 9 Sep 2003 19:43:23 -0700
- Previous message: [sword-cvs] icu-sword/source/extra/uconv/unicode uwmsg.h,1.2,1.3
- Next message: [sword-cvs] icu-sword/source/test/testmap .cvsignore,1.3,1.4 Makefile.in,1.3,1.4 testmap.c,1.2,1.3 testmap.dsp,1.2,1.3
- Messages sorted by:
[ date ]
[ thread ]
[ subject ]
[ author ]
Update of /usr/local/cvsroot/icu-sword/source/common
In directory www:/tmp/cvs-serv19862/source/common
Added Files:
.cvsignore Makefile.in brkdict.cpp brkdict.h brkiter.cpp
caniter.cpp chariter.cpp charstr.h cmemory.c cmemory.h
common.dsp common.rc common.vcproj cpputils.h cstring.c
cstring.h cwchar.c cwchar.h dbbi.cpp dbbi_tbl.cpp dbbi_tbl.h
digitlst.cpp digitlst.h filestrm.c filestrm.h hash.h
icucfg.h.in iculserv.cpp iculserv.h icunotif.cpp icunotif.h
icuserv.cpp icuserv.h locid.cpp locmap.c locmap.h mutex.cpp
mutex.h nameprep.cpp nameprep.h normlzr.cpp propname.cpp
propname.h punycode.c punycode.h putil.c rbbi.cpp rbbicst.pl
rbbidata.cpp rbbidata.h rbbinode.cpp rbbinode.h rbbirb.cpp
rbbirb.h rbbirpt.h rbbirpt.txt rbbiscan.cpp rbbiscan.h
rbbisetb.cpp rbbisetb.h rbbistbl.cpp rbbitblb.cpp rbbitblb.h
resbund.cpp schriter.cpp sprpimpl.h strprep.cpp strprep.h
symtable.h uassert.h ubidi.c ubidiimp.h ubidiln.c ubidiwrt.c
ubrk.cpp ucat.c uchar.c uchriter.cpp ucln.h ucln_cmn.c
ucln_cmn.h ucmndata.c ucmndata.h ucmp8.c ucmp8.h ucnv.c
ucnv2022.c ucnv_bld.c ucnv_bld.h ucnv_cb.c ucnv_cnv.c
ucnv_cnv.h ucnv_err.c ucnv_imp.h ucnv_io.c ucnv_io.h
ucnv_lmb.c ucnv_u16.c ucnv_u32.c ucnv_u7.c ucnv_u8.c
ucnvbocu.c ucnvhz.c ucnvisci.c ucnvlat1.c ucnvmbcs.c
ucnvmbcs.h ucnvscsu.c udata.c udatamem.c udatamem.h uenum.c
uenumimp.h uhash.c uhash.h uhash_us.cpp uidna.cpp uiter.cpp
uloc.c umapfile.c umapfile.h umemstrm.c umemstrm.h umutex.c
umutex.h unames.c unifilt.cpp unifunct.cpp uniset.cpp
unistr.cpp unorm.cpp unorm_it.c unorm_it.h unormimp.h
uobject.cpp uprops.c uprops.h uresbund.c uresdata.c uresdata.h
uresimp.h usc_impl.c usc_impl.h uscript.c uset.cpp
usetiter.cpp ushape.c ustr_imp.h ustrcase.c ustrenum.cpp
ustrenum.h ustrfmt.c ustrfmt.h ustring.c ustrtrns.c utf_impl.c
util.cpp util.h utrie.c utrie.h uvector.cpp uvector.h
uvectr32.cpp uvectr32.h
Log Message:
ICU 2.6 commit
--- NEW FILE: brkdict.cpp ---
/*
**********************************************************************
* Copyright (C) 1999-2000 IBM and others. All rights reserved.
**********************************************************************
* Date Name Description
* 12/1/99 rtg Ported from Java
* 01/13/2000 helena Added UErrorCode to ctors.
**********************************************************************
*/
#include "unicode/utypes.h"
#if !UCONFIG_NO_BREAK_ITERATION
#include "unicode/resbund.h"
#include "brkdict.h"
#include "cmemory.h"
U_NAMESPACE_BEGIN
//=================================================================================
// deserialization
//=================================================================================
/**
 * Constructs the dictionary from the "BreakDictionaryData" binary resource
 * found in the resource bundle opened for the "th" locale.
 *
 * The file-name parameter is accepted for interface compatibility but is not
 * used: the data always comes from the resource bundle.
 *
 * Every member is given a defined value up front (pointers NULL, counts 0) so
 * that an object whose construction stops early on an error never exposes
 * indeterminate values.
 *
 * @param status  In/out error code.  Nothing is done if it already indicates
 *                failure; it is set to U_FILE_ACCESS_ERROR if the memory
 *                stream over the resource data cannot be opened.
 */
BreakDictionary::BreakDictionary(const char* /*dictionaryFilename*/, UErrorCode& status)
  : columnMap(NULL),
    numCols(0),
    numColGroups(0),
    table(NULL),
    rowIndex(NULL),
    rowIndexFlags(NULL),
    rowIndexFlagsIndex(NULL),
    rowIndexShifts(NULL)
{
    if (U_FAILURE(status)) return;

    // Open the Thai bundle and fetch the serialized dictionary from it.
    ResourceBundle th((char *)0, Locale("th"), status);
    if (U_FAILURE(status)) return;

    ResourceBundle th_dict = th.get("BreakDictionaryData", status);
    if (U_FAILURE(status)) return;

    int32_t len;
    const uint8_t * data = th_dict.getBinary(len, status);
    if (U_FAILURE(status)) return;

    // Wrap the raw bytes in a memory stream so they can be read sequentially.
    UMemoryStream* dictionaryStream = uprv_mstrm_openBuffer(data, len);
    if (dictionaryStream == 0) {
        status = U_FILE_ACCESS_ERROR;
        return;
    }

    readDictionaryFile(dictionaryStream);
    uprv_mstrm_close(dictionaryStream);
}
/**
 * Releases every table owned by the dictionary: the five index/state arrays
 * obtained from uprv_malloc() and the compact column map.
 */
BreakDictionary::~BreakDictionary()
{
    // The buffers are independent of one another; release them in the
    // reverse of their declaration order, finishing with the column map.
    uprv_free(rowIndexShifts);
    uprv_free(rowIndexFlagsIndex);
    uprv_free(rowIndexFlags);
    uprv_free(rowIndex);
    uprv_free(table);
    ucmp8_close(columnMap);
}
// Byte-swapping macros used by readDictionaryFile().  The data originated from
// a Java program, and Java always writes data out in big-endian format.  On a
// big-endian platform (U_IS_BIG_ENDIAN) the macros expand to nothing; on any
// other platform they reverse the byte order of x and store the result back
// into x.
// NOTE(review): each macro assigns to its argument and evaluates it several
// times, so x must be a plain lvalue without side effects.
#if U_IS_BIG_ENDIAN
#define SWAP32(x)
#define SWAP16(x)
#else
#define SWAP32(x) x = (uint32_t)((x >> 24 & 0xff) | (x >> 8 & 0xff00) | (x << 8 & 0xff0000) | (x << 24 & 0xff000000))
#define SWAP16(x) x = (uint16_t)((x << 8 & 0xff00) | (x >> 8 & 0xff))
#endif
/**
 * Deserializes the dictionary tables from a memory stream.
 *
 * The stream is consumed strictly in this order: version word, column map
 * (16-bit index array, then 8-bit data array), numCols, numColGroups,
 * rowIndex, rowIndexFlagsIndex, rowIndexFlags, rowIndexShifts and finally the
 * state table.  Each array is preceded by a 32-bit element count.  Multi-byte
 * values are passed through SWAP32/SWAP16 after reading.
 *
 * NOTE(review): none of the uprv_malloc() results, the element counts read
 * from the stream, or the uprv_mstrm_read() calls are checked here, and the
 * function has no way to report failure to its caller.
 *
 * @param in  The memory stream positioned at the start of the dictionary data.
 */
void
BreakDictionary::readDictionaryFile(UMemoryStream* in)
{
int32_t l;
int32_t version;
int i;
// read in the version number (right now we just ignore it, so it is
// not byte-swapped either)
uprv_mstrm_read(in, &version, 4);
// read in the column map (this is serialized in its internal form:
// an index array followed by a data array)
uprv_mstrm_read(in, &l, 4);
SWAP32(l);
uint16_t* temp = (uint16_t*) uprv_malloc(sizeof(uint16_t)*l);
uprv_mstrm_read(in, temp, l * sizeof (int16_t) );
for (i = 0; i < l; i++) {
SWAP16(temp[i]);
}
uprv_mstrm_read(in, &l, 4);
SWAP32(l);
int8_t* temp2 = (int8_t*) uprv_malloc(sizeof(int8_t)*l);
uprv_mstrm_read(in, temp2, l);
// both temporary arrays are handed to the compact array; presumably it takes
// ownership of them ("openAdopt") -- confirm against ucmp8.h
columnMap = ucmp8_openAdopt(temp, temp2, l);
// read in numCols and numColGroups
uprv_mstrm_read(in, &numCols, 4);
SWAP32(numCols);
uprv_mstrm_read(in, &numColGroups, 4);
SWAP32(numColGroups);
// read in the row-number index (l 16-bit entries)
uprv_mstrm_read(in, &l, 4);
SWAP32(l);
rowIndex = (int16_t *)uprv_malloc(l*2);
uprv_mstrm_read(in, rowIndex, l * sizeof (int16_t) );
for (i = 0; i < l; i++) {
SWAP16(rowIndex[i]);
}
// load in the populated-cells bitmap: index first, then bitmap list
uprv_mstrm_read(in, &l, 4);
SWAP32(l);
rowIndexFlagsIndex = (int16_t *)uprv_malloc(l*2);
uprv_mstrm_read(in, rowIndexFlagsIndex, l * sizeof(int16_t) );
for (i = 0; i < l; i++) {
SWAP16(rowIndexFlagsIndex[i]);
}
// the bitmap list itself: l 32-bit entries
uprv_mstrm_read(in, &l, 4);
SWAP32(l);
rowIndexFlags = (int32_t *)uprv_malloc(l*4);
uprv_mstrm_read(in, rowIndexFlags, l * sizeof(int32_t));
for (i = 0; i < l; i++) {
SWAP32(rowIndexFlags[i]);
}
// load in the row-shift index (single bytes, so no swapping is needed)
uprv_mstrm_read(in, &l, 4);
SWAP32(l);
rowIndexShifts = (int8_t *)uprv_malloc(l);
uprv_mstrm_read(in, rowIndexShifts, l);
// finally, load in the actual state table (l 16-bit entries)
uprv_mstrm_read(in, &l, 4);
SWAP32(l);
table = (int16_t *)uprv_malloc(l*2);
uprv_mstrm_read(in, table, l * sizeof(int16_t) );
for (i = 0; i < l; i++) {
SWAP16(table[i]);
}
// the reverse column map occurs next in the file. In the C/C++ code, for the
// time being, we're not going to worry about that.
}
//=================================================================================
// access to the words
//=================================================================================
/**
 * Character-based lookup: translates the character into a column number
 * through the column map and then defers to the (row, column) overload.
 * @param row The current state
 * @param ch  The character whose column we're interested in
 * @return The new state to transition to
 */
int16_t
BreakDictionary::at(int32_t row, UChar ch) const
{
    const int16_t column = ucmp8_get(columnMap, ch);
    return at(row, static_cast<int32_t>(column));
}
/**
 * Fetches the cell at the given logical row and column.  For
 * DictionaryBasedBreakIterator the row is a state number, the column is an
 * input, and the result is the row number of the state to move to (0 is the
 * "error" state, and -1 is the "end of word" state in a dictionary).
 * @param row The row number of the current state
 * @param col The column number of the input character (0 means "not a
 *            dictionary character")
 * @return The row number of the new state to transition to, or 0 when the
 *         cell is not populated
 */
int16_t
BreakDictionary::at(int32_t row, int32_t col) const
{
    // Unpopulated cells are not stored at all; they read as the error state.
    if (!cellIsPopulated(row, col)) {
        return 0;
    }

    // Logical -> physical translation: rowIndex supplies the physical row,
    // and the per-row shift is added to the logical column to obtain the
    // physical column.
    const int32_t physicalRow = rowIndex[row];
    const int32_t physicalCol = col + rowIndexShifts[row];
    return internalAt(physicalRow, physicalCol);
}
//=================================================================================
// implementation
//=================================================================================
/**
 * Given (logical) row and column numbers, returns true if the
 * cell in that position is populated.
 *
 * The bit test is carried out in unsigned arithmetic so that testing bit 31
 * never shifts a 1 into the sign bit of a signed int.
 *
 * @param row The logical row number of the cell
 * @param col The logical column number of the cell
 * @return TRUE if the cell holds a value, FALSE otherwise
 */
UBool
BreakDictionary::cellIsPopulated(int32_t row, int32_t col) const
{
    // Look up the entry in the bitmap index for the specified row.
    const int32_t indexEntry = rowIndexFlagsIndex[row];

    // A negative entry means the row has exactly one populated cell, and the
    // absolute value of the entry is that cell's column number.
    if (indexEntry < 0) {
        return (UBool)(col == -indexEntry);
    }

    // Otherwise the entry is the offset of this row's first word in the bitmap
    // list.  When the table is more than 32 columns wide the bitmap occupies
    // successive words, so col / 32 selects the word and col % 32 selects the
    // bit within it.
    const uint32_t flags = (uint32_t)rowIndexFlags[indexEntry + (col >> 5)];
    const uint32_t mask = (uint32_t)1 << (col & 0x1f);
    return (UBool)((flags & mask) != 0);
}
/**
 * Raw table access, used by at() once the cell is known to be populated.
 * @param row The PHYSICAL row number of the cell
 * @param col The PHYSICAL column number of the cell
 * @return The value stored in the cell
 */
int16_t
BreakDictionary::internalAt(int32_t row, int32_t col) const
{
    // The table is stored as a flat array (a layout carried over from the
    // Java original); each physical row is numCols entries long.
    const int32_t offset = row * numCols + col;
    return table[offset];
}
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
--- NEW FILE: brkdict.h ---
/*
**********************************************************************
* Copyright (C) 1999-2000 IBM and others. All rights reserved.
**********************************************************************
* Date Name Description
* 12/1/99 rtg Ported from Java
* 01/13/2000 helena Added UErrorCode to ctors.
**********************************************************************
*/
#ifndef BRKDICT_H
#define BRKDICT_H
#include "unicode/utypes.h"
#include "unicode/uobject.h"
#include "ucmp8.h"
#include "umemstrm.h"
U_NAMESPACE_BEGIN
/**
 * This is the class that represents the list of known words used by
 * DictionaryBasedBreakIterator. The conceptual data structure used
 * here is a trie: there is a node hanging off the root node for every
 * letter that can start a word. Each of these nodes has a node hanging
 * off of it for every letter that can be the second letter of a word
 * if this node is the first letter, and so on. The trie is represented
 * as a two-dimensional array that can be treated as a table of state
 * transitions. Indexes are used to compress this array, taking
 * advantage of the fact that this array will always be very sparse.
 */
class BreakDictionary : public UMemory {
//=================================================================================
// data members
//=================================================================================
private:
/**
* Maps from characters to column numbers. The main use of this is to
* avoid making room in the array for empty columns.
*/
CompactByteArray* columnMap;
/**
* The number of actual columns in the table
*/
int32_t numCols;
/**
* Columns are organized into groups of 32. This says how many
* column groups there are. (We could calculate this, but we store the
* value to avoid having to repeatedly calculate it.)
*/
int32_t numColGroups;
/**
* The actual compressed state table. Each conceptual row represents
* a state, and the cells in it contain the row numbers of the states
* to transition to for each possible letter. 0 is used to indicate
* an illegal combination of letters (i.e., the error state). The
* table is compressed by eliminating all the unpopulated (i.e., zero)
* cells. Multiple conceptual rows can then be doubled up in a single
* physical row by sliding them up and possibly shifting them to one
* side or the other so the populated cells don't collide. Indexes
* are used to identify unpopulated cells and to locate populated cells.
*/
int16_t* table;
/**
* This index maps logical row numbers to physical row numbers
*/
int16_t* rowIndex;
/**
* A bitmap is used to tell which cells in the conceptual table are
* populated. This array contains all the unique bit combinations
* in that bitmap. If the table is more than 32 columns wide,
* successive entries in this array are used for a single row.
*/
int32_t* rowIndexFlags;
/**
* This index maps from a logical row number into the bitmap table above.
* (This keeps us from storing duplicate bitmap combinations.) Since there
* are a lot of rows with only one populated cell, instead of wasting space
* in the bitmap table, we just store a negative number in this index for
* rows with one populated cell. The absolute value of that number is
* the column number of the populated cell.
*/
int16_t* rowIndexFlagsIndex;
/**
* For each logical row, this index contains a constant that is added to
* the logical column number to get the physical column number
*/
int8_t* rowIndexShifts;
//=================================================================================
// deserialization
//=================================================================================
public:
/**
* Constructor. Creates the BreakDictionary by using readDictionaryFile() to
* load the dictionary tables.
* @param dictionaryFilename The name of the dictionary file. Note that the
*        implementation in brkdict.cpp leaves this parameter unnamed and
*        reads the data from a resource bundle instead.
* @param status for errors if it occurs
*/
BreakDictionary(const char* dictionaryFilename, UErrorCode& status);
/**
* Destructor.
*/
~BreakDictionary();
/**
* Reads the serialized dictionary from the given memory stream and constructs
* the appropriate in-memory representation.
* @param in The given memory stream
*/
void readDictionaryFile(UMemoryStream* in);
//=================================================================================
// access to the words
//=================================================================================
/**
* Uses the column map to map the character to a column number, then
* passes the row and column number to the other version of at()
* @param row The current state
* @param ch The character whose column we're interested in
* @return The new state to transition to
*/
int16_t at(int32_t row, UChar ch) const;
/**
* Returns the value in the cell with the specified (logical) row and
* column numbers. In DictionaryBasedBreakIterator, the row number is
* a state number, the column number is an input, and the return value
* is the row number of the new state to transition to. (0 is the
* "error" state, and -1 is the "end of word" state in a dictionary)
* @param row The row number of the current state
* @param col The column number of the input character (0 means "not a
* dictionary character")
* @return The row number of the new state to transition to
*/
int16_t at(int32_t row, int32_t col) const;
private:
/**
* Given (logical) row and column numbers, returns true if the
* cell in that position is populated
* @param row The LOGICAL row number of the cell
* @param col The LOGICAL column number of the cell
* @return true if the cell in that position is populated
*/
UBool cellIsPopulated(int32_t row, int32_t col) const;
/**
* Implementation of at() when we know the specified cell is populated.
* @param row The PHYSICAL row number of the cell
* @param col The PHYSICAL column number of the cell
* @return The value stored in the cell
*/
int16_t internalAt(int32_t row, int32_t col) const;
// the following methods are never meant to be called and so are not defined
// (if you don't declare them, you get default implementations)
BreakDictionary(const BreakDictionary& that);
BreakDictionary& operator=(const BreakDictionary& that);
};
U_NAMESPACE_END
#endif
--- NEW FILE: brkiter.cpp ---
/*
*******************************************************************************
* Copyright (C) 1997-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* File TXTBDRY.CPP
*
* Modification History:
*
* Date Name Description
* 02/18/97 aliu Converted from OpenClass. Added DONE.
* 01/13/2000 helena Added UErrorCode parameter to createXXXInstance methods.
*****************************************************************************************
*/
// *****************************************************************************
// This file was generated from the java source file BreakIterator.java
// *****************************************************************************
#include "unicode/utypes.h"
#if !UCONFIG_NO_BREAK_ITERATION
#include "unicode/dbbi.h"
#include "unicode/brkiter.h"
#include "unicode/udata.h"
#include "unicode/resbund.h"
#include "cstring.h"
#include "mutex.h"
#include "iculserv.h"
// *****************************************************************************
// class BreakIterator
// This class implements methods for finding the location of boundaries in text.
// Instances of BreakIterator maintain a current position and scan over text
// returning the index of characters where boundaries occur.
// *****************************************************************************
U_NAMESPACE_BEGIN
// Definition of the DONE constant, fixed at -1.
// NOTE(review): presumably the value the iteration methods return once no
// further boundary exists -- confirm against unicode/brkiter.h.
const int32_t BreakIterator::DONE = (int32_t)-1;
// -------------------------------------
// Creates a break iterator for word breaks.
// Thin wrapper: forwards to createInstance() with kind UBRK_WORD.
BreakIterator*
BreakIterator::createWordInstance(const Locale& key, UErrorCode& status)
{
return createInstance(key, UBRK_WORD, status);
}
// Builds a word-break iterator directly from the "brk" data files, without
// consulting the registration service.
//
// WARNING: only two cases are handled: the default rules ("word") and the
// Thai rules ("word_th", paired with the "thaidict.brk" dictionary).  This
// will have to be made fully general at some time in the future!
//
// Returns NULL and leaves a failure code in status when anything goes wrong.
BreakIterator*
BreakIterator::makeWordInstance(const Locale& key, UErrorCode& status)
{
    if (U_FAILURE(status)) {
        return NULL;
    }

    // Thai is the only language with its own rules file.
    const UBool isThai = (UBool)(uprv_strcmp(key.getLanguage(), "th") == 0);

    UDataMemory* rulesData = udata_open(NULL, "brk", isThai ? "word_th" : "word", &status);
    if (U_FAILURE(status)) {
        return NULL;
    }

    // The UDataMemory is adopted by the break iterator.
    BreakIterator* iter = NULL;
    if (isThai) {
        iter = new DictionaryBasedBreakIterator(rulesData, "thaidict.brk", status);
    } else {
        iter = new RuleBasedBreakIterator(rulesData, status);
    }

    if (iter == NULL) {
        // Allocation failed, so nothing adopted the data; close it here.
        udata_close(rulesData);
        status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }
    if (U_FAILURE(status)) {
        // The constructor reported an error; discard the unusable iterator.
        delete iter;
        return NULL;
    }
    return iter;
}
// -------------------------------------
// Creates a break iterator for line breaks.
// Thin wrapper: forwards to createInstance() with kind UBRK_LINE.
BreakIterator*
BreakIterator::createLineInstance(const Locale& key, UErrorCode& status)
{
return createInstance(key, UBRK_LINE, status);
}
// Builds a line-break iterator directly from the "brk" data files, without
// consulting the registration service.
//
// WARNING: only two cases are handled: the default rules ("line") and the
// Thai rules ("line_th", paired with the "thaidict.brk" dictionary).  This
// will have to be made fully general at some time in the future!
//
// Returns NULL and leaves a failure code in status when anything goes wrong.
BreakIterator*
BreakIterator::makeLineInstance(const Locale& key, UErrorCode& status)
{
    if (U_FAILURE(status)) {
        return NULL;
    }

    // Thai is the only language with its own rules file.
    const UBool isThai = (UBool)(uprv_strcmp(key.getLanguage(), "th") == 0);

    UDataMemory* rulesData = udata_open(NULL, "brk", isThai ? "line_th" : "line", &status);
    if (U_FAILURE(status)) {
        return NULL;
    }

    // The UDataMemory is adopted by the break iterator.
    BreakIterator* iter = NULL;
    if (isThai) {
        iter = new DictionaryBasedBreakIterator(rulesData, "thaidict.brk", status);
    } else {
        iter = new RuleBasedBreakIterator(rulesData, status);
    }

    if (iter == NULL) {
        // Allocation failed, so nothing adopted the data; close it here.
        udata_close(rulesData);
        status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }
    if (U_FAILURE(status)) {
        // The constructor reported an error; discard the unusable iterator.
        delete iter;
        return NULL;
    }
    return iter;
}
// -------------------------------------
// Creates a break iterator for character breaks.
// Thin wrapper: forwards to createInstance() with kind UBRK_CHARACTER.
BreakIterator*
BreakIterator::createCharacterInstance(const Locale& key, UErrorCode& status)
{
return createInstance(key, UBRK_CHARACTER, status);
}
// Builds a character-break iterator directly from the "char" rules data,
// without consulting the registration service.
//
// WARNING: the locale is ignored; only the default rules file is used.  This
// will have to be made fully general at some time in the future!
//
// Returns NULL and leaves a failure code in status when anything goes wrong.
BreakIterator*
BreakIterator::makeCharacterInstance(const Locale& /* key */, UErrorCode& status)
{
    if (U_FAILURE(status)) {
        return NULL;
    }

    UDataMemory* rulesData = udata_open(NULL, "brk", "char", &status);
    if (U_FAILURE(status)) {
        return NULL;
    }

    // The UDataMemory is adopted by the break iterator.
    BreakIterator* iter = new RuleBasedBreakIterator(rulesData, status);
    if (iter == NULL) {
        // Allocation failed, so nothing adopted the data; close it here.
        udata_close(rulesData);
        status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }
    if (U_FAILURE(status)) {
        // The constructor reported an error; discard the unusable iterator.
        delete iter;
        return NULL;
    }
    return iter;
}
// -------------------------------------
// Creates a break iterator for sentence breaks.
// Thin wrapper: forwards to createInstance() with kind UBRK_SENTENCE.
BreakIterator*
BreakIterator::createSentenceInstance(const Locale& key, UErrorCode& status)
{
return createInstance(key, UBRK_SENTENCE, status);
}
// Builds a sentence-break iterator directly from the "sent" rules data,
// without consulting the registration service.
//
// WARNING: the locale is ignored; only the default rules file is used.  This
// will have to be made fully general at some time in the future!
//
// Returns NULL and leaves a failure code in status when anything goes wrong.
BreakIterator*
BreakIterator::makeSentenceInstance(const Locale& /*key */, UErrorCode& status)
{
    if (U_FAILURE(status)) {
        return NULL;
    }

    UDataMemory* rulesData = udata_open(NULL, "brk", "sent", &status);
    if (U_FAILURE(status)) {
        return NULL;
    }

    // The UDataMemory is adopted by the break iterator.
    BreakIterator* iter = new RuleBasedBreakIterator(rulesData, status);
    if (iter == NULL) {
        // Allocation failed, so nothing adopted the data; close it here.
        udata_close(rulesData);
        status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }
    if (U_FAILURE(status)) {
        // The constructor reported an error; discard the unusable iterator.
        delete iter;
        return NULL;
    }
    return iter;
}
// -------------------------------------
// Creates a break iterator for title casing breaks.
// Thin wrapper: forwards to createInstance() with kind UBRK_TITLE.
BreakIterator*
BreakIterator::createTitleInstance(const Locale& key, UErrorCode& status)
{
return createInstance(key, UBRK_TITLE, status);
}
// Builds a title-casing break iterator directly from the "title" rules data,
// without consulting the registration service.
//
// WARNING: the locale is ignored; only the default rules file is used.  This
// will have to be made fully general at some time in the future!
//
// Returns NULL and leaves a failure code in status when anything goes wrong.
BreakIterator*
BreakIterator::makeTitleInstance(const Locale& /* key */, UErrorCode& status)
{
    if (U_FAILURE(status)) {
        return NULL;
    }

    UDataMemory* rulesData = udata_open(NULL, "brk", "title", &status);
    if (U_FAILURE(status)) {
        return NULL;
    }

    // The UDataMemory is adopted by the break iterator.
    BreakIterator* iter = new RuleBasedBreakIterator(rulesData, status);
    if (iter == NULL) {
        // Allocation failed, so nothing adopted the data; close it here.
        udata_close(rulesData);
        status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }
    if (U_FAILURE(status)) {
        // The constructor reported an error; discard the unusable iterator.
        delete iter;
        return NULL;
    }
    return iter;
}
// -------------------------------------
// Gets all the available locales that have localized text boundary data.
// This simply returns whatever Locale::getAvailableLocales() reports; the
// number of entries is stored in count.
const Locale*
BreakIterator::getAvailableLocales(int32_t& count)
{
return Locale::getAvailableLocales(count);
}
// -------------------------------------
// Gets the objectLocale display name in the default locale language.
// Delegates to Locale::getDisplayName(); the text is written into name and
// the delegate's return value is passed back unchanged.
UnicodeString&
BreakIterator::getDisplayName(const Locale& objectLocale,
UnicodeString& name)
{
return objectLocale.getDisplayName(name);
}
// -------------------------------------
// Gets the objectLocale display name in the displayLocale language.
// Delegates to Locale::getDisplayName(); the text is written into name and
// the delegate's return value is passed back unchanged.
UnicodeString&
BreakIterator::getDisplayName(const Locale& objectLocale,
const Locale& displayLocale,
UnicodeString& name)
{
return objectLocale.getDisplayName(displayLocale, name);
}
// ------------------------------------------
//
// Default constructor and destructor
//
//-------------------------------------------
// The default constructor only clears the fBufferClone flag.
BreakIterator::BreakIterator()
    : fBufferClone(FALSE)
{
}
// Destructor: intentionally empty, there is nothing to release here.
BreakIterator::~BreakIterator()
{
}
// ------------------------------------------
//
// Registration
//
//-------------------------------------------
// The registration service.  It is NULL until getService() creates it on
// first use, and it is deleted again by breakiterator_cleanup().
static ICULocaleService* gService = NULL;
// -------------------------------------
// Factory registered with the break iterator service; it creates iterators
// by calling BreakIterator::makeInstance().
class ICUBreakIteratorFactory : public ICUResourceBundleFactory {
protected:
    // The service argument is not needed to build a break iterator.
    virtual UObject* handleCreate(const Locale& loc, int32_t kind, const ICUService* /*service*/, UErrorCode& status) const {
        return BreakIterator::makeInstance(loc, kind, status);
    }
};
// -------------------------------------
class ICUBreakIteratorService : public ICULocaleService {
public:
ICUBreakIteratorService()
: ICULocaleService("Break Iterator")
{
UErrorCode status = U_ZERO_ERROR;
registerFactory(new ICUBreakIteratorFactory(), status);
}
virtual UObject* cloneInstance(UObject* instance) const {
return ((BreakIterator*)instance)->clone();
}
virtual UObject* handleDefault(const ICUServiceKey& key, UnicodeString* actualID, UErrorCode& status) const {
LocaleKey& lkey = (LocaleKey&)key;
int32_t kind = lkey.kind();
Locale loc;
lkey.currentLocale(loc);
return BreakIterator::makeInstance(loc, kind, status);
}
virtual UBool isDefault() const {
return countFactories() == 1;
}
};
// -------------------------------------
// Returns the break iterator registration service, creating it on first use.
//
// The check of gService and the later assignment are each made while holding
// the mutex obtained with umtx_lock(NULL); the new service object itself is
// constructed outside the lock.  If gService was set in the meantime, the
// freshly built object is deleted instead of being installed.
//
// NOTE(review): the result of new is not checked, so this function can return
// NULL when allocation fails; the final read of gService is also made after
// the mutex has been released.
static ICULocaleService*
getService(void)
{
UBool needsInit;
umtx_lock(NULL);
needsInit = (UBool)(gService == NULL);
umtx_unlock(NULL);
if (needsInit) {
ICULocaleService *tService = new ICUBreakIteratorService();
umtx_lock(NULL);
if (gService == NULL) {
gService = tService;
tService = NULL;
}
umtx_unlock(NULL);
// tService is NULL if it was installed above; otherwise this discards the
// duplicate (deleting NULL is harmless).
delete tService;
}
return gService;
}
// -------------------------------------
// Reports whether the registration service has been created yet, without
// creating it.
// NOTE(review): the local Mutex object presumably holds the global ICU mutex
// for the duration of the function -- confirm against mutex.h.
static UBool
hasService(void)
{
Mutex mutex;
return gService != NULL;
}
// -------------------------------------
// Common entry point behind the public create*Instance() functions.
// If the registration service exists, the iterator is obtained from it;
// otherwise it is built directly with makeInstance().
BreakIterator*
BreakIterator::createInstance(const Locale& loc, UBreakIteratorType kind, UErrorCode& status)
{
    if (U_FAILURE(status)) {
        return NULL;
    }
    if (!hasService()) {
        // Nothing has been registered, so use the built-in data directly.
        return makeInstance(loc, kind, status);
    }
    return (BreakIterator*)gService->get(loc, kind, status);
}
// -------------------------------------
// Registers toAdopt with the break iterator service for the given locale and
// kind, creating the service first if necessary.
//
// getService() returns NULL when the service object could not be allocated;
// in that case status is set to U_MEMORY_ALLOCATION_ERROR and NULL is
// returned instead of dereferencing a null pointer.
URegistryKey
BreakIterator::registerInstance(BreakIterator* toAdopt, const Locale& locale, UBreakIteratorType kind, UErrorCode& status)
{
    ICULocaleService* service = getService();
    if (service == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }
    return service->registerInstance(toAdopt, locale, kind, status);
}
// -------------------------------------
// Removes a registration previously made with registerInstance().
// Returns FALSE without doing anything when status already indicates
// failure.  If no service has been created, the key cannot be valid, so
// status is set to U_ILLEGAL_ARGUMENT_ERROR and FALSE is returned.
UBool
BreakIterator::unregister(URegistryKey key, UErrorCode& status)
{
    if (!U_SUCCESS(status)) {
        return FALSE;
    }
    if (!hasService()) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return FALSE;
    }
    return gService->unregister(key, status);
}
// -------------------------------------
// Returns the locales known to the break iterator service, creating the
// service first if necessary.
//
// getService() returns NULL when the service object could not be allocated;
// NULL is returned in that case instead of dereferencing a null pointer.
StringEnumeration*
BreakIterator::getAvailableLocales(void)
{
    ICULocaleService* service = getService();
    if (service == NULL) {
        return NULL;
    }
    return service->getAvailableLocales();
}
// -------------------------------------
// Dispatches on the requested kind to the matching make*Instance() function.
// An unrecognized kind sets status to U_ILLEGAL_ARGUMENT_ERROR and yields
// NULL.
BreakIterator*
BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
{
    BreakIterator* created = NULL;
    switch (kind) {
    case UBRK_CHARACTER:
        created = BreakIterator::makeCharacterInstance(loc, status);
        break;
    case UBRK_WORD:
        created = BreakIterator::makeWordInstance(loc, status);
        break;
    case UBRK_LINE:
        created = BreakIterator::makeLineInstance(loc, status);
        break;
    case UBRK_SENTENCE:
        created = BreakIterator::makeSentenceInstance(loc, status);
        break;
    case UBRK_TITLE:
        created = BreakIterator::makeTitleInstance(loc, status);
        break;
    default:
        status = U_ILLEGAL_ARGUMENT_ERROR;
        break;
    }
    return created;
}
U_NAMESPACE_END
// defined in ucln_cmn.h
/**
 * Release all static memory held by breakiterator: deletes the registration
 * service, if one was created, and clears the pointer to it.
 * @return always TRUE
 */
U_CFUNC UBool breakiterator_cleanup(void) {
    // Deleting a null pointer is a no-op, so no explicit test is required.
    delete gService;
    gService = NULL;
    return TRUE;
}
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
//eof
--- NEW FILE: caniter.cpp ---
/*
*****************************************************************************
* Copyright (C) 1996-2003, International Business Machines Corporation and *
* others. All Rights Reserved. *
*****************************************************************************
*/
#include "unicode/utypes.h"
#if !UCONFIG_NO_NORMALIZATION
#include "unicode/uset.h"
#include "unicode/ustring.h"
#include "hash.h"
#include "unormimp.h"
#include "unicode/caniter.h"
#include "unicode/normlzr.h"
#include "unicode/uchar.h"
#include "cmemory.h"
/**
* This class allows one to iterate through all the strings that are canonically equivalent to a given
* string. For example, here are some sample results:
Results for: {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}
1: \u0041\u030A\u0064\u0307\u0327
= {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}
2: \u0041\u030A\u0064\u0327\u0307
= {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE}
3: \u0041\u030A\u1E0B\u0327
= {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA}
4: \u0041\u030A\u1E11\u0307
= {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE}
5: \u00C5\u0064\u0307\u0327
= {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}
6: \u00C5\u0064\u0327\u0307
= {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE}
7: \u00C5\u1E0B\u0327
= {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA}
8: \u00C5\u1E11\u0307
= {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE}
9: \u212B\u0064\u0307\u0327
= {ANGSTROM SIGN}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}
10: \u212B\u0064\u0327\u0307
= {ANGSTROM SIGN}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE}
11: \u212B\u1E0B\u0327
= {ANGSTROM SIGN}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA}
12: \u212B\u1E11\u0307
= {ANGSTROM SIGN}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE}
*<br>Note: the code is intended for use with small strings, and is not suitable for larger ones,
* since it has not been optimized for that situation.
*@author M. Davis
*@draft
*/
// Debugging aids, compiled out by the "#if 0" below. They back the
// commented-out "if (PROGRESS) printf(...)" trace lines found further down
// in this file.
#if 0
static UBool PROGRESS = FALSE;
#include <stdio.h>
#include "unicode/translit.h"
UErrorCode status = U_ZERO_ERROR;
// Just for testing - remove, not thread safe.
// Extracts source into a function-static 256-byte char buffer, terminates it
// at the index returned by extract(), and returns that buffer; every call
// overwrites the previous result.
// NOTE(review): no capacity is passed to extract(), so a long source
// presumably overruns the buffer - confirm before ever enabling this.
static const char* UToS(const UnicodeString &source) {
static char buffer[256];
buffer[source.extract(0, source.length(), buffer)] = 0;
return buffer;
}
// Copies source into a function-static string, runs the "name" transliterator
// (created once, on first use) over it, and returns a reference to that
// static string; every call overwrites the previous result.
static const UnicodeString &Tr(const UnicodeString &source) {
static Transliterator *NAME = Transliterator::createInstance("name", UTRANS_FORWARD, status);
static UnicodeString result;
result = source;
NAME->transliterate(result);
return result;
}
#endif
// public
U_NAMESPACE_BEGIN
// TODO: add boilerplate methods.
// Out-of-line definition of the class's static fgClassID member.
// NOTE(review): presumably only its address matters (class-ID lookup) and
// the value 0 is a placeholder - confirm against the class declaration.
const char CanonicalIterator::fgClassID=0;
/**
 * Creates an iterator over the strings canonically equivalent to sourceStr.
 * All table pointers start out NULL and all counts at zero; the source is
 * only installed (through setSource()) when the incoming status does not
 * already report a failure.
 *@param sourceStr string to get results for
 *@param status    in/out error code, handed on to setSource()
 */
CanonicalIterator::CanonicalIterator(const UnicodeString &sourceStr, UErrorCode &status) :
    pieces(NULL),
    pieces_length(0),
    pieces_lengths(NULL),
    current(NULL),
    current_length(0)
{
    if (U_FAILURE(status)) {
        return;     // an earlier error is pending: leave the tables empty
    }
    setSource(sourceStr, status);
}
// Destructor: hands all table cleanup to cleanPieces().
CanonicalIterator::~CanonicalIterator() {
cleanPieces();
}
void CanonicalIterator::cleanPieces() {
int32_t i = 0;
if(pieces != NULL) {
for(i = 0; i < pieces_length; i++) {
if(pieces[i] != NULL) {
delete[] pieces[i];
}
}
uprv_free(pieces);
pieces = NULL;
if(pieces_lengths != NULL) {
uprv_free(pieces_lengths);
}
pieces_lengths = NULL;
if(current != NULL) {
uprv_free(current);
}
current = NULL;
}
}
/**
*@return a copy of the stored source string. NOTE: setSource() stores the NFD
* form of the string it was given, so this is that NFD form and not
* necessarily the text originally passed in.
*/
UnicodeString CanonicalIterator::getSource() {
return source;
}
/**
* Resets the iterator so that one can start again from the beginning.
*/
void CanonicalIterator::reset() {
done = FALSE;
for (int i = 0; i < current_length; ++i) {
current[i] = 0;
}
}
/**
 *@return the next string that is canonically equivalent to the source.
 * Once the iteration is finished the internal buffer is set to bogus and
 * returned instead.
 */
UnicodeString CanonicalIterator::next() {
    if (done) {
        buffer.setToBogus();
        return buffer;
    }

    // build the return value: start from an empty buffer and append the
    // currently selected alternative of every piece
    buffer.remove();
    for (int32_t piece = 0; piece < pieces_length; ++piece) {
        buffer.append(pieces[piece][current[piece]]);
    }

    // prepare the selection for the following call: bump the last index and
    // carry towards the front whenever an index runs past its piece's count
    int32_t pos = current_length - 1;
    while (pos >= 0) {
        if (++current[pos] < pieces_lengths[pos]) {
            break;              // got the next combination
        }
        current[pos] = 0;       // wrapped around: carry into the previous piece
        --pos;
    }
    if (pos < 0) {
        done = TRUE;            // the carry fell off the front: all combinations used
    }

    return buffer;
}
/**
*@param set the source string to iterate against. This allows the same iterator to be used
* while changing the source string, saving object creation.
*/
void CanonicalIterator::setSource(const UnicodeString &newSource, UErrorCode &status) {
Normalizer::normalize(newSource, UNORM_NFD, 0, source, status);
if(U_FAILURE(status)) {
return;
}
done = FALSE;
cleanPieces();
// catch degenerate case
if (newSource.length() == 0) {
pieces_length = 1;
pieces = (UnicodeString **)uprv_malloc(sizeof(UnicodeString *));
/* test for NULL */
if (pieces == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
current_length = 1;
current = (int32_t*)uprv_malloc(1 * sizeof(int32_t));
/* test for NULL */
if (current == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
uprv_free(pieces);
pieces = NULL;
return;
}
current[0] = 0;
pieces[0] = new UnicodeString[1];
/* test for NULL */
if (pieces[0] == 0) {
status = U_MEMORY_ALLOCATION_ERROR;
uprv_free(pieces);
pieces = NULL;
uprv_free(current);
return;
}
pieces[0][0] = UnicodeString("");
pieces_lengths = (int32_t*)uprv_malloc(1 * sizeof(int32_t));
/* test for NULL */
if (pieces_lengths == 0) {
status = U_MEMORY_ALLOCATION_ERROR;
uprv_free(pieces);
pieces = NULL;
uprv_free(current);
return;
}
pieces_lengths[0] = 1;
return;
}
UnicodeString *list = new UnicodeString[source.length()];
/* test for NULL */
if (list == 0) {
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
int32_t list_length = 0;
UChar32 cp = 0;
int32_t start = 0;
// i should initialy be the number of code units at the
// start of the string
int32_t i = UTF16_CHAR_LENGTH(source.char32At(0));
//int32_t i = 1;
// find the segments
// This code iterates through the source string and
// extracts segments that end up on a codepoint that
// doesn't start any decompositions. (Analysis is done
// on the NFD form - see above).
for (; i < source.length(); i += UTF16_CHAR_LENGTH(cp)) {
cp = source.char32At(i);
if (unorm_isCanonSafeStart(cp)) {
source.extract(start, i-start, list[list_length++]); // add up to i
start = i;
}
}
source.extract(start, i-start, list[list_length++]); // add last one
// allocate the arrays, and find the strings that are CE to each segment
pieces = (UnicodeString **)uprv_malloc(list_length * sizeof(UnicodeString *));
/* test for NULL */
if (pieces == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
delete[] list;
return;
}
pieces_length = list_length;
pieces_lengths = (int32_t*)uprv_malloc(list_length * sizeof(int32_t));
/* test for NULL */
if (pieces_lengths == 0) {
status = U_MEMORY_ALLOCATION_ERROR;
delete[] list;
uprv_free(pieces);
pieces = NULL;
return;
}
current_length = list_length;
current = (int32_t*)uprv_malloc(list_length * sizeof(int32_t));
/* test for NULL */
if (current == 0) {
status = U_MEMORY_ALLOCATION_ERROR;
delete[] list;
uprv_free(pieces);
pieces = NULL;
uprv_free(pieces_lengths);
return;
}
for (i = 0; i < current_length; i++) {
current[i] = 0;
}
// for each segment, get all the combinations that can produce
// it after NFD normalization
for (i = 0; i < pieces_length; ++i) {
//if (PROGRESS) printf("SEGMENT\n");
pieces[i] = getEquivalents(list[i], pieces_lengths[i], status);
}
delete[] list;
}
/**
 * Dumb recursive implementation of permutation.
 * TODO: optimize
 *
 * Every permutation found is stored in `result` as a heap-allocated
 * UnicodeString, keyed by its own contents. `source` itself is not modified;
 * the recursion works on a copy.
 *
 * @param source    the string to find permutations for
 * @param skipZeros when true, a character with canonical combining class zero
 *                  is never moved to the front (unless it already is first)
 * @param result    table receiving the permutations; the callers in this file
 *                  install uhash_deleteUnicodeString on it so that it owns
 *                  the stored strings
 * @param status    in/out error code; nothing is done if it already reports
 *                  a failure
 */
void CanonicalIterator::permute(UnicodeString &source, UBool skipZeros, Hashtable *result, UErrorCode &status) {
    if(U_FAILURE(status)) {
        return;
    }
    //if (PROGRESS) printf("Permute: %s\n", UToS(Tr(source)));
    int32_t i = 0;

    // optimization:
    // if zero or one character, just return a set with it
    // the cheap length() <= 2 test comes first to keep from counting code
    // points all the time (one code point is at most two code units)
    if (source.length() <= 2 && source.countChar32() <= 1) {
        UnicodeString *toPut = new UnicodeString(source);
        /* test for NULL */
        if (toPut == 0) {
            status = U_MEMORY_ALLOCATION_ERROR;
            return;
        }
        result->put(source, toPut, status);
        return;
    }

    // otherwise iterate through the string, and recursively permute all the other characters
    UChar32 cp;
    Hashtable *subpermute = new Hashtable(FALSE, status);
    /* test for NULL */
    if (subpermute == 0) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    if (U_FAILURE(status)) {
        // The table reported a construction failure. The first loop pass
        // would end in this same "delete and return", but only after calling
        // removeAll() on the failed table, so leave right away.
        delete subpermute;
        return;
    }
    subpermute->setValueDeleter(uhash_deleteUnicodeString);

    for (i = 0; i < source.length(); i += UTF16_CHAR_LENGTH(cp)) {
        cp = source.char32At(i);

        // optimization:
        // if the character is canonical combining class zero,
        // don't permute it
        if (skipZeros && i != 0 && u_getCombiningClass(cp) == 0) {
            //System.out.println("Skipping " + Utility.hex(UTF16.valueOf(source, i)));
            continue;
        }

        subpermute->removeAll();

        // see what the permutations of the characters before and after this one are
        // replace() is destructive, so it is applied to a copy; the copy is
        // only made once we know this character is not being skipped
        UnicodeString subPermuteString = source;
        permute(subPermuteString.replace(i, UTF16_CHAR_LENGTH(cp), NULL, 0), skipZeros, subpermute, status);
        /* Test for buffer overflows */
        if(U_FAILURE(status)) {
            delete subpermute;
            return;
        }

        // prefix this character to all of them
        int32_t el = -1;
        const UHashElement *ne = subpermute->nextElement(el);
        while (ne != NULL) {
            UnicodeString *permRes = (UnicodeString *)(ne->value.pointer);
            UnicodeString *chStr = new UnicodeString(cp);
            //test for NULL
            if (chStr == NULL) {
                status = U_MEMORY_ALLOCATION_ERROR;
                delete subpermute;
                return;
            }
            chStr->append(*permRes);
            //if (PROGRESS) printf(" Piece: %s\n", UToS(*chStr));
            result->put(*chStr, chStr, status);
            ne = subpermute->nextElement(el);
        }
    }
    delete subpermute;
}
// privates

/**
 * We have a segment, in NFD. Find all the strings that are canonically
 * equivalent to it.
 *
 * getEquivalents2() supplies the basic candidates; each candidate is then
 * permuted, and only permutations whose NFD form equals the segment are kept.
 *
 * @param segment    the segment, in NFD
 * @param result_len receives the number of strings in the returned array;
 *                   set to 0 whenever NULL is returned
 * @param status     in/out error code. Segments that do not fit the
 *                   256-UChar work buffer fail with the error reported by
 *                   UnicodeString::extract(). U_ILLEGAL_ARGUMENT_ERROR is set
 *                   when no equivalent string at all was found.
 * @return an array allocated with new[] that the caller must delete[], or
 *         NULL on failure
 */
UnicodeString* CanonicalIterator::getEquivalents(const UnicodeString &segment, int32_t &result_len, UErrorCode &status) {
    result_len = 0;     // defined value for the caller on every failure path

    Hashtable *result = new Hashtable(FALSE, status);
    /* test for NULL */
    if (result == 0) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return 0;
    }
    if (U_SUCCESS(status)) {
        result->setValueDeleter(uhash_deleteUnicodeString);
    }

    UChar USeg[256];
    int32_t segLen = segment.extract(USeg, 256, status);
    /* Test for buffer overflows */
    if (U_FAILURE(status)) {
        // USeg/segLen cannot be trusted (the segment may not have fit), so
        // they must not be passed on
        delete result;
        return 0;
    }

    Hashtable *basic = getEquivalents2(USeg, segLen, status);
    if (basic == NULL) {
        // getEquivalents2() reports its failure through status; the table
        // used to be dereferenced unconditionally below
        delete result;
        return 0;
    }

    // now get all the permutations
    // add only the ones that are canonically equivalent
    // TODO: optimize by not permuting any class zero.
    Hashtable *permutations = new Hashtable(FALSE, status);
    /* test for NULL */
    if (permutations == 0) {
        status = U_MEMORY_ALLOCATION_ERROR;
        delete result;
        delete basic;
        return 0;
    }
    if (U_SUCCESS(status)) {
        permutations->setValueDeleter(uhash_deleteUnicodeString);
    }

    const UHashElement *ne = NULL;
    int32_t el = -1;
    ne = basic->nextElement(el);
    while (ne != NULL) {
        UnicodeString item = *((UnicodeString *)(ne->value.pointer));

        permutations->removeAll();
        permute(item, CANITER_SKIP_ZEROES, permutations, status);

        const UHashElement *ne2 = NULL;
        int32_t el2 = -1;
        ne2 = permutations->nextElement(el2);
        while (ne2 != NULL) {
            UnicodeString possible(*((UnicodeString *)(ne2->value.pointer)));
            UnicodeString attempt;
            Normalizer::normalize(possible, UNORM_NFD, 0, attempt, status);

            // TODO: check if operator == is semantically the same as attempt.equals(segment)
            if (attempt==segment) {
                //if (PROGRESS) printf("Adding Permutation: %s\n", UToS(Tr(*possible)));
                // TODO: use the hashtable just to catch duplicates - store strings directly (somehow).
                result->put(possible, new UnicodeString(possible), status); //add(possible);
            } else {
                //if (PROGRESS) printf("-Skipping Permutation: %s\n", UToS(Tr(*possible)));
            }

            ne2 = permutations->nextElement(el2);
        }
        ne = basic->nextElement(el);
    }

    /* Test for buffer overflows */
    if(U_FAILURE(status)) {
        delete result;
        delete permutations;
        delete basic;
        return 0;
    }

    // convert into an array to clean up storage
    UnicodeString *finalResult = NULL;
    int32_t resultCount = result->count();
    if (resultCount != 0) {
        finalResult = new UnicodeString[resultCount];
    } else {
        status = U_ILLEGAL_ARGUMENT_ERROR;
    }
    /* test for NULL */
    if (finalResult == 0) {
        if(U_SUCCESS(status)) {
            status = U_MEMORY_ALLOCATION_ERROR;
        }
        delete result;
        delete permutations;
        delete basic;
        return 0;
    }

    el = -1;
    ne = result->nextElement(el);
    while(ne != NULL) {
        finalResult[result_len++] = *((UnicodeString *)(ne->value.pointer));
        ne = result->nextElement(el);
    }

    delete permutations;
    delete basic;
    delete result;
    return finalResult;
}
/**
 * Collects the basic set of strings equivalent to a segment: the segment
 * itself plus, for every position, each string obtained by replacing a run
 * starting there with a composite whose decomposition matches (as judged by
 * extract()), followed by the equivalents of the remainder.
 *
 * @param segment the segment text (UTF-16 code units)
 * @param segLen  number of code units in segment
 * @param status  in/out error code
 * @return a new Hashtable owning its UnicodeString values, which the caller
 *         must delete; NULL on failure, in which case status reports the
 *         error
 */
Hashtable *CanonicalIterator::getEquivalents2(const UChar *segment, int32_t segLen, UErrorCode &status) {
    Hashtable *result = new Hashtable(FALSE, status);
    /* test for NULL */
    if (result == 0) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return 0;
    }
    if (U_FAILURE(status)) {
        // Either the caller came in with an error or the table could not be
        // set up. The function ended by returning NULL in this case anyway;
        // bail out before touching the table or the segment.
        delete result;
        return 0;
    }
    result->setValueDeleter(uhash_deleteUnicodeString);

    //if (PROGRESS) printf("Adding: %s\n", UToS(Tr(segment)));
    UnicodeString toPut(segment, segLen);
    result->put(toPut, new UnicodeString(toPut), status);

    USerializedSet starts;

    // cycle through all the characters
    // cp  is the code point of the segment at position i
    // cp2 walks the candidate composites. It must stay separate from cp,
    //     because the outer loop advances i by the length of cp; sharing one
    //     variable advanced i by the length of whatever candidate was tried last
    UChar32 cp, cp2, end = 0;
    int32_t i = 0, j;
    for (i = 0; i < segLen; i += UTF16_CHAR_LENGTH(cp)) {
        // see if any character is at the start of some decomposition
        UTF_GET_CHAR(segment, 0, i, segLen, cp);
        if (!unorm_getCanonStartSet(cp, &starts)) {
            continue;
        }
        // if so, see which decompositions match
        for(j = 0, cp2 = end+1; cp2 <= end || uset_getSerializedRange(&starts, j++, &cp2, &end); ++cp2) {
            Hashtable *remainder = extract(cp2, segment, segLen, i, status);
            if (remainder == NULL) continue;

            // there were some matches, so add all the possibilities to the set.
            UnicodeString prefix(segment, i);
            prefix += cp2;

            const UHashElement *ne = NULL;
            int32_t el = -1;
            ne = remainder->nextElement(el);
            while (ne != NULL) {
                UnicodeString item = *((UnicodeString *)(ne->value.pointer));
                UnicodeString *toAdd = new UnicodeString(prefix);
                /* test for NULL */
                if (toAdd == 0) {
                    status = U_MEMORY_ALLOCATION_ERROR;
                    delete result;
                    delete remainder;
                    return 0;
                }
                *toAdd += item;
                result->put(*toAdd, toAdd, status);
                //if (PROGRESS) printf("Adding: %s\n", UToS(Tr(*toAdd)));
                ne = remainder->nextElement(el);
            }
            delete remainder;
        }
    }

    /* Test for buffer overflows */
    if(U_FAILURE(status)) {
        delete result;      // was leaked on this path
        return 0;
    }
    return result;
}
/**
 * See if the decomposition of comp is at segment starting at segmentPos
 * (with canonical rearrangement!)
 * If so, take the remainder, and return the equivalents
 *
 * @param comp       candidate composite code point
 * @param segment    the segment text (UTF-16 code units)
 * @param segLen     number of code units in segment
 * @param segmentPos position in segment at which to start matching
 * @param status     in/out error code; set to U_BUFFER_OVERFLOW_ERROR when
 *                   the remainder does not fit the fixed work buffer
 * @return a new Hashtable (owned by the caller) with the equivalents of the
 *         remainder, holding just the empty string when nothing remains;
 *         NULL when comp does not match or on failure
 */
Hashtable *CanonicalIterator::extract(UChar32 comp, const UChar *segment, int32_t segLen, int32_t segmentPos, UErrorCode &status) {
    //if (PROGRESS) printf(" extract: %s, ", UToS(Tr(UnicodeString(comp))));
    //if (PROGRESS) printf("%s, %i\n", UToS(Tr(segment)), segmentPos);
    if (U_FAILURE(status)) {
        return NULL;
    }

    const int32_t bufSize = 256;
    int32_t bufLen = 0;
    UChar temp[bufSize];

    const int32_t decompSize = 64;
    int32_t inputLen = 0;
    UChar decomp[decompSize];

    // temp holds comp followed by the remainder collected below
    U16_APPEND_UNSAFE(temp, inputLen, comp);
    int32_t decompLen = unorm_getDecomposition(comp, FALSE, decomp, decompSize);
    if(decompLen < 0) {
        decompLen = -decompLen;
    }

    // buff aliases the part of temp behind comp, so its capacity is smaller
    // than temp's by the code units comp occupies
    UChar *buff = temp+inputLen;
    const int32_t buffCapacity = bufSize - inputLen;

    // See if it matches the start of segment (at segmentPos)
    UBool ok = FALSE;
    UChar32 cp;
    int32_t decompPos = 0;
    UChar32 decompCp;
    UTF_NEXT_CHAR(decomp, decompPos, decompLen, decompCp);

    int32_t i;
    UBool overflow = FALSE;

    i = segmentPos;
    while(i < segLen) {
        UTF_NEXT_CHAR(segment, i, segLen, cp);

        if (cp == decompCp) { // if equal, eat another cp from decomp
            //if (PROGRESS) printf(" matches: %s\n", UToS(Tr(UnicodeString(cp))));
            if (decompPos == decompLen) { // done, have all decomp characters!
                const int32_t restLen = segLen - i;
                if (restLen > buffCapacity - bufLen) {
                    status = U_BUFFER_OVERFLOW_ERROR;
                    return NULL;
                }
                uprv_memcpy(buff+bufLen, segment+i, restLen*sizeof(UChar));
                bufLen += restLen;
                ok = TRUE;
                break;
            }
            UTF_NEXT_CHAR(decomp, decompPos, decompLen, decompCp);
        } else {
            //if (PROGRESS) printf(" buffer: %s\n", UToS(Tr(UnicodeString(cp))));
            // brute force approach
            U16_APPEND(buff, bufLen, buffCapacity, cp, overflow);
            if(overflow) {
                /*
                 * The buffer is large, but an overflow may still happen with
                 * unusual input (many combining marks?). Report it instead of
                 * silently dropping the character.
                 * TODO: reallocate buffer and continue.
                 */
                status = U_BUFFER_OVERFLOW_ERROR;
                return NULL;
            }
            /* TODO: optimize
            // since we know that the classes are monotonically increasing, after zero
            // e.g. 0 5 7 9 0 3
            // we can do an optimization
            // there are only a few cases that work: zero, less, same, greater
            // if both classes are the same, we fail
            // if the decomp class < the segment class, we fail

            segClass = getClass(cp);
            if (decompClass <= segClass) return null;
            */
        }
    }
    if (!ok) return NULL; // we failed, characters left over

    //if (PROGRESS) printf("Matches\n");

    if (bufLen == 0) {
        Hashtable *result = new Hashtable(FALSE, status);
        /* test for NULL */
        if (result == 0) {
            status = U_MEMORY_ALLOCATION_ERROR;
            return 0;
        }
        if (U_FAILURE(status)) {
            delete result;
            return 0;
        }
        result->setValueDeleter(uhash_deleteUnicodeString);
        result->put(UnicodeString(), new UnicodeString(), status);
        return result; // succeed, but no remainder
    }

    // brute force approach
    // check to make sure result is canonically equivalent
    int32_t tempLen = inputLen + bufLen;

    UChar trial[bufSize];
    int32_t trialLen = unorm_decompose(trial, bufSize, temp, tempLen, FALSE, 0, &status);
    /* Test for buffer overflows */
    if(U_FAILURE(status)) {
        return 0;
    }

    // compare the lengths first: memcmp alone would read uninitialized
    // elements of trial when the decomposition is shorter than the rest of
    // the segment, and would accept a mere prefix when it is longer
    const int32_t expectedLen = segLen - segmentPos;
    if(trialLen != expectedLen ||
       uprv_memcmp(segment+segmentPos, trial, expectedLen*sizeof(UChar)) != 0) {
        return NULL;
    }

    return getEquivalents2(buff, bufLen, status);
}
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_NORMALIZATION */
--- NEW FILE: cmemory.c ---
/*
******************************************************************************
*
* Copyright (C) 2002, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
*
* File cmemory.c ICU Heap allocation.
* All ICU heap allocation, both for C and C++ new of ICU
* class types, comes through these functions.
*
* If you have a need to replace ICU allocation, this is the
* place to do it.
*
* Note that uprv_malloc(0) returns a non-NULL pointer, and
* that a subsequent free of that pointer value is a NOP.
*
******************************************************************************
*/
#include "cmemory.h"
/* uprv_malloc(0) returns a pointer to this read-only data. */
/* uprv_realloc() and uprv_free() recognise that pointer by comparing against */
/* this array's address and never pass it to realloc()/free().                */
static const int32_t zeroMem[] = {0, 0, 0, 0, 0, 0};
/*
 * Allocates s bytes with malloc(). A request for zero bytes does not touch
 * the heap; it yields the address of the shared zeroMem block instead.
 */
U_CAPI void * U_EXPORT2
uprv_malloc(size_t s) {
    return (s > 0) ? malloc(s) : (void *)zeroMem;
}
/*
 * Resizes a block obtained from uprv_malloc()/uprv_realloc().
 *  - the zeroMem placeholder is never handed to realloc(); a fresh block is
 *    requested through uprv_malloc() instead
 *  - shrinking any other block to zero bytes frees it and yields zeroMem
 *  - every other case is forwarded to realloc()
 */
U_CAPI void * U_EXPORT2
uprv_realloc(void * buffer, size_t size) {
    if (buffer == zeroMem) {
        return uprv_malloc(size);
    }
    if (size == 0) {
        free(buffer);
        return (void *)zeroMem;
    }
    return realloc(buffer, size);
}
/*
 * Releases a block with free(). The zeroMem placeholder handed out for
 * zero-byte requests is static data, so it is left alone.
 */
U_CAPI void U_EXPORT2
uprv_free(void *buffer) {
    if (buffer == zeroMem) {
        return;
    }
    free(buffer);
}
--- NEW FILE: common.vcproj ---
<?xml version="1.0" encoding = "Windows-1252"?>
<VisualStudioProject
ProjectType="Visual C++"
Version="7.00"
Name="common"
SccProjectName=""
SccLocalPath="">
<Platforms>
<Platform
Name="Win32"/>
</Platforms>
<Configurations>
<Configuration
Name="Release|Win32"
OutputDirectory=".\..\..\lib"
IntermediateDirectory=".\Release"
ConfigurationType="2"
UseOfMFC="0"
ATLMinimizesCRunTimeLibraryUsage="FALSE"
[...1684 lines suppressed...]
</File>
<File
RelativePath=".\uidna.cpp">
</File>
<File
RelativePath=".\unicode\uidna.h">
<FileConfiguration
Name="Debug|Win32">
<Tool
Name="VCCustomBuildTool"
CommandLine="copy $(InputPath) ..\..\include\unicode
"
Outputs="..\..\include\unicode\uidna.h"/>
</FileConfiguration>
</File>
</Filter>
</Files>
<Globals>
</Globals>
</VisualStudioProject>
--- NEW FILE: dbbi.cpp ---
/*
**********************************************************************
* Copyright (C) 1999-2001 IBM Corp. All rights reserved.
**********************************************************************
* Date Name Description
* 12/1/99 rgillam Complete port from Java.
* 01/13/2000 helena Added UErrorCode to ctors.
**********************************************************************
*/
#include "unicode/utypes.h"
#if !UCONFIG_NO_BREAK_ITERATION
#include "unicode/dbbi.h"
#include "unicode/schriter.h"
#include "dbbi_tbl.h"
#include "uvector.h"
#include "cmemory.h"
U_NAMESPACE_BEGIN
// Out-of-line definition of the class's static fgClassID member.
// NOTE(review): presumably only its address matters (class-ID lookup) -
// confirm against the class declaration.
const char DictionaryBasedBreakIterator::fgClassID = 0;
//-------------------------------------------------------------------------------
//
// constructors
//
//-------------------------------------------------------------------------------
// Default constructor: default-constructs the RuleBasedBreakIterator base
// and then clears this class's own fields through init() (no tables, no
// cached break positions).
DictionaryBasedBreakIterator::DictionaryBasedBreakIterator() :
RuleBasedBreakIterator() {
init();
}
// Builds an iterator from rule data plus a dictionary file.
// The base class is constructed from rbbiData. The dictionary tables are
// only created when that left status without a failure. If creating the
// tables fails, the reference to them is dropped again and fTables stays
// NULL.
DictionaryBasedBreakIterator::DictionaryBasedBreakIterator(UDataMemory* rbbiData,
                                                           const char* dictionaryFilename,
                                                           UErrorCode& status)
    : RuleBasedBreakIterator(rbbiData, status)
{
    init();
    if (U_SUCCESS(status)) {
        fTables = new DictionaryBasedBreakIteratorTables(dictionaryFilename, status);
        if (fTables == 0) {
            /* allocation of the tables object itself failed */
            status = U_MEMORY_ALLOCATION_ERROR;
        } else if (U_FAILURE(status)) {
            fTables->removeReference();
            fTables = NULL;
        }
    }
}
// Copy constructor: copies the base class state, starts with cleared
// dictionary fields (so no cached break positions are carried over) and
// shares the other iterator's tables, adding a reference to them.
DictionaryBasedBreakIterator::DictionaryBasedBreakIterator(const DictionaryBasedBreakIterator &other) :
    RuleBasedBreakIterator(other)
{
    init();
    fTables = other.fTables;
    if (fTables != NULL) {
        fTables->addReference();
    }
}
//-------------------------------------------------------------------------------
//
// Destructor
//
//-------------------------------------------------------------------------------
// Gives up this iterator's reference to the shared tables (if it holds any)
// and frees the cached break positions.
DictionaryBasedBreakIterator::~DictionaryBasedBreakIterator()
{
    if (fTables != NULL) {
        fTables->removeReference();
    }
    uprv_free(cachedBreakPositions);
    cachedBreakPositions = NULL;
}
//-------------------------------------------------------------------------------
//
// Assignment operator. Sets this iterator to have the same behavior,
// and iterate over the same text, as the one passed in.
//
//-------------------------------------------------------------------------------
DictionaryBasedBreakIterator&
DictionaryBasedBreakIterator::operator=(const DictionaryBasedBreakIterator& that) {
    if (this != &that) {
        // the cached break positions belong to the old state: drop them
        reset();
        RuleBasedBreakIterator::operator=(that);

        // switch over to the other iterator's tables, keeping the reference
        // counts of both table objects in step
        if (fTables != that.fTables) {
            if (fTables != NULL) {
                fTables->removeReference();
            }
            fTables = that.fTables;
            if (fTables != NULL) {
                fTables->addReference();
            }
        }
    }
    return *this;
}
//-------------------------------------------------------------------------------
//
// Clone() Returns a newly-constructed RuleBasedBreakIterator with the same
// behavior, and iterating over the same text, as this one.
//
//-------------------------------------------------------------------------------
// Returns a heap-allocated copy of this iterator made with the copy
// constructor; the result of "new" is returned as is.
BreakIterator*
DictionaryBasedBreakIterator::clone() const {
return new DictionaryBasedBreakIterator(*this);
}
//=======================================================================
// BreakIterator overrides
//=======================================================================
/**
 * Advances the iterator one step backwards.
 * @return The position of the last boundary position before the
 * current iteration position
 */
int32_t
DictionaryBasedBreakIterator::previous()
{
    const UBool canStepInCache = (cachedBreakPositions != NULL && positionInCache > 0);

    if (!canStepInCache) {
        // no usable cache entry behind us: dump the cache and let the
        // inherited previous() move backward. That may fill the cache with
        // new break positions, in which case our position in it is marked.
        reset();
        int32_t result = RuleBasedBreakIterator::previous();
        if (cachedBreakPositions != NULL) {
            positionInCache = numCachedBreakPositions - 2;
        }
        return result;
    }

    // still inside the range covered by the cached break positions:
    // just move one step backward in the cache
    --positionInCache;
    fText->setIndex(cachedBreakPositions[positionInCache]);
    return cachedBreakPositions[positionInCache];
}
/**
 * Sets the current iteration position to the last boundary position
 * before the specified position.
 * @param offset The position to begin searching from
 * @return The position of the last boundary before "offset"
 */
int32_t
DictionaryBasedBreakIterator::preceding(int32_t offset)
{
    // past the end of the text (or no text at all): DONE
    if (fText == NULL || offset > fText->endIndex()) {
        return BreakIterator::DONE;
    }
    // before the beginning: the text's starting offset
    if (offset < fText->startIndex()) {
        return fText->startIndex();
    }

    // with no cached break positions, or with "offset" outside the range the
    // cache covers, the inherited routine does the work (it will eventually
    // call other routines in this class that may refresh the cache)
    const UBool outsideCache =
        cachedBreakPositions == NULL ||
        offset <= cachedBreakPositions[0] ||
        offset > cachedBreakPositions[numCachedBreakPositions - 1];
    if (outsideCache) {
        reset();
        return RuleBasedBreakIterator::preceding(offset);
    }

    // "offset" lies within the cached range: walk forward to the first
    // cached position that is not before "offset", then step back one entry
    // to the last break position before it
    for (positionInCache = 0;
         positionInCache < numCachedBreakPositions
             && offset > cachedBreakPositions[positionInCache];
         ++positionInCache) {
    }
    --positionInCache;
    fText->setIndex(cachedBreakPositions[positionInCache]);
    return fText->getIndex();
}
/**
 * Sets the current iteration position to the first boundary position after
 * the specified position.
 * @param offset The position to begin searching forward from
 * @return The position of the first boundary after "offset"
 */
int32_t
DictionaryBasedBreakIterator::following(int32_t offset)
{
    // past the end of the text (or no text at all): DONE
    if (fText == NULL || offset > fText->endIndex()) {
        return BreakIterator::DONE;
    }
    // before the beginning: the text's starting offset
    if (offset < fText->startIndex()) {
        return fText->startIndex();
    }

    // with no cached break positions, or with "offset" outside the range the
    // cache covers, dump the cache and call the inherited following(). This
    // will call other methods in this class that may refresh the cache.
    const UBool outsideCache =
        cachedBreakPositions == NULL ||
        offset < cachedBreakPositions[0] ||
        offset >= cachedBreakPositions[numCachedBreakPositions - 1];
    if (outsideCache) {
        reset();
        return RuleBasedBreakIterator::following(offset);
    }

    // "offset" lies within the cached range: search the cache for the first
    // break position after "offset"
    for (positionInCache = 0;
         positionInCache < numCachedBreakPositions
             && offset >= cachedBreakPositions[positionInCache];
         ++positionInCache) {
    }
    fText->setIndex(cachedBreakPositions[positionInCache]);
    return fText->getIndex();
}
/**
 * This is the implementation function for next().
 */
int32_t
DictionaryBasedBreakIterator::handleNext()
{
    UErrorCode status = U_ZERO_ERROR;

    // if there are no cached break positions, or if we've just moved
    // off the end of the range covered by the cache, we have to dump
    // and possibly regenerate the cache
    const UBool cacheExhausted =
        cachedBreakPositions == NULL || positionInCache == numCachedBreakPositions - 1;
    if (cacheExhausted) {
        // start by using the inherited handleNext() to find a tentative return
        // value. fDictionaryCharCount tells us how many dictionary characters
        // we passed over on our way to the tentative return value
        int32_t startPos = fText->getIndex();
        fDictionaryCharCount = 0;
        int32_t result = RuleBasedBreakIterator::handleNext();

        // unless we passed over more than one dictionary character, the value
        // we got back from the inherited function is our return value, and we
        // can dump the cache
        const UBool needsDictionary = fDictionaryCharCount > 1 && result - startPos > 1;
        if (!needsDictionary) {
            reset();
            return result;
        }

        // otherwise divideUpDictionaryRange() regenerates the cached break
        // positions for the new range
        divideUpDictionaryRange(startPos, result, status);
        if (U_FAILURE(status)) {
            return -9999; // SHOULD NEVER GET HERE!
        }
    }

    if (cachedBreakPositions == NULL) {
        return -9999; // SHOULD NEVER GET HERE!
    }

    // the cache of break positions has been regenerated (or existed all
    // along): advance to the next break position in the cache and return it
    ++positionInCache;
    fText->setIndex(cachedBreakPositions[positionInCache]);
    return cachedBreakPositions[positionInCache];
}
// Dumps the break-position cache: zeroes the cache bookkeeping, then frees
// the cached array and clears the pointer to it.
void
DictionaryBasedBreakIterator::reset()
{
    positionInCache = 0;
    fDictionaryCharCount = 0;
    numCachedBreakPositions = 0;

    uprv_free(cachedBreakPositions);
    cachedBreakPositions = NULL;
}
//-------------------------------------------------------------------------------
//
// init() Common initialization routine, for use by constructors, etc.
//
//-------------------------------------------------------------------------------
// Puts this class's own fields into their empty state: no cached break
// positions, no tables, all counters zero. Nothing is freed or released
// here, so this is only suitable for fields that hold no resources yet (the
// constructors call it before anything else is assigned).
void DictionaryBasedBreakIterator::init() {
cachedBreakPositions = NULL;
fTables = NULL;
numCachedBreakPositions = 0;
fDictionaryCharCount = 0;
positionInCache = 0;
}
//-------------------------------------------------------------------------------
//
// BufferClone
//
//-------------------------------------------------------------------------------
// Creates a copy of this iterator, in caller-supplied memory when it is
// large enough and on the heap otherwise.
//
//   stackBuffer  caller-supplied memory for the clone, may be NULL
//   bufferSize   size of stackBuffer in bytes. If it is zero, this is a
//                preflight call: bufferSize receives the size needed and
//                NULL is returned.
//   status       in/out error code. Set to U_SAFECLONE_ALLOCATED_WARNING
//                when the clone had to be allocated on the heap, and to
//                U_MEMORY_ALLOCATION_ERROR when that allocation failed.
//
// Returns the clone, or NULL on preflight or failure.
BreakIterator * DictionaryBasedBreakIterator::createBufferClone(void *stackBuffer,
                                                                int32_t &bufferSize,
                                                                UErrorCode &status)
{
    if (U_FAILURE(status)){
        return NULL;
    }

    //
    // If user buffer size is zero this is a preflight operation to
    // obtain the needed buffer size, allowing for worst case misalignment.
    //
    if (bufferSize == 0) {
        bufferSize = sizeof(DictionaryBasedBreakIterator) + U_ALIGNMENT_OFFSET_UP(0);
        return NULL;
    }

    //
    // Check the alignment and size of the user supplied buffer.
    // Allocate heap memory if the user supplied memory is insufficient.
    //
    // The remaining size is kept in a signed variable on purpose. As an
    // unsigned value it wrapped around to a huge number when the alignment
    // adjustment exceeded a small bufferSize (or when bufferSize was
    // negative), so a buffer that was far too small passed the size check
    // and was then overwritten.
    //
    char *buf = (char *)stackBuffer;
    int32_t available = bufferSize;

    if (stackBuffer == NULL) {
        available = 0; // Ignore size, force allocation if user didn't give us a buffer.
    }
    if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) {
        int32_t offsetUp = (int32_t)U_ALIGNMENT_OFFSET_UP(buf);
        available -= offsetUp;
        buf += offsetUp;
    }
    if (available < (int32_t)sizeof(DictionaryBasedBreakIterator)) {
        buf = (char *) new DictionaryBasedBreakIterator();
        if (buf == 0) {
            status = U_MEMORY_ALLOCATION_ERROR;
            return NULL;
        }
        status = U_SAFECLONE_ALLOCATED_WARNING;
    }

    //
    // Initialize the clone object.
    // TODO: using an overloaded C++ "operator new" to directly initialize the
    // copy in the user's buffer would be better, but it doesn't seem
    // to get along with namespaces. Investigate why.
    //
    // The memcpy is only safe with an empty (default constructed)
    // break iterator. Use on others can screw up reference counts
    // to data. memcpy-ing objects is not really a good idea...
    //
    DictionaryBasedBreakIterator localIter; // Empty break iterator, source for memcpy
    DictionaryBasedBreakIterator *clone = (DictionaryBasedBreakIterator *)buf;
    uprv_memcpy(clone, &localIter, sizeof(DictionaryBasedBreakIterator)); // clone = empty, but initialized, iterator.
    *clone = *this; // clone = the real one we want.
    if (status != U_SAFECLONE_ALLOCATED_WARNING) {
        // the clone lives in the caller's buffer: record that on the object
        clone->fBufferClone = TRUE;
    }

    return clone;
}
/**
* This is the function that actually implements the dictionary-based
* algorithm. Given the endpoints of a range of text, it uses the
* dictionary to determine the positions of any boundaries in this
* range. It stores all the boundary positions it discovers in
* cachedBreakPositions so that we only have to do this work once
* for each time we enter the range.
*/
// Parameters (as used by the code below):
//   startPos - index where scanning starts; always stored as
//              cachedBreakPositions[0].
//   endPos   - index where the range ends; always stored as the last
//              entry of cachedBreakPositions.
//   status   - passed to the stack/vector helpers; the function returns
//              early if it indicates failure after the helpers are
//              constructed, and it is set to U_MEMORY_ALLOCATION_ERROR
//              if the cache array cannot be allocated.
// Side effects: moves fText's index, and replaces cachedBreakPositions,
// numCachedBreakPositions and positionInCache.
void
DictionaryBasedBreakIterator::divideUpDictionaryRange(int32_t startPos, int32_t endPos, UErrorCode &status)
{
// the range we're dividing may begin or end with non-dictionary characters
// (i.e., for line breaking, we may have leading or trailing punctuation
// that needs to be kept with the word). Seek from the beginning of the
// range to the first dictionary character
fText->setIndex(startPos);
UChar c = fText->current();
// NOTE(review): this loop has no explicit bound on endPos or end of text;
// it relies on isDictionaryChar() eventually returning non-FALSE -- confirm
// that callers only pass ranges that contain a dictionary character.
while (isDictionaryChar(c) == FALSE) {
c = fText->next();
}
// initialize. We maintain two stacks: currentBreakPositions contains
// the list of break positions that will be returned if we successfully
// finish traversing the whole range now. possibleBreakPositions lists
// all other possible word ends we've passed along the way. (Whenever
// we reach an error [a sequence of characters that can't begin any word
// in the dictionary], we back up, possibly delete some breaks from
// currentBreakPositions, move a break from possibleBreakPositions
// to currentBreakPositions, and start over from there. This process
// continues in this way until we either successfully make it all the way
// across the range, or exhaust all of our combinations of break
// positions.) wrongBreakPositions is used to keep track of paths we've
// tried on previous iterations. As the iterator backs up further and
// further, this saves us from having to follow each possible path
// through the text all the way to the error (hopefully avoiding many
// future recursive calls as well).
UStack currentBreakPositions(status);
UStack possibleBreakPositions(status);
UVector wrongBreakPositions(status);
// the dictionary is implemented as a trie, which is treated as a state
// machine. -1 represents the end of a legal word. Every word in the
// dictionary is represented by a path from the root node to -1. A path
// that ends in state 0 is an illegal combination of characters.
int16_t state = 0;
// these two variables are used for error handling. We keep track of the
// farthest we've gotten through the range being divided, and the combination
// of breaks that got us that far. If we use up all possible break
// combinations, the text contains an error or a word that's not in the
// dictionary. In this case, we "bless" the break positions that got us the
// farthest as real break positions, and then start over from scratch with
// the character where the error occurred.
int32_t farthestEndPoint = fText->getIndex();
UStack bestBreakPositions(status);
UBool bestBreakPositionsInitialized = FALSE;
// bail out before doing any work if constructing the helper containers
// (or an earlier call) left a failure in status
if (U_FAILURE(status)) {
return;
}
// initialize (we always exit the loop with a break statement)
c = fText->current();
for (;;) {
// if we can transition to state "-1" from our current state, we're
// on the last character of a legal word. Push that position onto
// the possible-break-positions stack
if (fTables->fDictionary->at(state, (int32_t)0) == -1) {
possibleBreakPositions.push(fText->getIndex(), status);
}
// look up the new state to transition to in the dictionary
state = fTables->fDictionary->at(state, c);
// if the character we're sitting on causes us to transition to
// the "end of word" state, then it was a non-dictionary character
// and we've successfully traversed the whole range. Drop out
// of the loop.
if (state == -1) {
currentBreakPositions.push(fText->getIndex(), status);
break;
}
// if the character we're sitting on causes us to transition to
// the error state, or if we've gone off the end of the range
// without transitioning to the "end of word" state, we've hit
// an error...
else if (state == 0 || fText->getIndex() >= endPos) {
// if this is the farthest we've gotten, take note of it in
// case there's an error in the text
if (fText->getIndex() > farthestEndPoint) {
farthestEndPoint = fText->getIndex();
bestBreakPositions.removeAllElements();
bestBreakPositionsInitialized = TRUE;
for (int32_t i = 0; i < currentBreakPositions.size(); i++) {
bestBreakPositions.push(currentBreakPositions.elementAti(i), status);
}
}
// wrongBreakPositions is a list of all break positions we've tried starting
// that didn't allow us to traverse all the way through the text. Every time
// we pop a break position off of currentBreakPositions, we put it into
// wrongBreakPositions to avoid trying it again later. If we make it to this
// spot, we're either going to back up to a break in possibleBreakPositions
// and try starting over from there, or we've exhausted all possible break
// positions and are going to do the fallback procedure. This loop prevents
// us from messing with anything in possibleBreakPositions that didn't work as
// a starting point the last time we tried it (this is to prevent a bunch of
// repetitive checks from slowing down some extreme cases)
while (!possibleBreakPositions.isEmpty() && wrongBreakPositions.contains(
possibleBreakPositions.peeki())) {
possibleBreakPositions.popi();
}
// if we've used up all possible break-position combinations, there's
// an error or an unknown word in the text. In this case, we start
// over, treating the farthest character we've reached as the beginning
// of the range, and "blessing" the break positions that got us that
// far as real break positions
if (possibleBreakPositions.isEmpty()) {
if (bestBreakPositionsInitialized) {
currentBreakPositions.removeAllElements();
for (int32_t i = 0; i < bestBreakPositions.size(); i++) {
currentBreakPositions.push(bestBreakPositions.elementAti(i), status);
}
bestBreakPositions.removeAllElements();
// resume one position past the farthest point reached, or stop
// if that point is already at/after the end of the range
if (farthestEndPoint < endPos) {
fText->setIndex(farthestEndPoint + 1);
}
else {
break;
}
}
else {
// no "best" list was ever recorded: record the current position
// as a break (unless it is already on top of the stack or is
// startPos), then step one character forward and record that too
if ((currentBreakPositions.isEmpty()
|| currentBreakPositions.peeki() != fText->getIndex())
&& fText->getIndex() != startPos) {
currentBreakPositions.push(fText->getIndex(), status);
}
fText->next();
currentBreakPositions.push(fText->getIndex(), status);
}
}
// if we still have more break positions we can try, then promote the
// last break in possibleBreakPositions into currentBreakPositions,
// and get rid of all entries in currentBreakPositions that come after
// it. Then back up to that position and start over from there (i.e.,
// treat that position as the beginning of a new word)
else {
int32_t temp = possibleBreakPositions.popi();
int32_t temp2 = 0;
while (!currentBreakPositions.isEmpty() && temp <
currentBreakPositions.peeki()) {
temp2 = currentBreakPositions.popi();
wrongBreakPositions.addElement(temp2, status);
}
currentBreakPositions.push(temp, status);
fText->setIndex(currentBreakPositions.peeki());
}
// re-sync "c" for the next go-round, and drop out of the loop if
// we've made it off the end of the range
c = fText->current();
if (fText->getIndex() >= endPos) {
break;
}
}
// if we didn't hit any exceptional conditions on this last iteration,
// just advance to the next character and loop
else {
c = fText->next();
}
}
// dump the last break position in the list, and replace it with the actual
// end of the range (which may be the same character, or may be further on
// because the range actually ended with non-dictionary characters we want to
// keep with the word)
if (!currentBreakPositions.isEmpty()) {
currentBreakPositions.popi();
}
currentBreakPositions.push(endPos, status);
// create a regular array to hold the break positions and copy
// the break positions from the stack to the array (in addition,
// our starting position goes into this array as a break position).
// This array becomes the cache of break positions used by next()
// and previous(), so this is where we actually refresh the cache.
if (cachedBreakPositions != NULL) {
uprv_free(cachedBreakPositions);
}
cachedBreakPositions = (int32_t *)uprv_malloc((currentBreakPositions.size() + 1) * sizeof(int32_t));
/* Test for NULL */
if(cachedBreakPositions == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
// slot 0 holds startPos; slots 1..size() hold the computed breaks in order
numCachedBreakPositions = currentBreakPositions.size() + 1;
cachedBreakPositions[0] = startPos;
for (int32_t i = 0; i < currentBreakPositions.size(); i++) {
cachedBreakPositions[i + 1] = currentBreakPositions.elementAti(i);
}
positionInCache = 0;
}
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
/* eof */
--- NEW FILE: dbbi_tbl.cpp ---
/*
**********************************************************************
* Copyright (C) 1999-2002 IBM Corp. All rights reserved.
**********************************************************************
* Date Name Description
* 12/1/99 rgillam Complete port from Java.
* 01/13/2000 helena Added UErrorCode to ctors.
* 06/14/2002 andy Gutted for new RBBI impl.
**********************************************************************
*/
#include "unicode/utypes.h"
#if !UCONFIG_NO_BREAK_ITERATION
#include "dbbi_tbl.h"
#include "unicode/dbbi.h"
#include "umutex.h"
U_NAMESPACE_BEGIN
//=======================================================================
// constructor
//=======================================================================
/**
 * Creates the shared table holder and loads its dictionary.
 *
 * The new object starts with a reference count of 1, which belongs to
 * the caller.
 *
 * @param dictionaryFilename Name of the dictionary file, passed through
 *                           unchanged to BreakDictionary.
 * @param status             In/out error code. Receives any error reported
 *                           by BreakDictionary, or U_MEMORY_ALLOCATION_ERROR
 *                           if the dictionary object could not be allocated.
 */
DictionaryBasedBreakIteratorTables::DictionaryBasedBreakIteratorTables(
        const char* dictionaryFilename,
        UErrorCode &status) {
    fDictionary = new BreakDictionary(dictionaryFilename, status);
    fRefCount = 1;
    // A NULL result means the allocation itself failed and the
    // BreakDictionary constructor never ran, so nothing has reported the
    // problem yet. Do not overwrite an error that was already present.
    if (fDictionary == NULL && U_SUCCESS(status)) {
        status = U_MEMORY_ALLOCATION_ERROR;
    }
}
void DictionaryBasedBreakIteratorTables::addReference() {
umtx_atomic_inc(&fRefCount);
}
/**
 * Drops one reference. When the atomically decremented count reaches
 * zero, the object deletes itself; the caller must not touch it afterwards.
 */
void DictionaryBasedBreakIteratorTables::removeReference()
{
    if (umtx_atomic_dec(&fRefCount) != 0) {
        return;     // other references remain
    }
    delete this;
}
/**
 * Destructor. Releases the BreakDictionary held by this object and
 * clears the pointer to it.
 */
DictionaryBasedBreakIteratorTables::~DictionaryBasedBreakIteratorTables()
{
    BreakDictionary* const owned = fDictionary;
    fDictionary = NULL;
    delete owned;
}
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
/* eof */
--- NEW FILE: dbbi_tbl.h ---
/*
**********************************************************************
* Copyright (C) 1999-2000 IBM Corp. All rights reserved.
**********************************************************************
* Date Name Description
* 12/1/99 rgillam Complete port from Java.
* 01/13/2000 helena Added UErrorCode to ctors.
**********************************************************************
*/
#ifndef DBBI_TBL_H
#define DBBI_TBL_H
#include "unicode/utypes.h"
#include "unicode/uobject.h"
#include "unicode/udata.h"
#include "brkdict.h"
U_NAMESPACE_BEGIN
/* forward declaration */
class DictionaryBasedBreakIterator;
//
// DictionaryBasedBreakIteratorTables
//
// This class sits between instances of DictionaryBasedBreakIterator
// and the dictionary data itself, which is of type BreakDictionary.
// It provides reference counting, allowing multiple copies of a
// DictionaryBasedBreakIterator to share a single instance of
// BreakDictionary.
//
// TODO: it'd probably be cleaner to add the reference counting to
// BreakDictionary and get rid of this class, but doing it this way
// was a convenient transition from earlier code, and time is short...
//
class DictionaryBasedBreakIteratorTables : public UMemory {
private:
// Number of outstanding references; see addReference()/removeReference().
int32_t fRefCount;
public:
//=======================================================================
// constructor
//=======================================================================
/**
 * Constructs the tables object for the given dictionary file.
 * @param dictionaryFilename The name of the dictionary file
 * @param status The error code
 */
DictionaryBasedBreakIteratorTables(const char* dictionaryFilename,
UErrorCode& status);
// The dictionary data shared by all iterators referencing this object.
BreakDictionary *fDictionary;
// Adds one reference to this object.
void addReference();
// Removes one reference to this object.
void removeReference();
/**
* Destructor. Should not be used directly. Use removeReference() instead.
* (Not private to avoid compiler warnings.)
*/
virtual ~DictionaryBasedBreakIteratorTables();
private:
/**
* The copy constructor is declared private and not implemented.
* THIS CLASS MAY NOT BE COPIED.
* @param that The DictionaryBasedBreakIteratorTables to be copied.
*/
DictionaryBasedBreakIteratorTables(const DictionaryBasedBreakIteratorTables& that);
//=======================================================================
// boilerplate
//=======================================================================
/**
* The assignment operator is declared private and not implemented.
* THIS CLASS MAY NOT BE COPIED.
* Call addReference() and share an existing copy instead.
* @param that The object to be copied
* @return a reference to this object (never actually returned, since
*         the operator is not implemented).
*/
DictionaryBasedBreakIteratorTables& operator=(
const DictionaryBasedBreakIteratorTables& that);
};
U_NAMESPACE_END
#endif
--- NEW FILE: iculserv.cpp ---
/**
*******************************************************************************
* Copyright (C) 2001-2003, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
*******************************************************************************
*/
#include "unicode/utypes.h"
#if !UCONFIG_NO_SERVICE
#include "unicode/resbund.h"
#include "cmemory.h"
#include "iculserv.h"
#include "ustrfmt.h"
U_NAMESPACE_BEGIN
// File-level cache of available locale names. It is filled lazily by
// LocaleUtility::getAvailableLocaleNames() and released by
// LocaleUtility::cleanup().
static Hashtable * LocaleUtility_cache = NULL;
// U+005F LOW LINE: the separator the functions in this file look for
// between the fields of a locale ID.
#define UNDERSCORE_CHAR ((UChar)0x005f)
/*
******************************************************************
*/
/**
 * Writes a case-normalized copy of a locale ID into result.
 *
 * ASCII letters before the first underscore are lower-cased and ASCII
 * letters from the first underscore onward are upper-cased. If there is
 * no underscore, the whole string is lower-cased.
 *
 * @param id     Locale ID to normalize; may be NULL.
 * @param result Receives the normalized ID, or is set to bogus when id is NULL.
 * @return result
 */
UnicodeString&
LocaleUtility::canonicalLocaleString(const UnicodeString* id, UnicodeString& result)
{
    if (id == NULL) {
        result.setToBogus();
        return result;
    }

    result = *id;

    // Everything before the first underscore is the lower-case part.
    int32_t split = result.indexOf(UNDERSCORE_CHAR);
    if (split < 0) {
        split = result.length();
    }

    for (int32_t pos = 0; pos < split; ++pos) {
        UChar ch = result.charAt(pos);
        if (ch >= 0x0041 && ch <= 0x005a) {     // 'A'..'Z'
            ch += 0x20;
            result.setCharAt(pos, ch);
        }
    }

    for (int32_t pos = split, end = result.length(); pos < end; ++pos) {
        UChar ch = result.charAt(pos);
        if (ch >= 0x0061 && ch <= 0x007a) {     // 'a'..'z'
            ch -= 0x20;
            result.setCharAt(pos, ch);
        }
    }
    return result;
}
/**
 * Initializes a Locale from a locale ID held in a UnicodeString.
 *
 * @param id     Locale ID; may be bogus.
 * @param result Receives the locale. It is set to bogus when id is bogus
 *               or when the ID does not fit in the internal buffer
 *               (128 bytes including the terminator).
 * @return result
 */
Locale&
LocaleUtility::initLocaleFromName(const UnicodeString& id, Locale& result)
{
    if (id.isBogus()) {
        result.setToBogus();
        return result;
    }

    const int32_t BUFLEN = 128; // larger than ever needed
    char buffer[BUFLEN];
    // Use the capacity-bounded overload: the unbounded one assumes the
    // target is large enough and would write past the end of `buffer`
    // for an id of BUFLEN or more characters.
    const int32_t len = id.extract(0, BUFLEN, buffer, (uint32_t)BUFLEN);
    if (len >= BUFLEN) {
        // Too long to be NUL-terminated inside the buffer.
        result.setToBogus();
    } else {
        buffer[len] = '\0';
        result = Locale::createFromName(buffer);
    }
    return result;
}
/**
 * Appends the name of a locale to result.
 *
 * Note that this appends: any text already in result is kept.
 *
 * @param locale Locale whose name is wanted.
 * @param result Receives the appended name, or is set to bogus when
 *               locale is bogus.
 * @return result
 */
UnicodeString&
LocaleUtility::initNameFromLocale(const Locale& locale, UnicodeString& result)
{
    if (!locale.isBogus()) {
        result.append(locale.getName());
        return result;
    }
    result.setToBogus();
    return result;
}
/**
 * Returns the cached set of available locale names, building it on the
 * first call.
 *
 * Each name from uloc_getAvailable() is a key in the table; the values
 * are only non-NULL markers. bundleID is currently ignored.
 *
 * @param bundleID Unused for now (see the comment in the body).
 * @return The cached table. NULL if inserting a name failed; may also be
 *         NULL if the table could not be allocated.
 */
const Hashtable*
LocaleUtility::getAvailableLocaleNames(const UnicodeString& bundleID)
{
    // have to ignore bundleID for the moment, since we don't have easy C++ api.
    // assume it's the default bundle
    Hashtable* existing;
    umtx_lock(NULL);
    existing = LocaleUtility_cache;
    umtx_unlock(NULL);
    if (existing != NULL) {
        return LocaleUtility_cache;
    }

    Hashtable* fresh = new Hashtable();
    if (fresh == NULL) {
        return LocaleUtility_cache;
    }

    UErrorCode status = U_ZERO_ERROR;
    const int32_t total = uloc_countAvailable();
    for (int32_t idx = 0; idx < total; ++idx) {
        UnicodeString name(uloc_getAvailable(idx), "");
        // The value is just a non-NULL marker; only the keys matter.
        fresh->put(name, (void*)fresh, status);
        if (U_FAILURE(status)) {
            delete fresh;
            return NULL;
        }
    }

    // Publish our table unless another thread installed one first.
    umtx_lock(NULL);
    if (LocaleUtility_cache == NULL) {
        LocaleUtility_cache = fresh;
        fresh = NULL;
    }
    umtx_unlock(NULL);

    // Non-NULL only when we lost the race; discard the duplicate.
    delete fresh;
    return LocaleUtility_cache;
}
/**
 * Tests whether root is a fallback of child: child must start with root,
 * and either be the same length or have an underscore right after the
 * root prefix.
 *
 * @param root  Candidate fallback ID.
 * @param child ID to test against.
 * @return TRUE when root is a fallback of child.
 */
UBool
LocaleUtility::isFallbackOf(const UnicodeString& root, const UnicodeString& child)
{
    if (child.indexOf(root) != 0) {
        return FALSE;
    }
    if (child.length() == root.length()) {
        return TRUE;
    }
    return child.charAt(root.length()) == UNDERSCORE_CHAR;
}
/**
 * Frees the cached table of available locale names, if there is one.
 *
 * @return Always TRUE.
 */
UBool
LocaleUtility::cleanup(void)
{
    // Deleting a NULL pointer is a no-op, so no guard is needed.
    delete LocaleUtility_cache;
    LocaleUtility_cache = NULL;
    return TRUE;
}
/*
******************************************************************
*/
// Kind value meaning "no particular kind". In this file, prefix() emits
// nothing for it and SimpleLocaleKeyFactory::create() treats a factory
// with this kind as matching keys of every kind.
const int32_t LocaleKey::KIND_ANY = -1;
/**
 * Convenience overload that creates a key of kind KIND_ANY.
 *
 * @param primaryID           Requested locale ID; may be NULL.
 * @param canonicalFallbackID Canonical fallback locale ID; may be NULL.
 * @param status              In/out error code.
 * @return Whatever the four-argument overload returns.
 */
LocaleKey*
LocaleKey::createWithCanonicalFallback(const UnicodeString* primaryID,
                                       const UnicodeString* canonicalFallbackID,
                                       UErrorCode& status)
{
    LocaleKey* const key = LocaleKey::createWithCanonicalFallback(
        primaryID, canonicalFallbackID, KIND_ANY, status);
    return key;
}
/**
 * Creates a LocaleKey from a raw primary ID and a fallback ID that is
 * already canonical.
 *
 * @param primaryID           Requested locale ID; it is canonicalized here.
 * @param canonicalFallbackID Canonical fallback locale ID; may be NULL.
 * @param kind                Kind code stored in the key.
 * @param status              In/out error code; nothing is created if it
 *                            already indicates failure.
 * @return A newly allocated key, or NULL when primaryID is NULL or status
 *         indicates failure.
 */
LocaleKey*
LocaleKey::createWithCanonicalFallback(const UnicodeString* primaryID,
                                       const UnicodeString* canonicalFallbackID,
                                       int32_t kind,
                                       UErrorCode& status)
{
    if (U_FAILURE(status) || primaryID == NULL) {
        return NULL;
    }

    UnicodeString canonical;
    LocaleUtility::canonicalLocaleString(primaryID, canonical);

    LocaleKey* const key = new LocaleKey(*primaryID, canonical, canonicalFallbackID, kind);
    return key;
}
/**
 * Builds a key whose current ID starts out as the canonical primary ID.
 *
 * The fallback ID is kept only when the primary ID is non-empty and the
 * supplied fallback is non-NULL and differs from it; otherwise the
 * fallback ID is bogus.
 *
 * @param primaryID           ID as requested; handed to ICUServiceKey.
 * @param canonicalPrimaryID  Canonical form of primaryID.
 * @param canonicalFallbackID Canonical fallback ID; may be NULL.
 * @param kind                Kind code for this key.
 */
LocaleKey::LocaleKey(const UnicodeString& primaryID,
                     const UnicodeString& canonicalPrimaryID,
                     const UnicodeString* canonicalFallbackID,
                     int32_t kind)
    : ICUServiceKey(primaryID)
    , _kind(kind)
    , _primaryID(canonicalPrimaryID)
    , _fallbackID()
    , _currentID()
{
    const UBool keepFallback = _primaryID.length() != 0
                               && canonicalFallbackID != NULL
                               && _primaryID != *canonicalFallbackID;
    if (keepFallback) {
        _fallbackID = *canonicalFallbackID;
    } else {
        _fallbackID.setToBogus();
    }
    _currentID = _primaryID;
}
/**
 * Appends the decimal form of this key's kind to result, or nothing
 * when the kind is KIND_ANY.
 *
 * @param result String to append to.
 * @return result
 */
UnicodeString&
LocaleKey::prefix(UnicodeString& result) const
{
    if (_kind == KIND_ANY) {
        return result;
    }
    UChar digits[64];
    uprv_itou(digits, 64, _kind, 10, 0);
    UnicodeString kindText(digits);
    result.append(kindText);
    return result;
}
/**
 * @return The kind code this key was created with.
 */
int32_t
LocaleKey::kind() const
{
    const int32_t storedKind = _kind;
    return storedKind;
}
/**
 * Appends the canonical primary ID to result.
 *
 * @param result String to append to.
 * @return result
 */
UnicodeString&
LocaleKey::canonicalID(UnicodeString& result) const
{
    result.append(_primaryID);
    return result;
}
/**
 * Appends the current ID to result; result is left unchanged when the
 * current ID is bogus.
 *
 * @param result String to append to.
 * @return result
 */
UnicodeString&
LocaleKey::currentID(UnicodeString& result) const
{
    if (_currentID.isBogus()) {
        return result;
    }
    result.append(_currentID);
    return result;
}
/**
 * Appends the current descriptor to result: the prefix, then
 * PREFIX_DELIMITER, then the current ID. When the current ID is bogus,
 * result is set to bogus instead.
 *
 * @param result String to append to.
 * @return result
 */
UnicodeString&
LocaleKey::currentDescriptor(UnicodeString& result) const
{
    if (_currentID.isBogus()) {
        result.setToBogus();
        return result;
    }
    // Append to whatever prefix() hands back, as the chained form does.
    UnicodeString& target = prefix(result);
    target.append(PREFIX_DELIMITER);
    target.append(_currentID);
    return result;
}
/**
 * Initializes result from the canonical primary ID.
 *
 * @param result Locale to fill in.
 * @return result, as returned by LocaleUtility::initLocaleFromName().
 */
Locale&
LocaleKey::canonicalLocale(Locale& result) const
{
    Locale& filled = LocaleUtility::initLocaleFromName(_primaryID, result);
    return filled;
}
/**
 * Initializes result from the current ID.
 *
 * @param result Locale to fill in.
 * @return result, as returned by LocaleUtility::initLocaleFromName().
 */
Locale&
LocaleKey::currentLocale(Locale& result) const
{
    Locale& filled = LocaleUtility::initLocaleFromName(_currentID, result);
    return filled;
}
/**
 * Moves the key one step along its fallback chain.
 *
 * The steps are tried in this order: cut the current ID at its last
 * underscore; switch to the stored fallback ID (which is then consumed);
 * truncate a non-empty current ID to the empty string. If none applies,
 * the current ID becomes bogus.
 *
 * @return TRUE if the key changed to a new current ID, FALSE if no
 *         fallback is left.
 */
UBool
LocaleKey::fallback()
{
    if (_currentID.isBogus()) {
        return FALSE;
    }

    const int32_t lastUnderscore = _currentID.lastIndexOf(UNDERSCORE_CHAR);
    if (lastUnderscore != -1) {
        _currentID.remove(lastUnderscore); // truncate current or fallback, whichever we're pointing to
        return TRUE;
    }

    if (!_fallbackID.isBogus()) {
        _currentID = _fallbackID;
        _fallbackID.setToBogus();   // the fallback is used only once
        return TRUE;
    }

    if (_currentID.length() > 0) {
        _currentID.remove(0); // completely truncate
        return TRUE;
    }

    _currentID.setToBogus();
    return FALSE;
}
/**
 * Tests whether this key's canonical primary ID is a fallback of id.
 *
 * A copy of id is passed through parseSuffix() first. The result must
 * start with the primary ID and either have the same length or have an
 * underscore right after it.
 *
 * @param id The ID to test.
 * @return TRUE when the primary ID is a fallback of id.
 */
UBool
LocaleKey::isFallbackOf(const UnicodeString& id) const
{
    UnicodeString stripped(id);
    parseSuffix(stripped);

    if (stripped.indexOf(_primaryID) != 0) {
        return FALSE;
    }
    if (stripped.length() == _primaryID.length()) {
        return TRUE;
    }
    return stripped.charAt(_primaryID.length()) == UNDERSCORE_CHAR;
}
#ifdef SERVICE_DEBUG
/**
 * Debug helper: appends the base-class description followed by this
 * key's kind, primary ID, fallback ID and current ID.
 *
 * @param result String to append to.
 * @return result
 */
UnicodeString&
LocaleKey::debug(UnicodeString& result) const
{
    ICUServiceKey::debug(result);
    result.append(" kind: ").append(_kind);
    result.append(" primaryID: ").append(_primaryID);
    result.append(" fallbackID: ").append(_fallbackID);
    result.append(" currentID: ").append(_currentID);
    return result;
}
/**
 * Debug helper: appends this class's name to result.
 *
 * @param result String to append to.
 * @return result
 */
UnicodeString&
LocaleKey::debugClass(UnicodeString& result) const
{
    result.append("LocaleKey ");
    return result;
}
#endif
// Storage for LocaleKey's class ID marker; getStaticClassID() returns the
// address of this object, so its value is irrelevant.
const char LocaleKey::fgClassID = 0;
/*
******************************************************************
*/
/**
 * Creates a factory with an empty name.
 *
 * @param coverage Coverage flags; bit 0 clear means the factory's IDs are
 *                 treated as visible by the methods in this file.
 */
LocaleKeyFactory::LocaleKeyFactory(int32_t coverage)
    : _name(),
      _coverage(coverage)
{
    // nothing else to set up
}
/**
 * Creates a named factory.
 *
 * @param coverage Coverage flags; bit 0 clear means the factory's IDs are
 *                 treated as visible by the methods in this file.
 * @param name     Name stored for this factory (reported by debug()).
 */
LocaleKeyFactory::LocaleKeyFactory(int32_t coverage, const UnicodeString& name)
    : _name(name),
      _coverage(coverage)
{
    // nothing else to set up
}
/**
 * Destructor. There is nothing to release explicitly.
 */
LocaleKeyFactory::~LocaleKeyFactory()
{
}
/**
 * Creates a service object for key if this factory handles it.
 *
 * The key is treated as a LocaleKey; its kind and current locale are
 * forwarded to handleCreate().
 *
 * @param key     Key being resolved; cast to LocaleKey without checking.
 * @param service Service on whose behalf the object is created.
 * @param status  In/out error code.
 * @return The object from handleCreate(), or NULL when the key is not handled.
 */
UObject*
LocaleKeyFactory::create(const ICUServiceKey& key, const ICUService* service, UErrorCode& status) const
{
    if (!handlesKey(key, status)) {
        return NULL;
    }

    const LocaleKey& localeKey = (const LocaleKey&)key;
    const int32_t keyKind = localeKey.kind();
    Locale current;
    localeKey.currentLocale(current);

    return handleCreate(current, keyKind, service, status);
}
/**
 * Reports whether the key's current ID is one of the supported IDs.
 *
 * @param key    Key to test.
 * @param status In/out error code, passed to getSupportedIDs().
 * @return TRUE when the current ID is in the supported set; FALSE
 *         otherwise, including when there is no supported set.
 */
UBool
LocaleKeyFactory::handlesKey(const ICUServiceKey& key, UErrorCode& status) const
{
    const Hashtable* supported = getSupportedIDs(status);
    if (supported == NULL) {
        return FALSE;
    }
    UnicodeString current;
    key.currentID(current);
    return supported->get(current) != NULL;
}
/**
 * Applies this factory's supported IDs to result.
 *
 * If bit 0 of the coverage flags is clear, every supported ID is added
 * to result; otherwise every supported ID is removed from it.
 *
 * @param result Table of visible IDs to update.
 * @param status In/out error code; adding stops at the first failure.
 */
void
LocaleKeyFactory::updateVisibleIDs(Hashtable& result, UErrorCode& status) const
{
    const Hashtable* supported = getSupportedIDs(status);
    if (supported == NULL) {
        return;
    }

    const UBool visible = (_coverage & 0x1) == 0;
    int32_t pos = 0;
    for (const UHashElement* elem = supported->nextElement(pos);
         elem != NULL;
         elem = supported->nextElement(pos)) {
        const UnicodeString& id = *((const UnicodeString*)elem->key.pointer);
        if (visible) {
            result.put(id, (void*)this, status); // this is dummy non-void marker used for set semantics
            if (U_FAILURE(status)) {
                break;
            }
        } else {
            result.remove(id);
        }
    }
}
/**
 * Produces the display name of the locale named by id.
 *
 * A name is produced only when bit 0 of the coverage flags is clear and
 * id is supported; in every other case result is set to bogus.
 *
 * @param id     Locale ID whose display name is wanted.
 * @param locale Locale in which to express the display name.
 * @param result Receives the display name.
 * @return result
 */
UnicodeString&
LocaleKeyFactory::getDisplayName(const UnicodeString& id, const Locale& locale, UnicodeString& result) const
{
    const UBool visible = (_coverage & 0x1) == 0;
    if (visible) {
        UErrorCode status = U_ZERO_ERROR;
        if (isSupportedID(id, status)) {
            Locale named;
            LocaleUtility::initLocaleFromName(id, named);
            return named.getDisplayName(locale, result);
        }
    }
    result.setToBogus();
    return result;
}
/**
 * Base implementation of the creation hook: creates nothing.
 *
 * @param loc     Ignored here.
 * @param kind    Ignored here.
 * @param service Ignored here.
 * @param status  Ignored here.
 * @return Always NULL.
 */
UObject*
LocaleKeyFactory::handleCreate(const Locale& loc, int32_t kind, const ICUService* service, UErrorCode& status) const
{
    UObject* const nothing = NULL;
    return nothing;
}
/**
 * Reports whether id is in the set returned by getSupportedIDs().
 *
 * @param id     ID to look up.
 * @param status In/out error code, passed to getSupportedIDs().
 * @return TRUE when a supported set exists and contains id.
 */
UBool
LocaleKeyFactory::isSupportedID(const UnicodeString& id, UErrorCode& status) const
{
    const Hashtable* supported = getSupportedIDs(status);
    if (supported == NULL) {
        return FALSE;
    }
    return supported->get(id) != NULL;
}
/**
 * Base implementation: there is no supported-ID set.
 *
 * @param status Ignored here.
 * @return Always NULL.
 */
const Hashtable*
LocaleKeyFactory::getSupportedIDs(UErrorCode& status) const
{
    const Hashtable* const none = NULL;
    return none;
}
#ifdef SERVICE_DEBUG
/**
 * Debug helper: appends the class name, the factory name and the
 * coverage value to result.
 *
 * @param result String to append to.
 * @return result
 */
UnicodeString&
LocaleKeyFactory::debug(UnicodeString& result) const
{
    debugClass(result);
    result.append(", name: ").append(_name);
    result.append(", coverage: ").append(_coverage);
    return result;
}
/**
 * Debug helper: appends this class's name to result.
 *
 * @param result String to append to.
 * @return result
 */
UnicodeString&
LocaleKeyFactory::debugClass(UnicodeString& result) const
{
    result.append("LocaleKeyFactory");
    return result;
}
#endif
// Storage for LocaleKeyFactory's class ID marker. Presumably only its
// address is used, as in LocaleKey::getStaticClassID() -- confirm in the header.
const char LocaleKeyFactory::fgClassID = 0;
/*
******************************************************************
*/
/**
 * Creates a factory that serves one adopted object for one locale ID.
 *
 * @param objToAdopt Object owned by the factory from now on; it is
 *                   deleted in the destructor.
 * @param locale     Locale ID string the object is registered under.
 * @param kind       Kind code the object is registered under.
 * @param coverage   Coverage flags, passed to LocaleKeyFactory.
 */
SimpleLocaleKeyFactory::SimpleLocaleKeyFactory(UObject* objToAdopt,
                                               const UnicodeString& locale,
                                               int32_t kind,
                                               int32_t coverage)
    : LocaleKeyFactory(coverage),
      _obj(objToAdopt),
      _id(locale),
      _kind(kind)
{
    // all state is set in the initializer list
}
/**
 * Creates a factory that serves one adopted object for one Locale.
 *
 * The ID the object is registered under is derived from the locale's name.
 *
 * @param objToAdopt Object owned by the factory from now on; it is
 *                   deleted in the destructor.
 * @param locale     Locale the object is registered under.
 * @param kind       Kind code the object is registered under.
 * @param coverage   Coverage flags, passed to LocaleKeyFactory.
 */
SimpleLocaleKeyFactory::SimpleLocaleKeyFactory(UObject* objToAdopt,
                                               const Locale& locale,
                                               int32_t kind,
                                               int32_t coverage)
    : LocaleKeyFactory(coverage),
      _obj(objToAdopt),
      _id(),
      _kind(kind)
{
    // _id starts out empty, so appending the name yields exactly the name.
    LocaleUtility::initNameFromLocale(locale, _id);
}
/**
 * Destructor. Deletes the adopted object and clears the pointer to it.
 */
SimpleLocaleKeyFactory::~SimpleLocaleKeyFactory()
{
    UObject* const adopted = _obj;
    _obj = NULL;
    delete adopted;
}
/**
 * Returns a clone of the adopted object when key matches this factory.
 *
 * The key matches when the factory's kind is KIND_ANY or equals the
 * key's kind, and the key's current ID equals the factory's ID.
 *
 * @param key     Key being resolved; cast to LocaleKey without checking.
 * @param service Service used to clone the adopted object.
 * @param status  In/out error code; nothing happens if it indicates failure.
 * @return The clone, or NULL when there is no match or status indicates failure.
 */
UObject*
SimpleLocaleKeyFactory::create(const ICUServiceKey& key, const ICUService* service, UErrorCode& status) const
{
    if (U_FAILURE(status)) {
        return NULL;
    }

    const LocaleKey& localeKey = (const LocaleKey&)key;
    if (_kind != LocaleKey::KIND_ANY && _kind != localeKey.kind()) {
        return NULL;
    }

    UnicodeString requested;
    localeKey.currentID(requested);
    if (_id == requested) {
        return service->cloneInstance(_obj);
    }
    return NULL;
}
/**
 * Reports whether id is exactly this factory's ID.
 *
 * @param id     ID to compare.
 * @param status Not used here.
 * @return TRUE when id equals the factory's ID.
 */
UBool
SimpleLocaleKeyFactory::isSupportedID(const UnicodeString& id, UErrorCode& status) const
{
    const UBool matches = (id == _id);
    return matches;
}
/**
 * Applies this factory's single ID to result: the ID is removed when
 * bit 0 of the coverage flags is set, and added otherwise.
 *
 * @param result Table of visible IDs to update.
 * @param status In/out error code; nothing happens if it indicates failure.
 */
void
SimpleLocaleKeyFactory::updateVisibleIDs(Hashtable& result, UErrorCode& status) const
{
    if (U_FAILURE(status)) {
        return;
    }
    const UBool hidden = (_coverage & 0x1) != 0;
    if (hidden) {
        result.remove(_id);
        return;
    }
    // The value is only a non-NULL marker; the table is used as a set.
    result.put(_id, (void*)this, status);
}
#ifdef SERVICE_DEBUG
/**
 * Debug helper: appends the base-class description followed by this
 * factory's ID and kind.
 *
 * @param result String to append to.
 * @return result
 */
UnicodeString&
SimpleLocaleKeyFactory::debug(UnicodeString& result) const
{
    LocaleKeyFactory::debug(result);
    result.append(", id: ").append(_id);
    result.append(", kind: ").append(_kind);
    return result;
}
/**
 * Debug helper: appends this class's name to result.
 *
 * @param result String to append to.
 * @return result
 */
UnicodeString&
SimpleLocaleKeyFactory::debugClass(UnicodeString& result) const
{
    result.append("SimpleLocaleKeyFactory");
    return result;
}
#endif
// Storage for SimpleLocaleKeyFactory's class ID marker. Presumably only its
// address is used, as in LocaleKey::getStaticClassID() -- confirm in the header.
const char SimpleLocaleKeyFactory::fgClassID = 0;
/*
******************************************************************
*/
/**
 * Creates a visible factory with an empty bundle name.
 */
ICUResourceBundleFactory::ICUResourceBundleFactory()
    : LocaleKeyFactory(VISIBLE),
      _bundleName()
{
    // all state is set in the initializer list
}
/**
 * Creates a visible factory for the named bundle.
 *
 * @param bundleName Name of the resource bundle to load objects from.
 */
ICUResourceBundleFactory::ICUResourceBundleFactory(const UnicodeString& bundleName)
    : LocaleKeyFactory(VISIBLE),
      _bundleName(bundleName)
{
    // all state is set in the initializer list
}
/**
 * Returns the available locale names for this factory's bundle.
 *
 * @param status In/out error code; nothing is looked up if it indicates failure.
 * @return The table from LocaleUtility::getAvailableLocaleNames(), or
 *         NULL when status indicates failure.
 */
const Hashtable*
ICUResourceBundleFactory::getSupportedIDs(UErrorCode& status) const
{
    if (U_FAILURE(status)) {
        return NULL;
    }
    return LocaleUtility::getAvailableLocaleNames(_bundleName);
}
/**
 * Creates a ResourceBundle for this factory's bundle name and loc.
 *
 * @param loc     Locale to open the bundle for.
 * @param kind    Not used here.
 * @param service Not used here.
 * @param status  In/out error code; nothing is created if it indicates
 *                failure, and it is handed to the ResourceBundle constructor.
 * @return The new ResourceBundle, or NULL when status indicates failure on entry.
 */
UObject*
ICUResourceBundleFactory::handleCreate(const Locale& loc, int32_t kind, const ICUService* service, UErrorCode& status) const
{
    if (U_FAILURE(status)) {
        return NULL;
    }
    return new ResourceBundle(_bundleName, loc, status);
}
#ifdef SERVICE_DEBUG
/**
 * Debug helper: appends the base-class description followed by the
 * bundle name.
 *
 * @param result String to append to.
 * @return The string returned by the final append.
 */
UnicodeString&
ICUResourceBundleFactory::debug(UnicodeString& result) const
{
    LocaleKeyFactory::debug(result);
    return result.append(", bundle: ").append(_bundleName);
}
/**
 * Debug helper: appends this class's name to result.
 *
 * @param result String to append to.
 * @return result
 */
UnicodeString&
ICUResourceBundleFactory::debugClass(UnicodeString& result) const
{
    result.append("ICUResourceBundleFactory");
    return result;
}
#endif
// Storage for ICUResourceBundleFactory's class ID marker. Presumably only its
// address is used, as in LocaleKey::getStaticClassID() -- confirm in the header.
const char ICUResourceBundleFactory::fgClassID = '\0';
/*
******************************************************************
*/
/**
 * Creates an unnamed locale service. The fallback locale starts as the
 * current default locale, and the service's lock is initialized.
 */
ICULocaleService::ICULocaleService()
    : fallbackLocale(Locale::getDefault()),
      llock(0)
{
    umtx_init(&llock);
}
/**
 * Creates a named locale service. The fallback locale starts as the
 * current default locale, and the service's lock is initialized.
 *
 * @param dname Name passed to the ICUService base class.
 */
ICULocaleService::ICULocaleService(const UnicodeString& dname)
    : ICUService(dname),
      fallbackLocale(Locale::getDefault()),
      llock(0)
{
    umtx_init(&llock);
}
/**
 * Destructor. Destroys the lock initialized by the constructors.
 */
ICULocaleService::~ICULocaleService()
{
    UMTX* const lock = &llock;
    umtx_destroy(lock);
}
/**
 * Looks up the service object for locale, for any kind, without
 * reporting the locale actually matched.
 *
 * @param locale Requested locale.
 * @param status In/out error code.
 * @return Whatever the four-argument get() returns.
 */
UObject*
ICULocaleService::get(const Locale& locale, UErrorCode& status) const
{
    Locale* const noActualLocale = NULL;
    return get(locale, LocaleKey::KIND_ANY, noActualLocale, status);
}
/**
 * Looks up the service object for locale and kind, without reporting
 * the locale actually matched.
 *
 * @param locale Requested locale.
 * @param kind   Requested kind code.
 * @param status In/out error code.
 * @return Whatever the four-argument get() returns.
 */
UObject*
ICULocaleService::get(const Locale& locale, int32_t kind, UErrorCode& status) const
{
    Locale* const noActualLocale = NULL;
    return get(locale, kind, noActualLocale, status);
}
/**
 * Looks up the service object for locale, for any kind, optionally
 * reporting the locale actually matched.
 *
 * @param locale       Requested locale.
 * @param actualReturn If non-NULL, receives the matched locale on success.
 * @param status       In/out error code.
 * @return Whatever the four-argument get() returns.
 */
UObject*
ICULocaleService::get(const Locale& locale, Locale* actualReturn, UErrorCode& status) const
{
    const int32_t anyKind = LocaleKey::KIND_ANY;
    return get(locale, anyKind, actualReturn, status);
}
/**
 * Looks up the service object for locale and kind.
 *
 * A key is created from the locale's name and resolved with getKey().
 * When actualReturn is supplied and a result is found, the ID reported
 * by getKey() is stripped with parseSuffix() and converted into *actualReturn.
 *
 * @param locale       Requested locale.
 * @param kind         Requested kind code.
 * @param actualReturn If non-NULL, receives the matched locale on success.
 * @param status       In/out error code. Set to U_MEMORY_ALLOCATION_ERROR
 *                     when the locale name string turns out bogus.
 * @return The object found, or NULL.
 */
UObject*
ICULocaleService::get(const Locale& locale, int32_t kind, Locale* actualReturn, UErrorCode& status) const
{
    if (U_FAILURE(status)) {
        return NULL;
    }

    UnicodeString locName(locale.getName(), "");
    if (locName.isBogus()) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }

    ICUServiceKey* key = createKey(&locName, kind, status);
    if (key == NULL) {
        return NULL;
    }

    UObject* found = NULL;
    if (actualReturn != NULL) {
        UnicodeString actualID;
        found = getKey(*key, &actualID, status);
        if (found != NULL) {
            key->parseSuffix(actualID);
            LocaleUtility::initLocaleFromName(actualID, *actualReturn);
        }
    } else {
        found = getKey(*key, status);
    }

    delete key;
    return found;
}
/**
 * Registers an object under a locale given as a string ID, for any kind.
 *
 * @param objToAdopt Object to register; ownership passes to the service.
 * @param locale     Locale ID string; it is converted to a Locale first.
 * @param visible    TRUE registers with VISIBLE coverage, FALSE with INVISIBLE.
 * @param status     In/out error code.
 * @return Whatever the five-argument registerInstance() returns.
 */
URegistryKey
ICULocaleService::registerInstance(UObject* objToAdopt, const UnicodeString& locale,
                                   UBool visible, UErrorCode& status)
{
    Locale target;
    LocaleUtility::initLocaleFromName(locale, target);

    const int32_t coverage = visible ? LocaleKeyFactory::VISIBLE : LocaleKeyFactory::INVISIBLE;
    return registerInstance(objToAdopt, target, LocaleKey::KIND_ANY, coverage, status);
}
/**
 * Registers an object under locale, for any kind, with VISIBLE coverage.
 *
 * @param objToAdopt Object to register; ownership passes to the service.
 * @param locale     Locale to register under.
 * @param status     In/out error code.
 * @return Whatever the five-argument registerInstance() returns.
 */
URegistryKey
ICULocaleService::registerInstance(UObject* objToAdopt, const Locale& locale, UErrorCode& status)
{
    const int32_t anyKind = LocaleKey::KIND_ANY;
    return registerInstance(objToAdopt, locale, anyKind, LocaleKeyFactory::VISIBLE, status);
}
/**
 * Registers an object under locale and kind, with VISIBLE coverage.
 *
 * @param objToAdopt Object to register; ownership passes to the service.
 * @param locale     Locale to register under.
 * @param kind       Kind code to register under.
 * @param status     In/out error code.
 * @return Whatever the five-argument registerInstance() returns.
 */
URegistryKey
ICULocaleService::registerInstance(UObject* objToAdopt, const Locale& locale, int32_t kind, UErrorCode& status)
{
    URegistryKey registration =
        registerInstance(objToAdopt, locale, kind, LocaleKeyFactory::VISIBLE, status);
    return registration;
}
/**
 * Registers an object by wrapping it in a SimpleLocaleKeyFactory and
 * registering that factory.
 *
 * @param objToAdopt Object to register; ownership passes to the factory,
 *                   or the object is deleted here if the factory cannot
 *                   be allocated.
 * @param locale     Locale to register under.
 * @param kind       Kind code to register under.
 * @param coverage   Coverage flags for the factory.
 * @param status     In/out error code, passed to registerFactory().
 * @return The key from registerFactory(), or NULL when the factory
 *         could not be allocated.
 */
URegistryKey
ICULocaleService::registerInstance(UObject* objToAdopt, const Locale& locale, int32_t kind, int32_t coverage, UErrorCode& status)
{
    ICUServiceFactory* factory = new SimpleLocaleKeyFactory(objToAdopt, locale, kind, coverage);
    if (factory == NULL) {
        // No factory took ownership, so release the object here.
        delete objToAdopt;
        return NULL;
    }
    return registerFactory(factory, status);
}
// Disabled code: registerInstance() overloads that take the locale as a
// UnicodeString. Everything between "#if 0" and "#endif" is compiled out.
#if 0
URegistryKey
ICULocaleService::registerInstance(UObject* objToAdopt, const UnicodeString& locale, UErrorCode& status)
{
return registerInstance(objToAdopt, locale, LocaleKey::KIND_ANY, LocaleKeyFactory::VISIBLE, status);
}
URegistryKey
ICULocaleService::registerInstance(UObject* objToAdopt, const UnicodeString& locale, UBool visible, UErrorCode& status)
{
return registerInstance(objToAdopt, locale, LocaleKey::KIND_ANY,
visible ? LocaleKeyFactory::VISIBLE : LocaleKeyFactory::INVISIBLE,
status);
}
URegistryKey
ICULocaleService::registerInstance(UObject* objToAdopt, const UnicodeString& locale, int32_t kind, int32_t coverage, UErrorCode& status)
{
ICUServiceFactory * factory = new SimpleLocaleKeyFactory(objToAdopt, locale, kind, coverage);
if (factory != NULL) {
return registerFactory(factory, status);
}
delete objToAdopt;
return NULL;
}
#endif
/**
 * StringEnumeration over the visible IDs of an ICULocaleService.
 *
 * The IDs are captured when the enumeration is created, together with
 * the service's timestamp. If the timestamp later differs, the
 * enumeration reports U_ENUM_OUT_OF_SYNC_ERROR until reset() is called.
 *
 * next() and unext() return pointers into one internal buffer that is
 * reused (and possibly reallocated) by each call.
 */
class ServiceEnumeration : public StringEnumeration {
private:
    const ICULocaleService* _service;   // service being enumerated (not owned)
    int32_t _timestamp;                 // service timestamp when _ids was filled
    UVector _ids;                       // captured visible IDs (UnicodeString*)
    int32_t _pos;                       // index of the next ID to return
    void* _bufp;                        // conversion buffer for next()/unext()
    int32_t _buflen;                    // size of _bufp in bytes

private:
    /**
     * Captures the service's visible IDs.
     *
     * status is taken by reference so that failures while building the
     * ID list reach create(), which checks it.
     */
    ServiceEnumeration(const ICULocaleService* service, UErrorCode& status)
        : _service(service)
        , _timestamp(service->getTimestamp())
        , _ids(uhash_deleteUnicodeString, NULL, status)
        , _pos(0)
        , _bufp(NULL)
        , _buflen(0)
    {
        _service->getVisibleIDs(_ids, status);
    }

public:
    /**
     * Creates an enumeration for service.
     * @return The new enumeration, or NULL if it could not be allocated
     *         or its construction reported an error.
     */
    static ServiceEnumeration* create(const ICULocaleService* service) {
        UErrorCode status = U_ZERO_ERROR;
        ServiceEnumeration* result = new ServiceEnumeration(service, status);
        if (U_SUCCESS(status)) {
            return result;
        }
        delete result;
        return NULL;
    }

    virtual ~ServiceEnumeration() {
        uprv_free(_bufp);
    }

    /**
     * @return The number of captured IDs, or 0 if the enumeration is out
     *         of date or status already indicates failure.
     */
    virtual int32_t count(UErrorCode& status) const {
        return upToDate(status) ? _ids.size() : 0;
    }

    /**
     * Returns the next ID as a NUL-terminated char string.
     *
     * @param resultLength If non-NULL, receives the length without the terminator.
     * @param status       In/out error code; U_MEMORY_ALLOCATION_ERROR if
     *                     the internal buffer cannot be grown.
     * @return Pointer into the internal buffer, or NULL at the end or on error.
     */
    const char* next(int32_t* resultLength, UErrorCode& status) {
        const UnicodeString* us = snext(status);
        if (us) {
            while (TRUE) {
                int32_t newlen = us->extract((char*)_bufp, _buflen / sizeof(char), NULL, status);
                if (status == U_STRING_NOT_TERMINATED_WARNING || status == U_BUFFER_OVERFLOW_ERROR) {
                    // Buffer too small for text plus terminator: grow and retry.
                    if (!resizeBuffer((newlen + 1) * sizeof(char))) {
                        status = U_MEMORY_ALLOCATION_ERROR;
                        break;
                    }
                    status = U_ZERO_ERROR;
                } else if (U_SUCCESS(status)) {
                    ((char*)_bufp)[newlen] = 0;
                    if (resultLength) {
                        resultLength[0] = newlen;
                    }
                    return (const char*)_bufp;
                } else {
                    break;
                }
            }
        }
        return NULL;
    }

    /**
     * Returns the next ID as a NUL-terminated UChar string.
     *
     * @param resultLength If non-NULL, receives the length without the terminator.
     * @param status       In/out error code; U_MEMORY_ALLOCATION_ERROR if
     *                     the internal buffer cannot be grown.
     * @return Pointer into the internal buffer, or NULL at the end or on error.
     */
    const UChar* unext(int32_t* resultLength, UErrorCode& status) {
        const UnicodeString* us = snext(status);
        if (us) {
            while (TRUE) {
                int32_t newlen = us->extract((UChar*)_bufp, _buflen / sizeof(UChar), status);
                if (status == U_STRING_NOT_TERMINATED_WARNING || status == U_BUFFER_OVERFLOW_ERROR) {
                    // Buffer too small for text plus terminator: grow and retry.
                    if (!resizeBuffer((newlen + 1) * sizeof(UChar))) {
                        status = U_MEMORY_ALLOCATION_ERROR;
                        break;
                    }
                    // Clear the overflow state before retrying; without this
                    // the retry sees the same status and the loop never ends.
                    status = U_ZERO_ERROR;
                } else if (U_SUCCESS(status)) {
                    ((UChar*)_bufp)[newlen] = 0;
                    if (resultLength) {
                        resultLength[0] = newlen;
                    }
                    return (const UChar*)_bufp;
                } else {
                    break;
                }
            }
        }
        return NULL;
    }

    /**
     * Returns the next ID as a UnicodeString owned by this enumeration.
     *
     * @return The next ID, or NULL at the end, when out of date, or when
     *         status already indicates failure.
     */
    const UnicodeString* snext(UErrorCode& status) {
        if (upToDate(status) && (_pos < _ids.size())) {
            return (const UnicodeString*)_ids[_pos++];
        }
        return NULL;
    }

    /**
     * Grows (or first allocates) the internal buffer to newlen bytes.
     *
     * On failure the existing buffer and its recorded size are left
     * untouched, so nothing is leaked and _buflen never describes memory
     * that does not exist.
     *
     * @return TRUE on success, FALSE if the allocation failed.
     */
    UBool resizeBuffer(int32_t newlen) {
        void* grown;
        if (_bufp) {
            grown = uprv_realloc(_bufp, newlen);
        } else {
            grown = uprv_malloc(newlen);
        }
        if (grown == NULL) {
            return FALSE;
        }
        _bufp = grown;
        _buflen = newlen;
        return TRUE;
    }

    /**
     * Checks that the service has not changed since the IDs were captured.
     *
     * @param status In/out error code; set to U_ENUM_OUT_OF_SYNC_ERROR if
     *               the timestamps differ.
     * @return TRUE if status is a success code and the timestamps match.
     */
    UBool upToDate(UErrorCode& status) const {
        if (U_SUCCESS(status)) {
            if (_timestamp == _service->getTimestamp()) {
                return TRUE;
            }
            status = U_ENUM_OUT_OF_SYNC_ERROR;
        }
        return FALSE;
    }

    /**
     * Re-captures the service's visible IDs and restarts the enumeration.
     * An incoming U_ENUM_OUT_OF_SYNC_ERROR is cleared first.
     */
    void reset(UErrorCode& status) {
        if (status == U_ENUM_OUT_OF_SYNC_ERROR) {
            status = U_ZERO_ERROR;
        }
        if (U_SUCCESS(status)) {
            _timestamp = _service->getTimestamp();
            _pos = 0;
            _service->getVisibleIDs(_ids, status);
        }
    }

public:
    virtual UClassID getDynamicClassID(void) const { return getStaticClassID(); }
    static UClassID getStaticClassID(void) { return (UClassID)&fgClassID; }

private:
    static const char fgClassID;    // only its address is used, as the class ID
};
// Storage for ServiceEnumeration's class ID marker; getStaticClassID()
// returns the address of this object, so its value is irrelevant.
const char ServiceEnumeration::fgClassID = '\0';
/**
 * Creates an enumeration over this service's visible locale IDs.
 *
 * @return A new enumeration, or NULL if ServiceEnumeration::create()
 *         returns NULL.
 */
StringEnumeration*
ICULocaleService::getAvailableLocales(void) const
{
    ServiceEnumeration* const locales = ServiceEnumeration::create(this);
    return locales;
}
/**
 * Brings the cached fallback locale in line with the current default
 * locale and returns the cached fallback locale name.
 *
 * When the default locale differs from the cached fallback locale, the
 * cached locale and its name are replaced and the service cache is
 * cleared. The check and update run under the service's lock.
 *
 * @return Reference to the cached fallback locale name.
 */
const UnicodeString&
ICULocaleService::validateFallbackLocale() const
{
    const Locale& loc = Locale::getDefault();
    ICULocaleService* ncThis = (ICULocaleService*)this;
    {
        Mutex mutex(&ncThis->llock);
        if (loc != fallbackLocale) {
            ncThis->fallbackLocale = loc;
            // initNameFromLocale() appends, so build the name in a fresh
            // string and assign it. Appending to the cached name directly
            // would concatenate the new name onto the previous one.
            UnicodeString freshName;
            LocaleUtility::initNameFromLocale(loc, freshName);
            ncThis->fallbackLocaleName = freshName;
            ncThis->clearServiceCache();
        }
    }
    return fallbackLocaleName;
}
/**
 * Creates a LocaleKey for id, of kind KIND_ANY, that falls back to the
 * validated fallback locale.
 *
 * @param id     Requested locale ID; may be NULL.
 * @param status In/out error code.
 * @return The key from LocaleKey::createWithCanonicalFallback().
 */
ICUServiceKey*
ICULocaleService::createKey(const UnicodeString* id, UErrorCode& status) const
{
    const UnicodeString* const fallbackName = &validateFallbackLocale();
    return LocaleKey::createWithCanonicalFallback(id, fallbackName, status);
}
/**
 * Creates a LocaleKey for id and kind that falls back to the validated
 * fallback locale.
 *
 * @param id     Requested locale ID; may be NULL.
 * @param kind   Kind code for the key.
 * @param status In/out error code.
 * @return The key from LocaleKey::createWithCanonicalFallback().
 */
ICUServiceKey*
ICULocaleService::createKey(const UnicodeString* id, int32_t kind, UErrorCode& status) const
{
    const UnicodeString* const fallbackName = &validateFallbackLocale();
    return LocaleKey::createWithCanonicalFallback(id, fallbackName, kind, status);
}
U_NAMESPACE_END
// defined in ucln_cmn.h
/**
* Release all static memory held by Locale Utility.
*/
U_CFUNC UBool service_cleanup(void)
{
    // C-linkage entry point: forwards to LocaleUtility::cleanup() and
    // returns its result.
    const UBool done = LocaleUtility::cleanup();
    return done;
}
/* !UCONFIG_NO_SERVICE */
#endif
--- NEW FILE: iculserv.h ---
/**
*******************************************************************************
* Copyright (C) 2001-2003, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
*******************************************************************************
*/
#ifndef ICULSERV_H
#define ICULSERV_H
#include "unicode/utypes.h"
#if UCONFIG_NO_SERVICE
U_NAMESPACE_BEGIN
/*
* Allow the declaration of APIs with pointers to ICUService
* even when service is removed from the build.
*/
class ICULocaleService;
U_NAMESPACE_END
#else
#include "unicode/uobject.h"
#include "unicode/unistr.h"
#include "unicode/chariter.h"
#include "unicode/locid.h"
#include "unicode/ubrk.h"
#include "unicode/strenum.h"
#include "hash.h"
#include "uvector.h"
#include "icuserv.h"
U_NAMESPACE_BEGIN
class ICULocaleService;
class LocaleKey;
class LocaleKeyFactory;
class SimpleLocaleKeyFactory;
class ServiceListener;
/*
******************************************************************
*/
/**
* A subclass of Key that implements a locale fallback mechanism.
* The first locale to search for is the locale provided by the
* client, and the fallback locale to search for is the current
* default locale. If a prefix is present, the currentDescriptor
* includes it before the locale proper, separated by "/". This
* is the default key instantiated by ICULocaleService.</p>
*
* <p>Canonicalization adjusts the locale string so that the
* section before the first underscore is in lower case, and the rest
* is in upper case, with no trailing underscores.</p>
*/
class U_COMMON_API LocaleKey : public ICUServiceKey {
private:
int32_t _kind;
UnicodeString _primaryID;
UnicodeString _fallbackID;
UnicodeString _currentID;
public:
static const int32_t KIND_ANY; // = -1;
/**
* Create a LocaleKey with canonical primary and fallback IDs.
*/
static LocaleKey* createWithCanonicalFallback(const UnicodeString* primaryID,
const UnicodeString* canonicalFallbackID,
UErrorCode& status);
/**
* Create a LocaleKey with canonical primary and fallback IDs.
*/
static LocaleKey* createWithCanonicalFallback(const UnicodeString* primaryID,
const UnicodeString* canonicalFallbackID,
int32_t kind,
UErrorCode& status);
protected:
/**
* PrimaryID is the user's requested locale string,
* canonicalPrimaryID is this string in canonical form,
* fallbackID is the current default locale's string in
* canonical form.
*/
LocaleKey(const UnicodeString& primaryID,
const UnicodeString& canonicalPrimaryID,
const UnicodeString* canonicalFallbackID,
int32_t kind);
public:
/**
* Append the prefix associated with the kind, or nothing if the kind is KIND_ANY.
*/
virtual UnicodeString& prefix(UnicodeString& result) const;
/**
* Return the kind code associated with this key.
*/
virtual int32_t kind() const;
/**
* Return the canonicalID.
*/
virtual UnicodeString& canonicalID(UnicodeString& result) const;
/**
* Return the currentID.
*/
virtual UnicodeString& currentID(UnicodeString& result) const;
/**
* Return the (canonical) current descriptor, or null if no current id.
*/
virtual UnicodeString& currentDescriptor(UnicodeString& result) const;
/**
* Convenience method to return the locale corresponding to the (canonical) original ID.
*/
virtual Locale& canonicalLocale(Locale& result) const;
/**
* Convenience method to return the locale corresponding to the (canonical) current ID.
*/
virtual Locale& currentLocale(Locale& result) const;
/**
* If the key has a fallback, modify the key and return true,
* otherwise return false.</p>
*
* <p>First falls back through the primary ID, then through
* the fallbackID. The final fallback is the empty string,
* unless the primary id was the empty string, in which case
* there is no fallback.
*/
virtual UBool fallback();
/**
* Return true if a key created from id matches, or would eventually
* fallback to match, the canonical ID of this key.
*/
virtual UBool isFallbackOf(const UnicodeString& id) const;
public:
/**
* UObject boilerplate.
*/
static inline UClassID getStaticClassID() {
return (UClassID)&fgClassID;
}
virtual UClassID getDynamicClassID() const {
return getStaticClassID();
}
#ifdef SERVICE_DEBUG
public:
virtual UnicodeString& debug(UnicodeString& result) const;
virtual UnicodeString& debugClass(UnicodeString& result) const;
#endif
private:
static const char fgClassID;
};
/*
******************************************************************
*/
/**
* A subclass of ICUServiceFactory that uses LocaleKeys, and is able to
* 'cover' more specific locales with more general locales that it
* supports.
*
* <p>Coverage may be either of the values VISIBLE or INVISIBLE.
*
* <p>'Visible' indicates that the specific locale(s) supported by
* the factory are registered in getSupportedIDs, 'Invisible'
* indicates that they are not.
*
* <p>Localization of visible ids is handled
* by the handling factory, regardless of kind.
*/
class U_COMMON_API LocaleKeyFactory : public ICUServiceFactory {
protected:
// Both members are const, so they can only be set by the constructors.
const UnicodeString _name;
const int32_t _coverage;
public:
enum {
/**
* Coverage value indicating that the factory makes
* its locales visible, and does not cover more specific
* locales.
*/
VISIBLE = 0,
/**
* Coverage value indicating that the factory does not make
* its locales visible, and does not cover more specific
* locales.
*/
INVISIBLE = 1
};
/**
* Destructor.
*/
virtual ~LocaleKeyFactory();
protected:
/**
* Constructor used by subclasses. Takes only the coverage value
* (VISIBLE or INVISIBLE).
*/
LocaleKeyFactory(int32_t coverage);
/**
* Constructor used by subclasses. Takes the coverage value and a name.
*/
LocaleKeyFactory(int32_t coverage, const UnicodeString& name);
/**
* Implement superclass abstract method. This checks the currentID of
* the key against the supported IDs, and passes the canonicalLocale and
* kind off to handleCreate (which subclasses must implement).
* (This comment documents create(), declared just below.)
*/
public:
virtual UObject* create(const ICUServiceKey& key, const ICUService* service, UErrorCode& status) const;
protected:
// NOTE(review): presumably the "is this key's ID supported" test used by
// create(); the implementation is not visible here -- confirm.
virtual UBool handlesKey(const ICUServiceKey& key, UErrorCode& status) const;
public:
/**
* Override of superclass method. This adjusts the result based
* on the coverage rule for this factory.
*/
void updateVisibleIDs(Hashtable& result, UErrorCode& status) const;
/**
* Return a localized name for the locale represented by id.
*/
UnicodeString& getDisplayName(const UnicodeString& id, const Locale& locale, UnicodeString& result) const;
protected:
/**
* Utility method used by create(ICUServiceKey, ICUService). Subclasses can implement
* this instead of create. The default returns NULL.
*/
virtual UObject* handleCreate(const Locale& loc, int32_t kind, const ICUService* service, UErrorCode& status) const;
/**
* Return true if this id is one the factory supports (visible or
* otherwise).
*/
virtual UBool isSupportedID(const UnicodeString& id, UErrorCode& status) const;
/**
* Return the set of ids that this factory supports (visible or
* otherwise). This can be called often and might need to be
* cached if it is expensive to create.
*/
virtual const Hashtable* getSupportedIDs(UErrorCode& status) const;
public:
/**
* UObject boilerplate: the address of fgClassID serves as the class ID.
*/
static inline UClassID getStaticClassID() {
return (UClassID)&fgClassID;
}
virtual UClassID getDynamicClassID() const {
return getStaticClassID();
}
#ifdef SERVICE_DEBUG
public:
// Debugging helpers, compiled only when SERVICE_DEBUG is defined.
virtual UnicodeString& debug(UnicodeString& result) const;
virtual UnicodeString& debugClass(UnicodeString& result) const;
#endif
private:
// Only the address of this variable is used (see getStaticClassID).
static const char fgClassID;
};
/*
******************************************************************
*/
/**
* A LocaleKeyFactory that just returns a single object for a kind/locale.
*/
class U_COMMON_API SimpleLocaleKeyFactory : public LocaleKeyFactory {
private:
// The single service object handed to the constructor as objToAdopt.
UObject* _obj;
// NOTE(review): presumably the locale ID this factory answers for,
// derived from the constructor's locale argument -- confirm in the .cpp.
UnicodeString _id;
const int32_t _kind;
public:
/**
* Construct a factory for objToAdopt, with the locale given as a string.
* The parameter name indicates the factory adopts (takes ownership of)
* the object.
*/
SimpleLocaleKeyFactory(UObject* objToAdopt,
const UnicodeString& locale,
int32_t kind,
int32_t coverage);
/**
* Construct a factory for objToAdopt, with the locale given as a Locale.
*/
SimpleLocaleKeyFactory(UObject* objToAdopt,
const Locale& locale,
int32_t kind,
int32_t coverage);
/**
* Destructor.
*/
virtual ~SimpleLocaleKeyFactory();
/**
* Override of superclass method. Returns the service object if kind/locale match. Service is not used.
*/
UObject* create(const ICUServiceKey& key, const ICUService* service, UErrorCode& status) const;
/**
* Override of superclass method. This adjusts the result based
* on the coverage rule for this factory.
*/
void updateVisibleIDs(Hashtable& result, UErrorCode& status) const;
protected:
/**
* Return true if this id is equal to the locale name.
*/
virtual UBool isSupportedID(const UnicodeString& id, UErrorCode& status) const;
public:
/**
* UObject boilerplate: the address of fgClassID serves as the class ID.
*/
static inline UClassID getStaticClassID() {
return (UClassID)&fgClassID;
}
virtual UClassID getDynamicClassID() const {
return getStaticClassID();
}
#ifdef SERVICE_DEBUG
public:
// Debugging helpers, compiled only when SERVICE_DEBUG is defined.
virtual UnicodeString& debug(UnicodeString& result) const;
virtual UnicodeString& debugClass(UnicodeString& result) const;
#endif
private:
// Only the address of this variable is used (see getStaticClassID).
static const char fgClassID;
};
/*
******************************************************************
*/
/**
* A LocaleKeyFactory that creates a service based on the ICU locale data.
* This is a base class for most ICU factories. Subclasses instantiate it
* with a constructor that takes a bundle name, which determines the supported
* IDs. Subclasses then override handleCreate to create the actual service
* object. The default implementation returns a resource bundle.
*/
class U_COMMON_API ICUResourceBundleFactory : public LocaleKeyFactory
{
protected:
// Name of the resource bundle; set by the constructors.
UnicodeString _bundleName;
public:
/**
* Convenience constructor that uses the main ICU bundle name.
*/
ICUResourceBundleFactory();
/**
* A service factory based on ICU resource data in resources
* with the given name.
*/
ICUResourceBundleFactory(const UnicodeString& bundleName);
protected:
/**
* Return the supported IDs. This is the set of all locale names in ICULocaleData.
*/
virtual const Hashtable* getSupportedIDs(UErrorCode& status) const;
/**
* Create the service. The default implementation returns the resource bundle
* for the locale, ignoring kind, and service.
*/
virtual UObject* handleCreate(const Locale& loc, int32_t kind, const ICUService* service, UErrorCode& status) const;
public:
/**
* UObject boilerplate: the address of fgClassID serves as the class ID.
*/
virtual UClassID getDynamicClassID() const {
return getStaticClassID();
}
static UClassID getStaticClassID() {
return (UClassID)&fgClassID;
}
#ifdef SERVICE_DEBUG
public:
// Debugging helpers, compiled only when SERVICE_DEBUG is defined.
virtual UnicodeString& debug(UnicodeString& result) const;
virtual UnicodeString& debugClass(UnicodeString& result) const;
#endif
private:
// Only the address of this variable is used (see getStaticClassID).
static const char fgClassID;
};
/*
******************************************************************
*/
/**
* An ICUService specialised for lookups by Locale. It adds get() and
* registerInstance() overloads that take a Locale (optionally with a
* "kind" code), and createKey() overrides that produce the keys used
* for those lookups.
*/
class U_COMMON_API ICULocaleService : public ICUService
{
private:
// NOTE(review): presumably the cached fallback locale and its name,
// maintained by validateFallbackLocale() under llock -- confirm in the .cpp.
Locale fallbackLocale;
UnicodeString fallbackLocaleName;
UMTX llock;
public:
/**
* Construct an ICULocaleService.
*/
ICULocaleService();
/**
* Construct an ICULocaleService with a name (useful for debugging).
*/
ICULocaleService(const UnicodeString& name);
/**
* Destructor.
*/
virtual ~ICULocaleService();
#if 0
// redeclare because of overload resolution rules?
// no, causes ambiguities since both UnicodeString and Locale have constructors that take a const char*
// need some compiler flag to remove warnings
UObject* get(const UnicodeString& descriptor, UErrorCode& status) const {
return ICUService::get(descriptor, status);
}
UObject* get(const UnicodeString& descriptor, UnicodeString* actualReturn, UErrorCode& status) const {
return ICUService::get(descriptor, actualReturn, status);
}
#endif
/**
* Convenience override for callers using locales. This calls
* get(locale, kind, actualReturn, status) with KIND_ANY for kind and
* null for actualReturn.
*/
UObject* get(const Locale& locale, UErrorCode& status) const;
/**
* Convenience override for callers using locales. This calls
* get(locale, kind, actualReturn, status) with a null actualReturn.
*/
UObject* get(const Locale& locale, int32_t kind, UErrorCode& status) const;
/**
* Convenience override for callers using locales. This calls
* get(locale, kind, actualReturn, status) with KIND_ANY for kind.
*/
UObject* get(const Locale& locale, Locale* actualReturn, UErrorCode& status) const;
/**
* Convenience override for callers using locales. This uses
* createKey(localeName, kind) to create a key, calls getKey, and then
* if actualReturn is not null, returns the actualResult from
* getKey (stripping any prefix) into a Locale.
*/
UObject* get(const Locale& locale, int32_t kind, Locale* actualReturn, UErrorCode& status) const;
/**
* Convenience override for callers using locales. This calls
* registerInstance(objToAdopt, locale, kind, coverage, status)
* passing KIND_ANY for the kind, and VISIBLE for the coverage.
*/
virtual URegistryKey registerInstance(UObject* objToAdopt, const Locale& locale, UErrorCode& status);
/**
* Convenience function for callers using locales. This calls
* registerInstance(objToAdopt, locale, kind, coverage, status)
* passing VISIBLE for the coverage.
*/
virtual URegistryKey registerInstance(UObject* objToAdopt, const Locale& locale, int32_t kind, UErrorCode& status);
/**
* Convenience function for callers using locales. This instantiates
* a SimpleLocaleKeyFactory, and registers the factory.
*/
virtual URegistryKey registerInstance(UObject* objToAdopt, const Locale& locale, int32_t kind, int32_t coverage, UErrorCode& status);
/**
* (Stop compiler from complaining about hidden overrides.)
* Since both UnicodeString and Locale have constructors that take const char*, adding a public
* method that takes UnicodeString causes ambiguity at call sites that use const char*.
* We really need a flag that is understood by all compilers that will suppress the warning about
* hidden overrides.
*/
virtual URegistryKey registerInstance(UObject* objToAdopt, const UnicodeString& locale, UBool visible, UErrorCode& status);
/**
* Convenience method for callers using locales. This returns the standard
* service ID enumeration.
*/
virtual StringEnumeration* getAvailableLocales(void) const;
protected:
/**
* Return the name of the current fallback locale. If it has changed since this was
* last accessed, the service cache is cleared.
*/
const UnicodeString& validateFallbackLocale() const;
/**
* Override superclass createKey method.
*/
virtual ICUServiceKey* createKey(const UnicodeString* id, UErrorCode& status) const;
/**
* Additional createKey that takes a kind.
*/
virtual ICUServiceKey* createKey(const UnicodeString* id, int32_t kind, UErrorCode& status) const;
// ServiceEnumeration is granted access to this class's non-public members.
friend class ServiceEnumeration;
};
// Temporary utility functions, kept here until a permanent home is found.
// They live in this header so that tests can also access them.
// All members are static; the class is used only as a namespace.
class U_COMMON_API LocaleUtility {
public:
// NOTE(review): presumably applies the canonicalization described on
// LocaleKey (lower-case language, upper-case remainder) -- confirm in the .cpp.
static UnicodeString& canonicalLocaleString(const UnicodeString* id, UnicodeString& result);
// Conversions between a locale name string and a Locale object.
static Locale& initLocaleFromName(const UnicodeString& id, Locale& result);
static UnicodeString& initNameFromLocale(const Locale& locale, UnicodeString& result);
static const Hashtable* getAvailableLocaleNames(const UnicodeString& bundleID);
static UBool isFallbackOf(const UnicodeString& root, const UnicodeString& child);
static UBool cleanup(void);
};
U_NAMESPACE_END
/* UCONFIG_NO_SERVICE */
#endif
/* ICULSERV_H */
#endif
--- NEW FILE: icunotif.cpp ---
/**
*******************************************************************************
* Copyright (C) 2001-2002, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
#include "unicode/utypes.h"
#if !UCONFIG_NO_SERVICE
#include "icunotif.h"
#include <stdio.h>
U_NAMESPACE_BEGIN
// Storage for EventListener's class-ID tag; getStaticClassID() returns its
// address, so the value itself is irrelevant.
const char EventListener::fgClassID = '\0';
void
ICUNotifier::addListener(const EventListener* l, UErrorCode& status)
{
if (U_SUCCESS(status)) {
if (l == NULL) {
status = U_ILLEGAL_ARGUMENT_ERROR;
}
if (acceptsListener(*l)) {
Mutex lmx(¬ifyLock);
if (listeners == NULL) {
listeners = new UVector(5, status);
} else {
for (int i = 0, e = listeners->size(); i < e; ++i) {
const EventListener* el = (const EventListener*)(listeners->elementAt(i));
if (l == el) {
return;
}
}
}
listeners->addElement((void*)l, status); // cast away const
} else {
#if DEBUG
fprintf(stderr, "Listener invalid for this notifier.");
exit(1);
#endif
}
}
}
/**
 * Unregister a listener previously passed to addListener().
 *
 * Does nothing if status already indicates failure. Sets status to
 * U_ILLEGAL_ARGUMENT_ERROR if l is NULL. A listener that is not registered
 * is silently ignored. When the last listener is removed the list itself
 * is deleted.
 *
 * @param l      the listener to remove, matched by pointer identity.
 * @param status in/out ICU error code.
 */
void
ICUNotifier::removeListener(const EventListener *l, UErrorCode& status)
{
    if (U_SUCCESS(status)) {
        if (l == NULL) {
            status = U_ILLEGAL_ARGUMENT_ERROR;
            return;
        }
        {
            Mutex lmx(&notifyLock);
            if (listeners != NULL) {
                // identity equality check
                for (int i = 0, e = listeners->size(); i < e; ++i) {
                    const EventListener* el = (const EventListener*)listeners->elementAt(i);
                    if (l == el) {
                        listeners->removeElementAt(i);
                        if (listeners->size() == 0) {
                            // Drop the empty list; addListener re-creates it on demand.
                            delete listeners;
                            listeners = NULL;
                        }
                        return;
                    }
                }
            }
        }
    }
}
/**
 * Call notifyListener() on every registered listener, in registration
 * order, in the calling thread. The notifier's lock is held for the
 * whole loop.
 */
void
ICUNotifier::notifyChanged(void)
{
    // Unlocked pre-check skips taking the mutex when nothing is registered;
    // the pointer is tested again once the lock is held.
    if (listeners != NULL) {
        Mutex lmx(&notifyLock);
        if (listeners != NULL) {
            for (int i = 0, e = listeners->size(); i < e; ++i) {
                EventListener* el = (EventListener*)listeners->elementAt(i);
                notifyListener(*el);
            }
        }
    }
}
U_NAMESPACE_END;
/* UCONFIG_NO_SERVICE */
#endif
--- NEW FILE: icunotif.h ---
/**
*******************************************************************************
* Copyright (C) 2001-2003, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
#ifndef ICUNOTIF_H
#define ICUNOTIF_H
#include "unicode/utypes.h"
#if UCONFIG_NO_SERVICE
U_NAMESPACE_BEGIN
/*
* Allow the declaration of APIs with pointers to ICUNotifier
* even when the service framework is removed from the build
* (UCONFIG_NO_SERVICE).
*/
class ICUNotifier;
U_NAMESPACE_END
#else
#include "unicode/uobject.h"
#include "unicode/unistr.h"
#include "mutex.h"
#include "uvector.h"
U_NAMESPACE_BEGIN
/**
* Base class for listeners that can be registered with an ICUNotifier.
* It carries no state of its own; it supplies only the UObject class-ID
* boilerplate and two debugging hooks.
*/
class U_COMMON_API EventListener : public UObject {
public:
virtual ~EventListener() {}
public:
// UObject boilerplate: the address of fgClassID serves as the class ID.
static inline UClassID getStaticClassID() {
return (UClassID)&fgClassID;
}
virtual UClassID getDynamicClassID() const {
return getStaticClassID();
}
public:
// Append a debug description to result; the default forwards to debugClass().
virtual UnicodeString& debug(UnicodeString& result) const {
return debugClass(result);
}
// Append a class label to result.
// NOTE(review): the literal appended is "Key", which looks copied from a
// key class -- confirm whether that is intended for listeners.
virtual UnicodeString& debugClass(UnicodeString& result) const {
return result.append("Key");
}
private:
// Only the address of this variable is used (see getStaticClassID).
static const char fgClassID;
};
/**
 * <p>Abstract implementation of a notification facility. Clients add
 * EventListeners with addListener and remove them with removeListener.
 * Notifiers call notifyChanged when they wish to notify listeners;
 * notifyChanged calls notifyListener on each registered listener, in the
 * thread of the caller (no separate notification thread is used).</p>
 *
 * <p>Subclasses override acceptsListener and notifyListener
 * to add type-safe notification. AcceptsListener should return
 * true if the listener is of the appropriate type; ICUNotifier
 * itself will ensure the listener is non-null and that the
 * identical listener is not already registered with the Notifier.
 * NotifyListener should cast the listener to the appropriate
 * type and call the appropriate method on the listener.</p>
 */
class U_COMMON_API ICUNotifier : public UMemory {
private: UMTX notifyLock;     // guards access to listeners
private: UVector* listeners;  // registered listeners; NULL when there are none
public:
    /**
     * Construct a notifier with no listeners and initialise its mutex.
     */
    ICUNotifier(void)
        : notifyLock(0), listeners(NULL)
    {
        umtx_init(&notifyLock);
    }
    /**
     * Delete the listener list (under the lock), then destroy the mutex.
     */
    virtual ~ICUNotifier(void) {
        {
            // Inner scope so the Mutex guard is released before umtx_destroy.
            Mutex lmx(&notifyLock);
            delete listeners;
            listeners = NULL;
        }
        umtx_destroy(&notifyLock);
    }
    /**
     * Add a listener to be notified when notifyChanged is called.
     * The listener must not be null. AcceptsListener must return
     * true for the listener. Attempts to concurrently
     * register the identical listener more than once will be
     * silently ignored.
     */
    virtual void addListener(const EventListener* l, UErrorCode& status);
    /**
     * Stop notifying this listener. The listener must
     * not be null. Attempts to remove a listener that is
     * not registered will be silently ignored.
     */
    virtual void removeListener(const EventListener* l, UErrorCode& status);
    /**
     * ICU doesn't spawn its own threads. All listeners are notified in
     * the thread of the caller. Misbehaved listeners can therefore
     * indefinitely block the calling thread. Callers should beware of
     * deadlock situations.
     */
    virtual void notifyChanged(void);
protected:
    /**
     * Subclasses implement this to return TRUE if the listener is
     * of the appropriate type.
     */
    virtual UBool acceptsListener(const EventListener& l) const = 0;
    /**
     * Subclasses implement this to notify the listener.
     */
    virtual void notifyListener(EventListener& l) const = 0;
};
U_NAMESPACE_END
/* UCONFIG_NO_SERVICE */
#endif
/* ICUNOTIF_H */
#endif
--- NEW FILE: icuserv.cpp ---
/**
*******************************************************************************
* Copyright (C) 2001-2003, International Business Machines Corporation. *
* All Rights Reserved. *
*******************************************************************************
*/
#include "unicode/utypes.h"
#if !UCONFIG_NO_SERVICE
#include "icuserv.h"
#include "umutex.h"
#undef SERVICE_REFCOUNT
// in case we use the refcount stuff
U_NAMESPACE_BEGIN
[...1146 lines suppressed...]
{
return result.append(name);
}
/**
 * Number of entries in the factories list; 0 when the list has not
 * been created.
 */
int32_t
ICUService::countFactories() const
{
    if (factories == NULL) {
        return 0;
    }
    return factories->size();
}
/**
 * Return the current value of the service's timestamp field.
 */
int32_t
ICUService::getTimestamp() const
{
    const int32_t current = timestamp;
    return current;
}
U_NAMESPACE_END
/* UCONFIG_NO_SERVICE */
#endif
--- NEW FILE: icuserv.h ---
/**
*******************************************************************************
* Copyright (C) 2001-2003, International Business Machines Corporation. *
* All Rights Reserved. *
*******************************************************************************
*/
#ifndef ICUSERV_H
#define ICUSERV_H
#include "unicode/utypes.h"
#if UCONFIG_NO_SERVICE
U_NAMESPACE_BEGIN
/*
* Allow the declaration of APIs with pointers to ICUService
* even when service is removed from the build.
[...988 lines suppressed...]
/**
* <p>Return the number of registered factories.</p>
*
* <p>Not part of the public interface; the friend declaration that
* follows exists to give tests access to it.</p>
*
* @return the number of factories registered at the time of the call.
*/
int32_t countFactories(void) const;
private:
friend class ::ICUServiceTest; // give tests access to countFactories.
};
U_NAMESPACE_END
/* UCONFIG_NO_SERVICE */
#endif
/* ICUSERV_H */
#endif
--- NEW FILE: nameprep.cpp ---
/*
*******************************************************************************
*
* Copyright (C) 2003, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: nameprep.cpp
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2003feb1
* created by: Ram Viswanadha
*/
#include "unicode/utypes.h"
#if !UCONFIG_NO_IDNA
#include "nameprep.h"
// *****************************************************************************
// class NamePrep
// *****************************************************************************
U_NAMESPACE_BEGIN
// Storage for NamePrep's class-ID tag; only its address is used
// (see NamePrep::getStaticClassID), so the value is irrelevant.
const char NamePrep::fgClassID=0;
// Constructor: switches on both NFKC normalization and the bidirectional
// check for this profile. The status argument is not read or written here.
// NOTE(review): doNFKC and bidiCheck are not declared in NamePrep itself, so
// they are presumably inherited from StringPrep -- confirm.
NamePrep::NamePrep(UErrorCode& status){
    doNFKC    = TRUE;
    bidiCheck = TRUE;
}
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_IDNA */
--- NEW FILE: nameprep.h ---
/*
*******************************************************************************
*
* Copyright (C) 2003, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: nameprep.h
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2003feb1
* created by: Ram Viswanadha
*/
#ifndef NAMEPREP_H
#define NAMEPREP_H
#include "unicode/utypes.h"
#if !UCONFIG_NO_IDNA
#include "strprep.h"
#include "unicode/uniset.h"
U_NAMESPACE_BEGIN
/*
A profile of stringprep MUST include all of the following:
- The intended applicability of the profile
- The character repertoire that is the input and output to stringprep
(which is Unicode 3.2 for this version of stringprep)
- The mapping tables from this document used (as described in section
3)
- Any additional mapping tables specific to the profile
- The Unicode normalization used, if any (as described in section 4)
- The tables from this document of characters that are prohibited as
output (as described in section 5)
- The bidirectional string testing used, if any (as described in
section 6)
- Any additional characters that are prohibited as output specific to
the profile
*/
/**
* The stringprep profile class for nameprep, derived from StringPrep.
* It adds no data members beyond the class-ID tag; its constructor
* (see nameprep.cpp) enables the bidi check and NFKC normalization.
*/
class NamePrep: public StringPrep {
public :
/** Construct the profile. */
NamePrep(UErrorCode& status);
virtual inline ~NamePrep(){};
/**
* Defined inline below the class: returns TRUE only for U+0020.
* NOTE(review): presumably consulted by StringPrep as an exception to
* its prohibited-character table -- confirm against strprep.h.
*/
virtual inline UBool isNotProhibited(UChar32 ch);
/**
* ICU "poor man's RTTI", returns a UClassID for this class.
*
* @draft ICU 2.6
*/
static inline UClassID getStaticClassID() { return (UClassID)&fgClassID; }
/**
* ICU "poor man's RTTI", returns a UClassID for the actual class.
*
* @draft ICU 2.6
*/
virtual inline UClassID getDynamicClassID() const { return getStaticClassID(); }
private:
/**
* The address of this static class variable serves as this class's ID
* for ICU "poor man's RTTI".
*/
static const char fgClassID;
};
/**
 * Return TRUE if and only if ch is U+0020 (ASCII space).
 */
inline UBool NamePrep::isNotProhibited(UChar32 ch){
    const UChar32 asciiSpace = 0x0020;
    return (UBool)(asciiSpace == ch);
}
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_IDNA */
#endif
/*
* Hey, Emacs, please set the following:
*
* Local Variables:
* indent-tabs-mode: nil
* End:
*
*/
--- NEW FILE: propname.cpp ---
/*
**********************************************************************
* Copyright (c) 2002-2003, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Author: Alan Liu
* Created: October 30 2002
* Since: ICU 2.4
**********************************************************************
*/
#include "propname.h"
#include "unicode/uchar.h"
#include "unicode/udata.h"
#include "umutex.h"
U_NAMESPACE_BEGIN
//----------------------------------------------------------------------
// PropertyAliases implementation
/**
 * Pick one name out of a name group.
 *
 * A name group is an array of Offsets starting at `offset`; a negative
 * entry marks the last entry of the group and its absolute value is the
 * real offset.
 *
 * @param offset offset of the name group, or 0 for "no group".
 * @param choice zero-based index of the wanted name within the group.
 * @return the selected name, or NULL if offset is 0, choice is negative,
 *         the group has fewer than choice+1 entries, or the selected
 *         entry is 0 (absent).
 */
const char*
PropertyAliases::chooseNameInGroup(Offset offset,
                                   UPropertyNameChoice choice) const {
    const int32_t wanted = choice;
    if (offset == 0 || wanted < 0) {
        return NULL;
    }

    const Offset* entry = (const Offset*) getPointer(offset);
    for (int32_t skipped = 0; skipped < wanted; ++skipped) {
        // Hitting the end-of-group marker before reaching the wanted
        // index means the requested name does not exist.
        if (*entry < 0) {
            return NULL;
        }
        ++entry;
    }

    Offset nameOffset = *entry;
    if (nameOffset < 0) {
        nameOffset = -nameOffset;   // strip the end-of-group marker
    }
    return (const char*) getPointerNull(nameOffset);
}
/**
 * Find the ValueMap for a property.
 *
 * @param prop property enum value.
 * @return the property's ValueMap, or NULL if the enum-to-value table
 *         yields a zero offset for prop.
 */
const ValueMap*
PropertyAliases::getValueMap(EnumValue prop) const {
    NonContiguousEnumToOffset* valueIndex =
        (NonContiguousEnumToOffset*) getPointer(enumToValue_offset);
    const Offset mapOffset = valueIndex->getOffset(prop);
    if (!mapOffset) {
        return (const ValueMap*) NULL;
    }
    return (const ValueMap*) getPointerNull(mapOffset);
}
/**
 * Return a name of property `prop`: look up the property's name group
 * in the enum-to-name table, then select entry `choice` from that group.
 * Returns whatever chooseNameInGroup returns (NULL when there is no name).
 */
inline const char*
PropertyAliases::getPropertyName(EnumValue prop,
                                 UPropertyNameChoice choice) const {
    NonContiguousEnumToOffset* nameIndex =
        (NonContiguousEnumToOffset*) getPointer(enumToName_offset);
    const Offset nameGroup = nameIndex->getOffset(prop);
    return chooseNameInGroup(nameGroup, choice);
}
/**
 * Map a property alias to its enum value using the top-level
 * name-to-enum table; the result is whatever NameToEnum::getEnum returns.
 */
inline EnumValue
PropertyAliases::getPropertyEnum(const char* alias) const {
    const int8_t* table = getPointer(nameToEnum_offset);
    return ((NameToEnum*) table)->getEnum(alias, *this);
}
/**
 * Return a name of value `value` of property `prop`.
 *
 * The property's ValueMap holds either a contiguous (EnumToOffset) or a
 * non-contiguous (NonContiguousEnumToOffset) enum-to-name table; the
 * contiguous one is used when its offset is nonzero.
 *
 * @return the selected name, or NULL if the property has no ValueMap or
 *         chooseNameInGroup finds no such name.
 */
inline const char*
PropertyAliases::getPropertyValueName(EnumValue prop,
                                      EnumValue value,
                                      UPropertyNameChoice choice) const {
    const ValueMap* map = getValueMap(prop);
    if (map == NULL) {
        return NULL;
    }

    Offset nameGroup;
    if (map->enumToName_offset == 0) {
        // No contiguous table: use the non-contiguous one.
        NonContiguousEnumToOffset* sparse =
            (NonContiguousEnumToOffset*) getPointer(map->ncEnumToName_offset);
        nameGroup = sparse->getOffset(value);
    } else {
        EnumToOffset* dense = (EnumToOffset*) getPointer(map->enumToName_offset);
        nameGroup = dense->getOffset(value);
    }
    return chooseNameInGroup(nameGroup, choice);
}
/**
 * Map a value alias of property `prop` to its enum value.
 *
 * @return UCHAR_INVALID_CODE if the property has no ValueMap; otherwise
 *         the result of the ValueMap's name-to-enum lookup.
 */
inline EnumValue
PropertyAliases::getPropertyValueEnum(EnumValue prop,
                                      const char* alias) const {
    const ValueMap* map = getValueMap(prop);
    if (map == NULL) {
        return UCHAR_INVALID_CODE;
    }
    return ((NameToEnum*) getPointer(map->nameToEnum_offset))->getEnum(alias, *this);
}
U_NAMESPACE_END
//----------------------------------------------------------------------
// UDataMemory structures
// PNAME points at the loaded pnames data, viewed as a PropertyAliases;
// NULL until _load() succeeds and again after pname_cleanup().
static const PropertyAliases* PNAME = NULL;
// UDATA is the udata handle that owns the memory PNAME points into;
// it is closed by pname_cleanup().
static UDataMemory* UDATA = NULL;
//----------------------------------------------------------------------
// UDataMemory loading/unloading
/**
* udata callback to verify the property names (pnames) data.
*/
U_CDECL_BEGIN
// Accept a data item only if its UDataInfo header is large enough,
// matches this platform's endianness and charset family, carries the
// pnames signature bytes, and has the expected major format version.
// The context, type and name arguments are ignored.
static UBool U_CALLCONV
isAcceptable(void* /*context*/,
             const char* /*type*/, const char* /*name*/,
             const UDataInfo* info) {
    if (!(info->size >= sizeof(UDataInfo))) {
        return FALSE;
    }
    if (info->isBigEndian != U_IS_BIG_ENDIAN ||
        info->charsetFamily != U_CHARSET_FAMILY) {
        return FALSE;
    }
    if (info->dataFormat[0] != PNAME_SIG_0 ||
        info->dataFormat[1] != PNAME_SIG_1 ||
        info->dataFormat[2] != PNAME_SIG_2 ||
        info->dataFormat[3] != PNAME_SIG_3) {
        return FALSE;
    }
    if (info->formatVersion[0] != PNAME_FORMAT_VERSION) {
        return FALSE;
    }
    return TRUE;
}
// Release the property names data: close the udata handle if one is
// open and reset both file-level pointers. Always returns TRUE.
UBool
pname_cleanup() {
    if (UDATA != NULL) {
        udata_close(UDATA);
        UDATA = NULL;
    }
    PNAME = NULL;
    return TRUE;
}
U_CDECL_END
/**
 * Open the property names data and install it in UDATA/PNAME.
 * Callers are expected to have checked that the data is not yet
 * loaded. If the data was installed in the meantime (UDATA is already
 * set when the lock is taken), the freshly opened copy is closed and
 * the installed one is kept.
 *
 * @return TRUE if PNAME is non-NULL on exit.
 */
static UBool _load() {
    UErrorCode status = U_ZERO_ERROR;
    UDataMemory* opened =
        udata_openChoice(0, PNAME_DATA_TYPE, PNAME_DATA_NAME,
                         isAcceptable, 0, &status);

    if (U_SUCCESS(status)) {
        umtx_lock(NULL);
        if (UDATA == NULL) {
            // Install our copy and give up local ownership so that it
            // is not closed below.
            UDATA = opened;
            PNAME = (const PropertyAliases*) udata_getMemory(UDATA);
            opened = NULL;
        }
        umtx_unlock(NULL);
    }

    // Still owned locally: either the open failed or the data was
    // already installed.
    if (opened) {
        udata_close(opened);
    }
    return PNAME != NULL;
}
/**
 * Lazy-load entry point for the property names data. Reads PNAME
 * between umtx_lock(NULL) and umtx_unlock(NULL); if it is already set,
 * returns TRUE without calling _load(). Otherwise returns the result
 * of _load().
 */
static inline UBool load() {
    UBool alreadyLoaded;
    umtx_lock(NULL);
    alreadyLoaded = (PNAME != NULL);
    umtx_unlock(NULL);
    if (!alreadyLoaded) {
        return _load();
    }
    return TRUE;
}
//----------------------------------------------------------------------
// Public API implementation
// The C API is just a thin wrapper. Each function obtains a pointer
// to the singleton PropertyAliases, and calls the appropriate method
// on it. If it cannot obtain a pointer, because valid data is not
// available, then it returns NULL or UCHAR_INVALID_CODE.
// C API: name of a property. Returns NULL when the pnames data cannot
// be loaded; otherwise forwards to PropertyAliases::getPropertyName.
U_CAPI const char* U_EXPORT2
u_getPropertyName(UProperty property,
                  UPropertyNameChoice nameChoice) {
    if (!load()) {
        return NULL;
    }
    return PNAME->getPropertyName(property, nameChoice);
}
// C API: property enum for an alias. Returns UCHAR_INVALID_CODE when the
// pnames data cannot be loaded; otherwise forwards to
// PropertyAliases::getPropertyEnum.
U_CAPI UProperty U_EXPORT2
u_getPropertyEnum(const char* alias) {
    if (!load()) {
        return UCHAR_INVALID_CODE;
    }
    return (UProperty) PNAME->getPropertyEnum(alias);
}
// C API: name of a property value. Returns NULL when the pnames data
// cannot be loaded; otherwise forwards to
// PropertyAliases::getPropertyValueName.
U_CAPI const char* U_EXPORT2
u_getPropertyValueName(UProperty property,
                       int32_t value,
                       UPropertyNameChoice nameChoice) {
    if (!load()) {
        return NULL;
    }
    return PNAME->getPropertyValueName(property, value, nameChoice);
}
// C API: enum of a property value alias. Returns UCHAR_INVALID_CODE when
// the pnames data cannot be loaded; otherwise forwards to
// PropertyAliases::getPropertyValueEnum.
U_CAPI int32_t U_EXPORT2
u_getPropertyValueEnum(UProperty property,
                       const char* alias) {
    if (!load()) {
        return UCHAR_INVALID_CODE;
    }
    return PNAME->getPropertyValueEnum(property, alias);
}
//eof
--- NEW FILE: propname.h ---
/*
**********************************************************************
* Copyright (c) 2002, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Author: Alan Liu
* Created: October 30 2002
* Since: ICU 2.4
**********************************************************************
*/
#ifndef PROPNAME_H
#define PROPNAME_H
#include "unicode/utypes.h"
#include "unicode/uchar.h"
#include "uprops.h"
class Builder;
U_NAMESPACE_BEGIN
// This header defines the in-memory layout of the property names data
// structure representing the UCD data files PropertyAliases.txt and
// PropertyValueAliases.txt. It is used by:
// propname.cpp - reads data
// genpname - creates data
//----------------------------------------------------------------------
// UDataMemory structure and signatures
#define PNAME_DATA_NAME "pnames"
#define PNAME_DATA_TYPE "icu"
// Fields in UDataInfo:
// PNAME_SIG[] is encoded as numeric literals for compatibility with the HP compiler
#define PNAME_SIG_0 ((uint8_t)0x70) /* p */
#define PNAME_SIG_1 ((uint8_t)0x6E) /* n */
#define PNAME_SIG_2 ((uint8_t)0x61) /* a */
#define PNAME_SIG_3 ((uint8_t)0x6D) /* m */
#define PNAME_FORMAT_VERSION ((int8_t)1) /* formatVersion[0] */
/**
* An offset from the start of the pnames data to a contained entity.
* This must be a signed value, since negative offsets are used as an
* end-of-list marker. Offsets to actual objects are non-zero. A
* zero offset indicates an absent entry; this corresponds to aliases
* marked "n/a" in the original Unicode data files.
*/
typedef int16_t Offset; // must be signed
#define MAX_OFFSET 0x7FFF
/**
* A generic value for a property or property value. Typically an
* enum from uchar.h, but sometimes a non-enum value. It must be
* large enough to accommodate the largest enum value, which as of this
* writing is the largest general category mask. Need not be signed
* but may be. Typically it doesn't matter, since the caller will
* cast it to the proper type before use. Takes the special value
* UCHAR_INVALID_CODE for invalid input.
*/
typedef int32_t EnumValue;
//----------------------------------------------------------------------
// ValueMap
/**
* For any top-level property that has named values (binary and
* enumerated properties), there is a ValueMap object. This object
* maps from enum values to two other maps. One goes from value enums
* to value names. The other goes from value names to value enums.
*
* The value enum values may be contiguous or disjoint. If they are
* contiguous then the enumToName_offset is nonzero, and the
* ncEnumToName_offset is zero. Vice versa if the value enums are
* disjoint.
*
* There are n of these objects, where n is the number of binary
* properties + the number of enumerated properties.
*
* The fields mirror the pnames data layout, so their order and types
* must not be changed independently of the data format.
*/
struct ValueMap {
// -- begin pnames data --
// Enum=>name EnumToOffset / NonContiguousEnumToOffset objects.
// Exactly one of these will be nonzero.
Offset enumToName_offset;
Offset ncEnumToName_offset;
Offset nameToEnum_offset; // Name=>enum data
// -- end pnames data --
};
//----------------------------------------------------------------------
// PropertyAliases class
/**
* A class encapsulating access to the memory-mapped data representing
* property aliases and property value aliases (pnames). The class
* MUST have no v-table and declares certain methods inline -- small
* methods and methods that are called from only one point.
*
* The data members in this class correspond to the in-memory layout
* of the header of the pnames data. All Offsets are relative to the
* address of this object (see getPointer).
*/
class PropertyAliases {
// -- begin pnames data --
// Enum=>name EnumToOffset object for binary and enumerated
// properties
Offset enumToName_offset;
// Name=>enum data for binary & enumerated properties
Offset nameToEnum_offset;
// Enum=>offset EnumToOffset object mapping enumerated properties
// to ValueMap objects
Offset enumToValue_offset;
// The following are needed by external readers of this data.
// We don't use them ourselves.
int16_t total_size; // size in bytes excluding the udata header
Offset valueMap_offset; // offset to start of array
int16_t valueMap_count; // number of entries
Offset nameGroupPool_offset; // offset to start of array
int16_t nameGroupPool_count; // number of entries (not groups)
Offset stringPool_offset; // offset to start of pool
int16_t stringPool_count; // number of strings (not size in bytes)
// -- end pnames data --
// Builder (global namespace) is granted access to the private members.
friend class ::Builder;
// Private lookup helpers; defined in propname.cpp.
const ValueMap* getValueMap(EnumValue prop) const;
const char* chooseNameInGroup(Offset offset,
UPropertyNameChoice choice) const;
public:
// Convert an Offset into an address: `this` plus o bytes.
inline const int8_t* getPointer(Offset o) const {
return ((const int8_t*) this) + o;
}
// Like getPointer, but a zero offset (absent entry) yields NULL.
inline const int8_t* getPointerNull(Offset o) const {
return o ? getPointer(o) : NULL;
}
// Property-level and value-level lookups in both directions
// (enum => name and name => enum); defined in propname.cpp.
inline const char* getPropertyName(EnumValue prop,
UPropertyNameChoice choice) const;
inline EnumValue getPropertyEnum(const char* alias) const;
inline const char* getPropertyValueName(EnumValue prop, EnumValue value,
UPropertyNameChoice choice) const;
inline EnumValue getPropertyValueEnum(EnumValue prop,
const char* alias) const;
};
//----------------------------------------------------------------------
// EnumToOffset
/**
* A generic map from enum values to Offsets. The enum values must be
* contiguous, from enumStart to enumLimit. The Offset values may
* point to anything.
*/
class EnumToOffset {
    // -- begin pnames data --
    // Member order and types mirror the pnames data layout.
    EnumValue enumStart;    // first enum value covered by the map
    EnumValue enumLimit;    // one past the last enum value covered
    Offset _offsetArray;    // [array of enumLimit-enumStart]
    // -- end pnames data --

    friend class ::Builder;

    // The offset array begins at _offsetArray and continues past the
    // declared end of the object.
    Offset* getOffsetArray() { return &_offsetArray; }

    const Offset* getOffsetArray() const { return &_offsetArray; }

    // Number of bytes occupied by a map holding n offsets; one Offset is
    // already accounted for in sizeof(EnumToOffset).
    static int32_t getSize(int32_t n) {
        return sizeof(Offset) * (n - 1) + sizeof(EnumToOffset);
    }

public:
    // Returns the Offset stored for enumProbe, or 0 when enumProbe lies
    // outside [enumStart, enumLimit).
    Offset getOffset(EnumValue enumProbe) const {
        if (enumStart <= enumProbe && enumProbe < enumLimit) {
            return getOffsetArray()[enumProbe - enumStart];
        }
        return 0; // not found
    }
};
//----------------------------------------------------------------------
// NonContiguousEnumToOffset
/**
* A generic map from enum values to Offsets. The enum values may be
* disjoint. If they are contiguous, an EnumToOffset should be used
* instead. The Offset values may point to anything.
*/
class NonContiguousEnumToOffset {
    // -- begin pnames data --
    // Member order and types mirror the pnames data layout.
    int32_t count;
    EnumValue _enumArray;   // [array of count]
    // Offset _offsetArray; // [array of count] after enumValue[count-1]
    // -- end pnames data --

    friend class ::Builder;

    EnumValue* getEnumArray() { return &_enumArray; }

    const EnumValue* getEnumArray() const { return &_enumArray; }

    // The offsets are stored immediately after the count enum values.
    Offset* getOffsetArray() {
        return (Offset*) (getEnumArray() + count);
    }

    const Offset* getOffsetArray() const {
        return (const Offset*) (getEnumArray() + count);
    }

    // Number of bytes occupied by a map with n (enum, offset) entries.
    static int32_t getSize(int32_t n) {
        return (sizeof(EnumValue) + sizeof(Offset)) * n + sizeof(int32_t);
    }

public:
    // Returns the Offset paired with enumProbe, or 0 if it is not listed.
    Offset getOffset(EnumValue enumProbe) const {
        const EnumValue* enums = getEnumArray();
        const Offset* offsets = getOffsetArray();

        // linear search; binary later if warranted
        // (binary is not faster for short lists)
        // Skip every entry below the probe; the first entry that is not
        // below it decides the outcome (the scan relies on ascending order).
        int32_t idx = 0;
        while (idx < count && enums[idx] < enumProbe) {
            ++idx;
        }
        if (idx < count && enums[idx] == enumProbe) {
            return offsets[idx];
        }
        return 0; // not found
    }
};
//----------------------------------------------------------------------
// NameToEnum
/**
* A map from names to enum values.
*/
class NameToEnum {
    // -- begin pnames data --
    // Member order and types mirror the pnames data layout.
    int32_t count;          // number of entries
    EnumValue _enumArray;   // [array of count] EnumValues
    // Offset _nameArray;   // [array of count] offsets to names
    // -- end pnames data --

    friend class ::Builder;

    EnumValue* getEnumArray() { return &_enumArray; }

    const EnumValue* getEnumArray() const { return &_enumArray; }

    // The name offsets are stored immediately after the count enum values.
    Offset* getNameArray() {
        return (Offset*) (getEnumArray() + count);
    }

    const Offset* getNameArray() const {
        return (const Offset*) (getEnumArray() + count);
    }

    // Number of bytes occupied by a map with n (enum, name) entries.
    static int32_t getSize(int32_t n) {
        return (sizeof(EnumValue) + sizeof(Offset)) * n + sizeof(int32_t);
    }

public:
    // Returns the enum value whose name matches alias according to
    // uprv_comparePropertyNames(), or UCHAR_INVALID_CODE if none does.
    // Name offsets are resolved through data.getPointer().
    EnumValue getEnum(const char* alias, const PropertyAliases& data) const {
        const Offset* nameOffsets = getNameArray();
        const EnumValue* enums = getEnumArray();

        // linear search; binary later if warranted
        // (binary is not faster for short lists)
        for (int32_t idx = 0; idx < count; ++idx) {
            const char* candidate =
                (const char*) data.getPointer(nameOffsets[idx]);
            const int32_t cmp = uprv_comparePropertyNames(alias, candidate);
            if (cmp == 0) {
                return enums[idx];
            }
            if (cmp < 0) {
                // alias sorts before this entry: stop scanning early
                break;
            }
        }
        return UCHAR_INVALID_CODE;
    }
};
/*----------------------------------------------------------------------
*
* In-memory layout. THIS IS NOT A STANDALONE DOCUMENT. It goes
* together with above C++ declarations and gives an overview.
*
* See above for definitions of Offset and EnumValue. Also, refer to
* above class declarations for the "bottom line" on data layout.
*
* Sizes:
* '*_offset' is an Offset (see above)
* 'count' members are typically int32_t (see above declarations)
* 'enumArray' is an array of EnumValue (see above)
* 'offsetArray' is an array of Offset (see above)
* 'nameArray' is an array of Offset (see above)
* 'enum*' is an EnumValue (see above)
* '*Array [x n]' means that *Array has n elements
*
* References:
* Instead of pointers, this flat data structure contains offsets.
* All offsets are relative to the start of 'header'. A notation
* is used to indicate what structure each offset points to:
* 'foo (>x)' the offset(s) in foo point to structure x
*
* Structures:
* Each structure is assigned a number, except for the header,
* which is called 'header'. The numbers are not contiguous
* for historical reasons. Some structures have sub-parts
* that are denoted with a letter, e.g., "5a".
*
* BEGIN LAYOUT
* ============
* header:
* enumToName_offset (>0)
* nameToEnum_offset (>2)
* enumToValue_offset (>3)
* (alignment padding built into header)
*
* 0: # NonContiguousEnumToOffset obj for props => name groups
* count
* enumArray [x count]
* offsetArray [x count] (>98)
*
* => pad to next 4-byte boundary
*
* (1: omitted -- no longer used)
*
* 2: # NameToEnum obj for binary & enumerated props
* count
* enumArray [x count]
* nameArray [x count] (>99)
*
* => pad to next 4-byte boundary
*
* 3: # NonContiguousEnumToOffset obj for enumerated props => ValueMaps
* count
* enumArray [x count]
* offsetArray [x count] (>4)
*
* => pad to next 4-byte boundary
*
* 4: # ValueMap array [x one for each enumerated prop i]
* enumToName_offset (>5a +2*i) one of these two is NULL, one is not
* ncEnumToName_offset (>5b +2*i)
* nameToEnum_offset (>6 +2*i)
*
* => pad to next 4-byte boundary
*
* for each enumerated prop (either 5a or 5b):
*
* 5a: # EnumToOffset for enumerated prop's values => name groups
* enumStart
* enumLimit
* offsetArray [x enumLimit - enumStart] (>98)
*
* => pad to next 4-byte boundary
*
* 5b: # NonContiguousEnumToOffset for enumerated prop's values => name groups
* count
* enumArray [x count]
* offsetArray [x count] (>98)
*
* => pad to next 4-byte boundary
*
* 6: # NameToEnum for enumerated prop's values
* count
* enumArray [x count]
* nameArray [x count] (>99)
*
* => pad to next 4-byte boundary
*
* 98: # name group pool {NGP}
* [array of Offset values] (>99)
*
* 99: # string pool {SP}
* [pool of nul-terminated char* strings]
*/
U_NAMESPACE_END
#endif
//eof
--- NEW FILE: punycode.c ---
/*
*******************************************************************************
*
* Copyright (C) 2002-2003, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: punycode.c
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2002jan31
* created by: Markus W. Scherer
*/
/* This ICU code derived from: */
/*
punycode.c 0.4.0 (2001-Nov-17-Sat)
http://www.cs.berkeley.edu/~amc/idn/
Adam M. Costello
http://www.nicemice.net/amc/
Disclaimer and license
Regarding this entire document or any portion of it (including
the pseudocode and C code), the author makes no guarantees and
is not responsible for any damage resulting from its use. The
author grants irrevocable permission to anyone to use, modify,
and distribute it in any way that does not diminish the rights
of anyone else to use, modify, and distribute it, provided that
redistributed derivative works do not contain misleading author or
version information. Derivative works need not be licensed under
similar terms.
*/
/*
* ICU modifications:
* - ICU data types and coding conventions
* - ICU string buffer handling with implicit source lengths
* and destination preflighting
* - UTF-16 handling
*/
#include "unicode/utypes.h"
#if !UCONFIG_NO_IDNA
#include "ustr_imp.h"
#include "cstring.h"
#include "cmemory.h"
#include "punycode.h"
#include "unicode/ustring.h"
/* Punycode ----------------------------------------------------------------- */
/* Punycode parameters for Bootstring */
#define BASE 36
#define TMIN 1
#define TMAX 26
#define SKEW 38
#define DAMP 700
#define INITIAL_BIAS 72
#define INITIAL_N 0x80
/* "Basic" Unicode/ASCII code points */
#define _HYPHEN 0X2d
#define DELIMITER _HYPHEN
#define _ZERO_ 0X30
#define _NINE 0x39
#define _SMALL_A 0X61
#define _SMALL_Z 0X7a
#define _CAPITAL_A 0X41
#define _CAPITAL_Z 0X5a
#define IS_BASIC(c) ((c)<0x80)
#define IS_BASIC_UPPERCASE(c) (_CAPITAL_A<=(c) && (c)<=_CAPITAL_Z)
/**
* digitToBasic() returns the basic code point whose value
* (when used for representing integers) is d, which must be in the
* range 0 to BASE-1. The lowercase form is used unless the uppercase flag is
* nonzero, in which case the uppercase form is used.
*/
static U_INLINE char
digitToBasic(int32_t digit, UBool uppercase) {
    /* 26..35 map to ASCII 0..9 */
    if(digit>=26) {
        return (char)((_ZERO_-26)+digit);
    }

    /* 0..25 map to ASCII a..z or A..Z, depending on the requested case */
    return (char)((uppercase ? _CAPITAL_A : _SMALL_A)+digit);
}
/**
* basicToDigit[] contains the numeric value of a basic code
* point (for use in representing integers) in the range 0 to
* BASE-1, or -1 if the code point does not represent a digit value.
*
* The table has 256 entries (16 per row). Rows 0x30..0x3f hold '0'..'9'
* (26..35), rows 0x40..0x5f hold 'A'..'Z' and rows 0x60..0x7f hold
* 'a'..'z' (both 0..25); every other entry is -1.
*/
static const int8_t
basicToDigit[256]={
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1,
-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
};
/*
 * Returns b converted to uppercase (uppercase!=0) or lowercase
 * (uppercase==0) if it is an ASCII letter of the opposite case;
 * any other value is returned unchanged.
 */
static U_INLINE char
asciiCaseMap(char b, UBool uppercase) {
    if(uppercase && _SMALL_A<=b && b<=_SMALL_Z) {
        /* a..z -> A..Z */
        return (char)(b-(_SMALL_A-_CAPITAL_A));
    }
    if(!uppercase && _CAPITAL_A<=b && b<=_CAPITAL_Z) {
        /* A..Z -> a..z */
        return (char)(b+(_SMALL_A-_CAPITAL_A));
    }
    return b;
}
/* Punycode-specific Bootstring code ---------------------------------------- */
/*
* The following code omits the {parts} of the pseudo-algorithm in the spec
* that are not used with the Punycode parameter set.
*/
/* Bias adaptation function. */
static int32_t
adaptBias(int32_t delta, int32_t length, UBool firstTime) {
    int32_t count=0;

    /* The first delta is damped by DAMP, every later delta is halved. */
    delta/=(firstTime ? DAMP : 2);

    /* length is used as a divisor and must not be 0 */
    delta+=delta/length;

    /* Each reduction step of delta adds BASE to the resulting bias. */
    while(delta>((BASE-TMIN)*TMAX)/2) {
        delta/=(BASE-TMIN);
        count+=BASE;
    }

    return count+(((BASE-TMIN+1)*delta)/(delta+SKEW));
}
#define MAX_CP_COUNT 200
/*
 * u_strToPunycode() encodes the UTF-16 string src as Punycode into dest.
 *
 * src/srcLength   Input string; srcLength==-1 means NUL-terminated.
 *                 At most MAX_CP_COUNT code points are accepted.
 * dest/destCapacity
 *                 Output buffer; may be NULL/0 for preflighting.
 *                 Nothing is written beyond destCapacity, but the full
 *                 required length is always counted and returned.
 * caseFlags       Optional, indexed by UChar position in src: basic code
 *                 points are case-mapped accordingly, for other code points
 *                 the flag selects the case of the final digit of the delta.
 * pErrorCode      U_ILLEGAL_ARGUMENT_ERROR for bad arguments,
 *                 U_INDEX_OUTOFBOUNDS_ERROR for too many code points,
 *                 U_INVALID_CHAR_FOUND for an unpaired surrogate,
 *                 U_INTERNAL_PROGRAM_ERROR if delta would overflow.
 *
 * Returns the output length as reported by u_terminateUChars().
 */
U_CFUNC int32_t
u_strToPunycode(const UChar *src, int32_t srcLength,
                UChar *dest, int32_t destCapacity,
                const UBool *caseFlags,
                UErrorCode *pErrorCode) {

    int32_t cpBuffer[MAX_CP_COUNT];
    int32_t n, delta, handledCPCount, basicLength, destLength, bias, j, m, q, k, t, srcCPCount;
    UChar c, c2;

    /* argument checking */
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return 0;
    }

    if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    /*
     * Handle the basic code points and
     * convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit):
     *
     * Every input code point gets exactly one cpBuffer entry, whether or
     * not dest has room for it; basic code points are stored as 0 so that
     * the main loop below skips them but still counts them.
     */
    srcCPCount=destLength=0;
    if(srcLength==-1) {
        /* NUL-terminated input */
        for(j=0; /* no condition */; ++j) {
            if((c=src[j])==0) {
                break;
            }
            if(srcCPCount==MAX_CP_COUNT) {
                /* too many input code points */
                *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
                return 0;
            }
            if(IS_BASIC(c)) {
                cpBuffer[srcCPCount++]=0;
                if(destLength<destCapacity) {
                    dest[destLength]=
                        caseFlags!=NULL ?
                            asciiCaseMap((char)c, caseFlags[j]) :
                            (char)c;
                }
                ++destLength;
            } else {
                /* shift in unsigned arithmetic: 1<<31 on a signed int is undefined */
                n=(int32_t)((uint32_t)(caseFlags!=NULL && caseFlags[j])<<31);
                if(UTF_IS_SINGLE(c)) {
                    n|=c;
                } else if(UTF_IS_LEAD(c) && UTF_IS_TRAIL(c2=src[j+1])) {
                    /* src[j+1] is at worst the terminating NUL */
                    ++j;
                    n|=(int32_t)UTF16_GET_PAIR_VALUE(c, c2);
                } else {
                    /* error: unmatched surrogate */
                    *pErrorCode=U_INVALID_CHAR_FOUND;
                    return 0;
                }
                cpBuffer[srcCPCount++]=n;
            }
        }
    } else {
        /* length-specified input */
        for(j=0; j<srcLength; ++j) {
            if(srcCPCount==MAX_CP_COUNT) {
                /* too many input code points */
                *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
                return 0;
            }
            c=src[j];
            if(IS_BASIC(c)) {
                /*
                 * Count the code point unconditionally (as in the
                 * NUL-terminated branch); counting it only when dest has
                 * room would corrupt the encoding state when preflighting.
                 */
                cpBuffer[srcCPCount++]=0;
                if(destLength<destCapacity) {
                    dest[destLength]=
                        caseFlags!=NULL ?
                            asciiCaseMap((char)c, caseFlags[j]) :
                            (char)c;
                }
                ++destLength;
            } else {
                /* shift in unsigned arithmetic: 1<<31 on a signed int is undefined */
                n=(int32_t)((uint32_t)(caseFlags!=NULL && caseFlags[j])<<31);
                if(UTF_IS_SINGLE(c)) {
                    n|=c;
                } else if(UTF_IS_LEAD(c) && (j+1)<srcLength && UTF_IS_TRAIL(c2=src[j+1])) {
                    ++j;
                    n|=(int32_t)UTF16_GET_PAIR_VALUE(c, c2);
                } else {
                    /* error: unmatched surrogate */
                    *pErrorCode=U_INVALID_CHAR_FOUND;
                    return 0;
                }
                cpBuffer[srcCPCount++]=n;
            }
        }
    }

    /* Finish the basic string - if it is not empty - with a delimiter. */
    basicLength=destLength;
    if(basicLength>0) {
        if(destLength<destCapacity) {
            dest[destLength]=DELIMITER;
        }
        ++destLength;
    }

    /*
     * handledCPCount is the number of code points that have been handled
     * basicLength is the number of basic code points
     * destLength is the number of chars that have been output
     *            (counted even beyond destCapacity, for preflighting)
     */

    /* Initialize the state: */
    n=INITIAL_N;
    delta=0;
    bias=INITIAL_BIAS;

    /* Main encoding loop: */
    for(handledCPCount=basicLength; handledCPCount<srcCPCount; /* no op */) {
        /*
         * All non-basic code points < n have been handled already.
         * Find the next larger one:
         */
        for(m=0x7fffffff, j=0; j<srcCPCount; ++j) {
            q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
            if(n<=q && q<m) {
                m=q;
            }
        }

        /*
         * Increase delta enough to advance the decoder's
         * <n,i> state to <m,0>, but guard against overflow:
         */
        if(m-n>(0x7fffffff-MAX_CP_COUNT-delta)/(handledCPCount+1)) {
            *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
            return 0;
        }
        delta+=(m-n)*(handledCPCount+1);
        n=m;

        /* Encode a sequence of same code points n */
        for(j=0; j<srcCPCount; ++j) {
            q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
            if(q<n) {
                ++delta;
            } else if(q==n) {
                /* Represent delta as a generalized variable-length integer: */
                for(q=delta, k=BASE; /* no condition */; k+=BASE) {
                    /*
                     * Threshold clamping written with k>=bias+TMAX
                     * (instead of t>TMAX) for conformance with
                     * draft-ietf-idn-punycode-03.txt.
                     */
                    t=k-bias;
                    if(t<TMIN) {
                        t=TMIN;
                    } else if(k>=(bias+TMAX)) {
                        t=TMAX;
                    }

                    if(q<t) {
                        break;
                    }

                    /* write only if there is room, but always count */
                    if(destLength<destCapacity) {
                        dest[destLength]=digitToBasic(t+(q-t)%(BASE-t), 0);
                    }
                    ++destLength;
                    q=(q-t)/(BASE-t);
                }

                /* final digit; its case carries the case flag */
                if(destLength<destCapacity) {
                    dest[destLength]=digitToBasic(q, (UBool)(cpBuffer[j]<0));
                }
                ++destLength;
                bias=adaptBias(delta, handledCPCount+1, (UBool)(handledCPCount==basicLength));
                delta=0;
                ++handledCPCount;
            }
        }

        ++delta;
        ++n;
    }

    return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
}
/*
 * u_strFromPunycode() decodes the Punycode string src into UTF-16 in dest.
 *
 * src/srcLength   Input; srcLength==-1 means NUL-terminated.
 * dest/destCapacity
 *                 Output buffer; may be NULL/0 for preflighting.
 *                 Code points are only inserted while they fit entirely
 *                 into dest; the full length is always counted.
 * caseFlags       Optional output array (same capacity as dest) receiving
 *                 one flag per output UChar.
 * pErrorCode      U_ILLEGAL_ARGUMENT_ERROR for bad arguments,
 *                 U_INVALID_CHAR_FOUND for a non-ASCII character or a
 *                 character that is not a Punycode digit,
 *                 U_ILLEGAL_CHAR_FOUND for a truncated or overflowing
 *                 delta sequence or an invalid resulting code point.
 *
 * Returns the output length as reported by u_terminateUChars().
 */
U_CFUNC int32_t
u_strFromPunycode(const UChar *src, int32_t srcLength,
                  UChar *dest, int32_t destCapacity,
                  UBool *caseFlags,
                  UErrorCode *pErrorCode) {
    int32_t n, destLength, i, bias, basicLength, j, in, oldi, w, k, digit, t,
            destCPCount, firstSupplementaryIndex, cpLength;
    UChar b;

    /* argument checking */
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return 0;
    }

    if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    if(srcLength==-1) {
        srcLength=u_strlen(src);
    }

    /*
     * Handle the basic code points:
     * Let basicLength be the number of input code points
     * before the last delimiter, or 0 if there is none,
     * then copy the first basicLength code points to the output.
     *
     * The two following loops iterate backward.
     */
    for(j=srcLength; j>0;) {
        if(src[--j]==DELIMITER) {
            break;
        }
    }
    destLength=basicLength=destCPCount=j;

    while(j>0) {
        b=src[--j];
        if(!IS_BASIC(b)) {
            *pErrorCode=U_INVALID_CHAR_FOUND;
            return 0;
        }

        if(j<destCapacity) {
            dest[j]=(UChar)b;

            if(caseFlags!=NULL) {
                caseFlags[j]=IS_BASIC_UPPERCASE(b);
            }
        }
    }

    /* Initialize the state: */
    n=INITIAL_N;
    i=0;
    bias=INITIAL_BIAS;
    firstSupplementaryIndex=1000000000;

    /*
     * Main decoding loop:
     * Start just after the last delimiter if any
     * basic code points were copied; start at the beginning otherwise.
     */
    for(in=basicLength>0 ? basicLength+1 : 0; in<srcLength; /* no op */) {
        /*
         * in is the index of the next character to be consumed, and
         * destCPCount is the number of code points in the output array.
         *
         * Decode a generalized variable-length integer into delta,
         * which gets added to i.  The overflow checking is easier
         * if we increase i as we go, then subtract off its starting
         * value at the end to obtain delta.
         */
        for(oldi=i, w=1, k=BASE; /* no condition */; k+=BASE) {
            if(in>=srcLength) {
                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                return 0;
            }

            /*
             * Look up the digit value only for ASCII input. src holds
             * 16-bit units: truncating to 8 bits would let a non-ASCII
             * unit such as U+0161 pass as the digit 'a'.
             */
            b=src[in++];
            digit=IS_BASIC(b) ? basicToDigit[b] : -1;
            if(digit<0) {
                *pErrorCode=U_INVALID_CHAR_FOUND;
                return 0;
            }
            if(digit>(0x7fffffff-i)/w) {
                /* integer overflow */
                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                return 0;
            }

            i+=digit*w;
            /*
             * Threshold clamping written with k>=bias+TMAX
             * (instead of t>TMAX) for conformance with
             * draft-ietf-idn-punycode-03.txt.
             */
            t=k-bias;
            if(t<TMIN) {
                t=TMIN;
            } else if(k>=(bias+TMAX)) {
                t=TMAX;
            }
            if(digit<t) {
                break;
            }

            if(w>0x7fffffff/(BASE-t)) {
                /* integer overflow */
                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                return 0;
            }
            w*=BASE-t;
        }

        /*
         * Modification from sample code:
         * Increments destCPCount here,
         * where needed instead of in for() loop tail.
         */
        ++destCPCount;
        bias=adaptBias(i-oldi, destCPCount, (UBool)(oldi==0));

        /*
         * i was supposed to wrap around from (incremented) destCPCount to 0,
         * incrementing n each time, so we'll fix that now:
         */
        if(i/destCPCount>(0x7fffffff-n)) {
            /* integer overflow */
            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
            return 0;
        }

        n+=i/destCPCount;
        i%=destCPCount;
        /* not needed for Punycode: */
        /* if (decode_digit(n) <= BASE) return punycode_invalid_input; */

        if(n>0x10ffff || UTF_IS_SURROGATE(n)) {
            /* Unicode code point overflow */
            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
            return 0;
        }

        /* Insert n at position i of the output: */
        cpLength=UTF_CHAR_LENGTH(n);
        /*
         * "<=": a code point that exactly fills the buffer must still be
         * inserted. Otherwise the result is reported as merely
         * "not terminated" while the buffer contents are incomplete.
         */
        if((destLength+cpLength)<=destCapacity) {
            int32_t codeUnitIndex;

            /*
             * Handle indexes when supplementary code points are present.
             *
             * In almost all cases, there will be only BMP code points before i
             * and even in the entire string.
             * This is handled with the same efficiency as with UTF-32.
             *
             * Only the rare cases with supplementary code points are handled
             * more slowly - but not too bad since this is an insertion anyway.
             */
            if(i<=firstSupplementaryIndex) {
                codeUnitIndex=i;
                if(cpLength>1) {
                    firstSupplementaryIndex=codeUnitIndex;
                } else {
                    ++firstSupplementaryIndex;
                }
            } else {
                codeUnitIndex=firstSupplementaryIndex;
                UTF_FWD_N(dest, codeUnitIndex, destLength, i-codeUnitIndex);
            }

            /* use the UChar index codeUnitIndex instead of the code point index i */
            if(codeUnitIndex<destLength) {
                uprv_memmove(dest+codeUnitIndex+cpLength,
                             dest+codeUnitIndex,
                             (destLength-codeUnitIndex)*U_SIZEOF_UCHAR);
                if(caseFlags!=NULL) {
                    uprv_memmove(caseFlags+codeUnitIndex+cpLength,
                                 caseFlags+codeUnitIndex,
                                 destLength-codeUnitIndex);
                }
            }
            if(cpLength==1) {
                /* BMP, insert one code unit */
                dest[codeUnitIndex]=(UChar)n;
            } else {
                /* supplementary character, insert two code units */
                dest[codeUnitIndex]=UTF16_LEAD(n);
                dest[codeUnitIndex+1]=UTF16_TRAIL(n);
            }
            if(caseFlags!=NULL) {
                /* Case of last character determines uppercase flag: */
                caseFlags[codeUnitIndex]=IS_BASIC_UPPERCASE(src[in-1]);
                if(cpLength==2) {
                    caseFlags[codeUnitIndex+1]=FALSE;
                }
            }
        }
        destLength+=cpLength;
        ++i;
    }

    return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
}
/* ### check notes on overflow handling - only necessary if not IDNA? are these Punycode functions to be public? */
#endif /* #if !UCONFIG_NO_IDNA */
--- NEW FILE: punycode.h ---
/*
*******************************************************************************
*
* Copyright (C) 2002-2003, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: punycode.h
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2002jan31
* created by: Markus W. Scherer
*/
/* This ICU code derived from: */
/*
punycode.c 0.4.0 (2001-Nov-17-Sat)
http://www.cs.berkeley.edu/~amc/idn/
Adam M. Costello
http://www.nicemice.net/amc/
*/
#ifndef __PUNYCODE_H__
#define __PUNYCODE_H__
#include "unicode/utypes.h"
#if !UCONFIG_NO_IDNA
/**
* u_strToPunycode() converts Unicode to Punycode.
*
* The input string must not contain single, unpaired surrogates.
* The output will be represented as an array of ASCII code points.
*
* The output string is NUL-terminated according to normal ICU
* string output rules.
*
* @param src Input Unicode string.
* This function handles a limited number of code points
* (the limit is >=64).
* U_INDEX_OUTOFBOUNDS_ERROR is set if the limit is exceeded.
* @param srcLength Number of UChars in src, or -1 if NUL-terminated.
* @param dest Output Punycode array.
* @param destCapacity Size of dest in number of UChars.
* @param caseFlags Vector of boolean values, one per input UChar,
* indicating whether the decoder should optionally
* uppercase (TRUE) or lowercase (FALSE)
* the corresponding character.
* ASCII characters are output directly in the case as marked.
* Flags corresponding to trail surrogates are ignored.
* If caseFlags==NULL then input characters are not
* case-mapped.
* @param pErrorCode ICU in/out error code parameter.
* U_INVALID_CHAR_FOUND if src contains
* unmatched single surrogates.
* U_INDEX_OUTOFBOUNDS_ERROR if src contains
* too many code points.
* @return Number of ASCII characters in the Punycode output (dest).
*
* @see u_strFromPunycode
*/
U_CFUNC int32_t
u_strToPunycode(const UChar *src, int32_t srcLength,
UChar *dest, int32_t destCapacity,
const UBool *caseFlags,
UErrorCode *pErrorCode);
/**
* u_strFromPunycode() converts Punycode to Unicode.
* The Unicode string will be at most as long (in UChars)
* as the Punycode string (in chars).
*
* @param src Input Punycode string.
* @param srcLength Length of src, or -1 if NUL-terminated.
* @param dest Output Unicode string buffer.
* @param destCapacity Size of dest in number of UChars,
* and of caseFlags in numbers of UBools.
* @param caseFlags Output array for case flags as
* defined by the Punycode string.
* The caller should uppercase (TRUE) or lowercase (FALSE)
* the corresponding character in dest.
* For supplementary characters, only the lead surrogate
* is marked, and FALSE is stored for the trail surrogate.
* This is redundant and not necessary for ASCII characters
* because they are already in the case indicated.
* Can be NULL if the case flags are not needed.
* @param pErrorCode ICU in/out error code parameter.
* U_INVALID_CHAR_FOUND if a non-ASCII character
* precedes the last delimiter ('-'),
* or if an invalid character (not a-zA-Z0-9) is found
* after the last delimiter.
* U_ILLEGAL_CHAR_FOUND if the delta sequence is ill-formed.
* @return Number of UChars written to dest.
*
* @see u_strToPunycode
*/
U_CFUNC int32_t
u_strFromPunycode(const UChar *src, int32_t srcLength,
UChar *dest, int32_t destCapacity,
UBool *caseFlags,
UErrorCode *pErrorCode);
#endif /* #if !UCONFIG_NO_IDNA */
#endif
/*
* Hey, Emacs, please set the following:
*
* Local Variables:
* indent-tabs-mode: nil
* End:
*
*/
--- NEW FILE: rbbi.cpp ---
//
// file: rbbi.cpp Contains the implementation of the rule based break iterator
// runtime engine and the API implementation for
// class RuleBasedBreakIterator
//
/*
***************************************************************************
* Copyright (C) 1999-2003 International Business Machines Corporation *
* and others. All rights reserved. *
***************************************************************************
*/
#include "unicode/utypes.h"
#if !UCONFIG_NO_BREAK_ITERATION
#include "unicode/rbbi.h"
#include "unicode/schriter.h"
#include "unicode/udata.h"
[...998 lines suppressed...]
// chars.
//
// This function is intended for use by dictionary based
// break iterators.
//
//-------------------------------------------------------------------------------
UBool RuleBasedBreakIterator::isDictionaryChar(UChar32 c) {
    // Category value looked up for c; stays 0 (flag clear) when there is
    // no data to consult, which yields FALSE below.
    uint16_t category = 0;

    if (fData != NULL) {
        UTRIE_GET16(&fData->fTrie, c, category);
    }

    // Bit 0x4000 of the category value is the flag this function reports.
    return (category & 0x4000) != 0;
}
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
--- NEW FILE: rbbicst.pl ---
#**************************************************************************
# Copyright (C) 2002-2003 International Business Machines Corporation *
# and others. All rights reserved. *
#**************************************************************************
#
# rbbicst Compile the RBBI rule parser state table data into initialized C data.
# Usage:
# cd icu/source/common
# perl rbbicst.pl [-j] < rbbirpt.txt > rbbirpt.h
#
# The output file, rbbirpt.h, is included by some of the .cpp rbbi
# implementation files. This perl script is NOT run as part
# of a normal ICU build. It is run by hand when needed, and the
# rbbirpt.h generated file is put back into cvs.
#
# See rbbirpt.h for a description of the input format for this script.
#
# Optional leading "-j" argument: remember it in $javaOutput and remove it
# from @ARGV so that the <> loop below does not treat it as an input file.
# (Presumably selects Java instead of C output later in the script -- the
# code using $javaOutput is outside this section.)
if ($ARGV[0] eq "-j") {
$javaOutput = 1;
shift @ARGV;
}
# State numbering starts at 1, so 0 can mean "not defined" in %states.
$num_states = 1; # Always the state number for the line being compiled.
$line_num = 0; # The line number in the input file.
$states{"pop"} = 255; # Add the "pop" to the list of defined state names.
# This prevents any state from being labelled with "pop",
# and resolves references to "pop" in the next state field.
#
# Read the state table description, one line at a time.
# Each non-blank line is an optional "label:" followed by an optional
# transition; every transition line becomes one numbered row of the table.
# Results are left in the global arrays @state_* (indexed by row number),
# in %states (label -> row number) and in @stateNames (row number -> label).
#
line_loop: while (<>) {
    chomp();
    $line = $_;
    @fields = split();
    $line_num++;

    # Remove # comments, which are any fields beginning with a #, plus all
    # that follow on the line.
    for ($i=0; $i<@fields; $i++) {
        if ($fields[$i] =~ /^#/) {
            @fields = @fields[0 .. $i-1];
            last;
        }
    }

    # ignore blank lines, and those with no fields left after stripping comments..
    if (@fields == 0) {
        next;
    }

    #
    # State Label: handling.
    # Does the first token end with a ":"? If so, it's the name of a state.
    # Put in a hash, together with the current state number,
    # so that we can later look up the number from the name.
    #
    if ($fields[0] =~ /.*:$/) {
        $state_name = $fields[0];
        $state_name =~ s/:$//;       # strip off the trailing colon from the state name.
        if ($states{$state_name} != 0) {
            # Report the input line NUMBER of the duplicate label.
            print " rbbicst: at line $line_num duplicate definition of state $state_name\n";
        }
        $states{$state_name} = $num_states;
        $stateNames[$num_states] = $state_name;

        # if the label was the only thing on this line, go on to the next line,
        # otherwise assume that a state definition is on the same line and fall through.
        if (@fields == 1) {
            next line_loop;
        }
        shift @fields;               # shift off label field in preparation
                                     # for handling the rest of the line.
    }

    #
    # State Transition line.
    # syntax is this,
    #     character [n] target-state [^push-state] [function-name]
    # where
    #     [something] is an optional something
    #     character is either a single quoted character e.g. '['
    #               or a name of a character class, e.g. white_space
    #
    $state_line_num[$num_states] = $line_num;   # remember line number with each state
                                                # so we can make better error messages later.

    #
    # First field, character class or literal character for this transition.
    #
    if ($fields[0] =~ /^'.'$/) {
        # We've got a quoted literal character.
        $state_literal_chars[$num_states] = $fields[0];
        $state_literal_chars[$num_states] =~ s/'//g;
    } else {
        # We've got the name of a character class.
        $state_char_class[$num_states] = $fields[0];
        if ($fields[0] =~ /[\W]/) {
            print " rbbicsts: at line $line_num, bad character literal or character class name.\n";
            print " scanning $fields[0]\n";
            exit(-1);
        }
    }
    shift @fields;

    #
    # do the 'n' flag
    #
    $state_flag[$num_states] = "FALSE";
    if ($fields[0] eq "n") {
        $state_flag[$num_states] = "TRUE";
        shift @fields;
    }

    #
    # do the destination state.
    #
    $state_dest_state[$num_states] = $fields[0];
    if ($fields[0] eq "") {
        print " rbbicsts: at line $line_num, destination state missing.\n";
        exit(-1);
    }
    shift @fields;

    #
    # do the push state, if present.
    #
    if ($fields[0] =~ /^\^/) {
        $fields[0] =~ s/^\^//;
        $state_push_state[$num_states] = $fields[0];
        if ($fields[0] eq "" ) {
            print " rbbicsts: at line $line_num, expected state after ^ (no spaces).\n";
            exit(-1);
        }
        shift @fields;
    }

    #
    # Lastly, do the optional action name.
    #
    if ($fields[0] ne "") {
        $state_func_name[$num_states] = $fields[0];
        shift @fields;
    }

    #
    # There should be no fields left on the line at this point.
    #
    if (@fields > 0) {
        print " rbbicsts: at line $line_num, unexpected extra stuff on input line.\n";
        print " scanning $fields[0]\n";
    }
    $num_states++;
}
#
# We've read in the whole file, now go back and output the
# C source code for the state transition table.
#
# We read all states first, before writing anything, so that the state numbers
# for the destination states are all available to be written.
#
#
# Make hashes for the names of the character classes and
# for the names of the actions that appeared.
#
# Walk every table row once:
#   - record each character class name that was used as a key of %charClasses,
#   - give rows without an action the default action "doNOP",
#   - record each action name as a key of %actions.
# The value 1 stored here is only a "seen" marker.
for ($state=1; $state < $num_states; $state++) {
    if ($state_char_class[$state] ne "") {
        if ($charClasses{$state_char_class[$state]} == 0) {
            $charClasses{$state_char_class[$state]} = 1;
        }
    }
    if ($state_func_name[$state] eq "") {
        $state_func_name[$state] = "doNOP";
    }
    # Test the same array that is used as the key below (@state_func_name);
    # there is no @state_action_name array in this script.
    if ($actions{$state_func_name[$state]} == 0) {
        $actions{$state_func_name[$state]} = 1;
    }
}
#
# Check that all of the destination states have been defined
#
#
# "exit" is a predefined state name that terminates the state machine.
# It maps to 0, so it has to be special-cased in the "is it defined" test below.
$states{"exit"} = 0;

# Every destination state, and every push state that was given, must name a
# state that was defined somewhere in the input. Count the ones that do not.
$state = 1;
while ($state < $num_states) {
    my $dest = $state_dest_state[$state];
    my $push = $state_push_state[$state];
    unless ($dest eq "exit" || $states{$dest} != 0) {
        print "Error at line $state_line_num[$state]: target state \"$state_dest_state[$state]\" is not defined.\n";
        $errors++;
    }
    unless ($push eq "" || $states{$push} != 0) {
        print "Error at line $state_line_num[$state]: target state \"$state_push_state[$state]\" is not defined.\n";
        $errors++;
    }
    $state++;
}

# Refuse to generate any output from an inconsistent table.
die if ($errors>0);
if ($javaOutput) {
    #
    # Java Output ...
    #
    # Each table row is written as a Java string of five \uXXXX fields:
    # action number, literal character or character class, next state,
    # push state (0 if none) and the "advance to next char" flag (1 or 0).
    #
    print "/*\n";
    print " *******************************************************************************\n";
    print " * Copyright (C) 2003,\n";
    print " * International Business Machines Corporation and others. All Rights Reserved.\n";
    print " *******************************************************************************\n";
    print " *\n";
    print " * \$Source: /usr/local/cvsroot/icu-sword/source/common/rbbicst.pl,v $\n";
    print " * \$Date: 2003/09/10 02:42:02 $\n";
    print " * \$Revision: 1.1 $\n";
    print " *\n";
    print " *******************************************************************************\n";
    print " */\n";
    print " \n";
    print "package com.ibm.icu.text;\n";
    print " \n";
    print "/**\n";
    print " * Generated Java File. Do not edit by hand.\n";
    print " * This file contains the state table for the ICU Rule Based Break Iterator\n";
    print " * rule parser.\n";
    print " * It is generated by the Perl script \"rbbicst.pl\" from\n";
    print " * the rule parser state definitions file \"rbbirpt.txt\".\n";
    print " *\n";
    print " */\n";
    print "public class RuleBasedBreakIteratorStateTable\n";
    print "{\n";

    #
    # Emit the constants for the actions to be performed.
    # Remember the number given to each action so that the table rows below
    # can refer to it. Sorting keeps the numbering stable from run to run.
    #
    my %actionNum;
    $n = 1;
    foreach $act (sort keys %actions) {
        print " public static final int $act = $n;\n";
        $actionNum{$act} = $n;
        $n++;
    }
    print " \n";

    #
    # emit the state transition table
    #
    print " public static final String[] gRuleParseStateTable = {\n";
    # Row 0 is a dummy; real states start with index 1.
    printf(" \"\\u%04.4x\\u%04.4x\\u%04.4x\\u%04.4x\\u%04.4x\"\n", $actionNum{"doNOP"} || 0, 0, 0, 0, 1);
    for ($state=1; $state < $num_states; $state++) {
        # Action: the numeric constant emitted above, not the action's name
        # (a name formatted with %x would always come out as 0).
        printf(" , \"\\u%04.4x", $actionNum{$state_func_name[$state]});

        if ($state_literal_chars[$state] ne "") {
            # Literal character: its character code.
            printf("\\u%04.4x", ord($state_literal_chars[$state]));
        } else {
            # NOTE(review): in this branch the values in %charClasses are still the
            # "seen" marker (1) stored while collecting class names; only the C++
            # branch assigns distinct indexes. Confirm what the Java reader expects.
            printf("\\u%04.4x", $charClasses{$state_char_class[$state]});
        }

        printf("\\u%04.4x", $states{$state_dest_state[$state]});

        # The push-state field is optional. If omitted, fill field with a zero, which flags
        # the state machine that there is no push state.
        if ($state_push_state[$state] eq "") {
            print "\\u0000";
        } else {
            printf("\\u%04.4x", $states{$state_push_state[$state]});
        }

        # The flag is kept as the string "TRUE"/"FALSE"; emit it as 1/0.
        printf("\\u%04.4x", ($state_flag[$state] eq "TRUE") ? 1 : 0);

        # For the first row of each state, append the state name.
        # Used for debugging only.
        if ($stateNames[$state] ne "") {
            printf("%-20s", $stateNames[$state]."\"");
        } else {
            printf("%-20s", "\"");
        }

        # Put out a comment showing the number (index) of this state row,
        print " // $state ";
        print "\n";
    };
    print " };\n";
    print "}\n";
}
else
{
    #
    # C++ Output ...
    #
    # Writes the generated header: character class constants, the action enum,
    # the table element struct, the transition table and a state name array.
    # Hash keys are visited in sorted order so that regenerating the header
    # from an unchanged input gives an identical file.
    #
    print "//---------------------------------------------------------------------------------\n";
    print "//\n";
    print "// Generated Header File. Do not edit by hand.\n";
    print "// This file contains the state table for the ICU Rule Based Break Iterator\n";
    print "// rule parser.\n";
    print "// It is generated by the Perl script \"rbbicst.pl\" from\n";
    print "// the rule parser state definitions file \"rbbirpt.txt\".\n";
    print "//\n";
    print "// Copyright (C) 2002 International Business Machines Corporation \n";
    print "// and others. All rights reserved. \n";
    print "//\n";
    print "//---------------------------------------------------------------------------------\n";
    print "#ifndef RBBIRPT_H\n";
    print "#define RBBIRPT_H\n";
    print "\n";
    print "U_NAMESPACE_BEGIN\n";

    #
    # Emit the constants for indices of Unicode Sets
    # Define one constant for each of the character classes encountered.
    # At the same time, store the index corresponding to the set name back into hash.
    #
    print "//\n";
    print "// Character classes for RBBI rule scanning.\n";
    print "//\n";
    $i = 128;            # State Table values for Unicode char sets range from 128-250.
                         # Sets "default", "escaped", etc. get special handling.
                         # They have no corresponding UnicodeSet object in the state machine,
                         # but are handled by special case code. So we emit no reference
                         # to a UnicodeSet object to them here.
    foreach $setName (sort keys %charClasses) {
        if ($setName eq "default") {
            $charClasses{$setName} = 255;}
        elsif ($setName eq "escaped") {
            $charClasses{$setName} = 254;}
        elsif ($setName eq "escapedP") {
            $charClasses{$setName} = 253;}
        elsif ($setName eq "eof") {
            $charClasses{$setName} = 252;}
        else {
            # Normal character class. Fill in array with a ptr to the corresponding UnicodeSet in the state machine.
            print " static const uint8_t kRuleSet_$setName = $i;\n";
            $charClasses{$setName} = $i;
            $i++;
        }
    }
    print "\n\n";

    #
    # Emit the enum for the actions to be performed.
    #
    print "enum RBBI_RuleParseAction {\n";
    foreach $act (sort keys %actions) {
        print " $act,\n";
    }
    print " rbbiLastAction};\n\n";

    #
    # Emit the struct definition for transition table elements.
    #
    print "//-------------------------------------------------------------------------------\n";
    print "//\n";
    print "// RBBIRuleTableEl represents the structure of a row in the transition table\n";
    print "// for the rule parser state machine.\n";
    print "//-------------------------------------------------------------------------------\n";
    print "struct RBBIRuleTableEl {\n";
    print " RBBI_RuleParseAction fAction;\n";
    print " uint8_t fCharClass; // 0-127: an individual ASCII character\n";
    print " // 128-255: character class index\n";
    print " uint8_t fNextState; // 0-250: normal next-stat numbers\n";
    print " // 255: pop next-state from stack.\n";
    print " uint8_t fPushState;\n";
    print " UBool fNextChar;\n";
    print "};\n\n";

    #
    # emit the state transition table
    #
    print "static const struct RBBIRuleTableEl gRuleParseStateTable[] = {\n";
    print " {doNOP, 0, 0, 0, TRUE}\n";    # State 0 is a dummy. Real states start with index = 1.
    for ($state=1; $state < $num_states; $state++) {
        print " , {$state_func_name[$state],";
        if ($state_literal_chars[$state] ne "") {
            $c = $state_literal_chars[$state];
            # use numeric value, so EBCDIC machines are ok.
            # The character itself is passed as an argument rather than being
            # interpolated into the format, so a literal '%' cannot corrupt it.
            printf(" %d /* %s */,", ord($c), $c);
        }else {
            print " $charClasses{$state_char_class[$state]},";
        }
        print " $states{$state_dest_state[$state]},";

        # The push-state field is optional. If omitted, fill field with a zero, which flags
        # the state machine that there is no push state.
        if ($state_push_state[$state] eq "") {
            print "0, ";
        } else {
            print " $states{$state_push_state[$state]},";
        }
        print " $state_flag[$state]} ";

        # Put out a C++ comment showing the number (index) of this state row,
        # and, if this is the first row of the table for this state, the state name.
        print " // $state ";
        if ($stateNames[$state] ne "") {
            print " $stateNames[$state]";
        }
        print "\n";
    };
    print " };\n";

    #
    # emit a mapping array from state numbers to state names.
    #
    # This array is used for producing debugging output from the rule parser.
    #
    print "static const char * const RBBIRuleStateNames[] = {";
    for ($state=0; $state<$num_states; $state++) {
        if ($stateNames[$state] ne "") {
            print " \"$stateNames[$state]\",\n";
        } else {
            print " 0,\n";
        }
    }
    print " 0};\n\n";
    print "U_NAMESPACE_END\n";
    print "#endif\n";
}
--- NEW FILE: rbbidata.cpp ---
/*
***************************************************************************
* Copyright (C) 1999-2003 International Business Machines Corporation *
* and others. All rights reserved. *
***************************************************************************
*/
#include "unicode/utypes.h"
#if !UCONFIG_NO_BREAK_ITERATION
#include "unicode/utypes.h"
#include "rbbidata.h"
#include "rbbirb.h"
#include "utrie.h"
#include "udatamem.h"
#include "cmemory.h"
#include "cstring.h"
#include "umutex.h"
#include "uassert.h"
//-----------------------------------------------------------------------------------
//
// Trie access folding function. Copied as-is from properties code in uchar.c
//
//-----------------------------------------------------------------------------------
U_CDECL_BEGIN
static int32_t U_CALLCONV
getFoldingOffset(uint32_t data) {
    /* Bit 15 of the 16-bit trie result says whether a folding offset is present;
       when it is, the offset itself is bits 14..0. Otherwise report 0. */
    const uint32_t hasOffset = data & 0x8000;
    return hasOffset ? (int32_t)(data & 0x7fff) : 0;
}
U_CDECL_END
U_NAMESPACE_BEGIN
//-----------------------------------------------------------------------------
//
// Constructors.
//
//-----------------------------------------------------------------------------
// Wrap RBBI data given directly by a pointer to its RBBIDataHeader.
// All of the set-up work is done by init().
RBBIDataWrapper::RBBIDataWrapper(const RBBIDataHeader *data, UErrorCode &status) {
    this->init(data, status);
}

// Wrap RBBI data held in a UDataMemory.
RBBIDataWrapper::RBBIDataWrapper(UDataMemory* udm, UErrorCode &status) {
    // The RBBI header is located info.size bytes past the start of the
    // "info" member of the udata header.
    const char *infoStart = (const char *)&(udm->pHeader->info);
    const RBBIDataHeader *rbbiHeader =
        (const RBBIDataHeader *)(infoStart + udm->pHeader->info.size);
    init(rbbiHeader, status);
    // Assigned after init(), because init() sets fUDataMem to NULL.
    fUDataMem = udm;
}
//-----------------------------------------------------------------------------
//
// init(). Does most of the work of construction, shared between the
// constructors.
//
// data     Start of the binary RBBI data (its RBBIDataHeader).
// status   In/out error code. Nothing is done if it already indicates
//          a failure. Set to U_BRK_INTERNAL_ERROR if the data does not
//          start with the expected magic number, or to whatever error
//          utrie_unserialize() reports.
//
//-----------------------------------------------------------------------------
void RBBIDataWrapper::init(const RBBIDataHeader *data, UErrorCode &status) {
    // Put every member that the destructor reads (fRefCount, fUDataMem, fHeader)
    // and the other pointers into a defined state before anything can fail,
    // so that a wrapper whose construction failed can still be destroyed.
    fHeader = NULL;
    fForwardTable = NULL;
    fReverseTable = NULL;
    fRuleSource = NULL;
    fUDataMem = NULL;
    fRefCount = 0;

    if (U_FAILURE(status)) {
        return;
    }
    fHeader = data;
    if (fHeader->fMagic != 0xb1a0) {
        status = U_BRK_INTERNAL_ERROR;
        return;
    }

    // The header stores byte offsets from its own start; turn them into pointers.
    fForwardTable = (RBBIStateTable *)((char *)data + fHeader->fFTable);
    if (data->fRTableLen != 0) {
        // A zero-length reverse table means there is none; fReverseTable stays NULL.
        fReverseTable = (RBBIStateTable *)((char *)data + fHeader->fRTable);
    }

    utrie_unserialize(&fTrie,
                      (uint8_t *)data + fHeader->fTrie,
                      fHeader->fTrieLen,
                      &status);
    if (U_FAILURE(status)) {
        return;
    }
    fTrie.getFoldingOffset=getFoldingOffset;

    fRuleSource = (UChar *)((char *)data + fHeader->fRuleSource);
    fRuleString.setTo(TRUE, fRuleSource, -1);

    // The object starts out with one reference.
    fRefCount = 1;
#ifdef RBBI_DEBUG
    char *debugEnv = getenv("U_RBBIDEBUG");
    if (debugEnv && uprv_strstr(debugEnv, "data")) {this->printData();}
#endif
}
//-----------------------------------------------------------------------------
//
// Destructor. Don't call this - use removeReference() instead.
//             The reference count is asserted to be zero here.
//
//-----------------------------------------------------------------------------
RBBIDataWrapper::~RBBIDataWrapper() {
    U_ASSERT(fRefCount == 0);
    if (fUDataMem == NULL) {
        // No UDataMemory is attached: release the header storage itself.
        uprv_free((void *)fHeader);
    } else {
        // The data came through a UDataMemory: close that instead.
        udata_close(fUDataMem);
    }
}
//-----------------------------------------------------------------------------
//
// Operator ==    Two RBBIDataWrappers are equal if they refer to the same
//                underlying data: either the very same header, or two
//                blocks of the same total length with identical bytes.
//                Wrappers are normally shared between iterator instances,
//                but the same data can be opened twice independently, and
//                the two resulting wrappers should still be ==.
//
//-----------------------------------------------------------------------------
UBool RBBIDataWrapper::operator ==(const RBBIDataWrapper &other) const {
    const RBBIDataHeader *mine   = fHeader;
    const RBBIDataHeader *theirs = other.fHeader;

    // Same block of memory: trivially equal.
    if (mine == theirs) {
        return TRUE;
    }
    // Different blocks: equal only if the lengths match and the bytes match.
    if (mine->fLength == theirs->fLength &&
        uprv_memcmp(mine, theirs, mine->fLength) == 0) {
        return TRUE;
    }
    return FALSE;
}
// Hash code for the wrapped data: the length field of the forward state table.
int32_t RBBIDataWrapper::hashCode() {
    const uint32_t forwardTableLen = fHeader->fFTableLen;
    return (int32_t)forwardTableLen;
}
//-----------------------------------------------------------------------------
//
// Reference Counting. A single RBBIDataWrapper object is shared among
// however many RulesBasedBreakIterator instances are
// referencing the same data.
//
//-----------------------------------------------------------------------------
void RBBIDataWrapper::removeReference() {
if (umtx_atomic_dec(&fRefCount) == 0) {
delete this;
}
}
RBBIDataWrapper *RBBIDataWrapper::addReference() {
umtx_atomic_inc(&fRefCount);
return this;
}
//-----------------------------------------------------------------------------
//
// getRuleSourceString    Return the rule source as a UnicodeString.
//                        The returned reference is to a member of this wrapper.
//
//-----------------------------------------------------------------------------
const UnicodeString &RBBIDataWrapper::getRuleSourceString() {
    const UnicodeString &rules = fRuleString;
    return rules;
}
//-----------------------------------------------------------------------------
//
// print - debugging function to dump the runtime data tables.
//         Prints the header fields, the forward state transition table and
//         the rule source. Does nothing unless RBBI_DEBUG is defined.
//
//-----------------------------------------------------------------------------
void RBBIDataWrapper::printData() {
#ifdef RBBI_DEBUG
    uint32_t col;
    uint32_t stateIx;

    // Header summary.
    RBBIDebugPrintf("RBBI Data at %p\n", (void *)fHeader);
    RBBIDebugPrintf(" Version = %d\n", fHeader->fVersion);
    RBBIDebugPrintf(" total length of data = %d\n", fHeader->fLength);
    RBBIDebugPrintf(" number of character categories = %d\n\n", fHeader->fCatCount);

    // Table heading: one column per character category, then a ruled line.
    RBBIDebugPrintf(" Forward State Transition Table\n");
    RBBIDebugPrintf("State | Acc LA Tag");
    for (col = 0; col < fHeader->fCatCount; col++) {
        RBBIDebugPrintf("%3d ", col);
    }
    RBBIDebugPrintf("\n------|---------------");
    for (col = 0; col < fHeader->fCatCount; col++) {
        RBBIDebugPrintf("----");
    }
    RBBIDebugPrintf("\n");

    // One output line per state. Rows are fRowLen bytes apart in fTableData.
    for (stateIx = 0; stateIx < fForwardTable->fNumStates; stateIx++) {
        RBBIStateTableRow *row = (RBBIStateTableRow *)
            (fForwardTable->fTableData + (fForwardTable->fRowLen * stateIx));
        RBBIDebugPrintf("%4d | %3d %3d %3d ", stateIx, row->fAccepting, row->fLookAhead, row->fTag);
        for (col = 0; col < fHeader->fCatCount; col++) {
            RBBIDebugPrintf("%3d ", row->fNextState[col]);
        }
        RBBIDebugPrintf("\n");
    }

    // The rule source is zero-terminated; print it one code unit at a time.
    RBBIDebugPrintf("\nOrignal Rules source:\n");
    col = 0;
    while (fRuleSource[col] != 0) {
        RBBIDebugPrintf("%c", fRuleSource[col]);
        col++;
    }
    RBBIDebugPrintf("\n\n");
#endif
}
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
--- NEW FILE: rbbidata.h ---
// file: rbbidata.h
//
//**********************************************************************
// Copyright (C) 1999 IBM Corp. All rights reserved.
//**********************************************************************
//
// RBBI data formats Includes
//
// Structs that describes the format of the Binary RBBI data,
// as it is stored in ICU's data file.
//
// RBBIDataWrapper - Instances of this class sit between the
// raw data structs and the RuleBasedBreakIterator objects
// that are created by applications. The wrapper class
// provides reference counting for the underlying data,
// and direct pointers to data that would not otherwise
// be accessible without ugly pointer arithmetic. The
// wrapper does not attempt to provide any higher level
// abstractions for the data itself.
//
// There will be only one instance of RBBIDataWrapper for any
// set of RBBI run time data being shared by instances
// (clones) of RuleBasedBreakIterator.
//
#ifndef __RBBIDATA_H__
#define __RBBIDATA_H__
#include "unicode/utypes.h"
#include "unicode/uobject.h"
#include "unicode/unistr.h"
#include "unicode/udata.h"
#include "utrie.h"
U_NAMESPACE_BEGIN
//
// The following structs map exactly onto the raw data from ICU common data file.
//
// RBBIDataHeader: the header at the very start of a block of binary RBBI data.
// It identifies the data and locates each of the sections that follow it.
struct RBBIDataHeader {
uint32_t fMagic; // == 0xb1a0 (the value RBBIDataWrapper::init() checks for)
uint32_t fVersion; // == 1
uint32_t fLength; // Total length in bytes of this RBBI Data,
// including all sections, not just the header.
uint32_t fCatCount; // Number of character categories.
//
// Offsets and sizes of each of the subsections within the RBBI data.
// All offsets are bytes from the start of the RBBIDataHeader.
// All sizes are in bytes.
//
uint32_t fFTable; // Offset to the forward state transition table.
uint32_t fFTableLen; // Size of the forward state transition table.
uint32_t fRTable; // Offset to the reverse state transition table.
uint32_t fRTableLen; // Size of the reverse table; RBBIDataWrapper::init()
// treats zero as "no reverse table".
uint32_t fTrie; // Offset to Trie data for character categories
uint32_t fTrieLen; // Size of the Trie data.
uint32_t fRuleSource; // Offset to the source for the break
uint32_t fRuleSourceLen; // rules. Stored UChar *.
uint32_t fReserved[8]; // Reserved for expansion
};
// RBBIStateTableRow: one row (one state) of a state transition table.
// The row is variable length: fNextState is declared with two elements but is
// indexed by character category, so the real row size is the fRowLen value of
// the enclosing RBBIStateTable.
struct RBBIStateTableRow {
int16_t fAccepting; // Non-zero if this row is for an accepting state.
// Value is the {nnn} value to return to calling
// application.
int16_t fLookAhead; // Non-zero if this row is for a state that
// corresponds to a '/' in the rule source.
// Value is the same as the fAccepting
// value for the rule (which will appear
// in a different state).
int16_t fTag; // Non-zero if this row covers a {tagged} position
// from a rule. value is the tag number.
int16_t fReserved;
uint16_t fNextState[2]; // Next State, indexed by char category.
// Array Size is fNumCols from the
// state table header.
// NOTE(review): no fNumCols field is visible in
// RBBIStateTable; printData() indexes this array
// up to RBBIDataHeader::fCatCount - confirm.
// CAUTION: see RBBITableBuilder::getTableSize()
// before changing anything here.
};
// RBBIStateTable: header of a state transition table, followed directly by
// its rows. Row number s starts at fTableData + fRowLen * s.
struct RBBIStateTable {
uint32_t fNumStates; // Number of states.
uint32_t fRowLen; // Length of a state table row, in bytes.
char fTableData[4]; // First RBBIStateTableRow begins here.
// (making it char[] simplifies ugly address
// arithmetic for indexing variable length rows.)
};
//
// The reference counting wrapper class
//
// RBBIDataWrapper holds pointers into one block of binary RBBI data plus a
// reference count. Users obtain and release it with addReference() and
// removeReference(); the wrapper deletes itself when the count reaches zero.
class RBBIDataWrapper : public UMemory {
public:
// Wrap data given by a pointer to its header.
RBBIDataWrapper(const RBBIDataHeader *data, UErrorCode &status);
// Wrap data held in a UDataMemory, which the destructor closes with udata_close().
RBBIDataWrapper(UDataMemory* udm, UErrorCode &status);
// Not to be called directly - use removeReference().
~RBBIDataWrapper();
// Shared construction work: checks the magic number and sets up the pointers below.
void init(const RBBIDataHeader *data, UErrorCode &status);
// Increment the reference count; returns this.
RBBIDataWrapper *addReference();
// Decrement the reference count; deletes this object when it reaches zero.
void removeReference();
// TRUE if both wrappers refer to the same header or to byte-identical data.
UBool operator ==(const RBBIDataWrapper &other) const;
// Returns the fFTableLen field of the header.
int32_t hashCode();
// The rule source, as a UnicodeString.
const UnicodeString &getRuleSourceString();
// Debugging dump of the tables; does nothing unless RBBI_DEBUG is defined.
void printData();
//
// Pointers to items within the data
//
const RBBIDataHeader *fHeader;
const RBBIStateTable *fForwardTable;
const RBBIStateTable *fReverseTable; // NULL when the header's fRTableLen is zero.
const UChar *fRuleSource;
UTrie fTrie; // Unserialized from the trie section of the data by init().
private:
int32_t fRefCount; // Set to 1 by a successful init().
UDataMemory *fUDataMem; // Non-NULL only when constructed from a UDataMemory.
UnicodeString fRuleString; // Set up by init() from fRuleSource.
RBBIDataWrapper(const RBBIDataWrapper &other); // forbid copying of this class
RBBIDataWrapper &operator=(const RBBIDataWrapper &other); // forbid copying of this class
};
U_NAMESPACE_END
#endif
--- NEW FILE: rbbinode.cpp ---
/*
***************************************************************************
* Copyright (C) 2002-2003 International Business Machines Corporation *
* and others. All rights reserved. *
***************************************************************************
*/
//
// File: rbbinode.cpp
//
// Implementation of class RBBINode, which represents a node in the
// tree generated when parsing the Rules Based Break Iterator rules.
//
// This "Class" is actually closer to a struct.
// Code using it is expected to directly access fields much of the time.
//
#include "unicode/utypes.h"
#if !UCONFIG_NO_BREAK_ITERATION
#include "unicode/unistr.h"
#include "unicode/uniset.h"
#include "unicode/uchar.h"
#include "unicode/parsepos.h"
#include "uvector.h"
#include "rbbirb.h"
#include "rbbinode.h"
#include "uassert.h"
U_NAMESPACE_BEGIN
// Serial number handed to the most recently constructed RBBINode. Both
// constructors pre-increment it to obtain the new node's fSerialNum.
int RBBINode::gLastSerial = 0;
//-------------------------------------------------------------------------
//
// Constructor. Gives every field a default value for a node of type t.
//              The node starts with no parent, no children and no
//              input set, and with three empty position vectors.
//
//-------------------------------------------------------------------------
RBBINode::RBBINode(NodeType t) : UMemory() {
    fSerialNum = ++gLastSerial;
    fType = t;

    // Tree links and per-node data.
    fParent = NULL;
    fLeftChild = NULL;
    fRightChild = NULL;
    fInputSet = NULL;
    fFirstPos = 0;
    fLastPos = 0;
    fVal = 0;
    fNullable = FALSE;
    fLookAheadEnd = FALSE;

    // Only these four node types have a precedence other than precZero.
    switch (t) {
    case opCat:
        fPrecedence = precOpCat;
        break;
    case opOr:
        fPrecedence = precOpOr;
        break;
    case opStart:
        fPrecedence = precStart;
        break;
    case opLParen:
        fPrecedence = precLParen;
        break;
    default:
        fPrecedence = precZero;
        break;
    }

    UErrorCode status = U_ZERO_ERROR;
    fFirstPosSet = new UVector(status); // TODO - get a real status from somewhere
    fLastPosSet = new UVector(status);
    fFollowPos = new UVector(status);
}
// Copy constructor. Copies the per-node data of "other" but none of its tree
// links: the new node has no parent and no children, gets its own serial
// number and its own, empty, position vectors. cloneTree() uses this and
// then attaches cloned children itself.
RBBINode::RBBINode(const RBBINode &other) : UMemory(other) {
    fSerialNum = ++gLastSerial;
    fType = other.fType;
    fParent = NULL;
    fLeftChild = NULL;
    fRightChild = NULL;
    // NOTE(review): the set pointer is copied, not the set, while the destructor
    // deletes fInputSet. cloneTree() never copies uset nodes; confirm that no
    // other node type carries a non-NULL fInputSet.
    fInputSet = other.fInputSet;
    fPrecedence = other.fPrecedence;
    fText = other.fText;
    fFirstPos = other.fFirstPos;
    fLastPos = other.fLastPos;
    fNullable = other.fNullable;
    fVal = other.fVal;
    // Copy this flag too; otherwise it is left uninitialized in the new node.
    fLookAheadEnd = other.fLookAheadEnd;
    UErrorCode status = U_ZERO_ERROR;
    fFirstPosSet = new UVector(status); // TODO - get a real status from somewhere
    fLastPosSet = new UVector(status);
    fFollowPos = new UVector(status);
}
//-------------------------------------------------------------------------
//
// Destructor. Deletes this node's input set and position vectors, and
//             also its child nodes, except for varRef and setRef nodes.
//             For those, several nodes point to the same children, and
//             ownership of the children is handled elsewhere.
//
//-------------------------------------------------------------------------
RBBINode::~RBBINode() {
    delete fInputSet;
    fInputSet = NULL;

    // Reference nodes share their children with other nodes; leave those alone.
    if (fType != varRef && fType != setRef) {
        delete fLeftChild;
        fLeftChild = NULL;
        delete fRightChild;
        fRightChild = NULL;
    }

    delete fFirstPosSet;
    delete fLastPosSet;
    delete fFollowPos;
}
//-------------------------------------------------------------------------
//
// cloneTree    Make a copy of the subtree rooted at this node.
//              A variable reference is not copied; the copy of its
//              definition (its left child) takes its place.
//              A uset node is not copied either; the node itself is
//              returned, so it ends up shared.
//              Used to replicate the expression underneath variable
//              references in preparation for generating the DFA tables.
//
//-------------------------------------------------------------------------
RBBINode *RBBINode::cloneTree() {
    if (fType == RBBINode::varRef) {
        // Skip over the reference and clone the variable's definition instead.
        return fLeftChild->cloneTree();
    }
    if (fType == RBBINode::uset) {
        return this;
    }

    // Ordinary node: copy it, then copy and re-parent each child that exists.
    RBBINode *copy = new RBBINode(*this);
    if (fLeftChild != NULL) {
        RBBINode *leftCopy = fLeftChild->cloneTree();
        copy->fLeftChild = leftCopy;
        leftCopy->fParent = copy;
    }
    if (fRightChild != NULL) {
        RBBINode *rightCopy = fRightChild->cloneTree();
        copy->fRightChild = rightCopy;
        rightCopy->fParent = copy;
    }
    return copy;
}
//-------------------------------------------------------------------------
//
// flattenVariables    Walk a parse tree, replacing any variable
//                     references with a copy of the variable's definition.
//                     Aside from variables, the tree is not changed.
//
//                     Returns the root of the resulting tree. A root that
//                     is not a variable reference is returned unchanged.
//                     A root that is a variable reference is deleted, and
//                     the root of the cloned definition is returned.
//
//                     The walk does nothing until it meets a variable
//                     reference, and calls cloneTree() at that point.
//                     References nested inside the definition are
//                     handled by cloneTree(), not here.
//
//-------------------------------------------------------------------------
RBBINode *RBBINode::flattenVariables() {
    if (fType != varRef) {
        // Not a reference: flatten below each child and re-attach the result.
        if (fLeftChild != NULL) {
            RBBINode *newLeft = fLeftChild->flattenVariables();
            fLeftChild = newLeft;
            newLeft->fParent = this;
        }
        if (fRightChild != NULL) {
            RBBINode *newRight = fRightChild->flattenVariables();
            fRightChild = newRight;
            newRight->fParent = this;
        }
        return this;
    }

    // A reference: replace this node by a copy of the definition it points to.
    RBBINode *definitionCopy = fLeftChild->cloneTree();
    delete this;
    return definitionCopy;
}
//-------------------------------------------------------------------------
//
// flattenSets Walk the parse tree, replacing any nodes of type setRef
// with a copy of the expression tree for the set. A set's
// equivalent expression tree is precomputed and saved as
// the left child of the uset node.
//
//-------------------------------------------------------------------------
void RBBINode::flattenSets() {
U_ASSERT(fType != setRef);
if (fLeftChild != NULL) {
if (fLeftChild->fType==setRef) {
RBBINode *setRefNode = fLeftChild;
RBBINode *usetNode = setRefNode->fLeftChild;
RBBINode *replTree = usetNode->fLeftChild;
fLeftChild = replTree->cloneTree();
fLeftChild->fParent = this;
delete setRefNode;
} else {
fLeftChild->flattenSets();
}
}
if (fRightChild != NULL) {
if (fRightChild->fType==setRef) {
RBBINode *setRefNode = fRightChild;
RBBINode *usetNode = setRefNode->fLeftChild;
RBBINode *replTree = usetNode->fLeftChild;
fRightChild = replTree->cloneTree();
fRightChild->fParent = this;
delete setRefNode;
} else {
fRightChild->flattenSets();
}
}
}
//-------------------------------------------------------------------------
//
// findNodes()    Append to dest every node of the requested kind in the
//                subtree rooted at this node: this node first, then the
//                left subtree, then the right subtree.
//                Does nothing if status already indicates a failure.
//
//-------------------------------------------------------------------------
void RBBINode::findNodes(UVector *dest, RBBINode::NodeType kind, UErrorCode &status) {
    if (U_SUCCESS(status)) {
        if (fType == kind) {
            dest->addElement(this, status);
        }
        if (fLeftChild != NULL) {
            fLeftChild->findNodes(dest, kind, status);
        }
        if (fRightChild != NULL) {
            fRightChild->findNodes(dest, kind, status);
        }
    }
}
//-------------------------------------------------------------------------
//
// print.    Print out a single node, for debugging: its address, type,
//           parent and child addresses, serial number, first position and
//           value, plus the text of the node for variable references.
//           Does nothing unless RBBI_DEBUG is defined.
//
//-------------------------------------------------------------------------
void RBBINode::print() {
#ifdef RBBI_DEBUG
    // Display names, in the order of the NodeType enum values.
    static const char * const nodeTypeNames[] = {
        "setRef",
        "uset",
        "varRef",
        "leafChar",
        "lookAhead",
        "tag",
        "endMark",
        "opStart",
        "opCat",
        "opOr",
        "opStar",
        "opPlus",
        "opQuestion",
        "opBreak",
        "opReverse",
        "opLParen"
    };
    const char *typeName = nodeTypeNames[fType];

    RBBIDebugPrintf("%10p %12s %10p %10p %10p %4d %6d %d ",
        (void *)this, typeName, (void *)fParent, (void *)fLeftChild, (void *)fRightChild,
        fSerialNum, fFirstPos, fVal);
    if (fType == varRef) {
        printUnicodeString(fText);
    }
    RBBIDebugPrintf("\n");
#endif
}
// printUnicodeString    Debugging aid: print the string s one code unit at a
//                       time, then pad with blanks up to minWidth columns.
//                       The printing version is built when RBBI_DEBUG is
//                       defined (print() calls it only in that case); builds
//                       without RBBI_DEBUG get an empty stub.
#ifdef RBBI_DEBUG
void RBBINode::printUnicodeString(const UnicodeString &s, int minWidth)
{
    int i;
    for (i=0; i<s.length(); i++) {
        RBBIDebugPrintf("%c", s.charAt(i));
        // putc(s.charAt(i), stdout);
    }
    for (i=s.length(); i<minWidth; i++) {
        RBBIDebugPrintf(" ");
    }
}
#else
void RBBINode::printUnicodeString(const UnicodeString &, int) {}
#endif
//-------------------------------------------------------------------------
//
// printTree.    Print out the tree of nodes rooted at "this", for debugging.
//
//               printHeading    TRUE to print column headings first.
//               doVars          TRUE to also print the subtree underneath
//                               this node when it is a variable reference.
//
//               The printing version is built when RBBI_DEBUG is defined;
//               builds without RBBI_DEBUG get an empty stub.
//
//-------------------------------------------------------------------------
#ifdef RBBI_DEBUG
void RBBINode::printTree(UBool printHeading, UBool doVars) {
    if (printHeading) {
        RBBIDebugPrintf( "-------------------------------------------------------------------\n"
                         " Address type Parent LeftChild RightChild serial position value\n"
                       );
    }
    this->print();
    // Only dump the definition under a variable reference if asked to.
    // Unconditionally dump children of all other node types.
    // The recursive calls use the default doVars (FALSE).
    if (fType != varRef || doVars) {
        if (fLeftChild != NULL) {
            fLeftChild->printTree(FALSE);
        }
        if (fRightChild != NULL) {
            fRightChild->printTree(FALSE);
        }
    }
}
#else
void RBBINode::printTree(UBool, UBool) {}
#endif
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
--- NEW FILE: rbbinode.h ---
/********************************************************************
* COPYRIGHT:
* Copyright (c) 2001-2002, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
#ifndef RBBINODE_H
#define RBBINODE_H
#include "unicode/utypes.h"
#include "unicode/uobject.h"
//
// class RBBINode
//
// Represents a node in the parse tree generated when reading
// a rule file.
//
U_NAMESPACE_BEGIN
class UnicodeSet;
class UVector;
// RBBINode is used much like a struct: the data members are public and are
// accessed directly by the code that builds and walks the parse tree.
class RBBINode : public UMemory {
public:
// Kind of node. The order must match the nodeTypeNames table in RBBINode::print().
enum NodeType {
setRef,
uset,
varRef,
leafChar,
lookAhead,
tag,
endMark,
opStart,
opCat,
opOr,
opStar,
opPlus,
opQuestion,
opBreak,
opReverse,
opLParen
};
// Precedence values. The constructor assigns a value other than precZero
// only to opStart, opLParen, opOr and opCat nodes.
enum OpPrecedence {
precZero,
precStart,
precLParen,
precOpOr,
precOpCat
};
NodeType fType;
RBBINode *fParent;
RBBINode *fLeftChild;
RBBINode *fRightChild;
UnicodeSet *fInputSet; // For uset nodes only.
OpPrecedence fPrecedence; // For binary ops only.
UnicodeString fText; // Text corresponding to this node.
// May be lazily evaluated when (if) needed
// for some node types.
// NOTE(review): this header includes neither
// unicode/unistr.h nor a declaration of
// UnicodeString; it appears to rely on the
// including file - confirm.
int fFirstPos; // Position in the rule source string of the
// first text associated with the node.
// If there's a left child, this will be the same
// as that child's left pos.
int fLastPos; // Last position in the rule source string
// of any text associated with this node.
// If there's a right child, this will be the same
// as that child's last position.
UBool fNullable; // See Aho.
int32_t fVal; // For leafChar nodes, the value.
// Values are the character category,
// corresponds to columns in the final
// state transition table.
UBool fLookAheadEnd; // For endMark nodes, set TRUE if
// marking the end of a look-ahead rule.
UVector *fFirstPosSet; // The three vectors are created by the constructors
UVector *fLastPosSet; // and deleted by the destructor.
// TODO: rename fFirstPos & fLastPos to avoid confusion.
UVector *fFollowPos;
RBBINode(NodeType t);
RBBINode(const RBBINode &other); // Copies node data, but not the tree links.
~RBBINode(); // Also deletes children, except for varRef and setRef nodes.
RBBINode *cloneTree(); // Copy of this subtree, with variable references expanded.
RBBINode *flattenVariables(); // Replace variable references in place; returns the root.
void flattenSets(); // Replace setRef children by copies of the set's expression.
void findNodes(UVector *dest, RBBINode::NodeType kind, UErrorCode &status);
void print(); // Debugging output for this node only.
void printTree(UBool withHeading=TRUE, UBool doVars=FALSE); // Debugging output for the subtree.
static void printUnicodeString(const UnicodeString &s, int minWidth=0);
private:
RBBINode &operator = (const RBBINode &other); // No defs.
UBool operator == (const RBBINode &other); // Private, so these functions won't accidentally be used.
int fSerialNum; // Debugging aids.
static int gLastSerial;
};
U_NAMESPACE_END
#endif
--- NEW FILE: rbbirb.cpp ---
//
// file: rbbirb.cpp
//
// Copyright (C) 2002-2003, International Business Machines Corporation and others.
// All Rights Reserved.
//
// This file contains the RBBIRuleBuilder class implementation. This is the main class for
// building (compiling) break rules into the tables required by the runtime
// RBBI engine.
//
#include "unicode/utypes.h"
#if !UCONFIG_NO_BREAK_ITERATION
#include "unicode/brkiter.h"
#include "unicode/rbbi.h"
#include "unicode/ubrk.h"
#include "unicode/unistr.h"
#include "unicode/uniset.h"
#include "unicode/uchar.h"
#include "unicode/uchriter.h"
#include "unicode/parsepos.h"
#include "unicode/parseerr.h"
#include "cmemory.h"
#include "cstring.h"
#include "rbbirb.h"
#include "rbbinode.h"
#include "rbbiscan.h"
#include "rbbisetb.h"
#include "rbbitblb.h"
#include "rbbidata.h"
U_NAMESPACE_BEGIN
//----------------------------------------------------------------------------------------
//
// Constructor.
//
//----------------------------------------------------------------------------------------
RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString &rules,
                                 UParseError &parseErr,
                                 UErrorCode &status)
    : fDebugEnv(NULL),
      fStatus(&status),
      fParseError(&parseErr),
      fRules(rules),
      fForwardTree(NULL),
      fReverseTree(NULL),
      fForwardTables(NULL),
      fReverseTables(NULL)
{
    // The builder keeps pointers to the caller's status and parse-error
    // objects and a reference to the rule text; it copies none of them.
#ifdef RBBI_DEBUG
    fDebugEnv = getenv("U_RBBIDEBUG");
#endif

    // The helper objects are created only after the pointers above are set,
    // and in the same order as before (set-node list, scanner, set builder).
    fUSetNodes  = new UVector(status);
    fScanner    = new RBBIRuleScanner(this);
    fSetBuilder = new RBBISetBuilder(this);

    // Any failed allocation is reported through the caller's status.
    const UBool allocationFailed =
        (fSetBuilder == 0) || (fScanner == 0) || (fUSetNodes == 0);
    if (allocationFailed) {
        status = U_MEMORY_ALLOCATION_ERROR;
    }
}
//----------------------------------------------------------------------------------------
//
// Destructor
//
//----------------------------------------------------------------------------------------
RBBIRuleBuilder::~RBBIRuleBuilder() {
    // Delete every uset node held in fUSetNodes, then the helper objects.
    //
    // fUSetNodes is NULL when its allocation failed in the constructor (the
    // constructor reports that case as U_MEMORY_ALLOCATION_ERROR), so it must
    // be checked before it is dereferenced.
    if (fUSetNodes != NULL) {
        int i;
        // The loop stops at the first NULL element; it relies on elementAt()
        // returning NULL once i is past the last stored node.
        for (i=0; ; i++) {
            RBBINode *n = (RBBINode *)fUSetNodes->elementAt(i);
            if (n==NULL) {
                break;
            }
            delete n;
        }
    }

    // delete on a NULL pointer is a no-op, so members that were never
    // allocated need no special handling here.
    delete fUSetNodes;
    delete fSetBuilder;
    delete fForwardTables;
    delete fReverseTables;
    delete fForwardTree;
    delete fReverseTree;
    delete fScanner;
}
//----------------------------------------------------------------------------------------
//
// flattenData() - Collect up the compiled RBBI rule data and put it into
// the format for saving in ICU data files,
// which is also the format needed by the RBBI runtime engine.
//
//----------------------------------------------------------------------------------------
// Round i up to the nearest multiple of 8 (values that are already a
// multiple of 8 are returned unchanged).
static int32_t align8(int32_t i) {
    const int32_t bumped = i + 7;
    // Clearing the three low bits rounds down to a multiple of 8.
    return bumped & ~static_cast<int32_t>(7);
}
// Build the flattened (run-time format) image of the compiled rules.
//
// Returns a block obtained from uprv_malloc() that starts with an
// RBBIDataHeader and is followed by the forward table, the reverse table,
// the trie and the stripped rule source, each section starting on an 8 byte
// boundary. Returns NULL if *fStatus already indicates failure, or if the
// allocation fails (in which case *fStatus is set to
// U_MEMORY_ALLOCATION_ERROR).
RBBIDataHeader *RBBIRuleBuilder::flattenData() {
    if (U_FAILURE(*fStatus)) {
        return NULL;
    }

    // Remove comments and whitespace from the rules to make it smaller.
    UnicodeString strippedRules(RBBIRuleScanner::stripRules(fRules));

    // Calculate the size of each section in the data.
    //   Sizes here are padded up to a multiple of 8 for better memory alignment.
    //   NOTE(review): the original comment says the header stores unpadded
    //   section sizes; that holds for fTrieLen and fRuleSourceLen below, but
    //   fFTableLen and fRTableLen are assigned the padded sizes. Confirm which
    //   the run-time engine expects before changing either.
    //
    int32_t headerSize        = align8(sizeof(RBBIDataHeader));
    int32_t forwardTableSize  = align8(fForwardTables->getTableSize());
    int32_t reverseTableSize  = align8(fReverseTables->getTableSize());
    int32_t trieSize          = align8(fSetBuilder->getTrieSize());
    // Room for the rule text plus one terminating UChar.
    int32_t rulesSize         = align8((strippedRules.length()+1) * sizeof(UChar));

    int32_t totalSize         = headerSize + forwardTableSize + reverseTableSize
                                + trieSize + rulesSize;

    RBBIDataHeader *data = (RBBIDataHeader *)uprv_malloc(totalSize);
    if (data == NULL) {
        *fStatus = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }
    // Zero everything so that padding bytes have a defined value.
    uprv_memset(data, 0, totalSize);

    data->fMagic         = 0xb1a0;
    data->fVersion       = 1;
    data->fLength        = totalSize;
    data->fCatCount      = fSetBuilder->getNumCharCategories();

    // Section offsets are relative to the start of the header.
    data->fFTable        = headerSize;
    data->fFTableLen     = forwardTableSize;
    data->fRTable        = data->fFTable + forwardTableSize;
    data->fRTableLen     = reverseTableSize;
    data->fTrie          = data->fRTable + reverseTableSize;
    data->fTrieLen       = fSetBuilder->getTrieSize();
    data->fRuleSource    = data->fTrie + trieSize;
    data->fRuleSourceLen = strippedRules.length() * sizeof(UChar);

    uprv_memset(data->fReserved, 0, sizeof(data->fReserved));

    fForwardTables->exportTable((uint8_t *)data + data->fFTable);
    fReverseTables->exportTable((uint8_t *)data + data->fRTable);
    fSetBuilder->serializeTrie ((uint8_t *)data + data->fTrie);

    // The destination capacity is counted in UChars and must not exceed the
    // rule-source section, which is the last section of the buffer. The
    // previous value (rulesSize/2+1) claimed one UChar more than was
    // allocated. rulesSize always covers length()+1 UChars, so the text and
    // its terminator still fit.
    strippedRules.extract((UChar *)((uint8_t *)data+data->fRuleSource),
                          (int32_t)(rulesSize / sizeof(UChar)), *fStatus);

    return data;
}
//----------------------------------------------------------------------------------------
//
// createRuleBasedBreakIterator construct from source rules that are passed in
// in a UnicodeString
//
//----------------------------------------------------------------------------------------
// Compile a set of break rules and return a break iterator that uses them.
//
//   rules       The source text of the rules.
//   parseError  Handed to the builder for reporting rule errors.
//   status      In/out error code. Nothing is done if it indicates failure
//               on entry; on any failure NULL is returned.
//
// Returns a newly allocated RuleBasedBreakIterator, or NULL on failure.
BreakIterator *
RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString    &rules,
                                    UParseError      &parseError,
                                    UErrorCode       &status)
{
    if (U_FAILURE(status)) {
        return NULL;
    }

    //
    // Read the input rules, generate a parse tree, symbol table,
    // and list of all Unicode Sets referenced by the rules.
    //
    RBBIRuleBuilder  builder(rules, parseError, status);
    if (U_FAILURE(status)) {
        // The builder's constructor reports allocation failures through
        // status; in that case fScanner may be NULL and must not be used.
        return NULL;
    }
    builder.fScanner->parse();
    if (U_FAILURE(status)) {
        return NULL;
    }

    //
    // UnicodeSet processing.
    //    Munge the Unicode Sets to create a set of character categories.
    //    Generate the mapping tables (TRIE) from input 32-bit characters to
    //    the character categories.
    //
    builder.fSetBuilder->build();

    //
    //   Generate the DFA state transition table.
    //
    builder.fForwardTables = new RBBITableBuilder(&builder, &builder.fForwardTree);
    builder.fReverseTables = new RBBITableBuilder(&builder, &builder.fReverseTree);
    if(builder.fForwardTables == NULL || builder.fReverseTables == NULL) {
        // Whichever table builder was allocated is released by the
        // RBBIRuleBuilder destructor when builder goes out of scope.
        status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }

    builder.fForwardTables->build();
    builder.fReverseTables->build();
    if (U_FAILURE(status)) {
        return NULL;
    }

    //
    //   Package up the compiled data into a memory image
    //      in the run-time format.
    //
    RBBIDataHeader *data = builder.flattenData();

    //
    //  Create a break iterator from the compiled rules.
    //     (Identical to creation from stored pre-compiled rules)
    //
    RuleBasedBreakIterator *This = new RuleBasedBreakIterator(data, status);
    /* test for NULL */
    if(This == NULL) {
        // The iterator's constructor never ran, so nothing has taken over
        // the flattened data; release it here instead of leaking it.
        if (data != NULL) {
            uprv_free(data);
        }
        status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }

    if (U_FAILURE(status)) {
        delete This;
        This = NULL;
    }
    return This;
}
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
--- NEW FILE: rbbirb.h ---
//
// rbbirb.h
//
// Copyright (C) 2002, International Business Machines Corporation and others.
// All Rights Reserved.
//
// This file contains declarations for several classes from the Rule Based Break Iterator rule builder.
//
#ifndef RBBIRB_H
#define RBBIRB_H
#include "unicode/utypes.h"
#include "unicode/uobject.h"
#include "unicode/rbbi.h"
#include "unicode/uniset.h"
#include "unicode/parseerr.h"
#include "uhash.h"
#include "uvector.h"
#include "symtable.h" // For UnicodeSet parsing, is the interface that
// looks up references to $variables within a set.
U_NAMESPACE_BEGIN
class RBBIRuleScanner;
struct RBBIRuleTableEl;
class RBBISetBuilder;
class RBBINode;
class RBBITableBuilder;
//--------------------------------------------------------------------------------
//
// RBBISymbolTable. Implements SymbolTable interface that is used by the
// UnicodeSet parser to resolve references to $variables.
//
//--------------------------------------------------------------------------------
// One entry of the symbol table: a key string paired with a parse tree node.
class RBBISymbolTableEntry : public UMemory { // The symbol table hash table contains one
public: // of these structs for each entry.
RBBISymbolTableEntry();
UnicodeString key; // The entry's key (presumably the $variable name -- confirm).
RBBINode *val; // The node associated with the key.
~RBBISymbolTableEntry();
private:
RBBISymbolTableEntry(const RBBISymbolTableEntry &other); // forbid copying of this class
RBBISymbolTableEntry &operator=(const RBBISymbolTableEntry &other); // forbid copying of this class
};
// Symbol table for $variables. Implements the SymbolTable interface so that
// the UnicodeSet parser can resolve $variable references, and adds
// node-based lookup/insert functions for the rule scanner.
class RBBISymbolTable : public UMemory, public SymbolTable {
private:
const UnicodeString &fRules; // Held by reference, not copied.
UHashtable *fHashTable;
RBBIRuleScanner *fRuleScanner;
// These next two fields are part of the mechanism for passing references to
// already-constructed UnicodeSets back to the UnicodeSet constructor
// when the pattern includes $variable references.
const UnicodeString ffffString; // = "/uffff" (sic; presumably the single character U+FFFF -- confirm in the constructor)
UnicodeSet *fCachedSetLookup;
public:
// API inherited from class SymbolTable
virtual const UnicodeString* lookup(const UnicodeString& s) const;
virtual const UnicodeFunctor* lookupMatcher(UChar32 ch) const;
virtual UnicodeString parseReference(const UnicodeString& text,
ParsePosition& pos, int32_t limit) const;
// Additional Functions
RBBISymbolTable(RBBIRuleScanner *, const UnicodeString &fRules, UErrorCode &status);
virtual ~RBBISymbolTable();
// Look up / add the parse tree node stored under a key.
virtual RBBINode *lookupNode(const UnicodeString &key) const;
virtual void addEntry (const UnicodeString &key, RBBINode *val, UErrorCode &err);
virtual void print() const;
private:
RBBISymbolTable(const RBBISymbolTable &other); // forbid copying of this class
RBBISymbolTable &operator=(const RBBISymbolTable &other); // forbid copying of this class
};
//--------------------------------------------------------------------------------
//
// class RBBIRuleBuilder The top-level class handling RBBI rule compiling.
//
//--------------------------------------------------------------------------------
// RBBIRuleBuilder ties together the scanner, the set builder and the table
// builders, and holds the state they share while one set of rules is compiled.
class RBBIRuleBuilder : public UMemory {
public:
// Create a rule based break iterator from a set of rules.
// This function is the main entry point into the rule builder. The
// public ICU API for creating RBBIs uses this function to do the actual work.
//
static BreakIterator * createRuleBasedBreakIterator( const UnicodeString &rules,
UParseError &parseError,
UErrorCode &status);
public:
// The "public" functions and data members that appear below are accessed
// (and shared) by the various parts that make up the rule builder. They
// are NOT intended to be accessed by anything outside of the
// rule builder implementation.
RBBIRuleBuilder(const UnicodeString &rules,
UParseError &parseErr,
UErrorCode &status
);
virtual ~RBBIRuleBuilder();
char *fDebugEnv; // controls debug trace output
UErrorCode *fStatus; // Error reporting. Keeping status
UParseError *fParseError; // here avoids passing it everywhere.
// fRules is a reference, so the rule string must remain valid for as long
// as the builder is in use.
const UnicodeString &fRules; // The rule string that we are compiling
RBBIRuleScanner *fScanner; // The scanner.
RBBINode *fForwardTree; // The parse trees, generated by the scanner,
RBBINode *fReverseTree; // then manipulated by subsequent steps.
RBBISetBuilder *fSetBuilder; // Set and Character Category builder.
UVector *fUSetNodes; // Vector of all uset nodes.
RBBITableBuilder *fForwardTables; // State transition tables
RBBITableBuilder *fReverseTables;
RBBIDataHeader *flattenData(); // Create the flattened (runtime format)
// data tables.
private:
RBBIRuleBuilder(const RBBIRuleBuilder &other); // forbid copying of this class
RBBIRuleBuilder &operator=(const RBBIRuleBuilder &other); // forbid copying of this class
};
//----------------------------------------------------------------------------
//
// RBBISetTableEl is an entry in the hash table of UnicodeSets that have
// been encountered. The val Node will be of nodetype uset
// and contain pointers to the actual UnicodeSets.
// The Key is the source string for initializing the set.
//
// The hash table is used to avoid creating duplicate
// unnamed (not $var references) UnicodeSets.
//
// Memory Management:
// The Hash Table owns these RBBISetTableEl structs and
// the key strings. It does NOT own the val nodes.
//
//----------------------------------------------------------------------------
// Hash table element pairing a set's source string with its uset node.
struct RBBISetTableEl {
UnicodeString *key; // Source string used for initializing the set.
RBBINode *val; // Node of type uset.
};
//----------------------------------------------------------------------------
//
// RBBIDebugPrintf Printf equivalent, for debugging output.
// Conditional compilation of the implementation lets us
// get rid of the stdio dependency in environments where it
// is unavailable.
//
//----------------------------------------------------------------------------
// With RBBI_DEBUG defined, RBBIDebugPrintf is simply printf. Otherwise it is
// an inline function that accepts any arguments and does nothing.
// NOTE(review): <stdio.h> is included here between U_NAMESPACE_BEGIN and
// U_NAMESPACE_END. If it has not already been included by then, its
// declarations would be made inside the ICU namespace; consider moving the
// #include to the top of the file.
#ifdef RBBI_DEBUG
#include <stdio.h>
#define RBBIDebugPrintf printf
#else
inline void RBBIDebugPrintf(...) {}
#endif
U_NAMESPACE_END
#endif
--- NEW FILE: rbbirpt.h ---
//---------------------------------------------------------------------------------
//
// Generated Header File. Do not edit by hand.
// This file contains the state table for the ICU Rule Based Break Iterator
// rule parser.
// It is generated by the Perl script "rbbicst.pl" from
// the rule parser state definitions file "rbbirpt.txt".
//
// Copyright (C) 2002 International Business Machines Corporation
// and others. All rights reserved.
//
//---------------------------------------------------------------------------------
#ifndef RBBIRPT_H
#define RBBIRPT_H
U_NAMESPACE_BEGIN
//
// Character classes for RBBI rule scanning.
//
// Each value below is a character class index (128 and above) as used in the
// fCharClass column of the state table; values below 128 in that column
// stand for an individual ASCII character.
static const uint8_t kRuleSet_digit_char = 128;
static const uint8_t kRuleSet_rule_char = 129;
static const uint8_t kRuleSet_white_space = 130;
static const uint8_t kRuleSet_name_char = 131;
static const uint8_t kRuleSet_name_start_char = 132;
// Actions that can appear in the fAction column of the rule parser state
// table. The names match the action names used in rbbirpt.txt.
// NOTE(review): rbbiLastAction looks like an end marker rather than a real
// action (it is not used in the table below) -- confirm.
enum RBBI_RuleParseAction {
doExprOrOperator,
doRuleErrorAssignExpr,
doTagValue,
doEndAssign,
doRuleError,
doVariableNameExpectedErr,
doRuleChar,
doLParen,
doSlash,
doStartTagValue,
doDotAny,
doExprFinished,
doScanUnicodeSet,
doExprRParen,
doStartVariableName,
doTagExpectedError,
doTagDigit,
doUnaryOpStar,
doEndVariableName,
doNOP,
doUnaryOpQuestion,
doExit,
doStartAssign,
doEndOfRule,
doUnaryOpPlus,
doExprStart,
doExprCatOperator,
doReverseDir,
doCheckVarDef,
rbbiLastAction};
//-------------------------------------------------------------------------------
//
// RBBIRuleTableEl represents the structure of a row in the transition table
// for the rule parser state machine.
//-------------------------------------------------------------------------------
// One row of the rule parser state table.
struct RBBIRuleTableEl {
RBBI_RuleParseAction fAction; // Action to perform when this row matches.
uint8_t fCharClass; // 0-127: an individual ASCII character
// 128-255: character class index
uint8_t fNextState; // 0-250: normal next-state numbers
// 255: pop next-state from stack.
uint8_t fPushState; // State to push onto the state stack; the table uses 0 in rows that push nothing.
UBool fNextChar; // TRUE in rows that rbbirpt.txt marks with 'n' (advance to the next input character).
};
// The rule parser state table, generated from rbbirpt.txt.
// Columns follow the fields of RBBIRuleTableEl:
//    { action, character or character class, next state, push state, advance input }
// The trailing comment on each row gives the row number and, on the first row
// of each state, the state name. Next-state and push-state values are row numbers.
// Comparing rows with rbbirpt.txt, the special character class values are
// 254 = "escaped", 252 = "eof" and 255 = "default"; a next state of 255 means "pop".
// This is generated data: change rbbirpt.txt and regenerate rather than
// editing rows here.
static const struct RBBIRuleTableEl gRuleParseStateTable[] = {
{doNOP, 0, 0, 0, TRUE}
, {doExprStart, 254, 12, 8, FALSE} // 1 start
, {doNOP, 130, 1,0, TRUE} // 2
, {doExprStart, 36 /* $ */, 71, 81, FALSE} // 3
, {doReverseDir, 33 /* ! */, 11,0, TRUE} // 4
, {doNOP, 59 /* ; */, 1,0, TRUE} // 5
, {doNOP, 252, 0,0, FALSE} // 6
, {doExprStart, 255, 12, 8, FALSE} // 7
, {doEndOfRule, 59 /* ; */, 1,0, TRUE} // 8 break-rule-end
, {doNOP, 130, 8,0, TRUE} // 9
, {doRuleError, 255, 86,0, FALSE} // 10
, {doExprStart, 255, 12, 8, FALSE} // 11 reverse-rule
, {doRuleChar, 254, 21,0, TRUE} // 12 term
, {doNOP, 130, 12,0, TRUE} // 13
, {doRuleChar, 129, 21,0, TRUE} // 14
, {doNOP, 91 /* [ */, 77, 21, FALSE} // 15
, {doLParen, 40 /* ( */, 12, 21, TRUE} // 16
, {doNOP, 36 /* $ */, 71, 20, FALSE} // 17
, {doDotAny, 46 /* . */, 21,0, TRUE} // 18
, {doRuleError, 255, 86,0, FALSE} // 19
, {doCheckVarDef, 255, 21,0, FALSE} // 20 term-var-ref
, {doNOP, 130, 21,0, TRUE} // 21 expr-mod
, {doUnaryOpStar, 42 /* * */, 26,0, TRUE} // 22
, {doUnaryOpPlus, 43 /* + */, 26,0, TRUE} // 23
, {doUnaryOpQuestion, 63 /* ? */, 26,0, TRUE} // 24
, {doNOP, 255, 26,0, FALSE} // 25
, {doExprCatOperator, 254, 12,0, FALSE} // 26 expr-cont
, {doNOP, 130, 26,0, TRUE} // 27
, {doExprCatOperator, 129, 12,0, FALSE} // 28
, {doExprCatOperator, 91 /* [ */, 12,0, FALSE} // 29
, {doExprCatOperator, 40 /* ( */, 12,0, FALSE} // 30
, {doExprCatOperator, 36 /* $ */, 12,0, FALSE} // 31
, {doExprCatOperator, 46 /* . */, 12,0, FALSE} // 32
, {doExprCatOperator, 47 /* / */, 38,0, FALSE} // 33
, {doExprCatOperator, 123 /* { */, 50,0, TRUE} // 34
, {doExprOrOperator, 124 /* | */, 12,0, TRUE} // 35
, {doExprRParen, 41 /* ) */, 255,0, TRUE} // 36
, {doExprFinished, 255, 255,0, FALSE} // 37
, {doSlash, 47 /* / */, 40,0, TRUE} // 38 look-ahead
, {doNOP, 255, 86,0, FALSE} // 39
, {doExprCatOperator, 254, 12,0, FALSE} // 40 expr-cont-no-slash
, {doNOP, 130, 26,0, TRUE} // 41
, {doExprCatOperator, 129, 12,0, FALSE} // 42
, {doExprCatOperator, 91 /* [ */, 12,0, FALSE} // 43
, {doExprCatOperator, 40 /* ( */, 12,0, FALSE} // 44
, {doExprCatOperator, 36 /* $ */, 12,0, FALSE} // 45
, {doExprCatOperator, 46 /* . */, 12,0, FALSE} // 46
, {doExprOrOperator, 124 /* | */, 12,0, TRUE} // 47
, {doExprRParen, 41 /* ) */, 255,0, TRUE} // 48
, {doExprFinished, 255, 255,0, FALSE} // 49
, {doNOP, 130, 50,0, TRUE} // 50 tag-open
, {doStartTagValue, 128, 53,0, FALSE} // 51
, {doTagExpectedError, 255, 86,0, FALSE} // 52
, {doNOP, 130, 57,0, TRUE} // 53 tag-value
, {doNOP, 125 /* } */, 57,0, FALSE} // 54
, {doTagDigit, 128, 53,0, TRUE} // 55
, {doTagExpectedError, 255, 86,0, FALSE} // 56
, {doNOP, 130, 57,0, TRUE} // 57 tag-close
, {doTagValue, 125 /* } */, 60,0, TRUE} // 58
, {doTagExpectedError, 255, 86,0, FALSE} // 59
, {doExprCatOperator, 254, 12,0, FALSE} // 60 expr-cont-no-tag
, {doNOP, 130, 60,0, TRUE} // 61
, {doExprCatOperator, 129, 12,0, FALSE} // 62
, {doExprCatOperator, 91 /* [ */, 12,0, FALSE} // 63
, {doExprCatOperator, 40 /* ( */, 12,0, FALSE} // 64
, {doExprCatOperator, 36 /* $ */, 12,0, FALSE} // 65
, {doExprCatOperator, 46 /* . */, 12,0, FALSE} // 66
, {doExprCatOperator, 47 /* / */, 38,0, FALSE} // 67
, {doExprOrOperator, 124 /* | */, 12,0, TRUE} // 68
, {doExprRParen, 41 /* ) */, 255,0, TRUE} // 69
, {doExprFinished, 255, 255,0, FALSE} // 70
, {doStartVariableName, 36 /* $ */, 73,0, TRUE} // 71 scan-var-name
, {doNOP, 255, 86,0, FALSE} // 72
, {doNOP, 132, 75,0, TRUE} // 73 scan-var-start
, {doVariableNameExpectedErr, 255, 86,0, FALSE} // 74
, {doNOP, 131, 75,0, TRUE} // 75 scan-var-body
, {doEndVariableName, 255, 255,0, FALSE} // 76
, {doScanUnicodeSet, 91 /* [ */, 255,0, TRUE} // 77 scan-unicode-set
, {doScanUnicodeSet, 112 /* p */, 255,0, TRUE} // 78
, {doScanUnicodeSet, 80 /* P */, 255,0, TRUE} // 79
, {doNOP, 255, 86,0, FALSE} // 80
, {doNOP, 130, 81,0, TRUE} // 81 assign-or-rule
, {doStartAssign, 61 /* = */, 12, 84, TRUE} // 82
, {doNOP, 255, 20, 8, FALSE} // 83
, {doEndAssign, 59 /* ; */, 1,0, TRUE} // 84 assign-end
, {doRuleErrorAssignExpr, 255, 86,0, FALSE} // 85
, {doExit, 255, 86,0, TRUE} // 86 errorDeath
};
// Names of the parser states, indexed by row number of gRuleParseStateTable.
// Only the first row of each state has a name; every other entry is 0.
// (Presumably used for debug output -- confirm in rbbiscan.cpp.)
static const char * const RBBIRuleStateNames[] = { 0,
"start",
0,
0,
0,
0,
0,
0,
"break-rule-end",
0,
0,
"reverse-rule",
"term",
0,
0,
0,
0,
0,
0,
0,
"term-var-ref",
"expr-mod",
0,
0,
0,
0,
"expr-cont",
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
"look-ahead",
0,
"expr-cont-no-slash",
0,
0,
0,
0,
0,
0,
0,
0,
0,
"tag-open",
0,
0,
"tag-value",
0,
0,
0,
"tag-close",
0,
0,
"expr-cont-no-tag",
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
"scan-var-name",
0,
"scan-var-start",
0,
"scan-var-body",
0,
"scan-unicode-set",
0,
0,
0,
"assign-or-rule",
0,
0,
"assign-end",
0,
"errorDeath",
0};
U_NAMESPACE_END
#endif
--- NEW FILE: rbbirpt.txt ---
#*****************************************************************************
#
# Copyright (C) 2002, International Business Machines Corporation and others.
# All Rights Reserved.
#
#*****************************************************************************
#
# file: rbbirpt.txt
# ICU Break Iterator Rule Parser State Table
#
# This state table is used when reading and parsing a set of RBBI rules
# The rule parser uses a state machine; the data in this file define the
# state transitions that occur for each input character.
#
# *** This file defines the RBBI rule grammar. This is it.
# *** The determination of what is accepted is here.
#
# This file is processed by a perl script "rbbicst.pl" to produce initialized C arrays
# that are then built with the rule parser.
#
#
# Here is the syntax of the state definitions in this file:
#
#
#StateName:
# input-char n next-state ^push-state action
# input-char n next-state ^push-state action
# | | | | |
# | | | | |--- action to be performed by state machine
# | | | | See function RBBIRuleScanner::doParseActions()
# | | | |
# | | | |--- Push this named state onto the state stack.
# | | | Later, when next state is specified as "pop",
# | | | the pushed state will become the current state.
# | | |
# | | |--- Transition to this state if the current input character matches the input
# | | character or char class in the left hand column. "pop" causes the next
# | | state to be popped from the state stack.
# | |
# | |--- When making the state transition specified on this line, advance to the next
# | character from the input only if 'n' appears here.
# |
# |--- Character or named character classes to test for. If the current character being scanned
# matches, perform the actions and go to the state specified on this line.
# The input character is tested sequentially, in the order written. The characters and
# character classes tested for do not need to be mutually exclusive. The first match wins.
#
#
# start state, scan position is at the beginning of the rules file, or in between two rules.
#
start:
escaped term ^break-rule-end doExprStart
white_space n start
'$' scan-var-name ^assign-or-rule doExprStart
'!' n reverse-rule doReverseDir
';' n start # ignore empty rules.
eof exit
default term ^break-rule-end doExprStart
#
# break-rule-end: Returned from doing a break-rule expression.
#
break-rule-end:
';' n start doEndOfRule
white_space n break-rule-end
default errorDeath doRuleError
#
# Reverse Rule We've just scanned a '!', indicating a reverse direction rule.
# A rule expression must follow.
#
reverse-rule:
default term ^break-rule-end doExprStart
#
# term. Eat through a single rule character, or a composite thing, which
# could be a parenthesized expression, a variable name, or a Unicode Set.
#
term:
escaped n expr-mod doRuleChar
white_space n term
rule_char n expr-mod doRuleChar
'[' scan-unicode-set ^expr-mod
'(' n term ^expr-mod doLParen
'$' scan-var-name ^term-var-ref
'.' n expr-mod doDotAny
default errorDeath doRuleError
#
# term-var-ref We've just finished scanning a reference to a $variable.
# Check that the variable was defined.
# The variable name scanning is in common with assignment statements,
# so the check can't be done there.
term-var-ref:
default expr-mod doCheckVarDef
#
# expr-mod We've just finished scanning a term, now look for the optional
# trailing '*', '?', '+'
#
expr-mod:
white_space n expr-mod
'*' n expr-cont doUnaryOpStar
'+' n expr-cont doUnaryOpPlus
'?' n expr-cont doUnaryOpQuestion
default expr-cont
#
# expr-cont Expression, continuation. At a point where additional terms are
# allowed, but not required.
#
expr-cont:
escaped term doExprCatOperator
white_space n expr-cont
rule_char term doExprCatOperator
'[' term doExprCatOperator
'(' term doExprCatOperator
'$' term doExprCatOperator
'.' term doExprCatOperator
'/' look-ahead doExprCatOperator
'{' n tag-open doExprCatOperator
'|' n term doExprOrOperator
')' n pop doExprRParen
default pop doExprFinished
#
# look-ahead Scanning a '/', which identifies a break point, assuming that the
# remainder of the expression matches.
#
# Generate a parse tree as if this was a special kind of input symbol
# appearing in an otherwise normal concatenation expression.
#
look-ahead:
'/' n expr-cont-no-slash doSlash
default errorDeath
#
# expr-cont-no-slash Expression, continuation. At a point where additional terms are
# allowed, but not required. Just like
# expr-cont, above, except that no '/'
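# look-ahead symbol is permitted.
# NOTE(review): the white_space row below goes back to expr-cont, which
# accepts '/' again, whereas expr-cont-no-tag loops back to itself on
# white space. Confirm whether that is intended.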
#
expr-cont-no-slash:
escaped term doExprCatOperator
white_space n expr-cont
rule_char term doExprCatOperator
'[' term doExprCatOperator
'(' term doExprCatOperator
'$' term doExprCatOperator
'.' term doExprCatOperator
'|' n term doExprOrOperator
')' n pop doExprRParen
default pop doExprFinished
#
# tags scanning a '{', the opening delimiter for a tag that identifies
# the kind of match. Scan the whole {dddd} tag, where d=digit
#
tag-open:
white_space n tag-open
digit_char tag-value doStartTagValue
default errorDeath doTagExpectedError
tag-value:
white_space n tag-close
'}' tag-close
digit_char n tag-value doTagDigit
default errorDeath doTagExpectedError
tag-close:
white_space n tag-close
'}' n expr-cont-no-tag doTagValue
default errorDeath doTagExpectedError
#
# expr-cont-no-tag Expression, continuation. At a point where additional terms are
# allowed, but not required. Just like
# expr-cont, above, except that no "{ddd}"
# tagging is permitted.
#
expr-cont-no-tag:
escaped term doExprCatOperator
white_space n expr-cont-no-tag
rule_char term doExprCatOperator
'[' term doExprCatOperator
'(' term doExprCatOperator
'$' term doExprCatOperator
'.' term doExprCatOperator
'/' look-ahead doExprCatOperator
'|' n term doExprOrOperator
')' n pop doExprRParen
default pop doExprFinished
#
# Variable Name Scanning.
#
# The state that branched to here must have pushed a return state
# to go to after completion of the variable name scanning.
#
# The current input character must be the $ that introduces the name.
# The $ is consumed here rather than in the state that first detected it
# so that the doStartVariableName action only needs to happen in one
# place (here), and the other states don't need to worry about it.
#
scan-var-name:
'$' n scan-var-start doStartVariableName
default errorDeath
scan-var-start:
name_start_char n scan-var-body
default errorDeath doVariableNameExpectedErr
scan-var-body:
name_char n scan-var-body
default pop doEndVariableName
#
# scan-unicode-set Unicode Sets are parsed by the UnicodeSet class.
# Within the RBBI parser, after finding the first character
# of a Unicode Set, we just hand the rule input at that
# point off to the Unicode Set constructor, then pick
# up parsing after the close of the set.
#
# The action for this state invokes the UnicodeSet parser.
#
scan-unicode-set:
'[' n pop doScanUnicodeSet
'p' n pop doScanUnicodeSet
'P' n pop doScanUnicodeSet
default errorDeath
#
# assign-or-rule. A $variable was encountered at the start of something, could be
# either an assignment statement or a rule, depending on whether an '='
# follows the variable name. We get to this state when the variable name
# scanning does a return.
#
assign-or-rule:
white_space n assign-or-rule
'=' n term ^assign-end doStartAssign # variable was target of assignment
default term-var-ref ^break-rule-end # variable was a term in a rule
#
# assign-end This state is entered when the end of the expression on the
# right hand side of an assignment is found. We get here via
# a pop; this state is pushed when the '=' in an assignment is found.
#
# The only thing allowed at this point is a ';'. The RHS of an
# assignment must look like a rule expression, and we come here
# when what is being scanned no longer looks like an expression.
#
assign-end:
';' n start doEndAssign
default errorDeath doRuleErrorAssignExpr
#
# errorDeath. This state is specified as the next state whenever a syntax error
# in the source rules is detected. Barring bugs, the state machine will never
# actually get here, but will stop because of the action associated with the error.
# But, just in case, this state asks the state machine to exit.
errorDeath:
default n errorDeath doExit
--- NEW FILE: rbbiscan.cpp ---
//
// file: rbbiscan.cpp
//
// Copyright (C) 2002-2003, International Business Machines Corporation and others.
// All Rights Reserved.
//
// This file contains the Rule Based Break Iterator Rule Builder functions for
// scanning the rules and assembling a parse tree. This is the first phase
// of compiling the rules.
//
// The overall compilation of the rules is managed by class RBBIRuleBuilder, which will
// create and use an instance of this class as part of the process.
//
#include "unicode/utypes.h"
#if !UCONFIG_NO_BREAK_ITERATION
[...1098 lines suppressed...]
RBBINode *n;
n = pushNewNode(RBBINode::setRef);
n->fFirstPos = startPos;
n->fLastPos = fNextIndex;
fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
// findSetFor() serves several purposes here:
// - Adopts storage for the UnicodeSet, will be responsible for deleting.
// - Mantains collection of all sets in use, needed later for establishing
// character categories for run time engine.
// - Eliminates mulitiple instances of the same set.
// - Creates a new uset node if necessary (if this isn't a duplicate.)
findSetFor(n->fText, n, uset);
}
}
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
--- NEW FILE: rbbiscan.h ---
//
// rbbiscan.h
//
// Copyright (C) 2002-2003, International Business Machines Corporation and others.
// All Rights Reserved.
//
// This file contains declarations for class RBBIRuleScanner
//
#ifndef RBBISCAN_H
#define RBBISCAN_H
#include "unicode/utypes.h"
#include "unicode/uobject.h"
#include "unicode/rbbi.h"
#include "unicode/uniset.h"
#include "unicode/parseerr.h"
#include "uhash.h"
#include "uvector.h"
#include "symtable.h" // For UnicodeSet parsing, is the interface that
// looks up references to $variables within a set.
#include "rbbinode.h"
//#include "rbbitblb.h"
U_NAMESPACE_BEGIN
class RBBIRuleBuilder;
class RBBISymbolTable;
//--------------------------------------------------------------------------------
//
// class RBBIRuleScanner does the lowest level, character-at-a-time
// scanning of break iterator rules.
//
// The output of the scanner is parse trees for
// the rule expressions and a list of all Unicode Sets
// encountered.
//
//--------------------------------------------------------------------------------
// kStackSize is the array length of both the state stack (fStack) and the
// node stack (fNodeStack) in RBBIRuleScanner.
static const int kStackSize = 100; // The size of the state stack for
// rules parsing. Corresponds roughly
// to the depth of parentheses nesting
// that is allowed in the rules.
// EParseAction is the parameter type of RBBIRuleScanner::doParseActions().
enum EParseAction {dummy01, dummy02}; // Placeholder enum for the specifier for
// actions that are specified in the
// rule parsing state table.
// RBBIRuleScanner scans the rule text a character at a time and drives the
// rule parser state machine, producing parse trees and the list of Unicode
// Sets encountered.
class RBBIRuleScanner : public UMemory {
public:
// A scanned character together with a flag saying whether it was escaped.
struct RBBIRuleChar {
UChar32 fChar;
UBool fEscaped;
};
RBBIRuleScanner(RBBIRuleBuilder *rb);
virtual ~RBBIRuleScanner();
void nextChar(RBBIRuleChar &c); // Get the next char from the input stream.
// The character is delivered through c; the
// function itself returns nothing.
UBool push(const RBBIRuleChar &c); // Push (unget) one character.
// Only a single character may be pushed.
void parse(); // Parse the rules, generating two parse
// trees, one each for the forward and
// reverse rules,
// and a list of UnicodeSets encountered.
/**
* Return a rules string without unnecessary
* characters.
*/
static UnicodeString stripRules(const UnicodeString &rules);
private:
UBool doParseActions(EParseAction a);
void error(UErrorCode e); // error reporting convenience function.
void fixOpStack(RBBINode::OpPrecedence p);
void findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt = NULL);
UChar32 nextCharLL();
void printNodeStack(const char *title);
RBBINode *pushNewNode(RBBINode::NodeType t);
void scanSet();
RBBIRuleBuilder *fRB; // The rule builder that we are part of.
int32_t fScanIndex; // Index of current character being processed
// in the rule input string.
int32_t fNextIndex; // Index of the next character, which
// is the first character not yet scanned.
UBool fQuoteMode; // Scan is in a 'quoted region'
int fLineNum; // Line number in input file.
int fCharNum; // Char position within the line.
UChar32 fLastChar; // Previous char, needed to count CR-LF
// as a single line, not two.
RBBIRuleChar fC; // Current char for parse state machine
// processing.
UnicodeString fVarName; // $variableName, valid when we've just
// scanned one.
RBBIRuleTableEl **fStateTable; // State Transition Table for RBBI Rule
// parsing. index by p[state][char-class]
uint16_t fStack[kStackSize]; // State stack, holds state pushes
int fStackPtr; // and pops as specified in the state
// transition rules.
RBBINode *fNodeStack[kStackSize]; // Node stack, holds nodes created
// during the parse of a rule
int fNodeStackPtr;
UBool fReverseRule; // True if the rule currently being scanned
// is a reverse direction rule (if it
// starts with a '!')
UBool fLookAheadRule; // True if the rule includes a '/'
// somewhere within it.
RBBISymbolTable *fSymbolTable; // symbol table, holds definitions of
// $variable symbols.
UHashtable *fSetTable; // UnicodeSet hash table, holds indexes to
// the sets created while parsing rules.
// The key is the string used for creating
// the set.
UnicodeSet *fRuleSets[10]; // Unicode Sets that are needed during
// the scanning of RBBI rules. The
// indices for these are assigned by the
// perl script that builds the state tables.
// See rbbirpt.h.
int32_t fRuleNum; // Counts each rule as it is scanned.
// Despite the 'g' prefix, the following are ordinary per-scanner members.
UnicodeSet *gRuleSet_rule_char;
UnicodeSet *gRuleSet_white_space;
UnicodeSet *gRuleSet_name_char;
UnicodeSet *gRuleSet_name_start_char;
RBBIRuleScanner(const RBBIRuleScanner &other); // forbid copying of this class
RBBIRuleScanner &operator=(const RBBIRuleScanner &other); // forbid copying of this class
};
U_NAMESPACE_END
#endif
--- NEW FILE: rbbisetb.cpp ---
//
// rbbisetb.cpp
//
/*
***************************************************************************
* Copyright (C) 2002-2003 International Business Machines Corporation *
* and others. All rights reserved. *
***************************************************************************
*/
//
// RBBISetBuilder Handles processing of Unicode Sets from RBBI rules
// (part of the rule building process.)
//
// Starting with the rules parse tree from the scanner,
//
// - Enumerate the set of UnicodeSets that are referenced
// by the RBBI rules.
// - compute a set of non-overlapping character ranges
// with all characters within a range belonging to the same
// set of input unicode sets.
// - Derive a set of non-overlapping UnicodeSet (like things)
// that will correspond to columns in the state table for
// the RBBI execution engine. All characters within one
// of these sets belong to the same set of the original
// UnicodeSets from the user's rules.
// - construct the trie table that maps input characters
// to the index of the matching non-overlapping set of set from
// the previous step.
//
#include "unicode/utypes.h"
#if !UCONFIG_NO_BREAK_ITERATION
#include "unicode/uniset.h"
#include "utrie.h"
#include "uvector.h"
#include "uassert.h"
#include "cmemory.h"
#include "cstring.h"
#include "rbbisetb.h"
#include "rbbinode.h"
//------------------------------------------------------------------------
//
// getFoldedRBBIValue Call-back function used during building of Trie table.
// Folding value: just store the offset (16 bits)
// if there is any non-0 entry.
// (It'd really be nice if the Trie builder would provide a
// simple default, so this function could go away from here.)
//
//------------------------------------------------------------------------
/* folding value: just store the offset (16 bits) if there is any non-0 entry */
U_CDECL_BEGIN
static uint32_t U_CALLCONV
getFoldedRBBIValue(UNewTrie *trie, UChar32 start, int32_t offset) {
    // Examine the 0x400 code points beginning at 'start'. As soon as one of
    // them maps to a non-zero value, the folded value is the offset with
    // bit 15 set; if none does, the folded value is 0.
    const UChar32 scanLimit = start + 0x400;
    UChar32 c = start;

    while (c < scanLimit) {
        UBool inBlockZero;
        const uint32_t value = utrie_get32(trie, c, &inBlockZero);

        if (inBlockZero) {
            // utrie_get32 reported block zero: skip ahead a whole data block.
            c += UTRIE_DATA_BLOCK_LENGTH;
            continue;
        }
        if (value != 0) {
            return (uint32_t)(offset | 0x8000);
        }
        ++c;
    }
    return 0;
}
U_CDECL_END
U_NAMESPACE_BEGIN
//------------------------------------------------------------------------
//
// Constructor
//
//------------------------------------------------------------------------
// Remember the owning rule builder and share its status pointer; the range
// list, the trie and the counters all start out empty.
RBBISetBuilder::RBBISetBuilder(RBBIRuleBuilder *rb)
    : fRB(rb),
      fStatus(rb->fStatus),
      fRangeList(0),
      fTrie(0),
      fTrieSize(0),
      fGroupCount(0)
{
}
//------------------------------------------------------------------------
//
// Destructor
//
//------------------------------------------------------------------------
RBBISetBuilder::~RBBISetBuilder()
{
    // Delete each RangeDescriptor on the linked list. The successor is
    // fetched before the current node is deleted.
    RangeDescriptor *current = fRangeList;
    while (current != NULL) {
        RangeDescriptor *following = current->fNext;
        delete current;
        current = following;
    }

    // Release the trie under construction.
    utrie_close(fTrie);
}
//------------------------------------------------------------------------
//
// build Build the list of non-overlapping character ranges
// from the Unicode Sets.
//
//------------------------------------------------------------------------
void RBBISetBuilder::build() {
RBBINode *usetNode;
RangeDescriptor *rlRange;
if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "usets")) {printSets();}
//
// Initialize the process by creating a single range encompassing all characters
// that is in no sets.
//
fRangeList = new RangeDescriptor(*fStatus);
fRangeList->fStartChar = 0;
fRangeList->fEndChar = 0x10ffff;
//
// Find the set of non-overlapping ranges of characters
//
int ni;
for (ni=0; ; ni++) {
usetNode = (RBBINode *)this->fRB->fUSetNodes->elementAt(ni);
if (usetNode==NULL) {
break;
}
UnicodeSet *inputSet = usetNode->fInputSet;
int32_t inputSetRangeCount = inputSet->getRangeCount();
int inputSetRangeIndex = 0;
rlRange = fRangeList;
for (;;) {
if (inputSetRangeIndex >= inputSetRangeCount) {
break;
}
UChar32 inputSetRangeBegin = inputSet->getRangeStart(inputSetRangeIndex);
UChar32 inputSetRangeEnd = inputSet->getRangeEnd(inputSetRangeIndex);
// skip over ranges from the range list that are completely
// below the current range from the input unicode set.
while (rlRange->fEndChar < inputSetRangeBegin) {
rlRange = rlRange->fNext;
}
// If the start of the range from the range list is before with
// the start of the range from the unicode set, split the range list range
// in two, with one part being before (wholly outside of) the unicode set
// and the other containing the rest.
// Then continue the loop; the post-split current range will then be skipped
// over
if (rlRange->fStartChar < inputSetRangeBegin) {
rlRange->split(inputSetRangeBegin, *fStatus);
continue;
}
// Same thing at the end of the ranges...
// If the end of the range from the range list doesn't coincide with
// the end of the range from the unicode set, split the range list
// range in two. The first part of the split range will be
// wholly inside the Unicode set.
if (rlRange->fEndChar > inputSetRangeEnd) {
rlRange->split(inputSetRangeEnd+1, *fStatus);
}
// The current rlRange is now entirely within the UnicodeSet range.
// Add this unicode set to the list of sets for this rlRange
if (rlRange->fIncludesSets->indexOf(usetNode) == -1) {
rlRange->fIncludesSets->addElement(usetNode, *fStatus);
}
// Advance over ranges that we are finished with.
if (inputSetRangeEnd == rlRange->fEndChar) {
inputSetRangeIndex++;
}
rlRange = rlRange->fNext;
}
}
if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "range")) { printRanges();}
//
// Group the above ranges, with each group consisting of one or more
// ranges that are in exactly the same set of original UnicodeSets.
// The groups are numbered, and these group numbers are the set of
// input symbols recognized by the run-time state machine.
//
RangeDescriptor *rlSearchRange;
for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
for (rlSearchRange=fRangeList; rlSearchRange != rlRange; rlSearchRange=rlSearchRange->fNext) {
if (rlRange->fIncludesSets->equals(*rlSearchRange->fIncludesSets)) {
rlRange->fNum = rlSearchRange->fNum;
break;
}
}
if (rlRange->fNum == 0) {
fGroupCount ++;
rlRange->fNum = fGroupCount;
rlRange->setDictionaryFlag();
addValToSets(rlRange->fIncludesSets, fGroupCount);
}
}
if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rgroup")) {printRangeGroups();}
if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "esets")) {printSets();}
//
// Build the Trie table for mapping UChar32 values to the corresponding
// range group number
//
fTrie = utrie_open(NULL, // Pre-existing trie to be filled in
NULL, // Data array (utrie will allocate one)
100000, // Max Data Length
0, // Initial value for all code points
TRUE); // Keep Latin 1 in separately
for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
utrie_setRange32(fTrie, rlRange->fStartChar, rlRange->fEndChar+1, rlRange->fNum, TRUE);
}
}
//-----------------------------------------------------------------------------------
//
// getTrieSize() Return the size that will be required to serialize the Trie.
//
//-----------------------------------------------------------------------------------
int32_t RBBISetBuilder::getTrieSize() {
    // Ask utrie_serialize for the serialized size by passing no buffer and
    // zero capacity, folding with getFoldedRBBIValue and reducing values to
    // 16 bits. The result is remembered in fTrieSize for serializeTrie().
    const int32_t serializedSize =
        utrie_serialize(fTrie, NULL, 0, getFoldedRBBIValue, TRUE, fStatus);
    fTrieSize = serializedSize;
    return fTrieSize;
}
//-----------------------------------------------------------------------------------
//
// serializeTrie() Put the serialized trie at the specified address.
// Trust the caller to have given us enough memory.
// getTrieSize() MUST be called first.
//
//-----------------------------------------------------------------------------------
void RBBISetBuilder::serializeTrie(uint8_t *where) {
    // Write the trie into the caller's buffer. The capacity handed to
    // utrie_serialize is fTrieSize, the value stored by getTrieSize(), and
    // the same folding callback and 16-bit reduction are used as there.
    uint8_t *const destination = where;
    const UBool    reduceTo16Bits = TRUE;

    utrie_serialize(fTrie, destination, fTrieSize,
                    getFoldedRBBIValue, reduceTo16Bits, fStatus);
}
//------------------------------------------------------------------------
//
// addValToSets Add a runtime-mapped input value to each uset from a
// list of uset nodes.
// For each of the original Unicode sets - which correspond
// directly to uset nodes - a logically equivalent expression
// is constructed in terms of the remapped runtime input
// symbol set. This function adds one runtime input symbol to
// a list of sets.
//
// The "logically equivalent expression" is the tree for an
// or-ing together of all of the symbols that go into the set.
//
//------------------------------------------------------------------------
// For every uset node in 'sets', attach a new leafChar node carrying 'val'
// (stored in fVal as an unsigned short). The first value becomes the uset
// node's left child directly; later values are or-ed onto the existing
// subtree through a new opOr node.
//
// If a node cannot be allocated, *fStatus is set to
// U_MEMORY_ALLOCATION_ERROR and the function returns; sets processed before
// the failure keep the value already added to them.
void RBBISetBuilder::addValToSets(UVector *sets, uint32_t val) {
    int32_t ix;

    for (ix=0; ix<sets->size(); ix++) {
        RBBINode *usetNode = (RBBINode *)sets->elementAt(ix);
        RBBINode *leafNode = new RBBINode(RBBINode::leafChar);
        if (leafNode == NULL) {
            *fStatus = U_MEMORY_ALLOCATION_ERROR;
            return;
        }
        leafNode->fVal = (unsigned short)val;
        if (usetNode->fLeftChild == NULL) {
            usetNode->fLeftChild = leafNode;
            leafNode->fParent    = usetNode;
        } else {
            // There are already input symbols present for this set.
            // Set up an OR node, with the previous stuff as the left child
            //   and the new value as the right child.
            RBBINode *orNode = new RBBINode(RBBINode::opOr);
            if (orNode == NULL) {
                // The leaf has not been linked into any tree yet, so it must
                // be released here.
                delete leafNode;
                *fStatus = U_MEMORY_ALLOCATION_ERROR;
                return;
            }
            orNode->fLeftChild  = usetNode->fLeftChild;
            orNode->fRightChild = leafNode;
            orNode->fLeftChild->fParent  = orNode;
            orNode->fRightChild->fParent = orNode;
            usetNode->fLeftChild = orNode;
            orNode->fParent = usetNode;
        }
    }
}
//------------------------------------------------------------------------
//
// getNumCharCategories
//
//------------------------------------------------------------------------
int32_t RBBISetBuilder::getNumCharCategories() {
    // Groups are numbered starting at 1, so the category count is one more
    // than the number of the last group assigned.
    const int32_t categoryCount = fGroupCount + 1;
    return categoryCount;
}
//------------------------------------------------------------------------
//
// printRanges A debugging function.
// dump out all of the range definitions.
//
//------------------------------------------------------------------------
void RBBISetBuilder::printRanges() {
#ifdef RBBI_DEBUG
    RBBIDebugPrintf("\n\n Nonoverlapping Ranges ...\n");

    RangeDescriptor *range = fRangeList;
    while (range != 0) {
        // One output line per range: its number, its bounds, then the name
        // of each set that includes it.
        RBBIDebugPrintf("%2i %4x-%4x ", range->fNum, range->fStartChar, range->fEndChar);

        int setIx;
        for (setIx = 0; setIx < range->fIncludesSets->size(); setIx++) {
            RBBINode *usetNode = (RBBINode *)range->fIncludesSets->elementAt(setIx);

            // A set is named after the variable reference node found two
            // levels up (uset -> setRef -> varRef); otherwise it is "anon".
            RBBINode *setRef = usetNode->fParent;
            RBBINode *varRef = (setRef != NULL) ? setRef->fParent : NULL;
            UnicodeString setName = "anon";
            if (varRef != NULL && varRef->fType == RBBINode::varRef) {
                setName = varRef->fText;
            }

            RBBINode::printUnicodeString(setName);
            RBBIDebugPrintf(" ");
        }
        RBBIDebugPrintf("\n");

        range = range->fNext;
    }
#endif
}
//------------------------------------------------------------------------
//
// printRangeGroups A debugging function.
// dump out all of the range groups.
//
//------------------------------------------------------------------------
void RBBISetBuilder::printRangeGroups() {
RangeDescriptor *rlRange;
RangeDescriptor *tRange;
int i;
int lastPrintedGroupNum = 0;
RBBIDebugPrintf("\nRanges grouped by Unicode Set Membership...\n");
for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
int groupNum = rlRange->fNum & 0xbfff;
if (groupNum > lastPrintedGroupNum) {
lastPrintedGroupNum = groupNum;
RBBIDebugPrintf("%2i ", groupNum);
if (rlRange->fNum & 0x4000) { RBBIDebugPrintf(" <DICT> ");}
for (i=0; i<rlRange->fIncludesSets->size(); i++) {
RBBINode *usetNode = (RBBINode *)rlRange->fIncludesSets->elementAt(i);
UnicodeString setName = "anon";
RBBINode *setRef = usetNode->fParent;
if (setRef != NULL) {
RBBINode *varRef = setRef->fParent;
if (varRef != NULL && varRef->fType == RBBINode::varRef) {
setName = varRef->fText;
}
}
RBBINode::printUnicodeString(setName); RBBIDebugPrintf(" ");
}
i = 0;
for (tRange = rlRange; tRange != 0; tRange = tRange->fNext) {
if (tRange->fNum == rlRange->fNum) {
if (i++ % 5 == 0) {
RBBIDebugPrintf("\n ");
}
RBBIDebugPrintf(" %05x-%05x", tRange->fStartChar, tRange->fEndChar);
}
}
RBBIDebugPrintf("\n");
}
}
RBBIDebugPrintf("\n");
}
//------------------------------------------------------------------------
//
// printSets A debugging function.
// dump out all of the set definitions.
//
//------------------------------------------------------------------------
void RBBISetBuilder::printSets() {
#ifdef RBBI_DEBUG
    RBBIDebugPrintf("\n\nUnicode Sets List\n------------------\n");

    // Walk fUSetNodes until elementAt() hands back NULL.
    int setIx = 0;
    for (;;) {
        RBBINode *usetNode = (RBBINode *)fRB->fUSetNodes->elementAt(setIx);
        if (usetNode == NULL) {
            break;
        }

        RBBIDebugPrintf("%3d ", setIx);

        // Name the set after the variable reference two levels up, if any.
        UnicodeString setName;
        setName = "anonymous";
        RBBINode *setRef = usetNode->fParent;
        RBBINode *varRef = (setRef != NULL) ? setRef->fParent : NULL;
        if (varRef != NULL && varRef->fType == RBBINode::varRef) {
            setName = varRef->fText;
        }

        RBBINode::printUnicodeString(setName);
        RBBIDebugPrintf(" ");
        RBBINode::printUnicodeString(usetNode->fText);
        RBBIDebugPrintf("\n");

        // Dump the subtree hanging off the set node, when there is one.
        if (usetNode->fLeftChild != NULL) {
            usetNode->fLeftChild->printTree();
        }

        setIx++;
    }
    RBBIDebugPrintf("\n");
#endif
}
//-------------------------------------------------------------------------------------
//
// RangeDescriptor copy constructor
//
//-------------------------------------------------------------------------------------
// Copies the bounds and number of 'other', leaves the new descriptor
// unlinked (fNext is NULL), and gives it its own vector holding the same
// set-node pointers as 'other'.
RangeDescriptor::RangeDescriptor(const RangeDescriptor &other, UErrorCode &status)
    : fStartChar(other.fStartChar),
      fEndChar(other.fEndChar),
      fNum(other.fNum),
      fIncludesSets(new UVector(status)),
      fNext(NULL)
{
    /* test for NULL */
    if (fIncludesSets == 0) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    const int32_t setCount = other.fIncludesSets->size();
    for (int32_t ix = 0; ix < setCount; ++ix) {
        fIncludesSets->addElement(other.fIncludesSets->elementAt(ix), status);
    }
}
//-------------------------------------------------------------------------------------
//
// RangeDescriptor default constructor
//
//-------------------------------------------------------------------------------------
// Creates an unlinked descriptor with zeroed bounds and number and an empty
// vector of including sets.
RangeDescriptor::RangeDescriptor(UErrorCode &status)
    : fStartChar(0),
      fEndChar(0),
      fNum(0),
      fIncludesSets(new UVector(status)),
      fNext(NULL)
{
    /* test for NULL */
    if (fIncludesSets == 0) {
        status = U_MEMORY_ALLOCATION_ERROR;
    }
}
//-------------------------------------------------------------------------------------
//
// RangeDescriptor Destructor
//
//-------------------------------------------------------------------------------------
RangeDescriptor::~RangeDescriptor() {
    // Only the vector is deleted here; nothing is done with the set nodes
    // it points to.
    UVector *ownedVector = fIncludesSets;
    fIncludesSets = NULL;
    delete ownedVector;
}
//-------------------------------------------------------------------------------------
//
// RangeDescriptor::split()
//
//-------------------------------------------------------------------------------------
void RangeDescriptor::split(UChar32 where, UErrorCode &status) {
    U_ASSERT(where>fStartChar && where<=fEndChar);

    // The upper half starts life as a copy of this descriptor.
    RangeDescriptor *upperHalf = new RangeDescriptor(*this, status);
    /* test for NULL */
    if (upperHalf == 0) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    // Upper half covers [where, old end] and is spliced in right after this
    // descriptor, which shrinks to [start, where-1].
    upperHalf->fStartChar = where;
    upperHalf->fNext      = this->fNext;
    this->fEndChar        = where-1;
    this->fNext           = upperHalf;
}
//-------------------------------------------------------------------------------------
//
// RangeDescriptor::setDictionaryFlag
//
// Character Category Numbers that include characters from
// the original Unicode Set named "dictionary" have bit 14
// set to 1. The RBBI runtime engine uses this to trigger
// use of the word dictionary.
//
// This function looks through the Unicode Sets that it
// (the range) includes, and sets the bit in fNum when
// "dictionary" is among them.
//
// TODO: a faster way would be to find the set node for
// "dictionary" just once, rather than looking it
// up by name every time.
//
//-------------------------------------------------------------------------------------
void RangeDescriptor::setDictionaryFlag() {
int i;
for (i=0; i<this->fIncludesSets->size(); i++) {
RBBINode *usetNode = (RBBINode *)fIncludesSets->elementAt(i);
UnicodeString setName;
RBBINode *setRef = usetNode->fParent;
if (setRef != NULL) {
RBBINode *varRef = setRef->fParent;
if (varRef != NULL && varRef->fType == RBBINode::varRef) {
setName = varRef->fText;
}
}
if (setName.compare("dictionary") == 0) { // TODO: no string literals.
this->fNum |= 0x4000;
break;
}
}
}
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
--- NEW FILE: rbbisetb.h ---
//
// rbbisetb.h
/*
**********************************************************************
* Copyright (c) 2001, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*/
#ifndef RBBISETB_H
#define RBBISETB_H
#include "unicode/utypes.h"
#include "unicode/uobject.h"
#include "rbbirb.h"
#include "uvector.h"
#include "uhash.h"
struct UNewTrie;
U_NAMESPACE_BEGIN
//
// RBBISetBuilder Derives the character categories used by the runtime RBBI engine
// from the Unicode Sets appearing in the source RBBI rules, and
// creates the TRIE table used to map from Unicode to the
// character categories.
//
//
// RangeDescriptor
//
// Each of the non-overlapping character ranges gets one of these descriptors.
// All of them are strung together in a linked list, which is kept in order
// (by character)
//
// One node of the linked list of non-overlapping character ranges.
// All data members are public and are manipulated directly by
// RBBISetBuilder.
class RangeDescriptor : public UMemory {
public:
UChar32 fStartChar; // Start of range, unicode 32 bit value.
UChar32 fEndChar; // End of range, unicode 32 bit value.
int32_t fNum; // runtime-mapped input value for this range.
UVector *fIncludesSets; // vector of the original
// Unicode sets that include this range.
// (Contains ptrs to uset nodes)
RangeDescriptor *fNext; // Next RangeDescriptor in the linked list.
RangeDescriptor(UErrorCode &status);
RangeDescriptor(const RangeDescriptor &other, UErrorCode &status);
~RangeDescriptor();
void split(UChar32 where, UErrorCode &status); // Split this range in two at "where", with
// where appearing in the second (higher) part.
void setDictionaryFlag(); // Check whether this range appears as part of
// the Unicode set named "dictionary"
private:
// Plain copying is forbidden; use the public two-argument constructor
// that also takes a UErrorCode.
RangeDescriptor(const RangeDescriptor &other); // forbid copying of this class
RangeDescriptor &operator=(const RangeDescriptor &other); // forbid copying of this class
};
//
// RBBISetBuilder Handles processing of Unicode Sets from RBBI rules.
//
// Starting with the rules parse tree from the scanner,
//
// - Enumerate the set of UnicodeSets that are referenced
// by the RBBI rules.
// - compute a derived set of non-overlapping UnicodeSets
// that will correspond to columns in the state table for
// the RBBI execution engine.
// - construct the trie table that maps input characters
// to set numbers in the non-overlapping set of sets.
//
// Builder that turns the UnicodeSets referenced by the rules into numbered
// character categories and a trie mapping code points to those categories.
class RBBISetBuilder : public UMemory {
public:
RBBISetBuilder(RBBIRuleBuilder *rb);
~RBBISetBuilder();
void build();
void addValToSets(UVector *sets, uint32_t val);
int32_t getNumCharCategories(); // CharCategories are the same as input symbol set to the
// runtime state machine, which are the same as
// columns in the DFA state table
int32_t getTrieSize(); // Size in bytes of the serialized Trie.
void serializeTrie(uint8_t *where); // write out the serialized Trie.
// Debugging dumps.
void printSets();
void printRanges();
void printRangeGroups();
private:
// NOTE(review): declared here, but no definition of numberSets() appears
// alongside the other member definitions in rbbisetb.cpp - confirm it is
// still needed.
void numberSets();
RBBIRuleBuilder *fRB; // The RBBI Rule Compiler that owns us.
UErrorCode *fStatus;
RangeDescriptor *fRangeList; // Head of the linked list of RangeDescriptors
UNewTrie *fTrie; // The mapping TRIE that is the end result of processing
uint32_t fTrieSize; // the Unicode Sets.
// Groups correspond to character categories -
// groups of ranges that are in the same original UnicodeSets.
// fGroupCount is the index of the last used group.
// The value is also the number of columns in the RBBI state table being compiled.
// Index 0 is not used. Funny counting.
int32_t fGroupCount;
RBBISetBuilder(const RBBISetBuilder &other); // forbid copying of this class
RBBISetBuilder &operator=(const RBBISetBuilder &other); // forbid copying of this class
};
U_NAMESPACE_END
#endif
--- NEW FILE: rbbistbl.cpp ---
//
// file: rbbistbl.cpp Implementation of the ICU RBBISymbolTable class
//
/*
***************************************************************************
* Copyright (C) 2002-2003 International Business Machines Corporation *
* and others. All rights reserved. *
***************************************************************************
*/
#include "unicode/utypes.h"
#if !UCONFIG_NO_BREAK_ITERATION
#include "unicode/unistr.h"
#include "unicode/uniset.h"
#include "unicode/uchar.h"
#include "unicode/parsepos.h"
#include "umutex.h"
#include "rbbirb.h"
#include "rbbinode.h"
//
// RBBISymbolTableEntry_deleter Used by the UHashTable to delete the contents
// when the hash table is deleted.
//
U_CDECL_BEGIN
static void U_EXPORT2 U_CALLCONV RBBISymbolTableEntry_deleter(void *p) {
    // The hash table hands back its value pointer untyped; restore the
    // entry type so the entry's destructor runs.
    delete static_cast<RBBISymbolTableEntry *>(p);
}
U_CDECL_END
U_NAMESPACE_BEGIN
// Creates an empty symbol table for the given rule text. Nothing is
// allocated if 'status' already indicates failure on entry. If the hash
// table cannot be opened, fHashTable stays NULL and 'status' carries the
// error reported by uhash_open.
RBBISymbolTable::RBBISymbolTable(RBBIRuleScanner *rs, const UnicodeString &rules, UErrorCode &status)
    :fRules(rules), fRuleScanner(rs), ffffString(UChar(0xffff))
{
    fHashTable       = NULL;
    fCachedSetLookup = NULL;

    if (U_FAILURE(status)) {
        return;
    }

    fHashTable = uhash_open(uhash_hashUnicodeString, uhash_compareUnicodeString, &status);
    if (U_FAILURE(status) || fHashTable == NULL) {
        // No table was created, so there is nothing to install the value
        // deleter on.
        return;
    }
    uhash_setValueDeleter(fHashTable, RBBISymbolTableEntry_deleter);
}
// Closes the hash table. The constructor leaves fHashTable NULL when it
// fails, so the table is only closed if it was actually opened.
RBBISymbolTable::~RBBISymbolTable()
{
    if (fHashTable != NULL) {
        uhash_close(fHashTable);
        fHashTable = NULL;
    }
}
//
// RBBISymbolTable::lookup This function from the abstract symbol table interface
// looks up a variable name and returns a UnicodeString
// containing the substitution text.
//
// The variable name does NOT include the leading $.
//
const UnicodeString *RBBISymbolTable::lookup(const UnicodeString& s) const
{
    RBBISymbolTableEntry *entry = (RBBISymbolTableEntry *)uhash_get(fHashTable, &s);
    if (entry == NULL) {
        // Unknown variable name.
        return NULL;
    }

    RBBISymbolTable *self = (RBBISymbolTable *)this; // cast off const

    // The left child of the variable reference node is the root of the
    // expression assigned to the variable.
    RBBINode *exprNode = entry->val->fLeftChild;

    if (exprNode->fType != RBBINode::setRef) {
        // The variable refers to something other than just a set.
        // Hand back the original source string for the expression.
        const UnicodeString *exprText = &exprNode->fText;
        self->fCachedSetLookup = NULL;
        return exprText;
    }

    // The $variable refers to a single UnicodeSet. Remember that set and
    // return ffffString, which lookupMatcher() will subsequently interpret
    // as a stand-in character for it.
    RBBINode *usetNode = exprNode->fLeftChild;
    self->fCachedSetLookup = usetNode->fInputSet;
    return &ffffString;
}
//
// RBBISymbolTable::lookupMatcher This function from the abstract symbol table
// interface maps a single stand-in character to a
// pointer to a Unicode Set. The Unicode Set code uses this
// mechanism to get all references to the same $variable
// name to refer to a single common Unicode Set instance.
//
// This implementation cheats a little, and does not maintain a map of stand-in chars
// to sets. Instead, it takes advantage of the fact that the UnicodeSet
// constructor will always call this function right after calling lookup(),
// and we just need to remember what set to return between these two calls.
const UnicodeFunctor *RBBISymbolTable::lookupMatcher(UChar32 ch) const
{
    // Only the 0xffff stand-in character maps to a set.
    if (ch != 0xffff) {
        return NULL;
    }

    // Return the set remembered by lookup() and forget it, so it is handed
    // out at most once per lookup.
    UnicodeSet *cachedSet = fCachedSetLookup;
    RBBISymbolTable *self = (RBBISymbolTable *)this; // cast off const
    self->fCachedSetLookup = 0;
    return cachedSet;
}
//
// RBBISymbolTable::parseReference This function from the abstract symbol table interface
// looks for a $variable name in the source text.
// It does not look it up, only scans for it.
// It is used by the UnicodeSet parser.
//
// This implementation is lifted pretty much verbatim
// from the rules based transliterator implementation.
// I didn't see an obvious way of sharing it.
//
UnicodeString RBBISymbolTable::parseReference(const UnicodeString& text,
                                              ParsePosition& pos, int32_t limit) const
{
    const int32_t start = pos.getIndex();
    int32_t       end   = start;

    // Extend 'end' over the longest run of name characters: the first must
    // satisfy both u_isIDStart and u_isIDPart, the rest only u_isIDPart.
    for (; end < limit; ++end) {
        const UChar c = text.charAt(end);
        if (end == start && !u_isIDStart(c)) {
            break;
        }
        if (!u_isIDPart(c)) {
            break;
        }
    }

    UnicodeString name;
    if (end != start) {
        // A name was found: advance the parse position past it and copy it.
        pos.setIndex(end);
        text.extractBetween(start, end, name);
    }
    // Otherwise 'name' is still empty, which indicates failure, and 'pos'
    // has not been moved.
    return name;
}
//
// RBBISymbolTable::lookupNode Given a key (a variable name), return the
// corresponding RBBI Node. If there is no entry
// in the table for this name, return NULL.
//
RBBINode *RBBISymbolTable::lookupNode(const UnicodeString &key) const{
    // An absent key yields NULL; otherwise return the node stored in the
    // entry.
    const RBBISymbolTableEntry *entry =
        (RBBISymbolTableEntry *)uhash_get(fHashTable, &key);
    if (entry == NULL) {
        return NULL;
    }
    return entry->val;
}
//
// RBBISymbolTable::addEntry Add a new entry to the symbol table.
// Indicate an error if the name already exists -
// this will only occur in the case of duplicate
// variable assignments.
//
void RBBISymbolTable::addEntry (const UnicodeString &key, RBBINode *val, UErrorCode &err) {
RBBISymbolTableEntry *e;
/* test for buffer overflows */
if (U_FAILURE(err)) {
return;
}
e = (RBBISymbolTableEntry *)uhash_get(fHashTable, &key);
if (e != NULL) {
err = U_BRK_VARIABLE_REDFINITION;
return;
}
e = new RBBISymbolTableEntry;
if (e == NULL) {
err = U_MEMORY_ALLOCATION_ERROR;
return;
}
e->key = key;
e->val = val;
uhash_put( fHashTable, &e->key, e, &err);
}
// A new entry has an empty key and no node attached.
RBBISymbolTableEntry::RBBISymbolTableEntry()
    : UMemory(),
      key(),
      val(NULL)
{
}
RBBISymbolTableEntry::~RBBISymbolTableEntry() {
    // The "val" of a symbol table entry is a variable reference node.
    // The l. child of the val is the rhs expression from the assignment.
    // Unlike other node types, children of variable reference nodes are not
    // automatically recursively deleted. We do it manually here.
    //
    // The constructor initializes val to NULL, so an entry that never had a
    // node attached has nothing to release.
    if (val != NULL) {
        delete val->fLeftChild;
        val->fLeftChild = NULL;
        delete val;
        val = NULL;
    }
    // Note: the key UnicodeString is destructed by virtue of being in the object by value.
}
//
// RBBISymbolTable::print Debugging function, dump out the symbol table contents.
//
void RBBISymbolTable::print() const {
RBBIDebugPrintf("Variable Definitions\n"
"Name Node Val String Val\n"
"----------------------------------------------------------------------\n");
int32_t pos = -1;
const UHashElement *e = NULL;
for (;;) {
e = uhash_nextElement(fHashTable, &pos);
if (e == NULL ) {
break;
}
RBBISymbolTableEntry *s = (RBBISymbolTableEntry *)e->value.pointer;
RBBINode::printUnicodeString(s->key, 15);
RBBIDebugPrintf(" %8p ", (void *)s->val);
RBBINode::printUnicodeString(s->val->fLeftChild->fText);
RBBIDebugPrintf("\n");
}
RBBIDebugPrintf("\nParsed Variable Definitions\n");
pos = -1;
for (;;) {
e = uhash_nextElement(fHashTable, &pos);
if (e == NULL ) {
break;
}
RBBISymbolTableEntry *s = (RBBISymbolTableEntry *)e->value.pointer;
RBBINode::printUnicodeString(s->key);
s->val->fLeftChild->printTree();
RBBIDebugPrintf("\n");
}
}
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
--- NEW FILE: rbbitblb.cpp ---
//
// rbbitblb.cpp
//
/*
**********************************************************************
* Copyright (c) 2002-2003, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*/
#include "unicode/utypes.h"
#if !UCONFIG_NO_BREAK_ITERATION
#include "unicode/unistr.h"
#include "rbbitblb.h"
#include "rbbirb.h"
#include "rbbisetb.h"
#include "rbbidata.h"
#include "cstring.h"
#include "uassert.h"
U_NAMESPACE_BEGIN
// Binds the builder to the rule builder and to the caller's root-node
// pointer, and allocates the vector that will hold the DFA state
// descriptors. If that allocation fails and no earlier error is pending,
// *fStatus is set to U_MEMORY_ALLOCATION_ERROR.
RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode) :
 fTree(*rootNode) {
    fRB      = rb;
    fStatus  = fRB->fStatus;
    fDStates = new UVector(*fStatus);
    if (fDStates == NULL && U_SUCCESS(*fStatus)) {
        *fStatus = U_MEMORY_ALLOCATION_ERROR;
    }
}
// Deletes every state descriptor stored in fDStates, then the vector
// itself. Nothing is done if the vector was never allocated.
RBBITableBuilder::~RBBITableBuilder() {
    if (fDStates == NULL) {
        return;
    }
    int i;
    for (i=0; i<fDStates->size(); i++) {
        delete (RBBIStateDescriptor *)fDStates->elementAt(i);
    }
    delete fDStates;
    fDStates = NULL;
}
//-----------------------------------------------------------------------------
//
// RBBITableBuilder::build - This is the main function for building the DFA state transition
// table from the RBBI rules parse tree.
//
//-----------------------------------------------------------------------------
void RBBITableBuilder::build() {
if (U_FAILURE(*fStatus)) {
return;
}
// If there were no rules, just return. This situation can easily arise
// for the reverse rules.
if (fTree==NULL) {
return;
}
//
// Walk through the tree, replacing any references to $variables with a copy of the
// parse tree for the substition expression.
//
fTree = fTree->flattenVariables();
if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ftree")) {
RBBIDebugPrintf("Parse tree after flattening variable references.\n");
fTree->printTree(TRUE);
}
//
// Add a unique right-end marker to the expression.
// Appears as a cat-node, left child being the original tree,
// right child being the end marker.
//
RBBINode *cn = new RBBINode(RBBINode::opCat);
cn->fLeftChild = fTree;
fTree->fParent = cn;
cn->fRightChild = new RBBINode(RBBINode::endMark);
cn->fRightChild->fParent = cn;
fTree = cn;
//
// Replace all references to UnicodeSets with the tree for the equivalent
// expression.
//
fTree->flattenSets();
if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "stree")) {
RBBIDebugPrintf("Parse tree after flattening Unicode Set references.\n");
fTree->printTree(TRUE);
}
//
// calculate the functions nullable, firstpos, lastpos and followpos on
// nodes in the parse tree.
// See the alogrithm description in Aho.
// Understanding how this works by looking at the code alone will be
// nearly impossible.
//
calcNullable(fTree);
calcFirstPos(fTree);
calcLastPos(fTree);
calcFollowPos(fTree);
if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "pos")) {
RBBIDebugPrintf("\n\n");
printPosSets(fTree);
}
//
// Build the DFA state transition tables.
//
buildStateTable();
flagAcceptingStates();
flagLookAheadStates();
flagTaggedStates();
if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "states")) {printStates();};
}
//-----------------------------------------------------------------------------
//
// calcNullable. Impossible to explain succinctly. See Aho, section 3.9
//
//-----------------------------------------------------------------------------
void RBBITableBuilder::calcNullable(RBBINode *n) {
    if (n == NULL) {
        return;
    }

    // Node types that are settled without looking at any children.
    switch (n->fType) {
    case RBBINode::setRef:
    case RBBINode::endMark:
        // These are non-empty leaf node types.
        n->fNullable = FALSE;
        return;

    case RBBINode::lookAhead:
    case RBBINode::tag:
        // Marker nodes. They are leaves, so no recursion on children.
        // Nullable because they do not match any literal text from the
        // input stream.
        n->fNullable = TRUE;
        return;

    default:
        break;
    }

    // Everything else: compute nullable for the children first, then apply
    // the functions from table 3.40 in Aho.
    calcNullable(n->fLeftChild);
    calcNullable(n->fRightChild);

    switch (n->fType) {
    case RBBINode::opOr:
        n->fNullable = n->fLeftChild->fNullable || n->fRightChild->fNullable;
        break;

    case RBBINode::opCat:
        n->fNullable = n->fLeftChild->fNullable && n->fRightChild->fNullable;
        break;

    case RBBINode::opStar:
    case RBBINode::opQuestion:
        n->fNullable = TRUE;
        break;

    default:
        n->fNullable = FALSE;
        break;
    }
}
//-----------------------------------------------------------------------------
//
// calcFirstPos. Impossible to explain succinctly. See Aho, section 3.9
//
//-----------------------------------------------------------------------------
void RBBITableBuilder::calcFirstPos(RBBINode *n) {
    // Fills in fFirstPosSet (Aho's firstpos) for n and all nodes beneath it.
    if (n == NULL) {
        return;
    }

    const UBool isLeaf = (n->fType == RBBINode::leafChar  ||
                          n->fType == RBBINode::endMark   ||
                          n->fType == RBBINode::lookAhead ||
                          n->fType == RBBINode::tag);
    if (isLeaf) {
        // firstpos of a leaf is the leaf itself.
        n->fFirstPosSet->addElement(n, *fStatus);
        return;
    }

    // Interior node: children are computed before the node itself.
    calcFirstPos(n->fLeftChild);
    calcFirstPos(n->fRightChild);

    // Combine per table 3.40 in Aho.
    UVector *firstPos = n->fFirstPosSet;
    if (n->fType == RBBINode::opOr) {
        setAdd(firstPos, n->fLeftChild->fFirstPosSet);
        setAdd(firstPos, n->fRightChild->fFirstPosSet);
        return;
    }
    if (n->fType == RBBINode::opCat) {
        setAdd(firstPos, n->fLeftChild->fFirstPosSet);
        // The right operand can only start the match when the left one
        // may match nothing.
        if (n->fLeftChild->fNullable) {
            setAdd(firstPos, n->fRightChild->fFirstPosSet);
        }
        return;
    }
    const UBool isUnaryOp = (n->fType == RBBINode::opStar     ||
                             n->fType == RBBINode::opQuestion ||
                             n->fType == RBBINode::opPlus);
    if (isUnaryOp) {
        setAdd(firstPos, n->fLeftChild->fFirstPosSet);
    }
}
//-----------------------------------------------------------------------------
//
// calcLastPos. Impossible to explain succinctly. See Aho, section 3.9
//
//-----------------------------------------------------------------------------
void RBBITableBuilder::calcLastPos(RBBINode *n) {
    // Fills in fLastPosSet (Aho's lastpos) for n and all nodes beneath it.
    if (n == NULL) {
        return;
    }

    const UBool isLeaf = (n->fType == RBBINode::leafChar  ||
                          n->fType == RBBINode::endMark   ||
                          n->fType == RBBINode::lookAhead ||
                          n->fType == RBBINode::tag);
    if (isLeaf) {
        // lastpos of a leaf is the leaf itself.
        n->fLastPosSet->addElement(n, *fStatus);
        return;
    }

    // Interior node: children are computed before the node itself.
    calcLastPos(n->fLeftChild);
    calcLastPos(n->fRightChild);

    // Combine per table 3.40 in Aho.
    UVector *lastPos = n->fLastPosSet;
    if (n->fType == RBBINode::opOr) {
        setAdd(lastPos, n->fLeftChild->fLastPosSet);
        setAdd(lastPos, n->fRightChild->fLastPosSet);
        return;
    }
    if (n->fType == RBBINode::opCat) {
        setAdd(lastPos, n->fRightChild->fLastPosSet);
        // The left operand can only end the match when the right one
        // may match nothing.
        if (n->fRightChild->fNullable) {
            setAdd(lastPos, n->fLeftChild->fLastPosSet);
        }
        return;
    }
    const UBool isUnaryOp = (n->fType == RBBINode::opStar     ||
                             n->fType == RBBINode::opQuestion ||
                             n->fType == RBBINode::opPlus);
    if (isUnaryOp) {
        setAdd(lastPos, n->fLeftChild->fLastPosSet);
    }
}
//-----------------------------------------------------------------------------
//
// calcFollowPos. Impossible to explain succinctly. See Aho, section 3.9
//
//-----------------------------------------------------------------------------
void RBBITableBuilder::calcFollowPos(RBBINode *n) {
    // Accumulates fFollowPos (Aho's followpos) on the position nodes reachable
    // from n.  Relies on firstpos/lastpos already being present on the nodes.
    if (n == NULL) {
        return;
    }
    if (n->fType == RBBINode::leafChar || n->fType == RBBINode::endMark) {
        return;
    }

    calcFollowPos(n->fLeftChild);
    calcFollowPos(n->fRightChild);

    // Aho rule #1: for a cat-node, everything in firstpos(right child)
    // follows each position in lastpos(left child).
    if (n->fType == RBBINode::opCat) {
        UVector *leftLast   = n->fLeftChild->fLastPosSet;
        UVector *rightFirst = n->fRightChild->fFirstPosSet;
        for (int32_t k = 0; k < leftLast->size(); k++) {
            RBBINode *pos = (RBBINode *)leftLast->elementAt(k);
            setAdd(pos->fFollowPos, rightFirst);
        }
    }

    // Aho rule #2: for a star/plus node, everything in firstpos(n)
    // follows each position in lastpos(n).
    if (n->fType == RBBINode::opStar || n->fType == RBBINode::opPlus) {
        for (int32_t k = 0; k < n->fLastPosSet->size(); k++) {
            RBBINode *pos = (RBBINode *)n->fLastPosSet->elementAt(k);
            setAdd(pos->fFollowPos, n->fFirstPosSet);
        }
    }
}
//-----------------------------------------------------------------------------
//
// buildStateTable() Determine the set of runtime DFA states and the
// transition tables for these states, by the algorithm
// of fig. 3.44 in Aho.
//
// Most of the comments are quotes of Aho's pseudo-code.
//
//-----------------------------------------------------------------------------
void RBBITableBuilder::buildStateTable() {
//
// Add a dummy state 0 - the stop state. Not from Aho.
int lastInputSymbol = fRB->fSetBuilder->getNumCharCategories() - 1;
RBBIStateDescriptor *failState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
failState->fPositions = new UVector(*fStatus);
fDStates->addElement(failState, *fStatus);
// initially, the only unmarked state in Dstates is firstpos(root),
// where toot is the root of the syntax tree for (r)#;
RBBIStateDescriptor *initialState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
initialState->fPositions = new UVector(*fStatus);
setAdd(initialState->fPositions, fTree->fFirstPosSet);
fDStates->addElement(initialState, *fStatus);
// while there is an unmarked state T in Dstates do begin
for (;;) {
RBBIStateDescriptor *T = NULL;
int32_t tx;
for (tx=1; tx<fDStates->size(); tx++) {
RBBIStateDescriptor *temp;
temp = (RBBIStateDescriptor *)fDStates->elementAt(tx);
if (temp->fMarked == FALSE) {
T = temp;
break;
}
}
if (T == NULL) {
break;
}
// mark T;
T->fMarked = TRUE;
// for each input symbol a do begin
int32_t a;
for (a = 1; a<=lastInputSymbol; a++) {
// let U be the set of positions that are in followpos(p)
// for some position p in T
// such that the symbol at position p is a;
UVector *U = NULL;
RBBINode *p;
int32_t px;
for (px=0; px<T->fPositions->size(); px++) {
p = (RBBINode *)T->fPositions->elementAt(px);
if ((p->fType == RBBINode::leafChar) && (p->fVal == a)) {
if (U == NULL) {
U = new UVector(*fStatus);
}
setAdd(U, p->fFollowPos);
}
}
// if U is not empty and not in DStates then
int32_t ux = 0;
UBool UinDstates = FALSE;
if (U != NULL) {
U_ASSERT(U->size() > 0);
int ix;
for (ix=0; ix<fDStates->size(); ix++) {
RBBIStateDescriptor *temp2;
temp2 = (RBBIStateDescriptor *)fDStates->elementAt(ix);
if (setEquals(U, temp2->fPositions)) {
delete U;
U = temp2->fPositions;
ux = ix;
UinDstates = TRUE;
break;
}
}
// Add U as an unmarked state to Dstates
if (!UinDstates)
{
RBBIStateDescriptor *newState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
newState->fPositions = U;
fDStates->addElement(newState, *fStatus);
ux = fDStates->size()-1;
}
// Dtran[T, a] := U;
T->fDtran->setElementAt(ux, a);
}
}
}
}
//-----------------------------------------------------------------------------
//
// flagAcceptingStates Identify accepting states.
// First get a list of all of the end marker nodes.
// Then, for each state s,
// if s contains one of the end marker nodes in its list of tree positions then
// s is an accepting state.
//
//-----------------------------------------------------------------------------
void RBBITableBuilder::flagAcceptingStates() {
UVector endMarkerNodes(*fStatus);
RBBINode *endMarker;
int32_t i;
int32_t n;
fTree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);
for (i=0; i<endMarkerNodes.size(); i++) {
endMarker = (RBBINode *)endMarkerNodes.elementAt(i);
for (n=0; n<fDStates->size(); n++) {
RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
if (sd->fPositions->indexOf(endMarker) >= 0) {
// Any non-zero value for fAccepting means this is an accepting node.
// The value is what will be returned to the user as the break status.
// If no other value was specified, force it to -1.
sd->fAccepting = endMarker->fVal;
if (sd->fAccepting == 0) {
sd->fAccepting = -1;
}
// If the end marker node is from a look-ahead rule, set
// the fLookAhead field or this state also.
if (endMarker->fLookAheadEnd) {
sd->fLookAhead = sd->fAccepting;
}
}
}
}
}
//-----------------------------------------------------------------------------
//
// flagLookAheadStates Very similar to flagAcceptingStates, above.
//
//-----------------------------------------------------------------------------
void RBBITableBuilder::flagLookAheadStates() {
UVector lookAheadNodes(*fStatus);
RBBINode *lookAheadNode;
int32_t i;
int32_t n;
fTree->findNodes(&lookAheadNodes, RBBINode::lookAhead, *fStatus);
for (i=0; i<lookAheadNodes.size(); i++) {
lookAheadNode = (RBBINode *)lookAheadNodes.elementAt(i);
for (n=0; n<fDStates->size(); n++) {
RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
if (sd->fPositions->indexOf(lookAheadNode) >= 0) {
sd->fLookAhead = lookAheadNode->fVal;
}
}
}
}
//-----------------------------------------------------------------------------
//
// flagTaggedStates
//
//-----------------------------------------------------------------------------
void RBBITableBuilder::flagTaggedStates() {
UVector tagNodes(*fStatus);
RBBINode *tagNode;
int32_t i;
int32_t n;
fTree->findNodes(&tagNodes, RBBINode::tag, *fStatus);
for (i=0; i<tagNodes.size(); i++) { // For each tag node t (all of 'em)
tagNode = (RBBINode *)tagNodes.elementAt(i);
for (n=0; n<fDStates->size(); n++) { // For each state s (row in the state table)
RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
if (sd->fPositions->indexOf(tagNode) >= 0) { // if s include the tag node t
if (sd->fTagVal < tagNode->fVal) {
// If more than one rule tag applies to this state, the larger
// tag takes precedence.
sd->fTagVal = tagNode->fVal;
}
}
}
}
}
//-----------------------------------------------------------------------------
//
// setAdd Set operation on UVector
// dest = dest union source
// Elements may only appear once. Order is unimportant.
//
//-----------------------------------------------------------------------------
void RBBITableBuilder::setAdd(UVector *dest, UVector *source) {
    // dest = dest UNION source, comparing elements by pointer identity.
    //
    // Only the elements dest held on entry are searched; elements appended
    // during this call are not re-checked.  That is sufficient as long as
    // source itself holds no duplicates.
    const int32_t destCountOnEntry = dest->size();
    const int32_t sourceCount      = source->size();

    for (int32_t srcIx = 0; srcIx < sourceCount; srcIx++) {
        void *candidate = source->elementAt(srcIx);

        UBool alreadyPresent = FALSE;
        for (int32_t destIx = 0; destIx < destCountOnEntry && !alreadyPresent; destIx++) {
            alreadyPresent = (dest->elementAt(destIx) == candidate);
        }

        if (!alreadyPresent) {
            dest->addElement(candidate, *fStatus);
        }
    }
}
//-----------------------------------------------------------------------------
//
// setEquals Set operation on UVector.
// Compare for equality.
// Elements may appear only once.
// Elements may appear in any order.
//
//-----------------------------------------------------------------------------
UBool RBBITableBuilder::setEquals(UVector *a, UVector *b) {
    // Set equality on two UVectors, comparing elements by pointer identity.
    // Both vectors are expected to hold each element at most once; order
    // does not matter.
    const int32_t count = a->size();
    if (count != b->size()) {
        return FALSE;
    }

    // searchStart skips the leading part of b that has already been matched.
    // It only advances when the match is found exactly at searchStart, which
    // makes the comparison cheap when both sets are in the same order.
    int32_t searchStart = 0;

    for (int32_t aIx = 0; aIx < count; aIx++) {
        void  *wanted = a->elementAt(aIx);
        UBool  found  = FALSE;

        for (int32_t bIx = searchStart; bIx < count; bIx++) {
            if (b->elementAt(bIx) == wanted) {
                if (bIx == searchStart) {
                    searchStart++;
                }
                found = TRUE;
                break;
            }
        }

        if (!found) {
            return FALSE;
        }
    }
    return TRUE;
}
//-----------------------------------------------------------------------------
//
// printPosSets Debug function. Dump Nullable, firstpos, lastpos and followpos
// for each node in the tree.
//
//-----------------------------------------------------------------------------
void RBBITableBuilder::printPosSets(RBBINode *n) {
    // Debug-only dump of nullable / firstpos / lastpos / followpos for n and
    // then, recursively, for its left and right subtrees.  Compiles to an
    // empty function unless RBBI_DEBUG is defined.
#ifdef RBBI_DEBUG
    if (n != NULL) {
        n->print();

        const char *nullableText = n->fNullable ? "TRUE" : "FALSE";
        RBBIDebugPrintf(" Nullable: %s\n", nullableText);

        RBBIDebugPrintf(" firstpos: ");
        printSet(n->fFirstPosSet);

        RBBIDebugPrintf(" lastpos: ");
        printSet(n->fLastPosSet);

        RBBIDebugPrintf(" followpos: ");
        printSet(n->fFollowPos);

        printPosSets(n->fLeftChild);
        printPosSets(n->fRightChild);
    }
#endif
}
//-----------------------------------------------------------------------------
//
// getTableSize() Calculate the size of the runtime form of this
// state transition table.
//
//-----------------------------------------------------------------------------
int32_t RBBITableBuilder::getTableSize() {
    // Returns the number of bytes the exported (runtime) state table needs,
    // or 0 when there is no parse tree.
    if (fTree == NULL) {
        return 0;
    }

    // Header only: the 4 subtracted bytes stand for the table-data
    // placeholder that is part of sizeof(RBBIStateTable).
    int32_t totalBytes = sizeof(RBBIStateTable) - 4;

    const int32_t stateCount    = fDStates->size();
    const int32_t categoryCount = fRB->fSetBuilder->getNumCharCategories();

    // RBBIStateTableRow is declared with room for two columns already, so
    // only (categoryCount - 2) extra uint16_t entries are added per row.
    const int32_t bytesPerRow =
        sizeof(RBBIStateTableRow) + sizeof(uint16_t)*(categoryCount-2);

    totalBytes += stateCount * bytesPerRow;
    return totalBytes;
}
//-----------------------------------------------------------------------------
//
// exportTable() export the state transition table in the format required
// by the runtime engine. getTableSize() bytes of memory
// must be available at the output address "where".
//
//-----------------------------------------------------------------------------
void RBBITableBuilder::exportTable(void *where) {
    // Writes the row length, state count and one row per DFA state into the
    // memory at "where", which must provide at least getTableSize() bytes.
    // Does nothing when *fStatus already indicates failure or no tree exists.
    RBBIStateTable *table = (RBBIStateTable *)where;

    if (U_FAILURE(*fStatus)) {
        return;
    }
    if (fTree == NULL) {
        return;
    }

    // State numbers and category counts above 0x7fff are rejected.
    if (fRB->fSetBuilder->getNumCharCategories() > 0x7fff ||
        fDStates->size() > 0x7fff) {
        *fStatus = U_BRK_INTERNAL_ERROR;
        return;
    }

    // The row struct already includes two columns; add the remainder.
    table->fRowLen    = sizeof(RBBIStateTableRow) +
                        sizeof(uint16_t) * (fRB->fSetBuilder->getNumCharCategories() - 2);
    table->fNumStates = fDStates->size();

    for (uint32_t state = 0; state < table->fNumStates; state++) {
        RBBIStateDescriptor *sd  = (RBBIStateDescriptor *)fDStates->elementAt(state);
        RBBIStateTableRow   *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen);

        // The runtime row stores these as 16-bit values.
        U_ASSERT (-32768 < sd->fAccepting && sd->fAccepting <= 32767);
        U_ASSERT (-32768 < sd->fLookAhead && sd->fLookAhead <= 32767);
        row->fAccepting = (int16_t)sd->fAccepting;
        row->fLookAhead = (int16_t)sd->fLookAhead;
        row->fTag       = (int16_t)sd->fTagVal;

        for (int col = 0; col < fRB->fSetBuilder->getNumCharCategories(); col++) {
            row->fNextState[col] = (uint16_t)sd->fDtran->elementAti(col);
        }
    }
}
//-----------------------------------------------------------------------------
//
// printSet Debug function. Print the contents of a UVector
//
//-----------------------------------------------------------------------------
void RBBITableBuilder::printSet(UVector *s) {
    // Debug-only: prints each element of s as a pointer, then a newline.
    // Compiles to an empty function unless RBBI_DEBUG is defined.
#ifdef RBBI_DEBUG
    int32_t ix = 0;
    while (ix < s->size()) {
        RBBIDebugPrintf("%10p", s->elementAt(ix));
        ix++;
    }
    RBBIDebugPrintf("\n");
#endif
}
//-----------------------------------------------------------------------------
//
// printStates Debug Function. Dump the fully constructed state transition table.
//
//-----------------------------------------------------------------------------
void RBBITableBuilder::printStates() {
    // Debug-only dump of the state table: a header with one column per input
    // category, then one line per state showing fAccepting, fLookAhead,
    // fTagVal and the next-state entry for each category.
    // Compiles to an empty function unless RBBI_DEBUG is defined.
#ifdef RBBI_DEBUG
    int category;   // input "character" (category number)
    int stateNum;   // state number

    // Header line 1 and column numbers.
    RBBIDebugPrintf("state | i n p u t s y m b o l s \n");
    RBBIDebugPrintf(" | Acc LA Tag");
    for (category = 0; category < fRB->fSetBuilder->getNumCharCategories(); category++) {
        RBBIDebugPrintf(" %2d", category);
    }
    RBBIDebugPrintf("\n");

    // Separator rule.
    RBBIDebugPrintf(" |---------------");
    for (category = 0; category < fRB->fSetBuilder->getNumCharCategories(); category++) {
        RBBIDebugPrintf("---");
    }
    RBBIDebugPrintf("\n");

    // One row per state.
    for (stateNum = 0; stateNum < fDStates->size(); stateNum++) {
        RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(stateNum);
        RBBIDebugPrintf(" %3d | " , stateNum);
        RBBIDebugPrintf("%3d %3d %5d ", sd->fAccepting, sd->fLookAhead, sd->fTagVal);
        for (category = 0; category < fRB->fSetBuilder->getNumCharCategories(); category++) {
            RBBIDebugPrintf(" %2d", sd->fDtran->elementAti(category));
        }
        RBBIDebugPrintf("\n");
    }
    RBBIDebugPrintf("\n\n");
#endif
}
//-----------------------------------------------------------------------------
//
// RBBIStateDescriptor Methods. This is a very struct-like class
// Most access is directly to the fields.
//
//-----------------------------------------------------------------------------
RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol, UErrorCode *fStatus)
    : fMarked(FALSE),
      fAccepting(0),
      fLookAhead(0),
      fTagVal(0),
      fPositions(NULL),
      fDtran(NULL)
{
    // Creates an unmarked, non-accepting state with no position set.
    // When *fStatus already indicates failure nothing is allocated and
    // fDtran stays NULL.
    if (U_FAILURE(*fStatus)) {
        return;
    }

    const int32_t transitionCount = lastInputSymbol+1;
    fDtran = new UVector(transitionCount, *fStatus);
    if (fDtran == NULL) {
        *fStatus = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    // fDtran is indexed by input symbol and holds the next state number for
    // each symbol, so it has to be pre-sized to cover every symbol.
    fDtran->setSize(transitionCount);
}
RBBIStateDescriptor::~RBBIStateDescriptor() {
    // Releases both vectors held by the descriptor and clears the pointers.
    delete fPositions;
    fPositions = NULL;

    delete fDtran;
    fDtran = NULL;
}
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
--- NEW FILE: rbbitblb.h ---
//
// rbbitblb.h
//
/*
**********************************************************************
* Copyright (c) 2002, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*/
#ifndef RBBITBLB_H
#define RBBITBLB_H
#include "unicode/utypes.h"
#include "unicode/uobject.h"
#include "unicode/rbbi.h"
#include "rbbinode.h"
U_NAMESPACE_BEGIN
class RBBIRuleScanner;
class RBBIRuleBuilder;
//
// class RBBITableBuilder is part of the RBBI rule compiler.
// It builds the state transition table used by the RBBI runtime
// from the expression syntax tree generated by the rule scanner.
//
// This class is part of the RBBI implementation only.
// There is no user-visible public API here.
//
//
// Builds the DFA state table for one parse tree.  The visible implementation
// computes nullable/firstpos/lastpos/followpos on the tree nodes, derives the
// set of DFA states from them, flags accepting / look-ahead / tagged states,
// and can then report the size of, and export, the runtime table.
//
class RBBITableBuilder : public UMemory {
public:
RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode);
~RBBITableBuilder();
// Run the whole table construction for the tree.  Note that build() replaces
// the root with a new cat-node that appends an end marker to the tree.
void build();
int32_t getTableSize(); // Return the runtime size in bytes of
// the built state table
// (0 when there is no parse tree).
void exportTable(void *where); // fill in the runtime state table.
// Sufficient memory must exist at
// the specified location.
// (getTableSize() bytes are required.)
private:
// Tree attribute calculations from Aho, section 3.9.  Each one recurses over
// the subtree rooted at n.
void calcNullable(RBBINode *n);
void calcFirstPos(RBBINode *n);
void calcLastPos(RBBINode *n);
void calcFollowPos(RBBINode *n);
// DFA construction (fig. 3.44 in Aho); fills fDStates.
void buildStateTable();
// Post-passes over fDStates that set fAccepting / fLookAhead / fTagVal on
// states containing end-marker, look-ahead and tag nodes respectively.
void flagAcceptingStates();
void flagLookAheadStates();
void flagTaggedStates();
// Set functions for UVector.
// Elements are compared by pointer identity.
// TODO: make a USet subclass of UVector
void setAdd(UVector *dest, UVector *source);
UBool setEquals(UVector *a, UVector *b);
// Debug output helpers; their bodies are compiled only when RBBI_DEBUG is
// defined.
void printSet(UVector *s);
void printPosSets(RBBINode *n = NULL);
void printStates();
private:
RBBIRuleBuilder *fRB;
RBBINode *&fTree; // The root node of the parse tree to build a
// table for.
// Held by reference: build() assigns a new root
// through it, so the caller's pointer changes too.
UErrorCode *fStatus;
UVector *fDStates; // D states (Aho's terminology)
// Index is state number
// Contents are RBBIStateDescriptor pointers.
// State 0 is the dummy "stop" state.
RBBITableBuilder(const RBBITableBuilder &other); // forbid copying of this class
RBBITableBuilder &operator=(const RBBITableBuilder &other); // forbid copying of this class
};
//
// RBBIStateDescriptor - The DFA is constructed as a set of these descriptors,
// one for each state.
class RBBIStateDescriptor : public UMemory {
public:
UBool fMarked; // TRUE once buildStateTable() has processed
// the transitions out of this state.
int32_t fAccepting; // Non-zero for an accepting state; the value is
// the break status (-1 if none was specified).
int32_t fLookAhead; // Value taken from a look-ahead node, or copied
// from fAccepting for look-ahead end markers.
int32_t fTagVal; // Largest tag-node value that applies to this state.
UVector *fPositions; // Set of parse tree positions associated
// with this state. Unordered (it's a set).
// UVector contents are RBBINode *
// Deleted by the destructor.
UVector *fDtran; // Transitions out of this state.
// indexed by input character
// contents is int index of dest state
// in RBBITableBuilder.fDStates
// Stays NULL if the constructor is entered with a
// failed status.  Deleted by the destructor.
// Initializes all fields and pre-sizes fDtran to maxInputSymbol+1 entries.
// Sets *fStatus to U_MEMORY_ALLOCATION_ERROR if fDtran cannot be allocated.
RBBIStateDescriptor(int maxInputSymbol, UErrorCode *fStatus);
~RBBIStateDescriptor();
private:
RBBIStateDescriptor(const RBBIStateDescriptor &other); // forbid copying of this class
RBBIStateDescriptor &operator=(const RBBIStateDescriptor &other); // forbid copying of this class
};
U_NAMESPACE_END
#endif
--- NEW FILE: sprpimpl.h ---
/*
*******************************************************************************
*
* Copyright (C) 2003, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: sprpimpl.h
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2003feb1
* created by: Ram Viswanadha
*/
#ifndef SPRPIMPL_H
#define SPRPIMPL_H
#include "unicode/utypes.h"
#include "unicode/ustring.h"
#include "unicode/parseerr.h"
#if !UCONFIG_NO_IDNA
/* Flag values held in the low 3 bits of each IDNA trie value. */
enum{
UIDNA_NO_VALUE = 0x0000 ,
UIDNA_UNASSIGNED = 0x0001 ,
UIDNA_PROHIBITED = 0x0002 ,
UIDNA_MAP_NFKC = 0x0003 ,
UIDNA_LABEL_SEPARATOR = 0x0004
};
/*
 * Sentinel for the 2-bit length field of a trie value: when the field holds
 * this value, the real mapping length is read from the mapping table itself.
 */
enum{
_IDNA_LENGTH_IN_MAPPING_TABLE = 0x0003 /*11*/
};
/* indexes[] value names */
enum {
_IDNA_INDEX_TRIE_SIZE, /* number of bytes in the serialized trie */
_IDNA_INDEX_MAPPING_DATA_SIZE, /* The array that contains the mapping */
/* NOTE(review): index 2 has no name here although the array has 3 slots. */
_IDNA_INDEX_TOP=3 /* changing this requires a new formatVersion */
};
enum {
_IDNA_MAPPING_DATA_SIZE = 2000,
/* Index-field value compared against in StringPrep::map() to recognise
 * "map to nothing" entries when NFKC mapping is off. */
_IDNA_MAP_TO_NOTHING = 0x7FF
};
#if defined(XP_CPLUSPLUS)
/**
 * Fills in a UParseError describing an error at position pos of rules.
 *
 * @param rules      The text in which the error was found.
 * @param pos        Offset of the error.  If it equals rulesLen (and the
 *                   text is not empty) it is moved back to the last unit.
 * @param rulesLen   Length of rules, in UChars.
 * @param parseError Receives offset, line (always 0), preContext and
 *                   postContext.  May be NULL, in which case nothing is done.
 *
 * Both context buffers are always NUL-terminated on return.  Each holds at
 * most U_PARSE_CONTEXT_LEN-1 UChars plus the terminator, matching the
 * UChar[U_PARSE_CONTEXT_LEN] arrays declared in unicode/parseerr.h.
 */
static inline
void uprv_syntaxError(const UChar* rules,
                      int32_t pos,
                      int32_t rulesLen,
                      UParseError* parseError)
{
    if(parseError == NULL){
        return;
    }
    if(pos == rulesLen && rulesLen >0){
        pos--;
    }
    parseError->offset = pos;
    parseError->line = 0 ; // we are not using line numbers

    // for pre-context
    // Use '<' (not '<='): with pos == U_PARSE_CONTEXT_LEN the whole prefix
    // would be U_PARSE_CONTEXT_LEN units long and the terminator would be
    // written one element past the end of preContext.
    int32_t start = (pos < U_PARSE_CONTEXT_LEN)? 0 : (pos - (U_PARSE_CONTEXT_LEN-1));
    int32_t stop  = pos;
    u_memcpy(parseError->preContext,rules+start,stop-start);
    //null terminate the buffer
    parseError->preContext[stop-start] = 0;

    //for post-context
    // Starts after the code point at pos, so at most
    // U_PARSE_CONTEXT_LEN-1 units are copied.
    start = pos;
    if(start<rulesLen) {
        U16_FWD_1(rules, start, rulesLen);
    }
    stop = ((pos+U_PARSE_CONTEXT_LEN)<= rulesLen )? (pos+(U_PARSE_CONTEXT_LEN)) :
                                                    rulesLen;
    if(start < stop){
        u_memcpy(parseError->postContext,rules+start,stop-start);
        //null terminate the buffer
        parseError->postContext[stop-start]= 0;
    }else{
        // Nothing follows the error position: leave an empty string rather
        // than whatever the caller's buffer happened to contain.
        parseError->postContext[0] = 0;
    }
}
#endif
/* error codes for prototyping
#define U_IDNA_ERROR_START U_ERROR_LIMIT
#define U_IDNA_PROHIBITED_CODEPOINT_FOUND_ERROR ((UErrorCode)(U_IDNA_ERROR_START + 1))
#define U_IDNA_UNASSIGNED_CODEPOINT_FOUND_ERROR ((UErrorCode)(U_IDNA_ERROR_START + 2))
#define U_IDNA_CHECK_BIDI_ERROR ((UErrorCode)(U_IDNA_ERROR_START + 3))
#define U_IDNA_STD3_ASCII_RULES_ERROR ((UErrorCode)(U_IDNA_ERROR_START + 4))
#define U_IDNA_ACE_PREFIX_ERROR ((UErrorCode)(U_IDNA_ERROR_START + 5))
#define U_IDNA_VERIFICATION_ERROR ((UErrorCode)(U_IDNA_ERROR_START + 6))
#define U_IDNA_LABEL_TOO_LONG_ERROR ((UErrorCode)(U_IDNA_ERROR_START + 8))
*/
#endif /* #if !UCONFIG_NO_IDNA */
#endif
/*
* Hey, Emacs, please set the following:
*
* Local Variables:
* indent-tabs-mode: nil
* End:
*
*/
--- NEW FILE: strprep.cpp ---
/*
*******************************************************************************
*
* Copyright (C) 2003, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: strprep.cpp
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2003feb1
* created by: Ram Viswanadha
*/
#include "unicode/utypes.h"
#if !UCONFIG_NO_IDNA
#include "strprep.h"
#include "utrie.h"
#include "umutex.h"
#include "cmemory.h"
#include "sprpimpl.h"
#include "nameprep.h"
#include "ustr_imp.h"
#include "unicode/unorm.h"
#include "unicode/udata.h"
#include "unicode/ustring.h"
/* Process-wide state set up by loadData() in this file. */
/* Start of the mapping table inside the loaded data (follows the trie). */
static const uint16_t* mappingData = NULL;
/* Copy of the index array found at the start of the loaded data. */
static int32_t indexes[_IDNA_INDEX_TOP]={ 0 };
/* TRUE once loadData() has completed successfully. */
static UBool _isDataLoaded = FALSE;
/* Trie looked up with UTRIE_GET16 to classify a code point. */
static UTrie idnTrie={ 0,0,0,0,0,0,0 };
/* Handle of the opened data item; closed by ustrprep_cleanup(). */
static UDataMemory* idnData=NULL;
/* Result of the last load attempt, replayed by StringPrep::isDataLoaded(). */
static UErrorCode dataErrorCode =U_ZERO_ERROR;
/* file definitions */
/* Name and type passed to udata_openChoice() when loading the data. */
static const char DATA_NAME[] = "uidna";
static const char DATA_TYPE[] = "icu";
/*
 * Releases the loaded IDNA data, if any, and resets the load-state flags so
 * that a later call can load the data again.  Always returns TRUE.
 */
U_CFUNC UBool
ustrprep_cleanup() {
    if(idnData != NULL) {
        udata_close(idnData);
        idnData = NULL;
    }

    /* forget the outcome of any earlier load attempt */
    dataErrorCode = U_ZERO_ERROR;
    _isDataLoaded = FALSE;

    return TRUE;
}
U_CDECL_BEGIN
/*
 * Callback passed to udata_openChoice(): accepts a data item only when its
 * header matches this platform and the expected "IDNA" format, version 2.
 */
static UBool U_CALLCONV
isAcceptable(void * /* context */,
             const char * /* type */,
             const char * /* name */,
             const UDataInfo *pInfo) {
    /* header size, endianness and charset family must match this build */
    if(!(pInfo->size>=20 &&
         pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
         pInfo->charsetFamily==U_CHARSET_FAMILY)) {
        return FALSE;
    }

    /* dataFormat="IDNA" 0x49, 0x44, 0x4e, 0x41 */
    if(!(pInfo->dataFormat[0]==0x49 &&
         pInfo->dataFormat[1]==0x44 &&
         pInfo->dataFormat[2]==0x4e &&
         pInfo->dataFormat[3]==0x41)) {
        return FALSE;
    }

    /* format version 2, built with the trie shifts this code was compiled for */
    if(!(pInfo->formatVersion[0]==2 &&
         pInfo->formatVersion[2]==UTRIE_SHIFT &&
         pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT)) {
        return FALSE;
    }

    return TRUE;
}
/*
 * Folding-offset callback installed on the trie by loadData(): when bit 15
 * of data is set, the low 15 bits are returned; otherwise 0.
 */
static int32_t U_CALLCONV
getFoldingOffset(uint32_t data) {
    const UBool hasOffset = (UBool)((data&0x8000) != 0);
    return hasOffset ? (int32_t)(data&0x7fff) : 0;
}
U_CDECL_END
/*
 * Loads the "uidna" data item and publishes it in the file-level statics
 * (idnData, indexes, idnTrie, mappingData, _isDataLoaded).
 *
 * errorCode: in/out status.  A failed incoming status makes the function
 *            return 0 without doing anything; open/unserialize failures are
 *            reported through it and also remembered in dataErrorCode.
 * Returns the value of _isDataLoaded (TRUE on success, FALSE on failure).
 */
static UBool U_CALLCONV
loadData(UErrorCode &errorCode) {
/* load Unicode IDNA data from file */
UBool isCached;
/* do this because double-checked locking is broken */
/* the flag is read under the global mutex (umtx_lock(NULL)) */
umtx_lock(NULL);
isCached=_isDataLoaded;
umtx_unlock(NULL);
if(!isCached) {
/* local trie, filled in first and copied to idnTrie under the mutex */
UTrie _idnTrie={ 0,0,0,0,0,0,0 };
UDataMemory *data;
const int32_t *p=NULL;
const uint8_t *pb;
if(&errorCode==NULL || U_FAILURE(errorCode)) {
/* note: 0 is returned here, i.e. FALSE */
return 0;
}
/* open the data outside the mutex block */
//TODO: change the path
data=udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, &errorCode);
/* remember the outcome so StringPrep::isDataLoaded() can report it later */
dataErrorCode=errorCode;
if(U_FAILURE(errorCode)) {
return _isDataLoaded=FALSE;
}
/* layout used below: int32_t indexes[_IDNA_INDEX_TOP], then the trie */
p=(const int32_t *)udata_getMemory(data);
pb=(const uint8_t *)(p+_IDNA_INDEX_TOP);
utrie_unserialize(&_idnTrie, pb, p[_IDNA_INDEX_TRIE_SIZE], &errorCode);
_idnTrie.getFoldingOffset=getFoldingOffset;
if(U_FAILURE(errorCode)) {
dataErrorCode=errorCode;
udata_close(data);
return _isDataLoaded=FALSE;
}
/* in the mutex block, set the data for this process */
umtx_lock(NULL);
if(idnData==NULL) {
/* first caller to get here: hand the data over to the statics */
idnData=data;
data=NULL;
uprv_memcpy(&indexes, p, sizeof(indexes));
uprv_memcpy(&idnTrie, &_idnTrie, sizeof(UTrie));
} else {
/* the statics were already set; use that copy of the data instead of ours */
p=(const int32_t *)udata_getMemory(idnData);
}
umtx_unlock(NULL);
/* initialize some variables */
/* the mapping table starts right after the trie */
/* NOTE(review): mappingData and _isDataLoaded are written after the mutex */
/* has been released - confirm this is safe for concurrent callers. */
mappingData=(uint16_t *)((uint8_t *)(p+_IDNA_INDEX_TOP)+indexes[_IDNA_INDEX_TRIE_SIZE]);
_isDataLoaded = TRUE;
/* if a different thread set it first, then close the extra data */
if(data!=NULL) {
udata_close(data); /* NULL if it was set correctly */
}
}
return _isDataLoaded;
}
// *****************************************************************************
// class StringPrep
// *****************************************************************************
U_NAMESPACE_BEGIN
// Definition of the static class-ID member declared in strprep.h.
// NOTE(review): presumably only its address is significant - confirm in strprep.h.
const char StringPrep::fgClassID=0;
UBool StringPrep::isDataLoaded(UErrorCode& status){
    // Makes sure the IDNA data is available.  Returns TRUE on success;
    // otherwise returns FALSE and, unless status was already a failure on
    // entry, copies the load error into status.
    if(U_FAILURE(status)){
        return FALSE;
    }

    // A previous attempt that failed is not retried; its error is replayed.
    const UBool earlierLoadFailed =
        (UBool)(_isDataLoaded==FALSE && U_FAILURE(dataErrorCode));
    if(!earlierLoadFailed){
        loadData(dataErrorCode);
    }

    if(U_FAILURE(dataErrorCode)){
        status = dataErrorCode;
        return FALSE;
    }
    return TRUE;
}
// Creates a StringPrep object after making sure the IDNA data is loaded.
//
// status: in/out error code.  Set to U_MEMORY_ALLOCATION_ERROR when the
//         object cannot be allocated, or to the data loading error.
// Returns the new object (owned by the caller), or NULL on any failure.
StringPrep* StringPrep::createDefaultInstance(UErrorCode& status){
    StringPrep* strprep = new StringPrep();
    if(strprep == NULL){
        // Without this check a failed allocation was returned as NULL
        // together with a success status.
        if(!U_FAILURE(status)){
            status = U_MEMORY_ALLOCATION_ERROR;
        }
        return NULL;
    }
    if(!isDataLoaded(status)){
        delete strprep;
        return NULL;
    }
    return strprep;
}
// Creates a NamePrep object after making sure the IDNA data is loaded.
//
// status: in/out error code.  Passed to the NamePrep constructor; set to
//         U_MEMORY_ALLOCATION_ERROR when the object cannot be allocated, or
//         to the data loading error.
// Returns the new object (owned by the caller), or NULL on any failure.
StringPrep* StringPrep::createNameprepInstance(UErrorCode& status){
    StringPrep* strprep = new NamePrep(status);
    if(strprep == NULL){
        // Without this check a failed allocation was returned as NULL
        // together with a success status.
        if(!U_FAILURE(status)){
            status = U_MEMORY_ALLOCATION_ERROR;
        }
        return NULL;
    }
    if(!isDataLoaded(status)){
        delete strprep;
        return NULL;
    }
    return strprep;
}
// Exception check used by process() for code points flagged as prohibited.
// This implementation grants no exceptions: it returns FALSE for every
// code point.
UBool StringPrep::isNotProhibited(UChar32 /*ch*/){
    const UBool exempt = FALSE;
    return exempt;
}
// Returns TRUE when the trie value stored for ch equals UIDNA_UNASSIGNED.
// NOTE(review): the whole 16-bit value is compared, whereas map() and
// isLabelSeparator() look only at the low 3 flag bits - confirm that
// unassigned code points never carry length/index bits.
UBool StringPrep::isUnassigned(UChar32 ch){
    uint32_t trieValue;
    UTRIE_GET16(&idnTrie,ch,trieValue);
    if(trieValue == UIDNA_UNASSIGNED){
        return TRUE;
    }
    return FALSE;
}
// Splits a trie value into its three packed fields:
//   bits 0..2  -> flag   (one of the UIDNA_* values)
//   bits 3..4  -> length (mapping length, or the "length is in table" marker)
//   bits 5..   -> index  (all remaining high bits; 11 bits for a 16-bit value)
static inline void getValues(uint32_t result, int8_t& flag,
                             int8_t& length, int32_t& index){
    const uint32_t flagBits   = result & 0x07;
    const uint32_t lengthBits = (result >> 3) & 0x03;
    const uint32_t indexBits  = result >> 5;

    flag   = (int8_t) flagBits;
    length = (int8_t) lengthBits;
    index  = (int32_t) indexBits;
}
int32_t StringPrep::map(const UChar* src, int32_t srcLength,
UChar* dest, int32_t destCapacity,
UBool allowUnassigned,
UParseError* parseError,
UErrorCode& status ){
uint32_t result;
int8_t flag;
int8_t length;
int32_t index;
int32_t destIndex=0;
int32_t srcIndex=0;
// check error status
if(U_FAILURE(status)){
return 0;
}
//check arguments
if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) {
status=U_ILLEGAL_ARGUMENT_ERROR;
return 0;
}
if(srcLength == -1){
srcLength = u_strlen(src);
}
for(;srcIndex<srcLength;){
UChar32 ch;
U16_NEXT(src,srcIndex,srcLength,ch);
UTRIE_GET16(&idnTrie,ch,result);
getValues(result,flag,length,index);
// check if the source codepoint is unassigned
if(flag == UIDNA_UNASSIGNED){
if(allowUnassigned == TRUE){
//copy the ch to destination
if(ch <= 0xFFFF){
if(destIndex < destCapacity ){
dest[destIndex] = (UChar)ch;
}
destIndex++;
}else{
if(destIndex+1 < destCapacity ){
dest[destIndex] = U16_LEAD(ch);
dest[destIndex+1] = U16_TRAIL(ch);
}
destIndex +=2;
}
}else{
uprv_syntaxError(src,srcIndex-U16_LENGTH(ch), srcLength,parseError);
status = U_IDNA_UNASSIGNED_CODEPOINT_FOUND_ERROR;
return 0;
}
}else if((flag == UIDNA_MAP_NFKC && doNFKC == TRUE) ||
(index == _IDNA_MAP_TO_NOTHING && doNFKC == FALSE)){
if(length == _IDNA_LENGTH_IN_MAPPING_TABLE){
length = (int8_t) mappingData[index++];
}
for(int8_t i =0; i< length; i++){
if(destIndex < destCapacity ){
dest[destIndex] = mappingData[index+i];
}
destIndex++; /* for pre-flighting */
}
}else{
//copy the source into destination
if(ch <= 0xFFFF){
if(destIndex < destCapacity ){
dest[destIndex] = (UChar)ch;
}
destIndex++;
}else{
if(destIndex+1 < destCapacity ){
dest[destIndex] = U16_LEAD(ch);
dest[destIndex+1] = U16_TRAIL(ch);
}
destIndex +=2;
}
}
}
return u_terminateUChars(dest, destCapacity, destIndex, &status);
}
// Normalization step of string preparation: NFKC with the
// UNORM_UNICODE_3_2 option, delegated to unorm_normalize().
// Returns whatever unorm_normalize() returns.
int32_t StringPrep::normalize(  const UChar* src, int32_t srcLength,
                                UChar* dest, int32_t destCapacity,
                                UErrorCode& status ){
    const int32_t normalizedLength =
        unorm_normalize(src,srcLength,UNORM_NFKC,UNORM_UNICODE_3_2,dest,destCapacity,&status);
    return normalizedLength;
}
/*
1) Map -- For each character in the input, check if it has a mapping
and, if so, replace it with its mapping.
2) Normalize -- Possibly normalize the result of step 1 using Unicode
normalization.
3) Prohibit -- Check for any characters that are not allowed in the
output. If any are found, return an error.
4) Check bidi -- Possibly check for right-to-left characters, and if
any are found, make sure that the whole string satisfies the
requirements for bidirectional strings. If the string does not
satisfy the requirements for bidirectional strings, return an
error.
[Unicode3.2] defines several bidirectional categories; each character
has one bidirectional category assigned to it. For the purposes of
the requirements below, an "RandALCat character" is a character that
has Unicode bidirectional categories "R" or "AL"; an "LCat character"
is a character that has Unicode bidirectional category "L". Note
that there are many characters which fall in neither of the above
definitions; Latin digits (<U+0030> through <U+0039>) are examples of
this because they have bidirectional category "EN".
In any profile that specifies bidirectional character handling, all
three of the following requirements MUST be met:
1) The characters in section 5.8 MUST be prohibited.
2) If a string contains any RandALCat character, the string MUST NOT
contain any LCat character.
3) If a string contains any RandALCat character, a RandALCat
character MUST be the first character of the string, and a
RandALCat character MUST be the last character of the string.
*/
/* Capacity, in UChars, of each on-stack work buffer used by StringPrep::process(). */
#define MAX_STACK_BUFFER_SIZE 300
// Runs the full preparation sequence on src: map, normalize, prohibited
// check and bidi check, then copies the result to dest.
//
// src/srcLength     Input text; srcLength == -1 means NUL-terminated.
// dest/destCapacity Output buffer for the prepared string.
// allowUnassigned   Passed through to map().
// parseError        Receives error context for prohibited / bidi errors.
// status            In/out error code.  Set to
//                   U_IDNA_PROHIBITED_CODEPOINT_FOUND_ERROR or
//                   U_IDNA_CHECK_BIDI_ERROR when a check fails.
// Returns the result of u_terminateUChars() for the prepared length; 0 is
// returned for an already failed status or illegal arguments.
//
// Work buffers start on the stack and move to the heap when too small; all
// paths after the argument checks leave through CLEANUP so that heap
// buffers are always released.
int32_t StringPrep::process(const UChar* src, int32_t srcLength,
                            UChar* dest, int32_t destCapacity,
                            UBool allowUnassigned,
                            UParseError* parseError,
                            UErrorCode& status ){
    // check error status
    if(U_FAILURE(status)){
        return 0;
    }

    //check arguments
    if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) {
        status=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    UChar b1Stack[MAX_STACK_BUFFER_SIZE], b2Stack[MAX_STACK_BUFFER_SIZE];
    UChar *b1 = b1Stack, *b2 = b2Stack;
    int32_t b1Len, b2Len=0,
            b1Capacity = MAX_STACK_BUFFER_SIZE ,
            b2Capacity = MAX_STACK_BUFFER_SIZE;
    uint32_t result;
    int32_t b2Index = 0;
    int8_t flag;
    int8_t length;
    int32_t index;
    UCharDirection direction=U_CHAR_DIRECTION_COUNT, firstCharDir=U_CHAR_DIRECTION_COUNT;
    UBool leftToRight=FALSE, rightToLeft=FALSE;
    int32_t rtlPos =-1, ltrPos =-1;
    UChar32 ch;

    // step 1: map
    b1Len = map(src,srcLength, b1, b1Capacity,allowUnassigned, parseError, status);

    if(status == U_BUFFER_OVERFLOW_ERROR){
        // redo processing of string
        /* we do not have enough room so grow the buffer*/
        b1 = (UChar*) uprv_malloc(b1Len * U_SIZEOF_UCHAR);
        if(b1==NULL){
            status = U_MEMORY_ALLOCATION_ERROR;
            goto CLEANUP;
        }

        status = U_ZERO_ERROR; // reset error

        b1Len = map(src,srcLength, b1, b1Len,allowUnassigned, parseError, status);
    }

    // step 2: normalize
    b2Len = normalize(b1,b1Len, b2,b2Capacity,status);

    if(status == U_BUFFER_OVERFLOW_ERROR){
        // redo processing of string
        /* we do not have enough room so grow the buffer*/
        b2 = (UChar*) uprv_malloc(b2Len * U_SIZEOF_UCHAR);
        if(b2==NULL){
            status = U_MEMORY_ALLOCATION_ERROR;
            goto CLEANUP;
        }

        status = U_ZERO_ERROR; // reset error

        // The source of the retry is the mapped text in b1.  (The freshly
        // allocated b2 holds no data yet, so it must not be used as input.)
        b2Len = normalize(b1,b1Len, b2,b2Len,status);
    }

    if(U_FAILURE(status)){
        goto CLEANUP;
    }

    // steps 3 and 4: scan the normalized text once, checking for prohibited
    // code points and collecting the directionality facts for the bidi rules.
    for(; b2Index<b2Len;){

        ch = 0;

        U16_NEXT(b2, b2Index, b2Len, ch);

        UTRIE_GET16(&idnTrie,ch,result);

        getValues(result,flag,length,index);

        if(flag == UIDNA_PROHIBITED
            && isNotProhibited(ch) == FALSE){
            status = U_IDNA_PROHIBITED_CODEPOINT_FOUND_ERROR;
            // b2Index and b2Len refer to b2, so the context is taken from b2.
            uprv_syntaxError(b2, b2Index-U16_LENGTH(ch), b2Len, parseError);
            goto CLEANUP;
        }

        direction = u_charDirection(ch);
        if(firstCharDir == U_CHAR_DIRECTION_COUNT){
            firstCharDir = direction;
        }
        if(direction == U_LEFT_TO_RIGHT){
            leftToRight = TRUE;
            ltrPos = b2Index-1;
        }
        if(direction == U_RIGHT_TO_LEFT || direction == U_RIGHT_TO_LEFT_ARABIC){
            rightToLeft = TRUE;
            rtlPos = b2Index-1;
        }
    }

    // satisfy 2
    if( leftToRight == TRUE && rightToLeft == TRUE){
        status = U_IDNA_CHECK_BIDI_ERROR;
        uprv_syntaxError(b2,(rtlPos>ltrPos) ? rtlPos : ltrPos, b2Len, parseError);
        goto CLEANUP;
    }

    //satisfy 3
    // "direction" still holds the direction of the last code point here.
    if( rightToLeft == TRUE &&
        !((firstCharDir == U_RIGHT_TO_LEFT || firstCharDir == U_RIGHT_TO_LEFT_ARABIC) &&
          (direction == U_RIGHT_TO_LEFT || direction == U_RIGHT_TO_LEFT_ARABIC))
       ){
        status = U_IDNA_CHECK_BIDI_ERROR;
        uprv_syntaxError(b2, rtlPos, b2Len, parseError);
        // Leave through CLEANUP like every other error path; returning
        // directly would leak b1/b2 when they were moved to the heap.
        goto CLEANUP;
    }

    if(b2Len <= destCapacity){
        uprv_memmove(dest,b2, b2Len*U_SIZEOF_UCHAR);
    }

CLEANUP:
    if(b1!=b1Stack){
        uprv_free(b1);
    }
    if(b2!=b2Stack){
        uprv_free(b2);
    }
    return u_terminateUChars(dest, destCapacity, b2Len, &status);
}
/**
 * Tells whether ch is marked as a label separator in the IDNA trie.
 *
 * @param ch     Code point to look up.
 * @param status In/out error code; also handed to isDataLoaded().
 * @return TRUE if the low three bits of the trie value for ch equal
 *         UIDNA_LABEL_SEPARATOR. FALSE otherwise, including when status
 *         already indicates a failure or isDataLoaded() reports false.
 */
UBool StringPrep::isLabelSeparator(UChar32 ch, UErrorCode& status){
    // isDataLoaded() is only consulted when no earlier failure is pending
    if(U_FAILURE(status) || !isDataLoaded(status)){
        return FALSE;
    }

    int32_t trieValue;
    UTRIE_GET16(&idnTrie,ch, trieValue);

    if( (trieValue & 0x07) != UIDNA_LABEL_SEPARATOR){
        return FALSE;
    }
    return TRUE;
}
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_IDNA */
--- NEW FILE: strprep.h ---
/*
*******************************************************************************
*
* Copyright (C) 2003, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: strprep.h
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2003feb1
* created by: Ram Viswanadha
*/
#ifndef STRPREP_H
#define STRPREP_H
#include "unicode/utypes.h"
#if !UCONFIG_NO_IDNA
#include "unicode/uobject.h"
#include "unicode/uniset.h"
#include "unicode/parseerr.h"
U_NAMESPACE_BEGIN
/**\file
*
* This API implements the RFC 3454 StringPrep standard.
*
* The steps for preparing strings are:
*
* 1) Map -- For each character in the input, check if it has a mapping
* and, if so, replace it with its mapping.
* <ul>
* <li>Delete certain codepoints from the input because their
* presence or absence in the protocol identifiers should not
* make two strings different</li>
* <li>Case Mappings
* <br>If Normalization is turned off
* <br> Get mappings from case map tables
* <br>else
* <br> Get mappings from case map tables for normalization
* <br> Use u_getFC_NFKC_Closure for obtaining extra mappings
* </li>
* </ul>
* 2) Normalize -- Possibly normalize the result of step 1 using Unicode
* normalization NFKC.
*
* 3) Prohibit -- Check for any characters that are not allowed in the
* output. If any are found, return an error.
*
* 4) Check bidi -- Possibly check for right-to-left characters, and if
* any are found, make sure that the whole string satisfies the
* requirements for bidirectional strings. If the string does not
* satisfy the requirements for bidirectional strings, return an
* error.
*
* Some StringPrep profiles:
* IDN: "Nameprep" http://www.ietf.org/rfc/rfc3491.txt
* XMPP Node Identifiers: "Nodeprep" http://www.ietf.org/internet-drafts/draft-ietf-xmpp-nodeprep-01.txt
* XMPP Resource Identifiers: "Resourceprep" http://www.ietf.org/internet-drafts/draft-ietf-xmpp-resourceprep-01.txt
* ANONYMOUS SASL tokens: "plain" http://www.ietf.org/internet-drafts/draft-ietf-sasl-anon-00.txt
* iSCSI http://www.ietf.org/internet-drafts/draft-ietf-ips-iscsi-string-prep-03.txt
*/
/**
 * A StringPrep profile object offering the map, normalize and process
 * operations plus a few helper queries. Instances are obtained through the
 * static create*Instance() factories; copying and comparison are private.
 */
class StringPrep : public UObject{
protected:
UVersionInfo unicodeVersion; /** The Character repertoire version of this profile */
UBool bidiCheck; /** Option to turn BiDi checking on */
UBool doNFKC; /** Option to turn NFKC on */
/**
 * Protected default constructor, for use by subclasses.
 */
StringPrep(){};
public:
/**
 * Destructor
 */
virtual inline ~StringPrep(){};
/**
 * Map every character in the input stream to its mapping character
 * in the mapping table and populate the output stream.
 * For any individual character the mapping table may specify
 * that a character be mapped to nothing, mapped to one
 * other character or to a string of other characters.
 *
 * @param src Pointer to UChar buffer containing a single label
 * @param srcLength Number of characters in the source label
 * @param dest Pointer to the destination buffer to receive the output
 * @param destCapacity The capacity of destination array
 * @param allowUnassigned Unassigned values can be converted to ASCII for query operations
 * If TRUE unassigned values are treated as normal Unicode code point.
 * If FALSE the operation fails with U_UNASSIGNED_CODE_POINT_FOUND error code.
 * @param parseError Pointer to a UParseError struct.
 * NOTE(review): presumably receives the error position on failure -- confirm
 * against the implementation.
 * @param status ICU error code in/out parameter.
 * Must fulfill U_SUCCESS before the function call.
 * @return The number of UChars in the destination buffer
 *
 */
virtual int32_t map(const UChar* src, int32_t srcLength,
UChar* dest, int32_t destCapacity,
UBool allowUnassigned,
UParseError* parseError,
UErrorCode& status );
/**
 * Normalize the input stream using Normalization Form KC (NFKC)
 *
 * @param src Pointer to UChar buffer containing a single label
 * @param srcLength Number of characters in the source label
 * @param dest Pointer to the destination buffer to receive the output
 * @param destCapacity The capacity of destination array
 * @param status ICU error code in/out parameter.
 * Must fulfill U_SUCCESS before the function call.
 * @return The number of UChars in the destination buffer
 *
 *
 */
virtual int32_t normalize( const UChar* src, int32_t srcLength,
UChar* dest, int32_t destCapacity,
UErrorCode& status );
/**
 * Prepare the input stream for use. This operation maps, normalizes (NFKC),
 * and checks for prohibited and BiDi characters in the order defined by RFC 3454
 *
 * @param src Pointer to UChar buffer containing a single label
 * @param srcLength Number of characters in the source label
 * @param dest Pointer to the destination buffer to receive the output
 * @param destCapacity The capacity of destination array
 * @param allowUnassigned Unassigned values can be converted to ASCII for query operations
 * If TRUE unassigned values are treated as normal Unicode code point.
 * If FALSE the operation fails with U_UNASSIGNED_CODE_POINT error code.
 * @param parseError Pointer to a UParseError struct; it is handed to map() and
 * filled in when a prohibited code point or a BiDi violation is reported.
 * @param status ICU error code in/out parameter.
 * Must fulfill U_SUCCESS before the function call.
 * @return The number of UChars in the destination buffer
 *
 *
 */
virtual int32_t process(const UChar* src, int32_t srcLength,
UChar* dest, int32_t destCapacity,
UBool allowUnassigned,
UParseError* parseError,
UErrorCode& status );
/**
 * Create a profile from prebuilt default Nameprep profile conforming to
 * nameprep internet draft (http://www.ietf.org/html.charters/idn-charter.html).
 * This is a built-in/unmodifiable profile.
 *
 * @param status ICU error code in/out parameter.
 * Must fulfill U_SUCCESS before the function call.
 * @return Pointer to StringPrep object that is created. Should be deleted by
 * the caller.
 *
 *
 */
static StringPrep* createNameprepInstance(UErrorCode& status);
/**
 * Create a profile from prebuilt default StringPrep profile conforming to
 * RFC 3454 (ftp://ftp.rfc-editor.org/in-notes/rfc3454.txt).
 * User defined profiles can be created by getting the default profile and
 * adding mappings, removing mappings, turning options ON/OFF and prohibiting
 * characters from the output.
 *
 * @param status ICU error code in/out parameter.
 * Must fulfill U_SUCCESS before the function call.
 * @return Pointer to StringPrep object that is created. Should be deleted by
 * the caller.
 *
 *
 */
static StringPrep* createDefaultInstance(UErrorCode& status);
/**
 * Ascertain if the given code point is a Letter/Digit/Hyphen in the ASCII range
 *
 * @param ch The code point to test
 * @return TRUE if the code point is a Letter/Digit/Hyphen
 *
 *
 */
static inline UBool isLDHChar(UChar32 ch);
/**
 * Ascertain if the given code point is a label separator as specified by IDNA
 *
 * @param ch The code point to test
 * @param status ICU error code in/out parameter.
 * @return TRUE if the code point is a label separator
 *
 *
 */
virtual UBool isLabelSeparator(UChar32 ch, UErrorCode& status);
/**
 * Get the BiDi option of this profile
 *
 * @return The BiDi checking option
 *
 */
inline UBool getCheckBiDi();
/**
 * Get the normalization (NFKC) option of this profile
 *
 * @return The normalization option
 *
 *
 */
inline UBool getNormalization();
/**
 * Get the Unicode version which this profile
 * conforms to
 *
 * @param info Receives a copy of the profile's version fields
 *
 */
inline void getUnicodeVersion(UVersionInfo& info);
private:
// Boiler plate
/**
 * Copy constructor. Declared private; no definition is given here.
 *
 */
StringPrep(const StringPrep&);
/**
 * Assignment operator. Declared private; no definition is given here.
 *
 */
StringPrep& operator=(const StringPrep&);
/**
 * Equality comparison. Private, and this implementation always
 * returns FALSE regardless of the operand.
 *
 * @param other the object to be compared with.
 * @return always FALSE.
 *
 */
UBool operator==(const StringPrep& other) const {return FALSE;};
/**
 * Inequality comparison. Private; the negation of operator==.
 *
 * @param other the object to be compared with.
 * @return the negation of operator==(other).
 *
 */
UBool operator!=(const StringPrep& other) const { return !operator==(other); }
public:
/**
 * ICU "poor man's RTTI", returns a UClassID for this class.
 *
 *
 */
static inline UClassID getStaticClassID();
/**
 * ICU "poor man's RTTI", returns a UClassID for the actual class.
 *
 *
 */
virtual inline UClassID getDynamicClassID() const;
protected:
/**
 * Sub classes that slightly modify the default profile
 * implement this method to remove characters from
 * the prohibited list. The default implementation does not
 * check if the data is loaded or not. The caller is responsible
 * for checking for data.
 *
 */
virtual UBool isNotProhibited(UChar32 ch);
/**
 * Sub classes that slightly modify the default profile
 * implement this method to remove characters from
 * the unassigned list. The default implementation does not
 * check if the data is loaded or not. The caller is responsible
 * for checking for data.
 */
virtual UBool isUnassigned(UChar32 ch);
/**
 * Ascertains if uidna.icu data file is loaded.
 * If data is not loaded, loads the data file.
 *
 *
 */
static UBool isDataLoaded(UErrorCode& status);
private:
/**
 * The address of this static class variable serves as this class's ID
 * for ICU "poor man's RTTI".
 */
static const char fgClassID;
};
/** Returns the value of the bidiCheck option stored in this profile. */
inline UBool StringPrep::getCheckBiDi(){
    return this->bidiCheck;
}
/** Returns the value of the doNFKC option stored in this profile. */
inline UBool StringPrep::getNormalization(){
    return this->doNFKC;
}
/**
 * Copies this profile's unicodeVersion into info, one field at a time.
 *
 * @param info Array that receives the version fields.
 */
inline void StringPrep::getUnicodeVersion(UVersionInfo& info){
    // info is a reference to an array, so sizeof yields the full array size
    const int32_t fieldCount = (int32_t)(sizeof(info)/sizeof(info[0]));
    int32_t field = 0;
    while(field < fieldCount){
        info[field] = unicodeVersion[field];
        field++;
    }
}
/** Returns the address of fgClassID, which serves as the class ID of StringPrep. */
inline UClassID StringPrep::getStaticClassID() {
    const char *classIDAddress = &fgClassID;
    return (UClassID)classIDAddress;
}
/** Returns the same class ID as getStaticClassID(). */
inline UClassID StringPrep::getDynamicClassID() const {
    return StringPrep::getStaticClassID();
}
/**
 * Tests ch against the set
 * [\\u002D \\u0030-\\u0039 \\u0041-\\u005A \\u0061-\\u007A]
 * (hyphen, ASCII digits, ASCII upper- and lower-case letters).
 *
 * @param ch Code point to test.
 * @return TRUE if ch is in the set, FALSE otherwise.
 */
inline UBool StringPrep::isLDHChar(UChar32 ch){
    // Everything above 'z' is rejected by the outer test; the ranges are
    // then tried from the highest to the lowest.
    if(ch <= 0x007A){
        if(0x0061 <= ch){
            return TRUE;    // a-z (the upper bound was checked above)
        }
        if(0x0041 <= ch && ch <= 0x005A){
            return TRUE;    // A-Z
        }
        if(0x0030 <= ch && ch <= 0x0039){
            return TRUE;    // 0-9
        }
        if(ch == 0x002D){
            return TRUE;    // hyphen
        }
    }
    return FALSE;
}
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_IDNA */
#endif
/*
* Hey, Emacs, please set the following:
*
* Local Variables:
* indent-tabs-mode: nil
* End:
*
*/
--- NEW FILE: symtable.h ---
/*
**********************************************************************
* Copyright (c) 2000, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
* 02/04/00 aliu Creation.
**********************************************************************
*/
#ifndef SYMTABLE_H
#define SYMTABLE_H
#include "unicode/utypes.h"
#include "unicode/uobject.h"
U_NAMESPACE_BEGIN
class ParsePosition;
class UnicodeFunctor;
class UnicodeSet;
class UnicodeString;
/**
* An interface that maps strings to objects. This interface defines
* both lookup protocol and parsing. This allows different components
* to share a symbol table and to handle name parsing uniformly. It
* is expected that client parse code look for the SYMBOL_REF
* character and, when seen, attempt to parse the characters after it
* using parseReference().
*
* <p>Currently, RuleBasedTransliterator and UnicodeSet use this
* interface to share variable definitions.
*/
/*
 * Pure interface: apart from the inline destructor every member function
 * is pure virtual and must be supplied by the implementing class.
 */
class SymbolTable /* not : public UObject because this is an interface/mixin class */ {
public:
/**
 * The character preceding a symbol reference name.
 */
enum { SYMBOL_REF = 0x0024 /*$*/ };
/**
 * Destructor.
 */
virtual inline ~SymbolTable() {};
/**
 * Look up the characters associated with this string and return them.
 * Return <tt>NULL</tt> if no such name exists. The resultant
 * string may have length zero.
 * @param s the name to look up
 * @return the associated string, or <tt>NULL</tt> if the name is unknown
 */
virtual const UnicodeString* lookup(const UnicodeString& s) const = 0;
/**
 * Look up the matcher associated with the given character, and
 * return it as a UnicodeFunctor. Return <tt>NULL</tt> if not found.
 * @param ch the character to look up
 * @return the associated functor, or <tt>NULL</tt> if there is none
 */
virtual const UnicodeFunctor* lookupMatcher(UChar32 ch) const = 0;
/**
 * Parse a symbol reference name from the given string, starting
 * at the given position. If no valid symbol reference name is
 * found, return an empty string.
 * @param text the text to parse for the name
 * @param pos on entry, the index of the first character to parse.
 * This is the character following the SYMBOL_REF character. On
 * exit, the index after the last parsed character.
 * @param limit the index after the last character to be parsed.
 * @return the parsed name or an empty string.
 */
virtual UnicodeString parseReference(const UnicodeString& text,
ParsePosition& pos, int32_t limit) const = 0;
};
U_NAMESPACE_END
#endif
--- NEW FILE: uassert.h ---
/*
******************************************************************************
*
* Copyright (C) 2002, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
*
* File uassert.h
*
* Contains U_ASSERT macro
*
* By default, U_ASSERT just wraps the C library assert macro.
* By changing the definition here, the assert behavior for ICU can be changed
* without affecting other non-ICU uses of the C library assert().
*
******************************************************************************
*/
#ifndef U_ASSERT_H
#define U_ASSERT_H
/* utypes.h is included to get the proper define for uint8_t */
#include "unicode/utypes.h"
#include <assert.h>
/* U_ASSERT(exp) forwards directly to the C library assert() macro. */
#define U_ASSERT(exp) assert(exp)
#endif
--- NEW FILE: ubrk.cpp ---
/*
*****************************************************************************************
* Copyright (C) 1996-2001, International Business Machines
* Corporation and others. All Rights Reserved.
*****************************************************************************************
*/
#include "unicode/utypes.h"
#if !UCONFIG_NO_BREAK_ITERATION
#include "unicode/ubrk.h"
#include "unicode/brkiter.h"
#include "unicode/uloc.h"
#include "unicode/ustring.h"
#include "unicode/uchriter.h"
#include "unicode/rbbi.h"
#include "rbbirb.h"
U_NAMESPACE_USE
//----------------------------------------------------------------------------------------
//
// ubrk_open Create a canned type of break iterator based on type (word, line, etc.)
// and locale.
//
//----------------------------------------------------------------------------------------
// Creates one of the canned BreakIterator kinds for the given locale and
// gives it a UCharCharacterIterator over text.
//
// type        selects character, word, line, sentence or title breaking;
//             any other value sets U_ILLEGAL_ARGUMENT_ERROR
// locale      locale name handed to the Locale constructor
// text        text to iterate over
// textLength  length of text, handed to UCharCharacterIterator
// status      in/out error code; must not be NULL
//
// Returns the new iterator, or 0 on failure.
U_CAPI UBreakIterator* U_EXPORT2
ubrk_open(UBreakIteratorType type,
          const char *locale,
          const UChar *text,
          int32_t textLength,
          UErrorCode *status)
{
    // same guard as ubrk_openRules() and ubrk_safeClone()
    if(status == NULL || U_FAILURE(*status)) return 0;

    BreakIterator *result = 0;

    switch(type) {

    case UBRK_CHARACTER:
        result = BreakIterator::createCharacterInstance(Locale(locale), *status);
        break;

    case UBRK_WORD:
        result = BreakIterator::createWordInstance(Locale(locale), *status);
        break;

    case UBRK_LINE:
        result = BreakIterator::createLineInstance(Locale(locale), *status);
        break;

    case UBRK_SENTENCE:
        result = BreakIterator::createSentenceInstance(Locale(locale), *status);
        break;

    case UBRK_TITLE:
        result = BreakIterator::createTitleInstance(Locale(locale), *status);
        break;

    default:
        // an unknown type is a caller error, not an out-of-memory condition
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    // check for allocation error
    if (U_FAILURE(*status)) {
        return 0;
    }
    if(result == 0) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        return 0;
    }

    UCharCharacterIterator *iter = 0;
    iter = new UCharCharacterIterator(text, textLength);
    if(iter == 0) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        delete result;
        return 0;
    }
    // the break iterator takes ownership of iter
    result->adoptText(iter);

    return (UBreakIterator*)result;
}
//----------------------------------------------------------------------------------------
//
// ubrk_openRules open a break iterator from a set of break rules.
// Invokes the rule builder.
//
//----------------------------------------------------------------------------------------
// Builds a rule based break iterator from the given rule text and, if text
// is not NULL, gives it a UCharCharacterIterator over text.
//
// rules, rulesLength  the break rules, wrapped in a UnicodeString
// text, textLength    optional text to iterate over
// parseErr            receives rule parse error details; may be NULL, in
//                     which case a local UParseError is used and discarded
// status              in/out error code; must not be NULL
//
// Returns the new iterator, or 0 on failure.
U_CAPI UBreakIterator* U_EXPORT2
ubrk_openRules(  const UChar        *rules,
                       int32_t       rulesLength,
                 const UChar        *text,
                       int32_t       textLength,
                       UParseError  *parseErr,
                       UErrorCode   *status)  {

    if (status == NULL || U_FAILURE(*status)){
        return 0;
    }

    // The rule builder takes the parse error by reference, so a NULL
    // parseErr must be replaced before it is dereferenced.
    UParseError localParseErr;
    if (parseErr == NULL) {
        parseErr = &localParseErr;
    }

    BreakIterator *result = 0;
    UnicodeString ruleString(rules, rulesLength);
    result = RBBIRuleBuilder::createRuleBasedBreakIterator(ruleString, *parseErr, *status);
    if(U_FAILURE(*status)) {
        return 0;
    }
    if(result == 0) {
        // same handling as ubrk_open(): no iterator without an error code
        *status = U_MEMORY_ALLOCATION_ERROR;
        return 0;
    }

    if (text != NULL) {
        UCharCharacterIterator *iter = 0;
        iter = new UCharCharacterIterator(text, textLength);
        if(iter == 0) {
            *status = U_MEMORY_ALLOCATION_ERROR;
            delete result;
            return 0;
        }
        // the break iterator takes ownership of iter
        result->adoptText(iter);
    }
    return (UBreakIterator *)result;
}
// Clones bi through BreakIterator::createBufferClone().
//
// bi           iterator to clone; NULL sets U_ILLEGAL_ARGUMENT_ERROR
// stackBuffer  buffer handed on to createBufferClone()
// pBufferSize  size of that buffer; NULL sets U_ILLEGAL_ARGUMENT_ERROR
// status       in/out error code; nothing is done if it is NULL or a failure
//
// Returns the clone, or 0 if one of the checks above fails.
U_CAPI UBreakIterator * U_EXPORT2
ubrk_safeClone(
          const UBreakIterator *bi,
          void *stackBuffer,
          int32_t *pBufferSize,
          UErrorCode *status)
{
    if (status == NULL || U_FAILURE(*status)){
        return 0;
    }
    if (bi == NULL || pBufferSize == NULL){
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    BreakIterator *original = (BreakIterator*)bi;
    return (UBreakIterator *)(original->createBufferClone(stackBuffer, *pBufferSize, *status));
}
// Disposes of a break iterator; NULL is accepted and ignored.
// An iterator that reports isBufferClone() is only destructed in place and
// its first 32 bits are overwritten with 0xdeadbeef; any other iterator is
// deleted.
U_CAPI void U_EXPORT2
ubrk_close(UBreakIterator *bi)
{
    BreakIterator *ubi = (BreakIterator*) bi;
    if (ubi == 0) {
        return;
    }
    if (!ubi->isBufferClone()) {
        delete ubi;
        return;
    }
    // buffer clone: run the destructor without releasing the storage
    ubi->~BreakIterator();
    *(uint32_t *)ubi = 0xdeadbeef;
}
// Points the break iterator at new text.
//
// bi          iterator to update; NULL sets U_ILLEGAL_ARGUMENT_ERROR
// text        new text
// textLength  length of text, or -1 to measure it with u_strlen();
//             a NULL text with length -1 is treated as length 0
// status      in/out error code; nothing is done if it is NULL or a failure
//
// If the iterator's current text is a UCharCharacterIterator it is re-used,
// otherwise a new UCharCharacterIterator is created and adopted.
U_CAPI void U_EXPORT2
ubrk_setText(UBreakIterator* bi,
             const UChar*    text,
             int32_t         textLength,
             UErrorCode*     status)
{
    // same guards as ubrk_safeClone()
    if (status == NULL || U_FAILURE(*status)) return;
    if (bi == NULL) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    const CharacterIterator& biText = ((BreakIterator*)bi)->getText();

    int32_t textLen = textLength;
    if (textLength == -1) {
        // never hand a NULL pointer to u_strlen()
        textLen = (text != NULL) ? u_strlen(text) : 0;
    }

    if (biText.getDynamicClassID() == UCharCharacterIterator::getStaticClassID()) {
        // re-use the existing character iterator
        ((UCharCharacterIterator&)biText).setText(text, textLen);
    }
    else {
        UCharCharacterIterator *iter = 0;
        iter = new UCharCharacterIterator(text, textLen);
        if(iter == 0) {
            *status = U_MEMORY_ALLOCATION_ERROR;
            return;
        }
        // the break iterator takes ownership of iter
        ((BreakIterator*)bi)->adoptText(iter);
    }
}
// Forwards to BreakIterator::current() on bi.
U_CAPI int32_t U_EXPORT2
ubrk_current(const UBreakIterator *bi)
{
    BreakIterator *iter = (BreakIterator*)bi;
    return iter->current();
}
// Forwards to BreakIterator::next() on bi.
U_CAPI int32_t U_EXPORT2
ubrk_next(UBreakIterator *bi)
{
    BreakIterator *iter = (BreakIterator*)bi;
    return iter->next();
}
// Forwards to BreakIterator::previous() on bi.
U_CAPI int32_t U_EXPORT2
ubrk_previous(UBreakIterator *bi)
{
    BreakIterator *iter = (BreakIterator*)bi;
    return iter->previous();
}
// Forwards to BreakIterator::first() on bi.
U_CAPI int32_t U_EXPORT2
ubrk_first(UBreakIterator *bi)
{
    BreakIterator *iter = (BreakIterator*)bi;
    return iter->first();
}
// Forwards to BreakIterator::last() on bi.
U_CAPI int32_t U_EXPORT2
ubrk_last(UBreakIterator *bi)
{
    BreakIterator *iter = (BreakIterator*)bi;
    return iter->last();
}
// Forwards to BreakIterator::preceding(offset) on bi.
U_CAPI int32_t U_EXPORT2
ubrk_preceding(UBreakIterator *bi,
               int32_t offset)
{
    BreakIterator *iter = (BreakIterator*)bi;
    return iter->preceding(offset);
}
// Forwards to BreakIterator::following(offset) on bi.
U_CAPI int32_t U_EXPORT2
ubrk_following(UBreakIterator *bi,
               int32_t offset)
{
    BreakIterator *iter = (BreakIterator*)bi;
    return iter->following(offset);
}
// Returns whatever uloc_getAvailable() returns for the same index.
U_CAPI const char* U_EXPORT2
ubrk_getAvailable(int32_t index)
{
    const char *localeName = uloc_getAvailable(index);
    return localeName;
}
// Returns whatever uloc_countAvailable() returns.
U_CAPI int32_t U_EXPORT2
ubrk_countAvailable()
{
    int32_t localeCount = uloc_countAvailable();
    return localeCount;
}
// Forwards to BreakIterator::isBoundary(offset) on bi.
U_CAPI UBool U_EXPORT2
ubrk_isBoundary(UBreakIterator *bi, int32_t offset)
{
    BreakIterator *iter = (BreakIterator *)bi;
    return iter->isBoundary(offset);
}
// Forwards to RuleBasedBreakIterator::getRuleStatus() on bi.
// NOTE(review): bi is cast to RuleBasedBreakIterator without any type
// check -- presumably callers must only pass rule based iterators; confirm.
U_CAPI int32_t U_EXPORT2
ubrk_getRuleStatus(UBreakIterator *bi)
{
    RuleBasedBreakIterator *ruleBased = (RuleBasedBreakIterator *)bi;
    return ruleBased->getRuleStatus();
}
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
--- NEW FILE: ucat.c ---
/*
**********************************************************************
* Copyright (c) 2003, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Author: Alan Liu
* Created: March 19 2003
* Since: ICU 2.6
**********************************************************************
*/
#include "unicode/ucat.h"
#include "unicode/ustring.h"
#include "cstring.h"
#include "uassert.h"
/* Separator between set_num and msg_num */
static const char SEPARATOR = '%';
/* Maximum length of a set_num/msg_num key, incl. terminating zero.
* Longest possible key is "-2147483648%-2147483648" */
#define MAX_KEY_LEN (24)
/**
 * Writes the key "<set_num>SEPARATOR<msg_num>" into buffer, both numbers
 * in base 10, and returns buffer. Numeric values must be >= 0. Buffer
 * must be of length MAX_KEY_LEN or more.
 */
static char*
_catkey(char* buffer, int32_t set_num, int32_t msg_num) {
    /* tail tracks the position just behind the text written so far */
    char* tail = buffer + T_CString_integerToString(buffer, set_num, 10);
    *tail++ = SEPARATOR;
    T_CString_integerToString(tail, msg_num, 10);
    return buffer;
}
/* Opens a catalog: the descriptor is the resource bundle from ures_open(). */
U_CAPI u_nl_catd U_EXPORT2
u_catopen(const char* name, const char* locale, UErrorCode* ec) {
    UResourceBundle* bundle = ures_open(name, locale, ec);
    return (u_nl_catd) bundle;
}
/* Closes a catalog by handing its resource bundle to ures_close(). */
U_CAPI void U_EXPORT2
u_catclose(u_nl_catd catd) {
    UResourceBundle* bundle = (UResourceBundle*) catd; /* may be NULL */
    ures_close(bundle);
}
/**
 * Looks up the message stored under the key built from set_num and msg_num
 * in the catalog's resource bundle.
 *
 * @param catd    catalog descriptor, used as a UResourceBundle
 * @param set_num set number, first part of the key
 * @param msg_num message number, second part of the key
 * @param s       fallback string returned on any failure; may be NULL
 * @param len     if not NULL, receives the length of the returned string
 *                (0 when the NULL fallback is returned)
 * @param ec      in/out error code; if NULL or already a failure, no
 *                lookup is made and s is returned
 * @return the message on success, otherwise s
 */
U_CAPI const UChar* U_EXPORT2
u_catgets(u_nl_catd catd, int32_t set_num, int32_t msg_num,
          const UChar* s,
          int32_t* len, UErrorCode* ec) {

    char key[MAX_KEY_LEN];
    const UChar* result;

    if (ec != NULL && !U_FAILURE(*ec)) {
        result = ures_getStringByKey((const UResourceBundle*) catd,
                                     _catkey(key, set_num, msg_num),
                                     len, ec);
        if (!U_FAILURE(*ec)) {
            return result;
        }
    }

    /* In case of any failure, return s */
    if (len != NULL) {
        /* s may be NULL; do not pass a NULL pointer to u_strlen() */
        *len = (s != NULL) ? u_strlen(s) : 0;
    }
    return s;
}
/*eof*/
--- NEW FILE: ucnv_u16.c ---
/*
**********************************************************************
* Copyright (C) 2002-2003, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* file name: ucnv_u16.c
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2002jul01
* created by: Markus W. Scherer
*
* UTF-16 converter implementation. Used to be in ucnv_utf.c.
*/
#include "unicode/utypes.h"
#include "unicode/ucnv.h"
#include "unicode/ucnv_err.h"
#include "ucnv_bld.h"
#include "ucnv_cnv.h"
#include "cmemory.h"
/* UTF-16 Platform Endian --------------------------------------------------- */
/*
 * Converts platform-endian UTF-16 bytes at pArgs->source into UChars at
 * pArgs->target.
 *
 * Complete 16-bit units are copied with one uprv_memcpy(). A single
 * leftover source byte is kept in cnv->toUnicodeStatus (with 0x100 ORed in
 * so that the stored value is never 0) and is combined with the first byte
 * of the next call.
 *
 * If pArgs->offsets is not NULL, one entry is written per output UChar:
 * -1 for a UChar completed from a carried-over byte, otherwise sourceIndex.
 *
 * *pErrorCode is set to U_BUFFER_OVERFLOW_ERROR when the target has no room
 * (or complete units remain unconverted), and to U_TRUNCATED_CHAR_FOUND
 * when pArgs->flush is set and a UChar remains incomplete.
 *
 * Unless one of the two early returns is taken, source, target and offsets
 * are written back to pArgs.
 */
static void
_UTF16PEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
UErrorCode *pErrorCode) {
UConverter *cnv = pArgs->converter;
const uint8_t *source = (const uint8_t *)pArgs->source;
UChar *target = pArgs->target;
int32_t *offsets = pArgs->offsets;
/* targetCapacity counts UChars, length counts source bytes */
int32_t targetCapacity = pArgs->targetLimit - pArgs->target;
int32_t length = (const uint8_t *)pArgs->sourceLimit - source;
int32_t count;
int32_t sourceIndex = 0;
if(length <= 0 && cnv->toUnicodeStatus == 0) {
/* no input, nothing to do */
return;
}
if(targetCapacity <= 0) {
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
return;
}
/* complete a partial UChar from the last call */
if(length != 0 && cnv->toUnicodeStatus != 0) {
/*
* copy the byte from the last call and the first one here into the target,
* byte-wise to keep the platform endianness
*/
uint8_t *p = (uint8_t *)target++;
*p++ = (uint8_t)cnv->toUnicodeStatus;
cnv->toUnicodeStatus = 0;
*p = *source++;
--length;
--targetCapacity;
if(offsets != NULL) {
*offsets++ = -1;
}
/*
* NOTE(review): sourceIndex stays 0 although one source byte was consumed
* here, so the offsets written below start at 0 -- confirm this is intended.
*/
}
/* copy an even number of bytes for complete UChars */
/* count is a byte count, limited by target room and by available input */
count = 2 * targetCapacity;
if(count > length) {
count = length & ~1;
}
if(count > 0) {
uprv_memcpy(target, source, count);
source += count;
length -= count;
/* from here on count is the number of UChars copied */
count >>= 1;
target += count;
targetCapacity -= count;
if(offsets != NULL) {
while(count > 0) {
*offsets++ = sourceIndex;
sourceIndex += 2;
--count;
}
}
}
/* check for a remaining source byte and store the status */
if(length >= 2) {
/* it must be targetCapacity==0 because otherwise the above would have copied more */
*pErrorCode = U_BUFFER_OVERFLOW_ERROR;
} else if(length == 1) {
if(pArgs->flush) {
/* a UChar remains incomplete */
*pErrorCode = U_TRUNCATED_CHAR_FOUND;
} else {
/* consume the last byte and store it, making sure that it will never set the status to 0 */
cnv->toUnicodeStatus = *source++ | 0x100;
}
} else /* length==0 */ if(cnv->toUnicodeStatus!=0 && pArgs->flush) {
/* a UChar remains incomplete */
*pErrorCode = U_TRUNCATED_CHAR_FOUND;
}
/* write back the updated pointers */
pArgs->source = (const char *)source;
pArgs->target = target;
pArgs->offsets = offsets;
}
/*
 * Converts UChars at pArgs->source into platform-endian UTF-16 bytes at
 * pArgs->target.
 *
 * Complete UChars are copied with one uprv_memcpy(). If only one target
 * byte is left, the first byte of the next UChar is written and the second
 * is kept in cnv->fromUnicodeStatus (with 0x100 ORed in) to be written at
 * the start of the next call.
 *
 * If pArgs->offsets is not NULL, one entry is written per output byte:
 * -1 for a carried-over byte, otherwise the index of the source UChar.
 *
 * *pErrorCode is set to U_BUFFER_OVERFLOW_ERROR when the target has no room
 * or source UChars remain after the copy.
 *
 * Unless one of the two early returns is taken, source, target and offsets
 * are written back to pArgs.
 */
static void
_UTF16PEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
UErrorCode *pErrorCode) {
UConverter *cnv = pArgs->converter;
const UChar *source = pArgs->source;
uint8_t *target = (uint8_t *)pArgs->target;
int32_t *offsets = pArgs->offsets;
/* targetCapacity counts bytes, length counts source UChars */
int32_t targetCapacity = pArgs->targetLimit - pArgs->target;
int32_t length = pArgs->sourceLimit - source;
int32_t count;
int32_t sourceIndex = 0;
if(length <= 0 && cnv->fromUnicodeStatus == 0) {
/* no input, nothing to do */
return;
}
if(targetCapacity <= 0) {
*pErrorCode = U_BUFFER_OVERFLOW_ERROR;
return;
}
/* complete a partial UChar from the last call */
if(cnv->fromUnicodeStatus != 0) {
*target++ = (uint8_t)cnv->fromUnicodeStatus;
cnv->fromUnicodeStatus = 0;
--targetCapacity;
if(offsets != NULL) {
*offsets++ = -1;
}
}
/* copy an even number of bytes for complete UChars */
/* count is a byte count, limited by available input and by target room */
count = 2 * length;
if(count > targetCapacity) {
count = targetCapacity & ~1;
}
if(count>0) {
uprv_memcpy(target, source, count);
target += count;
targetCapacity -= count;
/* from here on count is the number of UChars copied */
count >>= 1;
source += count;
length -= count;
if(offsets != NULL) {
while(count > 0) {
/* two output bytes per UChar, both with the same source index */
*offsets++ = sourceIndex;
*offsets++ = sourceIndex++;
--count;
}
}
}
if(length > 0) {
/* it must be targetCapacity<=1 because otherwise the above would have copied more */
*pErrorCode = U_BUFFER_OVERFLOW_ERROR;
if(targetCapacity > 0) /* targetCapacity==1 */ {
/* copy one byte and keep the other in the status */
const uint8_t *p = (const uint8_t *)source++;
*target++ = *p++;
/* 0x100 keeps the stored status non-zero even for a zero byte */
cnv->fromUnicodeStatus = *p | 0x100;
if(offsets != NULL) {
*offsets++ = sourceIndex;
}
}
}
/* write back the updated pointers */
pArgs->source = source;
pArgs->target = (char *)target;
pArgs->offsets = offsets;
}
/* UTF-16 Opposite Endian --------------------------------------------------- */
/*
* For opposite-endian UTF-16, we keep a byte pointer to the UChars
* and copy two bytes at a time and reverse them.
*/
/*
 * Converts opposite-endian UTF-16 bytes at pArgs->source into UChars at
 * pArgs->target, swapping the two bytes of every unit.
 *
 * A single leftover source byte is kept in cnv->toUnicodeStatus (with 0x100
 * ORed in so that the stored value is never 0) and is combined with the
 * first byte of the next call.
 *
 * If pArgs->offsets is not NULL, one entry is written per output UChar:
 * -1 for a UChar completed from a carried-over byte, otherwise sourceIndex.
 *
 * *pErrorCode is set to U_BUFFER_OVERFLOW_ERROR when the target has no room
 * (or complete units remain unconverted), and to U_TRUNCATED_CHAR_FOUND
 * when pArgs->flush is set and a UChar remains incomplete.
 *
 * Unless one of the two early returns is taken, source, target and offsets
 * are written back to pArgs.
 */
static void
_UTF16OEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
UErrorCode *pErrorCode) {
UConverter *cnv = pArgs->converter;
const uint8_t *source = (const uint8_t *)pArgs->source;
UChar *target = pArgs->target;
uint8_t *target8 = (uint8_t *)target; /* byte pointer to the target */
int32_t *offsets = pArgs->offsets;
/* targetCapacity counts UChars, length counts source bytes */
int32_t targetCapacity = pArgs->targetLimit - pArgs->target;
int32_t length = (const uint8_t *)pArgs->sourceLimit - source;
int32_t count;
int32_t sourceIndex = 0;
if(length <= 0 && cnv->toUnicodeStatus == 0) {
/* no input, nothing to do */
return;
}
if(targetCapacity <= 0) {
*pErrorCode = U_BUFFER_OVERFLOW_ERROR;
return;
}
/* complete a partial UChar from the last call */
if(length != 0 && cnv->toUnicodeStatus != 0) {
/*
* copy the byte from the last call and the first one here into the target,
* byte-wise, reversing the platform endianness
*/
*target8++ = *source++;
*target8++ = (uint8_t)cnv->toUnicodeStatus;
cnv->toUnicodeStatus = 0;
++target;
--length;
--targetCapacity;
if(offsets != NULL) {
*offsets++ = -1;
}
/*
* NOTE(review): sourceIndex stays 0 although one source byte was consumed
* here, so the offsets written below start at 0 -- confirm this is intended.
*/
}
/* copy an even number of bytes for complete UChars */
/* count is a byte count, limited by target room and by available input */
count = 2 * targetCapacity;
if(count > length) {
count = length & ~1;
}
if(count>0) {
length -= count;
/* from here on count is the number of UChars to produce */
count >>= 1;
targetCapacity -= count;
if(offsets == NULL) {
while(count > 0) {
/* the first source byte goes to the second target byte and vice versa */
target8[1] = *source++;
target8[0] = *source++;
target8 += 2;
--count;
}
} else {
while(count>0) {
target8[1] = *source++;
target8[0] = *source++;
target8 += 2;
*offsets++ = sourceIndex;
sourceIndex += 2;
--count;
}
}
/* resynchronize the UChar pointer with the byte pointer */
target=(UChar *)target8;
}
/* check for a remaining source byte and store the status */
if(length >= 2) {
/* it must be targetCapacity==0 because otherwise the above would have copied more */
*pErrorCode = U_BUFFER_OVERFLOW_ERROR;
} else if(length == 1) {
if(pArgs->flush) {
/* a UChar remains incomplete */
*pErrorCode = U_TRUNCATED_CHAR_FOUND;
} else {
/* consume the last byte and store it, making sure that it will never set the status to 0 */
cnv->toUnicodeStatus = *source++ | 0x100;
}
} else /* length==0 */ if(cnv->toUnicodeStatus!=0 && pArgs->flush) {
/* a UChar remains incomplete */
*pErrorCode = U_TRUNCATED_CHAR_FOUND;
}
/* write back the updated pointers */
pArgs->source = (const char *)source;
pArgs->target = target;
pArgs->offsets = offsets;
}
/*
 * Converts UChars at pArgs->source into opposite-endian UTF-16 bytes at
 * pArgs->target, swapping the two bytes of every UChar.
 *
 * If only one target byte is left, the second in-memory byte of the next
 * UChar is written and the first is kept in cnv->fromUnicodeStatus (with
 * 0x100 ORed in) to be written at the start of the next call.
 *
 * If pArgs->offsets is not NULL, one entry is written per output byte:
 * -1 for a carried-over byte, otherwise the index of the source UChar.
 *
 * *pErrorCode is set to U_BUFFER_OVERFLOW_ERROR when the target has no room
 * or source UChars remain after the copy.
 *
 * Unless one of the two early returns is taken, source, target and offsets
 * are written back to pArgs.
 */
static void
_UTF16OEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
UErrorCode *pErrorCode) {
UConverter *cnv = pArgs->converter;
const UChar *source = pArgs->source;
const uint8_t *source8 = (const uint8_t *)source; /* byte pointer to the source */
uint8_t *target = (uint8_t *)pArgs->target;
int32_t *offsets = pArgs->offsets;
/* targetCapacity counts bytes, length counts source UChars */
int32_t targetCapacity = pArgs->targetLimit - pArgs->target;
int32_t length = pArgs->sourceLimit - source;
int32_t count;
int32_t sourceIndex = 0;
if(length <= 0 && cnv->fromUnicodeStatus == 0) {
/* no input, nothing to do */
return;
}
if(targetCapacity <= 0) {
*pErrorCode = U_BUFFER_OVERFLOW_ERROR;
return;
}
/* complete a partial UChar from the last call */
if(cnv->fromUnicodeStatus != 0) {
*target++ = (uint8_t)cnv->fromUnicodeStatus;
cnv->fromUnicodeStatus = 0;
--targetCapacity;
if(offsets != NULL) {
*offsets++ = -1;
}
}
/* copy an even number of bytes for complete UChars */
/* count is a byte count, limited by available input and by target room */
count = 2 * length;
if(count > targetCapacity) {
count = targetCapacity & ~1;
}
if(count > 0) {
targetCapacity -= count;
/* from here on count is the number of UChars to convert */
count >>= 1;
length -= count;
if(offsets == NULL) {
while(count > 0) {
/* the first source byte goes to the second target byte and vice versa */
target[1] = *source8++;
target[0] = *source8++;
target += 2;
--count;
}
} else {
while(count>0) {
target[1] = *source8++;
target[0] = *source8++;
target += 2;
/* two output bytes per UChar, both with the same source index */
*offsets++ = sourceIndex;
*offsets++ = sourceIndex++;
--count;
}
}
/* resynchronize the UChar pointer with the byte pointer */
source=(const UChar *)source8;
}
if(length > 0) {
/* it must be targetCapacity<=1 because otherwise the above would have copied more */
*pErrorCode = U_BUFFER_OVERFLOW_ERROR;
if(targetCapacity > 0) /* targetCapacity==1 */ {
/* copy one byte and keep the other in the status */
/* the second in-memory byte is output first; the first one is deferred */
cnv->fromUnicodeStatus = *source8++ | 0x100;
*target++ = *source8;
++source;
if(offsets != NULL) {
*offsets++ = sourceIndex;
}
}
}
/* write back the updated pointers */
pArgs->source = source;
pArgs->target = (char *)target;
pArgs->offsets = offsets;
}
/* UTF-16BE ----------------------------------------------------------------- */
#if U_IS_BIG_ENDIAN
# define _UTF16BEToUnicodeWithOffsets _UTF16PEToUnicodeWithOffsets
# define _UTF16LEToUnicodeWithOffsets _UTF16OEToUnicodeWithOffsets
# define _UTF16BEFromUnicodeWithOffsets _UTF16PEFromUnicodeWithOffsets
# define _UTF16LEFromUnicodeWithOffsets _UTF16OEFromUnicodeWithOffsets
#else
# define _UTF16BEToUnicodeWithOffsets _UTF16OEToUnicodeWithOffsets
# define _UTF16LEToUnicodeWithOffsets _UTF16PEToUnicodeWithOffsets
# define _UTF16BEFromUnicodeWithOffsets _UTF16OEFromUnicodeWithOffsets
# define _UTF16LEFromUnicodeWithOffsets _UTF16PEFromUnicodeWithOffsets
#endif
/*
 * Read one code point from a UTF-16BE byte stream.
 *
 * args->source is advanced past every byte that is consumed.
 * Errors reported through *err (the return value is 0xffff in each case):
 *   U_INDEX_OUTOFBOUNDS_ERROR  no input bytes are left
 *   U_TRUNCATED_CHAR_FOUND     fewer than two bytes are left for a code unit,
 *                              either for the first unit or after a first
 *                              surrogate
 * A first surrogate that is not followed by a second surrogate is returned
 * on its own; the following unit is not consumed.
 */
static UChar32 T_UConverter_getNextUChar_UTF16_BE(UConverterToUnicodeArgs* args,
                                                   UErrorCode* err)
{
    UChar32 codePoint;
    uint16_t lead;

    /* fewer than two bytes left: nothing at all, or only half a code unit */
    if (args->sourceLimit < args->source + 2) {
        if (args->source < args->sourceLimit) {
            *err = U_TRUNCATED_CHAR_FOUND;
        } else {
            *err = U_INDEX_OUTOFBOUNDS_ERROR;
        }
        return 0xffff;
    }

    /* big-endian: the high-order byte comes first */
    lead = (uint16_t)(((uint16_t)args->source[0] << 8) | (uint8_t)args->source[1]);
    args->source += 2;
    codePoint = lead;

    if (UTF_IS_FIRST_SURROGATE(lead)) {
        uint16_t trail;

        if (args->sourceLimit < args->source + 2) {
            *err = U_TRUNCATED_CHAR_FOUND;
            return 0xffff;
        }

        /* peek at the next unit; only consume it if it completes the pair */
        trail = (uint16_t)(((uint16_t)args->source[0] << 8) | (uint8_t)args->source[1]);
        if (UTF_IS_SECOND_SURROGATE(trail)) {
            codePoint = UTF16_GET_PAIR_VALUE(lead, trail);
            args->source += 2;
        }
    }

    return codePoint;
}
/*
 * Function table for the UTF-16BE converter.
 * The slot order is fixed by the UConverterImpl declaration, which is not
 * visible here; NULL marks slots this converter does not provide.
 * The same ...WithOffsets routine is installed in two adjacent slots for each
 * direction.
 * NOTE(review): presumably the plain and the offsets-tracking entry points
 * share one implementation - confirm against UConverterImpl.
 */
static const UConverterImpl _UTF16BEImpl={
UCNV_UTF16_BigEndian,
NULL,
NULL,
NULL,
NULL,
NULL,
_UTF16BEToUnicodeWithOffsets,
_UTF16BEToUnicodeWithOffsets,
_UTF16BEFromUnicodeWithOffsets,
_UTF16BEFromUnicodeWithOffsets,
T_UConverter_getNextUChar_UTF16_BE,
NULL,
NULL,
NULL,
NULL,
ucnv_getCompleteUnicodeSet
};
/* The 1200 CCSID refers to any version of Unicode with any endianess of UTF-16 */
/*
 * Static description of UTF-16BE: name, CCSID 1200, 2 bytes minimum and
 * maximum per UChar, and the substitution bytes FF FD (2 bytes long), i.e.
 * U+FFFD written high byte first.
 */
static const UConverterStaticData _UTF16BEStaticData={
sizeof(UConverterStaticData),
"UTF-16BE",
1200, UCNV_IBM, UCNV_UTF16_BigEndian, 2, 2,
{ 0xff, 0xfd, 0, 0 },2,FALSE,FALSE,
0,
0,
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
/* Shared-data record tying the static description to the function table. */
const UConverterSharedData _UTF16BEData={
sizeof(UConverterSharedData), ~((uint32_t) 0),
NULL, NULL, &_UTF16BEStaticData, FALSE, &_UTF16BEImpl,
0
};
/* UTF-16LE ----------------------------------------------------------------- */
/*
 * Read one code point from a UTF-16LE byte stream.
 *
 * args->source is advanced past every byte that is consumed.
 * Errors reported through *err (the return value is 0xffff in each case):
 *   U_INDEX_OUTOFBOUNDS_ERROR  no input bytes are left
 *   U_TRUNCATED_CHAR_FOUND     fewer than two bytes are left for a code unit,
 *                              either for the first unit or after a first
 *                              surrogate
 * A first surrogate that is not followed by a second surrogate is returned
 * on its own; the following unit is not consumed.
 */
static UChar32 T_UConverter_getNextUChar_UTF16_LE(UConverterToUnicodeArgs* args,
                                                   UErrorCode* err)
{
    UChar32 codePoint;
    uint16_t lead;

    /* fewer than two bytes left: nothing at all, or only half a code unit */
    if (args->sourceLimit < args->source + 2) {
        if (args->source < args->sourceLimit) {
            *err = U_TRUNCATED_CHAR_FOUND;
        } else {
            *err = U_INDEX_OUTOFBOUNDS_ERROR;
        }
        return 0xffff;
    }

    /* little-endian: the low-order byte comes first */
    lead = (uint16_t)(((uint16_t)args->source[1] << 8) | (uint8_t)args->source[0]);
    args->source += 2;
    codePoint = lead;

    if (UTF_IS_FIRST_SURROGATE(lead)) {
        uint16_t trail;

        if (args->sourceLimit < args->source + 2) {
            *err = U_TRUNCATED_CHAR_FOUND;
            return 0xffff;
        }

        /* peek at the next unit; only consume it if it completes the pair */
        trail = (uint16_t)(((uint16_t)args->source[1] << 8) | (uint8_t)args->source[0]);
        if (UTF_IS_SECOND_SURROGATE(trail)) {
            codePoint = UTF16_GET_PAIR_VALUE(lead, trail);
            args->source += 2;
        }
    }

    return codePoint;
}
/*
 * Function table for the UTF-16LE converter.
 * The slot order is fixed by the UConverterImpl declaration, which is not
 * visible here; NULL marks slots this converter does not provide.
 * The same ...WithOffsets routine is installed in two adjacent slots for each
 * direction.
 * NOTE(review): presumably the plain and the offsets-tracking entry points
 * share one implementation - confirm against UConverterImpl.
 */
static const UConverterImpl _UTF16LEImpl={
UCNV_UTF16_LittleEndian,
NULL,
NULL,
NULL,
NULL,
NULL,
_UTF16LEToUnicodeWithOffsets,
_UTF16LEToUnicodeWithOffsets,
_UTF16LEFromUnicodeWithOffsets,
_UTF16LEFromUnicodeWithOffsets,
T_UConverter_getNextUChar_UTF16_LE,
NULL,
NULL,
NULL,
NULL,
ucnv_getCompleteUnicodeSet
};
/*
 * Static description of UTF-16LE: name, CCSID, 2 bytes minimum and maximum
 * per UChar, and the substitution bytes FD FF (2 bytes long), i.e. U+FFFD
 * written low byte first.
 * Note that this table registers CCSID 1202, not 1200 as used by the
 * UTF-16BE table.
 */
static const UConverterStaticData _UTF16LEStaticData={
sizeof(UConverterStaticData),
"UTF-16LE",
1202, UCNV_IBM, UCNV_UTF16_LittleEndian, 2, 2,
{ 0xfd, 0xff, 0, 0 },2,FALSE,FALSE,
0,
0,
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
/* Shared-data record tying the static description to the function table. */
const UConverterSharedData _UTF16LEData={
sizeof(UConverterSharedData), ~((uint32_t) 0),
NULL, NULL, &_UTF16LEStaticData, FALSE, &_UTF16LEImpl,
0
};
/* UTF-16 (Detect BOM) ------------------------------------------------------ */
/*
* Detect a BOM at the beginning of the stream and select UTF-16BE or UTF-16LE
* accordingly.
* This is a simpler version of the UTF-32 converter below, with
* fewer states for shorter BOMs.
*
* State values:
* 0 initial state
* 1 saw FE
* 2..4 -
* 5 saw FF
* 6..7 -
* 8 UTF-16BE mode
* 9 UTF-16LE mode
*
* During detection: state&3==number of matching bytes so far.
*
* On output, emit U+FEFF as the first code point.
*/
/*
 * Reset the BOM-detecting UTF-16 converter.
 *
 * Unless only the from-Unicode side is reset, the to-Unicode detection state
 * (cnv->mode) returns to 0, the initial state.
 * Unless only the to-Unicode side is reset, the two bytes of the BOM in
 * platform byte order are placed into cnv->charErrorBuffer so that they are
 * pending as output.
 */
static void
_UTF16Reset(UConverter *cnv, UConverterResetChoice choice) {
    const UBool resetToUnicode=(UBool)(choice<=UCNV_RESET_TO_UNICODE);
    const UBool resetFromUnicode=(UBool)(choice!=UCNV_RESET_TO_UNICODE);

    if(resetFromUnicode) {
        /* queue the UTF-16PE BOM: FE FF on big-endian, FF FE otherwise */
#if U_IS_BIG_ENDIAN
        cnv->charErrorBuffer[0]=0xfe;
        cnv->charErrorBuffer[1]=0xff;
#else
        cnv->charErrorBuffer[0]=0xff;
        cnv->charErrorBuffer[1]=0xfe;
#endif
        cnv->charErrorBufferLength=2;
    }

    if(resetToUnicode) {
        /* start BOM detection from scratch */
        cnv->mode=0;
    }
}
/*
 * Open callback for the BOM-detecting UTF-16 converter.
 * Opening only needs a full reset of both conversion directions; name,
 * locale, options and pErrorCode are not used.
 */
static void
_UTF16Open(UConverter *cnv,
           const char *name,
           const char *locale,
           uint32_t options,
           UErrorCode *pErrorCode) {
    /* the parameters below are intentionally unused */
    (void)name;
    (void)locale;
    (void)options;
    (void)pErrorCode;

    _UTF16Reset(cnv, UCNV_RESET_BOTH);
}
/*
 * The two possible BOM byte sequences, each padded to 4 bytes:
 * index 0..1 holds FE FF (big-endian), index 4..5 holds FF FE (little-endian).
 * The detection states 1 and 5 index directly into this array for the
 * expected second byte, and (state&4) selects the start of the matching BOM.
 */
static const char utf16BOM[8]={ (char)0xfe, (char)0xff, 0, 0, (char)0xff, (char)0xfe, 0, 0 };
/*
 * To-Unicode conversion for the BOM-detecting UTF-16 converter.
 *
 * Runs a small state machine (state kept in cnv->mode between calls) over the
 * first bytes of the stream to look for FE FF or FF FE, then delegates the
 * actual conversion to the UTF-16BE (state 8) or UTF-16LE (state 9) routine.
 * Without a BOM the input is treated as UTF-16BE.
 *
 * @param pArgs       conversion arguments; source, target and offsets are
 *                    advanced by the delegated routines, source is written
 *                    back by this function
 * @param pErrorCode  in/out error code; the detection loop stops as soon as
 *                    it indicates a failure
 */
static void
_UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
UErrorCode *pErrorCode) {
UConverter *cnv=pArgs->converter;
const char *source=pArgs->source;
const char *sourceLimit=pArgs->sourceLimit;
/* remember where the offsets for this call start, for the fix-up below */
int32_t *offsets=pArgs->offsets;
int32_t state, offsetDelta;
char b;
state=cnv->mode;
/*
* If we detect a BOM in this buffer, then we must add the BOM size to the
* offsets because the actual converter function will not see and count the BOM.
* offsetDelta will have the number of the BOM bytes that are in the current buffer.
*/
offsetDelta=0;
while(source<sourceLimit && U_SUCCESS(*pErrorCode)) {
switch(state) {
case 0:
b=*source;
if(b==(char)0xfe) {
state=1; /* could be FE FF */
} else if(b==(char)0xff) {
state=5; /* could be FF FE */
} else {
state=8; /* default to UTF-16BE */
/* the byte is not consumed here; case 8 converts it */
continue;
}
++source;
break;
case 1:
case 5:
/* utf16BOM[1] is FF and utf16BOM[5] is FE: the expected second BOM byte */
if(*source==utf16BOM[state]) {
++source;
if(state==1) {
state=8; /* detect UTF-16BE */
offsetDelta=source-pArgs->source;
} else if(state==5) {
state=9; /* detect UTF-16LE */
offsetDelta=source-pArgs->source;
}
} else {
/* switch to UTF-16BE and pass the previous bytes */
if(source!=pArgs->source) {
/* just reset the source */
source=pArgs->source;
} else {
UBool oldFlush=pArgs->flush;
/* the first byte is from a previous buffer, replay it first */
pArgs->source=utf16BOM+(state&4); /* select the correct BOM */
pArgs->sourceLimit=pArgs->source+1; /* replay previous byte */
pArgs->flush=FALSE; /* this sourceLimit is not the real source stream limit */
_UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode);
/* restore real pointers; pArgs->source will be set in case 8/9 */
pArgs->sourceLimit=sourceLimit;
pArgs->flush=oldFlush;
}
state=8;
continue;
}
break;
case 8:
/* call UTF-16BE */
pArgs->source=source;
_UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode);
source=pArgs->source;
break;
case 9:
/* call UTF-16LE */
pArgs->source=source;
_UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode);
source=pArgs->source;
break;
default:
break; /* does not occur */
}
}
/* add BOM size to offsets - see comment at offsetDelta declaration */
if(offsets!=NULL && offsetDelta!=0) {
/* pArgs->offsets now points past the last offset written during this call */
int32_t *offsetsLimit=pArgs->offsets;
while(offsets<offsetsLimit) {
*offsets++ += offsetDelta;
}
}
pArgs->source=source;
if(source==sourceLimit && pArgs->flush) {
/* handle truncated input */
switch(state) {
case 0:
break; /* no input at all, nothing to do */
case 8:
_UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode);
break;
case 9:
_UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode);
break;
default:
/* handle 0<state<8: call UTF-16BE with too-short input */
pArgs->source=utf16BOM+(state&4); /* select the correct BOM */
pArgs->sourceLimit=pArgs->source+(state&3); /* replay bytes */
/* no offsets: not enough for output */
_UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode);
pArgs->source=source;
pArgs->sourceLimit=sourceLimit;
break;
}
cnv->mode=0; /* reset */
} else {
/* more input may follow: keep the detection state for the next call */
cnv->mode=state;
}
}
/*
 * getNextUChar for the BOM-detecting UTF-16 converter.
 * Once the byte order is known (mode 8 = UTF-16BE, mode 9 = UTF-16LE) the
 * matching single-code-point reader is used directly; while detection is
 * still in progress the generic helper drives _UTF16ToUnicodeWithOffsets.
 */
static UChar32
_UTF16GetNextUChar(UConverterToUnicodeArgs *pArgs,
                   UErrorCode *pErrorCode) {
    if(pArgs->converter->mode==8) {
        return T_UConverter_getNextUChar_UTF16_BE(pArgs, pErrorCode);
    } else if(pArgs->converter->mode==9) {
        return T_UConverter_getNextUChar_UTF16_LE(pArgs, pErrorCode);
    } else {
        /* byte order not decided yet */
        return ucnv_getNextUCharFromToUImpl(pArgs, _UTF16ToUnicodeWithOffsets, TRUE, pErrorCode);
    }
}
/*
 * Function table for the BOM-detecting UTF-16 converter.
 * It installs _UTF16Open and _UTF16Reset, the BOM-detecting to-Unicode
 * routine, and the _UTF16PE... routine for from-Unicode output.
 * The slot order is fixed by the UConverterImpl declaration, which is not
 * visible here; NULL marks slots this converter does not provide.
 */
static const UConverterImpl _UTF16Impl = {
UCNV_UTF16,
NULL,
NULL,
_UTF16Open,
NULL,
_UTF16Reset,
_UTF16ToUnicodeWithOffsets,
_UTF16ToUnicodeWithOffsets,
_UTF16PEFromUnicodeWithOffsets,
_UTF16PEFromUnicodeWithOffsets,
_UTF16GetNextUChar,
NULL, /* ### TODO implement getStarters for all Unicode encodings?! */
NULL,
NULL,
NULL,
ucnv_getCompleteUnicodeSet
};
/*
 * Static description of "UTF-16": CCSID 0 (see TODO), 2 bytes minimum and
 * maximum per UChar. The 2-byte substitution sequence is FF FD when
 * U_IS_BIG_ENDIAN is true and FD FF otherwise, i.e. it follows the same byte
 * order as the BOM that _UTF16Reset queues.
 */
static const UConverterStaticData _UTF16StaticData = {
sizeof(UConverterStaticData),
"UTF-16",
0, /* ### TODO review correctness of all Unicode CCSIDs */
UCNV_IBM, UCNV_UTF16, 2, 2,
#if U_IS_BIG_ENDIAN
{ 0xff, 0xfd, 0, 0 }, 2,
#else
{ 0xfd, 0xff, 0, 0 }, 2,
#endif
FALSE, FALSE,
0,
0,
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
/* Shared-data record tying the static description to the function table. */
const UConverterSharedData _UTF16Data = {
sizeof(UConverterSharedData), ~((uint32_t) 0),
NULL, NULL, &_UTF16StaticData, FALSE, &_UTF16Impl,
0
};
--- NEW FILE: ucnv_u32.c ---
/*
**********************************************************************
* Copyright (C) 2002-2003, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* file name: ucnv_u32.c
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2002jul01
* created by: Markus W. Scherer
*
* UTF-32 converter implementation. Used to be in ucnv_utf.c.
*/
#include "unicode/utypes.h"
#include "unicode/ucnv.h"
#include "unicode/ucnv_err.h"
[...1286 lines suppressed...]
sizeof(UConverterStaticData),
"UTF-32",
0, /* ### TODO review correctness of all Unicode CCSIDs */
UCNV_IBM, UCNV_UTF32, 4, 4,
#if U_IS_BIG_ENDIAN
{ 0, 0, 0xff, 0xfd }, 4,
#else
{ 0xfd, 0xff, 0, 0 }, 4,
#endif
FALSE, FALSE,
0,
0,
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
/* Shared-data record for UTF-32, tying _UTF32StaticData to the _UTF32Impl function table. */
const UConverterSharedData _UTF32Data = {
sizeof(UConverterSharedData), ~((uint32_t) 0),
NULL, NULL, &_UTF32StaticData, FALSE, &_UTF32Impl,
0
};
--- NEW FILE: ucnv_u7.c ---
/*
**********************************************************************
* Copyright (C) 2002-2003, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* file name: ucnv_u7.c
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2002jul01
* created by: Markus W. Scherer
*
* UTF-7 converter implementation. Used to be in ucnv_utf.c.
*/
#include "unicode/utypes.h"
#include "unicode/ucnv.h"
#include "unicode/ucnv_err.h"
[...1515 lines suppressed...]
};
/*
 * Static description of the "IMAP-mailbox-name" converter:
 * no CCSID assigned (0), 1 to 4 bytes per UChar, and a 1-byte substitution
 * character 0x3F that is declared but, per the note below, not used.
 */
static const UConverterStaticData _IMAPStaticData={
sizeof(UConverterStaticData),
"IMAP-mailbox-name",
0, /* TODO CCSID for UTF-7 */
UCNV_IBM, UCNV_IMAP_MAILBOX,
1, 4,
{ 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */
FALSE, FALSE,
0,
0,
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
/* Shared-data record tying the static description to the _IMAPImpl function table. */
const UConverterSharedData _IMAPData={
sizeof(UConverterSharedData), ~((uint32_t)0),
NULL, NULL, &_IMAPStaticData, FALSE, &_IMAPImpl,
0
};
--- NEW FILE: ucnv_u8.c ---
/*
**********************************************************************
* Copyright (C) 2002-2003, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* file name: ucnv_u8.c
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2002jul01
* created by: Markus W. Scherer
*
* UTF-8 converter implementation. Used to be in ucnv_utf.c.
*
* Also, CESU-8 implementation, see UTR 26.
* The CESU-8 converter uses all the same functions as the
* UTF-8 converter, with a branch for converting supplementary code points.
*/
[...980 lines suppressed...]
};
/* CESU-8 converter data ---------------------------------------------------- */
/*
 * Static description of CESU-8: no CCSID (0), 1 to 3 bytes per UChar, and the
 * 3-byte substitution sequence EF BF BD.
 */
static const UConverterStaticData _CESU8StaticData={
sizeof(UConverterStaticData),
"CESU-8",
0, UCNV_UNKNOWN, UCNV_CESU8, 1, 3,
{ 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,
0,
0,
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
/*
 * Shared-data record for CESU-8. It reuses the UTF-8 function table
 * (_UTF8Impl); only the static description differs.
 */
const UConverterSharedData _CESU8Data={
sizeof(UConverterSharedData), ~((uint32_t) 0),
NULL, NULL, &_CESU8StaticData, FALSE, &_UTF8Impl,
0
};
--- NEW FILE: ucnvbocu.c ---
/*
******************************************************************************
*
* Copyright (C) 2002-2003, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
* file name: ucnvbocu.c
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2002mar27
* created by: Markus W. Scherer
*
* This is an implementation of the Binary Ordered Compression for Unicode,
* in its MIME-friendly form as defined in ### TODO http://... 1. doc/papers 2. design
*/
[...1582 lines suppressed...]
};
/*
 * Static description of the BOCU-1 converter: no CCSID assigned (0),
 * 1 to 4 bytes per UChar, and a 1-byte substitution character 0x1A.
 */
static const UConverterStaticData _Bocu1StaticData={
sizeof(UConverterStaticData),
"BOCU-1",
0, /* CCSID for BOCU-1 */
UCNV_IBM, UCNV_BOCU1,
1, 4, /* one UChar generates at least 1 byte and at most 4 bytes */
{ 0x1a, 0, 0, 0 }, 1, /* BOCU-1 never needs to write a subchar */
FALSE, FALSE,
0,
0,
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
/* Shared-data record tying the static description to the _Bocu1Impl function table. */
const UConverterSharedData _Bocu1Data={
sizeof(UConverterSharedData), ~((uint32_t)0),
NULL, NULL, &_Bocu1StaticData, FALSE, &_Bocu1Impl,
0
};
--- NEW FILE: uenum.c ---
/*
*******************************************************************************
*
* Copyright (C) 2002, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: uenum.c
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:2
*
* created on: 2002jul08
* created by: Vladimir Weinstein
*/
#include "uenumimp.h"
#include "cmemory.h"
/* Layout of the baseContext buffer.
   The buffer is allocated as sizeof(int32_t) + len bytes; 'data' only marks
   the first byte of the variable-length payload that follows 'len'. */
typedef struct {
int32_t len; /* number of bytes available starting at 'data' */
char data; /* actual data starts here */
} _UEnumBuffer;
/* Extra bytes to allocate in the baseContext buffer, added on top of the
   requested capacity whenever the buffer is allocated or grown. */
static const int32_t PAD = 8;
/* Return a pointer to the data area of the baseContext buffer, allocating
   the buffer, or growing it, if fewer than 'capacity' bytes are available.
   A new or grown buffer gets 'capacity' + PAD data bytes.

   Returns NULL if memory cannot be obtained. In that case en->baseContext is
   left unchanged: the result of the reallocation is only stored after it
   succeeded, so a previously allocated buffer is neither lost nor leaked
   (assuming uprv_realloc, like realloc, leaves the old block intact on
   failure). */
static void* _getBuffer(UEnumeration* en, int32_t capacity) {
    _UEnumBuffer *buffer = (_UEnumBuffer*) en->baseContext;

    if (buffer == NULL || buffer->len < capacity) {
        capacity += PAD;
        if (buffer != NULL) {
            buffer = (_UEnumBuffer*) uprv_realloc(buffer,
                                                  sizeof(int32_t) + capacity);
        } else {
            buffer = (_UEnumBuffer*) uprv_malloc(sizeof(int32_t) + capacity);
        }
        if (buffer == NULL) {
            /* do not touch en->baseContext: the old buffer (if any) is still valid */
            return NULL;
        }
        buffer->len = capacity;
        en->baseContext = buffer;
    }

    return (void*) &buffer->data;
}
/* Close an enumeration and release its resources.
   The base-class conversion buffer (baseContext) is always released first,
   so it is no longer leaked when the implementation supplies no close
   function. The object itself is then handed to the implementation's close
   function, or freed directly if there is none.
   Passing NULL is allowed and does nothing. */
U_CAPI void U_EXPORT2
uenum_close(UEnumeration* en)
{
    if (en == NULL) {
        return;
    }

    /* release the buffer owned by the base class on every path */
    if (en->baseContext != NULL) {
        uprv_free(en->baseContext);
        en->baseContext = NULL; /* do not leave a dangling pointer for close() */
    }

    if (en->close != NULL) {
        en->close(en);
    } else { /* this seems dangerous, but we better kill the object */
        uprv_free(en);
    }
}
/* Return the number of elements of the enumeration by delegating to the
   implementation's count function.
   Returns -1 if en is NULL or *status already indicates a failure; returns -1
   and sets U_UNSUPPORTED_ERROR if the implementation has no count function. */
U_CAPI int32_t U_EXPORT2
uenum_count(UEnumeration* en, UErrorCode* status)
{
    if (en == NULL || U_FAILURE(*status)) {
        return -1;
    }
    if (en->count == NULL) {
        *status = U_UNSUPPORTED_ERROR;
        return -1;
    }
    return en->count(en, status);
}
/* Don't call this directly. Only uenum_unext should be calling this.

   Default uNext implementation: fetches the next element through the
   implementation's char* 'next' function and converts it, including the
   terminating NUL, into the base-class buffer with u_charsToUChars().

   resultLength may be NULL; the length is then tracked in a local variable
   instead of being dereferenced through the NULL pointer.

   Returns NULL when 'next' returns NULL, when the buffer cannot be allocated
   (U_MEMORY_ALLOCATION_ERROR), or when there is no 'next' function
   (U_UNSUPPORTED_ERROR). */
U_CAPI const UChar* U_EXPORT2
uenum_unextDefault(UEnumeration* en,
            int32_t* resultLength,
            UErrorCode* status)
{
    int32_t localLength = 0;
    /* always hand a valid length pointer to next(), even if the caller gave none */
    int32_t *pLength = (resultLength != NULL) ? resultLength : &localLength;
    const char *tempCharVal;
    UChar *tempUCharVal;

    if (en->next == NULL) {
        *status = U_UNSUPPORTED_ERROR;
        return NULL;
    }

    tempCharVal = en->next(en, pLength, status);
    if (tempCharVal == NULL) {
        return NULL;
    }

    /* +1 for the terminating NUL, which is converted as well */
    tempUCharVal = (UChar*)
        _getBuffer(en, (*pLength+1) * sizeof(UChar));
    if (!tempUCharVal) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }
    u_charsToUChars(tempCharVal, tempUCharVal, *pLength + 1);
    return tempUCharVal;
}
/* Don't call this directly. Only uenum_next should be calling this.

   Default next implementation: fetches the next element through the
   implementation's UChar* 'uNext' function and converts it, including the
   terminating NUL, into the base-class buffer with u_UCharsToChars().

   resultLength may be NULL; the length is then tracked in a local variable
   instead of being dereferenced through the NULL pointer.

   Returns NULL when 'uNext' returns NULL, when the buffer cannot be allocated
   (U_MEMORY_ALLOCATION_ERROR), or when there is no 'uNext' function
   (U_UNSUPPORTED_ERROR). */
U_CAPI const char* U_EXPORT2
uenum_nextDefault(UEnumeration* en,
            int32_t* resultLength,
            UErrorCode* status)
{
    int32_t localLength = 0;
    /* always hand a valid length pointer to uNext(), even if the caller gave none */
    int32_t *pLength = (resultLength != NULL) ? resultLength : &localLength;
    const UChar *tempUCharVal;
    char *tempCharVal;

    if (en->uNext == NULL) {
        *status = U_UNSUPPORTED_ERROR;
        return NULL;
    }

    tempUCharVal = en->uNext(en, pLength, status);
    if (tempUCharVal == NULL) {
        return NULL;
    }

    /* +1 for the terminating NUL, which is converted as well */
    tempCharVal = (char*)
        _getBuffer(en, (*pLength+1) * sizeof(char));
    if (!tempCharVal) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }
    u_UCharsToChars(tempUCharVal, tempCharVal, *pLength + 1);
    return tempCharVal;
}
/* Return the next element as a UChar string by delegating to the
   implementation's uNext function.
   Returns NULL if en is NULL or *status already indicates a failure; returns
   NULL and sets U_UNSUPPORTED_ERROR if there is no uNext function. */
U_CAPI const UChar* U_EXPORT2
uenum_unext(UEnumeration* en,
            int32_t* resultLength,
            UErrorCode* status)
{
    if (en == NULL || U_FAILURE(*status)) {
        return NULL;
    }
    if (en->uNext == NULL) {
        *status = U_UNSUPPORTED_ERROR;
        return NULL;
    }
    return en->uNext(en, resultLength, status);
}
/* Return the next element as a char string by delegating to the
   implementation's next function.
   Returns NULL if en is NULL or *status already indicates a failure; returns
   NULL and sets U_UNSUPPORTED_ERROR if there is no next function. */
U_CAPI const char* U_EXPORT2
uenum_next(UEnumeration* en,
          int32_t* resultLength,
          UErrorCode* status)
{
    if (en == NULL || U_FAILURE(*status)) {
        return NULL;
    }
    if (en->next == NULL) {
        *status = U_UNSUPPORTED_ERROR;
        return NULL;
    }
    return en->next(en, resultLength, status);
}
/* Reset the enumeration by delegating to the implementation's reset function.
   Does nothing if en is NULL or *status already indicates a failure; sets
   U_UNSUPPORTED_ERROR if there is no reset function. */
U_CAPI void U_EXPORT2
uenum_reset(UEnumeration* en, UErrorCode* status)
{
    if (en == NULL || U_FAILURE(*status)) {
        return;
    }
    if (en->reset == NULL) {
        *status = U_UNSUPPORTED_ERROR;
        return;
    }
    en->reset(en, status);
}
--- NEW FILE: uenumimp.h ---
/*
*******************************************************************************
*
* Copyright (C) 2002, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: uenumimp.h
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:2
*
* created on: 2002jul08
* created by: Vladimir Weinstein
*/
#ifndef __UENUMIMP_H
#define __UENUMIMP_H
#include "unicode/uenum.h"
U_CDECL_BEGIN
/**
* The following are the function type declarations for
* implementations of the uenum APIs. If the function that
* an API needs is NULL, that API sets U_UNSUPPORTED_ERROR.
* If close is NULL, uenum_close() frees the enumeration
* object itself.
* The initial error checking is done in the body of each
* API function, so the implementations need not check
* the initial error condition.
*/
/**
* Function type declaration for uenum_close().
*
* This function should clean up the enumerator object.
*
* @param en enumeration to be closed
*/
typedef void U_CALLCONV
UEnumClose(UEnumeration *en);
/**
* Function type declaration for uenum_count().
*
* This function should count the number of elements
* in this enumeration.
*
* @param en enumeration to be counted
* @param status pointer to UErrorCode variable
* @return number of elements in enumeration
*/
typedef int32_t U_CALLCONV
UEnumCount(UEnumeration *en, UErrorCode *status);
/**
* Function type declaration for uenum_unext().
*
* This function should return the next element
* as a UChar *.
*
* @param en enumeration
* @param resultLength pointer to result length
* @param status pointer to UErrorCode variable
* @return next element as UChar *
*/
typedef const UChar* U_CALLCONV
UEnumUNext(UEnumeration* en,
int32_t* resultLength,
UErrorCode* status);
/**
* Function type declaration for uenum_next().
*
* This function should return the next element
* as a char *.
*
* @param en enumeration
* @param resultLength pointer to result length
* @param status pointer to UErrorCode variable
* @return next element as char *
*/
typedef const char* U_CALLCONV
UEnumNext(UEnumeration* en,
int32_t* resultLength,
UErrorCode* status);
/**
* Function type declaration for uenum_reset().
*
* This function should reset the enumeration
* object.
*
* @param en enumeration
* @param status pointer to UErrorCode variable
*/
typedef void U_CALLCONV
UEnumReset(UEnumeration* en,
UErrorCode* status);
/**
* The enumeration object: two context pointers plus the table of
* implementation functions that the uenum_xxx() APIs dispatch to.
*/
struct UEnumeration {
/* baseContext. For the base class only. Don't touch!
   (holds the conversion buffer used by the default next/uNext functions) */
void *baseContext;
/* context. Use it for what you need */
void *context;
/**
* these are functions that will
* be used for APIs
*/
/* called from uenum_close */
UEnumClose *close;
/* called from uenum_count */
UEnumCount *count;
/* called from uenum_unext */
UEnumUNext *uNext;
/* called from uenum_next */
UEnumNext *next;
/* called from uenum_reset */
UEnumReset *reset;
};
U_CDECL_END
/* This is the default implementation for uenum_unext().
* It obtains the char * string from the enumeration's 'next' function
* and converts it to UChar *.
* Don't call this directly. This is called internally by uenum_unext
* when a UEnumeration is defined with 'uNext' pointing to this
* function.
*/
U_CAPI const UChar* U_EXPORT2
uenum_unextDefault(UEnumeration* en,
int32_t* resultLength,
UErrorCode* status);
/* This is the default implementation for uenum_next().
* It obtains the UChar * string from the enumeration's 'uNext' function
* and converts it to char *.
* Don't call this directly. This is called internally by uenum_next
* when a UEnumeration is defined with 'next' pointing to this
* function.
*/
U_CAPI const char* U_EXPORT2
uenum_nextDefault(UEnumeration* en,
int32_t* resultLength,
UErrorCode* status);
#endif
--- NEW FILE: uidna.cpp ---
/*
*******************************************************************************
*
* Copyright (C) 2003, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: uidna.cpp
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2003feb1
* created by: Ram Viswanadha
*/
#include "unicode/utypes.h"
#if !UCONFIG_NO_IDNA
#include "unicode/uidna.h"
#include "unicode/ustring.h"
#include "strprep.h"
#include "punycode.h"
#include "ustr_imp.h"
#include "cmemory.h"
#include "sprpimpl.h"
/* The official IDNA ACE prefix is "xn--" (stored here without a terminating NUL) */
static const UChar ACE_PREFIX[] ={ 0x0078,0x006E,0x002d,0x002d } ;
/* number of UChars in ACE_PREFIX */
#define ACE_PREFIX_LENGTH 4
/* longest label that uidna_toASCII accepts; longer results set U_IDNA_LABEL_TOO_LONG_ERROR */
#define MAX_LABEL_LENGTH 63
#define HYPHEN 0x002D
/* Size, in UChars, of the stack buffers used for a single label.
   It is larger than MAX_LABEL_LENGTH; longer intermediate results are
   moved to heap buffers. */
#define MAX_LABEL_BUFFER_SIZE 100
#define MAX_IDN_BUFFER_SIZE 300
/* ASCII 'A'..'Z' and the distance to the corresponding lower case letters */
#define CAPITAL_A 0x0041
#define CAPITAL_Z 0x005A
#define LOWER_CASE_DELTA 0x0020
#define FULL_STOP 0x002E
// Map an ASCII capital letter (A-Z) to its lower case form;
// every other code unit is returned unchanged.
inline static UChar
toASCIILower(UChar ch){
    const UBool isCapital = (UBool)(ch >= CAPITAL_A && ch <= CAPITAL_Z);
    return isCapital ? (UChar)(ch + LOWER_CASE_DELTA) : ch;
}
// Return TRUE if the first ACE_PREFIX_LENGTH code units of src equal the
// ACE prefix, comparing ASCII letters case-insensitively.
// Strings shorter than the prefix never match.
inline static UBool
startsWithPrefix(const UChar* src , int32_t srcLength){
    if(srcLength < ACE_PREFIX_LENGTH){
        return FALSE;
    }
    for(int32_t pos=0; pos<ACE_PREFIX_LENGTH; ++pos){
        if(toASCIILower(src[pos]) != ACE_PREFIX[pos]){
            // first mismatch decides the result
            return FALSE;
        }
    }
    return TRUE;
}
// Lower-case the ASCII capital letters of src in place;
// srcLen is the number of code units to process.
inline static void
toASCIILower(UChar* src, int32_t srcLen){
    for(UChar* cursor=src; srcLen>0; --srcLen, ++cursor){
        *cursor = toASCIILower(*cursor);
    }
}
inline static int32_t
compareCaseInsensitiveASCII(const UChar* s1, int32_t s1Len,
const UChar* s2, int32_t s2Len){
int32_t minLength;
int32_t lengthResult;
// are we comparing different lengths?
if(s1Len != s2Len) {
if(s1Len < s2Len) {
minLength = s1Len;
lengthResult = -1;
} else {
minLength = s2Len;
lengthResult = 1;
}
} else {
// ok the lengths are equal
minLength = s1Len;
lengthResult = 0;
}
UChar c1,c2;
int32_t rc;
for(int32_t i =0;/* no condition */;i++) {
/* If we reach the ends of both strings then they match */
if(i == minLength) {
return lengthResult;
}
c1 = s1[i];
c2 = s2[i];
/* Case-insensitive comparison */
if(c1!=c2) {
rc=(int32_t)toASCIILower(c1)-(int32_t)toASCIILower(c2);
if(rc!=0) {
lengthResult=rc;
break;
}
}
}
return lengthResult;
}
/**
 * Apply the IDNA ToASCII operation to a single label.
 *
 * The label is run through nameprep. If the result is pure ASCII it is copied
 * to dest; otherwise it is Punycode-encoded and written with the ACE prefix.
 *
 * @param src           label to convert
 * @param srcLength     number of UChars in src, or -1 if NUL-terminated
 * @param dest          output buffer; may be NULL only if destCapacity is 0
 * @param destCapacity  capacity of dest in UChars
 * @param options       bit set of UIDNA_ALLOW_UNASSIGNED / UIDNA_USE_STD3_RULES
 * @param parseError    receives the error position for STD3 and ACE-prefix errors
 * @param status        in/out error code. Set by this function:
 *                      U_ILLEGAL_ARGUMENT_ERROR, U_MEMORY_ALLOCATION_ERROR,
 *                      U_IDNA_STD3_ASCII_RULES_ERROR, U_IDNA_ACE_PREFIX_ERROR,
 *                      U_IDNA_LABEL_TOO_LONG_ERROR, U_BUFFER_OVERFLOW_ERROR
 * @return the required length of the output (0 if the call bailed out early)
 *
 * An empty nameprep result no longer triggers reads of b1[0] and b1[-1] in
 * the STD3 hyphen check; the hyphen tests are only made for non-empty labels.
 */
U_CAPI int32_t U_EXPORT2
uidna_toASCII(const UChar* src, int32_t srcLength,
              UChar* dest, int32_t destCapacity,
              int32_t options,
              UParseError* parseError,
              UErrorCode* status){

    if(status == NULL || U_FAILURE(*status)){
        return 0;
    }
    if((src==NULL) || (srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0)){
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    UChar b1Stack[MAX_LABEL_BUFFER_SIZE], b2Stack[MAX_LABEL_BUFFER_SIZE];
    //initialize pointers to stack buffers
    UChar  *b1 = b1Stack, *b2 = b2Stack;
    int32_t b1Len, b2Len,
            b1Capacity = MAX_LABEL_BUFFER_SIZE,
            b2Capacity = MAX_LABEL_BUFFER_SIZE ,
            reqLength=0;

    UBool* caseFlags = NULL;

    // the source contains all ascii codepoints
    UBool srcIsASCII  = TRUE;
    // assume the source contains all LDH codepoints
    UBool srcIsLDH = TRUE;
    int32_t j=0;

    //get the options
    UBool allowUnassigned   = (UBool)((options & UIDNA_ALLOW_UNASSIGNED) != 0);
    UBool useSTD3ASCIIRules = (UBool)((options & UIDNA_USE_STD3_RULES) != 0);

    int32_t failPos = -1;
    // step 2
    StringPrep* prep = StringPrep::createNameprepInstance(*status);

    if(U_FAILURE(*status)){
        goto CLEANUP;
    }

    b1Len = prep->process(src,srcLength,b1, b1Capacity,allowUnassigned, parseError, *status);

    if(*status == U_BUFFER_OVERFLOW_ERROR){
        // redo processing of string
        // we do not have enough room so grow the buffer
        b1 = (UChar*) uprv_malloc(b1Len * U_SIZEOF_UCHAR);
        if(b1==NULL){
            *status = U_MEMORY_ALLOCATION_ERROR;
            goto CLEANUP;
        }

        *status = U_ZERO_ERROR; // reset error

        b1Len = prep->process(src,srcLength,b1, b1Len,allowUnassigned, parseError, *status);
    }
    // error bail out
    if(U_FAILURE(*status)){
        goto CLEANUP;
    }

    // step 3 & 4
    for( j=0;j<b1Len;j++){
        if(b1[j] > 0x7F){
            srcIsASCII = FALSE;
        }
        // here we do not assemble surrogates
        // since we know that LDH code points
        // are in the ASCII range only
        if(prep->isLDHChar(b1[j])==FALSE){
            srcIsLDH = FALSE;
            failPos = j;
        }
    }

    if(useSTD3ASCIIRules == TRUE){
        // verify 3a and 3b
        // the hyphen tests index b1[0] and b1[b1Len-1], so they are only
        // valid for a non-empty label
        if( srcIsLDH == FALSE /* source contains some non-LDH characters */
            || (b1Len > 0 && (b1[0] ==  HYPHEN || b1[b1Len-1] == HYPHEN))){
            *status = U_IDNA_STD3_ASCII_RULES_ERROR;

            /* populate the parseError struct */
            if(srcIsLDH==FALSE){
                // failPos is always set the index of failure
                uprv_syntaxError(b1,failPos, b1Len,parseError);
            }else if(b1[0] == HYPHEN){
                // fail position is 0
                uprv_syntaxError(b1,0,b1Len,parseError);
            }else{
                // the last index in the source is always length-1
                uprv_syntaxError(b1, (b1Len>0) ? b1Len-1 : b1Len, b1Len,parseError);
            }

            goto CLEANUP;
        }
    }

    if(srcIsASCII){
        if(b1Len <= destCapacity){
            uprv_memmove(dest, b1, b1Len * U_SIZEOF_UCHAR);
            reqLength = b1Len;
        }else{
            // the overflow is reported by u_terminateUChars() below
            reqLength = b1Len;
            goto CLEANUP;
        }
    }else{
        // step 5 : verify the sequence does not begin with ACE prefix
        if(!startsWithPrefix(b1,b1Len)){

            //step 6: encode the sequence with punycode

            // do not preserve the case flags for now!
            // TODO: Preserve the case while implementing the RFE
            // caseFlags = (UBool*) uprv_malloc(b1Len * sizeof(UBool));
            // uprv_memset(caseFlags,TRUE,b1Len);

            b2Len = u_strToPunycode(b1,b1Len,b2,b2Capacity,caseFlags, status);

            if(*status == U_BUFFER_OVERFLOW_ERROR){
                // redo processing of string
                /* we do not have enough room so grow the buffer*/
                b2 = (UChar*) uprv_malloc(b2Len * U_SIZEOF_UCHAR);
                if(b2 == NULL){
                    *status = U_MEMORY_ALLOCATION_ERROR;
                    goto CLEANUP;
                }

                *status = U_ZERO_ERROR; // reset error

                b2Len = u_strToPunycode(b1,b1Len,b2,b2Len,caseFlags, status);
            }
            //error bail out
            if(U_FAILURE(*status)){
                goto CLEANUP;
            }
            // TODO : Reconsider while implementing the case preserve RFE
            // convert all codepoints to lower case ASCII
            // toASCIILower(b2,b2Len);
            reqLength = b2Len+ACE_PREFIX_LENGTH;

            if(reqLength > destCapacity){
                *status = U_BUFFER_OVERFLOW_ERROR;
                goto CLEANUP;
            }
            //Step 7: prepend the ACE prefix
            uprv_memcpy(dest,ACE_PREFIX,ACE_PREFIX_LENGTH * U_SIZEOF_UCHAR);
            //Step 6: copy the contents in b2 into dest
            uprv_memcpy(dest+ACE_PREFIX_LENGTH, b2, b2Len * U_SIZEOF_UCHAR);

        }else{
            *status = U_IDNA_ACE_PREFIX_ERROR;
            //position of failure is 0
            uprv_syntaxError(b1,0,b1Len,parseError);
            goto CLEANUP;
        }
    }

    if(reqLength > MAX_LABEL_LENGTH){
        *status = U_IDNA_LABEL_TOO_LONG_ERROR;
    }

CLEANUP:
    // release only the buffers that were moved to the heap
    if(b1 != b1Stack){
        uprv_free(b1);
    }
    if(b2 != b2Stack){
        uprv_free(b2);
    }
    uprv_free(caseFlags);

    delete prep;

    return u_terminateUChars(dest, destCapacity, reqLength, status);
}
/**
 * Apply the IDNA ToUnicode operation to a single label.
 *
 * A label that carries the ACE prefix is Punycode-decoded and the result is
 * verified by converting it back with uidna_toASCII(). A label without the
 * prefix is copied unchanged (after the optional STD3 check). If any step
 * fails, the source is copied to dest, as the RFC quote at the end of the
 * function requires.
 *
 * @param src           label to convert
 * @param srcLength     number of UChars in src, or -1 if NUL-terminated
 * @param dest          output buffer; may be NULL only if destCapacity is 0
 * @param destCapacity  capacity of dest in UChars
 * @param options       bit set of UIDNA_ALLOW_UNASSIGNED / UIDNA_USE_STD3_RULES
 * @param parseError    receives the error position for STD3 errors
 * @param status        in/out error code
 * @return the required length of the output
 *
 * Fixes relative to the previous revision:
 *  - a heap-allocated b3 buffer is now freed in CLEANUP;
 *  - the ACE prefix is tested on b1, the output of nameprep (step 2),
 *    not on the raw source;
 *  - the STD3 hyphen check no longer reads src[0] / src[srcLength-1] for an
 *    empty label;
 *  - on failure with srcLength still -1 the real length is computed first, so
 *    the fallback copy respects destCapacity and the returned length is valid.
 */
U_CAPI int32_t U_EXPORT2
uidna_toUnicode(const UChar* src, int32_t srcLength,
                UChar* dest, int32_t destCapacity,
                int32_t options,
                UParseError* parseError,
                UErrorCode* status){

    if(status == NULL || U_FAILURE(*status)){
        return 0;
    }
    if( (src==NULL) || (srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0)){
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    //get the options
    UBool allowUnassigned   = (UBool)((options & UIDNA_ALLOW_UNASSIGNED) != 0);
    UBool useSTD3ASCIIRules = (UBool)((options & UIDNA_USE_STD3_RULES) != 0);

    UChar b1Stack[MAX_LABEL_BUFFER_SIZE], b2Stack[MAX_LABEL_BUFFER_SIZE], b3Stack[MAX_LABEL_BUFFER_SIZE];

    //initialize pointers to stack buffers
    UChar  *b1 = b1Stack, *b2 = b2Stack, *b1Prime=NULL, *b3=b3Stack;
    int32_t b1Len, b2Len, b1PrimeLen, b3Len,
            b1Capacity = MAX_LABEL_BUFFER_SIZE,
            b2Capacity = MAX_LABEL_BUFFER_SIZE,
            b3Capacity = MAX_LABEL_BUFFER_SIZE,
            reqLength=0;

    StringPrep* prep = StringPrep::createNameprepInstance(*status);
    b1Len = 0;
    UBool* caseFlags = NULL;

    UBool srcIsASCII = TRUE;
    UBool srcIsLDH = TRUE;
    int32_t failPos =0;

    if(U_FAILURE(*status)){
        goto CLEANUP;
    }
    // step 1: find out if all the codepoints in src are ASCII
    if(srcLength==-1){
        // NUL-terminated input: measure the length while scanning
        srcLength = 0;
        for(;src[srcLength]!=0;){
            if(src[srcLength]> 0x7f){
                srcIsASCII = FALSE;
            }
            // here we do not assemble surrogates
            // since we know that LDH code points
            // are in the ASCII range only
            if(prep->isLDHChar(src[srcLength])==FALSE){
                srcIsLDH = FALSE;
                failPos = srcLength;
            }
            srcLength++;
        }
    }else{
        for(int32_t j=0; j<srcLength; j++){
            if(src[j]> 0x7f){
                srcIsASCII = FALSE;
            }
            // here we do not assemble surrogates
            // since we know that LDH code points
            // are in the ASCII range only
            if(prep->isLDHChar(src[j])==FALSE){
                srcIsLDH = FALSE;
                failPos = j;
            }
        }
    }

    if(srcIsASCII == FALSE){
        // step 2: process the string
        b1Len = prep->process(src,srcLength,b1,b1Capacity,allowUnassigned, parseError, *status);
        if(*status == U_BUFFER_OVERFLOW_ERROR){
            // redo processing of string
            /* we do not have enough room so grow the buffer*/
            b1 = (UChar*) uprv_malloc(b1Len * U_SIZEOF_UCHAR);
            if(b1==NULL){
                *status = U_MEMORY_ALLOCATION_ERROR;
                goto CLEANUP;
            }

            *status = U_ZERO_ERROR; // reset error

            b1Len = prep->process(src,srcLength,b1, b1Len,allowUnassigned, parseError, *status);
        }
        //bail out on error
        if(U_FAILURE(*status)){
            goto CLEANUP;
        }
    }else{

        //just point src to b1
        b1 = (UChar*) src;
        b1Len = srcLength;
    }

    //step 3: verify ACE Prefix
    // the test is made on the sequence produced by step 2 (b1); for pure
    // ASCII input b1 is src itself
    if(startsWithPrefix(b1,b1Len)){

        //step 4: Remove the ACE Prefix
        b1Prime = b1 + ACE_PREFIX_LENGTH;
        b1PrimeLen  = b1Len - ACE_PREFIX_LENGTH;

        //step 5: Decode using punycode
        b2Len = u_strFromPunycode(b1Prime, b1PrimeLen, b2, b2Capacity, caseFlags,status);

        if(*status == U_BUFFER_OVERFLOW_ERROR){
            // redo processing of string
            /* we do not have enough room so grow the buffer*/
            b2 = (UChar*) uprv_malloc(b2Len * U_SIZEOF_UCHAR);
            if(b2==NULL){
                *status = U_MEMORY_ALLOCATION_ERROR;
                goto CLEANUP;
            }

            *status = U_ZERO_ERROR; // reset error

            b2Len =  u_strFromPunycode(b1Prime, b1PrimeLen, b2, b2Len, caseFlags, status);
        }

        //step 6:Apply toASCII
        // (returns immediately, leaving status untouched, if step 5 failed)
        b3Len = uidna_toASCII(b2, b2Len, b3, b3Capacity,options,parseError, status);

        if(*status == U_BUFFER_OVERFLOW_ERROR){
            // redo processing of string
            /* we do not have enough room so grow the buffer*/
            b3 = (UChar*) uprv_malloc(b3Len * U_SIZEOF_UCHAR);
            if(b3==NULL){
                *status = U_MEMORY_ALLOCATION_ERROR;
                goto CLEANUP;
            }

            *status = U_ZERO_ERROR; // reset error

            b3Len =  uidna_toASCII(b2,b2Len,b3,b3Len,options,parseError, status);

        }
        //bail out on error
        if(U_FAILURE(*status)){
            goto CLEANUP;
        }

        //step 7: verify
        if(compareCaseInsensitiveASCII(b1, b1Len, b3, b3Len) !=0){
            *status = U_IDNA_VERIFICATION_ERROR;
            goto CLEANUP;
        }

        //step 8: return output of step 5
        reqLength = b2Len;
        if(b2Len <= destCapacity) {
            uprv_memmove(dest, b2, b2Len * U_SIZEOF_UCHAR);
        }
    }else{
        // verify that STD3 ASCII rules are satisfied
        if(useSTD3ASCIIRules == TRUE){
            // the hyphen tests index src[0] and src[srcLength-1], so they are
            // only valid for a non-empty label
            if( srcIsLDH == FALSE /* source contains some non-LDH characters */
                || (srcLength > 0 && (src[0] ==  HYPHEN || src[srcLength-1] == HYPHEN))){
                *status = U_IDNA_STD3_ASCII_RULES_ERROR;

                /* populate the parseError struct */
                if(srcIsLDH==FALSE){
                    // failPos is always set the index of failure
                    uprv_syntaxError(src,failPos, srcLength,parseError);
                }else if(src[0] == HYPHEN){
                    // fail position is 0
                    uprv_syntaxError(src,0,srcLength,parseError);
                }else{
                    // the last index in the source is always length-1
                    uprv_syntaxError(src, (srcLength>0) ? srcLength-1 : srcLength, srcLength,parseError);
                }

                goto CLEANUP;
            }
        }
        //copy the source to destination
        if(srcLength <= destCapacity){
            uprv_memmove(dest,src,srcLength * U_SIZEOF_UCHAR);
        }
        reqLength = srcLength;
    }

CLEANUP:

    // release only the buffers that were moved to the heap;
    // b1 may alias src, which is owned by the caller
    if(b1 != b1Stack && b1!=src){
        uprv_free(b1);
    }
    if(b2 != b2Stack){
        uprv_free(b2);
    }
    if(b3 != b3Stack){
        uprv_free(b3);
    }
    uprv_free(caseFlags);

    delete prep;

    // The RFC states that
    // <quote>
    // ToUnicode never fails. If any step fails, then the original input
    // is returned immediately in that step.
    // </quote>
    // So if any step fails lets copy source to destination
    if(U_FAILURE(*status)){
        // srcLength is still -1 only if we bailed out before step 1 measured
        // the NUL-terminated source; measure it now so that the capacity
        // check and the returned length are meaningful
        if(srcLength == -1){
            srcLength = u_strlen(src);
        }
        //copy the source to destination
        if(dest && srcLength <= destCapacity){
            uprv_memmove(dest,src,srcLength * U_SIZEOF_UCHAR);
        }
        reqLength = srcLength;
    }

    return u_terminateUChars(dest, destCapacity, reqLength, status);
}
// Scans one label starting at src.
// Returns the number of UChars in the label, not counting a trailing
// label separator. *limit is set just past the separator when one is
// found, otherwise to the end of the input, in which case *done is
// also set to TRUE. A srcLength of -1 means src is NUL-terminated.
static int32_t
getNextSeparator(UChar *src,int32_t srcLength,StringPrep* prep,
                 UChar **limit,
                 UBool *done,
                 UErrorCode *status){
    const UBool isNulTerminated = (UBool)(srcLength == -1);
    int32_t pos = 0;

    while(isNulTerminated || pos < srcLength){
        if(isNulTerminated && src[pos] == 0){
            break; // reached the terminating NUL
        }
        if(prep->isLabelSeparator(src[pos],*status)){
            *limit = src + pos + 1; // continue after the separator
            return pos;
        }
        ++pos;
    }

    // no separator was found before the end of the input
    *limit = isNulTerminated ? src + pos : src + srcLength;
    *done = TRUE;
    return pos;
}
// Applies uidna_toASCII() to every label of src and joins the converted
// labels with FULL_STOP in dest.
// A srcLength of -1 is passed through to getNextSeparator() as
// "NUL-terminated". When dest is too small the remaining labels are still
// processed with a capacity of 0 so that the full required length is
// accumulated; the final status and termination are left to
// u_terminateUChars().
U_CAPI int32_t U_EXPORT2
uidna_IDNToASCII( const UChar *src, int32_t srcLength,
                  UChar* dest, int32_t destCapacity,
                  int32_t options,
                  UParseError *parseError,
                  UErrorCode *status){
    // nothing to do without a usable, non-failing status
    if(status == NULL || U_FAILURE(*status)){
        return 0;
    }
    // reject inconsistent source/destination descriptions
    if(src==NULL || srcLength < -1 || destCapacity < 0 || (dest==NULL && destCapacity > 0)){
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    int32_t totalLength = 0;
    StringPrep* prep = StringPrep::createNameprepInstance(*status);
    if(U_FAILURE(*status)){
        return 0;
    }

    UChar *nextLabel = (UChar*)src;   // set by getNextSeparator()
    UChar *label = (UChar*)src;       // start of the label being converted
    UChar *out = (UChar*)dest;        // write position in dest
    int32_t srcLeft = srcLength;
    int32_t outLeft = destCapacity;
    int32_t labelLen = 0, converted = 0;
    UBool atEnd = FALSE;

    for(;;){
        labelLen = getNextSeparator(label, srcLeft, prep, &nextLabel, &atEnd, status);
        converted = uidna_toASCII(label, labelLen,
                                  out, outLeft,
                                  options, parseError, status);

        if(*status == U_BUFFER_OVERFLOW_ERROR){
            // keep going in preflight mode: only lengths are collected from now on
            *status = U_ZERO_ERROR;
            outLeft = 0;
        }else if(U_FAILURE(*status)){
            break;
        }

        totalLength += converted;
        if(converted < outLeft){
            // the label was written; step past it
            out += converted;
            outLeft -= converted;
        }else{
            // no room left for anything after this label
            outLeft = 0;
        }

        if(atEnd == TRUE){
            break;
        }

        // emit the separator between this label and the next one
        if(outLeft > 0){
            *out++ = FULL_STOP;
            outLeft--;
        }
        totalLength++;

        label = nextLabel;
        if(srcLeft > 0){
            srcLeft = srcLength - (nextLabel - src);
        }
    }

    delete prep;
    return u_terminateUChars(dest, destCapacity, totalLength, status);
}
// Applies uidna_toUnicode() to every label of src and joins the converted
// labels with FULL_STOP in dest.
// A srcLength of -1 is passed through to getNextSeparator() as
// "NUL-terminated". When dest is too small the remaining labels are still
// processed with a capacity of 0 so that the full required length is
// accumulated; the final status and termination are left to
// u_terminateUChars().
U_CAPI int32_t U_EXPORT2
uidna_IDNToUnicode( const UChar* src, int32_t srcLength,
                    UChar* dest, int32_t destCapacity,
                    int32_t options,
                    UParseError* parseError,
                    UErrorCode* status){
    // nothing to do without a usable, non-failing status
    if(status == NULL || U_FAILURE(*status)){
        return 0;
    }
    // reject inconsistent source/destination descriptions
    if(src==NULL || srcLength < -1 || destCapacity < 0 || (dest==NULL && destCapacity > 0)){
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    int32_t totalLength = 0;
    StringPrep* prep = StringPrep::createNameprepInstance(*status);
    if(U_FAILURE(*status)){
        return 0;
    }

    UChar *nextLabel = (UChar*)src;   // set by getNextSeparator()
    UChar *label = (UChar*)src;       // start of the label being converted
    UChar *out = (UChar*)dest;        // write position in dest
    int32_t srcLeft = srcLength;
    int32_t outLeft = destCapacity;
    int32_t labelLen = 0, converted = 0;
    UBool atEnd = FALSE;

    for(;;){
        labelLen = getNextSeparator(label, srcLeft, prep, &nextLabel, &atEnd, status);
        converted = uidna_toUnicode(label, labelLen,
                                    out, outLeft,
                                    options, parseError, status);

        if(*status == U_BUFFER_OVERFLOW_ERROR){
            // keep going in preflight mode: only lengths are collected from now on
            *status = U_ZERO_ERROR;
            outLeft = 0;
        }else if(U_FAILURE(*status)){
            break;
        }

        totalLength += converted;
        if(converted < outLeft){
            // the label was written; step past it
            out += converted;
            outLeft -= converted;
        }else{
            // no room left for anything after this label
            outLeft = 0;
        }

        if(atEnd == TRUE){
            break;
        }

        // emit the separator between this label and the next one
        if(outLeft > 0){
            *out++ = FULL_STOP;
            outLeft--;
        }
        totalLength++;

        label = nextLabel;
        if(srcLeft > 0){
            srcLeft = srcLength - (nextLabel - src);
        }
    }

    delete prep;
    return u_terminateUChars(dest, destCapacity, totalLength, status);
}
// Compares two IDNs by converting each with uidna_IDNToASCII() and then
// comparing the converted forms with compareCaseInsensitiveASCII().
//
// @param s1, length1  first IDN and its length (passed to uidna_IDNToASCII)
// @param s2, length2  second IDN and its length (passed to uidna_IDNToASCII)
// @param options      passed through to uidna_IDNToASCII
// @param status       ICU error code; must not be NULL or already failing
// @return the result of compareCaseInsensitiveASCII() on the two converted
//         strings, or -1 if status is NULL/failing on entry or if either
//         conversion fails (in which case *status holds the failure)
U_CAPI int32_t U_EXPORT2
uidna_compare( const UChar *s1, int32_t length1,
               const UChar *s2, int32_t length2,
               int32_t options,
               UErrorCode* status){
    if(status == NULL || U_FAILURE(*status)){
        return -1;
    }

    UChar b1Stack[MAX_IDN_BUFFER_SIZE], b2Stack[MAX_IDN_BUFFER_SIZE];
    UChar *b1 = b1Stack, *b2 = b2Stack;
    int32_t b1Len, b2Len, b1Capacity = MAX_IDN_BUFFER_SIZE, b2Capacity = MAX_IDN_BUFFER_SIZE;
    int32_t result=-1;
    UParseError parseError;

    b1Len = uidna_IDNToASCII(s1, length1, b1, b1Capacity, options, &parseError, status);
    if(*status == U_BUFFER_OVERFLOW_ERROR){
        // the stack buffer was too small: retry with a heap buffer of the reported size
        b1 = (UChar*) uprv_malloc(b1Len * U_SIZEOF_UCHAR);
        if(b1==NULL){
            *status = U_MEMORY_ALLOCATION_ERROR;
            goto CLEANUP;
        }
        *status = U_ZERO_ERROR; // reset error
        b1Len = uidna_IDNToASCII(s1,length1,b1,b1Len, options, &parseError, status);
    }
    // Do not compare a buffer whose conversion failed: its contents are
    // incomplete and the comparison result would be meaningless.
    if(U_FAILURE(*status)){
        goto CLEANUP;
    }

    b2Len = uidna_IDNToASCII(s2,length2, b2,b2Capacity, options, &parseError, status);
    if(*status == U_BUFFER_OVERFLOW_ERROR){
        // the stack buffer was too small: retry with a heap buffer of the reported size
        b2 = (UChar*) uprv_malloc(b2Len * U_SIZEOF_UCHAR);
        if(b2==NULL){
            *status = U_MEMORY_ALLOCATION_ERROR;
            goto CLEANUP;
        }
        *status = U_ZERO_ERROR; // reset error
        b2Len = uidna_IDNToASCII(s2, length2, b2, b2Len, options, &parseError, status);
    }
    if(U_FAILURE(*status)){
        goto CLEANUP;
    }

    // when toASCII is applied all label separators are replaced with FULL_STOP
    result = compareCaseInsensitiveASCII(b1,b1Len,b2,b2Len);

CLEANUP:
    // only heap buffers are released; the stack buffers go away on return
    if(b1 != b1Stack){
        uprv_free(b1);
    }
    if(b2 != b2Stack){
        uprv_free(b2);
    }
    return result;
}
#endif /* #if !UCONFIG_NO_IDNA */
--- NEW FILE: uiter.cpp ---
/*
*******************************************************************************
*
* Copyright (C) 2002-2003, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: uiter.cpp
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2002jan18
* created by: Markus W. Scherer
*/
#include "unicode/utypes.h"
#include "unicode/ustring.h"
#include "unicode/chariter.h"
[...1075 lines suppressed...]
return UITER_NO_STATE;
} else {
return iter->getState(iter);
}
}
/*
 * Forward a saved state to the iterator's own setState() function.
 * Does nothing if pErrorCode is NULL or already indicates a failure.
 * Sets U_ILLEGAL_ARGUMENT_ERROR for a NULL iterator and
 * U_UNSUPPORTED_ERROR for an iterator without a setState() function.
 */
U_CAPI void U_EXPORT2
uiter_setState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) {
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return;
    }
    if(iter==NULL) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }
    if(iter->setState==NULL) {
        *pErrorCode=U_UNSUPPORTED_ERROR;
        return;
    }
    iter->setState(iter, state, pErrorCode);
}
U_CDECL_END
--- NEW FILE: unifilt.cpp ---
/*
* Copyright (C) 2001, International Business Machines Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
* 07/18/01 aliu Creation.
**********************************************************************
*/
#include "unicode/unifilt.h"
#include "unicode/rep.h"
U_NAMESPACE_BEGIN
// Out-of-class definition of the static member UnicodeFilter::fgClassID.
// NOTE(review): presumably only its address is used to identify the class -- confirm against unicode/unifilt.h.
const char UnicodeFilter::fgClassID=0;
/**
 * UnicodeFunctor API. Returns this object viewed as a
 * UnicodeMatcher*, with the constness of 'this' cast away.
 */
UnicodeMatcher* UnicodeFilter::toMatcher() const {
    const UnicodeMatcher* self = this;
    return (UnicodeMatcher*) self;
}
/**
 * Default implementation of UnicodeMatcher::matches() for Unicode
 * filters. Tests the single code point at offset (one or two 16-bit
 * code units) against contains().
 *
 * offset < limit: forward match; on success offset moves past the code point.
 * offset > limit: backward match; on success offset moves back by one code
 *                 point.
 * offset == limit: U_PARTIAL_MATCH if incremental, else U_MISMATCH.
 */
UMatchDegree UnicodeFilter::matches(const Replaceable& text,
                                    int32_t& offset,
                                    int32_t limit,
                                    UBool incremental) {
    if (offset < limit) {
        // forward direction
        UChar32 c = text.char32At(offset);
        if (!contains(c)) {
            return U_MISMATCH;
        }
        offset += UTF_CHAR_LENGTH(c);
        return U_MATCH;
    }

    if (offset > limit) {
        // backward direction
        if (!contains(text.char32At(offset))) {
            return U_MISMATCH;
        }
        // Back up by 1, unless the preceding character is a surrogate
        // pair -- then back up by 2 so that offset stays on the lead
        // surrogate.
        --offset;
        if (offset >= 0) {
            offset -= UTF_CHAR_LENGTH(text.char32At(offset)) - 1;
        }
        return U_MATCH;
    }

    // offset == limit: nothing left to look at
    return incremental ? U_PARTIAL_MATCH : U_MISMATCH;
}
U_NAMESPACE_END
//eof
--- NEW FILE: unifunct.cpp ---
/*
**********************************************************************
* Copyright (c) 2002, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* $Source: /usr/local/cvsroot/icu-sword/source/common/unifunct.cpp,v $
* $Date: 2003/09/10 02:42:03 $
* $Revision: 1.1 $
**********************************************************************
*/
#include "unicode/unifunct.h"
U_NAMESPACE_BEGIN
// Out-of-class definition of the static member UnicodeFunctor::fgClassID.
// NOTE(review): presumably only its address is used to identify the class -- confirm against unicode/unifunct.h.
const char UnicodeFunctor::fgClassID = 0;
// Base-class default: always returns a null pointer.
UnicodeMatcher* UnicodeFunctor::toMatcher() const {
    UnicodeMatcher* const noMatcher = 0;
    return noMatcher;
}
// Base-class default: always returns a null pointer.
UnicodeReplacer* UnicodeFunctor::toReplacer() const {
    UnicodeReplacer* const noReplacer = 0;
    return noReplacer;
}
U_NAMESPACE_END
//eof
--- NEW FILE: uniset.cpp ---
/*
**********************************************************************
* Copyright (C) 1999-2003, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
* 10/20/99 alan Creation.
**********************************************************************
*/
#include "unicode/uniset.h"
#include "unicode/parsepos.h"
#include "unicode/uchar.h"
#include "unicode/uscript.h"
#include "symtable.h"
#include "cmemory.h"
#include "uhash.h"
#include "util.h"
#include "uvector.h"
[...3625 lines suppressed...]
if (CASE_EQUIV_CBA == NULL) {
CASE_EQUIV_CBA = cba;
cba = NULL;
}
umtx_unlock(NULL);
if (cba != NULL) {
ucmp8_close(cba);
}
}
if (CASE_EQUIV_CBA != NULL) {
int32_t index = ucmp8_getu(CASE_EQUIV_CBA, folded);
if (index != 255) {
return &CASE_NONPAIRS[index];
}
}
return NULL;
}
U_NAMESPACE_END
--- NEW FILE: unorm_it.c ---
/*
*******************************************************************************
*
* Copyright (C) 2003, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: unorm_it.c
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2003jan21
* created by: Markus W. Scherer
*/
#include "unicode/utypes.h"
#if !UCONFIG_NO_COLLATION && !UCONFIG_NO_NORMALIZATION
#include "unicode/uiter.h"
#include "unicode/unorm.h"
#include "unorm_it.h"
#include "cmemory.h"
/* UNormIterator ------------------------------------------------------------ */
/*
 * Number of UChars in the buffers embedded in UNormIterator
 * (charsBuffer[]; statesBuffer[] has one more entry).
 * unorm_openIter() uses it as the initial uni->capacity.
 */
enum {
    INITIAL_CAPACITY=100
};
/*
 * State of a normalizing iterator: the UCharIterator interface handed to
 * callers, the wrapped source iterator, and the buffered normalized text
 * with the matching source-iterator states.
 */
struct UNormIterator {
    /*
     * The interface returned by unorm_setIter().
     * Must stay the first member: the iterator functions in this file cast
     * the UCharIterator * they receive back to UNormIterator *.
     */
    UCharIterator api;
    /* the wrapped source iterator, set by unorm_setIter(); NULL after unorm_openIter() */
    UCharIterator *iter;
    /*
     * chars and states either use the static buffers
     * or are allocated in the same memory block
     *
     * They are parallel arrays with states[] holding the getState() values
     * from normalization boundaries, and UITER_NO_STATE in between.
     */
    UChar *chars;
    uint32_t *states;
    /*
     * api.start: first valid character & state in the arrays
     * api.index: current position
     * api.limit: one past the last valid character in chars[], but states[limit] is valid
     * capacity: length of allocated arrays
     */
    int32_t capacity;
    /* the current iter->getState(), saved to avoid unnecessary setState() calls; may not correspond to api->index! */
    uint32_t state;
    /*
     * hasPrevious, hasNext: there are UChars available before start or after limit?
     * isStackAllocated: set by unorm_openIter() when the struct lives in
     * caller-provided memory; unorm_closeIter() then does not free it
     */
    UBool hasPrevious, hasNext, isStackAllocated;
    /* normalization mode passed to unorm_next()/unorm_previous() */
    UNormalizationMode mode;
    /* initial storage for chars[] and states[] */
    UChar charsBuffer[INITIAL_CAPACITY];
    uint32_t statesBuffer[INITIAL_CAPACITY+1]; /* one more than charsBuffer[]! */
};
/*
 * Empty the buffered arrays and choose where new contents will grow from,
 * depending on where the source iterator currently stands:
 * at the text start -> index 0, at the text end -> index capacity,
 * otherwise -> the middle of the arrays.
 * Also records in uni whether there is source text before/after.
 */
static void
initIndexes(UNormIterator *uni, UCharIterator *iter) {
    /* do not pass api so that the compiler knows it's an alias pointer to uni itself */
    UCharIterator *api=&uni->api;
    int32_t position;

    if(!iter->hasPrevious(iter)) {
        /* source is at its start: fill the arrays from the beginning */
        position=0;
        uni->hasPrevious=FALSE;
        uni->hasNext=iter->hasNext(iter);
    } else if(!iter->hasNext(iter)) {
        /* source is at its end: fill the arrays from the end */
        position=uni->capacity;
        uni->hasNext=FALSE;
        uni->hasPrevious=iter->hasPrevious(iter);
    } else {
        /* source is somewhere inside the text: start in the middle */
        position=uni->capacity/2;
        uni->hasPrevious=uni->hasNext=TRUE;
    }

    api->start=api->index=api->limit=position;
}
/*
 * Replace the chars[]/states[] arrays with newly allocated ones that hold
 * `capacity` UChars and `capacity+1` states, in a single memory block with
 * states[] first and chars[] right behind it.
 *
 * The valid contents [api->start, api->limit) plus states[api->limit] are
 * copied over. With addAtStart the contents are moved toward the end of the
 * new arrays by the amount of added capacity (and api->start/index/limit are
 * adjusted accordingly) so that the new room is in front; otherwise they keep
 * their indexes and the new room is at the end.
 *
 * @return TRUE on success; FALSE if the allocation failed, in which case
 *         uni is unchanged
 */
static UBool
reallocArrays(UNormIterator *uni, int32_t capacity, UBool addAtStart) {
    /* do not pass api so that the compiler knows it's an alias pointer to uni itself */
    UCharIterator *api=&uni->api;

    uint32_t *states;
    UChar *chars;
    int32_t start, limit, delta;

    states=(uint32_t *)uprv_malloc((capacity+1)*sizeof(uint32_t)+capacity*sizeof(UChar));
    if(states==NULL) {
        return FALSE;
    }
    chars=(UChar *)(states+(capacity+1));

    start=api->start;
    limit=api->limit;

    /*
     * The shift must be computed from the OLD capacity, so uni->capacity
     * is only updated at the end of this function.
     */
    delta= addAtStart ? capacity-uni->capacity : 0;

    /* states[] has one more valid entry (states[limit]) than chars[] */
    uprv_memcpy(states+delta+start, uni->states+start, (limit-start+1)*sizeof(uint32_t));
    /* chars[] holds UChars: copy by sizeof(UChar), not by the size of a state */
    uprv_memcpy(chars+delta+start, uni->chars+start, (limit-start)*sizeof(UChar));

    if(addAtStart) {
        /* the old contents now sit at the end of the new arrays */
        api->start=start+delta;
        api->index+=delta;
        api->limit=limit+delta;
    }

    /*
     * Release a previously allocated block; chars and states share one
     * allocation that starts at states. The embedded buffers are not freed.
     */
    if(uni->states!=uni->statesBuffer) {
        uprv_free(uni->states);
    }

    uni->chars=chars;
    uni->states=states;
    uni->capacity=capacity;

    return TRUE;
}
/*
 * Move the buffered contents down to index 0 to make room at the end of
 * the arrays, dropping roughly the first `delta` entries.
 *
 * The first kept entry must have a known state because it becomes the new
 * api->start. If fewer than `delta` slots are unused in front of api->start
 * then contents are dropped, starting the kept part at the next entry with a
 * known state; otherwise all valid contents are kept.
 * On return api->start==0 and api->limit is the new end of the contents;
 * api->index is not touched.
 */
static void
moveContentsTowardStart(UCharIterator *api, UChar chars[], uint32_t states[], int32_t delta) {
    int32_t srcIndex, destIndex, limit;

    limit=api->limit;
    srcIndex=delta;
    if(srcIndex>api->start) {
        /*
         * look for a position in the arrays with a known state;
         * the states are in states[], chars[] never holds UITER_NO_STATE
         */
        while(srcIndex<limit && states[srcIndex]==UITER_NO_STATE) {
            ++srcIndex;
        }
    } else {
        /*
         * Entries below api->start are not valid contents: never copy them.
         * Moving from api->start frees at least the requested room.
         */
        srcIndex=api->start;
    }

    /* now actually move the array contents */
    api->start=destIndex=0;
    while(srcIndex<limit) {
        chars[destIndex]=chars[srcIndex];
        states[destIndex++]=states[srcIndex++];
    }

    /* copy states[limit] as well! */
    states[destIndex]=states[srcIndex];

    api->limit=destIndex;
}
/*
 * Move the buffered contents up to the end of the arrays to make room at
 * the start, dropping roughly the last `delta` entries.
 *
 * The state at the new limit must be known. If fewer than `delta` slots are
 * unused behind api->limit then contents are dropped, ending the kept part at
 * the previous entry with a known state; otherwise all valid contents are
 * kept.
 * On return api->limit==capacity and api->start is the new beginning of the
 * contents; api->index is not touched.
 *
 * api must be the api member of a UNormIterator (it is cast back to read
 * the capacity).
 */
static void
moveContentsTowardEnd(UCharIterator *api, UChar chars[], uint32_t states[], int32_t delta) {
    int32_t srcIndex, destIndex, start;

    start=api->start;
    destIndex=((UNormIterator *)api)->capacity;
    srcIndex=destIndex-delta;
    if(srcIndex<api->limit) {
        /*
         * look for a position in the arrays with a known state;
         * the states are in states[], chars[] never holds UITER_NO_STATE
         */
        while(srcIndex>start && states[srcIndex]==UITER_NO_STATE) {
            --srcIndex;
        }
    } else {
        /*
         * Entries above api->limit are not valid contents: never copy them.
         * Moving from api->limit frees at least the requested room.
         */
        srcIndex=api->limit;
    }

    /* now actually move the array contents */
    api->limit=destIndex;

    /* copy states[limit] as well! */
    states[destIndex]=states[srcIndex];

    while(srcIndex>start) {
        chars[--destIndex]=chars[--srcIndex];
        states[destIndex]=states[srcIndex];
    }

    api->start=destIndex;
}
/*
 * Normalize the next piece of source text and append it to the buffered
 * arrays at api->limit. Assumes that uni->hasNext is true.
 *
 * Returns TRUE if at least one normalized UChar was appended.
 * Returns FALSE, after setting uni->state=UITER_NO_STATE and
 * uni->hasNext=FALSE, if the source iterator state could not be restored,
 * the arrays could not be enlarged, or unorm_next() failed or delivered
 * nothing.
 */
static UBool
readNext(UNormIterator *uni, UCharIterator *iter) {
    /* do not pass api so that the compiler knows it's an alias pointer to uni itself */
    UCharIterator *api=&uni->api;
    /* make capacity/4 room at the end of the arrays */
    int32_t limit, capacity, room, delta;
    UErrorCode errorCode;
    limit=api->limit;
    capacity=uni->capacity;
    room=capacity/4;
    /* delta>0: fewer than capacity/4 free slots remain behind limit */
    delta=room-(capacity-limit);
    if(delta>0) {
        /* move array contents to make room */
        moveContentsTowardStart(api, uni->chars, uni->states, delta);
        /* the move changed api->limit; the current position is set to the new limit */
        api->index=limit=api->limit;
        uni->hasPrevious=TRUE;
    }
    /* normalize starting from the limit position */
    errorCode=U_ZERO_ERROR;
    /* reposition the source iterator only if it is not already at the state stored for limit */
    if(uni->state!=uni->states[limit]) {
        uiter_setState(iter, uni->states[limit], &errorCode);
        if(U_FAILURE(errorCode)) {
            uni->state=UITER_NO_STATE;
            uni->hasNext=FALSE;
            return FALSE;
        }
    }
    /* from here on, room is the length reported by unorm_next() */
    room=unorm_next(iter, uni->chars+limit, capacity-limit, uni->mode, 0, TRUE, NULL, &errorCode);
    if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
        if(room<=capacity) {
            /* empty and re-use the arrays */
            uni->states[0]=uni->states[limit];
            api->start=api->index=api->limit=limit=0;
            uni->hasPrevious=TRUE;
        } else {
            /* the result does not fit even into empty arrays: get larger ones */
            capacity+=room+100;
            if(!reallocArrays(uni, capacity, FALSE)) {
                uni->state=UITER_NO_STATE;
                uni->hasNext=FALSE;
                return FALSE;
            }
            limit=api->limit;
        }
        /* go back to where this piece started and normalize it again */
        errorCode=U_ZERO_ERROR;
        uiter_setState(iter, uni->states[limit], &errorCode);
        room=unorm_next(iter, uni->chars+limit, capacity-limit, uni->mode, 0, TRUE, NULL, &errorCode);
    }
    if(U_FAILURE(errorCode) || room==0) {
        uni->state=UITER_NO_STATE;
        uni->hasNext=FALSE;
        return FALSE;
    }
    /* room>0 */
    ++limit; /* leave the known states[limit] alone */
    for(--room; room>0; --room) {
        /* set unknown states for all but the normalization boundaries */
        uni->states[limit++]=UITER_NO_STATE;
    }
    /* the state behind the new text is known: it is where the source iterator stands now */
    uni->states[limit]=uni->state=uiter_getState(iter);
    uni->hasNext=iter->hasNext(iter);
    api->limit=limit;
    return TRUE;
}
/*
 * Normalize the preceding piece of source text and prepend it to the
 * buffered arrays before api->start. Assumes that uni->hasPrevious is true.
 *
 * Returns TRUE if at least one normalized UChar was prepended.
 * Returns FALSE, after setting uni->state=UITER_NO_STATE and
 * uni->hasPrevious=FALSE, if the source iterator state could not be
 * restored, the arrays could not be enlarged, or unorm_previous() failed or
 * delivered nothing.
 */
static UBool
readPrevious(UNormIterator *uni, UCharIterator *iter) {
    /* do not pass api so that the compiler knows it's an alias pointer to uni itself */
    UCharIterator *api=&uni->api;
    /* make capacity/4 room at the start of the arrays */
    int32_t start, capacity, room, delta;
    UErrorCode errorCode;
    start=api->start;
    capacity=uni->capacity;
    room=capacity/4;
    /* delta>0: fewer than capacity/4 free slots remain in front of start */
    delta=room-start;
    if(delta>0) {
        /* move array contents to make room */
        moveContentsTowardEnd(api, uni->chars, uni->states, delta);
        /* the move changed api->start; the current position is set to the new start */
        api->index=start=api->start;
        uni->hasNext=TRUE;
    }
    /* normalize ending at the start position */
    errorCode=U_ZERO_ERROR;
    /* reposition the source iterator only if it is not already at the state stored for start */
    if(uni->state!=uni->states[start]) {
        uiter_setState(iter, uni->states[start], &errorCode);
        if(U_FAILURE(errorCode)) {
            uni->state=UITER_NO_STATE;
            uni->hasPrevious=FALSE;
            return FALSE;
        }
    }
    /*
     * The output goes to the unused front part chars[0..start[ and is moved
     * up against start further below.
     * From here on, room is the length reported by unorm_previous().
     */
    room=unorm_previous(iter, uni->chars, start, uni->mode, 0, TRUE, NULL, &errorCode);
    if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
        if(room<=capacity) {
            /* empty and re-use the arrays */
            uni->states[capacity]=uni->states[start];
            api->start=api->index=api->limit=start=capacity;
            uni->hasNext=TRUE;
        } else {
            /* the result does not fit even into empty arrays: get larger ones */
            capacity+=room+100;
            if(!reallocArrays(uni, capacity, TRUE)) {
                uni->state=UITER_NO_STATE;
                uni->hasPrevious=FALSE;
                return FALSE;
            }
            start=api->start;
        }
        /* go back to where this piece ended and normalize it again */
        errorCode=U_ZERO_ERROR;
        uiter_setState(iter, uni->states[start], &errorCode);
        room=unorm_previous(iter, uni->chars, start, uni->mode, 0, TRUE, NULL, &errorCode);
    }
    if(U_FAILURE(errorCode) || room==0) {
        uni->state=UITER_NO_STATE;
        uni->hasPrevious=FALSE;
        return FALSE;
    }
    /* room>0 */
    do {
        /* copy the UChars from chars[0..room[ to chars[(start-room)..start[ */
        uni->chars[--start]=uni->chars[--room];
        /* set unknown states for all but the normalization boundaries */
        uni->states[start]=UITER_NO_STATE;
    } while(room>0);
    /* the state at the new start is known: it is where the source iterator stands now */
    uni->states[start]=uni->state=uiter_getState(iter);
    uni->hasPrevious=iter->hasPrevious(iter);
    api->start=start;
    return TRUE;
}
/* Iterator runtime API functions ------------------------------------------- */
/*
 * getIndex() implementation: only the start-related origins have a known
 * index (0); the others are reported as UITER_UNKNOWN_INDEX.
 * Returns -1 for an invalid origin. The api argument is not used.
 */
static int32_t U_CALLCONV
unormIteratorGetIndex(UCharIterator *api, UCharIteratorOrigin origin) {
    if(origin==UITER_ZERO || origin==UITER_START) {
        return 0;
    }
    if(origin==UITER_CURRENT || origin==UITER_LIMIT || origin==UITER_LENGTH) {
        return UITER_UNKNOWN_INDEX;
    }
    /* not a valid origin, should never get here */
    return -1;
}
/*
 * move() implementation: first go to the origin, then move by delta
 * normalized UChars, normalizing more source text as needed.
 * Stops early at the end or start of the text.
 *
 * Returns 0 if the iterator ends up at the very beginning of the text,
 * UITER_UNKNOWN_INDEX for any other position, and -1 for an invalid origin.
 */
static int32_t U_CALLCONV
unormIteratorMove(UCharIterator *api, int32_t delta, UCharIteratorOrigin origin) {
    UNormIterator *uni=(UNormIterator *)api;
    UCharIterator *iter=uni->iter;
    int32_t pos;
    switch(origin) {
    case UITER_ZERO:
    case UITER_START:
        /* restart from the beginning */
        if(uni->hasPrevious) {
            /* the buffer does not begin at the text start: discard it and rewind the source */
            iter->move(iter, 0, UITER_START);
            api->start=api->index=api->limit=0;
            uni->states[api->limit]=uni->state=uiter_getState(iter);
            uni->hasPrevious=FALSE;
            uni->hasNext=iter->hasNext(iter);
        } else {
            /* we already have the beginning of the normalized text */
            api->index=api->start;
        }
        break;
    case UITER_CURRENT:
        /* stay where we are */
        break;
    case UITER_LIMIT:
    case UITER_LENGTH:
        /* restart from the end */
        if(uni->hasNext) {
            /* the buffer does not reach the text end: discard it and move the source to its end */
            iter->move(iter, 0, UITER_LIMIT);
            api->start=api->index=api->limit=uni->capacity;
            uni->states[api->limit]=uni->state=uiter_getState(iter);
            uni->hasPrevious=iter->hasPrevious(iter);
            uni->hasNext=FALSE;
        } else {
            /* we already have the end of the normalized text */
            api->index=api->limit;
        }
        break;
    default:
        return -1; /* Error */
    }
    /* move relative to the current position by delta normalized UChars */
    if(delta==0) {
        /* nothing to do */
    } else if(delta>0) {
        /* go forward until the requested position is in the buffer */
        for(;;) {
            pos=api->index+delta; /* requested position */
            delta=pos-api->limit; /* remainder beyond buffered text */
            if(delta<=0) {
                api->index=pos; /* position reached */
                break;
            }
            /* go to end of buffer and normalize further */
            api->index=api->limit;
            if(!uni->hasNext || !readNext(uni, iter)) {
                break; /* reached end of text */
            }
        }
    } else /* delta<0 */ {
        /* go backward until the requested position is in the buffer */
        for(;;) {
            pos=api->index+delta; /* requested position */
            delta=pos-api->start; /* remainder beyond buffered text */
            if(delta>=0) {
                api->index=pos; /* position reached */
                break;
            }
            /* go to start of buffer and normalize further */
            api->index=api->start;
            if(!uni->hasPrevious || !readPrevious(uni, iter)) {
                break; /* reached start of text */
            }
        }
    }
    /* the index is only known when we are at the very beginning of the text */
    if(api->index==api->start && !uni->hasPrevious) {
        return 0;
    } else {
        return UITER_UNKNOWN_INDEX;
    }
}
/* hasNext(): TRUE if there is buffered text after the index or more source text to normalize */
static UBool U_CALLCONV
unormIteratorHasNext(UCharIterator *api) {
    if(api->index<api->limit) {
        return TRUE;
    }
    return ((UNormIterator *)api)->hasNext ? TRUE : FALSE;
}
/* hasPrevious(): TRUE if there is buffered text before the index or more source text to normalize */
static UBool U_CALLCONV
unormIteratorHasPrevious(UCharIterator *api) {
    if(api->index>api->start) {
        return TRUE;
    }
    return ((UNormIterator *)api)->hasPrevious ? TRUE : FALSE;
}
/*
 * current(): return the normalized UChar at the index without moving,
 * normalizing more source text first if the index is at the buffer limit.
 * Returns U_SENTINEL if there is no further text.
 */
static UChar32 U_CALLCONV
unormIteratorCurrent(UCharIterator *api) {
    UNormIterator *uni=(UNormIterator *)api;

    if(api->index>=api->limit) {
        /* nothing buffered at the index: try to read more */
        if(!uni->hasNext || !readNext(uni, uni->iter)) {
            return U_SENTINEL;
        }
    }
    return uni->chars[api->index];
}
/*
 * next(): return the normalized UChar at the index and advance the index,
 * normalizing more source text first if the index is at the buffer limit.
 * Returns U_SENTINEL if there is no further text.
 */
static UChar32 U_CALLCONV
unormIteratorNext(UCharIterator *api) {
    UNormIterator *uni=(UNormIterator *)api;

    if(api->index>=api->limit) {
        /* nothing buffered at the index: try to read more */
        if(!uni->hasNext || !readNext(uni, uni->iter)) {
            return U_SENTINEL;
        }
    }
    return uni->chars[api->index++];
}
/*
 * previous(): move the index back by one and return the normalized UChar
 * there, normalizing more source text first if the index is at the buffer
 * start. Returns U_SENTINEL if there is no preceding text.
 */
static UChar32 U_CALLCONV
unormIteratorPrevious(UCharIterator *api) {
    UNormIterator *uni=(UNormIterator *)api;

    if(api->index<=api->start) {
        /* nothing buffered before the index: try to read more */
        if(!uni->hasPrevious || !readPrevious(uni, uni->iter)) {
            return U_SENTINEL;
        }
    }
    return uni->chars[--api->index];
}
/*
 * getState(): return the state stored for the current index.
 * This is states[index] rather than uni->state because the latter
 * may belong to a different position.
 */
static uint32_t U_CALLCONV
unormIteratorGetState(const UCharIterator *api) {
    const UNormIterator *uni=(const UNormIterator *)api;
    return uni->states[api->index];
}
/*
 * setState() implementation: reposition the normalizing iterator at a state
 * that was returned earlier by getState().
 *
 * Does nothing if pErrorCode is NULL or already indicates a failure.
 * Sets U_ILLEGAL_ARGUMENT_ERROR for a NULL api and
 * U_INDEX_OUTOFBOUNDS_ERROR for UITER_NO_STATE.
 * If the state is found in the buffered states[] then only the index is
 * moved; otherwise the buffer is emptied and re-initialized at that state.
 */
static void U_CALLCONV
unormIteratorSetState(UCharIterator *api, uint32_t state, UErrorCode *pErrorCode) {
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        /* do nothing */
    } else if(api==NULL) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    } else if(state==UITER_NO_STATE) {
        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
    } else {
        UNormIterator *uni=(UNormIterator *)api;
        UCharIterator *iter=((UNormIterator *)api)->iter;
        /* move the source iterator only if it is not known to be at this state already */
        if(state!=uni->state) {
            uni->state=state;
            uiter_setState(iter, state, pErrorCode);
        }
        /*
         * Try shortcuts: If the requested state is in the array contents
         * then just set the index there.
         *
         * We assume that the state is unique per position!
         */
        if(state==uni->states[api->index]) {
            return;
        } else if(state==uni->states[api->limit]) {
            api->index=api->limit;
            return;
        } else {
            /* search for the index with this state */
            int32_t i;
            for(i=api->start; i<api->limit; ++i) {
                if(state==uni->states[i]) {
                    api->index=i;
                    return;
                }
            }
        }
        /* there is no array index for this state, reset for fresh contents */
        initIndexes((UNormIterator *)api, iter);
        /* the (empty) buffer now starts and ends at the requested state */
        uni->states[api->limit]=state;
    }
}
/*
 * Template for the api member of a UNormIterator:
 * unorm_setIter() copies it into uni->api with uprv_memcpy().
 */
static const UCharIterator unormIterator={
    /* leading data members, all cleared here; NOTE(review): confirm their names and order against unicode/uiter.h */
    NULL, 0, 0, 0, 0, 0,
    unormIteratorGetIndex,
    unormIteratorMove,
    unormIteratorHasNext,
    unormIteratorHasPrevious,
    unormIteratorCurrent,
    unormIteratorNext,
    unormIteratorPrevious,
    NULL, /* no function for this slot; NOTE(review): presumably a reserved function pointer -- confirm against unicode/uiter.h */
    unormIteratorGetState,
    unormIteratorSetState
};
/* Setup functions ---------------------------------------------------------- */
/*
 * Open a normalizing iterator.
 *
 * The UNormIterator is placed into the caller-provided memory if stackMem
 * is not NULL and, after aligning the pointer, at least
 * sizeof(UNormIterator) bytes remain; otherwise it is allocated with
 * uprv_malloc(). A negative stackMemSize is treated like "too small".
 *
 * The returned iterator uses its embedded buffers, has no source iterator
 * and carries a no-op api until unorm_setIter() is called.
 *
 * @param stackMem     caller-provided memory, can be NULL
 * @param stackMemSize number of bytes at stackMem
 * @param pErrorCode   ICU error code; set to U_MEMORY_ALLOCATION_ERROR if
 *                     the heap allocation fails
 * @return the new iterator, or NULL if pErrorCode is NULL or indicates a
 *         failure on entry, or if the allocation failed
 */
U_CAPI UNormIterator * U_EXPORT2
unorm_openIter(void *stackMem, int32_t stackMemSize, UErrorCode *pErrorCode) {
    UNormIterator *uni;

    /* argument checking */
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return NULL;
    }

    /* allocate */
    uni=NULL;
    /*
     * Compare as signed values: comparing the int32_t size with the unsigned
     * sizeof result directly would make a negative size look huge.
     */
    if(stackMem!=NULL && stackMemSize>=(int32_t)sizeof(UNormIterator)) {
        if(U_ALIGNMENT_OFFSET(stackMem)==0) {
            /* already aligned */
            uni=(UNormIterator *)stackMem;
        } else {
            /*
             * needs alignment: skip forward to the next aligned address;
             * adding the offset from the previous boundary would not
             * result in an aligned pointer
             */
            int32_t align=(int32_t)U_ALIGNMENT_OFFSET_UP(stackMem);
            if((stackMemSize-align)>=(int32_t)sizeof(UNormIterator)) {
                uni=(UNormIterator *)((char *)stackMem+align);
            }
            /* else does not fit */
        }
    }

    if(uni!=NULL) {
        uni->isStackAllocated=TRUE;
    } else {
        uni=(UNormIterator *)uprv_malloc(sizeof(UNormIterator));
        if(uni==NULL) {
            *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
            return NULL;
        }
        uni->isStackAllocated=FALSE;
    }

    /*
     * initialize
     * do not memset because that would unnecessarily initialize the arrays
     */
    uni->iter=NULL;
    uni->chars=uni->charsBuffer;
    uni->states=uni->statesBuffer;
    uni->capacity=INITIAL_CAPACITY;
    uni->state=UITER_NO_STATE;
    uni->hasPrevious=uni->hasNext=FALSE;
    uni->mode=UNORM_NONE;

    /* set a no-op iterator into the api */
    uiter_setString(&uni->api, NULL, 0);
    return uni;
}
/*
 * Close a normalizing iterator: release enlarged arrays and, unless the
 * iterator lives in caller-provided memory, the iterator itself.
 * A NULL argument is ignored.
 */
U_CAPI void U_EXPORT2
unorm_closeIter(UNormIterator *uni) {
    if(uni==NULL) {
        return;
    }

    /* chars and states are allocated in the same memory block */
    if(uni->states!=uni->statesBuffer) {
        uprv_free(uni->states);
    }

    if(!uni->isStackAllocated) {
        uprv_free(uni);
    }
}
/*
 * Attach a source iterator and a normalization mode to the normalizing
 * iterator and return its UCharIterator interface.
 *
 * Returns NULL if pErrorCode is NULL or indicates a failure on entry.
 * Sets U_ILLEGAL_ARGUMENT_ERROR and returns NULL if uni is NULL, or if iter
 * is NULL, lacks getState()/setState(), or mode is out of range; in the
 * latter cases a no-op iterator is set into the api first.
 */
U_CAPI UCharIterator * U_EXPORT2
unorm_setIter(UNormIterator *uni, UCharIterator *iter, UNormalizationMode mode, UErrorCode *pErrorCode) {
    /* argument checking */
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return NULL;
    }
    if(uni==NULL) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }

    if( iter!=NULL && iter->getState!=NULL && iter->setState!=NULL &&
        mode>=UNORM_NONE && mode<UNORM_MODE_COUNT
    ) {
        /* set the iterator and initialize */
        uprv_memcpy(&uni->api, &unormIterator, sizeof(unormIterator));
        uni->iter=iter;
        uni->mode=mode;

        initIndexes(uni, iter);
        uni->states[uni->api.limit]=uni->state=uiter_getState(iter);

        return &uni->api;
    }

    /* unusable source iterator or mode: set a no-op iterator into the api */
    uiter_setString(&uni->api, NULL, 0);
    *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    return NULL;
}
#endif /* uconfig.h switches */
--- NEW FILE: unorm_it.h ---
/*
*******************************************************************************
*
* Copyright (C) 2003, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: unorm_it.h
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2003jan21
* created by: Markus W. Scherer
*/
#ifndef __UNORM_IT_H__
#define __UNORM_IT_H__
#include "unicode/utypes.h"
#if !UCONFIG_NO_COLLATION && !UCONFIG_NO_NORMALIZATION
#include "unicode/uiter.h"
#include "unicode/unorm.h"
/**
* Normalizing UCharIterator wrapper.
* This internal API basically duplicates the functionality of the C++ Normalizer
* but
* - it actually implements a character iterator (UCharIterator)
* with few restrictions (see unorm_setIter())
* - it supports UCharIterator getState()/setState()
* - it uses lower-level APIs and buffers more text and states,
* hopefully resulting in higher performance
*
* Usage example:
* \code
* function(UCharIterator *srcIter) {
* UNormIterator *uni;
* UCharIterator *iter;
* UErrorCode errorCode;
*
* errorCode=U_ZERO_ERROR;
* uni=unorm_openIter(NULL, 0, &errorCode);
* if(U_FAILURE(errorCode)) {
* // report error
* return;
* }
*
* iter=unorm_setIter(uni, srcIter, UNORM_FCD, &errorCode);
* if(U_FAILURE(errorCode)) {
* // report error
* } else {
* // use iter to iterate over the canonically ordered
* // version of srcIter's text
* uint32_t state;
*
* ...
*
* state=uiter_getState(iter);
* if(state!=UITER_NO_STATE) {
* // use valid state, store it, use iter some more
* ...
*
* // later restore iter to the saved state:
* uiter_setState(iter, state, &errorCode);
*
* ...
* }
*
* ...
* }
* unorm_closeIter(uni);
* }
* \endcode
*
* See also the ICU test suites.
*
* @internal
*/
struct UNormIterator;
typedef struct UNormIterator UNormIterator;
/**
* Size of a stack buffer to hold a UNormIterator, see the stackMem parameter
* of unorm_openIter().
*
* @internal
*/
#define UNORM_ITER_SIZE 1024
/**
 * Open a normalizing iterator. Must be closed later.
 * Use unorm_setIter().
 *
 * If stackMem is NULL or does not provide enough room for a UNormIterator
 * then the iterator is allocated on the heap instead.
 *
 * @param stackMem Pointer to preallocated (stack-allocated) buffer to hold
 * the UNormIterator if possible; can be NULL.
 * @param stackMemSize Number of bytes at stackMem; can be 0,
 * or should be >= UNORM_ITER_SIZE for a non-NULL stackMem.
 * @param pErrorCode ICU error code; set to U_MEMORY_ALLOCATION_ERROR
 * if a necessary heap allocation fails.
 * @return an allocated and pre-initialized UNormIterator,
 * or NULL if pErrorCode is NULL or indicates a failure
 * @internal
 */
U_CAPI UNormIterator * U_EXPORT2
unorm_openIter(void *stackMem, int32_t stackMemSize, UErrorCode *pErrorCode);
/**
 * Close a normalizing iterator.
 * Releases the memory that the iterator allocated itself; memory that was
 * passed to unorm_openIter() as stackMem is not freed.
 *
 * @param uni UNormIterator from unorm_openIter(); NULL is ignored
 * @internal
 */
U_CAPI void U_EXPORT2
unorm_closeIter(UNormIterator *uni);
/**
 * Set a UCharIterator and a normalization mode for the normalizing iterator
 * to wrap. The normalizing iterator will read from the character iterator,
 * normalize the text, and in turn deliver it with its own wrapper UCharIterator
 * interface which it returns.
 *
 * The source iterator remains at its current position through the unorm_setIter()
 * call but will be used and moved as soon as
 * the returned normalizing iterator is.
 *
 * The returned interface pointer is valid for as long as the normalizing iterator
 * is open and until another unorm_setIter() call is made on it.
 *
 * The normalizing iterator's UCharIterator interface has the following properties:
 * - getIndex() and move() will almost always return UITER_UNKNOWN_INDEX
 * - getState() will return UITER_NO_STATE for unknown states for positions
 * that are not at normalization boundaries
 *
 * @param uni UNormIterator from unorm_openIter()
 * @param iter The source text UCharIterator to be wrapped. It is aliased into the normalizing iterator.
 * Must support getState() and setState().
 * @param mode The normalization mode.
 * @param pErrorCode ICU error code; set to U_ILLEGAL_ARGUMENT_ERROR if uni is NULL,
 * if iter is NULL or lacks getState()/setState(), or if mode is out of range.
 * @return an alias to the normalizing iterator's UCharIterator interface,
 * or NULL on failure
 * @internal
 */
U_CAPI UCharIterator * U_EXPORT2
unorm_setIter(UNormIterator *uni, UCharIterator *iter, UNormalizationMode mode, UErrorCode *pErrorCode);
#endif /* uconfig.h switches */
#endif
--- NEW FILE: uobject.cpp ---
/*
******************************************************************************
*
* Copyright (C) 2002, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
* file name: uobject.cpp
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2002jun26
* created by: Markus W. Scherer
*/
#include "unicode/uobject.h"
#if U_OVERRIDE_CXX_ALLOCATION
#include "cmemory.h"
U_NAMESPACE_BEGIN
/*
* Default implementation of UMemory::new/delete
* using uprv_malloc() and uprv_free().
*
* For testing, this is used together with a list of imported symbols to verify
* that ICU is not using the global ::new and ::delete operators.
*
* These operators can be implemented like this or any other appropriate way
* when customizing ICU for certain environments.
* Whenever ICU is customized in binary incompatible ways please be sure
* to use library name suffixes to distinguish such libraries from
* the standard build.
*
* Instead of just modifying these C++ new/delete operators, it is usually best
* to modify the uprv_malloc()/uprv_free()/uprv_realloc() functions in cmemory.c.
*
* Memory test on Windows/MSVC 6:
* The global operators new and delete look as follows:
* 04F 00000000 UNDEF notype () External | ??2@YAPAXI@Z (void * __cdecl operator new(unsigned int))
* 03F 00000000 UNDEF notype () External | ??3@YAXPAX@Z (void __cdecl operator delete(void *))
*
* These lines are from output generated by the MSVC 6 tool dumpbin with
* dumpbin /symbols *.obj
*
* ??2@YAPAXI@Z and ??3@YAXPAX@Z are the linker symbols in the .obj
* files and are imported from msvcrtd.dll (in a debug build).
*
* Make sure that with the UMemory operators new and delete defined these two symbols
* do not appear in the dumpbin /symbols output for the ICU libraries!
*
* If such a symbol appears in the output then look in the preceding lines in the output
* for which file and function calls the global new or delete operator,
* and replace with uprv_malloc/uprv_free.
*/
/* Class-specific scalar allocation: forwards the requested byte count to uprv_malloc() and returns its result unchanged. */
void *UMemory::operator new(size_t size) {
return uprv_malloc(size);
}
/*
 * Class-specific scalar deallocation.
 * A NULL pointer is ignored; anything else is handed to uprv_free().
 */
void UMemory::operator delete(void *p) {
    if(p==NULL) {
        return; /* nothing to release */
    }
    uprv_free(p);
}
/* Class-specific array allocation: forwards the requested byte count to uprv_malloc() and returns its result unchanged. */
void *UMemory::operator new[](size_t size) {
return uprv_malloc(size);
}
/*
 * Class-specific array deallocation.
 * A NULL pointer is ignored; anything else is handed to uprv_free().
 */
void UMemory::operator delete[](void *p) {
    if(p==NULL) {
        return; /* nothing to release */
    }
    uprv_free(p);
}
U_NAMESPACE_END
#endif
--- NEW FILE: uprops.c ---
/*
*******************************************************************************
*
* Copyright (C) 2002-2003, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: uprops.c
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2002feb24
* created by: Markus W. Scherer
*
* Implementations for mostly non-core Unicode character properties
* stored in uprops.icu.
*/
#include "unicode/utypes.h"
#include "unicode/uchar.h"
#include "unicode/uscript.h"
#include "cstring.h"
#include "unormimp.h"
#include "uprops.h"
/* Element count of a true array (not a pointer), converted to int32_t. */
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
/**
 * Tell whether a character is skipped by the loose property name comparison:
 * '-', '_' or one of the ASCII White_Space characters.
 * The terminating NUL is never a delimiter.
 */
static int32_t
isPropertyNameDelimiter(unsigned char c) {
    switch(c) {
    case '-':
    case '_':
    case ' ':
    case '\t':
    case '\n':
    case '\v':
    case '\f':
    case '\r':
        return 1;
    default:
        return 0;
    }
}

/**
 * Loose comparison of Unicode property names and property value names.
 * Property[Value]Aliases.txt say:
 * "With loose matching of property names, the case distinctions, whitespace,
 * and '_' are ignored."
 *
 * This implementation works on ASCII (char *) strings. It is almost identical
 * to ucnv_compareNames() but additionally skips ASCII White_Space characters
 * (U+0009..U+000d); '-' is skipped as well.
 *
 * @return 0 if the names match loosely, otherwise the difference between the
 *         first pair of lowercased characters that differ
 * @internal
 */
U_CAPI int32_t U_EXPORT2
uprv_comparePropertyNames(const char *name1, const char *name2) {
    unsigned char left, right;
    int32_t diff;

    for(;;) {
        /* Advance each string to its next significant character. */
        while(isPropertyNameDelimiter(left=(unsigned char)*name1)) {
            ++name1;
        }
        while(isPropertyNameDelimiter(right=(unsigned char)*name2)) {
            ++name2;
        }

        /* Both strings exhausted at the same time: they match. */
        if(left==0 && right==0) {
            return 0;
        }

        /* Only fold case when the raw characters differ. */
        if(left!=right) {
            diff=(int32_t)(unsigned char)uprv_tolower(left)-(int32_t)(unsigned char)uprv_tolower(right);
            if(diff!=0) {
                return diff;
            }
        }

        ++name1;
        ++name2;
    }
}
/* API functions ------------------------------------------------------------ */
/**
 * Write the "age" of a code point into versionArray.
 * The age is read from the top bits of properties vector word 0, where the
 * major and minor version numbers occupy one nibble each; the last two
 * version fields are always set to 0.
 * Does nothing if versionArray is NULL.
 */
U_CAPI void U_EXPORT2
u_charAge(UChar32 c, UVersionInfo versionArray) {
    uint32_t age;

    if(versionArray==NULL) {
        return;
    }

    age=u_getUnicodeProperties(c, 0)>>UPROPS_AGE_SHIFT;
    versionArray[0]=(uint8_t)(age>>4);  /* major: high nibble */
    versionArray[1]=(uint8_t)(age&0xf); /* minor: low nibble */
    versionArray[2]=0;
    versionArray[3]=0;
}
/**
 * Get the script code of a code point from properties vector word 0.
 *
 * @param c code point
 * @param pErrorCode in/out ICU error code; set to U_ILLEGAL_ARGUMENT_ERROR
 *        if c is outside 0..0x10ffff
 * @return the script code, or 0 if pErrorCode is NULL, already indicates a
 *         failure, or c is out of range
 */
U_CAPI UScriptCode U_EXPORT2
uscript_getScript(UChar32 c, UErrorCode *pErrorCode) {
    uint32_t word0;

    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return 0;
    }
    if(c<0 || c>0x10ffff) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    word0=u_getUnicodeProperties(c, 0);
    return (UScriptCode)(word0&UPROPS_SCRIPT_MASK);
}
/**
 * Get the block code of a code point: the UPROPS_BLOCK_MASK bit field of
 * properties vector word 0, shifted down by UPROPS_BLOCK_SHIFT.
 */
U_CAPI UBlockCode U_EXPORT2
ublock_getCode(UChar32 c) {
    uint32_t blockBits=u_getUnicodeProperties(c, 0)&UPROPS_BLOCK_MASK;
    return (UBlockCode)(blockBits>>UPROPS_BLOCK_SHIFT);
}
/*
 * Lookup table used by u_hasBinaryProperty(): for each binary UProperty,
 * which word to fetch with u_getUnicodeProperties() and which bit to test.
 */
static const struct {
int32_t column; /* column argument for u_getUnicodeProperties(); -1 selects the main properties word */
uint32_t mask; /* single-bit mask for the property within that word */
} binProps[]={
/*
 * column and mask values for binary properties from u_getUnicodeProperties().
 * Must be in order of corresponding UProperty,
 * and there must be exactly one entry per binary UProperty.
 */
{ 1, U_MASK(UPROPS_ALPHABETIC) },
{ 1, U_MASK(UPROPS_ASCII_HEX_DIGIT) },
{ 1, U_MASK(UPROPS_BIDI_CONTROL) },
{ -1, U_MASK(UPROPS_MIRROR_SHIFT) },
{ 1, U_MASK(UPROPS_DASH) },
{ 1, U_MASK(UPROPS_DEFAULT_IGNORABLE_CODE_POINT) },
{ 1, U_MASK(UPROPS_DEPRECATED) },
{ 1, U_MASK(UPROPS_DIACRITIC) },
{ 1, U_MASK(UPROPS_EXTENDER) },
{ 0, 0 }, /* UCHAR_FULL_COMPOSITION_EXCLUSION */ /* placeholder: handled separately in u_hasBinaryProperty() */
{ 1, U_MASK(UPROPS_GRAPHEME_BASE) },
{ 1, U_MASK(UPROPS_GRAPHEME_EXTEND) },
{ 1, U_MASK(UPROPS_GRAPHEME_LINK) },
{ 1, U_MASK(UPROPS_HEX_DIGIT) },
{ 1, U_MASK(UPROPS_HYPHEN) },
{ 1, U_MASK(UPROPS_ID_CONTINUE) },
{ 1, U_MASK(UPROPS_ID_START) },
{ 1, U_MASK(UPROPS_IDEOGRAPHIC) },
{ 1, U_MASK(UPROPS_IDS_BINARY_OPERATOR) },
{ 1, U_MASK(UPROPS_IDS_TRINARY_OPERATOR) },
{ 1, U_MASK(UPROPS_JOIN_CONTROL) },
{ 1, U_MASK(UPROPS_LOGICAL_ORDER_EXCEPTION) },
{ 1, U_MASK(UPROPS_LOWERCASE) },
{ 1, U_MASK(UPROPS_MATH) },
{ 1, U_MASK(UPROPS_NONCHARACTER_CODE_POINT) },
{ 1, U_MASK(UPROPS_QUOTATION_MARK) },
{ 1, U_MASK(UPROPS_RADICAL) },
{ 1, U_MASK(UPROPS_SOFT_DOTTED) },
{ 1, U_MASK(UPROPS_TERMINAL_PUNCTUATION) },
{ 1, U_MASK(UPROPS_UNIFIED_IDEOGRAPH) },
{ 1, U_MASK(UPROPS_UPPERCASE) },
{ 1, U_MASK(UPROPS_WHITE_SPACE) },
{ 1, U_MASK(UPROPS_XID_CONTINUE) },
{ 1, U_MASK(UPROPS_XID_START) },
{ -1, U_MASK(UPROPS_CASE_SENSITIVE_SHIFT) }
};
/**
 * Test whether a code point has a binary Unicode property.
 *
 * @param c code point; range-checked by the functions called from here
 * @param which binary property selector
 * @return FALSE if which is not a known binary property; otherwise whether
 *         the property bit is set for c. Full_Composition_Exclusion is
 *         answered by the normalization code, or is FALSE when normalization
 *         is configured out.
 */
U_CAPI UBool U_EXPORT2
u_hasBinaryProperty(UChar32 c, UProperty which) {
    uint32_t word;

    if(which<UCHAR_BINARY_START || which>=UCHAR_BINARY_LIMIT) {
        /* not a known binary property */
        return FALSE;
    }

    if(which==UCHAR_FULL_COMPOSITION_EXCLUSION) {
#if !UCONFIG_NO_NORMALIZATION
        return unorm_internalIsFullCompositionExclusion(c);
#else
        return FALSE;
#endif
    }

    /* systematic, directly stored properties: fetch the word, test the bit */
    word=u_getUnicodeProperties(c, binProps[which].column);
    return (word&binProps[which].mask)!=0;
}
/* Convenience wrapper: returns u_hasBinaryProperty(c, UCHAR_ALPHABETIC). */
U_CAPI UBool U_EXPORT2
u_isUAlphabetic(UChar32 c) {
return u_hasBinaryProperty(c, UCHAR_ALPHABETIC);
}
/* Convenience wrapper: returns u_hasBinaryProperty(c, UCHAR_LOWERCASE). */
U_CAPI UBool U_EXPORT2
u_isULowercase(UChar32 c) {
return u_hasBinaryProperty(c, UCHAR_LOWERCASE);
}
/* Convenience wrapper: returns u_hasBinaryProperty(c, UCHAR_UPPERCASE). */
U_CAPI UBool U_EXPORT2
u_isUUppercase(UChar32 c) {
return u_hasBinaryProperty(c, UCHAR_UPPERCASE);
}
/* Convenience wrapper: returns u_hasBinaryProperty(c, UCHAR_WHITE_SPACE). */
U_CAPI UBool U_EXPORT2
u_isUWhiteSpace(UChar32 c) {
return u_hasBinaryProperty(c, UCHAR_WHITE_SPACE);
}
/**
 * "White space" in the sense of ICU rule parsers: a character whose general
 * category is Cf (format) or which has the White_Space binary property.
 * The White_Space lookup is only performed when c is not a format character.
 */
U_CAPI UBool U_EXPORT2
uprv_isRuleWhiteSpace(UChar32 c) {
    const UBool isFormat=(UBool)(u_charType(c)==U_FORMAT_CHAR);

    return isFormat || u_hasBinaryProperty(c, UCHAR_WHITE_SPACE);
}
/*
 * NUL-terminated UTF-16 pattern passed to uset_openPattern() by
 * uprv_openRuleWhiteSpaceSet(); spelled as numeric code units.
 */
static const UChar _PATTERN[] = {
/* "[[:Cf:][:WSpace:]]" */
91, 91, 58, 67, 102, 58, 93, 91, 58, 87,
83, 112, 97, 99, 101, 58, 93, 93, 0
};
/**
 * Open a USet from the rule white space pattern "[[:Cf:][:WSpace:]]".
 * The length passed to uset_openPattern() excludes the pattern's
 * terminating NUL.
 *
 * @param ec ICU error code, forwarded to uset_openPattern()
 * @return the result of uset_openPattern()
 */
U_CAPI USet* U_EXPORT2
uprv_openRuleWhiteSpaceSet(UErrorCode* ec) {
    const int32_t patternLength=(int32_t)(sizeof(_PATTERN)/sizeof(_PATTERN[0]))-1;

    return uset_openPattern(_PATTERN, patternLength, ec);
}
/**
 * Get the value of a binary, enumerated/integer, or mask property for a
 * code point.
 *
 * - Binary properties (UCHAR_BINARY_START..UCHAR_BINARY_LIMIT-1) yield the
 *   result of u_hasBinaryProperty() as an integer.
 * - Integer properties (UCHAR_INT_START..UCHAR_INT_LIMIT-1) are read from the
 *   properties words or delegated to the matching API function.
 * - UCHAR_GENERAL_CATEGORY_MASK yields U_MASK(u_charType(c)).
 * - Any other selector yields 0.
 *
 * @param c code point
 * @param which property selector
 * @return the property value, or 0 if the property is undefined here
 */
U_CAPI int32_t U_EXPORT2
u_getIntPropertyValue(UChar32 c, UProperty which) {
UErrorCode errorCode;
if(which<UCHAR_BINARY_START) {
return 0; /* undefined */
} else if(which<UCHAR_BINARY_LIMIT) {
return (int32_t)u_hasBinaryProperty(c, which);
} else if(which<UCHAR_INT_START) {
return 0; /* undefined */
} else if(which<UCHAR_INT_LIMIT) {
switch(which) {
case UCHAR_BIDI_CLASS:
return (int32_t)u_charDirection(c);
case UCHAR_BLOCK:
return (int32_t)ublock_getCode(c);
case UCHAR_CANONICAL_COMBINING_CLASS:
#if !UCONFIG_NO_NORMALIZATION
return u_getCombiningClass(c);
#else
return 0;
#endif
case UCHAR_DECOMPOSITION_TYPE:
/* lowest bit field of vector word 2, so no shift is needed */
return (int32_t)(u_getUnicodeProperties(c, 2)&UPROPS_DT_MASK);
case UCHAR_EAST_ASIAN_WIDTH:
return (int32_t)(u_getUnicodeProperties(c, 0)&UPROPS_EA_MASK)>>UPROPS_EA_SHIFT;
case UCHAR_GENERAL_CATEGORY:
return (int32_t)u_charType(c);
case UCHAR_JOINING_GROUP:
return (int32_t)(u_getUnicodeProperties(c, 2)&UPROPS_JG_MASK)>>UPROPS_JG_SHIFT;
case UCHAR_JOINING_TYPE:
return (int32_t)(u_getUnicodeProperties(c, 2)&UPROPS_JT_MASK)>>UPROPS_JT_SHIFT;
case UCHAR_LINE_BREAK:
return (int32_t)(u_getUnicodeProperties(c, 0)&UPROPS_LB_MASK)>>UPROPS_LB_SHIFT;
case UCHAR_NUMERIC_TYPE:
/* column -1 selects the main properties word */
return (int32_t)GET_NUMERIC_TYPE(u_getUnicodeProperties(c, -1));
case UCHAR_SCRIPT:
/* uscript_getScript() requires an initialized error code; its outcome is not inspected here */
errorCode=U_ZERO_ERROR;
return (int32_t)uscript_getScript(c, &errorCode);
case UCHAR_HANGUL_SYLLABLE_TYPE:
/* purely algorithmic; hardcode known characters, check for assigned new ones */
if(c<JAMO_L_BASE) {
/* U_HST_NOT_APPLICABLE */
} else if(c<=0x11ff) {
/* Jamo range */
if(c<=0x115f) {
/* Jamo L range, HANGUL CHOSEONG ... */
if(c==0x115f || c<=0x1159 || u_charType(c)==U_OTHER_LETTER) {
return U_HST_LEADING_JAMO;
}
} else if(c<=0x11a7) {
/* Jamo V range, HANGUL JUNGSEONG ... */
if(c<=0x11a2 || u_charType(c)==U_OTHER_LETTER) {
return U_HST_VOWEL_JAMO;
}
} else {
/* Jamo T range */
if(c<=0x11f9 || u_charType(c)==U_OTHER_LETTER) {
return U_HST_TRAILING_JAMO;
}
}
/* note: from here on c is an offset from HANGUL_BASE, not a code point */
} else if((c-=HANGUL_BASE)<0) {
/* U_HST_NOT_APPLICABLE */
} else if(c<HANGUL_COUNT) {
/* Hangul syllable */
/* an offset that is a multiple of JAMO_T_COUNT is reported as LV, any other as LVT */
return c%JAMO_T_COUNT==0 ? U_HST_LV_SYLLABLE : U_HST_LVT_SYLLABLE;
}
/* every branch above that did not return falls through to here */
return U_HST_NOT_APPLICABLE;
default:
return 0; /* undefined */
}
} else if(which==UCHAR_GENERAL_CATEGORY_MASK) {
return U_MASK(u_charType(c));
} else {
return 0; /* undefined */
}
}
/*
 * Get the minimum value of a property.
 * The selector is not inspected: 0 is returned for every value of which.
 */
U_CAPI int32_t U_EXPORT2
u_getIntPropertyMinValue(UProperty which) {
return 0; /* all binary/enum/int properties have a minimum value of 0 */
}
/**
 * Get the maximum value of a binary or enumerated/integer property.
 *
 * For properties whose maximum is stored in the data (retrieved with
 * uprv_getMaxValues()), the stored maximum is used when it is non-zero;
 * otherwise the compiled-in enum count minus one is the fallback.
 *
 * @param which property selector
 * @return 1 for binary properties, the maximum for integer properties,
 *         or -1 if the property is undefined here
 */
U_CAPI int32_t U_EXPORT2
u_getIntPropertyMaxValue(UProperty which) {
    int32_t fromData; /* maximum recorded in the data file, 0 if none */
    int32_t fallback; /* compiled-in maximum used when fromData==0 */

    if(which<UCHAR_BINARY_START) {
        return -1; /* undefined */
    }
    if(which<UCHAR_BINARY_LIMIT) {
        return 1; /* maximum TRUE for all binary properties */
    }
    if(which<UCHAR_INT_START || which>=UCHAR_INT_LIMIT) {
        return -1; /* undefined */
    }

    switch(which) {
    /* properties with a maximum that is fixed at compile time */
    case UCHAR_BIDI_CLASS:
        return (int32_t)U_CHAR_DIRECTION_COUNT-1;
    case UCHAR_CANONICAL_COMBINING_CLASS:
        return 0xff; /* TODO do we need to be more precise, getting the actual maximum? */
    case UCHAR_GENERAL_CATEGORY:
        return (int32_t)U_CHAR_CATEGORY_COUNT-1;
    case UCHAR_NUMERIC_TYPE:
        return (int32_t)U_NT_COUNT-1;
    case UCHAR_HANGUL_SYLLABLE_TYPE:
        return (int32_t)U_HST_COUNT-1;

    /* properties with a maximum stored in the data, with a fallback */
    case UCHAR_BLOCK:
        fromData=(uprv_getMaxValues(0)&UPROPS_BLOCK_MASK)>>UPROPS_BLOCK_SHIFT;
        fallback=(int32_t)UBLOCK_COUNT-1;
        break;
    case UCHAR_DECOMPOSITION_TYPE:
        fromData=uprv_getMaxValues(2)&UPROPS_DT_MASK;
        fallback=(int32_t)U_DT_COUNT-1;
        break;
    case UCHAR_EAST_ASIAN_WIDTH:
        fromData=(uprv_getMaxValues(0)&UPROPS_EA_MASK)>>UPROPS_EA_SHIFT;
        fallback=(int32_t)U_EA_COUNT-1;
        break;
    case UCHAR_JOINING_GROUP:
        fromData=(uprv_getMaxValues(2)&UPROPS_JG_MASK)>>UPROPS_JG_SHIFT;
        fallback=(int32_t)U_JG_COUNT-1;
        break;
    case UCHAR_JOINING_TYPE:
        fromData=(uprv_getMaxValues(2)&UPROPS_JT_MASK)>>UPROPS_JT_SHIFT;
        fallback=(int32_t)U_JT_COUNT-1;
        break;
    case UCHAR_LINE_BREAK:
        fromData=(uprv_getMaxValues(0)&UPROPS_LB_MASK)>>UPROPS_LB_SHIFT;
        fallback=(int32_t)U_LB_COUNT-1;
        break;
    case UCHAR_SCRIPT:
        fromData=uprv_getMaxValues(0)&UPROPS_SCRIPT_MASK;
        fallback=(int32_t)USCRIPT_CODE_LIMIT-1;
        break;

    default:
        return -1; /* undefined */
    }

    return fromData!=0 ? fromData : fallback;
}
/*----------------------------------------------------------------
* Inclusions list
*----------------------------------------------------------------*/
/*
* Return a set of characters for property enumeration.
* The set implicitly contains 0x110000 as well, which is one more than the highest
* Unicode code point.
*
* This set is used as an ordered list - its code points are ordered, and
* consecutive code points (in Unicode code point order) in the set define a range.
* For each two consecutive characters (start, limit) in the set,
* all of the UCD/normalization and related properties for
* all code points start..limit-1 are all the same,
* except for character names and ISO comments.
*
* All Unicode code points U+0000..U+10ffff are covered by these ranges.
* The ranges define a partition of the Unicode code space.
* ICU uses the inclusions set to enumerate properties for generating
* UnicodeSets containing all code points that have a certain property value.
*
* The Inclusion List is generated from the UCD. It is generated
* by enumerating the data tries, and code points for hardcoded properties
* are added as well.
*
* --------------------------------------------------------------------------
*
* The following are ideas for getting properties-unique code point ranges,
* with possible optimizations beyond the current implementation.
* These optimizations would require more code and be more fragile.
* The current implementation generates one single list (set) for all properties.
*
* To enumerate properties efficiently, one needs to know ranges of
* repetitive values, so that the value of only each start code point
* can be applied to the whole range.
* This information is in principle available in the uprops.icu/unorm.icu data.
*
* There are two obstacles:
*
* 1. Some properties are computed from multiple data structures,
* making it necessary to get repetitive ranges by intersecting
* ranges from multiple tries.
*
* 2. It is not economical to write code for getting repetitive ranges
* that are precise for each of some 50 properties.
*
* Compromise ideas:
*
* - Get ranges per trie, not per individual property.
* Each range contains the same values for a whole group of properties.
* This would generate currently five range sets, two for uprops.icu tries
* and three for unorm.icu tries.
*
* - Combine sets of ranges for multiple tries to get sufficient sets
* for properties, e.g., the uprops.icu main and auxiliary tries
* for all non-normalization properties.
*
* Ideas for representing ranges and combining them:
*
* - A UnicodeSet could hold just the start code points of ranges.
* Multiple sets are easily combined by or-ing them together.
*
* - Alternatively, a UnicodeSet could hold each even-numbered range.
* All ranges could be enumerated by using each start code point
* (for the even-numbered ranges) as well as each limit (end+1) code point
* (for the odd-numbered ranges).
* It should be possible to combine two such sets by xor-ing them,
* but no more than two.
*
* The second way to represent ranges may(?!) yield smaller UnicodeSet arrays,
* but the first one is certainly simpler and applicable for combining more than
* two range sets.
*
* It is possible to combine all range sets for all uprops/unorm tries into one
* set that can be used for all properties.
* As an optimization, there could be less-combined range sets for certain
* groups of properties.
* The relationship of which less-combined range set to use for which property
* depends on the implementation of the properties and must be hardcoded
* - somewhat error-prone and higher maintenance but can be tested easily
* by building property sets "the simple way" in test code.
*
* ---
*
* Do not use a UnicodeSet pattern because that causes infinite recursion;
* UnicodeSet depends on the inclusions set.
*/
/*
 * Rebuild the inclusions set: clear it, then let the normalization code
 * (unless configured out) and the core properties code add their range
 * starts. Nothing happens if pErrorCode is NULL or already indicates a failure.
 */
U_CAPI void U_EXPORT2
uprv_getInclusions(USet* set, UErrorCode *pErrorCode) {
    if(pErrorCode!=NULL && !U_FAILURE(*pErrorCode)) {
        uset_clear(set);
#if !UCONFIG_NO_NORMALIZATION
        unorm_addPropertyStarts(set, pErrorCode);
#endif
        uchar_addPropertyStarts(set, pErrorCode);
    }
}
--- NEW FILE: uprops.h ---
/*
*******************************************************************************
*
* Copyright (C) 2002-2003, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: uprops.h
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2002feb24
* created by: Markus W. Scherer
*
* Constants for mostly non-core Unicode character properties
* stored in uprops.icu.
*/
#ifndef __UPROPS_H__
#define __UPROPS_H__
#include "unicode/utypes.h"
#include "unicode/uset.h"
/* indexes[] entries */
/* Slot numbers are implicit (0, 1, 2, ...) except where a value is assigned explicitly. */
enum {
UPROPS_PROPS32_INDEX,
UPROPS_EXCEPTIONS_INDEX,
UPROPS_EXCEPTIONS_TOP_INDEX,
UPROPS_ADDITIONAL_TRIE_INDEX,
UPROPS_ADDITIONAL_VECTORS_INDEX,
UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX,
UPROPS_RESERVED_INDEX, /* 6 */
/* slots 7..9 have no named constant */
/* maximum values for code values in vector word 0 */
UPROPS_MAX_VALUES_INDEX=10,
/* maximum values for code values in vector word 2 */
UPROPS_MAX_VALUES_2_INDEX,
/* total number of indexes[] slots */
UPROPS_INDEX_COUNT=16
};
/* definitions for the main properties words */
/*
 * Bit layout of the 32-bit main properties word, from the least significant
 * bit upwards. The *_SHIFT constants give the position of each field.
 */
enum {
    /* general category shift==0 0 (5 bits) */
    UPROPS_EXCEPTION_SHIFT=5, /* 5 (1 bit) */
    UPROPS_BIDI_SHIFT, /* 6 (5 bits) */
    UPROPS_MIRROR_SHIFT=UPROPS_BIDI_SHIFT+5, /* 11 (1 bit) */
    UPROPS_NUMERIC_TYPE_SHIFT, /* 12 (3 bits) */
    UPROPS_CASE_SENSITIVE_SHIFT=UPROPS_NUMERIC_TYPE_SHIFT+3,/* 15 (1 bit) format version 3.2 */
    UPROPS_RESERVED_SHIFT, /* 16 (4 bits) */
    UPROPS_VALUE_SHIFT=20, /* 20 */
    UPROPS_EXCEPTION_BIT=1UL<<UPROPS_EXCEPTION_SHIFT,
    UPROPS_VALUE_BITS=32-UPROPS_VALUE_SHIFT,
    UPROPS_MIN_VALUE=-(1L<<(UPROPS_VALUE_BITS-1)),
    UPROPS_MAX_VALUE=(1L<<(UPROPS_VALUE_BITS-1))-1,
    UPROPS_MAX_EXCEPTIONS_COUNT=1L<<UPROPS_VALUE_BITS
};

/*
 * Accessors for the fields of a main properties word.
 * Every macro parenthesizes its argument so that any expression, including
 * one built with low-precedence operators such as '|', can be passed safely.
 */
/* non-zero if the value field holds an index into the exceptions table */
#define PROPS_VALUE_IS_EXCEPTION(props) ((props)&UPROPS_EXCEPTION_BIT)
/* general category: the lowest 5 bits */
#define GET_CATEGORY(props) ((props)&0x1f)
/* 5-bit field at UPROPS_BIDI_SHIFT */
#define GET_BIDI_CLASS(props) (((props)>>UPROPS_BIDI_SHIFT)&0x1f)
/* 3-bit field at UPROPS_NUMERIC_TYPE_SHIFT */
#define GET_NUMERIC_TYPE(props) (((props)>>UPROPS_NUMERIC_TYPE_SHIFT)&7)
/* value field (bits UPROPS_VALUE_SHIFT and up), without sign extension */
#define GET_UNSIGNED_VALUE(props) ((props)>>UPROPS_VALUE_SHIFT)
/* value field, shifted down after a cast to int32_t */
#define GET_SIGNED_VALUE(props) ((int32_t)(props)>>UPROPS_VALUE_SHIFT)
/* pointer into exceptionsTable, which must be visible where the macro is used */
#define GET_EXCEPTIONS(props) (exceptionsTable+GET_UNSIGNED_VALUE(props))
/* single-bit mask for the general category of props */
#define CAT_MASK(props) U_MASK(GET_CATEGORY(props))
/*
 * EXC_* constants, numbered consecutively from 0.
 * NOTE(review): presumably these select the fields of an exceptions table
 * entry (see GET_EXCEPTIONS()); the code that uses them is not visible
 * here - confirm against uchar.c.
 */
enum {
EXC_UPPERCASE,
EXC_LOWERCASE,
EXC_TITLECASE,
EXC_UNUSED,
EXC_NUMERIC_VALUE,
EXC_DENOMINATOR_VALUE,
EXC_MIRROR_MAPPING,
EXC_SPECIAL_CASING,
EXC_CASE_FOLDING
};
/* number of properties vector words */
#define UPROPS_VECTOR_WORDS 3
/*
 * Properties in vector word 0
 * Bits
 * 31..24 DerivedAge version major/minor one nibble each
 * 23 reserved
 * 22..18 Line Break
 * 17..15 East Asian Width
 * 14.. 7 UBlockCode
 * 6.. 0 UScriptCode
 */
/*
 * Each field has a *_MASK selecting its bits and a *_SHIFT giving the
 * position of its lowest bit. The script field starts at bit 0 and
 * therefore has no shift constant.
 */
/* derived age: one nibble each for major and minor version numbers */
#define UPROPS_AGE_MASK 0xff000000
#define UPROPS_AGE_SHIFT 24
#define UPROPS_LB_MASK 0x007C0000
#define UPROPS_LB_SHIFT 18
#define UPROPS_EA_MASK 0x00038000
#define UPROPS_EA_SHIFT 15
#define UPROPS_BLOCK_MASK 0x00007f80
#define UPROPS_BLOCK_SHIFT 7
#define UPROPS_SCRIPT_MASK 0x0000007f
/*
 * Properties in vector word 1
 * Each bit encodes one binary property.
 * The following constants represent the bit number, use 1<<UPROPS_XYZ.
 * UPROPS_BINARY_1_TOP<=32!
 *
 * Keep this list of property enums in sync with
 * propListNames[] in icu/source/tools/genprops/props2.c!
 *
 * ICU 2.6/uprops format version 3.2 stores full properties instead of "Other_".
 *
 * uprops.c turns these bit numbers into masks with U_MASK() in its
 * binProps[] table.
 */
enum {
UPROPS_WHITE_SPACE,
UPROPS_BIDI_CONTROL,
UPROPS_JOIN_CONTROL,
UPROPS_DASH,
UPROPS_HYPHEN,
UPROPS_QUOTATION_MARK,
UPROPS_TERMINAL_PUNCTUATION,
UPROPS_MATH,
UPROPS_HEX_DIGIT,
UPROPS_ASCII_HEX_DIGIT,
UPROPS_ALPHABETIC,
UPROPS_IDEOGRAPHIC,
UPROPS_DIACRITIC,
UPROPS_EXTENDER,
UPROPS_LOWERCASE,
UPROPS_UPPERCASE,
UPROPS_NONCHARACTER_CODE_POINT,
UPROPS_GRAPHEME_EXTEND,
UPROPS_GRAPHEME_LINK,
UPROPS_IDS_BINARY_OPERATOR,
UPROPS_IDS_TRINARY_OPERATOR,
UPROPS_RADICAL,
UPROPS_UNIFIED_IDEOGRAPH,
UPROPS_DEFAULT_IGNORABLE_CODE_POINT,
UPROPS_DEPRECATED,
UPROPS_SOFT_DOTTED,
UPROPS_LOGICAL_ORDER_EXCEPTION,
UPROPS_XID_START,
UPROPS_XID_CONTINUE,
UPROPS_ID_START, /* ICU 2.6, uprops format version 3.2 */
UPROPS_ID_CONTINUE,
UPROPS_GRAPHEME_BASE,
/* count of the bit numbers above; all 32 bits of the word are in use */
UPROPS_BINARY_1_TOP /* ==32 - full! */
};
/*
 * Properties in vector word 2
 * Bits
 * 13..11 Joining Type
 * 10.. 5 Joining Group
 * 4.. 0 Decomposition Type
 */
/*
 * Each field has a *_MASK and, unless it starts at bit 0, a *_SHIFT.
 * The decomposition type field starts at bit 0 and needs no shift.
 */
#define UPROPS_JT_MASK 0x00003800
#define UPROPS_JT_SHIFT 11
#define UPROPS_JG_MASK 0x000007e0
#define UPROPS_JG_SHIFT 5
#define UPROPS_DT_MASK 0x0000001f
/**
 * Get a properties vector word for a code point.
 * Implemented in uchar.c for uprops.c.
 * column==-1 gets the 32-bit main properties word instead.
 * @param c code point
 * @param column index of the properties vector word, or -1
 * @return 0 if no data or illegal argument
 */
U_CFUNC uint32_t
u_getUnicodeProperties(UChar32 c, int32_t column);
/**
 * Get the maximum values for some enum/int properties.
 * Use the same column numbers as for u_getUnicodeProperties().
 * The returned value will contain maximum values stored in the same bit fields
 * as where the enum values are stored in the u_getUnicodeProperties()
 * return values for the same columns.
 *
 * Valid columns are those for properties words that contain enumerated values.
 * (ICU 2.6: columns 0 and 2)
 * For other column numbers, this function will return 0.
 *
 * @param column index of the properties vector word
 * @internal
 */
U_CFUNC int32_t
uprv_getMaxValues(int32_t column);
/**
 * Unicode property names and property value names are compared
 * "loosely". Property[Value]Aliases.txt say:
 * "With loose matching of property names, the case distinctions, whitespace,
 * and '_' are ignored."
 *
 * This function does just that, for ASCII (char *) name strings.
 * It is almost identical to ucnv_compareNames() but also ignores
 * ASCII White_Space characters (U+0009..U+000d).
 * The implementation in uprops.c ignores '-' as well.
 *
 * @return 0 if the names match loosely, otherwise the difference between
 * the first pair of lowercased characters that differ
 * @internal
 */
U_CAPI int32_t U_EXPORT2
uprv_comparePropertyNames(const char *name1, const char *name2);
/** Turn a bit index into a bit flag. @internal */
#define FLAG(n) ((uint32_t)1<<(n))
/** Flags for general categories in the order of UCharCategory. @internal */
/* Each _Xx macro is FLAG() of the category constant named after the two-letter general category abbreviation. */
#define _Cn FLAG(U_GENERAL_OTHER_TYPES)
#define _Lu FLAG(U_UPPERCASE_LETTER)
#define _Ll FLAG(U_LOWERCASE_LETTER)
#define _Lt FLAG(U_TITLECASE_LETTER)
#define _Lm FLAG(U_MODIFIER_LETTER)
#define _Lo FLAG(U_OTHER_LETTER)
#define _Mn FLAG(U_NON_SPACING_MARK)
#define _Me FLAG(U_ENCLOSING_MARK)
#define _Mc FLAG(U_COMBINING_SPACING_MARK)
#define _Nd FLAG(U_DECIMAL_DIGIT_NUMBER)
#define _Nl FLAG(U_LETTER_NUMBER)
#define _No FLAG(U_OTHER_NUMBER)
#define _Zs FLAG(U_SPACE_SEPARATOR)
#define _Zl FLAG(U_LINE_SEPARATOR)
#define _Zp FLAG(U_PARAGRAPH_SEPARATOR)
#define _Cc FLAG(U_CONTROL_CHAR)
#define _Cf FLAG(U_FORMAT_CHAR)
#define _Co FLAG(U_PRIVATE_USE_CHAR)
#define _Cs FLAG(U_SURROGATE)
#define _Pd FLAG(U_DASH_PUNCTUATION)
#define _Ps FLAG(U_START_PUNCTUATION)
#define _Pe FLAG(U_END_PUNCTUATION)
#define _Pc FLAG(U_CONNECTOR_PUNCTUATION)
#define _Po FLAG(U_OTHER_PUNCTUATION)
#define _Sm FLAG(U_MATH_SYMBOL)
#define _Sc FLAG(U_CURRENCY_SYMBOL)
#define _Sk FLAG(U_MODIFIER_SYMBOL)
#define _So FLAG(U_OTHER_SYMBOL)
#define _Pi FLAG(U_INITIAL_PUNCTUATION)
#define _Pf FLAG(U_FINAL_PUNCTUATION)
/** Some code points. @internal */
/* Named constants for individual code points, given as hexadecimal code point values in ascending order. */
enum {
TAB =0x0009,
LF =0x000a,
FF =0x000c,
CR =0x000d,
U_A =0x0041,
U_Z =0x005a,
U_a =0x0061,
U_z =0x007a,
DEL =0x007f,
NL =0x0085,
NBSP =0x00a0,
CGJ =0x034f,
FIGURESP=0x2007,
HAIRSP =0x200a,
ZWNJ =0x200c,
ZWJ =0x200d,
RLM =0x200f,
NNBSP =0x202f,
WJ =0x2060,
INHSWAP =0x206a,
NOMDIG =0x206f,
ZWNBSP =0xfeff
};
/**
 * Is this character a "white space" in the sense of ICU rule parsers?
 * The implementation in uprops.c accepts characters with general category
 * Cf (format) and characters with the White_Space binary property.
 * @param c code point
 * @internal
 */
U_CAPI UBool U_EXPORT2
uprv_isRuleWhiteSpace(UChar32 c);
/**
 * Get the set of "white space" characters in the sense of ICU rule
 * parsers. Caller must close/delete result.
 * The implementation in uprops.c opens the set from the pattern
 * "[[:Cf:][:WSpace:]]".
 * @param ec ICU error code, passed on to uset_openPattern()
 * @internal
 */
U_CAPI USet* U_EXPORT2
uprv_openRuleWhiteSpaceSet(UErrorCode* ec);
/**
 * Get the maximum length of a (regular/1.0/extended) character name.
 * NOTE(review): the implementation is not part of this header; a comment
 * further down in this file mentions unames.c - confirm.
 * @return 0 if no character names available.
 */
U_CAPI int32_t U_EXPORT2
uprv_getMaxCharNameLength(void);
#if 0
/*
Currently not used but left for future use. Probably by UnicodeSet.
urename.h and unames.c changed accordingly.
*/
/**
 * Get the maximum length of an ISO comment.
 * Declared with an explicit (void) parameter list, like
 * uprv_getMaxCharNameLength(), so that this is a proper C prototype
 * if the block is ever enabled.
 * @return 0 if no ISO comments available.
 */
U_CAPI int32_t U_EXPORT2
uprv_getMaxISOCommentLength(void);
#endif
/**
 * Fills set with characters that are used in Unicode character names.
 * Includes all characters that are used in regular/Unicode 1.0/extended names.
 * Just empties the set if no character names are available.
 * No error code is taken or returned.
 * @param set USet to receive characters. Existing contents are deleted.
 */
U_CAPI void U_EXPORT2
uprv_getCharNameCharacters(USet* set);
#if 0
/*
Currently not used but left for future use. Probably by UnicodeSet.
urename.h and unames.c changed accordingly.
*/
/**
 * Fills set with characters that are used in ISO comments.
 * Just empties the set if no ISO comments are available.
 * @param set USet to receive characters. Existing contents are deleted.
 */
U_CAPI void U_EXPORT2
uprv_getISOCommentCharacters(USet* set);
#endif
/**
 * Enumerate each core properties data trie and add the
 * start of each range of same properties to the set.
 * Called by uprv_getInclusions() in uprops.c.
 * @param set USet to add the range starts to
 * @param pErrorCode ICU error code
 * @internal
 */
U_CAPI void U_EXPORT2
uchar_addPropertyStarts(USet *set, UErrorCode *pErrorCode);
/**
 * Return a set of characters for property enumeration.
 * For each two consecutive characters (start, limit) in the set,
 * all of the properties for start..limit-1 are all the same.
 *
 * @param set USet to receive result. Existing contents are lost.
 * @param pErrorCode ICU error code. The implementation in uprops.c returns
 * without touching set if this is NULL or already indicates a failure.
 * @internal
 */
U_CAPI void U_EXPORT2
uprv_getInclusions(USet* set, UErrorCode *pErrorCode);
#endif
--- NEW FILE: usc_impl.c ---
/*
**********************************************************************
* Copyright (C) 1999-2002, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*
* File USC_IMPL.C
*
* Modification History:
*
* Date Name Description
* 07/08/2002 Eric Mader Creation.
******************************************************************************
*/
#include "unicode/uscript.h"
#include "usc_impl.h"
#include "cmemory.h"
/* Number of elements in a true array (not a pointer). */
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))
/* Capacity of the paired-punctuation stack kept in each UScriptRun. */
#define PAREN_STACK_DEPTH 128
/* One entry of the paired-punctuation stack used by uscript_nextRun(). */
struct ParenStackEntry
{
int32_t pairIndex; /* index into pairedChars[] of the opening character that was pushed */
UScriptCode scriptCode; /* script recorded for that opening character; reused for its matching close */
};
/* State of a script run iterator over an array of UChars. */
struct UScriptRun
{
int32_t textLength; /* number of UChars in textArray */
const UChar *textArray; /* the text; only the pointer is stored by uscript_setRunText() */
int32_t scriptStart; /* start index of the current run */
int32_t scriptLimit; /* limit index of the current run; the next run starts here */
UScriptCode scriptCode; /* script of the current run */
struct ParenStackEntry parenStack[PAREN_STACK_DEPTH]; /* stack of open paired characters */
int32_t parenSP; /* index of the top stack entry; -1 when the stack is empty */
};
/* Forward declaration; the definition follows the pairedChars[] table. */
static int8_t highBit(int32_t value);
/*
 * Paired punctuation characters, stored as (open, close) pairs:
 * an even index holds an opening character and the following odd index
 * holds its closing partner (uscript_nextRun() tests the low bit of the index).
 * The values must stay in ascending order because getPairIndex() searches
 * the table by repeatedly comparing against probe positions.
 */
static const UChar32 pairedChars[] = {
0x0028, 0x0029, /* ascii paired punctuation */
0x003c, 0x003e,
0x005b, 0x005d,
0x007b, 0x007d,
0x00ab, 0x00bb, /* guillemets */
0x2018, 0x2019, /* general punctuation */
0x201c, 0x201d,
0x2039, 0x203a,
0x3008, 0x3009, /* chinese paired punctuation */
0x300a, 0x300b,
0x300c, 0x300d,
0x300e, 0x300f,
0x3010, 0x3011,
0x3014, 0x3015,
0x3016, 0x3017,
0x3018, 0x3019,
0x301a, 0x301b
};
/*
 * Return the index of the highest set bit of a positive value
 * (0 for 1, 1 for 2..3, 2 for 4..7, and so on).
 * Returns -32 for zero and for negative values.
 */
static int8_t
highBit(int32_t value)
{
    int8_t result = 0;
    int8_t width;

    if (value <= 0) {
        return -32;
    }

    /* Halve the search window each time: 16, 8, 4, 2, 1 bits. */
    for (width = 16; width > 0; width >>= 1) {
        if ((value >> width) != 0) {
            value >>= width;
            result += width;
        }
    }

    return result;
}
/*
 * Look up a character in the sorted pairedChars[] table.
 *
 * The search first decides whether to start at the "extra" elements beyond
 * the largest power of two that fits into the table, and then narrows the
 * position with power-of-two steps.
 *
 * @return the index of ch in pairedChars[], or -1 if ch is not in the table
 */
static int32_t
getPairIndex(UChar32 ch)
{
    const int32_t count = ARRAY_SIZE(pairedChars);
    const int32_t power = 1 << highBit(count);
    const int32_t extra = count - power;
    int32_t pos = (ch >= pairedChars[extra]) ? extra : 0;
    int32_t step;

    for (step = power >> 1; step > 0; step >>= 1) {
        if (ch >= pairedChars[pos + step]) {
            pos += step;
        }
    }

    return (pairedChars[pos] == ch) ? pos : -1;
}
/*
 * Two script codes are compatible if they are equal or if at least one of
 * them is not greater than USCRIPT_INHERITED.
 */
static UBool
sameScript(UScriptCode scriptOne, UScriptCode scriptTwo)
{
    const UBool eitherIsNeutral =
        scriptOne <= USCRIPT_INHERITED || scriptTwo <= USCRIPT_INHERITED;

    return eitherIsNeutral || scriptOne == scriptTwo;
}
/*
 * Allocate a UScriptRun and point it at the given text.
 *
 * Returns NULL if pErrorCode is NULL or already indicates a failure,
 * if the allocation fails (U_MEMORY_ALLOCATION_ERROR is set), or if
 * uscript_setRunText() rejects the arguments (the object is released again).
 */
U_CAPI UScriptRun * U_EXPORT2
uscript_openRun(const UChar *src, int32_t length, UErrorCode *pErrorCode)
{
    UScriptRun *run;

    if (pErrorCode == NULL || U_FAILURE(*pErrorCode)) {
        return NULL;
    }

    run = (UScriptRun *)uprv_malloc(sizeof (UScriptRun));
    if (run == NULL) {
        *pErrorCode = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }

    uscript_setRunText(run, src, length, pErrorCode);
    if (!U_FAILURE(*pErrorCode)) {
        return run;
    }

    /* uscript_setRunText() reported an error: do not leak the object */
    uprv_free(run);
    return NULL;
}
/* Release a UScriptRun with uprv_free(); a NULL pointer is ignored. */
U_CAPI void U_EXPORT2
uscript_closeRun(UScriptRun *scriptRun)
{
    if (scriptRun == NULL) {
        return;
    }

    uprv_free(scriptRun);
}
/*
 * Rewind the iterator to the start of its text: both run boundaries go
 * back to 0, the run script becomes USCRIPT_INVALID_CODE and the paired
 * punctuation stack is emptied. A NULL pointer is ignored.
 */
U_CAPI void U_EXPORT2
uscript_resetRun(UScriptRun *scriptRun)
{
    if (scriptRun == NULL) {
        return;
    }

    scriptRun->parenSP = -1;
    scriptRun->scriptCode = USCRIPT_INVALID_CODE;
    scriptRun->scriptLimit = 0;
    scriptRun->scriptStart = 0;
}
/*
 * Point an existing UScriptRun at new text and reset it.
 * Only the pointer is stored; the text is not copied.
 *
 * Sets U_ILLEGAL_ARGUMENT_ERROR if scriptRun is NULL, if length is negative,
 * or if exactly one of "src is NULL" and "length is 0" holds.
 * Does nothing if pErrorCode is NULL or already indicates a failure.
 */
U_CAPI void U_EXPORT2
uscript_setRunText(UScriptRun *scriptRun, const UChar *src, int32_t length, UErrorCode *pErrorCode)
{
    if (pErrorCode == NULL || U_FAILURE(*pErrorCode)) {
        return;
    }

    if (scriptRun == NULL || length < 0 ||
        (src == NULL && length != 0) ||
        (src != NULL && length == 0)) {
        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    scriptRun->textLength = length;
    scriptRun->textArray = src;
    uscript_resetRun(scriptRun);
}
/*
 * Advance to the next script run.
 *
 * Starting at the limit of the previous run, characters are consumed as long
 * as their script is compatible with the run's script (see sameScript()).
 * Surrogate pairs are combined into one code point. Paired punctuation from
 * pairedChars[] is tracked on a stack so that a closing character gets the
 * script that was recorded for its opening partner.
 *
 * @param scriptRun the iterator
 * @param pRunStart receives the start index of the run; may be NULL
 * @param pRunLimit receives the limit index of the run; may be NULL
 * @param pRunScript receives the script of the run; may be NULL
 * @return FALSE if scriptRun is NULL or the end of the text was already
 *         reached (the output parameters are then left untouched),
 *         otherwise TRUE
 */
U_CAPI UBool U_EXPORT2
uscript_nextRun(UScriptRun *scriptRun, int32_t *pRunStart, int32_t *pRunLimit, UScriptCode *pRunScript)
{
    /*
     * Stack entries above startSP were pushed during this run before the
     * run's script was known; they get fixed up once it is.
     */
    int32_t startSP = -1; /* used to find the first new open character */
    UErrorCode error = U_ZERO_ERROR;

    /* if we've fallen off the end of the text, we're done */
    if (scriptRun == NULL || scriptRun->scriptLimit >= scriptRun->textLength) {
        return FALSE;
    }

    startSP = scriptRun->parenSP;
    scriptRun->scriptCode = USCRIPT_COMMON;

    for (scriptRun->scriptStart = scriptRun->scriptLimit; scriptRun->scriptLimit < scriptRun->textLength; scriptRun->scriptLimit += 1) {
        UChar high = scriptRun->textArray[scriptRun->scriptLimit];
        UChar32 ch = high;
        UScriptCode sc;
        int32_t pairIndex;

        /*
         * if the character is a high surrogate and it's not the last one
         * in the text, see if it's followed by a low surrogate
         */
        if (high >= 0xD800 && high <= 0xDBFF && scriptRun->scriptLimit < scriptRun->textLength - 1) {
            UChar low = scriptRun->textArray[scriptRun->scriptLimit + 1];

            /*
             * if it is followed by a low surrogate,
             * consume it and form the full character
             */
            if (low >= 0xDC00 && low <= 0xDFFF) {
                ch = (high - 0xD800) * 0x0400 + low - 0xDC00 + 0x10000;
                scriptRun->scriptLimit += 1;
            }
        }

        sc = uscript_getScript(ch, &error);
        pairIndex = getPairIndex(ch);

        /*
         * Paired character handling:
         *
         * if it's an open character, push it onto the stack.
         * if it's a close character, find the matching open on the
         * stack, and use that script code. Any non-matching open
         * characters above it on the stack will be popped.
         */
        if (pairIndex >= 0) {
            if ((pairIndex & 1) == 0) {
                /*
                 * If the paren stack is full, empty it. This
                 * means that deeply nested paired punctuation
                 * characters will be ignored, but that's an unusual
                 * case, and it's better to ignore them than to
                 * write off the end of the stack...
                 */
                if (++scriptRun->parenSP >= PAREN_STACK_DEPTH) {
                    scriptRun->parenSP = 0;
                }

                scriptRun->parenStack[scriptRun->parenSP].pairIndex = pairIndex;
                scriptRun->parenStack[scriptRun->parenSP].scriptCode = scriptRun->scriptCode;
            } else if (scriptRun->parenSP >= 0) {
                /* the open partner of a close character sits at the preceding even index */
                int32_t pi = pairIndex & ~1;

                while (scriptRun->parenSP >= 0 && scriptRun->parenStack[scriptRun->parenSP].pairIndex != pi) {
                    scriptRun->parenSP -= 1;
                }

                if (scriptRun->parenSP < startSP) {
                    startSP = scriptRun->parenSP;
                }

                if (scriptRun->parenSP >= 0) {
                    sc = scriptRun->parenStack[scriptRun->parenSP].scriptCode;
                }
            }
        }

        if (sameScript(scriptRun->scriptCode, sc)) {
            if (scriptRun->scriptCode <= USCRIPT_INHERITED && sc > USCRIPT_INHERITED) {
                scriptRun->scriptCode = sc;

                /*
                 * now that we have a final script code, fix any open
                 * characters we pushed before we knew the script code.
                 */
                while (startSP < scriptRun->parenSP) {
                    scriptRun->parenStack[++startSP].scriptCode = scriptRun->scriptCode;
                }
            }

            /*
             * if this character is a close paired character,
             * pop it from the stack
             */
            if (pairIndex >= 0 && (pairIndex & 1) != 0 && scriptRun->parenSP >= 0) {
                scriptRun->parenSP -= 1;

                /*
                 * Lower startSP only if it would otherwise point above the
                 * new top of the stack. Decrementing it unconditionally is
                 * wrong when the popped entry was pushed during this run
                 * (startSP < parenSP before the pop): startSP could then
                 * drop below -1 and make the fix-up loop above write to
                 * parenStack[-1], or make it overwrite the script of an
                 * entry that belongs to an earlier run.
                 */
                if (startSP > scriptRun->parenSP) {
                    startSP = scriptRun->parenSP;
                }
            }
        } else {
            /*
             * if the run broke on a surrogate pair,
             * end it before the high surrogate
             */
            if (ch >= 0x10000) {
                scriptRun->scriptLimit -= 1;
            }

            break;
        }
    }

    if (pRunStart != NULL) {
        *pRunStart = scriptRun->scriptStart;
    }

    if (pRunLimit != NULL) {
        *pRunLimit = scriptRun->scriptLimit;
    }

    if (pRunScript != NULL) {
        *pRunScript = scriptRun->scriptCode;
    }

    return TRUE;
}
--- NEW FILE: usc_impl.h ---
/*
**********************************************************************
* Copyright (C) 1999-2002, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*
* File USC_IMPL.H
*
* Modification History:
*
* Date Name Description
* 07/08/2002 Eric Mader Creation.
******************************************************************************
*/
#ifndef USC_IMPL_H
#define USC_IMPL_H
#include "unicode/utypes.h"
#include "unicode/uscript.h"
/**
* <code>UScriptRun</code> is used to find runs of characters in
* the same script. It implements a simple iterator over an array
* of characters. The iterator will resolve script-neutral characters
* like punctuation into the script of the surrounding characters.
*
* The iterator will try to match paired punctuation. If it sees an
* opening punctuation character, it will remember the script that
* was assigned to that character, and assign the same script to the
* matching closing punctuation.
*
* Scripts are chosen based on the <code>UScriptCode</code> enumeration.
* No attempt is made to combine related scripts into a single run. In
* particular, Hiragana, Katakana, and Han characters will appear in separate
* runs.
* Here is an example of how to iterate over script runs:
* <pre>
* \code
* void printScriptRuns(const UChar *text, int32_t length)
* {
* UErrorCode error = U_ZERO_ERROR;
* UScriptRun *scriptRun = uscript_openRun(text, length, &error);
* int32_t start = 0, limit = 0;
* UScriptCode code = USCRIPT_INVALID_CODE;
*
* while (uscript_nextRun(scriptRun, &start, &limit, &code)) {
* printf("Script '%s' from %d to %d.\n", uscript_getName(code), start, limit);
* }
*
* uscript_closeRun(scriptRun);
* }
* \endcode
* </pre>
*
* @draft ICU 2.2
*/
struct UScriptRun;
typedef struct UScriptRun UScriptRun;
/**
* Create a <code>UScriptRun</code> object for iterating over the given text. This object must
* be freed using <code>uscript_closeRun()</code>. Note that this object does not copy the source text,
* only the pointer to it. You must make sure that the pointer remains valid until you call
* <code>uscript_closeRun()</code> or <code>uscript_setRunText()</code>.
*
* @param src is the address of the array of characters over which to iterate.
* if <code>src == NULL</code> and <code>length == 0</code>,
* an empty <code>UScriptRun</code> object will be returned.
*
* @param length is the number of characters over which to iterate.
*
* @param pErrorCode is a pointer to a valid <code>UErrorCode</code> value. If this value
* indicates a failure on entry, the function will immediately return.
* On exit the value will indicate the success of the operation.
*
* @return the address of <code>UScriptRun</code> object which will iterate over the text,
* or <code>NULL</code> if the operation failed.
*
* @draft ICU 2.2
*/
U_CAPI UScriptRun * U_EXPORT2
uscript_openRun(const UChar *src, int32_t length, UErrorCode *pErrorCode);
/**
* Frees the given <code>UScriptRun</code> object and any storage associated with it.
* On return, scriptRun no longer points to a valid <code>UScriptRun</code> object.
*
* @param scriptRun is the <code>UScriptRun</code> object which will be freed.
*
* @draft ICU 2.2
*/
U_CAPI void U_EXPORT2
uscript_closeRun(UScriptRun *scriptRun);
/**
* Reset the <code>UScriptRun</code> object so that it will start iterating from
* the beginning.
*
* @param scriptRun is the address of the <code>UScriptRun</code> object to be reset.
*
* @draft ICU 2.2
*/
U_CAPI void U_EXPORT2
uscript_resetRun(UScriptRun *scriptRun);
/**
* Change the text over which the given <code>UScriptRun</code> object iterates.
*
* @param scriptRun is the <code>UScriptRun</code> object which will be changed.
*
* @param src is the address of the new array of characters over which to iterate.
* If <code>src == NULL</code> and <code>length == 0</code>,
* the <code>UScriptRun</code> object will become empty.
*
* @param length is the new number of characters over which to iterate
*
* @param pErrorCode is a pointer to a valid <code>UErrorCode</code> value. If this value
* indicates a failure on entry, the function will immediately return.
* On exit the value will indicate the success of the operation.
*
* @draft ICU 2.2
*/
U_CAPI void U_EXPORT2
uscript_setRunText(UScriptRun *scriptRun, const UChar *src, int32_t length, UErrorCode *pErrorCode);
/**
* Advance the <code>UScriptRun</code> object to the next script run, return the start and limit
* offsets, and the script of the run.
*
* @param scriptRun is the address of the <code>UScriptRun</code> object.
*
* @param pRunStart is a pointer to the variable to receive the starting offset of the next run.
* This pointer can be <code>NULL</code> if the value is not needed.
*
* @param pRunLimit is a pointer to the variable to receive the limit offset of the next run.
* This pointer can be <code>NULL</code> if the value is not needed.
*
* @param pRunScript is a pointer to the variable to receive the UScriptCode for the
* script of the current run. This pointer can be <code>NULL</code> if the value is not needed.
*
* @return true if there was another script run.
*
* @draft ICU 2.2
*/
U_CAPI UBool U_EXPORT2
uscript_nextRun(UScriptRun *scriptRun, int32_t *pRunStart, int32_t *pRunLimit, UScriptCode *pRunScript);
#endif
--- NEW FILE: uset.cpp ---
/*
*******************************************************************************
*
* Copyright (C) 2002-2003, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: uset.cpp
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2002mar07
* created by: Markus W. Scherer
*
* The serialized structure, the array of range limits, is
* the same as in UnicodeSet, except that the HIGH value is not stored.
*
* There are functions to efficiently serialize a USet into an array of uint16_t
* and functions to use such a serialized form efficiently without
* instantiating a new USet.
*/
#include "unicode/utypes.h"
#include "unicode/uobject.h"
#include "unicode/uset.h"
#include "unicode/uniset.h"
#include "cmemory.h"
#include "unicode/ustring.h"
/**
 * Allocates a new UnicodeSet constructed from (start, end) and returns it
 * as an opaque USet pointer. Release it with uset_close().
 */
U_CAPI USet* U_EXPORT2
uset_open(UChar32 start, UChar32 end) {
    UnicodeSet* created = new UnicodeSet(start, end);
    return (USet*) created;
}
/**
 * Builds a set from a pattern string.
 *
 * @param pattern       the pattern text
 * @param patternLength length of pattern, or -1 if it is NUL-terminated
 * @param ec            in/out error code; set to U_MEMORY_ALLOCATION_ERROR if
 *                      the set object could not be allocated
 * @return the new set, or NULL if allocation failed or *ec indicates failure
 *         after construction
 */
U_CAPI USet* U_EXPORT2
uset_openPattern(const UChar* pattern, int32_t patternLength,
                 UErrorCode* ec)
{
    UnicodeString patternText(patternLength==-1, pattern, patternLength);
    UnicodeSet* parsed = new UnicodeSet(patternText, *ec);

    if (parsed == NULL) {
        /* allocation failed */
        *ec = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }
    if (!U_FAILURE(*ec)) {
        return (USet*) parsed;
    }
    /* construction reported an error: discard the object */
    delete parsed;
    return NULL;
}
/**
 * Builds a set from a pattern string, passing an options word through to the
 * UnicodeSet constructor.
 *
 * @param pattern       the pattern text
 * @param patternLength length of pattern, or -1 if it is NUL-terminated
 * @param options       forwarded unchanged to UnicodeSet(pattern, options, status)
 * @param ec            in/out error code; set to U_MEMORY_ALLOCATION_ERROR if
 *                      the set object could not be allocated
 * @return the new set, or NULL if allocation failed or *ec indicates failure
 *         after construction
 */
U_CAPI USet* U_EXPORT2
uset_openPatternOptions(const UChar* pattern, int32_t patternLength,
                        uint32_t options,
                        UErrorCode* ec)
{
    UnicodeString patternText(patternLength==-1, pattern, patternLength);
    UnicodeSet* parsed = new UnicodeSet(patternText, options, *ec);

    if (parsed == NULL) {
        /* allocation failed */
        *ec = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }
    if (!U_FAILURE(*ec)) {
        return (USet*) parsed;
    }
    /* construction reported an error: discard the object */
    delete parsed;
    return NULL;
}
/**
 * Destroys a set; the pointer is deleted as a UnicodeSet.
 */
U_CAPI void U_EXPORT2
uset_close(USet* set) {
    UnicodeSet* doomed = (UnicodeSet*) set;
    delete doomed;
}
/**
 * Writes the pattern representation of a set into a caller buffer.
 *
 * @param set               the set to describe
 * @param result            destination buffer
 * @param resultCapacity    capacity of result
 * @param escapeUnprintable forwarded to UnicodeSet::toPattern()
 * @param ec                in/out error code, forwarded to UnicodeString::extract()
 * @return whatever UnicodeString::extract() returns for the generated pattern
 */
U_CAPI int32_t U_EXPORT2
uset_toPattern(const USet* set,
               UChar* result, int32_t resultCapacity,
               UBool escapeUnprintable,
               UErrorCode* ec) {
    const UnicodeSet* source = (const UnicodeSet*) set;
    UnicodeString patternText;

    source->toPattern(patternText, escapeUnprintable);
    return patternText.extract(result, resultCapacity, *ec);
}
/**
 * Forwards to UnicodeSet::addAll(), passing additionalSet as the argument.
 */
U_CAPI void U_EXPORT2
uset_addAll(USet* set, const USet *additionalSet) {
    UnicodeSet* target = (UnicodeSet*) set;
    const UnicodeSet* extra = (const UnicodeSet*) additionalSet;
    target->addAll(*extra);
}
/**
 * Forwards to UnicodeSet::add(c).
 */
U_CAPI void U_EXPORT2
uset_add(USet* set, UChar32 c) {
    UnicodeSet* target = (UnicodeSet*) set;
    target->add(c);
}
/**
 * Forwards to UnicodeSet::add(start, end).
 */
U_CAPI void U_EXPORT2
uset_addRange(USet* set, UChar32 start, UChar32 end) {
    UnicodeSet* target = (UnicodeSet*) set;
    target->add(start, end);
}
/**
 * Adds a string to the set.
 *
 * @param set    the set to modify
 * @param str    the string to add
 * @param strLen length of str, or -1 if str is NUL-terminated
 */
U_CAPI void U_EXPORT2
uset_addString(USet* set, const UChar* str, int32_t strLen) {
    // Do NOT use the aliasing constructor
    //     UnicodeString s(strLen==-1, str, strLen);
    // here: the string would stay aliased even after copying.
    // TODO: do we need a copy ctor that unaliases?

    // The API promises that -1 means "zero terminated", so measure it first.
    const int32_t length = (strLen == -1) ? u_strlen(str) : strLen;
    UnicodeString copied(str, length);

    UnicodeSet* target = (UnicodeSet*) set;
    target->add(copied);
}
/**
 * Forwards to UnicodeSet::remove(c).
 */
U_CAPI void U_EXPORT2
uset_remove(USet* set, UChar32 c) {
    UnicodeSet* target = (UnicodeSet*) set;
    target->remove(c);
}
/**
 * Forwards to UnicodeSet::remove(start, end).
 */
U_CAPI void U_EXPORT2
uset_removeRange(USet* set, UChar32 start, UChar32 end) {
    UnicodeSet* target = (UnicodeSet*) set;
    target->remove(start, end);
}
/**
 * Removes a string from the set.
 *
 * @param set    the set to modify
 * @param str    the string to remove
 * @param strLen length of str, or -1 if str is NUL-terminated
 */
U_CAPI void U_EXPORT2
uset_removeString(USet* set, const UChar* str, int32_t strLen) {
    // Temporary built with the (isTerminated, text, length) constructor; it
    // only lives for the duration of the remove() call.
    UnicodeString key(strLen==-1, str, strLen);
    UnicodeSet* target = (UnicodeSet*) set;
    target->remove(key);
}
/**
 * Forwards to UnicodeSet::complement().
 */
U_CAPI void U_EXPORT2
uset_complement(USet* set) {
    UnicodeSet* target = (UnicodeSet*) set;
    target->complement();
}
/**
 * Forwards to UnicodeSet::clear().
 */
U_CAPI void U_EXPORT2
uset_clear(USet* set) {
    UnicodeSet* target = (UnicodeSet*) set;
    target->clear();
}
/**
 * Returns the result of UnicodeSet::isEmpty() for the given set.
 */
U_CAPI UBool U_EXPORT2
uset_isEmpty(const USet* set) {
    const UnicodeSet* source = (const UnicodeSet*) set;
    return source->isEmpty();
}
/**
 * Returns the result of UnicodeSet::contains(c) for the given set.
 */
U_CAPI UBool U_EXPORT2
uset_contains(const USet* set, UChar32 c) {
    const UnicodeSet* source = (const UnicodeSet*) set;
    return source->contains(c);
}
/**
 * Returns the result of UnicodeSet::contains(start, end) for the given set.
 */
U_CAPI UBool U_EXPORT2
uset_containsRange(const USet* set, UChar32 start, UChar32 end) {
    const UnicodeSet* source = (const UnicodeSet*) set;
    return source->contains(start, end);
}
/**
 * Tests whether the set contains a string.
 *
 * @param set    the set to query
 * @param str    the string to look for
 * @param strLen length of str, or -1 if str is NUL-terminated
 * @return the result of UnicodeSet::contains() for that string
 */
U_CAPI UBool U_EXPORT2
uset_containsString(const USet* set, const UChar* str, int32_t strLen) {
    // Temporary built with the (isTerminated, text, length) constructor; it
    // only lives for the duration of the lookup.
    UnicodeString key(strLen==-1, str, strLen);
    const UnicodeSet* source = (const UnicodeSet*) set;
    return source->contains(key);
}
/**
 * Returns the result of UnicodeSet::size() for the given set.
 */
U_CAPI int32_t U_EXPORT2
uset_size(const USet* set) {
    const UnicodeSet* source = (const UnicodeSet*) set;
    return source->size();
}
U_NAMESPACE_BEGIN
/**
 * Helper whose only job is to reach the private USet support API of
 * UnicodeSet (getStringCount() and getString()). Making a class a friend is
 * more portable than trying to declare extern "C" functions as friends.
 */
class USetAccess /* not : public UObject because all methods are static */ {
private:
    /* private constructor: do not instantiate */
    USetAccess();

public:
    /* Kept small and defined in-class so the compiler can inline them. */

    /** Returns set.getStringCount(). */
    static inline int32_t getStringCount(const UnicodeSet& source) {
        return source.getStringCount();
    }

    /** Returns set.getString(i). */
    static inline const UnicodeString* getString(const UnicodeSet& source,
                                                 int32_t index) {
        return source.getString(index);
    }
};
U_NAMESPACE_END
/**
 * Returns the number of items in the set: its range count plus its string
 * count.
 */
U_CAPI int32_t U_EXPORT2
uset_getItemCount(const USet* uset) {
    const UnicodeSet* source = (const UnicodeSet*) uset;
    const int32_t rangeItems = source->getRangeCount();
    const int32_t stringItems = USetAccess::getStringCount(*source);
    return rangeItems + stringItems;
}
/**
 * Fetches one item of a set. Items are numbered with all ranges first,
 * followed by all strings.
 *
 * @param uset        the set to read
 * @param itemIndex   zero-based item index
 * @param start       receives the range start when the item is a range
 * @param end         receives the range end when the item is a range
 * @param str         receives the string when the item is a string
 * @param strCapacity capacity of str
 * @param ec          in/out error code; U_ILLEGAL_ARGUMENT_ERROR for a
 *                    negative index, U_INDEX_OUTOFBOUNDS_ERROR for an index
 *                    past the last item
 * @return 0 if *ec already indicated failure or the item is a range;
 *         the result of UnicodeString::extract() if the item is a string;
 *         -1 if the index is invalid
 */
U_CAPI int32_t U_EXPORT2
uset_getItem(const USet* uset, int32_t itemIndex,
             UChar32* start, UChar32* end,
             UChar* str, int32_t strCapacity,
             UErrorCode* ec) {
    if (U_FAILURE(*ec)) {
        return 0;
    }
    if (itemIndex < 0) {
        *ec = U_ILLEGAL_ARGUMENT_ERROR;
        return -1;
    }

    const UnicodeSet& set = *(const UnicodeSet*)uset;
    const int32_t rangeCount = set.getRangeCount();
    if (itemIndex < rangeCount) {
        /* the item is a range */
        *start = set.getRangeStart(itemIndex);
        *end = set.getRangeEnd(itemIndex);
        return 0;
    }

    /* strings are numbered after the ranges */
    const int32_t stringIndex = itemIndex - rangeCount;
    if (stringIndex >= USetAccess::getStringCount(set)) {
        *ec = U_INDEX_OUTOFBOUNDS_ERROR;
        return -1;
    }
    const UnicodeString* item = USetAccess::getString(set, stringIndex);
    return item->extract(str, strCapacity, *ec);
}
//U_CAPI int32_t U_EXPORT2
//uset_getRangeCount(const USet* set) {
// return ((const UnicodeSet*) set)->getRangeCount();
//}
//
//U_CAPI UBool U_EXPORT2
//uset_getRange(const USet* set, int32_t rangeIndex,
// UChar32* pStart, UChar32* pEnd) {
// if ((uint32_t) rangeIndex >= (uint32_t) uset_getRangeCount(set)) {
// return FALSE;
// }
// const UnicodeSet* us = (const UnicodeSet*) set;
// *pStart = us->getRangeStart(rangeIndex);
// *pEnd = us->getRangeEnd(rangeIndex);
// return TRUE;
//}
/*
 * Serialize a USet into 16-bit units.
 *
 * Layout of the serialized form:
 * - Each BMP code point is stored as itself in one 16-bit unit.
 * - Each supplementary code point is stored in two 16-bit units,
 *   the higher 16-bit half first, then the lower half.
 * - The code points are in ascending order, so all BMP code points
 *   precede all supplementary code points.
 * - The list is preceded by its length. If there are supplementary code
 *   points, bit 15 of the length is set and the bmpLength is inserted
 *   between the length and the array.
 *
 * In other words:
 * - all BMP: (length=bmpLength) BMP, .., BMP
 * - some supplementary: (length|0x8000) (bmpLength<length) BMP, .., BMP, supp-high, supp-low, ..
 *
 * This function returns 0 without doing anything if ec is NULL or already
 * indicates a failure; otherwise it returns the result of
 * UnicodeSet::serialize().
 */
U_CAPI int32_t U_EXPORT2
uset_serialize(const USet* set, uint16_t* dest, int32_t destCapacity, UErrorCode* ec) {
    if (ec != NULL && !U_FAILURE(*ec)) {
        const UnicodeSet* source = (const UnicodeSet*) set;
        return source->serialize(dest, destCapacity, *ec);
    }
    return 0;
}
/**
 * Points a USerializedSet at a serialized set stored in src.
 *
 * The first unit is the length word; if its bit 15 is set, a second header
 * unit carrying the bmpLength follows. The array pointer is set to the first
 * unit after the header; no data is copied.
 *
 * @param fillSet   structure to fill in
 * @param src       serialized data
 * @param srcLength number of 16-bit units available at src
 * @return TRUE on success. FALSE if fillSet is NULL, or if src is
 *         NULL/empty or too short for the length it declares; in the latter
 *         cases length and bmpLength of fillSet are reset to 0.
 */
U_CAPI UBool U_EXPORT2
uset_getSerializedSet(USerializedSet* fillSet, const uint16_t* src, int32_t srcLength) {
    if (fillSet == NULL) {
        return FALSE;
    }

    if (src != NULL && srcLength > 0) {
        const int32_t header = *src++;
        /* bit 15 flags the presence of supplementary values */
        const UBool hasSupplementary = (UBool)((header & 0x8000) != 0);
        const int32_t length = header & 0x7fff;
        const int32_t headerUnits = hasSupplementary ? 2 : 1;

        if (srcLength >= headerUnits + length) {
            if (hasSupplementary) {
                fillSet->bmpLength = *src++;
            } else {
                /* only BMP values */
                fillSet->bmpLength = length;
            }
            fillSet->array = src;
            fillSet->length = length;
            return TRUE;
        }
    }

    /* no usable input: leave an empty set behind */
    fillSet->length = fillSet->bmpLength = 0;
    return FALSE;
}
/**
 * Makes fillSet describe the set containing exactly the code point c, using
 * the structure's own staticArray as storage.
 *
 * Does nothing if fillSet is NULL or c is outside 0..0x10ffff.
 * The stored range is [c, c+1[; the limit is omitted when c is 0x10ffff.
 */
U_CAPI void U_EXPORT2
uset_setSerializedToOne(USerializedSet* fillSet, UChar32 c) {
    if (fillSet == NULL || (uint32_t)c > 0x10ffff) {
        return;
    }

    fillSet->array = fillSet->staticArray;

    if (c > 0xffff) {
        /* supplementary start: two units */
        fillSet->bmpLength = 0;
        fillSet->staticArray[0] = (uint16_t)(c >> 16);
        fillSet->staticArray[1] = (uint16_t)c;
        if (c == 0x10ffff) {
            /* last code point: no limit stored */
            fillSet->length = 2;
        } else {
            const UChar32 limit = c + 1;
            fillSet->staticArray[2] = (uint16_t)(limit >> 16);
            fillSet->staticArray[3] = (uint16_t)limit;
            fillSet->length = 4;
        }
    } else if (c == 0xffff) {
        /* BMP start, supplementary limit 0x10000 */
        fillSet->bmpLength = 1;
        fillSet->length = 3;
        fillSet->staticArray[0] = 0xffff;
        fillSet->staticArray[1] = 1;
        fillSet->staticArray[2] = 0;
    } else {
        /* start and limit both fit into single BMP units */
        fillSet->bmpLength = fillSet->length = 2;
        fillSet->staticArray[0] = (uint16_t)c;
        fillSet->staticArray[1] = (uint16_t)(c + 1);
    }
}
/**
 * Tests whether a serialized set contains the code point c.
 *
 * The array holds ascending range limits; c is inside the set when an odd
 * number of limits is less than or equal to c.
 *
 * @return FALSE if set is NULL or c is outside 0..0x10ffff, otherwise the
 *         membership result
 */
U_CAPI UBool U_EXPORT2
uset_serializedContains(const USerializedSet* set, UChar32 c) {
    if (set == NULL || (uint32_t)c > 0x10ffff) {
        return FALSE;
    }

    const uint16_t* units = set->array;
    const int32_t bmpLength = set->bmpLength;

    if (c <= 0xffff) {
        /* scan the single-unit (BMP) limits */
        const uint16_t unit = (uint16_t)c;
        int32_t passed = 0;
        while (passed < bmpLength && unit >= units[passed]) {
            ++passed;
        }
        return (UBool)(passed & 1);
    }

    /* scan the two-unit (supplementary) limits */
    const int32_t length = set->length;
    const uint16_t lead = (uint16_t)(c >> 16);
    const uint16_t trail = (uint16_t)c;
    int32_t pos = bmpLength;
    while (pos < length &&
           (lead > units[pos] || (lead == units[pos] && trail >= units[pos + 1]))) {
        pos += 2;
    }
    /*
     * Each supplementary limit takes two units while each BMP limit takes one;
     * adding bmpLength again counts the BMP limits as pairs too, so bit 1 of
     * the sum is the parity of the total number of limits passed.
     */
    return (UBool)(((pos + bmpLength) & 2) != 0);
}
/**
 * Returns the number of ranges in a serialized set, or 0 if set is NULL.
 *
 * The count is half the number of range limits, rounded up; BMP limits take
 * one unit each and the remaining limits take two units each.
 */
U_CAPI int32_t U_EXPORT2
uset_getSerializedRangeCount(const USerializedSet* set) {
    if (set == NULL) {
        return 0;
    }
    const int32_t bmpLimits = set->bmpLength;
    const int32_t supplementaryLimits = (set->length - bmpLimits) / 2;
    return (bmpLimits + supplementaryLimits + 1) / 2;
}
/**
 * Retrieves one range of a serialized set.
 *
 * @param set        the serialized set
 * @param rangeIndex zero-based index of the range
 * @param pStart     receives the first code point of the range
 * @param pEnd       receives the last code point of the range (the stored
 *                   limit minus one, or 0x10ffff if no limit is stored)
 * @return TRUE if the range exists; FALSE if an argument is NULL,
 *         rangeIndex is negative, or rangeIndex is past the last range
 */
U_CAPI UBool U_EXPORT2
uset_getSerializedRange(const USerializedSet* set, int32_t rangeIndex,
                        UChar32* pStart, UChar32* pEnd) {
    if (set == NULL || rangeIndex < 0 || pStart == NULL || pEnd == NULL) {
        return FALSE;
    }

    const uint16_t* units = set->array;
    const int32_t length = set->length;
    const int32_t bmpLength = set->bmpLength;
    int32_t pos = rangeIndex * 2; /* address start/limit pairs */
    UChar32 limit;

    if (pos < bmpLength) {
        /* the range starts in the BMP part */
        *pStart = units[pos];
        ++pos;
        if (pos < bmpLength) {
            limit = units[pos];
        } else if (pos < length) {
            /* the limit is the first supplementary value */
            limit = (((int32_t)units[pos]) << 16) | units[pos + 1];
        } else {
            limit = 0x110000;
        }
    } else {
        /* the range starts in the supplementary part */
        const int32_t supplementaryLength = length - bmpLength;
        pos = (pos - bmpLength) * 2; /* address pairs of pairs of units */
        if (pos >= supplementaryLength) {
            return FALSE;
        }
        units += bmpLength;
        *pStart = (((int32_t)units[pos]) << 16) | units[pos + 1];
        pos += 2;
        if (pos < supplementaryLength) {
            limit = (((int32_t)units[pos]) << 16) | units[pos + 1];
        } else {
            limit = 0x110000;
        }
    }

    /* stored limits are exclusive; report an inclusive end */
    *pEnd = limit - 1;
    return TRUE;
}
// TODO The old, internal uset.c had an efficient uset_containsOne function.
// Returned the one and only code point, or else -1 or something.
// Consider adding such a function to both C and C++ UnicodeSet/uset.
// See tools/gennorm/store.c for usage, now usetContainsOne there.
// TODO Investigate incorporating this code into UnicodeSet to improve
// efficiency.
// ---
// #define USET_GROW_DELTA 20
//
// static U_INLINE int32_t
// findChar(const UChar32* array, int32_t length, UChar32 c) {
// int32_t i;
//
// /* check the last range limit first for more efficient appending */
// if(length>0) {
// if(c>=array[length-1]) {
// return length;
// }
//
// /* do not check the last range limit again in the loop below */
// --length;
// }
//
// for(i=0; i<length && c>=array[i]; ++i) {}
// return i;
// }
//
// static UBool
// addRemove(USet* set, UChar32 c, int32_t doRemove) {
// int32_t i, length, more;
//
// if(set==NULL || (uint32_t)c>0x10ffff) {
// return FALSE;
// }
//
// length=set->length;
// i=findChar(set->array, length, c);
// if((i&1)^doRemove) {
// /* c is already in the set */
// return TRUE;
// }
//
// /* how many more array items do we need? */
// if(i<length && (c+1)==set->array[i]) {
// /* c is just before the following range, extend that in-place by one */
// set->array[i]=c;
// if(i>0) {
// --i;
// if(c==set->array[i]) {
// /* the previous range collapsed, remove it */
// set->length=length-=2;
// if(i<length) {
// uprv_memmove(set->array+i, set->array+i+2, (length-i)*4);
// }
// }
// }
// return TRUE;
// } else if(i>0 && c==set->array[i-1]) {
// /* c is just after the previous range, extend that in-place by one */
// if(++c<=0x10ffff) {
// set->array[i-1]=c;
// if(i<length && c==set->array[i]) {
// /* the following range collapsed, remove it */
// --i;
// set->length=length-=2;
// if(i<length) {
// uprv_memmove(set->array+i, set->array+i+2, (length-i)*4);
// }
// }
// } else {
// /* extend the previous range (had limit 0x10ffff) to the end of Unicode */
// set->length=i-1;
// }
// return TRUE;
// } else if(i==length && c==0x10ffff) {
// /* insert one range limit c */
// more=1;
// } else {
// /* insert two range limits c, c+1 */
// more=2;
// }
//
// /* insert <more> range limits */
// if(length+more>set->capacity) {
// /* reallocate */
// int32_t newCapacity=set->capacity+set->capacity/2+USET_GROW_DELTA;
// UChar32* newArray=(UChar32* )uprv_malloc(newCapacity*4);
// if(newArray==NULL) {
// return FALSE;
// }
// set->capacity=newCapacity;
// uprv_memcpy(newArray, set->array, length*4);
//
// if(set->array!=set->staticBuffer) {
// uprv_free(set->array);
// }
// set->array=newArray;
// }
//
// if(i<length) {
// uprv_memmove(set->array+i+more, set->array+i, (length-i)*4);
// }
// set->array[i]=c;
// if(more==2) {
// set->array[i+1]=c+1;
// }
// set->length+=more;
//
// return TRUE;
// }
//
// U_CAPI UBool U_EXPORT2
// uset_add(USet* set, UChar32 c) {
// return addRemove(set, c, 0);
// }
//
// U_CAPI void U_EXPORT2
// uset_remove(USet* set, UChar32 c) {
// addRemove(set, c, 1);
// }
--- NEW FILE: usetiter.cpp ---
/*
**********************************************************************
* Copyright (c) 2002, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* $Source: /usr/local/cvsroot/icu-sword/source/common/usetiter.cpp,v $
**********************************************************************
*/
#include "unicode/usetiter.h"
#include "unicode/uniset.h"
#include "unicode/unistr.h"
#include "uvector.h"
U_NAMESPACE_BEGIN
// Definition of the static class-ID member declared in the class.
// NOTE(review): presumably only its address serves as the identifier; confirm against usetiter.h.
const char UnicodeSetIterator::fgClassID=0;
/**
 * Constructs an iterator positioned at the start of the given set.
 * @param uSet the set to iterate over; only its address is kept
 */
UnicodeSetIterator::UnicodeSetIterator(const UnicodeSet& uSet) {
    this->set = &uSet;
    reset();
}
/**
 * Constructs an iterator with no set attached; it behaves as an empty
 * iteration until reset(const UnicodeSet&) supplies a set.
 */
UnicodeSetIterator::UnicodeSetIterator()
    : set(NULL)
{
    reset();
}
/**
 * Destructor. This function releases nothing.
 */
UnicodeSetIterator::~UnicodeSetIterator() {
}
/**
 * Advances to the next single element of the set.
 *
 * Code points are delivered one at a time, range by range; after the last
 * range the strings are delivered.
 *
 * @return TRUE if an element was produced, FALSE when the iteration is done.
 * On TRUE, if codepoint == IS_STRING the element is the string referenced by
 * the string field; otherwise codepoint holds the code point (codepointEnd is
 * set to the same value, but callers should not rely on it after this method).
 */
UBool UnicodeSetIterator::next() {
    if (nextElement > endElement) {
        /* current range is used up */
        if (range < endRange) {
            loadRange(++range);
        } else if (nextString < stringCount) {
            codepoint = (UChar32)IS_STRING; // signal that value is actually a string
            string = (const UnicodeString*) set->strings->elementAt(nextString++);
            return TRUE;
        } else {
            return FALSE;
        }
    }
    codepoint = codepointEnd = nextElement++;
    return TRUE;
}
/**
 * Advances to the next range (or string) of the set.
 *
 * The remainder of the current range is delivered in one step, then each
 * following range, and after the last range the strings.
 *
 * @return TRUE if an element was produced, FALSE when the iteration is done.
 * On TRUE, if codepoint == IS_STRING the element is the string referenced by
 * the string field and codepointEnd is left untouched; otherwise the element
 * is the range of code points <codepoint, codepointEnd>.
 */
UBool UnicodeSetIterator::nextRange() {
    if (nextElement > endElement) {
        /* current range is used up */
        if (range < endRange) {
            loadRange(++range);
        } else if (nextString < stringCount) {
            codepoint = (UChar32)IS_STRING; // signal that value is actually a string
            string = (const UnicodeString*) set->strings->elementAt(nextString++);
            return TRUE;
        } else {
            return FALSE;
        }
    }
    /* hand out everything from nextElement through the end of the range */
    codepointEnd = endElement;
    codepoint = nextElement;
    nextElement = endElement+1;
    return TRUE;
}
/**
*@param set the set to iterate over. This allows reuse of the iterator.
*/
void UnicodeSetIterator::reset(const UnicodeSet& uSet) {
this->set = &uSet;
reset();
}
/**
* Resets to the start, to allow the iteration to start over again.
*/
void UnicodeSetIterator::reset() {
if (set == NULL) {
// Set up indices to empty iteration
endRange = -1;
stringCount = 0;
} else {
endRange = set->getRangeCount() - 1;
stringCount = set->strings->size();
}
range = 0;
endElement = -1;
nextElement = 0;
if (endRange >= 0) {
loadRange(range);
}
nextString = 0;
}
/**
 * Makes range number iRange of the set the current one by copying its
 * start into nextElement and its end into endElement.
 */
void UnicodeSetIterator::loadRange(int32_t iRange) {
    nextElement = set->getRangeStart(iRange);
    endElement  = set->getRangeEnd(iRange);
}
U_NAMESPACE_END
//eof
--- NEW FILE: ustrcase.c ---
/*
*******************************************************************************
*
* Copyright (C) 2001-2003, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: ustrcase.c
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2002feb20
* created by: Markus W. Scherer
*
* Implementation file for string casing C API functions.
* Uses functions from uchar.c for basic functionality that requires access
* to the Unicode Character Database (uprops.dat).
*/
#include "unicode/utypes.h"
#include "unicode/ustring.h"
#include "unicode/ubrk.h"
#include "cmemory.h"
#include "unormimp.h"
#include "ustr_imp.h"
/* string casing ------------------------------------------------------------ */
#if !UCONFIG_NO_BREAK_ITERATION
/*
* Internal titlecasing function,
* using u_internalStrToLower() and u_internalToTitle().
*
* Must get titleIter!=NULL.
*
* For each boundary index reported by titleIter (ubrk_first(), then
* ubrk_next()), the text in [prev..index[ is lowercased and the code point
* at index is titlecased.
*
* Returns the sum of the lengths reported by the helpers. Once destIndex
* has reached destCapacity the helpers are called with a NULL/0 buffer and
* the lengths keep accumulating, so the result can exceed destCapacity.
* This function does not itself inspect *pErrorCode; it is only passed on to
* u_internalStrToLower().
*/
U_CFUNC int32_t
u_internalStrToTitle(UChar *dest, int32_t destCapacity,
const UChar *src, int32_t srcLength,
UBreakIterator *titleIter,
const char *locale,
UErrorCode *pErrorCode) {
UCharIterator iter;
UChar32 c;
int32_t prev, index, destIndex, length;
UBool isFirstIndex;
/* set up local variables */
uiter_setString(&iter, src, srcLength);
destIndex=0;
prev=0;
isFirstIndex=TRUE;
/* titlecasing loop */
while(prev<srcLength) {
/* find next index where to titlecase */
if(isFirstIndex) {
isFirstIndex=FALSE;
index=ubrk_first(titleIter);
} else {
index=ubrk_next(titleIter);
}
/* treat "no more boundaries" and out-of-range results as end of text */
if(index==UBRK_DONE || index>srcLength) {
index=srcLength;
}
/* lowercase [prev..index[ */
if(prev<index) {
if(destIndex<destCapacity) {
length=u_internalStrToLower(dest+destIndex, destCapacity-destIndex,
src, srcLength,
prev, index,
locale,
pErrorCode);
} else {
/* destination is full: only count the length */
length=u_internalStrToLower(NULL, 0,
src, srcLength,
prev, index,
locale,
pErrorCode);
}
destIndex+=length;
}
if(index>=srcLength) {
break;
}
/* titlecase the character at the found index */
/* UTF_NEXT_CHAR reads the code point into c and advances index past it */
UTF_NEXT_CHAR(src, index, srcLength, c);
/* position the character iterator at the advanced index */
iter.move(&iter, index, UITER_ZERO);
if(destIndex<destCapacity) {
length=u_internalToTitle(c, &iter,
dest+destIndex, destCapacity-destIndex,
locale);
} else {
/* destination is full: only count the length */
length=u_internalToTitle(c, &iter, NULL, 0, locale);
}
/* a negative result is counted by its magnitude (NOTE(review): meaning of the sign is defined by u_internalToTitle - confirm) */
if(length<0) {
length=-length;
}
destIndex+=length;
prev=index;
}
return destIndex;
}
#endif
/*
* Implement argument checking and buffer handling
* for string case mapping as a common function.
*/
/* Selector values for the toWhichCase parameter of u_strCaseMap(). */
enum {
/* lowercase via u_internalStrToLower() */
TO_LOWER,
/* uppercase via u_internalStrToUpper() */
TO_UPPER,
/* titlecase via u_internalStrToTitle() (only when break iteration is compiled in) */
TO_TITLE,
/* case folding via u_internalStrFoldCase() */
FOLD_CASE
};
/*
* Shared worker behind u_strToLower(), u_strToUpper(), u_strToTitle() and
* u_strFoldCase().
*
* dest, destCapacity: output buffer; dest may be NULL only if destCapacity is 0.
* src, srcLength: input text; srcLength==-1 means NUL-terminated (u_strlen() is used).
* titleIter: break iterator for TO_TITLE; if NULL, one is opened with
* ubrk_open(UBRK_WORD, ...) and closed again before returning.
* locale: passed through to the lower/upper/title helpers.
* options: passed to u_internalStrFoldCase() only.
* toWhichCase: TO_LOWER, TO_UPPER or TO_TITLE; any other value (and TO_TITLE
* when break iteration is compiled out) selects case folding.
* pErrorCode: must be non-NULL and must not indicate a failure on entry.
*
* Returns 0 on argument or allocation errors, otherwise the result of
* u_terminateUChars(dest, destCapacity, destLength, pErrorCode).
*/
static int32_t
u_strCaseMap(UChar *dest, int32_t destCapacity,
const UChar *src, int32_t srcLength,
UBreakIterator *titleIter,
const char *locale,
uint32_t options,
int32_t toWhichCase,
UErrorCode *pErrorCode) {
/* stack buffer used as temporary destination when src and dest overlap */
UChar buffer[300];
UChar *temp;
int32_t destLength;
UBool ownTitleIter;
/* check argument values */
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
return 0;
}
if( destCapacity<0 ||
(dest==NULL && destCapacity>0) ||
src==NULL ||
srcLength<-1
) {
*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
return 0;
}
/* get the string length */
if(srcLength==-1) {
srcLength=u_strlen(src);
}
/* check for overlapping source and destination */
if( dest!=NULL &&
((src>=dest && src<(dest+destCapacity)) ||
(dest>=src && dest<(src+srcLength)))
) {
/* overlap: provide a temporary destination buffer and later copy the result */
if(destCapacity<=(sizeof(buffer)/U_SIZEOF_UCHAR)) {
/* the stack buffer is large enough */
temp=buffer;
} else {
/* allocate a buffer */
temp=(UChar *)uprv_malloc(destCapacity*U_SIZEOF_UCHAR);
if(temp==NULL) {
*pErrorCode=U_MEMORY_ALLOCATION_ERROR;
return 0;
}
}
} else {
/* no overlap: write straight into the caller's buffer */
temp=dest;
}
ownTitleIter=FALSE;
destLength=0;
/* dispatch to the requested mapping */
if(toWhichCase==TO_LOWER) {
destLength=u_internalStrToLower(temp, destCapacity,
src, srcLength,
0, srcLength,
locale, pErrorCode);
} else if(toWhichCase==TO_UPPER) {
destLength=u_internalStrToUpper(temp, destCapacity, src, srcLength,
locale, pErrorCode);
#if !UCONFIG_NO_BREAK_ITERATION
} else if(toWhichCase==TO_TITLE) {
if(titleIter==NULL) {
/* the caller supplied no iterator: open a word iterator over src */
titleIter=ubrk_open(UBRK_WORD, locale,
src, srcLength,
pErrorCode);
/* remember to close it below, but only if opening succeeded */
ownTitleIter=(UBool)U_SUCCESS(*pErrorCode);
}
if(U_SUCCESS(*pErrorCode)) {
destLength=u_internalStrToTitle(temp, destCapacity, src, srcLength,
titleIter, locale, pErrorCode);
}
#endif
} else {
destLength=u_internalStrFoldCase(temp, destCapacity, src, srcLength,
options, pErrorCode);
}
if(temp!=dest) {
/* copy the result string to the destination buffer */
if(destLength>0) {
/* never copy more than the destination can hold */
int32_t copyLength= destLength<=destCapacity ? destLength : destCapacity;
if(copyLength>0) {
uprv_memmove(dest, temp, copyLength*U_SIZEOF_UCHAR);
}
}
/* release the temporary buffer if it came from uprv_malloc() */
if(temp!=buffer) {
uprv_free(temp);
}
}
#if !UCONFIG_NO_BREAK_ITERATION
if(ownTitleIter) {
ubrk_close(titleIter);
}
#endif
return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
}
/*
 * Lowercases src into dest.
 * Delegates to u_strCaseMap() with TO_LOWER, no title iterator and no options.
 */
U_CAPI int32_t U_EXPORT2
u_strToLower(UChar *dest, int32_t destCapacity,
             const UChar *src, int32_t srcLength,
             const char *locale,
             UErrorCode *pErrorCode) {
    UBreakIterator *const noTitleIter=NULL;
    const uint32_t noOptions=0;

    return u_strCaseMap(dest, destCapacity, src, srcLength,
                        noTitleIter, locale, noOptions,
                        TO_LOWER, pErrorCode);
}
/*
 * Uppercases src into dest.
 * Delegates to u_strCaseMap() with TO_UPPER, no title iterator and no options.
 */
U_CAPI int32_t U_EXPORT2
u_strToUpper(UChar *dest, int32_t destCapacity,
             const UChar *src, int32_t srcLength,
             const char *locale,
             UErrorCode *pErrorCode) {
    UBreakIterator *const noTitleIter=NULL;
    const uint32_t noOptions=0;

    return u_strCaseMap(dest, destCapacity, src, srcLength,
                        noTitleIter, locale, noOptions,
                        TO_UPPER, pErrorCode);
}
/*
 * Titlecases src into dest.
 * Delegates to u_strCaseMap() with TO_TITLE and no options; titleIter is
 * passed through as given (it may be NULL).
 */
U_CAPI int32_t U_EXPORT2
u_strToTitle(UChar *dest, int32_t destCapacity,
             const UChar *src, int32_t srcLength,
             UBreakIterator *titleIter,
             const char *locale,
             UErrorCode *pErrorCode) {
    const uint32_t noOptions=0;

    return u_strCaseMap(dest, destCapacity, src, srcLength,
                        titleIter, locale, noOptions,
                        TO_TITLE, pErrorCode);
}
/*
 * Case-folds src into dest.
 * Delegates to u_strCaseMap() with FOLD_CASE, passing options through and
 * supplying neither a title iterator nor a locale.
 */
U_CAPI int32_t U_EXPORT2
u_strFoldCase(UChar *dest, int32_t destCapacity,
              const UChar *src, int32_t srcLength,
              uint32_t options,
              UErrorCode *pErrorCode) {
    UBreakIterator *const noTitleIter=NULL;
    const char *const noLocale=NULL;

    return u_strCaseMap(dest, destCapacity, src, srcLength,
                        noTitleIter, noLocale, options,
                        FOLD_CASE, pErrorCode);
}
/* case-insensitive string comparisons */
/*
 * Compares two strings case-insensitively.
 *
 * Returns 0 without comparing if pErrorCode is NULL or already indicates a
 * failure. Sets U_ILLEGAL_ARGUMENT_ERROR and returns 0 if a string pointer is
 * NULL or a length is less than -1. Otherwise returns the result of
 * unorm_cmpEquivFold() with U_COMPARE_IGNORE_CASE added to options.
 */
U_CAPI int32_t U_EXPORT2
u_strCaseCompare(const UChar *s1, int32_t length1,
                 const UChar *s2, int32_t length2,
                 uint32_t options,
                 UErrorCode *pErrorCode) {
    /* argument checking */
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return 0;
    }
    if(s1!=NULL && length1>=-1 && s2!=NULL && length2>=-1) {
        return unorm_cmpEquivFold(s1, length1, s2, length2,
                                  options|U_COMPARE_IGNORE_CASE,
                                  pErrorCode);
    }
    *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    return 0;
}
/*
 * Case-insensitive comparison of two strings, both passed to
 * unorm_cmpEquivFold() with length -1. The error code of the comparison is
 * kept local and not reported to the caller.
 */
U_CAPI int32_t U_EXPORT2
u_strcasecmp(const UChar *s1, const UChar *s2, uint32_t options) {
    UErrorCode status=U_ZERO_ERROR;
    const uint32_t compareOptions=options|U_COMPARE_IGNORE_CASE;

    return unorm_cmpEquivFold(s1, -1, s2, -1, compareOptions, &status);
}
/*
 * Case-insensitive comparison of two buffers, both passed to
 * unorm_cmpEquivFold() with the same explicit length. The error code of the
 * comparison is kept local and not reported to the caller.
 */
U_CAPI int32_t U_EXPORT2
u_memcasecmp(const UChar *s1, const UChar *s2, int32_t length, uint32_t options) {
    UErrorCode status=U_ZERO_ERROR;
    const uint32_t compareOptions=options|U_COMPARE_IGNORE_CASE;

    return unorm_cmpEquivFold(s1, length, s2, length, compareOptions, &status);
}
/*
 * Case-insensitive comparison with a length limit n applied to both strings.
 * Calls unorm_cmpEquivFold() with U_COMPARE_IGNORE_CASE and _STRNCMP_STYLE
 * added to options. The error code of the comparison is kept local and not
 * reported to the caller.
 */
U_CAPI int32_t U_EXPORT2
u_strncasecmp(const UChar *s1, const UChar *s2, int32_t n, uint32_t options) {
    UErrorCode status=U_ZERO_ERROR;
    const uint32_t compareOptions=options|(U_COMPARE_IGNORE_CASE|_STRNCMP_STYLE);

    return unorm_cmpEquivFold(s1, n, s2, n, compareOptions, &status);
}
--- NEW FILE: ustrenum.cpp ---
/*
**********************************************************************
* Copyright (c) 2002, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Author: Alan Liu
* Created: November 11 2002
* Since: ICU 2.4
**********************************************************************
*/
#include "unicode/ustring.h"
#include "unicode/strenum.h"
#include "uenumimp.h"
#include "ustrenum.h"
#include "cstring.h"
#include "cmemory.h"
/* Yields the 'context' member of the UEnumeration en, cast to StringEnumeration*. */
#define THIS(en) ((StringEnumeration*)(en->context))
U_CDECL_BEGIN
/**
 * UEnumeration 'close' callback: deletes the wrapped StringEnumeration and
 * then frees the UEnumeration structure itself.
 */
static void U_CALLCONV
ustrenum_close(UEnumeration* en) {
    StringEnumeration* wrapped = (StringEnumeration*)(en->context);
    delete wrapped;
    uprv_free(en);
}
/**
 * UEnumeration 'count' callback: forwards to StringEnumeration::count().
 */
static int32_t U_CALLCONV
ustrenum_count(UEnumeration* en,
               UErrorCode* ec)
{
    StringEnumeration* wrapped = (StringEnumeration*)(en->context);
    return wrapped->count(*ec);
}
/**
 * UEnumeration 'unext' callback: forwards to StringEnumeration::unext().
 */
static const UChar* U_CALLCONV
ustrenum_unext(UEnumeration* en,
               int32_t* resultLength,
               UErrorCode* ec)
{
    StringEnumeration* wrapped = (StringEnumeration*)(en->context);
    return wrapped->unext(resultLength, *ec);
}
/**
 * UEnumeration 'next' callback: forwards to StringEnumeration::next().
 */
static const char* U_CALLCONV
ustrenum_next(UEnumeration* en,
              int32_t* resultLength,
              UErrorCode* ec)
{
    StringEnumeration* wrapped = (StringEnumeration*)(en->context);
    return wrapped->next(resultLength, *ec);
}
/**
 * UEnumeration 'reset' callback: forwards to StringEnumeration::reset().
 */
static void U_CALLCONV
ustrenum_reset(UEnumeration* en,
               UErrorCode* ec)
{
    StringEnumeration* wrapped = (StringEnumeration*)(en->context);
    wrapped->reset(*ec);
}
/**
* Pseudo-vtable for UEnumeration wrapper around StringEnumeration.
* The StringEnumeration pointer will be stored in 'context'.
* uenum_openStringEnumeration() copies this template into every new
* UEnumeration and then fills in 'context'.
*/
static const UEnumeration TEMPLATE = {
NULL, // first member is left NULL (NOTE(review): its name is not visible in this file - see uenumimp.h)
NULL, // store StringEnumeration pointer here
ustrenum_close, // deletes the StringEnumeration and frees the wrapper
ustrenum_count, // forwards to StringEnumeration::count()
ustrenum_unext, // forwards to StringEnumeration::unext()
ustrenum_next, // forwards to StringEnumeration::next()
ustrenum_reset // forwards to StringEnumeration::reset()
};
U_CDECL_END
/**
 * Wraps a StringEnumeration in a newly allocated UEnumeration.
 *
 * Ownership of the StringEnumeration passes to this function in every case:
 * on success it is stored in the wrapper's context, and on any failure
 * it is deleted here, so the caller must never delete it.
 *
 * @param adopted the enumeration to wrap; may be NULL, in which case NULL is returned
 * @param ec      in/out error code; set to U_MEMORY_ALLOCATION_ERROR if the
 *                wrapper cannot be allocated
 * @return the wrapper, or NULL if *ec indicated failure on entry, adopted was
 *         NULL, or allocation failed
 */
U_CAPI UEnumeration* U_EXPORT2
uenum_openStringEnumeration(StringEnumeration* adopted, UErrorCode* ec) {
    if (!U_SUCCESS(*ec) || adopted == NULL) {
        delete adopted;
        return NULL;
    }

    UEnumeration* wrapper = (UEnumeration*) uprv_malloc(sizeof(UEnumeration));
    if (wrapper == NULL) {
        *ec = U_MEMORY_ALLOCATION_ERROR;
        delete adopted;
        return NULL;
    }

    /* start from the shared callback table, then attach the enumeration */
    uprv_memcpy(wrapper, &TEMPLATE, sizeof(TEMPLATE));
    wrapper->context = adopted;
    return wrapper;
}
//eof
--- NEW FILE: ustrenum.h ---
/*
**********************************************************************
* Copyright (c) 2002, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Author: Alan Liu
* Created: November 11 2002
* Since: ICU 2.4
**********************************************************************
*/
#ifndef _USTRENUM_H_
#define _USTRENUM_H_
#include "unicode/uenum.h"
#include "unicode/strenum.h"
/**
 * Given a StringEnumeration, wrap it in a UEnumeration. The
 * StringEnumeration is adopted; after this call, the caller must not
 * delete it (regardless of error status).
 *
 * @param adopted the StringEnumeration to wrap; if it is NULL, or if *ec
 *                already indicates a failure, no wrapper is created
 * @param ec      in/out error code; must not be NULL. Set to
 *                U_MEMORY_ALLOCATION_ERROR if the wrapper cannot be allocated.
 * @return the new UEnumeration, or NULL if no wrapper was created
 */
U_CAPI UEnumeration* U_EXPORT2
uenum_openStringEnumeration(StringEnumeration* adopted, UErrorCode* ec);
/* _USTRENUM_H_ */
#endif
/*eof*/
--- NEW FILE: util.cpp ---
/*
**********************************************************************
* Copyright (c) 2001, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
* 11/19/2001 aliu Creation.
**********************************************************************
*/
#include "util.h"
#include "unicode/uchar.h"
#include "unicode/unimatch.h"
#include "uprops.h"
// Define UChar constants using hex for EBCDIC compatibility
// (character literals would take the value of the compiler's native charset,
// whereas these must always be the Unicode/ASCII code units).
static const UChar BACKSLASH = 0x005C; /*\*/
static const UChar UPPER_U = 0x0055; /*U*/
static const UChar LOWER_U = 0x0075; /*u*/
static const UChar APOSTROPHE = 0x0027; // '\''
static const UChar SPACE = 0x0020; // ' '
// "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
// Digit characters indexed by digit value (0..35); used by appendNumber()
// for every radix from 2 through 36.
static const UChar DIGITS[] = {
48,49,50,51,52,53,54,55,56,57,
65,66,67,68,69,70,71,72,73,74,
75,76,77,78,79,80,81,82,83,84,
85,86,87,88,89,90
};
/**
 * Append a number to the given UnicodeString in the given radix.
 * Digits '0'-'9' are used, followed by 'A'-'Z' for radices above 10.
 *
 * @param result    the digits of the number are appended here
 * @param n         the number to convert; may be negative, in which case a
 *                  '-' is emitted before the digits. The most negative
 *                  int32_t value is handled correctly.
 * @param radix     a radix from 2 to 36 inclusive; for any other value a
 *                  single '?' is appended instead of a number
 * @param minDigits the minimum number of digits (not counting any '-') to
 *                  produce; shorter numbers are left-padded with '0'. At
 *                  least one digit is always emitted.
 * @return a reference to result
 */
UnicodeString& ICU_Utility::appendNumber(UnicodeString& result, int32_t n,
                                         int32_t radix, int32_t minDigits) {
    if (radix < 2 || radix > 36) {
        // Bogus radix
        return result.append((UChar)63/*?*/);
    }

    // Work on the magnitude as an unsigned value. Negating the most
    // negative int32_t is not representable in int32_t (the old "n = -n"
    // left n negative and then indexed DIGITS[] out of bounds), whereas
    // unsigned subtraction is well defined and yields 0x80000000.
    uint32_t magnitude = (uint32_t)n;
    if (n < 0) {
        magnitude = 0 - magnitude;
        result.append((UChar)45/*-*/);
    }
    const uint32_t base = (uint32_t)radix;

    // Find the place value of the most significant digit, and use up one
    // unit of minDigits for every digit beyond the first. 'place' is at
    // most 2^31 (magnitude 2^31, radix 2), which fits in uint32_t.
    uint32_t place = 1;
    for (uint32_t rest = magnitude; rest >= base; rest /= base) {
        place *= base;
        --minDigits;
    }

    // Left-pad with zeros up to the requested minimum width
    while (--minDigits > 0) {
        result.append(DIGITS[0]);
    }

    // Emit the digits, most significant first
    while (place > 0) {
        uint32_t digit = magnitude / place;
        result.append(DIGITS[digit]);
        magnitude -= digit * place;
        place /= base;
    }
    return result;
}
// Upper-case hexadecimal digit characters indexed by nibble value (0..15),
// given as code unit values rather than character literals; used by
// escapeUnprintable().
static const UChar HEX[16] = {48,49,50,51,52,53,54,55, // 0-7
56,57,65,66,67,68,69,70}; // 8-9 A-F
/**
 * Tell whether a code point must be escaped to be shown safely.
 * Everything is considered unprintable except line feed (U+000A) and the
 * printable ASCII range U+0020..U+007E.
 * @param c the code point to test
 * @return true if c is NOT printable ASCII (and not line feed)
 */
UBool ICU_Utility::isUnprintable(UChar32 c) {
    const UBool isLineFeed = (c == 0x0A);
    const UBool outsidePrintableAscii = (c < 0x20 || c > 0x7E);
    return !isLineFeed && outsidePrintableAscii;
}
/**
 * Append an escape sequence for c if (and only if) c is unprintable
 * according to isUnprintable().
 *
 * Code points that fit in 16 bits are written as a backslash, a lower-case
 * 'u' and four upper-case hex digits; anything with bits set above the low
 * 16 is written as a backslash, an upper-case 'U' and eight hex digits.
 *
 * @param result receives the escape sequence, if one is produced
 * @param c      the code point to examine
 * @return TRUE if an escape was appended, FALSE if c is printable and
 *         result was left untouched
 */
UBool ICU_Utility::escapeUnprintable(UnicodeString& result, UChar32 c) {
    if (!isUnprintable(c)) {
        return FALSE;
    }

    result.append(BACKSLASH);

    // Choose the escape letter and the bit position of the first nibble.
    int32_t shift;
    if (c & ~0xFFFF) {
        result.append(UPPER_U);
        shift = 28;     // eight hex digits
    } else {
        result.append(LOWER_U);
        shift = 12;     // four hex digits
    }

    // Emit the nibbles from most to least significant.
    for (; shift >= 0; shift -= 4) {
        result.append(HEX[0xF & (c >> shift)]);
    }
    return TRUE;
}
/**
 * Find the first occurrence of a character in text[start, limit) that is
 * neither escaped nor quoted.
 *
 * A backslash hides the code unit that follows it, and text between a pair
 * of apostrophes is skipped entirely. For example, in the string
 * "abc'hide'h" a search for 'h' skips the one inside "hide" and finds the
 * final one.
 *
 * @param text       text to be searched
 * @param start      first index to examine (inclusive)
 * @param limit      index at which to stop (exclusive)
 * @param charToFind the code unit to look for
 * @return the index of the first visible occurrence, or -1 if there is none
 */
int32_t ICU_Utility::quotedIndexOf(const UnicodeString& text,
                                   int32_t start, int32_t limit,
                                   UChar charToFind) {
    int32_t index = start;
    while (index < limit) {
        const UChar unit = text.charAt(index);
        if (unit == BACKSLASH) {
            // Step over the escaped code unit as well
            ++index;
        } else if (unit == APOSTROPHE) {
            // Move to the closing apostrophe (or to limit if unterminated)
            do {
                ++index;
            } while (index < limit && text.charAt(index) != APOSTROPHE);
        } else if (unit == charToFind) {
            return index;
        }
        ++index;
    }
    return -1;
}
/**
 * Locate the end of a run of zero or more rule white space characters
 * (as defined by uprv_isRuleWhiteSpace()) beginning at pos.
 * @param str     the string to scan
 * @param pos     index at which scanning starts; updated only if
 *                'advance' is true
 * @param advance if true, pos is set to the returned index; otherwise pos
 *                is left unchanged
 * @return the index of the first non-white-space character at or after
 *         pos, or str.length() if there is none
 */
int32_t ICU_Utility::skipWhitespace(const UnicodeString& str, int32_t& pos,
                                    UBool advance) {
    int32_t cursor = pos;
    for (;;) {
        if (cursor >= str.length()) {
            break;
        }
        const UChar32 cp = str.char32At(cursor);
        if (!uprv_isRuleWhiteSpace(cp)) {
            break;
        }
        // Step over the whole code point (one or two code units)
        cursor += UTF_CHAR_LENGTH(cp);
    }
    if (advance) {
        pos = cursor;
    }
    return cursor;
}
/**
* Skip over whitespace in a Replaceable. Whitespace is defined by
* uprv_isRuleWhiteSpace(). Skipping may be done in the forward or
* reverse direction. In either case, the leftmost index will be
* inclusive, and the rightmost index will be exclusive. That is,
* given a range defined as [start, limit), the call
* skipWhitespace(text, start, limit) will advance start past leading
* whitespace, whereas the call skipWhitespace(text, limit, start),
* will back up limit past trailing whitespace.
* @param text the text to be analyzed
* @param pos either the start or limit of a range of 'text', to skip
* leading or trailing whitespace, respectively
* @param stop either the limit or start of a range of 'text', to skip
* leading or trailing whitespace, respectively
* @return the new start or limit, depending on what was passed in to
* 'pos'
*/
//?FOR FUTURE USE. DISABLE FOR NOW for coverage reasons.
//?int32_t ICU_Utility::skipWhitespace(const Replaceable& text,
//? int32_t pos, int32_t stop) {
//? UChar32 c;
//? UBool isForward = (stop >= pos);
//?
//? if (!isForward) {
//? --pos; // pos is a limit, so back up by one
//? }
//?
//? while (pos != stop &&
//? uprv_isRuleWhiteSpace(c = text.char32At(pos))) {
//? if (isForward) {
//? pos += UTF_CHAR_LENGTH(c);
//? } else {
//? pos -= UTF_CHAR_LENGTH(c);
//? }
//? }
//?
//? if (!isForward) {
//? ++pos; // make pos back into a limit
//? }
//?
//? return pos;
//?}
/**
 * Parse one specific non-whitespace code unit, allowing any amount of rule
 * white space in front of it.
 * @param id  the string to be parsed
 * @param pos INPUT-OUTPUT parameter. On input, the offset of the first
 *            character to be parsed. On success, the index just after
 *            'ch'. On failure, pos is restored to its input value.
 * @param ch  the non-whitespace character to be parsed
 * @return TRUE if 'ch' was found after zero or more white space
 *         characters, FALSE otherwise
 */
UBool ICU_Utility::parseChar(const UnicodeString& id, int32_t& pos, UChar ch) {
    const int32_t saved = pos;
    skipWhitespace(id, pos, TRUE);
    if (pos != id.length() && id.charAt(pos) == ch) {
        ++pos;
        return TRUE;
    }
    // Not found: undo the white space skipping
    pos = saved;
    return FALSE;
}
/**
 * Match 'rule', starting at offset pos, against a small parsing pattern.
 *
 * Pattern characters are interpreted as follows:
 *   ' '   one required white space character followed by any number of
 *         further white space characters
 *   '~'   zero or more optional white space characters
 *   '#'   an integer, parsed with parseInteger() and stored in the next
 *         free slot of parsedInts
 *   other a literal; the rule character is lower-cased with u_tolower()
 *         before comparison, so the pattern must contain only lowercase
 *         characters
 *
 * @param rule       the text to be parsed
 * @param pos        offset of the first character of 'rule' to parse
 * @param limit      offset after the last character of 'rule' to parse
 * @param pattern    the parsing pattern described above
 * @param parsedInts array to receive parsed integers. The caller must
 *                   ensure that its capacity is >= the number of '#'
 *                   signs in 'pattern'.
 * @return the position after the last character parsed, or -1 if the
 *         parse failed
 */
int32_t ICU_Utility::parsePattern(const UnicodeString& rule, int32_t pos, int32_t limit,
                                  const UnicodeString& pattern, int32_t* parsedInts) {
    // TODO Update this to handle surrogates
    int32_t intCount = 0; // number of integers parsed
    for (int32_t i=0; i<pattern.length(); ++i) {
        const UChar cpat = pattern.charAt(i);

        if (cpat == 32 /*' '*/ || cpat == 126 /*'~'*/) {
            if (cpat == 32 /*' '*/) {
                // A space demands at least one white space character
                if (pos >= limit) {
                    return -1;
                }
                UChar c = rule.charAt(pos++);
                if (!uprv_isRuleWhiteSpace(c)) {
                    return -1;
                }
            }
            // Both ' ' and '~' then swallow any further white space
            pos = skipWhitespace(rule, pos);
        } else if (cpat == 35 /*'#'*/) {
            int32_t p = pos;
            parsedInts[intCount++] = parseInteger(rule, p, limit);
            if (p == pos) {
                // Syntax error; failed to parse integer
                return -1;
            }
            pos = p;
        } else {
            // Literal, matched case-insensitively
            if (pos >= limit) {
                return -1;
            }
            UChar c = (UChar) u_tolower(rule.charAt(pos++));
            if (c != cpat) {
                return -1;
            }
        }
    }
    return pos;
}
/**
 * Parse a pattern string within the given Replaceable and a parsing
 * pattern. Characters are matched literally and case-sensitively
 * except for the following special characters:
 *
 * ~ zero or more uprv_isRuleWhiteSpace chars
 *
 * If end of pattern is reached with all matches along the way,
 * the first unparsed index is returned ('index' is passed by value, so
 * the caller's variable is not modified).
 * Otherwise -1 is returned.
 *
 * NOTE(review): success is only reported from inside the loop, i.e. while
 * there is still text before 'limit'. If the text runs out while the
 * current pattern character is a trailing '~', -1 is returned even though
 * '~' may match zero characters - confirm that callers expect this.
 *
 * @param pat pattern that controls parsing
 * @param text text to be parsed, starting at index
 * @param index offset to first character to parse
 * @param limit offset after last character to parse
 * @return index after last parsed character, or -1 on parse failure.
 */
int32_t ICU_Utility::parsePattern(const UnicodeString& pat,
const Replaceable& text,
int32_t index,
int32_t limit) {
int32_t ipat = 0;
// empty pattern matches immediately
if (ipat == pat.length()) {
return index;
}
// cpat is the pattern code point currently being matched
UChar32 cpat = pat.char32At(ipat);
while (index < limit) {
UChar32 c = text.char32At(index);
// parse \s*
if (cpat == 126 /*~*/) {
if (uprv_isRuleWhiteSpace(c)) {
// stay on '~' and consume the white space character
index += UTF_CHAR_LENGTH(c);
continue;
} else {
// '~' is finished; step the pattern forward without consuming c
if (++ipat == pat.length()) {
return index; // success; c unparsed
}
// fall thru; process c again with next cpat
}
}
// parse literal
else if (c == cpat) {
index += UTF_CHAR_LENGTH(c);
ipat += UTF_CHAR_LENGTH(cpat);
if (ipat == pat.length()) {
return index; // success; c parsed
}
// fall thru; get next cpat
}
// match failure of literal
else {
return -1;
}
// reload the pattern code point after ipat has moved
cpat = pat.char32At(ipat);
}
return -1; // text ended before end of pat
}
// NUL-terminated hex prefix; parseInteger() compares against it with case
// folding, so both "0x" and "0X" are recognized.
static const UChar ZERO_X[] = {48, 120, 0}; // "0x"
/**
 * Parse an integer at pos, either of the form \d+ or of the form
 * 0x[0-9A-Fa-f]+ or 0[0-7]+, that is, in standard decimal, hex,
 * or octal format.
 *
 * A leading "0x"/"0X" selects hexadecimal; otherwise a leading '0' selects
 * octal (and counts as a digit itself, so a lone "0" parses as zero);
 * otherwise the number is decimal.
 *
 * @param rule  the text to be parsed
 * @param pos   INPUT-OUTPUT parameter. On input, the first
 *              character to parse. On output, the character after the last
 *              parsed character. If no digit could be parsed, or if the
 *              value overflows, pos is left unchanged.
 * @param limit offset after the last character of 'rule' that may be
 *              examined
 * @return the parsed non-negative value; 0 if nothing was parsed or if the
 *         number does not fit into a signed 32-bit integer
 */
int32_t ICU_Utility::parseInteger(const UnicodeString& rule, int32_t& pos, int32_t limit) {
    int32_t count = 0;
    int32_t value = 0;
    int32_t p = pos;
    int8_t radix = 10;

    // Only treat "0x" as a prefix if both code units lie before limit.
    if (limit - p >= 2 &&
        0 == rule.caseCompare(p, 2, ZERO_X, U_FOLD_CASE_DEFAULT)) {
        p += 2;
        radix = 16;
    } else if (p < limit && rule.charAt(p) == 48 /*0*/) {
        p++;
        count = 1;
        radix = 8;
    }

    while (p < limit) {
        int32_t d = u_digit(rule.charAt(p), radix);
        if (d < 0) {
            break;
        }
        ++p;
        ++count;
        // Check for overflow BEFORE multiplying: signed overflow is
        // undefined, and a wrapped result is not always smaller than the
        // previous value. The old "v <= value" test also rejected valid
        // zero digits such as "0x0" or "0x00FF".
        if (value > (0x7FFFFFFF - d) / radix) {
            return 0;
        }
        value = (value * radix) + d;
    }
    if (count > 0) {
        pos = p;
    }
    return value;
}
/**
 * Parse a Unicode identifier from the given string at the given
 * position: one u_isIDStart() code point followed by any number of
 * u_isIDPart() code points.
 * @param str the string to parse
 * @param pos INPUT-OUTPUT parameter. On INPUT, pos is the first character
 *            to examine. It is expected to be less than str.length() and
 *            not to point to a white space character, i.e.
 *            !uprv_isRuleWhiteSpace(str.char32At(pos)). On OUTPUT, the
 *            position after the last parsed character; if the character
 *            at pos cannot start an identifier, pos is left unchanged.
 * @return the Unicode identifier, or an empty string if there is
 *         no valid identifier at pos
 */
UnicodeString ICU_Utility::parseUnicodeIdentifier(const UnicodeString& str, int32_t& pos) {
    // assert(pos < str.length());
    // assert(!uprv_isRuleWhiteSpace(str.char32At(pos)));
    UnicodeString ident;
    int cursor = pos;
    while (cursor < str.length()) {
        const UChar32 cp = str.char32At(cursor);
        const UBool atStart = (ident.length() == 0);
        if (atStart && !u_isIDStart(cp)) {
            // Not an identifier at all: report "none" without moving pos.
            // (ident is still empty here.)
            return ident;
        }
        if (!atStart && !u_isIDPart(cp)) {
            break;
        }
        ident.append(cp);
        cursor += UTF_CHAR_LENGTH(cp);
    }
    pos = cursor;
    return ident;
}
/**
 * Parse an unsigned 31-bit integer at the given offset. Use
 * u_digit() to parse individual code points into digits.
 * @param text the text to be parsed
 * @param pos INPUT-OUTPUT parameter. On entry, pos is the
 * offset within text at which to start parsing; it should point
 * to a valid digit. On exit, pos is the offset after the last
 * parsed character. If the parse failed, it will be unchanged on
 * exit. Must be >= 0 on entry.
 * @param radix the radix in which to parse; must be >= 2 and <=
 * 36. Any other radix is reported as a parse failure.
 * @return a non-negative parsed number, or -1 upon parse failure.
 * Parse fails if there are no digits, that is, if pos does not
 * point to a valid digit on entry, or if the number to be parsed
 * does not fit into a 31-bit unsigned integer.
 */
int32_t ICU_Utility::parseNumber(const UnicodeString& text,
                                 int32_t& pos, int8_t radix) {
    // assert(pos >= 0);
    if (radix < 2 || radix > 36) {
        // Also protects the division in the overflow check below
        return -1;
    }
    int32_t n = 0;
    int32_t p = pos;
    while (p < text.length()) {
        UChar32 ch = text.char32At(p);
        int32_t d = u_digit(ch, radix);
        if (d < 0) {
            break;
        }
        // Detect overflow before it happens; signed overflow is undefined
        // behavior and a wrapped value is not guaranteed to be negative.
        if (n > (0x7FFFFFFF - d) / radix) {
            return -1;
        }
        n = radix*n + d;
        // Advance by the whole code point. char32At() returns the full
        // supplementary code point for either half of a surrogate pair,
        // so stepping by a single code unit would count such a digit twice.
        p += UTF_CHAR_LENGTH(ch);
    }
    if (p == pos) {
        return -1;
    }
    pos = p;
    return n;
}
/**
 * Append a character to a rule that is being built up. To flush
 * the quoteBuf to rule, make one final call with isLiteral == TRUE.
 * If there is no final character, pass in (UChar32)-1 as c.
 *
 * Exactly one of four things happens on each call:
 * 1. c is a literal, or an unprintable that must be escaped: any pending
 *    quoteBuf content is flushed to rule (in quotes), then c is emitted.
 * 2. c is an apostrophe or backslash and no quote is pending: it is
 *    emitted with a backslash escape.
 * 3. A quote is pending, or c is a special printable ASCII character or
 *    rule white space: c is accumulated in quoteBuf.
 * 4. Otherwise c is appended to rule as is.
 *
 * @param rule the string to append the character to
 * @param c the character to append, or (UChar32)-1 if none.
 * @param isLiteral if true, then the given character should not be
 * quoted or escaped. Usually this means it is a syntactic element
 * such as > or $
 * @param escapeUnprintable if true, then unprintable characters
 * should be escaped using \uxxxx or \Uxxxxxxxx. These escapes will
 * appear outside of quotes.
 * @param quoteBuf a buffer which is used to build up quoted
 * substrings. The caller should initially supply an empty buffer,
 * and thereafter should not modify the buffer. The buffer should be
 * cleared out by, at the end, calling this method with a literal
 * character.
 */
void ICU_Utility::appendToRule(UnicodeString& rule,
UChar32 c,
UBool isLiteral,
UBool escapeUnprintable,
UnicodeString& quoteBuf) {
// If we are escaping unprintables, then escape them outside
// quotes. \u and \U are not recognized within quotes. The same
// logic applies to literals, but literals are never escaped.
if (isLiteral ||
(escapeUnprintable && ICU_Utility::isUnprintable(c))) {
if (quoteBuf.length() > 0) {
// We prefer backslash APOSTROPHE to double APOSTROPHE
// (more readable, less similar to ") so if there are
// double APOSTROPHEs at the ends, we pull them outside
// of the quote.
// If the first thing in the quoteBuf is APOSTROPHE
// (doubled) then pull it out.
while (quoteBuf.length() >= 2 &&
quoteBuf.charAt(0) == APOSTROPHE &&
quoteBuf.charAt(1) == APOSTROPHE) {
rule.append(BACKSLASH).append(APOSTROPHE);
quoteBuf.remove(0, 2);
}
// If the last thing in the quoteBuf is APOSTROPHE
// (doubled) then remove and count it and add it after.
int32_t trailingCount = 0;
while (quoteBuf.length() >= 2 &&
quoteBuf.charAt(quoteBuf.length()-2) == APOSTROPHE &&
quoteBuf.charAt(quoteBuf.length()-1) == APOSTROPHE) {
quoteBuf.truncate(quoteBuf.length()-2);
++trailingCount;
}
// Whatever remains (if anything) is emitted inside one pair of
// apostrophes, and the buffer is emptied for the next quote.
if (quoteBuf.length() > 0) {
rule.append(APOSTROPHE);
rule.append(quoteBuf);
rule.append(APOSTROPHE);
quoteBuf.truncate(0);
}
// Re-emit the trailing apostrophes that were pulled out above.
while (trailingCount-- > 0) {
rule.append(BACKSLASH).append(APOSTROPHE);
}
}
// (UChar32)-1 means "flush only": there is no character to emit.
if (c != (UChar32)-1) {
/* Since spaces are ignored during parsing, they are
* emitted only for readability. We emit one here
* only if there isn't already one at the end of the
* rule.
*/
if (c == SPACE) {
// Note: a space is also dropped when rule is still empty.
int32_t len = rule.length();
if (len > 0 && rule.charAt(len-1) != c) {
rule.append(c);
}
// escapeUnprintable() appends the escape itself and returns TRUE;
// c is appended verbatim only when no escape was written.
} else if (!escapeUnprintable || !ICU_Utility::escapeUnprintable(rule, c)) {
rule.append(c);
}
}
}
// Escape ' and '\' and don't begin a quote just for them
else if (quoteBuf.length() == 0 &&
(c == APOSTROPHE || c == BACKSLASH)) {
rule.append(BACKSLASH);
rule.append(c);
}
// Specials (printable ascii that isn't [0-9a-zA-Z]) and
// whitespace need quoting. Also append stuff to quotes if we are
// building up a quoted substring already.
else if (quoteBuf.length() > 0 ||
(c >= 0x0021 && c <= 0x007E &&
!((c >= 0x0030/*'0'*/ && c <= 0x0039/*'9'*/) ||
(c >= 0x0041/*'A'*/ && c <= 0x005A/*'Z'*/) ||
(c >= 0x0061/*'a'*/ && c <= 0x007A/*'z'*/))) ||
uprv_isRuleWhiteSpace(c)) {
quoteBuf.append(c);
// Double ' within a quote
if (c == APOSTROPHE) {
quoteBuf.append(c);
}
}
// Otherwise just append
else {
rule.append(c);
}
}
/**
 * Append every code unit of 'text' to a rule that is being built up, by
 * calling the single-character appendToRule() once per code unit with the
 * same isLiteral, escapeUnprintable and quoteBuf arguments.
 * @param rule              the string to append to
 * @param text              the text to append, one UChar at a time
 * @param isLiteral         passed through for every code unit
 * @param escapeUnprintable passed through for every code unit
 * @param quoteBuf          the shared quoting buffer
 */
void ICU_Utility::appendToRule(UnicodeString& rule,
                               const UnicodeString& text,
                               UBool isLiteral,
                               UBool escapeUnprintable,
                               UnicodeString& quoteBuf) {
    int32_t index = 0;
    while (index < text.length()) {
        appendToRule(rule, text[index], isLiteral, escapeUnprintable, quoteBuf);
        ++index;
    }
}
/**
 * Append the pattern of a matcher, as a literal, to a rule that is being
 * built up. A null matcher is allowed and appends nothing.
 * @param rule              the string to append to
 * @param matcher           the matcher whose toPattern() text is appended;
 *                          may be NULL
 * @param escapeUnprintable passed to toPattern() and to appendToRule()
 * @param quoteBuf          the shared quoting buffer
 */
void ICU_Utility::appendToRule(UnicodeString& rule,
                               const UnicodeMatcher* matcher,
                               UBool escapeUnprintable,
                               UnicodeString& quoteBuf) {
    if (matcher == NULL) {
        return;
    }
    UnicodeString pat;
    const UnicodeString& pattern = matcher->toPattern(pat, escapeUnprintable);
    appendToRule(rule, pattern, TRUE, escapeUnprintable, quoteBuf);
}
//eof
--- NEW FILE: util.h ---
/*
**********************************************************************
* Copyright (c) 2001, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
* 11/19/2001 aliu Creation.
**********************************************************************
*/
#ifndef ICU_UTIL_H
#define ICU_UTIL_H
#include "unicode/utypes.h"
#include "unicode/uobject.h"
#include "unicode/unistr.h"
//--------------------------------------------------------------------
// class ICU_Utility
// i18n utility functions, scoped into the class ICU_Utility.
//--------------------------------------------------------------------
U_NAMESPACE_BEGIN
class UnicodeMatcher;
class U_COMMON_API ICU_Utility /* not : public UObject because all methods are static */ {
public:
/**
* Append a number to the given UnicodeString in the given radix.
* Standard digits '0'-'9' are used and letters 'A'-'Z' for
* radices 11 through 36.
* @param result the digits of the number are appended here
* @param n the number to be converted to digits; may be negative.
* If negative, a '-' is prepended to the digits.
* @param radix a radix from 2 to 36 inclusive.
* @param minDigits the minimum number of digits, not including
* any '-', to produce. Values less than 2 have no effect. One
* digit is always emitted regardless of this parameter.
* @return a reference to result
*/
static UnicodeString& appendNumber(UnicodeString& result, int32_t n,
int32_t radix = 10,
int32_t minDigits = 1);
/**
* Return true if the character is NOT printable ASCII.
*
* This method should really be in UnicodeString (or similar). For
* now, we implement it here and share it with friend classes.
*/
static UBool isUnprintable(UChar32 c);
/**
* Escape unprintable characters using \uxxxx notation for U+0000 to
* U+FFFF and \Uxxxxxxxx for U+10000 and above. If the character is
* printable ASCII, then do nothing and return FALSE. Otherwise,
* append the escaped notation and return TRUE.
*/
static UBool escapeUnprintable(UnicodeString& result, UChar32 c);
/**
* Returns the index of a character, ignoring quoted text.
* For example, in the string "abc'hide'h", the 'h' in "hide" will not be
* found by a search for 'h'.
* @param text text to be searched
* @param start the beginning index, inclusive; <code>0 <= start
* <= limit</code>.
* @param limit the ending index, exclusive; <code>start <= limit
* <= text.length()</code>.
* @param c character to search for
* @return Offset of the first instance of c, or -1 if not found.
*/
static int32_t quotedIndexOf(const UnicodeString& text,
int32_t start, int32_t limit,
UChar c);
/**
* Skip over a sequence of zero or more white space characters at pos.
* @param str the string to scan
* @param pos the index at which to start; modified only if advance is true
* @param advance if true, advance pos to the first non-white-space
* character at or after pos, or str.length(), if there is none.
* Otherwise leave pos unchanged.
* @return the index of the first non-white-space character at or
* after pos, or str.length(), if there is none.
*/
static int32_t skipWhitespace(const UnicodeString& str, int32_t& pos,
UBool advance = FALSE);
/**
* Skip over whitespace in a Replaceable. Whitespace is defined by
* uprv_isRuleWhiteSpace(). Skipping may be done in the forward or
* reverse direction. In either case, the leftmost index will be
* inclusive, and the rightmost index will be exclusive. That is,
* given a range defined as [start, limit), the call
* skipWhitespace(text, start, limit) will advance start past leading
* whitespace, whereas the call skipWhitespace(text, limit, start),
* will back up limit past trailing whitespace.
* @param text the text to be analyzed
* @param pos either the start or limit of a range of 'text', to skip
* leading or trailing whitespace, respectively
* @param stop either the limit or start of a range of 'text', to skip
* leading or trailing whitespace, respectively
* @return the new start or limit, depending on what was passed in to
* 'pos'
*/
//?FOR FUTURE USE. DISABLE FOR NOW for coverage reasons.
//? static int32_t skipWhitespace(const Replaceable& text,
//? int32_t pos, int32_t stop);
/**
* Parse a single non-whitespace character 'ch', optionally
* preceded by whitespace.
* @param id the string to be parsed
* @param pos INPUT-OUTPUT parameter. On input, pos is the
* offset of the first character to be parsed. On output, pos
* is the index after the last parsed character. If the parse
* fails, pos will be unchanged.
* @param ch the non-whitespace character to be parsed.
* @return true if 'ch' is seen preceded by zero or more
* whitespace characters.
*/
static UBool parseChar(const UnicodeString& id, int32_t& pos, UChar ch);
/**
* Parse a pattern string starting at offset pos. Keywords are
* matched case-insensitively. Spaces may be skipped and may be
* optional or required. Integer values may be parsed, and if
* they are, they will be returned in the given array. If
* successful, the offset of the next non-space character is
* returned. On failure, -1 is returned.
* @param rule the text to be parsed
* @param pos offset of the first character of 'rule' to parse
* @param limit offset after the last character of 'rule' to parse
* @param pattern must only contain lowercase characters, which
* will match their uppercase equivalents as well. A space
* character matches one or more required spaces. A '~' character
* matches zero or more optional spaces. A '#' character matches
* an integer and stores it in parsedInts, which the caller must
* ensure has enough capacity.
* @param parsedInts array to receive parsed integers. Caller
* must ensure that its capacity is >= the number of '#'
* signs in 'pattern'.
* @return the position after the last character parsed, or -1 if
* the parse failed
*/
static int32_t parsePattern(const UnicodeString& rule, int32_t pos, int32_t limit,
const UnicodeString& pattern, int32_t* parsedInts);
/**
* Parse a pattern string within the given Replaceable and a parsing
* pattern. Characters are matched literally and case-sensitively
* except for the following special characters:
*
* ~ zero or more uprv_isRuleWhiteSpace chars
*
* If end of pattern is reached with all matches along the way,
* the first unparsed index is returned.
* Otherwise -1 is returned.
* @param pat pattern that controls parsing
* @param text text to be parsed, starting at index
* @param index offset to first character to parse
* @param limit offset after last character to parse
* @return index after last parsed character, or -1 on parse failure.
*/
static int32_t parsePattern(const UnicodeString& pat,
const Replaceable& text,
int32_t index,
int32_t limit);
/**
* Parse an integer at pos, either of the form \d+ or of the form
* 0x[0-9A-Fa-f]+ or 0[0-7]+, that is, in standard decimal, hex,
* or octal format.
* @param rule the text to be parsed
* @param pos INPUT-OUTPUT parameter. On input, the first
* character to parse. On output, the character after the last
* parsed character.
* @param limit offset after the last character of 'rule' to parse
* @return the parsed value
*/
static int32_t parseInteger(const UnicodeString& rule, int32_t& pos, int32_t limit);
/**
* Parse a Unicode identifier from the given string at the given
* position. Return the identifier, or an empty string if there
* is no identifier.
* @param str the string to parse
* @param pos INPUT-OUTPUT parameter. On INPUT, pos is the
* first character to examine. It must be less than str.length(),
* and it must not point to a whitespace character. That is, must
* have pos < str.length() and
* !uprv_isRuleWhiteSpace(str.char32At(pos)). On
* OUTPUT, the position after the last parsed character.
* @return the Unicode identifier, or an empty string if there is
* no valid identifier at pos.
*/
static UnicodeString parseUnicodeIdentifier(const UnicodeString& str, int32_t& pos);
/**
* Parse an unsigned 31-bit integer at the given offset. Use
* u_digit() to parse individual characters into digits.
* @param text the text to be parsed
* @param pos INPUT-OUTPUT parameter. On entry, pos is the
* offset within text at which to start parsing; it should point
* to a valid digit. On exit, pos is the offset after the last
* parsed character. If the parse failed, it will be unchanged on
* exit. Must be >= 0 on entry.
* @param radix the radix in which to parse; must be >= 2 and <=
* 36.
* @return a non-negative parsed number, or -1 upon parse failure.
* Parse fails if there are no digits, that is, if pos does not
* point to a valid digit on entry, or if the number to be parsed
* does not fit into a 31-bit unsigned integer.
*/
static int32_t parseNumber(const UnicodeString& text,
int32_t& pos, int8_t radix);
/**
* Append a character to a rule that is being built up. To flush
* the quoteBuf to rule, make one final call with isLiteral == TRUE.
* If there is no final character, pass in (UChar32)-1 as c.
* @param rule the string to append the character to
* @param c the character to append, or (UChar32)-1 if none.
* @param isLiteral if true, then the given character should not be
* quoted or escaped. Usually this means it is a syntactic element
* such as > or $
* @param escapeUnprintable if true, then unprintable characters
* should be escaped using \uxxxx or \Uxxxxxxxx. These escapes will
* appear outside of quotes.
* @param quoteBuf a buffer which is used to build up quoted
* substrings. The caller should initially supply an empty buffer,
* and thereafter should not modify the buffer. The buffer should be
* cleared out by, at the end, calling this method with a literal
* character.
*/
static void appendToRule(UnicodeString& rule,
UChar32 c,
UBool isLiteral,
UBool escapeUnprintable,
UnicodeString& quoteBuf);
/**
* Append a string to a rule that is being built up, by applying the
* single-character appendToRule() to each UChar of text in turn.
* The remaining parameters have the same meaning as for that overload.
*/
static void appendToRule(UnicodeString& rule,
const UnicodeString& text,
UBool isLiteral,
UBool escapeUnprintable,
UnicodeString& quoteBuf);
/**
* Given a matcher reference, which may be null, append its
* pattern as a literal to the given rule.
*/
static void appendToRule(UnicodeString& rule,
const UnicodeMatcher* matcher,
UBool escapeUnprintable,
UnicodeString& quoteBuf);
private:
// do not instantiate
ICU_Utility();
};
U_NAMESPACE_END
#endif
//eof
--- NEW FILE: utrie.c ---
/*
******************************************************************************
*
* Copyright (C) 2001-2003, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
* file name: utrie.c
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2001oct20
* created by: Markus W. Scherer
*
* This is a common implementation of a "folded" trie.
* It is a kind of compressed, serializable table of 16- or 32-bit values associated with
* Unicode code points (0..0x10ffff).
*/
[...1012 lines suppressed...]
}
}
if(j>0) {
prevBlock=-1;
}
prev=c;
prevValue=value;
}
++c;
}
}
} while(++i<offset);
}
++l;
}
/* deliver last range */
enumRange(context, prev, c, prevValue);
}
--- NEW FILE: utrie.h ---
/*
******************************************************************************
*
* Copyright (C) 2001-2003, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
* file name: utrie.h
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2001nov08
* created by: Markus W. Scherer
*/
#ifndef __UTRIE_H__
#define __UTRIE_H__
#include "unicode/utypes.h"
U_CDECL_BEGIN
/**
* \file
*
* This is a common implementation of a "folded" trie.
* It is a kind of compressed, serializable table of 16- or 32-bit values associated with
* Unicode code points (0..0x10ffff).
*
* This implementation is optimized for getting values while walking forward
* through a UTF-16 string.
* Therefore, the simplest and fastest access macros are the
* _FROM_LEAD() and _FROM_OFFSET_TRAIL() macros.
*
* The _FROM_BMP() macros are a little more complicated; they get values
* even for lead surrogate code _points_, while the _FROM_LEAD() macros
* get special "folded" values for lead surrogate code _units_ if
* there is relevant data associated with them.
* From such a folded value, an offset needs to be extracted to supply
* to the _FROM_OFFSET_TRAIL() macros.
*
* Most of the more complex (and more convenient) functions/macros call a callback function
* to get that offset from the folded value for a lead surrogate unit.
*/
/**
 * Trie constants, defining shift widths, index array lengths, etc.
 * All other values are derived from UTRIE_SHIFT and UTRIE_INDEX_SHIFT.
 */
enum {
/** Shift size for shifting right the input index. 1..9 */
UTRIE_SHIFT=5,
/** Number of data values in a stage 2 (data array) block. 2, 4, 8, .., 0x200 */
UTRIE_DATA_BLOCK_LENGTH=1<<UTRIE_SHIFT,
/** Mask for getting the lower bits from the input index. */
UTRIE_MASK=UTRIE_DATA_BLOCK_LENGTH-1,
/**
* Lead surrogate code points' index displacement in the index array.
* 0x10000-0xd800=0x2800
*/
UTRIE_LEAD_INDEX_DISP=0x2800>>UTRIE_SHIFT,
/**
* Shift size for shifting left the index array values.
* Increases possible data size with 16-bit index values at the cost
* of compactability.
* This requires blocks of stage 2 data to be aligned by UTRIE_DATA_GRANULARITY.
* 0..UTRIE_SHIFT
*/
UTRIE_INDEX_SHIFT=2,
/** The alignment size of a stage 2 data block. Also the granularity for compaction. */
UTRIE_DATA_GRANULARITY=1<<UTRIE_INDEX_SHIFT,
/** Number of bits of a trail surrogate that are used in index table lookups. */
UTRIE_SURROGATE_BLOCK_BITS=10-UTRIE_SHIFT,
/**
* Number of index (stage 1) entries per lead surrogate.
* Same as number of index entries for 1024 trail surrogates,
* ==0x400>>UTRIE_SHIFT
*/
UTRIE_SURROGATE_BLOCK_COUNT=(1<<UTRIE_SURROGATE_BLOCK_BITS),
/** Length of the BMP portion of the index (stage 1) array. */
UTRIE_BMP_INDEX_LENGTH=0x10000>>UTRIE_SHIFT
};
/**
 * Length of the index (stage 1) array before folding.
 * Maximum number of Unicode code points (0x110000) shifted right by UTRIE_SHIFT
 * (0x8800 with UTRIE_SHIFT==5).
 */
#define UTRIE_MAX_INDEX_LENGTH (0x110000>>UTRIE_SHIFT)
/**
 * Maximum length of the runtime data (stage 2) array.
 * Limited by 16-bit index values that are left-shifted by UTRIE_INDEX_SHIFT
 * (0x40000 with UTRIE_INDEX_SHIFT==2).
 */
#define UTRIE_MAX_DATA_LENGTH (0x10000<<UTRIE_INDEX_SHIFT)
/**
 * Maximum length of the build-time data (stage 2) array.
 * The maximum length is 0x110000+UTRIE_DATA_BLOCK_LENGTH+0x400.
 * (Number of Unicode code points + one all-initial-value block +
 * possible duplicate entries for 1024 lead surrogates.)
 */
#define UTRIE_MAX_BUILD_TIME_DATA_LENGTH (0x110000+UTRIE_DATA_BLOCK_LENGTH+0x400)
/**
 * Runtime UTrie callback function.
 * Extract from a lead surrogate's data the
 * index array offset of the indexes for that lead surrogate.
 *
 * The surrogate-pair getter macro calls this through UTrie.getFoldingOffset
 * and uses a result > 0 as the index offset for the trail surrogate lookup;
 * for any other result it yields the trie's initialValue.
 *
 * @param data data value for a surrogate from the trie, including the folding offset
 * @return offset>=UTRIE_BMP_INDEX_LENGTH, or 0 if there is no data for the lead surrogate
 */
typedef int32_t U_CALLCONV
UTrieGetFoldingOffset(uint32_t data);
/**
 * Run-time Trie structure.
 *
 * Either the data table is 16 bits wide and accessed via the index
 * pointer, with each index item increased by indexLength;
 * in this case, data32==NULL.
 *
 * Or the data table is 32 bits wide and accessed via the data32 pointer.
 */
struct UTrie {
/* stage 1 (index) array; also holds the data when it is 16 bits wide */
const uint16_t *index;
const uint32_t *data32; /* NULL if 16b data is used via index */
/**
* This function is not used in _FROM_LEAD, _FROM_BMP, and _FROM_OFFSET_TRAIL macros.
* If convenience macros like _GET16 or _NEXT32 are used, this function must be set.
* @see UTrieGetFoldingOffset
*/
UTrieGetFoldingOffset *getFoldingOffset;
/* presumably the number of items in the index and data arrays - confirm against utrie.c */
int32_t indexLength, dataLength;
/* value delivered by the getter macros when there is no data for a code point */
uint32_t initialValue;
/* NOTE(review): meaning not visible in this part of the header - see the trie builder */
UBool isLatin1Linear;
};
typedef struct UTrie UTrie;
/**
 * Internal trie getter from an offset (0 if c16 is a BMP/lead units) and a 16-bit unit.
 *
 * 'data' is not a value but the NAME of the UTrie member that holds the
 * data array; it is substituted textually after (trie)->.
 * The index entry at offset+(c16>>UTRIE_SHIFT) is shifted left by
 * UTRIE_INDEX_SHIFT to give the start of a data block, and the low
 * UTRIE_SHIFT bits of c16 select the value within that block.
 * Note that c16 is evaluated twice.
 */
#define _UTRIE_GET_RAW(trie, data, offset, c16) \
(trie)->data[ \
((int32_t)((trie)->index[(offset)+((c16)>>UTRIE_SHIFT)])<<UTRIE_INDEX_SHIFT)+ \
((c16)&UTRIE_MASK) \
]
/**
 * Internal trie getter from a pair of surrogates.
 *
 * Statement macro (a braced block): first reads the folded value for the
 * lead surrogate c into result, converts it to an index offset with
 * trie->getFoldingOffset (which therefore must be set), and then, if that
 * offset is > 0, reads the real value using the low 10 bits of c2.
 * Otherwise result is set to trie->initialValue cast to resultType.
 */
#define _UTRIE_GET_FROM_PAIR(trie, data, c, c2, result, resultType) { \
int32_t __offset; \
\
/* get data for lead surrogate */ \
(result)=_UTRIE_GET_RAW((trie), data, 0, (c)); \
__offset=(trie)->getFoldingOffset(result); \
\
/* get the real data from the folded lead/trail units */ \
if(__offset>0) { \
(result)=_UTRIE_GET_RAW((trie), data, __offset, (c2)&0x3ff); \
} else { \
(result)=(resultType)((trie)->initialValue); \
} \
}
/**
* Internal trie getter from a BMP code point, treating a lead surrogate as a normal code point.
*
* For c16 in U+d800..U+dbff the index lookup is displaced by UTRIE_LEAD_INDEX_DISP,
* for every other unit the displacement is 0.
*
* Expands to a plain expression (no trailing semicolon), so it can be used
* inside larger expressions as well as in an assignment statement.
* c16 is evaluated more than once; do not pass an expression with side effects.
*
* @param trie pointer to the runtime trie
* @param data name of the trie member to read values from (index or data32)
* @param c16 a BMP code point
*/
#define _UTRIE_GET_FROM_BMP(trie, data, c16) \
    _UTRIE_GET_RAW(trie, data, (0xd800<=(c16) && (c16)<=0xdbff) ? UTRIE_LEAD_INDEX_DISP : 0, (c16))
/**
* Internal trie getter from a code point.
* Could be faster(?) but longer with
* if((c32)<=0xd7ff) { (result)=_UTRIE_GET_RAW(trie, data, 0, c32); }
*
* Expands to an if/else-if/else statement chain (not an expression, and not
* wrapped in an extra block):
* - c32<=0xffff: BMP lookup via _UTRIE_GET_FROM_BMP
* - c32<=0x10ffff: lookup via the lead surrogate of c32 and _UTRIE_GET_FROM_PAIR
* (c32 itself is passed as the trail unit; only its low 10 bits are used there)
* - otherwise: result is set to trie->initialValue
* c32 is compared as uint32_t, so negative values take the out-of-range branch.
*
* @param trie pointer to the runtime trie
* @param data name of the trie member to read values from (index or data32)
* @param c32 the input code point
* @param result (out) variable receiving the value
* @param resultType type that initialValue is cast to
*/
#define _UTRIE_GET(trie, data, c32, result, resultType) \
if((uint32_t)(c32)<=0xffff) { \
/* BMP code points */ \
(result)=_UTRIE_GET_FROM_BMP(trie, data, c32); \
} else if((uint32_t)(c32)<=0x10ffff) { \
/* supplementary code point */ \
UChar __lead16=UTF16_LEAD(c32); \
_UTRIE_GET_FROM_PAIR(trie, data, __lead16, c32, result, resultType); \
} else { \
/* out of range */ \
(result)=(resultType)((trie)->initialValue); \
}
/**
* Internal next-post-increment: get the next code point (c, c2) and its data.
*
* Expands to a braced statement block. Reads c from *src and advances src.
* - c is not a lead surrogate: c2=0, plain lookup with index offset 0
* - c is a lead surrogate, src!=limit and *src is a trail surrogate:
* src is advanced again and the pair is looked up via _UTRIE_GET_FROM_PAIR
* - otherwise (unpaired lead surrogate): c2=0, lookup displaced by UTRIE_LEAD_INDEX_DISP
* When limit is NULL the test (src)!=(limit) holds for any valid src, so the unit
* following a lead surrogate is always read.
*
* @param trie pointer to the runtime trie
* @param data name of the trie member to read values from (index or data32)
* @param src (in/out) source text pointer
* @param limit limit pointer for the text, or NULL
* @param c (out) BMP or lead code unit
* @param c2 (out) 0 or the trail code unit
* @param result (out) variable receiving the value
* @param resultType type used for the initialValue cast in _UTRIE_GET_FROM_PAIR
*/
#define _UTRIE_NEXT(trie, data, src, limit, c, c2, result, resultType) { \
(c)=*(src)++; \
if(!UTF_IS_LEAD(c)) { \
(c2)=0; \
(result)=_UTRIE_GET_RAW((trie), data, 0, (c)); \
} else if((src)!=(limit) && UTF_IS_TRAIL((c2)=*(src))) { \
++(src); \
_UTRIE_GET_FROM_PAIR((trie), data, (c), (c2), (result), resultType); \
} else { \
/* unpaired lead surrogate code point */ \
(c2)=0; \
(result)=_UTRIE_GET_RAW((trie), data, UTRIE_LEAD_INDEX_DISP, (c)); \
} \
}
/**
* Internal previous: get the previous code point (c, c2) and its data.
*
* Expands to a braced statement block. Decrements src and reads c from it.
* - c is not a surrogate: c2=0, plain lookup with index offset 0
* - c is a trail surrogate preceded (src!=start) by a lead surrogate:
* src is decremented again, c and c2 are swapped (result is used as the
* temporary) so that c is the lead and c2 the trail, then the pair is looked up
* - c is an unpaired trail surrogate: c2=0, plain lookup with index offset 0
* - c is a lead surrogate (necessarily unpaired here): c2=0, lookup displaced by
* UTRIE_LEAD_INDEX_DISP
* When start is NULL the test (start)!=(src) holds for any valid src, so the unit
* before a trail surrogate is always read.
*
* @param trie pointer to the runtime trie
* @param data name of the trie member to read values from (index or data32)
* @param start start pointer for the text, or NULL
* @param src (in/out) source text pointer
* @param c (out) BMP or lead code unit
* @param c2 (out) 0 or the trail code unit
* @param result (out) variable receiving the value
* @param resultType type used for the initialValue cast in _UTRIE_GET_FROM_PAIR
*/
#define _UTRIE_PREVIOUS(trie, data, start, src, c, c2, result, resultType) { \
(c)=*--(src); \
if(!UTF_IS_SURROGATE(c)) { \
(c2)=0; \
(result)=_UTRIE_GET_RAW((trie), data, 0, (c)); \
} else if(!UTF_IS_SURROGATE_FIRST(c)) { \
/* trail surrogate */ \
if((start)!=(src) && UTF_IS_LEAD((c2)=*((src)-1))) { \
--(src); \
(result)=(c); (c)=(c2); (c2)=(UChar)(result); /* swap c, c2 */ \
_UTRIE_GET_FROM_PAIR((trie), data, (c), (c2), (result), resultType); \
} else { \
/* unpaired trail surrogate code point */ \
(c2)=0; \
(result)=_UTRIE_GET_RAW((trie), data, 0, (c)); \
} \
} else { \
/* unpaired lead surrogate code point */ \
(c2)=0; \
(result)=_UTRIE_GET_RAW((trie), data, UTRIE_LEAD_INDEX_DISP, (c)); \
} \
}
/* Public UTrie API ---------------------------------------------------------*/
/**
* Get a pointer to the contiguous part of the data array
* for the Latin-1 range (U+0000..U+00ff).
* Must be used only if the Latin-1 range is in fact linear
* (trie->isLatin1Linear).
*
* Expands to a pointer expression: the result is
* indexLength+UTRIE_DATA_BLOCK_LENGTH 16-bit units past trie->index.
*
* @param trie (const UTrie *, in) a pointer to the runtime trie structure
* @return (const uint16_t *) pointer to values for Latin-1 code points
*/
#define UTRIE_GET16_LATIN1(trie) ((trie)->index+(trie)->indexLength+UTRIE_DATA_BLOCK_LENGTH)
/**
* Get a pointer to the contiguous part of the data array
* for the Latin-1 range (U+0000..U+00ff).
* Must be used only if the Latin-1 range is in fact linear
* (trie->isLatin1Linear).
*
* Expands to a pointer expression: the result is
* UTRIE_DATA_BLOCK_LENGTH 32-bit units past trie->data32.
*
* @param trie (const UTrie *, in) a pointer to the runtime trie structure
* @return (const uint32_t *) pointer to values for Latin-1 code points
*/
#define UTRIE_GET32_LATIN1(trie) ((trie)->data32+UTRIE_DATA_BLOCK_LENGTH)
/**
* Get a 16-bit trie value from a BMP code point (UChar, <=U+ffff).
* c16 may be a lead surrogate, which may have a value including a folding offset.
*
* Expression macro: a direct _UTRIE_GET_RAW lookup with index offset 0 for
* every c16; the value is read through trie->index. getFoldingOffset is not called.
*
* @param trie (const UTrie *, in) a pointer to the runtime trie structure
* @param c16 (UChar, in) the input BMP code point
* @return (uint16_t) trie lookup result
*/
#define UTRIE_GET16_FROM_LEAD(trie, c16) _UTRIE_GET_RAW(trie, index, 0, c16)
/**
* Get a 32-bit trie value from a BMP code point (UChar, <=U+ffff).
* c16 may be a lead surrogate, which may have a value including a folding offset.
*
* Expression macro: a direct _UTRIE_GET_RAW lookup with index offset 0 for
* every c16; the value is read from trie->data32. getFoldingOffset is not called.
*
* @param trie (const UTrie *, in) a pointer to the runtime trie structure
* @param c16 (UChar, in) the input BMP code point
* @return (uint32_t) trie lookup result
*/
#define UTRIE_GET32_FROM_LEAD(trie, c16) _UTRIE_GET_RAW(trie, data32, 0, c16)
/**
* Get a 16-bit trie value from a BMP code point (UChar, <=U+ffff).
* Even lead surrogate code points are treated as normal code points,
* with unfolded values that may differ from _FROM_LEAD() macro results for them.
*
* Implemented with _UTRIE_GET_FROM_BMP: for c16 in U+d800..U+dbff the index
* lookup is displaced by UTRIE_LEAD_INDEX_DISP. The value is read through trie->index.
*
* @param trie (const UTrie *, in) a pointer to the runtime trie structure
* @param c16 (UChar, in) the input BMP code point
* @return (uint16_t) trie lookup result
*/
#define UTRIE_GET16_FROM_BMP(trie, c16) _UTRIE_GET_FROM_BMP(trie, index, c16)
/**
* Get a 32-bit trie value from a BMP code point (UChar, <=U+ffff).
* Even lead surrogate code points are treated as normal code points,
* with unfolded values that may differ from _FROM_LEAD() macro results for them.
*
* Implemented with _UTRIE_GET_FROM_BMP: for c16 in U+d800..U+dbff the index
* lookup is displaced by UTRIE_LEAD_INDEX_DISP. The value is read from trie->data32.
*
* @param trie (const UTrie *, in) a pointer to the runtime trie structure
* @param c16 (UChar, in) the input BMP code point
* @return (uint32_t) trie lookup result
*/
#define UTRIE_GET32_FROM_BMP(trie, c16) _UTRIE_GET_FROM_BMP(trie, data32, c16)
/**
* Get a 16-bit trie value from a code point.
* Even lead surrogate code points are treated as normal code points,
* with unfolded values that may differ from _FROM_LEAD() macro results for them.
*
* Statement macro (an if/else-if/else chain from _UTRIE_GET), not an expression:
* the value is assigned to result. For supplementary code points
* trie->getFoldingOffset is called; for c32 outside 0..0x10ffff result is set
* to trie->initialValue.
*
* @param trie (const UTrie *, in) a pointer to the runtime trie structure
* @param c32 (UChar32, in) the input code point
* @param result (uint16_t, out) uint16_t variable for the trie lookup result
*/
#define UTRIE_GET16(trie, c32, result) _UTRIE_GET(trie, index, c32, result, uint16_t)
/**
* Get a 32-bit trie value from a code point.
* Even lead surrogate code points are treated as normal code points,
* with unfolded values that may differ from _FROM_LEAD() macro results for them.
*
* Statement macro (an if/else-if/else chain from _UTRIE_GET), not an expression:
* the value is assigned to result. For supplementary code points
* trie->getFoldingOffset is called; for c32 outside 0..0x10ffff result is set
* to trie->initialValue.
*
* @param trie (const UTrie *, in) a pointer to the runtime trie structure
* @param c32 (UChar32, in) the input code point
* @param result (uint32_t, out) uint32_t variable for the trie lookup result
*/
#define UTRIE_GET32(trie, c32, result) _UTRIE_GET(trie, data32, c32, result, uint32_t)
/**
* Get the next code point (c, c2), post-increment src,
* and get a 16-bit value from the trie.
*
* Statement macro (a braced block from _UTRIE_NEXT). src advances by one unit,
* or by two when a lead surrogate is followed by a trail surrogate.
* With limit==NULL the unit after a lead surrogate is always read.
*
* @param trie (const UTrie *, in) a pointer to the runtime trie structure
* @param src (const UChar *, in/out) the source text pointer
* @param limit (const UChar *, in) the limit pointer for the text, or NULL
* @param c (UChar, out) variable for the BMP or lead code unit
* @param c2 (UChar, out) variable for 0 or the trail code unit
* @param result (uint16_t, out) uint16_t variable for the trie lookup result
*/
#define UTRIE_NEXT16(trie, src, limit, c, c2, result) _UTRIE_NEXT(trie, index, src, limit, c, c2, result, uint16_t)
/**
* Get the next code point (c, c2), post-increment src,
* and get a 32-bit value from the trie.
*
* Statement macro (a braced block from _UTRIE_NEXT). src advances by one unit,
* or by two when a lead surrogate is followed by a trail surrogate.
* With limit==NULL the unit after a lead surrogate is always read.
*
* @param trie (const UTrie *, in) a pointer to the runtime trie structure
* @param src (const UChar *, in/out) the source text pointer
* @param limit (const UChar *, in) the limit pointer for the text, or NULL
* @param c (UChar, out) variable for the BMP or lead code unit
* @param c2 (UChar, out) variable for 0 or the trail code unit
* @param result (uint32_t, out) uint32_t variable for the trie lookup result
*/
#define UTRIE_NEXT32(trie, src, limit, c, c2, result) _UTRIE_NEXT(trie, data32, src, limit, c, c2, result, uint32_t)
/**
* Get the previous code point (c, c2), pre-decrement src,
* and get a 16-bit value from the trie.
*
* Statement macro (a braced block from _UTRIE_PREVIOUS). src moves back by one
* unit, or by two when a trail surrogate is preceded by a lead surrogate.
* With start==NULL the unit before a trail surrogate is always read.
*
* @param trie (const UTrie *, in) a pointer to the runtime trie structure
* @param start (const UChar *, in) the start pointer for the text, or NULL
* @param src (const UChar *, in/out) the source text pointer
* @param c (UChar, out) variable for the BMP or lead code unit
* @param c2 (UChar, out) variable for 0 or the trail code unit
* @param result (uint16_t, out) uint16_t variable for the trie lookup result
*/
#define UTRIE_PREVIOUS16(trie, start, src, c, c2, result) _UTRIE_PREVIOUS(trie, index, start, src, c, c2, result, uint16_t)
/**
* Get the previous code point (c, c2), pre-decrement src,
* and get a 32-bit value from the trie.
*
* Statement macro (a braced block from _UTRIE_PREVIOUS). src moves back by one
* unit, or by two when a trail surrogate is preceded by a lead surrogate.
* With start==NULL the unit before a trail surrogate is always read.
*
* @param trie (const UTrie *, in) a pointer to the runtime trie structure
* @param start (const UChar *, in) the start pointer for the text, or NULL
* @param src (const UChar *, in/out) the source text pointer
* @param c (UChar, out) variable for the BMP or lead code unit
* @param c2 (UChar, out) variable for 0 or the trail code unit
* @param result (uint32_t, out) uint32_t variable for the trie lookup result
*/
#define UTRIE_PREVIOUS32(trie, start, src, c, c2, result) _UTRIE_PREVIOUS(trie, data32, start, src, c, c2, result, uint32_t)
/**
* Get a 16-bit trie value from a pair of surrogates.
*
* Statement macro (a braced block from _UTRIE_GET_FROM_PAIR), not an expression.
* Calls trie->getFoldingOffset, which must be set; when it returns a value
* that is not positive, result is set to trie->initialValue.
*
* @param trie (const UTrie *, in) a pointer to the runtime trie structure
* @param c (UChar, in) a lead surrogate
* @param c2 (UChar, in) a trail surrogate
* @param result (uint16_t, out) uint16_t variable for the trie lookup result
*/
#define UTRIE_GET16_FROM_PAIR(trie, c, c2, result) _UTRIE_GET_FROM_PAIR(trie, index, c, c2, result, uint16_t)
/**
* Get a 32-bit trie value from a pair of surrogates.
*
* Statement macro (a braced block from _UTRIE_GET_FROM_PAIR), not an expression.
* Calls trie->getFoldingOffset, which must be set; when it returns a value
* that is not positive, result is set to trie->initialValue.
*
* @param trie (const UTrie *, in) a pointer to the runtime trie structure
* @param c (UChar, in) a lead surrogate
* @param c2 (UChar, in) a trail surrogate
* @param result (uint32_t, out) uint32_t variable for the trie lookup result
*/
#define UTRIE_GET32_FROM_PAIR(trie, c, c2, result) _UTRIE_GET_FROM_PAIR(trie, data32, c, c2, result, uint32_t)
/**
* Get a 16-bit trie value from a folding offset (from the value of a lead surrogate)
* and a trail surrogate.
*
* Expression macro: a direct _UTRIE_GET_RAW lookup with the caller-supplied
* offset; trie->getFoldingOffset is not called and offset is not checked here.
*
* @param trie (const UTrie *, in) a pointer to the runtime trie structure
* @param offset (int32_t, in) the folding offset from the value of a lead surrogate
* @param c2 (UChar, in) a trail surrogate (only the 10 low bits are significant)
* @return (uint16_t) trie lookup result
*/
#define UTRIE_GET16_FROM_OFFSET_TRAIL(trie, offset, c2) _UTRIE_GET_RAW(trie, index, offset, (c2)&0x3ff)
/**
* Get a 32-bit trie value from a folding offset (from the value of a lead surrogate)
* and a trail surrogate.
*
* Expression macro: a direct _UTRIE_GET_RAW lookup with the caller-supplied
* offset; trie->getFoldingOffset is not called and offset is not checked here.
*
* @param trie (const UTrie *, in) a pointer to the runtime trie structure
* @param offset (int32_t, in) the folding offset from the value of a lead surrogate
* @param c2 (UChar, in) a trail surrogate (only the 10 low bits are significant)
* @return (uint32_t) trie lookup result
*/
#define UTRIE_GET32_FROM_OFFSET_TRAIL(trie, offset, c2) _UTRIE_GET_RAW(trie, data32, offset, (c2)&0x3ff)
/* enumeration callback types */
/**
* Callback from utrie_enum(), extracts a uint32_t value from a
* trie value. This value will be passed on to the UTrieEnumRange function.
*
* @param context an opaque pointer, as passed into utrie_enum()
* @param value a value from the trie
* @return the value that is to be passed on to the UTrieEnumRange function
*/
typedef uint32_t U_CALLCONV
UTrieEnumValue(const void *context, uint32_t value);
/**
* Callback from utrie_enum(), is called for each contiguous range
* of code points with the same value as retrieved from the trie and
* transformed by the UTrieEnumValue function.
*
* The callback function can stop the enumeration by returning FALSE.
*
* @param context an opaque pointer, as passed into utrie_enum()
* @param start the first code point in a contiguous range with value
* @param limit one past the last code point in a contiguous range with value
* @param value the value that is set for all code points in [start..limit[
* @return FALSE to stop the enumeration
*/
typedef UBool U_CALLCONV
UTrieEnumRange(const void *context, UChar32 start, UChar32 limit, uint32_t value);
/**
* Enumerate efficiently all values in a trie.
* For each entry in the trie, the value to be delivered is passed through
* the UTrieEnumValue function.
* The value is unchanged if that function pointer is NULL.
*
* For each contiguous range of code points with a given value,
* the UTrieEnumRange function is called.
*
* @param trie a pointer to the runtime trie structure
* @param enumValue a pointer to a function that may transform the trie entry value,
* or NULL if the values from the trie are to be used directly
* @param enumRange a pointer to a function that is called for each contiguous range
* of code points with the same value
* @param context an opaque pointer that is passed on to the callback functions
*/
U_CAPI void U_EXPORT2
utrie_enum(UTrie *trie,
UTrieEnumValue *enumValue, UTrieEnumRange *enumRange, const void *context);
/**
* Unserialize a trie from 32-bit-aligned memory.
* Inverse of utrie_serialize().
* Fills the UTrie runtime trie structure with the settings for the trie data.
*
* @param trie a pointer to the runtime trie structure
* @param data a pointer to 32-bit-aligned memory containing trie data
* @param length the number of bytes available at data
* @param pErrorCode an in/out ICU UErrorCode
* @return the number of bytes at data taken up by the trie data
*/
U_CAPI int32_t U_EXPORT2
utrie_unserialize(UTrie *trie, const void *data, int32_t length, UErrorCode *pErrorCode);
/* Building a trie ----------------------------------------------------------*/
/**
* Build-time trie structure.
* Opaque definition, here only to make fillIn parameters possible
* for utrie_open() and utrie_clone().
*/
struct UNewTrie {
/**
* Index values at build-time are 32 bits wide for easier processing.
* Bit 31 is set if the data block is used by multiple index values (from utrie_setRange()).
*/
int32_t index[UTRIE_MAX_INDEX_LENGTH];
/* Build-time data array; always 32 bits wide at build time. */
uint32_t *data;
/*
 * NOTE(review): presumably the used length of index, and the capacity and
 * used length of data, all counted in entries - confirm against utrie.c.
 */
int32_t indexLength, dataCapacity, dataLength;
/*
 * NOTE(review): presumably record whether this structure / the data array
 * were allocated by utrie_open()/utrie_clone() (rather than passed in as
 * fillIn / aliasData) so that utrie_close() knows what to release - confirm.
 */
UBool isAllocated, isDataAllocated;
/*
 * NOTE(review): isLatin1Linear presumably mirrors the latin1Linear argument
 * of utrie_open(); isCompacted presumably marks that compaction has been
 * done - confirm.
 */
UBool isLatin1Linear, isCompacted;
/**
* Map of adjusted indexes, used in utrie_compact().
* Maps from original indexes to new ones.
*/
int32_t map[UTRIE_MAX_BUILD_TIME_DATA_LENGTH>>UTRIE_SHIFT];
};
typedef struct UNewTrie UNewTrie;
/**
* Build-time trie callback function, used with utrie_serialize().
* This function calculates a lead surrogate's value including a folding offset
* from the 1024 supplementary code points [start..start+1024[ .
* It is U+10000 <= start <= U+10fc00 and (start&0x3ff)==0.
*
* The folding offset is provided by the caller.
* It is offset=UTRIE_BMP_INDEX_LENGTH+n*UTRIE_SURROGATE_BLOCK_COUNT with n=0..1023.
* Instead of the offset itself, n can be stored in 10 bits -
* or fewer if it can be assumed that few lead surrogates have associated data.
*
* The returned value must be
* - not zero if and only if there is relevant data
* for the corresponding 1024 supplementary code points
* - such that UTrie.getFoldingOffset(UNewTrieGetFoldedValue(..., offset))==offset
*
* @return a folded value, or 0 if there is no relevant data for the lead surrogate.
*/
typedef uint32_t U_CALLCONV
UNewTrieGetFoldedValue(UNewTrie *trie, UChar32 start, int32_t offset);
/**
* Open a build-time trie structure.
* The size of the build-time data array is specified to avoid allocating a large
* array in all cases. The array itself can also be passed in.
*
* Although the trie is never fully expanded to a linear array, especially when
* utrie_setRange32() is used, the data array could be large during build time.
* The maximum length is
* UTRIE_MAX_BUILD_TIME_DATA_LENGTH=0x110000+UTRIE_DATA_BLOCK_LENGTH+0x400.
* (Number of Unicode code points + one all-initial-value block +
* possible duplicate entries for 1024 lead surrogates.)
* (UTRIE_DATA_BLOCK_LENGTH<=0x200 in all cases.)
*
* @param fillIn a pointer to a UNewTrie structure to be initialized (will not be released), or
* NULL if one is to be allocated
* @param aliasData a pointer to a data array to be used (will not be released), or
* NULL if one is to be allocated
* @param maxDataLength the capacity of aliasData (if not NULL) or
* the length of the data array to be allocated
* @param initialValue the initial value that is set for all code points
* @param latin1Linear a flag indicating whether the Latin-1 range is to be allocated and
* kept in a linear, contiguous part of the data array
* @return a pointer to the initialized fillIn or the allocated and initialized new UNewTrie
*/
U_CAPI UNewTrie * U_EXPORT2
utrie_open(UNewTrie *fillIn,
uint32_t *aliasData, int32_t maxDataLength,
uint32_t initialValue, UBool latin1Linear);
/**
* Clone a build-time trie structure with all entries.
*
* @param fillIn like in utrie_open()
* @param other the build-time trie structure to clone
* @param aliasData like in utrie_open(),
* used if aliasDataLength>=(capacity of other's data array)
* @param aliasDataLength the length of aliasData
* @return a pointer to the initialized fillIn or the allocated and initialized new UNewTrie
*/
U_CAPI UNewTrie * U_EXPORT2
utrie_clone(UNewTrie *fillIn, const UNewTrie *other, uint32_t *aliasData, int32_t aliasDataLength);
/**
* Close a build-time trie structure, and release memory
* that was allocated by utrie_open() or utrie_clone().
*
* @param trie the build-time trie
*/
U_CAPI void U_EXPORT2
utrie_close(UNewTrie *trie);
/**
* Get the data array of a build-time trie.
* The data may be modified, but entries that are equal before
* must still be equal after modification.
*
* @param trie the build-time trie
* @param pLength (out) a pointer to a variable that receives the number
* of entries in the data array
* @return the data array
*/
U_CAPI uint32_t * U_EXPORT2
utrie_getData(UNewTrie *trie, int32_t *pLength);
/**
* Set a value for a code point.
*
* @param trie the build-time trie
* @param c the code point
* @param value the value
* @return FALSE if a failure occurred (illegal argument or data array overrun)
*/
U_CAPI UBool U_EXPORT2
utrie_set32(UNewTrie *trie, UChar32 c, uint32_t value);
/**
* Get a value from a code point as stored in the build-time trie.
*
* @param trie the build-time trie
* @param c the code point
* @param pInBlockZero if not NULL, then *pInBlockZero is set to TRUE
* iff the value is retrieved from block 0;
* block 0 is the all-initial-value initial block
* @return the value
*/
U_CAPI uint32_t U_EXPORT2
utrie_get32(UNewTrie *trie, UChar32 c, UBool *pInBlockZero);
/**
* Set a value in a range of code points [start..limit[.
* All code points c with start<=c<limit will get the value if
* overwrite is TRUE or if the old value is 0.
*
* @param trie the build-time trie
* @param start the first code point to get the value
* @param limit one past the last code point to get the value
* @param value the value
* @param overwrite flag for whether old non-initial values are to be overwritten
* @return FALSE if a failure occurred (illegal argument or data array overrun)
*/
U_CAPI UBool U_EXPORT2
utrie_setRange32(UNewTrie *trie, UChar32 start, UChar32 limit, uint32_t value, UBool overwrite);
/**
* Compact the build-time trie after all values are set, and then
* serialize it into 32-bit aligned memory.
*
* After this, the trie can only be serialized again and/or closed;
* no further values can be added.
*
* @see utrie_unserialize()
*
* @param trie the build-time trie
* @param data a pointer to 32-bit-aligned memory for the trie data
* @param capacity the number of bytes available at data
* @param getFoldedValue a callback function that calculates the value for
* a lead surrogate from all of its supplementary code points
* and the folding offset
* @param reduceTo16Bits flag for whether the values are to be reduced to a
* width of 16 bits for serialization and runtime
* @param pErrorCode a UErrorCode argument; among other possible error codes:
* - U_BUFFER_OVERFLOW_ERROR if the data storage block is too small for serialization
* - U_MEMORY_ALLOCATION_ERROR if the trie data array is too small
* - U_INDEX_OUTOFBOUNDS_ERROR if the index or data arrays are too long after compaction for serialization
*
* @return the number of bytes written for the trie
*/
U_CAPI int32_t U_EXPORT2
utrie_serialize(UNewTrie *trie, void *data, int32_t capacity,
UNewTrieGetFoldedValue *getFoldedValue,
UBool reduceTo16Bits,
UErrorCode *pErrorCode);
U_CDECL_END
#endif
--- NEW FILE: uvectr32.cpp ---
/*
******************************************************************************
* Copyright (C) 1999-2003, International Business Machines Corporation and *
* others. All Rights Reserved. *
******************************************************************************
* Date Name Description
* 10/22/99 alan Creation.
**********************************************************************
*/
#include "uvectr32.h"
#include "cmemory.h"
U_NAMESPACE_BEGIN
#define DEFUALT_CAPACITY 8
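/*
* Constants for hinting whether a key is an integer
* or a pointer. If a hint bit is zero, then the associated
* token is assumed to be an integer. This is needed for iSeries.
* (Note: this comment appears to be carried over from UVector;
* no such hint constants are defined in this file.)
*/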
const char UVector32::fgClassID=0;
/**
 * Construct an empty vector with the default initial capacity.
 * The buffer is allocated by _init(), which sets status if the
 * allocation fails.
 */
UVector32::UVector32(UErrorCode &status)
    : count(0), capacity(0), elements(0)
{
    _init(DEFUALT_CAPACITY, status);
}
/**
 * Construct an empty vector with room for initialCapacity elements.
 * The buffer is allocated by _init(), which sets status if the
 * allocation fails.
 */
UVector32::UVector32(int32_t initialCapacity, UErrorCode &status)
    : count(0), capacity(0), elements(NULL)
{
    _init(initialCapacity, status);
}
/**
 * Allocate the initial element buffer.
 *
 * @param initialCapacity requested capacity in elements; values below 1, or so
 *                        large that the byte size of the buffer would not fit
 *                        into an int32_t, are replaced by the default capacity
 * @param status          set to U_MEMORY_ALLOCATION_ERROR if the allocation
 *                        fails; capacity then stays unchanged
 */
void UVector32::_init(int32_t initialCapacity, UErrorCode &status) {
    // Largest element count whose size in bytes is still representable.
    // Guards sizeof(int32_t)*initialCapacity against wrapping around, which
    // would allocate a small buffer while recording a huge capacity.
    const int32_t maxCapacity = (int32_t)(0x7fffffff / sizeof(int32_t));

    // Fix bogus initialCapacity values; avoid malloc(0)
    if (initialCapacity < 1 || initialCapacity > maxCapacity) {
        initialCapacity = DEFUALT_CAPACITY;
    }
    elements = (int32_t *)uprv_malloc(sizeof(int32_t)*initialCapacity);
    if (elements == 0) {
        status = U_MEMORY_ALLOCATION_ERROR;
    } else {
        capacity = initialCapacity;
    }
}
/**
 * Release the element buffer and clear the pointer.
 */
UVector32::~UVector32() {
    uprv_free(elements);
    elements = NULL;
}
/**
* Assign this object to another (make this a copy of 'other').
*/
void UVector32::assign(const UVector32& other, UErrorCode &ec) {
if (ensureCapacity(other.count, ec)) {
setSize(other.count);
for (int32_t i=0; i<other.count; ++i) {
elements[i] = other.elements[i];
}
}
}
/**
 * Element-wise comparison.
 * @return TRUE if both vectors have the same size and equal elements
 *         at every position
 */
UBool UVector32::operator==(const UVector32& other) {
    if (other.count != count) {
        return FALSE;
    }
    int32_t pos = 0;
    while (pos < count) {
        if (other.elements[pos] != elements[pos]) {
            return FALSE;
        }
        ++pos;
    }
    return TRUE;
}
/**
 * Overwrite the element at 'index' with 'elem'.
 * An index outside 0..count-1 is silently ignored.
 */
void UVector32::setElementAt(int32_t elem, int32_t index) {
    if (index < 0 || index >= count) {
        return;     /* index out of range */
    }
    elements[index] = elem;
}
/**
 * Insert 'elem' before position 'index', moving the elements at
 * index..count-1 up by one. index==count appends.
 * An index outside 0..count is silently ignored; if the vector cannot
 * grow, nothing is inserted (status is set by ensureCapacity()).
 */
void UVector32::insertElementAt(int32_t elem, int32_t index, UErrorCode &status) {
    if (index < 0 || index > count) {
        return;     /* index out of range */
    }
    if (!ensureCapacity(count + 1, status)) {
        return;
    }
    int32_t pos = count;
    while (pos > index) {
        elements[pos] = elements[pos-1];
        --pos;
    }
    elements[index] = elem;
    ++count;
}
/**
 * @return TRUE if every element of 'other' also occurs in this vector
 *         (trivially TRUE when 'other' is empty)
 */
UBool UVector32::containsAll(const UVector32& other) const {
    const int32_t otherSize = other.size();
    int32_t pos = 0;
    while (pos < otherSize) {
        if (indexOf(other.elements[pos]) < 0) {
            return FALSE;
        }
        ++pos;
    }
    return TRUE;
}
/**
 * @return TRUE if no element of 'other' occurs in this vector
 *         (trivially TRUE when 'other' is empty)
 */
UBool UVector32::containsNone(const UVector32& other) const {
    const int32_t otherSize = other.size();
    int32_t pos = 0;
    while (pos < otherSize) {
        if (indexOf(other.elements[pos]) >= 0) {
            return FALSE;
        }
        ++pos;
    }
    return TRUE;
}
/**
 * For each element of 'other', remove its first occurrence (if any)
 * from this vector.
 * @return TRUE if at least one element was removed
 */
UBool UVector32::removeAll(const UVector32& other) {
    UBool removedAny = FALSE;
    int32_t pos = 0;
    // other.size() is re-read on every pass on purpose: it changes while
    // looping if 'other' is this very vector.
    while (pos < other.size()) {
        const int32_t found = indexOf(other.elements[pos]);
        if (found >= 0) {
            removeElementAt(found);
            removedAny = TRUE;
        }
        ++pos;
    }
    return removedAny;
}
/**
 * Remove every element of this vector that does not occur in 'other'.
 * Walks from the back so that removals do not disturb the positions
 * still to be visited.
 * @return TRUE if at least one element was removed
 */
UBool UVector32::retainAll(const UVector32& other) {
    UBool removedAny = FALSE;
    for (int32_t pos = size(); pos-- > 0; ) {
        if (other.indexOf(elements[pos]) < 0) {
            removeElementAt(pos);
            removedAny = TRUE;
        }
    }
    return removedAny;
}
/**
 * Remove the element at 'index', moving the elements behind it down by one.
 * An index outside 0..count-1 is ignored and leaves the vector unchanged
 * (in particular count is not decremented, so it can never become negative).
 *
 * @param index position of the element to remove
 */
void UVector32::removeElementAt(int32_t index) {
    if (index < 0 || index >= count) {
        return;     /* index out of range */
    }
    for (int32_t i=index; i<count-1; ++i) {
        elements[i] = elements[i+1];
    }
    --count;
}
void UVector32::removeAllElements(void) {
count = 0;
}
/**
 * Element-wise comparison (const counterpart of operator==).
 * @return TRUE if both vectors have the same size and equal elements
 *         at every position
 */
UBool UVector32::equals(const UVector32 &other) const {
    if (other.count != count) {
        return FALSE;
    }
    int32_t pos = 0;
    while (pos < count) {
        if (other.elements[pos] != elements[pos]) {
            return FALSE;
        }
        ++pos;
    }
    return TRUE;
}
/**
 * Linear search for 'key', starting at 'startIndex'.
 *
 * @param key        the value to look for
 * @param startIndex first position to examine; a negative value is treated
 *                   as 0 so that the search never reads before the buffer
 * @return the position of the first match at or after startIndex, or -1
 */
int32_t UVector32::indexOf(int32_t key, int32_t startIndex) const {
    int32_t i = (startIndex < 0) ? 0 : startIndex;
    for (; i<count; ++i) {
        if (key == elements[i]) {
            return i;
        }
    }
    return -1;
}
/**
 * Grow the element buffer so that it can hold at least minimumCapacity
 * elements. The capacity is doubled where possible to amortize repeated growth.
 *
 * @param minimumCapacity required capacity in elements
 * @param status          set to U_MEMORY_ALLOCATION_ERROR if the request is
 *                        too large to be allocated or the allocation fails
 * @return TRUE if capacity >= minimumCapacity on return; on FALSE the vector
 *         is unchanged
 */
UBool UVector32::expandCapacity(int32_t minimumCapacity, UErrorCode &status) {
    if (capacity >= minimumCapacity) {
        return TRUE;
    }
    // Largest element count whose size in bytes is still representable;
    // keeps both capacity*2 and sizeof(int32_t)*newCap from overflowing.
    const int32_t maxCapacity = (int32_t)(0x7fffffff / sizeof(int32_t));
    if (minimumCapacity > maxCapacity) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return FALSE;
    }
    int32_t newCap = (capacity > maxCapacity / 2) ? maxCapacity : capacity * 2;
    if (newCap < minimumCapacity) {
        newCap = minimumCapacity;
    }
    int32_t* newElems = (int32_t *)uprv_malloc(sizeof(int32_t)*newCap);
    if (newElems == 0) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return FALSE;
    }
    // elements may be NULL if the initial allocation failed; never hand a
    // NULL source to memcpy, not even for a zero-length copy.
    if (count > 0) {
        uprv_memcpy(newElems, elements, sizeof(elements[0]) * count);
    }
    uprv_free(elements);
    elements = newElems;
    capacity = newCap;
    return TRUE;
}
/**
 * Change the size of this vector. A smaller newSize truncates the
 * vector; a larger newSize grows it and fills the new slots with zero.
 * A negative newSize is ignored, and so is a request for which the
 * buffer cannot be grown (the allocation error is not reported).
 */
void UVector32::setSize(int32_t newSize) {
    if (newSize < 0) {
        return;
    }
    if (newSize > count) {
        UErrorCode localStatus = U_ZERO_ERROR;
        if (!ensureCapacity(newSize, localStatus)) {
            return;
        }
        int32_t slot = count;
        while (slot < newSize) {
            elements[slot] = 0;
            ++slot;
        }
    }
    count = newSize;
}
/**
 * Insert the given integer into this vector at its sorted position
 * (ascending numeric order). The current elements are assumed to
 * be sorted already. An element equal to tok is kept in front of tok.
 *
 * @param tok the value to insert
 * @param ec  set by ensureCapacity() if the vector cannot grow; nothing is
 *            inserted in that case
 */
void UVector32::sortedInsert(int32_t tok, UErrorCode& ec) {
    // Perform a binary search for the location to insert tok at.  Tok
    // will be inserted between two elements a and b such that a <=
    // tok && tok < b, where there is a 'virtual' elements[-1] always
    // less than tok and a 'virtual' elements[count] always greater
    // than tok.
    int32_t min = 0, max = count;
    while (min != max) {
        // min + (max - min) / 2 cannot overflow, unlike (min + max) / 2.
        int32_t probe = min + (max - min) / 2;
        if (elements[probe] > tok) {
            max = probe;
        } else {
            min = probe + 1;
        }
    }
    if (ensureCapacity(count + 1, ec)) {
        for (int32_t i=count; i>min; --i) {
            elements[i] = elements[i-1];
        }
        elements[min] = tok;
        ++count;
    }
}
U_NAMESPACE_END
--- NEW FILE: uvectr32.h ---
/*
**********************************************************************
* Copyright (C) 1999-2003, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*/
//
// UVector32 is a class implementing a vector of 32 bit integers.
// It is similar to UVector, but holds int32_t values rather than pointers.
// Most of the code is unchanged from UVector.
//
#ifndef UVECTOR32_H
#define UVECTOR32_H
#include "unicode/utypes.h"
#include "unicode/uobject.h"
#include "uhash.h"
#include "uassert.h"
U_NAMESPACE_BEGIN
/**
* <p>Ultralightweight C++ implementation of a vector of <tt>int32_t</tt>
* values that is (mostly) compatible with java.util.Vector.
*
* <p>This is a very simple implementation, written to satisfy an
* immediate porting need. As such, it is not completely fleshed out,
* and it aims for simplicity and conformity. Nonetheless, it serves
* its purpose (porting code from java that uses java.util.Vector)
* well, and it could be easily made into a more robust vector class.
*
* <p><b>Design notes</b>
*
* <p>There is index bounds checking, but little is done about it. If
* indices are out of bounds, either nothing happens, or zero is
* returned. We <em>do</em> avoid indexing off into the weeds.
*
* <p>Out-of-memory conditions are reported through the UErrorCode
* parameter of the functions that may need to allocate; functions
* without such a parameter (for example setSize()) cannot report them.
*
* <p><b>To do</b>
*
* <p>Improve the handling of index out of bounds errors.
*
* @author Alan Liu
*/
class U_COMMON_API UVector32 : public UObject {
private:
// Number of elements currently in use.
int32_t count;
// Number of elements the buffer can hold.
int32_t capacity;
// The element buffer.
int32_t* elements;
public:
// Construct an empty vector with the default initial capacity.
UVector32(UErrorCode &status);
// Construct an empty vector with the given initial capacity.
UVector32(int32_t initialCapacity, UErrorCode &status);
~UVector32();
/**
* Assign this object to another (make this a copy of 'other').
* The element values are copied one by one.
*/
void assign(const UVector32& other, UErrorCode &ec);
/**
* Compare this vector with another. They will be considered
* equal if they are of the same size and all elements are equal.
*/
UBool operator==(const UVector32& other);
/**
* Equivalent to !operator==()
*/
inline UBool operator!=(const UVector32& other);
//------------------------------------------------------------
// java.util.Vector API
//------------------------------------------------------------
// Append elem at the end of the vector.
void addElement(int32_t elem, UErrorCode &status);
// Overwrite the element at index; ignored if index is not in 0..size()-1.
void setElementAt(int32_t elem, int32_t index);
// Insert elem before index (0..size()), moving later elements up.
void insertElementAt(int32_t elem, int32_t index, UErrorCode &status);
// Return the element at index, or 0 if index is not in 0..size()-1.
int32_t elementAti(int32_t index) const;
// Same comparison as operator==, but usable on const objects.
UBool equals(const UVector32 &other) const;
// Return the last element, or 0 if the vector is empty.
int32_t lastElementi(void) const;
// Return the position of the first element equal to obj at or after
// startIndex, or -1 if there is none.
int32_t indexOf(int32_t obj, int32_t startIndex = 0) const;
// TRUE if obj occurs in the vector.
UBool contains(int32_t obj) const;
// TRUE if every element of other occurs in this vector.
UBool containsAll(const UVector32& other) const;
// Remove the first occurrence of each element of other; TRUE if anything was removed.
UBool removeAll(const UVector32& other);
// Remove all elements that do not occur in other; TRUE if anything was removed.
UBool retainAll(const UVector32& other);
// Remove the element at index, moving later elements down.
void removeElementAt(int32_t index);
// Set the size to 0; the buffer is kept.
void removeAllElements();
// Number of elements in the vector.
int32_t size(void) const;
// TRUE if size()==0.
UBool isEmpty(void) const;
// Inline. Use this one for speedy size check.
inline UBool ensureCapacity(int32_t minimumCapacity, UErrorCode &status);
// Out-of-line, handles actual growth. Called by ensureCapacity() when necessary.
UBool expandCapacity(int32_t minimumCapacity, UErrorCode &status);
/**
* Change the size of this vector as follows: If newSize is
* smaller, then truncate the array. If newSize is larger, grow the
* array, filling in new slots with zero.
*/
void setSize(int32_t newSize);
//------------------------------------------------------------
// New API
//------------------------------------------------------------
/**
* Returns true if this vector contains none of the elements
* of the given vector.
* @param other vector to be checked for containment
* @return true if the test condition is met
*/
UBool containsNone(const UVector32& other) const;
/**
* Insert the given integer into this vector at its sorted position.
* The current elements are assumed to be sorted already.
*/
void sortedInsert(int32_t obj, UErrorCode& ec);
/**
* Returns a pointer to the internal array holding the vector.
* The pointer is not const although the function is; growing the
* vector replaces the internal array (see expandCapacity()).
*/
int32_t *getBuffer() const;
/**
* ICU "poor man's RTTI", returns a UClassID for this class.
*
* @draft ICU 2.2
*/
static inline UClassID getStaticClassID() { return (UClassID)&fgClassID; }
/**
* ICU "poor man's RTTI", returns a UClassID for the actual class.
*
* @draft ICU 2.2
*/
virtual inline UClassID getDynamicClassID() const { return getStaticClassID(); }
private:
// Allocates the initial buffer; shared by both constructors.
void _init(int32_t initialCapacity, UErrorCode &status);
// Disallow copy construction (declared but not defined in this file).
UVector32(const UVector32&);
// Disallow copy assignment (declared but not defined in this file); use assign().
UVector32& operator=(const UVector32&);
/**
* The address of this static class variable serves as this class's ID
* for ICU "poor man's RTTI".
*/
static const char fgClassID;
// API Functions for Stack operations.
// In the original UVector, these were in a separate derived class, UStack.
// Here in UVector32, they are all together.
public:
// Same as isEmpty().
UBool empty(void) const;
// Return the top (last) element without removing it, or 0 if empty.
int32_t peeki(void) const;
// Remove and return the top (last) element, or return 0 if empty.
int32_t popi(void);
// Append i on top of the stack and return i.
int32_t push(int32_t i, UErrorCode &status);
// Grow the vector by size elements and return a pointer to the first new one.
int32_t *reserveBlock(int32_t size, UErrorCode &status);
// Drop the top size elements and return a pointer to the size elements below them.
int32_t *popFrame(int32_t size);
};
// UVector32 inlines
/**
 * Fast capacity check; only falls through to the out-of-line
 * expandCapacity() when the buffer actually has to grow.
 * @return TRUE if the buffer can hold minimumCapacity elements on return
 */
inline UBool UVector32::ensureCapacity(int32_t minimumCapacity, UErrorCode &status) {
    if (minimumCapacity <= capacity) {
        return TRUE;
    }
    return expandCapacity(minimumCapacity, status);
}
/**
 * @return the element at 'index', or 0 if index is outside 0..count-1
 */
inline int32_t UVector32::elementAti(int32_t index) const {
    if (index < 0 || index >= count) {
        return 0;
    }
    return elements[index];
}
/**
 * Append 'elem' at the end of the vector. If the vector cannot grow,
 * nothing is added (status is set by ensureCapacity()).
 */
inline void UVector32::addElement(int32_t elem, UErrorCode &status) {
    if (!ensureCapacity(count + 1, status)) {
        return;
    }
    elements[count++] = elem;
}
/**
 * Grow the vector by 'size' elements and return a pointer to the first of
 * the new elements, which the caller is expected to fill in.
 *
 * @param size   number of elements to reserve at the end of the vector
 * @param status set by ensureCapacity() if the buffer cannot be grown
 * @return pointer to the reserved block, or NULL if the buffer could not be
 *         grown; in that case the vector is left unchanged
 */
inline int32_t *UVector32::reserveBlock(int32_t size, UErrorCode &status) {
    // Without this check a failed growth would hand out a pointer to memory
    // beyond the end of the buffer and count would exceed capacity.
    if (!ensureCapacity(count+size, status)) {
        return NULL;
    }
    int32_t *rp = elements+count;
    count += size;
    return rp;
}
/**
 * Drop the top 'size' elements (count never goes below 0) and return
 * a pointer to the 'size' elements that are on top afterwards.
 * NOTE(review): if fewer than 'size' elements remain after the pop, the
 * returned pointer lies before the start of the buffer - callers
 * presumably never do this; confirm.
 */
inline int32_t *UVector32::popFrame(int32_t size) {
    U_ASSERT(count >= size);
    const int32_t remaining = count - size;
    count = (remaining < 0) ? 0 : remaining;
    return elements+count-size;
}
/** @return the number of elements in the vector */
inline int32_t UVector32::size(void) const {
    return this->count;
}
/** @return TRUE if the vector holds no elements */
inline UBool UVector32::isEmpty(void) const {
    return 0 == count;
}
/** @return TRUE if 'obj' occurs anywhere in the vector */
inline UBool UVector32::contains(int32_t obj) const {
    const int32_t position = indexOf(obj);
    return position >= 0;
}
/** @return the last element, or 0 if the vector is empty */
inline int32_t UVector32::lastElementi(void) const {
    const int32_t lastIndex = count - 1;
    return elementAti(lastIndex);
}
/** @return the negation of operator==() */
inline UBool UVector32::operator!=(const UVector32& other) {
    return !(*this == other);
}
/**
 * @return a pointer to the internal array holding the elements; the
 *         pointer is replaced whenever the vector grows its buffer
 */
inline int32_t *UVector32::getBuffer() const {
    return elements;
}
// UStack inlines
/** Stack API: @return TRUE if the stack holds no elements */
inline UBool UVector32::empty(void) const {
    return count == 0;
}
/** Stack API: @return the top element without removing it, or 0 if empty */
inline int32_t UVector32::peeki(void) const {
    return elementAti(count-1);
}
/**
 * Stack API: push 'i' on top of the stack.
 * @return i, whether or not the element could be added
 *         (a failure is reported through status only)
 */
inline int32_t UVector32::push(int32_t i, UErrorCode &status) {
    const int32_t pushed = i;
    addElement(pushed, status);
    return pushed;
}
/**
 * Stack API: remove the top element.
 * @return the removed element, or 0 if the stack was empty
 */
inline int32_t UVector32::popi(void) {
    if (count <= 0) {
        return 0;
    }
    --count;
    return elements[count];
}
U_NAMESPACE_END
#endif
- Previous message: [sword-cvs] icu-sword/source/extra/uconv/unicode uwmsg.h,1.2,1.3
- Next message: [sword-cvs] icu-sword/source/test/testmap .cvsignore,1.3,1.4 Makefile.in,1.3,1.4 testmap.c,1.2,1.3 testmap.dsp,1.2,1.3
- Messages sorted by:
[ date ]
[ thread ]
[ subject ]
[ author ]