{*******************************************************} { } { The Delphi Unicode Controls Project } { http://home.ccci.org/wolbrink } { } { Special Note: This unit, Unicode.pas, was written } { originally by Mike Lischke. Only a few minor } { modifications and bug fixes have been made. I } { have provided this unit so that others, at their } { own discretion, may use the same Unicode.pas as me.} { } { If you prefer, you may download the original at } { http://www.delphi-unicode.net or you may use the } { JclUnicode.pas from the JEDI component library at } { http://www.delphi-jedi.org/ and look for JCL. To } { use the JCL unit, you must define "JCL" for Tnt } { Delphi Unicode controls to compile. } { } {*******************************************************} unit Unicode; {TNT-IGNORE-UNIT} // Copyright (c) 1999, 2000 Mike Lischke (public@lischke-online.de) // Portions Copyright (c) 1999, 2000 Azret Botash (az) // // 01-APR-2000 ml: // preparation for public release // FEB-MAR 2000 version 2.0 beta // - Unicode regular expressions (URE) search class (TURESearch) // - generic search engine base class for both the Boyer-Moore and the RE search class // - whole word only search in UTBM, bug fixes in UTBM // - string decompositon (including hangul) // OCT/99 - JAN/2000 ml: version 1.0 // - basic Unicode implementation, more than 100 WideString/UCS2 and UCS4 core functions // - TWideStrings and TWideStringList classes // - Unicode Tuned Boyer-Moore search class (TUTBMSearch) // - low and high level Unicode/Wide* functions // - low level Unicode UCS4 data import and functions // - helper functions //---------------------------------------------------------------------------------------------------------------------- // This unit contains routines and classes to manage and work with Unicode/WideStrings strings. // You need Delphi 4 or higher to compile this code. // // Unicode encodings and wide strings: // Currently there are several encoding schemes defined which describe (among others) the code size and (resulting from // this) the usable value pool. Delphi supports the wide character data type for Unicode which corresponds to // UCS2 (UTF-16 coding scheme). This scheme uses 2 bytes to store character values and can therefor handle up to // 65536 characters. Another scheme is UCS4 (UTF-32 coding scheme) which uses 4 bytes per character. The first 65536 // code points correspond directly to those of UCS2. Other code points are mainly used for character surrogates. // To provide support for UCS2 (WideChar in Delphi) as well as UCS4 the library is splitted into two parts. The low // level part accepts and returns UCS4 characters while the high level part deals directly with WideChar/WideString // data types. Additionally, UCS2 is defined as being WideChar to retain maximum compatibility. // // Publicy available low level functions are all preceded by "Unicode..." (e.g. in UnicodeToUpper) while // the high level functions use the Str... or Wide... naming scheme (e.g. WideUpCase and WideUpperCase). // //---------------------------------------------------------------------------------------------------------------------- // Open issues: // - Keep in mind that this unit is still in beta state. In particular the URE class does not yet work for all cases. // - Yet to do things in the URE class are: // - check all character classes if they match correctly // - optimize rebuild of DFA (build only when pattern changes) // - set flag parameter of ExecuteURE // - add \d any decimal digit // \D any character that is not a decimal digit // \s any whitespace character // \S any character that is not a whitespace character // \w any "word" character // \W any "non-word" character // - For a perfect text search both the text to be searched through as well as the pattern must be normalized // to allow to match, say, accented and unaccented characters or the ligature fi with the letter combination fi etc. // Normalization is usually done by decomposing the string and optionally compose it again, but I had not yet the // opportunity to go through the composition stuff. // - The wide string classes still compare text with functions provided by the particular system. This works usually // fine under WinNT/W2K (although also there are limitations like maximum text lengths). Under Win9x conversions // from and to MBCS are necessary which are bound to a particular locale and so very limited in general use. // These comparisons should be changed so that the code in this unit is used. This requires, though, a working // composition implementation. interface uses Windows, Classes; const // definitions of often used characters: // Note: Use them only for tests of a certain character not to determine character classes like // white spaces as in Unicode are often many code points defined being in a certain class. // Hence your best option is to use the various UnicodeIs* functions. // can't use identifier "Null" here as this is already in a special Variant identifier WideNull = WideChar(#0); Tabulator = WideChar(#9); Space = WideChar(#32); // logical line breaks LF = WideChar($A); LineFeed = WideChar($A); VerticalTab = WideChar($B); FormFeed = WideChar($C); CR = WideChar($D); CarriageReturn = WideChar($D); // CRLF: WideString = #$D#$A; LineSeparator = WideChar($2028); ParagraphSeparator = WideChar($2029); // byte order marks for strings // Unicode text files should contain $FFFE as first character to identify such a file clearly. Depending on the system // where the file was created on this appears either in big endian or little endian style. BOM_LSB_FIRST = WideChar($FEFF); // this is how the BOM appears on x86 systems when written by a x86 system BOM_MSB_FIRST = WideChar($FFFE); type // Unicode transformation formats (UTF) data types UTF7 = Char; UTF8 = Char; UTF16 = WideChar; UTF32 = Cardinal; // UTF conversion schemes (UCS) data types PUCS4 = ^UCS4; UCS4 = Cardinal; PUCS2 = PWideChar; UCS2 = WideChar; const ReplacementCharacter: UCS4 = $0000FFFD; MaximumUCS2: UCS4 = $0000FFFF; MaximumUTF16: UCS4 = $0010FFFF; MaximumUCS4: UCS4 = $7FFFFFFF; SurrogateHighStart: UCS4 = $D800; SurrogateHighEnd: UCS4 = $DBFF; SurrogateLowStart: UCS4 = $DC00; SurrogateLowEnd: UCS4 = $DFFF; type PCardinal = ^Cardinal; TWideStrings = class; TSearchFlags = set of ( sfCaseSensitive, // match letter case sfIgnoreNonSpacing, // ignore non-spacing characters in search sfSpaceCompress, // handle several consecutive white spaces as one white space // (this applies to the pattern as well as the search text) sfWholeWordOnly); // match only text at end/start and/or surrounded by white spaces // a generic search class defininition used for tuned Boyer-Moore and Unicode regular expression searches TSearchEngine = class private FResults: TList; // 2 entries for each result (start and stop position) FOwner: TWideStrings; // at the moment unused, perhaps later to access strings faster protected function GetCount: Integer; virtual; public constructor Create(AOwner: TWideStrings); virtual; destructor Destroy; override; procedure AddResult(Start, Stop: Cardinal); virtual; procedure Clear; virtual; procedure ClearResults; virtual; procedure DeleteResult(Index: Cardinal); virtual; procedure FindPrepare(const Pattern: WideString; Options: TSearchFlags); overload; virtual; abstract; procedure FindPrepare(const Pattern: PWideChar; PatternLength: Cardinal; Options: TSearchFlags); overload; virtual; abstract; function FindFirst(const Text: WideString; var Start, Stop: Cardinal): Boolean; overload; virtual; abstract; function FindFirst(const Text: PWideChar; TextLen: Cardinal; var Start, Stop: Cardinal): Boolean; overload; virtual; abstract; function FindAll(const Text: WideString): Boolean; overload; virtual; abstract; function FindAll(const Text: PWideChar; TextLen: Cardinal): Boolean; overload; virtual; abstract; procedure GetResult(Index: Cardinal; var Start, Stop: Integer); virtual; property Count: Integer read GetCount; end; // The Unicode Tuned Boyer-Moore (UTBM) search implementation is an extended translation created from a free package // written by Mark Leisher (mleisher@crl.nmsu.edu). // // The code handles high and low surrogates as well as case (in)dependency, can ignore non-spacing characters and // allows optionally to return whole words only. // single pattern character PUTBMChar = ^TUTBMChar; TUTBMChar = record LoCase, UpCase, TitleCase: UCS4; end; PUTBMSkip = ^TUTBMSkip; TUTBMSkip = record BMChar: PUTBMChar; SkipValues: Integer; end; TUTBMSearch = class(TSearchEngine) private FFlags: TSearchFlags; FPattern: PUTBMChar; FPatternUsed, FPatternSize, FPatternLength: Cardinal; FSkipValues: PUTBMSkip; FSkipsUsed: Integer; FMD4: Cardinal; protected procedure ClearPattern; procedure Compile(Pattern: PUCS2; PatternLength: Integer; Flags: TSearchFlags); function Find(Text: PUCS2; TextLen: Cardinal; var MatchStart, MatchEnd: Cardinal): Boolean; function GetSkipValue(TextStart, TextEnd: PUCS2): Cardinal; function Match(Text, Start, Stop: PUCS2; var MatchStart, MatchEnd: Cardinal): Boolean; public constructor Create(AOwner: TWideStrings); override; destructor Destroy; override; procedure Clear; override; procedure FindPrepare(const Pattern: WideString; Options: TSearchFlags); override; procedure FindPrepare(const Pattern: PWideChar; PatternLength: Cardinal; Options: TSearchFlags); override; function FindFirst(const Text: WideString; var Start, Stop: Cardinal): Boolean; override; function FindFirst(const Text: PWideChar; TextLen: Cardinal; var Start, Stop: Cardinal): Boolean; override; function FindAll(const Text: WideString): Boolean; override; function FindAll(const Text: PWideChar; TextLen: Cardinal): Boolean; override; end; // Regular expression search engine for text in UCS2 form taking surrogates into account. // This implementation is an improved translation from the URE package written by Mark Leisher (mleisher@crl.nmsu.edu) // who used a variation of the RE->DFA algorithm done by Mark Hopkins (markh@csd4.csd.uwm.edu). // Assumptions: // o Regular expression and text already normalized. // o Conversion to lower case assumes a 1-1 mapping. // // Definitions: // Separator - any one of U+2028, U+2029, NL, CR. // // Operators: // . - match any character // * - match zero or more of the last subexpression // + - match one or more of the last subexpression // ? - match zero or one of the last subexpression // () - subexpression grouping // {m, n} - match at least m occurences and up to n occurences // Note: both values can be 0 or ommitted which denotes then a unlimiting bound // {,} and {0,} and {0, 0} correspond to * // {, 1} and {0, 1} correspond to ? // {1,} and {1, 0} correspond to + // {m} - match exactly m occurences // // Notes: // o The "." operator normally does not match separators, but a flag is // available that will allow this operator to match a separator. // // Literals and Constants: // c - literal UCS2 character // \x.... - hexadecimal number of up to 4 digits // \X.... - hexadecimal number of up to 4 digits // \u.... - hexadecimal number of up to 4 digits // \U.... - hexadecimal number of up to 4 digits // // Character classes: // [...] - Character class // [^...] - Negated character class // \pN1,N2,...,Nn - Character properties class // \PN1,N2,...,Nn - Negated character properties class // // POSIX character classes recognized: // :alnum: // :alpha: // :cntrl: // :digit: // :graph: // :lower: // :print: // :punct: // :space: // :upper: // :xdigit: // // Notes: // o Character property classes are \p or \P followed by a comma separated // list of integers between 1 and 32. These integers are references to // the following character properties: // // N Character Property // -------------------------- // 1 _URE_NONSPACING // 2 _URE_COMBINING // 3 _URE_NUMDIGIT // 4 _URE_NUMOTHER // 5 _URE_SPACESEP // 6 _URE_LINESEP // 7 _URE_PARASEP // 8 _URE_CNTRL // 9 _URE_PRIVATE // 10 _URE_UPPER (note: upper, lower and titel case classes need to have case // 11 _URE_LOWER sensitive search be enabled to match correctly!) // 12 _URE_TITLE // 13 _URE_MODIFIER // 14 _URE_OTHERLETTER // 15 _URE_DASHPUNCT // 16 _URE_OPENPUNCT // 17 _URE_CLOSEPUNCT // 18 _URE_OTHERPUNCT // 19 _URE_MATHSYM // 20 _URE_CURRENCYSYM // 21 _URE_OTHERSYM // 22 _URE_LTR // 23 _URE_RTL // 24 _URE_EURONUM // 25 _URE_EURONUMSEP // 26 _URE_EURONUMTERM // 27 _URE_ARABNUM // 28 _URE_COMMONSEP // 29 _URE_BLOCKSEP // 30 _URE_SEGMENTSEP // 31 _URE_WHITESPACE // 32 _URE_OTHERNEUT // // o Character classes can contain literals, constants, and character // property classes. Example: // // [abc\U10A\p1,3,4] // structure used to handle a compacted range of characters PRange = ^TRange; TRange = record MinCode, MaxCode: UCS4; end; TCClass = record Ranges: array of TRange; RangesUsed: Integer; end; // either a single character or a list of character classes TSymbol = record Chr: UCS4; CCL: TCClass; end; // this is a general element structure used for expressions and stack elements TElement = record OnStack: Boolean; AType, LHS, RHS: Cardinal; end; // this is a structure used to track a list or a stack of states PStateList = ^TStateList; TStateList = record List: array of Cardinal; ListUsed: Integer; end; // structure to track the list of unique states for a symbol during reduction PSymbolTableEntry = ^TSymbolTableEntry; TSymbolTableEntry = record ID, AType: Cardinal; Mods, Props: Cardinal; Symbol: TSymbol; States: TStateList; end; // structure to hold a single State PState = ^TState; TState = record ID: Cardinal; Accepting: Boolean; StateList: TStateList; Transitions: array of TElement; TransitionsUsed: Integer; end; // structure used for keeping lists of states TStateTable = record States: array of TState; StatesUsed: Integer; end; // structure to track pairs of FDFA states when equivalent states are merged TEquivalent = record Left, Right: Cardinal; end; TExpressionList = record Expressions: array of TElement; ExpressionsUsed: Integer; end; TSymbolTable = record Symbols: array of TSymbolTableEntry; SymbolsUsed: Integer; end; TEquivalentList = record Equivalents: array of TEquivalent; EquivalentsUsed: Integer; end; // structure used for constructing the NFA and reducing to a minimal FDFA PUREBuffer = ^TUREBuffer; TUREBuffer = record Reducing: Boolean; Error: Integer; Flags: Cardinal; Stack: TStateList; SymbolTable: TSymbolTable; // table of unique symbols encountered ExpressionList: TExpressionList; // tracks the unique expressions generated for the NFA and when the NFA is reduced States: TStateTable; // the reduced table of unique groups of NFA states EquivalentList: TEquivalentList; // tracks states when equivalent states are merged end; TTransition = record Symbol, NextState: Cardinal; end; PDFAState = ^TDFAState; TDFAState = record Accepting: Boolean; NumberTransitions: Integer; StartTransition: Integer; end; TDFAStates = record States: array of TDFAState; StatesUsed: Integer; end; TTransitions = record Transitions: array of TTransition; TransitionsUsed: Integer; end; TDFA = record Flags: Cardinal; SymbolTable: TSymbolTable; StateList: TDFAStates; TransitionList: TTransitions; end; TURESearch = class(TSearchEngine) private FUREBuffer: TUREBuffer; FDFA: TDFA; protected procedure AddEquivalentPair(L, R: Cardinal); procedure AddRange(var CCL: TCClass; Range: TRange); function AddState(NewStates: array of Cardinal): Cardinal; procedure AddSymbolState(Symbol, State: Cardinal); function BuildCharacterClass(CP: PUCS2; Limit: Cardinal; Symbol: PSymbolTableEntry): Cardinal; procedure CCLSetup(Symbol: PSymbolTableEntry; Mask: Cardinal); procedure ClearUREBuffer; function CompileSymbol(S: PUCS2; Limit: Cardinal; Symbol: PSymbolTableEntry): Cardinal; procedure CompileURE(RE: PWideChar; RELength: Cardinal; Casefold: Boolean); procedure CollectPendingOperations(var State: Cardinal); function ConvertRE2NFA(RE: PWideChar; RELength: Cardinal): Cardinal; function ExecuteURE(Flags: Cardinal; Text: PUCS2; TextLen: Cardinal; var MatchStart, MatchEnd: Cardinal): Boolean; procedure ClearDFA; procedure HexDigitSetup(Symbol: PSymbolTableEntry; Mask: Cardinal); function MakeExpression(AType, LHS, RHS: Cardinal): Cardinal; function MakeHexNumber(NP: PUCS2; Limit: Cardinal; var Number: Cardinal): Cardinal; function MakeSymbol(S: PUCS2; Limit: Cardinal; var Consumed: Cardinal): Cardinal; function MatchesProperties(Props, C: Cardinal): Boolean; procedure MergeEquivalents; function ParsePropertyList(Properties: PUCS2; Limit: Cardinal; var Mask: Cardinal): Cardinal; function Peek: Cardinal; function Pop: Cardinal; function PosixCCL(CP: PUCS2; Limit: Cardinal; Symbol: PSymbolTableEntry): Cardinal; function ProbeLowSurrogate(LeftState: PUCS2; Limit: Cardinal; var Code: UCS4): Cardinal; procedure Push(V: Cardinal); procedure Reduce(Start: Cardinal); procedure SpaceSetup(Symbol: PSymbolTableEntry; Mask: Cardinal); function SymbolsAreDifferent(A, B: PSymbolTableEntry): Boolean; public constructor Create(AOwner: TWideStrings); override; destructor Destroy; override; procedure Clear; override; procedure FindPrepare(const Pattern: WideString; Options: TSearchFlags); override; procedure FindPrepare(const Pattern: PWideChar; PatternLength: Cardinal; Options: TSearchFlags); override; function FindFirst(const Text: WideString; var Start, Stop: Cardinal): Boolean; override; function FindFirst(const Text: PWideChar; TextLen: Cardinal; var Start, Stop: Cardinal): Boolean; override; function FindAll(const Text: WideString): Boolean; override; function FindAll(const Text: PWideChar; TextLen: Cardinal): Boolean; override; end; // Event used to give the application a chance to switch the way of how to save the text in TWideStrings // if the text contains characters not only from the ANSI block but the save type is // ANSI. On triggering the event the application can change the property SaveUnicode // as needed. This property is again checked after the callback returns. TConfirmConversionEvent = procedure(Sender: TWideStrings; var Allowed: Boolean) of object; TWideStrings = class(TPersistent) private FUpdateCount: Integer; FLanguage: LCID; // language can usually left alone, the system's default is used FSaved, // set in SaveToStream, True in case saving was successfull otherwise False FSaveUnicode: Boolean; // flag set on loading to keep track in which format to save // (can be set explicitely, but expect losses if there's true Unicode content // and this flag is set to False) FOnConfirmConversion: TConfirmConversionEvent; function GetCommaText: WideString; function GetName(Index: Integer): WideString; function GetValue(const Name: WideString): WideString; procedure ReadData(Reader: TReader); procedure SetCommaText(const Value: WideString); procedure SetValue(const Name, Value: WideString); procedure WriteData(Writer: TWriter); protected procedure DefineProperties(Filer: TFiler); override; procedure Error(const Msg: String; Data: Integer); function Get(Index: Integer): WideString; virtual; abstract; function GetCapacity: Integer; virtual; function GetCount: Integer; virtual; abstract; function GetObject(Index: Integer): TObject; virtual; function GetTextStr: WideString; virtual; procedure Put(Index: Integer; const S: WideString); virtual; procedure PutObject(Index: Integer; AObject: TObject); virtual; procedure SetCapacity(NewCapacity: Integer); virtual; procedure SetTextStr(const Value: WideString); virtual; procedure SetUpdateState(Updating: Boolean); virtual; procedure SetLanguage(Value: LCID); virtual; public constructor Create; destructor Destroy; override; function Add(const S: WideString): Integer; virtual; function AddObject(const S: WideString; AObject: TObject): Integer; virtual; procedure Append(const S: WideString); procedure AddStrings(Strings: TStrings); overload; virtual; procedure AddStrings(Strings: TWideStrings); overload; virtual; procedure Assign(Source: TPersistent); override; procedure AssignTo(Dest: TPersistent); override; procedure BeginUpdate; procedure Clear; virtual; abstract; procedure Delete(Index: Integer); virtual; abstract; procedure EndUpdate; function Equals(Strings: TWideStrings): Boolean; procedure Exchange(Index1, Index2: Integer); virtual; function GetText: PWideChar; virtual; function IndexOf(const S: WideString): Integer; virtual; function IndexOfName(const Name: WideString): Integer; function IndexOfObject(AObject: TObject): Integer; procedure Insert(Index: Integer; const S: WideString); virtual; abstract; procedure InsertObject(Index: Integer; const S: WideString; AObject: TObject); procedure LoadFromFile(const FileName: String); virtual; procedure LoadFromStream(Stream: TStream); virtual; procedure Move(CurIndex, NewIndex: Integer); virtual; procedure SaveToFile(const FileName: String); virtual; procedure SaveToStream(Stream: TStream); virtual; procedure SetText(Text: PWideChar); virtual; property Capacity: Integer read GetCapacity write SetCapacity; property CommaText: WideString read GetCommaText write SetCommaText; property Count: Integer read GetCount; property Language: LCID read FLanguage write SetLanguage; property Names[Index: Integer]: WideString read GetName; property Objects[Index: Integer]: TObject read GetObject write PutObject; property Values[const Name: WideString]: WideString read GetValue write SetValue; property Saved: Boolean read FSaved; property SaveUnicode: Boolean read FSaveUnicode write FSaveUnicode; property Strings[Index: Integer]: WideString read Get write Put; default; property Text: WideString read GetTextStr write SetTextStr; property OnConfirmConversion: TConfirmConversionEvent read FOnConfirmConversion write FOnConfirmConversion; end; // TWideStringList class TWideStringItem = record FString: WideString; FObject: TObject; end; TWideStringItemList = array of TWideStringItem; TWideStringList = class(TWideStrings) private FList: TWideStringItemList; FCount: Integer; FSorted: Boolean; FDuplicates: TDuplicates; FOnChange: TNotifyEvent; FOnChanging: TNotifyEvent; procedure ExchangeItems(Index1, Index2: Integer); procedure Grow; procedure QuickSort(L, R: Integer); procedure InsertItem(Index: Integer; const S: WideString); procedure SetSorted(Value: Boolean); protected procedure Changed; virtual; procedure Changing; virtual; function Get(Index: Integer): WideString; override; function GetCapacity: Integer; override; function GetCount: Integer; override; function GetObject(Index: Integer): TObject; override; procedure Put(Index: Integer; const S: WideString); override; procedure PutObject(Index: Integer; AObject: TObject); override; procedure SetCapacity(NewCapacity: Integer); override; procedure SetUpdateState(Updating: Boolean); override; procedure SetLanguage(Value: LCID); override; public destructor Destroy; override; function Add(const S: WideString): Integer; override; procedure Clear; override; procedure Delete(Index: Integer); override; procedure Exchange(Index1, Index2: Integer); override; function Find(const S: WideString; var Index: Integer): Boolean; virtual; function IndexOf(const S: WideString): Integer; override; procedure Insert(Index: Integer; const S: WideString); override; procedure Sort; virtual; property Duplicates: TDuplicates read FDuplicates write FDuplicates; property Sorted: Boolean read FSorted write SetSorted; property OnChange: TNotifyEvent read FOnChange write FOnChange; property OnChanging: TNotifyEvent read FOnChanging write FOnChanging; end; // result type for number retrival functions TUNumber = record Numerator, Denominator: Integer; end; // functions involving Null-terminated strings // NOTE: PWideChars as well as WideStrings are NOT managed by reference counting (in opposition to 8 bit strings)! function StrLenW(Str: PWideChar): Cardinal; function StrEndW(Str: PWideChar): PWideChar; function StrMoveW(Dest, Source: PWideChar; Count: Cardinal): PWideChar; function StrCopyW(Dest, Source: PWideChar): PWideChar; function StrECopyW(Dest, Source: PWideChar): PWideChar; function StrLCopyW(Dest, Source: PWideChar; MaxLen: Cardinal): PWideChar; function StrPCopyW(Dest: PWideChar; const Source: String): PWideChar; function StrPLCopyW(Dest: PWideChar; const Source: String; MaxLen: Cardinal): PWideChar; function StrCatW(Dest, Source: PWideChar): PWideChar; function StrLCatW(Dest, Source: PWideChar; MaxLen: Cardinal): PWideChar; function StrCompW(Str1, Str2: PWideChar): Integer; function StrICompW(Str1, Str2: PWideChar): Integer; function StrLCompW(Str1, Str2: PWideChar; MaxLen: Cardinal): Integer; function StrLICompW(Str1, Str2: PWideChar; MaxLen: Cardinal): Integer; function StrNScanW(S1, S2: PWideChar): Integer; function StrRNScanW(S1, S2: PWideChar): Integer; function StrScanW(Str: PWideChar; Chr: WideChar): PWideChar; overload; function StrScanW(Str: PWideChar; Chr: WideChar; StrLen: Cardinal): PWideChar; overload; function StrRScanW(Str: PWideChar; Chr: WideChar): PWideChar; function StrPosW(Str, SubStr: PWideChar): PWideChar; function StrUpperW(Str: PWideChar): PWideChar; function StrLowerW(Str: PWideChar): PWideChar; function StrTitleW(Str: PWideChar): PWideChar; function StrAllocW(Size: Cardinal): PWideChar; function StrBufSizeW(Str: PWideChar): Cardinal; function StrNewW(Str: PWideChar): PWideChar; procedure StrDisposeW(Str: PWideChar); procedure StrSwapByteOrder(Str: PWideChar); // functions involving Delphi wide strings function WideAdjustLineBreaks(const S: WideString): WideString; function WideCharPos(const S: WideString; const Ch: WideChar; const Index: Integer): Integer; //az function WideCompose(const S: WideString): WideString; function WideComposeHangul(Source: WideString): WideString; function WideDecompose(const S: WideString): WideString; function WideLoCase(C: WideChar): WideChar; function WideLowerCase(const S: WideString): WideString; function WideExtractQuotedStr(var Src: PWideChar; Quote: WideChar): WideString; function WideQuotedStr(const S: WideString; Quote: WideChar): WideString; function WideStringOfChar(C: WideChar; Count: Cardinal): WideString; function WideTitleCaseChar(C: WideChar): WideChar; function WideTitleCaseString(const S: WideString): WideString; function WideTrim(const S: WideString): WideString; function WideTrimLeft(const S: WideString): WideString; function WideTrimRight(const S: WideString): WideString; function WideUpCase(C: WideChar): WideChar; function WideUpperCase(const S: WideString): WideString; // low level character routines function UnicodeGetDigit(Code: UCS4): Integer; function UnicodeGetNumber(Code: UCS4): TUNumber; function UnicodeToUpper(Code: UCS4): UCS4; function UnicodeToLower(Code: UCS4): UCS4; function UnicodeToTitle(Code: UCS4): UCS4; // character test routines function UnicodeIsAlpha(C: UCS4): Boolean; function UnicodeIsDigit(C: UCS4): Boolean; function UnicodeIsAlphaNum(C: UCS4): Boolean; function UnicodeIsControl(C: UCS4): Boolean; function UnicodeIsSpace(C: UCS4): Boolean; function UnicodeIsWhiteSpace(C: UCS4): Boolean; function UnicodeIsBlank(C: UCS4): Boolean; function UnicodeIsPunctuation(C: UCS4): Boolean; function UnicodeIsGraph(C: UCS4): Boolean; function UnicodeIsPrintable(C: UCS4): Boolean; function UnicodeIsUpper(C: UCS4): Boolean; function UnicodeIsLower(C: UCS4): Boolean; function UnicodeIsTitle(C: UCS4): Boolean; function UnicodeIsHexDigit(C: UCS4): Boolean; function UnicodeIsIsoControl(C: UCS4): Boolean; function UnicodeIsFormatControl(C: UCS4): Boolean; function UnicodeIsSymbol(C: UCS4): Boolean; function UnicodeIsNumber(C: UCS4): Boolean; function UnicodeIsNonSpacing(C: UCS4): Boolean; function UnicodeIsOpenPunctuation(C: UCS4): Boolean; function UnicodeIsClosePunctuation(C: UCS4): Boolean; function UnicodeIsInitialPunctuation(C: UCS4): Boolean; function UnicodeIsFinalPunctuation(C: UCS4): Boolean; function UnicodeIsComposite(C: UCS4): Boolean; function UnicodeIsQuotationMark(C: UCS4): Boolean; function UnicodeIsSymmetric(C: UCS4): Boolean; function UnicodeIsMirroring(C: UCS4): Boolean; function UnicodeIsNonBreaking(C: UCS4): Boolean; // Directionality functions function UnicodeIsRTL(C: UCS4): Boolean; function UnicodeIsLTR(C: UCS4): Boolean; function UnicodeIsStrong(C: UCS4): Boolean; function UnicodeIsWeak(C: UCS4): Boolean; function UnicodeIsNeutral(C: UCS4): Boolean; function UnicodeIsSeparator(C: UCS4): Boolean; // Other character test functions function UnicodeIsMark(C: UCS4): Boolean; function UnicodeIsModifier(C: UCS4): Boolean; function UnicodeIsLetterNumber(C: UCS4): Boolean; function UnicodeIsConnectionPunctuation(C: UCS4): Boolean; function UnicodeIsDash(C: UCS4): Boolean; function UnicodeIsMath(C: UCS4): Boolean; function UnicodeIsCurrency(C: UCS4): Boolean; function UnicodeIsModifierSymbol(C: UCS4): Boolean; function UnicodeIsNonSpacingMark(C: UCS4): Boolean; function UnicodeIsSpacingMark(C: UCS4): Boolean; function UnicodeIsEnclosing(C: UCS4): Boolean; function UnicodeIsPrivate(C: UCS4): Boolean; function UnicodeIsSurrogate(C: UCS4): Boolean; function UnicodeIsLineSeparator(C: UCS4): Boolean; function UnicodeIsParagraphSeparator(C: UCS4): Boolean; function UnicodeIsIdenifierStart(C: UCS4): Boolean; function UnicodeIsIdentifierPart(C: UCS4): Boolean; function UnicodeIsDefined(C: UCS4): Boolean; function UnicodeIsUndefined(C: UCS4): Boolean; function UnicodeIsHan(C: UCS4): Boolean; function UnicodeIsHangul(C: UCS4): Boolean; // utility functions function CodePageFromLocale(Language: LCID): Cardinal; function KeyboardCodePage: Cardinal; function KeyUnicode(C: Char): WideChar; function CodeBlockFromChar(const C: WideChar): Cardinal; function CodePageToWideString(const A: AnsiString; CodePage: Cardinal; dwFlags: Cardinal = 0): WideString; // WideString Conversion routines function WideStringToUTF8(const S: WideString): AnsiString; function UTF8ToWideString(const S: AnsiString): WideString; //---------------------------------------------------------------------------------------------------------------------- implementation // ~67K Unicode data for case mapping, decomposition, numbers etc. // This data is loaded on demand which means only those parts will be put in memory which are needed // by one of the lookup functions. {$R Unicode.res} uses Consts, {$IFNDEF VER130} RTLConsts, {$ENDIF} SyncObjs, SysUtils; resourcestring SUREBaseString = 'Error in regular expression: %s' + #13; SUREUnexpectedEOS = 'Unexpected end of pattern.'; SURECharacterClassOpen = 'Character class not closed, '']'' is missing.'; SUREUnbalancedGroup = 'Unbalanced group expression, '')'' is missing.'; SUREInvalidCharProperty = 'A character property is invalid'; SUREInvalidRepeatRange = 'Invalid repeation range.'; SURERepeatRangeOpen = 'Repeation range not closed, ''}'' is missing.'; SUREExpressionEmpty = 'Expression is empty.'; type TCompareFunc = function (const W1, W2: WideString; Locale: LCID): Integer; var WideCompareText: TCompareFunc; //----------------- Loader routines for resource data ------------------------------------------------------------------ const // Values that can appear in the Mask1 parameter of the IsProperty function. UC_MN = $00000001; // Mark, Non-Spacing UC_MC = $00000002; // Mark, Spacing Combining UC_ME = $00000004; // Mark, Enclosing UC_ND = $00000008; // Number, Decimal Digit UC_NL = $00000010; // Number, Letter UC_NO = $00000020; // Number, Other UC_ZS = $00000040; // Separator, Space UC_ZL = $00000080; // Separator, Line UC_ZP = $00000100; // Separator, Paragraph UC_CC = $00000200; // Other, Control UC_CF = $00000400; // Other, Format UC_OS = $00000800; // Other, Surrogate UC_CO = $00001000; // Other, private use UC_CN = $00002000; // Other, not assigned UC_LU = $00004000; // Letter, Uppercase UC_LL = $00008000; // Letter, Lowercase UC_LT = $00010000; // Letter, Titlecase UC_LM = $00020000; // Letter, Modifier UC_LO = $00040000; // Letter, Other UC_PC = $00080000; // Punctuation, Connector UC_PD = $00100000; // Punctuation, Dash UC_PS = $00200000; // Punctuation, Open UC_PE = $00400000; // Punctuation, Close UC_PO = $00800000; // Punctuation, Other UC_SM = $01000000; // Symbol, Math UC_SC = $02000000; // Symbol, Currency UC_SK = $04000000; // Symbol, Modifier UC_SO = $08000000; // Symbol, Other UC_L = $10000000; // Left-To-Right UC_R = $20000000; // Right-To-Left UC_EN = $40000000; // European Number UC_ES = $80000000; // European Number Separator // Values that can appear in the Mask2 parameter of the IsProperty function UC_ET = $00000001; // European Number Terminator UC_AN = $00000002; // Arabic Number UC_CS = $00000004; // Common Number Separator UC_B = $00000008; // Block Separator UC_S = $00000010; // Segment (unit) Separator (this includes tab and vertical tab) UC_WS = $00000020; // Whitespace UC_ON = $00000040; // Other Neutrals // Implementation specific character properties. UC_CM = $00000080; // Composite UC_NB = $00000100; // Non-Breaking UC_SY = $00000200; // Symmetric UC_HD = $00000400; // Hex Digit UC_QM = $00000800; // Quote Mark UC_MR = $00001000; // Mirroring UC_SS = $00002000; // Space, other UC_CP = $00004000; // Defined // Added for UnicodeData-2.1.3. UC_PI = $00008000; // Punctuation, Initial UC_PF = $00010000; // Punctuation, Final type TUHeader = record BOM: WideChar; Count: Word; case Boolean of True: (Bytes: Cardinal); False: (Len: array[0..1] of Word); end; TWordArray = array of Word; TCardinalArray = array of Cardinal; var // As the global data can be accessed by several threads it should be guarded // while the data is loaded. LoadInProgress: TCriticalSection; //----------------- internal support routines -------------------------------------------------------------------------- function SwapCardinal(C: Cardinal): Cardinal; // swaps all bytes in C from MSB to LSB order // EAX contains both parameter as well as result asm BSWAP EAX end; //----------------- support for character properties ------------------------------------------------------------------- var PropertyOffsets: TWordArray; PropertyRanges: TCardinalArray; procedure LoadUnicodeTypeData; // loads the character property data (as saved by the Unicode database extractor into the ctype.dat file) var I, Size: Integer; Header: TUHeader; Stream: TResourceStream; begin // make sure no other code is currently modifying the global data area if LoadInProgress = nil then LoadInProgress := TCriticalSection.Create; LoadInProgress.Enter; // Data already loaded? if PropertyOffsets = nil then begin Stream := TResourceStream.Create(HInstance, 'TYPE', 'UNICODE'); Stream.Read(Header, SizeOf(Header)); if Header.BOM = BOM_MSB_FIRST then begin Header.Count := Swap(Header.Count); Header.Bytes := SwapCardinal(Header.Bytes); end; // Calculate the offset into the storage for the ranges. The offsets // array is on a 4-byte boundary and one larger than the value provided in // the header count field. This means the offset to the ranges must be // calculated after aligning the count to a 4-byte boundary. Size := (Header.Count + 1) * SizeOf(Word); if (Size and 3) <> 0 then Inc(Size, 4 - (Size and 3)); // fill offsets array SetLength(PropertyOffsets, Size div SizeOf(Word)); Stream.Read(PropertyOffsets[0], Size); // Do an endian swap if necessary. Don't forget there is an extra node on the end with the final index. if Header.BOM = BOM_MSB_FIRST then begin for I := 0 to Header.Count do PropertyOffsets[I] := Swap(PropertyOffsets[I]); end; // Load the ranges. The number of elements is in the last array position of the offsets. SetLength(PropertyRanges, PropertyOffsets[Header.Count]); Stream.Read(PropertyRanges[0], PropertyOffsets[Header.Count] * SizeOf(Cardinal)); // Do an endian swap if necessary. if Header.BOM = BOM_MSB_FIRST then begin for I := 0 to PropertyOffsets[Header.Count] - 1 do PropertyRanges[I] := SwapCardinal(PropertyRanges[I]); end; Stream.Free; end; LoadInProgress.Leave; end; //---------------------------------------------------------------------------------------------------------------------- function PropertyLookup(Code, N: Cardinal): Boolean; var L, R, M: Integer; begin // load property data if not already done if PropertyOffsets = nil then LoadUnicodeTypeData; Result := False; // There is an extra node on the end of the offsets to allow this routine // to work right. If the index is 0xffff, then there are no nodes for the property. L := PropertyOffsets[N]; if L <> $FFFF then begin // Locate the next offset that is not 0xffff. The sentinel at the end of // the array is the max index value. M := 1; while ((Integer(N) + M) < High(PropertyOffsets)) and (PropertyOffsets[Integer(N) + M] = $FFFF) do Inc(M); R := PropertyOffsets[Integer(N) + M] - 1; while L <= R do begin // Determine a "mid" point and adjust to make sure the mid point is at // the beginning of a range pair. M := (L + R) shr 1; Dec(M, M and 1); if Code > PropertyRanges[M + 1] then L := M + 2 else if Code < PropertyRanges[M] then R := M - 2 else if (Code >= PropertyRanges[M]) and (Code <= PropertyRanges[M + 1]) then begin Result := True; Break; end; end; end; end; //---------------------------------------------------------------------------------------------------------------------- function IsProperty(Code, Mask1, Mask2: Cardinal): Boolean; var I: Cardinal; Mask: Cardinal; begin Result := False; if Mask1 <> 0 then begin Mask := 1; for I := 0 to 31 do begin if ((Mask1 and Mask) <> 0) and PropertyLookup(Code, I) then begin Result := True; Exit; end; Mask := Mask shl 1; end; end; if Mask2 <> 0 then begin I := 32; Mask := 1; while I < Cardinal(High(PropertyOffsets)) do begin if ((Mask2 and Mask) <> 0) and PropertyLookup(Code, I) then begin Result := True; Exit; end; Inc(I); Mask := Mask shl 1; end; end; end; //----------------- support for case mapping --------------------------------------------------------------------------- var CaseMapSize: Cardinal; CaseLengths: array[0..1] of Word; CaseMap: TCardinalArray; procedure LoadUnicodeCaseData; var Stream: TResourceStream; I: Cardinal; Header: TUHeader; begin // make sure no other code is currently modifying the global data area if LoadInProgress = nil then LoadInProgress := TCriticalSection.Create; LoadInProgress.Enter; if CaseMap = nil then begin Stream := TResourceStream.Create(HInstance, 'CASE', 'UNICODE'); Stream.Read(Header, SizeOf(Header)); if Header.BOM = BOM_MSB_FIRST then begin Header.Count := Swap(Header.Count); Header.Len[0] := Swap(Header.Len[0]); Header.Len[1] := Swap(Header.Len[1]); end; // Set the node count and lengths of the upper and lower case mapping tables. CaseMapSize := Header.Count * 3; CaseLengths[0] := Header.Len[0] * 3; CaseLengths[1] := Header.Len[1] * 3; SetLength(CaseMap, CaseMapSize); // Load the case mapping table. Stream.Read(CaseMap[0], CaseMapSize * SizeOf(Cardinal)); // Do an endian swap if necessary. if Header.BOM = BOM_MSB_FIRST then for I := 0 to CaseMapSize -1 do CaseMap[I] := SwapCardinal(CaseMap[I]); Stream.Free; end; LoadInProgress.Leave; end; //---------------------------------------------------------------------------------------------------------------------- function CaseLookup(Code: Cardinal; L, R, Field: Integer): Cardinal; var M: Integer; begin // load case mapping data if not already done if CaseMap = nil then LoadUnicodeCaseData; // Do the binary search. while L <= R do begin // Determine a "mid" point and adjust to make sure the mid point is at // the beginning of a case mapping triple. M := (L + R) shr 1; Dec(M, M mod 3); if Code > CaseMap[M] then L := M + 3 else if Code < CaseMap[M] then R := M - 3 else if Code = CaseMap[M] then begin Result := CaseMap[M + Field]; Exit; end; end; Result := Code; end; //---------------------------------------------------------------------------------------------------------------------- function UnicodeToUpper(Code: UCS4): UCS4; var Field, L, R: Integer; begin // load case mapping data if not already done if CaseMap = nil then LoadUnicodeCaseData; if UnicodeIsUpper(Code) then Result := Code else begin if UnicodeIsLower(Code) then begin Field := 2; L := CaseLengths[0]; R := (L + CaseLengths[1]) - 3; end else begin Field := 1; L := CaseLengths[0] + CaseLengths[1]; R := CaseMapSize - 3; end; Result := CaseLookup(Code, L, R, Field); end; end; //---------------------------------------------------------------------------------------------------------------------- function UnicodeToLower(Code: UCS4): UCS4; var Field, L, R: Integer; begin // load case mapping data if not already done if CaseMap = nil then LoadUnicodeCaseData; if UnicodeIsLower(Code) then Result := Code else begin if UnicodeIsUpper(Code) then begin Field := 1; L := 0; R := CaseLengths[0] - 3; end else begin Field := 2; L := CaseLengths[0] + CaseLengths[1]; R := CaseMapSize - 3; end; Result := CaseLookup(Code, L, R, Field); end; end; //---------------------------------------------------------------------------------------------------------------------- function UnicodeToTitle(Code: UCS4): UCS4; var Field, L, R: Integer; begin // load case mapping data if not already done if CaseMap = nil then LoadUnicodeCaseData; if UnicodeIsTitle(Code) then Result := Code else begin // The offset will always be the same for converting to title case. Field := 2; if UnicodeIsUpper(Code) then begin L := 0; R := CaseLengths[0] - 3; end else begin L := CaseLengths[0]; R := (L + CaseLengths[1]) - 3; end; Result := CaseLookup(Code, L, R, Field); end; end; //----------------- Support for decomposition -------------------------------------------------------------------------- const // constants for hangul composition and decomposition (this is done algorithmically // saving so significant memory) SBase = $AC00; LBase = $1100; VBase = $1161; TBase = $11A7; LCount = 19; VCount = 21; TCount = 28; NCount = VCount * TCount; // 588 SCount = LCount * NCount; // 11172 var DecompositionSize: Cardinal; DecompositionNodes, Decompositions: TCardinalArray; //---------------------------------------------------------------------------------------------------------------------- procedure LoadUnicodeDecompositionData; var Stream: TResourceStream; I: Cardinal; Header: TUHeader; begin // make sure no other code is currently modifying the global data area if LoadInProgress = nil then LoadInProgress := TCriticalSection.Create; LoadInProgress.Enter; if Decompositions = nil then begin Stream := TResourceStream.Create(HInstance, 'DECOMPOSE', 'UNICODE'); Stream.Read(Header, SizeOf(Header)); if Header.BOM = BOM_MSB_FIRST then begin Header.Count := Swap(Header.Count); Header.Bytes := SwapCardinal(Header.Bytes); end; DecompositionSize := Header.Count shl 1; // two values per node SetLength(DecompositionNodes, DecompositionSize + 1); // one entry more (the sentinel) Stream.Read(DecompositionNodes[0], (DecompositionSize + 1) * SizeOf(Cardinal)); SetLength(Decompositions, (Header.Bytes div SizeOf(Cardinal)) - DecompositionSize - 1); Stream.Read(Decompositions[0], Length(Decompositions) * SizeOf(Cardinal)); // Do an endian swap if necessary. if Header.BOM = BOM_MSB_FIRST then begin for I := 0 to High(DecompositionNodes) do DecompositionNodes[I] := SwapCardinal(DecompositionNodes[I]); for I := 0 to High(Decompositions) do Decompositions[I] := SwapCardinal(Decompositions[I]); end; Stream.Free; end; LoadInProgress.Leave; end; //---------------------------------------------------------------------------------------------------------------------- function UnicodeDecomposeHangul(Code: UCS4): TCardinalArray; // algorithmically decompose hangul character using some predefined contstants var Rest: Integer; begin if not UnicodeIsHangul(Code) then Result := nil else begin Dec(Code, SBase); Rest := Code mod TCount; if Rest = 0 then SetLength(Result, 2) else SetLength(Result, 3); Result[0] := LBase + (Code div NCount); Result[1] := VBase + ((Code mod NCount) div TCount); if Rest <> 0 then Result[2] := TBase + Rest; end; end; //---------------------------------------------------------------------------------------------------------------------- function UnicodeDecompose(Code: UCS4): TCardinalArray; var L, R, M: Integer; begin // load decomposition data if not already done if Decompositions = nil then LoadUnicodeDecompositionData; if not UnicodeIsComposite(Code) then begin // return the code itself if it is not a composite SetLength(Result, 1); Result[0] := Code; end else begin // if the code is hangul then decomposition is algorithmically Result := UnicodeDecomposeHangul(Code); if Result = nil then begin L := 0; R := DecompositionNodes[DecompositionSize] - 1; while L <= R do begin // Determine a "mid" point and adjust to make sure the mid point is at // the beginning of a code + offset pair. M := (L + R) shr 1; Dec(M, M and 1); if Code > DecompositionNodes[M] then L := M + 2 else if Code < DecompositionNodes[M] then R := M - 2 else if Code = DecompositionNodes[M] then begin // found a decomposition, return the codes SetLength(Result, DecompositionNodes[M + 3] - DecompositionNodes[M + 1] - 1); Move(Decompositions[DecompositionNodes[M + 1]], Result[0], Length(Result) * SizeOf(Cardinal)); Break; end; end; end; end; end; //----------------- Support for combining classes ---------------------------------------------------------------------- var CCLSize: Cardinal; CCLNodes: TCardinalArray; //---------------------------------------------------------------------------------------------------------------------- procedure LoadUnicodeCombiningData; var Stream: TResourceStream; I: Cardinal; Header: TUHeader; begin // make sure no other code is currently modifying the global data area if LoadInProgress = nil then LoadInProgress := TCriticalSection.Create; LoadInProgress.Enter; if CCLNodes = nil then begin Stream := TResourceStream.Create(HInstance, 'COMBINE', 'UNICODE'); Stream.Read(Header, SizeOf(Header)); if Header.BOM = BOM_MSB_FIRST then begin Header.Count := Swap(Header.Count); Header.Bytes := SwapCardinal(Header.Bytes); end; CCLSize := Header.Count * 3; SetLength(CCLNodes, CCLSize); Stream.Read(CCLNodes[0], CCLSize * SizeOf(Cardinal)); if Header.BOM = BOM_MSB_FIRST then for I := 0 to CCLSize - 1 do CCLNodes[I] := SwapCardinal(CCLNodes[I]); Stream.Free; end; LoadInProgress.Leave; end; //---------------------------------------------------------------------------------------------------------------------- function UnicodeCanonicalClass(Code: Cardinal): Cardinal; var L, R, M: Integer; begin // load combination data if not already done if CCLNodes = nil then LoadUnicodeCombiningData; Result := 0; L := 0; R := CCLSize - 1; while L <= R do begin M := (L + R) shr 1; Dec(M, M mod 3); if Code > CCLNodes[M + 1] then L := M + 3 else if Code < CCLNodes[M] then R := M - 3 else if (Code >= CCLNodes[M]) and (Code <= CCLNodes[M + 1]) then begin Result := CCLNodes[M + 2]; Break; end; end; end; //----------------- Support for numeric values ------------------------------------------------------------------------- var NumberSize: Cardinal; NumberNodes: TCardinalArray; NumberValues: TWordArray; //---------------------------------------------------------------------------------------------------------------------- procedure LoadUnicodeNumberData; var Stream: TResourceStream; I: Cardinal; Header: TUHeader; begin // make sure no other code is currently modifying the global data area if LoadInProgress = nil then LoadInProgress := TCriticalSection.Create; LoadInProgress.Enter; if NumberNodes = nil then begin Stream := TResourceStream.Create(HInstance, 'NUMBERS', 'UNICODE'); Stream.Read(Header, SizeOf(Header)); if Header.BOM = BOM_MSB_FIRST then begin Header.Count := Swap(Header.Count); Header.Bytes := SwapCardinal(Header.Bytes); end; NumberSize := Header.Count; SetLength(NumberNodes, NumberSize); Stream.Read(NumberNodes[0], NumberSize * SizeOf(Cardinal)); SetLength(NumberValues, (Header.Bytes - NumberSize * SizeOf(Cardinal)) div SizeOf(Word)); Stream.Read(NumberValues[0], Length(NumberValues) * SizeOf(Word)); if Header.BOM = BOM_MSB_FIRST then begin for I := 0 to High(NumberNodes) do NumberNodes[I] := SwapCardinal(NumberNodes[I]); for I := 0 to High(NumberValues) do NumberValues[I] := Swap(NumberValues[I]); end; Stream.Free; end; LoadInProgress.Leave; end; //---------------------------------------------------------------------------------------------------------------------- function UnicodeNumberLookup(Code: UCS4; var num: TUNumber): Boolean; var L, R, M: Integer; VP: PWord; begin // load number data if not already done if NumberNodes = nil then LoadUnicodeNumberData; Result := False; L := 0; R := NumberSize - 1; while L <= R do begin // Determine a "mid" point and adjust to make sure the mid point is at // the beginning of a code+offset pair. M := (L + R) shr 1; Dec(M, M and 1); if Code > NumberNodes[M] then L := M + 2 else if Code < NumberNodes[M] then R := M - 2 else begin VP := Pointer(Cardinal(@NumberValues[0]) + NumberNodes[M + 1]); num.numerator := VP^; Inc(VP); num.denominator := VP^; Result := True; Break; end; end; end; //---------------------------------------------------------------------------------------------------------------------- function UnicodeDigitLookup(Code: UCS4; var Digit: Integer): Boolean; var L, R, M: Integer; VP: PWord; begin // load number data if not already done if NumberNodes = nil then LoadUnicodeNumberData; Result := False; L := 0; R := NumberSize - 1; while L <= R do begin // Determine a "mid" point and adjust to make sure the mid point is at // the beginning of a code+offset pair. M := (L + R) shr 1; Dec(M, M and 1); if Code > NumberNodes[M] then L := M + 2 else if Code < NumberNodes[M] then R := M - 2 else begin VP := Pointer(Cardinal(@NumberValues[0]) + NumberNodes[M + 1]); M := VP^; Inc(VP); if M = VP^ then begin Digit := M; Result := True; end; Break; end; end; end; //---------------------------------------------------------------------------------------------------------------------- function UnicodeGetNumber(Code: UCS4): TUNumber; begin // Initialize with some arbitrary value, because the caller simply cannot // tell for sure if the code is a number without calling the ucisnumber() // macro before calling this function. Result.Numerator := -111; Result.Denominator := -111; UnicodeNumberLookup(Code, Result); end; //---------------------------------------------------------------------------------------------------------------------- function UnicodeGetDigit(Code: UCS4): Integer; begin // Initialize with some arbitrary value, because the caller simply cannot // tell for sure if the code is a number without calling the ucisdigit() // macro before calling this function. Result := -111; UnicodeDigitLookup(Code, Result); end; //----------------- TSearchEngine -------------------------------------------------------------------------------------- constructor TSearchEngine.Create(AOwner: TWideStrings); begin FOwner := AOwner; FResults := TList.Create; end; //---------------------------------------------------------------------------------------------------------------------- destructor TSearchEngine.Destroy; begin Clear; FResults.Free; inherited; end; //---------------------------------------------------------------------------------------------------------------------- procedure TSearchEngine.AddResult(Start, Stop: Cardinal); begin FResults.Add(Pointer(Start)); FResults.Add(Pointer(Stop)); end; //---------------------------------------------------------------------------------------------------------------------- procedure TSearchEngine.Clear; begin ClearResults; end; //---------------------------------------------------------------------------------------------------------------------- procedure TSearchEngine.ClearResults; begin FResults.Clear; end; //---------------------------------------------------------------------------------------------------------------------- procedure TSearchEngine.DeleteResult(Index: Cardinal); // explicitly deletes a search result begin with FResults do begin // start index Delete(2 * Index); // stop index Delete(2 * Index); end; end; //---------------------------------------------------------------------------------------------------------------------- function TSearchEngine.GetCount: Integer; // returns the number of matches found begin Result := FResults.Count div 2; end; //---------------------------------------------------------------------------------------------------------------------- procedure TSearchEngine.GetResult(Index: Cardinal; var Start, Stop: Integer); // returns the start position of a match (end position can be determined by adding the length // of the pattern to the start position) begin Start := Cardinal(FResults[2 * Index]); Stop := Cardinal(FResults[2 * Index + 1]); end; //----------------- TUTBMSearch ---------------------------------------------------------------------------------------- constructor TUTBMSearch.Create(AOwner: TWideStrings); begin inherited; end; //---------------------------------------------------------------------------------------------------------------------- destructor TUTBMSearch.Destroy; begin inherited; end; //---------------------------------------------------------------------------------------------------------------------- procedure TUTBMSearch.ClearPattern; begin FreeMem(FPattern); FPattern := nil; FFlags := []; FPatternUsed := 0; FPatternSize := 0; FPatternLength := 0; FreeMem(FSkipValues); FSkipValues := nil; FSkipsUsed := 0; FMD4 := 0; end; //---------------------------------------------------------------------------------------------------------------------- function TUTBMSearch.GetSkipValue(TextStart, TextEnd: PUCS2): Cardinal; // looks up the SkipValues value for a character var I: Integer; C1, C2: UCS4; Sp: PUTBMSkip; begin Result := 0; if Cardinal(TextStart) < Cardinal(TextEnd) then begin C1 := Word(TextStart^); if (TextStart + 1) < TextEnd then C2 := Word((TextStart + 1)^) else C2 := $FFFFFFFF; if (SurrogateHighStart <= C1) and (C1 <= SurrogateHighEnd) and (SurrogateLowStart <= C2) and (C2 <= $DDDD) then C1 := $10000 + (((C1 and $03FF) shl 10) or (C2 and $03FF)); Sp := FSkipValues; for I := 0 to FSkipsUsed - 1 do begin if not (Boolean(C1 xor Sp.BMChar.UpCase) and Boolean(C1 xor Sp.BMChar.LoCase) and Boolean(C1 xor Sp.BMChar.TitleCase)) then begin if (TextEnd - TextStart) < Sp.SkipValues then Result := TextEnd - TextStart else Result := Sp.SkipValues; Exit; end; Inc(Sp); end; Result := FPatternLength; end; end; //---------------------------------------------------------------------------------------------------------------------- function TUTBMSearch.Match(Text, Start, Stop: PUCS2; var MatchStart, MatchEnd: Cardinal): Boolean; // Checks once whether the text at position Start (which points to the end of the current text part to be matched) // matches. // Note: If whole words only are allowed then the left and right border tests are done here too. The keypoint for the // right border is that the next character after the search string is either the text end or a space character. // For the left side this is similar, but there is nothing like a string start marker (like the string end marker #0). // // It seems not obvious, but we still can use the passed Text pointer to do the left check. Although this pointer // might not point to the real string start (e.g. in TUTBMSearch.FindAll Text is incremented as needed) it is // still a valid check mark. The reason is that Text either points to the real string start or a previous match // (happend already, keep in mind the search options do not change in the FindAll loop) and the character just // before Text is a space character. // This fact implies, though, that strings passed to Find (or FindFirst, FindAll in TUTBMSearch) always really // start at the given address. Although this might not be the case in some circumstances (e.g. if you pass only // the selection from an editor) it is still assumed that a pattern matching from the first position on (from the // search string start) also matches when whole words only are allowed. var CheckSpace: Boolean; C1, C2: UCS4; Count: Integer; Cp: PUTBMChar; begin // be pessimistic Result := False; // set the potential match endpoint first MatchEnd := (Start - Text) + 1; C1 := Word(Start^); if (Start + 1) < Stop then C2 := Word((Start + 1)^) else C2 := $FFFFFFFF; if (SurrogateHighStart <= C1) and (C1 <= SurrogateHighEnd) and (SurrogateLowStart <= C2) and (C2 <= SurrogateLowEnd) then begin C1 := $10000 + (((C1 and $03FF) shl 10) or (C2 and $03FF)); // Adjust the match end point to occur after the UTF-16 character. Inc(MatchEnd); end; // check special cases if FPatternUsed = 1 then begin MatchStart := Start - Text; Result := True; Exit; end; // Early out if entire words need to be matched and the next character // in the search string is neither the string end nor a space character. if (sfWholeWordOnly in FFlags) and not ((Start + 1)^ = WideNull) and not UnicodeIsWhiteSpace(Word((Start + 1)^)) then Exit; // compare backward Cp := FPattern; Inc(Cp, FPatternUsed - 1); Count := FPatternLength; while (Start >= Text) and (Count > 0) do begin // ignore non-spacing characters if indicated if sfIgnoreNonSpacing in FFlags then begin while (Start > Text) and UnicodeIsNonSpacing(C1) do begin Dec(Start); C2 := Word(Start^); if (Start - 1) > Text then C1 := Word((Start - 1)^) else C1 := $FFFFFFFF; if (SurrogateLowStart <= C2) and (C2 <= SurrogateLowEnd) and (SurrogateHighStart <= C1) and (C1 <= SurrogateHighEnd) then begin C1 := $10000 + (((C1 and $03FF) shl 10) or (C2 and $03FF)); Dec(Start); end else C1 := C2; end; end; // handle space compression if indicated if sfSpaceCompress in FFlags then begin CheckSpace := False; while (Start > Text) and (UnicodeIsWhiteSpace(C1) or UnicodeIsControl(C1)) do begin CheckSpace := UnicodeIsWhiteSpace(C1); Dec(Start); C2 := Word(Start^); if (Start - 1) > Text then C1 := Word((Start - 1)^) else C1 := $FFFFFFFF; if (SurrogateLowStart <= C2) and (C2 <= SurrogateLowEnd) and (SurrogateHighStart <= C1) and (C1 <= SurrogateHighEnd) then begin C1 := $10000 + (((C1 and $03FF) shl 10) or (C2 and $03FF)); Dec(Start); end else C1 := C2; end; // Handle things if space compression was indicated and one or // more member characters were found. if CheckSpace then begin if Cp.UpCase <> $20 then Exit; Dec(Cp); Dec(Count); // If Count is 0 at this place then the space character(s) was the first // in the pattern and we need to correct the start position. if Count = 0 then Inc(Start); end; end; // handle the normal comparison cases if (Count > 0) and (Boolean(C1 xor Cp.UpCase) and Boolean(C1 xor Cp.LoCase) and Boolean(C1 xor Cp.TitleCase)) then Exit; if C1 >= $10000 then Dec(Count, 2) else Dec(Count, 1); if Count > 0 then begin Dec(Cp); // get the next preceding character if Start > Text then begin Dec(Start); C2 := Word(Start^); if (Start - 1) > Text then C1 := Word((Start - 1)^) else C1 := $FFFFFFFF; if (SurrogateLowStart <= C2) and (C2 <= SurrogateLowEnd) and (SurrogateHighStart <= C1) and (C1 <= SurrogateHighEnd) then begin C1 := $10000 + (((C1 and $03FF) shl 10) or (C2 and $03FF)); Dec(Start); end else C1 := C2; end; end; end; // So far the string matched. Now check its left border for a space character if // whole word only are allowed. if not (sfWholeWordOnly in FFlags) or (Start <= Text) or UnicodeIsWhiteSpace(Word((Start - 1)^)) then begin // set the match start position MatchStart := Start - Text; Result := True; end; end; //---------------------------------------------------------------------------------------------------------------------- procedure TUTBMSearch.Compile(Pattern: PUCS2; PatternLength: Integer; Flags: TSearchFlags); var HaveSpace: Boolean; I, J, K, SLen: Integer; Cp: PUTBMChar; Sp: PUTBMSkip; C1, C2, Sentinel: UCS4; begin if Assigned(Pattern) and (Pattern^ <> #0) and (PatternLength > 0) then begin // do some initialization FFlags := Flags; // extra skip flag FMD4 := 1; Sentinel := 0; // allocate more storage if necessary FPattern := AllocMem(SizeOf(TUTBMChar) * PatternLength); FSkipValues := AllocMem(SizeOf(TUTBMSkip) * PatternLength); FPatternSize := PatternLength; // Preprocess the pattern to remove controls (if specified) and determine case. Cp := FPattern; I := 0; HaveSpace := False; while I < PatternLength do begin C1 := Word(Pattern[I]); if (I + 1) < PatternLength then C2 := Word(Pattern[I + 1]) else C2 := $FFFFFFFF; if (SurrogateHighStart <= C1) and (C1 <= SurrogateHighEnd) and (SurrogateLowStart <= C2) and (C2 <= SurrogateLowEnd) then C1 := $10000 + (((C1 and $03FF) shl 10) or (C2 and $03FF)); // Make sure the HaveSpace flag is turned off if the character is not an appropriate one. if not UnicodeIsWhiteSpace(C1) then HaveSpace := False; // If non-spacing characters should be ignored, do it here. if (sfIgnoreNonSpacing in Flags) and UnicodeIsNonSpacing(C1) then begin Inc(I); Continue; end; // check if spaces and controls need to be compressed if sfSpaceCompress in Flags then begin if UnicodeIsWhiteSpace(C1) then begin if not HaveSpace then begin // Add a space and set the flag. Cp.UpCase := $20; Cp.LoCase := $20; Cp.TitleCase := $20; Inc(Cp); // increase the real pattern length Inc(FPatternLength); Sentinel := $20; HaveSpace := True; end; Inc(I); Continue; end; // ignore all control characters if UnicodeIsControl(C1) then begin Inc(I); Continue; end; end; // add the character if not (sfCaseSensitive in Flags) then begin Cp.UpCase := UnicodeToUpper(C1); Cp.LoCase := UnicodeToLower(C1); Cp.TitleCase := UnicodeToTitle(C1); end else begin Cp.UpCase := C1; Cp.LoCase := C1; Cp.TitleCase := C1; end; Sentinel := Cp.UpCase; // move to the next character Inc(Cp); // increase the real pattern length appropriately if C1 >= $10000 then Inc(FPatternLength, 2) else Inc(FPatternLength, 1); // increment the loop index for UTF-16 characters if C1 > $10000 then Inc(I, 2) else Inc(I); end; // set the number of characters actually used FPatternUsed := (PChar(Cp) - PChar(FPattern)) div SizeOf(TUTBMChar); // Go through and construct the skip array and determine the actual length // of the pattern in UCS2 terms. SLen := FPatternLength - 1; Cp := FPattern; K := 0; for I := 0 to FPatternUsed - 1 do begin // locate the character in the FSkipValues array Sp := FSkipValues; J := 0; while (J < FSkipsUsed) and (Sp.BMChar.UpCase <> Cp.UpCase) do begin Inc(J); Inc(Sp); end; // If the character is not found, set the new FSkipValues element and // increase the number of FSkipValues elements. if J = FSkipsUsed then begin Sp.BMChar := Cp; Inc(FSkipsUsed); end; // Set the updated FSkipValues value. If the character is UTF-16 and is // not the last one in the pattern, add one to its FSkipValues value. Sp.SkipValues := SLen - K; if (Cp.UpCase >= $10000) and ((K + 2) < SLen) then Inc(Sp.SkipValues); // set the new extra FSkipValues for the sentinel character if ((Cp.UpCase >= $10000) and ((K + 2) <= SLen) or ((K + 1) <= SLen) and (Cp.UpCase = Sentinel)) then FMD4 := SLen - K; // increase the actual index if Cp.UpCase >= $10000 then Inc(K, 2) else Inc(K, 1); Inc(Cp); end; end; end; //---------------------------------------------------------------------------------------------------------------------- function TUTBMSearch.Find(Text: PUCS2; TextLen: Cardinal; var MatchStart, MatchEnd: Cardinal): Boolean; // this is the main matching routine using a tuned Boyer-Moore algorithm var K: Cardinal; Start, Stop: PUCS2; begin Result := False; if Assigned(FPattern) and (FPatternUsed > 0) and Assigned(Text) and (TextLen > 0) and (TextLen >= FPatternLength) then begin Start := Text + FPatternLength - 1; Stop := Text + TextLen; // adjust the start point if it points to a low surrogate if (SurrogateLowStart <= UCS4(Start^)) and (UCS4(Start^) <= SurrogateLowEnd) and (SurrogateHighStart <= UCS4((Start - 1)^)) and (UCS4((Start - 1)^) <= SurrogateHighEnd) then Dec(Start); while Start < Stop do begin repeat K := GetSkipValue(Start, Stop); if K = 0 then Break; Inc(Start, K); if (Start < Stop) and (SurrogateLowStart <= UCS4(Start^)) and (UCS4(Start^) <= SurrogateLowEnd) and (SurrogateHighStart <= UCS4((Start - 1)^)) and (UCS4((Start - 1)^) <= SurrogateHighEnd) then Dec(Start); until False; if (Start < Stop) and Match(Text, Start, Stop, MatchStart, MatchEnd) then begin Result := True; Break; end; Inc(Start, FMD4); if (Start < Stop) and (SurrogateLowStart <= UCS4(Start^)) and (UCS4(Start^) <= SurrogateLowEnd) and (SurrogateHighStart <= UCS4((Start - 1)^)) and (UCS4((Start - 1)^) <= SurrogateHighEnd) then Dec(Start); end; end; end; //---------------------------------------------------------------------------------------------------------------------- procedure TUTBMSearch.Clear; begin ClearPattern; inherited; end; //---------------------------------------------------------------------------------------------------------------------- function TUTBMSearch.FindAll(const Text: WideString): Boolean; begin Result := FindAll(PWideChar(Text), Length(Text)); end; //---------------------------------------------------------------------------------------------------------------------- function TUTBMSearch.FindAll(const Text: PWideChar; TextLen: Cardinal): Boolean; // Looks for all occurences of the pattern passed to FindPrepare and creates an internal list of their positions. var Start, Stop: Cardinal; Run: PWideChar; RunLen: Cardinal; begin ClearResults; Run := Text; RunLen := TextLen; // repeat to find all occurences of the pattern while Find(Run, RunLen, Start, Stop) do begin // store this result (consider text pointer movement)... AddResult(Start + Run - Text, Stop + Run - Text); // ... and advance text position and length Inc(Run, Stop); Dec(RunLen, Stop); end; Result := Count > 0; end; //---------------------------------------------------------------------------------------------------------------------- function TUTBMSearch.FindFirst(const Text: WideString; var Start, Stop: Cardinal): Boolean; // Looks for the first occurence of the pattern passed to FindPrepare in Text and returns True if one could be // found (in which case Start and Stop are set to the according indices) otherwise False. // This function is in particular of interest if only one occurence needs to be found. begin ClearResults; Result := Find(PWideChar(Text), Length(Text), Start, Stop); if Result then AddResult(Start, Stop); end; //---------------------------------------------------------------------------------------------------------------------- function TUTBMSearch.FindFirst(const Text: PWideChar; TextLen: Cardinal; var Start, Stop: Cardinal): Boolean; // Same as the WideString version of this method. begin ClearResults; Result := Find(Text, TextLen, Start, Stop); if Result then AddResult(Start, Stop); end; //---------------------------------------------------------------------------------------------------------------------- procedure TUTBMSearch.FindPrepare(const Pattern: WideString; Options: TSearchFlags); begin FindPrepare(PWideChar(Pattern), Length(Pattern), Options); end; //---------------------------------------------------------------------------------------------------------------------- procedure TUTBMSearch.FindPrepare(const Pattern: PWideChar; PatternLength: Cardinal; Options: TSearchFlags); // prepare following search by compiling the given pattern into an internal structure begin Compile(Pattern, PatternLength, Options); end; //----------------- Unicode RE search core ----------------------------------------------------------------------------- const _URE_NONSPACING = $00000001; _URE_COMBINING = $00000002; _URE_NUMDIGIT = $00000004; _URE_NUMOTHER = $00000008; _URE_SPACESEP = $00000010; _URE_LINESEP = $00000020; _URE_PARASEP = $00000040; _URE_CNTRL = $00000080; _URE_PRIVATE = $00000100; _URE_UPPER = $00000200; _URE_LOWER = $00000400; _URE_TITLE = $00000800; _URE_MODIFIER = $00001000; _URE_OTHERLETTER = $00002000; _URE_DASHPUNCT = $00004000; _URE_OPENPUNCT = $00008000; _URE_CLOSEPUNCT = $00010000; _URE_OTHERPUNCT = $00020000; _URE_MATHSYM = $00040000; _URE_CURRENCYSYM = $00080000; _URE_OTHERSYM = $00100000; _URE_LTR = $00200000; _URE_RTL = $00400000; _URE_EURONUM = $00800000; _URE_EURONUMSEP = $01000000; _URE_EURONUMTERM = $02000000; _URE_ARABNUM = $04000000; _URE_COMMONSEP = $08000000; _URE_BLOCKSEP = $10000000; _URE_SEGMENTSEP = $20000000; _URE_WHITESPACE = $40000000; _URE_OTHERNEUT = $80000000; // Error codes _URE_OK = 0; _URE_UNEXPECTED_EOS = -1; _URE_CCLASS_OPEN = -2; _URE_UNBALANCED_GROUP = -3; _URE_INVALID_PROPERTY = -4; _URE_INVALID_RANGE = -5; _URE_RANGE_OPEN = -6; // options that can be combined for searching URE_IGNORE_NONSPACING = $01; URE_DONT_MATCHES_SEPARATORS = $02; const // Flags used internally in the FDFA _URE_DFA_CASEFOLD = $01; _URE_DFA_BLANKLINE = $02; CClassFlags: array[0..32] of Cardinal = ( 0, _URE_NONSPACING, _URE_COMBINING, _URE_NUMDIGIT, _URE_NUMOTHER, _URE_SPACESEP, _URE_LINESEP, _URE_PARASEP, _URE_CNTRL, _URE_PRIVATE, _URE_UPPER, _URE_LOWER, _URE_TITLE, _URE_MODIFIER, _URE_OTHERLETTER, _URE_DASHPUNCT, _URE_OPENPUNCT, _URE_CLOSEPUNCT, _URE_OTHERPUNCT, _URE_MATHSYM, _URE_CURRENCYSYM, _URE_OTHERSYM, _URE_LTR, _URE_RTL, _URE_EURONUM, _URE_EURONUMSEP, _URE_EURONUMTERM, _URE_ARABNUM, _URE_COMMONSEP, _URE_BLOCKSEP, _URE_SEGMENTSEP, _URE_WHITESPACE, _URE_OTHERNEUT ); const // symbol types for the FDFA _URE_ANY_CHAR = 1; _URE_CHAR = 2; _URE_CCLASS = 3; _URE_NCCLASS = 4; _URE_BOL_ANCHOR = 5; _URE_EOL_ANCHOR = 6; // op codes for converting the NFA to a FDFA _URE_SYMBOL = 10; _URE_PAREN = 11; _URE_QUEST = 12; _URE_STAR = 13; _URE_PLUS = 14; _URE_ONE = 15; _URE_AND = 16; _URE_OR = 17; _URE_NOOP = $FFFF; _URE_REGSTART = $8000; _URE_REGEND = $4000; //----------------- TURESearch ----------------------------------------------------------------------------------------- constructor TURESearch.Create(AOwner: TWideStrings); begin inherited; end; //---------------------------------------------------------------------------------------------------------------------- destructor TURESearch.Destroy; begin inherited; end; //---------------------------------------------------------------------------------------------------------------------- procedure TURESearch.Clear; begin inherited; ClearUREBuffer; ClearDFA; end; //---------------------------------------------------------------------------------------------------------------------- procedure TURESearch.Push(V: Cardinal); begin with FUREBuffer do begin // If the 'Reducing' parameter is True, check to see if the value passed is already on the stack. if Reducing and ExpressionList.Expressions[Word(V)].OnStack then Exit; if Stack.ListUsed = Length(Stack.List) then SetLength(Stack.List, Length(Stack.List) + 8); Stack.List[Stack.ListUsed] := V; Inc(Stack.ListUsed); // If the 'reducing' parameter is True, flag the element as being on the Stack. if Reducing then ExpressionList.Expressions[Word(V)].OnStack := True; end; end; //---------------------------------------------------------------------------------------------------------------------- function TURESearch.Peek: Cardinal; begin if FUREBuffer.Stack.ListUsed = 0 then Result := _URE_NOOP else Result := FUREBuffer.Stack.List[FUREBuffer.Stack.ListUsed - 1]; end; //---------------------------------------------------------------------------------------------------------------------- function TURESearch.Pop: Cardinal; begin if FUREBuffer.Stack.ListUsed = 0 then Result := _URE_NOOP else begin Dec(FUREBuffer.Stack.ListUsed); Result := FUREBuffer.Stack.List[FUREBuffer.Stack.ListUsed]; if FUREBuffer.Reducing then FUREBuffer.ExpressionList.Expressions[Word(Result)].OnStack := False; end; end; //---------------------------------------------------------------------------------------------------------------------- function TURESearch.ParsePropertyList(Properties: PUCS2; Limit: Cardinal; var Mask: Cardinal): Cardinal; // Parse a comma-separated list of integers that represent character properties. Combine them // into a mask that is returned in the 'mask' variable, and return the number of characters consumed. var M, N: Cardinal; Run, ListEnd: PUCS2; begin Run := Properties; ListEnd := Run + Limit; M := 0; N := 0; while (FUREBuffer.Error = _URE_OK) and (Run < ListEnd) do begin if Run^ = ',' then begin // Encountered a comma, so select the next character property flag and reset the number. M := M or CClassFlags[N]; N := 0; end else if (Run^ >= '0') and (Run^ <= '9') then begin // Encountered a digit, so start or Continue building the cardinal that represents the character property flag. N := (N * 10) + Cardinal(Word(Run^) - Ord('0')); end else // Encountered something that is not part of the property list. Indicate that we are done. Break; // If a property number greater than 32 occurs, then there is a problem. Most likely a missing comma separator. if N > 32 then FUREBuffer.Error := _URE_INVALID_PROPERTY; Inc(Run); end; if N in [1..32] then M := M or CClassFlags[N]; // Set the mask that represents the group of character properties. Mask := M; // Return the number of characters consumed. Result := Run - Properties; end; //---------------------------------------------------------------------------------------------------------------------- function TURESearch.MakeHexNumber(NP: PUCS2; Limit: Cardinal; var Number: Cardinal): Cardinal; // Collect a hex number with 1 to 4 digits and return the number of characters used. var I: Integer; Run, ListEnd: PUCS2; begin Run := np; ListEnd := Run + Limit; Number := 0; I := 0; while (I < 4) and (Run < ListEnd) do begin if (Run^ >= '0') and (Run^ <= '9') then Number := (Number shl 4) + Cardinal(Word(Run^) - Ord('0')) else if (Run^ >= 'A') and (Run^ <= 'F') then Number := (Number shl 4) + Cardinal(Word(Run^) - Ord('A')) + 10 else if (Run^ >= 'a') and (Run^ <= 'f') then Number := (Number shl 4) + Cardinal(Word(Run^) - Ord('a')) + 10 else Break; Inc(I); Inc(Run); end; Result := Run - NP; end; //---------------------------------------------------------------------------------------------------------------------- procedure TURESearch.AddRange(var CCL: TCClass; Range: TRange); // Insert a Range into a character class, removing duplicates and ordering them in increasing Range-start order. var I: Integer; Temp: UCS4; begin // If the `Casefold' flag is set, then make sure both endpoints of the Range are converted to lower. if (FUREBuffer.Flags and _URE_DFA_CASEFOLD) <> 0 then begin Range.MinCode := UnicodeToLower(Range.MinCode); Range.MaxCode := UnicodeToLower(Range.MaxCode); end; // Swap the Range endpoints if they are not in increasing order. if Range.MinCode > Range.MaxCode then begin Temp := Range.MinCode; Range.MinCode := Range.MaxCode; Range.MaxCode := Temp; end; I := 0; while (I < CCL.RangesUsed) and (Range.MinCode < CCL.Ranges[I].MinCode) do Inc(I); // check for a duplicate if (I < CCL.RangesUsed) and (Range.MinCode = CCL.Ranges[I].MinCode) and (Range.MaxCode = CCL.Ranges[I].MaxCode) then Exit; if CCL.RangesUsed = Length(CCL.Ranges) then SetLength(CCL.Ranges, Length(CCL.Ranges) + 8); if I < CCL.RangesUsed then Move(CCL.Ranges[I], CCL.Ranges[I + 1], SizeOf(TRange) * (CCL.RangesUsed - I)); CCL.Ranges[I].MinCode := Range.MinCode; CCL.Ranges[I].MaxCode := Range.MaxCode; Inc(CCL.RangesUsed); end; //---------------------------------------------------------------------------------------------------------------------- const _URE_ALPHA_MASK = _URE_UPPER or _URE_LOWER or _URE_OTHERLETTER or _URE_MODIFIER or _URE_TITLE or _URE_NONSPACING or _URE_COMBINING; _URE_ALNUM_MASK = _URE_ALPHA_MASK or _URE_NUMDIGIT; _URE_PUNCT_MASK = _URE_DASHPUNCT or _URE_OPENPUNCT or _URE_CLOSEPUNCT or _URE_OTHERPUNCT; _URE_GRAPH_MASK = _URE_NUMDIGIT or _URE_NUMOTHER or _URE_ALPHA_MASK or _URE_MATHSYM or _URE_CURRENCYSYM or _URE_OTHERSYM; _URE_PRINT_MASK = _URE_GRAPH_MASK or _URE_SPACESEP; _URE_SPACE_MASK = _URE_SPACESEP or _URE_LINESEP or _URE_PARASEP; type PTrie = ^TTrie; TTrie = record Key: UCS2; Len, Next: Cardinal; Setup: Integer; Mask: Cardinal; end; //---------------------------------------------------------------------------------------------------------------------- procedure TURESearch.CCLSetup(Symbol: PSymbolTableEntry; Mask: Cardinal); begin Symbol.Props := Symbol.Props or Mask; end; //---------------------------------------------------------------------------------------------------------------------- procedure TURESearch.SpaceSetup(Symbol: PSymbolTableEntry; Mask: Cardinal); var Range: TRange; begin Symbol.Props := Symbol.Props or Mask; Range.MinCode := Word(Tabulator); Range.MaxCode := Word(Tabulator); AddRange(Symbol.Symbol.CCL, Range); Range.MinCode := Word(CarriageReturn); Range.MaxCode := Word(CarriageReturn); AddRange(Symbol.Symbol.CCL, Range); Range.MinCode := Word(LineFeed); Range.MaxCode := Word(LineFeed); AddRange(Symbol.Symbol.CCL, Range); Range.MinCode := Word(FormFeed); Range.MaxCode := Word(FormFeed); AddRange(Symbol.Symbol.CCL, Range); Range.MinCode := $FEFF; Range.MaxCode := $FEFF; AddRange(Symbol.Symbol.CCL, Range); end; //---------------------------------------------------------------------------------------------------------------------- procedure TURESearch.HexDigitSetup(Symbol: PSymbolTableEntry; Mask: Cardinal); var Range: TRange; begin Range.MinCode := Word('0'); Range.MaxCode := Word('9'); AddRange(Symbol.Symbol.CCL, Range); Range.MinCode := Word('A'); Range.MaxCode := Word('F'); AddRange(Symbol.Symbol.CCL, Range); Range.MinCode := Word('a'); Range.MaxCode := Word('f'); AddRange(Symbol.Symbol.CCL, Range); end; //---------------------------------------------------------------------------------------------------------------------- const CClassTrie: array[0..64] of TTrie = ( (Key: #$003A; Len: 1; Next: 1; Setup: 0; Mask: 0), (Key: #$0061; Len: 9; Next: 10; Setup: 0; Mask: 0), (Key: #$0063; Len: 8; Next: 19; Setup: 0; Mask: 0), (Key: #$0064; Len: 7; Next: 24; Setup: 0; Mask: 0), (Key: #$0067; Len: 6; Next: 29; Setup: 0; Mask: 0), (Key: #$006C; Len: 5; Next: 34; Setup: 0; Mask: 0), (Key: #$0070; Len: 4; Next: 39; Setup: 0; Mask: 0), (Key: #$0073; Len: 3; Next: 49; Setup: 0; Mask: 0), (Key: #$0075; Len: 2; Next: 54; Setup: 0; Mask: 0), (Key: #$0078; Len: 1; Next: 59; Setup: 0; Mask: 0), (Key: #$006C; Len: 1; Next: 11; Setup: 0; Mask: 0), (Key: #$006E; Len: 2; Next: 13; Setup: 0; Mask: 0), (Key: #$0070; Len: 1; Next: 16; Setup: 0; Mask: 0), (Key: #$0075; Len: 1; Next: 14; Setup: 0; Mask: 0), (Key: #$006D; Len: 1; Next: 15; Setup: 0; Mask: 0), (Key: #$003A; Len: 1; Next: 16; Setup: 1; Mask: _URE_ALNUM_MASK), (Key: #$0068; Len: 1; Next: 17; Setup: 0; Mask: 0), (Key: #$0061; Len: 1; Next: 18; Setup: 0; Mask: 0), (Key: #$003A; Len: 1; Next: 19; Setup: 1; Mask: _URE_ALPHA_MASK), (Key: #$006E; Len: 1; Next: 20; Setup: 0; Mask: 0), (Key: #$0074; Len: 1; Next: 21; Setup: 0; Mask: 0), (Key: #$0072; Len: 1; Next: 22; Setup: 0; Mask: 0), (Key: #$006C; Len: 1; Next: 23; Setup: 0; Mask: 0), (Key: #$003A; Len: 1; Next: 24; Setup: 1; Mask: _URE_CNTRL), (Key: #$0069; Len: 1; Next: 25; Setup: 0; Mask: 0), (Key: #$0067; Len: 1; Next: 26; Setup: 0; Mask: 0), (Key: #$0069; Len: 1; Next: 27; Setup: 0; Mask: 0), (Key: #$0074; Len: 1; Next: 28; Setup: 0; Mask: 0), (Key: #$003A; Len: 1; Next: 29; Setup: 1; Mask: _URE_NUMDIGIT), (Key: #$0072; Len: 1; Next: 30; Setup: 0; Mask: 0), (Key: #$0061; Len: 1; Next: 31; Setup: 0; Mask: 0), (Key: #$0070; Len: 1; Next: 32; Setup: 0; Mask: 0), (Key: #$0068; Len: 1; Next: 33; Setup: 0; Mask: 0), (Key: #$003A; Len: 1; Next: 34; Setup: 1; Mask: _URE_GRAPH_MASK), (Key: #$006F; Len: 1; Next: 35; Setup: 0; Mask: 0), (Key: #$0077; Len: 1; Next: 36; Setup: 0; Mask: 0), (Key: #$0065; Len: 1; Next: 37; Setup: 0; Mask: 0), (Key: #$0072; Len: 1; Next: 38; Setup: 0; Mask: 0), (Key: #$003A; Len: 1; Next: 39; Setup: 1; Mask: _URE_LOWER), (Key: #$0072; Len: 2; Next: 41; Setup: 0; Mask: 0), (Key: #$0075; Len: 1; Next: 45; Setup: 0; Mask: 0), (Key: #$0069; Len: 1; Next: 42; Setup: 0; Mask: 0), (Key: #$006E; Len: 1; Next: 43; Setup: 0; Mask: 0), (Key: #$0074; Len: 1; Next: 44; Setup: 0; Mask: 0), (Key: #$003A; Len: 1; Next: 45; Setup: 1; Mask: _URE_PRINT_MASK), (Key: #$006E; Len: 1; Next: 46; Setup: 0; Mask: 0), (Key: #$0063; Len: 1; Next: 47; Setup: 0; Mask: 0), (Key: #$0074; Len: 1; Next: 48; Setup: 0; Mask: 0), (Key: #$003A; Len: 1; Next: 49; Setup: 1; Mask: _URE_PUNCT_MASK), (Key: #$0070; Len: 1; Next: 50; Setup: 0; Mask: 0), (Key: #$0061; Len: 1; Next: 51; Setup: 0; Mask: 0), (Key: #$0063; Len: 1; Next: 52; Setup: 0; Mask: 0), (Key: #$0065; Len: 1; Next: 53; Setup: 0; Mask: 0), (Key: #$003A; Len: 1; Next: 54; Setup: 2; Mask: _URE_SPACE_MASK), (Key: #$0070; Len: 1; Next: 55; Setup: 0; Mask: 0), (Key: #$0070; Len: 1; Next: 56; Setup: 0; Mask: 0), (Key: #$0065; Len: 1; Next: 57; Setup: 0; Mask: 0), (Key: #$0072; Len: 1; Next: 58; Setup: 0; Mask: 0), (Key: #$003A; Len: 1; Next: 59; Setup: 1; Mask: _URE_UPPER), (Key: #$0064; Len: 1; Next: 60; Setup: 0; Mask: 0), (Key: #$0069; Len: 1; Next: 61; Setup: 0; Mask: 0), (Key: #$0067; Len: 1; Next: 62; Setup: 0; Mask: 0), (Key: #$0069; Len: 1; Next: 63; Setup: 0; Mask: 0), (Key: #$0074; Len: 1; Next: 64; Setup: 0; Mask: 0), (Key: #$003A; Len: 1; Next: 65; Setup: 3; Mask: 0) ); //---------------------------------------------------------------------------------------------------------------------- function TURESearch.PosixCCL(CP: PUCS2; Limit: Cardinal; Symbol: PSymbolTableEntry): Cardinal; // Probe for one of the POSIX colon delimited character classes in the static trie. var I: Integer; N: Cardinal; TP: PTrie; Run, ListEnd: PUCS2; begin Result := 0; // If the number of characters left is less than 7, // then this cannot be interpreted as one of the colon delimited classes. if Limit >= 7 then begin Run := cp; ListEnd := Run + Limit; TP := @CClassTrie[0]; I := 0; while (Run < ListEnd) and (I < 8) do begin N := TP.Len; while (N > 0) and (TP.Key <> Run^) do begin Inc(TP); Dec(N); end; if N = 0 then begin Result := 0; Exit; end; if (Run^ = ':') and ((I = 6) or (I = 7)) then begin Inc(Run); Break; end; if (Run + 1) < ListEnd then TP := @CClassTrie[TP.Next]; Inc(I); Inc(Run); end; case TP.Setup of 1: begin CCLSetup(Symbol, TP.Mask); Result := Run - CP; end; 2: begin SpaceSetup(Symbol, TP.Mask); Result := Run - CP; end; 3: begin HexDigitSetup(Symbol, TP.Mask); Result := Run - CP; end; else Result := 0; end; end; end; //---------------------------------------------------------------------------------------------------------------------- function TURESearch.BuildCharacterClass(CP: PUCS2; Limit: Cardinal; Symbol: PSymbolTableEntry): Cardinal; // Construct a list of ranges and return the number of characters consumed. var RangeEnd: Integer; N: Cardinal; Run, ListEnd: PUCS2; C, Last: UCS4; Range: TRange; begin Run := cp; ListEnd := Run + Limit; if Run^ = '^' then begin Symbol.AType := _URE_NCCLASS; Inc(Run); end else Symbol.AType := _URE_CCLASS; Last := 0; RangeEnd := 0; while (FUREBuffer.Error = _URE_OK) and (Run < ListEnd) do begin // Allow for the special case []abc], where the first closing bracket would end an empty // character class, which makes no sense. Hence this bracket is treaded literally. if (Run^ = ']') and (Symbol.Symbol.CCL.RangesUsed > 0) then Break; C := UCS4(Run^); Inc(Run); // escape character if C = Ord('\') then begin if Run = ListEnd then begin // The EOS was encountered when expecting the reverse solidus to be followed by the character it is escaping. // Set an Error code and return the number of characters consumed up to this point. FUREBuffer.Error := _URE_UNEXPECTED_EOS; Result := Run - CP; Exit; end; C := UCS4(Run^); Inc(Run); case UCS2(C) of 'a': C := $07; 'b': C := $08; 'f': C := $0c; 'n': C := $0a; 'R': C := $0d; 't': C := $09; 'v': C := $0b; 'p', 'P': begin Inc(Run, ParsePropertyList(Run, ListEnd - Run, Symbol.Props)); // Invert the bit mask of the properties if this is a negated character class or if 'P' is used to specify // a list of character properties that should *not* match in a character class. if C = Ord('P') then Symbol.Props := not Symbol.Props; Continue; end; 'x', 'X', 'u', 'U': begin if (Run < ListEnd) and ((Run^ >= '0') and (Run^ <= '9') or (Run^ >= 'A') and (Run^ <= 'F') or (Run^ >= 'a') and (Run^ <= 'f')) then Inc(Run, MakeHexNumber(Run, ListEnd - Run, C)); end; end; end else if C = Ord(':') then begin // Probe for a POSIX colon delimited character class. Dec(Run); N := PosixCCL(Run, ListEnd - Run, Symbol); if N = 0 then Inc(Run) else begin Inc(Run, N); Continue; end; end; // Check to see if the current character is a low surrogate that needs // to be combined with a preceding high surrogate. if Last <> 0 then begin if (C >= SurrogateLowStart) and (C <= SurrogateLowEnd) then begin // Construct the UTF16 character code. C := $10000 + (((Last and $03FF) shl 10) or (C and $03FF)) end else begin // Add the isolated high surrogate to the range. if RangeEnd = 1 then Range.MaxCode := Last and $FFFF else begin Range.MinCode := Last and $FFFF; Range.MaxCode := Last and $FFFF; end; AddRange(Symbol.Symbol.CCL, Range); RangeEnd := 0; end; end; // Clear the Last character code. Last := 0; // This slightly awkward code handles the different cases needed to construct a range. if (C >= SurrogateHighStart) and (C <= SurrogateHighEnd) then begin // If the high surrogate is followed by a Range indicator, simply add it as the Range start. Otherwise, // save it in the next character is a low surrogate. if Run^ = '-' then begin Inc(Run); Range.MinCode := C; RangeEnd := 1; end else Last := C; end else if RangeEnd = 1 then begin Range.MaxCode := C; AddRange(Symbol.Symbol.CCL, Range); RangeEnd := 0; end else begin Range.MinCode := C; Range.MaxCode := C; if Run^ = '-' then begin Inc(Run); RangeEnd := 1; end else AddRange(Symbol.Symbol.CCL, Range); end; end; if (Run < ListEnd) and (Run^ = ']') then Inc(Run) else begin // The parse was not terminated by the character class close symbol (']'), so set an error code. FUREBuffer.Error := _URE_CCLASS_OPEN; end; Result := Run - CP; end; //---------------------------------------------------------------------------------------------------------------------- function TURESearch.ProbeLowSurrogate(LeftState: PUCS2; Limit: Cardinal; var Code: UCS4): Cardinal; // Probe for a low surrogate hex code. var I: Integer; Run, ListEnd: PUCS2; begin I := 0; Code := 0; Run := LeftState; ListEnd := Run + Limit; while (I < 4) and (Run < ListEnd) do begin if (Run^ >= '0') and (Run^ <= '9') then Code := (Code shl 4) + Cardinal(Word(Run^) - Ord('0')) else if (Run^ >= 'A') and (Run^ <= 'F') then Code := (Code shl 4) + Cardinal(Word(Run^) - Ord('A')) + 10 else if (Run^ >= 'a') and (Run^ <= 'f') then Code := (Code shl 4) + Cardinal(Word(Run^) - Ord('a')) + 10 else Break; Inc(Run); end; if (SurrogateLowStart <= Code) and (Code <= SurrogateLowEnd) then Result := Run - LeftState else Result := 0; end; //---------------------------------------------------------------------------------------------------------------------- function TURESearch.CompileSymbol(S: PUCS2; Limit: Cardinal; Symbol: PSymbolTableEntry): Cardinal; var C: UCS4; Run, ListEnd: PUCS2; begin Run := S; ListEnd := S + Limit; C := Word(Run^); Inc(Run); if C = Ord('\') then begin if Run = ListEnd then begin // The EOS was encountered when expecting the reverse solidus to be followed by the character it is escaping. // Set an Error code and return the number of characters consumed up to this point. FUREBuffer.Error := _URE_UNEXPECTED_EOS; Result := Run - S; Exit; end; C := Word(Run^); Inc(Run); case UCS2(C) of 'p', 'P': begin if UCS2(C) = 'p' then Symbol.AType :=_URE_CCLASS else Symbol.AType :=_URE_NCCLASS; Inc(Run, ParsePropertyList(Run, ListEnd - Run, Symbol.Props)); end; 'a': begin Symbol.AType := _URE_CHAR; Symbol.Symbol.Chr := $07; end; 'b': begin Symbol.AType := _URE_CHAR; Symbol.Symbol.Chr := $08; end; 'f': begin Symbol.AType := _URE_CHAR; Symbol.Symbol.Chr := $0C; end; 'n': begin Symbol.AType := _URE_CHAR; Symbol.Symbol.Chr := $0A; end; 'r': begin Symbol.AType := _URE_CHAR; Symbol.Symbol.Chr := $0D; end; 't': begin Symbol.AType := _URE_CHAR; Symbol.Symbol.Chr := $09; end; 'v': begin Symbol.AType := _URE_CHAR; Symbol.Symbol.Chr := $0B; end; else case UCS2(C) of 'x', 'X', 'u', 'U': begin // Collect between 1 and 4 digits representing an UCS2 code. if (Run < ListEnd) and ((Run^ >= '0') and (Run^ <= '9') or (Run^ >= 'A') and (Run^ <= 'F') or (Run^ >= 'a') and (Run^ <= 'f')) then Inc(Run, MakeHexNumber(Run, ListEnd - Run, C)); end; end; // Simply add an escaped character here. Symbol.AType := _URE_CHAR; Symbol.Symbol.Chr := C; end; end else if (UCS2(C) = '^') or (UCS2(C) = '$') then begin // Handle the BOL and EOL anchors. This actually consists simply of setting a flag that indicates that the user // supplied anchor match function should be called. This needs to be done instead of simply matching line/paragraph // separators because beginning-of-text and end-of-text tests are needed as well. if UCS2(C) = '^' then Symbol.AType := _URE_BOL_ANCHOR else Symbol.AType := _URE_EOL_ANCHOR; end else if UCS2(C) = '[' then begin // construct a character class Inc(Run, BuildCharacterClass(Run, ListEnd - Run, Symbol)); end else if UCS2(C) = '.' then Symbol.AType := _URE_ANY_CHAR else begin Symbol.AType := _URE_CHAR; Symbol.Symbol.Chr := C; end; // If the symbol type happens to be a character and is a high surrogate, then probe forward to see if it is followed // by a low surrogate that needs to be added. if (Run < ListEnd) and (Symbol.AType = _URE_CHAR) and (SurrogateHighStart <= Symbol.Symbol.Chr) and (Symbol.Symbol.Chr <= SurrogateHighEnd) then begin if (SurrogateLowStart <= UCS4(Run^)) and (UCS4(Run^) <= SurrogateLowEnd) then begin Symbol.Symbol.Chr := $10000 + (((Symbol.Symbol.Chr and $03FF) shl 10) or (Word(Run^) and $03FF)); Inc(Run); end else if (Run^ = '\') and (((Run + 1)^ = 'x') or ((Run + 1)^ = 'X') or ((Run + 1)^ = 'u') or ((Run + 1)^ = 'U')) then begin Inc(Run, ProbeLowSurrogate(Run + 2, ListEnd - (Run + 2), C)); if (SurrogateLowStart <= C) and (C <= SurrogateLowEnd) then begin // Take into account the \[xu] in front of the hex code. Inc(Run, 2); Symbol.Symbol.Chr := $10000 + (((Symbol.Symbol.Chr and $03FF) shl 10) or (C and $03FF)); end; end; end; // Last, make sure any _URE_CHAR type symbols are changed to lower if the 'Casefold' flag is set. if ((FUREBuffer.Flags and _URE_DFA_CASEFOLD) <> 0) and (Symbol.AType = _URE_CHAR) then Symbol.Symbol.Chr := UnicodeToLower(Symbol.Symbol.Chr); // If the symbol constructed is anything other than one of the anchors, // make sure the _URE_DFA_BLANKLINE flag is removed. if (Symbol.AType <> _URE_BOL_ANCHOR) and (Symbol.AType <> _URE_EOL_ANCHOR) then FUREBuffer.Flags := FUREBuffer.Flags and not _URE_DFA_BLANKLINE; // Return the number of characters consumed. Result := Run - S; end; //---------------------------------------------------------------------------------------------------------------------- function TURESearch.SymbolsAreDifferent(A, B: PSymbolTableEntry): Boolean; begin Result := False; if (A.AType <> B.AType) or (A.Mods <> B.Mods) or (A.Props <> B.Props) then Result := True else begin if (A.AType = _URE_CCLASS) or (A.AType = _URE_NCCLASS) then begin if A.Symbol.CCL.RangesUsed <> B.Symbol.CCL.RangesUsed then Result := True else begin if (A.Symbol.CCL.RangesUsed > 0) and not CompareMem(@A.Symbol.CCL.Ranges[0], @B.Symbol.CCL.Ranges[0], SizeOf(TRange) * A.Symbol.CCL.RangesUsed) then begin Result := True;; end end; end else if (A.AType = _URE_CHAR) and (A.Symbol.Chr <> B.Symbol.Chr) then Result := True; end; end; //---------------------------------------------------------------------------------------------------------------------- function TURESearch.MakeSymbol(S: PUCS2; Limit: Cardinal; var Consumed: Cardinal): Cardinal; // Construct a symbol, but only keep unique symbols. var I: Integer; Start: PSymbolTableEntry; Symbol: TSymbolTableEntry; begin // Build the next symbol so we can test to see if it is already in the symbol table. FillChar(Symbol, SizeOf(TSymbolTableEntry), 0); Consumed := CompileSymbol(S, Limit, @Symbol); // Check to see if the symbol exists. I := 0; Start := @FUREBuffer.SymbolTable.Symbols[0]; while (I < FUREBuffer.SymbolTable.SymbolsUsed) and SymbolsAreDifferent(@Symbol, Start) do begin Inc(I); Inc(Start); end; if I < FUREBuffer.SymbolTable.SymbolsUsed then begin // Free up any ranges used for the symbol. if (Symbol.AType = _URE_CCLASS) or (Symbol.AType = _URE_NCCLASS) then Symbol.Symbol.CCL.Ranges := nil; Result := FUREBuffer.SymbolTable.Symbols[I].ID; Exit; end; // Need to add the new symbol. if FUREBuffer.SymbolTable.SymbolsUsed = Length(FUREBuffer.SymbolTable.Symbols) then begin SetLength(FUREBuffer.SymbolTable.Symbols, Length(FUREBuffer.SymbolTable.Symbols) + 8); end; Symbol.ID := FUREBuffer.SymbolTable.SymbolsUsed; Inc(FUREBuffer.SymbolTable.SymbolsUsed); FUREBuffer.SymbolTable.Symbols[Symbol.ID] := Symbol; Result := Symbol.ID; end; //---------------------------------------------------------------------------------------------------------------------- function TURESearch.MakeExpression(AType, LHS, RHS: Cardinal): Cardinal; var I: Integer; begin // Determine if the expression already exists or not. with FUREBuffer.ExpressionList do begin for I := 0 to ExpressionsUsed - 1 do if (Expressions[I].AType = AType) and (Expressions[I].LHS = LHS) and (Expressions[I].RHS = RHS) then begin Result := I; Exit; end; // Need to add a new expression. if ExpressionsUsed = Length(Expressions) then SetLength(Expressions, Length(Expressions) + 8); Expressions[ExpressionsUsed].OnStack := False; Expressions[ExpressionsUsed].AType := AType; Expressions[ExpressionsUsed].LHS := LHS; Expressions[ExpressionsUsed].RHS := RHS; Result := ExpressionsUsed; Inc(ExpressionsUsed); end; end; //---------------------------------------------------------------------------------------------------------------------- function IsSpecial(C: Word): Boolean; begin Result := C in [Word('+'), Word('*'), Word('?'), Word('{'), Word('|'), Word(')')]; end; //---------------------------------------------------------------------------------------------------------------------- procedure TURESearch.CollectPendingOperations(var State: Cardinal); // collect all pending AND and OR operations and make corresponding expressions var Operation: Cardinal; begin repeat Operation := Peek; if (Operation <> _URE_AND) and (Operation <> _URE_OR) then Break; // make an expression with the AND or OR operator and its right hand side Operation := Pop; State := MakeExpression(Operation, Pop, State); until False; end; //---------------------------------------------------------------------------------------------------------------------- function TURESearch.ConvertRE2NFA(RE: PWideChar; RELength: Cardinal): Cardinal; // Convert the regular expression into an NFA in a form that will be easy to reduce to a FDFA. // The starting state for the reduction will be returned. var C: UCS2; Head, Tail: PUCS2; S: WideString; Symbol, State, LastState, Used, M, N: Cardinal; I: Integer; begin State := _URE_NOOP; Head := RE; Tail := Head + RELength; while (FUREBuffer.Error = _URE_OK) and (Head < Tail) do begin C := Head^; Inc(Head); case C of '(': Push(_URE_PAREN); ')': // check for the case of too many close parentheses begin if Peek = _URE_NOOP then begin FUREBuffer.Error := _URE_UNBALANCED_GROUP; Break; end; CollectPendingOperations(State); // remove the _URE_PAREN off the stack Pop; end; '*': State := MakeExpression(_URE_STAR, State, _URE_NOOP); '+': State := MakeExpression(_URE_PLUS, State, _URE_NOOP); '?': State := MakeExpression(_URE_QUEST, State, _URE_NOOP); '|': begin CollectPendingOperations(State); Push(State); Push(_URE_OR); end; '{': // expressions of the form {m, n} begin C := #0; M := 0; N := 0; // get first number while UnicodeIsWhiteSpace(Word(Head^)) do Inc(Head); S := ''; while Head^ in [WideChar('0')..WideChar('9')] do begin S := S + Head^; Inc(Head); end; if S <> '' then M := StrToInt(S); while UnicodeIsWhiteSpace(Word(Head^)) do Inc(Head); if (Head^ <> ',') and (Head^ <> '}') then begin FUREBuffer.Error := _URE_INVALID_RANGE; Break; end; // check for an upper limit if Head^ <> '}' then begin Inc(Head); // get second number while UnicodeIsWhiteSpace(Word(Head^)) do Inc(Head); S := ''; while Head^ in [WideChar('0')..WideChar('9')] do begin S := S + Head^; Inc(Head); end; if S <> '' then N := StrToInt(S); end else N := M; if Head^ <> '}' then begin FUREBuffer.Error := _URE_RANGE_OPEN; Break; end else Inc(Head); // N = 0 means unlimited number of occurences if N = 0 then begin case M of 0: // {,} {0,} {0, 0} mean the same as the star operator State := MakeExpression(_URE_STAR, State, _URE_NOOP); 1: // {1,} {1, 0} mean the same as the plus operator State := MakeExpression(_URE_PLUS, State, _URE_NOOP); else begin // encapsulate the expanded branches as would they be in parenthesis // in order to avoid unwanted concatenation with pending operations/symbols Push(_URE_PAREN); // {m,} {m, 0} mean M fixed occurences plus star operator // make E^m... for I := 1 to M - 1 do begin Push(State); Push(_URE_AND); end; // ...and repeat the last symbol one or more times State := MakeExpression(_URE_PLUS, State, _URE_NOOP); CollectPendingOperations(State); Pop; end; end; end else begin // check proper range limits if M > N then begin FUREBuffer.Error := _URE_INVALID_RANGE; Break; end; // check special case {0, 1} (which corresponds to the ? operator) if (M = 0) and (N = 1) then State := MakeExpression(_URE_QUEST, State, _URE_NOOP) else begin // handle the general case by expanding {m, n} into the equivalent // expression E^m | E^(m + 1) | ... | E^n // encapsulate the expanded branches as would they be in parenthesis // in order to avoid unwanted concatenation with pending operations/symbols Push(_URE_PAREN); // keep initial state as this is the one all alternatives start from LastState := State; // Consider the special case M = 0 first. Because there's no construct to // enter a pure epsilon-transition into the expression array I work around // with the question mark operator to describe the first and second branch alternative. if M = 0 then begin State := MakeExpression(_URE_QUEST, State, _URE_NOOP); Inc(M, 2); // Mark the pending OR operation (there must always follow at least on more // alternative because the special case {0, 1} has already been handled). Push(State); Push(_URE_OR); end; while M <= N do begin State := LastState; // create E^M for I := 1 to Integer(M) - 1 do begin Push(State); Push(_URE_AND); end; // finish the branch and mark it as pending OR operation if it isn't the last one CollectPendingOperations(State); if M < N then begin Push(State); Push(_URE_OR); end; Inc(M); end; // remove the _URE_PAREN off the stack Pop; end; end; end; else Dec(Head); Symbol := MakeSymbol(Head, Tail - Head, Used); Inc(Head, Used); State := MakeExpression(_URE_SYMBOL, Symbol, _URE_NOOP); end; if (C <> '(') and (C <> '|') and (C <> '{') and (Head < Tail) and (not IsSpecial(Word(Head^)) or (Head^ = '(')) then begin Push(State); Push(_URE_AND); end; end; CollectPendingOperations(State); if FUREBuffer.Stack.ListUsed > 0 then FUREBuffer.Error := _URE_UNBALANCED_GROUP; if FUREBuffer.Error = _URE_OK then Result := State else Result := _URE_NOOP; end; //---------------------------------------------------------------------------------------------------------------------- procedure TURESearch.AddSymbolState(Symbol, State: Cardinal); var I, J: Integer; Found: Boolean; begin // Locate the symbol in the symbol table so the state can be added. // If the symbol doesn't exist, then we are in serious trouble. for I := 0 to FUREBuffer.SymbolTable.SymbolsUsed - 1 do if Symbol = FUREBuffer.SymbolTable.Symbols[I].ID then Break; // Now find out if the state exists in the symbol's state list. with FUREBuffer.SymbolTable.Symbols[I].States do begin Found := False; for J := 0 to ListUsed - 1 do if State <= List[J] then begin Found := True; Break; end; if not Found then J := ListUsed; if not Found or (State < List[J]) then begin // Need to add the state in order. if ListUsed = Length(List) then SetLength(List, Length(List) + 8); if J < ListUsed then Move(List[J], List[J + 1], SizeOf(Cardinal) * (ListUsed - J)); List[J] := State; Inc(ListUsed); end; end; end; //---------------------------------------------------------------------------------------------------------------------- function TURESearch.AddState(NewStates: array of Cardinal): Cardinal; var I: Integer; Found: Boolean; begin Found := False; for I := 0 to FUREBuffer.States.StatesUsed - 1 do begin if (FUREBuffer.States.States[I].StateList.ListUsed = Length(NewStates)) and CompareMem(@NewStates[0], @FUREBuffer.States.States[I].StateList.List[0], SizeOf(Cardinal) * Length(NewStates)) then begin Found := True; Break; end; end; if not Found then begin // Need to add a new DFA State (set of NFA states). if FUREBuffer.States.StatesUsed = Length(FUREBuffer.States.States) then SetLength(FUREBuffer.States.States, Length(FUREBuffer.States.States) + 8); with FUREBuffer.States.States[FUREBuffer.States.StatesUsed] do begin ID := FUREBuffer.States.StatesUsed; if (StateList.ListUsed + Length(NewStates)) >= Length(StateList.List) then SetLength(StateList.List, Length(StateList.List) + Length(NewStates) + 8); Move(NewStates[0], StateList.List[StateList.ListUsed], SizeOf(Cardinal) * Length(NewStates)); Inc(StateList.ListUsed, Length(NewStates)); end; Inc(FUREBuffer.States.StatesUsed); end; // Return the ID of the DFA state representing a group of NFA States. if Found then Result := I else Result := FUREBuffer.States.StatesUsed - 1; end; //---------------------------------------------------------------------------------------------------------------------- procedure TURESearch.Reduce(Start: Cardinal); var I, J, Symbols: Integer; State, RHS, s1, s2, ns1, ns2: Cardinal; Evaluating: Boolean; begin FUREBuffer.Reducing := True; // Add the starting state for the reduction. AddState([Start]); // Process each set of NFA states that get created. I := 0; // further states are added in the loop while I < FUREBuffer.States.StatesUsed do begin with FUREBuffer, States.States[I], ExpressionList do begin // Push the current states on the stack. for J := 0 to StateList.ListUsed - 1 do Push(StateList.List[J]); // Reduce the NFA states. Accepting := False; Symbols := 0; J := 0; // need a while loop here as the stack will be modified within the loop and // so also its usage count used to terminate the loop while J < FUREBuffer.Stack.ListUsed do begin State := FUREBuffer.Stack.List[J]; Evaluating := True; // This inner loop is the iterative equivalent of recursively // reducing subexpressions generated as a result of a reduction. while Evaluating do begin case Expressions[State].AType of _URE_SYMBOL: begin ns1 := MakeExpression(_URE_ONE, _URE_NOOP, _URE_NOOP); AddSymbolState(Expressions[State].LHS, ns1); Inc(Symbols); Evaluating := False; end; _URE_ONE: begin Accepting := True; Evaluating := False; end; _URE_QUEST: begin s1 := Expressions[State].LHS; ns1 := MakeExpression(_URE_ONE, _URE_NOOP, _URE_NOOP); State := MakeExpression(_URE_OR, ns1, s1); end; _URE_PLUS: begin s1 := Expressions[State].LHS; ns1 := MakeExpression(_URE_STAR, s1, _URE_NOOP); State := MakeExpression(_URE_AND, s1, ns1); end; _URE_STAR: begin s1 := Expressions[State].LHS; ns1 := MakeExpression(_URE_ONE, _URE_NOOP, _URE_NOOP); ns2 := MakeExpression(_URE_PLUS, s1, _URE_NOOP); State := MakeExpression(_URE_OR, ns1, ns2); end; _URE_OR: begin s1 := Expressions[State].LHS; s2 := Expressions[State].RHS; Push(s1); Push(s2); Evaluating := False; end; _URE_AND: begin s1 := Expressions[State].LHS; s2 := Expressions[State].RHS; case Expressions[s1].AType of _URE_SYMBOL: begin AddSymbolState(Expressions[s1].LHS, s2); Inc(Symbols); Evaluating := False; end; _URE_ONE: State := s2; _URE_QUEST: begin ns1 := Expressions[s1].LHS; ns2 := MakeExpression(_URE_AND, ns1, s2); State := MakeExpression(_URE_OR, s2, ns2); end; _URE_PLUS: begin ns1 := Expressions[s1].LHS; ns2 := MakeExpression(_URE_OR, s2, State); State := MakeExpression(_URE_AND, ns1, ns2); end; _URE_STAR: begin ns1 := Expressions[s1].LHS; ns2 := MakeExpression(_URE_AND, ns1, State); State := MakeExpression(_URE_OR, s2, ns2); end; _URE_OR: begin ns1 := Expressions[s1].LHS; ns2 := Expressions[s1].RHS; ns1 := MakeExpression(_URE_AND, ns1, s2); ns2 := MakeExpression(_URE_AND, ns2, s2); State := MakeExpression(_URE_OR, ns1, ns2); end; _URE_AND: begin ns1 := Expressions[s1].LHS; ns2 := Expressions[s1].RHS; ns2 := MakeExpression(_URE_AND, ns2, s2); State := MakeExpression(_URE_AND, ns1, ns2); end; end; end; end; end; Inc(J); end; // clear the state stack while Pop <> _URE_NOOP do ; // generate the DFA states for the symbols collected during the current reduction if (TransitionsUsed + Symbols) > Length(Transitions) then SetLength(Transitions, Length(Transitions) + Symbols); // go through the symbol table and generate the DFA state transitions for // each symbol that has collected NFA states Symbols := 0; J := 0; while J < FUREBuffer.SymbolTable.SymbolsUsed do begin begin if FUREBuffer.SymbolTable.Symbols[J].States.ListUsed > 0 then begin Transitions[Symbols].LHS := FUREBuffer.SymbolTable.Symbols[J].ID; with FUREBuffer.SymbolTable.Symbols[J] do begin RHS := AddState(Copy(States.List, 0, States.ListUsed)); States.ListUsed := 0; end; Transitions[Symbols].RHS := RHS; Inc(Symbols); end; end; Inc(J); end; // set the number of transitions actually used // Note: we need again to qualify a part of the TransistionsUsed path since the // state array could be reallocated in the AddState call above and the with ... do // will then be invalid. States.States[I].TransitionsUsed := Symbols; end; Inc(I); end; FUREBuffer.Reducing := False; end; //---------------------------------------------------------------------------------------------------------------------- procedure TURESearch.AddEquivalentPair(L, R: Cardinal); var I: Integer; begin L := FUREBuffer.States.States[L].ID; R := FUREBuffer.States.States[R].ID; if L <> R then begin if L > R then begin I := L; L := R; R := I; end; // Check to see if the equivalence pair already exists. I := 0; with FUREBuffer.EquivalentList do begin while (I < EquivalentsUsed) and ((Equivalents[I].Left <> L) or (Equivalents[I].Right <> R)) do Inc(I); if I >= EquivalentsUsed then begin if EquivalentsUsed = Length(Equivalents) then SetLength(Equivalents, Length(Equivalents) + 8); Equivalents[EquivalentsUsed].Left := L; Equivalents[EquivalentsUsed].Right := R; Inc(EquivalentsUsed); end; end; end; end; //---------------------------------------------------------------------------------------------------------------------- procedure TURESearch.MergeEquivalents; // merges the DFA states that are equivalent var I, J, K, Equal: Integer; Done: Boolean; State1, State2, LeftState, RightState: PState; begin for I := 0 to FUREBuffer.States.StatesUsed - 1 do begin State1 := @FUREBuffer.States.States[I]; if State1.ID = Cardinal(I) then begin J := 0; while J < I do begin State2 := @FUREBuffer.States.States[J]; if State2.ID = Cardinal(J) then begin FUREBuffer.EquivalentList.EquivalentsUsed := 0; AddEquivalentPair(I, J); Done := False; Equal := 0; while Equal < FUREBuffer.EquivalentList.EquivalentsUsed do begin LeftState := @FUREBuffer.States.States[FUREBuffer.EquivalentList.Equivalents[Equal].Left]; RightState := @FUREBuffer.States.States[FUREBuffer.EquivalentList.Equivalents[Equal].Right]; if (LeftState.Accepting <> RightState.Accepting) or (LeftState.TransitionsUsed <> RightState.TransitionsUsed) then begin Done := True; Break; end; K := 0; while (K < LeftState.TransitionsUsed) and (LeftState.Transitions[K].LHS = RightState.Transitions[K].LHS) do Inc(K); if K < LeftState.TransitionsUsed then begin Done := True; Break; end; for K := 0 to LeftState.TransitionsUsed - 1 do AddEquivalentPair(LeftState.Transitions[K].RHS, RightState.Transitions[K].RHS); Inc(Equal); end; if not Done then Break; end; Inc(J); end; if J < I then with FUREBuffer do for Equal := 0 to EquivalentList.EquivalentsUsed - 1 do States.States[EquivalentList.Equivalents[Equal].Right].ID := States.States[EquivalentList.Equivalents[Equal].Left].ID; end; end; // Renumber the states appropriately State1 := @FUREBuffer.States.States[0]; Equal := 0; for I := 0 to FUREBuffer.States.StatesUsed - 1 do begin if State1.ID = Cardinal(I) then begin State1.ID := Equal; Inc(Equal); end else State1.ID := FUREBuffer.States.States[State1.ID].ID; Inc(State1); end; end; //---------------------------------------------------------------------------------------------------------------------- procedure TURESearch.ClearUREBuffer; var I: Integer; begin with FUREBuffer do begin // quite a few dynamic arrays to free Stack.List := nil; ExpressionList.Expressions := nil; // the symbol table has been handed over to the DFA and will be freed on // release of the DFA SymbolTable.SymbolsUsed := 0; for I := 0 to States.StatesUsed - 1 do begin States.States[I].Transitions := nil; States.States[I].StateList.List := nil; States.States[I].StateList.ListUsed := 0; States.States[I].TransitionsUsed := 0; end; States.StatesUsed := 0; States.States := nil; EquivalentList.Equivalents := nil; end; FillChar(FUREBuffer, SizeOf(FUREBuffer), 0); end; //---------------------------------------------------------------------------------------------------------------------- procedure TURESearch.CompileURE(RE: PWideChar; RELength: Cardinal; Casefold: Boolean); var I, J: Integer; State: Cardinal; Run: PState; TP: Integer; begin // be paranoid if Assigned(RE) and (RE^ <> WideNull) and (RELength > 0) then begin // Reset the various fields of the compilation buffer. Default the Flags // to indicate the presense of the "^$" pattern. If any other pattern // occurs, then this flag will be removed. This is done to catch this // special pattern and handle it specially when matching. ClearUREBuffer; ClearDFA; FUREBuffer.Flags := _URE_DFA_BLANKLINE; if Casefold then FUREBuffer.Flags := FUREBuffer.Flags or _URE_DFA_CASEFOLD; // Construct the NFA. If this stage returns a 0, then an error occured or an empty expression was passed. State := ConvertRE2NFA(RE, RELength); if State <> _URE_NOOP then begin // Do the expression reduction to get the initial DFA. Reduce(State); // Merge all the equivalent DFA States. MergeEquivalents; // Construct the minimal DFA. FDFA.Flags := FUREBuffer.Flags and (_URE_DFA_CASEFOLD or _URE_DFA_BLANKLINE); // Free up the NFA state groups and transfer the symbols from the buffer to the FDFA. FDFA.SymbolTable := FUREBuffer.SymbolTable; FUREBuffer.SymbolTable.Symbols := nil; // Collect the total number of states and transitions needed for the DFA. State := 0; for I := 0 to FUREBuffer.States.StatesUsed - 1 do begin if FUREBuffer.States.States[I].ID = State then begin Inc(FDFA.StateList.StatesUsed); Inc(FDFA.TransitionList.TransitionsUsed, FUREBuffer.States.States[I].TransitionsUsed); Inc(State); end; end; // Allocate enough space for the states and transitions. SetLength(FDFA.StateList.States, FDFA.StateList.StatesUsed); SetLength(FDFA.TransitionList.Transitions, FDFA.TransitionList.TransitionsUsed); // Actually transfer the FDFA States from the buffer. State := 0; TP := 0; Run := @FUREBuffer.States.States[0]; for I := 0 to FUREBuffer.States.StatesUsed - 1 do begin if Run.ID = State then begin FDFA.StateList.States[I].StartTransition := TP; FDFA.StateList.States[I].NumberTransitions := Run.TransitionsUsed; FDFA.StateList.States[I].Accepting := Run.Accepting; // Add the transitions for the state for J := 0 to FDFA.StateList.States[I].NumberTransitions - 1 do begin FDFA.TransitionList.Transitions[TP].Symbol := Run.Transitions[J].LHS; FDFA.TransitionList.Transitions[TP].NextState := FUREBuffer.States.States[Run.Transitions[J].RHS].ID; Inc(TP); end; Inc(State); end; Inc(Run); end; end else begin // there might be an error while parsing the pattern, show it if so case FUREBuffer.Error of _URE_UNEXPECTED_EOS: raise Exception.CreateFmt(SUREBaseString + SUREUnexpectedEOS, [RE]); _URE_CCLASS_OPEN: raise Exception.CreateFmt(SUREBaseString + SURECharacterClassOpen, [RE]); _URE_UNBALANCED_GROUP: raise Exception.CreateFmt(SUREBaseString + SUREUnbalancedGroup, [RE]); _URE_INVALID_PROPERTY: raise Exception.CreateFmt(SUREBaseString + SUREInvalidCharProperty, [RE]); _URE_INVALID_RANGE: raise Exception.CreateFmt(SUREBaseString + SUREInvalidRepeatRange, [RE]); _URE_RANGE_OPEN: raise Exception.CreateFmt(SUREBaseString + SURERepeatRangeOpen, [RE]); else // expression was empty raise Exception.Create(SUREExpressionEmpty); end; end; end; end; //---------------------------------------------------------------------------------------------------------------------- procedure TURESearch.ClearDFA; var I: Integer; begin with FDFA do begin for I := 0 to SymbolTable.SymbolsUsed - 1 do begin if (SymbolTable.Symbols[I].AType = _URE_CCLASS) or (SymbolTable.Symbols[I].AType = _URE_NCCLASS) then SymbolTable.Symbols[I].Symbol.CCL.Ranges := nil; end; for I := 0 to SymbolTable.SymbolsUsed - 1 do begin FDFA.SymbolTable.Symbols[I].States.List := nil; FDFA.SymbolTable.Symbols[I].States.ListUsed := 0; end; SymbolTable.SymbolsUsed := 0; SymbolTable.Symbols := nil; StateList.States := nil; TransitionList.Transitions := nil; end; FillChar(FDFA, SizeOf(FDFA), 0); end; //---------------------------------------------------------------------------------------------------------------------- function IsSeparator(C: UCS4): Boolean; begin Result := (C = $D) or (C = $A) or (C = $2028) or (C = $2029); end; //---------------------------------------------------------------------------------------------------------------------- const PropertyMap: array[0..31] of Cardinal = ( 0, // class ID 1, corresponds to UC_MN 1, // class ID 2, UC_MC 3, // 3, UC_ND 5, // 4, UC_NO 6, // 5, UC_ZS 7, // 6, UC_ZL 8, // 7, UC_ZP 9, // 8, UC_CC 12, // 9, UC_CO 14, // 10, UC_LU 15, // 11, UC_LL 16, // 12, UC_LT 17, // 13, UC_LM 18, // 14, UC_LO 20, // 15, UC_PD 21, // 16, UC_PS 22, // 17, UC_PE 23, // 18, UC_PO 24, // 19, UC_SM 25, // 20, UC_SC 26, // 21, UC_SO 27, // 22, UC_L 28, // 23, UC_R 29, // 24, UC_EN 30, // 25, UC_ES 32, // 26, UC_ET 33, // 27, UC_AN 34, // 28, UC_CS 35, // 29, UC_B 36, // 30, UC_S 37, // 31, UC_WS 38 // 32, UC_ON ); function TURESearch.MatchesProperties(Props, C: Cardinal): Boolean; // tries to match any of the given properties var I: Integer; Mask: Cardinal; begin Result := False; // We need only one match in order to tell the caller success, // but unfortunately we cannot directly map the URE flags to the // Unicode property flags. Hence we need to loop and explicitly remap them. Mask := 1; for I := 0 to 31 do begin if ((Props and Mask) <> 0) and PropertyLookup(C, PropertyMap[I]) then begin Result := True; Exit; end; Mask := Mask shl 1; end; end; //---------------------------------------------------------------------------------------------------------------------- function TURESearch.ExecuteURE(Flags: Cardinal; Text: PUCS2; TextLen: Cardinal; var MatchStart, MatchEnd: Cardinal): Boolean; var I, J: Integer; Matched, Found: Boolean; Start, Stop: Integer; C: UCS4; Run, Tail, lp: PUCS2; LastState: PDFAState; Symbol: PSymbolTableEntry; Rp: PRange; begin Result := False; if Assigned(Text) then begin // Handle the special of an empty string matching the "^$" pattern. if (Textlen = 0) and ((FDFA.Flags and _URE_DFA_BLANKLINE) <> 0) then begin MatchStart := 0; MatchEnd := 0; Result := True; Exit; end; Run := Text; Tail := Run + TextLen; Start := -1; Stop := -1; LastState := @FDFA.StateList.States[0]; Found := False; while not Found and (Run < Tail) do begin lp := Run; C := UCS4(Run^); Inc(Run); // Check to see if this is a high surrogate that should be combined with a following low surrogate. if (Run < Tail) and (SurrogateHighStart <= C) and (C <= SurrogateHighEnd) and (SurrogateLowStart <= UCS4(Run^)) and (UCS4(Run^) <= SurrogateLowEnd) then begin C := $10000 + (((C and $03FF) shl 10) or (UCS4(Run^) and $03FF)); Inc(Run); end; // Determine if the character is non-spacing and should be skipped. if ((Flags and URE_IGNORE_NONSPACING) <> 0) and UnicodeIsNonSpacingMark(C) then begin Inc(Run); Continue; end; if (FDFA.Flags and _URE_DFA_CASEFOLD) <> 0 then C := UnicodeToLower(C); // See if one of the transitions matches. I := LastState.NumberTransitions - 1; Matched := False; while not Matched and (I >= 0) do begin Symbol := @FDFA.SymbolTable.Symbols[FDFA.TransitionList.Transitions[LastState.StartTransition + I].Symbol]; case Symbol.AType of _URE_ANY_CHAR: if ((Flags and URE_DONT_MATCHES_SEPARATORS) <> 0) or not IsSeparator(C) then Matched := True; _URE_CHAR: if C = Symbol.Symbol.Chr then Matched := True; _URE_BOL_ANCHOR: if Lp = Text then begin Run := lp; Matched := True; end else if IsSeparator(C) then begin if (C = $D) and (Run < Tail) and (Run^ = #$A) then Inc(Run); Lp := Run; Matched := True; end; _URE_EOL_ANCHOR: if IsSeparator(C) then begin // Put the pointer back before the separator so the match end position will be correct. // This will also cause the `Run' pointer to be advanced over the current separator once the match // end point has been recorded. Run := Lp; Matched := True; end; _URE_CCLASS, _URE_NCCLASS: with Symbol^ do begin if Props <> 0 then Matched := MatchesProperties(Props, C); if Symbol.CCL.RangesUsed > 0 then begin Rp := @Symbol.CCL.Ranges[0]; for J := 0 to Symbol.CCL.RangesUsed - 1 do begin if (Rp.MinCode <= C) and (C <= Rp.MaxCode) then begin Matched := True; Break; end; Inc(Rp); end; end; if AType = _URE_NCCLASS then Matched := not Matched; end; end; if Matched then begin if Start = -1 then Start := Lp - Text else Stop := Run - Text; LastState := @FDFA.StateList.States[FDFA.TransitionList.Transitions[LastState.StartTransition + I].NextState]; // If the match was an EOL anchor, adjust the pointer past the separator that caused the match. // The correct match position has been recorded already. if Symbol.AType = _URE_EOL_ANCHOR then begin // skip the character that caused the match. Inc(Run); // Handle the infamous CRLF situation. if (Run < Tail) and (C = $D) and (Run^ = #$A) then Inc(Run); end; end; Dec(I); end; if not Matched then begin Found := LastState.Accepting; if not Found then begin // If the last state was not accepting, then reset and start over. LastState := @FDFA.StateList.States[0]; Start := -1; Stop := -1; end else begin // set start and stop pointer if not yet done if Start = -1 then begin Start := Lp - Text; Stop := Run - Text; end else if Stop = -1 then Stop := Lp - Text; end; end else if Run = Tail then begin if not LastState.Accepting then begin // This ugly hack is to make sure the end-of-line anchors match when the source text hits the end. // This is only done if the last subexpression matches. for I := 0 to LastState.NumberTransitions - 1 do begin if Found then Break; Symbol := @FDFA.SymbolTable.Symbols[FDFA.TransitionList.Transitions[LastState.StartTransition + I].Symbol]; if Symbol.AType =_URE_EOL_ANCHOR then begin LastState := @FDFA.StateList.States[FDFA.TransitionList.Transitions[LastState.StartTransition + I].NextState]; if LastState.Accepting then begin Stop := Run - Text; Found := True; end else Break; end; end; end else begin // Make sure any conditions that match all the way to the end of the string match. Found := True; Stop := Run - Text; end; end; end; if Found then begin MatchStart := Start; MatchEnd := Stop; end; Result := Found; end; end; //---------------------------------------------------------------------------------------------------------------------- function TURESearch.FindAll(const Text: WideString): Boolean; begin Result := FindAll(PWideChar(Text), Length(Text)); end; //---------------------------------------------------------------------------------------------------------------------- function TURESearch.FindAll(const Text: PWideChar; TextLen: Cardinal): Boolean; // Looks for all occurences of the pattern passed to FindPrepare and creates an internal list of their positions. var Start, Stop: Cardinal; Run: PWideChar; RunLen: Cardinal; begin ClearResults; Run := Text; RunLen := TextLen; // repeat to find all occurences of the pattern while ExecuteURE(0, Run, RunLen, Start, Stop) do begin // store this result (consider text pointer movement)... AddResult(Start + Run - Text, Stop + Run - Text); // ... and advance text position and length Inc(Run, Stop); Dec(RunLen, Stop); end; Result := FResults.Count > 0; end; //---------------------------------------------------------------------------------------------------------------------- function TURESearch.FindFirst(const Text: WideString; var Start, Stop: Cardinal): Boolean; begin Result := FindFirst(PWideChar(Text), Length(Text), Start, Stop); end; //---------------------------------------------------------------------------------------------------------------------- function TURESearch.FindFirst(const Text: PWideChar; TextLen: Cardinal; var Start, Stop: Cardinal): Boolean; // Looks for the first occurence of the pattern passed to FindPrepare in Text and returns True if one could be // found (in which case Start and Stop are set to the according indices) otherwise False. // This function is in particular of interest if only one occurence needs to be found. begin ClearResults; Result := ExecuteURE(0, PWideChar(Text), Length(Text), Start, Stop); if Result then AddResult(Start, Stop); end; //---------------------------------------------------------------------------------------------------------------------- procedure TURESearch.FindPrepare(const Pattern: PWideChar; PatternLength: Cardinal; Options: TSearchFlags); begin CompileURE(Pattern, PatternLength, not (sfCaseSensitive in Options)); end; //---------------------------------------------------------------------------------------------------------------------- procedure TURESearch.FindPrepare(const Pattern: WideString; Options: TSearchFlags); begin CompileURE(PWideChar(Pattern), Length(Pattern), not (sfCaseSensitive in Options)); end; //----------------- TWideStrings --------------------------------------------------------------------------------------- constructor TWideStrings.Create; begin inherited; // there should seldom be the need to use a language other than the one of the system FLanguage := GetUserDefaultLCID; end; //---------------------------------------------------------------------------------------------------------------------- destructor TWideStrings.Destroy; begin inherited; end; //---------------------------------------------------------------------------------------------------------------------- procedure TWideStrings.SetLanguage(Value: LCID); begin FLanguage := Value; end; //---------------------------------------------------------------------------------------------------------------------- function TWideStrings.Add(const S: WideString): Integer; begin Result := GetCount; Insert(Result, S); end; //---------------------------------------------------------------------------------------------------------------------- function TWideStrings.AddObject(const S: WideString; AObject: TObject): Integer; begin Result := Add(S); PutObject(Result, AObject); end; //---------------------------------------------------------------------------------------------------------------------- procedure TWideStrings.Append(const S: WideString); begin Add(S); end; //---------------------------------------------------------------------------------------------------------------------- procedure TWideStrings.AddStrings(Strings: TStrings); var I: Integer; begin BeginUpdate; try for I := 0 to Strings.Count - 1 do AddObject(Strings[I], Strings.Objects[I]); finally EndUpdate; end; end; //---------------------------------------------------------------------------------------------------------------------- procedure TWideStrings.AddStrings(Strings: TWideStrings); var I: Integer; begin BeginUpdate; try for I := 0 to Strings.Count - 1 do AddObject(Strings[I], Strings.Objects[I]); finally EndUpdate; end; end; //---------------------------------------------------------------------------------------------------------------------- procedure TWideStrings.Assign(Source: TPersistent); // usual assignment routine, but able to assign wide and small strings var I: Integer; begin if Source is TWideStrings then begin BeginUpdate; try Clear; AddStrings(TWideStrings(Source)); finally EndUpdate; end; end else if Source is TStrings then begin BeginUpdate; try Clear; for I := 0 to TStrings(Source).Count - 1 do AddObject(TStrings(Source)[I], TStrings(Source).Objects[I]); finally EndUpdate; end; end else inherited Assign(Source); end; //---------------------------------------------------------------------------------------------------------------------- procedure TWideStrings.AssignTo(Dest: TPersistent); // need to do also assignment to old style TStrings, but this class doesn't know TWideStrings, so // we need to do it from here var I: Integer; begin if Dest is TStrings then with Dest as TStrings do begin BeginUpdate; try Clear; for I := 0 to Self.Count - 1 do AddObject(Self[I], Self.Objects[I]); finally EndUpdate; end; end else if Dest is TWideStrings then with Dest as TWideStrings do begin BeginUpdate; try Clear; AddStrings(Self); finally EndUpdate; end; end else inherited; end; //---------------------------------------------------------------------------------------------------------------------- procedure TWideStrings.BeginUpdate; begin if FUpdateCount = 0 then SetUpdateState(True); Inc(FUpdateCount); end; //---------------------------------------------------------------------------------------------------------------------- procedure TWideStrings.DefineProperties(Filer: TFiler); function DoWrite: Boolean; begin if Filer.Ancestor <> nil then begin Result := True; if Filer.Ancestor is TWideStrings then Result := not Equals(TWideStrings(Filer.Ancestor)) end else Result := Count > 0; end; begin Filer.DefineProperty('WideStrings', ReadData, WriteData, DoWrite); end; //---------------------------------------------------------------------------------------------------------------------- procedure TWideStrings.EndUpdate; begin Dec(FUpdateCount); if FUpdateCount = 0 then SetUpdateState(False); end; //---------------------------------------------------------------------------------------------------------------------- function TWideStrings.Equals(Strings: TWideStrings): Boolean; var I, Count: Integer; begin Result := False; Count := GetCount; if Count <> Strings.GetCount then Exit; for I := 0 to Count - 1 do if Get(I) <> Strings.Get(I) then Exit; Result := True; end; //---------------------------------------------------------------------------------------------------------------------- procedure TWideStrings.Error(const Msg: String; Data: Integer); function ReturnAddr: Pointer; asm MOV EAX, [EBP + 4] end; begin raise EStringListError.CreateFmt(Msg, [Data]) at ReturnAddr; end; //---------------------------------------------------------------------------------------------------------------------- procedure TWideStrings.Exchange(Index1, Index2: Integer); var TempObject: TObject; TempString: WideString; begin BeginUpdate; try TempString := Strings[Index1]; TempObject := Objects[Index1]; Strings[Index1] := Strings[Index2]; Objects[Index1] := Objects[Index2]; Strings[Index2] := TempString; Objects[Index2] := TempObject; finally EndUpdate; end; end; //---------------------------------------------------------------------------------------------------------------------- function TWideStrings.GetCapacity: Integer; begin // descendants may optionally override/replace this default implementation Result := Count; end; //---------------------------------------------------------------------------------------------------------------------- function TWideStrings.GetCommaText: WideString; var S: WideString; P: PWideChar; I, Count: Integer; begin Count := GetCount; if (Count = 1) and (Get(0) = '') then Result := '""' else begin Result := ''; for I := 0 to Count - 1 do begin S := Get(I); P := PWideChar(S); while not (P^ in [WideNull..Space, WideChar('"'), WideChar(',')]) do Inc(P); if (P^ <> WideNull) then S := WideQuotedStr(S, '"'); Result := Result + S + ','; end; System.Delete(Result, Length(Result), 1); end; end; //---------------------------------------------------------------------------------------------------------------------- function TWideStrings.GetName(Index: Integer): WideString; var P: Integer; begin Result := Get(Index); P := Pos('=', Result); if P > 0 then SetLength(Result, P - 1) else Result := ''; end; //---------------------------------------------------------------------------------------------------------------------- function TWideStrings.GetObject(Index: Integer): TObject; begin Result := nil; end; //---------------------------------------------------------------------------------------------------------------------- function TWideStrings.GetText: PWideChar; begin Result := StrNewW(PWideChar(GetTextStr)); end; //---------------------------------------------------------------------------------------------------------------------- function TWideStrings.GetTextStr: WideString; var I, L, Size, Count: Integer; P: PWideChar; S: WideString; begin Count := GetCount; Size := 0; for I := 0 to Count - 1 do Inc(Size, Length(Get(I)) + 2); SetLength(Result, Size); P := Pointer(Result); for I := 0 to Count - 1 do begin S := Get(I); L := Length(S); if L <> 0 then begin System.Move(Pointer(S)^, P^, 2 * L); Inc(P, L); end; P^ := CarriageReturn; Inc(P); P^ := LineFeed; Inc(P); end; end; //---------------------------------------------------------------------------------------------------------------------- function TWideStrings.GetValue(const Name: WideString): WideString; var I: Integer; begin I := IndexOfName(Name); if I >= 0 then Result := Copy(Get(I), Length(Name) + 2, MaxInt) else Result := ''; end; //---------------------------------------------------------------------------------------------------------------------- function TWideStrings.IndexOf(const S: WideString): Integer; begin for Result := 0 to GetCount - 1 do if WideCompareText(Get(Result), S, FLanguage) = 0 then Exit; Result := -1; end; //---------------------------------------------------------------------------------------------------------------------- function TWideStrings.IndexOfName(const Name: WideString): Integer; var P: Integer; S: WideString; begin for Result := 0 to GetCount - 1 do begin S := Get(Result); P := Pos('=', S); if (P > 0) and (WideCompareText(Copy(S, 1, P - 1), Name, FLanguage) = 0) then Exit; end; Result := -1; end; //---------------------------------------------------------------------------------------------------------------------- function TWideStrings.IndexOfObject(AObject: TObject): Integer; begin for Result := 0 to GetCount - 1 do if GetObject(Result) = AObject then Exit; Result := -1; end; //---------------------------------------------------------------------------------------------------------------------- procedure TWideStrings.InsertObject(Index: Integer; const S: WideString; AObject: TObject); begin Insert(Index, S); PutObject(Index, AObject); end; //---------------------------------------------------------------------------------------------------------------------- procedure TWideStrings.LoadFromFile(const FileName: String); var Stream: TStream; begin try Stream := TFileStream.Create(FileName, fmOpenRead or fmShareDenyNone); try LoadFromStream(Stream); finally Stream.Free; end; except {$IFDEF VER130} RaiseLastWin32Error; {$ELSE} RaiseLastOSError; {$ENDIF} end; end; //---------------------------------------------------------------------------------------------------------------------- procedure TWideStrings.LoadFromStream(Stream: TStream); // usual loader routine, but enhanced to handle byte order marks in stream var Size, BytesRead: Integer; Order: WideChar; SW: WideString; SA: String; begin BeginUpdate; try Size := Stream.Size - Stream.Position; BytesRead := Stream.Read(Order, 2); if (Order = BOM_LSB_FIRST) or (Order = BOM_MSB_FIRST) then begin FSaveUnicode := True; SetLength(SW, (Size - 2) div 2); Stream.Read(PWideChar(SW)^, Size - 2); if Order = BOM_MSB_FIRST then StrSwapByteOrder(PWideChar(SW)); SetTextStr(SW); end else begin // without byte order mark it is assumed that we are loading ANSI text FSaveUnicode := False; Stream.Seek(-BytesRead, soFromCurrent); SetLength(SA, Size); Stream.Read(PChar(SA)^, Size); SetTextStr(SA); end; finally EndUpdate; end; end; //---------------------------------------------------------------------------------------------------------------------- procedure TWideStrings.Move(CurIndex, NewIndex: Integer); var TempObject: TObject; TempString: WideString; begin if CurIndex <> NewIndex then begin BeginUpdate; try TempString := Get(CurIndex); TempObject := GetObject(CurIndex); Delete(CurIndex); InsertObject(NewIndex, TempString, TempObject); finally EndUpdate; end; end; end; //---------------------------------------------------------------------------------------------------------------------- procedure TWideStrings.Put(Index: Integer; const S: WideString); var TempObject: TObject; begin TempObject := GetObject(Index); Delete(Index); InsertObject(Index, S, TempObject); end; //---------------------------------------------------------------------------------------------------------------------- procedure TWideStrings.PutObject(Index: Integer; AObject: TObject); begin end; //---------------------------------------------------------------------------------------------------------------------- procedure TWideStrings.ReadData(Reader: TReader); begin Reader.ReadListBegin; BeginUpdate; try Clear; while not Reader.EndOfList do Add(Reader.ReadWideString); finally EndUpdate; end; Reader.ReadListEnd; end; //---------------------------------------------------------------------------------------------------------------------- procedure TWideStrings.SaveToFile(const FileName: String); var Stream: TStream; begin Stream := TFileStream.Create(FileName, fmCreate); try SaveToStream(Stream); finally Stream.Free; end; end; //---------------------------------------------------------------------------------------------------------------------- procedure TWideStrings.SaveToStream(Stream: TStream); var SW, BOM: WideString; SA: String; Allowed: Boolean; Run: PWideChar; begin // The application can decide in which format to save the content. // If FSaveUnicode is False then all strings are saved in standard ANSI format // which is also loadable by TStrings but you should be aware that all Unicode // strings are then converted to ANSI based on the current system locale. // An extra event is supplied to ask the user about the potential loss of information // when converting Unicode to ANSI strings. SW := GetTextStr; Allowed := True; FSaved := False; // be pessimistic // check for potential information loss makes only sense if the application has set // an event to be used as call back to ask about the conversion if not FSaveUnicode and Assigned(FOnConfirmConversion) then begin // application requests to save only ANSI characters, so check the text and // call back in case information could be lost Run := PWideChar(SW); // only ask if there's at least one Unicode character in the text while Run^ in [WideChar(#1)..WideChar(#255)] do Inc(Run); // Note: The application can still set FSaveUnicode to True in the callback. if Run^ <> WideNull then FOnConfirmConversion(Self, Allowed); end; if Allowed then begin // only save if allowed if FSaveUnicode then begin BOM := BOM_LSB_FIRST; Stream.WriteBuffer(PWideChar(BOM)^, 2); // SW has already been filled Stream.WriteBuffer(PWideChar(SW)^, 2 * Length(SW)); end else begin // implicit conversion to ANSI SA := SW; if Allowed then Stream.WriteBuffer(PChar(SA)^, Length(SA)); end; FSaved := True; end; end; //---------------------------------------------------------------------------------------------------------------------- procedure TWideStrings.SetCapacity(NewCapacity: Integer); begin // do nothing - descendants may optionally implement this method end; //---------------------------------------------------------------------------------------------------------------------- procedure TWideStrings.SetCommaText(const Value: WideString); var P, P1: PWideChar; S: WideString; begin BeginUpdate; try Clear; P := PWideChar(Value); while P^ in [WideChar(#1)..Space] do Inc(P); while P^ <> WideNull do begin if P^ = '"' then S := WideExtractQuotedStr(P, '"') else begin P1 := P; while (P^ > Space) and (P^ <> ',') do Inc(P); SetString(S, P1, P - P1); end; Add(S); while P^ in [WideChar(#1)..Space] do Inc(P); if P^ = ',' then repeat Inc(P); until not (P^ in [WideChar(#1)..Space]); end; finally EndUpdate; end; end; //---------------------------------------------------------------------------------------------------------------------- procedure TWideStrings.SetText(Text: PWideChar); begin SetTextStr(Text); end; //---------------------------------------------------------------------------------------------------------------------- procedure TWideStrings.SetTextStr(const Value: WideString); var Head, Tail: PWideChar; S: WideString; begin BeginUpdate; try Clear; Head := PWideChar(Value); while Head^ <> WideNull do begin Tail := Head; while not (Tail^ in [WideNull, LineFeed, CarriageReturn, VerticalTab, FormFeed]) and (Tail^ <> LineSeparator) and (Tail^ <> ParagraphSeparator) do Inc(Tail); SetString(S, Head, Tail - Head); Add(S); Head := Tail; if Head^ <> WideNull then begin Inc(Head); if (Tail^ = CarriageReturn) and (Head^ = LineFeed) then Inc(Head); end; end; finally EndUpdate; end; end; //---------------------------------------------------------------------------------------------------------------------- procedure TWideStrings.SetUpdateState(Updating: Boolean); begin end; //---------------------------------------------------------------------------------------------------------------------- procedure TWideStrings.SetValue(const Name, Value: WideString); var I : Integer; begin I := IndexOfName(Name); if Value <> '' then begin if I < 0 then I := Add(''); Put(I, Name + '=' + Value); end else if I >= 0 then Delete(I); end; //---------------------------------------------------------------------------------------------------------------------- procedure TWideStrings.WriteData(Writer: TWriter); var I: Integer; begin Writer.WriteListBegin; for I := 0 to Count-1 do Writer.WriteWideString(Get(I)); Writer.WriteListEnd; end; //----------------- TWideStringList ------------------------------------------------------------------------------------ destructor TWideStringList.Destroy; begin FOnChange := nil; FOnChanging := nil; FCount := 0; FList := nil; inherited Destroy; end; //---------------------------------------------------------------------------------------------------------------------- function TWideStringList.Add(const S: WideString): Integer; begin if not Sorted then Result := FCount else if Find(S, Result) then case Duplicates of dupIgnore: Exit; dupError: Error(SDuplicateString, 0); end; InsertItem(Result, S); end; //---------------------------------------------------------------------------------------------------------------------- procedure TWideStringList.Changed; begin if (FUpdateCount = 0) and Assigned(FOnChange) then FOnChange(Self); end; //---------------------------------------------------------------------------------------------------------------------- procedure TWideStringList.Changing; begin if (FUpdateCount = 0) and Assigned(FOnChanging) then FOnChanging(Self); end; //---------------------------------------------------------------------------------------------------------------------- procedure TWideStringList.Clear; begin if FCount <> 0 then begin Changing; // this will automatically finalize the array FList := nil; FCount := 0; SetCapacity(0); Changed; end; end; //---------------------------------------------------------------------------------------------------------------------- procedure TWideStringList.Delete(Index: Integer); begin if (Index < 0) or (Index >= FCount) then Error(SListIndexError, Index); Changing; FList[Index].FString := ''; Dec(FCount); if Index < FCount then begin System.Move(FList[Index + 1], FList[Index], (FCount - Index) * SizeOf(TWideStringItem)); Pointer(FList[FCount].FString) := nil; end; Changed; end; //---------------------------------------------------------------------------------------------------------------------- procedure TWideStringList.Exchange(Index1, Index2: Integer); begin if (Index1 < 0) or (Index1 >= FCount) then Error(SListIndexError, Index1); if (Index2 < 0) or (Index2 >= FCount) then Error(SListIndexError, Index2); Changing; ExchangeItems(Index1, Index2); Changed; end; //---------------------------------------------------------------------------------------------------------------------- procedure TWideStringList.ExchangeItems(Index1, Index2: Integer); var Temp: TWideStringItem; begin Temp := FList[Index1]; FList[Index1] := FList[Index2]; FList[Index2] := Temp; end; //---------------------------------------------------------------------------------------------------------------------- function TWideStringList.Find(const S: WideString; var Index: Integer): Boolean; var L, H, I, C: Integer; begin Result := False; L := 0; H := FCount - 1; while L <= H do begin I := (L + H) shr 1; C := WideCompareText(FList[I].FString, S, FLanguage); if C < 0 then L := I+1 else begin H := I - 1; if C = 0 then begin Result := True; if Duplicates <> dupAccept then L := I; end; end; end; Index := L; end; //---------------------------------------------------------------------------------------------------------------------- function TWideStringList.Get(Index: Integer): WideString; begin if (Index < 0) or (Index >= FCount) then Error(SListIndexError, Index); Result := FList[Index].FString; end; //---------------------------------------------------------------------------------------------------------------------- function TWideStringList.GetCapacity: Integer; begin Result := Length(FList); end; //---------------------------------------------------------------------------------------------------------------------- function TWideStringList.GetCount: Integer; begin Result := FCount; end; //---------------------------------------------------------------------------------------------------------------------- function TWideStringList.GetObject(Index: Integer): TObject; begin if (Index < 0) or (Index >= FCount) then Error(SListIndexError, Index); Result := FList[Index].FObject; end; //---------------------------------------------------------------------------------------------------------------------- procedure TWideStringList.Grow; var Delta, Len: Integer; begin Len := Length(FList); if Len > 64 then Delta := Len div 4 else if Len > 8 then Delta := 16 else Delta := 4; SetCapacity(Len + Delta); end; //---------------------------------------------------------------------------------------------------------------------- function TWideStringList.IndexOf(const S: WideString): Integer; begin if not Sorted then Result := inherited IndexOf(S) else if not Find(S, Result) then Result := -1; end; //---------------------------------------------------------------------------------------------------------------------- procedure TWideStringList.Insert(Index: Integer; const S: WideString); begin if Sorted then Error(SSortedListError, 0); if (Index < 0) or (Index > FCount) then Error(SListIndexError, Index); InsertItem(Index, S); end; //---------------------------------------------------------------------------------------------------------------------- procedure TWideStringList.InsertItem(Index: Integer; const S: WideString); begin Changing; if FCount = Length(FList) then Grow; if Index < FCount then System.Move(FList[Index], FList[Index + 1], (FCount - Index) * SizeOf(TWideStringItem)); with FList[Index] do begin Pointer(FString) := nil; FObject := nil; FString := S; end; Inc(FCount); Changed; end; //---------------------------------------------------------------------------------------------------------------------- procedure TWideStringList.Put(Index: Integer; const S: WideString); begin if Sorted then Error(SSortedListError, 0); if (Index < 0) or (Index >= FCount) then Error(SListIndexError, Index); Changing; FList[Index].FString := S; Changed; end; //---------------------------------------------------------------------------------------------------------------------- procedure TWideStringList.PutObject(Index: Integer; AObject: TObject); begin if (Index < 0) or (Index >= FCount) then Error(SListIndexError, Index); Changing; FList[Index].FObject := AObject; Changed; end; //---------------------------------------------------------------------------------------------------------------------- procedure TWideStringList.QuickSort(L, R: Integer); var I, J: Integer; P: WideString; begin repeat I := L; J := R; P := FList[(L + R) shr 1].FString; repeat while WideCompareText(FList[I].FString, P, FLanguage) < 0 do Inc(I); while WideCompareText(FList[J].FString, P, FLanguage) > 0 do Dec(J); if I <= J then begin ExchangeItems(I, J); Inc(I); Dec(J); end; until I > J; if L < J then QuickSort(L, J); L := I; until I >= R; end; //---------------------------------------------------------------------------------------------------------------------- procedure TWideStringList.SetCapacity(NewCapacity: Integer); begin SetLength(FList, NewCapacity); end; //---------------------------------------------------------------------------------------------------------------------- procedure TWideStringList.SetSorted(Value: Boolean); begin if FSorted <> Value then begin if Value then Sort; FSorted := Value; end; end; //---------------------------------------------------------------------------------------------------------------------- procedure TWideStringList.SetUpdateState(Updating: Boolean); begin if Updating then Changing else Changed; end; //---------------------------------------------------------------------------------------------------------------------- procedure TWideStringList.Sort; begin if not Sorted and (FCount > 1) then begin Changing; QuickSort(0, FCount - 1); Changed; end; end; //---------------------------------------------------------------------------------------------------------------------- procedure TWideStringList.SetLanguage(Value: LCID); begin inherited; if Sorted then Sort; end; //----------------- functions for null terminated wide strings --------------------------------------------------------- function StrLenW(Str: PWideChar): Cardinal; // returns number of characters in a string excluding the null terminator asm MOV EDX, EDI MOV EDI, EAX MOV ECX, 0FFFFFFFFH XOR AX, AX REPNE SCASW MOV EAX, 0FFFFFFFEH SUB EAX, ECX MOV EDI, EDX end; //---------------------------------------------------------------------------------------------------------------------- function StrEndW(Str: PWideChar): PWideChar; // returns a pointer to the end of a null terminated string asm MOV EDX, EDI MOV EDI, EAX MOV ECX, 0FFFFFFFFH XOR AX, AX REPNE SCASW LEA EAX, [EDI - 2] MOV EDI, EDX end; //---------------------------------------------------------------------------------------------------------------------- function StrMoveW(Dest, Source: PWideChar; Count: Cardinal): PWideChar; // Copies the specified number of characters to the destination string and returns Dest // also as result. Dest must have enough room to store at least Count characters. asm PUSH ESI PUSH EDI MOV ESI, EDX MOV EDI, EAX MOV EDX, ECX CMP EDI, ESI JG @@1 JE @@2 SHR ECX, 1 REP MOVSD MOV ECX, EDX AND ECX, 1 REP MOVSW JMP @@2 @@1: LEA ESI, [ESI + 2 * ECX - 2] LEA EDI, [EDI + 2 * ECX - 2] STD AND ECX, 1 REP MOVSW SUB EDI, 2 SUB ESI, 2 MOV ECX, EDX SHR ECX, 1 REP MOVSD CLD @@2: POP EDI POP ESI end; //---------------------------------------------------------------------------------------------------------------------- function StrCopyW(Dest, Source: PWideChar): PWideChar; // copies Source to Dest and returns Dest asm PUSH EDI PUSH ESI MOV ESI, EAX MOV EDI, EDX MOV ECX, 0FFFFFFFFH XOR AX, AX REPNE SCASW NOT ECX MOV EDI, ESI MOV ESI, EDX MOV EDX, ECX MOV EAX, EDI SHR ECX, 1 REP MOVSD MOV ECX, EDX AND ECX, 1 REP MOVSW POP ESI POP EDI end; //---------------------------------------------------------------------------------------------------------------------- function StrECopyW(Dest, Source: PWideChar): PWideChar; // copies Source to Dest and returns a pointer to the null character ending the string asm PUSH EDI PUSH ESI MOV ESI, EAX MOV EDI, EDX MOV ECX, 0FFFFFFFFH XOR AX, AX REPNE SCASW NOT ECX MOV EDI, ESI MOV ESI, EDX MOV EDX, ECX SHR ECX, 1 REP MOVSD MOV ECX, EDX AND ECX, 1 REP MOVSW LEA EAX, [EDI - 2] POP ESI POP EDI end; //---------------------------------------------------------------------------------------------------------------------- function StrLCopyW(Dest, Source: PWideChar; MaxLen: Cardinal): PWideChar; // copies a specified maximum number of characters from Source to Dest asm PUSH EDI PUSH ESI PUSH EBX MOV ESI, EAX MOV EDI, EDX MOV EBX, ECX XOR AX, AX TEST ECX, ECX JZ @@1 REPNE SCASW JNE @@1 INC ECX @@1: SUB EBX, ECX MOV EDI, ESI MOV ESI, EDX MOV EDX, EDI MOV ECX, EBX SHR ECX, 1 REP MOVSD MOV ECX, EBX AND ECX, 1 REP MOVSW STOSW MOV EAX, EDX POP EBX POP ESI POP EDI end; //---------------------------------------------------------------------------------------------------------------------- function StrPCopyW(Dest: PWideChar; const Source: String): PWideChar; // copies a Pascal-style string to a null-terminated wide string begin if Length(Source) > 0 then Result := StrPLCopyW(Dest, Source, Length(Source)) else Result := Dest; Result[Length(Source)] := WideNull; end; //---------------------------------------------------------------------------------------------------------------------- function StrPLCopyW(Dest: PWideChar; const Source: String; MaxLen: Cardinal): PWideChar; // copies characters from a Pascal-style string into a null-terminated wide string asm PUSH EDI PUSH ESI MOV EDI, EAX MOV ESI, EDX MOV EDX, EAX XOR AX, AX @@1: LODSB STOSW DEC ECX JNZ @@1 MOV EAX, EDX POP ESI POP EDI end; //---------------------------------------------------------------------------------------------------------------------- function StrCatW(Dest, Source: PWideChar): PWideChar; // appends a copy of Source to the end of Dest and returns the concatenated string begin StrCopyW(StrEndW(Dest), Source); Result := Dest; end; //---------------------------------------------------------------------------------------------------------------------- function StrLCatW(Dest, Source: PWideChar; MaxLen: Cardinal): PWideChar; // appends a specified maximum number of WideCharacters to string asm PUSH EDI PUSH ESI PUSH EBX MOV EDI, Dest MOV ESI, Source MOV EBX, MaxLen SHL EBX, 1 CALL StrEndW MOV ECX, EDI ADD ECX, EBX SUB ECX, EAX JBE @@1 MOV EDX, ESI SHR ECX, 1 CALL StrLCopyW @@1: MOV EAX, EDI POP EBX POP ESI POP EDI end; //---------------------------------------------------------------------------------------------------------------------- function StrCompW(Str1, Str2: PWideChar): Integer; // compares Str1 to Str2 (binary comparation) // Note: There's also an extended comparation function which uses a given language to // compare unicode strings. asm PUSH EDI PUSH ESI MOV EDI, EDX MOV ESI, EAX MOV ECX, 0FFFFFFFFH XOR EAX, EAX REPNE SCASW NOT ECX MOV EDI, EDX XOR EDX, EDX REPE CMPSW MOV AX, [ESI - 2] MOV DX, [EDI - 2] SUB EAX, EDX POP ESI POP EDI end; //---------------------------------------------------------------------------------------------------------------------- function StrICompW(Str1, Str2: PWideChar): Integer; // compares Str1 to Str2 without case sensitivity (binary comparation), // Note: only ANSI characters are compared case insensitively asm PUSH EDI PUSH ESI MOV EDI, EDX MOV ESI, EAX MOV ECX, 0FFFFFFFFH XOR EAX, EAX REPNE SCASW NOT ECX MOV EDI, EDX XOR EDX, EDX @@1: REPE CMPSW JE @@4 MOV AX, [ESI - 2] CMP AX, 'a' JB @@2 CMP AX, 'z' JA @@2 SUB AL, 20H @@2: MOV DX, [EDI - 2] CMP DX, 'a' JB @@3 CMP DX, 'z' JA @@3 SUB DX, 20H @@3: SUB EAX, EDX JE @@1 @@4: POP ESI POP EDI end; //---------------------------------------------------------------------------------------------------------------------- function StrLCompW(Str1, Str2: PWideChar; MaxLen: Cardinal): Integer; // compares a specified maximum number of charaters in two strings asm PUSH EDI PUSH ESI PUSH EBX MOV EDI, EDX MOV ESI, EAX MOV EBX, ECX XOR EAX, EAX OR ECX, ECX JE @@1 REPNE SCASW SUB EBX, ECX MOV ECX, EBX MOV EDI, EDX XOR EDX, EDX REPE CMPSW MOV AX, [ESI - 2] MOV DX, [EDI - 2] SUB EAX, EDX @@1: POP EBX POP ESI POP EDI end; //---------------------------------------------------------------------------------------------------------------------- function StrLICompW(Str1, Str2: PWideChar; MaxLen: Cardinal): Integer; // compares strings up to a specified maximum number of characters, not case sensitive // Note: only ANSI characters are compared case insensitively asm PUSH EDI PUSH ESI PUSH EBX MOV EDI, EDX MOV ESI, EAX MOV EBX, ECX XOR EAX, EAX OR ECX, ECX JE @@4 REPNE SCASW SUB EBX, ECX MOV ECX, EBX MOV EDI, EDX XOR EDX, EDX @@1: REPE CMPSW JE @@4 MOV AX, [ESI - 2] CMP AX, 'a' JB @@2 CMP AX, 'z' JA @@2 SUB AX, 20H @@2: MOV DX, [EDI - 2] CMP DX, 'a' JB @@3 CMP DX, 'z' JA @@3 SUB DX, 20H @@3: SUB EAX, EDX JE @@1 @@4: POP EBX POP ESI POP EDI end; //---------------------------------------------------------------------------------------------------------------------- function StrNScanW(S1, S2: PWideChar): Integer; // determines where (in S1) the first time one of the characters of S2 appear. // The result is the length of a string part of S1 where none of the characters of // S2 do appear (not counting the trailing #0 and starting with position 0 in S1). var Run: PWideChar; begin Result := -1; if Assigned(S1) and Assigned(S2) then begin Run := S1; while (Run^ <> #0) do begin if StrScanW(S2, Run^) <> nil then Break; Inc(Run); end; Result := Run - S1; end; end; //---------------------------------------------------------------------------------------------------------------------- function StrRNScanW(S1, S2: PWideChar): Integer; // This function does the same as StrRNScanW but uses S1 in reverse order. This means S1 points to the last // character of a string, is traveresed reversely and terminates with a starting #0. // This is useful for parsing strings stored in reversed macro buffers etc. var Run: PWideChar; begin Result := -1; if Assigned(S1) and Assigned(S2) then begin Run := S1; while (Run^ <> #0) do begin if StrScanW(S2, Run^) <> nil then Break; Dec(Run); end; Result := S1 - Run; end; end; //---------------------------------------------------------------------------------------------------------------------- function StrScanW(Str: PWideChar; Chr: WideChar): PWideChar; // returns a pointer to first occurrence of a specified character in a string asm PUSH EDI PUSH EAX MOV EDI, Str MOV ECX, 0FFFFFFFFH XOR AX, AX REPNE SCASW NOT ECX POP EDI MOV AX, Chr REPNE SCASW MOV EAX, 0 JNE @@1 MOV EAX, EDI SUB EAX, 2 @@1: POP EDI end; //---------------------------------------------------------------------------------------------------------------------- function StrScanW(Str: PWideChar; Chr: WideChar; StrLen: Cardinal): PWideChar; // returns a pointer to first occurrence of a specified character in a string // or nil if not found // Note: this is just a binary search for the specified character and there's no check for // a terminating null. Instead at most StrLen characters are searched. This makes // this function extremly fast. // // on enter EAX contains Str, EDX contains Chr and ECX StrLen // on exit EAX contains result pointer or nil asm TEST EAX, EAX JZ @@Exit // get out if the string is nil or StrLen is 0 JCXZ @@Exit @@Loop: CMP [EAX], DX // this unrolled loop is actually faster on modern processors JE @@Exit // than REP SCASW INC EAX DEC ECX JNZ @@Loop XOR EAX, EAX @@Exit: end; //---------------------------------------------------------------------------------------------------------------------- function StrRScanW(Str: PWideChar; Chr: WideChar): PWideChar; // returns a pointer to the last occurance of Chr in Str asm PUSH EDI MOV EDI, Str MOV ECX, 0FFFFFFFFH XOR AX, AX REPNE SCASW NOT ECX STD SUB EDI, 2 MOV AX, Chr REPNE SCASW MOV EAX, 0 JNE @@1 MOV EAX, EDI ADD EAX, 2 @@1: CLD POP EDI end; //---------------------------------------------------------------------------------------------------------------------- function StrPosW(Str, SubStr: PWideChar): PWideChar; // returns a pointer to the first occurance of SubStr in Str asm PUSH EDI PUSH ESI PUSH EBX OR EAX, EAX JZ @@2 OR EDX, EDX JZ @@2 MOV EBX, EAX MOV EDI, EDX XOR AX, AX MOV ECX, 0FFFFFFFFH REPNE SCASW NOT ECX DEC ECX JZ @@2 MOV ESI, ECX MOV EDI, EBX MOV ECX, 0FFFFFFFFH REPNE SCASW NOT ECX SUB ECX, ESI JBE @@2 MOV EDI, EBX LEA EBX, [ESI - 1] // Note: 2 would be wrong here, we are dealing with numbers not an address @@1: MOV ESI, EDX LODSW REPNE SCASW JNE @@2 MOV EAX, ECX PUSH EDI MOV ECX, EBX REPE CMPSW POP EDI MOV ECX, EAX JNE @@1 LEA EAX, [EDI - 2] JMP @@3 @@2: XOR EAX, EAX @@3: POP EBX POP ESI POP EDI end; //---------------------------------------------------------------------------------------------------------------------- function StrUpperW(Str: PWideChar): PWideChar; // converts Str to upper case and returns it begin Result := Str; while Str^ <> WideNull do begin Str^ := WideChar(UnicodeToUpper(Word(Str^))); Inc(Str); end; end; //---------------------------------------------------------------------------------------------------------------------- function StrLowerW(Str: PWideChar): PWideChar; // converts Str to lower case and returns it begin Result := Str; while Str^ <> WideNull do begin Str^ := WideChar(UnicodeToLower(Word(Str^))); Inc(Str); end; end; //---------------------------------------------------------------------------------------------------------------------- function StrTitleW(Str: PWideChar): PWideChar; // converts Str to title case and returns it begin Result := Str; while Str^ <> WideNull do begin Str^ := WideChar(UnicodeToTitle(Word(Str^))); Inc(Str); end; end; //---------------------------------------------------------------------------------------------------------------------- function StrAllocW(Size: Cardinal): PWideChar; // Allocates a buffer for a null-terminated wide string and returns a pointer // to the first character of the string. begin Size := SizeOf(WideChar) * Size + SizeOf(Cardinal); GetMem(Result, Size); FillChar(Result^, Size, 0); Cardinal(Pointer(Result)^) := Size; Inc(Result, SizeOf(Cardinal) div SizeOf(WideChar)); end; //---------------------------------------------------------------------------------------------------------------------- function StrBufSizeW(Str: PWideChar): Cardinal; // Returns max number of wide characters that can be stored in a buffer allocated by StrAllocW. begin Dec(Str, SizeOf(Cardinal) div SizeOf(WideChar)); Result := (Cardinal(Pointer(Str)^) - SizeOf(Cardinal)) div 2; end; //---------------------------------------------------------------------------------------------------------------------- function StrNewW(Str: PWideChar): PWideChar; // Duplicates the given string (if not nil) and returns the address of the new string. var Size: Cardinal; begin if Str = nil then Result := nil else begin Size := StrLenW(Str) + 1; Result := StrMoveW(StrAllocW(Size), Str, Size); end; end; //---------------------------------------------------------------------------------------------------------------------- procedure StrDisposeW(Str: PWideChar); // releases a string allocated with StrNew. begin if Str <> nil then begin Dec(Str, SizeOf(Cardinal) div SizeOf(WideChar)); FreeMem(Str, Cardinal(Pointer(Str)^)); end; end; //---------------------------------------------------------------------------------------------------------------------- procedure StrSwapByteOrder(Str: PWideChar); // exchanges in each character of the given string the low order and high order // byte to go from LSB to MSB and vice versa. // EAX contains address of string asm PUSH ESI PUSH EDI MOV ESI, EAX MOV EDI, ESI XOR EAX, EAX // clear high order byte to be able to use 32bit operand below @@1: LODSW OR EAX, EAX JZ @@2 XCHG AL, AH STOSW JMP @@1 @@2: POP EDI POP ESI end; //---------------------------------------------------------------------------------------------------------------------- function WideAdjustLineBreaks(const S: WideString): WideString; var Source, SourceEnd, Dest: PWideChar; Extra: Integer; begin Source := Pointer(S); SourceEnd := Source + Length(S); Extra := 0; while Source < SourceEnd do begin case Source^ of LF: Inc(Extra); CR: if Source[1] = LineFeed then Inc(Source) else Inc(Extra); end; Inc(Source); end; Source := Pointer(S); SetString(Result, nil, SourceEnd - Source + Extra); Dest := Pointer(Result); while Source < SourceEnd do case Source^ of LineFeed: begin Dest^ := LineSeparator; Inc(Dest); Inc(Source); end; CarriageReturn: begin Dest^ := LineSeparator; Inc(Dest); Inc(Source); if Source^ = LineFeed then Inc(Source); end; else Dest^ := Source^; Inc(Dest); Inc(Source); end; end; //---------------------------------------------------------------------------------------------------------------------- function WideQuotedStr(const S: WideString; Quote: WideChar): WideString; // works like QuotedStr from SysUtils.pas but can insert any quotation character var P, Src, Dest: PWideChar; AddCount: Integer; begin AddCount := 0; P := StrScanW(PWideChar(S), Quote); while Assigned(P) do begin Inc(P); Inc(AddCount); P := StrScanW(P, Quote); end; if AddCount = 0 then Result := Quote + S + Quote else begin SetLength(Result, Length(S) + AddCount + 2); Dest := PWideChar(Result); Dest^ := Quote; Inc(Dest); Src := PWideChar(S); P := StrScanW(Src, Quote); repeat Inc(P); Move(Src^, Dest^, (P - Src) * SizeOf(WideChar)); Inc(Dest, P - Src); Dest^ := Quote; Inc(Dest); Src := P; P := StrScanW(Src, Quote); until P = nil; P := StrEndW(Src); Move(Src^, Dest^, (P - Src) * SizeOf(WideChar)); Inc(Dest, P - Src); Dest^ := Quote; end; end; //---------------------------------------------------------------------------------------------------------------------- function WideExtractQuotedStr(var Src: PWideChar; Quote: WideChar): WideString; // extracts a string enclosed in quote characters given by Quote var P, Dest: PWideChar; DropCount: Integer; begin Result := ''; if (Src = nil) or (Src^ <> Quote) then Exit; Inc(Src); DropCount := 1; P := Src; Src := StrScanW(Src, Quote); while Assigned(Src) do // count adjacent pairs of quote chars begin Inc(Src); if Src^ <> Quote then Break; Inc(Src); Inc(DropCount); Src := StrScanW(Src, Quote); end; if Src = nil then Src := StrEndW(P); if (Src - P) <= 1 then Exit; if DropCount = 1 then SetString(Result, P, Src - P - 1) else begin SetLength(Result, Src - P - DropCount); Dest := PWideChar(Result); Src := StrScanW(P, Quote); while Assigned(Src) do begin Inc(Src); if Src^ <> Quote then Break; Move(P^, Dest^, (Src - P) * SizeOf(WideChar)); Inc(Dest, Src - P); Inc(Src); P := Src; Src := StrScanW(Src, Quote); end; if Src = nil then Src := StrEndW(P); Move(P^, Dest^, (Src - P - 1) * SizeOf(WideChar)); end; end; //---------------------------------------------------------------------------------------------------------------------- function WideStringOfChar(C: WideChar; Count: Cardinal): WideString; // returns a string of Count characters filled with C var I: Integer; begin SetLength(Result, Count); for I := 1 to Count do Result[I] := C; end; //---------------------------------------------------------------------------------------------------------------------- function WideTrim(const S: WideString): WideString; var I, L: Integer; begin L := Length(S); I := 1; while (I <= L) and (UnicodeIsWhiteSpace(Word(S[I])) or UnicodeIsControl(Word(S[I]))) do Inc(I); if I > L then Result := '' else begin while UnicodeIsWhiteSpace(Word(S[L])) or UnicodeIsControl(Word(S[L])) do Dec(L); Result := Copy(S, I, L - I + 1); end; end; //---------------------------------------------------------------------------------------------------------------------- function WideTrimLeft(const S: WideString): WideString; var I, L: Integer; begin L := Length(S); I := 1; while (I <= L) and (UnicodeIsWhiteSpace(Word(S[I])) or UnicodeIsControl(Word(S[I]))) do Inc(I); Result := Copy(S, I, Maxint); end; //---------------------------------------------------------------------------------------------------------------------- function WideTrimRight(const S: WideString): WideString; var I: Integer; begin I := Length(S); while (I > 0) and (UnicodeIsWhiteSpace(Word(S[I])) or UnicodeIsControl(Word(S[I]))) do Dec(I); Result := Copy(S, 1, I); end; //---------------------------------------------------------------------------------------------------------------------- function WideCharPos(const S: WideString; const Ch: WideChar; const Index: Integer): Integer; // returns the index of character Ch in S, starts searching at index Index // Note: This is a quick memory search. No attempt is made to interpret either the given // charcter nor the string (ligatures, modifiers, surrogates etc.) asm TEST EAX,EAX // make sure we are not null JZ @StrIsNil DEC ECX // make index zero based JL @IdxIsSmall PUSH EBX PUSH EDI MOV EDI,EAX // EDI := S XOR EAX,EAX MOV AX, DX // AX := Ch MOV EDX,[EDI-4] // EDX := Length(S) * 2 SHR EDX,1 // EDX := EDX div 2 MOV EBX,EDX // save the length to calc. result SUB EDX,ECX // EDX = EDX - Index = # of chars to scan JLE @IdxIsBig ADD EDI,ECX // point to index'th char MOV ECX,EDX // loop counter CLD REPNE SCASW JNE @NoMatch MOV EAX,EBX // result := saved length - SUB EAX,ECX // loop counter value POP EDI POP EBX RET @IdxIsBig: @NoMatch: XOR EAX,EAX POP EDI POP EBX RET @IdxIsSmall: XOR EAX,EAX @StrIsNil: end; //---------------------------------------------------------------------------------------------------------------------- function WideCompose(const S: WideString): WideString; // returns a string with all characters of S but if there is a possibility to combine characters // then they are composed var I: Integer; begin for I := 1 to Length(S) do begin //UnicodeCompose( end; end; //---------------------------------------------------------------------------------------------------------------------- function WideComposeHangul(Source: WideString): WideString; var Len: Integer; Ch, Last: WideChar; I, J: Integer; LINdex, VIndex, SIndex, TIndex: Integer; begin // copy first char Len := Length(Source); if Len > 0 then begin // allocate memory only once and shorten the result when done SetLength(Result, Len); J := 1; Last := Source[J]; Result := Last; for I := 2 to Len do begin Ch := Source[I]; // 1. check to see if two current characters are L and V LIndex := Word(Last) - LBase; if (0 <= LIndex) and (LIndex < LCount) then begin VIndex := Word(Ch) - VBase; if (0 <= VIndex) and (VIndex < VCount) then begin // make syllable of form LV Last := WideChar((SBase + (LIndex * VCount + VIndex) * TCount)); Result[J] := Last; // reset last Continue; // discard Ch end; end; // 2. check to see if two current characters are LV and T SIndex := Word(Last) - SBase; if (0 <= SIndex) and (SIndex < SCount) and ((SIndex mod TCount) = 0) then begin TIndex := Word(Ch) - TBase; if (0 <= TIndex) and (TIndex <= TCount) then begin // make syllable of form LVT Inc(Word(Last), TIndex); Result[J] := Last; // reset last Continue; // discard ch end; end; // if neither case was true, just add the character Last := Ch; Inc(J); Result[J] := Ch; end; // shorten the result to real length SetLength(Result, J); end else Result := ''; end; //---------------------------------------------------------------------------------------------------------------------- function WideDecompose(const S: WideString): WideString; // returns a string with all characters of S but decomposed, e.g. Ê is returned as E^ etc. var I, J, K: Integer; CClass: Cardinal; Decomp: TCardinalArray; begin Result := ''; Decomp := nil; for I := 1 to Length(S) do begin // no need to dive iteratively into decompositions as this is already done // on creation of the data used to lookup the decomposition Decomp := UnicodeDecompose(Word(S[I])); // We need to sort the returned values according to their canonical class. for J := 0 to High(Decomp) do begin CClass := UnicodeCanonicalClass(Decomp[J]); if CClass = 0 then Result := Result + WideChar(Decomp[J]) else begin K := Length(Result); // bubble-sort combining marks as necessary while K > 1 do begin if UnicodeCanonicalClass(Word(Result[K])) <= CClass then Break; Dec(K); end; Insert(WideChar(Decomp[J]), Result, K + 1); end; end; end; end; //---------------------------------------------------------------------------------------------------------------------- function WideLoCase(C: WideChar): WideChar; begin Result := WideChar(UnicodeToLower(Word(C))); end; //---------------------------------------------------------------------------------------------------------------------- function WideLowerCase(const S: WideString): WideString; var I: Integer; begin Result := S; for I := 1 to Length(S) do Result[I] := WideChar(UnicodeToLower(Word(Result[I]))); end; //---------------------------------------------------------------------------------------------------------------------- function WideTitleCaseChar(C: WideChar): WideChar; begin Result := WideChar(UnicodeToTitle(Word(C))); end; //---------------------------------------------------------------------------------------------------------------------- function WideTitleCaseString(const S: WideString): WideString; var I: Integer; begin Result := S; for I := 1 to Length(S) do Result[I] := WideChar(UnicodeToTitle(Word(Result[I]))); end; //---------------------------------------------------------------------------------------------------------------------- function WideUpCase(C: WideChar): WideChar; begin Result := WideChar(UnicodeToUpper(Word(C))); end; //---------------------------------------------------------------------------------------------------------------------- function WideUpperCase(const S: WideString): WideString; var I: Integer; begin Result := S; for I := 1 to Length(S) do Result[I] := WideChar(UnicodeToUpper(Word(Result[I]))); end; //----------------- character test routines ---------------------------------------------------------------------------- // Is the character alphabetic? function UnicodeIsAlpha(C: UCS4): Boolean; begin Result := IsProperty(C, UC_LU or UC_LL or UC_LM or UC_LO or UC_LT, 0); end; // Is the character a digit? function UnicodeIsDigit(C: UCS4): Boolean; begin Result := IsProperty(C, UC_ND, 0); end; // Is the character alphabetic or a number? function UnicodeIsAlphaNum(C: UCS4): Boolean; begin Result := IsProperty(C, UC_LU or UC_LL or UC_LM or UC_LO or UC_LT or UC_ND, 0); end; // Is the character a control character? function UnicodeIsControl(C: UCS4): Boolean; begin Result := IsProperty(C, UC_CC or UC_CF, 0); end; // Is the character a spacing character? function UnicodeIsSpace(C: UCS4): Boolean; begin Result := IsProperty(C, UC_ZS or UC_SS, 0); end; // Is the character a white space character (same as UnicodeIsSpace plus tabulator, new line etc.)? function UnicodeIsWhiteSpace(C: UCS4): Boolean; begin Result := IsProperty(C, UC_ZS or UC_SS, UC_WS or UC_S); end; // Is the character a space separator? function UnicodeIsBlank(C: UCS4): Boolean; begin Result := IsProperty(C, UC_ZS, 0); end; // Is the character a punctuation mark? function UnicodeIsPunctuation(C: UCS4): Boolean; begin Result := IsProperty(C, UC_PD or UC_PS or UC_PE or UC_PO, UC_PI or UC_PF); end; // Is the character graphical? function UnicodeIsGraph(C: UCS4): Boolean; begin Result := IsProperty(C, UC_MN or UC_MC or UC_ME or UC_ND or UC_NL or UC_NO or UC_LU or UC_LL or UC_LT or UC_LM or UC_LO or UC_PC or UC_PD or UC_PS or UC_PE or UC_PO or UC_SM or UC_SM or UC_SC or UC_SK or UC_SO, UC_PI or UC_PF); end; // Is the character printable? function UnicodeIsPrintable(C: UCS4): Boolean; begin Result := IsProperty(C, UC_MN or UC_MC or UC_ME or UC_ND or UC_NL or UC_NO or UC_LU or UC_LL or UC_LT or UC_LM or UC_LO or UC_PC or UC_PD or UC_PS or UC_PE or UC_PO or UC_SM or UC_SM or UC_SC or UC_SK or UC_SO or UC_ZS, UC_PI or UC_PF); end; // Is the character already upper case? function UnicodeIsUpper(C: UCS4): Boolean; begin Result := IsProperty(C, UC_LU, 0); end; // Is the character already lower case? function UnicodeIsLower(C: UCS4): Boolean; begin Result := IsProperty(C, UC_LL, 0); end; // Is the character already title case? function UnicodeIsTitle(C: UCS4): Boolean; begin Result := IsProperty(C, UC_LT, 0); end; // Is the character a hex digit? function UnicodeIsHexDigit(C: UCS4): Boolean; begin Result := IsProperty(C, 0, UC_HD); end; // Is the character a C0 control character (< 32)? function UnicodeIsIsoControl(C: UCS4): Boolean; begin Result := IsProperty(C, UC_CC, 0); end; // Is the character a format control character? function UnicodeIsFormatControl(C: UCS4): Boolean; begin Result := IsProperty(C, UC_CF, 0); end; // Is the character a symbol? function UnicodeIsSymbol(C: UCS4): Boolean; begin Result := IsProperty(C, UC_SM or UC_SC or UC_SO or UC_SK, 0); end; // Is the character a number or digit? function UnicodeIsNumber(C: UCS4): Boolean; begin Result := IsProperty(C, UC_ND or UC_NO or UC_NL, 0); end; // Is the character non-spacing? function UnicodeIsNonSpacing(C: UCS4): Boolean; begin Result := IsProperty(C, UC_MN, 0); end; // Is the character an open/left punctuation (i.e. '[')? function UnicodeIsOpenPunctuation(C: UCS4): Boolean; begin Result := IsProperty(C, UC_PS, 0); end; // Is the character an close/right punctuation (i.e. ']')? function UnicodeIsClosePunctuation(C: UCS4): Boolean; begin Result := IsProperty(C, UC_PE, 0); end; // Is the character an initial punctuation (i.e. U+2018 LEFT SINGLE QUOTATION MARK)? function UnicodeIsInitialPunctuation(C: UCS4): Boolean; begin Result := IsProperty(C, 0, UC_PI); end; // Is the character a final punctuation (i.e. U+2019 RIGHT SINGLE QUOTATION MARK)? function UnicodeIsFinalPunctuation(C: UCS4): Boolean; begin Result := IsProperty(C, 0, UC_PF); end; // Can the character be decomposed into a set of other characters? function UnicodeIsComposite(C: UCS4): Boolean; begin Result := IsProperty(C, 0, UC_CM); end; // Is the character one of the many quotation marks? function UnicodeIsQuotationMark(C: UCS4): Boolean; begin Result := IsProperty(C, 0, UC_QM); end; // Is the character one that has an opposite form (i.e. <>)? function UnicodeIsSymmetric(C: UCS4): Boolean; begin Result := IsProperty(C, 0, UC_SY); end; // Is the character mirroring (superset of symmetric)? function UnicodeIsMirroring(C: UCS4): Boolean; begin Result := IsProperty(C, 0, UC_MR); end; // Is the character non-breaking (i.e. non-breaking space)? function UnicodeIsNonBreaking(C: UCS4): Boolean; begin Result := IsProperty(C, 0, UC_NB); end; // Directionality functions // Does the character have strong right-to-left directionality (i.e. Arabic letters)? function UnicodeIsRTL(C: UCS4): Boolean; begin Result := IsProperty(C, UC_R, 0); end; // Does the character have strong left-to-right directionality (i.e. Latin letters)? function UnicodeIsLTR(C: UCS4): Boolean; begin Result := IsProperty(C, UC_L, 0); end; // Does the character have strong directionality? function UnicodeIsStrong(C: UCS4): Boolean; begin Result := IsProperty(C, UC_L or UC_R, 0); end; // Does the character have weak directionality (i.e. numbers)? function UnicodeIsWeak(C: UCS4): Boolean; begin Result := IsProperty(C, UC_EN or UC_ES, UC_ET or UC_AN or UC_CS); end; // Does the character have neutral directionality (i.e. whitespace)? function UnicodeIsNeutral(C: UCS4): Boolean; begin Result := IsProperty(C, 0, UC_B or UC_S or UC_WS or UC_ON); end; // Is the character a block or segment separator? function UnicodeIsSeparator(C: UCS4): Boolean; begin Result := IsProperty(C, 0, UC_B or UC_S); end; // Other functions inspired by John Cowan. // Is the character a mark of some kind? function UnicodeIsMark(C: UCS4): Boolean; begin Result := IsProperty(C, UC_MN or UC_MC or UC_ME, 0); end; // Is the character a modifier letter? function UnicodeIsModifier(C: UCS4): Boolean; begin Result := IsProperty(C, UC_LM, 0); end; // Is the character a number represented by a letter? function UnicodeIsLetterNumber(C: UCS4): Boolean; begin Result := IsProperty(C, UC_NL, 0); end; // Is the character connecting punctuation? function UnicodeIsConnectionPunctuation(C: UCS4): Boolean; begin Result := IsProperty(C, UC_PC, 0); end; // Is the character a dash punctuation? function UnicodeIsDash(C: UCS4): Boolean; begin Result := IsProperty(C, UC_PD, 0); end; // Is the character a math character? function UnicodeIsMath(C: UCS4): Boolean; begin Result := IsProperty(C, UC_SM, 0); end; // Is the character a currency character? function UnicodeIsCurrency(C: UCS4): Boolean; begin Result := IsProperty(C, UC_SC, 0); end; // Is the character a modifier symbol? function UnicodeIsModifierSymbol(C: UCS4): Boolean; begin Result := IsProperty(C, UC_SK, 0); end; // Is the character a non-spacing mark? function UnicodeIsNonSpacingMark(C: UCS4): Boolean; begin Result := IsProperty(C, UC_MN, 0); end; // Is the character a spacing mark? function UnicodeIsSpacingMark(C: UCS4): Boolean; begin Result := IsProperty(C, UC_MC, 0); end; // Is the character enclosing (i.e. enclosing box)? function UnicodeIsEnclosing(C: UCS4): Boolean; begin Result := IsProperty(C, UC_ME, 0); end; // Is the character from the Private Use Area? function UnicodeIsPrivate(C: UCS4): Boolean; begin Result := IsProperty(C, UC_CO, 0); end; // Is the character one of the surrogate codes? function UnicodeIsSurrogate(C: UCS4): Boolean; begin Result := IsProperty(C, UC_OS, 0); end; // Is the character a line separator? function UnicodeIsLineSeparator(C: UCS4): Boolean; begin Result := IsProperty(C, UC_ZL, 0); end; // Is th character a paragraph separator; function UnicodeIsParagraphSeparator(C: UCS4): Boolean; begin Result := IsProperty(C, UC_ZP, 0); end; // Can the character begin an identifier? function UnicodeIsIdenifierStart(C: UCS4): Boolean; begin Result := IsProperty(C, UC_LU or UC_LL or UC_LT or UC_LO or UC_NL, 0); end; // Can the character appear in an identifier? function UnicodeIsIdentifierPart(C: UCS4): Boolean; begin Result := IsProperty(C, UC_LU or UC_LL or UC_LT or UC_LO or UC_NL or UC_MN or UC_MC or UC_ND or UC_PC or UC_CF, 0); end; // Is the character defined (appears in one of the data files)? function UnicodeIsDefined(C: UCS4): Boolean; begin Result := IsProperty(C, 0, UC_CP); end; // Is the character not defined (non-Unicode)? function UnicodeIsUndefined(C: UCS4): Boolean; begin Result := not IsProperty(C, 0, UC_CP); end; // Other miscellaneous character property functions. // Is the character a Han ideograph? function UnicodeIsHan(C: UCS4): Boolean; begin Result := ((C >= $4E00) and (C <= $9FFF)) or ((C >= $F900) and (C <= $FAFF)); end; // Is the character a pre-composed Hangul syllable? function UnicodeIsHangul(C: UCS4): Boolean; begin Result := (C >= $AC00) and (C <= $D7FF); end; //---------------------------------------------------------------------------------------------------------------------- function CodePageFromLocale(Language: LCID): Cardinal; // determines the code page for a given locale var Buf: array[0..6] of Char; begin GetLocaleInfo(Language, LOCALE_IDefaultAnsiCodePage, Buf, 6); Result := StrToIntDef(Buf, GetACP); end; //---------------------------------------------------------------------------------------------------------------------- function KeyboardCodePage: Cardinal; begin Result := CodePageFromLocale(GetKeyboardLayout(0) and $FFFF); end; //---------------------------------------------------------------------------------------------------------------------- function KeyUnicode(C: Char): WideChar; // converts the given character (as it comes with a WM_CHAR message) into its corresponding // Unicode character depending on the active keyboard layout begin MultiByteToWideChar(KeyboardCodePage, MB_USEGLYPHCHARS, @C, 1, @Result, 1); end; //---------------------------------------------------------------------------------------------------------------------- function CodeBlockFromChar(const C: WideChar): Cardinal; // returns the Unicode code block to which C belongs begin case C of #$0000..#$007F: // Basic Latin Result := 0; #$0080..#$00FF: // Latin-1 Supplement Result := 1; #$0100..#$017F: // Latin Extended-A Result := 2; #$0180..#$024F: // Latin Extended-B Result := 3; #$0250..#$02AF: // IPA Extensions Result := 4; #$02B0..#$02FF: // Spacing Modifier Letters Result := 5; #$0300..#$036F: // Combining Diacritical Marks Result := 6; #$0370..#$03FF: // Greek Result := 7; #$0400..#$04FF: // Cyrillic Result := 8; #$0530..#$058F: // Armenian Result := 9; #$0590..#$05FF: // Hebrew Result := 10; #$0600..#$06FF: // Arabic Result := 11; #$0900..#$097F: // Devanagari Result := 12; #$0980..#$09FF: // Bengali Result := 13; #$0A00..#$0A7F: // Gurmukhi Result := 14; #$0A80..#$0AFF: // Gujarati Result := 15; #$0B00..#$0B7F: // Oriya Result := 16; #$0B80..#$0BFF: // Tamil Result := 17; #$0C00..#$0C7F: // Telugu Result := 18; #$0C80..#$0CFF: // Kannada Result := 19; #$0D00..#$0D7F: // Malayalam Result := 20; #$0E00..#$0E7F: // Thai Result := 21; #$0E80..#$0EFF: // Lao Result := 22; #$0F00..#$0FBF: // Tibetan Result := 23; #$10A0..#$10FF: // Georgian Result := 24; #$1100..#$11FF: // Hangul Jamo Result := 25; #$1E00..#$1EFF: // Latin Extended Additional Result := 26; #$1F00..#$1FFF: // Greek Extended Result := 27; #$2000..#$206F: // General Punctuation Result := 28; #$2070..#$209F: // Superscripts and Subscripts Result := 29; #$20A0..#$20CF: // Currency Symbols Result := 30; #$20D0..#$20FF: // Combining Marks for Symbols Result := 31; #$2100..#$214F: // Letterlike Symbols Result := 32; #$2150..#$218F: // Number Forms Result := 33; #$2190..#$21FF: // Arrows Result := 34; #$2200..#$22FF: // Mathematical Operators Result := 35; #$2300..#$23FF: // Miscellaneous Technical Result := 36; #$2400..#$243F: // Control Pictures Result := 37; #$2440..#$245F: // Optical Character Recognition Result := 38; #$2460..#$24FF: // Enclosed Alphanumerics Result := 39; #$2500..#$257F: // Box Drawing Result := 40; #$2580..#$259F: // Block Elements Result := 41; #$25A0..#$25FF: // Geometric Shapes Result := 42; #$2600..#$26FF: // Miscellaneous Symbols Result := 43; #$2700..#$27BF: // Dingbats Result := 44; #$3000..#$303F: // CJK Symbols and Punctuation Result := 45; #$3040..#$309F: // Hiragana Result := 46; #$30A0..#$30FF: // Katakana Result := 47; #$3100..#$312F: // Bopomofo Result := 48; #$3130..#$318F: // Hangul Compatibility Jamo Result := 49; #$3190..#$319F: // Kanbun Result := 50; #$3200..#$32FF: // Enclosed CJK Letters and Months Result := 51; #$3300..#$33FF: // CJK Compatibility Result := 52; #$4E00..#$9FFF: // CJK Unified Ideographs Result := 53; #$AC00..#$D7A3: // Hangul Syllables Result := 54; #$D800..#$DB7F: // High Surrogates Result := 55; #$DB80..#$DBFF: // High Private Use Surrogates Result := 56; #$DC00..#$DFFF: // Low Surrogates Result := 57; #$E000..#$F8FF: // Private Use Result := 58; #$F900..#$FAFF: // CJK Compatibility Ideographs Result := 59; #$FB00..#$FB4F: // Alphabetic Presentation Forms Result := 60; #$FB50..#$FDFF: // Arabic Presentation Forms-A Result := 61; #$FE20..#$FE2F: // Combining Half Marks Result := 62; #$FE30..#$FE4F: // CJK Compatibility Forms Result := 63; #$FE50..#$FE6F: // Small Form Variants Result := 64; #$FE70..#$FEFF: // Arabic Presentation Forms-B Result := 65; #$FF00..#$FFEF: // Halfwidth and Fullwidth Forms Result := 66; else // #$FFF0..#$FFFF Specials Result := 67; end; end; //---------------------------------------------------------------------------------------------------------------------- function CodePageToWideString(const A: AnsiString; CodePage: Cardinal; dwFlags: Cardinal = 0): WideString; var Len: Integer; begin // figure length Len := MultiByteToWideChar(CodePage, dwFlags, PChar(A), Length(A), nil, 0); SetLength(Result, Len); if Len > 0 then begin // convert string Len := MultiByteToWideChar(CodePage, dwFlags, PChar(A), Length(A), PWideChar(Result), Length(Result)); // check result if Len = 0 then begin {$IFDEF VER130} RaiseLastWin32Error; {$ELSE} RaiseLastOSError; {$ENDIF} end else SetLength(Result, Len); end; end; //---------------------------------------------------------------------------------------------------------------------- function CompareTextWin95(const W1, W2: WideString; Locale: LCID): Integer; // special comparation function for Win9x since there's no system defined comparation function, // returns -1 if W1 < W2, 0 if W1 = W2 or 1 if W1 > W2 var S1, S2: String; CP: Cardinal; L1, L2: Integer; begin L1 := Length(W1); L2 := Length(W2); SetLength(S1, L1); SetLength(S2, L2); CP := CodePageFromLocale(Locale); WideCharToMultiByte(CP, 0, PWideChar(W1), L1, PChar(S1), L1, nil, nil); WideCharToMultiByte(CP, 0, PWideChar(W2), L2, PChar(S2), L2, nil, nil); Result := CompareStringA(Locale, NORM_IGNORECASE, PChar(S1), Length(S1), PChar(S2), Length(S2)) - 2; end; //---------------------------------------------------------------------------------------------------------------------- function CompareTextWinNT(const W1, W2: WideString; Locale: LCID): Integer; // Wrapper function for WinNT since there's no system defined comparation function in Win9x and // we need a central comparation function for TWideStringList. // Returns -1 if W1 < W2, 0 if W1 = W2 or 1 if W1 > W2 begin Result := CompareStringW(Locale, NORM_IGNORECASE, PWideChar(W1), Length(W1), PWideChar(W2), Length(W2)) - 2; end; //----------------- Conversion routines -------------------------------------------------------------------------------- const halfShift: Integer = 10; halfBase: UCS4 = $0010000; halfMask: UCS4 = $3FF; offsetsFromUTF8: array[0..5] of UCS4 = ($00000000, $00003080, $000E2080, $03C82080, $FA082080, $82082080); bytesFromUTF8: array[0..255] of Byte = ( 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5); firstByteMark: array[0..6] of Byte = ($00, $00, $C0, $E0, $F0, $F8, $FC); //---------------------------------------------------------------------------------------------------------------------- function WideStringToUTF8(const S: WideString): AnsiString; var ch: UCS4; L, J, T, bytesToWrite: Word; byteMask: UCS4; byteMark: UCS4; begin if Length(S) = 0 then begin Result := ''; Exit; end; SetLength(Result, Length(S) * 6); // assume worst case T := 1; for J := 1 to Length(S) do begin byteMask := $BF; byteMark := $80; ch := UCS4(S[J]); if ch < $80 then bytesToWrite := 1 else if ch < $800 then bytesToWrite := 2 else if ch < $10000 then bytesToWrite := 3 else if ch < $200000 then bytesToWrite := 4 else if ch < $4000000 then bytesToWrite := 5 else if ch <= MaximumUCS4 then bytesToWrite := 6 else begin bytesToWrite := 2; ch := ReplacementCharacter; end; for L := bytesToWrite downto 2 do begin Result[T + L - 1] := Char((ch or byteMark) and byteMask); ch := ch shr 6; end; Result[T] := Char(ch or firstByteMark[bytesToWrite]); Inc(T, bytesToWrite); end; SetLength(Result, T - 1); // assume worst case end; //---------------------------------------------------------------------------------------------------------------------- function UTF8ToWideString(const S: AnsiString): WideString; var L, J, T: Cardinal; ch: UCS4; extraBytesToWrite: Word; begin if Length(S) = 0 then begin Result := ''; Exit; end; SetLength(Result, Length(S)); // create enough room L := 1; T := 1; while L <= Cardinal(Length(S)) do begin ch := 0; extraBytesToWrite := bytesFromUTF8[Ord(S[L])]; for J := extraBytesToWrite downto 1 do begin ch := ch + Ord(S[L]); Inc(L); ch := ch shl 6; end; ch := ch + Ord(S[L]); Inc(L); ch := ch - offsetsFromUTF8[extraBytesToWrite]; if ch <= MaximumUCS2 then begin Result[T] := WideChar(ch); Inc(T); end else if ch > MaximumUCS4 then begin Result[T] := WideChar(ReplacementCharacter); Inc(T); end else begin ch := ch - halfBase; Result[T] := WideChar((ch shr halfShift) + SurrogateHighStart); Inc(T); Result[T] := WideChar((ch and halfMask) + SurrogateLowStart); Inc(T); end; end; SetLength(Result, T - 1); // now fix up length end; //---------------------------------------------------------------------------------------------------------------------- initialization if (Win32Platform and VER_PLATFORM_WIN32_NT) <> 0 then @WideCompareText := @CompareTextWinNT else @WideCompareText := @CompareTextWin95; finalization LoadInProgress.Free; end.