Files

6933 lines
200 KiB
ObjectPascal
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

unit SynRegExpr;
{
TRegExpr class library
Delphi Regular Expressions
Copyright (c) 1999-2004 Andrey V. Sorokin, St.Petersburg, Russia
You can choose to use this Pascal unit in one of the two following licenses:
Option 1>
You may use this software in any kind of development,
including comercial, redistribute, and modify it freely,
under the following restrictions :
1. This software is provided as it is, without any kind of
warranty given. Use it at Your own risk.The author is not
responsible for any consequences of use of this software.
2. The origin of this software may not be mispresented, You
must not claim that You wrote the original software. If
You use this software in any kind of product, it would be
appreciated that there in a information box, or in the
documentation would be an acknowledgement like
Partial Copyright (c) 2004 Andrey V. Sorokin
https://sorokin.engineer/
andrey@sorokin.engineer
3. You may not have any income from distributing this source
(or altered version of it) to other developers. When You
use this product in a comercial package, the source may
not be charged seperatly.
4. Altered versions must be plainly marked as such, and must
not be misrepresented as being the original software.
5. RegExp Studio application and all the visual components as
well as documentation is not part of the TRegExpr library
and is not free for usage.
https://sorokin.engineer/
andrey@sorokin.engineer
Option 2>
The same modified LGPL with static linking exception as the Free Pascal RTL
}
{
program is essentially a linear encoding
of a nondeterministic finite-state machine (aka syntax charts or
"railroad normal form" in parsing technology). Each node is an opcode
plus a "next" pointer, possibly plus an operand. "Next" pointers of
all nodes except BRANCH implement concatenation; a "next" pointer with
a BRANCH on both ends of it connects two alternatives. (Here we
have one of the subtle syntax dependencies: an individual BRANCH (as
opposed to a collection of them) is never concatenated with anything
because of operator precedence.) The operand of some types of node is
a literal string; for others, it is a node leading into a sub-FSM. In
particular, the operand of a BRANCH node is the first node of the branch.
(NB this is *not* a tree structure: the tail of the branch connects
to the thing following the set of BRANCHes.)
}
interface
{ off $DEFINE DebugSynRegExpr }
// ======== Determine compiler
{$INCLUDE SynEdit.inc}
// ======== Define base compiler options
{$BOOLEVAL OFF}
{$EXTENDEDSYNTAX ON}
{$LONGSTRINGS ON}
{$OPTIMIZATION ON}
{$IFDEF FPC}
{$MODE DELPHI} // Delphi-compatible mode in FreePascal
{$INLINE ON}
{$ENDIF}
// ======== Define options for TRegExpr engine
{$DEFINE Unicode} // Use WideChar for characters and UnicodeString/WideString for strings
{ off $DEFINE UnicodeEx} // Support Unicode >0xFFFF, e.g. emoji, e.g. "." must find 2 WideChars of 1 emoji
{ off $DEFINE UseWordChars} // Use WordChars property, otherwise fixed list 'a'..'z','A'..'Z','0'..'9','_'
{ off $DEFINE UseSpaceChars} // Use SpaceChars property, otherwise fixed list
{ off $DEFINE UseLineSep} // Use LineSeparators property, otherwise fixed line-break chars
{$IFDEF FPC}
{$DEFINE FastUnicodeData} // Use arrays for UpperCase/LowerCase/IsWordChar, they take 320K more memory
{$ENDIF}
{$DEFINE UseFirstCharSet} // Enable optimization, which finds possible first chars of input string
{$DEFINE RegExpPCodeDump} // Enable method Dump() to show opcode as string
{$IFNDEF FPC} // Not supported in FreePascal
{$DEFINE reRealExceptionAddr} // Exceptions will point to appropriate source line, not to Error procedure
{$ENDIF}
{$DEFINE ComplexBraces} // Support braces in complex cases
{$IFNDEF Unicode}
{$UNDEF UnicodeEx}
{$UNDEF FastUnicodeData}
{$ENDIF}
{.$DEFINE Compat} // Enable compatability methods/properties for forked version in Free Pascal 3.0
// ======== Define Pascal-language options
// Define 'UseAsserts' option (do not edit this definitions).
// Asserts used to catch 'strange bugs' in TRegExpr implementation (when something goes
// completely wrong). You can swith asserts on/off with help of {$C+}/{$C-} compiler options.
{$IFDEF SYN_COMPILER_3_UP} {$DEFINE UseAsserts} {$ENDIF}
{$IFDEF FPC} {$DEFINE UseAsserts} {$ENDIF}
// Define 'use subroutine parameters default values' option (do not edit this definition).
{$IFDEF SYN_COMPILER_4_UP} {$DEFINE DefParam} {$ENDIF}
{$IFDEF FPC} {$DEFINE DefParam} {$ENDIF}
// Define 'OverMeth' options, to use method overloading (do not edit this definitions).
{$IFDEF SYN_COMPILER_5_UP} {$DEFINE OverMeth} {$ENDIF}
{$IFDEF FPC} {$DEFINE OverMeth} {$ENDIF}
// Define 'InlineFuncs' options, to use inline keyword (do not edit this definitions).
{$IFDEF SYN_COMPILER_8_UP} {$DEFINE InlineFuncs} {$ENDIF}
{$IFDEF FPC} {$DEFINE InlineFuncs} {$ENDIF}
uses
Classes, // TStrings in Split method
SysUtils, // Exception
{$IFDEF D2009}
{$IFDEF D_XE}
System.Character,
{$ELSE}
Character,
{$ENDIF}
{$ENDIF}
Math;
type
{$IFNDEF FPC}
// Delphi doesn't have PtrInt but has NativeInt
PtrInt = NativeInt;
PtrUInt = NativeInt;
{$ENDIF}
{$IFDEF UniCode}
PRegExprChar = PWideChar;
{$IFDEF FPC}
RegExprString = UnicodeString;
{$ELSE}
{$IFDEF SYN_DELPHI_2009_UP}
RegExprString = UnicodeString;
{$ELSE}
RegExprString = WideString;
{$ENDIF}
{$ENDIF}
REChar = WideChar;
{$ELSE}
PRegExprChar = PChar;
RegExprString = AnsiString;
REChar = Char;
{$ENDIF}
TREOp = REChar; // internal opcode type
PREOp = ^TREOp;
type
TRegExprCharset = set of byte;
const
// Escape char ('\' in common r.e.) used for escaping metachars (\w, \d etc)
EscChar = '\';
// Substitute method: prefix of group reference: $1 .. $9 and $<name>
SubstituteGroupChar = '$';
RegExprModifierI: boolean = False; // default value for ModifierI
RegExprModifierR: boolean = True; // default value for ModifierR
RegExprModifierS: boolean = True; // default value for ModifierS
RegExprModifierG: boolean = True; // default value for ModifierG
RegExprModifierM: boolean = False; // default value for ModifierM
RegExprModifierX: boolean = False; // default value for ModifierX
{$IFDEF UseSpaceChars}
// default value for SpaceChars
RegExprSpaceChars: RegExprString = ' '#$9#$A#$D#$C;
{$ENDIF}
{$IFDEF UseWordChars}
// default value for WordChars
RegExprWordChars: RegExprString = '0123456789'
+ 'abcdefghijklmnopqrstuvwxyz'
+ 'ABCDEFGHIJKLMNOPQRSTUVWXYZ_';
{$ENDIF}
{$IFDEF UseLineSep}
// default value for LineSeparators
RegExprLineSeparators: RegExprString = #$d#$a#$b#$c
{$IFDEF UniCode}
+ #$2028#$2029#$85
{$ENDIF};
{$ENDIF}
// Tab and Unicode category "Space Separator":
// https://www.compart.com/en/unicode/category/Zs
RegExprHorzSeparators: RegExprString = #9#$20#$A0
{$IFDEF UniCode}
+ #$1680#$2000#$2001#$2002#$2003#$2004#$2005#$2006#$2007#$2008#$2009#$200A#$202F#$205F#$3000
{$ENDIF};
RegExprUsePairedBreak: boolean = True;
RegExprReplaceLineBreak: RegExprString = sLineBreak;
RegExprLookaheadIsAtomic: boolean = False;
RegExprLookbehindIsAtomic: boolean = True;
const
RegexMaxGroups = 90;
// Max number of groups.
// Be carefull - don't use values which overflow OP_CLOSE* opcode
// (in this case you'll get compiler error).
// Big value causes slower work and more stack required.
RegexMaxMaxGroups = 255;
// Max possible value for RegexMaxGroups.
// Don't change it! It's defined by internal TRegExpr design.
{$IFDEF ComplexBraces}
const
LoopStackMax = 10; // max depth of loops stack //###0.925
type
TRegExprLoopStack = array [1 .. LoopStackMax] of integer;
{$ENDIF}
type
TRegExprModifiers = record
I: boolean;
// Case-insensitive.
R: boolean;
// Extended syntax for Russian ranges in [].
// If True, then а-я additionally includes letter 'ё',
// А-Я additionally includes 'Ё', and а-Я includes all Russian letters.
// Turn it off if it interferes with your national alphabet.
S: boolean;
// Dot '.' matches any char, otherwise only [^\n].
G: boolean;
// Greedy. Switching it off switches all operators to non-greedy style,
// so if G=False, then '*' works like '*?', '+' works like '+?' and so on.
M: boolean;
// Treat string as multiple lines. It changes `^' and `$' from
// matching at only the very start/end of the string to the start/end
// of any line anywhere within the string.
X: boolean;
// Allow comments in regex using # char.
end;
function IsModifiersEqual(const A, B: TRegExprModifiers): boolean;
type
TRegExpr = class;
TRegExprReplaceFunction = function(ARegExpr: TRegExpr): RegExprString of object;
TRegExprCharChecker = function(ch: REChar): boolean of object;
TRegExprCharCheckerArray = array[0 .. 30] of TRegExprCharChecker;
TRegExprCharCheckerInfo = record
CharBegin, CharEnd: REChar;
CheckerIndex: integer;
end;
TRegExprCharCheckerInfos = array of TRegExprCharCheckerInfo;
{$IFDEF Compat}
TRegExprInvertCaseFunction = function(const Ch: REChar): REChar of object;
{$ENDIF}
{ TRegExpr }
TRegExpr = class
private
GrpStart: array [0 .. RegexMaxGroups - 1] of PRegExprChar; // pointer to group start in InputString
GrpEnd: array [0 .. RegexMaxGroups - 1] of PRegExprChar; // pointer to group end in InputString
GrpIndexes: array [0 .. RegexMaxGroups - 1] of integer; // map global group index to _capturing_ group index
GrpNames: array [0 .. RegexMaxGroups - 1] of RegExprString; // names of groups, if non-empty
GrpAtomic: array [0 .. RegexMaxGroups - 1] of boolean; // group[i] is atomic (filled in Compile)
GrpAtomicDone: array [0 .. RegexMaxGroups - 1] of boolean; // atomic group[i] is "done" (used in Exec* only)
GrpOpCodes: array [0 .. RegexMaxGroups - 1] of PRegExprChar; // pointer to opcode of group[i] (used by OP_SUBCALL*)
GrpSubCalled: array [0 .. RegexMaxGroups - 1] of boolean; // group[i] is called by OP_SUBCALL*
GrpCount: integer;
{$IFDEF ComplexBraces}
LoopStack: TRegExprLoopStack; // state before entering loop
LoopStackIdx: integer; // 0 - out of all loops
{$ENDIF}
// The "internal use only" fields to pass info from compile
// to execute that permits the execute phase to run lots faster on
// simple cases.
regAnchored: REChar; // is the match anchored (at beginning-of-line only)?
// regAnchored permits very fast decisions on suitable starting points
// for a match, cutting down the work a lot. regMust permits fast rejection
// of lines that cannot possibly match. The regMust tests are costly enough
// that regcomp() supplies a regMust only if the r.e. contains something
// potentially expensive (at present, the only such thing detected is * or +
// at the start of the r.e., which can involve a lot of backup). regMustLen is
// supplied because the test in regexec() needs it and regcomp() is computing
// it anyway.
regMust: PRegExprChar; // string (pointer into program) that match must include, or nil
regMustLen: integer; // length of regMust string
regMustString: RegExprString; // string which must occur in match (got from regMust/regMustLen)
regLookahead: boolean; // regex has _some_ lookahead
regLookaheadNeg: boolean; // regex has _nagative_ lookahead
regLookaheadGroup: integer; // index of group for lookahead
regLookbehind: boolean; // regex has positive lookbehind
regNestedCalls: integer; // some attempt to prevent 'catastrophic backtracking' but not used
{$IFDEF UseFirstCharSet}
FirstCharSet: TRegExprCharset;
FirstCharArray: array[byte] of boolean;
{$ENDIF}
// work variables for Exec routines - save stack in recursion
regInput: PRegExprChar; // pointer to currently handling char of input string
fInputStart: PRegExprChar; // pointer to first char of input string
fInputEnd: PRegExprChar; // pointer after last char of input string
fRegexStart: PRegExprChar; // pointer to first char of regex
fRegexEnd: PRegExprChar; // pointer after last char of regex
regCurrentGrp: integer; // index of group handling by OP_OPEN* opcode
// work variables for compiler's routines
regParse: PRegExprChar; // pointer to currently handling char of regex
regNumBrackets: integer; // count of () brackets
regDummy: REChar; // dummy pointer, used to detect 1st/2nd pass of Compile
// if p=@regDummy, it is pass-1: opcode memory is not yet allocated
programm: PRegExprChar; // pointer to opcode, =nil in pass-1
regCode: PRegExprChar; // pointer to last emitted opcode; changing in pass-2, but =@regDummy in pass-1
regCodeSize: integer; // total opcode size in REChars
regCodeWork: PRegExprChar; // pointer to opcode, to first code after MAGIC
regExactlyLen: PLongInt; // pointer to length of substring of OP_EXACTLY* inside opcode
fSecondPass: boolean; // true inside pass-2 of Compile
fExpression: RegExprString; // regex string
fInputString: RegExprString; // input string
fLastError: integer; // Error call sets code of LastError
fLastErrorOpcode: TREOp;
fLastErrorSymbol: REChar;
fModifiers: TRegExprModifiers; // regex modifiers
fCompModifiers: TRegExprModifiers; // compiler's copy of modifiers
fProgModifiers: TRegExprModifiers; // modifiers values from last programm compilation
{$IFDEF UseSpaceChars}
fSpaceChars: RegExprString;
{$ENDIF}
{$IFDEF UseWordChars}
fWordChars: RegExprString;
{$ENDIF}
{$IFDEF UseLineSep}
fLineSeparators: RegExprString;
{$ENDIF}
fUsePairedBreak: boolean;
fReplaceLineEnd: RegExprString; // string to use for "\n" in Substitute method
fSlowChecksSizeMax: integer;
// Exec() param ASlowChecks is set to True, when Length(InputString)<SlowChecksSizeMax
// This ASlowChecks enables to use regMustString optimization
{$IFNDEF UniCode}
fLineSepArray: array[byte] of boolean;
{$ENDIF}
CharCheckers: TRegExprCharCheckerArray;
CharCheckerInfos: TRegExprCharCheckerInfos;
CheckerIndex_Word: byte;
CheckerIndex_NotWord: byte;
CheckerIndex_Digit: byte;
CheckerIndex_NotDigit: byte;
CheckerIndex_Space: byte;
CheckerIndex_NotSpace: byte;
CheckerIndex_HorzSep: byte;
CheckerIndex_NotHorzSep: byte;
CheckerIndex_VertSep: byte;
CheckerIndex_NotVertSep: byte;
CheckerIndex_LowerAZ: byte;
CheckerIndex_UpperAZ: byte;
fHelper: TRegExpr;
fHelperLen: integer;
{$IFDEF Compat}
fUseUnicodeWordDetection: boolean;
fInvertCase: TRegExprInvertCaseFunction;
fEmptyInputRaisesError: boolean;
fUseOsLineEndOnReplace: boolean;
function OldInvertCase(const Ch: REChar): REChar;
function GetLinePairedSeparator: RegExprString;
procedure SetLinePairedSeparator(const AValue: RegExprString);
procedure SetUseOsLineEndOnReplace(AValue: boolean);
{$ENDIF}
procedure InitCharCheckers;
function CharChecker_Word(ch: REChar): boolean;
function CharChecker_NotWord(ch: REChar): boolean;
function CharChecker_Space(ch: REChar): boolean;
function CharChecker_NotSpace(ch: REChar): boolean;
function CharChecker_Digit(ch: REChar): boolean;
function CharChecker_NotDigit(ch: REChar): boolean;
function CharChecker_HorzSep(ch: REChar): boolean;
function CharChecker_NotHorzSep(ch: REChar): boolean;
function CharChecker_VertSep(ch: REChar): boolean;
function CharChecker_NotVertSep(ch: REChar): boolean;
function CharChecker_LowerAZ(ch: REChar): boolean;
function CharChecker_UpperAZ(ch: REChar): boolean;
function DumpCheckerIndex(N: byte): RegExprString;
function DumpCategoryChars(ch, ch2: REChar; Positive: boolean): RegExprString;
procedure ClearMatches; {$IFDEF InlineFuncs}inline;{$ENDIF}
procedure ClearInternalIndexes; {$IFDEF InlineFuncs}inline;{$ENDIF}
function FindInCharClass(ABuffer: PRegExprChar; AChar: REChar; AIgnoreCase: boolean): boolean;
procedure GetCharSetFromCharClass(ABuffer: PRegExprChar; AIgnoreCase: boolean; var ARes: TRegExprCharset);
procedure GetCharSetFromSpaceChars(var ARes: TRegExprCharset); {$IFDEF InlineFuncs}inline;{$ENDIF}
procedure GetCharSetFromWordChars(var ARes: TRegExprCharSet); {$IFDEF InlineFuncs}inline;{$ENDIF}
function IsWordChar(AChar: REChar): boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
function IsSpaceChar(AChar: REChar): boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
function IsCustomLineSeparator(AChar: REChar): boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
{$IFDEF UseLineSep}
procedure InitLineSepArray;
{$ENDIF}
procedure FindGroupName(APtr, AEndPtr: PRegExprChar; AEndChar: REChar; var AName: RegExprString);
// Mark programm as having to be [re]compiled
procedure InvalidateProgramm;
// Check if we can use compiled regex, compile it if something changed
function IsProgrammOk: boolean;
procedure SetExpression(const AStr: RegExprString);
function GetModifierStr: RegExprString;
procedure SetModifierStr(const AStr: RegExprString);
function GetModifierG: boolean;
function GetModifierI: boolean;
function GetModifierM: boolean;
function GetModifierR: boolean;
function GetModifierS: boolean;
function GetModifierX: boolean;
procedure SetModifierG(AValue: boolean);
procedure SetModifierI(AValue: boolean);
procedure SetModifierM(AValue: boolean);
procedure SetModifierR(AValue: boolean);
procedure SetModifierS(AValue: boolean);
procedure SetModifierX(AValue: boolean);
// Default handler raises exception ERegExpr with
// Message = ErrorMsg (AErrorID), ErrorCode = AErrorID
// and CompilerErrorPos = value of property CompilerErrorPos.
procedure Error(AErrorID: integer); virtual; // error handler.
{ ==================== Compiler section =================== }
// compile a regular expression into internal code
function CompileRegExpr(ARegExp: PRegExprChar): boolean;
// set the next-pointer at the end of a node chain
procedure Tail(p: PRegExprChar; val: PRegExprChar);
// regoptail - regtail on operand of first argument; nop if operandless
procedure OpTail(p: PRegExprChar; val: PRegExprChar);
// regnode - emit a node, return location
function EmitNode(op: TREOp): PRegExprChar;
// emit (if appropriate) a byte of code
procedure EmitC(ch: REChar); {$IFDEF InlineFuncs}inline;{$ENDIF}
// emit LongInt value
procedure EmitInt(AValue: LongInt); {$IFDEF InlineFuncs}inline;{$ENDIF}
// emit back-reference to group
function EmitGroupRef(AIndex: integer; AIgnoreCase: boolean): PRegExprChar;
{$IFDEF FastUnicodeData}
procedure FindCategoryName(var scan: PRegExprChar; var ch1, ch2: REChar);
function EmitCategoryMain(APositive: boolean): PRegExprChar;
{$ENDIF}
// insert an operator in front of already-emitted operand
// Means relocating the operand.
procedure InsertOperator(op: TREOp; opnd: PRegExprChar; sz: integer);
// ###0.90
// regular expression, i.e. main body or parenthesized thing
function ParseReg(InBrackets: boolean; var FlagParse: integer): PRegExprChar;
// one alternative of an | operator
function ParseBranch(var FlagParse: integer): PRegExprChar;
// something followed by possible [*+?]
function ParsePiece(var FlagParse: integer): PRegExprChar;
function HexDig(Ch: REChar): integer;
function UnQuoteChar(var APtr: PRegExprChar): REChar;
// the lowest level
function ParseAtom(var FlagParse: integer): PRegExprChar;
// current pos in r.e. - for error hanling
function GetCompilerErrorPos: PtrInt;
{$IFDEF UseFirstCharSet} // ###0.929
procedure FillFirstCharSet(prog: PRegExprChar);
{$ENDIF}
{ ===================== Matching section =================== }
// repeatedly match something simple, report how many
function FindRepeated(p: PRegExprChar; AMax: integer): integer;
// dig the "next" pointer out of a node
function regNext(p: PRegExprChar): PRegExprChar;
// recursively matching routine
function MatchPrim(prog: PRegExprChar): boolean;
// match at specific position only, called from ExecPrim
function MatchAtOnePos(APos: PRegExprChar): boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
// Exec for stored InputString
function ExecPrim(AOffset: integer; ATryOnce, ASlowChecks, ABackward: boolean): boolean;
function GetSubExprCount: integer;
function GetMatchPos(Idx: integer): PtrInt;
function GetMatchLen(Idx: integer): PtrInt;
function GetMatch(Idx: integer): RegExprString;
procedure SetInputString(const AInputString: RegExprString);
procedure SetInputRange(AStart, AEnd: PRegExprChar);
{$IFDEF UseLineSep}
procedure SetLineSeparators(const AStr: RegExprString);
{$ENDIF}
procedure SetUsePairedBreak(AValue: boolean);
public
constructor Create; {$IFDEF OverMeth} overload;
constructor Create(const AExpression: RegExprString); overload;
{$ENDIF}
destructor Destroy; override;
class function VersionMajor: integer;
class function VersionMinor: integer;
// match a programm against a string AInputString
// !!! Exec store AInputString into InputString property
// For Delphi 5 and higher available overloaded versions - first without
// parameter (uses already assigned to InputString property value)
// and second that has int parameter and is same as ExecPos
function Exec(const AInputString: RegExprString): boolean;
{$IFDEF OverMeth} overload;
function Exec: boolean; overload;
function Exec(AOffset: integer): boolean; overload;
{$ENDIF}
// find next match:
// ExecNext;
// works the same as
// if MatchLen [0] = 0 then ExecPos (MatchPos [0] + 1)
// else ExecPos (MatchPos [0] + MatchLen [0]);
// but it's more simpler !
// Raises exception if used without preceeding SUCCESSFUL call to
// Exec* (Exec, ExecPos, ExecNext). So You always must use something like
// if Exec (InputString) then repeat { proceed results} until not ExecNext;
function ExecNext(ABackward: boolean {$IFDEF DefParam} = False{$ENDIF}): boolean;
// find match for InputString starting from AOffset position
// (AOffset=1 - first char of InputString)
function ExecPos(AOffset: integer {$IFDEF DefParam} = 1{$ENDIF}): boolean;
{$IFDEF OverMeth} overload;
function ExecPos(AOffset: integer; ATryOnce, ABackward: boolean): boolean; overload;
{$ENDIF}
// Returns ATemplate with '$&' or '$0' replaced by whole r.e.
// occurence and '$1'...'$nn' replaced by subexpression with given index.
// Symbol '$' is used instead of '\' (for future extensions
// and for more Perl-compatibility) and accepts more than one digit.
// If you want to place into template raw '$' or '\', use prefix '\'.
// Example: '1\$ is $2\\rub\\' -> '1$ is <Match[2]>\rub\'
// If you want to place any number after '$' you must enclose it
// with curly braces: '${12}'.
// Example: 'a$12bc' -> 'a<Match[12]>bc'
// 'a${1}2bc' -> 'a<Match[1]>2bc'.
function Substitute(const ATemplate: RegExprString): RegExprString;
// Splits AInputStr to list by positions of all r.e. occurencies.
// Internally calls Exec, ExecNext.
procedure Split(const AInputStr: RegExprString; APieces: TStrings);
function Replace(const AInputStr: RegExprString;
const AReplaceStr: RegExprString;
AUseSubstitution: boolean{$IFDEF DefParam} = False{$ENDIF}) // ###0.946
: RegExprString; {$IFDEF OverMeth} overload;
function Replace(const AInputStr: RegExprString;
AReplaceFunc: TRegExprReplaceFunction): RegExprString; overload;
{$ENDIF}
// Returns AInputStr with r.e. occurencies replaced by AReplaceStr.
// If AUseSubstitution is true, then AReplaceStr will be used
// as template for Substitution methods.
// For example:
// Expression := '({-i}block|var)\s*\(\s*([^ ]*)\s*\)\s*';
// Replace ('BLOCK( test1)', 'def "$1" value "$2"', True);
// will return: def 'BLOCK' value 'test1'
// Replace ('BLOCK( test1)', 'def "$1" value "$2"')
// will return: def "$1" value "$2"
// Internally calls Exec, ExecNext.
// Overloaded version and ReplaceEx operate with callback function,
// so you can implement really complex functionality.
function ReplaceEx(const AInputStr: RegExprString;
AReplaceFunc: TRegExprReplaceFunction): RegExprString;
{$IFDEF Compat}
function ExecPos(AOffset: integer; ATryOnce: boolean): boolean; overload; deprecated 'Use modern form of ExecPos()';
class function InvertCaseFunction(const Ch: REChar): REChar; deprecated 'This has no effect now';
property InvertCase: TRegExprInvertCaseFunction read fInvertCase write fInvertCase; deprecated 'This has no effect now';
property UseUnicodeWordDetection: boolean read fUseUnicodeWordDetection write fUseUnicodeWordDetection; deprecated 'This has no effect, use {$DEFINE Unicode} instead';
property LinePairedSeparator: RegExprString read GetLinePairedSeparator write SetLinePairedSeparator; deprecated 'This has no effect now';
property EmptyInputRaisesError: boolean read fEmptyInputRaisesError write fEmptyInputRaisesError; deprecated 'This has no effect now';
property UseOsLineEndOnReplace: boolean read fUseOsLineEndOnReplace write SetUseOsLineEndOnReplace; deprecated 'Use property ReplaceLineEnd instead';
{$ENDIF}
// Returns ID of last error, 0 if no errors (unusable if
// Error method raises exception) and clear internal status
// into 0 (no errors).
function LastError: integer;
// Returns Error message for error with ID = AErrorID.
function ErrorMsg(AErrorID: integer): RegExprString; virtual;
// Re-compile regex
procedure Compile;
{$IFDEF RegExpPCodeDump}
// Show compiled regex in textual form
function Dump: RegExprString;
// Show single opcode in textual form
function DumpOp(op: TREOp): RegExprString;
{$ENDIF}
function IsCompiled: boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
// Opcode contains only operations for fixed match length: EXACTLY*, ANY*, etc
function IsFixedLength(var op: TREOp; var ALen: integer): boolean;
// Regular expression.
// For optimization, TRegExpr will automatically compiles it into 'P-code'
// (You can see it with help of Dump method) and stores in internal
// structures. Real [re]compilation occures only when it really needed -
// while calling Exec, ExecNext, Substitute, Dump, etc
// and only if Expression or other P-code affected properties was changed
// after last [re]compilation.
// If any errors while [re]compilation occures, Error method is called
// (by default Error raises exception - see below)
property Expression: RegExprString read fExpression write SetExpression;
// Set/get default values of r.e.syntax modifiers. Modifiers in
// r.e. (?ismx-ismx) will replace this default values.
// If you try to set unsupported modifier, Error will be called
// (by defaul Error raises exception ERegExpr).
property ModifierStr: RegExprString read GetModifierStr write SetModifierStr;
property ModifierI: boolean read GetModifierI write SetModifierI;
property ModifierR: boolean read GetModifierR write SetModifierR;
property ModifierS: boolean read GetModifierS write SetModifierS;
property ModifierG: boolean read GetModifierG write SetModifierG;
property ModifierM: boolean read GetModifierM write SetModifierM;
property ModifierX: boolean read GetModifierX write SetModifierX;
// returns current input string (from last Exec call or last assign
// to this property).
// Any assignment to this property clear Match* properties !
property InputString: RegExprString read fInputString write SetInputString;
// Number of subexpressions has been found in last Exec* call.
// If there are no subexpr. but whole expr was found (Exec* returned True),
// then SubExprMatchCount=0, if no subexpressions nor whole
// r.e. found (Exec* returned false) then SubExprMatchCount=-1.
// Note, that some subexpr. may be not found and for such
// subexpr. MathPos=MatchLen=-1 and Match=''.
// For example: Expression := '(1)?2(3)?';
// Exec ('123'): SubExprMatchCount=2, Match[0]='123', [1]='1', [2]='3'
// Exec ('12'): SubExprMatchCount=1, Match[0]='12', [1]='1'
// Exec ('23'): SubExprMatchCount=2, Match[0]='23', [1]='', [2]='3'
// Exec ('2'): SubExprMatchCount=0, Match[0]='2'
// Exec ('7') - return False: SubExprMatchCount=-1
property SubExprMatchCount: integer read GetSubExprCount;
// pos of entrance subexpr. #Idx into tested in last Exec*
// string. First subexpr. has Idx=1, last - MatchCount,
// whole r.e. has Idx=0.
// Returns -1 if in r.e. no such subexpr. or this subexpr.
// not found in input string.
property MatchPos[Idx: integer]: PtrInt read GetMatchPos;
// len of entrance subexpr. #Idx r.e. into tested in last Exec*
// string. First subexpr. has Idx=1, last - MatchCount,
// whole r.e. has Idx=0.
// Returns -1 if in r.e. no such subexpr. or this subexpr.
// not found in input string.
// Remember - MatchLen may be 0 (if r.e. match empty string) !
property MatchLen[Idx: integer]: PtrInt read GetMatchLen;
// == copy (InputString, MatchPos [Idx], MatchLen [Idx])
// Returns '' if in r.e. no such subexpr. or this subexpr.
// not found in input string.
property Match[Idx: integer]: RegExprString read GetMatch;
// get index of group (subexpression) by name, to support named groups
// like in Python: (?P<name>regex)
function MatchIndexFromName(const AName: RegExprString): integer;
function MatchFromName(const AName: RegExprString): RegExprString;
// Returns position in r.e. where compiler stopped.
// Useful for error diagnostics
property CompilerErrorPos: PtrInt read GetCompilerErrorPos;
{$IFDEF UseSpaceChars}
// Contains chars, treated as /s (initially filled with RegExprSpaceChars
// global constant)
property SpaceChars: RegExprString read fSpaceChars write fSpaceChars;
// ###0.927
{$ENDIF}
{$IFDEF UseWordChars}
// Contains chars, treated as /w (initially filled with RegExprWordChars
// global constant)
property WordChars: RegExprString read fWordChars write fWordChars;
// ###0.929
{$ENDIF}
{$IFDEF UseLineSep}
// line separators (like \n in Unix)
property LineSeparators: RegExprString read fLineSeparators write SetLineSeparators; // ###0.941
{$ENDIF}
// support paired line-break CR LF
property UseLinePairedBreak: boolean read fUsePairedBreak write SetUsePairedBreak;
property ReplaceLineEnd: RegExprString read fReplaceLineEnd write fReplaceLineEnd;
property SlowChecksSizeMax: integer read fSlowChecksSizeMax write fSlowChecksSizeMax;
end;
type
ERegExpr = class(Exception)
public
ErrorCode: integer;
CompilerErrorPos: PtrInt;
end;
// true if string AInputString match regular expression ARegExpr
// ! will raise exeption if syntax errors in ARegExpr
function ExecRegExpr(const ARegExpr, AInputStr: RegExprString): boolean;
// Split AInputStr into APieces by r.e. ARegExpr occurencies
procedure SplitRegExpr(const ARegExpr, AInputStr: RegExprString;
APieces: TStrings);
// Returns AInputStr with r.e. occurencies replaced by AReplaceStr
// If AUseSubstitution is true, then AReplaceStr will be used
// as template for Substitution methods.
// For example:
// ReplaceRegExpr ('({-i}block|var)\s*\(\s*([^ ]*)\s*\)\s*',
// 'BLOCK( test1)', 'def "$1" value "$2"', True)
// will return: def 'BLOCK' value 'test1'
// ReplaceRegExpr ('({-i}block|var)\s*\(\s*([^ ]*)\s*\)\s*',
// 'BLOCK( test1)', 'def "$1" value "$2"')
// will return: def "$1" value "$2"
function ReplaceRegExpr(const ARegExpr, AInputStr, AReplaceStr: RegExprString;
AUseSubstitution: boolean{$IFDEF DefParam} = False{$ENDIF}): RegExprString;
{$IFDEF OverMeth}overload; // ###0.947
// Alternate form allowing to set more parameters.
type
TRegexReplaceOption = (
rroModifierI,
rroModifierR,
rroModifierS,
rroModifierG,
rroModifierM,
rroModifierX,
rroUseSubstitution,
rroUseOsLineEnd
);
TRegexReplaceOptions = set of TRegexReplaceOption;
function ReplaceRegExpr(const ARegExpr, AInputStr, AReplaceStr: RegExprString;
Options: TRegexReplaceOptions): RegExprString; overload;
{$ENDIF}
// Replace all metachars with its safe representation,
// for example 'abc$cd.(' converts into 'abc\$cd\.\('
// This function useful for r.e. autogeneration from
// user input
function QuoteRegExprMetaChars(const AStr: RegExprString): RegExprString;
// Makes list of subexpressions found in ARegExpr r.e.
// In ASubExps every item represent subexpression,
// from first to last, in format:
// String - subexpression text (without '()')
// low word of Object - starting position in ARegExpr, including '('
// if exists! (first position is 1)
// high word of Object - length, including starting '(' and ending ')'
// if exist!
// AExtendedSyntax - must be True if modifier /m will be On while
// using the r.e.
// Useful for GUI editors of r.e. etc (You can find example of using
// in TestRExp.dpr project)
// Returns
// 0 Success. No unbalanced brackets was found;
// -1 There are not enough closing brackets ')';
// -(n+1) At position n was found opening '[' without //###0.942
// corresponding closing ']';
// n At position n was found closing bracket ')' without
// corresponding opening '('.
// If Result <> 0, then ASubExpr can contain empty items or illegal ones
function RegExprSubExpressions(const ARegExpr: string; ASubExprs: TStrings;
AExtendedSyntax: boolean{$IFDEF DefParam} = False{$ENDIF}): integer;
implementation
{$IFDEF FastUnicodeData}
uses
regexpr_unicodedata;
{$ENDIF}
const
// TRegExpr.VersionMajor/Minor return values of these constants:
REVersionMajor = 1;
REVersionMinor = 153;
OpKind_End = REChar(1);
OpKind_MetaClass = REChar(2);
OpKind_Range = REChar(3);
OpKind_Char = REChar(4);
OpKind_CategoryYes = REChar(5);
OpKind_CategoryNo = REChar(6);
RegExprAllSet = [0 .. 255];
RegExprWordSet = [Ord('a') .. Ord('z'), Ord('A') .. Ord('Z'), Ord('0') .. Ord('9'), Ord('_')];
RegExprDigitSet = [Ord('0') .. Ord('9')];
RegExprLowerAzSet = [Ord('a') .. Ord('z')];
RegExprUpperAzSet = [Ord('A') .. Ord('Z')];
RegExprAllAzSet = RegExprLowerAzSet + RegExprUpperAzSet;
RegExprSpaceSet = [Ord(' '), $9, $A, $D, $C];
RegExprLineSeparatorsSet = [$d, $a, $b, $c] {$IFDEF UniCode} + [$85] {$ENDIF};
RegExprHorzSeparatorsSet = [9, $20, $A0];
MaxBracesArg = $7FFFFFFF - 1; // max value for {n,m} arguments //###0.933
type
TRENextOff = PtrInt;
// internal Next "pointer" (offset to current p-code) //###0.933
PRENextOff = ^TRENextOff;
// used for extracting Next "pointers" from compiled r.e. //###0.933
TREBracesArg = integer; // type of {m,n} arguments
PREBracesArg = ^TREBracesArg;
TREGroupKind = (
gkNormalGroup,
gkNonCapturingGroup,
gkNamedGroupReference,
gkComment,
gkModifierString,
gkLookahead,
gkLookaheadNeg,
gkLookbehind,
gkLookbehindNeg,
gkRecursion,
gkSubCall
);
// Alexey T.: handling of that define FPC_REQUIRES_PROPER_ALIGNMENT was present even 15 years ago,
// but with it, we have failing of some RegEx tests, on ARM64 CPU.
// If I undefine FPC_REQUIRES_PROPER_ALIGNMENT, all tests run OK on ARM64 again.
{$undef FPC_REQUIRES_PROPER_ALIGNMENT}
const
REOpSz = SizeOf(TREOp) div SizeOf(REChar);
// size of OP_ command in REChars
{$IFDEF FPC_REQUIRES_PROPER_ALIGNMENT}
// add space for aligning pointer
// -1 is the correct max size but also needed for InsertOperator that needs a multiple of pointer size
RENextOffSz = (2 * SizeOf(TRENextOff) div SizeOf(REChar)) - 1;
REBracesArgSz = (2 * SizeOf(TREBracesArg) div SizeOf(REChar));
// add space for aligning pointer
{$ELSE}
RENextOffSz = (SizeOf(TRENextOff) div SizeOf(REChar));
// size of Next pointer in REChars
REBracesArgSz = SizeOf(TREBracesArg) div SizeOf(REChar);
// size of BRACES arguments in REChars
{$ENDIF}
RENumberSz = SizeOf(LongInt) div SizeOf(REChar);
function IsPairedBreak(p: PRegExprChar): boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
const
cBreak = {$IFDEF Unicode} $000D000A; {$ELSE} $0D0A; {$ENDIF}
type
PtrPair = {$IFDEF Unicode} ^LongInt; {$ELSE} ^Word; {$ENDIF}
begin
Result := PtrPair(p)^ = cBreak;
end;
function _FindCharInBuffer(SBegin, SEnd: PRegExprChar; Ch: REChar): PRegExprChar; {$IFDEF InlineFuncs}inline;{$ENDIF}
begin
while SBegin < SEnd do
begin
if SBegin^ = Ch then
begin
Result := SBegin;
Exit;
end;
Inc(SBegin);
end;
Result := nil;
end;
function IsIgnoredChar(AChar: REChar): boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
begin
case AChar of
' ', #9, #$d, #$a:
Result := True
else
Result := False;
end;
end;
function _IsMetaChar(AChar: REChar): boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
begin
case AChar of
'd', 'D',
's', 'S',
'w', 'W',
'v', 'V',
'h', 'H':
Result := True
else
Result := False;
end;
end;
function AlignToPtr(const p: Pointer): Pointer; {$IFDEF InlineFuncs}inline;{$ENDIF}
begin
{$IFDEF FPC_REQUIRES_PROPER_ALIGNMENT}
Result := Align(p, SizeOf(Pointer));
{$ELSE}
Result := p;
{$ENDIF}
end;
function AlignToInt(const p: Pointer): Pointer; {$IFDEF InlineFuncs}inline;{$ENDIF}
begin
{$IFDEF FPC_REQUIRES_PROPER_ALIGNMENT}
Result := Align(p, SizeOf(integer));
{$ELSE}
Result := p;
{$ENDIF}
end;
{$IFDEF FastUnicodeData}
function _UpperCase(Ch: REChar): REChar; inline;
begin
Result := CharUpperArray[Ord(Ch)];
end;
function _LowerCase(Ch: REChar): REChar; inline;
begin
Result := CharLowerArray[Ord(Ch)];
end;
{$ELSE}
function _UpperCase(Ch: REChar): REChar;
begin
Result := Ch;
if (Ch >= 'a') and (Ch <= 'z') then
begin
Dec(Result, 32);
Exit;
end;
if Ord(Ch) < 128 then
Exit;
{$IFDEF FPC}
{$IFDEF UniCode}
Result := UnicodeUpperCase(Ch)[1];
{$ELSE}
Result := AnsiUpperCase(Ch)[1];
{$ENDIF}
{$ELSE}
{$IFDEF UniCode}
{$IFDEF D_XE4}
Result := Ch.ToUpper;
{$ELSE}
{$IFDEF D2009}
Result := TCharacter.ToUpper(Ch);
{$ENDIF}
{$ENDIF}
{$ELSE}
Result := AnsiUpperCase(Ch)[1];
{$ENDIF}
{$ENDIF}
end;
function _LowerCase(Ch: REChar): REChar;
begin
Result := Ch;
if (Ch >= 'A') and (Ch <= 'Z') then
begin
Inc(Result, 32);
Exit;
end;
if Ord(Ch) < 128 then
Exit;
{$IFDEF FPC}
{$IFDEF UniCode}
Result := UnicodeLowerCase(Ch)[1];
{$ELSE}
Result := AnsiLowerCase(Ch)[1];
{$ENDIF}
{$ELSE}
{$IFDEF UniCode}
{$IFDEF D_XE4}
Result := Ch.ToLower;
{$ELSE}
{$IFDEF D2009}
Result := TCharacter.ToLower(Ch);
{$ENDIF}
{$ENDIF}
{$ELSE}
Result := AnsiLowerCase(Ch)[1];
{$ENDIF}
{$ENDIF}
end;
{$ENDIF}
function InvertCase(const Ch: REChar): REChar; {$IFDEF InlineFuncs}inline;{$ENDIF}
begin
Result := _UpperCase(Ch);
if Result = Ch then
Result := _LowerCase(Ch);
end;
function _FindClosingBracket(P, PEnd: PRegExprChar): PRegExprChar;
var
Level: integer;
begin
Result := nil;
Level := 1;
repeat
if P >= PEnd then Exit;
case P^ of
EscChar:
Inc(P);
'(':
begin
Inc(Level);
end;
')':
begin
Dec(Level);
if Level = 0 then
begin
Result := P;
Exit;
end;
end;
end;
Inc(P);
until False;
end;
{$IFDEF UNICODEEX}
procedure IncUnicode(var p: PRegExprChar); {$IFDEF InlineFuncs}inline;{$ENDIF}
// make additional increment if we are on low-surrogate char
// no need to check p<fInputEnd, at the end of string we have chr(0)
var
ch: REChar;
begin
Inc(p);
ch := p^;
if (Ord(ch) >= $DC00) and (Ord(ch) <= $DFFF) then
Inc(p);
end;
procedure IncUnicode2(var p: PRegExprChar; var N: integer); {$IFDEF InlineFuncs}inline;{$ENDIF}
var
ch: REChar;
begin
Inc(p);
Inc(N);
ch := p^;
if (Ord(ch) >= $DC00) and (Ord(ch) <= $DFFF) then
begin
Inc(p);
Inc(N);
end;
end;
{$ENDIF}
{ ============================================================= }
{ ===================== Global functions ====================== }
{ ============================================================= }
function IsModifiersEqual(const A, B: TRegExprModifiers): boolean;
begin
Result :=
(A.I = B.I) and
(A.G = B.G) and
(A.M = B.M) and
(A.S = B.S) and
(A.R = B.R) and
(A.X = B.X);
end;
function ParseModifiers(const APtr: PRegExprChar;
ALen: integer;
var AValue: TRegExprModifiers): boolean;
// Parse string and set AValue if it's in format 'ismxrg-ismxrg'
var
IsOn: boolean;
i: integer;
begin
Result := True;
IsOn := True;
for i := 0 to ALen-1 do
case APtr[i] of
'-':
IsOn := False;
'I', 'i':
AValue.I := IsOn;
'R', 'r':
AValue.R := IsOn;
'S', 's':
AValue.S := IsOn;
'G', 'g':
AValue.G := IsOn;
'M', 'm':
AValue.M := IsOn;
'X', 'x':
AValue.X := IsOn;
else
begin
Result := False;
Exit;
end;
end;
end;
function ExecRegExpr(const ARegExpr, AInputStr: RegExprString): boolean;
var
r: TRegExpr;
begin
r := TRegExpr.Create;
try
r.Expression := ARegExpr;
Result := r.Exec(AInputStr);
finally
r.Free;
end;
end; { of function ExecRegExpr
-------------------------------------------------------------- }
procedure SplitRegExpr(const ARegExpr, AInputStr: RegExprString;
APieces: TStrings);
var
r: TRegExpr;
begin
APieces.Clear;
r := TRegExpr.Create;
try
r.Expression := ARegExpr;
r.Split(AInputStr, APieces);
finally
r.Free;
end;
end; { of procedure SplitRegExpr
-------------------------------------------------------------- }
function ReplaceRegExpr(const ARegExpr, AInputStr, AReplaceStr: RegExprString;
AUseSubstitution: boolean{$IFDEF DefParam} = False{$ENDIF}): RegExprString;
begin
with TRegExpr.Create do
try
Expression := ARegExpr;
Result := Replace(AInputStr, AReplaceStr, AUseSubstitution);
finally
Free;
end;
end; { of function ReplaceRegExpr
-------------------------------------------------------------- }
{$IFDEF OverMeth}
function ReplaceRegExpr(const ARegExpr, AInputStr, AReplaceStr: RegExprString;
Options: TRegexReplaceOptions): RegExprString; overload;
begin
with TRegExpr.Create do
try
ModifierI := (rroModifierI in Options);
ModifierR := (rroModifierR in Options);
ModifierS := (rroModifierS in Options);
ModifierG := (rroModifierG in Options);
ModifierM := (rroModifierM in Options);
ModifierX := (rroModifierX in Options);
// Set this after the above, if the regex contains modifiers, they will be applied.
Expression := ARegExpr;
if rroUseOsLineEnd in Options then
ReplaceLineEnd := sLineBreak
else
ReplaceLineEnd := #10;
Result := Replace(AInputStr, AReplaceStr, rroUseSubstitution in Options);
finally
Free;
end;
end;
{$ENDIF}
(*
const
MetaChars_Init = '^$.[()|?+*' + EscChar + '{';
MetaChars = MetaChars_Init; // not needed to be a variable, const is faster
MetaAll = MetaChars_Init + ']}'; // Very similar to MetaChars, but slighly changed.
*)
function _IsMetaSymbol1(ch: REChar): boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
begin
case ch of
'^', '$', '.', '[', '(', ')', '|', '?', '+', '*', EscChar, '{':
Result := True
else
Result := False
end;
end;
function _IsMetaSymbol2(ch: REChar): boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
begin
case ch of
'^', '$', '.', '[', '(', ')', '|', '?', '+', '*', EscChar, '{',
']', '}':
Result := True
else
Result := False
end;
end;
function QuoteRegExprMetaChars(const AStr: RegExprString): RegExprString;
var
i, i0, Len: integer;
ch: REChar;
begin
Result := '';
Len := Length(AStr);
i := 1;
i0 := i;
while i <= Len do
begin
ch := AStr[i];
if _IsMetaSymbol2(ch) then
begin
Result := Result + System.Copy(AStr, i0, i - i0) + EscChar + ch;
i0 := i + 1;
end;
Inc(i);
end;
Result := Result + System.Copy(AStr, i0, MaxInt); // Tail
end; { of function QuoteRegExprMetaChars
-------------------------------------------------------------- }
function RegExprSubExpressions(const ARegExpr: string; ASubExprs: TStrings;
AExtendedSyntax: boolean{$IFDEF DefParam} = False{$ENDIF}): integer;
type
TStackItemRec = record // ###0.945
SubExprIdx: integer;
StartPos: PtrInt;
end;
TStackArray = packed array [0 .. RegexMaxMaxGroups - 1] of TStackItemRec;
var
Len, SubExprLen: integer;
i, i0: integer;
Modif: TRegExprModifiers;
Stack: ^TStackArray; // ###0.945
StackIdx, StackSz: integer;
begin
Result := 0; // no unbalanced brackets found at this very moment
FillChar(Modif, SizeOf(Modif), 0);
ASubExprs.Clear; // I don't think that adding to non empty list
// can be useful, so I simplified algorithm to work only with empty list
Len := Length(ARegExpr); // some optimization tricks
// first we have to calculate number of subexpression to reserve
// space in Stack array (may be we'll reserve more than needed, but
// it's faster then memory reallocation during parsing)
StackSz := 1; // add 1 for entire r.e.
for i := 1 to Len do
if ARegExpr[i] = '(' then
Inc(StackSz);
// SetLength (Stack, StackSz); //###0.945
GetMem(Stack, SizeOf(TStackItemRec) * StackSz);
try
StackIdx := 0;
i := 1;
while (i <= Len) do
begin
case ARegExpr[i] of
'(':
begin
if (i < Len) and (ARegExpr[i + 1] = '?') then
begin
// this is not subexpression, but comment or other
// Perl extension. We must check is it (?ismxrg-ismxrg)
// and change AExtendedSyntax if /x is changed.
Inc(i, 2); // skip '(?'
i0 := i;
while (i <= Len) and (ARegExpr[i] <> ')') do
Inc(i);
if i > Len then
Result := -1 // unbalansed '('
else
if ParseModifiers(@ARegExpr[i0], i - i0, Modif) then
// Alexey-T: original code had copy from i, not from i0
AExtendedSyntax := Modif.X;
end
else
begin // subexpression starts
ASubExprs.Add(''); // just reserve space
with Stack[StackIdx] do
begin
SubExprIdx := ASubExprs.Count - 1;
StartPos := i;
end;
Inc(StackIdx);
end;
end;
')':
begin
if StackIdx = 0 then
Result := i // unbalanced ')'
else
begin
Dec(StackIdx);
with Stack[StackIdx] do
begin
SubExprLen := i - StartPos + 1;
ASubExprs.Objects[SubExprIdx] :=
TObject(StartPos or (SubExprLen ShL 16));
ASubExprs[SubExprIdx] := System.Copy(ARegExpr, StartPos + 1,
SubExprLen - 2); // add without brackets
end;
end;
end;
EscChar:
Inc(i); // skip quoted symbol
'[':
begin
// we have to skip character ranges at once, because they can
// contain '#', and '#' in it must NOT be recognized as eXtended
// comment beginning!
i0 := i;
Inc(i);
if ARegExpr[i] = ']' // first ']' inside [] treated as simple char, no need to check '['
then
Inc(i);
while (i <= Len) and (ARegExpr[i] <> ']') do
if ARegExpr[i] = EscChar // ###0.942
then
Inc(i, 2) // skip 'escaped' char to prevent stopping at '\]'
else
Inc(i);
if (i > Len) or (ARegExpr[i] <> ']') // ###0.942
then
Result := -(i0 + 1); // unbalansed '[' //###0.942
end;
'#':
if AExtendedSyntax then
begin
// skip eXtended comments
while (i <= Len) and (ARegExpr[i] <> #$d) and (ARegExpr[i] <> #$a)
// do not use [#$d, #$a] due to UniCode compatibility
do
Inc(i);
while (i + 1 <= Len) and
((ARegExpr[i + 1] = #$d) or (ARegExpr[i + 1] = #$a)) do
Inc(i); // attempt to work with different kinds of line separators
// now we are at the line separator that must be skipped.
end;
// here is no 'else' clause - we simply skip ordinary chars
end; // of case
Inc(i); // skip scanned char
// ! can move after Len due to skipping quoted symbol
end;
// check brackets balance
if StackIdx <> 0 then
Result := -1; // unbalansed '('
// check if entire r.e. added
if (ASubExprs.Count = 0) or ((PtrInt(ASubExprs.Objects[0]) and $FFFF) <> 1)
or (((PtrInt(ASubExprs.Objects[0]) ShR 16) and $FFFF) <> Len)
// whole r.e. wasn't added because it isn't bracketed
// well, we add it now:
then
ASubExprs.InsertObject(0, ARegExpr, TObject((Len ShL 16) or 1));
finally
FreeMem(Stack);
end;
end; { of function RegExprSubExpressions
-------------------------------------------------------------- }
const
OP_MAGIC = TREOp(216); // programm signature
// name opcode opnd? meaning
OP_EEND = TREOp(0); // - End of program
OP_BOL = TREOp(1); // - Match "" at beginning of line
OP_EOL = TREOp(2); // - Match "" at end of line
OP_ANY = TREOp(3); // - Match any one character
OP_ANYOF = TREOp(4); // Str Match any character in string Str
OP_ANYBUT = TREOp(5); // Str Match any char. not in string Str
OP_BRANCH = TREOp(6); // Node Match this alternative, or the next
OP_BACK = TREOp(7); // - Jump backward (Next < 0)
OP_EXACTLY = TREOp(8); // Str Match string Str
OP_NOTHING = TREOp(9); // - Match empty string
OP_STAR = TREOp(10); // Node Match this (simple) thing 0 or more times
OP_PLUS = TREOp(11); // Node Match this (simple) thing 1 or more times
OP_ANYDIGIT = TREOp(12); // - Match any digit (equiv [0-9])
OP_NOTDIGIT = TREOp(13); // - Match not digit (equiv [0-9])
OP_ANYLETTER = TREOp(14); // - Match any letter from property WordChars
OP_NOTLETTER = TREOp(15); // - Match not letter from property WordChars
OP_ANYSPACE = TREOp(16); // - Match any space char (see property SpaceChars)
OP_NOTSPACE = TREOp(17); // - Match not space char (see property SpaceChars)
OP_BRACES = TREOp(18);
// Node,Min,Max Match this (simple) thing from Min to Max times.
// Min and Max are TREBracesArg
OP_COMMENT = TREOp(19); // - Comment ;)
OP_EXACTLYCI = TREOp(20); // Str Match string Str case insensitive
OP_ANYOFCI = TREOp(21);
// Str Match any character in string Str, case insensitive
OP_ANYBUTCI = TREOp(22);
// Str Match any char. not in string Str, case insensitive
OP_LOOPENTRY = TREOp(23); // Node Start of loop (Node - LOOP for this loop)
OP_LOOP = TREOp(24); // Node,Min,Max,LoopEntryJmp - back jump for LOOPENTRY.
// Min and Max are TREBracesArg
// Node - next node in sequence,
// LoopEntryJmp - associated LOOPENTRY node addr
OP_EOL2 = TReOp(25); // like OP_EOL but also matches before final line-break
OP_BSUBEXP = TREOp(28);
// Idx Match previously matched subexpression #Idx (stored as REChar) //###0.936
OP_BSUBEXPCI = TREOp(29); // Idx -"- in case-insensitive mode
// Non-Greedy Style Ops //###0.940
OP_STARNG = TREOp(30); // Same as OP_START but in non-greedy mode
OP_PLUSNG = TREOp(31); // Same as OP_PLUS but in non-greedy mode
OP_BRACESNG = TREOp(32); // Same as OP_BRACES but in non-greedy mode
OP_LOOPNG = TREOp(33); // Same as OP_LOOP but in non-greedy mode
// Multiline mode \m
OP_BOLML = TREOp(34); // - Match "" at beginning of line
OP_EOLML = TREOp(35); // - Match "" at end of line
OP_ANYML = TREOp(36); // - Match any one character
// Word boundary
OP_BOUND = TREOp(37); // Match "" between words //###0.943
OP_NOTBOUND = TREOp(38); // Match "" not between words //###0.943
OP_ANYHORZSEP = TREOp(39); // Any horizontal whitespace \h
OP_NOTHORZSEP = TREOp(40); // Not horizontal whitespace \H
OP_ANYVERTSEP = TREOp(41); // Any vertical whitespace \v
OP_NOTVERTSEP = TREOp(42); // Not vertical whitespace \V
OP_ANYCATEGORY = TREOp(43); // \p{L}
OP_NOTCATEGORY = TREOp(44); // \P{L}
OP_STAR_POSS = TReOp(45);
OP_PLUS_POSS = TReOp(46);
OP_BRACES_POSS = TReOp(47);
OP_RECUR = TReOp(48);
// !!! Change OP_OPEN value if you add new opcodes !!!
OP_OPEN = TREOp(50); // Opening of group; OP_OPEN+i is for group i
OP_OPEN_FIRST = Succ(OP_OPEN);
OP_OPEN_LAST = TREOp(Ord(OP_OPEN) + RegexMaxGroups - 1);
OP_CLOSE = Succ(OP_OPEN_LAST); // Closing of group; OP_CLOSE+i is for group i
OP_CLOSE_FIRST = Succ(OP_CLOSE);
OP_CLOSE_LAST = TReOp(Ord(OP_CLOSE) + RegexMaxGroups - 1);
OP_SUBCALL = Succ(OP_CLOSE_LAST); // Call of subroutine; OP_SUBCALL+i is for group i
OP_SUBCALL_FIRST = Succ(OP_SUBCALL);
OP_SUBCALL_LAST =
{$IFDEF Unicode}
TReOp(Ord(OP_SUBCALL) + RegexMaxGroups - 1);
{$ELSE}
High(REChar); // must fit to 0..255 range
{$ENDIF}
// We work with p-code through pointers, compatible with PRegExprChar.
// Note: all code components (TRENextOff, TREOp, TREBracesArg, etc)
// must have lengths that can be divided by SizeOf (REChar) !
// A node is TREOp of opcode followed Next "pointer" of TRENextOff type.
// The Next is a offset from the opcode of the node containing it.
// An operand, if any, simply follows the node. (Note that much of
// the code generation knows about this implicit relationship!)
// Using TRENextOff=PtrInt speed up p-code processing.
// Opcodes description:
//
// BRANCH The set of branches constituting a single choice are hooked
// together with their "next" pointers, since precedence prevents
// anything being concatenated to any individual branch. The
// "next" pointer of the last BRANCH in a choice points to the
// thing following the whole choice. This is also where the
// final "next" pointer of each individual branch points; each
// branch starts with the operand node of a BRANCH node.
// BACK Normal "next" pointers all implicitly point forward; BACK
// exists to make loop structures possible.
// STAR,PLUS,BRACES '?', and complex '*' and '+', are implemented as
// circular BRANCH structures using BACK. Complex '{min,max}'
// - as pair LOOPENTRY-LOOP (see below). Simple cases (one
// character per match) are implemented with STAR, PLUS and
// BRACES for speed and to minimize recursive plunges.
// LOOPENTRY,LOOP {min,max} are implemented as special pair
// LOOPENTRY-LOOP. Each LOOPENTRY initialize loopstack for
// current level.
// OPEN,CLOSE are numbered at compile time.
{ ============================================================= }
{ ================== Error handling section =================== }
{ ============================================================= }
const
reeOk = 0;
reeCompNullArgument = 100;
reeUnknownMetaSymbol = 101;
reeCompParseRegTooManyBrackets = 102;
reeCompParseRegUnmatchedBrackets = 103;
reeCompParseRegUnmatchedBrackets2 = 104;
reeCompParseRegJunkOnEnd = 105;
reePlusStarOperandCouldBeEmpty = 106;
reeNestedQuantif = 107;
reeBadHexDigit = 108;
reeInvalidRange = 109;
reeParseAtomTrailingBackSlash = 110;
reeNoHexCodeAfterBSlashX = 111;
reeHexCodeAfterBSlashXTooBig = 112;
reeUnmatchedSqBrackets = 113;
reeInternalUrp = 114;
reeQuantifFollowsNothing = 115;
reeTrailingBackSlash = 116;
reeNoLetterAfterBSlashC = 117;
reeMetaCharAfterMinusInRange = 118;
reeRarseAtomInternalDisaster = 119;
reeIncorrectSpecialBrackets = 120;
reeIncorrectBraces = 121;
reeBRACESArgTooBig = 122;
reeUnknownOpcodeInFillFirst = 123;
reeBracesMinParamGreaterMax = 124;
reeUnclosedComment = 125;
reeComplexBracesNotImplemented = 126;
reeUnrecognizedModifier = 127;
reeBadLinePairedSeparator = 128;
reeBadUnicodeCategory = 129;
reeTooSmallCheckersArray = 130;
reePossessiveAfterComplexBraces = 131;
reeBadRecursion = 132;
reeBadSubCall = 133;
reeNamedGroupBad = 140;
reeNamedGroupBadName = 141;
reeNamedGroupBadRef = 142;
reeNamedGroupDupName = 143;
reeLookaheadBad = 150;
reeLookbehindBad = 152;
reeLookbehindTooComplex = 153;
reeLookaroundNotAtEdge = 154;
// Runtime errors must be >= reeFirstRuntimeCode
reeFirstRuntimeCode = 1000;
reeRegRepeatCalledInappropriately = 1000;
reeMatchPrimMemoryCorruption = 1001;
reeMatchPrimCorruptedPointers = 1002;
reeNoExpression = 1003;
reeCorruptedProgram = 1004;
reeOffsetMustBePositive = 1006;
reeExecNextWithoutExec = 1007;
reeBadOpcodeInCharClass = 1008;
reeDumpCorruptedOpcode = 1011;
reeModifierUnsupported = 1013;
reeLoopStackExceeded = 1014;
reeLoopWithoutEntry = 1015;
function TRegExpr.ErrorMsg(AErrorID: integer): RegExprString;
begin
case AErrorID of
reeOk:
Result := 'No errors';
reeCompNullArgument:
Result := 'TRegExpr compile: null argument';
reeUnknownMetaSymbol:
Result := 'TRegExpr compile: unknown meta-character: \' + fLastErrorSymbol;
reeCompParseRegTooManyBrackets:
Result := 'TRegExpr compile: ParseReg: too many ()';
reeCompParseRegUnmatchedBrackets:
Result := 'TRegExpr compile: ParseReg: unmatched ()';
reeCompParseRegUnmatchedBrackets2:
Result := 'TRegExpr compile: ParseReg: unmatched ()';
reeCompParseRegJunkOnEnd:
Result := 'TRegExpr compile: ParseReg: junk at end';
reePlusStarOperandCouldBeEmpty:
Result := 'TRegExpr compile: *+ operand could be empty';
reeNestedQuantif:
Result := 'TRegExpr compile: nested quantifier *?+';
reeBadHexDigit:
Result := 'TRegExpr compile: bad hex digit';
reeInvalidRange:
Result := 'TRegExpr compile: invalid [] range';
reeParseAtomTrailingBackSlash:
Result := 'TRegExpr compile: parse atom trailing \';
reeNoHexCodeAfterBSlashX:
Result := 'TRegExpr compile: no hex code after \x';
reeNoLetterAfterBSlashC:
Result := 'TRegExpr compile: no letter "A".."Z" after \c';
reeMetaCharAfterMinusInRange:
Result := 'TRegExpr compile: metachar after "-" in [] range';
reeHexCodeAfterBSlashXTooBig:
Result := 'TRegExpr compile: hex code after \x is too big';
reeUnmatchedSqBrackets:
Result := 'TRegExpr compile: unmatched []';
reeInternalUrp:
Result := 'TRegExpr compile: internal fail on char "|", ")"';
reeQuantifFollowsNothing:
Result := 'TRegExpr compile: quantifier ?+*{ follows nothing';
reeTrailingBackSlash:
Result := 'TRegExpr compile: trailing \';
reeRarseAtomInternalDisaster:
Result := 'TRegExpr compile: RarseAtom internal disaster';
reeIncorrectSpecialBrackets:
Result := 'TRegExpr compile: incorrect expression in (?...) brackets';
reeIncorrectBraces:
Result := 'TRegExpr compile: incorrect {} braces';
reeBRACESArgTooBig:
Result := 'TRegExpr compile: braces {} argument too big';
reeUnknownOpcodeInFillFirst:
Result := 'TRegExpr compile: unknown opcode in FillFirstCharSet ('+DumpOp(fLastErrorOpcode)+')';
reeBracesMinParamGreaterMax:
Result := 'TRegExpr compile: braces {} min param greater then max';
reeUnclosedComment:
Result := 'TRegExpr compile: unclosed (?#comment)';
reeComplexBracesNotImplemented:
Result := 'TRegExpr compile: if you use braces {} and non-greedy ops *?, +?, ?? for complex cases, enable {$DEFINE ComplexBraces}';
reeUnrecognizedModifier:
Result := 'TRegExpr compile: incorrect modifier in (?...)';
reeBadLinePairedSeparator:
Result := 'TRegExpr compile: LinePairedSeparator must countain two different chars or be empty';
reeBadUnicodeCategory:
Result := 'TRegExpr compile: invalid category after \p or \P';
reeTooSmallCheckersArray:
Result := 'TRegExpr compile: too small CharCheckers array';
reePossessiveAfterComplexBraces:
Result := 'TRegExpr compile: possessive + after complex braces: (foo){n,m}+';
reeBadRecursion:
Result := 'TRegExpr compile: bad recursion (?R)';
reeBadSubCall:
Result := 'TRegExpr compile: bad subroutine call';
reeNamedGroupBad:
Result := 'TRegExpr compile: bad named group';
reeNamedGroupBadName:
Result := 'TRegExpr compile: bad identifier in named group';
reeNamedGroupBadRef:
Result := 'TRegExpr compile: bad back-reference to named group';
reeNamedGroupDupName:
Result := 'TRegExpr compile: named group defined more than once';
reeLookaheadBad:
Result := 'TRegExpr compile: bad lookahead';
reeLookbehindBad:
Result := 'TRegExpr compile: bad lookbehind';
reeLookbehindTooComplex:
Result := 'TRegExpr compile: lookbehind (?<!foo) must have fixed length';
reeLookaroundNotAtEdge:
Result := 'TRegExpr compile: lookaround brackets must be at the very beginning/ending';
reeRegRepeatCalledInappropriately:
Result := 'TRegExpr exec: RegRepeat called inappropriately';
reeMatchPrimMemoryCorruption:
Result := 'TRegExpr exec: MatchPrim memory corruption';
reeMatchPrimCorruptedPointers:
Result := 'TRegExpr exec: MatchPrim corrupted pointers';
reeNoExpression:
Result := 'TRegExpr exec: empty expression';
reeCorruptedProgram:
Result := 'TRegExpr exec: corrupted opcode (no magic byte)';
reeOffsetMustBePositive:
Result := 'TRegExpr exec: offset must be >0';
reeExecNextWithoutExec:
Result := 'TRegExpr exec: ExecNext without Exec(Pos)';
reeBadOpcodeInCharClass:
Result := 'TRegExpr exec: invalid opcode in char class';
reeDumpCorruptedOpcode:
Result := 'TRegExpr dump: corrupted opcode';
reeLoopStackExceeded:
Result := 'TRegExpr exec: loop stack exceeded';
reeLoopWithoutEntry:
Result := 'TRegExpr exec: loop without loop entry';
else
Result := 'Unknown error';
end;
end; { of procedure TRegExpr.Error
-------------------------------------------------------------- }
function TRegExpr.LastError: integer;
begin
Result := fLastError;
fLastError := reeOk;
end; { of function TRegExpr.LastError
-------------------------------------------------------------- }
{ ============================================================= }
{ ===================== Common section ======================== }
{ ============================================================= }
class function TRegExpr.VersionMajor: integer;
begin
Result := REVersionMajor;
end;
class function TRegExpr.VersionMinor: integer;
begin
Result := REVersionMinor;
end;
constructor TRegExpr.Create;
begin
inherited;
programm := nil;
fExpression := '';
fInputString := '';
FillChar(fModifiers, SizeOf(fModifiers), 0);
fModifiers.I := RegExprModifierI;
fModifiers.R := RegExprModifierR;
fModifiers.S := RegExprModifierS;
fModifiers.G := RegExprModifierG;
fModifiers.M := RegExprModifierM;
fModifiers.X := RegExprModifierX;
{$IFDEF UseSpaceChars}
SpaceChars := RegExprSpaceChars;
{$ENDIF}
{$IFDEF UseWordChars}
WordChars := RegExprWordChars;
{$ENDIF}
{$IFDEF UseLineSep}
fLineSeparators := RegExprLineSeparators;
{$ENDIF}
fUsePairedBreak := RegExprUsePairedBreak;
fReplaceLineEnd := RegExprReplaceLineBreak;
fSlowChecksSizeMax := 2000;
{$IFDEF UseLineSep}
InitLineSepArray;
{$ENDIF}
InitCharCheckers;
{$IFDEF Compat}
fInvertCase := OldInvertCase;
{$ENDIF}
end; { of constructor TRegExpr.Create
-------------------------------------------------------------- }
{$IFDEF OverMeth}
constructor TRegExpr.Create(const AExpression: RegExprString);
begin
Create;
Expression := AExpression;
end;
{$ENDIF}
destructor TRegExpr.Destroy;
begin
if programm <> nil then
begin
FreeMem(programm);
programm := nil;
end;
if Assigned(fHelper) then
FreeAndNil(fHelper);
end;
procedure TRegExpr.SetExpression(const AStr: RegExprString);
begin
if (AStr <> fExpression) or not IsCompiled then
begin
fExpression := AStr;
UniqueString(fExpression);
fRegexStart := PRegExprChar(fExpression);
fRegexEnd := fRegexStart + Length(fExpression);
InvalidateProgramm;
end;
end; { of procedure TRegExpr.SetExpression
-------------------------------------------------------------- }
function TRegExpr.GetSubExprCount: integer;
begin
// if nothing found, we must return -1 per TRegExpr docs
if GrpStart[0] = nil then
Result := -1
else
Result := GrpCount;
end;
function TRegExpr.GetMatchPos(Idx: integer): PtrInt;
begin
Idx := GrpIndexes[Idx];
if (Idx >= 0) and (GrpStart[Idx] <> nil) then
Result := GrpStart[Idx] - fInputStart + 1
else
Result := -1;
end; { of function TRegExpr.GetMatchPos
-------------------------------------------------------------- }
function TRegExpr.GetMatchLen(Idx: integer): PtrInt;
begin
Idx := GrpIndexes[Idx];
if (Idx >= 0) and (GrpStart[Idx] <> nil) then
Result := GrpEnd[Idx] - GrpStart[Idx]
else
Result := -1;
end; { of function TRegExpr.GetMatchLen
-------------------------------------------------------------- }
function TRegExpr.GetMatch(Idx: integer): RegExprString;
begin
Result := '';
Idx := GrpIndexes[Idx];
if (Idx >= 0) and (GrpEnd[Idx] > GrpStart[Idx]) then
SetString(Result, GrpStart[Idx], GrpEnd[Idx] - GrpStart[Idx]);
end; { of function TRegExpr.GetMatch
-------------------------------------------------------------- }
function TRegExpr.MatchIndexFromName(const AName: RegExprString): integer;
var
i: integer;
begin
for i := 1 {not 0} to GrpCount do
if GrpNames[i] = AName then
begin
Result := i;
Exit;
end;
Result := -1;
end;
function TRegExpr.MatchFromName(const AName: RegExprString): RegExprString;
var
Idx: integer;
begin
Idx := MatchIndexFromName(AName);
if Idx >= 0 then
Result := GetMatch(Idx)
else
Result := '';
end;
function TRegExpr.GetModifierStr: RegExprString;
begin
Result := '-';
if ModifierI then
Result := 'i' + Result
else
Result := Result + 'i';
if ModifierR then
Result := 'r' + Result
else
Result := Result + 'r';
if ModifierS then
Result := 's' + Result
else
Result := Result + 's';
if ModifierG then
Result := 'g' + Result
else
Result := Result + 'g';
if ModifierM then
Result := 'm' + Result
else
Result := Result + 'm';
if ModifierX then
Result := 'x' + Result
else
Result := Result + 'x';
if Result[Length(Result)] = '-' // remove '-' if all modifiers are 'On'
then
System.Delete(Result, Length(Result), 1);
end; { of function TRegExpr.GetModifierStr
-------------------------------------------------------------- }
procedure TRegExpr.SetModifierG(AValue: boolean);
begin
if fModifiers.G <> AValue then
begin
fModifiers.G := AValue;
InvalidateProgramm;
end;
end;
procedure TRegExpr.SetModifierI(AValue: boolean);
begin
if fModifiers.I <> AValue then
begin
fModifiers.I := AValue;
InvalidateProgramm;
end;
end;
procedure TRegExpr.SetModifierM(AValue: boolean);
begin
if fModifiers.M <> AValue then
begin
fModifiers.M := AValue;
InvalidateProgramm;
end;
end;
procedure TRegExpr.SetModifierR(AValue: boolean);
begin
if fModifiers.R <> AValue then
begin
fModifiers.R := AValue;
InvalidateProgramm;
end;
end;
procedure TRegExpr.SetModifierS(AValue: boolean);
begin
if fModifiers.S <> AValue then
begin
fModifiers.S := AValue;
InvalidateProgramm;
end;
end;
procedure TRegExpr.SetModifierX(AValue: boolean);
begin
if fModifiers.X <> AValue then
begin
fModifiers.X := AValue;
InvalidateProgramm;
end;
end;
procedure TRegExpr.SetModifierStr(const AStr: RegExprString);
begin
if ParseModifiers(PRegExprChar(AStr), Length(AStr), fModifiers) then
InvalidateProgramm
else
Error(reeModifierUnsupported);
end;
{ ============================================================= }
{ ==================== Compiler section ======================= }
{ ============================================================= }
{$IFDEF FastUnicodeData}
function TRegExpr.IsWordChar(AChar: REChar): boolean;
begin
// bit 7 in value: is word char
Result := CharCategoryArray[Ord(AChar)] and 128 <> 0;
end;
(*
// Unicode General Category
UGC_UppercaseLetter = 0; Lu
UGC_LowercaseLetter = 1; Ll
UGC_TitlecaseLetter = 2; Lt
UGC_ModifierLetter = 3; Lm
UGC_OtherLetter = 4; Lo
UGC_NonSpacingMark = 5; Mn
UGC_CombiningMark = 6; Mc
UGC_EnclosingMark = 7; Me
UGC_DecimalNumber = 8; Nd
UGC_LetterNumber = 9; Nl
UGC_OtherNumber = 10; No
UGC_ConnectPunctuation = 11; Pc
UGC_DashPunctuation = 12; Pd
UGC_OpenPunctuation = 13; Ps
UGC_ClosePunctuation = 14; Pe
UGC_InitialPunctuation = 15; Pi
UGC_FinalPunctuation = 16; Pf
UGC_OtherPunctuation = 17; Po
UGC_MathSymbol = 18; Sm
UGC_CurrencySymbol = 19; Sc
UGC_ModifierSymbol = 20; Sk
UGC_OtherSymbol = 21; So
UGC_SpaceSeparator = 22; Zs
UGC_LineSeparator = 23; Zl
UGC_ParagraphSeparator = 24; Zp
UGC_Control = 25; Cc
UGC_Format = 26; Cf
UGC_Surrogate = 27; Cs
UGC_PrivateUse = 28; Co
UGC_Unassigned = 29; Cn
*)
const
CategoryNames: array[0..29] of array[0..1] of REChar = (
('L', 'u'),
('L', 'l'),
('L', 't'),
('L', 'm'),
('L', 'o'),
('M', 'n'),
('M', 'c'),
('M', 'e'),
('N', 'd'),
('N', 'l'),
('N', 'o'),
('P', 'c'),
('P', 'd'),
('P', 's'),
('P', 'e'),
('P', 'i'),
('P', 'f'),
('P', 'o'),
('S', 'm'),
('S', 'c'),
('S', 'k'),
('S', 'o'),
('Z', 's'),
('Z', 'l'),
('Z', 'p'),
('C', 'c'),
('C', 'f'),
('C', 's'),
('C', 'o'),
('C', 'n')
);
function IsCategoryFirstChar(AChar: REChar): boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
begin
case AChar of
'L', 'M', 'N', 'P', 'S', 'C', 'Z':
Result := True;
else
Result := False;
end;
end;
function IsCategoryChars(AChar, AChar2: REChar): boolean;
var
i: integer;
begin
for i := Low(CategoryNames) to High(CategoryNames) do
if (AChar = CategoryNames[i][0]) then
if (AChar2 = CategoryNames[i][1]) then
begin
Result := True;
Exit
end;
Result := False;
end;
function CheckCharCategory(AChar: REChar; Ch0, Ch1: REChar): boolean;
// AChar: check this char against opcode
// Ch0, Ch1: opcode operands after OP_*CATEGORY
var
N: byte;
Name0, Name1: REChar;
begin
Result := False;
// bits 0..6 are category
N := CharCategoryArray[Ord(AChar)] and 127;
if N <= High(CategoryNames) then
begin
Name0 := CategoryNames[N][0];
Name1 := CategoryNames[N][1];
if Ch0 <> Name0 then Exit;
if Ch1 <> #0 then
if Ch1 <> Name1 then Exit;
Result := True;
end;
end;
function MatchOneCharCategory(opnd, scan: PRegExprChar): boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
// opnd: points to opcode operands after OP_*CATEGORY
// scan: points into InputString
begin
Result := CheckCharCategory(scan^, opnd^, (opnd + 1)^);
end;
{$ELSE}
function TRegExpr.IsWordChar(AChar: REChar): boolean;
begin
{$IFDEF UseWordChars}
Result := Pos(AChar, fWordChars) > 0;
{$ELSE}
case AChar of
'a' .. 'z',
'A' .. 'Z',
'0' .. '9', '_':
Result := True
else
Result := False;
end;
{$ENDIF}
end;
{$ENDIF}
function TRegExpr.IsSpaceChar(AChar: REChar): boolean;
begin
{$IFDEF UseSpaceChars}
Result := Pos(AChar, fSpaceChars) > 0;
{$ELSE}
case AChar of
' ', #$9, #$A, #$D, #$C:
Result := True
else
Result := False;
end;
{$ENDIF}
end;
function TRegExpr.IsCustomLineSeparator(AChar: REChar): boolean;
begin
{$IFDEF UseLineSep}
{$IFDEF UniCode}
Result := Pos(AChar, fLineSeparators) > 0;
{$ELSE}
Result := fLineSepArray[byte(AChar)];
{$ENDIF}
{$ELSE}
case AChar of
#$d, #$a,
{$IFDEF UniCode}
#$85, #$2028, #$2029,
{$ENDIF}
#$b, #$c:
Result := True;
else
Result := False;
end;
{$ENDIF}
end;
function IsDigitChar(AChar: REChar): boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
begin
case AChar of
'0' .. '9':
Result := True;
else
Result := False;
end;
end;
function IsHorzSeparator(AChar: REChar): boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
begin
// Tab and Unicode categoty "Space Separator": https://www.compart.com/en/unicode/category/Zs
case AChar of
#9, #$20, #$A0:
Result := True;
{$IFDEF UniCode}
#$1680, #$2000 .. #$200A, #$202F, #$205F, #$3000:
Result := True;
{$ENDIF}
else
Result := False;
end;
end;
function IsVertLineSeparator(AChar: REChar): boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
begin
case AChar of
#$d, #$a, #$b, #$c:
Result := True;
{$IFDEF UniCode}
#$2028, #$2029, #$85:
Result := True;
{$ENDIF}
else
Result := False;
end;
end;
procedure TRegExpr.InvalidateProgramm;
begin
if programm <> nil then
begin
FreeMem(programm);
programm := nil;
end;
end; { of procedure TRegExpr.InvalidateProgramm
-------------------------------------------------------------- }
procedure TRegExpr.Compile;
begin
if fExpression = '' then
begin
Error(reeNoExpression);
Exit;
end;
CompileRegExpr(fRegexStart);
end; { of procedure TRegExpr.Compile
-------------------------------------------------------------- }
{$IFDEF UseLineSep}
procedure TRegExpr.InitLineSepArray;
{$IFNDEF UniCode}
var
i: integer;
{$ENDIF}
begin
{$IFNDEF UniCode}
FillChar(fLineSepArray, SizeOf(fLineSepArray), 0);
for i := 1 to Length(fLineSeparators) do
fLineSepArray[byte(fLineSeparators[i])] := True;
{$ENDIF}
end;
{$ENDIF}
function TRegExpr.IsProgrammOk: boolean;
begin
Result := False;
// check modifiers
if not IsModifiersEqual(fModifiers, fProgModifiers) then
InvalidateProgramm;
// compile if needed
if programm = nil then
begin
Compile;
// Check compiled programm
if programm = nil then
Exit;
end;
if programm[0] <> OP_MAGIC then
Error(reeCorruptedProgram)
else
Result := True;
end; { of function TRegExpr.IsProgrammOk
-------------------------------------------------------------- }
procedure TRegExpr.Tail(p: PRegExprChar; val: PRegExprChar);
// set the next-pointer at the end of a node chain
var
scan: PRegExprChar;
temp: PRegExprChar;
begin
if p = @regDummy then
Exit;
// Find last node.
scan := p;
repeat
temp := regNext(scan);
if temp = nil then
Break;
scan := temp;
until False;
// Set Next 'pointer'
if val < scan then
PRENextOff(AlignToPtr(scan + REOpSz))^ := -(scan - val) // ###0.948
// work around PWideChar subtraction bug (Delphi uses
// shr after subtraction to calculate widechar distance %-( )
// so, if difference is negative we have .. the "feature" :(
// I could wrap it in $IFDEF UniCode, but I didn't because
// "P Q computes the difference between the address given
// by P (the higher address) and the address given by Q (the
// lower address)" - Delphi help quotation.
else
PRENextOff(AlignToPtr(scan + REOpSz))^ := val - scan; // ###0.933
end; { of procedure TRegExpr.Tail
-------------------------------------------------------------- }
procedure TRegExpr.OpTail(p: PRegExprChar; val: PRegExprChar);
// regtail on operand of first argument; nop if operandless
begin
// "Operandless" and "op != OP_BRANCH" are synonymous in practice.
if (p = nil) or (p = @regDummy) or (PREOp(p)^ <> OP_BRANCH) then
Exit;
Tail(p + REOpSz + RENextOffSz, val); // ###0.933
end; { of procedure TRegExpr.OpTail
-------------------------------------------------------------- }
function TRegExpr.EmitNode(op: TREOp): PRegExprChar; // ###0.933
// emit a node, return location
begin
Result := regCode;
if Result <> @regDummy then
begin
PREOp(regCode)^ := op;
Inc(regCode, REOpSz);
PRENextOff(AlignToPtr(regCode))^ := 0; // Next "pointer" := nil
Inc(regCode, RENextOffSz);
if (op = OP_EXACTLY) or (op = OP_EXACTLYCI) then
regExactlyLen := PLongInt(regCode)
else
regExactlyLen := nil;
{$IFDEF DebugSynRegExpr}
if regcode - programm > regsize then
raise Exception.Create('TRegExpr.EmitNode buffer overrun');
{$ENDIF}
end
else
Inc(regCodeSize, REOpSz + RENextOffSz);
// compute code size without code generation
end; { of function TRegExpr.EmitNode
-------------------------------------------------------------- }
procedure TRegExpr.EmitC(ch: REChar);
begin
if regCode <> @regDummy then
begin
regCode^ := ch;
Inc(regCode);
{$IFDEF DebugSynRegExpr}
if regcode - programm > regsize then
raise Exception.Create('TRegExpr.EmitC buffer overrun');
{$ENDIF}
end
else
Inc(regCodeSize, REOpSz); // Type of p-code pointer always is ^REChar
end; { of procedure TRegExpr.EmitC
-------------------------------------------------------------- }
procedure TRegExpr.EmitInt(AValue: LongInt);
begin
if regCode <> @regDummy then
begin
PLongInt(regCode)^ := AValue;
Inc(regCode, RENumberSz);
{$IFDEF DebugSynRegExpr}
if regcode - programm > regsize then
raise Exception.Create('TRegExpr.EmitInt buffer overrun');
{$ENDIF}
end
else
Inc(regCodeSize, RENumberSz);
end;
function TRegExpr.EmitGroupRef(AIndex: integer; AIgnoreCase: boolean): PRegExprChar;
begin
if AIgnoreCase then
Result := EmitNode(OP_BSUBEXPCI)
else
Result := EmitNode(OP_BSUBEXP);
EmitC(REChar(AIndex));
end;
{$IFDEF FastUnicodeData}
procedure TRegExpr.FindCategoryName(var scan: PRegExprChar; var ch1, ch2: REChar);
// scan: points into regex string after '\p', to find category name
// ch1, ch2: 2-char name of category; ch2 can be #0
var
ch: REChar;
pos1, pos2, namePtr: PRegExprChar;
nameLen: integer;
begin
ch1 := #0;
ch2 := #0;
ch := scan^;
if IsCategoryFirstChar(ch) then
begin
ch1 := ch;
Exit;
end;
if ch = '{' then
begin
pos1 := scan;
pos2 := pos1;
while (pos2 < fRegexEnd) and (pos2^ <> '}') do
Inc(pos2);
if pos2 >= fRegexEnd then
Error(reeIncorrectBraces);
namePtr := pos1+1;
nameLen := pos2-pos1-1;
Inc(scan, nameLen+1);
if nameLen<1 then
Error(reeBadUnicodeCategory);
if nameLen>2 then
Error(reeBadUnicodeCategory);
if nameLen = 1 then
begin
ch1 := namePtr^;
ch2 := #0;
if not IsCategoryFirstChar(ch1) then
Error(reeBadUnicodeCategory);
Exit;
end;
if nameLen = 2 then
begin
ch1 := namePtr^;
ch2 := (namePtr+1)^;
if not IsCategoryChars(ch1, ch2) then
Error(reeBadUnicodeCategory);
Exit;
end;
end
else
Error(reeBadUnicodeCategory);
end;
function TRegExpr.EmitCategoryMain(APositive: boolean): PRegExprChar;
var
ch, ch2: REChar;
begin
Inc(regParse);
if regParse >= fRegexEnd then
Error(reeBadUnicodeCategory);
FindCategoryName(regParse, ch, ch2);
if APositive then
Result := EmitNode(OP_ANYCATEGORY)
else
Result := EmitNode(OP_NOTCATEGORY);
EmitC(ch);
EmitC(ch2);
end;
{$ENDIF}
procedure TRegExpr.InsertOperator(op: TREOp; opnd: PRegExprChar; sz: integer);
// insert an operator in front of already-emitted operand
// Means relocating the operand.
var
src, dst, place: PRegExprChar;
i: integer;
begin
if regCode = @regDummy then
begin
Inc(regCodeSize, sz);
Exit;
end;
// move code behind insert position
src := regCode;
Inc(regCode, sz);
{$IFDEF DebugSynRegExpr}
if regCode - programm > regCodeSize then
raise Exception.Create('TRegExpr.InsertOperator buffer overrun');
// if (opnd<regCode) or (opnd-regCode>regCodeSize) then
// raise Exception.Create('TRegExpr.InsertOperator invalid opnd');
{$ENDIF}
dst := regCode;
while src > opnd do
begin
Dec(dst);
Dec(src);
dst^ := src^;
end;
place := opnd; // Op node, where operand used to be.
PREOp(place)^ := op;
Inc(place, REOpSz);
for i := 1 + REOpSz to sz do
begin
place^ := #0;
Inc(place);
end;
end; { of procedure TRegExpr.InsertOperator
-------------------------------------------------------------- }
function FindSkippedMetaLen(PStart, PEnd: PRegExprChar): integer; {$IFDEF InlineFuncs}inline;{$ENDIF}
// find length of initial segment of PStart string consisting
// entirely of characters not from IsMetaSymbol1.
begin
Result := 0;
while PStart < PEnd do
begin
if _IsMetaSymbol1(PStart^) then
Exit;
Inc(Result);
Inc(PStart)
end;
end;
const
// Flags to be passed up and down.
FLAG_WORST = 0; // Worst case
FLAG_HASWIDTH = 1; // Cannot match empty string
FLAG_SIMPLE = 2; // Simple enough to be OP_STAR/OP_PLUS/OP_BRACES operand
FLAG_SPECSTART = 4; // Starts with * or +
{$IFDEF UniCode}
RusRangeLoLow = #$430; // 'а'
RusRangeLoHigh = #$44F; // 'я'
RusRangeHiLow = #$410; // 'А'
RusRangeHiHigh = #$42F; // 'Я'
{$ELSE}
RusRangeLoLow = #$E0; // 'а' in cp1251
RusRangeLoHigh = #$FF; // 'я' in cp1251
RusRangeHiLow = #$C0; // 'А' in cp1251
RusRangeHiHigh = #$DF; // 'Я' in cp1251
{$ENDIF}
function TRegExpr.FindInCharClass(ABuffer: PRegExprChar; AChar: REChar; AIgnoreCase: boolean): boolean;
// Buffer contains char pairs: (Kind, Data), where Kind is one of OpKind_ values,
// and Data depends on Kind
var
OpKind: REChar;
ch, ch2: REChar;
N, i: integer;
begin
if AIgnoreCase then
AChar := _UpperCase(AChar);
repeat
OpKind := ABuffer^;
case OpKind of
OpKind_End:
begin
Result := False;
Exit;
end;
OpKind_Range:
begin
Inc(ABuffer);
ch := ABuffer^;
Inc(ABuffer);
ch2 := ABuffer^;
Inc(ABuffer);
{
// if AIgnoreCase, ch, ch2 are upcased in opcode
if AIgnoreCase then
begin
ch := _UpperCase(ch);
ch2 := _UpperCase(ch2);
end;
}
if (AChar >= ch) and (AChar <= ch2) then
begin
Result := True;
Exit;
end;
end;
OpKind_MetaClass:
begin
Inc(ABuffer);
N := Ord(ABuffer^);
Inc(ABuffer);
if CharCheckers[N](AChar) then
begin
Result := True;
Exit
end;
end;
OpKind_Char:
begin
Inc(ABuffer);
N := PLongInt(ABuffer)^;
Inc(ABuffer, RENumberSz);
for i := 1 to N do
begin
ch := ABuffer^;
Inc(ABuffer);
{
// already upcased in opcode
if AIgnoreCase then
ch := _UpperCase(ch);
}
if ch = AChar then
begin
Result := True;
Exit;
end;
end;
end;
{$IFDEF FastUnicodeData}
OpKind_CategoryYes,
OpKind_CategoryNo:
begin
Inc(ABuffer);
ch := ABuffer^;
Inc(ABuffer);
ch2 := ABuffer^;
Inc(ABuffer);
Result := CheckCharCategory(AChar, ch, ch2);
if OpKind = OpKind_CategoryNo then
Result := not Result;
if Result then
Exit;
end;
{$ENDIF}
else
Error(reeBadOpcodeInCharClass);
end;
until False; // assume that Buffer is ended correctly
end;
procedure TRegExpr.GetCharSetFromWordChars(var ARes: TRegExprCharSet);
{$IFDEF UseWordChars}
var
i: integer;
ch: REChar;
{$ENDIF}
begin
{$IFDEF UseWordChars}
ARes := [];
for i := 1 to Length(fWordChars) do
begin
ch := fWordChars[i];
{$IFDEF UniCode}
if Ord(ch) <= $FF then
{$ENDIF}
Include(ARes, byte(ch));
end;
{$ELSE}
ARes := RegExprWordSet;
{$ENDIF}
end;
procedure TRegExpr.GetCharSetFromSpaceChars(var ARes: TRegExprCharset);
{$IFDEF UseSpaceChars}
var
i: integer;
ch: REChar;
{$ENDIF}
begin
{$IFDEF UseSpaceChars}
ARes := [];
for i := 1 to Length(fSpaceChars) do
begin
ch := fSpaceChars[i];
{$IFDEF UniCode}
if Ord(ch) <= $FF then
{$ENDIF}
Include(ARes, byte(ch));
end;
{$ELSE}
ARes := RegExprSpaceSet;
{$ENDIF}
end;
procedure TRegExpr.GetCharSetFromCharClass(ABuffer: PRegExprChar; AIgnoreCase: boolean; var ARes: TRegExprCharset);
var
ch, ch2: REChar;
TempSet: TRegExprCharSet;
N, i: integer;
begin
ARes := [];
TempSet := [];
repeat
case ABuffer^ of
OpKind_End:
Exit;
OpKind_Range:
begin
Inc(ABuffer);
ch := ABuffer^;
Inc(ABuffer);
ch2 := ABuffer^;
Inc(ABuffer);
for i := Ord(ch) to
{$IFDEF UniCode} Min(Ord(ch2), $FF) {$ELSE} Ord(ch2) {$ENDIF} do
begin
Include(ARes, byte(i));
if AIgnoreCase then
Include(ARes, byte(InvertCase(REChar(i))));
end;
end;
OpKind_MetaClass:
begin
Inc(ABuffer);
N := Ord(ABuffer^);
Inc(ABuffer);
if N = CheckerIndex_Word then
begin
GetCharSetFromWordChars(TempSet);
ARes := ARes + TempSet;
end
else
if N = CheckerIndex_NotWord then
begin
GetCharSetFromWordChars(TempSet);
ARes := ARes + (RegExprAllSet - TempSet);
end
else
if N = CheckerIndex_Space then
begin
GetCharSetFromSpaceChars(TempSet);
ARes := ARes + TempSet;
end
else
if N = CheckerIndex_NotSpace then
begin
GetCharSetFromSpaceChars(TempSet);
ARes := ARes + (RegExprAllSet - TempSet);
end
else
if N = CheckerIndex_Digit then
ARes := ARes + RegExprDigitSet
else
if N = CheckerIndex_NotDigit then
ARes := ARes + (RegExprAllSet - RegExprDigitSet)
else
if N = CheckerIndex_VertSep then
ARes := ARes + RegExprLineSeparatorsSet
else
if N = CheckerIndex_NotVertSep then
ARes := ARes + (RegExprAllSet - RegExprLineSeparatorsSet)
else
if N = CheckerIndex_HorzSep then
ARes := ARes + RegExprHorzSeparatorsSet
else
if N = CheckerIndex_NotHorzSep then
ARes := ARes + (RegExprAllSet - RegExprHorzSeparatorsSet)
else
if N = CheckerIndex_LowerAZ then
begin
if AIgnoreCase then
ARes := ARes + RegExprAllAzSet
else
ARes := ARes + RegExprLowerAzSet;
end
else
if N = CheckerIndex_UpperAZ then
begin
if AIgnoreCase then
ARes := ARes + RegExprAllAzSet
else
ARes := ARes + RegExprUpperAzSet;
end
else
Error(reeBadOpcodeInCharClass);
end;
OpKind_Char:
begin
Inc(ABuffer);
N := PLongInt(ABuffer)^;
Inc(ABuffer, RENumberSz);
for i := 1 to N do
begin
ch := ABuffer^;
Inc(ABuffer);
{$IFDEF UniCode}
if Ord(ch) <= $FF then
{$ENDIF}
begin
Include(ARes, byte(ch));
if AIgnoreCase then
Include(ARes, byte(InvertCase(ch)));
end;
end;
end;
{$IFDEF FastUnicodeData}
OpKind_CategoryYes,
OpKind_CategoryNo:
begin
// usage of FirstCharSet makes no sense for regex with \p \P
ARes := RegExprAllSet;
Exit;
end;
{$ENDIF}
else
Error(reeBadOpcodeInCharClass);
end;
until False; // assume that Buffer is ended correctly
end;
function TRegExpr.GetModifierG: boolean;
begin
Result := fModifiers.G;
end;
function TRegExpr.GetModifierI: boolean;
begin
Result := fModifiers.I;
end;
function TRegExpr.GetModifierM: boolean;
begin
Result := fModifiers.M;
end;
function TRegExpr.GetModifierR: boolean;
begin
Result := fModifiers.R;
end;
function TRegExpr.GetModifierS: boolean;
begin
Result := fModifiers.S;
end;
function TRegExpr.GetModifierX: boolean;
begin
Result := fModifiers.X;
end;
function TRegExpr.CompileRegExpr(ARegExp: PRegExprChar): boolean;
// Compile a regular expression into internal code
// We can't allocate space until we know how big the compiled form will be,
// but we can't compile it (and thus know how big it is) until we've got a
// place to put the code. So we cheat: we compile it twice, once with code
// generation turned off and size counting turned on, and once "for real".
// This also means that we don't allocate space until we are sure that the
// thing really will compile successfully, and we never have to move the
// code and thus invalidate pointers into it. (Note that it has to be in
// one piece because free() must be able to free it all.)
// Beware that the optimization-preparation code in here knows about some
// of the structure of the compiled regexp.
var
scan, longest, longestTemp: PRegExprChar;
Len, LenTemp: integer;
FlagTemp: integer;
begin
Result := False;
FlagTemp := 0;
regParse := nil; // for correct error handling
regExactlyLen := nil;
ClearInternalIndexes;
fLastError := reeOk;
fLastErrorOpcode := TREOp(0);
if Assigned(fHelper) then
FreeAndNil(fHelper);
fHelperLen := 0;
try
if programm <> nil then
begin
FreeMem(programm);
programm := nil;
end;
if ARegExp = nil then
begin
Error(reeCompNullArgument);
Exit;
end;
fProgModifiers := fModifiers;
// well, may it's paranoia. I'll check it later.
// First pass: calculate opcode size, validate regex
fSecondPass := False;
fCompModifiers := fModifiers;
regParse := ARegExp;
regNumBrackets := 1;
regCodeSize := 0;
regCode := @regDummy;
regCodeWork := nil;
regLookahead := False;
regLookaheadNeg := False;
regLookaheadGroup := -1;
regLookbehind := False;
EmitC(OP_MAGIC);
if ParseReg(False, FlagTemp) = nil then
Exit;
// Allocate memory
GetMem(programm, regCodeSize * SizeOf(REChar));
// Second pass: emit opcode
fSecondPass := True;
fCompModifiers := fModifiers;
regParse := ARegExp;
regNumBrackets := 1;
regCode := programm;
regCodeWork := programm + REOpSz;
EmitC(OP_MAGIC);
if ParseReg(False, FlagTemp) = nil then
Exit;
// Dig out information for optimizations.
{$IFDEF UseFirstCharSet} // ###0.929
FirstCharSet := [];
FillFirstCharSet(regCodeWork);
for Len := 0 to 255 do
FirstCharArray[Len] := byte(Len) in FirstCharSet;
{$ENDIF}
regAnchored := #0;
regMust := nil;
regMustLen := 0;
regMustString := '';
scan := regCodeWork; // First OP_BRANCH.
if PREOp(regNext(scan))^ = OP_EEND then
begin // Only one top-level choice.
scan := scan + REOpSz + RENextOffSz;
// Starting-point info.
if PREOp(scan)^ = OP_BOL then
Inc(regAnchored);
// If there's something expensive in the r.e., find the longest
// literal string that must appear and make it the regMust. Resolve
// ties in favor of later strings, since the regstart check works
// with the beginning of the r.e. and avoiding duplication
// strengthens checking. Not a strong reason, but sufficient in the
// absence of others.
if (FlagTemp and FLAG_SPECSTART) <> 0 then
begin
longest := nil;
Len := 0;
while scan <> nil do
begin
if PREOp(scan)^ = OP_EXACTLY then
begin
longestTemp := scan + REOpSz + RENextOffSz + RENumberSz;
LenTemp := PLongInt(scan + REOpSz + RENextOffSz)^;
if LenTemp >= Len then
begin
longest := longestTemp;
Len := LenTemp;
end;
end;
scan := regNext(scan);
end;
regMust := longest;
regMustLen := Len;
if regMustLen > 1 then // don't use regMust if too short
SetString(regMustString, regMust, regMustLen);
end;
end;
Result := True;
finally
begin
if not Result then
InvalidateProgramm;
end;
end;
end; { of function TRegExpr.CompileRegExpr
-------------------------------------------------------------- }
function TRegExpr.ParseReg(InBrackets: boolean; var FlagParse: integer): PRegExprChar;
// regular expression, i.e. main body or parenthesized thing
// Caller must absorb opening parenthesis.
// Combining parenthesis handling with the base level of regular expression
// is a trifle forced, but the need to tie the tails of the branches to what
// follows makes it hard to avoid.
var
ret, br, ender: PRegExprChar;
NBrackets: integer;
FlagTemp: integer;
SavedModifiers: TRegExprModifiers;
begin
Result := nil;
FlagTemp := 0;
FlagParse := FLAG_HASWIDTH; // Tentatively.
NBrackets := 0;
SavedModifiers := fCompModifiers;
// Make an OP_OPEN node, if parenthesized.
if InBrackets then
begin
if regNumBrackets >= RegexMaxGroups then
begin
Error(reeCompParseRegTooManyBrackets);
Exit;
end;
NBrackets := regNumBrackets;
Inc(regNumBrackets);
ret := EmitNode(TREOp(Ord(OP_OPEN) + NBrackets));
GrpOpCodes[NBrackets] := ret;
end
else
ret := nil;
// Pick up the branches, linking them together.
br := ParseBranch(FlagTemp);
if br = nil then
begin
Result := nil;
Exit;
end;
if ret <> nil then
Tail(ret, br) // OP_OPEN -> first.
else
ret := br;
if (FlagTemp and FLAG_HASWIDTH) = 0 then
FlagParse := FlagParse and not FLAG_HASWIDTH;
FlagParse := FlagParse or FlagTemp and FLAG_SPECSTART;
while (regParse^ = '|') do
begin
Inc(regParse);
br := ParseBranch(FlagTemp);
if br = nil then
begin
Result := nil;
Exit;
end;
Tail(ret, br); // OP_BRANCH -> OP_BRANCH.
if (FlagTemp and FLAG_HASWIDTH) = 0 then
FlagParse := FlagParse and not FLAG_HASWIDTH;
FlagParse := FlagParse or FlagTemp and FLAG_SPECSTART;
end;
// Make a closing node, and hook it on the end.
if InBrackets then
ender := EmitNode(TREOp(Ord(OP_CLOSE) + NBrackets))
else
ender := EmitNode(OP_EEND);
Tail(ret, ender);
// Hook the tails of the branches to the closing node.
br := ret;
while br <> nil do
begin
OpTail(br, ender);
br := regNext(br);
end;
// Check for proper termination.
if InBrackets then
if regParse^ <> ')' then
begin
Error(reeCompParseRegUnmatchedBrackets);
Exit;
end
else
Inc(regParse); // skip trailing ')'
if (not InBrackets) and (regParse < fRegexEnd) then
begin
if regParse^ = ')' then
Error(reeCompParseRegUnmatchedBrackets2)
else
Error(reeCompParseRegJunkOnEnd);
Exit;
end;
fCompModifiers := SavedModifiers; // restore modifiers of parent
Result := ret;
end; { of function TRegExpr.ParseReg
-------------------------------------------------------------- }
function TRegExpr.ParseBranch(var FlagParse: integer): PRegExprChar;
// one alternative of an | operator
// Implements the concatenation operator.
var
ret, chain, latest: PRegExprChar;
FlagTemp: integer;
begin
FlagTemp := 0;
FlagParse := FLAG_WORST; // Tentatively.
ret := EmitNode(OP_BRANCH);
chain := nil;
while (regParse < fRegexEnd) and (regParse^ <> '|') and (regParse^ <> ')') do
begin
latest := ParsePiece(FlagTemp);
if latest = nil then
begin
Result := nil;
Exit;
end;
FlagParse := FlagParse or FlagTemp and FLAG_HASWIDTH;
if chain = nil // First piece.
then
FlagParse := FlagParse or FlagTemp and FLAG_SPECSTART
else
Tail(chain, latest);
chain := latest;
end;
if chain = nil // Loop ran zero times.
then
EmitNode(OP_NOTHING);
Result := ret;
end; { of function TRegExpr.ParseBranch
-------------------------------------------------------------- }
function TRegExpr.ParsePiece(var FlagParse: integer): PRegExprChar;
// something followed by possible [*+?{]
// Note that the branching code sequences used for ? and the general cases
// of * and + and { are somewhat optimized: they use the same OP_NOTHING node as
// both the endmarker for their branch list and the body of the last branch.
// It might seem that this node could be dispensed with entirely, but the
// endmarker role is not redundant.
function ParseNumber(AStart, AEnd: PRegExprChar): TREBracesArg;
begin
Result := 0;
if AEnd - AStart + 1 > 8 then
begin // prevent stupid scanning
Error(reeBRACESArgTooBig);
Exit;
end;
while AStart <= AEnd do
begin
Result := Result * 10 + (Ord(AStart^) - Ord('0'));
Inc(AStart);
end;
if (Result > MaxBracesArg) or (Result < 0) then
begin
Error(reeBRACESArgTooBig);
Exit;
end;
end;
var
TheOp: TREOp;
NextNode: PRegExprChar;
procedure EmitComplexBraces(ABracesMin, ABracesMax: TREBracesArg; ANonGreedyOp: boolean); // ###0.940
{$IFDEF ComplexBraces}
var
off: TRENextOff;
{$ENDIF}
begin
{$IFNDEF ComplexBraces}
Error(reeComplexBracesNotImplemented);
{$ELSE}
if ANonGreedyOp then
TheOp := OP_LOOPNG
else
TheOp := OP_LOOP;
InsertOperator(OP_LOOPENTRY, Result, REOpSz + RENextOffSz);
NextNode := EmitNode(TheOp);
if regCode <> @regDummy then
begin
off := (Result + REOpSz + RENextOffSz) - (regCode - REOpSz - RENextOffSz);
// back to Atom after OP_LOOPENTRY
PREBracesArg(AlignToInt(regCode))^ := ABracesMin;
Inc(regCode, REBracesArgSz);
PREBracesArg(AlignToInt(regCode))^ := ABracesMax;
Inc(regCode, REBracesArgSz);
PRENextOff(AlignToPtr(regCode))^ := off;
Inc(regCode, RENextOffSz);
{$IFDEF DebugSynRegExpr}
if regcode - programm > regsize then
raise Exception.Create
('TRegExpr.ParsePiece.EmitComplexBraces buffer overrun');
{$ENDIF}
end
else
Inc(regCodeSize, REBracesArgSz * 2 + RENextOffSz);
Tail(Result, NextNode); // OP_LOOPENTRY -> OP_LOOP
if regCode <> @regDummy then
Tail(Result + REOpSz + RENextOffSz, NextNode); // Atom -> OP_LOOP
{$ENDIF}
end;
procedure EmitSimpleBraces(ABracesMin, ABracesMax: TREBracesArg; ANonGreedyOp, APossessive: boolean);
begin
if APossessive then
TheOp := OP_BRACES_POSS
else
if ANonGreedyOp then
TheOp := OP_BRACESNG
else
TheOp := OP_BRACES;
InsertOperator(TheOp, Result, REOpSz + RENextOffSz + REBracesArgSz * 2);
if regCode <> @regDummy then
begin
PREBracesArg(AlignToInt(Result + REOpSz + RENextOffSz))^ := ABracesMin;
PREBracesArg(AlignToInt(Result + REOpSz + RENextOffSz + REBracesArgSz))^ := ABracesMax;
end;
end;
var
op, nextch: REChar;
NonGreedyOp, NonGreedyCh, PossessiveCh: boolean;
FlagTemp: integer;
BracesMin, BracesMax: TREBracesArg;
p: PRegExprChar;
begin
FlagTemp := 0;
Result := ParseAtom(FlagTemp);
if Result = nil then
Exit;
op := regParse^;
if not ((op = '*') or (op = '+') or (op = '?') or (op = '{')) then
begin
FlagParse := FlagTemp;
Exit;
end;
if ((FlagTemp and FLAG_HASWIDTH) = 0) and (op <> '?') then
begin
Error(reePlusStarOperandCouldBeEmpty);
Exit;
end;
case op of
'*':
begin
FlagParse := FLAG_WORST or FLAG_SPECSTART;
nextch := (regParse + 1)^;
PossessiveCh := nextch = '+';
if PossessiveCh then
begin
NonGreedyCh := False;
NonGreedyOp := False;
end
else
begin
NonGreedyCh := nextch = '?';
NonGreedyOp := NonGreedyCh or not fCompModifiers.G;
end;
if (FlagTemp and FLAG_SIMPLE) = 0 then
begin
if NonGreedyOp then
EmitComplexBraces(0, MaxBracesArg, NonGreedyOp)
else
begin // Emit x* as (x&|), where & means "self".
InsertOperator(OP_BRANCH, Result, REOpSz + RENextOffSz); // Either x
OpTail(Result, EmitNode(OP_BACK)); // and loop
OpTail(Result, Result); // back
Tail(Result, EmitNode(OP_BRANCH)); // or
Tail(Result, EmitNode(OP_NOTHING)); // nil.
end
end
else
begin // Simple
if PossessiveCh then
TheOp := OP_STAR_POSS
else
if NonGreedyOp then
TheOp := OP_STARNG
else
TheOp := OP_STAR;
InsertOperator(TheOp, Result, REOpSz + RENextOffSz);
end;
if NonGreedyCh or PossessiveCh then
Inc(regParse); // Skip extra char ('?')
end; { of case '*' }
'+':
begin
FlagParse := FLAG_WORST or FLAG_SPECSTART or FLAG_HASWIDTH;
nextch := (regParse + 1)^;
PossessiveCh := nextch = '+';
if PossessiveCh then
begin
NonGreedyCh := False;
NonGreedyOp := False;
end
else
begin
NonGreedyCh := nextch = '?';
NonGreedyOp := NonGreedyCh or not fCompModifiers.G;
end;
if (FlagTemp and FLAG_SIMPLE) = 0 then
begin
if NonGreedyOp then
EmitComplexBraces(1, MaxBracesArg, NonGreedyOp)
else
begin // Emit x+ as x(&|), where & means "self".
NextNode := EmitNode(OP_BRANCH); // Either
Tail(Result, NextNode);
Tail(EmitNode(OP_BACK), Result); // loop back
Tail(NextNode, EmitNode(OP_BRANCH)); // or
Tail(Result, EmitNode(OP_NOTHING)); // nil.
end
end
else
begin // Simple
if PossessiveCh then
TheOp := OP_PLUS_POSS
else
if NonGreedyOp then
TheOp := OP_PLUSNG
else
TheOp := OP_PLUS;
InsertOperator(TheOp, Result, REOpSz + RENextOffSz);
end;
if NonGreedyCh or PossessiveCh then
Inc(regParse); // Skip extra char ('?')
end; { of case '+' }
'?':
begin
FlagParse := FLAG_WORST;
nextch := (regParse + 1)^;
PossessiveCh := nextch = '+';
if PossessiveCh then
begin
NonGreedyCh := False;
NonGreedyOp := False;
end
else
begin
NonGreedyCh := nextch = '?';
NonGreedyOp := NonGreedyCh or not fCompModifiers.G;
end;
if NonGreedyOp or PossessiveCh then
begin // ###0.940 // We emit x?? as x{0,1}?
if (FlagTemp and FLAG_SIMPLE) = 0 then
begin
if PossessiveCh then
Error(reePossessiveAfterComplexBraces);
EmitComplexBraces(0, 1, NonGreedyOp);
end
else
EmitSimpleBraces(0, 1, NonGreedyOp, PossessiveCh);
end
else
begin // greedy '?'
InsertOperator(OP_BRANCH, Result, REOpSz + RENextOffSz); // Either x
Tail(Result, EmitNode(OP_BRANCH)); // or
NextNode := EmitNode(OP_NOTHING); // nil.
Tail(Result, NextNode);
OpTail(Result, NextNode);
end;
if NonGreedyCh or PossessiveCh then
Inc(regParse); // Skip extra char ('?')
end; { of case '?' }
'{':
begin
Inc(regParse);
p := regParse;
while IsDigitChar(regParse^) do // <min> MUST appear
Inc(regParse);
if (regParse^ <> '}') and (regParse^ <> ',') or (p = regParse) then
begin
Error(reeIncorrectBraces);
Exit;
end;
BracesMin := ParseNumber(p, regParse - 1);
if regParse^ = ',' then
begin
Inc(regParse);
p := regParse;
while IsDigitChar(regParse^) do
Inc(regParse);
if regParse^ <> '}' then
begin
Error(reeIncorrectBraces);
Exit;
end;
if p = regParse then
BracesMax := MaxBracesArg
else
BracesMax := ParseNumber(p, regParse - 1);
end
else
BracesMax := BracesMin; // {n} == {n,n}
if BracesMin > BracesMax then
begin
Error(reeBracesMinParamGreaterMax);
Exit;
end;
if BracesMin > 0 then
FlagParse := FLAG_WORST;
if BracesMax > 0 then
FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SPECSTART;
nextch := (regParse + 1)^;
PossessiveCh := nextch = '+';
if PossessiveCh then
begin
NonGreedyCh := False;
NonGreedyOp := False;
end
else
begin
NonGreedyCh := nextch = '?';
NonGreedyOp := NonGreedyCh or not fCompModifiers.G;
end;
if (FlagTemp and FLAG_SIMPLE) <> 0 then
EmitSimpleBraces(BracesMin, BracesMax, NonGreedyOp, PossessiveCh)
else
begin
if PossessiveCh then
Error(reePossessiveAfterComplexBraces);
EmitComplexBraces(BracesMin, BracesMax, NonGreedyOp);
end;
if NonGreedyCh or PossessiveCh then
Inc(regParse); // Skip extra char '?'
end; // of case '{'
// else // here we can't be
end; { of case op }
Inc(regParse);
op := regParse^;
if (op = '*') or (op = '+') or (op = '?') or (op = '{') then
Error(reeNestedQuantif);
end; { of function TRegExpr.ParsePiece
-------------------------------------------------------------- }
function TRegExpr.HexDig(Ch: REChar): integer;
begin
case Ch of
'0' .. '9':
Result := Ord(Ch) - Ord('0');
'a' .. 'f':
Result := Ord(Ch) - Ord('a') + 10;
'A' .. 'F':
Result := Ord(Ch) - Ord('A') + 10;
else
begin
Result := 0;
Error(reeBadHexDigit);
end;
end;
end;
function TRegExpr.UnQuoteChar(var APtr: PRegExprChar): REChar;
var
Ch: REChar;
begin
case APtr^ of
't':
Result := #$9; // \t => tab (HT/TAB)
'n':
Result := #$a; // \n => newline (NL)
'r':
Result := #$d; // \r => carriage return (CR)
'f':
Result := #$c; // \f => form feed (FF)
'a':
Result := #$7; // \a => alarm (bell) (BEL)
'e':
Result := #$1b; // \e => escape (ESC)
'c':
begin // \cK => code for Ctrl+K
Result := #0;
Inc(APtr);
if APtr >= fRegexEnd then
Error(reeNoLetterAfterBSlashC);
Ch := APtr^;
case Ch of
'a' .. 'z':
Result := REChar(Ord(Ch) - Ord('a') + 1);
'A' .. 'Z':
Result := REChar(Ord(Ch) - Ord('A') + 1);
else
Error(reeNoLetterAfterBSlashC);
end;
end;
'x':
begin // \x: hex char
Result := #0;
Inc(APtr);
if APtr >= fRegexEnd then
begin
Error(reeNoHexCodeAfterBSlashX);
Exit;
end;
if APtr^ = '{' then
begin // \x{nnnn} //###0.936
repeat
Inc(APtr);
if APtr >= fRegexEnd then
begin
Error(reeNoHexCodeAfterBSlashX);
Exit;
end;
if APtr^ <> '}' then
begin
if (Ord(Result) ShR (SizeOf(REChar) * 8 - 4)) and $F <> 0 then
begin
Error(reeHexCodeAfterBSlashXTooBig);
Exit;
end;
Result := REChar((Ord(Result) ShL 4) or HexDig(APtr^));
// HexDig will cause Error if bad hex digit found
end
else
Break;
until False;
end
else
begin
Result := REChar(HexDig(APtr^));
// HexDig will cause Error if bad hex digit found
Inc(APtr);
if APtr >= fRegexEnd then
begin
Error(reeNoHexCodeAfterBSlashX);
Exit;
end;
Result := REChar((Ord(Result) ShL 4) or HexDig(APtr^));
// HexDig will cause Error if bad hex digit found
end;
end;
else
begin
Result := APtr^;
if (Result <> '_') and IsWordChar(Result) then
begin
fLastErrorSymbol := Result;
Error(reeUnknownMetaSymbol);
end;
end;
end;
end;
function TRegExpr.ParseAtom(var FlagParse: integer): PRegExprChar;
// the lowest level
// Optimization: gobbles an entire sequence of ordinary characters so that
// it can turn them into a single node, which is smaller to store and
// faster to run. Backslashed characters are exceptions, each becoming a
// separate node; the code is simpler that way and it's not worth fixing.
var
ret: PRegExprChar;
RangeBeg, RangeEnd: REChar;
CanBeRange: boolean;
AddrOfLen: PLongInt;
procedure EmitExactly(Ch: REChar);
begin
if fCompModifiers.I then
ret := EmitNode(OP_EXACTLYCI)
else
ret := EmitNode(OP_EXACTLY);
EmitInt(1);
EmitC(Ch);
FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SIMPLE;
end;
procedure EmitRangeChar(Ch: REChar; AStartOfRange: boolean);
begin
CanBeRange := AStartOfRange;
if fCompModifiers.I then
Ch := _UpperCase(Ch);
if AStartOfRange then
begin
AddrOfLen := nil;
RangeBeg := Ch;
end
else
begin
if AddrOfLen = nil then
begin
EmitC(OpKind_Char);
Pointer(AddrOfLen) := regCode;
EmitInt(0);
end;
Inc(AddrOfLen^);
EmitC(Ch);
end;
end;
procedure EmitRangePacked(ch1, ch2: REChar);
var
ChkIndex: integer;
begin
AddrOfLen := nil;
CanBeRange := False;
if fCompModifiers.I then
begin
ch1 := _UpperCase(ch1);
ch2 := _UpperCase(ch2);
end;
for ChkIndex := Low(CharCheckerInfos) to High(CharCheckerInfos) do
if (CharCheckerInfos[ChkIndex].CharBegin = ch1) and
(CharCheckerInfos[ChkIndex].CharEnd = ch2) then
begin
EmitC(OpKind_MetaClass);
EmitC(REChar(CharCheckerInfos[ChkIndex].CheckerIndex));
Exit;
end;
EmitC(OpKind_Range);
EmitC(ch1);
EmitC(ch2);
end;
{$IFDEF FastUnicodeData}
procedure EmitCategoryInCharClass(APositive: boolean);
var
ch, ch2: REChar;
begin
AddrOfLen := nil;
CanBeRange := False;
Inc(regParse);
FindCategoryName(regParse, ch, ch2);
if APositive then
EmitC(OpKind_CategoryYes)
else
EmitC(OpKind_CategoryNo);
EmitC(ch);
EmitC(ch2);
end;
{$ENDIF}
var
FlagTemp: integer;
Len: integer;
SavedPtr: PRegExprChar;
EnderChar, TempChar: REChar;
DashForRange: Boolean;
GrpKind: TREGroupKind;
GrpName: RegExprString;
GrpIndex: integer;
NextCh: REChar;
begin
Result := nil;
FlagTemp := 0;
FlagParse := FLAG_WORST;
AddrOfLen := nil;
Inc(regParse);
case (regParse - 1)^ of
'^':
begin
if not fCompModifiers.M
{$IFDEF UseLineSep} or (fLineSeparators = '') {$ENDIF} then
ret := EmitNode(OP_BOL)
else
ret := EmitNode(OP_BOLML);
end;
'$':
begin
if not fCompModifiers.M
{$IFDEF UseLineSep} or (fLineSeparators = '') {$ENDIF} then
ret := EmitNode(OP_EOL)
else
ret := EmitNode(OP_EOLML);
end;
'.':
begin
if fCompModifiers.S then
begin
ret := EmitNode(OP_ANY);
FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SIMPLE;
end
else
begin // not /s, so emit [^:LineSeparators:]
ret := EmitNode(OP_ANYML);
FlagParse := FlagParse or FLAG_HASWIDTH; // not so simple ;)
end;
end;
'[':
begin
if regParse^ = '^' then
begin // Complement of range.
if fCompModifiers.I then
ret := EmitNode(OP_ANYBUTCI)
else
ret := EmitNode(OP_ANYBUT);
Inc(regParse);
end
else if fCompModifiers.I then
ret := EmitNode(OP_ANYOFCI)
else
ret := EmitNode(OP_ANYOF);
CanBeRange := False;
if regParse^ = ']' then
begin
// first ']' inside [] treated as simple char, no need to check '['
EmitRangeChar(regParse^, (regParse + 1)^ = '-');
Inc(regParse);
end;
while (regParse < fRegexEnd) and (regParse^ <> ']') do
begin
// last '-' inside [] treated as simple dash
if (regParse^ = '-') and
((regParse + 1) < fRegexEnd) and
((regParse + 1)^ = ']') then
begin
EmitRangeChar('-', False);
Inc(regParse);
Break;
end;
// char '-' which (maybe) makes a range
if (regParse^ = '-') and ((regParse + 1) < fRegexEnd) and CanBeRange then
begin
Inc(regParse);
RangeEnd := regParse^;
if RangeEnd = EscChar then
begin
if _IsMetaChar((regParse + 1)^) then
begin
Error(reeMetaCharAfterMinusInRange);
Exit;
end;
Inc(regParse);
RangeEnd := UnQuoteChar(regParse);
end;
// special handling for Russian range a-YA, add 2 ranges: a-ya and A-YA
if fCompModifiers.R and
(RangeBeg = RusRangeLoLow) and (RangeEnd = RusRangeHiHigh) then
begin
EmitRangePacked(RusRangeLoLow, RusRangeLoHigh);
EmitRangePacked(RusRangeHiLow, RusRangeHiHigh);
end
else
begin // standard r.e. handling
if RangeBeg > RangeEnd then
begin
Error(reeInvalidRange);
Exit;
end;
EmitRangePacked(RangeBeg, RangeEnd);
end;
Inc(regParse);
end
else
begin
if regParse^ = EscChar then
begin
Inc(regParse);
if regParse >= fRegexEnd then
begin
Error(reeParseAtomTrailingBackSlash);
Exit;
end;
if _IsMetaChar(regParse^) then
begin
AddrOfLen := nil;
CanBeRange := False;
EmitC(OpKind_MetaClass);
case regParse^ of
'w':
EmitC(REChar(CheckerIndex_Word));
'W':
EmitC(REChar(CheckerIndex_NotWord));
's':
EmitC(REChar(CheckerIndex_Space));
'S':
EmitC(REChar(CheckerIndex_NotSpace));
'd':
EmitC(REChar(CheckerIndex_Digit));
'D':
EmitC(REChar(CheckerIndex_NotDigit));
'v':
EmitC(REChar(CheckerIndex_VertSep));
'V':
EmitC(REChar(CheckerIndex_NotVertSep));
'h':
EmitC(REChar(CheckerIndex_HorzSep));
'H':
EmitC(REChar(CheckerIndex_NotHorzSep));
else
Error(reeBadOpcodeInCharClass);
end;
end
else
{$IFDEF FastUnicodeData}
if regParse^ = 'p' then
EmitCategoryInCharClass(True)
else
if regParse^ = 'P' then
EmitCategoryInCharClass(False)
else
{$ENDIF}
begin
TempChar := UnQuoteChar(regParse);
// False if '-' is last char in []
DashForRange :=
(regParse + 2 < fRegexEnd) and
((regParse + 1)^ = '-') and
((regParse + 2)^ <> ']');
EmitRangeChar(TempChar, DashForRange);
end;
end
else
begin
// False if '-' is last char in []
DashForRange :=
(regParse + 2 < fRegexEnd) and
((regParse + 1)^ = '-') and
((regParse + 2)^ <> ']');
EmitRangeChar(regParse^, DashForRange);
end;
Inc(regParse);
end;
end; { of while }
AddrOfLen := nil;
CanBeRange := False;
EmitC(OpKind_End);
if regParse^ <> ']' then
begin
Error(reeUnmatchedSqBrackets);
Exit;
end;
Inc(regParse);
FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SIMPLE;
end;
'(':
begin
GrpKind := gkNormalGroup;
GrpName := '';
// A: detect kind of expression in brackets
if regParse^ = '?' then
begin
NextCh := (regParse + 1)^;
case NextCh of
':':
begin
// non-capturing group: (?:regex)
GrpKind := gkNonCapturingGroup;
Inc(regParse, 2);
end;
'>':
begin
// atomic group: (?>regex)
GrpKind := gkNonCapturingGroup;
Inc(regParse, 2);
GrpAtomic[regNumBrackets] := True;
end;
'P':
begin
if (regParse + 4 >= fRegexEnd) then
Error(reeNamedGroupBad);
case (regParse + 2)^ of
'<':
begin
// named group: (?P<name>regex)
GrpKind := gkNormalGroup;
FindGroupName(regParse + 3, fRegexEnd, '>', GrpName);
Inc(regParse, Length(GrpName) + 4);
end;
'=':
begin
// back-reference to named group: (?P=name)
GrpKind := gkNamedGroupReference;
FindGroupName(regParse + 3, fRegexEnd, ')', GrpName);
Inc(regParse, Length(GrpName) + 4);
end;
'>':
begin
// subroutine call to named group: (?P>name)
GrpKind := gkSubCall;
FindGroupName(regParse + 3, fRegexEnd, ')', GrpName);
Inc(regParse, Length(GrpName) + 4);
GrpIndex := MatchIndexFromName(GrpName);
if GrpIndex < 1 then
Error(reeNamedGroupBadRef);
end;
else
Error(reeNamedGroupBad);
end;
end;
'<':
begin
// lookbehind: (?<=foo)bar
if (regParse + 4 >= fRegexEnd) then
Error(reeLookbehindBad);
case (regParse + 2)^ of
'=':
begin
// allow lookbehind only at the beginning
if regParse <> fRegexStart + 1 then
Error(reeLookaroundNotAtEdge);
GrpKind := gkLookbehind;
GrpAtomic[regNumBrackets] := RegExprLookbehindIsAtomic;
regLookbehind := True;
Inc(regParse, 3);
end;
'!':
begin
// allow lookbehind only at the beginning
if regParse <> fRegexStart + 1 then
Error(reeLookaroundNotAtEdge);
GrpKind := gkLookbehindNeg;
Inc(regParse, 3);
SavedPtr := _FindClosingBracket(regParse, fRegexEnd);
if SavedPtr = nil then
Error(reeCompParseRegUnmatchedBrackets);
// for '(?<!foo)bar', we make our regex 'bar' and make Helper object with 'foo'
if not fSecondPass then
begin
Len := SavedPtr - fRegexStart - 4;
if Len = 0 then
Error(reeLookbehindBad);
if fHelper = nil then
fHelper := TRegExpr.Create;
fHelper.Expression := Copy(fExpression, 5, Len);
try
fHelper.Compile;
except
Len := fHelper.LastError;
FreeAndNil(fHelper);
Error(Len);
end;
if fHelper.IsFixedLength(TempChar, Len) then
fHelperLen := Len
else
begin
FreeAndNil(fHelper);
Error(reeLookbehindTooComplex);
end;
end;
// jump to closing bracket, don't make opcode for (?<!foo)
regParse := SavedPtr + 1;
end;
else
Error(reeLookbehindBad);
end;
end;
'=', '!':
begin
// lookaheads: foo(?=bar) and foo(?!bar)
if (regParse + 3 >= fRegexEnd) then
Error(reeLookaheadBad);
regLookahead := True;
regLookaheadGroup := regNumBrackets;
if NextCh = '=' then
begin
GrpKind := gkLookahead;
end
else
begin
GrpKind := gkLookaheadNeg;
regLookaheadNeg := True;
end;
GrpAtomic[regNumBrackets] := RegExprLookaheadIsAtomic;
// check that these brackets are last in regex
SavedPtr := _FindClosingBracket(regParse + 1, fRegexEnd);
if (SavedPtr <> fRegexEnd - 1) then
Error(reeLookaroundNotAtEdge);
Inc(regParse, 2);
end;
'#':
begin
// (?#comment)
GrpKind := gkComment;
Inc(regParse, 2);
end;
'a'..'z', '-':
begin
// modifiers string like (?mxr)
GrpKind := gkModifierString;
Inc(regParse);
end;
'R', '0':
begin
// recursion (?R), (?0)
GrpKind := gkRecursion;
Inc(regParse, 2);
if regParse^ <> ')' then
Error(reeBadRecursion);
Inc(regParse);
end;
'1'..'9':
begin
// subroutine call (?1)..(?99)
GrpKind := gkSubCall;
GrpIndex := Ord(NextCh) - Ord('0');
Inc(regParse, 2);
// support 2-digit group numbers
case regParse^ of
')':
begin
Inc(regParse);
end;
'0'..'9':
begin
GrpIndex := GrpIndex * 10 + Ord(regParse^) - Ord('0');
if GrpIndex >= RegexMaxGroups then
Error(reeBadSubCall);
Inc(regParse);
if regParse^ <> ')' then
Error(reeBadSubCall);
Inc(regParse);
end
else
Error(reeBadRecursion);
end;
end;
'''':
begin
// named group: (?'name'regex)
if (regParse + 4 >= fRegexEnd) then
Error(reeNamedGroupBad);
GrpKind := gkNormalGroup;
FindGroupName(regParse + 2, fRegexEnd, '''', GrpName);
Inc(regParse, Length(GrpName) + 3);
end;
'&':
begin
// subroutine call to named group: (?&name)
if (regParse + 2 >= fRegexEnd) then
Error(reeBadSubCall);
GrpKind := gkSubCall;
FindGroupName(regParse + 2, fRegexEnd, ')', GrpName);
Inc(regParse, Length(GrpName) + 3);
GrpIndex := MatchIndexFromName(GrpName);
if GrpIndex < 1 then
Error(reeNamedGroupBadRef);
end;
else
Error(reeIncorrectSpecialBrackets);
end;
end;
// B: process found kind of brackets
case GrpKind of
gkNormalGroup,
gkNonCapturingGroup,
gkLookahead,
gkLookaheadNeg,
gkLookbehind:
begin
// skip this block for one of passes, to not double groups count;
// must take first pass (we need GrpNames filled)
if (GrpKind = gkNormalGroup) and not fSecondPass then
if GrpCount < RegexMaxGroups - 1 then
begin
Inc(GrpCount);
GrpIndexes[GrpCount] := regNumBrackets;
if GrpName <> '' then
begin
if MatchIndexFromName(GrpName) >= 0 then
Error(reeNamedGroupDupName);
GrpNames[GrpCount] := GrpName;
end;
end;
ret := ParseReg(True, FlagTemp);
if ret = nil then
begin
Result := nil;
Exit;
end;
FlagParse := FlagParse or FlagTemp and (FLAG_HASWIDTH or FLAG_SPECSTART);
end;
gkLookbehindNeg:
begin
// don't make opcode
ret := EmitNode(OP_COMMENT);
FlagParse := FLAG_WORST;
end;
gkNamedGroupReference:
begin
Len := MatchIndexFromName(GrpName);
if Len < 0 then
Error(reeNamedGroupBadRef);
ret := EmitGroupRef(Len, fCompModifiers.I);
FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SIMPLE;
end;
gkModifierString:
begin
SavedPtr := regParse;
while (regParse < fRegexEnd) and (regParse^ <> ')') do
Inc(regParse);
if (regParse^ <> ')') or
not ParseModifiers(SavedPtr, regParse - SavedPtr, fCompModifiers) then
begin
Error(reeUnrecognizedModifier);
Exit;
end;
Inc(regParse); // skip ')'
ret := EmitNode(OP_COMMENT); // comment
// Error (reeQuantifFollowsNothing);
// Exit;
end;
gkComment:
begin
while (regParse < fRegexEnd) and (regParse^ <> ')') do
Inc(regParse);
if regParse^ <> ')' then
begin
Error(reeUnclosedComment);
Exit;
end;
Inc(regParse); // skip ')'
ret := EmitNode(OP_COMMENT); // comment
end;
gkRecursion:
begin
// set FLAG_HASWIDTH to allow compiling of such regex: b(?:m|(?R))*e
FlagParse := FlagParse or FLAG_HASWIDTH;
ret := EmitNode(OP_RECUR);
end;
gkSubCall:
begin
// set FLAG_HASWIDTH like for (?R)
FlagParse := FlagParse or FLAG_HASWIDTH;
ret := EmitNode(TReOp(Ord(OP_SUBCALL) + GrpIndex));
end;
end; // case GrpKind of
end;
'|', ')':
begin // Supposed to be caught earlier.
Error(reeInternalUrp);
Exit;
end;
'?', '+', '*':
begin
Error(reeQuantifFollowsNothing);
Exit;
end;
EscChar:
begin
if regParse >= fRegexEnd then
begin
Error(reeTrailingBackSlash);
Exit;
end;
case regParse^ of
'b':
ret := EmitNode(OP_BOUND);
'B':
ret := EmitNode(OP_NOTBOUND);
'A':
ret := EmitNode(OP_BOL);
'z':
ret := EmitNode(OP_EOL);
'Z':
ret := EmitNode(OP_EOL2);
'd':
begin // r.e.extension - any digit ('0' .. '9')
ret := EmitNode(OP_ANYDIGIT);
FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SIMPLE;
end;
'D':
begin // r.e.extension - not digit ('0' .. '9')
ret := EmitNode(OP_NOTDIGIT);
FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SIMPLE;
end;
's':
begin // r.e.extension - any space char
ret := EmitNode(OP_ANYSPACE);
FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SIMPLE;
end;
'S':
begin // r.e.extension - not space char
ret := EmitNode(OP_NOTSPACE);
FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SIMPLE;
end;
'w':
begin // r.e.extension - any english char / digit / '_'
ret := EmitNode(OP_ANYLETTER);
FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SIMPLE;
end;
'W':
begin // r.e.extension - not english char / digit / '_'
ret := EmitNode(OP_NOTLETTER);
FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SIMPLE;
end;
'v':
begin
ret := EmitNode(OP_ANYVERTSEP);
FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SIMPLE;
end;
'V':
begin
ret := EmitNode(OP_NOTVERTSEP);
FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SIMPLE;
end;
'h':
begin
ret := EmitNode(OP_ANYHORZSEP);
FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SIMPLE;
end;
'H':
begin
ret := EmitNode(OP_NOTHORZSEP);
FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SIMPLE;
end;
'1' .. '9':
begin
ret := EmitGroupRef(Ord(regParse^) - Ord('0'), fCompModifiers.I);
FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SIMPLE;
end;
{$IFDEF FastUnicodeData}
'p':
begin
ret := EmitCategoryMain(True);
FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SIMPLE;
end;
'P':
begin
ret := EmitCategoryMain(False);
FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SIMPLE;
end;
{$ENDIF}
else
EmitExactly(UnQuoteChar(regParse));
end; { of case }
Inc(regParse);
end;
else
begin
Dec(regParse);
if fCompModifiers.X and // check for eXtended syntax
((regParse^ = '#') or IsIgnoredChar(regParse^)) then
begin // ###0.941 \x
if regParse^ = '#' then
begin // Skip eXtended comment
// find comment terminator (group of \n and/or \r)
while (regParse < fRegexEnd) and (regParse^ <> #$d) and
(regParse^ <> #$a) do
Inc(regParse);
while (regParse^ = #$d) or (regParse^ = #$a)
// skip comment terminator
do
Inc(regParse);
// attempt to support different type of line separators
end
else
begin // Skip the blanks!
while IsIgnoredChar(regParse^) do
Inc(regParse);
end;
ret := EmitNode(OP_COMMENT); // comment
end
else
begin
Len := FindSkippedMetaLen(regParse, fRegexEnd);
if Len <= 0 then
if regParse^ <> '{' then
begin
Error(reeRarseAtomInternalDisaster);
Exit;
end
else
Len := FindSkippedMetaLen(regParse + 1, fRegexEnd) + 1;
// bad {n,m} - compile as EXACTLY
EnderChar := (regParse + Len)^;
if (Len > 1) and ((EnderChar = '*') or (EnderChar = '+') or (EnderChar = '?') or (EnderChar = '{')) then
Dec(Len); // back off clear of ?+*{ operand.
FlagParse := FlagParse or FLAG_HASWIDTH;
if Len = 1 then
FlagParse := FlagParse or FLAG_SIMPLE;
if fCompModifiers.I then
ret := EmitNode(OP_EXACTLYCI)
else
ret := EmitNode(OP_EXACTLY);
EmitInt(0);
while (Len > 0) and ((not fCompModifiers.X) or (regParse^ <> '#')) do
begin
if not fCompModifiers.X or not IsIgnoredChar(regParse^) then
begin
EmitC(regParse^);
if regCode <> @regDummy then
Inc(regExactlyLen^);
end;
Inc(regParse);
Dec(Len);
end;
end; { of if not comment }
end; { of case else }
end; { of case }
Result := ret;
end; { of function TRegExpr.ParseAtom
-------------------------------------------------------------- }
function TRegExpr.GetCompilerErrorPos: PtrInt;
begin
Result := 0;
if (fRegexStart = nil) or (regParse = nil) then
Exit; // not in compiling mode ?
Result := regParse - fRegexStart;
end; { of function TRegExpr.GetCompilerErrorPos
-------------------------------------------------------------- }
{ ============================================================= }
{ ===================== Matching section ====================== }
{ ============================================================= }
procedure TRegExpr.FindGroupName(APtr, AEndPtr: PRegExprChar; AEndChar: REChar; var AName: RegExprString);
// check that group name is valid identifier, started from non-digit
// this is to be like in Python regex
var
P: PRegExprChar;
begin
P := APtr;
if IsDigitChar(P^) or not IsWordChar(P^) then
Error(reeNamedGroupBadName);
repeat
if P >= AEndPtr then
Error(reeNamedGroupBad);
if P^ = AEndChar then
Break;
if not IsWordChar(P^) then
Error(reeNamedGroupBadName);
Inc(P);
until False;
SetString(AName, APtr, P-APtr);
end;
function TRegExpr.FindRepeated(p: PRegExprChar; AMax: integer): integer;
// repeatedly match something simple, report how many
// p: points to current opcode
var
scan: PRegExprChar;
opnd: PRegExprChar;
TheMax: PtrInt; // PtrInt, gets diff of 2 pointers
InvChar: REChar;
CurStart, CurEnd: PRegExprChar;
ArrayIndex, i: integer;
begin
Result := 0;
scan := regInput; // points into InputString
opnd := p + REOpSz + RENextOffSz; // points to operand of opcode (after OP_nnn code)
TheMax := fInputEnd - scan;
if TheMax > AMax then
TheMax := AMax;
case PREOp(p)^ of
OP_ANY:
begin
// note - OP_ANYML cannot be proceeded in FindRepeated because can skip
// more than one char at once
{$IFDEF UnicodeEx}
for i := 1 to TheMax do
IncUnicode2(scan, Result);
{$ELSE}
Result := TheMax;
Inc(scan, Result);
{$ENDIF}
end;
OP_EXACTLY:
begin // in opnd can be only ONE char !!!
{
// Alexey: commented because of https://github.com/andgineer/TRegExpr/issues/145
NLen := PLongInt(opnd)^;
if TheMax > NLen then
TheMax := NLen;
}
Inc(opnd, RENumberSz);
while (Result < TheMax) and (opnd^ = scan^) do
begin
Inc(Result);
Inc(scan);
end;
end;
OP_EXACTLYCI:
begin // in opnd can be only ONE char !!!
{
// Alexey: commented because of https://github.com/andgineer/TRegExpr/issues/145
NLen := PLongInt(opnd)^;
if TheMax > NLen then
TheMax := NLen;
}
Inc(opnd, RENumberSz);
while (Result < TheMax) and (opnd^ = scan^) do
begin // prevent unneeded InvertCase //###0.931
Inc(Result);
Inc(scan);
end;
if Result < TheMax then
begin // ###0.931
InvChar := InvertCase(opnd^); // store in register
while (Result < TheMax) and ((opnd^ = scan^) or (InvChar = scan^)) do
begin
Inc(Result);
Inc(scan);
end;
end;
end;
OP_BSUBEXP:
begin // ###0.936
ArrayIndex := GrpIndexes[Ord(opnd^)];
if ArrayIndex < 0 then
Exit;
CurStart := GrpStart[ArrayIndex];
if CurStart = nil then
Exit;
CurEnd := GrpEnd[ArrayIndex];
if CurEnd = nil then
Exit;
repeat
opnd := CurStart;
while opnd < CurEnd do
begin
if (scan >= fInputEnd) or (scan^ <> opnd^) then
Exit;
Inc(scan);
Inc(opnd);
end;
Inc(Result);
regInput := scan;
until Result >= AMax;
end;
OP_BSUBEXPCI:
begin // ###0.936
ArrayIndex := GrpIndexes[Ord(opnd^)];
if ArrayIndex < 0 then
Exit;
CurStart := GrpStart[ArrayIndex];
if CurStart = nil then
Exit;
CurEnd := GrpEnd[ArrayIndex];
if CurEnd = nil then
Exit;
repeat
opnd := CurStart;
while opnd < CurEnd do
begin
if (scan >= fInputEnd) or
((scan^ <> opnd^) and (scan^ <> InvertCase(opnd^))) then
Exit;
Inc(scan);
Inc(opnd);
end;
Inc(Result);
regInput := scan;
until Result >= AMax;
end;
OP_ANYDIGIT:
while (Result < TheMax) and IsDigitChar(scan^) do
begin
Inc(Result);
Inc(scan);
end;
OP_NOTDIGIT:
{$IFDEF UNICODEEX}
begin
i := 0;
while (i < TheMax) and not IsDigitChar(scan^) do
begin
Inc(i);
IncUnicode2(scan, Result);
end;
end;
{$ELSE}
while (Result < TheMax) and not IsDigitChar(scan^) do
begin
Inc(Result);
Inc(scan);
end;
{$ENDIF}
OP_ANYLETTER:
while (Result < TheMax) and IsWordChar(scan^) do // ###0.940
begin
Inc(Result);
Inc(scan);
end;
OP_NOTLETTER:
{$IFDEF UNICODEEX}
begin
i := 0;
while (i < TheMax) and not IsWordChar(scan^) do
begin
Inc(i);
IncUnicode2(scan, Result);
end;
end;
{$ELSE}
while (Result < TheMax) and not IsWordChar(scan^) do // ###0.940
begin
Inc(Result);
Inc(scan);
end;
{$ENDIF}
OP_ANYSPACE:
while (Result < TheMax) and IsSpaceChar(scan^) do
begin
Inc(Result);
Inc(scan);
end;
OP_NOTSPACE:
{$IFDEF UNICODEEX}
begin
i := 0;
while (i < TheMax) and not IsSpaceChar(scan^) do
begin
Inc(i);
IncUnicode2(scan, Result);
end;
end;
{$ELSE}
while (Result < TheMax) and not IsSpaceChar(scan^) do
begin
Inc(Result);
Inc(scan);
end;
{$ENDIF}
OP_ANYVERTSEP:
while (Result < TheMax) and IsVertLineSeparator(scan^) do
begin
Inc(Result);
Inc(scan);
end;
OP_NOTVERTSEP:
{$IFDEF UNICODEEX}
begin
i := 0;
while (i < TheMax) and not IsVertLineSeparator(scan^) do
begin
Inc(i);
IncUnicode2(scan, Result);
end;
end;
{$ELSE}
while (Result < TheMax) and not IsVertLineSeparator(scan^) do
begin
Inc(Result);
Inc(scan);
end;
{$ENDIF}
OP_ANYHORZSEP:
while (Result < TheMax) and IsHorzSeparator(scan^) do
begin
Inc(Result);
Inc(scan);
end;
OP_NOTHORZSEP:
{$IFDEF UNICODEEX}
begin
i := 0;
while (i < TheMax) and not IsHorzSeparator(scan^) do
begin
Inc(i);
IncUnicode2(scan, Result);
end;
end;
{$ELSE}
while (Result < TheMax) and not IsHorzSeparator(scan^) do
begin
Inc(Result);
Inc(scan);
end;
{$ENDIF}
OP_ANYOF:
{$IFDEF UNICODEEX}
begin
i := 0;
while (i < TheMax) and FindInCharClass(opnd, scan^, False) do
begin
Inc(i);
IncUnicode2(scan, Result);
end;
end;
{$ELSE}
while (Result < TheMax) and FindInCharClass(opnd, scan^, False) do
begin
Inc(Result);
Inc(scan);
end;
{$ENDIF}
OP_ANYBUT:
{$IFDEF UNICODEEX}
begin
i := 0;
while (i < TheMax) and not FindInCharClass(opnd, scan^, False) do
begin
Inc(i);
IncUnicode2(scan, Result);
end;
end;
{$ELSE}
while (Result < TheMax) and not FindInCharClass(opnd, scan^, False) do
begin
Inc(Result);
Inc(scan);
end;
{$ENDIF}
OP_ANYOFCI:
{$IFDEF UNICODEEX}
begin
i := 0;
while (i < TheMax) and FindInCharClass(opnd, scan^, True) do
begin
Inc(i);
IncUnicode2(scan, Result);
end;
end;
{$ELSE}
while (Result < TheMax) and FindInCharClass(opnd, scan^, True) do
begin
Inc(Result);
Inc(scan);
end;
{$ENDIF}
OP_ANYBUTCI:
{$IFDEF UNICODEEX}
begin
i := 0;
while (i < TheMax) and not FindInCharClass(opnd, scan^, True) do
begin
Inc(i);
IncUnicode2(scan, Result);
end;
end;
{$ELSE}
while (Result < TheMax) and not FindInCharClass(opnd, scan^, True) do
begin
Inc(Result);
Inc(scan);
end;
{$ENDIF}
{$IFDEF FastUnicodeData}
OP_ANYCATEGORY:
{$IFDEF UNICODEEX}
begin
i := 0;
while (i < TheMax) and MatchOneCharCategory(opnd, scan) do
begin
Inc(i);
IncUnicode2(scan, Result);
end;
end;
{$ELSE}
while (Result < TheMax) and MatchOneCharCategory(opnd, scan) do
begin
Inc(Result);
Inc(scan);
end;
{$ENDIF}
OP_NOTCATEGORY:
{$IFDEF UNICODEEX}
begin
i := 0;
while (i < TheMax) and not MatchOneCharCategory(opnd, scan) do
begin
Inc(i);
IncUnicode2(scan, Result);
end;
end;
{$ELSE}
while (Result < TheMax) and not MatchOneCharCategory(opnd, scan) do
begin
Inc(Result);
Inc(scan);
end;
{$ENDIF}
{$ENDIF}
else
begin // Oh dear. Called inappropriately.
Result := 0;
Error(reeRegRepeatCalledInappropriately);
Exit;
end;
end; { of case }
regInput := scan;
end; { of function TRegExpr.FindRepeated
-------------------------------------------------------------- }
function TRegExpr.regNext(p: PRegExprChar): PRegExprChar;
// dig the "next" pointer out of a node
var
offset: TRENextOff;
begin
if p = @regDummy then
begin
Result := nil;
Exit;
end;
offset := PRENextOff(AlignToPtr(p + REOpSz))^; // ###0.933 inlined NEXT
if offset = 0 then
Result := nil
else
Result := p + offset;
end; { of function TRegExpr.regNext
-------------------------------------------------------------- }
function TRegExpr.MatchPrim(prog: PRegExprChar): boolean;
// recursively matching routine
// Conceptually the strategy is simple: check to see whether the current
// node matches, call self recursively to see whether the rest matches,
// and then act accordingly. In practice we make some effort to avoid
// recursion, in particular by going through "ordinary" nodes (that don't
// need to know whether the rest of the match failed) by a loop instead of
// by recursion.
var
scan: PRegExprChar; // Current node.
next: PRegExprChar; // Next node.
Len: PtrInt;
opnd: PRegExprChar;
no: integer;
save: PRegExprChar;
saveCurrentGrp: integer;
nextch: REChar;
BracesMin, BracesMax: integer;
// we use integer instead of TREBracesArg for better support */+
{$IFDEF ComplexBraces}
SavedLoopStack: TRegExprLoopStack; // :(( very bad for recursion
SavedLoopStackIdx: integer; // ###0.925
{$ENDIF}
bound1, bound2: boolean;
checkAtomicGroup: boolean;
begin
Result := False;
{
// Alexey: not sure it's ok for long searches in big texts, so disabled
if regNestedCalls > MaxRegexBackTracking then
Exit;
Inc(regNestedCalls);
}
scan := prog;
while scan <> nil do
begin
Len := PRENextOff(AlignToPtr(scan + 1))^; // ###0.932 inlined regNext
if Len = 0 then
next := nil
else
next := scan + Len;
case scan^ of
OP_BOUND:
begin
bound1 := (regInput = fInputStart) or not IsWordChar((regInput - 1)^);
bound2 := (regInput >= fInputEnd) or not IsWordChar(regInput^);
if bound1 = bound2 then
Exit;
end;
OP_NOTBOUND:
begin
bound1 := (regInput = fInputStart) or not IsWordChar((regInput - 1)^);
bound2 := (regInput >= fInputEnd) or not IsWordChar(regInput^);
if bound1 <> bound2 then
Exit;
end;
OP_BOL:
begin
if regInput <> fInputStart then
Exit;
end;
OP_EOL:
begin
// \z matches at the very end
if regInput < fInputEnd then
Exit;
end;
OP_EOL2:
begin
// \Z matches at the very and + before the final line-break (LF and CR LF)
if regInput < fInputEnd then
begin
if (regInput = fInputEnd - 1) and (regInput^ = #10) then
begin end
else
if (regInput = fInputEnd - 2) and (regInput^ = #13) and ((regInput + 1) ^ = #10) then
begin end
else
Exit;
end;
end;
OP_BOLML:
if regInput > fInputStart then
begin
if ((regInput - 1) <= fInputStart) or
not IsPairedBreak(regInput - 2) then
begin
// don't stop between paired separator
if IsPairedBreak(regInput - 1) then
Exit;
if not IsCustomLineSeparator((regInput - 1)^) then
Exit;
end;
end;
OP_EOLML:
if regInput < fInputEnd then
begin
if not IsPairedBreak(regInput) then
begin
// don't stop between paired separator
if (regInput > fInputStart) and IsPairedBreak(regInput - 1) then
Exit;
if not IsCustomLineSeparator(regInput^) then
Exit;
end;
end;
OP_ANY:
begin
if regInput >= fInputEnd then
Exit;
{$IFDEF UNICODEEX}
IncUnicode(regInput);
{$ELSE}
Inc(regInput);
{$ENDIF}
end;
OP_ANYML:
begin
if (regInput >= fInputEnd) or
IsPairedBreak(regInput) or
IsCustomLineSeparator(regInput^)
then
Exit;
{$IFDEF UNICODEEX}
IncUnicode(regInput);
{$ELSE}
Inc(regInput);
{$ENDIF}
end;
OP_ANYDIGIT:
begin
if (regInput >= fInputEnd) or not IsDigitChar(regInput^) then
Exit;
Inc(regInput);
end;
OP_NOTDIGIT:
begin
if (regInput >= fInputEnd) or IsDigitChar(regInput^) then
Exit;
{$IFDEF UNICODEEX}
IncUnicode(regInput);
{$ELSE}
Inc(regInput);
{$ENDIF}
end;
OP_ANYLETTER:
begin
if (regInput >= fInputEnd) or not IsWordChar(regInput^) then
Exit;
Inc(regInput);
end;
OP_NOTLETTER:
begin
if (regInput >= fInputEnd) or IsWordChar(regInput^) then
Exit;
{$IFDEF UNICODEEX}
IncUnicode(regInput);
{$ELSE}
Inc(regInput);
{$ENDIF}
end;
OP_ANYSPACE:
begin
if (regInput >= fInputEnd) or not IsSpaceChar(regInput^) then
Exit;
Inc(regInput);
end;
OP_NOTSPACE:
begin
if (regInput >= fInputEnd) or IsSpaceChar(regInput^) then
Exit;
{$IFDEF UNICODEEX}
IncUnicode(regInput);
{$ELSE}
Inc(regInput);
{$ENDIF}
end;
OP_ANYVERTSEP:
begin
if (regInput >= fInputEnd) or not IsVertLineSeparator(regInput^) then
Exit;
Inc(regInput);
end;
OP_NOTVERTSEP:
begin
if (regInput >= fInputEnd) or IsVertLineSeparator(regInput^) then
Exit;
{$IFDEF UNICODEEX}
IncUnicode(regInput);
{$ELSE}
Inc(regInput);
{$ENDIF}
end;
OP_ANYHORZSEP:
begin
if (regInput >= fInputEnd) or not IsHorzSeparator(regInput^) then
Exit;
Inc(regInput);
end;
OP_NOTHORZSEP:
begin
if (regInput >= fInputEnd) or IsHorzSeparator(regInput^) then
Exit;
{$IFDEF UNICODEEX}
IncUnicode(regInput);
{$ELSE}
Inc(regInput);
{$ENDIF}
end;
OP_EXACTLYCI:
begin
opnd := scan + REOpSz + RENextOffSz; // OPERAND
Len := PLongInt(opnd)^;
Inc(opnd, RENumberSz);
// Inline the first character, for speed.
if (opnd^ <> regInput^) and (InvertCase(opnd^) <> regInput^) then
Exit;
// ###0.929 begin
no := Len;
save := regInput;
while no > 1 do
begin
Inc(save);
Inc(opnd);
if (opnd^ <> save^) and (InvertCase(opnd^) <> save^) then
Exit;
Dec(no);
end;
// ###0.929 end
Inc(regInput, Len);
end;
OP_EXACTLY:
begin
opnd := scan + REOpSz + RENextOffSz; // OPERAND
Len := PLongInt(opnd)^;
Inc(opnd, RENumberSz);
// Inline the first character, for speed.
if opnd^ <> regInput^ then
Exit;
// ###0.929 begin
no := Len;
save := regInput;
while no > 1 do
begin
Inc(save);
Inc(opnd);
if opnd^ <> save^ then
Exit;
Dec(no);
end;
// ###0.929 end
Inc(regInput, Len);
end;
OP_BSUBEXP:
begin // ###0.936
no := Ord((scan + REOpSz + RENextOffSz)^);
no := GrpIndexes[no];
if no < 0 then
Exit;
if GrpStart[no] = nil then
Exit;
if GrpEnd[no] = nil then
Exit;
save := regInput;
opnd := GrpStart[no];
while opnd < GrpEnd[no] do
begin
if (save >= fInputEnd) or (save^ <> opnd^) then
Exit;
Inc(save);
Inc(opnd);
end;
regInput := save;
end;
OP_BSUBEXPCI:
begin // ###0.936
no := Ord((scan + REOpSz + RENextOffSz)^);
no := GrpIndexes[no];
if no < 0 then
Exit;
if GrpStart[no] = nil then
Exit;
if GrpEnd[no] = nil then
Exit;
save := regInput;
opnd := GrpStart[no];
while opnd < GrpEnd[no] do
begin
if (save >= fInputEnd) or
((save^ <> opnd^) and (save^ <> InvertCase(opnd^))) then
Exit;
Inc(save);
Inc(opnd);
end;
regInput := save;
end;
OP_ANYOF:
begin
if (regInput >= fInputEnd) or
not FindInCharClass(scan + REOpSz + RENextOffSz, regInput^, False) then
Exit;
{$IFDEF UNICODEEX}
IncUnicode(regInput);
{$ELSE}
Inc(regInput);
{$ENDIF}
end;
OP_ANYBUT:
begin
if (regInput >= fInputEnd) or
FindInCharClass(scan + REOpSz + RENextOffSz, regInput^, False) then
Exit;
{$IFDEF UNICODEEX}
IncUnicode(regInput);
{$ELSE}
Inc(regInput);
{$ENDIF}
end;
OP_ANYOFCI:
begin
if (regInput >= fInputEnd) or
not FindInCharClass(scan + REOpSz + RENextOffSz, regInput^, True) then
Exit;
{$IFDEF UNICODEEX}
IncUnicode(regInput);
{$ELSE}
Inc(regInput);
{$ENDIF}
end;
OP_ANYBUTCI:
begin
if (regInput >= fInputEnd) or
FindInCharClass(scan + REOpSz + RENextOffSz, regInput^, True) then
Exit;
{$IFDEF UNICODEEX}
IncUnicode(regInput);
{$ELSE}
Inc(regInput);
{$ENDIF}
end;
OP_NOTHING:
;
OP_COMMENT:
;
OP_BACK:
;
OP_OPEN_FIRST .. OP_OPEN_LAST:
begin
no := Ord(scan^) - Ord(OP_OPEN);
regCurrentGrp := no;
save := GrpStart[no]; // ###0.936
GrpStart[no] := regInput; // ###0.936
Result := MatchPrim(next);
if not Result then // ###0.936
GrpStart[no] := save;
// handle negative lookahead
if regLookaheadNeg then
if no = regLookaheadGroup then
begin
Result := not Result;
if Result then
begin
// we need zero length of "lookahead group",
// it is later used to adjust the match
GrpStart[no] := regInput;
GrpEnd[no]:= regInput;
end
else
GrpStart[no] := save;
end;
Exit;
end;
OP_CLOSE_FIRST .. OP_CLOSE_LAST:
begin
no := Ord(scan^) - Ord(OP_CLOSE);
regCurrentGrp := -1;
// handle atomic group, mark it as "done"
// (we are here because some OP_BRANCH is matched)
if GrpAtomic[no] then
GrpAtomicDone[no] := True;
save := GrpEnd[no]; // ###0.936
GrpEnd[no] := regInput; // ###0.936
// if we are in OP_SUBCALL* call, it called OP_OPEN*, so we must return
// in OP_CLOSE, without going to next opcode
if GrpSubCalled[no] then
begin
Result := True;
Exit;
end;
Result := MatchPrim(next);
if not Result then // ###0.936
GrpEnd[no] := save;
Exit;
end;
OP_BRANCH:
begin
saveCurrentGrp := regCurrentGrp;
checkAtomicGroup := (regCurrentGrp >= 0) and GrpAtomic[regCurrentGrp];
if (next^ <> OP_BRANCH) // No next choice in group
then
next := scan + REOpSz + RENextOffSz // Avoid recursion
else
begin
repeat
save := regInput;
Result := MatchPrim(scan + REOpSz + RENextOffSz);
regCurrentGrp := saveCurrentGrp;
if Result then
Exit;
// if branch worked until OP_CLOSE, and marked atomic group as "done", then exit
if checkAtomicGroup then
if GrpAtomicDone[regCurrentGrp] then
Exit;
regInput := save;
scan := regNext(scan);
until (scan = nil) or (scan^ <> OP_BRANCH);
Exit;
end;
end;
{$IFDEF ComplexBraces}
OP_LOOPENTRY:
begin // ###0.925
no := LoopStackIdx;
Inc(LoopStackIdx);
if LoopStackIdx > LoopStackMax then
begin
Error(reeLoopStackExceeded);
Exit;
end;
save := regInput;
LoopStack[LoopStackIdx] := 0; // init loop counter
Result := MatchPrim(next); // execute loop
LoopStackIdx := no; // cleanup
if Result then
Exit;
regInput := save;
Exit;
end;
OP_LOOP, OP_LOOPNG:
begin // ###0.940
if LoopStackIdx <= 0 then
begin
Error(reeLoopWithoutEntry);
Exit;
end;
opnd := scan + PRENextOff(AlignToPtr(scan + REOpSz + RENextOffSz + 2 * REBracesArgSz))^;
BracesMin := PREBracesArg(AlignToInt(scan + REOpSz + RENextOffSz))^;
BracesMax := PREBracesArg(AlignToPtr(scan + REOpSz + RENextOffSz + REBracesArgSz))^;
save := regInput;
if LoopStack[LoopStackIdx] >= BracesMin then
begin // Min alredy matched - we can work
if scan^ = OP_LOOP then
begin
// greedy way - first try to max deep of greed ;)
if LoopStack[LoopStackIdx] < BracesMax then
begin
Inc(LoopStack[LoopStackIdx]);
no := LoopStackIdx;
Result := MatchPrim(opnd);
LoopStackIdx := no;
if Result then
Exit;
regInput := save;
end;
Dec(LoopStackIdx); // Fail. May be we are too greedy? ;)
Result := MatchPrim(next);
if not Result then
regInput := save;
Exit;
end
else
begin
// non-greedy - try just now
Result := MatchPrim(next);
if Result then
Exit
else
regInput := save; // failed - move next and try again
if LoopStack[LoopStackIdx] < BracesMax then
begin
Inc(LoopStack[LoopStackIdx]);
no := LoopStackIdx;
Result := MatchPrim(opnd);
LoopStackIdx := no;
if Result then
Exit;
regInput := save;
end;
Dec(LoopStackIdx); // Failed - back up
Exit;
end
end
else
begin // first match a min_cnt times
Inc(LoopStack[LoopStackIdx]);
no := LoopStackIdx;
Result := MatchPrim(opnd);
LoopStackIdx := no;
if Result then
Exit;
Dec(LoopStack[LoopStackIdx]);
regInput := save;
Exit;
end;
end;
{$ENDIF}
OP_STAR, OP_PLUS, OP_BRACES, OP_STARNG, OP_PLUSNG, OP_BRACESNG:
begin
// Lookahead to avoid useless match attempts when we know
// what character comes next.
nextch := #0;
if next^ = OP_EXACTLY then
nextch := (next + REOpSz + RENextOffSz + RENumberSz)^;
BracesMax := MaxInt; // infinite loop for * and + //###0.92
if (scan^ = OP_STAR) or (scan^ = OP_STARNG) then
BracesMin := 0 // star
else if (scan^ = OP_PLUS) or (scan^ = OP_PLUSNG) then
BracesMin := 1 // plus
else
begin // braces
BracesMin := PREBracesArg(AlignToPtr(scan + REOpSz + RENextOffSz))^;
BracesMax := PREBracesArg(AlignToPtr(scan + REOpSz + RENextOffSz + REBracesArgSz))^;
end;
save := regInput;
opnd := scan + REOpSz + RENextOffSz;
if (scan^ = OP_BRACES) or (scan^ = OP_BRACESNG) then
Inc(opnd, 2 * REBracesArgSz);
if (scan^ = OP_PLUSNG) or (scan^ = OP_STARNG) or (scan^ = OP_BRACESNG) then
begin
// non-greedy mode
BracesMax := FindRepeated(opnd, BracesMax);
// don't repeat more than BracesMax
// Now we know real Max limit to move forward (for recursion 'back up')
// In some cases it can be faster to check only Min positions first,
// but after that we have to check every position separtely instead
// of fast scannig in loop.
no := BracesMin;
while no <= BracesMax do
begin
regInput := save + no;
// If it could work, try it.
if (nextch = #0) or (regInput^ = nextch) then
begin
{$IFDEF ComplexBraces}
System.Move(LoopStack, SavedLoopStack, SizeOf(LoopStack));
// ###0.925
SavedLoopStackIdx := LoopStackIdx;
{$ENDIF}
if MatchPrim(next) then
begin
Result := True;
Exit;
end;
{$IFDEF ComplexBraces}
System.Move(SavedLoopStack, LoopStack, SizeOf(LoopStack));
LoopStackIdx := SavedLoopStackIdx;
{$ENDIF}
end;
Inc(no); // Couldn't or didn't - move forward.
end; { of while }
Exit;
end
else
begin // greedy mode
no := FindRepeated(opnd, BracesMax); // don't repeat more than max_cnt
while no >= BracesMin do
begin
// If it could work, try it.
if (nextch = #0) or (regInput^ = nextch) then
begin
{$IFDEF ComplexBraces}
System.Move(LoopStack, SavedLoopStack, SizeOf(LoopStack));
// ###0.925
SavedLoopStackIdx := LoopStackIdx;
{$ENDIF}
if MatchPrim(next) then
begin
Result := True;
Exit;
end;
{$IFDEF ComplexBraces}
System.Move(SavedLoopStack, LoopStack, SizeOf(LoopStack));
LoopStackIdx := SavedLoopStackIdx;
{$ENDIF}
end;
Dec(no); // Couldn't or didn't - back up.
regInput := save + no;
end; { of while }
Exit;
end;
end;
OP_STAR_POSS, OP_PLUS_POSS, OP_BRACES_POSS:
begin
// Lookahead to avoid useless match attempts when we know
// what character comes next.
nextch := #0;
if next^ = OP_EXACTLY then
nextch := (next + REOpSz + RENextOffSz + RENumberSz)^;
opnd := scan + REOpSz + RENextOffSz;
case scan^ of
OP_STAR_POSS:
begin
BracesMin := 0;
BracesMax := MaxInt;
end;
OP_PLUS_POSS:
begin
BracesMin := 1;
BracesMax := MaxInt;
end;
else
begin // braces
BracesMin := PREBracesArg(AlignToPtr(scan + REOpSz + RENextOffSz))^;
BracesMax := PREBracesArg(AlignToPtr(scan + REOpSz + RENextOffSz + REBracesArgSz))^;
Inc(opnd, 2 * REBracesArgSz);
end;
end;
no := FindRepeated(opnd, BracesMax);
if no >= BracesMin then
if (nextch = #0) or (regInput^ = nextch) then
Result := MatchPrim(next);
Exit;
end;
OP_EEND:
begin
Result := True; // Success!
Exit;
end;
{$IFDEF FastUnicodeData}
OP_ANYCATEGORY:
begin
if (regInput >= fInputEnd) then Exit;
if not MatchOneCharCategory(scan + REOpSz + RENextOffSz, regInput) then Exit;
{$IFDEF UNICODEEX}
IncUnicode(regInput);
{$ELSE}
Inc(regInput);
{$ENDIF}
end;
OP_NOTCATEGORY:
begin
if (regInput >= fInputEnd) then Exit;
if MatchOneCharCategory(scan + REOpSz + RENextOffSz, regInput) then Exit;
{$IFDEF UNICODEEX}
IncUnicode(regInput);
{$ELSE}
Inc(regInput);
{$ENDIF}
end;
{$ENDIF}
OP_RECUR:
begin
// call opcode start
if not MatchPrim(regCodeWork) then Exit;
end;
OP_SUBCALL_FIRST .. OP_SUBCALL_LAST:
begin
// call subroutine
no := GrpIndexes[Ord(scan^) - Ord(OP_SUBCALL)];
if no < 0 then Exit;
save := GrpOpCodes[no];
if save = nil then Exit;
checkAtomicGroup := GrpSubCalled[no];
// mark group in GrpSubCalled array so opcode can detect subcall
GrpSubCalled[no] := True;
if not MatchPrim(save) then
begin
GrpSubCalled[no] := checkAtomicGroup;
Exit;
end;
GrpSubCalled[no] := checkAtomicGroup;
end;
else
begin
Error(reeMatchPrimMemoryCorruption);
Exit;
end;
end; { of case scan^ }
scan := next;
end; { of while scan <> nil }
// We get here only if there's trouble -- normally "case EEND" is the
// terminating point.
Error(reeMatchPrimCorruptedPointers);
end; { of function TRegExpr.MatchPrim
-------------------------------------------------------------- }
function TRegExpr.Exec(const AInputString: RegExprString): boolean;
begin
InputString := AInputString;
Result := ExecPrim(1, False, False, False);
end; { of function TRegExpr.Exec
-------------------------------------------------------------- }
{$IFDEF OverMeth}
function TRegExpr.Exec: boolean;
var
SlowChecks: boolean;
begin
SlowChecks := Length(fInputString) < fSlowChecksSizeMax;
Result := ExecPrim(1, False, SlowChecks, False);
end; { of function TRegExpr.Exec
-------------------------------------------------------------- }
function TRegExpr.Exec(AOffset: integer): boolean;
begin
Result := ExecPrim(AOffset, False, False, False);
end; { of function TRegExpr.Exec
-------------------------------------------------------------- }
{$ENDIF}
function TRegExpr.ExecPos(AOffset: integer {$IFDEF DefParam} = 1{$ENDIF}): boolean;
begin
Result := ExecPrim(AOffset, False, False, False);
end; { of function TRegExpr.ExecPos
-------------------------------------------------------------- }
{$IFDEF OverMeth}
function TRegExpr.ExecPos(AOffset: integer; ATryOnce, ABackward: boolean): boolean;
begin
Result := ExecPrim(AOffset, ATryOnce, False, ABackward);
end;
{$ENDIF}
function TRegExpr.MatchAtOnePos(APos: PRegExprChar): boolean;
begin
// test for lookbehind '(?<!foo)bar' before running actual MatchPrim
if Assigned(fHelper) then
if (APos - fHelperLen) >= fInputStart then
begin
fHelper.SetInputRange(APos - fHelperLen, APos);
if fHelper.MatchAtOnePos(APos - fHelperLen) then
begin
Result := False;
Exit;
end;
end;
regInput := APos;
regCurrentGrp := -1;
regNestedCalls := 0;
Result := MatchPrim(regCodeWork);
if Result then
begin
GrpStart[0] := APos;
GrpEnd[0] := regInput;
// with lookbehind, increase found position by the len of group=1
if regLookbehind then
Inc(GrpStart[0], GrpEnd[1] - GrpStart[1]);
// with lookahead, decrease ending by the len of group=regLookaheadGroup
if regLookahead and (regLookaheadGroup > 0) then
Dec(GrpEnd[0], GrpEnd[regLookaheadGroup] - GrpStart[regLookaheadGroup]);
end;
end;
procedure TRegExpr.ClearMatches;
begin
FillChar(GrpStart, SizeOf(GrpStart), 0);
FillChar(GrpEnd, SizeOf(GrpEnd), 0);
FillChar(GrpAtomicDone, SizeOf(GrpAtomicDone), 0);
FillChar(GrpSubCalled, SizeOf(GrpSubCalled), 0);
end;
procedure TRegExpr.ClearInternalIndexes;
var
i: integer;
begin
FillChar(GrpStart, SizeOf(GrpStart), 0);
FillChar(GrpEnd, SizeOf(GrpEnd), 0);
FillChar(GrpAtomic, SizeOf(GrpAtomic), 0);
FillChar(GrpAtomicDone, SizeOf(GrpAtomicDone), 0);
FillChar(GrpSubCalled, SizeOf(GrpSubCalled), 0);
FillChar(GrpOpCodes, SizeOf(GrpOpCodes), 0);
for i := 0 to RegexMaxGroups - 1 do
begin
GrpIndexes[i] := -1;
GrpNames[i] := '';
end;
GrpIndexes[0] := 0;
GrpCount := 0;
end;
function TRegExpr.ExecPrim(AOffset: integer;
ATryOnce, ASlowChecks, ABackward: boolean): boolean;
var
Ptr: PRegExprChar;
begin
Result := False;
// Ensure that Match cleared either if optimization tricks or some error
// will lead to leaving ExecPrim without actual search. That is
// important for ExecNext logic and so on.
ClearMatches;
// Don't check IsProgrammOk here! it causes big slowdown in test_benchmark!
if programm = nil then
begin
Compile;
if programm = nil then
Exit;
end;
if fInputString = '' then
begin
// Empty string can match e.g. '^$'
if regMustLen > 0 then
Exit;
end;
// Check that the start position is not negative
if AOffset < 1 then
begin
Error(reeOffsetMustBePositive);
Exit;
end;
// Check that the start position is not longer than the line
if AOffset > (Length(fInputString) + 1) then
Exit;
Ptr := fInputStart + AOffset - 1;
// If there is a "must appear" string, look for it.
if ASlowChecks then
if regMustString <> '' then
if Pos(regMustString, fInputString) = 0 then Exit;
{$IFDEF ComplexBraces}
// no loops started
LoopStackIdx := 0; // ###0.925
{$ENDIF}
// ATryOnce or anchored match (it needs to be tried only once).
if ATryOnce or (regAnchored <> #0) then
begin
{$IFDEF UseFirstCharSet}
{$IFDEF UniCode}
if Ord(Ptr^) <= $FF then
{$ENDIF}
if not FirstCharArray[byte(Ptr^)] then
Exit;
{$ENDIF}
Result := MatchAtOnePos(Ptr);
Exit;
end;
// Messy cases: unanchored match.
if ABackward then
Inc(Ptr, 2)
else
Dec(Ptr);
repeat
if ABackward then
begin
Dec(Ptr);
if Ptr < fInputStart then
Exit;
end
else
begin
Inc(Ptr);
if Ptr > fInputEnd then
Exit;
end;
{$IFDEF UseFirstCharSet}
{$IFDEF UniCode}
if Ord(Ptr^) <= $FF then
{$ENDIF}
if not FirstCharArray[byte(Ptr^)] then
Continue;
{$ENDIF}
Result := MatchAtOnePos(Ptr);
// Exit on a match or after testing the end-of-string
if Result then
Exit;
until False;
end; { of function TRegExpr.ExecPrim
-------------------------------------------------------------- }
function TRegExpr.ExecNext(ABackward: boolean {$IFDEF DefParam} = False{$ENDIF}): boolean;
var
PtrBegin, PtrEnd: PRegExprChar;
Offset: PtrInt;
begin
PtrBegin := GrpStart[0];
PtrEnd := GrpEnd[0];
if (PtrBegin = nil) or (PtrEnd = nil) then
begin
Error(reeExecNextWithoutExec);
Result := False;
Exit;
end;
Offset := PtrEnd - fInputStart + 1;
// prevent infinite looping if empty string matches r.e.
if PtrBegin = PtrEnd then
Inc(Offset);
Result := ExecPrim(Offset, False, False, ABackward);
end; { of function TRegExpr.ExecNext
-------------------------------------------------------------- }
procedure TRegExpr.SetInputString(const AInputString: RegExprString);
begin
ClearMatches;
fInputString := AInputString;
UniqueString(fInputString);
fInputStart := PRegExprChar(fInputString);
fInputEnd := fInputStart + Length(fInputString);
end;
procedure TRegExpr.SetInputRange(AStart, AEnd: PRegExprChar);
begin
fInputString := '';
fInputStart := AStart;
fInputEnd := AEnd;
end;
{$IFDEF UseLineSep}
procedure TRegExpr.SetLineSeparators(const AStr: RegExprString);
begin
if AStr <> fLineSeparators then
begin
fLineSeparators := AStr;
InitLineSepArray;
InvalidateProgramm;
end;
end; { of procedure TRegExpr.SetLineSeparators
-------------------------------------------------------------- }
{$ENDIF}
procedure TRegExpr.SetUsePairedBreak(AValue: boolean);
begin
if AValue <> fUsePairedBreak then
begin
fUsePairedBreak := AValue;
InvalidateProgramm;
end;
end;
function TRegExpr.Substitute(const ATemplate: RegExprString): RegExprString;
// perform substitutions after a regexp match
var
TemplateBeg, TemplateEnd: PRegExprChar;
function ParseVarName(var APtr: PRegExprChar): integer;
// extract name of variable: $1 or ${1} or ${name}
// from APtr^, uses TemplateEnd
var
p: PRegExprChar;
Delimited: boolean;
GrpName: RegExprString;
begin
Result := 0;
p := APtr;
Delimited := (p < TemplateEnd) and (p^ = '{');
if Delimited then
Inc(p); // skip left curly brace
if (p < TemplateEnd) and (p^ = '&') then
Inc(p) // this is '$&' or '${&}'
else
begin
if IsDigitChar(p^) then
begin
while (p < TemplateEnd) and IsDigitChar(p^) do
begin
Result := Result * 10 + (Ord(p^) - Ord('0'));
Inc(p);
end
end
else
if Delimited then
begin
FindGroupName(p, TemplateEnd, '}', GrpName);
Result := MatchIndexFromName(GrpName);
Inc(p, Length(GrpName));
end;
end;
if Delimited then
if (p < TemplateEnd) and (p^ = '}') then
Inc(p) // skip right curly brace
else
p := APtr; // isn't properly terminated
if p = APtr then
Result := -1; // no valid digits found or no right curly brace
APtr := p;
end;
procedure FindSubstGroupIndex(var p: PRegExprChar; var Idx: integer);
begin
Idx := ParseVarName(p);
if (Idx >= 0) and (Idx <= High(GrpIndexes)) then
Idx := GrpIndexes[Idx];
end;
type
TSubstMode = (smodeNormal, smodeOneUpper, smodeOneLower, smodeAllUpper, smodeAllLower);
var
Mode: TSubstMode;
p, p0, p1, ResultPtr: PRegExprChar;
ResultLen, n: integer;
Ch, QuotedChar: REChar;
begin
// Check programm and input string
if not IsProgrammOk then
Exit;
// Note: don't check for empty fInputString, it's valid case,
// e.g. user needs to replace regex "\b" to "_", it's zero match length
if ATemplate = '' then
begin
Result := '';
Exit;
end;
TemplateBeg := PRegExprChar(ATemplate);
TemplateEnd := TemplateBeg + Length(ATemplate);
// Count result length for speed optimization.
ResultLen := 0;
p := TemplateBeg;
while p < TemplateEnd do
begin
Ch := p^;
Inc(p);
n := -1;
if Ch = SubstituteGroupChar then
FindSubstGroupIndex(p, n);
if n >= 0 then
begin
Inc(ResultLen, GrpEnd[n] - GrpStart[n]);
end
else
begin
if (Ch = EscChar) and (p < TemplateEnd) then
begin // quoted or special char followed
Ch := p^;
Inc(p);
case Ch of
'n':
Inc(ResultLen, Length(fReplaceLineEnd));
'u', 'l', 'U', 'L': { nothing }
;
'x':
begin
Inc(ResultLen);
if (p^ = '{') then
begin // skip \x{....}
while ((p^ <> '}') and (p < TemplateEnd)) do
p := p + 1;
p := p + 1;
end
else
p := p + 2 // skip \x..
end;
else
Inc(ResultLen);
end;
end
else
Inc(ResultLen);
end;
end;
// Get memory. We do it once and it significant speed up work !
if ResultLen = 0 then
begin
Result := '';
Exit;
end;
SetLength(Result, ResultLen);
// Fill Result
ResultPtr := PRegExprChar(Result);
p := TemplateBeg;
Mode := smodeNormal;
while p < TemplateEnd do
begin
Ch := p^;
p0 := p;
Inc(p);
p1 := p;
n := -1;
if Ch = SubstituteGroupChar then
FindSubstGroupIndex(p, n);
if (n >= 0) then
begin
p0 := GrpStart[n];
p1 := GrpEnd[n];
end
else
begin
if (Ch = EscChar) and (p < TemplateEnd) then
begin // quoted or special char followed
Ch := p^;
Inc(p);
case Ch of
'n':
begin
p0 := PRegExprChar(fReplaceLineEnd);
p1 := p0 + Length(fReplaceLineEnd);
end;
'x', 't', 'r', 'f', 'a', 'e':
begin
p := p - 1;
// UnquoteChar expects the escaped char under the pointer
QuotedChar := UnQuoteChar(p);
p := p + 1;
// Skip after last part of the escaped sequence - UnquoteChar stops on the last symbol of it
p0 := @QuotedChar;
p1 := p0 + 1;
end;
'l':
begin
Mode := smodeOneLower;
p1 := p0;
end;
'L':
begin
Mode := smodeAllLower;
p1 := p0;
end;
'u':
begin
Mode := smodeOneUpper;
p1 := p0;
end;
'U':
begin
Mode := smodeAllUpper;
p1 := p0;
end;
else
begin
Inc(p0);
Inc(p1);
end;
end;
end
end;
if p0 < p1 then
begin
while p0 < p1 do
begin
case Mode of
smodeOneLower:
begin
ResultPtr^ := _LowerCase(p0^);
Mode := smodeNormal;
end;
smodeAllLower:
begin
ResultPtr^ := _LowerCase(p0^);
end;
smodeOneUpper:
begin
ResultPtr^ := _UpperCase(p0^);
Mode := smodeNormal;
end;
smodeAllUpper:
begin
ResultPtr^ := _UpperCase(p0^);
end;
else
ResultPtr^ := p0^;
end;
Inc(ResultPtr);
Inc(p0);
end;
Mode := smodeNormal;
end;
end;
end; { of function TRegExpr.Substitute
-------------------------------------------------------------- }
procedure TRegExpr.Split(const AInputStr: RegExprString; APieces: TStrings);
var
PrevPos: PtrInt;
begin
PrevPos := 1;
if Exec(AInputStr) then
repeat
APieces.Add(System.Copy(AInputStr, PrevPos, MatchPos[0] - PrevPos));
PrevPos := MatchPos[0] + MatchLen[0];
until not ExecNext;
APieces.Add(System.Copy(AInputStr, PrevPos, MaxInt)); // Tail
end; { of procedure TRegExpr.Split
-------------------------------------------------------------- }
function TRegExpr.Replace(const AInputStr: RegExprString;
const AReplaceStr: RegExprString;
AUseSubstitution: boolean{$IFDEF DefParam} = False{$ENDIF}): RegExprString;
var
PrevPos: PtrInt;
begin
Result := '';
PrevPos := 1;
if Exec(AInputStr) then
repeat
Result := Result + System.Copy(AInputStr, PrevPos, MatchPos[0] - PrevPos);
if AUseSubstitution // ###0.946
then
Result := Result + Substitute(AReplaceStr)
else
Result := Result + AReplaceStr;
PrevPos := MatchPos[0] + MatchLen[0];
until not ExecNext;
Result := Result + System.Copy(AInputStr, PrevPos, MaxInt); // Tail
end; { of function TRegExpr.Replace
-------------------------------------------------------------- }
function TRegExpr.ReplaceEx(const AInputStr: RegExprString;
AReplaceFunc: TRegExprReplaceFunction): RegExprString;
var
PrevPos: PtrInt;
begin
Result := '';
PrevPos := 1;
if Exec(AInputStr) then
repeat
Result := Result + System.Copy(AInputStr, PrevPos, MatchPos[0] - PrevPos)
+ AReplaceFunc(Self);
PrevPos := MatchPos[0] + MatchLen[0];
until not ExecNext;
Result := Result + System.Copy(AInputStr, PrevPos, MaxInt); // Tail
end; { of function TRegExpr.ReplaceEx
-------------------------------------------------------------- }
{$IFDEF OverMeth}
function TRegExpr.Replace(const AInputStr: RegExprString;
AReplaceFunc: TRegExprReplaceFunction): RegExprString;
begin
Result := ReplaceEx(AInputStr, AReplaceFunc);
end; { of function TRegExpr.Replace
-------------------------------------------------------------- }
{$ENDIF}
{ ============================================================= }
{ ====================== Debug section ======================== }
{ ============================================================= }
{$IFDEF UseFirstCharSet}
procedure TRegExpr.FillFirstCharSet(prog: PRegExprChar);
var
scan: PRegExprChar; // Current node.
Next: PRegExprChar; // Next node.
opnd: PRegExprChar;
Oper: TREOp;
ch: REChar;
min_cnt, i: integer;
TempSet: TRegExprCharset;
begin
TempSet := [];
scan := prog;
while scan <> nil do
begin
Next := regNext(scan);
Oper := PREOp(scan)^;
case Oper of
OP_BSUBEXP,
OP_BSUBEXPCI:
begin
// we cannot optimize r.e. if it starts with back reference
FirstCharSet := RegExprAllSet; //###0.930
Exit;
end;
OP_BOL,
OP_BOLML:
; // Exit; //###0.937
OP_EOL,
OP_EOL2,
OP_EOLML:
begin //###0.948 was empty in 0.947, was EXIT in 0.937
Include(FirstCharSet, 0);
if ModifierM then
begin
{$IFDEF UseLineSep}
for i := 1 to Length(LineSeparators) do
Include(FirstCharSet, byte(LineSeparators[i]));
{$ELSE}
FirstCharSet := FirstCharSet + RegExprLineSeparatorsSet;
{$ENDIF}
end;
Exit;
end;
OP_BOUND,
OP_NOTBOUND:
; //###0.943 ?!!
OP_ANY,
OP_ANYML:
begin // we can better define ANYML !!!
FirstCharSet := RegExprAllSet; //###0.930
Exit;
end;
OP_ANYDIGIT:
begin
FirstCharSet := FirstCharSet + RegExprDigitSet;
Exit;
end;
OP_NOTDIGIT:
begin
FirstCharSet := FirstCharSet + (RegExprAllSet - RegExprDigitSet);
Exit;
end;
OP_ANYLETTER:
begin
GetCharSetFromWordChars(TempSet);
FirstCharSet := FirstCharSet + TempSet;
Exit;
end;
OP_NOTLETTER:
begin
GetCharSetFromWordChars(TempSet);
FirstCharSet := FirstCharSet + (RegExprAllSet - TempSet);
Exit;
end;
OP_ANYSPACE:
begin
GetCharSetFromSpaceChars(TempSet);
FirstCharSet := FirstCharSet + TempSet;
Exit;
end;
OP_NOTSPACE:
begin
GetCharSetFromSpaceChars(TempSet);
FirstCharSet := FirstCharSet + (RegExprAllSet - TempSet);
Exit;
end;
OP_ANYVERTSEP:
begin
FirstCharSet := FirstCharSet + RegExprLineSeparatorsSet;
Exit;
end;
OP_NOTVERTSEP:
begin
FirstCharSet := FirstCharSet + (RegExprAllSet - RegExprLineSeparatorsSet);
Exit;
end;
OP_ANYHORZSEP:
begin
FirstCharSet := FirstCharSet + RegExprHorzSeparatorsSet;
Exit;
end;
OP_NOTHORZSEP:
begin
FirstCharSet := FirstCharSet + (RegExprAllSet - RegExprHorzSeparatorsSet);
Exit;
end;
OP_EXACTLYCI:
begin
ch := (scan + REOpSz + RENextOffSz + RENumberSz)^;
{$IFDEF UniCode}
if Ord(ch) <= $FF then
{$ENDIF}
begin
Include(FirstCharSet, byte(ch));
Include(FirstCharSet, byte(InvertCase(ch)));
end;
Exit;
end;
OP_EXACTLY:
begin
ch := (scan + REOpSz + RENextOffSz + RENumberSz)^;
{$IFDEF UniCode}
if Ord(ch) <= $FF then
{$ENDIF}
Include(FirstCharSet, byte(ch));
Exit;
end;
OP_ANYOF:
begin
GetCharSetFromCharClass(scan + REOpSz + RENextOffSz, False, TempSet);
FirstCharSet := FirstCharSet + TempSet;
Exit;
end;
OP_ANYBUT:
begin
GetCharSetFromCharClass(scan + REOpSz + RENextOffSz, False, TempSet);
FirstCharSet := FirstCharSet + (RegExprAllSet - TempSet);
Exit;
end;
OP_ANYOFCI:
begin
GetCharSetFromCharClass(scan + REOpSz + RENextOffSz, True, TempSet);
FirstCharSet := FirstCharSet + TempSet;
Exit;
end;
OP_ANYBUTCI:
begin
GetCharSetFromCharClass(scan + REOpSz + RENextOffSz, True, TempSet);
FirstCharSet := FirstCharSet + (RegExprAllSet - TempSet);
Exit;
end;
OP_NOTHING:
;
OP_COMMENT:
;
OP_BACK:
;
OP_OPEN_FIRST .. OP_OPEN_LAST:
begin
FillFirstCharSet(Next);
Exit;
end;
OP_CLOSE_FIRST .. OP_CLOSE_LAST:
begin
FillFirstCharSet(Next);
Exit;
end;
OP_BRANCH:
begin
if (PREOp(Next)^ <> OP_BRANCH) // No choice.
then
Next := scan + REOpSz + RENextOffSz // Avoid recursion.
else
begin
repeat
FillFirstCharSet(scan + REOpSz + RENextOffSz);
scan := regNext(scan);
until (scan = nil) or (PREOp(scan)^ <> OP_BRANCH);
Exit;
end;
end;
{$IFDEF ComplexBraces}
OP_LOOPENTRY:
begin //###0.925
//LoopStack [LoopStackIdx] := 0; //###0.940 line removed
FillFirstCharSet(Next); // execute LOOP
Exit;
end;
OP_LOOP,
OP_LOOPNG:
begin //###0.940
opnd := scan + PRENextOff(AlignToPtr(scan + REOpSz + RENextOffSz + REBracesArgSz * 2))^;
min_cnt := PREBracesArg(AlignToPtr(scan + REOpSz + RENextOffSz))^;
FillFirstCharSet(opnd);
if min_cnt = 0 then
FillFirstCharSet(Next);
Exit;
end;
{$ENDIF}
OP_STAR,
OP_STARNG,
OP_STAR_POSS: //###0.940
FillFirstCharSet(scan + REOpSz + RENextOffSz);
OP_PLUS,
OP_PLUSNG,
OP_PLUS_POSS:
begin //###0.940
FillFirstCharSet(scan + REOpSz + RENextOffSz);
Exit;
end;
OP_BRACES,
OP_BRACESNG,
OP_BRACES_POSS:
begin //###0.940
opnd := scan + REOpSz + RENextOffSz + REBracesArgSz * 2;
min_cnt := PREBracesArg(AlignToPtr(scan + REOpSz + RENextOffSz))^; // BRACES
FillFirstCharSet(opnd);
if min_cnt > 0 then
Exit;
end;
OP_EEND:
begin
FirstCharSet := RegExprAllSet; //###0.948
Exit;
end;
OP_ANYCATEGORY,
OP_NOTCATEGORY:
begin
FirstCharSet := RegExprAllSet;
Exit;
end;
OP_RECUR,
OP_SUBCALL_FIRST .. OP_SUBCALL_LAST:
begin
end
else
begin
fLastErrorOpcode := Oper;
Error(reeUnknownOpcodeInFillFirst);
Exit;
end;
end; { of case scan^}
scan := Next;
end; { of while scan <> nil}
end; { of procedure FillFirstCharSet
--------------------------------------------------------------}
{$ENDIF}
procedure TRegExpr.InitCharCheckers;
var
Cnt: integer;
//
function Add(AChecker: TRegExprCharChecker): byte;
begin
Inc(Cnt);
if Cnt > High(CharCheckers) then
Error(reeTooSmallCheckersArray);
CharCheckers[Cnt - 1] := AChecker;
Result := Cnt - 1;
end;
//
begin
Cnt := 0;
FillChar(CharCheckers, SizeOf(CharCheckers), 0);
CheckerIndex_Word := Add(CharChecker_Word);
CheckerIndex_NotWord := Add(CharChecker_NotWord);
CheckerIndex_Space := Add(CharChecker_Space);
CheckerIndex_NotSpace := Add(CharChecker_NotSpace);
CheckerIndex_Digit := Add(CharChecker_Digit);
CheckerIndex_NotDigit := Add(CharChecker_NotDigit);
CheckerIndex_VertSep := Add(CharChecker_VertSep);
CheckerIndex_NotVertSep := Add(CharChecker_NotVertSep);
CheckerIndex_HorzSep := Add(CharChecker_HorzSep);
CheckerIndex_NotHorzSep := Add(CharChecker_NotHorzSep);
//CheckerIndex_AllAZ := Add(CharChecker_AllAZ);
CheckerIndex_LowerAZ := Add(CharChecker_LowerAZ);
CheckerIndex_UpperAZ := Add(CharChecker_UpperAZ);
SetLength(CharCheckerInfos, 3);
with CharCheckerInfos[0] do
begin
CharBegin := 'a';
CharEnd:= 'z';
CheckerIndex := CheckerIndex_LowerAZ;
end;
with CharCheckerInfos[1] do
begin
CharBegin := 'A';
CharEnd := 'Z';
CheckerIndex := CheckerIndex_UpperAZ;
end;
with CharCheckerInfos[2] do
begin
CharBegin := '0';
CharEnd := '9';
CheckerIndex := CheckerIndex_Digit;
end;
end;
function TRegExpr.CharChecker_Word(ch: REChar): boolean;
begin
Result := IsWordChar(ch);
end;
function TRegExpr.CharChecker_NotWord(ch: REChar): boolean;
begin
Result := not IsWordChar(ch);
end;
function TRegExpr.CharChecker_Space(ch: REChar): boolean;
begin
Result := IsSpaceChar(ch);
end;
function TRegExpr.CharChecker_NotSpace(ch: REChar): boolean;
begin
Result := not IsSpaceChar(ch);
end;
function TRegExpr.CharChecker_Digit(ch: REChar): boolean;
begin
Result := IsDigitChar(ch);
end;
function TRegExpr.CharChecker_NotDigit(ch: REChar): boolean;
begin
Result := not IsDigitChar(ch);
end;
function TRegExpr.CharChecker_VertSep(ch: REChar): boolean;
begin
Result := IsVertLineSeparator(ch);
end;
function TRegExpr.CharChecker_NotVertSep(ch: REChar): boolean;
begin
Result := not IsVertLineSeparator(ch);
end;
function TRegExpr.CharChecker_HorzSep(ch: REChar): boolean;
begin
Result := IsHorzSeparator(ch);
end;
function TRegExpr.CharChecker_NotHorzSep(ch: REChar): boolean;
begin
Result := not IsHorzSeparator(ch);
end;
function TRegExpr.CharChecker_LowerAZ(ch: REChar): boolean;
begin
case ch of
'a' .. 'z':
Result := True;
else
Result := False;
end;
end;
function TRegExpr.CharChecker_UpperAZ(ch: REChar): boolean;
begin
case ch of
'A' .. 'Z':
Result := True;
else
Result := False;
end;
end;
{$IFDEF RegExpPCodeDump}
function TRegExpr.DumpOp(op: TREOp): RegExprString;
// printable representation of opcode
begin
case op of
OP_BOL:
Result := 'BOL';
OP_EOL:
Result := 'EOL';
OP_EOL2:
Result := 'EOL2';
OP_BOLML:
Result := 'BOLML';
OP_EOLML:
Result := 'EOLML';
OP_BOUND:
Result := 'BOUND';
OP_NOTBOUND:
Result := 'NOTBOUND';
OP_ANY:
Result := 'ANY';
OP_ANYML:
Result := 'ANYML';
OP_ANYLETTER:
Result := 'ANYLETTER';
OP_NOTLETTER:
Result := 'NOTLETTER';
OP_ANYDIGIT:
Result := 'ANYDIGIT';
OP_NOTDIGIT:
Result := 'NOTDIGIT';
OP_ANYSPACE:
Result := 'ANYSPACE';
OP_NOTSPACE:
Result := 'NOTSPACE';
OP_ANYHORZSEP:
Result := 'ANYHORZSEP';
OP_NOTHORZSEP:
Result := 'NOTHORZSEP';
OP_ANYVERTSEP:
Result := 'ANYVERTSEP';
OP_NOTVERTSEP:
Result := 'NOTVERTSEP';
OP_ANYOF:
Result := 'ANYOF';
OP_ANYBUT:
Result := 'ANYBUT';
OP_ANYOFCI:
Result := 'ANYOF/CI';
OP_ANYBUTCI:
Result := 'ANYBUT/CI';
OP_BRANCH:
Result := 'BRANCH';
OP_EXACTLY:
Result := 'EXACTLY';
OP_EXACTLYCI:
Result := 'EXACTLY/CI';
OP_NOTHING:
Result := 'NOTHING';
OP_COMMENT:
Result := 'COMMENT';
OP_BACK:
Result := 'BACK';
OP_EEND:
Result := 'END';
OP_BSUBEXP:
Result := 'BSUBEXP';
OP_BSUBEXPCI:
Result := 'BSUBEXP/CI';
OP_OPEN_FIRST .. OP_OPEN_LAST:
Result := Format('OPEN[%d]', [Ord(op) - Ord(OP_OPEN)]);
OP_CLOSE_FIRST .. OP_CLOSE_LAST:
Result := Format('CLOSE[%d]', [Ord(op) - Ord(OP_CLOSE)]);
OP_STAR:
Result := 'STAR';
OP_PLUS:
Result := 'PLUS';
OP_BRACES:
Result := 'BRACES';
{$IFDEF ComplexBraces}
OP_LOOPENTRY:
Result := 'LOOPENTRY';
OP_LOOP:
Result := 'LOOP';
OP_LOOPNG:
Result := 'LOOPNG';
{$ENDIF}
OP_STARNG:
Result := 'STARNG';
OP_PLUSNG:
Result := 'PLUSNG';
OP_BRACESNG:
Result := 'BRACESNG';
OP_STAR_POSS:
Result := 'STAR_POSS';
OP_PLUS_POSS:
Result := 'PLUS_POSS';
OP_BRACES_POSS:
Result := 'BRACES_POSS';
OP_ANYCATEGORY:
Result := 'ANYCATEG';
OP_NOTCATEGORY:
Result := 'NOTCATEG';
OP_RECUR:
Result := 'RECURSION';
OP_SUBCALL_FIRST .. OP_SUBCALL_LAST:
Result := Format('SUBCALL[%d]', [Ord(op) - Ord(OP_SUBCALL)]);
else
Error(reeDumpCorruptedOpcode);
end;
end; { of function TRegExpr.DumpOp
-------------------------------------------------------------- }
function TRegExpr.IsCompiled: boolean;
begin
Result := programm <> nil;
end;
function PrintableChar(AChar: REChar): RegExprString; {$IFDEF InlineFuncs}inline;{$ENDIF}
begin
if AChar < ' ' then
Result := '#' + IntToStr(Ord(AChar))
else
Result := AChar;
end;
function TRegExpr.DumpCheckerIndex(N: byte): RegExprString;
begin
Result := '?';
if N = CheckerIndex_Word then Result := '\w' else
if N = CheckerIndex_NotWord then Result := '\W' else
if N = CheckerIndex_Digit then Result := '\d' else
if N = CheckerIndex_NotDigit then Result := '\D' else
if N = CheckerIndex_Space then Result := '\s' else
if N = CheckerIndex_NotSpace then Result := '\S' else
if N = CheckerIndex_HorzSep then Result := '\h' else
if N = CheckerIndex_NotHorzSep then Result := '\H' else
if N = CheckerIndex_VertSep then Result := '\v' else
if N = CheckerIndex_NotVertSep then Result := '\V' else
if N = CheckerIndex_LowerAZ then Result := 'az' else
if N = CheckerIndex_UpperAZ then Result := 'AZ' else
;
end;
function TRegExpr.DumpCategoryChars(ch, ch2: REChar; Positive: boolean): RegExprString;
const
S: array[boolean] of RegExprString = ('P', 'p');
begin
Result := '\' + S[Positive] + '{' + ch;
if ch2 <> #0 then
Result := Result + ch2;
Result := Result + '} ';
end;
function TRegExpr.Dump: RegExprString;
// dump a regexp in vaguely comprehensible form
var
s: PRegExprChar;
op: TREOp; // Arbitrary non-END op.
next: PRegExprChar;
i, NLen: integer;
Diff: PtrInt;
iByte: byte;
ch, ch2: REChar;
begin
if not IsProgrammOk then
Exit;
op := OP_EXACTLY;
Result := '';
s := regCodeWork;
while op <> OP_EEND do
begin // While that wasn't END last time...
op := s^;
Result := Result + Format('%2d: %s', [s - programm, DumpOp(s^)]);
// Where, what.
next := regNext(s);
if next = nil // Next ptr.
then
Result := Result + ' (0)'
else
begin
if next > s
// ###0.948 PWideChar subtraction workaround (see comments in Tail method for details)
then
Diff := next - s
else
Diff := -(s - next);
Result := Result + Format(' (%d) ', [(s - programm) + Diff]);
end;
Inc(s, REOpSz + RENextOffSz);
if (op = OP_ANYOF) or (op = OP_ANYOFCI) or (op = OP_ANYBUT) or (op = OP_ANYBUTCI) then
begin
repeat
case s^ of
OpKind_End:
begin
Inc(s);
Break;
end;
OpKind_Range:
begin
Result := Result + 'Rng(';
Inc(s);
Result := Result + PrintableChar(s^) + '-';
Inc(s);
Result := Result + PrintableChar(s^);
Result := Result + ') ';
Inc(s);
end;
OpKind_MetaClass:
begin
Inc(s);
Result := Result + DumpCheckerIndex(byte(s^)) + ' ';
Inc(s);
end;
OpKind_Char:
begin
Inc(s);
NLen := PLongInt(s)^;
Inc(s, RENumberSz);
Result := Result + 'Ch(';
for i := 1 to NLen do
begin
Result := Result + PrintableChar(s^);
Inc(s);
end;
Result := Result + ') ';
end;
OpKind_CategoryYes:
begin
Inc(s);
ch := s^;
Inc(s);
ch2 := s^;
Result := Result + DumpCategoryChars(ch, ch2, True);
Inc(s);
end;
OpKind_CategoryNo:
begin
Inc(s);
ch := s^;
Inc(s);
ch2 := s^;
Result := Result + DumpCategoryChars(ch, ch2, False);
Inc(s);
end;
else
Error(reeDumpCorruptedOpcode);
end;
until false;
end;
if (op = OP_EXACTLY) or (op = OP_EXACTLYCI) then
begin
// Literal string, where present.
NLen := PLongInt(s)^;
Inc(s, RENumberSz);
for i := 1 to NLen do
begin
Result := Result + PrintableChar(s^);
Inc(s);
end;
end;
if (op = OP_BSUBEXP) or (op = OP_BSUBEXPCI) then
begin
Result := Result + ' \' + IntToStr(Ord(s^));
Inc(s);
end;
if (op = OP_BRACES) or (op = OP_BRACESNG) or (op = OP_BRACES_POSS) then
begin // ###0.941
// show min/max argument of braces operator
Result := Result + Format('{%d,%d}', [PREBracesArg(AlignToInt(s))^,
PREBracesArg(AlignToInt(s + REBracesArgSz))^]);
Inc(s, REBracesArgSz * 2);
end;
{$IFDEF ComplexBraces}
if (op = OP_LOOP) or (op = OP_LOOPNG) then
begin // ###0.940
Result := Result + Format(' -> (%d) {%d,%d}',
[(s - programm - (REOpSz + RENextOffSz)) +
PRENextOff(AlignToPtr(s + 2 * REBracesArgSz))^,
PREBracesArg(AlignToInt(s))^,
PREBracesArg(AlignToInt(s + REBracesArgSz))^]);
Inc(s, 2 * REBracesArgSz + RENextOffSz);
end;
{$ENDIF}
if (op = OP_ANYCATEGORY) or (op = OP_NOTCATEGORY) then
begin
ch := s^;
Inc(s);
ch2 := s^;
Inc(s);
if ch2<>#0 then
Result := Result + '{' + ch + ch2 + '}'
else
Result := Result + '{' + ch + '}';
end;
Result := Result + #$d#$a;
end; { of while }
// Header fields of interest.
if regAnchored <> #0 then
Result := Result + 'Anchored; ';
if regMustString <> '' then
Result := Result + 'Must have: "' + regMustString + '"; ';
{$IFDEF UseFirstCharSet} // ###0.929
Result := Result + #$d#$a'First charset: ';
if FirstCharSet = [] then
Result := Result + '<empty set>'
else
if FirstCharSet = RegExprAllSet then
Result := Result + '<all chars>'
else
for iByte := 0 to 255 do
if iByte in FirstCharSet then
Result := Result + PrintableChar(REChar(iByte));
{$ENDIF}
Result := Result + #$d#$a;
end; { of function TRegExpr.Dump
-------------------------------------------------------------- }
{$ENDIF}
function TRegExpr.IsFixedLength(var op: TREOp; var ALen: integer): boolean;
var
s, next: PRegExprChar;
N, N2: integer;
begin
Result := False;
ALen := 0;
if not IsCompiled then Exit;
s := regCodeWork;
repeat
next := regNext(s);
op := s^;
Inc(s, REOpSz + RENextOffSz);
case op of
OP_EEND:
begin
Result := True;
Exit;
end;
OP_BRANCH:
begin
op := next^;
if op <> OP_EEND then Exit;
end;
OP_COMMENT,
OP_BOUND,
OP_NOTBOUND:
Continue;
OP_ANY,
OP_ANYML,
OP_ANYDIGIT,
OP_NOTDIGIT,
OP_ANYLETTER,
OP_NOTLETTER,
OP_ANYSPACE,
OP_NOTSPACE,
OP_ANYHORZSEP,
OP_NOTHORZSEP,
OP_ANYVERTSEP,
OP_NOTVERTSEP:
begin
Inc(ALen);
Continue;
end;
OP_ANYOF,
OP_ANYOFCI,
OP_ANYBUT,
OP_ANYBUTCI:
begin
Inc(ALen);
repeat
case s^ of
OpKind_End:
begin
Inc(s);
Break;
end;
OpKind_Range:
begin
Inc(s);
Inc(s);
Inc(s);
end;
OpKind_MetaClass:
begin
Inc(s);
Inc(s);
end;
OpKind_Char:
begin
Inc(s);
Inc(s, RENumberSz + PLongInt(s)^);
end;
OpKind_CategoryYes,
OpKind_CategoryNo:
begin
Inc(s);
Inc(s);
Inc(s);
end;
end;
until False;
end;
OP_EXACTLY,
OP_EXACTLYCI:
begin
N := PLongInt(s)^;
Inc(ALen, N);
Inc(s, RENumberSz + N);
Continue;
end;
OP_ANYCATEGORY,
OP_NOTCATEGORY:
begin
Inc(ALen);
Inc(s, 2);
Continue;
end;
OP_BRACES,
OP_BRACESNG,
OP_BRACES_POSS:
begin
// allow only d{n,n}
N := PREBracesArg(AlignToInt(s))^;
N2 := PREBracesArg(AlignToInt(s + REBracesArgSz))^;
if N <> N2 then
Exit;
Inc(ALen, N-1);
Inc(s, REBracesArgSz * 2);
end;
else
Exit;
end;
until False;
end;
{$IFDEF reRealExceptionAddr}
{$OPTIMIZATION ON}
// ReturnAddr works correctly only if compiler optimization is ON
// I placed this method at very end of unit because there are no
// way to restore compiler optimization flag ...
{$ENDIF}
procedure TRegExpr.Error(AErrorID: integer);
{$IFNDEF LINUX}
{$IFDEF reRealExceptionAddr}
function ReturnAddr: Pointer; // ###0.938
asm
mov eax,[ebp+4]
end;
{$ENDIF}
{$ENDIF}
var
e: ERegExpr;
Msg: string;
begin
fLastError := AErrorID; // dummy stub - useless because will raise exception
Msg := ErrorMsg(AErrorID);
// compilation error ?
if AErrorID < reeFirstRuntimeCode then
Msg := Msg + ' (pos ' + IntToStr(CompilerErrorPos) + ')';
e := ERegExpr.Create(Msg);
e.ErrorCode := AErrorID;
e.CompilerErrorPos := CompilerErrorPos;
raise e
{$IFNDEF LINUX}
{$IFDEF reRealExceptionAddr}
at ReturnAddr
{$ENDIF}
{$ENDIF};
end; { of procedure TRegExpr.Error
-------------------------------------------------------------- }
{$IFDEF Compat} // APIs needed only for users of old FPC 3.0
function TRegExpr.ExecPos(AOffset: integer; ATryOnce: boolean): boolean; overload;
begin
Result := ExecPrim(AOffset, ATryOnce, False, False);
end;
function TRegExpr.OldInvertCase(const Ch: REChar): REChar;
begin
Result := _UpperCase(Ch);
if Result = Ch then
Result := _LowerCase(Ch);
end;
class function TRegExpr.InvertCaseFunction(const Ch: REChar): REChar;
begin
Result := _UpperCase(Ch);
if Result = Ch then
Result := _LowerCase(Ch);
end;
function TRegExpr.GetLinePairedSeparator: RegExprString;
begin
// not supported anymore
Result := '';
end;
procedure TRegExpr.SetLinePairedSeparator(const AValue: RegExprString);
begin
// not supported anymore
end;
procedure TRegExpr.SetUseOsLineEndOnReplace(AValue: boolean);
begin
if fUseOsLineEndOnReplace = AValue then
Exit;
fUseOsLineEndOnReplace := AValue;
if fUseOsLineEndOnReplace then
fReplaceLineEnd := sLineBreak
else
fReplaceLineEnd := #10;
end;
{$ENDIF}
end.