File : g-regpat.adb
------------------------------------------------------------------------------
-- --
-- GNAT LIBRARY COMPONENTS --
-- --
-- G N A T . R E G P A T --
-- --
-- B o d y --
-- --
-- $Revision: 1.3 $
-- --
-- Copyright (C) 1986 by University of Toronto. --
-- Copyright (C) 1996-1999 Ada Core Technologies, Inc. --
-- --
-- GNAT is free software; you can redistribute it and/or modify it under --
-- terms of the GNU General Public License as published by the Free Soft- --
-- ware Foundation; either version 2, or (at your option) any later ver- --
-- sion. GNAT is distributed in the hope that it will be useful, but WITH- --
-- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY --
-- or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License --
-- for more details. You should have received a copy of the GNU General --
-- Public License distributed with GNAT; see file COPYING. If not, write --
-- to the Free Software Foundation, 59 Temple Place - Suite 330, Boston, --
-- MA 02111-1307, USA. --
-- --
-- As a special exception, if other files instantiate generics from this --
-- unit, or you link this unit with other files to produce an executable, --
-- this unit does not by itself cause the resulting executable to be --
-- covered by the GNU General Public License. This exception does not --
-- however invalidate any other reasons why the executable file might be --
-- covered by the GNU Public License. --
-- --
-- GNAT is maintained by Ada Core Technologies Inc (http://www.gnat.com). --
-- --
------------------------------------------------------------------------------
-- This is an altered Ada 95 version of the original V8 style regular
-- expression library written in C by Henry Spencer. Apart from the
-- translation to Ada, the interface has been considerably changed to
-- use the Ada String type instead of C-style nul-terminated strings.
-- The internal format of compiled regular expressions has not been
-- modified though and is binary compatible with the C version.
-- Beware that some of this code is subtly aware of the way operator
-- precedence is structured in regular expressions. Serious changes in
-- regular-expression syntax might require a total rethink.
package body GNAT.Regpat is
-- Structure for pattern matching "program"
MAGIC : constant Character := Character'Val (8#0234#);
-- The first byte of the regexp internal "program" is actually
-- this magic number; the start node begins in the second byte.
-- This is essentially a linear encoding of a nondeterministic
-- finite-state machine, also known as syntax charts or
-- "railroad normal form" in parsing technology.
-- Each node is an opcode plus a "next" pointer, possibly plus an
-- operand. "Next" pointers of all nodes except BRANCH implement
-- concatenation; a "next" pointer with a BRANCH on both ends of it
-- is connecting two alternatives.
-- (Here is one of the subtle syntax dependencies:
-- an individual BRANCH (as opposed to a collection of them) is
-- never concatenated with anything because of operator precedence.)
-- The operand of some types of node is a literal string; for others,
-- it is a node leading into a sub-FSM. In particular, the operand of
-- a BRANCH node is the first node of the branch.
-- (NB this is *not* a tree structure: the tail of the branch connects
-- to the thing following the set of BRANCHes.)
-- The opcodes are:
type Opcode is
(EOP, BOL, EOL, ANY, ANYOF, ANYBUT, BRANCH, BACK,
EXACTLY, NOTHING, STAR, PLUS, WORDA, WORDZ,
RESERVE_1, RESERVE_2, RESERVE_3, RESERVE_4, RESERVE_5, RESERVE_6,
OPEN, OPEN_1, OPEN_2, OPEN_3, OPEN_4, OPEN_5,
OPEN_6, OPEN_7, OPEN_8, OPEN_9,
CLOSE, CLOSE_1, CLOSE_2, CLOSE_3, CLOSE_4, CLOSE_5,
CLOSE_6, CLOSE_7, CLOSE_8, CLOSE_9);
for Opcode'Size use 8;
for Opcode use
-- Name Nr Operand? Meaning
(EOP => 0, -- no End of program
BOL => 1, -- no Match "" at beginning of line
EOL => 2, -- no Match "" at end of line
ANY => 3, -- no Match any one character
ANYOF => 4, -- str Match any character in this string
ANYBUT => 5, -- str Match any character not in this string
BRANCH => 6, -- node Match this alternative, or the next
BACK => 7, -- no Match "", "next" ptr points backward
EXACTLY => 8, -- str Match this string
NOTHING => 9, -- no Match empty string
STAR => 10, -- node Match this (simple) thing 0 or more times
PLUS => 11, -- node Match this (simple) thing 1 or more times
WORDA => 12, -- no Match "" at wordchar, where prev is nonword
WORDZ => 13, -- no Match "" at nonwordchar, where prev is word
RESERVE_1 => 14, -- no Reserved entities for future opcodes
RESERVE_2 => 15, -- These MUST be present, for converting an
RESERVE_3 => 16, -- Opcode to Character, see Get_Next
RESERVE_4 => 17, -- This is also desirable for speed reasons
RESERVE_5 => 18,
RESERVE_6 => 19,
OPEN => 20, -- no Mark this point in input as start of #n
OPEN_1 => 21,
OPEN_2 => 22,
OPEN_3 => 23,
OPEN_4 => 24,
OPEN_5 => 25,
OPEN_6 => 26,
OPEN_7 => 27,
OPEN_8 => 28,
OPEN_9 => 29,
CLOSE => 30, -- no Analogous to OPEN
CLOSE_1 => 31,
CLOSE_2 => 32,
CLOSE_3 => 33,
CLOSE_4 => 34,
CLOSE_5 => 35,
CLOSE_6 => 36,
CLOSE_7 => 37,
CLOSE_8 => 38,
CLOSE_9 => 39);
-- Opcode notes:
-- BRANCH
-- The set of branches constituting a single choice are hooked
-- together with their "next" pointers, since precedence prevents
-- anything being concatenated to any individual branch. The
-- "next" pointer of the last BRANCH in a choice points to the
-- thing following the whole choice. This is also where the
-- final "next" pointer of each individual branch points; each
-- branch starts with the operand node of a BRANCH node.
-- BACK
-- Normal "next" pointers all implicitly point forward;
-- BACK exists to make loop structures possible.
-- STAR,PLUS
-- '?', and complex '*' and '+', are implemented as circular
-- BRANCH structures using BACK. Simple cases (one character
-- per match) are implemented with STAR and PLUS for speed
-- and to minimize recursive plunges.
-- OPEN,CLOSE
-- ...are numbered at compile time.
-- A node is one char of opcode followed by two chars of "next" pointer.
-- "Next" pointers are stored as two 8-bit pieces, high order first. The
-- value is a positive offset from the opcode of the node containing it.
-- An operand, if any, simply follows the node. (Note that much of the
-- code generation knows about this implicit relationship.)
-- Using two bytes for the "next" pointer is vast overkill for most
-- things, but allows patterns to get big without disasters.
-----------------------
-- Local Subprograms --
-----------------------
function "+" (Left : Opcode; Right : Integer) return Opcode;
function "-" (Left : Opcode; Right : Opcode) return Integer;
function "=" (Left : Character; Right : Opcode) return Boolean;
function Is_Mult (C : Character) return Boolean;
-- Return True iff C is a regexp multiplier: '+', '*' or '?'
function Is_Word_Char (C : Character) return Boolean;
-- Return True iff C is an Ascii letter or an underscore ('_')
function Operand (P : Pointer) return Pointer;
-- Return a pointer to the operand of the node at P
function Get_Next_Offset
(Program : Program_Data;
IP : Pointer) return Pointer;
-- Get the offset field of a node. Used by Get_Next.
function Get_Next
(Program : Program_Data;
IP : Pointer) return Pointer;
-- Dig the next instruction pointer out of a node
procedure Optimize
(Self : in out Pattern_Matcher);
-- Optimize a Pattern_Matcher by noting certain special cases
-- All of the following subprograms are tiny and should be inlined
pragma Inline ("+");
pragma Inline ("-");
pragma Inline ("=");
pragma Inline (Is_Mult);
pragma Inline (Is_Word_Char);
pragma Inline (Get_Next);
pragma Inline (Get_Next_Offset);
pragma Inline (Operand);
type Expression_Flags is record
Has_Width, -- Known never to match null string
Simple, -- Simple enough to be STAR/PLUS operand
SP_Start : Boolean; -- Starts with * or +
end record;
---------
-- "=" --
---------
function "=" (Left : Character; Right : Opcode) return Boolean is
begin
return Character'Pos (Left) = Opcode'Pos (Right);
end "=";
---------
-- "+" --
---------
function "+" (Left : Opcode; Right : Integer) return Opcode is
begin
return Opcode'Val (Opcode'Pos (Left) + Right);
end "+";
---------
-- "-" --
---------
function "-" (Left : Opcode; Right : Opcode) return Integer is
begin
return Opcode'Pos (Left) - Opcode'Pos (Right);
end "-";
-------------
-- Is_Mult --
-------------
function Is_Mult (C : Character) return Boolean is
begin
return (C = '*' or C = '+' or C = '?');
end Is_Mult;
------------------
-- Is_Word_Char --
------------------
function Is_Word_Char (C : Character) return Boolean is
begin
return (C in 'a' .. 'z' or C in 'A' .. 'Z' or C = '_');
end Is_Word_Char;
-------------
-- Operand --
-------------
function Operand (P : Pointer) return Pointer is
begin
return P + 3;
end Operand;
---------------------
-- Get_Next_Offset --
---------------------
function Get_Next_Offset
(Program : Program_Data;
IP : Pointer)
return Pointer
is
begin
return Pointer (Character'Pos (Program (IP + 1))
+ 256 * Character'Pos (Program (IP + 2)));
end Get_Next_Offset;
--------------
-- Get_Next --
--------------
function Get_Next (Program : Program_Data; IP : Pointer) return Pointer is
Offset : Pointer;
Op : constant Opcode := Opcode'Val (Character'Pos (Program (IP)));
begin
pragma Assert (IP /= 0);
Offset := Get_Next_Offset (Program, IP);
if Offset = 0 then
return 0;
elsif Op = BACK then
return IP - Offset;
else
return IP + Offset;
end if;
end Get_Next;
Worst_Expression : constant Expression_Flags := (others => False);
-- Worst case
-------------
-- Compile --
-------------
procedure Compile
(Matcher : out Pattern_Matcher;
Expression : String;
Final_Code_Size : out Program_Size)
is
-- We can't allocate space until we know how big the compiled form
-- will be, but we can't compile it (and thus know how big it is)
-- until we've got a place to put the code. So we cheat: we compile
-- it twice, once with code generation turned off and size counting
-- turned on, and once "for real".
-- This also means that we don't allocate space until we are sure
-- that the thing really will compile successfully, and we never
-- have to move the code and thus invalidate pointers into it.
-- Beware that the optimization-preparation code in here knows
-- about some of the structure of the compiled regexp.
PM : Pattern_Matcher renames Matcher;
Program : Program_Data renames PM.Program;
Emit_Code : constant Boolean := PM.Size > 0;
Emit_Ptr : Pointer := Program_First;
Code_Size : Natural := 0;
Par_Count : Natural range 0 .. Max_Nesting := 1; -- () count
Parse_Pos : Natural := Expression'First; -- Input-scan pointer
Parse_End : Natural := Expression'Last;
----------------------------
-- Subprograms for Create --
----------------------------
procedure Emit (B : Character);
-- If code-generation is enabled, output the Character to the Program.
-- Otherwise just increases Code_Size.
function Emit_Node (Op : Opcode) return Pointer;
-- If code-generation is enabled, Emit_Node outputs the
-- opcode and reserves space for a pointer to the next node.
-- Return value is the location of new opcode, ie old Emit_Ptr.
procedure Parse
(Parenthesized : Boolean;
Flags : in out Expression_Flags;
IP : out Pointer);
-- Parse regular expression, i.e. main body or parenthesized thing
-- Caller must absorb opening parenthesis.
procedure Parse_Branch
(Flags : in out Expression_Flags;
IP : out Pointer);
-- Implements the concatenation operator and handles '|'
procedure Parse_Piece
(Flags : in out Expression_Flags; IP : out Pointer);
-- Parse something followed by possible [*+?]
procedure Parse_Atom
(Flags : in out Expression_Flags; IP : out Pointer);
-- Parse_Atom is the lowest level parse procedure.
-- Optimization: gobbles an entire sequence of ordinary characters
-- so that it can turn them into a single node, which is smaller to
-- store and faster to run. Backslashed characters are exceptions,
-- each becoming a separate node; the code is simpler that way and
-- it's not worth fixing.
procedure Insert_Operator (Op : Opcode; Operand : Pointer);
-- Insert_Operator inserts an operator in front of an
-- already-emitted operand and relocates the operand.
procedure Link_Tail (P, Val : Pointer);
-- Link_Tail sets the next-pointer at the end of a node chain
procedure Link_Operand_Tail (P, Val : Pointer);
-- Link_Tail on operand of first argument; nop if operandless
function Next_Instruction (P : Pointer) return Pointer;
-- Dig the "next" pointer out of a node
procedure Fail (M : in String);
-- Fail with a diagnostic message, if possible
----------
-- Fail --
----------
procedure Fail (M : in String) is
begin
-- M should be passed as exception message, but
-- this is not possible in a Pure package.
raise Expression_Error;
end Fail;
-----------
-- Parse --
-----------
-- Combining parenthesis handling with the base level
-- of regular expression is a trifle forced, but the
-- need to tie the tails of the branches to what follows
-- makes it hard to avoid.
procedure Parse
(Parenthesized : in Boolean;
Flags : in out Expression_Flags;
IP : out Pointer)
is
E : String renames Expression;
Br : Pointer;
Ender : Pointer;
Par_No : Natural;
New_Flags : Expression_Flags;
begin
Flags := (Has_Width => True, others => False); -- Tentatively
-- Make an OPEN node, if parenthesized
if Parenthesized then
if Par_Count >= Max_Nesting then
Fail ("too many ()");
end if;
Par_No := Par_Count;
Par_Count := Par_Count + 1;
IP := Emit_Node (OPEN + Par_No);
else
IP := 0;
end if;
-- Pick up the branches, linking them together
Parse_Branch (New_Flags, Br);
if Br = 0 then
IP := 0;
return;
end if;
if IP /= 0 then
Link_Tail (IP, Br); -- OPEN -> first
else
IP := Br;
end if;
if not New_Flags.Has_Width then
Flags.Has_Width := False;
end if;
Flags.SP_Start := Flags.SP_Start or New_Flags.SP_Start;
while Parse_Pos <= Parse_End
and then (E (Parse_Pos) = '|' or else E (Parse_Pos) = Ascii.LF)
loop
Parse_Pos := Parse_Pos + 1;
Parse_Branch (New_Flags, Br);
if Br = 0 then
IP := 0;
return;
end if;
Link_Tail (IP, Br); -- BRANCH -> BRANCH
if not New_Flags.Has_Width then
Flags.Has_Width := False;
end if;
Flags.SP_Start := Flags.SP_Start or New_Flags.SP_Start;
end loop;
-- Make a closing node, and hook it on the end
if Parenthesized then
Ender := Emit_Node (CLOSE + Par_No);
else
Ender := Emit_Node (EOP);
end if;
Link_Tail (IP, Ender);
-- Hook the tails of the branches to the closing node
Br := IP;
loop
exit when Br = 0;
Link_Operand_Tail (Br, Ender);
Br := Next_Instruction (Br);
end loop;
-- Check for proper termination
if Parenthesized then
if Parse_Pos > Parse_End or else E (Parse_Pos) /= ')' then
Fail ("unmatched ()");
end if;
Parse_Pos := Parse_Pos + 1;
elsif Parse_Pos <= Parse_End then
if E (Parse_Pos) = ')' then
Fail ("unmatched ()");
else
Fail ("junk on end"); -- "Can't happen"
end if;
end if;
end Parse;
------------------
-- Parse_Branch --
------------------
procedure Parse_Branch
(Flags : in out Expression_Flags;
IP : out Pointer)
is
E : String renames Expression;
Chain : Pointer;
Last : Pointer;
New_Flags : Expression_Flags;
Dummy : Pointer;
begin
Flags := Worst_Expression; -- Tentatively
IP := Emit_Node (BRANCH);
Chain := 0;
while Parse_Pos <= Parse_End
and then E (Parse_Pos) /= ')'
and then E (Parse_Pos) /= Ascii.LF
and then E (Parse_Pos) /= '|'
loop
Parse_Piece (New_Flags, Last);
if Last = 0 then
IP := 0;
return;
end if;
Flags.Has_Width := Flags.Has_Width or New_Flags.Has_Width;
if Chain = 0 then -- First piece
Flags.SP_Start := Flags.SP_Start or New_Flags.SP_Start;
else
Link_Tail (Chain, Last);
end if;
Chain := Last;
end loop;
if Chain = 0 then -- Loop ran zero times
Dummy := Emit_Node (NOTHING);
end if;
end Parse_Branch;
-----------------
-- Parse_Piece --
-----------------
-- Note that the branching code sequences used for '?' and the
-- general cases of '*' and + are somewhat optimized: they use
-- the same NOTHING node as both the endmarker for their branch
-- list and the body of the last branch. It might seem that
-- this node could be dispensed with entirely, but the endmarker
-- role is not redundant.
procedure Parse_Piece
(Flags : in out Expression_Flags;
IP : out Pointer)
is
Op : Character;
Next : Pointer;
New_Flags : Expression_Flags;
begin
Parse_Atom (New_Flags, IP);
if IP = 0 then
return;
end if;
if Parse_Pos > Parse_End
or else not Is_Mult (Expression (Parse_Pos))
then
Flags := New_Flags;
return;
end if;
Op := Expression (Parse_Pos);
if not New_Flags.Has_Width and Op /= '?' then
Fail ("*+ Operand could be empty");
end if;
if Op /= '+' then
Flags := (SP_Start => True, others => False);
else
Flags := (Has_Width => True, others => False);
end if;
if Op = '*' and New_Flags.Simple then
Insert_Operator (STAR, IP);
elsif Op = '*' then
-- Emit x* as (x&|), where & means "self"
Insert_Operator (BRANCH, IP); -- Either x
Link_Operand_Tail (IP, Emit_Node (BACK)); -- and loop
Link_Operand_Tail (IP, IP); -- back
Link_Tail (IP, Emit_Node (BRANCH)); -- or
Link_Tail (IP, Emit_Node (NOTHING)); -- null
elsif Op = '+' and New_Flags.Simple then
Insert_Operator (PLUS, IP);
elsif Op = '+' then
-- Emit x+ as x(&|), where & means "self"
Next := Emit_Node (BRANCH); -- Either
Link_Tail (IP, Next); --
Link_Tail (Emit_Node (BACK), IP); -- loop back
Link_Tail (Next, Emit_Node (BRANCH)); -- or
Link_Tail (IP, Emit_Node (NOTHING)); -- null
elsif Op = '?' then
-- Emit x? as (x|)
Insert_Operator (BRANCH, IP); -- Either x
Link_Tail (IP, Emit_Node (BRANCH)); -- or
Next := Emit_Node (NOTHING); -- null
Link_Tail (IP, Next);
Link_Operand_Tail (IP, Next);
end if;
Parse_Pos := Parse_Pos + 1;
if Parse_Pos <= Parse_End
and then Is_Mult (Expression (Parse_Pos))
then
Fail ("nested *?+");
end if;
end Parse_Piece;
----------------
-- Parse_Atom --
----------------
procedure Parse_Atom
(Flags : in out Expression_Flags;
IP : out Pointer)
is
E : String renames Expression;
C : Character;
procedure Parse_Literal;
-- Parse_Literal encodes a string of characters
-- to be matched exactly.
--
-- This is a bit tricky due to quoted chars and due to
-- the multiplier characters '*', '+', and '?' that
-- take the SINGLE char previous as their operand.
--
-- On entry, the character at Parse_Pos - 1 is going to go
-- into the string, no matter what it is. It could be
-- following a \ if Parse_Atom was entered from the '\' case.
--
-- Basic idea is to pick up a good char in C and examine
-- the next char. If Is_Mult (C) then twiddle, if it's a \
-- then frozzle and if it's another magic char then push C and
-- terminate the string. If none of the above, push C on the
-- string and go around again.
--
-- Start_Pos is used to remember where "the current character"
-- starts in the string, if due to an Is_Mult we need to back
-- up and put the current char in a separate 1-character string.
-- When Start_Pos is 0, C is the only char in the string;
-- this is used in Is_Mult handling, and in setting the SIMPLE
-- flag at the end.
-------------------
-- Parse_Literal --
-------------------
procedure Parse_Literal is
Start_Pos : Natural;
C : Character;
begin
Parse_Pos := Parse_Pos - 1; -- Look at current character
IP := Emit_Node (EXACTLY);
Start_Pos := 0; -- Is this right ???
Parse_Loop :
loop
<<continue>>
C := E (Parse_Pos); -- Get current character
Parse_Pos := Parse_Pos + 1;
if Parse_Pos > Parse_End then
Emit (C); -- dump current character
exit Parse_Loop; -- and we are done
end if;
case E (Parse_Pos) is -- look at next one
when '.' | '[' | '(' | ')' | '|' | Ascii.LF |
-- ??? Chars '$' and '^' should not always be magic
'$' | '^' =>
Emit (C); -- dump cur char
exit Parse_Loop; -- and we are done
when '?' | '+' | '*' =>
if Start_Pos = 0 then -- If just C in str,
Emit (C); -- dump cur char
exit Parse_Loop; -- and we are done
end if;
-- End mult-char string one early
Parse_Pos := Start_Pos; -- Back up parse
exit Parse_Loop;
when '\' =>
Emit (C); -- Cur char OK
-- Look after \
if Parse_Pos + 1 > Parse_End
or else E (Parse_Pos + 1) = '<'
or else E (Parse_Pos + 1) = '>'
then
-- ??? Someday handle \1, \2, ...
exit Parse_Loop; -- Not quoted
else
-- Backup point is \, scan point is after it
Start_Pos := Parse_Pos;
Parse_Pos := Parse_Pos + 1;
goto continue; -- NOT exit
end if;
when others =>
-- Add current character to string
Emit (C);
end case;
-- Set backup point
Start_Pos := Parse_Pos;
end loop Parse_Loop;
Emit (Ascii.NUL);
Flags.Has_Width := True;
if Start_Pos = 0 then -- One character?
Flags.Simple := True;
end if;
end Parse_Literal;
-- Start of processing for Parse_Atom
begin
-- Tentatively set worst expression case
Flags := Worst_Expression;
C := E (Parse_Pos);
Parse_Pos := Parse_Pos + 1;
case (C) is
when '^' => IP := Emit_Node (BOL); -- These only have meaning at
when '$' => IP := Emit_Node (EOL); -- beginning/end of pattern ???
when '.' => IP := Emit_Node (ANY);
Flags.Has_Width := True;
Flags.Simple := True;
when '[' =>
declare
Class : Character;
Class_End : Character;
begin
if Parse_Pos <= Parse_End
and then E (Parse_Pos) = '^'
then
-- Complement of range
IP := Emit_Node (ANYBUT);
Parse_Pos := Parse_Pos + 1;
else
IP := Emit_Node (ANYOF);
end if;
if Parse_Pos <= Parse_End and then
(E (Parse_Pos) = ']' or E (Parse_Pos) = '-')
then
C := E (Parse_Pos);
Parse_Pos := Parse_Pos + 1;
Emit (C);
end if;
while Parse_Pos <= Parse_End
and then E (Parse_Pos) /= ']'
loop
if E (Parse_Pos) = '-' then
Parse_Pos := Parse_Pos + 1;
if Parse_Pos > Parse_End
or else E (Parse_Pos) = ']'
then
Emit ('-');
else
Class := Character'Succ
(E (Parse_Pos - 2));
Class_End := E (Parse_Pos);
if Class > Character'Succ (Class_End) then
Fail ("invalid [] range");
end if;
for C in Class .. Class_End loop
Emit (C);
end loop;
Parse_Pos := Parse_Pos + 1;
end if;
else
C := E (Parse_Pos);
Parse_Pos := Parse_Pos + 1;
Emit (C);
end if;
end loop;
Emit (Ascii.NUL);
if not (Parse_Pos <= Parse_End
and then E (Parse_Pos) = ']')
then
Fail ("unmatched []");
end if;
Parse_Pos := Parse_Pos + 1;
Flags.Has_Width := True;
Flags.Simple := True;
end;
when '(' =>
declare
New_Flags : Expression_Flags;
begin
Parse (True, New_Flags, IP);
if IP = 0 then
return;
end if;
Flags.Has_Width :=
Flags.Has_Width or New_Flags.Has_Width;
Flags.SP_Start :=
Flags.SP_Start or New_Flags.SP_Start;
end;
when '|' | Ascii.LF | ')' =>
Fail ("internal urp"); -- Supposed to be caught earlier
when '?' | '+' | '*' =>
Fail ("?+* follows nothing");
when '\' =>
if Parse_Pos > Parse_End then
Fail ("trailing \");
end if;
C := E (Parse_Pos);
Parse_Pos := Parse_Pos + 1;
case C is
when '<' => IP := Emit_Node (WORDA);
when '>' => IP := Emit_Node (WORDZ);
when others =>
-- ??? Someday handle \1, \2, ...
-- Handle general quoted chars in exact-match routine
Parse_Literal;
end case;
when others => Parse_Literal;
end case;
end Parse_Atom;
---------------
-- Emit_Node --
---------------
function Emit_Node (Op : Opcode) return Pointer is
Result : Pointer := Emit_Ptr;
begin
if Emit_Code then
Program (Emit_Ptr) := Character'Val (Opcode'Pos (Op));
Program (Emit_Ptr + 1) := Ascii.NUL;
Program (Emit_Ptr + 2) := Ascii.NUL;
Emit_Ptr := Emit_Ptr + 3;
else
Code_Size := Code_Size + 3;
end if;
return Result;
end Emit_Node;
----------
-- Emit --
----------
procedure Emit (B : Character) is
begin
if Emit_Code then
Program (Emit_Ptr) := B;
Emit_Ptr := Emit_Ptr + 1;
else
Code_Size := Code_Size + 1;
end if;
end Emit;
---------------------
-- Insert_Operator --
---------------------
procedure Insert_Operator (Op : Opcode; Operand : Pointer) is
Source : Pointer;
Dest : Pointer;
Old : Pointer;
begin
if Emit_Code then
-- ??? Should rewrite using aggregate?
Source := Emit_Ptr;
Dest := Emit_Ptr + 3;
while Source > Operand loop
Dest := Dest - 1;
Source := Source - 1;
Program (Dest) := Program (Source);
end loop;
end if;
-- Op node, where operand used to be
Old := Emit_Node (Op);
end Insert_Operator;
---------------
-- Link_Tail --
---------------
procedure Link_Tail (P, Val : Pointer) is
Scan : Pointer;
Temp : Pointer;
Offset : Pointer;
begin
if not Emit_Code then
return;
end if;
-- Find last node
Scan := P;
loop
Temp := Next_Instruction (Scan);
exit when Temp = 0;
Scan := Temp;
end loop;
if Program (Scan) = BACK then
Offset := Scan - Val;
else
Offset := Val - Scan;
end if;
Program (Scan + 1) := Character'Val (Offset / 256);
Program (Scan + 2) := Character'Val (Offset mod 256);
end Link_Tail;
-----------------------
-- Link_Operand_Tail --
-----------------------
procedure Link_Operand_Tail (P, Val : Pointer) is
begin
-- "Operandless" and "op /= BRANCH" are synonymous in practice
if Emit_Code and Program (P) = BRANCH then
Link_Tail (Operand (P), Val);
end if;
end Link_Operand_Tail;
----------------------
-- Next_Instruction --
----------------------
function Next_Instruction (P : Pointer) return Pointer is
Offset : Pointer;
begin
if not Emit_Code then
return 0;
end if;
Offset := Get_Next_Offset (Program, P);
if Offset = 0 then
return 0;
end if;
if Program (P) = BACK then
return P - Offset;
else
return P + Offset;
end if;
end Next_Instruction;
Flags : Expression_Flags;
Result : Pointer;
-- Start of processing for Compile
begin
Emit (MAGIC);
Parse (False, Flags, Result);
if Result = 0 then
Fail ("Couldn't compile expression");
end if;
Final_Code_Size := Program_Size (Code_Size);
if Emit_Code then
Optimize (PM);
elsif Final_Code_Size >= PM.Size then
raise Storage_Error; -- Expression too big
end if;
end Compile;
-- Function version of Compile
function Compile
(Expression : String)
return Pattern_Matcher
is
Size : Program_Size;
Dummy : Pattern_Matcher (0);
begin
Compile (Dummy, Expression, Size);
declare
Result : Pattern_Matcher (Size);
begin
Compile (Result, Expression, Size);
return Result;
end;
end Compile;
--------------
-- Optimize --
--------------
procedure Optimize (Self : in out Pattern_Matcher) is
Max_Length : Natural;
This_Length : Natural;
Longest : Pointer;
Scan : Pointer;
Program : Program_Data renames Self.Program;
function String_Length (Start : Pointer) return Natural;
-- Returns the length of a string starting at Start in
-- the Program. Raises Constraint_Error if the string
-- is not terminated.
-- ??? Should allow embedded nulls, but don't want to be
-- incompatible with standard regexp machines now.
-------------------
-- String_Length --
-------------------
function String_Length (Start : Pointer) return Natural is
begin
for J in Start .. Program'Last loop
if Program (J) = Ascii.NUL then
return Natural (J - Start);
end if;
end loop;
raise Constraint_Error;
end String_Length;
-- Start of processing for Optimize
begin
-- Start with safe defaults (no optimization):
-- * No known first character of match
-- * Does not necessarily start at beginning of line
-- * No string known that has to appear in data
Self.First := Ascii.NUL;
Self.Anchored := False;
Self.Must_Have := Program'Last + 1;
Self.Must_Have_Length := 0;
Scan := Program_First + 1; -- First BRANCH
if Program (Get_Next (Program, Scan)) = EOP then
-- Only one top-level choice
Scan := Operand (Scan);
-- Determing starting-point info
if Program (Scan) = EXACTLY then
Self.First := Program (Operand (Scan));
elsif Program (Scan) = BOL then
Self.Anchored := True;
end if;
-- If there's something expensive in the regexp, find the
-- longest literal string that must appear and make it the
-- regmust. Resolve ties in favor of later strings, since
-- the regstart check works with the beginning of the regexp.
-- and avoiding duplication strengthens checking. Not a
-- strong reason, but sufficient in the absence of others.
if False then -- if Flags.SP_Start then ???
Longest := 0;
Max_Length := 0;
while Scan /= 0 loop
if Program (Scan) = EXACTLY then
This_Length := String_Length (Operand (Scan));
if This_Length >= Max_Length then
Longest := Operand (Scan);
Max_Length := This_Length;
end if;
end if;
Scan := Get_Next (Program, Scan);
end loop;
Self.Must_Have := Longest;
Self.Must_Have_Length := Max_Length;
end if;
end if;
end Optimize;
-----------
-- Match --
-----------
procedure Match
(Self : Pattern_Matcher;
Data : String;
Matches : out Match_Array)
is
Program : Program_Data renames Self.Program; -- Shorter notation
-- Global work variables
Input_Pos : Natural; -- String-input pointer
BOL_Pos : Natural; -- Beginning of input, for ^ check
Matched : Boolean := False; -- Until proven True
-----------------------
-- Local Subprograms --
-----------------------
function Next_Instruction (P : Pointer) return Pointer;
-- Next_Instruction - dig the "next" pointer out of a node,
-- similar to the one in Create.
-- Repeated for execution speed (less params to pass, one test missing)
function Index (Start : Positive; C : Character) return Natural;
-- Finds character C in Data starting at Start and returns position
function Repeat (IP : Pointer) return Natural;
-- Repeatedly match something simple, report how many
function Try (Pos : in Positive) return Boolean;
-- Try to match at specific point
function Match (IP : Pointer) return Boolean;
-- This is the main matching routine. Conceptually the strategy
-- is simple: check to see whether the current node matches,
-- call self recursively to see whether the rest matches,
-- and then act accordingly.
--
-- In practice Match makes some effort to avoid recursion, in
-- particular by going through "ordinary" nodes (that don't
-- need to know whether the rest of the match failed) by
-- using a loop instead of recursion.
function Match_Character (IP : Pointer; Key : Character) return Boolean;
pragma Inline (Next_Instruction);
pragma Inline (Index);
pragma Inline (Repeat);
pragma Inline (Match_Character);
---------------------
-- Match_Character --
---------------------
function Match_Character
(IP : Pointer;
Key : Character)
return Boolean
is
Matches : Boolean := False; -- Until proven True
C : Character;
begin
for J in IP .. Program'Last loop
C := Program (IP);
Matches := (C = Key);
exit when Matches or (C = Ascii.Nul);
end loop;
return Matches;
end Match_Character;
----------------------
-- Next_Instruction --
----------------------
function Next_Instruction (P : Pointer) return Pointer is
Offset : constant Pointer := Get_Next_Offset (Program, P);
begin
if Offset = 0 then
return 0;
elsif Program (P) = BACK then
return P - Offset;
else
return P + Offset;
end if;
end Next_Instruction;
-----------
-- Index --
-----------
function Index
(Start : Positive;
C : Character)
return Natural
is
begin
for J in Start .. Data'Last loop
if Data (J) = C then
return J;
end if;
end loop;
return 0;
end Index;
------------
-- Repeat --
------------
function Repeat (IP : Pointer) return Natural is
Scan : Natural := Input_Pos;
Last : constant Natural := Data'Last;
Op : constant Opcode := Opcode'Val (Character'Pos (Program (IP)));
Opnd : constant Pointer := Operand (IP);
Count : Natural;
begin
case Op is
when ANY =>
Scan := Last + 1;
when EXACTLY =>
while Scan <= Last
and then Program (Opnd) = Data (Scan)
loop
Scan := Scan + 1;
end loop;
when ANYOF =>
while Scan <= Last
and then Match_Character (Opnd, Data (Scan))
loop
Scan := Scan + 1;
end loop;
when ANYBUT =>
while Scan <= Last
and then not Match_Character (Opnd, Data (Scan))
loop
Scan := Scan + 1;
end loop;
when others =>
-- Repeat was called inappropriately, internal error
raise Program_Error;
end case;
Count := Scan - Input_Pos;
Input_Pos := Scan;
return Count;
end Repeat;
-----------
-- Match --
-----------
function Match
(IP : Pointer)
return Boolean
is
Scan : Pointer := IP;
Next : Pointer;
Offset : Pointer;
Save : Natural;
Op : Opcode;
begin
State_Machine :
loop
pragma Assert (Scan /= 0);
-- Determine current opcode and count its usage in debug mode
Op := Opcode'Val (Character'Pos ((Program (Scan))));
-- Calculate offset of next instruction.
-- Second character is most significant in Program_Data.
Offset := Pointer (Character'Pos (Program (Scan + 1))
+ 256 * Character'Pos (Program (Scan + 2)));
-- Finally calculate next instruction pointer
if Op = BACK then
Next := Scan - Offset;
else
Next := Scan + Offset;
end if;
case Op is
when BRANCH =>
if Program (Next) /= BRANCH then
Next := Operand (Scan); -- No choide, avoid recursion
else
loop
Save := Input_Pos;
if Match (Operand (Scan)) then
return True;
end if;
Input_Pos := Save;
Scan := Next_Instruction (Scan);
exit when Scan = 0 or Program (Scan) /= BRANCH;
end loop;
exit State_Machine;
end if;
when NOTHING | BACK =>
null;
when BOL =>
exit State_Machine when Input_Pos /= BOL_Pos;
when EOL =>
exit State_Machine when Input_Pos <= Data'Last;
when WORDA =>
exit State_Machine when
not Is_Word_Char (Data (Input_Pos))
-- Must be looking at a letter, digit, or _
or else
(Input_Pos > BOL_Pos
and then Is_Word_Char (Data (Input_Pos - 1)));
-- Prev must be BOL or nonword
when WORDZ =>
-- Must be looking at non letter, digit, or '_'.
-- We don't care what the previous char was.
exit State_Machine when Is_Word_Char (Data (Input_Pos));
when ANY =>
exit State_Machine when Input_Pos > Data'Last;
Input_Pos := Input_Pos + 1;
when EXACTLY =>
declare
Opnd : Pointer := Operand (Scan);
Current : Positive := Input_Pos;
begin
while Current <= Data'Last
and then Program (Opnd) /= Ascii.Nul
loop
exit State_Machine
when Program (Opnd) /= Data (Current);
Current := Current + 1;
Opnd := Opnd + 1;
end loop;
Input_Pos := Current;
end;
when ANYOF =>
exit State_Machine when
Input_Pos > Data'Last
or else
not Match_Character
(Operand (Scan), Data (Input_Pos));
Input_Pos := Input_Pos + 1;
when ANYBUT =>
exit State_Machine when
Input_Pos > Data'Last
or else
Match_Character (Operand (Scan), Data (Input_Pos));
Input_Pos := Input_Pos + 1;
when OPEN_1 .. OPEN_9 =>
Save := Input_Pos;
-- Recursively match expression between ()
exit State_Machine when not Match (Next);
-- Subexpression matched, so store results
declare
No : constant Natural := Op - OPEN;
begin
-- Don't set start of match if some later
-- invocation of the same parentheses
-- already has.
if No <= Matches'Last
and then Matches (No).First = 0
then
Matches (No).First := Save;
end if;
end;
return True;
when CLOSE_1 .. CLOSE_9 =>
Save := Input_Pos;
exit State_Machine when not Match (Next);
declare
No : constant Natural := Op - CLOSE;
begin
-- Don't set Match (No).Last if some later
-- invocation of the same parentheses
-- already has.
if No <= Matches'Last
and then Matches (No).Last = 0
then
Matches (No).Last := Save - 1;
end if;
end;
return True;
when STAR | PLUS =>
declare
Next_Char : Character := Ascii.Nul;
No : Integer;
Min : Natural;
begin
-- Lookahead to avoid useless match attempts
-- when we know what character comes next.
if Program (Next) = EXACTLY then
Next_Char := Program (Operand (Next));
end if;
if Op = STAR then
Min := 0;
else
Min := 1;
end if;
Save := Input_Pos;
No := Repeat (Operand (Scan));
while No >= Min loop
-- If it could work, try it
if (Next_Char = Ascii.NUL
or Data (Input_Pos) = Next_Char)
and then Match (Next)
then
return True;
end if;
-- Could or did not work, back up
No := No - 1;
Input_Pos := Save + No;
end loop;
exit State_Machine;
end;
when EOP =>
return True; -- Success!
when others =>
raise Program_Error; -- Invalid instruction
end case;
Scan := Next;
end loop State_Machine;
-- If we get here, there is no match.
-- For successful matches when EOP" is the terminating point.
return False;
end Match;
---------
-- Try --
---------
function Try (Pos : in Positive) return Boolean is
begin
Input_Pos := Pos;
Matches := (others => No_Match);
if Match (Program_First + 1) then
Matches (0) := (Pos, Input_Pos - 1);
return True;
end if;
return False;
end Try;
-- Start of processing for Match
begin
-- Check validity of program
pragma Assert
(Program (Program_First) /= MAGIC,
"Corrupted Pattern_Matcher");
-- If there is a "must appear" string, look for it
if Self.Must_Have_Length > 0 then
declare
First : constant Character := Program (Self.Must_Have);
Must_First : constant Pointer := Self.Must_Have;
Must_Last : constant Pointer :=
Must_First + Pointer (Self.Must_Have_Length - 1);
Next_Try : Natural := Index (Data'First, First);
begin
while Next_Try /= 0
and then Data (Next_Try .. Next_Try + Self.Must_Have_Length - 1)
= String (Program (Must_First .. Must_Last))
loop
Next_Try := Index (Next_Try + 1, First);
end loop;
if Next_Try = 0 then
return; -- Not present
end if;
end;
end if;
-- Mark beginning of line for ^
BOL_Pos := Data'First;
-- Simplest case first: an anchored match need be tried only once
if Self.Anchored then
Matched := Try (Data'First);
elsif Self.First /= Ascii.NUL then
-- We know what char it must start with
declare
Next_Try : Natural := Index (Data'First, Self.First);
begin
while Next_Try /= 0 loop
Matched := Try (Next_Try);
exit when Matched;
Next_Try := Index (Next_Try + 1, Self.First);
end loop;
end;
else
-- Messy cases: try all locations
for S in Data'Range loop
Matched := Try (S);
exit when Matched;
end loop;
end if;
-- Matched has its value
return;
end Match;
-----------
-- Match --
-----------
function Match
(Self : Pattern_Matcher;
Data : String)
return Natural
is
Matches : Match_Array;
begin
Match (Self, Data, Matches);
if Matches (0).First < Data'First then
return Data'First - 1;
end if;
return Matches (0).First;
end Match;
procedure Match
(Expression : String;
Data : String;
Matches : out Match_Array;
Size : Program_Size := Default_Program_Size)
is
PM : Pattern_Matcher (Size);
Final_Size : Program_Size; -- unused
begin
Compile (PM, Expression, Final_Size);
Match (PM, Data, Matches);
end Match;
function Match
(Expression : String;
Data : String;
Size : Program_Size := Default_Program_Size)
return Natural
is
PM : Pattern_Matcher (Size);
Final_Size : Program_Size; -- unused
begin
Compile (PM, Expression, Final_Size);
return Match (PM, Data);
end Match;
function Match
(Expression : String;
Data : String;
Size : Program_Size := Default_Program_Size)
return Boolean
is
Matches : Match_Array;
PM : Pattern_Matcher (Size);
Final_Size : Program_Size; -- unused
begin
Compile (PM, Expression, Final_Size);
Match (PM, Data, Matches);
return Matches (0).First >= Data'First;
end Match;
end GNAT.Regpat;