File : g-regpat.adb


------------------------------------------------------------------------------
--                                                                          --
--                         GNAT LIBRARY COMPONENTS                          --
--                                                                          --
--                          G N A T . R E G P A T                           --
--                                                                          --
--                                 B o d y                                  --
--                                                                          --
--                            $Revision: 1.3 $
--                                                                          --
--               Copyright (C) 1986 by University of Toronto.               --
--           Copyright (C) 1996-1999 Ada Core Technologies, Inc.            --
--                                                                          --
-- GNAT is free software;  you can  redistribute it  and/or modify it under --
-- terms of the  GNU General Public License as published  by the Free Soft- --
-- ware  Foundation;  either version 2,  or (at your option) any later ver- --
-- sion.  GNAT is distributed in the hope that it will be useful, but WITH- --
-- OUT ANY WARRANTY;  without even the  implied warranty of MERCHANTABILITY --
-- or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License --
-- for  more details.  You should have  received  a copy of the GNU General --
-- Public License  distributed with GNAT;  see file COPYING.  If not, write --
-- to  the Free Software Foundation,  59 Temple Place - Suite 330,  Boston, --
-- MA 02111-1307, USA.                                                      --
--                                                                          --
-- As a special exception,  if other files  instantiate  generics from this --
-- unit, or you link  this unit with other files  to produce an executable, --
-- this  unit  does not  by itself cause  the resulting  executable  to  be --
-- covered  by the  GNU  General  Public  License.  This exception does not --
-- however invalidate  any other reasons why  the executable file  might be --
-- covered by the  GNU Public License.                                      --
--                                                                          --
-- GNAT is maintained by Ada Core Technologies Inc (http://www.gnat.com).   --
--                                                                          --
------------------------------------------------------------------------------

--  This is an altered Ada 95 version of the original V8 style regular
--  expression library written in C by Henry Spencer. Apart from the
--  translation to Ada, the interface has been considerably changed to
--  use the Ada String type instead of C-style nul-terminated strings.
--  The internal format of compiled regular expressions has not been
--  modified though and is binary compatible with the C version.

--  Beware that some of this code is subtly aware of the way operator
--  precedence is structured in regular expressions. Serious changes in
--  regular-expression syntax might require a total rethink.

package body GNAT.Regpat is

   --  Structure for pattern matching "program"

   MAGIC : constant Character := Character'Val (8#0234#);

   --  The first byte of the regexp internal "program" is actually
   --  this magic number; the start node begins in the second byte.

   --  This is essentially a linear encoding of a nondeterministic
   --  finite-state machine, also known as syntax charts or
   --  "railroad normal form" in parsing technology.

   --  Each node is an opcode plus a "next" pointer, possibly plus an
   --  operand. "Next" pointers of all nodes except BRANCH implement
   --  concatenation; a "next" pointer with a BRANCH on both ends of it
   --  is connecting two alternatives.

   --  (Here is one of the subtle syntax dependencies:
   --  an individual BRANCH (as opposed to a collection of them) is
   --  never concatenated with anything because of operator precedence.)

   --  The operand of some types of node is a literal string; for others,
   --  it is a node leading into a sub-FSM. In particular, the operand of
   --  a BRANCH node is the first node of the branch.
   --  (NB this is *not* a tree structure:  the tail of the branch connects
   --  to the thing following the set of BRANCHes.)

   --  The opcodes are:

   type Opcode  is
     (EOP, BOL, EOL, ANY, ANYOF, ANYBUT, BRANCH, BACK,
      EXACTLY, NOTHING, STAR, PLUS, WORDA, WORDZ,
      RESERVE_1, RESERVE_2, RESERVE_3, RESERVE_4, RESERVE_5, RESERVE_6,
      OPEN,      OPEN_1,    OPEN_2,    OPEN_3,    OPEN_4,    OPEN_5,
                 OPEN_6,    OPEN_7,    OPEN_8,    OPEN_9,
      CLOSE,     CLOSE_1,   CLOSE_2,   CLOSE_3,   CLOSE_4,   CLOSE_5,
                 CLOSE_6,   CLOSE_7,   CLOSE_8,   CLOSE_9);

   for Opcode'Size use 8;

   for Opcode use
   --  Name         Nr    Operand?  Meaning
     (EOP       =>  0,   -- no     End of program
      BOL       =>  1,   -- no     Match "" at beginning of line
      EOL       =>  2,   -- no     Match "" at end of line
      ANY       =>  3,   -- no     Match   any one character
      ANYOF     =>  4,   -- str    Match any character in this string
      ANYBUT    =>  5,   -- str    Match any character not in this string
      BRANCH    =>  6,   -- node   Match this alternative, or the next
      BACK      =>  7,   -- no     Match "", "next" ptr points backward
      EXACTLY   =>  8,   -- str    Match this string
      NOTHING   =>  9,   -- no     Match empty string
      STAR      => 10,   -- node   Match this (simple) thing 0 or more times
      PLUS      => 11,   -- node   Match this (simple) thing 1 or more times
      WORDA     => 12,   -- no     Match "" at wordchar, where prev is nonword
      WORDZ     => 13,   -- no     Match "" at nonwordchar, where prev is word
      RESERVE_1 => 14,   -- no     Reserved entities for future opcodes
      RESERVE_2 => 15,   --        These MUST be present, for converting an
      RESERVE_3 => 16,   --        Opcode to Character, see Get_Next
      RESERVE_4 => 17,   --        This is also desirable for speed reasons
      RESERVE_5 => 18,
      RESERVE_6 => 19,
      OPEN      => 20,   -- no     Mark this point in input as start of #n
      OPEN_1    => 21,
      OPEN_2    => 22,
      OPEN_3    => 23,
      OPEN_4    => 24,
      OPEN_5    => 25,
      OPEN_6    => 26,
      OPEN_7    => 27,
      OPEN_8    => 28,
      OPEN_9    => 29,
      CLOSE     => 30,   -- no      Analogous to OPEN
      CLOSE_1   => 31,
      CLOSE_2   => 32,
      CLOSE_3   => 33,
      CLOSE_4   => 34,
      CLOSE_5   => 35,
      CLOSE_6   => 36,
      CLOSE_7   => 37,
      CLOSE_8   => 38,
      CLOSE_9   => 39);

   --  Opcode notes:

   --  BRANCH
   --    The set of branches constituting a single choice are hooked
   --    together with their "next" pointers, since precedence prevents
   --    anything being concatenated to any individual branch. The
   --    "next" pointer of the last BRANCH in a choice points to the
   --    thing following the whole choice. This is also where the
   --    final "next" pointer of each individual branch points; each
   --    branch starts with the operand node of a BRANCH node.

   --  BACK
   --    Normal "next" pointers all implicitly point forward;
   --    BACK exists to make loop structures possible.

   --  STAR,PLUS
   --    '?', and complex '*' and '+', are implemented as circular
   --    BRANCH structures using BACK. Simple cases (one character
   --    per match) are implemented with STAR and PLUS for speed
   --    and to minimize recursive plunges.

   --  OPEN,CLOSE
   --    ...are numbered at compile time.


   --  A node is one char of opcode followed by two chars of "next" pointer.
   --  "Next" pointers are stored as two 8-bit pieces, high order first. The
   --  value is a positive offset from the opcode of the node containing it.
   --  An operand, if any, simply follows the node. (Note that much of the
   --  code generation knows about this implicit relationship.)

   --  Using two bytes for the "next" pointer is vast overkill for most
   --  things, but allows patterns to get big without disasters.

   -----------------------
   -- Local Subprograms --
   -----------------------

   function "+" (Left : Opcode;    Right : Integer) return Opcode;
   function "-" (Left : Opcode;    Right : Opcode) return Integer;
   function "=" (Left : Character; Right : Opcode) return Boolean;

   function Is_Mult (C : Character) return Boolean;
   --  Return True iff C is a regexp multiplier: '+', '*' or '?'

   function Is_Word_Char (C : Character) return Boolean;
   --  Return True iff C is an Ascii letter or an underscore ('_')

   function Operand (P : Pointer) return Pointer;
   --  Return a pointer to the operand of the node at P

   function Get_Next_Offset
     (Program : Program_Data;
      IP      : Pointer) return Pointer;
   --  Get the offset field of a node. Used by Get_Next.

   function Get_Next
     (Program : Program_Data;
      IP      : Pointer) return Pointer;
   --  Dig the next instruction pointer out of a node

   procedure Optimize
     (Self              : in out Pattern_Matcher);
   --  Optimize a Pattern_Matcher by noting certain special cases

   --  All of the following subprograms are tiny and should be inlined

   pragma Inline ("+");
   pragma Inline ("-");
   pragma Inline ("=");
   pragma Inline (Is_Mult);
   pragma Inline (Is_Word_Char);
   pragma Inline (Get_Next);
   pragma Inline (Get_Next_Offset);
   pragma Inline (Operand);

   type Expression_Flags is record
      Has_Width,            -- Known never to match null string
      Simple,               -- Simple enough to be STAR/PLUS operand
      SP_Start  : Boolean;  -- Starts with * or +
   end record;

   ---------
   -- "=" --
   ---------

   function "=" (Left : Character; Right : Opcode) return Boolean is
   begin
      return Character'Pos (Left) = Opcode'Pos (Right);
   end "=";

   ---------
   -- "+" --
   ---------

   function "+" (Left : Opcode; Right : Integer) return Opcode is
   begin
      return Opcode'Val (Opcode'Pos (Left) + Right);
   end "+";

   ---------
   -- "-" --
   ---------

   function "-" (Left : Opcode; Right : Opcode) return Integer is
   begin
      return Opcode'Pos (Left) - Opcode'Pos (Right);
   end "-";

   -------------
   -- Is_Mult --
   -------------

   function Is_Mult (C : Character) return Boolean is
   begin
      return (C = '*' or C = '+' or C = '?');
   end Is_Mult;

   ------------------
   -- Is_Word_Char --
   ------------------

   function Is_Word_Char (C : Character) return Boolean is
   begin
      return (C in 'a' .. 'z' or C in 'A' .. 'Z' or C = '_');
   end Is_Word_Char;

   -------------
   -- Operand --
   -------------

   function Operand (P : Pointer) return Pointer is
   begin
      return P + 3;
   end Operand;

   ---------------------
   -- Get_Next_Offset --
   ---------------------

   function Get_Next_Offset
     (Program : Program_Data;
      IP      : Pointer)
      return    Pointer
   is
   begin
      return Pointer (Character'Pos (Program (IP + 1))
              + 256 * Character'Pos (Program (IP + 2)));
   end Get_Next_Offset;

   --------------
   -- Get_Next --
   --------------

   function Get_Next (Program : Program_Data; IP : Pointer) return Pointer is
      Offset : Pointer;
      Op     : constant Opcode := Opcode'Val (Character'Pos (Program (IP)));

   begin
      pragma Assert (IP /= 0);

      Offset := Get_Next_Offset (Program, IP);

      if Offset = 0 then
         return 0;

      elsif Op = BACK then
         return IP - Offset;

      else
         return IP + Offset;
      end if;
   end Get_Next;

   Worst_Expression : constant Expression_Flags := (others => False);
   --  Worst case

   -------------
   -- Compile --
   -------------

   procedure Compile
     (Matcher         : out Pattern_Matcher;
      Expression      : String;
      Final_Code_Size : out Program_Size)
   is
      --  We can't allocate space until we know how big the compiled form
      --  will be, but we can't compile it (and thus know how big it is)
      --  until we've got a place to put the code. So we cheat: we compile
      --  it twice, once with code generation turned off and size counting
      --  turned on, and once "for real".

      --  This also means that we don't allocate space until we are sure
      --  that the thing really will compile successfully, and we never
      --  have to move the code and thus invalidate pointers into it.

      --  Beware that the optimization-preparation code in here knows
      --  about some of the structure of the compiled regexp.

      PM        : Pattern_Matcher renames Matcher;
      Program   : Program_Data renames PM.Program;

      Emit_Code : constant Boolean := PM.Size > 0;
      Emit_Ptr  : Pointer := Program_First;

      Code_Size : Natural := 0;
      Par_Count : Natural range 0 .. Max_Nesting := 1; -- () count
      Parse_Pos : Natural := Expression'First; -- Input-scan pointer
      Parse_End : Natural := Expression'Last;

      ----------------------------
      -- Subprograms for Create --
      ----------------------------

      procedure Emit (B : Character);
      --  If code-generation is enabled, output the Character to the Program.
      --  Otherwise just increases Code_Size.

      function  Emit_Node (Op : Opcode) return Pointer;
      --  If code-generation is enabled, Emit_Node outputs the
      --  opcode and reserves space for a pointer to the next node.
      --  Return value is the location of new opcode, ie old Emit_Ptr.

      procedure Parse
        (Parenthesized : Boolean;
         Flags         : in out Expression_Flags;
         IP            : out Pointer);
      --  Parse regular expression, i.e. main body or parenthesized thing
      --  Caller must absorb opening parenthesis.

      procedure Parse_Branch
        (Flags         : in out Expression_Flags;
         IP            : out Pointer);
      --  Implements the concatenation operator and handles '|'

      procedure Parse_Piece
        (Flags : in out Expression_Flags; IP : out Pointer);
      --  Parse something followed by possible [*+?]

      procedure Parse_Atom
        (Flags : in out Expression_Flags; IP : out Pointer);
      --  Parse_Atom is the lowest level parse procedure.
      --  Optimization:  gobbles an entire sequence of ordinary characters
      --  so that it can turn them into a single node, which is smaller to
      --  store and faster to run. Backslashed characters are exceptions,
      --  each becoming a separate node; the code is simpler that way and
      --  it's not worth fixing.

      procedure Insert_Operator (Op : Opcode; Operand : Pointer);
      --  Insert_Operator inserts an operator in front of an
      --  already-emitted operand and relocates the operand.

      procedure Link_Tail (P, Val : Pointer);
      --  Link_Tail sets the next-pointer at the end of a node chain

      procedure Link_Operand_Tail (P, Val : Pointer);
      --  Link_Tail on operand of first argument; nop if operandless

      function  Next_Instruction (P : Pointer) return Pointer;
      --  Dig the "next" pointer out of a node

      procedure Fail (M : in String);
      --  Fail with a diagnostic message, if possible

      ----------
      -- Fail --
      ----------

      procedure Fail (M : in String) is
      begin
         --  M should be passed as exception message, but
         --  this is not possible in a Pure package.

         raise Expression_Error;
      end Fail;

      -----------
      -- Parse --
      -----------

      --  Combining parenthesis handling with the base level
      --  of regular expression is a trifle forced, but the
      --  need to tie the tails of the branches to what follows
      --  makes it hard to avoid.

      procedure Parse
        (Parenthesized  : in Boolean;
         Flags          : in out Expression_Flags;
         IP             : out Pointer)
      is
         E              : String renames Expression;
         Br             : Pointer;
         Ender          : Pointer;
         Par_No         : Natural;
         New_Flags      : Expression_Flags;

      begin
         Flags := (Has_Width => True, others => False);  -- Tentatively

         --  Make an OPEN node, if parenthesized

         if Parenthesized then
            if Par_Count >= Max_Nesting then
               Fail ("too many ()");
            end if;

            Par_No := Par_Count;
            Par_Count := Par_Count + 1;
            IP := Emit_Node (OPEN + Par_No);
         else
            IP := 0;
         end if;

         --  Pick up the branches, linking them together

         Parse_Branch (New_Flags, Br);

         if Br = 0 then
            IP := 0;
            return;
         end if;

         if IP /= 0 then
            Link_Tail (IP, Br);   -- OPEN -> first
         else
            IP := Br;
         end if;

         if not New_Flags.Has_Width then
            Flags.Has_Width := False;
         end if;

         Flags.SP_Start := Flags.SP_Start or New_Flags.SP_Start;

         while Parse_Pos <= Parse_End
           and then (E (Parse_Pos) = '|' or else E (Parse_Pos) = Ascii.LF)
         loop
            Parse_Pos := Parse_Pos + 1;
            Parse_Branch (New_Flags, Br);

            if Br = 0 then
               IP := 0;
               return;
            end if;

            Link_Tail (IP, Br);   -- BRANCH -> BRANCH

            if not New_Flags.Has_Width then
               Flags.Has_Width := False;
            end if;

            Flags.SP_Start := Flags.SP_Start or New_Flags.SP_Start;
         end loop;

         --  Make a closing node, and hook it on the end

         if Parenthesized then
            Ender := Emit_Node (CLOSE + Par_No);
         else
            Ender := Emit_Node (EOP);
         end if;

         Link_Tail (IP, Ender);

         --  Hook the tails of the branches to the closing node

         Br := IP;
         loop
            exit when Br = 0;
            Link_Operand_Tail (Br, Ender);
            Br := Next_Instruction (Br);
         end loop;

         --  Check for proper termination

         if Parenthesized then
            if Parse_Pos > Parse_End or else E (Parse_Pos) /= ')' then
               Fail ("unmatched ()");
            end if;

            Parse_Pos := Parse_Pos + 1;

         elsif Parse_Pos <= Parse_End then
            if E (Parse_Pos) = ')'  then
               Fail ("unmatched ()");
            else
               Fail ("junk on end");         -- "Can't happen"
            end if;
         end if;
      end Parse;

      ------------------
      -- Parse_Branch --
      ------------------

      procedure Parse_Branch
        (Flags : in out Expression_Flags;
         IP    : out Pointer)
      is
         E         : String renames Expression;
         Chain     : Pointer;
         Last      : Pointer;
         New_Flags : Expression_Flags;
         Dummy     : Pointer;

      begin
         Flags := Worst_Expression;    -- Tentatively

         IP := Emit_Node (BRANCH);
         Chain := 0;

         while Parse_Pos <= Parse_End
           and then E (Parse_Pos) /= ')'
           and then E (Parse_Pos) /= Ascii.LF
           and then E (Parse_Pos) /= '|'
         loop
            Parse_Piece (New_Flags, Last);

            if Last = 0 then
               IP := 0;
               return;
            end if;

            Flags.Has_Width := Flags.Has_Width or New_Flags.Has_Width;

            if Chain = 0 then            -- First piece
               Flags.SP_Start := Flags.SP_Start or New_Flags.SP_Start;
            else
               Link_Tail (Chain, Last);
            end if;

            Chain := Last;
         end loop;

         if Chain = 0 then            -- Loop ran zero times
            Dummy := Emit_Node (NOTHING);
         end if;

      end Parse_Branch;

      -----------------
      -- Parse_Piece --
      -----------------

      --  Note that the branching code sequences used for '?' and the
      --  general cases of '*' and + are somewhat optimized: they use
      --  the same NOTHING node as both the endmarker for their branch
      --  list and the body of the last branch. It might seem that
      --  this node could be dispensed with entirely, but the endmarker
      --  role is not redundant.

      procedure Parse_Piece
        (Flags : in out Expression_Flags;
         IP    : out Pointer)
      is
         Op        : Character;
         Next      : Pointer;
         New_Flags : Expression_Flags;

      begin
         Parse_Atom (New_Flags, IP);

         if IP = 0 then
            return;
         end if;

         if Parse_Pos > Parse_End
           or else not Is_Mult (Expression (Parse_Pos))
         then
            Flags := New_Flags;
            return;
         end if;

         Op := Expression (Parse_Pos);

         if not New_Flags.Has_Width and Op /= '?' then
            Fail ("*+ Operand could be empty");
         end if;

         if Op /= '+' then
            Flags := (SP_Start => True, others => False);
         else
            Flags := (Has_Width => True, others => False);
         end if;

         if Op = '*' and New_Flags.Simple then
            Insert_Operator (STAR, IP);

         elsif Op = '*' then

            --  Emit x* as (x&|), where & means "self"

            Insert_Operator (BRANCH, IP);                -- Either x
            Link_Operand_Tail (IP, Emit_Node (BACK));    -- and loop
            Link_Operand_Tail (IP, IP);                  -- back
            Link_Tail (IP, Emit_Node (BRANCH));          -- or
            Link_Tail (IP, Emit_Node (NOTHING));         -- null

         elsif Op = '+' and New_Flags.Simple then
            Insert_Operator (PLUS, IP);

         elsif Op = '+' then

            --  Emit x+ as x(&|), where & means "self"

            Next := Emit_Node (BRANCH);                  -- Either
            Link_Tail (IP, Next);                        --
            Link_Tail (Emit_Node (BACK), IP);            -- loop back
            Link_Tail (Next, Emit_Node (BRANCH));        -- or
            Link_Tail (IP, Emit_Node (NOTHING));         -- null

         elsif Op = '?' then

            --  Emit x? as (x|)

            Insert_Operator (BRANCH, IP);                -- Either x
            Link_Tail (IP, Emit_Node (BRANCH));          -- or
            Next := Emit_Node (NOTHING);                 -- null
            Link_Tail (IP, Next);
            Link_Operand_Tail (IP, Next);
         end if;

         Parse_Pos := Parse_Pos + 1;

         if Parse_Pos <= Parse_End
           and then Is_Mult (Expression (Parse_Pos))
         then
            Fail ("nested *?+");
         end if;
      end Parse_Piece;

      ----------------
      -- Parse_Atom --
      ----------------

      procedure Parse_Atom
        (Flags : in out Expression_Flags;
         IP    : out Pointer)
      is
         E : String renames Expression;
         C : Character;

         procedure Parse_Literal;
         --  Parse_Literal encodes a string of characters
         --  to be matched exactly.
         --
         --  This is a bit tricky due to quoted chars and due to
         --  the multiplier characters '*', '+', and '?' that
         --  take the SINGLE char previous as their operand.
         --
         --  On entry, the character at Parse_Pos - 1 is going to go
         --  into the string, no matter what it is. It could be
         --  following a \ if Parse_Atom was entered from the '\' case.
         --
         --  Basic idea is to pick up a good char in C and examine
         --  the next char. If Is_Mult (C) then twiddle, if it's a \
         --  then frozzle and if it's another magic char then push C and
         --  terminate the string. If none of the above, push C on the
         --  string and go around again.
         --
         --  Start_Pos is used to remember where "the current character"
         --  starts in the string, if due to an Is_Mult we need to back
         --  up and put the current char in a separate 1-character string.
         --  When Start_Pos is 0, C is the only char in the string;
         --  this is used in Is_Mult handling, and in setting the SIMPLE
         --  flag at the end.

         -------------------
         -- Parse_Literal --
         -------------------

         procedure Parse_Literal is
            Start_Pos : Natural;
            C         : Character;

         begin
            Parse_Pos := Parse_Pos - 1;      -- Look at current character
            IP := Emit_Node (EXACTLY);
            Start_Pos := 0; -- Is this right ???

            Parse_Loop :
            loop
               <<continue>>

               C := E (Parse_Pos);           -- Get current character
               Parse_Pos := Parse_Pos + 1;

               if Parse_Pos > Parse_End then
                  Emit (C);                  -- dump current character
                  exit Parse_Loop;           -- and we are done
               end if;

               case E (Parse_Pos) is         -- look at next one
                  when '.' | '[' | '(' | ')' | '|' | Ascii.LF |

                     --  ??? Chars '$' and '^' should not always be magic

                        '$' | '^' =>

                     Emit (C);               -- dump cur char
                     exit Parse_Loop;        -- and we are done

                  when '?' | '+' | '*' =>
                     if Start_Pos = 0 then   -- If just C in str,
                        Emit (C);            -- dump cur char
                        exit Parse_Loop;     -- and we are done
                     end if;

                     --  End mult-char string one early

                     Parse_Pos := Start_Pos; -- Back up parse
                     exit Parse_Loop;

                  when '\' =>
                     Emit (C);               -- Cur char OK

                     --  Look after \

                     if Parse_Pos + 1 > Parse_End
                       or else E (Parse_Pos + 1) = '<'
                       or else E (Parse_Pos + 1) = '>'
                     then
                        --  ??? Someday handle \1, \2, ...

                        exit Parse_Loop;     -- Not quoted

                     else
                        --  Backup point is \, scan point is after it

                        Start_Pos := Parse_Pos;
                        Parse_Pos := Parse_Pos + 1;
                        goto continue;   -- NOT exit
                     end if;

                  when others =>

                     --  Add current character to string

                     Emit (C);
               end case;

               --  Set backup point

               Start_Pos := Parse_Pos;
            end loop Parse_Loop;

            Emit (Ascii.NUL);
            Flags.Has_Width := True;

            if Start_Pos = 0 then            -- One character?
               Flags.Simple := True;
            end if;
         end Parse_Literal;

      --  Start of processing for Parse_Atom

      begin
         --  Tentatively set worst expression case

         Flags := Worst_Expression;

         C := E (Parse_Pos);
         Parse_Pos := Parse_Pos + 1;

         case (C) is
            when '^' => IP := Emit_Node (BOL); --  These only have meaning at
            when '$' => IP := Emit_Node (EOL); --  beginning/end of pattern ???

            when '.' => IP := Emit_Node (ANY);
                        Flags.Has_Width := True;
                        Flags.Simple := True;

            when '[' =>
               declare
                  Class     : Character;
                  Class_End : Character;

               begin
                  if Parse_Pos <= Parse_End
                    and then E (Parse_Pos) = '^'
                  then
                     --  Complement of range

                     IP := Emit_Node (ANYBUT);
                     Parse_Pos := Parse_Pos + 1;

                  else
                     IP := Emit_Node (ANYOF);
                  end if;

                  if Parse_Pos <= Parse_End and then
                     (E (Parse_Pos) = ']' or E (Parse_Pos) = '-')
                  then
                     C := E (Parse_Pos);
                     Parse_Pos := Parse_Pos + 1;
                     Emit (C);
                  end if;

                  while Parse_Pos <= Parse_End
                    and then E (Parse_Pos) /= ']'
                  loop
                     if E (Parse_Pos) = '-' then
                        Parse_Pos := Parse_Pos + 1;

                        if Parse_Pos > Parse_End
                          or else E (Parse_Pos) = ']'
                        then
                           Emit ('-');

                        else
                           Class := Character'Succ
                             (E (Parse_Pos - 2));
                           Class_End := E (Parse_Pos);

                           if Class > Character'Succ (Class_End) then
                              Fail ("invalid [] range");
                           end if;

                           for C in Class .. Class_End loop
                              Emit (C);
                           end loop;

                           Parse_Pos := Parse_Pos + 1;
                        end if;

                     else
                        C := E (Parse_Pos);
                        Parse_Pos := Parse_Pos + 1;
                        Emit (C);
                     end if;
                  end loop;

                  Emit (Ascii.NUL);

                  if not (Parse_Pos <= Parse_End
                    and then E (Parse_Pos) = ']')
                  then
                     Fail ("unmatched []");
                  end if;

                  Parse_Pos := Parse_Pos + 1;
                  Flags.Has_Width := True;
                  Flags.Simple := True;
               end;

            when '(' =>
               declare
                  New_Flags : Expression_Flags;
               begin
                  Parse (True, New_Flags, IP);

                  if IP = 0 then
                     return;
                  end if;

                  Flags.Has_Width :=
                    Flags.Has_Width or New_Flags.Has_Width;
                  Flags.SP_Start :=
                    Flags.SP_Start or New_Flags.SP_Start;
               end;

            when '|' | Ascii.LF | ')' =>
               Fail ("internal urp");  --  Supposed to be caught earlier

            when '?' | '+' | '*' =>
               Fail ("?+* follows nothing");

            when '\' =>
               if Parse_Pos > Parse_End then
                  Fail ("trailing \");
               end if;

               C := E (Parse_Pos);
               Parse_Pos := Parse_Pos + 1;
               case C is
                  when '<' => IP := Emit_Node (WORDA);
                  when '>' => IP := Emit_Node (WORDZ);
                  when others =>

                     --  ??? Someday handle \1, \2, ...

                     --  Handle general quoted chars in exact-match routine

                     Parse_Literal;
               end case;

            when others => Parse_Literal;
         end case;
      end Parse_Atom;

      ---------------
      -- Emit_Node --
      ---------------

      function Emit_Node (Op : Opcode) return Pointer is
         Result : Pointer := Emit_Ptr;

      begin
         if Emit_Code then
            Program (Emit_Ptr) := Character'Val (Opcode'Pos (Op));
            Program (Emit_Ptr + 1) := Ascii.NUL;
            Program (Emit_Ptr + 2) := Ascii.NUL;
            Emit_Ptr := Emit_Ptr + 3;

         else
            Code_Size := Code_Size + 3;
         end if;

         return Result;
      end Emit_Node;

      ----------
      -- Emit --
      ----------

      procedure Emit (B : Character) is
      begin
         if Emit_Code then
            Program (Emit_Ptr) := B;
            Emit_Ptr := Emit_Ptr + 1;
         else
            Code_Size := Code_Size + 1;
         end if;
      end Emit;

      ---------------------
      -- Insert_Operator --
      ---------------------

      procedure Insert_Operator (Op : Opcode; Operand : Pointer) is
         Source : Pointer;
         Dest   : Pointer;
         Old    : Pointer;

      begin
         if Emit_Code then
            --  ??? Should rewrite using aggregate?

            Source := Emit_Ptr;
            Dest := Emit_Ptr + 3;
            while Source > Operand loop
               Dest := Dest - 1;
               Source := Source - 1;
               Program (Dest) := Program (Source);
            end loop;
         end if;

         --  Op node, where operand used to be

         Old := Emit_Node (Op);
      end Insert_Operator;

      ---------------
      -- Link_Tail --
      ---------------

      procedure Link_Tail (P, Val : Pointer) is
         Scan   : Pointer;
         Temp   : Pointer;
         Offset : Pointer;

      begin
         if not Emit_Code then
            return;
         end if;

         --  Find last node

         Scan := P;
         loop
            Temp := Next_Instruction (Scan);
            exit when Temp = 0;
            Scan := Temp;
         end loop;

         if Program (Scan) = BACK then
            Offset := Scan - Val;
         else
            Offset := Val - Scan;
         end if;

         Program (Scan + 1) := Character'Val (Offset / 256);
         Program (Scan + 2) := Character'Val (Offset mod 256);
      end Link_Tail;

      -----------------------
      -- Link_Operand_Tail --
      -----------------------

      procedure Link_Operand_Tail (P, Val : Pointer) is
      begin
         --  "Operandless" and "op /= BRANCH" are synonymous in practice

         if Emit_Code and Program (P) = BRANCH then
            Link_Tail (Operand (P), Val);
         end if;
      end Link_Operand_Tail;

      ----------------------
      -- Next_Instruction --
      ----------------------

      function Next_Instruction (P : Pointer) return Pointer is
         Offset : Pointer;

      begin
         if not Emit_Code then
            return 0;
         end if;

         Offset := Get_Next_Offset (Program, P);

         if Offset = 0 then
            return 0;
         end if;

         if Program (P) = BACK then
            return P - Offset;
         else
            return P + Offset;
         end if;
      end Next_Instruction;

      Flags   : Expression_Flags;
      Result  : Pointer;

   --  Start of processing for Compile

   begin
      Emit (MAGIC);
      Parse (False, Flags, Result);

      if Result = 0 then
         Fail ("Couldn't compile expression");
      end if;

      Final_Code_Size := Program_Size (Code_Size);

      if Emit_Code then
         Optimize (PM);

      elsif Final_Code_Size >= PM.Size then
         raise Storage_Error; --  Expression too big
      end if;

   end Compile;

   --  Function version of Compile

   function Compile
     (Expression : String)
      return       Pattern_Matcher
   is
      Size  : Program_Size;
      Dummy : Pattern_Matcher (0);
   begin
      Compile (Dummy, Expression, Size);

      declare
         Result : Pattern_Matcher (Size);
      begin
         Compile (Result, Expression, Size);
         return Result;
      end;
   end Compile;

   --------------
   -- Optimize --
   --------------

   procedure Optimize (Self : in out Pattern_Matcher) is
      Max_Length  : Natural;
      This_Length : Natural;
      Longest     : Pointer;
      Scan        : Pointer;
      Program     : Program_Data renames Self.Program;

      function String_Length (Start : Pointer) return Natural;
      --  Returns the length of a string starting at Start in
      --  the Program. Raises Constraint_Error if the string
      --  is not terminated.

      --  ??? Should allow embedded nulls, but don't want to be
      --  incompatible with standard regexp machines now.

      -------------------
      -- String_Length --
      -------------------

      function String_Length (Start : Pointer) return Natural is
      begin
         for J in Start .. Program'Last loop
            if Program (J) = Ascii.NUL then
               return Natural (J - Start);
            end if;
         end loop;

         raise Constraint_Error;
      end String_Length;

   --  Start of processing for Optimize

   begin
      --  Start with safe defaults (no optimization):
      --    *  No known first character of match
      --    *  Does not necessarily start at beginning of line
      --    *  No string known that has to appear in data

      Self.First := Ascii.NUL;
      Self.Anchored := False;
      Self.Must_Have := Program'Last + 1;
      Self.Must_Have_Length := 0;

      Scan := Program_First + 1;  --  First BRANCH
      if Program (Get_Next (Program, Scan)) = EOP then

         --  Only one top-level choice

         Scan := Operand (Scan);

         --  Determing starting-point info

         if Program (Scan) = EXACTLY then
            Self.First := Program (Operand (Scan));

         elsif Program (Scan) = BOL then
            Self.Anchored := True;
         end if;

         --  If there's something expensive in the regexp, find the
         --  longest literal string that must appear and make it the
         --  regmust. Resolve ties in favor of later strings, since
         --  the regstart check works with the beginning of the regexp.
         --  and avoiding duplication strengthens checking. Not a
         --  strong reason, but sufficient in the absence of others.

         if False then -- if Flags.SP_Start then ???
            Longest := 0;
            Max_Length := 0;
            while Scan /= 0 loop
               if Program (Scan) = EXACTLY  then
                  This_Length := String_Length (Operand (Scan));

                  if This_Length >= Max_Length then
                     Longest := Operand (Scan);
                     Max_Length := This_Length;
                  end if;
               end if;

               Scan := Get_Next (Program, Scan);
            end loop;

            Self.Must_Have        := Longest;
            Self.Must_Have_Length := Max_Length;
         end if;
      end if;
   end Optimize;

   -----------
   -- Match --
   -----------

   procedure Match
     (Self    : Pattern_Matcher;
      Data    : String;
      Matches : out Match_Array)
   is
      Program   : Program_Data renames Self.Program; -- Shorter notation

      --  Global work variables

      Input_Pos : Natural;          -- String-input pointer
      BOL_Pos   : Natural;          -- Beginning of input, for ^ check
      Matched   : Boolean := False;  -- Until proven True

      -----------------------
      -- Local Subprograms --
      -----------------------

      function Next_Instruction (P : Pointer) return Pointer;
      --  Next_Instruction - dig the "next" pointer out of a node,
      --  similar to the one in Create.
      --  Repeated for execution speed (less params to pass, one test missing)

      function Index (Start : Positive; C : Character) return Natural;
      --  Finds character C in Data starting at Start and returns position

      function Repeat (IP : Pointer) return Natural;
      --  Repeatedly match something simple, report how many

      function Try (Pos : in Positive) return Boolean;
      --  Try to match at specific point

      function Match (IP : Pointer) return Boolean;
      --  This is the main matching routine. Conceptually the strategy
      --  is simple:  check to see whether the current node matches,
      --  call self recursively to see whether the rest matches,
      --  and then act accordingly.
      --
      --  In practice Match makes some effort to avoid recursion, in
      --  particular by going through "ordinary" nodes (that don't
      --  need to know whether the rest of the match failed) by
      --  using a loop instead of recursion.

      function Match_Character (IP : Pointer; Key : Character) return Boolean;

      pragma Inline (Next_Instruction);
      pragma Inline (Index);
      pragma Inline (Repeat);
      pragma Inline (Match_Character);

      ---------------------
      -- Match_Character --
      ---------------------

      function Match_Character
        (IP   : Pointer;
         Key  : Character)
         return Boolean
      is
         Matches : Boolean := False; --  Until proven True
         C       : Character;

      begin
         for J in IP .. Program'Last loop
            C := Program (IP);
            Matches := (C = Key);
            exit when Matches or (C = Ascii.Nul);
         end loop;

         return Matches;
      end Match_Character;

      ----------------------
      -- Next_Instruction --
      ----------------------

      function Next_Instruction (P : Pointer) return Pointer is
         Offset : constant Pointer := Get_Next_Offset (Program, P);

      begin
         if Offset = 0 then
            return 0;

         elsif Program (P) = BACK then
            return P - Offset;

         else
            return P + Offset;
         end if;
      end Next_Instruction;

      -----------
      -- Index --
      -----------

      function Index
        (Start : Positive;
         C     : Character)
         return  Natural
      is
      begin
         for J in Start .. Data'Last loop
            if Data (J) = C then
               return J;
            end if;
         end loop;

         return 0;
      end Index;

      ------------
      -- Repeat --
      ------------

      function Repeat (IP : Pointer) return Natural is
         Scan  : Natural := Input_Pos;
         Last  : constant Natural := Data'Last;
         Op    : constant Opcode := Opcode'Val (Character'Pos (Program (IP)));
         Opnd  : constant Pointer := Operand (IP);
         Count : Natural;

      begin
         case Op is
            when ANY =>
               Scan := Last + 1;

            when EXACTLY =>
               while Scan <= Last
                 and then Program (Opnd) = Data (Scan)
               loop
                  Scan := Scan + 1;
               end loop;

            when ANYOF =>
               while Scan <= Last
                 and then Match_Character (Opnd, Data (Scan))
               loop
                  Scan := Scan + 1;
               end loop;

            when ANYBUT =>
               while Scan <= Last
                 and then not Match_Character (Opnd, Data (Scan))
               loop
                  Scan := Scan + 1;
               end loop;

            when others =>

               --  Repeat was called inappropriately, internal error

               raise Program_Error;
         end case;

         Count := Scan - Input_Pos;
         Input_Pos := Scan;
         return Count;
      end Repeat;

      -----------
      -- Match --
      -----------

      function Match
        (IP   : Pointer)
         return Boolean
      is
         Scan   : Pointer := IP;
         Next   : Pointer;
         Offset : Pointer;
         Save   : Natural;
         Op     : Opcode;

      begin
         State_Machine :
         loop
            pragma Assert (Scan /= 0);

            --  Determine current opcode and count its usage in debug mode

            Op := Opcode'Val (Character'Pos ((Program (Scan))));

            --  Calculate offset of next instruction.
            --  Second character is most significant in Program_Data.

            Offset := Pointer (Character'Pos (Program (Scan + 1))
                        + 256 * Character'Pos (Program (Scan + 2)));

            --  Finally calculate next instruction pointer

            if Op = BACK then
               Next := Scan - Offset;
            else
               Next := Scan + Offset;
            end if;

            case Op is
               when BRANCH =>
                  if Program (Next) /= BRANCH then
                     Next := Operand (Scan); -- No choide, avoid recursion

                  else
                     loop
                        Save := Input_Pos;

                        if Match (Operand (Scan)) then
                           return True;
                        end if;

                        Input_Pos := Save;
                        Scan := Next_Instruction (Scan);
                        exit when Scan = 0 or Program (Scan) /= BRANCH;
                     end loop;

                     exit State_Machine;
                  end if;

               when NOTHING | BACK =>
                  null;

               when BOL =>
                  exit State_Machine when Input_Pos /= BOL_Pos;

               when EOL =>
                  exit State_Machine when Input_Pos <= Data'Last;

               when WORDA =>
                  exit State_Machine when
                     not Is_Word_Char (Data (Input_Pos))

                     --  Must be looking at a letter, digit, or _

                        or else

                     (Input_Pos > BOL_Pos
                       and then Is_Word_Char (Data (Input_Pos - 1)));

                     --  Prev must be BOL or nonword

               when WORDZ =>
                  --  Must be looking at non letter, digit, or '_'.
                  --  We don't care what the previous char was.

                  exit State_Machine when Is_Word_Char (Data (Input_Pos));

               when ANY =>
                  exit State_Machine when Input_Pos > Data'Last;
                  Input_Pos := Input_Pos + 1;

               when EXACTLY =>
                  declare
                     Opnd    : Pointer := Operand (Scan);
                     Current : Positive := Input_Pos;

                  begin
                     while Current <= Data'Last
                       and then Program (Opnd) /= Ascii.Nul
                     loop
                        exit State_Machine
                          when Program (Opnd) /= Data (Current);
                        Current := Current + 1;
                        Opnd := Opnd + 1;
                     end loop;

                     Input_Pos := Current;
                  end;

               when ANYOF =>
                  exit State_Machine when
                     Input_Pos > Data'Last
                       or else
                         not Match_Character
                               (Operand (Scan), Data (Input_Pos));

                  Input_Pos := Input_Pos + 1;

               when ANYBUT =>
                  exit State_Machine when
                     Input_Pos > Data'Last
                           or else
                     Match_Character (Operand (Scan), Data (Input_Pos));

                  Input_Pos := Input_Pos + 1;

               when OPEN_1 .. OPEN_9 =>
                  Save :=  Input_Pos;

                  --  Recursively match expression between ()

                  exit State_Machine when not Match (Next);

                  --  Subexpression matched, so store results

                  declare
                     No : constant Natural := Op - OPEN;

                  begin
                     --  Don't set start of match if some later
                     --  invocation of the same parentheses
                     --  already has.

                     if No <= Matches'Last
                          and then Matches (No).First = 0
                     then
                        Matches (No).First := Save;
                     end if;
                  end;

                  return True;

               when CLOSE_1 .. CLOSE_9 =>
                  Save := Input_Pos;
                  exit State_Machine when not Match (Next);
                  declare
                     No : constant Natural := Op - CLOSE;

                  begin
                     --  Don't set Match (No).Last if some later
                     --  invocation of the same parentheses
                     --  already has.

                     if No <= Matches'Last
                          and then Matches (No).Last = 0
                     then
                        Matches (No).Last := Save - 1;
                     end if;
                  end;

                  return True;

               when STAR | PLUS =>
                  declare
                     Next_Char : Character := Ascii.Nul;
                     No        : Integer;
                     Min       : Natural;
                  begin
                     --  Lookahead to avoid useless match attempts
                     --  when we know what character comes next.

                     if Program (Next) = EXACTLY then
                        Next_Char := Program (Operand (Next));
                     end if;

                     if Op = STAR then
                        Min := 0;
                     else
                        Min := 1;
                     end if;

                     Save := Input_Pos;
                     No := Repeat (Operand (Scan));

                     while No >= Min loop
                        --  If it could work, try it

                        if (Next_Char = Ascii.NUL
                             or Data (Input_Pos) = Next_Char)
                          and then Match (Next)
                        then
                           return True;
                        end if;

                        --  Could or did not work, back up

                        No := No - 1;
                        Input_Pos := Save + No;
                     end loop;
                     exit State_Machine;
                  end;

               when EOP =>
                  return True;                   -- Success!

               when others =>
                  raise Program_Error;           -- Invalid instruction
            end case;

            Scan := Next;
         end loop State_Machine;

         --  If we get here, there is no match.
         --  For successful matches when EOP" is the terminating point.

         return False;
      end Match;

      ---------
      -- Try --
      ---------

      function Try (Pos : in Positive) return Boolean is
      begin
         Input_Pos := Pos;
         Matches := (others => No_Match);

         if Match (Program_First + 1) then
            Matches (0) := (Pos, Input_Pos - 1);
            return True;
         end if;

         return False;
      end Try;

   --  Start of processing for Match

   begin
      --  Check validity of program

      pragma Assert
        (Program (Program_First) /= MAGIC,
         "Corrupted Pattern_Matcher");

      --  If there is a "must appear" string, look for it

      if Self.Must_Have_Length > 0 then
         declare
            First      : constant Character := Program (Self.Must_Have);
            Must_First : constant Pointer := Self.Must_Have;
            Must_Last  : constant Pointer :=
                           Must_First + Pointer (Self.Must_Have_Length - 1);
            Next_Try   : Natural := Index (Data'First, First);

         begin
            while Next_Try /= 0
              and then Data (Next_Try .. Next_Try + Self.Must_Have_Length - 1)
                = String (Program (Must_First .. Must_Last))
            loop
               Next_Try := Index (Next_Try + 1, First);
            end loop;

            if Next_Try = 0 then
               return;                  -- Not present
            end if;
         end;
      end if;

      --  Mark beginning of line for ^

      BOL_Pos := Data'First;

      --  Simplest case first: an anchored match need be tried only once

      if Self.Anchored then
         Matched := Try (Data'First);

      elsif Self.First /= Ascii.NUL then

         --  We know what char it must start with

         declare
            Next_Try : Natural := Index (Data'First, Self.First);

         begin
            while Next_Try /= 0 loop
               Matched := Try (Next_Try);
               exit when Matched;
               Next_Try := Index (Next_Try + 1, Self.First);
            end loop;
         end;

      else
         --  Messy cases: try all locations

         for S in Data'Range loop
            Matched := Try (S);
            exit when Matched;
         end loop;
      end if;

      --  Matched has its value

      return;
   end Match;

   -----------
   -- Match --
   -----------

   function  Match
     (Self : Pattern_Matcher;
      Data : String)
      return Natural
   is
      Matches : Match_Array;

   begin
      Match (Self, Data, Matches);

      if Matches (0).First < Data'First then
         return Data'First - 1;
      end if;

      return Matches (0).First;
   end Match;

   procedure Match
     (Expression : String;
      Data       : String;
      Matches    : out Match_Array;
      Size       : Program_Size := Default_Program_Size)
   is
      PM         : Pattern_Matcher (Size);
      Final_Size : Program_Size; -- unused

   begin
      Compile (PM, Expression, Final_Size);
      Match (PM, Data, Matches);
   end Match;

   function  Match
     (Expression : String;
      Data       : String;
      Size       : Program_Size := Default_Program_Size)
      return       Natural
   is
      PM         : Pattern_Matcher (Size);
      Final_Size : Program_Size; -- unused

   begin
      Compile (PM, Expression, Final_Size);
      return Match (PM, Data);
   end Match;

   function  Match
     (Expression : String;
      Data       : String;
      Size       : Program_Size := Default_Program_Size)
      return       Boolean
   is
      Matches    : Match_Array;
      PM         : Pattern_Matcher (Size);
      Final_Size : Program_Size; -- unused

   begin
      Compile (PM, Expression, Final_Size);
      Match (PM, Data, Matches);
      return Matches (0).First >= Data'First;
   end Match;

end GNAT.Regpat;