File : g-regpat.ads


------------------------------------------------------------------------------
--                                                                          --
--                         GNAT LIBRARY COMPONENTS                          --
--                                                                          --
--                          G N A T . R E G P A T                           --
--                                                                          --
--                                 S p e c                                  --
--                                                                          --
--                            $Revision: 1.5 $
--                                                                          --
--               Copyright (C) 1986 by University of Toronto.               --
--           Copyright (C) 1996-1999 Ada Core Technologies, Inc.            --
--                                                                          --
-- GNAT is free software;  you can  redistribute it  and/or modify it under --
-- terms of the  GNU General Public License as published  by the Free Soft- --
-- ware  Foundation;  either version 2,  or (at your option) any later ver- --
-- sion.  GNAT is distributed in the hope that it will be useful, but WITH- --
-- OUT ANY WARRANTY;  without even the  implied warranty of MERCHANTABILITY --
-- or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License --
-- for  more details.  You should have  received  a copy of the GNU General --
-- Public License  distributed with GNAT;  see file COPYING.  If not, write --
-- to  the Free Software Foundation,  59 Temple Place - Suite 330,  Boston, --
-- MA 02111-1307, USA.                                                      --
--                                                                          --
-- As a special exception,  if other files  instantiate  generics from this --
-- unit, or you link  this unit with other files  to produce an executable, --
-- this  unit  does not  by itself cause  the resulting  executable  to  be --
-- covered  by the  GNU  General  Public  License.  This exception does not --
-- however invalidate  any other reasons why  the executable file  might be --
-- covered by the  GNU Public License.                                      --
--                                                                          --
-- GNAT is maintained by Ada Core Technologies Inc (http://www.gnat.com).   --
--                                                                          --
------------------------------------------------------------------------------

--  This is an altered Ada 95 version of the original V7 style regular
--  expression library written in C by Henry Spencer. Apart from the
--  translation to Ada, the interface has been considerably changed to
--  use the Ada String type instead of C-style nul-terminated strings.
--  The internal format of compiled regular expressions has not been
--  modified though and is binary compatible with the C version.

------------------------------------------------------------
-- Summary of Pattern Matching Packages in GNAT Hierarchy --
------------------------------------------------------------

--  There are three related packages that perform pattern maching functions.
--  the following is an outline of these packages, to help you determine
--  which is best for your needs.

--     GNAT.Regexp (files g-regexp.ads/g-regexp.adb)
--       This is a simple package providing Unix-style regular expression
--       matching with the restriction that it matches entire strings. It
--       is particularly useful for file name matching, and in particular
--       it provides "globbing patterns" that are useful in implementing
--       unix or DOS style wild card matching for file names.

--     GNAT.Regpat (files g-regpat.ads/g-regpat.adb)
--       This is a more complete implementation of Unix-style regular
--       expressions, copied from the original V7 style regular expression
--       library written in C by Henry Spencer. It is functionally the
--       same as this library, and uses the same internal data structures
--       stored in a binary compatible manner.

--     GNAT.Spitbol.Patterns (files g-spipat.ads/g-spipat.adb)
--       This is a completely general patterm matching package based on the
--       pattern language of SNOBOL4, as implemented in SPITBOL. The pattern
--       language is modeled on context free grammars, with context sensitive
--       extensions that provide full (type 0) computational capabilities.

package GNAT.Regpat is
pragma Pure (Regpat);

   --   The syntax of the regular expressions is:
   --
   --      expression ::=  branch { '|' branch }
   --
   --      branch ::= mult_branch | atom
   --
   --      mult_branch ::= atom mult
   --
   --      atom ::= '^' | '$' |
   --         at_begin
   --       | at_end
   --       | any
   --       | any_of
   --       | any_but
   --       | nested_expression
   --       | literal
   --
   --      mult ::=
   --         at_least_once
   --       | any_number_of
   --       | at_most_once
   --
   --      at_begin ::= '^'
   --
   --      at_end ::= '$'
   --
   --      any ::= '.'
   --
   --      any_of  ::= '[' set ']'
   --
   --      any_but ::= '[' '^' set ']'
   --
   --      nested_expresion := '(' expression ')'
   --
   --      literal ::= { char }
   --
   --      at_least_once ::= '+'
   --
   --      any_number_of ::= '*'
   --
   --      at_most_once ::= '?'
   --
   --      char ::=
   --         escaped_char
   --       | normal_char
   --
   --      set ::= { char | range }
   --
   --      escaped_char ::= '\' [ magic_char | normal_char]
   --
   --      range ::= char '-' char
   --
   --      magic_char ::= '^' | '$' | '.' | '(' | ')' | '[' | ']' | '\'
   --       | '+' | '*' + '?'
   --
   --      normal_char ::=  all characters that are not in magic_char

   --  For semantics of regular expressions, consult V7 Unix documentation

   Expression_Error : exception;
   --  This exception is raised when trying to compile an invalid
   --  regular expression. All subprograms taking an expression
   --  as parameter may raise Expression_Error.

   Max_Nesting      : constant := 10;
   Max_Program_Size : constant := 2**15 - 1;

   type Program_Size is range 0 .. Max_Program_Size;
   for Program_Size'Size use 16;

   Default_Program_Size : constant := 512;
   --  This size is used for implicitly generated matchers.

   -----------------
   -- Match_Array --
   -----------------

   subtype Match_Count is Natural range 0 .. Max_Nesting;

   type Match_Location is record
      First   : Natural := 0;
      Last    : Natural := 0;
   end record;

   type Match_Array is array (Match_Count) of Match_Location;

   No_Match : constant Match_Location := (First => 0, Last => 0);
   --  The No_Match constant is (0, 0) to differentiate between
   --  matching a null string at position 1, which uses (1, 0)
   --  and no match at all.

   ------------------------------
   -- Pattern_Matcher Creation --
   ------------------------------

   type Pattern_Matcher (Size : Program_Size) is private;

   function Compile
     (Expression : String)
      return       Pattern_Matcher;
   --  Compile a regular expression into internal code.
   --  Raises Expression_Error if Expression is not a legal regular expression.

   procedure Compile
     (Matcher         : out Pattern_Matcher;
      Expression      : String;
      Final_Code_Size : out Program_Size);
   --  Compile a regular expression into into internal code
   --  This procedure is significantly faster than the function
   --  Compile, as there is a known maximum size for the matcher.
   --  This function raises Storage_Error if Matcher is too small
   --  to hold the resulting code, or Expression_Error is Expression
   --  is not a legal regular expression.

   --------------
   -- Matching --
   --------------

   procedure Match
     (Expression : String;
      Data       : String;
      Matches    : out Match_Array;
      Size       : Program_Size := Default_Program_Size);
   --  Match Expression against Data and store result in Matches
   --  Function raises Storage_Error if Size is too small for Expression,
   --  or Expression_Error if Expression is not a legal regular expression

   function  Match
     (Expression : String;
      Data       : String;
      Size       : Program_Size := Default_Program_Size)
      return       Natural;
   --  Return the position where Data matches, or 0 if there is no match
   --  Function raises Storage_Error if Size is too small for Expression
   --  or Expression_Error if Expression is not a legal regular expression

   function Match
     (Expression : String;
      Data       : String;
      Size       : Program_Size := Default_Program_Size)
      return       Boolean;
   --  Return True iff Data matches Expression. Match raises Storage_Error
   --  if Size is too small for Expression, or Expression_Error if Expression
   --  is not a legal regular expression

   function  Match
     (Self : Pattern_Matcher;
      Data : String)
      return Natural;
   --  Return the position where Data matches, or 0 if there is no match.
   --  Raises Expression_Error if Expression is not a legal regular expression.

   procedure Match
     (Self    : Pattern_Matcher;
      Data    : String;
      Matches : out Match_Array);
   --  Match Data using the given pattern matcher and store result in Matches.
   --  Raises Expression_Error if Expression is not a legal regular expression.

private

   pragma Inline (Match);

   subtype Pointer is Program_Size range 0 .. Max_Program_Size;

   Pointer_Size : constant := Pointer'Size;

   --  The Pointer type is used to point into Program_Data

   --  Note that the pointer type is not necessarily 2 bytes
   --  although it is stored in the program using 2 bytes

   type Program_Data is array (Pointer range <>) of Character;

   Program_First : constant := 1;

   --  The "internal use only" fields in regexp are present to pass
   --  info from compile to execute that permits the execute phase
   --  to run lots faster on simple cases.  They are:

   --     First              character that must begin a match or Ascii.Nul
   --     Anchored           true iff match must start at beginning of line
   --     Must_Have          pointer to string that match must include or null
   --     Must_Have_Length   length of Must_Have string

   --  First and Anchored permit very fast decisions on suitable
   --  starting points for a match, cutting down the work a lot.
   --  Must_Have permits fast rejection of lines that cannot possibly
   --  match.

   --  The Must_Have tests are costly enough that Optimize
   --  supplies a Must_Have only if the r.e. contains something potentially
   --  expensive (at present, the only such thing detected is * or +
   --  at the start of the r.e., which can involve a lot of backup).
   --  The length is supplied because the test in Execute needs it
   --  and Optimize is computing it anyway.

   --  The initialization is meant to fail-safe in case the user of this
   --  package tries to use an uninitialized matcher. This takes advantage
   --  of the knowledge that Ascii.Nul translates to the end-of-program (EOP)
   --  instruction code of the state machine.

   type Pattern_Matcher (Size : Pointer) is record
      First            : Character   := Ascii.Nul;     --  internal use only
      Anchored         : Boolean     := False;         --  internal use only
      Must_Have        : Pointer     := 0;             --  internal use only
      Must_Have_Length : Natural     := 0;             --  internal use only
      Program          : Program_Data (0 .. Size) := (others => Ascii.Nul);
   end record;

end GNAT.Regpat;