File : g-regpat.ads
------------------------------------------------------------------------------
-- --
-- GNAT LIBRARY COMPONENTS --
-- --
-- G N A T . R E G P A T --
-- --
-- S p e c --
-- --
-- $Revision: 1.5 $
-- --
-- Copyright (C) 1986 by University of Toronto. --
-- Copyright (C) 1996-1999 Ada Core Technologies, Inc. --
-- --
-- GNAT is free software; you can redistribute it and/or modify it under --
-- terms of the GNU General Public License as published by the Free Soft- --
-- ware Foundation; either version 2, or (at your option) any later ver- --
-- sion. GNAT is distributed in the hope that it will be useful, but WITH- --
-- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY --
-- or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License --
-- for more details. You should have received a copy of the GNU General --
-- Public License distributed with GNAT; see file COPYING. If not, write --
-- to the Free Software Foundation, 59 Temple Place - Suite 330, Boston, --
-- MA 02111-1307, USA. --
-- --
-- As a special exception, if other files instantiate generics from this --
-- unit, or you link this unit with other files to produce an executable, --
-- this unit does not by itself cause the resulting executable to be --
-- covered by the GNU General Public License. This exception does not --
-- however invalidate any other reasons why the executable file might be --
-- covered by the GNU Public License. --
-- --
-- GNAT is maintained by Ada Core Technologies Inc (http://www.gnat.com). --
-- --
------------------------------------------------------------------------------
-- This is an altered Ada 95 version of the original V7 style regular
-- expression library written in C by Henry Spencer. Apart from the
-- translation to Ada, the interface has been considerably changed to
-- use the Ada String type instead of C-style nul-terminated strings.
-- The internal format of compiled regular expressions has not been
-- modified though and is binary compatible with the C version.
------------------------------------------------------------
-- Summary of Pattern Matching Packages in GNAT Hierarchy --
------------------------------------------------------------
-- There are three related packages that perform pattern maching functions.
-- the following is an outline of these packages, to help you determine
-- which is best for your needs.
-- GNAT.Regexp (files g-regexp.ads/g-regexp.adb)
-- This is a simple package providing Unix-style regular expression
-- matching with the restriction that it matches entire strings. It
-- is particularly useful for file name matching, and in particular
-- it provides "globbing patterns" that are useful in implementing
-- unix or DOS style wild card matching for file names.
-- GNAT.Regpat (files g-regpat.ads/g-regpat.adb)
-- This is a more complete implementation of Unix-style regular
-- expressions, copied from the original V7 style regular expression
-- library written in C by Henry Spencer. It is functionally the
-- same as this library, and uses the same internal data structures
-- stored in a binary compatible manner.
-- GNAT.Spitbol.Patterns (files g-spipat.ads/g-spipat.adb)
-- This is a completely general patterm matching package based on the
-- pattern language of SNOBOL4, as implemented in SPITBOL. The pattern
-- language is modeled on context free grammars, with context sensitive
-- extensions that provide full (type 0) computational capabilities.
package GNAT.Regpat is
pragma Pure (Regpat);
-- The syntax of the regular expressions is:
--
-- expression ::= branch { '|' branch }
--
-- branch ::= mult_branch | atom
--
-- mult_branch ::= atom mult
--
-- atom ::= '^' | '$' |
-- at_begin
-- | at_end
-- | any
-- | any_of
-- | any_but
-- | nested_expression
-- | literal
--
-- mult ::=
-- at_least_once
-- | any_number_of
-- | at_most_once
--
-- at_begin ::= '^'
--
-- at_end ::= '$'
--
-- any ::= '.'
--
-- any_of ::= '[' set ']'
--
-- any_but ::= '[' '^' set ']'
--
-- nested_expresion := '(' expression ')'
--
-- literal ::= { char }
--
-- at_least_once ::= '+'
--
-- any_number_of ::= '*'
--
-- at_most_once ::= '?'
--
-- char ::=
-- escaped_char
-- | normal_char
--
-- set ::= { char | range }
--
-- escaped_char ::= '\' [ magic_char | normal_char]
--
-- range ::= char '-' char
--
-- magic_char ::= '^' | '$' | '.' | '(' | ')' | '[' | ']' | '\'
-- | '+' | '*' + '?'
--
-- normal_char ::= all characters that are not in magic_char
-- For semantics of regular expressions, consult V7 Unix documentation
Expression_Error : exception;
-- This exception is raised when trying to compile an invalid
-- regular expression. All subprograms taking an expression
-- as parameter may raise Expression_Error.
Max_Nesting : constant := 10;
Max_Program_Size : constant := 2**15 - 1;
type Program_Size is range 0 .. Max_Program_Size;
for Program_Size'Size use 16;
Default_Program_Size : constant := 512;
-- This size is used for implicitly generated matchers.
-----------------
-- Match_Array --
-----------------
subtype Match_Count is Natural range 0 .. Max_Nesting;
type Match_Location is record
First : Natural := 0;
Last : Natural := 0;
end record;
type Match_Array is array (Match_Count) of Match_Location;
No_Match : constant Match_Location := (First => 0, Last => 0);
-- The No_Match constant is (0, 0) to differentiate between
-- matching a null string at position 1, which uses (1, 0)
-- and no match at all.
------------------------------
-- Pattern_Matcher Creation --
------------------------------
type Pattern_Matcher (Size : Program_Size) is private;
function Compile
(Expression : String)
return Pattern_Matcher;
-- Compile a regular expression into internal code.
-- Raises Expression_Error if Expression is not a legal regular expression.
procedure Compile
(Matcher : out Pattern_Matcher;
Expression : String;
Final_Code_Size : out Program_Size);
-- Compile a regular expression into into internal code
-- This procedure is significantly faster than the function
-- Compile, as there is a known maximum size for the matcher.
-- This function raises Storage_Error if Matcher is too small
-- to hold the resulting code, or Expression_Error is Expression
-- is not a legal regular expression.
--------------
-- Matching --
--------------
procedure Match
(Expression : String;
Data : String;
Matches : out Match_Array;
Size : Program_Size := Default_Program_Size);
-- Match Expression against Data and store result in Matches
-- Function raises Storage_Error if Size is too small for Expression,
-- or Expression_Error if Expression is not a legal regular expression
function Match
(Expression : String;
Data : String;
Size : Program_Size := Default_Program_Size)
return Natural;
-- Return the position where Data matches, or 0 if there is no match
-- Function raises Storage_Error if Size is too small for Expression
-- or Expression_Error if Expression is not a legal regular expression
function Match
(Expression : String;
Data : String;
Size : Program_Size := Default_Program_Size)
return Boolean;
-- Return True iff Data matches Expression. Match raises Storage_Error
-- if Size is too small for Expression, or Expression_Error if Expression
-- is not a legal regular expression
function Match
(Self : Pattern_Matcher;
Data : String)
return Natural;
-- Return the position where Data matches, or 0 if there is no match.
-- Raises Expression_Error if Expression is not a legal regular expression.
procedure Match
(Self : Pattern_Matcher;
Data : String;
Matches : out Match_Array);
-- Match Data using the given pattern matcher and store result in Matches.
-- Raises Expression_Error if Expression is not a legal regular expression.
private
pragma Inline (Match);
subtype Pointer is Program_Size range 0 .. Max_Program_Size;
Pointer_Size : constant := Pointer'Size;
-- The Pointer type is used to point into Program_Data
-- Note that the pointer type is not necessarily 2 bytes
-- although it is stored in the program using 2 bytes
type Program_Data is array (Pointer range <>) of Character;
Program_First : constant := 1;
-- The "internal use only" fields in regexp are present to pass
-- info from compile to execute that permits the execute phase
-- to run lots faster on simple cases. They are:
-- First character that must begin a match or Ascii.Nul
-- Anchored true iff match must start at beginning of line
-- Must_Have pointer to string that match must include or null
-- Must_Have_Length length of Must_Have string
-- First and Anchored permit very fast decisions on suitable
-- starting points for a match, cutting down the work a lot.
-- Must_Have permits fast rejection of lines that cannot possibly
-- match.
-- The Must_Have tests are costly enough that Optimize
-- supplies a Must_Have only if the r.e. contains something potentially
-- expensive (at present, the only such thing detected is * or +
-- at the start of the r.e., which can involve a lot of backup).
-- The length is supplied because the test in Execute needs it
-- and Optimize is computing it anyway.
-- The initialization is meant to fail-safe in case the user of this
-- package tries to use an uninitialized matcher. This takes advantage
-- of the knowledge that Ascii.Nul translates to the end-of-program (EOP)
-- instruction code of the state machine.
type Pattern_Matcher (Size : Pointer) is record
First : Character := Ascii.Nul; -- internal use only
Anchored : Boolean := False; -- internal use only
Must_Have : Pointer := 0; -- internal use only
Must_Have_Length : Natural := 0; -- internal use only
Program : Program_Data (0 .. Size) := (others => Ascii.Nul);
end record;
end GNAT.Regpat;