#!/usr/bin/perl -w

use strict;

use vars qw($VERSION);

require 5.005;

$VERSION = sprintf "%d.%d", q$Revision: 1.24 $ =~ /(\d+)/g; # jhi@iki.fi

use Getopt::Long;

my $Rules    = 0;
my $Stdout   = 1;
my $Out      = 1;
my $Format   = "%o --> %t";
my $Help     = 0;
my $Version  = 0;

sub usage {
    die <<__EO__;
$0: Usage: $0 [--nostdout] [--nofile] [--format=s] [--rules]
    [--help] [--version] [ lexicon_file rules_file ]
The options can be abbreviated and one dash used instead of two.
If no lexicon and rules files are specified, they will be asked interactively.
$0: Sound change applier (C) 2003 Jarkko Hietaniemi
$0: Based on Mark Rosenfelder's C version,
$0: see http://www.zompist.com/sounds.htm
__EO__
    exit(1);
}

usage() unless GetOptions(
			  'Rules'     => \$Rules,
			  'Stdout!'   => \$Stdout,
			  'Out!'     => \$Out,
			  'Format=s'  => \$Format,
			  'Help'      => \$Help,
			  'Version'   => \$Version,
			 );

usage() if $Help;

if ($Version) {
    print $VERSION, "\n";
    exit(1);
}

my ($LX, $SC);

sub do_ask {
    my ($type) = @_;
    print "Enter the name of the $type file: ";
    my $name = <STDIN>;
    if (defined $name) { chomp $name }
    unless (defined $name && length $name) {
	die "$0: \u$type file undefined\n";
    }
    return $name;
}

if (@ARGV == 2) {
    ($LX, $SC) = @ARGV;
} elsif (@ARGV == 0) {
    $LX = do_ask("lexicon");
    $SC = do_ask("rules");
} else {
    usage();
}

sub do_ext {
    my ($self, $type, $ext) = @_;
    unless (-f $_[0]) {
	if (-f "$_[0]$ext") {
	    $_[0] = "$_[0]$ext";
	} else {
	    die "$0: No $type file '$_[0]'\n";
	}
    }
}

do_ext($LX, "lexicon", ".lex");
do_ext($SC, "rules",   ".sc");

my %Cat;
my @Rule;

sub hasCat {
    my $Cat = join('', keys %Cat);
    $_[0] =~ /[$Cat]/;
}

sub expandCat {
    if (hasCat($_[0])) {
	$_[0] =~ s/(.)/exists $Cat{$1} ? $Cat{$1} : $1/eg;
    }
}

sub addRule {
    my ($from, $to, $env) = @_;

    my (@From, @To);
    my $i = 1;
    # print "from=$from to=$to env=$env\n";
    while ($env =~ /(?:\((.)\)|(_)|(\#)|(.))/g) {
	my ($opt, $foc, $anc, $els) = ($1, $2, $3, $4);
	# print "opt=$opt foc=$foc anc=$anc els=$els\n";
	my ($f, $t);
	if (defined $opt) {	# Optional.
	    $els = $opt;
	    $opt = 1;
	}
	if (defined $foc) {	# The focus.
	    $f = $from;
	    my $fHasCat = hasCat($f);
	    if ($fHasCat) {
		expandCat($f);
		# Delay [] wrapping.
	    }
	    $t = $to;
	    if (hasCat($t)) {
		expandCat($t);
		if ($fHasCat) { # Both sides have variables.
		    $t = qq{substr("$t", index("$f", \$$i), 1)};
		}
	    } else {
		$t = qq{"$t"};
	    }
	    $f = "[$f]" if $fHasCat; # Do the [] wrapping now.
	    $i++;
	} elsif (defined $anc) {	# An anchor.
	    if (pos($env)) {
		push @From, "\$";
	    } else {
		push @From, "\^";
            }
	    next; # Next token, please.
	} elsif (defined $els) {	# Something else.
	    my $s = $els;
	    if (hasCat($s)) {
		expandCat($s);
		$f = "[$s]";
	    } else {
		$f = $els;
	    }
	    $f .= '?' if $opt;
	    $t = "\$$i";
	    $i++;
	} else {
	    die "$0: Error: opt=$opt foc=$foc anc=$anc els=$els\n";
	}
	push @From, "($f)";
	push @To,   $t;
    }
    my $From = join('',  @From);
    my $To   = join('.', @To);
    eval qq{sub { my \$o = \$_; return "\$env\t$from -> $to\t\$o -> \$_"  if s/$From/$To/ge }};
    die "From = $From, To = $To: $@" if $@;
    push @Rule, eval qq{sub { my \$o = \$_; return "\$env\t$from -> $to\t\$o -> \$_"  if s/$From/$To/ge && \$_ ne \$o }};
}

sub parseRule {
    return if /^\s*(\*.*)?$/;
    if (m{^(.)=(.+)}) {
	my ($var, $val) = ($1, $2);
	$Cat{$var} = quotemeta $val;
    } elsif (m{^([^/]+)/([^/]*)/(.+)$}) {
	my ($from, $to, $env) = ($1, $2, $3);
	unless ($env =~ /^[^_]*_[^_]*$/) {
	    die "$SC:$.: must have exactly one _ in environment\n";
	} elsif ($env =~ /\#/) {
	    unless ($env =~ m{^\#([^\#]*)\#?$} ||
		    $env =~ m{^([^\#]*)\#$}) {
		die "$SC:$.: \# must be at the beginning or at the end\n";
	    }
	}
	addRule(quotemeta $from, quotemeta $to, $env);
   } else {
       die "$SC:$.: unknown syntax\n";
   }
}

if (open(SC, $SC)) {
    while (<SC>) { parseRule() } close(SC);
} else {
    die "$0: Failed to open rules file '$SC' for reading: $!\n";
}

my $out;
my $OUT = $SC;

$OUT =~ s/\.sc$/.out/;

if ($Out) {
    unless (open($out, ">$OUT")) {
	die "$0: Failed to open output file '$OUT' for writing: $!\n";
    }
}

sub do_print {
    print @_      if $Stdout;
    print $out @_ if $Out;
}

if (open(LX, $LX)) {
    while (<LX>) {
	return if /^\s*(\*.*)?$/;
	s/^\s+//; s/\s+$//;
	my $o = $_;
	my @v;
	for my $r (@Rule) {
	    while (my $v = &{ $r }) {
		push @v, $v if defined $v
	    }
	}
	my $out = $Format;
	$out =~ s/%o/$o/g;
	$out =~ s/%t/$_/g;
	do_print $out, "\n";
	if ($Rules) {
	    for (@v) {
		do_print "\t$_\n";
	    }
	}
    }
    close(LX);
    close $out if defined $out;
} else {
    die "$0: Failed to open lexicon file '$LX' for reading: $!\n";
}

exit(0);

__END__
=head1 NAME

sounds - apply sound changes

=head1 SYNOPIS

  sounds.pl [--nostdout] [--noout] [--format=s] [--rules]
            [--help] [--version] [lexicon_file rules_file]

=head1 DESCRIPTION

The F<sounds.pl> converts the B<lexicon> using the B<rules> and displays
the result.

=head2 The Lexicon File

The lexicon file contains words, one per line.

The lexicon file may contain empty lines, and also comment lines that
have (optional leading whitespace followed by) a "*".  Anything on
that line following the "*" is ignored.

If no lexicon file by the specified name does not exist,
also the name with F<.lex> appended will be tried.

=head2 The Rules Language

The F<rules_file> contains rules of the forms

    category=characters
    from/to/environment

With the first form, called category definitions, you can define
classes or groups of characters, for example

    V=aeiou

You can use these definitions in the rules of the second form,
called transformation rules.  For example:

    z/s/_

to change 'z' into 's' everywhere.  The '_' stands for the I<from> part.
Another example is

    m/n/_#

which changes word-final 'm' into 'n'.  The '#' stands for either the
end of the word or the beginning of the word, depending on the context.
Further example using category definitions:

    F=ie
    c/i/F_t

meaning that c changes to 'i' after the vowels 'i' and 'e' and before
a 't'.  Both parts can have category definitions, too:

    S=ptc
    Z=bdg
    S/Z/V_V

meaning that the letters 'ptc' change to their voiced equivalents
'bdg' between vowels.  If the lengths of I<from> and I<to> are not
equal, weird results will ensue.

Optional elements are indicated by enclosing them in parenthesis:

    s(s)

means one or two esses.

Similarly to the lexicon file, the rules file may contain empty lines,
and also comment lines that have (optional leading whitespace followed by)
a "*", and anything on the line following the "*" is ignored.

If no rules file by the specified name does not exist,
also the name with F<.sc> appended will be tried.

=head2 Options

  --help

Display the basic help text.

  --nostdout

Do not display the results in screen.

  --noout

Do not produce the output file.  The default is to create
F<foo.out> from F<foo.sc>.

  --format=s

The default format is C<< %o --> %t >> which means the original followed
by an "arrow" and the transformed result.

  --rules

Display the individual rules being applied.

  --version

Show version of the F<sounds.pl> script.

All the options can be shortened to their unique prefixes, and also the
leading C<--> be shortened to a single C<->.

=head1 ACKNOWLEDGEMENTS

Heavy intellectual debt to Mark Rosenfelder,
see http://www.zompist.com/sounds.htm
The user interface is not exactly the same.

=head1 PREREQUISITES

Getopt::Long
strict
vars

=head1 SCRIPT CATEGORIES

Linguistics

=head1 README

Apply sound changes.  Hopefully amusing for linguists, hobby or serious.

=head1 SEE ALSO

http://www.zompist.com/sounds.htm

=head1 COPYRIGHT

(C) 2003 by Jarkko Hietaniemi <jhi@iki.fi>

All rights reserved. You may distribute this code under the terms
of either the GNU General Public License or the Artistic License,
as specified in the Perl README file.

=cut