#!/usr/bin/perl -w use strict; use vars qw($VERSION); require 5.005; $VERSION = sprintf "%d.%d", q$Revision: 1.24 $ =~ /(\d+)/g; # jhi@iki.fi use Getopt::Long; my $Rules = 0; my $Stdout = 1; my $Out = 1; my $Format = "%o --> %t"; my $Help = 0; my $Version = 0; sub usage { die <<__EO__; $0: Usage: $0 [--nostdout] [--nofile] [--format=s] [--rules] [--help] [--version] [ lexicon_file rules_file ] The options can be abbreviated and one dash used instead of two. If no lexicon and rules files are specified, they will be asked interactively. $0: Sound change applier (C) 2003 Jarkko Hietaniemi $0: Based on Mark Rosenfelder's C version, $0: see http://www.zompist.com/sounds.htm __EO__ exit(1); } usage() unless GetOptions( 'Rules' => \$Rules, 'Stdout!' => \$Stdout, 'Out!' => \$Out, 'Format=s' => \$Format, 'Help' => \$Help, 'Version' => \$Version, ); usage() if $Help; if ($Version) { print $VERSION, "\n"; exit(1); } my ($LX, $SC); sub do_ask { my ($type) = @_; print "Enter the name of the $type file: "; my $name = <STDIN>; if (defined $name) { chomp $name } unless (defined $name && length $name) { die "$0: \u$type file undefined\n"; } return $name; } if (@ARGV == 2) { ($LX, $SC) = @ARGV; } elsif (@ARGV == 0) { $LX = do_ask("lexicon"); $SC = do_ask("rules"); } else { usage(); } sub do_ext { my ($self, $type, $ext) = @_; unless (-f $_[0]) { if (-f "$_[0]$ext") { $_[0] = "$_[0]$ext"; } else { die "$0: No $type file '$_[0]'\n"; } } } do_ext($LX, "lexicon", ".lex"); do_ext($SC, "rules", ".sc"); my %Cat; my @Rule; sub hasCat { my $Cat = join('', keys %Cat); $_[0] =~ /[$Cat]/; } sub expandCat { if (hasCat($_[0])) { $_[0] =~ s/(.)/exists $Cat{$1} ? $Cat{$1} : $1/eg; } } sub addRule { my ($from, $to, $env) = @_; my (@From, @To); my $i = 1; # print "from=$from to=$to env=$env\n"; while ($env =~ /(?:\((.)\)|(_)|(\#)|(.))/g) { my ($opt, $foc, $anc, $els) = ($1, $2, $3, $4); # print "opt=$opt foc=$foc anc=$anc els=$els\n"; my ($f, $t); if (defined $opt) { # Optional. $els = $opt; $opt = 1; } if (defined $foc) { # The focus. $f = $from; my $fHasCat = hasCat($f); if ($fHasCat) { expandCat($f); # Delay [] wrapping. } $t = $to; if (hasCat($t)) { expandCat($t); if ($fHasCat) { # Both sides have variables. $t = qq{substr("$t", index("$f", \$$i), 1)}; } } else { $t = qq{"$t"}; } $f = "[$f]" if $fHasCat; # Do the [] wrapping now. $i++; } elsif (defined $anc) { # An anchor. if (pos($env)) { push @From, "\$"; } else { push @From, "\^"; } next; # Next token, please. } elsif (defined $els) { # Something else. my $s = $els; if (hasCat($s)) { expandCat($s); $f = "[$s]"; } else { $f = $els; } $f .= '?' if $opt; $t = "\$$i"; $i++; } else { die "$0: Error: opt=$opt foc=$foc anc=$anc els=$els\n"; } push @From, "($f)"; push @To, $t; } my $From = join('', @From); my $To = join('.', @To); eval qq{sub { my \$o = \$_; return "\$env\t$from -> $to\t\$o -> \$_" if s/$From/$To/ge }}; die "From = $From, To = $To: $@" if $@; push @Rule, eval qq{sub { my \$o = \$_; return "\$env\t$from -> $to\t\$o -> \$_" if s/$From/$To/ge && \$_ ne \$o }}; } sub parseRule { return if /^\s*(\*.*)?$/; if (m{^(.)=(.+)}) { my ($var, $val) = ($1, $2); $Cat{$var} = quotemeta $val; } elsif (m{^([^/]+)/([^/]*)/(.+)$}) { my ($from, $to, $env) = ($1, $2, $3); unless ($env =~ /^[^_]*_[^_]*$/) { die "$SC:$.: must have exactly one _ in environment\n"; } elsif ($env =~ /\#/) { unless ($env =~ m{^\#([^\#]*)\#?$} || $env =~ m{^([^\#]*)\#$}) { die "$SC:$.: \# must be at the beginning or at the end\n"; } } addRule(quotemeta $from, quotemeta $to, $env); } else { die "$SC:$.: unknown syntax\n"; } } if (open(SC, $SC)) { while (<SC>) { parseRule() } close(SC); } else { die "$0: Failed to open rules file '$SC' for reading: $!\n"; } my $out; my $OUT = $SC; $OUT =~ s/\.sc$/.out/; if ($Out) { unless (open($out, ">$OUT")) { die "$0: Failed to open output file '$OUT' for writing: $!\n"; } } sub do_print { print @_ if $Stdout; print $out @_ if $Out; } if (open(LX, $LX)) { while (<LX>) { return if /^\s*(\*.*)?$/; s/^\s+//; s/\s+$//; my $o = $_; my @v; for my $r (@Rule) { while (my $v = &{ $r }) { push @v, $v if defined $v } } my $out = $Format; $out =~ s/%o/$o/g; $out =~ s/%t/$_/g; do_print $out, "\n"; if ($Rules) { for (@v) { do_print "\t$_\n"; } } } close(LX); close $out if defined $out; } else { die "$0: Failed to open lexicon file '$LX' for reading: $!\n"; } exit(0); __END__ =head1 NAME sounds - apply sound changes =head1 SYNOPIS sounds.pl [--nostdout] [--noout] [--format=s] [--rules] [--help] [--version] [lexicon_file rules_file] =head1 DESCRIPTION The F<sounds.pl> converts the B<lexicon> using the B<rules> and displays the result. =head2 The Lexicon File The lexicon file contains words, one per line. The lexicon file may contain empty lines, and also comment lines that have (optional leading whitespace followed by) a "*". Anything on that line following the "*" is ignored. If no lexicon file by the specified name does not exist, also the name with F<.lex> appended will be tried. =head2 The Rules Language The F<rules_file> contains rules of the forms category=characters from/to/environment With the first form, called category definitions, you can define classes or groups of characters, for example V=aeiou You can use these definitions in the rules of the second form, called transformation rules. For example: z/s/_ to change 'z' into 's' everywhere. The '_' stands for the I<from> part. Another example is m/n/_# which changes word-final 'm' into 'n'. The '#' stands for either the end of the word or the beginning of the word, depending on the context. Further example using category definitions: F=ie c/i/F_t meaning that c changes to 'i' after the vowels 'i' and 'e' and before a 't'. Both parts can have category definitions, too: S=ptc Z=bdg S/Z/V_V meaning that the letters 'ptc' change to their voiced equivalents 'bdg' between vowels. If the lengths of I<from> and I<to> are not equal, weird results will ensue. Optional elements are indicated by enclosing them in parenthesis: s(s) means one or two esses. Similarly to the lexicon file, the rules file may contain empty lines, and also comment lines that have (optional leading whitespace followed by) a "*", and anything on the line following the "*" is ignored. If no rules file by the specified name does not exist, also the name with F<.sc> appended will be tried. =head2 Options --help Display the basic help text. --nostdout Do not display the results in screen. --noout Do not produce the output file. The default is to create F<foo.out> from F<foo.sc>. --format=s The default format is C<< %o --> %t >> which means the original followed by an "arrow" and the transformed result. --rules Display the individual rules being applied. --version Show version of the F<sounds.pl> script. All the options can be shortened to their unique prefixes, and also the leading C<--> be shortened to a single C<->. =head1 ACKNOWLEDGEMENTS Heavy intellectual debt to Mark Rosenfelder, see http://www.zompist.com/sounds.htm The user interface is not exactly the same. =head1 PREREQUISITES Getopt::Long strict vars =head1 SCRIPT CATEGORIES Linguistics =head1 README Apply sound changes. Hopefully amusing for linguists, hobby or serious. =head1 SEE ALSO http://www.zompist.com/sounds.htm =head1 COPYRIGHT (C) 2003 by Jarkko Hietaniemi <jhi@iki.fi> All rights reserved. You may distribute this code under the terms of either the GNU General Public License or the Artistic License, as specified in the Perl README file. =cut