buildict.pl

This is buildict.pl in view mode; [Download] [Up]
#! /depot/public/bin/perl
# ---Le laboratoire de robotique de l'Institut de recherche d'Hydro-Quebec-----
# 
# Nom     : buildict [options] [[<] inputwords] > outputwords
# Fonction: Combine and reduce dictionaries through affixes.
# Fichiers: buildict.pl
# Notes   : This replaces the munchlist shell script provided with ispell 3.0.9
# 
# Cree    :  5 juillet 93 ------- Martin Boyer <mboyer@ireq-robot.hydro.qc.ca>
# Modifie :  5 juillet 93 -----1- Martin Boyer <mboyer@ireq-robot.hydro.qc.ca>
#           Copyright (c) 1993 Martin Boyer and Hydro-Quebec
# 
# Historique: 
# -----------------------------------------------------------------------------

$copyright = <<EOF;
  Copyright (c) 1993 Martin Boyer and Hydro-Quebec.

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.


  Send comments, bug reports and fixes to:

  Martin Boyer                            mboyer@ireq-robot.hydro.qc.ca
  Institut de recherche d'Hydro-Quebec    mboyer@ireq-robot.uucp
  1800, montee Ste-Julie
  Varennes (Quebec) Canada   J3X 1S1
  +1 514 652-8412
EOF

require "getopts.pl";

##########################  Configuration  ###########################

# Where the ispell utilities are (buildhash, icombine, etc.)
$LIBDIR = '/local/lib/ispell';

# Where the 'ispell' executable is
$BINDIR = '/local/bin';

# Where you want temporary files.
# Usage peaks when sorting all roots and possible affixes.
$TMPDIR = '/tmp';

# Your sorting program.
# It must be able to:
#    sort ignoring case when given the '-f' flag,
#    sort on the first field when given the '+0 -1' flags
#    use the specified field separator when given the '-t' flag
#    send its output to the specified file when given the '-o' flag
# If you are using 8-bit characters, it must be able to handle them,
# at least sorting them according to their 'byte' value (the SunOS 4.1.1
# version of sort is OK).  If it can't ignore case on accented characters,
# (like SunOS 4.1.1) then icombine won't be able to eliminate unnessarily
# capitalized words, and the resultant directory won't be minimal 
# (but still valid).
# I tried using PERL to sort, but it multiplied the processing time
# by at least a factor of five.
# If 8 bit characters are used, they should be sorted the same way by sort
# and by perl.  This script should check for that.
# Finally, note that folded sorting is necessary only before icombine.
# It is also used to produce the final output, to make the result easier
# to read, but that is only cosmetics.
#
# The full path must be specified.
# You may want to add a flag to specify a different directory for
# temporary files to prevent the $TMPDIR partition from overflowing.
$SORT = '/usr/bin/sort -T /tmp';

# I use hard links to save space.  Soft links will probably fail
# because some files are deleted, and you don't want a soft link
# to a deleted file.
# An alternative is 'cp' but that will slow you down and use more space.
$COPY = '/usr/bin/ln';

# This must be able to rename a file across filesystems boundaries.
$RENAME = '/usr/bin/mv';

# Used only to print the current time at the beginning of the script.
$DATE = '/usr/bin/date';

# If you don't want to specify full paths for $SORT, $COPY, $RENAME,
# and $DATE, add system directories to this variable.
$ENV{'PATH'} = "$LIBDIR:$BINDIR";

# The default affix table.
$input_aff_table = "$LIBDIR/english.aff";

######################################################################

############################  Constants  #############################

$buildict_version = '1.0';

# You may want to put VALIDWORDS and SORTTMP in a different directory,
# to distribute temporary files in different partitions.
# The other files are pretty much created and deleted as the program goes.
#
# COMBINEDINPUT and VALIDWORDS must be on the same filesystem.
# COMBINEDINPUT and STRIPPEDINPUT may need to, depending on $COPY
# COMBINEDFLAGS and LEGALALL must be on the same filesystem.

$TMP		   = "$TMPDIR/munch$$.";
$FAKEDICT	   = $TMP . 'FAKEDICT';
$FAKEHASH	   = $TMP . 'FAKEHASH.hash';
$INPUTWORDS	   = $TMP . 'INPUTWORDS';
$EXPANDEDINPUT	   = $TMP . 'EXPANDEDINPUT';
$COMBINEDINPUT	   = $TMP . 'COMBINEDINPUT';
$STRIPPEDINPUT	   = $TMP . 'STRIPPEDINPUT';
$VALIDWORDS	   = $TMP . 'VALIDWORDS';
$DBMFILE	   = $TMP . 'DBM';
$SORTTMP	   = $TMP . 'SORTTMP';
$EXPANDEDPAIRS	   = $TMP . 'EXPANDEDPAIRS';
$AFFIXEDROOTS	   = $TMP . 'AFFIXEDROOTS';
$ALLFLAGLIST	   = $TMP . 'ALLFLAGLIST';
$LEGALFLAGLIST	   = $TMP . 'LEGALFLAGLIST';
$COMBINEDFLAGS	   = $TMP . 'COMBINEDFLAGS';
$ALLCROSS	   = $TMP . 'ALLCROSS';
$LEGALALL	   = $TMP . 'LEGALALL';
$ILLEGALCROSS	   = $TMP . 'ILLEGALCROSS';
$LEGALCROSS	   = $TMP . 'LEGALCROSS';
$CROSSPAIRS	   = $TMP . 'CROSSPAIRS';
$LEGALCROSSPAIRS   = $TMP . 'LEGALCROSSPAIRS';
$ILLEGALCROSSPAIRS = $TMP . 'ILLEGALCROSSPAIRS';
$LEGALEXPANDED	   = $TMP . 'LEGALEXPANDED';
$MINIMALAFFIXES	   = $TMP . 'MINIMALAFFIXES';
$NONAFFIXED	   = $TMP . 'NONAFFIXED';
$RESULT		   = $TMP . 'RESULT';

# These files are likely to be deleted by the cleanup subroutine.

@TMP_FILES = ($FAKEDICT, "$FAKEDICT.cnt", "$FAKEDICT.stat", $FAKEHASH,
	      $EXPANDEDINPUT, $COMBINEDINPUT, $STRIPPEDINPUT, $VALIDWORDS,
	      "$DBMFILE.dir", "$DBMFILE.pag", $SORTTMP,
	      $EXPANDEDPAIRS, $AFFIXEDROOTS, $ALLFLAGLIST, $LEGALFLAGLIST,
	      $COMBINEDFLAGS, $ALLCROSS, $LEGALALL,
	      $ILLEGALCROSS, $LEGALCROSS, $CROSSPAIRS,
	      $LEGALCROSSPAIRS, $ILLEGALCROSSPAIRS, $LEGALEXPANDED,
	      $MINIMALAFFIXES, "$MINIMALAFFIXES.cnt", "$MINIMALAFFIXES.stat", 
	      $NONAFFIXED, $RESULT);
######################################################################


######################################################################
#### Subroutines

# Prints its argument preceded by the time since the beginning of the script.
sub DBprint {
    if ($VERY_VERBOSE) {
	local($msg) = @_;
	local($time) = time - $^T;
	printf(STDERR "%2d:%02d.%02d: %s",
	       $time/3600, $time/60%60, $time%60, $msg);
    }
}

sub help {
    print STDERR <<EOF;
This is buildict version $buildict_version.

buildict accepts the following arguments:
   -C			        show the copyright notice
   -c 'output affix table'	affix table to convert to
   -l 'input affix table'	affix table of the input list
   -T 'source format'		format of the input list
   -s 'suppression dictionary'	list of words to exclude
   -w 'word characters'		characters that are part of words
   -o 'output file'		where to put the result if not standard output
   -D				keep files for debugging
   -h				print this help message and exit
   -e				only expand the input list
   -v				verbose output, to help maintain dictionaries
   -V				very verbose output, with timing
EOF
}

sub copyright {
    print STDERR $copyright;
}

sub cleanup {
    if ($DEBUG) {
	print STDERR "You may want to delete the following files:\n";
	foreach (@TMP_FILES) {
	    print STDERR "$_\n"		if -e $_;
	}
    } else {
	foreach (@TMP_FILES) {
	    unlink $_;
	}
    }
}

sub sighandler {
    local($sig) = @_;
    print STDERR "Caught a SIG$sig -- ";
    if ($DEBUG) {
	print STDERR "shutting down.\n";
    } else {
	print STDERR "cleaning up.\n";
    }
    &cleanup;
    exit(1);
}

# Passing a filename, sorts it in place, ignoring case differences.
# Duplicates are then eliminated.
# Returns the number of entries.
sub sort_ignoring_case {
    local($file) = @_;
    # Using sort -u to remove duplicates eliminates "funny" capitalizations.
    # e.g. "MacDonald" disappears if "macdonald" exists
    system("$SORT -f -o $SORTTMP $file");
    open(SORTTMP, "<$SORTTMP");
    open(FILE, ">$file");
    local($last, $num);
    while (<SORTTMP>) {
	next if $_ eq $last;
	print FILE $_;
	$num++;
	$last = $_;
    }
    close FILE;
    close SORTTMP;
    unlink $SORTTMP;
    $num;
}


# Passing a filename, sorts it in place.
# Duplicates are then eliminated.
# Returns the number of entries.
sub sort_file {
    local($file) = @_;
    # Using sort -u to remove duplicates eliminates "funny" capitalizations.
    # e.g. "MacDonald" disappears if "macdonald" exists
    system("$SORT -o $SORTTMP $file");
    open(SORTTMP, "<$SORTTMP");
    open(FILE, ">$file");
    local($last, $num);
    while (<SORTTMP>) {
	next if $_ eq $last;
	print FILE $_;
	$num++;
	$last = $_;
    }
    close FILE;
    close SORTTMP;
    unlink $SORTTMP;
    $num;
}


# Passing a filename, sorts it in place, using only the root field.
# Removes duplicate entries.  Returns the number of entries.
sub sort_affixed {
    local ($file) = @_;
    system("$SORT -t$flagmarker +0 -1 -o $SORTTMP $file");
    open(SORTTMP, "<$SORTTMP");
    open(FILE, ">$file");
    local($last, $num);
    while (<SORTTMP>) {
	next if $_ eq $last;
	print FILE $_;
	$num++;
	$last = $_;
    }
    close FILE;
    close SORTTMP;
    unlink $SORTTMP;
    $num;
}

# Passing a filename, sorts it in place, using only the root field,
# disregarding case. Removes duplicate entries.
# Returns the number of entries.
sub sort_affixed_ignoring_case {
    local ($file) = @_;
    system("$SORT -f -t$flagmarker +0 -1 -o $SORTTMP $file");
    open(SORTTMP, "<$SORTTMP");
    open(FILE, ">$file");
    local($last, $num);
    while (<SORTTMP>) {
	next if $_ eq $last;
	print FILE $_;
	$num++;
	$last = $_;
    }
    close FILE;
    close SORTTMP;
    unlink $SORTTMP;
    $num;
}

# Takes a sorted file of roots/affixes,
# the name of the file where the result should go,
# and a filehandle to a sorted list of valid words.
# The second file is created to contain only roots that 
# appear in the third argument.
# Returns the number of valid roots.
sub remove_illegal_roots {
    local($ALLRAF, $VALID, $REFERENCE) = @_;
    local($position, $num);
    seek($REFERENCE, 0, 0);
    open(ALLRAF, "<$ALLRAF");
    open(VALID, ">$VALID");
    $reference = <$REFERENCE>;
    chop $reference;
    while ($raf = <ALLRAF>) {
	$raf_cmp = (split(/$flagmarker/o, $raf, 2))[0];
	$position = $raf_cmp cmp $reference;
	if ($position < 0) {
	    next;
	}
	if ($position == 0) {
	    print VALID $raf;
	    $num++;
	    next;
	}
	($reference = <$REFERENCE>) || last;
	chop $reference;
	# Try the current word again, pointing to the new dictionary reference
	redo;
    }
    close VALID;
    close ALLRAF;
    $num;
}

sub by_number {
    $a <=> $b;
}

# Given two filehandles, the first to a file of two fields [string,
# number] (sorted on the first field), the second to a sorted list of
# valid words, return a sorted array of the second fields for which
# the first field doesn't appear in the list of valid words.  ugh!

sub find_illegal_raf_numbers {
    local($PAIRS, $DICT) = @_;
    local(%illegals);
    $reference = <$DICT>;
    chop $reference;
    while (<$PAIRS>) {
	($word, $num) = split;
	next if $illegals{$num};
	$position = $word cmp $reference;
	if ($position < 0) {
	    $illegals{$num}++;
	    next;
	}
	if ($position == 0) {
	    next;
	}
	($reference = <$DICT>) || last;
	chop $reference;
	# Try the current word again, pointing to the next reference
	redo;
    }
    # Some words may come (alphabetically) after the last reference word.
    $illegals{$num}++	  if $position != 0;
    while (<$PAIRS>) {
	($word, $num) = split;
	$illegals{$num}++;
    }
    return sort by_number keys(%illegals);
}

# Given the filename of a list of root/affixes,
# the filename of where the legal root/affixes should go,
# and a fileHANDLE to the list of valid words,
# creates the file containing only the legal root/affixes.
# An optional fourth argument is a filename where to print
# the illegal root/affixes.
# The files cannot be the same.

sub remove_illegal_raf {
    local($ALLRAF, $LEGALRAF, $VALIDWORDS, $ILLEGALRAF) = @_;

    open(ROOTAFFIXES,
	 "ispell $wordchars -ee -W0 -d $FAKEHASH -p /dev/null < $ALLRAF|");
    open(EXPANDEDPAIRS, "|$SORT +0 -1 -o $EXPANDEDPAIRS");
    while (<ROOTAFFIXES>) {
	$raf_number++;
	@line = split;
	$raf = shift(@line);
	foreach $expansion (@line) {
	    $num_expansions++;
	    print EXPANDEDPAIRS "$expansion $raf_number\n"
		|| die "Can't print.";
	}
    }
    close ROOTAFFIXES;
    &DBprint("  Closing EXPANDEDPAIRS ($num_expansions expansions, $raf_number root/affixes).\n");
    close EXPANDEDPAIRS;
    undef $raf_number;		# cleanup
    undef $num_expansions;	# cleanup

    # At this point, EXPANDEDPAIRS is sorted and each line has a
    # second field which points into ALLRAF.  We will use this
    # field to remove the illegal root/affix combinations when
    # transferring from ALLRAF to LEGALRAF.

    open(EXPANDEDPAIRS, "<$EXPANDEDPAIRS");
    seek($VALIDWORDS, 0, 0);
    &DBprint("  Finding illegals.\n");
    @illegals = &find_illegal_raf_numbers(EXPANDEDPAIRS, $VALIDWORDS);
    close EXPANDEDPAIRS;
    unlink $EXPANDEDPAIRS			unless $DEBUG;

    &DBprint("  Keeping only legal root/affixes.\n");
    open(ALLRAF, "<$ALLRAF");
    open(LEGALRAF, ">$LEGALRAF");
    open(ILLEGALRAF, ">$ILLEGALRAF")	if $ILLEGALRAF;
  illegal: foreach $illegal (@illegals) {
      while (<ALLRAF>) {
	  if ($. == $illegal) {
	      print ILLEGALRAF	if $ILLEGALRAF;
	      next illegal
	  }
	  $legal_raf++;
	  print LEGALRAF;
      }
    }
    close ILLEGALRAF		if $ILLEGALRAF;
    # Those who remain after the last illegal are legal
    while (<ALLRAF>) {
	$legal_raf++;
	print LEGALRAF;
    }
    close LEGALRAF;
    close ALLRAF;

    unlink $ALLRAF			unless $DEBUG;
    &DBprint("  Done, $legal_raf legal root/affixes.\n");
    undef $legal_raf;		# Cleanup
}

# Given the filename of a list of legal root/affixes, generating illegal
# cross-expansions,
# and a fileHANDLE to the list of valid words,
# creates the file containing only the root/affixes which generate legal
# cross-expansions.

sub remove_illegal_crossexp {
    local($CROSSEXP, $VALIDWORDS, $VALIDCROSS) = @_;
    open(CROSSEXP, "<$CROSSEXP");
    open(CROSSPAIRS, ">$CROSSPAIRS");
    open(VALIDCROSS, ">$VALIDCROSS");
    local($root, $affixes, @prefixes, @suffixes, $p, $s, @affix);
    # For each root with 'p' prefixes and 's' suffixes, test the root
    # with each prefix combined with each suffix, yielding 'p times s'
    # pairs of affixes.
    # Eliminate illegal pairs and recombine prefixes and suffixes in
    # such a way that no illegal pair is created.
    # Note that a list of only prefixes or only suffixes is always
    # legal since they cannot combine and illegal uncombined affixes have
    # already been removed.
    while (<CROSSEXP>) {
	chop;
	($root, $affixes) = split(/$flagmarker/o, $_, 2);
	$#prefixes = -1;
	$#suffixes = -1;
	# Separate the affixes into prefixes and suffixes
	foreach (split(//, $affixes)) {
	    if (index($prefixes, $_) >= 0) {
		push(@prefixes, $_);
	    } else {
		push(@suffixes, $_);
	    }
	}
	# The rest of this routine deals only with cross-expansions.
	# The uncrossed affixes must be listed as valid; they always are.
	print VALIDCROSS $root, $flagmarker, @prefixes, "\n";
	print VALIDCROSS $root, $flagmarker, @suffixes, "\n";
	# Generate prefix+suffix pairs.
	foreach $p (@prefixes) {
	    foreach $s (@suffixes) {
		print(CROSSPAIRS $root, $flagmarker, $p, $s, "\n");
	    }
	}
    }
    close CROSSEXP;
    close CROSSPAIRS;
    &remove_illegal_raf($CROSSPAIRS, $LEGALCROSSPAIRS, $VALIDWORDS,
			$ILLEGALCROSSPAIRS);
    unlink $CROSSPAIRS	    unless $DEBUG;
    if ($VERBOSE && -s $ILLEGALCROSSPAIRS) {
	print STDERR
	    "The following roots and affixes ",
	    "were not found in the dictionary;\n",
	    "if you know they are valid, ",
	    "adding them would save space.\n";
	open(ILLEGALCROSSPAIRS, "<$ILLEGALCROSSPAIRS");
	while (<ILLEGALCROSSPAIRS>) {
	    print STDERR "    $_";
	}
	close ILLEGALCROSSPAIRS;
    }
    unlink $ILLEGALCROSSPAIRS		unless $DEBUG;
    open(LEGALCROSSPAIRS, "<$LEGALCROSSPAIRS");
    local($raf, $previous_root, %prefixes, %suffixes);
    # For each root, try to recombine the affixes.
    while (1) {
	$raf = <LEGALCROSSPAIRS>;
	chop $raf;
	($root, $affixes) = split(/$flagmarker/o, $raf, 2);
	$previous_root = $root if !$previous_root; # First time through
	if ($root ne $previous_root) {
	    # If two or more prefixes are combined with the same set of
	    # affixes, combine the prefixes.
	    while (($p, $s) = each %prefixes) {
		$suffixes{$s} .= $p;
	    }
	    undef %prefixes;
	    while (($p, $s) = each %suffixes) {
		print VALIDCROSS $previous_root, $flagmarker, $p, $s, "\n";
	    }
	    undef %suffixes;
	    $previous_root = $root;
	}
	last if !$raf;
	($p, $s) = split(//, $affixes, 2);
	$prefixes{$p} .= $s;
    }
    close VALIDCROSS;
    close LEGALCROSSPAIRS;
    unlink $LEGALCROSSPAIRS	unless $DEBUG;
}

# Takes a file of sorted (disregarding case) root/affixes and
# a file of sorted non-affixed roots.
# They are merged and printed to the file named as the third argument.
sub merge_files_ignoring_case {
    local($AFF, $NAF, $RESULT) = @_;
    local($aff, $naf, $aff_cmp, $naf_cmp);
    open(AFF, "<$AFF");
    open(NAF, "<$NAF");
    open(RESULT, ">$RESULT");
    $aff = <AFF>;
    ($aff_cmp) = split(/$flagmarker/o, $aff);
    $aff_cmp =~ tr/A-Z\300-\336/a-z\340-\376/;
    $naf_cmp = $naf = <NAF>;
    chop $naf_cmp;
    $naf_cmp =~ tr/A-Z\300-\336/a-z\340-\376/;
    while ($aff && $naf) {
	if ($aff_cmp lt $naf_cmp) {
	    print RESULT $aff;
	    $aff = <AFF>;
	    ($aff_cmp) = split(/$flagmarker/o, $aff);
	    $aff_cmp =~ tr/A-Z\300-\336/a-z\340-\376/;
	} else {
	    print RESULT $naf;
	    $naf_cmp = $naf = <NAF>;
	    chop $naf_cmp;
	    $naf_cmp =~ tr/A-Z\300-\336/a-z\340-\376/;
	}
    }
    if ($aff) {
	print RESULT $aff;
	while (<AFF>) { print RESULT $_; }
    } elsif ($naf) {
	print RESULT $naf;
	while (<NAF>) { print RESULT $_; }
    }
    close $RESULT;
    close $NAF;
    close $AFF;
}

######################################################################


######################################################################
#### Read and process command line options

&Getopts('c:Cl:s:w:o:DT:ehvV');

$input_aff_table = $opt_l   if $opt_l;
if ($opt_c) {
    $output_aff_table = $opt_c;
} else {
    $output_aff_table = $input_aff_table;
}

$source_format = "-T $opt_T"	if $opt_T;
$suppression_dict = $opt_s	if $opt_s;
$wordchars = "-w$opt_w"		if $opt_w;
$expandonly = $opt_e;
$RESULT = $opt_o		if $opt_o;
$VERBOSE = $opt_v;
$VERY_VERBOSE = $opt_V;

if ($opt_h) {
    &help;
    exit(1);
}

if ($opt_C) {
    &copyright;
    exit(1);
}

$DEBUG = $opt_D;

select((select(STDERR), $| = 1)[0]);

$SIG{'INT'} = 'sighandler';
$SIG{'QUIT'} = 'sighandler';
$SIG{'TERM'} = 'sighandler';

# Setting up an exit routine to trap exceptions.
# See the end of this script for EndOfEval.
$start = __LINE__;
eval <<'EndOfEval';

print STDERR `$DATE`			    if $VERBOSE || $VERY_VERBOSE;

if ($#ARGV == 0) {
    $INPUTWORDS = $ARGV[0];
} else {
    &DBprint("Merging input lists.\n");
    push(@TMP_FILES, $INPUTWORDS);
    $delete_inputwords++;
    open(INPUTWORDS, ">$INPUTWORDS")	    || die "Can't open $INPUTWORDS";
    while (<>) { print INPUTWORDS $_; }
    close INPUTWORDS;
}

######################################################################


######################################################################
#### Create a dummy hash file to contain the input affix table.

&DBprint("Creating input affix table.\n");
open(FAKEDICT, ">$FAKEDICT") || die "Can't open $FAKEDICT";
print(FAKEDICT "QQQQQQQQ\n");
close(FAKEDICT);
$status = system("buildhash -s $FAKEDICT $input_aff_table $FAKEHASH");
die "Can't create fake hash file" if ($status/256 != 0);

######################################################################


######################################################################
#### Expand all input lists.
#### Weed out unnecessary capitalization.

&DBprint("Expanding input list.\n");
open(EXPANDEDINPUT, ">$EXPANDEDINPUT");
open(EXPANDING,
     "ispell $wordchars -e -d $FAKEHASH -p /dev/null < $INPUTWORDS |");
while (<EXPANDING>) {
    foreach (split) {
	print EXPANDEDINPUT "$_\n";	# Append each new word to the file
	$num_expansions++;
    }
}
close EXPANDEDING;
close EXPANDEDINPUT;
if ($delete_inputwords) {
    unlink $INPUTWORDS		unless $DEBUG;
}
&DBprint("Sorting and eliminating duplicate expansions.\n");
$num_unique = &sort_ignoring_case($EXPANDEDINPUT);
print STDERR "$num_expansions input words, $num_unique unique words.\n"
    if $VERBOSE;
undef $num_expansions, $num_unique;

if ($expandonly) {
    &DBprint("Printing expanded list as requested.\n");
    unlink $FAKEHASH, $FAKEDICT, "$FAKEDICT.cnt", "$FAKEDICT.stat"
	unless $DEBUG;
    if ($opt_o) {
	# Can't use rename here; the two files may be on different filesystems.
	system "$RENAME $EXPANDEDINPUT $RESULT"
			|| die "Can't copy EXPANDEDINPUT";
    } else {
	open(EXPANDEDINPUT, "<$EXPANDEDINPUT");
	while (<EXPANDEDINPUT>) { print; }
	close EXPANDEDINPUT;
	unlink $EXPANDEDINPUT		unless $DEBUG;
    }
    &DBprint("Done.\n");
    exit 0;
}

&DBprint("Combining capitalizations.\n");
system("icombine $source_format $input_aff_table < $EXPANDEDINPUT \\
						 > $COMBINEDINPUT");
unlink $EXPANDEDINPUT			  unless $DEBUG;
&DBprint("Sorting and eliminating duplicate capitalizations.\n");
$expanded_word_count = &sort_file($COMBINEDINPUT);
print STDERR "The input dictionary contains $expanded_word_count ",
	     "(uncapitalized) words.\n"
    if $VERBOSE;

######################################################################


######################################################################
#### Create a dummy hash file to contain the output affix table.
#### Figure out what the marker and the flags are going to be.

if ($output_aff_table eq $input_aff_table) {
    &DBprint("Using same affix table for input and output.\n");
} else {
    &DBprint("Creating output affix table.\n");
    open(FAKEDICT, ">$FAKEDICT") || die "Can't open $FAKEDICT";
    print(FAKEDICT "QQQQQQQQ\n");
    close(FAKEDICT);
    $status = system("buildhash -s $FAKEDICT $output_aff_table $FAKEHASH");
    die "Can't create fake hash file" if ($status/256 != 0);
}
unlink $FAKEDICT, "$FAKEDICT.cnt", "$FAKEDICT.stat"   unless $DEBUG;

open(AFFIXTABLE, "ispell -D -d $FAKEHASH|");
$state = 'nil';
while(<AFFIXTABLE>) {
    if ($state eq 'nil') {
	if (/^flagmarker\s+(.)/) {
	    $flagmarker = $1;
	}
	if (/^prefixes/) {
	    $state = 'prefixes';
	}
	if (/^suffixes/) {
	    $state = 'suffixes';
	}
    } elsif ($state eq 'prefixes') {
	if (/^\s+flag\s+\*(.)/) {
	    $prefixes .= $1	unless index($prefixes, $1) >= 0;
	}
	if (/^suffixes/) {
	    $state = 'suffixes';
	}
    } elsif ($state eq 'suffixes') {
	if (/^\s+flag\s+\*(.)/) {
	    $suffixes .= $1	unless index($suffixes, $1) >= 0;
	}
    }
}
close AFFIXTABLE;
die "Can't find the flag marker"		    if !$flagmarker;
&DBprint("The flag marker is '$flagmarker'.\n");
if ($VERBOSE) {
print STDERR "\t  "				 if $prefixes && $VERY_VERBOSE;
print STDERR "The prefix flags are '$prefixes'"	 if $prefixes;
print STDERR " and t"				 if $prefixes && $suffixes;
print STDERR "T"				 if !$prefixes && $suffixes;
print STDERR "he suffix flags are '$suffixes'";
print STDERR ".\n"				 if $prefixes || $suffixes;
}

######################################################################


######################################################################
#### If required, remove words already in a specified dictionary.

if ($suppression_dict) {
    &DBprint("Removing words already in dictionary \"$suppression_dict\".\n");
    open(STRIPPEDINPUT,
	 "|ispell $wordchars -l -d $suppression_dict -p /dev/null > \\
		  $STRIPPEDINPUT");
    open(COMBINEDINPUT, "<$COMBINEDINPUT");
    while (<COMBINEDINPUT>) {
	print STRIPPEDINPUT $_;
    }
    close COMBINEDINPUT;
    close STRIPPEDINPUT;
} else {
    &DBprint("No suppression list, munching all words.\n");
    system "$COPY $COMBINEDINPUT $STRIPPEDINPUT" ||
	die "Can't copy input word list.";
}

rename($COMBINEDINPUT, $VALIDWORDS);

######################################################################


######################################################################
#### Collect all possible roots and affixes.
#### Some of those are illegal and will be removed later.

&DBprint("Generating all roots and possible affixes.\n");
open(ALLROOTS,
     "ispell $wordchars -c -W0 -d $FAKEHASH -p /dev/null < $STRIPPEDINPUT|");
open(AFFIXEDROOTS, ">$AFFIXEDROOTS");
while (<ALLROOTS>) {
    @line = split;
    next if $#line == 0;	# No possible root/affix for this word
    shift(@line);		# The original root/affix is not interesting
    foreach $word (@line) {
	print AFFIXEDROOTS "$word\n";
    }
}
close AFFIXEDROOTS;
close ALLROOTS;
&DBprint("Sorting...\n");
# This is where usage of temporary file space peaks
$num_illegals = &sort_affixed($AFFIXEDROOTS);
&DBprint("       ...done.\n");
open(VALIDWORDS, "<$VALIDWORDS");
$num_roots = &remove_illegal_roots($AFFIXEDROOTS, $ALLFLAGLIST, VALIDWORDS);
$num_illegals -= $num_roots;
&DBprint("$num_illegals illegal roots removed, $num_roots roots left.\n");
unlink $AFFIXEDROOTS		unless $DEBUG;

undef $num_illegals;		# cleanup
undef $num_roots;		# cleanup

######################################################################


######################################################################
#### Find all the possible expansions from the previous list. 
#### Remove the root/affix combinations which generate illegal words.

&DBprint("Pruning illegal root/affix combinations.\n");
&remove_illegal_raf($ALLFLAGLIST, $LEGALFLAGLIST, VALIDWORDS);
&DBprint("Combining multiple flags.\n");
&sort_ignoring_case($LEGALFLAGLIST);
system("icombine $source_format $input_aff_table < $LEGALFLAGLIST \\
						 > $COMBINEDFLAGS");
unlink $LEGALFLAGLIST			unless $DEBUG;

######################################################################


######################################################################
#### Eliminate illegal cross-products; i.e. combinations of suffixes
#### and prefixes that expand to words not in the dictionary.

if ($prefixes && $suffixes) {
    &DBprint("Eliminating illegal prefix/root/suffix combinations.\n");

    open(COMBINEDFLAGS, "<$COMBINEDFLAGS");
    open(ALLCROSS, ">$ALLCROSS");
    local($raf, $affixes, $has_prefixes, $has_suffixes);
    while ($raf = <COMBINEDFLAGS>) {
	$affixes = (split(/[$flagmarker\n]/o, $raf, 3))[1];
	$has_prefixes = $has_suffixes = 0;
	foreach (split(//, $affixes)) {
	    if (!$has_prefixes) {
		if (index($prefixes, $_) >= 0) {
		    $has_prefixes++;
		    last	    if $has_suffixes;
		}
	    }
	    if (!$has_suffixes) {
		if (index($suffixes, $_) >= 0) {
		    $has_suffixes++;
		    last	    if $has_prefixes;
		}
	    }
	}
	if ($has_prefixes && $has_suffixes) {
	    # Root with at least one prefix and one suffix.
	    print ALLCROSS $raf;
	} else {
	    # Save for later the roots with only prefixes or only suffixes.
	    chop $raf;
	    push(@non_crossed, $raf);
	}
    }
    undef $raf; undef $affixes; undef $has_prefixes; undef $has_suffixes;
    close ALLCROSS;
    close COMBINEDFLAGS;
    unlink $COMBINEDFLAGS			    unless $DEBUG;

    &DBprint("Root and affixes read, verifying cross-expansions.\n");
    &sort_file($ALLCROSS);
    &remove_illegal_raf($ALLCROSS, $LEGALALL, VALIDWORDS, $ILLEGALCROSS);

    &DBprint("Separating illegal cross-expansions.\n");
    &remove_illegal_crossexp($ILLEGALCROSS, VALIDWORDS, $LEGALCROSS);
    unlink $ILLEGALCROSS       unless $DEBUG;

    if (-s $LEGALCROSS) {
	open(LEGALALL, ">>$LEGALALL");
	open(LEGALCROSS, "<$LEGALCROSS");
	while (<LEGALCROSS>) {
	    print LEGALALL;
	}
	close LEGALALL;
	close LEGALCROSS;
    }
    unlink $LEGALCROSS			unless $DEBUG;

} else {
    &DBprint("No cross expansion in this language.\n");
    rename($COMBINEDFLAGS, $LEGALALL);
}
unlink $VALIDWORDS		unless $DEBUG;

######################################################################


######################################################################
#### Optimize affix usage:
#### some affixes are subsets of others and can be eliminated.

# This is done by expanding all roots which have flags.  The resulting
# words are then examined and stored in an associative array in such a
# way that, if two or more identical expansions exist, only the one
# belonging to the most "valued" group is retained.

&DBprint("Merging non optimal affixes.\n");
if ($#non_crossed >= 0) {
    open(LEGALEXPANDED,
	 "|ispell $wordchars -ee -W0 -d $FAKEHASH -p /dev/null > \\
		  $LEGALEXPANDED");
    foreach (@non_crossed) {
	print LEGALEXPANDED "$_\n";
    }
    close LEGALEXPANDED;
    undef @non_crossed;		# Reclaim storage
} else {
    unlink $LEGALEXPANDED;	# To make sure it is empty
}

system("ispell $wordchars -ee -W0 -d $FAKEHASH -p /dev/null \\
	       < $LEGALALL >> $LEGALEXPANDED");
unlink $LEGALALL		unless $DEBUG;

open(LEGALEXPANDED, "<$LEGALEXPANDED");
# Preallocate dummy entries to increase the DBM cache to 1000
foreach $i (1 .. 1000) {
    $words{$i}++;
}
dbmopen(%words, $DBMFILE, 0600);

&DBprint("Merging...\n");
while (<LEGALEXPANDED>) {
    ($raf, @expansions) = split;
    # The expansion value is defined as the total expansion length
    # divided by the root length, in characters.
    $value = length(join('', @expansions)) / index($raf, $flagmarker);
    # Build an associative array to retain only the most valued expansions
    foreach $expansion (@expansions) {
	$words{$expansion} = join(' ', $value, $raf)
	    unless ($old = $words{$expansion}) &&
		   $value <= (split(/ /, $old, 2))[0];
    }
}
close LEGALEXPANDED;
unlink $LEGALEXPANDED			    unless $DEBUG;

&DBprint("Uniquing by roots...\n");
# %words is now unique by expansions.  We need to make it unique by roots.
while (($key, $value) = each(%words)) {
    $roots{(split(/ /,$value,2))[1]}++;
}
dbmclose(%words);		# Reclaim storage
undef %words;			# Reclaim storage
unlink "$DBMFILE.dir", "$DBMFILE.pag"	  unless $DEBUG;

&DBprint("Printing...\n");
open(MINIMALAFFIXES, ">$MINIMALAFFIXES");
while (($key, $value) = each(%roots)) {
    print MINIMALAFFIXES "$key\n";
}
close MINIMALAFFIXES;
undef %roots;			# Reclaim storage

######################################################################


######################################################################
#### Add non-affixed words

&DBprint("Adding non affixed words.\n");

$num_affixed = &sort_affixed_ignoring_case($MINIMALAFFIXES);
$status = system("buildhash -s $MINIMALAFFIXES $output_aff_table $FAKEHASH");
die "Can't create intermediate hash file" if ($status/256 != 0);
unlink "$MINIMALAFFIXES.cnt", "$MINIMALAFFIXES.stat"   unless $DEBUG;

system("ispell $wordchars -l -d $FAKEHASH -p /dev/null < $STRIPPEDINPUT \\
						       > $NONAFFIXED");
&DBprint("Completed list of non affixed words.\n");

unlink $FAKEHASH, $STRIPPEDINPUT	    unless $DEBUG;

&DBprint("Sorted affixed.\n");
$num_nonaffixed = &sort_ignoring_case($NONAFFIXED);
&DBprint("Sorted non affixed.\n");
&merge_files_ignoring_case($MINIMALAFFIXES, $NONAFFIXED, $RESULT);
&DBprint("Merged.\n");

if ($VERBOSE) {
print STDERR "\n  There are $num_nonaffixed simple roots ",
	     "and $num_affixed affixed roots.\n";
}

unlink $MINIMALAFFIXES, $NONAFFIXED	    unless $DEBUG;

if ($VERBOSE) {
printf(STDERR
       "  The final dictionary contains %d roots expanding into %d words.\n",
       $num_nonaffixed + $num_affixed, $expanded_word_count);
$time =  time - $^T;
printf(STDERR
       "  It took %d:%02d.%02d to create it.\n\n",
       $time/3600, $time/60%60, $time%60);
}

######################################################################


######################################################################
#### Print result on standard output if requested

unless ($opt_o) {
    open(RESULT, "<$RESULT");
    while (<RESULT>) { print; }
    close RESULT;
    unlink $RESULT			unless $DEBUG;
}
&DBprint("Done.\n");

######################################################################

# For Ray Lischner's PERL profiler:
&DB'profile if defined &DB'profile;

EndOfEval

if ($@) {
    $@ =~ s/\(eval\) line (\d+)/$0 . " line " . ($1+$start)/e;
    $errno = (0 + $!) || ($? >> 8) || 255;
    if (length $!) {
	warn "$@ ($!)\n";
    } else {
	warn $@;
    }
    print STDERR "Cleaning up...\n";
    &cleanup;
    &DBprint("Done.\n");
    exit $errno;
}

exit 0;
These are the contents of the former NiCE NeXT User Group NeXTSTEP/OpenStep software archive, currently hosted by Netfuture.ch.