This is buildict.pl in view mode; [Download] [Up]
#! /depot/public/bin/perl # ---Le laboratoire de robotique de l'Institut de recherche d'Hydro-Quebec----- # # Nom : buildict [options] [[<] inputwords] > outputwords # Fonction: Combine and reduce dictionaries through affixes. # Fichiers: buildict.pl # Notes : This replaces the munchlist shell script provided with ispell 3.0.9 # # Cree : 5 juillet 93 ------- Martin Boyer <mboyer@ireq-robot.hydro.qc.ca> # Modifie : 5 juillet 93 -----1- Martin Boyer <mboyer@ireq-robot.hydro.qc.ca> # Copyright (c) 1993 Martin Boyer and Hydro-Quebec # # Historique: # ----------------------------------------------------------------------------- $copyright = <<EOF; Copyright (c) 1993 Martin Boyer and Hydro-Quebec. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. Send comments, bug reports and fixes to: Martin Boyer mboyer@ireq-robot.hydro.qc.ca Institut de recherche d'Hydro-Quebec mboyer@ireq-robot.uucp 1800, montee Ste-Julie Varennes (Quebec) Canada J3X 1S1 +1 514 652-8412 EOF require "getopts.pl"; ########################## Configuration ########################### # Where the ispell utilities are (buildhash, icombine, etc.) $LIBDIR = '/local/lib/ispell'; # Where the 'ispell' executable is $BINDIR = '/local/bin'; # Where you want temporary files. # Usage peaks when sorting all roots and possible affixes. $TMPDIR = '/tmp'; # Your sorting program. # It must be able to: # sort ignoring case when given the '-f' flag, # sort on the first field when given the '+0 -1' flags # use the specified field separator when given the '-t' flag # send its output to the specified file when given the '-o' flag # If you are using 8-bit characters, it must be able to handle them, # at least sorting them according to their 'byte' value (the SunOS 4.1.1 # version of sort is OK). If it can't ignore case on accented characters, # (like SunOS 4.1.1) then icombine won't be able to eliminate unnessarily # capitalized words, and the resultant directory won't be minimal # (but still valid). # I tried using PERL to sort, but it multiplied the processing time # by at least a factor of five. # If 8 bit characters are used, they should be sorted the same way by sort # and by perl. This script should check for that. # Finally, note that folded sorting is necessary only before icombine. # It is also used to produce the final output, to make the result easier # to read, but that is only cosmetics. # # The full path must be specified. # You may want to add a flag to specify a different directory for # temporary files to prevent the $TMPDIR partition from overflowing. $SORT = '/usr/bin/sort -T /tmp'; # I use hard links to save space. Soft links will probably fail # because some files are deleted, and you don't want a soft link # to a deleted file. # An alternative is 'cp' but that will slow you down and use more space. $COPY = '/usr/bin/ln'; # This must be able to rename a file across filesystems boundaries. $RENAME = '/usr/bin/mv'; # Used only to print the current time at the beginning of the script. $DATE = '/usr/bin/date'; # If you don't want to specify full paths for $SORT, $COPY, $RENAME, # and $DATE, add system directories to this variable. $ENV{'PATH'} = "$LIBDIR:$BINDIR"; # The default affix table. $input_aff_table = "$LIBDIR/english.aff"; ###################################################################### ############################ Constants ############################# $buildict_version = '1.0'; # You may want to put VALIDWORDS and SORTTMP in a different directory, # to distribute temporary files in different partitions. # The other files are pretty much created and deleted as the program goes. # # COMBINEDINPUT and VALIDWORDS must be on the same filesystem. # COMBINEDINPUT and STRIPPEDINPUT may need to, depending on $COPY # COMBINEDFLAGS and LEGALALL must be on the same filesystem. $TMP = "$TMPDIR/munch$$."; $FAKEDICT = $TMP . 'FAKEDICT'; $FAKEHASH = $TMP . 'FAKEHASH.hash'; $INPUTWORDS = $TMP . 'INPUTWORDS'; $EXPANDEDINPUT = $TMP . 'EXPANDEDINPUT'; $COMBINEDINPUT = $TMP . 'COMBINEDINPUT'; $STRIPPEDINPUT = $TMP . 'STRIPPEDINPUT'; $VALIDWORDS = $TMP . 'VALIDWORDS'; $DBMFILE = $TMP . 'DBM'; $SORTTMP = $TMP . 'SORTTMP'; $EXPANDEDPAIRS = $TMP . 'EXPANDEDPAIRS'; $AFFIXEDROOTS = $TMP . 'AFFIXEDROOTS'; $ALLFLAGLIST = $TMP . 'ALLFLAGLIST'; $LEGALFLAGLIST = $TMP . 'LEGALFLAGLIST'; $COMBINEDFLAGS = $TMP . 'COMBINEDFLAGS'; $ALLCROSS = $TMP . 'ALLCROSS'; $LEGALALL = $TMP . 'LEGALALL'; $ILLEGALCROSS = $TMP . 'ILLEGALCROSS'; $LEGALCROSS = $TMP . 'LEGALCROSS'; $CROSSPAIRS = $TMP . 'CROSSPAIRS'; $LEGALCROSSPAIRS = $TMP . 'LEGALCROSSPAIRS'; $ILLEGALCROSSPAIRS = $TMP . 'ILLEGALCROSSPAIRS'; $LEGALEXPANDED = $TMP . 'LEGALEXPANDED'; $MINIMALAFFIXES = $TMP . 'MINIMALAFFIXES'; $NONAFFIXED = $TMP . 'NONAFFIXED'; $RESULT = $TMP . 'RESULT'; # These files are likely to be deleted by the cleanup subroutine. @TMP_FILES = ($FAKEDICT, "$FAKEDICT.cnt", "$FAKEDICT.stat", $FAKEHASH, $EXPANDEDINPUT, $COMBINEDINPUT, $STRIPPEDINPUT, $VALIDWORDS, "$DBMFILE.dir", "$DBMFILE.pag", $SORTTMP, $EXPANDEDPAIRS, $AFFIXEDROOTS, $ALLFLAGLIST, $LEGALFLAGLIST, $COMBINEDFLAGS, $ALLCROSS, $LEGALALL, $ILLEGALCROSS, $LEGALCROSS, $CROSSPAIRS, $LEGALCROSSPAIRS, $ILLEGALCROSSPAIRS, $LEGALEXPANDED, $MINIMALAFFIXES, "$MINIMALAFFIXES.cnt", "$MINIMALAFFIXES.stat", $NONAFFIXED, $RESULT); ###################################################################### ###################################################################### #### Subroutines # Prints its argument preceded by the time since the beginning of the script. sub DBprint { if ($VERY_VERBOSE) { local($msg) = @_; local($time) = time - $^T; printf(STDERR "%2d:%02d.%02d: %s", $time/3600, $time/60%60, $time%60, $msg); } } sub help { print STDERR <<EOF; This is buildict version $buildict_version. buildict accepts the following arguments: -C show the copyright notice -c 'output affix table' affix table to convert to -l 'input affix table' affix table of the input list -T 'source format' format of the input list -s 'suppression dictionary' list of words to exclude -w 'word characters' characters that are part of words -o 'output file' where to put the result if not standard output -D keep files for debugging -h print this help message and exit -e only expand the input list -v verbose output, to help maintain dictionaries -V very verbose output, with timing EOF } sub copyright { print STDERR $copyright; } sub cleanup { if ($DEBUG) { print STDERR "You may want to delete the following files:\n"; foreach (@TMP_FILES) { print STDERR "$_\n" if -e $_; } } else { foreach (@TMP_FILES) { unlink $_; } } } sub sighandler { local($sig) = @_; print STDERR "Caught a SIG$sig -- "; if ($DEBUG) { print STDERR "shutting down.\n"; } else { print STDERR "cleaning up.\n"; } &cleanup; exit(1); } # Passing a filename, sorts it in place, ignoring case differences. # Duplicates are then eliminated. # Returns the number of entries. sub sort_ignoring_case { local($file) = @_; # Using sort -u to remove duplicates eliminates "funny" capitalizations. # e.g. "MacDonald" disappears if "macdonald" exists system("$SORT -f -o $SORTTMP $file"); open(SORTTMP, "<$SORTTMP"); open(FILE, ">$file"); local($last, $num); while (<SORTTMP>) { next if $_ eq $last; print FILE $_; $num++; $last = $_; } close FILE; close SORTTMP; unlink $SORTTMP; $num; } # Passing a filename, sorts it in place. # Duplicates are then eliminated. # Returns the number of entries. sub sort_file { local($file) = @_; # Using sort -u to remove duplicates eliminates "funny" capitalizations. # e.g. "MacDonald" disappears if "macdonald" exists system("$SORT -o $SORTTMP $file"); open(SORTTMP, "<$SORTTMP"); open(FILE, ">$file"); local($last, $num); while (<SORTTMP>) { next if $_ eq $last; print FILE $_; $num++; $last = $_; } close FILE; close SORTTMP; unlink $SORTTMP; $num; } # Passing a filename, sorts it in place, using only the root field. # Removes duplicate entries. Returns the number of entries. sub sort_affixed { local ($file) = @_; system("$SORT -t$flagmarker +0 -1 -o $SORTTMP $file"); open(SORTTMP, "<$SORTTMP"); open(FILE, ">$file"); local($last, $num); while (<SORTTMP>) { next if $_ eq $last; print FILE $_; $num++; $last = $_; } close FILE; close SORTTMP; unlink $SORTTMP; $num; } # Passing a filename, sorts it in place, using only the root field, # disregarding case. Removes duplicate entries. # Returns the number of entries. sub sort_affixed_ignoring_case { local ($file) = @_; system("$SORT -f -t$flagmarker +0 -1 -o $SORTTMP $file"); open(SORTTMP, "<$SORTTMP"); open(FILE, ">$file"); local($last, $num); while (<SORTTMP>) { next if $_ eq $last; print FILE $_; $num++; $last = $_; } close FILE; close SORTTMP; unlink $SORTTMP; $num; } # Takes a sorted file of roots/affixes, # the name of the file where the result should go, # and a filehandle to a sorted list of valid words. # The second file is created to contain only roots that # appear in the third argument. # Returns the number of valid roots. sub remove_illegal_roots { local($ALLRAF, $VALID, $REFERENCE) = @_; local($position, $num); seek($REFERENCE, 0, 0); open(ALLRAF, "<$ALLRAF"); open(VALID, ">$VALID"); $reference = <$REFERENCE>; chop $reference; while ($raf = <ALLRAF>) { $raf_cmp = (split(/$flagmarker/o, $raf, 2))[0]; $position = $raf_cmp cmp $reference; if ($position < 0) { next; } if ($position == 0) { print VALID $raf; $num++; next; } ($reference = <$REFERENCE>) || last; chop $reference; # Try the current word again, pointing to the new dictionary reference redo; } close VALID; close ALLRAF; $num; } sub by_number { $a <=> $b; } # Given two filehandles, the first to a file of two fields [string, # number] (sorted on the first field), the second to a sorted list of # valid words, return a sorted array of the second fields for which # the first field doesn't appear in the list of valid words. ugh! sub find_illegal_raf_numbers { local($PAIRS, $DICT) = @_; local(%illegals); $reference = <$DICT>; chop $reference; while (<$PAIRS>) { ($word, $num) = split; next if $illegals{$num}; $position = $word cmp $reference; if ($position < 0) { $illegals{$num}++; next; } if ($position == 0) { next; } ($reference = <$DICT>) || last; chop $reference; # Try the current word again, pointing to the next reference redo; } # Some words may come (alphabetically) after the last reference word. $illegals{$num}++ if $position != 0; while (<$PAIRS>) { ($word, $num) = split; $illegals{$num}++; } return sort by_number keys(%illegals); } # Given the filename of a list of root/affixes, # the filename of where the legal root/affixes should go, # and a fileHANDLE to the list of valid words, # creates the file containing only the legal root/affixes. # An optional fourth argument is a filename where to print # the illegal root/affixes. # The files cannot be the same. sub remove_illegal_raf { local($ALLRAF, $LEGALRAF, $VALIDWORDS, $ILLEGALRAF) = @_; open(ROOTAFFIXES, "ispell $wordchars -ee -W0 -d $FAKEHASH -p /dev/null < $ALLRAF|"); open(EXPANDEDPAIRS, "|$SORT +0 -1 -o $EXPANDEDPAIRS"); while (<ROOTAFFIXES>) { $raf_number++; @line = split; $raf = shift(@line); foreach $expansion (@line) { $num_expansions++; print EXPANDEDPAIRS "$expansion $raf_number\n" || die "Can't print."; } } close ROOTAFFIXES; &DBprint(" Closing EXPANDEDPAIRS ($num_expansions expansions, $raf_number root/affixes).\n"); close EXPANDEDPAIRS; undef $raf_number; # cleanup undef $num_expansions; # cleanup # At this point, EXPANDEDPAIRS is sorted and each line has a # second field which points into ALLRAF. We will use this # field to remove the illegal root/affix combinations when # transferring from ALLRAF to LEGALRAF. open(EXPANDEDPAIRS, "<$EXPANDEDPAIRS"); seek($VALIDWORDS, 0, 0); &DBprint(" Finding illegals.\n"); @illegals = &find_illegal_raf_numbers(EXPANDEDPAIRS, $VALIDWORDS); close EXPANDEDPAIRS; unlink $EXPANDEDPAIRS unless $DEBUG; &DBprint(" Keeping only legal root/affixes.\n"); open(ALLRAF, "<$ALLRAF"); open(LEGALRAF, ">$LEGALRAF"); open(ILLEGALRAF, ">$ILLEGALRAF") if $ILLEGALRAF; illegal: foreach $illegal (@illegals) { while (<ALLRAF>) { if ($. == $illegal) { print ILLEGALRAF if $ILLEGALRAF; next illegal } $legal_raf++; print LEGALRAF; } } close ILLEGALRAF if $ILLEGALRAF; # Those who remain after the last illegal are legal while (<ALLRAF>) { $legal_raf++; print LEGALRAF; } close LEGALRAF; close ALLRAF; unlink $ALLRAF unless $DEBUG; &DBprint(" Done, $legal_raf legal root/affixes.\n"); undef $legal_raf; # Cleanup } # Given the filename of a list of legal root/affixes, generating illegal # cross-expansions, # and a fileHANDLE to the list of valid words, # creates the file containing only the root/affixes which generate legal # cross-expansions. sub remove_illegal_crossexp { local($CROSSEXP, $VALIDWORDS, $VALIDCROSS) = @_; open(CROSSEXP, "<$CROSSEXP"); open(CROSSPAIRS, ">$CROSSPAIRS"); open(VALIDCROSS, ">$VALIDCROSS"); local($root, $affixes, @prefixes, @suffixes, $p, $s, @affix); # For each root with 'p' prefixes and 's' suffixes, test the root # with each prefix combined with each suffix, yielding 'p times s' # pairs of affixes. # Eliminate illegal pairs and recombine prefixes and suffixes in # such a way that no illegal pair is created. # Note that a list of only prefixes or only suffixes is always # legal since they cannot combine and illegal uncombined affixes have # already been removed. while (<CROSSEXP>) { chop; ($root, $affixes) = split(/$flagmarker/o, $_, 2); $#prefixes = -1; $#suffixes = -1; # Separate the affixes into prefixes and suffixes foreach (split(//, $affixes)) { if (index($prefixes, $_) >= 0) { push(@prefixes, $_); } else { push(@suffixes, $_); } } # The rest of this routine deals only with cross-expansions. # The uncrossed affixes must be listed as valid; they always are. print VALIDCROSS $root, $flagmarker, @prefixes, "\n"; print VALIDCROSS $root, $flagmarker, @suffixes, "\n"; # Generate prefix+suffix pairs. foreach $p (@prefixes) { foreach $s (@suffixes) { print(CROSSPAIRS $root, $flagmarker, $p, $s, "\n"); } } } close CROSSEXP; close CROSSPAIRS; &remove_illegal_raf($CROSSPAIRS, $LEGALCROSSPAIRS, $VALIDWORDS, $ILLEGALCROSSPAIRS); unlink $CROSSPAIRS unless $DEBUG; if ($VERBOSE && -s $ILLEGALCROSSPAIRS) { print STDERR "The following roots and affixes ", "were not found in the dictionary;\n", "if you know they are valid, ", "adding them would save space.\n"; open(ILLEGALCROSSPAIRS, "<$ILLEGALCROSSPAIRS"); while (<ILLEGALCROSSPAIRS>) { print STDERR " $_"; } close ILLEGALCROSSPAIRS; } unlink $ILLEGALCROSSPAIRS unless $DEBUG; open(LEGALCROSSPAIRS, "<$LEGALCROSSPAIRS"); local($raf, $previous_root, %prefixes, %suffixes); # For each root, try to recombine the affixes. while (1) { $raf = <LEGALCROSSPAIRS>; chop $raf; ($root, $affixes) = split(/$flagmarker/o, $raf, 2); $previous_root = $root if !$previous_root; # First time through if ($root ne $previous_root) { # If two or more prefixes are combined with the same set of # affixes, combine the prefixes. while (($p, $s) = each %prefixes) { $suffixes{$s} .= $p; } undef %prefixes; while (($p, $s) = each %suffixes) { print VALIDCROSS $previous_root, $flagmarker, $p, $s, "\n"; } undef %suffixes; $previous_root = $root; } last if !$raf; ($p, $s) = split(//, $affixes, 2); $prefixes{$p} .= $s; } close VALIDCROSS; close LEGALCROSSPAIRS; unlink $LEGALCROSSPAIRS unless $DEBUG; } # Takes a file of sorted (disregarding case) root/affixes and # a file of sorted non-affixed roots. # They are merged and printed to the file named as the third argument. sub merge_files_ignoring_case { local($AFF, $NAF, $RESULT) = @_; local($aff, $naf, $aff_cmp, $naf_cmp); open(AFF, "<$AFF"); open(NAF, "<$NAF"); open(RESULT, ">$RESULT"); $aff = <AFF>; ($aff_cmp) = split(/$flagmarker/o, $aff); $aff_cmp =~ tr/A-Z\300-\336/a-z\340-\376/; $naf_cmp = $naf = <NAF>; chop $naf_cmp; $naf_cmp =~ tr/A-Z\300-\336/a-z\340-\376/; while ($aff && $naf) { if ($aff_cmp lt $naf_cmp) { print RESULT $aff; $aff = <AFF>; ($aff_cmp) = split(/$flagmarker/o, $aff); $aff_cmp =~ tr/A-Z\300-\336/a-z\340-\376/; } else { print RESULT $naf; $naf_cmp = $naf = <NAF>; chop $naf_cmp; $naf_cmp =~ tr/A-Z\300-\336/a-z\340-\376/; } } if ($aff) { print RESULT $aff; while (<AFF>) { print RESULT $_; } } elsif ($naf) { print RESULT $naf; while (<NAF>) { print RESULT $_; } } close $RESULT; close $NAF; close $AFF; } ###################################################################### ###################################################################### #### Read and process command line options &Getopts('c:Cl:s:w:o:DT:ehvV'); $input_aff_table = $opt_l if $opt_l; if ($opt_c) { $output_aff_table = $opt_c; } else { $output_aff_table = $input_aff_table; } $source_format = "-T $opt_T" if $opt_T; $suppression_dict = $opt_s if $opt_s; $wordchars = "-w$opt_w" if $opt_w; $expandonly = $opt_e; $RESULT = $opt_o if $opt_o; $VERBOSE = $opt_v; $VERY_VERBOSE = $opt_V; if ($opt_h) { &help; exit(1); } if ($opt_C) { ©right; exit(1); } $DEBUG = $opt_D; select((select(STDERR), $| = 1)[0]); $SIG{'INT'} = 'sighandler'; $SIG{'QUIT'} = 'sighandler'; $SIG{'TERM'} = 'sighandler'; # Setting up an exit routine to trap exceptions. # See the end of this script for EndOfEval. $start = __LINE__; eval <<'EndOfEval'; print STDERR `$DATE` if $VERBOSE || $VERY_VERBOSE; if ($#ARGV == 0) { $INPUTWORDS = $ARGV[0]; } else { &DBprint("Merging input lists.\n"); push(@TMP_FILES, $INPUTWORDS); $delete_inputwords++; open(INPUTWORDS, ">$INPUTWORDS") || die "Can't open $INPUTWORDS"; while (<>) { print INPUTWORDS $_; } close INPUTWORDS; } ###################################################################### ###################################################################### #### Create a dummy hash file to contain the input affix table. &DBprint("Creating input affix table.\n"); open(FAKEDICT, ">$FAKEDICT") || die "Can't open $FAKEDICT"; print(FAKEDICT "QQQQQQQQ\n"); close(FAKEDICT); $status = system("buildhash -s $FAKEDICT $input_aff_table $FAKEHASH"); die "Can't create fake hash file" if ($status/256 != 0); ###################################################################### ###################################################################### #### Expand all input lists. #### Weed out unnecessary capitalization. &DBprint("Expanding input list.\n"); open(EXPANDEDINPUT, ">$EXPANDEDINPUT"); open(EXPANDING, "ispell $wordchars -e -d $FAKEHASH -p /dev/null < $INPUTWORDS |"); while (<EXPANDING>) { foreach (split) { print EXPANDEDINPUT "$_\n"; # Append each new word to the file $num_expansions++; } } close EXPANDEDING; close EXPANDEDINPUT; if ($delete_inputwords) { unlink $INPUTWORDS unless $DEBUG; } &DBprint("Sorting and eliminating duplicate expansions.\n"); $num_unique = &sort_ignoring_case($EXPANDEDINPUT); print STDERR "$num_expansions input words, $num_unique unique words.\n" if $VERBOSE; undef $num_expansions, $num_unique; if ($expandonly) { &DBprint("Printing expanded list as requested.\n"); unlink $FAKEHASH, $FAKEDICT, "$FAKEDICT.cnt", "$FAKEDICT.stat" unless $DEBUG; if ($opt_o) { # Can't use rename here; the two files may be on different filesystems. system "$RENAME $EXPANDEDINPUT $RESULT" || die "Can't copy EXPANDEDINPUT"; } else { open(EXPANDEDINPUT, "<$EXPANDEDINPUT"); while (<EXPANDEDINPUT>) { print; } close EXPANDEDINPUT; unlink $EXPANDEDINPUT unless $DEBUG; } &DBprint("Done.\n"); exit 0; } &DBprint("Combining capitalizations.\n"); system("icombine $source_format $input_aff_table < $EXPANDEDINPUT \\ > $COMBINEDINPUT"); unlink $EXPANDEDINPUT unless $DEBUG; &DBprint("Sorting and eliminating duplicate capitalizations.\n"); $expanded_word_count = &sort_file($COMBINEDINPUT); print STDERR "The input dictionary contains $expanded_word_count ", "(uncapitalized) words.\n" if $VERBOSE; ###################################################################### ###################################################################### #### Create a dummy hash file to contain the output affix table. #### Figure out what the marker and the flags are going to be. if ($output_aff_table eq $input_aff_table) { &DBprint("Using same affix table for input and output.\n"); } else { &DBprint("Creating output affix table.\n"); open(FAKEDICT, ">$FAKEDICT") || die "Can't open $FAKEDICT"; print(FAKEDICT "QQQQQQQQ\n"); close(FAKEDICT); $status = system("buildhash -s $FAKEDICT $output_aff_table $FAKEHASH"); die "Can't create fake hash file" if ($status/256 != 0); } unlink $FAKEDICT, "$FAKEDICT.cnt", "$FAKEDICT.stat" unless $DEBUG; open(AFFIXTABLE, "ispell -D -d $FAKEHASH|"); $state = 'nil'; while(<AFFIXTABLE>) { if ($state eq 'nil') { if (/^flagmarker\s+(.)/) { $flagmarker = $1; } if (/^prefixes/) { $state = 'prefixes'; } if (/^suffixes/) { $state = 'suffixes'; } } elsif ($state eq 'prefixes') { if (/^\s+flag\s+\*(.)/) { $prefixes .= $1 unless index($prefixes, $1) >= 0; } if (/^suffixes/) { $state = 'suffixes'; } } elsif ($state eq 'suffixes') { if (/^\s+flag\s+\*(.)/) { $suffixes .= $1 unless index($suffixes, $1) >= 0; } } } close AFFIXTABLE; die "Can't find the flag marker" if !$flagmarker; &DBprint("The flag marker is '$flagmarker'.\n"); if ($VERBOSE) { print STDERR "\t " if $prefixes && $VERY_VERBOSE; print STDERR "The prefix flags are '$prefixes'" if $prefixes; print STDERR " and t" if $prefixes && $suffixes; print STDERR "T" if !$prefixes && $suffixes; print STDERR "he suffix flags are '$suffixes'"; print STDERR ".\n" if $prefixes || $suffixes; } ###################################################################### ###################################################################### #### If required, remove words already in a specified dictionary. if ($suppression_dict) { &DBprint("Removing words already in dictionary \"$suppression_dict\".\n"); open(STRIPPEDINPUT, "|ispell $wordchars -l -d $suppression_dict -p /dev/null > \\ $STRIPPEDINPUT"); open(COMBINEDINPUT, "<$COMBINEDINPUT"); while (<COMBINEDINPUT>) { print STRIPPEDINPUT $_; } close COMBINEDINPUT; close STRIPPEDINPUT; } else { &DBprint("No suppression list, munching all words.\n"); system "$COPY $COMBINEDINPUT $STRIPPEDINPUT" || die "Can't copy input word list."; } rename($COMBINEDINPUT, $VALIDWORDS); ###################################################################### ###################################################################### #### Collect all possible roots and affixes. #### Some of those are illegal and will be removed later. &DBprint("Generating all roots and possible affixes.\n"); open(ALLROOTS, "ispell $wordchars -c -W0 -d $FAKEHASH -p /dev/null < $STRIPPEDINPUT|"); open(AFFIXEDROOTS, ">$AFFIXEDROOTS"); while (<ALLROOTS>) { @line = split; next if $#line == 0; # No possible root/affix for this word shift(@line); # The original root/affix is not interesting foreach $word (@line) { print AFFIXEDROOTS "$word\n"; } } close AFFIXEDROOTS; close ALLROOTS; &DBprint("Sorting...\n"); # This is where usage of temporary file space peaks $num_illegals = &sort_affixed($AFFIXEDROOTS); &DBprint(" ...done.\n"); open(VALIDWORDS, "<$VALIDWORDS"); $num_roots = &remove_illegal_roots($AFFIXEDROOTS, $ALLFLAGLIST, VALIDWORDS); $num_illegals -= $num_roots; &DBprint("$num_illegals illegal roots removed, $num_roots roots left.\n"); unlink $AFFIXEDROOTS unless $DEBUG; undef $num_illegals; # cleanup undef $num_roots; # cleanup ###################################################################### ###################################################################### #### Find all the possible expansions from the previous list. #### Remove the root/affix combinations which generate illegal words. &DBprint("Pruning illegal root/affix combinations.\n"); &remove_illegal_raf($ALLFLAGLIST, $LEGALFLAGLIST, VALIDWORDS); &DBprint("Combining multiple flags.\n"); &sort_ignoring_case($LEGALFLAGLIST); system("icombine $source_format $input_aff_table < $LEGALFLAGLIST \\ > $COMBINEDFLAGS"); unlink $LEGALFLAGLIST unless $DEBUG; ###################################################################### ###################################################################### #### Eliminate illegal cross-products; i.e. combinations of suffixes #### and prefixes that expand to words not in the dictionary. if ($prefixes && $suffixes) { &DBprint("Eliminating illegal prefix/root/suffix combinations.\n"); open(COMBINEDFLAGS, "<$COMBINEDFLAGS"); open(ALLCROSS, ">$ALLCROSS"); local($raf, $affixes, $has_prefixes, $has_suffixes); while ($raf = <COMBINEDFLAGS>) { $affixes = (split(/[$flagmarker\n]/o, $raf, 3))[1]; $has_prefixes = $has_suffixes = 0; foreach (split(//, $affixes)) { if (!$has_prefixes) { if (index($prefixes, $_) >= 0) { $has_prefixes++; last if $has_suffixes; } } if (!$has_suffixes) { if (index($suffixes, $_) >= 0) { $has_suffixes++; last if $has_prefixes; } } } if ($has_prefixes && $has_suffixes) { # Root with at least one prefix and one suffix. print ALLCROSS $raf; } else { # Save for later the roots with only prefixes or only suffixes. chop $raf; push(@non_crossed, $raf); } } undef $raf; undef $affixes; undef $has_prefixes; undef $has_suffixes; close ALLCROSS; close COMBINEDFLAGS; unlink $COMBINEDFLAGS unless $DEBUG; &DBprint("Root and affixes read, verifying cross-expansions.\n"); &sort_file($ALLCROSS); &remove_illegal_raf($ALLCROSS, $LEGALALL, VALIDWORDS, $ILLEGALCROSS); &DBprint("Separating illegal cross-expansions.\n"); &remove_illegal_crossexp($ILLEGALCROSS, VALIDWORDS, $LEGALCROSS); unlink $ILLEGALCROSS unless $DEBUG; if (-s $LEGALCROSS) { open(LEGALALL, ">>$LEGALALL"); open(LEGALCROSS, "<$LEGALCROSS"); while (<LEGALCROSS>) { print LEGALALL; } close LEGALALL; close LEGALCROSS; } unlink $LEGALCROSS unless $DEBUG; } else { &DBprint("No cross expansion in this language.\n"); rename($COMBINEDFLAGS, $LEGALALL); } unlink $VALIDWORDS unless $DEBUG; ###################################################################### ###################################################################### #### Optimize affix usage: #### some affixes are subsets of others and can be eliminated. # This is done by expanding all roots which have flags. The resulting # words are then examined and stored in an associative array in such a # way that, if two or more identical expansions exist, only the one # belonging to the most "valued" group is retained. &DBprint("Merging non optimal affixes.\n"); if ($#non_crossed >= 0) { open(LEGALEXPANDED, "|ispell $wordchars -ee -W0 -d $FAKEHASH -p /dev/null > \\ $LEGALEXPANDED"); foreach (@non_crossed) { print LEGALEXPANDED "$_\n"; } close LEGALEXPANDED; undef @non_crossed; # Reclaim storage } else { unlink $LEGALEXPANDED; # To make sure it is empty } system("ispell $wordchars -ee -W0 -d $FAKEHASH -p /dev/null \\ < $LEGALALL >> $LEGALEXPANDED"); unlink $LEGALALL unless $DEBUG; open(LEGALEXPANDED, "<$LEGALEXPANDED"); # Preallocate dummy entries to increase the DBM cache to 1000 foreach $i (1 .. 1000) { $words{$i}++; } dbmopen(%words, $DBMFILE, 0600); &DBprint("Merging...\n"); while (<LEGALEXPANDED>) { ($raf, @expansions) = split; # The expansion value is defined as the total expansion length # divided by the root length, in characters. $value = length(join('', @expansions)) / index($raf, $flagmarker); # Build an associative array to retain only the most valued expansions foreach $expansion (@expansions) { $words{$expansion} = join(' ', $value, $raf) unless ($old = $words{$expansion}) && $value <= (split(/ /, $old, 2))[0]; } } close LEGALEXPANDED; unlink $LEGALEXPANDED unless $DEBUG; &DBprint("Uniquing by roots...\n"); # %words is now unique by expansions. We need to make it unique by roots. while (($key, $value) = each(%words)) { $roots{(split(/ /,$value,2))[1]}++; } dbmclose(%words); # Reclaim storage undef %words; # Reclaim storage unlink "$DBMFILE.dir", "$DBMFILE.pag" unless $DEBUG; &DBprint("Printing...\n"); open(MINIMALAFFIXES, ">$MINIMALAFFIXES"); while (($key, $value) = each(%roots)) { print MINIMALAFFIXES "$key\n"; } close MINIMALAFFIXES; undef %roots; # Reclaim storage ###################################################################### ###################################################################### #### Add non-affixed words &DBprint("Adding non affixed words.\n"); $num_affixed = &sort_affixed_ignoring_case($MINIMALAFFIXES); $status = system("buildhash -s $MINIMALAFFIXES $output_aff_table $FAKEHASH"); die "Can't create intermediate hash file" if ($status/256 != 0); unlink "$MINIMALAFFIXES.cnt", "$MINIMALAFFIXES.stat" unless $DEBUG; system("ispell $wordchars -l -d $FAKEHASH -p /dev/null < $STRIPPEDINPUT \\ > $NONAFFIXED"); &DBprint("Completed list of non affixed words.\n"); unlink $FAKEHASH, $STRIPPEDINPUT unless $DEBUG; &DBprint("Sorted affixed.\n"); $num_nonaffixed = &sort_ignoring_case($NONAFFIXED); &DBprint("Sorted non affixed.\n"); &merge_files_ignoring_case($MINIMALAFFIXES, $NONAFFIXED, $RESULT); &DBprint("Merged.\n"); if ($VERBOSE) { print STDERR "\n There are $num_nonaffixed simple roots ", "and $num_affixed affixed roots.\n"; } unlink $MINIMALAFFIXES, $NONAFFIXED unless $DEBUG; if ($VERBOSE) { printf(STDERR " The final dictionary contains %d roots expanding into %d words.\n", $num_nonaffixed + $num_affixed, $expanded_word_count); $time = time - $^T; printf(STDERR " It took %d:%02d.%02d to create it.\n\n", $time/3600, $time/60%60, $time%60); } ###################################################################### ###################################################################### #### Print result on standard output if requested unless ($opt_o) { open(RESULT, "<$RESULT"); while (<RESULT>) { print; } close RESULT; unlink $RESULT unless $DEBUG; } &DBprint("Done.\n"); ###################################################################### # For Ray Lischner's PERL profiler: &DB'profile if defined &DB'profile; EndOfEval if ($@) { $@ =~ s/\(eval\) line (\d+)/$0 . " line " . ($1+$start)/e; $errno = (0 + $!) || ($? >> 8) || 255; if (length $!) { warn "$@ ($!)\n"; } else { warn $@; } print STDERR "Cleaning up...\n"; &cleanup; &DBprint("Done.\n"); exit $errno; } exit 0;
These are the contents of the former NiCE NeXT User Group NeXTSTEP/OpenStep software archive, currently hosted by Netfuture.ch.