ftp.nice.ch/pub/next/unix/text/NeXT_French_Dictionary.3.1.08.I.bs.tar.gz#/NeXT_French_Dictionary3.1.08/src/ispell-3.1.08.tar.gz#/ispell-3.1/buildhash.c

This is buildhash.c in view mode; [Download] [Up]

/*
 * RCS identification string for this file.  It is left out of the
 * compilation when "lint" is defined.
 */
#ifndef lint
static char Rcs_Id[] =
    "$Id: buildhash.c,v 1.60 1994/01/25 07:11:18 geoff Exp $";
#endif

/*
 * MAIN is defined ahead of the project headers included below.
 * NOTE(review): presumably this makes ispell.h define its global
 * variables in this file instead of declaring them extern -- confirm
 * against ispell.h.
 */
#define MAIN

/*
 * buildhash.c - make a hash table for okspell
 *
 * Pace Willisson, 1983
 *
 * Copyright 1992, 1993, Geoff Kuenning, Granada Hills, CA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All modifications to the source code must be clearly marked as
 *    such.  Binary redistributions based on modified source code
 *    must be clearly marked as modified versions in the documentation
 *    and/or other materials provided with the distribution.
 * 4. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgment:
 *      This product includes software developed by Geoff Kuenning and
 *      other unpaid contributors.
 * 5. The name of Geoff Kuenning may not be used to endorse or promote
 *    products derived from this software without specific prior
 *    written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY GEOFF KUENNING AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL GEOFF KUENNING OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * $Log: buildhash.c,v $
 * Revision 1.60  1994/01/25  07:11:18  geoff
 * Get rid of all old RCS log lines in preparation for the 3.1 release.
 *
 */

#include "config.h"
#include "ispell.h"
#include "proto.h"
#include "msgs.h"
#include "version.h"
#include <ctype.h>
#include <sys/stat.h>

/*
 * Prototypes for the functions defined in this file.  P is supplied by
 * the project headers.  mymalloc, myrealloc and myfree are not static,
 * so they are visible to other object files.
 */
int		main P ((int argc, char * argv[]));
static void	output P ((void));
static void	filltable P ((void));
VOID *		mymalloc P ((unsigned int size));
VOID *		myrealloc P ((VOID * ptr, unsigned int size,
		  unsigned int oldsize));
void		myfree P ((VOID * ptr));
static void	readdict P ((void));
static void	newcount P ((void));

#define NSTAT	100		/* Size of the probe-statistics table */

struct stat	dstat;		/* Result of stat-ing dict file */
struct stat	cstat;		/* Result of stat-ing count file */

int		hashsize;	/* Number of entries to go in hash table */

struct hashheader hashheader;	/* Header of hash table being built */
struct dent *	hashtbl;	/* Entries to go in hash table */

/* The next three are taken directly from the command line by main. */
char *		Dfile;		/* Name of dictionary file */
char *		Hfile;		/* Name of hash (output) file */
char *		Lfile;		/* Name of language file */

/* The next two are built by main as Dfile plus ".cnt" / ".stat". */
char		Cfile[MAXPATHLEN]; /* Name of count file */
char		Sfile[MAXPATHLEN]; /* Name of statistics file */

static int silent = 0;		/* NZ to suppress count reports (-s) */

/*
 * main - build a hashed dictionary file.
 *
 * Command line:  [-s] dict-file language-file hash-file
 * Leading arguments that start with '-' are consumed as options; only
 * -s (suppress count reports) has any effect, anything else is ignored.
 *
 * Steps, in order:
 *   1. Parse the language file (yyopen / yyinit / yyparse).
 *   2. Derive the count-file and statistics-file names from the
 *      dictionary name, and rebuild the count file with newcount () if
 *      it is missing or older than the dictionary.
 *   3. Read the entry count into hashsize and load the dictionary with
 *      readdict ().
 *   4. Write collision-chain statistics to the statistics file.
 *   5. Pack the chains into the table (filltable) and write the hash
 *      file (output).
 *
 * Returns 1 for a usage error or when yyopen fails; every other failure
 * terminates through exit (1).  Returns 0 otherwise.
 * NOTE(review): output () only prints a message when it cannot create
 * the hash file, so main still returns 0 in that case.
 */
int main (argc, argv)
    int		argc;
    char *	argv[];
    {
    int		avg;
    FILE *	countf;
    FILE *	statf;
    int		stats[NSTAT];
    int		i;
    int		j;

    while (argc > 1  &&  *argv[1] == '-')
	{
	argc--;
	argv++;
	switch (argv[0][1])
	    {
	    case 's':
		silent = 1;
		break;
	    default:
		/* Unrecognized options have always been ignored. */
		break;
	    }
	}
    if (argc == 4)
	{
	Dfile = argv[1];
	Lfile = argv[2];
	Hfile = argv[3];
	}
    else
	{
	(void) fprintf (stderr, BHASH_C_USAGE);
	return 1;
	}

    if (yyopen (Lfile))			/* Open the language file */
	return 1;
    yyinit ();				/* Set up for the parse */
    if (yyparse ())			/* Parse the language tables */
	exit (1);

    /*
    ** Cfile and Sfile are fixed-size buffers.  Make sure the longer of
    ** the two derived names (the ".stat" one) fits, terminator
    ** included, before formatting into them.
    */
    if (strlen (Dfile) + sizeof ".stat" > sizeof Sfile)
	{
	(void) fprintf (stderr,
	  "Dictionary file name is too long (%s)\n", Dfile);
	exit (1);
	}
    (void) sprintf (Cfile, "%s.cnt", Dfile);
    (void) sprintf (Sfile, "%s.stat", Dfile);

    if (stat (Dfile, &dstat) < 0)
	{
	(void) fprintf (stderr, BHASH_C_NO_DICT, Dfile);
	exit (1);
	}
    /* Recount when there is no count file or the dictionary is newer. */
    if (stat (Cfile, &cstat) < 0 || dstat.st_mtime > cstat.st_mtime)
	newcount ();

    if ((countf = fopen (Cfile, "r")) == NULL)
	{
	(void) fprintf (stderr, BHASH_C_NO_COUNT);
	exit (1);
	}
    hashsize = 0;
    if (fscanf (countf, "%d", &hashsize) != 1)
	hashsize = 0;
    (void) fclose (countf);
    /*
    ** A negative count is as useless as a zero one, and would turn into
    ** a huge unsigned allocation request in readdict ().
    */
    if (hashsize <= 0)
	{
	(void) fprintf (stderr, BHASH_C_BAD_COUNT);
	exit (1);
	}
    readdict ();

    if ((statf = fopen (Sfile, "w")) == NULL)
	{
	(void) fprintf (stderr, CANT_CREATE, Sfile);
	exit (1);
	}

    /*
    ** stats[k] counts the entries found at position k of a collision
    ** chain (k = 0 is the table slot itself).  Positions at or beyond
    ** NSTAT are all lumped into the last bucket.
    */
    for (i = 0; i < NSTAT; i++)
	stats[i] = 0;
    for (i = 0; i < hashsize; i++)
	{
	struct dent *   dp;

	dp = &hashtbl[i];
	if ((dp->flagfield & USED) != 0)
	    {
	    for (j = 0;  dp != NULL;  j++, dp = dp->next)
		{
		if (j >= NSTAT)
		    j = NSTAT - 1;
		stats[j]++;
		}
	    }
	}
    /*
    ** One line per chain position: position, entries at that position,
    ** running total of entries, and running average position.
    */
    for (i = 0, j = 0, avg = 0;  i < NSTAT;  i++)
	{
	j += stats[i];
	avg += stats[i] * (i + 1);
	if (j == 0)
	    (void) fprintf (statf, "%d:\t%d\t0\t0.0\n", i + 1, stats[i]);
	else
	    (void) fprintf (statf, "%d:\t%d\t%d\t%f\n", i + 1, stats[i], j,
	      (double) avg / j);
	}
    (void) fclose (statf);

    filltable ();
    output ();
    return 0;
    }

/*
 * output - write the finished hash file named by Hfile.
 *
 * The file is written in this order:
 *   1. the hash header (a placeholder first; rewritten at the end),
 *   2. the strip/affix strings of the suffix flags, then of the prefix
 *      flags,
 *   3. the string character type tables,
 *   4. the dictionary words,
 *   5. zero padding up to a struct dent boundary,
 *   6. the hash table itself,
 *   7. the suffix and prefix flag tables.
 *
 * While writing, every in-memory pointer that ends up in the file (flag
 * strip/affix strings, dent words, dent next links) is overwritten with
 * an integer offset or index cast to the pointer type, so the in-memory
 * tables are no longer usable as pointers once this function has run.
 *
 * If the file cannot be created, a message is printed and the function
 * simply returns.  Return values of the fwrite calls are not checked.
 */
static void output ()
    {
    register FILE *		houtfile;
    register struct dent *	dp;
    int				strptr;
    int				n;
    int				i;
    int				maxplen;
    int				maxslen;
    struct flagent *		fentry;

    if ((houtfile = fopen (Hfile, "wb")) == NULL)
	{
	(void) fprintf (stderr, CANT_CREATE, Hfile);
	return;
	}
    /*
    ** Write a provisional header to reserve its space; the final version
    ** is written over it after all the sizes are known.
    */
    hashheader.stringsize = 0;
    hashheader.lstringsize = 0;
    hashheader.tblsize = hashsize;
    (void) fwrite ((char *) &hashheader, sizeof hashheader, 1, houtfile);
    /* strptr is the running byte offset of the string data written so far */
    strptr = 0;
    /*
    ** Put out the strings from the flags table.  This code assumes that
    ** the size of the hash header is a multiple of the size of ichar_t,
    ** and that any integer can be converted to an (ichar_t *) and back
    ** without damage.
    */
    /* maxslen: largest |affl - stripl| over all suffix flags */
    maxslen = 0;
    for (i = numsflags, fentry = sflaglist;  --i >= 0;  fentry++)
	{
	if (fentry->stripl)
	    {
	    /* stripl + 1 ichar_t's are written, then pointer becomes offset */
	    (void) fwrite ((char *) fentry->strip, fentry->stripl + 1,
	      sizeof (ichar_t), houtfile);
	    fentry->strip = (ichar_t *) strptr;
	    strptr += (fentry->stripl + 1) * sizeof (ichar_t);
	    }
	if (fentry->affl)
	    {
	    (void) fwrite ((char *) fentry->affix, fentry->affl + 1,
	      sizeof (ichar_t), houtfile);
	    fentry->affix = (ichar_t *) strptr;
	    strptr += (fentry->affl + 1) * sizeof (ichar_t);
	    }
	n = fentry->affl - fentry->stripl;
	if (n < 0)
	    n = -n;
	if (n > maxslen)
	    maxslen = n;
	}
    /* Same treatment for the prefix flags; maxplen mirrors maxslen */
    maxplen = 0;
    for (i = numpflags, fentry = pflaglist;  --i >= 0;  fentry++)
	{
	if (fentry->stripl)
	    {
	    (void) fwrite ((char *) fentry->strip, fentry->stripl + 1,
	      sizeof (ichar_t), houtfile);
	    fentry->strip = (ichar_t *) strptr;
	    strptr += (fentry->stripl + 1) * sizeof (ichar_t);
	    }
	if (fentry->affl)
	    {
	    (void) fwrite ((char *) fentry->affix, fentry->affl + 1,
	      sizeof (ichar_t), houtfile);
	    fentry->affix = (ichar_t *) strptr;
	    strptr += (fentry->affl + 1) * sizeof (ichar_t);
	    }
	n = fentry->affl - fentry->stripl;
	if (n < 0)
	    n = -n;
	if (n > maxplen)
	    maxplen = n;
	}
    /*
    ** Write out the string character type tables.
    */
    hashheader.strtypestart = strptr;
    for (i = 0;  i < hashheader.nstrchartype;  i++)
	{
	n = strlen (chartypes[i].name) + 1;
	(void) fwrite (chartypes[i].name, n, 1, houtfile);
	strptr += n;
	n = strlen (chartypes[i].deformatter) + 1;
	(void) fwrite (chartypes[i].deformatter, n, 1, houtfile);
	strptr += n;
	/*
	** suffixes is a run of NUL-terminated strings ended by an empty
	** string; step over each string to find the total length.
	*/
	for (n = 0;
	  chartypes[i].suffixes[n] != '\0';
	  n += strlen (&chartypes[i].suffixes[n]) + 1)
	    ;
	n++;			/* Include the final terminating NUL */
	(void) fwrite (chartypes[i].suffixes, n, 1, houtfile);
	strptr += n;
	}
    /* Everything written so far is language-table string data */
    hashheader.lstringsize = strptr;
    /* We allow one extra byte because missingletter() may add one byte */
    maxslen += maxplen + 1;
    /* Exceeding MAXAFFIXLEN only produces a warning; writing continues */
    if (maxslen > MAXAFFIXLEN)
	{
	(void) fprintf (stderr,
	  BHASH_C_BAFF_1 (MAXAFFIXLEN, maxslen - MAXAFFIXLEN));
	(void) fprintf (stderr, BHASH_C_BAFF_2);
	}
    /* Put out the dictionary strings */
    for (i = 0, dp = hashtbl;  i < hashsize;  i++, dp++)
	{
	/* A missing word is stored as -1; otherwise as its string offset */
	if (dp->word == NULL)
	    dp->word = (char *) -1;
	else
	    {
	    n = strlen (dp->word) + 1;
	    (void) fwrite (dp->word, n, 1, houtfile);
	    dp->word = (char *) strptr;
	    strptr += n;
	    }
	}
    /* Pad file to a struct dent boundary for efficiency. */
    n = (strptr + sizeof hashheader) % sizeof (struct dent);
    if (n != 0)
	{
	n = sizeof (struct dent) - n;
	strptr += n;
	while (--n >= 0)
	    (void) putc ('\0', houtfile);
	}
    /* Put out the hash table itself */
    for (i = 0, dp = hashtbl;  i < hashsize;  i++, dp++)
	{
	/* Chain links are stored as table indices, with -1 for "none" */
	if (dp->next != 0)
	    {
	    int		x;
	    x = dp->next - hashtbl;
	    dp->next = (struct dent *)x;
	    }
	else
	    {
	    dp->next = (struct dent *)-1;
	    }
#ifdef PIECEMEAL_HASH_WRITES
	(void) fwrite ((char *) dp, sizeof (struct dent), 1, houtfile);
#endif /* PIECEMEAL_HASH_WRITES */
	}
#ifndef PIECEMEAL_HASH_WRITES
    /* Without PIECEMEAL_HASH_WRITES the whole table goes out in one call */
    (void) fwrite ((char *) hashtbl, sizeof (struct dent), hashsize, houtfile);
#endif /* PIECEMEAL_HASH_WRITES */
    /* Put out the language tables */
    (void) fwrite ((char *) sflaglist,
      sizeof (struct flagent), numsflags, houtfile);
    hashheader.stblsize = numsflags;
    (void) fwrite ((char *) pflaglist,
      sizeof (struct flagent), numpflags, houtfile);
    hashheader.ptblsize = numpflags;
    /* Finish filling in the hash header. */
    hashheader.stringsize = strptr;
    /* Go back to the start and replace the provisional header */
    rewind (houtfile);
    (void) fwrite ((char *) &hashheader, sizeof hashheader, 1, houtfile);
    (void) fclose (houtfile);
    }

/*
 * filltable - move collision-chain entries that live outside the hash
 * table into unused slots of the table itself.
 *
 * For every used slot whose "next" link points outside
 * [hashtbl, hashtbl + hashsize), the chain is followed and each entry is
 * copied into the next free (not USED) table slot, with the links
 * rewritten to point at the copies.  Chains whose first link already
 * points into the table are skipped.
 *
 * If the table runs out of free slots, the current chain is abandoned
 * and counted; the total is reported with BHASH_C_OVERFLOW.
 */
static void filltable ()
    {
    struct dent *freepointer, *nextword, *dp;
    struct dent *hashend;
    int i;
    int overflows;

    hashend = hashtbl + hashsize;
    /*
    ** Find the first free slot.  The bounds test has to come first:
    ** when every slot is in use, freepointer reaches hashend, and
    ** hashend must not be dereferenced.
    */
    for (freepointer = hashtbl;
      freepointer < hashend  &&  (freepointer->flagfield & USED);
      freepointer++)
	;
    overflows = 0;
    for (nextword = hashtbl, i = hashsize; i != 0; nextword++, i--)
	{
	if ((nextword->flagfield & USED) == 0)
	    continue;
	/* A link that already points into the table needs no work. */
	if (nextword->next >= hashtbl  &&  nextword->next < hashend)
	    continue;
	dp = nextword;
	while (dp->next)
	    {
	    if (freepointer >= hashend)
		{
		overflows++;
		break;
		}
	    else
		{
		/* Copy the out-of-table entry into the free slot. */
		*freepointer = *(dp->next);
		dp->next = freepointer;
		dp = freepointer;

		/* Advance to the next free slot, bounds test first. */
		while (freepointer < hashend
		  &&  (freepointer->flagfield & USED))
		    freepointer++;
		}
	    }
	}
    if (overflows)
	(void) fprintf (stderr, BHASH_C_OVERFLOW, overflows);
    }

#if MALLOC_INCREMENT == 0
/*
 * mymalloc - allocate "size" bytes by passing the request straight to
 * malloc.  Returns whatever malloc returns.
 */
VOID * mymalloc (size)
    unsigned int	size;
    {
    VOID *		block;

    block = malloc (size);
    return block;
    }

/*
 * myrealloc - resize the block at "ptr" to "size" bytes by passing the
 * request straight to realloc.  "oldsize" is accepted but not used by
 * this variant.  Returns whatever realloc returns.
 */
/* ARGSUSED */
VOID * myrealloc (ptr, size, oldsize)
    VOID *		ptr;
    unsigned int	size;
    unsigned int	oldsize;
    {
    VOID *		resized;

    resized = realloc (ptr, size);
    return resized;
    }

/*
 * myfree - release the block at "ptr" by passing it straight to free.
 */
void myfree (ptr)
    VOID *	ptr;
    {

    free (ptr);
    }

#else

/*
 * mymalloc - fast variant of malloc whose blocks cannot be freed.
 *
 * Requests are rounded up to at least 4 bytes and then to a multiple
 * of 8 (doubleword boundaries are assumed to be sufficient alignment),
 * and are carved sequentially out of a larger region obtained from
 * malloc.  When the current region cannot satisfy a request, a new one
 * of MALLOC_INCREMENT bytes (or the request size, if that is larger) is
 * obtained; whatever was left of the old region is no longer used.
 *
 * Returns the block, or NULL if malloc could not supply a new region.
 */
VOID * mymalloc (size)
    unsigned int	size;
    {
    static int		avail = 0;	/* Bytes left in current region */
    static VOID *	pool;		/* Next unused byte of the region */
    VOID *		chunk;

    size = (size < 4) ? 4 : size;
    size = (size + 7) & ~7;
    if (avail < size)
	{
	if (size < MALLOC_INCREMENT)
	    avail = MALLOC_INCREMENT;
	else
	    avail = size;
	pool = malloc ((unsigned) avail);
	if (pool == NULL)
	    {
	    avail = 0;
	    return NULL;
	    }
	}
    chunk = pool;
    pool = (VOID *) ((char *) chunk + size);
    avail -= size;
    return chunk;
    }

/*
 * myrealloc - realloc counterpart of the unfree-able mymalloc.
 *
 * A new block of "size" bytes is obtained from mymalloc and the
 * contents of the old block are copied into it.  The old block is
 * not released.
 *
 *   ptr      the old block (may be NULL, in which case nothing is copied)
 *   size     the requested new size in bytes
 *   oldsize  the number of valid bytes in the old block
 *
 * Only the smaller of oldsize and size is copied, so shrinking a block
 * cannot write past the end of the new one.
 *
 * Returns the new block, or NULL if mymalloc failed.
 */
VOID * myrealloc (ptr, size, oldsize)
    VOID *		ptr;
    unsigned int	size;
    unsigned int	oldsize;
    {
    VOID *		nptr;
    unsigned int	ncopy;

    nptr = mymalloc (size);
    if (nptr == NULL)
	return NULL;
    ncopy = (oldsize < size) ? oldsize : size;
    if (ptr != NULL  &&  ncopy != 0)
	(void) bcopy (ptr, nptr, ncopy);
    return nptr;
    }

/*
 * myfree - deliberately does nothing in this variant: blocks handed out
 * by the fast mymalloc above are carved out of larger regions and cannot
 * be returned individually.
 */
/* ARGSUSED */
void myfree (ptr)
    VOID *		ptr;
    {
    }
#endif

/*
 * readdict - read the dictionary file (Dfile) into the hash table.
 *
 * Allocates hashtbl with hashsize zeroed entries, then reads the
 * dictionary one line at a time.  Each line is converted to a dent by
 * makedent (lines it rejects are skipped) and hashed.  If the slot is
 * free the entry is stored there; otherwise the collision chain is
 * searched for an entry whose word matches the upper-cased new word:
 *   - if one is found, the two are merged with combinecaps;
 *   - if not, a new entry is allocated with mymalloc and appended to
 *     the end of the chain.
 * With capitalization support, a newly stored FOLLOWCASE entry is
 * passed to addvheader.
 *
 * Unless -s was given, a progress count is printed to stderr every
 * 1000 lines.  Any failure (cannot open the file, out of memory, or an
 * error from addvheader/combinecaps) terminates with exit (1).
 */
static void readdict ()
    {
    struct dent		d;
    register struct dent * dp;
    struct dent *	lastdp;
    char		lbuf[INPUTWORDLEN + MAXAFFIXLEN + 2 * MASKBITS];
    char		ucbuf[INPUTWORDLEN + MAXAFFIXLEN + 2 * MASKBITS];
    FILE *		dictf;
    int			i;
    int			h;

    if ((dictf = fopen (Dfile, "r")) == NULL)
	{
	(void) fprintf (stderr, BHASH_C_CANT_OPEN_DICT);
	exit (1);
	}

    hashtbl =
      (struct dent *) calloc ((unsigned) hashsize, sizeof (struct dent));
    if (hashtbl == NULL)
	{
	(void) fprintf (stderr, BHASH_C_NO_SPACE);
	exit (1);
	}

    i = 0;
    while (fgets (lbuf, sizeof lbuf, dictf) != NULL)
	{
	if (!silent  &&  (i % 1000) == 0)
	    {
	    (void) fprintf (stderr, "%d ", i);
	    /* Flush the stream the progress count was written to. */
	    (void) fflush (stderr);
	    }
	i++;

	if (makedent (lbuf, sizeof lbuf, &d) < 0)
	    continue;

	h = hash (strtosichar (d.word, 1), hashsize);

	dp = &hashtbl[h];
	if ((dp->flagfield & USED) == 0)
	    {
	    *dp = d;
#ifndef NO_CAPITALIZATION_SUPPORT
	    /*
	    ** If it's a followcase word, we need to make this a
	    ** special dummy entry, and add a second with the
	    ** correct capitalization.
	    */
	    if (captype (d.flagfield) == FOLLOWCASE)
		{
		if (addvheader (dp))
		    exit (1);
		}
#endif
	    }
	else
	    {

	    /*
	    ** Collision.  Skip to the end of the collision
	    ** chain, or to a pre-existing entry for this
	    ** word.  Note that d.word always exists at
	    ** this point.
	    */
	    (void) strcpy (ucbuf, d.word);
	    chupcase (ucbuf);
	    while (dp != NULL)
		{
		if (strcmp (dp->word, ucbuf) == 0)
		    break;
#ifndef NO_CAPITALIZATION_SUPPORT
		/* Step over the rest of this word's variant entries. */
		while (dp->flagfield & MOREVARIANTS)
		    dp = dp->next;
#endif /* NO_CAPITALIZATION_SUPPORT */
		dp = dp->next;
		}
	    if (dp != NULL)
		{
		/*
		** A different capitalization is already in
		** the dictionary.  Combine capitalizations.
		*/
		if (combinecaps (dp, &d) < 0)
		    exit (1);
		}
	    else
		{
		/* Insert a new word into the dictionary */
		for (dp = &hashtbl[h];  dp->next != NULL;  )
		    dp = dp->next;
		lastdp = dp;
		dp = (struct dent *) mymalloc (sizeof (struct dent));
		if (dp == NULL)
		    {
		    (void) fprintf (stderr, BHASH_C_COLLISION_SPACE);
		    exit (1);
		    }
		*dp = d;
		lastdp->next = dp;
		dp->next = NULL;
#ifndef NO_CAPITALIZATION_SUPPORT
		/*
		** If it's a followcase word, we need to make this a
		** special dummy entry, and add a second with the
		** correct capitalization.
		*/
		if (captype (d.flagfield) == FOLLOWCASE)
		    {
		    if (addvheader (dp))
			exit (1);
		    }
#endif
		}
	    }
	}
    if (!silent)
	(void) fprintf (stderr, "\n");
    (void) fclose (dictf);
    }

/*
 * bumpcount - add one to the running entry count and, unless -s was
 * given, print the new count to stderr each time it reaches a multiple
 * of 1000.  Returns the incremented count.
 */
static int bumpcount (count)
    int			count;
    {
    count++;
    if ((count % 1000) == 0  &&  !silent)
	{
	(void) fprintf (stderr, "%d ", count);
	/* Flush the stream the progress count was written to. */
	(void) fflush (stderr);
	}
    return count;
    }

/*
 * newcount - count the hash entries the dictionary will need and write
 * that number to the count file (Cfile).
 *
 * Every line of the dictionary file (Dfile) counts as one entry.  With
 * capitalization support, extra entries are counted as follows, based
 * on comparing each line's upper-cased word with the previous line's:
 *   - the first line that repeats the previous word counts one extra;
 *   - a FOLLOWCASE word for which no extra entry has yet been counted
 *     counts one extra.
 *
 * Failure to open the dictionary or to create the count file
 * terminates with exit (1).
 */
static void newcount ()
    {
    char		buf[INPUTWORDLEN + MAXAFFIXLEN + 2 * MASKBITS];
#ifndef NO_CAPITALIZATION_SUPPORT
    ichar_t		ibuf[INPUTWORDLEN + MAXAFFIXLEN + 2 * MASKBITS];
    ichar_t		lastibuf[sizeof ibuf / sizeof (ichar_t)];
    int			headercounted;
    int			followcase;
    register char *	cp;
#endif
    register FILE *	d;
    register int	i;

    if (!silent)
	(void) fprintf (stderr, BHASH_C_COUNTING);

    if ((d = fopen (Dfile, "r")) == NULL)
	{
	(void) fprintf (stderr, BHASH_C_CANT_OPEN_DICT);
	exit (1);
	}

#ifndef NO_CAPITALIZATION_SUPPORT
    headercounted = 0;
    lastibuf[0] = 0;
#endif
    i = 0;
    while (fgets (buf, sizeof buf, d))
	{
	i = bumpcount (i);
#ifndef NO_CAPITALIZATION_SUPPORT
	/* Cut the line off at the flag marker, leaving only the word. */
	cp = index (buf, hashheader.flagmarker);
	if (cp != NULL)
	    *cp = '\0';
	if (strtoichar (ibuf, buf, INPUTWORDLEN * sizeof (ichar_t), 1))
	    (void) fprintf (stderr, WORD_TOO_LONG (buf));
	followcase = (whatcap (ibuf) == FOLLOWCASE);
	upcase (ibuf);
	if (icharcmp (ibuf, lastibuf) != 0)
	    headercounted = 0;
	else if (!headercounted)
	    {
	    /* First duplicate will take two entries */
	    i = bumpcount (i);
	    headercounted = 1;
	    }
	if (!headercounted  &&  followcase)
	    {
	    /* It's followcase and the first entry -- count again */
	    i = bumpcount (i);
	    headercounted = 1;
	    }
	(void) icharcpy (lastibuf, ibuf);
#endif
	}
    (void) fclose (d);
    if (!silent)
	(void) fprintf (stderr, BHASH_C_WORD_COUNT, i);
    if ((d = fopen (Cfile, "w")) == NULL)
	{
	(void) fprintf (stderr, CANT_CREATE, Cfile);
	exit (1);
	}
    (void) fprintf (d, "%d\n", i);
    (void) fclose (d);
    }

These are the contents of the former NiCE NeXT User Group NeXTSTEP/OpenStep software archive, currently hosted by Netfuture.ch.