
/* WIDE AREA INFORMATION SERVER SOFTWARE:
   No guarantees or restrictions.  See the readme file for the full standard
   disclaimer.

   Brewster@think.com
*/

/* Change log:
 * $Log:	irinv.c,v $
 * Revision 1.28  92/04/28  16:55:50  morris
 * added boolean to serial engine
 * 
 * Revision 1.27  92/03/19  09:34:26  morris
 * fixed the dictionary header to accurately indicate the number of blocks
 * 
 * Revision 1.26  92/03/01  16:11:16  brewster
 * took out the analyze_hashtable_distribution
 * 
 * Revision 1.25  92/02/12  13:27:31  jonathan
 * Added "$Log:	irinv.c,v $
 * Revision 1.28  92/04/28  16:55:50  morris
 * added boolean to serial engine
 * 
 * Revision 1.27  92/03/19  09:34:26  morris
 * fixed the dictionary header to accurately indicate the number of blocks
 * 
 * Revision 1.26  92/03/01  16:11:16  brewster
 * took out the analyze_hashtable_distribution
 * " so RCS will put the log message in the header
 * 
*/

/* Inverted file accessor functions. 
   
Main functions:
  finished_add_word

*/


#include <string.h> /* for memset() */

#include "cutil.h"
#include "futil.h"
#include "irhash.h"
#include "hash.h"
#include "panic.h"
#include "irfiles.h"
#include "irext.h"

/* ================== */
/* === Index file === */
/* ================== */

/* This is an implementation of the inverted file itself.
 * An index_block_number is the position in the file that the
 * entry starts.
 * Index block 0 is the null pointer.
 * The header contains the number of different words in the file.
 *
 */

/* the format of an index block is:
 *  in the first byte:
 *  INDEX_BLOCK_FULL_FLAG 
 *    || INDEX_BLOCK_NOT_FULL_FLAG
 *    || INDEX_BLOCK_DICTIONARY_FLAG
 * if index_block_full_flag
 *  next_index_block
 *  index_block_size
 *  stuff
 *  (the number of valid entries is index_block_size - index_block_header_size)
 * if index_block_not_full_flag
 *  number_of_valid_entries
 *  index_block_size
 *  stuff
 * if index_block_dictionary_flag
 *  last_index_block	  NEXT_INDEX_BLOCK_SIZE (0 when this is the last)
 *  index_block_size      INDEX_BLOCK_SIZE_SIZE
 *  number_of_occurances  NUMBER_OF_OCCURANCES_SIZE
 *  word (followed by \n)
 *
 *
 * "Stuff" is a list of word occurances of the format:
 * (this is written in irhash.c and read in sersrch.c)
 * 	doc_id
 *	character_position
 *	weight
 *	doc_id
 *	character_position
 * 	weight 
 * 	etc.
 *
 *   It should be (probably in release 9):
 *	doc_id (with high bit set)
 *	weight
 *	character_position
 *	...
 *	character_position
 *	doc_id
 *	weight
 *	character_position
 *	etc.
 * 	
 */
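
/* The following is an illustrative sketch only and is kept out of the build
 * with "#if 0": it reads one dictionary block and then skips over the full
 * (posting) block that follows it in a freshly written index file, mirroring
 * the layout described above.  It uses only routines and size constants that
 * already appear elsewhere in this file; error handling is omitted.
 */
#if 0
static long skim_one_word(stream,word)
FILE *stream;
char *word;			/* at least MAX_WORD_LENGTH + 1 bytes */
{
  long flag;
  long block_size;
  long number_of_occurances;
  char temp[MAX_WORD_LENGTH + 2];

  /* dictionary block: flag, next block, block size, occurrence count, word\n */
  flag = read_bytes(INDEX_BLOCK_FLAG_SIZE, stream);
  if(flag != INDEX_BLOCK_DICTIONARY_FLAG)
    return(-1);
  (void)read_bytes(NEXT_INDEX_BLOCK_SIZE, stream);
  (void)read_bytes(INDEX_BLOCK_SIZE_SIZE, stream);
  number_of_occurances = read_bytes(NUMBER_OF_OCCURANCES_SIZE, stream);
  fgets(temp, MAX_WORD_LENGTH + 2, stream);
  if(temp[strlen(temp) - 1] == '\n')
    temp[strlen(temp) - 1] = '\0';
  strcpy(word, temp);

  /* full block: flag, next block, block size, then the "stuff" itself */
  (void)read_bytes(INDEX_BLOCK_FLAG_SIZE, stream);
  (void)read_bytes(NEXT_INDEX_BLOCK_SIZE, stream);
  block_size = read_bytes(INDEX_BLOCK_SIZE_SIZE, stream);
  s_fseek(stream, block_size - INDEX_BLOCK_HEADER_SIZE, SEEK_CUR);

  return(number_of_occurances);
}
#endif /* 0 */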

long 
write_dictionary_index_block(number_of_occurances,word,stream)
long number_of_occurances;
char *word;
FILE *stream;
/* returns a pointer to the index block allocated */
{
  /* this assumes that the stream is positioned at the end */
  long before_length = ftell(stream); /* file_length(stream); */
  
  long index_block_size = 
    INDEX_BLOCK_FLAG_SIZE +
      NEXT_INDEX_BLOCK_SIZE +
	INDEX_BLOCK_SIZE_SIZE +
	  NUMBER_OF_OCCURANCES_SIZE +
	    strlen(word) +
	      strlen("\n");	/* this is done this way for PC's (necessary?) */
  /* printf("writing dict entry to position %ld\n", before_length); */
  /* grow the file */
  /* in this implementation, the file is always filled by the fwrite,
   * so it will grow itself.
   * grow_file(stream, before_length + how_large);
   * file length leaves the stream at the end, so we are all set.
   * s_fseek(stream, before_length, SEEK_SET);
   */
  write_bytes(INDEX_BLOCK_DICTIONARY_FLAG, INDEX_BLOCK_FLAG_SIZE,
	      stream);	
  write_bytes(0L, NEXT_INDEX_BLOCK_SIZE, stream);
  write_bytes(index_block_size, INDEX_BLOCK_SIZE_SIZE, stream);

  write_bytes(number_of_occurances, NUMBER_OF_OCCURANCES_SIZE, 
	      stream);
  if(strlen(word) > MAX_WORD_LENGTH) /* for debugging */
    waislog(WLOG_HIGH, WLOG_ERROR, 
	    "Word: '%s' is too long (%ld characters) for some reason.",
	    word, strlen(word));

  fprintf(stream, "%s\n", word); 

  /* just to check
  { long after_length = ftell(stream);
    if(after_length - before_length != index_block_size){
      waislog(WLOG_HIGH, WLOG_ERROR, 
	      "dictionary entry size is %ld, and we thought %ld",
	      after_length - before_length, index_block_size);
    }
  }
  */

  return(before_length);
}

#ifdef everneeded
static long modify_dictionary_index_block 
  _AP((long index_block,long last_index_block,long number_of_occurances,
       FILE* stream));

static long modify_dictionary_index_block(index_block,last_index_block,number_of_occurances,stream)
long index_block;   
long last_index_block;
long number_of_occurances;
FILE* stream;

     /* this does not allow one to change the word itself, only the entries
	around it.  It will panic if the index_block is not a valid block.
	This leaves the stream pointing at the end of the file.
	*/
{
  s_fseek(stream, index_block, SEEK_SET);
  if(INDEX_BLOCK_DICTIONARY_FLAG != read_bytes(INDEX_BLOCK_FLAG_SIZE, stream))
    panic("the index block %ld is not a legal dictionary block",
	  index_block);
  write_bytes(number_of_occurances, NUMBER_OF_OCCURANCES_SIZE, 
	      stream);
  write_bytes(last_index_block, NEXT_INDEX_BLOCK_SIZE, stream);
  read_bytes(INDEX_BLOCK_SIZE_SIZE, stream); /* ignore it */
  write_bytes(number_of_occurances, NUMBER_OF_OCCURANCES_SIZE, 
	      stream);
  s_fseek(stream, 0L, SEEK_END);
}

#endif /* def everneeded */

static long next_dictionary_index_block
  _AP((long* index_block_size,long* number_of_occurances,char* word,
       FILE* stream));

static long
next_dictionary_index_block(index_block_size,number_of_occurances,word,stream)
long *index_block_size;
long *number_of_occurances;
char *word;
FILE *stream;
{
  /* this reads the dictionary index block from the index stream.
     It assumes the stream is positioned at the start of a dictionary
     block, and will return non-0 if it is not.
     returns 0 if it succeeds.
     returns -1 if it is at the end of the file.
     returns -2 if it read something strange.
     Always sets word length to 0 if it fails. */
  long index_block_flag;
  char temp[MAX_WORD_LENGTH + 2];
  
  word[0] = '\0';

  index_block_flag = read_bytes(INDEX_BLOCK_FLAG_SIZE, stream);
  if(index_block_flag == EOF){
    /* not an error, it is the way it knows it is done 
    waislog(WLOG_HIGH, WLOG_ERROR, "Hit the end of the inverted file too early");
    */
    return(-1);
  }
  if(index_block_flag != INDEX_BLOCK_DICTIONARY_FLAG){
    waislog(WLOG_HIGH, WLOG_ERROR, 
	    "Did not find a dictionary entry, flag is %ld at file position %ld", 
	    index_block_flag, ftell(stream) - INDEX_BLOCK_FLAG_SIZE);
    return(-2);
  }
  (void)read_bytes(NEXT_INDEX_BLOCK_SIZE, stream);
  *index_block_size = read_bytes(INDEX_BLOCK_SIZE_SIZE, stream);
  *number_of_occurances = read_bytes(NUMBER_OF_OCCURANCES_SIZE,
				     stream);
  if(NULL == fgets(temp, MAX_WORD_LENGTH + 2, stream)) /* 2 is for the \n and '\0' */
    return(-2);			/* truncated block: word length stays 0 */

  /* trim the \n */
  if(temp[strlen(temp) - 1] == '\n'){
    temp[strlen(temp) - 1] = '\0';
  }
  strcpy(word, temp);
  /* printf("Merging word '%s'\n", word); */

  return(0);
}

long
read_dictionary_index_block(index_block,
			    last_index_block,
			    index_block_size,
			    number_of_occurances,
			    word,
			    stream)
long index_block;
long *last_index_block;
long *index_block_size;
long *number_of_occurances;
char *word;
FILE *stream;
{
  /* this reads the dictionary index block from the index stream.
     returns 0 if it succeeds. */
  long answer;
  s_fseek(stream, index_block, SEEK_SET);
  if(0 > (answer = next_dictionary_index_block(index_block_size,
						    number_of_occurances,
						    word,
						    stream)))
    panic("read dictionary block %ld failed", index_block);

  *last_index_block = 0;
  return(answer);
}


long allocate_index_block(how_large,stream)
long how_large;
FILE* stream;
{
  /* This returns a pointer for an index block.
     It creates the space and writes the header.
     how_large includes the header.
     Returns the block_number (the byte address of the first byte in the block). */
  /* this assumes that the stream is positioned at the end */
  long before_length = ftell(stream); /* file_length(stream); */
  /* grow the file */
  /* in this implementation, the file is always filled by the fwrite,
   * so it will grow itself.
   * grow_file(stream, before_length + how_large);
   * file length leaves the stream at the end, so we are all set.
   * s_fseek(stream, before_length, SEEK_SET);
   */
  write_bytes(INDEX_BLOCK_FULL_FLAG, /* in this version all are full */
	      INDEX_BLOCK_FLAG_SIZE,
	      stream);
  write_bytes(0L, NEXT_INDEX_BLOCK_SIZE, stream);
  write_bytes(how_large, INDEX_BLOCK_SIZE_SIZE, stream);
  return(before_length);
}
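
/* Illustrative sketch only (kept out of the build with "#if 0"): how
 * allocate_index_block is meant to be paired with an fwrite of the posting
 * data, as flush_word_entry_to_file does further below.  The variables
 * "buffer" and "length" are placeholders for the in-memory postings of one
 * word and their size in bytes.
 */
#if 0
  {
    char *buffer;		/* posting data for one word (placeholder) */
    long length;		/* number of bytes in buffer (placeholder) */
    long block_number;		/* byte address of the new block */

    block_number = allocate_index_block(length + INDEX_BLOCK_HEADER_SIZE,
					db->index_stream);
    if(length != fwrite(buffer, 1L, length, db->index_stream))
      panic("Write failed");
  }
#endif /* 0 */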

#ifdef testing

static void scan_index_blocks _AP((char* filename,boolean verbose));

static void scan_index_blocks(filename,verbose)
char *filename;
boolean verbose;
{
  /* this is a debugging routine for checking the inverted index file */
  long current_index_block = INDEX_HEADER_SIZE;
  FILE *stream = s_fopen(filename, "rb");
  long length = file_length(stream);
  
  s_fseek(stream, current_index_block, SEEK_SET);
  printf("File length %ld\n", length);

  while(current_index_block < length){
    long flag = read_bytes(INDEX_BLOCK_FLAG_SIZE, stream);
    long next_index_block = read_bytes(NEXT_INDEX_BLOCK_SIZE, stream);
    long index_block_size = read_bytes(INDEX_BLOCK_SIZE_SIZE, stream);
    if(flag == INDEX_BLOCK_DICTIONARY_FLAG){
      long last_index_block;
      long index_block_size;
      long number_of_occurances;
      char word[MAX_WORD_LENGTH + 1];
      if(0 > read_dictionary_index_block(current_index_block,
				  &last_index_block,
				  &index_block_size,
				  &number_of_occurances,
				  word,
				  stream))
	panic("read dictionary index block failed");
      if(verbose)
	printf("%5ld: size %3ld Dictionary entry '%s', number of occurances %ld last block %ld\n",
	       current_index_block, index_block_size, word, 
	       number_of_occurances, next_index_block);
    }
    else if(flag == INDEX_BLOCK_NOT_FULL_FLAG){
      if(verbose)
	printf("%5ld: size %3ld Not full, valid entries %ld\n",
	       current_index_block, index_block_size, next_index_block);
    }
    else if(flag == INDEX_BLOCK_FULL_FLAG){
      if(verbose)
	printf("%5ld: size %3ld full block, next block %ld\n",
	     current_index_block, index_block_size, next_index_block);
    }
    else 
      panic("bad entry %ld (ftell %ld), flag was %ld", 
	    current_index_block, 
	    ftell(stream), flag);
    current_index_block += index_block_size;
    s_fseek(stream, current_index_block, SEEK_SET);
  }
  s_fclose(stream);
}   

#endif /* def testing */

#define COPY_BLOCK_BUFFER_LENGTH 1000

static long copy_stream _AP((FILE* from_stream,FILE* to_stream,long n));

static long copy_stream(from_stream,to_stream,n)
FILE *from_stream;
FILE *to_stream;
long n;
{
  char buffer[COPY_BLOCK_BUFFER_LENGTH];
  while(n > 0){
    /* keep reading until we are done */
    long amount_read = fread(buffer, sizeof(char), 
			     MIN(n, COPY_BLOCK_BUFFER_LENGTH),
			     from_stream);
    if(amount_read <= 0)	/* fread returns 0 on error or end of file */
      return(-1);
    if(amount_read != fwrite(buffer, sizeof(char), amount_read, to_stream))
      return(-1);
    n -= amount_read;
  }
  return(0);
}


static void merge_blocks _AP((char* word,FILE* from_stream_1,
			      FILE* from_stream_2,FILE* to_stream));

static void merge_blocks(word,from_stream_1,from_stream_2,to_stream)
char* word;
FILE *from_stream_1;
FILE *from_stream_2;
FILE *to_stream;
/* copies from_stream_1 into to_stream first, then from_stream_2 */
{
  long flag;
  long index_block_size;
  long other_block_size;
    
  flag = read_bytes(INDEX_BLOCK_FLAG_SIZE, from_stream_1);
  (void)read_bytes(NEXT_INDEX_BLOCK_SIZE, from_stream_1);
  index_block_size = read_bytes(INDEX_BLOCK_SIZE_SIZE, from_stream_1);

  if(flag == EOF) return;
  if(flag != INDEX_BLOCK_FULL_FLAG)
    panic("the next index block is not a full block");

  flag = read_bytes(INDEX_BLOCK_FLAG_SIZE, from_stream_2);
  (void)read_bytes(NEXT_INDEX_BLOCK_SIZE, from_stream_2);
  other_block_size = read_bytes(INDEX_BLOCK_SIZE_SIZE, from_stream_2);
  if(flag != INDEX_BLOCK_FULL_FLAG)
    panic("the next index block is not a full block");

  write_bytes(flag, INDEX_BLOCK_FLAG_SIZE, to_stream);
  write_bytes(0L, NEXT_INDEX_BLOCK_SIZE, to_stream);
  if((index_block_size + other_block_size) >
     (1L << (INDEX_BLOCK_SIZE_SIZE * 8))){
    /* then the block is too large to be represented in
       INDEX_BLOCK_SIZE_SIZE bytes.  This routine takes the radical step of
       eliminating it completely.  The "right" thing to do is to
       chain blocks, but I don't think it is worth it since this should not
       happen very often.  If it does happen often then bump up
       INDEX_BLOCK_SIZE_SIZE. */
    waislog(WLOG_LOW, WLOG_INFO,
	    "Can not merge the index block for %s, since it would create one that is too big.  Deleting it.",word);
    write_bytes(INDEX_BLOCK_HEADER_SIZE, INDEX_BLOCK_SIZE_SIZE, to_stream);
    s_fseek(from_stream_1, index_block_size - INDEX_BLOCK_HEADER_SIZE, SEEK_CUR);
    s_fseek(from_stream_2, other_block_size - INDEX_BLOCK_HEADER_SIZE, SEEK_CUR);
  }
  else{ /* copy away */
    write_bytes(index_block_size + other_block_size - INDEX_BLOCK_HEADER_SIZE, 
		INDEX_BLOCK_SIZE_SIZE, to_stream);
    copy_stream(from_stream_1, to_stream, index_block_size - INDEX_BLOCK_HEADER_SIZE);
    copy_stream(from_stream_2, to_stream, other_block_size - INDEX_BLOCK_HEADER_SIZE);
  }
}

static void make_dummy_block _AP((FILE* from_stream_1,FILE* from_stream_2,
				  FILE* to_stream));

static void make_dummy_block(from_stream_1,from_stream_2,to_stream)
FILE *from_stream_1;
FILE *from_stream_2;
FILE *to_stream;
/* deletes block from_stream_1 and from_stream_2 and makes a 0 length
   block on to_stream */
{
  long flag;
  long index_block_size;
  long other_block_size;
    
  flag = read_bytes(INDEX_BLOCK_FLAG_SIZE, from_stream_1);
  (void)read_bytes(NEXT_INDEX_BLOCK_SIZE, from_stream_1);
  index_block_size = read_bytes(INDEX_BLOCK_SIZE_SIZE, from_stream_1);

  if(flag == EOF) return;
  if(flag != INDEX_BLOCK_FULL_FLAG)
    panic("the next index block is not a full block");

  flag = read_bytes(INDEX_BLOCK_FLAG_SIZE, from_stream_2);
  (void)read_bytes(NEXT_INDEX_BLOCK_SIZE, from_stream_2);
  other_block_size = read_bytes(INDEX_BLOCK_SIZE_SIZE, from_stream_2);
  if(flag != INDEX_BLOCK_FULL_FLAG)
    panic("the next index block is not a full block");

  write_bytes(flag, INDEX_BLOCK_FLAG_SIZE, to_stream);
  write_bytes(0L, NEXT_INDEX_BLOCK_SIZE, to_stream);
  write_bytes(INDEX_BLOCK_HEADER_SIZE, INDEX_BLOCK_SIZE_SIZE, to_stream);
  s_fseek(from_stream_1, index_block_size - INDEX_BLOCK_HEADER_SIZE, SEEK_CUR);
  s_fseek(from_stream_2, other_block_size - INDEX_BLOCK_HEADER_SIZE, SEEK_CUR);
}

static void copy_block _AP((FILE* from_stream,FILE* to_stream));

static void copy_block(from_stream,to_stream)
FILE *from_stream;
FILE *to_stream;
/* copies an index block from from_stream to to_stream */
{
  long flag;
  long next_index_block;
  long index_block_size;

  flag = read_bytes(INDEX_BLOCK_FLAG_SIZE, from_stream);
  next_index_block = read_bytes(NEXT_INDEX_BLOCK_SIZE, from_stream);
  index_block_size = read_bytes(INDEX_BLOCK_SIZE_SIZE, from_stream);

  if(flag == EOF) return;
  write_bytes(flag, INDEX_BLOCK_FLAG_SIZE, to_stream);
  write_bytes(next_index_block, NEXT_INDEX_BLOCK_SIZE, to_stream);
  write_bytes(index_block_size, INDEX_BLOCK_SIZE_SIZE, to_stream);
  /* copy the real stuff */
  copy_stream(from_stream, to_stream, 
	      index_block_size - INDEX_BLOCK_HEADER_SIZE);
}

/* ================== */
/* === Merge File === */
/* ================== */

static boolean merge_index_file _AP((long into_version, long from_version,
				  database* db,
				  boolean generate_dictionary));

/* Merges an index file (from_version) into another (into_version).
   Returns true if successful */
static boolean merge_index_file(into_version, from_version, 
				db,generate_dictionary)
long into_version;
long from_version;
database* db;
boolean generate_dictionary;
/* 
   This is done by merging both into version -1, then renaming that
   file to into_version and deleting from_version. */
{
  char input_filename_1[MAX_FILENAME_LEN]; /* into_version file */
  char input_filename_2[MAX_FILENAME_LEN]; /* from_version file */
  char output_filename[MAX_FILENAME_LEN];  /* version -1 */

  FILE *input_stream_1;
  FILE *input_stream_2;
  FILE *output_stream;
  char input_word_1[MAX_WORD_LENGTH + 1];
  char input_word_2[MAX_WORD_LENGTH + 1];

  if(generate_dictionary) {
    if(into_version == -2)
      waislog(WLOG_MEDIUM, WLOG_INDEX,
	      "Merging version %ld into master version and generating dictionary.",
	      from_version);
    else
      waislog(WLOG_MEDIUM, WLOG_INDEX,
	      "Merging version %ld into %ld and generating dictionary.",
	      from_version, into_version);
  }
  else {
    waislog(WLOG_MEDIUM, WLOG_INDEX, "Merging version %ld and %ld", 
	    into_version, from_version);
  }

  index_filename_with_version(into_version, input_filename_1, db);
  index_filename_with_version(from_version, input_filename_2, db); 
  index_filename_with_version(-1L, output_filename, db); 
  

  if(NULL == (input_stream_1 = s_fopen(input_filename_1, "rb")))
    return(false);

  if(NULL == (input_stream_2 = s_fopen(input_filename_2, "rb")))
    return(false);

  if(NULL == (output_stream = s_fopen(output_filename, "wb")))
    return(false);

  {
    long index_block_size_1;
    long number_of_occurances_1;
    long index_block_size_2;
    long number_of_occurances_2;

    /* leave room for the number_of_words in the output */
    s_fseek(output_stream, INDEX_HEADER_SIZE, SEEK_SET);

    /* read the number of words in the 2 input files; this is the maximum
       number of words that can be in the dictionary.  It is used by
       init_dict_file to set aside space for the dictionary_header_block.
       It is very likely that some of the words are duplicates, so the actual
       size of the header_block will be smaller.  But we never know until we
       do the merge, so we have to give enough space just in case.
       NOTE - in the case where generate_dictionary is false, we still need
       to do this because we need to skip the header_size in each of the
       input streams.
     */
    db->number_of_words = read_bytes(INDEX_HEADER_SIZE,input_stream_1) +
                          read_bytes(INDEX_HEADER_SIZE,input_stream_2);

    if (generate_dictionary) /* allocate the header block if necessary */
     { init_dict_file_for_writing(db);
     }

    /* now that the dictionary_header_block is allocated, we can start
       counting for real */
    db->number_of_words = 0; 

    if(-1 > next_dictionary_index_block(&index_block_size_1,
				       &number_of_occurances_1,
				       input_word_1,
				       input_stream_1))
      panic("Read of dictionary block failed in file %s",
	    input_filename_1);
    if(-1 > next_dictionary_index_block(&index_block_size_2,
				&number_of_occurances_2,
				input_word_2,
				input_stream_2))
      panic("Read of dictionary block failed in file %s",
	    input_filename_2);
    while(true){
      long strlen_1 = strlen(input_word_1);
      long strlen_2 = strlen(input_word_2);
      long compare = strcmp(input_word_1, input_word_2); /* +1 if 1 is bigger */

      if((0 == strlen_1) && 
	 (0 == strlen_2)){
	/* printf("Done with merge\n"); */
	/* then we are done. */
	break;
      }
      else if((0 != strlen_1) && (0 != strlen_2) && (0 == compare)){
	/* they are equal */
	/* printf("Merging word %s and %s\n", input_word_1, input_word_2); */
	write_dictionary_index_block(number_of_occurances_1 +
				     number_of_occurances_2, 
				     input_word_1, 
				     output_stream);
	if(generate_dictionary){
	  add_word_to_dictionary(input_word_1, ftell(output_stream), 
				 number_of_occurances_1 +
				 number_of_occurances_2, 
				 db);
	}
	db->number_of_words++;
	/* copy file 1 first.  this assumes that file 1 was indexed before
	   file 2 */
	if((number_of_occurances_1+number_of_occurances_2) > MAX_OCCURANCES){
	  /* too many already, just make a dummy (0 length) */
	  if((number_of_occurances_1 < MAX_OCCURANCES) &&
	     (number_of_occurances_2 < MAX_OCCURANCES)) {
	    /* only print the first time */
	    waislog(WLOG_MEDIUM, WLOG_INDEX,
		    "Deleting word '%s' since it has %ld occurences (limit %ld).",
		    input_word_1,number_of_occurances_1+number_of_occurances_2,
		    MAX_OCCURANCES);
	  }
	  make_dummy_block(input_stream_1, input_stream_2, output_stream);
	}
	else{
	  merge_blocks(input_word_1,input_stream_1,input_stream_2,
		       output_stream); 
	}
	if(-1 > next_dictionary_index_block(&index_block_size_1,
				    &number_of_occurances_1,
				    input_word_1,
				    input_stream_1))
	  panic("Read of dictionary block failed in file %s",
		input_filename_1);

	if(-1 > next_dictionary_index_block(&index_block_size_2,
				    &number_of_occurances_2,
				    input_word_2,
				    input_stream_2))
	  panic("Read of dictionary block failed in file %s",
		input_filename_2);
      }
      else if(((0 == strlen_1) && (0 != strlen_2) && (0 > compare)) ||
	      ((0 != strlen_1) && (0 != strlen_2) && (0 < compare))){
	/* write from block 2 */
	/* printf("From block 2: writing word '%s' not '%s'\n", 
	   input_word_2, input_word_1);  */
	write_dictionary_index_block(number_of_occurances_2, input_word_2,
				     output_stream);
	if(generate_dictionary){
	  add_word_to_dictionary(input_word_2, ftell(output_stream), 
				 number_of_occurances_2, db);
	}
	db->number_of_words++;
	copy_block(input_stream_2, output_stream);
	if(-1 > next_dictionary_index_block(&index_block_size_2,
				    &number_of_occurances_2,
				    input_word_2,
				    input_stream_2))
	  panic("Read of dictionary block failed in file %s",
		input_filename_2);
      }
      else{
	/* write from block 1 */
	/* printf("From block 1: writing word '%s' not '%s'\n", 
	   input_word_1, input_word_2); */
	write_dictionary_index_block(number_of_occurances_1, input_word_1,
				     output_stream);
	if(generate_dictionary){
	  add_word_to_dictionary(input_word_1, ftell(output_stream), 
				 number_of_occurances_1, db);
	}
	db->number_of_words++;
	copy_block(input_stream_1, output_stream);
	if(-1 > next_dictionary_index_block(&index_block_size_1,
					   &number_of_occurances_1,
					   input_word_1,
					   input_stream_1))
	  panic("Read of dictionary block failed in file %s",
		input_filename_1);
      }
    }
  }

  /* write out the number of words */
  s_fseek(output_stream, 0L, SEEK_SET);
  write_bytes(db->number_of_words, INDEX_HEADER_SIZE, output_stream);

  s_fclose(input_stream_1);
  s_fclose(input_stream_2);
  s_fclose(output_stream);
  /* check resulting file */	
  /* check_index_file(output_filename); */
  /* scan_index_blocks(output_filename); */

  /* delete the input files */
  remove(input_filename_1);
  remove(input_filename_2);
  /* These next two calls rename the new files on top of the old files.
     This is the only critical window when an online update is happening
     (the index files are being changed while queries are being answered).
     If a query comes along in another process and opens the .inv file
     after the rename is done, but before finished_add_word_to_dictionary
     has had the chance to rename the .dct file, it will open the old
     version of the .dct file and the two files will be out of sync. */
  rename(output_filename, input_filename_1);  
  if(generate_dictionary)
    finished_add_word_to_dictionary(db);
    
#ifdef THINK_C
  /* set the mac file type to INDX */
  setFileType(input_filename_1, WAIS_INDEX_FILE_TYPE, CREATOR);
#endif /* THINK_C */

  return(true);
}


/* only works on positive, non-zero numbers, and is slow to boot, yippie */
static long logcount _AP((long number));
static long logcount(number)
long number;
{
  long answer = 0;
  for( ; number > 0; number = number >> 1)
    answer++;
  return(answer);
}
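
/* Worked examples (illustrative): logcount(1) == 1, logcount(4) == 3,
 * logcount(7) == 3, logcount(8) == 4.  merge_index_files below uses
 * logcount(number_of_files - 1) - 1 as the final merge level; for 8 files
 * that is level 2, and for a single file logcount(0) returns 0, so
 * final_level becomes -1 and no pairwise merging is done.
 */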
    

static void merge_index_files _AP((database* db));

static void merge_index_files(db)
database *db;
/* this merges all the temporary inverted files into one large file
 * and creates a dictionary.
 * This is done as a logarithmic merge of the files.
 * An n-ary merge would be faster of course, but what the heck... 
 */
{
  /* this version does it two at a time */
  char filename[MAX_FILENAME_LEN];
  char filename2[MAX_FILENAME_LEN];
  long level;
  long final_level;
  long number_of_files_to_merge = db->index_file_number;
  /* at level 0, merge 0,1->0; 2,3->2; 4,5->4; 6,7->6; 
     at level 1, merge 0,2->0; 4,6->4;
     at level 2, merge 0,4->0;
     stop.
     If there is only 1 file, then don't merge (final_level will be -1);
     the dictionary will be generated by the merge after the for loop.
     */
  final_level = logcount(number_of_files_to_merge - 1) - 1;
  for(level = 0; level <= final_level; level++){
    long into_version;
    for(into_version = 0; 
	into_version < number_of_files_to_merge; 
	into_version += (2 << level)){
      long from_version = into_version + (1 << level);
      if(from_version < number_of_files_to_merge){
	boolean generate_dictionary; /* if this is the final level and 
					there is no .inv file then yes */
	if(level == final_level){
	  if(probe_file(index_filename(filename, db)))
	    generate_dictionary = false;
	  else generate_dictionary = true;
	}
	else{ 
	  generate_dictionary = false;
	}
	
	/* printf("Level %d (out of %d) merging into %d from %d\n", 
	       level, final_level, into_version, from_version); */
	if(false == merge_index_file(into_version, from_version, db,
				     generate_dictionary))
	  panic("Error in merging file %ld into %ld", 
		from_version, into_version);
      }
    }
  } /* done merging */
  if(probe_file(index_filename(filename, db))){
    /* if there is a .inv file, then we are adding to an existing db. merge */ 
    /* do this by merging straight into the final using the special -2 version
       for the .inv file */
    merge_index_file(-2L, 0L, db, true);    
  }
  else if(number_of_files_to_merge == 1){
    /* then we have to generate a dictionary for this one file. 
       create a dummy file in 1, then merge that while making a dictionary
       */
    touch_file(index_filename_with_version(1L, filename, db));
    merge_index_file(0L, 1L, db, true);
    /* rename 0 into the final name */
    rename(index_filename_with_version(0L,filename2, db),
	   index_filename(filename, db));
  }
}    

/* ============================================ */
/* ===  Flushing the memory version of the  === */
/* ===  word hashtable to disk files        === */
/* ============================================ */

static void flush_word_entry_to_file _AP((hash_entry* wrd_entry,
					  database* db));

static void flush_word_entry_to_file(wrd_entry,db)
hash_entry* wrd_entry;
database* db;
{
  /* In this version, each word_entry is made into an index block.
     This means that there may be many small index blocks if the memory
     cannot hold many files' worth of data.  This approach was taken
     for simplicity's sake, and it might be the best approach if there
     is a small vocabulary.
     
     This assumes that the index stream is positioned at the end.
     */

  if(wrd_entry->number_of_occurances > MAX_OCCURANCES){
    /* there are too many occurrences of this word, so do nothing.
     * This may mean that some occurrences were written out earlier, or
     * that no occurrences of this word get recorded at all.  Is this right?
     * If the first MAX_OCCURANCES should be recorded, then
     * add this clause to the condition above:
     *     && wrd_entry->starting_block_number != 0
     */
    return;
  }
  if((wrd_entry->current_memory_ptr - wrd_entry->memory_ptr +
      INDEX_BLOCK_HEADER_SIZE) >=
     (1L << (8 * INDEX_BLOCK_SIZE_SIZE)))
    return; /* there are too many entries to store away, therefore throw it all away */
  
  if(NULL == wrd_entry->memory_ptr){
    /* no entries to write out */
    return;
  }
  if(0 == (wrd_entry->current_memory_ptr - wrd_entry->memory_ptr)){
    return;			/* there are no word entries to write */
  }
  /* printf("Flushing word %s\n", wrd_entry->key); */
  /* write the dictionary entry for this word into the index file */
  write_dictionary_index_block(wrd_entry->number_of_occurances,
			       wrd_entry->key,
			       db->index_stream);

  db->number_of_words++;

  /* allocate and write the new block */
  allocate_index_block(wrd_entry->current_memory_ptr -
		       wrd_entry->memory_ptr +
		       INDEX_BLOCK_HEADER_SIZE,
		       db->index_stream);
  /* cprintf(PRINT_AS_INDEXING, "New block number: %ld\n", new_block); */
  if((wrd_entry->current_memory_ptr - wrd_entry->memory_ptr) !=
     fwrite(wrd_entry->memory_ptr, 1L, (wrd_entry->current_memory_ptr -
					wrd_entry->memory_ptr),
	    db->index_stream))
    panic("Write failed");
  /* free the memory for the block written out */
  free_word_occurance_block(wrd_entry->memory_ptr);
  wrd_entry->memory_ptr =  NULL;
  wrd_entry->current_memory_ptr = NULL;
  wrd_entry->memory_size = 0;
  wrd_entry->current_doc_id = 0;
}
  

/* This flushes all of the memory version of the word hashtable to disk.
 * this is called when the hashtable is filling up or we have 
 * accumulated enough words.  If completely is true, then it will merge 
 * all the intermediate files.
 */
void flush_memory_hashtable_to_disk(db,completely)
database* db;
boolean completely;
{
  /* map over the memory word hashtable and write the entries to disk */
  long i;
  char filename[1000];

  /* analyze the hash distribution 
  analyze_hashtable_distribution(db->the_word_memory_hashtable);
  */

  db->index_stream = 
    s_fopen(index_filename_with_version(db->index_file_number++, filename, db),
	    "wb");
  if(NULL == db->index_stream)
    panic("Could not open file %s to insert index", 
	  filename); /* filename was set by the s_fopen call above */

  waislog(WLOG_MEDIUM, WLOG_INDEX,
	  "Flushing %ld different words, %ld total words to disk...", 
	  hashtable_count(db->the_word_memory_hashtable),
	  db->number_of_words_in_hashtable);
  db->number_of_words = 0;
  write_bytes(0l, INDEX_HEADER_SIZE, db->index_stream);

  sort_hashtable(db->the_word_memory_hashtable);
  for(i = 0; i < hashtable_count(db->the_word_memory_hashtable); i++){
    flush_word_entry_to_file(&db->the_word_memory_hashtable->contents[i], db);
  }

  /* write out the number of entries into the index_file header */
  s_fseek(db->index_stream, 0L, SEEK_SET);
  write_bytes(db->number_of_words, INDEX_HEADER_SIZE, db->index_stream);
  s_fclose(db->index_stream);

  /* since everything is written out, we can flush these. */
  flush_word_occurance_buffers(); 
  clr_hashtable(db->the_word_memory_hashtable);
  db->number_of_words_in_hashtable = 0;

  /* add the stopwords to the index for the next session. */
  add_stop_words(db->the_word_memory_hashtable);

  waislog(WLOG_MEDIUM, WLOG_INDEX,
	  "Done flushing");
  /* scan_index_blocks(filename); */

  if(completely){
    waislog(WLOG_MEDIUM, WLOG_INDEX,
	    "Merging %ld files",
	    db->index_file_number);
    merge_index_files(db);
    waislog(WLOG_MEDIUM, WLOG_INDEX,
	    "Done merging.");
  }
}

long finished_add_word(db)
database *db;
{
  flush_memory_hashtable_to_disk(db,true);
  return(0); /* successful */
}

long init_add_word(db, hashtable_size, flush_after_n_words)
     database *db;
     long hashtable_size;
     long flush_after_n_words;
{
  if(NULL != db->the_word_memory_hashtable)
    free_hashtable(db->the_word_memory_hashtable);
  db->the_word_memory_hashtable =
    make_hashtable(0, hashtable_size, sizeof(hash_entry));
  db->flush_after_n_words = flush_after_n_words;
  add_stop_words(db->the_word_memory_hashtable);
  return(0);
}
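
/* Illustrative sketch only (kept out of the build with "#if 0"): the calling
 * sequence this module expects.  add_one_word() is a hypothetical stand-in
 * for whatever routine in irhash.c actually records a word occurrence; it is
 * not defined in this file, and the numeric arguments are made-up values.
 */
#if 0
static void example_index_session(db)
database *db;
{
  init_add_word(db, 100000L, 1000000L); /* hashtable size, flush threshold */
  /* ... for each word occurrence in each document:
   *       add_one_word(db, word, document_id, position, weight);
   *     (hypothetical; the real entry point lives in irhash.c)
   */
  finished_add_word(db);	/* flushes remaining words and merges files */
}
#endif /* 0 */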


