sighash.c

This is sighash.c in view mode; [Download] [Up]
/* WIDE AREA INFORMATION SERVER SOFTWARE:
   No guarantees or restrictions.  See the readme file for the full standard
   disclaimer.

   Brewster@think.com
*/

/* The memory hashtables for building an index. */
/* -brewster 5/90 */

/* main functions:
 *   add_word
 *   finished_add_word
 *   look_up_word
 *
 * The idea is to store up a bunch of words before going to disk.
 * A word entry points to where it will go on disk, and
 * accumulates the entries before doing it.
 *
 * Some of the policy issues in this file are:
 *   How much weight the first occurrence of a word in a document should get
 *   over the other occurrences.  The first occurrence should be worth more
 *   so that a document with 3 occurrences of "dog" and no "cat"s does not
 *   win out over one with 1 "dog" and 1 "cat" if the question is "Tell me
 *   about cats torture dogs".
 *   The extra weight is 5 at this point.
 *
 */

#ifndef lint
static char *RCSid = "$Header: /proj/wais/wais-8-b5/ir/RCS/sighash.c,v 1.23 92/05/05 13:21:25 shen Exp Locker: shen $";
#endif

/* Change log:
 * $Log:	sighash.c,v $
 * Revision 1.23  92/05/05  13:21:25  shen
 * undo the change make in previous revision 1.22
 * fixed the bug in db->total_word_ccount by checking the last word in t
 * table. If the last word is DICTIONARY_TOTAL_SIZE_WORD, decrement the number
 * of words to add to dictionary file by 1.
 * 
 * Revision 1.22  92/05/05  11:01:54  shen
 * fixed db->total_word_count for incremental update by eliminating adding
 * DICTIONARY_TOTAL_SIZE_WORD to hash table.
 * 
 * Revision 1.21  92/04/29  08:21:15  shen
 * redefine MAX_OCCURANCES to a very big number: 0x10000000.
 * 
 * Revision 1.20  92/03/20  11:04:18  jonathan
 * Added word_postition argument to add word.  See irext.h for explanation.
 * 
 * Revision 1.19  92/03/01  16:11:57  brewster
 * took out analyze_hashtable_distribution
 * 
 * Revision 1.18  92/02/24  19:58:04  jonathan
 * Added code to add the dictionary to the hastable on startup.
 * 
 * Revision 1.17  92/02/12  13:46:18  jonathan
 * Added "$Log" so RCS will put the log message in the header
 * 
*/

/* To Do:
 *  done: Improve the hashing functions.
 *  done: stop inserting into hash table after max number have been accumulated
 *  done: make flush not flush buffers that are too big.
 */
 
#include <ctype.h>
#include <string.h> 	/* for strlen(), memset() */

#include "panic.h"
#include "cutil.h"
#include "futil.h"
#include "irfiles.h"
#include "irhash.h"
#include "stoplist.h"
#include "irinv.h"
#include "sigindex.h"

#ifdef UNIX
#define PRINT_AS_INDEXING true /* also defined in irtfiles.c and irfiles.c */
#else 
#define PRINT_AS_INDEXING false
#endif

#define PROXIMITY /* this turns on writing out of all word occurances */


/* ---------------------------------------------------- */
static hash_entry* look_up_word _AP((char* word,hashtable*
				     the_word_memory_hashtable));
  
static hash_entry* 
look_up_word(word,the_word_memory_hashtable)
char* word;
hashtable* the_word_memory_hashtable;
/* Returns the hashtable entry for word.  If get_hash() finds none, a new
 * entry is inserted with put_hash() and its bookkeeping fields are reset
 * before it is returned.
 */
{
  hash_entry blank;	/* template handed to put_hash(); deliberately left
			   unset because every field used here is assigned
			   on the returned entry below */
  hash_entry *found = get_hash(word, the_word_memory_hashtable);

  if(NULL == found){
    /* first time this word is seen: create a fresh, empty entry */
    found = put_hash(word, the_word_memory_hashtable, &blank);
    found->number_of_occurances = 0;
    found->memory_ptr = NULL;
    found->memory_size = 0;
    found->current_memory_ptr = found->memory_ptr;
    found->current_doc_id = 0;
  }
  return(found);
}

#ifdef NOTUSED
static unsigned char add_weight _AP((long current_weight,long new_weight));

static unsigned char 
add_weight(current_weight,new_weight)
long current_weight;
long new_weight;
/* Adds new_weight to current_weight, saturating the sum at 127. */
{
  /* this should be smarter than a plain sum, like doing the log or
     something */
  long combined = current_weight + new_weight;

  /* 127 is used as the ceiling; it should be 255 (the max unsigned char),
     but that does not work on all compilers */
  return((unsigned char)((combined > 127) ? 127 : combined));
}

long write_bytes_to_memory(value,size,ptr)
long value;
long size;
unsigned char* ptr;
/* Stores value into the size bytes starting at ptr.  The bytes are
 * produced least significant first but placed from the end of the block
 * backwards, so ptr[size-1] receives the low byte and ptr[0] the high one.
 * Panics if size is negative or if value does not fit in size bytes.
 * Returns the number of bytes written (size).
 */
{
  long remaining = value;	/* the part of value not yet stored */
  long idx;

  if(size < 0) /* paranoia */
    panic("attempting to write a negative number of bytes");

  /* fill the block from its last byte down to its first */
  for(idx = size - 1; idx >= 0; idx--){
    ptr[idx] = (unsigned char)(remaining & 0xFF);
    remaining = remaining >> 8;
  }

  /* anything left over means the value needed more than size bytes */
  if(remaining != 0)
    panic("In a call to write_bytes_to_memory, the value %ld can not be represented in %ld bytes", value, size);

  return(size);
}
		
#endif /* def NOTUSED */

/* Records one occurrence of a word in the memory hashtable and passes it
 * on to the signature indexer.
 * Always returns 0.  See irext.h for more documentation.
 */

/* ceiling on the per-word occurrence count; beyond it the word is counted
   but no longer forwarded to sig_add_word() */
#undef MAX_OCCURANCES
#define MAX_OCCURANCES 0x10000000

long add_word(word, char_pos, line_pos,
	      weight, doc_id, date, word_pair, db, word_position)
     char *word;	/* the word to be indexed, this could be a
			   word pair.  The interface allows NULL to mean
			   "no more words";
			   NOTE(review): this implementation hands word to
			   look_up_word() without a NULL check -- confirm
			   callers never pass NULL here */
     long char_pos;	/* the position of the start of the word */
     long line_pos;	/* passed along for the best section calculation */
     long weight;	/* how important the word looks syntactically
			   (such as is it bold);
			   NOT used by signature system */
     long doc_id; 	/* current document, this will never be 0 */
     time_t date; 	/* display day of this document, 0 if not known */
     long word_pair;
     database* db; 	/* database to insert the document */
     boolean word_position; /* ignored here. */
{
  hashtable *table = db->the_word_memory_hashtable;
  hash_entry *entry;

  if(NULL == table){
    panic("The memory word hashtable is not defined.");
  }

  /* Flushing the hashtable to disk after flush_after_n_words words is
     not done on the sig system. */

  /* find the entry for this word (created if necessary) and count it */
  entry = look_up_word(word, table);
  entry->number_of_occurances ++;

  /* too many of this word already: count it but do not index it */
  if(entry->number_of_occurances >= MAX_OCCURANCES)
    return(0L);

  db->number_of_words_in_hashtable ++;
  sig_add_word(word, char_pos, line_pos, weight, doc_id, date, word_pair);
  return(0L);
}

void add_stop_words(the_word_memory_hashtable)
hashtable *the_word_memory_hashtable;
     /* Enters every word of the stop list into the hashtable and marks
	its entry with STOP_WORD_FLAG.  This must be done before adding
	other words. */
{
  char *stop_word;

  init_stop_list();
  /* next_stop_word() hands back NULL once the list is exhausted */
  while(NULL != (stop_word = next_stop_word())){
    hash_entry *entry = look_up_word(stop_word, the_word_memory_hashtable);
    entry->number_of_occurances = STOP_WORD_FLAG;
  }
}


long finished_add_word(db)
database *db;
/* Writes the accumulated words out as the dictionary.
 * Sorts the memory hashtable, adds every entry that is not flagged as a
 * stop word to the dictionary file, and then finishes the signature
 * index.  Returns the result of sig_finished_add_word(db).
 */
{
  long i;
  long num_words;

  db->number_of_words = hashtable_count(db->the_word_memory_hashtable);
  init_dict_file_for_writing(db);
  sort_hashtable(db->the_word_memory_hashtable);

  /* exclude the last word if it is DICTIONARY_TOTAL_SIZE_WORD.
     The num_words > 0 test keeps an empty table from being indexed
     at contents[-1]. */
  num_words = hashtable_count(db->the_word_memory_hashtable);
  if(num_words > 0 &&
     0 == strcmp(db->the_word_memory_hashtable->contents[num_words-1].key,
                 DICTIONARY_TOTAL_SIZE_WORD))
    num_words--;

  for(i = 0; i < num_words; i++){
    hash_entry * entry = &db->the_word_memory_hashtable->contents[i];
    /* entries carrying the stop word flag are not written out */
    if(0 == (STOP_WORD_FLAG & entry->number_of_occurances)){
      /* write out the dictionary entry; the position in the sorted
	 table serves as a unique id for every word */
      add_word_to_dictionary(entry->key, i,
			     entry->number_of_occurances, db);
    }
  }
  finished_add_word_to_dictionary(db);
  return(sig_finished_add_word(db));
}

/* Loads the words of the existing dictionary file, together with their
 * occurrence counts, into the memory hashtable.  Does nothing when the
 * database has no dictionary stream.
 */

void add_dictionary_to_hashtable(db)
database *db;
{
  FILE *stream = db->dictionary_stream;
  long block_index, word_index, new_number_of_dictionary_blocks;
  extern unsigned char *dictionary_header_block, *dictionary_block;
  extern long number_of_dictionary_blocks;

  if(NULL == stream)
    return;

  waislog(WLOG_LOW, WLOG_INFO, "Adding dictionary to hastable");

  /* the block count is read from the very start of the file */
  s_fseek(stream, 0L, SEEK_SET);
  new_number_of_dictionary_blocks = read_bytes(DICTIONARY_HEADER_SIZE, stream);

  /* NOTE(review): dropping the cached header pointer when the dictionary
     has grown presumably makes read_dictionary_block() allocate a bigger
     buffer; the old buffer is not freed here -- confirm */
  if(new_number_of_dictionary_blocks > number_of_dictionary_blocks)
    dictionary_header_block = NULL;
  number_of_dictionary_blocks = new_number_of_dictionary_blocks;

  dictionary_header_block = read_dictionary_block(dictionary_header_block,
						  DICTIONARY_HEADER_SIZE,
						  number_of_dictionary_blocks,
						  stream);
  if(NULL == dictionary_header_block){
    waislog(WLOG_MEDIUM, WLOG_WARNING,
	    "Could not read dictionary header block");
    return;
  }

  for(block_index = 0; block_index < number_of_dictionary_blocks;
      block_index++){
    long pos = dictionary_block_position(block_index,
					 dictionary_header_block);

    dictionary_block = read_dictionary_block(dictionary_block,
					     pos, DICTIONARY_BLOCK_SIZE,
					     stream);
    if(NULL == dictionary_block){
      /* skip the unreadable block and carry on with the next one */
      waislog(WLOG_MEDIUM, WLOG_WARNING,
	      "Could not read dictionary block %ld", pos);
      continue;
    }

    /* iterate over words; an empty word ends the block */
    for(word_index = 0; word_index < DICTIONARY_BLOCK_SIZE; word_index++){
      char *word = dictionary_block_word(word_index, dictionary_block);
      hash_entry *entry;

      if('\0' == word[0])
	break;
      entry = look_up_word(word, db->the_word_memory_hashtable);
      entry->number_of_occurances =
	dictionary_block_word_occurances(word_index, dictionary_block);
    }
  }
}

long init_add_word(db, hashtable_size, flush_after_n_words)
     database *db;
     long hashtable_size;
     long flush_after_n_words;
{
  if(NULL != db->the_word_memory_hashtable)
    free_hashtable(db->the_word_memory_hashtable);
  db->the_word_memory_hashtable =
    make_hashtable(0, hashtable_size, sizeof(hash_entry));
  db->flush_after_n_words = 0x7FFFFFFF;  /* a large number */
  sig_init_add_word(db, BATCH_UPDATE, ADD_UPDATE);
  add_dictionary_to_hashtable(db);
  add_stop_words(db->the_word_memory_hashtable);
  return(0);
}
These are the contents of the former NiCE NeXT User Group NeXTSTEP/OpenStep software archive, currently hosted by Netfuture.ch.