irhash.c

This is irhash.c in view mode; [Download] [Up]
/* WIDE AREA INFORMATION SERVER SOFTWARE:
   No guarantees or restrictions.  See the readme file for the full standard
   disclaimer.

   Brewster@think.com
*/

/* The memory hashtables for building an index. */
/* -brewster 5/90 */

/* Change log:
 * $Log:	irhash.c,v $
 * Revision 1.16  92/03/20  11:04:33  jonathan
 * Added code to handle switches for word_pairs and word_postition info.
 * 
 * Revision 1.15  92/02/12  13:26:28  jonathan
 * Added "$Log" so RCS will put the log message in the header
 * 
*/

/* main functions:
 *   add_word
 *   finished_add_word
 *   look_up_word
 *
 * The idea is to store up a bunch of words before going to disk.
 * A word entry points to where it will go on disk, and
 * accumulates the entries before doing it.
 *
 * Some of the policy issues in this file are:
 *   How much weight should the first occurance of a word in a document get
 *   over the other occurances.  The first occurance should be worth more
 *   so that words with 3 occurances of "dog" and not "cat"'s should not 
 *   win out over 1 "dog" and 1 "cat" if the question is "Tell me about cats
 *   torture dogs"
 *   The extra weight is 5 at this point.
 *
 */

/* To Do:
 *  done: Improve the hashing functions.
 *  done: stop inserting into hash table after max number have been accumulated
 *  done: make flush not flush buffers that are too big.
 */
 
#include <ctype.h>
#include <string.h> 	/* for strlen(), memset() */

#include "panic.h"
#include "cutil.h"
#include "irfiles.h"
#include "irhash.h"
#include "stoplist.h"
#include "irinv.h"

#ifdef UNIX
#define PRINT_AS_INDEXING true /* also defined in irtfiles.c and irfiles.c */
#else 
#define PRINT_AS_INDEXING false
#endif

/* ================================
   ===  Word Occurance Buffers  ===
   ================================ */

/* Word occurance buffers
 * This is a simple memory allocator for use with the word memory hashtable.
 * Since the buffers are tiny, this is done as a copy-sweep GC scheme.
 * Oh, I long for the storage system of lisp.
 */
unsigned char *first_word_occurance_buffer = NULL;  /* allocate blocks out of this */
unsigned char *last_word_occurance_buffer = NULL;
long word_occurance_block_length = 256000;  /* maybe this should be larger? */
unsigned char * word_occurance_free_ptr = NULL;

unsigned char *make_word_occurrance_block(size)
long size;

{
  /* allocates a word_occurance_block out of the buffers */
  /* old way: s_malloc((size_t)size); */
  /* returns a pointer to a piece of memory */
  if(NULL == first_word_occurance_buffer){
    /* initialize it */
    first_word_occurance_buffer = 
      (unsigned char *)s_malloc(MAX(word_occurance_block_length,
			   sizeof(size_t)+ size));
    *(unsigned char **)first_word_occurance_buffer = NULL; /* set the end */
    last_word_occurance_buffer = first_word_occurance_buffer;
    word_occurance_free_ptr = first_word_occurance_buffer + sizeof(size_t);
  }
  if((long)word_occurance_free_ptr + size >= 
     word_occurance_block_length + (long)last_word_occurance_buffer){
    /* then allocate a new block */
    unsigned char * new_block = (unsigned char *)s_malloc(MAX(word_occurance_block_length,
					    sizeof(size_t)+ size));
    *(unsigned char **)new_block = NULL; /* set the end of the chain */
    *(unsigned char **)last_word_occurance_buffer = new_block;
    word_occurance_free_ptr = new_block + sizeof(size_t);
    last_word_occurance_buffer = new_block;
  }
  /* allocate away */	
  { unsigned char * answer = word_occurance_free_ptr;
    word_occurance_free_ptr += size;	
    return(answer);  
  }
}

void free_word_occurance_block(block)
unsigned char *block;
{
  /* this is not used with the new scheme, but is here in case
     malloc is a win on some systems */
  /* old way s_free(block); */
}

static void flush_word_occur_bufs_internal
  _AP((unsigned char* head_of_list));

static void flush_word_occur_bufs_internal(head_of_list)
unsigned char* head_of_list;
/* frees all word occurance buffers.  This should be done with care */
{      
  while(1){
    unsigned char * next_block;
    if(NULL == head_of_list)
      break;
    next_block = *(unsigned char **)head_of_list;
    s_free(head_of_list);
    head_of_list = next_block;
  }
}

void flush_word_occurance_buffers()
{
  /* frees all word occurance buffers.  This should be done with care */
  flush_word_occur_bufs_internal(first_word_occurance_buffer);
  first_word_occurance_buffer = NULL;
  word_occurance_free_ptr = NULL;
  last_word_occurance_buffer = NULL;
}


/* ---------------------------------------------------- */
static hash_entry* look_up_word _AP((char* word,hashtable*
				     the_word_memory_hashtable));
  
static hash_entry* 
look_up_word(word,the_word_memory_hashtable)
char* word;
hashtable* the_word_memory_hashtable;
{
  hash_entry * answer = get_hash(word, the_word_memory_hashtable);
  
  if(NULL != answer)
    return(answer);
  else{
    hash_entry wrd_entry;
    answer = put_hash(word, the_word_memory_hashtable, &wrd_entry);
    answer->number_of_occurances = 0;
    answer->memory_ptr = 
      make_word_occurrance_block(WORD_MEMORY_INIT_BLOCK_SIZE);
    answer->current_memory_ptr = answer->memory_ptr;
    answer->memory_size = WORD_MEMORY_INIT_BLOCK_SIZE;
    answer->current_doc_id = 0;
    return(answer);
  }
}

static unsigned char add_weight _AP((long current_weight,long new_weight));

static unsigned char 
add_weight(current_weight,new_weight)
long current_weight;
long new_weight;
/* add a new weight to the existing one */
{
  /* this should be smarter than this, like doing the log or something */
  if(127 < (current_weight + new_weight)){
    /* the max char.  should be 255, but does not work on all compilers */
    return(127);
  }
  else{
    return(current_weight + new_weight);
  }
}

static unsigned char* more_memory _AP((unsigned char* current_memory_ptr,
				       long current_memory_size,
				       long new_size));

static unsigned char* more_memory(current_memory_ptr,current_memory_size,new_size)
unsigned char* current_memory_ptr;
long current_memory_size;
long new_size;
/* Allocates more memory for a hash_entry.  It transfers all the bytes 
 * from the old to the new and then returns the new.
 */
{
  unsigned char* new_memory = NULL;
  if(current_memory_size > new_size){
    panic("trying to contract a hash_entry block.  This is not right\n");
  }
  new_memory = make_word_occurrance_block(new_size);
  if(NULL == new_memory){
    panic("Out of memory.");
  }
  memset(new_memory, 0, new_size);
  memmove(new_memory, current_memory_ptr, (size_t)current_memory_size); 
  return(new_memory);
}

static long more_memory_size _AP((long current_size,
				  long number_of_occurances));

static long more_memory_size(current_size,number_of_occurances)
long current_size;
long number_of_occurances;
/* This is pretty important to get right.  This is a place holder */
{
  return(MAX(2 * current_size, WORD_MEMORY_INIT_BLOCK_SIZE));
}

long write_bytes_to_memory(value,size,ptr)
long value;
long size;
unsigned char* ptr;
{
  /* writes the number into memory lsb first.  
     returns the number of bytes written */
  long i;
  long original_value = value;

  if(size < 0) /* paranoia */
    panic("attempting to write a negative number of bytes");

  ptr += size; /* start at the end of the block and write backwards */
  for (i = 0; i < size; i++){
    ptr--;
    *ptr = (unsigned char)(value & 0xFF);
    value = value >> 8;
  }
  if(value != 0)
    panic("In a call to write_bytes_to_memory, the value %ld can not be represented in %ld bytes", original_value, size);

  return(size);
}
		
/* adds a word to the hashtable. Returns the 0 if successful. 
   See irext.h for more documentation.
 */
long add_word(word, char_pos, line_pos,
	      weight, doc_id, date, word_pair, db, word_position)
     char *word;	/* the word to be indexed, this could be a
			   word pair. If NULL there are no more words
			   to be indexed */
     long char_pos;	/* the position of the start of the
			   word */
     long line_pos;	/* this is passed for the best
			   section calculation */
     long weight;	/* how important the word looks
			   syntactically (such as is it bold)
			   NOT used by signature system */
     long doc_id; 	/* current document, this will never be 0 */
     time_t date; /* display day of this document, 0 if not known */
     long word_pair;
     database* db; /* database to insert the document */
     boolean word_position; /* whether to have multiple entries for words in a document */
{
  /* look up the word in the hashtable */
  /* creates it if necessary */	
  hash_entry* wrd_entry;
  hashtable * the_word_memory_hashtable = db->the_word_memory_hashtable;
  /* printf("Word: '%s' doc_id: %ld, pos: %ld, weight: %ld\n",
     word, doc_id, char_pos, weight); */
  
  if(NULL == db->the_word_memory_hashtable){
    panic("The memory word hashtable is not defined.");
  }

  /* if we have indexed enough words flush the memory copies to disk. */
  if(db->number_of_words_in_hashtable == db->flush_after_n_words)
    flush_memory_hashtable_to_disk(db, false);

  wrd_entry = look_up_word(word, the_word_memory_hashtable);
  wrd_entry->number_of_occurances ++;

  if(wrd_entry->number_of_occurances > MAX_OCCURANCES){
    /* do nothing. we have enough of that word */
  }
  else{
    /* we have a word to add */
    db->number_of_words_in_hashtable ++;

    if(word_position || doc_id != wrd_entry->current_doc_id){
      /* then we have a new doc_id to add to the memory block */
          
      /* check to see if we need more memory */
      if((wrd_entry->memory_size -
	  (wrd_entry->current_memory_ptr - 
	   wrd_entry->memory_ptr) 
	  < 
	  INDEX_ELEMENT_SIZE)){
	/* we need more memory. this makes more and frees the old*/
	unsigned char* old_memory_ptr = wrd_entry->memory_ptr;
 
	long new_size = 
	  more_memory_size(wrd_entry->memory_size,
			   wrd_entry->number_of_occurances);
	/* cprintf(PRINT_AS_INDEXING, "Get more memory %ld bytes for %s\n", new_size, word); */
	wrd_entry->memory_ptr = 
	  more_memory(wrd_entry->memory_ptr, wrd_entry->memory_size,
		      new_size);
	wrd_entry->current_memory_ptr = 
	  wrd_entry->memory_ptr + /* new offset */
	    (wrd_entry->current_memory_ptr - old_memory_ptr);
	/* just being paranoid... no longer illegal
	   if(wrd_entry->current_memory_ptr == wrd_entry->memory_ptr)
	   panic("After allocating more memory, the size went to 0");
	   */
	wrd_entry->memory_size = new_size;
      }				/* finished making more memory */

      /* add away */
      wrd_entry->current_memory_ptr +=
	write_bytes_to_memory(doc_id, DOCUMENT_ID_SIZE,
			      wrd_entry->current_memory_ptr);
      wrd_entry->current_memory_ptr +=
	write_bytes_to_memory(char_pos, 
			      CHARACTER_POSITION_SIZE,
			      wrd_entry->current_memory_ptr);
      wrd_entry->current_memory_ptr +=
	write_bytes_to_memory(weight +
			       /* add 5 since for the first one */
			      (doc_id != wrd_entry->current_doc_id)?5:0,
			      WEIGHT_SIZE,
			      wrd_entry->current_memory_ptr);
      wrd_entry->current_doc_id = doc_id;

    }
    else{
      /* The word is already there,
       * just increment the weight in the record.
       * This will change when/if position information is kept (for proximity).
       */
      if(wrd_entry->current_memory_ptr == wrd_entry->memory_ptr){
	panic("Memory hashtable error. Recorded doc_id %ld, current doc_id %ld\n",
	      wrd_entry->current_doc_id, doc_id);
      }
      *(wrd_entry->current_memory_ptr - 1) =
	add_weight(*(wrd_entry->current_memory_ptr - 1), weight);
    }
  }
  return(0L);
}

void add_stop_words(the_word_memory_hashtable)
hashtable *the_word_memory_hashtable;
     /* add the stop words to the hashtable.  this must be done before
	adding other words */
{
  init_stop_list();
  while(true){
    char *word = next_stop_word();
    hash_entry* wrd_entry;

    if(NULL == word)
      break;
    wrd_entry = look_up_word(word, the_word_memory_hashtable);
    wrd_entry->number_of_occurances = STOP_WORD_FLAG;
  }
}
These are the contents of the former NiCE NeXT User Group NeXTSTEP/OpenStep software archive, currently hosted by Netfuture.ch.