ftp.nice.ch/pub/next/connectivity/infosystems/WAIStation.1.9.6.N.b.tar.gz#/WAIS/ir/sersrch.c

This is sersrch.c in view mode; [Download] [Up]

/* WIDE AREA INFORMATION SERVER SOFTWARE
   No guarantees or restrictions.  See the readme file for the full standard
   disclaimer.    
   Brewster@think.com
*/


/* implements the search part of irext.h 
   (search_word and finished_search_word)
   -brewster

Split from irsearch.c

   5/31/91 Added scale_scores.  Fixed document_score_array to long.
   7/8/91 Removed scale_scores, handled in search_word with doc_id > 0.
   2/4/92 Made document_score_array a double.

   - Jonny G
 * $Log:	sersrch.c,v $
 * Revision 1.24  92/04/28  16:56:54  morris
 * added boolean to serial engine
 * 
 * Revision 1.23  92/03/15  10:15:18  jonathan
 * Added Simon Spero's ASSIGN replacement for read_bytes.
 * 
 * Revision 1.22  92/03/05  07:09:54  shen
 * add two more dummy arguments to call to init_search_engine
 * 
 * Revision 1.21  92/02/12  17:29:52  jonathan
 * Conditionalized inclusion of object code.
 * 
 * Revision 1.20  92/02/12  13:40:06  jonathan
 * Added "$Log" so RCS will put the log message in the header
 * 
*/

#include "cdialect.h"
#include "irfiles.h"
#include "irsearch.h"
#include "irext.h"
#include "byte_order.h"
#include <string.h>

#ifdef BOOL
#include "obj.h"
#include "irparse.h"
object* currentQuery = NULL; /* kludge until irext goes away */
#endif /* def BOOL */

/* weighting for relevant document terms - 
   this may become a parameter to the query.
*/

#define RF_WEIGHTING 0.1

/* ==================================
 * ===  Initialization Functions  ===
 * ==================================*/


long init_search_engine(file, initialize, for_search, cm_mem_percent, 
text_size, grow_percent)
  char* file;
  boolean initialize;
  boolean for_search;
  long cm_mem_percent;  /* unused */
  long text_size;     /* unused */
  long grow_percent;  /* unused */
{
  static boolean inited = false;

  if (inited == false)
   { 
#ifdef BOOL
     initObj();
     initBool();
#endif
     inited = true;
   }

  return(0);
}

long finished_search_engine()
{
  return(0);
}


/*
 *  ext_open_database: see irext.h
 */

long ext_open_database (db, initialize, for_search)
     database *db;
     boolean initialize;
     boolean for_search;
{ /* this has to deal with the .inv file */
  char file[MAX_FILE_NAME_LEN];

  if(initialize) /* make a new one */
    db->index_stream = s_fopen(index_filename(file, db), "w+b");
  else if(for_search) /* just search */
    db->index_stream = s_fopen(index_filename(file, db), "rb");
  else /* write to an existing db */
    db->index_stream = s_fopen(index_filename(file, db), "r+b");

  if (db->index_stream == NULL) {
    waislog(WLOG_HIGH, WLOG_ERROR,"2can't open the inverted index file %s\n", 
	    file);
      disposeDatabase(db);
      return(1);
    }
  return(0);
}
  


/*
 *  ext_close_database: see irext.h
 */

long ext_close_database (db)
     database *db;
{
  return(0);
}

char *database_file(database_name)
     char *database_name;
{
  return(database_name);
}
  
/*===========================*
 *===  Setting Paramters  ===*
 *===========================*/

long max_hit_retrieved = 0;
char **srcs = NULL;

long set_query_parameter (mask, parameters)
     long mask;
     query_parameter_type * parameters;
{
  switch (mask)
    {
    case SET_MAX_RETRIEVED_MASK:
      max_hit_retrieved = parameters->max_hit_retrieved;
      return(0);
      break;
    case SET_SELECT_SOURCE:
      if(NULL != srcs){
	if(NULL != srcs[0])
	  s_free(srcs[0]);
	s_free(srcs);
      }
      srcs = parameters->srcs;
      break;
    default:
      return(-1);
      break;
    }
  return(0);
}

/*==============================*
 *===  Document Score Array  ===*
 *==============================*/

double *document_score_array = NULL;
long document_score_array_len = 0;

/* make_document_score_array insures that the document_score_array
   array is long enough, if not it makes it long enough */
static void make_document_score_array _AP((long length));
static void make_document_score_array(length)
long length;
{
  if(length <= document_score_array_len)
    return;
  /* we have to make a new one.  free the old one first (if any) */
  if(document_score_array != 0){
    s_free(document_score_array);
  }
  document_score_array = (double*)s_malloc(
					 (size_t)(length * sizeof(double)));
  document_score_array_len = length;
}

static void destroy_document_score_array _AP((void));
static void destroy_document_score_array()
{
  s_free(document_score_array);
  document_score_array_len = 0;
}
    
void clear_document_score_array()
/* side effects the document_score_array. */
{ 
  memset(document_score_array, 0, 
	 document_score_array_len * sizeof(double));
}

/* for debugging purposes */
void print_document_score_array(start,stop)
unsigned long start;
unsigned long stop;
/* assumes start >= 0, stop < db->doc_table_allocated_entries */
{
	long i;
	for(i = start; i <= stop; i++){
		printf("entry number %d: %f \n", 
		       i, document_score_array[i]);
	}
}



/*=========================*
 *===  Best Hits Array  ===*
 *=========================*/

hit *best_hits_array = NULL;
long best_hits_array_len = 0;
long current_best_hit = 0;

/* see irext.h for doc */
long init_best_hit (db)
     database *db;
{

#ifdef BOOL
  if (currentQuery != NULL)
    send(currentQuery,InitBestHit,db);
#endif /* def BOOL */

  return(0);
}

/* make_best_hits_array insures that the best_hits_array
   array is long enough, if not it makes it long enough */
static void make_best_hits_array _AP((long length));
static void make_best_hits_array(length)
long length;
{
  if(length <= best_hits_array_len)
    return;
  /* we have to make a new one.  free the old one first (if any) */
  if(best_hits_array != 0){
    s_free(best_hits_array);
  }
  best_hits_array = (hit*)s_malloc((size_t)(length * sizeof(hit)));
  best_hits_array_len = length;
}

static void destroy_best_hits_array _AP((void));
static void destroy_best_hits_array()
{
  s_free(best_hits_array);
  best_hits_array_len = 0;
}
    
void clear_best_hits_array()
/* side effects the best_hits_array.  XXX could use memset */
{ 
  memset((char*)best_hits_array, 0, best_hits_array_len * sizeof(hit));
}

/* for debugging purposes */
void print_best_hits()
{
  long i;
  for( i = 0; i < best_hits_array_len; i++){
    if (best_hits_array[i].weight != 0)
      { printf("Best hit %ld: weight %ld, doc_id %ld, headline %s, filename %s, lines %ld\n", 
	       i, best_hits_array[i].weight, 
	       best_hits_array[i].document_id,
	       best_hits_array[i].headline,
	       best_hits_array[i].filename,
	       best_hits_array[i].number_of_lines);
      }
  }
}

void sort_best_hits(db)
     database * db;
{
  /* returns nothing.
   * side effects best_hits and document_score_array
   */

  long i, doc;
  double worst_weight_to_make_it = 0.0;
  document_table_entry doc_entry;
  long best_hit_number = 0;

  /* snuff the scores */
  for(i = 0; i < max_hit_retrieved; i++){
    best_hits_array[i].weight = 0;
  }

  /* loop over the doc, and keep the doc_id and weight in best hit table */
  for(doc = 0; doc < db->doc_table_allocated_entries; doc++){
    double weight = document_score_array[doc];
    if(worst_weight_to_make_it < weight){
      /* merge it into the best_hits array. start at the bottom */
      for(i = (max_hit_retrieved - 1); i >= 0; i--){
	if(weight > best_hits_array[i].weight 
	   /* && (check_document_id(doc, db) == true) too slow.*/
	   ){
	  /* move this entry down */	
	  if((i + 1) < max_hit_retrieved){
	    best_hits_array[i+1].weight = best_hits_array[i].weight;
	    best_hits_array[i+1].document_id = best_hits_array[i].document_id;
	  }
	  best_hits_array[i].document_id = doc;
  	  best_hits_array[i].weight = weight;
	}
	else
	  break;
      }      
    }
  }
  
  for(i = 0; i < max_hit_retrieved; i++){
    if(best_hits_array[i].weight <= 0)  /* if we are out of good stuff, return */
      return;
    /* fill in the rest of the hit */
    if (read_document_table_entry(&doc_entry,
				  best_hits_array[i].document_id,
				  db) 
	== true){
      best_hits_array[best_hit_number].weight = best_hits_array[i].weight;
      best_hits_array[best_hit_number].document_id = best_hits_array[i].document_id;
      best_hits_array[best_hit_number].start_character = doc_entry.start_character;
      best_hits_array[best_hit_number].end_character = doc_entry.end_character;
      best_hits_array[best_hit_number].document_length = doc_entry.document_length;
      best_hits_array[best_hit_number].number_of_lines = doc_entry.number_of_lines;
      read_filename_table_entry(doc_entry.filename_id, 
				best_hits_array[best_hit_number].filename,
				best_hits_array[best_hit_number].type,
				NULL,
				db),
      strncpy(best_hits_array[best_hit_number].headline, 
	      read_headline_table_entry(doc_entry.headline_id,db),
	      MAX_FILE_NAME_LEN);
      best_hit_number++;
    } 
    beFriendly();
  }
  for(i = best_hit_number; i < max_hit_retrieved; i++){
    best_hits_array[best_hit_number].weight = 0;
  }
  /* print_best_hits(s);  for debugging */
}


/* returns the next best hit */
long best_hit(db, doc_id, best_character, best_line, score)
     database *db;
     long *doc_id;	
     long *best_character;
     long *best_line;
     long *score;
{

  *best_character = 0; 
  *best_line = 0;
  
#ifdef BOOL
  if (currentQuery != NULL) /* for boolean */
   {
     send(currentQuery,GetBestHit,db,doc_id,best_character,best_line,score);
     if (*doc_id > 0)
       return(0); /* ok */
     else
       return(-1); /* no more docs */
   }
#endif /* BOOL */

  if(current_best_hit > best_hits_array_len)
    return(1);
  if(best_hits_array[current_best_hit].weight == 0)
    return(1);
  *doc_id = best_hits_array[current_best_hit].document_id;
  *score  = best_hits_array[current_best_hit].weight;
  current_best_hit++;
  return(0);
}

long finished_best_hit(db)
database *db;
{ 

#ifdef BOOL
  if (currentQuery != NULL) /* for boolean */
   { send(currentQuery,Delete);
     currentQuery = NULL;
     return(0);
   }
#endif /* BOOL */

  /* if we are on a small machine, we might want to 
     destroy_document_score_array */
  clear_document_score_array();
  clear_best_hits_array();
  current_best_hit = 0;
  return(0);
}

/*=============================*	
 *===  Searching for words  ===*
 *=============================*/

/* see irext.h for doc */
long init_search_word (db)
     database* db;
{
  return(0);
}

/* see irext.h for doc */
long search_word(word,char_pos, line_pos, weight, doc_id, 
		 word_pair, db)
     char *word; /* the word to be searched for */
     long char_pos;		/* the position of the start of the word */
     long line_pos;		/* is this needed? not for signature system */
     long weight;		/* how important the word looks syntactically,
				   such as is it bold */
     long doc_id;		/* current document, seed words is 0,
				   then it increments into the relevant 
				   document */
     long word_pair;
     database *db;
{
  /* this side effects the document_score_array,
   * and downcases the word.
   * Returns 0 if successful or word not present, 
   * returns non-0 if an error.
   *
   */
  
  long not_full_flag = INDEX_BLOCK_FULL_FLAG; /*start out full so it will go on looking */
  long count, index_block_size;
  long internal_document_id, internal_weight, number_of_valid_entries;
  long index_file_block_number;
  long number_of_occurances;

  FOUR_BYTE index_buffer_data[INDEX_ELEMENT_SIZE*(1024/4)];
  char *index_buffer;
  char *i;
  FILE *stream = NULL;

  index_buffer = (char*)index_buffer_data;

  index_file_block_number = 
    look_up_word_in_dictionary(word, &number_of_occurances, db);

  current_best_hit = 0;  /* so that the best hits willstart from 0 */

  /* check the document_score_array */
  if(document_score_array_len < db->doc_table_allocated_entries)
    make_document_score_array(db->doc_table_allocated_entries);

  if(index_file_block_number >= 0){
    stream = db->index_stream;
    
    while((not_full_flag != INDEX_BLOCK_NOT_FULL_FLAG) && 
	  (index_file_block_number != 0)){	
      /* read the index block */
      if (0 != fseek(stream, (long)index_file_block_number, 
		     SEEK_SET))	
	{ 
	  waislog(WLOG_HIGH, WLOG_ERROR, 
		  "fseek failed into the inverted file to position %ld",
		  (long)index_file_block_number); 
	  return(-1);
	}
      
      read(fileno(stream),index_buffer,INDEX_BLOCK_HEADER_SIZE);

      ASSIGN(not_full_flag,
	     INDEX_BLOCK_FLAG_SIZE,
	     index_buffer,
	     INDEX_BLOCK_HEADER_SIZE,
	     0 );
      ASSIGN(index_file_block_number,NEXT_INDEX_BLOCK_SIZE,
	     index_buffer+INDEX_BLOCK_FLAG_SIZE,
	     INDEX_BLOCK_HEADER_SIZE,
	     INDEX_BLOCK_FLAG_SIZE);
      ASSIGN(index_block_size,INDEX_BLOCK_SIZE_SIZE,
	     index_buffer+INDEX_BLOCK_FLAG_SIZE+NEXT_INDEX_BLOCK_SIZE,
	     INDEX_BLOCK_HEADER_SIZE,
	     INDEX_BLOCK_FLAG_SIZE+NEXT_INDEX_BLOCK_SIZE);

/* 
  this is equivalent, but slower:
      not_full_flag = read_bytes(INDEX_BLOCK_FLAG_SIZE, stream);
      index_file_block_number = read_bytes(NEXT_INDEX_BLOCK_SIZE, stream);
      index_block_size = read_bytes(INDEX_BLOCK_SIZE_SIZE, stream);

      printf("flag = %d, block_num = %d, block_size = %d\n",
	     not_full_flag, 
	     index_file_block_number,
	     index_block_size);

      fflush(stdout);
*/
      if(EOF == index_block_size) 
	{ 
	  waislog(WLOG_HIGH, WLOG_ERROR, 
		  "reading from the index file failed");
	  return(-1);
	}
      
      if(not_full_flag == INDEX_BLOCK_NOT_FULL_FLAG){
	/* not full */
	number_of_valid_entries = index_file_block_number;
      }
      else if(not_full_flag == INDEX_BLOCK_FULL_FLAG){
	/* full */
	number_of_valid_entries = index_block_size - INDEX_BLOCK_HEADER_SIZE;
      }
      else{			/* bad news, file is corrupted. */
	waislog(WLOG_HIGH, WLOG_ERROR, 
		"Expected the flag in the inverted file to be valid.  it is %ld",
		not_full_flag);
	return(-1);
      }
      /* printf("number of valid bytes: %ld\n", number_of_valid_entries); */
      
      /* add the array to the document_score_array */
      number_of_valid_entries /= INDEX_ELEMENT_SIZE;

      for(count=0;count <  number_of_valid_entries;count++) {
	int wgt;
	int did;
	if(count%1024 == 0) {
	  read(fileno(stream),index_buffer,INDEX_ELEMENT_SIZE*
		MIN(1024,number_of_valid_entries-count));
	  i=index_buffer;
	}
	ASSIGN(wgt,WEIGHT_SIZE,    
	       i+DOCUMENT_ID_SIZE+WORD_POSITION_SIZE+CHARACTER_POSITION_SIZE,
	       INDEX_ELEMENT_SIZE,
	       DOCUMENT_ID_SIZE+WORD_POSITION_SIZE+CHARACTER_POSITION_SIZE);
	ASSIGN(did,DOCUMENT_ID_SIZE,i,INDEX_ELEMENT_SIZE,0);

	internal_weight = wgt;
	internal_document_id = did;

	/*
	printf("entry %ld, Doc_id: %ld, weight %ld \n",
		count, internal_document_id, internal_weight);
	fflush(stdout);
	*/
	if(EOF == internal_weight) 
	  { 
	    waislog(WLOG_HIGH, WLOG_ERROR, 
		    "reading from the doc-id table failed");
	    return(-1);
	  }
	/* if(doc_id > 0) we are doing a relevant document */
	document_score_array[internal_document_id] += 
	  (doc_id) ? (internal_weight * RF_WEIGHTING) : internal_weight;
	i+=INDEX_ELEMENT_SIZE;
      }
    }
    return(0); 
  }
  else if(0 == index_file_block_number){
    /* an error occurred on looking up the word */
    return(-1);
  }
  else				/* index_file_block_number is negative */
    return(0);		/* word not present */
}

/* now collect the best hits */
long finished_search_word(db)
     database *db;
{ 

#ifdef BOOL
  if (currentQuery != NULL)
    return; /* do nothing for boolean */
#endif /* def BOOL */

  /* check the document_score_array */
  if(document_score_array_len < db->doc_table_allocated_entries)
    make_document_score_array(db->doc_table_allocated_entries);

  make_best_hits_array(max_hit_retrieved);
  sort_best_hits(db);
  return(0);
}

These are the contents of the former NiCE NeXT User Group NeXTSTEP/OpenStep software archive, currently hosted by Netfuture.ch.