irverify.c

This is irverify.c in view mode; [Download] [Up]
/* WIDE AREA INFORMATION SERVER SOFTWARE:
   No guarantees or restrictions.  See the readme file for the full standard
   disclaimer.

   Brewster@think.com
*/

/* Change log:
 * $Log:	irverify.c,v $
 * Revision 1.6  92/04/01  09:57:36  morris
 * fixed and eof check in readPostings
 * 
 * Revision 1.5  92/03/28  19:48:10  jonathan
 * Fixed Log header.
 * 
 * Revision 1.4  92/02/18  15:36:32  morris
 * made it faster
 * 
 * Revision 1.3  92/02/12  13:32:33  jonathan
 * Added $Log so RCS will put the log message in the header
 * 
*/

#include "irverify.h"
#include "irfiles.h"
#include "panic.h"
#include "futil.h"

#define TEST_READ false

/*---------------------------------------------------------------------------*/

void
printIndex (db)
database* db;
/* Walk the inverted index file of db serially, term by term, and print the
   postings of every term with printPostingsForATerm.
   Panics when the index file cannot be opened.
 */
{
  serialPostingFile* spf = NULL;
  char indexFileName[MAX_FILE_NAME_LEN + 1];
  postingsForATerm* posts = NULL;
  
  spf = initSerialPostingFile(index_filename(indexFileName,db));
  if (spf == NULL) 
   { /* initSerialPostingFile returns NULL when the file can't be opened;
	going on would dereference NULL in getPostingsForNextTerm */
     panic("could not open the index file");
     return; /* in case panic comes back */
   }
  
  while ((posts = getPostingsForNextTerm(spf)) != NULL)
   { printPostingsForATerm(posts);
     /* XXX the postings are still not disposed of: readPostings can return
	before it has set posts->docs and posts->weights, so passing them
	to disposePostingsForATerm is not known to be safe */
   }
   
  disposeSerialPostingFile(spf);
}

/*---------------------------------------------------------------------------*/

static void 
print_dictionary_block_and_index _AP((unsigned char* block,long size,serialPostingFile* spf));

static void 
print_dictionary_block_and_index(block,size,spf)
unsigned char *block;
long size;
serialPostingFile* spf;
/* Print the entries of one dictionary block and, for each entry, the
   postings found in spf at the position recorded for that entry.
   block - the dictionary block to print
   size  - the maximum number of entries to look at
   spf   - the open posting file the positions refer to
   Stops early at the first entry whose word is empty.
 */
{
  long i;
  postingsForATerm* posts = NULL;
  
  for(i = 0; i < size; i++)
   {
     char *word = dictionary_block_word(i, block);
     long pos = dictionary_block_position(i, block);
     if(word[0] == '\0')
       break;
     printf("Entry %3ld: %21s %7ld\n", i, word,pos);
     posts = getPostingsAt(spf,pos);
     if (posts == NULL)
      { /* getPostingsForNextTerm hands back NULL at end of file; printing
	   that would dereference NULL */
	fprintf(stderr,"no postings could be read at position %ld\n",pos);
	continue;
      }
     printPostingsForATerm(posts);
     /* XXX dispose of them postings */
   }
}

/*---------------------------------------------------------------------------*/

extern long number_of_dictionary_blocks;
extern unsigned char *dictionary_header_block;
extern unsigned char *dictionary_block;

void
printIndexUsingDictionary(db)
database* db;
/* Use the dictionary to go over the index: print the dictionary header
   block, then every dictionary block together with the postings each of
   its entries points at.
   Panics when the dictionary stream is not open, when the index file cannot
   be opened, or when a dictionary block cannot be read.
 */
{
  FILE *dictStream = db->dictionary_stream;
  long i;
  long new_number_of_dictionary_blocks;
  serialPostingFile* spf = NULL;
  char indexFileName[MAX_FILE_NAME_LEN + 1];

  /* check the dictionary first so the index file is not opened for nothing */
  if(NULL == dictStream)
    panic("dictionary dictStream is not open");

  spf = initSerialPostingFile(index_filename(indexFileName,db));
  if(NULL == spf)
   { /* initSerialPostingFile returns NULL when the file can't be opened */
     panic("could not open the index file");
     return; /* in case panic comes back */
   }

  s_fseek(dictStream, 0L, SEEK_SET);
  new_number_of_dictionary_blocks = read_bytes(DICTIONARY_HEADER_SIZE, dictStream);
  /* NOTE(review): the old header block pointer is dropped here without being
     freed - presumably so that read_dictionary_block allocates a bigger
     one; confirm against read_dictionary_block */
  if(new_number_of_dictionary_blocks > number_of_dictionary_blocks)
    dictionary_header_block = NULL;
  number_of_dictionary_blocks = new_number_of_dictionary_blocks;
  printf("Number of dictionary blocks %ld\n", number_of_dictionary_blocks);
  if(NULL == (dictionary_header_block =
	      read_dictionary_block(dictionary_header_block,
				    DICTIONARY_HEADER_SIZE,
				    number_of_dictionary_blocks,
				    dictStream)))
    panic("Could not read dictionary header block");
  printf("The Dictionary Header Block:\n");
  print_dictionary_block(dictionary_header_block, number_of_dictionary_blocks);
  for(i = 0; i < number_of_dictionary_blocks; i++)
  {
    /* each header entry records where its dictionary block starts */
    long pos = dictionary_block_position(i, dictionary_header_block);
    if(NULL == (dictionary_block =
		read_dictionary_block(dictionary_block,
				      pos, DICTIONARY_BLOCK_SIZE, dictStream)))
      panic("Could not read dictionary block %ld", pos);
    printf("\n\nDictionary block %ld (position %ld):\n", i, pos);
    print_dictionary_block_and_index(dictionary_block, DICTIONARY_BLOCK_SIZE,spf);
  }
  /* leave the dictionary stream positioned at its end */
  fseek(dictStream, 0L, SEEK_END);
  disposeSerialPostingFile(spf);
}

/*---------------------------------------------------------------------------*/

serialPostingFile*
initSerialPostingFile(filename)
char* filename;
/* Open the inverted index file named filename for binary reading, seek to
   INDEX_HEADER_SIZE, and return a newly allocated serialPostingFile that
   records the stream, the file length and the current block position.
   Returns NULL when the file cannot be opened.
 */
{
  FILE* indexStream = s_fopen(filename,"rb");
  serialPostingFile* state;

  if (NULL == indexStream)
    return(NULL); /* no such file, or it is not readable */

  s_fseek(indexStream,INDEX_HEADER_SIZE,SEEK_SET);

  state = (serialPostingFile*)s_malloc((size_t)sizeof(serialPostingFile));
  state->current_index_block = INDEX_HEADER_SIZE;
  state->stream = indexStream;
  state->length = file_length(indexStream);

  return(state);
}

/*---------------------------------------------------------------------------*/

void
disposeSerialPostingFile(pf)
serialPostingFile* pf;
/* Close the stream held by pf and free pf itself.
   A NULL pf (what initSerialPostingFile returns when the open fails) is
   ignored.
 */
{
  if (pf == NULL)
    return;

  s_fclose(pf->stream);
  s_free(pf);
}

/*---------------------------------------------------------------------------*/

void 
printPostingsForATerm(pfat)
postingsForATerm* pfat;
/* Print the word of pfat (when it has one) followed by one line per posting
   giving the document id and the weight.  A NULL pfat prints nothing.
 */
{
  long i;

  if (pfat == NULL)
    return;

  if (pfat->word[0] != '\0')
    printf("word '%s'\n",pfat->word);
    
  /* the casts make the arguments match %ld whatever the exact integer
     types behind docID and postingWeight are */
  for (i = 0; i < pfat->entries; i++)
    printf("\tdoc %ld weight %ld\n",
	   (long)pfat->docs[i],(long)pfat->weights[i]);
}

/*---------------------------------------------------------------------------*/

postingsForATerm*
getPostingsAt(spf,position)
serialPostingFile* spf;
long position;
/* Move spf to position and read the postings of the term that starts there.
   position better be a valid starting position!
   Returns NULL when the seek fails, or when getPostingsForNextTerm
   returns NULL.
 */
{
  if (0 != fseek(spf->stream,position,SEEK_SET))
   { /* don't go on to read from wherever the stream happens to be */
     fprintf(stderr,
	     "fseek failed into the inverted file to position %ld\n",
	     position);
     return(NULL);
   }

  spf->current_index_block = position;
  return(getPostingsForNextTerm(spf));
}

/*---------------------------------------------------------------------------*/

void
disposePostingsForATerm(pfat)
postingsForATerm* pfat;
/* Free the docs and weights arrays of pfat, then pfat itself.
   A NULL pfat is ignored.
 */
{
  if (pfat == NULL)
    return;

  s_free(pfat->docs);
  s_free(pfat->weights);
  s_free(pfat);
}

/*---------------------------------------------------------------------------*/

void 
removePostings(pfat,start,run)
postingsForATerm* pfat;
long start;
long run;
/* Remove run postings, beginning with the one at index start, from pfat.
   The postings after the removed run are moved down in both the docs and
   the weights arrays and pfat->entries is reduced by run.
   Nothing is changed when start or run is negative or when the run would
   extend past the last entry.
 */
{
  long remaining; /* number of postings that follow the removed run */

  if (start < 0 || run < 0 ||
      start > pfat->entries || run > pfat->entries - start)
    return; /* this is an error */

  remaining = pfat->entries - start - run;

  /* docs and weights are typed pointers, so the pointer arithmetic already
     counts in elements; only the byte count given to memmove is scaled,
     and each array is scaled by its own element size */
  memmove(pfat->docs + start,pfat->docs + start + run,
	  (size_t)remaining * sizeof(docID));
  memmove(pfat->weights + start,pfat->weights + start + run,
	  (size_t)remaining * sizeof(postingWeight));

  pfat->entries -= run;
}

/*---------------------------------------------------------------------------*/

void
readPostings(spf,posts,not_full_flag)
serialPostingFile* spf;
postingsForATerm* posts;
long not_full_flag;
/* Read the postings of one index block from spf->stream into posts.
   The block flag has already been read by the caller and is passed in as
   not_full_flag; reading continues with the two header fields that follow.
   Allocates posts->docs and posts->weights and adds one to posts->entries
   for every posting read, so the caller has to set entries to 0 first.
   Returns early, after a message on stderr, when a read fails or the block
   header gives a negative number of entries; in the header cases docs and
   weights have not been allocated.  Panics on an unknown flag.
 */
{
  long count;
  long number_of_valid_entries = 0;
  long index_block = read_bytes(NEXT_INDEX_BLOCK_SIZE,spf->stream);
  long index_block_size = read_bytes(INDEX_BLOCK_SIZE_SIZE,spf->stream);
  
  if (EOF == index_block_size) 
   { fprintf(stderr,"reading from the index file failed\n");
     return;
   }
      
  if (not_full_flag == INDEX_BLOCK_NOT_FULL_FLAG) /* not full */
    /* for a block that is not full this field is used as the number of
       valid bytes rather than as a block position */
    number_of_valid_entries = index_block / INDEX_ELEMENT_SIZE;

  else if (not_full_flag == INDEX_BLOCK_FULL_FLAG) /* full */
   { number_of_valid_entries = 
       (index_block_size - INDEX_BLOCK_HEADER_SIZE) / INDEX_ELEMENT_SIZE;
   }

  else
   { /* bad news,file is corrupted.  this should return error
	code rather than panicing XXX */
     panic("Expected the flag in the inverted file to be valid.  it is %lx",
	   not_full_flag);
     return; /* in case panic comes back */
   }

  if (number_of_valid_entries < 0)
   { /* a corrupted header; as a size_t this would ask for a huge block */
     fprintf(stderr,"reading from the index file failed\n");
     return;
   }

  posts->docs = 
    (docID*)s_malloc((size_t)(sizeof(docID) * number_of_valid_entries));
  posts->weights = 
    (postingWeight*)s_malloc((size_t)(sizeof(postingWeight) * 
				      number_of_valid_entries));

  for (count = 0; count < number_of_valid_entries; count++)
   { long val;
     posts->docs[count] = read_bytes(DOCUMENT_ID_SIZE,spf->stream);
     /* the word and character positions are skipped, not read */
     s_fseek(spf->stream,WORD_POSITION_SIZE+CHARACTER_POSITION_SIZE,SEEK_CUR);
     val = read_bytes(WEIGHT_SIZE,spf->stream);
     if(EOF == val)
      { fprintf(stderr,"reading from the inverted file failed\n");
        return;
      }
     else
      { posts->weights[count] = val;
        posts->entries++;
      } 
   }
}

/*---------------------------------------------------------------------------*/

long
readDictionaryIndexBlock(number_of_occurances,word,stream)
long *number_of_occurances;
char *word;
FILE *stream;
/* NOTE - similar to read_dictionary_index_block */
{
  /* this reads the dictionary index block from the index stream.
     It assumes the stream is positioned right after the flag.
     number_of_occurances receives the occurance count stored in the block.
     word receives the word without its trailing newline; it has to have
     room for MAX_WORD_LENGTH characters and the '\0'.
     returns 0 if it succeeds.
     returns -1 if it is at the end of a file (the word can't be read).
     returns -2 if it read something strange (a word longer than
     MAX_WORD_LENGTH).
     Always sets word length to 0 if it fails. */

  char temp[MAX_WORD_LENGTH + 2];
  size_t len;
  
  word[0] = '\0';

  /* skip the two header fields that are of no use here */
  s_fseek(stream,NEXT_INDEX_BLOCK_SIZE+INDEX_BLOCK_SIZE_SIZE,SEEK_CUR);
  *number_of_occurances = read_bytes(NUMBER_OF_OCCURANCES_SIZE,stream);

  /* 2 is for the \n and '\0'.  When fgets fails temp is not filled in,
     so it must not be looked at */
  if (fgets(temp,MAX_WORD_LENGTH + 2,stream) == NULL)
    return(-1);

  /* trim the \n; the length check keeps an empty string from being
     indexed at -1 */
  len = strlen(temp);
  if (len > 0 && temp[len - 1] == '\n')
    temp[--len] = '\0';

  /* no newline within MAX_WORD_LENGTH + 1 characters: copying it would
     run past the end of a MAX_WORD_LENGTH + 1 byte word buffer */
  if (len > (size_t)MAX_WORD_LENGTH)
    return(-2);

  strcpy(word, temp);

  return(0);
}

/*---------------------------------------------------------------------------*/

postingsForATerm*
getPostingsForNextTerm(spf)
serialPostingFile* spf;
/* Read blocks from spf->stream up to and including the next block of
   postings and return them in a newly allocated postingsForATerm.  When a
   dictionary entry comes first its word is stored in the result.
   Returns NULL when the end of the file is reached before any postings
   are found.  Panics when the dictionary entry can't be read.
 */
{
  postingsForATerm* posts = NULL;
  
  posts = (postingsForATerm*)s_malloc((size_t)sizeof(postingsForATerm));
  posts->word[0] = '\0';
  posts->entries = 0;

  /* this is really a 2 step process - read the dictonary block, then read
     the postings.  I don't see any reason to unwrap it though */
  while (true)
   { 
     long flag = read_bytes(INDEX_BLOCK_FLAG_SIZE,spf->stream);

     if (flag == EOF)
      { /* docs and weights are only allocated by readPostings, which has
	   not been called yet, so posts itself is all there is to free */
	s_free(posts);
	return(NULL);
      }

     if (flag == INDEX_BLOCK_DICTIONARY_FLAG) /* the dictionary entry */
      { /* read the dictionary part */
	long number_of_occurances;
	if (readDictionaryIndexBlock(&number_of_occurances,
				     posts->word,spf->stream) < 0)
	  panic("read dictionary index block failed");
      }
     else /* the posting entry */
      { readPostings(spf,posts,flag);
	break;
      }
   }

  return(posts);
}

/*---------------------------------------------------------------------------*/




#ifdef old

/* These routines are slower than the current ones; keep them around for a
   while until we are sure the new ones work ok. */

/*---------------------------------------------------------------------------*/

postingsForATerm*
getPostingsForNextTerm(spf)
serialPostingFile* spf;
/* Old version, only compiled when `old' is defined.
   Returns whatever getPostingsForNextTermFAST returns for spf.  The
   commented out lines ran the SLOW reader from the same file position as
   well and printed both results so they could be compared.
   NOTE(review): getPostingsForNextTermFAST is not defined anywhere in this
   file - confirm it still exists before defining `old'.
 */
{
  postingsForATerm* slow; /* only used by the commented out code */
  postingsForATerm* fast;
/*
  long filePos = s_ftell(spf->stream);
  slow = getPostingsForNextTermSLOW(spf);
  printf("SLOW:\n");
  printPostingsForATerm(slow);NL();
  s_fseek(spf->stream,filePos,SEEK_SET);
*/
  fast = getPostingsForNextTermFAST(spf);
/*  printf("FAST:\n");
  printPostingsForATerm(fast);NL();
  disposePostingsForATerm(slow);
*/
  return(fast);
}

/*---------------------------------------------------------------------------*/

postingsForATerm*
getPostingsForNextTermSLOW(spf)
serialPostingFile* spf;
/* Old version, only compiled when `old' is defined.
   Starting at spf->current_index_block, read blocks until one block of
   postings has been read, and return the result in a newly allocated
   postingsForATerm.  A dictionary block on the way supplies the word.
   Returns NULL when current_index_block is at or past the file length.
   Panics on an unknown block flag or when the dictionary block can't be
   read.
 */
{
  postingsForATerm* posts = NULL;
  boolean keepGoing = true;
  
  /* nothing left to read */
  if (spf->current_index_block >= spf->length)
    return(NULL);

  posts = (postingsForATerm*)s_malloc((size_t)sizeof(postingsForATerm));
  posts->word[0] = '\0';
  posts->entries = 0;

  while (keepGoing) 
   { 
     /* the three header fields of the block at the current position */
     long flag = read_bytes(INDEX_BLOCK_FLAG_SIZE,spf->stream);
     long next_index_block = read_bytes(NEXT_INDEX_BLOCK_SIZE,spf->stream);
     long index_block_size = read_bytes(INDEX_BLOCK_SIZE_SIZE,spf->stream);

     if (flag == INDEX_BLOCK_DICTIONARY_FLAG)
       { long last_index_block;
	 /* this declaration hides the index_block_size read above for the
	    rest of this branch; the advance at the bottom of the loop uses
	    the outer one */
	 long index_block_size;
	 long number_of_occurances;
	 char word[MAX_WORD_LENGTH + 1];
	 if (0 > read_dictionary_index_block(spf->current_index_block,
					     &last_index_block,
					     &index_block_size,
					     &number_of_occurances,
					     word,
					     spf->stream))
	   panic("read dictionary index block failed");
	 /* NOTE(review): the value printed as "last block" is
	    next_index_block, not last_index_block - confirm which one was
	    meant */
	 cprintf(TEST_READ,
		 "%ld: size %3ld word '%s',occurances %ld last block %ld\n",
		 spf->current_index_block,index_block_size,word,
		 number_of_occurances,next_index_block);
	 strcpy(posts->word,word);
       }

     else if (flag == INDEX_BLOCK_NOT_FULL_FLAG)
       { cprintf(TEST_READ,"%ld: size %3ld Not full,valid entries %ld\n",
		 spf->current_index_block,index_block_size,next_index_block);
	 /* the two argument readPostings is the old one in this #ifdef; it
	    seeks back to current_index_block and reads the header again */
	 readPostings(spf,posts);
	 keepGoing = false;
       }

     else if (flag == INDEX_BLOCK_FULL_FLAG)
       { cprintf(TEST_READ,"%ld: size %3ld full block,next block %ld\n",
		 spf->current_index_block,index_block_size,next_index_block);
	 readPostings(spf,posts);
	 keepGoing = false;
       }

     else 
       panic("bad entry %ld (ftell %ld),flag was %ld",
	     spf->current_index_block,ftell(spf->stream),flag);

     /* step over the block just handled, using the size from its header */
     spf->current_index_block += index_block_size;
     s_fseek(spf->stream,spf->current_index_block,SEEK_SET);
   }

  return(posts);
}

/*---------------------------------------------------------------------------*/

void
readPostings(spf,posts)
serialPostingFile* spf;
postingsForATerm* posts;
/* Old version, only compiled when `old' is defined.
   Seek to spf->current_index_block, read the block header found there and
   append every posting of the block to posts, growing posts->docs and
   posts->weights by one element for each posting.
   Returns early, after a message on stderr, when the seek or a read
   fails.  Panics on an unknown block flag.
 */
{
  long not_full_flag = INDEX_BLOCK_FULL_FLAG;
  long count,index_block_size;
  long document_id,weight,number_of_valid_entries;
  long index_block = spf->current_index_block;
  
  if (index_block >= 0)
    {
      /* read the index block */
      if (0 != fseek(spf->stream,(long)index_block,SEEK_SET))
	{ 
	  fprintf(stderr,
		  "fseek failed into the inverted file to position %ld\n",
		  (long)index_block); 
	  return;
	}
      
      /* from here on index_block holds the second header field, not the
	 position that was just seeked to */
      not_full_flag = read_bytes(INDEX_BLOCK_FLAG_SIZE,spf->stream);
      index_block = read_bytes(NEXT_INDEX_BLOCK_SIZE,spf->stream);
      index_block_size = read_bytes(INDEX_BLOCK_SIZE_SIZE,spf->stream);
      if (EOF == index_block_size) 
	{ fprintf(stderr,"reading from the index file failed\n");
	  return;
	}
      
      /* number_of_valid_entries is a count of bytes in this version: the
	 loop below steps through it INDEX_ELEMENT_SIZE at a time */
      if (not_full_flag == INDEX_BLOCK_NOT_FULL_FLAG)
	{ /* not full */
	  number_of_valid_entries = index_block;
	}
      else if (not_full_flag == INDEX_BLOCK_FULL_FLAG)
	{ /* full */
	  number_of_valid_entries = index_block_size - INDEX_BLOCK_HEADER_SIZE;
	}
      else
	{ /* bad news,file is corrupted.  this should return error
	     code rather than panicing XXX */
	panic("Expected the flag in the inverted file to be valid.  it is %ld",
	      not_full_flag);
        }

      cprintf(TEST_READ,"  number of valid bytes: %ld\n",
	      number_of_valid_entries);
      
      for (count = 0; count < number_of_valid_entries; 
	   count = count + INDEX_ELEMENT_SIZE)
	{
	  document_id = read_bytes(DOCUMENT_ID_SIZE,spf->stream);
	  /* the word and character positions are read and thrown away */
	  (void)read_bytes(WORD_POSITION_SIZE,spf->stream);
	  (void)read_bytes(CHARACTER_POSITION_SIZE,spf->stream);
	  weight = read_bytes(WEIGHT_SIZE,spf->stream);
	  /* NOTE(review): count is always a multiple of INDEX_ELEMENT_SIZE,
	     so the entry number printed here is always 0; count /
	     INDEX_ELEMENT_SIZE was probably meant - confirm */
	  cprintf(TEST_READ,"    entry %ld,Doc_id: %ld,weight %ld\n",
		  count % INDEX_ELEMENT_SIZE,document_id,weight);
	  if(EOF == weight) 
	    { fprintf(stderr,"reading from the doc-id table failed\n");
	      return;
	    }
	  /* grow both arrays by one element and store the new posting last.
	     NOTE(review): the first s_realloc relies on posts->docs and
	     posts->weights being NULL (or valid) on entry; the SLOW reader
	     never sets them, so this presumably depends on s_malloc
	     clearing the memory - confirm */
	  posts->entries++;
	  posts->docs = (docID*)s_realloc(posts->docs,
				  (size_t)(sizeof(docID) * posts->entries));
	  posts->docs[posts->entries - 1] = document_id;
	  posts->weights = (postingWeight*)s_realloc(posts->weights,
				     (size_t)(sizeof(postingWeight) * 
					      posts->entries));
	  posts->weights[posts->entries - 1] = weight;
	}
    }
}

/*---------------------------------------------------------------------------*/

#endif /* def old */
These are the contents of the former NiCE NeXT User Group NeXTSTEP/OpenStep software archive, currently hosted by Netfuture.ch.