/* irretrvl.c */

/* This is irretrvl.c in view mode; [Download] [Up] */
/* WIDE AREA INFORMATION SERVER SOFTWARE:
   No guarantees or restrictions.  See the readme file for the full standard
   disclaimer.	
  
*/

#ifndef lint
static char *RCSid = "$Header: /tmp_mnt/net/quake/proj/wais/wais-8-b5/ir/RCS/irretrvl.c,v 1.30 92/05/10 14:43:59 jonathan Exp $";
#endif

/* Change log:
 * $Log:	irretrvl.c,v $
 * Revision 1.30  92/05/10  14:43:59  jonathan
 * 
 * Made a little safer on NULL docid's when parsing.
 * 
 * Revision 1.29  92/05/06  17:31:26  jonathan
 * modified #if's for NeXT and Mach.  Added S_ISDIR definition for them both.
 * 
 * Revision 1.28  92/05/04  17:19:54  jonathan
 * Added test for parsing docids (if null, log error).
 * 
 * Revision 1.27  92/04/28  16:56:08  morris
 * added boolean to serial engine
 * 
 * Revision 1.26  92/04/01  17:09:46  jonathan
 * Added index_directory to check_for_legitimate_file to test if filename is
 * under default directory (for FTP-like retrieval).
 * 
 * 
 * Revision 1.25  92/03/18  08:54:41  jonathan
 * Removed databaseName argument from getData and getDocumentText.  The
 * database name is now culled from the docid.  Removed special cases for INFO
 * and Quest db's, as they should no longer be needed.
 * 
 * Revision 1.24  92/02/18  14:04:49  jonathan
 * in check_for_legitimate_file: added INFO to the list of special case
 * retrievals from MAC's.
 * 
 * Revision 1.23  92/02/18  11:53:45  jonathan
 * conditionalized use of tempnam for NeXT (doesn't exist, use tmpnam
 * instead).  May be a BSD thing.
 * 
 * Revision 1.22  92/02/17  12:38:52  jonathan
 * special case catalog in check_for_legitimate_file.
 * 
 * Revision 1.21  92/02/16  18:04:52  jonathan
 * Demoted more WLOG_ERROR's to WLOG_WARNING's
 * 
 * Revision 1.20  92/02/15  19:40:30  jonathan
 * Improved reporting of retrieval errors.
 * 
 * Revision 1.19  92/02/15  18:58:38  jonathan
 * Changed most (but not all) waislog errors to warnings on retrieval.
 * 
 * Revision 1.18  92/02/14  16:06:20  jonathan
 * Fixed text in error message for invalid docid (not in DB)
 * 
 * Revision 1.17  92/02/14  15:24:08  jonathan
 * Made parseDocID public.
 * 
 * Revision 1.16  92/02/12  13:29:35  jonathan
 * Added "$Log" so RCS will put the log message in the header
 * 
*/

/* retrieval part of the serial ir engine.  if you are using a different 
   storage system for the documents, replace this file.

   -brewster

 10/91 added .Z file support from mlm@cs.brown.edu (Moises Lejter)

 to do:
  handle .Z files at a lower level.

 */

#include "irretrvl.h"
#include "irfiles.h" /* for filename_table_ext */
#include <string.h>
#include <stdlib.h> /* for free */
#include "futil.h"
#include <ctype.h>  /* for isspace */
#include "irext.h"
#include "irdirent.h"
#include <sys/stat.h>

/* Fallback definitions of S_ISDIR for platforms whose headers lack it.
   The file-type field must be masked out and compared as a whole: a bare
   bit test against the directory constant also matches any other file
   type whose code shares that bit (e.g. block devices). */
#ifdef Mach
#include <sys/inode.h>
#define S_ISDIR(f_mode) (((f_mode) & IFMT) == IFDIR)
#endif /* Mach */

#if (defined(NeXT) && !(defined(S_ISDIR)))
#define S_ISDIR(f_mode) (((f_mode) & S_IFMT) == S_IFDIR)
#endif

/*----------------------------------------------------------------------*/


/* Split the local part of doc's document id into its three
   whitespace-separated fields: "<start> <end> <filename>".

   doc             - document object whose DocumentID is parsed
   filename        - out: receives the file name; must have room for at
                     least MAX_FILENAME_LEN bytes including the NUL
   start_character - out: first field, parsed as a decimal long
   end_character   - out: second field, parsed as a decimal long
   errorCode       - out: set to GDT_BadDocID on every failure path

   Returns true when all three fields were extracted, false otherwise.
   On failure the output arguments other than errorCode must be treated
   as undefined.  A file name that would not fit in MAX_FILENAME_LEN
   bytes is rejected rather than silently truncated. */
boolean
parseDocID(doc,filename,start_character,end_character,errorCode)
DocObj* doc;
char* filename;
long* start_character;
long* end_character;
long* errorCode;
{
  DocID* theDocID = NULL;
  char* local_id = NULL;
  char* token = NULL;
  char* name = NULL;
  long i;
  long first_sep = -1;		/* index of the separator after field 1 */
  long second_sep = -1;		/* index of the separator after field 2 */
  char first_sep_char = ' ';
  char second_sep_char = ' ';

  if (doc == NULL || (theDocID = docIDFromAny(doc->DocumentID)) == NULL) {
    *errorCode = GDT_BadDocID;
    return(false);
  }

  local_id = anyToString(GetLocalID(theDocID));
  
  freeDocID(theDocID);

  if (local_id == NULL) {
    waislog(WLOG_HIGH, WLOG_WARNING, 
	    "Attempt to retrieve data for empty doc-id");
    *errorCode = GDT_BadDocID;
    return(false);
  }

  /* first field: the start position.  The cast keeps isspace() defined
     for bytes with the high bit set. */
  token = local_id;
  for (i = 0; 
       local_id[i] != '\0' && !isspace((unsigned char)local_id[i]); 
       i++)
    ;
  if (local_id[i] == '\0')
    goto bad_docid;
  first_sep = i;
  first_sep_char = local_id[i];
  local_id[i] = '\0';		/* terminate the field for sscanf */
  if (sscanf(token,"%ld",start_character) != 1)
    goto bad_docid;

  /* second field: the end position */
  token = local_id + i + 1;
  for (++i; 
       local_id[i] != '\0' && !isspace((unsigned char)local_id[i]); 
       i++)
    ;
  if (local_id[i] == '\0')
    goto bad_docid;
  second_sep = i;
  second_sep_char = local_id[i];
  local_id[i] = '\0';
  if (sscanf(token,"%ld",end_character) != 1)
    goto bad_docid;

  /* and finally the file name: everything after the second separator */
  name = local_id + i + 1;
  if (strlen(name) >= (size_t)MAX_FILENAME_LEN)
    goto bad_docid;
  strcpy(filename,name);	/* length checked above */

  s_free(local_id);
  return(true);

 bad_docid:
  /* put the separators back so the log shows the whole id, not just the
     part before the first field boundary */
  if (first_sep >= 0)
    local_id[first_sep] = first_sep_char;
  if (second_sep >= 0)
    local_id[second_sep] = second_sep_char;
  waislog(WLOG_HIGH, WLOG_WARNING, 
	  "Attempt to retrieve data for bad doc-id: '%s'",local_id); 
  *errorCode = GDT_BadDocID;
  s_free(local_id);
  return(false);
}


/*----------------------------------------------------------------------*/

/* this checks to make sure that the filename is a file 
   within the database */

static boolean check_for_legitimate_file 
  _AP((char *filename, char* database_name, char* index_directory));

/* Decide whether filename may be served for the given database.

   filename        - path of the file the client asked for
   database_name   - full pathname of the database
   index_directory - directory passed through from the retrieval entry
                     points; may be NULL

   Returns true when retrieval is allowed:
     - the name contains ".src" or ".cat" (help/catalog files, which are
       not listed in the filename table), or
     - the name is listed in the database's filename table, or
     - it is not listed but index_directory is NULL, or substrcmp()
       reports a match between filename and index_directory.
   Returns false (and logs a warning) for directories and for files that
   pass none of the tests above.

   NOTE(review): the ".src"/".cat" tests match the substring anywhere in
   the name, and a NULL index_directory lets any unlisted file through.
   Both are kept as found; confirm they are intended before exposing this
   to untrusted clients. */
static boolean check_for_legitimate_file(filename, database_name, index_directory)
     char *filename;
     char *database_name;  /* full pathname of the database */
     char *index_directory;
{
  struct stat sbuf;
  /* name of the file of the filetable for this db (eg  /bar/foo.fn) */
  char filename_table_filename[MAX_FILE_NAME_LEN +1]; 
  size_t used;

  /* the help file and catalog file (the .src and .cat files) must be
     special cased because they are not in the filename table */

  /* caching is done in filename_in_filename_file for repeated requests 
     for the same file, so it does not need to be repeated here. */

  if(NULL != strstr(filename, ".src")) /* let it pass */
    return(true);

  if(NULL != strstr(filename, ".cat")) /* let it pass */
    return(true);

  /* Only look at sbuf when stat() filled it in.  A failing stat() is not
     an error here: the caller may still find a compressed copy of the
     file, so fall through to the filename-table test. */
  if(stat(filename, &sbuf) == 0 && S_ISDIR(sbuf.st_mode)) {
    waislog(WLOG_HIGH, WLOG_WARNING, 
	    "File: '%s' is a directory, and cannot be retrieved.",
	    filename);
    return(false);
  }

  /* Build "<db directory>/<db file><table extension>".  strncat's count
     limits the bytes appended, not the size of the destination, so each
     call is given the space that is actually left. */
  pathname_directory(database_name, filename_table_filename);
  used = strlen(filename_table_filename);
  if (used < (size_t)MAX_FILE_NAME_LEN)
    strncat(filename_table_filename, "/", 
	    (size_t)MAX_FILE_NAME_LEN - used);
  used = strlen(filename_table_filename);
  if (used < (size_t)MAX_FILE_NAME_LEN)
    strncat(filename_table_filename, 
	    database_file(pathname_name(database_name)), 
	    (size_t)MAX_FILE_NAME_LEN - used);
  s_strncat(filename_table_filename, filename_table_ext, MAX_FILE_NAME_LEN,
	    MAX_FILE_NAME_LEN);

  if(filename_in_filename_file(filename, NULL, NULL, filename_table_filename))
    return(true);		/* everything is peachy */

  /* Not in the table: either the db does not exist, or the file is not
     in that db.  It can still pass via index_directory. */
  if(index_directory == NULL)
    return(true);
  if(substrcmp(filename, index_directory))
    return(true);

  /* we lose.  Log the bad news */
  waislog(WLOG_HIGH, WLOG_WARNING, 
	  "File: '%s' is not in DB '%s', and cannot be retrieved.",
	  filename, filename_table_filename);
  return(false);
}
  

/*----------------------------------------------------------------------*/

WAISDocumentText* getData(doc, errorCode, index_directory)
DocObj* doc;
long* errorCode;
char* index_directory;
/* it isn't text, so we can just grab data */
{
  FILE* file = NULL;
  char fileName[MAX_FILENAME_LEN + 1];
  char* dbname = NULL;
  WAISDocumentText* data = NULL;
  long start,end;		/* position of the document in the file */
  long startByte,endByte,bytes,bytesRead; /* part of the doc that we want */
  char* buffer = NULL;
  any* bufAny = NULL;
  DocID *docid;
#if (defined(NeXT) || defined(Mach))
  char tmpFileName[MAX_FILENAME_LEN+1];
#else
  char *tmpFileName = NULL;
#endif /* NeXT or Mach */

  /* we can only handle byte chunks here */
  if ((doc->ChunkCode == CT_byte) ||
      (doc->ChunkCode == CT_document)) {
    if (parseDocID(doc,fileName,&start,&end,errorCode) == false)
     {
       waislog(WLOG_HIGH, WLOG_WARNING, "can't parse docid");
       *errorCode = GDT_MissingDocID;
       return(NULL);
     }
 
    *errorCode = GDT_NoError;
  
    docid = docIDFromAny(doc->DocumentID);
    dbname = anyToString(GetDatabase(docid));
    freeDocID(docid);

    if(true == check_for_legitimate_file(fileName, dbname, index_directory)){
      file = s_fopen(fileName,"rb"); 

      if (file == NULL){
	if(probe_file_possibly_compressed(fileName)) {
	  char buffer[ 2 * MAX_FILENAME_LEN + 10 ];
#if (defined(NeXT) || defined(Mach))
	  tmpnam(tmpFileName);
#else
	  tmpFileName = tempnam( "/tmp/", 0 );
#endif /* NeXT or Mach */
	  sprintf( buffer, "zcat %s.Z > %s", fileName, tmpFileName );
	  system( buffer );
	  file = s_fopen(tmpFileName,"rb");
	}
      }
    }

    if (file == NULL) { 
      waislog(WLOG_HIGH, WLOG_WARNING, 
	      "Attempt to retrieve data for missing doc-id: '%s'",
	      fileName);
      *errorCode = GDT_MissingDocID;
      s_free(dbname);
      return(NULL);
    }

    if (doc->ChunkCode == CT_byte) {
      startByte = doc->ChunkStart.Pos + start;
      endByte = doc->ChunkEnd.Pos + start;
    }
    else {
      startByte = start;
      endByte = end;
    }

    waislog(WLOG_LOW, WLOG_RETRIEVE,
	    "Retrieving DocID: %d %d %s, byte: %d %d, from database %s", 
	    start, end, fileName, startByte, endByte, dbname);

    s_free(dbname);

    if (endByte > end && end != 0) { 
      waislog(WLOG_HIGH, WLOG_WARNING, 
	      "retrieval beyond bounds of document %ld in file <%s>",
	      endByte,fileName);
      *errorCode = GDT_BadRange;
      endByte = end;
    }
   
    /* get the bytes */
    if (fseek(file,startByte,SEEK_SET) != 0)
      { 
	waislog(WLOG_HIGH, WLOG_WARNING, 
		"retrieval can't seek to %ld in file <%s>",startByte,
		fileName);
	*errorCode = GDT_BadRange;
	if (tmpFileName) unlink( tmpFileName );
	if (tmpFileName) unlink( tmpFileName );
	if (tmpFileName) unlink( tmpFileName );
	return(NULL);
      }

    bytes = endByte - startByte; 
    buffer = (char*)s_malloc(bytes);
  
    bytesRead = fread((void*)buffer,(size_t)sizeof(char),bytes,file);
  
    if (bytesRead != bytes)
      { 
	waislog(WLOG_HIGH, WLOG_WARNING, 
		"retrieval error in file <%s>",fileName);
	*errorCode = GDT_BadRange;
	if (bytesRead == 0) 
	  return(NULL);
      }
  
    bufAny = makeAny(bytesRead,buffer);
  
    data = makeWAISDocumentText(duplicateAny(doc->DocumentID),0L,bufAny);
  
    /* the any and the buffer are freed by freeWAISSearchResponse() */
    s_fclose(file);
    if (tmpFileName) unlink( tmpFileName );

    return(data);
  }
  else
    { 
      waislog(WLOG_HIGH, WLOG_WARNING, 
	      "search engine can only use whole documents or byte offsets for data lookup");
      *errorCode = GDT_UnsupportedChunkType;
      return(NULL);
    }

}

/*----------------------------------------------------------------------*/

#define BUFSZ	(size_t)5000

WAISDocumentText* getDocumentText(doc, errorCode, index_directory)
DocObj* doc;
long* errorCode;
char* index_directory;
/* find the text for doc, get the sub part if any, finally construct and
   return a WAISDocumentText.  If it can not find the document 
   (or some other error) it returns NULL and sets errorCode.
 */
{
  WAISDocumentText* text = NULL;
  FILE* file = NULL;
  char* dbname = NULL;
  char* buffer = NULL;
  any* bufAny = NULL;
  char filename[MAX_FILENAME_LEN + 1];
  long start_character;
  long end_character;
  register long i;
  long bytes,bytesRead;
  long startByte,endByte,byte,lines;
#if (defined(NeXT) || defined(Mach))    
  char tmpFileName[MAX_FILENAME_LEN+1];
#else
  char *tmpFileName = NULL;
#endif /* NeXT or Mach */
  DocID* theDocID = NULL;
  char* local_id = NULL;
  
  *errorCode = GDT_NoError;

  /* we can only handle line chunks for now */
  if (doc->ChunkCode != CT_line)
   { 
     waislog(WLOG_HIGH, WLOG_WARNING, 
	     "search engine can only use line offsets for now.");
     *errorCode = GDT_UnsupportedChunkType;
     return(NULL);
   }

  theDocID = docIDFromAny(doc->DocumentID);
  dbname = anyToString(GetDatabase(theDocID));
  local_id = anyToString(GetLocalID(theDocID));
  freeDocID(theDocID);

  if (parseDocID(doc,filename,&start_character,&end_character,errorCode) == 
      false) {
    waislog(WLOG_HIGH, WLOG_ERROR,
	    "Can't parse doc-id: '%s'", local_id);
    *errorCode = GDT_MissingDocID;
    s_free(dbname);
    s_free(local_id);
    return(NULL);
  }

  waislog(WLOG_LOW, WLOG_RETRIEVE,
	  "Retrieving DocID: '%s', line range: %d %d, from database %s", 
	  local_id, doc->ChunkStart.Pos, doc->ChunkEnd.Pos,
	  dbname);
  /* check the database */
  if(NULL == dbname){
    waislog(WLOG_HIGH, WLOG_WARNING,
	    "Missing database for doc-id: '%s'", local_id);
    *errorCode = GDT_MissingDatabase;
    s_free(local_id);
    return(NULL);
  }
  
  if(check_for_legitimate_file(filename, dbname, index_directory) == false){
    waislog(WLOG_HIGH, WLOG_WARNING,
	    "doc-id: '%s' not in database '%s'", local_id,dbname);    
    *errorCode = GDT_MissingDocID;
    s_free(dbname);
    s_free(local_id);
    return(NULL);
  }

  s_free(dbname);

  file = s_fopen(filename,"r");
  if (file == NULL)
    if(probe_file_possibly_compressed(filename)) {
      char buffer[ 2 * MAX_FILENAME_LEN + 10 ];
#if (defined(NeXT) || defined(Mach))
      tmpnam(tmpFileName);
#else
      tmpFileName = tempnam( "/tmp/", 0 );
#endif /* NeXT or Mach */
      sprintf( buffer, "zcat %s.Z > %s", filename, tmpFileName );
      system( buffer );
      file = s_fopen(tmpFileName,"r");
    }
  if (file == NULL) { 
    waislog(WLOG_HIGH, WLOG_WARNING, 
	    "Attempt to retrieve text for bad doc-id: '%s'", local_id);     
    *errorCode = GDT_MissingDocID;
    s_free(local_id);
    return(NULL);
  }

  if(0 != fseek(file, start_character, SEEK_SET))
   { 
     waislog(WLOG_HIGH, WLOG_WARNING, 
	     " error on attempt to seek into file for doc-id: '%s'", local_id);
     s_free(local_id);
     *errorCode = GDT_BadRange;
     return(NULL);
   }
  /* find the start byte */
  buffer = (char*)s_malloc(BUFSZ);
  lines = byte = 0;
  while (lines < doc->ChunkStart.Pos)
   { /* search a buffer full */
     bytesRead = fread(buffer,(size_t)sizeof(char),BUFSZ,file); 
     for (i = 0; i < bytesRead && lines < doc->ChunkStart.Pos; i++, byte++)
      { if (buffer[i] == '\n' || buffer[i] == '\r')
	  /* \r should not happen because we are reading the file in text 
	     mode */
          lines++;
      }
     if (bytesRead == 0) /* cheasy handling files that don't end with nl */
       lines++;
   } 
  startByte = byte;
   
  beFriendly();
  
  /* find the end byte */ /* this could be done while getting the bytes XXX */
  /* search starting form the start pos */  
  if (fseek(file,startByte + start_character,SEEK_SET) != 0) 
   { 
     waislog(WLOG_HIGH, WLOG_WARNING, 
	     "retrieval can't seek to %ld in file <%s>",
	     startByte,filename);
     
     *errorCode = GDT_BadRange;
     if (tmpFileName) unlink( tmpFileName );
     s_free(local_id);
     return(NULL);
   }

  beFriendly();
  
  while (lines < doc->ChunkEnd.Pos) 
   { /* search a buffer full */
     bytesRead = fread(buffer,(size_t)sizeof(char),BUFSZ,file); 
     for (i = 0; i < bytesRead && lines < doc->ChunkEnd.Pos; i++, byte++)
      { if (buffer[i] == '\n' || buffer[i] == '\r')
	  /* \r should not happen, we are reading the file in text mode */
          lines++;
      }
     if (bytesRead == 0) /* cheasy handling of files that don't end with nl */
       lines++;
   } 
  endByte = byte;
   
  beFriendly();
  
  s_free(buffer);
     
  /* get the bytes */
  if (fseek(file,startByte + start_character,SEEK_SET) != 0)
   { 
     waislog(WLOG_HIGH, WLOG_WARNING, 
	     "retrieval can't seek to %ld in file <%s>",startByte,
	     filename);
     
     *errorCode = GDT_BadRange;
     if (tmpFileName) unlink( tmpFileName );
     s_free(local_id);
     return(NULL);
   }
   
  bytes = endByte - startByte; 
  buffer = (char*)s_malloc(bytes);
  
  bytesRead = fread((void*)buffer,(size_t)sizeof(char),bytes,file);
  
  if (bytesRead != bytes)
   { 
     waislog(WLOG_HIGH, WLOG_WARNING, 
	     "retrieval error in file <%s>",filename);
     
     *errorCode = GDT_BadRange;
     if (tmpFileName) unlink( tmpFileName );
     s_free(local_id);
     return(NULL);
   }
  
  bufAny = makeAny(bytesRead,buffer);
  
  text = makeWAISDocumentText(duplicateAny(doc->DocumentID),0L,bufAny);
  
  /* the any and the buffer are freed by freeWAISSearchResponse() */
  s_fclose(file);
  if (tmpFileName) unlink( tmpFileName );
  *errorCode = GDT_NoError;
  s_free(local_id);
  return(text);
}
/* These are the contents of the former NiCE NeXT User Group NeXTSTEP/OpenStep software archive, currently hosted by Netfuture.ch. */