This is irretrvl.c in view mode; [Download] [Up]
/* WIDE AREA INFORMATION SERVER SOFTWARE: No guarantees or restrictions. See the readme file for the full standard disclaimer. */ #ifndef lint static char *RCSid = "$Header: /tmp_mnt/net/quake/proj/wais/wais-8-b5/ir/RCS/irretrvl.c,v 1.30 92/05/10 14:43:59 jonathan Exp $"; #endif /* Change log: * $Log: irretrvl.c,v $ * Revision 1.30 92/05/10 14:43:59 jonathan * * Made a little safer on NULL docid's when parsing. * * Revision 1.29 92/05/06 17:31:26 jonathan * modified #if's for NeXT and Mach. Added S_ISDIR definition for them both. * * Revision 1.28 92/05/04 17:19:54 jonathan * Added test for parsing docids (if null, log error). * * Revision 1.27 92/04/28 16:56:08 morris * added boolean to serial engine * * Revision 1.26 92/04/01 17:09:46 jonathan * Added index_directory to check_for_legitimate_file to test if filename is * under default directory (for FTP-like retrieval). * * * Revision 1.25 92/03/18 08:54:41 jonathan * Removed databaseName argument from getData and getDocumentText. The * database name is now culled from the docid. Removed special cases for INFO * and Quest db's, as they should no longer be needed. * * Revision 1.24 92/02/18 14:04:49 jonathan * in check_for_legitimate_file: added INFO to the list of special case * retrievals from MAC's. * * Revision 1.23 92/02/18 11:53:45 jonathan * conditionalized use of tempnam for NeXT (doesn't exist, use tmpnam * instead). May be a BSD thing. * * Revision 1.22 92/02/17 12:38:52 jonathan * special case catalog in check_for_legitimate_file. * * Revision 1.21 92/02/16 18:04:52 jonathan * Demoted more WLOG_ERROR's to WLOG_WARNING's * * Revision 1.20 92/02/15 19:40:30 jonathan * Improved reporting of retrieval errors. * * Revision 1.19 92/02/15 18:58:38 jonathan * Changed most (but not all) waislog errors to warnings on retrieval. * * Revision 1.18 92/02/14 16:06:20 jonathan * Fixed text in error message for invalid docid (not in DB) * * Revision 1.17 92/02/14 15:24:08 jonathan * Made parseDocID public. * * Revision 1.16 92/02/12 13:29:35 jonathan * Added "$Log" so RCS will put the log message in the header * */ /* retrieval part of the serial ir engine. if you are using a different storage system for the documents, replace this file. -brewster 10/91 added .Z file support from mlm@cs.brown.edu (Moises Lejter) to do: handle .Z files at a lower level. */ #include "irretrvl.h" #include "irfiles.h" /* for filename_table_ext */ #include <string.h> #include "futil.h" #include <ctype.h> /* for isspace */ #include "irext.h" #include "irdirent.h" #include <sys/stat.h> #ifdef Mach #include <sys/inode.h> #define S_ISDIR(f_mode) (f_mode & IFDIR) #endif /* Mach */ #if (defined(NeXT) && !(defined(S_ISDIR))) #define S_ISDIR(f_mode) ((f_mode) & S_IFDIR) #endif /*----------------------------------------------------------------------*/ boolean parseDocID(doc,filename,start_character,end_character,errorCode) DocObj* doc; char* filename; long* start_character; long* end_character; long* errorCode; { DocID* theDocID = NULL; char* local_id = NULL; char* token = NULL; long i; if((theDocID = docIDFromAny(doc->DocumentID)) == NULL) return false; local_id = anyToString(GetLocalID(theDocID)); freeDocID(theDocID); /* parse the doc id into start pos, end pos, and filename */ /* first the start char */ token = local_id; for (i = 0; local_id[i] != '\0' && isspace(local_id[i]) == false; i++) ; if (local_id[i] == '\0') { waislog(WLOG_HIGH, WLOG_WARNING, "Attempt to retrieve data for bad doc-id: '%s'",local_id); *errorCode = GDT_BadDocID; s_free(local_id); return(false); } local_id[i] = '\0'; sscanf(token,"%ld",start_character); /* now the second char */ token = local_id + i + 1; for (++i; local_id[i] != '\0' && isspace(local_id[i]) == false; i++) ; if (local_id[i] == '\0') { waislog(WLOG_HIGH, WLOG_WARNING, "Attempt to retrieve data for bad doc-id: '%s'",local_id); *errorCode = GDT_BadDocID; s_free(local_id); return(false); } local_id[i] = '\0'; sscanf(token,"%ld",end_character); /* and finally the file name */ strncpy(filename,local_id + i + 1,MAX_FILENAME_LEN); s_free(local_id); return(true); } /*----------------------------------------------------------------------*/ /* this checks to make sure that the filename is a file within the database */ static boolean check_for_legitimate_file _AP((char *filename, char* database_name, char* index_directory)); static boolean check_for_legitimate_file(filename, database_name, index_directory) char *filename; char *database_name; /* full pathname of the database */ char *index_directory; { struct stat sbuf; /* the help file and catalog file (the .src and .cat files) must be special cased because it is not in the filename table */ /* caching is done in filename_in_filename_file for repeated requests for the same file, so it does not need to be repeated here. */ if(NULL != strstr(filename, ".src")) /* let it pass */ return(true); if(NULL != strstr(filename, ".cat")) /* let it pass */ return(true); stat(filename, &sbuf); if(S_ISDIR(sbuf.st_mode)) { waislog(WLOG_HIGH, WLOG_WARNING, "File: '%s' is a directory, and cannot be retrieved.", filename); return(false); } else { /* name of the file of the filetable for this db (eg /bar/foo.fn). confusing, no? */ char filename_table_filename[MAX_FILE_NAME_LEN +1]; pathname_directory(database_name, filename_table_filename); strncat(filename_table_filename, "/", MAX_FILE_NAME_LEN); strncat(filename_table_filename, database_file(pathname_name(database_name)), MAX_FILE_NAME_LEN); s_strncat(filename_table_filename, filename_table_ext, MAX_FILE_NAME_LEN, MAX_FILE_NAME_LEN); if(!filename_in_filename_file(filename, NULL, NULL, filename_table_filename)){ /* we lose. this means either the db does not exist, or the file is not in that db. Log the bad news */ if(index_directory == NULL) return true; else if (substrcmp(filename, index_directory)) return true; waislog(WLOG_HIGH, WLOG_WARNING, "File: '%s' is not in DB '%s', and cannot be retrieved.", filename, filename_table_filename); return(false); } else{ /* everything is peachy */ return(true); } } } /*----------------------------------------------------------------------*/ WAISDocumentText* getData(doc, errorCode, index_directory) DocObj* doc; long* errorCode; char* index_directory; /* it isn't text, so we can just grab data */ { FILE* file = NULL; char fileName[MAX_FILENAME_LEN + 1]; char* dbname = NULL; WAISDocumentText* data = NULL; long start,end; /* position of the document in the file */ long startByte,endByte,bytes,bytesRead; /* part of the doc that we want */ char* buffer = NULL; any* bufAny = NULL; DocID *docid; #if (defined(NeXT) || defined(Mach)) char tmpFileName[MAX_FILENAME_LEN+1]; #else char *tmpFileName = NULL; #endif /* NeXT or Mach */ /* we can only handle byte chunks here */ if ((doc->ChunkCode == CT_byte) || (doc->ChunkCode == CT_document)) { if (parseDocID(doc,fileName,&start,&end,errorCode) == false) { waislog(WLOG_HIGH, WLOG_WARNING, "can't parse docid"); *errorCode = GDT_MissingDocID; return(NULL); } *errorCode = GDT_NoError; docid = docIDFromAny(doc->DocumentID); dbname = anyToString(GetDatabase(docid)); freeDocID(docid); if(true == check_for_legitimate_file(fileName, dbname, index_directory)){ file = s_fopen(fileName,"rb"); if (file == NULL){ if(probe_file_possibly_compressed(fileName)) { char buffer[ 2 * MAX_FILENAME_LEN + 10 ]; #if (defined(NeXT) || defined(Mach)) tmpnam(tmpFileName); #else tmpFileName = tempnam( "/tmp/", 0 ); #endif /* NeXT or Mach */ sprintf( buffer, "zcat %s.Z > %s", fileName, tmpFileName ); system( buffer ); file = s_fopen(tmpFileName,"rb"); } } } if (file == NULL) { waislog(WLOG_HIGH, WLOG_WARNING, "Attempt to retrieve data for missing doc-id: '%s'", fileName); *errorCode = GDT_MissingDocID; s_free(dbname); return(NULL); } if (doc->ChunkCode == CT_byte) { startByte = doc->ChunkStart.Pos + start; endByte = doc->ChunkEnd.Pos + start; } else { startByte = start; endByte = end; } waislog(WLOG_LOW, WLOG_RETRIEVE, "Retrieving DocID: %d %d %s, byte: %d %d, from database %s", start, end, fileName, startByte, endByte, dbname); s_free(dbname); if (endByte > end && end != 0) { waislog(WLOG_HIGH, WLOG_WARNING, "retrieval beyond bounds of document %ld in file <%s>", endByte,fileName); *errorCode = GDT_BadRange; endByte = end; } /* get the bytes */ if (fseek(file,startByte,SEEK_SET) != 0) { waislog(WLOG_HIGH, WLOG_WARNING, "retrieval can't seek to %ld in file <%s>",startByte, fileName); *errorCode = GDT_BadRange; if (tmpFileName) unlink( tmpFileName ); if (tmpFileName) unlink( tmpFileName ); if (tmpFileName) unlink( tmpFileName ); return(NULL); } bytes = endByte - startByte; buffer = (char*)s_malloc(bytes); bytesRead = fread((void*)buffer,(size_t)sizeof(char),bytes,file); if (bytesRead != bytes) { waislog(WLOG_HIGH, WLOG_WARNING, "retrieval error in file <%s>",fileName); *errorCode = GDT_BadRange; if (bytesRead == 0) return(NULL); } bufAny = makeAny(bytesRead,buffer); data = makeWAISDocumentText(duplicateAny(doc->DocumentID),0L,bufAny); /* the any and the buffer are freed by freeWAISSearchResponse() */ s_fclose(file); if (tmpFileName) unlink( tmpFileName ); return(data); } else { waislog(WLOG_HIGH, WLOG_WARNING, "search engine can only use whole documents or byte offsets for data lookup"); *errorCode = GDT_UnsupportedChunkType; return(NULL); } } /*----------------------------------------------------------------------*/ #define BUFSZ (size_t)5000 WAISDocumentText* getDocumentText(doc, errorCode, index_directory) DocObj* doc; long* errorCode; char* index_directory; /* find the text for doc, get the sub part if any, finally construct and return a WAISDocumentText. If it can not find the document (or some other error) it returns NULL and sets errorCode. */ { WAISDocumentText* text = NULL; FILE* file = NULL; char* dbname = NULL; char* buffer = NULL; any* bufAny = NULL; char filename[MAX_FILENAME_LEN + 1]; long start_character; long end_character; register long i; long bytes,bytesRead; long startByte,endByte,byte,lines; #if (defined(NeXT) || defined(Mach)) char tmpFileName[MAX_FILENAME_LEN+1]; #else char *tmpFileName = NULL; #endif /* NeXT or Mach */ DocID* theDocID = NULL; char* local_id = NULL; *errorCode = GDT_NoError; /* we can only handle line chunks for now */ if (doc->ChunkCode != CT_line) { waislog(WLOG_HIGH, WLOG_WARNING, "search engine can only use line offsets for now."); *errorCode = GDT_UnsupportedChunkType; return(NULL); } theDocID = docIDFromAny(doc->DocumentID); dbname = anyToString(GetDatabase(theDocID)); local_id = anyToString(GetLocalID(theDocID)); freeDocID(theDocID); if (parseDocID(doc,filename,&start_character,&end_character,errorCode) == false) { waislog(WLOG_HIGH, WLOG_ERROR, "Can't parse doc-id: '%s'", local_id); *errorCode = GDT_MissingDocID; s_free(dbname); s_free(local_id); return(NULL); } waislog(WLOG_LOW, WLOG_RETRIEVE, "Retrieving DocID: '%s', line range: %d %d, from database %s", local_id, doc->ChunkStart.Pos, doc->ChunkEnd.Pos, dbname); /* check the database */ if(NULL == dbname){ waislog(WLOG_HIGH, WLOG_WARNING, "Missing database for doc-id: '%s'", local_id); *errorCode = GDT_MissingDatabase; s_free(local_id); return(NULL); } if(check_for_legitimate_file(filename, dbname, index_directory) == false){ waislog(WLOG_HIGH, WLOG_WARNING, "doc-id: '%s' not in database '%s'", local_id,dbname); *errorCode = GDT_MissingDocID; s_free(dbname); s_free(local_id); return(NULL); } s_free(dbname); file = s_fopen(filename,"r"); if (file == NULL) if(probe_file_possibly_compressed(filename)) { char buffer[ 2 * MAX_FILENAME_LEN + 10 ]; #if (defined(NeXT) || defined(Mach)) tmpnam(tmpFileName); #else tmpFileName = tempnam( "/tmp/", 0 ); #endif /* NeXT or Mach */ sprintf( buffer, "zcat %s.Z > %s", filename, tmpFileName ); system( buffer ); file = s_fopen(tmpFileName,"r"); } if (file == NULL) { waislog(WLOG_HIGH, WLOG_WARNING, "Attempt to retrieve text for bad doc-id: '%s'", local_id); *errorCode = GDT_MissingDocID; s_free(local_id); return(NULL); } if(0 != fseek(file, start_character, SEEK_SET)) { waislog(WLOG_HIGH, WLOG_WARNING, " error on attempt to seek into file for doc-id: '%s'", local_id); s_free(local_id); *errorCode = GDT_BadRange; return(NULL); } /* find the start byte */ buffer = (char*)s_malloc(BUFSZ); lines = byte = 0; while (lines < doc->ChunkStart.Pos) { /* search a buffer full */ bytesRead = fread(buffer,(size_t)sizeof(char),BUFSZ,file); for (i = 0; i < bytesRead && lines < doc->ChunkStart.Pos; i++, byte++) { if (buffer[i] == '\n' || buffer[i] == '\r') /* \r should not happen because we are reading the file in text mode */ lines++; } if (bytesRead == 0) /* cheasy handling files that don't end with nl */ lines++; } startByte = byte; beFriendly(); /* find the end byte */ /* this could be done while getting the bytes XXX */ /* search starting form the start pos */ if (fseek(file,startByte + start_character,SEEK_SET) != 0) { waislog(WLOG_HIGH, WLOG_WARNING, "retrieval can't seek to %ld in file <%s>", startByte,filename); *errorCode = GDT_BadRange; if (tmpFileName) unlink( tmpFileName ); s_free(local_id); return(NULL); } beFriendly(); while (lines < doc->ChunkEnd.Pos) { /* search a buffer full */ bytesRead = fread(buffer,(size_t)sizeof(char),BUFSZ,file); for (i = 0; i < bytesRead && lines < doc->ChunkEnd.Pos; i++, byte++) { if (buffer[i] == '\n' || buffer[i] == '\r') /* \r should not happen, we are reading the file in text mode */ lines++; } if (bytesRead == 0) /* cheasy handling of files that don't end with nl */ lines++; } endByte = byte; beFriendly(); s_free(buffer); /* get the bytes */ if (fseek(file,startByte + start_character,SEEK_SET) != 0) { waislog(WLOG_HIGH, WLOG_WARNING, "retrieval can't seek to %ld in file <%s>",startByte, filename); *errorCode = GDT_BadRange; if (tmpFileName) unlink( tmpFileName ); s_free(local_id); return(NULL); } bytes = endByte - startByte; buffer = (char*)s_malloc(bytes); bytesRead = fread((void*)buffer,(size_t)sizeof(char),bytes,file); if (bytesRead != bytes) { waislog(WLOG_HIGH, WLOG_WARNING, "retrieval error in file <%s>",filename); *errorCode = GDT_BadRange; if (tmpFileName) unlink( tmpFileName ); s_free(local_id); return(NULL); } bufAny = makeAny(bytesRead,buffer); text = makeWAISDocumentText(duplicateAny(doc->DocumentID),0L,bufAny); /* the any and the buffer are freed by freeWAISSearchResponse() */ s_fclose(file); if (tmpFileName) unlink( tmpFileName ); *errorCode = GDT_NoError; s_free(local_id); return(text); }
These are the contents of the former NiCE NeXT User Group NeXTSTEP/OpenStep software archive, currently hosted by Netfuture.ch.