ftp.nice.ch/pub/next/connectivity/infosystems/WAIStation.1.9.6.N.b.tar.gz#/WAIS/ir/irext.h

This is irext.h in view mode; [Download] [Up]

/* WIDE AREA INFORMATION SERVER SOFTWARE:
   No guarantees or restrictions.  See the readme file for the full standard
   disclaimer. */

/* Include file for the irhash.c file.
   Implements the building functions in irext.h */

#ifndef IREXT_H
#define IREXT_H


/* An interface for adding new server types into the WAIS system.
 * The idea is to use the parsing and bookkeeping operations of the serial 
 * indexer, while allowing different inverted file and signature systems 
 * to be added as back ends.
 *
 * - Tracy Shen and Brewster 3/91
 */

/*
 * $Log:	irext.h,v $
 * Revision 1.24  92/03/20  11:02:16  jonathan
 * Added word_position boolean to add_word.  This is a new switch to allow an
 * indexer to ignore the word position info (if it cares), based on
 * indexer parameters.
 * 
 * Revision 1.23  92/03/05  07:06:12  shen
 * update init_search_engine prototype to add two more parameters
 * For seeker-ram, the twonew parameters are: grow percent and textsixe
 * 
 * Revision 1.22  92/02/29  20:11:57  jonathan
 * Conditionalized definition of DF_INDEPENDENT, etc.
 * 
 * Revision 1.21  92/02/13  12:24:25  jonathan
 * conditionalized inclusion of irparse.h on BOOL.
 * 
 *
 * Tracy changes:
 *  - in function "add_word", add two more parameters, source and date
 *  - add a new function "set_query_parameter"
 * proposed changes by brewster:
 *  replace date_type with time_t: accepted
 *  take out all "unsigned" type modifiers (tracey will concider this)
 *  replace short with long: accepted
 *  replace int with long (we port to 16 bit machines still): accepted
 *  added source to delete_doc_id parameters: accepted
 * Proposed changes by brewster and tracy:
 *  if routines are successful return 0, otherwise an error code: accepted
 * Proposed changes by harry:
 *  make the dictionary value be any size.  This can be done by 
 *   passing in a size arg or by passing in read and write routines.
 *  have a function that says we will not be calling best_hit anymore.
 * proposed changes by brewster:
 *  took out hash_pos from add word.
 *  change source to a database* db.
 *  added finished_best_hit, finished_search_word(db), finished_add_word
 * NOT ACCEPTED proposed by tracy (9/91)
 *  create a function init_search_word and pass an array of db's
 *  modify search_word dont pass db.
 *  modify finish_search_word dont pass db.
 *  create init_best_hit pass dbs
 *  modify best_hit to pass db (to be modified)
 *  modify a hit structure to contain a db
 *  create se_init, se_exit (tracy will send these in)
 *  create se_open_database, se_close_database (tracy will send these in)
 * ACCEPTED proposed by tracy (9/91)  
 *  create a function init_search_word and pass db
 *  add total word count to db (not set in server side)
 *  create init_best_hit 
 *  change dbs slot query_parameter_type from being a "database** dbs"
 *    to "char** srcs" (not done on server side yet)
 *  modify best_hit to have argument doc_id, best_character, score
 *  create ext_open_database, ext_close_database
 *  modify init_add_word to add char* src. (not done in server side)
 * proposed changes by brewster
 *  modify best_hit to take both best_character and best_line
 * IMPLEMENTED proposed by brewster
 *  specify that search_word take a downcased word
 *  create function char *database_file(char *database_name)
 *	that will return the name of the file that the database is stored in.
 *	on Seeker and Beta this would return "INDEX" always, 
 * 	and serial server this would returns its argument. 
 * IMPLEMENTED proposed by brewster
 *   the srcs list in set_query_parameter will be what the user
 *   passed in the database fields of the Z39.50 request.
 *   This means that if multiple src's are specified separate by comma's
 *   then it is up to the backend to parse those out.
 * IMPLEMENTED proposed by brewster
 *   the src field in init_add_word will always be NULL and
 *   the information will be passed via set_query_parameter.
 * APPROVED proposed by brewster
 *   add init_search_engine and finished_search_engine
 *     this would be called when the server process starts and exits.
 *     these functions could check to make sure everything is sane.
 *     same arguments in ext_open_database
 * proposed by brewster 
 *   change scores to doubles rather than longs.  maybe weights too.
 * IMPLEMENTED proposed by tracy
 *   pass another argument to search_word and add_word:
 *	long word_pair.
 * proposed by tracy: APPROVED
 *   take out src arg from init_add_word
 *   change arg name in search_word from doc_id to relevant_doc_number
 *   take out dictionary_value from search_word
 *   New arguments to ext_open_database:
 *     initialize (same)
 *     for_search  (if true searches can happen, otherwise can not)
 *     for_update  (if true updates can happen, otherwise can not)
 * proposed change by tracy
 *   add 2 more long arguments to init_search_engine and ext_open_database
 *     (for seeker, the first argument should be the percentage of 
 *      CM memory for signatures.  For open_database put in the total 
 *       raw text size).
 *     
 */

#include "cdialect.h"
#include "irfiles.h" /* for database */

#ifdef BOOL
#include "irparse.h" /* for boolean searches */
#endif

#ifdef __cplusplus
/* declare these as C style functions */
extern "C"
	{
#endif /* def __cplusplus */

/* ============================
 * ===  Control Functions  ===
 * ============================*/

/*
 * SE_init - Search Engine initialization function
 *
 * Parameter description:
 *   if_update - whether updates are to be performed in this run; the
 *               value is True (1L) or False (0L).
 *   if_query  - whether queries are to be performed in this run; the
 *               value is True (1L) or False (0L).
 *
 * Functional description:
 *   This function should be the first call the FE (front end)
 *   makes to the BE (back end) SE (search engine).
 *   It gives the SE a chance to initialize its global variables
 *   to best serve the FE's requests efficiently.
 *   It only needs to be called once each run. Subsequent calls
 *   will be ignored.
 *   For a batch update (e.g., a waisindex run), if_update should be
 *   set to True and if_query to False.
 *   For serving queries (e.g., a waisserver run), if_query will be True.
 *   To allow on-line update while serving queries, if_update should be
 *   set to True, otherwise False.
 *   The waisserver has to be able to take on-line update requests and
 *   update the search engine's database.
 *
 * NOTE(review): the meaning of the long return value is not stated
 *   here; presumably 0 if successful and an error code otherwise, as
 *   agreed for the routines in this header -- confirm.
 */

long SE_init _AP(( long if_update, long if_query));

/*
 * SE_exit - Search Engine exit function
 *
 * Parameter description:
 *   None
 *
 * Functional description:
 *   This function should be the last call the FE (front end)
 *   makes to the SE. It gives the SE a chance to flush data kept in
 *   buffers, clean up temporary files, and free up resources.
 *
 * NOTE(review): the meaning of the long return value is not stated
 *   here; presumably 0 if successful and an error code otherwise --
 *   confirm.
 */

long SE_exit _AP(( void));



long SE_open_database _AP (( database *db,
                       long if_initialize,
                       long if_update,
                       long if_query,
                       long *parameter1,
                       long *parameter2));
/*
 * SE_open_database - open a database
 *
 * Parameter description:
 *   db - pointer to a database structure. The structure should
 *               contain a field "SE_private_tag" of type void *.
 *               The SE will fill in this field when the
 *               database is opened. This is the search engine's pointer
 *               to its data structure for the database.
 *   if_initialize - whether to initialize this database. If the value is
 *               True, the database will be set empty. If
 *               one already exists, it will be purged
 *               or marked old according to the system maintenance
 *               policy employed.
 *   if_update - whether updates are to be performed on this database;
 *               the value is True (1L) or False (0L).
 *   if_query  - NOTE(review): not described in the original comment;
 *               presumably whether queries are to be performed on this
 *               database, True (1L) or False (0L), by analogy with
 *               SE_init -- confirm.
 *   parameter1 and parameter2 - these are additional info the
 *               SE needs from the FE. They are pointers. If
 *               a SE does not need extra info, the FE will just pass
 *               NULL.
 *               For the new seeker, parameter1 is the database's
 *               maximum size as a percentage of the full-system load.
 *               CM memory is a limited resource to be shared by
 *               multiple databases, and dynamic re-allocation of it
 *               is not efficient.
 *               Seeker requires the FE to tell it the maximum size the
 *               database can grow to, so that it can pre-allocate the
 *               right number of processors to the database; it will
 *               wrap around when it reaches the limit to squeeze out
 *               old data.
 *               The size is specified as the
 *               percentage of the full-load CM signature pool.
 *               For example, an 8K CM with small memory can
 *               hold up to 200 megabytes of raw text.
 *               NOTE(review): the original comment breaks off here in
 *               mid-sentence ("If parameter"); the rest of the
 *               description, including parameter2, is missing.
 *
 * NOTE(review): the meaning of the long return value is not stated
 *   here; presumably 0 if successful and an error code otherwise --
 *   confirm.
 */

/*
 * SE_close_database - close a database
 *
 * NOTE(review): this routine is not described in this header.
 *   Presumably it is the counterpart of SE_open_database for the same
 *   db, and returns 0 if successful and an error code otherwise --
 *   confirm against the implementation.
 */
long SE_close_database _AP (( database *db));

/*
 * SE_checkpointing - checkpoint a database
 *
 * NOTE(review): this routine is not described in this header; what
 *   state is saved for db, and the meaning of the long return value
 *   (presumably 0 if successful, an error code otherwise), must be
 *   confirmed against the implementation.
 */
long SE_checkpointing _AP(( database *db));




/* init_search_engine: this is called when the server or indexer is
 *   started up, before any other operations are run.
 *
 * If this is a server starting, then file is the directory of the index.
 * If this is an indexer starting, then file is the index file.
 * NOTE - This routine may be called more than once
 *
 * The remaining parameters are not described at this prototype.
 * NOTE(review): going by their names and by the change notes at the
 *   top of this file, initialize and for_search presumably have the
 *   same meaning as in ext_open_database; cm_mem_percent is presumably
 *   the percentage of CM memory for signatures (seeker); rawtext_size
 *   the total raw text size; and grow_percent the seeker-ram grow
 *   percent -- confirm against the implementation.
 *
 *   return values: 0 if successful, non-0 if error
 *       defined error conditions:
 *
 *          -1 insufficient resources
 */

long init_search_engine _AP((char* file, 
			     boolean initialize, 
			     boolean for_search,
			     long cm_mem_percent,
			     long rawtext_size,
			     long grow_percent));


/* finished_search_engine: this is called when the server is shut down.
 *
 *   return values: 0 if successful, non-0 if error
 *       defined error conditions: (none defined yet)
 */

long finished_search_engine _AP((void));


/*
 *  ext_open_database: This function will be called on a database before
 *  	any operations are done on it.  It might be called multiple times 
 *	with the same database before a close is done.
 *	db: the database to be opened.
 *	initialize: means that the database should be cleared of all state 
 *	  	since it will be rebuilt from scratch.
 *	for_search: means that the database will only be used for searching.
 *		If this is false, then it can be searched and added to.
 *   return values: 0 if successful, non-0 if error
 *       defined error conditions: (none defined yet)
 *
 */

long ext_open_database _AP((database *db, 
			    boolean initialize, 
			    boolean for_search));

/*
 *  ext_close_database: This function will be called after all operations 
 *	on this database are done.
 *	db: the database to be closed.
 *   return values: 0 if successful, non-0 if error
 *       defined error conditions: (none defined yet)
 *
 */

long ext_close_database _AP((database *db));

/*
 *  database_file: returns the name of the file that the database named
 *	database_name is stored in.  Per the change notes at the top of
 *	this file, on Seeker and Beta this always returns "INDEX", and on
 *	the serial server it returns its argument.
 *	NOTE(review): ownership and lifetime of the returned string are
 *	not specified here -- confirm before freeing or modifying it.
 */
char *database_file _AP((char *database_name));

/* ============================
 * ===  Building Functions  ===
 * ============================*/

/* init_add_word, add_word..., finished_add_word
 *  is the sequence for creating an update.  When a finished_add_word is done,
 *  then it is safe (and expected) that the builder will flush things to files.
 *  set_query_parameter can be called at any time between documents during an add.
 */

/*
 *  init_add_word: called before any calls to add_word.  finished_add_word
 *   	will be called before another init_add_word is called.
 *	db is the database that will be added to.
 * 	parameter1 and parameter2 are implementation specific.
 *   return values: 0 if successful, non-0 if error
 *       defined error conditions: (none defined yet)
 * 
 */
long init_add_word _AP ((database *db, 
			 long parameter1, long parameter2));

/*
 *  add_word: add this word to the database.
 *	Called between init_add_word and finished_add_word.
 *	word_position is a switch that allows an indexer to ignore the
 *	word position info (if it cares), based on indexer parameters.
 *   return values: 0 if successful, non-0 if error
 *       defined error conditions: (none defined yet)
 *
 */
long add_word _AP((
		   char *word,	/* the word to be indexed; this could be a
				   word pair. If NULL there are no more words
				   to be indexed */
		   long char_pos, /* the position of the start of the
				     word */
		   long line_pos, /* this is passed for the best
				     section calculation */
		   long weight,	/* how important the word looks
				   syntactically (such as whether it is bold);
				   NOT used by signature system */
		   long doc_id,	/* current document, this will never be 0 */
		   time_t date, /* display day of this document, 0 if not known */
		   long word_pair, /* 1 if word is a word pair, 0 if not */
		   database* db, /* database to insert the document */
		   boolean word_position /* whether the position is valid or not */
		   ));

/*
 *  finished_add_word: states that there are no more words to add
 *   to this database.  Called after the add_word calls that follow an
 *   init_add_word, and before another init_add_word is called.
 *
 *   return values: 0 if successful, non-0 if error
 *       defined error conditions: (none defined yet)
 *
 */

long finished_add_word _AP((database *db));

/* ===============================
 * ===  Maintenance Functions  ===
 * ===============================*/

/*
 *  delete_doc_id : delete the document doc_id from the database db
 *   return values:  0, successful
 *                  -1, document not found
 *
 */
long delete_doc_id _AP((long doc_id, database *db));

/* =============================
 * ===  Searching Functions  ===
 * =============================*/


/*
 *  set_query_parameter : set query parameter
 *      Sets search attributes such as date factor, document source ids,
 *      and maximum number of documents returned in a search (the last
 *      one is an important performance factor for a signature type
 *      system).
 *      The search attributes apply to all coming queries until
 *      they are re-set by the next set_query_parameter call.
 *
 *   return values:  0 if successful, non-0 if error (see the
 *      set_query_parameter prototype; the original text here said
 *      "none", which does not match the long return type)
 *
 *  The SET_* values below are the bits of the mask argument of
 *  set_query_parameter.
 *  NOTE(review): judging by the names, they presumably select
 *  max_hit_retrieved, date_factor, and the source selection
 *  (num_db/srcs) respectively -- confirm against the implementation.
 */
#define SET_MAX_RETRIEVED_MASK 1
#define SET_DATE_FACTOR_MASK   2
#define SET_SELECT_SOURCE      4

/* enum literals for date_factor (the date_factor field of
 * query_parameter_type).  The definitions are conditionalized so that
 * they are skipped when DF_INDEPENDENT has already been defined.
 * NOTE(review): the precise effect of DF_LATER and DF_EARLIER on
 * scoring is not described in this header -- confirm against the
 * implementation.
 */
#ifndef DF_INDEPENDENT
#define DF_INDEPENDENT          1
#define DF_LATER                2
#define DF_EARLIER              3
#endif

/*
 * query_parameter_type: the search attributes passed to
 * set_query_parameter.  A field is only interpreted when the
 * corresponding mask bit is set in the mask argument of that call.
 */
typedef struct {
  long max_hit_retrieved;
  /* maximum number of hits that can be returned by
   * the search engine. For a signature
   * type system, the default value is 20
   */
  long date_factor;		/* one of the DF_* values; default is DF_INDEPENDENT */
  long num_db;			/* a value of zero indicates select all;
				 * the default is selecting all
				 */
  char **srcs;			/* sources to be searched.  Per the change
				 * notes at the top of this file, these are
				 * what the user passed in the database
				 * fields of the Z39.50 request; if multiple
				 * sources are given separated by commas, it
				 * is up to the back end to parse them out.
				 * NOTE(review): presumably num_db is the
				 * number of entries in srcs -- confirm.
				 */
}  query_parameter_type;

/*
 *  set_query_parameter: set a mode variable for the search engine
 *	mask: bitwise combination of the SET_* mask values, selecting
 *		which fields of parameters are to be interpreted.
 *	parameters: the query parameter structure holding the new values.
 *   return values: 0 if successful, non-0 if error
 *       defined error conditions: (none defined yet)
 *
 */

long set_query_parameter _AP ((
			 long mask,
			 query_parameter_type *parameters
			 /* fields in the query parameter structure are only
			  * interpreted when the corresponding mask bit 
			  * is set in the mask argument.
			  */
			 ));



/*
 *  init_search_word: called before any search_word is called in this query.
 * 	The only operation that occurs after this is search_word.
 *	db is the database to be searched.
 *   return values: 0 if successful, non-0 if error
 *       defined error conditions: (none defined yet)
 *
 */

long init_search_word _AP((database* db));

/*
 *  search_word: searches for a word in the index.  It side effects 
 *               internal state so that best_hit will return the correct 
 *               results.
 *               Per the change notes at the top of this file, the word
 *               passed in is a downcased word.
 *   return values: 0 if successful, non-0 if error
 *       defined error conditions: (none defined yet)
 *
 */

long search_word 
  _AP ((char *word, /* the word to be searched for */
	long char_pos,		/* the position of the start of the word */
	long line_pos,		/* is this needed? not for signature system */
	long weight,		/* how important the word looks syntactically,
				   such as whether it is bold */
	long relevant_doc_number,/* current document; for seed words it is 0,
				   then it increments into the relevant 
				   document */
	long word_pair, /* 1 if word is a word pair, 0 if not */
	database *db
	));


/*
 *  finished_search_word: states that there are no more words that will
 *   be searched for in db before best_hit will be called.
 *
 *   return values: 0 if successful, non-0 if error
 *       defined error conditions: (none defined yet)
 *
 */

long finished_search_word _AP((database *db));



/*
 *  init_best_hit: called before any best_hit is called in this query.
 * 	The only operation that occurs after this is best_hit.
 *   return values: 0 if successful, non-0 if error
 *       defined error conditions: (none defined yet)
 *
 */

long init_best_hit _AP((database *db));

/*
 *  best_hit : called after init_best_hit to retrieve the results of the
 *   preceding search_word calls on db.
 *   NOTE(review): doc_id, best_character, best_line and score are
 *   pointers, presumably filled in by the search engine with the next
 *   hit's document id, best character position, best line and score --
 *   confirm against the implementation.
 *
 *   return values:  0, successful
 *                  -1, no more documents to return
 *		    Other values may be returned to signal conditions
 *		    defined in the future.
 *
 */
long best_hit _AP ((database* db,long *doc_id, long *best_character, 
		    long *best_line, long *score));

/*
 *  finished_best_hit: states that no more best_hit calls will be made
 *   before the next set of search_words or add_words.
 *
 *   return values: 0 if successful, non-0 if error
 *       defined error conditions: (none defined yet)
 *
 */

long finished_best_hit _AP((database* db));

#ifdef __cplusplus
	}
#endif /* def __cplusplus */

#endif /* ndef IREXT_H */

These are the contents of the former NiCE NeXT User Group NeXTSTEP/OpenStep software archive, currently hosted by Netfuture.ch.