This is sersrch.c in view mode; [Download] [Up]
/* WIDE AREA INFORMATION SERVER SOFTWARE No guarantees or restrictions. See the readme file for the full standard disclaimer. Brewster@think.com */ /* implements the search part of irext.h (search_word and finished_search_word) -brewster Split from irsearch.c 5/31/91 Added scale_scores. Fixed document_score_array to long. 7/8/91 Removed scale_scores, handled in search_word with doc_id > 0. 2/4/92 Made document_score_array a double. - Jonny G * $Log: sersrch.c,v $ * Revision 1.24 92/04/28 16:56:54 morris * added boolean to serial engine * * Revision 1.23 92/03/15 10:15:18 jonathan * Added Simon Spero's ASSIGN replacement for read_bytes. * * Revision 1.22 92/03/05 07:09:54 shen * add two more dummy arguments to call to init_search_engine * * Revision 1.21 92/02/12 17:29:52 jonathan * Conditionalized inclusion of object code. * * Revision 1.20 92/02/12 13:40:06 jonathan * Added "$Log" so RCS will put the log message in the header * */ #include "cdialect.h" #include "irfiles.h" #include "irsearch.h" #include "irext.h" #include "byte_order.h" #include <string.h> #ifdef BOOL #include "obj.h" #include "irparse.h" object* currentQuery = NULL; /* kludge until irext goes away */ #endif /* def BOOL */ /* weighting for relevant document terms - this may become a parameter to the query. */ #define RF_WEIGHTING 0.1 /* ================================== * === Initialization Functions === * ==================================*/ long init_search_engine(file, initialize, for_search, cm_mem_percent, text_size, grow_percent) char* file; boolean initialize; boolean for_search; long cm_mem_percent; /* unused */ long text_size; /* unused */ long grow_percent; /* unused */ { static boolean inited = false; if (inited == false) { #ifdef BOOL initObj(); initBool(); #endif inited = true; } return(0); } long finished_search_engine() { return(0); } /* * ext_open_database: see irext.h */ long ext_open_database (db, initialize, for_search) database *db; boolean initialize; boolean for_search; { /* this has to deal with the .inv file */ char file[MAX_FILE_NAME_LEN]; if(initialize) /* make a new one */ db->index_stream = s_fopen(index_filename(file, db), "w+b"); else if(for_search) /* just search */ db->index_stream = s_fopen(index_filename(file, db), "rb"); else /* write to an existing db */ db->index_stream = s_fopen(index_filename(file, db), "r+b"); if (db->index_stream == NULL) { waislog(WLOG_HIGH, WLOG_ERROR,"2can't open the inverted index file %s\n", file); disposeDatabase(db); return(1); } return(0); } /* * ext_close_database: see irext.h */ long ext_close_database (db) database *db; { return(0); } char *database_file(database_name) char *database_name; { return(database_name); } /*===========================* *=== Setting Paramters ===* *===========================*/ long max_hit_retrieved = 0; char **srcs = NULL; long set_query_parameter (mask, parameters) long mask; query_parameter_type * parameters; { switch (mask) { case SET_MAX_RETRIEVED_MASK: max_hit_retrieved = parameters->max_hit_retrieved; return(0); break; case SET_SELECT_SOURCE: if(NULL != srcs){ if(NULL != srcs[0]) s_free(srcs[0]); s_free(srcs); } srcs = parameters->srcs; break; default: return(-1); break; } return(0); } /*==============================* *=== Document Score Array ===* *==============================*/ double *document_score_array = NULL; long document_score_array_len = 0; /* make_document_score_array insures that the document_score_array array is long enough, if not it makes it long enough */ static void make_document_score_array _AP((long length)); static void make_document_score_array(length) long length; { if(length <= document_score_array_len) return; /* we have to make a new one. free the old one first (if any) */ if(document_score_array != 0){ s_free(document_score_array); } document_score_array = (double*)s_malloc( (size_t)(length * sizeof(double))); document_score_array_len = length; } static void destroy_document_score_array _AP((void)); static void destroy_document_score_array() { s_free(document_score_array); document_score_array_len = 0; } void clear_document_score_array() /* side effects the document_score_array. */ { memset(document_score_array, 0, document_score_array_len * sizeof(double)); } /* for debugging purposes */ void print_document_score_array(start,stop) unsigned long start; unsigned long stop; /* assumes start >= 0, stop < db->doc_table_allocated_entries */ { long i; for(i = start; i <= stop; i++){ printf("entry number %d: %f \n", i, document_score_array[i]); } } /*=========================* *=== Best Hits Array ===* *=========================*/ hit *best_hits_array = NULL; long best_hits_array_len = 0; long current_best_hit = 0; /* see irext.h for doc */ long init_best_hit (db) database *db; { #ifdef BOOL if (currentQuery != NULL) send(currentQuery,InitBestHit,db); #endif /* def BOOL */ return(0); } /* make_best_hits_array insures that the best_hits_array array is long enough, if not it makes it long enough */ static void make_best_hits_array _AP((long length)); static void make_best_hits_array(length) long length; { if(length <= best_hits_array_len) return; /* we have to make a new one. free the old one first (if any) */ if(best_hits_array != 0){ s_free(best_hits_array); } best_hits_array = (hit*)s_malloc((size_t)(length * sizeof(hit))); best_hits_array_len = length; } static void destroy_best_hits_array _AP((void)); static void destroy_best_hits_array() { s_free(best_hits_array); best_hits_array_len = 0; } void clear_best_hits_array() /* side effects the best_hits_array. XXX could use memset */ { memset((char*)best_hits_array, 0, best_hits_array_len * sizeof(hit)); } /* for debugging purposes */ void print_best_hits() { long i; for( i = 0; i < best_hits_array_len; i++){ if (best_hits_array[i].weight != 0) { printf("Best hit %ld: weight %ld, doc_id %ld, headline %s, filename %s, lines %ld\n", i, best_hits_array[i].weight, best_hits_array[i].document_id, best_hits_array[i].headline, best_hits_array[i].filename, best_hits_array[i].number_of_lines); } } } void sort_best_hits(db) database * db; { /* returns nothing. * side effects best_hits and document_score_array */ long i, doc; double worst_weight_to_make_it = 0.0; document_table_entry doc_entry; long best_hit_number = 0; /* snuff the scores */ for(i = 0; i < max_hit_retrieved; i++){ best_hits_array[i].weight = 0; } /* loop over the doc, and keep the doc_id and weight in best hit table */ for(doc = 0; doc < db->doc_table_allocated_entries; doc++){ double weight = document_score_array[doc]; if(worst_weight_to_make_it < weight){ /* merge it into the best_hits array. start at the bottom */ for(i = (max_hit_retrieved - 1); i >= 0; i--){ if(weight > best_hits_array[i].weight /* && (check_document_id(doc, db) == true) too slow.*/ ){ /* move this entry down */ if((i + 1) < max_hit_retrieved){ best_hits_array[i+1].weight = best_hits_array[i].weight; best_hits_array[i+1].document_id = best_hits_array[i].document_id; } best_hits_array[i].document_id = doc; best_hits_array[i].weight = weight; } else break; } } } for(i = 0; i < max_hit_retrieved; i++){ if(best_hits_array[i].weight <= 0) /* if we are out of good stuff, return */ return; /* fill in the rest of the hit */ if (read_document_table_entry(&doc_entry, best_hits_array[i].document_id, db) == true){ best_hits_array[best_hit_number].weight = best_hits_array[i].weight; best_hits_array[best_hit_number].document_id = best_hits_array[i].document_id; best_hits_array[best_hit_number].start_character = doc_entry.start_character; best_hits_array[best_hit_number].end_character = doc_entry.end_character; best_hits_array[best_hit_number].document_length = doc_entry.document_length; best_hits_array[best_hit_number].number_of_lines = doc_entry.number_of_lines; read_filename_table_entry(doc_entry.filename_id, best_hits_array[best_hit_number].filename, best_hits_array[best_hit_number].type, NULL, db), strncpy(best_hits_array[best_hit_number].headline, read_headline_table_entry(doc_entry.headline_id,db), MAX_FILE_NAME_LEN); best_hit_number++; } beFriendly(); } for(i = best_hit_number; i < max_hit_retrieved; i++){ best_hits_array[best_hit_number].weight = 0; } /* print_best_hits(s); for debugging */ } /* returns the next best hit */ long best_hit(db, doc_id, best_character, best_line, score) database *db; long *doc_id; long *best_character; long *best_line; long *score; { *best_character = 0; *best_line = 0; #ifdef BOOL if (currentQuery != NULL) /* for boolean */ { send(currentQuery,GetBestHit,db,doc_id,best_character,best_line,score); if (*doc_id > 0) return(0); /* ok */ else return(-1); /* no more docs */ } #endif /* BOOL */ if(current_best_hit > best_hits_array_len) return(1); if(best_hits_array[current_best_hit].weight == 0) return(1); *doc_id = best_hits_array[current_best_hit].document_id; *score = best_hits_array[current_best_hit].weight; current_best_hit++; return(0); } long finished_best_hit(db) database *db; { #ifdef BOOL if (currentQuery != NULL) /* for boolean */ { send(currentQuery,Delete); currentQuery = NULL; return(0); } #endif /* BOOL */ /* if we are on a small machine, we might want to destroy_document_score_array */ clear_document_score_array(); clear_best_hits_array(); current_best_hit = 0; return(0); } /*=============================* *=== Searching for words ===* *=============================*/ /* see irext.h for doc */ long init_search_word (db) database* db; { return(0); } /* see irext.h for doc */ long search_word(word,char_pos, line_pos, weight, doc_id, word_pair, db) char *word; /* the word to be searched for */ long char_pos; /* the position of the start of the word */ long line_pos; /* is this needed? not for signature system */ long weight; /* how important the word looks syntactically, such as is it bold */ long doc_id; /* current document, seed words is 0, then it increments into the relevant document */ long word_pair; database *db; { /* this side effects the document_score_array, * and downcases the word. * Returns 0 if successful or word not present, * returns non-0 if an error. * */ long not_full_flag = INDEX_BLOCK_FULL_FLAG; /*start out full so it will go on looking */ long count, index_block_size; long internal_document_id, internal_weight, number_of_valid_entries; long index_file_block_number; long number_of_occurances; FOUR_BYTE index_buffer_data[INDEX_ELEMENT_SIZE*(1024/4)]; char *index_buffer; char *i; FILE *stream = NULL; index_buffer = (char*)index_buffer_data; index_file_block_number = look_up_word_in_dictionary(word, &number_of_occurances, db); current_best_hit = 0; /* so that the best hits willstart from 0 */ /* check the document_score_array */ if(document_score_array_len < db->doc_table_allocated_entries) make_document_score_array(db->doc_table_allocated_entries); if(index_file_block_number >= 0){ stream = db->index_stream; while((not_full_flag != INDEX_BLOCK_NOT_FULL_FLAG) && (index_file_block_number != 0)){ /* read the index block */ if (0 != fseek(stream, (long)index_file_block_number, SEEK_SET)) { waislog(WLOG_HIGH, WLOG_ERROR, "fseek failed into the inverted file to position %ld", (long)index_file_block_number); return(-1); } read(fileno(stream),index_buffer,INDEX_BLOCK_HEADER_SIZE); ASSIGN(not_full_flag, INDEX_BLOCK_FLAG_SIZE, index_buffer, INDEX_BLOCK_HEADER_SIZE, 0 ); ASSIGN(index_file_block_number,NEXT_INDEX_BLOCK_SIZE, index_buffer+INDEX_BLOCK_FLAG_SIZE, INDEX_BLOCK_HEADER_SIZE, INDEX_BLOCK_FLAG_SIZE); ASSIGN(index_block_size,INDEX_BLOCK_SIZE_SIZE, index_buffer+INDEX_BLOCK_FLAG_SIZE+NEXT_INDEX_BLOCK_SIZE, INDEX_BLOCK_HEADER_SIZE, INDEX_BLOCK_FLAG_SIZE+NEXT_INDEX_BLOCK_SIZE); /* this is equivalent, but slower: not_full_flag = read_bytes(INDEX_BLOCK_FLAG_SIZE, stream); index_file_block_number = read_bytes(NEXT_INDEX_BLOCK_SIZE, stream); index_block_size = read_bytes(INDEX_BLOCK_SIZE_SIZE, stream); printf("flag = %d, block_num = %d, block_size = %d\n", not_full_flag, index_file_block_number, index_block_size); fflush(stdout); */ if(EOF == index_block_size) { waislog(WLOG_HIGH, WLOG_ERROR, "reading from the index file failed"); return(-1); } if(not_full_flag == INDEX_BLOCK_NOT_FULL_FLAG){ /* not full */ number_of_valid_entries = index_file_block_number; } else if(not_full_flag == INDEX_BLOCK_FULL_FLAG){ /* full */ number_of_valid_entries = index_block_size - INDEX_BLOCK_HEADER_SIZE; } else{ /* bad news, file is corrupted. */ waislog(WLOG_HIGH, WLOG_ERROR, "Expected the flag in the inverted file to be valid. it is %ld", not_full_flag); return(-1); } /* printf("number of valid bytes: %ld\n", number_of_valid_entries); */ /* add the array to the document_score_array */ number_of_valid_entries /= INDEX_ELEMENT_SIZE; for(count=0;count < number_of_valid_entries;count++) { int wgt; int did; if(count%1024 == 0) { read(fileno(stream),index_buffer,INDEX_ELEMENT_SIZE* MIN(1024,number_of_valid_entries-count)); i=index_buffer; } ASSIGN(wgt,WEIGHT_SIZE, i+DOCUMENT_ID_SIZE+WORD_POSITION_SIZE+CHARACTER_POSITION_SIZE, INDEX_ELEMENT_SIZE, DOCUMENT_ID_SIZE+WORD_POSITION_SIZE+CHARACTER_POSITION_SIZE); ASSIGN(did,DOCUMENT_ID_SIZE,i,INDEX_ELEMENT_SIZE,0); internal_weight = wgt; internal_document_id = did; /* printf("entry %ld, Doc_id: %ld, weight %ld \n", count, internal_document_id, internal_weight); fflush(stdout); */ if(EOF == internal_weight) { waislog(WLOG_HIGH, WLOG_ERROR, "reading from the doc-id table failed"); return(-1); } /* if(doc_id > 0) we are doing a relevant document */ document_score_array[internal_document_id] += (doc_id) ? (internal_weight * RF_WEIGHTING) : internal_weight; i+=INDEX_ELEMENT_SIZE; } } return(0); } else if(0 == index_file_block_number){ /* an error occurred on looking up the word */ return(-1); } else /* index_file_block_number is negative */ return(0); /* word not present */ } /* now collect the best hits */ long finished_search_word(db) database *db; { #ifdef BOOL if (currentQuery != NULL) return; /* do nothing for boolean */ #endif /* def BOOL */ /* check the document_score_array */ if(document_score_array_len < db->doc_table_allocated_entries) make_document_score_array(db->doc_table_allocated_entries); make_best_hits_array(max_hit_retrieved); sort_best_hits(db); return(0); }
These are the contents of the former NiCE NeXT User Group NeXTSTEP/OpenStep software archive, currently hosted by Netfuture.ch.