This is search.c in view mode; [Download] [Up]
/*
** Copyright (C) 1995, Enterprise Integration Technologies Corp.
** All Rights Reserved.
** Kevin Hughes, kevinh@eit.com
** 3/11/94
*/
#include "swish.h"
#include "search.h"
/* The main search function.
** Parentheses are stripped out, things made lowercase,
** extra blanks removed, etc.
*/
void search(words, indexlist, structure)
char *words;
struct swline *indexlist;
int structure;
{
int i, j;
float num;
char word[MAXWORDLEN];
struct result *resultlist;
struct sortresult *sortresultlist;
struct swline *tmplist;
FILE *fp;
searchwordlist = NULL;
for (i = j = 0; words[i] != '\0' && words[i] != '\n'; i++) {
if (isspace(words[i]) || words[i] == '(' ||
words[i] == ')') {
if (j) {
word[j] = '\0';
searchwordlist = (struct swline *)
addswline(searchwordlist, (char *)
convertentities(word));
j = 0;
}
if (words[i] == '(') {
searchwordlist = (struct swline *)
addswline(searchwordlist, "(");
}
if (words[i] == ')') {
searchwordlist = (struct swline *)
addswline(searchwordlist, ")");
}
}
else
word[j++] = tolower(words[i]);
}
if (j) {
word[j] = '\0';
searchwordlist = (struct swline *)
addswline(searchwordlist, (char *) convertentities(word));
}
printf("%s\n", INDEXHEADER);
if (words[0] == '\0') {
printf("err: no search words specified\n.\n");
exit(0);
}
printf("search words:");
tmplist = searchwordlist;
while (tmplist != NULL) {
printf(" %s", tmplist->line);
tmplist = tmplist->next;
}
putchar('\n');
while (indexlist != NULL) {
commonerror = bigrank = 0;
if ((fp = fopen(indexlist->line, "r")) == NULL) {
printf("# Name: unknown index\n");
printf("err: could not open index file\n.\n");
exit(0);
}
if (!isokindexheader(fp)) {
printf("err: the index file format is unknown\n.\n");
exit(0);
}
getheader(fp);
if (!getindexfilenum(fp)) {
printf("err: the index file is empty\n.\n");
exit(0);
}
readoffsets(fp);
readstopwords(fp);
readfileoffsets(fp);
resultlist = NULL;
tmplist = searchwordlist;
tmplist = (struct swline *) fixnot(tmplist, fp);
searchwordlist = (struct swline *) expandstar(tmplist, fp);
resultlist = (struct result *) parseterm(fp, 0);
sortresultlist = NULL;
while (resultlist != NULL) {
if (resultlist->structure & structure)
sortresultlist = (struct sortresult *)
addsortresult(sortresultlist, resultlist->rank,
lookupfile(resultlist->filenum, fp));
resultlist = resultlist->next;
}
fclose(fp);
if (sortresultlist == NULL) {
if (commonerror)
printf("err: a word is too common\n");
else
printf("err: no results\n");
}
else {
if (bigrank)
num = 1000.0 / (float) bigrank;
else
num = 1000;
printsortedresults(sortresultlist, num);
}
searchwordlist = tmplist;
indexlist = indexlist->next;
}
printf(".\n");
}
/* This puts parentheses in the right places around not structures
** so the parser can do its thing correctly.
*/
struct swline *fixnot(sp)
struct swline *sp;
{
int openparen, hasnot;
char lastline[MAXWORDLEN];
struct swline *tmpp, *newp;
#ifdef DEBUG
struct swline *newp2;
#endif
tmpp = sp;
newp = NULL;
openparen = 0;
hasnot = 0;
lastline[0] = '\0';
while (tmpp != NULL) {
if ((tmpp->line)[0] == '(')
openparen++;
if ((tmpp->line)[0] == ')')
openparen--;
if (!strcmp(tmpp->line, "not") && lastline[0] != '\0' &&
lastline[0] != '(') {
hasnot = 1;
newp = (struct swline *) addswline(newp, "(");
}
else if (hasnot && !openparen) {
hasnot = 0;
newp = (struct swline *) addswline(newp, tmpp->line);
newp = (struct swline *) addswline(newp, ")");
strcpy(lastline, tmpp->line);
tmpp = tmpp->next;
continue;
}
newp = (struct swline *) addswline(newp, tmpp->line);
strcpy(lastline, tmpp->line);
tmpp = tmpp->next;
}
#ifdef DEBUG
newp2 = newp;
while (newp2 != NULL) {
printf("%s ", newp2->line);
newp2 = newp2->next;
}
putchar('\n');
#endif
return newp;
}
/* Expands words with asterisks as wildcards into a series of
** "or" searches. Terms like "quick\*" are expanded into
** "quicktime or quickly", etc.
*/
struct swline *expandstar(sp, fp)
struct swline *sp;
FILE *fp;
{
int i, firsttime, gotstar;
char foundword[MAXWORDLEN], searchword[MAXWORDLEN];
struct swline *newp;
newp = NULL;
while (sp != NULL) {
strcpy(searchword, sp->line);
if (searchword[0] != '*' && strchr(searchword, '*')) {
for (i = gotstar = 0; searchword[i]; i++)
if (gotstar)
searchword[i] = '\0';
else if (searchword[i] == '*') {
searchword[i] = '\0';
gotstar = 1;
}
firsttime = 0;
do {
strcpy(foundword, getmatchword(searchword,
fp, firsttime));
if (strcmp(foundword, NOWORD)) {
if (firsttime)
newp = (struct swline *)
addswline(newp, "or");
newp = (struct swline *)
addswline(newp, foundword);
}
else {
if (!firsttime)
newp = (struct swline *)
addswline(newp, NOWORD);
break;
}
firsttime++;
} while (strcmp(foundword, NOWORD));
}
else {
newp = (struct swline *) addswline(newp,
searchword);
}
sp = sp->next;
}
return newp;
}
/* If firsttime is 1, returns the first match to a beginnng of a word.
** Else if it's 0, returns the next match, until nothing is found,
** in which case NULL is returned.
*/
char *getmatchword(word, fp, firsttime)
char *word;
FILE *fp;
int firsttime;
{
int i, c, found;
char *d;
static char fileword[MAXWORDLEN];
if (!firsttime) {
for (i = found = 0; indexchars[i] != '\0'; i++)
if (word[0] == indexchars[i]) {
fseek(fp, offsets[i], 0);
found = 1;
}
if (!found)
return NOWORD;
}
if (offsets[STOPWORDPOS] == ftell(fp))
return NOWORD;
for (i = 0; (c = fgetc(fp)) != 0; ) {
if (c == ':') {
fileword[i] = '\0';
i = 0;
while ((c = fgetc(fp)) != 0)
;
if (fileword[0] != word[0])
return NOWORD;
d = (char *) strstr(fileword, word);
if (d != NULL && d == &fileword[0])
return fileword;
else {
if (offsets[STOPWORDPOS] == ftell(fp))
return NOWORD;
}
}
else
fileword[i++] = c;
}
return NOWORD;
}
/* Reads and prints the header of an index file.
*/
void getheader(fp)
FILE *fp;
{
int c;
char line[MAXSTRLEN];
fgets(line, MAXSTRLEN, fp);
while (1) {
c = fgetc(fp);
ungetc(c, fp);
if (c == '#') {
fgets(line, MAXSTRLEN, fp);
printf("%s", line);
continue;
}
else
break;
}
fseek(fp, 0, 0);
}
/* Reads the offsets in the index file so word lookup is faster.
*/
void readoffsets(fp)
FILE *fp;
{
int c, i, k;
long j, num;
for (i = 0; i < MAXCHARS; i++)
offsets[i] = 0;
fseek(fp, 0, 0);
while (1) {
c = fgetc(fp);
if (c == '#') {
do {
c = fgetc(fp);
} while (c && c != '\n');
continue;
}
else
break;
}
j = 0;
while (c != EOF && c != '\n') {
k = MAXLONGLEN;
for (num = 0; c && isdigit(c) && k--; ) {
num = (num * 10) + (c - '0');
c = fgetc(fp);
}
offsets[j++] = num;
}
}
/* Reads the stopwords in the index file.
*/
void readstopwords(fp)
FILE *fp;
{
int i, c;
char word[MAXWORDLEN];
fseek(fp, offsets[STOPWORDPOS], 0);
for (i = 0; (c = fgetc(fp)) != '\n' && c != EOF; )
if (!isspace(c))
word[i++] = c;
else {
word[i] = '\0';
addstophash(word);
i = 0;
}
}
/* Reads the file offset table in the index file.
*/
void readfileoffsets(fp)
FILE *fp;
{
int j, k, c;
long num;
j = 0;
fseek(fp, offsets[FILEOFFSETPOS], 0);
c = fgetc(fp);
while (c != EOF && c != '\n') {
k = MAXLONGLEN;
for (num = 0; c != EOF && isdigit(c) && k--; ) {
num = (num * 10) + (c - '0');
c = fgetc(fp);
}
addtofilehashlist(j++, num);
}
}
/* The recursive parsing function.
** This was a headache to make but ended up being surprisingly easy. :)
** parseone tells the function to only operate on one word or term.
*/
struct result *parseterm(fp, parseone)
FILE *fp;
int parseone;
{
int rulenum;
char word[MAXWORDLEN];
struct result *rp, *newrp;
rp = NULL;
rulenum = OR_RULE;
while (searchwordlist != NULL) {
strcpy(word, searchwordlist->line);
if (rulenum == NO_RULE)
rulenum = DEFAULT_RULE;
if (isunaryrule(word)) {
searchwordlist = searchwordlist->next;
rp = (struct result *) parseterm(fp, 1);
rp = (struct result *) notresultlist(rp, fp);
continue;
}
else if (isbooleanrule(word)) {
rulenum = getrulenum(word);
searchwordlist = searchwordlist->next;
continue;
}
if (word[0] == '(') {
searchwordlist = searchwordlist->next;
newrp = (struct result *) parseterm(fp, 0);
if (rulenum == AND_RULE)
rp = (struct result *)
andresultlists(rp, newrp);
else if (rulenum == OR_RULE)
rp = (struct result *)
orresultlists(rp, newrp);
if (searchwordlist == NULL)
break;
continue;
}
else if (word[0] == ')') {
searchwordlist = searchwordlist->next;
break;
}
rp = (struct result *) operate(rp, rulenum, word, fp);
if (parseone) {
searchwordlist = searchwordlist->next;
break;
}
rulenum = NO_RULE;
searchwordlist = searchwordlist->next;
}
return rp;
}
/* Looks up a word in the index file -
** it calls getfileinfo(), which does the real searching.
*/
struct result *operate(rp, rulenum, word, fp)
struct result *rp;
int rulenum;
char *word;
FILE *fp;
{
int i, found;
struct result *newrp, *returnrp;
if (isstopword(word) && !isrule(word)) {
if (rulenum == OR_RULE && rp != NULL)
return rp;
else
commonerror = 1;
}
for (i = found = 0; indexchars[i] != '\0'; i++)
if (word[0] == indexchars[i]) {
fseek(fp, offsets[i], 0);
found = 1;
}
if (!found) {
if (rulenum == AND_RULE)
return NULL;
else if (rulenum == OR_RULE)
return rp;
}
newrp = (struct result *) getfileinfo(word, fp);
if (rulenum == AND_RULE)
returnrp = (struct result *) andresultlists(rp, newrp);
else if (rulenum == OR_RULE)
returnrp = (struct result *) orresultlists(rp, newrp);
else if (rulenum == NOT_RULE)
returnrp = (struct result *) notresultlist(newrp, fp);
return returnrp;
}
/* Looks up a file name in the index file.
*/
char *lookupfile(filenum, fp)
int filenum;
FILE *fp;
{
static char line[MAXSTRLEN];
fseek(fp, getfilenum(decodefilenum(filenum) - 1), 0);
fgets(line, MAXSTRLEN, fp);
return line;
}
/* Finds a word and returns its corresponding file and rank information list.
** If not found, NULL is returned.
*/
struct result *getfileinfo(word, fp)
char *word;
FILE *fp;
{
int i, c, x, countnum, rank, filenum, structure;
char fileword[MAXWORDLEN];
struct result *rp;
rp = NULL;
for (i = 0; (c = fgetc(fp)) != 0; ) {
if (c == ':') {
fileword[i] = '\0';
i = 0;
if (!strcmp(word, fileword))
break;
else {
while ((c = fgetc(fp)) != 0)
;
if (offsets[STOPWORDPOS] == ftell(fp))
return NULL;
continue;
}
}
else
fileword[i++] = c;
}
if (c == 0)
return NULL;
countnum = 1;
ungetc(c, fp);
while ((c = fgetc(fp)) != 0) {
x = 0;
do {
c = fgetc(fp);
if (c == 0)
return rp;
x *= 128;
x += c & 127;
} while (c & 128);
if (x) {
if (countnum == 1) {
filenum = x;
countnum++;
}
else if (countnum == 2) {
rank = x;
countnum++;
}
else if (countnum == 3) {
structure = x;
rp = (struct result *)
addtoresultlist(rp, filenum, rank,
structure);
countnum = 1;
}
}
}
return rp;
}
/* Is a word a rule?
*/
int isrule(word)
char *word;
{
if (!strcmp(word, "and") || !strcmp(word, "or") || !strcmp(word, "not"))
return 1;
else
return 0;
}
/* Is a word a boolean rule?
*/
int isbooleanrule(word)
char *word;
{
if (!strcmp(word, "and") || !strcmp(word, "or"))
return 1;
else
return 0;
}
/* Is a word a unary rule?
*/
int isunaryrule(word)
char *word;
{
if (!strcmp(word, "not"))
return 1;
else
return 0;
}
/* Return the number for a rule.
*/
int getrulenum(word)
char *word;
{
if (!strcmp(word, "and"))
return AND_RULE;
else if (!strcmp(word, "or"))
return OR_RULE;
else if (!strcmp(word, "not"))
return NOT_RULE;
return NO_RULE;
}
/* Takes two lists of results from searches and ANDs them together.
*/
struct result *andresultlists(r1, r2)
struct result *r1;
struct result *r2;
{
static struct result *tmpnode, *newnode;
if (r1 == NULL || r2 == NULL)
return NULL;
newnode = NULL;
while (r1 != NULL) {
tmpnode = r2;
while (tmpnode != NULL) {
if (r1->filenum == tmpnode->filenum)
newnode = (struct result *)
addtoresultlist(newnode, r1->filenum,
(r1->rank + tmpnode->rank) / 2,
r1->structure & tmpnode->structure);
tmpnode = tmpnode->next;
}
r1 = r1->next;
}
return newnode;
}
/* Takes two lists of results from searches and ORs them together.
*/
struct result *orresultlists(r1, r2)
struct result *r1;
struct result *r2;
{
int i;
struct result *rp;
static struct result *newnode;
newnode = NULL;
if (r1 == NULL)
return r2;
else if (r2 == NULL)
return r1;
initresulthashlist();
while (r1 != NULL) {
mergeresulthashlist(r1->filenum, r1->rank, r1->structure);
r1 = r1->next;
}
while (r2 != NULL) {
mergeresulthashlist(r2->filenum, r2->rank, r2->structure);
r2 = r2->next;
}
for (i = 0; i < HASHSIZE; i++) {
rp = resulthashlist[i];
while (rp != NULL) {
newnode = (struct result *) addtoresultlist(newnode,
rp->filenum, rp->rank, rp->structure);
rp = rp->next;
}
}
return newnode;
}
/* This performs the NOT unary operation on a result list.
** NOTed files are marked with a default rank of 1000.
*/
struct result *notresultlist(rp, fp)
struct result *rp;
FILE *fp;
{
int i, filenums;
struct result *newp;
newp = NULL;
initmarkentrylist();
while (rp != NULL) {
marknum(rp->filenum);
rp = rp->next;
}
filenums = getindexfilenum(fp);
for (i = 1; !ismarked(i) && i < filenums; i++)
newp = (struct result *) addtoresultlist(newp,
i, 1000, IN_ALL);
return newp;
}
/* Adds a file number and rank to a list of results.
*/
struct result *addtoresultlist(rp, filenum, rank, structure)
struct result *rp;
int filenum;
int rank;
int structure;
{
struct result *newnode;
static struct result *head;
newnode = (struct result *) emalloc(sizeof(struct result));
newnode->filenum = filenum;
newnode->rank = rank;
newnode->structure = structure;
newnode->next = NULL;
if (rp == NULL)
rp = newnode;
else
head->next = newnode;
head = newnode;
return rp;
}
/* Adds the results of a search, sorts them by rank.
*/
struct sortresult *addsortresult(sp, rank, fileinfo)
struct sortresult *sp;
int rank;
char *fileinfo;
{
if (rank > bigrank)
bigrank = rank;
if (sp == NULL) {
sp = (struct sortresult *) emalloc(sizeof(struct sortresult));
sp->rank = rank;
sp->fileinfo = (char *) mystrdup(fileinfo);
sp->left = sp->right = NULL;
}
else {
if (sp->rank < rank)
sp->left = (struct sortresult *)
addsortresult(sp->left, rank, fileinfo);
else
sp->right = (struct sortresult *)
addsortresult(sp->right, rank, fileinfo);
}
return sp;
}
/* Prints the final results of a search.
*/
void printsortedresults(sp, num)
struct sortresult *sp;
float num;
{
int rank;
if (sp != NULL) {
printsortedresults(sp->left, num);
rank = (int) ((float) sp->rank * num);
if (rank >= 999)
rank = 1000;
if (maxhits) {
if (maxhits == -1)
printf("%d %s", (rank <= 0) ? 1 :
rank, sp->fileinfo);
else if (maxhits > 0) {
printf("%d %s", (rank <= 0) ? 1 :
rank, sp->fileinfo);
maxhits--;
}
}
printsortedresults(sp->right, num);
}
}
/* Reads a compressed line. This is just here for testing, etc.
*/
void getrawindexline(fp)
FILE *fp;
{
int c, inword;
inword = 1;
while ((c = fgetc(fp)) != EOF) {
if (c == ':' && inword)
inword = 0;
if (!inword) {
do {
c = fgetc(fp);
if (c == 0)
return;
} while (c & 128);
}
}
}
/* Does an index file have a readable format?
*/
int isokindexheader(fp)
FILE *fp;
{
char line[MAXSTRLEN];
fseek(fp, 0, 0);
fgets(line, MAXSTRLEN, fp);
if (line[strlen(line) - 1] == '\n')
line[strlen(line) - 1] = '\0';
if (strcmp(line, INDEXHEADER)) {
fseek(fp, 0, 0);
return 0;
}
fseek(fp, 0, 0);
return 1;
}
These are the contents of the former NiCE NeXT User Group NeXTSTEP/OpenStep software archive, currently hosted by Netfuture.ch.