/* This is index.c in view mode; [Download] [Up] */
/*
** Copyright (C) 1995, Enterprise Integration Technologies Corp.
** All Rights Reserved.
** Kevin Hughes, kevinh@eit.com
** 3/11/94
*/
#include "swish.h"
#include "index.h"
/* Recursively goes into a directory and calls the word-indexing
** functions for each file that's found.
**
** dir is edited in place: one trailing '/' is removed, except when
** the whole path is just "/" (stripping it there would leave an
** empty name that cannot be opened).
**
** The directory is scanned twice. The first pass only runs when
** dirconlist is non-empty: if any entry name contains one of its
** strings, the whole directory is skipped. The second pass filters
** each entry (dot files, fileislist, fileconlist, pathconlist,
** symbolic links, suffix and title checks) and collects the survivors
** into two sorted trees, which are then handed to printfiles() and
** printdirs().
*/
void indexadir(dir)
char *dir;
{
	int badfile, pathlen;
	size_t dirlen;
	char *base;
	DIR *dfd;
#ifdef NEXTSTEP
	struct direct *dp;
#else
	struct dirent *dp;
#endif
	/* Shared across recursion levels; that is safe because they are
	** only touched inside the scan loop, which is finished before
	** printdirs() re-enters this function.
	*/
	static char s[MAXFILELEN], title[MAXSTRLEN];
	struct sortentry *sortfilelist, *sortdirlist;
	struct swline *tmplist;

	sortfilelist = sortdirlist = NULL;

	/* An empty name has no last character to inspect. */
	if (dir == NULL || dir[0] == '\0')
		return;
	if (islink(dir) && !followsymlinks)
		return;

	dirlen = strlen(dir);
	if (dirlen > 1 && dir[dirlen - 1] == '/')
		dir[--dirlen] = '\0';

	if ((dfd = opendir(dir)) == NULL)
		return;

	/* Pass 1: one matching entry name disqualifies the directory. */
	if (dirconlist != NULL) {
		while ((dp = readdir(dfd)) != NULL) {
			for (tmplist = dirconlist; tmplist != NULL;
			tmplist = tmplist->next) {
				if (lstrstr(dp->d_name, tmplist->line)) {
					/* Release the handle before bailing out. */
					closedir(dfd);
					return;
				}
			}
		}
		rewinddir(dfd);
	}

	/* Pass 2: filter the entries and sort them into the two trees. */
	while ((dp = readdir(dfd)) != NULL) {
		if ((dp->d_name)[0] == '.')
			continue;

		badfile = 0;
		for (tmplist = fileislist; tmplist != NULL;
		tmplist = tmplist->next) {
			if (!strcmp(dp->d_name, tmplist->line)) {
				badfile = 1;
				break;
			}
		}
		if (badfile)
			continue;

		for (tmplist = fileconlist; tmplist != NULL;
		tmplist = tmplist->next) {
			if (lstrstr(dp->d_name, tmplist->line)) {
				badfile = 1;
				break;
			}
		}
		if (badfile)
			continue;

		/* Build the full path; entries that do not fit are skipped
		** rather than overflowing the buffer.
		*/
		pathlen = snprintf(s, sizeof s, "%s%s%s", dir,
			dir[dirlen - 1] == '/' ? "" : "/", dp->d_name);
		if (pathlen < 0 || (size_t) pathlen >= sizeof s)
			continue;

		/* The link test needs the full path; a bare entry name
		** would be resolved against the current directory.
		*/
		if (islink(s) && !followsymlinks)
			continue;

		for (tmplist = pathconlist; tmplist != NULL;
		tmplist = tmplist->next) {
			if (lstrstr(s, tmplist->line)) {
				badfile = 1;
				break;
			}
		}
		if (badfile)
			continue;

		if (isdirectory(s)) {
			sortdirlist = (struct sortentry *)
				addsortentry(sortdirlist, s, s);
			continue;
		}

		if (!isoksuffix(dp->d_name, suffixlist))
			continue;
		if (ishtml(s)) {
			snprintf(title, sizeof title, "%s",
				(char *) parsetitle(s));
			if (!isoktitle(title))
				continue;
		}
		else {
			/* Non-HTML files are titled by their base name. */
			base = strrchr(s, '/');
			snprintf(title, sizeof title, "%s",
				base != NULL ? base + 1 : s);
		}
		sortfilelist = (struct sortentry *)
			addsortentry(sortfilelist, s, title);
	}
	closedir(dfd);

	printfiles(sortfilelist);
	printdirs(sortdirlist);
}
/* Calls the word-indexing function for a single file.
**
** path is edited in place: one trailing '/' is removed (unless the
** path is a single character). The file is skipped when it is a
** symbolic link and links are not followed, when path equals an entry
** of fileislist, when path contains an entry of fileconlist or
** pathconlist, when its suffix is not accepted, or when it is HTML
** with a rejected title. Otherwise a one-node sort tree is built and
** handed to printfiles(), which indexes it and frees it.
*/
void indexafile(path)
char *path;
{
	int badfile;
	size_t pathlen;
	char *t, title[MAXSTRLEN];
	struct sortentry *fileentry;
	struct swline *tmplist;

	/* An empty name has no last character to inspect. */
	if (path == NULL || path[0] == '\0')
		return;
	if (islink(path) && !followsymlinks)
		return;

	pathlen = strlen(path);
	if (pathlen > 1 && path[pathlen - 1] == '/')
		path[pathlen - 1] = '\0';

	badfile = 0;
	for (tmplist = fileislist; tmplist != NULL; tmplist = tmplist->next) {
		if (!strcmp(path, tmplist->line)) {
			badfile = 1;
			break;
		}
	}
	if (badfile)
		return;

	for (tmplist = fileconlist; tmplist != NULL; tmplist = tmplist->next) {
		if (lstrstr(path, tmplist->line)) {
			badfile = 1;
			break;
		}
	}
	if (badfile)
		return;

	for (tmplist = pathconlist; tmplist != NULL; tmplist = tmplist->next) {
		if (lstrstr(path, tmplist->line)) {
			badfile = 1;
			break;
		}
	}
	if (badfile)
		return;

	if (!isoksuffix(path, suffixlist))
		return;

	if (ishtml(path)) {
		/* Bounded copy: the parsed title may exceed the buffer. */
		snprintf(title, sizeof title, "%s", (char *) parsetitle(path));
		if (!isoktitle(title))
			return;
	}
	else {
		/* Non-HTML files are titled by their base name. */
		t = strrchr(path, '/');
		snprintf(title, sizeof title, "%s", t != NULL ? t + 1 : path);
	}

	fileentry = (struct sortentry *) emalloc(sizeof(struct sortentry));
	fileentry->filename = (char *) mystrdup(path);
	fileentry->title = (char *) mystrdup(title);
	fileentry->left = fileentry->right = NULL;
	printfiles(fileentry);
}
/* Walks the sorted file tree in order (left subtree, node, right
** subtree), runs countwords() on every file and releases each node
** together with its strings. At verbosity 3 the base name of each
** file and its word count are printed as it is processed.
*/
void printfiles(e)
struct sortentry *e;
{
	int nwords;
	char *slash;

	if (e == NULL)
		return;

	printfiles(e->left);

	if (verbose == 3) {
		/* Show only the part after the last '/', if there is one. */
		slash = (char *) strrchr(e->filename, '/');
		if (slash != NULL)
			printf(" %s", slash + 1);
		else
			printf(" %s", e->filename);
	}

	nwords = countwords(e->filename, e->title);

	if (verbose == 3) {
		if (nwords == 0)
			printf(" (no words)\n");
		else
			printf(" (%d words)\n", nwords);
		fflush(stdout);
	}

	free(e->filename);
	free(e->title);

	/* The node itself must outlive the walk of its right subtree. */
	printfiles(e->right);
	free(e);
}
/* Walks the sorted directory tree in order, announces each directory
** according to the verbosity level, and calls indexadir() on it so
** that subdirectories are indexed alphabetically. Every node and its
** strings are freed once visited.
*/
void printdirs(e)
struct sortentry *e;
{
	if (e == NULL)
		return;

	printdirs(e->left);

	switch (verbose) {
	case 3:
		printf("\nIn dir \"%s\":\n", e->filename);
		break;
	case 2:
		printf("Checking dir \"%s\"...\n", e->filename);
		break;
	default:
		break;
	}

	indexadir(e->filename);

	free(e->filename);
	free(e->title);

	/* The node itself must outlive the walk of its right subtree. */
	printdirs(e->right);
	free(e);
}
/* Inserts a (filename, title) pair into the binary tree rooted at e,
** ordered by strcmp() on the file name, and returns the root (the new
** node itself when the tree was empty). A name that compares less
** than a node goes to its left; equal or greater goes to its right.
** Both strings are duplicated with mystrdup().
*/
struct sortentry *addsortentry(e, filename, title)
struct sortentry *e;
char *filename;
char *title;
{
	struct sortentry *node;
	struct sortentry **slot;

	/* Descend to the empty link where the new node belongs. */
	slot = &e;
	while (*slot != NULL) {
		if (strcmp((*slot)->filename, filename) > 0)
			slot = &(*slot)->left;
		else
			slot = &(*slot)->right;
	}

	node = (struct sortentry *) emalloc(sizeof(struct sortentry));
	node->filename = (char *) mystrdup(filename);
	node->title = (char *) mystrdup(title);
	node->left = node->right = NULL;

	/* When the tree was empty, slot is &e, so this also sets the root. */
	*slot = node;
	return e;
}
/* Records one occurrence of word in file filenum in the master index
** tree rooted at e, and returns the root.
**
** - Unknown word: a new tree node with a single location is created
**   and the global totalwords counter is incremented.
** - Known word, first occurrence in this file: a new location is
**   appended to the word's list; tfrequency is incremented only when
**   the occurrence is not emphasized.
** - Known word, file already listed: the location's frequency is
**   incremented, its emphasized count is incremented when emphasized
**   is non-zero, and structure is OR-ed into its structure bits.
**
** Tree ordering is decided by wordcompare().
*/
struct entry *addentry(e, word, filenum, emphasized, structure)
struct entry *e;
char *word;
int filenum;
int emphasized;
int structure;
{
	int cmp;
	struct location *loc, *last;

	if (e == NULL) {
		e = (struct entry *) emalloc(sizeof(struct entry));
		e->word = (char *) mystrdup(word);
		e->tfrequency = 1;
		loc = (struct location *) emalloc(sizeof(struct location));
		loc->filenum = filenum;
		loc->frequency = 1;
		loc->emphasized = emphasized;
		loc->structure = structure;
		loc->next = NULL;
		e->locationlist = loc;
		e->left = e->right = NULL;
		totalwords++;
		return e;
	}

	cmp = wordcompare(e->word, word);
	if (cmp > 0) {
		e->left = addentry(e->left, word, filenum, emphasized,
			structure);
		return e;
	}
	if (cmp < 0) {
		e->right = addentry(e->right, word, filenum, emphasized,
			structure);
		return e;
	}

	/* Same word: look for an existing location for this file. */
	last = NULL;
	for (loc = e->locationlist; loc != NULL && loc->filenum != filenum;
	loc = loc->next)
		last = loc;

	if (loc != NULL) {
		loc->frequency = loc->frequency + 1;
		if (emphasized)
			loc->emphasized = loc->emphasized + 1;
		loc->structure |= structure;
		return e;
	}

	/* First occurrence in this file: append behind the list tail.
	** Every entry is created with one location, so last is set here.
	*/
	loc = (struct location *) emalloc(sizeof(struct location));
	loc->filenum = filenum;
	loc->frequency = 1;
	loc->emphasized = emphasized;
	loc->structure = structure;
	loc->next = NULL;
	last->next = loc;
	if (!emphasized)
		e->tfrequency = e->tfrequency + 1;
	return e;
}
/* Appends a file record (name, title, size) to the master file list
** and returns the head of the list.
**
** The most recently created node is remembered in a function-static
** pointer so that appending does not walk the list. When filep is
** NULL the new node becomes the head; otherwise it is linked behind
** the remembered node, if there is one.
*/
struct file *addtofilelist(filep, filename, title, size)
struct file *filep;
char *filename;
char *title;
int size;
{
	static struct file *tail = NULL;
	struct file *node;

	node = (struct file *) emalloc(sizeof(struct file));
	node->filename = (char *) mystrdup(filename);
	node->title = (char *) mystrdup(title);
	node->size = size;
	node->next = NULL;

	if (filep != NULL) {
		if (tail != NULL)
			tail->next = node;
	}
	else {
		filep = node;
	}

	tail = node;
	return filep;
}
/* Returns the number of nodes in the master file list that starts
** at filep (0 for an empty list).
*/
int getfilecount(filep)
struct file *filep;
{
	int count = 0;

	while (filep != NULL) {
		count++;
		filep = filep->next;
	}
	return count;
}
/* Returns the nicely formatted date.
**
** The time comes from getthetime() and is formatted in local time as
** "dd/mm/yy HH:MM:SS ZONE". The result lives in a static buffer that
** is overwritten by the next call; it is an empty string when the
** time cannot be converted or formatted.
*/
char *getthedate()
{
	static char date[MAXSTRLEN];
	time_t now;
	struct tm *local;

	now = (time_t) getthetime();
	local = localtime(&now);

	/* strftime() leaves the buffer unspecified when it returns 0. */
	if (local == NULL ||
	strftime(date, sizeof date, "%d/%m/%y %H:%M:%S %Z", local) == 0)
		date[0] = '\0';
	return date;
}
/* Finishes one word collected by countwords(): lower-cases it in
** place, runs it through convertentities() when it passes isokword(),
** and adds it to the master index when the result still passes
** hasokchars() and isokword(). Returns 1 when the word was indexed,
** 0 otherwise.
*/
static int indexfileword(word, filenum, emphasized, structure)
char *word;
int filenum;
int emphasized;
int structure;
{
	int k;

	/* <ctype.h> functions need an unsigned char value. */
	for (k = 0; word[k]; k++)
		word[k] = tolower((unsigned char) word[k]);
	if (isokword(word))
		strcpy(word, (char *) convertentities(word));
	/* Sorry, have to do isokword() twice to filter out converted strings! */
	if (!hasokchars(word) || !isokword(word))
		return 0;
#ifdef DEBUG
	printf(" %s %d\n", word, structure);
#endif
	entrylist = (struct entry *)
		addentry(entrylist, word, filenum, emphasized, structure);
	return 1;
}

/* Indexes all the words in a file and adds the appropriate information
** to the appropriate structures.
**
** Every file that can be opened is appended to the master file list
** and given the next file number; numbers divisible by 128 are
** skipped. Files whose suffix is in nocontentslist get a nominal word
** total of 100 and only their title is indexed. For all other files
** the contents are scanned: words are runs of iswordchar() characters
** (truncated to MAXWORDLEN - 1), and unless INDEXTAGS is set, text
** between '<' and '>' is treated as a tag that updates the structure
** bits and is not indexed (comment tags go through parsecomment()).
** Words inside headers or titles get an emphasis value of 5.
**
** Returns the number of words indexed for the file (for a
** no-contents file, the value returned by countwordstr() on the
** title), or 0 when the file cannot be opened.
*/
int countwords(filename, title)
char *filename;
char *title;
{
	int c, i, j, inword, ftotalwords, emphasized, structure;
	static int filenum;
	char word[MAXWORDLEN], tag[MAXSTRLEN];
	FILE *fp;

	if ((fp = fopen(filename, "r")) == NULL)
		return 0;
	ftotalwords = 0;

	filelist = addtofilelist(filelist, filename, title, getsize(filename));
	filenum++;
	if (!(filenum % 128))
		filenum++;

	if (nocontentslist != NULL && isoksuffix(filename, nocontentslist)) {
		fclose(fp);
		addtofwordtotals(filenum, 100);
		return (countwordstr(title, filenum, 0));
	}

	c = 1;
	i = j = 0;
	inword = 0;
	emphasized = 0;
	structure = 1;
	while (c != EOF && (c = fgetc(fp)) != EOF) {
		if (!inword) {
			if (iswordchar(c)) {
				i = 0;
				word[i++] = c;
				if (i == MAXWORDLEN)
					i--;
				inword = 1;
			}
		}
		else if (!iswordchar(c)) {
			word[i] = '\0';
			ftotalwords += indexfileword(word, filenum,
				emphasized, structure);
			i = 0;
			inword = 0;
		}
		else {
			/* Once the buffer is full the last slot is reused. */
			word[i++] = c;
			if (i == MAXWORDLEN)
				i--;
		}
		if (c == '<' && !INDEXTAGS) {
			j = 0;
			while ((c = fgetc(fp)) != EOF) {
				tag[j++] = c;
				if (j == MAXSTRLEN)
					j--;
				if (c == '>') {
					/* Overwrite the '>' with the terminator. */
					if (j)
						tag[--j] = '\0';
					else
						tag[j] = '\0';
#ifdef DEBUG
					printf("t: %s\n", tag);
#endif
					structure = getstructure(tag,
						structure);
#ifdef DEBUG
					printf("s: %d\n", structure);
#endif
					if (tag[0] == '!')
						ftotalwords +=
							parsecomment(tag,
							filenum, structure);
					if ((structure & IN_HEADER) ||
					(structure & IN_TITLE))
						emphasized = 5;
					else
						emphasized = 0;
					break;
				}
			}
		}
	}

	/* A file that ends in the middle of a word still has that word. */
	if (inword) {
		word[i] = '\0';
		ftotalwords += indexfileword(word, filenum, emphasized,
			structure);
	}

	fclose(fp);
	addtofwordtotals(filenum, ftotalwords);
	return ftotalwords;
}
/* Indexes the words in a string, such as a file name or an
** HTML title.
**
** Words are runs of iswordchar() characters, truncated to
** MAXWORDLEN - 1. Each is lower-cased, passed through
** convertentities() when it passes isokword(), and added to the
** master index with structure IN_FILE when it still passes
** hasokchars() and isokword(). The end of the string ends a word just
** like a non-word character does, so the string is scanned in place
** and may be of any length.
**
** Returns the number of words found, whether or not they were added
** to the index. A NULL string yields 0.
*/
int countwordstr(s, filenum, emphasized)
char *s;
int filenum;
int emphasized;
{
	int i, j, inword, wordcount;
	char c, word[MAXWORDLEN];

	if (s == NULL)
		return 0;

	i = inword = wordcount = 0;
	for (j = 0; ; j++) {
		c = s[j];
		if (c != '\0' && iswordchar(c)) {
			if (!inword) {
				i = 0;
				inword = 1;
			}
			/* Once the buffer is full the last slot is reused. */
			word[i++] = c;
			if (i == MAXWORDLEN)
				i--;
		}
		else if (inword) {
			wordcount++;
			word[i] = '\0';
			/* <ctype.h> functions need an unsigned char value. */
			for (i = 0; word[i]; i++)
				word[i] = tolower((unsigned char) word[i]);
			if (isokword(word))
				strcpy(word, (char *)
					convertentities(word));
			if (hasokchars(word) && isokword(word))
				entrylist = (struct entry *)
					addentry(entrylist, word,
					filenum, emphasized, IN_FILE);
			inword = 0;
		}
		if (c == '\0')
			break;
	}
	return wordcount;
}
/* This returns the value corresponding to the HTML structures
** a word is in.
**
** tag is the text between '<' and '>'; structure is the current set
** of IN_* bits. The first matching rule below sets or clears one bit
** and the updated value is returned; an unrecognised tag returns
** structure unchanged.
**
** The title/head/body rules combine an lstrstr() match with an exact
** length, so tags carrying attributes do not match them. The em and
** strong rules match wherever lstrstr() finds the text in the tag.
** NOTE(review): lstrstr() is presumably a case-insensitive substring
** search - confirm against its definition.
*/
int getstructure(tag, structure)
char *tag;
int structure;
{
	int len;

	len = strlen(tag);
	if (lstrstr(tag, "/title") && len == 6)
		structure &= ~IN_TITLE;
	else if (lstrstr(tag, "title") && len == 5)
		structure |= IN_TITLE;
	else if (lstrstr(tag, "/head") && len == 5)
		structure &= ~IN_HEAD;
	else if (lstrstr(tag, "head") && len == 4)
		structure |= IN_HEAD;
	else if (lstrstr(tag, "/body") && len == 5)
		structure &= ~IN_BODY;
	else if (lstrstr(tag, "body") && len == 4)
		structure |= IN_BODY;
	/* </hN>: each test only runs when the previous character exists. */
	else if (tag[0] == '/' && tolower((unsigned char) tag[1]) == 'h' &&
	isdigit((unsigned char) tag[2]))
		structure &= ~IN_HEADER;
	else if (tolower((unsigned char) tag[0]) == 'h' &&
	isdigit((unsigned char) tag[1]))
		structure |= IN_HEADER;
	else if (lstrstr(tag, "/em") || lstrstr(tag, "/strong"))
		structure &= ~IN_EMPHASIZED;
	else if (lstrstr(tag, "em") || lstrstr(tag, "strong"))
		structure |= IN_EMPHASIZED;
	else if (len == 1 && (tolower((unsigned char) tag[0]) == 'b' ||
	tolower((unsigned char) tag[0]) == 'i'))
		structure |= IN_EMPHASIZED;
	/* </b> or </i>: check the length first so nothing past the
	** terminator is read for a lone "/".
	*/
	else if (len == 2 && tag[0] == '/' &&
	(tolower((unsigned char) tag[1]) == 'b' ||
	tolower((unsigned char) tag[1]) == 'i'))
		structure &= ~IN_EMPHASIZED;
	return structure;
}
/* Parses the words in a comment.
**
** tag is the text of a tag that starts with '!'; scanning begins at
** its fourth character. Words are runs of iswordchar() characters,
** truncated to MAXWORDLEN - 1; each is lower-cased, passed through
** convertentities() when it passes isokword(), and added to the
** master index with IN_COMMENTS OR-ed into structure when it still
** passes hasokchars() and isokword(). The emphasis value is 5 when
** EMPHASIZECOMMENTS is set, 0 otherwise. The end of the tag ends a
** word just like a non-word character does.
**
** Returns the number of words found, whether or not they were added
** to the index. Tags shorter than three characters contain nothing
** to scan and yield 0.
*/
int parsecomment(tag, filenum, structure)
char *tag;
int filenum;
int structure;
{
	int i, j, inword, wordcount, emphasized;
	char c, word[MAXWORDLEN];

	/* Starting at tag[3] is only valid when the tag reaches that far. */
	if (tag == NULL || strlen(tag) < 3)
		return 0;

	if (EMPHASIZECOMMENTS)
		emphasized = 5;
	else
		emphasized = 0;
	structure |= IN_COMMENTS;

	i = inword = wordcount = 0;
	for (j = 3; ; j++) {
		c = tag[j];
		if (c != '\0' && iswordchar(c)) {
			if (!inword) {
				i = 0;
				inword = 1;
			}
			/* Once the buffer is full the last slot is reused. */
			word[i++] = c;
			if (i == MAXWORDLEN)
				i--;
		}
		else if (inword) {
			wordcount++;
			word[i] = '\0';
			/* <ctype.h> functions need an unsigned char value. */
			for (i = 0; word[i]; i++)
				word[i] = tolower((unsigned char) word[i]);
			if (isokword(word))
				strcpy(word, (char *)
					convertentities(word));
			if (hasokchars(word) && isokword(word))
				entrylist = (struct entry *)
					addentry(entrylist, word,
					filenum, emphasized, structure);
			inword = 0;
		}
		if (c == '\0')
			break;
	}
	return wordcount;
}
/* Walks the index tree in order and marks as stopwords (through
** addstophash()) the words that appear in at least plimit percent of
** the files and in at least flimit files. Returns how many words
** were marked in the subtree rooted at ep.
*/
int removestops(ep, totalfiles, plimit, flimit)
struct entry *ep;
int totalfiles;
int plimit;
int flimit;
{
	int marked, nfiles, percent;
	struct location *lp;

	if (ep == NULL)
		return 0;

	marked = removestops(ep->left, totalfiles, plimit, flimit);

	/* One location per file, so the list length is the file count. */
	nfiles = 0;
	for (lp = ep->locationlist; lp != NULL; lp = lp->next)
		nfiles++;

	percent = ((float) nfiles / (float) totalfiles) * 100.0;
	if (percent >= plimit && nfiles >= flimit) {
		addstophash(ep->word);
		marked++;
	}

	marked += removestops(ep->right, totalfiles, plimit, flimit);
	return marked;
}
/* This is somewhat similar to the rank calculation algorithm
** from WAIS (I think). Any suggestions for improvements?
** Note that ranks can't be smaller than 1, emphasized words
** (words in titles, headers) have ranks multiplied by at least 5
** (just a guess), and ranks divisible by 128 are bumped up by one
** (to make the compression scheme with '\0' as a line delimiter
** work). Fudging with the ranks doesn't seem to make much difference.
**
** freq is the word's frequency in the file (values below 5 count as
** 5), tfreq the word's total frequency, words the number of words in
** the file, and emphasized a multiplier applied when non-zero.
** The base rank is (log(freq) + 10) / tfreq / words * 10000,
** truncated to an integer after being formatted with "%f". When
** tfreq or words is not positive no division is attempted and the
** base rank is 1.
*/
int getrank(freq, tfreq, words, emphasized)
int freq;
int tfreq;
int words;
int emphasized;
{
	float d, e, f;
	int n, tmprank;
	char rankstr[MAXSTRLEN];

	if (freq < 5)
		freq = 5;

	tmprank = 1;
	if (tfreq > 0 && words > 0) {
		/* Intermediate results stay in float, as before, so the
		** computed ranks are unchanged.
		*/
		d = 1.0 / (double) tfreq;
		e = ((log((double) freq) + 10.0) * d) / words;
		f = e * 10000.0;
		n = snprintf(rankstr, sizeof rankstr, "%f", f);
		if (n > 0 && (size_t) n < sizeof rankstr)
			tmprank = atoi(rankstr);
		if (tmprank <= 0)
			tmprank = 1;
	}

	if (emphasized)
		tmprank *= emphasized;
	if (!(tmprank % 128))
		tmprank++;
	return tmprank;
}
/* Prints the index information at the head of index files.
**
** Writes INDEXHEADER followed by "# ..." lines for the index name,
** the name the index is saved as, the word and file counts, the
** indexing date, and the description, pointer and maintainer
** strings. Empty strings are shown as "(no ...)" placeholders.
** "Saved as" is the part of filename after its last '/'; the whole
** filename is used when there is no '/' or nothing follows it.
*/
void printheader(fp, filename, totalwords, totalfiles)
FILE *fp;
char *filename;
int totalwords;
int totalfiles;
{
	char *c, *savedas;

	c = (char *) strrchr(filename, '/');
	savedas = (c == NULL || c[1] == '\0') ? filename : c + 1;

	fprintf(fp, "%s\n", INDEXHEADER);
	fprintf(fp, "# Name: %s\n", (indexn[0] == '\0') ? "(no name)" :
		indexn);
	fprintf(fp, "# Saved as: %s\n", savedas);
	fprintf(fp, "# Counts: ");
	if (totalwords)
		fprintf(fp, "%d words%s", totalwords, (totalfiles) ? ", " : "");
	if (totalfiles)
		fprintf(fp, "%d files", totalfiles);
	fprintf(fp, "\n");
	fprintf(fp, "# Indexed on: %s\n", getthedate());
	fprintf(fp, "# Description: %s\n", (indexd[0] == '\0') ?
		"(no description)" : indexd);
	fprintf(fp, "# Pointer: %s\n", (indexp[0] == '\0') ?
		"(no pointer)" : indexp);
	fprintf(fp, "# Maintained by: %s\n", (indexa[0] == '\0') ?
		"(no maintainer)" : indexa);
}
/* Writes the index tree to fp in order. Each word that is not a
** stopword becomes one record: the word, a ':', then for every
** location the compressed file number, rank and structure value,
** closed by a zero byte. Before a record is written, the current
** file position is stored in offsets[] for every indexchars[]
** character that equals the word's first character and has no
** offset recorded yet.
*/
void printindex(ep, fp)
struct entry *ep;
FILE *fp;
{
	int slot, rank;
	struct location *loc;

	if (ep == NULL)
		return;

	printindex(ep->left, fp);

	if (!isstopword(ep->word)) {
		for (slot = 0; indexchars[slot] != '\0'; slot++) {
			if (offsets[slot])
				continue;
			if ((ep->word)[0] == indexchars[slot])
				offsets[slot] = ftell(fp);
		}

		fprintf(fp, "%s:", ep->word);
		for (loc = ep->locationlist; loc != NULL; loc = loc->next) {
			compress(loc->filenum, fp);
			rank = getrank(loc->frequency, ep->tfrequency,
				gettotalwords(loc->filenum), loc->emphasized);
			compress(rank, fp);
			compress(loc->structure, fp);
		}
		/* Zero byte ends the record. */
		fputc(0, fp);
	}

	printindex(ep->right, fp);
}
/* Records the current file position in offsets[STOPWORDPOS], then
** writes every word stored in the stopword hash table to fp, each
** followed by a space, and ends the list with a newline.
*/
void printstopwords(fp)
FILE *fp;
{
	int bucket;
	struct swline *sw;

	offsets[STOPWORDPOS] = ftell(fp);

	bucket = 0;
	while (bucket < HASHSIZE) {
		for (sw = hashstoplist[bucket]; sw != NULL; sw = sw->next)
			fprintf(fp, "%s ", sw->line);
		bucket++;
	}
	fprintf(fp, "\n");
}
/* Records the current file position in offsets[FILELISTPOS], then
** writes one line per file in the list: the file name after
** ruleparse(), the quoted title and the size. The position at which
** each line starts is registered with addtofilehashlist() under the
** file's zero-based position in the list.
*/
void printfilelist(filep, fp)
struct file *filep;
FILE *fp;
{
	int index;

	offsets[FILELISTPOS] = ftell(fp);

	for (index = 0; filep != NULL; filep = filep->next, index++) {
		addtofilehashlist(index, ftell(fp));
		fprintf(fp, "%s \"%s\" %d\n", ruleparse(filep->filename),
			filep->title, filep->size);
	}
}
/* Records the current file position in offsets[FILEOFFSETPOS], then
** writes the values returned by getfilenum(0), getfilenum(1), ... as
** 16-digit zero-padded numbers, stopping at the first value of 0.
*/
void printfileoffsets(fp)
FILE *fp;
{
	int index;

	offsets[FILEOFFSETPOS] = ftell(fp);

	index = 0;
	while (getfilenum(index) != 0) {
		fprintf(fp, "%016li", getfilenum(index));
		index++;
	}
}
/* Takes a number and prints it to a file using the simple
** accordion scheme of storing numbers.
**
** The number is written as base-128 digits, most significant first;
** every digit except the last has its high bit set. One pad byte
** (0x80) is written after the digits. The pad is part of the format:
** decompress() in this file discards exactly one byte after every
** number it reads. Earlier the pad came from reading one element in
** front of the digit buffer, so its low bits were not defined; it is
** now always 0x80.
**
** A value of 0 has no digits and produces only the pad byte.
** NOTE(review): negative values are not meaningful here - callers
** appear to pass file numbers, ranks and structure bits only.
*/
void compress(num, fp)
int num;
FILE *fp;
{
	int i;
	char s[8];

	/* Collect the digits, least significant first. */
	i = 0;
	while (num && i < (int) sizeof s) {
		s[i++] = num % 128;
		num /= 128;
	}

	/* Emit them most significant first; the last one has no flag. */
	while (i-- > 0)
		fputc(s[i] | (i ? 128 : 0), fp);

	/* Pad byte expected by decompress(). */
	fputc(128, fp);
}
/* Prints out the decompressed values in an index file.
**
** After readoffsets() the file is rewound. Leading lines that start
** with '#' are echoed, followed by the next MAXHEADCHARS characters.
** From there on each record is printed as its word, a ':', and the
** decoded numbers separated by spaces (zero values are not printed);
** a zero byte ends the record. One byte is consumed and ignored
** between consecutive numbers, matching the pad byte that compress()
** writes after each number. When the read position reaches
** offsets[STOPWORDPOS] the rest of the file is echoed line by line.
**
** A file that ends early stops the dump instead of being read past
** its end.
*/
void decompress(fp)
FILE *fp;
{
	int c, x, inword;
	long pos;
	char line[MAXSTRLEN], header[MAXHEADCHARS + 1];

	readoffsets(fp);
	fseek(fp, 0L, SEEK_SET);
	inword = 1;

	/* Comment lines and the header block. */
	for (;;) {
		c = fgetc(fp);
		if (c == EOF)
			return;
		ungetc(c, fp);
		if (c == '#') {
			if (fgets(line, MAXSTRLEN, fp) == NULL)
				return;
			printf("%s", line);
			continue;
		}
		if (fgets(header, MAXHEADCHARS + 1, fp) != NULL)
			printf("%s", header);
		break;
	}

	/* While inword is 0, the byte read here is the pad byte that
	** follows the previous number and is dropped.
	*/
	while ((c = fgetc(fp)) != EOF) {
		if (c == ':' && inword) {
			inword = 0;
			putchar(c);
		}
		if (inword) {
			putchar(c);
			continue;
		}

		x = 0;
		do {
			c = fgetc(fp);
			if (c == EOF) {
				/* Truncated record: EOF has bit 7 set, so
				** without this check the loop never ends.
				*/
				putchar('\n');
				return;
			}
			pos = ftell(fp);
			if (pos == offsets[STOPWORDPOS]) {
				putchar('\n');
				while (fgets(line, MAXSTRLEN, fp) != NULL)
					printf("%s", line);
				return;
			}
			if (c == 0) {
				putchar('\n');
				inword = 1;
				break;
			}
			x *= 128;
			x += c & 127;
		} while (c & 128);
		if (x)
			printf(" %d", x);
	}
}
/* Parses lines according to the ReplaceRules directives.
**
** replacelist is read as a flat sequence of rule names followed by
** their arguments: "replace" takes two (old text, new text), "append"
** and "prepend" take one. The rules are applied to a copy of line in
** list order. A name that matches none of the three is skipped
** without changing the text, and processing stops at a rule whose
** arguments are missing.
**
** Returns line itself when there are no rules; otherwise a pointer
** to a static buffer that is overwritten by the next call.
*/
char *ruleparse(line)
char *line;
{
	char rule[MAXSTRLEN];
	static char tmpline[MAXSTRLEN], newtmpline[MAXSTRLEN];
	static char line1[MAXSTRLEN], line2[MAXSTRLEN];
	struct swline *tmplist;

	if (replacelist == NULL)
		return line;

	snprintf(tmpline, sizeof tmpline, "%s", line);
	tmplist = replacelist;
	while (tmplist != NULL) {
		snprintf(rule, sizeof rule, "%s", tmplist->line);
		tmplist = tmplist->next;
		if (tmplist == NULL)
			break;		/* rule name without an argument */

		if (lstrstr(rule, "replace")) {
			if (tmplist->next == NULL)
				break;	/* "replace" needs two arguments */
			snprintf(line1, sizeof line1, "%s", tmplist->line);
			tmplist = tmplist->next;
			snprintf(line2, sizeof line2, "%s", tmplist->line);
			tmplist = tmplist->next;
			/* Two steps through the NOWORD placeholder, as before. */
			strcpy(newtmpline, (char *) replace(tmpline,
				line1, NOWORD));
			strcpy(newtmpline, (char *) replace(newtmpline,
				NOWORD, line2));
		}
		else if (lstrstr(rule, "append")) {
			snprintf(newtmpline, sizeof newtmpline, "%s%s",
				tmpline, tmplist->line);
			tmplist = tmplist->next;
		}
		else if (lstrstr(rule, "prepend")) {
			snprintf(newtmpline, sizeof newtmpline, "%s%s",
				tmplist->line, tmpline);
			tmplist = tmplist->next;
		}
		else {
			/* Not a known rule: newtmpline holds nothing new, so
			** the text is left alone.
			*/
			continue;
		}
		strcpy(tmpline, newtmpline);
	}
	return tmpline;
}
/* These are the contents of the former NiCE NeXT User Group NeXTSTEP/OpenStep software archive, currently hosted by Netfuture.ch. */