
/* This is merge.c in view mode; [Download] [Up] */

/*
** Copyright (C) 1995, Enterprise Integration Technologies Corp.
** All Rights Reserved.
** Kevin Hughes, kevinh@eit.com
** 3/11/94
*/

#include <stdlib.h>

#include "swish.h"
#include "merge.h"

/* The main merge function - it accepts three file names.
** This is a bit hairy. It basically acts as a zipper,
** zipping up both index files into one.
*/

/* Outline of readmerge(), as far as the visible statements show:
**   1. open file1 and file2 for reading and test each one with
**      isokindexheader();
**   2. count the files in each index with getindexfilenum();
**   3. read the file-info lines of both indexes and register them with
**      addindexfilelist(), numbering file2's entries after file1's;
**   4. open outfile for writing, emit the header and a table of
**      offsets[] values, then clear offsets[];
**   5. loop over both word lists, printing entries in wordcompare()
**      order and merging entries whose words compare equal;
**   6. reopen outfile in "a+" mode and write one file-info line for
**      every i whose getmap(i) equals the next output number j;
**   7. seek back to the saved position and rewrite the offsets[] table.
**
** NOTE(review): this function is not compilable as it stands. Closing
** braces, the trailing arguments of several sprintf() calls and the
** bodies of several if statements are absent (for example nothing
** follows "if (ip1 == NULL)"). They look lost in transcription;
** restore them from a pristine copy before building.
*/
void readmerge(file1, file2, outfile)
     char *file1;
     char *file2;
     char *outfile;
	int i, j, indexfilenum1, indexfilenum2, result, totalfiles,
		skipwords, skipfiles;
	long limit1, limit2, fileinfo1, fileinfo2, offsetstart;
	char line[MAXSTRLEN];
	struct indexentry *ip1, *ip2, *ip3;
	struct indexentry *buffer1, *buffer2;
	FILE *fp1, *fp2, *fp3;


	/* Open both input indexes and check their headers.
	** NOTE(review): each sprintf() below is cut off after its format
	** string, and whatever reported the error is not visible. */
	if ((fp1 = fopen(file1, "r")) == NULL) {
		sprintf(errorstr, "Couldn't read the index file \"%s\".",
	if (!isokindexheader(fp1)) {
		sprintf(errorstr, "\"%s\" has an unknown format.",
	if ((fp2 = fopen(file2, "r")) == NULL) {
		sprintf(errorstr, "Couldn't read the index file \"%s\".",
	if (!isokindexheader(fp2)) {
		sprintf(errorstr, "\"%s\" has an unknown format.",

	ip1 = ip2 = ip3 = NULL;
	buffer1 = buffer2 = NULL;
	if (verbose)
		printf("Counting files... ");
	indexfilenum1 = getindexfilenum(fp1);
	indexfilenum2 = getindexfilenum(fp2);
	/* Upper bound; addindexfilelist() decrements it per duplicate. */
	totalfiles = indexfilenum1 + indexfilenum2;
	if (verbose) {
		printf("%d files.\n", indexfilenum1 + indexfilenum2);
		printf("Reading stopwords...");
	/* NOTE(review): limit1/limit2 and fileinfo1/fileinfo2 are read from
	** the same global offsets[] with nothing visible in between that
	** reloads it for the second file - presumably such calls were
	** lost; confirm. */
	limit1 = offsets[STOPWORDPOS];
	fileinfo1 = offsets[FILELISTPOS];
	limit2 = offsets[STOPWORDPOS];
	fileinfo2 = offsets[FILELISTPOS];

	if (verbose)
		printf("\nReading file info...");
	/* File numbers of the first index are used as they are... */
	fseek(fp1, fileinfo1, 0);
	for (i = 1; i <= indexfilenum1; i++) {
		fgets(line, MAXSTRLEN, fp1);
		addindexfilelist(i, line, &totalfiles);
	/* ...those of the second index are shifted past the first's. */
	fseek(fp2, fileinfo2, 0);
	for (i = 1; i <= indexfilenum2; i++) {
		fgets(line, MAXSTRLEN, fp2);
		addindexfilelist(i + indexfilenum1, line, &totalfiles);

	/* NOTE(review): the call that this format string belonged to is
	** missing. */
	if ((fp3 = fopen(outfile, "w")) == NULL) {
		"Couldn't write the merged index file \"%s\".",

	if (verbose)
		printf("\nMerging words... ");

	printheader(fp3, outfile, 0, totalfiles);

	/* Remember where the offset table starts; it is rewritten at the
	** end of this function once the real offsets are known. */
        offsetstart = ftell(fp3);
        for (i = 0; i < MAXCHARS; i++)
                fprintf(fp3, "%016li", offsets[i]);
        fputc('\n', fp3);


	for (i = 0; i < MAXCHARS; i++)
		offsets[i] = 0;

	skipwords = 0;
	/* The zipper: buffer1/buffer2 being NULL means the corresponding
	** side needs a fresh entry from its file.
	** NOTE(review): the statements that should follow the two
	** "== NULL" tests (presumably the loop exit) are missing. */
	while (1) {
		if (buffer1 == NULL) {
			ip1 = (struct indexentry *) readindexline(fp1, limit1);
			if (ip1 == NULL)
			buffer1 = ip1;
		if (buffer2 == NULL) {
			ip2 = (struct indexentry *) readindexline(fp2, limit2);
			if (ip2 == NULL)
			/* Shift file2's file numbers past file1's. */
			addfilenums(ip2, indexfilenum1);
			buffer2 = ip2;
		result = wordcompare(ip1->word, ip2->word);
		if (!result) {
			/* Same word in both indexes: print one combined entry. */
			ip3 = (struct indexentry *) mergeindexentries(ip1, ip2);
			printindexentry(ip3, fp3);
			buffer1 = buffer2 = NULL;
		else if (result < 0) {
			printindexentry(ip1, fp3);
			buffer1 = NULL;
		else {
			printindexentry(ip2, fp3);
			buffer2 = NULL;

	/* NOTE(review): skipwords is never changed after being set to 0
	** in the visible code. */
	if (verbose) {
		if (skipwords)
			printf("%d redundant word%s.", skipwords,
			(skipwords == 1) ? "" : "s");
			printf("no redundant words.");


	if (verbose)
		printf("\nMerging file info... ");
	/* NOTE(review): fp3 is reassigned here with no visible fclose() of
	** the stream opened for writing above - confirm one was lost. */
	fp3 = fopen(outfile, "a+");

	offsets[FILELISTPOS] = ftell(fp3);
	/* j is the next merged file number; a file is written only when
	** its mapped number is exactly that. */
	for (i = j = 1; i <= indexfilenum1 + indexfilenum2; i++)
		if (getmap(i) == j) {
			addtofilehashlist(j++ - 1, ftell(fp3));
			fprintf(fp3, "%s", lookupindexfilenum(i));

	skipfiles = (indexfilenum1 + indexfilenum2) - totalfiles;
	if (verbose) {
		if (skipfiles)
			printf("%d redundant file%s.", skipfiles,
			(skipfiles == 1) ? "" : "s");
			printf("no redundant files.");


	/* Go back and overwrite the offset table written earlier. */
	fseek(fp3, offsetstart, 0);
        for (i = 0; i < MAXCHARS; i++)
                fprintf(fp3, "%016li", offsets[i]);


	/* NOTE(review): the statement governed by this if, and the end of
	** the function, are missing. */
	if (verbose)

/* Gets the number of files in an index file.

int getindexfilenum(fp)
     FILE *fp;
	int i;
	char line[MAXSTRLEN];

	fseek(fp, offsets[FILELISTPOS], 0);

	i = 0;
	while(ftell(fp) != offsets[FILEOFFSETPOS]) {
		fgets(line, MAXSTRLEN, fp);

	return i;

/* This adds an offset to the file numbers in a particular
** result list. For instance, file 1 has file numbers going from
** 1 to 10, but so does file 2, so I have to add 10 to all the
** file numbers in file 2 before merging.

void addfilenums(ip, num)
     struct indexentry *ip;
     int num;
	struct result *rp;

	rp = ip->result;
	while (rp != NULL) {
		rp->filenum =
		encodefilenum(getmap(decodefilenum(rp->filenum) + num));
		rp = rp->next;

/* This reads the next line in the index file and puts the results
** in a result structure.

struct indexentry *readindexline(fp, limit)
     FILE *fp;
     long limit;
        int i, c, x, countnum, rank, filenum, structure;
        char fileword[MAXWORDLEN];
        struct result *rp;
	struct indexentry *ip;

        rp = NULL;

	if (limit == ftell(fp))
		return NULL;
        for (i = 0; (c = fgetc(fp)) != 0; ) {
                if (c == ':') {
                        fileword[i] = '\0';
                        fileword[i++] = c;

        countnum = 1;

        ungetc(c, fp);
        while ((c = fgetc(fp)) != 0) {
                x = 0;
                do {
                        c = fgetc(fp);
                        if (c == 0)
                        x *= 128;
                        x += c & 127;
                } while (c & 128);
		if (c == 0)
                if (x) {
                        if (countnum == 1) {
                                filenum = x;
                        else if (countnum == 2) {
                                rank = x;
                        else if (countnum == 3) {
                                structure = x;
                                rp = (struct result *)
                                addtoresultlist(rp, filenum,
				rank, structure);
				countnum = 1;

	ip = (struct indexentry *) emalloc(sizeof(struct indexentry));
	ip->word = (char *) mystrdup(fileword);
	ip->result = rp;

        return ip;

/* This puts all the file info into a hash table so that it can
** be looked up by its pathname and filenumber. This is how
** we find redundant file information.

void addindexfilelist(num, info, totalfiles)
     int num;
     char *info;
     int *totalfiles;
	int i;
	static int j;
	unsigned hashval;
	char tmpstr[MAXSTRLEN], path[MAXSTRLEN];
	struct indexfileinfo *ip1, *ip2;

	strcpy(path, extractpath(info));

	i = lookupindexfilepath(path);
	if (i != -1) {
		*totalfiles = *totalfiles - 1;
		remap(num, i);

	remap(num, j + 1);

	ip1 = (struct indexfileinfo *) emalloc(sizeof(struct indexfileinfo));
	ip1->filenum = num;
	ip1->fileinfo = (char *) mystrdup(info);
	ip1->path = (char *) mystrdup(path);

	sprintf(tmpstr, "%d", num);
	hashval = bighash(tmpstr);
	ip1->next = indexfilehashlist[hashval];
	indexfilehashlist[hashval] = ip1;

	ip2 = (struct indexfileinfo *) emalloc(sizeof(struct indexfileinfo));
	ip2->filenum = num;
	ip2->fileinfo = (char *) mystrdup(info);
	ip2->path = (char *) mystrdup(path);

	hashval = bighash(path);
	ip2->next = indexfilehashlist[hashval];
	indexfilehashlist[hashval] = ip2;

/* This extracts the pathname information from the file information
** line as stored in the index file.

char *extractpath(s)
     char *s;
	int i;
	static char path[MAXSTRLEN];

	for (i = 0; s[i] && s[i] != '\"'; i++)
		path[i] = s[i];
	path[i - 1] = '\0';
	path[i] = '\0';

	return path;

/* This returns the file information corresponding to a file number.

char *lookupindexfilenum(num)
     int num;
	unsigned hashval;
	char tmpstr[MAXSTRLEN];
	struct indexfileinfo *ip;

	sprintf(tmpstr, "%d", num);
	hashval = bighash(tmpstr);
	ip = indexfilehashlist[hashval];

	while (ip != NULL) {
		if (ip->filenum == num)
			return ip->fileinfo;
		ip = ip->next;
	return NULL;

/* This returns the file number corresponding to a pathname.

int lookupindexfilepath(path)
     char *path;
	unsigned hashval;
	struct indexfileinfo *ip;

	hashval = bighash(path);
	ip = indexfilehashlist[hashval];

	while (ip != NULL) {
		if (!strcmp(ip->path, path))
			return ip->filenum;
		ip = ip->next;
	return -1;

/* This simply concatenates two information lists that correspond
** to a word found in both index files.

struct indexentry *mergeindexentries(ip1, ip2)
     struct indexentry *ip1;
     struct indexentry *ip2;
	struct result *newrp, *rp1, *rp2;
	struct indexentry *ep;

	rp1 = ip1->result;
	rp2 = ip2->result;
	newrp = NULL;

	while (rp1 != NULL) {
		newrp = (struct result *) addtoresultlist(newrp,
		rp1->filenum, rp1->rank, rp1->structure);
		rp1 = rp1->next;
	while (rp2 != NULL) {
		newrp = (struct result *) addtoresultlist(newrp,
		rp2->filenum, rp2->rank, rp2->structure);
		rp2 = rp2->next;

	ep = (struct indexentry *) emalloc(sizeof(struct indexentry));
	ep->word = (char *) mystrdup(ip1->word);
	ep->result = newrp;

	return ep;

/* This prints a new word entry into the merged index file,
** removing redundant file information as it goes along.

void printindexentry(ip, fp)
     struct indexentry *ip;
     FILE *fp;
	int i, num;
	struct result *rp;

	for (i = 0; indexchars[i] != '\0'; i++)
		if ((ip->word)[0] == indexchars[i] && !offsets[i])
			offsets[i] = ftell(fp);
	fprintf(fp, "%s:", ip->word);
	rp = ip->result;
	while (rp != NULL) {
		num = rp->filenum;
		if (!ismarked(num)) {
			compress(num, fp);
			compress(rp->rank, fp);
			compress(rp->structure, fp);
		rp = rp->next;
	fputc(0, fp);

/* This associates a number with a new number.
** This function is used to remap file numbers from index
** files to a new merged index file.

void remap(oldnum, newnum)
     int oldnum;
     int newnum;
	unsigned hashval;
	char tmpstr[MAXSTRLEN];
	struct mapentry *mp;

	mp = (struct mapentry *) emalloc(sizeof(struct mapentry));
	mp->oldnum = oldnum;
	mp->newnum = newnum;

	sprintf(tmpstr, "%d", oldnum);
	hashval = bighash(tmpstr);
	mp->next = mapentrylist[hashval];
	mapentrylist[hashval] = mp;

/* This retrieves the number associated with another.

int getmap(num)
     int num;
	unsigned hashval;
	char tmpstr[MAXSTRLEN];
	struct mapentry *mp;

	sprintf(tmpstr, "%d", num);
	hashval = bighash(tmpstr);
	mp = mapentrylist[hashval];

	while (mp != NULL) {
		if (mp->oldnum == num)
			return mp->newnum;
		mp = mp->next;
	return num;

/* This marks a number as having been printed.

void marknum(num)
     int num;
	unsigned hashval;
	char tmpstr[MAXSTRLEN];
	struct markentry *mp;

	mp = (struct markentry *) emalloc(sizeof(struct markentry));
	mp->num = num;

	sprintf(tmpstr, "%d", num);
	hashval = bighash(tmpstr);
	mp->next = markentrylist[hashval];
	markentrylist[hashval] = mp;

/* Has a number been printed?

int ismarked(num)
     int num;
	unsigned hashval;
	char tmpstr[MAXSTRLEN];
	struct markentry *mp;

	sprintf(tmpstr, "%d", num);
	hashval = bighash(tmpstr);
	mp = markentrylist[hashval];

	while (mp != NULL) {
		if (mp->num == num)
			return 1;
		mp = mp->next;
	return 0;

/* Initialize the marking list.

void initmarkentrylist()
	int i;
	struct markentry *mp;

	for (i = 0; i < BIGHASHSIZE; i++) {
		mp = markentrylist[i];
		if (mp != NULL)
		markentrylist[i] = NULL;

/* Initialize the main file list.

void initindexfilehashlist()
	int i;
	struct indexfileinfo *ip;

	for (i = 0; i < BIGHASHSIZE; i++) {
		ip = indexfilehashlist[i];
		if (ip != NULL)
		indexfilehashlist[i] = NULL;

/* Frees up used index entries, my best attempt at memory management...
** I still have bytes leaking elsewhere...

void freeindexentry(ip)
     struct indexentry *ip;
	struct result *rp, *oldp;

	rp = ip->result;
	while (rp != NULL) {
		oldp = rp;
		rp = rp->next;

/* Translates a file number into something that can be compressed.
**
** Returns the num-th positive integer that is not a multiple of 128
** (1..127 map to themselves, 128 maps to 129, 255 maps to 257, ...).
** Zero and negative numbers yield 0.
**
** NOTE(review): the loop body was restored by hand from a damaged
** copy - confirm against a pristine source. The loop condition is
** "i < num" rather than "i != num" so that a negative argument
** returns at once instead of never terminating.
*/

int encodefilenum(num)
     int num;
{
	int i, j;

	for (i = j = 0; i < num; i++) {
		j++;
		if (!(j % 128))
			j++;
	}
	return j;
}

/* Translates a compressed file number into a correct file number.
**
** Subtracts from num the count of multiples of 128 that lie strictly
** below it (129 gives 128, 257 gives 255). Numbers below 129 are
** returned unchanged.
**
** NOTE(review): the loop body was restored by hand from a damaged
** copy - confirm against a pristine source.
*/

int decodefilenum(num)
     int num;
{
	int i, extra;

	for (i = 1, extra = 0; i < num; i++) {
		if (!(i % 128))
			extra++;
	}
	num -= extra;

	return num;
}

/* These are the contents of the former NiCE NeXT User Group NeXTSTEP/OpenStep software archive, currently hosted by Netfuture.ch. */