ftp.nice.ch/peanuts/GeneralData/Documents/literature/RecipeIndex.0.94.NI.bs.tar.gz#/RecipeIndex/RecipeFilter.m

This is RecipeFilter.m in view mode; [Download] [Up]

/* RecipeFilter.m -- Recipe filtering and description services for DL indexing
	of usenet Recipe articles.
	The following two services are implemented.

[1] RecipeDescribe Service:
	Describes Recipe articles.
	This can be modified quite easily.
	This description service is advertised via the following specification in
	~/Library/Services/RecipeIndexing.service/services (or /LocalLibrary...) file.
		Filter: RecipeDescribe
		Port: IXRECIPEFILTER
		Send Type: NXTypedFileContentsPboardType:nr
		Return Type: IXFileDescriptionPboardType
		Executable: RecipeFilter.daemon

[2] RecipeFilter Service:
	Filters junk out of news articles, so junk will not be indexed.
	"Junk" currently is defined as:
	    Path: line.
	    Nntp-posting-host: line.
	    Lines whose first word is 61 characters long and starts with 'M'
	    (taken to be uuencoded data).

	This filtering service is advertised via the following specification in
	~/Library/Services/RecipeIndexing.service/services (or /LocalLibrary...) file.
		Filter: RecipeFilter
		Port: IXRECIPEFILTER
		Send Type: NXTypedFileContentsPboardType:nr
		Return Type: NXAsciiPboardType
		Executable: RecipeFilter.daemon

	The advantage of this service daemon scheme over a Unix stdio filter
	(invoked via the NXUNIXSTDIO port) is that the daemon-based filter is
	started only once per DL indexing session, whereas a stdio filter
	is started again for every article indexed.

	Daemons can keep running indefinitely, but this one quits after some
	duration of inactivity.

	No Copyright is claimed.
	This program is hereby released into the public domain.

	Izumi Ohzawa, izumi@pinoko.berkeley.edu.

------- Revision History -----------------------------------------------------
94-01-23
	Version 0.9 of NewsDescribe.   Izumi Ohzawa, izumi@pinoko.berkeley.edu
	Version 0.91, 0.92.
94-01-29
	Version 0.93.  Izumi.
		Renamed to NewsFilter.
		Now filters out junk from news articles via new filter service:
		NewsFilter to skip Path: and Nntp-posting-host: etc. and
		NewsGrazer's and other uuencoded junk.
94-04-27
	Version 0.94.  Izumi.
		Renamed to RecipeFilter to handle nroff format recipes
		posted as news articles.
-------------------------------------------------------------------------------
*/


#import <appkit/appkit.h>
#import <indexing/indexing.h>

#import <stdio.h>
#import <stdlib.h>
#import <strings.h>
#import <time.h>

// Uncomment the following during debugging.  See Console output.
// #define MYDEBUG 1

// Version banner printed once at startup (see main()).
static char *version = "[V0.94, Izumi Ohzawa, izumi@pinoko.berkeley.edu]";

// Inactivity timeout.  All values are in seconds.
static float  timeoutvalue = 600.0;	// idle period after which the daemon quits
static float  interval     = 30.0;	// period of the idle check
static time_t timelast;			// time of the most recent service request

// Scratch buffers shared by both service methods.
static char linebuf[8192];		// current line / assembled description
static char subject[1024];		// first quoted field of the .RZ line
static char strbuf1[1024];		// first word of a line / second .RZ field

// Dummy inputs handed to sgets() to make it forget its previous input.
static char *reset1 = "reset1";
static char *reset2 = "reset2";

// Pasteboard type accepted by both services.
// It must match the "Send Type:" entry in the "services" file.
char *InPBType = "NXTypedFileContentsPboardType:nr";


// Service provider.  main() installs an instance of this class as the
// services delegate of the Listener; it implements the two filter services
// and keeps the timed entry used for the idle-timeout check.
@interface Provider : Object
{
    DPSTimedEntry	timer;		// timed entry whose handler sends -step
}
- (id)init;
- (id)step;
- (id)RecipeDescribe:(id)pbid userData:(const char *)udata error:(char **)errmsg;
- (id)RecipeFilter:(id)pbid userData:(const char *)udata error:(char **)errmsg;
@end



@implementation Provider

// sgets -- fgets()-like line reader for an in-memory, NUL-terminated string.
//
// Copies the next line of inbuf (without its '\n') into outbuf, appends one
// '\n' and a terminating NUL, and returns outbuf.  An empty line is returned
// as "\n", so callers can detect it with strlen(outbuf) <= 1.
// Returns NULL when inbuf has no more lines, or when inbuf is NULL.
//
// The read position is kept in static variables keyed on the inbuf pointer:
// passing the same pointer continues where the previous call stopped, while
// passing a different pointer starts at the beginning of that string.
//
// BUGS: to re-read a buffer (or to read a new buffer that happens to live at
// the same address) the caller must first call sgets() once with some other
// string.  outbuf must have room for the longest line plus two bytes; its
// size is not passed in, so no bounds check can be made here.
//
char *sgets(char *outbuf, char *inbuf)
{
static char *lastinbuf=NULL;
static char *cptr;
static char *endbuf;	// points to the NUL char at the end of inbuf.
char *dst;

    if(inbuf == NULL) {
	// Nothing to read.  Also forget the previous input.
	lastinbuf = NULL;
	cptr = NULL;
	return(NULL);
    }
    if(lastinbuf != inbuf) {
	// input string changed, so update current positions.
	cptr = inbuf;
	lastinbuf = inbuf;
	endbuf = inbuf + strlen(inbuf);
    }
    if(cptr == NULL || cptr >= endbuf) {
	// Reached end of inbuf.
	return(NULL);
    }

    // Copy up to, but not including, the next '\n'.  Copying by hand (rather
    // than with sscanf "%[^\n]") always writes outbuf, even for an empty
    // line, so text from the previous call can never be returned again.
    dst = outbuf;
    while(cptr < endbuf && *cptr != '\n')
	*dst++ = *cptr++;
    *dst++ = '\n';		// every returned line ends in exactly one '\n'
    *dst = '\0';

    if(cptr < endbuf)
	cptr++;			// step over the '\n' to the next line.
    return(outbuf);
}


// Timed-entry handler: forwards each tick to the object that was passed as
// user data when the timed entry was added, by sending it -step.
void runOneStep(DPSTimedEntry timedEntry, double timeNow, void *TEobject)
{
id receiver = (id)TEobject;

    [receiver step];
}


// Initializes the provider: records the current time as the time of the
// last request and adds the timed entry that calls runOneStep() (and thus
// -step) every `interval' seconds.
// Returns self, or nil if the superclass initializer returns nil.
- init
{
    // Let the superclass initialize first and honor what it returns,
    // before any of our own state is set up.
    self = [super init];
    if(self == nil)
	return nil;

    time(&timelast);		// initialize no-work timer.
    timer = DPSAddTimedEntry(interval, &runOneStep, self, NX_RUNMODALTHRESHOLD);
    return self;
}

// Idle check, sent on every tick of the timed entry.  When no service
// request has refreshed timelast for at least timeoutvalue seconds, removes
// the timed entry, reports the reason on stderr and terminates the process
// with status 2.  Otherwise does nothing.  Returns self.
- step
{
time_t now;

    time(&now);
    if( (now - timelast) < timeoutvalue )
	return self;		// still within the allowed idle period

    DPSRemoveTimedEntry (timer);
    fprintf(stderr,
	    "RecipeFilter: No requests received for %.1f seconds. Quitting: %s",
	    timeoutvalue, ctime(&now));
    _exit(2);
    return self;		// not reached
}

// Keywords recognized by -RecipeDescribe:userData:error:.  The index of a
// keyword in this table is the case label used there when a line starts
// with that keyword.  The trailing "" entry only marks the end of the list;
// the lookup loop stops after NDESCKEY entries and never compares it.
#define NDESCKEY	1	/* number of real keywords in desckwords[] */
static char *desckwords[] = {
    ".RZ",			/* 0 */
    ""
};

// RecipeDescribe service method: builds the one-line description of a
// recipe article.
//
// The article text is read from pasteboard `pbid' (type InPBType) and
// scanned line by line for the first line whose first word is ".RZ"
// (compared without regard to case).  The first two double-quoted fields of
// that line are combined as
//	<field 1> - <field 2>\n
// and written back to the pasteboard as IXFileDescriptionPboardType.
// A missing or empty field is left empty, so an article without a usable
// .RZ line is described as " - ".
//
// Blank lines are skipped rather than ending the scan.
// NOTE(review): presumably the .RZ line sits in the article body, i.e.
// after the blank line that ends the news header -- confirm against real
// articles.
//
// udata is not used.  When the pasteboard holds no usable data, *errmsg is
// set to a static message and nothing is written.  Returns self.
//
- RecipeDescribe:(id)pbid userData:(const char *)udata error:(char **)errmsg
{
char *Recipedata = NULL;	// stays NULL if the pasteboard read fails
int  length = 0;
int  prmnumber, i;
int  subjectDone;
char *ptr;

	time(&timelast);		// refresh the no-work timer.

	if(![pbid findAvailableTypeFrom:&InPBType num:1]) {
	    *errmsg = "No good PBoard type.";
	    return self;
	}

	// We have correct PB type.
	[pbid readType:InPBType data:&Recipedata length:&length];
	if(Recipedata == NULL || length == 0) {
	    *errmsg = "No actual data found on pasteboard.";
	    if(Recipedata != NULL)
		[pbid deallocatePasteboardData:Recipedata length:length];
	    return self;
	}

	// PB has some actual data in it.
	subject[0] = '\0';		// clear strings
	sgets(linebuf, reset1);		// make sgets() forget its previous input
	subjectDone = 0;

	// Read one line at a time until the .RZ line has been handled.
	while(!subjectDone) {
	    // Start each read from an empty buffer, so that an empty input
	    // line always comes back as just "\n", whatever the previous
	    // line held.
	    linebuf[0] = '\0';
	    if(sgets(linebuf, Recipedata) == NULL)
		break;

	    // First word of the line; the width keeps it inside strbuf1.
	    strbuf1[0] = '\0';
	    sscanf(linebuf, "%1023s", strbuf1);

	    prmnumber = -1;
	    for(i=0; i<NDESCKEY; i++) {
		if( strcasecmp( strbuf1, desckwords[i]) == 0 ) {
		    prmnumber = i;
		    break;
		}
	    }

	    switch(prmnumber) {
		case 0:	/* .RZ "field 1" "field 2" */
		    // strbuf1 still holds the keyword; clear it so an empty
		    // or missing field 2 yields an empty string.
		    strbuf1[0] = '\0';
		    ptr = index(linebuf, '"');		// opening " of field 1
		    if(ptr != NULL) {
			sscanf(++ptr, "%1023[^\"]", subject);
			ptr = index(ptr, '"');		// closing " of field 1
		    }
		    if(ptr != NULL)
			ptr = index(++ptr, '"');	// opening " of field 2
		    if(ptr != NULL)
			sscanf(++ptr, "%1023[^\"]", strbuf1);
		    subjectDone = 1;
		    break;

		default:	/* not a keyword line */
		    break;
	    }
	}

	// Without a .RZ line strbuf1 holds the first word of the last line
	// read; that must not leak into the description.
	if(!subjectDone)
	    strbuf1[0] = '\0';

	sprintf(linebuf, "%s - %s\n", subject, strbuf1);
	// Send the description back on pasteboard.
	[pbid declareTypes:&IXFileDescriptionPboardType num:1 owner:self];
	[pbid writeType:IXFileDescriptionPboardType data:linebuf
			   length:strlen(linebuf)];
#ifdef MYDEBUG
	fprintf(stderr, "%s", linebuf);
#endif

	[pbid deallocatePasteboardData:Recipedata length:length];
	return self;
}


// The skip words below (and the done strings that follow) are compiled in.
// Ideally they would be read from a word list file kept in the
// ~/Library/Services/RecipeIndexing.service directory (or /LocalLibrary...).
// The path to this directory should be obtainable via argv[0] in main().
// I am not going to do this, so if there are any takers, please!
//
// Line skip words: a line whose first word equals one of these (ignoring
// case) is dropped from the filtered text.
// The trailing "" entry only marks the end of the list.
#define NLSKIP	2	/* number of real entries in lskipwords[] */
static char *lskipwords[] = {
    "Path:",
    "Nntp-Posting-Host:",
    ""
};

// Done strings: a line that begins with donestring[i] (only the first
// donecount[i] characters are compared, ignoring case) ends the filtered
// text at that line.
// NOTE(review): NDONE is 0, so neither table is consulted at present.  To
// activate an entry, fill in its string, set the matching donecount[] to
// the length of that string (the 29 below is paired with an empty string),
// and raise NDONE.
// Perhaps these should be combined into one array of {string, length} pairs.
#define NDONE	0	/* number of active entries in donestring[] */
static char *donestring[] = {
    "",
    ""
};
// Number of leading characters of each donestring[] entry to compare.
static int donecount[] = {
    29,
    0
};

// RecipeFilter service method: removes junk from the article text before
// it is passed to the indexing scanner.
//
// The full article is read from pasteboard `pbid' (type InPBType,
// "NXTypedFileContentsPboardType:nr").  Each line is then either dropped or
// copied to a memory stream:
//   - a line whose first word is 61 characters long and starts with 'M' is
//     dropped (taken to be uuencoded data);
//   - a line whose first word matches an entry of lskipwords[] (ignoring
//     case) is dropped;
//   - a line that begins with an active donestring[] entry ends the output,
//     dropping that line and everything after it;
//   - every other line is copied unchanged.
// The collected text is written back to the pasteboard as NXAsciiPboardType.
//
// The header comment of this file states that this service is invoked
// before -RecipeDescribe:userData:error:.
//
// udata is not used.  When the pasteboard holds no usable data, *errmsg is
// set to a static message and nothing is written.  Returns self.
//
- RecipeFilter:(id)pbid userData:(const char *)udata error:(char **)errmsg
{
NXStream *outStream;
char *outBuffer;
int  outLen, maxLen;
char *Recipedata = NULL;	// stays NULL if the pasteboard read fails
int  length = 0;
int  prmnumber, i;
int  Done;

	time(&timelast);		// refresh the no-work timer.

	if(![pbid findAvailableTypeFrom:&InPBType num:1]) {
	    *errmsg = "No good PBoard type.";
	    return self;
	}

	// We have correct PB type.
	[pbid readType:InPBType data:&Recipedata length:&length];
	if(Recipedata == NULL || length == 0) {
	    *errmsg = "No actual data found on pasteboard.";
	    if(Recipedata != NULL)
		[pbid deallocatePasteboardData:Recipedata length:length];
	    return self;
	}

	// PB has some actual data in it.
	outStream = NXOpenMemory(NULL, 0, NX_WRITEONLY);  // create memory stream
	sgets(linebuf, reset2);		// make sgets() forget its previous input
	Done = 0;

	// Read one line at a time and decide whether to keep it.
	while(!Done) {
	    // Start each read from an empty buffer, so that an empty input
	    // line always comes back as just "\n", whatever the previous
	    // line held.
	    linebuf[0] = '\0';
	    if(sgets(linebuf, Recipedata) == NULL)
		break;

	    // First word of the line; the width keeps it inside strbuf1.
	    strbuf1[0] = '\0';
	    sscanf(linebuf, "%1023s", strbuf1);

	    if( (strlen(strbuf1) == 61) && (strbuf1[0] == 'M')) {
#ifdef MYDEBUG
		// Debug: print 'u' for each line of uuencoded junk removed.
		fputc('u', stderr);
#endif
		continue;	/* one word of 61 chars must be uuencoded stuff */
	    }

	    // See if the first word asks for the whole line to be skipped.
	    prmnumber = -1;
	    for(i=0; i<NLSKIP; i++) {
		if( strcasecmp(strbuf1, lskipwords[i]) == 0 ) {
		    prmnumber = i;		// Match found
		    break;
		}
	    }
	    if(prmnumber != -1) {
#ifdef MYDEBUG
		fprintf(stderr,"  Skip line: %s", linebuf);
#endif
		continue;	// There was a match. Try next line
	    }

	    // Now, check if we want to kill the rest of article
	    prmnumber = -1;
	    for(i=0; i<NDONE; i++) {
		if( strncasecmp( linebuf, donestring[i], donecount[i]) == 0 ) {
		    prmnumber = i;
		    break;
		}
	    }
	    if(prmnumber != -1) {
		// There was a match to donestring[].
#ifdef MYDEBUG
		fprintf(stderr,"  Kill to EOF: %s", linebuf);
#endif
		Done = 1;	// Raise DONE flag to kill article to EOF.
		continue;	// the loop condition ends the scan
	    }

	    // We want this line. So write to stream.
	    NXPrintf(outStream, "%s", linebuf);
	}

	NXFlush(outStream);
	NXGetMemoryBuffer(outStream, &outBuffer, &outLen, &maxLen);
	// Send the filtered text back on pasteboard.
	[pbid declareTypes:&NXAsciiPboardType num:1 owner:self];
	[pbid writeType:NXAsciiPboardType data:outBuffer length:outLen];
	NXCloseMemory(outStream, NX_FREEBUFFER);

	[pbid deallocatePasteboardData:Recipedata length:length];
	return self;
}

@end


// Program entry point.  Logs a start-up banner on stderr, creates a
// Listener whose services delegate is a new Provider, checks the Listener
// in under the port name IXRECIPEFILTER (the "Port:" entry of the
// "services" file), adds its port and sends +run to Listener.
// Exits with status 1, after logging the reason, if the check-in fails.
int main(int argc, char *argv[])
{
id lid;
#ifdef MYDEBUG
int i;
#endif

    time(&timelast);
    fprintf(stderr,
	"RecipeFilter.daemon for DLibrarian indexing started: %s %s\n",
			ctime(&timelast), version);

#ifdef MYDEBUG
    for(i=0; i<argc; i++)
	fprintf(stderr, "argv[%d]: %s\n", i, argv[i]);
#endif

    lid = [[Listener alloc] init];
    [lid setServicesDelegate:[[Provider alloc] init]];
    if([lid checkInAs:"IXRECIPEFILTER"]) {
	// uh-oh!  Say why we are leaving instead of vanishing silently.
	fprintf(stderr,
	    "RecipeFilter: cannot check in port IXRECIPEFILTER. Quitting.\n");
	_exit(1);
    }
    [lid addPort];
    [Listener run];
    return 0;
}

These are the contents of the former NiCE NeXT User Group NeXTSTEP/OpenStep software archive, currently hosted by Netfuture.ch.