This is RecipeFilter.m in view mode; [Download] [Up]
/*
 * RecipeFilter.m -- Recipe filtering and description services for DL
 *                   indexing of usenet Recipe articles.
 *
 * The following two services are implemented.
 *
 * [1] RecipeDescribe Service:
 *     Describes Recipe articles.  This can be modified quite easily.
 *     This description service is advertised via the following
 *     specification in the
 *     ~/Library/Services/RecipeIndexing.service/services
 *     (or /LocalLibrary...) file.
 *
 *         Filter:      RecipeDescribe
 *         Port:        IXRECIPEFILTER
 *         Send Type:   NXTypedFileContentsPboardType:nr
 *         Return Type: IXFileDescriptionPboardType
 *         Executable:  RecipeFilter.daemon
 *
 * [2] RecipeFilter Service:
 *     Filters junk out of news articles, so junk will not be indexed.
 *     "Junk" currently is defined as:
 *         Path: line.
 *         Nntp-posting-host: line.
 *     This filtering service is advertised via the following
 *     specification in the
 *     ~/Library/Services/RecipeIndexing.service/services
 *     (or /LocalLibrary...) file.
 *
 *         Filter:      RecipeFilter
 *         Port:        IXRECIPEFILTER
 *         Send Type:   NXTypedFileContentsPboardType:nr
 *         Return Type: NXAsciiPboardType
 *         Executable:  RecipeFilter.daemon
 *
 * Advantage of this service daemon scheme over the Unix stdio filter
 * (invoked via NXUNIXSTDIO port) is that the daemon based filter is
 * invoked only once per DL indexing session, unlike stdio filter which
 * is invoked for every article indexed.
 * Daemons can keep running indefinitely, but this one quits after some
 * duration of inactivity.
 *
 * No Copyright is claimed.
 * This program is hereby released into the public domain.
 * Izumi Ohzawa, izumi@pinoko.berkeley.edu.
 *
 * ------- Revision History --------------------------------------------------
 * 94-01-23  Version 0.9 of NewsDescribe.
 *           Izumi Ohzawa, izumi@pinoko.berkeley.edu
 *           Version 0.91, 0.92.
 * 94-01-29  Version 0.93.  Izumi.
 *           Renamed to NewsFilter.  Now filters out junk from news articles
 *           via new filter service: NewsFilter to skip Path: and
 *           Nntp-posting-host: etc. and NewsGrazer's and other uuencoded
 *           junk.
 * 94-04-27  Version 0.94.  Izumi.
 *           Renamed to RecipeFilter to handle nroff format recipes posted
 *           as news articles.
 * ---------------------------------------------------------------------------
 */

#import <appkit/appkit.h>
#import <indexing/indexing.h>
#import <stdio.h>
#import <stdlib.h>
#import <strings.h>
#import <time.h>

// Uncomment the following during debugging.  See Console output.
// #define MYDEBUG 1

static char *version = "[V0.94, Izumi Ohzawa, izumi@pinoko.berkeley.edu]";

// Timeout stuff
static float timeoutvalue = 600.0;  // wait this long before quitting.
static float interval = 30.0;       // timeout check interval
static time_t timelast;             // last time (in seconds) that a service was called.

// String buffers.
#define LINEBUFSIZE 8192            /* size of linebuf; sgets() never writes more */
#define WORDBUFSIZE 1024            /* size of subject and strbuf1 */
#define WORDSCANFMT "%1023s"        /* field width must stay WORDBUFSIZE - 1 */

static char linebuf[LINEBUFSIZE];
static char subject[WORDBUFSIZE];
static char strbuf1[WORDBUFSIZE];

// Dummy input strings handed to sgets() only to force it to forget its
// position in the previously scanned text.
static char *reset1 = "reset1";
static char *reset2 = "reset2";

// The following must match the spec in "services" file.
char *InPBType = "NXTypedFileContentsPboardType:nr";

@interface Provider:Object
{
    DPSTimedEntry timer;
}
- init;
- step;
- RecipeDescribe:(id)pbid userData:(const char *)udata error:(char **)errmsg;
- RecipeFilter:(id)pbid userData:(const char *)udata error:(char **)errmsg;
@end

// Returns the next line of inbuf in outbuf, always terminated by '\n',
// and remembers the read position between calls.  Similar to fgets(),
// except that it reads from a multi-line string instead of a FILE.
// Returns NULL when inbuf is NULL or no lines are left.
//
// outbuf must have room for LINEBUFSIZE bytes.  A line that does not fit
// is handed back in pieces: every piece but the last comes back WITHOUT a
// trailing '\n', and the next call continues where the piece ended.
//
// The read position is keyed on the inbuf POINTER, not on its contents.
// To rescan a string, or to scan a new string that may live at the same
// address as the previous one, first call sgets() once with some other
// string (see reset1 / reset2).
//
char *sgets(char *outbuf, char *inbuf)
{
    static char *lastinbuf = NULL;  // string the position below belongs to
    static char *cptr;              // first unread character
    static char *endbuf;            // terminating '\0' of lastinbuf
    int n;

    if(inbuf == NULL)
        return(NULL);

    if(lastinbuf != inbuf) {
        // input string changed, so start over at its beginning.
        cptr = inbuf;
        lastinbuf = inbuf;
        endbuf = inbuf + strlen(inbuf);
    }

    if(cptr == NULL || cptr >= endbuf)
        return(NULL);               // Reached end of inbuf.

    // Copy by hand rather than with sscanf("%[^\n]"): that conversion fails
    // on an empty line and leaves outbuf holding the previous line.
    n = 0;
    while(cptr < endbuf && *cptr != '\n' && n < LINEBUFSIZE - 2)
        outbuf[n++] = *cptr++;

    if(cptr < endbuf && *cptr != '\n') {
        // Buffer is full but the line is not finished: return this piece
        // without a newline and leave cptr on the unread remainder.
        outbuf[n] = '\0';
        return(outbuf);
    }

    if(cptr < endbuf)
        cptr++;                     // step over the '\n'
    outbuf[n++] = '\n';             // the last line gets one even if the input lacks it
    outbuf[n] = '\0';
    return(outbuf);
}

// Timed-entry callback; forwards each tick to the Provider passed as
// TEobject when the entry was installed.
void runOneStep(DPSTimedEntry timedEntry, double timeNow, void *TEobject)
{
    [(id)TEobject step];
}

// Stores msg through errmsg unless the caller passed no place to put it.
static void setError(char **errmsg, char *msg)
{
    if(errmsg != NULL)
        *errmsg = msg;
}

// Returns a malloc()ed copy of the first `length` bytes of data with a
// terminating '\0' appended, or NULL if malloc() fails.  Caller frees.
// NOTE(review): nothing visible in this file guarantees that pasteboard
// data is NUL-terminated, and sgets() relies on strlen(), hence the copy.
static char *terminatedCopy(const char *data, int length)
{
    char *copy;
    int i;

    copy = (char *)malloc((size_t)length + 1);
    if(copy == NULL)
        return(NULL);
    for(i = 0; i < length; i++)
        copy[i] = data[i];
    copy[length] = '\0';
    return(copy);
}

// Copies the text between the next pair of double quotes in src into dst
// (at most dstsize-1 characters, always '\0' terminated).  An unterminated
// quote is copied up to the end of the line.  dst is left empty when src
// contains no quote at all.
// Returns a pointer just past the closing quote, or NULL if there was no
// opening or no closing quote.
static char *copyQuoted(char *src, char *dst, int dstsize)
{
    char *p;
    int n = 0;

    dst[0] = '\0';
    p = index(src, '"');            // opening "
    if(p == NULL)
        return(NULL);
    p++;
    while(*p != '\0' && *p != '"' && *p != '\n') {
        if(n < dstsize - 1)
            dst[n++] = *p;
        p++;
    }
    dst[n] = '\0';
    return( (*p == '"') ? p + 1 : NULL );
}

@implementation Provider

// Starts the no-work clock and installs the timed entry that makes
// -step run every `interval` seconds.
- init
{
    [super init];
    time(&timelast);    // initialize no-work timer.
    timer = DPSAddTimedEntry(interval, &runOneStep, self, NX_RUNMODALTHRESHOLD);
    return self;
}

// Called from the timed entry.  Terminates the daemon once no service
// request has arrived for timeoutvalue seconds.
- step
{
    time_t timenow;

    time(&timenow);
    if( (timenow - timelast) >= timeoutvalue) {
        DPSRemoveTimedEntry(timer);
        fprintf(stderr, "RecipeFilter: No requests received for %.1f seconds. Quitting: %s", timeoutvalue, ctime(&timenow));
        _exit(2);
    }
    return self;
}

// Key words for Recipe description extraction.
static char *desckwords[] = {
    ".RZ",
    ""
};
#define NDESCKEY 1      /* # of names in *desckwords[] */

// The main work-horse method of the description service.
// Scans the article on the pasteboard for the first line whose first word
// is .RZ, takes the first two double-quoted strings on that line, and
// writes "<first> - <second>\n" back as IXFileDescriptionPboardType.
// Either part is empty if it could not be found.
// On failure *errmsg is set and nothing is written.
//
// The scan does not stop at blank lines.  An earlier version had a
// "stop at first blank line" test, but because sgets() mishandled empty
// lines it only ever triggered when the very first line was blank.
// NOTE(review): presumably the .RZ line sits in the article body, after
// the blank line that ends the news header -- confirm with sample data.
//
- RecipeDescribe:(id)pbid userData:(const char *)udata error:(char **)errmsg
{
    char *Recipedata = NULL;
    int length = 0;
    char *text;
    char *ptr;
    char description[WORDBUFSIZE];
    int prmnumber, i;
    int Done;

    time(&timelast);    // refresh the no-work timer.

    if(![pbid findAvailableTypeFrom:&InPBType num:1]) {
        setError(errmsg, "No good PBoard type.");
        return self;
    }

    // We have correct PB type.
    [pbid readType:InPBType data:&Recipedata length:&length];

    if(Recipedata && length) {
        // PB has some actual data in it.
        text = terminatedCopy(Recipedata, length);
        if(text == NULL) {
            setError(errmsg, "Out of memory.");
        }
        else {
            // Some initializations.
            subject[0] = '\0';
            description[0] = '\0';
            sgets(linebuf, reset1);     // make sgets() forget its old position
            Done = 0;

            // Read one line at a time until the description line is found.
            while( !Done && (sgets(linebuf, text) != NULL) ) {
                strbuf1[0] = '\0';
                sscanf(linebuf, WORDSCANFMT, strbuf1);  /* first word */

                prmnumber = -1;
                for(i=0; i<NDESCKEY; i++) {
                    if( strcasecmp(strbuf1, desckwords[i]) == 0 ) {
                        prmnumber = i;
                        break;
                    }
                }

                switch(prmnumber) {
                case 0:     /* .RZ "subject" "description" */
                    ptr = copyQuoted(linebuf, subject, sizeof(subject));
                    if(ptr != NULL)
                        copyQuoted(ptr, description, sizeof(description));
                    Done = 1;
                    break;
                default:    /* not a key word line */
                    break;
                }
            }

            // Both parts are at most WORDBUFSIZE-1 long, so this fits.
            sprintf(linebuf, "%s - %s\n", subject, description);

            // Send the description back on pasteboard.
            [pbid declareTypes:&IXFileDescriptionPboardType num:1 owner:self];
            [pbid writeType:IXFileDescriptionPboardType data:linebuf length:strlen(linebuf)];
#ifdef MYDEBUG
            fprintf(stderr, "%s", linebuf);
#endif
            free(text);
        }
    }
    else
        setError(errmsg, "No actual data found on pasteboard.");

    if(Recipedata)
        [pbid deallocatePasteboardData:Recipedata length:length];

    return self;
}

// These skip and done words are hard coded as below currently.
// Ideally, these should be modifiable via a word list file included in
// ~/Library/Services/RecipeIndexing.service directory (or /LocalLibrary...)
// Path to this directory should be obtainable via argv[0] in main().
// I am not going to do this, so if there are any takers, please!
//
// Line skip words that cause entire line to be filtered out.
static char *lskipwords[] = {
    "Path:",
    "Nntp-Posting-Host:",
    ""
};
#define NLSKIP 2        /* # of names in *lskipwords[] */

// Perhaps, I should combine these into a paired struct of string and number.
// Done strings that will flush the rest of the article from this line on.
static char *donestring[] = {
    "",
    ""
};
// Length of each of donestring above
static int donecount[] = {
    29,
    0
};
#define NDONE 0         /* # of names in *donestring[]; 0 disables the check */

// Recipe article filter service method that removes junk (see top of this
// file) from article text before it is passed to indexing scanner.
// DigitalLibrarian calls this method with full article content on the
// pasteboard of type "NXTypedFileContentsPboardType:nr".
// This method is called before the description filter service
// -RecipeDescribe:..... method is called.
//
// Dropped from the output:
//   - lines whose first word is 61 characters long and starts with 'M'
//     (taken to be uuencoded data),
//   - lines whose first word matches an entry of lskipwords[],
//   - everything from the first line matching donestring[] onwards.
// The remaining lines are written back as NXAsciiPboardType.
// On failure *errmsg is set and nothing is written.
//
- RecipeFilter:(id)pbid userData:(const char *)udata error:(char **)errmsg
{
    NXStream *outStream;
    char *outBuffer;
    int outLen, maxLen;
    char *Recipedata = NULL;
    int length = 0;
    char *text;
    int prmnumber, i;

    time(&timelast);    // refresh the no-work timer.

    if(![pbid findAvailableTypeFrom:&InPBType num:1]) {
        setError(errmsg, "No good PBoard type.");
        return self;
    }

    // We have correct PB type.
    [pbid readType:InPBType data:&Recipedata length:&length];

    if(Recipedata && length) {
        // PB has some actual data in it.
        text = terminatedCopy(Recipedata, length);
        if(text == NULL) {
            setError(errmsg, "Out of memory.");
        }
        else {
            outStream = NXOpenMemory(NULL, 0, NX_WRITEONLY);    // create memory stream
            sgets(linebuf, reset2);     // make sgets() forget its old position

            // Read one line at a time and keep the ones worth indexing.
            while( sgets(linebuf, text) != NULL ) {
                // First, see if we want to skip this line.
                strbuf1[0] = '\0';
                sscanf(linebuf, WORDSCANFMT, strbuf1);  /* first word */

                if( (strlen(strbuf1) == 61) && (strbuf1[0] == 'M') ) {
#ifdef MYDEBUG
                    // Debug: print 'u' for each line of uuencoded junk removed.
                    fputc('u', stderr);
#endif
                    continue;   /* one word of 61 chars must be uuencoded stuff */
                }

                prmnumber = -1;
                for(i=0; i<NLSKIP; i++) {
                    if( strcasecmp(strbuf1, lskipwords[i]) == 0 ) {
                        prmnumber = i;  // Match found
                        break;
                    }
                }
                if(prmnumber != -1) {
#ifdef MYDEBUG
                    fprintf(stderr, " Skip line: %s", linebuf);
#endif
                    continue;   // There was a match.  Try next line
                }

                // Now, check if we want to kill the rest of article
                prmnumber = -1;
                for(i=0; i<NDONE; i++) {
                    if( strncasecmp(linebuf, donestring[i], donecount[i]) == 0 ) {
                        prmnumber = i;
                        break;
                    }
                }
                if(prmnumber != -1) {
                    // There was a match to donestring[].
#ifdef MYDEBUG
                    fprintf(stderr, " Kill to EOF: %s", linebuf);
#endif
                    break;      // drop this line and everything after it
                }

                // We want this line.  So write to stream.
                NXPrintf(outStream, "%s", linebuf);
            }

            NXFlush(outStream);
            NXGetMemoryBuffer(outStream, &outBuffer, &outLen, &maxLen);

            // Send the filtered text back on pasteboard.
            [pbid declareTypes:&NXAsciiPboardType num:1 owner:self];
            [pbid writeType:NXAsciiPboardType data:outBuffer length:outLen];
            NXCloseMemory(outStream, NX_FREEBUFFER);
            free(text);
        }
    }
    else
        setError(errmsg, "No actual data found on pasteboard.");

    if(Recipedata)
        [pbid deallocatePasteboardData:Recipedata length:length];

    return self;
}

@end

// Daemon entry point: registers a Provider as services delegate of a
// Listener checked in on port IXRECIPEFILTER and runs the listener loop.
// Exits with status 1 if the port cannot be checked in.
int main(int argc, char *argv[])
{
    id lid;
#ifdef MYDEBUG
    int i;
#endif

    time(&timelast);
    fprintf(stderr, "RecipeFilter.daemon for DLibrarian indexing started: %s %s\n", ctime(&timelast), version);
#ifdef MYDEBUG
    for(i=0; i<argc; i++)
        fprintf(stderr, "argv[%d]: %s\n", i, argv[i]);
#endif

    lid = [[Listener alloc] init];
    [lid setServicesDelegate:[[Provider alloc] init]];
    if([lid checkInAs:"IXRECIPEFILTER"]) {
        // uh-oh!  Say why we are going away instead of vanishing silently.
        fprintf(stderr, "RecipeFilter: could not check in as IXRECIPEFILTER. Quitting.\n");
        _exit(1);
    }
    [lid addPort];
    [Listener run];
    return 0;
}
These are the contents of the former NiCE NeXT User Group NeXTSTEP/OpenStep software archive, currently hosted by Netfuture.ch.