This is HtmlFilter.m in view mode; [Download] [Up]
/*
 * HtmlFilter.m -- Html filtering and description services for DL
 *                 (Digital Librarian) indexing of Html docs.
 *
 * The following two services are implemented.
 *
 * [1] HtmlDescribe Service:
 *     Describes Html documents based currently on the text found after the
 *     <TITLE> and <H1> start tags, up to the matching </TITLE> and </H1>
 *     stop tags (see desckwords[] / desckstopwords[] below).
 *     This can be modified quite easily.
 *     (The Subject:/From:/Date: handling of the NewsIndexing daemon this
 *     file was derived from is no longer part of this file.)
 *
 *     This description service is advertised via the following specification
 *     in ~/Library/Services/HtmlIndexing.service/services
 *     (or /LocalLibrary...) file.
 *
 *         Filter:      HtmlDescribe
 *         Port:        IXHTMLFILTER
 *         Send Type:   NXTypedFileContentsPboardType:html
 *         Return Type: IXFileDescriptionPboardType
 *         Executable:  HtmlFilter.daemon
 *
 * [2] HtmlFilter Service:
 *     Filters junk out of Html documents, so junk will not be indexed.
 *     "Junk" currently is defined as any line whose first
 *     whitespace-delimited word matches (ignoring case) an entry of
 *     lskipwords[] below.  The uuencode filter and the "kill to EOF" filter
 *     inherited from NewsIndexing are still present, but commented out.
 *
 *     This filtering service is advertised via the following specification
 *     in ~/Library/Services/HtmlIndexing.service/services
 *     (or /LocalLibrary...) file.
 *
 *         Filter:      HtmlFilter
 *         Port:        IXHTMLFILTER
 *         Send Type:   NXTypedFileContentsPboardType:html
 *         Return Type: NXAsciiPboardType
 *         Executable:  HtmlFilter.daemon
 *
 * Advantage of this service daemon scheme over the Unix stdio filter
 * (invoked via NXUNIXSTDIO port) is that the daemon based filter is invoked
 * only once per DL indexing session, unlike stdio filter which is invoked
 * for every article indexed.
 * Daemons can keep running indefinitely, but this one quits after some
 * duration of inactivity.
 *
 * No Copyright is claimed.  This program is hereby released into the
 * public domain.
 *
 * Benoît Grangé [ben@fizz.fdn.org] distributed a similar daemon free of
 * charge, but no source code was included in the distribution.
 * This version has been developed from scratch by myself.
 *
 * Izumi Ohzawa, izumi@pinoko.berkeley.edu.
 *
 * ------- TO DO ----------------------------------------------------------------
 * [1] Making things configurable, e.g., what goes into the description line
 *     in what order, and what kind of lines to filter out in -HtmlFilter:...
 *     method.  These specs can be stored in a file in
 *     ~/Library/Services/HtmlIndexing.service directory (or /LocalLibrary..)
 *     This path should be in argv[0] in main() because that is where the
 *     executable lives too.
 *     I probably won't get around to doing this myself, or doing much
 *     enhancements.
 *
 * ------- Revision History -----------------------------------------------------
 * 98/11/26  Version 0.13 of HtmlDescribe.
 *           Juergen Sell js@euler.han.de
 *           Stolen from NewsIndexing, errors are mine.
 * -------------------------------------------------------------------------------
 */

#import <appkit/appkit.h>
#import <indexing/indexing.h>
#import <stdio.h>
#import <stdlib.h>
#import <strings.h>
#import <time.h>
#include <misckit.h>

// Debug output is written to stderr (see Console output) while MYDEBUG is
// defined.  Comment out the following line to silence it.
#define MYDEBUG 1

static char *version = "[V0.13, Juergen Sell js@euler.han.de]";

// Timeout stuff
static float timeoutvalue = 600.0;  // wait this long (seconds) before quitting.
static float interval = 30.0;       // timeout check interval
static time_t timelast;             // last time (in seconds) that this service is called.

// String buffers.
#define LINEBUFSIZE 8192            /* size of linebuf[]; sgets() never writes more */
#define STRBUFSIZE  1024            /* size of strbuf1[]; keep "%1023s" below in sync */
static char linebuf[LINEBUFSIZE];
static char strbuf1[STRBUFSIZE];

// Dummy input strings, used only to make sgets() forget its previous input.
static char *reset1 = "reset1";
static char *reset2 = "reset2";

// The following must match the spec in "services" file.
char *InPBType = "NXTypedFileContentsPboardType:html";

@interface Provider:Object
{
    DPSTimedEntry timer;
}
- init;
- step;
- HtmlDescribe:(id)pbid userData:(const char *)udata error:(char **)errmsg;
- HtmlFilter:(id)pbid userData:(const char *)udata error:(char **)errmsg;
@end

@implementation Provider

// Returns a line in outbuf upto '\n' from inbuf, keeps current position.
// Similar to fgets(), except it will read from another multi-line string.
// The returned line always ends in '\n', even when the last line of inbuf
// does not.  If no more lines, returns NULL.
//
// outbuf must be at least LINEBUFSIZE bytes (all callers pass linebuf).
// A line longer than LINEBUFSIZE-2 characters is handed out in several
// pieces instead of overflowing outbuf.
//
// BUGS: must reset inbuf to something else before use.  A new input is
//       detected by comparing the inbuf pointer with the one seen on the
//       previous call, so callers first call sgets() with a dummy string
//       (reset1 / reset2) before starting on a new document.
// NOTE(review): inbuf is scanned with strlen(), so it is assumed to be
//       NUL-terminated -- confirm for data read from the pasteboard.
//
char *sgets(char *outbuf, char *inbuf)
{
    static char *lastinbuf = NULL;
    static char *cptr;
    static char *endbuf;    // points to NUL char at the end of str.
    char *eol;
    char *next;
    size_t len;

    if (lastinbuf != inbuf) {
        // input string changed, so update current positions.
        cptr = inbuf;
        lastinbuf = inbuf;
        endbuf = inbuf + strlen(inbuf);
    }

    if (cptr == NULL || cptr >= endbuf) {
        // Reached end of inbuf.
        return (NULL);
    }

    // Locate the end of the current line.  The line may be empty: the
    // former sscanf("%[^\n]") matched nothing in that case and left the
    // previous line in outbuf, so blank lines repeated the line before them.
    eol = index(cptr, '\n');
    if (eol != NULL) {
        len = (size_t)(eol - cptr);
        next = eol + 1;             // point to the next line.
    } else {
        len = (size_t)(endbuf - cptr);
        next = NULL;                // last line, nothing follows.
    }

    if (len > LINEBUFSIZE - 2) {
        // Too long for outbuf: return the first piece now and continue
        // with the remainder of this line on the next call.
        len = LINEBUFSIZE - 2;
        next = cptr + len;
    }

    strncpy(outbuf, cptr, len);     // no '\n' in outbuf yet
    outbuf[len] = '\n';             // add '\n' at the end
    outbuf[len + 1] = '\0';
    cptr = next;
    return (outbuf);
}

// Timed entry handler: forwards the periodic tick to the Provider
// instance that was registered as user data in -init.
void runOneStep(DPSTimedEntry timedEntry, double timeNow, void *TEobject)
{
    [(id)TEobject step];
}

// Starts the no-work timer and installs the timed entry that calls -step
// every `interval` seconds.
- init
{
    [super init];
    time(&timelast);        // initialize no-work timer.
    timer = DPSAddTimedEntry(interval, &runOneStep, self, NX_RUNMODALTHRESHOLD);
    return self;
}

// Periodic inactivity check: quits the daemon (exit status 2) once no
// service request has been received for `timeoutvalue` seconds.
- step
{
    time_t timenow;

    time(&timenow);
    if ((timenow - timelast) >= timeoutvalue) {
        DPSRemoveTimedEntry(timer);
        fprintf(stderr, "HtmlFilter: No requests received for %.1f seconds. Quitting: %s",
                timeoutvalue, ctime(&timenow));
        _exit(2);
    }
    return self;
}

// Key words for html description extraction.
// desckwords[i] starts a description part, desckstopwords[i] ends it.
static char *desckwords[] = {
    "<TITLE>",
    "<H1>",
    ""
};
static char *desckstopwords[] = {
    "</TITLE>",
    "</H1>",
    ""
};
#define NDESCKEY 2      /* # of names in *desckwords[] */

// The main work-horse method to extract the description line from the
// document passed via pasteboard.  The description is built from the text
// following each desckwords[] entry up to the matching desckstopwords[]
// entry; parts are joined with " -- ", newlines are removed and runs of
// blanks are squeezed.  The result is written back to the pasteboard as
// IXFileDescriptionPboardType.
//
// On failure *errmsg is set to a static message.  Always returns self.
//
// NOTE(review): the MiscString -grep:before:middle:after: calls are
// compared against 1, presumably "pattern found" -- confirm against the
// MiscKit documentation.
//
- HtmlDescribe:(id)pbid userData:(const char *)udata error:(char **)errmsg
{
    char *htmldata = NULL;
    int length = 0;
    int i, s;
    int Done;
    id bmString, amString, pmString, lineString, resString;
    // Per key state: 0 = start tag not seen yet, 1 = in progress
    // (start tag seen, stop tag not yet), 2 = done.
    int desckdone[] = { 0, 0, 0 };

    time(&timelast);    // refresh the no-work timer.

    if ([pbid findAvailableTypeFrom:&InPBType num:1]) {
        // We have correct PB type.
        [pbid readType:InPBType data:&htmldata length:&length];
        if (htmldata && length) {
            // PB has some actual data in it.
            // Some initializations.
            sgets(linebuf, reset1);     // need to reset this function...
            Done = 0;
            linebuf[0] = '\0';

            lineString = [[MiscString alloc] init];
            bmString = [[MiscString alloc] init];
            amString = [[MiscString alloc] init];
            pmString = [[MiscString alloc] init];
            resString = [[MiscString alloc] init];

            // Read one line at a time get necessary info.
            while ((sgets(linebuf, htmldata) != NULL) && !Done) {
                [lineString setStringValue:linebuf];
                // s sums the states of the keys handled so far in this
                // pass; it decides whether a separator is still wanted and
                // whether all keys are done.
                for (i = 0, s = 0; i < NDESCKEY; i++) {
                    if (desckdone[i] == 0) {
                        // tag not done nor in progress -> need further twist
                        if (1 == [lineString grep:desckwords[i]
                                           before:nil middle:nil after:amString]) {
                            desckdone[i] = 1;   // in progress
                            [lineString takeStringValueFrom:amString];
                            if (1 == [lineString grep:desckstopwords[i]
                                               before:bmString middle:nil after:nil]) {
                                // append sequence between start- and stopword
                                [resString concatenate:bmString];
                                desckdone[i] = 2;   // done
                                if (s <= 2*NDESCKEY-3)
                                    [resString cat:" -- "];
                            } else {
                                // append sequence after startword
                                [resString concatenate:amString];
                            }
                        }
                    } else if (desckdone[i] == 1) {
                        // tag in progress -> need further twist
                        if (1 == [lineString grep:desckstopwords[i]
                                           before:bmString middle:pmString after:amString]) {
                            [resString concatenate:bmString];   // append part till stopword
                        } else {
                            [resString concatenate:lineString]; // append line
                        }
                        if (s <= 2*NDESCKEY-3)
                            [resString cat:" -- "];
                        desckdone[i] = 2;   // done
                        [lineString takeStringValueFrom:amString];
                    }
                    s += desckdone[i];
                }
                if (s == 2*NDESCKEY)
                    Done = 1;
            } /* end of while( (sgets .. )) loop */

            [resString replaceEveryOccurrenceOfChar:'\n' with:""];
            [resString replaceEveryOccurrenceOfRegex:" +" with:" "];

            // Send the description back on pasteboard.
            [pbid declareTypes:&IXFileDescriptionPboardType num:1 owner:self];
            [pbid writeType:IXFileDescriptionPboardType
                       data:[resString stringValue]
                     length:[resString length]];
#ifdef MYDEBUG
            fprintf(stderr, ">%s\n", [resString stringValue]);
#endif

            // This daemon serves many requests; the work strings were
            // never released before and piled up with every document.
            [lineString free];
            [bmString free];
            [amString free];
            [pmString free];
            [resString free];
        } /* end if(htmldata && length) */
        else
            *errmsg = "No actual data found on pasteboard.";

        // Only hand back memory that was actually given to us.
        if (htmldata)
            [pbid deallocatePasteboardData:htmldata length:length];
    } /* end if(types[i]) */
    else
        *errmsg = "No good PBoard type.";

    return self;
}

// These skip and done words are hard coded as below currently.
// Ideally, these should be modifiable via a word list file included in
// ~/Library/Services/NewsIndexing.service directory (or /LocalLibrary...)
// Path to this directory should be obtainable via argv[0] in main().
// I am not going to do this, so if there are any takers, please!
//
// Line skip words that cause entire line to be filtered out.
static char *lskipwords[] = {
    "</TITLE>",
    "</BODY>",
    ""
};
#define NLSKIP 2        /* # of names in *lskipwords[] */

// Html article filter service method that removes junk (see top of this file)
// from article text before it is passed to indexing scanner.
// DigitalLibrarian calls this method with full article content on the
// pasteboard of type "NXTypedFileContentsPboardType:html".
// This method is called before the description filter service
// -HtmlDescribe:..... method is called.
//
// Every line whose first whitespace-delimited word equals (ignoring case)
// one of lskipwords[] is dropped; all other lines are copied unchanged to
// the pasteboard as NXAsciiPboardType.
//
// On failure *errmsg is set to a static message.  Always returns self.
//
- HtmlFilter:(id)pbid userData:(const char *)udata error:(char **)errmsg
{
    NXStream *outStream;
    char *outBuffer;
    int outLen, dummy;
    char *htmldata = NULL;
    int length = 0;
    int prmnumber, i;
    int Done;

    time(&timelast);    // refresh the no-work timer.

    if ([pbid findAvailableTypeFrom:&InPBType num:1]) {
        // We have correct PB type.
        [pbid readType:InPBType data:&htmldata length:&length];
        if (htmldata && length) {
            // PB has some actual data in it.
            // Some initializations.
            outStream = NXOpenMemory(NULL, 0, NX_WRITEONLY);   // create memory stream
            sgets(linebuf, reset2);     // need to reset this function...
            Done = 0;
            linebuf[0] = '\0';

            // Read one line at a time get necessary info.
            while ((sgets(linebuf, htmldata) != NULL) && !Done) {
                // First, see if we want to skip this line.
                strbuf1[0] = '\0';
                // Field width keeps an over-long first word from running
                // past strbuf1[STRBUFSIZE].
                sscanf(linebuf, "%1023s", strbuf1);     /* first word */

                // if( (strlen(strbuf1) == 61) && (strbuf1[0] == 'M')) {
                // #ifdef MYDEBUG
                //     // Debug: print 'u' for each line of uuencoded junk removed.
                //     fputc('u', stderr);
                // #endif
                //     continue;   /* one word of 61 chars must be uuencoded stuff */
                // }

                prmnumber = -1;
                for (i = 0; i < NLSKIP; i++) {
                    if (strcasecmp(strbuf1, lskipwords[i]) == 0) {
                        prmnumber = i;  // Match found
                        break;
                    }
                }
                if (prmnumber != -1) {
#ifdef MYDEBUG
                    fprintf(stderr," Skip line: %s", linebuf);
#endif
                    continue;   // There was a match. Try next line
                }

                // // Now, check if we want to kill the rest of article
                // prmnumber = -1;
                // for(i=0; i<NDONE; i++) {
                //     if( strncasecmp( linebuf, donestring[i], donecount[i]) == 0 ) {
                //         prmnumber = i;
                //         break;
                //     }
                // }
                // if(prmnumber != -1) {
                //     // There was a match to donestring[].
                // #ifdef MYDEBUG
                //     fprintf(stderr," Kill to EOF: %s", linebuf);
                // #endif
                //     Done = 1;   // Raise DONE flag to kill article to EOF.
                //     continue;   // go to top to get kicked out of while(.. && !Done)..
                // }

                // We want this line. So write to stream.
                NXPrintf(outStream, "%s", linebuf);
            } /* end of while( (sgets .. )) loop */

            NXFlush(outStream);
            NXGetMemoryBuffer(outStream, &outBuffer, &outLen, &dummy);

            // Send the filtered text back on pasteboard.
            [pbid declareTypes:&NXAsciiPboardType num:1 owner:self];
            [pbid writeType:NXAsciiPboardType data:outBuffer length:outLen];
            NXCloseMemory(outStream, NX_FREEBUFFER);
        } /* end if(htmldata && length) */
        else
            *errmsg = "No actual data found on pasteboard.";

        // Only hand back memory that was actually given to us.
        if (htmldata)
            [pbid deallocatePasteboardData:htmldata length:length];
    } /* end if(types[i]) */
    else
        *errmsg = "No good PBoard type.";

    return self;
}

@end

// Daemon entry point: logs a start-up banner, registers a Provider as the
// services delegate of a Listener checked in as "IXHTMLFILTER", and runs
// the Listener loop.  Exits with status 1 when the check-in fails.
int main(int argc, char *argv[])
{
    id lid;
#ifdef MYDEBUG
    int i;
#endif

    time(&timelast);
    fprintf(stderr, "HtmlFilter.deamon for DLibrarian indexing started: %s %s\n",
            ctime(&timelast), version);
#ifdef MYDEBUG
    for (i = 0; i < argc; i++)
        fprintf(stderr, "argv[%d]: %s\n", i, argv[i]);
#endif

    lid = [[Listener alloc] init];
    [lid setServicesDelegate:[[Provider alloc] init]];
    if ([lid checkInAs:"IXHTMLFILTER"])
        _exit(1);   // uh-oh!
    [lid addPort];
    [Listener run];
    return 0;
}
These are the contents of the former NiCE NeXT User Group NeXTSTEP/OpenStep software archive, currently hosted by Netfuture.ch.