/* HtmlFilter.m -- Html filtering and description services for DL indexing
	of  Html docs.
	The following two services are implemented.

[1] HtmlDescribe Service:
	Describes Html articles based currently on Subject:, From:, and Date: lines.
	This can be modified quite easily.  Some effort has been made to extract
	full name of the poster from the From: line.

	This description service is advertised via the following specification in
	~/Library/Services/HtmlIndexing.service/services (or /LocalLibrary...) file.
		Filter: HtmlDescribe
		Send Type: NXTypedFileContentsPboardType:html
		Return Type: IXFileDescriptionPboardType
		Executable: HtmlFilter.daemon

[2] HtmlFilter Service:
	Filters junk out of Html articles, so junk will not be indexed.
	"Junk" currently is defined as:
	    Path: line.
	    Nntp-posting-host: line.
	    Lines including and after "-- NewsGrazer, a NeXTstep(tm) ....."
	    Other UUENCODE'ed stuff.

	This filtering service is advertised via the following specification in
	~/Library/Services/HtmlIndexing.service/services (or /LocalLibrary...) file.
		Filter: HtmlFilter
		Send Type: NXTypedFileContentsPboardType:html
		Return Type: NXAsciiPboardType
		Executable: HtmlFilter.daemon

	Advantage of this service daemon scheme over the Unix stdio filter
	(invoked via NXUNIXSTDIO port) is that the daemon based filter is
	invoked only once per DL indexing session, unlike stdio filter which
	is invoked for every article indexed.

	Daemons can keep running indefinitely, but this one quits after some
	duration of inactivity.

	No Copyright is claimed.
	This program is hereby released into the public domain.

	Benoît Grangé [ben@fizz.fdn.org] distributed a similar daemon
	free of charge, but no source code was included in the distribution.
	This version has been developed from scratch by myself.

	Izumi Ohzawa, izumi@pinoko.berkeley.edu.

------- TO DO ----------------------------------------------------------------
[1] Making things configurable, e.g., what goes into the description line in
	what order, and what kind of lines to filter out in -HtmlFilter:...
	method.  These specs can be stored in a file in
	~/Library/Services/HtmlIndexing.service directory (or /LocalLibrary..)
	This path should be in argv[0] in main() because that is where the
	executable lives too.
	I probably won't get around to doing this myself, or doing much

------- Revision History -----------------------------------------------------
	Version 0.13 of HtmlDescribe. Juergen Sell js@euler.han.de
	Stolen from NewsIndexing, errors are mine.
*/

#import <appkit/appkit.h>
#import <indexing/indexing.h>

#import <stdio.h>
#import <stdlib.h>
#import <strings.h>
#import <time.h>

#include <misckit.h>

// Debug switch.  While MYDEBUG is defined, the #ifdef MYDEBUG sections in
// this file print diagnostics to stderr.  See Console output.
// Comment the line out to silence them.
#define MYDEBUG 1

// Version banner printed by main() at start-up.
static char *version = "[V0.13, Juergen Sell js@euler.han.de]";

// Timeout stuff
static  float timeoutvalue = 600.0;	// seconds without a request before -step quits.
static  float interval = 30.0;		// period (seconds) of the timed entry that runs -step.
static  time_t timelast;		// time() of the most recent service request.

// String buffers.
static  char linebuf[8192];		// current line handed out by sgets().
static  char strbuf1[1024];		// first word of the current line (-HtmlFilter:).
// Dummy inputs passed to sgets() only to make it forget the previous buffer
// before a new pasteboard buffer is scanned.
static  char *reset1 = "reset1";
static  char *reset2 = "reset2";

// Pasteboard type accepted by both services.
// The following must match the spec in "services" file.
char *InPBType = "NXTypedFileContentsPboardType:html";

// Service provider object.  main() installs an instance as the Listener's
// services delegate; it implements the two filter services named in the
// "services" file and owns the timed entry used for the inactivity check.
@interface Provider:Object
{
    DPSTimedEntry	timer;	// timed entry created in -init, removed in -step
}
- init;
- step;
- HtmlDescribe:(id)pbid userData:(const char *)udata error:(char **)errmsg;
- HtmlFilter:(id)pbid userData:(const char *)udata error:(char **)errmsg;
@end

@implementation Provider

// Longest line body (not counting the appended '\n' and the terminating
// NUL) that sgets() stores.  Characters beyond this on a single input line
// are dropped, so outbuf must hold at least SGETS_MAXLINE + 2 bytes.
#define SGETS_MAXLINE	8190

// Returns the next line of the NUL-terminated multi-line string inbuf in
// outbuf, always terminated with '\n'; keeps the current position between
// calls.  Similar to fgets(), except it reads from a string.
//
// outbuf: receives the line; at least SGETS_MAXLINE + 2 bytes.
// inbuf:  string being scanned.  Passing a pointer different from the one
//         used in the previous call restarts scanning at its beginning.
// Returns outbuf, or NULL when no more lines are left (or inbuf is NULL).
//
// BUGS: position is tracked by comparing inbuf pointers, so a caller must
// pass some other string once before scanning a new buffer that might live
// at the same address as the previous one.
char *sgets(char *outbuf, char *inbuf)
{
    static char *lastinbuf = NULL;
    static char *cptr;
    static char *endbuf;	// points to NUL char at the end of str.
    int n;

    if (lastinbuf != inbuf) {
	// input string changed, so update current positions.
	cptr = inbuf;
	lastinbuf = inbuf;
	endbuf = (inbuf != NULL) ? inbuf + strlen(inbuf) : NULL;
    }

    if (cptr == NULL || cptr >= endbuf) {
	// Reached end of inbuf.
	return NULL;
    }

    // Copy up to (not including) the next '\n'.  Copying by hand instead of
    // sscanf("%[^\n]") means an empty line yields an empty result rather
    // than leaving the previous line in outbuf, and lets us bound the copy.
    n = 0;
    while (cptr < endbuf && *cptr != '\n') {
	if (n < SGETS_MAXLINE)
	    outbuf[n++] = *cptr;
	cptr++;
    }
    outbuf[n++] = '\n';		// add '\n' at the end
    outbuf[n] = '\0';

    if (cptr < endbuf)
	cptr++;			// step over '\n' to the next line.
    else
	cptr = NULL;		// last line had no '\n'; nothing left.

    return outbuf;
}

// Timed-entry callback installed by -init.  TEobject is the user-data
// pointer given to DPSAddTimedEntry() -- the Provider instance -- which is
// told to run its periodic inactivity check.  The other arguments are unused.
void runOneStep(DPSTimedEntry timedEntry, double timeNow, void *TEobject)
{
    [(id)TEobject step];
}

// Initializes the provider: runs the superclass initializer, stamps the
// "last request" time and installs the timed entry that calls runOneStep()
// every `interval` seconds.  Returns self.
- init
{
    [super init];
    time(&timelast);		// initialize no-work timer.
    timer = DPSAddTimedEntry(interval, &runOneStep, self, NX_RUNMODALTHRESHOLD);
    return self;
}

// Periodic inactivity check, invoked from runOneStep().  When no service
// request has refreshed timelast for at least timeoutvalue seconds, the
// timed entry is removed, a message is logged and the daemon quits.
// Otherwise returns self.
- step
{
    time_t timenow;

    time(&timenow);		// current time; compared against last request.
    if( (timenow - timelast) >= timeoutvalue) {
	DPSRemoveTimedEntry (timer);
	fprintf(stderr,
		"HtmlFilter: No requests received for %.1f seconds. Quitting: %s",
		timeoutvalue, ctime(&timenow));
	exit(0);
    }
    return self;
}

// Key words for description extraction.  desckwords[] holds the start tags
// whose text goes into the description line; desckstopwords[i] is the tag
// that ends desckwords[i], so both tables must keep the same length and
// order.
static char *desckwords[] = {
	"<TITLE>",    "<H1>",
};
static char *desckstopwords[] = {
	"</TITLE>",    "</H1>",
};

#define NDESCKEY	  2		/* # of names in *desckwords[] */

// The main work-horse method to extract the description line from the html
// data passed via pasteboard.  For each start tag in desckwords[] the text
// up to the matching desckstopwords[] entry is collected (looking at most
// one line past the line holding the start tag), the pieces are joined with
// " -- ", newlines are removed and runs of blanks collapsed, and the result
// is written back as IXFileDescriptionPboardType.
//
// pbid:   pasteboard carrying the request and receiving the reply.
// udata:  unused.
// errmsg: set to a message when the pasteboard has no usable data.
// Returns self.
//
// desckdone[i] tracks each tag: 0 = not seen, 1 = start seen (in progress),
// 2 = done.  Scanning stops once every tag is done.
//
// NOTE(review): the text in front of a stop word is read from bmString, so
// the grep sends below use the before:/middle:/after: form of MiscString's
// grep, which is presumably what fills it -- confirm against the MiscKit
// headers in use.
- HtmlDescribe:(id)pbid userData:(const char *)udata error:(char **)errmsg
{
    char *htmldata = NULL;
    int  length = 0;
    int  i, s;
    int  Done;
    id   bmString, amString, lineString, resString;
    int  desckdone[NDESCKEY] = { 0 };

    time(&timelast);		// refresh the no-work timer.

    if(![pbid findAvailableTypeFrom:&InPBType num:1]) {
	*errmsg = "No good PBoard type.";
	return self;
    }

    // We have correct PB type.
    [pbid readType:InPBType data:&htmldata length:&length];
    if(htmldata && length) {
	// PB has some actual data in it.
	// Some initializations.
	sgets(linebuf, reset1);	// need to reset this function...
	Done = 0;
	linebuf[0] = '\0';
	lineString = [[MiscString alloc] init];	// line (or rest of line) being examined
	bmString = [[MiscString alloc] init];	// text before the last match
	amString = [[MiscString alloc] init];	// text after the last match
	resString = [[MiscString alloc] init];	// description being built

	// Read one line at a time get necessary info.
	while( (sgets(linebuf, htmldata) != NULL) && !Done ) {
	    [lineString setStringValue:linebuf];
	    for(i=0,s=0; i<NDESCKEY; i++) {
		if (desckdone[i] == 0) {
		    // tag not done nor in progress -> look for the start word
		    if (1 == [lineString grep:desckwords[i]
				       before:nil
				       middle:nil
					after:amString] ) {
			desckdone[i]= 1; // in progress
			[lineString takeStringValueFrom:amString];
			if (1 == [lineString grep:desckstopwords[i]
					   before:bmString
					   middle:nil
					    after:nil] ) {
			    [resString concatenate:bmString]; // append sequence between start- and stopword
			    desckdone[i]= 2; // done
			    if (s <= 2*NDESCKEY-3) [resString cat:" -- "];
			} else {
			    [resString concatenate:amString]; // append sequence after startword
			}
		    }
		} else if (desckdone[i] == 1) {
		    // tag in progress -> look for the stop word
		    if (1 == [lineString grep:desckstopwords[i]
				       before:bmString
				       middle:nil
					after:amString] ) {
			[resString concatenate:bmString]; // append part till stopword
		    } else {
			[resString concatenate:lineString]; // append line
		    }
		    if (s <= 2*NDESCKEY-3) [resString cat:" -- "];
		    desckdone[i]= 2; // done
		    [lineString takeStringValueFrom:amString];
		}
		s+= desckdone[i];
	    }
	    if (s == 2*NDESCKEY) Done= 1;	// every tag is in state 2
	}   /* end of while( (sgets .. )) loop */

	[resString replaceEveryOccurrenceOfChar:'\n' with:""];
	[resString replaceEveryOccurrenceOfRegex:" +" with:" "];
	// Send the description back on pasteboard.
	[pbid declareTypes:&IXFileDescriptionPboardType num:1 owner:self];
	[pbid writeType:IXFileDescriptionPboardType data:[resString stringValue]
	      length:[resString length]];
#ifdef MYDEBUG
	fprintf(stderr, ">%s\n", [resString stringValue]);
#endif
	// The daemon serves many requests; release the per-request strings.
	[lineString free];
	[bmString free];
	[amString free];
	[resString free];
    } /* end if(htmldata && length) */
    else
	*errmsg = "No actual data found on pasteboard.";

    [pbid deallocatePasteboardData:htmldata length:length];

    return self;
}

// These skip and done words are hard coded as below currently.
// Ideally, these should be modifiable via a word list file included in
// ~/Library/Services/NewsIndexing.service directory (or /LocalLibrary...)
// Path to this directory should be obtainable via argv[0] in main().
// I am not going to do this, so if there are any takers, please!
//
// Line skip words: a line whose first whitespace-delimited word equals one
// of these (compared without regard to case) is filtered out entirely by
// -HtmlFilter:userData:error:.
static char *lskipwords[] = {
	"</TITLE>",    "</BODY>",
};
#define NLSKIP	  2		/* # of names in *lskipwords[] */

// Html article filter service method that removes unwanted lines from the
// text before it is passed to the indexing scanner.
// DigitalLibrarian calls this method with full article content on the
// pasteboard of type "NXTypedFileContentsPboardType:html".
// This method is called before the description filter service
// -HtmlDescribe:..... method is called.
//
// Every line whose first word matches an entry of lskipwords[] is dropped;
// all other lines are copied to a memory stream whose contents are written
// back as NXAsciiPboardType.
//
// pbid:   pasteboard carrying the request and receiving the reply.
// udata:  unused.
// errmsg: set to a message when the pasteboard has no usable data.
// Returns self.
- HtmlFilter:(id)pbid userData:(const char *)udata error:(char **)errmsg
{
    NXStream *outStream;
    char *outBuffer;
    int  outLen, dummy;
    char *htmldata = NULL;
    int  length = 0;
    int  prmnumber, i;
    int  Done;

    time(&timelast);		// refresh the no-work timer.

    if(![pbid findAvailableTypeFrom:&InPBType num:1]) {
	*errmsg = "No good PBoard type.";
	return self;
    }

    // We have correct PB type.
    [pbid readType:InPBType data:&htmldata length:&length];
    if(htmldata && length) {
	// PB has some actual data in it.
	// Some initializations.
	outStream = NXOpenMemory(NULL, 0, NX_WRITEONLY);  // create memory stream
	sgets(linebuf, reset2);	// need to reset this function...
	Done = 0;		// only raised by the disabled "kill to EOF" code below
	linebuf[0] = '\0';

	// Read one line at a time get necessary info.
	while( (sgets(linebuf, htmldata) != NULL) && !Done ) {

	    // First, see if we want to skip this line.
	    strbuf1[0] = '\0';
	    // Width keeps an over-long first word from overrunning strbuf1[1024].
	    sscanf(linebuf, "%1023s", strbuf1);	/* first word */

// 	    if( (strlen(strbuf1) == 61) && (strbuf1[0] == 'M')) {
// #ifdef MYDEBUG
// 		// Debug: print 'u' for each line of uuencoded junk removed.
// 		fputc('u', stderr);
// #endif
// 		continue;	/* one word of 61 chars must be uuencoded stuff */
// 	    }

	    prmnumber = -1;
	    for(i=0; i<NLSKIP; i++) {
		if( strcasecmp(strbuf1, lskipwords[i]) == 0 ) {
		    prmnumber = i;		// Match found
		    break;
		}
	    }
	    if(prmnumber != -1) {
#ifdef MYDEBUG
		fprintf(stderr,"  Skip line: %s", linebuf);
#endif
		continue;	// There was a match. Try next line
	    }

// 	    // Now, check if we want to kill the rest of article
// 	    prmnumber = -1;
// 	    for(i=0; i<NDONE; i++) {
// 		if( strncasecmp( linebuf, donestring[i], donecount[i]) == 0 ) {
// 		    prmnumber = i;
// 		    break;
// 		}
// 	    }
// 	    if(prmnumber != -1) {
// 		// There was a match to donstring[].
// #ifdef MYDEBUG
// 		fprintf(stderr,"  Kill to EOF: %s", linebuf);
// #endif
// 		Done = 1;	// Raise DONE flag to kill article to EOF.
// 		continue;	// go to top to get kicked out of while(.. && !Done)..
// 	    }

	    // We want this line. So write to stream.
	    NXPrintf(outStream, "%s", linebuf);
	}   /* end of while( (sgets .. )) loop */

	NXGetMemoryBuffer(outStream, &outBuffer, &outLen, &dummy);
	// Send the filtered text back on pasteboard.
	[pbid declareTypes:&NXAsciiPboardType num:1 owner:self];
	[pbid writeType:NXAsciiPboardType data:outBuffer length:outLen];
	NXCloseMemory(outStream, NX_FREEBUFFER);
    } /* end if(htmldata && length) */
    else
	*errmsg = "No actual data found on pasteboard.";

    [pbid deallocatePasteboardData:htmldata length:length];

    return self;
}

@end	/* Provider */


// Daemon entry point.  Logs a start-up banner (and, with MYDEBUG, the
// argument vector), creates a Listener whose services delegate is a new
// Provider, checks it in under the name "IXHTMLFILTER" and runs the
// Listener loop.  Exits with status 1 when the check-in fails.
int main(int argc, char *argv[])
{
    id lid;
#ifdef MYDEBUG
    int i;
#endif

    time(&timelast);	// so the banner shows the actual start time.
    fprintf(stderr,
	"HtmlFilter.deamon for DLibrarian indexing started: %s %s\n",
			ctime(&timelast), version);

#ifdef MYDEBUG
    for(i=0; i<argc; i++)
	fprintf(stderr, "argv[%d]: %s\n", i, argv[i]);
#endif

    lid = [[Listener alloc] init];
    [lid setServicesDelegate:[[Provider alloc] init]];
    if([lid checkInAs:"IXHTMLFILTER"]) _exit(1);	// uh-oh!
    [lid addPort];
    [Listener run];
    return 0;
}

