ftp.nice.ch/pub/next/unix/network/news/nn.6.4.16.s.tar.gz#/nn/digest.c

This is digest.c in view mode; [Download] [Up]

/*
 *	(c) Copyright 1990, Kim Fabricius Storm.  All rights reserved.
 *
 *	Digest article handling
 *
 *	The code to do the selective parsing of mail and mmdf formats,
 *	mail from lines and determining folder types is based on patches
 *	contributed by Bernd Wechner (bernd@bhpcpd.kembla.oz.au).
 */

#include "config.h"
#include "news.h"
#include "debug.h"

export int strict_from_parse = 2;

#ifdef DG_TEST

#define TEST(fmt, x, y) if (Debug & DG_TEST) printf(fmt, x, y)

#else

#define TEST(fmt, x, y)

#endif

#define UNIFY 040

static char digest_pattern[] = "igest";

init_digest_parsing()
{
    register char *m;

    for (m = digest_pattern; *m; m++) *m |= UNIFY;
}


is_digest()
{
    register char *subject;
    register char c, *q, *m;

    if ((subject = news.ng_subj) == NULL) return 0;

    while (c = *subject++) {
	if ((c | UNIFY) != ('d' | UNIFY)) continue;

	q = subject; m = digest_pattern;
	while ((c = *m++) && (*q++ | UNIFY) == c);
	if (c == NUL) return 1;
    }
    return 0;
}


/*
 * is_mail_from_line - Is this a legal unix mail "From " line?
 *
 * Given a line of input will check to see if it matches the standard
 * unix mail "from " header format. Returns 0 if it does and <0 if not.
 *
 * The check may be very lax or very strict depending upon
 * the value of "strict-mail-from-parse":
 *
 * 0 - Lax, checks only for the string "From ".
 * 1 - Strict, checks that the correct number of fields are present.
 * 2 - Very strict, also checks that each field contains a legal value.
 *
 * Assumptions: Not having the definitive unix mailbox reference I have
 * assumed that unix mailbox headers follow this format:
 *
 * From <person> <date> <garbage>
 *
 * Where <person> is the address of the sender, being an ordinary
 * string with no white space imbedded in it, and <date> is the date of
 * posting, in ctime(3C) format.
 *
 * This would, on the face of it, seem valid. I (Bernd) have yet to find a
 * unix mailbox header which doesn't follow this format.
 *
 * From: Bernd Wechner (bernd@bhpcpd.kembla.oz.au)
 * Obfuscated by: KFS (as usual)
 */

#define MAX_FIELDS 10

static char legal_day[]		= "SunMonTueWedThuFriSat";
static char legal_month[]	= "JanFebMarAprMayJunJulAugSepOctNovDec";
static int  legal_numbers[]	= { 1, 31, 0, 23, 0, 59, 0, 60, 1969, 2199 };

int is_mail_from_line(line, namebuf)
char *line;	/* Line of text to be checked */
char *namebuf;	/* Optional buffer to place packed sender info */
{
    char *fields[MAX_FIELDS];
    char *sender_tail;
    register char *lp, **fp;
    register int n, i;

    if (strncmp(line, "From ", 5)) return -100;
    if (strict_from_parse == 0) return 0;

    lp = line + 5;
    /* sender day mon dd hh:mm:ss year */
    for (n = 0, fp = fields; n < MAX_FIELDS; n++) {
	while (*lp && *lp != NL && isascii(*lp) && isspace(*lp)) lp++;
	if (*lp == NUL || *lp == NL) break;
	*fp++ = lp;
	while (*lp && isascii(*lp) && !isspace(*lp))
	    if (*lp++ == ':' && (n == 4 || n == 5)) break;
	if (n == 0) sender_tail = lp;
    }

    if (n < 8) return -200-n;

    fp = fields;

    if (n > 8 && !isdigit(fp[7][0])) fp[7] = fp[8]; /* ... TZ year */
    if (n > 9 && !isdigit(fp[7][0])) fp[7] = fp[9]; /* ... TZ DST year */

    if (namebuf != NULL) {
	char x = *sender_tail;
	*sender_tail = NUL;
	pack_name(namebuf, *fp, NAME_LENGTH);
	*sender_tail = x;
    }

    if (strict_from_parse == 1) return 0;

    fp++;
    for (i = 0; i < 21; i += 3)
	if (strncmp(*fp, &legal_day[i], 3) == 0) break;
    if (i == 21) return -1;

    fp++;
    for (i = 0; i < 36; i += 3)
	if (strncmp(*fp, &legal_month[i], 3) == 0) break;
    if (i == 36) return -2;

    for (i = 0; i < 10; i += 2) {
	lp = *++fp;
	if (!isdigit(*lp)) return -20-i;
	n = atoi(lp);
	if (n < legal_numbers[i] || legal_numbers[i+1] < n) return -10-i;
    }
    return 0;
}

/*
 * expect that f is positioned at header of an article
 */

static int is_mmdf_folder = 0;
static int is_mail_folder = 0;

/*
 * get_folder_type
 *
 * Given a file descriptor f, will check what type of folder it is.
 * Must be called at zero offset and caller must reposition if necessary.
 * Side-effects: sets is_mail_folder, is_mmdf_folder, and current_folder_type.
 * Return values:
 *	-1: folder is empty,
 *	 0: normal digest,
 *	 1: UNIX mail format
 *	 2: MMDF format
 */                                        

export int current_folder_type;

int get_folder_type(f)
FILE *f;
{
    char line[1024];
    int siz;

    is_mail_folder = 0;
    is_mmdf_folder = 0;

    if (fgets(line, 1024, f) == NULL)	
	return current_folder_type = -1;

    if (strncmp(line, "\001\001\001\001\n", 5) == 0) {
	is_mmdf_folder = 1;
	return current_folder_type = 2;
    }

    if (is_mail_from_line(line, (char *)NULL) == 0) {
	is_mail_folder = 1;
	return current_folder_type = 1;
    }

    return current_folder_type = 0;
}

get_digest_article(f, hdrbuf)
FILE *f;
news_header_buffer hdrbuf;
{
    int cont;

    digest.dg_hpos = ftell(f);
    TEST("GET DIGEST hp=%ld\n", digest.dg_hpos, 0);

    do {
	if (!parse_digest_header(f, 0, hdrbuf)) return -1;
	digest.dg_fpos = ftell(f);
	TEST("END HEADER hp=%ld fp=%ld\n", digest.dg_hpos, digest.dg_fpos);
    } while ((cont = skip_digest_body(f)) < 0);

    TEST("END BODY lp=%ld next=%ld\n", digest.dg_lpos, ftell(f));

    return cont;
}

#define BACKUP_LINES	 50	/* remember class + offset for parsed lines */

#define	LN_BLANK	0x01	/* blank line */
#define	LN_DASHED	0x02	/* dash line */
#define	LN_HEADER	0x04	/* (possible) header line */
#define	LN_ASTERISK	0x08	/* asterisk line (near end) */
#define	LN_END_OF	0x10	/* End of ... line */
#define	LN_TEXT		0x20	/* unclassified line */


/*
 * skip until 'Subject: ' (or End of digest) line is found
 * then backup till start of header
 */

/*
 * Tuning parameters:
 *
 *	MIN_HEADER_LINES:	number of known header lines that must
 *				be found in a block to identify a new
 *				header
 *
 *	MAX_BLANKS_DASH		max no of blanks on a 'dash line'
 *
 *	MIN_DASHES		min no of dashes on a 'dash line'
 *
 *	MAX_BLANKS_ASTERISKS	max no of blanks on an 'asterisk line'
 *
 *	MIN_ASTERISKS		min no of asterisks on an 'asterisk line'
 *
 *	MAX_BLANKS_END_OF	max no of blanks before "End of "
 */

#define	MIN_HEADER_LINES	2
#define	MAX_BLANKS_DASH		3
#define	MIN_DASHES		16
#define	MAX_BLANKS_ASTERISK	1
#define	MIN_ASTERISKS		10
#define	MAX_BLANKS_END_OF	1

skip_digest_body(f)
register FILE *f;
{
    off_t  backup_p[BACKUP_LINES];
    int	   line_type[BACKUP_LINES];
    register int backup_index, backup_count;
    int    more_header_lines, end_or_asterisks, blanks;
    int    colon_lines;
    char   line[1024];
    register char *cp;
    char **dg_hdr_field();

#define	decrease_index()	\
    if (--backup_index < 0) backup_index = BACKUP_LINES - 1

    backup_index = -1;
    backup_count = 0;
    end_or_asterisks = 0;

    digest.dg_lines = 0;


 next_line:
    more_header_lines = 0;
    colon_lines = 0;

 next_possible_header_line:
    digest.dg_lines++;

    if (++backup_index == BACKUP_LINES) backup_index = 0;
    if (backup_count < BACKUP_LINES) backup_count++;

    backup_p[backup_index] = ftell(f);
    line_type[backup_index] = LN_TEXT;

    if (fgets(line, 1024, f) == NULL) {
	TEST("end_of_file, bc=%d, lines=%d\n", backup_count, digest.dg_lines);

	if (is_mmdf_folder) {
	    digest.dg_lpos = backup_p[backup_index];
	    is_mmdf_folder = 0;
	    return 0;
	}

	/* end of file => look for "****" or "End of" line */

	if (end_or_asterisks)
	    while (--backup_count >= 0) {
		--digest.dg_lines;
		decrease_index();
		if (line_type[backup_index] & (LN_ASTERISK | LN_END_OF)) break;
	    }

	digest.dg_lpos = backup_p[backup_index];

	if (digest.dg_lines == 0) return 0;

	while (--backup_count >= 0) {
	    --digest.dg_lines;
	    digest.dg_lpos = backup_p[backup_index];
	    decrease_index();
	    if ((line_type[backup_index] &
		(LN_ASTERISK | LN_END_OF | LN_BLANK | LN_DASHED)) == 0)
		break;
	}

	return 0;	/* no article follows */
    }

    TEST("\n>>%-.50s ==>>", line, 0);

    if (is_mmdf_folder) {
	/* in an mmdf folder we simply look for the next ^A^A^A^A line */
	if (line[0] != '\001' || strcmp(line, "\001\001\001\001\n"))
	    goto next_line;

	digest.dg_lpos = backup_p[backup_index];
	--digest.dg_lines;
	return (digest.dg_lines <= 0) ? -1 : 1;
    }

    for (cp = line; *cp && isascii(*cp) && isspace(*cp); cp++);

    if (*cp == NUL) {
	TEST("BLANK", 0, 0);
	line_type[backup_index] = LN_BLANK;
	goto next_line;
    }

    if (is_mail_folder) {
	/* in a mail folder we simply look for the next "From " line */
	if (line[0] != 'F' || is_mail_from_line(line, (char *)NULL) < 0)
	    goto next_line;

	line_type[backup_index] = LN_HEADER;
	fseek(f, backup_p[backup_index], 0);
	goto found_mail_header;
    }

    blanks = cp - line;

    if (*cp == '-') {
	if (blanks > MAX_BLANKS_DASH) goto next_line;

	while (*cp == '-') cp++;
	if (cp - line - blanks > MIN_DASHES) {
	    while (*cp && (*cp == '-' || (isascii(*cp) && isspace(*cp)))) cp++;
	    if (*cp == NUL) {
		TEST("DASHED", 0, 0);

		line_type[backup_index] = LN_DASHED;
	    }

	}
	goto next_line;
    }

    if (*cp == '*') {
	if (blanks > MAX_BLANKS_ASTERISK) goto next_line;

	while (*cp == '*') cp++;
	if (cp - line - blanks > MIN_ASTERISKS) {
	    while (*cp && (*cp == '*' || (isascii(*cp) && isspace(*cp)))) cp++;
	    if (*cp == NUL) {
		TEST("ASTERISK", 0, 0);
		line_type[backup_index] = LN_ASTERISK;
		end_or_asterisks++;
	    }
	}
	goto next_line;
    }

    if (blanks <= MAX_BLANKS_END_OF &&
	*cp == 'E' && strncmp(cp, "End of ", 7) == 0) {
	TEST("END_OF_", 0, 0);
	line_type[backup_index] = LN_END_OF;
	end_or_asterisks++;
	goto next_line;
    }

    if (blanks)
	goto next_possible_header_line;
/* must be able to handle continued lines in sub-digest headers...
	goto next_line;
*/

    if (!dg_hdr_field(line, 0))
    {
	char *colon;
	if (colon = strchr(line, ':')) {
	    for (cp = line; cp < colon; cp++)
		if (!isascii(*cp) || isspace(*cp)) break;
	    if (cp == colon) {
		TEST("COLON", 0, 0);
		colon_lines++;
		line_type[backup_index] = LN_HEADER;
		goto next_possible_header_line;
	    }
	}
	if (is_mail_from_line(line, (char *)NULL) == 0) {
	    TEST("FROM_", 0, 0);
	    colon_lines++;
	    line_type[backup_index] = LN_HEADER;
	}

	goto next_possible_header_line;
    }

    TEST("HEADER", 0, 0);

    line_type[backup_index] = LN_HEADER;
    if (++more_header_lines < MIN_HEADER_LINES)
	goto next_possible_header_line;

    /* found block with MIN_HEADER_LINES */

    TEST("\nSearch for start of header\n", 0, 0);

    colon_lines += more_header_lines;
    for (;;) {
	fseek(f, backup_p[backup_index], 0);
	if (line_type[backup_index] == LN_HEADER)
	    if (--colon_lines <= 0) break;
	--digest.dg_lines;
	if (--backup_count == 0) break;
	decrease_index();
	if ((line_type[backup_index] & (LN_HEADER | LN_TEXT)) == 0)
	    break;
    }

    if (digest.dg_lines == 0) {
	TEST("Skipped empty article\n", 0, 0);
	return -1;
    }

 found_mail_header:

    for (;;) {
	digest.dg_lpos = backup_p[backup_index];
	if (--backup_count < 0) break;
	decrease_index();
	if ((line_type[backup_index] & (LN_BLANK | LN_DASHED)) == 0)
	    break;
	--digest.dg_lines;
    }

    return (digest.dg_lines == 0) ? -1 : 1;
}


parse_digest_header(f, all, hdrbuf)
FILE *f;
int all;
news_header_buffer hdrbuf;
{
    extern char *parse_header(), **dg_hdr_field();

    digest.dg_date = digest.dg_from = digest.dg_subj = digest.dg_to = NULL;

    parse_header(f, dg_hdr_field, all, hdrbuf);

    return digest.dg_from || digest.dg_subj;
}


static char **dg_hdr_field(lp, all)
register char *lp;
int all;
{
    static char *dummy;
    static char namebuf[NAME_LENGTH+1];

#define check(name, lgt, field) \
    if (isascii(lp[lgt]) && isspace(lp[lgt]) && strncmp(name, lp, lgt) == 0) {\
	TEST("MATCH: field ", 0, 0); \
	return &digest.field; \
    }


    TEST("\nPARSE[%.20s] ==>> ", lp, 0);

    switch (*lp++) {

     case '\001':
	/* In an mmdf folder ^A^A^A^A is skipped at beginning of header */
	if (!is_mmdf_folder) break;
	if (strncmp(lp, "\001\001\001\n", 4)) break;
	digest.dg_hpos += 5;
	return NULL;

     case 'D':
     case 'd':
	check("ate:",	4, dg_date);
	break;

     case 'F':
     case 'f':
	check("rom:",	4, dg_from);
	if (!is_mail_folder) break;
	if (*--lp != 'F') break;
	if (is_mail_from_line(lp, namebuf) < 0) break;
	/* Store packed sender in dg_from here and return dummy to parser */
	if (digest.dg_from == NULL) digest.dg_from = namebuf;
	return &dummy;

     case 'R':
     case 'r':
	if (!all) break;
	check("e:",	2, dg_subj);
	break;

     case 'S':
     case 's':
	check("ubject:", 7, dg_subj);
	check("ubject",	6, dg_subj);
	break;

     case 'T':
     case 't':
	check("itle:",	5, dg_subj);
	if (!all) break;
	check("o:",	2, dg_to);
	break;
    }

#undef check
    TEST("NOT MATCHED ", 0, 0);

    return NULL;
}

These are the contents of the former NiCE NeXT User Group NeXTSTEP/OpenStep software archive, currently hosted by Netfuture.ch.