ParseHTML.h

This is ParseHTML.h in view mode; [Download] [Up]
/*				FORMAT CONVERSION FROM SGML
**				===========================
**
**
**	22 Nov 92	Fixed quoting of hrefs.
**			CERN_WEIRDO ifdefed out -- proper SGML expected
**			REMOVE_SCRIPT ifdefed out -- did ignore lines starting with "."
*/

#import "HTStyle.h"

extern HTStyleSheet * styleSheet;

/*	Default stream access, used only when the including file has not
**	already defined NEXT_CHAR.  The parser reads its input exclusively
**	through END_OF_FILE, NEXT_CHAR and BACK_UP.
**
**	NOTE(review): sgmlStream is declared FILE * here, yet END_OF_FILE uses
**	NXAtEOS(); confirm which stream package this fallback is meant for.
**	NOTE(review): the stdio ungetc() takes (character, stream); BACK_UP
**	passes only the stream -- confirm before relying on this fallback.
*/
#ifndef NEXT_CHAR
static FILE * sgmlStream;
#define END_OF_FILE NXAtEOS(sgmlStream)		/* @@@@ */
#define NEXT_CHAR getc(sgmlStream)
#define BACK_UP	ungetc(sgmlStream)
#endif

/*	Fold a lower-case ASCII letter to upper case; any other value is
**	passed through unchanged.  Relies on the ASCII distance of 32 between
**	the two cases.  The argument is fully parenthesised so that compound
**	expressions work, but it is evaluated more than once: do not pass an
**	expression with side effects.
*/
#define upper(c) ( (((c)>='a')&&((c)<='z')) ? (char)((int)(c)-32) : (c) )

/*	States of the SGML lexer
**
**	The order of the enumerators fixes their numeric values, which are
**	what the "Left in state %i" trace prints.  When REMOVE_SCRIPT is not
**	defined, S_column_1 is simply another name for S_text.
*/
enum state_enum {
	S_text,			/* Outside any tag and outside a word */
#ifdef REMOVE_SCRIPT
	S_column_1,		/* As S_text, but at the first character of a line */
	S_dot,			/* A dot was seen in the first column */
	S_junk_script,		/* Discard everything up to newline or ";" */
#else
#define S_column_1 S_text
#endif
	S_word,			/* Last character was non-white and printable */
	S_char_ref,		/* Accumulating a numeric character reference */
	S_tag_start,		/* Just read "<" */
	S_tag_h,		/* Read "<H" */
	S_tag_a,		/* Read "<A" */
	S_end_a,		/* Read "</A" */
	S_tag_d,		/* Read "<D" */
	S_end_d,		/* Read "</D" */
	S_tag_i,		/* Read "<I" */
	S_tag_l,		/* Read "<L" */
	S_tag_lis,		/* Read "<LIS" */
	S_tag_n,		/* Read "<N" */
	S_tag_o,		/* Read "<O" */
	S_end_o,		/* Read "</O" */
	S_tag_p,		/* Read "<P" */
	S_tag_u,		/* Read "<U" */
	S_end_u,		/* Read "</U" */
	S_tag_end,		/* Just read "</" */
	S_restoffile,		/* Read "<RESTOFFILE" */
	S_end_h,		/* Read "</H" */
	S_title,		/* Collecting the text of a title */
	S_anchor,		/* Between the attributes of an anchor tag */
	S_href,			/* At the start of an HREF value */
	S_href_quoted,		/* Inside a quoted HREF value */
	S_href_unquoted,	/* Inside an unquoted HREF value */
	S_aname,		/* Inside a NAME value */
	S_junk_tag,		/* Discard everything up to ">" */
#ifdef CERN_WEIRDO
	S_junk_line,		/* Discard everything up to "\n" */
#endif
	S_done			/* Parsing is complete */
};

/*	Description of one SGML paragraph style.
**
**	The field order matters: the style definitions in this file use
**	positional initialisers.
*/
typedef struct _SGML_style {
    /* Markup associated with the style */
    char *start_tag;		/* Tag marking the start of the style */
    char *paragraph_tag;	/* Tag marking a paragraph break inside the style */
    char *tab_tag;		/* Tag marking a tab inside the style */
    char *end_tag;		/* Tag marking the end of the style */

    /* Conventional text associated with the style */
    char *start_text;		/* Text that conventionally opens the style */
    char *paragraph_text;	/* Text used as a paragraph mark inside the style */
    char *end_text;		/* Text used to close the style */

    HTStyle *style;		/* Paragraph style to be used */
    int free_format;		/* Flag: are line ends no more than word breaks? */
    int litteral;		/* Flag: does the style end only at its close tag? */
} SGML_style;

/*	One entry of the stack of styles that have been started and not yet
**	ended.
*/
typedef struct _NestedStyle {
    struct _NestedStyle *next;	/* Entry beneath this one, or 0 at the bottom */
    SGML_style *SGML;		/* The SGML style this entry records */
} NestedStyle;

/*				MODULE-WIDE DATA
**
**
*/

/*	A style change is only recorded in next_style at first; it is carried
**	out when output is actually produced.  This avoids dummy style changes
**	that would generate too many extra newlines.
*/
static SGML_style *current_style;	/* Style the output is in now */
static SGML_style *next_style;		/* Style to switch to before further output */

static NestedStyle *styleStack;		/* Styles started and not yet ended */
static int output_in_word;		/* Flag: last character output was non-white */

static int char_num;			/* Current value of numeric character reference */

/*	Paragraph styles used by the SGML parser
**	----------------------------------------
**
**	Initialiser order follows SGML_style:
**	    start_tag, paragraph_tag, tab_tag, end_tag,
**	    start_text, paragraph_text, end_text,
**	    style, free_format, litteral.
**	Every style pointer is 0 here; get_styles() assigns them at run time.
*/

static SGML_style Normal = {
	"", "<P>\n", "\t", "",
	"", "", "",
	0, 1, 0
};

static SGML_style Heading[6] = {
	{ "\n<H1>", "</H1>\n<H1>", "\t", "</H1>",   "", "", "",   0, 1, 0 },
	{ "\n<H2>", "</H2>\n<H2>", "\t", "</H2>",   "", "", "",   0, 1, 0 },
	{ "\n<H3>", "</H3>\n<H3>", "\t", "</H3>",   "", "", "",   0, 1, 0 },
	{ "\n<H4>", "</H4>\n<H4>", "\t", "</H4>",   "", "", "",   0, 1, 0 },
	{ "\n<H5>", "</H5>\n<H5>", "\t", "</H5>",   "", "", "",   0, 1, 0 },
	{ "\n<H6>", "</H6>\n<H6>", "\t", "</H6>",   "", "", "",   0, 1, 0 }
};

/* Large hanging indent with tab */
static SGML_style Glossary = {
	"\n<DL>\n<DT>", "\n<DT>", "\n<DD>", "\n</DL>\n",
	"", "", "",
	0, 1, 0
};

/* Hanging indent with tab */
static SGML_style listStyle = {
	"\n<UL>\n<LI>", "\n<LI>", "\t", "\n</UL>",
	"\267\t", "\267\t", "",
	0, 1, 0
};

static SGML_style addressStyle = {
	"\n<ADDRESS>", "<P>", "\t", "\n</ADDRESS>",
	"", "", "",
	0, 1, 0
};

/*	Explicit format styles.  Each of these is described as a fixed width
**	font, at least 80 characters wide.
*/
static SGML_style Example = {
	"\n<XMP>", "\n", "\t", "</XMP>",
	"", "", "",
	0, 0, 1
};

/* Not free format, but not litteral either */
static SGML_style Preformatted = {
	"\n<PRE>", "\n", "\t", "</PRE>",
	"", "", "",
	0, 0, 0
};

static SGML_style Fixed = {
	"\n<FIXED>", "<P>", "\t", "</FIXED>",
	"", "", "",
	0, 1, 0
};

static SGML_style Listing = {
	"\n<LISTING>", "\n", "\t", "</LISTING>",
	"", "", "",
	0, 0, 1
};

/*	Every SGML paragraph style defined in this file, in one table.
*/
static SGML_style *styleTable[] = {
	&Normal,
	&Heading[0], &Heading[1], &Heading[2],
	&Heading[3], &Heading[4], &Heading[5],
	&Glossary, &listStyle, &addressStyle,
	&Preformatted, &Fixed, &Example, &Listing
};

/*	Number of entries in styleTable */
#define NUMBER_OF_STYLES (sizeof(styleTable)/sizeof(styleTable[0]))


/*	Highlighting styles: get_styles() stores "Italic", "Bold" and
**	"Bold-Italic" here, in that order.
*/
static HTStyle *Highlighting[3];

/*				F U N C T I O N S
*/


/*	Look up, by name, the styles the parser needs
**	---------------------------------------------
**
**	Every lookup goes through HTStyleNamed() on the global styleSheet and
**	the result is stored as it comes back.  Preformatted uses the same
**	"Example" style as Example.  Fixed.style is not assigned here.
*/
void get_styles()
{
    static char *heading_names[6] = {
	"Heading1", "Heading2", "Heading3",
	"Heading4", "Heading5", "Heading6"
    };
    int level;

    Normal.style = HTStyleNamed(styleSheet, "Normal");
    for (level = 0; level < 6; level++) {
	Heading[level].style = HTStyleNamed(styleSheet, heading_names[level]);
    }
    Glossary.style     = HTStyleNamed(styleSheet, "Glossary");
    listStyle.style    = HTStyleNamed(styleSheet, "List");
    addressStyle.style = HTStyleNamed(styleSheet, "Address");
    Example.style      = HTStyleNamed(styleSheet, "Example");
    Preformatted.style = HTStyleNamed(styleSheet, "Example");
    Listing.style      = HTStyleNamed(styleSheet, "Listing");

    Highlighting[0] = HTStyleNamed(styleSheet, "Italic");
    Highlighting[1] = HTStyleNamed(styleSheet, "Bold");
    Highlighting[2] = HTStyleNamed(styleSheet, "Bold-Italic");
}


/*	Output a paragraph break in the current style
**	---------------------------------------------
**
**	Emits a number of newlines approximating the style's space before and
**	after, then the style's paragraph_text, and clears output_in_word.
**
**	If the current SGML style has no HTStyle, no paragraph information, or
**	a zero line height, a single newline is output.  update_style() already
**	allows for a missing HTStyle in the same way.
*/
void output_paragraph()
{
    HTStyle * s = current_style->style;
    int newlines = 1;
    int i;

    if (s && s->paragraph && s->paragraph->lineHt != 0) {
	newlines = ((s->spaceBefore+s->spaceAfter) / s->paragraph->lineHt) + 1;
    }
    for(i=0; i<newlines; i++) OUTPUT('\n');	/* Rather approximate! @@	*/
    OUTPUTS(current_style->paragraph_text);
    output_in_word = 0;
}

/*	Carry out a pending style change (finishing the old style)
**
**	The end_text of the old style is output first.  If the old style is
**	free format and both the old and the new style have an HTStyle,
**	newlines are generated to separate them; styles that are not free
**	format (such as <XMP> and <LISTING>) get none, so going in and out of
**	them produces no extra newline.
**
**	Only the larger of the old style's space after and the new style's
**	space before is used, as that is nearer what is meant.
**
**	Finally next_style becomes current_style, its HTStyle (if any) is set,
**	and its start_text is output.
*/
void update_style()
{
    HTStyle * cur = current_style->style;
    HTStyle * next = next_style->style;

    OUTPUTS(current_style->end_text);

    if (current_style->free_format && cur && next) {
	float space;
	int newlines;

	if (cur->spaceAfter > next->spaceBefore)
	    space = cur->spaceAfter;
	else
	    space = next->spaceBefore;
	newlines = (space/cur->paragraph->lineHt) + 1;

	output_in_word = 0;
	while (newlines-- > 0) OUTPUT('\n');	/* An approximation only */
    }

    current_style = next_style;
    if (current_style->style) SET_STYLE(current_style->style);

    OUTPUTS(current_style->start_text);
}

/*	Perform the pending style change, if there is one */
#define UPDATE_STYLE {if (current_style!=next_style) update_style();}

/*	Remember that we will be going into style s
**	-------------------------------------------
**
**	Nothing is output here: the style is only recorded as the pending one.
*/
void change_style(SGML_style * s)
{
    next_style = s;	/* Acted upon later by update_style() */
}


/*	End the most recently started SGML style
**
**	Pops the top of the style stack and makes the style beneath it the
**	pending style, or Normal if the stack is then empty.  If nothing was
**	stacked at all, the problem is traced and Normal becomes pending.
*/
void end_style()
{
    NestedStyle * top = styleStack;

    if (!top) {
        if (TRACE) printf("HTML: We have ended more styles than we have started!\n");
        change_style(&Normal);		/* Nothing was nested */
        return;
    }

    styleStack = top->next;
    free(top);
    change_style(styleStack ? styleStack->SGML : &Normal);
}

/*	Start a nested SGML style
**
**	Pushes s onto the style stack and makes it the pending style.
**
**	If no memory is available for the stack entry, the style is still made
**	pending but is not stacked; a later end_style() will then unwind one
**	level too far, which it already tolerates.
*/
void start_style(SGML_style * s)
{
    NestedStyle * N = malloc(sizeof(*N));
    if (N) {
	N->next = styleStack;
	N->SGML = s;
	styleStack = N;
    } else {
	if (TRACE) printf("HTML: Out of memory, style not stacked!\n");
    }
    change_style(s);
}

/*	Start a highlighted area
**	------------------------
**
**	Currently a stub: the style argument is ignored and nothing is output.
*/

void start_highlighting(HTStyle * style)
{
    /* SET_STYLE(style);  @@@ to be fixed up */
}

/*	End a highlighted area
**	----------------------
**
**	Currently a stub: it does nothing.
*/
void end_highlighting()
{
	/* @@@@@@ Need set and unset style functions, traits and nesting */
}

/*	Check keyword syntax
**	--------------------
**
**	Called when the input can only be one particular keyword.  Letters are
**	compared after folding the input to upper case.
**
** On entry,
**	s		Points to a template string in upper case.  A space in
**			the template stands for any amount of white space
**			(including none) in the input stream.
**			THE FIRST CHARACTER OF THE TEMPLATE IS SKIPPED: it is
**			taken to have been read and checked by the caller.
** On exit,
**	returns		YES if the whole template matched; all matching
**			characters have been read.
**			NO  on a mismatch; a message has been printed and the
**			offending character has been put back with BACK_UP.
*/
static BOOL check(char *s)
{
    char * p = s+1;		/* Template position; first character skipped */
    char c;			/* Character from stream */

    while (*p) {
	if (*p != ' ') {
	    /* An ordinary template character must match exactly */
	    c = NEXT_CHAR;
	    if (upper(c) != *p) {
		printf("SGML parse: `%c' found when `%c' in `%s' was expected.\n",
				c, *p, s);
		BACK_UP;	/* Return the offending character to the stream */
		return NO;
	    }
	} else {
	    /* A template space swallows a run of white space */
	    do {
		c = NEXT_CHAR;
	    } while (WHITE(c));
	    BACK_UP;		/* The first non-blank goes back on the stream */
	}
	p++;
    }
    return YES;
}


/*	Read example text
**	-----------------
**
**	Starts the given style, then copies input to the output verbatim until
**	the terminator (matched after folding input to upper case) or the end
**	of the stream is found.  The terminator itself is not output; when it
**	is found the style is ended.  Always returns S_text.
**
**	Characters which could be the start of the terminator are held in a
**	buffer so that they can be output, in their original case, if they
**	turn out not to be the terminator.
**
**	A character that breaks a partial match is itself retried as the
**	start of the terminator, so "<</XMP>" ends at the second "<".  This
**	simple restart is sufficient only because the first character of the
**	terminators used here ("<") does not occur again inside them.
**
**	A partial match pending at end of stream is output rather than lost.
**
** On entry,
**	style		Style to use for the example text.
**	terminator	Upper case terminator; must be no longer than the
**			hold buffer (20 characters).
*/
static int parse_example(SGML_style * style, char * terminator)
{
    char * p = terminator;		/* Next terminator character expected */
    char buffer[20];			/* Input held as a possible terminator */
    int held = 0;			/* Number of characters in buffer */
    int i;
    char c;

    start_style(style);
    UPDATE_STYLE;
    for (;;) {
	if (END_OF_FILE) break;		/* stop at end of stream */
	c = NEXT_CHAR;

	if (held && (upper(c) != *p)) {	/* Partial match has failed: */
	    for (i = 0; i < held; i++) {	/* output what was held back */
		OUTPUT(buffer[i]);
	    }
	    held = 0;
	    p = terminator;		/* and look for the start again */
	}

	if (upper(c) == *p) {
	    buffer[held++] = c;
	    p++;
	    if (!*p) {
		end_style();
		return S_text;		/* Return: terminator found */
	    }
	} else {
	    OUTPUT(c);			/* 	Most common 99% path  */
	}
    }

    for (i = 0; i < held; i++) {	/* Incomplete terminator at end of stream */
	OUTPUT(buffer[i]);
    }
    return S_text;
}


/*	Read in an SGML Stream						readSGML:
**	----------------------
*/

/*	State machine to perform lexical analysis of SGML
**
**	This routine parses an SGML stream to produce styles, text and anchors.
**
**	This machine does not do good error recovery. It ignores tags which it doesn't
**	understand. It is a simple finite state machine with no push-down stack, and
**	therefore cannot (yet) understand nested constructs.
**
**	NON-REENTRANT.
**
** On entry,
**	sgmlStream	is open for read, and will provide the marked up data.
**	diagnostic	0 => Read and interpret
**			1 => Dump RTF into buffer as text.
** On exit,
**	return value	is self.
**	self		has anchors added which came up.
**			Is loaded if state returned is "done".
*/


/*	When a state has been found, we break out of the switch with this macro.
**	It is a macro to allow the code to be changed more easily (eg to return).
**	As it breaks out of the inner switch only, we must remember breaks after
**	that switch to get out of the next outer one, and so on.
*/
#ifdef NeXT
- readSGML: (NXStream *)stream diagnostic:(int)diagnostic
#else
int readSGML(HyperText * self, FILE * stream, int diagnostic)

#endif
/*	SETSTATE(x) records the next state and then executes `break'.  The
**	break leaves only the innermost enclosing switch or loop, so a nested
**	switch must itself be followed by a break (or by another SETSTATE).
*/
#define SETSTATE(x) {state=(x); break;}
{
    /*	Character-at-a-time state machine: each pass of the main loop reads
    **	one character (handlers may read more through NEXT_CHAR or check())
    **	and acts on it according to `state'.
    */
    enum state_enum state = S_column_1;

/*	Information to be accumulated:
*/
    char title[256];			/* See <TITLE> tag. */
    char reference[256];		/* See <A HREF=...> attribute */
    char anchor_name[256];		/* See <A NAME=...> attribute */
    int title_length = 0;
    int reference_length = 0;
    int anchor_name_length = 0;		/* See <A NAME=...> attribute */
    BOOL end_style_on_nl = NO;		/* For styles which only last a line (ugh!) */
    BOOL white_significant = NO;	/* Not free format */


/*	Set up global pointer for other routines
*/
    output_in_word = 0;		/* Flag: Last character output was non-white */
    HT = self;
    sgmlStream = stream;

/*	Pick up the styles we want from a local style sheet
*/
    get_styles();
    styleStack = 0;
    current_style = &Normal;

    if (TRACE) printf("Parsing SGML stream %p\n", (void *)sgmlStream);
    START_OUTPUT;
    set_style(Normal.style);		/* Was random! 910910 TBL */

    while(!END_OF_FILE && (state!=S_done)) {
        char c = NEXT_CHAR;
        if (c == (char)-1) {
            if (TRACE) printf("*** HT: -1 found on input stream not at EOF!\n");
            break;
        }
#ifdef CHARACTER_TRACE
        if(TRACE) printf("<%c>", c);
#endif
        switch (state) {

#ifdef REMOVE_SCRIPT
        case S_column_1:
            if (c=='.') {
                SETSTATE(S_dot);
            }
            BACK_UP;
            SETSTATE(S_text);

        case S_dot:				/* Dot in first column */
            if (WHITE(c)) {
                OUTPUT('.');
                BACK_UP;
                SETSTATE(S_text);		/* OOPS: must have been real "." */
            } else {
                SETSTATE(S_junk_script);	/* Throw away SCRIPT commands */
            }

        case S_junk_script:
            SETSTATE( (c=='\n')||(c==';') ? S_column_1
                                          : S_junk_script);

#endif
        case S_word:		/* We have just had non-white characters */
            if (c=='<') SETSTATE(S_tag_start);
            if (c=='&') goto rcdata;
            if (!WHITE(c)) {
                OUTPUT(c);
                break;
            }
            /* FALL THROUGH: white space ends the word */

        case S_text:		/* We are not in a tag or a word */
            switch(c) {
            case '<':	SETSTATE(S_tag_start);

/*	Special code for CERN SGML double newline significance: ugh!  :-(
*/
            case '\n':
                if (white_significant) {
                    output_paragraph();
                    output_in_word = 0;
                    SETSTATE(S_text);
                }

#ifdef CERN_WEIRDO				/* Obsolete 921122 */
                if (end_style_on_nl) {
                    end_style();
                    end_style_on_nl = NO;
                } else {
                    int newlines = 1;
                    while( (c=NEXT_CHAR)==10) {
                        newlines++;
                    }
                    if (newlines>1) {
                        output_paragraph(); /* n newlines becomes a paragraph.*/
                        output_in_word=0;
                    }
                    BACK_UP;		/* Go back and check c again */
                    SETSTATE(S_column_1);
                }
#else
                {
                    int newlines = 1;
                    while( (c=NEXT_CHAR)==10) {
                        newlines++;
                    }
                    BACK_UP;		/* Go back and check c again */
                    SETSTATE(S_column_1);
                }
#endif

            case '\t':
                UPDATE_STYLE;			/* Must be in new style */
                                                /* FALL THROUGH! */
            case ' ':
                OUTPUT(c);
                output_in_word = 0;
                SETSTATE(S_text);

            default:	   				/* New word */

                /* The character is non-white. Print a space if necessary. */
                UPDATE_STYLE;			/* Must be in new style */
                if (output_in_word) {
                    OUTPUT(' ');
                }
rcdata:
                if (c=='&') {			/* Entities */
                    c = NEXT_CHAR;
                    if (c=='#') {
                        /* Numeric reference.  This must be handled outside
                        ** the switch below: a SETSTATE inside it would only
                        ** leave that switch, and the state would then be
                        ** overwritten by the "bad entity" path.
                        */
                        char_num = 0;	/* initialize accumulation */
                        SETSTATE(S_char_ref);
                    }
                    switch (c) {
                        case 'a': if (check("AMP;")) { c = '&'; goto printable; }; break;
                        case 'l': if (check("LT;"))  { c = '<'; goto printable; }; break;
                        case 'g': if (check("GT;"))  { c = '>'; goto printable; }; break;
                        case 'q': if (check("QUOT;")){ c = '"'; goto printable; }; break;
                        default: break;
                    }
                    if (TRACE) fprintf(stderr, "HTML: Bad entity.\n");
                    SETSTATE(S_word);
                }
printable:
                OUTPUT(c);			/* First char of new word */
                output_in_word = 1;
                SETSTATE(S_word);		/* Now take rest of word faster */

            } /* switch(c) */
            break;

        case S_char_ref:
            if ((c>='0') && (c<='9')) {		/* Another decimal digit */
                char_num = char_num*10 + (c-'0');
                SETSTATE(S_char_ref);		/* Stay here; do not fall through */
            }
            /* c had better be a semicolon in fact: it is consumed */
            c = (char) char_num;
            goto printable;	/* always treat as non-blank @@@ bug? */

        case S_tag_start:
            switch (c) {
            case 'A':
            case 'a':	SETSTATE(S_tag_a);
            case 'd':
            case 'D':   SETSTATE(S_tag_d);
            case 'H':
            case 'h':	SETSTATE(S_tag_h);
            case 'i':
            case 'I':	SETSTATE(S_tag_i);
            case 'L':
            case 'l':	SETSTATE(S_tag_l);
            case 'n':
            case 'N':	SETSTATE(S_tag_n);
            case 'O':
            case 'o':	SETSTATE(S_tag_o);
            case 'p':
            case 'P':	SETSTATE(S_tag_p);
            case 'r':
            case 'R':	SETSTATE(check("RESTOFFILE")
                                ? S_restoffile:S_junk_tag)
            case 'T':
            case 't':	SETSTATE(check("TITLE>") ? S_title : S_junk_tag);
            case 'U':
            case 'u':	SETSTATE(S_tag_u);
            case 'X':
            case 'x':	SETSTATE( check("XMP>") ?
                          parse_example(&Example, "</XMP>")
                          : S_junk_tag);
            case '/':	SETSTATE( S_tag_end);
            default:	SETSTATE( S_junk_tag);
            } /*  switch on character */
            break;

        case S_tag_end:
            switch (c) {
            case 'A':
            case 'a':	SETSTATE(S_end_a);
            case 'D':
            case 'd':	SETSTATE(S_end_d);
            case 'H':
            case 'h':	SETSTATE(S_end_h);
            case 'I':
            case 'i':   if (check("ISINDEX")) isIndex = YES;
                        SETSTATE(S_junk_tag);
            case 'n':
            case 'N':	SETSTATE(check("NODE>") ? S_done : S_junk_tag)
            case 'O':
            case 'o':	SETSTATE(S_end_o);
            case 'P':
            case 'p':   if (check("PRE")) {
                            end_style();
                            white_significant = NO;
                        }
                        /* Any other </P...> is skipped to its own ">"; it
                        ** must not fall into the </U case below, which would
                        ** consume that ">" and then skip following text.
                        */
                        SETSTATE(S_junk_tag);
            case 'U':
            case 'u':	SETSTATE(S_end_u);
            default:	SETSTATE(S_junk_tag);
            } /*  switch on character */
            break;

        case S_junk_tag:	SETSTATE( (c=='>') ? S_text : S_junk_tag);

#ifdef CERN_WEIRDO
        case S_junk_line:	SETSTATE( (c=='\n') ? S_column_1 : S_junk_line);
#endif
        case S_tag_i:
            switch(c) {
#ifdef CERN_WEIRDO
            case '1':	SETSTATE(S_junk_line); /* Junk I1 */
#endif
            case 's':
            case 'S':	if (check("SINDEX")) isIndex = YES;
                        SETSTATE(S_junk_tag);
            default:	SETSTATE(S_junk_tag);
            }
            break;

        case S_tag_a:
            /* Any character not listed leaves the state unchanged */
            switch(c) {
            case 'd':
            case 'D':
                if (!check("DDRESS>")) { SETSTATE(S_junk_tag) };
                start_style(&addressStyle);
                SETSTATE( S_text);

            case '\n':
            case ' ':
            case '>':
                reference_length = 0;
                anchor_name_length = 0;
                SETSTATE(S_anchor);

            } /* switch on character */
            break;

        case S_tag_p:
            if ((c==' ') || (c=='>')) {
                output_paragraph();
                SETSTATE( c=='>'? S_text : S_junk_tag);
            }

            if ((c=='R') || (c=='r')) {		/* <PRE> */
                if (check("RE")) {
                    start_style(&Preformatted);
                    update_style();
                    white_significant = YES;
                    SETSTATE( S_junk_tag);
                }
            }
            if ((c=='L') || (c=='l')) {		/* OBSOLETE @@ */
                if (check("LAINTEXT>")) {
                    if (TRACE) printf("Loading as plain text\n");
                    [self readText:sgmlStream];	/* will read to end */
                    SETSTATE(S_done);		/* Inhibit RTF load */
                }
            }
            SETSTATE(S_junk_tag);

        case S_tag_lis:
            SETSTATE( check("TING>") ?
                      parse_example(&Listing, "</LISTING>")
                      : S_junk_tag);


/* Subnodes are delimited by <NODE>...</NODE>. They have the same address as the
** node, but the anchor IDs must be different. This is not thought out.	@@
** Perhaps a hierarchical anchor ID format ....
*/
        case S_tag_n:
            switch(c) {
            case 'o':
            case 'O':	if (check("ODE>")) {	/* Load a subnode */
                            if(TRACE)  printf("Loading subnode...NOT IMPLEMENTED\n");
#ifdef NOT_DEFINED
                            Anchor * a = [Anchor new];
                            HyperText * SN;
                            [a setAddress:[nodeAnchor address]];
                            SN = [HyperText newAnchor:a Server:server];
                            [alsoStore addObject:SN];
                            SN->storeWith = self;
                            [SN readSGML:sgmlStream diagnostic:diagnostic];
                            /* But leave it hidden from view for now. */
#endif
                        }
                        SETSTATE(S_text);

            case 'E':					/* <NE */
            case 'e':
                if (check("EXTID ")) {
                    int value = 0;
                    /* Within this loop the break inside SETSTATE leaves
                    ** only the loop; the SETSTATE after it leaves the switch.
                    */
                    for(;;){
                        c = NEXT_CHAR;
                        if ((c=='N') || (c=='n')) {
                            if (!check("N = ")) {
                                if (TRACE) fprintf(stderr,
                                        "HTML: Bad nextid\n");
                                SETSTATE(S_junk_tag);
                            }
                            c = NEXT_CHAR;
                        }
                        if (c=='"') continue;	/* 921122 */
                        if ((c>='a' && c<='z') || (c>='A' && c<='Z'))
                         continue; /* 930701 */
                        if ((c<'0') || (c>'9')) {
                            nextAnchorNumber = value;
                            if (TRACE) fprintf(stderr, "Next anchor number: %i\n", value);
                            BACK_UP;
                            SETSTATE(S_junk_tag);
                            break;
                        }
                        value = value*10 + (c-'0');
                    }
                }
                SETSTATE(S_junk_tag);

            } /* switch */
            break;

/*	Parse anchor tag:
**	----------------
*/
        case S_anchor:
            if (c==' ') SETSTATE( S_anchor);			/* Ignore spaces */
            if ((c=='H')||(c=='h')) {
                if (check("HREF = ")) {
                    SETSTATE( S_href);
                }
            }
            if ((c=='N')||(c=='n')) {
                if (check("NAME = ")) {
                    SETSTATE( S_aname);
                }
            }

            if (c=='>') {			/* Anchor tag is over */
                /* Should use appendStartAnchor! @@@ */
                HTStyle * style = HTStyleNew();
                char * parsed_address;
                int anchorNumber;

                reference[reference_length]=0;		/* Terminate it */
                anchor_name[anchor_name_length]=0;	/* Terminate it */

                style->anchor =
                  *anchor_name ? [Anchor newParent:nodeAnchor tag:anchor_name]
                              : [self anchor];

                /* If next anchor number not specified, ensure it is safe */

                if ((anchor_name[0] == ANCHOR_ID_PREFIX)
                &&  (sscanf(anchor_name+1, "%i", &anchorNumber) > 0))	/* numeric? */
                 if (anchorNumber >= nextAnchorNumber)
                  nextAnchorNumber = anchorNumber+1;		/* Prevent reuse */


                [(Anchor *)style->anchor isLastChild];	/* Put in correct order */
                if (*reference) {			/* Link only if href */
                    parsed_address = HTParse(reference, [nodeAnchor address],
                         PARSE_ALL);
                    [(Anchor *)(style->anchor) linkTo:
                                        [Anchor newAddress:parsed_address]];
                    free(parsed_address);
                }

                UPDATE_STYLE;
                SET_STYLE(style);		/* Start anchor here */
                free(style);
                SETSTATE(S_text);
            }
            printf("SGML: Bad attribute in anchor.\n");
            SETSTATE( S_junk_tag);

        case S_href:
            if (c=='"') SETSTATE (S_href_quoted);
            /* FALL THROUGH: first character of an unquoted value */
        case S_href_unquoted:
            if ((c==' ') || (c=='\n')) SETSTATE( S_anchor);
            if (c=='>'){
                BACK_UP;
                SETSTATE( S_anchor);
            }
            if (reference_length<255) {
                reference[reference_length++] = c;
            }
            SETSTATE(S_href_unquoted);

        case S_href_quoted:
            if (c=='"') SETSTATE( S_anchor);
            if (reference_length<255) {
                reference[reference_length++] = c;
            }
            SETSTATE( state);

        case S_aname:
            if ((c==' ') || (c=='\n')) SETSTATE( S_anchor);
            if (c=='>'){
                BACK_UP;
                SETSTATE( S_anchor);
            }
            if (c!='"')	/* 930701 */
             if (anchor_name_length<255) {
                anchor_name[anchor_name_length++] = c;
            }
            SETSTATE( state);

        case S_end_a:
            switch(c) {
            case 'd':					/* End address */
            case 'D':
                if (!check("DDRESS >")) SETSTATE(S_junk_tag);
                end_style();
                SETSTATE(S_text);

            case '>':					/* End anchor */
                {
                    [HT appendEndAnchor];
                    SETSTATE(S_text);
                }

            default: SETSTATE(S_junk_tag);
            } /* switch c */
            break;

/*	Parse glossary tags
**	-------------------
**
**	We allow <DL> </DL> but we do not allow <DL> (text) <DT> ...
*/
        case S_tag_d:
            /* Any character not listed leaves the state unchanged */
            switch(c) {
            case 'L':
            case 'l':				/* Start Definition list 	<DL> */
                (void) check("L> <");		/* Ignore first DT */
                c = NEXT_CHAR;
                if (c=='/') {
                    check("/DL>");
                } else {
                    (void) check("DT>");
                    start_style(&Glossary);
                }
                SETSTATE(S_text);
            case 'T':
            case 't':				/* Definition term 		<DT> */
                output_paragraph();
                SETSTATE(S_junk_tag);
            case 'D':
            case 'd':				/* Definition definition 	<DD> */
                OUTPUT('\t');
                SETSTATE(S_junk_tag);
            } /*switch c */
            break;

        case S_end_d:				/* end definition list 	</DL> */
            if ((c != 'l')&&(c!='L')) SETSTATE(S_junk_tag);
            end_style();
            SETSTATE(S_junk_tag);

/*	Parse highlighting and headers
**	------------------------------
**	@ All these formats should be nested, and should be defined by a style sheet.
*/
        case S_tag_h:
            switch (c) {
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
                    start_style(&Heading[c-'1']);
                    update_style();

#ifdef CERN_WEIRDO
                    end_style_on_nl = YES;		/* Style can end at line end */
#endif
                    SETSTATE( S_junk_tag);
            case 'P':
            case 'p':
                switch (c=NEXT_CHAR) {
                case '1':
                case '2':
                case '3':
                    start_highlighting(Highlighting[c-'1']);
                    SETSTATE( S_junk_tag);
                default: SETSTATE( S_junk_tag);
                }
                break;

            default: SETSTATE( S_junk_tag);
            } /* switch c */
            break;

        case S_end_h:
            switch (c) {
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
#ifdef CERN_WEIRDO
                    end_style_on_nl = NO;		/* That's over. */
#endif
                    end_style();
                    SETSTATE( S_junk_tag);
            case 'P':
            case 'p':
                switch (NEXT_CHAR) {
                case '1':
                case '2':
                case '3':
                    end_highlighting();
                    SETSTATE( S_junk_tag);
                default: SETSTATE( S_junk_tag);
                } /* switch */
                break;

            default: SETSTATE( S_junk_tag);

            } /* switch c */
            break;

/*	Parse Lists, ordered and unordered
**	----------------------------------
**
**	This only affects the horizontal line format, not the font.
*/
        case S_tag_o:
        case S_tag_u:

            if ((c == 'l') || (c=='L')) {
                (void) check("L> <LI>");	/* Ignore first LI after UL */
                start_style(&listStyle);
            }
            SETSTATE(S_text);

        case S_tag_l:
            switch(c) {
            case 'I':
            case 'i':				/* List element		<LI> */
                c = NEXT_CHAR;
                if ((c=='S') || (c=='s')) {	/* <LIS: expect <LISTING> */
                    SETSTATE(S_tag_lis);
                }
                output_paragraph();
                SETSTATE(S_text);

            default: SETSTATE(S_junk_tag);
            } /*switch c */
            break;

        case S_end_o:				/* end n list 		</OL> */
        case S_end_u:				/* end n list 		</UL> */
            if ((c != 'l')&&(c!='L')) SETSTATE(S_junk_tag);
            end_style();
            SETSTATE(S_junk_tag);

/*	Parse rest of file on another format
*/
        case S_restoffile:
            switch (c) {

            case ' ':
            case '\n':
            case '\t':
                break;

            case 'p':
            case 'P':
                if (check("PLAINTEXT>")) {
                    if (TRACE) printf("Loading as plain text\n");
                    start_style(&Example);
                    LOADPLAINTEXT;
                    SETSTATE(S_done);		/* ... */
                }
                break;		/* Not PLAINTEXT: do not go on to try RTF */

            case 'R':
            case 'r':
                if (check("RTF>")) {
                    if (TRACE) printf("Loading as RTF\n");
                    [self readRichText:sgmlStream];	/* will read to end */
                    [self adjustWindow];		/* Fix scrollers */
                    SETSTATE(S_done);		/* Inhibit RTF load */
                }

            }
            break;

/*	Parse <TITLE>..</TITLE>
*/
        case S_title:
            if (c=='<') {
                if (check("</TITLE>")) {
                    title[title_length]=0;		/* Add a terminator */
                    if (TRACE)printf("\nTitle:\t`%s'\n", title);
                    [[self window] setTitle:title];
                    SETSTATE( S_text);
                } else SETSTATE( S_junk_tag);	/* @@@ forgets < in titles! */
            } else {
                if (title_length < 255) title[title_length++] = c;
                SETSTATE( state);
            } /* if */
        case S_done:
            break;			/* Should never happen */

        } /* switch state */
    } /* while loop */

    if ((state!=S_text) && (state != S_done))
        if(TRACE) printf("*** Unfinished SGML file: Left in state %i\n", state);

    if (state != S_done) {
        OUTPUT('\n');	/* Ensure that the text always ends in \n for ScanALine */
        FINISH_OUTPUT;
    }

/*	Clean up any styles left nested
*/
    while (styleStack) {
        NestedStyle * N = styleStack;
        styleStack = N->next;
        if (TRACE) printf("HT: Left in style at end of document!\n");
        free(N);
    }

    [window setDocEdited:NO];
    tFlags.changeState = 0; 		/* Please notify delegate if changed */
    return self;

} /* readSGML:diagnostic: */


/*		Write SGML File back OUT
**		------------------------
**
**	This is currently quite NeXT-specific.
**
**	We run through the runs. When a characteristic of a run changes, we
**	output the appropriate SGML code. When several characteristics change at
**	the same place, we output the code in an order such that the resulting
**	structures will be nested. This means first unwrapping the old ones, and
**	then entering the new ones. For example, it is better to produce
**
**		<h2><a>...</a></h2><a>...</a>
**	than
**
**		<h2><a>...</h2></a><a>...</a>
**
**	The special treatment of newlines is because we want to strip extra newlines
**	out. We ignore newlines at the beginning and end of the para style,
**	and we treat multiple newlines as a single paragraph mark.
**
**	Bugs:	@@@ Highlighting is ignored.
**		@@@ end text is ignored.
*/

#define LINE_WRAP 64		/* Start thinking about line wrap here */

static int SGML_gen_newlines;	/* Number of newlines pending during SGML generation */
static int SGML_gen_errors;	/* Number of unrecognizable runs */
static SGML_style *currentSGML;	/* SGML style of the paragraph being generated */
static const char *saveName;	/* Name the node is being saved under */
static char *prefix;		/* Prefix string to be junked */
static int lineLength;		/* Characters on the current line so far */

/*	Find the SGML style which corresponds to a paragraph style
**
** On entry,
**	para		Paragraph style to look for; 0 means totally unstyled.
** On exit,
**	returns		The first styleTable entry whose HTStyle has this
**			paragraph style.  &Normal is returned for a null para,
**			and also when nothing matches; in the latter case
**			SGML_gen_errors is incremented as well.
*/
SGML_style * findSGML(void *para)
{
    int i;

    if (!para) return &Normal;			/* Totally unstyled becomes Normal */

    for (i=0; i<NUMBER_OF_STYLES; i++) {
	SGML_style * candidate = styleTable[i];
	HTStyle * style;

	if (!candidate) continue;
	style = candidate->style;
	if (style && style->paragraph == para) {
	    return candidate;
	}
    }

    if (TRACE) printf("HT: Can't find SGML style!\n");
    SGML_gen_errors++;
    return &Normal;
}


/*	This function generates the code for one run, given the previous run.
**
**	On entry:
**		last	the run written previously; only its info and
**			paraStyle fields are read.
**		r	the run to be written now; its info, paraStyle and
**			chars fields are read.
**
**	Output is written to sgmlStream. The characters of the run are taken
**	from the text input with NEXT_TEXT_CHAR, r->chars of them in all.
**	The file-level generation state (currentSGML, prefix, lineLength,
**	SGML_gen_newlines) is updated as a side effect.
**
**	Closing codes are written before opening ones so that the elements
**	nest: end anchor, end paragraph, start paragraph, start anchor.
*/
void change_run(NXRun *last, NXRun *r)
{
    int chars_left = r->chars;
       
    if (r->info != last->info) {			/* End anchor */
	if (last->info) NXPrintf(sgmlStream, "</A>");
    }
    
    if (r->paraStyle != last->paraStyle)
     if (last->paraStyle) {				/* End paragraph */
	if (currentSGML) NXPrintf(sgmlStream, "%s", currentSGML->end_tag);
	else NXPrintf(sgmlStream,"<P>\n");
	lineLength = 0;	 /* At column 1 */
    }

/*	The style is also resolved when currentSGML is still unset. That
**	happens when the very first run is unstyled (paraStyle 0, the same as
**	its dummy predecessor); the text loop below dereferences currentSGML
**	and would otherwise follow a null pointer.
*/
    if (r->paraStyle != last->paraStyle || !currentSGML) { /* Start paragraph */
	currentSGML = findSGML(r->paraStyle);
	if (currentSGML) {

	    if (currentSGML->free_format)
	     while(chars_left && WHITE(*read_pointer)) {/* Strip leading */
		(chars_left)--;				/*   white space */
		(void) NEXT_TEXT_CHAR;
	    }
	    NXPrintf(sgmlStream, "%s", currentSGML->start_tag);
	    prefix = currentSGML->start_text;
	} 
	SGML_gen_newlines=0;				/* Cancel  */
    }
    
    if (r->info != last->info) {			/* Start anchor */

	if (SGML_gen_newlines) {	/* Got anchor, need paragraph separator */
	    NXPrintf(sgmlStream, "%s", currentSGML->paragraph_tag);
	    SGML_gen_newlines=0;	/* paragraph flushed. */
	}
	if (r->info) {
	    Anchor * a = (Anchor *) r->info;
	    Anchor * d = [a destination];
	    NXPrintf(sgmlStream, "<A\nNAME=\"%s\"", [a address]);
	    if (d) {
	        Anchor * p = [d parent];
		Anchor * n = p ? p : d;		/* The node anchor */
	        char *absolute = HTParse([n address], [[HT nodeAnchor]address],
		  		PARSE_ALL);
		/* Express the destination relative to where we are saving */
		char * relative = HTRelative(absolute, saveName);
		if (!p)						/* Whole node */
		    NXPrintf(sgmlStream, " HREF=\"%s\"", relative);
	        else if (p == [HT nodeAnchor])			/* In same node */
		    NXPrintf(sgmlStream, " HREF=\"#%s\"", [d address]);
		else						/* In different node */
		    NXPrintf(sgmlStream, " HREF=\"%s#%s\"", relative, [d address]);
		free(relative);
		free(absolute);
	    }
	    NXPrintf(sgmlStream, ">");
	}
    }

/*	Now output the textual part of the run
**
**	Within the prefix region (prefix!=0), we discard white space and
**	characters matching *prefix++. Note the prefix string may contain white space.
**
**	The SGML_gen_newlines flag means that newlines have been found. They are
**	not actually implemented unless some more non-white text is found, so that
**	trailing newlines on the end of paragraphs are stripped.
**
**	The line wrapping is primitive in the extreme, as only text characters are
**	counted. In practice it limits the length of any line to a reasonable amount,
**	though this is not guaranteed.
*/
    {
	while (chars_left) {	
	    char c = NEXT_TEXT_CHAR;
	    chars_left--;
	    if (prefix) {
	        if (*prefix) {
		    if (c==*prefix) {
			++prefix;
			continue;			/* Strip prefix characters */
		    }
		    if (WHITE(c)) continue;			/* Strip white space */
		    if (TRACE) printf(
	   "HTML: WARNING: Paragraph prefix incomplete: %i found where %i expected.\n",
			     c, *prefix);
		    }
		prefix=0;				/* Prefix is over */
	    }
	    
	    if (c=='\n') {				/* Paragraph Marks:	*/
		if (currentSGML->free_format) {
		    SGML_gen_newlines++;		/* Just flag it */
		    prefix = currentSGML->paragraph_text;
	        } else {
		    NXPrintf(sgmlStream, "%s", currentSGML->paragraph_tag);
		}
		lineLength = 0;	 /* At column 1 */
		
	    } else {					/* Not newline */

		if (SGML_gen_newlines) {/* Got text, need paragraph separator */
		    NXPrintf(sgmlStream, "%s", currentSGML->paragraph_tag);
		    SGML_gen_newlines=0;	/* paragraph flushed. */
		    lineLength = 0;	 	/* At column 1 */
		}
		if (c=='\t') {
		    if (currentSGML) NXPrintf(sgmlStream, "%s", currentSGML->tab_tag);
		    else NXPrintf(sgmlStream, "\t");
		} else {				/* Not tab or newline */
		    /* Count each character once only, so that wrapping
		    ** starts at LINE_WRAP and not at half of it.
		    */
		    lineLength ++;	/* @@bug doesn't count entity names */
		    if ((currentSGML->free_format)
		    &&  (lineLength > LINE_WRAP)	/* Wrap lines if we can */
		    &&  (c==' ')) {
			    c = '\n';		/* Break line at this space */
			    lineLength = 0;
		    }
		    
		    if (currentSGML->litteral) {
		        NXPrintf(sgmlStream, "%c", c);
		    } else {
			switch(c) {		/* Escape SGML markup characters */
		    	case '<': 	NXPrintf(sgmlStream, "&lt;"); break;
		    	case '&': 	NXPrintf(sgmlStream, "&amp;"); break;
			default:	NXPrintf(sgmlStream, "%c", c); break;
			} /* switch */
		    } /* not litteral */
	        }
	    }
	}
    }
} /* change_run */



/*	This is the body of the SGML output method.
**
**	On entry:
**		stream	the stream the document is written to.
**		aName	the name the node is being saved under; it is passed
**			to HTRelative() when anchor HREFs are written.
**
**	On exit:
**		returns	self, or nil if any run had a paragraph style for
**			which no SGML style could be found. The document is
**			written in full in either case.
**
**	The file-level generation state is reset, a head section (TITLE and,
**	if any anchor numbers have been used, NEXTID) is written, and then
**	each run is passed to change_run() together with its predecessor.
**	A dummy run with no style and no anchor stands in before the first run
**	and after the last one, so that every element opened is also closed.
*/
- writeSGML:(NXStream *) stream relativeTo:(const char *)aName
{
    NXRun * r = theRuns->runs;
    int sor;				/* Character position of start of run */
    NXRun dummy;
    
    dummy.paraStyle = 0;
    dummy.info = 0;
    dummy.chars = 0;
    
    SGML_gen_newlines=0;		/* Number of newlines read but not inserted */    
    HT = self;
    saveName = aName;
    sgmlStream = stream;
    SGML_gen_errors = 0;
    currentSGML = 0;
    prefix = 0;				/* No prefix to junk */
    
    START_INPUT;
    lineLength = 0;			/* Starting in column 1 */
    		
    NXPrintf(stream, "<HTML>\n<HEAD>\n");
    NXPrintf(stream, "<TITLE>%s</TITLE>", [window title]);
    
    if (nextAnchorNumber) NXPrintf(stream, "\n<NEXTID N=\"z%i\">\n",
	 nextAnchorNumber);
    NXPrintf(stream, "</HEAD>\n");
    NXPrintf(stream, "<BODY>");

/*	Change style tags etc
**
**	Throughout the loop r is the last run written, so that on exit it is
**	the run whose anchor and paragraph still have to be closed.
*/
    change_run(&dummy, r);			/* Start first run */
    
    sor = r->chars;
    while (sor < textLength) {
	r++;					/* Move on to run starting at sor */
        if (TRACE) printf("%4i:  %i chars in run %3i.\n",
			sor, r->chars, (int)(r-theRuns->runs));
	change_run(r-1, r);	/* Runs 2 to N */
	sor = sor + r->chars;
    }
    change_run(r, &dummy);			/* Close last run */

    tFlags.changeState = 0; 		/* Please notify delegate if changed */
    NXPrintf(stream, "</BODY>\n</HTML>\n");

    return (SGML_gen_errors) ? nil : self;    
}
These are the contents of the former NiCE NeXT User Group NeXTSTEP/OpenStep software archive, currently hosted by Netfuture.ch.