/* This is ParseHTML.h in view mode; [Download] [Up] */
/* FORMAT CONVERSION FROM SGML
** ===========================
**
**
** 22 Nov 92 Fixed quoting of hrefs.
** CERN_WEIRDO ifdefed out -- proper SGML expected
** REMOVE_SCRIPT ifdefed out -- did ignore lines starting with "."
*/
#import "HTStyle.h"
extern HTStyleSheet * styleSheet;
#ifndef NEXT_CHAR
/* Fallback stream access on top of stdio
** --------------------------------------
**
** Only used when the including file has not already defined NEXT_CHAR.
**
** END_OF_FILE true once a read on sgmlStream has run into end of file.
** NEXT_CHAR reads the next character from sgmlStream.
** BACK_UP pushes the character held in the caller's variable `c'
** back onto sgmlStream. Every user of BACK_UP in this
** file keeps the character it last read in a variable
** called `c', which is what ungetc() needs as its first
** argument.
*/
static FILE * sgmlStream;
#define END_OF_FILE feof(sgmlStream)
#define NEXT_CHAR getc(sgmlStream)
#define BACK_UP ungetc(c, sgmlStream)
#endif
/* Convert an ASCII lower-case letter to upper case; any other value is
** returned unchanged. The argument is fully parenthesized so that compound
** expressions expand correctly, but it is still evaluated more than once:
** do not pass an expression with side effects (such as NEXT_CHAR).
*/
#define upper(c) ( (((c)>='a')&&((c)<='z')) ? (char)((int)(c)-32) : (c) )
/* States of the SGML lexical analyser
** -----------------------------------
**
** The numeric values follow declaration order and are printed in trace
** output, so new states must not be inserted casually. When REMOVE_SCRIPT
** is not defined, S_column_1 is simply another name for S_text.
*/
enum state_enum {
    S_text,             /* Not inside a tag */
#ifdef REMOVE_SCRIPT
    S_column_1,         /* As S_text, but at the first character of an input line */
    S_dot,              /* A dot has been seen in the first column */
    S_junk_script,      /* Ignore everything until newline or ";" */
#else
#define S_column_1 S_text
#endif
    S_word,             /* A non-white printable has just been seen */
    S_char_ref,         /* Numeric character reference */
    S_tag_start,        /* "<" has just been seen */
    S_tag_h,
    S_tag_a,
    S_end_a,
    S_tag_d,
    S_end_d,
    S_tag_i,
    S_tag_l,
    S_tag_lis,
    S_tag_n,
    S_tag_o,
    S_end_o,
    S_tag_p,
    S_tag_u,
    S_end_u,
    S_tag_end,          /* "</" has just been seen */
    S_restoffile,
    S_end_h,
    S_title,
    S_anchor,
    S_href,
    S_href_quoted,
    S_href_unquoted,
    S_aname,
    S_junk_tag,         /* Ignore everything until ">" */
#ifdef CERN_WEIRDO
    S_junk_line,        /* Ignore everything until "\n" */
#endif
    S_done
};
/* Description of one SGML paragraph style.
**
** The *_tag members are the markup written when a document is saved as SGML;
** the *_text members are literal text that conventionally accompanies the
** style in the formatted document. The static style definitions in this file
** use positional initializers, so the order of the members must not change.
*/
typedef struct _SGML_style {
char * start_tag; /* Tag to mark start of a style */
char * paragraph_tag; /* Tag to mark paragraph mark within style */
char * tab_tag; /* Tag to mark tab within style */
char * end_tag; /* Tag to mark end of style */
char * start_text; /* Text conventionally starting this style */
char * paragraph_text; /* Text used as a paragraph mark within style*/
char * end_text; /* Text used to end a style */
HTStyle * style; /* Paragraph style to be used */
int free_format; /* Flag: are line ends word breaks only? */
int litteral; /* Flag: end only at close tag (cheat) ? */
} SGML_style;
/* Stack of previous styles:
**
** A singly linked list used as a stack. start_style() pushes a node and
** end_style() pops one, returning to the style recorded in the node below
** (or to Normal when the stack becomes empty).
*/
typedef struct _NestedStyle {
struct _NestedStyle * next; /* previously nested style or 0 */
SGML_style * SGML; /* SGML style interrupted */
} NestedStyle;
/* MODULE-WIDE DATA
**
**
*/
/* We delay changing style until necessary to avoid dummy style changes
** resulting in too many extra newlines: change_style() only records the
** wanted style in next_style, and update_style() performs the switch when
** output is actually about to be produced.
*/
static SGML_style * current_style; /* The current output style */
static SGML_style * next_style; /* The next style to go into */
static NestedStyle * styleStack; /* Top of the stack of nested styles, or 0 */
static int output_in_word; /* Flag: Last character output was non-white */
/* Current value of numeric character reference. Declared with an explicit
** type: implicit int is not valid in C99 and later.
*/
static int char_num;
/* Paragraph Styles used by the SGML parser:
** ----------------------------------------
**
** Initializers are positional, in SGML_style member order: start_tag,
** paragraph_tag, tab_tag, end_tag, start_text, paragraph_text, end_text,
** style, free_format, litteral. The style pointer starts as 0 and is
** filled in by get_styles().
*/
static SGML_style Normal =
{ "", "<P>\n", "\t", "",
"","", "", 0 ,1, 0};
/* One entry per heading level; Heading[0] corresponds to <H1>. */
static SGML_style Heading[6] = {
{ "\n<H1>", "</H1>\n<H1>", "\t", "</H1>", "", "", "", 0, 1, 0},
{ "\n<H2>", "</H2>\n<H2>", "\t", "</H2>", "", "", "", 0, 1, 0},
{ "\n<H3>", "</H3>\n<H3>", "\t", "</H3>", "", "", "", 0, 1, 0},
{ "\n<H4>", "</H4>\n<H4>", "\t", "</H4>", "", "", "", 0, 1, 0},
{ "\n<H5>", "</H5>\n<H5>", "\t", "</H5>", "", "", "", 0, 1, 0},
{ "\n<H6>", "</H6>\n<H6>", "\t", "</H6>", "", "", "", 0, 1, 0}
};
/* Glossary (definition list) style. A paragraph mark is written as <DT> and
** a tab as <DD>. All ten members are initialized explicitly, like every
** other style in this file; the final member (litteral) is 0.
*/
static SGML_style Glossary = /* Large hanging indent with tab */
{ "\n<DL>\n<DT>", "\n<DT>", "\n<DD>", "\n</DL>\n",
"", "", "", 0, 1, 0};
/* List style. start_text and paragraph_text are character \267 followed by a
** tab, i.e. the text that conventionally begins each list item in the
** formatted document (presumably a bullet in the NeXT character set --
** TODO confirm).
*/
static SGML_style listStyle = /* Hanging indent with tab */
{ "\n<UL>\n<LI>", "\n<LI>", "\t", "\n</UL>",
"\267\t", "\267\t", "", 0, 1, 0};
/* Address style: free format, paragraphs separated by <P>. */
static SGML_style addressStyle =
{ "\n<ADDRESS>", "<P>", "\t", "\n</ADDRESS>",
"", "", "", 0, 1, 0 };
/* Explicit format styles:
**
** Example and Listing are not free format and are flagged litteral: their
** text ends only at the closing tag. Preformatted is not free format either
** but is not litteral. Fixed is free format.
*/
static SGML_style Example = /* Fixed width font, at least 80 chars wide */
{ "\n<XMP>", "\n", "\t", "</XMP>",
"", "", "", 0 , 0, 1};
static SGML_style Preformatted = /* Fixed width font, at least 80 chars wide */
{ "\n<PRE>", "\n", "\t", "</PRE>",
"", "", "", 0 , 0, 0}; /* not litteral */
static SGML_style Fixed = /* Fixed width font, at least 80 chars wide */
{ "\n<FIXED>", "<P>", "\t", "</FIXED>",
"", "", "", 0 , 1, 0};
static SGML_style Listing = /* Fixed width font, at least 80 chars wide */
{ "\n<LISTING>", "\n", "\t", "</LISTING>",
"", "", "", 0 , 0, 1};
/* Table of all possible SGML paragraph styles
**
** findSGML() searches this table in order and returns the first entry whose
** display style uses the paragraph style being looked for. Entries whose
** style pointer is still 0 are skipped by that search.
*/
static SGML_style * styleTable[] = {
&Normal, &Heading[0], &Heading[1], &Heading[2],
&Heading[3], &Heading[4], &Heading[5],
&Glossary, &listStyle, &addressStyle, &Preformatted, &Fixed, &Example, &Listing
}; /* style table */
/* Number of entries in styleTable, computed at compile time */
#define NUMBER_OF_STYLES (sizeof(styleTable)/sizeof(styleTable[0]))
/* Highlighting styles
**
** Filled in by get_styles() with the styles named "Italic", "Bold" and
** "Bold-Italic", in that order.
*/
static HTStyle * Highlighting[3];
/* F U N C T I O N S
*/
/* Get Styles from style sheet
** ---------------------------
*/
void get_styles()
{
Normal.style = HTStyleNamed(styleSheet, "Normal");
Heading[0].style = HTStyleNamed(styleSheet, "Heading1");
Heading[1].style = HTStyleNamed(styleSheet, "Heading2");
Heading[2].style = HTStyleNamed(styleSheet, "Heading3");
Heading[3].style = HTStyleNamed(styleSheet, "Heading4");
Heading[4].style = HTStyleNamed(styleSheet, "Heading5");
Heading[5].style = HTStyleNamed(styleSheet, "Heading6");
Glossary.style = HTStyleNamed(styleSheet, "Glossary");
listStyle.style = HTStyleNamed(styleSheet, "List");
addressStyle.style= HTStyleNamed(styleSheet, "Address");
Example.style = HTStyleNamed(styleSheet, "Example");
Preformatted.style = HTStyleNamed(styleSheet, "Example");
Listing.style = HTStyleNamed(styleSheet, "Listing");
Highlighting[0] = HTStyleNamed(styleSheet, "Italic");
Highlighting[1] = HTStyleNamed(styleSheet, "Bold");
Highlighting[2] = HTStyleNamed(styleSheet, "Bold-Italic");
}
/* Output a paragraph mark in the current style
** --------------------------------------------
**
** Emits a number of newlines approximating the style's space before plus
** space after, measured in line heights, plus one; then emits the style's
** paragraph_text and clears the in-word flag.
**
** The current style may have no display style (its style pointer stays 0
** until get_styles() has set it). In that case, or when no usable line
** height is available, a single newline is emitted instead of dereferencing
** a null pointer or dividing by zero. update_style() applies the same kind
** of guard.
*/
void output_paragraph()
{
    HTStyle * s = current_style->style;
    int newlines = 1;           /* The "+1" alone when spacing is unknown */
    int i;

    if (s && s->paragraph && s->paragraph->lineHt > 0)
        newlines = ((s->spaceBefore+s->spaceAfter) / s->paragraph->lineHt) + 1;

    for(i=0; i<newlines; i++) OUTPUT('\n'); /* Rather approximate! @@ */
    OUTPUTS(current_style->paragraph_text);
    output_in_word = 0;
}
/* Switch SGML paragraph style (finishing the old one)
**
** The "formatted" flag allows us to add a paragraph end at the end of a
** normal style (such as <H1> etc) but suppresses this for litteral text
** styles such as <XMP> and <LISTING which have explicit paragraph end.
** Thus, ALL text between <XMP> tags is litteral, and no newline results
** from going in and out of <XMP> sections.
**
** Now, we allow only the larger of the space before/space after
** requirements, as that is nearer what is meant.
*/
void update_style()
{
HTStyle * cur = current_style->style;
HTStyle * next = next_style->style;
OUTPUTS(current_style->end_text);
if (current_style->free_format && cur && next) { /* generate new lines */
int i;
float space = cur->spaceAfter > next->spaceBefore ?
cur->spaceAfter : next->spaceBefore; /* max */
int newlines = (space/cur->paragraph->lineHt) + 1;
output_in_word = 0;
for(i=0; i<newlines; i++) OUTPUT('\n'); /* Rather approximate! */
}
current_style = next_style;
if (current_style->style) SET_STYLE(current_style->style);
OUTPUTS(current_style->start_text);
}
#define UPDATE_STYLE {if (current_style!=next_style) update_style();}
/* Remember that we will be going into style s
** -------------------------------------------
**
** Nothing is output here: the style is only recorded as the pending one.
** The switch itself happens in update_style().
*/
void change_style(SGML_style * s)
{
    next_style = s;
}
/* End an SGML style
**
** Pops the innermost nested style and makes the style below it the pending
** one, or Normal when nothing is left on the stack. Ending a style when the
** stack is already empty is reported in trace mode and also selects Normal.
*/
void end_style()
{
    NestedStyle * top = styleStack;

    if (!top) {
        if (TRACE) printf("HTML: We have ended more styles than we have started!\n");
        change_style(&Normal); /* Note there is no nesting! */
        return;
    }

    styleStack = top->next;
    free(top);
    change_style(styleStack ? styleStack->SGML : &Normal);
}
/* Start a nested SGML style
**
** Pushes s on the stack of nested styles and makes it the pending style.
**
** If no memory is available for the stack node, the style is still selected
** but is not recorded on the stack, rather than writing through a null
** pointer. The matching end_style() will then pop whatever is below.
*/
void start_style(SGML_style * s)
{
    NestedStyle * N = malloc(sizeof(*N));

    if (N) {
        N->next = styleStack;
        N->SGML = s;
        styleStack = N;
    } else if (TRACE) {
        printf("HTML: Out of memory, style nesting not recorded!\n");
    }
    change_style(s);
}
/* Start a highlighted area
** ------------------------
**
** Placeholder: the body contains no executable code, so the style argument
** is currently ignored and calling this has no effect.
*/
void start_highlighting(HTStyle * style)
{
/* SET_STYLE(style); @@@ to be fixed up */
}
/* End a highlighted area
** ----------------------
**
** Placeholder: the body contains no executable code, so calling this has no
** effect.
*/
void end_highlighting()
{
/* @@@@@@ Need set and unset style functions, traits and nesting */
}
/* Check keyword syntax
** ---------------------
**
** This function is called when there is only one thing it can be.
** The check is case-insensitive.
**
** On entry,
** s Points to a template string in uppercase, with a space
** standing for any amount of space in the input stream.
** THE FIRST CHARACTER HAS ALREADY BEEN READ AND CHECKED.
** On exit,
** returns YES if matched, all maching characters gobbled up;
** NO if failure, only matching characters gobbled up.
*/
static BOOL check(char *s)
{
char * p = s+1; /* Pointer to template string */
char c; /* Character from stream */
for (; *p; p++) {
if (*p == ' ') {
for(c=NEXT_CHAR; WHITE(c) ;c=NEXT_CHAR) /*null*/ ;
BACK_UP; /* Put non-blank back into stream */
} else {
c = NEXT_CHAR;
if (upper(c) != *p) {
printf("SGML parse: `%c' found when `%c' in `%s' was expected.\n",
c, *p, s);
BACK_UP; /* Put eroneous character back on stream */
return NO; /* failed: syntax error */
} /* bad char */
} /* non-blank */
} /* for */
return YES; /* succeded: go to end of template string */
}
/* Read example text
** -----------------
**
** Enters the given style and copies input to the output verbatim until the
** terminator (compared case-insensitively against the input) or end of file
** is found. When the terminator is found, the style is ended.
**
** As we are looking for a terminator, input which could be the start of the
** terminator is held back in a buffer so that it can be put into the output
** if it turns out not to be one. The held-back input itself is output,
** rather than the terminator text, so that its upper/lower case is kept.
**
** Fixes relative to the earlier version:
**  - the character that breaks a partial match is now retested as the
**    possible first character of the terminator, so "<</XMP>" is recognised;
**  - input held back when end of file is reached is output, not dropped.
**
** On entry,
** style Style in which the example text is to be shown.
** terminator Upper-case closing tag. It must be non-empty and shorter
** than the hold-back buffer, and its first character must not
** occur again inside it (true for "</XMP>" and "</LISTING>"),
** because matching restarts only at the current character.
** On exit,
** returns S_text in all cases.
*/
static int parse_example(SGML_style * style, char * terminator)
{
    char buffer[20];            /* Input matching the terminator so far */
    int matched = 0;            /* Number of characters held in buffer */
    int i;
    char c;

    start_style(style);
    UPDATE_STYLE;

    while (!END_OF_FILE) {
        c = NEXT_CHAR;
        if (upper(c) != terminator[matched]) {
            /* Not the terminator after all: release what was held back */
            for (i = 0; i < matched; i++) {
                OUTPUT(buffer[i]);
            }
            matched = 0;
            if (upper(c) != terminator[0]) {
#ifdef JUNK
                if (c != 10) {
                    OUTPUT(c);          /* Most common 99% path */
                } else {
                    output_paragraph(); /* @@ gives space_before and after */
                }
#else
                OUTPUT(c);              /* Most common 99% path */
#endif
                continue;
            }
            /* c may itself begin the terminator: hold it back below */
        }
        buffer[matched++] = c;
        if (!terminator[matched]) {
            end_style();
            return S_text;              /* Return: terminator found */
        }
    }

    /* End of stream inside a possible terminator: it was ordinary text */
    for (i = 0; i < matched; i++) {
        OUTPUT(buffer[i]);
    }
    return S_text;                      /* Return: end of stream */
}
/* Read in an SGML Stream                                       readSGML:
** ----------------------
**
** State machine to perform lexical analysis of SGML
**
** This routine parses an SGML stream to produce styles, text and anchors.
**
** This machine does not do good error recovery. It ignores tags which it
** doesn't understand. It is a simple finite state machine with no push-down
** stack, and therefore cannot (yet) understand nested constructs.
**
** NON-REENTRANT: it uses the module-wide parser variables.
**
** On entry,
** stream is open for read, and will provide the marked up data.
** diagnostic is not used by the code below.
** On exit,
** return value is self.
** self has anchors added which came up.
**
** Corrections made to the state machine:
**  - the pending style is reset to Normal at the start of each document, so
**    the first style update does not use a null or stale next_style;
**  - a numeric character reference ("&#65;") now really enters S_char_ref:
**    previously the state was immediately overwritten by the bad-entity path;
**  - digits of a character reference are recognised as '0'..'9' (the test
**    used the values 0..9) and no longer fall through into S_tag_start;
**  - an end tag starting "</P" which is not "</PRE" is junked instead of
**    falling through into the "</U" handling;
**  - a failed PLAINTEXT check no longer falls through into the RTF check;
**  - "<listing>" is recognised in lower case as well as upper case;
**  - the stream pointer is traced with %p rather than %i.
*/

/* When a state has been found, we break out of the switch with this macro.
** It is a macro to allow the code to be changed more easily (eg to return).
** As it breaks out of the innermost enclosing switch or loop only, we must
** remember breaks after that switch to get out of the next outer one, and
** so on.
*/
#ifdef NeXT
- readSGML: (NXStream *)stream diagnostic:(int)diagnostic
#else
int readSGML(HyperText * self, FILE * stream, int diagnostic)
#endif
#define SETSTATE(x) {state=(x); break;}
{
    enum state_enum state = S_column_1;

    /* Information to be accumulated:
    */
    char title[256];                    /* See <TITLE> tag. */
    char reference[256];                /* See <A HREF=...> attribute */
    char anchor_name[256];              /* See <A NAME=...> attribute */
    int title_length = 0;
    int reference_length = 0;
    int anchor_name_length = 0;         /* See <A NAME=...> attribute */
    BOOL end_style_on_nl = NO;          /* For styles which only last a line (ugh!) */
    BOOL white_significant = NO;        /* Not free format */

    /* Set up global pointer for other routines
    */
    output_in_word = 0;                 /* Flag: Last character output was non-white */
    HT = self;
    sgmlStream = stream;

    /* Pick up the styles we want from a local style sheet
    */
    get_styles();
    styleStack = 0;
    current_style = &Normal;
    next_style = &Normal;               /* No style change pending yet */

    if (TRACE) printf("Parsing SGML stream %p\n", (void *)sgmlStream);
    START_OUTPUT;
    set_style(Normal.style);            /* Was random! 910910 TBL */

    while(!END_OF_FILE && (state!=S_done)) {
        char c = NEXT_CHAR;
        if (c == (char)-1) {
            if (TRACE) printf("*** HT: -1 found on input stream not at EOF!\n");
            break;
        }
#ifdef CHARACTER_TRACE
        if(TRACE) printf("<%c>", c);
#endif
        switch (state) {

#ifdef REMOVE_SCRIPT
        case S_column_1:
            if (c=='.') {
                SETSTATE(S_dot);
            }
            BACK_UP;
            SETSTATE(S_text);

        case S_dot:                     /* Dot in first column */
            if (WHITE(c)) {
                OUTPUT('.');
                BACK_UP;
                SETSTATE(S_text);       /* OOPS: must have been real "." */
            } else {
                SETSTATE(S_junk_script); /* Throw away SCRIPT commands */
            }

        case S_junk_script:
            SETSTATE( (c=='\n')||(c==';') ? S_column_1
                                          : S_junk_script);
#endif

        case S_word:                    /* We have just had non-white characters */
            if (c=='<') SETSTATE(S_tag_start);
            if (c=='&') goto rcdata;
            if (!WHITE(c)) {
                OUTPUT(c);
                break;
            }
            /* White space ends the word: FALL THROUGH to S_text */

        case S_text:                    /* We are not in a tag or a word */
            switch(c) {
            case '<': SETSTATE(S_tag_start);

            /* Special code for CERN SGML double newline significance: ugh! :-(
            */
            case '\n':
                if (white_significant) {
                    output_paragraph();
                    output_in_word = 0;
                    SETSTATE(S_text);
                }
#ifdef CERN_WEIRDO /* Obsolete 921122 */
                if (end_style_on_nl) {
                    end_style();
                    end_style_on_nl = NO;
                } else {
                    int newlines = 1;
                    while( (c=NEXT_CHAR)==10) {
                        newlines++;
                    }
                    if (newlines>1) {
                        output_paragraph(); /* n newlines becomes a paragraph.*/
                        output_in_word=0;
                    }
                    BACK_UP;            /* Go back and check c again */
                    SETSTATE(S_column_1);
                }
#else
                {
                    /* Any run of newlines counts as a single word break */
                    while( (c=NEXT_CHAR)==10) {
                        /* skip */
                    }
                    BACK_UP;            /* Go back and check c again */
                    SETSTATE(S_column_1);
                }
#endif

            case '\t':
                UPDATE_STYLE;           /* Must be in new style */
                /* FALL THROUGH! */

            case ' ':
                OUTPUT(c);
                output_in_word = 0;
                SETSTATE(S_text);

            default:                    /* New word */
                /* The character is non-white. Print a space if necessary. */
                UPDATE_STYLE;           /* Must be in new style */
                if (output_in_word) {
                    OUTPUT(' ');
                }
            rcdata:
                if (c=='&') {           /* Entities */
                    c = NEXT_CHAR;
                    if (c=='#') {       /* Numeric character reference */
                        char_num = 0;   /* initialize accumulation */
                        /* Leaves the switch on c of S_text directly, so that
                        ** the bad-entity path below cannot reset the state. */
                        SETSTATE(S_char_ref);
                    }
                    switch (c) {
                    case 'a': if (check("AMP;")) { c = '&'; goto printable; }; break;
                    case 'l': if (check("LT;")) { c = '<'; goto printable; }; break;
                    case 'g': if (check("GT;")) { c = '>'; goto printable; }; break;
                    case 'q': if (check("QUOT;")){ c = '"'; goto printable; }; break;
                    default: break;
                    }
                    if (TRACE) fprintf(stderr, "HTML: Bad entity.\n");
                    SETSTATE(S_word);
                }
            printable:
                OUTPUT(c);              /* First char of new word */
                output_in_word = 1;
                SETSTATE(S_word);       /* Now take rest of word faster */
            } /* switch(c) */
            break;

        case S_char_ref:
            if ((c>='0') && (c<='9')) {
                char_num = char_num*10 + (c-'0');
                SETSTATE(S_char_ref);   /* Keep accumulating digits */
            }
            /* c had better be a semicolon in fact */
            c = (char) char_num;
            goto printable;             /* always treat as non-blank @@@ bug? */

        case S_tag_start:
            switch (c) {
            case 'A':
            case 'a': SETSTATE(S_tag_a);
            case 'd':
            case 'D': SETSTATE(S_tag_d);
            case 'H':
            case 'h': SETSTATE(S_tag_h);
            case 'i':
            case 'I': SETSTATE(S_tag_i);
            case 'L':
            case 'l': SETSTATE(S_tag_l);
            case 'n':
            case 'N': SETSTATE(S_tag_n);
            case 'O':
            case 'o': SETSTATE(S_tag_o);
            case 'p':
            case 'P': SETSTATE(S_tag_p);
            case 'r':
            case 'R': SETSTATE(check("RESTOFFILE")
                                ? S_restoffile:S_junk_tag);
            case 'T':
            case 't': SETSTATE(check("TITLE>") ? S_title : S_junk_tag);
            case 'U':
            case 'u': SETSTATE(S_tag_u);
            case 'X':
            case 'x': SETSTATE( check("XMP>") ?
                                parse_example(&Example, "</XMP>")
                                : S_junk_tag);
            case '/': SETSTATE( S_tag_end);
            default: SETSTATE( S_junk_tag);
            } /* switch on character */
            break;

        case S_tag_end:
            switch (c) {
            case 'A':
            case 'a': SETSTATE(S_end_a);
            case 'D':
            case 'd': SETSTATE(S_end_d);
            case 'H':
            case 'h': SETSTATE(S_end_h);
            case 'I':
            case 'i': if (check("ISINDEX")) isIndex = YES;
                SETSTATE(S_junk_tag);
            case 'n':
            case 'N': SETSTATE(check("NODE>") ? S_done : S_junk_tag);
            case 'O':
            case 'o': SETSTATE(S_end_o);
            case 'P':
            case 'p': if (check("PRE")) {
                    end_style();
                    white_significant = NO;
                }
                /* </PRE or not, the rest of the tag is skipped */
                SETSTATE(S_junk_tag);
            case 'U':
            case 'u': SETSTATE(S_end_u);
            default: SETSTATE(S_junk_tag);
            } /* switch on character */
            break;

        case S_junk_tag: SETSTATE( (c=='>') ? S_text : S_junk_tag);

#ifdef CERN_WEIRDO
        case S_junk_line: SETSTATE( (c=='\n') ? S_column_1 : S_junk_line);
#endif

        case S_tag_i:
            switch(c) {
#ifdef CERN_WEIRDO
            case '1': SETSTATE(S_junk_line); /* Junk I1 */
#endif
            case 's':
            case 'S': if (check("SINDEX")) isIndex = YES;
                SETSTATE(S_junk_tag);
            default: SETSTATE(S_junk_tag);
            }
            break;

        case S_tag_a:
            switch(c) {
            case 'd':
            case 'D':
                if (!check("DDRESS>")) SETSTATE(S_junk_tag);
                start_style(&addressStyle);
                SETSTATE( S_text);
            case '\n':
            case ' ':
            case '>':
                reference_length = 0;
                anchor_name_length = 0;
                SETSTATE(S_anchor);
            } /* switch on character */
            break;

        case S_tag_p:
            if ((c==' ') || (c=='>')) {
                output_paragraph();
                SETSTATE( c=='>'? S_text : S_junk_tag);
            }
            if ((c=='R') || (c=='r')) { /* <PRE> */
                if (check("RE")) {
                    start_style(&Preformatted);
                    update_style();
                    white_significant = YES;
                    SETSTATE( S_junk_tag);
                }
            }
            if ((c=='L') || (c=='l')) { /* OBSOLETE @@ */
                if (check("LAINTEXT>")) {
                    if (TRACE) printf("Loading as plain text\n");
                    [self readText:sgmlStream]; /* will read to end */
                    SETSTATE(S_done);   /* Inhibit RTF load */
                }
            }
            SETSTATE(S_junk_tag);

        case S_tag_lis:
            SETSTATE( check("TING>") ?
                      parse_example(&Listing, "</LISTING>")
                      : S_junk_tag);

        /* Subnodes are delimited by <NODE>...</NODE>. They have the same
        ** address as the node, but the anchor IDs must be different. This is
        ** not thought out. @@ Perhaps a hierarchical anchor ID format ....
        */
        case S_tag_n:
            switch(c) {
            case 'o':
            case 'O': if (check("ODE>")) { /* Load a subnode */
                    if(TRACE) printf("Loading subnode...NOT IMPLEMENTED\n");
#ifdef NOT_DEFINED
                    Anchor * a = [Anchor new];
                    HyperText * SN;
                    [a setAddress:[nodeAnchor address]];
                    SN = [HyperText newAnchor:a Server:server];
                    [alsoStore addObject:SN];
                    SN->storeWith = self;
                    [SN readSGML:sgmlStream diagnostic:diagnostic];
                    /* But leave it hidden from view for now. */
#endif
                }
                SETSTATE(S_text);

            case 'E':                   /* <NE */
            case 'e':
                if (check("EXTID ")) {
                    int value = 0;
                    /* Inside this loop SETSTATE leaves the loop, not the
                    ** switch; the state is set again after the loop. */
                    for(;;){
                        c = NEXT_CHAR;
                        if ((c=='N') || (c=='n')) {
                            if (!check("N = ")) {
                                if (TRACE) fprintf(stderr,
                                                   "HTML: Bad nextid\n");
                                SETSTATE(S_junk_tag);
                            }
                            c = NEXT_CHAR;
                        }
                        if (c=='"') continue; /* 921122 */
                        if ((c>='a' && c<='z') || (c>='A' && c<='Z'))
                            continue;   /* 930701 */
                        if ((c<'0') || (c>'9')) {
                            nextAnchorNumber = value;
                            if (TRACE) fprintf(stderr, "Next anchor number: %i\n", value);
                            BACK_UP;
                            SETSTATE(S_junk_tag);
                        }
                        value = value*10 + (c-'0');
                    }
                }
                SETSTATE(S_junk_tag);
            } /* switch */
            break;

        /* Parse anchor tag:
        ** ----------------
        */
        case S_anchor:
            if (c==' ') SETSTATE( S_anchor); /* Ignore spaces */
            if ((c=='H')||(c=='h')) {
                if (check("HREF = ")) {
                    SETSTATE( S_href);
                }
            }
            if ((c=='N')||(c=='n')) {
                if (check("NAME = ")) {
                    SETSTATE( S_aname);
                }
            }
            if (c=='>') {               /* Anchor tag is over */
                /* Should use appendStartAnchor! @@@ */
                HTStyle * style = HTStyleNew();
                char * parsed_address;
                int anchorNumber;

                reference[reference_length]=0;          /* Terminate it */
                anchor_name[anchor_name_length]=0;      /* Terminate it */
                style->anchor =
                    *anchor_name ? [Anchor newParent:nodeAnchor tag:anchor_name]
                                 : [self anchor];

                /* If next anchor number not specified, ensure it is safe */
                if ((anchor_name[0] == ANCHOR_ID_PREFIX)
                    && (sscanf(anchor_name+1, "%i", &anchorNumber) > 0)) /* numeric? */
                    if (anchorNumber >= nextAnchorNumber)
                        nextAnchorNumber = anchorNumber+1; /* Prevent reuse */

                [(Anchor *)style->anchor isLastChild];  /* Put in correct order */
                if (*reference) {       /* Link only if href */
                    parsed_address = HTParse(reference, [nodeAnchor address],
                                             PARSE_ALL);
                    [(Anchor *)(style->anchor) linkTo:
                        [Anchor newAddress:parsed_address]];
                    free(parsed_address);
                }
                UPDATE_STYLE;
                SET_STYLE(style);       /* Start anchor here */
                free(style);
                SETSTATE(S_text);
            }
            printf("SGML: Bad attribute in anchor.\n");
            SETSTATE( S_junk_tag);

        case S_href:
            if (c=='"') SETSTATE (S_href_quoted);
            /* Not quoted: FALL THROUGH and treat c as the first character */

        case S_href_unquoted:
            if ((c==' ') || (c=='\n')) SETSTATE( S_anchor);
            if (c=='>'){
                BACK_UP;
                SETSTATE( S_anchor);
            }
            if (reference_length<255) {
                reference[reference_length++] = c;
            }
            SETSTATE(S_href_unquoted);

        case S_href_quoted:
            if (c=='"') SETSTATE( S_anchor);
            if (reference_length<255) {
                reference[reference_length++] = c;
            }
            SETSTATE( state);

        case S_aname:
            if ((c==' ') || (c=='\n')) SETSTATE( S_anchor);
            if (c=='>'){
                BACK_UP;
                SETSTATE( S_anchor);
            }
            if (c!='"')                 /* 930701 */
                if (anchor_name_length<255) {
                    anchor_name[anchor_name_length++] = c;
                }
            SETSTATE( state);

        case S_end_a:
            switch(c) {
            case 'd':                   /* End address */
            case 'D':
                if (!check("DDRESS >")) SETSTATE(S_junk_tag);
                end_style();
                SETSTATE(S_text);
            case '>':                   /* End anchor */
                {
                    [HT appendEndAnchor];
                    SETSTATE(S_text);
                }
            default: SETSTATE(S_junk_tag);
            } /* switch c */
            break;

        /* Parse glossary tags
        ** -------------------
        **
        ** We allow <DL> </DL> but we do not allow <DL> (text) <DT> ...
        */
        case S_tag_d:
            switch(c) {
            case 'L':
            case 'l':                   /* Start Definition list <DL> */
                (void) check("L> <");   /* Ignore first DT */
                c = NEXT_CHAR;
                if (c=='/') {
                    check("/DL>");
                } else {
                    (void) check("DT>");
                    start_style(&Glossary);
                }
                SETSTATE(S_text);
            case 'T':
            case 't':                   /* Definition term <DT> */
                output_paragraph();
                SETSTATE(S_junk_tag);
            case 'D':
            case 'd':                   /* Definition definition <DD> */
                OUTPUT('\t');
                SETSTATE(S_junk_tag);
            } /*switch c */
            break;

        case S_end_d:                   /* end definition list </DL> */
            if ((c != 'l')&&(c!='L')) SETSTATE(S_junk_tag);
            end_style();
            SETSTATE(S_junk_tag);

        /* Parse highlighting and headers
        ** ------------------------------
        ** @ All these formats should be nested, and should be defined by a
        ** style sheet.
        */
        case S_tag_h:
            switch (c) {
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
                start_style(&Heading[c-'1']);
                update_style();
#ifdef CERN_WEIRDO
                end_style_on_nl = YES;  /* Style can end at line end */
#endif
                SETSTATE( S_junk_tag);
            case 'P':
            case 'p':
                switch (c=NEXT_CHAR) {
                case '1':
                case '2':
                case '3':
                    start_highlighting(Highlighting[c-'1']);
                    SETSTATE( S_junk_tag);
                default: SETSTATE( S_junk_tag);
                }
                break;
            default: SETSTATE( S_junk_tag);
            } /* switch c */
            break;

        case S_end_h:
            switch (c) {
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
#ifdef CERN_WEIRDO
                end_style_on_nl = NO;   /* That's over. */
#endif
                end_style();
                SETSTATE( S_junk_tag);
            case 'P':
            case 'p':
                switch (NEXT_CHAR) {
                case '1':
                case '2':
                case '3':
                    end_highlighting();
                    SETSTATE( S_junk_tag);
                default: SETSTATE( S_junk_tag);
                } /* switch */
                break;
            default: SETSTATE( S_junk_tag);
            } /* switch c */
            break;

        /* Parse Lists, ordered and unordered
        ** ----------------------------------
        **
        ** This only affects the horizontal line format, not the font.
        */
        case S_tag_o:
        case S_tag_u:
            if ((c == 'l') || (c=='L')) {
                (void) check("L> <LI>"); /* Ignore first LI after UL */
                start_style(&listStyle);
            }
            SETSTATE(S_text);

        case S_tag_l:
            switch(c) {
            case 'I':
            case 'i':                   /* List element <LI> */
                c = NEXT_CHAR;
                if ((c=='S') || (c=='s')) { /* <LIS...: maybe <LISTING> */
                    SETSTATE(S_tag_lis);
                }
                output_paragraph();
                SETSTATE(S_text);
            default: SETSTATE(S_junk_tag);
            } /*switch c */
            break;

        case S_end_o:                   /* end n list </OL> */
        case S_end_u:                   /* end n list </UL> */
            if ((c != 'l')&&(c!='L')) SETSTATE(S_junk_tag);
            end_style();
            SETSTATE(S_junk_tag);

        /* Parse rest of file on another format
        */
        case S_restoffile:
            switch (c) {
            case ' ':
            case '\n':
            case '\t':
                break;
            case 'p':
            case 'P':
                if (check("PLAINTEXT>")) {
                    if (TRACE) printf("Loading as plain text\n");
                    start_style(&Example);
                    LOADPLAINTEXT;
                    SETSTATE(S_done);   /* ... */
                }
                break;                  /* Not PLAINTEXT: do not try RTF on it */
            case 'R':
            case 'r':
                if (check("RTF>")) {
                    if (TRACE) printf("Loading as RTF\n");
                    [self readRichText:sgmlStream]; /* will read to end */
                    [self adjustWindow]; /* Fix scrollers */
                    SETSTATE(S_done);   /* Inhibit RTF load */
                }
            }
            break;

        /* Parse <TITLE>..</TITLE>
        */
        case S_title:
            if (c=='<') {
                if (check("</TITLE>")) {
                    title[title_length]=0; /* Add a terminator */
                    if (TRACE)printf("\nTitle:\t`%s'\n", title);
                    [[self window] setTitle:title];
                    SETSTATE( S_text);
                } else SETSTATE( S_junk_tag); /* @@@ forgets < in titles! */
            } else {
                if (title_length < 255) title[title_length++] = c;
                SETSTATE( state);
            } /* if */

        case S_done:
            break;                      /* Should never happen */
        } /* switch state */
    } /* for loop */

    if ((state!=S_text) && (state != S_done))
        if(TRACE) printf("*** Unfinished SGML file: Left in state %i\n", state);

    if (state != S_done) {
        OUTPUT('\n'); /* Ensure that the text always ends in \n for ScanALine */
        FINISH_OUTPUT;
    }

    /* Clean up any styles left nested
    */
    while (styleStack) {
        NestedStyle * N = styleStack;
        styleStack = N->next;
        if (TRACE) printf("HT: Left in style at end of document!\n");
        free(N);
    }
    [window setDocEdited:NO];
    tFlags.changeState = 0;             /* Please notify delegate if changed */
    return self;
} /* readSGML:diagnostic: */
/* Write SGML File back OUT
** ------------------------
**
** This is currently quite NeXT-specific.
**
** We run through te runs. When a characteristic of a run changes, we
** output the approporiate SGML code. When several characteristics change at
** the same place, we output the code in an order such that the resulting
** structures wil be nested. This means first unwrapping the old ones, and
** then entering the new ones. For example, it is better to produce
**
** <h2><a>...</a></h2><a>...</a>
** than
**
** <h2><a>...</h2></a><a>...</a>
**
** The special treatment of newlines is because we want to strip extra newlines
** out. We ignore newlines at the beginning and end of the para style,
** and we treat multiple newlines as a single paragraph mark.
**
** Bugs: @@@ Highlighting is ignored.
** @@@ end text is ignored.
*/
/* Column count after which change_run() may turn a space into a newline
** when writing free-format text.
*/
#define LINE_WRAP 64 /* Start thinking about line wrap here */
/* Newlines met in the text being written out but not yet written as a
** paragraph tag; they are only written if more text follows.
*/
static int SGML_gen_newlines; /* Number of newlines pending during SGML generation */
/* Number of unrecognizable runs met while generating SGML. Declared with an
** explicit type: implicit int is not valid in C99 and later.
*/
static int SGML_gen_errors;
/* State shared between writeSGML and change_run() while a document is being
** written out.
*/
static SGML_style * currentSGML; /* SGML style of the paragraph being written, or 0 */
static const char * saveName; /* pointer to name node is being saved under */
static char * prefix; /* Pointer to prefix string to be junked */
static int lineLength; /* Number of characters on a line so far */
/* Find the SGML style, if any, which corresponds to a paragraph style
**
** On entry,
** para Paragraph style of a run, or 0 for totally unstyled text.
** On exit,
** returns The first entry of styleTable whose display style uses
** para; &Normal if para is 0; and &Normal, after counting
** an error, if no entry matches.
*/
SGML_style * findSGML(void *para)
{
    int i;

    if (para) {
        for (i=0; i<NUMBER_OF_STYLES; i++) {
            SGML_style * candidate = styleTable[i];
            if (candidate && candidate->style
                && candidate->style->paragraph == para)
                return candidate;
        }
        if (TRACE) printf("HT: Can't find SGML style!\n");
        SGML_gen_errors++;
    }
    return &Normal;             /* Unstyled or unknown becomes Normal */
}
/* Generate the SGML code for one run, given the previous run
** ----------------------------------------------------------
**
** On entry,
** last The run before r (a dummy run with no style and no anchor
** before the first run).
** r The run to be written (a dummy run of no characters to close
** the last run).
**
** Closing tags for things which end at the run boundary are written first,
** then opening tags for things which start there, then the text of the run.
**
** Corrections made here:
**  - '<' and '&' in non-literal text are written as the entities "&lt;" and
**    "&amp;", which is what the parser in this file decodes again;
**  - the line length is counted once per character, so lines are considered
**    for wrapping after LINE_WRAP characters rather than half of that;
**  - if no SGML style has been chosen yet (first run without a paragraph
**    style) one is looked up anyway, so currentSGML is never null while the
**    text is written.
*/
void change_run(NXRun *last, NXRun *r)
{
    int chars_left = r->chars;

    if (r->info != last->info) {                /* End anchor */
        if (last->info) NXPrintf(sgmlStream, "</A>");
    }

    if (r->paraStyle != last->paraStyle)
        if (last->paraStyle) {                  /* End paragraph */
            if (currentSGML) NXPrintf(sgmlStream, "%s", currentSGML->end_tag);
            else NXPrintf(sgmlStream,"<P>\n");
            lineLength = 0;                     /* At column 1 */
        }

    if ((r->paraStyle != last->paraStyle) || !currentSGML) { /* Start paragraph */
        currentSGML = findSGML(r->paraStyle);
        if (currentSGML) {
            if (currentSGML->free_format)
                while(chars_left && WHITE(*read_pointer)) { /* Strip leading */
                    (chars_left)--;             /* white space */
                    (void) NEXT_TEXT_CHAR;
                }
            NXPrintf(sgmlStream, "%s", currentSGML->start_tag);
            prefix = currentSGML->start_text;
        }
        SGML_gen_newlines=0;                    /* Cancel */
    }

    if (r->info != last->info) {                /* Start anchor */
        if (SGML_gen_newlines) {        /* Got anchor, need paragraph separator */
            NXPrintf(sgmlStream, "%s", currentSGML->paragraph_tag);
            SGML_gen_newlines=0;                /* paragraph flushed. */
        }
        if (r->info) {
            Anchor * a = (Anchor *) r->info;
            Anchor * d = [a destination];
            NXPrintf(sgmlStream, "<A\nNAME=\"%s\"", [a address]);
            if (d) {
                Anchor * p = [d parent];
                Anchor * n = p ? p : d;         /* The node anchor */
                char *absolute = HTParse([n address], [[HT nodeAnchor]address],
                                         PARSE_ALL);
                char * relative = HTRelative(absolute, saveName);
                if (!p)                         /* Whole node */
                    NXPrintf(sgmlStream, " HREF=\"%s\"", relative);
                else if (p == [HT nodeAnchor])  /* In same node */
                    NXPrintf(sgmlStream, " HREF=\"#%s\"", [d address]);
                else                            /* In different node */
                    NXPrintf(sgmlStream, " HREF=\"%s#%s\"", relative, [d address]);
                free(relative);
                free(absolute);
            }
            NXPrintf(sgmlStream, ">");
        }
    }

    /* Now output the textual part of the run
    **
    ** Within the prefix region (prefix!=0), we discard white space and
    ** characters matching *prefix++. Note the prefix string may contain
    ** white space.
    **
    ** The SGML_gen_newlines flag means that newlines have been found. They
    ** are not actually implemented unless some more non-white text is found,
    ** so that trailing newlines on the end of paragraphs are stripped.
    **
    ** The line wrapping is primitive in the extreme, as only text characters
    ** are counted. In practice it limits the length of any line to a
    ** reasonable amount, though this is not guaranteed.
    */
    while (chars_left) {
        char c = NEXT_TEXT_CHAR;
        chars_left--;

        if (prefix) {
            if (*prefix) {
                if (c==*prefix) {
                    ++prefix;
                    continue;                   /* Strip prefix characters */
                }
                if (WHITE(c)) continue;         /* Strip white space */
                if (TRACE) printf(
                    "HTML: WARNING: Paragraph prefix incomplete: %i found where %i expected.\n",
                    c, *prefix);
            }
            prefix=0;                           /* Prefix is over */
        }

        if (c=='\n') {                          /* Paragraph Marks: */
            if (currentSGML->free_format) {
                SGML_gen_newlines++;            /* Just flag it */
                prefix = currentSGML->paragraph_text;
            } else {
                NXPrintf(sgmlStream, "%s", currentSGML->paragraph_tag);
            }
            lineLength = 0;                     /* At column 1 */
        } else {                                /* Not newline */
            if (SGML_gen_newlines) {    /* Got text, need paragraph separator */
                NXPrintf(sgmlStream, "%s", currentSGML->paragraph_tag);
                SGML_gen_newlines=0;            /* paragraph flushed. */
                lineLength = 0;                 /* At column 1 */
            }
            if (c=='\t') {
                if (currentSGML) NXPrintf(sgmlStream, "%s", currentSGML->tab_tag);
                else NXPrintf(sgmlStream, "\t");
            } else {                            /* Not tab or newline */
                lineLength++;           /* @@bug doesn't count entity names */
                if ((currentSGML->free_format)
                    && (lineLength > LINE_WRAP) /* Wrap lines if we can */
                    && (c==' ')) {
                    c = '\n';
                    lineLength = 0;
                }
                if (currentSGML->litteral) {
                    NXPrintf(sgmlStream, "%c", c);
                } else {
                    switch(c) {                 /* Escape markup characters */
                    case '<': NXPrintf(sgmlStream, "&lt;"); break;
                    case '&': NXPrintf(sgmlStream, "&amp;"); break;
                    default: NXPrintf(sgmlStream, "%c", c); break;
                    } /* switch */
                } /* not litteral */
            }
        }
    }
} /* change_run */
/* This is the body of the SGML output method.
**
** Writes the document to the stream as SGML: a head with the window title
** and, if set, the next anchor number; then the body, produced by calling
** change_run() for every run of the text together with the run before it.
**
** On entry,
** stream Stream to write to.
** aName Name the node is being saved under; links are written
** relative to it.
** On exit,
** returns self, or nil if any run had a paragraph style for which no
** SGML style could be found.
**
** Corrections made here: the final call, which closes the last run, is
** given the last run actually written (r-1). After the loop r has already
** been stepped past that run, so passing r read the slot beyond it. The
** trace output casts the run index to int to match its %i format.
*/
- writeSGML:(NXStream *) stream relativeTo:(const char *)aName
{
    NXRun * r = theRuns->runs;
    int sor;                    /* Character position of start of run */
    NXRun dummy;                /* Stands for "no run" before and after the text */

    dummy.paraStyle = 0;
    dummy.info = 0;
    dummy.chars = 0;

    SGML_gen_newlines=0;        /* Number of newlines read but not inserted */
    HT = self;
    saveName = aName;
    sgmlStream = stream;
    SGML_gen_errors = 0;
    currentSGML = 0;
    prefix = 0;                 /* No prefix to junk */
    START_INPUT;
    lineLength = 0;             /* Starting in column 1 */

    NXPrintf(stream, "<HTML>\n<HEAD>\n");
    NXPrintf(stream, "<TITLE>%s</TITLE>", [window title]);
    if (nextAnchorNumber) NXPrintf(stream, "\n<NEXTID N=\"z%i\">\n",
                                   nextAnchorNumber);
    NXPrintf(stream, "</HEAD>\n");
    NXPrintf(stream, "<BODY>");

    /* Change style tags etc
    */
    change_run(&dummy, r);      /* Start first run */
    for (sor=r++->chars; sor<textLength; sor=sor+(r++)->chars) {
        if (TRACE) printf("%4i: %i chars in run %3i.\n",
                          sor, r->chars, (int)(r-theRuns->runs));
        change_run(r-1, r);     /* Runs 2 to N */
    }
    change_run(r-1, &dummy);    /* Close last run: r is one past it here */

    tFlags.changeState = 0;     /* Please notify delegate if changed */
    NXPrintf(stream, "</BODY>\n</HTML>\n");
    return (SGML_gen_errors) ? nil : self;
}
/* These are the contents of the former NiCE NeXT User Group NeXTSTEP/OpenStep software archive, currently hosted by Netfuture.ch. */