HTPlain.c

This is HTPlain.c in view mode; [Download] [Up]
/*		Plain text object		HTWrite.c
**		=================
**
**	This version of the stream object just writes to a socket.
**	The socket is assumed open and left open.
**
**	Bugs:
**		strings written must be less than buffer size.
*/
#include "HTUtils.h"
#include "tcp.h"

#include "HTPlain.h"

#define BUFFER_SIZE 4096;	/* Tradeoff */

#include "HText.h"
#include "HTStyle.h"
#include "HTMLDTD.h"
#include "HTCJK.h"
#include "UCMap.h"
#include "UCDefs.h"
#include "UCAux.h"

#include "LYLeaks.h"

#define FREE(x) if (x) {free(x); x = NULL;}

extern HTStyleSheet * styleSheet;

extern int current_char_set;
extern CONST char **LYCharSets[];
extern int LYlowest_eightbit[];
extern BOOLEAN LYRawMode;
extern CONST char * HTMLGetEntityName PARAMS((UCode_t code));
extern BOOL HTPassEightBitRaw;
extern BOOL HTPassHighCtrlRaw;
extern HTCJKlang HTCJK;

PUBLIC int HTPlain_lastraw = -1;

/*		HTML Object
**		-----------
*/
struct _HTStream {
    CONST HTStreamClass *	isa;
    HText *			text;
    /*
    **  The node_anchor UCInfo and handle for the input (PARSER) stage. - FM
    */
    LYUCcharset	*		inUCI;
    int				inUCLYhndl;
    /*
    **  The node_anchor UCInfo and handle for the output (HTEXT) stage. - FM
    */
    LYUCcharset	*		outUCI;
    int				outUCLYhndl;
    /*
    **  Counter, value, buffer and pointer for UTF-8 handling. - FM
    */
    char			utf_count;
    UCode_t			utf_char;
    char 			utf_buf[8];
    char *			utf_buf_p;
    /*
    **  The charset transformation structure. - FM
    */
    UCTransParams		T;
};

PRIVATE char replace_buf [64];        /* buffer for replacement strings */

PRIVATE void HTPlain_getChartransInfo ARGS2(
	HTStream *,		me,
	HTParentAnchor *,	anchor)
{
    if (me->inUCLYhndl < 0) {
	HTAnchor_copyUCInfoStage(anchor, UCT_STAGE_PARSER, UCT_STAGE_MIME,
				 	 UCT_SETBY_PARSER);
	me->inUCLYhndl = HTAnchor_getUCLYhndl(anchor, UCT_STAGE_PARSER);
    }
    if (me->outUCLYhndl < 0) {
	int chndl = HTAnchor_getUCLYhndl(anchor, UCT_STAGE_HTEXT);
	if (chndl < 0) {
	    chndl = current_char_set;
	    HTAnchor_setUCInfoStage(anchor, chndl,
				    UCT_STAGE_HTEXT, UCT_SETBY_DEFAULT);
	}
	HTAnchor_setUCInfoStage(anchor, chndl,
				UCT_STAGE_HTEXT, UCT_SETBY_DEFAULT);
	me->outUCLYhndl = HTAnchor_getUCLYhndl(anchor, UCT_STAGE_HTEXT);
    }
    me->inUCI = HTAnchor_getUCInfoStage(anchor, UCT_STAGE_PARSER);
    me->outUCI = HTAnchor_getUCInfoStage(anchor, UCT_STAGE_HTEXT);
}

/*	Write the buffer out to the socket
**	----------------------------------
*/

/*_________________________________________________________________________
**
**			A C T I O N 	R O U T I N E S
*/

PRIVATE void HTPlain_write PARAMS((
	HTStream *		me,
	CONST char *		s,
	int			l));

/*	Character handling
**	------------------
*/
PRIVATE void HTPlain_put_character ARGS2(
	HTStream *,		me,
	char,			c)
{
#ifdef REMOVE_CR_ONLY
    /*
    **  Throw away \r's.
    */
    if (c != '\r') {
       HText_appendCharacter(me->text, c);
    }
#else
    /*
    **  See HTPlain_write() for explanations of the following code
    **  (we've been called via HTPlain_put_string() to do for each
    **  character of a terminated string what HTPlain_write() does
    **  via a while loop for each character in a stream of given
    **  length). - FM
    */
    if ((HTPlain_lastraw == '\r') && c == '\n') {
	HTPlain_lastraw = -1;
	return;
    }
    HTPlain_lastraw = c;
    if (c == '\r') {
	HText_appendCharacter(me->text, '\n');
    } else if ((unsigned char)c >= 127) {
	/*
	**  For now, don't repeat everything here
	**  that has been done below - KW
	*/
	HTPlain_write(me, &c, 1);
    } else if (HTCJK != NOCJK) {
	HText_appendCharacter(me->text, c);
    } else if ((unsigned char)c >= 127 && (unsigned char)c < 161 &&
    	       HTPassHighCtrlRaw) {
	HText_appendCharacter(me->text, c);
    } else if ((unsigned char)c == 160) {
	HText_appendCharacter(me->text, ' ');
    } else if ((unsigned char)c == 173) {
        return;
    } else if (((unsigned char)c >= 32 && (unsigned char)c < 127) ||
	       c == '\n' || c == '\t') {
	HText_appendCharacter(me->text, c);
    } else if ((unsigned char)c > 160) {
	if (!HTPassEightBitRaw &&
	    !((me->outUCLYhndl == 0) ||
	      (me->outUCI->enc & (UCT_CP_SUPERSETOF_LAT1)))) {
	    int len, high, low, i, diff = 1;
	    CONST char * name;
	    UCode_t value = (UCode_t)((unsigned char)c - 160);

	    name = HTMLGetEntityName(value);
	    len =  strlen(name);
	    for (low = 0, high = HTML_dtd.number_of_entities;
		high > low;
		diff < 0 ? (low = i+1) : (high = i)) {
		/* Binary search */
		i = (low + (high-low)/2);
		diff = strncmp(HTML_dtd.entity_names[i], name, len);
		if (diff == 0) {
		    HText_appendText(me->text,
				     LYCharSets[me->outUCLYhndl][i]);
		    break;
		}
	    }
	    if (diff) {
		HText_appendCharacter(me->text, c);
	    }
	} else {
	    HText_appendCharacter(me->text, c);
	}
    }
#endif /* REMOVE_CR_ONLY */
}


/*	String handling
**	---------------
**
*/
PRIVATE void HTPlain_put_string ARGS2(HTStream *, me, CONST char*, s)
{
#ifdef REMOVE_CR_ONLY
    HText_appendText(me->text, s);
#else
    CONST char * p;

    if (s == NULL)
	return;
    for (p = s; *p; p++) {
        HTPlain_put_character(me, *p);
    }
#endif /* REMOVE_CR_ONLY */
}


/*
**	Entry function for displayed text/plain and WWW_SOURCE strings. - FM
**	---------------------------------------------------------------
*/
PRIVATE void HTPlain_write ARGS3(HTStream *, me, CONST char*, s, int, l)
{
    CONST char * p;
    CONST char * e = s+l;
    char c;
    unsigned char c_unsign;
    BOOL chk;
    UCode_t code, uck;
    char saved_char_in = '\0';

    for (p = s; p < e; p++) {
#ifdef REMOVE_CR_ONLY
	/* 
	**  Append the whole string, but remove any \r's. - FM
	*/
	if (*p != '\r') {
	    HText_appendCharacter(me->text, *p);
	}
#else
	/*
	**  Try to handle lone LFs, CRLFs and lone CRs
	**  as newline, and to deal with control, ASCII,
	**  and 8-bit characters based on best guesses
	**  of what's appropriate. - FM
	*/
	if ((HTPlain_lastraw == '\r') && *p == '\n') {
	    HTPlain_lastraw = -1;
	    continue;
	}
	HTPlain_lastraw = *p;
	if (*p == '\r') {
	    HText_appendCharacter(me->text, '\n');
	    continue;
	}
	/*
	**  Make sure the character is handled as Unicode
	**  whenever that's appropriate.  - FM
	*/
	c = *p;
	c_unsign = (unsigned char)c;
	code = (UCode_t)c_unsign;
	saved_char_in = '\0';
	/*
	**  Combine any UTF-8 multibytes into Unicode
	**  to check for special characters. - FM
	*/
	if (me->T.decode_utf8) {
	    /*
	    **  Combine UTF-8 into Unicode.
	    **  Incomplete characters silently ignored.
	    **  from Linux kernel's console.c - KW
	    */
	    if (c_unsign > 127) {
		/*
		**  We have an octet from a multibyte character. - FM
		*/
		if (me->utf_count > 0 && (c & 0xc0) == 0x80) {
		    /*
		    **  Adjust the UCode_t value, add the octet
		    **  to the buffer, and decrement the byte
		    **  count. - FM
		    */
		    me->utf_char = (me->utf_char << 6) | (c & 0x3f);
		    me->utf_count--;
		    *(me->utf_buf_p) = c;
		    (me->utf_buf_p)++;
		    if (me->utf_count == 0) {
		        /*
			**  Got a complete multibyte character.
			*/
			*(me->utf_buf_p) = '\0';
			code = me->utf_char;
			if (code > 0 && code < 256) {
			    c = ((char)(code & 0xff));
			    c_unsign = (unsigned char)c;
			}
		    } else {
			/*
			**  Get the next byte. - FM
			*/
			continue;
		    }
		} else {
		    /*
		    **  Start handling a new multibyte character. - FM
		    */
		    me->utf_buf_p[0] = c;
		    me->utf_buf_p = &me->utf_buf[1];
		    if ((*p & 0xe0) == 0xc0) {
			me->utf_count = 1;
			me->utf_char = (c & 0x1f);
		    } else if ((*p & 0xf0) == 0xe0) {
			me->utf_count = 2;
			me->utf_char = (c & 0x0f);
		    } else if ((*p & 0xf8) == 0xf0) {
			me->utf_count = 3;
			me->utf_char = (c & 0x07);
		    } else if ((*p & 0xfc) == 0xf8) {
			me->utf_count = 4;
			me->utf_char = (c & 0x03);
		    } else if ((*p & 0xfe) == 0xfc) {
			me->utf_count = 5;
			me->utf_char = (c & 0x01);
		    } else {
			/*
			 *  We got garbage, so ignore it. - FM
			 */
			me->utf_count = 0;
			me->utf_buf_p[0] = '\0';
			me->utf_buf_p = me->utf_buf;
		    }
		    /*
		    **  Get the next byte. - FM
		    */
		    continue;
		}
	    } else if (me->utf_count > 0) {
		/*
		**  Got an ASCII character when expecting
		**  UTF-8 multibytes, so ignore the buffered
		**  multibye characters and fall through with
		**  the current ASCII character. - FM
		*/
		me->utf_count = 0;
		me->utf_buf[0] = '\0';
		me->utf_buf_p = me->utf_buf;
		code = (UCode_t)c_unsign;
	    } else {
		/*
		**  Got a valid ASCII character, so fall
		**  through with it. - FM
		*/
		code = (UCode_t)c_unsign;
	    }
	}
	/*
	**  Convert characters from non-UTF-8 charsets
	**  to Unicode (if appropriate). - FM
	*/
	if (!(me->T.decode_utf8 &&
	      (unsigned char)(*p) > 127)) {
	    if (me->T.strip_raw_char_in)
		saved_char_in = c;
	    if (me->T.trans_to_uni &&
		(code >= 127 ||
		 (code < 32 && code != 0 &&
		  me->T.trans_C0_to_uni))) {
		/*
		**  Convert the octet to Unicode. - FM
		*/
		code = (UCode_t)UCTransToUni(c, me->inUCLYhndl);
		if (code > 0) {
		    saved_char_in = c;
		    if (code < 256) {
			c = ((char)(code & 0xff));
			c_unsign = (unsigned char)c;
		    }
		}
	    } else if (code < 32 && code != 0 &&
		       me->T.trans_C0_to_uni) {
		/*
		**  Quote from SGML.c:
		**  	"This else if may be too ugly to keep. - KW"
		*/
		if (me->T.trans_from_uni &&
		    (((code = UCTransToUni(c, me->inUCLYhndl)) >= 32) ||
		     (me->T.transp &&
		      (code = UCTransToUni(c, me->inUCLYhndl)) > 0))) {
		    saved_char_in = c;
		    if (code < 256) {
			c = ((char)(code & 0xff));
			c_unsign = (unsigned char)c;
		    }
		} else {
		    uck = -1;
		    if (me->T.transp) {
			uck = UCTransCharStr(replace_buf, 60, c,
					     me->inUCLYhndl,
					     me->inUCLYhndl, NO);
		    }
		    if (!me->T.transp || uck < 0) {
			uck = UCTransCharStr(replace_buf, 60, c,
					     me->inUCLYhndl,
					     me->outUCLYhndl, YES);
		    }
		    if (uck == 0) {
			continue;
		    } else if (uck < 0) {
			me->utf_buf[0] = '\0';
			code = (unsigned char)c;
		    } else {
			c = replace_buf[0];
			if (c && replace_buf[1]) {
			    HText_appendText(me->text, replace_buf);
			    continue;
			}
		    }
		    me->utf_buf[0] = '\0';
		    code = (unsigned char)c;
		} /*  Next line end of ugly stuff for C0. - KW */
	    } else {
		me->utf_buf[0] = '\0';
		code = (unsigned char)c;
	    }
	}
	/*
	**  At this point we have either code in Unicode
	**  (and c in latin1 if code is in the latin1 range),
	**  or code and c will have to be passed raw.
	*/

	/*
	**  If CJK mode is on, we'll assume the document matches
	**  the user's selected character set, and if not, the
	**  user should toggle off raw/CJK mode to reload. - FM
	*/
	if (HTCJK != NOCJK) {
	    HText_appendCharacter(me->text, c);

#define PASSHICTRL (me->T.transp || \
		    code >= LYlowest_eightbit[me->inUCLYhndl])
#define PASS8859SPECL me->T.pass_160_173_raw
#define PASSHI8BIT (HTPassEightBitRaw || \
		    (me->T.do_8bitraw && !me->T.trans_from_uni))
	/*
	**  If HTPassHighCtrlRaw is set (e.g., for KOI8-R) assume the
	**  document matches and pass 127-160 8-bit characters.  If it
	**  doesn't match, the user should toggle raw/CJK mode off. - FM
	*/
	} else if (code >= 127 && code < 161 &&
		   PASSHICTRL && PASS8859SPECL) {
	    HText_appendCharacter(me->text, c);
	} else if (code == 173 && PASS8859SPECL) {
	    HText_appendCharacter(me->text, c);
	/*
	**  If neither HTPassHighCtrlRaw nor CJK is set, play it safe
	**  and treat 160 (nbsp) as an ASCII space (32). - FM
	*/
	} else if (code == 160) {
	    HText_appendCharacter(me->text, ' ');
	/*
	**  If neither HTPassHighCtrlRaw nor CJK is set, play it safe
	**  and ignore 173 (shy). - FM
	*/
	} else if (code == 173) {
	    continue;
	/*
	**  If we get to here, pass the displayable ASCII characters. - FM
	*/
	} else if ((code >= 32 && code < 127) ||
		   (PASSHI8BIT &&
		    c >= LYlowest_eightbit[me->outUCLYhndl]) ||
		   *p == '\n' || *p == '\t') {
	    HText_appendCharacter(me->text, c);
	/*
	**  Use an ASCII space (32) for ensp, emsp or thinsp. - FM 
	*/
	} else if (code == 8194 || code == 8195 || code == 8201) {
	    HText_appendCharacter(me->text, ' ');
	/*
	**  Use ASCII hyphen for 8211 (ndash/endash)
	**  or 8212 (mdash/emdash). - FM
	*/
	} else if (code == 8211 || code == 8212) {
	    HText_appendCharacter(me->text, '-');
	/*
	**  If we want the raw character, pass it now. - FM
	*/
	} else if (me->T.use_raw_char_in && saved_char_in) {
	    HText_appendCharacter(me->text, saved_char_in);
/******************************************************************
 *   I. LATIN-1 OR UCS2  TO  DISPLAY CHARSET
 ******************************************************************/  
	} else if ((chk = (me->T.trans_from_uni && code >= 160)) &&
		   (uck = UCTransUniChar(code,
					 me->outUCLYhndl)) >= 32 &&
		   uck < 256) {
	    if (TRACE) {
		fprintf(stderr,
			"UCTransUniChar returned 0x%.2lX:'%c'.\n",
			uck, ((char)(uck & 0xff)));
	    }
	    HText_appendCharacter(me->text, ((char)(uck & 0xff)));
	} else if (chk &&
		   (uck == -4 ||
		    (me->T.repl_translated_C0 && uck > 0 && uck < 32)) &&
		   /*
		   **  Not found; look for replacement string.
		   */
		   (uck = UCTransUniCharStr(replace_buf, 60, code,
					    me->outUCLYhndl, 0) >= 0)) { 
	    /*
	    **  No further tests for valididy - assume that whoever
	    **  defined replacement strings knew what she was doing.
	    */
	    HText_appendText(me->text, replace_buf);
	/*
	**  If we get to here, and should have translated,
	**  translation has failed so far.  
	*/
	} else if (chk && code > 127 && me->T.output_utf8) {
	    /*
	    **  We want UTF-8 output, so do it now. - FM
	    */
	    if (*me->utf_buf) {
		HText_appendText(me->text, me->utf_buf);
		me->utf_buf[0] = '\0';
		me->utf_buf_p = me->utf_buf;
	    } else if (UCConvertUniToUtf8(code, replace_buf)) {
		HText_appendText(me->text, replace_buf);
	    } else {
		sprintf(replace_buf, "U%.2lX", code);
		HText_appendText(me->text, replace_buf);
	    }
	} else if (me->T.strip_raw_char_in &&
		   (unsigned char)*p >= 192 &&
		   (unsigned char)*p < 255) {
	    /*
	    **  KOI special: strip high bit, gives
	    **  (somewhat) readable ASCII.
	    */
	    HText_appendCharacter(me->text, (char)(*p & 0x7f));
	/*
	**  If we don't actually want the character,
	**  make it safe and output that now. - FM
	*/
	} else if ((c_unsign > 0 &&
		    c_unsign < LYlowest_eightbit[me->outUCLYhndl]) ||
		   (me->T.trans_from_uni && !HTPassEightBitRaw)) {
	    /*
	    **  If we do not have the "7-bit approximations" as our
	    **  output character set (in which case we did it already)
	    **  seek a translation for that.  Otherwise, or if the
	    **  translation fails, use UHHH notation. - FM
	    */
	    if ((chk = (me->outUCLYhndl !=
			UCGetLYhndl_byMIME("us-ascii"))) &&
		(uck = UCTransUniChar(code,
				      UCGetLYhndl_byMIME("us-ascii")))
				      >= 32 && uck < 127) {
		/*
		**  Got an ASCII character (yippey). - FM
		*/
		c = ((char)(uck & 0xff));
		HText_appendCharacter(me->text, c);
	    } else if ((chk && uck == -4) &&
		       (uck = UCTransUniCharStr(replace_buf,
						60, code,
						UCGetLYhndl_byMIME("us-ascii"),
						0) >= 0)) {
		/*
		**  Got a repacement string (yippey). - FM
		*/
		HText_appendText(me->text, replace_buf);
	    } else if (code == 8204 || code == 8205) {
		/*
		**  Ignore 8204 (zwnj) or 8205 (zwj), if we get to here. - FM
		*/
		if (TRACE) {
		    fprintf(stderr,
			    "HTPlain_write: Ignoring '%ld'.\n", code);
		}
	    } else if (code == 8206 || code == 8207) {
		/*
		**  Ignore 8206 (lrm) or 8207 (rlm), if we get to here. - FM
		*/
		if (TRACE) {
		    fprintf(stderr,
			    "HTPlain_write: Ignoring '%ld'.\n", code);
		}
	    } else {
		/*
		**  Out of luck, so use the UHHH notation (ugh). - FM
		*/
		sprintf(replace_buf, "U%.2lX", code);
		HText_appendText(me->text, replace_buf);
	    }
	/*
	**  If we get to here and have a monobyte character,
	**  pass it. - FM
	*/
	} else if (c_unsign > 0 && c_unsign < 256) {
	    HText_appendCharacter(me->text, c);
	}
#endif /* REMOVE_CR_ONLY */
    }
}

/*	Free an HTML object
**	-------------------
**
**	Note that the SGML parsing context is freed, but the created object is
**	not, as it takes on an existence of its own unless explicitly freed.
*/
PRIVATE void HTPlain_free ARGS1(
	HTStream *,	me)
{
    FREE(me);
}

/*	End writing
*/
PRIVATE void HTPlain_abort ARGS2(
	HTStream *,	me,
	HTError,	e)
{
    HTPlain_free(me);
}

/*		Structured Object Class
**		-----------------------
*/
PUBLIC CONST HTStreamClass HTPlain =
{		
	"SocketWriter",
	HTPlain_free,
	HTPlain_abort,
	HTPlain_put_character, 	HTPlain_put_string, HTPlain_write,
}; 

/*		New object
**		----------
*/
PUBLIC HTStream* HTPlainPresent ARGS3(
	HTPresentation *,	pres,
	HTParentAnchor *,	anchor,	
	HTStream *,		sink)
{

    HTStream* me = (HTStream*)malloc(sizeof(*me));
    if (me == NULL)
        outofmem(__FILE__, "HTPlain_new");
    me->isa = &HTPlain;

    HTPlain_lastraw = -1;

    me->utf_count = 0;
    me->utf_char = 0;
    me->utf_buf[0] = me->utf_buf[6] =me->utf_buf[7] = '\0';
    me->utf_buf_p = me->utf_buf;
    me->outUCLYhndl = HTAnchor_getUCLYhndl(anchor,UCT_STAGE_HTEXT);
    me->inUCLYhndl = HTAnchor_getUCLYhndl(anchor,UCT_STAGE_PARSER);
    HTPlain_getChartransInfo(me, anchor);
    UCSetTransParams(&me->T,
		     me->inUCLYhndl, me->inUCI,
		     me->outUCLYhndl,
		     HTAnchor_getUCInfoStage(anchor,UCT_STAGE_HTEXT));

    me->text = HText_new(anchor);
    HText_setStyle(me->text, HTStyleNamed(styleSheet, "Example"));
    HText_beginAppend(me->text);

    return (HTStream*) me;
}
These are the contents of the former NiCE NeXT User Group NeXTSTEP/OpenStep software archive, currently hosted by Netfuture.ch.