ftp.nice.ch/pub/next/unix/archiver/arc.5.21.N.bs.tar.gz#/arc/arcsq.c

This is arcsq.c in view mode; [Download] [Up]

/*
 * $Header: arcsq.c,v 1.3 88/07/31 18:53:32 hyc Exp $
 */

/*
 * ARC - Archive utility - ARCSQ 
 *
 * Version 3.10, created on 01/30/86 at 20:10:46
 *
 * (C) COPYRIGHT 1985 by System Enhancement Associates; ALL RIGHTS RESERVED 
 *
 * By:  Thom Henderson 
 *
 * Description: This file contains the routines used to squeeze a file when
 * placing it in an archive. 
 *
 * Language: Computer Innovations Optimizing C86 
 *
 * Programming notes: Most of the routines used for the Huffman squeezing
 * algorithm were lifted from the SQ program by Dick Greenlaw, as adapted to
 * CI-C86 by Robert J. Beilstein. 
 */
#include <stdio.h>
#include "arc.h"

/* stuff for Huffman squeezing */

#define TRUE 1
#define FALSE 0
#define ERROR (-1)
#define SPEOF 256		/* special endfile token */
#define NOCHILD (-1)		/* marks end of path through tree */
#define NUMVALS 257		/* 256 data values plus SPEOF */
#define NUMNODES (NUMVALS+NUMVALS-1)	/* number of nodes */
#define MAXCOUNT (unsigned short) 65535	/* biggest unsigned integer */

/*
 * The following array of structures are the nodes of the binary trees. The
 * first NUMVALS nodes become the leaves of the final tree and represent the
 * values of the data bytes being encoded and the special endfile, SPEOF. The
 * remaining nodes become the internal nodes of the final tree. 
 */

struct nd {			/* shared by unsqueezer */
	unsigned short	weight;	/* number of appearances */
	short		tdepth;	/* length on longest path in tree */
	short		lchild, rchild;	/* indices to next level */
}               node[NUMNODES];	/* use large buffer */

static int      dctreehd;	/* index to head of final tree */

/*
 * This is the encoding table: The bit strings have first bit in low bit.
 * Note that counts were scaled so code fits unsigned integer. 
 */

static int      codelen[NUMVALS];	/* number of bits in code */
static unsigned short code[NUMVALS];	/* code itself, right adjusted */
static unsigned short tcode;	/* temporary code value */
static long     valcount[NUMVALS];	/* actual count of times seen */

/* Variables used by encoding process */

static int      curin;		/* value currently being encoded */
static int      cbitsrem;	/* # of code string bits left */
static unsigned short ccode;	/* current code right justified */

static	void	scale(), heap(), adjust(), bld_tree(), init_enc(), put_int();
static	int	cmptrees(), buildenc(), maxchar();
void 
init_sq()
{				/* prepare for scanning pass */
	int             i;	/* node index */

	/*
	 * Initialize all nodes to single element binary trees with zero
	 * weight and depth. 
	 */

	for (i = 0; i < NUMNODES; ++i) {
		node[i].weight = 0;
		node[i].tdepth = 0;
		node[i].lchild = NOCHILD;
		node[i].rchild = NOCHILD;
	}

	for (i = 0; i < NUMVALS; i++)
		valcount[i] = 0;
}

void 
scan_sq(c)			/* add a byte to the tables */
	int             c;	/* byte to add */
{
	unsigned short *wp;	/* speeds up weight counting */

	/* Build frequency info in tree */

	if (c == EOF)		/* it's traditional */
		c = SPEOF;	/* dumb, but traditional */

	if (*(wp = &node[c].weight) != MAXCOUNT)
		++(*wp);	/* bump weight counter */

	valcount[c]++;		/* bump byte counter */
}

long 
pred_sq()
{				/* predict size of squeezed file */
	int             i;
	int             btlist[NUMVALS];	/* list of intermediate
						 * b-trees */
	int             listlen;/* length of btlist */
	unsigned short	ceiling;/* limit for scaling */
	long            size = 0;	/* predicted size */
	int             numnodes;	/* # of nodes in simplified tree */

	scan_sq(EOF);		/* signal end of input */

	ceiling = MAXCOUNT;

	/* Keep trying to scale and encode */

	do {
		scale(ceiling);
		ceiling /= 2;	/* in case we rescale */

		/*
		 * Build list of single node binary trees having leaves for
		 * the input values with non-zero counts 
		 */

		for (i = listlen = 0; i < NUMVALS; ++i) {
			if (node[i].weight != 0) {
				node[i].tdepth = 0;
				btlist[listlen++] = i;
			}
		}

		/*
		 * Arrange list of trees into a heap with the entry indexing
		 * the node with the least weight at the top. 
		 */

		heap(btlist, listlen);

		/* Convert the list of trees to a single decoding tree */

		bld_tree(btlist, listlen);

		/* Initialize the encoding table */

		init_enc();

		/*
		 * Try to build encoding table. Fail if any code is > 16 bits
		 * long. 
		 */
	} while (buildenc(0, dctreehd) == ERROR);

	/* Initialize encoding variables */

	cbitsrem = 0;		/* force initial read */
	curin = 0;		/* anything but endfile */

	for (i = 0; i < NUMVALS; i++)	/* add bits for each code */
		size += valcount[i] * codelen[i];

	size = (size + 7) / 8;	/* reduce to number of bytes */

	numnodes = dctreehd < NUMVALS ? 0 : dctreehd - (NUMVALS - 1);

	size += sizeof(short) + 2 * numnodes * sizeof(short);

	return size;
}

/*
 * The count of number of occurrances of each input value have already been
 * prevented from exceeding MAXCOUNT. Now we must scale them so that their
 * sum doesn't exceed ceiling and yet no non-zero count can become zero. This
 * scaling prevents errors in the weights of the interior nodes of the
 * Huffman tree and also ensures that the codes will fit in an unsigned
 * integer. Rescaling is used if necessary to limit the code length. 
 */

static	void
scale(ceil)
	unsigned short	ceil;	/* upper limit on total weight */
{
	register int    i;
	int             ovflw, divisor;
	unsigned short	w, sum;
	unsigned char   increased;	/* flag */

	do {
		for (i = sum = ovflw = 0; i < NUMVALS; ++i) {
			if (node[i].weight > (ceil - sum))
				++ovflw;
			sum += node[i].weight;
		}

		divisor = ovflw + 1;

		/* Ensure no non-zero values are lost */

		increased = FALSE;
		for (i = 0; i < NUMVALS; ++i) {
			w = node[i].weight;
			if (w < divisor && w != 0) {	/* Don't fail to provide
							 * a code if it's used
							 * at all */

				node[i].weight = divisor;
				increased = TRUE;
			}
		}
	} while (increased);

	/* Scaling factor chosen, now scale */

	if (divisor > 1)
		for (i = 0; i < NUMVALS; ++i)
			node[i].weight /= divisor;
}

/*
 * heap() and adjust() maintain a list of binary trees as a heap with the top
 * indexing the binary tree on the list which has the least weight or, in
 * case of equal weights, least depth in its longest path. The depth part is
 * not strictly necessary, but tends to avoid long codes which might provoke
 * rescaling. 
 */

static	void
heap(list, length)
	int             list[], length;
{
	register int    i;

	for (i = (length - 2) / 2; i >= 0; --i)
		adjust(list, i, length - 1);
}

/* Make a heap from a heap with a new top */

static	void
adjust(list, top, bottom)
	int             list[], top, bottom;
{
	register int    k, temp;

	k = 2 * top + 1;	/* left child of top */
	temp = list[top];	/* remember root node of top tree */

	if (k <= bottom) {
		if (k < bottom && cmptrees(list[k], list[k + 1]))
			++k;

		/* k indexes "smaller" child (in heap of trees) of top */
		/* now make top index "smaller" of old top and smallest child */

		if (cmptrees(temp, list[k])) {
			list[top] = list[k];
			list[k] = temp;

			/* Make the changed list a heap */

			adjust(list, k, bottom);	/* recursive */
		}
	}
}

/*
 * Compare two trees, if a > b return true, else return false. Note
 * comparison rules in previous comments. 
 */

static	int 
cmptrees(a, b)
	int             a, b;	/* root nodes of trees */
{
	if (node[a].weight > node[b].weight)
		return TRUE;
	if (node[a].weight == node[b].weight)
		if (node[a].tdepth > node[b].tdepth)
			return TRUE;
	return FALSE;
}

/*
 * HUFFMAN ALGORITHM: develops the single element trees into a single binary
 * tree by forming subtrees rooted in interior nodes having weights equal to
 * the sum of weights of all their descendents and having depth counts
 * indicating the depth of their longest paths. 
 *
 * When all trees have been formed into a single tree satisfying the heap
 * property (on weight, with depth as a tie breaker) then the binary code
 * assigned to a leaf (value to be encoded) is then the series of left (0)
 * and right (1) paths leading from the root to the leaf. Note that trees are
 * removed from the heaped list by moving the last element over the top
 * element and reheaping the shorter list. 
 */

static	void
bld_tree(list, len)
	int             list[];
int             len;
{
	register int    freenode;	/* next free node in tree */
	register struct nd *frnp;	/* free node pointer */
	int             lch, rch;	/* temps for left, right children */

	/*
	 * Initialize index to next available (non-leaf) node. Lower numbered
	 * nodes correspond to leaves (data values). 
	 */

	freenode = NUMVALS;

	while (len > 1) {	/* Take from list two btrees with least
				 * weight and build an interior node pointing
				 * to them. This forms a new tree. */

		lch = list[0];	/* This one will be left child */

		/* delete top (least) tree from the list of trees */

		list[0] = list[--len];
		adjust(list, 0, len - 1);

		/* Take new top (least) tree. Reuse list slot later */

		rch = list[0];	/* This one will be right child */

		/*
		 * Form new tree from the two least trees using a free node
		 * as root. Put the new tree in the list. 
		 */

		frnp = &node[freenode];	/* address of next free node */
		list[0] = freenode++;	/* put at top for now */
		frnp->lchild = lch;
		frnp->rchild = rch;
		frnp->weight = node[lch].weight + node[rch].weight;
		frnp->tdepth = 1 + maxchar(node[lch].tdepth, node[rch].tdepth);

		/* reheap list  to get least tree at top */

		adjust(list, 0, len - 1);
	}
	dctreehd = list[0];	/* head of final tree */
}

static int 
maxchar(a, b)
{
	return a > b ? a : b;
}

static	void
init_enc()
{
	register int    i;

	/* Initialize encoding table */

	for (i = 0; i < NUMVALS; ++i)
		codelen[i] = 0;
}

/*
 * Recursive routine to walk the indicated subtree and level and maintain the
 * current path code in bstree. When a leaf is found the entire code string
 * and length are put into the encoding table entry for the leaf's data value
 * . 
 *
 * Returns ERROR if codes are too long. 
 */

static int 
buildenc(level, root)
	int             level;	/* level of tree being examined, from zero */
	int             root;	/* root of subtree is also data value if leaf */
{
	register int    l, r;

	l = node[root].lchild;
	r = node[root].rchild;

	if (l == NOCHILD && r == NOCHILD) {	/* Leaf. Previous path
						 * determines bit string code
						 * of length level (bits 0 to
						 * level - 1). Ensures unused
						 * code bits are zero. */

		codelen[root] = level;
		code[root] = tcode & (((unsigned short	) ~0) >> (16 - level));
		return (level > 16) ? ERROR : 0;
	} else {
		if (l != NOCHILD) {	/* Clear path bit and continue deeper */

			tcode &= ~(1 << level);
			if (buildenc(level + 1, l) == ERROR)
				return ERROR;	/* pass back bad statuses */
		}
		if (r != NOCHILD) {	/* Set path bit and continue deeper */

			tcode |= 1 << level;
			if (buildenc(level + 1, r) == ERROR)
				return ERROR;	/* pass back bad statuses */
		}
	}
	return NULL;		/* it worked if we reach here */
}

static	void
put_int(n, f)			/* output an integer */
	short		n;	/* integer to output */
	FILE           *f;	/* file to put it to */
{
	void		putc_pak();

	putc_pak(n & 0xff, f);	/* first the low byte */
	putc_pak(n >> 8, f);	/* then the high byte */
}

/* Write out the header of the compressed file */

static long 
wrt_head(ob)
	FILE           *ob;
{
	register int    l, r;
	int             i, k;
	int             numnodes;	/* # of nodes in simplified tree */

	/*
	 * Write out a simplified decoding tree. Only the interior nodes are
	 * written. When a child is a leaf index (representing a data value)
	 * it is recoded as -(index + 1) to distinguish it from interior
	 * indexes which are recoded as positive indexes in the new tree. 
	 *
	 * Note that this tree will be empty for an empty file. 
	 */

	numnodes = dctreehd < NUMVALS ? 0 : dctreehd - (NUMVALS - 1);
	put_int(numnodes, ob);

	for (k = 0, i = dctreehd; k < numnodes; ++k, --i) {
		l = node[i].lchild;
		r = node[i].rchild;
		l = l < NUMVALS ? -(l + 1) : dctreehd - l;
		r = r < NUMVALS ? -(r + 1) : dctreehd - r;
		put_int(l, ob);
		put_int(r, ob);
	}

	return sizeof(short) + numnodes * 2 * sizeof(short);
}

/*
 * Get an encoded byte or EOF. Reads from specified stream AS NEEDED. 
 *
 * There are two unsynchronized bit-byte relationships here. The input stream
 * bytes are converted to bit strings of various lengths via the static
 * variables named c... These bit strings are concatenated without padding to
 * become the stream of encoded result bytes, which this function returns one
 * at a time. The EOF (end of file) is converted to SPEOF for convenience and
 * encoded like any other input value. True EOF is returned after that. 
 */

static int 
gethuff(ib)			/* Returns bytes except for EOF */
	FILE           *ib;
{
	int             rbyte;	/* Result byte value */
	int             need;	/* number of bits */
	int		getc_ncr();

	rbyte = 0;
	need = 8;		/* build one byte per call */

	/*
	 * Loop to build a byte of encoded data. Initialization forces read
	 * the first time. 
	 */

loop:
	if (cbitsrem >= need) {	/* if current code is big enough */
		if (need == 0)
			return rbyte;

		rbyte |= ccode << (8 - need);	/* take what we need */
		ccode >>= need;	/* and leave the rest */
		cbitsrem -= need;
		return rbyte & 0xff;
	}
	/* We need more than current code */

	if (cbitsrem > 0) {
		rbyte |= ccode << (8 - need);	/* take what there is */
		need -= cbitsrem;
	}
	/* No more bits in current code string */

	if (curin == SPEOF) {	/* The end of file token has been encoded. If
				 * result byte has data return it and do EOF
				 * next time. */

		cbitsrem = 0;
		return (need == 8) ? EOF : rbyte + 0;
	}
	/* Get an input byte */

	if ((curin = getc_ncr(ib)) == EOF)
		curin = SPEOF;	/* convenient for encoding */

	ccode = code[curin];	/* get the new byte's code */
	cbitsrem = codelen[curin];

	goto loop;
}

/*
 * This routine is used to perform the actual squeeze operation.  It can only
 * be called after the file has been scanned.  It returns the true length of
 * the squeezed entry. 
 */

long 
file_sq(f, t)			/* squeeze a file into an archive */
	FILE           *f;	/* file to squeeze */
	FILE           *t;	/* archive to receive file */
{
	int             c;	/* one byte of squeezed data */
	long            size;	/* size after squeezing */

	size = wrt_head(t);	/* write out the decode tree */

	while ((c = gethuff(f)) != EOF) {
		putc_pak(c, t);
		size++;
	}

	return size;		/* report true size */
}

These are the contents of the former NiCE NeXT User Group NeXTSTEP/OpenStep software archive, currently hosted by Netfuture.ch.