ftp.nice.ch/Attic/openStep/developer/resources/MiscKit.2.0.5.s.gnutar.gz#/MiscKit2/Temp/regex/framework/rxnfa.c

This is rxnfa.c in view mode; [Download] [Up]

/*	Copyright (C) 1995, 1996 Tom Lord
 * 
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Library General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Library General Public License for more details.
 * 
 * You should have received a copy of the GNU Library General Public License
 * along with this software; see the file COPYING.  If not, write to
 * the Free Software Foundation, 59 Temple Place - Suite 330, 
 * Boston, MA 02111-1307, USA. 
 */


/*
 * Tom Lord (lord@cygnus.com, lord@gnu.ai.mit.edu)
 */


#include "rxall.h"
#include "rxnfa.h"



#define remalloc(M, S) (M ? realloc (M, S) : malloc (S))


/* {Low Level Data Structure Code}
 */

/* Constructs a new nfa node. */
#ifdef __STDC__
struct rx_nfa_state *
rx_nfa_state (struct rx *rx)
#else
struct rx_nfa_state *
rx_nfa_state (rx)
     struct rx *rx;
#endif
{
  struct rx_nfa_state * n = (struct rx_nfa_state *)malloc (sizeof (*n));
  if (!n)
    return 0;
  rx_bzero ((char *)n, sizeof (*n));
  n->next = rx->nfa_states;
  rx->nfa_states = n;
  return n;
}


#ifdef __STDC__
static void
rx_free_nfa_state (struct rx_nfa_state * n)
#else
static void
rx_free_nfa_state (n)
  struct rx_nfa_state * n;
#endif
{
  free ((char *)n);
}


/* This adds an edge between two nodes, but doesn't initialize the 
 * edge label.
 */

#ifdef __STDC__
struct rx_nfa_edge * 
rx_nfa_edge (struct rx *rx,
	     enum rx_nfa_etype type,
	     struct rx_nfa_state *start,
	     struct rx_nfa_state *dest)
#else
struct rx_nfa_edge * 
rx_nfa_edge (rx, type, start, dest)
     struct rx *rx;
     enum rx_nfa_etype type;
     struct rx_nfa_state *start;
     struct rx_nfa_state *dest;
#endif
{
  struct rx_nfa_edge *e;
  e = (struct rx_nfa_edge *)malloc (sizeof (*e));
  if (!e)
    return 0;
  e->next = start->edges;
  start->edges = e;
  e->type = type;
  e->dest = dest;
  return e;
}


#ifdef __STDC__
static void
rx_free_nfa_edge (struct rx_nfa_edge * e)
#else
static void
rx_free_nfa_edge (e)
     struct rx_nfa_edge * e;
#endif
{
  free ((char *)e);
}


/* This constructs a POSSIBLE_FUTURE, which is a kind epsilon-closure
 * of an NFA.  These are added to an nfa automaticly by eclose_nfa.
 */  

#ifdef __STDC__
static struct rx_possible_future * 
rx_possible_future (struct rx * rx,
		 struct rx_se_list * effects)
#else
static struct rx_possible_future * 
rx_possible_future (rx, effects)
     struct rx * rx;
     struct rx_se_list * effects;
#endif
{
  struct rx_possible_future *ec;
  ec = (struct rx_possible_future *) malloc (sizeof (*ec));
  if (!ec)
    return 0;
  ec->destset = 0;
  ec->next = 0;
  ec->effects = effects;
  return ec;
}


#ifdef __STDC__
static void
rx_free_possible_future (struct rx_possible_future * pf)
#else
static void
rx_free_possible_future (pf)
     struct rx_possible_future * pf;
#endif
{
  free ((char *)pf);
}


#ifdef __STDC__
static void
rx_free_nfa_graph (struct rx *rx)
#else
static void
rx_free_nfa_graph (rx)
     struct rx *rx;
#endif
{
  while (rx->nfa_states)
    {
      while (rx->nfa_states->edges)
	{
	  switch (rx->nfa_states->edges->type)
	    {
	    case ne_cset:
	      rx_free_cset (rx->nfa_states->edges->params.cset);
	      break;
	    default:
	      break;
	    }
	  {
	    struct rx_nfa_edge * e;
	    e = rx->nfa_states->edges;
	    rx->nfa_states->edges = rx->nfa_states->edges->next;
	    rx_free_nfa_edge (e);
	  }
	} /* while (rx->nfa_states->edges) */
      {
	/* Iterate over the partial epsilon closures of rx->nfa_states */
	struct rx_possible_future * pf = rx->nfa_states->futures;
	while (pf)
	  {
	    struct rx_possible_future * pft = pf;
	    pf = pf->next;
	    rx_free_possible_future (pft);
	  }
      }
      {
	struct rx_nfa_state *n;
	n = rx->nfa_states;
	rx->nfa_states = rx->nfa_states->next;
	rx_free_nfa_state (n);
      }
    }
}



/* {Translating a Syntax Tree into an NFA}
 *
 */


/* This is the Thompson regexp->nfa algorithm. 
 * It is modified to allow for `side-effect epsilons.'  Those are
 * edges that are taken whenever a similarly placed epsilon edge 
 * would be, but which also imply that some side effect occurs 
 * when the edge is taken.
 *
 * Side effects are used to model parts of the pattern langauge 
 * that are not regular.
 */

#ifdef __STDC__
int
rx_build_nfa (struct rx *rx,
	      struct rexp_node *rexp,
	      struct rx_nfa_state **start,
	      struct rx_nfa_state **end)
#else
int
rx_build_nfa (rx, rexp, start, end)
     struct rx *rx;
     struct rexp_node *rexp;
     struct rx_nfa_state **start;
     struct rx_nfa_state **end;
#endif
{
  struct rx_nfa_edge *edge;

  /* Start & end nodes may have been allocated by the caller. */
  *start = *start ? *start : rx_nfa_state (rx);

  if (!*start)
    return 0;

  if (!rexp)
    {
      *end = *start;
      return 1;
    }

  *end = *end ? *end : rx_nfa_state (rx);

  if (!*end)
    {
      rx_free_nfa_state (*start);
      return 0;
    }

  switch (rexp->type)
    {
    case r_cset:
      edge = rx_nfa_edge (rx, ne_cset, *start, *end);
      (*start)->has_cset_edges = 1;
      if (!edge)
	return 0;
      edge->params.cset = rx_copy_cset (rx->local_cset_size,
					rexp->params.cset);
      if (!edge->params.cset)
	{
	  rx_free_nfa_edge (edge);
	  return 0;
	}
      return 1;

    case r_string:
      {
	if (rexp->params.cstr.len == 1)
	  {
	    edge = rx_nfa_edge (rx, ne_cset, *start, *end);
	    (*start)->has_cset_edges = 1;
	    if (!edge)
	      return 0;
	    edge->params.cset = rx_cset (rx->local_cset_size);
	    if (!edge->params.cset)
	      {
		rx_free_nfa_edge (edge);
		return 0;
	      }
	    RX_bitset_enjoin (edge->params.cset, rexp->params.cstr.contents[0]);
	    return 1;
	  }
	else
	  {
	    struct rexp_node copied;
	    struct rx_nfa_state * shared;

	    copied = *rexp;
	    shared = 0;
	    copied.params.cstr.len--;
	    copied.params.cstr.contents++;
	    if (!rx_build_nfa (rx, &copied, &shared, end))
	      return 0;
	    copied.params.cstr.len = 1;
	    copied.params.cstr.contents--;
	    return rx_build_nfa (rx, &copied, start, &shared);
	  }
      }
 
    case r_opt:
      return (rx_build_nfa (rx, rexp->params.pair.left, start, end)
	      && rx_nfa_edge (rx, ne_epsilon, *start, *end));

    case r_plus:
      {
	struct rx_nfa_state * star_start = 0;
	struct rx_nfa_state * star_end = 0;
	struct rx_nfa_state * shared;

	shared = 0;
	if (!rx_build_nfa (rx, rexp->params.pair.left, start, &shared))
	  return 0;
	return (rx_build_nfa (rx, rexp->params.pair.left,
			      &star_start, &star_end)
		&& star_start
		&& star_end
		&& rx_nfa_edge (rx, ne_epsilon, star_start, star_end)
		&& rx_nfa_edge (rx, ne_epsilon, shared, star_start)
		&& rx_nfa_edge (rx, ne_epsilon, star_end, *end)
		&& rx_nfa_edge (rx, ne_epsilon, star_end, star_start));
      }

    case r_interval:
    case r_star:
      {
	struct rx_nfa_state * star_start = 0;
	struct rx_nfa_state * star_end = 0;
	return (rx_build_nfa (rx, rexp->params.pair.left,
			      &star_start, &star_end)
		&& star_start
		&& star_end
		&& rx_nfa_edge (rx, ne_epsilon, star_start, star_end)
		&& rx_nfa_edge (rx, ne_epsilon, *start, star_start)
		&& rx_nfa_edge (rx, ne_epsilon, star_end, *end)

		&& rx_nfa_edge (rx, ne_epsilon, star_end, star_start));
      }

    case r_cut:
      {
	struct rx_nfa_state * cut_end = 0;

	cut_end = rx_nfa_state (rx);
	if (!(cut_end && rx_nfa_edge (rx, ne_epsilon, *start, cut_end)))
	  {
	    rx_free_nfa_state (*start);
	    rx_free_nfa_state (*end);
	    if (cut_end)
	      rx_free_nfa_state (cut_end);
	    return 0;
	  }
	cut_end->is_final = rexp->params.intval;
	return 1;
      }

    case r_parens:
      return rx_build_nfa (rx, rexp->params.pair.left, start, end);

    case r_concat:
      {
	struct rx_nfa_state *shared = 0;
	return
	  (rx_build_nfa (rx, rexp->params.pair.left, start, &shared)
	   && rx_build_nfa (rx, rexp->params.pair.right, &shared, end));
      }

    case r_alternate:
      {
	struct rx_nfa_state *ls = 0;
	struct rx_nfa_state *le = 0;
	struct rx_nfa_state *rs = 0;
	struct rx_nfa_state *re = 0;
	return (rx_build_nfa (rx, rexp->params.pair.left, &ls, &le)
		&& rx_build_nfa (rx, rexp->params.pair.right, &rs, &re)
		&& rx_nfa_edge (rx, ne_epsilon, *start, ls)
		&& rx_nfa_edge (rx, ne_epsilon, *start, rs)
		&& rx_nfa_edge (rx, ne_epsilon, le, *end)
		&& rx_nfa_edge (rx, ne_epsilon, re, *end));
      }

    case r_context:
      edge = rx_nfa_edge (rx, ne_side_effect, *start, *end);
      if (!edge)
	return 0;
      edge->params.side_effect = (void *)rexp->params.intval;
      return 1;
    }

  /* this should never happen */
  return 0;
}


/* {Low Level Data Structures for the Static NFA->SuperNFA Analysis}
 *
 * There are side effect lists -- lists of side effects occuring
 * along an uninterrupted, acyclic path of side-effect epsilon edges.
 * Such paths are collapsed to single edges in the course of computing
 * epsilon closures.  The resulting single edges are labled with a list 
 * of all the side effects from the original multi-edge path.  Equivalent
 * lists of side effects are made == by the constructors below.
 *
 * There are also nfa state sets.  These are used to hold a list of all
 * states reachable from a starting state for a given type of transition
 * and side effect list.   These are also hash-consed.
 */



/* The next several functions compare, construct, etc. lists of side
 * effects.  See ECLOSE_NFA (below) for details.
 */

/* Ordering of rx_se_list
 * (-1, 0, 1 return value convention).
 */

#ifdef __STDC__
static int 
se_list_cmp (void * va, void * vb)
#else
static int 
se_list_cmp (va, vb)
     void * va;
     void * vb;
#endif
{
  struct rx_se_list * a = (struct rx_se_list *)va;
  struct rx_se_list * b = (struct rx_se_list *)vb;

  return ((va == vb)
	  ? 0
	  : (!va
	     ? -1
	     : (!vb
		? 1
		: ((long)a->car < (long)b->car
		   ? 1
		   : ((long)a->car > (long)b->car
		      ? -1
		      : se_list_cmp ((void *)a->cdr, (void *)b->cdr))))));
}


#ifdef __STDC__
static int 
se_list_equal (void * va, void * vb)
#else
static int 
se_list_equal (va, vb)
     void * va;
     void * vb;
#endif
{
  return !(se_list_cmp (va, vb));
}

static struct rx_hash_rules se_list_hash_rules = { se_list_equal, 0, 0, 0, 0 };


#ifdef __STDC__
static struct rx_se_list * 
side_effect_cons (struct rx * rx,
		  void * se, struct rx_se_list * list)
#else
static struct rx_se_list * 
side_effect_cons (rx, se, list)
     struct rx * rx;
     void * se;
     struct rx_se_list * list;
#endif
{
  struct rx_se_list * l;
  l = ((struct rx_se_list *) malloc (sizeof (*l)));
  if (!l)
    return 0;
  l->car = se;
  l->cdr = list;
  return l;
}


#ifdef __STDC__
static struct rx_se_list *
hash_cons_se_prog (struct rx * rx,
		   struct rx_hash * memo,
		   void * car, struct rx_se_list * cdr)
#else
static struct rx_se_list *
hash_cons_se_prog (rx, memo, car, cdr)
     struct rx * rx;
     struct rx_hash * memo;
     void * car;
     struct rx_se_list * cdr;
#endif
{
  long hash = (long)car ^ (long)cdr;
  struct rx_se_list template;

  template.car = car;
  template.cdr = cdr;
  {
    struct rx_hash_item * it = rx_hash_store (memo, hash,
					      (void *)&template,
					      &se_list_hash_rules);
    if (!it)
      return 0;
    if (it->data == (void *)&template)
      {
	struct rx_se_list * consed;
	consed = (struct rx_se_list *) malloc (sizeof (*consed));
	*consed = template;
	it->data = (void *)consed;
      }
    return (struct rx_se_list *)it->data;
  }
}
     

#ifdef __STDC__
static struct rx_se_list *
hash_se_prog (struct rx * rx, struct rx_hash * memo, struct rx_se_list * prog)
#else
static struct rx_se_list *
hash_se_prog (rx, memo, prog)
     struct rx * rx;
     struct rx_hash * memo;
     struct rx_se_list * prog;
#endif
{
  struct rx_se_list * answer = 0;
  while (prog)
    {
      answer = hash_cons_se_prog (rx, memo, prog->car, answer);
      if (!answer)
	return 0;
      prog = prog->cdr;
    }
  return answer;
}



/* {Constructors, etc. for NFA State Sets}
 */

#ifdef __STDC__
static int 
nfa_set_cmp (void * va, void * vb)
#else
static int 
nfa_set_cmp (va, vb)
     void * va;
     void * vb;
#endif
{
  struct rx_nfa_state_set * a = (struct rx_nfa_state_set *)va;
  struct rx_nfa_state_set * b = (struct rx_nfa_state_set *)vb;

  return ((va == vb)
	  ? 0
	  : (!va
	     ? -1
	     : (!vb
		? 1
		: (a->car->id < b->car->id
		   ? 1
		   : (a->car->id > b->car->id
		      ? -1
		      : nfa_set_cmp ((void *)a->cdr, (void *)b->cdr))))));
}

#ifdef __STDC__
static int 
nfa_set_equal (void * va, void * vb)
#else
static int 
nfa_set_equal (va, vb)
     void * va;
     void * vb;
#endif
{
  return !nfa_set_cmp (va, vb);
}

static struct rx_hash_rules nfa_set_hash_rules = { nfa_set_equal, 0, 0, 0, 0 };


#ifdef __STDC__
static struct rx_nfa_state_set * 
nfa_set_cons (struct rx * rx,
	      struct rx_hash * memo, struct rx_nfa_state * state,
	      struct rx_nfa_state_set * set)
#else
static struct rx_nfa_state_set * 
nfa_set_cons (rx, memo, state, set)
     struct rx * rx;
     struct rx_hash * memo;
     struct rx_nfa_state * state;
     struct rx_nfa_state_set * set;
#endif
{
  struct rx_nfa_state_set template;
  struct rx_hash_item * node;
  template.car = state;
  template.cdr = set;
  node = rx_hash_store (memo,
			(((long)state) >> 8) ^ (long)set,
			&template, &nfa_set_hash_rules);
  if (!node)
    return 0;
  if (node->data == &template)
    {
      struct rx_nfa_state_set * l;
      l = (struct rx_nfa_state_set *) malloc (sizeof (*l));
      node->data = (void *) l;
      if (!l)
	return 0;
      *l = template;
    }
  return (struct rx_nfa_state_set *)node->data;
}


#ifdef __STDC__
static struct rx_nfa_state_set * 
nfa_set_enjoin (struct rx * rx,
		struct rx_hash * memo, struct rx_nfa_state * state,
		struct rx_nfa_state_set * set)
#else
static struct rx_nfa_state_set * 
nfa_set_enjoin (rx, memo, state, set)
     struct rx * rx;
     struct rx_hash * memo;
     struct rx_nfa_state * state;
     struct rx_nfa_state_set * set;
#endif
{
  if (!set || (state->id < set->car->id))
    return nfa_set_cons (rx, memo, state, set);
  if (state->id == set->car->id)
    return set;
  else
    {
      struct rx_nfa_state_set * newcdr
	= nfa_set_enjoin (rx, memo, state, set->cdr);
      if (newcdr != set->cdr)
	set = nfa_set_cons (rx, memo, set->car, newcdr);
      return set;
    }
}


/* {Computing Epsilon/Side-effect Closures.}
 */

struct eclose_frame
{
  struct rx_se_list *prog_backwards;
};


/* This is called while computing closures for "outnode".
 * The current node in the traversal is "node".
 * "frame" contains a list of a all side effects between 
 * "outnode" and "node" from most to least recent.
 *
 * Explores forward from "node", adding new possible
 * futures to outnode.
 *
 * Returns 0 on allocation failure.
 */

#ifdef __STDC__
static int 
eclose_node (struct rx *rx, struct rx_nfa_state *outnode,
	     struct rx_nfa_state *node, struct eclose_frame *frame)
#else
static int 
eclose_node (rx, outnode, node, frame)
     struct rx *rx;
     struct rx_nfa_state *outnode;
     struct rx_nfa_state *node;
     struct eclose_frame *frame;
#endif
{
  struct rx_nfa_edge *e = node->edges;

  /* For each node, we follow all epsilon paths to build the closure.
   * The closure omits nodes that have only epsilon edges.
   * The closure is split into partial closures -- all the states in
   * a partial closure are reached by crossing the same list of
   * of side effects (though not necessarily the same path).
   */
  if (node->mark)
    return 1;
  node->mark = 1;

  /* If "node" has more than just epsilon and 
   * and side-effect transitions (id >= 0), or is final,
   * then it has to be added to the possible futures
   * of "outnode".
   */
  if (node->id >= 0 || node->is_final)
    {
      struct rx_possible_future **ec;
      struct rx_se_list * prog_in_order;
      int cmp;

      prog_in_order = ((struct rx_se_list *)hash_se_prog (rx,
							  &rx->se_list_memo,
							  frame->prog_backwards));

      ec = &outnode->futures;

      while (*ec)
	{
	  cmp = se_list_cmp ((void *)(*ec)->effects, (void *)prog_in_order);
	  if (cmp <= 0)
	    break;
	  ec = &(*ec)->next;
	}

      if (!*ec || (cmp < 0))
	{
	  struct rx_possible_future * pf;
	  pf = rx_possible_future (rx, prog_in_order);
	  if (!pf)
	    return 0;
	  pf->next = *ec;
	  *ec = pf;
	}
      if (node->id >= 0)
	{
	  (*ec)->destset = nfa_set_enjoin (rx, &rx->set_list_memo,
					   node, (*ec)->destset);
	  if (!(*ec)->destset)
	    return 0;
	}
    }

  /* Recurse on outgoing epsilon and side effect nodes.
   */
  while (e)
    {
      switch (e->type)
	{
	case ne_epsilon:
	  if (!eclose_node (rx, outnode, e->dest, frame))
	    return 0;
	  break;
	case ne_side_effect:
	  {
	    frame->prog_backwards = side_effect_cons (rx, 
						      e->params.side_effect,
						      frame->prog_backwards);
	    if (!frame->prog_backwards)
	      return 0;
	    if (!eclose_node (rx, outnode, e->dest, frame))
	      return 0;
	    {
	      struct rx_se_list * dying = frame->prog_backwards;
	      frame->prog_backwards = frame->prog_backwards->cdr;
	      free ((char *)dying);
	    }
	    break;
	  }
	default:
	  break;
	}
      e = e->next;
    }
  node->mark = 0;
  return 1;
}


#ifdef __STDC__
struct rx_possible_future *
rx_state_possible_futures (struct rx * rx, struct rx_nfa_state * n)
#else
struct rx_possible_future *
rx_state_possible_futures (rx, n)
     struct rx * rx;
     struct rx_nfa_state * n;
#endif
{
  if (n->futures_computed)
    return n->futures;
  else
    {
      struct eclose_frame frame;
      frame.prog_backwards = 0;
      if (!eclose_node (rx, n, n, &frame))
	return 0;
      else
	{
	  n->futures_computed = 1;
	  return n->futures;
	}
    }
}



/* {Storing the NFA in a Contiguous Region of Memory}
 */



#ifdef __STDC__
static void 
se_memo_freer (struct rx_hash_item * node)
#else
static void 
se_memo_freer (node)
     struct rx_hash_item * node;
#endif
{
  free ((char *)node->data);
}


#ifdef __STDC__
static void 
nfa_set_freer (struct rx_hash_item * node)
#else
static void 
nfa_set_freer (node)
     struct rx_hash_item * node;
#endif
{
  free ((char *)node->data);
}

#ifdef __STDC__
void
rx_free_nfa (struct rx *rx)
#else
void
rx_free_nfa (rx)
     struct rx *rx;
#endif
{
  rx_free_hash_table (&rx->se_list_memo, se_memo_freer, &se_list_hash_rules);
  rx_bzero ((char *)&rx->se_list_memo, sizeof (rx->se_list_memo));
  rx_free_hash_table (&rx->set_list_memo, nfa_set_freer, &nfa_set_hash_rules);
  rx_bzero ((char *)&rx->set_list_memo, sizeof (rx->set_list_memo));
  rx_free_nfa_graph (rx);
}

These are the contents of the former NiCE NeXT User Group NeXTSTEP/OpenStep software archive, currently hosted by Netfuture.ch.