ftp.nice.ch/Attic/openStep/developer/resources/MiscKit.2.0.5.s.gnutar.gz#/MiscKit2/Temp/regex/framework/rxanal.c

This is rxanal.c in view mode; [Download] [Up]

/*	Copyright (C) 1995, 1996 Tom Lord
 * 
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Library General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Library General Public License for more details.
 * 
 * You should have received a copy of the GNU Library General Public License
 * along with this software; see the file COPYING.  If not, write to
 * the Free Software Foundation, 59 Temple Place - Suite 330, 
 * Boston, MA 02111-1307, USA. 
 */



#include "rxall.h"
#include "rxanal.h"
#include "rxbitset.h"
#include "rxsuper.h"




#ifdef __STDC__
int
rx_posix_analyze_rexp (struct rexp_node *** subexps,
		       size_t * re_nsub,
		       struct rexp_node * node,
		       int id)
#else
int
rx_posix_analyze_rexp (subexps, re_nsub, node, id)
     struct rexp_node *** subexps;
     size_t * re_nsub;
     struct rexp_node * node;
     int id;
#endif
{
  if (node)
    {
      size_t this_subexp;
      if (node->type == r_parens)
	{
	  if (node->params.intval >= 0)
	    {
	      this_subexp = *re_nsub;
	      ++*re_nsub;
	      if (!*subexps)
		*subexps = (struct rexp_node **)malloc (sizeof (struct rexp_node *) * *re_nsub);
	      else
		*subexps = (struct rexp_node **)realloc (*subexps,
							 sizeof (struct rexp_node *) * *re_nsub);
	    }
	}
      if (node->params.pair.left)
	id = rx_posix_analyze_rexp (subexps, re_nsub, node->params.pair.left, id);
      if (node->params.pair.right)
	id = rx_posix_analyze_rexp (subexps, re_nsub, node->params.pair.right, id);
      switch (node->type)
	{
	case r_cset:
	  node->len = 1;
	  node->observed = 0;
	  break;
 	case r_string:
 	  node->len = node->params.cstr.len;
 	  node->observed = 0;
 	  break;
	case r_cut:
	  node->len = 0;
	  node->observed = 0;
	  break;
	case r_concat:
	case r_alternate:
	  {
	    int lob, rob;
	    int llen, rlen;
	    lob = (!node->params.pair.left ? 0 : node->params.pair.left->observed);
	    rob = (!node->params.pair.right ? 0 : node->params.pair.right->observed);
	    llen = (!node->params.pair.left ? 0 : node->params.pair.left->len);
	    rlen = (!node->params.pair.right ? 0 : node->params.pair.right->len);
	    node->len = ((llen >= 0) && (rlen >= 0)
			 ? ((node->type == r_concat)
			    ? llen + rlen
			    : ((llen == rlen) ? llen : -1))
			 : -1);
	    node->observed = lob || rob;
	    break;
	  }
	case r_opt:
	case r_star:
	case r_plus:
	  node->len = -1;
	  node->observed = (node->params.pair.left
			    ? node->params.pair.left->observed
			    : 0);
	  break;

	case  r_interval:
	  node->len = -1;
	  node->observed = 1;
	  break;

	case r_parens:
	  if (node->params.intval >= 0)
	    {
	      node->observed = 1;
	      (*subexps)[this_subexp] = node;
	    }
	  else
	    node->observed = (node->params.pair.left
			      ? node->params.pair.left->observed
			      : 0);
	  node->len = (node->params.pair.left
		       ? node->params.pair.left->len
		       : 0);
	  break;
	case r_context:
	  switch (node->params.intval)
	    {
	    default:
	      node->observed = 1;
	      node->len = -1;
	      break;
	    case '^':
	    case '$':
	    case '=':
	    case '<':
	    case '>':
	    case 'b':
	    case 'B':
	    case '`':
	    case '\'':
	      node->observed = 1;
	      node->len = 0;
	      break;
	    }
	  break;
	}
      if (node->observed)
	node->id = id++;
      return id;
    }
  return id;
}

/* Returns 0 unless the pattern can match the empty string. */
#ifdef __STDC__
int
rx_fill_in_fastmap (int cset_size, unsigned char * map, struct rexp_node * exp)
#else
int
rx_fill_in_fastmap (cset_size, map, exp)
     int cset_size;
     unsigned char * map;
     struct rexp_node * exp;
#endif
{
  if (!exp)
    {
    can_match_empty:
      {
	int x;
	for (x = 0; x < cset_size; ++x)
	  map[x] = 1;
      }
      return 1;
    }
  
  switch (exp->type)
    {
    case r_cset:
      {
	int x;
	int most;
	
	most = exp->params.cset_size;
	for (x = 0; x < most; ++x)
	  if (RX_bitset_member (exp->params.cset, x))
	    map[x] = 1;
      }
      return 0;

    case r_string:
      if (exp->params.cstr.len)
 	{
	  map[exp->params.cstr.contents[0]] = 1;
	  return 0;
 	}
      else
	return 1;

    case r_cut:
      return 1;
      

    case r_concat:
      return rx_fill_in_fastmap (cset_size, map, exp->params.pair.left);

      /* Why not the right branch?  If the left branch
       * can't be empty it doesn't matter.  If it can, then
       * the fastmap is already saturated, and again, the
       * right branch doesn't matter.
       */


    case r_alternate:
      return (  rx_fill_in_fastmap (cset_size, map, exp->params.pair.left)
	      | rx_fill_in_fastmap (cset_size, map, exp->params.pair.right));

    case r_parens:
    case r_plus:
      return rx_fill_in_fastmap (cset_size, map, exp->params.pair.left);

    case r_opt:
    case r_star:
      goto can_match_empty;
      /* Why not the subtree?  These operators already saturate
       * the fastmap. 
       */

    case r_interval:
      if (exp->params.intval == 0)
	goto can_match_empty;
      else
	return rx_fill_in_fastmap (cset_size, map, exp->params.pair.left);
      
    case r_context:
      goto can_match_empty;
    }

  /* this should never happen but gcc seems to like it */
  return 0;
  
}


#ifdef __STDC__
int
rx_is_anchored_p (struct rexp_node * exp)
#else
int
rx_is_anchored_p (exp)
     struct rexp_node * exp;
#endif
{
  if (!exp)
    return 0;

  switch (exp->type)
    {
    case r_opt:
    case r_star:
    case r_cset:
    case r_string:
    case r_cut:
      return 0;

    case r_parens:
    case r_plus:
    case r_concat:
      return rx_is_anchored_p (exp->params.pair.left);

    case r_alternate:
      return (   rx_is_anchored_p (exp->params.pair.left)
	      && rx_is_anchored_p (exp->params.pair.right));


    case r_interval:
      if (exp->params.intval == 0)
	return 0;
      else
	return rx_is_anchored_p (exp->params.pair.left);
      
    case r_context:
      return (exp->params.intval == '^');
    }

  /* this should never happen but gcc seems to like it */
  return 0;
}



#ifdef __STDC__
enum rx_answers
rx_start_superstate (struct rx_classical_system * frame)
#else
enum rx_answers
rx_start_superstate (frame)
     struct rx_classical_system * frame;
#endif
{
  struct rx_superset * start_contents;
  struct rx_nfa_state_set * start_nfa_set;

  if (frame->rx->start_set)
    start_contents = frame->rx->start_set;
  else
    {
      {
	struct rx_possible_future * futures;
	futures = rx_state_possible_futures (frame->rx, frame->rx->start_nfa_states);

	if (!futures)
	  return rx_bogus;

	if (futures->next)
	  return rx_start_state_with_too_many_futures;

	start_nfa_set = futures->destset;
      }
      
      start_contents =
	rx_superstate_eclosure_union (frame->rx,
				      rx_superset_cons (frame->rx, 0, 0),
				      start_nfa_set);
      
      if (!start_contents)
	return rx_bogus;
	    
      start_contents->starts_for = frame->rx;
      frame->rx->start_set = start_contents;
    }

  if (   start_contents->superstate
      && (start_contents->superstate->rx_id == frame->rx->rx_id))
    {
      if (frame->state)
	{
	  rx_unlock_superstate (frame->rx, frame->state);
	}
      frame->state = start_contents->superstate;
      /* The cached superstate may be in a semifree state.
       * We need to lock it and preserve the invariant
       * that a locked superstate is never semifree.
       * So refresh it.
       */
      rx_refresh_this_superstate (frame->rx->cache, frame->state);
      rx_lock_superstate (frame->rx, frame->state);
      return rx_yes;
    }
  else
    {
      struct rx_superstate * state;

      rx_protect_superset (frame->rx, start_contents);
      state = rx_superstate (frame->rx, start_contents);
      rx_release_superset (frame->rx, start_contents);
      if (!state)
	return rx_bogus;
      if (frame->state)
	{
	  rx_unlock_superstate (frame->rx, frame->state);
	}
      frame->state = state;
      rx_lock_superstate (frame->rx, frame->state);
      return rx_yes;
    }
}




#ifdef __STDC__
enum rx_answers
rx_fit_p (struct rx_classical_system * frame, unsigned const char * burst, int len)
#else
enum rx_answers
rx_fit_p (frame, burst, len)
     struct rx_classical_system * frame;
     unsigned const char * burst;
     int len;
#endif
{
  struct rx_inx * inx_table;
  struct rx_inx * inx;

  if (!frame->state)
    return rx_bogus;

  if (!len)
    {
      frame->final_tag = frame->state->contents->is_final;
      return (frame->state->contents->is_final
	      ? rx_yes
	      : rx_no);
    }

  inx_table = frame->state->transitions;
  rx_unlock_superstate (frame->rx, frame->state);

  while (len--)
    {
      struct rx_inx * next_table;

      inx = inx_table + *burst;
      next_table = (struct rx_inx *)inx->data;
      while (!next_table)
	{
	  struct rx_superstate * state;
	  state = ((struct rx_superstate *)
		   ((char *)inx_table
		    - ((unsigned long)
		       ((struct rx_superstate *)0)->transitions)));

	  switch ((int)inx->inx)
	    {
	    case rx_backtrack:
	      /* RX_BACKTRACK means that we've reached the empty
	       * superstate, indicating that match can't succeed
	       * from this point.
	       */
	      frame->state = 0;
	      return rx_no;
	    
	    case rx_cache_miss:
	      /* Because the superstate NFA is lazily constructed,
	       * and in fact may erode from underneath us, we sometimes
	       * have to construct the next instruction from the hard way.
	       * This invokes one step in the lazy-conversion.
	       */
	      inx = 
		rx_handle_cache_miss
		  (frame->rx, state, *burst, inx->data_2);

	      if (!inx)
		{
		  frame->state = 0;
		  return rx_bogus;
		}
	      next_table = (struct rx_inx *)inx->data;
	      continue;
		

	      /* No other instructions are legal here.
	       * (In particular, this function does not handle backtracking
	       * or the related instructions.)
	       */
	    default:
	      frame->state = 0;
	      return rx_bogus;
	  }
	}
      inx_table = next_table;
      ++burst;
    }

  if (inx->data_2)		/* indicates a final superstate */
    {
      frame->final_tag = (int)inx->data_2;
      frame->state = ((struct rx_superstate *)
		      ((char *)inx_table
		       - ((unsigned long)
			  ((struct rx_superstate *)0)->transitions)));
      rx_lock_superstate (frame->rx, frame->state);
      return rx_yes;
    }
  frame->state = ((struct rx_superstate *)
		  ((char *)inx_table
		   - ((unsigned long)
		      ((struct rx_superstate *)0)->transitions)));
  rx_lock_superstate (frame->rx, frame->state);
  return rx_no;
}




#ifdef __STDC__
enum rx_answers
rx_advance (struct rx_classical_system * frame, unsigned const char * burst, int len)
#else
enum rx_answers
rx_advance (frame, burst, len)
     struct rx_classical_system * frame;
     unsigned const char * burst;
     int len;
#endif
{
  struct rx_inx * inx_table;

  if (!frame->state)
    return rx_bogus;

  if (!len)
    return rx_yes;

  inx_table = frame->state->transitions;
  rx_unlock_superstate (frame->rx, frame->state);

  while (len--)
    {
      struct rx_inx * inx;
      struct rx_inx * next_table;

      inx = inx_table + *burst;
      next_table = (struct rx_inx *)inx->data;
      while (!next_table)
	{
	  struct rx_superstate * state;
	  state = ((struct rx_superstate *)
		   ((char *)inx_table
		    - ((unsigned long)
		       ((struct rx_superstate *)0)->transitions)));

	  switch ((int)inx->inx)
	    {
	    case rx_backtrack:
	      /* RX_BACKTRACK means that we've reached the empty
	       * superstate, indicating that match can't succeed
	       * from this point.
	       */
	      frame->state = 0;
	      return rx_no;
	    
	    case rx_cache_miss:
	      /* Because the superstate NFA is lazily constructed,
	       * and in fact may erode from underneath us, we sometimes
	       * have to construct the next instruction from the hard way.
	       * This invokes one step in the lazy-conversion.
	       */
	      inx = 
		rx_handle_cache_miss
		  (frame->rx, state, *burst, inx->data_2);

	      if (!inx)
		{
		  frame->state = 0;
		  return rx_bogus;
		}
	      next_table = (struct rx_inx *)inx->data;
	      continue;
		

	      /* No other instructions are legal here.
	       * (In particular, this function does not handle backtracking
	       * or the related instructions.)
	       */
	    default:
	      frame->state = 0;
	      return rx_bogus;
	  }
	}
      inx_table = next_table;
      ++burst;
    }
  
  frame->state = ((struct rx_superstate *)
		  ((char *)inx_table
		   - ((unsigned long)
		      ((struct rx_superstate *)0)->transitions)));
  rx_lock_superstate (frame->rx, frame->state);
  return rx_yes;
}



#ifdef __STDC__
int
rx_advance_to_final (struct rx_classical_system * frame, unsigned const char * burst, int len)
#else
int
rx_advance_to_final (frame, burst, len)
     struct rx_classical_system * frame;
     unsigned const char * burst;
     int len;
#endif
{
  int initial_len;
  struct rx_inx * inx_table;
  struct rx_superstate * this_state;

  if (!frame->state)
    return 0;

  if (!len)
    {
      frame->final_tag = frame->state->contents->is_final;
      return 0;
    }

  inx_table = frame->state->transitions;

  initial_len = len;

  this_state = frame->state;

  while (len--)
    {
      struct rx_inx * inx;
      struct rx_inx * next_table;

      /* this_state holds the state for the position we're
       * leaving.  this_state is locked. 
       */
      inx = inx_table + *burst;
      next_table = (struct rx_inx *)inx->data;

      while (!next_table)
	{
	  struct rx_superstate * state;

	  state = ((struct rx_superstate *)
		   ((char *)inx_table
		    - ((unsigned long)
		       ((struct rx_superstate *)0)->transitions)));
	  
	  switch ((int)inx->inx)
	    {
	    case rx_backtrack:
	      /* RX_BACKTRACK means that we've reached the empty
	       * superstate, indicating that match can't succeed
	       * from this point.
	       *
	       * Return to the state for the position prior to what
	       * we failed at, and return that position.
	       */
	      frame->state = this_state;
	      frame->final_tag = this_state->contents->is_final;
	      return (initial_len - len) - 1;
	    
	    case rx_cache_miss:
	      /* Because the superstate NFA is lazily constructed,
	       * and in fact may erode from underneath us, we sometimes
	       * have to construct the next instruction from the hard way.
	       * This invokes one step in the lazy-conversion.
	       */
	      inx = rx_handle_cache_miss
		(frame->rx, state, *burst, inx->data_2);

	      if (!inx)
		{
		  rx_unlock_superstate (frame->rx, this_state);
		  frame->state = 0;
		  return -1;
		}
	      next_table = (struct rx_inx *)inx->data;
	      continue;
		

	      /* No other instructions are legal here.
	       * (In particular, this function does not handle backtracking
	       * or the related instructions.)
	       */
	    default:
	      rx_unlock_superstate (frame->rx, this_state);
	      frame->state = 0;
	      return -1;
	  }
	}

      /* Release the superstate for the preceeding position: */
      rx_unlock_superstate (frame->rx, this_state);

      /* Compute the superstate for the new position: */
      inx_table = next_table;
      this_state = ((struct rx_superstate *)
		    ((char *)inx_table
		     - ((unsigned long)
			((struct rx_superstate *)0)->transitions)));
      
      /* Lock it (see top-of-loop invariant): */
      rx_lock_superstate (frame->rx, this_state);
      
      /* Check to see if we should stop: */
      if (this_state->contents->is_final)
	{
	  frame->final_tag = this_state->contents->is_final;
	  frame->state = this_state;
	  return (initial_len - len);
	}
      
      ++burst;
    }

  /* Consumed all of the characters. */
  frame->state = this_state;
  frame->final_tag = this_state->contents->is_final;

  /* state already locked (see top-of-loop invariant) */
  return initial_len;
}




#ifdef __STDC__
void
rx_terminate_system (struct rx_classical_system * frame)
#else
void
rx_terminate_system (frame)
     struct rx_classical_system * frame;
#endif
{
  if (frame->state)
    {
      rx_unlock_superstate (frame->rx, frame->state);
      frame->state = 0;
    }
}









#ifdef __STDC__
void
rx_init_system (struct rx_classical_system * frame, struct rx * rx)
#else
void
rx_init_system (frame, rx)
     struct rx_classical_system * frame;
     struct rx * rx;
#endif
{
  frame->rx = rx;
  frame->state = 0;
}



These are the contents of the former NiCE NeXT User Group NeXTSTEP/OpenStep software archive, currently hosted by Netfuture.ch.