rxsuper.h

This is rxsuper.h in view mode; [Download] [Up]
/* classes: h_files */

#ifndef RXSUPERH
#define RXSUPERH

/*	Copyright (C) 1995, 1996 Tom Lord
 * 
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Library General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Library General Public License for more details.
 * 
 * You should have received a copy of the GNU Library General Public License
 * along with this software; see the file COPYING.  If not, write to
 * the Free Software Foundation, 59 Temple Place - Suite 330, 
 * Boston, MA 02111-1307, USA. 
 */

/*  lord	Sun May  7 12:40:17 1995	*/



#include "rxnfa.h"



/* This begins the description of the superstate NFA.
 *
 * The superstate NFA corresponds to the NFA in these ways:
 *
 * Superstate states correspond to sets of NFA states (nfa_states(SUPER)),
 *
 * Superstate edges correspond to NFA paths.
 *
 * The superstate has no epsilon transitions;
 * every edge has a character label, and a (possibly empty) side
 * effect label.   The side effect label corresponds to a list of
 * side effects that occur in the NFA.  These parts are referred
 * to as:   superedge_character(EDGE) and superedge_sides(EDGE).
 *
 * For a superstate edge EDGE starting in some superstate SUPER,
 * the following is true (in pseudo-notation :-):
 *
 *       exists DEST in nfa_states s.t. 
 *         exists nfaEDGE in nfa_edges s.t.
 *                 origin (nfaEDGE) == DEST
 *              && origin (nfaEDGE) is a member of nfa_states(SUPER)
 *              && exists PF in possible_futures(dest(nfaEDGE)) s.t.
 * 	                sides_of_possible_future (PF) == superedge_sides (EDGE)
 *
 * also:
 *
 *      let SUPER2 := superedge_destination(EDGE)
 *          nfa_states(SUPER2)
 *           == union of all nfa state sets S s.t.
 *                          exists PF in possible_futures(dest(nfaEDGE)) s.t.
 * 	                       sides_of_possible_future (PF) == superedge_sides (EDGE)
 *                          && S == dests_of_possible_future (PF) }
 *
 * Or in english, every superstate is a set of nfa states.  A given
 * character and a superstate implies many transitions in the NFA --
 * those that begin with an edge labeled with that character from a
 * state in the set corresponding to the superstate.
 * 
 * The destinations of those transitions each have a set of possible
 * futures.  A possible future is a list of side effects and a set of
 * destination NFA states.  Two sets of possible futures can be
 * `merged' by combining all pairs of possible futures that have the
 * same side effects.  A pair is combined by creating a new future
 * with the same side effect but the union of the two destination sets.
 * In this way, all the possible futures suggested by a superstate
 * and a character can be merged into a set of possible futures where
 * no two elements of the set have the same set of side effects.
 *
 * The destination of a possible future, being a set of NFA states, 
 * corresponds to a supernfa state.  So, the merged set of possible
 * futures we just created can serve as a set of edges in the
 * supernfa.
 *
 * The representation of the superstate nfa and the nfa is critical.
 * The nfa has to be compact, but has to facilitate the rapid
 * computation of missing superstates.  The superstate nfa has to 
 * be fast to interpret, lazilly constructed, and bounded in space.
 *
 * To facilitate interpretation, the superstate data structures are 
 * peppered with `instruction frames'.  There is an instruction set
 * defined below which matchers using the supernfa must be able to
 * interpret.
 *
 * We'd like to make it possible but not mandatory to use code
 * addresses to represent instructions (c.f. gcc's computed goto).
 * Therefore, we define an enumerated type of opcodes, and when
 * writing one of these instructions into a data structure, use
 * the opcode as an index into a table of instruction values.
 * 
 * Below are the opcodes that occur in the superstate nfa.
 *
 * The descriptions of the opcodes refer to data structures that are
 * described further below. 
 */

enum rx_opcode
{
  /* 
   * BACKTRACK_POINT is invoked when a character transition in 
   * a superstate leads to more than one edge.  In that case,
   * the edges have to be explored independently using a backtracking
   * strategy.
   *
   * A BACKTRACK_POINT instruction is stored in a superstate's 
   * transition table for some character when it is known that that
   * character crosses more than one edge.  On encountering this
   * instruction, the matcher saves enough state to backtrack to this
   * point later in the match.
   */
  rx_backtrack_point = 0,	/* data is (struct transition_class *) */

  /* 
   * RX_DO_SIDE_EFFECTS evaluates the side effects of an epsilon path.
   * There is one occurence of this instruction per rx_distinct_future.
   * This instruction is skipped if a rx_distinct_future has no side effects.
   */
  rx_do_side_effects = rx_backtrack_point + 1,

  /* data is (struct rx_distinct_future *) */

  /* 
   * RX_CACHE_MISS instructions are stored in rx_distinct_futures whose
   * destination superstate has been reclaimed (or was never built).
   * It recomputes the destination superstate.
   * RX_CACHE_MISS is also stored in a superstate transition table before
   * any of its edges have been built.
   */
  rx_cache_miss = rx_do_side_effects + 1,
  /* data is (struct rx_distinct_future *) */

  /* 
   * RX_NEXT_CHAR is called to consume the next character and take the
   * corresponding transition.  This is the only instruction that uses 
   * the DATA field of the instruction frame instead of DATA_2.
   * The comments about rx_inx explain this further.
   */
  rx_next_char = rx_cache_miss + 1, /* data is (struct superstate *) */

  /* RX_BACKTRACK indicates that a transition fails.  Don't
   * confuse this with rx_backtrack_point.
   */
  rx_backtrack = rx_next_char + 1, /* no data */

  /* 
   * RX_ERROR_INX is stored only in places that should never be executed.
   */
  rx_error_inx = rx_backtrack + 1, /* Not supposed to occur. */

  rx_num_instructions = rx_error_inx + 1
};

/* The heart of the matcher is a `word-code-interpreter' 
 * (like a byte-code interpreter, except that instructions
 * are a full word wide).
 *
 * Instructions are not stored in a vector of code, instead,
 * they are scattered throughout the data structures built
 * by the regexp compiler and the matcher.  One word-code instruction,
 * together with the arguments to that instruction, constitute
 * an instruction frame (struct rx_inx).
 *
 * This structure type is padded by hand to a power of 2 because
 * in one of the dominant cases, we dispatch by indexing a table
 * of instruction frames.  If that indexing can be accomplished
 * by just a shift of the index, we're happy.
 *
 * Instructions take at most one argument, but there are two
 * slots in an instruction frame that might hold that argument.
 * These are called data and data_2.  The data slot is only
 * used for one instruction (RX_NEXT_CHAR).  For all other 
 * instructions, data should be set to 0.
 *
 * RX_NEXT_CHAR is the most important instruction by far.
 * By reserving the data field for its exclusive use, 
 * instruction dispatch is sped up in that case.  There is
 * no need to fetch both the instruction and the data,
 * only the data is needed.  In other words, a `cycle' begins
 * by fetching the field data.  If that is non-0, then it must
 * be the destination state of a next_char transition, so
 * make that value the current state, advance the match position
 * by one character, and start a new cycle.  On the other hand,
 * if data is 0, fetch the instruction and do a more complicated
 * dispatch on that.
 */

struct rx_inx 
{
  void * data;
  void * data_2;
  void * inx;
  void * fnord;
};

#ifndef RX_TAIL_ARRAY
#define RX_TAIL_ARRAY  1
#endif

/* A superstate corresponds to a set of nfa states.  Those sets are
 * represented by STRUCT RX_SUPERSET.  The constructors
 * guarantee that only one (shared) structure is created for a given set.
 */
struct rx_superset
{
  int refs;			/* This is a reference counted structure. */

  /* We keep these sets in a cache because (in an unpredictable way),
   * the same set is often created again and again.  
   *
   * When an NFA is destroyed, some of the supersets for that NFA
   * may still exist.  This can lead to false cache hits -- an apparent cache
   * hit on a superset that properly belongs to an already free NFA.
   *
   * When a cache hit appears to occur, we will have in hand the
   * nfa for which it may have happened.  Every nfa is given
   * its own sequence number.  The cache is validated
   * by comparing the nfa sequence number to this field:
   */
  int id;

  struct rx_nfa_state * car;	/* May or may not be a valid addr. */
  struct rx_superset * cdr;

  /* If the corresponding superstate exists: */
  struct rx_superstate * superstate;

  /* That is_final field of the constiuent nfa states which has the greatest magnitude. */
  int is_final;

  /* The OR of the corresponding fields of the constiuent nfa states. */
  int has_cset_edges;


  /* There is another bookkeeping problem.  It is expensive to 
   * compute the starting nfa state set for an nfa.  So, once computed,
   * it is cached in the `struct rx'.
   *
   * But, the state set can be flushed from the superstate cache.
   * When that happens, the cached value in the `struct rx' has
   * to be flushed.
   */
  struct rx * starts_for;

  /* This is used to link into a hash bucket so these objects can
   * be `hash-consed'.
   */
  struct rx_hash_item hash_item;
};

#define rx_protect_superset(RX,CON) (++(CON)->refs)

/* The terminology may be confusing (rename this structure?).
 * Every character occurs in at most one rx_super_edge per super-state.
 * But, that structure might have more than one option, indicating a point
 * of non-determinism. 
 *
 * In other words, this structure holds a list of superstate edges
 * sharing a common starting state and character label.  The edges
 * are in the field OPTIONS.  All superstate edges sharing the same
 * starting state and character are in this list.
 */
struct rx_super_edge
{
  struct rx_super_edge *next;
  struct rx_inx rx_backtrack_frame;
  int cset_size;
  rx_Bitset cset;
  struct rx_distinct_future *options;
};

/* A superstate is a set of nfa states (RX_SUPERSET) along
 * with a transition table.  Superstates are built on demand and reclaimed
 * without warning.  To protect a superstate from this ghastly fate,
 * use LOCK_SUPERSTATE. 
 */
struct rx_superstate
{
  int rx_id;			/* c.f. the id field of rx_superset */
  int locks;			/* protection from reclamation */

  /* Within a superstate cache, all the superstates are kept in a big
   * queue.  The tail of the queue is the state most likely to be
   * reclaimed.  The *recyclable fields hold the queue position of 
   * this state.
   */
  struct rx_superstate * next_recyclable;
  struct rx_superstate * prev_recyclable;

  /* The supernfa edges that exist in the cache and that have
   * this state as their destination are kept in this list:
   */
  struct rx_distinct_future * transition_refs;

  /* The list of nfa states corresponding to this superstate: */
  struct rx_superset * contents;

  /* The list of edges in the cache beginning from this state. */
  struct rx_super_edge * edges;

  /* A tail of the recyclable queue is marked as semifree.  A semifree
   * state has no incoming next_char transitions -- any transition
   * into a semifree state causes a complex dispatch with the side
   * effect of rescuing the state from its semifree state into a 
   * fully free state at the head of the queue.
   *
   * An alternative to this might be to make next_char more expensive,
   * and to move a state to the head of the recyclable queue whenever
   * it is entered.  That way, popular states would never be recycled.
   *
   * But unilaterally making next_char more expensive actually loses.
   * So, incoming transitions are only made expensive for states near
   * the tail of the recyclable queue.  The more cache contention
   * there is, the more frequently a state will have to prove itself
   * and be moved back to the front of the queue.  If there is less 
   * contention, then popular states just aggregate in the front of 
   * the queue and stay there.
   */
  int is_semifree;


  /* This keeps track of the size of the transition table for this
   * state.  There is a half-hearted attempt to support variable sized
   * superstates.
   */
  int trans_size;

  /* Indexed by characters... */
  struct rx_inx transitions[RX_TAIL_ARRAY];
};


/* A list of distinct futures define the edges that leave from a 
 * given superstate on a given character.  c.f. rx_super_edge.
 */
struct rx_distinct_future
{
  struct rx_distinct_future * next_same_super_edge[2];
  struct rx_distinct_future * next_same_dest;
  struct rx_distinct_future * prev_same_dest;
  struct rx_superstate * present;	/* source state */
  struct rx_superstate * future;	/* destination state */
  struct rx_super_edge * edge;


  /* The future_frame holds the instruction that should be executed
   * after all the side effects are done, when it is time to complete
   * the transition to the next state.
   *
   * Normally this is a next_char instruction, but it may be a
   * cache_miss instruction as well, depending on whether or not
   * the superstate is in the cache and semifree.
   * 
   * If this is the only future for a given superstate/char, and
   * if there are no side effects to be performed, this frame is
   * not used (directly) at all.  Instead, its contents are copied
   * into the transition table of the starting state of this dist. future
   * (a sort of goto elimination).
   */
  struct rx_inx future_frame;

  struct rx_inx side_effects_frame;
  struct rx_se_list * effects;
};

#define rx_lock_superstate(R,S)  ((S)->locks++)
#define rx_unlock_superstate(R,S) (--(S)->locks)

struct rx_cache;

#ifdef __STDC__
typedef void (*rx_morecore_fn)(struct rx_cache *);
#else
typedef void (*rx_morecore_fn)();
#endif

struct rx_cache
{
  struct rx_hash_rules superset_hash_rules;

  struct rx_superstate * lru_superstate;
  struct rx_superstate * semifree_superstate;

  struct rx_superset * empty_superset;

  int superstates;
  int semifree_superstates;
  int hits;
  int misses;

  int bytes_allowed;
  int bytes_used;

  int local_cset_size;
  void ** instruction_table;

  struct rx_hash superset_table;
};

#ifndef RX_DEFAULT_DFA_CACHE_SIZE
/* This is an upper bound on the number of bytes that may (normally)
 * be allocated for DFA states.  If this threshold would be exceeded,
 * Rx tries to flush some DFA states from the cache.
 *
 * This value is used whenever the rx_default_cache is used (for example,
 * with the Posix entry points).
 */
#define RX_DEFAULT_DFA_CACHE_SIZE (1 << 19)
#endif

extern struct rx_cache * rx_default_cache;


#ifdef __STDC__
extern char * rx_cache_malloc (struct rx_cache * cache, int size);
extern void rx_cache_free (struct rx_cache * cache, int size, char * mem);
extern void rx_release_superset (struct rx *rx,
				 struct rx_superset *set);
extern struct rx_superset * rx_superset_cons (struct rx * rx,
					      struct rx_nfa_state *car, struct rx_superset *cdr);
extern struct rx_superset * rx_superstate_eclosure_union (struct rx * rx, struct rx_superset *set, struct rx_nfa_state_set *ecl) ;
extern struct rx_superstate * rx_superstate (struct rx *rx,
					     struct rx_superset *set);
extern struct rx_inx * rx_handle_cache_miss (struct rx *rx, struct rx_superstate *super, unsigned char chr, void *data) ;

#else /* STDC */
extern char * rx_cache_malloc ();
extern void rx_cache_free ();
extern void rx_release_superset ();
extern struct rx_superset * rx_superset_cons ();
extern struct rx_superset * rx_superstate_eclosure_union ();
extern struct rx_superstate * rx_superstate ();
extern struct rx_inx * rx_handle_cache_miss ();

#endif /* STDC */

#endif  /* RXSUPERH */
These are the contents of the former NiCE NeXT User Group NeXTSTEP/OpenStep software archive, currently hosted by Netfuture.ch.