ftp.nice.ch/pub/next/unix/developer/slang0.99-34.s.tar.gz#/slang/src/slregexp.c

This is slregexp.c in view mode; [Download] [Up]

/* ed style regular expressions */
/* Copyright (c) 1992, 1995 John E. Davis
 * All rights reserved.
 * 
 * You may distribute under the terms of either the GNU General Public
 * License or the Perl Artistic License.
 */


#include "config.h"

#include <stdio.h>
#include <string.h>


#include "slang.h"

#define SET_BIT(b, n) b[(unsigned int) (n) >> 3] |= 1 << ((unsigned int) (n) % 8)
#define TEST_BIT(b, n) (b[(unsigned int)(n) >> 3] & (1 << ((unsigned int) (n) % 8)))
#define LITERAL 1
#define RANGE 2			       /* [...] */
#define ANY 3			       /* . */
#define BOL 4			       /* ^ */
#define EOL 5			       /* $ */
#define NTH_MATCH 6		       /* \1 \2 ... \9 */
#define OPAREN 7		       /* \( */
#define CPAREN 0x8		       /* \) */
#define ANY_DIGIT 0x9		       /* \d */
#define BOW	0xA		       /* \< */
#define EOW	0xB		       /* \> */
#if 0
#define NOT_LITERAL		0xC	       /* \~ */
#endif
#define STAR 0x80		       /* * */
#define LEAST_ONCE 0x40		       /* + */
#define MAYBE_ONCE 0x20		       /* ? */
#define MANY 0x10		       /* {n,m} */
/* The rest are additions */
#define YES_CASE (STAR | BOL)
#define NO_CASE  (STAR | EOL)

#define UPPERCASE(x)  (cs ? (x) : UPPER_CASE(x))
#define LOWERCASE(x)  (cs ? (x) : LOWER_CASE(x))

static unsigned char Word_Chars[256];
#define IS_WORD_CHAR(x) Word_Chars[(unsigned int) (x)]

static int Next_Pos;
static SLRegexp_Type *This_Reg;
static unsigned char *This_Str;

static unsigned char *do_nth_match(int n, unsigned char *str, unsigned char *estr)
{
   unsigned char *bpos;
   if (n > Next_Pos) return (NULL);
   
   bpos = This_Reg->beg_matches[n] + This_Str;
   n = This_Reg->end_matches[n];
   if (n == 0) return(str);
   if (n > (int) (estr - str)) return (NULL);
   
   /* This needs fixed for case sensitive match */
   if (0 != strncmp((char *) str, (char *) bpos, (unsigned int) n)) return (NULL);
   str += n;
   return (str);
}


/* returns pointer to the end of regexp or NULL */
static unsigned char *regexp_looking_at(register unsigned char *str, unsigned char *estr, unsigned char *buf, register int cs)
{
   register unsigned char p, p1;
   unsigned char *save_str, *tmpstr;
   int n, n0, n1;
   int save_next = 0;
   p = *buf++;
   
   
   while (p != 0)
     {
	/* p1 = UPPERCASE(*buf); */
	/* if (str < estr) c = UPPERCASE(*str); */

	switch((unsigned char) p)
	  {
	   case BOW:
	     if ((str != This_Str)
		 && ((str >= estr)
		     || IS_WORD_CHAR(*(str - 1))
		     || (0 == IS_WORD_CHAR(*str)))) return NULL;
	     break;
	     
	   case EOW:
	     if ((str < estr)
		 && IS_WORD_CHAR (*str)) return NULL;
	     break;
	     
	   case YES_CASE: cs = 1; break;
	   case NO_CASE: cs = 0; break;

	   case OPAREN:
	     This_Reg->beg_matches[Next_Pos] = (int) (str - This_Str);
	     This_Reg->beg_matches[Next_Pos + 1] = -1;
	     break;
	   case CPAREN: 
	     This_Reg->end_matches[Next_Pos] = (unsigned int) (str - This_Str) - This_Reg->beg_matches[Next_Pos];
	     Next_Pos++;
	     break;
#ifdef NOT_LITERAL
	   case NOT_LITERAL:
	     if ((str >= estr) || (*buf == UPPERCASE(*str))) return (NULL);
	     str++; buf++;
	     break;
	     
	   case MAYBE_ONCE | NOT_LITERAL:
	     save_str = str;
	     if ((str < estr) && (*buf != UPPERCASE(*str))) str++;
	     buf++;
	     goto match_rest;

	   case NOT_LITERAL | LEAST_ONCE:   /* match at least once */
	     if ((str >= estr) || (UPPERCASE(*str) == UPPERCASE(*buf))) return (NULL);
	     str++;
	     /* drop */
	   case STAR | NOT_LITERAL:
	     save_str = str;  p1 = *buf;
	     while ((str < estr) && (UPPERCASE(*str) != p1)) str++;
	     buf++;
	     goto match_rest;
	     
	     /* this type consists of the expression + two bytes that
	        determine number of matches to perform */
	   case MANY | NOT_LITERAL:
	     p1 = *buf; buf++;
	     n = n0 = (int) (unsigned char) *buf++;
	     /* minimum number to match--- could be 0 */
	     n1 = (int) (unsigned char) *buf++;
	     /* maximum number to match */

	     while (n && (str < estr) && (p1 != *str)) 
	       {
		  n--;
		  str++;
	       }
	     if (n) return (NULL);
	     
	     save_str = str;
	     n = n1 - n0;
	     while (n && (str < estr) && (p1 != *str)) 
	       {
		  n--;
		  str++;
	       }
	     goto match_rest;
#endif				       /* NOT_LITERAL */
	   case LITERAL: 
	     if ((str >= estr) || (*buf != UPPERCASE(*str))) return (NULL);
	     str++; buf++;
	     break;
	     
	   case MAYBE_ONCE | LITERAL:
	     save_str = str;
	     if ((str < estr) && (*buf == UPPERCASE(*str))) str++;
	     buf++;
	     goto match_rest;

	   case LITERAL | LEAST_ONCE:   /* match at least once */
	     if ((str >= estr) || (UPPERCASE(*str) != UPPERCASE(*buf))) return (NULL);
	     str++;
	     /* drop */
	   case STAR | LITERAL:
	     save_str = str;  p1 = *buf;
	     while ((str < estr) && (UPPERCASE(*str) == p1)) str++;
	     buf++;
	     goto match_rest;
	     
	     /* this type consists of the expression + two bytes that
	        determine number of matches to perform */
	   case MANY | LITERAL:
	     p1 = *buf; buf++;
	     n = n0 = (int) (unsigned char) *buf++;
	     /* minimum number to match--- could be 0 */
	     n1 = (int) (unsigned char) *buf++;
	     /* maximum number to match */

	     while (n && (str < estr) && (p1 == *str)) 
	       {
		  n--;
		  str++;
	       }
	     if (n) return (NULL);
	     
	     save_str = str;
	     n = n1 - n0;
	     while (n && (str < estr) && (p1 == *str)) 
	       {
		  n--;
		  str++;
	       }
	     goto match_rest;
	     
	   case NTH_MATCH: 
	     if ((str = do_nth_match((int) (unsigned char) *buf, str, estr)) == NULL) return(NULL);
	     buf++;
	     break;
	     
	   case MAYBE_ONCE | NTH_MATCH:
	     save_str = str;
	     tmpstr = do_nth_match((int) (unsigned char) *buf, str, estr);
	     buf++;
	     if (tmpstr != NULL)
	       {
		  str = tmpstr;
		  goto match_rest;
	       }
	     continue;

	   case LEAST_ONCE | NTH_MATCH:
	     if ((str = do_nth_match((int) (unsigned char) *buf, str, estr)) == NULL) return(NULL);
	     /* drop */
	   case STAR | NTH_MATCH: 
	     save_str = str;
	     while (NULL != (tmpstr = do_nth_match((int) (unsigned char) *buf, str, estr)))
	       {
		  str = tmpstr;
	       }
	     buf++;
	     goto match_rest;
	     
	   case MANY | NTH_MATCH: return(NULL);
	     /* needs done */
	     
	   case RANGE: 
	     if (str >= estr) return (NULL);
	     if (TEST_BIT(buf, UPPERCASE(*str)) == 0) return (NULL);
	     buf += 32; str++;
	     break;
	     
	   case MAYBE_ONCE | RANGE:
	     save_str = str;
	     if ((str < estr) && TEST_BIT(buf, UPPERCASE(*str))) str++;
	     buf += 32;
	     goto match_rest;

	   case LEAST_ONCE | RANGE: 
	     if ((str >= estr) || (0 == TEST_BIT(buf, UPPERCASE(*str)))) return NULL;
	     str++;
	     /* drop */
	   case STAR | RANGE:
	     save_str = str;
	     while ((str < estr) && TEST_BIT(buf, UPPERCASE(*str))) str++;
	     buf += 32;
	     goto match_rest;
	     
	     /* The first 32 bytes correspond to the range and the two 
	      * following bytes indicate the min and max number of matches.
	      */
	   case MANY | RANGE:  
	     /* minimum number to match--- could be 0 */
	     n = n0 = (int) (unsigned char) *(buf + 32);
	     /* maximum number to match */
	     n1 = (int) (unsigned char) *(buf + 33);

	     while (n && (str < estr) && (TEST_BIT(buf, UPPERCASE(*str))))
	       {
		  n--;
		  str++;
	       }
	     if (n) return (NULL);
	     save_str = str;
	     n = n1 - n0;
	     while (n && (str < estr) && (TEST_BIT(buf, UPPERCASE(*str))))
	       {
		  n--;
		  str++;
	       }
	     buf += 34;		       /* 32 + 2 */
	     goto match_rest;
	     
	   case ANY_DIGIT:
	     if ((str >= estr) || (*str > '9') || (*str < '0')) return (NULL);
	     str++;
	     break;

	   case MAYBE_ONCE | ANY_DIGIT:
	     save_str = str;
	     if ((str < estr) && ((*str > '9') || (*str < '0'))) str++;
	     goto match_rest;

	   case LEAST_ONCE | ANY_DIGIT:
	     if ((str < estr) && ((*str > '9') || (*str < '0'))) return NULL;
	     str++;
	     /* drop */
	   case STAR | ANY_DIGIT:
	     save_str = str;
	     while ((str < estr) && ((*str <= '9') && (*str >= '0'))) str++;
	     goto match_rest;
	     
	   case MANY | ANY_DIGIT: 
	     /* needs finished */
	     return (NULL);
	     
	   case ANY:
	     if ((str >= estr) || (*str == '\n')) return (NULL);
	     str++;
	     break;
	     
	   case MAYBE_ONCE | ANY:
	     save_str = str;
	     if ((str < estr) && (*str != '\n')) str++;
	     goto match_rest;

	   case LEAST_ONCE | ANY:
	     if ((str >= estr) || (*str == '\n')) return (NULL);
	     str++;
	     /* drop */
	   case STAR | ANY:
	     save_str = str;
	     while ((str < estr) && (*str != '\n')) str++;
	     goto match_rest;
	     
	   case MANY | ANY: 
	     return (NULL);
	     /* needs finished */

	   case EOL: 
	     if ((str >= estr) || (*str == '\n')) return (str);
	     return(NULL);

	   default: return (NULL);
	  }
	p = *buf++;
	continue;
	
	match_rest:
	if (save_str == str)
	  {
	     p = *buf++;
	     continue;
	  }
	
	/* if (p == EOL)
	 * {
	 * if (str < estr) return (NULL); else return (str);
	 * } 
	 */
	
	save_next = Next_Pos;
	while (str >= save_str)
	  {
	     tmpstr = regexp_looking_at(str, estr, buf, cs);
	     if (tmpstr != NULL) return(tmpstr);
	     Next_Pos = save_next;
	     str--;
	  }
	return (NULL);
     }
   if ((p != 0) && (p != EOL)) return (NULL); else return (str);
}


unsigned char *SLang_regexp_match(unsigned char *str, 
				  unsigned int len, SLRegexp_Type *reg)
{
   register unsigned char c = 0, *estr = str + len;
   int cs = reg->case_sensitive, lit = 0;
   unsigned char *buf = reg->buf, *epos;

   if (reg->min_length > len) return NULL;
   
   Next_Pos = 1;
   This_Reg = reg;
   This_Str = str;

   if (*buf == BOL) 
     {
	if (NULL == (epos = regexp_looking_at(str, estr, buf + 1, cs))) str = NULL;
	reg->beg_matches[0] = (int) (str - This_Str);
	reg->end_matches[0] = (unsigned int) (epos - str);
	return(str);
     }

   if (*buf == NO_CASE)
     {
	buf++;  cs = 0;
     }
   
   if (*buf == YES_CASE)
     {
	buf++;  cs = 1;
     }
   
   if (*buf == LITERAL) 
     {
	lit = 1;
	c = *(buf + 1);
     }
   else if ((*buf == OPAREN) && (*(buf + 1) == LITERAL))
     {
	lit = 1;
	c = *(buf + 2);
     }
   
   while (str < estr)
     {
	/* take care of leading chars */
	if (lit)
	  {
	     while ((str < estr) && (c != UPPERCASE(*str))) str++;
	     if (str >= estr) return(NULL);
	  }
	Next_Pos = 1;
	if (NULL != (epos = regexp_looking_at(str, estr, buf, cs)))
	  {
	     reg->beg_matches[0] = (int) (str - This_Str);
	     reg->end_matches[0] = (unsigned int) (epos - str);
	     return (str);
	  }
	str++;
     }
   return (NULL);
}

static unsigned char *convert_digit(unsigned char *pat, int *nn)
{
   int n = 0, m = 0;
   unsigned char c;
   while (c = (unsigned char) *pat, (c <= '9') && (c >= '0'))
     {
	pat++;
	n = 10 * n + (c - '0');
	m++;
     }
   if (m == 0) 
     {
	return (NULL);
     }
   *nn = n;
   return pat;
}

#define ERROR  return (int) (pat - reg->pat)

/* Returns 0 if successful or offset in pattern of error */
int SLang_regexp_compile (SLRegexp_Type *reg)
{
   register unsigned char *buf, *ebuf, *pat;
   unsigned char *last = NULL, *tmppat;
   register unsigned char c;
   int i, reverse = 0, n, cs;
   int oparen = 0, dparen = 0, nparen = 0;
   /* substring stuff */
   int count, last_count, this_max_mm = 0, max_mm = 0, ordinary_search, 
     no_osearch = 0, min_length = 0;
   unsigned char *mm_p = NULL, *this_mm_p = NULL;
   static int already_initialized;
   
   reg->beg_matches[0] = reg->end_matches[0] = 0;
   buf = reg->buf;
   ebuf = (reg->buf + reg->buf_len) - 2; /* make some room */
   pat = reg->pat;
   cs = reg->case_sensitive;

   if (already_initialized == 0)
     {
	SLang_init_case_tables ();
#ifdef pc_system
	SLmake_lut (Word_Chars, (unsigned char *) "_0-9a-zA-Z\200-\232\240-\245\341-\353", 0);
#else
	SLmake_lut (Word_Chars, (unsigned char *) "_0-9a-zA-Z\277-\326\330-\336\340-\366\370-\376", 0);
#endif
	already_initialized = 1;
     }
   
   i = 1; while (i < 10) 
     {
	reg->beg_matches[i] = -1;
	reg->end_matches[i] = 0;
	i++;
     }

   if (*pat == '^')
     {
	pat++;
	*buf++ = BOL;
	reg->must_match_bol = 1;
     }
   else reg->must_match_bol = 0;

   *buf = 0;
   
   last_count = count = 0;
   while ((c = *pat++) != 0)
     {
	if (buf >= ebuf - 3)
	  {
	     SLang_doerror ("Pattern too large to be compiled.");
	     ERROR;
	  }
	
	count++;
	switch (c)
	  {
	   case '$': 
	     if (*pat != 0) goto literal_char;
	     *buf++ = EOL;
	     break;
	     
	   case '\\': 
	     c = *pat++;
	     no_osearch = 1;
	     switch(c)
	       {
		case 'e': c = 033; goto literal_char;
		case 'n': c = '\n'; goto literal_char;
		case 't': c = '\t'; goto literal_char;
		case 'C': cs = 0; *buf++ = NO_CASE; break;
		case 'c': cs = 1; *buf++ = YES_CASE; break;
		case '1': case '2': case '3':  case '4':  case '5':  
		case '6': case '7': case '8':  case '9': 
		  c = c - '0';
		  if ((int) c > nparen / 2) ERROR;
		  last = buf;
		  *buf++ = NTH_MATCH; *buf++ = c;
		  break;
#ifdef NOT_LITERAL
		case '~':	       /* slang extension */
		  if ((c = *pat) == 0) ERROR;
		  pat++;
		  last = buf;
		  *buf++ = NOT_LITERAL;
		  *buf++ = c;
		  min_length++;
		  break;
#endif
		case 'd':	       /* slang extension */
		  last = buf;
		  *buf++ = ANY_DIGIT;
		  min_length++;
		  break;
		  
		case '<':
		  last = NULL;
		  *buf++ = BOW;
		  break;
		  
		case '>':
		  last = NULL;
		  *buf++ = EOW;
		  break;
		  
		case '{':
		  if (last == NULL) goto literal_char;
		  *last |= MANY;
		  tmppat = convert_digit(pat, &n);
		  if (tmppat == NULL) ERROR;
		  pat = tmppat;
		  *buf++ = n;

		  min_length += (n - 1);
		  
		  if (*pat == '\\') 
		    {
		       *buf++ = n;
		    }
		  else if (*pat == ',')
		    {
		       pat++;
		       if (*pat == '\\')
			 {
			    n = 255;
			 }
		       else 
			 {
			    tmppat = convert_digit(pat, &n);
			    if (tmppat == NULL) ERROR;
			    pat = tmppat;
			    if (*pat != '\\') ERROR;
			 }
		       *buf++ = n;
		    }
		  else ERROR;
		  last = NULL;
		  pat++;
		  if (*pat != '}') ERROR;
		  pat++;
		  break;   /* case '{' */
		  
		case '(': 
		  oparen++;
		  if ((oparen > '9') || (dparen == 1)) ERROR;
		  dparen = 1;
		  nparen++;
		  *buf++ = OPAREN;
		  break;
		case ')':
		  if (dparen != 1) ERROR;
		  dparen = 0;
		  nparen++;
		  *buf++ = CPAREN;
		  break;

		case 0: ERROR;
		default:
		  goto literal_char;
	       }
	     break;

	   case '[':

	     *buf = RANGE;
	     last = buf++;
	     
	     if (buf + 32 >= ebuf) ERROR;

	     for (i = 0; i < 32; i++) buf[i] = 0;
	     c = *pat++;
	     if (c == '^') 
	       {
		  reverse = 1;
		  SET_BIT(buf, '\n');
		  c = *pat++;
	       }
	     
	     if (c == ']')
	       {
		  SET_BIT(buf, c);
		  c = *pat++;
	       }
	     while (c && (c != ']'))
	       {
		  if (c == '\\')
		    {
		       c = *pat++;
		       switch(c)
			 {
			    case 'n': c = '\n'; break;
			    case 't': c = '\t'; break;
			    case 0: ERROR;
			 }
		    }

		  if (*pat == '-')
		    {
		       pat++;
		       while (c < *pat)
			 {
			    if (cs == 0)
			      {
				 SET_BIT(buf, UPPERCASE(c));
				 SET_BIT(buf, LOWERCASE(c));
			      }
			    else SET_BIT(buf, c);
			    c++;
			 }
		    }
		  if (cs == 0)
		    {
		       SET_BIT(buf, UPPERCASE(c));
		       SET_BIT(buf, LOWERCASE(c));
		    }
		  else SET_BIT(buf, c);
		  c = *pat++;
	       }
	     if (c != ']') ERROR;
	     if (reverse) for (i = 0; i < 32; i++) buf[i] = buf[i] ^ 0xFF;
	     reverse = 0;
	     buf += 32;
	     min_length++;
	     break;
	     
	   case '.':
	     last = buf;
	     *buf++ = ANY;
	     min_length++;
	     break;
	     
	   case '*': 
	     if (last == NULL) goto literal_char;
	     *last |= STAR;
	     min_length--;
	     last = NULL;
	     break;
	
	   case '+': 
	     if (last == NULL) goto literal_char;
	     *last |= LEAST_ONCE;
	     last = NULL;
	     break;

	   case '?':
	     if (last == NULL) goto literal_char;
	     *last |= MAYBE_ONCE;
	     last = NULL;
	     min_length--;
	     break;

	   literal_char:
	   default:
	     /* This is to keep track of longest substring */
	     min_length++;
	     this_max_mm++;
	     if (last_count + 1 == count)
	       {
		  if (this_max_mm == 1)
		    {
		       this_mm_p = buf;
		    }
		  else if (max_mm < this_max_mm)
		    {
		       mm_p = this_mm_p;
		       max_mm = this_max_mm;
		    }
	       }
	     else 
	       {
		  this_mm_p = buf;
		  this_max_mm = 1;
	       }
	     
	     last_count = count;
		       
	     last = buf;
	     *buf++ = LITERAL;
	     *buf++ = UPPERCASE(c);
	  }
     }
   *buf = 0;
   /* Check for ordinary search */
   ebuf = buf;
   buf = reg->buf;
   
   if (no_osearch) ordinary_search = 0;
   else
     {
	ordinary_search = 1;
	while (buf < ebuf)
	  {
	     if (*buf != LITERAL) 
	       {
		  ordinary_search = 0;
		  break;
	       }
	     buf += 2;
	  }
     }
   
   reg->osearch = ordinary_search;
   reg->must_match_str[15] = 0;
   reg->min_length = (min_length > 0) ? (unsigned int) min_length : 0;
   if (ordinary_search)
     {
	strncpy((char *) reg->must_match_str, (char *) reg->pat, 15);
	reg->must_match = 1;
	return(0);
     }
   /* check for longest substring of pattern */
   reg->must_match = 0;
   if ((mm_p == NULL) && (this_mm_p != NULL)) mm_p = this_mm_p;
   if (mm_p == NULL)
     {
	return (0);
     }
   n = 15;
   pat = reg->must_match_str;
   buf = mm_p;
   while (n--)
     {
	if (*buf++ != LITERAL) break;
	*pat++ = *buf++;
     }
   *pat = 0;
   if (pat != reg->must_match_str) reg->must_match = 1;
   return(0);
}

char *SLregexp_quote_string (char *re, char *buf, unsigned int buflen)
{
   char ch;
   char *b, *bmax;
   
   if (re == NULL) return NULL;
   
   b = buf;
   bmax = buf + buflen;
   
   while (b < bmax)
     {
	switch (ch = *re++)
	  {
	   case 0:
	     *b = 0;
	     return buf;
	     
	   case '$':
	   case '\\':
	   case '[':
	   case ']':
	   case '.':
	   case '^':
	   case '*':
	   case '+':
	   case '?':
	     *b++ = '\\';
	    if (b == bmax) break;
	     /* drop */
	     
	   default:
	     *b++ = ch;
	  }
     }
   return NULL;
}

/*
#define MAX_EXP 4096
int main(int argc, char **argv)
{
   FILE *fp;
   char *regexp, *file;
   char expbuf[MAX_EXP], buf[512];
   SLRegexp_Type reg;
   
   file = argv[2];
   regexp = argv[1];
   
   if (NULL == (fp = fopen(file, "r")))
     {
	fprintf(stderr, "File not open\n");
	return(1);
     }
   
   reg.buf = expbuf;
   reg.buf_len = MAX_EXP;
   reg.pat = regexp;
   reg.case_sensitive = 1;

   if (!regexp_compile(&reg)) while (NULL != fgets(buf, 511, fp))
     {
	if (reg.osearch)
	  {
	     if (NULL == strstr(buf, reg.pat)) continue;
	  }
	else 
	  {
	     if (reg.must_match && (NULL == strstr(buf, reg.must_match_str))) continue;
	     if (0 == regexp_match(buf, buf + strlen(buf), &reg)) continue;
	  }
	
	fputs(buf, stdout);
     }
   return (0);
}
*/

These are the contents of the former NiCE NeXT User Group NeXTSTEP/OpenStep software archive, currently hosted by Netfuture.ch.