Example #1
0
int
main (int argc, char **argv)
{
  struct dfa *dfa;
  char *beg, *end, *p;
  int allow_nl;

  set_program_name (argv[0]);
  if (argc < 3)
    exit (EXIT_FAILURE);

  setlocale (LC_ALL, "");

  dfasyntax (RE_SYNTAX_GREP | RE_NO_EMPTY_RANGES, 0, '\n');
  dfa = dfaalloc ();
  dfacomp (argv[1], strlen (argv[1]), dfa, 0);

  beg = argv[2];
  end = argv[2] + strlen (argv[2]);
  allow_nl = argc > 3 && atoi (argv[3]);

  p = dfaexec (dfa, beg, end, allow_nl, NULL, NULL);

  if (p != NULL)
    printf ("%zd\n", p - beg);

  exit (EXIT_SUCCESS);
}
Example #2
0
void
GEAcompile (char const *pattern, size_t size, reg_syntax_t syntax_bits)
{
  const char *err;
  const char *p, *sep;
  size_t total = size;
  char *motif;

  if (match_icase)
    syntax_bits |= RE_ICASE;
  re_set_syntax (syntax_bits);
  dfasyntax (syntax_bits, match_icase, eolbyte);

  /* For GNU regex compiler we have to pass the patterns separately to detect
     errors like "[\nallo\n]\n".  The patterns here are "[", "allo" and "]"
     GNU regex should have raise a syntax error.  The same for backref, where
     the backref should have been local to each pattern.  */
  p = pattern;
  do
    {
      size_t len;
      sep = memchr (p, '\n', total);
      if (sep)
        {
          len = sep - p;
          sep++;
          total -= (len + 1);
        }
      else
        {
          len = total;
          total = 0;
        }

      patterns = xnrealloc (patterns, pcount + 1, sizeof *patterns);
      patterns[pcount] = patterns0;

      if ((err = re_compile_pattern (p, len,
                                    &(patterns[pcount].regexbuf))) != NULL)
        error (EXIT_TROUBLE, 0, "%s", err);
      pcount++;

      p = sep;
    } while (sep && total != 0);

  /* In the match_words and match_lines cases, we use a different pattern
     for the DFA matcher that will quickly throw out cases that won't work.
     Then if DFA succeeds we do some hairy stuff using the regex matcher
     to decide whether the match should really count. */
  if (match_words || match_lines)
    {
      static char const line_beg_no_bk[] = "^(";
      static char const line_end_no_bk[] = ")$";
      static char const word_beg_no_bk[] = "(^|[^[:alnum:]_])(";
      static char const word_end_no_bk[] = ")([^[:alnum:]_]|$)";
      static char const line_beg_bk[] = "^\\(";
      static char const line_end_bk[] = "\\)$";
      static char const word_beg_bk[] = "\\(^\\|[^[:alnum:]_]\\)\\(";
      static char const word_end_bk[] = "\\)\\([^[:alnum:]_]\\|$\\)";
      int bk = !(syntax_bits & RE_NO_BK_PARENS);
      char *n = xmalloc (sizeof word_beg_bk - 1 + size + sizeof word_end_bk);

      strcpy (n, match_lines ? (bk ? line_beg_bk : line_beg_no_bk)
                             : (bk ? word_beg_bk : word_beg_no_bk));
      total = strlen(n);
      memcpy (n + total, pattern, size);
      total += size;
      strcpy (n + total, match_lines ? (bk ? line_end_bk : line_end_no_bk)
                                     : (bk ? word_end_bk : word_end_no_bk));
      total += strlen (n + total);
      pattern = motif = n;
      size = total;
    }
  else
    motif = NULL;

  dfa = dfaalloc ();
  dfacomp (pattern, size, dfa, 1);
  kwsmusts ();

  free(motif);
}
Example #3
0
Regexp *
make_regexp(char *s, size_t len, int ignorecase, int dfa)
{
	Regexp *rp;
	const char *rerr;
	char *src = s;
	char *temp;
	char *end = s + len;
	register char *dest;
	register int c, c2;

	/* Handle escaped characters first. */

	/*
	 * Build a copy of the string (in dest) with the
	 * escaped characters translated, and generate the regex
	 * from that.  
	 */
	emalloc(dest, char *, len + 2, "make_regexp");
	temp = dest;

	while (src < end) {
		if (*src == '\\') {
			c = *++src;
			switch (c) {
			case 'a':
			case 'b':
			case 'f':
			case 'n':
			case 'r':
			case 't':
			case 'v':
			case 'x':
			case '0':
			case '1':
			case '2':
			case '3':
			case '4':
			case '5':
			case '6':
			case '7':
				c2 = parse_escape(&src);
				if (c2 < 0)
					cant_happen();
				/*
				 * Unix awk treats octal (and hex?) chars
				 * literally in re's, so escape regexp
				 * metacharacters.
				 */
				if (do_traditional && ! do_posix && (ISDIGIT(c) || c == 'x')
				    && strchr("()|*+?.^$\\[]", c2) != NULL)
					*dest++ = '\\';
				*dest++ = (char) c2;
				break;
			case '8':
			case '9':	/* a\9b not valid */
				*dest++ = c;
				src++;
				break;
			case 'y':	/* normally \b */
				/* gnu regex op */
				if (! do_traditional) {
					*dest++ = '\\';
					*dest++ = 'b';
					src++;
					break;
				}
				/* else, fall through */
			default:
				*dest++ = '\\';
				*dest++ = (char) c;
				src++;
				break;
			} /* switch */
		} else
			*dest++ = *src++;	/* not '\\' */
	} /* for */

	*dest = '\0' ;	/* Only necessary if we print dest ? */
	emalloc(rp, Regexp *, sizeof(*rp), "make_regexp");
	memset((char *) rp, 0, sizeof(*rp));
	rp->pat.allocated = 0;	/* regex will allocate the buffer */
	emalloc(rp->pat.fastmap, char *, 256, "make_regexp");

	if (ignorecase)
		rp->pat.translate = casetable;
	else
		rp->pat.translate = NULL;
	len = dest - temp;
	if ((rerr = re_compile_pattern(temp, len, &(rp->pat))) != NULL)
		fatal("%s: /%s/", gettext(rerr), temp);

	/* gack. this must be done *after* re_compile_pattern */
	rp->pat.newline_anchor = FALSE; /* don't get \n in middle of string */
	if (dfa && ! ignorecase) {
		dfacomp(temp, len, &(rp->dfareg), TRUE);
		rp->dfa = TRUE;
	} else
		rp->dfa = FALSE;

	free(temp);
	return rp;
}
Example #4
0
static void *
compile (const char *pattern, size_t pattern_size,
	 bool match_icase, bool match_words, bool match_lines, char eolbyte,
	 reg_syntax_t syntax)
{
  struct compiled_regex *cregex;
  const char *err;
  const char *sep;
  size_t total = pattern_size;
  const char *motif = pattern;

  cregex = (struct compiled_regex *) xmalloc (sizeof (struct compiled_regex));
  memset (cregex, '\0', sizeof (struct compiled_regex));
  cregex->match_words = match_words;
  cregex->match_lines = match_lines;
  cregex->eolbyte = eolbyte;
  cregex->patterns = NULL;
  cregex->pcount = 0;

  re_set_syntax (syntax);
  dfasyntax (syntax, match_icase, eolbyte);

  /* For GNU regex compiler we have to pass the patterns separately to detect
     errors like "[\nallo\n]\n".  The patterns here are "[", "allo" and "]"
     GNU regex should have raise a syntax error.  The same for backref, where
     the backref should have been local to each pattern.  */
  do
    {
      size_t len;
      sep = memchr (motif, '\n', total);
      if (sep)
	{
	  len = sep - motif;
	  sep++;
	  total -= (len + 1);
	}
      else
	{
	  len = total;
	  total = 0;
	}

      cregex->patterns = xrealloc (cregex->patterns, (cregex->pcount + 1) * sizeof (struct patterns));
      memset (&cregex->patterns[cregex->pcount], '\0', sizeof (struct patterns));

      if ((err = re_compile_pattern (motif, len,
				     &(cregex->patterns[cregex->pcount].regexbuf))) != NULL)
	error (exit_failure, 0, err);
      cregex->pcount++;

      motif = sep;
    } while (sep && total != 0);

  /* In the match_words and match_lines cases, we use a different pattern
     for the DFA matcher that will quickly throw out cases that won't work.
     Then if DFA succeeds we do some hairy stuff using the regex matcher
     to decide whether the match should really count. */
  if (match_words || match_lines)
    {
      /* In the whole-word case, we use the pattern:
	 (^|[^[:alnum:]_])(userpattern)([^[:alnum:]_]|$).
	 In the whole-line case, we use the pattern:
	 ^(userpattern)$.  */

      static const char line_beg[] = "^(";
      static const char line_end[] = ")$";
      static const char word_beg[] = "(^|[^[:alnum:]_])(";
      static const char word_end[] = ")([^[:alnum:]_]|$)";
      char *n = (char *) xmalloc (sizeof word_beg - 1 + pattern_size + sizeof word_end);
      size_t i;
      strcpy (n, match_lines ? line_beg : word_beg);
      i = strlen(n);
      memcpy (n + i, pattern, pattern_size);
      i += pattern_size;
      strcpy (n + i, match_lines ? line_end : word_end);
      i += strlen (n + i);
      pattern = n;
      pattern_size = i;
    }

  dfacomp (pattern, pattern_size, &cregex->dfa, 1);
  kwsmusts (cregex, match_icase, match_words, match_lines, eolbyte);

  return cregex;
}
Example #5
0
File: regexp.c Project: agordon/sed
static void
compile_regex_1 (struct regex *new_regex, int needed_sub)
{
  const char *error;
  int syntax = ((extended_regexp_flags & REG_EXTENDED)
                 ? RE_SYNTAX_POSIX_EXTENDED
                 : RE_SYNTAX_POSIX_BASIC);

  syntax &= ~RE_DOT_NOT_NULL;
  syntax |= RE_NO_POSIX_BACKTRACKING;

  switch (posixicity)
    {
    case POSIXLY_EXTENDED:
      syntax &= ~RE_UNMATCHED_RIGHT_PAREN_ORD;
      break;
    case POSIXLY_CORRECT:
      syntax |= RE_UNMATCHED_RIGHT_PAREN_ORD;
      break;
    case POSIXLY_BASIC:
      syntax |= RE_UNMATCHED_RIGHT_PAREN_ORD | RE_NO_GNU_OPS;
      if (!(extended_regexp_flags & REG_EXTENDED))
        syntax |= RE_LIMITED_OPS;
      break;
    }

  if (new_regex->flags & REG_ICASE)
    syntax |= RE_ICASE;
  else
    new_regex->pattern.fastmap = malloc (1 << (sizeof (char) * 8));
  syntax |= needed_sub ? 0 : RE_NO_SUB;

  /* If REG_NEWLINE is set, newlines are treated differently.  */
  if (new_regex->flags & REG_NEWLINE)
    {
      /* REG_NEWLINE implies neither . nor [^...] match newline.  */
      syntax &= ~RE_DOT_NEWLINE;
      syntax |= RE_HAT_LISTS_NOT_NEWLINE;
    }

  re_set_syntax (syntax);
  error = re_compile_pattern (new_regex->re, new_regex->sz,
                              &new_regex->pattern);
  new_regex->pattern.newline_anchor =
    buffer_delimiter == '\n' && (new_regex->flags & REG_NEWLINE) != 0;

  new_regex->pattern.translate = NULL;
#ifndef RE_ICASE
  if (new_regex->flags & REG_ICASE)
    {
      static char translate[1 << (sizeof (char) * 8)];
      int i;
      for (i = 0; i < sizeof (translate) / sizeof (char); i++)
        translate[i] = tolower (i);

      new_regex->pattern.translate = translate;
    }
#endif

  if (error)
    bad_prog (error);

  /* Just to be sure, I mark this as not POSIXLY_CORRECT behavior */
  if (needed_sub
      && new_regex->pattern.re_nsub < needed_sub - 1
      && posixicity == POSIXLY_EXTENDED)
    {
      char buf[200];
      sprintf (buf, _("invalid reference \\%d on `s' command's RHS"),
              needed_sub - 1);
      bad_prog (buf);
    }

  int dfaopts = buffer_delimiter == '\n' ? 0 : DFA_EOL_NUL;
  new_regex->dfa = dfaalloc ();
  dfasyntax (new_regex->dfa, &localeinfo, syntax, dfaopts);
  dfacomp (new_regex->re, new_regex->sz, new_regex->dfa, 1);

  /* The patterns which consist of only ^ or $ often appear in
     substitution, but regex and dfa are not good at them, as regex does
     not build fastmap, and as all in buffer must be scanned for $.  So
     we mark them to handle manually.  */
  if (new_regex->sz == 1)
    {
      if (new_regex->re[0] == '^')
        new_regex->begline = true;
      if (new_regex->re[0] == '$')
        new_regex->endline = true;
    }
}
Example #6
0
File: re.c Project: WndSks/msys
Regexp *
make_regexp(const char *s, size_t len, int ignorecase, int dfa)
{
	Regexp *rp;
	const char *rerr;
	const char *src = s;
	char *temp;
	const char *end = s + len;
	register char *dest;
	register int c, c2;
	static short first = TRUE;
	static short no_dfa = FALSE;
	int has_anchor = FALSE;

	/* The number of bytes in the current multibyte character.
	   It is 0, when the current character is a singlebyte character.  */
	size_t is_multibyte = 0;
#ifdef MBS_SUPPORT
	mbstate_t mbs;

	if (gawk_mb_cur_max > 1)
		memset(&mbs, 0, sizeof(mbstate_t)); /* Initialize.  */
#endif

	if (first) {
		first = FALSE;
		no_dfa = (getenv("GAWK_NO_DFA") != NULL);	/* for debugging and testing */
	}

	/* Handle escaped characters first. */

	/*
	 * Build a copy of the string (in dest) with the
	 * escaped characters translated, and generate the regex
	 * from that.  
	 */
	emalloc(dest, char *, len + 2, "make_regexp");
	temp = dest;

	while (src < end) {
#ifdef MBS_SUPPORT
		if (gawk_mb_cur_max > 1 && ! is_multibyte) {
			/* The previous byte is a singlebyte character, or last byte
			   of a multibyte character.  We check the next character.  */
			is_multibyte = mbrlen(src, end - src, &mbs);
			if ((is_multibyte == 1) || (is_multibyte == (size_t) -1)
				|| (is_multibyte == (size_t) -2 || (is_multibyte == 0))) {
				/* We treat it as a singlebyte character.  */
				is_multibyte = 0;
			}
		}
#endif

		/* We skip multibyte character, since it must not be a special
		   character.  */
		if ((gawk_mb_cur_max == 1 || ! is_multibyte) &&
		    (*src == '\\')) {
			c = *++src;
			switch (c) {
			case 'a':
			case 'b':
			case 'f':
			case 'n':
			case 'r':
			case 't':
			case 'v':
			case 'x':
			case '0':
			case '1':
			case '2':
			case '3':
			case '4':
			case '5':
			case '6':
			case '7':
				c2 = parse_escape(&src);
				if (c2 < 0)
					cant_happen();
				/*
				 * Unix awk treats octal (and hex?) chars
				 * literally in re's, so escape regexp
				 * metacharacters.
				 */
				if (do_traditional && ! do_posix && (ISDIGIT(c) || c == 'x')
				    && strchr("()|*+?.^$\\[]", c2) != NULL)
					*dest++ = '\\';
				*dest++ = (char) c2;
				break;
			case '8':
			case '9':	/* a\9b not valid */
				*dest++ = c;
				src++;
				break;
			case 'y':	/* normally \b */
				/* gnu regex op */
				if (! do_traditional) {
					*dest++ = '\\';
					*dest++ = 'b';
					src++;
					break;
				}
				/* else, fall through */
			default:
				*dest++ = '\\';
				*dest++ = (char) c;
				src++;
				break;
			} /* switch */
		} else {
			c = *src;
			if (c == '^' || c == '$')
				has_anchor = TRUE;
			*dest++ = *src++;	/* not '\\' */
		}
		if (gawk_mb_cur_max > 1 && is_multibyte)
			is_multibyte--;
	} /* while */

	*dest = '\0' ;	/* Only necessary if we print dest ? */
	emalloc(rp, Regexp *, sizeof(*rp), "make_regexp");
	memset((char *) rp, 0, sizeof(*rp));
	rp->pat.allocated = 0;	/* regex will allocate the buffer */
	emalloc(rp->pat.fastmap, char *, 256, "make_regexp");

	/*
	 * Lo these many years ago, had I known what a P.I.T.A. IGNORECASE
	 * was going to turn out to be, I wouldn't have bothered with it.
	 *
	 * In the case where we have a multibyte character set, we have no
	 * choice but to use RE_ICASE, since the casetable is for single-byte
	 * character sets only.
	 *
	 * On the other hand, if we do have a single-byte character set,
	 * using the casetable should give  a performance improvement, since
	 * it's computed only once, not each time a regex is compiled.  We
	 * also think it's probably better for portability.  See the
	 * discussion by the definition of casetable[] in eval.c.
	 */

	if (ignorecase) {
		if (gawk_mb_cur_max > 1) {
			syn |= RE_ICASE;
			rp->pat.translate = NULL;
		} else {
			syn &= ~RE_ICASE;
			rp->pat.translate = (char *) casetable;
		}
	} else {
		rp->pat.translate = NULL;
		syn &= ~RE_ICASE;
	}

	dfasyntax(syn | (ignorecase ? RE_ICASE : 0), ignorecase ? TRUE : FALSE, '\n');
	re_set_syntax(syn);

	len = dest - temp;
	if ((rerr = re_compile_pattern(temp, len, &(rp->pat))) != NULL)
		fatal("%s: /%s/", rerr, temp);	/* rerr already gettextized inside regex routines */

	/* gack. this must be done *after* re_compile_pattern */
	rp->pat.newline_anchor = FALSE; /* don't get \n in middle of string */
	if (dfa && ! no_dfa) {
		dfacomp(temp, len, &(rp->dfareg), TRUE);
		rp->dfa = TRUE;
	} else
		rp->dfa = FALSE;
	rp->has_anchor = has_anchor;

	free(temp);
	return rp;
}
Example #7
0
int compile_fast_regex( fast_regex_t * fre_t, const char * sre, size_t len )
{
	struct _fregex * fre = (struct _fregex *)( fre_t->data ) ;

	int errval ;
	char * errstr ;
	int sublen ;
	char * substr ;
	
	errval = 0 ;

		/*	fprintf( stderr, "compile_fast_regex( %p[%p] , %s )\n", fre_t, fre->kwset, sre ) ; */

	if( fast_regex_subsys_error == NULL )
		fast_regex_subsys_error = fprintf_error ;
	
	if( fre_t->options & FRE_STRING_SEARCH )
	{
		if( fre_t->options & FRE_NO_KWSET )
		{
			return -1 ;
		}
			
		/*
		 * straight string match
		 * 'sre' represents a series of newline separated strings to search for
		 */
		while( sre )
		{
			substr = strchr( sre, '\n' ) ;
			if( substr == NULL )
				sublen = len ;
			else
				sublen = (substr - sre) ;

			errstr = kwsincr( fre->kwset, sre, sublen );
			if( errstr )
			{
				errval = (*fast_regex_subsys_error)( "kwset" , errstr ) ;
				if( errval )
					return errval ;
			}
			else
				fre->num_exact_kws ++ ;
		
			len -= (sublen+1) ;
			sre = (substr) ? (substr + 1) : NULL ;
		}
		errstr = kwsprep( fre->kwset );
		if( errstr )
		{
			errval = (*fast_regex_subsys_error)( "kwset", errstr ) ;
			if( errval )
				return errval ;
		}
		return 0 ;
	}
	
	if( HAS_DFA(fre_t->options) || HAS_KWSET(fre_t->options) )
	{
		dfasyntax( RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE, ( fre_t->options & FRE_CASE_INSENSITIVE ) ) ;
		dfacomp( sre, len, &(fre->dfa), 1 ) ;

		if( HAS_KWSET(fre_t->options) )
		{
			errval = build_kws_from_dfa( fre ) ;
			if( fre_t->options & FRE_NO_DFA )
			{  /* We used the DFA only to get the keywords */
				dfafree( &(fre->dfa) );
			}
		}
		else
		{
			fre->kwset = 0 ;
		}
	}

	if( errval )
		return errval ;

	if( HAS_REGEX(fre_t->options) )
	{
		re_set_syntax(RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE);
		errstr = re_compile_pattern( sre, len, &(fre->regex) ) ;
		if( errstr )
		{
			errval = (*fast_regex_subsys_error)( "re", errstr ) ;
			if( errval )
				return errval ;
		}
	}
	
	return errval ;
}