예제 #1
0
파일: re.c 프로젝트: OS2World/DEV-UTIL-gawk
int
research(Regexp *rp, register char *str, int start,
	register size_t len, int need_start)
{
	char *ret = str;
	int try_backref;

	/*
	 * Always do dfa search if can; if it fails, then even if
	 * need_start is true, we won't bother with the regex search.
	 */
	if (rp->dfa) {
		char save;
		int count = 0;

		/*
		 * dfa likes to stick a '\n' right after the matched
		 * text.  So we just save and restore the character.
		 */
		save = str[start+len];
		ret = dfaexec(&(rp->dfareg), str+start, str+start+len, TRUE,
					&count, &try_backref);
		str[start+len] = save;
	}
	if (ret) {
		if (need_start || rp->dfa == FALSE || try_backref) {
			int res = re_search(&(rp->pat), str, start+len,
					start, len, &(rp->regs));
			return res;
		} else
			return 1;
	} else
		return -1;
}
예제 #2
0
파일: dfa-match-aux.c 프로젝트: aixoss/grep
int
main (int argc, char **argv)
{
  struct dfa *dfa;
  char *beg, *end, *p;
  int allow_nl;

  set_program_name (argv[0]);
  if (argc < 3)
    exit (EXIT_FAILURE);

  setlocale (LC_ALL, "");

  dfasyntax (RE_SYNTAX_GREP | RE_NO_EMPTY_RANGES, 0, '\n');
  dfa = dfaalloc ();
  dfacomp (argv[1], strlen (argv[1]), dfa, 0);

  beg = argv[2];
  end = argv[2] + strlen (argv[2]);
  allow_nl = argc > 3 && atoi (argv[3]);

  p = dfaexec (dfa, beg, end, allow_nl, NULL, NULL);

  if (p != NULL)
    printf ("%zd\n", p - beg);

  exit (EXIT_SUCCESS);
}
예제 #3
0
int evaluate_fast_regex( struct fast_regex * fre_t, char * str, size_t len )
{
	char * sub ;
	struct _fregex * fre = (struct _fregex *)( fre_t->data ) ;

	if( fre->kwset )
	{
		struct kwsmatch kwsm ;
		sub = kwsexec( fre->kwset, (char *)str, len, &kwsm) ;
		if( sub == NULL )
			return 0 ;
		if( kwsm.index < fre->num_exact_kws )
		{
			return 1 ;
		}
	}

	if( HAS_DFA(fre_t->options) )
	{
		int backref = 0 ;
		sub = dfaexec( &(fre->dfa), str, (str+len), 0, NULL, &backref) ;
		if( sub == NULL )
			return 0 ;
		if ( !backref || (fre_t->options & FRE_NO_REGEX) )
			return 1 ;
	}

	return re_match( &fre->regex , str, len, 0, NULL ) > 0 ;
}
예제 #4
0
파일: re.c 프로젝트: WndSks/msys
int
research(Regexp *rp, register char *str, int start,
	register size_t len, int flags)
{
	const char *ret = str;
	int try_backref;
	int need_start;
	int no_bol;
	int res;

	need_start = ((flags & RE_NEED_START) != 0);
	no_bol = ((flags & RE_NO_BOL) != 0);

	if (no_bol)
		rp->pat.not_bol = 1;

	/*
	 * Always do dfa search if can; if it fails, then even if
	 * need_start is true, we won't bother with the regex search.
	 *
	 * The dfa matcher doesn't have a no_bol flag, so don't bother
	 * trying it in that case.
	 */
	if (rp->dfa && ! no_bol) {
		char save;
		int count = 0;
 		/*
		 * dfa likes to stick a '\n' right after the matched
		 * text.  So we just save and restore the character.
		 */
		save = str[start+len];
		ret = dfaexec(&(rp->dfareg), str+start, str+start+len, TRUE,
					&count, &try_backref);
		str[start+len] = save;
	}

	if (ret) {
		if (need_start || rp->dfa == FALSE || try_backref) {
			/*
			 * Passing NULL as last arg speeds up search for cases
			 * where we don't need the start/end info.
			 */
			res = re_search(&(rp->pat), str, start+len,
				start, len, need_start ? &(rp->regs) : NULL);
		} else
			res = 1;
	} else
		res = -1;

	rp->pat.not_bol = 0;
	return res;
}
예제 #5
0
size_t
EGexecute (char const *buf, size_t size, size_t *match_size,
           char const *start_ptr)
{
  char const *buflim, *beg, *end, *match, *best_match, *mb_start;
  char eol = eolbyte;
  int backref;
  regoff_t start;
  size_t len, best_len;
  struct kwsmatch kwsm;
  size_t i, ret_val;
  mb_len_map_t *map = NULL;

  if (MB_CUR_MAX > 1)
    {
      if (match_icase)
        {
          /* mbtolower adds a NUL byte at the end.  That will provide
             space for the sentinel byte dfaexec may add.  */
          char *case_buf = mbtolower (buf, &size, &map);
          if (start_ptr)
            start_ptr = case_buf + (start_ptr - buf);
          buf = case_buf;
        }
    }

  mb_start = buf;
  buflim = buf + size;

  for (beg = end = buf; end < buflim; beg = end)
    {
      if (!start_ptr)
        {
          /* We don't care about an exact match.  */
          if (kwset)
            {
              /* Find a possible match using the KWset matcher. */
              size_t offset = kwsexec (kwset, beg, buflim - beg, &kwsm);
              if (offset == (size_t) -1)
                goto failure;
              beg += offset;
              /* Narrow down to the line containing the candidate, and
                 run it through DFA. */
              if ((end = memchr(beg, eol, buflim - beg)) != NULL)
                end++;
              else
                end = buflim;
              match = beg;
              while (beg > buf && beg[-1] != eol)
                --beg;
              if (kwsm.index < kwset_exact_matches)
                {
                  if (!MBS_SUPPORT)
                    goto success;

                  if (mb_start < beg)
                    mb_start = beg;
                  if (MB_CUR_MAX == 1
                      || !is_mb_middle (&mb_start, match, buflim,
                                        kwsm.size[0]))
                    goto success;
                }
              if (dfaexec (dfa, beg, (char *) end, 0, NULL, &backref) == NULL)
                continue;
            }
          else
            {
              /* No good fixed strings; start with DFA. */
              char const *next_beg = dfaexec (dfa, beg, (char *) buflim,
                                              0, NULL, &backref);
              /* If there's no match, or if we've matched the sentinel,
                 we're done.  */
              if (next_beg == NULL || next_beg == buflim)
                break;
              /* Narrow down to the line we've found. */
              beg = next_beg;
              if ((end = memchr(beg, eol, buflim - beg)) != NULL)
                end++;
              else
                end = buflim;
              while (beg > buf && beg[-1] != eol)
                --beg;
            }
          /* Successful, no backreferences encountered! */
          if (!backref)
            goto success;
        }
      else
        {
          /* We are looking for the leftmost (then longest) exact match.
             We will go through the outer loop only once.  */
          beg = start_ptr;
          end = buflim;
        }

      /* If the "line" is longer than the maximum regexp offset,
         die as if we've run out of memory.  */
      if (TYPE_MAXIMUM (regoff_t) < end - buf - 1)
        xalloc_die ();

      /* If we've made it to this point, this means DFA has seen
         a probable match, and we need to run it through Regex. */
      best_match = end;
      best_len = 0;
      for (i = 0; i < pcount; i++)
        {
          patterns[i].regexbuf.not_eol = 0;
          start = re_search (&(patterns[i].regexbuf),
                             buf, end - buf - 1,
                             beg - buf, end - beg - 1,
                             &(patterns[i].regs));
          if (start < -1)
            xalloc_die ();
          else if (0 <= start)
            {
              len = patterns[i].regs.end[0] - start;
              match = buf + start;
              if (match > best_match)
                continue;
              if (start_ptr && !match_words)
                goto assess_pattern_match;
              if ((!match_lines && !match_words)
                  || (match_lines && len == end - beg - 1))
                {
                  match = beg;
                  len = end - beg;
                  goto assess_pattern_match;
                }
              /* If -w, check if the match aligns with word boundaries.
                 We do this iteratively because:
                 (a) the line may contain more than one occurrence of the
                 pattern, and
                 (b) Several alternatives in the pattern might be valid at a
                 given point, and we may need to consider a shorter one to
                 find a word boundary.  */
              if (match_words)
                while (match <= best_match)
                  {
                    regoff_t shorter_len = 0;
                    if ((match == buf || !WCHAR ((unsigned char) match[-1]))
                        && (start + len == end - buf - 1
                            || !WCHAR ((unsigned char) match[len])))
                      goto assess_pattern_match;
                    if (len > 0)
                      {
                        /* Try a shorter length anchored at the same place. */
                        --len;
                        patterns[i].regexbuf.not_eol = 1;
                        shorter_len = re_match (&(patterns[i].regexbuf),
                                                buf, match + len - beg,
                                                match - buf,
                                                &(patterns[i].regs));
                        if (shorter_len < -1)
                          xalloc_die ();
                      }
                    if (0 < shorter_len)
                      len = shorter_len;
                    else
                      {
                        /* Try looking further on. */
                        if (match == end - 1)
                          break;
                        match++;
                        patterns[i].regexbuf.not_eol = 0;
                        start = re_search (&(patterns[i].regexbuf),
                                           buf, end - buf - 1,
                                           match - buf, end - match - 1,
                                           &(patterns[i].regs));
                        if (start < 0)
                          {
                            if (start < -1)
                              xalloc_die ();
                            break;
                          }
                        len = patterns[i].regs.end[0] - start;
                        match = buf + start;
                      }
                  } /* while (match <= best_match) */
              continue;
            assess_pattern_match:
              if (!start_ptr)
                {
                  /* Good enough for a non-exact match.
                     No need to look at further patterns, if any.  */
                  goto success;
                }
              if (match < best_match || (match == best_match && len > best_len))
                {
                  /* Best exact match:  leftmost, then longest.  */
                  best_match = match;
                  best_len = len;
                }
            } /* if re_search >= 0 */
        } /* for Regex patterns.  */
        if (best_match < end)
          {
            /* We have found an exact match.  We were just
               waiting for the best one (leftmost then longest).  */
            beg = best_match;
            len = best_len;
            goto success_in_len;
          }
    } /* for (beg = end ..) */

 failure:
  ret_val = -1;
  goto out;

 success:
  len = end - beg;
 success_in_len:;
  size_t off = beg - buf;
  mb_case_map_apply (map, &off, &len);
  *match_size = len;
  ret_val = off;
 out:
  return ret_val;
}
예제 #6
0
static size_t
EGexecute (const void *compiled_pattern,
	   const char *buf, size_t buf_size,
	   size_t *match_size, bool exact)
{
  struct compiled_regex *cregex = (struct compiled_regex *) compiled_pattern;
  register const char *buflim, *beg, *end;
  char eol = cregex->eolbyte;
  int backref, start, len;
  struct kwsmatch kwsm;
  size_t i;
#ifdef MBS_SUPPORT
  char *mb_properties = NULL;
#endif /* MBS_SUPPORT */

#ifdef MBS_SUPPORT
  if (MB_CUR_MAX > 1 && cregex->ckwset.kwset)
    mb_properties = check_multibyte_string (buf, buf_size);
#endif /* MBS_SUPPORT */

  buflim = buf + buf_size;

  for (beg = end = buf; end < buflim; beg = end)
    {
      if (!exact)
	{
	  if (cregex->ckwset.kwset)
	    {
	      /* Find a possible match using the KWset matcher. */
	      size_t offset = kwsexec (cregex->ckwset.kwset, beg, buflim - beg, &kwsm);
	      if (offset == (size_t) -1)
		{
#ifdef MBS_SUPPORT
		  if (MB_CUR_MAX > 1)
		    free (mb_properties);
#endif
		  return (size_t)-1;
		}
	      beg += offset;
	      /* Narrow down to the line containing the candidate, and
		 run it through DFA. */
	      end = memchr (beg, eol, buflim - beg);
	      if (end != NULL)
		end++;
	      else
		end = buflim;
#ifdef MBS_SUPPORT
	      if (MB_CUR_MAX > 1 && mb_properties[beg - buf] == 0)
		continue;
#endif
	      while (beg > buf && beg[-1] != eol)
		--beg;
	      if (kwsm.index < cregex->kwset_exact_matches)
		goto success;
	      if (dfaexec (&cregex->dfa, beg, end - beg, &backref) == (size_t) -1)
		continue;
	    }
	  else
	    {
	      /* No good fixed strings; start with DFA. */
	      size_t offset = dfaexec (&cregex->dfa, beg, buflim - beg, &backref);
	      if (offset == (size_t) -1)
		break;
	      /* Narrow down to the line we've found. */
	      beg += offset;
	      end = memchr (beg, eol, buflim - beg);
	      if (end != NULL)
		end++;
	      else
		end = buflim;
	      while (beg > buf && beg[-1] != eol)
		--beg;
	    }
	  /* Successful, no backreferences encountered! */
	  if (!backref)
	    goto success;
	}
      else
	end = beg + buf_size;

      /* If we've made it to this point, this means DFA has seen
	 a probable match, and we need to run it through Regex. */
      for (i = 0; i < cregex->pcount; i++)
	{
	  cregex->patterns[i].regexbuf.not_eol = 0;
	  if (0 <= (start = re_search (&(cregex->patterns[i].regexbuf), beg,
				       end - beg - 1, 0,
				       end - beg - 1, &(cregex->patterns[i].regs))))
	    {
	      len = cregex->patterns[i].regs.end[0] - start;
	      if (exact)
		{
		  *match_size = len;
		  return start;
		}
	      if ((!cregex->match_lines && !cregex->match_words)
		  || (cregex->match_lines && len == end - beg - 1))
		goto success;
	      /* If -w, check if the match aligns with word boundaries.
		 We do this iteratively because:
		 (a) the line may contain more than one occurence of the
		 pattern, and
		 (b) Several alternatives in the pattern might be valid at a
		 given point, and we may need to consider a shorter one to
		 find a word boundary.  */
	      if (cregex->match_words)
		while (start >= 0)
		  {
		    if ((start == 0 || !IS_WORD_CONSTITUENT ((unsigned char) beg[start - 1]))
			&& (len == end - beg - 1
			    || !IS_WORD_CONSTITUENT ((unsigned char) beg[start + len])))
		      goto success;
		    if (len > 0)
		      {
			/* Try a shorter length anchored at the same place. */
			--len;
			cregex->patterns[i].regexbuf.not_eol = 1;
			len = re_match (&(cregex->patterns[i].regexbuf), beg,
					start + len, start,
					&(cregex->patterns[i].regs));
		      }
		    if (len <= 0)
		      {
			/* Try looking further on. */
			if (start == end - beg - 1)
			  break;
			++start;
			cregex->patterns[i].regexbuf.not_eol = 0;
			start = re_search (&(cregex->patterns[i].regexbuf), beg,
					   end - beg - 1,
					   start, end - beg - 1 - start,
					   &(cregex->patterns[i].regs));
			len = cregex->patterns[i].regs.end[0] - start;
		      }
		  }
	    }
	} /* for Regex patterns.  */
    } /* for (beg = end ..) */
#ifdef MBS_SUPPORT
  if (MB_CUR_MAX > 1 && mb_properties)
    free (mb_properties);
#endif /* MBS_SUPPORT */
  return (size_t) -1;

 success:
#ifdef MBS_SUPPORT
  if (MB_CUR_MAX > 1 && mb_properties)
    free (mb_properties);
#endif /* MBS_SUPPORT */
  *match_size = end - beg;
  return beg - buf;
}
예제 #7
0
파일: regexp.c 프로젝트: agordon/sed
int
match_regex (struct regex *regex, char *buf, size_t buflen,
            size_t buf_start_offset, struct re_registers *regarray,
            int regsize)
{
  int ret;
  static struct regex *regex_last;

  /* printf ("Matching from %d/%d\n", buf_start_offset, buflen); */

  /* Keep track of the last regexp matched. */
  if (!regex)
    {
      regex = regex_last;
      if (!regex_last)
        bad_prog (_(NO_REGEX));
    }
  else
    regex_last = regex;

  /* gnulib's re_search uses signed-int as length */
  if (buflen >= INT_MAX)
    panic (_("regex input buffer length larger than INT_MAX"));

  if (regex->pattern.no_sub && regsize)
    {
      /* Re-compiling an existing regex, free the previously allocated
         structures.  */
      if (regex->dfa)
        {
          dfafree (regex->dfa);
          free (regex->dfa);
          regex->dfa = NULL;
        }
      regfree (&regex->pattern);

      compile_regex_1 (regex, regsize);
    }

  regex->pattern.regs_allocated = REGS_REALLOCATE;

  /* Optimized handling for '^' and '$' patterns */
  if (regex->begline || regex->endline)
    {
      size_t offset;

      if (regex->endline)
        {
          const char *p = NULL;

          if (regex->flags & REG_NEWLINE)
            p = memchr (buf + buf_start_offset, buffer_delimiter,
                        buflen - buf_start_offset);

          offset = p ? p - buf : buflen;
        }
      else if (buf_start_offset == 0)
        /* begline anchor, starting at beginning of the buffer. */
        offset = 0;
      else if (!(regex->flags & REG_NEWLINE))
        /* begline anchor, starting in the middle of the text buffer,
           and multiline regex is not specified - will never match.
           Example: seq 2 | sed 'N;s/^/X/g' */
        return 0;
      else if (buf[buf_start_offset - 1] == buffer_delimiter)
        /* begline anchor, starting in the middle of the text buffer,
           with multiline match, and the current character
           is the line delimiter - start here.
           Example: seq 2 | sed 'N;s/^/X/mg' */
        offset = buf_start_offset;
      else
        {
          /* begline anchor, starting in the middle of the search buffer,
             all previous optimizions didn't work: search
             for the next line delimiter character in the buffer,
             and start from there if found. */
          const char *p = memchr (buf + buf_start_offset, buffer_delimiter,
                                  buflen - buf_start_offset);

          if (p == NULL)
            return 0;

          offset = p - buf + 1;
        }

      if (regsize)
        {
          size_t i;

          if (!regarray->start)
            {
              regarray->start = XCALLOC (1, regoff_t);
              regarray->end = XCALLOC (1, regoff_t);
              regarray->num_regs = 1;
            }

          regarray->start[0] = offset;
          regarray->end[0] = offset;

          for (i = 1 ; i < regarray->num_regs; ++i)
            regarray->start[i] = regarray->end[i] = -1;
        }

      return 1;
    }

  if (buf_start_offset == 0)
    {
      struct dfa *superset = dfasuperset (regex->dfa);

      if (superset && !dfaexec (superset, buf, buf + buflen, true, NULL, NULL))
        return 0;

      if ((!regsize && (regex->flags & REG_NEWLINE))
          || (!superset && dfaisfast (regex->dfa)))
        {
          bool backref = false;

          if (!dfaexec (regex->dfa, buf, buf + buflen, true, NULL, &backref))
            return 0;

          if (!regsize && (regex->flags & REG_NEWLINE) && !backref)
            return 1;
        }
    }

  /* If the buffer delimiter is not newline character, we cannot use
     newline_anchor flag of regex.  So do it line-by-line, and add offset
     value to results.  */
  if ((regex->flags & REG_NEWLINE) && buffer_delimiter != '\n')
    {
      const char *beg, *end;
      const char *start;

      beg = buf;

      if (buf_start_offset > 0)
        {
          const char *eol = memrchr (buf, buffer_delimiter, buf_start_offset);

          if (eol != NULL)
            beg = eol + 1;
        }

      start = buf + buf_start_offset;

      for (;;)
        {
          end = memchr (beg, buffer_delimiter, buf + buflen - beg);

          if (end == NULL)
            end = buf + buflen;

          ret = re_search (&regex->pattern, beg, end - beg,
                           start - beg, end - start,
                           regsize ? regarray : NULL);

          if (ret > -1)
            {
              size_t i;

              ret += beg - buf;

              if (regsize)
                {
                  for (i = 0; i < regarray->num_regs; ++i)
                    {
                      if (regarray->start[i] > -1)
                        regarray->start[i] += beg - buf;
                      if (regarray->end[i] > -1)
                        regarray->end[i] += beg - buf;
                    }
                }

              break;
            }

          if (end == buf + buflen)
            break;

          beg = start = end + 1;
        }
    }
  else
    ret = re_search (&regex->pattern, buf, buflen, buf_start_offset,
                     buflen - buf_start_offset,
                     regsize ? regarray : NULL);

  return (ret > -1);
}