Exemple #1
0
/**
 * Scans a grain out of a regex string (CL Regex Optimiser helper).
 *
 * A grain is a run of safe symbols that is not followed by one of the
 * quantifiers ?, * or {..}. Starting at mark, the longest possible run
 * is consumed; a backslash followed by any character counts as one
 * symbol (the caller is responsible for stripping the backslashes).
 * If the run is directly followed by ?, * or {, its last symbol is
 * given back, since that symbol is optional or repeated. Runs of fewer
 * than two symbols are not accepted as grains.
 *
 * @param mark  Position in the regex string at which to start reading.
 * @return      Pointer just past the grain that was read, or the
 *              unchanged "mark" pointer if no grain was found.
 */
char *
read_grain(char *mark)
{
  char *cursor = mark;
  int n_symbols = 0;   /* effective grain length, escape pairs count once */
  int tail_width = 1;  /* bytes used by the most recent symbol: 2 if escaped, else 1 */

  /* a backslash only starts an escape pair if something follows it;
   * a backslash at end of string really is a backslash */
  while (is_safe_char(*cursor) || (*cursor == '\\' && cursor[1])) {
    tail_width = (*cursor == '\\') ? 2 : 1;
    cursor += tail_width;
    n_symbols++;
  }

  if (cursor > mark) {
    switch (*cursor) {
    case '?':
    case '*':
    case '{':
      /* the quantifier applies to the last symbol, so drop it from the
       * grain (including its backslash, if it was escaped) */
      cursor -= tail_width;
      n_symbols--;
      break;
    default:
      break;
    }
  }

  return (n_symbols >= 2) ? cursor : mark;
}
Exemple #2
0
/*
 * Prints argv[1] to stdout one byte at a time: bytes accepted by
 * is_safe_char() are written unchanged, every other byte is written as
 * '%' followed by its value as two uppercase hex digits.
 * Exits with EXIT_FAILURE unless exactly one argument is given.
 */
int
main(int argc, const char *argv[])
{
	const char *cursor;

	if (2 != argc) {
		exit(EXIT_FAILURE);
	}

	cursor = argv[1];
	while (*cursor != '\0') {
		/* go through unsigned char so %02X never sees a negative value */
		unsigned char byte = *cursor;

		if (is_safe_char(byte)) {
			printf("%c", byte);
		} else {
			printf("%%%02X", byte);
		}
		cursor++;
	}
	return 0;
}
Exemple #3
0
/**
 * Reads in a matchall (dot wildcard) or safe character -
 * part of the CL Regex Optimiser.
 *
 * This function reads in one of:
 *   - the matchall dot;
 *   - a reasonably safe-looking character class [...];
 *   - a single safe character (as judged by is_safe_char());
 *   - a backslash together with the character it escapes.
 *
 * A backslash that is the last character of the string is NOT read:
 * there is nothing for it to escape, and consuming two characters
 * would move the returned pointer past the string terminator.
 *
 * @param mark  Pointer to location in the regex string from
 *              which to read.
 * @return      Pointer to the first character after the character
 *              (class) it has read in (or the original "mark"
 *              pointer if something suitable was not read).
 */
char *
read_matchall(char *mark)
{
  if (*mark == '.') {
    /* read the matchall dot */
    return mark + 1;
  }
  else if (*mark == '[') {
    char *point = mark + 1;
    /* Scan to the closing bracket, giving up on any character that is
     * "not safe-looking" within a character class in PCRE: a backslash,
     * a nested '[', a range '-', a negation '^', or end of string.
     * (The earlier POSIX-oriented version of this loop did not stop at
     * '-' and '^'. Its comment noted that '|' is deliberately accepted
     * as a special optimisation for the matches and contains operators
     * in CQP.)
     * NOTE(review): "[]" is accepted here as a complete class, although
     * PCRE by default treats a ']' directly after '[' as a literal class
     * member - confirm whether callers can ever pass such input. */
    while (*point != ']' && *point != '\\' && *point != '[' && *point != '\0'
            && *point != '-' && *point != '^') {
      point++;
    }
    /* if we got to ] without hitting a "messy" character, read the entire character class.
     * otherwise read nothing.  */
    return (*point == ']') ? point + 1 : mark;
  }
  else if (is_safe_char(*mark)) {
    return mark + 1;
  }
  else if (*mark == '\\' && mark[1] != '\0') {
    /* outside a character class, \ always escapes to literal meaning.
     * The mark[1] check keeps us from stepping over the terminating NUL
     * when the regex ends in a lone backslash.
     * TODO: problem, in PCRE a \ can escape not just the following letter, but a sequence after it;
     * will this break the grain-finder? (Overgenerating grains merely results in extra calls to the
     * regex engine, lowering the effect of the optimisation, but not giving false matches) */
    return mark + 2;
  }
  else {
    return mark;
  }
}