/**
 * Scans a grain out of a regex - part of the CL Regex Optimiser.
 *
 * A grain is a run of safe symbols that is not followed by one of the
 * quantifiers ?, * or {..}. Scanning starts at mark and consumes as many
 * symbols as possible. A backslash followed by any character other than
 * the string terminator counts as ONE symbol; the backslashes are left in
 * place, so the caller must strip them.
 *
 * If the run is immediately followed by ?, * or {, its final symbol
 * belongs to the quantifier and is handed back. A run is only reported
 * as a grain when at least two symbols remain after that adjustment.
 *
 * @param mark  Pointer to location in the regex string from
 *              which to read.
 * @return      Pointer to the first character after the grain
 *              that was read (or the original "mark" pointer
 *              if no grain is found).
 */
char *
read_grain(char *mark)
{
  char *cursor = mark;
  int n_symbols = 0;        /* effective length of grain: an escape pair counts once */
  int tail_was_escaped = 0; /* non-zero if the last consumed symbol was backslash + char */

  /* a backslash at the very end of the string is not an escape and stops the scan
   * (unless is_safe_char() itself accepts it) */
  while (is_safe_char(*cursor) || (*cursor == '\\' && cursor[1])) {
    tail_was_escaped = (*cursor == '\\');
    cursor += tail_was_escaped ? 2 : 1;   /* step over the backslash together with its character */
    n_symbols++;
  }

  /* a following quantifier applies to the last symbol only, so give that symbol back
   * (both bytes of it, if it was an escape pair) */
  if (cursor > mark && (*cursor == '?' || *cursor == '*' || *cursor == '{')) {
    cursor -= tail_was_escaped ? 2 : 1;
    n_symbols--;
  }

  return (n_symbols >= 2) ? cursor : mark;
}
/**
 * Command-line entry point.
 *
 * Expects exactly one argument and writes it to stdout byte by byte:
 * bytes accepted by is_safe_char() are printed unchanged, every other
 * byte is printed as a percent sign followed by two uppercase hex digits.
 * No trailing newline is written.
 *
 * @param argc  Number of command-line arguments (must be 2).
 * @param argv  Argument vector; argv[1] is the string to encode.
 * @return      0 on success; the process exits with EXIT_FAILURE
 *              (printing nothing) if the argument count is wrong.
 */
int
main(int argc, const char *argv[])
{
  const char *cursor;

  if (argc != 2) {
    exit(EXIT_FAILURE);
  }

  cursor = argv[1];
  while (*cursor != '\0') {
    /* work on the unsigned byte value so the hex output is never sign-extended */
    unsigned char byte = *cursor++;

    if (is_safe_char(byte)) {
      printf("%c", byte);
    }
    else {
      printf("%%%02X", byte);
    }
  }

  return 0;
}
/**
 * Reads in a matchall (dot wildcard) or safe character -
 * part of the CL Regex Optimiser.
 *
 * This function reads in exactly one of the following:
 *  - the matchall dot;
 *  - a reasonably safe-looking character class, i.e. [...] whose
 *    content contains none of the characters \ [ - ^ ;
 *  - a single safe character (as decided by is_safe_char());
 *  - a backslash together with the character it escapes.
 *
 * A backslash that is the very last character of the string has nothing
 * to escape and is NOT read: the function returns "mark" in that case
 * rather than a pointer beyond the string terminator.
 *
 * @param mark  Pointer to location in the regex string from
 *              which to read.
 * @return      Pointer to the first character after the character
 *              (class) it has read in (or the original "mark"
 *              pointer if something suitable was not read).
 */
char *
read_matchall(char *mark)
{
  if (*mark == '.') {
    /* read the matchall dot */
    return mark + 1;
  }
  else if (*mark == '[') {
    char *point = mark + 1;

    /* [AH] History of this test: the original version stopped only at ] \ [ and end of string.
     * Rationale given at the time: according to the POSIX standard, \ does not have a special
     * meaning in a character class; we won't skip it or any other special characters with
     * possibly messy results; we just accept | as a special optimisation for the matches and
     * contains operators in CQP.
     * The current version additionally stops at - and ^ because those characters are
     * "not safe-looking" within a character class in PCRE. */
    while (*point != ']' && *point != '\\' && *point != '[' && *point != '\0'
           && *point != '-' && *point != '^') {
      point++;
    }

    /* if we got to ] without hitting a "messy" character, read the entire character class.
     * otherwise read nothing. */
    return (*point == ']') ? point + 1 : mark;
  }
  else if (is_safe_char(*mark)) {
    return mark + 1;
  }
  else if (*mark == '\\') {
    /* a backslash at the end of the string escapes nothing; stepping over two characters
     * here would hand the caller a pointer past the terminating NUL */
    if (mark[1] == '\0') {
      return mark;
    }
    /* outside a character class, \ always escapes to literal meaning */
    /* TODO: problem, in PCRE a \ can escape not just the following letter, but a sequence after it;
     * will this break the grain-finder? (Overgenerating grains merely results in extra calls to the
     * regex engine, lowering the effect of the optimisation, but not giving false matches) */
    return mark + 2;
  }
  else {
    return mark;
  }
}