/*
 * research --- look for the compiled regexp RP in the LEN bytes of STR
 * that begin at offset START.
 *
 * If RP has a DFA, it is tried first.  A DFA miss is final: the regex
 * matcher is not consulted, even when NEED_START is true.  After a DFA
 * hit (or when there is no DFA) the full regex matcher runs if the
 * caller asked for match positions, if there is no DFA, or if dfaexec()
 * set its back-reference flag.
 *
 * Returns -1 on a DFA miss, 1 when the DFA hit is accepted as is, and
 * otherwise the value returned by re_search(), which fills rp->regs.
 */
int
research(Regexp *rp, register char *str, int start, register size_t len, int need_start)
{
	char *hit = str;	/* stays non-NULL when there is no DFA to run */
	int try_backref;	/* written by dfaexec(); read only after a DFA hit */

	if (rp->dfa) {
		int count = 0;
		/*
		 * dfaexec() may plant a '\n' just past the searched text,
		 * so remember that byte and put it back afterwards.
		 */
		char saved = str[start+len];

		hit = dfaexec(&(rp->dfareg), str+start, str+start+len, TRUE,
				&count, &try_backref);
		str[start+len] = saved;
	}

	if (hit == NULL)
		return -1;

	/* The DFA answer suffices: no positions wanted, no back-references. */
	if (! need_start && rp->dfa != FALSE && ! try_backref)
		return 1;

	return re_search(&(rp->pat), str, start+len, start, len, &(rp->regs));
}
/* Test driver for the DFA matcher.

   Compiles the pattern in argv[1] into a DFA, runs it over the text in
   argv[2] and, when dfaexec returns a non-null pointer, prints that
   pointer's offset from the start of the text followed by a newline.

   argv[3], when present, is converted with atoi and passed to dfaexec
   as its allow_nl argument (non-zero means true).

   Exits with EXIT_FAILURE when fewer than two arguments are given and
   with EXIT_SUCCESS otherwise, whether or not anything was printed.  */
int
main (int argc, char **argv)
{
  struct dfa *dfa;
  char *beg, *end, *p;
  int allow_nl;

  set_program_name (argv[0]);

  if (argc < 3)
    exit (EXIT_FAILURE);

  setlocale (LC_ALL, "");

  dfasyntax (RE_SYNTAX_GREP | RE_NO_EMPTY_RANGES, 0, '\n');
  dfa = dfaalloc ();
  dfacomp (argv[1], strlen (argv[1]), dfa, 0);

  beg = argv[2];
  end = argv[2] + strlen (argv[2]);
  allow_nl = argc > 3 && atoi (argv[3]);

  p = dfaexec (dfa, beg, end, allow_nl, NULL, NULL);

  /* p - beg has type ptrdiff_t, whose printf length modifier is 't';
     'z' is reserved for size_t and its signed counterpart.  */
  if (p != NULL)
    printf ("%td\n", p - beg);

  exit (EXIT_SUCCESS);
}
int evaluate_fast_regex( struct fast_regex * fre_t, char * str, size_t len ) { char * sub ; struct _fregex * fre = (struct _fregex *)( fre_t->data ) ; if( fre->kwset ) { struct kwsmatch kwsm ; sub = kwsexec( fre->kwset, (char *)str, len, &kwsm) ; if( sub == NULL ) return 0 ; if( kwsm.index < fre->num_exact_kws ) { return 1 ; } } if( HAS_DFA(fre_t->options) ) { int backref = 0 ; sub = dfaexec( &(fre->dfa), str, (str+len), 0, NULL, &backref) ; if( sub == NULL ) return 0 ; if ( !backref || (fre_t->options & FRE_NO_REGEX) ) return 1 ; } return re_match( &fre->regex , str, len, 0, NULL ) > 0 ; }
/*
 * research --- look for the compiled regexp RP in the LEN bytes of STR
 * that begin at offset START.
 *
 * FLAGS is a bit mask:
 *   RE_NEED_START  the caller wants match positions, so the regex
 *                  matcher must run and fill rp->regs;
 *   RE_NO_BOL      the start of the text is not a beginning of line;
 *                  rp->pat.not_bol is set for the duration of the call.
 *
 * Returns -1 when the DFA rejects the text, 1 when the DFA hit alone is
 * accepted, and otherwise the value returned by re_search().
 * rp->pat.not_bol is always cleared before returning.
 */
int
research(Regexp *rp, register char *str, int start, register size_t len, int flags)
{
	const char *ret = str;
	/*
	 * Must start out false: dfaexec() is skipped when RE_NO_BOL is
	 * set, and the flag is still examined below.
	 */
	int try_backref = 0;
	int need_start;
	int no_bol;
	int res;

	need_start = ((flags & RE_NEED_START) != 0);
	no_bol = ((flags & RE_NO_BOL) != 0);

	if (no_bol)
		rp->pat.not_bol = 1;

	/*
	 * Always do dfa search if can; if it fails, then even if
	 * need_start is true, we won't bother with the regex search.
	 *
	 * The dfa matcher doesn't have a no_bol flag, so don't bother
	 * trying it in that case.
	 */
	if (rp->dfa && ! no_bol) {
		char save;
		int count = 0;
		/*
		 * dfa likes to stick a '\n' right after the matched
		 * text.  So we just save and restore the character.
		 */
		save = str[start+len];
		ret = dfaexec(&(rp->dfareg), str+start, str+start+len, TRUE,
				&count, &try_backref);
		str[start+len] = save;
	}

	if (ret) {
		/*
		 * The regex matcher has to run whenever the dfa could not
		 * give a definitive answer.  That includes the no_bol
		 * case, where the dfa was never consulted at all.
		 */
		if (need_start || rp->dfa == FALSE || no_bol || try_backref) {
			/*
			 * Passing NULL as last arg speeds up search for cases
			 * where we don't need the start/end info.
			 */
			res = re_search(&(rp->pat), str, start+len,
				start, len, need_start ? &(rp->regs) : NULL);
		} else
			res = 1;
	} else
		res = -1;

	rp->pat.not_bol = 0;
	return res;
}
size_t EGexecute (char const *buf, size_t size, size_t *match_size, char const *start_ptr) { char const *buflim, *beg, *end, *match, *best_match, *mb_start; char eol = eolbyte; int backref; regoff_t start; size_t len, best_len; struct kwsmatch kwsm; size_t i, ret_val; mb_len_map_t *map = NULL; if (MB_CUR_MAX > 1) { if (match_icase) { /* mbtolower adds a NUL byte at the end. That will provide space for the sentinel byte dfaexec may add. */ char *case_buf = mbtolower (buf, &size, &map); if (start_ptr) start_ptr = case_buf + (start_ptr - buf); buf = case_buf; } } mb_start = buf; buflim = buf + size; for (beg = end = buf; end < buflim; beg = end) { if (!start_ptr) { /* We don't care about an exact match. */ if (kwset) { /* Find a possible match using the KWset matcher. */ size_t offset = kwsexec (kwset, beg, buflim - beg, &kwsm); if (offset == (size_t) -1) goto failure; beg += offset; /* Narrow down to the line containing the candidate, and run it through DFA. */ if ((end = memchr(beg, eol, buflim - beg)) != NULL) end++; else end = buflim; match = beg; while (beg > buf && beg[-1] != eol) --beg; if (kwsm.index < kwset_exact_matches) { if (!MBS_SUPPORT) goto success; if (mb_start < beg) mb_start = beg; if (MB_CUR_MAX == 1 || !is_mb_middle (&mb_start, match, buflim, kwsm.size[0])) goto success; } if (dfaexec (dfa, beg, (char *) end, 0, NULL, &backref) == NULL) continue; } else { /* No good fixed strings; start with DFA. */ char const *next_beg = dfaexec (dfa, beg, (char *) buflim, 0, NULL, &backref); /* If there's no match, or if we've matched the sentinel, we're done. */ if (next_beg == NULL || next_beg == buflim) break; /* Narrow down to the line we've found. */ beg = next_beg; if ((end = memchr(beg, eol, buflim - beg)) != NULL) end++; else end = buflim; while (beg > buf && beg[-1] != eol) --beg; } /* Successful, no backreferences encountered! */ if (!backref) goto success; } else { /* We are looking for the leftmost (then longest) exact match. 
We will go through the outer loop only once. */ beg = start_ptr; end = buflim; } /* If the "line" is longer than the maximum regexp offset, die as if we've run out of memory. */ if (TYPE_MAXIMUM (regoff_t) < end - buf - 1) xalloc_die (); /* If we've made it to this point, this means DFA has seen a probable match, and we need to run it through Regex. */ best_match = end; best_len = 0; for (i = 0; i < pcount; i++) { patterns[i].regexbuf.not_eol = 0; start = re_search (&(patterns[i].regexbuf), buf, end - buf - 1, beg - buf, end - beg - 1, &(patterns[i].regs)); if (start < -1) xalloc_die (); else if (0 <= start) { len = patterns[i].regs.end[0] - start; match = buf + start; if (match > best_match) continue; if (start_ptr && !match_words) goto assess_pattern_match; if ((!match_lines && !match_words) || (match_lines && len == end - beg - 1)) { match = beg; len = end - beg; goto assess_pattern_match; } /* If -w, check if the match aligns with word boundaries. We do this iteratively because: (a) the line may contain more than one occurrence of the pattern, and (b) Several alternatives in the pattern might be valid at a given point, and we may need to consider a shorter one to find a word boundary. */ if (match_words) while (match <= best_match) { regoff_t shorter_len = 0; if ((match == buf || !WCHAR ((unsigned char) match[-1])) && (start + len == end - buf - 1 || !WCHAR ((unsigned char) match[len]))) goto assess_pattern_match; if (len > 0) { /* Try a shorter length anchored at the same place. */ --len; patterns[i].regexbuf.not_eol = 1; shorter_len = re_match (&(patterns[i].regexbuf), buf, match + len - beg, match - buf, &(patterns[i].regs)); if (shorter_len < -1) xalloc_die (); } if (0 < shorter_len) len = shorter_len; else { /* Try looking further on. 
*/ if (match == end - 1) break; match++; patterns[i].regexbuf.not_eol = 0; start = re_search (&(patterns[i].regexbuf), buf, end - buf - 1, match - buf, end - match - 1, &(patterns[i].regs)); if (start < 0) { if (start < -1) xalloc_die (); break; } len = patterns[i].regs.end[0] - start; match = buf + start; } } /* while (match <= best_match) */ continue; assess_pattern_match: if (!start_ptr) { /* Good enough for a non-exact match. No need to look at further patterns, if any. */ goto success; } if (match < best_match || (match == best_match && len > best_len)) { /* Best exact match: leftmost, then longest. */ best_match = match; best_len = len; } } /* if re_search >= 0 */ } /* for Regex patterns. */ if (best_match < end) { /* We have found an exact match. We were just waiting for the best one (leftmost then longest). */ beg = best_match; len = best_len; goto success_in_len; } } /* for (beg = end ..) */ failure: ret_val = -1; goto out; success: len = end - beg; success_in_len:; size_t off = beg - buf; mb_case_map_apply (map, &off, &len); *match_size = len; ret_val = off; out: return ret_val; }
/* Search the BUF_SIZE bytes at BUF for a match of COMPILED_PATTERN,
   which points to a struct compiled_regex.

   When EXACT is false, find the first line that matches: a keyword set
   (if any) and then the DFA are used as filters, and the regex matcher
   only runs when the DFA reports back-references or when line/word
   matching has to be verified.  On success the function returns the
   offset of the matching line from BUF and stores the line's length
   (up to and including its end-of-line byte, if any) in *MATCH_SIZE.

   When EXACT is true, the filters are skipped and the regex matcher is
   run over BUF directly; the function returns the offset of the first
   pattern match and stores its length in *MATCH_SIZE.

   Returns (size_t) -1 when nothing matches.  */
static size_t
EGexecute (const void *compiled_pattern,
           const char *buf, size_t buf_size,
           size_t *match_size, bool exact)
{
  struct compiled_regex *cregex = (struct compiled_regex *) compiled_pattern;
  register const char *buflim, *beg, *end;
  char eol = cregex->eolbyte;
  int backref, start, len;
  struct kwsmatch kwsm;
  size_t i;
#ifdef MBS_SUPPORT
  char *mb_properties = NULL;
#endif /* MBS_SUPPORT */

#ifdef MBS_SUPPORT
  if (MB_CUR_MAX > 1 && cregex->ckwset.kwset)
    mb_properties = check_multibyte_string (buf, buf_size);
#endif /* MBS_SUPPORT */

  buflim = buf + buf_size;

  for (beg = end = buf; end < buflim; beg = end)
    {
      if (!exact)
        {
          if (cregex->ckwset.kwset)
            {
              /* Find a possible match using the KWset matcher.  */
              size_t offset =
                kwsexec (cregex->ckwset.kwset, beg, buflim - beg, &kwsm);
              if (offset == (size_t) -1)
                {
#ifdef MBS_SUPPORT
                  if (MB_CUR_MAX > 1)
                    free (mb_properties);
#endif
                  return (size_t)-1;
                }
              beg += offset;
              /* Narrow down to the line containing the candidate, and
                 run it through DFA.  */
              end = memchr (beg, eol, buflim - beg);
              if (end != NULL)
                end++;
              else
                end = buflim;
#ifdef MBS_SUPPORT
              if (MB_CUR_MAX > 1 && mb_properties[beg - buf] == 0)
                continue;
#endif
              while (beg > buf && beg[-1] != eol)
                --beg;
              if (kwsm.index < cregex->kwset_exact_matches)
                goto success;
              if (dfaexec (&cregex->dfa, beg, end - beg, &backref)
                  == (size_t) -1)
                continue;
            }
          else
            {
              /* No good fixed strings; start with DFA.  */
              size_t offset =
                dfaexec (&cregex->dfa, beg, buflim - beg, &backref);
              if (offset == (size_t) -1)
                break;
              /* Narrow down to the line we've found.  */
              beg += offset;
              end = memchr (beg, eol, buflim - beg);
              if (end != NULL)
                end++;
              else
                end = buflim;
              while (beg > buf && beg[-1] != eol)
                --beg;
            }
          /* Successful, no backreferences encountered!  */
          if (!backref)
            goto success;
        }
      else
        end = beg + buf_size;

      /* If we've made it to this point, this means DFA has seen
         a probable match, and we need to run it through Regex.  */
      for (i = 0; i < cregex->pcount; i++)
        {
          cregex->patterns[i].regexbuf.not_eol = 0;
          if (0 <= (start = re_search (&(cregex->patterns[i].regexbuf), beg,
                                       end - beg - 1, 0,
                                       end - beg - 1,
                                       &(cregex->patterns[i].regs))))
            {
              len = cregex->patterns[i].regs.end[0] - start;
              if (exact)
                {
                  *match_size = len;
#ifdef MBS_SUPPORT
                  /* The multibyte property map is allocated above
                     regardless of EXACT, so release it on this early
                     exit just as on every other one.  */
                  if (MB_CUR_MAX > 1 && mb_properties)
                    free (mb_properties);
#endif /* MBS_SUPPORT */
                  return start;
                }
              if ((!cregex->match_lines && !cregex->match_words)
                  || (cregex->match_lines && len == end - beg - 1))
                goto success;
              /* If -w, check if the match aligns with word boundaries.
                 We do this iteratively because:
                 (a) the line may contain more than one occurrence of the
                 pattern, and
                 (b) Several alternatives in the pattern might be valid at a
                 given point, and we may need to consider a shorter one to
                 find a word boundary.  */
              if (cregex->match_words)
                while (start >= 0)
                  {
                    if ((start == 0
                         || !IS_WORD_CONSTITUENT ((unsigned char) beg[start - 1]))
                        && (len == end - beg - 1
                            || !IS_WORD_CONSTITUENT ((unsigned char) beg[start + len])))
                      goto success;
                    if (len > 0)
                      {
                        /* Try a shorter length anchored at the same place.  */
                        --len;
                        cregex->patterns[i].regexbuf.not_eol = 1;
                        len = re_match (&(cregex->patterns[i].regexbuf), beg,
                                        start + len, start,
                                        &(cregex->patterns[i].regs));
                      }
                    if (len <= 0)
                      {
                        /* Try looking further on.  */
                        if (start == end - beg - 1)
                          break;
                        ++start;
                        cregex->patterns[i].regexbuf.not_eol = 0;
                        start = re_search (&(cregex->patterns[i].regexbuf), beg,
                                           end - beg - 1,
                                           start, end - beg - 1 - start,
                                           &(cregex->patterns[i].regs));
                        len = cregex->patterns[i].regs.end[0] - start;
                      }
                  }
            }
        } /* for Regex patterns.  */
    } /* for (beg = end ..) */

#ifdef MBS_SUPPORT
  if (MB_CUR_MAX > 1 && mb_properties)
    free (mb_properties);
#endif /* MBS_SUPPORT */
  return (size_t) -1;

 success:
#ifdef MBS_SUPPORT
  if (MB_CUR_MAX > 1 && mb_properties)
    free (mb_properties);
#endif /* MBS_SUPPORT */
  *match_size = end - beg;
  return beg - buf;
}
int match_regex (struct regex *regex, char *buf, size_t buflen, size_t buf_start_offset, struct re_registers *regarray, int regsize) { int ret; static struct regex *regex_last; /* printf ("Matching from %d/%d\n", buf_start_offset, buflen); */ /* Keep track of the last regexp matched. */ if (!regex) { regex = regex_last; if (!regex_last) bad_prog (_(NO_REGEX)); } else regex_last = regex; /* gnulib's re_search uses signed-int as length */ if (buflen >= INT_MAX) panic (_("regex input buffer length larger than INT_MAX")); if (regex->pattern.no_sub && regsize) { /* Re-compiling an existing regex, free the previously allocated structures. */ if (regex->dfa) { dfafree (regex->dfa); free (regex->dfa); regex->dfa = NULL; } regfree (®ex->pattern); compile_regex_1 (regex, regsize); } regex->pattern.regs_allocated = REGS_REALLOCATE; /* Optimized handling for '^' and '$' patterns */ if (regex->begline || regex->endline) { size_t offset; if (regex->endline) { const char *p = NULL; if (regex->flags & REG_NEWLINE) p = memchr (buf + buf_start_offset, buffer_delimiter, buflen - buf_start_offset); offset = p ? p - buf : buflen; } else if (buf_start_offset == 0) /* begline anchor, starting at beginning of the buffer. */ offset = 0; else if (!(regex->flags & REG_NEWLINE)) /* begline anchor, starting in the middle of the text buffer, and multiline regex is not specified - will never match. Example: seq 2 | sed 'N;s/^/X/g' */ return 0; else if (buf[buf_start_offset - 1] == buffer_delimiter) /* begline anchor, starting in the middle of the text buffer, with multiline match, and the current character is the line delimiter - start here. Example: seq 2 | sed 'N;s/^/X/mg' */ offset = buf_start_offset; else { /* begline anchor, starting in the middle of the search buffer, all previous optimizions didn't work: search for the next line delimiter character in the buffer, and start from there if found. 
*/ const char *p = memchr (buf + buf_start_offset, buffer_delimiter, buflen - buf_start_offset); if (p == NULL) return 0; offset = p - buf + 1; } if (regsize) { size_t i; if (!regarray->start) { regarray->start = XCALLOC (1, regoff_t); regarray->end = XCALLOC (1, regoff_t); regarray->num_regs = 1; } regarray->start[0] = offset; regarray->end[0] = offset; for (i = 1 ; i < regarray->num_regs; ++i) regarray->start[i] = regarray->end[i] = -1; } return 1; } if (buf_start_offset == 0) { struct dfa *superset = dfasuperset (regex->dfa); if (superset && !dfaexec (superset, buf, buf + buflen, true, NULL, NULL)) return 0; if ((!regsize && (regex->flags & REG_NEWLINE)) || (!superset && dfaisfast (regex->dfa))) { bool backref = false; if (!dfaexec (regex->dfa, buf, buf + buflen, true, NULL, &backref)) return 0; if (!regsize && (regex->flags & REG_NEWLINE) && !backref) return 1; } } /* If the buffer delimiter is not newline character, we cannot use newline_anchor flag of regex. So do it line-by-line, and add offset value to results. */ if ((regex->flags & REG_NEWLINE) && buffer_delimiter != '\n') { const char *beg, *end; const char *start; beg = buf; if (buf_start_offset > 0) { const char *eol = memrchr (buf, buffer_delimiter, buf_start_offset); if (eol != NULL) beg = eol + 1; } start = buf + buf_start_offset; for (;;) { end = memchr (beg, buffer_delimiter, buf + buflen - beg); if (end == NULL) end = buf + buflen; ret = re_search (®ex->pattern, beg, end - beg, start - beg, end - start, regsize ? regarray : NULL); if (ret > -1) { size_t i; ret += beg - buf; if (regsize) { for (i = 0; i < regarray->num_regs; ++i) { if (regarray->start[i] > -1) regarray->start[i] += beg - buf; if (regarray->end[i] > -1) regarray->end[i] += beg - buf; } } break; } if (end == buf + buflen) break; beg = start = end + 1; } } else ret = re_search (®ex->pattern, buf, buflen, buf_start_offset, buflen - buf_start_offset, regsize ? regarray : NULL); return (ret > -1); }