/*
 * Range search for singlebyte locales using the modified UNIX(R) Regular
 * Expression Library DFA.
 *
 * Scans the input starting at ip->ib_cur one byte at a time, driving the
 * DFA taken from e0->e_exp->re_dfa.  lineno is incremented once for every
 * line that is started.
 *
 * ip	input block; ip->ib_cur is read as the start position and is
 *	advanced as lines are consumed.
 * last	scanning stops (return 0) as soon as the position of the next
 *	line is greater than this pointer.
 *
 * Returns 1 immediately after a line was handed to outline() while qflag
 * or lflag is set; returns 0 once the position has moved past last.
 *
 * The labels found, succeed and fail live inside the inner loop and are
 * entered by goto from outside it; the statement order is significant.
 */
static int
rc_range(struct iblok *ip, char *last)
{
	char	*p;
	int	c, cstat, nstat;
	Dfa	*dp = e0->e_exp->re_dfa;

	p = ip->ib_cur;
	lineno++;
	/* Start in the beginning-of-line state; it may already accept. */
	cstat = dp->anybol;
	if (dp->acc[cstat])
		goto found;
	for (;;) {
		/*
		 * A zero entry in the transition table means the transition
		 * is not available there and regtrans() has to be asked.
		 */
		if ((nstat = dp->trans[cstat][*p & 0377]) == 0) {
			/*
			 * '\0' is used to indicate end-of-line. If a '\0'
			 * character appears in input, it matches '$' but
			 * the DFA remains in dead state afterwards; there
			 * is thus no need to handle this condition
			 * specially to get the same behavior as in plain
			 * regexec().
			 */
			if ((c = *p & 0377) == '\n')
				c = '\0';
			/* A zero result from regtrans() abandons this line. */
			if ((nstat = regtrans(dp, cstat, c, 1)) == 0)
				goto fail;
			/*
			 * Mirror the '\0' table entry into the '\n' slot so a
			 * later newline in this state can be served from the
			 * table (presumably regtrans() filled the '\0' slot
			 * -- confirm against regtrans()).
			 */
			dp->trans[cstat]['\n'] = dp->trans[cstat]['\0'];
		}
		/* Table values are state index + 1. */
		if (dp->acc[cstat = nstat - 1]) {
		found:
			for (;;) {
				if (vflag == 0) {
		succeed:
					/*
					 * Hand the line to outline(); it is
					 * presumably responsible for moving
					 * ip->ib_cur to the next line --
					 * confirm against outline().
					 */
					outline(ip, last, p - ip->ib_cur);
					if (qflag || lflag)
						return 1;
				} else {
		fail:
					/*
					 * Line is not selected: skip ahead to
					 * just past its terminating newline.
					 */
					ip->ib_cur = p;
					while (*ip->ib_cur++ != '\n');
				}
				if ((p = ip->ib_cur) > last)
					return 0;
				lineno++;
				/*
				 * Begin the next line.  If the start state
				 * does not accept, resume normal scanning;
				 * otherwise handle this line right away.
				 */
				if (dp->acc[cstat = dp->anybol] == 0)
					goto brk2;
			}
		}
		if (*p++ == '\n') {
			/* End of line reached without an accepting state. */
			if (vflag) {
				/* Step back onto the newline before output. */
				p--;
				goto succeed;
			}
			if ((ip->ib_cur = p) > last)
				return 0;
			lineno++;
			if (dp->acc[cstat = dp->anybol])
				goto found;
		}
	brk2:;
	}
}
/*
 * Execute the DFA against xp->str and report whether a match exists.
 *
 * Requests that need match offsets (xp->nmatch != 0) are handed to
 * leftmost(); singlebyte searches without REG_NEWLINE are handed to
 * regdfaexec_opt().  Everything else is handled by the general loop
 * below, which decodes one (possibly multibyte) character per step.
 *
 * Returns 0 on a match, REG_NOMATCH when the terminating '\0' has been
 * consumed without reaching an accepting state, and REG_ESPACE when
 * regtrans() returns 0.
 */
LIBUXRE_STATIC int
libuxre_regdfaexec(Dfa *dp, Exec *xp)
{
	const unsigned char *cur;
	int len, next, state, maxbytes;
	w_type ch;

	/* regtrans() looks at this flag. */
	dp->flags = xp->flags & REG_NOTEOL;
	maxbytes = xp->mb_cur_max;

	if (xp->nmatch != 0)
		return leftmost(dp, xp);
	if (maxbytes == 1 && (xp->flags & REG_NEWLINE) == 0)
		return regdfaexec_opt(dp, xp);

	cur = xp->str;
	/* State 1 is the start state used when REG_NOTBOL is set. */
	state = (xp->flags & REG_NOTBOL) ? 1 : dp->anybol;
	if (dp->acc[state] && (xp->flags & REG_NONEMPTY) == 0)
		return 0;	/* an empty match at the start is acceptable */

	for (;;) {
		/* Fetch the next character, decoding multibyte sequences. */
		ch = *cur++;
		if (ch == '\n') {
			if (xp->flags & REG_NEWLINE)
				ch = ROP_EOL;
		} else if (!ISONEBYTE(ch)) {
			len = libuxre_mb2wc(&ch, cur);
			if (len > 0)
				cur += len;
		}

		/*
		 * Only characters below NCHAR can be looked up in the
		 * table; anything else always goes through regtrans().
		 */
		next = 0;
		if ((ch & ~(long)(NCHAR - 1)) == 0)
			next = dp->trans[state][ch];

		if (next == 0) {
			next = regtrans(dp, state, ch, maxbytes);
			if (next == 0)
				return REG_ESPACE;
			if (ch == ROP_EOL) {
				/* Only reachable with REG_NEWLINE. */
				if (dp->acc[next - 1])
					return 0;
				/* Restart at the beginning of the next line. */
				state = dp->anybol;
				if (dp->acc[state])
					return 0;
				continue;
			}
		}

		/* Table values are state index + 1. */
		state = next - 1;
		if (dp->acc[state])
			return 0;
		if (ch == '\0')	/* state == 0 at this point */
			return REG_NOMATCH;
	}
}
/*
 * Fast path for the "any match" search: singlebyte locale and
 * REG_NEWLINE not set, so every input byte is a table index and no
 * end-of-line rewriting is needed.  The gain for grep is about 25%,
 * which justifies the duplicated loop.
 *
 * Returns 0 on a match, REG_NOMATCH once the terminating '\0' has been
 * fed to the DFA without reaching an accepting state, and REG_ESPACE
 * when regtrans() returns 0.
 */
static int
regdfaexec_opt(Dfa *dp, Exec *xp)
{
	const unsigned char *cur = xp->str;
	unsigned char byte;
	int next, state;

	/* State 1 is the start state used when REG_NOTBOL is set. */
	state = (xp->flags & REG_NOTBOL) ? 1 : dp->anybol;
	if (dp->acc[state] && (xp->flags & REG_NONEMPTY) == 0)
		return 0;	/* an empty match at the start is acceptable */

	for (;;) {
		byte = *cur;
		next = dp->trans[state][byte];
		if (next == 0) {
			/* Not in the table yet: have it computed. */
			next = regtrans(dp, state, byte, 1);
			if (next == 0)
				return REG_ESPACE;
		}
		/* Table values are state index + 1. */
		state = next - 1;
		if (dp->acc[state])
			return 0;
		/* The terminator itself is fed to the DFA before stopping. */
		if (byte == '\0')
			break;
		cur++;
	}
	return REG_NOMATCH;
}
/*
 * Locate a match in xp->str and record its extent in xp->match[0].
 *
 * beg is the start of the current match attempt, s the scan position and
 * end the furthest position at which an accepting state has been seen
 * (0 while none has been seen).  When the DFA dies without any accepting
 * position, the attempt is restarted one character after beg.
 *
 * Returns 0 with xp->match[0].rm_so / rm_eo set as offsets from xp->str,
 * REG_NOMATCH when no attempt succeeds, and REG_ESPACE when regtrans()
 * returns 0.
 */
static int
leftmost(Dfa *dp, Exec *xp)
{
	const unsigned char *s, *beg, *end;
	int i, nst, st, mb_cur_max;
	w_type wc;

	mb_cur_max = xp->mb_cur_max;
	beg = s = xp->str;
	end = 0;
	/* Pick the start state depending on whether BOL may match here. */
	st = dp->leftbol;
	if (xp->flags & REG_NOTBOL)
		st = dp->leftmost;
	if (dp->acc[st] && (xp->flags & REG_NONEMPTY) == 0)
		end = s; /* initial empty match allowed */
	for (;;) {
		/* Fetch the next character, decoding multibyte sequences. */
		if ((wc = *s++) == '\n') {
			if (xp->flags & REG_NEWLINE)
				wc = ROP_EOL;
		} else if (!ISONEBYTE(wc) && (i = libuxre_mb2wc(&wc, s)) > 0)
			s += i;
		/*
		 * Characters outside the table range, and table entries
		 * that are still zero, go through regtrans().
		 */
		if ((wc & ~(long)(NCHAR - 1)) != 0 || (nst = dp->trans[st][wc]) == 0) {
			if ((nst=regtrans(dp, st, wc, mb_cur_max)) == 0)
				return REG_ESPACE;
			if (wc == ROP_EOL) /* REG_NEWLINE only */
			{
				if (dp->acc[nst - 1]) {
					/*
					 * NOTE(review): s already points past
					 * the newline here, so the recorded
					 * end appears to include it -- confirm
					 * this is intended.
					 */
					if (end == 0 || end < s)
						end = s;
					break;
				}
				/*
				 * No match ending at this newline: start a
				 * fresh attempt at the beginning of the next
				 * line.
				 * NOTE(review): an already recorded end is
				 * not consulted before beg is moved -- confirm
				 * this cannot leave end < beg.
				 */
				beg = s;
				st = dp->leftbol;
				goto newst;
			}
		}
		if ((st = nst - 1) == 0) /* dead state */
		{
			/* Keep the longest match seen for this attempt. */
			if (end != 0)
				break;
			/*
			 * Nothing matched from beg: advance beg by one
			 * character (multibyte aware) and retry from there.
			 */
			if ((wc = *beg++) == '\0')
				return REG_NOMATCH;
			else if (!ISONEBYTE(wc) && (i = libuxre_mb2wc(&wc, beg)) > 0)
				beg += i;
			s = beg;
			st = dp->leftmost;
			goto newst;
		}
		if (wc == '\0') {
			/* End of string reached with the DFA still alive. */
			if (dp->acc[st]) {
				s--; /* don't include \0 */
				if (end == 0 || end < s)
					end = s;
				break;
			}
			if (end != 0)
				break;
			return REG_NOMATCH;
		}
	newst:;
		/* Remember the furthest accepting position so far. */
		if (dp->acc[st]) {
			if (end == 0 || end < s)
				end = s;
		}
	}
	xp->match[0].rm_so = beg - xp->str;
	xp->match[0].rm_eo = end - xp->str;
	return 0;
}
/*
 * Build the DFA for the parse tree tp and attach it to ep->re_dfa.
 *
 * ep	compiled expression; ep->re_dfa is set and ep->re_flags is
 *	consulted for REG_NOSUB.
 * tp	root of the parse tree; its left subtree is rewritten.
 * lxp	lexer state; only lxp->mb_cur_max is used here.
 *
 * Returns 0 on success and REG_ESPACE on any failure.  Failures after
 * the Dfa has been allocated release it through libuxre_regdeldfa();
 * ep->re_dfa is not reset in that case.
 */
LIBUXRE_STATIC int
libuxre_regdfacomp(regex_t *ep, Tree *tp, Lex *lxp)
{
	Tree *lp;
	Dfa *dp;
	Posn *p;
	int st;

	/*
	* It's convenient to insert a STAR(ALL) subtree to the
	* immediate left of the current tree.  This makes the
	* "any match" libuxre_regdfaexec() not a special case,
	* and the initial state signature will fall out when
	* building the follow sets for all the leaves.
	*/
	if ((lp = libuxre_reg1tree(ROP_ALL, 0)) == 0
		|| (lp = libuxre_reg1tree(ROP_STAR, lp)) == 0
		|| (tp->left.ptr = lp
			= libuxre_reg2tree(ROP_CAT, lp, tp->left.ptr)) == 0)
	{
		return REG_ESPACE;
	}
	lp->parent = tp;
	if ((dp = calloc(1, sizeof(Dfa))) == 0)
		return REG_ESPACE;
	ep->re_dfa = dp;
	/*
	* Just in case null pointers aren't just all bits zero...
	*/
	dp->posfoll = 0;
	dp->sigfoll = 0;
	dp->cursig = 0;
	dp->posn = 0;
	/*
	* Assign position values to each of the tree's leaves
	* (the important parts), meanwhile potentially rewriting
	* the parse tree so that it fits within the restrictions
	* of our DFA.
	*/
	if ((tp = findposn(ep, tp, lxp->mb_cur_max)) == 0)
		goto err;
	/*
	* Get space for the array of positions and current set,
	* now that the number of positions is known.  Both live in
	* one allocation: nposn Posn entries followed by nposn bytes
	* for the position set.
	*/
	if ((dp->posn = malloc(sizeof(Posn) * dp->nposn + dp->nposn)) == 0)
		goto err;
	dp->posset = (unsigned char *)&dp->posn[dp->nposn];
	/*
	* Get follow sets for each position.
	*/
	if (posnfoll(dp, tp) != 0)
		goto err;
	/*
	* Set up the special invariant states:
	* - dead state (no valid transitions); index 0.
	* - initial state for any match [STAR(ALL) follow set]; index 1.
	* - initial state for any match after ROP_BOL.
	* - initial state for left-most longest if REG_NOTBOL.
	* - initial state for left-most longest after ROP_BOL.
	* The final two are not allocated if leftmost() cannot be called.
	* The pairs of initial states are the same if there is no
	* explicit ROP_BOL transition.
	*/
	dp->avail += dp->used;
	dp->used = 0;
	if ((dp->sigfoll = malloc(sizeof(size_t) * dp->avail)) == 0)
		goto err;
	p = &dp->posn[dp->nposn - 1]; /* same as first(root) */
	/*
	* Temporarily point cursig into posfoll so that addstate()
	* creates state 1 from the root's first set (presumably it reads
	* dp->cursig and dp->nset -- confirm against addstate()).  A
	* private cursig buffer is allocated right afterwards.
	*/
	dp->cursig = &dp->posfoll[p->seti];
	dp->nset = p->nset;
	dp->top = 1; /* index 0 is dead state */
	addstate(dp); /* must be state index 1 (returns 2) */
	if ((dp->cursig = malloc(sizeof(size_t) * dp->nposn)) == 0)
		goto err;
	dp->nfix = 2;
	/* Compute the state reached from state 1 on ROP_BOL. */
	if ((st = regtrans(dp, 1, ROP_BOL, lxp->mb_cur_max)) == 0)
		goto err;
	if ((dp->anybol = st - 1) == 2) /* new state */
		dp->nfix = 3;
	if ((ep->re_flags & REG_NOSUB) == 0) /* leftmost() might be called */
	{
		/*
		* leftmost() initial states are the same as the
		* "any match" ones without the STAR(ALL) position.
		*/
		dp->sigi[dp->nfix] = 0;
		dp->nsig[dp->nfix] = dp->nsig[1] - 1;
		dp->acc[dp->nfix] = dp->acc[1];
		dp->leftbol = dp->leftmost = dp->nfix;
		dp->nfix++;
		if (dp->anybol != 1) /* distinct state w/BOL */
		{
			dp->sigi[dp->nfix] = dp->sigi[2];
			dp->nsig[dp->nfix] = dp->nsig[2] - 1;
			dp->acc[dp->nfix] = dp->acc[2];
			dp->leftbol = dp->nfix;
			dp->nfix++;
		}
		dp->top = dp->nfix;
	}
	return 0;
err:;
	libuxre_regdeldfa(dp);
	return REG_ESPACE;
}
/*
 * Range search for multibyte locales using the modified UNIX(R) Regular
 * Expression Library DFA.
 *
 * Scans the input starting at ip->ib_cur one character at a time,
 * driving the DFA taken from e0->e_exp->re_dfa.  Bytes with the high
 * bit set are decoded with mbtowi(); a negative result from mbtowi() is
 * treated as a single byte with the value WEOF.  lineno is incremented
 * once for every line that is started.
 *
 * ip	input block; ip->ib_cur is read as the start position and is
 *	advanced as lines are consumed.
 * last	scanning stops (return 0) as soon as the position of the next
 *	line is greater than this pointer; it also bounds the number of
 *	bytes mbtowi() is allowed to look at.
 *
 * Returns 1 immediately after a line was handed to outline() while qflag
 * or lflag is set; returns 0 once the position has moved past last.
 *
 * The labels found, succeed and fail live inside the inner loop and are
 * entered by goto from outside it; the statement order is significant.
 */
static int
rc_rangew(struct iblok *ip, char *last)
{
	char	*p;
	int	n, cstat, nstat;
	wint_t	wc;
	Dfa	*dp = e0->e_exp->re_dfa;

	p = ip->ib_cur;
	lineno++;
	/* Start in the beginning-of-line state; it may already accept. */
	cstat = dp->anybol;
	if (dp->acc[cstat])
		goto found;
	for (;;) {
		/* Decode the character at p; n is its length in bytes. */
		if (*p & 0200) {
			if ((n = mbtowi(&wc, p, last + 1 - p)) < 0) {
				n = 1;
				wc = WEOF;
			}
		} else {
			wc = *p;
			n = 1;
		}
		/*
		 * Characters outside the table range, and table entries
		 * that are still zero, go through regtrans().
		 */
		if ((wc & ~(wchar_t)(NCHAR-1)) != 0 ||
		    (nstat = dp->trans[cstat][wc]) == 0) {
			/*
			 * '\0' is used to indicate end-of-line. If a '\0'
			 * character appears in input, it matches '$' but
			 * the DFA remains in dead state afterwards; there
			 * is thus no need to handle this condition
			 * specially to get the same behavior as in plain
			 * regexec().
			 */
			if (wc == '\n')
				wc = '\0';
			/* A zero result from regtrans() abandons this line. */
			if ((nstat = regtrans(dp, cstat, wc, mb_cur_max)) == 0)
				goto fail;
			/*
			 * Mirror the '\0' table entry into the '\n' slot so a
			 * later newline in this state can be served from the
			 * table (presumably regtrans() filled the '\0' slot
			 * -- confirm against regtrans()).
			 */
			dp->trans[cstat]['\n'] = dp->trans[cstat]['\0'];
		}
		/* Table values are state index + 1. */
		if (dp->acc[cstat = nstat - 1]) {
		found:
			for (;;) {
				if (vflag == 0) {
		succeed:
					/*
					 * Hand the line to outline(); it is
					 * presumably responsible for moving
					 * ip->ib_cur to the next line --
					 * confirm against outline().
					 */
					outline(ip, last, p - ip->ib_cur);
					if (qflag || lflag)
						return 1;
				} else {
		fail:
					/*
					 * Line is not selected: skip ahead to
					 * just past its terminating newline.
					 */
					ip->ib_cur = p;
					while (*ip->ib_cur++ != '\n');
				}
				if ((p = ip->ib_cur) > last)
					return 0;
				lineno++;
				/*
				 * Begin the next line.  If the start state
				 * does not accept, resume normal scanning;
				 * otherwise handle this line right away.
				 */
				if (dp->acc[cstat = dp->anybol] == 0)
					goto brk2;
			}
		}
		p += n;
		/*
		 * Test the input byte rather than wc, because wc may have
		 * been rewritten from '\n' to '\0' above.
		 */
		if (p[-n] == '\n') {
			/* End of line reached without an accepting state. */
			if (vflag) {
				/* Step back onto the newline before output. */
				p--;
				goto succeed;
			}
			if ((ip->ib_cur = p) > last)
				return 0;
			lineno++;
			if (dp->acc[cstat = dp->anybol])
				goto found;
		}
	brk2:;
	}
}