/*
 * miss - work out the successor stateset when the cache has no link for it
 *
 * css is the stateset we are in, and co is the color of the input character
 * being consumed.  cp addresses the character following that one, which is
 * the position at which LACONs (if any) have to be evaluated.  start plays
 * no part in matching; it is merely handed on to getvacant(), whose
 * replacement heuristics (pickss) want to know where the attempt began.
 *
 * Normally the result is the address of the stateset that is valid once the
 * character has been consumed.  NULL is returned when no NFA state survives,
 * which means the match has certainly failed.  NULL is likewise returned for
 * internal errors, in which case v->err has been set.
 */
static struct sset *
miss(struct vars * v,
	 struct dfa * d,
	 struct sset * css,
	 pcolor co,
	 chr *cp,					/* next chr */
	 chr *start)				/* where the attempt got started */
{
	struct cnfa *cnfa = d->cnfa;
	struct carc *arc;
	struct sset *p;
	unsigned	hashval;
	int			word;
	int			st;
	int			left;
	int			laconok;
	int			rescan;
	int			ispost = 0;
	int			noprogress = 1;
	int			gotstate = 0;
	int			sawlacons = 0;

	/* Callers may ask without checking first; an existing link is a hit. */
	if (css->outs[co] != NULL)
	{
		FDEBUG(("hit\n"));
		return css->outs[co];
	}
	FDEBUG(("miss\n"));

	/*
	 * Polling for cancellation inside the inner text-search loop would cost
	 * too much, so the poll is done here, once per cache miss, instead.
	 */
	if (CANCEL_REQUESTED(v->re))
	{
		ERR(REG_CANCEL);
		return NULL;
	}

	/* The new stateset bitmap is assembled in d->work; start it out empty. */
	for (word = 0; word < d->wordsper; word++)
	{
		d->work[word] = 0;
	}

	/*
	 * Phase 1: follow every PLAIN arc of color co that leaves a state
	 * belonging to css.
	 */
	for (st = 0; st < d->nstates; st++)
	{
		if (!ISBSET(css->states, st))
		{
			continue;			/* state is not part of css */
		}
		for (arc = cnfa->states[st]; arc->co != COLORLESS; arc++)
		{
			if (arc->co != co)
			{
				continue;		/* arc does not consume this color */
			}
			BSET(d->work, arc->to);
			gotstate = 1;
			if (arc->to == cnfa->post)
			{
				ispost = 1;
			}
			if (!(cnfa->stflags[arc->to] & CNFA_NOPROGRESS))
			{
				noprogress = 0;
			}
			FDEBUG(("%d -> %d\n", st, arc->to));
		}
	}
	if (!gotstate)
	{
		return NULL;			/* character cannot reach any new state */
	}

	/*
	 * Phase 2: add whatever can be reached through LACON arcs from the
	 * states collected so far.  Each pass may add states that expose further
	 * LACON arcs, so keep going until a pass adds nothing (transitive
	 * closure).
	 */
	if (cnfa->flags & HASLACONS)
	{
		do
		{
			rescan = 0;
			for (st = 0; st < d->nstates; st++)
			{
				if (!ISBSET(d->work, st))
				{
					continue;
				}
				for (arc = cnfa->states[st]; arc->co != COLORLESS; arc++)
				{
					if (arc->co < cnfa->ncolors)
					{
						continue;	/* not a LACON arc */
					}
					if (ISBSET(d->work, arc->to))
					{
						continue;	/* target already present; no-op arc */
					}

					/* From here on the result depends on a LACON. */
					sawlacons = 1;
					laconok = lacon(v, cnfa, cp, arc->co);
					if (ISERR())
					{
						return NULL;
					}
					if (!laconok)
					{
						continue;	/* LACON arc cannot be traversed */
					}

					BSET(d->work, arc->to);
					rescan = 1;
					if (arc->to == cnfa->post)
					{
						ispost = 1;
					}
					if (!(cnfa->stflags[arc->to] & CNFA_NOPROGRESS))
					{
						noprogress = 0;
					}
					FDEBUG(("%d :> %d\n", st, arc->to));
				}
			}
		} while (rescan);
	}

	hashval = HASH(d->work, d->wordsper);

	/* Probe the cache for a stateset with this exact bitmap. */
	p = d->ssets;
	left = d->nssused;
	while (left > 0)
	{
		if (HIT(hashval, d->work, p, d->wordsper))
		{
			FDEBUG(("cached c%d\n", (int) (p - d->ssets)));
			break;				/* found it; "left" stays nonzero */
		}
		p++;
		left--;
	}

	if (left == 0)
	{
		/* Every used entry was examined without a hit: make a new one. */
		p = getvacant(v, d, cp, start);
		if (p == NULL)
		{
			return NULL;
		}
		assert(p != css);
		for (word = 0; word < d->wordsper; word++)
		{
			p->states[word] = d->work[word];
		}
		p->hash = hashval;
		p->flags = 0;
		if (ispost)
		{
			p->flags |= POSTSTATE;
		}
		if (noprogress)
		{
			p->flags |= NOPROGRESS;
		}
		/* lastseen is left for the caller to fill in */
	}

	/*
	 * If a LACON influenced the outcome, deliberately leave css unlinked.
	 * The next transition from this stateset on this color will then land in
	 * miss() again and re-evaluate the LACON(s), whose verdict can differ
	 * because the surrounding context will be different.
	 */
	if (sawlacons)
	{
		return p;
	}

	/* No LACON involved: record the transition so later lookups are hits. */
	FDEBUG(("c%d[%d]->c%d\n",
			(int) (css - d->ssets), co, (int) (p - d->ssets)));
	css->outs[co] = p;
	css->inchain[co] = p->ins;
	p->ins.ss = css;
	p->ins.co = (color) co;
	return p;
}
/*
 * cdissect - verify backrefs and pin down subexpression match boundaries
 *
 * Walks a subre tree recursively, checking that backrefs match and/or
 * recording where the submatches of capture nodes lie.  The candidate match
 * occupies "begin" up to, but not including, "end"; the job here is to take
 * that span apart and find out which piece belongs to which subexpression.
 *
 * Precondition: for whatever node this is called on, the caller has already
 * run that node's DFA over the span and found it acceptable.  (That duty is
 * placed on the caller because, for concatenation and iteration nodes, it is
 * much cheaper to test all candidate substrings against the child DFAs
 * before recursing.)  The caller must also have reset the subexpression
 * match data with zaptreesubs, or zapallsubs at the top level.
 */
static int						/* regexec return code */
cdissect(struct vars * v,
		 struct subre * t,
		 chr *begin,			/* beginning of relevant substring */
		 chr *end)				/* end of same */
{
	int			er;

	assert(t != NULL);
	MDEBUG(("cdissect %ld-%ld %c\n", LOFF(begin), LOFF(end), t->op));

	/* Reached once per node visited, so a convenient spot to poll cancel. */
	if (CANCEL_REQUESTED(v->re))
	{
		return REG_CANCEL;
	}

	switch (t->op)
	{
		case '=':
			/* Terminal node: the parent has already done all the work. */
			assert(t->left == NULL && t->right == NULL);
			er = REG_OKAY;
			break;

		case 'b':
			/* Back reference. */
			assert(t->left == NULL && t->right == NULL);
			er = cbrdissect(v, t, begin, end);
			break;

		case '.':
			/* Concatenation; a SHORTER left child calls for a reverse scan. */
			assert(t->left != NULL && t->right != NULL);
			er = (t->left->flags & SHORTER)
				? crevcondissect(v, t, begin, end)
				: ccondissect(v, t, begin, end);
			break;

		case '|':
			/* Alternation. */
			assert(t->left != NULL);
			er = caltdissect(v, t, begin, end);
			break;

		case '*':
			/* Iteration; a SHORTER child calls for a reverse scan. */
			assert(t->left != NULL);
			er = (t->left->flags & SHORTER)
				? creviterdissect(v, t, begin, end)
				: citerdissect(v, t, begin, end);
			break;

		case '(':
			/* Capturing parens: dissect the child, then record the span. */
			assert(t->left != NULL && t->right == NULL);
			assert(t->subno > 0);
			er = cdissect(v, t->left, begin, end);
			if (er != REG_OKAY)
			{
				break;			/* child failed; record nothing */
			}
			subset(v, t, begin, end);
			break;

		default:
			/* Unrecognized node type. */
			er = REG_ASSERT;
			break;
	}

	/*
	 * A match failure is only legitimate when a backref sits somewhere
	 * below this node.  Otherwise either the caller skipped the DFA check or
	 * the DFA and the node's contents disagree with each other.
	 */
	assert(er != REG_NOMATCH || (t->flags & BACKR));

	return er;
}