Esempio n. 1
0
/*
 * miss - compute (and cache, when possible) the successor of a stateset
 *
 * Called when the transition out of stateset css on input color co is not
 * known to be cached; it is harmless to call it when the transition is in
 * fact cached.  cp points just past the character being consumed, which is
 * the position at which LACON constraints get evaluated.  start has no
 * influence on the match itself; it is only handed on to getvacant (for
 * pickss' heuristics about which cache entry to recycle).
 *
 * The normal result is a pointer to the stateset that is valid once the
 * character has been consumed.  A NULL result means that no NFA state
 * survives, so the match attempt has certainly failed.  NULL is also the
 * result after an internal error, in which case v->err has been set.
 */
static struct sset *
miss(struct vars * v,
	 struct dfa * d,
	 struct sset * css,
	 pcolor co,
	 chr *cp,					/* next chr */
	 chr *start)				/* where the attempt got started */
{
	struct cnfa *cnfa = d->cnfa;
	struct sset *p;				/* resulting stateset */
	struct sset *cand;			/* cache entry being compared */
	struct carc *arc;
	unsigned	hashval;
	int			st;				/* NFA state number */
	int			w;				/* bitmap word index */
	int			n;				/* cache slot index */
	int			reachespost = 0;	/* new set contains the post state? */
	int			allnoprogress = 1;	/* every new state is no-progress? */
	int			anyarc = 0;		/* did any plain arc match co? */
	int			laconused = 0;	/* did a LACON influence the result? */
	int			rescan;			/* need another LACON closure pass? */
	int			passed;			/* result of one LACON test */

	/* callers need not pre-check: a transition that is cached is just a hit */
	if (css->outs[co] != NULL)
	{
		FDEBUG(("hit\n"));
		return css->outs[co];
	}
	FDEBUG(("miss\n"));

	/*
	 * Polling for cancel in the innermost scanning loop would cost too much,
	 * so the check is made here, once per cache miss, instead.
	 */
	if (CANCEL_REQUESTED(v->re))
	{
		ERR(REG_CANCEL);
		return NULL;
	}

	/*
	 * Build the successor stateset bitmap in d->work.  Step one: follow
	 * every plain arc of color co out of every state in css.  Step two
	 * (further down): add whatever can then be reached through LACON arcs.
	 */
	for (w = 0; w < d->wordsper; w++)
	{
		d->work[w] = 0;
	}

	for (st = 0; st < d->nstates; st++)
	{
		if (!ISBSET(css->states, st))
			continue;			/* state is not in the current set */

		for (arc = cnfa->states[st]; arc->co != COLORLESS; arc++)
		{
			if (arc->co != co)
				continue;		/* arc does not consume this color */

			BSET(d->work, arc->to);
			anyarc = 1;
			if (arc->to == cnfa->post)
				reachespost = 1;
			if ((cnfa->stflags[arc->to] & CNFA_NOPROGRESS) == 0)
				allnoprogress = 0;
			FDEBUG(("%d -> %d\n", st, arc->to));
		}
	}

	/* nothing consumed the character, so no state can follow */
	if (!anyarc)
		return NULL;

	/*
	 * LACON closure: keep making passes over the new set until a pass adds
	 * no further state.  Skipped entirely when the NFA has no LACONs.
	 */
	rescan = (cnfa->flags & HASLACONS);
	while (rescan)
	{
		rescan = 0;
		for (st = 0; st < d->nstates; st++)
		{
			if (!ISBSET(d->work, st))
				continue;

			for (arc = cnfa->states[st]; arc->co != COLORLESS; arc++)
			{
				if (arc->co < cnfa->ncolors)
					continue;	/* ordinary color, not a LACON arc */
				if (ISBSET(d->work, arc->to))
					continue;	/* target is already in the set */

				/* from here on the outcome depends on a LACON test */
				laconused = 1;

				passed = lacon(v, cnfa, cp, arc->co);
				if (ISERR())
					return NULL;	/* failure inside the LACON check */
				if (!passed)
					continue;	/* constraint not satisfied here */

				BSET(d->work, arc->to);
				rescan = 1;
				if (arc->to == cnfa->post)
					reachespost = 1;
				if ((cnfa->stflags[arc->to] & CNFA_NOPROGRESS) == 0)
					allnoprogress = 0;
				FDEBUG(("%d :> %d\n", st, arc->to));
			}
		}
	}

	hashval = HASH(d->work, d->wordsper);

	/* Look for an existing cache entry holding exactly this stateset */
	p = NULL;
	for (n = 0; n < d->nssused; n++)
	{
		cand = &d->ssets[n];
		if (HIT(hashval, d->work, cand, d->wordsper))
		{
			FDEBUG(("cached c%d\n", (int) (cand - d->ssets)));
			p = cand;
			break;
		}
	}

	if (p == NULL)
	{
		/* not cached: obtain an entry and fill it from d->work */
		p = getvacant(v, d, cp, start);
		if (p == NULL)
			return NULL;
		assert(p != css);
		for (w = 0; w < d->wordsper; w++)
		{
			p->states[w] = d->work[w];
		}
		p->hash = hashval;
		p->flags = reachespost ? POSTSTATE : 0;
		if (allnoprogress)
			p->flags |= NOPROGRESS;
		/* p->lastseen is left for the caller to set */
	}

	/*
	 * When a LACON took part in computing the result, the css/co transition
	 * must not be recorded: the same transition taken at another position
	 * could give a different LACON outcome, so it has to come back through
	 * miss() for a fresh check each time.
	 */
	if (laconused)
		return p;

	/* otherwise record the transition and push it onto p's in-arc chain */
	FDEBUG(("c%d[%d]->c%d\n",
			(int) (css - d->ssets), co, (int) (p - d->ssets)));
	css->outs[co] = p;
	css->inchain[co] = p->ins;
	p->ins.ss = css;
	p->ins.co = (color) co;

	return p;
}
Esempio n. 2
0
/*
 * cdissect - verify backrefs and work out subexpression match boundaries
 *
 * Walks the subre tree rooted at t, dispatching on the node's op to the
 * routine that knows how to split the proposed match among that node's
 * children.  The proposed match is the substring starting at "begin" and
 * stopping just before "end".
 *
 * Preconditions, both the caller's responsibility:
 *	- the node's DFA has already been run and accepts the substring (for
 *	  concatenation and iteration nodes it is much cheaper to test all the
 *	  candidate substrings against the child DFAs before recursing);
 *	- subexpression match data has been cleared with zaptreesubs (or with
 *	  zapallsubs at the top level).
 *
 * The result is a regexec return code.
 */
static int						/* regexec return code */
cdissect(struct vars * v,
		 struct subre * t,
		 chr *begin,			/* beginning of relevant substring */
		 chr *end)				/* end of same */
{
	int			er;

	assert(t != NULL);
	MDEBUG(("cdissect %ld-%ld %c\n", LOFF(begin), LOFF(end), t->op));

	/* every level of the recursion passes through here, so poll for cancel */
	if (CANCEL_REQUESTED(v->re))
		return REG_CANCEL;

	switch (t->op)
	{
		case '(':
			/* capture group: dissect the contents, then record the span */
			assert(t->left != NULL && t->right == NULL);
			assert(t->subno > 0);
			er = cdissect(v, t->left, begin, end);
			if (er == REG_OKAY)
				subset(v, t, begin, end);
			break;

		case '.':
			/* concatenation: scan in reverse if the left child is SHORTER */
			assert(t->left != NULL && t->right != NULL);
			er = (t->left->flags & SHORTER)
				? crevcondissect(v, t, begin, end)
				: ccondissect(v, t, begin, end);
			break;

		case '*':
			/* iteration: scan in reverse if the child is SHORTER */
			assert(t->left != NULL);
			er = (t->left->flags & SHORTER)
				? creviterdissect(v, t, begin, end)
				: citerdissect(v, t, begin, end);
			break;

		case '|':
			/* alternation */
			assert(t->left != NULL);
			er = caltdissect(v, t, begin, end);
			break;

		case 'b':
			/* back reference */
			assert(t->left == NULL && t->right == NULL);
			er = cbrdissect(v, t, begin, end);
			break;

		case '=':
			/* terminal node: the parent has already done everything */
			assert(t->left == NULL && t->right == NULL);
			er = REG_OKAY;
			break;

		default:
			/* unrecognized node type */
			er = REG_ASSERT;
			break;
	}

	/*
	 * REG_NOMATCH is only legitimate when a backref sits somewhere beneath
	 * this node.  Without one, either the caller skipped the DFA check or
	 * the DFA disagrees with the node's contents.
	 */
	assert(er != REG_NOMATCH || (t->flags & BACKR));

	return er;
}