/* - findmust - fill in must and mlen with longest mandatory literal string * * This algorithm could do fancy things like analyzing the operands of | * for common subsequences. Someday. This code is simple and finds most * of the interesting cases. * * Note that must and mlen got initialized during setup. */ static void findmust(struct parse *p, struct re_guts *g) { sop *scan; sop *start; /* start initialized in the default case, after that */ sop *newstart; /* newstart was initialized in the OCHAR case */ sopno newlen; sop s; char *cp; sopno i; /* avoid making error situations worse */ if (p->error != 0) return; /* find the longest OCHAR sequence in strip */ newlen = 0; scan = g->strip + 1; do { s = *scan++; switch (OP(s)) { case OCHAR: /* sequence member */ if (newlen == 0) /* new sequence */ newstart = scan - 1; newlen++; break; case OPLUS_: /* things that don't break one */ case OLPAREN: case ORPAREN: break; case OQUEST_: /* things that must be skipped */ case OCH_: scan--; do { scan += OPND(s); s = *scan; /* assert() interferes w debug printouts */ if (OP(s) != O_QUEST && OP(s) != O_CH && OP(s) != OOR2) { g->iflags |= REGEX_BAD; return; } } while (OP(s) != O_QUEST && OP(s) != O_CH); /* fallthrough */ default: /* things that break a sequence */ if (newlen > g->mlen) { /* ends one */ start = newstart; g->mlen = newlen; } newlen = 0; break; } } while (OP(s) != OEND); if (g->mlen == 0) /* there isn't one */ return; /* turn it into a character string */ g->must = cli_malloc((size_t)g->mlen + 1); if (g->must == NULL) { /* argh; just forget it */ g->mlen = 0; return; } cp = g->must; scan = start; for (i = g->mlen; i > 0; i--) { while (OP(s = *scan++) != OCHAR) continue; assert(cp < g->must + g->mlen); *cp++ = (char)OPND(s); } assert(cp == g->must + g->mlen); *cp++ = '\0'; /* just on general principles */ }
/* - step - map set of states reachable before char to set reachable after == static states step(struct re_guts *g, sopno start, sopno stop, \ == states bef, int ch, states aft); == #define BOL (OUT-1) == #define EOL (BOL-1) == #define BOLEOL (BOL-2) == #define NOTHING (BOL-3) == #define BOW (BOL-4) == #define EOW (BOL-5) == #define BADCHAR (BOL-6) == #define NONCHAR(c) ((c) <= OUT) */ static states step(struct re_guts *g, sopno start, /* start state within strip */ sopno stop, /* state after stop state within strip */ states bef, /* states reachable before */ wint_t ch, /* character or NONCHAR code */ states aft) /* states already known reachable after */ { cset *cs; sop s; sopno pc; onestate here; /* note, macros know this name */ sopno look; int i; for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) { s = g->strip[pc]; switch (OP(s)) { case OEND: assert(pc == stop-1); break; case OCHAR: /* only characters can match */ assert(!NONCHAR(ch) || ch != OPND(s)); if (ch == OPND(s)) FWD(aft, bef, 1); break; case OBOL: if (ch == BOL || ch == BOLEOL) FWD(aft, bef, 1); break; case OEOL: if (ch == EOL || ch == BOLEOL) FWD(aft, bef, 1); break; case OBOW: if (ch == BOW) FWD(aft, bef, 1); break; case OEOW: if (ch == EOW) FWD(aft, bef, 1); break; case OANY: if (!NONCHAR(ch)) FWD(aft, bef, 1); break; case OANYOF: cs = &g->sets[OPND(s)]; if (!NONCHAR(ch) && CHIN(cs, ch)) FWD(aft, bef, 1); break; case OBACK_: /* ignored here */ case O_BACK: FWD(aft, aft, 1); break; case OPLUS_: /* forward, this is just an empty */ FWD(aft, aft, 1); break; case O_PLUS: /* both forward and back */ FWD(aft, aft, 1); i = ISSETBACK(aft, OPND(s)); BACK(aft, aft, OPND(s)); if (!i && ISSETBACK(aft, OPND(s))) { /* oho, must reconsider loop body */ pc -= OPND(s) + 1; INIT(here, pc); } break; case OQUEST_: /* two branches, both forward */ FWD(aft, aft, 1); FWD(aft, aft, OPND(s)); break; case O_QUEST: /* just an empty */ FWD(aft, aft, 1); break; case OLPAREN: /* not significant here */ case ORPAREN: FWD(aft, aft, 1); break; case OCH_: /* mark the first two branches */ FWD(aft, aft, 1); assert(OP(g->strip[pc+OPND(s)]) == OOR2); FWD(aft, aft, OPND(s)); break; case OOR1: /* done a branch, find the O_CH */ if (ISSTATEIN(aft, here)) { for (look = 1; OP(s = g->strip[pc+look]) != O_CH; look += OPND(s)) assert(OP(s) == OOR2); FWD(aft, aft, look); } break; case OOR2: /* propagate OCH_'s marking */ FWD(aft, aft, 1); if (OP(g->strip[pc+OPND(s)]) != O_CH) { assert(OP(g->strip[pc+OPND(s)]) == OOR2); FWD(aft, aft, OPND(s)); } break; case O_CH: /* just empty */ FWD(aft, aft, 1); break; default: /* ooooops... */ assert(nope); break; } } return(aft); }
/* - dissect - figure out what matched what, no back references == static const char *dissect(struct match *m, const char *start, \ == const char *stop, sopno startst, sopno stopst); */ static const char * /* == stop (success) always */ dissect(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst) { int i; sopno ss; /* start sop of current subRE */ sopno es; /* end sop of current subRE */ const char *sp; /* start of string matched by it */ const char *stp; /* string matched by it cannot pass here */ const char *rest; /* start of rest of string */ const char *tail; /* string unmatched by rest of RE */ sopno ssub; /* start sop of subsubRE */ sopno esub; /* end sop of subsubRE */ const char *ssp; /* start of string matched by subsubRE */ const char *sep; /* end of string matched by subsubRE */ const char *oldssp; /* previous ssp */ const char *dp __unused; /* actually used for assert checks */ AT("diss", start, stop, startst, stopst); sp = start; for (ss = startst; ss < stopst; ss = es) { /* identify end of subRE */ es = ss; switch (OP(m->g->strip[es])) { case OPLUS_: case OQUEST_: es += OPND(m->g->strip[es]); break; case OCH_: while (OP(m->g->strip[es]) != O_CH) es += OPND(m->g->strip[es]); break; } es++; /* figure out what it matched */ switch (OP(m->g->strip[ss])) { case OEND: assert(nope); break; case OCHAR: sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0); break; case OBOL: case OEOL: case OBOW: case OEOW: break; case OANY: case OANYOF: sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0); break; case OBACK_: case O_BACK: assert(nope); break; /* cases where length of match is hard to find */ case OQUEST_: stp = stop; for (;;) { /* how long could this one be? */ rest = slow(m, sp, stp, ss, es); assert(rest != NULL); /* it did match */ /* could the rest match the rest? */ tail = slow(m, rest, stop, es, stopst); if (tail == stop) break; /* yes! */ /* no -- try a shorter match for this one */ stp = rest - 1; assert(stp >= sp); /* it did work */ } ssub = ss + 1; esub = es - 1; /* did innards match? */ if (slow(m, sp, rest, ssub, esub) != NULL) { dp = dissect(m, sp, rest, ssub, esub); assert(dp == rest); } else /* no */ assert(sp == rest); sp = rest; break; case OPLUS_: stp = stop; for (;;) { /* how long could this one be? */ rest = slow(m, sp, stp, ss, es); assert(rest != NULL); /* it did match */ /* could the rest match the rest? */ tail = slow(m, rest, stop, es, stopst); if (tail == stop) break; /* yes! */ /* no -- try a shorter match for this one */ stp = rest - 1; assert(stp >= sp); /* it did work */ } ssub = ss + 1; esub = es - 1; ssp = sp; oldssp = ssp; for (;;) { /* find last match of innards */ sep = slow(m, ssp, rest, ssub, esub); if (sep == NULL || sep == ssp) break; /* failed or matched null */ oldssp = ssp; /* on to next try */ ssp = sep; } if (sep == NULL) { /* last successful match */ sep = ssp; ssp = oldssp; } assert(sep == rest); /* must exhaust substring */ assert(slow(m, ssp, sep, ssub, esub) == rest); dp = dissect(m, ssp, sep, ssub, esub); assert(dp == sep); sp = rest; break; case OCH_: stp = stop; for (;;) { /* how long could this one be? */ rest = slow(m, sp, stp, ss, es); assert(rest != NULL); /* it did match */ /* could the rest match the rest? */ tail = slow(m, rest, stop, es, stopst); if (tail == stop) break; /* yes! */ /* no -- try a shorter match for this one */ stp = rest - 1; assert(stp >= sp); /* it did work */ } ssub = ss + 1; esub = ss + OPND(m->g->strip[ss]) - 1; assert(OP(m->g->strip[esub]) == OOR1); for (;;) { /* find first matching branch */ if (slow(m, sp, rest, ssub, esub) == rest) break; /* it matched all of it */ /* that one missed, try next one */ assert(OP(m->g->strip[esub]) == OOR1); esub++; assert(OP(m->g->strip[esub]) == OOR2); ssub = esub + 1; esub += OPND(m->g->strip[esub]); if (OP(m->g->strip[esub]) == OOR2) esub--; else assert(OP(m->g->strip[esub]) == O_CH); } dp = dissect(m, sp, rest, ssub, esub); assert(dp == rest); sp = rest; break; case O_PLUS: case O_QUEST: case OOR1: case OOR2: case O_CH: assert(nope); break; case OLPAREN: i = OPND(m->g->strip[ss]); assert(0 < i && i <= m->g->nsub); m->pmatch[i].rm_so = sp - m->offp; break; case ORPAREN: i = OPND(m->g->strip[ss]); assert(0 < i && i <= m->g->nsub); m->pmatch[i].rm_eo = sp - m->offp; break; default: /* uh oh */ assert(nope); break; } } assert(sp == stop); return(sp); }
/* - backref - figure out what matched what, figuring in back references == static const char *backref(struct match *m, const char *start, \ == const char *stop, sopno startst, sopno stopst, sopno lev); */ static const char * /* == stop (success) or NULL (failure) */ backref(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst, sopno lev, /* PLUS nesting level */ int rec) { int i; sopno ss; /* start sop of current subRE */ const char *sp; /* start of string matched by it */ sopno ssub; /* start sop of subsubRE */ sopno esub; /* end sop of subsubRE */ const char *ssp; /* start of string matched by subsubRE */ const char *dp; size_t len; int hard; sop s; regoff_t offsave; cset *cs; wint_t wc; AT("back", start, stop, startst, stopst); sp = start; /* get as far as we can with easy stuff */ hard = 0; for (ss = startst; !hard && ss < stopst; ss++) switch (OP(s = m->g->strip[ss])) { case OCHAR: if (sp == stop) return(NULL); sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); if (wc != OPND(s)) return(NULL); break; case OANY: if (sp == stop) return(NULL); sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); if (wc == BADCHAR) return (NULL); break; case OANYOF: if (sp == stop) return (NULL); cs = &m->g->sets[OPND(s)]; sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); if (wc == BADCHAR || !CHIN(cs, wc)) return(NULL); break; case OBOL: if ( (sp == m->beginp && !(m->eflags®_NOTBOL)) || (sp < m->endp && *(sp-1) == '\n' && (m->g->cflags®_NEWLINE)) ) { /* yes */ } else return(NULL); break; case OEOL: if ( (sp == m->endp && !(m->eflags®_NOTEOL)) || (sp < m->endp && *sp == '\n' && (m->g->cflags®_NEWLINE)) ) { /* yes */ } else return(NULL); break; case OBOW: if (( (sp == m->beginp && !(m->eflags®_NOTBOL)) || (sp < m->endp && *(sp-1) == '\n' && (m->g->cflags®_NEWLINE)) || (sp > m->beginp && !ISWORD(*(sp-1))) ) && (sp < m->endp && ISWORD(*sp)) ) { /* yes */ } else return(NULL); break; case OEOW: if (( (sp == m->endp && !(m->eflags®_NOTEOL)) || (sp < m->endp && *sp == '\n' && (m->g->cflags®_NEWLINE)) || (sp < m->endp && !ISWORD(*sp)) ) && (sp > m->beginp && ISWORD(*(sp-1))) ) { /* yes */ } else return(NULL); break; case O_QUEST: break; case OOR1: /* matches null but needs to skip */ ss++; s = m->g->strip[ss]; do { assert(OP(s) == OOR2); ss += OPND(s); } while (OP(s = m->g->strip[ss]) != O_CH); /* note that the ss++ gets us past the O_CH */ break; default: /* have to make a choice */ hard = 1; break; } if (!hard) { /* that was it! */ if (sp != stop) return(NULL); return(sp); } ss--; /* adjust for the for's final increment */ /* the hard stuff */ AT("hard", sp, stop, ss, stopst); s = m->g->strip[ss]; switch (OP(s)) { case OBACK_: /* the vilest depths */ i = OPND(s); assert(0 < i && i <= m->g->nsub); if (m->pmatch[i].rm_eo == -1) return(NULL); assert(m->pmatch[i].rm_so != -1); len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so; if (len == 0 && rec++ > MAX_RECURSION) return(NULL); assert(stop - m->beginp >= len); if (sp > stop - len) return(NULL); /* not enough left to match */ ssp = m->offp + m->pmatch[i].rm_so; if (memcmp(sp, ssp, len) != 0) return(NULL); while (m->g->strip[ss] != SOP(O_BACK, i)) ss++; return(backref(m, sp+len, stop, ss+1, stopst, lev, rec)); break; case OQUEST_: /* to null or not */ dp = backref(m, sp, stop, ss+1, stopst, lev, rec); if (dp != NULL) return(dp); /* not */ return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev, rec)); break; case OPLUS_: assert(m->lastpos != NULL); assert(lev+1 <= m->g->nplus); m->lastpos[lev+1] = sp; return(backref(m, sp, stop, ss+1, stopst, lev+1, rec)); break; case O_PLUS: if (sp == m->lastpos[lev]) /* last pass matched null */ return(backref(m, sp, stop, ss+1, stopst, lev-1, rec)); /* try another pass */ m->lastpos[lev] = sp; dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev, rec); if (dp == NULL) return(backref(m, sp, stop, ss+1, stopst, lev-1, rec)); else return(dp); break; case OCH_: /* find the right one, if any */ ssub = ss + 1; esub = ss + OPND(s) - 1; assert(OP(m->g->strip[esub]) == OOR1); for (;;) { /* find first matching branch */ dp = backref(m, sp, stop, ssub, esub, lev, rec); if (dp != NULL) return(dp); /* that one missed, try next one */ if (OP(m->g->strip[esub]) == O_CH) return(NULL); /* there is none */ esub++; assert(OP(m->g->strip[esub]) == OOR2); ssub = esub + 1; esub += OPND(m->g->strip[esub]); if (OP(m->g->strip[esub]) == OOR2) esub--; else assert(OP(m->g->strip[esub]) == O_CH); } break; case OLPAREN: /* must undo assignment if rest fails */ i = OPND(s); assert(0 < i && i <= m->g->nsub); offsave = m->pmatch[i].rm_so; m->pmatch[i].rm_so = sp - m->offp; dp = backref(m, sp, stop, ss+1, stopst, lev, rec); if (dp != NULL) return(dp); m->pmatch[i].rm_so = offsave; return(NULL); break; case ORPAREN: /* must undo assignment if rest fails */ i = OPND(s); assert(0 < i && i <= m->g->nsub); offsave = m->pmatch[i].rm_eo; m->pmatch[i].rm_eo = sp - m->offp; dp = backref(m, sp, stop, ss+1, stopst, lev, rec); if (dp != NULL) return(dp); m->pmatch[i].rm_eo = offsave; return(NULL); break; default: /* uh oh */ assert(nope); break; } /* "can't happen" */ assert(nope); /* NOTREACHED */ return "shut up gcc"; }
/* * s_print - print the strip for debugging */ static void s_print(struct re_guts *g, FILE *d) { sop *s; cset *cs; int done = 0; sop opnd; int col = 0; ssize_t last; sopno offset = 2; # define GAP() { if (offset % 5 == 0) { \ if (col > 40) { \ fprintf(d, "\n\t"); \ col = 0; \ } else { \ fprintf(d, " "); \ col++; \ } \ } else \ col++; \ offset++; \ } if (OP(g->strip[0]) != OEND) fprintf(d, "missing initial OEND!\n"); for (s = &g->strip[1]; !done; s++) { opnd = OPND(*s); switch (OP(*s)) { case OEND: fprintf(d, "\n"); done = 1; break; case OCHAR: if (strchr("\\|()^$.[+*?{}!<> ", (char)opnd) != NULL) fprintf(d, "\\%c", (char)opnd); else fprintf(d, "%s", regchar((char)opnd)); break; case OBOL: fprintf(d, "^"); break; case OEOL: fprintf(d, "$"); break; case OBOW: fprintf(d, "\\{"); break; case OEOW: fprintf(d, "\\}"); break; case OANY: fprintf(d, "."); break; case OANYOF: fprintf(d, "[(%ld)", (long)opnd); #ifdef __NetBSD__ cs = &g->sets[opnd]; last = -1; for (size_t i = 0; i < g->csetsize+1; i++) /* +1 flushes */ if (CHIN(cs, i) && i < g->csetsize) { if (last < 0) { fprintf(d, "%s", regchar(i)); last = i; } } else { if (last >= 0) { if (last != (ssize_t)i - 1) fprintf(d, "-%s", regchar(i - 1)); last = -1; } } #endif fprintf(d, "]"); break; case OBACK_: fprintf(d, "(\\<%ld>", (long)opnd); break; case O_BACK: fprintf(d, "<%ld>\\)", (long)opnd); break; case OPLUS_: fprintf(d, "(+"); if (OP(*(s+opnd)) != O_PLUS) fprintf(d, "<%ld>", (long)opnd); break; case O_PLUS: if (OP(*(s-opnd)) != OPLUS_) fprintf(d, "<%ld>", (long)opnd); fprintf(d, "+)"); break; case OQUEST_: fprintf(d, "(?"); if (OP(*(s+opnd)) != O_QUEST) fprintf(d, "<%ld>", (long)opnd); break; case O_QUEST: if (OP(*(s-opnd)) != OQUEST_) fprintf(d, "<%ld>", (long)opnd); fprintf(d, "?)"); break; case OLPAREN: fprintf(d, "((<%ld>", (long)opnd); break; case ORPAREN: fprintf(d, "<%ld>))", (long)opnd); break; case OCH_: fprintf(d, "<"); if (OP(*(s+opnd)) != OOR2) fprintf(d, "<%ld>", (long)opnd); break; case OOR1: if (OP(*(s-opnd)) != OOR1 && OP(*(s-opnd)) != OCH_) fprintf(d, "<%ld>", (long)opnd); fprintf(d, "|"); break; case OOR2: fprintf(d, "|"); if (OP(*(s+opnd)) != OOR2 && OP(*(s+opnd)) != O_CH) fprintf(d, "<%ld>", (long)opnd); break; case O_CH: if (OP(*(s-opnd)) != OOR1) fprintf(d, "<%ld>", (long)opnd); fprintf(d, ">"); break; default: #ifdef __FreeBSD__ fprintf(d, "!%ld(%ld)!", OP(*s), opnd); #else fprintf(d, "!%d(%d)!", OP(*s), opnd); #endif break; } if (!done) GAP(); } }