/* - regtail - set the next-pointer at the end of a node chain */ static void regtail( char *p, char *val ) { register char *scan; register char *temp; register int offset; if (p == ®dummy) return; /* Find last node. */ scan = p; for (;;) { temp = regnext(scan); if (temp == NULL) break; scan = temp; } if (OP(scan) == BACK) offset = scan - val; else offset = val - scan; *(scan+1) = (offset>>8)&0377; *(scan+2) = offset&0377; }
/*
 * regmatchsimplerepeat - match a simple repeat node followed by the rest
 * of the program.
 *
 * The maximum and minimum repeat counts are read from program[scan + 2]
 * and program[scan + 3].  regrepeat() reports how many repetitions of the
 * operand at scan + 5 are available; each candidate count is then tried
 * against the continuation node:
 *   - matchmin != 0: counts go upward from min to the available count;
 *   - matchmin == 0: counts go downward from the available count to min.
 *
 * Returns 1 as soon as regmatch() accepts the continuation, 0 otherwise.
 */
static int regmatchsimplerepeat(regex_t *preg, int scan, int matchmin)
{
	int upper = preg->program[scan + 2];
	const int lower = preg->program[scan + 3];
	const int next = regnext(preg, scan);
	const int step = matchmin ? 1 : -1;
	int lookahead = '\0';
	const char *start;
	int count;
	int ch;

	/*
	 * When the continuation begins with a literal, remember its first
	 * character so hopeless positions can be skipped cheaply.
	 */
	if (OP(preg, next) == EXACTLY) {
		lookahead = preg->program[OPERAND(next)];
	}

	start = preg->reginput;
	count = regrepeat(preg, scan + 5, upper);
	if (count < lower) {
		return 0;
	}

	if (matchmin) {
		/* Minimal match: walk from the minimum up to what is available. */
		upper = count;
		count = lower;
	}

	for (;; count += step) {
		if (matchmin ? (count > upper) : (count < lower)) {
			break;
		}

		preg->reginput = start + utf8_index(start, count);
		reg_utf8_tounicode_case(preg->reginput, &ch, (preg->cflags & REG_ICASE));

		/* Only attempt the continuation where it could possibly start. */
		if ((reg_iseol(preg, lookahead) || ch == lookahead) && regmatch(preg, next)) {
			return 1;
		}
	}

	return 0;
}
/*
 * regmatchrepeat - match a counted repeat node whose iteration counter is
 * kept inside the program itself.
 *
 * Relative to scan: slot 2 holds the maximum count, slot 3 the minimum,
 * slot 4 the running counter, and the repeated sub-program starts at
 * scan + 5.  The counter is incremented before each attempt at the body
 * and decremented again if that attempt fails.
 *
 * matchmin != 0 tries the continuation before another iteration;
 * matchmin == 0 tries another iteration first.
 */
static int regmatchrepeat(regex_t *preg, int scan, int matchmin)
{
	int *node = preg->program + scan;
	const int max = node[2];
	const int min = node[3];
	int *count = &node[4];
	const int body = scan + 5;

	if (*count < min) {
		/* Minimum not reached yet: another iteration is mandatory. */
		++*count;
		if (regmatch(preg, body)) {
			return 1;
		}
		--*count;
		return 0;
	}

	if (*count > max) {
		return 0;
	}

	if (!matchmin) {
		/* Maximal: prefer one more iteration while below max. */
		if (*count < max) {
			++*count;
			if (regmatch(preg, body)) {
				return 1;
			}
			--*count;
		}
		/* No further iteration worked (or allowed): try the continuation. */
		return regmatch(preg, regnext(preg, scan));
	}

	/* Minimal: prefer the continuation, fall back to one more iteration. */
	if (regmatch(preg, regnext(preg, scan))) {
		return 1;
	}
	++*count;
	if (regmatch(preg, body)) {
		return 1;
	}
	--*count;
	return 0;
}
// regtail - set the next-pointer at the end of a node chain.
//
// Does nothing unless bEmitCode is set.  Otherwise follows regnext() from p
// to the last node of the chain and writes the offset to val into the short
// located one TCHAR after that node.  For a BACK node the offset is
// (node - val); for any other node it is (val - node).
void CRegExp::regtail(TCHAR *p, TCHAR *val)
{
	if (!bEmitCode)
		return;

	// Walk forward until regnext() reports no successor.
	TCHAR *last = p;
	for (;;)
	{
		TCHAR *following = regnext(last);
		if (following == NULL)
			break;
		last = following;
	}

	short *slot = (short *)(last + 1);
	if (OP(last) == BACK)
		*slot = last - val;
	else
		*slot = val - last;
}
/*
 - regdump - dump a SRE onto stdout in vaguely comprehensible form
 *
 * Prints one line per node, starting at r->program + 1 and stopping after
 * the END node: the node's offset within the program, its description from
 * regprop(), the offset of its successor in parentheses ("(0)" when
 * regnext() returns NULL) and, for ANYOF / ANYBUT / EXACTLY nodes, the
 * NUL-terminated literal operand.  A final line lists the header fields
 * regstart, reganch and regmust when they are set.
 */
void
regdump(SRE *r)
{
    register char *s;
    register char op = EXACTLY;     /* Arbitrary non-END op. */
    register char *next;

    s = r->program + 1;
    while (op != END) {     /* While that wasn't END last time... */
        op = OP(s);
        /*
         * Pointer differences have type ptrdiff_t; convert explicitly so
         * the argument matches the %ld conversion.
         */
        printf("%2ld%s", (long)(s - r->program), regprop(s));   /* Where, what. */
        next = regnext(s);
        if (next == NULL) {         /* Next ptr. */
            printf("(0)");
        } else {
            printf("(%ld)", (long)((s - r->program) + (next - s)));
        }
        s += 3;     /* Skip the opcode and the two next-pointer bytes. */
        if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
            /* Literal string, where present. */
            while (*s != '\0') {
                putchar(*s);
                s++;
            }
            s++;
        }
        putchar('\n');
    }

    /* Header fields of interest. */
    if (r->regstart != '\0') {
        printf("start `%c' ", r->regstart);
    }
    if (r->reganch) {
        printf("anchored ");
    }
    if (r->regmust != NULL) {
        printf("must have \"%s\"", r->regmust);
    }
    printf("\n");
}
/*
 - regdump - dump a regexp onto stdout in vaguely comprehensible form
 *
 * Emits one line per program node (offset, regprop() text, successor
 * offset, and the literal operand for ANYOF / ANYBUT / EXACTLY nodes),
 * stopping after the END node, then a summary line with the regstart,
 * reganch and regmust header fields that are set.
 */
void
regdump (regexp * r)
{
    register char *s;
    register char op;
    register char *nxt;
#ifdef _AIX
    extern char *strchr();
#endif /* _AIX */

    s = r->program + 1;
    do {
        op = OP(s);
        printf("%2ld%s", (s - r->program), regprop(s));     /* Where, what. */

        nxt = regnext(s);
        if (nxt != (char *) NULL)
            printf("(%ld)", (nxt - r->program));
        else
            printf("(0)");

        /* Step over the opcode and its two next-pointer bytes. */
        s += 3;
        if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
            /* The operand is a NUL-terminated literal; echo it and skip the NUL. */
            for (; *s != '\0'; s++)
                putchar(*s);
            s++;
        }
        putchar('\n');
    } while (op != END);

    /* Header fields of interest. */
    if (r->regstart != '\0')
        printf("start `%c' ", r->regstart);
    if (r->reganch)
        printf("anchored ");
    if (r->regmust != (char *) NULL)
        printf("must have \"%s\"", r->regmust);
    printf("\n");
}
/*
 - regtail - set the next-pointer at the end of a node chain
 *
 * Starting at node index p, follows regnext() until it returns 0, then
 * stores in the slot after the last node's opcode the distance to val:
 * (node - val) for a BACK node, (val - node) otherwise.
 */
static void regtail(regex_t *preg, int p, int val)
{
	int last = p;
	int following;

	/* Advance until there is no successor. */
	while ((following = regnext(preg, last)) != 0) {
		last = following;
	}

	preg->program[last + 1] = (OP(preg, last) == BACK) ? (last - val) : (val - last);
}
/* - regtail - set the next-pointer at the end of a node chain */ void ossimRegExp::regtail (char* p, const char* val) { char* scan; char* temp; int offset; if (p == ®dummy) return; // Find last node. scan = p; for (;;) { temp = regnext(scan); if (temp == NULL) break; scan = temp; } if (OP(scan) == BACK) offset = (const char*)scan - val; else offset = val - scan; *(scan + 1) = (offset >> 8) & 0377; *(scan + 2) = offset & 0377; }
/* - regcomp - compile a regular expression into internal code * * We can't allocate space until we know how big the compiled form will be, * but we can't compile it (and thus know how big it is) until we've got a * place to put the code. So we cheat: we compile it twice, once with code * generation turned off and size counting turned on, and once "for real". * This also means that we don't allocate space until we are sure that the * thing really will compile successfully, and we never have to move the * code and thus invalidate pointers into it. (Note that it has to be in * one piece because free() must be able to free it all.) * * Beware that the optimization-preparation code in here knows about some * of the structure of the compiled regexp. */ regexp * regcomp( const char *exp ) { register regexp *r; register char *scan; register char *longest; register unsigned len; int flags; if (exp == NULL) FAIL("NULL argument"); /* First pass: determine size, legality. */ #ifdef notdef if (exp[0] == '.' && exp[1] == '*') exp += 2; /* aid grep */ #endif regparse = (char *)exp; regnpar = 1; regsize = 0L; regcode = ®dummy; regc(MAGIC); if (reg(0, &flags) == NULL) return(NULL); /* Small enough for pointer-storage convention? */ if (regsize >= 32767L) /* Probably could be 65535L. */ FAIL("regexp too big"); /* Allocate space. */ r = (regexp *)malloc(sizeof(regexp) + (unsigned)regsize); if (r == NULL) FAIL("out of space"); /* Second pass: emit code. */ regparse = (char *)exp; regnpar = 1; regcode = r->program; regc(MAGIC); if (reg(0, &flags) == NULL) return(NULL); /* Dig out information for optimizations. */ r->regstart = '\0'; /* Worst-case defaults. */ r->reganch = 0; r->regmust = NULL; r->regmlen = 0; scan = r->program+1; /* First BRANCH. */ if (OP(regnext(scan)) == END) { /* Only one top-level choice. */ scan = OPERAND(scan); /* Starting-point info. 
*/ if (OP(scan) == EXACTLY) r->regstart = *OPERAND(scan); else if (OP(scan) == BOL) r->reganch++; /* * If there's something expensive in the r.e., find the * longest literal string that must appear and make it the * regmust. Resolve ties in favor of later strings, since * the regstart check works with the beginning of the r.e. * and avoiding duplication strengthens checking. Not a * strong reason, but sufficient in the absence of others. */ if (flags&SPSTART) { longest = NULL; len = 0; for (; scan != NULL; scan = regnext(scan)) if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) { longest = OPERAND(scan); len = strlen(OPERAND(scan)); } r->regmust = longest; r->regmlen = len; } } return(r); }
/* * reg - regular expression, i.e. main body or parenthesized thing * * Caller must absorb opening parenthesis. * * Combining parenthesis handling with the base level of regular expression * is a trifle forced, but the need to tie the tails of the branches to what * follows makes it hard to avoid. */ static char *reg( int paren, int *flagp ) { char *ret, *br, *ender; int flags; char parno = 0; *flagp = HASWIDTH; /* Tentatively. */ /* Make an OPEN node, if parenthesized. */ if( paren ) { if( regnpar >= NSUBEXP ) { FAIL( ERR_RE_TOO_MANY_ROUND_BRACKETS ); } parno = regnpar; regnpar++; ret = regnode( OPEN + parno ); } else { ret = NULL; } /* Pick up the branches, linking them together. */ br = regbranch( &flags ); if( br == NULL ) { return( NULL ); } if( ret != NULL ) { regtail( ret, br ); /* OPEN -> first. */ } else { ret = br; } if( !( flags & HASWIDTH ) ) { *flagp &= ~HASWIDTH; } *flagp |= flags & SPSTART; while( *regparse == '|' ) { regparse++; br = regbranch( &flags ); if( br == NULL ) { return( NULL ); } regtail( ret, br ); /* BRANCH -> BRANCH. */ if( !( flags & HASWIDTH ) ) { *flagp &= ~HASWIDTH; } *flagp |= flags & SPSTART; } /* Make a closing node, and hook it on the end. */ ender = regnode( ( paren ) ? CLOSE + parno : END ); regtail( ret, ender ); /* Hook the tails of the branches to the closing node. */ for( br = ret; br != NULL; br = regnext( br ) ) { regoptail( br, ender ); } /* Check for proper termination. */ if( paren && *regparse++ != ')' ) { FAIL( ERR_RE_UNMATCHED_ROUND_BRACKETS ); } else if( !paren && *regparse != '\0' ) { if( *regparse == ')' ) { FAIL( ERR_RE_UNMATCHED_ROUND_BRACKETS ); } else { FAIL( ERR_RE_INTERNAL_FOULUP ); /* "Can't happen". */ } } return( ret ); }
// regmatch - main matching routine.
//
// Walks the node chain starting at prog, advancing reginput as nodes
// consume input.  Nodes that do not need to know whether the rest of the
// match failed are handled in the loop; OPEN/CLOSE, multi-way BRANCH and
// STAR/PLUS recurse.
//
// Returns 1 when the END node is reached, 0 on failure (including an
// unknown opcode or a chain that ends without END).
int CRegExp::regmatch(TCHAR *prog)
{
	TCHAR *scan;	// Current node.
	TCHAR *next;	// Next node.

	for (scan = prog; scan != NULL; scan = next)
	{
		next = regnext(scan);

		switch (OP(scan))
		{
		case BOL:
			if (reginput != regbol)
				return(0);
			break;

		case EOL:
			if (*reginput != _T('\0'))
				return(0);
			break;

		case ANY:
			if (*reginput == _T('\0'))
				return(0);
			reginput++;
			break;

		case EXACTLY:
		{
			size_t len;
			TCHAR *const opnd = OPERAND(scan);

			// Inline the first character, for speed.
			if (*opnd != *reginput)
				return(0);
			len = _tcslen(opnd);
			if (len > 1 && _tcsncmp(opnd, reginput, len) != 0)
				return(0);
			reginput += len;
			break;
		}

		case ANYOF:
			if (*reginput == _T('\0') ||
				_tcschr(OPERAND(scan), *reginput) == NULL)
				return(0);
			reginput++;
			break;

		case ANYBUT:
			if (*reginput == _T('\0') ||
				_tcschr(OPERAND(scan), *reginput) != NULL)
				return(0);
			reginput++;
			break;

		case NOTHING:
			break;

		case BACK:
			break;

		case OPEN+1: case OPEN+2: case OPEN+3:
		case OPEN+4: case OPEN+5: case OPEN+6:
		case OPEN+7: case OPEN+8: case OPEN+9:
		{
			const int no = OP(scan) - OPEN;
			TCHAR *const input = reginput;

			if (regmatch(next))
			{
				// Don't set startp if some later
				// invocation of the same parentheses
				// already has.
				if (startp[no] == NULL)
					startp[no] = input;
				return(1);
			}
			return(0);
		}

		case CLOSE+1: case CLOSE+2: case CLOSE+3:
		case CLOSE+4: case CLOSE+5: case CLOSE+6:
		case CLOSE+7: case CLOSE+8: case CLOSE+9:
		{
			const int no = OP(scan) - CLOSE;
			TCHAR *const input = reginput;

			if (regmatch(next))
			{
				// Don't set endp if some later
				// invocation of the same parentheses
				// already has.
				if (endp[no] == NULL)
					endp[no] = input;
				return(1);
			}
			return(0);
		}

		case BRANCH:
		{
			TCHAR *const save = reginput;

			if (OP(next) != BRANCH)		// No choice.
				next = OPERAND(scan);	// Avoid recursion.
			else
			{
				// Try each alternative in turn, restoring the input
				// position after a failed attempt.  The NULL check
				// keeps OP() from being applied to a missing node if
				// the chain ends without a closing node.
				while (scan != NULL && OP(scan) == BRANCH)
				{
					if (regmatch(OPERAND(scan)))
						return(1);
					reginput = save;
					scan = regnext(scan);
				}
				return(0);
				// NOTREACHED
			}
			break;
		}

		case STAR:
		case PLUS:
		{
			// Lookahead: when a literal follows, only positions where
			// its first character appears are worth trying.
			const TCHAR nextch = (OP(next) == EXACTLY) ? *OPERAND(next) : _T('\0');
			size_t no;
			TCHAR *const save = reginput;
			const size_t min = (OP(scan) == STAR) ? 0 : 1;

			// "no" runs one above the repetition count being tried so
			// that the unsigned counter never has to go below zero.
			for (no = regrepeat(OPERAND(scan)) + 1; no > min; no--)
			{
				reginput = save + no - 1;
				// If it could work, try it.
				if (nextch == _T('\0') || *reginput == nextch)
					if (regmatch(next))
						return(1);
			}
			return(0);
		}

		case END:
			return(1);	// Success!

		default:
			TRACE0("regexp corruption\n");
			return(0);
		}
	}

	// We get here only if there's trouble -- normally "case END" is
	// the terminating point.
	TRACE0("corrupted pointers\n");
	return(0);
}
/*
 - regdump - dump a regexp onto stdout in vaguely comprehensible form
 *
 * First prints the low byte of program[1 .. p-1] in hex, 16 entries per
 * line, then one line per node: index, regprop() text, successor index and
 * the node's operands (repeat bounds and counter, character ranges, or
 * literal text).  If the walk ended on END, the regstart / reganch /
 * regmust header fields that are set are printed as well.
 */
static void regdump(regex_t *preg)
{
	char buf[MAX_UTF8_LEN + 1];
	int op = EXACTLY;	/* Arbitrary non-END op. */
	int s;
	int next;
	int i;
	int n;

	/* Raw dump of the program. */
	for (i = 1; i < preg->p; i++) {
		printf("%02x ", (unsigned char)preg->program[i]);
		if (i % 16 == 0) {
			printf("\n");
		}
	}
	printf("\n");

	/* Node-by-node listing. */
	for (s = 1; op != END && s < preg->p; ) {
		op = OP(preg, s);
		printf("%3d: %s", s, regprop(op));	/* Where, what. */

		next = regnext(preg, s);
		if (next != 0) {
			printf("(%d)", next);
		}
		else {
			printf("(0)");
		}

		/* Skip the opcode and next slots; operands follow. */
		s += 2;

		switch (op) {
			case REP:
			case REPMIN:
			case REPX:
			case REPXMIN: {
				const int max = preg->program[s];
				const int min = preg->program[s + 1];

				if (max == 65535) {
					printf("{%d,*}", min);
				}
				else {
					printf("{%d,%d}", min, max);
				}
				printf(" %d", preg->program[s + 2]);
				s += 3;
				break;
			}

			case ANYOF:
			case ANYBUT:
				/* Zero-terminated list of (length, first) range pairs. */
				while (preg->program[s]) {
					const int len = preg->program[s++];
					const int first = preg->program[s++];

					n = utf8_getchars(buf, first);
					buf[n] = 0;
					printf("%s", buf);
					if (len > 1) {
						n = utf8_getchars(buf, first + len - 1);
						buf[n] = 0;
						printf("-%s", buf);
					}
				}
				s++;
				break;

			case EXACTLY:
				/* Zero-terminated literal, one code point per slot. */
				for (; preg->program[s]; s++) {
					n = utf8_getchars(buf, preg->program[s]);
					buf[n] = 0;
					printf("%s", buf);
				}
				s++;
				break;

			default:
				break;
		}
		putchar('\n');
	}

	if (op == END) {
		/* Header fields of interest. */
		if (preg->regstart) {
			n = utf8_getchars(buf, preg->regstart);
			buf[n] = 0;
			printf("start '%s' ", buf);
		}
		if (preg->reganch) {
			printf("anchored ");
		}
		if (preg->regmust != 0) {
			printf("must have:");
			for (i = 0; i < preg->regmlen; i++) {
				putchar(preg->program[preg->regmust + i]);
			}
			putchar('\n');
		}
	}
	printf("\n");
}
/*
 * regmatch - main matching routine
 *
 * Walks the program from node index prog, consuming input through
 * preg->reginput.  Simple nodes are handled in the loop; BRANCH chains,
 * repeats and capture groups recurse.
 *
 * Returns 1 on success and 0 on failure.  REG_ERR_INTERNAL is returned for
 * an unknown opcode or when the node chain ends without reaching END.
 */
static int regmatch(regex_t *preg, int prog)
{
	int scan;	/* Current node. */
	int next;		/* Next node. */
	const char *save;

	scan = prog;

#ifdef DEBUG
	/* regprop() takes an opcode, not a node index. */
	if (scan != 0 && regnarrate)
		fprintf(stderr, "%s(\n", regprop(OP(preg, scan)));
#endif
	while (scan != 0) {
		int n;
		int c;
#ifdef DEBUG
		if (regnarrate) {
			fprintf(stderr, "%3d: %s...\n", scan, regprop(OP(preg, scan)));	/* Where, what. */
		}
#endif
		next = regnext(preg, scan);
		/* Decode the current input character once; n is its length in bytes. */
		n = reg_utf8_tounicode_case(preg->reginput, &c, (preg->cflags & REG_ICASE));

		switch (OP(preg, scan)) {
		case BOL:
			if (preg->reginput != preg->regbol)
				return(0);
			break;
		case EOL:
			if (!reg_iseol(preg, c)) {
				return(0);
			}
			break;
		case WORDA:
			/* Must be looking at a letter, digit, or _ */
			if ((!isalnum(UCHAR(c))) && c != '_')
				return(0);
			/* Prev must be BOL or nonword */
			if (preg->reginput > preg->regbol &&
				(isalnum(UCHAR(preg->reginput[-1])) || preg->reginput[-1] == '_'))
				return(0);
			break;
		case WORDZ:
			/* Can't match at BOL */
			if (preg->reginput > preg->regbol) {
				/*
				 * Current must be EOL or nonword.  A word character is
				 * alphanumeric OR '_', so "nonword" needs both tests to
				 * fail; joining them with || would accept every character.
				 */
				if (reg_iseol(preg, c) || (!isalnum(UCHAR(c)) && c != '_')) {
					c = preg->reginput[-1];
					/* Previous must be word */
					if (isalnum(UCHAR(c)) || c == '_') {
						break;
					}
				}
			}
			/* No */
			return(0);

		case ANY:
			if (reg_iseol(preg, c))
				return 0;
			preg->reginput += n;
			break;
		case EXACTLY: {
				int opnd;
				int len;
				int slen;

				opnd = OPERAND(scan);
				len = str_int_len(preg->program + opnd);

				slen = prefix_cmp(preg->program + opnd, len, preg->reginput, preg->cflags & REG_ICASE);
				if (slen < 0) {
					return(0);
				}
				preg->reginput += slen;
			}
			break;
		case ANYOF:
			if (reg_iseol(preg, c) || reg_range_find(preg->program + OPERAND(scan), c) == 0) {
				return(0);
			}
			preg->reginput += n;
			break;
		case ANYBUT:
			if (reg_iseol(preg, c) || reg_range_find(preg->program + OPERAND(scan), c) != 0) {
				return(0);
			}
			preg->reginput += n;
			break;
		case NOTHING:
			break;
		case BACK:
			break;
		case BRANCH:
			if (OP(preg, next) != BRANCH)		/* No choice. */
				next = OPERAND(scan);	/* Avoid recursion. */
			else {
				/* Try each alternative, restoring the input after a failure. */
				do {
					save = preg->reginput;
					if (regmatch(preg, OPERAND(scan))) {
						return(1);
					}
					preg->reginput = save;
					scan = regnext(preg, scan);
				} while (scan != 0 && OP(preg, scan) == BRANCH);
				return(0);
				/* NOTREACHED */
			}
			break;
		case REP:
		case REPMIN:
			return regmatchsimplerepeat(preg, scan, OP(preg, scan) == REPMIN);

		case REPX:
		case REPXMIN:
			return regmatchrepeat(preg, scan, OP(preg, scan) == REPXMIN);

		case END:
			return 1;	/* Success! */

		case OPENNC:
		case CLOSENC:
			return regmatch(preg, next);

		default:
			if (OP(preg, scan) >= OPEN+1 && OP(preg, scan) < CLOSE_END) {
				save = preg->reginput;
				if (regmatch(preg, next)) {
					/*
					 * Record the group boundary only if a later
					 * invocation of the same parentheses has not
					 * already done so (entry is still -1).
					 */
					if (OP(preg, scan) < CLOSE) {
						int no = OP(preg, scan) - OPEN;
						if (no < preg->nmatch && preg->pmatch[no].rm_so == -1) {
							preg->pmatch[no].rm_so = save - preg->start;
						}
					}
					else {
						int no = OP(preg, scan) - CLOSE;
						if (no < preg->nmatch && preg->pmatch[no].rm_eo == -1) {
							preg->pmatch[no].rm_eo = save - preg->start;
						}
					}
					return(1);
				}
				return(0);
			}
			return REG_ERR_INTERNAL;
		}

		scan = next;
	}

	/*
	 * We get here only if there's trouble -- normally "case END" is
	 * the terminating point.
	 */
	return REG_ERR_INTERNAL;
}
/* - regmatch - main matching routine * * Conceptually the strategy is simple: check to see whether the current * node matches, call self recursively to see whether the rest matches, * and then act accordingly. In practice we make some effort to avoid * recursion, in particular by going through "ordinary" nodes (that don't * need to know whether the rest of the match failed) by a loop instead of * by recursion. */ static int /* 0 failure, 1 success */ regmatch(char *prog) { register char *scan; /* Current node. */ char *next; /* Next node. */ wchar_t wc = L'\0'; int len; scan = prog; while (scan != NULL) { next = regnext(scan); switch (OP(scan)) { case BOL: if (reginput != regbol) return(0); break; case EOL: if (CHARLEN(reginput) != 0) return(0); break; case WORDA: /* Must be looking at a letter, digit, or _ */ len = mbtowc(&wc, reginput, MB_CUR_MAX); if (len == -1) wc = *reginput; if ((!iswalnum(wc)) && wc != L'_') return(0); /* Prev must be BOL or nonword */ len = mbtowc(&wc, reginput - reglmlen, MB_CUR_MAX); if (len == -1) { wc = *(reginput- reglmlen); len = 1; } if (reginput > regbol && (iswalnum(wc) || wc == L'_')) return(0); break; case WORDZ: len = mbtowc(&wc, reginput, MB_CUR_MAX); if (len == -1) { wc = *reginput; len = 1; } /* Must be looking at non letter, digit, or _ */ if (iswalnum(wc) || wc == L'_') return(0); /* We don't care what the previous char was */ break; case ANY: /* Solaris 2.6 Motif diff bug 1236359 - 1 line */ if ( (len = CHARLEN(reginput)) <= 0) return(0); reglmlen = len; reginput += INCRLEN(len); break; case EXACTLY: { register int len; register int clen; register char *opnd; register char *op, *ip; opnd = OPERAND(scan); len = strlen(opnd); for (clen = len, op = opnd, ip = reginput; clen; ) { int opl = CHARLEN(op), ipl = CHARLEN(ip); if (opl == ipl && !strncmp(op, ip, ipl)) { op += ipl; ip += ipl; clen -= ipl; reglmlen = ipl; } else break; } if (clen) return(0); reginput += len; } break; case ANYOF: /* Solaris 2.6 motif diff bug 1236359 - 
1 line */ if ( ((len = CHARLEN(reginput)) <= 0) || !inclass(OPERAND(scan), reginput)) return 0; reginput += len; reglmlen = len; break; case ANYBUT: /* Solaris 2.6 motif diff bug 1236359 - 1 line */ if ( ((len = CHARLEN(reginput)) <= 0) || inclass(OPERAND(scan), reginput)) return 0; reginput += len; reglmlen = len; break; case NOTHING: break; case BACK: break; case OPEN + 1: case OPEN + 2: case OPEN + 3: case OPEN + 4: case OPEN + 5: case OPEN + 6: case OPEN + 7: case OPEN + 8: case OPEN + 9: { register int no; register char *save; no = OP(scan) - OPEN; save = reginput; if (regmatch(next)) { /* * Don't set startp if some later * invocation of the same parentheses * already has. */ if (regstartp[no] == NULL) regstartp[no] = save; return(1); } else return(0); } break; case CLOSE + 1: case CLOSE + 2: case CLOSE + 3: case CLOSE + 4: case CLOSE + 5: case CLOSE + 6: case CLOSE + 7: case CLOSE + 8: case CLOSE + 9: { register int no; register char *save; no = OP(scan) - CLOSE; save = reginput; if (regmatch(next)) { /* * Don't set endp if some later * invocation of the same parentheses * already has. */ if (regendp[no] == NULL) regendp[no] = save; return(1); } else return(0); } break; case BRANCH: { register char *save; if (OP(next) != BRANCH) /* No choice. */ next = OPERAND(scan); /* Avoid recursion. */ else { do { save = reginput; if (regmatch(OPERAND(scan))) return(1); reginput = save; scan = regnext(scan); } while (scan != NULL && OP(scan) == BRANCH); return(0); /* NOTREACHED */ } } break; case STAR: case PLUS: { register char *nextch; register int no; register char *save; register int min; int nchars = 0; /* * Lookahead to avoid useless match attempts * when we know what character comes next. */ nextch = 0; if (OP(next) == EXACTLY) nextch = OPERAND(next); min = (OP(scan) == STAR) ? 0 : 1; save = reginput; no = regrepeat(OPERAND(scan)); while (no >= min) { /* Solaris 2.6 motif diff bug 1236359 - 1 line */ int mb_len = 0; /* If it could work, try it. 
*/ if (!nextch || !(len = CHARLEN(nextch)) || !strncmp(reginput, nextch, len) ) if (regmatch(next)) return(1); /* Couldn't or didn't -- back up. */ no--; reginput = save; /* Solaris 2.6 motif diff bug 1236359 - 4 lines */ for (nchars = 0; nchars < no && mb_len >= 0; nchars++) { mb_len = CHARLEN(reginput); if (mb_len > 0) reginput += mb_len; } } return(0); } break; case END: return(1); /* Success! */ break; default: return(0); break; } scan = next; } /* * We get here only if there's trouble -- normally "case END" is * the terminating point. */ return(0); }
/* - regmatch - main matching routine * * Conceptually the strategy is simple: check to see whether the current * node matches, call self recursively to see whether the rest matches, * and then act accordingly. In practice we make some effort to avoid * recursion, in particular by going through "ordinary" nodes (that don't * need to know whether the rest of the match failed) by a loop instead of * by recursion. */ static int /* 0 failure, 1 success */ regmatch(char *prog) { register char *scan; /* Current node. */ char *next; /* Next node. */ extern char *strchr(); scan = prog; #ifdef DEBUG if (scan != NULL && regnarrate) { fprintf(stderr, "%s(\n", regprop(scan)); } #endif while (scan != NULL) { #ifdef DEBUG if (regnarrate) { fprintf(stderr, "%s...\n", regprop(scan)); } #endif next = regnext(scan); switch (OP(scan)) { case BOL: if (reginput != regbol) { return(0); } break; case EOL: if (regpeek(0) != '\0' && regpeek(0) != '\n') { return(0); } break; case BEGWORD: /* Match if current char isident * and previous char BOL or !ident */ if ((regpeek(0) == 0 || !isident(regpeek(0))) || (reginput != regbol && isident(regpeek(-1)))) { return(0); } break; case ENDWORD: /* Match if previous char isident * and current char EOL or !ident */ if ((regpeek(0) != 0 && isident(regpeek(0))) || reginput == regbol || !isident(regpeek(-1))) { return(0); } break; case WHITESP: /* match single whitespace */ if (regpeek(0) != 0 && !isspace(regpeek(0))) { return(0); } reginput++; break; case NWHITESP: /* don't match eol, or space or tab */ if (regpeek(0) == 0 || isspace(regpeek(0))) { return(0); } reginput++; break; case ALNUM: /* includes _ */ if (regpeek(0) == 0 || !isident(regpeek(0))) { return(0); } reginput++; break; case NALNUM: if (regpeek(0) == 0 || isident(regpeek(0))) { return(0); } reginput++; break; case DIGIT: if (regpeek(0) == 0 || !isdigit(regpeek(0))) { return(0); } reginput++; break; case NDIGIT: if (regpeek(0) == 0 || isdigit(regpeek(0))) { return(0); } reginput++; break; 
case PRINT: if (regpeek(0) == 0 || !(isprint(regpeek(0)) || isspace(regpeek(0)))) { return(0); } reginput++; break; case NPRINT: if (regpeek(0) == 0 || isprint(regpeek(0)) || isspace(regpeek(0))) { return(0); } reginput++; break; case ANY: if (regpeek(0) == '\0' || regpeek(0) == '\n') { return(0); } regseek(1); break; case EXACTLY: { register int len; register char *opnd; opnd = OPERAND(scan); /* Inline the first character, for speed. */ if (*opnd != regpeek(0)) { return(0); } len = strlen(opnd); if (len > 1 && strncmp(opnd, reginput, len) != 0) { return(0); } regseek(len); } break; case ANYOF: if (strchr(OPERAND(scan), regpeek(0)) == NULL) { return(0); } regseek(1); break; case ANYBUT: if (strchr(OPERAND(scan), regpeek(0)) != NULL) { return(0); } regseek(1); break; case NOTHING: break; case BACK: break; case OPEN+1: case OPEN+2: case OPEN+3: case OPEN+4: case OPEN+5: case OPEN+6: case OPEN+7: case OPEN+8: case OPEN+9: { register int no; register char *save; no = OP(scan) - OPEN; save = reginput; if (regmatch(next)) { /* * Don't set startp if some later * invocation of the same parentheses * already has. */ if (regstartp[no] == NULL) { regstartp[no] = save; } return(1); } else { return(0); } } break; case CLOSE+1: case CLOSE+2: case CLOSE+3: case CLOSE+4: case CLOSE+5: case CLOSE+6: case CLOSE+7: case CLOSE+8: case CLOSE+9: { register int no; register char *save; no = OP(scan) - CLOSE; save = reginput; if (regmatch(next)) { /* * Don't set endp if some later * invocation of the same parentheses * already has. */ if (regendp[no] == NULL) { regendp[no] = save; } return(1); } else { return(0); } } break; case BRANCH: { register char *save; if (OP(next) != BRANCH) { /* No choice. */ next = OPERAND(scan); /* Avoid recursion. 
*/ } else { do { save = reginput; if (regmatch(OPERAND(scan))) { return(1); } reginput = save; scan = regnext(scan); } while (scan != NULL && OP(scan) == BRANCH); return(0); /* NOTREACHED */ } } break; case STAR: case PLUS: { register char nextch; register int no; register char *save; register int min; /* * Lookahead to avoid useless match attempts * when we know what character comes next. */ nextch = '\0'; if (OP(next) == EXACTLY) { nextch = *OPERAND(next); } min = (OP(scan) == STAR) ? 0 : 1; save = reginput; no = regrepeat(OPERAND(scan)); while (no >= min) { /* If it could work, try it. */ if (nextch == '\0' || regpeek(0) == nextch) { if (regmatch(next)) { return(1); } } /* Couldn't or didn't -- back up. */ no--; reginput = save + no; } return(0); } break; case MINMAX: { register char *save; unsigned char min; unsigned char max; register int no; next = OPERAND(scan); min = OP(next); next = OPERAND(next); max = OP(next); next = OPERAND(next); save = reginput; for (no = 0 ; no < min ; no++) { if (!regmatch(next)) { reginput = save; return(0); } } for ( ; no < max ; no++) { if (!regmatch(next)) { break; } } return(1); } break; case END: return(1); /* Success! */ break; default: SREerror("memory corruption"); return(0); break; } scan = next; } /* * We get here only if there's trouble -- normally "case END" is * the terminating point. */ SREerror("corrupted pointers"); return(0); }
void ossimRegExp::compile (const char* exp) { const char* scan; const char* longest; unsigned long len; int flags; if (exp == NULL) { //RAISE Error, SYM(ossimRegExp), SYM(No_Expr), printf ("ossimRegExp::compile(): No expression supplied.\n"); return; } // First pass: determine size, legality. regparse = exp; regnpar = 1; regsize = 0L; regcode = ®dummy; regc(MAGIC); if(!reg(0, &flags)) { printf ("ossimRegExp::compile(): Error in compile.\n"); return; } this->startp[0] = this->endp[0] = this->searchstring = NULL; // Small enough for pointer-storage convention? if (regsize >= 32767L) { // Probably could be 65535L. //RAISE Error, SYM(ossimRegExp), SYM(Expr_Too_Big), printf ("ossimRegExp::compile(): Expression too big.\n"); return; } // Allocate space. //#ifndef WIN32 if (this->program != NULL) delete [] this->program; //#endif this->program = new char[regsize]; this->progsize = (int) regsize; if (this->program == NULL) { //RAISE Error, SYM(ossimRegExp), SYM(Out_Of_Memory), printf ("ossimRegExp::compile(): Out of memory.\n"); return; } // Second pass: emit code. regparse = exp; regnpar = 1; regcode = this->program; regc(MAGIC); reg(0, &flags); // Dig out information for optimizations. this->regstart = '\0'; // Worst-case defaults. this->reganch = 0; this->regmust = NULL; this->regmlen = 0; scan = this->program + 1; // First BRANCH. if (OP(regnext(scan)) == END) { // Only one top-level choice. scan = OPERAND(scan); // Starting-point info. if (OP(scan) == EXACTLY) this->regstart = *OPERAND(scan); else if (OP(scan) == BOL) this->reganch++; // // If there's something expensive in the r.e., find the longest // literal string that must appear and make it the regmust. Resolve // ties in favor of later strings, since the regstart check works // with the beginning of the r.e. and avoiding duplication // strengthens checking. Not a strong reason, but sufficient in the // absence of others. 
// if (flags & SPSTART) { longest = NULL; len = 0; for (; scan != NULL; scan = regnext(scan)) if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) { longest = OPERAND(scan); len = (unsigned long)strlen(OPERAND(scan)); } this->regmust = longest; this->regmlen = len; } } }
/* - reg - regular expression, i.e. main body or parenthesized thing * * Caller must absorb opening parenthesis. * * Combining parenthesis handling with the base level of regular expression * is a trifle forced, but the need to tie the tails of the branches to what * follows makes it hard to avoid. */ char* ossimRegExp::reg (int paren, int *flagp) { char* ret; char* br; char* ender; int parno =0; int flags; *flagp = HASWIDTH; // Tentatively. // Make an OPEN node, if parenthesized. if (paren) { if (regnpar >= NSUBEXP) { //RAISE Error, SYM(ossimRegExp), SYM(Too_Many_Parens), printf ("ossimRegExp::compile(): Too many parentheses.\n"); return 0; } parno = regnpar; regnpar++; ret = regnode(OPEN + parno); } else ret = NULL; // Pick up the branches, linking them together. br = regbranch(&flags); if (br == NULL) return (NULL); if (ret != NULL) regtail(ret, br); // OPEN -> first. else ret = br; if (!(flags & HASWIDTH)) *flagp &= ~HASWIDTH; *flagp |= flags & SPSTART; while (*regparse == '|') { regparse++; br = regbranch(&flags); if (br == NULL) return (NULL); regtail(ret, br); // BRANCH -> BRANCH. if (!(flags & HASWIDTH)) *flagp &= ~HASWIDTH; *flagp |= flags & SPSTART; } // Make a closing node, and hook it on the end. ender = regnode((paren) ? CLOSE + parno : END); regtail(ret, ender); // Hook the tails of the branches to the closing node. for (br = ret; br != NULL; br = regnext(br)) regoptail(br, ender); // Check for proper termination. if (paren && *regparse++ != ')') { //RAISE Error, SYM(ossimRegExp), SYM(Unmatched_Parens), printf ("ossimRegExp::compile(): Unmatched parentheses.\n"); return 0; } else if (!paren && *regparse != '\0') { if (*regparse == ')') { //RAISE Error, SYM(ossimRegExp), SYM(Unmatched_Parens), printf ("ossimRegExp::compile(): Unmatched parentheses.\n"); return 0; } else { //RAISE Error, SYM(ossimRegExp), SYM(Internal_Error), printf ("ossimRegExp::compile(): Internal error.\n"); return 0; } // NOTREACHED } return (ret); }
/*
 - regcomp - compile a regular expression into internal code
 *
 * The program buffer is allocated up front with (strlen(exp) + 1) * 5
 * int-sized slots and reg() is run once over the pattern; there is no
 * separate sizing pass in this routine.
 *
 * preg   is cleared with memset() and then filled in.
 * exp    is the pattern text; NULL fails with REG_ERR_NULL_ARGUMENT.
 * cflags is stored in preg->cflags.
 *
 * Returns 0 on success.  On a parse failure the value of preg->err is
 * returned; the other failures go through FAIL().  The buffer in
 * preg->program is not released here on failure.
 * NOTE(review): presumably the caller is expected to release it -- confirm.
 *
 * Beware that the optimization-preparation code in here knows about some
 * of the structure of the compiled regexp.
 */
int regcomp(regex_t *preg, const char *exp, int cflags)
{
	int scan;
	int longest;
	unsigned len;
	int flags;

	memset(preg, 0, sizeof(*preg));

	if (exp == NULL)
		FAIL(preg, REG_ERR_NULL_ARGUMENT);

#ifdef DEBUG
	/* Traced only after the NULL check: "%s" must not be given NULL. */
	fprintf(stderr, "Compiling: '%s'\n", exp);
#endif

	preg->cflags = cflags;
	preg->regparse = exp;

	/* Allocate space. */
	preg->proglen = (strlen(exp) + 1) * 5;
	preg->program = malloc(preg->proglen * sizeof(int));
	if (preg->program == NULL)
		FAIL(preg, REG_ERR_NOMEM);

	/* Note that since we store a magic value as the first item in the program,
	 * program offsets will never be 0
	 */
	regc(preg, REG_MAGIC);
	if (reg(preg, 0, &flags) == 0) {
		return preg->err;
	}

	/* Small enough for pointer-storage convention? */
	if (preg->re_nsub >= REG_MAX_PAREN)		/* Probably could be 65535L. */
		FAIL(preg,REG_ERR_TOO_BIG);

	/* Dig out information for optimizations. */
	preg->regstart = 0;	/* Worst-case defaults. */
	preg->reganch = 0;
	preg->regmust = 0;
	preg->regmlen = 0;
	scan = 1;			/* First BRANCH. */
	if (OP(preg, regnext(preg, scan)) == END) {		/* Only one top-level choice. */
		scan = OPERAND(scan);

		/* Starting-point info. */
		if (OP(preg, scan) == EXACTLY) {
			preg->regstart = preg->program[OPERAND(scan)];
		}
		else if (OP(preg, scan) == BOL)
			preg->reganch++;

		/*
		 * If there's something expensive in the r.e., find the
		 * longest literal string that must appear and make it the
		 * regmust.  Resolve ties in favor of later strings, since
		 * the regstart check works with the beginning of the r.e.
		 * and avoiding duplication strengthens checking.  Not a
		 * strong reason, but sufficient in the absence of others.
		 */
		if (flags&SPSTART) {
			longest = 0;
			len = 0;
			for (; scan != 0; scan = regnext(preg, scan)) {
				if (OP(preg, scan) == EXACTLY) {
					int plen = str_int_len(preg->program + OPERAND(scan));
					/* Same unsigned comparison as before, now spelled out. */
					if ((unsigned)plen >= len) {
						longest = OPERAND(scan);
						len = (unsigned)plen;
					}
				}
			}
			preg->regmust = longest;
			preg->regmlen = len;
		}
	}

#ifdef DEBUG
	regdump(preg);
#endif

	return 0;
}
// reg - parse a complete expression, or the inside of a parenthesized group.
//
// When paren is non-zero the caller has already consumed the '('.  On
// return *flagp has HASWIDTH cleared if any alternative lacked it and
// SPSTART set if any alternative reported it.  Returns the first node of
// the chain, or NULL after tracing a diagnostic.
TCHAR *CRegExp::reg(int paren, int *flagp)
{
	char *head;
	char *alt;
	char *closer;
	int groupNo;
	int altFlags;

	*flagp = HASWIDTH;	// Tentatively.

	// A parenthesized group starts with its own OPEN node.
	if (paren)
	{
		if (regnpar >= NSUBEXP)
		{
			TRACE1("Too many (). NSUBEXP is set to %d\n", NSUBEXP );
			return NULL;
		}
		groupNo = regnpar++;
		head = regnode(OPEN+groupNo);
	}

	// First alternative: hangs off OPEN, or becomes the head itself.
	alt = regbranch(&altFlags);
	if (alt == NULL)
		return(NULL);
	if (paren)
		regtail(head, alt);
	else
		head = alt;
	if (!(altFlags & HASWIDTH))
		*flagp &= ~HASWIDTH;
	*flagp |= altFlags & SPSTART;

	// Remaining alternatives, each introduced by '|'.
	for (; *regparse == _T('|'); )
	{
		regparse++;
		alt = regbranch(&altFlags);
		if (alt == NULL)
			return(NULL);
		regtail(head, alt);	// BRANCH -> BRANCH.
		if (!(altFlags & HASWIDTH))
			*flagp &= ~HASWIDTH;
		*flagp |= altFlags & SPSTART;
	}

	// Make a closing node, and hook it on the end.
	closer = regnode(paren ? CLOSE+groupNo : END);
	regtail(head, closer);

	// Hook the tails of the branches to the closing node.
	for (alt = head; alt != NULL; alt = regnext(alt))
		regoptail(alt, closer);

	// Check for proper termination.
	if (paren)
	{
		if (*regparse++ != _T(')'))
		{
			TRACE0("unterminated ()\n");
			return NULL;
		}
	}
	else if (*regparse != _T('\0'))
	{
		if (*regparse == _T(')'))
		{
			TRACE0("unmatched ()\n");
		}
		else
		{
			TRACE0("internal error: junk on end\n");
		}
		return NULL;
	}

	return(head);
}
/*
 - reg - parse a complete expression, or the inside of a parenthesized group
 *
 * The caller has already consumed the opening parenthesis when paren is
 * non-zero.  A group that starts with "?:" is non-capturing: it gets
 * group number -1 and does not bump preg->re_nsub.
 *
 * Top-level and parenthesized parsing share this routine because in both
 * cases the tail of every alternative has to be tied to whatever closes
 * the expression.
 *
 * Returns the program offset of the first node, or 0 on failure (with
 * preg->err set for the termination errors detected here).
 */
static int reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp )
{
	int head = 0;
	int alt;
	int closer;
	int group = 0;
	int altflags;

	*flagp = HASWIDTH;	/* Tentatively. */

	/* A parenthesized group starts with its own OPEN node. */
	if (paren) {
		if (preg->regparse[0] == '?' && preg->regparse[1] == ':') {
			/* non-capturing paren */
			preg->regparse += 2;
			group = -1;
		}
		else {
			preg->re_nsub++;
			group = preg->re_nsub;
		}
		head = regnode(preg, OPEN+group);
	}

	/* Collect the alternatives, chaining each one onto the previous. */
	for (;;) {
		alt = regbranch(preg, &altflags);
		if (alt == 0)
			return 0;

		if (head == 0)
			head = alt;
		else
			regtail(preg, head, alt);	/* OPEN -> first, or BRANCH -> BRANCH. */

		if ((altflags & HASWIDTH) == 0)
			*flagp &= ~HASWIDTH;
		*flagp |= altflags & SPSTART;

		if (*preg->regparse != '|')
			break;
		preg->regparse++;
	}

	/* Make a closing node, and hook it on the end. */
	closer = regnode(preg, paren ? CLOSE+group : END);
	regtail(preg, head, closer);

	/* Hook the tails of the branches to the closing node. */
	for (alt = head; alt != 0; alt = regnext(preg, alt))
		regoptail(preg, alt, closer);

	/* Check for proper termination. */
	if (paren) {
		if (*preg->regparse++ != ')') {
			preg->err = REG_ERR_UNMATCHED_PAREN;
			return 0;
		}
	}
	else if (*preg->regparse != '\0') {
		preg->err = (*preg->regparse == ')')
			? REG_ERR_UNMATCHED_PAREN
			: REG_ERR_JUNK_ON_END;
		return 0;
	}

	return(head);
}
// RegComp - compile the pattern exp into this object's program buffer.
//
// reg() is run twice over the same text: once with bEmitCode FALSE and
// regcode aimed at regdummy (size / legality pass), and once with bEmitCode
// TRUE and regcode aimed at a freshly allocated, zero-filled buffer of
// regsize elements.  Afterwards regstart, reganch, regmust and regmlen are
// derived from the emitted program.
//
// Returns this on success and NULL if exp is NULL, either pass fails, or
// the allocation yields NULL.  bCompiled is set to TRUE as soon as a
// non-NULL pattern is seen, before the outcome is known.
CRegExp* CRegExp::RegComp(const TCHAR *exp)
{
	TCHAR *scan;
	int flags;

	if (exp == NULL)
		return NULL;

	bCompiled = TRUE;

	// First pass: determine size, legality.
	bEmitCode = FALSE;
	regparse = (TCHAR *)exp;
	regnpar = 1;
	regsize = 0L;
	regdummy[0] = NOTHING;
	regdummy[1] = regdummy[2] = 0;
	regcode = regdummy;
	if (reg(0, &flags) == NULL)
		return(NULL);

	// Allocate space.  The buffer comes from new[] below, so it has to be
	// released with delete[]; plain delete on it is undefined behaviour.
	delete [] program;
	program = new TCHAR[regsize];
	// Test the pointer before touching the memory it points to.
	if (program == NULL)
		return NULL;
	memset( program, 0, regsize * sizeof(TCHAR) );

	// Second pass: emit code.
	bEmitCode = TRUE;
	regparse = (TCHAR *)exp;
	regnpar = 1;
	regcode = program;
	if (reg(0, &flags) == NULL)
		return NULL;

	// Dig out information for optimizations.
	regstart = _T('\0');		// Worst-case defaults.
	reganch = 0;
	regmust = NULL;
	regmlen = 0;
	scan = program;		// First BRANCH.
	if (OP(regnext(scan)) == END)
	{
		// Only one top-level choice.
		scan = OPERAND(scan);

		// Starting-point info.
		if (OP(scan) == EXACTLY)
			regstart = *OPERAND(scan);
		else if (OP(scan) == BOL)
			reganch = 1;

		// If there's something expensive in the r.e., find the
		// longest literal string that must appear and make it the
		// regmust.  Resolve ties in favor of later strings, since
		// the regstart check works with the beginning of the r.e.
		// and avoiding duplication strengthens checking.  Not a
		// strong reason, but sufficient in the absence of others.
		if (flags&SPSTART)
		{
			char *longest = NULL;
			size_t len = 0;

			for (; scan != NULL; scan = regnext(scan))
			{
				if (OP(scan) != EXACTLY)
					continue;

				// Measure each literal once; ">=" keeps the later of two ties.
				const size_t operandLen = _tcslen(OPERAND(scan));
				if (operandLen >= len)
				{
					longest = OPERAND(scan);
					len = operandLen;
				}
			}
			regmust = longest;
			regmlen = (int)len;
		}
	}

	return this;
}
/* - regcomp - compile a regular expression into internal code * * We can't allocate space until we know how big the compiled form will be, * but we can't compile it (and thus know how big it is) until we've got a * place to put the code. So we cheat: we compile it twice, once with code * generation turned off and size counting turned on, and once "for real". * This also means that we don't allocate space until we are sure that the * thing really will compile successfully, and we never have to move the * code and thus invalidate pointers into it. (Note that it has to be in * one piece because FREE() must be able to free it all.) * * Beware that the optimization-preparation code in here knows about some * of the structure of the compiled regexp. */ regexp *regcomp (unsigned char * exp, int excompat) /* \( \) operators like in unix ex */ { register regexp *r; register unsigned char *scan; register char *longest; register int len; int flags; short *exp2, *dest, c; if (!exp) FAIL("NULL argument\n"); exp2 = (short *) DXALLOC((strlen((char *)exp) + 1) * (sizeof(short[8]) / sizeof(char[8])), TAG_TEMPORARY, "regcomp: 1"); for (scan = exp, dest = exp2; (c = *scan++);) { switch (c) { case '(': case ')': *dest++ = excompat ? c : c | SPECIAL; break; case '.': case '*': case '+': case '?': case '|': case '$': case '^': case '[': case ']': *dest++ = c | SPECIAL; break; case '\\': switch (c = *scan++) { case 0: FREE(exp2); FAIL("Regular expression cannot end with '\\'. Use \"\\\\\".\n"); break; case '(': case ')': *dest++ = excompat ? c | SPECIAL : c; break; case '<': case '>': *dest++ = c | SPECIAL; break; case '{': case '}': FREE(exp2); FAIL("sorry, unimplemented operator\n"); case 'b': *dest++ = '\b'; break; case 't': *dest++ = '\t'; break; case 'r': *dest++ = '\r'; break; default: *dest++ = c; } break; default: *dest++ = c; } } *dest = 0; /* First pass: determine size, legality. 
*/ regparse = exp2; regnpar = 1; regsize = 0L; regcode = ®dummy; regc((char) MAGIC); if (reg(0, &flags) == (char *) NULL) { FREE(exp2); return ((regexp *) NULL); } /* Small enough for pointer-storage convention? */ if (regsize >= 32767L) /* Probably could be 65535L. */ { FREE(exp2); FAIL("regexp too big\n"); } /* Allocate space. */ r = (regexp *) DXALLOC(sizeof(regexp) + (unsigned) regsize, TAG_TEMPORARY, "regcomp: 2"); if (r == (regexp *) NULL) { FREE(exp2); FAIL("out of space\n"); } /* Second pass: emit code. */ regparse = exp2; regnpar = 1; regcode = (char *)(r->program); regc((char) MAGIC); if (reg(0, &flags) == NULL) { FREE(exp2); FREE(r); return ((regexp *) NULL); } /* Dig out information for optimizations. */ r->regstart = '\0'; /* Worst-case defaults. */ r->reganch = 0; r->regmust = NULL; r->regmlen = 0; scan = (unsigned char *)(r->program + 1); /* First BRANCH. */ if (OP(regnext((char *)scan)) == END) { /* Only one top-level choice. */ scan = OPERAND(scan); /* Starting-point info. */ if (OP(scan) == EXACTLY) r->regstart = *OPERAND(scan); else if (OP(scan) == BOL) r->reganch++; /* * If there's something expensive in the r.e., find the longest * literal string that must appear and make it the regmust. Resolve * ties in favor of later strings, since the regstart check works * with the beginning of the r.e. and avoiding duplication * strengthens checking. Not a strong reason, but sufficient in the * absence of others. */ if (flags & SPSTART) { longest = NULL; len = 0; for (; scan != NULL; scan = (unsigned char *)regnext((char *)scan)) { char *tmp = (char *)OPERAND(scan); int tlen; if (OP(scan) == EXACTLY && (tlen = strlen(tmp)) >= len) { longest = tmp; len = tlen; } } r->regmust = longest; r->regmlen = len; } } FREE((char *) exp2); return (r); }
/* - RegComp - compile a regular expression into internal code * * We can't allocate space until we know how big the compiled form will be, * but we can't compile it (and thus know how big it is) until we've got a * place to put the code. So we cheat: we compile it twice, once with code * generation turned off and size counting turned on, and once "for real". * This also means that we don't allocate space until we are sure that the * thing really will compile successfully, and we never have to move the * code and thus invalidate pointers into it. (Note that it has to be in * one piece because free() must be able to free it all.) * * Beware that the optimization-preparation code in here knows about some * of the structure of the compiled regexp. */ regexp *RegComp( const char *instr ) { regexp *r; char *scan; char *longest; const char *exp; char buff[MAX_STR*2]; int flags, ignmag = FALSE; unsigned j; size_t i, k, len; #ifdef WANT_EXCLAMATION if( instr[0] == '!' ) { instr++; ignmag = TRUE; } #endif /* * flip roles of magic chars */ if( !ignmag && ( !MAGICFLAG && MAGICSTR != NULL ) ) { j = 0; k = strlen( instr ); for( i = 0; i < k; i++ ) { if( instr[i] == '\\' ) { if( strchr( MAGICSTR, instr[i + 1] ) == NULL ) { buff[j++] = '\\'; } i++; } else { if( strchr( MAGICSTR, instr[i] ) != NULL ) { buff[j++] = '\\'; } } buff[j++] = instr[i]; } buff[j] = 0; exp = buff; } else { exp = instr; } regError( ERR_NO_ERR ); if( exp == NULL ) { FAIL( ERR_RE_NULL_ARGUMENT ); } /* First pass: determine size, legality. */ regparse = exp; regnpar = 1; regsize = 0L; regcode = ®dummy; regc( MAGIC ); if( reg( 0, &flags ) == NULL ) { return( NULL ); } /* Allocate space. */ r = ALLOC( sizeof( regexp ) + ( unsigned ) regsize ); /* Second pass: emit code. */ regparse = exp; regnpar = 1; regcode = r->program; regc( MAGIC ); if( reg( 0, &flags ) == NULL ) { return( NULL ); } /* Dig out information for optimizations. */ r->regstart = '\0'; /* Worst-case defaults. 
*/ r->reganch = 0; r->regmust = NULL; r->regmlen = 0; scan = r->program + 1; /* First BRANCH. */ if( OP( regnext( scan ) ) == END ) { /* Only one top-level choice. */ scan = OPERAND( scan ); /* Starting-point info. */ if( OP( scan ) == EXACTLY ) { r->regstart = *OPERAND( scan ); } else if( OP( scan ) == BOL ) { r->reganch++; } /* * If there's something expensive in the r.e., find the * longest literal string that must appear and make it the * regmust. Resolve ties in favor of later strings, since * the regstart check works with the beginning of the r.e. * and avoiding duplication strengthens checking. Not a * strong reason, but sufficient in the absence of others. */ if( flags & SPSTART ) { longest = NULL; len = 0; for( ; scan != NULL; scan = regnext( scan ) ) { if( OP( scan ) == EXACTLY && strlen( OPERAND( scan ) ) >= len ) { longest = OPERAND( scan ); len = strlen( OPERAND( scan ) ); } } r->regmust = longest; r->regmlen = (short)len; } } return( r ); }
/* - regmatch - main matching routine * * Conceptually the strategy is simple: check to see whether the current * node matches, call self recursively to see whether the rest matches, * and then act accordingly. In practice we make some effort to avoid * recursion, in particular by going through "ordinary" nodes (that don't * need to know whether the rest of the match failed) by a loop instead of * by recursion. */ static int regmatch (char * prog) { register char *scan; /* Current node. */ char *nxt; /* nxt node. */ scan = prog; #ifdef DEBUG if (scan != (char *) NULL && regnarrate) debug_message("%s(\n", regprop(scan)); #endif while (scan != (char *) NULL) { #ifdef DEBUG if (regnarrate) debug_message("%s...\n", regprop(scan)); #endif nxt = regnext(scan); switch (OP(scan)) { case BOL: if (reginput != regbol) return (0); break; case EOL: if (*reginput != '\0') return (0); break; case ANY: if (*reginput == '\0') return (0); reginput++; break; case WORDSTART: if (reginput == regbol) break; if (*reginput == '\0' || ISWORDPART(*(reginput - 1)) || !ISWORDPART(*reginput)) return (0); break; case WORDEND: if (*reginput == '\0') break; if (reginput == regbol || !ISWORDPART(*(reginput - 1)) || ISWORDPART(*reginput)) return (0); break; case EXACTLY:{ register int len; register char *opnd; opnd = OPERAND(scan); /* Inline the first character, for speed. 
*/ if (*opnd != *reginput) return (0); len = strlen(opnd); if (len > 1 && strncmp(opnd, reginput, len) != 0) return (0); reginput += len; } break; case ANYOF: if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == (char *) NULL) return (0); reginput++; break; case ANYBUT: if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != (char *) NULL) return (0); reginput++; break; case NOTHING: break; case BACK: break; case OPEN + 1: case OPEN + 2: case OPEN + 3: case OPEN + 4: case OPEN + 5: case OPEN + 6: case OPEN + 7: case OPEN + 8: case OPEN + 9:{ register int no; register const char *save; no = OP(scan) - OPEN; save = reginput; if (regmatch(nxt)) { /* * Don't set startp if some later invocation of the same * parentheses already has. */ if (regstartp[no] == (char *) NULL) regstartp[no] = save; return (1); } else return (0); } break; case CLOSE + 1: case CLOSE + 2: case CLOSE + 3: case CLOSE + 4: case CLOSE + 5: case CLOSE + 6: case CLOSE + 7: case CLOSE + 8: case CLOSE + 9:{ register int no; register const char *save; no = OP(scan) - CLOSE; save = reginput; if (regmatch(nxt)) { /* * Don't set endp if some later invocation of the same * parentheses already has. */ if (regendp[no] == (char *) NULL) regendp[no] = save; return (1); } else return (0); } break; case BRANCH:{ register const char *save; if (OP(nxt) != BRANCH) /* No choice. */ nxt = OPERAND(scan); /* Avoid recursion. */ else { do { save = reginput; if (regmatch(OPERAND(scan))) return (1); reginput = save; scan = regnext(scan); } while (scan != (char *) NULL && OP(scan) == BRANCH); return (0); /* NOTREACHED */ } } break; case STAR: case PLUS:{ register char nextch; register int no; register const char *save; register int minimum; /* * Lookahead to avoid useless match attempts when we know * what character comes next. */ nextch = '\0'; if (OP(nxt) == EXACTLY) nextch = *OPERAND(nxt); minimum = (OP(scan) == STAR) ? 
0 : 1; save = reginput; no = regrepeat(OPERAND(scan)); while (no >= minimum) { /* If it could work, try it. */ if (nextch == '\0' || *reginput == nextch) if (regmatch(nxt)) return (1); /* Couldn't or didn't -- back up. */ no--; reginput = save + no; } return (0); } break; case END: return (1); /* Success! */ break; default: regerror("memory corruption\n"); return (0); break; } scan = nxt; } /* * We get here only if there's trouble -- normally "case END" is the * terminating point. */ regerror("corrupted pointers\n"); return (0); }
/*
 * REcompile - compile a regular expression into internal code
 *
 * exp      is the pattern text; NULL is rejected.
 * re_exp   receives the compiled expression on success only.
 * mem_tag  is passed to MEreqmem() for the allocation.
 *
 * We can't allocate space until we know how big the compiled form will be,
 * but we can't compile it (and thus know how big it is) until we've got a
 * place to put the code.  So we cheat:  we compile it twice, once with code
 * generation turned off and size counting turned on, and once "for real".
 * This also means that we don't allocate space until we are sure that the
 * thing really will compile successfully, and we never have to move the
 * code and thus invalidate pointers into it.  (Note that it has to be in
 * one piece because free() must be able to free it all.)
 *
 * Returns OK on success and FAIL otherwise; the argument, size and
 * allocation failures are reported through _error().
 *
 * Beware that the optimization-preparation code in here knows about some
 * of the structure of the compiled RE_EXP.
 */
STATUS
REcompile( char *exp, RE_EXP **re_exp, i4 mem_tag )
{
	register RE_EXP *r;
	register char *scan;
	register char *longest;
	register i4 len;
	i4 flags;
	u_char magic = MAGIC;

	if (exp == NULL)
	{
		_error("NULL argument");
		return (FAIL);
	}

	/* First pass: determine size, legality. */
	regparse = exp;
	regnpar = 1;
	regsize = 0L;
	regcode = &regdummy;
	regc( (char *) &magic );
	if (reg(0, &flags) == NULL)
		return( FAIL );

	/* Small enough for pointer-storage convention? */
	if (regsize >= 32767L)		/* Probably could be 65535L. */
	{
		_error("regular expression too big");
		return (FAIL);
	}

	/* Allocate space. */
	r = (RE_EXP *) MEreqmem( mem_tag, sizeof(RE_EXP) + (unsigned) regsize,
		FALSE, NULL);
	if (r == NULL)
	{
		_error("out of space");
		return (FAIL);
	}

	/* Second pass: emit code. */
	regparse = exp;
	regnpar = 1;
	regcode = r->program;
	regc( (char *) &magic );
	if (reg(0, &flags) == NULL)
	{
		/* NOTE(review): r is not released on this path -- confirm intent. */
		return( FAIL );
	}

	/* Dig out information for optimizations. */
	r->regstart = '\0';	/* Worst-case defaults. */
	r->reganch = 0;
	r->regmust = NULL;
	r->regmlen = 0;
	scan = r->program+1;			/* First BRANCH. */
	if (OP(regnext(scan)) == END)		/* Only one top-level choice. */
	{
		scan = OPERAND(scan);

		/* Starting-point info. */
		if (OP(scan) == EXACTLY)
			r->regstart = *OPERAND(scan);
		else if (OP(scan) == BOL)
			r->reganch++;

		/*
		 * If there's something expensive in the r.e., find the
		 * longest literal string that must appear and make it the
		 * regmust.  Resolve ties in favor of later strings, since
		 * the regstart check works with the beginning of the r.e.
		 * and avoiding duplication strengthens checking.  Not a
		 * strong reason, but sufficient in the absence of others.
		 */
		if (flags&SPSTART)
		{
			longest = NULL;
			len = 0;
			for (; scan != NULL; scan = regnext(scan))
			{
				if (OP(scan) == EXACTLY &&
				    STlength(OPERAND(scan)) >= len)
				{
					longest = OPERAND(scan);
					len = STlength(OPERAND(scan));
				}
			}
			r->regmust = longest;
			r->regmlen = len;
		}
	}

	*re_exp = r;
	return( OK );
}
// regevaluate - run the compiled expression in expr against the amt
// bytes at val.
//
// Probes are addressed as offsets below `base`, which is the address
// expr->size bytes past the start of expr.  A run queue of probes is
// processed until one of them exhausts its node chain exactly at the end
// of the input (return 1) or the queue is empty (return 0).  0 is also
// returned when the memo array does not fit or a probe cannot be obtained.
//
// The memo array holds one bit per (node, input offset) pair and is used
// to abandon a probe that reaches a pair some earlier probe already
// visited.

int regevaluate (struct Expr *expr, uchar *val, int amt)
{
struct Probe *base = (struct Probe *)((uchar *)expr + expr->size);
struct Probe *probe, *parent, *twin;
int idx, pending;

	// reset evaluator

	expr->val = val;
	expr->amt = amt;
	expr->top = 0;

	// first use of a newly compiled node tree:
	// the memo array starts right after the nodes

	if( !expr->memo ) {
		expr->memo = (uchar *)(expr + 1) + expr->tree;
	}

	// number of nodes, from the byte distance between
	// the first node and the memo array

	idx = (expr->memo - (uchar *)(expr + 1)) / sizeof(struct Node);

	// one bit per node per input offset (0..amt),
	// rounded up to whole bytes

	idx = (idx * (amt + 1) + 7) / 8;
	expr->tree = idx + (expr->memo - (uchar *)(expr + 1));

	if( expr->tree + sizeof(struct Expr) > expr->size ) {
		return 0;	// out of memory
	}

	memset (expr->memo, 0, idx);

	// launch initial probe on root of parse tree

	probe = regprobe (expr);

	if( !probe ) {
		return 0;	// out of memory
	}

	probe->node = (struct Node *)(expr + 1);
	pending = base - probe;

	// take probes off the run queue one at a time

	while( (idx = pending) != 0 ) {
		probe = base - idx;
		pending = probe->next;

		// push this probe forward until it dies or succeeds

		while( ++expr->steps ) {

			// maximum occurrences reached: move to the
			// sibling node; with no sibling, succeed if all
			// input is consumed, otherwise kill the probe

			if( probe->occurrence == probe->node->maximum ) {
				if( regnext (expr, probe) ) {
					continue;
				}
				if( probe->off == expr->amt ) {
					return 1;
				}
				break;
			}

			// memo bit index for this node at this offset

			idx = probe->node - (struct Node *)(expr + 1);
			idx *= amt + 1;
			idx += probe->off;

			// beyond the minimum, abandon the probe if this
			// node/offset pair was already visited; otherwise
			// mark it visited

			if( ++probe->occurrence > probe->node->minimum ) {
				if( expr->memo[idx/8] & (1 << (idx % 8)) ) {
					break;
				}
				expr->memo[idx/8] |= 1 << (idx % 8);
			}

			// minimum met: queue a clone that is already at
			// its maximum, so it continues with the alternate

			if( probe->occurrence > probe->node->minimum ) {
				twin = regclone (expr, probe);

				if( !twin ) {
					return 0;	// out of memory
				}

				twin->occurrence = twin->node->maximum;
				twin->next = pending;
				pending = base - twin;
			}

			// descend probe into subexpressions

			if( probe->node->typelen <= 0 ) {

				// stack entry remembers the parent state

				parent = regprobe (expr);

				if( !parent ) {
					return 0;	// out of memory
				}

				parent->next = probe->stack;
				parent->occurrence = probe->occurrence;
				parent->off = probe->off;
				probe->node = probe->node->type->child;
				probe->stack = base - parent;
				probe->occurrence = 0;
				continue;
			}

			// advance to next input character,
			// or kill probe if no pattern match

			if( !regmatch (expr, probe) ) {
				break;
			}

			probe->off++;
		}

		// delete our probe and continue
		// with next queued clone

		regkill (expr, probe);
	}

	// run queue is exhausted: no match

	return 0;
}
/* - reg - regular expression, i.e. main body or parenthesized thing * * Caller must absorb opening parenthesis. * * Combining parenthesis handling with the base level of regular expression * is a trifle forced, but the need to tie the tails of the branches to what * follows makes it hard to avoid. */ static char * reg( int paren, /* Parenthesized? */ int *flagp ) { register char *ret; register char *br; register char *ender; register int parno; int flags; *flagp = HASWIDTH; /* Tentatively. */ /* Make an OPEN node, if parenthesized. */ if (paren) { if (regnpar >= NSUBEXP) FAIL("too many ()"); parno = regnpar; regnpar++; ret = regnode(OPEN+parno); } else ret = NULL; /* Pick up the branches, linking them together. */ br = regbranch(&flags); if (br == NULL) return(NULL); if (ret != NULL) regtail(ret, br); /* OPEN -> first. */ else ret = br; if (!(flags&HASWIDTH)) *flagp &= ~HASWIDTH; *flagp |= flags&SPSTART; while (*regparse == '|' || *regparse == '\n') { regparse++; br = regbranch(&flags); if (br == NULL) return(NULL); regtail(ret, br); /* BRANCH -> BRANCH. */ if (!(flags&HASWIDTH)) *flagp &= ~HASWIDTH; *flagp |= flags&SPSTART; } /* Make a closing node, and hook it on the end. */ ender = regnode((paren) ? CLOSE+parno : END); regtail(ret, ender); /* Hook the tails of the branches to the closing node. */ for (br = ret; br != NULL; br = regnext(br)) regoptail(br, ender); /* Check for proper termination. */ if (paren && *regparse++ != ')') { FAIL("unmatched ()"); } else if (!paren && *regparse != '\0') { if (*regparse == ')') { FAIL("unmatched ()"); } else FAIL("junk on end"); /* "Can't happen". */ /* NOTREACHED */ } return(ret); }
/* - regmatch - main matching routine * * Conceptually the strategy is simple: check to see whether the current * node matches, call self recursively to see whether the rest matches, * and then act accordingly. In practice we make some effort to avoid * recursion, in particular by going through "ordinary" nodes (that don't * need to know whether the rest of the match failed) by a loop instead of * by recursion. * 0 failure, 1 success */ int ossimRegExp::regmatch (const char* prog) { const char* scan; // Current node. const char* next; // Next node. scan = prog; while (scan != NULL) { next = regnext(scan); switch (OP(scan)) { case BOL: if (reginput != regbol) return (0); break; case EOL: if (*reginput != '\0') return (0); break; case ANY: if (*reginput == '\0') return (0); reginput++; break; case EXACTLY: { int len; const char* opnd; opnd = OPERAND(scan); // Inline the first character, for speed. if (*opnd != *reginput) return (0); len = (int)strlen(opnd); if (len > 1 && strncmp(opnd, reginput, len) != 0) return (0); reginput += len; } break; case ANYOF: if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL) return (0); reginput++; break; case ANYBUT: if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL) return (0); reginput++; break; case NOTHING: break; case BACK: break; case OPEN + 1: case OPEN + 2: case OPEN + 3: case OPEN + 4: case OPEN + 5: case OPEN + 6: case OPEN + 7: case OPEN + 8: case OPEN + 9: { int no; const char* save; no = OP(scan) - OPEN; save = reginput; if (regmatch(next)) { // // Don't set startp if some later invocation of the // same parentheses already has. 
// if (regstartp[no] == NULL) regstartp[no] = save; return (1); } else return (0); } // break; case CLOSE + 1: case CLOSE + 2: case CLOSE + 3: case CLOSE + 4: case CLOSE + 5: case CLOSE + 6: case CLOSE + 7: case CLOSE + 8: case CLOSE + 9: { int no; const char* save; no = OP(scan) - CLOSE; save = reginput; if (regmatch(next)) { // // Don't set endp if some later invocation of the // same parentheses already has. // if (regendp[no] == NULL) regendp[no] = save; return (1); } else return (0); } // break; case BRANCH: { const char* save; if (OP(next) != BRANCH) // No choice. next = OPERAND(scan); // Avoid recursion. else { do { save = reginput; if (regmatch(OPERAND(scan))) return (1); reginput = save; scan = regnext(scan); } while (scan != NULL && OP(scan) == BRANCH); return (0); // NOTREACHED } } break; case STAR: case PLUS: { char nextch; int no; const char* save; int min_no; // // Lookahead to avoid useless match attempts when we know // what character comes next. // nextch = '\0'; if (OP(next) == EXACTLY) nextch = *OPERAND(next); min_no = (OP(scan) == STAR) ? 0 : 1; save = reginput; no = regrepeat(OPERAND(scan)); while (no >= min_no) { // If it could work, try it. if (nextch == '\0' || *reginput == nextch) if (regmatch(next)) return (1); // Couldn't or didn't -- back up. no--; reginput = save + no; } return (0); } // break; case END: return (1); // Success! default: //RAISE Error, SYM(ossimRegExp), SYM(Internal_Error), printf ("ossimRegExp::find(): Internal error -- memory corrupted.\n"); return 0; } scan = next; } // // We get here only if there's trouble -- normally "case END" is the // terminating point. // //RAISE Error, SYM(ossimRegExp), SYM(Internal_Error), printf ("ossimRegExp::find(): Internal error -- corrupted pointers.\n"); return (0); }
/* - regmatch - main matching routine * * Conceptually the strategy is simple: check to see whether the current * node matches, call self recursively to see whether the rest matches, * and then act accordingly. In practice we make some effort to avoid * recursion, in particular by going through "ordinary" nodes (that don't * need to know whether the rest of the match failed) by a loop instead of * by recursion. */ static int /* 0 failure, 1 success */ regmatch( char *prog ) { register char *scan; /* Current node. */ char *next; /* Next node. */ scan = prog; #ifdef DEBUG if (scan != NULL && regnarrate) fprintf(stderr, "%s(\n", regprop(scan)); #endif while (scan != NULL) { #ifdef DEBUG if (regnarrate) fprintf(stderr, "%s...\n", regprop(scan)); #endif next = regnext(scan); switch (OP(scan)) { case BOL: if (reginput != regbol) return(0); break; case EOL: if (*reginput != '\0') return(0); break; case WORDA: /* Must be looking at a letter, digit, or _ */ if ((!isalnum(*reginput)) && *reginput != '_') return(0); /* Prev must be BOL or nonword */ if (reginput > regbol && (isalnum(reginput[-1]) || reginput[-1] == '_')) return(0); break; case WORDZ: /* Must be looking at non letter, digit, or _ */ if (isalnum(*reginput) || *reginput == '_') return(0); /* We don't care what the previous char was */ break; case ANY: if (*reginput == '\0') return(0); reginput++; break; case EXACTLY: { register int len; register char *opnd; opnd = OPERAND(scan); /* Inline the first character, for speed. 
*/ if (*opnd != *reginput) return(0); len = strlen(opnd); if (len > 1 && strncmp(opnd, reginput, len) != 0) return(0); reginput += len; } break; case ANYOF: if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL) return(0); reginput++; break; case ANYBUT: if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL) return(0); reginput++; break; case NOTHING: break; case BACK: break; case OPEN+1: case OPEN+2: case OPEN+3: case OPEN+4: case OPEN+5: case OPEN+6: case OPEN+7: case OPEN+8: case OPEN+9: { register int no; register const char *save; no = OP(scan) - OPEN; save = reginput; if (regmatch(next)) { /* * Don't set startp if some later * invocation of the same parentheses * already has. */ if (regstartp[no] == NULL) regstartp[no] = save; return(1); } else return(0); } break; case CLOSE+1: case CLOSE+2: case CLOSE+3: case CLOSE+4: case CLOSE+5: case CLOSE+6: case CLOSE+7: case CLOSE+8: case CLOSE+9: { register int no; register const char *save; no = OP(scan) - CLOSE; save = reginput; if (regmatch(next)) { /* * Don't set endp if some later * invocation of the same parentheses * already has. */ if (regendp[no] == NULL) regendp[no] = save; return(1); } else return(0); } break; case BRANCH: { register const char *save; if (OP(next) != BRANCH) /* No choice. */ next = OPERAND(scan); /* Avoid recursion. */ else { do { save = reginput; if (regmatch(OPERAND(scan))) return(1); reginput = save; scan = regnext(scan); } while (scan != NULL && OP(scan) == BRANCH); return(0); /* NOTREACHED */ } } break; case STAR: case PLUS: { register char nextch; register int no; register const char *save; register int min; /* * Lookahead to avoid useless match attempts * when we know what character comes next. */ nextch = '\0'; if (OP(next) == EXACTLY) nextch = *OPERAND(next); min = (OP(scan) == STAR) ? 0 : 1; save = reginput; no = regrepeat(OPERAND(scan)); while (no >= min) { /* If it could work, try it. 
*/ if (nextch == '\0' || *reginput == nextch) if (regmatch(next)) return(1); /* Couldn't or didn't -- back up. */ no--; reginput = save + no; } return(0); } break; case END: return(1); /* Success! */ break; default: regerror("memory corruption"); return(0); break; } scan = next; } /* * We get here only if there's trouble -- normally "case END" is * the terminating point. */ regerror("corrupted pointers"); return(0); }
/*
 - regcomp - compile a regular expression into internal code
 *
 * We can't allocate space until we know how big the compiled form will be,
 * but we can't compile it (and thus know how big it is) until we've got a
 * place to put the code.  So we cheat:  we compile it twice, once with code
 * generation turned off and size counting turned on, and once "for real".
 * This also means that we don't allocate space until we are sure that the
 * thing really will compile successfully, and we never have to move the
 * code and thus invalidate pointers into it.  (Note that it has to be in
 * one piece because free() must be able to free it all.)
 *
 * Beware that the optimization-preparation code in here knows about some
 * of the structure of the compiled regexp.
 *
 * On success *pregexp receives the newly allocated program and
 * kPGPError_NoErr is returned.  *pregexp is set to NULL before any work
 * is done, so it stays NULL on every failure return below:
 *   kPGPError_OutOfMemory  reg() returned NULL in either pass, or the
 *                          allocation failed
 *   kPGPError_BadParams    the compiled size is 32767 or larger
 * The PGPValidate*() checks run first; what they do on failure is not
 * visible here.
 */
PGPError
pgpRegComp(PGPContextRef context, char const *exp, regexp **pregexp)
{
	regexp *r;
	char const *scan;
	char const *longest;
	int len;
	int flags;
	regcompState s_rcs;
	regcompState *rcs = &s_rcs;

	PGPValidateContext( context );
	PGPValidatePtr( exp );
	PGPValidatePtr( pregexp );
	*pregexp = NULL;

	pgpClearMemory( &s_rcs, sizeof(s_rcs) );

	/* First pass: determine size, legality. */
	rcs->regparse = exp;
	rcs->regnpar = 1;
	rcs->regsize = 0L;
	/* Sizing pass: code is aimed at the dummy location, not a real buffer. */
	rcs->regcode = &regdummy;
	regc(rcs, MAGIC);
	if (reg(rcs, 0, &flags) == NULL)
		return(kPGPError_OutOfMemory);

	/* Small enough for pointer-storage convention? */
	if (rcs->regsize >= 32767L)		/* Probably could be 65535L. */
		return(kPGPError_BadParams);

	/* Allocate space. */
	r = (regexp *)pgpContextMemAlloc(context,
				sizeof(regexp) + (unsigned)rcs->regsize, 0);
	if (r == NULL)
		return kPGPError_OutOfMemory;

	/* Second pass: emit code. */
	rcs->regparse = exp;
	rcs->regnpar = 1;
	rcs->regcode = r->program;
	regc(rcs, MAGIC);
	/*
	 * NOTE(review): r is not released on this failure path.  It should be
	 * handed back to whatever routine pairs with pgpContextMemAlloc();
	 * that routine is not visible from here -- confirm and add the call.
	 */
	if (reg(rcs, 0, &flags) == NULL)
		return(kPGPError_OutOfMemory);

	/* Dig out information for optimizations. */
	r->regstart = '\0';	/* Worst-case defaults. */
	r->reganch = 0;
	r->regmust = NULL;
	r->regmlen = 0;
	scan = r->program+1;			/* First BRANCH. */
	if (OP(regnext(scan)) == END) {		/* Only one top-level choice. */
		scan = OPERAND(scan);

		/* Starting-point info. */
		if (OP(scan) == EXACTLY)
			r->regstart = *OPERAND(scan);
		else if (OP(scan) == BOL)
			r->reganch++;

		/*
		 * If there's something expensive in the r.e., find the
		 * longest literal string that must appear and make it the
		 * regmust.  Resolve ties in favor of later strings, since
		 * the regstart check works with the beginning of the r.e.
		 * and avoiding duplication strengthens checking.  Not a
		 * strong reason, but sufficient in the absence of others.
		 */
		if (flags&SPSTART) {
			longest = NULL;
			len = 0;
			for (; scan != NULL; scan = regnext(scan)) {
				size_t oplen;

				if (OP(scan) != EXACTLY)
					continue;
				/* Measure each literal once; ">=" lets later strings win ties. */
				oplen = strlen(OPERAND(scan));
				if (oplen >= (unsigned)len) {
					longest = OPERAND(scan);
					len = (int)oplen;
				}
			}
			r->regmust = longest;
			r->regmlen = len;
		}
	}

	*pregexp = r;
	return(kPGPError_NoErr);
}