/*
 - regbranch - one alternative of an | operator
 *
 * Implements the concatenation operator: pieces are parsed one after
 * another until the parse cursor sits on end-of-pattern, ')' or '|',
 * and each piece is linked to its predecessor with regtail().
 *
 * *flagp starts as WORST, gains HASWIDTH if any piece reports it and
 * gains SPSTART only if the first piece reports it.
 *
 * Returns the value of regnode(preg, BRANCH), or 0 as soon as
 * regpiece() returns 0.
 */
static int regbranch(regex_t *preg, int *flagp )
{
	int head;
	int prev = 0;		/* previous piece; 0 means "none yet" */
	int piece;
	int piece_flags;

	*flagp = WORST;		/* Tentatively. */
	head = regnode(preg, BRANCH);

	for (;;) {
		if (*preg->regparse == '\0' ||
		    *preg->regparse == ')' ||
		    *preg->regparse == '|') {
			break;
		}

		piece = regpiece(preg, &piece_flags);
		if (!piece) {
			return 0;
		}

		*flagp |= piece_flags & HASWIDTH;
		if (prev) {
			regtail(preg, prev, piece);
		}
		else {
			/* First piece. */
			*flagp |= piece_flags & SPSTART;
		}
		prev = piece;
	}

	if (!prev) {
		/* Loop ran zero times. */
		(void) regnode(preg, NOTHING);
	}

	return head;
}
/* - regbranch - one alternative of an | operator * * Implements the concatenation operator. */ static char * regbranch( int *flagp ) { register char *ret; register char *chain; register char *latest; int flags; *flagp = WORST; /* Tentatively. */ ret = regnode(BRANCH); chain = NULL; while (*regparse != '\0' && *regparse != ')' && *regparse != '\n' && *regparse != '|') { latest = regpiece(&flags); if (latest == NULL) return(NULL); *flagp |= flags&HASWIDTH; if (chain == NULL) /* First piece. */ *flagp |= flags&SPSTART; else regtail(chain, latest); chain = latest; } if (chain == NULL) /* Loop ran zero times. */ (void) regnode(NOTHING); return(ret); }
// regbranch - one alternative of an | operator.
//
// Implements concatenation: pieces are parsed until the parse cursor
// reaches end-of-pattern, '|' or ')', and each piece after the first is
// linked to the one before it with regtail().
//
// *flagp is set to WORST, then gains HASWIDTH from any piece and
// SPSTART from the first piece only.
//
// Returns the node made by regnode(BRANCH), or NULL if regpiece() fails.
TCHAR *CRegExp::regbranch(int *flagp)
{
	TCHAR *prev = NULL;	// previous piece, NULL until one is parsed
	TCHAR *piece;
	int pieceFlags;

	*flagp = WORST;		// Tentatively.
	TCHAR *head = regnode(BRANCH);

	for (;;) {
		if (*regparse == _T('\0') ||
		    *regparse == _T('|') ||
		    *regparse == _T(')'))
			break;

		piece = regpiece(&pieceFlags);
		if (piece == NULL)
			return(NULL);

		*flagp |= pieceFlags & HASWIDTH;
		if (prev != NULL)
			regtail(prev, piece);
		else			// First piece.
			*flagp |= pieceFlags & SPSTART;
		prev = piece;
	}

	if (prev == NULL)	// Loop ran zero times.
		(void) regnode(NOTHING);

	return(head);
}
/* - regbranch - one alternative of an | operator * * Implements the concatenation operator. */ static char * regbranch(int *flagp) { register char *ret; register char *chain; register char *latest; int flags; int len = 0; *flagp = WORST; /* Tentatively. */ ret = regnode(BRANCH); chain = NULL; /* Solaris 2.6 motif diff bug 1236359 1 line */ while ((len = CHARLEN(regparse)) > 0 && (len != 1 || (*regparse != ')' && *regparse != '\n' && *regparse != '|'))) { latest = regpiece(&flags); if (latest == NULL) return(NULL); *flagp |= flags & HASWIDTH; if (chain == NULL) /* First piece. */ *flagp |= flags & SPSTART; else regtail(chain, latest); chain = latest; } if (chain == NULL) /* Loop ran zero times. */ (void) regnode(NOTHING); return(ret); }
// regspcl - build a node for the character following a backslash
//
// *type selects a built-in class pattern for 'd', 'D', 's', 'S', 'i'
// and 'I'; any other character is stored as itself (an escaped
// regular character) with a pattern length of 1.
//
// Returns the node obtained from regnode(expr), or that call's null
// result unchanged when no node could be obtained.
struct Node *regspcl (struct Expr *expr, uchar *type, struct Node *parent)
{
uchar *classpat = 0;    // stays null for an escaped regular character
struct Node *node;

    switch( *type ) {
    case 'd': classpat = "[0-9]";        break;
    case 'D': classpat = "[^0-9]";       break;
    case 's': classpat = "[ ]";          break;
    case 'S': classpat = "[^ ]";         break;
    case 'i': classpat = "[a-zA-Z_:]";   break;
    case 'I': classpat = "[^a-zA-Z_:]";  break;
    default:
        break;
    }

    node = regnode (expr);

    if( !node )
        return node;

    // every special node matches exactly once
    node->maximum = 1;
    node->minimum = 1;
    node->parent = parent;

    if( classpat ) {
        node->typelen = strlen(classpat);
        node->type->pattern = classpat;
    } else {
        node->typelen = 1;
        node->type->pattern = type;
    }

    return node;
}
/* - regbranch - one alternative of an | operator * * Implements the concatenation operator. */ char* ossimRegExp::regbranch (int *flagp) { char* ret; char* chain; char* latest; int flags; *flagp = WORST; // Tentatively. ret = regnode(BRANCH); chain = NULL; while (*regparse != '\0' && *regparse != '|' && *regparse != ')') { latest = regpiece(&flags); if (latest == NULL) return (NULL); *flagp |= flags & HASWIDTH; if (chain == NULL) // First piece. *flagp |= flags & SPSTART; else regtail(chain, latest); chain = latest; } if (chain == NULL) // Loop ran zero times. regnode(NOTHING); return (ret); }
/* - regpiece - something followed by possible [*+?] * * Note that the branching code sequences used for ? and the general cases * of * and + are somewhat optimized: they use the same NOTHING node as * both the endmarker for their branch list and the body of the last branch. * It might seem that this node could be dispensed with entirely, but the * endmarker role is not redundant. */ char* ossimRegExp::regpiece (int *flagp) { char* ret; char op; char* next; int flags; ret = regatom(&flags); if (ret == NULL) return (NULL); op = *regparse; if (!ISMULT(op)) { *flagp = flags; return (ret); } if (!(flags & HASWIDTH) && op != '?') { //RAISE Error, SYM(ossimRegExp), SYM(Empty_Operand), printf ("ossimRegExp::compile() : *+ operand could be empty.\n"); return 0; } *flagp = (op != '+') ? (WORST | SPSTART) : (WORST | HASWIDTH); if (op == '*' && (flags & SIMPLE)) reginsert(STAR, ret); else if (op == '*') { // Emit x* as (x&|), where & means "self". reginsert(BRANCH, ret); // Either x regoptail(ret, regnode(BACK)); // and loop regoptail(ret, ret); // back regtail(ret, regnode(BRANCH)); // or regtail(ret, regnode(NOTHING)); // null. } else if (op == '+' && (flags & SIMPLE)) reginsert(PLUS, ret); else if (op == '+') { // Emit x+ as x(&|), where & means "self". next = regnode(BRANCH); // Either regtail(ret, next); regtail(regnode(BACK), ret); // loop back regtail(next, regnode(BRANCH)); // or regtail(ret, regnode(NOTHING)); // null. } else if (op == '?') { // Emit x? as (x|) reginsert(BRANCH, ret); // Either x regtail(ret, regnode(BRANCH)); // or next = regnode(NOTHING);// null. regtail(ret, next); regoptail(ret, next); } regparse++; if (ISMULT(*regparse)) { //RAISE Error, SYM(ossimRegExp), SYM(Nested_Operand), printf ("ossimRegExp::compile(): Nested *?+.\n"); return 0; } return (ret); }
/* - regpiece - something followed by possible [*+?] * * Note that the branching code sequences used for ? and the general cases * of * and + are somewhat optimized: they use the same NOTHING node as * both the endmarker for their branch list and the body of the last branch. * It might seem that this node could be dispensed with entirely, but the * endmarker role is not redundant. */ static char * regpiece(int *flagp) { register char *ret; register char *op; register char *next; int flags; int len = 0; ret = regatom(&flags); if (ret == NULL) return(NULL); op = regparse; if (!ISMULT(op)) { *flagp = flags; return(ret); } len = CHARLEN(op); if (!(flags & HASWIDTH) && ((len != 1) || (*op != '?')) ) FAIL("*+ operand could be empty"); *flagp = ((len != 1) || (*op != '+')) ? (WORST | SPSTART) : (WORST | HASWIDTH); if ((len == 1) && (*op == '*') && (flags & SIMPLE)) reginsert(STAR, ret); else if ((len == 1) && (*op == '*')) { /* Emit x* as (x&|), where & means "self". */ reginsert(BRANCH, ret); /* Either x */ regoptail(ret, regnode(BACK)); /* and loop */ regoptail(ret, ret); /* back */ regtail(ret, regnode(BRANCH)); /* or */ regtail(ret, regnode(NOTHING)); /* null. */ } else if ((len == 1) && (*op == '+') && (flags & SIMPLE)) reginsert(PLUS, ret); else if ((len == 1) && (*op == '+')) { /* Emit x+ as x(&|), where & means "self". */ next = regnode(BRANCH); /* Either */ regtail(ret, next); regtail(regnode(BACK), ret); /* loop back */ regtail(next, regnode(BRANCH)); /* or */ regtail(ret, regnode(NOTHING)); /* null. */ } else if ((len == 1) && (*op == '?')) { /* Emit x? as (x|) */ reginsert(BRANCH, ret); /* Either x */ regtail(ret, regnode(BRANCH)); /* or */ next = regnode(NOTHING); /* null. */ regtail(ret, next); regoptail(ret, next); } regparse += INCRLEN(len); if (ISMULT(regparse)) FAIL("nested *?+"); return(ret); }
/* - regpiece - something followed by possible [*+?] * * Note that the branching code sequences used for ? and the general cases * of * and + are somewhat optimized: they use the same NOTHING node as * both the endmarker for their branch list and the body of the last branch. * It might seem that this node could be dispensed with entirely, but the * endmarker role is not redundant. */ static char *regpiece( int *flagp ) { char *ret, op, *next; int flags; ret = regatom( &flags ); if( ret == NULL ) { return( NULL ); } op = *regparse; if( !ISMULT( op ) ) { *flagp = flags; return( ret ); } if( !( flags & HASWIDTH ) && op != '?' ) { FAIL( ERR_RE_EMPTY_OPERAND ); } *flagp = ( op != '+' ) ? ( WORST | SPSTART ) : ( WORST | HASWIDTH ); if( op == '*' && ( flags & SIMPLE ) ) { reginsert( STAR, ret ); } else if( op == '*' ) { /* Emit x* as (x&|), where & means "self". */ reginsert( BRANCH, ret ); /* Either x */ regoptail( ret, regnode( BACK ) ); /* and loop */ regoptail( ret, ret ); /* back */ regtail( ret, regnode( BRANCH ) ); /* or */ regtail( ret, regnode( NOTHING ) ); /* null. */ } else if( op == '+' && ( flags & SIMPLE ) ) { reginsert( PLUS, ret ); } else if( op == '+' ) { /* Emit x+ as x(&|), where & means "self". */ next = regnode( BRANCH ); /* Either */ regtail( ret, next ); regtail( regnode( BACK ), ret ); /* loop back */ regtail( next, regnode( BRANCH ) ); /* or */ regtail( ret, regnode( NOTHING ) ); /* null. */ } else if( op == '?' ) { /* Emit x? as (x|) */ reginsert( BRANCH, ret ); /* Either x */ regtail( ret, regnode( BRANCH ) ); /* or */ next = regnode( NOTHING ); /* null. */ regtail( ret, next ); regoptail( ret, next ); } regparse++; if( ISMULT( *regparse ) ) { FAIL( ERR_RE_NESTED_OPERAND ); } return( ret ); }
/*
 * regpiece - something followed by possible [*+?]
 *
 * Parses one atom and, if the next pattern character satisfies ISMULT,
 * wraps the atom in the node sequence for that operator.  The parse
 * cursor is advanced past the operator with CMnext().
 *
 * Note that the branching code sequences used for ? and the general cases
 * of * and + are somewhat optimized:  they use the same NOTHING node as
 * both the endmarker for their branch list and the body of the last branch.
 * It might seem that this node could be dispensed with entirely, but the
 * endmarker role is not redundant.
 *
 * Returns the atom's node, or NULL if regatom() fails.  The _FAIL()
 * paths presumably report the message and return NULL -- _FAIL is
 * defined elsewhere, confirm.
 */
static char *
regpiece(i4 *flagp)
{
	char *atom;
	char *branch;
	char op;
	i4 atomflags;

	atom = regatom(&atomflags);
	if (atom == NULL)
		return(NULL);

	op = *regparse;
	if (!ISMULT(op)) {
		*flagp = atomflags;
		return(atom);
	}

	if (op != '?' && !(atomflags & HASWIDTH)) {
		_FAIL("*+ operand could be empty");
	}

	*flagp = (op == '+') ? (WORST|HASWIDTH) : (WORST|SPSTART);

	switch (op) {
	case '*':
		if (atomflags & SIMPLE) {
			reginsert(STAR, atom);
		} else {
			/* Emit x* as (x&|), where & means "self". */
			reginsert(BRANCH, atom);		/* Either x */
			regoptail(atom, regnode(BACK));		/* and loop */
			regoptail(atom, atom);			/* back */
			regtail(atom, regnode(BRANCH));		/* or */
			regtail(atom, regnode(NOTHING));	/* null. */
		}
		break;
	case '+':
		if (atomflags & SIMPLE) {
			reginsert(PLUS, atom);
		} else {
			/* Emit x+ as x(&|), where & means "self". */
			branch = regnode(BRANCH);		/* Either */
			regtail(atom, branch);
			regtail(regnode(BACK), atom);		/* loop back */
			regtail(branch, regnode(BRANCH));	/* or */
			regtail(atom, regnode(NOTHING));	/* null. */
		}
		break;
	case '?':
		/* Emit x? as (x|) */
		reginsert(BRANCH, atom);			/* Either x */
		regtail(atom, regnode(BRANCH));			/* or */
		branch = regnode(NOTHING);			/* null. */
		regtail(atom, branch);
		regoptail(atom, branch);
		break;
	default:
		break;
	}

	CMnext( regparse );
	if (ISMULT(*regparse)) {
		_FAIL("nested *?+");
	}

	return(atom);
}
/* - regpiece - something followed by possible [*+?] * * Note that the branching code sequence used for ? and the general cases of * * and + are somewhat optimized: they use the same NOTHING node as both the * endmarker for their branch list and the body of the last branch. It might * seem that this node could be dispensed with entirely, but the endmarker * role is not redundant. */ static char *regpiece (int * flagp) { register char *ret; register short op; register char *nxt; int flags; ret = regatom(&flags); if (ret == (char *) NULL) return ((char *) NULL); op = *regparse; if (!ISMULT(op)) { *flagp = flags; return (ret); } if (!(flags & HASWIDTH) && op != QMARK) FAIL("*+ operand could be empty\n"); *flagp = (op != PLUSS) ? (WORST | SPSTART) : (WORST | HASWIDTH); if (op == ASTERIX && (flags & SIMPLE)) reginsert(STAR, ret); else if (op == ASTERIX) { /* Emit x* as (x&|), where & means "self". */ reginsert(BRANCH, ret); /* Either x */ regoptail(ret, regnode(BACK)); /* and loop */ regoptail(ret, ret); /* back */ regtail(ret, regnode(BRANCH)); /* or */ regtail(ret, regnode(NOTHING)); /* null. */ } else if (op == PLUSS && (flags & SIMPLE)) reginsert(PLUS, ret); else if (op == PLUSS) { /* Emit x+ as x(&|), where & means "self". */ nxt = regnode(BRANCH); /* Either */ regtail(ret, nxt); regtail(regnode(BACK), ret); /* loop back */ regtail(nxt, regnode(BRANCH)); /* or */ regtail(ret, regnode(NOTHING)); /* null. */ } else if (op == QMARK) { /* Emit x? as (x|) */ reginsert(BRANCH, ret); /* Either x */ regtail(ret, regnode(BRANCH)); /* or */ nxt = regnode(NOTHING); /* null. */ regtail(ret, nxt); regoptail(ret, nxt); } regparse++; if (ISMULT(*regparse)) FAIL("nested *?+\n"); return (ret); }
// regpat - create a node for a literal pattern fragment
//
// Obtains a node from regnode(expr) and records the pattern pointer,
// its length and the parent node; minimum and maximum are both set to 1.
//
// Returns the new node, or regnode's null result unchanged.
struct Node *regpat (struct Expr *expr, uchar *pat, int len, struct Node *parent)
{
struct Node *node = regnode (expr);

    if( !node )
        return node;

    node->minimum = 1;
    node->maximum = 1;
    node->typelen = len;
    node->type->pattern = pat;
    node->parent = parent;

    return node;
}
/*
 - regatom - the lowest level
 *
 * Consumes one atom at the parse cursor (regparse) and emits the node(s)
 * for it.  *flagp is set to WORST and then gains HASWIDTH / SIMPLE /
 * SPSTART bits depending on the atom.
 *
 * Optimization:  gobbles an entire sequence of ordinary characters so that
 * it can turn them into a single node, which is smaller to store and
 * faster to run.  Backslashed characters are exceptions, each becoming a
 * separate node; the code is simpler that way and it's not worth fixing.
 *
 * Returns the first node emitted, or NULL when a parenthesised
 * sub-expression fails.  FAIL() is defined elsewhere; it presumably
 * reports the message and returns NULL -- confirm.
 */
static char *
regatom(int *flagp)
{
	register char *ret;
	int flags;

	*flagp = WORST;		/* Tentatively. */

	/* The cursor is advanced past the dispatch character here. */
	switch (*regparse++) {
	case '^':
		ret = regnode(BOL);
		break;
	case '$':
		ret = regnode(EOL);
		break;
	case '.':
		ret = regnode(ANY);
		*flagp |= HASWIDTH|SIMPLE;
		break;
	case '[': {
			register int chclass;
			register int chclassend;

			if (*regparse == '^') {	/* Complement of range. */
				ret = regnode(ANYBUT);
				regparse++;
			} else {
				ret = regnode(ANYOF);
			}
			/* A leading ']' or '-' is taken literally. */
			if (*regparse == ']' || *regparse == '-') {
				regc(*regparse++);
			}
			while (*regparse != '\0' && *regparse != ']') {
				if (*regparse == '-') {
					regparse++;
					if (*regparse == ']' || *regparse == '\0') {
						/* Trailing '-' is a literal. */
						regc('-');
					} else {
						/*
						 * regparse-2 is the character before the
						 * '-'; it has already been emitted, so the
						 * range starts one past it.
						 */
						chclass = UCHARAT(regparse-2)+1;
						chclassend = UCHARAT(regparse);
						if (chclass > chclassend+1) {
							FAIL("invalid [] range");
						}
						for (; chclass <= chclassend; chclass++) {
							regc(chclass);
						}
						regparse++;
					}
				} else if (*regparse == '\\') {
					/* Escapes recognised inside a class. */
					switch(*++regparse) {
					case 'n' :
						regc('\n');
						regparse++;
						break;
					case 't' :
						regc('\t');
						regparse++;
						break;
					case ']' :
						regc(']');
						regparse++;
						break;
					case '-' :
						regc('-');
						regparse++;
						break;
					case '\\' :
						regc('\\');
						regparse++;
						break;
					default :
						/*
						 * Unknown escape: step back and emit
						 * the backslash itself; the following
						 * character is handled by the next
						 * loop iteration.
						 */
						regparse--;
						regc(*regparse++);
					}
				} else {
					regc(*regparse++);
				}
			}
			regc('\0');	/* terminate the emitted character list */
			if (*regparse != ']') {
				FAIL("unmatched []");
			}
			regparse++;
			*flagp |= HASWIDTH|SIMPLE;
		}
		break;
	case '(':
		ret = reg(1, &flags);
		if (ret == NULL) {
			return(NULL);
		}
		*flagp |= flags&(HASWIDTH|SPSTART);
		break;
	case '\0':
	case '|':
	case ')':
		FAIL("internal urp");	/* Supposed to be caught earlier. */
		break;
	case '?':
	case '+':
	case '*':
	case '{':
		FAIL("?+*{ follows nothing");
		break;
	case '\\':
		if (*regparse == '\0') {
			FAIL("trailing \\");
		}
		/* Dispatch on the escaped character; it is consumed below. */
		switch(*regparse) {
		case '<':
			ret = regnode(BEGWORD);
			break;
		case '>':
			ret = regnode(ENDWORD);
			break;
		case 'd':
			ret = regnode(DIGIT);
			*flagp |= (HASWIDTH|SIMPLE);
			break;
		case 'D':
			ret = regnode(NDIGIT);
			*flagp |= (HASWIDTH|SIMPLE);
			break;
		case 'n' :
			ret = regnode(EXACTLY);
			regc('\n');
			regc('\0');
			*flagp |= (HASWIDTH|SIMPLE);
			break;
		case 'p':
			ret = regnode(PRINT);
			*flagp |= HASWIDTH|SIMPLE;
			break;
		case 'P':
			ret = regnode(NPRINT);
			*flagp |= HASWIDTH|SIMPLE;
			break;
		case 's':
			ret = regnode(WHITESP);
			*flagp |= HASWIDTH|SIMPLE;
			break;
		case 'S':
			ret = regnode(NWHITESP);
			*flagp |= HASWIDTH|SIMPLE;
			break;
		case 't' :
			ret = regnode(EXACTLY);
			regc('\t');
			regc('\0');
			*flagp |= (HASWIDTH|SIMPLE);
			break;
		case 'w':
			ret = regnode(ALNUM);
			*flagp |= HASWIDTH|SIMPLE;
			break;
		case 'W':
			ret = regnode(NALNUM);
			*flagp |= HASWIDTH|SIMPLE;
			break;
		default :
			/* Any other escaped character matches itself. */
			ret = regnode(EXACTLY);
			regc(*regparse);
			regc('\0');
			*flagp |= HASWIDTH|SIMPLE;
		}
		regparse++;
		break;
	default: {
			register int len;
			register char ender;

			/* Undo the increment done by the switch expression. */
			regparse--;
			/* Length of the run of characters not listed in META. */
			len = strcspn(regparse, META);
			if (len <= 0) {
				FAIL("internal disaster");
			}
			ender = *(regparse+len);
			if (len > 1 && ISMULT(ender)) {
				len--;		/* Back off clear of ?+* operand. */
			}
			*flagp |= HASWIDTH;
			if (len == 1) {
				*flagp |= SIMPLE;
			}
			ret = regnode(EXACTLY);
			while (len > 0) {
				regc(*regparse++);
				len--;
			}
			regc('\0');
		}
		break;
	}

	return(ret);
}
/* * reg - regular expression, i.e. main body or parenthesized thing * * Caller must absorb opening parenthesis. * * Combining parenthesis handling with the base level of regular expression * is a trifle forced, but the need to tie the tails of the branches to what * follows makes it hard to avoid. */ static char *reg( int paren, int *flagp ) { char *ret, *br, *ender; int flags; char parno = 0; *flagp = HASWIDTH; /* Tentatively. */ /* Make an OPEN node, if parenthesized. */ if( paren ) { if( regnpar >= NSUBEXP ) { FAIL( ERR_RE_TOO_MANY_ROUND_BRACKETS ); } parno = regnpar; regnpar++; ret = regnode( OPEN + parno ); } else { ret = NULL; } /* Pick up the branches, linking them together. */ br = regbranch( &flags ); if( br == NULL ) { return( NULL ); } if( ret != NULL ) { regtail( ret, br ); /* OPEN -> first. */ } else { ret = br; } if( !( flags & HASWIDTH ) ) { *flagp &= ~HASWIDTH; } *flagp |= flags & SPSTART; while( *regparse == '|' ) { regparse++; br = regbranch( &flags ); if( br == NULL ) { return( NULL ); } regtail( ret, br ); /* BRANCH -> BRANCH. */ if( !( flags & HASWIDTH ) ) { *flagp &= ~HASWIDTH; } *flagp |= flags & SPSTART; } /* Make a closing node, and hook it on the end. */ ender = regnode( ( paren ) ? CLOSE + parno : END ); regtail( ret, ender ); /* Hook the tails of the branches to the closing node. */ for( br = ret; br != NULL; br = regnext( br ) ) { regoptail( br, ender ); } /* Check for proper termination. */ if( paren && *regparse++ != ')' ) { FAIL( ERR_RE_UNMATCHED_ROUND_BRACKETS ); } else if( !paren && *regparse != '\0' ) { if( *regparse == ')' ) { FAIL( ERR_RE_UNMATCHED_ROUND_BRACKETS ); } else { FAIL( ERR_RE_INTERNAL_FOULUP ); /* "Can't happen". */ } } return( ret ); }
// regatom - the lowest level.
//
// Consumes one atom at the parse cursor and emits the node(s) for it.
// A run of ordinary characters is gathered into a single EXACTLY node;
// a backslashed character becomes its own EXACTLY node.
//
// *flagp is set to WORST and then gains HASWIDTH / SIMPLE / SPSTART
// bits depending on the atom.
//
// Returns the first node emitted, or NULL (after a TRACE0 message, or
// when a parenthesised sub-expression fails).
TCHAR *CRegExp::regatom(int *flagp)
{
	TCHAR *node;
	int subFlags;

	*flagp = WORST;		// Tentatively.

	// The cursor moves past the dispatch character right away.
	const TCHAR lead = *regparse++;

	switch (lead) {
	case _T('^'):
		node = regnode(BOL);
		break;

	case _T('$'):
		node = regnode(EOL);
		break;

	case _T('.'):
		node = regnode(ANY);
		*flagp |= HASWIDTH|SIMPLE;
		break;

	case _T('['): {
		int c;

		if (*regparse != _T('^'))
			node = regnode(ANYOF);
		else {
			// Complement of range.
			node = regnode(ANYBUT);
			regparse++;
		}

		// A leading ']' or '-' is taken literally.
		c = *regparse;
		if (c == _T(']') || c == _T('-')) {
			regc(c);
			regparse++;
		}

		for (;;) {
			c = *regparse++;
			if (c == _T('\0') || c == _T(']'))
				break;

			if (c != _T('-')) {
				regc(c);
				continue;
			}

			// Saw '-': peek at what follows without consuming it.
			c = *regparse;
			if (c == _T(']') || c == _T('\0')) {
				regc(_T('-'));
				continue;
			}

			// regparse-2 is the character before the '-'.
			int lo = (unsigned) (TCHAR)*(regparse-2);
			int hi = (unsigned) (TCHAR)c;
			if (lo > hi) {
				TRACE0("invalid [] range\n");
				return NULL;
			}
			// The low end was already emitted; start one past it.
			for (lo++; lo <= hi; lo++)
				regc(lo);
			regparse++;
		}

		regc(_T('\0'));
		if (c != _T(']')) {
			TRACE0("unmatched []\n");
			return NULL;
		}
		*flagp |= HASWIDTH|SIMPLE;
		break;
	}

	case _T('('):
		node = reg(1, &subFlags);
		if (node == NULL)
			return(NULL);
		*flagp |= subFlags & (HASWIDTH|SPSTART);
		break;

	case _T('\0'):
	case _T('|'):
	case _T(')'):
		// supposed to be caught earlier
		TRACE0("internal error: \\0|) unexpected\n");
		return NULL;

	case _T('?'):
	case _T('+'):
	case _T('*'):
		TRACE0("?+* follows nothing\n");
		return NULL;

	case _T('\\'):
		if (*regparse == _T('\0')) {
			TRACE0("trailing \\\n");
			return NULL;
		}
		node = regnode(EXACTLY);
		regc(*regparse++);
		regc(_T('\0'));
		*flagp |= HASWIDTH|SIMPLE;
		break;

	default: {
		// Step back onto the first ordinary character.
		regparse--;

		size_t run = _tcscspn(regparse, META);
		if (run == 0) {
			TRACE0("internal error: strcspn 0\n");
			return NULL;
		}

		const TCHAR follower = *(regparse+run);
		if (run > 1 && ISREPN(follower))
			run--;		// Back off clear of ?+* operand.

		*flagp |= HASWIDTH;
		if (run == 1)
			*flagp |= SIMPLE;

		node = regnode(EXACTLY);
		for (size_t i = 0; i < run; ++i)
			regc(*regparse++);
		regc(_T('\0'));
		break;
	}
	}

	return(node);
}
// regpiece - something followed by possible [*+?].
//
// Parses one atom and, if the next pattern character satisfies ISREPN,
// wraps the atom in the node sequence for that operator.  The general
// forms of *, + and ? share one NOTHING node as both the end marker of
// their branch list and the body of the last branch.
//
// *flagp receives the atom's flags when no operator follows; otherwise
// WORST|SPSTART for '*', WORST|SPSTART|HASWIDTH for '+', WORST for '?'.
//
// Returns the atom's node, or NULL if regatom() fails, the operand of
// '*' or '+' could be empty, or operators are nested.
TCHAR *CRegExp::regpiece(int *flagp)
{
	int atomFlags;
	TCHAR *branch;

	TCHAR *atom = regatom(&atomFlags);
	if (atom == NULL)
		return(NULL);

	const TCHAR op = *regparse;
	if (!ISREPN(op)) {
		*flagp = atomFlags;
		return(atom);
	}

	if (op != _T('?') && !(atomFlags & HASWIDTH)) {
		TRACE0("*+ operand could be empty\n");
		return NULL;
	}

	const bool simpleAtom = (atomFlags & SIMPLE) != 0;

	switch (op) {
	case _T('*'):
		*flagp = WORST|SPSTART;
		if (simpleAtom)
			reginsert(STAR, atom);
		else {
			// Emit x* as (x&|), where & means "self".
			reginsert(BRANCH, atom);		// Either x
			regoptail(atom, regnode(BACK));		// and loop
			regoptail(atom, atom);			// back
			regtail(atom, regnode(BRANCH));		// or
			regtail(atom, regnode(NOTHING));	// null.
		}
		break;

	case _T('+'):
		*flagp = WORST|SPSTART|HASWIDTH;
		if (simpleAtom)
			reginsert(PLUS, atom);
		else {
			// Emit x+ as x(&|), where & means "self".
			branch = regnode(BRANCH);		// Either
			regtail(atom, branch);
			regtail(regnode(BACK), atom);		// loop back
			regtail(branch, regnode(BRANCH));	// or
			regtail(atom, regnode(NOTHING));	// null.
		}
		break;

	case _T('?'):
		*flagp = WORST;
		// Emit x? as (x|)
		reginsert(BRANCH, atom);			// Either x
		regtail(atom, regnode(BRANCH));			// or
		branch = regnode(NOTHING);			// null.
		regtail(atom, branch);
		regoptail(atom, branch);
		break;

	default:
		break;
	}

	regparse++;
	if (ISREPN(*regparse)) {
		TRACE0("nested *?+\n");
		return NULL;
	}

	return(atom);
}
/*
 - regatom - the lowest level
 *
 * Consumes one atom at the parse cursor (regparse) and emits the node(s)
 * for it.  *flagp is set to WORST and then gains HASWIDTH / SIMPLE /
 * SPSTART bits depending on the atom.
 *
 * Optimization:  gobbles an entire sequence of ordinary characters so that
 * it can turn them into a single node, which is smaller to store and
 * faster to run.
 *
 * Returns the first node emitted, or NULL when a parenthesised
 * sub-expression fails.  FAIL() is defined elsewhere; it presumably
 * reports the message and returns NULL -- confirm.
 */
static char *regatom (int * flagp)
{
    register char *ret;
    int flags;

    *flagp = WORST;		/* Tentatively. */

    /* The cursor is advanced past the dispatch element here. */
    switch (*regparse++) {
    case CARET:
	ret = regnode(BOL);
	break;
    case DOLLAR:
	ret = regnode(EOL);
	break;
    case DOT:
	ret = regnode(ANY);
	*flagp |= HASWIDTH | SIMPLE;
	break;
    case LSHBRAC:
	ret = regnode(WORDSTART);
	break;
    case RSHBRAC:
	ret = regnode(WORDEND);
	break;
    case LSQBRAC:{
	    register int classs;
	    register int classend;

	    if (*regparse == CARET) {	/* Complement of range. */
		ret = regnode(ANYBUT);
		regparse++;
	    } else
		ret = regnode(ANYOF);
	    /* A leading RSQBRAC or '-' is taken literally. */
	    if (*regparse == RSQBRAC || *regparse == '-')
		regc(*regparse++);
	    while (*regparse != '\0' && *regparse != RSQBRAC) {
		if (*regparse == '-') {
		    regparse++;
		    if (*regparse == RSQBRAC || *regparse == '\0')
			regc('-');	/* trailing '-' is a literal */
		    else {
			/*
			 * regparse - 2 is the element before the '-'; it
			 * has already been emitted, so the range starts
			 * one past it.  Both ends are masked with CHARBITS.
			 */
			classs = (CHARBITS & *(regparse - 2)) + 1;
			classend = (CHARBITS & *(regparse));
			if (classs > classend + 1)
			    FAIL("invalid [] range\n");
			for (; classs <= classend; classs++)
			    regc(classs);
			regparse++;
		    }
		} else
		    regc(*regparse++);
	    }
	    regc('\0');		/* terminate the emitted character list */
	    if (*regparse != RSQBRAC)
		FAIL("unmatched []\n");
	    regparse++;
	    *flagp |= HASWIDTH | SIMPLE;
	}
	break;
    case LBRAC:
	ret = reg(1, &flags);
	if (ret == (char *) NULL)
	    return ((char *) NULL);
	*flagp |= flags & (HASWIDTH | SPSTART);
	break;
    case '\0':
    case OR_OP:
    case RBRAC:
	FAIL("internal urp\n");	/* Supposed to be caught earlier. */
	break;
    case ASTERIX:
	FAIL("* follows nothing\n");
	break;
    case PLUSS:
	FAIL("+ follows nothing\n");
	break;
    case QMARK:
	FAIL("? follows nothing\n");
	break;
    default:{
	    register int len;
	    register short ender;

	    /* Undo the increment done by the switch expression. */
	    regparse--;
	    /*
	     * Count the run of elements that are non-zero, do not carry
	     * the SPECIAL bit and are not RSQBRAC.
	     */
	    for (len = 0; regparse[len] &&
		 !(regparse[len] & SPECIAL) && regparse[len] != RSQBRAC; len++);
	    if (len <= 0) {
		FAIL("unexpected ]\n");
	    }
	    ender = *(regparse + len);
	    if (len > 1 && ISMULT(ender))
		len--;		/* Back off clear of ?+* operand. */
	    *flagp |= HASWIDTH;
	    if (len == 1)
		*flagp |= SIMPLE;
	    ret = regnode(EXACTLY);
	    while (len > 0) {
		regc(*regparse++);
		len--;
	    }
	    regc('\0');
	}
	break;
    }

    return (ret);
}
// reg - regular expression, i.e. main body or parenthesized thing.
//
// The caller must already have consumed the opening parenthesis.
//
// paren  - non-zero when parsing a parenthesised sub-expression.
// flagp  - receives HASWIDTH unless some branch lacks it, plus SPSTART
//          if any branch reports it.
//
// Returns the first node of the expression, or NULL on any error (too
// many parentheses, a failing branch, or improper termination).
//
// The node pointers are TCHAR* to match regnode()/regbranch() as used
// by the other CRegExp parsing routines; declaring them as plain char*
// only compiles when TCHAR happens to be char.
TCHAR *CRegExp::reg(int paren, int *flagp)
{
	TCHAR *ret = NULL;
	TCHAR *br;
	TCHAR *ender;
	int parno = 0;		// only meaningful when paren is set
	int flags;

	*flagp = HASWIDTH;	// Tentatively.

	if (paren) {
		// Make an OPEN node.
		if (regnpar >= NSUBEXP) {
			TRACE1("Too many (). NSUBEXP is set to %d\n", NSUBEXP );
			return NULL;
		}
		parno = regnpar;
		regnpar++;
		ret = regnode(OPEN+parno);
	}

	// Pick up the branches, linking them together.
	br = regbranch(&flags);
	if (br == NULL)
		return(NULL);
	if (paren)
		regtail(ret, br);	// OPEN -> first.
	else
		ret = br;
	*flagp &= ~(~flags&HASWIDTH);	// Clear bit if bit 0.
	*flagp |= flags&SPSTART;
	while (*regparse == _T('|')) {
		regparse++;
		br = regbranch(&flags);
		if (br == NULL)
			return(NULL);
		regtail(ret, br);	// BRANCH -> BRANCH.
		*flagp &= ~(~flags&HASWIDTH);
		*flagp |= flags&SPSTART;
	}

	// Make a closing node, and hook it on the end.
	ender = regnode((paren) ? CLOSE+parno : END);
	regtail(ret, ender);

	// Hook the tails of the branches to the closing node.
	for (br = ret; br != NULL; br = regnext(br))
		regoptail(br, ender);

	// Check for proper termination.  Note that the closing ')' is
	// consumed only on the parenthesised path.
	if (paren && *regparse++ != _T(')')) {
		TRACE0("unterminated ()\n");
		return NULL;
	} else if (!paren && *regparse != _T('\0')) {
		if (*regparse == _T(')')) {
			TRACE0("unmatched ()\n");
			return NULL;
		} else {
			TRACE0("internal error: junk on end\n");
			return NULL;
		}
		// NOTREACHED
	}

	return(ret);
}
/*
 * regatom - the lowest level
 *
 * Consumes one atom at the parse cursor (regparse) and emits the node(s)
 * for it.  The cursor is advanced with CMnext().  *flagp is set to WORST
 * and then gains HASWIDTH / SIMPLE / SPSTART bits depending on the atom.
 *
 * Optimization:  gobbles an entire sequence of ordinary characters so that
 * it can turn them into a single node, which is smaller to store and
 * faster to run.  Backslashed characters are exceptions, each becoming a
 * separate node; the code is simpler that way and it's not worth fixing.
 *
 * Returns the first node emitted, or NULL when a parenthesised
 * sub-expression fails.  _FAIL() is defined elsewhere; it presumably
 * reports the message and returns NULL -- confirm.
 */
static char *
regatom(i4 *flagp)
{
	register char *ret;
	i4 flags;
	char null_byte = '\0';

	*flagp = WORST;		/* Tentatively. */

	switch (*regparse) {
	case '^':
		CMnext( regparse );
		ret = regnode(BOL);
		break;
	case '$':
		CMnext( regparse );
		ret = regnode(EOL);
		break;
	case '.':
		CMnext( regparse );
		ret = regnode(ANY);
		*flagp |= HASWIDTH|SIMPLE;
		break;
	case '[': {
			char *range_start = NULL;
			/*
			 * Initialised so that the checks below never read
			 * indeterminate values when a range has no recorded
			 * start (e.g. a '-' right after a leading literal).
			 */
			bool double_start = FALSE;
			u_char first_u1 = 0, last_u1;

			CMnext( regparse );
			if (*regparse == '^') {	/* Complement of range. */
				ret = regnode(ANYBUT);
				CMnext( regparse );
			} else
				ret = regnode(ANYOF);
			/* A leading ']' or '-' is taken literally. */
			if (*regparse == ']' || *regparse == '-') {
				regc( regparse );
				CMnext( regparse );
			}
			while (*regparse != '\0' && *regparse != ']') {
				if (*regparse == '-') {
					char range_op = '-';

					CMnext( regparse );
					if( *regparse == ']' || *regparse == '\0' )
						regc( &range_op );
					else {
						bool invalid = FALSE;
						bool double_end;
						i4 next_ch;
						u_char member;

						if( range_start == NULL )
							invalid = TRUE;

						double_end = CMdbl1st( regparse );

						/* Both ends must be the same width. */
						if( !invalid &&
							double_end && !double_start )
							invalid = TRUE;

						if( !invalid &&
							double_start && !double_end )
							invalid = TRUE;

						if( !invalid &&
							CMcmpcase( range_start, regparse ) > 0 )
							invalid = TRUE;

						if( double_start )
							_FAIL("don't know how to support character classes containing double-byte ranges");

						if( invalid )
							_FAIL("invalid [] range");

						/* no double-byte ranges! */

						/*
						** Emit every byte of the range.  The
						** counter is wider than u_char: a
						** u_char counter can never exceed a
						** range end of 0xFF, so it would wrap
						** to 0 and loop forever.
						*/
						last_u1 = UCHARAT(regparse);
						for( next_ch = first_u1;
							next_ch <= (i4) last_u1; next_ch++ )
						{
							member = (u_char) next_ch;
							regc( (char *) &member );
						}

						/*
						** Leave first_u1 one past the range
						** end, saturating at the largest
						** u_char value instead of wrapping.
						*/
						if( next_ch <= (i4) (u_char) ~0 )
							first_u1 = (u_char) next_ch;
						else
							first_u1 = last_u1;

						CMnext( regparse );
					}
				} else {
					/* Remember a possible range start. */
					range_start = regparse;
					if( CMdbl1st( range_start ) )
					{
						double_start = TRUE;
					}
					else
					{
						double_start = FALSE;
						first_u1 = UCHARAT(range_start);
					}
					regc( regparse );
					CMnext( regparse );
				}
			}
			regc( &null_byte );
			if (*regparse != ']')
				_FAIL("unmatched []");
			CMnext( regparse );
			*flagp |= HASWIDTH|SIMPLE;
		}
		break;
	case '(':
		CMnext( regparse );
		ret = reg(1, &flags);
		if (ret == NULL)
			return(NULL);
		*flagp |= flags&(HASWIDTH|SPSTART);
		break;
	case '\0':
	case '|':
	case ')':
		CMnext( regparse );
		_FAIL("internal urp");	/* Supposed to be caught earlier. */
		break;
	case '?':
	case '+':
	case '*':
		CMnext( regparse );
		_FAIL("?+* follows nothing");
		break;
	case '\\':
		CMnext( regparse );
		if (*regparse == '\0')
			_FAIL("trailing \\");
		/* The escaped character becomes its own EXACTLY node. */
		ret = regnode(EXACTLY);
		regc( regparse );
		CMnext( regparse );
		regc( &null_byte );
		*flagp |= HASWIDTH|SIMPLE;
		break;
	default: {
			register i4 len;
			register char ender;

			len = my_strcspn(regparse, META);
			if (len <= 0)
				_FAIL("internal disaster");
			ender = *(regparse+len);
			if (len > 1 && ISMULT(ender))
				len--;		/* Back off clear of ?+* operand. */
			*flagp |= HASWIDTH;
			if (len == 1)
				*flagp |= SIMPLE;
			ret = regnode(EXACTLY);
			while (len > 0) {
				regc( regparse );
				CMbytedec( len, regparse );
				CMnext( regparse );
			}
			regc( &null_byte );
		}
		break;
	}

	return(ret);
}
/* * regatom - the lowest level * * Optimization: gobbles an entire sequence of ordinary characters so that * it can turn them into a single node, which is smaller to store and * faster to run. Backslashed characters are exceptions, each becoming a * separate node; the code is simpler that way and it's not worth fixing. */ static char *regatom( int *flagp ) { char *ret; int flags; *flagp = WORST; /* Tentatively. */ switch( *regparse++ ) { case '~': if( *regparse == 0 ) { FAIL( ERR_RE_INVALID_CASETOGGLE ); } ret = regnode( CASEI ); break; case '@': if( *regparse == 0 ) { FAIL( ERR_RE_INVALID_CASETOGGLE ); } ret = regnode( NOCASEI ); break; case '^': ret = regnode( BOL ); break; case '$': ret = regnode( EOL ); break; case '.': ret = regnode( ANY ); *flagp |= HASWIDTH | SIMPLE; break; case '[': { if( *regparse == '^' ) { /* Complement of range. */ ret = regnode( ANYBUT ); regparse++; } else { ret = regnode( ANYOF ); } if( *regparse == ']' || *regparse == '-' ) { regc( *regparse++ ); } while( *regparse != '\0' && *regparse != ']' ) { if( *regparse == '-' ) { regparse++; if( *regparse == ']' || *regparse == '\0' ) { regc( '-' ); } else { int class; int classend; class = UCHARAT( regparse - 2 ) + 1; classend = UCHARAT( regparse ); if( class > classend + 1 ) { FAIL( ERR_RE_INVALID_SB_RANGE ); } for( ; class <= classend; class++ ) { regc( (char)class ); } regparse++; } } else { if( *regparse == '\\' && *( regparse + 1 ) == 't' && REALTABS ) { regparse += 2; regc( '\t' ); } else { regc( *regparse++ ); } } } regc( '\0' ); if( *regparse != ']' ) { FAIL( ERR_RE_UNMATCHED_SQUARE_BRACKET ); } regparse++; *flagp |= HASWIDTH | SIMPLE; }
/*
 - reg - regular expression, i.e. main body or parenthesized thing
 *
 * Caller must absorb opening parenthesis.
 *
 * Combining parenthesis handling with the base level of regular expression
 * is a trifle forced, but the need to tie the tails of the branches to what
 * follows makes it hard to avoid.
 *
 * A group that starts with "?:" is non-capturing: the two characters are
 * skipped and parno is -1; otherwise preg->re_nsub is incremented and
 * used as the group number.
 *
 * *flagp starts as HASWIDTH and loses that bit if any branch lacks it;
 * SPSTART is picked up from any branch.
 *
 * Returns the first node of the expression, or 0 on failure.  On
 * improper termination preg->err is set before returning 0.
 */
static int reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp )
{
	int first = 0;
	int branch;
	int closer;
	int parno = 0;
	int brflags;

	*flagp = HASWIDTH;	/* Tentatively. */

	/* Make an OPEN node, if parenthesized. */
	if (paren) {
		if (preg->regparse[0] != '?' || preg->regparse[1] != ':') {
			parno = ++preg->re_nsub;
		}
		else {
			/* non-capturing paren */
			preg->regparse += 2;
			parno = -1;
		}
		first = regnode(preg, OPEN+parno);
	}

	/* Pick up the branches, linking them together. */
	for (;;) {
		branch = regbranch(preg, &brflags);
		if (branch == 0) {
			return 0;
		}
		if (first == 0) {
			first = branch;
		}
		else {
			/* OPEN -> first, BRANCH -> BRANCH. */
			regtail(preg, first, branch);
		}
		if (!(brflags & HASWIDTH)) {
			*flagp &= ~HASWIDTH;
		}
		*flagp |= brflags & SPSTART;

		if (*preg->regparse != '|') {
			break;
		}
		preg->regparse++;
	}

	/* Make a closing node, and hook it on the end. */
	closer = regnode(preg, (paren) ? CLOSE+parno : END);
	regtail(preg, first, closer);

	/* Hook the tails of the branches to the closing node. */
	branch = first;
	while (branch != 0) {
		regoptail(preg, branch, closer);
		branch = regnext(preg, branch);
	}

	/* Check for proper termination. */
	if (paren) {
		if (*preg->regparse++ != ')') {
			preg->err = REG_ERR_UNMATCHED_PAREN;
			return 0;
		}
	}
	else if (*preg->regparse != '\0') {
		if (*preg->regparse == ')') {
			preg->err = REG_ERR_UNMATCHED_PAREN;
		}
		else {
			preg->err = REG_ERR_JUNK_ON_END;
		}
		return 0;
	}

	return first;
}
/* - reg - regular expression, i.e. main body or parenthesized thing * * Caller must absorb opening parenthesis. * * Combining parenthesis handling with the base level of regular expression * is a trifle forced, but the need to tie the tails of the branches to what * follows makes it hard to avoid. */ char* ossimRegExp::reg (int paren, int *flagp) { char* ret; char* br; char* ender; int parno =0; int flags; *flagp = HASWIDTH; // Tentatively. // Make an OPEN node, if parenthesized. if (paren) { if (regnpar >= NSUBEXP) { //RAISE Error, SYM(ossimRegExp), SYM(Too_Many_Parens), printf ("ossimRegExp::compile(): Too many parentheses.\n"); return 0; } parno = regnpar; regnpar++; ret = regnode(OPEN + parno); } else ret = NULL; // Pick up the branches, linking them together. br = regbranch(&flags); if (br == NULL) return (NULL); if (ret != NULL) regtail(ret, br); // OPEN -> first. else ret = br; if (!(flags & HASWIDTH)) *flagp &= ~HASWIDTH; *flagp |= flags & SPSTART; while (*regparse == '|') { regparse++; br = regbranch(&flags); if (br == NULL) return (NULL); regtail(ret, br); // BRANCH -> BRANCH. if (!(flags & HASWIDTH)) *flagp &= ~HASWIDTH; *flagp |= flags & SPSTART; } // Make a closing node, and hook it on the end. ender = regnode((paren) ? CLOSE + parno : END); regtail(ret, ender); // Hook the tails of the branches to the closing node. for (br = ret; br != NULL; br = regnext(br)) regoptail(br, ender); // Check for proper termination. if (paren && *regparse++ != ')') { //RAISE Error, SYM(ossimRegExp), SYM(Unmatched_Parens), printf ("ossimRegExp::compile(): Unmatched parentheses.\n"); return 0; } else if (!paren && *regparse != '\0') { if (*regparse == ')') { //RAISE Error, SYM(ossimRegExp), SYM(Unmatched_Parens), printf ("ossimRegExp::compile(): Unmatched parentheses.\n"); return 0; } else { //RAISE Error, SYM(ossimRegExp), SYM(Internal_Error), printf ("ossimRegExp::compile(): Internal error.\n"); return 0; } // NOTREACHED } return (ret); }
/*
 * regcomp - compile a pattern into the node tree held in *expr.
 *
 * expr  caller-supplied buffer that receives the compiled expression
 * size  size of that buffer in bytes; must be at least sizeof(*expr)
 * pat   pattern bytes; this function reads only pat[0..len-1] itself
 * len   number of pattern bytes
 *
 * Returns 1 when the whole pattern was consumed, 0 on any failure:
 * buffer too small, a node could not be obtained, a quantifier with
 * nothing to apply to, a stray ']' or ')', or a trailing backslash.
 *
 * NOTE(review): an unclosed '(' or '[' at the end of the pattern is still
 * accepted, as before; confirm whether callers rely on that.
 * NOTE(review): if regminimax() can return 0 the '{' case never advances;
 * verify against its definition.
 */
int regcomp (struct Expr *expr, int size, uchar *pat, int len)
{
	int bnest = 0, off = 0, ch;
	struct Node *node = NULL;	/* node a following quantifier applies to */
	struct Node *prev = NULL;	/* last sibling appended at this level */
	struct Node *parent;		/* group currently being filled */

	/* Reject negative sizes explicitly: the comparison against sizeof is
	 * unsigned and would otherwise let them through. */
	if (size < 0 || (unsigned int)size < sizeof(*expr))
		return 0;

	memset (expr, 0, sizeof(*expr));
	expr->size = size;

	/* Root group node.  Failing to obtain it is a compile failure. */
	parent = regpat(expr, NULL, 0, NULL);
	if (!parent)
		return 0;

	while (off < len) {
		switch (ch = pat[off]) {
		case '{':
			if (node) {
				off += regminimax (expr, pat + off, len - off, node);
				continue;
			}
			return 0;

		case ']':
			return 0;

		case '[':
			node = regpat(expr, pat + off++, 1, parent);
			if (!node)
				return 0;
			bnest = 1;

			/* Extend the node over everything up to the matching ']',
			 * tracking nested brackets. */
			while (off < len && bnest) {
				if (pat[off] == '[')
					bnest++;
				else if (pat[off] == ']')
					--bnest;
				off++;
				node->typelen++;
			}

			regappend (node, prev);
			prev = node;
			continue;

		/* "or" node */
		case '|':
			node = regnode(expr);
			if (!node)
				return 0;
			node->typelen = -1;
			node->parent = parent;
			node->minimum = 0;
			node->maximum = 1;

			/* if already underway, move node chain under new "or" node */
			if (parent->ornode) {
				node->type->child = parent->ornode->next;
				parent->ornode->next = node;
			} else {
				node->type->child = parent->type->child;
				parent->type->child = node;
			}

			parent->ornode = prev = node;

			/* reparent child nodes under new "or" node; node ends up
			 * NULL, so a quantifier directly after '|' is rejected */
			for (node = node->type->child; node; node = node->next)
				node->parent = prev;

			off++;
			continue;

		case '(':
			/* expression node */
			parent = regpat (expr, NULL, 0, parent);
			if (!parent)
				return 0;
			regappend (parent, prev);
			prev = node = NULL;
			off++;
			continue;

		case ')':
			node = parent;
			/* A ')' with no enclosing group left to return to is an
			 * unbalanced pattern; do not dereference a NULL parent. */
			if (!node || !node->parent)
				return 0;
			off++;
			parent = node->parent;

			/* resume appending after the last child of the outer group */
			prev = parent->type->child;
			if (prev)
				while (prev->next)
					prev = prev->next;
			continue;

		case '\\':
			off++;
			/* A backslash as the final byte has nothing to escape. */
			if (off >= len)
				return 0;

			if ((pat[off] >= 'A' && pat[off] <= 'Z')
			 || (pat[off] >= 'a' && pat[off] <= 'z')) {
				node = regspcl(expr, pat + off, parent);
				if (node) {
					regappend (node, prev);
					prev = node;
					off++;
					continue;
				}
			}
			/* not a special escape: take the escaped byte literally */
			/* fallthrough */

		default:
			node = regpat(expr, pat + off++, 1, parent);
			if (!node)
				return 0;
			regappend (node, prev);
			prev = node;
			continue;

		case '?':
			if (node) {
				node->minimum = 0;
				node->maximum = 1;
				off++;
				continue;
			}
			return 0;

		case '+':
			if (node) {
				node->minimum = 1;
				node->maximum = 0x7fffffff;
				off++;
				continue;
			}
			return 0;

		case '*':
			if (node) {
				node->minimum = 0;
				node->maximum = 0x7fffffff;
				off++;
				continue;
			}
			return 0;
		}
	}

	return 1;
}
/*
 - regpiece - something followed by possible [*+?{]
 *
 * Compiles one atom and, if a repetition operator follows it, wraps the
 * atom in a REP/REPX (or REPMIN/REPXMIN when the operator is followed by
 * '?') node carrying the minimum and maximum counts.
 *
 * preg   compile state; regparse is advanced past what is consumed
 * flagp  receives the flags describing the compiled piece
 *
 * Returns the offset of the piece, or 0 on failure with preg->err set for
 * the errors detected here.
 */
static int regpiece(regex_t *preg, int *flagp)
{
	int ret;
	char op;
	int next;
	int flags;
	int min;
	int max;

	ret = regatom(preg, &flags);
	if (ret == 0)
		return 0;

	op = *preg->regparse;
	if (!ISMULT(op)) {
		*flagp = flags;
		return(ret);
	}

	if (!(flags&HASWIDTH) && op != '?') {
		preg->err = REG_ERR_OPERAND_COULD_BE_EMPTY;
		return 0;
	}

	/* Handle braces (counted repetition) by expansion */
	if (op == '{') {
		char *end;
		/* Keep the parsed counts in strtoul's own type until they have
		 * been range-checked.  Narrowing first would let inputs such as
		 * "{-1}" or "{4294967295}" through as negative counts. */
		unsigned long lo;
		unsigned long hi;

		lo = strtoul(preg->regparse + 1, &end, 10);
		if (end == preg->regparse + 1) {
			preg->err = REG_ERR_BAD_COUNT;
			return 0;
		}
		if (*end == '}') {
			hi = lo;
		}
		else {
			preg->regparse = end;
			hi = strtoul(preg->regparse + 1, &end, 10);
			if (*end != '}') {
				preg->err = REG_ERR_UNMATCHED_BRACES;
				return 0;
			}
		}
		if (end == preg->regparse + 1) {
			/* "{n,}": no upper bound given */
			max = MAX_REP_COUNT;
		}
		else if (hi < lo || hi >= 100) {
			preg->err = REG_ERR_BAD_COUNT;
			return 0;
		}
		else {
			max = (int)hi;
		}
		if (lo >= 100) {
			preg->err = REG_ERR_BAD_COUNT;
			return 0;
		}
		min = (int)lo;

		preg->regparse = strchr(preg->regparse, '}');
	}
	else {
		min = (op == '+');
		max = (op == '?' ? 1 : MAX_REP_COUNT);
	}

	/* A '?' directly after the operator selects the *MIN node variants. */
	if (preg->regparse[1] == '?') {
		preg->regparse++;
		next = reginsert(preg, flags & SIMPLE ? REPMIN : REPXMIN, 5, ret);
	}
	else {
		next = reginsert(preg, flags & SIMPLE ? REP: REPX, 5, ret);
	}
	preg->program[ret + 2] = max;
	preg->program[ret + 3] = min;
	preg->program[ret + 4] = 0;

	*flagp = (min) ? (WORST|HASWIDTH) : (WORST|SPSTART);

	if (!(flags & SIMPLE)) {
		/* Non-simple operand: add a BACK node that loops to the start. */
		int back = regnode(preg, BACK);
		regtail(preg, back, ret);
		regtail(preg, next, back);
	}

	preg->regparse++;
	if (ISMULT(*preg->regparse)) {
		preg->err = REG_ERR_NESTED_COUNT;
		return 0;
	}

	return ret;
}
/* - regatom - the lowest level * * Optimization: gobbles an entire sequence of ordinary characters so that * it can turn them into a single node, which is smaller to store and * faster to run. Backslashed characters are exceptions, each becoming a * separate node; the code is simpler that way and it's not worth fixing. */ char* ossimRegExp::regatom (int *flagp) { char* ret; int flags; *flagp = WORST; // Tentatively. switch (*regparse++) { case '^': ret = regnode(BOL); break; case '$': ret = regnode(EOL); break; case '.': ret = regnode(ANY); *flagp |= HASWIDTH | SIMPLE; break; case '[': { int rxpclass; int rxpclassend; if (*regparse == '^') { // Complement of range. ret = regnode(ANYBUT); regparse++; } else ret = regnode(ANYOF); if (*regparse == ']' || *regparse == '-') regc(*regparse++); while (*regparse != '\0' && *regparse != ']') { if (*regparse == '-') { regparse++; if (*regparse == ']' || *regparse == '\0') regc('-'); else { rxpclass = UCHARAT(regparse - 2) + 1; rxpclassend = UCHARAT(regparse); if (rxpclass > rxpclassend + 1) { //RAISE Error, SYM(ossimRegExp), SYM(Invalid_Range), printf ("ossimRegExp::compile(): Invalid range in [].\n"); return 0; } for (; rxpclass <= rxpclassend; rxpclass++) regc(rxpclass); regparse++; } } else regc(*regparse++); } regc('\0'); if (*regparse != ']') { //RAISE Error, SYM(ossimRegExp), SYM(Unmatched_Bracket), printf ("ossimRegExp::compile(): Unmatched [].\n"); return 0; } regparse++; *flagp |= HASWIDTH | SIMPLE; } break; case '(': ret = reg(1, &flags); if (ret == NULL) return (NULL); *flagp |= flags & (HASWIDTH | SPSTART); break; case '\0': case '|': case ')': //RAISE Error, SYM(ossimRegExp), SYM(Internal_Error), printf ("ossimRegExp::compile(): Internal error.\n"); // Never here return 0; case '?': case '+': case '*': //RAISE Error, SYM(ossimRegExp), SYM(No_Operand), printf ("ossimRegExp::compile(): ?+* follows nothing.\n"); return 0; case '\\': if (*regparse == '\0') { //RAISE Error, SYM(ossimRegExp), SYM(Trailing_Backslash), 
printf ("ossimRegExp::compile(): Trailing backslash.\n"); return 0; } ret = regnode(EXACTLY); regc(*regparse++); regc('\0'); *flagp |= HASWIDTH | SIMPLE; break; default: { int len; char ender; regparse--; len = (int)strcspn(regparse, META); if (len <= 0) { //RAISE Error, SYM(ossimRegExp), SYM(Internal_Error), printf ("ossimRegExp::compile(): Internal error.\n"); return 0; } ender = *(regparse + len); if (len > 1 && ISMULT(ender)) len--; // Back off clear of ?+* operand. *flagp |= HASWIDTH; if (len == 1) *flagp |= SIMPLE; ret = regnode(EXACTLY); while (len > 0) { regc(*regparse++); len--; } regc('\0'); } break; } return (ret); }
/*
 - regatom - the lowest level
 *
 * Compiles a single atom: an anchor, '.', a bracketed character class, a
 * parenthesized sub-expression, an escaped character, or a run of ordinary
 * characters.
 *
 * Optimization:  gobbles an entire sequence of ordinary characters so that
 * it can turn them into a single node, which is smaller to store and
 * faster to run.  Backslashed characters are exceptions, each becoming a
 * separate node; the code is simpler that way and it's not worth fixing.
 *
 * flagp receives the flags describing the atom.  Returns the node for the
 * atom; malformed input is reported through FAIL (presumably reporting the
 * message and returning NULL -- the macro is defined outside this block).
 */
static char *
regatom(int *flagp)
{
	register char *ret;
	int flags;

	*flagp = WORST;		/* Tentatively. */

	switch (*regparse++) {
	case '^':
		ret = regnode(BOL);
		break;
	case '$':
		ret = regnode(EOL);
		break;
	case '.':
		ret = regnode(ANY);
		*flagp |= HASWIDTH|SIMPLE;
		break;
	case '[': {
			register int clss;
			register int classend;

			if (*regparse == '^') {	/* Complement of range. */
				ret = regnode(ANYBUT);
				regparse++;
			} else
				ret = regnode(ANYOF);
			/* A ']' or '-' in first position is an ordinary member. */
			if (*regparse == ']' || *regparse == '-')
				regc(*regparse++);
			while (*regparse != '\0' && *regparse != ']') {
				if (*regparse == '-') {
					regparse++;
					/* A '-' just before the end of the class is literal. */
					if (*regparse == ']' || *regparse == '\0')
						regc('-');
					else {
						/*
						 * Range.  The low end was emitted on the
						 * previous pass, so start one past it.
						 */
						clss = UCHARAT(regparse-2)+1;
						classend = UCHARAT(regparse);
						if (clss > classend+1)
							FAIL("invalid [] range");
						for (; clss <= classend; clss++)
							regc(clss);
						regparse++;
					}
				} else
					regc(*regparse++);
			}
			regc('\0');
			if (*regparse != ']')
				FAIL("unmatched []");
			regparse++;
			*flagp |= HASWIDTH|SIMPLE;
		}
		break;
	case '(':
		ret = reg(1, &flags);
		if (ret == NULL)
			return(NULL);
		*flagp |= flags&(HASWIDTH|SPSTART);
		break;
	case '\0':
	case '|':
	case ')':
		FAIL("internal urp");	/* Supposed to be caught earlier. */
		/* NOTREACHED */
		break;
	case '?':
	case '+':
	case '*':
		FAIL("?+* follows nothing");
		/* NOTREACHED */
		break;
	case '\\':
		if (*regparse == '\0')
			FAIL("trailing \\");
		/* The escaped character becomes a one-character EXACTLY node. */
		ret = regnode(EXACTLY);
		regc(*regparse++);
		regc('\0');
		*flagp |= HASWIDTH|SIMPLE;
		break;
	default: {
			register int len;
			register char ender;

			/* Step back onto the character that selected this case. */
			regparse--;
			/* Length of the run of characters not listed in META. */
			len = (int) strcspn(regparse, META);
			if (len <= 0)
				FAIL("internal disaster");
			ender = *(regparse+len);
			if (len > 1 && ISMULT(ender))
				len--;		/* Back off clear of ?+* operand. */
			*flagp |= HASWIDTH;
			if (len == 1)
				*flagp |= SIMPLE;
			ret = regnode(EXACTLY);
			while (len > 0) {
				regc(*regparse++);
				len--;
			}
			regc('\0');
		}
		break;
	}

	return(ret);
}
/* - reg - regular expression, i.e. main body or parenthesized thing * * Caller must absorb opening parenthesis. * * Combining parenthesis handling with the base level of regular expression * is a trifle forced, but the need to tie the tails of the branches to what * follows makes it hard to avoid. */ static char * reg( int paren, /* Parenthesized? */ int *flagp ) { register char *ret; register char *br; register char *ender; register int parno; int flags; *flagp = HASWIDTH; /* Tentatively. */ /* Make an OPEN node, if parenthesized. */ if (paren) { if (regnpar >= NSUBEXP) FAIL("too many ()"); parno = regnpar; regnpar++; ret = regnode(OPEN+parno); } else ret = NULL; /* Pick up the branches, linking them together. */ br = regbranch(&flags); if (br == NULL) return(NULL); if (ret != NULL) regtail(ret, br); /* OPEN -> first. */ else ret = br; if (!(flags&HASWIDTH)) *flagp &= ~HASWIDTH; *flagp |= flags&SPSTART; while (*regparse == '|' || *regparse == '\n') { regparse++; br = regbranch(&flags); if (br == NULL) return(NULL); regtail(ret, br); /* BRANCH -> BRANCH. */ if (!(flags&HASWIDTH)) *flagp &= ~HASWIDTH; *flagp |= flags&SPSTART; } /* Make a closing node, and hook it on the end. */ ender = regnode((paren) ? CLOSE+parno : END); regtail(ret, ender); /* Hook the tails of the branches to the closing node. */ for (br = ret; br != NULL; br = regnext(br)) regoptail(br, ender); /* Check for proper termination. */ if (paren && *regparse++ != ')') { FAIL("unmatched ()"); } else if (!paren && *regparse != '\0') { if (*regparse == ')') { FAIL("unmatched ()"); } else FAIL("junk on end"); /* "Can't happen". */ /* NOTREACHED */ } return(ret); }
/*
 - regatom - the lowest level
 *
 * Compiles a single atom: an anchor, '.', a bracketed character class, a
 * parenthesized sub-expression, a backslash escape, or a run of ordinary
 * characters.
 *
 * Optimization:  gobbles an entire sequence of ordinary characters so that
 * it can turn them into a single node, which is smaller to store and
 * faster to run.  Backslashed characters are exceptions, each becoming a
 * separate node; the code is simpler that way and it's not worth fixing.
 *
 * preg   compile state; regparse is advanced past what is consumed
 * flagp  receives the flags describing the atom
 *
 * Returns the offset of the atom's node, or 0 on failure with preg->err
 * set for the errors detected here.
 */
static int regatom(regex_t *preg, int *flagp)
{
	int ret;
	int flags;
	int nocase = (preg->cflags & REG_ICASE);

	/* Decode the first character; n is the number of bytes it occupies. */
	int ch;
	int n = reg_utf8_tounicode_case(preg->regparse, &ch, nocase);

	*flagp = WORST;		/* Tentatively. */

	preg->regparse += n;
	switch (ch) {
	/* FIXME: these chars only have meaning at beg/end of pat? */
	case '^':
		ret = regnode(preg, BOL);
		break;
	case '$':
		ret = regnode(preg, EOL);
		break;
	case '.':
		ret = regnode(preg, ANY);
		*flagp |= HASWIDTH|SIMPLE;
		break;
	case '[': {
			/* Scan with a local cursor; regparse is updated at the end. */
			const char *pattern = preg->regparse;

			if (*pattern == '^') {	/* Complement of range. */
				ret = regnode(preg, ANYBUT);
				pattern++;
			} else
				ret = regnode(preg, ANYOF);

			/* Special case. If the first char is ']' or '-', it is part of the set */
			if (*pattern == ']' || *pattern == '-') {
				reg_addrange(preg, *pattern, *pattern);
				pattern++;
			}

			while (*pattern && *pattern != ']') {
				/* Is this a range? a-z */
				int start;
				int end;

				pattern += reg_utf8_tounicode_case(pattern, &start, nocase);
				if (start == '\\') {
					pattern += reg_decode_escape(pattern, &start);
					if (start == 0) {
						preg->err = REG_ERR_NULL_CHAR;
						return 0;
					}
				}
				/* A '-' followed by something other than ']' or end of
				 * pattern makes this a range. */
				if (pattern[0] == '-' && pattern[1] && pattern[1] != ']') {
					/* skip '-' */
					pattern += utf8_tounicode(pattern, &end);
					pattern += reg_utf8_tounicode_case(pattern, &end, nocase);
					if (end == '\\') {
						pattern += reg_decode_escape(pattern, &end);
						if (end == 0) {
							preg->err = REG_ERR_NULL_CHAR;
							return 0;
						}
					}

					reg_addrange(preg, start, end);
					continue;
				}
				/* Named classes: only [:alpha:], [:alnum:] and [:space:]
				 * are recognized here. */
				if (start == '[') {
					if (strncmp(pattern, ":alpha:]", 8) == 0) {
						/* NOTE(review): with REG_ICASE only A-Z is added;
						 * presumably input is case-folded -- confirm. */
						if ((preg->cflags & REG_ICASE) == 0) {
							reg_addrange(preg, 'a', 'z');
						}
						reg_addrange(preg, 'A', 'Z');
						pattern += 8;
						continue;
					}
					if (strncmp(pattern, ":alnum:]", 8) == 0) {
						if ((preg->cflags & REG_ICASE) == 0) {
							reg_addrange(preg, 'a', 'z');
						}
						reg_addrange(preg, 'A', 'Z');
						reg_addrange(preg, '0', '9');
						pattern += 8;
						continue;
					}
					if (strncmp(pattern, ":space:]", 8) == 0) {
						reg_addrange_str(preg, " \t\r\n\f\v");
						pattern += 8;
						continue;
					}
				}
				/* Not a range, so just add the char */
				reg_addrange(preg, start, start);
			}
			regc(preg, '\0');

			/* Step over the closing ']' if present.  No error is raised
			 * here when the pattern ends before a ']' is seen. */
			if (*pattern) {
				pattern++;
			}
			preg->regparse = pattern;

			*flagp |= HASWIDTH|SIMPLE;
		}
		break;
	case '(':
		ret = reg(preg, 1, &flags);
		if (ret == 0)
			return 0;
		*flagp |= flags&(HASWIDTH|SPSTART);
		break;
	case '\0':
	case '|':
	case ')':
		preg->err = REG_ERR_INTERNAL;
		return 0;	/* Supposed to be caught earlier. */
	case '?':
	case '+':
	case '*':
	case '{':
		preg->err = REG_ERR_COUNT_FOLLOWS_NOTHING;
		return 0;
	case '\\':
		switch (*preg->regparse++) {
		case '\0':
			preg->err = REG_ERR_TRAILING_BACKSLASH;
			return 0;
		case '<':
		case 'm':
			ret = regnode(preg, WORDA);
			break;
		case '>':
		case 'M':
			ret = regnode(preg, WORDZ);
			break;
		case 'd':
			ret = regnode(preg, ANYOF);
			reg_addrange(preg, '0', '9');
			regc(preg, '\0');
			*flagp |= HASWIDTH|SIMPLE;
			break;
		case 'w':
			ret = regnode(preg, ANYOF);
			if ((preg->cflags & REG_ICASE) == 0) {
				reg_addrange(preg, 'a', 'z');
			}
			reg_addrange(preg, 'A', 'Z');
			reg_addrange(preg, '0', '9');
			reg_addrange(preg, '_', '_');
			regc(preg, '\0');
			*flagp |= HASWIDTH|SIMPLE;
			break;
		case 's':
			ret = regnode(preg, ANYOF);
			reg_addrange_str(preg," \t\r\n\f\v");
			regc(preg, '\0');
			*flagp |= HASWIDTH|SIMPLE;
			break;
		/* FIXME: Someday handle \1, \2, ... */
		default:
			/* Handle general quoted chars in exact-match routine */
			/* Back up to include the backslash */
			preg->regparse--;
			goto de_fault;
		}
		break;
	de_fault:
	default: {
			/*
			 * Encode a string of characters to be matched exactly.
			 */
			int added = 0;

			/* Back up to pick up the first char of interest */
			preg->regparse -= n;

			ret = regnode(preg, EXACTLY);

			/* Note that a META operator such as ? or * consumes the
			 * preceding char.
			 * Thus we must be careful to look ahead by 2 and add the
			 * last char as its own EXACTLY if necessary */

			/* Until end of string or a META char is reached */
			while (*preg->regparse && strchr(META, *preg->regparse) == NULL) {
				n = reg_utf8_tounicode_case(preg->regparse, &ch, (preg->cflags & REG_ICASE));
				if (ch == '\\' && preg->regparse[n]) {
					/* Non-trailing backslash.
					 * Is this a special escape, or a regular escape?
					 */
					if (strchr("<>mMwds", preg->regparse[n])) {
						/* A special escape. All done with EXACTLY */
						break;
					}
					/* Decode it. Note that we add the length for the escape
					 * sequence to the length for the backslash so we can skip
					 * the entire sequence, or not as required.
					 */
					n += reg_decode_escape(preg->regparse + n, &ch);
					if (ch == 0) {
						preg->err = REG_ERR_NULL_CHAR;
						return 0;
					}
				}

				/* Now we have one char 'ch' of length 'n'.
				 * Check to see if the following char is a MULT
				 */

				if (ISMULT(preg->regparse[n])) {
					/* Yes. But do we already have some EXACTLY chars? */
					if (added) {
						/* Yes, so return what we have and pick up the current char next time around */
						break;
					}
					/* No, so add this single char and finish */
					regc(preg, ch);
					added++;
					preg->regparse += n;
					break;
				}

				/* No, so just add this char normally */
				regc(preg, ch);
				added++;
				preg->regparse += n;
			}
			regc(preg, '\0');

			*flagp |= HASWIDTH;
			/* A single-character string qualifies as SIMPLE. */
			if (added == 1)
				*flagp |= SIMPLE;
			break;
		}
		break;
	}

	return(ret);
}
/*
 - regatom - the lowest level
 *
 * Compiles a single atom: an anchor, '.', a bracketed character class, a
 * parenthesized sub-expression, a word-boundary escape, or a string of
 * characters to be matched exactly.
 *
 * Optimization:  gobbles an entire sequence of ordinary characters so that
 * it can turn them into a single node, which is smaller to store and
 * faster to run.  Backslashed characters are exceptions, each becoming a
 * separate node; the code is simpler that way and it's not worth fixing.
 *
 * flagp receives the flags describing the atom.  Returns the node for the
 * atom; malformed input is reported through FAIL (presumably reporting the
 * message and returning NULL -- the macro is defined outside this block).
 */
static char *
regatom(
    int *flagp )
{
	register char *ret;
	int flags;

	*flagp = WORST;		/* Tentatively. */

	switch (*regparse++) {
	/* FIXME: these chars only have meaning at beg/end of pat? */
	case '^':
		ret = regnode(BOL);
		break;
	case '$':
		ret = regnode(EOL);
		break;
	case '.':
		ret = regnode(ANY);
		*flagp |= HASWIDTH|SIMPLE;
		break;
	case '[': {
			register int classr;
			register int classend;

			if (*regparse == '^') {	/* Complement of range. */
				ret = regnode(ANYBUT);
				regparse++;
			} else
				ret = regnode(ANYOF);
			/* A ']' or '-' in first position is an ordinary member. */
			if (*regparse == ']' || *regparse == '-')
				regc(*regparse++);
			while (*regparse != '\0' && *regparse != ']') {
				if (*regparse == '-') {
					regparse++;
					/* A '-' just before the end of the class is literal. */
					if (*regparse == ']' || *regparse == '\0')
						regc('-');
					else {
						/*
						 * Range.  The low end was emitted on the
						 * previous pass, so start one past it.
						 */
						classr = UCHARAT(regparse-2)+1;
						classend = UCHARAT(regparse);
						if (classr > classend+1)
							FAIL("invalid [] range");
						for (; classr <= classend; classr++)
							regc(classr);
						regparse++;
					}
				} else
					regc(*regparse++);
			}
			regc('\0');
			if (*regparse != ']')
				FAIL("unmatched []");
			regparse++;
			*flagp |= HASWIDTH|SIMPLE;
		}
		break;
	case '(':
		ret = reg(1, &flags);
		if (ret == NULL)
			return(NULL);
		*flagp |= flags&(HASWIDTH|SPSTART);
		break;
	case '\0':
	case '|':
	case '\n':
	case ')':
		FAIL("internal urp");	/* Supposed to be caught earlier. */
		break;
	case '?':
	case '+':
	case '*':
		FAIL("?+* follows nothing");
		break;
	case '\\':
		switch (*regparse++) {
		case '\0':
			FAIL("trailing \\");
			break;
		case '<':
			ret = regnode(WORDA);
			break;
		case '>':
			ret = regnode(WORDZ);
			break;
		/* FIXME: Someday handle \1, \2, ... */
		default:
			/* Handle general quoted chars in exact-match routine */
			goto de_fault;
		}
		break;
	de_fault:
	default:
		/*
		 * Encode a string of characters to be matched exactly.
		 *
		 * This is a bit tricky due to quoted chars and due to
		 * '*', '+', and '?' taking the SINGLE char previous
		 * as their operand.
		 *
		 * On entry, the char at regparse[-1] is going to go
		 * into the string, no matter what it is.  (It could be
		 * following a \ if we are entered from the '\' case.)
		 *
		 * Basic idea is to pick up a good char in  ch  and
		 * examine the next char.  If it's *+? then we twiddle.
		 * If it's \ then we frozzle.  If it's other magic char
		 * we push  ch  and terminate the string.  If none of the
		 * above, we push  ch  on the string and go around again.
		 *
		 *  regprev  is used to remember where "the current char"
		 * starts in the string, if due to a *+? we need to back
		 * up and put the current char in a separate, 1-char, string.
		 * When  regprev  is NULL,  ch  is the only char in the
		 * string; this is used in *+? handling, and in setting
		 * flags |= SIMPLE at the end.
		 */
		{
			char *regprev;
			register char ch;

			regparse--;			/* Look at cur char */
			ret = regnode(EXACTLY);
			for ( regprev = 0 ; ; ) {
				ch = *regparse++;	/* Get current char */
				switch (*regparse) {	/* look at next one */

				default:
					regc(ch);	/* Add cur to string */
					break;

				case '.': case '[': case '(':
				case ')': case '|': case '\n':
				case '$': case '^':
				case '\0':
				/* FIXME, $ and ^ should not always be magic */
				magic:
					regc(ch);	/* dump cur char */
					goto done;	/* and we are done */

				case '?': case '+': case '*':
					if (!regprev) 	/* If just ch in str, */
						goto magic;	/* use it */
					/* End mult-char string one early */
					regparse = regprev; /* Back up parse */
					goto done;

				case '\\':
					regc(ch);	/* Cur char OK */
					switch (regparse[1]){ /* Look after \ */
					case '\0':
					case '<':
					case '>':
					/* FIXME: Someday handle \1, \2, ... */
						goto done; /* Not quoted */
					default:
						/* Backup point is \, scan
						 * point is after it. */
						regprev = regparse;
						regparse++;
						continue;	/* NOT break; */
					}
				}
				regprev = regparse;	/* Set backup point */
			}
		done:
			regc('\0');
			*flagp |= HASWIDTH;
			if (!regprev)		/* One char? */
				*flagp |= SIMPLE;
		}
		break;
	}

	return(ret);
}
/* - regpiece - something followed by possible [*+?{] * * Note that the branching code sequences used for ? and the general cases * of * and + are somewhat optimized: they use the same NOTHING node as * both the endmarker for their branch list and the body of the last branch. * It might seem that this node could be dispensed with entirely, but the * endmarker role is not redundant. */ static char * regpiece(int *flagp) { register char *next; register char *ret; register char op; unsigned char max; unsigned char min; int flags; ret = regatom(&flags); if (ret == NULL) { return(NULL); } op = *regparse; if (!ISMULT(op)) { *flagp = flags; return(ret); } if (!(flags&HASWIDTH) && op != '?') { FAIL("*+{ operand could be empty"); } *flagp = (op != '+' && op != '{') ? (WORST|SPSTART) : (WORST|HASWIDTH); if (op == '*' && (flags&SIMPLE)) { reginsert(STAR, ret); } else if (op == '*') { /* Emit x* as (x&|), where & means "self". */ reginsert(BRANCH, ret); /* Either x */ regoptail(ret, regnode(BACK)); /* and loop */ regoptail(ret, ret); /* back */ regtail(ret, regnode(BRANCH)); /* or */ regtail(ret, regnode(NOTHING)); /* null. */ } else if (op == '+' && (flags&SIMPLE)) { reginsert(PLUS, ret); } else if (op == '+') { /* Emit x+ as x(&|), where & means "self". */ next = regnode(BRANCH); /* Either */ regtail(ret, next); regtail(regnode(BACK), ret); /* loop back */ regtail(next, regnode(BRANCH)); /* or */ regtail(ret, regnode(NOTHING)); /* null. */ } else if (op == '{') { for (min = 0, regparse++ ; *regparse && isdigit(*regparse) ; regparse++) { min = min * 10 + (*regparse - '0'); } for (max = 0, regparse++ ; *regparse && isdigit(*regparse) ; regparse++) { max = max * 10 + (*regparse - '0'); } reginsert(max, ret); next = OPERAND(ret); reginsert(min, ret); next = OPERAND(next); reginsert(MINMAX, ret); regtail(ret, OPERAND(next)); /* MINMAX->next = x */ } else if (op == '?') { /* Emit x? 
as (x|) */ reginsert(BRANCH, ret); /* Either x */ regtail(ret, regnode(BRANCH)); /* or */ next = regnode(NOTHING); /* null. */ regtail(ret, next); regoptail(ret, next); } regparse++; if (ISMULT(*regparse)) { FAIL("nested *?+{"); } return(ret); }