Ejemplo n.º 1
0
/**
 * Add all characters in the inclusive range between lower and upper.
 *
 * The range is emitted as two values: the number of characters it covers,
 * followed by its first character.
 *
 * Handles a swapped range (upper < lower) by normalising the bounds first,
 * so exactly one (length, start) pair is emitted with a positive length.
 *
 * @param preg   compile state the range is emitted into (passed to regc)
 * @param lower  one end of the range
 * @param upper  the other end of the range
 */
static void reg_addrange(regex_t *preg, int lower, int upper)
{
	if (lower > upper) {
		/*
		 * Swap in place.  Recursing and then falling through would emit
		 * the range twice, the second time with a non-positive length.
		 */
		int tmp = lower;

		lower = upper;
		upper = tmp;
	}
	/* Add a range as length, start */
	regc(preg, upper - lower + 1);
	regc(preg, lower);
}
Ejemplo n.º 2
0
/*
 * regatom - the lowest level
 *
 * Parses one atom starting at regparse and emits the node(s) for it: an
 * anchor (^ or $), the any-character dot, a bracketed character class, a
 * parenthesised sub-expression, a backslash-escaped character, or a run of
 * ordinary characters.  regparse is advanced with CMnext as input is
 * consumed.
 *
 * Optimization:  gobbles an entire sequence of ordinary characters so that
 * it can turn them into a single node, which is smaller to store and
 * faster to run.  Backslashed characters are exceptions, each becoming a
 * separate node; the code is simpler that way and it's not worth fixing.
 *
 * flagp is set to WORST and then has HASWIDTH, SIMPLE and/or SPSTART
 * or-ed in as appropriate for the atom.  On success the node emitted for
 * the atom is returned; every error path goes through _FAIL (defined
 * elsewhere).
 *
 * Ranges inside [] are only supported between single-byte characters.
 */
static char *
regatom(i4 *flagp)
{
    register char *ret;
    i4 flags;
    char null_byte = '\0';

    *flagp = WORST;		/* Tentatively. */

    switch (*regparse) {
    case '^':
        CMnext( regparse );
        ret = regnode(BOL);
        break;
    case '$':
        CMnext( regparse );
        ret = regnode(EOL);
        break;
    case '.':
        CMnext( regparse );
        ret = regnode(ANY);
        *flagp |= HASWIDTH|SIMPLE;
        break;
    case '[': {
        char *range_start = NULL;
        /*
        ** Must start out FALSE: a range operator can be reached before
        ** any range-start character has been recorded (e.g. "[]-a]"),
        ** and the checks below read this flag.
        */
        bool double_start = FALSE;
        u_char first_u1 = 0, last_u1;

        CMnext( regparse );
        if (*regparse == '^') {	/* Complement of range. */
            ret = regnode(ANYBUT);
            CMnext( regparse );
        } else
            ret = regnode(ANYOF);
        /* A leading ']' or '-' is an ordinary member of the class. */
        if (*regparse == ']' || *regparse == '-') {
            regc( regparse );
            CMnext( regparse );
        }
        while (*regparse != '\0' && *regparse != ']') {
            if (*regparse == '-') {
                char range_op = '-';

                CMnext( regparse );
                if( *regparse == ']' ||
                        *regparse == '\0'
                  )
                    /* Trailing '-' is a literal. */
                    regc( &range_op );
                else {
                    bool invalid = FALSE;
                    bool double_end;
                    int ch;

                    /* A range needs a start character. */
                    if( range_start == NULL )
                        invalid = TRUE;

                    double_end =
                        CMdbl1st( regparse );

                    /* Both ends must be of the same width. */
                    if( !invalid &&
                            double_end
                            && !double_start
                      )
                        invalid = TRUE;

                    if( !invalid &&
                            double_start
                            && !double_end
                      )
                        invalid = TRUE;

                    /* The start must not sort after the end. */
                    if( !invalid &&
                            CMcmpcase( range_start,
                                       regparse ) > 0
                      )
                        invalid = TRUE;

                    if( double_start )
                        _FAIL("don't know how to support character classes containing double-byte ranges");

                    if( invalid )
                        _FAIL("invalid [] range");
                    /* no double-byte ranges! */
                    /*
                    ** Initialize the value for the end of the range.
                    */
                    last_u1 = UCHARAT(regparse);
                    /*
                    ** Count in an int: a u_char counter can never
                    ** exceed 0xFF, so a range ending at 0xFF would
                    ** loop forever.
                    */
                    for (ch = (int) first_u1; ch <= (int) last_u1;
                            ch++
                        )
                    {
                        u_char member = (u_char) ch;

                        regc( (char *)
                              &member );
                    }
                    /*
                    ** A chained range ("a-c-e") resumes from the end
                    ** of this one.
                    */
                    first_u1 = last_u1;

                    CMnext( regparse );
                }
            } else {
                /* Ordinary member; remember it as a potential range start. */
                range_start = regparse;
                if( CMdbl1st( range_start ) )
                {
                    double_start = TRUE;
                }
                else
                {
                    double_start = FALSE;
                    first_u1 = UCHARAT(range_start);
                }
                regc( regparse );
                CMnext( regparse );
            }
        }
        regc( &null_byte );
        if (*regparse != ']')
            _FAIL("unmatched []");
        CMnext( regparse );
        *flagp |= HASWIDTH|SIMPLE;
    }
    break;
    case '(':
        CMnext( regparse );
        ret = reg(1, &flags);
        if (ret == NULL)
            return(NULL);
        *flagp |= flags&(HASWIDTH|SPSTART);
        break;
    case '\0':
    case '|':
    case ')':
        CMnext( regparse );
        _FAIL("internal urp");	/* Supposed to be caught earlier. */
        break;
    case '?':
    case '+':
    case '*':
        CMnext( regparse );
        _FAIL("?+* follows nothing");
        break;
    case '\\':
        CMnext( regparse );
        if (*regparse == '\0')
            _FAIL("trailing \\");
        /* The escaped character becomes its own EXACTLY node. */
        ret = regnode(EXACTLY);
        regc( regparse );
        CMnext( regparse );
        regc( &null_byte );
        *flagp |= HASWIDTH|SIMPLE;
        break;
    default: {
        register i4  len;
        register char ender;

        len = my_strcspn(regparse, META);
        if (len <= 0)
            _FAIL("internal disaster");
        ender = *(regparse+len);
        if (len > 1 && ISMULT(ender))
            len--;	/* Back off clear of ?+* operand. */
        *flagp |= HASWIDTH;
        if (len == 1)
            *flagp |= SIMPLE;
        ret = regnode(EXACTLY);
        /* CMbytedec adjusts len for the character at regparse. */
        while (len > 0) {
            regc( regparse );
            CMbytedec( len, regparse );
            CMnext( regparse );
        }
        regc( &null_byte );
    }
    break;
    }

    return(ret);
}
Ejemplo n.º 3
0
/*
 * REcompile - compile a regular expression into internal code
 *
 * The size of the compiled form is not known until the expression has
 * been compiled, yet compiling needs somewhere to put the code.  The way
 * out is to compile twice: a first pass that emits nothing and only
 * counts (regcode pointed at regdummy), then a second pass that emits
 * into a block of exactly the counted size.  Nothing is allocated until
 * the first pass has accepted the expression, and the emitted code never
 * has to move, so pointers into it stay valid.
 *
 * Beware that the optimization-preparation code in here knows about some
 * of the structure of the compiled RE_EXP.
 *
 * exp      expression text to compile
 * re_exp   receives the compiled expression on success
 * mem_tag  tag handed to MEreqmem for the allocation
 *
 * Returns OK on success, FAIL otherwise.
 */
STATUS
REcompile( char *exp, RE_EXP **re_exp, i4  mem_tag )
{
    RE_EXP *prog;
    char *node;
    char *best;
    i4 best_len;
    i4 flags;
    u_char magic = MAGIC;

    if (exp == NULL)
    {
        _error("NULL argument");
        return (FAIL);
    }

    /* Pass one: measure the program and check legality. */
    regparse = exp;
    regnpar = 1;
    regsize = 0L;
    regcode = &regdummy;
    regc( (char *) &magic );
    if (reg(0, &flags) == NULL)
    {
        return( FAIL );
    }

    /* Small enough for pointer-storage convention? */
    if (regsize >= 32767L)		/* Probably could be 65535L. */
    {
        _error("regular expression too big");
        return (FAIL);
    }

    /* Header plus the measured program size. */
    prog = (RE_EXP *) MEreqmem( mem_tag,
                                sizeof(RE_EXP) + (unsigned) regsize,
                                FALSE, NULL);
    if (prog == NULL)
    {
        _error("out of space");
        return (FAIL);
    }

    /* Pass two: emit the program for real. */
    regparse = exp;
    regnpar = 1;
    regcode = prog->program;
    regc( (char *) &magic );
    if (reg(0, &flags) == NULL)
    {
        return( FAIL );
    }

    /* Worst-case defaults for the optimization fields. */
    prog->regstart = '\0';
    prog->reganch = 0;
    prog->regmust = NULL;
    prog->regmlen = 0;

    node = prog->program+1;			/* First BRANCH. */
    if (OP(regnext(node)) == END)
    {
        /* Only one top-level choice, so its first node is informative. */
        node = OPERAND(node);

        /* Starting-point info. */
        if (OP(node) == BOL)
        {
            prog->reganch++;
        }
        else if (OP(node) == EXACTLY)
        {
            prog->regstart = *OPERAND(node);
        }

        /*
         * When the r.e. contains something expensive, pick the longest
         * literal string that must appear and record it as regmust.
         * Ties go to the later string (hence >=): the regstart check
         * already covers the beginning of the r.e., so a later string
         * adds more checking.
         */
        if (flags&SPSTART)
        {
            best = NULL;
            best_len = 0;
            while (node != NULL)
            {
                if (OP(node) == EXACTLY && STlength(OPERAND(node)) >= best_len)
                {
                    best = OPERAND(node);
                    best_len = STlength(OPERAND(node));
                }
                node = regnext(node);
            }
            prog->regmust = best;
            prog->regmlen = best_len;
        }
    }

    *re_exp = prog;
    return( OK );
}
Ejemplo n.º 4
0
/*
 * regatom - the lowest level
 *
 * Parses one atom at regparse and emits the node for it.  flagp is set to
 * WORST and then has HASWIDTH | SIMPLE or-ed in for atoms that always
 * match exactly one character.
 *
 * Optimization:  gobbles an entire sequence of ordinary characters so that
 * it can turn them into a single node, which is smaller to store and
 * faster to run.  Backslashed characters are exceptions, each becoming a
 * separate node; the code is simpler that way and it's not worth fixing.
 *
 * NOTE(review): this listing stops after the '[' case; the remaining
 * cases and the end of the function are not shown here.
 */
static char *regatom( int *flagp )
{
    char *ret;
    int flags;

    *flagp = WORST;         /* Tentatively. */

    /* regparse is advanced past the dispatch character before the cases run. */
    switch( *regparse++ ) {
    case '~':
        /* Emits a CASEI node; rejected when it is the last pattern character. */
        if( *regparse == 0 ) {
            FAIL( ERR_RE_INVALID_CASETOGGLE );
        }
        ret = regnode( CASEI );
        break;
    case '@':
        /* Emits a NOCASEI node; rejected when it is the last pattern character. */
        if( *regparse == 0 ) {
            FAIL( ERR_RE_INVALID_CASETOGGLE );
        }
        ret = regnode( NOCASEI );
        break;
    case '^':
        ret = regnode( BOL );
        break;
    case '$':
        ret = regnode( EOL );
        break;
    case '.':
        ret = regnode( ANY );
        *flagp |= HASWIDTH | SIMPLE;
        break;
    case '[':
        {
            /* Character class: the operand is the NUL-terminated member list. */
            if( *regparse == '^' ) { /* Complement of range. */
                ret = regnode( ANYBUT );
                regparse++;
            } else {
                ret = regnode( ANYOF );
            }
            /* A leading ']' or '-' is taken as an ordinary member. */
            if( *regparse == ']' || *regparse == '-' ) {
                regc( *regparse++ );
            }
            while( *regparse != '\0' && *regparse != ']' ) {
                if( *regparse == '-' ) {
                    regparse++;
                    if( *regparse == ']' || *regparse == '\0' ) {
                        /* A trailing '-' is a literal member. */
                        regc( '-' );
                    } else {
                        int class;
                        int classend;

                        /*
                         * regparse - 2 is the character before the '-'.
                         * It has already been emitted, so the range
                         * starts one past it.
                         */
                        class = UCHARAT( regparse - 2 ) + 1;
                        classend = UCHARAT( regparse );
                        /* Rejects a descending range; "a-a" is allowed and adds nothing. */
                        if( class > classend + 1 ) {
                            FAIL( ERR_RE_INVALID_SB_RANGE );
                        }
                        for( ; class <= classend; class++ ) {
                            regc( (char)class );
                        }
                        regparse++;
                    }
                } else {
                    /* With REALTABS set, the two characters "\t" stand for a tab. */
                    if( *regparse == '\\' && *( regparse + 1 ) == 't' && REALTABS ) {
                        regparse += 2;
                        regc( '\t' );
                    } else {
                        regc( *regparse++ );
                    }
                }
            }
            /* Terminate the member list. */
            regc( '\0' );
            if( *regparse != ']' ) {
                FAIL( ERR_RE_UNMATCHED_SQUARE_BRACKET );
            }
            regparse++;
            *flagp |= HASWIDTH | SIMPLE;
        }
Ejemplo n.º 5
0
/*
 - RegComp - compile a regular expression into internal code
 *
 * We can't allocate space until we know how big the compiled form will be,
 * but we can't compile it (and thus know how big it is) until we've got a
 * place to put the code.  So we cheat:  we compile it twice, once with code
 * generation turned off and size counting turned on, and once "for real".
 * This also means that we don't allocate space until we are sure that the
 * thing really will compile successfully, and we never have to move the
 * code and thus invalidate pointers into it.  (Note that it has to be in
 * one piece because free() must be able to free it all.)
 *
 * Before compiling, when MAGICFLAG is off and MAGICSTR is set, the roles
 * of the characters listed in MAGICSTR are flipped: a bare one gains a
 * backslash and a backslashed one loses it.  With WANT_EXCLAMATION defined,
 * a leading '!' is skipped and disables the flip.
 *
 * Beware that the optimization-preparation code in here knows about some
 * of the structure of the compiled regexp.
 *
 * instr   pattern text; a NULL pointer is reported as
 *         ERR_RE_NULL_ARGUMENT through FAIL.
 *
 * Returns the compiled regexp, or NULL when either pass over the pattern
 * fails.
 */
regexp *RegComp( const char *instr )
{
    regexp      *r;
    char        *scan;
    char        *longest;
    const char  *exp;
    char        buff[MAX_STR*2];
    int         flags, ignmag = FALSE;
    unsigned    j;
    size_t      i, k, len;

    regError( ERR_NO_ERR );

    /*
     * Reject a NULL pattern before anything dereferences it; both the
     * '!' test and the magic flip below read instr.
     */
    if( instr == NULL ) {
        FAIL( ERR_RE_NULL_ARGUMENT );
    }

#ifdef WANT_EXCLAMATION
    if( instr[0] == '!' ) {
        instr++;
        ignmag = TRUE;
    }
#endif

    /*
     * flip roles of magic chars
     *
     * NOTE(review): nothing bounds j against sizeof( buff ); this relies
     * on callers passing at most MAX_STR characters - confirm.
     */
    if( !ignmag && ( !MAGICFLAG && MAGICSTR != NULL ) ) {
        j = 0;
        k = strlen( instr );
        for( i = 0; i < k; i++ ) {
            if( instr[i] == '\\' ) {
                /* Keep the backslash only in front of non-magic characters. */
                if( strchr( MAGICSTR, instr[i + 1] ) == NULL ) {
                    buff[j++] = '\\';
                }
                i++;
            } else {
                /* A bare magic character gains a backslash. */
                if( strchr( MAGICSTR, instr[i] ) != NULL ) {
                    buff[j++] = '\\';
                }
            }
            buff[j++] = instr[i];

        }
        buff[j] = 0;
        exp = buff;
    } else {
        exp = instr;
    }

    /* First pass: determine size, legality. */
    regparse = exp;
    regnpar = 1;
    regsize = 0L;
    regcode = &regdummy;
    regc( MAGIC );
    if( reg( 0, &flags ) == NULL ) {
        return( NULL );
    }

    /* Allocate space. */
    r = ALLOC( sizeof( regexp ) + ( unsigned ) regsize );

    /* Second pass: emit code. */
    regparse = exp;
    regnpar = 1;
    regcode = r->program;
    regc( MAGIC );
    if( reg( 0, &flags ) == NULL ) {
        /* NOTE(review): r is not released on this path - confirm intended. */
        return( NULL );
    }

    /* Dig out information for optimizations. */
    r->regstart = '\0';     /* Worst-case defaults. */
    r->reganch = 0;
    r->regmust = NULL;
    r->regmlen = 0;
    scan = r->program + 1;                    /* First BRANCH. */
    if( OP( regnext( scan ) ) == END ) { /* Only one top-level choice. */
        scan = OPERAND( scan );

        /* Starting-point info. */
        if( OP( scan ) == EXACTLY ) {
            r->regstart = *OPERAND( scan );
        } else if( OP( scan ) == BOL ) {
            r->reganch++;
        }

        /*
         * If there's something expensive in the r.e., find the
         * longest literal string that must appear and make it the
         * regmust.  Resolve ties in favor of later strings, since
         * the regstart check works with the beginning of the r.e.
         * and avoiding duplication strengthens checking.  Not a
         * strong reason, but sufficient in the absence of others.
         */
        if( flags & SPSTART ) {
            longest = NULL;
            len = 0;
            for( ; scan != NULL; scan = regnext( scan ) ) {
                if( OP( scan ) == EXACTLY && strlen( OPERAND( scan ) ) >= len ) {
                    longest = OPERAND( scan );
                    len = strlen( OPERAND( scan ) );
                }
            }
            r->regmust = longest;
            r->regmlen = (short)len;
        }
    }

    return( r );
}
Ejemplo n.º 6
0
// regatom - the lowest level of the pattern parser.
//
// Consumes one atom at regparse and emits the node for it.  *flagp is set
// to WORST and then gets HASWIDTH / SIMPLE / SPSTART or-ed in as fits the
// atom.  Returns the emitted node, or NULL (after a TRACE0 message) when
// the atom is malformed.
//
// A run of ordinary characters is gobbled into a single EXACTLY node; a
// backslashed character always becomes a node of its own.
TCHAR *CRegExp::regatom(int *flagp)
{
	TCHAR *ret;
	int flags;

	*flagp = WORST;		// Tentatively.

	switch (*regparse++) {
	case _T('^'):
		ret = regnode(BOL);
		break;

	case _T('$'):
		ret = regnode(EOL);
		break;

	case _T('.'):
		ret = regnode(ANY);
		*flagp |= HASWIDTH|SIMPLE;
		break;

	case _T('['): {
		int lo;
		int hi;
		int c;

		// A leading '^' complements the class.
		if (*regparse == _T('^')) {
			ret = regnode(ANYBUT);
			regparse++;
		} else {
			ret = regnode(ANYOF);
		}

		// A leading ']' or '-' is an ordinary member.
		c = *regparse;
		if (c == _T(']') || c == _T('-')) {
			regc(c);
			regparse++;
		}

		for (;;) {
			c = *regparse++;
			if (c == _T('\0') || c == _T(']'))
				break;

			if (c != _T('-')) {
				regc(c);
				continue;
			}

			// Saw '-': look at what follows it.
			c = *regparse;
			if (c == _T(']') || c == _T('\0')) {
				// Trailing '-' is a literal.
				regc(_T('-'));
				continue;
			}

			// regparse-2 is the character in front of the '-'; it has
			// already been emitted, so emission starts one past it.
			lo = (unsigned) (TCHAR)*(regparse-2);
			hi = (unsigned) (TCHAR)c;
			if (lo > hi) {
				TRACE0("invalid [] range\n");
				return NULL;
			}
			for (lo++; lo <= hi; lo++) {
				regc(lo);
			}
			regparse++;
		}

		// Terminate the member list.
		regc(_T('\0'));
		if (c != _T(']')) {
			TRACE0("unmatched []\n");
			return NULL;
		}
		*flagp |= HASWIDTH|SIMPLE;
		break;
		}

	case _T('('):
		ret = reg(1, &flags);
		if (ret == NULL) {
			return(NULL);
		}
		*flagp |= flags&(HASWIDTH|SPSTART);
		break;

	case _T('\0'):
	case _T('|'):
	case _T(')'):
		// supposed to be caught earlier
		TRACE0("internal error: \\0|) unexpected\n");
		return NULL;

	case _T('?'):
	case _T('+'):
	case _T('*'):
		TRACE0("?+* follows nothing\n");
		return NULL;

	case _T('\\'):
		if (*regparse == _T('\0')) {
			TRACE0("trailing \\\n");
			return NULL;
		}
		// The escaped character becomes a one-character EXACTLY node.
		ret = regnode(EXACTLY);
		regc(*regparse++);
		regc(_T('\0'));
		*flagp |= HASWIDTH|SIMPLE;
		break;

	default: {
		size_t run;
		TCHAR ender;

		// Step back onto the character the switch consumed.
		regparse--;
		run = _tcscspn(regparse, META);
		if (run == 0) {
			TRACE0("internal error: strcspn 0\n");
			return NULL;
		}
		ender = *(regparse+run);
		if (run > 1 && ISREPN(ender)) {
			run--;		// Back off clear of ?+* operand.
		}
		*flagp |= HASWIDTH;
		if (run == 1) {
			*flagp |= SIMPLE;
		}
		ret = regnode(EXACTLY);
		while (run > 0) {
			regc(*regparse++);
			run--;
		}
		regc(_T('\0'));
		break;
		}
	}

	return(ret);
}
Ejemplo n.º 7
0
/*
 - regatom - the lowest level
 *
 * Consumes a single atom at regparse and emits its node.  *flagp starts
 * out as WORST and gains HASWIDTH, SIMPLE and/or SPSTART depending on the
 * atom.  Returns the emitted node; errors are reported through FAIL.
 *
 * Optimization:  a whole run of ordinary characters is swallowed into one
 * EXACTLY node, which is smaller to store and faster to run.  Backslashed
 * characters are the exception, each getting its own node; that keeps the
 * code simple.
 */
static char *
regatom(int *flagp)
{
	char *ret;
	int flags;

	*flagp = WORST;		/* Tentatively. */

	switch (*regparse++) {
	case '^':
		ret = regnode(BOL);
		break;
	case '$':
		ret = regnode(EOL);
		break;
	case '.':
		ret = regnode(ANY);
		*flagp |= HASWIDTH|SIMPLE;
		break;
	case '[': {
			int lo;
			int hi;

			/* A leading '^' complements the class. */
			if (*regparse == '^') {
				ret = regnode(ANYBUT);
				regparse++;
			} else {
				ret = regnode(ANYOF);
			}

			/* A leading ']' or '-' is an ordinary member. */
			if (*regparse == ']' || *regparse == '-') {
				regc(*regparse++);
			}

			while (*regparse != '\0' && *regparse != ']') {
				if (*regparse != '-') {
					regc(*regparse++);
					continue;
				}

				regparse++;
				if (*regparse == ']' || *regparse == '\0') {
					/* Trailing '-' is a literal. */
					regc('-');
					continue;
				}

				/*
				 * regparse-2 is the character before the '-'.  It has
				 * already been emitted, so start one past it.
				 */
				lo = UCHARAT(regparse-2)+1;
				hi = UCHARAT(regparse);
				if (lo > hi+1) {
					FAIL("invalid [] range");
				}
				while (lo <= hi) {
					regc(lo);
					lo++;
				}
				regparse++;
			}

			/* Terminate the member list. */
			regc('\0');
			if (*regparse != ']') {
				FAIL("unmatched []");
			}
			regparse++;
			*flagp |= HASWIDTH|SIMPLE;
		}
		break;
	case '(':
		ret = reg(1, &flags);
		if (ret == NULL) {
			return(NULL);
		}
		*flagp |= flags&(HASWIDTH|SPSTART);
		break;
	case '\0':
	case '|':
	case ')':
		FAIL("internal urp");	/* Supposed to be caught earlier. */
		/* NOTREACHED */
		break;
	case '?':
	case '+':
	case '*':
		FAIL("?+* follows nothing");
		/* NOTREACHED */
		break;
	case '\\':
		if (*regparse == '\0') {
			FAIL("trailing \\");
		}
		/* The escaped character becomes a one-character EXACTLY node. */
		ret = regnode(EXACTLY);
		regc(*regparse++);
		regc('\0');
		*flagp |= HASWIDTH|SIMPLE;
		break;
	default: {
			int run;
			char ender;

			/* Step back onto the character the switch consumed. */
			regparse--;
			run = (int) strcspn(regparse, META);
			if (run <= 0) {
				FAIL("internal disaster");
			}
			ender = *(regparse+run);
			if (run > 1 && ISMULT(ender)) {
				run--;		/* Back off clear of ?+* operand. */
			}
			*flagp |= HASWIDTH;
			if (run == 1) {
				*flagp |= SIMPLE;
			}
			ret = regnode(EXACTLY);
			for (; run > 0; run--) {
				regc(*regparse++);
			}
			regc('\0');
		}
		break;
	}

	return(ret);
}
Ejemplo n.º 8
0
/*
 - regatom - the lowest level
 *
 * Consumes a single atom at regparse and emits its node.  *flagp starts
 * out as WORST and gains HASWIDTH, SIMPLE and/or SPSTART depending on the
 * atom.  Returns the emitted node; errors are reported through FAIL.
 *
 * Optimization:  gobbles an entire sequence of ordinary characters so that
 * it can turn them into a single node, which is smaller to store and
 * faster to run.  Backslashed characters are exceptions, each becoming a
 * separate node; the code is simpler that way and it's not worth fixing.
 *
 * Inside [] the escapes \n \t \] \- and \\ stand for the corresponding
 * single character; any other backslash is an ordinary member.  Outside
 * [] a backslash introduces \< \> \d \D \n \p \P \s \S \t \w \W, or
 * quotes the following character.
 */
static char *
regatom(int *flagp)
{
	register char *ret;
	int flags;

	*flagp = WORST;		/* Tentatively. */

	switch (*regparse++) {
	case '^':
		ret = regnode(BOL);
		break;
	case '$':
		ret = regnode(EOL);
		break;
	case '.':
		ret = regnode(ANY);
		*flagp |= HASWIDTH|SIMPLE;
		break;
	case '[': {
			register int chclass;
			register int chclassend;
			/*
			 * Last member emitted into the class.  A range takes its
			 * lower bound from here, not from the raw pattern text, so
			 * that "\t-z" starts at the tab and not at the letter 't'.
			 * A '-' is only treated as a range operator after at least
			 * one member has been emitted, so this is always set by
			 * then.
			 */
			register int lastc = -1;

			if (*regparse == '^') {	/* Complement of range. */
				ret = regnode(ANYBUT);
				regparse++;
			} else {
				ret = regnode(ANYOF);
			}
			/* A leading ']' or '-' is an ordinary member. */
			if (*regparse == ']' || *regparse == '-') {
				lastc = UCHARAT(regparse);
				regc(*regparse++);
			}
			while (*regparse != '\0' && *regparse != ']') {
				if (*regparse == '-') {
					regparse++;
					if (*regparse == ']' || *regparse == '\0') {
						/* Trailing '-' is a literal. */
						regc('-');
					} else {
						/* lastc is already in the class; start one past it. */
						chclass = lastc+1;
						chclassend = UCHARAT(regparse);
						if (chclass > chclassend+1) {
							FAIL("invalid [] range");
						}
						for (; chclass <= chclassend; chclass++) {
							regc(chclass);
						}
						lastc = chclassend;
						regparse++;
					}
				} else if (*regparse == '\\') {
					switch(*++regparse) {
					case 'n' :
						lastc = '\n';
						regc('\n');
						regparse++;
						break;
					case 't' :
						lastc = '\t';
						regc('\t');
						regparse++;
						break;
					case ']' :
						lastc = ']';
						regc(']');
						regparse++;
						break;
					case '-' :
						lastc = '-';
						regc('-');
						regparse++;
						break;
					case '\\' :
						lastc = '\\';
						regc('\\');
						regparse++;
						break;
					default :
						/*
						 * Not a recognised escape: the backslash itself
						 * is the member, and the following character is
						 * handled on the next iteration.
						 */
						regparse--;
						lastc = UCHARAT(regparse);
						regc(*regparse++);
					}
				} else {
					lastc = UCHARAT(regparse);
					regc(*regparse++);
				}
			}
			/* Terminate the member list. */
			regc('\0');
			if (*regparse != ']') {
				FAIL("unmatched []");
			}
			regparse++;
			*flagp |= HASWIDTH|SIMPLE;
		}
		break;
	case '(':
		ret = reg(1, &flags);
		if (ret == NULL) {
			return(NULL);
		}
		*flagp |= flags&(HASWIDTH|SPSTART);
		break;
	case '\0':
	case '|':
	case ')':
		FAIL("internal urp");	/* Supposed to be caught earlier. */
		break;
	case '?':
	case '+':
	case '*':
	case '{':
		FAIL("?+*{ follows nothing");
		break;
	case '\\':
		if (*regparse == '\0') {
			FAIL("trailing \\");
		}
		/* regparse is advanced past the escape character after the switch. */
		switch(*regparse) {
		case '<':
			ret = regnode(BEGWORD);
			break;
		case '>':
			ret = regnode(ENDWORD);
			break;
		case 'd':
			ret = regnode(DIGIT);
			*flagp |= (HASWIDTH|SIMPLE);
			break;
		case 'D':
			ret = regnode(NDIGIT);
			*flagp |= (HASWIDTH|SIMPLE);
			break;
		case 'n' :
			ret = regnode(EXACTLY);
			regc('\n');
			regc('\0');
			*flagp |= (HASWIDTH|SIMPLE);
			break;
		case 'p':
			ret = regnode(PRINT);
			*flagp |= HASWIDTH|SIMPLE;
			break;
		case 'P':
			ret = regnode(NPRINT);
			*flagp |= HASWIDTH|SIMPLE;
			break;
		case 's':
			ret = regnode(WHITESP);
			*flagp |= HASWIDTH|SIMPLE;
			break;
		case 'S':
			ret = regnode(NWHITESP);
			*flagp |= HASWIDTH|SIMPLE;
			break;
		case 't' :
			ret = regnode(EXACTLY);
			regc('\t');
			regc('\0');
			*flagp |= (HASWIDTH|SIMPLE);
			break;
		case 'w':
			ret = regnode(ALNUM);
			*flagp |= HASWIDTH|SIMPLE;
			break;
		case 'W':
			ret = regnode(NALNUM);
			*flagp |= HASWIDTH|SIMPLE;
			break;
		default :
			/* Any other escaped character stands for itself. */
			ret = regnode(EXACTLY);
			regc(*regparse);
			regc('\0');
			*flagp |= HASWIDTH|SIMPLE;
		}
		regparse++;
		break;
	default: {
			register int len;
			register char ender;

			/* Step back onto the character the switch consumed. */
			regparse--;
			len = (int) strcspn(regparse, META);
			if (len <= 0) {
				FAIL("internal disaster");
			}
			ender = *(regparse+len);
			if (len > 1 && ISMULT(ender)) {
				len--;		/* Back off clear of ?+* operand. */
			}
			*flagp |= HASWIDTH;
			if (len == 1) {
				*flagp |= SIMPLE;
			}
			ret = regnode(EXACTLY);
			while (len > 0) {
				regc(*regparse++);
				len--;
			}
			regc('\0');
		}
		break;
	}

	return(ret);
}
Ejemplo n.º 9
0
// Compile the regular expression text `exp` into this object's program.
//
// The expression is parsed twice: a first pass that emits into a dummy
// location and only measures the program (and checks legality), then a
// second pass that emits into a buffer of exactly the measured size.
// Failures are reported with printf and leave the function early.  After
// a successful compile the optimization fields (regstart, reganch,
// regmust, regmlen) are filled in from the compiled program.
void ossimRegExp::compile (const char* exp) {
    const char*   node;
    const char*   best;
    unsigned long bestLen;
    int           flags;

    if (exp == NULL) {
        printf ("ossimRegExp::compile(): No expression supplied.\n");
        return;
    }

    // Pass one: measure the program and check legality.
    regparse = exp;
    regnpar = 1;
    regsize = 0L;
    regcode = &regdummy;
    regc(MAGIC);
    if (!reg(0, &flags)) {
        printf ("ossimRegExp::compile(): Error in compile.\n");
        return;
    }
    this->startp[0] = this->endp[0] = this->searchstring = NULL;

    // Small enough for pointer-storage convention?
    if (regsize >= 32767L) {	// Probably could be 65535L.
        printf ("ossimRegExp::compile(): Expression too big.\n");
        return;
    }

    // Replace any previous program with a buffer of the measured size.
    delete [] this->program;
    this->program = new char[regsize];
    this->progsize = (int) regsize;

    if (this->program == NULL) {
        printf ("ossimRegExp::compile(): Out of memory.\n");
        return;
    }

    // Pass two: emit the program for real.
    regparse = exp;
    regnpar = 1;
    regcode = this->program;
    regc(MAGIC);
    reg(0, &flags);

    // Worst-case defaults for the optimization fields.
    this->regstart = '\0';
    this->reganch = 0;
    this->regmust = NULL;
    this->regmlen = 0;

    node = this->program + 1;	// First BRANCH.
    if (OP(regnext(node)) == END) {
        // Only one top-level choice, so its first node is informative.
        node = OPERAND(node);

        // Starting-point info.
        if (OP(node) == BOL) {
            this->reganch++;
        } else if (OP(node) == EXACTLY) {
            this->regstart = *OPERAND(node);
        }

        // When the r.e. contains something expensive, pick the longest
        // literal string that must appear and record it as regmust.  Ties
        // go to the later string (hence >=): the regstart check already
        // covers the beginning of the r.e., so a later string adds more
        // checking.
        if (flags & SPSTART) {
            best = NULL;
            bestLen = 0;
            while (node != NULL) {
                if (OP(node) == EXACTLY && strlen(OPERAND(node)) >= bestLen) {
                    best = OPERAND(node);
                    bestLen = (unsigned long)strlen(OPERAND(node));
                }
                node = regnext(node);
            }
            this->regmust = best;
            this->regmlen = bestLen;
        }
    }
}
Ejemplo n.º 10
0
/*
   - regatom - the lowest level
 *
 * Consumes a single atom at regparse and emits its node.  *flagp starts
 * out as WORST and gains HASWIDTH, SIMPLE and/or SPSTART depending on the
 * atom.  Returns the emitted node; errors are reported through FAIL.
 *
 * Optimization:  a whole run of ordinary characters is swallowed into one
 * EXACTLY node, which is smaller to store and faster to run.
 */
static char *regatom (int * flagp)
{
    char *ret;
    int flags;

    *flagp = WORST;             /* Tentatively. */

    switch (*regparse++) {
        case CARET:
            ret = regnode(BOL);
            break;
        case DOLLAR:
            ret = regnode(EOL);
            break;
        case DOT:
            ret = regnode(ANY);
            *flagp |= HASWIDTH | SIMPLE;
            break;
        case LSHBRAC:
            ret = regnode(WORDSTART);
            break;
        case RSHBRAC:
            ret = regnode(WORDEND);
            break;
        case LSQBRAC:{
            int lo;
            int hi;

            /* A leading CARET complements the class. */
            if (*regparse == CARET) {
                ret = regnode(ANYBUT);
                regparse++;
            } else {
                ret = regnode(ANYOF);
            }

            /* A leading RSQBRAC or '-' is an ordinary member. */
            if (*regparse == RSQBRAC || *regparse == '-') {
                regc(*regparse++);
            }

            while (*regparse != '\0' && *regparse != RSQBRAC) {
                if (*regparse != '-') {
                    regc(*regparse++);
                    continue;
                }

                regparse++;
                if (*regparse == RSQBRAC || *regparse == '\0') {
                    /* Trailing '-' is a literal. */
                    regc('-');
                    continue;
                }

                /*
                 * regparse - 2 is the element before the '-'.  It has
                 * already been emitted, so start one past it.
                 */
                lo = (CHARBITS & *(regparse - 2)) + 1;
                hi = (CHARBITS & *(regparse));
                if (lo > hi + 1) {
                    FAIL("invalid [] range\n");
                }
                while (lo <= hi) {
                    regc(lo);
                    lo++;
                }
                regparse++;
            }

            /* Terminate the member list. */
            regc('\0');
            if (*regparse != RSQBRAC) {
                FAIL("unmatched []\n");
            }
            regparse++;
            *flagp |= HASWIDTH | SIMPLE;
        }
            break;
        case LBRAC:
            ret = reg(1, &flags);
            if (ret == (char *) NULL) {
                return ((char *) NULL);
            }
            *flagp |= flags & (HASWIDTH | SPSTART);
            break;
        case '\0':
        case OR_OP:
        case RBRAC:
            FAIL("internal urp\n"); /* Supposed to be caught earlier. */
            break;
        case ASTERIX:
            FAIL("* follows nothing\n");
            break;
        case PLUSS:
            FAIL("+ follows nothing\n");
            break;
        case QMARK:
            FAIL("? follows nothing\n");
            break;
        default:{
            int run;
            short ender;

            /* Step back onto the element the switch consumed. */
            regparse--;

            /*
             * Measure the run of ordinary elements: stop at the end of
             * the pattern, at anything carrying the SPECIAL bit, or at
             * RSQBRAC.
             */
            run = 0;
            while (regparse[run] &&
                    !(regparse[run] & SPECIAL) && regparse[run] != RSQBRAC) {
                run++;
            }
            if (run <= 0) {
                FAIL("unexpected ]\n");
            }
            ender = *(regparse + run);
            if (run > 1 && ISMULT(ender)) {
                run--;          /* Back off clear of ?+* operand. */
            }
            *flagp |= HASWIDTH;
            if (run == 1) {
                *flagp |= SIMPLE;
            }
            ret = regnode(EXACTLY);
            for (; run > 0; run--) {
                regc(*regparse++);
            }
            regc('\0');
        }
            break;
    }

    return (ret);
}
Ejemplo n.º 11
0
/*
 - regcomp - compile a regular expression into internal code
 *
 * We can't allocate space until we know how big the compiled form will be,
 * but we can't compile it (and thus know how big it is) until we've got a
 * place to put the code.  So we cheat:  we compile it twice, once with code
 * generation turned off and size counting turned on, and once "for real".
 * This also means that we don't allocate space until we are sure that the
 * thing really will compile successfully, and we never have to move the
 * code and thus invalidate pointers into it.  (Note that it has to be in
 * one piece because FREE() must be able to free it all.)
 *
 * Before either pass the pattern is copied into a temporary array of
 * shorts.  In that copy every operator character carries the SPECIAL bit
 * and every backslash escape has been resolved, so the parser can tell an
 * operator from a literal by looking at a single element.
 *
 * exp      - NUL-terminated pattern text; a NULL pointer is rejected.
 * excompat - non-zero: \( and \) are the grouping operators and bare
 *            parentheses are literals (as in unix ex); zero: the reverse.
 *
 * Returns the compiled program, or NULL when reg() rejects the pattern.
 * All other failures are reported through FAIL().  The temporary copy is
 * released on every path visible here.
 *
 * Beware that the optimization-preparation code in here knows about some
 * of the structure of the compiled regexp.
 */
regexp *regcomp (unsigned char * exp,
        int excompat)       /* \( \) operators like in unix ex */
{
    register regexp *r;
    register unsigned char *scan;
    register char *longest;
    register int len;
    int flags;
    short *exp2, *dest, c;

    if (!exp)
        FAIL("NULL argument\n");

    /*
     * One short per pattern byte plus the terminator is always enough:
     * translating an escape consumes two bytes and produces one element.
     */
    exp2 = (short *)
        DXALLOC((strlen((char *)exp) + 1) * sizeof(short),
                TAG_TEMPORARY, "regcomp: 1");
    /* Checked like the program allocation below instead of being used blindly. */
    if (exp2 == (short *) NULL)
        FAIL("out of space\n");

    /* Translate the pattern: tag operators, resolve backslash escapes. */
    for (scan = exp, dest = exp2; (c = *scan++);) {
        switch (c) {
            case '(':
            case ')':
                /* Bare parens are operators only outside ex-compat mode. */
                *dest++ = excompat ? c : c | SPECIAL;
                break;
            case '.':
            case '*':
            case '+':
            case '?':
            case '|':
            case '$':
            case '^':
            case '[':
            case ']':
                *dest++ = c | SPECIAL;
                break;
            case '\\':
                switch (c = *scan++) {
                    case 0:
                        FREE(exp2);
                        FAIL("Regular expression cannot end with '\\'.  Use \"\\\\\".\n");
                        break;
                    case '(':
                    case ')':
                        /* Escaped parens are operators only in ex-compat mode. */
                        *dest++ = excompat ? c | SPECIAL : c;
                        break;
                    case '<':
                    case '>':
                        /* \< and \> are always operators. */
                        *dest++ = c | SPECIAL;
                        break;
                    case '{':
                    case '}':
                        FREE(exp2);
                        FAIL("sorry, unimplemented operator\n");
                    case 'b':
                        *dest++ = '\b';
                        break;
                    case 't':
                        *dest++ = '\t';
                        break;
                    case 'r':
                        *dest++ = '\r';
                        break;
                    default:
                        /* Any other escaped character stands for itself. */
                        *dest++ = c;
                }
                break;
            default:
                *dest++ = c;
        }
    }
    *dest = 0;

    /* First pass: determine size, legality. */
    regparse = exp2;
    regnpar = 1;
    regsize = 0L;
    regcode = &regdummy;
    regc((char) MAGIC);
    if (reg(0, &flags) == (char *) NULL) {
        FREE(exp2);
        return ((regexp *) NULL);
    }

    /* Small enough for pointer-storage convention? */
    if (regsize >= 32767L)      /* Probably could be 65535L. */
    {
        FREE(exp2);
        FAIL("regexp too big\n");
    }

    /* Allocate space. */
    r = (regexp *) DXALLOC(sizeof(regexp) + (unsigned) regsize,
            TAG_TEMPORARY, "regcomp: 2");
    if (r == (regexp *) NULL) {
        FREE(exp2);
        FAIL("out of space\n");
    }

    /* Second pass: emit code. */
    regparse = exp2;
    regnpar = 1;
    regcode = (char *)(r->program);
    regc((char) MAGIC);
    if (reg(0, &flags) == NULL) {
        FREE(exp2);
        FREE(r);
        return ((regexp *) NULL);
    }

    /* Dig out information for optimizations. */
    r->regstart = '\0';         /* Worst-case defaults. */
    r->reganch = 0;
    r->regmust = NULL;
    r->regmlen = 0;
    scan = (unsigned char *)(r->program + 1);   /* First BRANCH. */
    if (OP(regnext((char *)scan)) == END) {     /* Only one top-level choice. */
        scan = OPERAND(scan);

        /* Starting-point info. */
        if (OP(scan) == EXACTLY)
            r->regstart = *OPERAND(scan);
        else if (OP(scan) == BOL)
            r->reganch++;

        /*
         * If there's something expensive in the r.e., find the longest
         * literal string that must appear and make it the regmust.  Resolve
         * ties in favor of later strings, since the regstart check works
         * with the beginning of the r.e. and avoiding duplication
         * strengthens checking.  Not a strong reason, but sufficient in the
         * absence of others.
         */
        if (flags & SPSTART) {
            longest = NULL;
            len = 0;
            for (; scan != NULL; scan = (unsigned char *)regnext((char *)scan)) {
                char *tmp = (char *)OPERAND(scan);
                int tlen;
                /* ">=" rather than ">" is what makes later strings win ties. */
                if (OP(scan) == EXACTLY && (tlen = strlen(tmp)) >= len) {
                    longest = tmp;
                    len = tlen;
                }
            }
            r->regmust = longest;
            r->regmlen = len;
        }
    }
    FREE((char *) exp2);
    return (r);
}
Ejemplo n.º 12
0
/*
 - regatom - the lowest level
 *
 * Parses one atom starting at preg->regparse and emits the node(s) for
 * it: an anchor, '.', a bracket set, a parenthesised group, one of the
 * special backslash escapes (\< \> \m \M \d \w \s), or a run of literal
 * characters.
 *
 * Optimization:  gobbles an entire sequence of ordinary characters so that
 * it can turn them into a single node, which is smaller to store and
 * faster to run.  Ordinary backslash escapes are decoded and folded into
 * that same EXACTLY node; only the special escapes listed above end it.
 *
 * *flagp is set to WORST and then gains HASWIDTH, SIMPLE and/or SPSTART
 * depending on the atom.
 *
 * Returns the value produced by regnode()/reg() for the atom, or 0 on
 * failure.  For the errors detected in this function preg->err is set
 * before returning 0.
 */
static int regatom(regex_t *preg, int *flagp)
{
	int ret;
	int flags;
	int nocase = (preg->cflags & REG_ICASE);

	int ch;
	/* Decode the first character; n is its length in bytes. */
	int n = reg_utf8_tounicode_case(preg->regparse, &ch, nocase);

	*flagp = WORST;		/* Tentatively. */

	/* Consume it now; the literal-run case below backs up by n again. */
	preg->regparse += n;
	switch (ch) {
	/* FIXME: these chars only have meaning at beg/end of pat? */
	case '^':
		ret = regnode(preg, BOL);
		break;
	case '$':
		ret = regnode(preg, EOL);
		break;
	case '.':
		ret = regnode(preg, ANY);
		*flagp |= HASWIDTH|SIMPLE;
		break;
	case '[': {
			/* Work on a local cursor; regparse is updated once the set is done. */
			const char *pattern = preg->regparse;

			if (*pattern == '^') {	/* Complement of range. */
				ret = regnode(preg, ANYBUT);
				pattern++;
			} else
				ret = regnode(preg, ANYOF);

			/* Special case. If the first char is ']' or '-', it is part of the set */
			if (*pattern == ']' || *pattern == '-') {
				reg_addrange(preg, *pattern, *pattern);
				pattern++;
			}

			while (*pattern && *pattern != ']') {
				/* Is this a range? a-z */
				int start;
				int end;

				pattern += reg_utf8_tounicode_case(pattern, &start, nocase);
				if (start == '\\') {
					pattern += reg_decode_escape(pattern, &start);
					/* An escape that decodes to 0 is rejected. */
					if (start == 0) {
						preg->err = REG_ERR_NULL_CHAR;
						return 0;
					}
				}
				/* A '-' is a range operator only when something other than ']' follows it. */
				if (pattern[0] == '-' && pattern[1] && pattern[1] != ']') {
					/* skip '-' */
					pattern += utf8_tounicode(pattern, &end);
					pattern += reg_utf8_tounicode_case(pattern, &end, nocase);
					if (end == '\\') {
						pattern += reg_decode_escape(pattern, &end);
						if (end == 0) {
							preg->err = REG_ERR_NULL_CHAR;
							return 0;
						}
					}

					reg_addrange(preg, start, end);
					continue;
				}
				/* Named classes: only :alpha:, :alnum: and :space: are recognised. */
				if (start == '[') {
					if (strncmp(pattern, ":alpha:]", 8) == 0) {
						/* With REG_ICASE only the upper-case range is added. */
						if ((preg->cflags & REG_ICASE) == 0) {
							reg_addrange(preg, 'a', 'z');
						}
						reg_addrange(preg, 'A', 'Z');
						pattern += 8;
						continue;
					}
					if (strncmp(pattern, ":alnum:]", 8) == 0) {
						if ((preg->cflags & REG_ICASE) == 0) {
							reg_addrange(preg, 'a', 'z');
						}
						reg_addrange(preg, 'A', 'Z');
						reg_addrange(preg, '0', '9');
						pattern += 8;
						continue;
					}
					if (strncmp(pattern, ":space:]", 8) == 0) {
						reg_addrange_str(preg, " \t\r\n\f\v");
						pattern += 8;
						continue;
					}
				}
				/* Not a range, so just add the char */
				reg_addrange(preg, start, start);
			}
			/* Terminate the list of ranges. */
			regc(preg, '\0');

			/* Step over the closing ']'; a missing one is not reported here. */
			if (*pattern) {
				pattern++;
			}
			preg->regparse = pattern;

			*flagp |= HASWIDTH|SIMPLE;
		}
		break;
	case '(':
		ret = reg(preg, 1, &flags);
		if (ret == 0)
			return 0;
		/* Only these two properties propagate out of a group. */
		*flagp |= flags&(HASWIDTH|SPSTART);
		break;
	case '\0':
	case '|':
	case ')':
		preg->err = REG_ERR_INTERNAL;
		return 0;	/* Supposed to be caught earlier. */
	case '?':
	case '+':
	case '*':
	case '{':
		preg->err = REG_ERR_COUNT_FOLLOWS_NOTHING;
		return 0;
	case '\\':
		switch (*preg->regparse++) {
		case '\0':
			preg->err = REG_ERR_TRAILING_BACKSLASH;
			return 0;
		case '<':
		case 'm':
			ret = regnode(preg, WORDA);
			break;
		case '>':
		case 'M':
			ret = regnode(preg, WORDZ);
			break;
		case 'd':
			ret = regnode(preg, ANYOF);
			reg_addrange(preg, '0', '9');
			regc(preg, '\0');
			*flagp |= HASWIDTH|SIMPLE;
			break;
		case 'w':
			ret = regnode(preg, ANYOF);
			if ((preg->cflags & REG_ICASE) == 0) {
				reg_addrange(preg, 'a', 'z');
			}
			reg_addrange(preg, 'A', 'Z');
			reg_addrange(preg, '0', '9');
			reg_addrange(preg, '_', '_');
			regc(preg, '\0');
			*flagp |= HASWIDTH|SIMPLE;
			break;
		case 's':
			ret = regnode(preg, ANYOF);
			reg_addrange_str(preg," \t\r\n\f\v");
			regc(preg, '\0');
			*flagp |= HASWIDTH|SIMPLE;
			break;
		/* FIXME: Someday handle \1, \2, ... */
		default:
			/* Handle general quoted chars in exact-match routine */
			/* Back up to include the backslash */
			preg->regparse--;
			goto de_fault;
		}
		break;
	de_fault:
	default: {
			/*
			 * Encode a string of characters to be matched exactly.
			 */
			int added = 0;	/* number of characters emitted into this node */

			/* Back up to pick up the first char of interest */
			preg->regparse -= n;

			ret = regnode(preg, EXACTLY);

			/* Note that a META operator such as ? or * consumes the
			 * preceding char.
			 * Thus we must be careful to look ahead by 2 and add the
			 * last char as its own EXACTLY if necessary
			 */

			/* Until end of string or a META char is reached */
			while (*preg->regparse && strchr(META, *preg->regparse) == NULL) {
				n = reg_utf8_tounicode_case(preg->regparse, &ch, (preg->cflags & REG_ICASE));
				if (ch == '\\' && preg->regparse[n]) {
					/* Non-trailing backslash.
					 * Is this a special escape, or a regular escape?
					 */
					if (strchr("<>mMwds", preg->regparse[n])) {
						/* A special escape. All done with EXACTLY */
						break;
					}
					/* Decode it. Note that we add the length for the escape
					 * sequence to the length for the backslash so we can skip
					 * the entire sequence, or not as required.
					 */
					n += reg_decode_escape(preg->regparse + n, &ch);
					if (ch == 0) {
						preg->err = REG_ERR_NULL_CHAR;
						return 0;
					}
				}

				/* Now we have one char 'ch' of length 'n'.
				 * Check to see if the following char is a MULT
				 */

				if (ISMULT(preg->regparse[n])) {
					/* Yes. But do we already have some EXACTLY chars? */
					if (added) {
						/* Yes, so return what we have and pick up the current char next time around */
						break;
					}
					/* No, so add this single char and finish */
					regc(preg, ch);
					added++;
					preg->regparse += n;
					break;
				}

				/* No, so just add this char normally */
				regc(preg, ch);
				added++;
				preg->regparse += n;
			}
			/* Terminate the literal string. */
			regc(preg, '\0');

			*flagp |= HASWIDTH;
			/* Only a one-character literal is marked SIMPLE. */
			if (added == 1)
				*flagp |= SIMPLE;
			break;
		}
		break;
	}

	return(ret);
}
Ejemplo n.º 13
0
/*
 - regcomp - compile a regular expression into internal code
 *
 * This variant compiles in a single pass: an initial program buffer is
 * allocated from the pattern length and reg() emits straight into it
 * through preg.  Program positions are integer offsets, and the magic
 * value stored at offset 0 guarantees that no real node has offset 0, so
 * 0 can serve as the "no node" value.
 *
 * preg   - compiled-expression object; it is cleared first, so any state
 *          it held before the call is discarded.
 * exp    - NUL-terminated pattern; NULL yields REG_ERR_NULL_ARGUMENT.
 * cflags - compile flags, stored in preg->cflags.
 *
 * Returns 0 on success.  When reg() fails, preg->err is returned; the
 * other failures go through FAIL().  preg->program is not released here
 * on failure.
 * NOTE(review): presumably the caller is expected to call regfree() even
 * after a failed compile -- confirm against the callers.
 *
 * Beware that the optimization-preparation code in here knows about some
 * of the structure of the compiled regexp.
 */
int regcomp(regex_t *preg, const char *exp, int cflags)
{
	int scan;
	int longest;
	unsigned len;
	int flags;

	memset(preg, 0, sizeof(*preg));

	if (exp == NULL)
		FAIL(preg, REG_ERR_NULL_ARGUMENT);

#ifdef DEBUG
	/* Trace only after the NULL check: "%s" must never be handed NULL. */
	fprintf(stderr, "Compiling: '%s'\n", exp);
#endif

	preg->cflags = cflags;
	preg->regparse = exp;

	/* Allocate the initial program buffer, sized from the pattern length. */
	preg->proglen = (strlen(exp) + 1) * 5;
	preg->program = malloc(preg->proglen * sizeof(int));
	if (preg->program == NULL)
		FAIL(preg, REG_ERR_NOMEM);

	/* Note that since we store a magic value as the first item in the program,
	 * program offsets will never be 0
	 */
	regc(preg, REG_MAGIC);
	if (reg(preg, 0, &flags) == 0) {
		return preg->err;
	}

	/* Reject patterns with too many parenthesised subexpressions. */
	if (preg->re_nsub >= REG_MAX_PAREN)
		FAIL(preg,REG_ERR_TOO_BIG);

	/* Dig out information for optimizations. */
	preg->regstart = 0;	/* Worst-case defaults. */
	preg->reganch = 0;
	preg->regmust = 0;
	preg->regmlen = 0;
	scan = 1;			/* First BRANCH. */
	if (OP(preg, regnext(preg, scan)) == END) {		/* Only one top-level choice. */
		scan = OPERAND(scan);

		/* Starting-point info. */
		if (OP(preg, scan) == EXACTLY) {
			preg->regstart = preg->program[OPERAND(scan)];
		}
		else if (OP(preg, scan) == BOL)
			preg->reganch++;

		/*
		 * If there's something expensive in the r.e., find the
		 * longest literal string that must appear and make it the
		 * regmust.  Resolve ties in favor of later strings, since
		 * the regstart check works with the beginning of the r.e.
		 * and avoiding duplication strengthens checking.  Not a
		 * strong reason, but sufficient in the absence of others.
		 */
		if (flags&SPSTART) {
			longest = 0;
			len = 0;
			for (; scan != 0; scan = regnext(preg, scan)) {
				if (OP(preg, scan) == EXACTLY) {
					int plen = str_int_len(preg->program + OPERAND(scan));
					/* ">=" so that a later string of equal length wins. */
					if ((unsigned)plen >= len) {
						longest = OPERAND(scan);
						len = plen;
					}
				}
			}
			preg->regmust = longest;
			preg->regmlen = len;
		}
	}

#ifdef DEBUG
	regdump(preg);
#endif

	return 0;
}
Ejemplo n.º 14
0
/*
 - regcomp - compile a regular expression into internal code
 *
 * We can't allocate space until we know how big the compiled form will be,
 * but we can't compile it (and thus know how big it is) until we've got a
 * place to put the code.  So we cheat:  we compile it twice, once with code
 * generation turned off and size counting turned on, and once "for real".
 * This also means that we don't allocate space until we are sure that the
 * thing really will compile successfully, and we never have to move the
 * code and thus invalidate pointers into it.  (Note that it has to be in
 * one piece because free() must be able to free it all.)
 *
 * exp - NUL-terminated pattern; a NULL pointer is rejected through FAIL().
 *
 * Returns a malloc()ed program that the caller releases with a single
 * free(), or NULL when either reg() pass rejects the pattern.  Nothing
 * stays allocated on the failure paths visible here.
 *
 * Beware that the optimization-preparation code in here knows about some
 * of the structure of the compiled regexp.
 */
regexp *
regcomp( const char *exp )
{
	register regexp *r;
	register char *scan;
	register char *longest;
	register unsigned len;
	int flags;

	if (exp == NULL)
		FAIL("NULL argument");

	/* First pass: determine size, legality. */
#ifdef notdef
	if (exp[0] == '.' && exp[1] == '*') exp += 2;  /* aid grep */
#endif
	regparse = (char *)exp;
	regnpar = 1;
	regsize = 0L;
	regcode = &regdummy;
	regc(MAGIC);
	if (reg(0, &flags) == NULL)
		return(NULL);

	/* Small enough for pointer-storage convention? */
	if (regsize >= 32767L)		/* Probably could be 65535L. */
		FAIL("regexp too big");

	/* Allocate space. */
	r = (regexp *)malloc(sizeof(regexp) + (unsigned)regsize);
	if (r == NULL)
		FAIL("out of space");

	/* Second pass: emit code. */
	regparse = (char *)exp;
	regnpar = 1;
	regcode = r->program;
	regc(MAGIC);
	if (reg(0, &flags) == NULL) {
		/* The program buffer is ours until it is returned; don't leak it. */
		free(r);
		return(NULL);
	}

	/* Dig out information for optimizations. */
	r->regstart = '\0';	/* Worst-case defaults. */
	r->reganch = 0;
	r->regmust = NULL;
	r->regmlen = 0;
	scan = r->program+1;			/* First BRANCH. */
	if (OP(regnext(scan)) == END) {		/* Only one top-level choice. */
		scan = OPERAND(scan);

		/* Starting-point info. */
		if (OP(scan) == EXACTLY)
			r->regstart = *OPERAND(scan);
		else if (OP(scan) == BOL)
			r->reganch++;

		/*
		 * If there's something expensive in the r.e., find the
		 * longest literal string that must appear and make it the
		 * regmust.  Resolve ties in favor of later strings, since
		 * the regstart check works with the beginning of the r.e.
		 * and avoiding duplication strengthens checking.  Not a
		 * strong reason, but sufficient in the absence of others.
		 */
		if (flags&SPSTART) {
			longest = NULL;
			len = 0;
			for (; scan != NULL; scan = regnext(scan)) {
				size_t oplen;

				if (OP(scan) != EXACTLY)
					continue;
				/* Measure each literal once instead of twice. */
				oplen = strlen(OPERAND(scan));
				if (oplen >= len) {
					longest = OPERAND(scan);
					len = (unsigned)oplen;
				}
			}
			r->regmust = longest;
			r->regmlen = len;
		}
	}

	return(r);
}
Ejemplo n.º 15
0
/*
 - regatom - the lowest level
 *
 * Optimization:  gobbles an entire sequence of ordinary characters so that
 * it can turn them into a single node, which is smaller to store and
 * faster to run.  Backslashed characters are exceptions, each becoming a
 * separate node; the code is simpler that way and it's not worth fixing.
 */
char* ossimRegExp::regatom (int *flagp) {
    char* ret;
    int   flags;

    *flagp = WORST;		// Tentatively.

    switch (*regparse++) {
    case '^':
        ret = regnode(BOL);
        break;
    case '$':
        ret = regnode(EOL);
        break;
    case '.':
        ret = regnode(ANY);
        *flagp |= HASWIDTH | SIMPLE;
        break;
    case '[': {
        int    rxpclass;
        int    rxpclassend;

        if (*regparse == '^') {	// Complement of range.
            ret = regnode(ANYBUT);
            regparse++;
        }
        else
            ret = regnode(ANYOF);
        if (*regparse == ']' || *regparse == '-')
            regc(*regparse++);
        while (*regparse != '\0' && *regparse != ']') {
            if (*regparse == '-') {
                regparse++;
                if (*regparse == ']' || *regparse == '\0')
                    regc('-');
                else {
                    rxpclass = UCHARAT(regparse - 2) + 1;
                    rxpclassend = UCHARAT(regparse);
                    if (rxpclass > rxpclassend + 1) {
                        //RAISE Error, SYM(ossimRegExp), SYM(Invalid_Range),
                        printf ("ossimRegExp::compile(): Invalid range in [].\n");
                        return 0;
                    }
                    for (; rxpclass <= rxpclassend; rxpclass++)
                        regc(rxpclass);
                    regparse++;
                }
            }
            else
                regc(*regparse++);
        }
        regc('\0');
        if (*regparse != ']') {
            //RAISE Error, SYM(ossimRegExp), SYM(Unmatched_Bracket),
            printf ("ossimRegExp::compile(): Unmatched [].\n");
            return 0;
        }
        regparse++;
        *flagp |= HASWIDTH | SIMPLE;
    }
    break;
    case '(':
        ret = reg(1, &flags);
        if (ret == NULL)
            return (NULL);
        *flagp |= flags & (HASWIDTH | SPSTART);
        break;
    case '\0':
    case '|':
    case ')':
        //RAISE Error, SYM(ossimRegExp), SYM(Internal_Error),
        printf ("ossimRegExp::compile(): Internal error.\n"); // Never here
        return 0;
    case '?':
    case '+':
    case '*':
        //RAISE Error, SYM(ossimRegExp), SYM(No_Operand),
        printf ("ossimRegExp::compile(): ?+* follows nothing.\n");
        return 0;
    case '\\':
        if (*regparse == '\0') {
            //RAISE Error, SYM(ossimRegExp), SYM(Trailing_Backslash),
            printf ("ossimRegExp::compile(): Trailing backslash.\n");
            return 0;
        }
        ret = regnode(EXACTLY);
        regc(*regparse++);
        regc('\0');
        *flagp |= HASWIDTH | SIMPLE;
        break;
    default: {
        int    len;
        char   ender;

        regparse--;
        len = (int)strcspn(regparse, META);
        if (len <= 0) {
            //RAISE Error, SYM(ossimRegExp), SYM(Internal_Error),
            printf ("ossimRegExp::compile(): Internal error.\n");
            return 0;
        }
        ender = *(regparse + len);
        if (len > 1 && ISMULT(ender))
            len--;	// Back off clear of ?+* operand.
        *flagp |= HASWIDTH;
        if (len == 1)
            *flagp |= SIMPLE;
        ret = regnode(EXACTLY);
        while (len > 0) {
            regc(*regparse++);
            len--;
        }
        regc('\0');
    }
    break;
    }
    return (ret);
}
Ejemplo n.º 16
0
/*
 - regatom - the lowest level
 *
 * Parses one atom at regparse and emits its node: an anchor, '.', a
 * bracket set, a parenthesised sub-expression, a word-boundary escape
 * (\< or \>), or a string of characters to match exactly.
 *
 * Optimization:  gobbles an entire sequence of ordinary characters so that
 * it can turn them into a single node, which is smaller to store and
 * faster to run.  In this variant ordinary backslash-quoted characters are
 * gathered into that same EXACTLY string; only \< and \> get nodes of
 * their own.
 *
 * *flagp is set to WORST and then gains HASWIDTH, SIMPLE and/or SPSTART.
 * Returns the node for the atom; malformed input is reported through
 * FAIL(), and a NULL from reg() is passed straight back.
 */
static char *
regatom( int *flagp )
{
	register char *ret;
	int flags;

	*flagp = WORST;		/* Tentatively. */

	switch (*regparse++) {
	/* FIXME: these chars only have meaning at beg/end of pat? */
	case '^':
		ret = regnode(BOL);
		break;
	case '$':
		ret = regnode(EOL);
		break;
	case '.':
		ret = regnode(ANY);
		*flagp |= HASWIDTH|SIMPLE;
		break;
	case '[': {
			register int classr;
			register int classend;

			if (*regparse == '^') {	/* Complement of range. */
				ret = regnode(ANYBUT);
				regparse++;
			} else
				ret = regnode(ANYOF);
			/* A leading ']' or '-' is an ordinary member of the set. */
			if (*regparse == ']' || *regparse == '-')
				regc(*regparse++);
			while (*regparse != '\0' && *regparse != ']') {
				if (*regparse == '-') {
					regparse++;
					/* A '-' with nothing after it is a literal dash. */
					if (*regparse == ']' || *regparse == '\0')
						regc('-');
					else {
						/* x-y: x is already in the set, so start one past it. */
						classr = UCHARAT(regparse-2)+1;
						classend = UCHARAT(regparse);
						if (classr > classend+1)
							FAIL("invalid [] range");
						for (; classr <= classend; classr++)
							regc(classr);
						regparse++;
					}
				} else
					regc(*regparse++);
			}
			/* Terminate the member list. */
			regc('\0');
			if (*regparse != ']')
				FAIL("unmatched []");
			regparse++;
			*flagp |= HASWIDTH|SIMPLE;
		}
		break;
	case '(':
		ret = reg(1, &flags);
		if (ret == NULL)
			return(NULL);
		/* Only these two properties propagate out of a group. */
		*flagp |= flags&(HASWIDTH|SPSTART);
		break;
	case '\0':
	case '|':
	case '\n':
	case ')':
		FAIL("internal urp");	/* Supposed to be caught earlier. */
		break;
	case '?':
	case '+':
	case '*':
		FAIL("?+* follows nothing");
		break;
	case '\\':
		switch (*regparse++) {
		case '\0':
			FAIL("trailing \\");
			break;
		case '<':
			ret = regnode(WORDA);
			break;
		case '>':
			ret = regnode(WORDZ);
			break;
		/* FIXME: Someday handle \1, \2, ... */
		default:
			/* Handle general quoted chars in exact-match routine */
			goto de_fault;
		}
		break;
	de_fault:
	default:
		/*
		 * Encode a string of characters to be matched exactly.
		 *
		 * This is a bit tricky due to quoted chars and due to
		 * '*', '+', and '?' taking the SINGLE char previous
		 * as their operand.
		 *
		 * On entry, the char at regparse[-1] is going to go
		 * into the string, no matter what it is.  (It could be
		 * following a \ if we are entered from the '\' case.)
		 * 
		 * Basic idea is to pick up a good char in  ch  and
		 * examine the next char.  If it's *+? then we twiddle.
		 * If it's \ then we frozzle.  If it's other magic char
		 * we push  ch  and terminate the string.  If none of the
		 * above, we push  ch  on the string and go around again.
		 *
		 *  regprev  is used to remember where "the current char"
		 * starts in the string, if due to a *+? we need to back
		 * up and put the current char in a separate, 1-char, string.
		 * When  regprev  is NULL,  ch  is the only char in the
		 * string; this is used in *+? handling, and in setting
		 * flags |= SIMPLE at the end.
		 */
		{
			char *regprev;
			register char ch;

			regparse--;			/* Look at cur char */
			ret = regnode(EXACTLY);
			for ( regprev = 0 ; ; ) {
				ch = *regparse++;	/* Get current char */
				switch (*regparse) {	/* look at next one */

				default:
					regc(ch);	/* Add cur to string */
					break;

				case '.': case '[': case '(':
				case ')': case '|': case '\n':
				case '$': case '^':
				case '\0':
				/* FIXME, $ and ^ should not always be magic */
				magic:
					regc(ch);	/* dump cur char */
					goto done;	/* and we are done */

				case '?': case '+': case '*':
					if (!regprev) 	/* If just ch in str, */
						goto magic;	/* use it */
					/* End mult-char string one early */
					regparse = regprev; /* Back up parse */
					goto done;

				case '\\':
					regc(ch);	/* Cur char OK */
					switch (regparse[1]){ /* Look after \ */
					case '\0':
					case '<':
					case '>':
					/* FIXME: Someday handle \1, \2, ... */
						goto done; /* Not quoted */
					default:
						/* Backup point is the backslash; the scan
						 * point is the char after it. */
						regprev = regparse;
						regparse++; 
						continue;	/* NOT break; */
					}
				}
				regprev = regparse;	/* Set backup point */
			}
		done:
			/* Terminate the literal string. */
			regc('\0');
			*flagp |= HASWIDTH;
			if (!regprev)		/* One char? */
				*flagp |= SIMPLE;
		}
		break;
	}

	return(ret);
}
Ejemplo n.º 17
0
/*
 - regcomp - compile a regular expression into internal code
 *
 * We can't allocate space until we know how big the compiled form will be,
 * but we can't compile it (and thus know how big it is) until we've got a
 * place to put the code.  So we cheat:  we compile it twice, once with code
 * generation turned off and size counting turned on, and once "for real".
 * This also means that we don't allocate space until we are sure that the
 * thing really will compile successfully, and we never have to move the
 * code and thus invalidate pointers into it.  (Note that it has to be in
 * one piece because free() must be able to free it all.)
 *
 * context - context the program buffer is allocated from.
 * exp     - NUL-terminated pattern.
 * pregexp - receives the compiled program on success; it is set to NULL
 *           before any work is done and stays NULL on failure.
 *
 * Returns kPGPError_NoErr on success, kPGPError_BadParams when the
 * compiled form would be too large, and kPGPError_OutOfMemory when the
 * allocation fails or when either reg() pass rejects the pattern (that
 * code is kept for the rejection case so existing callers see no change).
 * All compile state lives in a local regcompState.
 *
 * Beware that the optimization-preparation code in here knows about some
 * of the structure of the compiled regexp.
 */
	PGPError
pgpRegComp(PGPContextRef context, char const *exp, regexp **pregexp)
{
	regexp *r;
	char const *scan;
	char const *longest;
	int len;
	int flags;
	regcompState s_rcs;
	regcompState *rcs = &s_rcs;

	PGPValidateContext( context );
	PGPValidatePtr( exp );
	PGPValidatePtr( pregexp );

	*pregexp = NULL;

	pgpClearMemory( &s_rcs, sizeof(s_rcs) );

	/* First pass: determine size, legality. */
	rcs->regparse = exp;
	rcs->regnpar = 1;
	rcs->regsize = 0L;
	rcs->regcode = &regdummy;
	regc(rcs, MAGIC);
	if (reg(rcs, 0, &flags) == NULL)
		return(kPGPError_OutOfMemory);

	/* Small enough for pointer-storage convention? */
	if (rcs->regsize >= 32767L)		/* Probably could be 65535L. */
		return(kPGPError_BadParams);

	/* Allocate space. */
	r = (regexp *)pgpContextMemAlloc(context,
							 sizeof(regexp) + (unsigned)rcs->regsize, 0);
	if (r == NULL)
		return kPGPError_OutOfMemory;

	/* Second pass: emit code. */
	rcs->regparse = exp;
	rcs->regnpar = 1;
	rcs->regcode = r->program;
	regc(rcs, MAGIC);
	if (reg(rcs, 0, &flags) == NULL) {
		/* The buffer is not handed to the caller on failure; release it. */
		(void)pgpContextMemFree(context, r);
		return(kPGPError_OutOfMemory);
	}

	/* Dig out information for optimizations. */
	r->regstart = '\0';	/* Worst-case defaults. */
	r->reganch = 0;
	r->regmust = NULL;
	r->regmlen = 0;
	scan = r->program+1;			/* First BRANCH. */
	if (OP(regnext(scan)) == END) {	/* Only one top-level choice. */
		scan = OPERAND(scan);

		/* Starting-point info. */
		if (OP(scan) == EXACTLY)
			r->regstart = *OPERAND(scan);
		else if (OP(scan) == BOL)
			r->reganch++;

		/*
		 * If there's something expensive in the r.e., find the
		 * longest literal string that must appear and make it the
		 * regmust.  Resolve ties in favor of later strings, since
		 * the regstart check works with the beginning of the r.e.
		 * and avoiding duplication strengthens checking.  Not a
		 * strong reason, but sufficient in the absence of others.
		 */
		if (flags&SPSTART) {
			longest = NULL;
			len = 0;
			for (; scan != NULL; scan = regnext(scan)) {
				size_t oplen;

				if (OP(scan) != EXACTLY)
					continue;
				/* Measure each literal once instead of twice. */
				oplen = strlen(OPERAND(scan));
				if (oplen >= (unsigned)len) {
					longest = OPERAND(scan);
					len = (int)oplen;
				}
			}
			r->regmust = longest;
			r->regmlen = len;
		}
	}

	*pregexp = r;
	return(kPGPError_NoErr);
}