Example #1
0
/*
 * eclass - supply cvec for an equivalence class
 * Must include case counterparts on request.
 */
static struct cvec *
eclass(struct vars * v,			/* context */
	   celt c,					/* Collating element representing the
								 * equivalence class. */
	   int cases)				/* all cases? */
{
	struct cvec *cv;

	/* crude fake equivalence class for testing */
	if ((v->cflags & REG_FAKE) && c == 'x')
	{
		cv = getcvec(v, 4, 0);
		addchr(cv, (chr) 'x');
		addchr(cv, (chr) 'y');
		if (cases)
		{
			addchr(cv, (chr) 'X');
			addchr(cv, (chr) 'Y');
		}
		return cv;
	}

	/* otherwise, none */
	if (cases)
		return allcases(v, c);
	cv = getcvec(v, 1, 0);
	assert(cv != NULL);
	addchr(cv, (chr) c);
	return cv;
}
/*
 - allcases - supply cvec for all case counterparts of a chr (including itself)
 * This is a shortcut, preferably an efficient one, for simple characters;
 * messy cases are done via range().
 ^ static struct cvec *allcases(struct vars *, pchr);
 */
static struct cvec *
allcases(
    struct vars *v,		/* context */
    pchr pc)			/* character to get case equivs of */
{
    struct cvec *cv;
    chr c = (chr)pc;
    chr lc, uc, tc;

    lc = Tcl_UniCharToLower((chr)c);
    uc = Tcl_UniCharToUpper((chr)c);
    tc = Tcl_UniCharToTitle((chr)c);

    if (tc != uc) {
	cv = getcvec(v, 3, 0);
	addchr(cv, tc);
    } else {
	cv = getcvec(v, 2, 0);
    }
    addchr(cv, lc);
    if (lc != uc) {
	addchr(cv, uc);
    }
    return cv;
}
/*
 - range - supply cvec for a range, including legality check
 ^ static struct cvec *range(struct vars *, celt, celt, int);
 */
static struct cvec *
range(
    struct vars *v,		/* context */
    celt a,			/* range start */
    celt b,			/* range end, might equal a */
    int cases)			/* case-independent? */
{
    int nchrs;
    struct cvec *cv;
    celt c, lc, uc, tc;

    if (a != b && !before(a, b)) {
	ERR(REG_ERANGE);
	return NULL;
    }

    if (!cases) {		/* easy version */
	cv = getcvec(v, 0, 1);
	NOERRN();
	addrange(cv, a, b);
	return cv;
    }

    /*
     * When case-independent, it's hard to decide when cvec ranges are usable,
     * so for now at least, we won't try. We allocate enough space for two
     * case variants plus a little extra for the two title case variants.
     */

    nchrs = (b - a + 1)*2 + 4;

    cv = getcvec(v, nchrs, 0);
    NOERRN();

    for (c=a; c<=b; c++) {
	addchr(cv, c);
	lc = Tcl_UniCharToLower((chr)c);
	uc = Tcl_UniCharToUpper((chr)c);
	tc = Tcl_UniCharToTitle((chr)c);
	if (c != lc) {
	    addchr(cv, lc);
	}
	if (c != uc) {
	    addchr(cv, uc);
	}
	if (c != tc && tc != uc) {
	    addchr(cv, tc);
	}
    }

    return cv;
}
Example #4
0
/*
 * allcases - supply cvec for all case counterparts of a chr (including itself)
 *
 * This is a shortcut, preferably an efficient one, for simple characters;
 * messy cases are done via range().
 */
static struct cvec *
allcases(struct vars * v,		/* context */
		 chr pc)				/* character to get case equivs of */
{
	struct cvec *cv;
	chr			c = (chr) pc;
	chr			lc,
				uc;

	lc = pg_wc_tolower((chr) c);
	uc = pg_wc_toupper((chr) c);

	cv = getcvec(v, 2, 0);
	addchr(cv, lc);
	if (lc != uc)
		addchr(cv, uc);
	return cv;
}
Example #5
0
/*
 * cclass - supply cvec for a character class
 *
 * Must include case counterparts on request.
 */
static struct cvec *
cclass(struct vars * v,			/* context */
	   const chr *startp,		/* where the name starts */
	   const chr *endp,			/* just past the end of the name */
	   int cases)				/* case-independent? */
{
	size_t		len;
	struct cvec *cv = NULL;
	const char **namePtr;
	int			i,
				index;

	/*
	 * The following arrays define the valid character class names.
	 */

	static const char *classNames[] = {
		"alnum", "alpha", "ascii", "blank", "cntrl", "digit", "graph",
		"lower", "print", "punct", "space", "upper", "xdigit", NULL
	};

	enum classes
	{
		CC_ALNUM, CC_ALPHA, CC_ASCII, CC_BLANK, CC_CNTRL, CC_DIGIT, CC_GRAPH,
		CC_LOWER, CC_PRINT, CC_PUNCT, CC_SPACE, CC_UPPER, CC_XDIGIT
	};

	/*
	 * Map the name to the corresponding enumerated value.
	 */
	len = endp - startp;
	index = -1;
	for (namePtr = classNames, i = 0; *namePtr != NULL; namePtr++, i++)
	{
		if (strlen(*namePtr) == len &&
			pg_char_and_wchar_strncmp(*namePtr, startp, len) == 0)
		{
			index = i;
			break;
		}
	}
	if (index == -1)
	{
		ERR(REG_ECTYPE);
		return NULL;
	}

	/*
	 * Remap lower and upper to alpha if the match is case insensitive.
	 */

	if (cases &&
		((enum classes) index == CC_LOWER ||
		 (enum classes) index == CC_UPPER))
		index = (int) CC_ALPHA;

	/*
	 * Now compute the character class contents.
	 *
	 * For the moment, assume that only char codes < 256 can be in these
	 * classes.
	 */

	switch ((enum classes) index)
	{
		case CC_PRINT:
			cv = getcvec(v, UCHAR_MAX, 0);
			if (cv)
			{
				for (i = 0; i <= UCHAR_MAX; i++)
				{
					if (pg_wc_isprint((chr) i))
						addchr(cv, (chr) i);
				}
			}
			break;
		case CC_ALNUM:
			cv = getcvec(v, UCHAR_MAX, 0);
			if (cv)
			{
				for (i = 0; i <= UCHAR_MAX; i++)
				{
					if (pg_wc_isalnum((chr) i))
						addchr(cv, (chr) i);
				}
			}
			break;
		case CC_ALPHA:
			cv = getcvec(v, UCHAR_MAX, 0);
			if (cv)
			{
				for (i = 0; i <= UCHAR_MAX; i++)
				{
					if (pg_wc_isalpha((chr) i))
						addchr(cv, (chr) i);
				}
			}
			break;
		case CC_ASCII:
			cv = getcvec(v, 0, 1);
			if (cv)
				addrange(cv, 0, 0x7f);
			break;
		case CC_BLANK:
			cv = getcvec(v, 2, 0);
			addchr(cv, '\t');
			addchr(cv, ' ');
			break;
		case CC_CNTRL:
			cv = getcvec(v, 0, 2);
			addrange(cv, 0x0, 0x1f);
			addrange(cv, 0x7f, 0x9f);
			break;
		case CC_DIGIT:
			cv = getcvec(v, 0, 1);
			if (cv)
				addrange(cv, (chr) '0', (chr) '9');
			break;
		case CC_PUNCT:
			cv = getcvec(v, UCHAR_MAX, 0);
			if (cv)
			{
				for (i = 0; i <= UCHAR_MAX; i++)
				{
					if (pg_wc_ispunct((chr) i))
						addchr(cv, (chr) i);
				}
			}
			break;
		case CC_XDIGIT:
			cv = getcvec(v, 0, 3);
			if (cv)
			{
				addrange(cv, '0', '9');
				addrange(cv, 'a', 'f');
				addrange(cv, 'A', 'F');
			}
			break;
		case CC_SPACE:
			cv = getcvec(v, UCHAR_MAX, 0);
			if (cv)
			{
				for (i = 0; i <= UCHAR_MAX; i++)
				{
					if (pg_wc_isspace((chr) i))
						addchr(cv, (chr) i);
				}
			}
			break;
		case CC_LOWER:
			cv = getcvec(v, UCHAR_MAX, 0);
			if (cv)
			{
				for (i = 0; i <= UCHAR_MAX; i++)
				{
					if (pg_wc_islower((chr) i))
						addchr(cv, (chr) i);
				}
			}
			break;
		case CC_UPPER:
			cv = getcvec(v, UCHAR_MAX, 0);
			if (cv)
			{
				for (i = 0; i <= UCHAR_MAX; i++)
				{
					if (pg_wc_isupper((chr) i))
						addchr(cv, (chr) i);
				}
			}
			break;
		case CC_GRAPH:
			cv = getcvec(v, UCHAR_MAX, 0);
			if (cv)
			{
				for (i = 0; i <= UCHAR_MAX; i++)
				{
					if (pg_wc_isgraph((chr) i))
						addchr(cv, (chr) i);
				}
			}
			break;
	}
	if (cv == NULL)
		ERR(REG_ESPACE);
	return cv;
}
Example #6
0
/*
 * range - supply cvec for a range, including legality check
 */
static struct cvec *
range(struct vars * v,			/* context */
	  celt a,					/* range start */
	  celt b,					/* range end, might equal a */
	  int cases)				/* case-independent? */
{
	int			nchrs;
	struct cvec *cv;
	celt		c,
				cc;

	if (a != b && !before(a, b))
	{
		ERR(REG_ERANGE);
		return NULL;
	}

	if (!cases)
	{							/* easy version */
		cv = getcvec(v, 0, 1);
		NOERRN();
		addrange(cv, a, b);
		return cv;
	}

	/*
	* When case-independent, it's hard to decide when cvec ranges are usable,
	* so for now at least, we won't try.  We use a range for the originally
	* specified chrs and then add on any case-equivalents that are outside
	* that range as individual chrs.
	*
	* To ensure sane behavior if someone specifies a very large range, limit
	* the allocation size to 100000 chrs (arbitrary) and check for overrun
	* inside the loop below.
	*/

	nchrs = b - a + 1;
    
	if (nchrs <= 0 || nchrs > 100000)
		nchrs = 100000;

	cv = getcvec(v, nchrs, 1);
	NOERRN();
	addrange(cv, a, b);

	for (c = a; c <= b; c++)
	{
		cc = pg_wc_tolower((chr) c);
		if (cc !=c && (before(cc, a) || before(b, cc)))
		{
			if (cv->nchrs >= cv->chrspace)
			{
				ERR(REG_ETOOBIG);
				return NULL;
			}
			addchr(cv, cc);
		}
		cc = pg_wc_toupper((chr) c);
		if (cc != c && (before(cc, a) || before(b, cc)))
		{
			if (cv->nchrs >= cv->chrspace)
			{
				ERR(REG_ETOOBIG);
				return NULL;
			}
			addchr(cv, cc);
		}
	}

	return cv;
}
/*
 - cclass - supply cvec for a character class
 * Must include case counterparts on request.
 ^ static struct cvec *cclass(struct vars *, const chr *, const chr *, int);
 */
static struct cvec *
cclass(
    struct vars *v,		/* context */
    const chr *startp,		/* where the name starts */
    const chr *endp,		/* just past the end of the name */
    int cases)			/* case-independent? */
{
    size_t len;
    struct cvec *cv = NULL;
    Tcl_DString ds;
    const char *np;
    const char **namePtr;
    int i, index;

    /*
     * The following arrays define the valid character class names.
     */

    static const char *classNames[] = {
	"alnum", "alpha", "ascii", "blank", "cntrl", "digit", "graph",
	"lower", "print", "punct", "space", "upper", "xdigit", NULL
    };

    enum classes {
	CC_ALNUM, CC_ALPHA, CC_ASCII, CC_BLANK, CC_CNTRL, CC_DIGIT, CC_GRAPH,
	CC_LOWER, CC_PRINT, CC_PUNCT, CC_SPACE, CC_UPPER, CC_XDIGIT
    };


    /*
     * Extract the class name
     */

    len = endp - startp;
    Tcl_DStringInit(&ds);
    np = Tcl_UniCharToUtfDString(startp, (int)len, &ds);

    /*
     * Remap lower and upper to alpha if the match is case insensitive.
     */

    if (cases && len == 5 && (strncmp("lower", np, 5) == 0
	    || strncmp("upper", np, 5) == 0)) {
	np = "alpha";
    }

    /*
     * Map the name to the corresponding enumerated value.
     */

    index = -1;
    for (namePtr=classNames,i=0 ; *namePtr!=NULL ; namePtr++,i++) {
	if ((strlen(*namePtr) == len) && (strncmp(*namePtr, np, len) == 0)) {
	    index = i;
	    break;
	}
    }
    Tcl_DStringFree(&ds);
    if (index == -1) {
	ERR(REG_ECTYPE);
	return NULL;
    }

    /*
     * Now compute the character class contents.
     */

    switch((enum classes) index) {
    case CC_PRINT:
	cv = getcvec(v, NUM_PRINT_CHAR, NUM_PRINT_RANGE);
	if (cv) {
	    for (i=0 ; (size_t)i<NUM_PRINT_CHAR ; i++) {
		addchr(cv, printCharTable[i]);
	    }
	    for (i=0 ; (size_t)i<NUM_PRINT_RANGE ; i++) {
		addrange(cv, printRangeTable[i].start,
			printRangeTable[i].end);
	    }
	}
	break;
    case CC_ALNUM:
	cv = getcvec(v, NUM_ALPHA_CHAR, NUM_DIGIT_RANGE + NUM_ALPHA_RANGE);
	if (cv) {
	    for (i=0 ; (size_t)i<NUM_ALPHA_CHAR ; i++) {
		addchr(cv, alphaCharTable[i]);
	    }
	    for (i=0 ; (size_t)i<NUM_ALPHA_RANGE ; i++) {
		addrange(cv, alphaRangeTable[i].start,
			alphaRangeTable[i].end);
	    }
	    for (i=0 ; (size_t)i<NUM_DIGIT_RANGE ; i++) {
		addrange(cv, digitRangeTable[i].start,
			digitRangeTable[i].end);
	    }
	}
	break;
    case CC_ALPHA:
	cv = getcvec(v, NUM_ALPHA_CHAR, NUM_ALPHA_RANGE);
	if (cv) {
	    for (i=0 ; (size_t)i<NUM_ALPHA_RANGE ; i++) {
		addrange(cv, alphaRangeTable[i].start,
			alphaRangeTable[i].end);
	    }
	    for (i=0 ; (size_t)i<NUM_ALPHA_CHAR ; i++) {
		addchr(cv, alphaCharTable[i]);
	    }
	}
	break;
    case CC_ASCII:
	cv = getcvec(v, 0, 1);
	if (cv) {
	    addrange(cv, 0, 0x7f);
	}
	break;
    case CC_BLANK:
	cv = getcvec(v, 2, 0);
	addchr(cv, '\t');
	addchr(cv, ' ');
	break;
    case CC_CNTRL:
	cv = getcvec(v, 0, 2);
	addrange(cv, 0x0, 0x1f);
	addrange(cv, 0x7f, 0x9f);
	break;
    case CC_DIGIT:
	cv = getcvec(v, 0, NUM_DIGIT_RANGE);
	if (cv) {
	    for (i=0 ; (size_t)i<NUM_DIGIT_RANGE ; i++) {
		addrange(cv, digitRangeTable[i].start,
			digitRangeTable[i].end);
	    }
	}
	break;
    case CC_PUNCT:
	cv = getcvec(v, NUM_PUNCT_CHAR, NUM_PUNCT_RANGE);
	if (cv) {
	    for (i=0 ; (size_t)i<NUM_PUNCT_RANGE ; i++) {
		addrange(cv, punctRangeTable[i].start,
			punctRangeTable[i].end);
	    }
	    for (i=0 ; (size_t)i<NUM_PUNCT_CHAR ; i++) {
		addchr(cv, punctCharTable[i]);
	    }
	}
	break;
    case CC_XDIGIT:
	/*
	 * This is a 3 instead of (NUM_DIGIT_RANGE+2) because I've no idea how
	 * to define the digits 'a' through 'f' in non-western locales. The
	 * concept is quite possibly non portable, or only used in contextx
	 * where the characters used would be the western ones anyway!
	 * Whatever is actually the case, the number of ranges is fixed (until
	 * someone comes up with a better arrangement!)
	 */

	cv = getcvec(v, 0, 3);
	if (cv) {
	    addrange(cv, '0', '9');
	    addrange(cv, 'a', 'f');
	    addrange(cv, 'A', 'F');
	}
	break;
    case CC_SPACE:
	cv = getcvec(v, NUM_SPACE_CHAR, NUM_SPACE_RANGE);
	if (cv) {
	    for (i=0 ; (size_t)i<NUM_SPACE_RANGE ; i++) {
		addrange(cv, spaceRangeTable[i].start,
			spaceRangeTable[i].end);
	    }
	    for (i=0 ; (size_t)i<NUM_SPACE_CHAR ; i++) {
		addchr(cv, spaceCharTable[i]);
	    }
	}
	break;
    case CC_LOWER:
	cv  = getcvec(v, NUM_LOWER_CHAR, NUM_LOWER_RANGE);
	if (cv) {
	    for (i=0 ; (size_t)i<NUM_LOWER_RANGE ; i++) {
		addrange(cv, lowerRangeTable[i].start,
			lowerRangeTable[i].end);
	    }
	    for (i=0 ; (size_t)i<NUM_LOWER_CHAR ; i++) {
		addchr(cv, lowerCharTable[i]);
	    }
	}
	break;
    case CC_UPPER:
	cv  = getcvec(v, NUM_UPPER_CHAR, NUM_UPPER_RANGE);
	if (cv) {
	    for (i=0 ; (size_t)i<NUM_UPPER_RANGE ; i++) {
		addrange(cv, upperRangeTable[i].start,
			upperRangeTable[i].end);
	    }
	    for (i=0 ; (size_t)i<NUM_UPPER_CHAR ; i++) {
		addchr(cv, upperCharTable[i]);
	    }
	}
	break;
    case CC_GRAPH:
	cv  = getcvec(v, NUM_GRAPH_CHAR, NUM_GRAPH_RANGE);
	if (cv) {
	    for (i=0 ; (size_t)i<NUM_GRAPH_RANGE ; i++) {
		addrange(cv, graphRangeTable[i].start,
			graphRangeTable[i].end);
	    }
	    for (i=0 ; (size_t)i<NUM_GRAPH_CHAR ; i++) {
		addchr(cv, graphCharTable[i]);
	    }
	}
	break;
    }
    if (cv == NULL) {
	ERR(REG_ESPACE);
    }
    return cv;
}