/*
 * Hand the ranges for the 'w' class escape to addrange(): ASCII digits,
 * upper-case letters, the underscore, and lower-case letters, in that order.
 */
static void addranges_w(struct cstate *g)
{
	static const int spans[][2] = {
		{ '0', '9' },
		{ 'A', 'Z' },
		{ '_', '_' },
		{ 'a', 'z' },
	};
	const int count = (int)(sizeof spans / sizeof spans[0]);
	int i;

	for (i = 0; i < count; i++)
		addrange(g, spans[i][0], spans[i][1]);
}
/*
 * Hand the ranges for the 'w' class escape to addrange(): ASCII digits,
 * upper-case letters, the underscore, and lower-case letters, in that order.
 */
static void addranges_w(void)
{
	static const int spans[][2] = {
		{ '0', '9' },
		{ 'A', 'Z' },
		{ '_', '_' },
		{ 'a', 'z' },
	};
	const int count = (int)(sizeof spans / sizeof spans[0]);
	int i;

	for (i = 0; i < count; i++)
		addrange(spans[i][0], spans[i][1]);
}
/*
 * Hand the ranges for the 'W' class escape to addrange(): every value in
 * 0..0xFFFF that is not an ASCII digit, ASCII letter, or underscore.
 * Each entry is the gap between two neighbouring word-character ranges.
 */
static void addranges_W(void)
{
	static const int gaps[][2] = {
		{ 0,       '0' - 1 },	/* below the digits */
		{ '9' + 1, 'A' - 1 },	/* digits .. upper case */
		{ 'Z' + 1, '_' - 1 },	/* upper case .. underscore */
		{ '_' + 1, 'a' - 1 },	/* underscore .. lower case */
		{ 'z' + 1, 0xFFFF },	/* above the lower-case letters */
	};
	const int count = (int)(sizeof gaps / sizeof gaps[0]);
	int i;

	for (i = 0; i < count; i++)
		addrange(gaps[i][0], gaps[i][1]);
}
/*
 * Hand the ranges for the 's' class escape to addrange(): 0x9..0xD, 0x20,
 * 0xA0, 0x2028..0x2029, and 0xFEFF, in that order.
 */
static void addranges_s(struct cstate *g)
{
	static const int spans[][2] = {
		{ 0x9,    0xD    },
		{ 0x20,   0x20   },
		{ 0xA0,   0xA0   },
		{ 0x2028, 0x2029 },
		{ 0xFEFF, 0xFEFF },
	};
	const int count = (int)(sizeof spans / sizeof spans[0]);
	int i;

	for (i = 0; i < count; i++)
		addrange(g, spans[i][0], spans[i][1]);
}
/*
 * Hand the ranges for the 'W' class escape to addrange(): every value in
 * 0..0xFFFF that is not an ASCII digit, ASCII letter, or underscore.
 * Each entry is the gap between two neighbouring word-character ranges.
 */
static void addranges_W(struct cstate *g)
{
	static const int gaps[][2] = {
		{ 0,       '0' - 1 },	/* below the digits */
		{ '9' + 1, 'A' - 1 },	/* digits .. upper case */
		{ 'Z' + 1, '_' - 1 },	/* upper case .. underscore */
		{ '_' + 1, 'a' - 1 },	/* underscore .. lower case */
		{ 'z' + 1, 0xFFFF },	/* above the lower-case letters */
	};
	const int count = (int)(sizeof gaps / sizeof gaps[0]);
	int i;

	for (i = 0; i < count; i++)
		addrange(g, gaps[i][0], gaps[i][1]);
}
/*
 * Add the single code point v to the set ctx.
 *
 * ctx must be non-NULL and have a positive reference count (asserted).
 * The work is delegated to addrange() with from == to == v; its result
 * is returned unchanged.
 */
idn_result_t
idn_ucsset_add(idn_ucsset_t ctx, unsigned long v)
{
	idn_result_t r;

	assert(ctx != NULL && ctx->refcnt > 0);

	TRACE(("idn_ucsset_add(v=U+%lX)\n", v));

	r = addrange(ctx, v, v, "idn_ucsset_add");
	return (r);
}
/*
 * Add the code point range from..to to the set ctx.
 *
 * ctx must be non-NULL and have a positive reference count (asserted).
 * The work is delegated to addrange(), whose result is returned unchanged.
 */
idn_result_t
idn_ucsset_addrange(idn_ucsset_t ctx, unsigned long from, unsigned long to)
{
	idn_result_t r;

	assert(ctx != NULL && ctx->refcnt > 0);

	TRACE(("idn_ucsset_addrange(from=U+%lX, to=U+%lX)\n", from, to));

	r = addrange(ctx, from, to, "idn_ucsset_addrange");
	return (r);
}
/*
 * Hand the ranges for the 'S' class escape to addrange(): every value in
 * 0..0xFFFF outside the set {0x9..0xD, 0x20, 0xA0, 0x2028..0x2029, 0xFEFF}.
 *
 * 0x9 and 0xA are adjacent, so there is no gap between them.  An earlier
 * form of this list passed (0x9+1, 0xA-1), i.e. the inverted range
 * 0xA..0x9, which can never match and may be rejected by addrange() as an
 * invalid range; it is intentionally not emitted.
 */
static void addranges_S(void)
{
	addrange(0, 0x9-1);		/* below the first whitespace char */
	addrange(0xD+1, 0x20-1);	/* between 0xD and space */
	addrange(0x20+1, 0xA0-1);	/* between space and 0xA0 */
	addrange(0xA0+1, 0x2028-1);	/* between 0xA0 and 0x2028 */
	addrange(0x2029+1, 0xFEFF-1);	/* between 0x2029 and 0xFEFF */
	addrange(0xFEFF+1, 0xFFFF);	/* above 0xFEFF */
}
/*
 - range - supply cvec for a range, including legality check
 *
 * Reports REG_ERANGE and returns NULL when a != b and a does not sort
 * before b.  Without case folding the result is a single range a..b.  With
 * case folding the result lists, chr by chr, every code point in a..b
 * together with its lower-, upper- and title-case forms where they differ.
 ^ static struct cvec *range(struct vars *, celt, celt, int);
 */
static struct cvec *
range(
    struct vars *v,		/* context */
    celt a,			/* range start */
    celt b,			/* range end, might equal a */
    int cases)			/* case-independent? */
{
    int nchrs;
    struct cvec *cv;
    celt c, lc, uc, tc;

    if (a != b && !before(a, b)) {
	ERR(REG_ERANGE);
	return NULL;
    }

    if (!cases) {		/* easy version */
	cv = getcvec(v, 0, 1);
	NOERRN();
	addrange(cv, a, b);
	return cv;
    }

    /*
     * When case-independent, it's hard to decide when cvec ranges are usable,
     * so for now at least, we won't try.
     *
     * The loop below can add up to four chrs per code point: the code point
     * itself plus its lower-, upper- and title-case forms when they differ.
     * Titlecase digraphs (e.g. U+01C4..U+01CC) really do contribute three
     * entries each, so sizing the vector at two per code point plus a small
     * constant can be overrun.  Allocate for the worst case instead.
     */

    nchrs = (b - a + 1)*4;

    cv = getcvec(v, nchrs, 0);
    NOERRN();

    for (c=a; c<=b; c++) {
	addchr(cv, c);
	lc = Tcl_UniCharToLower((chr)c);
	uc = Tcl_UniCharToUpper((chr)c);
	tc = Tcl_UniCharToTitle((chr)c);
	if (c != lc) {
	    addchr(cv, lc);
	}
	if (c != uc) {
	    addchr(cv, uc);
	}
	if (c != tc && tc != uc) {
	    addchr(cv, tc);
	}
    }

    return cv;
}
/*
 * Hand the ranges for the 'S' class escape to addrange(): every value in
 * 0..0xFFFF outside the set {0x9..0xD, 0x20, 0xA0, 0x2028..0x2029, 0xFEFF}.
 * Each entry is the gap between two neighbouring whitespace ranges.
 */
static void addranges_S(struct cstate *g)
{
	static const int gaps[][2] = {
		{ 0,          0x9 - 1    },
		{ 0xD + 1,    0x20 - 1   },
		{ 0x20 + 1,   0xA0 - 1   },
		{ 0xA0 + 1,   0x2028 - 1 },
		{ 0x2029 + 1, 0xFEFF - 1 },
		{ 0xFEFF + 1, 0xFFFF     },
	};
	const int count = (int)(sizeof gaps / sizeof gaps[0]);
	int i;

	for (i = 0; i < count; i++)
		addrange(g, gaps[i][0], gaps[i][1]);
}
/*
 * Hand the ranges for the 's' class escape to addrange(): 0x9, 0xA..0xD,
 * 0x20, 0xA0, 0x2028..0x2029, and 0xFEFF, in that order.
 */
static void addranges_s(void)
{
	static const int spans[][2] = {
		{ 0x9,    0x9    },
		{ 0xA,    0xD    },
		{ 0x20,   0x20   },
		{ 0xA0,   0xA0   },
		{ 0x2028, 0x2029 },
		{ 0xFEFF, 0xFEFF },
	};
	const int count = (int)(sizeof spans / sizeof spans[0]);
	int i;

	for (i = 0; i < count; i++)
		addrange(spans[i][0], spans[i][1]);
}
static int lexclass(struct cstate *g) { int type = L_CCLASS; int quoted, havesave, havedash; Rune save; newcclass(g); quoted = nextrune(g); if (!quoted && g->yychar == '^') { type = L_NCCLASS; quoted = nextrune(g); } havesave = havedash = 0; for (;;) { if (g->yychar == 0) die(g, "unterminated character class"); if (!quoted && g->yychar == ']') break; if (!quoted && g->yychar == '-') { if (havesave) { if (havedash) { addrange(g, save, '-'); havesave = havedash = 0; } else { havedash = 1; } } else { save = '-'; havesave = 1; } } else if (quoted && strchr("DSWdsw", g->yychar)) { if (havesave) { addrange(g, save, save); if (havedash) addrange(g, '-', '-'); } switch (g->yychar) { case 'd': addranges_d(g); break; case 's': addranges_s(g); break; case 'w': addranges_w(g); break; case 'D': addranges_D(g); break; case 'S': addranges_S(g); break; case 'W': addranges_W(g); break; } havesave = havedash = 0; } else { if (quoted) { if (g->yychar == 'b') g->yychar = '\b'; else if (g->yychar == '0') g->yychar = 0; /* else identity escape */ } if (havesave) { if (havedash) { addrange(g, save, g->yychar); havesave = havedash = 0; } else { addrange(g, save, save); save = g->yychar; } } else { save = g->yychar; havesave = 1; } } quoted = nextrune(g); } if (havesave) { addrange(g, save, save); if (havedash) addrange(g, '-', '-'); } return type; }
/*
 * Hand the single range for the 'd' class escape (ASCII digits) to
 * addrange().
 */
static void addranges_d(void)
{
	const int first_digit = '0';
	const int last_digit = '9';

	addrange(first_digit, last_digit);
}
/*
 * Hand the ranges for the 'D' class escape to addrange(): everything in
 * 0..0xFFFF below '0' and everything above '9'.
 */
static void addranges_D(void)
{
	static const int gaps[][2] = {
		{ 0,       '0' - 1 },
		{ '9' + 1, 0xFFFF  },
	};
	const int count = (int)(sizeof gaps / sizeof gaps[0]);
	int i;

	for (i = 0; i < count; i++)
		addrange(gaps[i][0], gaps[i][1]);
}
/*
 * Hand the ranges for the 'D' class escape to addrange(): everything in
 * 0..0xFFFF below '0' and everything above '9'.
 */
static void addranges_D(struct cstate *g)
{
	static const int gaps[][2] = {
		{ 0,       '0' - 1 },
		{ '9' + 1, 0xFFFF  },
	};
	const int count = (int)(sizeof gaps / sizeof gaps[0]);
	int i;

	for (i = 0; i < count; i++)
		addrange(g, gaps[i][0], gaps[i][1]);
}
/*
 * Hand the single range for the 'd' class escape (ASCII digits) to
 * addrange().
 */
static void addranges_d(struct cstate *g)
{
	const int first_digit = '0';
	const int last_digit = '9';

	addrange(g, first_digit, last_digit);
}
/*
 * range - build the cvec describing the bracket range a..b
 *
 * Reports REG_ERANGE (returning NULL) when a != b and a does not sort
 * before b.  Without case folding the result is just the range itself.
 * With case folding the result is the range plus, as individual chrs, the
 * lower- and upper-case counterparts that fall outside a..b; REG_ETOOBIG is
 * reported if those extras do not fit in the space that was allocated.
 */
static struct cvec *
range(struct vars * v,			/* context */
	  celt a,					/* range start */
	  celt b,					/* range end, might equal a */
	  int cases)				/* case-independent? */
{
	struct cvec *result;
	celt		cur;
	celt		folded;
	int			want;

	if (a != b && !before(a, b))
	{
		ERR(REG_ERANGE);
		return NULL;
	}

	if (!cases)
	{
		/* case-sensitive: one range entry is all that is needed */
		result = getcvec(v, 0, 1);
		NOERRN();
		addrange(result, a, b);
		return result;
	}

	/*
	 * Case-independent: deciding when cvec ranges could express the case
	 * variants is hard, so keep the requested range as a range and list
	 * out-of-range case equivalents one chr at a time.
	 *
	 * A very large range must not translate into a huge allocation, so the
	 * chr space is capped at 100000 (arbitrary) and the loop checks for
	 * overrun before each insertion.
	 */
	want = b - a + 1;
	if (want <= 0 || want > 100000)
		want = 100000;

	result = getcvec(v, want, 1);
	NOERRN();
	addrange(result, a, b);

	for (cur = a; cur <= b; cur++)
	{
		/* lower-case counterpart, if distinct and outside a..b */
		folded = pg_wc_tolower((chr) cur);
		if (folded != cur)
		{
			if (before(folded, a) || before(b, folded))
			{
				if (result->nchrs >= result->chrspace)
				{
					ERR(REG_ETOOBIG);
					return NULL;
				}
				addchr(result, folded);
			}
		}

		/* upper-case counterpart, same treatment */
		folded = pg_wc_toupper((chr) cur);
		if (folded != cur)
		{
			if (before(folded, a) || before(b, folded))
			{
				if (result->nchrs >= result->chrspace)
				{
					ERR(REG_ETOOBIG);
					return NULL;
				}
				addchr(result, folded);
			}
		}
	}

	return result;
}
/* * cclass - supply cvec for a character class * * Must include case counterparts on request. */ static struct cvec * cclass(struct vars * v, /* context */ const chr *startp, /* where the name starts */ const chr *endp, /* just past the end of the name */ int cases) /* case-independent? */ { size_t len; struct cvec *cv = NULL; const char **namePtr; int i, index; /* * The following arrays define the valid character class names. */ static const char *classNames[] = { "alnum", "alpha", "ascii", "blank", "cntrl", "digit", "graph", "lower", "print", "punct", "space", "upper", "xdigit", NULL }; enum classes { CC_ALNUM, CC_ALPHA, CC_ASCII, CC_BLANK, CC_CNTRL, CC_DIGIT, CC_GRAPH, CC_LOWER, CC_PRINT, CC_PUNCT, CC_SPACE, CC_UPPER, CC_XDIGIT }; /* * Map the name to the corresponding enumerated value. */ len = endp - startp; index = -1; for (namePtr = classNames, i = 0; *namePtr != NULL; namePtr++, i++) { if (strlen(*namePtr) == len && pg_char_and_wchar_strncmp(*namePtr, startp, len) == 0) { index = i; break; } } if (index == -1) { ERR(REG_ECTYPE); return NULL; } /* * Remap lower and upper to alpha if the match is case insensitive. */ if (cases && ((enum classes) index == CC_LOWER || (enum classes) index == CC_UPPER)) index = (int) CC_ALPHA; /* * Now compute the character class contents. * * For the moment, assume that only char codes < 256 can be in these * classes. 
*/ switch ((enum classes) index) { case CC_PRINT: cv = getcvec(v, UCHAR_MAX, 0); if (cv) { for (i = 0; i <= UCHAR_MAX; i++) { if (pg_wc_isprint((chr) i)) addchr(cv, (chr) i); } } break; case CC_ALNUM: cv = getcvec(v, UCHAR_MAX, 0); if (cv) { for (i = 0; i <= UCHAR_MAX; i++) { if (pg_wc_isalnum((chr) i)) addchr(cv, (chr) i); } } break; case CC_ALPHA: cv = getcvec(v, UCHAR_MAX, 0); if (cv) { for (i = 0; i <= UCHAR_MAX; i++) { if (pg_wc_isalpha((chr) i)) addchr(cv, (chr) i); } } break; case CC_ASCII: cv = getcvec(v, 0, 1); if (cv) addrange(cv, 0, 0x7f); break; case CC_BLANK: cv = getcvec(v, 2, 0); addchr(cv, '\t'); addchr(cv, ' '); break; case CC_CNTRL: cv = getcvec(v, 0, 2); addrange(cv, 0x0, 0x1f); addrange(cv, 0x7f, 0x9f); break; case CC_DIGIT: cv = getcvec(v, 0, 1); if (cv) addrange(cv, (chr) '0', (chr) '9'); break; case CC_PUNCT: cv = getcvec(v, UCHAR_MAX, 0); if (cv) { for (i = 0; i <= UCHAR_MAX; i++) { if (pg_wc_ispunct((chr) i)) addchr(cv, (chr) i); } } break; case CC_XDIGIT: cv = getcvec(v, 0, 3); if (cv) { addrange(cv, '0', '9'); addrange(cv, 'a', 'f'); addrange(cv, 'A', 'F'); } break; case CC_SPACE: cv = getcvec(v, UCHAR_MAX, 0); if (cv) { for (i = 0; i <= UCHAR_MAX; i++) { if (pg_wc_isspace((chr) i)) addchr(cv, (chr) i); } } break; case CC_LOWER: cv = getcvec(v, UCHAR_MAX, 0); if (cv) { for (i = 0; i <= UCHAR_MAX; i++) { if (pg_wc_islower((chr) i)) addchr(cv, (chr) i); } } break; case CC_UPPER: cv = getcvec(v, UCHAR_MAX, 0); if (cv) { for (i = 0; i <= UCHAR_MAX; i++) { if (pg_wc_isupper((chr) i)) addchr(cv, (chr) i); } } break; case CC_GRAPH: cv = getcvec(v, UCHAR_MAX, 0); if (cv) { for (i = 0; i <= UCHAR_MAX; i++) { if (pg_wc_isgraph((chr) i)) addchr(cv, (chr) i); } } break; } if (cv == NULL) ERR(REG_ESPACE); return cv; }
/* - cclass - supply cvec for a character class * Must include case counterparts on request. ^ static struct cvec *cclass(struct vars *, const chr *, const chr *, int); */ static struct cvec * cclass( struct vars *v, /* context */ const chr *startp, /* where the name starts */ const chr *endp, /* just past the end of the name */ int cases) /* case-independent? */ { size_t len; struct cvec *cv = NULL; Tcl_DString ds; const char *np; const char **namePtr; int i, index; /* * The following arrays define the valid character class names. */ static const char *classNames[] = { "alnum", "alpha", "ascii", "blank", "cntrl", "digit", "graph", "lower", "print", "punct", "space", "upper", "xdigit", NULL }; enum classes { CC_ALNUM, CC_ALPHA, CC_ASCII, CC_BLANK, CC_CNTRL, CC_DIGIT, CC_GRAPH, CC_LOWER, CC_PRINT, CC_PUNCT, CC_SPACE, CC_UPPER, CC_XDIGIT }; /* * Extract the class name */ len = endp - startp; Tcl_DStringInit(&ds); np = Tcl_UniCharToUtfDString(startp, (int)len, &ds); /* * Remap lower and upper to alpha if the match is case insensitive. */ if (cases && len == 5 && (strncmp("lower", np, 5) == 0 || strncmp("upper", np, 5) == 0)) { np = "alpha"; } /* * Map the name to the corresponding enumerated value. */ index = -1; for (namePtr=classNames,i=0 ; *namePtr!=NULL ; namePtr++,i++) { if ((strlen(*namePtr) == len) && (strncmp(*namePtr, np, len) == 0)) { index = i; break; } } Tcl_DStringFree(&ds); if (index == -1) { ERR(REG_ECTYPE); return NULL; } /* * Now compute the character class contents. 
*/ switch((enum classes) index) { case CC_PRINT: cv = getcvec(v, NUM_PRINT_CHAR, NUM_PRINT_RANGE); if (cv) { for (i=0 ; (size_t)i<NUM_PRINT_CHAR ; i++) { addchr(cv, printCharTable[i]); } for (i=0 ; (size_t)i<NUM_PRINT_RANGE ; i++) { addrange(cv, printRangeTable[i].start, printRangeTable[i].end); } } break; case CC_ALNUM: cv = getcvec(v, NUM_ALPHA_CHAR, NUM_DIGIT_RANGE + NUM_ALPHA_RANGE); if (cv) { for (i=0 ; (size_t)i<NUM_ALPHA_CHAR ; i++) { addchr(cv, alphaCharTable[i]); } for (i=0 ; (size_t)i<NUM_ALPHA_RANGE ; i++) { addrange(cv, alphaRangeTable[i].start, alphaRangeTable[i].end); } for (i=0 ; (size_t)i<NUM_DIGIT_RANGE ; i++) { addrange(cv, digitRangeTable[i].start, digitRangeTable[i].end); } } break; case CC_ALPHA: cv = getcvec(v, NUM_ALPHA_CHAR, NUM_ALPHA_RANGE); if (cv) { for (i=0 ; (size_t)i<NUM_ALPHA_RANGE ; i++) { addrange(cv, alphaRangeTable[i].start, alphaRangeTable[i].end); } for (i=0 ; (size_t)i<NUM_ALPHA_CHAR ; i++) { addchr(cv, alphaCharTable[i]); } } break; case CC_ASCII: cv = getcvec(v, 0, 1); if (cv) { addrange(cv, 0, 0x7f); } break; case CC_BLANK: cv = getcvec(v, 2, 0); addchr(cv, '\t'); addchr(cv, ' '); break; case CC_CNTRL: cv = getcvec(v, 0, 2); addrange(cv, 0x0, 0x1f); addrange(cv, 0x7f, 0x9f); break; case CC_DIGIT: cv = getcvec(v, 0, NUM_DIGIT_RANGE); if (cv) { for (i=0 ; (size_t)i<NUM_DIGIT_RANGE ; i++) { addrange(cv, digitRangeTable[i].start, digitRangeTable[i].end); } } break; case CC_PUNCT: cv = getcvec(v, NUM_PUNCT_CHAR, NUM_PUNCT_RANGE); if (cv) { for (i=0 ; (size_t)i<NUM_PUNCT_RANGE ; i++) { addrange(cv, punctRangeTable[i].start, punctRangeTable[i].end); } for (i=0 ; (size_t)i<NUM_PUNCT_CHAR ; i++) { addchr(cv, punctCharTable[i]); } } break; case CC_XDIGIT: /* * This is a 3 instead of (NUM_DIGIT_RANGE+2) because I've no idea how * to define the digits 'a' through 'f' in non-western locales. The * concept is quite possibly non portable, or only used in contextx * where the characters used would be the western ones anyway! 
* Whatever is actually the case, the number of ranges is fixed (until * someone comes up with a better arrangement!) */ cv = getcvec(v, 0, 3); if (cv) { addrange(cv, '0', '9'); addrange(cv, 'a', 'f'); addrange(cv, 'A', 'F'); } break; case CC_SPACE: cv = getcvec(v, NUM_SPACE_CHAR, NUM_SPACE_RANGE); if (cv) { for (i=0 ; (size_t)i<NUM_SPACE_RANGE ; i++) { addrange(cv, spaceRangeTable[i].start, spaceRangeTable[i].end); } for (i=0 ; (size_t)i<NUM_SPACE_CHAR ; i++) { addchr(cv, spaceCharTable[i]); } } break; case CC_LOWER: cv = getcvec(v, NUM_LOWER_CHAR, NUM_LOWER_RANGE); if (cv) { for (i=0 ; (size_t)i<NUM_LOWER_RANGE ; i++) { addrange(cv, lowerRangeTable[i].start, lowerRangeTable[i].end); } for (i=0 ; (size_t)i<NUM_LOWER_CHAR ; i++) { addchr(cv, lowerCharTable[i]); } } break; case CC_UPPER: cv = getcvec(v, NUM_UPPER_CHAR, NUM_UPPER_RANGE); if (cv) { for (i=0 ; (size_t)i<NUM_UPPER_RANGE ; i++) { addrange(cv, upperRangeTable[i].start, upperRangeTable[i].end); } for (i=0 ; (size_t)i<NUM_UPPER_CHAR ; i++) { addchr(cv, upperCharTable[i]); } } break; case CC_GRAPH: cv = getcvec(v, NUM_GRAPH_CHAR, NUM_GRAPH_RANGE); if (cv) { for (i=0 ; (size_t)i<NUM_GRAPH_RANGE ; i++) { addrange(cv, graphRangeTable[i].start, graphRangeTable[i].end); } for (i=0 ; (size_t)i<NUM_GRAPH_CHAR ; i++) { addchr(cv, graphCharTable[i]); } } break; } if (cv == NULL) { ERR(REG_ESPACE); } return cv; }