/*
 * eclass - supply cvec for an equivalence class
 *
 * Must include case counterparts on request.
 *
 * When the REG_FAKE compile flag is set and the collating element is 'x',
 * a made-up class is returned for testing: {x, y}, plus {X, Y} when
 * "cases" is nonzero.  In every other situation the class holds only the
 * element itself; its case counterparts are obtained from allcases() when
 * "cases" is nonzero.
 */
static struct cvec *
eclass(struct vars * v,			/* context */
	   celt c,					/* Collating element representing the
								 * equivalence class. */
	   int cases)				/* all cases? */
{
	struct cvec *result;

	/* crude fake equivalence class for testing */
	if ((v->cflags & REG_FAKE) && c == 'x')
	{
		result = getcvec(v, 4, 0);
		addchr(result, (chr) 'x');
		addchr(result, (chr) 'y');
		if (cases)
		{
			addchr(result, (chr) 'X');
			addchr(result, (chr) 'Y');
		}
		return result;
	}

	/* no real equivalence classes: just the element, in one case ... */
	if (!cases)
	{
		result = getcvec(v, 1, 0);
		assert(result != NULL);
		addchr(result, (chr) c);
		return result;
	}

	/* ... or in all of its cases */
	return allcases(v, c);
}
/*
 - allcases - supply cvec for all case counterparts of a chr (including itself)
 * This is a shortcut, preferably an efficient one, for simple characters;
 * messy cases are done via range().
 *
 * The result holds the lower-case form, the upper-case form when it differs
 * from the lower-case one, and the title-case form when it differs from the
 * upper-case one.  The title-case form, if present, is added first.
 ^ static struct cvec *allcases(struct vars *, pchr);
 */
static struct cvec *
allcases(
    struct vars *v,		/* context */
    pchr pc)			/* character to get case equivs of */
{
    const chr base = (chr)pc;
    const chr lower = Tcl_UniCharToLower(base);
    const chr upper = Tcl_UniCharToUpper(base);
    const chr title = Tcl_UniCharToTitle(base);
    const int hasTitle = (title != upper);
    struct cvec *cv;

    /* Room for lower and upper, plus one more slot for a distinct title form. */
    cv = getcvec(v, hasTitle ? 3 : 2, 0);
    if (hasTitle) {
	addchr(cv, title);
    }
    addchr(cv, lower);
    if (lower != upper) {
	addchr(cv, upper);
    }
    return cv;
}
/*
 - range - supply cvec for a range, including legality check
 *
 * Reports REG_ERANGE and returns NULL when a != b and a does not come
 * before b.  Without case-independence the result is a single range a..b.
 * With case-independence every chr of the range is listed individually,
 * followed by whichever of its lower-, upper- and title-case forms differ
 * from it (the title-case form only when it also differs from the
 * upper-case form).
 ^ static struct cvec *range(struct vars *, celt, celt, int);
 */
static struct cvec *
range(
    struct vars *v,		/* context */
    celt a,			/* range start */
    celt b,			/* range end, might equal a */
    int cases)			/* case-independent? */
{
    int nchrs;
    struct cvec *cv;
    celt c, lc, uc, tc;

    if (a != b && !before(a, b)) {
	ERR(REG_ERANGE);
	return NULL;
    }

    if (!cases) {		/* easy version */
	cv = getcvec(v, 0, 1);
	NOERRN();
	addrange(cv, a, b);
	return cv;
    }

    /*
     * When case-independent, it's hard to decide when cvec ranges are usable,
     * so for now at least, we won't try.
     *
     * A single source chr can contribute itself plus up to three case
     * variants, so a fixed "two per chr plus a little" estimate can be too
     * small for the chrs added below.  Count exactly what the fill loop will
     * add and request that many slots.
     */

    nchrs = 0;
    for (c=a; c<=b; c++) {
	lc = Tcl_UniCharToLower((chr)c);
	uc = Tcl_UniCharToUpper((chr)c);
	tc = Tcl_UniCharToTitle((chr)c);
	nchrs++;			/* the chr itself */
	if (c != lc) {
	    nchrs++;
	}
	if (c != uc) {
	    nchrs++;
	}
	if (c != tc && tc != uc) {
	    nchrs++;
	}
    }

    cv = getcvec(v, nchrs, 0);
    NOERRN();

    /* Fill pass: must add under exactly the conditions counted above. */
    for (c=a; c<=b; c++) {
	addchr(cv, c);
	lc = Tcl_UniCharToLower((chr)c);
	uc = Tcl_UniCharToUpper((chr)c);
	tc = Tcl_UniCharToTitle((chr)c);
	if (c != lc) {
	    addchr(cv, lc);
	}
	if (c != uc) {
	    addchr(cv, uc);
	}
	if (c != tc && tc != uc) {
	    addchr(cv, tc);
	}
    }

    return cv;
}
/*
 * allcases - supply cvec for all case counterparts of a chr (including itself)
 *
 * This is a shortcut, preferably an efficient one, for simple characters;
 * messy cases are done via range().
 *
 * The result always holds the lower-case form of the chr, and additionally
 * the upper-case form when the two differ.
 */
static struct cvec *
allcases(struct vars * v,		/* context */
		 chr pc)				/* character to get case equivs of */
{
	chr			lower = pg_wc_tolower(pc);
	chr			upper = pg_wc_toupper(pc);
	struct cvec *result = getcvec(v, 2, 0);

	addchr(result, lower);
	if (upper != lower)
		addchr(result, upper);
	return result;
}
/* * cclass - supply cvec for a character class * * Must include case counterparts on request. */ static struct cvec * cclass(struct vars * v, /* context */ const chr *startp, /* where the name starts */ const chr *endp, /* just past the end of the name */ int cases) /* case-independent? */ { size_t len; struct cvec *cv = NULL; const char **namePtr; int i, index; /* * The following arrays define the valid character class names. */ static const char *classNames[] = { "alnum", "alpha", "ascii", "blank", "cntrl", "digit", "graph", "lower", "print", "punct", "space", "upper", "xdigit", NULL }; enum classes { CC_ALNUM, CC_ALPHA, CC_ASCII, CC_BLANK, CC_CNTRL, CC_DIGIT, CC_GRAPH, CC_LOWER, CC_PRINT, CC_PUNCT, CC_SPACE, CC_UPPER, CC_XDIGIT }; /* * Map the name to the corresponding enumerated value. */ len = endp - startp; index = -1; for (namePtr = classNames, i = 0; *namePtr != NULL; namePtr++, i++) { if (strlen(*namePtr) == len && pg_char_and_wchar_strncmp(*namePtr, startp, len) == 0) { index = i; break; } } if (index == -1) { ERR(REG_ECTYPE); return NULL; } /* * Remap lower and upper to alpha if the match is case insensitive. */ if (cases && ((enum classes) index == CC_LOWER || (enum classes) index == CC_UPPER)) index = (int) CC_ALPHA; /* * Now compute the character class contents. * * For the moment, assume that only char codes < 256 can be in these * classes. 
*/ switch ((enum classes) index) { case CC_PRINT: cv = getcvec(v, UCHAR_MAX, 0); if (cv) { for (i = 0; i <= UCHAR_MAX; i++) { if (pg_wc_isprint((chr) i)) addchr(cv, (chr) i); } } break; case CC_ALNUM: cv = getcvec(v, UCHAR_MAX, 0); if (cv) { for (i = 0; i <= UCHAR_MAX; i++) { if (pg_wc_isalnum((chr) i)) addchr(cv, (chr) i); } } break; case CC_ALPHA: cv = getcvec(v, UCHAR_MAX, 0); if (cv) { for (i = 0; i <= UCHAR_MAX; i++) { if (pg_wc_isalpha((chr) i)) addchr(cv, (chr) i); } } break; case CC_ASCII: cv = getcvec(v, 0, 1); if (cv) addrange(cv, 0, 0x7f); break; case CC_BLANK: cv = getcvec(v, 2, 0); addchr(cv, '\t'); addchr(cv, ' '); break; case CC_CNTRL: cv = getcvec(v, 0, 2); addrange(cv, 0x0, 0x1f); addrange(cv, 0x7f, 0x9f); break; case CC_DIGIT: cv = getcvec(v, 0, 1); if (cv) addrange(cv, (chr) '0', (chr) '9'); break; case CC_PUNCT: cv = getcvec(v, UCHAR_MAX, 0); if (cv) { for (i = 0; i <= UCHAR_MAX; i++) { if (pg_wc_ispunct((chr) i)) addchr(cv, (chr) i); } } break; case CC_XDIGIT: cv = getcvec(v, 0, 3); if (cv) { addrange(cv, '0', '9'); addrange(cv, 'a', 'f'); addrange(cv, 'A', 'F'); } break; case CC_SPACE: cv = getcvec(v, UCHAR_MAX, 0); if (cv) { for (i = 0; i <= UCHAR_MAX; i++) { if (pg_wc_isspace((chr) i)) addchr(cv, (chr) i); } } break; case CC_LOWER: cv = getcvec(v, UCHAR_MAX, 0); if (cv) { for (i = 0; i <= UCHAR_MAX; i++) { if (pg_wc_islower((chr) i)) addchr(cv, (chr) i); } } break; case CC_UPPER: cv = getcvec(v, UCHAR_MAX, 0); if (cv) { for (i = 0; i <= UCHAR_MAX; i++) { if (pg_wc_isupper((chr) i)) addchr(cv, (chr) i); } } break; case CC_GRAPH: cv = getcvec(v, UCHAR_MAX, 0); if (cv) { for (i = 0; i <= UCHAR_MAX; i++) { if (pg_wc_isgraph((chr) i)) addchr(cv, (chr) i); } } break; } if (cv == NULL) ERR(REG_ESPACE); return cv; }
/*
 * range - supply cvec for a range, including legality check
 *
 * Reports REG_ERANGE and returns NULL when a != b and a does not come
 * before b.  Reports REG_ETOOBIG and returns NULL when the case-independent
 * expansion would need more individual chrs than the cvec has room for.
 */
static struct cvec *
range(struct vars * v,			/* context */
	  celt a,					/* range start */
	  celt b,					/* range end, might equal a */
	  int cases)				/* case-independent? */
{
	int			want;
	int			pass;
	struct cvec *cv;
	celt		cur,
				alt;

	if (a != b && !before(a, b))
	{
		ERR(REG_ERANGE);
		return NULL;
	}

	if (!cases)
	{							/* easy version */
		cv = getcvec(v, 0, 1);
		NOERRN();
		addrange(cv, a, b);
		return cv;
	}

	/*
	 * When case-independent, it's hard to decide when cvec ranges are usable,
	 * so for now at least, we won't try.  We use a range for the originally
	 * specified chrs and then add on any case-equivalents that are outside
	 * that range as individual chrs.
	 *
	 * To ensure sane behavior if someone specifies a very large range, limit
	 * the allocation size to 100000 chrs (arbitrary) and check for overrun
	 * inside the loop below.
	 */
	want = b - a + 1;
	if (!(want > 0 && want <= 100000))
		want = 100000;

	cv = getcvec(v, want, 1);
	NOERRN();
	addrange(cv, a, b);

	for (cur = a; cur <= b; cur++)
	{
		/* pass 0 looks at the lower-case form, pass 1 at the upper-case form */
		for (pass = 0; pass < 2; pass++)
		{
			if (pass == 0)
				alt = pg_wc_tolower((chr) cur);
			else
				alt = pg_wc_toupper((chr) cur);

			if (alt == cur)
				continue;		/* no distinct counterpart */
			if (!before(alt, a) && !before(b, alt))
				continue;		/* already covered by the range itself */

			if (cv->nchrs >= cv->chrspace)
			{
				ERR(REG_ETOOBIG);
				return NULL;
			}
			addchr(cv, alt);
		}
	}

	return cv;
}
/* - cclass - supply cvec for a character class * Must include case counterparts on request. ^ static struct cvec *cclass(struct vars *, const chr *, const chr *, int); */ static struct cvec * cclass( struct vars *v, /* context */ const chr *startp, /* where the name starts */ const chr *endp, /* just past the end of the name */ int cases) /* case-independent? */ { size_t len; struct cvec *cv = NULL; Tcl_DString ds; const char *np; const char **namePtr; int i, index; /* * The following arrays define the valid character class names. */ static const char *classNames[] = { "alnum", "alpha", "ascii", "blank", "cntrl", "digit", "graph", "lower", "print", "punct", "space", "upper", "xdigit", NULL }; enum classes { CC_ALNUM, CC_ALPHA, CC_ASCII, CC_BLANK, CC_CNTRL, CC_DIGIT, CC_GRAPH, CC_LOWER, CC_PRINT, CC_PUNCT, CC_SPACE, CC_UPPER, CC_XDIGIT }; /* * Extract the class name */ len = endp - startp; Tcl_DStringInit(&ds); np = Tcl_UniCharToUtfDString(startp, (int)len, &ds); /* * Remap lower and upper to alpha if the match is case insensitive. */ if (cases && len == 5 && (strncmp("lower", np, 5) == 0 || strncmp("upper", np, 5) == 0)) { np = "alpha"; } /* * Map the name to the corresponding enumerated value. */ index = -1; for (namePtr=classNames,i=0 ; *namePtr!=NULL ; namePtr++,i++) { if ((strlen(*namePtr) == len) && (strncmp(*namePtr, np, len) == 0)) { index = i; break; } } Tcl_DStringFree(&ds); if (index == -1) { ERR(REG_ECTYPE); return NULL; } /* * Now compute the character class contents. 
*/ switch((enum classes) index) { case CC_PRINT: cv = getcvec(v, NUM_PRINT_CHAR, NUM_PRINT_RANGE); if (cv) { for (i=0 ; (size_t)i<NUM_PRINT_CHAR ; i++) { addchr(cv, printCharTable[i]); } for (i=0 ; (size_t)i<NUM_PRINT_RANGE ; i++) { addrange(cv, printRangeTable[i].start, printRangeTable[i].end); } } break; case CC_ALNUM: cv = getcvec(v, NUM_ALPHA_CHAR, NUM_DIGIT_RANGE + NUM_ALPHA_RANGE); if (cv) { for (i=0 ; (size_t)i<NUM_ALPHA_CHAR ; i++) { addchr(cv, alphaCharTable[i]); } for (i=0 ; (size_t)i<NUM_ALPHA_RANGE ; i++) { addrange(cv, alphaRangeTable[i].start, alphaRangeTable[i].end); } for (i=0 ; (size_t)i<NUM_DIGIT_RANGE ; i++) { addrange(cv, digitRangeTable[i].start, digitRangeTable[i].end); } } break; case CC_ALPHA: cv = getcvec(v, NUM_ALPHA_CHAR, NUM_ALPHA_RANGE); if (cv) { for (i=0 ; (size_t)i<NUM_ALPHA_RANGE ; i++) { addrange(cv, alphaRangeTable[i].start, alphaRangeTable[i].end); } for (i=0 ; (size_t)i<NUM_ALPHA_CHAR ; i++) { addchr(cv, alphaCharTable[i]); } } break; case CC_ASCII: cv = getcvec(v, 0, 1); if (cv) { addrange(cv, 0, 0x7f); } break; case CC_BLANK: cv = getcvec(v, 2, 0); addchr(cv, '\t'); addchr(cv, ' '); break; case CC_CNTRL: cv = getcvec(v, 0, 2); addrange(cv, 0x0, 0x1f); addrange(cv, 0x7f, 0x9f); break; case CC_DIGIT: cv = getcvec(v, 0, NUM_DIGIT_RANGE); if (cv) { for (i=0 ; (size_t)i<NUM_DIGIT_RANGE ; i++) { addrange(cv, digitRangeTable[i].start, digitRangeTable[i].end); } } break; case CC_PUNCT: cv = getcvec(v, NUM_PUNCT_CHAR, NUM_PUNCT_RANGE); if (cv) { for (i=0 ; (size_t)i<NUM_PUNCT_RANGE ; i++) { addrange(cv, punctRangeTable[i].start, punctRangeTable[i].end); } for (i=0 ; (size_t)i<NUM_PUNCT_CHAR ; i++) { addchr(cv, punctCharTable[i]); } } break; case CC_XDIGIT: /* * This is a 3 instead of (NUM_DIGIT_RANGE+2) because I've no idea how * to define the digits 'a' through 'f' in non-western locales. The * concept is quite possibly non portable, or only used in contextx * where the characters used would be the western ones anyway! 
* Whatever is actually the case, the number of ranges is fixed (until * someone comes up with a better arrangement!) */ cv = getcvec(v, 0, 3); if (cv) { addrange(cv, '0', '9'); addrange(cv, 'a', 'f'); addrange(cv, 'A', 'F'); } break; case CC_SPACE: cv = getcvec(v, NUM_SPACE_CHAR, NUM_SPACE_RANGE); if (cv) { for (i=0 ; (size_t)i<NUM_SPACE_RANGE ; i++) { addrange(cv, spaceRangeTable[i].start, spaceRangeTable[i].end); } for (i=0 ; (size_t)i<NUM_SPACE_CHAR ; i++) { addchr(cv, spaceCharTable[i]); } } break; case CC_LOWER: cv = getcvec(v, NUM_LOWER_CHAR, NUM_LOWER_RANGE); if (cv) { for (i=0 ; (size_t)i<NUM_LOWER_RANGE ; i++) { addrange(cv, lowerRangeTable[i].start, lowerRangeTable[i].end); } for (i=0 ; (size_t)i<NUM_LOWER_CHAR ; i++) { addchr(cv, lowerCharTable[i]); } } break; case CC_UPPER: cv = getcvec(v, NUM_UPPER_CHAR, NUM_UPPER_RANGE); if (cv) { for (i=0 ; (size_t)i<NUM_UPPER_RANGE ; i++) { addrange(cv, upperRangeTable[i].start, upperRangeTable[i].end); } for (i=0 ; (size_t)i<NUM_UPPER_CHAR ; i++) { addchr(cv, upperCharTable[i]); } } break; case CC_GRAPH: cv = getcvec(v, NUM_GRAPH_CHAR, NUM_GRAPH_RANGE); if (cv) { for (i=0 ; (size_t)i<NUM_GRAPH_RANGE ; i++) { addrange(cv, graphRangeTable[i].start, graphRangeTable[i].end); } for (i=0 ; (size_t)i<NUM_GRAPH_CHAR ; i++) { addchr(cv, graphCharTable[i]); } } break; } if (cv == NULL) { ERR(REG_ESPACE); } return cv; }
/*
 * TKGetNextToken - extract the next token from tk->stream.
 *
 * A token is a maximal run of characters none of which appears in
 * tk->separators.  Leading separators are skipped.
 *
 * Returns a newly malloc'd, NUL-terminated copy of the token, or NULL when
 * there is no further token (tk, tk->stream or tk->separators is NULL, the
 * stream is empty or holds only separators, or memory could not be
 * obtained).
 *
 * Effect on tk->stream:
 *   - token followed by a separator: tk->stream is replaced by a freshly
 *     malloc'd copy of the text after that one separator;
 *   - token running to the end of the stream, or a stream made only of
 *     separators: tk->stream is set to NULL to mark it exhausted;
 *   - empty stream, or allocation failure: tk->stream is left untouched.
 *
 * NOTE(review): the buffer tk->stream pointed to before the call is not
 * released here, because this function cannot tell who owns it -- confirm
 * against the code that creates and destroys the tokenizer.
 */
char *TKGetNextToken(TokenizerT *tk)
{
    const char *start;
    const char *rest;
    char *token;
    char *remainder;
    size_t tokenLen;
    size_t restLen;

    /* A NULL stream is the "exhausted" marker set by an earlier call. */
    if (tk == NULL || tk->stream == NULL || tk->separators == NULL) {
        return NULL;
    }

    /* Skip leading separators, then measure the run of token characters. */
    start = tk->stream + strspn(tk->stream, tk->separators);
    tokenLen = strcspn(start, tk->separators);

    if (tokenLen == 0) {
        /* Only separators were left: mark the stream exhausted. */
        if (tk->stream[0] != '\0') {
            tk->stream = NULL;
        }
        return NULL;
    }

    token = malloc(tokenLen + 1);
    if (token == NULL) {
        return NULL;
    }
    memcpy(token, start, tokenLen);
    token[tokenLen] = '\0';

    if (start[tokenLen] == '\0') {
        /* The token ran to the end of the stream. */
        tk->stream = NULL;
        return token;
    }

    /* Keep everything after the single separator that ended the token. */
    rest = start + tokenLen + 1;
    restLen = strlen(rest);
    remainder = malloc(restLen + 1);
    if (remainder == NULL) {
        free(token);
        return NULL;
    }
    memcpy(remainder, rest, restLen + 1);   /* includes the terminating NUL */
    tk->stream = remainder;

    return token;
}