/*
 * Decode the next rune of the pattern at g.source into g.yychar, advancing
 * g.source past everything consumed. Backslash escapes are translated here.
 *
 * Returns 1 when the rune came from an escape that is listed in ESCAPES, or
 * from a \x00 / \u0000 escape (in which case g.yychar is set to '0').
 * Returns 0 for every other rune, including translated control escapes.
 * NOTE(review): the 1 result presumably tells the caller to treat the rune
 * as a quoted/special escape rather than a literal -- confirm at call site.
 *
 * Calls die() on an unterminated or invalid escape; die() is presumably
 * non-returning (TODO confirm).
 */
static int nextrune(void)
{
	g.source += chartorune(&g.yychar, g.source);
	if (g.yychar != '\\')
		return 0;

	g.source += chartorune(&g.yychar, g.source);
	switch (g.yychar) {
	case 0:
		die("unterminated escape sequence");
	case 'f': g.yychar = '\f'; return 0;
	case 'n': g.yychar = '\n'; return 0;
	case 'r': g.yychar = '\r'; return 0;
	case 't': g.yychar = '\t'; return 0;
	case 'v': g.yychar = '\v'; return 0;
	case 'c':
		/* \cX needs one more byte; never step over the terminator. */
		if (g.source[0] == 0)
			die("unterminated escape sequence");
		g.yychar = (*g.source++) & 31;
		return 0;
	case 'x':
		/* \xHH needs two more bytes before they are consumed. */
		if (g.source[0] == 0 || g.source[1] == 0)
			die("unterminated escape sequence");
		g.yychar = hex(*g.source++) << 4;
		g.yychar += hex(*g.source++);
		if (g.yychar == 0) {
			/* A NUL rune cannot be told apart from end of input. */
			g.yychar = '0';
			return 1;
		}
		return 0;
	case 'u':
		/* \uHHHH needs four more bytes before they are consumed. */
		if (g.source[0] == 0 || g.source[1] == 0 ||
		    g.source[2] == 0 || g.source[3] == 0)
			die("unterminated escape sequence");
		g.yychar = hex(*g.source++) << 12;
		g.yychar += hex(*g.source++) << 8;
		g.yychar += hex(*g.source++) << 4;
		g.yychar += hex(*g.source++);
		if (g.yychar == 0) {
			g.yychar = '0';
			return 1;
		}
		return 0;
	}

	if (strchr(ESCAPES, g.yychar))
		return 1;
	/* Identity escapes are only allowed for non-identifier characters. */
	if (isalpharune(g.yychar) || g.yychar == '_')
		die("invalid escape character");
	return 0;
}
/*
 * Offer the word s as a tag for qid: if it looks like a plausible tag it is
 * handed to trieput(t, s, qid), otherwise it is silently ignored.
 *
 * A word is accepted when its byte length (strlen) is between 3 and 40
 * inclusive and every rune in it is either a letter (isalpharune) or an
 * ASCII digit.
 */
void
tag(Trie* t, char* s, uvlong qid)
{
	int l;
	char* q;
	Rune r;
	int nc;

	l = strlen(s);
	// don't use one and two char words as tags
	// don't use utterly long words, probably not tags.
	if(l < 3 || l > 40)
		return;

	// don't use as tags things other than alphanumeric.
	for(q = s; *q != 0; q += nc){
		nc = chartorune(&r, q);
		// Test digits by range: isdigit() is only defined for unsigned
		// char values, and a Rune can be far outside that range.
		if(!isalpharune(r) && !(r >= '0' && r <= '9'))
			return;
	}
	trieput(t, s, qid);
}
/*
 * Move *s forward to the first byte that cannot be part of an identifier.
 *
 * Accepted pieces are: '$', '_', ASCII alphanumerics, a \uXXXX escape with
 * four hex digits, and multi-byte UTF-8 sequences that decode to a letter.
 */
static void ident(const char **s) {
  const unsigned char *cur = (const unsigned char *) *s;

  for (;;) {
    Rune rune;
    int width;

    if (cur[0] == '\0') {
      break;
    }

    /* $, _, or any ASCII alphanumeric */
    if (cur[0] == '$' || cur[0] == '_' || isalnum(cur[0])) {
      cur += 1;
      continue;
    }

    /* Unicode escape, \uXXXX. Could be used like "var \u0078 = 1;" */
    if (cur[0] == '\\' && cur[1] == 'u' && isxdigit(cur[2]) &&
        isxdigit(cur[3]) && isxdigit(cur[4]) && isxdigit(cur[5])) {
      cur += 6;
      continue;
    }

    /* Multi-byte sequence that decodes to a Unicode letter */
    width = chartorune(&rune, (char *) cur);
    if (width > 1 && isalpharune(rune)) {
      cur += width;
      continue;
    }

    break;
  }

  *s = (const char *) cur;
}
/*
 * Return non-zero when code point c may appear inside an identifier:
 * an ASCII digit or letter, '$', '_', or any rune isalpharune() accepts.
 *
 * ASCII classes are tested by range rather than with <ctype.h>, because c is
 * a full code point: isdigit()/isalpha() are only defined for unsigned char
 * values (or EOF) and their result depends on the current locale.
 */
static int jsY_isidentifierpart(int c)
{
	return (c >= '0' && c <= '9') ||
		(c >= 'a' && c <= 'z') ||
		(c >= 'A' && c <= 'Z') ||
		c == '$' || c == '_' ||
		isalpharune(c);
}
/* * This function is the heart of the tokenizer. * Organized as a giant switch statement. * * Switch statement is by the first character of the input stream. If first * character begins with a letter, it could be either keyword or identifier. * get_tok() calls ident() which shifts `s` pointer to the end of the word. * Now, tokenizer knows that the word begins at `p` and ends at `s`. * It calls function kw() to scan over the keywords that start with `p[0]` * letter. Therefore, keyword tokens and keyword strings must be in the * same order, to let kw() function work properly. * If kw() finds a keyword match, it returns keyword token. * Otherwise, it returns TOK_IDENTIFIER. * NOTE(lsm): `prev_tok` is a previously parsed token. It is needed for * correctly parsing regex literals. */ V7_PRIVATE enum v7_tok get_tok(const char **s, double *n, enum v7_tok prev_tok) { const char *p = *s; switch (*p) { /* Letters */ case 'a': ident(s); return TOK_IDENTIFIER; case 'b': ident(s); return kw(p, *s - p, 1, TOK_BREAK); case 'c': ident(s); return kw(p, *s - p, 3, TOK_CASE); case 'd': ident(s); return kw(p, *s - p, 4, TOK_DEBUGGER); case 'e': ident(s); return kw(p, *s - p, 1, TOK_ELSE); case 'f': ident(s); return kw(p, *s - p, 4, TOK_FALSE); case 'g': case 'h': ident(s); return TOK_IDENTIFIER; case 'i': ident(s); return kw(p, *s - p, 3, TOK_IF); case 'j': case 'k': case 'l': case 'm': ident(s); return TOK_IDENTIFIER; case 'n': ident(s); return kw(p, *s - p, 2, TOK_NEW); case 'o': case 'p': case 'q': ident(s); return TOK_IDENTIFIER; case 'r': ident(s); return kw(p, *s - p, 1, TOK_RETURN); case 's': ident(s); return kw(p, *s - p, 1, TOK_SWITCH); case 't': ident(s); return kw(p, *s - p, 5, TOK_THIS); case 'u': ident(s); return TOK_IDENTIFIER; case 'v': ident(s); return kw(p, *s - p, 2, TOK_VAR); case 'w': ident(s); return kw(p, *s - p, 2, TOK_WHILE); case 'x': case 'y': case 'z': ident(s); return TOK_IDENTIFIER; case '_': case '$': case 'A': case 'B': case 'C': case 'D': case 'E': 
case 'F': case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': case 'Y': case 'Z': case '\\': /* Identifier may start with unicode escape sequence */ ident(s); return TOK_IDENTIFIER; /* Numbers */ case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': parse_number(p, s, n); return TOK_NUMBER; /* String literals */ case '\'': case '"': return parse_str_literal(s); /* Punctuators */ case '=': return punct2(s, '=', TOK_EQ, '=', TOK_EQ_EQ, TOK_ASSIGN); case '!': return punct2(s, '=', TOK_NE, '=', TOK_NE_NE, TOK_NOT); case '%': return punct1(s, '=', TOK_REM_ASSIGN, TOK_REM); case '*': return punct1(s, '=', TOK_MUL_ASSIGN, TOK_MUL); case '/': /* * TOK_DIV, TOK_DIV_ASSIGN, and TOK_REGEX_LITERAL start with `/` char. * Division can happen after an expression. * In expressions like this: * a /= b; c /= d; * things between slashes is NOT a regex literal. * The switch below catches all cases where division happens. */ switch (prev_tok) { case TOK_CLOSE_CURLY: case TOK_CLOSE_PAREN: case TOK_CLOSE_BRACKET: case TOK_IDENTIFIER: case TOK_NUMBER: return punct1(s, '=', TOK_DIV_ASSIGN, TOK_DIV); break; default: /* Not a division - this is a regex. 
Scan until closing slash */ for (p++; *p != '\0' && *p != '\n'; p++) { if (*p == '\\') { /* Skip escape sequence */ p++; } else if (*p == '/') { /* This is a closing slash */ p++; /* Skip regex flags */ while (*p == 'g' || *p == 'i' || *p == 'm') { p++; } *s = p; return TOK_REGEX_LITERAL; } } break; } return punct1(s, '=', TOK_DIV_ASSIGN, TOK_DIV); case '^': return punct1(s, '=', TOK_XOR_ASSIGN, TOK_XOR); case '+': return punct3(s, '+', TOK_PLUS_PLUS, '=', TOK_PLUS_ASSIGN, TOK_PLUS); case '-': return punct3(s, '-', TOK_MINUS_MINUS, '=', TOK_MINUS_ASSIGN, TOK_MINUS); case '&': return punct3(s, '&', TOK_LOGICAL_AND, '=', TOK_AND_ASSIGN, TOK_AND); case '|': return punct3(s, '|', TOK_LOGICAL_OR, '=', TOK_OR_ASSIGN, TOK_OR); case '<': if (s[0][1] == '=') { (*s) += 2; return TOK_LE; } return punct2(s, '<', TOK_LSHIFT, '=', TOK_LSHIFT_ASSIGN, TOK_LT); case '>': if (s[0][1] == '=') { (*s) += 2; return TOK_GE; } if (s[0][1] == '>' && s[0][2] == '>' && s[0][3] == '=') { (*s) += 4; return TOK_URSHIFT_ASSIGN; } if (s[0][1] == '>' && s[0][2] == '>') { (*s) += 3; return TOK_URSHIFT; } return punct2(s, '>', TOK_RSHIFT, '=', TOK_RSHIFT_ASSIGN, TOK_GT); case '{': (*s)++; return TOK_OPEN_CURLY; case '}': (*s)++; return TOK_CLOSE_CURLY; case '(': (*s)++; return TOK_OPEN_PAREN; case ')': (*s)++; return TOK_CLOSE_PAREN; case '[': (*s)++; return TOK_OPEN_BRACKET; case ']': (*s)++; return TOK_CLOSE_BRACKET; case '.': switch (*(*s + 1)) { /* Numbers */ case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': parse_number(p, s, n); return TOK_NUMBER; } (*s)++; return TOK_DOT; case ';': (*s)++; return TOK_SEMICOLON; case ':': (*s)++; return TOK_COLON; case '?': (*s)++; return TOK_QUESTION; case '~': (*s)++; return TOK_TILDA; case ',': (*s)++; return TOK_COMMA; default: { /* Handle unicode variables */ Rune r; int n; if ((n = chartorune(&r, *s)) > 1 && isalpharune(r)) { ident(s); return TOK_IDENTIFIER; } return TOK_END_OF_INPUT; } } }
/*
 * Return 1 when c is an ASCII letter or a rune that isalpharune() accepts,
 * and 0 otherwise.
 */
static int isunicodeletter(int c)
{
	if (c >= 'a' && c <= 'z')
		return 1;
	if (c >= 'A' && c <= 'Z')
		return 1;
	/* Normalise to 0/1 so the result matches a boolean expression. */
	return isalpharune(c) != 0;
}
int trans(int c, char *) { int f; if (isalpharune(c) && ft == ITAL && c != 'f' && c != 'j') { /* italic letter */ shim(pclass, nclass = ILET); cadd(c); return ITAL; } if (isalpharune(c) && ft != ITAL) { /* other letter */ shim(pclass, nclass = OLET); cadd(c); return ROM; } if (isdigitrune(c)) { shim(pclass, nclass = DIG); roman(c); return ROM; /* this is the right side font of this object */ } f = ROM; nclass = OTHER; switch (c) { case ':': case ';': case '!': case '%': case '?': shim(pclass, nclass); roman(c); return f; case '(': case '[': shim(pclass, nclass = LPAR); roman(c); return f; case ')': case ']': shim(pclass, nclass = RPAR); roman(c); return f; case ',': shim(pclass, nclass = OTHER); roman(c); return f; case '.': if (rf == ROM) roman(c); else cadd(c); return f; case '|': /* postscript needs help with default width! */ shim(pclass, nclass = VBAR); sadd("\\v'.17m'\\z|\\v'-.17m'\\|"); /* and height */ return f; case '=': shim(pclass, nclass = PLUS); sadd("\\(eq"); return f; case '+': shim(pclass, nclass = PLUS); sadd("\\(pl"); return f; case '>': case '<': /* >, >=, >>, <, <-, <=, << */ shim(pclass, nclass = PLUS); if (*psp == '=') { sadd(c == '<' ? 
"\\(<=" : "\\(>="); psp++; } else if (c == '<' && *psp == '-') { /* <- only */ sadd("\\(<-"); psp++; } else if (*psp == c) { /* << or >> */ cadd(c); cadd(c); psp++; } else { cadd(c); } return f; case '-': shim(pclass, nclass = PLUS); /* probably too big for ->'s */ if (*psp == '>') { sadd("\\(->"); psp++; } else { sadd("\\(mi"); } return f; case '/': shim(pclass, nclass = SLASH); cadd('/'); return f; case '~': case ' ': sadd("\\|\\|"); return f; case '^': sadd("\\|"); return f; case '\\': /* troff - pass only \(xx without comment */ shim(pclass, nclass); cadd('\\'); cadd(c = *psp++); if (c == '(' && *psp && *(psp+1)) { cadd(*psp++); cadd(*psp++); } else fprintf(stderr, "eqn warning: unquoted troff command \\%c, file %s:%d\n", c, curfile->fname, curfile->lineno); return f; case '\'': shim(pclass, nclass); sadd("\\(fm"); return f; case 'f': if (ft == ITAL) { shim(pclass, nclass = ILETF); cadd('f'); f = ITAL; } else cadd('f'); return f; case 'j': if (ft == ITAL) { shim(pclass, nclass = ILETJ); cadd('j'); f = ITAL; } else cadd('j'); return f; default: shim(pclass, nclass); cadd(c); return ft==ITAL ? ITAL : ROM; } }