static int tok_get(register struct tok_state *tok, char **p_start, char **p_end) { register int c; int blankline; *p_start = *p_end = NULL; nextline: tok->start = NULL; blankline = 0; /* Get indentation level */ if (tok->atbol) { register int col = 0; register int altcol = 0; tok->atbol = 0; for (;;) { c = tok_nextc(tok); if (c == ' ') col++, altcol++; else if (c == '\t') { col = (col/tok->tabsize + 1) * tok->tabsize; altcol = (altcol/tok->alttabsize + 1) * tok->alttabsize; } else if (c == '\014') /* Control-L (formfeed) */ col = altcol = 0; /* For Emacs users */ else break; } tok_backup(tok, c); if (c == '#' || c == '\n') { /* Lines with only whitespace and/or comments shouldn't affect the indentation and are not passed to the parser as NEWLINE tokens, except *totally* empty lines in interactive mode, which signal the end of a command group. */ if (col == 0 && c == '\n' && tok->prompt != NULL) blankline = 0; /* Let it through */ else blankline = 1; /* Ignore completely */ /* We can't jump back right here since we still may need to skip to the end of a comment */ } if (!blankline && tok->level == 0) { if (col == tok->indstack[tok->indent]) { /* No change */ if (altcol != tok->altindstack[tok->indent]) { if (indenterror(tok)) return ERRORTOKEN; } } else if (col > tok->indstack[tok->indent]) { /* Indent -- always one */ if (tok->indent+1 >= MAXINDENT) { tok->done = E_TOODEEP; tok->cur = tok->inp; return ERRORTOKEN; } if (altcol <= tok->altindstack[tok->indent]) { if (indenterror(tok)) return ERRORTOKEN; } tok->pendin++; tok->indstack[++tok->indent] = col; tok->altindstack[tok->indent] = altcol; } else /* col < tok->indstack[tok->indent] */ { /* Dedent -- any number, must be consistent */ while (tok->indent > 0 && col < tok->indstack[tok->indent]) { tok->pendin--; tok->indent--; } if (col != tok->indstack[tok->indent]) { tok->done = E_DEDENT; tok->cur = tok->inp; return ERRORTOKEN; } if (altcol != tok->altindstack[tok->indent]) { if (indenterror(tok)) return ERRORTOKEN; } } } } tok->start = tok->cur; /* Return pending indents/dedents */ if (tok->pendin != 0) { if (tok->pendin < 0) { tok->pendin++; return DEDENT; } else { tok->pendin--; return INDENT; } } again: tok->start = NULL; /* Skip spaces */ do { c = tok_nextc(tok); } while (c == ' ' || c == '\t' || c == '\014'); /* Set start of current token */ tok->start = tok->cur - 1; /* Skip comment, while looking for tab-setting magic */ if (c == '#') { static char *tabforms[] = { "tab-width:", /* Emacs */ ":tabstop=", /* vim, full form */ ":ts=", /* vim, abbreviated form */ "set tabsize=", /* will vi never die? */ /* more templates can be added here to support other editors */ }; char cbuf[80]; char *tp, **cp; tp = cbuf; do { *tp++ = c = tok_nextc(tok); } while (c != EOF && c != '\n' && (size_t)(tp - cbuf + 1) < sizeof(cbuf)); *tp = '\0'; for (cp = tabforms; cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]); cp++) { if ((tp = strstr(cbuf, *cp))) { int newsize = atoi(tp + strlen(*cp)); if (newsize >= 1 && newsize <= 40) { tok->tabsize = newsize; if (Py_VerboseFlag) PySys_WriteStderr( "Tab size set to %d\n", newsize); } } } while (c != EOF && c != '\n') c = tok_nextc(tok); } /* Check for EOF and errors now */ if (c == EOF) { return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN; } /* Identifier (most frequent token!) */ if (isalpha(c) || c == '_') { /* Process r"", u"" and ur"" */ switch (c) { case 'b': case 'B': c = tok_nextc(tok); if (c == 'r' || c == 'R') c = tok_nextc(tok); if (c == '"' || c == '\'') goto letter_quote; break; case 'r': case 'R': c = tok_nextc(tok); if (c == '"' || c == '\'') goto letter_quote; break; case 'u': case 'U': c = tok_nextc(tok); if (c == 'r' || c == 'R') c = tok_nextc(tok); if (c == '"' || c == '\'') goto letter_quote; break; } while (isalnum(c) || c == '_') { c = tok_nextc(tok); } tok_backup(tok, c); *p_start = tok->start; *p_end = tok->cur; return NAME; } /* Newline */ if (c == '\n') { tok->atbol = 1; if (blankline || tok->level > 0) goto nextline; *p_start = tok->start; *p_end = tok->cur - 1; /* Leave '\n' out of the string */ tok->cont_line = 0; return NEWLINE; } /* Period or number starting with period? */ if (c == '.') { c = tok_nextc(tok); if (isdigit(c)) { goto fraction; } else { tok_backup(tok, c); *p_start = tok->start; *p_end = tok->cur; return DOT; } } /* Number */ if (isdigit(c)) { if (c == '0') { /* Hex, octal or binary -- maybe. */ c = tok_nextc(tok); if (c == '.') goto fraction; #ifndef WITHOUT_COMPLEX if (c == 'j' || c == 'J') goto imaginary; #endif if (c == 'x' || c == 'X') { /* Hex */ c = tok_nextc(tok); if (!isxdigit(c)) { tok->done = E_TOKEN; tok_backup(tok, c); return ERRORTOKEN; } do { c = tok_nextc(tok); } while (isxdigit(c)); } else if (c == 'o' || c == 'O') { /* Octal */ c = tok_nextc(tok); if (c < '0' || c >= '8') { tok->done = E_TOKEN; tok_backup(tok, c); return ERRORTOKEN; } do { c = tok_nextc(tok); } while ('0' <= c && c < '8'); } else if (c == 'b' || c == 'B') { /* Binary */ c = tok_nextc(tok); if (c != '0' && c != '1') { tok->done = E_TOKEN; tok_backup(tok, c); return ERRORTOKEN; } do { c = tok_nextc(tok); } while (c == '0' || c == '1'); } else { int found_decimal = 0; /* Octal; c is first char of it */ /* There's no 'isoctdigit' macro, sigh */ while ('0' <= c && c < '8') { c = tok_nextc(tok); } if (isdigit(c)) { found_decimal = 1; do { c = tok_nextc(tok); } while (isdigit(c)); } if (c == '.') goto fraction; else if (c == 'e' || c == 'E') goto exponent; #ifndef WITHOUT_COMPLEX else if (c == 'j' || c == 'J') goto imaginary; #endif else if (found_decimal) { tok->done = E_TOKEN; tok_backup(tok, c); return ERRORTOKEN; } } if (c == 'l' || c == 'L') c = tok_nextc(tok); } else { /* Decimal */ do { c = tok_nextc(tok); } while (isdigit(c)); if (c == 'l' || c == 'L') c = tok_nextc(tok); else { /* Accept floating point numbers. */ if (c == '.') { fraction: /* Fraction */ do { c = tok_nextc(tok); } while (isdigit(c)); } if (c == 'e' || c == 'E') { exponent: /* Exponent part */ c = tok_nextc(tok); if (c == '+' || c == '-') c = tok_nextc(tok); if (!isdigit(c)) { tok->done = E_TOKEN; tok_backup(tok, c); return ERRORTOKEN; } do { c = tok_nextc(tok); } while (isdigit(c)); } #ifndef WITHOUT_COMPLEX if (c == 'j' || c == 'J') /* Imaginary part */ imaginary: c = tok_nextc(tok); #endif } } tok_backup(tok, c); *p_start = tok->start; *p_end = tok->cur; return NUMBER; } letter_quote: /* String */ if (c == '\'' || c == '"') { Py_ssize_t quote2 = tok->cur - tok->start + 1; int quote = c; int triple = 0; int tripcount = 0; for (;;) { c = tok_nextc(tok); if (c == '\n') { if (!triple) { tok->done = E_EOLS; tok_backup(tok, c); return ERRORTOKEN; } tripcount = 0; tok->cont_line = 1; /* multiline string. */ } else if (c == EOF) { if (triple) tok->done = E_EOFS; else tok->done = E_EOLS; tok->cur = tok->inp; return ERRORTOKEN; } else if (c == quote) { tripcount++; if (tok->cur - tok->start == quote2) { c = tok_nextc(tok); if (c == quote) { triple = 1; tripcount = 0; continue; } tok_backup(tok, c); } if (!triple || tripcount == 3) break; } else if (c == '\\') { tripcount = 0; c = tok_nextc(tok); if (c == EOF) { tok->done = E_EOLS; tok->cur = tok->inp; return ERRORTOKEN; } } else tripcount = 0; } *p_start = tok->start; *p_end = tok->cur; return STRING; } /* Line continuation */ if (c == '\\') { c = tok_nextc(tok); if (c != '\n') { tok->done = E_LINECONT; tok->cur = tok->inp; return ERRORTOKEN; } tok->cont_line = 1; goto again; /* Read next line */ } /* Check for two-character token */ { int c2 = tok_nextc(tok); int token = PyToken_TwoChars(c, c2); #ifndef PGEN if (Py_Py3kWarningFlag && token == NOTEQUAL && c == '<') { if (PyErr_WarnExplicit(PyExc_DeprecationWarning, "<> not supported in 3.x; use !=", tok->filename, tok->lineno, NULL, NULL)) { return ERRORTOKEN; } } #endif if (token != OP) { int c3 = tok_nextc(tok); int token3 = PyToken_ThreeChars(c, c2, c3); if (token3 != OP) { token = token3; } else { tok_backup(tok, c3); } *p_start = tok->start; *p_end = tok->cur; return token; } tok_backup(tok, c2); } /* Keep track of parentheses nesting level */ switch (c) { case '(': case '[': case '{': tok->level++; break; case ')': case ']': case '}': tok->level--; break; } /* Punctuation character */ *p_start = tok->start; *p_end = tok->cur; return PyToken_OneChar(c); }
static int tok_get(struct tok_state *tok, char **p_start, char **p_end) { int c; int blankline, nonascii; *p_start = *p_end = NULL; nextline: tok->start = NULL; blankline = 0; /* Get indentation level */ if (tok->atbol) { int col = 0; int altcol = 0; tok->atbol = 0; for (;;) { c = tok_nextc(tok); if (c == ' ') col++, altcol++; else if (c == '\t') { col = (col/tok->tabsize + 1) * tok->tabsize; altcol = (altcol/tok->alttabsize + 1) * tok->alttabsize; } else if (c == '\014') /* Control-L (formfeed) */ col = altcol = 0; /* For Emacs users */ else break; } tok_backup(tok, c); if (c == '#' || c == '\n') { /* Lines with only whitespace and/or comments shouldn't affect the indentation and are not passed to the parser as NEWLINE tokens, except *totally* empty lines in interactive mode, which signal the end of a command group. */ if (col == 0 && c == '\n' && tok->prompt != NULL) blankline = 0; /* Let it through */ else blankline = 1; /* Ignore completely */ /* We can't jump back right here since we still may need to skip to the end of a comment */ } if (!blankline && tok->level == 0) { if (col == tok->indstack[tok->indent]) { /* No change */ if (altcol != tok->altindstack[tok->indent]) { if (indenterror(tok)) return ERRORTOKEN; } } else if (col > tok->indstack[tok->indent]) { /* Indent -- always one */ if (tok->indent+1 >= MAXINDENT) { tok->done = E_TOODEEP; tok->cur = tok->inp; return ERRORTOKEN; } if (altcol <= tok->altindstack[tok->indent]) { if (indenterror(tok)) return ERRORTOKEN; } tok->pendin++; tok->indstack[++tok->indent] = col; tok->altindstack[tok->indent] = altcol; } else /* col < tok->indstack[tok->indent] */ { /* Dedent -- any number, must be consistent */ while (tok->indent > 0 && col < tok->indstack[tok->indent]) { tok->pendin--; tok->indent--; } if (col != tok->indstack[tok->indent]) { tok->done = E_DEDENT; tok->cur = tok->inp; return ERRORTOKEN; } if (altcol != tok->altindstack[tok->indent]) { if (indenterror(tok)) return ERRORTOKEN; } } } } tok->start = tok->cur; /* Return pending indents/dedents */ if (tok->pendin != 0) { if (tok->pendin < 0) { tok->pendin++; return DEDENT; } else { tok->pendin--; return INDENT; } } again: tok->start = NULL; /* Skip spaces */ do { c = tok_nextc(tok); } while (c == ' ' || c == '\t' || c == '\014'); /* Set start of current token */ tok->start = tok->cur - 1; /* Skip comment */ if (c == '#') while (c != EOF && c != '\n') c = tok_nextc(tok); /* Check for EOF and errors now */ if (c == EOF) { return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN; } /* Identifier (most frequent token!) */ nonascii = 0; if (is_potential_identifier_start(c)) { /* Process b"", r"", u"", br"" and rb"" */ int saw_b = 0, saw_r = 0, saw_u = 0; while (1) { if (!(saw_b || saw_u) && (c == 'b' || c == 'B')) saw_b = 1; /* Since this is a backwards compatibility support literal we don't want to support it in arbitrary order like byte literals. */ else if (!(saw_b || saw_u || saw_r) && (c == 'u' || c == 'U')) saw_u = 1; /* ur"" and ru"" are not supported */ else if (!(saw_r || saw_u) && (c == 'r' || c == 'R')) saw_r = 1; else break; c = tok_nextc(tok); if (c == '"' || c == '\'') goto letter_quote; } while (is_potential_identifier_char(c)) { if (c >= 128) nonascii = 1; c = tok_nextc(tok); } tok_backup(tok, c); if (nonascii && !verify_identifier(tok)) return ERRORTOKEN; *p_start = tok->start; *p_end = tok->cur; return NAME; } /* Newline */ if (c == '\n') { tok->atbol = 1; if (blankline || tok->level > 0) goto nextline; *p_start = tok->start; *p_end = tok->cur - 1; /* Leave '\n' out of the string */ tok->cont_line = 0; return NEWLINE; } /* Period or number starting with period? */ if (c == '.') { c = tok_nextc(tok); if (isdigit(c)) { goto fraction; } else if (c == '.') { c = tok_nextc(tok); if (c == '.') { *p_start = tok->start; *p_end = tok->cur; return ELLIPSIS; } else { tok_backup(tok, c); } tok_backup(tok, '.'); } else { tok_backup(tok, c); } *p_start = tok->start; *p_end = tok->cur; return DOT; } /* Number */ if (isdigit(c)) { if (c == '0') { /* Hex, octal or binary -- maybe. */ c = tok_nextc(tok); if (c == '.') goto fraction; if (c == 'j' || c == 'J') goto imaginary; if (c == 'x' || c == 'X') { /* Hex */ c = tok_nextc(tok); if (!isxdigit(c)) { tok->done = E_TOKEN; tok_backup(tok, c); return ERRORTOKEN; } do { c = tok_nextc(tok); } while (isxdigit(c)); } else if (c == 'o' || c == 'O') { /* Octal */ c = tok_nextc(tok); if (c < '0' || c >= '8') { tok->done = E_TOKEN; tok_backup(tok, c); return ERRORTOKEN; } do { c = tok_nextc(tok); } while ('0' <= c && c < '8'); } else if (c == 'b' || c == 'B') { /* Binary */ c = tok_nextc(tok); if (c != '0' && c != '1') { tok->done = E_TOKEN; tok_backup(tok, c); return ERRORTOKEN; } do { c = tok_nextc(tok); } while (c == '0' || c == '1'); } else { int nonzero = 0; /* maybe old-style octal; c is first char of it */ /* in any case, allow '0' as a literal */ while (c == '0') c = tok_nextc(tok); while (isdigit(c)) { nonzero = 1; c = tok_nextc(tok); } if (c == '.') goto fraction; else if (c == 'e' || c == 'E') goto exponent; else if (c == 'j' || c == 'J') goto imaginary; else if (nonzero) { tok->done = E_TOKEN; tok_backup(tok, c); return ERRORTOKEN; } } } else { /* Decimal */ do { c = tok_nextc(tok); } while (isdigit(c)); { /* Accept floating point numbers. */ if (c == '.') { fraction: /* Fraction */ do { c = tok_nextc(tok); } while (isdigit(c)); } if (c == 'e' || c == 'E') { int e; exponent: e = c; /* Exponent part */ c = tok_nextc(tok); if (c == '+' || c == '-') { c = tok_nextc(tok); if (!isdigit(c)) { tok->done = E_TOKEN; tok_backup(tok, c); return ERRORTOKEN; } } else if (!isdigit(c)) { tok_backup(tok, c); tok_backup(tok, e); *p_start = tok->start; *p_end = tok->cur; return NUMBER; } do { c = tok_nextc(tok); } while (isdigit(c)); } if (c == 'j' || c == 'J') /* Imaginary part */ imaginary: c = tok_nextc(tok); } } tok_backup(tok, c); *p_start = tok->start; *p_end = tok->cur; return NUMBER; } letter_quote: /* String */ if (c == '\'' || c == '"') { int quote = c; int quote_size = 1; /* 1 or 3 */ int end_quote_size = 0; /* Find the quote size and start of string */ c = tok_nextc(tok); if (c == quote) { c = tok_nextc(tok); if (c == quote) quote_size = 3; else end_quote_size = 1; /* empty string found */ } if (c != quote) tok_backup(tok, c); /* Get rest of string */ while (end_quote_size != quote_size) { c = tok_nextc(tok); if (c == EOF) { if (quote_size == 3) tok->done = E_EOFS; else tok->done = E_EOLS; tok->cur = tok->inp; return ERRORTOKEN; } if (quote_size == 1 && c == '\n') { tok->done = E_EOLS; tok->cur = tok->inp; return ERRORTOKEN; } if (c == quote) end_quote_size += 1; else { end_quote_size = 0; if (c == '\\') c = tok_nextc(tok); /* skip escaped char */ } } *p_start = tok->start; *p_end = tok->cur; return STRING; } /* Line continuation */ if (c == '\\') { c = tok_nextc(tok); if (c != '\n') { tok->done = E_LINECONT; tok->cur = tok->inp; return ERRORTOKEN; } tok->cont_line = 1; goto again; /* Read next line */ } /* Check for two-character token */ { int c2 = tok_nextc(tok); int token = PyToken_TwoChars(c, c2); if (token != OP) { int c3 = tok_nextc(tok); int token3 = PyToken_ThreeChars(c, c2, c3); if (token3 != OP) { token = token3; } else { tok_backup(tok, c3); } *p_start = tok->start; *p_end = tok->cur; return token; } tok_backup(tok, c2); } /* Keep track of parentheses nesting level */ switch (c) { case '(': case '[': case '{': tok->level++; break; case ')': case ']': case '}': tok->level--; break; } /* Punctuation character */ *p_start = tok->start; *p_end = tok->cur; return PyToken_OneChar(c); }