/*
 * Parses a single value starting at the current token, with an optional
 * leading '-'.
 *
 * Integer and float tokens are handed to read_integer_value /
 * read_float_value together with the sign flag. `true` / `false` become
 * vt_bool values. String literals are parsed with parse_string_literal and
 * are only accepted when `flags` contains `allow_string_value`. Identifiers
 * are parsed as references (vt_name_ref).
 *
 * A sign in front of a boolean, string or named value is rejected: the
 * value type is set to vt_invalid and an error is reported. Any other token
 * is reported with `error_msg`.
 *
 * Integer, float and boolean tokens are consumed with a trailing next();
 * the string, identifier and error paths return without that call.
 */
static void parse_value(fb_parser_t *P, fb_value_t *v, int flags, const char *error_msg)
{
    fb_token_t *minus = optional(P, '-');
    fb_token_t *tok = P->token;
    const char *complaint = 0;

    switch (tok->id) {
    case LEX_TOK_INT:
        read_integer_value(P, tok, v, minus != 0);
        break;

    case LEX_TOK_FLOAT:
        read_float_value(P, tok, v, minus != 0);
        break;

    case tok_kw_true:
    case tok_kw_false:
        v->b = (tok->id == tok_kw_true) ? 1 : 0;
        v->type = vt_bool;
        break;

    case LEX_TOK_STRING_BEGIN:
        next(P);
        parse_string_literal(P, v);
        /* The "strings not allowed here" error takes priority over the sign error. */
        if (!(flags & allow_string_value)) {
            complaint = error_msg;
        } else if (minus) {
            complaint = "string constants cannot be signed";
        }
        if (complaint) {
            v->type = vt_invalid;
            error_tok(P, tok, complaint);
        }
        return;

    case LEX_TOK_ID:
        parse_ref(P, &v->ref);
        v->type = vt_name_ref;
        if (minus) {
            /* Signed names would be possible in principle, but are not supported. */
            v->type = vt_invalid;
            error_tok(P, tok, "named values cannot be signed");
        }
        return;

    default:
        /* A sign may already have been consumed at this point; that is ignored. */
        error_tok(P, tok, error_msg);
        return;
    }

    /* Only the numeric and boolean cases reach this point. */
    if (minus && v->type == vt_bool) {
        v->type = vt_invalid;
        error_tok(P, tok, "boolean constants cannot be signed");
    }
    next(P);
}
/*
 * Parses the remainder of an attribute declaration: a string literal naming
 * the attribute followed by ';'.
 *
 * The literal is stored in `a->name.name`. An empty name is reported against
 * the token that was current on entry. The terminating ';' is matched even
 * when the string literal was missing.
 */
static void parse_attribute(fb_parser_t *P, fb_attribute_t *a)
{
    fb_token_t *first = P->token;

    if (!match(P, LEX_TOK_STRING_BEGIN, "attribute expected string literal")) {
        goto terminator;
    }
    parse_string_literal(P, &a->name.name);
    if (a->name.name.s.len == 0) {
        error_tok_as_string(P, first, "attribute name cannot be empty", 0, 0);
    }

terminator:
    match(P, ';', "attribute expected ';'");
}
/*
 * Parses the remainder of a file_extension declaration: a string literal
 * followed by ';', storing the literal in `v`.
 *
 * If `v` already holds a string on entry, a "file extension already set"
 * error quoting the previous value is reported, and parsing then continues.
 * When the string literal is missing, the parser recovers at ';' and returns
 * without touching `v`.
 */
static void parse_file_extension(fb_parser_t *P, fb_value_t *v)
{
    const int already_set = (v->type == vt_string);

    if (already_set) {
        error_tok_as_string(P, P->token, "file extension already set", v->s.s, v->s.len);
    }

    if (!match(P, LEX_TOK_STRING_BEGIN, "file_extension expected string literal")) {
        recover(P, ';', 1);
        return;
    }

    parse_string_literal(P, v);
    match(P, ';', "file_extension expected ';'");
}
// factor = number | string_literal | "(" expression ")" | // variable | "this" | "null" | "true" | "false" | // "{" object_literal "}" | // "[" array_literal "]" | // function_definition | // function_call static enum v7_err parse_factor(struct v7 *v7) { int old_sp = v7_sp(v7); if (*v7->cursor == '(') { TRY(match(v7, '(')); TRY(parse_expression(v7)); TRY(match(v7, ')')); } else if (*v7->cursor == '\'' || *v7->cursor == '"') { TRY(parse_string_literal(v7)); } else if (*v7->cursor == '{') { TRY(parse_object_literal(v7)); } else if (is_alpha(*v7->cursor) || *v7->cursor == '_') { TRY(parse_identifier(v7)); if (test_token(v7, "this", 4)) { inc_stack(v7, 1); v7_top(v7)[-1] = &v7->scopes[v7->current_scope]; } else if (test_token(v7, "null", 4)) { TRY(v7_make_and_push(v7, V7_NULL)); } else if (test_token(v7, "true", 4)) { TRY(v7_make_and_push(v7, V7_BOOL)); v7_top(v7)[-1]->v.num = 1; } else if (test_token(v7, "false", 5)) { TRY(v7_make_and_push(v7, V7_BOOL)); v7_top(v7)[-1]->v.num = 0; } else if (test_token(v7, "function", 8)) { TRY(parse_function_definition(v7, NULL, 0)); } else if (test_token(v7, "delete", 6)) { TRY(parse_delete(v7)); } else { TRY(parse_variable(v7)); } } else { TRY(parse_num(v7)); } if (*v7->cursor == '(') { TRY(parse_function_call(v7)); } // Don't leave anything on stack if no execution flag is set if (v7->no_exec) { inc_stack(v7, old_sp - v7->sp); } return V7_OK; }
/*
 * Parses a run of consecutive `include "<file>";` statements.
 *
 * For every include keyword found at the current position:
 *  - an error is reported if includes are disabled in the parser options
 *    (the statement is still parsed afterwards);
 *  - parsing stops once P->failed has reached FLATCC_MAX_ERRORS;
 *  - the filename literal is stored in the `name` of a record obtained from
 *    fb_add_include(), and the closing ';' is matched.
 *
 * Changes compared to the previous version:
 *  - The token used for diagnostics is captured per statement. It used to be
 *    captured once before the loop, so errors for later includes pointed at
 *    the first statement.
 *  - When the filename literal is missing, the statement is abandoned after
 *    recover(). Previously an include record was still added and filled from
 *    whatever token followed, and ';' was matched a second time. This now
 *    mirrors the other declaration parsers, which return right after
 *    recover(P, ';', 1).
 *    NOTE(review): assumes recover(P, ';', 1) leaves the parser positioned
 *    after the ';' - confirm against its definition.
 */
static void parse_include(fb_parser_t *P)
{
    fb_token_t *t;

    for (;;) {
        /* Remember the keyword token of this statement for error reporting. */
        t = P->token;
        if (!optional(P, tok_kw_include)) {
            return;
        }
        if (P->opts.disable_includes) {
            error_tok(P, t, "include statements not supported by current environment");
        }
        if (P->failed >= FLATCC_MAX_ERRORS) {
            return;
        }
        if (!match(P, LEX_TOK_STRING_BEGIN,
                "include expected a string literal as filename")) {
            /* No filename: skip this statement instead of recording a bogus include. */
            recover(P, ';', 1);
            continue;
        }
        parse_string_literal(P, &fb_add_include(P)->name);
        match(P, ';', "include statement expected ';'");
    }
}
/*
 * Parses the remainder of a file_identifier declaration: a string literal
 * followed by ';', storing the literal in `v`.
 *
 * If `v` is not vt_missing on entry, a "file identifier already set" error
 * is reported and parsing continues. A parsed literal whose length differs
 * from 4 marks `v` as vt_invalid and is reported against the token that
 * followed the string-begin marker. When the string literal is missing, the
 * parser recovers at ';' and returns.
 */
static void parse_file_identifier(fb_parser_t *P, fb_value_t *v)
{
    fb_token_t *literal;

    if (v->type != vt_missing) {
        error_tok_as_string(P, P->token, "file identifier already set", v->s.s, v->s.len);
    }

    if (!match(P, LEX_TOK_STRING_BEGIN, "file_identifier expected string literal")) {
        recover(P, ';', 1);
        return;
    }

    literal = P->token;
    parse_string_literal(P, v);

    /* The length is only checked when a string pointer was actually stored. */
    if (v->s.s && v->s.len != 4) {
        v->type = vt_invalid;
        error_tok(P, literal, "file_identifier must be 4 characters");
    }

    match(P, ';', "file_identifier expected ';'");
}
static enum v7_err parse_object_literal(struct v7 *v7) { TRY(v7_make_and_push(v7, V7_OBJ)); TRY(match(v7, '{')); while (*v7->cursor != '}') { if (*v7->cursor == '\'' || *v7->cursor == '"') { TRY(parse_string_literal(v7)); } else { TRY(parse_identifier(v7)); TRY(v7_make_and_push_string(v7, v7->tok, v7->tok_len, 1)); } TRY(match(v7, ':')); TRY(parse_expression(v7)); if (!v7->no_exec) { struct v7_val **v = v7_top(v7) - 3; CHECK(v[0]->type == V7_OBJ, V7_TYPE_MISMATCH); v7_set(v7, v[0], v[1], v[2]); inc_stack(v7, -2); } test_and_skip_char(v7, ','); } TRY(match(v7, '}')); return V7_OK; }
/*
 * Returns the next token from the script source.
 *
 * Whitespace and comments (including HTML-style comments) are skipped first;
 * tEOF is returned once ctx->ptr reaches ctx->end. Words starting with a
 * letter are checked against the keyword table and otherwise parsed as
 * identifiers; '_' and '$' also start identifiers. Digits (and '.' followed
 * by a digit) start numeric literals, quotes start string literals.
 *
 * Punctuation is returned as its own character code. For the operator
 * classes (tRelOper, tShiftOper, tEqOper, tAssignOper and kDIVEQ) the
 * concrete EXPR_* kind is stored through `lval` as an int. For '}' the
 * position of the brace in the source is stored through `lval` as a
 * const WCHAR pointer.
 *
 * Returns 0 (after logging a warning) for a character that starts no token.
 *
 * Changes compared to the previous version:
 *  - '>>>' is now classified as tShiftOper, like '<<' and '>>'. It used to be
 *    returned as tRelOper, which gives the unsigned right shift the
 *    precedence of a relational operator.
 *  - '@' is consumed before it is returned. It used to be returned without
 *    advancing ctx->ptr, so a following call produced the same token again.
 */
static int next_token(parser_ctx_t *ctx, void *lval)
{
    do {
        skip_spaces(ctx);
        if(ctx->ptr == ctx->end)
            return tEOF;
    }while(skip_comment(ctx) || skip_html_comment(ctx));

    if(isalphaW(*ctx->ptr)) {
        int ret = check_keywords(ctx, lval);
        if(ret)
            return ret;

        return parse_identifier(ctx, lval);
    }

    if(isdigitW(*ctx->ptr))
        return parse_numeric_literal(ctx, lval);

    switch(*ctx->ptr) {
    case '{':
    case '(':
    case ')':
    case '[':
    case ']':
    case ';':
    case ',':
    case '~':
    case '?':
    case ':':
        return *ctx->ptr++;

    case '}':
        /* Hand the position of the closing brace to the parser. */
        *(const WCHAR**)lval = ctx->ptr++;
        return '}';

    case '.':
        if(++ctx->ptr < ctx->end && isdigitW(*ctx->ptr))
            return parse_double_literal(ctx, 0, lval);
        return '.';

    case '<':
        if(++ctx->ptr == ctx->end) {
            *(int*)lval = EXPR_LESS;
            return tRelOper;
        }

        switch(*ctx->ptr) {
        case '=':  /* <= */
            ctx->ptr++;
            *(int*)lval = EXPR_LESSEQ;
            return tRelOper;
        case '<':  /* << */
            if(++ctx->ptr < ctx->end && *ctx->ptr == '=') {  /* <<= */
                ctx->ptr++;
                *(int*)lval = EXPR_ASSIGNLSHIFT;
                return tAssignOper;
            }
            *(int*)lval = EXPR_LSHIFT;
            return tShiftOper;
        default:  /* < */
            *(int*)lval = EXPR_LESS;
            return tRelOper;
        }

    case '>':
        if(++ctx->ptr == ctx->end) {  /* > */
            *(int*)lval = EXPR_GREATER;
            return tRelOper;
        }

        switch(*ctx->ptr) {
        case '=':  /* >= */
            ctx->ptr++;
            *(int*)lval = EXPR_GREATEREQ;
            return tRelOper;
        case '>':  /* >> */
            if(++ctx->ptr < ctx->end) {
                if(*ctx->ptr == '=') {  /* >>= */
                    ctx->ptr++;
                    *(int*)lval = EXPR_ASSIGNRSHIFT;
                    return tAssignOper;
                }
                if(*ctx->ptr == '>') {  /* >>> */
                    if(++ctx->ptr < ctx->end && *ctx->ptr == '=') {  /* >>>= */
                        ctx->ptr++;
                        *(int*)lval = EXPR_ASSIGNRRSHIFT;
                        return tAssignOper;
                    }
                    /* Unsigned right shift is a shift operator, not a relational one. */
                    *(int*)lval = EXPR_RRSHIFT;
                    return tShiftOper;
                }
            }
            *(int*)lval = EXPR_RSHIFT;
            return tShiftOper;
        default:
            *(int*)lval = EXPR_GREATER;
            return tRelOper;
        }

    case '+':
        ctx->ptr++;
        if(ctx->ptr < ctx->end) {
            switch(*ctx->ptr) {
            case '+':  /* ++ */
                ctx->ptr++;
                return tINC;
            case '=':  /* += */
                ctx->ptr++;
                *(int*)lval = EXPR_ASSIGNADD;
                return tAssignOper;
            }
        }
        return '+';

    case '-':
        ctx->ptr++;
        if(ctx->ptr < ctx->end) {
            switch(*ctx->ptr) {
            case '-':  /* -- or --> */
                ctx->ptr++;
                /* "-->" is only a token in HTML mode and when ctx->nl is set. */
                if(ctx->is_html && ctx->nl && ctx->ptr < ctx->end && *ctx->ptr == '>') {
                    ctx->ptr++;
                    return tHTMLCOMMENT;
                }
                return tDEC;
            case '=':  /* -= */
                ctx->ptr++;
                *(int*)lval = EXPR_ASSIGNSUB;
                return tAssignOper;
            }
        }
        return '-';

    case '*':
        if(++ctx->ptr < ctx->end && *ctx->ptr == '=') { /* *= */
            ctx->ptr++;
            *(int*)lval = EXPR_ASSIGNMUL;
            return tAssignOper;
        }
        return '*';

    case '%':
        if(++ctx->ptr < ctx->end && *ctx->ptr == '=') { /* %= */
            ctx->ptr++;
            *(int*)lval = EXPR_ASSIGNMOD;
            return tAssignOper;
        }
        return '%';

    case '&':
        if(++ctx->ptr < ctx->end) {
            switch(*ctx->ptr) {
            case '=':  /* &= */
                ctx->ptr++;
                *(int*)lval = EXPR_ASSIGNAND;
                return tAssignOper;
            case '&':  /* && */
                ctx->ptr++;
                return tANDAND;
            }
        }
        return '&';

    case '|':
        if(++ctx->ptr < ctx->end) {
            switch(*ctx->ptr) {
            case '=':  /* |= */
                ctx->ptr++;
                *(int*)lval = EXPR_ASSIGNOR;
                return tAssignOper;
            case '|':  /* || */
                ctx->ptr++;
                return tOROR;
            }
        }
        return '|';

    case '^':
        if(++ctx->ptr < ctx->end && *ctx->ptr == '=') {  /* ^= */
            ctx->ptr++;
            *(int*)lval = EXPR_ASSIGNXOR;
            return tAssignOper;
        }
        return '^';

    case '!':
        if(++ctx->ptr < ctx->end && *ctx->ptr == '=') {  /* != */
            if(++ctx->ptr < ctx->end && *ctx->ptr == '=') {  /* !== */
                ctx->ptr++;
                *(int*)lval = EXPR_NOTEQEQ;
                return tEqOper;
            }
            *(int*)lval = EXPR_NOTEQ;
            return tEqOper;
        }
        return '!';

    case '=':
        if(++ctx->ptr < ctx->end && *ctx->ptr == '=') {  /* == */
            if(++ctx->ptr < ctx->end && *ctx->ptr == '=') {  /* === */
                ctx->ptr++;
                *(int*)lval = EXPR_EQEQ;
                return tEqOper;
            }
            *(int*)lval = EXPR_EQ;
            return tEqOper;
        }
        return '=';

    case '/':
        if(++ctx->ptr < ctx->end) {
            if(*ctx->ptr == '=') {  /* /= */
                ctx->ptr++;
                *(int*)lval = EXPR_ASSIGNDIV;
                return kDIVEQ;
            }
        }
        return '/';

    case '\"':
    case '\'':
        return parse_string_literal(ctx, lval, *ctx->ptr);

    case '_':
    case '$':
        return parse_identifier(ctx, lval);

    case '@':
        /* Consume the character so the same token is not produced twice. */
        ctx->ptr++;
        return '@';
    }

    WARN("unexpected char '%c' %d\n", *ctx->ptr, *ctx->ptr);
    return 0;
}
skO *skO_parse (char **next, jmp_buf jmp, char *delim) { skO *obj; /* parsed token (or NULL) */ skO *list = skO_list_new(); /* used to store parsed objects */ skO *prefixed = NULL; /* store "prefix sugar" tokens */ char *src = *next; jmp_buf pe; int ended = 0; consume_leading(&src); if (setjmp(pe)) { if (prefixed) free(prefixed); skO_free(list); printf("parse jmp\n"); } if (delim) { if (*src != delim[0]) return NULL; src++; consume_leading(&src); #ifdef SK_PARSER_DEBUG printf("Parsed %c\n", delim[0]); #endif } else if (*src == 0) { ended = 1; exit(EXIT_FAILURE); } while (!ended) { consume_leading(&src); if (*src == 0) { ended = 1; *next = src; break; } /* Handle end of lists and prefixed syntax. */ if (delim) { if (*src == delim[1]) { src++; *next = src; #ifdef SK_PARSER_DEBUG printf("Parsed %c\n", delim[1]); #endif return list; } } /* Handle "prefix style" syntactic sugar. */ prefixed = skO_parse(&src, pe, "()"); if (prefixed) consume_leading(&src); /* Handle "reserving operations" syntactic sugar. */ obj = parse_op_def(&src); if (obj) { sk_list_append(list, obj); obj = skO_symbol_new("$=>"); goto matched; } obj = parse_obj_reserve(&src); if (obj) { sk_list_append(list, obj); obj = skO_symbol_new("$->"); goto matched; } obj = parse_obj_restore(&src); if (obj) { sk_list_append(list, obj); obj = skO_symbol_new("$<-"); goto matched; } /* Handle regular tokens. */ if ((obj = parse_string_literal(&src, pe)) || (obj = parse_number(&src)) || (obj = parse_qidentifier(&src)) || (obj = parse_identifier(&src)) || (obj = parse_character_literal(&src, pe)) || (obj = skO_parse(&src, pe, "[]"))) goto matched; /* If everything failed... */ fprintf(stderr, "PANIC! Parsing error: %s\n", src); longjmp(jmp, 1); matched: sk_list_append(list, obj); if (prefixed) { sk_list_append(list, prefixed->data.list); free(prefixed); prefixed = NULL; } } *next = src; return list; }
// Advance the lexer to the next token.
//
// The token text buffer (lex->vstr) is reset, whitespace and comments are
// skipped, and the token's line/column are recorded. The result is stored in
// lex->tok_kind, with the token text (for names, numbers and string/bytes
// literals) accumulated in lex->vstr.
//
// The branches are tried in this order: pending DEDENT/INDENT tokens, a
// NEWLINE (which also updates the indentation stack), end of input,
// string/bytes literals, names and keywords, numbers, and finally delimiters
// and operators looked up in the tok_enc table.
void mp_lexer_to_next(mp_lexer_t *lex) {
    // start new token text
    vstr_reset(&lex->vstr);

    // skip white space and comments
    bool had_physical_newline = skip_whitespace(lex, false);

    // set token source information
    lex->tok_line = lex->line;
    lex->tok_column = lex->column;

    if (lex->emit_dent < 0) {
        // a negative count means DEDENT tokens are still owed; emit one per call
        lex->tok_kind = MP_TOKEN_DEDENT;
        lex->emit_dent += 1;

    } else if (lex->emit_dent > 0) {
        // a positive count means INDENT tokens are still owed; emit one per call
        lex->tok_kind = MP_TOKEN_INDENT;
        lex->emit_dent -= 1;

    } else if (had_physical_newline && lex->nested_bracket_level == 0) {
        // a newline only produces a token outside of brackets
        lex->tok_kind = MP_TOKEN_NEWLINE;

        // compare the column of the new line with the top of the indent stack
        size_t num_spaces = lex->column - 1;
        if (num_spaces == indent_top(lex)) {
            // same indentation: nothing to do
        } else if (num_spaces > indent_top(lex)) {
            // deeper indentation: remember it and schedule one INDENT
            indent_push(lex, num_spaces);
            lex->emit_dent += 1;
        } else {
            // shallower indentation: pop levels, scheduling one DEDENT for each
            while (num_spaces < indent_top(lex)) {
                indent_pop(lex);
                lex->emit_dent -= 1;
            }
            // the new indentation must match one of the enclosing levels
            if (num_spaces != indent_top(lex)) {
                lex->tok_kind = MP_TOKEN_DEDENT_MISMATCH;
            }
        }

    } else if (is_end(lex)) {
        lex->tok_kind = MP_TOKEN_END;

    } else if (is_string_or_bytes(lex)) {
        // a string or bytes literal

        // Python requires adjacent string/bytes literals to be automatically
        // concatenated.  We do it here in the tokeniser to make efficient use of RAM,
        // because then the lexer's vstr can be used to accumulate the string literal,
        // in contrast to creating a parse tree of strings and then joining them later
        // in the compiler.  It's also more compact in code size to do it here.

        // MP_TOKEN_END is used to indicate that this is the first string token
        lex->tok_kind = MP_TOKEN_END;

        // Loop to accumulate string/bytes literals
        do {
            // parse type codes
            bool is_raw = false;
            mp_token_kind_t kind = MP_TOKEN_STRING;
            int n_char = 0;
            if (is_char(lex, 'u')) {
                n_char = 1;
            } else if (is_char(lex, 'b')) {
                kind = MP_TOKEN_BYTES;
                n_char = 1;
                if (is_char_following(lex, 'r')) {
                    is_raw = true;
                    n_char = 2;
                }
            } else if (is_char(lex, 'r')) {
                is_raw = true;
                n_char = 1;
                if (is_char_following(lex, 'b')) {
                    kind = MP_TOKEN_BYTES;
                    n_char = 2;
                }
            }

            // Set or check token kind
            if (lex->tok_kind == MP_TOKEN_END) {
                lex->tok_kind = kind;
            } else if (lex->tok_kind != kind) {
                // Can't concatenate string with bytes
                break;
            }

            // Skip any type code characters
            if (n_char != 0) {
                next_char(lex);
                if (n_char == 2) {
                    next_char(lex);
                }
            }

            // Parse the literal
            parse_string_literal(lex, is_raw);

            // Skip whitespace so we can check if there's another string following
            skip_whitespace(lex, true);

        } while (is_string_or_bytes(lex));

    } else if (is_head_of_identifier(lex)) {
        lex->tok_kind = MP_TOKEN_NAME;

        // get first char (add as byte to remain 8-bit clean and support utf-8)
        vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
        next_char(lex);

        // get tail chars
        while (!is_end(lex) && is_tail_of_identifier(lex)) {
            vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
            next_char(lex);
        }

        // Check if the name is a keyword.
        // We also check for __debug__ here and convert it to its value.  This is
        // so the parser gives a syntax error on, eg, x.__debug__.  Otherwise, we
        // need to check for this special token in many places in the compiler.
        const char *s = vstr_null_terminated_str(&lex->vstr);
        for (size_t i = 0; i < MP_ARRAY_SIZE(tok_kw); i++) {
            int cmp = strcmp(s, tok_kw[i]);
            if (cmp == 0) {
                // keyword token kinds are numbered from MP_TOKEN_KW_FALSE in table order
                lex->tok_kind = MP_TOKEN_KW_FALSE + i;
                if (lex->tok_kind == MP_TOKEN_KW___DEBUG__) {
                    lex->tok_kind = (MP_STATE_VM(mp_optimise_value) == 0 ? MP_TOKEN_KW_TRUE : MP_TOKEN_KW_FALSE);
                }
                break;
            } else if (cmp < 0) {
                // Table is sorted and comparison was less-than, so stop searching
                break;
            }
        }

    } else if (is_digit(lex) || (is_char(lex, '.') && is_following_digit(lex))) {
        // a number; forced_integer is set for a leading '0' followed by a base
        // character, in which case 'e'/'E' does not start an exponent
        bool forced_integer = false;
        if (is_char(lex, '.')) {
            lex->tok_kind = MP_TOKEN_FLOAT_OR_IMAG;
        } else {
            lex->tok_kind = MP_TOKEN_INTEGER;
            if (is_char(lex, '0') && is_following_base_char(lex)) {
                forced_integer = true;
            }
        }

        // get first char
        vstr_add_char(&lex->vstr, CUR_CHAR(lex));
        next_char(lex);

        // get tail chars
        while (!is_end(lex)) {
            if (!forced_integer && is_char_or(lex, 'e', 'E')) {
                // exponent marker: always stored as lower-case 'e', followed by an optional sign
                lex->tok_kind = MP_TOKEN_FLOAT_OR_IMAG;
                vstr_add_char(&lex->vstr, 'e');
                next_char(lex);
                if (is_char(lex, '+') || is_char(lex, '-')) {
                    vstr_add_char(&lex->vstr, CUR_CHAR(lex));
                    next_char(lex);
                }
            } else if (is_letter(lex) || is_digit(lex) || is_char(lex, '.')) {
                // a '.', 'j' or 'J' anywhere in the tail makes this a float/imaginary token
                if (is_char_or3(lex, '.', 'j', 'J')) {
                    lex->tok_kind = MP_TOKEN_FLOAT_OR_IMAG;
                }
                vstr_add_char(&lex->vstr, CUR_CHAR(lex));
                next_char(lex);
            } else {
                break;
            }
        }

    } else {
        // search for encoded delimiter or operator

        // Walk tok_enc until the current character is found. An 'e' or 'c'
        // entry is followed by one extra character that is stepped over;
        // tok_enc_index counts the entries passed so far.
        const char *t = tok_enc;
        size_t tok_enc_index = 0;
        for (; *t != 0 && !is_char(lex, *t); t += 1) {
            if (*t == 'e' || *t == 'c') {
                t += 1;
            }
            tok_enc_index += 1;
        }

        next_char(lex);

        if (*t == 0) {
            // didn't match any delimiter or operator characters
            lex->tok_kind = MP_TOKEN_INVALID;

        } else if (*t == '!') {
            // "!=" is a special case because "!" is not a valid operator
            if (is_char(lex, '=')) {
                next_char(lex);
                lex->tok_kind = MP_TOKEN_OP_NOT_EQUAL;
            } else {
                lex->tok_kind = MP_TOKEN_INVALID;
            }

        } else if (*t == '.') {
            // "." and "..." are special cases because ".." is not a valid operator
            if (is_char_and(lex, '.', '.')) {
                next_char(lex);
                next_char(lex);
                lex->tok_kind = MP_TOKEN_ELLIPSIS;
            } else {
                lex->tok_kind = MP_TOKEN_DEL_PERIOD;
            }

        } else {
            // matched a delimiter or operator character

            // get the maximum characters for a valid token
            // (each 'c'/'e' entry after the matched character names one possible
            // continuation character in t[1]; t_index tracks the table index of
            // the entry being examined)
            t += 1;
            size_t t_index = tok_enc_index;
            while (*t == 'c' || *t == 'e') {
                t_index += 1;
                if (is_char(lex, t[1])) {
                    next_char(lex);
                    tok_enc_index = t_index;
                    if (*t == 'e') {
                        break;
                    }
                } else if (*t == 'c') {
                    break;
                }
                t += 2;
            }

            // set token kind
            lex->tok_kind = tok_enc_kind[tok_enc_index];

            // compute bracket level for implicit line joining
            if (lex->tok_kind == MP_TOKEN_DEL_PAREN_OPEN || lex->tok_kind == MP_TOKEN_DEL_BRACKET_OPEN || lex->tok_kind == MP_TOKEN_DEL_BRACE_OPEN) {
                lex->nested_bracket_level += 1;
            } else if (lex->tok_kind == MP_TOKEN_DEL_PAREN_CLOSE || lex->tok_kind == MP_TOKEN_DEL_BRACKET_CLOSE || lex->tok_kind == MP_TOKEN_DEL_BRACE_CLOSE) {
                lex->nested_bracket_level -= 1;
            }
        }
    }
}
/*
 * Returns the next token from the script source.
 *
 * Leading spaces are skipped first. At the end of the input a tNL is
 * produced unless the previous token already was tNL, in which case tEOF is
 * returned. Digits start numeric literals; words are checked against the
 * keyword table and are otherwise identifiers. The tREM keyword is handled
 * exactly like the ' comment character. Single-character punctuation is
 * returned as its own character code.
 *
 * Returns 0 (after logging a FIXME) for a character that starts no token.
 */
static int parse_next_token(void *lval, parser_ctx_t *ctx)
{
    WCHAR ch;

    skip_spaces(ctx);
    if(ctx->ptr == ctx->end) {
        if(ctx->last_token == tNL)
            return tEOF;
        return tNL;
    }

    ch = *ctx->ptr;

    if(ch >= '0' && ch <= '9')
        return parse_numeric_literal(ctx, lval);

    if(isalphaW(ch)) {
        int keyword = check_keywords(ctx);

        if(keyword == 0)
            return parse_identifier(ctx, lval);
        if(keyword != tREM)
            return keyword;

        /* REM starts a comment, just like the apostrophe. */
        ch = '\'';
    }

    switch(ch) {
    case '\n':
        ctx->ptr++;
        return tNL;

    case '\'':
        return comment_line(ctx);

    case ':':
    case ')':
    case ',':
    case '=':
    case '+':
    case '*':
    case '/':
    case '^':
    case '\\':
    case '.':
    case '_':
        return *ctx->ptr++;

    case '-':
        /* In HTML mode "-->" is treated as a comment line. */
        if(ctx->is_html && ctx->ptr[1] == '-' && ctx->ptr[2] == '>')
            return comment_line(ctx);
        ctx->ptr++;
        return '-';

    case '(':
        /* NOTE:
         * We resolve empty brackets in lexer instead of parser to avoid complex conflicts
         * in call statement special case |f()| without 'call' keyword
         */
        ctx->ptr++;
        skip_spaces(ctx);
        if(*ctx->ptr != ')')
            return '(';
        ctx->ptr++;
        return tEMPTYBRACKETS;

    case '"':
        return parse_string_literal(ctx, lval);

    case '&':
        /* "&h" / "&H" introduces a hex literal. */
        ctx->ptr++;
        if(*ctx->ptr == 'h' || *ctx->ptr == 'H')
            return parse_hex_literal(ctx, lval);
        return '&';

    case '<': {
        WCHAR follow = *++ctx->ptr;

        if(follow == '>') {
            ctx->ptr++;
            return tNEQ;
        }
        if(follow == '=') {
            ctx->ptr++;
            return tLTEQ;
        }
        /* In HTML mode "<!--" is treated as a comment line. */
        if(follow == '!' && ctx->is_html && ctx->ptr[1] == '-' && ctx->ptr[2] == '-')
            return comment_line(ctx);
        return '<';
    }

    case '>':
        ctx->ptr++;
        if(*ctx->ptr == '=') {
            ctx->ptr++;
            return tGTEQ;
        }
        return '>';

    default:
        FIXME("Unhandled char %c in %s\n", *ctx->ptr, debugstr_w(ctx->ptr));
        break;
    }

    return 0;
}