/* * Fetch the current lexer state and throw an error. */ TokenizeError Impl::error(const std::string &msg) { return TokenizeError{ Location{ this->file, yyget_lineno(this->scanner), this->linepos - static_cast<int>(yyget_leng(this->scanner)), static_cast<int>(yyget_leng(this->scanner)) }, msg }; }
/*
 * Turn the lexer's current match into a Token and queue it.
 *
 * Position information comes from the scanner (line number, match length)
 * and from `linepos`, which marks the end of the match.
 */
void Impl::token(token_type type) {
	const int match_length = yyget_leng(this->scanner);
	const int start_column = this->linepos - match_length;

	// an ENDLINE token is attributed to the previous line number:
	// don't assign the `\n` for the next line.
	const int line = yyget_lineno(this->scanner)
	                 - (type == token_type::ENDLINE ? 1 : 0);

	// register opening and closing brackets so that
	// line-wrap indentation can be checked correctly.
	this->track_brackets(type, start_column);

	if (not token_needs_payload(type)) {
		this->tokens.push(Token{
			this->file, line, start_column, match_length, type
		});
		return;
	}

	// payload-carrying tokens additionally store the matched text
	this->tokens.push(Token{
		this->file, line, start_column, match_length, type,
		yyget_text(this->scanner)
	});
}
/* * measure the indentation of a line */ void Impl::handle_indent(int depth) { this->linepos -= yyget_leng(this->scanner) - depth; if (not this->brackets.empty()) { // we're in a pair of brackets, // there the indentation is way funnier. // check if the content indentation is correct. int expected = this->brackets.top().get_content_indent(); if (depth != expected) { // if the expected depth is not correct, // then the only thing that is allowed is // the closing bracket. // the check will be done for the next token in // `track_brackets`. this->bracketcloseindent_expected = true; } // don't need to track the indent stack, // this is done in the bracket tracking now. return; } // regular indent is enforced when not in a bracket pair if ((depth % SPACES_PER_INDENT) > 0) { std::ostringstream builder; builder << "indentation requires exactly " << SPACES_PER_INDENT << " spaces per level"; throw this->error(builder.str()); } if (depth == this->previous_indent) { // same indent level, ignore return; } else if (depth < this->previous_indent) { // current line is further left than the previous one int delta = this->previous_indent - depth; while (delta > 0) { delta -= SPACES_PER_INDENT; this->token(token_type::DEDENT); } } else { // current line has more depth than the previous one int delta = depth - this->previous_indent; while (delta > 0) { delta -= SPACES_PER_INDENT; this->token(token_type::INDENT); } } this->previous_indent = depth; }
void FidlParser::lexer_error_callback(const char* error_msg) { const std::string& source_line = get_line(open_files_.back().file_content_, yyget_lineno(lexer_)); std::cout << open_files_.back().filename_ << ":" << yyget_lineno(lexer_) << ":" << yyget_column(lexer_) - yyget_leng(lexer_) << ": " << error_msg << std::endl; std::cout << ">" << source_line; first_error_ = ParserStatus::LEXING_ERROR; }
void wget_css_parse_buffer( const char *buf, void(*callback_uri)(void *user_ctx, const char *url, size_t len, size_t pos), void(*callback_encoding)(void *user_ctx, const char *url, size_t len), void *user_ctx) { int token; size_t length, pos = 0; char *text; yyscan_t scanner; // let flex operate on buf as a 0 terminated string // we could give buflen to this function and use yy_scan_bytes or yy_scan_buffer yylex_init(&scanner); yy_scan_string(buf, scanner); while ((token = yylex(scanner)) != CSSEOF) { if (token == IMPORT_SYM) { // e.g. @import "http:example.com/index.html" pos += yyget_leng(scanner); // skip whitespace before URI/STRING while ((token = yylex(scanner)) == S) pos += yyget_leng(scanner); // now token should be STRING or URI if (token == STRING) token = URI; } if (token == URI && callback_uri) { // e.g. url(http:example.com/index.html) text = yyget_text(scanner); length = yyget_leng(scanner); if (*text == '\'' || *text == '\"') { // a string - remove the quotes callback_uri(user_ctx, text + 1, length - 2, pos + 1); } else { // extract URI from url(...) if (!wget_strncasecmp_ascii(text, "url(", 4)) { char *otext = text; // remove trailing ) and any spaces before for (length--; c_isspace(text[length - 1]); length--); // remove leading url( and any spaces after for (length -= 4, text += 4; c_isspace(*text); text++, length--); // remove quotes if (*text == '\'' || *text == '\"') { text++; length -= 2; } callback_uri(user_ctx, text, length, pos + (text - otext)); } } } else if (token == CHARSET_SYM && callback_encoding) { // e.g. 
@charset "UTF-8" pos += yyget_leng(scanner); // skip whitespace before charset name while ((token = yylex(scanner)) == S) pos += yyget_leng(scanner); // now token should be STRING if (token == STRING) { text = yyget_text(scanner); length = yyget_leng(scanner); if (*text == '\'' || *text == '\"') { // a string - remove the quotes callback_encoding(user_ctx, text + 1, length - 2); } else { // a string without quotes callback_encoding(user_ctx, text, length); } } else { error_printf(_("Unknown token after @charset: %d\n"), token); } } pos += yyget_leng(scanner); } yylex_destroy(scanner); }
/* buffer is html-normlike "chunk", if original file is bigger than buffer,
 * we rewind to a space, so we'll know that tokens won't be broken in half at
 * the end of a buffer. All tokens except string-literals of course.
 * So we can assume that after the buffer there is either a space, EOF, or a
 * chunk of text not containing whitespace at all (for which we care only if its
 * a stringliteral)
 *
 * Tokenizes `n` bytes of `buf` with the scanner stored in `state` and appends
 * the resulting tokens to the state's token list via add_token(). While doing
 * so it drives a small per-scope state machine (`fsm_state`, `blocks`,
 * `brackets`) and counts inconsistencies in `state->syntax_errors`.
 *
 * state: parser state; if `state->global` is not set a warning is logged and
 *        nothing is processed.
 * buf:   input bytes handed to yy_scan_bytes().
 * n:     number of bytes in `buf`.
 */
void cli_js_process_buffer(struct parser_state *state, const char *buf, size_t n)
{
	struct scope* current = state->current;
	YYSTYPE val;
	int yv;
	YY_BUFFER_STATE yyb;

	if(!state->global) {
		/* this state has either not been initialized,
		 * or cli_js_parse_done() was already called on it */
		cli_warnmsg(MODULE "invalid state\n");
		return;
	}
	/* NOTE(review): yyb is assigned here but not used again in this
	 * function - confirm whether the buffer is released elsewhere */
	yyb = yy_scan_bytes(buf, n, state->scanner);
	/* start every token from a zeroed value marked as "undefined" */
	memset(&val, 0, sizeof(val));
	val.vtype = vtype_undefined;
	/* on EOF yylex will return 0 */
	while( (yv=yylex(&val, state->scanner)) != 0)
	{
		const char *text;
		size_t leng;

		val.type = yv;
		switch(yv) {
			case TOK_VAR:
				current->fsm_state = InsideVar;
				break;
			case TOK_IDENTIFIER_NAME:
				text = yyget_text(state->scanner);
				leng = yyget_leng(state->scanner);
				if(current->last_token == TOK_DOT) {
					/* this is a member name, don't normalize */
					TOKEN_SET(&val, string, cli_strdup(text));
					val.type = TOK_UNNORM_IDENTIFIER;
				} else {
					switch(current->fsm_state) {
						case WaitParameterList:
							/* an identifier where a parameter list
							 * was awaited is counted as an error,
							 * then handled like a plain use */
							state->syntax_errors++;
							/* fall through */
						case Base:
						case InsideInitializer:
							TOKEN_SET(&val, cstring, scope_use(current, text, leng));
							break;
						case InsideVar:
						case InsideFunctionDecl:
							/* identifier is declared in the current scope;
							 * bracket counting restarts for its initializer */
							TOKEN_SET(&val, cstring, scope_declare(current, text, leng, state));
							current->fsm_state = InsideInitializer;
							current->brackets = 0;
							break;
						case WaitFunctionName:
							TOKEN_SET(&val, cstring, scope_declare(current, text, leng, state));
							current->fsm_state = WaitParameterList;
							break;
					}
				}
				break;
			case TOK_PAR_OPEN:
				switch(current->fsm_state) {
					case WaitFunctionName:
						/* fallthrough */
					case WaitParameterList:
						current->fsm_state = InsideFunctionDecl;
						break;
					default:
						/* noop */
						break;
				}
				break;
			case TOK_PAR_CLOSE:
				switch(current->fsm_state) {
					case WaitFunctionName:
						state->syntax_errors++;
						break;
					case WaitParameterList:
						current->fsm_state = Base;
						break;
					default:
						/* noop */
						break;
				}
				break;
			case TOK_CURLY_BRACE_OPEN:
				/* the cases below deliberately cascade: every state
				 * ends up incrementing `blocks`, and every state
				 * other than Base also counts a syntax error */
				switch(current->fsm_state) {
					case WaitFunctionName:
						/* fallthrough */
					case WaitParameterList:
					case InsideFunctionDecl:
						/* in a syntactically correct
						 * file, we would already be in
						 * the Base state when we see a { */
						current->fsm_state = Base;
						/* fall-through */
					case InsideVar:
					case InsideInitializer:
						state->syntax_errors++;
						/* fall-through */
					case Base:
					default:
						current->blocks++;
						break;
				}
				break;
			case TOK_CURLY_BRACE_CLOSE:
				if(current->blocks > 0)
					current->blocks--;
				else
					state->syntax_errors++;
				if(!current->blocks) {
					if(current->parent) {
						/* add dummy FUNCTION token to
						 * mark function end */
						TOKEN_SET(&val, cstring, "}");
						add_token(state, &val);
						TOKEN_SET(&val, scope, NULL);
						val.type = TOK_FUNCTION;

						/* leave the function: continue in the parent scope */
						state->current = current = current->parent;
					} else{
						/* extra } */
						state->syntax_errors++;
					}
				}
				break;
			case TOK_BRACKET_OPEN:
				current->brackets++;
				break;
			case TOK_BRACKET_CLOSE:
				if(current->brackets > 0)
					current->brackets--;
				else
					state->syntax_errors++;
				break;
			case TOK_COMMA:
				if (current->fsm_state == InsideInitializer && current->brackets == 0 && current->blocks == 0) {
					/* initializer ended only if we
					 * encountered a comma, and [] are
					 * balanced.
					 * This avoids switching state on:
					 * var x = [4,y,u];*/
					current->fsm_state = InsideVar;
				}
				break;
			case TOK_SEMICOLON:
				if (current->brackets == 0 && current->blocks == 0) {
					/* avoid switching state on unbalanced []:
					 * var x = [test;testi]; */
					current->fsm_state = Base;
				}
				break;
			case TOK_FUNCTION:
				/* NOTE(review): the result of scope_new() is dereferenced
				 * without a NULL check - confirm it cannot fail */
				current = scope_new(state);
				current->fsm_state = WaitFunctionName;
				TOKEN_SET(&val, scope, state->current);
				break;
			case TOK_StringLiteral:
				/* fold  "a" + "b"  into a single string literal when the two
				 * most recent stored tokens are a string literal and a plus */
				if(state->tokens.cnt > 1 && state->tokens.data[state->tokens.cnt-1].type == TOK_PLUS) {
					/* see if can fold */
					yystype *prev_string = &state->tokens.data[state->tokens.cnt-2];
					if(prev_string->type == TOK_StringLiteral) {
						char *str = TOKEN_GET(prev_string, string);
						size_t str_len = strlen(str);

						text = yyget_text(state->scanner);
						leng = yyget_leng(state->scanner);

						/* delete TOK_PLUS */
						free_token(&state->tokens.data[--state->tokens.cnt]);

						/* grow the previous literal so the new text fits
						 * behind it, including the terminating 0 */
						str = cli_realloc(str, str_len + leng + 1);
						/* NOTE(review): on allocation failure the TOK_PLUS has
						 * already been removed and prev_string still refers to
						 * the old pointer; the token then continues below the
						 * switch - confirm this path is handled as intended */
						if (!str)
							break;
						strncpy(str+str_len, text, leng);
						str[str_len + leng] = '\0';
						TOKEN_SET(prev_string, string, str);
						/* the current token was merged: release its string,
						 * reset val and fetch the next token without
						 * storing anything */
						free(val.val.string);
						memset(&val, 0, sizeof(val));
						val.vtype = vtype_undefined;
						continue;
					}
				}
				break;
		}
		/* no value was attached to the token by the scanner or the cases above */
		if(val.vtype == vtype_undefined) {
			text = yyget_text(state->scanner);
			TOKEN_SET(&val, string, cli_strdup(text));
			/* NOTE(review): abort() is called unconditionally in this
			 * branch, terminating the process - confirm that this is
			 * meant as an "unreachable" assertion */
			abort();
		}
		add_token(state, &val);
		/* remembered so the next identifier can detect a preceding TOK_DOT */
		current->last_token = yv;
		/* reset val for the next token */
		memset(&val, 0, sizeof(val));
		val.vtype = vtype_undefined;
	}
}
/*
 * Move the in-line position forward by the length of the text
 * the lexer matched last.
 */
void Impl::advance_linepos() {
	const auto matched_length = yyget_leng(this->scanner);
	this->linepos += matched_length;
}