/*
 * Scan a double-quoted string literal into ls->buf.
 * On entry ls->current is the opening '"'; on success the closing '"'
 * has been consumed and the collected text (quotes excluded) is
 * duplicated into info->string.  EOF or a raw newline inside the
 * string is reported via lexer_error.
 */
static void read_string(lexer_state *ls, token_info *info) {
    /* consume the opening quote */
    next(ls);
    for (;;) {
        if (ls->current == EOS) {
            lexer_error(ls, "unexpected eof in string");
            return;
        }
        if (ls->current == '\n' || ls->current == '\r') {
            lexer_error(ls, "unexpected new line in string");
            return;
        }
        if (ls->current == '"')
            break;  /* end of literal — do not save the quote */
        save_and_next(ls);
    }
    info->string = strdup(ls->buf->buf);
    /* consume the closing quote */
    next(ls);
}
/*
 * Consume the next special token and require that it equals `end`,
 * the closer matching the `start` token that was opened on `linenum`.
 * Returns the consumed token (even on mismatch, after reporting the
 * error), or NULL if the lexer is already in an error state or no
 * special token is available.
 */
token_t* lexer_expectmatch(lexer_state_t* lex, const char* end, const char* start, int linenum) {
    if ( lex->error )
        return NULL;

    token_t* matched = lexer_nextif(lex, special_token);
    if ( matched == NULL ) {
        lexer_error(lex, "Expected \"%s\" token to match the \"%s\" on line %d.\n",
                    end, start, linenum);
        return matched;
    }

    /* Right kind of token, wrong spelling — still returned to the caller. */
    if ( strcmp((char*) matched->value, end) != 0 ) {
        lexer_error(lex, "Expected a \"%s\" token to match the \"%s\" on line %d. Got a \"%s\"\n",
                    end, start, linenum, (char*) matched->value);
    }
    return matched;
}
/*
 * Consume the next special token and require its text to equal `token`.
 * Returns the consumed token (even on text mismatch, after reporting),
 * or NULL when the lexer is in error or no special token follows.
 */
token_t* lexer_expect_special(lexer_state_t* lex, const char* token) {
    if ( lex->error )
        return NULL;

    token_t* found = lexer_nextif(lex, special_token);
    if ( found == NULL )
        lexer_error(lex, "Expected \"%s\" token.\n", token);
    else if ( strcmp((char*) found->value, token) != 0 )
        lexer_error(lex, "Expected a \"%s\", got a \"%s\"\n", token, (char*) found->value);

    return found;
}
/*
 * Finish parsing: flush the lexer, then synthesize an ENDTAG event for
 * every tag still open on the tag stack (tags left unclosed at EOF).
 *
 * Returns 0 on success, -1 on error (self->last_error is set to
 * TDI_PARSER_ERR_ENV for environment/callback failures; lexer failures
 * are routed through lexer_error()).
 */
int tdi_soup_parser_finalize(tdi_soup_parser *self)
{
    PyObject *data, *name;
    tdi_parser_event event;
    int res;

    if (tdi_soup_lexer_finalize(self->lexer) == -1)
        return lexer_error(self);

    /* Pop every remaining tag, emitting a synthetic end tag with empty
     * data for each one. */
    while (self->tagstack) {
        if (!(data = PyString_FromString(""))) {
            self->last_error = TDI_PARSER_ERR_ENV;
            return -1;
        }
        name = self->tagstack->name;
        Py_INCREF(name); /* keep the name alive across tagstack_pop() */
        tagstack_pop(&self->tagstack);
        event.type = TDI_PARSER_EVENT_ENDTAG;
        event.info.endtag.name = name;
        event.info.endtag.data = data;
        res = self->cb(&event, self->cb_ctx);
        Py_DECREF(name);
        Py_DECREF(data);
        if (res) {
            /* Callback signalled failure — abort finalization. */
            self->last_error = TDI_PARSER_ERR_ENV;
            return -1;
        }
    }
    return 0;
}
/*
 * Scan a numeric literal (digits with at most one embedded '.') into
 * ls->buf.  A letter adjacent to the digits makes the whole run a
 * badly formatted number (the alnum run is consumed before the error
 * is reported).  Stores both the raw text and its strtold() value
 * into *info.
 */
static void read_numeric(lexer_state *ls, token_info *info) {
    int have_dot = 0;
    while (lisdigit(ls->current) || ls->current == '.') {
        if (ls->current == '.') {
            if (have_dot)
                break;          /* a second dot ends the number */
            have_dot = 1;
        }
        save_and_next(ls);
    }
    /* e.g. "12ab" — consume the trailing junk, then complain */
    if (lisalpha(ls->current)) {
        while (lisalnum(ls->current))
            save_and_next(ls);
        lexer_error(ls, "badly formatted number");
    }
    info->number = strtold(ls->buf->buf, NULL);
    info->string = strdup(ls->buf->buf);
}
/*
 * Feed a chunk of input to the underlying lexer.
 * Returns 0 on success; on lexer failure, forwards lexer_error()'s
 * return value.
 */
int tdi_soup_parser_feed(tdi_soup_parser *self, PyObject *food)
{
    const int res = tdi_soup_lexer_feed(self->lexer, food);
    return (res == -1) ? lexer_error(self) : 0;
}
/*
 * Consume the next token, requiring it to be of `tokentype`.
 * Returns the token, or NULL (with an error reported) when the lexer
 * is already in error or the next token has a different type.
 */
token_t* lexer_expect(lexer_state_t* lex, token_types_t tokentype) {
    if ( lex->error )
        return NULL;

    token_t* result = lexer_nextif(lex, tokentype);
    if ( result == NULL )
        lexer_error(lex, "Expected %s token.\n", token_names[tokentype]);
    return result;
}
// a string terminated with NULL is extracted, in str, with iter point to NULL int extract_identifier(code_t *code, int iter, identifier_t *id) { char curr = code->code[iter]; if(isalpha(curr) == false ) return -1; do { id->str[id->iter] = curr; id->iter++; printf("%c, %d\n", curr, id->iter); // [error] exceed max_id_length if(id->iter == max_id_length - 1) { lexer_error("exceed max_id_length"); return -1; } // curr = code->code[++iter]; if(isalnum(curr) == false) { id->str[id->iter] = '\0'; return id->iter; } } while(true); }
// Parse a quoted string literal starting at input[i] (the opening quote,
// either ' or ").  Recognized escapes: \r, \n, and \HH (two hex digits);
// any other escape is an error.  On success the decoded string is pushed
// onto output.lexemes and i points just past the closing quote.
// NOTE(review): a literal backslash cannot be escaped (\\ falls into the
// "Invalid escape sequence" branch) — confirm this is intentional.
// Change vs. previous revision: removed the unused local `start`.
void lexer::parse_string(line_of_code & output)
{
	std::string string;
	char string_character = input[i]; // opening quote character
	i++;
	for(; i < end; i++)
	{
		char byte = input[i];
		switch(byte)
		{
			case '\\':
			{
				if(end - i < 2)
					lexer_error("Backslash at the end of the input");
				i++;
				char next_byte = input[i];
				switch(next_byte)
				{
					// `continue` resumes the outer for loop (advancing i)
					case 'r':
						string.push_back('\r');
						continue;
					case 'n':
						string.push_back('\n');
						continue;
				}
				if(ail::is_hex_digit(next_byte))
				{
					// \HH — exactly two hex digits required
					if(end - i < 2)
						lexer_error("Incomplete hex number escape sequence at the end of the input");
					if(!ail::is_hex_digit(input[i + 1]))
						lexer_error("Invalid hex number escape sequence");
					std::string hex_string = input.substr(i, 2);
					i++; // second digit consumed by the loop's i++
					char new_byte = ail::string_to_number<char>(hex_string, std::ios_base::hex);
					string.push_back(new_byte);
				}
				else
					lexer_error("Invalid escape sequence: " + ail::hex_string_8(static_cast<uchar>(next_byte)));
				break;
			}
			case '\n':
				lexer_error("Detected a newline in a string");
				break;
			case '\'':
			case '"':
				// Only the same quote kind that opened the string closes it;
				// the other kind is an ordinary character.
				if(byte == string_character)
				{
					output.lexemes.push_back(lexeme(string));
					i++;
					return;
				}
				string.push_back(byte);
				break;
			default:
				string.push_back(byte);
				break;
		}
	}
	lexer_error("String lacks terminator");
}
/*
 * Validate syntax.
 *
 * Checks the token class just scanned (data->curr) against the lexer
 * context (data->seen = last significant token, data->prev = previous
 * token, data->cls = classification of the current lexeme) and the
 * inifile options.  Returns `data` when the sequence is legal, or the
 * result of lexer_error() (which reports the problem) otherwise.
 *
 * Fixes vs. previous revision: corrected error-message typos
 * ("is not allowed" -> "are not allowed", "inide" -> "inside",
 * "misstype" -> "mistype").
 */
token_data * lexer_check(struct inifile *inf, token_data *data)
{
	switch(data->curr) {
	case QUOTE:
		if((inf->options & INIFILE_ALLOW_QUOTE) == 0) {
			return lexer_error(inf, data, "quoted strings are not allowed");
		}
		if(data->seen == QUOTE) {
			/* start/end quote characters must agree */
			if(data->quote.sch != data->quote.ech) {
				return lexer_error(inf, data, "unbalanced quoted string");
			}
		}
		break;
	case WHITESP:
	case CDATA:
	case NONE:
		break;
	case BSECT:
		/* '[' is only legal at the start of a line/entry */
		if(data->seen != NONE) {
			return lexer_error(inf, data, "begin section inside section");
		}
		break;
	case ESECT:
		if(data->seen != BSECT) {
			return lexer_error(inf, data, "end section without begin section");
		}
		break;
	case ASSIGN:
		if(data->seen != NONE) {
			if(data->seen == BSECT) {
				return lexer_error(inf, data, "assignment inside section");
			} else if(data->seen == ASSIGN) {
				if(data->cls == VALUE && inf->options & INIFILE_ASSIGN_INSIDE) {
					/*
					 * Allow assignment inside value.
					 */
					return data;
				}
				return lexer_error(inf, data, "dual assignment detected (mistype?)");
			} else if(data->seen == ESECT) {
				return lexer_error(inf, data, "assignment to section");
			} else {
				return lexer_error(inf, data, "assignment without keyword");
			}
		}
		if(data->cls != KEYWORD) {
			return lexer_error(inf, data, "assignment to non-keyword");
		}
		break;
	case EOSTR:
		/* end of string: only legal after a complete construct */
		if(data->seen != ASSIGN && data->seen != ESECT &&
		   data->seen != NONE && data->seen != QUOTE) {
			if(data->seen == BSECT) {
				return lexer_error(inf, data, "end of string while looking for matching end of section");
			} else {
				return lexer_error(inf, data, "unexpected end of string");
			}
		}
		if(data->seen == QUOTE) {
			if(!data->quote.ech) {
				/* no closing quote yet — only OK while continuing a
				 * multiline value */
				if(data->prev != MLINE) {
					return lexer_error(inf, data, "unterminated quote");
				}
			}
			if(data->quote.num % 2) {
				return lexer_error(inf, data, "unbalanced number of quotes");
			}
		}
		break;
	case COMMENT:
		if(data->seen == ASSIGN || data->seen == BSECT) {
			if(data->cls != VALUE) {
				return lexer_error(inf, data, "assignment or section without value");
			}
		}
		break;
	}

	switch(data->prev) {
	case MLINE:
		/* continuation line — must be explicitly enabled, and only
		 * values (not keywords) may span lines */
		if(data->curr == EOSTR || data->curr == WHITESP) {
			if((inf->options & INIFILE_ALLOW_MULTILINE) == 0) {
				return lexer_error(inf, data, "multiline value");
			}
			if(data->cls == KEYWORD) {
				return lexer_error(inf, data, "multiline keyword");
			}
		}
		break;
	}
	return data;
}
/*
 * Translate a lexer STARTTAG event into parser events.
 *
 * Steps: flush a pending empty element, resolve the effective tag name,
 * auto-close non-nestable open tags, optionally switch the lexer to
 * CDATA mode, forward the start tag to the callback, and push the tag
 * onto the stack unless it is self-closed.
 *
 * Returns 0 on success, -1 on failure (self->last_error set unless a
 * callee already set it).
 */
static int handle_starttag(tdi_soup_parser *self, tdi_parser_event *event,
                           tdi_lexer_event *event_)
{
    PyObject *name, *normname, *tmp, *data;
    int res;

    if (self->inempty && close_empty(self) == -1)
        return -1;

    /* sanitize: an empty tag name with no attributes means "reopen the
     * last opened tag"; otherwise remember this name as the last one. */
    if (PyString_GET_SIZE(event_->info.starttag.name) == 0
        && PyList_GET_SIZE(event_->info.starttag.attr) == 0) {
        name = (Py_INCREF(self->lastopen), self->lastopen);
    }
    else {
        name = event_->info.starttag.name;
        Py_INCREF(name);
        Py_CLEAR(self->lastopen);
        self->lastopen = (Py_INCREF(name), name);
    }
    if (!(normname = self->normalize(self->normalize_ctx, name)))
        goto error;

    /* close unnestables: pop (and emit ENDTAG for) every open tag that
     * cannot contain the new one, stopping at the first that can. */
    while (self->tagstack) {
        res = self->nestable(self->nestable_ctx, self->tagstack->normname,
                             normname);
        if (res == -1)
            goto error_normname;
        if (res)
            break;

        event->type = TDI_PARSER_EVENT_ENDTAG;
        if (!(data = PyString_FromString("")))
            goto error_normname;
        tmp = self->tagstack->name;
        event->info.endtag.name = (Py_INCREF(tmp), tmp);
        event->info.endtag.data = data;
        tagstack_pop(&self->tagstack);
        res = !self->cb(event, self->cb_ctx) ? 0 : -1;
        Py_DECREF(tmp);
        Py_DECREF(data);
        if (res == -1)
            goto error_normname;
    }

    /* CDATA: for open (non-self-closed) tags whose content is character
     * data, flip the lexer into CDATA state. */
    if (!event_->info.starttag.closed) {
        if ((res = self->cdata(self->cdata_ctx, normname)) == -1)
            goto error_normname;
        if (res) {
            res = tdi_soup_lexer_state_cdata(self->lexer, self->normalize,
                                             self->normalize_ctx, normname);
            if (res == -1) {
                lexer_error(self);
                goto error_normname;
            }
        }
    }

    /* pass event */
    event->type = TDI_PARSER_EVENT_STARTTAG;
    event->info.starttag.name = name;
    event->info.starttag.attr = event_->info.starttag.attr;
    event->info.starttag.closed = event_->info.starttag.closed;
    event->info.starttag.data = event_->info.starttag.data;
    if (self->cb(event, self->cb_ctx))
        goto error_normname;

    /* Maintain stack: only open tags are pushed; empty elements set the
     * inempty flag so the next event closes them. */
    if (!event_->info.starttag.closed) {
        if (tagstack_push(&self->tagstack, normname, name) == -1)
            goto error_normname;
        if ((res = self->empty(self->empty_ctx, normname)) == -1)
            goto error_normname;
        if (res)
            self->inempty = 1;
    }

    /* cleanup & finish */
    Py_DECREF(normname);
    Py_DECREF(name);
    return 0;

error_normname:
    Py_DECREF(normname);
error:
    Py_DECREF(name);
    if (!self->last_error)
        self->last_error = TDI_PARSER_ERR_ENV;
    return -1;
}
/*
 * Read the next token from lex->source.
 *
 * Skips whitespace (tracking line/column), then dispatches on the first
 * character: digits -> int_token/double_token, identifier characters ->
 * name_token (or special_token if it matches lex->specialTokens),
 * quotes -> string_token (with backslash escapes), otherwise a longest
 * literal match against specialTokens.  Returns a freshly allocated
 * token, or NULL on error (after calling lexer_error).
 *
 * NOTE(review): the initial malloc() calls in each branch are not
 * checked for NULL, and the whitespace loop reads source[sourceIndex]
 * before checking the bound — presumably the buffer is NUL-terminated;
 * confirm.
 */
token_t* _lexer_read_token(lexer_state_t* lex) {
    /* --- skip whitespace, maintaining line/column counters --- */
    while ( (lex->source[lex->sourceIndex] == ' '
             || lex->source[lex->sourceIndex] == '\t'
             || lex->source[lex->sourceIndex] == '\n')
            && lex->sourceIndex < lex->_sourceLen ) {
        if ( lex->source[lex->sourceIndex] == '\n' ) {
            lex->lineNumber++;
            lex->lineIndex = 0;
        } else {
            lex->lineIndex++;
        }
        lex->sourceIndex++;
    }
    if ( lex->sourceIndex >= lex->_sourceLen ) {
        return token_init(eof_token, lex->lineNumber, lex->lineIndex, NULL);
    }

    char current = lex->source[lex->sourceIndex];

    if ( current >= '0' && current <= '9' ) {
        /* --- numeric literal: integer part, then optional fraction --- */
        unsigned int currAlloc = MALLOC_CHUNK;
        unsigned int index = 0;
        char* str = (char*) malloc(currAlloc * sizeof(char));
        /* accumulate digits, growing the buffer in MALLOC_CHUNK steps */
        while ( (current >= '0' && current <= '9')
                && (lex->sourceIndex + index) < lex->_sourceLen ) {
            if ( index >= (currAlloc - 1) ) {
                currAlloc += MALLOC_CHUNK;
                char* temp = (char*) realloc(str, currAlloc);
                if ( !temp ) {
                    lex->sourceIndex += index;
                    lex->lineIndex += index;
                    lexer_error(lex, "Ran out of memory.\n");
                    free(str);
                    return NULL;
                }
                str = temp;
            }
            str[index] = current;
            str[index + 1] = 0;       /* keep str NUL-terminated */
            current = lex->source[lex->sourceIndex + (++index)];
        }
        if ( current == '.') {
            /* fractional part -> double_token */
            currAlloc += MALLOC_CHUNK;
            char* temp = (char*) realloc(str, currAlloc);
            if ( !temp ) {
                lex->sourceIndex += index;
                lex->lineIndex += index;
                lexer_error(lex, "Ran out of memory.\n");
                free(str);
                return NULL;
            }
            str = temp;
            str[index] = '.';
            str[index + 1] = 0;
            current = lex->source[lex->sourceIndex + (++index)];
            /* digits after the decimal point */
            while ( (current >= '0' && current <= '9')
                    && (lex->sourceIndex + index) < lex->_sourceLen ) {
                if ( index >= (currAlloc - 1) ) {
                    currAlloc += MALLOC_CHUNK;
                    char* temp = (char*) realloc(str, currAlloc);
                    if ( !temp ) {
                        lex->sourceIndex += index;
                        lex->lineIndex += index;
                        lexer_error(lex, "Ran out of memory.\n");
                        free(str);
                        return NULL;
                    }
                    str = temp;
                }
                str[index] = current;
                str[index + 1] = 0;
                current = lex->source[lex->sourceIndex + (++index)];
            }
            lex->sourceIndex += index;
            lex->lineIndex += index;
            /* token payload is a heap-allocated double */
            double* v = (double*) malloc(sizeof(double));
            *v = atof(str);
            token_t* token = token_init(double_token, lex->lineNumber, lex->lineIndex, (void*) v);
            free(str);
            return token;
        } else {
            lex->sourceIndex += index;
            lex->lineIndex += index;
            /* token payload is a heap-allocated int */
            int* v = (int*) malloc(sizeof(int));
            *v = atol(str);
            token_t* token = token_init(int_token, lex->lineNumber, lex->lineIndex, (void*) v);
            free(str);
            return token;
        }
    } else if ( (current >= 'a' && current <= 'z')
                || (current >= 'A' && current <= 'Z')
                || (current >= '0' && current <= '9')
                || (current == '_')) {
        /* --- identifier / keyword ---
         * (the digit test here is unreachable: digits were handled above) */
        unsigned int currAlloc = MALLOC_CHUNK;
        char* str = (char*) malloc(currAlloc * sizeof(char));
        str[0] = current;
        str[1] = 0;
        unsigned int index = 1;
        current = lex->source[lex->sourceIndex + index];
        while ( ((current >= 'a' && current <= 'z')
                 || (current >= 'A' && current <= 'Z')
                 || (current >= '0' && current <= '9')
                 || (current == '_'))
                && (lex->sourceIndex + index) < lex->_sourceLen ) {
            if ( index >= (currAlloc - 1) ) {
                currAlloc += MALLOC_CHUNK;
                char* temp = (char*) realloc(str, currAlloc);
                if ( !temp ) {
                    lex->sourceIndex += index;
                    lex->lineIndex += index;
                    lexer_error(lex, "Ran out of memory.\n");
                    free(str);
                    return NULL;
                }
                str = temp;
            }
            str[index] = current;
            str[index + 1] = 0;
            current = lex->source[lex->sourceIndex + (++index)];
        }
        lex->sourceIndex += index;
        lex->lineIndex += index;
        /* shrink the buffer to the exact size; temp becomes the payload */
        char* temp = (char*) realloc(str, strlen(str) + 1);
        if ( !temp ) {
            lexer_error(lex, "Ran out of memory.\n");
            free(str);
            return NULL;
        }
        /* keywords: identifiers that exactly match a special token */
        bool special = false;
        for ( int i = 0; i < lex->specialTokenLength; i++ ) {
            if ( strcmp(temp, lex->specialTokens[i]) == 0 ) {
                special = true;
                break;
            }
        }
        token_t* token = token_init((special ? special_token : name_token),
                                    lex->lineNumber, lex->lineIndex, (void*) temp);
        return token;
    } else if ( current == '\"' || current == '\'' ) {
        /* --- string literal, terminated by the same quote kind ---
         * NOTE(review): the first character after the quote is stored
         * unconditionally (str[0] = current), so an empty literal ""
         * appears to capture the closing quote — verify against callers. */
        char initial = current;
        lex->sourceIndex += 1;
        lex->lineIndex += 1;
        current = lex->source[lex->sourceIndex];
        unsigned int currAlloc = MALLOC_CHUNK;
        char* str = (char*) malloc(currAlloc * sizeof(char));
        str[0] = current;
        str[1] = 0;
        unsigned int index = 1;
        current = lex->source[lex->sourceIndex + index];
        /* stop at an unescaped closing quote */
        while ( !(current == initial
                  && lex->source[lex->sourceIndex + index - 1] != '\\')
                && (lex->sourceIndex + index) <= lex->_sourceLen ) {
            if ( index >= (currAlloc - 1) ) {
                currAlloc += MALLOC_CHUNK;
                char* temp = (char*) realloc(str, currAlloc);
                if ( !temp ) {
                    lex->sourceIndex += index;
                    lex->lineIndex += index;
                    lexer_error(lex, "Ran out of memory.\n");
                    free(str);
                    return NULL;
                }
                str = temp;
            }
            /* a lone backslash is skipped (escape introducer); a doubled
             * backslash falls through and is stored literally */
            if ( lex->source[lex->sourceIndex + index] == '\\' ) {
                if ( lex->source[lex->sourceIndex + index - 1] != '\\' ) {
                    current = lex->source[lex->sourceIndex + (++index)];
                    continue;
                }
            }
            int t = strlen(str);
            str[t] = current;
            str[t + 1] = 0;
            current = lex->source[lex->sourceIndex + (++index)];
        }
        lex->sourceIndex += index;
        lex->lineIndex += index;
        char* temp = (char*) realloc(str, strlen(str) + 1);
        if ( !temp ) {
            lexer_error(lex, "Ran out of memory.\n");
            free(str);
            return NULL;
        }
        token_t* token = token_init(string_token, lex->lineNumber, lex->lineIndex, (void*) temp);
        lex->sourceIndex++;   /* step past the closing quote */
        return token;
    } else {
        /* --- operator / punctuation: literal match against the
         * specialTokens table --- */
        for ( int i = 0; i < lex->specialTokenLength; i++ ) {
            bool good = true;
            if ( lex->specialTokens[i] != NULL && lex->specialTokens[i][0] == current ) {
                for ( int j = 0; j < strlen(lex->specialTokens[i]); j++ ) {
                    if ( lex->specialTokens[i][j] != lex->source[j + lex->sourceIndex] ) {
                        good = false;
                        break;
                    }
                }
            } else {
                good = false;
            }
            if ( good ) {
                /* the token payload is a private copy of the matched text */
                char* copy = (char*) malloc((strlen(lex->specialTokens[i]) + 1) * sizeof(char));;
                strcpy(copy, lex->specialTokens[i]);
                lex->sourceIndex += strlen(copy);
                lex->lineIndex += strlen(copy);
                token_t* tok = token_init(special_token, lex->lineNumber, lex->lineIndex, (void*) copy);
                return tok;
            }
        }
        lexer_error(lex, "Unexpected token %c\n", current);
    }
    return NULL;
}
/*
 * Main tokenizer loop: skip whitespace/comments and return the next
 * token code (a character literal for single-character tokens, or a
 * TK_* constant for multi-character ones; token payload goes in *info).
 *
 * Errors raised deeper in the lexer longjmp back to the setjmp here and
 * yield TK_ERROR.
 */
static int lex(lexer_state *ls, token_info *info) {
    buffer_reset(ls->buf);
    /* recovery point for lexer_error() */
    if(setjmp(ls->error.buf)) return TK_ERROR;
    for(;;) {
        switch(ls->current) {
            case '\n': case '\r': { // newline
                inc_line(ls);
                break;
            }
            case ' ': case '\t': { // whitespace
                next(ls);
                break;
            }
            case '-': { // comment or minus
                // minus
                if(next(ls) != '-') return '-';
                // comment, skip line ("--" starts a line comment)
                while(next(ls) != EOS && !isnewline(ls));
                break;
            }
            case '=': { // EQ
                next(ls);
                return '=';
            }
            case '<': { // LT, LTE, ASSIGN ("<=" / "<-" / "<")
                next(ls);
                if(ls->current == '=') { next(ls); return TK_LTE; }
                else if(ls->current == '-'){ next(ls); return TK_ASSIGN; }
                else return '<';
            }
            case '>': { // GT, GTE
                next(ls);
                if(ls->current == '=') { next(ls); return TK_GTE; }
                else return '>';
            }
            case '/': { // NEQ, DIV ("/=" is not-equal)
                next(ls);
                if(ls->current == '=') { next(ls); return TK_NEQ; }
                else return '/';
            }
            case '"': { // STRING
                read_string(ls, info);
                return TK_STRING;
            }
            case EOS: { // EOS
                return TK_EOS;
            }
            default: {
                if(lisdigit(ls->current)) { // NUMERIC
                    read_numeric(ls, info);
                    return TK_REAL;
                }
                if(lisalpha(ls->current)) { // ID or RESERVED
                    return read_id_or_reserved(ls, info);
                }
                int c = ls->current;
                // valid operators, single character tokens, etc.
                // (note: '-', '/', '<', '>', '=' below are unreachable —
                // they are consumed by the dedicated cases above)
                switch(ls->current) {
                    case '+': case '-': case '*': case '/': case '!':
                    case '>': case '<': case '=': case '(': case ')':
                    case '[': case ']': case '{': case '}': case ':':
                    case '.': case ',':
                        next(ls);
                        return c;
                    default:
                        lexer_error(ls, "unrecognized symbol %c", c);
                        next(ls);
                }
            }
        }
    }
}
// Parse a numeric literal at input[i]: hexadecimal ("0x...") as an
// unsigned integer, otherwise decimal digits with at most one dot
// (floating point) or none (signed integer).  Pushes the lexeme onto
// output.lexemes and returns true, or returns false when input[i] is
// not a digit (i is left unchanged in that case).
//
// Fixes vs. previous revision:
//  - hex prefix check read input[i + 1] instead of input[i], so "0x.."
//    was never recognized as hex;
//  - hex digits were extracted with substr(hex_start, i - end) — a
//    negative count wrapped to a huge size_t, grabbing the rest of the
//    line instead of just the digits;
//  - the outer guard is now `> 0` so a bare "0x" reaches the intended
//    "Incomplete hex number at the end of the input" diagnostic.
bool lexer::parse_number(line_of_code & output)
{
	std::size_t start = i;
	char byte = input[i];
	if(!ail::is_digit(byte))
		return false;
	i++;
	if(byte == '0')
	{
		std::size_t remaining_bytes = end - i;
		if(remaining_bytes > 0)
		{
			char next_byte = input[i];
			if(next_byte == 'x')
			{
				i++; // skip the 'x'
				remaining_bytes = end - i;
				if(remaining_bytes == 0)
					number_parsing_error("Incomplete hex number at the end of the input");
				std::size_t hex_start = i;
				for(; i < end && ail::is_hex_digit(input[i]); i++);
				std::size_t hex_length = i - hex_start;
				if(hex_length == 0)
					lexer_error("Incomplete hex number");
				std::string hex_string = input.substr(hex_start, hex_length);
				types::unsigned_integer value = ail::string_to_number<types::unsigned_integer>(hex_string, std::ios_base::hex);
				output.lexemes.push_back(lexeme(value));
				return true;
			}
		}
	}
	// decimal: digits with at most one dot, not ending in a dot
	char const dot = '.';
	bool got_dot = false;
	char last_byte = byte;
	for(; i < end; i++)
	{
		byte = input[i];
		if(byte == dot)
		{
			if(got_dot)
				number_parsing_error("Encountered a floating point value containing multiple dots");
			else
				got_dot = true;
		}
		else if(!ail::is_digit(byte))
			break;
		last_byte = byte;
	}
	if(last_byte == dot)
		number_parsing_error("Encountered a floating point value ending with a dot");
	std::string number_string = input.substr(start, i - start);
	lexeme current_lexeme;
	if(got_dot)
		current_lexeme = lexeme(ail::string_to_number<types::floating_point_value>(number_string));
	else
		current_lexeme = lexeme(ail::string_to_number<types::signed_integer>(number_string));
	output.lexemes.push_back(current_lexeme);
	return true;
}