// scan through whitespace static void scan_whitespace(Lexer *l) { char c; for (; (c = lex_current(l)), !lex_eof(l); lex_next(l)) { if (!is_whitespace(c)) { break; } } }
size_t lex_keyword(Lexer *l) { enum { kKwExprFunc }; char c; for (size_t i = 0; (c = lex_current(l)), !lex_eof(l); i++, lex_next(l)) { if (c >= 'a' && c <= 'z') { // keywords are always lowercase alphabetics } else if (c == ' ') { // proper end of a keyword char *s[] = {"var", "func"}; Keyword k[] = {kKeywordVar, kKeywordFunc}; for (size_t j = 0; j < sizeof(k) / sizeof(Keyword); j++) { if (memcmp(&l->input[-i], s[j], i) == 0) { // found keyword scan_whitespace(l); lex_emit(l, (Token){ .type = kTokenTypeKeyword, .keyword = k[j], }); return kKwExprFunc; } }
/*
 * get_lexeme: fetch the next lexeme for the given agent.
 *
 * If aliased lexemes are queued on thisAgent->lex_alias, the head of that
 * list is copied into thisAgent->lexeme, the list is advanced, and the
 * function returns.  Otherwise the lexeme is reset, then whitespace, ';'
 * characters and '#'-to-end-of-line comments are skipped.  Once a
 * significant character (or EOF) is reached, the start position is recorded
 * and either the routine in lexer_routines[] indexed by the current
 * character is called, or lex_eof() when the input is exhausted.
 *
 * When thisAgent->current_file->fake_rparen_at_eol is set, reaching a
 * newline (or the end of a '#' comment) calls do_fake_rparen() and returns
 * immediately instead of continuing to skip.
 */
void get_lexeme (agent* thisAgent) {
  /* AGR 568: queued alias lexemes take priority over the input stream. */
  if (thisAgent->lex_alias) {
    thisAgent->lexeme = thisAgent->lex_alias->lexeme;
    thisAgent->lex_alias = thisAgent->lex_alias->next;
    return;
  }

  /* Start from an empty lexeme. */
  thisAgent->lexeme.length = 0;
  thisAgent->lexeme.string[0] = 0;

  thisAgent->load_errors_quit = FALSE;                   /* AGR 527c */

  /* --- skip whitespace, ';' separators and '#' comments --- */
  for (;;) {
    /* The flag is re-read on every pass, exactly like a loop condition. */
    if (thisAgent->load_errors_quit != FALSE) break;     /* AGR 527c */
    if (thisAgent->current_char == EOF) break;

    if (whitespace[static_cast<unsigned char>(thisAgent->current_char)]) {
      if (thisAgent->current_char == '\n' &&
          thisAgent->current_file->fake_rparen_at_eol) {
        do_fake_rparen(thisAgent);
        return;
      }
      get_next_char(thisAgent);
      continue;
    }

    if (thisAgent->current_char == ';') {
      /* --- skip the semi-colon, forces newline in TCL --- */
      get_next_char(thisAgent);
      continue;
    }

    /* Anything other than a '#' comment starts the actual lexeme. */
    if (thisAgent->current_char != '#') break;

    /* --- read from hash to end-of-line --- */
    /* The current character is '#', so at least one character is consumed. */
    do {
      get_next_char(thisAgent);
    } while ((thisAgent->current_char != '\n') &&
             (thisAgent->current_char != EOF));

    if (thisAgent->current_file->fake_rparen_at_eol) {
      do_fake_rparen(thisAgent);
      return;
    }
    if (thisAgent->current_char != EOF) {
      get_next_char(thisAgent);   /* step past the newline */
    }
  }

  /* --- no more whitespace, so go get the actual lexeme --- */
  record_position_of_start_of_lexeme(thisAgent);

  if (thisAgent->current_char == EOF) {
    lex_eof(thisAgent);
    return;
  }
  lexer_routines[static_cast<unsigned char>(thisAgent->current_char)](thisAgent);
}
static int lex_scan(lex_t *lex, json_error_t *error) { char c; strbuffer_clear(&lex->saved_text); if(lex->token == TOKEN_STRING) { free(lex->value.string); lex->value.string = NULL; } c = lex_get(lex, error); while(c == ' ' || c == '\t' || c == '\n' || c == '\r') { if(c == '\n') lex->line++; c = lex_get(lex, error); } if(c == (char)EOF) { if(lex_eof(lex)) lex->token = TOKEN_EOF; else lex->token = TOKEN_INVALID; goto out; } lex_save(lex, c); if(c == '{' || c == '}' || c == '[' || c == ']' || c == ':' || c == ',') lex->token = c; else if(c == '"') lex_scan_string(lex, error); else if(isdigit(c) || c == '-') { if(lex_scan_number(lex, c, error)) goto out; } else if(isupper(c) || islower(c)) { /* eat up the whole identifier for clearer error messages */ const char *saved_text; c = lex_get_save(lex, error); while(isupper(c) || islower(c)) c = lex_get_save(lex, error); lex_unget_unsave(lex, c); saved_text = strbuffer_value(&lex->saved_text); if(strcmp(saved_text, "true") == 0) lex->token = TOKEN_TRUE; else if(strcmp(saved_text, "false") == 0) lex->token = TOKEN_FALSE; else if(strcmp(saved_text, "null") == 0) lex->token = TOKEN_NULL; else lex->token = TOKEN_INVALID; } else { /* save the rest of the input UTF-8 sequence to get an error message of valid UTF-8 */ lex_save_cached(lex); lex->token = TOKEN_INVALID; } out: return lex->token; }
static void lex_scan_string(lex_t *lex, json_error_t *error) { char c; const char *p; char *t; int i; lex->value.string = NULL; lex->token = TOKEN_INVALID; c = lex_get_save(lex, error); while(c != '"') { if(c == (char)EOF) { lex_unget_unsave(lex, c); if(lex_eof(lex)) error_set(error, lex, "premature end of input"); goto out; } else if((unsigned char)c <= 0x1F) { /* control character */ lex_unget_unsave(lex, c); if(c == '\n') error_set(error, lex, "unexpected newline", c); else error_set(error, lex, "control character 0x%x", c); goto out; } else if(c == '\\') { c = lex_get_save(lex, error); if(c == 'u') { c = lex_get_save(lex, error); for(i = 0; i < 4; i++) { if(!isxdigit(c)) { lex_unget_unsave(lex, c); error_set(error, lex, "invalid escape"); goto out; } c = lex_get_save(lex, error); } } else if(c == '"' || c == '\\' || c == '/' || c == 'b' || c == 'f' || c == 'n' || c == 'r' || c == 't') c = lex_get_save(lex, error); else { lex_unget_unsave(lex, c); error_set(error, lex, "invalid escape"); goto out; } } else c = lex_get_save(lex, error); } /* the actual value is at most of the same length as the source string, because: - shortcut escapes (e.g. 
"\t") (length 2) are converted to 1 byte - a single \uXXXX escape (length 6) is converted to at most 3 bytes - two \uXXXX escapes (length 12) forming an UTF-16 surrogate pair are converted to 4 bytes */ lex->value.string = malloc(lex->saved_text.length + 1); if(!lex->value.string) { /* this is not very nice, since TOKEN_INVALID is returned */ goto out; } /* the target */ t = lex->value.string; /* + 1 to skip the " */ p = strbuffer_value(&lex->saved_text) + 1; while(*p != '"') { if(*p == '\\') { p++; if(*p == 'u') { char buffer[4]; int length; int32_t value; value = decode_unicode_escape(p); p += 5; if(0xD800 <= value && value <= 0xDBFF) { /* surrogate pair */ if(*p == '\\' && *(p + 1) == 'u') { int32_t value2 = decode_unicode_escape(++p); p += 5; if(0xDC00 <= value2 && value2 <= 0xDFFF) { /* valid second surrogate */ value = ((value - 0xD800) << 10) + (value2 - 0xDC00) + 0x10000; } else { /* invalid second surrogate */ error_set(error, lex, "invalid Unicode '\\u%04X\\u%04X'", value, value2); goto out; } } else { /* no second surrogate */ error_set(error, lex, "invalid Unicode '\\u%04X'", value); goto out; } } else if(0xDC00 <= value && value <= 0xDFFF) { error_set(error, lex, "invalid Unicode '\\u%04X'", value); goto out; } else if(value == 0) { error_set(error, lex, "\\u0000 is not allowed"); goto out; } if(utf8_encode(value, buffer, &length)) assert(0); memcpy(t, buffer, length); t += length; } else { switch(*p) { case '"': case '\\': case '/': *t = *p; break; case 'b': *t = '\b'; break; case 'f': *t = '\f'; break; case 'n': *t = '\n'; break; case 'r': *t = '\r'; break; case 't': *t = '\t'; break; default: assert(0); } t++; p++; } } else *(t++) = *(p++); } *t = '\0'; lex->token = TOKEN_STRING; return; out: free(lex->value.string); }
/*
 * get_lexeme: fetch the next lexeme for the current agent.
 *
 * If aliased lexemes are queued on lex_alias, the head of that list is
 * copied into the agent's lexeme, the list is advanced, and the function
 * returns.  Otherwise the lexeme length/string are reset, whitespace and
 * comments are skipped, the start position is recorded, and either the
 * routine in lexer_routines[] indexed by the current character is called,
 * or lex_eof() when the current character is EOF_AS_CHAR.
 *
 * Build-time variants:
 *   - USE_X_DISPLAY undefined: prompt handling is compiled in.  With
 *     USE_TCL defined the prompt condition guards an empty block, so
 *     nothing is printed.
 *   - USE_TCL defined: ';' is skipped as a single character and '#' starts
 *     a comment that runs to end-of-line.
 *   - USE_TCL undefined: ';' starts a comment that runs to end-of-line and
 *     '#' must open a "#|" ... "|#" block comment.
 *
 * When current_file->fake_rparen_at_eol is set, reaching a newline (or the
 * end of a to-end-of-line comment) calls do_fake_rparen() and returns.
 */
void get_lexeme (void) {

  /* AGR 568 begin */
  /* Queued alias lexemes take priority over the input stream. */
  if (current_agent(lex_alias)) {
    current_agent(lexeme) = current_agent(lex_alias)->lexeme;
    current_agent(lex_alias) = current_agent(lex_alias)->next;
    return;
  }
  /* AGR 568 end */

  /* Only length and string are reset here; lexeme.type still holds the
     previous lexeme's type, which the prompt check below reads. */
  current_agent(lexeme).length = 0;
  current_agent(lexeme).string[0] = 0;

#ifndef USE_X_DISPLAY
  /* The if below takes its body from whichever preprocessor branch is
     active: an empty block under USE_TCL, the prompt print otherwise. */
  if (current_agent(lexeme).type==EOF_LEXEME &&
      reading_from_top_level() &&
      current_lexer_parentheses_level()==0 &&  /* AGR 534 */
      current_agent(print_prompt_flag))
#ifdef USE_TCL
  {}
#else
  /* REW: begin 09.15.96 */
    if (current_agent(operand2_mode) == TRUE)
      print ("\nOPERAND %s> ", current_agent(name));
  /* REW: end   09.15.96 */
    else
      print ("\n%s> ", current_agent(name));
#endif /* USE_TCL */
#endif /* USE_X_DISPLAY */

/* AGR 534  The only time a prompt should be printed out is if there's
   a command being expected; i.e. the prompt shouldn't print out if we're
   in the middle of entering a production.  So if we're in the middle of
   entering a production, then the parentheses level will be > 0, so that's
   the criterion we will use.  AGR  5-Apr-94  */

  current_agent(load_errors_quit) = FALSE;  /* AGR 527c */

  /* --- skip whitespace and comments; the flag is re-tested every pass --- */
  while (current_agent(load_errors_quit)==FALSE) {   /* AGR 527c */
    if (current_agent(current_char)==EOF_AS_CHAR) break;
    if (whitespace[(unsigned char)current_agent(current_char)]) {
      if (current_agent(current_char) == '\n')
      {
         if (current_agent(current_file)->fake_rparen_at_eol) {
              do_fake_rparen();
              return;
         }
#ifndef USE_X_DISPLAY
         /* Same preprocessor-selected body as the prompt check above,
            but without the EOF_LEXEME / top-level conditions. */
         if (current_lexer_parentheses_level()==0 &&  /* AGR 534 */
             current_agent(print_prompt_flag))
#ifdef USE_TCL
         {}
#else
         /* REW: begin 09.15.96 */
           if (current_agent(operand2_mode) == TRUE)
             print ("\nOPERAND %s> ", current_agent(name));
         /* REW: end   09.15.96 */
           else
             print ("\n%s> ", current_agent(name));
#endif /* USE_TCL */
#endif /* USE_X_DISPLAY */
      }
      get_next_char();
      continue;
    }

#ifdef USE_TCL
    if (current_agent(current_char)==';') {
      /* --- skip the semi-colon, forces newline in TCL --- */
      get_next_char();  /* consume it */
      continue;
    }
    if (current_agent(current_char)=='#') {
      /* --- read from hash to end-of-line --- */
      while ((current_agent(current_char)!='\n') &&
             (current_agent(current_char)!=EOF_AS_CHAR))
        get_next_char();
      if (current_agent(current_file)->fake_rparen_at_eol) {
        do_fake_rparen();
        return;
      }
      /* step past the newline unless the comment ended at EOF */
      if (current_agent(current_char)!=EOF_AS_CHAR) get_next_char();
      continue;
    }
#else
    if (current_agent(current_char)==';') {
      /* --- read from semicolon to end-of-line --- */
      while ((current_agent(current_char)!='\n') &&
             (current_agent(current_char)!=EOF_AS_CHAR))
        get_next_char();
      if (current_agent(current_file)->fake_rparen_at_eol) {
        do_fake_rparen();
        return;
      }
      /* step past the newline unless the comment ended at EOF */
      if (current_agent(current_char)!=EOF_AS_CHAR) get_next_char();
      continue;
    }
    if (current_agent(current_char)=='#') {
      /* --- comments surrounded by "#|" and "|#" delimiters --- */
      record_position_of_start_of_lexeme(); /* in case of later error mesg. */
      get_next_char();
      if (current_agent(current_char)!='|') {
        /* The character after '#' is not consumed; the skip loop
           re-examines it on the next pass. */
        print ("Error: '#' not followed by '|'\n");
        print_location_of_most_recent_lexeme();
        continue;
      }
      get_next_char();  /* consume the vbar */
      while (TRUE) {
        if (current_agent(current_char)==EOF_AS_CHAR) {
          print ("Error: '#|' without terminating '|#'\n");
          print_location_of_most_recent_lexeme();
          break;
        }
        /* Only a '|' immediately followed by '#' ends the comment. */
        if (current_agent(current_char)!='|') { get_next_char(); continue; }
        get_next_char();
        if (current_agent(current_char)=='#') break;
      }
      /* NOTE(review): this call also runs after the EOF error break above,
         i.e. with current_char already EOF_AS_CHAR -- confirm that
         get_next_char() is safe to call at end of input. */
      get_next_char();  /* consume the closing '#' */
      continue; /* continue the outer skip loop, reading more whitespace */
    }
#endif /* USE_TCL */
    break; /* if no whitespace or comments found, break out of the loop */
  }

  /* --- no more whitespace, so go get the actual lexeme --- */
  record_position_of_start_of_lexeme();
  if (current_agent(current_char)!=EOF_AS_CHAR)
    (*(lexer_routines[(unsigned char)current_agent(current_char)]))();
  else
    lex_eof();
}