Esempio n. 1
0
File: lex.c Progetto: cptaffe/sbc
// Advance the lexer past a run of whitespace characters.
// Stops at the first character for which is_whitespace() is false (that
// character is left unconsumed) or as soon as lex_eof() reports true.
static void scan_whitespace(Lexer *l) {
  for (;;) {
    char ch = lex_current(l);
    if (lex_eof(l) || !is_whitespace(ch)) {
      return;
    }
    lex_next(l);
  }
}
Esempio n. 2
0
File: lex.c Progetto: cptaffe/sbc
size_t lex_keyword(Lexer *l) {
  enum { kKwExprFunc };
  char c;
  for (size_t i = 0; (c = lex_current(l)), !lex_eof(l); i++, lex_next(l)) {
    if (c >= 'a' && c <= 'z') {
      // keywords are always lowercase alphabetics
    } else if (c == ' ') {
      // proper end of a keyword
      char *s[] = {"var", "func"};
      Keyword k[] = {kKeywordVar, kKeywordFunc};
      for (size_t j = 0; j < sizeof(k) / sizeof(Keyword); j++) {
        if (memcmp(&l->input[-i], s[j], i) == 0) {
          // found keyword
          scan_whitespace(l);
          lex_emit(l, (Token){
                          .type = kTokenTypeKeyword, .keyword = k[j],
                      });
          return kKwExprFunc;
        }
      }
Esempio n. 3
0
/* Advance the lexer to the next lexeme.
 *
 * If an alias expansion is pending (thisAgent->lex_alias), the aliased
 * lexeme is copied into thisAgent->lexeme, the alias list is advanced and
 * the function returns.  Otherwise the lexeme buffer is emptied, whitespace,
 * ';' characters and '#'-to-end-of-line comments are skipped, and the routine
 * stored in lexer_routines[] for the first remaining character is invoked
 * (lex_eof() when current_char is EOF).
 *
 * When the current file has fake_rparen_at_eol set, reaching a newline -- or
 * the end of a '#' comment -- calls do_fake_rparen() and returns immediately.
 */
void get_lexeme (agent* thisAgent) {

  /* AGR 568: a pending alias supplies the lexeme directly */
  if (thisAgent->lex_alias) {
    thisAgent->lexeme = thisAgent->lex_alias->lexeme;
    thisAgent->lex_alias = thisAgent->lex_alias->next;
    return;
  }

  /* start from an empty lexeme */
  thisAgent->lexeme.length = 0;
  thisAgent->lexeme.string[0] = 0;

  thisAgent->load_errors_quit = FALSE;  /* AGR 527c */

  /* --- skip whitespace and comments; the loop also ends as soon as
         load_errors_quit is no longer FALSE (AGR 527c) --- */
  while (thisAgent->load_errors_quit == FALSE &&
         thisAgent->current_char != EOF) {

    if (whitespace[static_cast<unsigned char>(thisAgent->current_char)]) {
      if (thisAgent->current_char == '\n' &&
          thisAgent->current_file->fake_rparen_at_eol) {
        do_fake_rparen(thisAgent);
        return;
      }
      get_next_char(thisAgent);
    }
    else if (thisAgent->current_char == ';') {
      /* --- a semi-colon is simply consumed --- */
      get_next_char(thisAgent);
    }
    else if (thisAgent->current_char == '#') {
      /* --- comment: read from hash to end-of-line (or end of input) --- */
      while (!(thisAgent->current_char == '\n' ||
               thisAgent->current_char == EOF)) {
        get_next_char(thisAgent);
      }
      if (thisAgent->current_file->fake_rparen_at_eol) {
        do_fake_rparen(thisAgent);
        return;
      }
      /* consume the newline that ended the comment, if there was one */
      if (thisAgent->current_char != EOF) {
        get_next_char(thisAgent);
      }
    }
    else {
      /* neither whitespace nor a comment: a real lexeme starts here */
      break;
    }
  }

  /* --- no more whitespace, so go get the actual lexeme --- */
  record_position_of_start_of_lexeme(thisAgent);
  if (thisAgent->current_char == EOF) {
    lex_eof(thisAgent);
    return;
  }
  (*(lexer_routines[static_cast<unsigned char>(thisAgent->current_char)]))(thisAgent);
}
Esempio n. 4
0
/* Scan the next token from the input.
 *
 * Clears the saved token text, releases the string value left over from a
 * previous TOKEN_STRING, skips spaces, tabs, newlines and carriage returns
 * (incrementing lex->line on every '\n'), and then classifies the token by
 * its first character.  The resulting token type is stored in lex->token
 * and is also the return value.
 *
 * Character classes are tested with explicit ASCII ranges instead of the
 * <ctype.h> functions: `c` is a plain char, so bytes >= 0x80 can be
 * negative, and passing a negative value other than EOF to isdigit(),
 * isupper() or islower() is undefined behaviour.  The range checks are also
 * independent of the current locale.
 */
static int lex_scan(lex_t *lex, json_error_t *error)
{
    char c;

    strbuffer_clear(&lex->saved_text);

    if(lex->token == TOKEN_STRING) {
        free(lex->value.string);
        lex->value.string = NULL;
    }

    c = lex_get(lex, error);
    while(c == ' ' || c == '\t' || c == '\n' || c == '\r')
    {
        if(c == '\n')
            lex->line++;

        c = lex_get(lex, error);
    }

    if(c == (char)EOF) {
        /* lex_get() returned its EOF marker: a real end of input when
           lex_eof() agrees, otherwise the token is invalid */
        if(lex_eof(lex))
            lex->token = TOKEN_EOF;
        else
            lex->token = TOKEN_INVALID;
        goto out;
    }

    lex_save(lex, c);

    if(c == '{' || c == '}' || c == '[' || c == ']' || c == ':' || c == ',')
        /* single-character tokens use the character itself as token type */
        lex->token = c;

    else if(c == '"')
        lex_scan_string(lex, error);

    else if((c >= '0' && c <= '9') || c == '-') {
        if(lex_scan_number(lex, c, error))
            goto out;
    }

    else if((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) {
        /* eat up the whole identifier for clearer error messages */
        const char *saved_text;

        c = lex_get_save(lex, error);
        while((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))
            c = lex_get_save(lex, error);
        /* the character that ended the identifier is not part of it */
        lex_unget_unsave(lex, c);

        saved_text = strbuffer_value(&lex->saved_text);

        if(strcmp(saved_text, "true") == 0)
            lex->token = TOKEN_TRUE;
        else if(strcmp(saved_text, "false") == 0)
            lex->token = TOKEN_FALSE;
        else if(strcmp(saved_text, "null") == 0)
            lex->token = TOKEN_NULL;
        else
            lex->token = TOKEN_INVALID;
    }

    else {
        /* save the rest of the input UTF-8 sequence to get an error
           message of valid UTF-8 */
        lex_save_cached(lex);
        lex->token = TOKEN_INVALID;
    }

out:
    return lex->token;
}
Esempio n. 5
0
/* Scan a JSON string literal and decode it into lex->value.string.
 *
 * Works in two passes.  The first pass reads characters up to the closing
 * '"' while saving them into lex->saved_text, rejecting end of input,
 * control characters (<= 0x1F) and malformed escapes.  The second pass
 * walks the saved text (which is expected to start with the opening quote,
 * hence the "+ 1" below) and writes the decoded bytes into a freshly
 * malloc()ed buffer.
 *
 * On success lex->token is TOKEN_STRING and lex->value.string holds the
 * NUL-terminated decoded string.  On any failure lex->token remains
 * TOKEN_INVALID and lex->value.string is freed and reset to NULL, so no
 * dangling pointer is left behind.
 */
static void lex_scan_string(lex_t *lex, json_error_t *error)
{
    char c;
    const char *p;
    char *t;
    int i;

    lex->value.string = NULL;
    lex->token = TOKEN_INVALID;

    c = lex_get_save(lex, error);

    /* pass 1: validate and save the raw text up to the closing quote */
    while(c != '"') {
        if(c == (char)EOF) {
            lex_unget_unsave(lex, c);
            if(lex_eof(lex))
                error_set(error, lex, "premature end of input");
            goto out;
        }

        else if((unsigned char)c <= 0x1F) {
            /* control character */
            lex_unget_unsave(lex, c);
            if(c == '\n')
                error_set(error, lex, "unexpected newline");
            else
                error_set(error, lex, "control character 0x%x", c);
            goto out;
        }

        else if(c == '\\') {
            c = lex_get_save(lex, error);
            if(c == 'u') {
                /* \u must be followed by exactly four hex digits */
                c = lex_get_save(lex, error);
                for(i = 0; i < 4; i++) {
                    /* cast: c is a plain char and may be negative, which
                       is not a valid argument for isxdigit() */
                    if(!isxdigit((unsigned char)c)) {
                        lex_unget_unsave(lex, c);
                        error_set(error, lex, "invalid escape");
                        goto out;
                    }
                    c = lex_get_save(lex, error);
                }
            }
            else if(c == '"' || c == '\\' || c == '/' || c == 'b' ||
                    c == 'f' || c == 'n' || c == 'r' || c == 't')
                c = lex_get_save(lex, error);
            else {
                lex_unget_unsave(lex, c);
                error_set(error, lex, "invalid escape");
                goto out;
            }
        }
        else
            c = lex_get_save(lex, error);
    }

    /* the actual value is at most of the same length as the source
       string, because:
         - shortcut escapes (e.g. "\t") (length 2) are converted to 1 byte
         - a single \uXXXX escape (length 6) is converted to at most 3 bytes
         - two \uXXXX escapes (length 12) forming an UTF-16 surrogate pair
           are converted to 4 bytes
    */
    lex->value.string = malloc(lex->saved_text.length + 1);
    if(!lex->value.string) {
        /* this is not very nice, since TOKEN_INVALID is returned */
        goto out;
    }

    /* the target */
    t = lex->value.string;

    /* + 1 to skip the " */
    p = strbuffer_value(&lex->saved_text) + 1;

    /* pass 2: decode escapes from the saved text into the target buffer */
    while(*p != '"') {
        if(*p == '\\') {
            p++;
            if(*p == 'u') {
                char buffer[4];
                int length;
                int32_t value;

                /* p points at the 'u'; step over it and the 4 hex digits */
                value = decode_unicode_escape(p);
                p += 5;

                if(0xD800 <= value && value <= 0xDBFF) {
                    /* surrogate pair */
                    if(*p == '\\' && *(p + 1) == 'u') {
                        int32_t value2 = decode_unicode_escape(++p);
                        p += 5;

                        if(0xDC00 <= value2 && value2 <= 0xDFFF) {
                            /* valid second surrogate */
                            value =
                                ((value - 0xD800) << 10) +
                                (value2 - 0xDC00) +
                                0x10000;
                        }
                        else {
                            /* invalid second surrogate */
                            error_set(error, lex,
                                      "invalid Unicode '\\u%04X\\u%04X'",
                                      value, value2);
                            goto out;
                        }
                    }
                    else {
                        /* no second surrogate */
                        error_set(error, lex, "invalid Unicode '\\u%04X'",
                                  value);
                        goto out;
                    }
                }
                else if(0xDC00 <= value && value <= 0xDFFF) {
                    /* a low surrogate without a preceding high surrogate */
                    error_set(error, lex, "invalid Unicode '\\u%04X'", value);
                    goto out;
                }
                else if(value == 0)
                {
                    error_set(error, lex, "\\u0000 is not allowed");
                    goto out;
                }

                if(utf8_encode(value, buffer, &length))
                    assert(0);

                memcpy(t, buffer, length);
                t += length;
            }
            else {
                switch(*p) {
                    case '"': case '\\': case '/':
                        *t = *p; break;
                    case 'b': *t = '\b'; break;
                    case 'f': *t = '\f'; break;
                    case 'n': *t = '\n'; break;
                    case 'r': *t = '\r'; break;
                    case 't': *t = '\t'; break;
                    /* pass 1 already rejected every other escape */
                    default: assert(0);
                }
                t++;
                p++;
            }
        }
        else
            *(t++) = *(p++);
    }
    *t = '\0';
    lex->token = TOKEN_STRING;
    return;

out:
    /* free(NULL) is a no-op; clear the pointer so it cannot be reused */
    free(lex->value.string);
    lex->value.string = NULL;
}
Esempio n. 6
0
/* Advance to the next lexeme for the current agent.
 *
 * A pending alias (lex_alias) is consumed first: its lexeme is copied into
 * current_agent(lexeme), the alias list is advanced and the function
 * returns.  Otherwise the lexeme buffer is emptied, whitespace and comments
 * are skipped, and the routine in lexer_routines[] indexed by the first
 * remaining character is called (lex_eof() at end of input).
 *
 * Build-time variations visible in this function:
 *   - USE_X_DISPLAY undefined: the prompt-printing code is compiled in; if
 *     USE_TCL is also defined the prompt conditions guard an empty block.
 *   - USE_TCL defined: ';' is consumed as a single character and '#' starts
 *     a comment that runs to end-of-line.
 *   - USE_TCL undefined: ';' starts a comment that runs to end-of-line and
 *     "#|" ... "|#" delimits a block comment.
 */
void get_lexeme (void) {

  /* AGR 568 begin */
  /* a pending alias supplies the lexeme directly; nothing is read */
  if (current_agent(lex_alias)) {
    current_agent(lexeme) = current_agent(lex_alias)->lexeme;
    current_agent(lex_alias) = current_agent(lex_alias)->next;
    return;
  }
  /* AGR 568 end */

  /* empty the lexeme text; note that lexeme.type is not touched here, so
     the test below sees the type left by the previous lexeme */
  current_agent(lexeme).length = 0;
  current_agent(lexeme).string[0] = 0;

#ifndef USE_X_DISPLAY
/* The un-braced `if` below takes its body from the preprocessor: an empty
   block under USE_TCL, otherwise the if/else that prints the prompt. */
if (current_agent(lexeme).type==EOF_LEXEME && 
    reading_from_top_level() &&
    current_lexer_parentheses_level()==0 &&  /* AGR 534 */
    current_agent(print_prompt_flag))
#ifdef USE_TCL
  {}
#else

 /* REW: begin 09.15.96 */
 if (current_agent(operand2_mode) == TRUE)
   print ("\nOPERAND %s> ", current_agent(name));
 /* REW: end   09.15.96 */
 else
   print ("\n%s> ", current_agent(name));

#endif /* USE_TCL */
#endif /* USE_X_DISPLAY */

/* AGR 534  The only time a prompt should be printed out is if there's
   a command being expected; ie. the prompt shouldn't print out if we're
   in the middle of entering a production.  So if we're in the middle of
   entering a production, then the parentheses level will be > 0, so that's
   the criteria we will use.  AGR  5-Apr-94  */

  current_agent(load_errors_quit) = FALSE;  /* AGR 527c */

  /* skip whitespace and comments; the loop also ends once
     load_errors_quit is no longer FALSE */
  while (current_agent(load_errors_quit)==FALSE) {   /* AGR 527c */
    if (current_agent(current_char)==EOF_AS_CHAR) break;
    if (whitespace[(unsigned char)current_agent(current_char)]) {
      if (current_agent(current_char) == '\n')
      {    
         /* files flagged fake_rparen_at_eol get do_fake_rparen() at each
            newline instead of having the newline skipped */
         if (current_agent(current_file)->fake_rparen_at_eol) {
              do_fake_rparen();
              return;
         }
#ifndef USE_X_DISPLAY
         /* same preprocessor-selected body as the prompt code above */
         if (current_lexer_parentheses_level()==0 &&  /* AGR 534 */
             current_agent(print_prompt_flag))
#ifdef USE_TCL
         {}
#else

	 /* REW: begin 09.15.96 */
         if (current_agent(operand2_mode) == TRUE)
	   print ("\nOPERAND %s> ", current_agent(name));
	 /* REW: end   09.15.96 */
	 else
	   print ("\n%s> ", current_agent(name));

#endif /* USE_TCL */
#endif /* USE_X_DISPLAY */
      }
      get_next_char();
      continue;
    }

#ifdef USE_TCL 
    if (current_agent(current_char)==';') {
      /* --- skip the semi-colon, forces newline in TCL --- */
      get_next_char();  /* consume it */
      continue;
    }
    if (current_agent(current_char)=='#') {
      /* --- read from hash to end-of-line --- */
      while ((current_agent(current_char)!='\n') &&
             (current_agent(current_char)!=EOF_AS_CHAR))
        get_next_char();
      if (current_agent(current_file)->fake_rparen_at_eol) {
        do_fake_rparen();
        return;
      }
      /* consume the newline that ended the comment, if there was one */
      if (current_agent(current_char)!=EOF_AS_CHAR) get_next_char();
      continue;
    }
#else
    if (current_agent(current_char)==';') {
      /* --- read from semicolon to end-of-line --- */
      while ((current_agent(current_char)!='\n') &&
             (current_agent(current_char)!=EOF_AS_CHAR))
        get_next_char();
      if (current_agent(current_file)->fake_rparen_at_eol) {
        do_fake_rparen();
        return;
      }
      /* consume the newline that ended the comment, if there was one */
      if (current_agent(current_char)!=EOF_AS_CHAR) get_next_char();
      continue;
    }
    if (current_agent(current_char)=='#') {
      /* --- comments surrounded by "#|" and "|#" delimiters --- */
      record_position_of_start_of_lexeme(); /* in case of later error mesg. */
      get_next_char();
      if (current_agent(current_char)!='|') {
        print ("Error: '#' not followed by '|'\n");
        print_location_of_most_recent_lexeme();
        /* the character after '#' is not consumed here; the outer loop
           examines it on the next iteration */
        continue;
      }
      get_next_char();  /* consume the vbar */
      while (TRUE) {
        if (current_agent(current_char)==EOF_AS_CHAR) {
          print ("Error: '#|' without terminating '|#'\n");
          print_location_of_most_recent_lexeme();
          break;
        }
        /* anything other than '|' is comment text */
        if (current_agent(current_char)!='|') { get_next_char(); continue; }
        get_next_char();
        if (current_agent(current_char)=='#') break;
      }
      /* NOTE: this call is also reached after the EOF error break above */
      get_next_char();  /* consume the closing '#' */
      continue; /* continue outer while(TRUE), reading more whitespace */
    }
#endif  /* USE_TCL */
    break; /* if no whitespace or comments found, break out of the loop */
  }
  /* --- no more whitespace, so go get the actual lexeme --- */
  record_position_of_start_of_lexeme();
  /* dispatch on the first character of the lexeme; the cast keeps the
     table index non-negative */
  if (current_agent(current_char)!=EOF_AS_CHAR)
    (*(lexer_routines[(unsigned char)current_agent(current_char)]))();
  else
    lex_eof();
}