Пример #1
0
/*
 * Parse succesive blocks of XML data, generating events for the 
 * handlers/callbacks as we go. State is maintained in the 
 * simple_xml_parser object.
 * If the top level XML document ends before the last character, 
 * the "read" parameter indicates how much input was consumed.
 */
hcerr_t xml_parse(xml_parser *formal_parser, char s[], hc_long_t size, hc_long_t *read){
  simple_xml_parser *parser = (simple_xml_parser *)  formal_parser;

  int i = 0;

  if (DEBUG == TRUE){
    print_state(parser->state, parser->depth);
    printf ("in parser with " LL_FORMAT " %s\n", size, s);
    fflush(stdout);
  }

  while (i < size){

    switch(parser->state){

    case OUTSIDE_ELEMENT:
      if (is_white_space(s[i])){
	/*skip_white_space */
	break;
      }
      if (s[i] == '<'){
	parser->start_tag = TRUE;
	change_state(&parser->state, EXPECTING_OPEN_OR_CLOSE_TAG, parser->depth);
      }
      else {
	HC_ERR_LOG(("Expected '<', read %c at %d %s\n", s[i], i, s));
	return HCERR_XML_EXPECTED_LT;
      }
      break;
      
      
    case DOCUMENT_ELEMENT:
      /* discard document element */
      if (s[i] != '>'){
	if (DEBUG == TRUE)
	  printf("discarding %c", s[i]);
	break;
      }
      else{
	parser->state = OUTSIDE_ELEMENT;
	break;
      }

    case EXPECTING_OPEN_OR_CLOSE_TAG:
      if (is_white_space(s[i])){
	/*skip_white_space */
	break;
      }
      if (s[i] == '/'){
	if (DEBUG)
	  printf("parser->start_tag = FALSE\n");
	parser->start_tag = FALSE;
	break;
      }

    case EXPECTING_TAG:
      
      if (is_name_first_char(s[i]) == TRUE){
	change_state(&parser->state, SCANNING_TAG, parser->depth);
	require_ok(token_append(&parser->buffer_list, s[i]));
	break;
      }

      /* Discard document element */
      else if (s[i] == '?' && parser->depth == 0){
	parser->state = DOCUMENT_ELEMENT;
	break;
      }
      else{
	HC_ERR_LOG(("Invalid first character for element name : %c %d %s\n", s[i], i, s));
	return HCERR_XML_INVALID_ELEMENT_TAG;
      }
      
      // FALLTHRU INTENTIONAL???

      /* Start tag is terminated by whitespace, /, or >
	 End tag is terminated by whitespace or >
      */
    case SCANNING_TAG:
      /* Still reading token */
      if (is_name_char(s[i]) == TRUE){
	require_ok(token_append(&parser->buffer_list, s[i]));
	break;
      }
      else if (is_white_space(s[i]) == TRUE) {
	parser->current_tag = token_finish(&parser->buffer_list);
	if (parser->start_tag == TRUE){
	  /*printf("Start element: %s\n", parser->current_tag);*/
	  change_state(&parser->state, SCANNING_ATTRIBUTES, parser->depth);
	  break;
	}
	else{
	  change_state(&parser->state, SCANNING_CLOSE_TAG, parser->depth);
	  break;
	}
      }
      else if (s[i] == '>') {
	if (DEBUG == TRUE)
	  printf("parser->depth: %d\n", parser->depth);
	require_ok(close_tag(&i, parser));
	if (DEBUG == TRUE)
	  printf("parser->depth: %d\n", parser->depth);
	if (parser->depth == 0){
	  *read = i + 1;
	  return HCERR_OK;
	}
      }
      /* <element/> */
      else if (s[i] == '/' && parser->start_tag == TRUE) {
	if (DEBUG == TRUE){
	  printf("Start element: %s\n", parser->current_tag);	
	  printf("End element: %s\n", parser->current_tag);
	}
	change_state(&parser->state, EXPECTING_RIGHT_BRACKET, parser->depth);
	break;
      }
      else {
	HC_ERR_LOG(("Invalid character '%c' in tag. %i %s\n", s[i], i, s));
	return HCERR_XML_INVALID_ELEMENT_TAG;
      }
      break;

    case EXPECTING_RIGHT_BRACKET:
      if (s[i] != '>') {
	HC_ERR_LOG(("Unexpected character %c after close element. %d %s", s[i], i, s));
	return HCERR_XML_MALFORMED_START_ELEMENT;
      }
      if (parser->depth == 0){
	*read = i + 1;
	return HCERR_OK;
      }
      change_state(&parser->state, OUTSIDE_ELEMENT, parser->depth);
      break;


    case SCANNING_CLOSE_TAG:
	  if (is_white_space(s[i])) {
	    break;
	  }
	  if (DEBUG == TRUE)
	    fprintf(stdout, "End element: %s\n", parser->current_tag);
	  if (s[i] != '>') {
	    HC_ERR_LOG(("Unexpected character %c after close element. %d %s", s[i], i, s));
	    return HCERR_XML_MALFORMED_END_ELEMENT;
	  }
	  require_ok((*parser->end_element_callback)(parser->current_tag, parser->data));
	  parser->depth--;
	  if (parser->depth == 0){
	    *read = i + 1;
	    return HCERR_OK;
	  }

	  change_state(&parser->state, OUTSIDE_ELEMENT, parser->depth);
	  break;


      /* Expected tokens:
       *   attribute_name
       *   '/'
       *   >
       */
    case SCANNING_ATTRIBUTES:
      if (is_white_space(s[i])){
	/*skip_white_space */
	break;
      }
      if (is_name_first_char(s[i]) == TRUE) {
	change_state(&parser->state, SCANNING_ATTRIBUTE_NAME, parser->depth);
	require_ok(token_append(&parser->buffer_list, s[i]));
      }
      else if (s[i] == '/' && parser->start_tag == TRUE) {

	if (DEBUG == TRUE){
	  int j = 0;
	  printf("SA Start element: %s\n", parser->current_tag);		 
	  fprintf(stdout, "Start  element: %s %d\n", parser->current_tag, parser->current_attribute);
	  for (j = 0; j < parser->current_attribute; j++){
	    printf(" %s=\"%s\"", *(parser->attribute_names + j), *(parser->attribute_values + j));
	  }
	  fprintf(stdout, "End  element: %s\n", parser->current_tag);
	  fflush(stdout);
	} 
	require_ok((*parser->start_element_callback)(parser->current_tag, 
						     parser->data, 
						     parser->attribute_names,
						     parser->attribute_values,
						     parser->current_attribute));
	require_ok((*parser->end_element_callback)(parser->current_tag, parser->data));

	parser->current_attribute = 0;
	change_state(&parser->state, EXPECTING_RIGHT_BRACKET, parser->depth);
      }
      else  if (s[i] == '>') { 
	if (DEBUG == TRUE){
	  int j = 0;
	  fprintf(stdout, "Start  element event: %s %d\n", parser->current_tag, parser->current_attribute);
	  for (j = 0; j < parser->current_attribute; j++){
	    printf(" %s=\"%s\"", *(parser->attribute_names + j), *(parser->attribute_values + j));
	  }
	}
	require_ok((*parser->start_element_callback)(parser->current_tag, 
						     parser->data,
						     parser->attribute_names,
						     parser->attribute_values,
						     parser->current_attribute));

	parser->current_attribute = 0;
	parser->depth++;
	change_state(&parser->state, OUTSIDE_ELEMENT, parser->depth);
      }
      else{
	HC_ERR_LOG(("Unexpected character %c after close element. %d %s", s[i], i, s));
	return HCERR_XML_MALFORMED_START_ELEMENT;
      }
      break;

    case SCANNING_ATTRIBUTE_NAME:
      if (s[i] == '='){
	if (parser->current_attribute == parser->attribute_arrays_size){
	  require_ok(grow_attribute_arrays(parser));
	}
	parser->attribute_names[parser->current_attribute] = token_finish(&parser->buffer_list);

	change_state(&parser->state, SCANNING_START_ATTRIBUTE_VALUE, parser->depth);
      }
      else if (is_name_char(s[i]) == TRUE) {
	require_ok(token_append(&parser->buffer_list, s[i]));
      }
      else{
	HC_ERR_LOG(("Illegal char %c in attribute name. %i <<%s>>\n", s[i], i, s));
	return HCERR_XML_BAD_ATTRIBUTE_NAME;
      }
      break;

    case SCANNING_START_ATTRIBUTE_VALUE:
      if (is_white_space(s[i])){
	break;
      }
      else if (s[i] != '"'){
	HC_ERR_LOG(("Attribute value does not begin with quote: '%c'. %i %s\n", s[i], i, s));
	return HCERR_XML_BAD_ATTRIBUTE_NAME;
      }
      change_state(&parser->state, SCANNING_ATTRIBUTE_VALUE, parser->depth);
      break;


    case SCANNING_ATTRIBUTE_VALUE:
      if (s[i] == '\\') {
	if (parser->backslash == TRUE){
	  parser->backslash = FALSE;
	}
	else{
	  parser->backslash = TRUE;
	}
      }
      else if (s[i] == '"' && parser->backslash == FALSE) {
	parser->attribute_values[parser->current_attribute++] = token_finish(&parser->buffer_list);
	change_state(&parser->state, SCANNING_ATTRIBUTES, parser->depth);
      	break;
      }
      require_ok(token_append(&parser->buffer_list, s[i]));
      
      break;
    }
    i++;
  }
  return HCERR_OK;
}
Пример #2
0
int tokenize(struct token_list* tk_list, char* file_buffer)
{
  enum Status status;
   line_num;
  size_t token_begin, token_end;
  token_begin = 0, token_end  = 0;
  status = STATUS_INVALID;
  str_toupper(file_buffer);
  /*
   * Careful: it seems an error to let "i <= len", 
   * but we need one more execution to flush the last token into token list.
   */
  size_t line_num = 1;
  for (size_t i = 0; ; ++i) {
    struct token_node* tok_node;
    switch (status) {
    case STATUS_LETTER:
      if (!IS_LETTER(file_buffer[i]) && !IS_DIGIT(file_buffer[i])) {
        token_end = i;
        tok_node = create_token(TOKEN_LABEL, file_buffer + token_begin, token_end - token_begin);
        tok_node->type = letter_type(tok_node->liter, tok_node->len);
        token_append(tk_list, tok_node);
        token_begin = i;
        status = next_status(status, file_buffer[i]);
      }
      break;
    case STATUS_PRAGMA:
      if (!IS_LETTER(file_buffer[i])) {
        int type;
        token_end = i;
        type = pragma_type(file_buffer + token_begin, token_end - token_begin);
        if (type < 0) {
          error("invalid pragma ad line %d\n", line_num);
          return -4;
        }
        tok_node = create_token(type, file_buffer + token_begin, token_end - token_begin);
        token_append(tk_list, tok_node);
        token_begin = i;
        status = next_status(status, file_buffer[i]);
      }
      break;
    case STATUS_PUNCTUATION:
      token_end = i;
      tok_node = create_token(file_buffer[token_begin], file_buffer + token_begin, token_end - token_begin);
      token_append(tk_list, tok_node);
      token_begin = i;
      status = next_status(status, file_buffer[i]);
      break;
    case STATUS_NUMBER:
      if (!IS_NUMBER(file_buffer[i])) {
        token_end = i;
        if (!check_number(file_buffer + token_begin, token_end - token_begin)) {
          error("invalid number format at line %d\n", line_num);
          return -2;
        }
        tok_node = create_token(TOKEN_NUMBER, file_buffer + token_begin, token_end - token_begin);
        tok_node->data = parse_number(tok_node->liter);
        token_append(tk_list, tok_node);
        token_begin = i;
        status = next_status(status, file_buffer[i]);
      }
      break;
    case STATUS_BLANK:
      if (!IS_BLANK(file_buffer[i])) {
        token_begin = i;
        status = next_status(status, file_buffer[i]);
      }
      break;
    case STATUS_COMMENTS:
      //once status is in comments, it will always be in comments
      if ('\n' == file_buffer[i]) {
        token_begin = i;
        status = next_status(status, file_buffer[i]);
      }
      break;
    case STATUS_INVALID:
      token_begin = i;
      status = next_status(status, file_buffer[i]);
      if (STATUS_INVALID == status && 0 != file_buffer[i]) {
        error("invalid format at line %d\n", line_num);
        return -3;
      }
      break;
    }
    if (0 == file_buffer[i])
      break;
    else if ('\n' == file_buffer[i])
      ++line_num;
  }
  return 0;
}