/*
 * Parse successive blocks of XML data, generating events for the
 * handlers/callbacks as we go.  State is maintained in the
 * simple_xml_parser object between calls.
 *
 * Parameters:
 *   formal_parser - parser handle; used as a simple_xml_parser.
 *   s             - input characters for this block.
 *   size          - number of characters of s to examine.
 *   read          - written only when the top level XML document ends
 *                   inside this block; it then indicates how much input
 *                   was consumed.  It is left untouched otherwise.
 *
 * Returns HCERR_OK when the whole block was consumed or the top level
 * document ended, one of the HCERR_XML_* codes on malformed input, or
 * (presumably, via require_ok) the error reported by a helper/callback.
 */
hcerr_t xml_parse(xml_parser *formal_parser, char s[], hc_long_t size, hc_long_t *read){
  simple_xml_parser *parser = (simple_xml_parser *) formal_parser;
  int i = 0;

  if (DEBUG == TRUE){
    print_state(parser->state, parser->depth);
    printf ("in parser with " LL_FORMAT " %s\n", size, s);
    fflush(stdout);
  }

  while (i < size){
    switch(parser->state){

    case OUTSIDE_ELEMENT:
      if (is_white_space(s[i])){
        /* skip white space */
        break;
      }
      if (s[i] == '<'){
        parser->start_tag = TRUE;
        change_state(&parser->state, EXPECTING_OPEN_OR_CLOSE_TAG, parser->depth);
      }
      else {
        HC_ERR_LOG(("Expected '<', read %c at %d %s\n", s[i], i, s));
        return HCERR_XML_EXPECTED_LT;
      }
      break;

    case DOCUMENT_ELEMENT:
      /* discard document element: everything up to the next '>' */
      if (s[i] != '>'){
        if (DEBUG == TRUE)
          printf("discarding %c", s[i]);
        break;
      }
      else{
        parser->state = OUTSIDE_ELEMENT;
        break;
      }

    case EXPECTING_OPEN_OR_CLOSE_TAG:
      if (is_white_space(s[i])){
        /* skip white space */
        break;
      }
      if (s[i] == '/'){
        if (DEBUG)
          printf("parser->start_tag = FALSE\n");
        parser->start_tag = FALSE;
        break;
      }
      /* FALLTHROUGH: any other character must start the tag name. */

    case EXPECTING_TAG:
      if (is_name_first_char(s[i]) == TRUE){
        change_state(&parser->state, SCANNING_TAG, parser->depth);
        require_ok(token_append(&parser->buffer_list, s[i]));
        break;
      }
      /* Discard document element */
      else if (s[i] == '?' && parser->depth == 0){
        parser->state = DOCUMENT_ELEMENT;
        break;
      }
      else{
        HC_ERR_LOG(("Invalid first character for element name : %c %d %s\n", s[i], i, s));
        return HCERR_XML_INVALID_ELEMENT_TAG;
      }
      /* Not reached: every branch above breaks or returns. */

    /*
     * Start tag is terminated by whitespace, /, or >
     * End tag is terminated by whitespace or >
     */
    case SCANNING_TAG:
      /* Still reading token */
      if (is_name_char(s[i]) == TRUE){
        require_ok(token_append(&parser->buffer_list, s[i]));
        break;
      }
      else if (is_white_space(s[i]) == TRUE) {
        parser->current_tag = token_finish(&parser->buffer_list);
        if (parser->start_tag == TRUE){
          change_state(&parser->state, SCANNING_ATTRIBUTES, parser->depth);
          break;
        }
        else{
          change_state(&parser->state, SCANNING_CLOSE_TAG, parser->depth);
          break;
        }
      }
      else if (s[i] == '>') {
        if (DEBUG == TRUE)
          printf("parser->depth: %d\n", parser->depth);
        require_ok(close_tag(&i, parser));
        if (DEBUG == TRUE)
          printf("parser->depth: %d\n", parser->depth);
        if (parser->depth == 0){
          *read = i + 1;
          return HCERR_OK;
        }
      }
      /* <element/> */
      else if (s[i] == '/' && parser->start_tag == TRUE) {
        /*
         * Finish the tag name here; otherwise its characters stay in the
         * token buffer and get glued onto the next token, and no events
         * are ever delivered for this element.  Mirrors the '/' branch of
         * SCANNING_ATTRIBUTES.
         */
        parser->current_tag = token_finish(&parser->buffer_list);
        if (DEBUG == TRUE){
          printf("Start element: %s\n", parser->current_tag);
          printf("End element: %s\n", parser->current_tag);
        }
        require_ok((*parser->start_element_callback)(parser->current_tag,
                                                     parser->data,
                                                     parser->attribute_names,
                                                     parser->attribute_values,
                                                     parser->current_attribute));
        require_ok((*parser->end_element_callback)(parser->current_tag, parser->data));
        parser->current_attribute = 0;
        change_state(&parser->state, EXPECTING_RIGHT_BRACKET, parser->depth);
        break;
      }
      else {
        HC_ERR_LOG(("Invalid character '%c' in tag. %i %s\n", s[i], i, s));
        return HCERR_XML_INVALID_ELEMENT_TAG;
      }
      break;

    case EXPECTING_RIGHT_BRACKET:
      if (s[i] != '>') {
        HC_ERR_LOG(("Unexpected character %c after close element. %d %s", s[i], i, s));
        return HCERR_XML_MALFORMED_START_ELEMENT;
      }
      /* An empty element at depth 0 is the whole document. */
      if (parser->depth == 0){
        *read = i + 1;
        return HCERR_OK;
      }
      change_state(&parser->state, OUTSIDE_ELEMENT, parser->depth);
      break;

    case SCANNING_CLOSE_TAG:
      if (is_white_space(s[i])) {
        break;
      }
      if (DEBUG == TRUE)
        fprintf(stdout, "End element: %s\n", parser->current_tag);
      if (s[i] != '>') {
        HC_ERR_LOG(("Unexpected character %c after close element. %d %s", s[i], i, s));
        return HCERR_XML_MALFORMED_END_ELEMENT;
      }
      require_ok((*parser->end_element_callback)(parser->current_tag, parser->data));
      parser->depth--;
      if (parser->depth == 0){
        *read = i + 1;
        return HCERR_OK;
      }
      change_state(&parser->state, OUTSIDE_ELEMENT, parser->depth);
      break;

    /* Expected tokens:
     *   attribute_name
     *   '/'
     *   >
     */
    case SCANNING_ATTRIBUTES:
      if (is_white_space(s[i])){
        /* skip white space */
        break;
      }
      if (is_name_first_char(s[i]) == TRUE) {
        change_state(&parser->state, SCANNING_ATTRIBUTE_NAME, parser->depth);
        require_ok(token_append(&parser->buffer_list, s[i]));
      }
      else if (s[i] == '/' && parser->start_tag == TRUE) {
        if (DEBUG == TRUE){
          int j = 0;
          printf("SA Start element: %s\n", parser->current_tag);
          fprintf(stdout, "Start element: %s %d\n", parser->current_tag, parser->current_attribute);
          for (j = 0; j < parser->current_attribute; j++){
            printf(" %s=\"%s\"", *(parser->attribute_names + j), *(parser->attribute_values + j));
          }
          fprintf(stdout, "End element: %s\n", parser->current_tag);
          fflush(stdout);
        }
        /* Empty element: deliver start and end back to back. */
        require_ok((*parser->start_element_callback)(parser->current_tag,
                                                     parser->data,
                                                     parser->attribute_names,
                                                     parser->attribute_values,
                                                     parser->current_attribute));
        require_ok((*parser->end_element_callback)(parser->current_tag, parser->data));
        parser->current_attribute = 0;
        change_state(&parser->state, EXPECTING_RIGHT_BRACKET, parser->depth);
      }
      else if (s[i] == '>') {
        if (DEBUG == TRUE){
          int j = 0;
          fprintf(stdout, "Start element event: %s %d\n", parser->current_tag, parser->current_attribute);
          for (j = 0; j < parser->current_attribute; j++){
            printf(" %s=\"%s\"", *(parser->attribute_names + j), *(parser->attribute_values + j));
          }
        }
        require_ok((*parser->start_element_callback)(parser->current_tag,
                                                     parser->data,
                                                     parser->attribute_names,
                                                     parser->attribute_values,
                                                     parser->current_attribute));
        parser->current_attribute = 0;
        parser->depth++;
        change_state(&parser->state, OUTSIDE_ELEMENT, parser->depth);
      }
      else{
        HC_ERR_LOG(("Unexpected character %c after close element. %d %s", s[i], i, s));
        return HCERR_XML_MALFORMED_START_ELEMENT;
      }
      break;

    case SCANNING_ATTRIBUTE_NAME:
      if (s[i] == '='){
        /* Make room before storing the name at index current_attribute. */
        if (parser->current_attribute == parser->attribute_arrays_size){
          require_ok(grow_attribute_arrays(parser));
        }
        parser->attribute_names[parser->current_attribute] = token_finish(&parser->buffer_list);
        change_state(&parser->state, SCANNING_START_ATTRIBUTE_VALUE, parser->depth);
      }
      else if (is_name_char(s[i]) == TRUE) {
        require_ok(token_append(&parser->buffer_list, s[i]));
      }
      else{
        HC_ERR_LOG(("Illegal char %c in attribute name. %i <<%s>>\n", s[i], i, s));
        return HCERR_XML_BAD_ATTRIBUTE_NAME;
      }
      break;

    case SCANNING_START_ATTRIBUTE_VALUE:
      if (is_white_space(s[i])){
        break;
      }
      else if (s[i] != '"'){
        HC_ERR_LOG(("Attribute value does not begin with quote: '%c'. %i %s\n", s[i], i, s));
        return HCERR_XML_BAD_ATTRIBUTE_NAME;
      }
      change_state(&parser->state, SCANNING_ATTRIBUTE_VALUE, parser->depth);
      break;

    case SCANNING_ATTRIBUTE_VALUE:
      if (s[i] == '\\') {
        /* A second backslash in a row cancels the pending escape. */
        if (parser->backslash == TRUE){
          parser->backslash = FALSE;
        }
        else{
          parser->backslash = TRUE;
        }
      }
      else if (s[i] == '"' && parser->backslash == FALSE) {
        /* Unescaped quote: the value is complete (quote is not stored). */
        parser->attribute_values[parser->current_attribute++] = token_finish(&parser->buffer_list);
        change_state(&parser->state, SCANNING_ATTRIBUTES, parser->depth);
        break;
      }
      else {
        /*
         * The escape applies to exactly one character.  Without this
         * reset the flag stayed set forever after the first escape and
         * the closing quote was never recognised.
         */
        parser->backslash = FALSE;
      }
      /* Characters, including backslashes, are stored verbatim. */
      require_ok(token_append(&parser->buffer_list, s[i]));
      break;

    default:
      break;
    }
    i++;
  }
  return HCERR_OK;
}
int tokenize(struct token_list* tk_list, char* file_buffer) { enum Status status; line_num; size_t token_begin, token_end; token_begin = 0, token_end = 0; status = STATUS_INVALID; str_toupper(file_buffer); /* * Careful: it seems an error to let "i <= len", * but we need one more execution to flush the last token into token list. */ size_t line_num = 1; for (size_t i = 0; ; ++i) { struct token_node* tok_node; switch (status) { case STATUS_LETTER: if (!IS_LETTER(file_buffer[i]) && !IS_DIGIT(file_buffer[i])) { token_end = i; tok_node = create_token(TOKEN_LABEL, file_buffer + token_begin, token_end - token_begin); tok_node->type = letter_type(tok_node->liter, tok_node->len); token_append(tk_list, tok_node); token_begin = i; status = next_status(status, file_buffer[i]); } break; case STATUS_PRAGMA: if (!IS_LETTER(file_buffer[i])) { int type; token_end = i; type = pragma_type(file_buffer + token_begin, token_end - token_begin); if (type < 0) { error("invalid pragma ad line %d\n", line_num); return -4; } tok_node = create_token(type, file_buffer + token_begin, token_end - token_begin); token_append(tk_list, tok_node); token_begin = i; status = next_status(status, file_buffer[i]); } break; case STATUS_PUNCTUATION: token_end = i; tok_node = create_token(file_buffer[token_begin], file_buffer + token_begin, token_end - token_begin); token_append(tk_list, tok_node); token_begin = i; status = next_status(status, file_buffer[i]); break; case STATUS_NUMBER: if (!IS_NUMBER(file_buffer[i])) { token_end = i; if (!check_number(file_buffer + token_begin, token_end - token_begin)) { error("invalid number format at line %d\n", line_num); return -2; } tok_node = create_token(TOKEN_NUMBER, file_buffer + token_begin, token_end - token_begin); tok_node->data = parse_number(tok_node->liter); token_append(tk_list, tok_node); token_begin = i; status = next_status(status, file_buffer[i]); } break; case STATUS_BLANK: if (!IS_BLANK(file_buffer[i])) { token_begin = i; status = next_status(status, 
file_buffer[i]); } break; case STATUS_COMMENTS: //once status is in comments, it will always be in comments if ('\n' == file_buffer[i]) { token_begin = i; status = next_status(status, file_buffer[i]); } break; case STATUS_INVALID: token_begin = i; status = next_status(status, file_buffer[i]); if (STATUS_INVALID == status && 0 != file_buffer[i]) { error("invalid format at line %d\n", line_num); return -3; } break; } if (0 == file_buffer[i]) break; else if ('\n' == file_buffer[i]) ++line_num; } return 0; }