/*
 * Tells whether c may appear in a name after its first character:
 * anything is_name_first_char() accepts, plus the ASCII digits,
 * '-' and '.'. Returns TRUE or FALSE.
 */
static int is_name_char(char c)
{
  /* Everything that may start a name may also continue it. */
  if (is_name_first_char(c) == TRUE) {
    return TRUE;
  }

  if (c >= '0' && c <= '9') {
    return TRUE;
  }

  switch (c) {
  case '-':
  case '.':
    return TRUE;
  default:
    return FALSE;
  }
}
/*
 * Parse successive blocks of XML data, generating events for the
 * handlers/callbacks as we go. State is maintained in the
 * simple_xml_parser object, so a document may be fed in several calls.
 *
 * formal_parser  the parser; it is cast to simple_xml_parser here.
 * s              the input characters for this call. The debug and error
 *                logging prints s with %s, so it is presumably
 *                NUL-terminated -- TODO confirm against callers.
 * size           number of characters of s to examine.
 * read           set to the index just past the terminating '>' when the
 *                element depth drops back to 0, i.e. the top level
 *                document ended inside this block. It is NOT written when
 *                the input simply runs out.
 *
 * Returns HCERR_OK both when the document ended and when all of s was
 * consumed without error; returns an HCERR_XML_* code on malformed input,
 * or whatever error a callback/helper wrapped in require_ok() reported.
 */
hcerr_t xml_parse(xml_parser *formal_parser, char s[], hc_long_t size, hc_long_t *read){
  simple_xml_parser *parser = (simple_xml_parser *) formal_parser;
  int i = 0;

  if (DEBUG == TRUE){
    print_state(parser->state, parser->depth);
    printf ("in parser with " LL_FORMAT " %s\n", size, s);
    fflush(stdout);
  }

  while (i < size){
    switch(parser->state){

    case OUTSIDE_ELEMENT:
      if (is_white_space(s[i])){
        /* skip white space */
        break;
      }
      if (s[i] == '<'){
        /* Assume a start tag until a '/' says otherwise. */
        parser->start_tag = TRUE;
        change_state(&parser->state, EXPECTING_OPEN_OR_CLOSE_TAG, parser->depth);
      }
      else {
        HC_ERR_LOG(("Expected '<', read %c at %d %s\n", s[i], i, s));
        return HCERR_XML_EXPECTED_LT;
      }
      break;

    case DOCUMENT_ELEMENT:
      /* Discard the document element: drop everything up to '>'. */
      if (s[i] != '>'){
        if (DEBUG == TRUE)
          printf("discarding %c", s[i]);
      }
      else{
        parser->state = OUTSIDE_ELEMENT;
      }
      break;

    case EXPECTING_OPEN_OR_CLOSE_TAG:
      if (is_white_space(s[i])){
        /* skip white space */
        break;
      }
      if (s[i] == '/'){
        if (DEBUG)
          printf("parser->start_tag = FALSE\n");
        parser->start_tag = FALSE;
        break;
      }
      /* fallthrough: this character must be the first one of the tag name */

    case EXPECTING_TAG:
      if (is_name_first_char(s[i]) == TRUE){
        change_state(&parser->state, SCANNING_TAG, parser->depth);
        require_ok(token_append(&parser->buffer_list, s[i]));
        break;
      }
      /* Discard document element */
      else if (s[i] == '?' && parser->depth == 0){
        parser->state = DOCUMENT_ELEMENT;
        break;
      }
      else{
        HC_ERR_LOG(("Invalid first character for element name : %c %d %s\n", s[i], i, s));
        return HCERR_XML_INVALID_ELEMENT_TAG;
      }
      /* Not reached: every branch above breaks or returns. */

    /* Start tag is terminated by whitespace, /, or >
       End tag is terminated by whitespace or > */
    case SCANNING_TAG:
      /* Still reading token */
      if (is_name_char(s[i]) == TRUE){
        require_ok(token_append(&parser->buffer_list, s[i]));
      }
      else if (is_white_space(s[i]) == TRUE) {
        parser->current_tag = token_finish(&parser->buffer_list);
        if (parser->start_tag == TRUE){
          change_state(&parser->state, SCANNING_ATTRIBUTES, parser->depth);
        }
        else{
          change_state(&parser->state, SCANNING_CLOSE_TAG, parser->depth);
        }
      }
      else if (s[i] == '>') {
        if (DEBUG == TRUE)
          printf("parser->depth: %d\n", parser->depth);
        require_ok(close_tag(&i, parser));
        if (DEBUG == TRUE)
          printf("parser->depth: %d\n", parser->depth);
        if (parser->depth == 0){
          *read = i + 1;
          return HCERR_OK;
        }
      }
      /* <element/> */
      else if (s[i] == '/' && parser->start_tag == TRUE) {
        /*
         * Finish the pending name and report the empty element, exactly
         * as the '/' branch of SCANNING_ATTRIBUTES does. Without this the
         * element was never reported and its name stayed in the token
         * buffer, where it got prepended to the next tag name.
         */
        parser->current_tag = token_finish(&parser->buffer_list);
        if (DEBUG == TRUE){
          printf("Start element: %s\n", parser->current_tag);
          printf("End element: %s\n", parser->current_tag);
        }
        require_ok((*parser->start_element_callback)(parser->current_tag,
                                                     parser->data,
                                                     parser->attribute_names,
                                                     parser->attribute_values,
                                                     parser->current_attribute));
        require_ok((*parser->end_element_callback)(parser->current_tag,
                                                   parser->data));
        parser->current_attribute = 0;
        change_state(&parser->state, EXPECTING_RIGHT_BRACKET, parser->depth);
      }
      else {
        HC_ERR_LOG(("Invalid character '%c' in tag. %i %s\n", s[i], i, s));
        return HCERR_XML_INVALID_ELEMENT_TAG;
      }
      break;

    case EXPECTING_RIGHT_BRACKET:
      /* Only '>' may follow the '/' of an empty element. */
      if (s[i] != '>') {
        HC_ERR_LOG(("Unexpected character %c after close element. %d %s", s[i], i, s));
        return HCERR_XML_MALFORMED_START_ELEMENT;
      }
      if (parser->depth == 0){
        *read = i + 1;
        return HCERR_OK;
      }
      change_state(&parser->state, OUTSIDE_ELEMENT, parser->depth);
      break;

    case SCANNING_CLOSE_TAG:
      if (is_white_space(s[i])) {
        break;
      }
      if (DEBUG == TRUE)
        fprintf(stdout, "End element: %s\n", parser->current_tag);
      if (s[i] != '>') {
        HC_ERR_LOG(("Unexpected character %c after close element. %d %s", s[i], i, s));
        return HCERR_XML_MALFORMED_END_ELEMENT;
      }
      require_ok((*parser->end_element_callback)(parser->current_tag, parser->data));
      parser->depth--;
      if (parser->depth == 0){
        *read = i + 1;
        return HCERR_OK;
      }
      change_state(&parser->state, OUTSIDE_ELEMENT, parser->depth);
      break;

    /* Expected tokens:
     *   attribute_name
     *   '/'
     *   >
     */
    case SCANNING_ATTRIBUTES:
      if (is_white_space(s[i])){
        /* skip white space */
        break;
      }
      if (is_name_first_char(s[i]) == TRUE) {
        change_state(&parser->state, SCANNING_ATTRIBUTE_NAME, parser->depth);
        require_ok(token_append(&parser->buffer_list, s[i]));
      }
      else if (s[i] == '/' && parser->start_tag == TRUE) {
        /* Empty element: report start and end back to back. */
        if (DEBUG == TRUE){
          int j = 0;
          printf("SA Start element: %s\n", parser->current_tag);
          fprintf(stdout, "Start element: %s %d\n", parser->current_tag, parser->current_attribute);
          for (j = 0; j < parser->current_attribute; j++){
            printf(" %s=\"%s\"", *(parser->attribute_names + j), *(parser->attribute_values + j));
          }
          fprintf(stdout, "End element: %s\n", parser->current_tag);
          fflush(stdout);
        }
        require_ok((*parser->start_element_callback)(parser->current_tag,
                                                     parser->data,
                                                     parser->attribute_names,
                                                     parser->attribute_values,
                                                     parser->current_attribute));
        require_ok((*parser->end_element_callback)(parser->current_tag,
                                                   parser->data));
        parser->current_attribute = 0;
        change_state(&parser->state, EXPECTING_RIGHT_BRACKET, parser->depth);
      }
      else if (s[i] == '>') {
        if (DEBUG == TRUE){
          int j = 0;
          fprintf(stdout, "Start element event: %s %d\n", parser->current_tag, parser->current_attribute);
          for (j = 0; j < parser->current_attribute; j++){
            printf(" %s=\"%s\"", *(parser->attribute_names + j), *(parser->attribute_values + j));
          }
        }
        require_ok((*parser->start_element_callback)(parser->current_tag,
                                                     parser->data,
                                                     parser->attribute_names,
                                                     parser->attribute_values,
                                                     parser->current_attribute));
        parser->current_attribute = 0;
        parser->depth++;
        change_state(&parser->state, OUTSIDE_ELEMENT, parser->depth);
      }
      else{
        HC_ERR_LOG(("Unexpected character %c after close element. %d %s", s[i], i, s));
        return HCERR_XML_MALFORMED_START_ELEMENT;
      }
      break;

    case SCANNING_ATTRIBUTE_NAME:
      if (s[i] == '='){
        /* Make room before storing the name of attribute current_attribute. */
        if (parser->current_attribute == parser->attribute_arrays_size){
          require_ok(grow_attribute_arrays(parser));
        }
        parser->attribute_names[parser->current_attribute] = token_finish(&parser->buffer_list);
        change_state(&parser->state, SCANNING_START_ATTRIBUTE_VALUE, parser->depth);
      }
      else if (is_name_char(s[i]) == TRUE) {
        require_ok(token_append(&parser->buffer_list, s[i]));
      }
      else{
        HC_ERR_LOG(("Illegal char %c in attribute name. %i <<%s>>\n", s[i], i, s));
        return HCERR_XML_BAD_ATTRIBUTE_NAME;
      }
      break;

    case SCANNING_START_ATTRIBUTE_VALUE:
      if (is_white_space(s[i])){
        break;
      }
      else if (s[i] != '"'){
        HC_ERR_LOG(("Attribute value does not begin with quote: '%c'. %i %s\n", s[i], i, s));
        return HCERR_XML_BAD_ATTRIBUTE_NAME;
      }
      change_state(&parser->state, SCANNING_ATTRIBUTE_VALUE, parser->depth);
      break;

    case SCANNING_ATTRIBUTE_VALUE:
      if (s[i] == '\\') {
        /* A second backslash in a row cancels the pending escape. */
        if (parser->backslash == TRUE){
          parser->backslash = FALSE;
        }
        else{
          parser->backslash = TRUE;
        }
      }
      else if (s[i] == '"' && parser->backslash == FALSE) {
        /* Unescaped quote: the value is complete. */
        parser->attribute_values[parser->current_attribute++] = token_finish(&parser->buffer_list);
        change_state(&parser->state, SCANNING_ATTRIBUTES, parser->depth);
        break;
      }
      else {
        /*
         * An escape covers only the single character after the backslash.
         * Clearing the flag here keeps one "\x" from turning every later
         * quote into a literal, which left the value unterminated.
         */
        parser->backslash = FALSE;
      }
      /* The raw character (backslashes included) is kept in the value. */
      require_ok(token_append(&parser->buffer_list, s[i]));
      break;

    default:
      /* Unknown state: the character is skipped, as before. */
      break;
    }
    i++;
  }
  return HCERR_OK;
}
/**
 * Splits a buffer of NCD configuration text into tokens and reports each
 * one through the output callback.
 *
 * Comments (from '#' up to, but not including, the next newline) and space
 * characters produce no token. Every call to output receives the line and
 * character position at which the token starts, both counted from 1.
 *
 * @param str    start of the text; it is read by length, not as a C string
 * @param left   number of bytes available at str
 * @param output callback, called as
 *               output(user, token, token_val, token_len, line, line_char).
 *               For NCD_TOKEN_NAME, token_val is a malloc'ed, NUL-terminated
 *               copy of the name; for NCD_TOKEN_STRING it is the decoded
 *               string obtained from ExpString_Get(). This function does not
 *               free either after handing it over. For all other tokens
 *               token_val is NULL and token_len is 0. When output returns
 *               zero, tokenizing stops immediately.
 * @param user   passed through to output unchanged
 *
 * On a lexical or allocation error a single NCD_ERROR is reported and the
 * function returns; otherwise NCD_EOF is reported once the input is used up.
 */
void NCDConfigTokenizer_Tokenize (char *str, size_t left, NCDConfigTokenizer_output output, void *user)
{
    // tokens with a fixed spelling, recognized wherever they appear;
    // tried in this order
    const struct {
        const char *text;
        int token;
    } punctuation[] = {
        {"{", NCD_TOKEN_CURLY_OPEN},
        {"}", NCD_TOKEN_CURLY_CLOSE},
        {"(", NCD_TOKEN_ROUND_OPEN},
        {")", NCD_TOKEN_ROUND_CLOSE},
        {";", NCD_TOKEN_SEMICOLON},
        {".", NCD_TOKEN_DOT},
        {",", NCD_TOKEN_COMMA},
        {":", NCD_TOKEN_COLON},
        {"[", NCD_TOKEN_BRACKET_OPEN},
        {"]", NCD_TOKEN_BRACKET_CLOSE},
        {"->", NCD_TOKEN_ARROW},
    };
    const size_t num_punctuation = sizeof(punctuation) / sizeof(punctuation[0]);
    
    // reserved words; these only count when they are not followed by
    // another name character (see below)
    const struct {
        const char *text;
        int token;
    } keywords[] = {
        {"If", NCD_TOKEN_IF},
        {"Elif", NCD_TOKEN_ELIF},
        {"elif", NCD_TOKEN_ELIF},
        {"Else", NCD_TOKEN_ELSE},
        {"else", NCD_TOKEN_ELSE},
        {"Foreach", NCD_TOKEN_FOREACH},
        {"As", NCD_TOKEN_AS},
        {"include_guard", NCD_TOKEN_INCLUDE_GUARD},
        {"include", NCD_TOKEN_INCLUDE},
    };
    const size_t num_keywords = sizeof(keywords) / sizeof(keywords[0]);
    
    size_t line = 1;
    size_t line_char = 1;
    
    while (left > 0) {
        size_t l = 0;           // number of input bytes this token covers
        int error = 0;
        int token = 0;          // 0 means "nothing to report" (comment, space)
        void *token_val = NULL;
        size_t token_len = 0;
        int matched = 0;
        
        if (*str == '#') {
            // comment; the newline itself is left for the next round
            l = 1;
            while (l < left && str[l] != '\n') {
                l++;
            }
            matched = 1;
        }
        
        for (size_t k = 0; !matched && k < num_punctuation; k++) {
            size_t n = data_begins_with(str, left, punctuation[k].text);
            if (n > 0) {
                l = n;
                token = punctuation[k].token;
                matched = 1;
            }
        }
        
        for (size_t k = 0; !matched && k < num_keywords; k++) {
            size_t n = data_begins_with(str, left, keywords[k].text);
            // Whole-word match only, like "process" and "template" below.
            // A bare prefix match would split a name such as "Assert" or
            // "included" into a keyword followed by a name.
            if (n > 0 && (n >= left || !is_name_char(str[n]))) {
                l = n;
                token = keywords[k].token;
                matched = 1;
            }
        }
        
        if (matched) {
            // already handled above
        }
        else if (is_name_first_char(*str)) {
            l = 1;
            while (l < left && is_name_char(str[l])) {
                l++;
            }
            
            // allocate buffer for the name plus terminator
            bsize_t bufsize = bsize_add(bsize_fromsize(l), bsize_fromint(1));
            char *buf;
            if (bufsize.is_overflow || !(buf = malloc(bufsize.value))) {
                BLog(BLOG_ERROR, "malloc failed");
                error = 1;
            }
            else {
                // copy and terminate
                memcpy(buf, str, l);
                buf[l] = '\0';
                
                if (!strcmp(buf, "process")) {
                    token = NCD_TOKEN_PROCESS;
                    free(buf);
                }
                else if (!strcmp(buf, "template")) {
                    token = NCD_TOKEN_TEMPLATE;
                    free(buf);
                }
                else {
                    // the buffer is handed to output as the token value
                    token = NCD_TOKEN_NAME;
                    token_val = buf;
                    token_len = l;
                }
            }
        }
        else if (*str == '"') {
            // init string
            ExpString estr;
            if (!ExpString_Init(&estr)) {
                BLog(BLOG_ERROR, "ExpString_Init failed");
                goto string_fail0;
            }
            
            // skip start quote
            l = 1;
            
            // decode string up to the closing quote
            while (l < left && str[l] != '"') {
                uint8_t dec_ch;
                
                if (str[l] != '\\') {
                    dec_ch = (uint8_t)str[l];
                    l++;
                }
                else {
                    if (left - l < 2) {
                        BLog(BLOG_ERROR, "escape character found in string but nothing follows");
                        goto string_fail1;
                    }
                    
                    // bytes consumed beyond the backslash and escape letter
                    size_t extra = 0;
                    
                    switch (str[l + 1]) {
                        case '\'':
                        case '\"':
                        case '\\':
                        case '\?':
                            dec_ch = (uint8_t)str[l + 1];
                            break;
                        
                        case 'a': dec_ch = '\a'; break;
                        case 'b': dec_ch = '\b'; break;
                        case 'f': dec_ch = '\f'; break;
                        case 'n': dec_ch = '\n'; break;
                        case 'r': dec_ch = '\r'; break;
                        case 't': dec_ch = '\t'; break;
                        case 'v': dec_ch = '\v'; break;
                        case '0': dec_ch = 0; break;
                        
                        case 'x': {
                            // need backslash, 'x' and two hex digits
                            if (left - l < 4) {
                                BLog(BLOG_ERROR, "hexadecimal escape found in string but too little characters follow");
                                goto string_fail1;
                            }
                            
                            uintmax_t hex_val;
                            if (!parse_unsigned_hex_integer_bin(&str[l + 2], 2, &hex_val)) {
                                BLog(BLOG_ERROR, "hexadecimal escape found in string but two hex characters don't follow");
                                goto string_fail1;
                            }
                            
                            // only two digits were parsed
                            dec_ch = (uint8_t)hex_val;
                            extra = 2;
                        } break;
                        
                        default:
                            BLog(BLOG_ERROR, "bad escape sequence in string");
                            goto string_fail1;
                    }
                    
                    l += 2 + extra;
                }
                
                // append byte to string
                if (!ExpString_AppendByte(&estr, dec_ch)) {
                    BLog(BLOG_ERROR, "ExpString_AppendByte failed");
                    goto string_fail1;
                }
            }
            
            // make sure ending quote was found
            if (l == left) {
                BLog(BLOG_ERROR, "missing ending quote for string");
                goto string_fail1;
            }
            
            // skip ending quote
            l++;
            
            token = NCD_TOKEN_STRING;
            token_val = ExpString_Get(&estr);
            token_len = ExpString_Length(&estr);
            goto string_done;
            
        string_fail1:
            ExpString_Free(&estr);
        string_fail0:
            error = 1;
        string_done:
            ;
        }
        else if (is_space_char(*str)) {
            l = 1;
        }
        else {
            BLog(BLOG_ERROR, "unrecognized character");
            error = 1;
        }
        
        // report error
        if (error) {
            output(user, NCD_ERROR, NULL, 0, line, line_char);
            return;
        }
        
        // output token
        if (token) {
            if (!output(user, token, token_val, token_len, line, line_char)) {
                return;
            }
        }
        
        // update line/char counters over the consumed bytes
        for (size_t i = 0; i < l; i++) {
            if (str[i] == '\n') {
                line++;
                line_char = 1;
            } else {
                line_char++;
            }
        }
        
        str += l;
        left -= l;
    }
    
    output(user, NCD_EOF, NULL, 0, line, line_char);
}
/**
 * Walks over a buffer of NCD configuration text and reports its tokens
 * through the output callback.
 *
 * Comments (from '#' up to, but not including, the next newline) and space
 * characters are consumed without producing a token. Every call to output
 * carries the line and character position where the token starts, both
 * counted from 1.
 *
 * @param str    start of the text; it is read by length, not as a C string
 * @param left   number of bytes available at str
 * @param output callback, called as
 *               output(user, token, token_val, line, line_char).
 *               For NCD_TOKEN_NAME, token_val is a malloc'ed, NUL-terminated
 *               copy of the name; for NCD_TOKEN_STRING it is what
 *               ExpString_Get() returns for the decoded string. This
 *               function does not free either after handing it over. For
 *               all other tokens token_val is NULL. When output returns
 *               zero, tokenizing stops immediately.
 * @param user   passed through to output unchanged
 *
 * On a lexical or allocation error a single NCD_ERROR is reported and the
 * function returns; otherwise NCD_EOF is reported once the input is used up.
 */
void NCDConfigTokenizer_Tokenize (char *str, size_t left, NCDConfigTokenizer_output output, void *user)
{
    // tokens with a fixed spelling, tried in exactly this order
    const struct {
        const char *text;
        int token;
    } fixed_tokens[] = {
        {"{", NCD_TOKEN_CURLY_OPEN},
        {"}", NCD_TOKEN_CURLY_CLOSE},
        {"(", NCD_TOKEN_ROUND_OPEN},
        {")", NCD_TOKEN_ROUND_CLOSE},
        {";", NCD_TOKEN_SEMICOLON},
        {".", NCD_TOKEN_DOT},
        {",", NCD_TOKEN_COMMA},
        {":", NCD_TOKEN_COLON},
        {"[", NCD_TOKEN_BRACKET_OPEN},
        {"]", NCD_TOKEN_BRACKET_CLOSE},
        {"->", NCD_TOKEN_ARROW},
    };
    const size_t num_fixed = sizeof(fixed_tokens) / sizeof(fixed_tokens[0]);
    
    size_t cur_line = 1;
    size_t cur_col = 1;
    
    while (left > 0) {
        size_t consumed = 0;    // input bytes covered by this token
        int failed = 0;
        int token = 0;          // 0: nothing to report (comment, space)
        void *value = NULL;
        size_t which = num_fixed;
        
        // a comment start takes precedence over everything else
        if (*str != '#') {
            for (which = 0; which < num_fixed; which++) {
                consumed = data_begins_with(str, left, fixed_tokens[which].text);
                if (consumed) {
                    break;
                }
            }
        }
        
        if (*str == '#') {
            // the newline itself is left for the next round
            consumed = 1;
            while (consumed < left && str[consumed] != '\n') {
                consumed++;
            }
            token = 0;
        }
        else if (which < num_fixed) {
            token = fixed_tokens[which].token;
        }
        else if (is_name_first_char(*str)) {
            consumed = 1;
            while (consumed < left && is_name_char(str[consumed])) {
                consumed++;
            }
            
            // room for the name and its terminator
            bsize_t need = bsize_add(bsize_fromsize(consumed), bsize_fromint(1));
            char *name;
            if (need.is_overflow || !(name = malloc(need.value))) {
                BLog(BLOG_ERROR, "malloc failed");
                failed = 1;
            }
            else {
                memcpy(name, str, consumed);
                name[consumed] = '\0';
                
                if (!strcmp(name, "process")) {
                    token = NCD_TOKEN_PROCESS;
                    free(name);
                }
                else if (!strcmp(name, "template")) {
                    token = NCD_TOKEN_TEMPLATE;
                    free(name);
                }
                else {
                    // the copy is handed to output as the token value
                    token = NCD_TOKEN_NAME;
                    value = name;
                }
            }
        }
        else if (*str == '"') {
            ExpString estr;
            if (!ExpString_Init(&estr)) {
                BLog(BLOG_ERROR, "ExpString_Init failed");
                failed = 1;
            }
            else {
                int bad_string = 0;
                
                // start just past the opening quote and decode until the
                // closing quote or the end of input
                consumed = 1;
                while (consumed < left && str[consumed] != '"') {
                    char dec_ch;
                    
                    if (str[consumed] == '\\') {
                        // a backslash stands for whatever byte follows it
                        if (left - consumed < 2) {
                            BLog(BLOG_ERROR, "escape character found in string but nothing follows");
                            bad_string = 1;
                            break;
                        }
                        dec_ch = str[consumed + 1];
                        consumed += 2;
                    }
                    else {
                        dec_ch = str[consumed];
                        consumed++;
                    }
                    
                    // string cannot contain zero bytes
                    if (dec_ch == '\0') {
                        BLog(BLOG_ERROR, "string contains zero byte");
                        bad_string = 1;
                        break;
                    }
                    
                    if (!ExpString_AppendChar(&estr, dec_ch)) {
                        BLog(BLOG_ERROR, "ExpString_AppendChar failed");
                        bad_string = 1;
                        break;
                    }
                }
                
                // running out of input means the closing quote is missing
                if (!bad_string && consumed == left) {
                    BLog(BLOG_ERROR, "missing ending quote for string");
                    bad_string = 1;
                }
                
                if (bad_string) {
                    ExpString_Free(&estr);
                    failed = 1;
                }
                else {
                    // step over the closing quote
                    consumed++;
                    token = NCD_TOKEN_STRING;
                    value = ExpString_Get(&estr);
                }
            }
        }
        else if (is_space_char(*str)) {
            token = 0;
            consumed = 1;
        }
        else {
            BLog(BLOG_ERROR, "unrecognized character");
            failed = 1;
        }
        
        if (failed) {
            output(user, NCD_ERROR, NULL, cur_line, cur_col);
            return;
        }
        
        if (token && !output(user, token, value, cur_line, cur_col)) {
            return;
        }
        
        // step over the consumed bytes, keeping the position up to date
        for (size_t n = consumed; n > 0; n--) {
            char c = *str;
            str++;
            left--;
            if (c == '\n') {
                cur_line++;
                cur_col = 1;
            }
            else {
                cur_col++;
            }
        }
    }
    
    output(user, NCD_EOF, NULL, cur_line, cur_col);
}