/**
 * Pass a chunk of data to a hubbub parser for parsing
 *
 * \param parser  Parser instance to use
 * \param data    Data to parse (encoded in the input charset)
 * \param len     Length, in bytes, of data
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error hubbub_parser_parse_chunk(hubbub_parser *parser,
		const uint8_t *data, size_t len)
{
	parserutils_error perror;
	hubbub_error error;

	if (parser == NULL || data == NULL)
		return HUBBUB_BADPARM;

	/* Feed the raw bytes into the input stream */
	perror = parserutils_inputstream_append(parser->stream, data, len);
	if (perror != PARSERUTILS_OK)
		return hubbub_error_from_parserutils_error(perror);

	error = hubbub_tokeniser_run(parser->tok);
	if (error == HUBBUB_BADENCODING) {
		/* We autodetected an encoding that we don't actually
		 * support. No data has been processed at this point,
		 * so fall back to Windows-1252 and hope for the best. */
		perror = parserutils_inputstream_change_charset(
				parser->stream, "Windows-1252",
				HUBBUB_CHARSET_TENTATIVE);
		/* Under no circumstances should we get here if we've
		 * managed to process data. If there is a way, I want
		 * to know about it. */
		assert(perror != PARSERUTILS_INVALID);
		if (perror != PARSERUTILS_OK)
			return hubbub_error_from_parserutils_error(perror);

		/* Retry the tokenisation with the fallback charset */
		error = hubbub_tokeniser_run(parser->tok);
	}

	return error;
}
/**
 * Inform the parser that the last chunk of data has been parsed
 *
 * \param parser  Parser to inform
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error hubbub_parser_completed(hubbub_parser *parser)
{
	parserutils_error perror;

	if (parser == NULL)
		return HUBBUB_BADPARM;

	/* A NULL/zero-length append signals end-of-input to the stream */
	perror = parserutils_inputstream_append(parser->stream, NULL, 0);
	if (perror != PARSERUTILS_OK)
		return hubbub_error_from_parserutils_error(perror);

	/* Flush any pending tokens now that EOF is known */
	return hubbub_tokeniser_run(parser->tok);
}
/**
 * Pass a chunk of extraneous data to a hubbub parser for parsing
 *
 * \param parser  Parser instance to use
 * \param data    Data to parse (encoded in UTF-8)
 * \param len     Length, in bytes, of data
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error hubbub_parser_parse_extraneous_chunk(hubbub_parser *parser,
		const uint8_t *data, size_t len)
{
	parserutils_error perror;
	hubbub_error error;

	/** \todo In some cases, we don't actually want script-inserted
	 * data to be parsed until later. We'll need some way of flagging
	 * this through the public API, and the inputstream API will need
	 * some way of marking the insertion point so that, when the
	 * tokeniser is run, only the inserted chunk is parsed. */

	if (parser == NULL || data == NULL)
		return HUBBUB_BADPARM;

	/* Fix: parserutils_inputstream_insert() returns a
	 * parserutils_error, not a hubbub_error. Previously its result
	 * was stored in a hubbub_error, compared against HUBBUB_OK, and
	 * returned raw to the caller -- leaking an error code from the
	 * wrong enum domain. Map it like the sibling entry points do. */
	perror = parserutils_inputstream_insert(parser->stream, data, len);
	if (perror != PARSERUTILS_OK)
		return hubbub_error_from_parserutils_error(perror);

	error = hubbub_tokeniser_run(parser->tok);
	if (error != HUBBUB_OK)
		return error;

	return HUBBUB_OK;
}
/**
 * Run a single tokeniser test, once for each content model the test
 * specifies (or once in PCDATA if it specifies none).
 *
 * \param ctx  Test context: input bytes, expected output list, and
 *             optional content-model / last-start-tag settings
 *
 * Aborts via assert() on any library failure.
 */
void run_test(context *ctx)
{
	parserutils_inputstream *stream;
	hubbub_tokeniser *tok;
	hubbub_tokeniser_optparams params;
	int i, max_i;
	struct array_list *outputsave = ctx->output;

	if (ctx->content_model == NULL) {
		max_i = 1;
	} else {
		max_i = array_list_length(ctx->content_model);
	}

	/* We test for each of the content models specified */
	for (i = 0; i < max_i; i++) {
		/* Reset expected output */
		ctx->output = outputsave;
		ctx->output_index = 0;
		ctx->char_off = 0;

		assert(parserutils_inputstream_create("UTF-8", 0, NULL,
				&stream) == PARSERUTILS_OK);

		assert(hubbub_tokeniser_create(stream, &tok) == HUBBUB_OK);

		if (ctx->last_start_tag != NULL) {
			/* Fake up a start tag, in PCDATA state.
			 * len covers '<', the tag name, '>' and NUL. */
			size_t len = strlen(ctx->last_start_tag) + 3;
			uint8_t *buf = malloc(len);

			/* Fix: malloc result was previously used
			 * unchecked by snprintf() */
			assert(buf != NULL);

			snprintf((char *) buf, len, "<%s>",
					ctx->last_start_tag);

			/* len - 1: append the tag text without its NUL */
			assert(parserutils_inputstream_append(stream,
					buf, len - 1) == PARSERUTILS_OK);

			assert(hubbub_tokeniser_run(tok) == HUBBUB_OK);

			free(buf);
		}

		if (ctx->process_cdata) {
			params.process_cdata = ctx->process_cdata;
			/* Fix: "&params" had been mojibake-corrupted to
			 * a pilcrow sequence; restored */
			assert(hubbub_tokeniser_setopt(tok,
					HUBBUB_TOKENISER_PROCESS_CDATA,
					&params) == HUBBUB_OK);
		}

		params.token_handler.handler = token_handler;
		params.token_handler.pw = ctx;
		assert(hubbub_tokeniser_setopt(tok,
				HUBBUB_TOKENISER_TOKEN_HANDLER,
				&params) == HUBBUB_OK);

		if (ctx->content_model == NULL) {
			params.content_model.model =
					HUBBUB_CONTENT_MODEL_PCDATA;
		} else {
			const char *cm = json_object_get_string(
					(struct json_object *)
					array_list_get_idx(
						ctx->content_model, i));

			if (strcmp(cm, "PCDATA") == 0) {
				params.content_model.model =
						HUBBUB_CONTENT_MODEL_PCDATA;
			} else if (strcmp(cm, "RCDATA") == 0) {
				params.content_model.model =
						HUBBUB_CONTENT_MODEL_RCDATA;
			} else if (strcmp(cm, "CDATA") == 0) {
				params.content_model.model =
						HUBBUB_CONTENT_MODEL_CDATA;
			} else {
				params.content_model.model =
						HUBBUB_CONTENT_MODEL_PLAINTEXT;
			}
		}
		assert(hubbub_tokeniser_setopt(tok,
				HUBBUB_TOKENISER_CONTENT_MODEL,
				&params) == HUBBUB_OK);

		assert(parserutils_inputstream_append(stream,
				ctx->input, ctx->input_len) ==
				PARSERUTILS_OK);
		/* NULL/zero-length append marks end-of-input */
		assert(parserutils_inputstream_append(stream, NULL, 0) ==
				PARSERUTILS_OK);

		printf("Input: '%.*s' (%d)\n", (int) ctx->input_len,
				(const char *) ctx->input,
				(int) ctx->input_len);

		assert(hubbub_tokeniser_run(tok) == HUBBUB_OK);

		hubbub_tokeniser_destroy(tok);
		parserutils_inputstream_destroy(stream);
	}
}
int main(int argc, char **argv) { parserutils_inputstream *stream; hubbub_tokeniser *tok; hubbub_tokeniser_optparams params; FILE *fp; size_t len, origlen; #define CHUNK_SIZE (4096) uint8_t buf[CHUNK_SIZE]; if (argc != 2) { printf("Usage: %s <filename>\n", argv[0]); return 1; } assert(parserutils_inputstream_create("UTF-8", 0, NULL, myrealloc, NULL, &stream) == PARSERUTILS_OK); assert(hubbub_tokeniser_create(stream, myrealloc, NULL, &tok) == HUBBUB_OK); params.token_handler.handler = token_handler; params.token_handler.pw = NULL; assert(hubbub_tokeniser_setopt(tok, HUBBUB_TOKENISER_TOKEN_HANDLER, ¶ms) == HUBBUB_OK); fp = fopen(argv[1], "rb"); if (fp == NULL) { printf("Failed opening %s\n", argv[1]); return 1; } fseek(fp, 0, SEEK_END); origlen = len = ftell(fp); fseek(fp, 0, SEEK_SET); while (len > 0) { ssize_t bytes_read = fread(buf, 1, CHUNK_SIZE, fp); if (bytes_read < 1) break; assert(parserutils_inputstream_append(stream, buf, bytes_read) == HUBBUB_OK); len -= bytes_read; assert(hubbub_tokeniser_run(tok) == HUBBUB_OK); } assert(len == 0); fclose(fp); hubbub_tokeniser_destroy(tok); parserutils_inputstream_destroy(stream); printf("PASS\n"); return 0; }