/**
 * Construct a new hubbub parser instance.
 *
 * The parser owns three sub-objects which are built in order: an input
 * stream, a tokeniser reading from that stream, and a treebuilder fed by
 * that tokeniser. If any stage fails, everything built so far is torn
 * down again and nothing is written to ::parser.
 *
 * \param enc      Source document encoding, or NULL to autodetect
 * \param fix_enc  Permit fixing up of encoding if it's frequently misused
 * \param alloc    Memory (de)allocation function
 * \param pw       Pointer to client-specific private data (may be NULL)
 * \param parser   Pointer to location to receive parser instance
 * \return HUBBUB_OK on success,
 *         HUBBUB_BADPARM on bad parameters,
 *         HUBBUB_NOMEM on memory exhaustion,
 *         HUBBUB_BADENCODING if ::enc is unsupported
 */
hubbub_error hubbub_parser_create(const char *enc, bool fix_enc,
		hubbub_allocator_fn alloc, void *pw, hubbub_parser **parser)
{
	hubbub_parser *inst;
	parserutils_error stream_err;
	hubbub_error err;

	if (alloc == NULL || parser == NULL)
		return HUBBUB_BADPARM;

	inst = alloc(NULL, sizeof(*inst), pw);
	if (inst == NULL)
		return HUBBUB_NOMEM;

	/* When the caller supplied an encoding and allows corrections,
	 * map the name to its MIB enum, let the charset fixer adjust it,
	 * and map it back to a name. Unrecognised names (MIB enum of 0)
	 * are passed through untouched. */
	if (enc != NULL && fix_enc == true) {
		uint16_t mib = parserutils_charset_mibenum_from_name(enc,
				strlen(enc));

		if (mib != 0) {
			hubbub_charset_fix_charset(&mib);
			enc = parserutils_charset_mibenum_to_name(mib);
		}
	}

	/* A known encoding is treated as confident; otherwise the stream
	 * is told the charset is unknown. */
	stream_err = parserutils_inputstream_create(enc,
			enc != NULL ? HUBBUB_CHARSET_CONFIDENT
				    : HUBBUB_CHARSET_UNKNOWN,
			hubbub_charset_extract, alloc, pw, &inst->stream);
	if (stream_err != PARSERUTILS_OK) {
		alloc(inst, 0, pw);
		return hubbub_error_from_parserutils_error(stream_err);
	}

	err = hubbub_tokeniser_create(inst->stream, alloc, pw, &inst->tok);
	if (err != HUBBUB_OK)
		goto destroy_stream;

	err = hubbub_treebuilder_create(inst->tok, alloc, pw, &inst->tb);
	if (err != HUBBUB_OK)
		goto destroy_tokeniser;

	/* Remember the allocator so the parser can be released later */
	inst->alloc = alloc;
	inst->pw = pw;

	*parser = inst;

	return HUBBUB_OK;

	/* Failure unwinding: release in reverse order of construction */
destroy_tokeniser:
	hubbub_tokeniser_destroy(inst->tok);
destroy_stream:
	parserutils_inputstream_destroy(inst->stream);
	alloc(inst, 0, pw);

	return err;
}
/**
 * Destroy a hubbub parser
 *
 * Tears down the treebuilder, tokeniser and input stream owned by the
 * parser (in that order) and then releases the parser object itself.
 *
 * \param parser  Parser instance to destroy
 * \return HUBBUB_OK on success,
 *         HUBBUB_BADPARM if ::parser is NULL
 */
hubbub_error hubbub_parser_destroy(hubbub_parser *parser)
{
	if (parser == NULL)
		return HUBBUB_BADPARM;

	hubbub_treebuilder_destroy(parser->tb);

	hubbub_tokeniser_destroy(parser->tok);

	parserutils_inputstream_destroy(parser->stream);

	/* The parser was obtained from the client's allocation function in
	 * hubbub_parser_create(), so it must be released through that same
	 * function (size 0 == free) rather than the C library's free(). */
	parser->alloc(parser, 0, parser->pw);

	return HUBBUB_OK;
}
/**
 * Run one tokeniser test case.
 *
 * The input in ::ctx is tokenised once for every entry in
 * ctx->content_model (or exactly once, in PCDATA, when no content models
 * are given). A fresh input stream and tokeniser are created for each
 * pass, and emitted tokens are delivered to token_handler() with ::ctx as
 * its private data.
 *
 * Library calls are performed outside of assert() so that they are still
 * executed when the file is compiled with NDEBUG; only the result checks
 * are compiled out in that case.
 *
 * \param ctx  Test context (input data, expected output, options)
 */
void run_test(context *ctx)
{
	parserutils_inputstream *stream;
	hubbub_tokeniser *tok;
	hubbub_tokeniser_optparams params;
	parserutils_error perr;
	hubbub_error err;
	int i, max_i;
	struct array_list *outputsave = ctx->output;

	if (ctx->content_model == NULL) {
		max_i = 1;
	} else {
		max_i = array_list_length(ctx->content_model);
	}

	/* We test for each of the content models specified */
	for (i = 0; i < max_i; i++) {
		/* Reset expected output */
		ctx->output = outputsave;
		ctx->output_index = 0;
		ctx->char_off = 0;

		perr = parserutils_inputstream_create("UTF-8", 0, NULL,
				&stream);
		assert(perr == PARSERUTILS_OK);

		err = hubbub_tokeniser_create(stream, &tok);
		assert(err == HUBBUB_OK);

		if (ctx->last_start_tag != NULL) {
			/* Fake up a start tag, in PCDATA state.
			 * +3 covers '<', '>' and the terminating NUL. */
			size_t len = strlen(ctx->last_start_tag) + 3;
			uint8_t *buf = malloc(len);

			assert(buf != NULL);

			snprintf((char *) buf, len, "<%s>",
					ctx->last_start_tag);

			/* len - 1: do not feed the NUL to the stream */
			perr = parserutils_inputstream_append(stream,
					buf, len - 1);
			assert(perr == PARSERUTILS_OK);

			err = hubbub_tokeniser_run(tok);
			assert(err == HUBBUB_OK);

			free(buf);
		}

		if (ctx->process_cdata) {
			params.process_cdata = ctx->process_cdata;
			err = hubbub_tokeniser_setopt(tok,
					HUBBUB_TOKENISER_PROCESS_CDATA,
					&params);
			assert(err == HUBBUB_OK);
		}

		/* Install the token handler only after the fake start tag
		 * has been consumed, so that tag is not reported to it. */
		params.token_handler.handler = token_handler;
		params.token_handler.pw = ctx;
		err = hubbub_tokeniser_setopt(tok,
				HUBBUB_TOKENISER_TOKEN_HANDLER,
				&params);
		assert(err == HUBBUB_OK);

		if (ctx->content_model == NULL) {
			params.content_model.model =
					HUBBUB_CONTENT_MODEL_PCDATA;
		} else {
			const char *cm = json_object_get_string(
					(struct json_object *)
					array_list_get_idx(ctx->content_model,
							i));

			if (strcmp(cm, "PCDATA") == 0) {
				params.content_model.model =
						HUBBUB_CONTENT_MODEL_PCDATA;
			} else if (strcmp(cm, "RCDATA") == 0) {
				params.content_model.model =
						HUBBUB_CONTENT_MODEL_RCDATA;
			} else if (strcmp(cm, "CDATA") == 0) {
				params.content_model.model =
						HUBBUB_CONTENT_MODEL_CDATA;
			} else {
				/* Any other name selects PLAINTEXT */
				params.content_model.model =
						HUBBUB_CONTENT_MODEL_PLAINTEXT;
			}
		}
		err = hubbub_tokeniser_setopt(tok,
				HUBBUB_TOKENISER_CONTENT_MODEL,
				&params);
		assert(err == HUBBUB_OK);

		perr = parserutils_inputstream_append(stream,
				ctx->input, ctx->input_len);
		assert(perr == PARSERUTILS_OK);

		/* A NULL/0 append follows the real data, before the final
		 * tokeniser run */
		perr = parserutils_inputstream_append(stream, NULL, 0);
		assert(perr == PARSERUTILS_OK);

		printf("Input: '%.*s' (%d)\n", (int) ctx->input_len,
				(const char *) ctx->input,
				(int) ctx->input_len);

		err = hubbub_tokeniser_run(tok);
		assert(err == HUBBUB_OK);

		hubbub_tokeniser_destroy(tok);

		parserutils_inputstream_destroy(stream);
	}
}
int main(int argc, char **argv) { parserutils_inputstream *stream; hubbub_tokeniser *tok; hubbub_tokeniser_optparams params; FILE *fp; size_t len, origlen; #define CHUNK_SIZE (4096) uint8_t buf[CHUNK_SIZE]; if (argc != 2) { printf("Usage: %s <filename>\n", argv[0]); return 1; } assert(parserutils_inputstream_create("UTF-8", 0, NULL, myrealloc, NULL, &stream) == PARSERUTILS_OK); assert(hubbub_tokeniser_create(stream, myrealloc, NULL, &tok) == HUBBUB_OK); params.token_handler.handler = token_handler; params.token_handler.pw = NULL; assert(hubbub_tokeniser_setopt(tok, HUBBUB_TOKENISER_TOKEN_HANDLER, ¶ms) == HUBBUB_OK); fp = fopen(argv[1], "rb"); if (fp == NULL) { printf("Failed opening %s\n", argv[1]); return 1; } fseek(fp, 0, SEEK_END); origlen = len = ftell(fp); fseek(fp, 0, SEEK_SET); while (len > 0) { ssize_t bytes_read = fread(buf, 1, CHUNK_SIZE, fp); if (bytes_read < 1) break; assert(parserutils_inputstream_append(stream, buf, bytes_read) == HUBBUB_OK); len -= bytes_read; assert(hubbub_tokeniser_run(tok) == HUBBUB_OK); } assert(len == 0); fclose(fp); hubbub_tokeniser_destroy(tok); parserutils_inputstream_destroy(stream); printf("PASS\n"); return 0; }