/**
 * Create a hubbub parser
 *
 * Allocates the parser object with the client allocator, then builds its
 * input stream, tokeniser and treebuilder in that order. If any stage
 * fails, everything created so far is torn down again and ::parser is
 * left untouched.
 *
 * \param enc      Source document encoding, or NULL to autodetect
 * \param fix_enc  Permit fixing up of encoding if it's frequently misused
 * \param alloc    Memory (de)allocation function
 * \param pw       Pointer to client-specific private data (may be NULL)
 * \param parser   Pointer to location to receive parser instance
 * \return HUBBUB_OK on success,
 *         HUBBUB_BADPARM on bad parameters,
 *         HUBBUB_NOMEM on memory exhaustion,
 *         HUBBUB_BADENCODING if ::enc is unsupported
 */
hubbub_error hubbub_parser_create(const char *enc, bool fix_enc,
		hubbub_allocator_fn alloc, void *pw, hubbub_parser **parser)
{
	hubbub_parser *instance;
	parserutils_error stream_status;
	hubbub_error status;

	if (parser == NULL || alloc == NULL)
		return HUBBUB_BADPARM;

	instance = alloc(NULL, sizeof(hubbub_parser), pw);
	if (instance == NULL)
		return HUBBUB_NOMEM;

	/* When the caller named an encoding and allows fix-ups, round-trip
	 * the name through its MIB enum so that
	 * hubbub_charset_fix_charset() can adjust it. Names that do not
	 * resolve to a MIB enum (0) are passed through unchanged. */
	if (fix_enc && enc != NULL) {
		uint16_t mib = parserutils_charset_mibenum_from_name(enc,
				strlen(enc));

		if (mib != 0) {
			hubbub_charset_fix_charset(&mib);
			enc = parserutils_charset_mibenum_to_name(mib);
		}
	}

	/* A named encoding is treated as confident; otherwise it is unknown
	 * and hubbub_charset_extract is supplied for detection. */
	stream_status = parserutils_inputstream_create(enc,
			enc != NULL ? HUBBUB_CHARSET_CONFIDENT
				    : HUBBUB_CHARSET_UNKNOWN,
			hubbub_charset_extract, alloc, pw, &instance->stream);
	if (stream_status != PARSERUTILS_OK) {
		alloc(instance, 0, pw);
		return hubbub_error_from_parserutils_error(stream_status);
	}

	status = hubbub_tokeniser_create(instance->stream, alloc, pw,
			&instance->tok);
	if (status != HUBBUB_OK)
		goto destroy_stream;

	status = hubbub_treebuilder_create(instance->tok, alloc, pw,
			&instance->tb);
	if (status != HUBBUB_OK)
		goto destroy_tokeniser;

	instance->alloc = alloc;
	instance->pw = pw;

	*parser = instance;

	return HUBBUB_OK;

	/* Unwind in reverse order of construction */
destroy_tokeniser:
	hubbub_tokeniser_destroy(instance->tok);
destroy_stream:
	parserutils_inputstream_destroy(instance->stream);
	alloc(instance, 0, pw);

	return status;
}
/**
 * Run a single tokeniser test case against every requested content model.
 *
 * For each entry in ctx->content_model (or once, with PCDATA, when that
 * list is NULL) a fresh UTF-8 input stream and tokeniser are created, the
 * expected-output cursor in ctx is rewound, ctx->input is fed through the
 * tokeniser with token_handler as the callback, and both objects are
 * destroyed again.
 *
 * Every library call is wrapped in assert(), so the test only does its
 * work when assertions are enabled (i.e. NDEBUG is not defined).
 *
 * \param ctx  Test context: input data, expected output, and options
 */
void run_test(context *ctx)
{
	parserutils_inputstream *stream;
	hubbub_tokeniser *tok;
	hubbub_tokeniser_optparams params;
	int i, max_i;
	struct array_list *outputsave = ctx->output;

	if (ctx->content_model == NULL) {
		max_i = 1;
	} else {
		max_i = array_list_length(ctx->content_model);
	}

	/* We test for each of the content models specified */
	for (i = 0; i < max_i; i++) {
		/* Reset expected output */
		ctx->output = outputsave;
		ctx->output_index = 0;
		ctx->char_off = 0;

		assert(parserutils_inputstream_create("UTF-8", 0, NULL,
				&stream) == PARSERUTILS_OK);

		assert(hubbub_tokeniser_create(stream, &tok) == HUBBUB_OK);

		if (ctx->last_start_tag != NULL) {
			/* Fake up a start tag, in PCDATA state.
			 * +3 covers '<', '>' and the terminating NUL. */
			size_t len = strlen(ctx->last_start_tag) + 3;
			uint8_t *buf = malloc(len);

			/* Do not format into a failed allocation */
			assert(buf != NULL);

			snprintf((char *) buf, len, "<%s>",
					ctx->last_start_tag);

			/* len - 1: the NUL is not part of the document */
			assert(parserutils_inputstream_append(stream,
					buf, len - 1) == PARSERUTILS_OK);

			assert(hubbub_tokeniser_run(tok) == HUBBUB_OK);

			free(buf);
		}

		if (ctx->process_cdata) {
			params.process_cdata = ctx->process_cdata;
			assert(hubbub_tokeniser_setopt(tok,
					HUBBUB_TOKENISER_PROCESS_CDATA,
					&params) == HUBBUB_OK);
		}

		params.token_handler.handler = token_handler;
		params.token_handler.pw = ctx;
		assert(hubbub_tokeniser_setopt(tok,
				HUBBUB_TOKENISER_TOKEN_HANDLER,
				&params) == HUBBUB_OK);

		if (ctx->content_model == NULL) {
			params.content_model.model =
					HUBBUB_CONTENT_MODEL_PCDATA;
		} else {
			const char *cm = json_object_get_string(
					(struct json_object *)
					array_list_get_idx(ctx->content_model, i));

			/* Any name other than PCDATA/RCDATA/CDATA is
			 * treated as PLAINTEXT */
			if (strcmp(cm, "PCDATA") == 0) {
				params.content_model.model =
						HUBBUB_CONTENT_MODEL_PCDATA;
			} else if (strcmp(cm, "RCDATA") == 0) {
				params.content_model.model =
						HUBBUB_CONTENT_MODEL_RCDATA;
			} else if (strcmp(cm, "CDATA") == 0) {
				params.content_model.model =
						HUBBUB_CONTENT_MODEL_CDATA;
			} else {
				params.content_model.model =
						HUBBUB_CONTENT_MODEL_PLAINTEXT;
			}
		}
		assert(hubbub_tokeniser_setopt(tok,
				HUBBUB_TOKENISER_CONTENT_MODEL,
				&params) == HUBBUB_OK);

		assert(parserutils_inputstream_append(stream,
				ctx->input, ctx->input_len) == PARSERUTILS_OK);

		/* A NULL/0 append follows the real input; presumably this
		 * marks end of data for the stream -- confirm against the
		 * parserutils documentation */
		assert(parserutils_inputstream_append(stream, NULL, 0) ==
				PARSERUTILS_OK);

		printf("Input: '%.*s' (%d)\n", (int) ctx->input_len,
				(const char *) ctx->input,
				(int) ctx->input_len);

		assert(hubbub_tokeniser_run(tok) == HUBBUB_OK);

		hubbub_tokeniser_destroy(tok);

		parserutils_inputstream_destroy(stream);
	}
}
int main(int argc, char **argv) { parserutils_inputstream *stream; /* This is specially calculated so that the inputstream is forced to * reallocate (it assumes that the inputstream's buffer chunk size * is 4k) */ #define BUFFER_SIZE (4096 + 4) uint8_t input_buffer[BUFFER_SIZE]; // uint8_t *buffer; // size_t buflen; const uint8_t *c; size_t clen; UNUSED(argc); UNUSED(argv); /* Populate the buffer with something sane */ memset(input_buffer, 'a', BUFFER_SIZE); /* Now, set up our test data */ input_buffer[BUFFER_SIZE - 1] = '5'; input_buffer[BUFFER_SIZE - 2] = '4'; input_buffer[BUFFER_SIZE - 3] = '\xbd'; input_buffer[BUFFER_SIZE - 4] = '\xbf'; /* This byte will occupy the 4095th byte in the buffer and * thus cause the entirety of U+FFFD to be buffered until after * the buffer has been enlarged */ input_buffer[BUFFER_SIZE - 5] = '\xef'; input_buffer[BUFFER_SIZE - 6] = '3'; input_buffer[BUFFER_SIZE - 7] = '2'; input_buffer[BUFFER_SIZE - 8] = '1'; assert(parserutils_inputstream_create("UTF-8", 0, NULL, myrealloc, NULL, &stream) == PARSERUTILS_OK); assert(parserutils_inputstream_append(stream, input_buffer, BUFFER_SIZE) == PARSERUTILS_OK); assert(parserutils_inputstream_append(stream, NULL, 0) == PARSERUTILS_OK); while (parserutils_inputstream_peek(stream, 0, &c, &clen) != PARSERUTILS_EOF) parserutils_inputstream_advance(stream, clen); /* assert(css_inputstream_claim_buffer(stream, &buffer, &buflen) == CSS_OK); assert(buflen == BUFFER_SIZE); printf("Buffer: '%.*s'\n", 8, buffer + (BUFFER_SIZE - 8)); assert( buffer[BUFFER_SIZE - 6] == '3' && buffer[BUFFER_SIZE - 5] == (uint8_t) '\xef' && buffer[BUFFER_SIZE - 4] == (uint8_t) '\xbf' && buffer[BUFFER_SIZE - 3] == (uint8_t) '\xbd' && buffer[BUFFER_SIZE - 2] == '4'); free(buffer); */ parserutils_inputstream_destroy(stream); printf("PASS\n"); return 0; }
int main(int argc, char **argv) { parserutils_inputstream *stream; hubbub_tokeniser *tok; hubbub_tokeniser_optparams params; FILE *fp; size_t len, origlen; #define CHUNK_SIZE (4096) uint8_t buf[CHUNK_SIZE]; if (argc != 2) { printf("Usage: %s <filename>\n", argv[0]); return 1; } assert(parserutils_inputstream_create("UTF-8", 0, NULL, myrealloc, NULL, &stream) == PARSERUTILS_OK); assert(hubbub_tokeniser_create(stream, myrealloc, NULL, &tok) == HUBBUB_OK); params.token_handler.handler = token_handler; params.token_handler.pw = NULL; assert(hubbub_tokeniser_setopt(tok, HUBBUB_TOKENISER_TOKEN_HANDLER, ¶ms) == HUBBUB_OK); fp = fopen(argv[1], "rb"); if (fp == NULL) { printf("Failed opening %s\n", argv[1]); return 1; } fseek(fp, 0, SEEK_END); origlen = len = ftell(fp); fseek(fp, 0, SEEK_SET); while (len > 0) { ssize_t bytes_read = fread(buf, 1, CHUNK_SIZE, fp); if (bytes_read < 1) break; assert(parserutils_inputstream_append(stream, buf, bytes_read) == HUBBUB_OK); len -= bytes_read; assert(hubbub_tokeniser_run(tok) == HUBBUB_OK); } assert(len == 0); fclose(fp); hubbub_tokeniser_destroy(tok); parserutils_inputstream_destroy(stream); printf("PASS\n"); return 0; }