/**
 * Inform the parser that the last chunk of data has been parsed
 *
 * \param parser  Parser to inform
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error hubbub_parser_completed(hubbub_parser *parser)
{
	parserutils_error perror;

	if (parser == NULL)
		return HUBBUB_BADPARM;

	/* A zero-length NULL append signals end-of-stream to the
	 * input stream */
	perror = parserutils_inputstream_append(parser->stream, NULL, 0);
	if (perror != PARSERUTILS_OK)
		return hubbub_error_from_parserutils_error(perror);

	/* Drain anything still buffered through the tokeniser */
	return hubbub_tokeniser_run(parser->tok);
}
/**
 * Pass a chunk of data to a hubbub parser for parsing
 *
 * \param parser  Parser instance to use
 * \param data    Data to parse (encoded in the input charset)
 * \param len     Length, in bytes, of data
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error hubbub_parser_parse_chunk(hubbub_parser *parser,
		const uint8_t *data, size_t len)
{
	parserutils_error perror;
	hubbub_error err;

	if (parser == NULL || data == NULL)
		return HUBBUB_BADPARM;

	perror = parserutils_inputstream_append(parser->stream, data, len);
	if (perror != PARSERUTILS_OK)
		return hubbub_error_from_parserutils_error(perror);

	err = hubbub_tokeniser_run(parser->tok);
	if (err != HUBBUB_BADENCODING)
		return err;

	/* Ok, we autodetected an encoding that we don't actually
	 * support. We've not actually processed any data at this
	 * point so fall back to Windows-1252 and hope for the best */
	perror = parserutils_inputstream_change_charset(parser->stream,
			"Windows-1252", HUBBUB_CHARSET_TENTATIVE);

	/* Under no circumstances should we get here if we've managed
	 * to process data. If there is a way, I want to know about it */
	assert(perror != PARSERUTILS_INVALID);
	if (perror != PARSERUTILS_OK)
		return hubbub_error_from_parserutils_error(perror);

	/* Retry the tokenisation with the replacement charset */
	return hubbub_tokeniser_run(parser->tok);
}
int main(int argc, char **argv) { parserutils_inputstream *stream; /* This is specially calculated so that the inputstream is forced to * reallocate (it assumes that the inputstream's buffer chunk size * is 4k) */ #define BUFFER_SIZE (4096 + 4) uint8_t input_buffer[BUFFER_SIZE]; // uint8_t *buffer; // size_t buflen; const uint8_t *c; size_t clen; UNUSED(argc); UNUSED(argv); /* Populate the buffer with something sane */ memset(input_buffer, 'a', BUFFER_SIZE); /* Now, set up our test data */ input_buffer[BUFFER_SIZE - 1] = '5'; input_buffer[BUFFER_SIZE - 2] = '4'; input_buffer[BUFFER_SIZE - 3] = '\xbd'; input_buffer[BUFFER_SIZE - 4] = '\xbf'; /* This byte will occupy the 4095th byte in the buffer and * thus cause the entirety of U+FFFD to be buffered until after * the buffer has been enlarged */ input_buffer[BUFFER_SIZE - 5] = '\xef'; input_buffer[BUFFER_SIZE - 6] = '3'; input_buffer[BUFFER_SIZE - 7] = '2'; input_buffer[BUFFER_SIZE - 8] = '1'; assert(parserutils_inputstream_create("UTF-8", 0, NULL, myrealloc, NULL, &stream) == PARSERUTILS_OK); assert(parserutils_inputstream_append(stream, input_buffer, BUFFER_SIZE) == PARSERUTILS_OK); assert(parserutils_inputstream_append(stream, NULL, 0) == PARSERUTILS_OK); while (parserutils_inputstream_peek(stream, 0, &c, &clen) != PARSERUTILS_EOF) parserutils_inputstream_advance(stream, clen); /* assert(css_inputstream_claim_buffer(stream, &buffer, &buflen) == CSS_OK); assert(buflen == BUFFER_SIZE); printf("Buffer: '%.*s'\n", 8, buffer + (BUFFER_SIZE - 8)); assert( buffer[BUFFER_SIZE - 6] == '3' && buffer[BUFFER_SIZE - 5] == (uint8_t) '\xef' && buffer[BUFFER_SIZE - 4] == (uint8_t) '\xbf' && buffer[BUFFER_SIZE - 3] == (uint8_t) '\xbd' && buffer[BUFFER_SIZE - 2] == '4'); free(buffer); */ parserutils_inputstream_destroy(stream); printf("PASS\n"); return 0; }
/**
 * Run a single tokeniser test, once per requested content model.
 *
 * \param ctx  Test context carrying input, expected output, and options
 *
 * Fixes applied: the address-of operator on \c params had been
 * mis-encoded (garbled "&para;ms" instead of "&params"), and the
 * malloc for the faked start tag was used unchecked.
 */
void run_test(context *ctx)
{
	parserutils_inputstream *stream;
	hubbub_tokeniser *tok;
	hubbub_tokeniser_optparams params;
	int i, max_i;
	struct array_list *outputsave = ctx->output;

	if (ctx->content_model == NULL) {
		max_i = 1;
	} else {
		max_i = array_list_length(ctx->content_model);
	}

	/* We test for each of the content models specified */
	for (i = 0; i < max_i; i++) {
		/* Reset expected output */
		ctx->output = outputsave;
		ctx->output_index = 0;
		ctx->char_off = 0;

		assert(parserutils_inputstream_create("UTF-8", 0, NULL,
				&stream) == PARSERUTILS_OK);

		assert(hubbub_tokeniser_create(stream, &tok) == HUBBUB_OK);

		if (ctx->last_start_tag != NULL) {
			/* Fake up a start tag, in PCDATA state.
			 * len covers '<', '>' and the NUL terminator */
			size_t len = strlen(ctx->last_start_tag) + 3;
			uint8_t *buf = malloc(len);

			/* In a test harness, aborting on OOM is fine;
			 * previously the result was used unchecked */
			assert(buf != NULL);

			snprintf((char *) buf, len, "<%s>",
					ctx->last_start_tag);

			/* len - 1: do not feed the NUL to the stream */
			assert(parserutils_inputstream_append(stream,
					buf, len - 1) == PARSERUTILS_OK);

			assert(hubbub_tokeniser_run(tok) == HUBBUB_OK);

			free(buf);
		}

		if (ctx->process_cdata) {
			params.process_cdata = ctx->process_cdata;
			assert(hubbub_tokeniser_setopt(tok,
					HUBBUB_TOKENISER_PROCESS_CDATA,
					&params) == HUBBUB_OK);
		}

		params.token_handler.handler = token_handler;
		params.token_handler.pw = ctx;
		assert(hubbub_tokeniser_setopt(tok,
				HUBBUB_TOKENISER_TOKEN_HANDLER,
				&params) == HUBBUB_OK);

		if (ctx->content_model == NULL) {
			params.content_model.model =
					HUBBUB_CONTENT_MODEL_PCDATA;
		} else {
			const char *cm = json_object_get_string(
					(struct json_object *)
					array_list_get_idx(
						ctx->content_model, i));

			if (strcmp(cm, "PCDATA") == 0) {
				params.content_model.model =
						HUBBUB_CONTENT_MODEL_PCDATA;
			} else if (strcmp(cm, "RCDATA") == 0) {
				params.content_model.model =
						HUBBUB_CONTENT_MODEL_RCDATA;
			} else if (strcmp(cm, "CDATA") == 0) {
				params.content_model.model =
						HUBBUB_CONTENT_MODEL_CDATA;
			} else {
				params.content_model.model =
						HUBBUB_CONTENT_MODEL_PLAINTEXT;
			}
		}
		assert(hubbub_tokeniser_setopt(tok,
				HUBBUB_TOKENISER_CONTENT_MODEL,
				&params) == HUBBUB_OK);

		assert(parserutils_inputstream_append(stream,
				ctx->input, ctx->input_len) ==
				PARSERUTILS_OK);
		assert(parserutils_inputstream_append(stream, NULL, 0) ==
				PARSERUTILS_OK);

		printf("Input: '%.*s' (%d)\n", (int) ctx->input_len,
				(const char *) ctx->input,
				(int) ctx->input_len);

		assert(hubbub_tokeniser_run(tok) == HUBBUB_OK);

		hubbub_tokeniser_destroy(tok);
		parserutils_inputstream_destroy(stream);
	}
}
int main(int argc, char **argv) { parserutils_inputstream *stream; hubbub_tokeniser *tok; hubbub_tokeniser_optparams params; FILE *fp; size_t len, origlen; #define CHUNK_SIZE (4096) uint8_t buf[CHUNK_SIZE]; if (argc != 2) { printf("Usage: %s <filename>\n", argv[0]); return 1; } assert(parserutils_inputstream_create("UTF-8", 0, NULL, myrealloc, NULL, &stream) == PARSERUTILS_OK); assert(hubbub_tokeniser_create(stream, myrealloc, NULL, &tok) == HUBBUB_OK); params.token_handler.handler = token_handler; params.token_handler.pw = NULL; assert(hubbub_tokeniser_setopt(tok, HUBBUB_TOKENISER_TOKEN_HANDLER, ¶ms) == HUBBUB_OK); fp = fopen(argv[1], "rb"); if (fp == NULL) { printf("Failed opening %s\n", argv[1]); return 1; } fseek(fp, 0, SEEK_END); origlen = len = ftell(fp); fseek(fp, 0, SEEK_SET); while (len > 0) { ssize_t bytes_read = fread(buf, 1, CHUNK_SIZE, fp); if (bytes_read < 1) break; assert(parserutils_inputstream_append(stream, buf, bytes_read) == HUBBUB_OK); len -= bytes_read; assert(hubbub_tokeniser_run(tok) == HUBBUB_OK); } assert(len == 0); fclose(fp); hubbub_tokeniser_destroy(tok); parserutils_inputstream_destroy(stream); printf("PASS\n"); return 0; }