Example 1
/**
 * Pass a chunk of data to a hubbub parser for parsing
 *
 * \param parser  Parser instance to use
 * \param data    Data to parse (encoded in the input charset)
 * \param len     Length, in bytes, of data
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error hubbub_parser_parse_chunk(hubbub_parser *parser,
		const uint8_t *data, size_t len)
{
	parserutils_error stream_error;
	hubbub_error result;

	if (parser == NULL || data == NULL)
		return HUBBUB_BADPARM;

	/* Feed the chunk into the input stream first */
	stream_error = parserutils_inputstream_append(parser->stream,
			data, len);
	if (stream_error != PARSERUTILS_OK)
		return hubbub_error_from_parserutils_error(stream_error);

	result = hubbub_tokeniser_run(parser->tok);
	if (result != HUBBUB_BADENCODING)
		return result;

	/* Ok, we autodetected an encoding that we don't actually
	 * support. We've not actually processed any data at this
	 * point so fall back to Windows-1252 and hope for the best
	 */
	stream_error = parserutils_inputstream_change_charset(parser->stream,
			"Windows-1252", HUBBUB_CHARSET_TENTATIVE);
	/* Under no circumstances should we get here if we've managed
	 * to process data. If there is a way, I want to know about it
	 */
	assert(stream_error != PARSERUTILS_INVALID);
	if (stream_error != PARSERUTILS_OK)
		return hubbub_error_from_parserutils_error(stream_error);

	/* Retry the tokenisation with the fallback charset */
	return hubbub_tokeniser_run(parser->tok);
}
Example 2
/**
 * Inform the parser that the last chunk of data has been parsed
 *
 * \param parser  Parser to inform
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error hubbub_parser_completed(hubbub_parser *parser)
{
	parserutils_error stream_error;

	if (parser == NULL)
		return HUBBUB_BADPARM;

	/* A zero-length append signals end-of-input to the stream */
	stream_error = parserutils_inputstream_append(parser->stream,
			NULL, 0);
	if (stream_error != PARSERUTILS_OK)
		return hubbub_error_from_parserutils_error(stream_error);

	/* Drain any remaining tokens now the input is complete */
	return hubbub_tokeniser_run(parser->tok);
}
Example 3
/**
 * Pass a chunk of extraneous data to a hubbub parser for parsing
 *
 * \param parser  Parser instance to use
 * \param data    Data to parse (encoded in UTF-8)
 * \param len     Length, in bytes, of data
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error hubbub_parser_parse_extraneous_chunk(hubbub_parser *parser,
		const uint8_t *data, size_t len)
{
	parserutils_error perror;
	hubbub_error error;

	/** \todo In some cases, we don't actually want script-inserted
	 * data to be parsed until later. We'll need some way of flagging
	 * this through the public API, and the inputstream API will need
	 * some way of marking the insertion point so that, when the
	 * tokeniser is run, only the inserted chunk is parsed. */

	if (parser == NULL || data == NULL)
		return HUBBUB_BADPARM;

	/* parserutils_inputstream_insert returns a parserutils_error;
	 * convert it to the hubbub error space like the sibling entry
	 * points do, rather than comparing enums from different
	 * libraries directly. */
	perror = parserutils_inputstream_insert(parser->stream, data, len);
	if (perror != PARSERUTILS_OK)
		return hubbub_error_from_parserutils_error(perror);

	error = hubbub_tokeniser_run(parser->tok);
	if (error != HUBBUB_OK)
		return error;

	return HUBBUB_OK;
}
Example 4
/**
 * Run a single tokeniser test, once per requested content model.
 *
 * \param ctx  Test context carrying input bytes, expected output, and
 *             optional content-model / last-start-tag / CDATA settings
 */
void run_test(context *ctx)
{
	parserutils_inputstream *stream;
	hubbub_tokeniser *tok;
	hubbub_tokeniser_optparams params;
	int i, max_i;
	struct array_list *outputsave = ctx->output;

	/* With no explicit content model list, run exactly one pass */
	if (ctx->content_model == NULL) {
		max_i = 1;
	} else {
		max_i = array_list_length(ctx->content_model);
	}

	/* We test for each of the content models specified */
	for (i = 0; i < max_i; i++) {
		/* Reset expected output */
		ctx->output = outputsave;
		ctx->output_index = 0;
		ctx->char_off = 0;

		assert(parserutils_inputstream_create("UTF-8", 0, NULL,
				&stream) == PARSERUTILS_OK);

		assert(hubbub_tokeniser_create(stream, &tok) == HUBBUB_OK);

		if (ctx->last_start_tag != NULL) {
			/* Fake up a start tag, in PCDATA state:
			 * "<" + tag + ">" + NUL => strlen(tag) + 3 bytes */
			size_t len = strlen(ctx->last_start_tag) + 3;
			uint8_t *buf = malloc(len);

			/* This test harness treats any failure as fatal */
			assert(buf != NULL);

			snprintf((char *) buf, len, "<%s>", 
					ctx->last_start_tag);

			/* Append without the trailing NUL */
			assert(parserutils_inputstream_append(stream,
				buf, len - 1) == PARSERUTILS_OK);

			assert(hubbub_tokeniser_run(tok) == HUBBUB_OK);

			free(buf);
		}

		if (ctx->process_cdata) {
			params.process_cdata = ctx->process_cdata;
			assert(hubbub_tokeniser_setopt(tok,
					HUBBUB_TOKENISER_PROCESS_CDATA,
					&params) == HUBBUB_OK);
		}

		params.token_handler.handler = token_handler;
		params.token_handler.pw = ctx;
		assert(hubbub_tokeniser_setopt(tok,
				HUBBUB_TOKENISER_TOKEN_HANDLER,
				&params) == HUBBUB_OK);

		/* Select the content model for this pass; unknown names
		 * fall back to PLAINTEXT */
		if (ctx->content_model == NULL) {
			params.content_model.model =
					HUBBUB_CONTENT_MODEL_PCDATA;
		} else {
			const char *cm = json_object_get_string(
				(struct json_object *)
				array_list_get_idx(ctx->content_model, i));

			if (strcmp(cm, "PCDATA") == 0) {
				params.content_model.model =
						HUBBUB_CONTENT_MODEL_PCDATA;
			} else if (strcmp(cm, "RCDATA") == 0) {
				params.content_model.model =
						HUBBUB_CONTENT_MODEL_RCDATA;
			} else if (strcmp(cm, "CDATA") == 0) {
				params.content_model.model =
						HUBBUB_CONTENT_MODEL_CDATA;
			} else {
				params.content_model.model =
					HUBBUB_CONTENT_MODEL_PLAINTEXT;
			}
		}
		assert(hubbub_tokeniser_setopt(tok,
				HUBBUB_TOKENISER_CONTENT_MODEL,
				&params) == HUBBUB_OK);

		assert(parserutils_inputstream_append(stream,
				ctx->input, ctx->input_len) == PARSERUTILS_OK);

		/* Zero-length append marks end of input */
		assert(parserutils_inputstream_append(stream, NULL, 0) ==
				PARSERUTILS_OK);

		printf("Input: '%.*s' (%d)\n", (int) ctx->input_len,
				(const char *) ctx->input, 
				(int) ctx->input_len);

		assert(hubbub_tokeniser_run(tok) == HUBBUB_OK);

		hubbub_tokeniser_destroy(tok);

		parserutils_inputstream_destroy(stream);
	}
}
Example 5
/**
 * Drive the tokeniser over a file supplied on the command line,
 * feeding it in CHUNK_SIZE pieces.
 *
 * \return 0 on success (prints "PASS"), 1 on usage/open error
 */
int main(int argc, char **argv)
{
    parserutils_inputstream *stream;
    hubbub_tokeniser *tok;
    hubbub_tokeniser_optparams params;
    FILE *fp;
    size_t len, origlen;
#define CHUNK_SIZE (4096)
    uint8_t buf[CHUNK_SIZE];

    if (argc != 2) {
        printf("Usage: %s <filename>\n", argv[0]);
        return 1;
    }

    assert(parserutils_inputstream_create("UTF-8", 0, NULL,
                                          myrealloc, NULL, &stream) == PARSERUTILS_OK);

    assert(hubbub_tokeniser_create(stream, myrealloc, NULL, &tok) ==
           HUBBUB_OK);

    params.token_handler.handler = token_handler;
    params.token_handler.pw = NULL;
    assert(hubbub_tokeniser_setopt(tok, HUBBUB_TOKENISER_TOKEN_HANDLER,
                                   &params) == HUBBUB_OK);

    fp = fopen(argv[1], "rb");
    if (fp == NULL) {
        printf("Failed opening %s\n", argv[1]);
        return 1;
    }

    /* Determine the file length so we can verify full consumption */
    fseek(fp, 0, SEEK_END);
    origlen = len = ftell(fp);
    fseek(fp, 0, SEEK_SET);

    while (len > 0) {
        /* fread returns a size_t; it can never be negative */
        size_t bytes_read = fread(buf, 1, CHUNK_SIZE, fp);

        if (bytes_read == 0)
            break;

        /* parserutils_inputstream_append returns a parserutils_error,
         * so compare against PARSERUTILS_OK, not HUBBUB_OK */
        assert(parserutils_inputstream_append(stream,
                                              buf, bytes_read) == PARSERUTILS_OK);

        len -= bytes_read;

        assert(hubbub_tokeniser_run(tok) == HUBBUB_OK);
    }

    /* The whole file must have been consumed */
    assert(len == 0);

    fclose(fp);

    hubbub_tokeniser_destroy(tok);

    parserutils_inputstream_destroy(stream);

    printf("PASS\n");

    return 0;
}