Exemple #1
0
/**
 * Construct a new hubbub parser instance.
 *
 * The parser owns an input stream, a tokeniser reading from that stream and
 * a treebuilder fed by that tokeniser; all three are created here. If any
 * step fails, everything created so far is released again and ::parser is
 * left untouched.
 *
 * \param enc      Name of the source document's encoding, or NULL to
 *                 request autodetection
 * \param fix_enc  When true (and ::enc is non-NULL), a recognised encoding
 *                 is passed through hubbub_charset_fix_charset() before use
 * \param alloc    Memory (de)allocation function
 * \param pw       Client-specific private data handed to ::alloc
 *                 (may be NULL)
 * \param parser   Location to receive the new parser instance
 * \return HUBBUB_OK on success,
 *         HUBBUB_BADPARM if ::alloc or ::parser is NULL,
 *         HUBBUB_NOMEM on memory exhaustion,
 *         HUBBUB_BADENCODING if ::enc is unsupported
 */
hubbub_error hubbub_parser_create(const char *enc, bool fix_enc,
		hubbub_allocator_fn alloc, void *pw, hubbub_parser **parser)
{
	hubbub_parser *instance;
	parserutils_error stream_err;
	hubbub_error err;

	if (parser == NULL || alloc == NULL)
		return HUBBUB_BADPARM;

	instance = alloc(NULL, sizeof(*instance), pw);
	if (instance == NULL)
		return HUBBUB_NOMEM;

	/* A caller-supplied encoding may be a commonly mislabelled one;
	 * when allowed, map it to the charset that should really be used. */
	if (fix_enc && enc != NULL) {
		uint16_t mib = parserutils_charset_mibenum_from_name(enc,
				strlen(enc));

		/* Zero means the name was not recognised: leave enc as-is */
		if (mib != 0) {
			hubbub_charset_fix_charset(&mib);

			enc = parserutils_charset_mibenum_to_name(mib);
		}
	}

	/* An explicit encoding is trusted; otherwise it is marked unknown */
	stream_err = parserutils_inputstream_create(enc,
		enc != NULL ? HUBBUB_CHARSET_CONFIDENT : HUBBUB_CHARSET_UNKNOWN,
		hubbub_charset_extract, alloc, pw, &instance->stream);
	if (stream_err != PARSERUTILS_OK) {
		alloc(instance, 0, pw);
		return hubbub_error_from_parserutils_error(stream_err);
	}

	err = hubbub_tokeniser_create(instance->stream, alloc, pw,
			&instance->tok);
	if (err != HUBBUB_OK)
		goto release_stream;

	err = hubbub_treebuilder_create(instance->tok, alloc, pw,
			&instance->tb);
	if (err != HUBBUB_OK)
		goto release_tokeniser;

	instance->alloc = alloc;
	instance->pw = pw;

	*parser = instance;

	return HUBBUB_OK;

	/* Failure paths: tear down in reverse order of construction */
release_tokeniser:
	hubbub_tokeniser_destroy(instance->tok);
release_stream:
	parserutils_inputstream_destroy(instance->stream);
	alloc(instance, 0, pw);
	return err;
}
Exemple #2
0
/**
 * Run the tokeniser over the context's input, once per content model.
 *
 * For each entry in ctx->content_model (or exactly once, using PCDATA, when
 * that list is NULL) a fresh input stream and tokeniser are created, the
 * optional fake start tag and CDATA option are applied, the content model
 * is set, and ctx->input is tokenised. Tokens are delivered to
 * token_handler() with ::ctx as its private data.
 *
 * Library calls are deliberately made outside assert() so that they still
 * execute when the test is built with NDEBUG; only the result check is
 * compiled out in that case.
 *
 * \param ctx  Test context describing the input and expected output
 */
void run_test(context *ctx)
{
	parserutils_inputstream *stream;
	hubbub_tokeniser *tok;
	hubbub_tokeniser_optparams params;
	int i, max_i;
	int ok;
	struct array_list *outputsave = ctx->output;

	if (ctx->content_model == NULL) {
		max_i = 1;
	} else {
		max_i = array_list_length(ctx->content_model);
	}

	/* We test for each of the content models specified */
	for (i = 0; i < max_i; i++) {
		/* Reset expected output */
		ctx->output = outputsave;
		ctx->output_index = 0;
		ctx->char_off = 0;

		ok = (parserutils_inputstream_create("UTF-8", 0, NULL,
				&stream) == PARSERUTILS_OK);
		assert(ok);

		ok = (hubbub_tokeniser_create(stream, &tok) == HUBBUB_OK);
		assert(ok);

		if (ctx->last_start_tag != NULL) {
			/* Fake up a start tag, in PCDATA state.
			 * len covers '<', the tag name, '>' and the
			 * terminating NUL written by snprintf. */
			size_t len = strlen(ctx->last_start_tag) + 3;
			uint8_t *buf = malloc(len);

			assert(buf != NULL);

			snprintf((char *) buf, len, "<%s>",
					ctx->last_start_tag);

			/* len - 1: the NUL terminator is not part of
			 * the document */
			ok = (parserutils_inputstream_append(stream,
					buf, len - 1) == PARSERUTILS_OK);
			assert(ok);

			ok = (hubbub_tokeniser_run(tok) == HUBBUB_OK);
			assert(ok);

			free(buf);
		}

		if (ctx->process_cdata) {
			params.process_cdata = ctx->process_cdata;
			ok = (hubbub_tokeniser_setopt(tok,
					HUBBUB_TOKENISER_PROCESS_CDATA,
					&params) == HUBBUB_OK);
			assert(ok);
		}

		params.token_handler.handler = token_handler;
		params.token_handler.pw = ctx;
		ok = (hubbub_tokeniser_setopt(tok,
				HUBBUB_TOKENISER_TOKEN_HANDLER,
				&params) == HUBBUB_OK);
		assert(ok);

		if (ctx->content_model == NULL) {
			params.content_model.model =
					HUBBUB_CONTENT_MODEL_PCDATA;
		} else {
			const char *cm = json_object_get_string(
				(struct json_object *)
				array_list_get_idx(ctx->content_model, i));

			/* Any unrecognised name falls back to PLAINTEXT */
			if (strcmp(cm, "PCDATA") == 0) {
				params.content_model.model =
						HUBBUB_CONTENT_MODEL_PCDATA;
			} else if (strcmp(cm, "RCDATA") == 0) {
				params.content_model.model =
						HUBBUB_CONTENT_MODEL_RCDATA;
			} else if (strcmp(cm, "CDATA") == 0) {
				params.content_model.model =
						HUBBUB_CONTENT_MODEL_CDATA;
			} else {
				params.content_model.model =
					HUBBUB_CONTENT_MODEL_PLAINTEXT;
			}
		}
		ok = (hubbub_tokeniser_setopt(tok,
				HUBBUB_TOKENISER_CONTENT_MODEL,
				&params) == HUBBUB_OK);
		assert(ok);

		ok = (parserutils_inputstream_append(stream,
				ctx->input, ctx->input_len) == PARSERUTILS_OK);
		assert(ok);

		/* A NULL/0 append marks the end of the input */
		ok = (parserutils_inputstream_append(stream, NULL, 0) ==
				PARSERUTILS_OK);
		assert(ok);

		printf("Input: '%.*s' (%d)\n", (int) ctx->input_len,
				(const char *) ctx->input,
				(int) ctx->input_len);

		ok = (hubbub_tokeniser_run(tok) == HUBBUB_OK);
		assert(ok);

		hubbub_tokeniser_destroy(tok);

		parserutils_inputstream_destroy(stream);
	}

	/* Keeps NDEBUG builds free of set-but-unused warnings */
	(void) ok;
}
/**
 * Regression test: feed the inputstream a buffer whose multi-byte UTF-8
 * sequence straddles the assumed 4k chunk boundary, then read the stream
 * through to EOF.
 *
 * The contents handed back by the stream are not inspected; the test only
 * requires that creation, both appends and the read-to-EOF loop succeed.
 *
 * Library calls are made outside assert() so that they still execute when
 * built with NDEBUG.
 *
 * \return 0 after printing "PASS"
 */
int main(int argc, char **argv)
{
	parserutils_inputstream *stream;

	/* This is specially calculated so that the inputstream is forced to
	 * reallocate (it assumes that the inputstream's buffer chunk size
	 * is 4k) */
#define BUFFER_SIZE (4096 + 4)
	uint8_t input_buffer[BUFFER_SIZE];
	const uint8_t *c;
	size_t clen;
	int ok;

	UNUSED(argc);
	UNUSED(argv);

	/* Populate the buffer with something sane */
	memset(input_buffer, 'a', BUFFER_SIZE);
	/* Now, set up our test data: the buffer ends "123" EF BF BD "45" */
	input_buffer[BUFFER_SIZE - 1] = '5';
	input_buffer[BUFFER_SIZE - 2] = '4';
	input_buffer[BUFFER_SIZE - 3] = '\xbd';
	input_buffer[BUFFER_SIZE - 4] = '\xbf';
	/* This byte lands at offset 4095, the last byte of an assumed 4k
	 * chunk, so the three-byte sequence EF BF BD (U+FFFD) straddles the
	 * chunk boundary and can only be completed after the buffer has
	 * been enlarged */
	input_buffer[BUFFER_SIZE - 5] = '\xef';
	input_buffer[BUFFER_SIZE - 6] = '3';
	input_buffer[BUFFER_SIZE - 7] = '2';
	input_buffer[BUFFER_SIZE - 8] = '1';

	ok = (parserutils_inputstream_create("UTF-8", 0,
			NULL, myrealloc, NULL, &stream) == PARSERUTILS_OK);
	assert(ok);

	ok = (parserutils_inputstream_append(stream,
			input_buffer, BUFFER_SIZE) == PARSERUTILS_OK);
	assert(ok);

	/* A NULL/0 append marks the end of the input */
	ok = (parserutils_inputstream_append(stream, NULL, 0) ==
			PARSERUTILS_OK);
	assert(ok);

	/* Consume the stream. Stop on the first non-OK status rather than
	 * only on EOF: after a failed peek, clen is not known to be valid,
	 * so advancing by it would be unsafe. */
	for (;;) {
		int status = parserutils_inputstream_peek(stream, 0,
				&c, &clen);

		if (status != PARSERUTILS_OK) {
			ok = (status == PARSERUTILS_EOF);
			break;
		}

		parserutils_inputstream_advance(stream, clen);
	}
	/* The only acceptable way for the loop to end is reaching EOF */
	assert(ok);

	parserutils_inputstream_destroy(stream);

	/* Keeps NDEBUG builds free of set-but-unused warnings */
	(void) ok;

	printf("PASS\n");

	return 0;
}
Exemple #4
0
/**
 * Tokenise the file named on the command line, feeding it to the tokeniser
 * in CHUNK_SIZE pieces.
 *
 * Tokens are delivered to token_handler() with NULL private data. Library
 * calls are made outside assert() so that they still execute when built
 * with NDEBUG.
 *
 * \param argc  Must be 2
 * \param argv  argv[1] is the path of the file to tokenise
 * \return 0 after printing "PASS"; 1 on bad usage or if the file cannot be
 *         opened or measured
 */
int main(int argc, char **argv)
{
    parserutils_inputstream *stream;
    hubbub_tokeniser *tok;
    hubbub_tokeniser_optparams params;
    FILE *fp;
    long file_size;
    size_t len;
    int ok;
    int ret = 1;
#define CHUNK_SIZE (4096)
    uint8_t buf[CHUNK_SIZE];

    if (argc != 2) {
        printf("Usage: %s <filename>\n", argv[0]);
        return 1;
    }

    ok = (parserutils_inputstream_create("UTF-8", 0, NULL,
                                         myrealloc, NULL, &stream) == PARSERUTILS_OK);
    assert(ok);

    ok = (hubbub_tokeniser_create(stream, myrealloc, NULL, &tok) ==
          HUBBUB_OK);
    assert(ok);

    params.token_handler.handler = token_handler;
    params.token_handler.pw = NULL;
    ok = (hubbub_tokeniser_setopt(tok, HUBBUB_TOKENISER_TOKEN_HANDLER,
                                  &params) == HUBBUB_OK);
    assert(ok);

    fp = fopen(argv[1], "rb");
    if (fp == NULL) {
        printf("Failed opening %s\n", argv[1]);
        goto cleanup;
    }

    /* Measure the file; ftell() reports failure as a negative value, which
     * must not be converted to size_t unchecked */
    if (fseek(fp, 0, SEEK_END) != 0 ||
            (file_size = ftell(fp)) < 0 ||
            fseek(fp, 0, SEEK_SET) != 0) {
        printf("Failed determining size of %s\n", argv[1]);
        fclose(fp);
        goto cleanup;
    }
    len = (size_t) file_size;

    while (len > 0) {
        /* fread() returns a size_t item count; 0 means EOF or error */
        size_t bytes_read = fread(buf, 1, CHUNK_SIZE, fp);

        if (bytes_read == 0)
            break;

        ok = (parserutils_inputstream_append(stream,
                                             buf, bytes_read) == PARSERUTILS_OK);
        assert(ok);

        len -= bytes_read;

        ok = (hubbub_tokeniser_run(tok) == HUBBUB_OK);
        assert(ok);
    }

    /* Every byte reported by ftell() must have been consumed */
    assert(len == 0);

    fclose(fp);

    ret = 0;

cleanup:
    /* Shared by the success and failure paths so the tokeniser and the
     * stream are always released */
    hubbub_tokeniser_destroy(tok);

    parserutils_inputstream_destroy(stream);

    /* Keeps NDEBUG builds free of set-but-unused warnings */
    (void) ok;

    if (ret == 0)
        printf("PASS\n");

    return ret;
}