Пример #1
0
/**
 * Create a hubbub parser
 *
 * Allocates the parser object, then builds its input stream, tokeniser and
 * treebuilder in that order. If any stage fails, everything created so far
 * is torn down again and ::parser is left untouched.
 *
 * \param enc      Source document encoding, or NULL to autodetect
 * \param fix_enc  Permit fixing up of encoding if it's frequently misused
 * \param alloc    Memory (de)allocation function; called as
 *                 alloc(NULL, size, pw) to allocate and
 *                 alloc(ptr, 0, pw) to release
 * \param pw       Pointer to client-specific private data (may be NULL)
 * \param parser   Pointer to location to receive parser instance
 * \return HUBBUB_OK on success,
 *         HUBBUB_BADPARM if ::alloc or ::parser is NULL,
 *         HUBBUB_NOMEM on memory exhaustion,
 *         HUBBUB_BADENCODING if ::enc is unsupported,
 *         otherwise the error reported by the stage that failed
 */
hubbub_error hubbub_parser_create(const char *enc, bool fix_enc,
		hubbub_allocator_fn alloc, void *pw, hubbub_parser **parser)
{
	hubbub_parser *instance;
	parserutils_error stream_status;
	hubbub_error status;

	if (parser == NULL || alloc == NULL)
		return HUBBUB_BADPARM;

	instance = alloc(NULL, sizeof(*instance), pw);
	if (instance == NULL)
		return HUBBUB_NOMEM;

	/* A caller-supplied encoding may be swapped for its fixed-up
	 * equivalent, but only when the caller allows it and the name
	 * maps to a known MIB enum. */
	if (fix_enc == true && enc != NULL) {
		uint16_t mibenum = parserutils_charset_mibenum_from_name(enc,
				strlen(enc));

		if (mibenum != 0) {
			hubbub_charset_fix_charset(&mibenum);

			enc = parserutils_charset_mibenum_to_name(mibenum);
		}
	}

	/* An explicit encoding is treated as certain; without one the
	 * stream is left to detect it via hubbub_charset_extract. */
	stream_status = parserutils_inputstream_create(enc,
		enc == NULL ? HUBBUB_CHARSET_UNKNOWN : HUBBUB_CHARSET_CONFIDENT,
		hubbub_charset_extract, alloc, pw, &instance->stream);
	if (stream_status != PARSERUTILS_OK) {
		/* Nothing but the parser object exists yet */
		alloc(instance, 0, pw);
		return hubbub_error_from_parserutils_error(stream_status);
	}

	status = hubbub_tokeniser_create(instance->stream, alloc, pw,
			&instance->tok);
	if (status != HUBBUB_OK)
		goto destroy_stream;

	status = hubbub_treebuilder_create(instance->tok, alloc, pw,
			&instance->tb);
	if (status != HUBBUB_OK)
		goto destroy_tokeniser;

	instance->alloc = alloc;
	instance->pw = pw;

	*parser = instance;

	return HUBBUB_OK;

	/* Failure unwinding: release in reverse order of construction */
destroy_tokeniser:
	hubbub_tokeniser_destroy(instance->tok);
destroy_stream:
	parserutils_inputstream_destroy(instance->stream);
	alloc(instance, 0, pw);

	return status;
}
Пример #2
0
/**
 * Charset alias lookup test driver.
 *
 * Checks that an unknown name is rejected, that a series of aliases
 * resolve, and that the name <-> MIB enum conversions can be called on a
 * resolved entry. Prints "PASS" and returns 0 on success; prints a
 * "FAIL - ..." line and returns 1 on the first failed lookup.
 */
int main (int argc, char **argv)
{
	/* Aliases that have to resolve, in the order they are checked, each
	 * paired with the exact line reported when the lookup fails. */
	static const struct {
		const char *alias;
		size_t len;
		const char *failure;
	} resolvable[] = {
		{ "csinvariant", 11,
		  "FAIL - failed finding encoding 'csinvariant'\n" },
		{ "csinvariant\"", 12,
		  "FAIL - failed finding encoding 'csinvariant'\n" },
		{ "nats-sefi-add", 13,
		  "FAIL - failed finding encoding 'nats-sefi-add'\n" },
	};
	parserutils_charset_aliases_canon *canon = NULL;
	size_t i;

	UNUSED(argc);
	UNUSED(argv);

	/* A name that is not a charset must not resolve */
	if (parserutils__charset_alias_canonicalise("moose", 5) != NULL) {
		printf("FAIL - found invalid encoding 'moose'\n");
		return 1;
	}

	for (i = 0; i < sizeof(resolvable) / sizeof(resolvable[0]); i++) {
		canon = parserutils__charset_alias_canonicalise(
				resolvable[i].alias, resolvable[i].len);
		if (canon == NULL) {
			printf("%s", resolvable[i].failure);
			return 1;
		}

		printf("%s %d\n", canon->name, canon->mib_enum);
	}

	/* canon still refers to the last table entry ("nats-sefi-add");
	 * run its canonical name and MIB enum through the conversions. */
	printf("%d\n", parserutils_charset_mibenum_from_name(canon->name,
			strlen(canon->name)));

	printf("%s\n", parserutils_charset_mibenum_to_name(canon->mib_enum));

	canon = parserutils__charset_alias_canonicalise("u.t.f.8", 7);
	if (canon == NULL) {
		printf("FAIL - failed finding encoding 'u.t.f.8'\n");
		return 1;
	}

	printf("%s %d\n", canon->name, canon->mib_enum);

	printf("PASS\n");

	return 0;
}
Пример #3
0
/**
 * Run charset detection over a buffer and check the detected charset.
 *
 * Prints one line per invocation (numbered by an internal counter) showing
 * the detected and the expected charset. Any mismatch or detection failure
 * is reported through assert().
 *
 * \param data      Buffer to run charset detection on
 * \param len       Length of ::data, in bytes
 * \param expected  NUL-terminated name of the charset that must be detected
 */
void run_test(const uint8_t *data, size_t len, char *expected)
{
	uint16_t mibenum = 0;
	hubbub_charset_source source = HUBBUB_CHARSET_UNKNOWN;
	static int testnum;
	uint16_t expected_mibenum;
	int extracted;

	/* The detection call must live outside assert(): when NDEBUG is
	 * defined the assert expression is discarded, which would skip the
	 * call altogether and leave mibenum at 0. */
	extracted = (hubbub_charset_extract(data, len,
			&mibenum, &source) == HUBBUB_OK);
	assert(extracted);
	(void) extracted; /* only read by assert() */

	assert(mibenum != 0);

	/* Look the expected charset up once; it is both printed and compared */
	expected_mibenum = parserutils_charset_mibenum_from_name(
			expected, strlen(expected));

	printf("%d: Detected charset %s (%d) Source %d Expected %s (%d)\n",
			++testnum, parserutils_charset_mibenum_to_name(mibenum),
			mibenum, source, expected,
			expected_mibenum);

	assert(mibenum == expected_mibenum);
}
/**
 * Process a meta tag as if "in head".
 *
 * Inserts the element, then, if the client registered an encoding_change
 * handler, looks for an encoding declared through a "charset" attribute or,
 * failing that, a "content" attribute, and reports it to the handler.
 *
 * \param treebuilder  The treebuilder instance
 * \param token        The token to process
 * \return The error from inserting the element if that fails,
 *         the result of the encoding_change handler if it is invoked,
 *         HUBBUB_OK otherwise
 */
static hubbub_error process_meta_in_head(hubbub_treebuilder *treebuilder,
		const hubbub_token *token)
{
	/* UTF-16 family MIB enums, looked up on first use and then cached */
	static uint16_t utf16, utf16be, utf16le;
	uint16_t from_charset = 0;
	uint16_t from_content = 0;
	uint16_t chosen;
	const char *enc_name;
	size_t idx;
	hubbub_error status;

	status = insert_element(treebuilder, &token->data.tag, false);
	if (status != HUBBUB_OK)
		return status;

	/** \todo ack sc flag */

	/* Without a handler there is nobody to tell about the encoding */
	if (treebuilder->tree_handler->encoding_change == NULL)
		return HUBBUB_OK;

	if (utf16 == 0) {
		utf16 = parserutils_charset_mibenum_from_name(
				"utf-16", SLEN("utf-16"));
		utf16be = parserutils_charset_mibenum_from_name(
				"utf-16be", SLEN("utf-16be"));
		utf16le = parserutils_charset_mibenum_from_name(
				"utf-16le", SLEN("utf-16le"));
		assert(utf16 != 0 && utf16be != 0 && utf16le != 0);
	}

	/* Scan every attribute; a later match overwrites an earlier one */
	for (idx = 0; idx < token->data.tag.n_attributes; idx++) {
		hubbub_attribute *attr = &token->data.tag.attributes[idx];

		if (hubbub_string_match(attr->name.ptr, attr->name.len,
				(const uint8_t *) "charset",
				SLEN("charset")) == true) {
			/* The attribute value is the charset name itself */
			from_charset = parserutils_charset_mibenum_from_name(
					(const char *) attr->value.ptr,
					attr->value.len);
			continue;
		}

		if (hubbub_string_match(attr->name.ptr, attr->name.len,
				(const uint8_t *) "content",
				SLEN("content")) == true) {
			/* The charset has to be dug out of a Content-Type */
			from_content = hubbub_charset_parse_content(
					attr->value.ptr, attr->value.len);
		}
	}

	/* A recognised "charset" attribute wins over "content" */
	chosen = (from_charset != 0) ? from_charset : from_content;
	if (chosen == 0)
		return HUBBUB_OK;

	hubbub_charset_fix_charset(&chosen);

	/* Any UTF-16 flavour is reported as UTF-8 instead */
	if (chosen == utf16 || chosen == utf16be || chosen == utf16le) {
		chosen = parserutils_charset_mibenum_from_name(
				"UTF-8", SLEN("UTF-8"));
	}

	enc_name = parserutils_charset_mibenum_to_name(chosen);

	return treebuilder->tree_handler->encoding_change(
			treebuilder->tree_handler->ctx, enc_name);
}