/**
 * Test whether the ASCII codec can handle a named charset
 *
 * The MIB enum for "US-ASCII" is resolved on demand and cached in a
 * function-local static; the lookup is repeated on later calls for as
 * long as it keeps yielding 0.
 *
 * \param charset  NUL-terminated name of the charset to test
 * \return true if the name resolves to the same non-zero MIB enum as
 *         "US-ASCII", false otherwise
 */
bool charset_ascii_codec_handles_charset(const char *charset)
{
    static uint16_t ascii_mib;
    const uint16_t wanted = parserutils_charset_mibenum_from_name(
                                charset, strlen(charset));

    /* Resolve our own MIB enum the first time it is needed */
    if (ascii_mib == 0) {
        ascii_mib = parserutils_charset_mibenum_from_name(
                        "US-ASCII", SLEN("US-ASCII"));
    }

    /* A zero MIB enum never counts as a match */
    return ascii_mib != 0 && wanted == ascii_mib;
}
Esempio n. 2
0
/**
 * Create a hubbub parser
 *
 * The input stream, tokeniser and treebuilder are created in that order.
 * Should any of them fail, whatever was created before it is destroyed
 * again, the parser's memory is handed back to ::alloc and ::parser is
 * left unmodified.
 *
 * \param enc      Source document encoding, or NULL to autodetect
 * \param fix_enc  Permit fixing up of encoding if it's frequently misused
 * \param alloc    Memory (de)allocation function
 * \param pw       Pointer to client-specific private data (may be NULL)
 * \param parser   Pointer to location to receive parser instance
 * \return HUBBUB_OK on success,
 *         HUBBUB_BADPARM on bad parameters,
 *         HUBBUB_NOMEM on memory exhaustion,
 *         HUBBUB_BADENCODING if ::enc is unsupported
 */
hubbub_error hubbub_parser_create(const char *enc, bool fix_enc,
		hubbub_allocator_fn alloc, void *pw, hubbub_parser **parser)
{
	hubbub_parser *self;
	parserutils_error stream_status;
	hubbub_error status;

	if (parser == NULL || alloc == NULL)
		return HUBBUB_BADPARM;

	self = alloc(NULL, sizeof(*self), pw);
	if (self == NULL)
		return HUBBUB_NOMEM;

	/* When the caller named an encoding and allows it to be fixed up,
	 * resolve the name and substitute the corrected charset's name.
	 * Names that do not resolve to a MIB enum are passed on unchanged. */
	if (fix_enc && enc != NULL) {
		uint16_t mib = parserutils_charset_mibenum_from_name(enc,
				strlen(enc));

		if (mib != 0) {
			hubbub_charset_fix_charset(&mib);

			enc = parserutils_charset_mibenum_to_name(mib);
		}
	}

	/* An explicit encoding is trusted; without one the source is unknown */
	stream_status = parserutils_inputstream_create(enc,
			(enc == NULL) ? HUBBUB_CHARSET_UNKNOWN
				      : HUBBUB_CHARSET_CONFIDENT,
			hubbub_charset_extract, alloc, pw, &self->stream);
	if (stream_status != PARSERUTILS_OK) {
		alloc(self, 0, pw);
		return hubbub_error_from_parserutils_error(stream_status);
	}

	status = hubbub_tokeniser_create(self->stream, alloc, pw, &self->tok);
	if (status != HUBBUB_OK)
		goto drop_stream;

	status = hubbub_treebuilder_create(self->tok, alloc, pw, &self->tb);
	if (status != HUBBUB_OK)
		goto drop_tokeniser;

	self->alloc = alloc;
	self->pw = pw;

	*parser = self;

	return HUBBUB_OK;

	/* Failure paths: unwind in reverse order of construction */
drop_tokeniser:
	hubbub_tokeniser_destroy(self->tok);
drop_stream:
	parserutils_inputstream_destroy(self->stream);
	alloc(self, 0, pw);

	return status;
}
Esempio n. 3
0
/**
 * Exercise charset alias canonicalisation and MIB enum lookups.
 *
 * Every alias that resolves is printed as "<name> <mibenum>"; "PASS" is
 * printed once all checks have succeeded.
 *
 * \param argc  Unused
 * \param argv  Unused
 * \return 0 if every check passed, 1 (after printing a "FAIL" line) as
 *         soon as one does not
 */
int main (int argc, char **argv)
{
	parserutils_charset_aliases_canon *canon;

	UNUSED(argc);
	UNUSED(argv);

	/* A made-up name must not resolve to anything */
	canon = parserutils__charset_alias_canonicalise("moose", 5);
	if (canon != NULL) {
		printf("FAIL - found invalid encoding 'moose'\n");
		return 1;
	}

	canon = parserutils__charset_alias_canonicalise("csinvariant", 11);
	if (canon == NULL) {
		printf("FAIL - failed finding encoding 'csinvariant'\n");
		return 1;
	}
	printf("%s %d\n", canon->name, canon->mib_enum);

	/* Same alias with a trailing double quote; expected to resolve too */
	canon = parserutils__charset_alias_canonicalise("csinvariant\"", 12);
	if (canon == NULL) {
		printf("FAIL - failed finding encoding 'csinvariant'\n");
		return 1;
	}
	printf("%s %d\n", canon->name, canon->mib_enum);

	canon = parserutils__charset_alias_canonicalise("nats-sefi-add", 13);
	if (canon == NULL) {
		printf("FAIL - failed finding encoding 'nats-sefi-add'\n");
		return 1;
	}
	printf("%s %d\n", canon->name, canon->mib_enum);

	/* Go from name to MIB enum and from MIB enum to name for the
	 * entry just found */
	printf("%d\n", parserutils_charset_mibenum_from_name(canon->name,
			strlen(canon->name)));

	printf("%s\n", parserutils_charset_mibenum_to_name(canon->mib_enum));

	/* Alias with dots scattered through it */
	canon = parserutils__charset_alias_canonicalise("u.t.f.8", 7);
	if (canon == NULL) {
		printf("FAIL - failed finding encoding 'u.t.f.8'\n");
		return 1;
	}
	printf("%s %d\n", canon->name, canon->mib_enum);

	printf("PASS\n");

	return 0;
}
Esempio n. 4
0
/**
 * Run a single charset detection test
 *
 * Passes ::data to hubbub_charset_extract(), prints the detected charset
 * next to the expected one, and asserts that detection succeeded and that
 * the two MIB enums agree.
 *
 * \param data      Document data to examine
 * \param len       Length of ::data, in bytes
 * \param expected  NUL-terminated name of the charset that should be detected
 */
void run_test(const uint8_t *data, size_t len, char *expected)
{
	static int testnum;
	uint16_t mibenum = 0;
	uint16_t expected_mibenum;
	hubbub_charset_source source = HUBBUB_CHARSET_UNKNOWN;
	int extracted;

	/* Run the detection outside assert(): if assertions are compiled
	 * out (NDEBUG), an expression placed inside one is never evaluated
	 * and no detection would take place at all. */
	extracted = (hubbub_charset_extract(data, len,
			&mibenum, &source) == HUBBUB_OK);
	assert(extracted);
	(void) extracted; /* otherwise unused when assertions are disabled */

	assert(mibenum != 0);

	/* Resolve the expected name once; it is needed for both the report
	 * and the final comparison. */
	expected_mibenum = parserutils_charset_mibenum_from_name(
			expected, strlen(expected));

	printf("%d: Detected charset %s (%d) Source %d Expected %s (%d)\n",
			++testnum, parserutils_charset_mibenum_to_name(mibenum),
			mibenum, source, expected, expected_mibenum);

	assert(mibenum == expected_mibenum);
}
Esempio n. 5
0
/**
 * Detect if a parserutils_charset is Unicode
 *
 * The MIB enums of the Unicode charset names checked here (UCS-4, UCS-2,
 * UTF-8, UTF-16[BE|LE], UTF-32[BE|LE]) are looked up on first use and
 * cached in function-local statics.
 *
 * \param mibenum  The MIB enum to consider
 * \return true if a Unicode variant, false otherwise (always false for a
 *         ::mibenum of 0)
 */
bool parserutils_charset_mibenum_is_unicode(uint16_t mibenum)
{
	static uint16_t ucs4;
	static uint16_t ucs2;
	static uint16_t utf8;
	static uint16_t utf16;
	static uint16_t utf16be;
	static uint16_t utf16le;
	static uint16_t utf32;
	static uint16_t utf32be;
	static uint16_t utf32le;

	/* 0 is not a usable MIB enum: the lazy initialisation below already
	 * relies on it meaning "not resolved". Reject it up front, otherwise
	 * it would compare equal to any cached entry whose lookup produced 0
	 * and be misreported as Unicode. */
	if (mibenum == 0)
		return false;

	/* Populate the cache on first use (and again for as long as the
	 * UCS-4 lookup keeps producing 0) */
	if (ucs4 == 0) {
		ucs4 = parserutils_charset_mibenum_from_name("UCS-4",
				SLEN("UCS-4"));
		ucs2 = parserutils_charset_mibenum_from_name("UCS-2",
				SLEN("UCS-2"));
		utf8 = parserutils_charset_mibenum_from_name("UTF-8",
				SLEN("UTF-8"));
		utf16 = parserutils_charset_mibenum_from_name("UTF-16",
				SLEN("UTF-16"));
		utf16be = parserutils_charset_mibenum_from_name("UTF-16BE",
				SLEN("UTF-16BE"));
		utf16le = parserutils_charset_mibenum_from_name("UTF-16LE",
				SLEN("UTF-16LE"));
		utf32 = parserutils_charset_mibenum_from_name("UTF-32",
				SLEN("UTF-32"));
		utf32be = parserutils_charset_mibenum_from_name("UTF-32BE",
				SLEN("UTF-32BE"));
		utf32le = parserutils_charset_mibenum_from_name("UTF-32LE",
				SLEN("UTF-32LE"));
	}

	return (mibenum == ucs4 || mibenum == ucs2 || mibenum == utf8 ||
			mibenum == utf16 || mibenum == utf16be ||
			mibenum == utf16le || mibenum == utf32 ||
			mibenum == utf32be || mibenum == utf32le);
}
/**
 * Process a meta tag as if "in head".
 *
 * The element is inserted first. Then, if the tree handler provides an
 * encoding_change callback, the tag's "charset" and "content" attributes
 * are examined and any encoding they yield is reported through that
 * callback.
 *
 * \param treebuilder  The treebuilder instance
 * \param token        The token to process
 * \return The error from insert_element() if insertion failed,
 *         HUBBUB_OK if there is no callback or no encoding was found,
 *         otherwise whatever the encoding_change callback returned
 */
static hubbub_error process_meta_in_head(hubbub_treebuilder *treebuilder,
		const hubbub_token *token)
{
	static uint16_t utf16, utf16be, utf16le;
	uint16_t from_charset = 0;
	uint16_t from_content = 0;
	uint16_t encoding;
	const char *name;
	size_t idx;
	hubbub_error err;

	err = insert_element(treebuilder, &token->data.tag, false);
	if (err != HUBBUB_OK)
		return err;

	/** \todo ack sc flag */

	/* Without a callback there is nobody to report an encoding to */
	if (treebuilder->tree_handler->encoding_change == NULL)
		return err;

	/* Look up the UTF-16 MIBenums on first use */
	if (utf16 == 0) {
		utf16 = parserutils_charset_mibenum_from_name(
				"utf-16", SLEN("utf-16"));
		utf16be = parserutils_charset_mibenum_from_name(
				"utf-16be", SLEN("utf-16be"));
		utf16le = parserutils_charset_mibenum_from_name(
				"utf-16le", SLEN("utf-16le"));
		assert(utf16 != 0 && utf16be != 0 && utf16le != 0);
	}

	/* Scan every attribute; a later "charset" or "content" attribute
	 * overwrites the value taken from an earlier one of the same name */
	for (idx = 0; idx < token->data.tag.n_attributes; idx++) {
		hubbub_attribute *attribute =
				&token->data.tag.attributes[idx];

		if (hubbub_string_match(attribute->name.ptr,
				attribute->name.len,
				(const uint8_t *) "charset",
				SLEN("charset")) == true) {
			/* The attribute value is the charset name itself */
			from_charset = parserutils_charset_mibenum_from_name(
					(const char *) attribute->value.ptr,
					attribute->value.len);
		} else if (hubbub_string_match(attribute->name.ptr,
				attribute->name.len,
				(const uint8_t *) "content",
				SLEN("content")) == true) {
			/* The charset is embedded in a Content-Type value */
			from_content = hubbub_charset_parse_content(
					attribute->value.ptr,
					attribute->value.len);
		}
	}

	/* "charset" wins; "content" is only the fallback */
	encoding = (from_charset != 0) ? from_charset : from_content;
	if (encoding == 0)
		return err;

	hubbub_charset_fix_charset(&encoding);

	/* Any flavour of UTF-16 is reported as UTF-8 instead */
	if (encoding == utf16 || encoding == utf16be ||
			encoding == utf16le) {
		encoding = parserutils_charset_mibenum_from_name(
				"UTF-8", SLEN("UTF-8"));
	}

	name = parserutils_charset_mibenum_to_name(encoding);

	return treebuilder->tree_handler->encoding_change(
			treebuilder->tree_handler->ctx, name);
}