/**
 * Create a hubbub parser
 *
 * Allocates the parser object, then builds its input stream, tokeniser and
 * treebuilder in that order. If any stage fails, everything constructed so
 * far is torn down again and nothing is written to ::parser.
 *
 * \param enc      Source document encoding, or NULL to autodetect
 * \param fix_enc  Permit fixing up of encoding if it's frequently misused
 * \param alloc    Memory (de)allocation function
 * \param pw       Pointer to client-specific private data (may be NULL)
 * \param parser   Pointer to location to receive parser instance
 * \return HUBBUB_OK on success,
 *         HUBBUB_BADPARM on bad parameters,
 *         HUBBUB_NOMEM on memory exhaustion,
 *         HUBBUB_BADENCODING if ::enc is unsupported
 */
hubbub_error hubbub_parser_create(const char *enc, bool fix_enc,
		hubbub_allocator_fn alloc, void *pw, hubbub_parser **parser)
{
	hubbub_parser *instance;
	parserutils_error stream_status;
	hubbub_error status;

	if (alloc == NULL || parser == NULL)
		return HUBBUB_BADPARM;

	instance = alloc(NULL, sizeof *instance, pw);
	if (instance == NULL)
		return HUBBUB_NOMEM;

	/* When the caller supplied an encoding and allows fix-ups, map the
	 * name to its MIB enum, let the charset fixer adjust it, and carry on
	 * with the name of whatever it settled on. Names that do not resolve
	 * to a MIB enum are passed through untouched. */
	if (enc != NULL && fix_enc) {
		uint16_t mibenum = parserutils_charset_mibenum_from_name(enc,
				strlen(enc));

		if (mibenum != 0) {
			hubbub_charset_fix_charset(&mibenum);
			enc = parserutils_charset_mibenum_to_name(mibenum);
		}
	}

	/* An explicit encoding is treated as confident; otherwise the stream
	 * is told the charset is unknown and given the extraction callback. */
	stream_status = parserutils_inputstream_create(enc,
			enc == NULL ? HUBBUB_CHARSET_UNKNOWN
				    : HUBBUB_CHARSET_CONFIDENT,
			hubbub_charset_extract, alloc, pw, &instance->stream);
	if (stream_status != PARSERUTILS_OK) {
		alloc(instance, 0, pw);
		return hubbub_error_from_parserutils_error(stream_status);
	}

	status = hubbub_tokeniser_create(instance->stream, alloc, pw,
			&instance->tok);
	if (status != HUBBUB_OK)
		goto cleanup_stream;

	status = hubbub_treebuilder_create(instance->tok, alloc, pw,
			&instance->tb);
	if (status != HUBBUB_OK)
		goto cleanup_tokeniser;

	instance->alloc = alloc;
	instance->pw = pw;

	*parser = instance;

	return HUBBUB_OK;

	/* Failure unwinding: release in reverse order of construction */
cleanup_tokeniser:
	hubbub_tokeniser_destroy(instance->tok);
cleanup_stream:
	parserutils_inputstream_destroy(instance->stream);
	alloc(instance, 0, pw);

	return status;
}
int main (int argc, char **argv) { parserutils_charset_aliases_canon *c; UNUSED(argc); UNUSED(argv); c = parserutils__charset_alias_canonicalise("moose", 5); if (c) { printf("FAIL - found invalid encoding 'moose'\n"); return 1; } c = parserutils__charset_alias_canonicalise("csinvariant", 11); if (c) { printf("%s %d\n", c->name, c->mib_enum); } else { printf("FAIL - failed finding encoding 'csinvariant'\n"); return 1; } c = parserutils__charset_alias_canonicalise("csinvariant\"", 12); if (c) { printf("%s %d\n", c->name, c->mib_enum); } else { printf("FAIL - failed finding encoding 'csinvariant'\n"); return 1; } c = parserutils__charset_alias_canonicalise("nats-sefi-add", 13); if (c) { printf("%s %d\n", c->name, c->mib_enum); } else { printf("FAIL - failed finding encoding 'nats-sefi-add'\n"); return 1; } printf("%d\n", parserutils_charset_mibenum_from_name(c->name, strlen(c->name))); printf("%s\n", parserutils_charset_mibenum_to_name(c->mib_enum)); c = parserutils__charset_alias_canonicalise("u.t.f.8", 7); if (c) { printf("%s %d\n", c->name, c->mib_enum); } else { printf("FAIL - failed finding encoding 'u.t.f.8'\n"); return 1; } printf("PASS\n"); return 0; }
/**
 * Run charset detection over a buffer and check the detected charset.
 *
 * Prints one numbered line per invocation describing the detected and the
 * expected charset, then asserts that the two MIB enums agree.
 *
 * \param data      Buffer to run charset extraction on
 * \param len       Length of ::data, in bytes
 * \param expected  Name of the charset that extraction is expected to yield
 */
void run_test(const uint8_t *data, size_t len, char *expected)
{
	static int testnum;
	uint16_t mibenum = 0;
	uint16_t expected_mibenum;
	hubbub_charset_source source = HUBBUB_CHARSET_UNKNOWN;
	int extracted;

	/* Perform the extraction outside assert(): when NDEBUG is defined
	 * the assert expression is not evaluated at all, which would skip
	 * the call and leave mibenum and source at their initial values. */
	extracted = (hubbub_charset_extract(data, len, &mibenum, &source) ==
			HUBBUB_OK);
	assert(extracted);
	(void) extracted;	/* Otherwise unused when NDEBUG is defined */

	assert(mibenum != 0);

	/* Resolve the expected name once; used for both report and check */
	expected_mibenum = parserutils_charset_mibenum_from_name(expected,
			strlen(expected));

	printf("%d: Detected charset %s (%d) Source %d Expected %s (%d)\n",
			++testnum,
			parserutils_charset_mibenum_to_name(mibenum),
			mibenum, source, expected, expected_mibenum);

	assert(mibenum == expected_mibenum);
}
/**
 * Process a meta tag as if "in head".
 *
 * Inserts the element, then, if the tree handler provides an
 * encoding_change callback, looks for a charset named by the tag's
 * "charset" attribute (preferred) or extracted from its "content"
 * attribute, and reports it through that callback.
 *
 * \param treebuilder  The treebuilder instance
 * \param token        The token to process
 * \return Error from insert_element() if insertion failed, the result of
 *         the encoding_change callback if it was invoked,
 *         HUBBUB_OK otherwise
 */
static hubbub_error process_meta_in_head(hubbub_treebuilder *treebuilder,
		const hubbub_token *token)
{
	/* UTF-16 family MIB enums, looked up on first use and then cached */
	static uint16_t utf16, utf16be, utf16le;
	uint16_t from_charset_attr = 0;
	uint16_t from_content_attr = 0;
	uint16_t mibenum;
	hubbub_error status;
	size_t idx;

	status = insert_element(treebuilder, &token->data.tag, false);
	if (status != HUBBUB_OK)
		return status;

	/** \todo ack sc flag */

	/* Without an encoding_change callback there is nobody to notify */
	if (treebuilder->tree_handler->encoding_change == NULL)
		return status;

	if (utf16 == 0) {
		utf16 = parserutils_charset_mibenum_from_name(
				"utf-16", SLEN("utf-16"));
		utf16be = parserutils_charset_mibenum_from_name(
				"utf-16be", SLEN("utf-16be"));
		utf16le = parserutils_charset_mibenum_from_name(
				"utf-16le", SLEN("utf-16le"));
		assert(utf16 != 0 && utf16be != 0 && utf16le != 0);
	}

	for (idx = 0; idx < token->data.tag.n_attributes; idx++) {
		const hubbub_attribute *attr =
				&token->data.tag.attributes[idx];

		if (hubbub_string_match(attr->name.ptr, attr->name.len,
				(const uint8_t *) "charset",
				SLEN("charset")) == true) {
			/* The attribute value is the charset name itself */
			from_charset_attr =
				parserutils_charset_mibenum_from_name(
					(const char *) attr->value.ptr,
					attr->value.len);
			continue;
		}

		if (hubbub_string_match(attr->name.ptr, attr->name.len,
				(const uint8_t *) "content",
				SLEN("content")) == true) {
			/* Charset has to be dug out of a Content-Type value */
			from_content_attr = hubbub_charset_parse_content(
					attr->value.ptr, attr->value.len);
		}
	}

	/* Prefer the explicit charset attribute; otherwise fall back to
	 * whatever the content attribute produced (possibly nothing). */
	mibenum = (from_charset_attr != 0) ? from_charset_attr
					   : from_content_attr;
	if (mibenum == 0)
		return status;

	hubbub_charset_fix_charset(&mibenum);

	/* Any UTF-16 variant is reported as UTF-8 instead */
	if (mibenum == utf16le || mibenum == utf16be || mibenum == utf16) {
		mibenum = parserutils_charset_mibenum_from_name(
				"UTF-8", SLEN("UTF-8"));
	}

	status = treebuilder->tree_handler->encoding_change(
			treebuilder->tree_handler->ctx,
			parserutils_charset_mibenum_to_name(mibenum));

	return status;
}