/**
 * Construct a new hubbub parser instance
 *
 * The parser object itself is obtained from ::alloc, after which an input
 * stream, a tokeniser and a treebuilder are created in that order. Should
 * any stage fail, everything created so far is torn down again and the
 * failure is reported to the caller; ::parser is only written on success.
 *
 * \param enc      Source document encoding, or NULL to autodetect
 * \param fix_enc  Permit fixing up of encoding if it's frequently misused
 * \param alloc    Memory (de)allocation function
 * \param pw       Pointer to client-specific private data (may be NULL)
 * \param parser   Pointer to location to receive parser instance
 * \return HUBBUB_OK on success,
 *         HUBBUB_BADPARM on bad parameters,
 *         HUBBUB_NOMEM on memory exhaustion,
 *         HUBBUB_BADENCODING if ::enc is unsupported
 */
hubbub_error hubbub_parser_create(const char *enc, bool fix_enc,
		hubbub_allocator_fn alloc, void *pw, hubbub_parser **parser)
{
	hubbub_parser *instance;
	parserutils_error stream_status;
	hubbub_error status;

	if (parser == NULL || alloc == NULL)
		return HUBBUB_BADPARM;

	instance = alloc(NULL, sizeof(*instance), pw);
	if (instance == NULL)
		return HUBBUB_NOMEM;

	/* A caller-supplied encoding may be swapped for a corrected one,
	 * but only when the caller has opted in and the name resolves to
	 * a known MIB enum. */
	if (fix_enc == true && enc != NULL) {
		uint16_t mib = parserutils_charset_mibenum_from_name(enc,
				strlen(enc));

		if (mib != 0) {
			hubbub_charset_fix_charset(&mib);
			enc = parserutils_charset_mibenum_to_name(mib);
		}
	}

	/* An explicit encoding is passed on as confident; with none given
	 * the source is reported as unknown. */
	stream_status = parserutils_inputstream_create(enc,
			enc == NULL ? HUBBUB_CHARSET_UNKNOWN
				    : HUBBUB_CHARSET_CONFIDENT,
			hubbub_charset_extract, alloc, pw,
			&instance->stream);
	if (stream_status != PARSERUTILS_OK) {
		/* Nothing but the parser object exists yet */
		alloc(instance, 0, pw);
		return hubbub_error_from_parserutils_error(stream_status);
	}

	status = hubbub_tokeniser_create(instance->stream, alloc, pw,
			&instance->tok);
	if (status != HUBBUB_OK)
		goto fail_stream;

	status = hubbub_treebuilder_create(instance->tok, alloc, pw,
			&instance->tb);
	if (status != HUBBUB_OK)
		goto fail_tokeniser;

	instance->alloc = alloc;
	instance->pw = pw;

	*parser = instance;

	return HUBBUB_OK;

	/* Unwind in reverse order of construction */
fail_tokeniser:
	hubbub_tokeniser_destroy(instance->tok);
fail_stream:
	parserutils_inputstream_destroy(instance->stream);
	alloc(instance, 0, pw);

	return status;
}
/**
 * Feed a chunk of document data to a hubbub parser
 *
 * The data is appended to the parser's input stream and the tokeniser is
 * then run. If the tokeniser reports HUBBUB_BADENCODING, the stream's
 * charset is switched to Windows-1252 (tentatively) and the tokeniser is
 * run a second time.
 *
 * \param parser  Parser instance to use
 * \param data    Data to parse (encoded in the input charset)
 * \param len     Length, in bytes, of data
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error hubbub_parser_parse_chunk(hubbub_parser *parser,
		const uint8_t *data, size_t len)
{
	parserutils_error stream_status;
	hubbub_error status;

	if (parser == NULL || data == NULL)
		return HUBBUB_BADPARM;

	stream_status = parserutils_inputstream_append(parser->stream,
			data, len);
	if (stream_status != PARSERUTILS_OK)
		return hubbub_error_from_parserutils_error(stream_status);

	status = hubbub_tokeniser_run(parser->tok);
	if (status != HUBBUB_BADENCODING)
		return status;

	/* The autodetected encoding turned out to be one we don't support.
	 * No data is expected to have been processed at this point, so fall
	 * back to Windows-1252 and hope for the best. */
	stream_status = parserutils_inputstream_change_charset(
			parser->stream, "Windows-1252",
			HUBBUB_CHARSET_TENTATIVE);

	/* We should never arrive here after data has been processed;
	 * trap loudly if that ever turns out to be possible. */
	assert(stream_status != PARSERUTILS_INVALID);

	if (stream_status != PARSERUTILS_OK)
		return hubbub_error_from_parserutils_error(stream_status);

	/* Second attempt at tokenising, now with the fallback charset */
	return hubbub_tokeniser_run(parser->tok);
}
/**
 * Tell the parser that no further document data will arrive
 *
 * A NULL, zero-length chunk is appended to the input stream and the
 * tokeniser is then run once more.
 *
 * \param parser  Parser to inform
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error hubbub_parser_completed(hubbub_parser *parser)
{
	parserutils_error stream_status;

	if (parser == NULL)
		return HUBBUB_BADPARM;

	stream_status = parserutils_inputstream_append(parser->stream,
			NULL, 0);
	if (stream_status != PARSERUTILS_OK)
		return hubbub_error_from_parserutils_error(stream_status);

	/* The tokeniser's result is the result of the whole operation */
	return hubbub_tokeniser_run(parser->tok);
}