static int html_processor(const char *filename) { struct stat statbuf = {0}; htmlDocPtr htmldoc; htmlctx_t hctx = {0}; hctx.state = YS_FIRST_PASS; htmldoc = htmlSAXParseFile(filename, NULL, html_handler, &hctx); if (htmldoc != NULL) { xmlFreeDoc(htmldoc); } if (hctx.title[0] == 0) strncpy(hctx.title, filebasename(filename), sizeof hctx.title); printf("<?xml version=\"1.0\"?>\n"); printf("<!DOCTYPE YASEFILE SYSTEM \"yase.dtd\">\n"); printf("<YASEFILE title=\"%s\" type=\"HTML\" author=\"%s\" keywords=\"%s\">\n", hctx.title, hctx.author, hctx.keywords); printf("<YASEDOC title=\"%s\">\n", hctx.title); hctx.state = YS_IN_DOC; htmldoc = htmlSAXParseFile(filename, NULL, html_handler, &hctx); if (htmldoc != NULL) { xmlFreeDoc(htmldoc); } printf("</YASEDOC>\n"); printf("</YASEFILE>\n"); xmlCleanupParser(); return 0; }
/*
 * Run the HTML SAX parser over THIS->input_data using the callbacks in
 * THIS->sax, then push integer 0 as the result.
 *
 * args: number of arguments supplied by the caller. When exactly one
 *       argument is given it must be a string naming the encoding;
 *       otherwise the encoding defaults to "utf-8".
 *
 * Raises a Pike error when the encoding argument is not a string or when
 * the (unimplemented) push parser is selected.
 */
static void f_parse_html(INT32 args)
{
    xmlDocPtr doc = NULL;
    const char *encoding = "utf-8";
    struct pike_string *encode_data = NULL;

    if (args == 1) {
        if (ARG(1).type != T_STRING)
            Pike_error("Incorrect type for argument 0: expected string (encoding)\n");
        encode_data = ARG(1).u.string;
        encoding = encode_data->str;
    }

    /*
     * Empty input: there is nothing to parse. Return right after pushing
     * the result; without the return the parser would still run and a
     * second result value would be pushed.
     */
    if (THIS->input_data->len == 0) {
        push_int(0);
        return;
    }

    switch (THIS->parsing_method) {
    case PARSE_PUSH_PARSER:
        Pike_error("Push parser not implemented yet. Please bug [email protected] to implement it.");
        /* Pike_error is expected not to return (TODO confirm); the break
         * keeps this case from falling into the memory parser if it does. */
        break;

    case PARSE_MEMORY_PARSER:
        /* input_data holds the document text itself. */
        htmlHandleOmittedElem(1);
        doc = htmlSAXParseDoc(THIS->input_data->str, encoding, THIS->sax, NULL);
        break;

    case PARSE_FILE_PARSER:
        /* input_data holds a file name.
         * NOTE(review): the caller-supplied encoding is ignored here and
         * "utf-8" is always used -- confirm whether that is intentional. */
        htmlHandleOmittedElem(1);
        doc = htmlSAXParseFile(THIS->input_data->str, "utf-8", THIS->sax, NULL);
        break;

    default:
        /* Unknown parsing method: nothing is parsed. */
        break;
    }

    if (doc != NULL)
        xmlFreeDoc(doc);

    push_int(0);
}
/*
 * call-seq:
 *  native_parse_file(data, encoding)
 *
 * SAX-parse the HTML file named by +data+ using +encoding+. The receiver
 * wraps the xmlSAXHandler whose callbacks are invoked, and is also passed
 * to them as user data. Returns +data+.
 */
static VALUE native_parse_file(VALUE self, VALUE data, VALUE encoding)
{
  xmlSAXHandlerPtr handler;
  htmlDocPtr doc;

  Data_Get_Struct(self, xmlSAXHandler, handler);

  doc = htmlSAXParseFile(
      StringValuePtr(data),
      (const char *)StringValuePtr(encoding),
      (htmlSAXHandlerPtr)handler,
      (void *)self
  );

  /* htmlSAXParseFile hands back any document tree that was built during
   * the parse; release it so it is not leaked. */
  if (doc != NULL)
    xmlFreeDoc(doc);

  return data;
}
/*
 * call-seq:
 *  native_parse_file(data, encoding)
 *
 * Run the SAX parser over the HTML file named by +data+, decoding it with
 * +encoding+. Returns +data+.
 */
static VALUE native_parse_file(VALUE self, VALUE data, VALUE encoding)
{
  xmlSAXHandlerPtr sax_handler;
  htmlDocPtr parsed;
  const char *path;
  const char *enc_name;

  /* The receiver wraps the xmlSAXHandler holding the callbacks. */
  Data_Get_Struct(self, xmlSAXHandler, sax_handler);

  path = StringValuePtr(data);
  enc_name = (const char *)StringValuePtr(encoding);

  parsed = htmlSAXParseFile(path,
                            enc_name,
                            (htmlSAXHandlerPtr)sax_handler,
                            NOKOGIRI_SAX_TUPLE_NEW(NULL, self));

  /* Drop whatever document the parse returned; only callbacks matter. */
  xmlFreeDoc(parsed);

  return data;
}
/*
 * SAX-parse the HTML file behind `file` with the i7_html_sax callbacks and
 * return the resulting list of DocText records.
 *
 * file:          the file to read; a new reference to it is stored in the
 *                initial DocText.
 * is_recipebook: copied verbatim into the initial DocText.
 *
 * Returns ctxt->completed_doctexts with ctxt->doctext prepended. The
 * temporary parsing context is released before returning; the DocText
 * records are not.
 */
static GSList *
html_to_ascii(GFile *file, gboolean is_recipebook)
{
    /* Record handed to the parser as its starting DocText. */
    DocText *initial = g_slice_new0(DocText);
    initial->is_recipebook = is_recipebook;
    initial->file = g_object_ref(file);

    /* Scratch state passed to the SAX callbacks as user data. */
    Ctxt *state = g_slice_new0(Ctxt);
    state->chars = g_string_new("");
    state->doctext = initial;

    char *filename = g_file_get_path(file);
    htmlSAXParseFile(filename, NULL, &i7_html_sax, state);
    g_free(filename);

    /* Hand the accumulated characters over to the initial record;
     * FALSE keeps the character data alive and frees only the wrapper. */
    initial->body = g_string_free(state->chars, FALSE);

    GSList *result = g_slist_prepend(state->completed_doctexts, state->doctext);
    g_slice_free(Ctxt, state);

    return result;
}
/*
 * Extract metadata from an HTML file.
 *
 * The file behind `info` is SAX-parsed with callbacks that accumulate a
 * title and plain text in a parser_data record. The resource is typed as
 * nfo:HtmlDocument, and nie:title / nie:plainTextContent are added when
 * the corresponding stripped string is non-empty.
 *
 * Always returns TRUE.
 */
G_MODULE_EXPORT gboolean
tracker_extract_get_metadata (TrackerExtractInfo *info)
{
	/* Only element start/end and character data are handled; every other
	 * callback slot is left NULL. */
	xmlSAXHandler sax = {
		.startElement = parser_start_element,
		.endElement   = parser_end_element,
		.characters   = parser_characters,
		.initialized  = 1
	};
	TrackerSparqlBuilder *builder;
	TrackerConfig *cfg;
	GFile *source;
	htmlDocPtr tree;
	parser_data pd;
	gchar *path;

	builder = tracker_extract_info_get_metadata_builder (info);
	source = tracker_extract_info_get_file (info);

	tracker_sparql_builder_predicate (builder, "a");
	tracker_sparql_builder_object (builder, "nfo:HtmlDocument");

	/* State shared with the SAX callbacks. */
	pd.metadata = builder;
	pd.current = -1;
	pd.in_body = FALSE;
	pd.plain_text = g_string_new (NULL);
	pd.title = g_string_new (NULL);

	/* Byte budget taken from the extractor configuration. */
	cfg = tracker_main_get_config ();
	pd.n_bytes_remaining = tracker_config_get_max_bytes (cfg);

	path = g_file_get_path (source);
	tree = htmlSAXParseFile (path, NULL, &sax, &pd);
	g_free (path);

	if (tree != NULL) {
		xmlFreeDoc (tree);
	}

	/* Trim surrounding whitespace in place before testing for emptiness. */
	g_strstrip (pd.plain_text->str);
	g_strstrip (pd.title->str);

	if (pd.title->str != NULL && pd.title->str[0] != '\0') {
		tracker_sparql_builder_predicate (builder, "nie:title");
		tracker_sparql_builder_object_unvalidated (builder, pd.title->str);
	}

	if (pd.plain_text->str != NULL && pd.plain_text->str[0] != '\0') {
		tracker_sparql_builder_predicate (builder, "nie:plainTextContent");
		tracker_sparql_builder_object_unvalidated (builder, pd.plain_text->str);
	}

	g_string_free (pd.plain_text, TRUE);
	g_string_free (pd.title, TRUE);

	return TRUE;
}