/*
 * Convert one HTML file into a YASEFILE XML document written to stdout.
 *
 * The file is run through the HTML SAX parser twice with the same handler
 * and context: once with the context in YS_FIRST_PASS state, after which
 * the title/author/keywords fields of the context are printed as the
 * document header, and once in YS_IN_DOC state between the opening and
 * closing YASEDOC tags.
 *
 * filename: path of the HTML file to process.
 *
 * Always returns 0; parser failures are not reported to the caller.
 */
static int
html_processor(const char *filename)
{
	htmlDocPtr htmldoc;
	htmlctx_t hctx = {0};

	/* First pass over the file. */
	hctx.state = YS_FIRST_PASS;
	htmldoc = htmlSAXParseFile(filename, NULL, html_handler, &hctx);
	if (htmldoc != NULL) {
		xmlFreeDoc(htmldoc);
	}

	/*
	 * No title was recorded: fall back to the file's base name.
	 * snprintf always NUL-terminates, whereas strncpy with the full
	 * buffer size leaves the title unterminated when the name is as
	 * long as the buffer or longer.
	 */
	if (hctx.title[0] == 0) {
		snprintf(hctx.title, sizeof hctx.title, "%s",
			filebasename(filename));
	}

	/*
	 * NOTE(review): the values below are written into XML attributes
	 * without any escaping here - confirm that html_handler stores
	 * them already escaped.
	 */
	printf("<?xml version=\"1.0\"?>\n");
	printf("<!DOCTYPE YASEFILE SYSTEM \"yase.dtd\">\n");
	printf("<YASEFILE title=\"%s\" type=\"HTML\" author=\"%s\" keywords=\"%s\">\n", hctx.title, hctx.author, hctx.keywords);
	printf("<YASEDOC title=\"%s\">\n", hctx.title);

	/* Second pass over the same file, now in document state. */
	hctx.state = YS_IN_DOC;
	htmldoc = htmlSAXParseFile(filename, NULL, html_handler, &hctx);
	if (htmldoc != NULL) {
		xmlFreeDoc(htmldoc);
	}

	printf("</YASEDOC>\n");
	printf("</YASEFILE>\n");
	xmlCleanupParser();
	return 0;
}
示例#2
0
/*
 * Run the HTML SAX parser over THIS->input_data using the handlers in
 * THIS->sax, then push 0 as the result.
 *
 * args: number of arguments on the Pike stack. When exactly one argument
 *       is given it must be a string naming the encoding; otherwise
 *       "utf-8" is used.
 *
 * Raises a Pike error when the argument is not a string or when the
 * (unimplemented) push parser is selected.
 */
static void f_parse_html(INT32 args)
{
  xmlDocPtr   doc = NULL;
  char * encoding = "utf-8";

  if ( args == 1 ) {
      if ( ARG(1).type != T_STRING )
	  Pike_error("Incorrect type for argument 0: expected string (encoding)\n");
      encoding = ARG(1).u.string->str;
  }

  /* Nothing to parse: push the result and stop here. Previously control
   * fell through, so the parser still ran on the empty input and a second
   * 0 was pushed at the end of the function. */
  if ( THIS->input_data->len == 0 ) {
    push_int(0);
    return;
  }

  switch (THIS->parsing_method) {
      case PARSE_PUSH_PARSER:
        Pike_error("Push parser not implemented yet. Please bug [email protected] to implement it.");
        /* Pike_error presumably does not return; the break guards against
         * falling into the memory parser if it ever does. */
        break;

      case PARSE_MEMORY_PARSER:
        /* input_data holds the document text itself. */
        htmlHandleOmittedElem(1);
	doc=htmlSAXParseDoc(THIS->input_data->str, encoding, THIS->sax, NULL);
        break;

      case PARSE_FILE_PARSER:
        /* input_data holds a file name.
         * NOTE(review): the encoding argument is ignored here and "utf-8"
         * is always used - confirm whether that is intentional. */
        htmlHandleOmittedElem(1);
	doc=htmlSAXParseFile(THIS->input_data->str, "utf-8", THIS->sax, NULL);
	break;

      default:
        break;
  }

  /* Free the document if the parser returned one. */
  if ( doc != NULL )
    xmlFreeDoc(doc);

  push_int(0);
}
示例#3
0
/*
 * call-seq:
 *  native_parse_file(data, encoding)
 *
 * Parse the HTML file whose path is +data+ using +encoding+. The SAX
 * handler wrapped by +self+ receives the events, with +self+ passed as
 * the user data. Returns +data+.
 */
static VALUE native_parse_file(VALUE self, VALUE data, VALUE encoding)
{
    xmlSAXHandlerPtr handler;
    htmlDocPtr doc;

    Data_Get_Struct(self, xmlSAXHandler, handler);
    doc = htmlSAXParseFile( StringValuePtr(data),
                            (const char *)StringValuePtr(encoding),
                            (htmlSAXHandlerPtr)handler,
                            (void *)self );

    /* htmlSAXParseFile hands back whatever document was built during the
     * parse; release it so it is not leaked. */
    if (doc != NULL)
        xmlFreeDoc(doc);

    return data;
}
示例#4
0
/*
 * call-seq:
 *  native_parse_file(data, encoding)
 *
 * Parse the file whose path is +data+ using +encoding+, sending SAX events
 * to the handler wrapped by +self+. Returns +data+.
 */
static VALUE native_parse_file(VALUE self, VALUE data, VALUE encoding)
{
  xmlSAXHandlerPtr sax;
  htmlDocPtr parsed;

  /* Unwrap the C SAX handler stored in the Ruby object. */
  Data_Get_Struct(self, xmlSAXHandler, sax);

  parsed = htmlSAXParseFile(
      StringValuePtr(data),
      (const char *)StringValuePtr(encoding),
      (htmlSAXHandlerPtr)sax,
      NOKOGIRI_SAX_TUPLE_NEW(NULL, self)
  );

  /* Only the SAX callbacks matter; discard any document that was built. */
  xmlFreeDoc(parsed);

  return data;
}
示例#5
0
/*
 * Parse the HTML file behind `file` with the i7_html_sax handlers and
 * return the resulting DocText records as a list.
 *
 * file:          file to parse; a new reference to it is stored in the
 *                initial DocText.
 * is_recipebook: copied into the initial DocText.
 *
 * Returns the parse context's completed_doctexts list with the context's
 * current DocText prepended.
 */
static GSList *
html_to_ascii(GFile *file, gboolean is_recipebook)
{
	/* The initial DocText. */
	DocText *first = g_slice_new0(DocText);
	first->is_recipebook = is_recipebook;
	first->file = g_object_ref(file);

	/* Parse context passed to the SAX callbacks as user data. */
	Ctxt *state = g_slice_new0(Ctxt);
	state->chars = g_string_new("");
	state->doctext = first;

	/* NOTE(review): the parser's return value is discarded - confirm that
	 * the i7_html_sax handlers never cause a document tree to be built. */
	char *filename = g_file_get_path(file);
	htmlSAXParseFile(filename, NULL, &i7_html_sax, state);
	g_free(filename);

	/* Hand the character buffer over to the initial DocText as its body;
	 * only the GString wrapper is freed. */
	first->body = g_string_free(state->chars, FALSE);

	/* Prepend whatever DocText the context points at after the parse. */
	GSList *result = g_slist_prepend(state->completed_doctexts, state->doctext);

	g_slice_free(Ctxt, state);
	return result;
}
示例#6
0
/*
 * Extract metadata from an HTML file.
 *
 * The file from `info` is run through the libxml2 HTML SAX parser with the
 * parser_start_element / parser_end_element / parser_characters callbacks,
 * which receive a parser_data holding the metadata builder and two string
 * buffers (title and plain text). Afterwards the stripped title and plain
 * text, when non-empty, are added to the builder as nie:title and
 * nie:plainTextContent.
 *
 * Always returns TRUE.
 */
G_MODULE_EXPORT gboolean
tracker_extract_get_metadata (TrackerExtractInfo *info)
{
	/* Only three callbacks are used; every other member stays zero. */
	xmlSAXHandler handler = {
		.startElement = parser_start_element,
		.endElement   = parser_end_element,
		.characters   = parser_characters,
		.initialized  = 1
	};
	TrackerSparqlBuilder *metadata;
	TrackerConfig *config;
	parser_data pd;
	htmlDocPtr doc;
	GFile *file;
	gchar *filename;

	metadata = tracker_extract_info_get_metadata_builder (info);
	file = tracker_extract_info_get_file (info);

	/* Declare the resource type before parsing. */
	tracker_sparql_builder_predicate (metadata, "a");
	tracker_sparql_builder_object (metadata, "nfo:HtmlDocument");

	/* State handed to the SAX callbacks. */
	pd.metadata = metadata;
	pd.current = -1;
	pd.in_body = FALSE;
	pd.plain_text = g_string_new (NULL);
	pd.title = g_string_new (NULL);

	/* Byte budget taken from the configuration. */
	config = tracker_main_get_config ();
	pd.n_bytes_remaining = tracker_config_get_max_bytes (config);

	filename = g_file_get_path (file);
	doc = htmlSAXParseFile (filename, NULL, &handler, &pd);
	g_free (filename);

	if (doc != NULL) {
		xmlFreeDoc (doc);
	}

	/* Trim surrounding whitespace in place before testing for content. */
	g_strstrip (pd.plain_text->str);
	g_strstrip (pd.title->str);

	if (pd.title->str != NULL && pd.title->str[0] != '\0') {
		tracker_sparql_builder_predicate (metadata, "nie:title");
		tracker_sparql_builder_object_unvalidated (metadata, pd.title->str);
	}

	if (pd.plain_text->str != NULL && pd.plain_text->str[0] != '\0') {
		tracker_sparql_builder_predicate (metadata, "nie:plainTextContent");
		tracker_sparql_builder_object_unvalidated (metadata, pd.plain_text->str);
	}

	g_string_free (pd.plain_text, TRUE);
	g_string_free (pd.title, TRUE);

	return TRUE;
}