Пример #1
0
/**
 * Normalizes an HTML string: parses it with libxml2's HTML parser and
 * serializes the resulting document back to text.
 *
 * @param  html  HTML source to normalize
 * @return the re-serialized HTML; an empty string when the input could not
 *         be parsed or the output buffer could not be created
 */
const wxString ExtractBoardList::HtmlFormat(const wxString& html)
{
     // Declared up front so that every path returns a valid string.
     wxString val;

     // The parser is told the bytes are UTF-8, so pass UTF-8 bytes together
     // with their byte count (Len() counts characters, not bytes).
     const wxCharBuffer cb = html.utf8_str();

     htmlDocPtr docPtr = htmlReadMemory(cb.data(), static_cast<int>(::strlen(cb.data())),
                                        "", "utf-8", HTML_PARSE_RECOVER);
     if (docPtr)
     {
          // xmlOutputBufferCreateIO(iowrite, ioclose, ioctx, encoder) creates an
          // output buffer driven by the two callbacks; &val is handed to them
          // as the I/O context.
          xmlOutputBufferPtr buf = xmlOutputBufferCreateIO((xmlOutputWriteCallback)writeToWxString,
                                                           (xmlOutputCloseCallback)closeWxString,
                                                           &val, 0);
          if (buf)
          {
               htmlDocContentDumpOutput(buf,
                                        docPtr,
                                        "utf-8");

               // Closing flushes pending output and releases the buffer.
               xmlOutputBufferClose(buf);
          }

          xmlFreeDoc(docPtr);
     }

     return val;
}
Пример #2
0
Файл: wiki.c Проект: maxux/z03
/*
 * Downloads the page at `url`, parses it as HTML and returns the text content
 * of the first node matching //div[@id='mw-content-text']/p.
 *
 * Returns a copy made with strdup() (release it with free()), or NULL when the
 * download fails, the page cannot be parsed, or nothing matches.
 */
char *wiki_head(char *url) {
	curl_data_t *curl;
	xmlDoc *doc = NULL;
	xmlXPathContext *ctx = NULL;
	xmlXPathObject *xpathObj = NULL;
	xmlChar *content = NULL;
	char *text = NULL;
	
	curl = curl_data_new();
	
	if(curl_download_text(url, curl)) {
		/* NOTE(review): assumes curl_download_text() leaves ownership of
		 * `curl` with the caller on failure -- confirm against its source. */
		curl_data_free(curl);
		return NULL;
	}
	
	doc = (xmlDoc *) htmlReadMemory(curl->data, strlen(curl->data), "/", "utf-8", HTML_PARSE_NOERROR);
	if(!doc)
		goto cleanup;
	
	/* creating xpath request */
	ctx = xmlXPathNewContext(doc);
	if(!ctx)
		goto cleanup;
	
	xpathObj = xmlXPathEvalExpression((const xmlChar *) "//div[@id='mw-content-text']/p", ctx);
	
	if(xpathObj && !xmlXPathNodeSetIsEmpty(xpathObj->nodesetval)) {
		/* xmlNodeGetContent() returns a fresh allocation: fetch it once,
		 * copy it, then release it. */
		content = xmlNodeGetContent(xpathObj->nodesetval->nodeTab[0]);
		if(content) {
			text = strdup((char *) content);
			xmlFree(content);
		}
	}

cleanup:
	if(xpathObj)
		xmlXPathFreeObject(xpathObj);
	if(ctx)
		xmlXPathFreeContext(ctx);
	if(doc)
		xmlFreeDoc(doc);
	curl_data_free(curl);
	
	return text;
}
Пример #3
0
htmlDocPtr htmlParse(void *buffer, int buffer_len, void *url, void *encoding, int options, void *error_buffer, int error_buffer_len) {
	const char *c_buffer       = (char*)buffer;
	const char *c_url          = (char*)url;
	const char *c_encoding     = (char*)encoding;
	xmlDoc *doc = NULL;
	
	xmlResetLastError();
	doc = htmlReadMemory(c_buffer, buffer_len, c_url, c_encoding, options);

	if(doc == NULL) {
		xmlErrorPtr error;
	    xmlFreeDoc(doc);
	    error = xmlGetLastError();
		if(error != NULL && error_buffer != NULL && error->level >= XML_ERR_ERROR) {
			char *c_error_buffer = (char*)error_buffer;
			if (error->message != NULL) {
				strncpy(c_error_buffer, error->message, error_buffer_len-1);
				c_error_buffer[error_buffer_len-1] = '\0';
			}
			else {
				snprintf(c_error_buffer, error_buffer_len, "xml parsing error:%d", error->code);
			}
		}
	}
	return doc;
}
Пример #4
0
/**
 * Loads and parses the document this manifest item refers to.
 *
 * With libxml2 the item's bytes are read through the package's content
 * filter chain and parsed as HTML when the media type is "text/html",
 * otherwise as XML. With the Windows XML backend the document is read
 * through an ArchiveXmlReader for the item's base href.
 *
 * @return the parsed document, or nullptr when the owner, the item pointer,
 *         the byte stream or the reader is unavailable.
 */
shared_ptr<xml::Document> ManifestItem::ReferencedDocument() const
{
    // TODO: handle remote URLs
    string href(BaseHref());

    auto owner = this->Owner();
    if ( !owner )
        return nullptr;

    shared_ptr<xml::Document> document(nullptr);

#if EPUB_USE(LIBXML2)
    // Manifest items are sometimes run through the content filter chain while
    // the Package is unpacked: with some DRM schemes the navigation tables are
    // encrypted and have to be filtered before they can be parsed.
    ePub3::ManifestItemPtr self = std::const_pointer_cast<ManifestItem>(Ptr());
    if (!self)
        return nullptr;

    shared_ptr<ByteStream> stream = owner->GetFilterChainByteStream(self);
    if (!stream)
        return nullptr;

    void *bytes = nullptr;
    std::size_t byteCount = stream->ReadAllBytes(&bytes);

    // Some EPUBs carry a superfluous (erroneous?) BOM in UTF-8 XML/HTML files.
    // Either pass "utf-8" and rely on InputBuffer::read_cb (io.cpp) to skip the
    // three stray bytes (the XML parser fails otherwise), or pass NULL and let
    // the parser auto-detect the encoding. The second option is used here.
    const char * encoding = nullptr;
    //const char * encoding = "utf-8";

    const bool isHtml = ( _mediaType == "text/html" );
    xmlDocPtr parsed = isHtml
        ? htmlReadMemory((const char*)bytes, byteCount, href.c_str(), encoding, ArchiveXmlReader::DEFAULT_OPTIONS)
        : xmlReadMemory((const char*)bytes, byteCount, href.c_str(), encoding, ArchiveXmlReader::DEFAULT_OPTIONS);

    document = xml::Wrapped<xml::Document>(parsed);

    if (bytes)
        free(bytes);

#elif EPUB_USE(WIN_XML)
    // TODO: filtering referenced document through Content Filters
    // is not yet supported on Windows
    unique_ptr<ArchiveXmlReader> archiveReader = owner->XmlReaderForRelativePath(href);
    if ( !archiveReader )
        return nullptr;

    document = archiveReader->ReadDocument(href.c_str(), "utf-8", 0);
#endif

    return document;
}
Пример #5
0
/* Parse LEN bytes of HTML as UTF-8 in recovery mode, with error and warning
   reporting switched off.  Returns whatever htmlReadMemory() returns.  */
static xmlDocPtr
xhtml_parse (const gchar *html, gint len)
{
	const int parse_flags = HTML_PARSE_RECOVER
	                        | HTML_PARSE_NONET
	                        | HTML_PARSE_NOERROR
	                        | HTML_PARSE_NOWARNING;

	g_assert (html != NULL);
	g_assert (len >= 0);

	/* Note: NONET is not implemented, so the parser reports an error because
	   it does not know how to handle it.  The flag is passed anyway in case
	   support is added later.  */
	return htmlReadMemory (html, len, NULL, "utf-8", parse_flags);
}
Пример #6
0
/*
 * Parses `buffer` as a standalone HTML document and returns a deep copy of
 * its root element created for `doc`.
 *
 * The temporary document is always released before returning. Returns NULL
 * when parsing fails, the parsed document has no root element, or the copy
 * fails. error_buffer / error_buffer_len are accepted but not used here.
 */
xmlNode* htmlParseFragmentAsDoc(void *doc, void *buffer, int buffer_len, void *url, void *encoding, int options, void *error_buffer, int error_buffer_len) {
	xmlDoc* tmpDoc = NULL;
	xmlNode* tmpRoot = NULL;
	xmlNode* copiedRoot = NULL;
	tmpDoc = htmlReadMemory((char*)buffer, buffer_len, (char*)url, (char*)encoding, options);
	if (tmpDoc == NULL) {
		return NULL;
	}
	tmpRoot = xmlDocGetRootElement(tmpDoc);
	if (tmpRoot != NULL) {
		/* Last argument 1: copy the node recursively. */
		copiedRoot = xmlDocCopyNode(tmpRoot, doc, 1);
	}
	/* Free on every path; returning early without this leaked tmpDoc. */
	xmlFreeDoc(tmpDoc);
	return copiedRoot;
}
Пример #7
0
Файл: google.c Проект: maxux/z03
/*
 * Runs a search for `keywords` and collects the title and href of every node
 * matching //li/div/h3/a in the downloaded page.
 *
 * Returns a calloc'ed google_search_t (its result array and the title/url
 * strings are heap copies), with length 0 when the page cannot be parsed or
 * nothing matches. Returns NULL when the download or an allocation fails.
 */
google_search_t * google_search(char *keywords) {
	curl_data_t *curl;
	google_search_t *search;
	xmlDoc *doc = NULL;
	xmlXPathContext *ctx = NULL;
	xmlXPathObject *xpathObj = NULL;
	xmlNode *node = NULL;
	xmlChar *value = NULL;
	char url[2048];
	int i;
	
	curl = curl_data_new();
	
	/* NOTE(review): if space_encode() returns allocated memory it is never
	 * released here -- confirm its ownership contract. */
	snprintf(url, sizeof(url), "%s%s", baseurlen, space_encode(keywords));
	
	if(curl_download_text(url, curl)) {
		/* NOTE(review): assumes curl_download_text() leaves ownership of
		 * `curl` with the caller on failure -- confirm. */
		curl_data_free(curl);
		return NULL;
	}
	
	search = (google_search_t *) calloc(1, sizeof(google_search_t));
	if(!search) {
		curl_data_free(curl);
		return NULL;
	}
	
	doc = (xmlDoc *) htmlReadMemory(curl->data, strlen(curl->data), "/", "utf-8", HTML_PARSE_NOERROR);
	if(!doc)
		goto cleanup;
	
	/* creating xpath request */
	ctx = xmlXPathNewContext(doc);
	if(!ctx)
		goto cleanup;
	
	xpathObj = xmlXPathEvalExpression((const xmlChar *) "//li/div/h3/a", ctx);
	
	if(xpathObj && !xmlXPathNodeSetIsEmpty(xpathObj->nodesetval)) {
		search->result = (google_result_t *) calloc(xpathObj->nodesetval->nodeNr, sizeof(google_result_t));
		
		/* length is only published once the array really exists */
		if(search->result) {
			search->length = xpathObj->nodesetval->nodeNr;
			
			for(i = 0; i < xpathObj->nodesetval->nodeNr; i++) {
				node = xpathObj->nodesetval->nodeTab[i];
				
				/* both calls return fresh allocations: fetch once,
				 * copy, release */
				value = xmlNodeGetContent(node);
				if(value) {
					search->result[i].title = strdup((char *) value);
					xmlFree(value);
				}
				
				value = xmlGetProp(node, (unsigned char *) "href");
				if(value) {
					search->result[i].url = strdup((char *) value);
					xmlFree(value);
				}
			}
		}
	}

cleanup:
	if(xpathObj)
		xmlXPathFreeObject(xpathObj);
	if(ctx)
		xmlXPathFreeContext(ctx);
	if(doc)
		xmlFreeDoc(doc);
	curl_data_free(curl);
	
	return search;
}
Пример #8
0
/**
 * execute_xpath_expression:
 * @filename:		despite the name, the HTML text itself (a NUL-terminated
 *			string); it is parsed from memory, not opened as a file.
 * @xpathExpr:		the xpath expression for evaluation.
 * @get:		forwarded unchanged to print_xpath_nodes().
 * @num:		forwarded unchanged to print_xpath_nodes().
 *
 * Parses the HTML string, evaluates the XPath expression and hands the
 * resulting node set to print_xpath_nodes().
 *
 * Returns the value returned by print_xpath_nodes(), or -1 when parsing,
 * context creation or evaluation fails.
 */
int execute_xpath_expression(const char* filename, const xmlChar* xpathExpr, char **get, int num) {
    xmlDocPtr doc = NULL;
    xmlXPathContextPtr xpathCtx = NULL;
    xmlXPathObjectPtr xpathObj = NULL;
    int outcome = -1;

    assert(filename);
    assert(xpathExpr);

    /* Parse the HTML held in memory */
    doc = htmlReadMemory(filename, strlen(filename), NULL, NULL,  HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
    if (doc == NULL) {
        fprintf(stderr, "Error: unable to parse this string\n");
        return(-1);
    }

    /* Create xpath evaluation context */
    xpathCtx = xmlXPathNewContext(doc);
    if (xpathCtx != NULL) {
        /* Evaluate xpath expression */
        xpathObj = xmlXPathEvalExpression(xpathExpr, xpathCtx);
        if (xpathObj != NULL) {
            /* Hand the node set over and keep the callee's result */
            outcome = print_xpath_nodes(xpathObj->nodesetval, get, num);
            xmlXPathFreeObject(xpathObj);
        } else {
            fprintf(stderr,"Error: unable to evaluate xpath expression \"%s\"\n", xpathExpr);
        }
        xmlXPathFreeContext(xpathCtx);
    } else {
        fprintf(stderr,"Error: unable to create new XPath context\n");
    }

    /* The document is released last on every path */
    xmlFreeDoc(doc);

    return outcome;
}
Пример #9
0
/* HREF finder implemented in libxml2 but could be any HTML parser.
 *
 * Parses the page held in `mem`, evaluates //a/@href and, once per node in
 * the result, picks a random node and queues its link on `multi_handle` when
 * it is at least 20 characters long and starts with http:// or https://.
 * Relative links are resolved against `url` when follow_relative_links is set.
 *
 * Returns the number of links counted; 0 when the page cannot be parsed or
 * contains no links.
 */
size_t follow_links(CURLM *multi_handle, memory *mem, char *url)
{
  int opts = HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | \
             HTML_PARSE_NOWARNING | HTML_PARSE_NONET;
  htmlDocPtr doc = htmlReadMemory(mem->buf, mem->size, url, NULL, opts);
  if(!doc)
    return 0;
  xmlChar *xpath = (xmlChar*) "//a/@href";
  xmlXPathContextPtr context = xmlXPathNewContext(doc);
  xmlXPathObjectPtr result = xmlXPathEvalExpression(xpath, context);
  xmlXPathFreeContext(context);
  if(!result) {
    xmlFreeDoc(doc);
    return 0;
  }
  xmlNodeSetPtr nodeset = result->nodesetval;
  if(xmlXPathNodeSetIsEmpty(nodeset)) {
    xmlXPathFreeObject(result);
    xmlFreeDoc(doc);
    return 0;
  }
  size_t count = 0;
  for(int i = 0; i < nodeset->nodeNr; i++) {
    double r = rand();
    /* Divide by RAND_MAX + 1 so the index stays in [0, nodeNr); dividing by
     * RAND_MAX yields nodeNr itself when rand() returns RAND_MAX. */
    int x = (int)(r * nodeset->nodeNr / ((double)RAND_MAX + 1.0));
    const xmlNode *node = nodeset->nodeTab[x]->xmlChildrenNode;
    xmlChar *href = xmlNodeListGetString(doc, node, 1);
    if(follow_relative_links) {
      xmlChar *orig = href;
      href = xmlBuildURI(href, (xmlChar *) url);
      xmlFree(orig);
    }
    char *link = (char *) href;
    int stop = 0;
    if(link && strlen(link) >= 20 &&
       (!strncmp(link, "http://", 7) || !strncmp(link, "https://", 8))) {
      curl_multi_add_handle(multi_handle, make_handle(link));
      /* the comparison uses the value before the increment, as before */
      if(count++ == max_link_per_page)
        stop = 1;
    }
    /* Release the string on every path, including skipped links and the
     * final iteration. */
    if(link)
      xmlFree(link);
    if(stop)
      break;
  }
  xmlXPathFreeObject(result);
  xmlFreeDoc(doc);
  return count;
}
Пример #10
0
/**
 * Collects the responses selected by DDNodeHasTarget() from a thread's HTML
 * and wraps them in the popup header and footer.
 * @param  const wxString& rawHtml        HTML of the thread
 * @param  const wxString& extractIndex   index passed to DDNodeHasTarget()
 * @return wxString        HTML_HEADER_POPUP + the matching nodes + HTML_FOOTER;
 *                         header and footer only when nothing matches or the
 *                         HTML cannot be parsed
 */
wxString XrossBoardUtil::FindResponseByIndex(const wxString& rawHtml, const wxString& extractIndex) {

     // wxString::mb_str returns CP932 on Windows, so convert to a UTF-8
     // wxCharBuffer first.
     const wxCharBuffer &cb = rawHtml.utf8_str();
     const htmlDocPtr docPtr = htmlReadMemory(cb.data(), ::strlen(cb.data()), "", "utf-8", 
                                              HTML_PARSE_RECOVER|HTML_PARSE_NOERROR|HTML_PARSE_NOWARNING);

     // Start the resulting HTML with the popup header
     wxString lumpOfHTML = HTML_HEADER_POPUP;

     if (docPtr)
     {
          const htmlNodePtr root = xmlDocGetRootElement(docPtr);
          // The second child of the root element is treated as <body>; a
          // document without one is skipped instead of dereferencing NULL.
          const htmlNodePtr body = (root != NULL && root->children != NULL) ? root->children->next : NULL;
          for (htmlNodePtr node = (body != NULL) ? body->children : NULL; node != NULL; node = node->next)
          {
               if (node->type == XML_ELEMENT_NODE && 
                   xmlStrcasecmp(node->name, (const xmlChar*) "dd") == 0)
               {
                    const htmlNodePtr dd = node->children;
                    if (DDNodeHasTarget(dd, extractIndex))
                    {
                         xmlBufferPtr buffer = xmlBufferCreate();
                         if (buffer != NULL)
                         {
                              // Dump the preceding sibling first, then the <dd> itself.
                              if (node->prev != NULL)
                                   xmlNodeDump(buffer, docPtr, node->prev, 0, 1);
                              xmlNodeDump(buffer, docPtr, node, 0, 1);
                              lumpOfHTML += wxString::FromUTF8(reinterpret_cast<const char*>(buffer->content));

                              // The buffer was previously leaked for every match.
                              xmlBufferFree(buffer);
                         }
                    }
               }
          }

          xmlFreeDoc(docPtr);
          // xmlCleanupParser() is deliberately not called here: it tears down
          // library-wide state and belongs at application shutdown.
     }

     // Append the footer
     lumpOfHTML += HTML_FOOTER;

     return lumpOfHTML;
}
Пример #11
0
/**
 * Normalizes an HTML string: parses it with libxml2's HTML parser and
 * serializes the resulting document back to text.
 *
 * @param  html  HTML source to normalize
 * @return the re-serialized HTML; an empty string when the input could not
 *         be parsed or the output buffer could not be created
 */
const wxString ExtractBoardList::HtmlFormat(const wxString& html)
{
     wxString val;
     // Parse UTF-8 bytes, matching the encoding name given to the parser.
     const wxCharBuffer& cb = html.utf8_str();

     htmlDocPtr docPtr = htmlReadMemory(cb.data(), ::strlen(cb.data()), "", "utf-8", HTML_PARSE_RECOVER);
     if (docPtr)
     {
          // The output buffer forwards to the two callbacks; &val is handed
          // to them as the I/O context.
          xmlOutputBufferPtr buf = xmlOutputBufferCreateIO((xmlOutputWriteCallback)writeToWxString,
                                                           (xmlOutputCloseCallback)closeWxString,
                                                           &val, 0);
          if (buf)
          {
               htmlDocContentDumpOutput(buf,
                                        docPtr, 
                                        "utf-8");

               // Closing flushes pending output and releases the buffer.
               xmlOutputBufferClose(buf);
          }
          xmlFreeDoc(docPtr);
     }
     // xmlCleanupParser() is deliberately not called here: it tears down
     // library-wide state and belongs at application shutdown, not after
     // every conversion.

     return val;
}
Пример #12
0
/*
 * HTTP callback: parses a buddy's profile page.
 *
 * - When the buddy has no ref_id yet, reads it from the hidden "ref_id"
 *   input and stores it on the real buddy if there is one, otherwise on the
 *   buddy itself.
 * - When request->advertise is set, extracts the short description and the
 *   "research" paragraphs and shows them through purple_notify_userinfo().
 * - Finally invokes request->callback, if any.
 *
 * On a parse or XPath failure the function logs, releases what it allocated
 * and returns without invoking the chained callback.
 */
static void ga_gabuddy_parse_info_cb(HttpHandler* handler, gchar* response, gsize len, gpointer userdata)
{
	htmlDocPtr doc;
	xmlXPathContextPtr xpathCtx;
	xmlXPathObjectPtr xpathObj;
	GayAttitudeAccount *gaa = handler->data;
	GayAttitudeBuddyInfoRequest *request = userdata;

	purple_debug(PURPLE_DEBUG_INFO, "gayattitude", "ga_buddy: Fetching info for '%s'.\n", request->gabuddy->buddy->name);

	doc = htmlReadMemory(response, len, "gayattitude.xml", NULL, 0);
	if (doc == NULL)
	{
		purple_debug(PURPLE_DEBUG_ERROR, "gayattitude", "ga_buddy: Unable to parse response (XML Parsing).\n");
		return;
	}

	/* Create xpath evaluation context */
	xpathCtx = xmlXPathNewContext(doc);
	if(xpathCtx == NULL)
	{
		purple_debug(PURPLE_DEBUG_ERROR, "gayattitude", "ga_buddy: Unable to parse response (XPath context init).\n");
		xmlFreeDoc(doc);
		return;
	}

	xmlNode *info_node;

	/* Search internal Ref ID */
	if (!request->gabuddy->ref_id)
	{
		purple_debug(PURPLE_DEBUG_INFO, "gayattitude", "ga_buddy: Fetching missing ref_id for '%s'.\n", request->gabuddy->buddy->name);

		xpathObj = xmlXPathEvalExpression((xmlChar*) "//input[@type='hidden' and @name='ref_id']", xpathCtx);
		if(xpathObj == NULL)
		{
			purple_debug(PURPLE_DEBUG_ERROR, "gayattitude", "ga_buddy: Unable to parse response (XPath evaluation).\n");
			xmlXPathFreeContext(xpathCtx);
			xmlFreeDoc(doc);
			return;
		}
		if (!xmlXPathNodeSetIsEmpty(xpathObj->nodesetval))
		{
			gchar *ref_id;

			info_node = xpathObj->nodesetval->nodeTab[0];
			/* Ownership of the string returned by xmlGetProp moves to the buddy. */
			ref_id  = (gchar*) xmlGetProp(info_node, (xmlChar*) "value");
			if (request->gabuddy->real_gabuddy)
				request->gabuddy->real_gabuddy->ref_id = ref_id;
			else
				request->gabuddy->ref_id = ref_id;
			/* Log the value just read: request->gabuddy->ref_id stays unset
			 * when the id was stored on real_gabuddy. */
			purple_debug(PURPLE_DEBUG_INFO, "gayattitude", "ga_buddy: Found ref_id for '%s': %s.\n", request->gabuddy->buddy->name, ref_id ? ref_id : "(null)");
		}
		xmlXPathFreeObject(xpathObj);
	}

	if (request->advertise)
	{
		PurpleNotifyUserInfo *user_info = purple_notify_user_info_new();
		int i;
		GString *str = NULL;

		/* Search short description */
		/* NOTE(review): doc->parent is presumably NULL for a document node;
		 * the expressions below are absolute, so this looks harmless --
		 * confirm whether (xmlNodePtr) doc was intended. */
		xpathCtx->node = doc->parent;
		xpathObj = xmlXPathEvalExpression((xmlChar*) "//div[@id='PORTRAITHEADER2']/p/text()", xpathCtx);
		if(xpathObj == NULL)
		{
			purple_debug(PURPLE_DEBUG_ERROR, "gayattitude", "ga_buddy: Unable to parse response (XPath evaluation).\n");
			/* user_info was leaked on this path before */
			purple_notify_user_info_destroy(user_info);
			xmlXPathFreeContext(xpathCtx);
			xmlFreeDoc(doc);
			return;
		}
		if (!xmlXPathNodeSetIsEmpty(xpathObj->nodesetval))
		{
			info_node = xpathObj->nodesetval->nodeTab[0];
			purple_notify_user_info_add_pair(user_info, "Short Description", (gchar*) info_node->content);
		}
		xmlXPathFreeObject(xpathObj);

		/* Search user research */
		xpathCtx->node = doc->parent;
		xpathObj = xmlXPathEvalExpression((xmlChar*) "//div[@id='bloc_recherche']/p/text()", xpathCtx);
		if(xpathObj == NULL)
		{
			purple_debug(PURPLE_DEBUG_ERROR, "gayattitude", "ga_buddy: Unable to parse response (XPath evaluation).\n");
			/* user_info was leaked on this path before */
			purple_notify_user_info_destroy(user_info);
			xmlXPathFreeContext(xpathCtx);
			xmlFreeDoc(doc);
			return;
		}
		if (!xmlXPathNodeSetIsEmpty(xpathObj->nodesetval))
		{
			/* Join all matching text nodes with " -- " */
			for(i = 0; i < xpathObj->nodesetval->nodeNr; i++)
			{
				info_node = xpathObj->nodesetval->nodeTab[i];
				if (i == 0)
					str = g_string_new((gchar*) info_node->content);
				else
					g_string_append_printf(str, " -- %s", info_node->content);
			}
			purple_notify_user_info_add_pair(user_info, "Research", str->str);
			g_string_free(str, TRUE);
		}
		xmlXPathFreeObject(xpathObj);

		purple_notify_userinfo(gaa->pc, request->gabuddy->buddy->name, user_info, NULL, NULL);
		purple_notify_user_info_destroy(user_info);
	}

	/* Cleanup */
	xmlXPathFreeContext(xpathCtx);
	xmlFreeDoc(doc);

	/* Chained Callback */
	if (request->callback)
	{
		purple_debug(PURPLE_DEBUG_INFO, "gayattitude", "ga_buddy: Calling callback after info for '%s' was retrieved\n", request->gabuddy->buddy->name);
		request->callback(gaa, request->callback_data);
	}
}
Пример #13
0
#include "DobHtmlParser.h"


/// Creates a parser with no document and no XPath expression.
DobHtmlParser::DobHtmlParser () {
    // Both pointers are tested before being freed in the destructor, so
    // neither may be left uninitialized.
    doc = NULL;
    xpath = NULL;
}
/// Creates a parser and immediately loads `document` and `data_xpath`
/// through setDoc() and setXpath().
DobHtmlParser::DobHtmlParser (QString *document, QString *data_xpath )/*{{{*/
{
    // Start from a known state: the setters and the destructor inspect these
    // pointers before freeing them.
    doc = NULL;
    xpath = NULL;
    setDoc ( document );
    setXpath ( data_xpath );
}/*}}}*/
/// Releases the parsed document and the XPath string owned by this object.
DobHtmlParser::~DobHtmlParser ()/*{{{*/
{
    if ( doc )
        xmlFreeDoc( doc );
    if ( xpath )
        xmlFree(xpath);
    // xmlCleanupParser() is deliberately not called here: it tears down
    // library-wide libxml2 state, which must not happen whenever a single
    // parser object is destroyed. Call it once at application shutdown.
}/*}}}*/
/// Replaces the current document with the result of parsing `document` as
/// HTML. A previously held document is freed first; passing a null pointer
/// just clears the current document.
void DobHtmlParser::setDoc ( QString *document )/*{{{*/
{
    if ( doc ) {
        xmlFreeDoc ( doc );
        doc = NULL; // never keep a dangling pointer if parsing is skipped
    }
    if ( !document )
        return;

    // Encode as UTF-8 and say so to the parser: toLatin1() cannot represent
    // characters outside Latin-1. The byte array is kept in a local so the
    // pointer and the byte count refer to the same buffer.
    const QByteArray utf8 = document->toUtf8();
    doc = htmlReadMemory ( utf8.constData(), static_cast<int>( utf8.size() ), "noname.html", "UTF-8", 0 );
}/*}}}*/
Пример #14
0
/**
 * Colors the index link of every response according to how many ">>xx"
 * anchors point at it.
 *
 * The HTML is parsed to count, per response number, the <a> elements inside
 * <dd><table> that carry an attribute with the value "_blank". Each index
 * matched by regexIndex is then rewritten as a colored link:
 * 0 anchors -> #0000ff, 1 to 4 -> #ff00ff, 5 or more -> #ff0000.
 *
 * @param  html  HTML of the thread
 * @return the rewritten HTML, or the input unchanged when it cannot be
 *         parsed or regexIndex does not match
 */
wxString XrossBoardUtil::AddColorAnchoredID(const wxString& html)
{
     // The parser is told the bytes are UTF-8, so convert explicitly instead
     // of using mb_str(), which yields the locale encoding.
     const std::string temporary = std::string(html.utf8_str());
     const htmlDocPtr docPtr = htmlReadMemory(temporary.c_str(), temporary.size(), "", "utf-8", 
                                              HTML_PARSE_RECOVER|HTML_PARSE_NOERROR|HTML_PARSE_NOWARNING);

     WX_DECLARE_STRING_HASH_MAP( int, ExtractIdHash );
     ExtractIdHash hashmap;

     if (!docPtr)
     {
          // Nothing was parsed, so there is nothing to free; return as-is.
          return html;
     }

     const htmlNodePtr root = xmlDocGetRootElement(docPtr);
     // The second child of the root element is treated as <body>; a document
     // without one is skipped instead of dereferencing NULL.
     const htmlNodePtr body = (root != NULL && root->children != NULL) ? root->children->next : NULL;
     for (htmlNodePtr node = (body != NULL) ? body->children : NULL; node != NULL; node = node->next)
     {
          if (node->type != XML_ELEMENT_NODE ||
              xmlStrcasecmp(node->name, (const xmlChar*) "dd") != 0)
          {
               continue;
          }

          // Only a <dd> whose first child is a <table> is inspected.
          const htmlNodePtr dd = node->children;
          if (dd == NULL || xmlStrcasecmp(dd->name, (const xmlChar*) "table") != 0)
          {
               continue;
          }

          for (htmlNodePtr ptr = dd->children; ptr != NULL; ptr = ptr->next)
          {
               if (ptr->type != XML_ELEMENT_NODE ||
                   xmlStrcasecmp(ptr->name, (const xmlChar*) "a") != 0)
               {
                    continue;
               }

               for (xmlAttr* attribute = ptr->properties;
                    attribute && attribute->name && attribute->children;
                    attribute = attribute->next)
               {
                    xmlChar* value = xmlNodeListGetString(ptr->doc, attribute->children, 1);
                    if (value == NULL)
                    {
                         continue;
                    }

                    if (xmlStrcasecmp(value, (const xmlChar*) "_blank") == 0 &&
                        ptr->children != NULL && ptr->children->content != NULL)
                    {
                         // >>xxx (= ptr->children->content); the data is not an
                         // entity reference, e.g. ">>12"
                         const wxString anchor = wxString::FromUTF8(reinterpret_cast<const char*>(ptr->children->content));

                         // Skip anchors too short to carry a number after ">>".
                         if (anchor.Len() > 2)
                         {
                              const wxString number = anchor.SubString(2, anchor.Len() - 1);

                              if (hashmap.find( number ) == hashmap.end()) {
                                   // First time this number is seen: add it
                                   hashmap[number] = 1;
                              } else {
                                   // Increase the anchor count; writing
                                   // hashmap[number]++ reportedly misbehaves
                                   // when optimized by clang
                                   hashmap[number] = hashmap[number] + 1;
                              }
                         }
                    }

                    xmlFree(value);
               }
          }
     }

     xmlFreeDoc(docPtr);
     // xmlCleanupParser() is deliberately not called here: it tears down
     // library-wide state and belongs at application shutdown.

     // Counting is done; rewrite every index as a colored link.
     // sample -> <a href="#2">2</a>
     // regexIndex is the expression actually used below, so that is the one
     // whose validity must be checked.
     if (!regexIndex.IsValid() || !regexIndex.Matches(html))
     {
          // On failure return the input unchanged
          return html;
     }

     wxString tmp = html;
     wxString result;
     size_t start = 0;
     size_t len = 0;

     while (regexIndex.Matches(tmp))
     {
          const wxString index = regexIndex.GetMatch(tmp, 1);

          const int anchored = hashmap[index];
          wxString color;
          if (anchored == 0)
               color = wxT("#0000ff");
          else if (anchored >= 1 && anchored <= 4)
               color = wxT("#ff00ff");
          else
               color = wxT("#ff0000");

          regexIndex.GetMatch(&start, &len, 0);
          // Text before the match; Mid(0, start) is well defined for start == 0.
          result += tmp.Mid(0, start);
          result += wxT("<a href=\"#");
          result += index;
          result += wxT("\"><font color=\"");
          result += color;
          result += wxT("\">");
          result += index;
          result += wxT("</font></a>");

          // Continue with the text after the match.
          tmp = tmp.Mid(start + len);
     }

     result += tmp;

     return result;
}
Пример #15
0
    /** Parse the 'detailed' Anime fields from HTML
     *
     * 
     */
    std::shared_ptr<Anime> AnimeSerializer::deserialize_details(const std::string& xml) const
    {
        typedef std::unique_ptr<xmlChar, XmlCharDeleter> xmlStringUPtr;
        auto res = std::make_shared<Anime>();
		std::unique_ptr<char[]> cstr(new char[xml.size()]);
		std::memcpy(cstr.get(), xml.c_str(), xml.size());
        std::unique_ptr<xmlDoc, XmlDocDeleter> doc(htmlReadMemory(cstr.get(), xml.size(), "http://myanimelist.net/",
                                                                   nullptr, HTML_PARSE_RECOVER
                                                                   | HTML_PARSE_NOERROR 
                                                                   | HTML_PARSE_NOWARNING
                                                                   | HTML_PARSE_NONET));
		std::unique_ptr<xmlTextReader, xmlTextReaderDeleter> reader(xmlReaderWalker(doc.get()));
		if (!reader) {
			std::cerr << "Error: Couldn't create XML reader" << std::endl;
            std::cerr << "XML follows: " << xml << std::endl;
			return nullptr;
		}
        
        enum { PRIORITY, STORAGE, REWATCHVALUE, DISCUSS, SELECTOR_NONE } selector = SELECTOR_NONE;
        enum { TAGS, COMMENTS, NONE } textarea = NONE;
        std::string textbuf;
        int ret = 1;
		for( ret = xmlTextReaderRead(reader.get()); ret == 1;
		     ret = xmlTextReaderRead(reader.get()) ) {
			const std::string name  = xmlchar_to_str(xmlTextReaderConstName (reader.get()));

            if (name == "input") {
                xmlStringUPtr type(xmlTextReaderGetAttribute(reader.get(), "type"_xml));
                xmlStringUPtr attr_name(xmlTextReaderGetAttribute(reader.get(), "name"_xml));
                xmlStringUPtr attr_value(xmlTextReaderGetAttribute(reader.get(), "value"_xml));
                if (type) {
                    if (xmlStrEqual(type.get(), "text"_xml) || xmlStrEqual(type.get(), "checkbox"_xml)) {
                        if (xmlStrEqual(attr_name.get(), "fansub_group"_xml))
                            res->set_fansub_group(xmlchar_to_str(attr_value.get()));
                        else if (xmlStrEqual(attr_name.get(), "list_downloaded_eps"_xml))
                            res->set_downloaded_items(xmlchar_to_str(attr_value.get()));
                        else if (xmlStrEqual(attr_name.get(), "list_times_watched"_xml))
                            res->set_times_consumed(xmlchar_to_str(attr_value.get()));
                        else if (xmlStrEqual(attr_name.get(), "storageVal"_xml))
                            res->set_storage_value(xmlchar_to_str(attr_value.get()));
                    }
                }
            } else if (name == "textarea" && xmlTextReaderNodeType(reader.get()) == XML_READER_TYPE_ELEMENT) {
                xmlStringUPtr attr_name(xmlTextReaderGetAttribute(reader.get(), "name"_xml));
                if (xmlStrEqual(attr_name.get(), "tags"_xml)) textarea = TAGS;
                else if (xmlStrEqual(attr_name.get(), "list_comments"_xml)) textarea = COMMENTS;
                else textarea = NONE;
                textbuf.clear();
            } else if (name == "textarea" && xmlTextReaderNodeType(reader.get()) == XML_READER_TYPE_END_ELEMENT) {
                if (textarea != NONE) {
                    switch (textarea) {
                        case TAGS:
                            /* Not a 'detailed' field */
                            break;
                        case COMMENTS:
                            res->set_comments(std::string(textbuf));
                            break;
                        case NONE:
                        default:
                            break;
                    }
                    textarea = NONE;
                }
            } else if (name == "#text" && textarea != NONE) {
                textbuf.append(xmlchar_to_str(xmlTextReaderConstValue(reader.get())));
            } else if (name == "select" && xmlTextReaderNodeType(reader.get()) == XML_READER_TYPE_ELEMENT) {
                xmlStringUPtr attr_name(xmlTextReaderGetAttribute(reader.get(), "name"_xml));
                if (xmlStrEqual(attr_name.get(), "priority"_xml)) selector = PRIORITY;
                if (xmlStrEqual(attr_name.get(), "storage"_xml)) selector = STORAGE;
                if (xmlStrEqual(attr_name.get(), "list_rewatch_value"_xml)) selector = REWATCHVALUE;
                if (xmlStrEqual(attr_name.get(), "discuss"_xml)) selector = DISCUSS;                
            } else if (name == "select" && xmlTextReaderNodeType(reader.get()) == XML_READER_TYPE_END_ELEMENT)  {
                selector = SELECTOR_NONE;
            } else if (name == "option" && xmlTextReaderNodeType(reader.get()) == XML_READER_TYPE_ELEMENT) {
                xmlStringUPtr value(xmlTextReaderGetAttribute(reader.get(), "value"_xml));
                if (xmlTextReaderMoveToAttribute(reader.get(), "selected"_xml) == 1) {
                    switch (selector) {
                        case PRIORITY:
                            res->set_priority(xmlchar_to_str(value.get()));
                            break;
                        case STORAGE:
                            res->set_storage_value(xmlchar_to_str(value.get()));
                            break;
                        case REWATCHVALUE:
                            res->set_reconsume_value(xmlchar_to_str(value.get()));
                            break;
                        case DISCUSS:
                            res->set_enable_discussion(xmlchar_to_str(value.get()));
                            break;
                        case SELECTOR_NONE:
                        default:
                            break;
                    }
                }
            }
        }

        if (ret != 0) return nullptr; // Some sort of parsing error
        
        return res;
    }
Пример #16
0
/* Parse the buffer text between START and END as HTML (when HTMLP is
   nonzero) or as XML, using BASE_URL as the document's base URL when it is
   non-nil, and convert the result with make_dom.

   Conversion starts at the second top-level node of the document.  A single
   converted node is returned directly; several are wrapped in a `top' node.
   Returns nil when the text cannot be parsed or there is nothing to
   convert.  */
static Lisp_Object
parse_region (Lisp_Object start, Lisp_Object end, Lisp_Object base_url, int htmlp)
{
  xmlDoc *doc;
  Lisp_Object result = Qnil;
  const char *burl = "";
  EMACS_INT bytes;
  EMACS_INT istart, iend;

  LIBXML_TEST_VERSION;

  validate_region (&start, &end);

  istart = XINT (start);
  iend = XINT (end);

  /* Move the gap out of the region so its text is contiguous.  */
  if (istart < GPT && GPT < iend)
    move_gap (iend);

  if (! NILP (base_url))
    {
      CHECK_STRING (base_url);
      burl = SSDATA (base_url);
    }

  bytes = CHAR_TO_BYTE (iend) - CHAR_TO_BYTE (istart);

  if (htmlp)
    doc = htmlReadMemory ((char *) BYTE_POS_ADDR (CHAR_TO_BYTE (istart)),
			  bytes, burl, "utf-8",
			  HTML_PARSE_RECOVER|HTML_PARSE_NONET|
			  HTML_PARSE_NOWARNING|HTML_PARSE_NOERROR|
			  HTML_PARSE_NOBLANKS);
  else
    doc = xmlReadMemory ((char *) BYTE_POS_ADDR (CHAR_TO_BYTE (istart)),
			 bytes, burl, "utf-8",
			 XML_PARSE_NONET|XML_PARSE_NOWARNING|
			 XML_PARSE_NOBLANKS |XML_PARSE_NOERROR);

  if (doc != NULL)
    {
      /* A document without children must not be dereferenced.  */
      xmlNode *n = doc->children ? doc->children->next : NULL;
      Lisp_Object r = Qnil;

      /* R always holds the most recently converted node; earlier ones are
	 pushed onto RESULT in reverse order.  */
      while (n) {
	if (!NILP (r))
	  result = Fcons (r, result);
	r = make_dom (n);
	n = n->next;
      }

      if (NILP (result))
	result = r;
      else
	result = Fcons (intern ("top"),
			Fcons (Qnil, Fnreverse (Fcons (r, result))));

      xmlFreeDoc (doc);
      /* xmlCleanupParser is deliberately not called here: it tears down
	 library-wide state and must not run after every parse.  */
    }

  return result;
}
Пример #17
0
/*
 * Demo driver: extracts the URL from a hard-coded "[URL]...[/URL]" message,
 * downloads the page, reads /html/head/title from it and prints a new
 * message that quotes the title in front of the link.
 *
 * Returns 1 when the new message was built, 0 on any failure.
 */
int _tmain(int argc, _TCHAR* argv[])
{
    char message[100] = "[URL]https://www.youtube.com/watch?v=wCRStRWMdWM#t=39s[/URL]\0";

	CURLcode curlCode;
	const char *curlMessage = "";
	struct MemoryStruct chunk;

	htmlDocPtr doc;
	xmlXPathContextPtr context;
	xmlXPathObjectPtr result;
	char * keyword = NULL;

	int i;
	char newMessage[1024];
	char errorMessage[128];
	const char* url;

    //printf("Enter your message: ");
    //fgets(message, 100, stdin);
    printf("Your Message is: %s\n", message);

	system("pause");

	url = GetURLFromMessage(message);
			
	chunk.memory = (char *) malloc(1);  /* grown as needed while downloading */ 
	chunk.size = 0;    /* no data at this point */

	if (url == NULL) {
		printf("URL is null, exiting...\n");
		system("pause");
		free(chunk.memory);
		return 0;
	}

	/* Downloaded or user-supplied text is never used as a format string. */
	printf("URL: %s\n", url);

	GetHTML(url, &chunk, &curlCode, curlMessage);
	
	printf("curlMessage: %s\n", curlMessage);

	/* NOTE(review): chunk.size is assumed to be an integral byte count; it is
	 * cast so the conversion specifier matches on every platform. */
#if defined(_WIN32) || defined(WIN32) || defined(_WIN64) || defined(WIN64)
	sprintf_s(errorMessage, 128, "Reading HTML file that is the following bytes long: %lu", (unsigned long) chunk.size);
#else
	snprintf(errorMessage, sizeof(errorMessage), "Reading HTML file that is the following bytes long: %lu", (unsigned long) chunk.size);
#endif

	doc = htmlReadMemory(chunk.memory, chunk.size, url, NULL, HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
	if (!doc) {
		printf("Could not read HTML document from memory\n");
		system("pause");
		free(chunk.memory);
		return 0;
	}

	context = xmlXPathNewContext(doc);
	result = context ? xmlXPathEvalExpression((const xmlChar *) "/html/head/title", context) : NULL;

	if (result == NULL || xmlXPathNodeSetIsEmpty(result->nodesetval)) {
		printf("Could not read HTML node set from memory\n");
		system("pause");
		if (result)
			xmlXPathFreeObject(result);
		if (context)
			xmlXPathFreeContext(context);
		xmlFreeDoc(doc);
		free(chunk.memory);
		return 0;
	}

	/* The last matching title wins; earlier strings are released instead of
	 * being leaked. */
	for (i=0; i < result->nodesetval->nodeNr; i++) {
		if (keyword)
			xmlFree(keyword);
		keyword = (char *) xmlNodeListGetString(doc, result->nodesetval->nodeTab[i]->xmlChildrenNode, 1);
	}

	/* An empty <title> yields NULL; it is treated as an empty string. */
#if defined(_WIN32) || defined(WIN32) || defined(WIN64) || defined(_WIN64)
	strcpy_s(newMessage, 1024, "\"");
	strcat_s(newMessage, 1024, keyword ? (const char *) keyword : "");
	strcat_s(newMessage, 1024, "\" <[URL]");
	strcat_s(newMessage, 1024, url);
	strcat_s(newMessage, 1024, "[/URL]>");
#else
	/* Bounded formatting: a long title can no longer overflow newMessage. */
	snprintf(newMessage, sizeof(newMessage), "\"%s\" <%s>",
	         keyword ? (const char *) keyword : "", message);
#endif
			
	if (keyword)
		xmlFree(keyword);
	xmlXPathFreeObject(result);
	xmlXPathFreeContext(context);
	xmlFreeDoc(doc);
	free(chunk.memory);

	printf("New message: %s\n", newMessage);

	printf("End\n");
	system("pause");

    return 1;
}
Пример #18
0
/*
 * Extracts the lyrics from a LyricWiki edit page.
 *
 * Looks up the element with id "wpTextbox1", takes its text content and
 * returns what is enclosed in <lyric>...</lyric> or <lyrics>...</lyrics>.
 * When the element exists but holds no such section, or only the
 * "PUT LYRICS HERE" placeholder, the translated "No lyrics available" text
 * is returned instead.
 *
 * Returns NULL when the page cannot be parsed or the element is missing.
 * free() returned text
 */
static char *scrape_lyrics_from_lyricwiki_edit_page(const char *buf, int64_t len)
{
	xmlDocPtr doc;
	gchar *ret = NULL;

	/*
	 * temporarily set our error-handling functor to our suppression function,
	 * but we have to set it back because other components of Audacious depend
	 * on libxml and we don't want to step on their code paths.
	 *
	 * unfortunately, libxml is anti-social and provides us with no way to get
	 * the previous error functor, so we just have to set it back to default after
	 * parsing and hope for the best.
	 */
	xmlSetGenericErrorFunc(NULL, libxml_error_handler);
	doc = htmlReadMemory(buf, (int) len, NULL, "utf-8", (HTML_PARSE_RECOVER | HTML_PARSE_NONET));
	xmlSetGenericErrorFunc(NULL, NULL);

	if (doc != NULL)
	{
		xmlXPathContextPtr xpath_ctx = NULL;
		xmlXPathObjectPtr xpath_obj = NULL;
		xmlNodePtr node = NULL;

		xpath_ctx = xmlXPathNewContext(doc);
		if (xpath_ctx == NULL)
			goto give_up;

		xpath_obj = xmlXPathEvalExpression((xmlChar *) "//*[@id=\"wpTextbox1\"]", xpath_ctx);
		if (xpath_obj == NULL)
			goto give_up;

		/* Test the number of nodes found (and a missing node set), not the
		 * allocated capacity of the node table. */
		if (xmlXPathNodeSetIsEmpty(xpath_obj->nodesetval))
			goto give_up;

		/* The node belongs to doc, so it stays valid after the XPath
		 * objects below are released. */
		node = xpath_obj->nodesetval->nodeTab[0];
give_up:
		if (xpath_obj != NULL)
			xmlXPathFreeObject(xpath_obj);

		if (xpath_ctx != NULL)
			xmlXPathFreeContext(xpath_ctx);

		if (node != NULL)
		{
			xmlChar *lyric = xmlNodeGetContent(node);

			if (lyric != NULL)
			{
				GMatchInfo *match_info = NULL;
				GRegex *reg;
				gchar *fetched = NULL;

				reg = g_regex_new("<(lyrics?)>[[:space:]]*(.*?)[[:space:]]*</\\1>", (G_REGEX_MULTILINE | G_REGEX_DOTALL), 0, NULL);
				if (reg != NULL)
				{
					/* Only fetch the group when the expression matched. */
					if (g_regex_match(reg, (gchar *) lyric, G_REGEX_MATCH_NEWLINE_ANY, &match_info))
						fetched = g_match_info_fetch(match_info, 2);

					g_match_info_free(match_info);
					g_regex_unref(reg);
				}

				if (fetched != NULL && g_utf8_collate(fetched, "<!-- PUT LYRICS HERE (and delete this entire line) -->"))
					ret = strdup(fetched);
				else
					ret = strdup(_("No lyrics available"));

				/* GLib memory is released with g_free(); the caller only
				 * ever receives strdup() memory, matching the free()
				 * contract above. */
				g_free(fetched);

				xmlFree(lyric);
			}
		}

		xmlFreeDoc(doc);
	}

	return ret;
}
Пример #19
0
/* Parse the buffer text between START and END as HTML (when HTMLP is true)
   or as XML, using BASE_URL as the document's base URL when it is non-nil,
   and convert the result with make_dom.

   When DISCARD_COMMENTS is nil every top-level node is converted; if that
   produces more than one node they are wrapped in a `top' node.  Otherwise,
   or when that pass leaves RESULT nil, only the root element is converted.
   Returns nil when the text cannot be parsed or has no root element.  */
static Lisp_Object
parse_region (Lisp_Object start, Lisp_Object end, Lisp_Object base_url,
              Lisp_Object discard_comments, bool htmlp)
{
    xmlDoc *doc;
    Lisp_Object result = Qnil;
    const char *burl = "";
    ptrdiff_t istart, iend, istart_byte, iend_byte;
    unsigned char *buftext;

    xmlCheckVersion (LIBXML_VERSION);

    validate_region (&start, &end);

    istart = XINT (start);
    iend = XINT (end);
    istart_byte = CHAR_TO_BYTE (istart);
    iend_byte = CHAR_TO_BYTE (iend);

    /* When the gap lies inside the region, move it to the region's end so
       the parser sees one contiguous run of bytes.  */
    if (istart < GPT && GPT < iend)
        move_gap_both (iend, iend_byte);

    if (! NILP (base_url))
    {
        CHECK_STRING (base_url);
        burl = SSDATA (base_url);
    }

    /* Taken after the gap move; checked again by the eassert below.  */
    buftext = BYTE_POS_ADDR (istart_byte);
#ifdef REL_ALLOC
    /* Prevent ralloc.c from relocating the current buffer while libxml2
       functions below read its text.  */
    r_alloc_inhibit_buffer_relocation (1);
#endif
    if (htmlp)
        doc = htmlReadMemory ((char *)buftext,
                              iend_byte - istart_byte, burl, "utf-8",
                              HTML_PARSE_RECOVER|HTML_PARSE_NONET|
                              HTML_PARSE_NOWARNING|HTML_PARSE_NOERROR|
                              HTML_PARSE_NOBLANKS);
    else
        doc = xmlReadMemory ((char *)buftext,
                             iend_byte - istart_byte, burl, "utf-8",
                             XML_PARSE_NONET|XML_PARSE_NOWARNING|
                             XML_PARSE_NOBLANKS |XML_PARSE_NOERROR);

#ifdef REL_ALLOC
    r_alloc_inhibit_buffer_relocation (0);
#endif
    /* If the assertion below fails, malloc was called inside the above
       libxml2 functions, and ralloc.c caused relocation of buffer text,
       so we could have read from unrelated memory.  */
    eassert (buftext == BYTE_POS_ADDR (istart_byte));

    if (doc != NULL)
    {
        Lisp_Object r = Qnil;
        if (NILP(discard_comments))
        {
            /* If the document has toplevel comments, then this should
               get us the nodes and the comments. */
            xmlNode *n = doc->children;

            /* R holds the most recently converted node; earlier ones are
               pushed onto RESULT, so RESULT stays nil when there is at most
               one top-level node.  */
            while (n) {
                if (!NILP (r))
                    result = Fcons (r, result);
                r = make_dom (n);
                n = n->next;
            }
        }

        if (NILP (result)) {
            /* The document doesn't have toplevel comments or we discarded
               them.  Get the tree the proper way. */
            xmlNode *node = xmlDocGetRootElement (doc);
            if (node != NULL)
                result = make_dom (node);
        } else
            /* Several top-level nodes: restore document order and wrap them
               in a `top' node with a nil attribute list.  */
            result = Fcons (Qtop, Fcons (Qnil, Fnreverse (Fcons (r, result))));

        xmlFreeDoc (doc);
    }

    return result;
}