/**
 * Normalises an HTML string by parsing it with libxml2's HTML parser and
 * serialising the resulting document again.
 *
 * @param html  HTML text to tidy up.
 * @return the re-serialised document, or an empty string when the input
 *         could not be parsed.
 */
const wxString ExtractBoardList::HtmlFormat(const wxString& html) {
    wxString val;

    // The parser is told the input is UTF-8, so hand it real UTF-8 bytes.
    // mb_str() yields the current locale encoding, and Len() counts
    // characters, not bytes.
    const wxCharBuffer cb = html.utf8_str();
    const char* bytes = cb.data();
    if (bytes == NULL) {
        return val;
    }

    htmlDocPtr docPtr = htmlReadMemory(bytes, static_cast<int>(::strlen(bytes)), "", "utf-8", HTML_PARSE_RECOVER);
    if (docPtr) {
        // The output buffer forwards serialised bytes to writeToWxString,
        // which receives &val as its context.
        xmlOutputBufferPtr buf = xmlOutputBufferCreateIO((xmlOutputWriteCallback)writeToWxString,
                                                         (xmlOutputCloseCallback)closeWxString,
                                                         &val, 0);
        if (buf) {
            htmlDocContentDumpOutput(buf, docPtr, "utf-8");
            // Closing flushes pending output and releases the buffer.
            xmlOutputBufferClose(buf);
        }
        xmlFreeDoc(docPtr);
    }

    // Every path returns a value; the original fell off the end of the
    // function when parsing failed.
    return val;
}
char *wiki_head(char *url) { curl_data_t *curl; xmlDoc *doc = NULL; xmlXPathContext *ctx = NULL; xmlXPathObject *xpathObj = NULL; char *text = NULL; curl = curl_data_new(); if(curl_download_text(url, curl)) return NULL; doc = (xmlDoc *) htmlReadMemory(curl->data, strlen(curl->data), "/", "utf-8", HTML_PARSE_NOERROR); /* creating xpath request */ ctx = xmlXPathNewContext(doc); xpathObj = xmlXPathEvalExpression((const xmlChar *) "//div[@id='mw-content-text']/p", ctx); if(!xmlXPathNodeSetIsEmpty(xpathObj->nodesetval)) { if(xmlNodeGetContent(xpathObj->nodesetval->nodeTab[0])) text = strdup((char *) xmlNodeGetContent(xpathObj->nodesetval->nodeTab[0])); } xmlXPathFreeObject(xpathObj); xmlXPathFreeContext(ctx); xmlFreeDoc(doc); curl_data_free(curl); return text; }
htmlDocPtr htmlParse(void *buffer, int buffer_len, void *url, void *encoding, int options, void *error_buffer, int error_buffer_len) { const char *c_buffer = (char*)buffer; const char *c_url = (char*)url; const char *c_encoding = (char*)encoding; xmlDoc *doc = NULL; xmlResetLastError(); doc = htmlReadMemory(c_buffer, buffer_len, c_url, c_encoding, options); if(doc == NULL) { xmlErrorPtr error; xmlFreeDoc(doc); error = xmlGetLastError(); if(error != NULL && error_buffer != NULL && error->level >= XML_ERR_ERROR) { char *c_error_buffer = (char*)error_buffer; if (error->message != NULL) { strncpy(c_error_buffer, error->message, error_buffer_len-1); c_error_buffer[error_buffer_len-1] = '\0'; } else { snprintf(c_error_buffer, error_buffer_len, "xml parsing error:%d", error->code); } } } return doc; }
shared_ptr<xml::Document> ManifestItem::ReferencedDocument() const { // TODO: handle remote URLs string path(BaseHref()); auto package = this->Owner(); if ( !package ) return nullptr; shared_ptr<xml::Document> result(nullptr); #if EPUB_USE(LIBXML2) // Sometimes, we want to filter the manifest items through // the content filter chain when unpacking the Package. For // example, with some DRM algorithms, the navigation tables are // encrypted and should be filtered before being parsed. ePub3::ManifestItemPtr manifestRef = std::const_pointer_cast<ManifestItem>(Ptr()); if (!manifestRef) return nullptr; shared_ptr<ByteStream> byteStream = package->GetFilterChainByteStream(manifestRef); if (!byteStream) return nullptr; void *docBuf = nullptr; std::size_t resbuflen = byteStream->ReadAllBytes(&docBuf); // In some EPUBs, UTF-8 XML/HTML files have a superfluous (erroneous?) BOM, so we either: // pass "utf-8" and expect InputBuffer::read_cb (in io.cpp) to skip the 3 erroneous bytes // (otherwise the XML parser fails), // or we pass NULL (in which case the parser auto-detects encoding) const char * encoding = nullptr; //const char * encoding = "utf-8"; xmlDocPtr raw; if ( _mediaType == "text/html" ) { raw = htmlReadMemory((const char*)docBuf, resbuflen, path.c_str(), encoding, ArchiveXmlReader::DEFAULT_OPTIONS); } else { raw = xmlReadMemory((const char*)docBuf, resbuflen, path.c_str(), encoding, ArchiveXmlReader::DEFAULT_OPTIONS); } result = xml::Wrapped<xml::Document>(raw); if (docBuf) free(docBuf); #elif EPUB_USE(WIN_XML) // TODO: filtering referenced document through Content Filters // is not yet supported on Windows unique_ptr<ArchiveXmlReader> reader = package->XmlReaderForRelativePath(path); if ( !reader ) return nullptr; result = reader->ReadDocument(path.c_str(), "utf-8", 0); #endif return result; }
/*
 * Parses `len` bytes of UTF-8 HTML into a libxml2 document using a
 * recovering, quiet parser configuration. Returns whatever
 * htmlReadMemory() returns.
 */
static xmlDocPtr
xhtml_parse (const gchar *html,
             gint         len)
{
	/* HTML_PARSE_NONET: the HTML parser does not implement it and reports
	 * an error because it does not know how to handle it, but it might
	 * learn in the future - so it is requested anyway. */
	const int parse_flags = HTML_PARSE_RECOVER |
	                        HTML_PARSE_NONET |
	                        HTML_PARSE_NOERROR |
	                        HTML_PARSE_NOWARNING;

	g_assert (html != NULL);
	g_assert (len >= 0);

	return htmlReadMemory (html, len, NULL, "utf-8", parse_flags);
}
xmlNode* htmlParseFragmentAsDoc(void *doc, void *buffer, int buffer_len, void *url, void *encoding, int options, void *error_buffer, int error_buffer_len) { xmlDoc* tmpDoc = NULL; xmlNode* tmpRoot = NULL; tmpDoc = htmlReadMemory((char*)buffer, buffer_len, (char*)url, (char*)encoding, options); if (tmpDoc == NULL) { return NULL; } tmpRoot = xmlDocGetRootElement(tmpDoc); if (tmpRoot == NULL) { return NULL; } tmpRoot = xmlDocCopyNode(tmpRoot, doc, 1); xmlFreeDoc(tmpDoc); return tmpRoot; }
google_search_t * google_search(char *keywords) { curl_data_t *curl; google_search_t *search; xmlDoc *doc = NULL; xmlXPathContext *ctx = NULL; xmlXPathObject *xpathObj = NULL; xmlNode *node = NULL; char url[2048]; int i; curl = curl_data_new(); snprintf(url, sizeof(url), "%s%s", baseurlen, space_encode(keywords)); if(curl_download_text(url, curl)) return NULL; doc = (xmlDoc *) htmlReadMemory(curl->data, strlen(curl->data), "/", "utf-8", HTML_PARSE_NOERROR); /* creating xpath request */ ctx = xmlXPathNewContext(doc); xpathObj = xmlXPathEvalExpression((const xmlChar *) "//li/div/h3/a", ctx); search = (google_search_t *) calloc(1, sizeof(google_search_t)); if(!xmlXPathNodeSetIsEmpty(xpathObj->nodesetval)) { search->length = xpathObj->nodesetval->nodeNr; search->result = (google_result_t *) calloc(1, sizeof(google_result_t) * search->length); for(i = 0; i < xpathObj->nodesetval->nodeNr; i++) { node = xpathObj->nodesetval->nodeTab[i]; if(xmlNodeGetContent(node)) search->result[i].title = strdup((char *) xmlNodeGetContent(node)); if(xmlGetProp(node, (unsigned char *) "href")) search->result[i].url = strdup((char *) xmlGetProp(node, (unsigned char *) "href")); } } xmlXPathFreeObject(xpathObj); xmlXPathFreeContext(ctx); xmlFreeDoc(doc); curl_data_free(curl); return search; }
/** * execute_xpath_expression: * @filename: the input XML filename. * @xpathExpr: the xpath expression for evaluation. * @nsList: the optional list of known namespaces in * "<prefix1>=<href1> <prefix2>=href2> ..." format. * * Parses input XML file, evaluates XPath expression and prints results. * * Returns 0 on success and a negative value otherwise. */ int execute_xpath_expression(const char* filename, const xmlChar* xpathExpr, char **get, int num) { xmlDocPtr doc; xmlXPathContextPtr xpathCtx; xmlXPathObjectPtr xpathObj; assert(filename); assert(xpathExpr); /* Load XML document */ //doc = xmlParseFile(filename); doc = htmlReadMemory(filename, strlen(filename), NULL, NULL, HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING); //doc = htmlDocPtr(filename, NULL); if (doc == NULL) { fprintf(stderr, "Error: unable to parse this string\n"); return(-1); } /* Create xpath evaluation context */ xpathCtx = xmlXPathNewContext(doc); if(xpathCtx == NULL) { fprintf(stderr,"Error: unable to create new XPath context\n"); xmlFreeDoc(doc); return -1; } /* Evaluate xpath expression */ xpathObj = xmlXPathEvalExpression(xpathExpr, xpathCtx); if(xpathObj == NULL) { fprintf(stderr,"Error: unable to evaluate xpath expression \"%s\"\n", xpathExpr); xmlXPathFreeContext(xpathCtx); xmlFreeDoc(doc); return -1; } /* Print results */ int size = print_xpath_nodes(xpathObj->nodesetval, get, num); /* Cleanup */ xmlXPathFreeObject(xpathObj); xmlXPathFreeContext(xpathCtx); xmlFreeDoc(doc); return size; }
/*
 * HREF finder implemented in libxml2 but could be any HTML parser.
 *
 * Parses the page held in `mem`, picks random //a/@href values and queues
 * a new transfer on `multi_handle` for each http:// or https:// link that
 * is at least 20 characters long. When follow_relative_links is set, each
 * href is first resolved against `url`.
 *
 * Returns the number of links queued (0 when the page cannot be parsed or
 * contains no links).
 */
size_t follow_links(CURLM *multi_handle, memory *mem, char *url)
{
  int opts = HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | \
             HTML_PARSE_NOWARNING | HTML_PARSE_NONET;
  size_t count = 0;
  htmlDocPtr doc = htmlReadMemory(mem->buf, mem->size, url, NULL, opts);
  if(!doc)
    return 0;

  xmlChar *xpath = (xmlChar*) "//a/@href";
  xmlXPathContextPtr context = xmlXPathNewContext(doc);
  xmlXPathObjectPtr result = xmlXPathEvalExpression(xpath, context);
  xmlXPathFreeContext(context);

  if(result) {
    xmlNodeSetPtr nodeset = result->nodesetval;
    if(!xmlXPathNodeSetIsEmpty(nodeset)) {
      int i;
      for(i = 0; i < nodeset->nodeNr; i++) {
        int stop = 0;
        double r = rand();
        int x = r * nodeset->nodeNr / RAND_MAX;
        /* rand() may return RAND_MAX, which would index one past the end */
        if(x >= nodeset->nodeNr)
          x = nodeset->nodeNr - 1;

        const xmlNode *node = nodeset->nodeTab[x]->xmlChildrenNode;
        xmlChar *href = xmlNodeListGetString(doc, node, 1);
        if(follow_relative_links) {
          xmlChar *orig = href;
          href = xmlBuildURI(href, (xmlChar *) url);
          xmlFree(orig);
        }

        char *link = (char *) href;
        if(link && strlen(link) >= 20 &&
           (!strncmp(link, "http://", 7) || !strncmp(link, "https://", 8))) {
          curl_multi_add_handle(multi_handle, make_handle(link));
          if(count++ == max_link_per_page)
            stop = 1;
        }

        /* Release the string on every path. The original already freed it
         * after make_handle() on the normal path, so make_handle() is
         * presumed not to keep the pointer; it leaked on skip and break. */
        if(link)
          xmlFree(link);
        if(stop)
          break;
      }
    }
    xmlXPathFreeObject(result);
  }

  /* The parsed document was never released in the original. */
  xmlFreeDoc(doc);
  return count;
}
/**
 * Extracts the responses selected by DDNodeHasTarget() from a thread's HTML
 * and returns them wrapped in the popup header and footer.
 *
 * @param rawHtml       HTML of the thread.
 * @param extractIndex  value passed to DDNodeHasTarget() to select <dd>
 *                      elements.
 * @return HTML_HEADER_POPUP + each matching <dd> together with its
 *         preceding sibling + HTML_FOOTER. When parsing fails only header
 *         and footer are returned.
 */
wxString XrossBoardUtil::FindResponseByIndex(const wxString& rawHtml, const wxString& extractIndex) {

     // wxString::mb_str would return CP932 on Windows, so convert to a
     // UTF-8 wxCharBuffer first.
     const wxCharBuffer &cb = rawHtml.utf8_str();

     const htmlDocPtr docPtr = htmlReadMemory(cb.data(), ::strlen(cb.data()), "", "utf-8",
					      HTML_PARSE_RECOVER|HTML_PARSE_NOERROR|HTML_PARSE_NOWARNING);

     // Build the result as an HTML document
     wxString lumpOfHTML = HTML_HEADER_POPUP;

     if (docPtr) {
	  const htmlNodePtr root = xmlDocGetRootElement(docPtr);
	  // The second child of the root is taken as <body> (presumably the
	  // first is <head>); guard against documents without that shape.
	  const htmlNodePtr body = (root != NULL && root->children != NULL) ? root->children->next : NULL;

	  for (htmlNodePtr node = (body != NULL ? body->children : NULL); node != NULL; node = node->next) {
	       if (node->type != XML_ELEMENT_NODE ||
		   xmlStrcasecmp(node->name, (const xmlChar*) "dd") != 0) {
		    continue;
	       }
	       if (!DDNodeHasTarget(node->children, extractIndex)) {
		    continue;
	       }

	       xmlBufferPtr buffer = xmlBufferCreate();
	       if (buffer == NULL) {
		    continue;
	       }

	       // Dump the preceding sibling followed by the <dd> itself.
	       if (node->prev != NULL) {
		    xmlNodeDump(buffer, docPtr, node->prev, 0, 1);
	       }
	       xmlNodeDump(buffer, docPtr, node, 0, 1);

	       const xmlChar* dumped = xmlBufferContent(buffer);
	       if (dumped != NULL) {
		    lumpOfHTML += wxString::FromUTF8(reinterpret_cast<const char*>(dumped));
	       }
	       // The buffer was leaked for every match in the original.
	       xmlBufferFree(buffer);
	  }

	  xmlFreeDoc(docPtr);
	  // NOTE(review): libxml2 documents xmlCleanupParser() as a
	  // process-exit cleanup; calling it per parse is kept to preserve
	  // existing behaviour but should be reconsidered.
	  xmlCleanupParser();
     }

     // Append the HTML footer
     lumpOfHTML += HTML_FOOTER;

     return lumpOfHTML;
}
/**
 * Tidies an HTML string: the text is parsed with libxml2's recovering HTML
 * parser and the resulting document is serialised back as UTF-8.
 *
 * @param html  HTML text to normalise.
 * @return the serialised document; empty when the parser produced no
 *         document.
 */
const wxString ExtractBoardList::HtmlFormat(const wxString& html)
{
     wxString formatted;

     // Feed the parser UTF-8 bytes
     const wxCharBuffer& utf8 = html.utf8_str();
     htmlDocPtr parsed = htmlReadMemory(utf8.data(), ::strlen(utf8.data()), "", "utf-8", HTML_PARSE_RECOVER);

     if (parsed != NULL) {
	  // Serialised bytes are routed to writeToWxString with &formatted
	  // as its context; closeWxString runs when the buffer is closed.
	  xmlOutputBufferPtr sink = xmlOutputBufferCreateIO(
	       (xmlOutputWriteCallback)writeToWxString,
	       (xmlOutputCloseCallback)closeWxString,
	       &formatted,
	       0);

	  htmlDocContentDumpOutput(sink, parsed, "utf-8");
	  xmlOutputBufferClose(sink);
	  xmlFreeDoc(parsed);
     }

     xmlCleanupParser();
     return formatted;
}
/*
 * HTTP callback that parses a buddy's profile page.
 *
 * - When the buddy has no ref_id yet, reads it from the hidden "ref_id"
 *   input and stores it on the real buddy if there is one, otherwise on
 *   the buddy itself.
 * - When request->advertise is set, extracts the short description and
 *   the "research" paragraphs and shows them via purple_notify_userinfo().
 * - On success, finally invokes request->callback if one is set.
 *
 * On any parsing or XPath failure the function logs an error, releases
 * everything it allocated and returns without invoking the chained
 * callback.
 */
static void ga_gabuddy_parse_info_cb(HttpHandler* handler, gchar* response, gsize len, gpointer userdata)
{
	htmlDocPtr doc;
	xmlXPathContextPtr xpathCtx;
	xmlXPathObjectPtr xpathObj;
	xmlNode *info_node;
	PurpleNotifyUserInfo *user_info = NULL;
	GayAttitudeAccount *gaa = handler->data;
	GayAttitudeBuddyInfoRequest *request = userdata;

	purple_debug(PURPLE_DEBUG_INFO, "gayattitude", "ga_buddy: Fetching info for '%s'.\n", request->gabuddy->buddy->name);

	doc = htmlReadMemory(response, (int) len, "gayattitude.xml", NULL, 0);
	if (doc == NULL)
	{
		purple_debug(PURPLE_DEBUG_ERROR, "gayattitude", "ga_buddy: Unable to parse response (XML Parsing).\n");
		return;
	}

	/* Create xpath evaluation context */
	xpathCtx = xmlXPathNewContext(doc);
	if (xpathCtx == NULL)
	{
		purple_debug(PURPLE_DEBUG_ERROR, "gayattitude", "ga_buddy: Unable to parse response (XPath context init).\n");
		xmlFreeDoc(doc);
		return;
	}

	/* Search internal Ref ID */
	if (!request->gabuddy->ref_id)
	{
		purple_debug(PURPLE_DEBUG_INFO, "gayattitude", "ga_buddy: Fetching missing ref_id for '%s'.\n", request->gabuddy->buddy->name);

		xpathObj = xmlXPathEvalExpression((xmlChar*) "//input[@type='hidden' and @name='ref_id']", xpathCtx);
		if (xpathObj == NULL)
			goto xpath_failed;

		if (!xmlXPathNodeSetIsEmpty(xpathObj->nodesetval))
		{
			gchar *ref_id;

			info_node = xpathObj->nodesetval->nodeTab[0];
			ref_id = (gchar*) xmlGetProp(info_node, (xmlChar*) "value");
			if (request->gabuddy->real_gabuddy)
				request->gabuddy->real_gabuddy->ref_id = ref_id;
			else
				request->gabuddy->ref_id = ref_id;
			/* Log the value just read: request->gabuddy->ref_id is still
			 * NULL when the id was stored on real_gabuddy. */
			purple_debug(PURPLE_DEBUG_INFO, "gayattitude", "ga_buddy: Found ref_id for '%s': %s.\n", request->gabuddy->buddy->name, ref_id ? ref_id : "(null)");
		}
		xmlXPathFreeObject(xpathObj);
	}

	if (request->advertise)
	{
		int i;
		GString *str = NULL;

		user_info = purple_notify_user_info_new();

		/* Search short description */
		xpathCtx->node = doc->parent;
		xpathObj = xmlXPathEvalExpression((xmlChar*) "//div[@id='PORTRAITHEADER2']/p/text()", xpathCtx);
		if (xpathObj == NULL)
			goto xpath_failed;

		if (!xmlXPathNodeSetIsEmpty(xpathObj->nodesetval))
		{
			info_node = xpathObj->nodesetval->nodeTab[0];
			purple_notify_user_info_add_pair(user_info, "Short Description", (gchar*) info_node->content);
		}
		xmlXPathFreeObject(xpathObj);

		/* Search user research */
		xpathCtx->node = doc->parent;
		xpathObj = xmlXPathEvalExpression((xmlChar*) "//div[@id='bloc_recherche']/p/text()", xpathCtx);
		if (xpathObj == NULL)
			goto xpath_failed;

		if (!xmlXPathNodeSetIsEmpty(xpathObj->nodesetval))
		{
			/* Join all text nodes with " -- " */
			for (i = 0; i < xpathObj->nodesetval->nodeNr; i++)
			{
				info_node = xpathObj->nodesetval->nodeTab[i];
				if (i == 0)
					str = g_string_new((gchar*) info_node->content);
				else
					g_string_append_printf(str, " -- %s", info_node->content);
			}
			purple_notify_user_info_add_pair(user_info, "Research", str->str);
			g_string_free(str, TRUE);
		}
		xmlXPathFreeObject(xpathObj);

		purple_notify_userinfo(gaa->pc, request->gabuddy->buddy->name, user_info, NULL, NULL);
		purple_notify_user_info_destroy(user_info);
		user_info = NULL;
	}

	/* Cleanup */
	xmlXPathFreeContext(xpathCtx);
	xmlFreeDoc(doc);

	/* Chained Callback */
	if (request->callback)
	{
		purple_debug(PURPLE_DEBUG_INFO, "gayattitude", "ga_buddy: Calling callback after info for '%s' was retrieved\n", request->gabuddy->buddy->name);
		request->callback(gaa, request->callback_data);
	}
	return;

xpath_failed:
	/* Shared failure path for every XPath evaluation. The original leaked
	 * user_info when a failure happened inside the advertise section. */
	purple_debug(PURPLE_DEBUG_ERROR, "gayattitude", "ga_buddy: Unable to parse response (XPath evaluation).\n");
	if (user_info != NULL)
		purple_notify_user_info_destroy(user_info);
	xmlXPathFreeContext(xpathCtx);
	xmlFreeDoc(doc);
}
#include "DobHtmlParser.h"

/**
 * Creates an empty parser with no document and no XPath expression.
 */
DobHtmlParser::DobHtmlParser ()
{
	doc = NULL;
	// The destructor tests and frees xpath, so it must never be left
	// uninitialised.
	xpath = NULL;
}

/**
 * Creates a parser for the given HTML text and XPath expression.
 *
 * @param document    HTML source, handed to setDoc().
 * @param data_xpath  XPath expression, handed to setXpath().
 */
DobHtmlParser::DobHtmlParser (QString *document, QString *data_xpath )/*{{{*/
{
	doc = NULL;
	xpath = NULL;
	setDoc ( document );
	setXpath ( data_xpath );
}/*}}}*/

/**
 * Releases the parsed document and the XPath expression, then calls
 * xmlCleanupParser().
 */
DobHtmlParser::~DobHtmlParser ()/*{{{*/
{
	if ( doc )
		xmlFreeDoc( doc );
	if ( xpath )
		xmlFree(xpath);
	xmlCleanupParser ();
}/*}}}*/

/**
 * Replaces the current document with one parsed from `document`.
 *
 * The text is converted to Latin-1 before being handed to
 * htmlReadMemory(). A NULL `document` simply clears the current document.
 */
void DobHtmlParser::setDoc ( QString *document )/*{{{*/
{
	if ( doc )
		xmlFreeDoc ( doc );
	doc = NULL;

	if ( !document )
		return;

	// Keep the converted bytes alive in a named object and use its own
	// size as the byte count.
	const QByteArray bytes = document->toLatin1();
	doc = htmlReadMemory ( bytes.constData(), bytes.size(), "noname.html", NULL, 0 );
}/*}}}*/
/**
 * Colours response-number links according to how many ">>N" anchors point
 * at each response.
 *
 * The HTML is parsed to count, per response number, the anchors found in
 * <a> elements that carry an attribute with value "_blank" (looked up in
 * the <table> that is the first child of each <dd>). Every match of
 * regexIndex in the original HTML is then rewritten as
 * <a href="#N"><font color="...">N</font></a>, where the colour is
 * #0000ff for 0 references, #ff00ff for 1-4 and #ff0000 for 5 or more.
 *
 * @param html  HTML of the thread.
 * @return the recoloured HTML, or `html` unchanged when it cannot be
 *         parsed or regexIndex does not match.
 */
wxString XrossBoardUtil::AddColorAnchoredID(const wxString& html) {

     // The parser is told the input is UTF-8, so hand it UTF-8 bytes;
     // mb_str() would return the locale encoding (CP932 on Windows).
     const std::string temporary = std::string(html.utf8_str());
     const htmlDocPtr docPtr = htmlReadMemory(temporary.c_str(), temporary.size(), "", "utf-8",
					      HTML_PARSE_RECOVER|HTML_PARSE_NOERROR|HTML_PARSE_NOWARNING);

     // On parse failure return the input as is
     if (!docPtr) {
	  return html;
     }

     WX_DECLARE_STRING_HASH_MAP( int, ExtractIdHash );
     ExtractIdHash hashmap;

     const htmlNodePtr root = xmlDocGetRootElement(docPtr);
     // The second child of the root is taken as <body> (presumably the
     // first is <head>); guard against documents without that shape.
     const htmlNodePtr body = (root != NULL && root->children != NULL) ? root->children->next : NULL;

     for (htmlNodePtr node = (body != NULL ? body->children : NULL); node != NULL; node = node->next) {
	  if (node->type != XML_ELEMENT_NODE ||
	      xmlStrcasecmp(node->name, (const xmlChar*) "dd") != 0) {
	       continue;
	  }

	  const htmlNodePtr dd = node->children;
	  if (dd == NULL || xmlStrcasecmp(dd->name, (const xmlChar*) "table") != 0) {
	       continue;
	  }

	  for (htmlNodePtr ptr = dd->children; ptr != NULL; ptr = ptr->next) {
	       if (ptr->type != XML_ELEMENT_NODE ||
		   xmlStrcasecmp(ptr->name, (const xmlChar*) "a") != 0) {
		    continue;
	       }

	       for (xmlAttr* attribute = ptr->properties;
		    attribute && attribute->name && attribute->children;
		    attribute = attribute->next) {

		    xmlChar* value = xmlNodeListGetString(ptr->doc, attribute->children, 1);

		    if (value != NULL &&
			xmlStrcasecmp(value, (const xmlChar*) "_blank") == 0 &&
			ptr->children != NULL && ptr->children->content != NULL) {
			 // The anchor text (ptr->children->content) is the
			 // literal ">>12", not an entity reference.
			 const wxString anchor = wxString::FromUTF8(reinterpret_cast<const char*>(ptr->children->content));
			 const wxString number = anchor.SubString(2, anchor.Len() - 1);
			 if (hashmap.find( number ) == hashmap.end()) {
			      // First time this number is seen
			      hashmap[number] = 1;
			 } else {
			      // One more reference. Written this way because
			      // hashmap[number]++ reportedly misbehaved when
			      // optimised by clang.
			      hashmap[number] = hashmap[number] + 1;
			 }
		    }
		    xmlFree(value);
	       }
	  }
     }

     xmlFreeDoc(docPtr);
     // NOTE(review): libxml2 documents xmlCleanupParser() as a process-exit
     // cleanup; calling it per parse is kept to preserve existing behaviour
     // but should be reconsidered.
     xmlCleanupParser();

     // Counting done; now recolour the index links.
     // sample -> <a href="#2">2</a>
     // NOTE(review): regexURL.IsValid() looks like it was meant to be
     // regexIndex.IsValid() - confirm before changing.
     if (!(regexURL.IsValid() && regexIndex.Matches(html))) {
	  return html;
     }

     wxString tmp, result;
     size_t start = 0, len = 0;

     for (tmp = html; regexIndex.Matches(tmp); tmp = tmp.SubString(start + len, tmp.Len())) {
	  const wxString index = regexIndex.GetMatch(tmp, 1);

	  // operator[] yields 0 for numbers nobody referred to
	  const int hits = hashmap[index];
	  wxString color = wxEmptyString;
	  if (hits == 0) {
	       color = wxT("#0000ff");
	  } else if (hits >= 1 && hits <= 4) {
	       color = wxT("#ff00ff");
	  } else {
	       color = wxT("#ff0000");
	  }

	  // Stop rather than spin forever if the match position is
	  // unavailable or the match would not advance through the text.
	  if (!regexIndex.GetMatch(&start, &len, 0) || start + len == 0) {
	       break;
	  }

	  if (start > 0) {
	       result += tmp.SubString(0, start - 1);
	  }
	  result += wxT("<a href=\"#");
	  result += index;
	  result += wxT("\"><font color=\"");
	  result += color;
	  result += wxT("\">");
	  result += index;
	  result += wxT("</font></a>");
     }
     result += tmp;

     return result;
}
/** Parse the 'detailed' Anime fields from HTML * * */ std::shared_ptr<Anime> AnimeSerializer::deserialize_details(const std::string& xml) const { typedef std::unique_ptr<xmlChar, XmlCharDeleter> xmlStringUPtr; auto res = std::make_shared<Anime>(); std::unique_ptr<char[]> cstr(new char[xml.size()]); std::memcpy(cstr.get(), xml.c_str(), xml.size()); std::unique_ptr<xmlDoc, XmlDocDeleter> doc(htmlReadMemory(cstr.get(), xml.size(), "http://myanimelist.net/", nullptr, HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NONET)); std::unique_ptr<xmlTextReader, xmlTextReaderDeleter> reader(xmlReaderWalker(doc.get())); if (!reader) { std::cerr << "Error: Couldn't create XML reader" << std::endl; std::cerr << "XML follows: " << xml << std::endl; return nullptr; } enum { PRIORITY, STORAGE, REWATCHVALUE, DISCUSS, SELECTOR_NONE } selector = SELECTOR_NONE; enum { TAGS, COMMENTS, NONE } textarea = NONE; std::string textbuf; int ret = 1; for( ret = xmlTextReaderRead(reader.get()); ret == 1; ret = xmlTextReaderRead(reader.get()) ) { const std::string name = xmlchar_to_str(xmlTextReaderConstName (reader.get())); if (name == "input") { xmlStringUPtr type(xmlTextReaderGetAttribute(reader.get(), "type"_xml)); xmlStringUPtr attr_name(xmlTextReaderGetAttribute(reader.get(), "name"_xml)); xmlStringUPtr attr_value(xmlTextReaderGetAttribute(reader.get(), "value"_xml)); if (type) { if (xmlStrEqual(type.get(), "text"_xml) || xmlStrEqual(type.get(), "checkbox"_xml)) { if (xmlStrEqual(attr_name.get(), "fansub_group"_xml)) res->set_fansub_group(xmlchar_to_str(attr_value.get())); else if (xmlStrEqual(attr_name.get(), "list_downloaded_eps"_xml)) res->set_downloaded_items(xmlchar_to_str(attr_value.get())); else if (xmlStrEqual(attr_name.get(), "list_times_watched"_xml)) res->set_times_consumed(xmlchar_to_str(attr_value.get())); else if (xmlStrEqual(attr_name.get(), "storageVal"_xml)) res->set_storage_value(xmlchar_to_str(attr_value.get())); } } } else if (name == "textarea" 
&& xmlTextReaderNodeType(reader.get()) == XML_READER_TYPE_ELEMENT) { xmlStringUPtr attr_name(xmlTextReaderGetAttribute(reader.get(), "name"_xml)); if (xmlStrEqual(attr_name.get(), "tags"_xml)) textarea = TAGS; else if (xmlStrEqual(attr_name.get(), "list_comments"_xml)) textarea = COMMENTS; else textarea = NONE; textbuf.clear(); } else if (name == "textarea" && xmlTextReaderNodeType(reader.get()) == XML_READER_TYPE_END_ELEMENT) { if (textarea != NONE) { switch (textarea) { case TAGS: /* Not a 'detailed' field */ break; case COMMENTS: res->set_comments(std::string(textbuf)); break; case NONE: default: break; } textarea = NONE; } } else if (name == "#text" && textarea != NONE) { textbuf.append(xmlchar_to_str(xmlTextReaderConstValue(reader.get()))); } else if (name == "select" && xmlTextReaderNodeType(reader.get()) == XML_READER_TYPE_ELEMENT) { xmlStringUPtr attr_name(xmlTextReaderGetAttribute(reader.get(), "name"_xml)); if (xmlStrEqual(attr_name.get(), "priority"_xml)) selector = PRIORITY; if (xmlStrEqual(attr_name.get(), "storage"_xml)) selector = STORAGE; if (xmlStrEqual(attr_name.get(), "list_rewatch_value"_xml)) selector = REWATCHVALUE; if (xmlStrEqual(attr_name.get(), "discuss"_xml)) selector = DISCUSS; } else if (name == "select" && xmlTextReaderNodeType(reader.get()) == XML_READER_TYPE_END_ELEMENT) { selector = SELECTOR_NONE; } else if (name == "option" && xmlTextReaderNodeType(reader.get()) == XML_READER_TYPE_ELEMENT) { xmlStringUPtr value(xmlTextReaderGetAttribute(reader.get(), "value"_xml)); if (xmlTextReaderMoveToAttribute(reader.get(), "selected"_xml) == 1) { switch (selector) { case PRIORITY: res->set_priority(xmlchar_to_str(value.get())); break; case STORAGE: res->set_storage_value(xmlchar_to_str(value.get())); break; case REWATCHVALUE: res->set_reconsume_value(xmlchar_to_str(value.get())); break; case DISCUSS: res->set_enable_discussion(xmlchar_to_str(value.get())); break; case SELECTOR_NONE: default: break; } } } } if (ret != 0) return nullptr; // Some 
sort of parsing error return res; }
/* Parse the text between START and END of the current buffer as HTML (if
   HTMLP is nonzero) or XML and return it as a Lisp DOM built by make_dom.
   BASE_URL, when non-nil, must be a string and is passed to libxml2 as the
   document URL.

   The first top-level node of the parsed document is skipped.  If exactly
   one node remains its DOM is returned directly; if several remain they
   are wrapped in a synthetic `top' node.  Return nil when parsing fails or
   nothing remains.  */
static Lisp_Object
parse_region (Lisp_Object start, Lisp_Object end, Lisp_Object base_url, int htmlp)
{
  xmlDoc *doc;
  Lisp_Object result = Qnil;
  const char *burl = "";
  EMACS_INT bytes;
  EMACS_INT istart, iend;

  LIBXML_TEST_VERSION;

  validate_region (&start, &end);

  istart = XINT (start);
  iend = XINT (end);

  /* libxml2 needs the region as one contiguous block, so move the gap out
     of it when the gap lies strictly inside the region.  */
  if (istart < GPT && GPT < iend)
    move_gap (iend);

  if (! NILP (base_url))
    {
      CHECK_STRING (base_url);
      burl = SSDATA (base_url);
    }

  bytes = CHAR_TO_BYTE (iend) - CHAR_TO_BYTE (istart);

  if (htmlp)
    doc = htmlReadMemory ((char *) BYTE_POS_ADDR (CHAR_TO_BYTE (istart)),
			  bytes, burl, "utf-8",
			  HTML_PARSE_RECOVER|HTML_PARSE_NONET|
			  HTML_PARSE_NOWARNING|HTML_PARSE_NOERROR|
			  HTML_PARSE_NOBLANKS);
  else
    doc = xmlReadMemory ((char *) BYTE_POS_ADDR (CHAR_TO_BYTE (istart)),
			 bytes, burl, "utf-8",
			 XML_PARSE_NONET|XML_PARSE_NOWARNING|
			 XML_PARSE_NOBLANKS |XML_PARSE_NOERROR);

  if (doc != NULL)
    {
      /* A document may have no children at all; do not dereference a
	 NULL first child.  */
      xmlNode *n = doc->children != NULL ? doc->children->next : NULL;
      Lisp_Object r = Qnil;

      while (n)
	{
	  if (!NILP (r))
	    result = Fcons (r, result);
	  r = make_dom (n);
	  n = n->next;
	}

      if (NILP (result))
	result = r;
      else
	result = Fcons (intern ("top"),
			Fcons (Qnil, Fnreverse (Fcons (r, result))));

      xmlFreeDoc (doc);
      /* xmlCleanupParser is deliberately not called here: it releases
	 library-global state and is documented as something to call only
	 when the process is completely done with libxml2.  */
    }

  return result;
}
/*
 * Test driver: extracts the URL from a hard-coded message, downloads the
 * page, reads its <title> through XPath and prints a rewritten message of
 * the form "<title>" <[URL]...[/URL]>.
 *
 * Returns 1 on success and 0 on any failure (no URL, unparsable HTML, no
 * title node).
 */
int _tmain(int argc, _TCHAR* argv[])
{
	char message[100] = "[URL]https://www.youtube.com/watch?v=wCRStRWMdWM#t=39s[/URL]\0";
	CURLcode curlCode;
	const char *curlMessage = "";
	struct MemoryStruct chunk;
	htmlDocPtr doc = NULL;
	xmlXPathContextPtr context = NULL;
	xmlXPathObjectPtr result = NULL;
	char *keyword = NULL;
	int i;
	int status = 0;
	char newMessage[1024];
	char errorMessage[128];
	const char* url;

	//printf("Enter your message: ");
	//fgets(message, 100, stdin);

	printf("Your Message is: %s\n", message);
	system("pause");

	url = GetURLFromMessage(message);

	chunk.memory = (char *) malloc(1);  /* grown as needed while downloading */
	chunk.size = 0;    /* no data at this point */

	if (url == NULL) {
		printf("URL is null, exiting...\n");
		system("pause");
		goto cleanup;
	}

	/* Never pass downloaded or user-derived text as the format string. */
	printf("URL: %s\n", url);

	//ts3Functions.logMessage("Opening URL: ", LogLevel_INFO, "Plugin", serverConnectionHandlerID);
	//ts3Functions.logMessage(url, LogLevel_INFO, "Plugin", serverConnectionHandlerID);

	GetHTML(url, &chunk, &curlCode, curlMessage);

	printf("curlMessage: %s\n", curlMessage);

	//if (curlCode != 0) {
	//	ts3Functions.logMessage("cURL Error: ", LogLevel_ERROR, "Plugin", serverConnectionHandlerID);
	//	ts3Functions.logMessage(curlMessage, LogLevel_ERROR, "Plugin", serverConnectionHandlerID);
	//}

	/* The byte count is cast explicitly so it matches the %lu conversion. */
#if defined(_WIN32) || defined(WIN32) || defined(_WIN64) || defined(WIN64)
	sprintf_s(errorMessage, 128, "Reading HTML file that is the following bytes long: %lu", (unsigned long) chunk.size);
#else
	snprintf(errorMessage, sizeof(errorMessage), "Reading HTML file that is the following bytes long: %lu", (unsigned long) chunk.size);
#endif
	//ts3Functions.logMessage(errorMessage, LogLevel_INFO, "Plugin", serverConnectionHandlerID);

	doc = htmlReadMemory(chunk.memory, (int) chunk.size, url, NULL, HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
	if (!doc) {
		printf("Could not read HTML document from memory\n");
		system("pause");
		//ts3Functions.logMessage("Could not read HTML document from memory", LogLevel_ERROR, "Plugin", serverConnectionHandlerID);
		goto cleanup;
	}

	context = xmlXPathNewContext(doc);
	if (context != NULL)
		result = xmlXPathEvalExpression((const xmlChar *) "/html/head/title", context);

	if (result == NULL || xmlXPathNodeSetIsEmpty(result->nodesetval)) {
		printf("Could not read HTML node set from memory\n");
		system("pause");
		//ts3Functions.logMessage("Could not read HTML node set from memory", LogLevel_ERROR, "Plugin", serverConnectionHandlerID);
		goto cleanup;
	}

	/* Keep the text of the last matching node, releasing earlier ones. */
	for (i = 0; i < result->nodesetval->nodeNr; i++) {
		if (keyword != NULL)
			xmlFree(keyword);
		keyword = (char *) xmlNodeListGetString(doc, result->nodesetval->nodeTab[i]->xmlChildrenNode, 1);
		//printf("keyword: %s\n", keyword);
	}

#if defined(_WIN32) || defined(WIN32) || defined(WIN64) || defined(_WIN64)
	strcpy_s(newMessage, 1024, "\"");
	strcat_s(newMessage, 1024, keyword ? (const char *) keyword : "");
	strcat_s(newMessage, 1024, "\" <[URL]");
	strcat_s(newMessage, 1024, url);
	strcat_s(newMessage, 1024, "[/URL]>");
#else
	/* Bounded formatting: the title length is not under our control. */
	snprintf(newMessage, sizeof(newMessage), "\"%s\" <%s>", keyword ? (const char *) keyword : "", message);
#endif

	printf("New message: %s\n", newMessage);
	printf("End\n");
	system("pause");
	status = 1;

cleanup:
	/* Single exit path so that nothing is leaked on early failures. */
	if (keyword != NULL)
		xmlFree(keyword);
	if (result != NULL)
		xmlXPathFreeObject(result);
	if (context != NULL)
		xmlXPathFreeContext(context);
	if (doc != NULL)
		xmlFreeDoc(doc);
	free(chunk.memory);
	return status;
}
/* free() returned text */ static char *scrape_lyrics_from_lyricwiki_edit_page(const char *buf, int64_t len) { xmlDocPtr doc; gchar *ret = NULL; /* * temporarily set our error-handling functor to our suppression function, * but we have to set it back because other components of Audacious depend * on libxml and we don't want to step on their code paths. * * unfortunately, libxml is anti-social and provides us with no way to get * the previous error functor, so we just have to set it back to default after * parsing and hope for the best. */ xmlSetGenericErrorFunc(NULL, libxml_error_handler); doc = htmlReadMemory(buf, (int) len, NULL, "utf-8", (HTML_PARSE_RECOVER | HTML_PARSE_NONET)); xmlSetGenericErrorFunc(NULL, NULL); if (doc != NULL) { xmlXPathContextPtr xpath_ctx = NULL; xmlXPathObjectPtr xpath_obj = NULL; xmlNodePtr node = NULL; xpath_ctx = xmlXPathNewContext(doc); if (xpath_ctx == NULL) goto give_up; xpath_obj = xmlXPathEvalExpression((xmlChar *) "//*[@id=\"wpTextbox1\"]", xpath_ctx); if (xpath_obj == NULL) goto give_up; if (!xpath_obj->nodesetval->nodeMax) goto give_up; node = xpath_obj->nodesetval->nodeTab[0]; give_up: if (xpath_obj != NULL) xmlXPathFreeObject(xpath_obj); if (xpath_ctx != NULL) xmlXPathFreeContext(xpath_ctx); if (node != NULL) { xmlChar *lyric = xmlNodeGetContent(node); if (lyric != NULL) { GMatchInfo *match_info; GRegex *reg; reg = g_regex_new("<(lyrics?)>[[:space:]]*(.*?)[[:space:]]*</\\1>", (G_REGEX_MULTILINE | G_REGEX_DOTALL), 0, NULL); g_regex_match(reg, (gchar *) lyric, G_REGEX_MATCH_NEWLINE_ANY, &match_info); ret = g_match_info_fetch(match_info, 2); if (!g_utf8_collate(ret, "<!-- PUT LYRICS HERE (and delete this entire line) -->")) { free(ret); ret = strdup(_("No lyrics available")); } g_regex_unref(reg); } xmlFree(lyric); } xmlFreeDoc(doc); } return ret; }
/* Parse the buffer text between START and END with libxml2 - as HTML when
   HTMLP is true, as XML otherwise - and convert the result to a Lisp DOM
   via make_dom.  BASE_URL, if non-nil, must be a string and is passed to
   the parser as the document URL.

   When DISCARD_COMMENTS is nil, every top-level node is converted; if
   that yields more than one node they are wrapped in a `top' node.
   Otherwise, or when at most one top-level node was found, only the
   document's root element is converted.  Return nil if parsing fails.  */
static Lisp_Object
parse_region (Lisp_Object start, Lisp_Object end, Lisp_Object base_url,
	      Lisp_Object discard_comments, bool htmlp)
{
  xmlDoc *doc;
  Lisp_Object result = Qnil;
  const char *burl = "";
  ptrdiff_t istart, iend, istart_byte, iend_byte;
  unsigned char *buftext;

  xmlCheckVersion (LIBXML_VERSION);

  validate_region (&start, &end);

  istart = XINT (start);
  iend = XINT (end);
  istart_byte = CHAR_TO_BYTE (istart);
  iend_byte = CHAR_TO_BYTE (iend);

  /* The parser wants one contiguous block of text, so push the gap past
     the region when it lies strictly inside it.  */
  if (istart < GPT && GPT < iend)
    move_gap_both (iend, iend_byte);

  if (! NILP (base_url))
    {
      CHECK_STRING (base_url);
      burl = SSDATA (base_url);
    }

  buftext = BYTE_POS_ADDR (istart_byte);
#ifdef REL_ALLOC
  /* Prevent ralloc.c from relocating the current buffer while libxml2
     functions below read its text.  */
  r_alloc_inhibit_buffer_relocation (1);
#endif
  if (htmlp)
    doc = htmlReadMemory ((char *)buftext,
			  iend_byte - istart_byte, burl, "utf-8",
			  HTML_PARSE_RECOVER|HTML_PARSE_NONET|
			  HTML_PARSE_NOWARNING|HTML_PARSE_NOERROR|
			  HTML_PARSE_NOBLANKS);
  else
    doc = xmlReadMemory ((char *)buftext,
			 iend_byte - istart_byte, burl, "utf-8",
			 XML_PARSE_NONET|XML_PARSE_NOWARNING|
			 XML_PARSE_NOBLANKS |XML_PARSE_NOERROR);

#ifdef REL_ALLOC
  r_alloc_inhibit_buffer_relocation (0);
#endif
  /* If the assertion below fails, malloc was called inside the above
     libxml2 functions, and ralloc.c caused relocation of buffer text,
     so we could have read from unrelated memory.  */
  eassert (buftext == BYTE_POS_ADDR (istart_byte));

  if (doc == NULL)
    return result;

  {
    Lisp_Object last = Qnil;

    if (NILP (discard_comments))
      {
	/* Walk all top-level nodes so that toplevel comments, if any, are
	   picked up together with the other nodes.  */
	xmlNode *cur;
	for (cur = doc->children; cur != NULL; cur = cur->next)
	  {
	    if (!NILP (last))
	      result = Fcons (last, result);
	    last = make_dom (cur);
	  }
      }

    if (!NILP (result))
      result = Fcons (Qtop,
		      Fcons (Qnil, Fnreverse (Fcons (last, result))));
    else
      {
	/* The document has no toplevel comments, or they were discarded:
	   fetch the tree the proper way.  */
	xmlNode *root = xmlDocGetRootElement (doc);
	if (root != NULL)
	  result = make_dom (root);
      }

    xmlFreeDoc (doc);
  }

  return result;
}