/*
 * Build a MessageBody from a collected part.
 *
 * body_part     - part whose content and content type are copied; must not be NULL.
 * sanitize_body - when TRUE the content is parsed with Gumbo and passed through
 *                 sanitize(); otherwise the bytes are copied verbatim.
 * inlines       - forwarded untouched to sanitize().
 *
 * Returns a MessageBody obtained from new_message_body(), or NULL when
 * body_part is NULL.
 */
static MessageBody* get_body(CollectedPart *body_part, gboolean sanitize_body, GPtrArray *inlines) {
  g_return_val_if_fail(body_part != NULL, NULL);

  const gchar *raw_bytes = (const gchar*) body_part->content->data;

  MessageBody *body = new_message_body();
  /* The size always reflects the raw content, even when it is sanitized below. */
  body->size = body_part->content->len;
  body->content_type = g_strdup(body_part->content_type);

  if (!sanitize_body) {
    body->content = g_string_new_len(raw_bytes, body_part->content->len);
    return body;
  }

  /* Parse a private copy of the content as HTML. */
  GString *unsanitized = g_string_new_len(raw_bytes, body_part->content->len);
  GumboOutput *parsed = gumbo_parse_with_options(&kGumboDefaultOptions,
                                                 unsanitized->str,
                                                 unsanitized->len);

  /* sanitize() produces the cleaned-up markup that becomes the body content. */
  body->content = sanitize(parsed->document, inlines);

  gumbo_destroy_output(&kGumboDefaultOptions, parsed);
  g_string_free(unsanitized, TRUE);

  return body;
}
int main(int argc, const char** argv){ if(argc != 2){ printf("得到html文件的各个节点(请加上html文件名作为执行参数)"); exit(1); } const char* filename = argv[1]; // FILE *fp; if(!(fp = fopen(filename, "r"))){ printf("file open error!"); exit(1); } // char* input; int length; readFile(fp, &input, &length); // GumboOutput* output = gumbo_parse_with_options( &kGumboDefaultOptions, input, length); findTag(output->root, 0); // gumbo_destroy_output(&kGumboDefaultOptions, output); fclose(fp); free(input); }
/// Parses `html` with Gumbo and returns whatever search_text() yields for
/// `tag`, starting from the document root. The parse tree is destroyed
/// before returning.
string HtmlParser::buscarTextoInTag(char *html, Tags *tag) {
    GumboOutput* parsed = gumbo_parse(html);

    string texto = search_text(parsed->root, tag);

    gumbo_destroy_output(&kGumboDefaultOptions, parsed);
    return texto;
}
/// Destroys a Gumbo parse result using the default options; a null pointer
/// is ignored.
void gumboPtrDeleter(GumboOutput* gumboPtr) {
    if (gumboPtr == nullptr) {
        return;
    }
    gumbo_destroy_output(&kGumboDefaultOptions, gumboPtr);
}
/*
 * Entry point of clean_text: reads up to 65535 bytes of the HTML file named
 * on the command line, parses them with Gumbo and prints the result of
 * cleantext() for the document root.
 *
 * Exits with EXIT_FAILURE when the argument count is wrong, the file cannot
 * be opened, or the read buffer cannot be allocated.
 */
int main(int argc, char** argv) {
  enum { kMaxInputBytes = 65535 };

  if (argc != 2) {
    puts("Usage: clean_text <html filename>");
    exit(EXIT_FAILURE);
  }
  const char* filename = argv[1];

  FILE *in = fopen(filename, "rb");
  if (!in) {
    /* Without this check fread() below would be handed a NULL stream. */
    fprintf(stderr, "File %s not found!\n", filename);
    exit(EXIT_FAILURE);
  }

  /* One extra byte so the buffer can always be NUL-terminated. */
  char *contents = (char *) malloc(kMaxInputBytes + 1);
  if (!contents) {
    fclose(in);
    fputs("Out of memory\n", stderr);
    exit(EXIT_FAILURE);
  }

  size_t length = fread(contents, 1, kMaxInputBytes, in);
  fclose(in);

  /* gumbo_parse() measures its input with strlen(), so terminate it. */
  contents[length] = '\0';

  GumboOutput* output = gumbo_parse(contents);
  puts(cleantext(output->root));
  gumbo_destroy_output(&kGumboDefaultOptions, output);

  free(contents);
  return 0;
}
/// Releases the Gumbo parse tree, if one is held, and resets the stored
/// source text. Nothing is touched when no tree is held.
GumboInterface::~GumboInterface() {
    if (m_output == NULL) {
        return;
    }

    gumbo_destroy_output(m_output);
    m_output = NULL;
    m_utf8src = "";
}
/*
 * Drop one reference to `ego`.
 *
 * atomic_dec() is applied to the reference count; when it returns zero the
 * wrapped Gumbo output is destroyed and the wrapper itself is freed.
 * A non-zero result leaves the object untouched.
 */
static void es_gumbo_output_release(es_gumbo_output_t *ego) {
  if(!atomic_dec(&ego->ego_refcount)) {
    gumbo_destroy_output(&kGumboDefaultOptions, ego->ego_output);
    free(ego);
  }
}
void HtmlParser::buscarElem2(char *html, Tags *tag1, string atributoElem1, string valAtributoElem1 , Tags *tag2, string atributoElem2) { listUrlInfoSong.clear(); GumboNode* enlace = NULL; GumboOutput* output = gumbo_parse(html); vector <string> resultados; GumboNode* ret = search_for_elem2(output->root, tag1, atributoElem1, valAtributoElem1, tag2, atributoElem2); gumbo_destroy_output(&kGumboDefaultOptions, output); }
/*
 * Lua C function: parse an HTML string with Gumbo and build the document
 * through push_document().
 *
 * Lua arguments:
 *   1            - HTML source (string, required).
 *   2            - tab stop (integer, defaults to 8).
 *   3            - fragment context tag name (string, optional).
 *   4            - fragment namespace: "html" (default), "svg" or "math".
 *   5..4+nupvalues - tables, captured as upvalues of push_document.
 *
 * Returns 1 value on success, or nil plus an error value on failure.
 */
static int parse(lua_State *L) {
    static const char *namespaces[] = {"html", "svg", "math", NULL};
    size_t html_len, context_len;

    GumboOptions options = kGumboDefaultOptions;
    options.max_errors = 0;

    /* Argument checks run in positional order so errors are reported for
     * the first offending argument. */
    const char *html = luaL_checklstring(L, 1, &html_len);
    options.tab_stop = (int)luaL_optinteger(L, 2, 8);

    const char *context_tag = luaL_optlstring(L, 3, NULL, &context_len);
    if (context_tag != NULL) {
        options.fragment_context = gumbo_tagn_enum(context_tag, context_len);
    }

    options.fragment_namespace = luaL_checkoption(L, 4, "html", namespaces);

    for (int arg = 5; arg <= nupvalues + 4; arg++) {
        luaL_checktype(L, arg, LUA_TTABLE);
    }

    /* The checked tables become the upvalues of the push_document closure. */
    lua_pushcclosure(L, push_document, nupvalues);

    GumboOutput *parsed = gumbo_parse_with_options(&options, html, html_len);
    if (parsed == NULL) {
        lua_pushnil(L);
        lua_pushliteral(L, "gumbo_parse_with_options() returned NULL");
        return 2;
    }

    GumboOutputStatus parse_status = parsed->status;
    if (parse_status != GUMBO_STATUS_OK) {
        gumbo_destroy_output(parsed);
        lua_pushnil(L);
        lua_pushstring(L, gumbo_status_to_string(parse_status));
        return 2;
    }

    /* Call the closure in protected mode so the tree is destroyed even when
     * it raises an error. */
    lua_pushlightuserdata(L, &parsed->document->v.document);
    int call_status = lua_pcall(L, 1, 1, 0);
    gumbo_destroy_output(parsed);

    if (call_status != 0) {
        /* Return nil followed by a copy of the error value left by pcall. */
        lua_pushnil(L);
        lua_pushvalue(L, -2);
        return 2;
    }
    return 1;
}
/// Parses `html`, obtains a string from search_for_elem() for the given
/// tag/attribute/value, passes a mutable copy of it through
/// Constant::utf8ascii() and returns the converted text.
string HtmlParser::buscarElem(char *html, Tags *tag, string atributo, string valAtributo) {
    GumboOutput* parsed = gumbo_parse(html);
    string resultado = search_for_elem(parsed->root, tag, atributo, valAtributo);

    // utf8ascii() works in place on a C string, so hand it a writable copy.
    char *scratch = new char[resultado.length() + 1];
    strcpy(scratch, resultado.c_str());
    Constant::utf8ascii(scratch);
    resultado.assign(scratch);
    delete [] scratch;

    gumbo_destroy_output(&kGumboDefaultOptions, parsed);
    return resultado;
}
/*
 * Fetch `url`, parse the response body as HTML and collect packages from it
 * via wiki_registry_find_body().
 *
 * Returns a new list on success, or NULL when the request fails (no
 * response object, or its `ok` flag is false) or the list cannot be created.
 *
 * NOTE(review): the response returned by http_get() is not released here,
 * on either the success or the failure path; no release function is visible
 * from this block — confirm which call frees it and add it.
 */
list_t *wiki_registry(const char *url) {
  response_t *res = http_get(url);
  if (!res || !res->ok) return NULL;

  GumboOutput* output = gumbo_parse(res->data);

  list_t *pkgs = list_new();
  if (pkgs) {
    wiki_registry_find_body(output->root, pkgs);
  }

  /* The tree is destroyed on every path, including a failed list_new(). */
  gumbo_destroy_output(&kGumboDefaultOptions, output);
  return pkgs;
}
/// Builds a DocInfo for `document`.
///
/// The document text is passed through cleanText(), parsed with Gumbo, and
/// the content, page title and links are extracted from the root node. The
/// canonical URL is the CNormalize() form of the document URL when
/// oneurl::Parse() accepts it, and the document URL itself otherwise.
DocInfo HTMLParser::parse(RICPNS::Document &document) {
    oneurl urlNormalizer;

    string cleanedHtml;
    cleanText(document.getText(), cleanedHtml);

    GumboOutput* tree = gumbo_parse(cleanedHtml.c_str());
    GumboNode* rootNode = tree->root;

    string docUrl = document.getURL();
    string content, pageTitle;
    list<pair<string, string> > links;

    // The three extraction passes run sequentially over the same tree.
    extractContent(rootNode, content);
    extractPageTitle(rootNode, pageTitle);
    extractLinks(rootNode, links, docUrl);

    // Everything needed has been copied out of the tree by now.
    gumbo_destroy_output(&kGumboDefaultOptions, tree);

    DocInfo docInfo;
    docInfo.setContent(content);
    docInfo.setCanonicalUrl(
        urlNormalizer.Parse(docUrl) ? urlNormalizer.CNormalize(docUrl) : docUrl);
    docInfo.setUrl(docUrl);
    docInfo.setTitle(pageTitle);
    docInfo.setLinks(links);

    return docInfo;
}
/*
 * Lua C function: parse the HTML string in argument 1 (tab stop in optional
 * argument 2, default 8) and push the document via push_node().
 *
 * On success the document table is returned with its "root" field set to
 * the entry at index (root->index_within_parent + 1) of that table.
 * On failure nil and the message "Failed to parse" are returned.
 */
static int parse(lua_State *L) {
    size_t html_len;
    const char *html = luaL_checklstring(L, 1, &html_len);

    GumboOptions options = kGumboDefaultOptions;
    options.tab_stop = luaL_optint(L, 2, 8);

    GumboOutput *parsed = gumbo_parse_with_options(&options, html, html_len);
    if (!parsed) {
        lua_pushnil(L);
        lua_pushliteral(L, "Failed to parse");
        return 2;
    }

    push_node(L, parsed->document);
    /* +1 converts the zero-based child index to a Lua table index. */
    lua_rawgeti(L, -1, parsed->root->index_within_parent + 1);
    lua_setfield(L, -2, "root");

    gumbo_destroy_output(&options, parsed);
    return 1;
}
/// Parses the HTML held by `data`, gathers links from it with getlinks() and
/// stores them in a fresh CCLinks object assigned to `links`.
///
/// When gumbo_parse() yields no output the function returns early and
/// neither `data` nor `links` is modified.
void Client::Parser(CCUserDataPtr &data, CCLinksPtr &links) {
    CCLinksPtr collected(new CCLinks);
    std::string page = data->getdata();
    std::vector<std::string> urls;

    GumboOutput *tree = gumbo_parse(page.c_str());
    if (!tree) {
        return;
    }

    getlinks(tree->root, urls);
    gumbo_destroy_output(&kGumboDefaultOptions, tree);

    collected->setlinks(urls);
    data->setdata(page);
    links = collected;
}
void t2HTMLParser::t2LabelParser::parse(const char* html) { GumboOutput* output = gumbo_parse(html); GumboNode *root = output->root; if(root->type != GUMBO_NODE_ELEMENT) { t2PrintError("html格式有误"); return; } // find head GumboVector *rootChildren = &root->v.element.children; GumboNode *h = NULL, *b = NULL; for(int i = 0; i < rootChildren->length; i++) { GumboNode *child = (GumboNode *) rootChildren->data[i]; if(child->type == GUMBO_NODE_ELEMENT && child->v.element.tag == GUMBO_TAG_HEAD) h = child; else if(child->type == GUMBO_NODE_ELEMENT && child->v.element.tag == GUMBO_TAG_BODY) b = child; } if(!h) { t2PrintError("html文件缺少<head>标签"); return; } head(h); // find body if(!b) { t2PrintError("html文件缺少<body>标签"); return; } body(b); gumbo_destroy_output(&kGumboDefaultOptions, output); }
/// Constructs the parser. Currently this only parses a fixed sample snippet
/// and immediately destroys the resulting tree.
HTMLParser::HTMLParser() {
    GumboOutput* sample = gumbo_parse("<h1>Hello, World!</h1>");

    // Placeholder: work on sample->root would go here.

    gumbo_destroy_output(&kGumboDefaultOptions, sample);
}
void HtmlParser::buscarElementos(char *html, Tags *tag) { GumboOutput* output = gumbo_parse(html); search_for_links(output->root, tag); gumbo_destroy_output(&kGumboDefaultOptions, output); }
/*
 * Parse `content` as HTML, store the tree in c->output, process it with
 * crawl_parse_real() starting at the root, then destroy the tree.
 *
 * Note that c->output still holds the (now destroyed) pointer on return.
 */
void crawl_parse_parse( crawl_parse_t *c, uint8_t *content )
{
  const char *html = (const char *)content;

  c->output = gumbo_parse( html );

  GumboNode *root = c->output->root;
  crawl_parse_real( c, root );

  gumbo_destroy_output( &kGumboDefaultOptions, c->output );
}
bool doFetchLesson(Class t, ThreadSafeQueue<Class> *classList, std::atomic<int> *totalLessons) { std::string buffer; CURL *curl; CURLcode res; weekAmountList.lock(); int weekAmount = -1; for (int i = 0; i < weekAmountList.size(); i++) { if (weekAmountList.at(i).departmentStringId == t.departmentString() && weekAmountList.at(i).cpath == t.cpath()) { weekAmount = weekAmountList.at(i).amountOfWeeks; break; } } if(weekAmount == -1) printf("ERROR: NOT FOUND: %s - %s\n", t.departmentString().c_str(), t.cpath().c_str()); weekAmountList.unlock(); char postFields[1024] = ""; for (int i = 1; i < weekAmount+1; i++) { int n = sprintf(postFields, "%sweken[]=%d&", postFields, i); postFields[n] = '\0'; } int n = sprintf(postFields, "%ssleutelveld=%s&object=%s&filter=%s", postFields, t.classIdString().c_str(), t.cpath().c_str(), t.departmentString().c_str()); postFields[n] = '\0'; //printf("%s\n", postFields); curl = curl_easy_init(); curl_easy_setopt(curl, CURLOPT_URL, "https://rooster.nhl.nl/1516/rooster.php"); curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, writer); curl_easy_setopt(curl, CURLOPT_WRITEDATA, &buffer); curl_easy_setopt(curl, CURLOPT_POSTFIELDS, postFields); //curl_easy_setopt(curl, CURLOPT_REFERER, NHL_REFERER); curl_easy_setopt(curl, CURLOPT_TIMEOUT, CURL_TIMEOUT); // 5 sec time out on whole request curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, CURL_CONNECT_TIMEOUT); // 10 sec connect time out res = curl_easy_perform(curl); //printf("------\n%s------\n\n", buffer.c_str()); if (res == CURLE_OK) { GumboOutput* output = gumbo_parse(buffer.c_str()); GumboNode* node = GetTBodyNode(output->root); if (node == NULL) { if (t.cpath() == "ttstud"){ printf("[%s] FAIL(%s-%s-%s)\n", currentDateTime().c_str(), t.className().c_str(), t.departmentString().c_str(), t.cpath().c_str()); curl_easy_cleanup(curl); } else{ printf("[%s] FAIL(%s-%s-%s), aborting program\n", currentDateTime().c_str(), t.className().c_str(), t.departmentString().c_str(), t.cpath().c_str()); 
curl_easy_cleanup(curl); exit(1); } return true; } GumboVector* children = &node->v.element.children; bool newDay = false; int lessonAmount = 0; int yearOffset = -1; int titleOffset = -1; int locationOffset = -1; int teacherOffset = -1; int typeOffset = -1; int commentOffset = -1; int endOffset = -1; std::string date; char dayName[128]; int day; int month; int year; for (unsigned int i = 0; i < children->length; ++i) { GumboNode *node1 = static_cast<GumboNode*>(children->data[i]); if (node1->v.element.tag == GUMBO_TAG_TR){ GumboAttribute *att = gumbo_get_attribute(&node1->v.element.attributes, "class"); if (att) { //printf("TR CLASS: %s\n", att->value); std::string value = att->value; if (value == "datarij") { std::string yearType = ""; GumboNode *startTimeNode = static_cast<GumboNode*>(node1->v.element.children.data[0]); std::string endDate = ""; if (endOffset != -1) { GumboNode *endTimeNode = static_cast<GumboNode*>(node1->v.element.children.data[endOffset]); endDate = GetTextFromElement(endTimeNode); } if (yearOffset != -1) { GumboNode *yearNode = static_cast<GumboNode*>(node1->v.element.children.data[yearOffset]); // optional if (yearNode) yearType = GetTextFromElement(yearNode); } std::string typeStr = ""; GumboNode *titleNode = static_cast<GumboNode*>(node1->v.element.children.data[titleOffset]); GumboNode *locationNode = static_cast<GumboNode*>(node1->v.element.children.data[locationOffset]); std::string teacher = ""; if (teacherOffset != -1) { GumboNode *teacherNode = static_cast<GumboNode*>(node1->v.element.children.data[teacherOffset]); teacher = GetTextFromElement(teacherNode); } //if(teacher == "" && t.departmentString() == "TEE" && t.cpath() == "stud"){ //printf("Teacher empty: %s - %s\n", GetTextFromElement(titleNode).c_str(), GetTextFromElement(startTimeNode).c_str()); //printf("Buffer:\n%s\n----\n", buffer.c_str()); //} //printf("%s\n", teacher.c_str()); if (typeOffset != -1) { GumboNode *typeNode = 
static_cast<GumboNode*>(node1->v.element.children.data[typeOffset]); typeStr = GetTextFromElement(typeNode); } GumboNode *commentsNode = static_cast<GumboNode*>(node1->v.element.children.data[commentOffset]); std::string startDate = GetTextFromElement(startTimeNode); int startHour; int startMinute; sscanf(startDate.c_str(), "%02d:%02d", &startHour, &startMinute); char newStartDate[128]; //YYYY-MM-DDTHH:MM:SS int n = sprintf(newStartDate, "%04d-%02d-%02dT%02d:%02d:00", year, month, day, startHour, startMinute); newStartDate[n] = '\0'; char newEndDate[128]; if (endDate != "") { int endHour; int endMinute; sscanf(endDate.c_str(), "%02d:%02d", &endHour, &endMinute); //YYYY-MM-DDTHH:MM:SS int ne = sprintf(newEndDate, "%04d-%02d-%02dT%02d:%02d:00", year, month, day, endHour, endMinute); newEndDate[ne] = '\0'; }else newEndDate[0] = '\0'; //printf("%s - %s - %s - %s - %s (YEAR: %s)\n", GetTextFromElement(titleNode).c_str(), newStartDate, newEndDate, GetTextFromElement(teacherNode).c_str(), GetTextFromElement(locationNode).c_str(), yearType.c_str()); int weekNr = getWeekNrFromDate(newStartDate); // Calculate time difference to remove old lessons time_t timeStampThisWeek = getTimeStampFromDate(newStartDate); time_t timeStampCurrentWeek; time(&timeStampCurrentWeek); //printf("Current week: %d - Lesson week: %d\n", getCurrentWeekNumber(), weekNr); double diff = difftime(timeStampThisWeek, timeStampCurrentWeek); double weeks = diff / 604800; //printf("Time difference: %.f\n", weeks); if (weeks > -2) { // ignore old lessons t.addLesson(shared_ptr<Lesson>(new Lesson(GetTextFromElement(titleNode), GetTextFromElement(commentsNode), teacher, replaceAll(GetTextFromElement(locationNode), " ", ", "), newStartDate, newEndDate, weekNr, yearType, typeStr))); ++*totalLessons; } lessonAmount++; } else if (value == "weekheader") { yearOffset = -1; titleOffset = -1; locationOffset = -1; teacherOffset = -1; typeOffset = -1; commentOffset = -1; GumboVector* children = 
&node1->v.element.children; for (unsigned int i = 0; i < children->length; ++i) { GumboNode *child = static_cast<GumboNode*>(children->data[i]); std::string text = GetTextFromElement(child); if (text == "jaar") yearOffset = i; else if (text == "activiteit") titleOffset = i; else if (text == "lokaal") locationOffset = i; else if (text == "docent(en)" || text == "klas(en)") teacherOffset = i; else if (text == "werkvorm") typeOffset = i; else if (text == "opmerkingen") commentOffset = i; else if (text == "eind") endOffset = i; } } } else { // no class GumboNode *td = static_cast<GumboNode*>(node1->v.element.children.data[0]); if (td->v.element.tag == GUMBO_TAG_TD) { GumboAttribute *classAtt = gumbo_get_attribute(&td->v.element.attributes, "class"); if (classAtt) { std::string dayRow = classAtt->value; if (dayRow == "dagrij") { GumboNode *dateNode = static_cast<GumboNode*>(td->v.element.children.data[0]); date = dateNode->v.text.text; // get date with sscanf sscanf(date.c_str(), "%s %02d-%02d-%04d", &dayName, &day, &month, &year); //printf("New day @ %s\n", date.c_str()); } } } } } } //printf("Lessons: %d\n", lessonAmount); gumbo_destroy_output(&kGumboDefaultOptions, output); curl_easy_cleanup(curl); classList->lock(); classList->push(t); classList->unlock(); return true; } else{ //printf("[%s] Fail: %s, aborting program", currentDateTime().c_str(), curl_easy_strerror(res)); curl_easy_cleanup(curl); //exit(1); return false; } /* New ICAL style as of semester starting at 01-09-2015 */ /* icalcomponent *rootNode = icalparser_parse_string(cstr); icalcomponent *comp = icalcomponent_get_first_component(rootNode, ICAL_VEVENT_COMPONENT); //icalcomponent *zoneComp = icalcomponent_get_first_component(rootNode, ICAL_VTIMEZONE_COMPONENT); //icaltimezone *zone = icaltimezone_get_builtin_timezone("Europe/Amsterdam"); //icalcomponent *next = icalcomponent_get_next_component(rootNode, ICAL_VEVENT_COMPONENT); while (comp != NULL){ //printf("%s\n", icalcomponent_as_ical_string(comp)); 
std::string summary = icalcomponent_get_summary(comp); icaltimetype dtstart = icalcomponent_get_dtstart(comp); std::string starttime = formatDateTime(getTimeStampFromDateAlt(icaltime_as_ical_string(dtstart))); icaltimetype dtend = icalcomponent_get_dtend(comp); std::string endtime = formatDateTime(getTimeStampFromDateAlt(icaltime_as_ical_string(dtend))); const char *locationStr = icalcomponent_get_location(comp); // can be null std::string location = ""; if (locationStr){ location = locationStr; location = replaceAll(location, " ", ", "); } std::string commentStr = (char *)icalcomponent_get_comment(comp); printf("Comment: %s\n", commentStr.c_str()); char *comment = (char *)commentStr.c_str(); char *line = strtok(comment, "\n"); std::string docenten; while (line != NULL){ if (strstr(line, "Docent(en): ")){ line += 12; // 12 is length of Docent(en) int length = strlen(line); docenten = line; } line = strtok(NULL, "\n"); } int weekNr = getWeekNrFromDate(starttime); //printf("%s - %s\n", summary.c_str(), starttime.c_str()); if (weekNr > getCurrentWeekNumber() - 2) { // ignore old lessons t.addLesson(shared_ptr<Lesson>(new Lesson(summary, summary, docenten, location, starttime, endtime, weekNr))); printf("1Add Lesson(%s): %s - %s - %s - %s(%s)\n", t.className().c_str(), summary.c_str(), starttime.c_str(), endtime.c_str(), docenten.c_str(), commentStr.c_str()); } icalcomponent_free(comp); comp = icalcomponent_get_next_component(rootNode, ICAL_VEVENT_COMPONENT); } icalcomponent_free(comp); icalcomponent_free(rootNode); classList->lock(); classList->push(t); classList->unlock(); */ // delete our garbage //delete[] refererUrlBuffer; //delete[] urlBuffer; //delete[] cstr; //curl_free(classIdStringEscaped); //curl_easy_cleanup(curl); //return true; /* Old XML Style xml_document<> doc; doc.parse<0>(cstr); xml_node<> *pRoot = doc.first_node(); if (pRoot == 0) { std::cout << "doFetchLesson() ERROR: Invalid rootnode" << std::endl; exit(1); // Immediately abort program as the 
document is unreadable } else if (pRoot != NULL) { pRoot = pRoot->first_node(); if (pRoot == 0) { std::cout << "doFetchLesson() ERROR: Rootnode has an invalid first node" << std::endl; exit(1); // Immediately abort program as the document is unreadable } } for (xml_node<> *pNode = pRoot->first_node("item"); pNode; pNode = pNode->next_sibling()) { std::string title = pNode->first_node("title") ? pNode->first_node("title")->value() : ""; if (title.length() > 2) title = title.substr(title.find(": ") + 2, title.length()); std::string description = pNode->first_node("description") ? pNode->first_node("description")->value() : ""; std::string teacher = getStringBetween(" - ", " -", description, "([a-zA-Z,. ]+)"); std::string location = pNode->first_node("ev:location") ? pNode->first_node("ev:location")->value() : ""; location = trim(location); char *locationDecoded = curl_easy_unescape(curl, location.c_str(), 0, NULL); location = std::string(locationDecoded); curl_free(locationDecoded); std::string startdate = pNode->first_node("ev:startdate") ? pNode->first_node("ev:startdate")->value() : ""; std::string enddate = pNode->first_node("ev:enddate") ? pNode->first_node("ev:enddate")->value() : ""; int weekNr = getWeekNrFromDate(startdate); if (weekNr > getCurrentWeekNumber() - 2) // ignore old lessons t.addLesson(shared_ptr<Lesson>(new Lesson(title, description, teacher, location, startdate, enddate, weekNr))); } // ready to push back the class classList->lock(); classList->push_back(t); classList->unlock(); // delete our garbage delete[] refererUrlBuffer; delete[] urlBuffer; delete[] cstr; curl_free(classIdStringEscaped); curl_easy_cleanup(curl); return true;*/ }
/// Destroys the wrapped Gumbo output, if any, and clears the pointer.
GumboOutputWrapper::~GumboOutputWrapper() {
    if (!output_) {
        return;
    }

    gumbo_destroy_output(&kGumboDefaultOptions, output_);
    output_ = nullptr;
}
/// Builds an article from the file at `path`.
///
/// The article id and URL are the part of `path` after directoryPath plus
/// one separator character; MIME type and namespace are derived from that
/// id. For "text/html" files the document is parsed to extract the <title>
/// text and, when `detectRedirects` is set, a redirect target from the
/// <head> children. A redirect whose target file does not exist marks the
/// article invalid. When no title was found, one is computed from the file
/// name.
Article::Article(const std::string& path, const bool detectRedirects) {
  invalid = false;

  // Identifier and URL are the same string.
  aid = path.substr(directoryPath.size()+1);
  url = aid;

  mimeType = getMimeTypeForFile(aid);
  ns = getNamespaceForMimeType(mimeType)[0];

  // Everything below only applies to HTML content.
  if (mimeType.find("text/html") == std::string::npos) {
    return;
  }

  std::string html = getFileContent(path);
  GumboOutput* output = gumbo_parse(html.c_str());
  GumboNode* root = output->root;

  if (root->type == GUMBO_NODE_ELEMENT && root->v.element.children.length >= 2) {
    // Find the first <head> among the root's children.
    const GumboVector* root_children = &root->v.element.children;
    GumboNode* head = NULL;
    for (unsigned int idx = 0; idx < root_children->length && head == NULL; ++idx) {
      GumboNode* candidate = static_cast<GumboNode*>(root_children->data[idx]);
      if (candidate->type == GUMBO_NODE_ELEMENT &&
          candidate->v.element.tag == GUMBO_TAG_HEAD) {
        head = candidate;
      }
    }

    if (head != NULL) {
      GumboVector* head_children = &head->v.element.children;

      // Take the title from every <title> holding exactly one text node;
      // a later match overrides an earlier one.
      for (unsigned int idx = 0; idx < head_children->length; ++idx) {
        GumboNode* node = static_cast<GumboNode*>(head_children->data[idx]);
        if (node->type != GUMBO_NODE_ELEMENT ||
            node->v.element.tag != GUMBO_TAG_TITLE) {
          continue;
        }
        if (node->v.element.children.length != 1) {
          continue;
        }

        GumboNode* text_node = static_cast<GumboNode*>(node->v.element.children.data[0]);
        if (text_node->type != GUMBO_NODE_TEXT) {
          continue;
        }

        title = text_node->v.text.text;
        stripTitleInvalidChars(title);
      }

      // Redirect detection is only done here when requested by the caller.
      std::string targetUrl;
      try {
        if (detectRedirects) {
          targetUrl = extractRedirectUrlFromHtml(head_children);
        }
      } catch (std::string &error) {
        std::cerr << error << std::endl;
      }

      if (!targetUrl.empty()) {
        redirectAid = computeAbsolutePath(aid, decodeUrl(targetUrl));
        if (!fileExists(directoryPath + "/" + redirectAid)) {
          // Broken redirect: drop the target and flag the article.
          redirectAid.clear();
          invalid = true;
        }
      }
    }

    // Fall back to the file name (without directory and extension).
    if (title.empty()) {
      const std::size_t slashPos = path.rfind("/");
      if (slashPos == std::string::npos) {
        title = path;
      } else {
        title = path.substr(slashPos+1);
        const std::size_t dotPos = title.rfind(".");
        if (dotPos != std::string::npos) {
          title = title.substr(0, dotPos);
        }
      }
      std::replace(title.begin(), title.end(), '_', ' ');
    }
  }

  gumbo_destroy_output(&kGumboDefaultOptions, output);
}
int main(int argc, char *argv[]) { // system("chcp 65001"); system("chcp 1251"); // Read HTML file contents std::string htmlFileName = "C:\\Projects\\__DATA\\test_page_30.html"; std::ifstream htmlFileStream(htmlFileName, std::ios::in | std::ios::binary); if (!htmlFileStream) { std::cout << "File " << htmlFileName << " not found!"; return( EXIT_FAILURE ); } std::string htmlFileContents; htmlFileStream.seekg(0, std::ios::end); htmlFileContents.resize(htmlFileStream.tellg()); htmlFileStream.seekg(0, std::ios::beg); htmlFileStream.read(&htmlFileContents[0], htmlFileContents.size()); htmlFileStream.close(); // Convert it to UTF-8 // NOTE: Gumbo works ONLY with UTF-8 documents char* bufUtf8 = new char[htmlFileContents.length() * 2]; memset(bufUtf8, 0, htmlFileContents.length() * 2); cp1251ToUtf8(bufUtf8, htmlFileContents.c_str()); // delete[] bufUtf8; // return 0; // Parse web page contents GumboOutput* output = gumbo_parse(/*htmlFileContents.c_str()*/ bufUtf8 ); // search_for_links(output->root); searchForDivBlocks(output->root); gumbo_destroy_output(&kGumboDefaultOptions, output); delete[] bufUtf8; _getchar_nolock(); return 0; }