/**
 * Returns the links found in this page that point to the same host as the
 * page itself.
 *
 * Every <a> element that carries an href attribute is resolved against the
 * page URL; the resolved URL is kept only when its host equals the host of
 * the page URL.
 *
 * @return a vector of resolved GURLs; empty when the page has no content
 */
vector<GURL> Page::getLinks()
{
    vector<GURL> answer;
    HTML::ParserDom parser;
    GURL curr(url);

    if (getContent() == "")
        return answer;

    tree<HTML::Node> dom = parser.parseTree(content);
    tree<HTML::Node>::iterator it = dom.begin();
    tree<HTML::Node>::iterator end = dom.end();

    for (; it != end; ++it)
    {
        if (it->tagName() != "a")
            continue;

        it->parseAttributes();

        // attribute() reports presence in .first. Anchors without an href
        // (e.g. <a name="top">) used to yield an empty string, which resolves
        // to the page URL itself and was wrongly reported as a link.
        const std::pair<bool, std::string> href = it->attribute("href");
        if (!href.first)
            continue;

        GURL resolved = curr.Resolve(href.second);
        if (resolved.host() == curr.host())
            answer.push_back(resolved);
    }

    return answer;
}
/**
 * Reads an HTML file and returns one url_struct per usable <a href> in it.
 *
 * Links whose text starts with "http" are stored as-is (url and baseurl both
 * set to the link, flag 0). Any other link that does not contain "mailto:"
 * is prefixed with a base ("<baseurl>/" when myurl.flag is set, otherwise
 * "<url>/") and stored with myurl.baseurl and flag 1. Empty links and "#"
 * are ignored.
 *
 * @param filename path of the HTML file to parse
 * @param myurl    the URL the file was fetched from
 * @return the collected links, in document order
 */
std::list<url_struct> getAllLinks(string filename, url_struct myurl)
{
    // Load and parse the document.
    const string html = get_file_contents(filename.c_str());
    HTML::ParserDom parser;
    tree<HTML::Node> dom = parser.parseTree(html);

    // Prefix used for links that are not absolute.
    string base;
    if (myurl.flag)
        base = myurl.baseurl + "/";
    else
        base = myurl.url + "/";

    std::list<url_struct> found;
    url_struct entry;

    tree<HTML::Node>::iterator last = dom.end();
    for (tree<HTML::Node>::iterator node = dom.begin(); node != last; ++node)
    {
        if (node->tagName() != "a")
            continue;

        node->parseAttributes();
        const string href = node->attribute("href").second;
        if (href.empty() || href == "#")
            continue;

        if (href.find("http") == 0)
        {
            // Absolute link: it is its own base.
            entry.url = href;
            entry.baseurl = href;
            entry.flag = 0;
            found.push_back(entry);
        }
        else if (href.find("mailto:") == std::string::npos)
        {
            // Relative link: glue it onto the base of the current page.
            entry.url = base + href;
            entry.baseurl = myurl.baseurl;
            entry.flag = 1;
            found.push_back(entry);
        }
    }

    return found;
}
/**
 * Fetches the tracked page and builds a database from its
 * <div class="media-body"> elements.
 *
 * Each matching div is turned into a data item via getOneDataItem(); items
 * whose URL is in the ignore list are dropped.
 *
 * @param gratis forwarded unchanged to getOneDataItem()
 * @return a newly created database holding the accepted items
 */
DatabasePtr Tracker::track(bool gratis)
{
    DatabasePtr db = DatabasePtr(new Database);
    std::string content = fetch();

    HTML::ParserDom parser;
    tree<HTML::Node> dom = parser.parseTree(content);

    tree<HTML::Node>::iterator last = dom.end();
    for (tree<HTML::Node>::iterator it = dom.begin(); it != last; ++it)
    {
        if (it->tagName() != "div")
            continue;

        // The class is matched on the raw tag text rather than on parsed attributes.
        const bool isMediaBody =
            it->text().find("class=\"media-body\"") != std::string::npos;
        if (!isMediaBody)
            continue;

        DataItemPtr di = getOneDataItem(content, it, gratis);
        if (inIgnoreList(di->m_URL))
            continue;

        db->add(di);
    }

    return db;
}
std::list<std::string> LinkExtractor::links() { if(m_linksExtracted == false) { std::list<std::string> links(64); char *htmlContents = NULL; unsigned long htmlLength = 0; if(m_source) htmlContents = m_source; else if(m_filename) file_get_contents(m_filename, &htmlContents, &htmlLength); else return std::list<std::string>(); HTML::ParserDom parser; tree<HTML::Node> dom = parser.parseTree(htmlContents); tree<HTML::Node>::iterator it = dom.begin(); tree<HTML::Node>::iterator end = dom.end(); for(; it != end; ++it) { if(it->tagName() == "a") { it->parseAttributes(); std::pair<bool, std::string> linkPair = it->attribute("href"); links.push_back(linkPair.second); m_links.push_back(linkPair.second); m_numLinks++; } } m_linksExtracted = true; return links; } else { return m_links; } }
/**
 * Parses an HTML string and prints every text node (anything that is neither
 * a tag nor a comment) to stdout, one per line, prefixed with "tagvalue:".
 *
 * Each text is passed through util::Utf8ToUnicode and then
 * util::UnicodeToAnsi before being printed.
 *
 * @param html the HTML document to dump
 */
void parser(string html)
{
    HTML::ParserDom domParser;
    tree<HTML::Node> dom = domParser.parseTree(html);

    tree<HTML::Node>::iterator last = dom.end();
    for (tree<HTML::Node>::iterator node = dom.begin(); node != last; ++node)
    {
        // Only plain text nodes are of interest.
        if (node->isTag() || node->isComment())
            continue;

        std::string utf8Text = node->text();
        std::wstring wideText = util::Utf8ToUnicode(utf8Text.c_str());
        std::string ansiText = util::UnicodeToAnsi(wideText.c_str());
        cout << "tagvalue:" << ansiText.c_str() << endl;
    }
}
/* * === FUNCTION ====================================================================== * Name: main * Author: bbxyard * Created: 2014年08月08日 18时22分11秒 * Description: * ===================================================================================== */ int main (int argc, char *argv[]) { using namespace std; using namespace htmlcxx; //Parse some html code string html = "<html><body>hey<A href=\"www.bbxyard.com\">myhome</A></body></html>"; HTML::ParserDom parser; tree<HTML::Node> dom = parser.parseTree(html); //Print whole DOM tree cout << dom << endl; //Dump all links in the tree tree<HTML::Node>::iterator it = dom.begin(); tree<HTML::Node>::iterator end = dom.end(); for (; it != end; ++it) { if (strcasecmp(it->tagName().c_str(), "A") == 0) { it->parseAttributes(); cout << it->attribute("href").second << endl; } } //Dump all text of the document it = dom.begin(); end = dom.end(); for (; it != end; ++it) { if ((!it->isTag()) && (!it->isComment())) { cout << it->text() << " "; } } cout << endl; return 0; }
/**
 * Parses an HTML document and fills this page's text, title, metadata and
 * outgoing links.
 *
 * - text_ receives url_ followed by every text node (space separated),
 *   excluding nodes whose parent is <script> or <noscript>.
 * - title_ receives the text of the first child of <title>.
 * - description_, keywords_ and contentType_ come from <meta> tags.
 * - links_ maps each converted href of an <a> (unless rel="nofollow") to the
 *   text collected from the anchor's following nodes.
 *
 * @param html the HTML document to parse
 */
void Page::parseHTML(string html)
{
    HTML::ParserDom parser;
    tree<HTML::Node> dom = parser.parseTree(html);
    tree<HTML::Node>::iterator it = dom.begin();

    text_ += url_;

    for (; it != dom.end(); ++it)
    {
        if (it.node != 0 && dom.parent(it) != NULL)
        {
            string parent_tag = dom.parent(it)->tagName();

            // Skip JavaScript code
            boost::to_lower(parent_tag);
            if (parent_tag == "script" || parent_tag == "noscript")
            {
                it.skip_children();
                continue;
            }
        }

        // Parse plain text of the page
        if ((!it->isTag()) && (!it->isComment()))
        {
            text_ += " ";
            text_ += it->text();
        }
        else
        {
            // Parse metadata
            string tagName = it->tagName();
            boost::to_lower(tagName);

            if (tagName == "title")
            {
                // An empty <title></title> has no child to read. Stepping
                // forward regardless would take an unrelated node as the
                // title and also hide that node from the rest of this loop.
                if (it.number_of_children() == 0)
                    continue;

                it++;
                if (it == dom.end())
                    return;
                title_ = it->text();
            }
            else if (tagName == "meta")
            {
                it->parseAttributes();

                std::pair<bool, std::string> attrib = it->attribute("name");
                if (attrib.first == true)
                {
                    boost::to_lower(attrib.second);
                    if (attrib.second == "description")
                        description_ = it->attribute("content").second;
                    if (attrib.second == "keywords")
                        keywords_ = it->attribute("content").second;
                }

                attrib = it->attribute("http-equiv");
                boost::to_lower(attrib.second);
                if (attrib.first == true && attrib.second == "content-type")
                {
                    contentType_ = it->attribute("content").second;
                }
            }
            else if (tagName == "a")
            {
                it->parseAttributes();

                std::pair<bool, std::string> attrib = it->attribute("rel");
                boost::to_lower(attrib.second);
                if (attrib.first == true && attrib.second == "nofollow")
                {
                    // rel="nofollow": the link is deliberately not recorded.
                }
                else
                {
                    attrib = it->attribute("href");

                    // Consume as many following nodes as the anchor has
                    // direct children and gather their text as anchor text.
                    string anchor_text;
                    int children = it.number_of_children();
                    for (int i = 0; i < children; i++)
                    {
                        it++;
                        if (it == dom.end())
                            return;
                        if (!it->isTag())
                            anchor_text += it->text();
                    }

                    // Anchors without an href (e.g. <a name="x">) are not
                    // links; recording them would add an entry derived from
                    // an empty string.
                    if (attrib.first)
                        links_[HTML::convert_link(attrib.second, url_)] = anchor_text;

                    // The consumed nodes are no longer seen by the outer
                    // loop, so their text is appended here.
                    text_ += " ";
                    text_ += anchor_text;
                }
            }
        }
    }
}
int main(void) { CURL *curl; CURLcode res; std::string readBuffer; curl = curl_easy_init(); if(curl) { //curl_easy_setopt(curl, CURLOPT_URL, "http://curl.haxx.se/libcurl/"); curl_easy_setopt(curl, CURLOPT_URL, "http://www.firstpost.com"); curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteCallback); curl_easy_setopt(curl, CURLOPT_WRITEDATA, &readBuffer); res = curl_easy_perform(curl); curl_easy_cleanup(curl); //cout << readBuffer << endl; HTML::ParserDom parser; tree<HTML::Node> dom = parser.parseTree(readBuffer); //Print whole DOM tree //cout << dom << endl; tree<HTML::Node>::iterator it = dom.begin(); tree<HTML::Node>::iterator end = dom.end(); string tagarr[] = {"script", "style"}; //print all text for(; it != end; ++it) { //if(it->isTag() && (strcasecmp(it->tagName().c_str(), "script") == 0)) int tagarr_count = sizeof(tagarr)/sizeof(string); string* pstr = std::find(tagarr, tagarr+tagarr_count, it->tagName()); if(pstr != tagarr+tagarr_count) { if((++it)->isTag()) --it; continue; } if((!it->isTag()) && (!it->isComment())) { if(it->text().find_first_not_of(" \t\n\v\f\r") == std::string::npos) { continue; } cout << "||" << it->text() << "||" << endl; cout << "--------------------------" << endl; } } cout << "print all links" << endl; const boost::regex brexpress("https?:\/\/[^\s/$.?#].[^\s]*"); it = dom.begin(); end = dom.end(); for (; it != end; ++it) { if (it->tagName() == "a") { it->parseAttributes(); if(boost::regex_match(it->attribute("href").second, brexpress)) cout << it->attribute("href").second << endl; } } //std::cout << readBuffer << std::endl; } return 0; }
/**
 * Scrapes the Nasdaq secondary-offering activity pages and writes the rows to
 * /tmp/NasdaqSPOs.csv.
 *
 * For each tab ("pricings", "filings") and each month from 2011-01 through
 * 2012-02 the page is downloaded with curlpp, the HTML between
 * <div class="genTable"> and "<!-- end tabpane" is parsed, and every <tr>
 * with a non-empty name becomes one quoted CSV line with the columns
 * type, name, ticker, market, price, shares, offerAmount, date, nasdaqUrl.
 *
 * @return always 0
 */
int main() {
    // The two tabs of the activity page that are scraped.
    vector<string> tabStrings;
    tabStrings.push_back( "pricings" );
    tabStrings.push_back( "filings" );

    // Months 2011-01 .. 2011-12 (zero-padded), followed by 2012-01 and 2012-02.
    vector<string> monthStrings;
    for ( int i=1; i<=12; i++ ) {
        stringstream out;
        out << i;
        if ( i<10 ) {
            monthStrings.push_back("2011-0"+out.str());
        } else {
            monthStrings.push_back("2011-"+out.str());
        }
    }
    monthStrings.push_back("2012-01");
    monthStrings.push_back("2012-02");

    fstream outFile( "/tmp/NasdaqSPOs.csv", fstream::out );

    for ( vector<string>::iterator iter=tabStrings.begin(); iter!=tabStrings.end(); iter++ ) {
        for ( vector<string>::iterator monthIter=monthStrings.begin(); monthIter!=monthStrings.end(); monthIter++ ) {
            cout << "Scraping " << *iter << " for month = " << *monthIter << endl;
            try {
                // Download the page for this tab/month into a string.
                curlpp::Easy myRequest;
                myRequest.setOpt(curlpp::options::Url((std::string( "http://www.nasdaq.com/markets/spos/activity.aspx?tab="+*iter+"&month="+*monthIter))));
                ostringstream os;
                os << myRequest;
                string content = os.str();

                HTML::ParserDom parser;
                if ( *iter == "pricings" ) {
                    string type = "PRICING";
                    // find() results are stored in int, so a "not found"
                    // (npos) shows up as a negative value and fails the
                    // range check below.
                    int startIdx = content.find("<div class=\"genTable\">");
                    int endIdx = content.find("<!-- end tabpane");
                    if ( startIdx > 0 && endIdx > startIdx ) {
                        string htmlContent = content.substr(startIdx,(endIdx-startIdx));
                        tree<HTML::Node> dom = parser.parseTree( htmlContent );
                        for ( tree<HTML::Node>::iterator treeIter=dom.begin(); treeIter!=dom.end(); treeIter++ ) {
                            if ( treeIter->tagName() == "tr" ) {
                                // Re-parse the row's inner HTML on its own so its <td> cells can be walked.
                                string trHtml = treeIter->content( htmlContent );
                                tree<HTML::Node> trDom = parser.parseTree( trHtml );
                                int tdCount = 0;
                                string name = "";
                                string nasdaqUrl = "";
                                string ticker = "";
                                string market = "";
                                string price = "";
                                string shares = "";
                                string offerAmount = "";
                                string datePriced = "";
                                for ( tree<HTML::Node>::iterator trIter=trDom.begin(); trIter!=trDom.end(); trIter++ ) {
                                    if ( trIter->tagName() == "td" ) {
                                        // Column layout: 0 name+url, 1 ticker, 2 market, 3 price,
                                        // 4 shares, 5 offer amount, 6 date priced.
                                        if ( tdCount == 0 ) {
                                            string tdHtml = trIter->content( trHtml );
                                            // Link text sits between the first "\">" and "</a>".
                                            int startIndex = tdHtml.find("\">");
                                            int endIndex = tdHtml.find("</a>");
                                            name = tdHtml.substr(startIndex+2,(endIndex-startIndex-2));
                                            // The URL sits between href=" and the same "\">".
                                            startIndex = tdHtml.find("href=\"");
                                            endIndex = tdHtml.find("\">");
                                            nasdaqUrl = tdHtml.substr(startIndex+6,(endIndex-startIndex-6));
                                        } else if ( tdCount == 1 ) {
                                            string tdHtml = trIter->content( trHtml );
                                            int startIndex = tdHtml.find("\">");
                                            int endIndex = tdHtml.find("</a>");
                                            ticker = tdHtml.substr(startIndex+2,(endIndex-startIndex-2));
                                        } else if ( tdCount == 2 ) {
                                            string tdHtml = trIter->content( trHtml );
                                            int startIndex = tdHtml.find("\">");
                                            int endIndex = tdHtml.find("</a>");
                                            market = tdHtml.substr(startIndex+2,(endIndex-startIndex-2));
                                        } else if ( tdCount == 3 ) {
                                            string tdHtml = trIter->content( trHtml );
                                            price = tdHtml;
                                        } else if ( tdCount == 4 ) {
                                            string tdHtml = trIter->content( trHtml );
                                            shares = tdHtml;
                                        } else if ( tdCount == 5 ) {
                                            string tdHtml = trIter->content( trHtml );
                                            offerAmount = tdHtml;
                                        } else if ( tdCount == 6 ) {
                                            string tdHtml = trIter->content( trHtml );
                                            datePriced = tdHtml;
                                        }
                                        tdCount++;
                                    }
                                }
                                // Rows without a name produce no output line.
                                if ( name != "" ) {
                                    outFile << "\"" + type + "\",\"" + name + "\",\"" + ticker + "\",\"" + market + "\",\"" + price + "\",\"" + shares + "\",\"" + offerAmount + "\",\"" + datePriced + "\",\"" + nasdaqUrl + "\"" << endl;
                                }
                            }
                        }
                    }
                } else if ( *iter == "filings" ) {
                    string type = "FILING";
                    int startIdx = content.find("<div class=\"genTable\">");
                    int endIdx = content.find("<!-- end tabpane");
                    if ( startIdx > 0 && endIdx > startIdx ) {
                        string htmlContent = content.substr(startIdx,(endIdx-startIdx));
                        tree<HTML::Node> dom = parser.parseTree( htmlContent );
                        for ( tree<HTML::Node>::iterator treeIter=dom.begin(); treeIter!=dom.end(); treeIter++ ) {
                            if ( treeIter->tagName() == "tr" ) {
                                string trHtml = treeIter->content( htmlContent );
                                tree<HTML::Node> trDom = parser.parseTree( trHtml );
                                int tdCount = 0;
                                string name = "";
                                string nasdaqUrl = "";
                                string ticker = "";
                                string offerAmount = "";
                                string dateFiled = "";
                                // market, price and shares are never assigned for filings;
                                // they stay empty so the CSV keeps the same column layout
                                // as the pricing rows.
                                string market = "";
                                string price = "";
                                string shares = "";
                                for ( tree<HTML::Node>::iterator trIter=trDom.begin(); trIter!=trDom.end(); trIter++ ) {
                                    if ( trIter->tagName() == "td" ) {
                                        // Column layout: 0 name+url, 1 ticker, 2 offer amount, 3 date filed.
                                        if ( tdCount == 0 ) {
                                            string tdHtml = trIter->content( trHtml );
                                            int startIndex = tdHtml.find("\">");
                                            int endIndex = tdHtml.find("</a>");
                                            name = tdHtml.substr(startIndex+2,(endIndex-startIndex-2));
                                            startIndex = tdHtml.find("href=\"");
                                            endIndex = tdHtml.find("\">");
                                            nasdaqUrl = tdHtml.substr(startIndex+6,(endIndex-startIndex-6));
                                        } else if ( tdCount == 1 ) {
                                            string tdHtml = trIter->content( trHtml );
                                            // The ticker is used verbatim unless the cell contains a
                                            // closing </a>, in which case only the link text is kept.
                                            ticker = tdHtml;
                                            try {
                                                if ( ticker.find("</a>") != string::npos ) {
                                                    int startIndex = ticker.find("\">");
                                                    int endIndex = ticker.find("</a>");
                                                    ticker = ticker.substr(startIndex+2,(endIndex-startIndex-2));
                                                }
                                            } catch ( std::out_of_range &e ) {}
                                        } else if ( tdCount == 2 ) {
                                            string tdHtml = trIter->content( trHtml );
                                            offerAmount = tdHtml;
                                        } else if ( tdCount == 3 ) {
                                            string tdHtml = trIter->content( trHtml );
                                            dateFiled = tdHtml;
                                        }
                                        tdCount++;
                                    }
                                }
                                if ( name != "" ) {
                                    outFile << "\"" + type + "\",\"" + name + "\",\"" + ticker + "\",\"" + market + "\",\"" + price + "\",\"" + shares + "\",\"" + offerAmount + "\",\"" + dateFiled + "\",\"" + nasdaqUrl + "\"" << endl;
                                }
                            }
                        }
                    }
                }
            } catch( curlpp::RuntimeError &e ) {
                std::cout << e.what() << std::endl;
            } catch( curlpp::LogicError &e ) {
                std::cout << e.what() << std::endl;
            } catch( std::out_of_range &e ) {
                // A substr() call whose start position lies beyond the string
                // throws std::out_of_range; it is reported here and the rest
                // of the current page is abandoned.
                cout << e.what() << endl;
            }
            // Pause 125 ms between requests.
            usleep( 125000 );
        }
    }
    outFile.close();
    return 0;
}
bool ParseHtml(const char *phtml,pelem_feature pelemfeature,plist_result presult) { if(!phtml || NULL==pelemfeature || pelemfeature->tagname.size() == 0 || NULL==presult) return false; tree<HTML::Node> tr; std::string html(phtml); HTML::ParserDom parser; parser.parse(html); tr = parser.getTree(); makelower(pelemfeature->tagname); hash_map_alone dataalone; for (tree<HTML::Node>::iterator it = tr.begin();it!=tr.end();it++) { //比较tagname std::string tagname = it->tagName(); makelower(tagname); if (tagname != pelemfeature->tagname) { continue; } //比较标签文本 bool bctxtextmatch = true; if( pelemfeature->sub_contenttext.size() > 0 ) { bctxtextmatch = stringcheck(it->mContentText,pelemfeature->sub_contenttext); } if ( false == bctxtextmatch ) { continue; } //比较属性 it->parseAttributes(); bool battributematch = true; std::pair<bool,std::string> attrpair = it->attribute(pelemfeature->strattributename); if( false == stringcheck(attrpair.second,pelemfeature->sub_attributevalue) ) { battributematch = false; } if ( false == battributematch) { continue; } if (pelemfeature->attributequery.size() != 0 && pelemfeature->attributequery != pelemfeature->strattributename) { attrpair = it->attribute(pelemfeature->attributequery); } DWORD dwchksum = CRC32((void *)(attrpair.second.c_str()),attrpair.second.size()); if(dataalone.find(dwchksum) == dataalone.end()) { dataalone[dwchksum]='0'; presult->push_back(attrpair.second); } else { int a=0; } } return true; }
void FetcherScholarshipPositions::fetch() { const std::string now = "spgmail" + currentDateTime(); std::size_t count = 0; std::ifstream file_input(m_FilenameHtmlScholarshipPositionsGmail.c_str()); std::ofstream file_output(m_FilenameInputScholarshipPositionsGmail.c_str()); if (!file_input.is_open()) { std::cerr << "Cannot open file \"" << m_FilenameHtmlScholarshipPositionsGmail << "\" for reading!" << std::endl; return; } if (!file_output.is_open()) { std::cerr << "Cannot open file \"" << m_FilenameInputScholarshipPositionsGmail << "\" for writing!" << std::endl; return; } std::string content_gmail((std::istreambuf_iterator<char>(file_input)), std::istreambuf_iterator<char>()); HTML::ParserDom parser; tree<HTML::Node> dom = parser.parseTree(content_gmail); tree<HTML::Node>::iterator beg = dom.begin(); tree<HTML::Node>::iterator end = dom.end(); std::string ct = ""; std::string link_title_str = ""; std::string deadline_str = ""; tree<HTML::Node>::iterator previous_bold_it = NULL; for (tree<HTML::Node>::iterator it = beg; it != end; ++it) { if (it->tagName() == "strong" || it->tagName() == "b") { ct = it->content(content_gmail); // 20150908: // The link to the scholarship can be in the text content of this node or the previous node with tag <strong>. 
// if (ct == "Provided by:") { link_title_str = it->content(content_gmail); if (link_title_str.find("href=\"") == std::string::npos) link_title_str = previous_bold_it->content(content_gmail); assert(link_title_str.find("href=\"") != std::string::npos); } if (ct.find("Application Deadline") != std::string::npos) { tree<HTML::Node>::iterator jt = it; // To deal with the following form: // <strong><span style="background: white">Application Deadline</span></strong> // <span style="background: white"> 22 September 2014</span><br /> ++jt; ++jt; ++jt; deadline_str = jt->content(content_gmail); // // To deal with the following form: // <strong>Application Deadline</strong> 31 October 2014<br /> // if (deadline_str == "") { for (jt = it; jt != end; ++jt) { if (jt->tagName() == "br") { jt--; deadline_str = jt->text(); break; } } } fetchOneScholarshipPosition(file_output, now, count, link_title_str, deadline_str); } previous_bold_it = it; } } DBGINFO("Fetched " << count << " scholarship items from ScholarshipPositions-Gmail!") }