Beispiel #1
0
/**
 * Returns the same-host links found in this page's HTML.
 *
 * Every <a> element that carries an href attribute is resolved against the
 * page URL; only targets whose host equals the page's host are kept.
 * Anchors without an href (e.g. named anchors) are skipped, because
 * resolving an empty reference would otherwise report the page itself as
 * one of its own links.
 *
 * @return a vector of resolved GURLs; empty when the page has no content
 */
vector < GURL > Page::getLinks()
{
    vector< GURL > answer;
    HTML::ParserDom parser;
    GURL curr( url );

    if (getContent() == "")
        return answer;

    tree<HTML::Node> dom = parser.parseTree(content);

    tree<HTML::Node>::iterator it = dom.begin();
    tree<HTML::Node>::iterator end = dom.end();
    for (; it != end; ++it)
    {
        if (it->tagName() != "a")
            continue;

        it->parseAttributes();
        const std::pair<bool, string> href = it->attribute("href");
        if (!href.first)
            continue;   // no href attribute: nothing to resolve

        GURL resolved = curr.Resolve(href.second);
        if (resolved.host() == curr.host() )
            answer.push_back( resolved );
    }

    return answer;
}
/**
 * Collects the hyperlinks of an HTML file as url_struct entries.
 *
 * Links starting with "http" are stored unchanged (flag 0, baseurl = the
 * link itself). Every other link that does not contain "mailto:" is
 * prefixed with the base of `myurl` plus "/" and stored with flag 1 and
 * myurl's baseurl. Empty links and "#" are ignored.
 *
 * @param filename path of the HTML file to read
 * @param myurl    URL record of the page the file was fetched from
 * @return the list of links in document order
 */
std::list<url_struct> getAllLinks(string filename,url_struct myurl)
{
  // Load and parse the document.
  string markup = get_file_contents(filename.c_str());
  HTML::ParserDom htmlParser;
  tree<HTML::Node> dom = htmlParser.parseTree(markup);

  // Prefix for links that are not absolute: baseurl when the flag is set,
  // otherwise the page URL itself.
  string prefix;
  if (myurl.flag)
    prefix = myurl.baseurl + "/";
  else
    prefix = myurl.url + "/";

  std::list<url_struct> collected;
  url_struct entry;
  tree<HTML::Node>::iterator last = dom.end();
  for (tree<HTML::Node>::iterator node = dom.begin(); node != last; ++node)
  {
    if (!(node->tagName() == "a"))
      continue;

    node->parseAttributes();
    string href = node->attribute("href").second;
    if (href.length() == 0 || href == "#")
      continue;

    if (href.find("http") == 0)
    {
      // Absolute link: it is its own base.
      entry.url = href;
      entry.baseurl = href;
      entry.flag = 0;
      collected.push_back(entry);
    }
    else if (href.find("mailto:") == std::string::npos)
    {
      // Relative link: hang it under the computed prefix.
      entry.url = prefix + href;
      entry.baseurl = myurl.baseurl;
      entry.flag = 1;
      collected.push_back(entry);
    }
  }
  return collected;
}
Beispiel #3
0
/**
 * Fetches the tracked page and builds a database from its media items.
 *
 * Every <div> node whose tag text contains class="media-body" is handed to
 * getOneDataItem(); the resulting item is added unless its URL is on the
 * ignore list.
 *
 * @param gratis forwarded unchanged to getOneDataItem()
 * @return a newly allocated database holding the accepted items
 */
DatabasePtr
Tracker::track(bool gratis)
{
    DatabasePtr result = DatabasePtr(new Database);

    std::string page = fetch();

    HTML::ParserDom dom_parser;
    tree<HTML::Node> dom = dom_parser.parseTree(page);

    tree<HTML::Node>::iterator node = dom.begin();
    while (node != dom.end())
    {
        const bool is_div = (node->tagName() == "div");
        if (is_div && node->text().find("class=\"media-body\"") != std::string::npos)
        {
            DataItemPtr item = getOneDataItem(page, node, gratis);
            if (!inIgnoreList(item->m_URL))
                result->add(item);
        }
        ++node;
    }

    return result;
}
/**
 * Returns the href value of every <a> element in the configured HTML.
 *
 * The HTML comes from m_source when set, otherwise from the file named by
 * m_filename. The first successful call parses the document, fills m_links
 * and m_numLinks and marks the extraction as done; later calls return the
 * cached m_links. Anchors without an href contribute an empty string, as
 * the attribute value is stored unconditionally.
 *
 * @return the extracted links; an empty list when no source is configured
 *         or the file could not be read
 */
std::list<std::string> LinkExtractor::links()
{
    if(m_linksExtracted)
        return m_links;

    char *htmlContents = NULL;
    unsigned long htmlLength = 0;

    if(m_source)
        htmlContents = m_source;
    else if(m_filename)
        file_get_contents(m_filename, &htmlContents, &htmlLength);
    else
        return std::list<std::string>();

    // A failed read leaves the buffer NULL; building a std::string from a
    // NULL pointer is undefined behaviour, so bail out instead.
    if(htmlContents == NULL)
        return std::list<std::string>();

    // NOTE(review): when the buffer comes from file_get_contents() it is
    // presumably heap-allocated and never released here -- confirm the
    // allocator used by that helper and free it accordingly.
    HTML::ParserDom parser;
    tree<HTML::Node> dom = parser.parseTree(htmlContents);
    tree<HTML::Node>::iterator it = dom.begin();
    tree<HTML::Node>::iterator end = dom.end();
    for(; it != end; ++it)
    {
        if(it->tagName() == "a")
        {
            it->parseAttributes();
            std::pair<bool, std::string> linkPair = it->attribute("href");
            m_links.push_back(linkPair.second);
            m_numLinks++;
        }
    }
    m_linksExtracted = true;

    // Return the cached list so the first call and every later call agree
    // (the previous local list started with 64 empty placeholder strings).
    return m_links;
}
Beispiel #5
0
/**
 * Prints the text of every non-tag, non-comment node of an HTML document.
 *
 * Each text is passed through util::Utf8ToUnicode and then
 * util::UnicodeToAnsi before being written to stdout as "tagvalue:<text>"
 * (presumably a UTF-8 -> wide -> ANSI conversion; confirm against util).
 *
 * @param html the HTML markup to parse
 */
void parser(string html){

	HTML::ParserDom domParser;
	tree<HTML::Node> dom = domParser.parseTree(html);

	tree<HTML::Node>::iterator last = dom.end();
	for (tree<HTML::Node>::iterator node = dom.begin(); node != last; ++node)
	{
		// Only plain text nodes are of interest.
		if (node->isTag() || node->isComment())
			continue;

		std::string utf8Text = node->text();
		std::wstring wideText = util::Utf8ToUnicode(utf8Text.c_str());
		std::string ansiText = util::UnicodeToAnsi(wideText.c_str());
		cout <<"tagvalue:"<<ansiText.c_str()<<endl;
	}
}
Beispiel #6
0
/* 
 * ===  FUNCTION  ======================================================================
 *         Name:  main
 *       Author:  bbxyard
 *      Created:  2014年08月08日 18时22分11秒
 *  Description:  
 * =====================================================================================
 */
int main (int argc, char *argv[])
{
  using namespace std;
  using namespace htmlcxx;
  
  //Parse some html code
  string html = "<html><body>hey<A href=\"www.bbxyard.com\">myhome</A></body></html>";
  HTML::ParserDom parser;
  tree<HTML::Node> dom = parser.parseTree(html);
  //Print whole DOM tree
  cout << dom << endl;
  
  //Dump all links in the tree
  tree<HTML::Node>::iterator it = dom.begin();
  tree<HTML::Node>::iterator end = dom.end();
  for (; it != end; ++it)
  {
     if (strcasecmp(it->tagName().c_str(), "A") == 0)
     {
       it->parseAttributes();
       cout << it->attribute("href").second << endl;
     }
  }
  
  //Dump all text of the document
  it = dom.begin();
  end = dom.end();
  for (; it != end; ++it)
  {
    if ((!it->isTag()) && (!it->isComment()))
    {
      cout << it->text() << " ";
    }
  }
  cout << endl;
  return 0;
}
Beispiel #7
0
/**
 * Parses an HTML document and fills this page's text and metadata members.
 *
 * Walks the DOM once and
 *  - appends url_ and then every text node (space separated) to text_,
 *  - ignores nodes whose parent tag is <script> or <noscript>,
 *  - stores the text of the node following a <title> tag in title_,
 *  - reads description_, keywords_ and contentType_ from <meta> tags,
 *  - records each <a> that is not rel="nofollow" in links_, keyed by the
 *    href converted with HTML::convert_link() against url_, with the
 *    collected anchor text as value.
 *
 * The function returns early, without finishing the current element, when
 * one of the manual iterator advances reaches the end of the tree.
 *
 * @param html the HTML markup to parse
 */
void Page::parseHTML(string html) {

    HTML::ParserDom parser;
    tree<HTML::Node> dom = parser.parseTree(html);
    tree<HTML::Node>::iterator it = dom.begin();
    // The page URL is made part of the searchable text.
    text_ += url_;
    for (; it != dom.end(); ++it) {
        if(it.node != 0 && dom.parent(it) != NULL){
            string parent_tag = dom.parent(it)->tagName();
            //Skip JavaScript code
            boost::to_lower(parent_tag);
            if(parent_tag == "script" || parent_tag == "noscript" ){
                it.skip_children();
                continue;
            }
        }
        //Parse plain text of the page
        if ((!it->isTag()) && (!it->isComment()) ) {
            text_ += " ";
            text_ += it->text();
        }
        else { //Parse metadata
            string tagName = it->tagName();
            boost::to_lower(tagName);
            if(tagName == "title"){
                // The title text is taken from the node right after the tag.
                it++;
                if(it == dom.end()) return;
                title_ = it->text();
            }
            else if(tagName == "meta"){
                it->parseAttributes();
                // <meta name="description|keywords" content="...">
                std::pair<bool, std::string> attrib = it->attribute("name");
                if(attrib.first == true){
                    boost::to_lower(attrib.second);
                    if(attrib.second == "description")
                        description_ = it->attribute("content").second;
                    if(attrib.second == "keywords")
                        keywords_ = it->attribute("content").second;
                }
                // <meta http-equiv="content-type" content="...">
                attrib = it->attribute("http-equiv");
                boost::to_lower(attrib.second);
                if(attrib.first == true && attrib.second == "content-type"){
                    contentType_ = it->attribute("content").second;
                }
            }
            else if(tagName == "a"){
                it->parseAttributes();
                std::pair<bool, std::string> attrib = it->attribute("rel");
                boost::to_lower(attrib.second);
                if(attrib.first == true && attrib.second == "nofollow"){
                    // nofollow links are deliberately not recorded.

                }else{
                    attrib = it->attribute("href");
                    string anchor_text;
                    // Advance once per direct child, gathering text nodes.
                    // NOTE(review): the iterator presumably walks all
                    // descendants in pre-order, so nested markup inside the
                    // anchor may end this loop before all of its text has
                    // been seen -- TODO confirm.
                    int children = it.number_of_children();
                    for(int i=0; i<children; i++){
                        it++;
                        if(it == dom.end()) return;
                        if(!it->isTag()) anchor_text += it->text();
                    }
                    links_[HTML::convert_link(attrib.second, url_)] = anchor_text;
                    text_ += " ";
                    text_ += anchor_text;
                }
            }
        }
    }
}
Beispiel #8
0
int main(void)
{
  CURL *curl;
  CURLcode res;
  std::string readBuffer;

  curl = curl_easy_init();
  if(curl) {
    //curl_easy_setopt(curl, CURLOPT_URL, "http://curl.haxx.se/libcurl/");
	curl_easy_setopt(curl, CURLOPT_URL, "http://www.firstpost.com");
    curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteCallback);
    curl_easy_setopt(curl, CURLOPT_WRITEDATA, &readBuffer);
    res = curl_easy_perform(curl);
    curl_easy_cleanup(curl);

    //cout << readBuffer << endl;
    HTML::ParserDom parser;
    tree<HTML::Node> dom = parser.parseTree(readBuffer);

    //Print whole DOM tree
    //cout << dom << endl;

    tree<HTML::Node>::iterator it = dom.begin();
    tree<HTML::Node>::iterator end = dom.end();

    string tagarr[] = {"script", "style"};
    //print all text
    for(; it != end; ++it)
    {
        //if(it->isTag() && (strcasecmp(it->tagName().c_str(), "script") == 0))
        int tagarr_count = sizeof(tagarr)/sizeof(string);
        string* pstr = std::find(tagarr, tagarr+tagarr_count, it->tagName());
        if(pstr != tagarr+tagarr_count)
        {
            if((++it)->isTag())
                --it;
            continue;
        }
    	if((!it->isTag()) && (!it->isComment()))
    	{
            if(it->text().find_first_not_of(" \t\n\v\f\r") == std::string::npos)
            {
                continue;
            }
    		cout << "||" << it->text() << "||" << endl;
    		cout << "--------------------------" << endl;
    	}
    }

    cout << "print all links" << endl;

    const boost::regex brexpress("https?:\/\/[^\s/$.?#].[^\s]*");
    it = dom.begin();
    end = dom.end();
    for (; it != end; ++it)
      {
      	if (it->tagName() == "a")
      	{
      		it->parseAttributes();
      		if(boost::regex_match(it->attribute("href").second, brexpress))
      			cout << it->attribute("href").second << endl;
      	}
      }

    //std::cout << readBuffer << std::endl;
  }
  return 0;
}
Beispiel #9
0
int main() {
  vector<string> tabStrings;
  tabStrings.push_back( "pricings" );
  tabStrings.push_back( "filings" );

  vector<string> monthStrings;
  for ( int i=1; i<=12; i++ ) {
    stringstream out;
    out << i;
    if ( i<10 ) { 
      monthStrings.push_back("2011-0"+out.str());
    }
    else {
      monthStrings.push_back("2011-"+out.str());
    }
  }
  monthStrings.push_back("2012-01");
  monthStrings.push_back("2012-02");

  fstream outFile( "/tmp/NasdaqSPOs.csv", fstream::out );

  for ( vector<string>::iterator iter=tabStrings.begin(); iter!=tabStrings.end(); iter++ ) {
      for ( vector<string>::iterator monthIter=monthStrings.begin(); monthIter!=monthStrings.end(); monthIter++ ) {
	cout << "Scraping " << *iter << " for month = " << *monthIter << endl;
        try {
          curlpp::Easy myRequest;
          myRequest.setOpt(curlpp::options::Url((std::string( "http://www.nasdaq.com/markets/spos/activity.aspx?tab="+*iter+"&month="+*monthIter))));
          ostringstream os;
          os << myRequest;
          string content = os.str();
          
          HTML::ParserDom parser;

          if ( *iter == "pricings" ) {
            string type = "PRICING";

	    int startIdx = content.find("<div class=\"genTable\">");          
            int endIdx = content.find("<!-- end tabpane");
                                  
            if ( startIdx > 0 && endIdx > startIdx ) {
              string htmlContent = content.substr(startIdx,(endIdx-startIdx));               
              tree<HTML::Node> dom = parser.parseTree( htmlContent );

	      for ( tree<HTML::Node>::iterator treeIter=dom.begin(); treeIter!=dom.end(); treeIter++ ) {
		if ( treeIter->tagName() == "tr" ) {
		  string trHtml = treeIter->content( htmlContent );
                  tree<HTML::Node> trDom = parser.parseTree( trHtml );
                  int tdCount = 0;
                  string name = "";
                  string nasdaqUrl = "";
                  string ticker = "";
                  string market = "";
                  string price = "";
                  string shares = "";
                  string offerAmount = "";
                  string datePriced = "";

                  for ( tree<HTML::Node>::iterator trIter=trDom.begin(); trIter!=trDom.end(); trIter++ ) {
                    if ( trIter->tagName() == "td" ) {
                      if ( tdCount == 0 ) {
			string tdHtml = trIter->content( trHtml );
                        int startIndex = tdHtml.find("\">");
                        int endIndex = tdHtml.find("</a>");
                        name = tdHtml.substr(startIndex+2,(endIndex-startIndex-2));
                        startIndex = tdHtml.find("href=\"");
                        endIndex = tdHtml.find("\">");
                        nasdaqUrl = tdHtml.substr(startIndex+6,(endIndex-startIndex-6));
                      }
                      else if ( tdCount == 1 ) {
                        string tdHtml = trIter->content( trHtml );
                        int startIndex = tdHtml.find("\">");
                        int endIndex = tdHtml.find("</a>");
                        ticker = tdHtml.substr(startIndex+2,(endIndex-startIndex-2));
                      }
                      else if ( tdCount == 2 ) {
                        string tdHtml = trIter->content( trHtml );
                        int startIndex = tdHtml.find("\">");
                        int endIndex = tdHtml.find("</a>");
                        market = tdHtml.substr(startIndex+2,(endIndex-startIndex-2));
                      }
                      else if ( tdCount == 3 ) {
                        string tdHtml = trIter->content( trHtml );
                        price = tdHtml;
                      }
                      else if ( tdCount == 4 ) {
                        string tdHtml = trIter->content( trHtml );
                        shares = tdHtml;
                      }
                      else if ( tdCount == 5 ) {
                        string tdHtml = trIter->content( trHtml );
                        offerAmount = tdHtml;
                      }
                      else if ( tdCount == 6 ) {
                        string tdHtml = trIter->content( trHtml );
                        datePriced = tdHtml;
                      }
                      tdCount++;
                    }
                  }
                  if ( name != "" ) {
                     outFile << "\"" + type + "\",\"" + name + "\",\"" + ticker + "\",\"" + market + "\",\"" + price + "\",\"" + shares + "\",\"" + offerAmount + "\",\"" + datePriced + "\",\"" + nasdaqUrl + "\"" << endl;
                  }                   
                }            
              }
            } 
          }
          else if ( *iter == "filings" ) {
	    string type = "FILING";
            int startIdx = content.find("<div class=\"genTable\">");        
            int endIdx = content.find("<!-- end tabpane");
            
            if ( startIdx > 0 && endIdx > startIdx ) {
              string htmlContent = content.substr(startIdx,(endIdx-startIdx));               
              tree<HTML::Node> dom = parser.parseTree( htmlContent );
             
              for ( tree<HTML::Node>::iterator treeIter=dom.begin(); treeIter!=dom.end(); treeIter++ ) {
		if ( treeIter->tagName() == "tr" ) {
		  string trHtml = treeIter->content( htmlContent );
                  tree<HTML::Node> trDom = parser.parseTree( trHtml );
                  int tdCount = 0;
                  string name = "";
                  string nasdaqUrl = "";
                  string ticker = "";
                  string offerAmount = "";
                  string dateFiled = "";
                  string market = "";
                  string price = "";
                  string shares = "";
          
                  for ( tree<HTML::Node>::iterator trIter=trDom.begin(); trIter!=trDom.end(); trIter++ ) {
                    if ( trIter->tagName() == "td" ) {
                      if ( tdCount == 0 ) {
			string tdHtml = trIter->content( trHtml );
                        int startIndex = tdHtml.find("\">");
                        int endIndex = tdHtml.find("</a>");
                        name = tdHtml.substr(startIndex+2,(endIndex-startIndex-2));
                        startIndex = tdHtml.find("href=\"");
                        endIndex = tdHtml.find("\">");
                        nasdaqUrl = tdHtml.substr(startIndex+6,(endIndex-startIndex-6));
                      }
                      else if ( tdCount == 1 ) {
                        string tdHtml = trIter->content( trHtml );
                        ticker = tdHtml;
                        try {
                          if ( ticker.find("</a>") != string::npos ) {
                            int startIndex = ticker.find("\">");
                            int endIndex = ticker.find("</a>");
                            ticker = ticker.substr(startIndex+2,(endIndex-startIndex-2));
                          }
                        } catch ( std::out_of_range &e ) {}
                      }
                      else if ( tdCount == 2 ) {
                        string tdHtml = trIter->content( trHtml );
                        offerAmount = tdHtml;
                      }
                      else if ( tdCount == 3 ) {
                        string tdHtml = trIter->content( trHtml );
                        dateFiled = tdHtml;
                      }
                      tdCount++;
                    }
                  }
                  if ( name != "" ) {
                     outFile << "\"" + type + "\",\"" + name + "\",\"" + ticker + "\",\"" + market + "\",\"" + price + "\",\"" + shares + "\",\"" + offerAmount + "\",\"" + dateFiled + "\",\"" + nasdaqUrl + "\"" << endl;
                  }                   
                }            
              }
            }             
             
          }
        } catch( curlpp::RuntimeError &e ) { std::cout << e.what() << std::endl; }
        catch( curlpp::LogicError &e ) { std::cout << e.what() << std::endl; }
        catch( std::out_of_range &e ) { cout << e.what() << endl; }
        usleep( 125000 );        
      }
      } 
  outFile.close();
  return 0;
}
Beispiel #10
0
bool ParseHtml(const char *phtml,pelem_feature pelemfeature,plist_result presult)
{
	if(!phtml || NULL==pelemfeature || pelemfeature->tagname.size() == 0 || NULL==presult) return false;

	tree<HTML::Node> tr;
	std::string html(phtml);
	HTML::ParserDom parser;
	parser.parse(html);
	tr = parser.getTree();

	makelower(pelemfeature->tagname);

	hash_map_alone dataalone;

	for (tree<HTML::Node>::iterator it = tr.begin();it!=tr.end();it++)
	{
		//比较tagname
		std::string tagname = it->tagName();
		makelower(tagname);
		if (tagname != pelemfeature->tagname)
		{
			continue;
		}

		//比较标签文本
		bool bctxtextmatch = true;
		if( pelemfeature->sub_contenttext.size() > 0  )
		{
			bctxtextmatch = stringcheck(it->mContentText,pelemfeature->sub_contenttext);
		}

		if ( false == bctxtextmatch )
		{
			continue;
		}
		
		//比较属性

		it->parseAttributes();

		bool battributematch = true;

		std::pair<bool,std::string> attrpair = it->attribute(pelemfeature->strattributename);

		if( false == stringcheck(attrpair.second,pelemfeature->sub_attributevalue) )
		{
			battributematch = false;
		}

		if ( false == battributematch)
		{
			continue;
		}

		if (pelemfeature->attributequery.size() != 0 && pelemfeature->attributequery != pelemfeature->strattributename)
		{
			attrpair = it->attribute(pelemfeature->attributequery);
		}

		DWORD dwchksum = CRC32((void *)(attrpair.second.c_str()),attrpair.second.size());
		if(dataalone.find(dwchksum) == dataalone.end())
		{
			dataalone[dwchksum]='0';
			presult->push_back(attrpair.second);
		}
		else
		{
			int a=0;
		}
		
	}

	return true;
}
/**
 * Extracts scholarship items from the saved ScholarshipPositions Gmail HTML.
 *
 * Reads the HTML file named by m_FilenameHtmlScholarshipPositionsGmail and
 * scans its <strong>/<b> nodes. A node whose content is "Provided by:"
 * selects the link/title markup (taken from the previous bold node when the
 * node itself carries no href). A node containing "Application Deadline"
 * yields the deadline text and triggers fetchOneScholarshipPosition(),
 * which receives the output stream opened on
 * m_FilenameInputScholarshipPositionsGmail.
 *
 * Returns early with a message on stderr when either file cannot be opened.
 */
void
FetcherScholarshipPositions::fetch()
{
    const std::string now = "spgmail" + currentDateTime();
    std::size_t count = 0;
    
    std::ifstream file_input(m_FilenameHtmlScholarshipPositionsGmail.c_str());
    std::ofstream file_output(m_FilenameInputScholarshipPositionsGmail.c_str());
    
    if (!file_input.is_open())
    {
        std::cerr << "Cannot open file \"" << m_FilenameHtmlScholarshipPositionsGmail << "\" for reading!" << std::endl;
        return;
    }
    
    if (!file_output.is_open())
    {
        std::cerr << "Cannot open file \"" << m_FilenameInputScholarshipPositionsGmail << "\" for writing!" << std::endl;
        return;
    }
    
    std::string content_gmail((std::istreambuf_iterator<char>(file_input)), std::istreambuf_iterator<char>());
    
    HTML::ParserDom parser;
    tree<HTML::Node> dom = parser.parseTree(content_gmail);
    
    tree<HTML::Node>::iterator beg = dom.begin();
    tree<HTML::Node>::iterator end = dom.end();
    
    std::string ct = "";
    std::string link_title_str = "";
    std::string deadline_str = "";
    
    // The previous bold node is only valid once one has been seen; the flag
    // prevents dereferencing the initial null iterator.
    tree<HTML::Node>::iterator previous_bold_it = NULL;
    bool has_previous_bold = false;
    
    for (tree<HTML::Node>::iterator it = beg; it != end; ++it)
    {
        if (it->tagName() == "strong" || it->tagName() == "b")
        {
            ct = it->content(content_gmail);
            
            // 20150908:
            // The link to the scholarship can be in the text content of this node or the previous node with tag <strong>.
            //
            if (ct == "Provided by:")
            {
                link_title_str = it->content(content_gmail);
                if (link_title_str.find("href=\"") == std::string::npos && has_previous_bold)
                    link_title_str = previous_bold_it->content(content_gmail);
                assert(link_title_str.find("href=\"") != std::string::npos);
            }
            if (ct.find("Application Deadline") != std::string::npos)
            {
                tree<HTML::Node>::iterator jt = it;
                
                // To deal with the following form:
                // <strong><span style="background: white">Application Deadline</span></strong>
                // <span style="background: white">&nbsp;22 September 2014</span><br />
                //
                // The deadline is expected three nodes further on; stop at
                // the end of the tree instead of stepping past it.
                for (int step = 0; step < 3 && jt != end; ++step)
                {
                    ++jt;
                }
                deadline_str = (jt != end) ? jt->content(content_gmail) : std::string();
                
                //
                // To deal with the following form:
                // <strong>Application Deadline</strong> 31 October 2014<br />
                //
                if (deadline_str == "")
                {
                    for (jt = it; jt != end; ++jt)
                    {
                        if (jt->tagName() == "br")
                        {
                            // The deadline is the node just before the <br>.
                            jt--;
                            deadline_str = jt->text();
                            break;
                        }
                    }
                }
                
                fetchOneScholarshipPosition(file_output, now, count,
                                            link_title_str, deadline_str);
            }
            previous_bold_it = it;
            has_previous_bold = true;
        }
    }
    
    DBGINFO("Fetched " << count << " scholarship items from ScholarshipPositions-Gmail!")
}