/*
 * Tears the uploader down.
 *
 * Does nothing unless nProcStatus is 1. Otherwise it sets nProcStatus to 2,
 * stops the manager via StopMgr() and releases the Qiniu SDK's global state.
 * NOTE(review): 1 presumably means "initialised" and 2 "shut down" - confirm
 * against the code that sets nProcStatus.
 */
void UninitUploader()
{
        if (nProcStatus == 1)
        {
                // Flip the status before stopping anything, as the original order did.
                nProcStatus = 2;
                StopMgr();
                Qiniu_Global_Cleanup();
        }
}
Exemple #2
0
/* Thin wrapper: forwards to Qiniu_Global_Cleanup() and does nothing else. */
void Qiniu_Servend_Cleanup()
{
	Qiniu_Global_Cleanup();
}
Exemple #3
0
int main(int argc, char* argv[])
{
	QINIU_ACCESS_KEY = "sn7d6X2kmRQKkNyO0_ZY_Hz2utVrXIeEmc8QutVC";
	QINIU_SECRET_KEY = "jeQSWafTp7kczgR4qVa-erKOaHk0_qcvMNacxO8E";
	curl_global_init(CURL_GLOBAL_ALL);
	Qiniu_Global_Init(-1);                  /* 全局初始化函数,整个进程只需要调用一次 */
	if(strcmp(argv[1],"imgtest") == 0 && argc == 4)
	{
		string oldurl,newurl,articleurl;
		articleurl = argv[2];
		oldurl     = argv[3];
		newurl =  _expandlinks(articleurl,oldurl);
		printf("newurl=%s\n",oldurl.c_str());
	}
	if(strcmp(argv[1],"-start") == 0 && argc == 4)
	{
		int ntime = atoi(argv[2]);
		int nprocesstimes = 1;
		ArticleManage m_article;
		theLog.SetLogFilePath(GetFullPath());
		if(!ReadConfigFile((GetFullPath()+"/sqlconfig.conf").c_str(),p_config))
		{
			return 0;
		}
		if(!ReadSpiderRulerConfigFile((GetFullPath()+"/spiderruler.conf").c_str(),p_spiderruler))
		{
			return 0;
		}
		while(1)
		{
			
			m_article.start(atoi(argv[3]));
			theLog.WriteLog(LOG_LEVEL_SYS,"the proc ArticleExtract %d times completed!",nprocesstimes);
			nprocesstimes++;
			sleep(ntime * 60);
		}
	}
	
	if(argc == 3 && strcmp(argv[1],"-start") != 0 )
	{
		if(!ReadSpiderRulerConfigFile((GetFullPath()+"/spiderruler.conf").c_str(),p_spiderruler))
		{
			return 0;
		}
		string page;
		list<string> ImgStrList;
		page.clear();
		const char* url = argv[1];
		string sourcepagename = argv[2];
		//const char* url = "http://admin.wechat.com/mp/appmsg/show?__biz=MjM5MTIwODcxNA==&appmsgid=10001872&itemidx=1&sign=d5997fecd12a3af79f8c8d65600f82a1";
		printf("url=%s\n",url);
		string urlstr = url;
		if(0 == sourcepagename.compare("zatu")) 
		{
			string strtmp = "";
			iconv_string("utf-8","gbk", urlstr.c_str(), urlstr.length(),strtmp,1);
			urlstr = strtmp;
			printf("urlstr=%s\n",urlstr.c_str());
		}
		
		int method = 0;
		if(!getPage(urlstr.c_str(), method,page))
		{
			printf("不能获取URL内容\n");
			return 0;
		}
		string content;
		string Introduction;
		string publishtime;
		string titlestr;
		string contentimg;
		page = mainpagetagclean(page);
		if(page.length() < 2048)
		{
			printf("the page source length too short ! \n");
			return 0;
		}
		printf("page length=%d\n",page.length());
		
		HtmlExtract sorceExtract(page,p_spiderruler[sourcepagename]);
		sorceExtract.Extract();
		titlestr	= sorceExtract.GetTitle();
		boost::regex title_reg("((?i)(&nbsp;))");
		titlestr = boost::regex_replace(titlestr,title_reg,"");
		printf("title=%s\n",titlestr.c_str());
		content			= sorceExtract.ArticleContent;
		publishtime		= sorceExtract.GetDateTime();
		Introduction	= sorceExtract.Introduction;
		strltrim(publishtime);
		publishtime		= publish_time_deal(publishtime);
		if(0 == sourcepagename.compare("geekpark"))  //极客公园特殊处理
		{
			//boost::regex reg14("((?i)(<div\\s{1,4}class=\"yarpp-related\".*?</div>))");
			boost::regex reg14("((?i)(<div\\s{1,4}(id=\"tags\").*?</div>))");
			content = boost::regex_replace(content,reg14,"");
		}
		if(0 == sourcepagename.compare("cuntuba"))  //苹果网特殊处理
		{
			//boost::regex reg14("((?i)(<div\\s{1,4}class=\"yarpp-related\".*?</div>))");
			boost::regex reg14("((?i)(<div\\s{1,4}(?!class=\"cont\").*?</div>))");
			content = boost::regex_replace(content,reg14,"");
		}
		if(0 == sourcepagename.compare("macx"))  //苹果网特殊处理
		{
			//boost::regex reg14("((?i)(<div\\s{1,4}class=\"yarpp-related\".*?</div>))");
			boost::regex reg14("((?i)(<div\\s{1,4}(?!class=\"v2-t_fsz\").*?</div>))");
			content = boost::regex_replace(content,reg14,"");
			reg14.assign("(?i)(<a[^>]*>.*?</a>)");
			content = boost::regex_replace(content,reg14,"");
		}
		if(0 == sourcepagename.compare("leiphone"))  //雷锋网尾部特殊处理
		{
			//boost::regex reg14("((?i)(<div\\s{1,4}class=\"yarpp-related\".*?</div>))");
			boost::regex reg14("((?i)(<div\\s{1,4}(?!class=\"post_content\").*?</div>))");
			content = boost::regex_replace(content,reg14,"");
			reg14.assign("(?i)(<div>.*?</div>)");
			content = boost::regex_replace(content,reg14,"");
		}
		if(0 == sourcepagename.compare("tech163"))  //网易科技尾部特殊处理
		{
			//boost::regex reg14("((?i)(<div\\s{1,4}class=\"yarpp-related\".*?</div>))");
			boost::regex reg14("((?i)(<div\\s{1,4}(?!id=\"endtext\").*?</div>))");
			content = boost::regex_replace(content,reg14,"");
		}
		if(0 == sourcepagename.compare("pingwest"))  //pingwest尾部特殊处理
		{
			//boost::regex reg14("((?i)(<div\\s{1,4}class=\"yarpp-related\".*?</div>))");
			boost::regex reg15("((?i)(<div.*?>[^<]+</div>))");
			content = boost::regex_replace(content,reg15,"");
		}
		if(0 == sourcepagename.compare("zatu"))  //杂图天下尾部特殊处理
		{
			//boost::regex reg14("((?i)(<div\\s{1,4}class=\"yarpp-related\".*?</div>))");
			boost::regex reg14("((?i)(<div\\s{1,4}(?!class=\"format_text entry-content\").*?</div>))");
			content = boost::regex_replace(content,reg14,"");
		}
		if(0 == sourcepagename.compare("jandan"))
		{
			boost::regex reg11("((?i)(<a[^>]*>.*?</a>))");
			content = boost::regex_replace(content,reg11,"");
			boost::regex reg12("((?i)(<span[^>]*>.*?</span>))");
			content = boost::regex_replace(content,reg12,"");
		}
		if(0 == sourcepagename.compare("guaixun"))
		{
			boost::regex reg13("((?i)(<div style=\"position:absolute.*?</div>))");
			content = boost::regex_replace(content,reg13,"");
		}
		
		content = maincontenttagclean(content);	
		strltrim(content);
		boost::smatch m;
		boost::regex reg8;
		if(sourcepagename == "sinablogit")
		{
			reg8.assign("(?i)( src\\s{0,2}=\\s{0,2}\"([^\"]*)\")");
			content = boost::regex_replace(content,reg8,"");
			reg8.assign("(?i)( real_src =)");
			content = boost::regex_replace(content,reg8," src=");
		}
		else if(sourcepagename == "aqee")
		{
			reg8.assign("(?i)( src\\s{0,2}=\\s{0,2}\"([^\"]*)\")");
			content = boost::regex_replace(content,reg8,"");
			reg8.assign("(?i)( data-original=)");
			content = boost::regex_replace(content,reg8," src=");
		}
		else if(sourcepagename == "macx")
		{
			reg8.assign("(?i)( src\\s{0,2}=\\s{0,2}\"([^\"]*)\")");
			content = boost::regex_replace(content,reg8,"");
			reg8.assign("(?i)( zoomfile=)");
			content = boost::regex_replace(content,reg8," src=");
		}
		if(0 == sourcepagename.compare("cuntuba"))  //寸土吧特殊处理
		{
			reg8.assign("(?i)(\')");
			content = boost::regex_replace(content,reg8,"\"");
		}
		reg8.assign("(?i)( src\\s{0,2}=\\s{0,2}\"([^\"]*)\")");
		
		contentimg = content;
		boost::regex reg("((?i)<((?!img)[^>]*>))");
		contentimg = boost::regex_replace(contentimg,reg,"");

		//boost::regex reg8("(?i)(src=\"([^\"]*)\")");
		std::string::const_iterator start = contentimg.begin();
		std::string::const_iterator end = contentimg.end();
		try
		{
			while(boost::regex_search(start,end,m,reg8))
			{
				if (m[0].matched)
				{
					string tempurl(m[0].first,m[0].second);
					string regurl;
					tempurl = tempurl.substr(tempurl.find_first_of('\"',0)+1,tempurl.find_last_of('\"')-tempurl.find_first_of('\"',0)-1);
					//if( 0 == sourcepagename.compare("zatu") )
				//	{
						regurl =  _expandlinks(urlstr,tempurl);
			//		}
					if(0 != tempurl.length())
					{
						boost::regex reg(tempurl);
						content = boost::regex_replace(content,reg,regurl);
						ImgStrList.push_back(regurl);
					}
					start = m[0].second;
				}
			} 
		}
		catch (const boost::bad_expression& e)
		{
			theLog.WriteLog(LOG_LEVEL_ERROR,"cann't create regex with %s!",urlstr.c_str());
		}
		list<string>::iterator it;
		for( it = ImgStrList.begin(); it != ImgStrList.end(); it++)
		{
			printf("%s\n",(*it).c_str());
		}
		if( 0 == Introduction.length() )
		{
			boost::smatch m1;
			boost::regex reg("(?i)(<p>.*?</p>)");
			std::string::const_iterator start = content.begin();
			std::string::const_iterator end = content.end();
			while(boost::regex_search(start,end,m1,reg))
			{
				if (m1[0].matched)
				{
					Introduction = m1[0].str();
				}
				break;
			}
				
			int pos;
			if( 0 == Introduction.length() )
			{
				if(-1 != (pos = content.find_first_of("\x0d\x0a",0)))
				{
					Introduction = content.substr(0,pos);
				}
			}
		}
		Introduction = Introductioncontenttagclean(Introduction);
		strltrim(Introduction);
		if( 0 == p_spiderruler[sourcepagename].summaryisinbody.compare("yes") )
		{
			content = Introduction + content;
		}
		printf("publishtime=%s\n",publishtime.c_str());
		printf("Introduction=%s\n",Introduction.c_str());
		printf("content=%s\n",content.c_str());
	}
	Qiniu_Global_Cleanup();                 /* 全局清理函数,只需要在进程退出时调用一次 */
	return 0;


}