// Tears the uploader down: when nProcStatus is 1 it is switched to 2, the
// manager is stopped via StopMgr() and the Qiniu SDK is cleaned up.
// For any other nProcStatus value the call does nothing.
// NOTE(review): presumably 1 means "running" and 2 "stopping/stopped" — confirm
// against where nProcStatus is declared.
void UninitUploader()
{
    if (nProcStatus == 1) {
        nProcStatus = 2;  // updated before the teardown calls, as in the original order
        StopMgr();
        Qiniu_Global_Cleanup();
    }
}
// Forwards directly to Qiniu_Global_Cleanup(); no additional work is done here.
void Qiniu_Servend_Cleanup()
{
    Qiniu_Global_Cleanup();
}
int main(int argc, char* argv[]) { QINIU_ACCESS_KEY = "sn7d6X2kmRQKkNyO0_ZY_Hz2utVrXIeEmc8QutVC"; QINIU_SECRET_KEY = "jeQSWafTp7kczgR4qVa-erKOaHk0_qcvMNacxO8E"; curl_global_init(CURL_GLOBAL_ALL); Qiniu_Global_Init(-1); /* 全局初始化函数,整个进程只需要调用一次 */ if(strcmp(argv[1],"imgtest") == 0 && argc == 4) { string oldurl,newurl,articleurl; articleurl = argv[2]; oldurl = argv[3]; newurl = _expandlinks(articleurl,oldurl); printf("newurl=%s\n",oldurl.c_str()); } if(strcmp(argv[1],"-start") == 0 && argc == 4) { int ntime = atoi(argv[2]); int nprocesstimes = 1; ArticleManage m_article; theLog.SetLogFilePath(GetFullPath()); if(!ReadConfigFile((GetFullPath()+"/sqlconfig.conf").c_str(),p_config)) { return 0; } if(!ReadSpiderRulerConfigFile((GetFullPath()+"/spiderruler.conf").c_str(),p_spiderruler)) { return 0; } while(1) { m_article.start(atoi(argv[3])); theLog.WriteLog(LOG_LEVEL_SYS,"the proc ArticleExtract %d times completed!",nprocesstimes); nprocesstimes++; sleep(ntime * 60); } } if(argc == 3 && strcmp(argv[1],"-start") != 0 ) { if(!ReadSpiderRulerConfigFile((GetFullPath()+"/spiderruler.conf").c_str(),p_spiderruler)) { return 0; } string page; list<string> ImgStrList; page.clear(); const char* url = argv[1]; string sourcepagename = argv[2]; //const char* url = "http://admin.wechat.com/mp/appmsg/show?__biz=MjM5MTIwODcxNA==&appmsgid=10001872&itemidx=1&sign=d5997fecd12a3af79f8c8d65600f82a1"; printf("url=%s\n",url); string urlstr = url; if(0 == sourcepagename.compare("zatu")) { string strtmp = ""; iconv_string("utf-8","gbk", urlstr.c_str(), urlstr.length(),strtmp,1); urlstr = strtmp; printf("urlstr=%s\n",urlstr.c_str()); } int method = 0; if(!getPage(urlstr.c_str(), method,page)) { printf("不能获取URL内容\n"); return 0; } string content; string Introduction; string publishtime; string titlestr; string contentimg; page = mainpagetagclean(page); if(page.length() < 2048) { printf("the page source length too short ! 
\n"); return 0; } printf("page length=%d\n",page.length()); HtmlExtract sorceExtract(page,p_spiderruler[sourcepagename]); sorceExtract.Extract(); titlestr = sorceExtract.GetTitle(); boost::regex title_reg("((?i)( ))"); titlestr = boost::regex_replace(titlestr,title_reg,""); printf("title=%s\n",titlestr.c_str()); content = sorceExtract.ArticleContent; publishtime = sorceExtract.GetDateTime(); Introduction = sorceExtract.Introduction; strltrim(publishtime); publishtime = publish_time_deal(publishtime); if(0 == sourcepagename.compare("geekpark")) //极客公园特殊处理 { //boost::regex reg14("((?i)(<div\\s{1,4}class=\"yarpp-related\".*?</div>))"); boost::regex reg14("((?i)(<div\\s{1,4}(id=\"tags\").*?</div>))"); content = boost::regex_replace(content,reg14,""); } if(0 == sourcepagename.compare("cuntuba")) //苹果网特殊处理 { //boost::regex reg14("((?i)(<div\\s{1,4}class=\"yarpp-related\".*?</div>))"); boost::regex reg14("((?i)(<div\\s{1,4}(?!class=\"cont\").*?</div>))"); content = boost::regex_replace(content,reg14,""); } if(0 == sourcepagename.compare("macx")) //苹果网特殊处理 { //boost::regex reg14("((?i)(<div\\s{1,4}class=\"yarpp-related\".*?</div>))"); boost::regex reg14("((?i)(<div\\s{1,4}(?!class=\"v2-t_fsz\").*?</div>))"); content = boost::regex_replace(content,reg14,""); reg14.assign("(?i)(<a[^>]*>.*?</a>)"); content = boost::regex_replace(content,reg14,""); } if(0 == sourcepagename.compare("leiphone")) //雷锋网尾部特殊处理 { //boost::regex reg14("((?i)(<div\\s{1,4}class=\"yarpp-related\".*?</div>))"); boost::regex reg14("((?i)(<div\\s{1,4}(?!class=\"post_content\").*?</div>))"); content = boost::regex_replace(content,reg14,""); reg14.assign("(?i)(<div>.*?</div>)"); content = boost::regex_replace(content,reg14,""); } if(0 == sourcepagename.compare("tech163")) //网易科技尾部特殊处理 { //boost::regex reg14("((?i)(<div\\s{1,4}class=\"yarpp-related\".*?</div>))"); boost::regex reg14("((?i)(<div\\s{1,4}(?!id=\"endtext\").*?</div>))"); content = boost::regex_replace(content,reg14,""); } if(0 == 
sourcepagename.compare("pingwest")) //pingwest尾部特殊处理 { //boost::regex reg14("((?i)(<div\\s{1,4}class=\"yarpp-related\".*?</div>))"); boost::regex reg15("((?i)(<div.*?>[^<]+</div>))"); content = boost::regex_replace(content,reg15,""); } if(0 == sourcepagename.compare("zatu")) //杂图天下尾部特殊处理 { //boost::regex reg14("((?i)(<div\\s{1,4}class=\"yarpp-related\".*?</div>))"); boost::regex reg14("((?i)(<div\\s{1,4}(?!class=\"format_text entry-content\").*?</div>))"); content = boost::regex_replace(content,reg14,""); } if(0 == sourcepagename.compare("jandan")) { boost::regex reg11("((?i)(<a[^>]*>.*?</a>))"); content = boost::regex_replace(content,reg11,""); boost::regex reg12("((?i)(<span[^>]*>.*?</span>))"); content = boost::regex_replace(content,reg12,""); } if(0 == sourcepagename.compare("guaixun")) { boost::regex reg13("((?i)(<div style=\"position:absolute.*?</div>))"); content = boost::regex_replace(content,reg13,""); } content = maincontenttagclean(content); strltrim(content); boost::smatch m; boost::regex reg8; if(sourcepagename == "sinablogit") { reg8.assign("(?i)( src\\s{0,2}=\\s{0,2}\"([^\"]*)\")"); content = boost::regex_replace(content,reg8,""); reg8.assign("(?i)( real_src =)"); content = boost::regex_replace(content,reg8," src="); } else if(sourcepagename == "aqee") { reg8.assign("(?i)( src\\s{0,2}=\\s{0,2}\"([^\"]*)\")"); content = boost::regex_replace(content,reg8,""); reg8.assign("(?i)( data-original=)"); content = boost::regex_replace(content,reg8," src="); } else if(sourcepagename == "macx") { reg8.assign("(?i)( src\\s{0,2}=\\s{0,2}\"([^\"]*)\")"); content = boost::regex_replace(content,reg8,""); reg8.assign("(?i)( zoomfile=)"); content = boost::regex_replace(content,reg8," src="); } if(0 == sourcepagename.compare("cuntuba")) //寸土吧特殊处理 { reg8.assign("(?i)(\')"); content = boost::regex_replace(content,reg8,"\""); } reg8.assign("(?i)( src\\s{0,2}=\\s{0,2}\"([^\"]*)\")"); contentimg = content; boost::regex reg("((?i)<((?!img)[^>]*>))"); contentimg = 
boost::regex_replace(contentimg,reg,""); //boost::regex reg8("(?i)(src=\"([^\"]*)\")"); std::string::const_iterator start = contentimg.begin(); std::string::const_iterator end = contentimg.end(); try { while(boost::regex_search(start,end,m,reg8)) { if (m[0].matched) { string tempurl(m[0].first,m[0].second); string regurl; tempurl = tempurl.substr(tempurl.find_first_of('\"',0)+1,tempurl.find_last_of('\"')-tempurl.find_first_of('\"',0)-1); //if( 0 == sourcepagename.compare("zatu") ) // { regurl = _expandlinks(urlstr,tempurl); // } if(0 != tempurl.length()) { boost::regex reg(tempurl); content = boost::regex_replace(content,reg,regurl); ImgStrList.push_back(regurl); } start = m[0].second; } } } catch (const boost::bad_expression& e) { theLog.WriteLog(LOG_LEVEL_ERROR,"cann't create regex with %s!",urlstr.c_str()); } list<string>::iterator it; for( it = ImgStrList.begin(); it != ImgStrList.end(); it++) { printf("%s\n",(*it).c_str()); } if( 0 == Introduction.length() ) { boost::smatch m1; boost::regex reg("(?i)(<p>.*?</p>)"); std::string::const_iterator start = content.begin(); std::string::const_iterator end = content.end(); while(boost::regex_search(start,end,m1,reg)) { if (m1[0].matched) { Introduction = m1[0].str(); } break; } int pos; if( 0 == Introduction.length() ) { if(-1 != (pos = content.find_first_of("\x0d\x0a",0))) { Introduction = content.substr(0,pos); } } } Introduction = Introductioncontenttagclean(Introduction); strltrim(Introduction); if( 0 == p_spiderruler[sourcepagename].summaryisinbody.compare("yes") ) { content = Introduction + content; } printf("publishtime=%s\n",publishtime.c_str()); printf("Introduction=%s\n",Introduction.c_str()); printf("content=%s\n",content.c_str()); } Qiniu_Global_Cleanup(); /* 全局清理函数,只需要在进程退出时调用一次 */ return 0; }