// Reads one HTTP header/info value of `request` into `info`.
//
// request : WinINet request handle that is passed straight to ::HttpQueryInfo.
// info    : receives the value; it is resized before the second query.
// code    : info-level selector forwarded unchanged to ::HttpQueryInfo.
//
// Returns the result of the second ::HttpQueryInfo call, or 0 when the
// first (size-probing) call left the reported length at zero.
int CMyAsyncHttp::HttpQueryInfo(HINTERNET request,CMyString &info,int code)
{
    unsigned long needed=0;

    // First pass: no buffer is supplied, only the length variable, so the
    // API can report how many bytes the value needs.
    ::HttpQueryInfo(request,code,NULL,&needed,0);
    if(needed==0)
    {
        return 0;
    }

    // Second pass: fetch the value into a buffer one byte larger than the
    // reported length.
    info.Resize(needed+1);
    return ::HttpQueryInfo(request,code,info.GetBuffer(),&needed,0);
}
// Prepares the local storage location for a downloaded file.
//
// Splits `url` into a path part and a file-name part (via CUrl). When both
// are non-empty it creates the directory m_FilePath + m_TempStr and then
// appends "\" + file name to m_TempStr.
//
// NOTE(review): m_TempStr is presumably filled in by GetDirName(path) --
// confirm against that helper. `parentUrl`, `fileData` and `dataLen` are not
// used by the code visible here.
virtual void FileProcess(const char *parentUrl,const char* url,const char* fileData,int dataLen)
{
    CMyString lUrl=const_cast<char*>(url);
    CMyString path=CUrl::GetUrlPath(lUrl);
    CMyString name=CUrl::GetFileName(lUrl);

    // Nothing to do unless the URL yields both a directory and a file name.
    if(!(path!=""&&name!=""))
    {
        return;
    }

    GetDirName(path);
    m_TempStr=m_FilePath+m_TempStr;
    CMyFile::CreateDir(m_TempStr.GetBuffer());

    // Extend the directory with the file name.
    m_TempStr+="\\"+name;
}
// Extracts the charset from the Content-Type header of `request`.
//
// request : request handle passed on to HttpQueryInfo.
// charset : set to everything following "charset=" in the header value;
//           left untouched when the header cannot be read or carries no
//           "charset=" token.
//
// NOTE(review): every path returns 1, so callers cannot tell success from
// failure through the return value -- confirm whether that is intended.
int CMyAsyncHttp::HttpGetCharset(HINTERNET request,CMyString &charset)
{
    CMyString contentType;

    if(!HttpQueryInfo(request,contentType,HTTP_QUERY_CONTENT_TYPE))
    {
#ifdef _DEBUG
        // Debug builds log the "header not present" case.
        int lastError=::GetLastError();
        if(lastError==ERROR_HTTP_HEADER_NOT_FOUND)
        {
            LOG(TAG,"HttpGetCharset,can't find the header!");
        }
#endif
        return 1;
    }

    char *header=contentType.GetBuffer();
    char *marker=strstr(header,"charset=");
    if(marker)
    {
        // 8 == length of "charset="; copy what comes after the token.
        charset=marker+8;
    }
    return 1;
}
// Processes the data received for one finished request.
//
// Text page (spiderHttp->IsTxtPage() is true):
//   1. Initialises the fetch engine on the received data and then pulls
//      URLs out of it batch by batch.
//   2. Each extracted URL is skipped when it equals "./", when HaveAcess()
//      returns true for it, or when any configured URL filter's
//      FilterCheck() returns false. Surviving URLs are optionally rewritten
//      by m_UrlModify and then recorded via AddHashMap()/AddTempUrlList().
//   3. The page is handed to m_PageProcess (if configured), the temporary
//      URL list is sorted when m_UrlCmp is set, and AddAllUrlToUrlList()
//      is called with the page's URL.
// Any other content is forwarded to m_FileProcess (if configured).
//
// spiderHttp : the completed request; its URL, parent URL and received
//              buffer are read.
void SpiderThread::AnalysisData(SpiderHttp* spiderHttp)
{
    CMyString url;
    CMyString host;
    bool haveUrl=true;

    if(spiderHttp->IsTxtPage())
    {
        if(!InitalFetchEngine(spiderHttp->GetReceiveData(),spiderHttp->GetReceiveDataLen()))
            haveUrl=false;

        while(haveUrl)
        {
            int urlCount=m_InterfaceConfig.m_FetchUrl->FetchUrl(m_CurrentP,m_Regex.GetMatchStrLen(),m_SameRegex);
            for(int i=0;i<urlCount;i++)
            {
                url=*(m_InterfaceConfig.m_FetchUrl->GetUrl(i));
                if(url=="./")continue;

                ReBuildUrlIfNeed(spiderHttp->m_Url,url,host);
                if(HaveAcess(host,url))continue;

                // Run every filter; the first one that returns false rejects
                // the URL. An explicit flag is used instead of inspecting the
                // loop counter after the loop: the previous code reused the
                // name `i` for the filter loop, so under standard scoping the
                // post-loop test looked at the outer URL index rather than at
                // the filter loop's state.
                bool rejected=false;
                for(int f=0;f<static_cast<int>(m_InterfaceConfig.m_UrlFilterList.size());f++)
                {
                    if(!m_InterfaceConfig.m_UrlFilterList[f]->FilterCheck(spiderHttp->m_Url.GetBuffer(),url.GetBuffer()))
                    {
                        rejected=true;
                        break;
                    }
                }
                if(rejected)continue;

                if(m_InterfaceConfig.m_UrlModify)
                {
                    m_InterfaceConfig.m_UrlModify->ModifyUrl(spiderHttp->m_Url.GetBuffer(),url);
                }
                AddHashMap(host,url);
                AddTempUrlList(url);
            }

            // Leave the extraction loop once FetchUrl() returns false.
            if(!FetchUrl(url))break;
        }

        if(m_InterfaceConfig.m_PageProcess)
        {
            m_InterfaceConfig.m_PageProcess->PageProcess(spiderHttp->m_ParentUrl.GetBuffer(),spiderHttp->m_Url.GetBuffer(),spiderHttp->GetReceiveData(),spiderHttp->GetReceiveDataLen());
        }

        if(m_UrlCmp)
        {
            SortTempUrlList();
        }

        // Crawl according to depth-first or breadth-first order.
        AddAllUrlToUrlList(spiderHttp->m_Url);
    }
    else if(m_InterfaceConfig.m_FileProcess)
    {
        m_InterfaceConfig.m_FileProcess->FileProcess(spiderHttp->m_ParentUrl.GetBuffer(),spiderHttp->m_Url.GetBuffer(),spiderHttp->GetReceiveData(),spiderHttp->GetReceiveDataLen());
    }
}