Beispiel #1
0
// Reads one piece of request/response information into `info`.
//
// The global ::HttpQueryInfo (presumably the WinINet API - confirm) is called
// twice: once without a buffer so that it reports a length in `bytesNeeded`,
// and once more with `info`'s buffer to receive the data itself.
//
// request - handle forwarded unchanged to ::HttpQueryInfo.
// info    - destination string; resized to the reported length plus one
//           before the second call.
// code    - info-level selector forwarded unchanged to ::HttpQueryInfo.
//
// Returns the result of the second ::HttpQueryInfo call, or 0 when the first
// call left the reported length at zero.
int CMyAsyncHttp::HttpQueryInfo(HINTERNET request,CMyString &info,int code)
{
	// Size probe: the return value is deliberately ignored, only the length
	// written back through the pointer matters here.
	unsigned long bytesNeeded=0;
	::HttpQueryInfo(request,code,NULL,&bytesNeeded,0);

	// Nothing reported - there is nothing to fetch.
	if(bytesNeeded==0)
		return 0;

	// One extra element beyond the reported length.
	info.Resize(bytesNeeded+1);
	return ::HttpQueryInfo(request,code,info.GetBuffer(),&bytesNeeded,0);
}
	virtual	void FileProcess(const char *parentUrl,const char* url,const char* fileData,int dataLen)
	{
		CMyString lUrl=(char*)url;
		CMyString path=CUrl::GetUrlPath(lUrl);
		CMyString name=CUrl::GetFileName(lUrl);
		if(path!=""&&name!="")
		{
			GetDirName(path);
			m_TempStr=m_FilePath+m_TempStr;
			CMyFile::CreateDir(m_TempStr.GetBuffer());
			m_TempStr+="\\"+name;
		}
	}
Beispiel #3
0
// Finds the first occurrence of `needle` in `haystack`, folding ASCII
// upper-case letters of `haystack` to lower case before comparing.
// `needle` must already be lower-case. Returns a pointer into `haystack`,
// or NULL when there is no match.
static char *FindLowerAsciiNoCase(char *haystack,const char *needle)
{
	for(;*haystack;++haystack)
	{
		const char *h=haystack;
		const char *n=needle;
		while(*n)
		{
			char c=*h;
			if(c>='A'&&c<='Z')c=static_cast<char>(c-'A'+'a');
			if(c!=*n)break;	// also stops at the end of haystack ('\0' != *n)
			++h;
			++n;
		}
		if(!*n)return haystack;
	}
	return NULL;
}

// Extracts the charset parameter from the request's Content-Type header.
//
// request - handle passed to HttpQueryInfo with HTTP_QUERY_CONTENT_TYPE.
// charset - receives the charset value when the header contains a
//           "charset=" parameter; left untouched otherwise.
//
// The parameter name is matched case-insensitively, a quoted value has its
// quotes removed, and an unquoted value ends at the first ';' or whitespace,
// so trailing media-type parameters are not copied into `charset`.
//
// Always returns 1, whether or not a charset was found (kept as-is so that
// existing callers see the same result).
int CMyAsyncHttp::HttpGetCharset(HINTERNET request,CMyString &charset)
{
	CMyString str;
	if(HttpQueryInfo(request,str,HTTP_QUERY_CONTENT_TYPE))
	{
		char *p=FindLowerAsciiNoCase(str.GetBuffer(),"charset=");
		if(p)
		{
			p+=8;	// skip "charset="
			char *end=p;
			if(*p=='"')
			{
				// Quoted value: take everything up to the closing quote.
				++p;
				end=p;
				while(*end&&*end!='"')++end;
			}
			else
			{
				// Bare token: stop at a parameter separator or whitespace.
				while(*end&&*end!=';'&&*end!=' '&&*end!='\t'&&*end!='\r'&&*end!='\n')++end;
			}
			// `str` is a local scratch copy, so it can be cut in place.
			*end='\0';
			charset=p;
		}
		return 1;
	}
#ifdef _DEBUG
	// Debug builds only: log when the query failed because the header is absent.
	int d=::GetLastError();
	if(d==ERROR_HTTP_HEADER_NOT_FOUND)
	{
		LOG(TAG,"HttpGetCharset,can't find the header!");
	}
#endif
	return 1;
	
}
// Processes the data received for one fetched URL.
//
// For text pages (spiderHttp->IsTxtPage()):
//   1. URLs are extracted in batches through m_InterfaceConfig.m_FetchUrl.
//      Each URL is rebuilt relative to the page URL, skipped when
//      HaveAcess() reports it, skipped when any entry of m_UrlFilterList
//      rejects it, optionally rewritten by m_UrlModify, and then recorded
//      via AddHashMap() and AddTempUrlList().
//   2. The page is handed to m_PageProcess when one is configured.
//   3. The temporary URL list is sorted when m_UrlCmp is set, then merged
//      into the main URL list by AddAllUrlToUrlList().
// For any other content the data is handed to m_FileProcess when one is
// configured.
//
// spiderHttp - the finished request; supplies the URL, parent URL and the
//              received data buffer with its length.
void SpiderThread::AnalysisData(SpiderHttp* spiderHttp)
{
	CMyString url;
	CMyString host;

	bool	  haveUrl=true;
	if(spiderHttp->IsTxtPage())
	{
		if(!InitalFetchEngine(spiderHttp->GetReceiveData(),spiderHttp->GetReceiveDataLen()))
			haveUrl=false;
		while(haveUrl)
		{
			int urlCount=m_InterfaceConfig.m_FetchUrl->FetchUrl(m_CurrentP,m_Regex.GetMatchStrLen(),m_SameRegex);
			for(int i=0;i<urlCount;i++)
			{
				url	=*(m_InterfaceConfig.m_FetchUrl->GetUrl(i));
				if(url=="./")continue;
				ReBuildUrlIfNeed(spiderHttp->m_Url,url,host);
				if(HaveAcess(host,url))continue;

				// Run every filter; the first one that returns false rejects
				// the URL. An explicit flag is used instead of re-reading a
				// loop index after the loop: with standard for-scope rules the
				// filter index is no longer visible there and the URL index
				// `i` would be tested by mistake.
				bool rejected=false;
				const int filterCount=static_cast<int>(m_InterfaceConfig.m_UrlFilterList.size());
				for(int f=0;f<filterCount;f++)
				{
					if(!m_InterfaceConfig.m_UrlFilterList[f]->FilterCheck(spiderHttp->m_Url.GetBuffer(),url.GetBuffer()))
					{
						rejected=true;
						break;
					}
				}
				if(rejected)continue;

				if(m_InterfaceConfig.m_UrlModify)
				{
					m_InterfaceConfig.m_UrlModify->ModifyUrl(spiderHttp->m_Url.GetBuffer(),url);
				}
				AddHashMap(host,url);
				AddTempUrlList(url);
			}
			// Advance to the next batch; stop when FetchUrl reports failure.
			if(!FetchUrl(url))break;
		}
		
		if(m_InterfaceConfig.m_PageProcess)
		{
			m_InterfaceConfig.m_PageProcess->PageProcess(spiderHttp->m_ParentUrl.GetBuffer(),spiderHttp->m_Url.GetBuffer(),spiderHttp->GetReceiveData(),spiderHttp->GetReceiveDataLen());
		}
		if(m_UrlCmp)
		{
			SortTempUrlList();
		}
		
		// Queue the collected URLs for crawling, depth-first or breadth-first
		// depending on configuration.
		AddAllUrlToUrlList(spiderHttp->m_Url);
	}
	else if(m_InterfaceConfig.m_FileProcess)
	{
		m_InterfaceConfig.m_FileProcess->FileProcess(spiderHttp->m_ParentUrl.GetBuffer(),spiderHttp->m_Url.GetBuffer(),spiderHttp->GetReceiveData(),spiderHttp->GetReceiveDataLen());
	}	
}