Exemple #1
0
void CCrawl::Fetch(void *arg)
{
	string str_url,host;

	int nGsock = -1;//之前的套接字文件描述符
	string strGHost;//之前的主机号

	//生成一个PSE file来存放网页数据
	//string ofs_name = DATA_PSE_FILE + "." + CStrFunction::itos(GetCurrentThreadId());//PSE.raw+当前线程号

	string ofs_name = DATA_PSE_FILE + CStrFunction::itos(GetCurrentThreadId())+ ".txt";//PSE+当前线程号+.txt
	CPSEFile pse_file(ofs_name);//创建一个PSE格式的文件,保存为原始网页库

	//生成一个link_for_pse file来存放链接数据
	ofs_name = DATA_LINK_FOR_PSE_FILE  + CStrFunction::itos(GetCurrentThreadId())+ ".txt";//PSE+当前线程号+.txt
	CLinkForPSEFile link_for_pse_file(ofs_name);//创建一个网页结构库

	int isleep_cnt = 0;//线程运行控制参数

	for(;;)
	{
		WaitForSingleObject(mutex_collection,INFINITE);//互斥锁

		int cnt = map_urls.size();
		if(cnt > 0)
		{
			//已经收集的没有访问的url
			cout<<"collection has "<<cnt<<" unvisited urls"<<endl;
			multimap<string,string>::iterator it = map_urls.begin();
			if(it != map_urls.end())
			{
				//从带访问的url队列中得到一个url进行访问
				str_url = (*it).second;
				map_urls.erase(it);

				ReleaseMutex(mutex_collection);

				//分解url
				CUrl iurl;
				//看看url是否有http://,没有则返回

				if(iurl.ParseUrl(str_url) == false)
				{
					cout<<"parse url false in Fetch"<<str_url<<endl;
					continue;
				}

				//表明现在抓取的网页所在的主机,同之前抓取的网页所在的主机不同
				//我们不能利用之前的套接字文件描述符进行CS通信,必须创建新的
				//套接字文件描述符进行通信,这是由于循环导致的
				if(strGHost != iurl.host_name)
				{
					closesocket(nGsock);
					nGsock = -1;
					strGHost = iurl.host_name;
				}

				//根据URL以及套接字文件描述符抓取URL对应的网页,并保存为原始网页库和网页结构库
				((CCrawl *)arg)->DownroadFile(&pse_file,&link_for_pse_file,iurl,nGsock);

				cnt = 0;
			}else	
			{
				ReleaseMutex(mutex_collection);
				
			}
		}else
		{
			//等待访问的url队列map_urls已经没有url了,这是我们需要挂起线程进行等待
			ReleaseMutex(mutex_collection);
			Sleep(1000);
			isleep_cnt++;
		}

		if(b_f_over == true && isleep_cnt == 200)//当线程挂起的次数达到两百的时候,结束调用fetch
		{
			break;
		}
	}

	pse_file.Close();
	link_for_pse_file.Close();

}
Exemple #2
0
/**
 * Validates a URL and, if it has not been seen before, queues it in map_urls.
 *
 * Steps, in order:
 *  - reject a null, empty or shorter-than-8-character URL;
 *  - normalize it with CPage::NormalizeUrl;
 *  - image URLs are written to ofs_link_for_history_file instead of being queued;
 *  - reject URLs that fail ParseUrl, IsValidHost, IsForeignHost or (for
 *    dotted-IP hosts) IsValidIp;
 *  - skip URLs whose MD5 digest is already in set_visited_url_md5 or
 *    set_unvisited_url_md5;
 *  - otherwise record the digest as unvisited and insert (host, url) into map_urls.
 *
 * @param url  NUL-terminated URL text; a null pointer is treated as empty.
 */
void CCrawl::AddUrl(const char * url)
{
	// Constructing a string from a null pointer is undefined, so map it to "".
	string str_url = (url != NULL) ? url : "";
	if(str_url.length() < 8)
	{
		cout<<"the url is empty or too short"<<endl;
		return ;
	}

	CPage ipage;
	if(ipage.NormalizeUrl(str_url) == false)
		return ;

	CUrl iurl;

	// Image URLs are not crawled; they are only logged to the history link file.
	if(iurl.IsImageUrl(str_url))
	{
		if(ofs_link_for_history_file)
		{
			WaitForSingleObject(mutex_link_for_history_file,INFINITE);

			ofs_link_for_history_file<<str_url<<endl;

			ReleaseMutex(mutex_link_for_history_file);
		}

		return ;
	}

	if(iurl.ParseUrl(str_url) == false)
	{
		cout<<"parse url error in AddUrl"<<endl;
		return ;
	}

	if(iurl.IsValidHost(iurl.host_name.c_str()) == false)
	{
		cout<<"not the valid host in AddUrl"<<endl;
		return ;
	}

	if(iurl.IsForeignHost(iurl.host_name.c_str()) )
	{
		cout<<"foreign host in AddUrl"<<endl;
		return ;
	}

	// When the host is a dotted IP address, drop it if IsValidIp rejects it.
	const unsigned long inaddr = static_cast<unsigned long>(inet_addr(iurl.host_name.c_str()));
	if(inaddr != INADDR_NONE)
	{
		// IsValidIp is handed a writable copy, as before, in case it modifies
		// its argument; the string releases the buffer on every return path.
		string ip_copy(iurl.host_name);
		if(!iurl.IsValidIp(&ip_copy[0]))
		{
			return ;
		}
	}

	CStrFunction::StrToLower(iurl.host_name,iurl.host_name.size());

	CMD5 imd5;
	imd5.GenerateMd5((unsigned char *)str_url.c_str(),str_url.size());

	const string str_digest = imd5.ToString();

	// NOTE(review): this lookup is not under a lock, as in the original;
	// confirm how writers of set_visited_url_md5 synchronize.
	if(set_visited_url_md5.find(str_digest) != set_visited_url_md5.end())
	{
		return ;
	}

	// Check and insert inside one critical section so that two threads adding
	// the same URL cannot both pass the check and queue it twice.
	WaitForSingleObject(mutex_unvisited_url_md5,INFINITE);
	const bool already_pending =
		set_unvisited_url_md5.find(str_digest) != set_unvisited_url_md5.end();
	if(!already_pending)
	{
		set_unvisited_url_md5.insert(str_digest);
	}
	ReleaseMutex(mutex_unvisited_url_md5);

	if(already_pending)
	{
		return ;
	}

	// Queue the URL keyed by host name.
	// NOTE(review): Fetch() guards map_urls with mutex_collection, while this
	// insert takes mutex_visited_url_md5. Unless both names refer to the same
	// mutex the two accesses are not mutually exclusive — confirm and unify.
	WaitForSingleObject(mutex_visited_url_md5,INFINITE);
	map_urls.insert(val_type(iurl.host_name,str_url));
	ReleaseMutex(mutex_visited_url_md5);

}