void CCrawl::Fetch(void *arg) { string str_url,host; int nGsock = -1;//之前的套接字文件描述符 string strGHost;//之前的主机号 //生成一个PSE file来存放网页数据 //string ofs_name = DATA_PSE_FILE + "." + CStrFunction::itos(GetCurrentThreadId());//PSE.raw+当前线程号 string ofs_name = DATA_PSE_FILE + CStrFunction::itos(GetCurrentThreadId())+ ".txt";//PSE+当前线程号+.txt CPSEFile pse_file(ofs_name);//创建一个PSE格式的文件,保存为原始网页库 //生成一个link_for_pse file来存放链接数据 ofs_name = DATA_LINK_FOR_PSE_FILE + CStrFunction::itos(GetCurrentThreadId())+ ".txt";//PSE+当前线程号+.txt CLinkForPSEFile link_for_pse_file(ofs_name);//创建一个网页结构库 int isleep_cnt = 0;//线程运行控制参数 for(;;) { WaitForSingleObject(mutex_collection,INFINITE);//互斥锁 int cnt = map_urls.size(); if(cnt > 0) { //已经收集的没有访问的url cout<<"collection has "<<cnt<<" unvisited urls"<<endl; multimap<string,string>::iterator it = map_urls.begin(); if(it != map_urls.end()) { //从带访问的url队列中得到一个url进行访问 str_url = (*it).second; map_urls.erase(it); ReleaseMutex(mutex_collection); //分解url CUrl iurl; //看看url是否有http://,没有则返回 if(iurl.ParseUrl(str_url) == false) { cout<<"parse url false in Fetch"<<str_url<<endl; continue; } //表明现在抓取的网页所在的主机,同之前抓取的网页所在的主机不同 //我们不能利用之前的套接字文件描述符进行CS通信,必须创建新的 //套接字文件描述符进行通信,这是由于循环导致的 if(strGHost != iurl.host_name) { closesocket(nGsock); nGsock = -1; strGHost = iurl.host_name; } //根据URL以及套接字文件描述符抓取URL对应的网页,并保存为原始网页库和网页结构库 ((CCrawl *)arg)->DownroadFile(&pse_file,&link_for_pse_file,iurl,nGsock); cnt = 0; }else { ReleaseMutex(mutex_collection); } }else { //等待访问的url队列map_urls已经没有url了,这是我们需要挂起线程进行等待 ReleaseMutex(mutex_collection); Sleep(1000); isleep_cnt++; } if(b_f_over == true && isleep_cnt == 200)//当线程挂起的次数达到两百的时候,结束调用fetch { break; } } pse_file.Close(); link_for_pse_file.Close(); }
//将url放入map_urls到容器中 void CCrawl::AddUrl(const char * url) { string str_url = url; if(str_url.empty() || str_url.length() < 8) { cout<<"the url is empty or too short"<<endl; return ; } CPage ipage; if(ipage.NormalizeUrl(str_url) == false) return ; CUrl iurl; //图片类型的网页,存放到历史网页链接库中 if(iurl.IsImageUrl(str_url)) { if(ofs_link_for_history_file) { WaitForSingleObject(mutex_link_for_history_file,INFINITE); ofs_link_for_history_file<<str_url<<endl; ReleaseMutex(mutex_link_for_history_file); } return ; } if(iurl.ParseUrl(str_url) == false) { cout<<"parse url error in AddUrl"<<endl; return ; } if(iurl.IsValidHost(iurl.host_name.c_str()) == false) { cout<<"not the valid host in AddUrl"<<endl; return ; } if(iurl.IsForeignHost(iurl.host_name.c_str()) ) { cout<<"foreign host in AddUrl"<<endl; return ; } //如果是阻塞的ip地址,剔除掉 unsigned long inaddr = 0; char *ip = NULL; inaddr =(unsigned long) inet_addr(iurl.host_name.c_str()); if(inaddr != INADDR_NONE) { ip = new char[iurl.host_name.size() + 1]; memset(ip,0,iurl.host_name.size() + 1); memcpy(ip,iurl.host_name.c_str(),iurl.host_name.size()); if(!iurl.IsValidIp(ip)) { delete []ip; ip = NULL; return ; } delete []ip; ip = NULL; } CStrFunction::StrToLower(iurl.host_name,iurl.host_name.size()); CMD5 imd5; imd5.GenerateMd5((unsigned char *)str_url.c_str(),str_url.size()); string str_digest = imd5.ToString(); if(set_visited_url_md5.find(str_digest) != set_visited_url_md5.end()) { return ; } if(set_unvisited_url_md5.find(str_digest) != set_unvisited_url_md5.end()) { return ; } else { WaitForSingleObject(mutex_unvisited_url_md5,INFINITE); set_unvisited_url_md5.insert(str_digest); ReleaseMutex(mutex_unvisited_url_md5); } //确保同一个线程在一个网站上爬取 int cnt = 0; for(;;) { if(1)//??????? { WaitForSingleObject(mutex_visited_url_md5,INFINITE); map_urls.insert(val_type(iurl.host_name,str_url)); ReleaseMutex(mutex_visited_url_md5); break; } else { cnt++; if(cnt%100 == 0) cout<<"~"; if(cnt == 5000) { cout<<"remove it"<<endl; } Sleep(4000); } } }