C++ (Cpp) CUrl::ParseUrl Exemples

Langage de programmation: C++ (Cpp)

Class/Type: CUrl

Méthode/Fonction: ParseUrl

Exemples sur hotexamples.com : 2

C++ (Cpp) CUrl::ParseUrl - 2 exemples trouvés. Ce sont les exemples réels les mieux notés de CUrl::ParseUrl extraits de projets open source. Vous pouvez noter les exemples pour nous aider à en améliorer la qualité.

Méthodes fréquemment utilisées

Afficher Cacher

GetUrlPath(6)

GetPortValue(6)

GetHost(6)

GetScheme(5)

GetPort(4)

CrackUrl(4)

GetPortNumber(3)

GetHostName(3)

GetExtraInfo(2)

GetResource(2)

getUrl(2)

ConstructL(2)

ParseUrl(2)

getPort(1)

getDomain(1)

IsValidHost(1)

UrlDes(1)

SplitUrl(1)

OnWriteOver(1)

IsValidIp(1)

Component(1)

IsUrlType(1)

IsUrl(1)

IsImageUrl(1)

IsForeignHost(1)

GetUserName(1)

GetSchemeName(1)

GetPassword(1)

GetHttpAll(1)

GetBrute(1)

get_response_code(1)

Méthodes fréquemment utilisées

GetUrlPath (6)

GetPortValue (6)

GetHost (6)

GetScheme (5)

GetPort (4)

CrackUrl (4)

GetPortNumber (3)

GetHostName (3)

GetExtraInfo (2)

GetResource (2)

Méthodes fréquemment utilisées

getUrl (2)

ConstructL (2)

ParseUrl (2)

getPort (1)

getDomain (1)

IsValidHost (1)

UrlDes (1)

SplitUrl (1)

OnWriteOver (1)

IsValidIp (1)

Component (1)

IsUrlType (1)

IsUrl (1)

IsImageUrl (1)

IsForeignHost (1)

GetUserName (1)

GetSchemeName (1)

GetPassword (1)

GetHttpAll (1)

GetBrute (1)

Méthodes fréquemment utilisées

Component (1)

IsUrlType (1)

IsUrl (1)

IsImageUrl (1)

IsForeignHost (1)

GetUserName (1)

GetSchemeName (1)

GetPassword (1)

GetHttpAll (1)

GetBrute (1)

get_response_code (1)

Méthodes fréquemment utilisées

get_response_code (1)

Exemple #1

0

Afficher le fichier

Fichier : Crawl.cpp Projet : TimmaWang/PictureCrawl

void CCrawl::Fetch(void *arg) { string str_url,host; int nGsock = -1;//之前的套接字文件描述符 string strGHost;//之前的主机号 //生成一个PSE file来存放网页数据 //string ofs_name = DATA_PSE_FILE + "." + CStrFunction::itos(GetCurrentThreadId());//PSE.raw+当前线程号 string ofs_name = DATA_PSE_FILE + CStrFunction::itos(GetCurrentThreadId())+ ".txt";//PSE+当前线程号+.txt CPSEFile pse_file(ofs_name);//创建一个PSE格式的文件，保存为原始网页库 //生成一个link_for_pse file来存放链接数据 ofs_name = DATA_LINK_FOR_PSE_FILE + CStrFunction::itos(GetCurrentThreadId())+ ".txt";//PSE+当前线程号+.txt CLinkForPSEFile link_for_pse_file(ofs_name);//创建一个网页结构库 int isleep_cnt = 0;//线程运行控制参数 for(;;) { WaitForSingleObject(mutex_collection,INFINITE);//互斥锁 int cnt = map_urls.size(); if(cnt > 0) { //已经收集的没有访问的url cout<<"collection has "<<cnt<<" unvisited urls"<<endl; multimap<string,string>::iterator it = map_urls.begin(); if(it != map_urls.end()) { //从带访问的url队列中得到一个url进行访问 str_url = (*it).second; map_urls.erase(it); ReleaseMutex(mutex_collection); //分解url CUrl iurl; //看看url是否有http://，没有则返回 if(iurl.ParseUrl(str_url) == false) { cout<<"parse url false in Fetch"<<str_url<<endl; continue; } //表明现在抓取的网页所在的主机，同之前抓取的网页所在的主机不同 //我们不能利用之前的套接字文件描述符进行CS通信,必须创建新的 //套接字文件描述符进行通信,这是由于循环导致的 if(strGHost != iurl.host_name) { closesocket(nGsock); nGsock = -1; strGHost = iurl.host_name; } //根据URL以及套接字文件描述符抓取URL对应的网页,并保存为原始网页库和网页结构库 ((CCrawl *)arg)->DownroadFile(&pse_file,&link_for_pse_file,iurl,nGsock); cnt = 0; }else { ReleaseMutex(mutex_collection); } }else { //等待访问的url队列map_urls已经没有url了，这是我们需要挂起线程进行等待 ReleaseMutex(mutex_collection); Sleep(1000); isleep_cnt++; } if(b_f_over == true && isleep_cnt == 200)//当线程挂起的次数达到两百的时候，结束调用fetch { break; } } pse_file.Close(); link_for_pse_file.Close(); }

Exemple #2

0

Afficher le fichier

Fichier : Crawl.cpp Projet : TimmaWang/PictureCrawl

//将url放入map_urls到容器中 void CCrawl::AddUrl(const char * url) { string str_url = url; if(str_url.empty() || str_url.length() < 8) { cout<<"the url is empty or too short"<<endl; return ; } CPage ipage; if(ipage.NormalizeUrl(str_url) == false) return ; CUrl iurl; //图片类型的网页，存放到历史网页链接库中 if(iurl.IsImageUrl(str_url)) { if(ofs_link_for_history_file) { WaitForSingleObject(mutex_link_for_history_file,INFINITE); ofs_link_for_history_file<<str_url<<endl; ReleaseMutex(mutex_link_for_history_file); } return ; } if(iurl.ParseUrl(str_url) == false) { cout<<"parse url error in AddUrl"<<endl; return ; } if(iurl.IsValidHost(iurl.host_name.c_str()) == false) { cout<<"not the valid host in AddUrl"<<endl; return ; } if(iurl.IsForeignHost(iurl.host_name.c_str()) ) { cout<<"foreign host in AddUrl"<<endl; return ; } //如果是阻塞的ip地址，剔除掉 unsigned long inaddr = 0; char *ip = NULL; inaddr =(unsigned long) inet_addr(iurl.host_name.c_str()); if(inaddr != INADDR_NONE) { ip = new char[iurl.host_name.size() + 1]; memset(ip,0,iurl.host_name.size() + 1); memcpy(ip,iurl.host_name.c_str(),iurl.host_name.size()); if(!iurl.IsValidIp(ip)) { delete []ip; ip = NULL; return ; } delete []ip; ip = NULL; } CStrFunction::StrToLower(iurl.host_name,iurl.host_name.size()); CMD5 imd5; imd5.GenerateMd5((unsigned char *)str_url.c_str(),str_url.size()); string str_digest = imd5.ToString(); if(set_visited_url_md5.find(str_digest) != set_visited_url_md5.end()) { return ; } if(set_unvisited_url_md5.find(str_digest) != set_unvisited_url_md5.end()) { return ; } else { WaitForSingleObject(mutex_unvisited_url_md5,INFINITE); set_unvisited_url_md5.insert(str_digest); ReleaseMutex(mutex_unvisited_url_md5); } //确保同一个线程在一个网站上爬取 int cnt = 0; for(;;) { if(1)//？？？？？？？ { WaitForSingleObject(mutex_visited_url_md5,INFINITE); map_urls.insert(val_type(iurl.host_name,str_url)); ReleaseMutex(mutex_visited_url_md5); break; } else { cnt++; if(cnt%100 == 0) cout<<"~"; if(cnt == 5000) { cout<<"remove it"<<endl; } Sleep(4000); } } }