// Calls AddUrl with true when the first entry of ComboEvento is selected,
// and with false for any other selection.
void AddEventos::ClickUrl()
{
    AddUrl(ComboEvento->currentIndex() == 0);
}
// Passes the URL of the player's current source to AddUrl, unless that URL
// refers to a local file, in which case nothing happens.
void RadioWidget::handleAddCurrentUrl ()
{
	const auto& currentUrl = Player_->GetSourceObject ()->GetCurrentSource ().ToUrl ();
	if (!currentUrl.isLocalFile ())
		AddUrl (currentUrl);
}
//返回值提取到的有效url个数 int ISpiderFetchUrl::FetchUrl(char* context,int contextLen,bool regexSame) { //提取context中的有效url m_Context =context; m_ContextLen=contextLen; m_CurrentP =m_Context; m_UrlCount =0; if(!regexSame) { InitalRegex(context,contextLen); while(GetUrl()) { if(this->onFetchUrl(m_TempUrl.GetBuffer(),context,contextLen)) { AddUrl(); } } } else { m_TempUrl =CMyString::StringFromMem(m_CurrentP,0,contextLen); //去除结尾" m_TempUrl.EraseFromRight(1); //去除href="和src=" if(m_TempUrl[0]=='h') m_TempUrl.Erase(0,6); else m_TempUrl.Erase(0,5); //去除http:// if(m_TempUrl.FindString("http://")!=-1) m_TempUrl.Erase(0,7); int len=m_TempUrl.GetStrLen(); m_TempUrl.Trim(); AddUrl(); } return GetUrlCount(); }
// Walks the scope list starting at m_idxRunningScope and queues the encoded
// search URL of every scope that has an adapter. The walk stops at the first
// scope that is neither online mail nor news and has a search server set.
// GetNextUrl() is then called when the walk ended at an index greater than 0.
// Always returns NS_OK.
nsresult nsMsgSearchSession::BuildUrlQueue ()
{
  PRInt32 scopeIdx = m_idxRunningScope;
  while (scopeIdx < m_scopeList.Count())
  {
    nsMsgSearchScopeTerm *scope = m_scopeList.ElementAt(scopeIdx);

    if (scope->m_attribute != nsMsgSearchScope::onlineMail &&
        scope->m_attribute != nsMsgSearchScope::news &&
        scope->m_searchServer)
      break;

    nsCOMPtr <nsIMsgSearchAdapter> searchAdapter = do_QueryInterface(scope->m_adapter);
    if (searchAdapter)
    {
      nsCString encodedUrl;
      searchAdapter->GetEncoding(getter_Copies(encodedUrl));
      AddUrl (encodedUrl.get());
    }

    ++scopeIdx;
  }

  if (scopeIdx > 0)
    GetNextUrl();

  return NS_OK;
}
// Invokes AddUrl with an empty, value-initialised argument.
void RadioWidget::handleAddUrl ()
{
	AddUrl ({});
}
//CCrawl类的总控制函数 void CCrawl::DoCrawl() { //set the signal function 作用???????????????? /* signal(SIGTERM,SigTerm); signal(SIGILL,SigTerm); signal(SIGFPE,SigTerm); signal(SIGINT,SIG_IGN); signal(SIGBREAK,SIG_IGN);*/ //output the begin time char str_time[128]; time_t t_date; memset(str_time,0,128); time(&t_date); strftime(str_time,128,"%a, %d %b %Y %H:%M:%S GMT",localtime(&t_date)); cout << "\n\nBegin at: " << str_time << "\n\n"; //从文件中获取到其他的信息 GetVisitedUrlMd5(); GetVisitedPageMd5(); GetIpBlock(); GetUnreachHostMd5(); //打开url种子库文件 ifstream ifs_seed(input_file_name.c_str()); if(!ifs_seed) { cout<<"can not open the url seed file "<<input_file_name<<endl; return ; } //打开文件输出流 OpenFilesForOutput(); //创建线程 DWORD dw_thread_id[NUMBER_WORKERS]; HANDLE h_thread[NUMBER_WORKERS]; for(int i = 0;i<NUMBER_WORKERS;i++) { h_thread[i] = CreateThread(NULL,0,start,this,0,&dw_thread_id[i]); if(h_thread[i] == NULL) { cout<<"can not create the thread "<<i<<endl; ExitProcess(i); } } string str_url; CPage ipage; while(getline(ifs_seed,str_url)) { string::size_type index; if(str_url[0] == '\0' ||str_url[0] == '#' || str_url[0] == '\n') continue; index = str_url.find('\t'); if(index != string::npos) { str_url = str_url.substr(0,index); } index = CStrFunction::FindCase(str_url,"http"); if(index == string::npos) { index = str_url.find('/'); if(index == string::npos) { str_url = "http://" + str_url + "/"; } else str_url = "http://" + str_url; } if(ipage.IsFilterLink(str_url)) continue; AddUrl(str_url.c_str()); } //获得未访问的url ifstream ifs_unvisited_url(UNVISITED_FILE.c_str(),ios::binary); if(!ifs_unvisited_url) { while(getline(ifs_unvisited_url,str_url)) { string::size_type index; if(str_url[0] == '\0' ||str_url[0] == '#' || str_url[0] == '\n') continue; index = str_url.find('\t'); if(index != string::npos) { str_url = str_url.substr(0,index); } if(ipage.IsFilterLink(str_url)) continue; AddUrl(str_url.c_str()); } } else cout<<"can not open the file : "<<UNVISITED_FILE<<endl; b_f_over = true; 
cout<<"finished to get all unvisited urls"<<endl; //等待所有的线程结束 WaitForMultipleObjects(NUMBER_WORKERS,h_thread,false,INFINITE); //关闭所有的线程句柄 for(int i = 0;i<NUMBER_WORKERS;i++) { CloseHandle(h_thread[i]); } cout<<"all of "<<NUMBER_WORKERS<<" thread have been closed"<<endl; SaveUnvisitedUrl(); SaveReplicas("repli"); memset(str_time,0,128); time(&t_date); strftime(str_time, 128, "%a, %d %b %Y %H:%M:%S GMT", localtime(&t_date)); cout << "\n\nEnd at: " << str_time << "\n\n"; }
//函数:下载文件 void CCrawl::DownroadFile(CPSEFile *pse_file_ptr,CLinkForPSEFile *link_for_pse_file_ptr, CUrl iurl,int nGsock) { char *downroaded_file = NULL,//网页体信息 *file_header = NULL,//网页头信息 *location = NULL;//网页重定向 int file_length = 0;//网页体真实的字节长度 string str_url_location = "";//保存网页的重定向超链接 //之后请求的网页和之前请求的网页位于同一个主机上,我们可以利用之前 //的套接字文件描述符进行通信,这样我们可以节约带宽,节省时间????为何??????????????????????????????? int nsock = nGsock;//将之前的套接字文件描述符赋值给nsock cout<<"PID = "<<GetCurrentThreadId()<<" nsock = "<<nsock<<endl; CHttp ihttp; //真正的抓取网页的函数,有了URL,搜集系统可以根据URL的标识抓取其对应的网页 file_length = ihttp.Fetch(iurl.origin_url,&downroaded_file,&file_header,&location,&nsock); int location_count = 0;//标识url重定向的次数,如果重定向了3次,我们就不要抓取它对应的网页 while(file_length == -300)//表明此时重定向了 { //转到其他的地址 if( strlen(location)> URL_LEN -1 || location_count == 3 ||strlen(location) == 0) { if(location) { free(location); location = 0; } file_length = -1; break; } //将获取到的重定向的URL给str_url_location,为下次抓取网页做准备 str_url_location = location; if(location) { free(location); location = 0; } //因为重定向的URL可能是相对路径,所以我们必须将它转化为绝对路径 //跟CPage类中提取超链接信息一样 string::size_type index1 = CStrFunction::FindCase(str_url_location,"http"); if(index1 != 0)//没有找到http协议 { char c1 = iurl.origin_url.at(iurl.origin_url.length()-1); char c2 = str_url_location.at(0); if(c2 == '/')//重定向的url一定是相对url str_url_location = "http://" + iurl.host_name + str_url_location; else if(c1 != '/' && c2 != '/') { string::size_type index; index = iurl.origin_url.rfind('/'); if(index != string::npos) { if(index > 6) { str_url_location = iurl.origin_url.substr(0,index+1) + str_url_location; } else str_url_location = iurl.origin_url + "/" + str_url_location; } else { file_length = -1; break; } } else { if(c1 == '/') str_url_location = iurl.origin_url + str_url_location; else str_url_location = iurl.origin_url + "/" + str_url_location; } } CPage ipage; if(ipage.IsFilterLink(str_url_location)) { file_length = -1; break; } cout<<"PID= "<<GetCurrentThreadId()<<" sock= "<<nGsock<<endl; 
file_length = ihttp.Fetch(str_url_location,&downroaded_file,&file_header,&location,&nGsock); location_count++; }//while循环结束 nGsock = nsock;//将得到的套接字文件描述符给之前的套接字文件描述符,为下次重用做准备 if(file_length == -1)//其他的各种错误,错误在CHttp的Fetch函数中 { cout<<"error in the page of"<<iurl.origin_url<<endl; if(file_header) { free(file_header); file_header = NULL; } if(downroaded_file) { free(downroaded_file); downroaded_file = NULL; } cout<<"unreach host : "<<iurl.host_name<<endl; return ; } if(file_length == -2)//在ip阻塞范围内 { if(file_header) { free(file_header); file_header = NULL; } if(downroaded_file) { free(downroaded_file); downroaded_file = NULL; } SaveUnreachHost(iurl.host_name); cout<<"out of block host : "<<iurl.host_name<<endl; return ; } if(file_length == -3)//错误的ip地址 { if(file_header) { free(file_header); file_header = NULL; } if(downroaded_file) { free(downroaded_file); downroaded_file = NULL; } cout<<"invalid host : "<<iurl.host_name<<endl; return ; } if(file_length == -4)//图片类型的网页 { if(file_header) { free(file_header); file_header = NULL; } if(downroaded_file) { free(downroaded_file); downroaded_file = NULL; } if(ofs_link_for_history_file) { } cout<<"image host : "<<iurl.host_name<<endl; return ; } //处理正常的网页,只要网页头或网页体的信息有一个为空,我们就认为网页不正常 if(!file_header || !downroaded_file) { if(file_header) { free(file_header); file_header = NULL; } if(downroaded_file) { free(downroaded_file); downroaded_file = NULL; } closesocket(nGsock); nGsock = -1; cout<<"not the nomal host"<<endl; return ; } //将抓取的网页信息放入到CPage类中 CPage ipage(iurl.origin_url,str_url_location,file_header,downroaded_file,file_length); if(file_header) { free(file_header); file_header = NULL; } if(downroaded_file) { free(downroaded_file); downroaded_file = NULL; } //解析网页头信息 ipage.ParseHeaderInfo(ipage.header); if(ipage.connection_state == false) { closesocket(nGsock); nGsock = -1; } //过滤掉不是我们想要的网页体信息,注意?????????????????????????????????忽略了图片形式的网页 if(ipage.content_type != "text/html" &&ipage.content_type != "text/plain" 
&&ipage.content_type != "text/xml" &&ipage.content_type != "application/msword" &&ipage.content_type != "applicaiton/pdf" &&ipage.content_type != "text/rtf" &&ipage.content_type != "applicaiton/postscript" &&ipage.content_type != "applicaiton/vnd.ms-excel" &&ipage.content_type != "application/vnd.ms-powerpoint") { cout<<"unwant host type"<<iurl.host_name<<endl; return ; } //解压缩 //如果是gzip编码,要解压缩,然后提取超链接信息 /* char unzip_content_buffer[1024000]; int unzip_length = 0; if(ipage.content_encoding == "gzip" && ipage.content_type == "text/html") { gzFile zip; string ofs_gzip_name; ofs_gzip_name = CStrFunction::itos(GetCurrentThreadId()) + ".gz"; //以二进制截断的方式打开文件 //ios::trunc 如果文件存在,则将文件长度截断为0,并清除文件的内容,如果文件不存在,则创建该文件 ofstream ofs_downroad_file(ofs_gzip_name.c_str(),ios::trunc|ios::binary); cout<<"file length : "<<endl; ofs_downroad_file.write(ipage.body_content.c_str(),ipage.body_content_length); ofs_downroad_file.close(); zip = gzopen(ofs_gzip_name.c_str(),"rb"); if(zip == NULL) { cout<<"open zip file "<<ofs_gzip_name.c_str()<<" error ."<<endl; exit(-1); } //解压缩过程,将解压缩后的网页体信息放入到缓冲区域unzip_content_buffer unzip_length = gzread(zip,unzip_content_buffer,1024000); if(unzip_length == -1) { cout<<"read zip file "<<ofs_gzip_name.c_str()<<" error ."<<endl; exit(-1); } unzip_content_buffer[unzip_length] = 0; gzclose(zip); }//解压缩过程结束 */ CMD5 imd5; string str_digest; //判断该URL是否在set_visited_url_md5中,在返回;不在加到其中,并保存 imd5.GenerateMd5((unsigned char *)iurl.origin_url.c_str(),iurl.origin_url.length());//生成md5码 str_digest = imd5.ToString(); WaitForSingleObject(mutex_visited_url_md5,INFINITE); if(set_visited_url_md5.find(str_digest) != set_visited_url_md5.end())//表明已经抓取过 { cout<<"the url :"<<iurl.origin_url.c_str()<<" have crawled !"<<endl; ReleaseMutex(mutex_visited_url_md5); return ; } //不在set_visited_url_md5中,现在必须插入set_visited_url_md5中 //因为该URL现在已经访问过了 set_visited_url_md5.insert(str_digest); SaveVisitedUrlMd5(str_digest); ReleaseMutex(mutex_visited_url_md5); 
//判断该网页体是否已经访问过,访问过返回,没有访问过加到set_visited_page_md5集合中 imd5.GenerateMd5((unsigned char *)ipage.body_content.c_str(),ipage.body_content.length()); str_digest = imd5.ToString(); WaitForSingleObject(mutex_visited_page_md5,INFINITE); //网页体MD5同URL的关系插入到容器replicas中 replicas.insert(pair<string,string>(str_digest,iurl.origin_url)); if(set_visited_page_md5.find(str_digest) != set_visited_page_md5.end())//表明出现了镜像文件 { cout<<"the page have crawled !"<<endl; ReleaseMutex(mutex_visited_page_md5); return ; } //不在set_visited_page_md5中,现在必须插入set_visited_page_md5中 //因为该URL现在已经访问过了 set_visited_page_md5.insert(str_digest); SaveVisitedPageMd5(str_digest); ReleaseMutex(mutex_visited_page_md5); //将抓取到的网页以PSE格式放到原始网页库中 SavePseRawData(pse_file_ptr,&iurl,&ipage); if(ipage.location.length()<1) { SaveVisitedUrl(iurl.origin_url); } else SaveVisitedUrl(ipage.location); if(ipage.content_type != "text/html")//只可以在text/html中发现超链接 return ; //=====================================保存单个网页的所有连接信息 if (ipage.ParseHyperLinks() == false) { return; } SaveLinkForPSE( &ipage); SaveLinkForHistory( &ipage); map<string,string>::iterator it = ipage.map_link_for_pse.begin(); string str; for( ; it!= ipage.map_link_for_pse.end(); ++it ) { str = (*it).first; AddUrl( str.c_str() ); } //======================================== return ; }