Esempio n. 1
0
// Forwards to AddUrl, passing true when the entry at index 0 of ComboEvento
// is the current one and false for any other index.
void AddEventos::ClickUrl()
{
    const bool firstEntrySelected = (ComboEvento->currentIndex() == 0);
    AddUrl(firstEntrySelected);
}
Esempio n. 2
0
	/// Passes the URL of the player's current source to AddUrl, unless that
	/// URL reports itself as a local file.
	void RadioWidget::handleAddCurrentUrl ()
	{
		const auto& currentUrl = Player_->GetSourceObject ()->
				GetCurrentSource ().ToUrl ();

		// Local files are skipped; everything else is forwarded.
		if (!currentUrl.isLocalFile ())
			AddUrl (currentUrl);
	}
// Extracts URLs from `context` and returns GetUrlCount() afterwards.
//   context    - buffer to scan
//   contextLen - length handed over together with the buffer
//   regexSame  - false: the buffer is scanned with InitalRegex()/GetUrl() and
//                every hit accepted by onFetchUrl() is added;
//                true:  the buffer itself is reduced to a single URL by
//                stripping its attribute prefix and trailing quote.
int	ISpiderFetchUrl::FetchUrl(char* context,int contextLen,bool regexSame)
{
	// Reset the per-call scanning state.
	m_Context	=context;
	m_ContextLen=contextLen;
	m_CurrentP	=m_Context;
	m_UrlCount	=0;

	if(regexSame)
	{
		m_TempUrl	=CMyString::StringFromMem(m_CurrentP,0,contextLen);
		// Remove the trailing quote character.
		m_TempUrl.EraseFromRight(1);
		// Remove the leading href=" (6 characters) or src=" (5 characters).
		m_TempUrl.Erase(0,(m_TempUrl[0]=='h')?6:5);
		// Remove "http://".
		// NOTE(review): this erases the first 7 characters whenever "http://"
		// occurs anywhere in the string - confirm that is intended.
		if(m_TempUrl.FindString("http://")!=-1)
			m_TempUrl.Erase(0,7);
		// The length is queried here but not used afterwards.
		int len=m_TempUrl.GetStrLen();

		m_TempUrl.Trim();
		AddUrl();
		return GetUrlCount();
	}

	// Regex-driven path: pull candidates one by one and keep the accepted ones.
	InitalRegex(context,contextLen);
	while(GetUrl())
	{
		if(!this->onFetchUrl(m_TempUrl.GetBuffer(),context,contextLen))
			continue;
		AddUrl();
	}
	return GetUrlCount();
}
// Walks m_scopeList starting at m_idxRunningScope. For every scope whose
// adapter exposes nsIMsgSearchAdapter, the string obtained from GetEncoding()
// is passed to AddUrl(). The walk stops at the first scope that is neither
// onlineMail nor news and has a search server set. GetNextUrl() is called
// when the final index is greater than zero. Always returns NS_OK.
nsresult nsMsgSearchSession::BuildUrlQueue ()
{
    PRInt32 scopeIdx = m_idxRunningScope;
    while (scopeIdx < m_scopeList.Count())
    {
        nsMsgSearchScopeTerm *term = m_scopeList.ElementAt(scopeIdx);

        const bool stopHere = term->m_attribute != nsMsgSearchScope::onlineMail
                              && term->m_attribute != nsMsgSearchScope::news
                              && term->m_searchServer;
        if (stopHere)
            break;

        nsCOMPtr <nsIMsgSearchAdapter> searchAdapter = do_QueryInterface(term->m_adapter);
        if (searchAdapter)
        {
            nsCString encodedUrl;
            searchAdapter->GetEncoding(getter_Copies(encodedUrl));
            AddUrl (encodedUrl.get());
        }

        ++scopeIdx;
    }

    if (scopeIdx > 0)
        GetNextUrl();

    return NS_OK;
}
Esempio n. 5
0
	/// Calls AddUrl with an empty braced initializer, i.e. without a
	/// pre-filled value for its argument.
	void RadioWidget::handleAddUrl ()
	{
		AddUrl ({});
	}
Esempio n. 6
0
//CCrawl类的总控制函数
void CCrawl::DoCrawl()
{
	//set the signal function 作用????????????????
/*	signal(SIGTERM,SigTerm);
	signal(SIGILL,SigTerm);
	signal(SIGFPE,SigTerm);
	signal(SIGINT,SIG_IGN);
	signal(SIGBREAK,SIG_IGN);*/

	//output the begin time
	char str_time[128];
	time_t t_date;
	memset(str_time,0,128);
	time(&t_date);
	strftime(str_time,128,"%a, %d %b %Y %H:%M:%S GMT",localtime(&t_date));

	cout << "\n\nBegin at: " << str_time << "\n\n";

	//从文件中获取到其他的信息
	GetVisitedUrlMd5();
	GetVisitedPageMd5();
	GetIpBlock();
	GetUnreachHostMd5();
	

	//打开url种子库文件
	ifstream ifs_seed(input_file_name.c_str());
	
	if(!ifs_seed)
	{
		cout<<"can not open the url seed file "<<input_file_name<<endl;
		return ;
	}

	//打开文件输出流
	OpenFilesForOutput();

	//创建线程
	DWORD dw_thread_id[NUMBER_WORKERS];
	HANDLE h_thread[NUMBER_WORKERS];

	for(int i = 0;i<NUMBER_WORKERS;i++)
	{
		h_thread[i] = CreateThread(NULL,0,start,this,0,&dw_thread_id[i]);

		if(h_thread[i] == NULL)
		{
			cout<<"can not create the thread "<<i<<endl;
			ExitProcess(i);
		}
	}


	string str_url;
	CPage ipage;

	while(getline(ifs_seed,str_url))
	{
		string::size_type index;

		if(str_url[0] == '\0' ||str_url[0] == '#'
			|| str_url[0] == '\n')
			continue;

		index = str_url.find('\t');
		if(index != string::npos)
		{
			str_url = str_url.substr(0,index);
		}

		index = CStrFunction::FindCase(str_url,"http");
		if(index == string::npos)
		{
			index = str_url.find('/');
			if(index == string::npos)
			{
				str_url = "http://" + str_url + "/";
			}
			else
				str_url = "http://" + str_url;
		}

		if(ipage.IsFilterLink(str_url))
			continue;
		AddUrl(str_url.c_str());
	}

	//获得未访问的url
	ifstream ifs_unvisited_url(UNVISITED_FILE.c_str(),ios::binary);
	if(!ifs_unvisited_url)
	{
		while(getline(ifs_unvisited_url,str_url))
		{
			string::size_type index;

			if(str_url[0] == '\0' ||str_url[0] == '#'
				|| str_url[0] == '\n')
				continue;

			index = str_url.find('\t');
			if(index != string::npos)
			{
				str_url = str_url.substr(0,index);
			}

			if(ipage.IsFilterLink(str_url))
				continue;

			AddUrl(str_url.c_str());
				
		}
	}
	else
		cout<<"can not open the file : "<<UNVISITED_FILE<<endl;

	b_f_over = true;
	
	cout<<"finished to get all unvisited urls"<<endl;
	//等待所有的线程结束
	WaitForMultipleObjects(NUMBER_WORKERS,h_thread,false,INFINITE);
	//关闭所有的线程句柄
	for(int i = 0;i<NUMBER_WORKERS;i++)
	{
		CloseHandle(h_thread[i]);
	}
	
	cout<<"all of "<<NUMBER_WORKERS<<" thread have been closed"<<endl;

	SaveUnvisitedUrl();
	SaveReplicas("repli");

	memset(str_time,0,128);
	time(&t_date);
	strftime(str_time, 128, "%a, %d %b %Y %H:%M:%S GMT", localtime(&t_date));
	cout << "\n\nEnd at: " << str_time << "\n\n";

}
Esempio n. 7
0
// Releases the header and body buffers handed back by CHttp::Fetch and resets
// both pointers to NULL. Safe to call with NULL pointers.
static void ReleaseFetchBuffers(char *&file_header,char *&downroaded_file)
{
	free(file_header);
	file_header = NULL;

	free(downroaded_file);
	downroaded_file = NULL;
}

// Downloads the page behind `iurl`, follows redirects, de-duplicates it
// against the visited-URL and visited-page digest sets, stores it and queues
// the hyperlinks it contains.
//
//   pse_file_ptr          - destination passed to SavePseRawData()
//   link_for_pse_file_ptr - not used by this function
//   iurl                  - URL to fetch (origin_url / host_name are read)
//   nGsock                - socket descriptor offered for reuse by the fetch
//
// Result codes of CHttp::Fetch handled here (per the original comments):
//   -300 redirect, -1 generic error, -2 host inside the blocked IP range,
//   -3 invalid IP address, -4 image content.
//
// NOTE(review): nGsock is taken by value, so assignments to it below are not
// visible to the caller - confirm whether a reference was intended.
void CCrawl::DownroadFile(CPSEFile *pse_file_ptr,CLinkForPSEFile *link_for_pse_file_ptr,
		CUrl iurl,int nGsock)
{
	char *downroaded_file = NULL,	// page body
		*file_header = NULL,		// page header
		*location = NULL;			// redirect target

	int file_length = 0;			// real byte length of the page body
	string str_url_location = "";	// absolute form of the redirect target

	// Start from the descriptor handed in so that a connection to the same
	// host can be reused by the fetch.
	int nsock = nGsock;
	cout<<"PID = "<<GetCurrentThreadId()<<" nsock = "<<nsock<<endl;

	CHttp ihttp;
	// The actual page fetch.
	file_length = ihttp.Fetch(iurl.origin_url,&downroaded_file,&file_header,&location,&nsock);

	int location_count = 0;	// number of redirects followed; give up after 3

	while(file_length == -300)	// redirected
	{
		// Give up on a missing, empty or over-long target, or after three
		// redirects. The NULL test comes first so strlen() never sees NULL.
		if(!location || strlen(location)> URL_LEN -1 || location_count == 3 ||strlen(location) == 0)
		{
			if(location)
			{
				free(location);
				location = 0;
			}
			file_length = -1;
			break;
		}

		// Keep the target for the next fetch and release the C buffer.
		str_url_location = location;
		free(location);
		location = 0;

		// The target may be relative; turn it into an absolute URL
		// (same approach as the hyperlink extraction in CPage).
		string::size_type index1 = CStrFunction::FindCase(str_url_location,"http");
		if(index1 != 0)	// does not start with "http"
		{
			char c1 = iurl.origin_url.at(iurl.origin_url.length()-1);
			char c2 = str_url_location.at(0);

			if(c2 == '/')	// host-relative target
				str_url_location = "http://" + iurl.host_name + str_url_location;
			else if(c1 != '/' && c2 != '/')
			{
				string::size_type index;
				index = iurl.origin_url.rfind('/');
				if(index != string::npos)
				{
					// index > 6 means the '/' found is past "http://".
					if(index > 6)
					{
						str_url_location = iurl.origin_url.substr(0,index+1) + str_url_location;
					}
					else
						str_url_location = iurl.origin_url + "/" + str_url_location;
				}
				else
				{
					file_length = -1;
					break;
				}
			}
			else
			{
				if(c1 == '/')
					str_url_location = iurl.origin_url + str_url_location;
				else
					str_url_location = iurl.origin_url + "/" + str_url_location;
			}

		}

		CPage ipage;
		if(ipage.IsFilterLink(str_url_location))
		{
			file_length = -1;
			break;
		}

		// Use the same descriptor variable as the first fetch so that the
		// socket produced here is the one carried over after the loop.
		cout<<"PID= "<<GetCurrentThreadId()<<" sock= "<<nsock<<endl;
		file_length = ihttp.Fetch(str_url_location,&downroaded_file,&file_header,&location,&nsock);
		location_count++;
	}

	nGsock = nsock;	// carry the descriptor used by the fetches forward

	if(file_length == -1)	// generic error reported by CHttp::Fetch
	{
		cout<<"error in the page of"<<iurl.origin_url<<endl;
		ReleaseFetchBuffers(file_header,downroaded_file);
		cout<<"unreach host : "<<iurl.host_name<<endl;
		return ;
	}

	if(file_length == -2)	// host is inside the blocked IP range
	{
		ReleaseFetchBuffers(file_header,downroaded_file);
		SaveUnreachHost(iurl.host_name);
		cout<<"out of block host : "<<iurl.host_name<<endl;
		return ;
	}

	if(file_length == -3)	// invalid IP address
	{
		ReleaseFetchBuffers(file_header,downroaded_file);
		cout<<"invalid host : "<<iurl.host_name<<endl;
		return ;
	}

	if(file_length == -4)	// image content
	{
		ReleaseFetchBuffers(file_header,downroaded_file);
		cout<<"image host : "<<iurl.host_name<<endl;
		return ;
	}

	// A page is only considered normal when both header and body are present.
	if(!file_header || !downroaded_file)
	{
		ReleaseFetchBuffers(file_header,downroaded_file);
		closesocket(nGsock);
		nGsock = -1;
		cout<<"not the nomal host"<<endl;
		return ;
	}

	// Hand the fetched data to CPage; the raw buffers are released right after.
	CPage ipage(iurl.origin_url,str_url_location,file_header,downroaded_file,file_length);
	ReleaseFetchBuffers(file_header,downroaded_file);

	// Parse the header information.
	ipage.ParseHeaderInfo(ipage.header);

	if(ipage.connection_state == false)
	{
		closesocket(nGsock);
		nGsock = -1;
	}

	// Keep only the content types of interest (image pages are ignored).
	if(ipage.content_type != "text/html"
		&&ipage.content_type != "text/plain"
		&&ipage.content_type != "text/xml"
		&&ipage.content_type != "application/msword"
		&&ipage.content_type != "application/pdf"
		&&ipage.content_type != "text/rtf"
		&&ipage.content_type != "application/postscript"
		&&ipage.content_type != "application/vnd.ms-excel"
		&&ipage.content_type != "application/vnd.ms-powerpoint")
	{
		cout<<"unwant host type"<<iurl.host_name<<endl;
		return ;
	}

	// Decompression (disabled).
	// For gzip-encoded content: decompress first, then extract hyperlinks.
/*	char unzip_content_buffer[1024000];
	int unzip_length = 0;

	if(ipage.content_encoding == "gzip" && ipage.content_type == "text/html")
	{
		gzFile zip;
		string ofs_gzip_name;
		ofs_gzip_name = CStrFunction::itos(GetCurrentThreadId()) + ".gz";

		// Open in binary, truncating mode (ios::trunc empties an existing
		// file or creates a new one).
		ofstream ofs_downroad_file(ofs_gzip_name.c_str(),ios::trunc|ios::binary);

		cout<<"file length : "<<endl;
		ofs_downroad_file.write(ipage.body_content.c_str(),ipage.body_content_length);
		ofs_downroad_file.close();

		zip = gzopen(ofs_gzip_name.c_str(),"rb");

		if(zip == NULL)
		{
			cout<<"open zip file "<<ofs_gzip_name.c_str()<<" error ."<<endl;
			exit(-1);

		}

		// Decompress the body into unzip_content_buffer.
		unzip_length = gzread(zip,unzip_content_buffer,1024000);
		if(unzip_length == -1)
		{
			cout<<"read zip file "<<ofs_gzip_name.c_str()<<" error ."<<endl;
			exit(-1);
		}

		unzip_content_buffer[unzip_length] = 0;
		gzclose(zip);
	}

	*/
	CMD5 imd5;
	string str_digest;

	// URL de-duplication: return if the digest of this URL is already in
	// set_visited_url_md5, otherwise record and persist it.
	imd5.GenerateMd5((unsigned char *)iurl.origin_url.c_str(),iurl.origin_url.length());
	str_digest = imd5.ToString();

	WaitForSingleObject(mutex_visited_url_md5,INFINITE);

	if(set_visited_url_md5.find(str_digest) != set_visited_url_md5.end())	// already crawled
	{
		cout<<"the url :"<<iurl.origin_url.c_str()<<" have crawled !"<<endl;

		ReleaseMutex(mutex_visited_url_md5);
		return ;
	}

	set_visited_url_md5.insert(str_digest);
	SaveVisitedUrlMd5(str_digest);
	ReleaseMutex(mutex_visited_url_md5);

	// Content de-duplication: same scheme, keyed on the digest of the body.
	imd5.GenerateMd5((unsigned char *)ipage.body_content.c_str(),ipage.body_content.length());
	str_digest = imd5.ToString();

	WaitForSingleObject(mutex_visited_page_md5,INFINITE);
	// Record the body-digest -> URL relation in replicas.
	replicas.insert(pair<string,string>(str_digest,iurl.origin_url));

	if(set_visited_page_md5.find(str_digest) != set_visited_page_md5.end())	// mirror of a known page
	{
		cout<<"the page  have crawled !"<<endl;

		ReleaseMutex(mutex_visited_page_md5);
		return ;
	}

	set_visited_page_md5.insert(str_digest);
	SaveVisitedPageMd5(str_digest);
	ReleaseMutex(mutex_visited_page_md5);


	// Store the fetched page in the raw page store (PSE format).
	SavePseRawData(pse_file_ptr,&iurl,&ipage);

	if(ipage.location.length()<1)
	{
		SaveVisitedUrl(iurl.origin_url);
	}
	else
		SaveVisitedUrl(ipage.location);

	if(ipage.content_type != "text/html")	// hyperlinks are only extracted from text/html
		return ;

	// Save all links of this page and queue them for crawling.
	if (ipage.ParseHyperLinks() == false)
	{
		return;
	}

	SaveLinkForPSE( &ipage);
	SaveLinkForHistory( &ipage);

	for(map<string,string>::iterator it = ipage.map_link_for_pse.begin();
		it != ipage.map_link_for_pse.end(); ++it)
	{
		AddUrl( it->first.c_str() );
	}

	return ;
}