Beispiel #1
0
void Query::display()
{
	EncodingConverter trans;
	std::string word;

/*	hash_search_iter hash_iter = hash_query.begin();
	while(hash_iter != hash_query.end())
	{
		word = hash_iter->first ;
		std::cout<<trans.gbkToutf8(word)<<'\t';
		for(std::vector<std::pair<int, double> >::iterator iter = hash_iter->second.begin(); iter != hash_iter->second.end(); ++iter)
		{
			std::cout<<iter->first<<'\t'<<iter->second<<'\t';
		}
		std::cout<<std::endl;
		++hash_iter ;
	}
*/

	m_common_iter iter = m_common_query.begin();
	while(iter != m_common_query.end())
	{
		std::cout<<iter->first<<std::endl;
		for(std::vector<std::pair<std::string, double> >::iterator v_iter = iter->second.begin(); v_iter != iter->second.end(); ++v_iter)
		{
			word = v_iter->first ;
			std::cout<<trans.gbkToutf8(word)<<'\t'<<v_iter->second<<'\t';
		}
		std::cout<<std::endl;
		++iter ;
	}
}
// Transcoding helper for library files.
// Earlier signature, kept for reference:
//   void StringParse(ifstream &ifs, string &in_file_name, vector<string> vec_str)
/**
 * Converts the content of a file with EncodingConverter::gbk_to_utf8 and
 * writes the result to a sibling file.
 *
 * The output name is the input name with its last three characters replaced
 * by "utf8" (so "page.txt" becomes "page.utf8"). Names shorter than three
 * characters simply get "utf8" appended instead of raising std::out_of_range.
 *
 * The input is read line by line; every line is re-joined with a trailing
 * '\n' and the whole text is converted in one call. A completion message is
 * written to log_file.
 *
 * @param in_file_name  Path of the file to convert; it is not modified.
 */
void StringParse(std::string &in_file_name) {
	EncodingConverter trans;

	// Build the output file name: swap the three-character suffix for "utf8".
	const std::string::size_type kSuffixLen = 3;
	std::string out_file_name = in_file_name;
	if (out_file_name.size() >= kSuffixLen) {
		out_file_name.erase(out_file_name.size() - kSuffixLen);
	}
	out_file_name += "utf8";

	std::ifstream ifs; // input text stream
	std::ofstream ofs; // output text stream
	OpenInFile(ifs, in_file_name);
	OpenOutFile(ofs, out_file_name);

	// Slurp the whole input so it can be converted in a single call.
	std::string content;
	std::string line;
	while (std::getline(ifs, line)) {
		content += line;
		content += '\n';
	}

	ofs << trans.gbk_to_utf8(content) << std::endl;
	log_file << "tranform complete " << out_file_name << std::endl;

	ofs.close();
	ifs.close();
}
void Document::display(int docid, WebPage &page)
{
	EncodingConverter trans;
	if(page._docid < 20)
	{
		std::cout<<"page.m_word: "<<page.m_word.size()<<std::endl;
		std::cout<<"page.top_word: "<<page.top_word.size()<<std::endl;
		for(std::vector<std::string>::iterator iter = page.top_word.begin(); iter != page.top_word.end(); ++iter)
		{
			std::string temp = *iter;
			temp = trans.gbkToutf8(temp);
			std::cout<<temp<<"---"<<std::endl;
		}
		std::cout<<"*************"<<std::endl;
	}
}
Beispiel #4
0
/**
 * Appends one document record for file_path to out.
 *
 * The record has the form
 *   <doc><docid>ID</docid><url>PATH</url><title>TITLE</title><content>...</content></doc>
 * followed by a line break. The content is every line of the file written
 * back to back (line breaks are not kept).
 *
 * Title selection: the first line of the file is the default title. If one of
 * the following lines contains the title marker (the literal below, converted
 * with EncodingConverter::utf8_to_gbk), the text after the first such marker
 * replaces the default.
 *
 * If the file cannot be opened, a warning is printed to cerr and a record with
 * an empty title and empty content is still written.
 *
 * @param out        Stream the record is appended to.
 * @param lib_path   Not used by this function.
 * @param file_path  File to read; also written as the record's url.
 * @param doc_id     Identifier written into the <docid> element.
 */
void build_web_pages_lib(ofstream &out, const string lib_path, const string file_path,
		int doc_id) {
	EncodingConverter trans;
	string title = "【 标  题 】";
	title = trans.utf8_to_gbk(title);

	ifstream in(file_path.c_str());
	if (!in.is_open()) {
		cerr << "build_web_pages_lib: cannot open " << file_path << endl;
	}

	string line;
	string web_page_title; // title of the web page
	const string url = file_path; // the page's url is its file path

	// Start with the first line as the title; a marker found later replaces it.
	if (getline(in, line)) {
		web_page_title = line;
		cout << "doc_id: " << doc_id << " title: " << web_page_title << endl;
	}
	while (getline(in, line)) {
		// find() returns size_type; keeping it unsigned makes the npos test exact.
		const string::size_type pos = line.find(title);
		if (string::npos != pos) {
			web_page_title = line.substr(pos + title.size());
			break;
		}
	}

	// Rewind to the start of the file. clear() first so the seek also works
	// after the search loop has run into end-of-file.
	in.clear();
	in.seekg(0);

	// Emit the record.
	out << "<doc>" << "<docid>" << doc_id << "</docid>" << "<url>" << url
			<< "</url>" << "<title>" << web_page_title << "</title>"
			<< "<content>";
	while (getline(in, line)) {
		out << line;
	}
	out << "</content>" << "</doc>" << endl;
	in.close();
}
Beispiel #5
0
/**
 * Starts the UDP server: receives datagrams on server_fd in an endless loop
 * and hands each one to the thread pool as a Task.
 *
 * For every datagram the payload is printed (after
 * EncodingConverter::gbk_to_utf8), then a Task carrying the raw payload and
 * the sender address is added to thread_pool. A failed receive is logged and
 * skipped, so no task is queued for it. This function does not return.
 *
 * A select()-based wait with a 3 second timeout was sketched here earlier but
 * is disabled; the loop blocks in recvfrom().
 *
 * @param thread_pool  Pool that receives one Task per datagram.
 */
void UDPServer::start(ThreadPool &thread_pool) {
	// Up to kMaxDatagram payload bytes are accepted. The buffer has one extra
	// byte that is never written by recvfrom(), so the payload is always
	// NUL-terminated even when a datagram fills the buffer completely.
	const int kMaxDatagram = 1024;
	char recv_buf[kMaxDatagram + 1];
	EncodingConverter trans;
	while(1) {
		std::cout << "recv。。。" << std::endl ;

		// Receive one datagram.
		memset(recv_buf, 0, sizeof(recv_buf));
		memset(&client_addr, 0, sizeof(client_addr));
		// recvfrom() overwrites the length with the actual address size, so
		// it has to be reset before every call.
		socklen_t addr_len = sizeof(client_addr);
		cout << "================" << endl;
		if (-1 == recvfrom(server_fd, recv_buf, kMaxDatagram, 0,
				reinterpret_cast<struct sockaddr*>(&client_addr), &addr_len)) {
			Log::get_instance()->write("UDPServer recv data from client failed!");
			// Nothing was received: do not queue an empty task.
			continue;
		}
		string recv = trans.gbk_to_utf8(string(recv_buf));
		cout << "server recv: " << recv << endl;

		Task task;
		task.set_task(recv_buf);
		task.set_addr(client_addr);
		// Print the address:
		//cout << "UDPServer: " << task._addr.sin_addr.s_addr << endl;

		if(!thread_pool.add_task_to_pool(task)) {
			Log::get_instance()->write("failed to add task to pool");
		}
		cout << "-----------------------------" << endl;
		//cout << "queue size: " << thread_pool.get_task_queue_size() << endl;
	}
}
// Packs two bytes into one 16-bit code. Both bytes are treated as unsigned so
// that bytes with the high bit set are not sign-extended.
static uint16_t feature_code_pack_bytes(char high, char low)
{
	return static_cast<uint16_t>(
			(static_cast<unsigned char>(high) << 8) | static_cast<unsigned char>(low));
}

// Returns the bytes of text in the window [pos - 6, pos - 6 + len), clipped to
// the string. When pos is less than 6 the window starts at the beginning of
// the text and is shortened by the bytes that would lie before it.
static string feature_code_window(const string &text, string::size_type pos,
		string::size_type len)
{
	const string::size_type kLookBehind = 6;
	if(pos >= kLookBehind)
	{
		return text.substr(pos - kLookBehind, len);
	}
	const string::size_type missing = kLookBehind - pos;
	if(len <= missing)
	{
		return string();
	}
	return text.substr(0, len - missing);
}

/**
 * Builds _feature_code from _content.
 *
 * _content is scanned byte by byte. A byte with the high bit set is treated as
 * the first byte of a two-byte character; any other byte is a single-byte
 * character. For every occurrence of the Chinese punctuation (CHINESE_PUNCT
 * converted with EncodingConverter::utf8_to_gbk, first two bytes) a window of
 * CUT_LEN bytes starting six bytes before it is appended to _feature_code; for
 * every ENGLISH_PUNCT the window is CUT_LEN - 1 bytes long.
 *
 * Punctuation within the first six bytes yields a window clipped at the start
 * of the text, and a lone high byte at the very end of _content stops the
 * scan; neither case indexes outside the string.
 *
 * An earlier find()-based variant could locate only the Chinese punctuation
 * and has been removed.
 */
void FeatureCode::build_feature_code()
{
	// Convert the Chinese punctuation into a 16-bit code.
	EncodingConverter trans;
	string cpunct = trans.utf8_to_gbk(CHINESE_PUNCT);
	// Without two bytes there is no code to compare against.
	const bool have_punct = cpunct.size() >= 2;
	const uint16_t punct = have_punct ? feature_code_pack_bytes(cpunct[0], cpunct[1]) : 0;

	const string::size_type size = _content.size();
	for(string::size_type ix = 0; ix < size; ++ix)
	{
		if((_content[ix] & 0x80))	//GBK
		{
			if(ix + 1 >= size)
			{
				// Truncated two-byte character at the end of the text.
				break;
			}
			const uint16_t tmp = feature_code_pack_bytes(_content[ix], _content[ix + 1]);
			if(have_punct && tmp == punct)	//found the Chinese punctuation
			{
				_feature_code += feature_code_window(_content, ix, CUT_LEN);
			}
			++ix;	//skip the second byte of the character
		}
		else	//ASCII
		{
			if(_content[ix] == ENGLISH_PUNCT)	//found the English punctuation
			{
				_feature_code += feature_code_window(_content, ix, CUT_LEN - 1);
			}
		}
	}
}