Beispiel #1
0
/**
 * Append one web-page record for the file at `file_path` to `out`.
 *
 * The record is written as a single line of the form
 *   <doc><docid>ID</docid><url>PATH</url><title>TITLE</title><content>BODY</content></doc>
 * where BODY is every line of the file concatenated without line separators.
 *
 * TITLE defaults to the first line of the file. If any later line contains
 * the title marker (the marker literal below after utf8_to_gbk conversion),
 * TITLE becomes the text that follows the marker on that line. The first
 * line itself is not searched for the marker.
 *
 * If the file cannot be opened a warning is written to stderr and a record
 * with an empty title and empty content is still emitted.
 *
 * @param out        stream that receives the record
 * @param lib_path   not used by this function; kept so existing callers compile
 * @param file_path  file to read; also written verbatim as the record's url
 * @param doc_id     value written into the <docid> element
 */
void build_web_pages_lib(ofstream &out, const string lib_path, const string file_path,
		int doc_id) {
	(void) lib_path; // intentionally unused

	// The marker is converted to GBK before searching.
	// NOTE(review): this presumes the input files are GBK encoded - confirm.
	EncodingConverter trans;
	string title_marker = "【 标  题 】";
	title_marker = trans.utf8_to_gbk(title_marker);

	ifstream in(file_path.c_str());
	if (!in.is_open()) {
		// Every read below fails on a closed stream, so the record comes out
		// empty; say so instead of failing silently.
		cerr << "build_web_pages_lib: cannot open " << file_path << endl;
	}

	string line;
	string web_page_title;

	// Fallback title: the first line of the document.
	if (getline(in, line)) {
		web_page_title = line;
		cout << "doc_id: " << doc_id << " title: " << web_page_title << endl;
	}

	// Look for an explicit title marker in the remaining lines.
	while (getline(in, line)) {
		const string::size_type pos = line.find(title_marker);
		if (pos != string::npos) {
			web_page_title = line.substr(pos + title_marker.size());
			break;
		}
	}

	// Rewind to the beginning of the file. clear() drops eofbit/failbit left
	// by the scan above so that the seek works whether or not EOF was reached.
	in.clear();
	in.seekg(0);

	// Emit the record.
	out << "<doc>" << "<docid>" << doc_id << "</docid>" << "<url>" << file_path
			<< "</url>" << "<title>" << web_page_title << "</title>"
			<< "<content>";

	while (getline(in, line)) {
		out << line;
	}
	out << "</content>" << "</doc>" << endl;
	in.close();
}
/**
 * Return the slice of `text` that begins 6 bytes before `pos` and is `len`
 * bytes long, clipped to the bounds of `text`.
 *
 * When `pos` is closer than 6 bytes to the start of `text`, the part of the
 * window that would fall before index 0 is dropped (the window's end stays
 * where it would have been) instead of letting the start offset underflow.
 */
static string feature_code_window(const string &text, string::size_type pos,
		string::size_type len)
{
	const string::size_type lead = 6; // bytes taken in front of the punctuation
	if (pos >= lead)
	{
		return text.substr(pos - lead, len);
	}
	const string::size_type clipped = lead - pos; // bytes that lie before index 0
	if (len <= clipped)
	{
		return string();
	}
	return text.substr(0, len - clipped);
}

/**
 * Build the feature code of this document by appending, for every comma found
 * in _content, a short slice of the text around it to _feature_code.
 *
 * Two kinds of comma are recognised:
 *  - the double-byte punctuation CHINESE_PUNCT (after utf8_to_gbk conversion),
 *    for which a CUT_LEN byte window is taken;
 *  - the single-byte ENGLISH_PUNCT, for which a CUT_LEN - 1 byte window is taken.
 *
 * A byte with the high bit set is treated as the first byte of a double-byte
 * character and is consumed together with the byte after it.
 * NOTE(review): this presumes _content is GBK encoded - confirm with the caller.
 *
 * _feature_code is appended to, not reset.
 */
void FeatureCode::build_feature_code()
{
	// Pack the converted Chinese punctuation into one 16-bit word. Bytes are
	// read as unsigned so that values >= 0x80 are never shifted as negatives.
	EncodingConverter trans;
	string cpunct = trans.utf8_to_gbk(CHINESE_PUNCT);
	const bool has_punct = cpunct.size() >= 2;
	uint16_t punct = 0;
	if (has_punct)
	{
		const unsigned char hi = static_cast<unsigned char>(cpunct[0]);
		const unsigned char lo = static_cast<unsigned char>(cpunct[1]);
		punct = static_cast<uint16_t>((hi << 8) | lo);
	}

	const string::size_type size = _content.size();
	for (string::size_type ix = 0; ix < size; ++ix)
	{
		const unsigned char lead_byte = static_cast<unsigned char>(_content[ix]);
		if (lead_byte & 0x80)	// first byte of a double-byte character
		{
			if (ix + 1 >= size)
			{
				break;	// truncated character at the very end: nothing left to scan
			}
			const unsigned char trail_byte = static_cast<unsigned char>(_content[ix + 1]);
			const uint16_t tmp = static_cast<uint16_t>((lead_byte << 8) | trail_byte);
			if (has_punct && tmp == punct)	// found the Chinese comma
			{
				_feature_code += feature_code_window(_content, ix, CUT_LEN);
			}
			++ix;	// skip the second byte of the character
		}
		else if (_content[ix] == ENGLISH_PUNCT)	// found the ASCII comma
		{
			_feature_code += feature_code_window(_content, ix, CUT_LEN - 1);
		}
	}
}