示例#1
0
// 仅用来搜索关键字,结果保存在char* resultFile中
void SearchKeysInFile(const char* file,const char * keys,const char * resultFile)
{
	// text
	//char * text = "12.5, a1.1, 0.123, 178";

	// declare
	//static CRegexpT <char> regexp("\\b\\d+\\.\\d+");
	//char * key=m_Keyword.GetBuffer();
	static CRegexpT <char> regexp(keys);
	CContext * pContext;

	//读文件,分块查找
	std::ifstream ifs;
	ifs.open(file,ifstream::binary);
	ifs.seekg(0,ifstream::beg);

	std::ofstream ofs;
	ofs.open(resultFile,ofstream::binary);
	ofs.clear();
	//ofs.seekg(0,ofstream::beg);

	char * pBuffer=new char[BLOCK_SIZE];
	while(!ifs.eof())
	{
		ifs.read(pBuffer,BLOCK_SIZE);

		//////////////////////////////////////////////////////////////////////////
		//分块查找

		// prepare
		pContext= regexp.PrepareMatch(pBuffer);

		// loop
		MatchResult result = regexp.Match(pContext);

		while( result.IsMatched() )
		{
			// 写入结果文件
			//	printf("%.*s\n", result.GetEnd() - result.GetStart(), pBuffer + result.GetStart());
			
			// 先转换,再写入
			ofs.write(pBuffer + result.GetStart(),result.GetEnd() - result.GetStart());
			ofs<<endl;
			ofs<<"——————————————————————"<<endl;
			// get next
			result = regexp.Match(pContext);
		}
	}

	// 搜索成功结束
	ifs.close();
	ofs.close();
	

	// release
	regexp.ReleaseContext(pContext);

	delete[] pBuffer;

}
示例#2
0
int main(int argc, char * argv[])
{
	std::ifstream fs("C:\\Lookup\\112.127.141.86.html");
	std::string in;
	load_file(in, fs);
	fs.close();
	//static CRegexpT <char> regexp1("\\d+");
	static CRegexpT <char> regexp1("target=\\\"_blank\\\"\\>(.+?)\\</a\\>\\</td\\>");
	// loop
	MatchResult result1 = regexp1.Match(in.c_str());

	while( result1.IsMatched() )
	{  
		//GetEnd匹配成功后,获取所匹配到的子字符串的结束位置。如果匹配失败,则返回负值。
		//GetStart匹配成功后,获取所匹配到的子字符串的开始位置。如果匹配失败,则返回负值。
		printf("%.*s\n", result1.GetEnd() - result1.GetStart(), in.c_str() + result1.GetStart());
		// get next
		result1 = regexp1.Match(in.c_str(), result1.GetEnd());  //返回匹配结果 MatchResult 对象。
		// 通过 MatchResult 对象,可以得知是否匹配成功。如果成功,通过 MatchResult 对象可以获取捕获信息。

	}

	// text
	char * text = "http://www.cppprog.com/2009/0112/48.html";
	// declare
	static CRegexpT <char> regexp("\\d+");
	// loop
	MatchResult result = regexp.Match(text);
	//IsMatched返回非零值表示匹配成功,返回 0 表示匹配失败。
	while( result.IsMatched() )
	{  
		//GetEnd匹配成功后,获取所匹配到的子字符串的结束位置。如果匹配失败,则返回负值。
		//GetStart匹配成功后,获取所匹配到的子字符串的开始位置。如果匹配失败,则返回负值。
		printf("%.*s\n", result.GetEnd() - result.GetStart(), text + result.GetStart());
		// get next
		result = regexp.Match(text, result.GetEnd());  //返回匹配结果 MatchResult 对象。
		// 通过 MatchResult 对象,可以得知是否匹配成功。如果成功,通过 MatchResult 对象可以获取捕获信息。

	}
	return 0;
}
// Looks up the lyrics of a single track on darklyrics.com.
//
// For each "artist" value of the track, a page is requested through
// fetcher.fetch_googleluck() with the keywords "<artist>+<album>" (spaces
// replaced by '+'). Anchors of the form <a href="#N">text</a> found in the
// page are scored against ". <title>"; the best-scoring anchor whose score
// is at least `threshold` selects section N. That section's text is
// extracted, converted from HTML to plain text and trimmed.
//
// Parameters:
//   p_index  - not used by this function.
//   p_meta   - handle of the track to look up.
//   p_status - not used by this function.
//   p_abort  - polled before each artist is tried; if an abort is pending
//              the search stops and an empty string is returned.
//
// Returns the lyrics, or an empty string when the handle is empty, the track
// has no "album" or "title" value, nothing suitable is found, or an
// exception is caught.
//
// NOTE(review): provider_darklyrics::lookup compiles the anchor pattern with
// IGNORECASE | SINGLELINE and accepts an anchor outright when its text
// contains the title; this function does neither. Confirm whether the
// difference is intentional.
pfc::string8 provider_darklyrics::lookup_one(unsigned p_index, const metadb_handle_ptr & p_meta, threaded_process_status & p_status, abort_callback & p_abort)
{
	TRACK_CALL_TEXT("provider_darklyrics::lookup_one");

	// Minimum similarity score an anchor needs to be accepted.
	const float threshold = 0.8f;

	const pfc::string8 site = "darklyrics.com";

	// Receives the fetched page.
	pfc::string8 buff;

	try
	{
		// Init fetcher
		curl_wrapper_simple fetcher(&m_config_item);

		if (p_meta.is_empty())
		{
			return "";
		}

		pfc::string8_fast title, album, keywords;

		file_info_impl info;
		p_meta->get_info(info);

		// Album and title are both required; only the first value of each is used.
		if (info.meta_get_count_by_name("album") == 0)
			return "";
		album = info.meta_get("album", 0);

		if (info.meta_get_count_by_name("title") == 0)
			return "";
		title = info.meta_get("title", 0);

		// Anchor texts are compared against ". <title>".
		pfc::string8_fast compare = title;
		compare.insert_chars(0, ". ");

		// The anchor pattern is the same for every artist, so compile it once.
		const char * regex_ahref = "<a\\shref=\"#(?P<no>\\d+)\">(?P<text>.+?)</a>";
		CRegexpT<char> href_regex;
		href_regex.Compile(regex_ahref, IGNORECASE);
		const int noGroup = href_regex.GetNamedGroupNumber("no");
		const int textGroup = href_regex.GetNamedGroupNumber("text");

		// Recompiled per artist because the pattern embeds the section number.
		CRegexpT<char> lyrics_regex;

		const t_size count = info.meta_get_count_by_name("artist");

		// Iterate through all artists listed
		for (t_size j = 0; j < count; ++j)
		{
			if (p_abort.is_aborting())
				break;

			// Keywords: "<artist>+<album>" with spaces turned into '+'.
			keywords = info.meta_get("artist", j);
			keywords += "+";
			keywords += album;
			keywords.replace_char(' ', '+');

			// A failed fetch only skips this artist.
			try
			{
				fetcher.fetch_googleluck(site, keywords, buff);
			}
			catch (pfc::exception & e)
			{
				console_error(e.what());
				continue;
			}
			catch (...)
			{
				continue;
			}

			// Pick the anchor whose text is most similar to ". <title>".
			int jump_to = 0;
			float best = 0.0f;

			for (MatchResult result = href_regex.Match(buff.get_ptr());
			     result.IsMatched();
			     result = href_regex.Match(buff.get_ptr(), result.GetEnd()))
			{
				const int noStart = result.GetGroupStart(noGroup);
				const int noEnd = result.GetGroupEnd(noGroup);
				pfc::string8_fast number(buff.get_ptr() + noStart, noEnd - noStart);
				const int no = StrToIntA(number);

				const int textStart = result.GetGroupStart(textGroup);
				const int textEnd = result.GetGroupEnd(textGroup);
				pfc::string8_fast text(buff.get_ptr() + textStart, textEnd - textStart);

				// LD() is presumably the Levenshtein distance — confirm.
				const int levDist = LD(compare, compare.get_length(), text, text.get_length());

				// compare always holds at least ". ", so the divisor is non-zero.
				const float good = 1.0f - (levDist / static_cast<float>(compare.get_length()));

				if (good >= threshold && good > best)
				{
					jump_to = no;
					best = good;
				}
			}

			// 0 means no anchor was accepted for this artist.
			if (jump_to == 0)
			{
				continue;
			}

			// Longest possible result is under 70 characters, so 100 suffices.
			char regex_lyrics[100];

			sprintf(regex_lyrics, "<a\\s+name=%d><font*(.*?)</font*(.*?)>(?P<lyrics>.+?)<font", jump_to);

			// expression for extract lyrics
			lyrics_regex.Compile(regex_lyrics, IGNORECASE | SINGLELINE);

			const int lyricsGroup = lyrics_regex.GetNamedGroupNumber("lyrics");

			MatchResult lyrics_match = lyrics_regex.Match(buff.get_ptr());

			if (lyrics_match.IsMatched())
			{
				const int nStart = lyrics_match.GetGroupStart(lyricsGroup);
				const int nEnd = lyrics_match.GetGroupEnd(lyricsGroup);
				pfc::string8 lyric(buff.get_ptr() + nStart, nEnd - nStart);

				convert_html_to_plain(lyric);

				if (lyric.get_length() > 0)
				{
					string_helper::remove_beginning_linebreaks(lyric);
					string_helper::remove_end_linebreaks(lyric);
					string_helper::remove_beginning(lyric, ' ');
					string_helper::remove_beginning(lyric, '\t');
					return lyric;
				}
			}
		}
	}
	catch (pfc::exception & e)
	{
		console_error(e.what());
		return "";
	}
	catch (...)
	{
		return "";
	}

	return "";
}
//************************************************************************
//*                             Dark Lyrics                              *
//************************************************************************
// Looks up the lyrics of every track in `p_meta` on darklyrics.com.
//
// For each track and each of its "artist" values, a page is requested
// through fetcher.fetch_googleluck() with the keywords "<artist>+<album>"
// (spaces replaced by '+'). Anchors of the form <a href="#N">text</a> found
// in the page select section N: an anchor whose text contains the title is
// taken immediately, otherwise the anchor most similar to ". <title>" with a
// score of at least `threshold` is used. The section's text is extracted,
// converted from HTML to plain text, trimmed, printed to the console and
// added to the result list.
//
// Parameters:
//   p_index  - passed in by the caller; not used by this function.
//   p_meta   - tracks to look up.
//   p_status - receives the current item path and the progress.
//   p_abort  - polled before each track; processing stops when an abort is
//              pending, so the returned list may then be shorter than p_meta.
//
// Returns a list allocated with `new` holding one entry per processed track
// (an empty string when nothing was found), or NULL if an exception was
// caught. Presumably the caller takes ownership of the list — confirm.
pfc::string_list_impl * provider_darklyrics::lookup(unsigned p_index, metadb_handle_list_cref p_meta, threaded_process_status & p_status, abort_callback & p_abort)
{
	TRACK_CALL_TEXT("provider_darklyrics::lookup");

	// Minimum similarity score an anchor needs to be accepted.
	const float threshold = 0.8f;

	const pfc::string8 site = "darklyrics.com";

	// Receives the fetched page.
	pfc::string8 buff;
	pfc::string_list_impl * str_list = new pfc::string_list_impl;

	try
	{
		// Init fetcher
		curl_wrapper_simple fetcher(&m_config_item);

		// The anchor pattern never changes, so compile it once for all tracks.
		const char * regex_ahref = "<a\\shref=\"#(?P<no>\\d+)\">(?P<text>.+?)</a>";
		CRegexpT<char> href_regex;
		href_regex.Compile(regex_ahref, IGNORECASE | SINGLELINE);
		const int noGroup = href_regex.GetNamedGroupNumber("no");
		const int textGroup = href_regex.GetNamedGroupNumber("text");

		// Recompiled per lookup because the pattern embeds the section number.
		CRegexpT<char> lyrics_regex;

		const t_size total = p_meta.get_count();

		for (t_size i = 0; i < total; ++i)
		{
			if (p_abort.is_aborting())
				break;

			// Sleep
			how_to_sleep(i);
			// Clear buff
			buff.reset();

			const metadb_handle_ptr & p = p_meta.get_item(i);

			if (p.is_empty())
			{
				str_list->add_item("");
				continue;
			}

			// Set progress
			pfc::string8_fast path = file_path_canonical(p->get_path());

			// add subsong index?
			if (p->get_subsong_index() > 0)
			{
				path.add_string(" /index:");
				path.add_string(pfc::format_uint(p->get_subsong_index()));
			}

			p_status.set_item_path(path);
			p_status.set_progress(i + 1, total);

			pfc::string8_fast title, album, keywords;

			file_info_impl info;
			p->get_info(info);

			// Album and title are both required. A track lacking either still
			// gets an (empty) entry so the list stays aligned with p_meta,
			// as on every other path through this loop.
			if (info.meta_get_count_by_name("album") == 0 ||
			    info.meta_get_count_by_name("title") == 0)
			{
				str_list->add_item("");
				continue;
			}

			// Only the first value of each field is used.
			album = info.meta_get("album", 0);
			title = info.meta_get("title", 0);

			// Anchor texts are scored against ". <title>".
			pfc::string8_fast compare = title;
			compare.insert_chars(0, ". ");

			const t_size count = info.meta_get_count_by_name("artist");

			bool found = false;

			// Iterate through all artists listed
			for (t_size j = 0; j < count && !found; ++j)
			{
				// Keywords: "<artist>+<album>" with spaces turned into '+'.
				keywords = info.meta_get("artist", j);
				keywords += "+";
				keywords += album;
				keywords.replace_char(' ', '+');

				// A failed fetch only skips this artist.
				try
				{
					fetcher.fetch_googleluck(site, keywords, buff);
				}
				catch (pfc::exception & e)
				{
					console_error(e.what());
					continue;
				}
				catch (...)
				{
					continue;
				}

				// Choose the section number from the page's anchors.
				int jump_to = 0;
				float best = 0.0f;

				for (MatchResult result = href_regex.Match(buff.get_ptr());
				     result.IsMatched();
				     result = href_regex.Match(buff.get_ptr(), result.GetEnd()))
				{
					const int noStart = result.GetGroupStart(noGroup);
					const int noEnd = result.GetGroupEnd(noGroup);
					pfc::string8_fast number(buff.get_ptr() + noStart, noEnd - noStart);
					const int no = StrToIntA(number);

					const int textStart = result.GetGroupStart(textGroup);
					const int textEnd = result.GetGroupEnd(textGroup);
					pfc::string8_fast text(buff.get_ptr() + textStart, textEnd - textStart);

					// An anchor containing the title is accepted immediately.
					if (text.find_first(title) != -1)
					{
						jump_to = no;
						break;
					}

					// LD() is presumably the Levenshtein distance — confirm.
					const int levDist = LD(compare, compare.get_length(), text, text.get_length());

					// compare always holds at least ". ", so the divisor is non-zero.
					const float good = 1.0f - (levDist / static_cast<float>(compare.get_length()));

					if (good >= threshold && good > best)
					{
						jump_to = no;
						best = good;
					}
				}

				// 0 means no anchor was accepted for this artist.
				if (jump_to == 0)
				{
					continue;
				}

				// Longest possible result is under 70 characters, so 100 suffices.
				char regex_lyrics[100];

				sprintf(regex_lyrics, "<a\\s+name=%d><font*(.*?)</font*(.*?)>(?P<lyrics>.+?)<font", jump_to);

				// expression for extract lyrics
				lyrics_regex.Compile(regex_lyrics, IGNORECASE | SINGLELINE);

				const int lyricsGroup = lyrics_regex.GetNamedGroupNumber("lyrics");

				MatchResult lyrics_match = lyrics_regex.Match(buff.get_ptr());

				if (!lyrics_match.IsMatched())
					continue;

				const int nStart = lyrics_match.GetGroupStart(lyricsGroup);
				const int nEnd = lyrics_match.GetGroupEnd(lyricsGroup);
				pfc::string8 lyric(buff.get_ptr() + nStart, nEnd - nStart);

				convert_html_to_plain(lyric);

				if (lyric.get_length() > 0)
				{
					string_helper::remove_beginning_linebreaks(lyric);
					string_helper::remove_end_linebreaks(lyric);
					string_helper::remove_beginning(lyric, ' ');
					string_helper::remove_beginning(lyric, '\t');

					console::print(lyric);

					str_list->add_item(lyric);
					found = true;
				}
			}

			// Nothing found for any artist: keep the slot with an empty entry.
			if (!found)
				str_list->add_item("");
		}
	}
	catch (pfc::exception & e)
	{
		console_error(e.what());
		delete str_list;
		return NULL;
	}
	catch (...)
	{
		delete str_list;
		return NULL;
	}

	return str_list;
}