// 仅用来搜索关键字,结果保存在char* resultFile中 void SearchKeysInFile(const char* file,const char * keys,const char * resultFile) { // text //char * text = "12.5, a1.1, 0.123, 178"; // declare //static CRegexpT <char> regexp("\\b\\d+\\.\\d+"); //char * key=m_Keyword.GetBuffer(); static CRegexpT <char> regexp(keys); CContext * pContext; //读文件,分块查找 std::ifstream ifs; ifs.open(file,ifstream::binary); ifs.seekg(0,ifstream::beg); std::ofstream ofs; ofs.open(resultFile,ofstream::binary); ofs.clear(); //ofs.seekg(0,ofstream::beg); char * pBuffer=new char[BLOCK_SIZE]; while(!ifs.eof()) { ifs.read(pBuffer,BLOCK_SIZE); ////////////////////////////////////////////////////////////////////////// //分块查找 // prepare pContext= regexp.PrepareMatch(pBuffer); // loop MatchResult result = regexp.Match(pContext); while( result.IsMatched() ) { // 写入结果文件 // printf("%.*s\n", result.GetEnd() - result.GetStart(), pBuffer + result.GetStart()); // 先转换,再写入 ofs.write(pBuffer + result.GetStart(),result.GetEnd() - result.GetStart()); ofs<<endl; ofs<<"——————————————————————"<<endl; // get next result = regexp.Match(pContext); } } // 搜索成功结束 ifs.close(); ofs.close(); // release regexp.ReleaseContext(pContext); delete[] pBuffer; }
int main(int argc, char * argv[]) { std::ifstream fs("C:\\Lookup\\112.127.141.86.html"); std::string in; load_file(in, fs); fs.close(); //static CRegexpT <char> regexp1("\\d+"); static CRegexpT <char> regexp1("target=\\\"_blank\\\"\\>(.+?)\\</a\\>\\</td\\>"); // loop MatchResult result1 = regexp1.Match(in.c_str()); while( result1.IsMatched() ) { //GetEnd匹配成功后,获取所匹配到的子字符串的结束位置。如果匹配失败,则返回负值。 //GetStart匹配成功后,获取所匹配到的子字符串的开始位置。如果匹配失败,则返回负值。 printf("%.*s\n", result1.GetEnd() - result1.GetStart(), in.c_str() + result1.GetStart()); // get next result1 = regexp1.Match(in.c_str(), result1.GetEnd()); //返回匹配结果 MatchResult 对象。 // 通过 MatchResult 对象,可以得知是否匹配成功。如果成功,通过 MatchResult 对象可以获取捕获信息。 } // text char * text = "http://www.cppprog.com/2009/0112/48.html"; // declare static CRegexpT <char> regexp("\\d+"); // loop MatchResult result = regexp.Match(text); //IsMatched返回非零值表示匹配成功,返回 0 表示匹配失败。 while( result.IsMatched() ) { //GetEnd匹配成功后,获取所匹配到的子字符串的结束位置。如果匹配失败,则返回负值。 //GetStart匹配成功后,获取所匹配到的子字符串的开始位置。如果匹配失败,则返回负值。 printf("%.*s\n", result.GetEnd() - result.GetStart(), text + result.GetStart()); // get next result = regexp.Match(text, result.GetEnd()); //返回匹配结果 MatchResult 对象。 // 通过 MatchResult 对象,可以得知是否匹配成功。如果成功,通过 MatchResult 对象可以获取捕获信息。 } return 0; }
pfc::string8 provider_darklyrics::lookup_one(unsigned p_index, const metadb_handle_ptr & p_meta, threaded_process_status & p_status, abort_callback & p_abort) { TRACK_CALL_TEXT("provider_darklyrics::lookup_one"); const float threshold = 0.8f; const pfc::string8 site = "darklyrics.com"; // Regular Expression Class CRegexpT<char> regexp; MatchResult match; // Buffer pfc::string8 buff; try { // Init fetcher curl_wrapper_simple fetcher(&m_config_item); const metadb_handle_ptr & p = p_meta; if (p.is_empty()) { return ""; } pfc::string8_fast artist, title, album, keywords; file_info_impl info; p->get_info(info); // Get count of artists t_size count = info.meta_get_count_by_name("album"); if (count == 0) return ""; // Get Album album = info.meta_get("album", 0); count = info.meta_get_count_by_name("title"); if (count == 0) return ""; // Get TITLE title = info.meta_get("title", 0); count = info.meta_get_count_by_name("artist"); // Iterate through all artists listed for (int j = 0; j < count; j++) { // Get Artist artist = info.meta_get("artist", j); //Fetching from HTTP keywords = artist; keywords += "+"; keywords += album; keywords.replace_char(' ', '+'); // Get it now try { fetcher.fetch_googleluck(site, keywords, buff); } catch (pfc::exception & e) { console_error(e.what()); continue; } catch (...) { continue; } const char * regex_ahref = "<a\\shref=\"#(?P<no>\\d+)\">(?P<text>.+?)</a>"; // expression for extract lyrics regexp.Compile(regex_ahref, IGNORECASE); // match MatchResult result = regexp.Match(buff.get_ptr()); int noGroup = regexp.GetNamedGroupNumber("no"); int textGroup = regexp.GetNamedGroupNumber("text"); int jump_to = 0; pfc::string8_fast compare = title; compare.insert_chars(0, ". "); float good; float best = 0.0f; while (result.IsMatched()) { int gStart = result.GetGroupStart(noGroup); int gEnd = result.GetGroupEnd(noGroup); pfc::string8_fast temp(buff.get_ptr() + gStart, gEnd - gStart); int no = StrToIntA(temp); gStart = result.GetGroupStart(textGroup); gEnd = result.GetGroupEnd(textGroup); temp = pfc::string8_fast(buff.get_ptr()+gStart, gEnd - gStart); int levDist = LD(compare, compare.get_length(), temp, temp.get_length()); good = 1.0f - (levDist / (float)compare.get_length()); if (good >= threshold && good > best) { jump_to = no; best = good; } result = regexp.Match(buff.get_ptr(),result.GetEnd()); } if (jump_to == 0) { continue; } char regex_lyrics[100]; sprintf(regex_lyrics, "<a\\s+name=%d><font*(.*?)</font*(.*?)>(?P<lyrics>.+?)<font", jump_to); // expression for extract lyrics regexp.Compile(regex_lyrics, IGNORECASE | SINGLELINE); noGroup = regexp.GetNamedGroupNumber("lyrics"); result = regexp.Match(buff.get_ptr()); if (result.IsMatched()) { int nStart = result.GetGroupStart(noGroup); int nEnd = result.GetGroupEnd(noGroup); pfc::string8 lyric(buff.get_ptr() + nStart, nEnd - nStart); convert_html_to_plain(lyric); if (lyric.get_length() > 0) { string_helper::remove_beginning_linebreaks(lyric); string_helper::remove_end_linebreaks(lyric); string_helper::remove_beginning(lyric, ' '); string_helper::remove_beginning(lyric, '\t'); return lyric; } } } } catch (pfc::exception & e) { console_error(e.what()); return ""; } catch (...) { return ""; } return ""; }
//************************************************************************ //* Dark Lyrics * //************************************************************************ pfc::string_list_impl * provider_darklyrics::lookup(unsigned p_index, metadb_handle_list_cref p_meta, threaded_process_status & p_status, abort_callback & p_abort) { TRACK_CALL_TEXT("provider_darklyrics::lookup"); const float threshold = 0.8f; const pfc::string8 site = "darklyrics.com"; // Regular Expression Class CRegexpT<char> regexp; MatchResult match; // Buffer pfc::string8 buff; pfc::string_list_impl * str_list = new pfc::string_list_impl; try { // Init fetcher curl_wrapper_simple fetcher(&m_config_item); for (t_size i = 0; i < p_meta.get_count(); ++i) { if (p_abort.is_aborting()) break; // Sleep how_to_sleep(i); // Clear buff buff.reset(); const metadb_handle_ptr & p = p_meta.get_item(i); if (p.is_empty()) { str_list->add_item(""); continue; } // Set progress pfc::string8_fast path = file_path_canonical(p->get_path()); // add subsong index? if (p->get_subsong_index() > 0) { path.add_string(" /index:"); path.add_string(pfc::format_uint(p->get_subsong_index())); } p_status.set_item_path(path); p_status.set_progress(i + 1, p_meta.get_count()); pfc::string8_fast artist, title, album, keywords; file_info_impl info; p->get_info(info); // Get count of artists t_size count = info.meta_get_count_by_name("album"); if (count == 0) continue; // Get Album album = info.meta_get("album", 0); count = info.meta_get_count_by_name("title"); if (count == 0) continue; // Get TITLE title = info.meta_get("title", 0); count = info.meta_get_count_by_name("artist"); bool found = false; // Iterate through all artists listed for (int j = 0; j < count && !found; j++) { // Get Artist artist = info.meta_get("artist", j); keywords = artist; keywords += "+"; keywords += album; keywords.replace_char(' ', '+'); // Get it now try { fetcher.fetch_googleluck(site, keywords, buff); } catch (pfc::exception & e) { console_error(e.what()); continue; } catch (...) { continue; } const char * regex_ahref = "<a\\shref=\"#(?P<no>\\d+)\">(?P<text>.+?)</a>"; // expression for extract lyrics regexp.Compile(regex_ahref, IGNORECASE | SINGLELINE); // match MatchResult result = regexp.Match(buff.get_ptr()); int noGroup = regexp.GetNamedGroupNumber("no"); int textGroup = regexp.GetNamedGroupNumber("text"); int jump_to = 0; pfc::string8_fast compare = title; compare.insert_chars(0, ". "); float good; float best = 0.0f; while (result.IsMatched()) { int gStart = result.GetGroupStart(noGroup); int gEnd = result.GetGroupEnd(noGroup); pfc::string8_fast temp(buff.get_ptr() + gStart, gEnd - gStart); int no = StrToIntA(temp); gStart = result.GetGroupStart(textGroup); gEnd = result.GetGroupEnd(textGroup); temp = pfc::string8_fast(buff.get_ptr()+gStart, gEnd - gStart); if (temp.find_first(title) != -1) { jump_to = no; break; } int levDist = LD(compare, compare.get_length(), temp, temp.get_length()); good = 1.0f - (levDist / (float)compare.get_length()); if (good >= threshold && good > best) { jump_to = no; best = good; } result = regexp.Match(buff.get_ptr(),result.GetEnd()); } if (jump_to == 0) { continue; } char regex_lyrics[100]; sprintf(regex_lyrics, "<a\\s+name=%d><font*(.*?)</font*(.*?)>(?P<lyrics>.+?)<font", jump_to); // expression for extract lyrics regexp.Compile(regex_lyrics, IGNORECASE | SINGLELINE); noGroup = regexp.GetNamedGroupNumber("lyrics"); result = regexp.Match(buff.get_ptr()); if (result.IsMatched()) { int nStart = result.GetGroupStart(noGroup); int nEnd = result.GetGroupEnd(noGroup); pfc::string8 lyric(buff.get_ptr() + nStart, nEnd - nStart); convert_html_to_plain(lyric); if (lyric.get_length() > 0) { string_helper::remove_beginning_linebreaks(lyric); string_helper::remove_end_linebreaks(lyric); string_helper::remove_beginning(lyric, ' '); string_helper::remove_beginning(lyric, '\t'); console::print(lyric); str_list->add_item(lyric); found = true; continue; } } } if (found) continue; else str_list->add_item(""); } } catch (pfc::exception & e) { console_error(e.what()); delete str_list; return NULL; } catch (...) { delete str_list; return NULL; } return str_list; }