// Decides whether a URL needs to be re-fetched, based on ETags alone.
//
// header: response header whose ETag is inspected.
// url:    URL record carrying the ETag stored for it.
//
// Returns true only when the header carries a non-empty ETag that differs
// from the one stored on `url`; an empty ETag or a matching one yields false.
bool ETagUrlChecker::check(HttpHeader& header, Url& url) {
    return header.getEtag() != "" && header.getEtag() != url.getEtag();
}
// Checks one URL for changes using its ETag and refreshes it if needed.
//
// url:   URL record to check; its ETag, age and page MD5 are overwritten
//        when the ETag in the response header differs from the stored one.
// cli:   HTTP client used to request the header and, if needed, the content.
// ofile: log stream that receives the URL string and the raw header string.
//
// Nothing beyond logging happens when the header has an empty ETag or the
// ETag equals the one already stored on `url`.
void wmtor::CheckUrl(Url& url, HttpClient& cli, ofstream& ofile) {
    HttpHeader header;
    cli.requestHeader(url, header);

    // Log to the header file first, then echo to stdout.
    ofile << "url:" << url.getUrlStr() << endl;
    ofile << "header:" << header.getHeaderStr() << endl;
    cout << "url:" << url.getUrlStr() << endl;
    // Note: despite the "urlEtag:" label, the value printed is the ETag taken
    // from the response header, not the one stored on `url`.
    cout << "urlEtag:" << header.getEtag() << endl;

    // No ETag in the response: nothing to compare against.
    if (!(header.getEtag() != "")) {
        return;
    }
    // ETag unchanged: the stored state is kept as is.
    if (!(header.getEtag() != url.getEtag())) {
        return;
    }

    // ETag changed: fetch the body and update the URL record.
    HttpContent content;
    cli.requestContent(url, content, header.getContentLength());
    string pageDigest = content.getMD5Str();

    url.setEtag(header.getEtag());
    url.setAge(header.getAge());
    url.setPageMD5Str(pageDigest);

    savePage(content, header);
    reportNewWebPage(url);
}
void wmtor::start() { if (urlist.empty()) { cout << "Empty queue." << endl; return; } else { ofstream recordFile("recordFile.txt"); if (!recordFile.is_open()) { cerr << "Cannot open recordFile.txt!" << endl; return; } int count = 0; int max_count = 500; HttpClient cli; ofstream ofile("hfiles.txt", ios::app); if (!ofile.is_open()) { cerr << "Cannot open file: header.txt" << endl; recordFile.close(); return; } char *t_str; time_t cur_time; priority_queue<Url> urlist2; while (count < max_count) { cout << "Now checking for " << ++count << endl; cur_time = time(NULL); t_str = ctime(&cur_time); ofile << t_str << endl; //free(t_str); while (!urlist.empty()) { Url url = urlist.top(); urlist.pop(); CheckUrl_wholePage(url, cli, ofile); recordFile << count << '\t' << url.getUrlStr() << '\t'; recordFile << url.getPageMD5Str() << '\t' << url.getAge() << '\t' << url.getEtag() << endl; urlist2.push(url); } #ifdef _MSC_VER Sleep(sleep_time); #else sleep(sleep_time); #endif cout << "Now checking for " << ++count << endl; cur_time = time(NULL); t_str = ctime(&cur_time); ofile << t_str << endl; //free(t_str); while (!urlist2.empty()) { Url url = urlist2.top(); urlist2.pop(); CheckUrl_wholePage(url, cli, ofile); recordFile << count << '\t' << url.getUrlStr() << '\t'; recordFile << url.getPageMD5Str() << '\t' << url.getAge() << '\t' << url.getEtag() << endl; urlist.push(url); } #ifdef _MSC_VER Sleep(sleep_time); #else sleep(sleep_time); #endif } ofile.close(); recordFile.close(); } }