/**
 * Reports that the page behind `url` has changed.
 *
 * If "newPageReport.txt" can be opened for appending, a timestamp followed by
 * the URL string and the page MD5 string is appended to it. The same URL/MD5
 * message is always written to standard output, whether or not the file
 * could be opened.
 *
 * @param url  The URL whose string form and page MD5 are reported.
 */
void wmtor::reportNewWebPage(Url& url)
{
    ofstream report("newPageReport.txt", ios::app);
    if (report.is_open())
    {
        time_t now = time(NULL);
        // ctime() hands back a pointer to a static buffer whose text already
        // ends in '\n', so the endl that follows leaves a blank line.
        char *stamp = ctime(&now);
        report << stamp << endl
               << "Url Changed:" << url.getUrlStr()
               << "\nPageMD5:" << url.getPageMD5Str() << endl;
        report.close();
    }

    // Console echo happens even when the report file is unavailable.
    cout << "Url Changed:" << url.getUrlStr()
         << "\nPageMD5:" << url.getPageMD5Str() << endl;
}
/**
 * Requests the page for `url` and reacts if its content digest has changed.
 *
 * Calls cli.requestWebPage(url, header, content) (presumably filling the
 * header and content objects -- confirm against HttpClient), writes the URL
 * and the header string to `ofile`, and echoes the URL and ETag to standard
 * output. When the content's MD5 string is non-empty and differs from the
 * one stored on `url`, the URL's ETag and MD5 are updated, its age is reset
 * to 0, and saveNewPage() and reportNewWebPage() are invoked.
 *
 * @param url    URL to check; updated in place when the page has changed.
 * @param cli    HTTP client used to issue the request.
 * @param ofile  Open stream that receives the URL and header text.
 */
void wmtor::CheckUrl_wholePage(Url &url, HttpClient &cli, ofstream &ofile)
{
    HttpHeader respHeader;
    HttpContent respBody;
    cli.requestWebPage(url, respHeader, respBody);

    ofile << "url:" << url.getUrlStr() << endl;
    ofile << "header:" << respHeader.getHeaderStr() << endl;

    cout << "url:" << url.getUrlStr() << endl;
    cout << "urlEtag:" << respHeader.getEtag() << endl;

    string freshDigest = respBody.getMD5Str();

    // Nothing to do when no digest is available or the digest is unchanged.
    if (freshDigest.empty() || freshDigest == url.getPageMD5Str())
    {
        return;
    }

    url.setEtag(respHeader.getEtag());
    url.setAge(0);
    url.setPageMD5Str(freshDigest);
    saveNewPage(url, respBody, respHeader);
    reportNewWebPage(url);
}
void wmtor::start() { if (urlist.empty()) { cout << "Empty queue." << endl; return; } else { ofstream recordFile("recordFile.txt"); if (!recordFile.is_open()) { cerr << "Cannot open recordFile.txt!" << endl; return; } int count = 0; int max_count = 500; HttpClient cli; ofstream ofile("hfiles.txt", ios::app); if (!ofile.is_open()) { cerr << "Cannot open file: header.txt" << endl; recordFile.close(); return; } char *t_str; time_t cur_time; priority_queue<Url> urlist2; while (count < max_count) { cout << "Now checking for " << ++count << endl; cur_time = time(NULL); t_str = ctime(&cur_time); ofile << t_str << endl; //free(t_str); while (!urlist.empty()) { Url url = urlist.top(); urlist.pop(); CheckUrl_wholePage(url, cli, ofile); recordFile << count << '\t' << url.getUrlStr() << '\t'; recordFile << url.getPageMD5Str() << '\t' << url.getAge() << '\t' << url.getEtag() << endl; urlist2.push(url); } #ifdef _MSC_VER Sleep(sleep_time); #else sleep(sleep_time); #endif cout << "Now checking for " << ++count << endl; cur_time = time(NULL); t_str = ctime(&cur_time); ofile << t_str << endl; //free(t_str); while (!urlist2.empty()) { Url url = urlist2.top(); urlist2.pop(); CheckUrl_wholePage(url, cli, ofile); recordFile << count << '\t' << url.getUrlStr() << '\t'; recordFile << url.getPageMD5Str() << '\t' << url.getAge() << '\t' << url.getEtag() << endl; urlist.push(url); } #ifdef _MSC_VER Sleep(sleep_time); #else sleep(sleep_time); #endif } ofile.close(); recordFile.close(); } }