Пример #1
0
// Reports whether the ETag carried by `header` signals a change for `url`.
//
// Returns true only when the header's ETag is non-empty AND differs from the
// ETag currently stored on `url`. An empty header ETag, or one equal to the
// stored ETag, yields false.
bool ETagUrlChecker::check(HttpHeader& header, Url&url) {
    // Short-circuit keeps the original evaluation order: url.getEtag() is
    // only consulted once the header ETag is known to be non-empty.
    return !(header.getEtag() == "" || header.getEtag() == url.getEtag());
}
Пример #2
0
// Requests the HTTP header for `url`, logs it, and refreshes the stored state
// of `url` when the header's ETag is non-empty and differs from url's ETag.
//
//   url   - entry being checked; its ETag, age and page MD5 are updated when
//           the ETags differ.
//   cli   - client used for the header request and, if needed, the content
//           request.
//   ofile - log stream that receives the url string and the header string.
void wmtor::CheckUrl(Url &url, HttpClient&cli, ofstream &ofile) {
    HttpHeader responseHeader;
    cli.requestHeader(url, responseHeader);

    // Persistent log first, then the console trace.
    ofile << "url:" << url.getUrlStr() << endl;
    ofile << "header:" << responseHeader.getHeaderStr() << endl;
    cout << "url:" << url.getUrlStr() << endl;
    cout << "urlEtag:" << responseHeader.getEtag() << endl;

    // Nothing further to do when the header has no ETag or the ETag matches
    // the one already recorded for this url.
    if (responseHeader.getEtag() == "" || responseHeader.getEtag() == url.getEtag())
        return;

    // ETag differs: fetch the body, then update the url's bookkeeping.
    HttpContent page;
    cli.requestContent(url, page, responseHeader.getContentLength());
    string digest = page.getMD5Str();

    url.setEtag(responseHeader.getEtag());
    url.setAge(responseHeader.getAge());
    url.setPageMD5Str(digest);

    savePage(page, responseHeader);
    reportNewWebPage(url);
}
Пример #3
0
void wmtor::start() {
    if (urlist.empty()) {
        cout << "Empty queue." << endl;
        return;
    }
    else {
        ofstream recordFile("recordFile.txt");
        if (!recordFile.is_open()) {
            cerr << "Cannot open recordFile.txt!" << endl;
            return;
        }

        int count = 0;
        int max_count = 500;
        HttpClient cli;
        ofstream ofile("hfiles.txt", ios::app);
        if (!ofile.is_open()) {
            cerr << "Cannot open file: header.txt" << endl;
            recordFile.close();
            return;
        }

        char *t_str;
        time_t cur_time;

        priority_queue<Url> urlist2;
        while (count < max_count) {
            cout << "Now checking for " << ++count << endl;
            cur_time = time(NULL);
            t_str = ctime(&cur_time);
            ofile << t_str << endl;
            //free(t_str);
            while (!urlist.empty()) {
                Url url = urlist.top();
                urlist.pop();
                CheckUrl_wholePage(url, cli, ofile);
                recordFile << count << '\t' << url.getUrlStr() << '\t';
                recordFile << url.getPageMD5Str() << '\t' << url.getAge() << '\t' << url.getEtag() << endl;
                urlist2.push(url);
            }
#ifdef _MSC_VER
            Sleep(sleep_time);
#else
            sleep(sleep_time);
#endif

            cout << "Now checking for " << ++count << endl;
            cur_time = time(NULL);
            t_str = ctime(&cur_time);
            ofile << t_str << endl;
            //free(t_str);
            while (!urlist2.empty()) {
                Url url = urlist2.top();
                urlist2.pop();
                CheckUrl_wholePage(url, cli, ofile);
                recordFile << count << '\t' << url.getUrlStr() << '\t';
                recordFile << url.getPageMD5Str() << '\t' << url.getAge() << '\t' << url.getEtag() << endl;
                urlist.push(url);
            }
#ifdef _MSC_VER
            Sleep(sleep_time);
#else
            sleep(sleep_time);
#endif

        }
        ofile.close();
        recordFile.close();
    }
}