Exemple #1
0
/**
 * Requests the header for `url`, logs it, and refreshes the stored page when
 * the ETag reported by the server differs from the one recorded on `url`.
 *
 * @param url   URL to check; its ETag, age and page MD5 are overwritten when
 *              a differing, non-empty ETag is seen.
 * @param cli   HTTP client used for the header request and the content request.
 * @param ofile Log stream that receives the URL string and the header text.
 */
void wmtor::CheckUrl(Url &url, HttpClient &cli, ofstream &ofile) {
    HttpHeader respHeader;
    cli.requestHeader(url, respHeader);

    // Record what was requested and what came back, in the log file first
    // and then on the console.
    ofile << "url:" << url.getUrlStr() << endl;
    ofile << "header:" << respHeader.getHeaderStr() << endl;
    cout << "url:" << url.getUrlStr() << endl;

    const string remoteEtag = respHeader.getEtag();
    cout << "urlEtag:" << remoteEtag << endl;

    // Nothing to do when the header carries no ETag, or when it matches the
    // ETag already stored on the URL.
    if (remoteEtag == "" || remoteEtag == url.getEtag())
        return;

    // The ETag differs: fetch the body and update the URL's bookkeeping.
    HttpContent body;
    cli.requestContent(url, body, respHeader.getContentLength());
    string bodyDigest = body.getMD5Str();

    url.setEtag(remoteEtag);
    url.setAge(respHeader.getAge());
    url.setPageMD5Str(bodyDigest);

    savePage(body, respHeader);
    reportNewWebPage(url);
}
Exemple #2
0
/**
 * Fetches `url` with an HTTP/1.0 GET and stores the response in `httpHeader`
 * and `httpContent`.
 *
 * The request is first tried on the cached socket (`_preSockFd`) when one is
 * open and the host matches `_preHost`; otherwise, or if that write fails, a
 * new connection is opened with tcpConnect(). 301 and 302 responses are
 * followed by calling this function again with the Location URL.
 *
 * @param url         URL to request (host, path, ip and port are read from it).
 * @param httpHeader  Receives the header text of the last response read; it is
 *                    also set on failure paths that occur after the header
 *                    was received.
 * @param httpContent Receives the response body on success.
 * @return 0 on success; -1 when connecting, sending or receiving fails, when
 *         the status is outside 200-299 (after following 301/302), when a
 *         redirect has no Location, when the content type contains "image",
 *         or when the content length is 0 or above MAX_HTTPCONTENT_SIZE.
 *
 * NOTE(review): `_preHost` is compared here but never assigned in this
 * function, while `_preSockFd` is. Confirm it is maintained elsewhere,
 * otherwise the host check does not describe the cached socket.
 * NOTE(review): redirects are followed without a depth limit, so a redirect
 * loop on the server side recurses until the connection fails.
 */
int HttpClient::requestWebPage(Url &url, HttpHeader &httpHeader, HttpContent &httpContent) {
    // construct a request
    string path = url.getPath();
    if (path.empty())
        path = "/";
    const string requestStr = "GET " + path + " HTTP/1.0\r\nHost: " + url.getHost()
            + "\r\nUser-Agent: openSE/1.0 (Ubuntu11.04)\r\nAccept-Language: zh,en-us\r\nAccept-Charset: gb2312,utf-8\r\nConnection: Keep-Alive\r\n\r\n";

    cout << "requestStr:\n" << requestStr << endl;

    // A cached connection is dropped when the request targets another host.
    if (url.getHost() != _preHost && _preSockFd != -1) {
        closesocket(_preSockFd);
        _preSockFd = -1;
    }

    int sockFd = -1;
    bool sendSuccess = false;

    // try to use previous connection
    if (_preSockFd != -1) {
        sockFd = _preSockFd;
        if (rio_writen(sockFd, requestStr.c_str(), requestStr.size()) == -1) {
            cerr << "use previous connection:rio_writen error !" << endl;
            closesocket(_preSockFd);
            _preSockFd = -1;
        } else {
            sendSuccess = true;
        }
    }

    if (!sendSuccess) {
        // try to create a new connection
        sockFd = tcpConnect(url.getIp(), url.getPort());
        if (sockFd == -1) {
            cerr << "tcpConnect error" << endl;
            return -1;
        }
        // try to use new connection
        if (rio_writen(sockFd, requestStr.c_str(), requestStr.size()) == -1) {
            cerr << "rio_writen error for requestStr:" << requestStr << endl;
            closesocket(sockFd);
            return -1;
        }
    }

    // receive http header
    string headerStr;
    headerStr.reserve(1024);
    if (receiveHeader(sockFd, headerStr, DEFAULT_TIMEOUT_SECONDS) <= 0) {
        cerr << "receiveHeader error" << endl;
        closesocket(sockFd);
        _preSockFd = -1;
        return -1;
    }

    // parse http header
    httpHeader.setHeaderStr(headerStr);

    // check status code; a value of -1 is logged here and then rejected by
    // the range check below.
    const int statusCode = httpHeader.getStatusCode();
    if (statusCode == -1) {
        cerr << "not find status code in httpHeader: " << httpHeader.getHeaderStr() << endl;
    }

    if (statusCode == 301 || statusCode == 302) {
        // The redirect target is requested on its own connection.
        closesocket(sockFd);
        _preSockFd = -1;
        string locationUrlStr = httpHeader.getLocation();
        if (locationUrlStr.empty()) {
            // Without a Location there is nothing to follow; requesting an
            // empty URL would only fail later or redirect again.
            cerr << "error location in httpHeader: " << httpHeader.getHeaderStr() << endl;
            return -1;
        }
        Url locationUrl(locationUrlStr);
        return requestWebPage(locationUrl, httpHeader, httpContent);
    }

    if (statusCode < 200 || statusCode > 299) {
        closesocket(sockFd);
        _preSockFd = -1;
        cerr << "status code beyond [200-300) in httpHeader: " << httpHeader.getHeaderStr() << endl;
        return -1;
    }

    // check content type
    const string contentType = httpHeader.getContentType();
    if (contentType.find("image") != string::npos) {
        closesocket(sockFd);
        _preSockFd = -1;
        cerr << "contentType is image in httpHeader: " << httpHeader.getHeaderStr() << endl;
        return -1;
    }

    // check ContentLength; any negative value is treated as "not provided"
    // and replaced by a fixed read budget.
    int contentLength = httpHeader.getContentLength();
    const bool lengthKnown = (contentLength >= 0);
    if (!lengthKnown) {
        contentLength = MAX_HTTPCONTENT_SIZE / 10;
    }

    if (contentLength == 0) {
        closesocket(sockFd);
        _preSockFd = -1;
        cerr << "contentLength is 0 in httpHeader: " << httpHeader.getHeaderStr() << endl;
        return -1;
    }

    if (contentLength > MAX_HTTPCONTENT_SIZE) {
        closesocket(sockFd);
        _preSockFd = -1;
        cerr << "contentLength > MAX_HTTPCONTENT_SIZE in httpHeader: "
                << httpHeader.getHeaderStr() << endl;
        return -1;
    }

    // receive content
    string contentStr;
    if (receiveContent(sockFd, contentLength, contentStr, DEFAULT_TIMEOUT_SECONDS)
            == -1) {
        closesocket(sockFd);
        _preSockFd = -1;
        cerr << "receiveContent error for url: " << url.getUrlStr() << endl;
        return -1;
    }

    if (lengthKnown) {
        // The body length was declared, so the connection is kept for the
        // next request.
        _preSockFd = sockFd;
    } else {
        // Without a declared length the end of the body cannot be located on
        // the stream, so the socket must not be reused for another request.
        closesocket(sockFd);
        _preSockFd = -1;
    }

    // set http content
    httpContent.setContentStr(contentStr);
    return 0;
}