void processPage(int sockfd, t_Buffer* document) { char *pos = document->ptr; char query_get[1024] = {0}; char qid[22] = {0}; t_Buffer *header_q = t_Buffer_new(1024); t_Buffer *document_q = t_Buffer_new(1024); srand(1200); // Busca URL de una pregunta de la página while ( (pos = strstr(pos, SEARCH_QUESTION_TOKEN )) ) { // Identifica el qid pos = strstr(pos, SEARCH_QID_TOKEN ); memcpy(qid, pos+5, 21); sprintf( query_get,\ "GET %s%s%s HTTP/%s\r\nHost: %s\r\nUser-Agent: Mozilla Firefox 11.0 \r\n\r\n",\ SEARCH_QUESTION_TOKEN, SEARCH_QID_TOKEN, qid,\ "1.1", HOST); // Enviar if ( send(sockfd, query_get, strlen(query_get), 0) < 0 ) { perror("send()"); exit(EXIT_FAILURE); } receiveHeader(sockfd, header_q); receiveContent(sockfd, document_q, TE_CHUNKED); if ( strstr(document_q->ptr, SEARCH_USER) ) printf("http://%s%s%s%s\n", HOST,SEARCH_QUESTION_TOKEN, SEARCH_QID_TOKEN,qid); sleep(rand() % 3); } printf("\n"); // Liberar recursos t_Buffer_delete(header_q); t_Buffer_delete(document_q); }
int HttpClient::requestContent(Url &url, string &contentStr, int expectContentLength) { // construct a request string requestStr; string path = url.getPath(); if (path.empty()) path = "/"; requestStr = "GET " + path + " HTTP/1.0\r\nHost: " + url.getHost() + "\r\nUser-Agent: openSE/1.0 (Ubuntu11.04)\r\nAccept-Language: zh,en-us\r\nAccept-Charset: gb2312,utf-8\r\nConnection: Keep-Alive\r\n\r\n"; cout << "requestStr:\n" << requestStr << endl; // send request: if (url.getHost() != _preHost) { if (_preSockFd != -1) { closesocket(_preSockFd); _preSockFd = -1; } } int sockFd; bool sendSuccess = false; // try to use previous connection if (_preSockFd != -1) { sockFd = _preSockFd; if (rio_writen(sockFd, requestStr.c_str(), requestStr.size()) == -1) { cerr << "use previous connection:rio_writen error !" << endl; closesocket(_preSockFd); _preSockFd = -1; } else sendSuccess = true; } if (!sendSuccess) { // try to creat a new connection sockFd = tcpConnect(url.getIp(), url.getPort()); if (sockFd == -1) { cerr << "tcpConnect error" << endl; return -1; } // try to use new connection if (rio_writen(sockFd, requestStr.c_str(), requestStr.size()) == -1) { cerr << "rio_writen error for requestStr:" << requestStr << endl; closesocket(sockFd); return -1; } } //// receive http header //string headerStr; //headerStr.reserve(1024); //if(receiveHeader(sockFd, headerStr, DEFAULT_TIMEOUT_SECONDS) <= 0){ // cerr << "receiveHeader error" << endl; // closesocket(sockFd); // _preSockFd = -1; // return -1; // } //cout << "headerStr:\n" << headerStr << endl; //// parser http header //httpHeader.setHeaderStr(headerStr); //// check StatusCode //int stausCode = httpHeader.getStatusCode(); //if(stausCode == -1){ // cerr << "not find status code in httpHeader: " << httpHeader.getHeaderStr() << endl; // } //if(stausCode == 301 || stausCode == 302){ // closesocket(sockFd); // _preSockFd = -1; // string locationUrlStr = httpHeader.getLocation(); // if(locationUrlStr.empty()){ // cerr << "error location in httpHeader: " << httpHeader.getHeaderStr() << endl; // } // //locationStr = location; // Url locationUrl(locationUrlStr); // return requestWebPage(locationUrl, httpHeader, httpContent); // } //if(stausCode < 200 || stausCode > 299){ // closesocket(sockFd); // _preSockFd = -1; // cerr << "status code beyond [200-300) in httpHeader: " << httpHeader.getHeaderStr() << endl; // return -1; // } //// check content type //string contentType = httpHeader.getContentType(); //if(contentType.find("image") != string::npos){ // closesocket(sockFd); // _preSockFd = -1; // cerr << "contentType is image in httpHeader: " << httpHeader.getHeaderStr() << endl; // return -1; // } //// check ContentLength //int contentLength = httpHeader.getContentLength(); //if(contentLength == -1){ // //cerr << "contentLength is not finded in httpHeader: " << httpHeader.getHeaderStr() << endl; // contentLength = MAX_HTTPCONTENT_SIZE/10; // } //if(contentLength == 0){ // closesocket(sockFd); // _preSockFd = -1; // cerr << "contentLength is 0 in httpHeader: " << httpHeader.getHeaderStr() << endl; // return -1; // } //if(contentLength > MAX_HTTPCONTENT_SIZE){ // closesocket(sockFd); // _preSockFd = -1; // cerr << "contentLength > MAX_HTTPCONTENT_SIZE in httpHeader: " // << httpHeader.getHeaderStr() << endl; // return -1; // } // receive content if (receiveContent(sockFd, expectContentLength, contentStr, DEFAULT_TIMEOUT_SECONDS) == -1) { closesocket(sockFd); _preSockFd = -1; cerr << "receiveContent error for url: " << url.getUrlStr() << endl; cout << contentStr << endl; return -1; } else _preSockFd = sockFd; cout << "contentStr:\n" << contentStr << endl; // cout << "content finished,url is:"<<url.getUrlStr()<<endl; // set http content return 0; }