Code example #1
// . form an HTTP request 
// . use size 0 for HEAD requests
// . use size -1 for GET whole doc requests
// . fill in your own offset/size for partial GET requests
// . returns false and sets g_errno on error
// . NOTE: http 1.1 uses Keep-Alive by default (use Connection: close to not)
bool HttpRequest::set (char *url,long offset,long size,time_t ifModifiedSince,
		       char *userAgent , char *proto , bool doPost ,
		       char *cookie ) {

	m_reqBufValid = false;

	long hlen ;
	long port = 80;
	char *hptr = getHostFast ( url , &hlen , &port );
	char *path = getPathFast ( url );

	char *pathEnd  = NULL;
	char *postData = NULL;
	if ( doPost ) {
		pathEnd  = strstr(path,"?");
		if ( pathEnd ) {
			*pathEnd = '\0';
			postData = pathEnd + 1;
		}
	}

	// if no legit host
	if ( hlen <= 0 || ! hptr ) { g_errno = EBADURL; return false; }
	// sanity check. port is only 16 bits
	if ( port > (long)0xffff ) { g_errno = EBADURL; return false; }
	// return false and set g_errno if url too big
	//if ( url->getUrlLen() + 400 >= MAX_REQ_LEN ) { 
	//	g_errno = EURLTOOBIG; return false;}
	// assume request type is a GET
	m_requestType = 0;
	// get the host NULL terminated
	char host[1024+8];
	//long hlen = url->getHostLen();
	// don't overflow the host buffer on an oversized hostname
	if ( hlen > 1024 ) { g_errno = EBADURL; return false; }
	strncpy ( host , hptr , hlen );
	host [ hlen ] = '\0';
	// then port
	//unsigned short port = url->getPort();
	if ( port != 80 ) {
		sprintf ( host + hlen , ":%lu" , port );
		hlen += gbstrlen ( host + hlen );
	}
	// the if-modified-since field
	char  ibuf[64];
	char *ims = "";
	if ( ifModifiedSince ) {
		// NOTE: asctime() appends a '\n', and it would land in the
		// middle of the header, so strip it before appending " UTC"
		char *ts   = asctime(gmtime(&ifModifiedSince));
		long  tlen = gbstrlen(ts);
		if ( tlen > 0 && ts[tlen-1] == '\n' ) tlen--;
		sprintf(ibuf,"If-Modified-Since: %.*s UTC\r\n",(int)tlen,ts);
		// set ims to this string
		ims = ibuf;
	}
	// . until we fix if-modified-since, take it out
	// . seems like we are being called with it as true when should not be
	ims="";

	// . use one in conf file if caller did not provide
	// . this is usually Gigabot/1.0
	if ( ! userAgent ) userAgent = g_conf.m_spiderUserAgent;
	// accept only these
	char *accept = "*/*";
	/*
		 "text/html, "
		 "text/plain, "
		 "text/xml, "
		 "application/pdf, "
		 "application/msword, "
		 "application/vnd.ms-excel, "
		 "application/mspowerpoint, "
		 "application/postscript";
	*/

	char *cmd = "GET";
	if ( size == 0 ) cmd = "HEAD";
	if ( doPost    ) cmd = "POST";

	 // . now use "Accept-Language: en" to tell servers we prefer english
	 // . i removed keep-alive connection since some connections close on
	 //   non-200 ok http statuses and we think they're open since close
	 //   signal (read 0 bytes) may have been delayed
	 char* acceptEncoding = "";
	 // the scraper is getting back gzipped search results from goog,
	 // so disable this for now
	 // i am re-enabling now for testing...
	 if(g_conf.m_gzipDownloads)
	 	 acceptEncoding = "Accept-Encoding: gzip;q=1.0\r\n";
	 // i thought this might stop wikipedia from forcing gzip on us
	 // but it did not!
	 // else
	 //	 acceptEncoding = "Accept-Encoding:\r\n";

	 // char *p = m_buf;
	 // init the safebuf to point to this buffer in our class to avoid
	 // a potential alloc
	 // m_reqBuf.setBuf ( m_buf , MAX_REQ_LEN , 0 , false, csUTF8 );
	 m_reqBuf.purge();
	 // indicate this is good
	 m_reqBufValid = true;

	 if ( size == 0 ) {
		 // 1 for HEAD requests
		 m_requestType = 1; 
		 m_reqBuf.safePrintf (
			   "%s %s %s\r\n" 
			   "Host: %s\r\n"
			   "%s"
			   "User-Agent: %s\r\n"
			   //"Connection: Keep-Alive\r\n" 
			   "Accept-Language: en\r\n"
			   //"Accept: */*\r\n\r\n" ,
			   "Accept: %s\r\n" ,
				 cmd,
			   path , proto, host , 
			   ims , userAgent , accept );
	 }
	 else if ( size != -1 ) 
		 m_reqBuf.safePrintf (
			   "%s %s %s\r\n" 
			   "Host: %s\r\n"
			   "%s"
			   "User-Agent: %s\r\n"
			   //"Connection: Keep-Alive\r\n"
			   "Accept-Language: en\r\n"
			   //"Accept: */*\r\n"
			   "Accept: %s\r\n"
			   "Range: bytes=%li-%li\r\n" ,
				cmd,
			   path ,
			   proto ,
			   host ,
			   ims  ,
			   userAgent ,
			   accept ,
			   offset ,
			   // the range end is inclusive, so request
			   // offset+size-1 to get exactly "size" bytes
			   offset + size - 1 );
	 else if ( offset > 0  && size == -1 ) 
		 m_reqBuf.safePrintf (
			   "%s %s %s\r\n" 
			   "Host: %s\r\n"
			   "%s"
			   "User-Agent: %s\r\n"
			   //"Connection: Keep-Alive\r\n"
			   "Accept-Language: en\r\n"
			   //"Accept: */*\r\n"
			   "Accept: %s\r\n"
			   "Range: bytes=%li-\r\n" ,
				cmd,
			   path ,
			   proto ,
			   host ,
			   ims  ,
			   userAgent ,
			   accept ,
			   offset );
	 // Wget's request:
	 // GET / HTTP/1.0\r\nUser-Agent: Wget/1.10.2\r\nAccept: */*\r\nHost: 127.0.0.1:8000\r\nConnection: Keep-Alive\r\n\r\n
	 // firefox's request:
	 // GET /master?c=main HTTP/1.1\r\nHost: 10.5.1.203:8000\r\nUser-Agent: Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2.7) Gecko/20100715 Ubuntu/10.04 (lucid) Firefox/3.6.7\r\nAccept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\nAccept-Language: en-us,en;q=0.5\r\nAccept-Encoding: gzip,deflate\r\nAccept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7\r\nKeep-Alive: 115\r\nConnection: keep-alive\r\nReferer: http://10.5.0.2:8002/qpmdw.html\r\nCookie: __utma=267617550.1103353528.1269214594.1273256655.1276103782.12; __utmz=267617550.1269214594.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); _incvi=qCffL7N8chFyJLwWrBDMbNz2Q3EWmAnf4uA; s_lastvisit=1269900225815; s_pers=%20s_getnr%3D1276103782254-New%7C1339175782254%3B%20s_nrgvo%3DNew%7C1339175782258%3B\r\n\r\n
	 else {
		 // until we fix if-modified-since, take it out
		 //ims="";
		 //userAgent = "Wget/1.10.2";
		 //userAgent = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2.7) Gecko/20100715 Ubuntu/10.04 (lucid) Firefox/3.6.7";
		 //proto = "HTTP/1.0";
		 m_reqBuf.safePrintf (
			   "%s %s %s\r\n" 
			   "User-Agent: %s\r\n"
			   "Accept: */*\r\n" 
			   "Host: %s\r\n"
			   "%s"
			   //"Connection: Keep-Alive\r\n"
			   //"Accept-Language: en\r\n"
				"%s",
			   //"Accept: %s\r\n\r\n" ,
				//"\r\n",
				cmd,
			   path ,
			   proto ,
			   userAgent ,
			   host ,
			   ims ,
			   acceptEncoding);
			   //accept );
	 }

	 // cookie here
	 if ( cookie ) 
		 m_reqBuf.safePrintf("Cookie: %s\r\n",cookie );

	 // print content-length: if post
	 if ( postData ) {
		 // dammit... recaptcha does not work without this!!!!
		 m_reqBuf.safePrintf (
			      "Content-Type: "
			      "application/x-www-form-urlencoded\r\n");
		 long contentLen = strlen(postData);
		 m_reqBuf.safePrintf ("Content-Length: %li\r\n", contentLen );
		 m_reqBuf.safePrintf("\r\n");
		 m_reqBuf.safePrintf("%s",postData);
		 // log it for debug
		 //log("captch: %s",m_buf);
	 }
	 else {
		 m_reqBuf.safePrintf("\r\n");
	 }

	 // set m_bufLen
	 //m_bufLen = p - m_buf;//gbstrlen ( m_buf );
	 // sanity check
	 // if ( m_bufLen + 1 > MAX_REQ_LEN ) {
	 //	 log("build: HttpRequest buf is too small.");
	 //	 char *xx = NULL; *xx = 0;
	 // }

	 // restore url buffer
	 if ( pathEnd ) *pathEnd = '?';

	 return true;
 }
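A quick usage sketch of the size/offset conventions documented above (the caller, URL, and "HTTP/1.0" proto argument are hypothetical, not from the original):

// hypothetical caller illustrating the size conventions of set()
HttpRequest req;
char url[] = "http://example.com/index.html";
// HEAD request: size == 0
if ( ! req.set ( url , 0 , 0 , 0 , NULL , "HTTP/1.0" , false , NULL ) )
	log("test: set failed: %s",mstrerror(g_errno));
// GET the whole document: size == -1
req.set ( url , 0 , -1 , 0 , NULL , "HTTP/1.0" , false , NULL );
// partial GET: 1000 bytes starting at offset 5000
req.set ( url , 5000 , 1000 , 0 , NULL , "HTTP/1.0" , false , NULL );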
Code example #2
// . form an HTTP request 
// . use size 0 for HEAD requests
// . use size -1 for GET whole doc requests
// . fill in your own offset/size for partial GET requests
// . returns false and sets g_errno on error
// . NOTE: http 1.1 uses Keep-Alive by default (use Connection: close to not)
bool HttpRequest::set (char *url,int32_t offset,int32_t size,time_t ifModifiedSince,
		       const char *userAgent, const char *proto, bool doPost,
		       const char *cookieJar, const char *additionalHeader,
		       // if posting something, how many bytes is it?
		       int32_t postContentLen ,
		       // are we sending the request through an http proxy?
		       // if so this will be non-zero
		       int32_t proxyIp ,
		       const char *proxyUsernamePwd ) {

	m_reqBufValid = false;

	int32_t hlen ;
	int32_t port = 80;
	const char *hptr = getHostFast ( url , &hlen , &port );
	char *path = getPathFast ( url );

	// . use the full url if sending to an http proxy
	// . HACK: do NOT do this if it is httpS because then we end up
	//   tunneling through the proxy via the CONNECT cmd, and the squid
	//   proxy just forwards the raw tcp packets.
	if ( proxyIp && strncmp(url,"https://",8) != 0 ) path = url;

	char *pathEnd  = NULL;
	const char *postData = NULL;
	if ( doPost ) {
		pathEnd  = strstr(path,"?");
		if ( pathEnd ) {
			*pathEnd = '\0';
			postData = pathEnd + 1;
		}
	}

	// if no legit host
	if ( hlen <= 0 || ! hptr ) { g_errno = EBADURL; return false; }
	// sanity check. port is only 16 bits
	if ( port > (int32_t)0xffff ) { g_errno = EBADURL; return false; }
	// return false and set g_errno if url too big
	//if ( url->getUrlLen() + 400 >= MAX_REQ_LEN ) { 
	//	g_errno = EURLTOOBIG; return false;}
	// assume request type is a GET
	m_requestType = RT_GET;//0;
	// get the host NULL terminated
	char host[1024+8];
	//int32_t hlen = url->getHostLen();
	// don't overflow the host buffer on an oversized hostname
	if ( hlen > 1024 ) { g_errno = EBADURL; return false; }
	strncpy ( host , hptr , hlen );
	host [ hlen ] = '\0';
	// then port
	//uint16_t port = url->getPort();
	if ( port != 80 ) {
		sprintf ( host + hlen , ":%" PRIu32 , (uint32_t)port );
		hlen += strlen ( host + hlen );
	}
	// the if-modified-since field
	const char *ims = "";
#if 0
	char  ibuf[64];
	if ( ifModifiedSince ) {
		struct tm tm_buf;
		char buf[64];
		// NOTE: asctime_r() appends a '\n', and it would land in the
		// middle of the header, so strip it before appending " UTC"
		char *ts = asctime_r(gmtime_r(&ifModifiedSince,&tm_buf),buf);
		int32_t tlen = strlen(ts);
		if ( tlen > 0 && ts[tlen-1] == '\n' ) tlen--;
		snprintf(ibuf, sizeof(ibuf),
			 "If-Modified-Since: %.*s UTC\r\n", (int)tlen, ts);
		// set ims to this string
		ims = ibuf;
	}
	// . until we fix if-modified-since, take it out
	// . seems like we are being called with it as true when should not be
	ims="";
#endif

	// . use one in conf file if caller did not provide
	// . this is usually Gigabot/1.0
	if ( ! userAgent ) userAgent = g_conf.m_spiderUserAgent;
	// accept only these
	const char *accept = "*/*";
	/*
		 "text/html, "
		 "text/plain, "
		 "text/xml, "
		 "application/pdf, "
		 "application/msword, "
		 "application/vnd.ms-excel, "
		 "application/mspowerpoint, "
		 "application/postscript";
	*/

	const char *cmd = "GET";
	if ( size == 0 ) cmd = "HEAD";
	if ( doPost    ) cmd = "POST";

	// crap, can't spider nyt.com if we are 1.0, so use 1.1 but also
	// send Connection: Close\r\n when making requests
	//proto = "HTTP/1.1";

	SafeBuf tmp;
	const char *up = "";
	if ( proxyUsernamePwd && proxyUsernamePwd[0] ) {
		tmp.safePrintf("Proxy-Authorization: Basic ");
		tmp.base64Encode (proxyUsernamePwd,strlen(proxyUsernamePwd));
		tmp.safePrintf("\r\n");
		up = tmp.getBufStart();
	}

	 // . now use "Accept-Language: en" to tell servers we prefer english
	 // . i removed keep-alive connection since some connections close on
	 //   non-200 ok http statuses and we think they're open since close
	 //   signal (read 0 bytes) may have been delayed
	 const char* acceptEncoding = "";
	 // the scraper is getting back gzipped search results from goog,
	 // so disable this for now
	 // i am re-enabling now for testing...
	 if(g_conf.m_gzipDownloads)
	 	 acceptEncoding = "Accept-Encoding: gzip;q=1.0\r\n";
	 // i thought this might stop wikipedia from forcing gzip on us
	 // but it did not!
	 // else
	 //	 acceptEncoding = "Accept-Encoding:\r\n";

	 // char *p = m_buf;
	 // init the safebuf to point to this buffer in our class to avoid
	 // a potential alloc
	 // m_reqBuf.setBuf ( m_buf , MAX_REQ_LEN , 0 , false, csUTF8 );
	 m_reqBuf.purge();
	 // indicate this is good
	 m_reqBufValid = true;

	 if ( size == 0 ) {
		 // 1 for HEAD requests
		 m_requestType = RT_HEAD; 
		 m_reqBuf.safePrintf (
			   "%s %s %s\r\n" 
			   "Host: %s\r\n"
			   "%s"
			   "User-Agent: %s\r\n"
			   "Connection: Close\r\n"
			   //"Connection: Keep-Alive\r\n" 
			   "Accept-Language: en\r\n"
			   //"Accept: */*\r\n\r\n" ,
			   "Accept: %s\r\n" 
			   "%s"
			   ,
				 cmd,
			   path , proto, host , 
			   ims , userAgent , accept , up );
	 }
	 else 
	 if ( size != -1 ) 
		 m_reqBuf.safePrintf (
			   "%s %s %s\r\n" 
			   "Host: %s\r\n"
			   "%s"
			   "User-Agent: %s\r\n"
			   "Connection: Close\r\n"
			   //"Connection: Keep-Alive\r\n"
			   "Accept-Language: en\r\n"
			   //"Accept: */*\r\n"
			   "Accept: %s\r\n"
			   "Range: bytes=%" PRId32"-%" PRId32"\r\n" 
			   "%s"
			   ,
				cmd,
			   path ,
			   proto ,
			   host ,
			   ims  ,
			   userAgent ,
			   accept ,
			   offset ,
			   // the range end is inclusive, so request
			   // offset+size-1 to get exactly "size" bytes
			   offset + size - 1 ,
				      up);
	 else 
	 if ( offset > 0 ) 	// size is -1
		 m_reqBuf.safePrintf (
			   "%s %s %s\r\n" 
			   "Host: %s\r\n"
			   "%s"
			   "User-Agent: %s\r\n"
			   "Connection: Close\r\n"
			   //"Connection: Keep-Alive\r\n"
			   "Accept-Language: en\r\n"
			   //"Accept: */*\r\n"
			   "Accept: %s\r\n"
			   "Range: bytes=%" PRId32"-\r\n" 
			   "%s"
			   ,
				cmd,
			   path ,
			   proto ,
			   host ,
			   ims  ,
			   userAgent ,
			   accept ,
			   offset ,
				      up );
	 // Wget's request:
	 // GET / HTTP/1.0\r\nUser-Agent: Wget/1.10.2\r\nAccept: */*\r\nHost: 127.0.0.1:8000\r\nConnection: Keep-Alive\r\n\r\n
	 // firefox's request:
	 // GET /master?c=main HTTP/1.1\r\nHost: 10.5.1.203:8000\r\nUser-Agent: Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2.7) Gecko/20100715 Ubuntu/10.04 (lucid) Firefox/3.6.7\r\nAccept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\nAccept-Language: en-us,en;q=0.5\r\nAccept-Encoding: gzip,deflate\r\nAccept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7\r\nKeep-Alive: 115\r\nConnection: keep-alive\r\nReferer: http://10.5.0.2:8002/qpmdw.html\r\nCookie: __utma=267617550.1103353528.1269214594.1273256655.1276103782.12; __utmz=267617550.1269214594.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); _incvi=qCffL7N8chFyJLwWrBDMbNz2Q3EWmAnf4uA; s_lastvisit=1269900225815; s_pers=%20s_getnr%3D1276103782254-New%7C1339175782254%3B%20s_nrgvo%3DNew%7C1339175782258%3B\r\n\r\n
	 else {
		 // until we fix if-modified-since, take it out
		 //ims="";
		 //userAgent = "Wget/1.10.2";
		 //userAgent = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2.7) Gecko/20100715 Ubuntu/10.04 (lucid) Firefox/3.6.7";
		 //proto = "HTTP/1.0";
		 m_reqBuf.safePrintf (
			   "%s %s %s\r\n" 
			   "User-Agent: %s\r\n"
			   "Accept: */*\r\n" 
			   "Host: %s\r\n"
			   "%s"
			   "Connection: Close\r\n"
			   //"Connection: Keep-Alive\r\n"
			   //"Accept-Language: en\r\n"
				"%s"
			   "%s"
			   ,
			   //"Accept: %s\r\n\r\n" ,
				//"\r\n",
				cmd,
			   path ,
			   proto ,
			   userAgent ,
			   host ,
			   ims ,
			   acceptEncoding,
				      up );
			   //accept );
	 }

	 if ( additionalHeader )
		 m_reqBuf.safePrintf("%s\r\n",additionalHeader );

	 // cookie here
	if (cookieJar) {
		HttpMime::addCookieHeader(cookieJar, url, &m_reqBuf);
	}

	 // print content-length: if post
	 if ( postData ) {
		 // dammit... recaptcha does not work without this!!!!
		 m_reqBuf.safePrintf (
			      "Content-Type: "
			      "application/x-www-form-urlencoded\r\n");
	 }

	 // we need this if doing a post even if postData is NULL
	 if ( doPost ) {
		 int32_t contentLen = 0;
		 if ( postData ) contentLen = strlen(postData);
		 // this overrides if provided. -1 is default
		 if ( postContentLen >= 0 ) contentLen = postContentLen;
		 m_reqBuf.safePrintf ("Content-Length: %" PRId32"\r\n", contentLen );
		 m_reqBuf.safePrintf("\r\n");
		 if ( postData ) m_reqBuf.safePrintf("%s",postData);
		 // log it for debug
		 //log("captch: %s",m_buf);
	 }

	 if ( ! doPost ) {
		 m_reqBuf.safePrintf("\r\n");
	 }

	 // restore url buffer
	 if ( pathEnd ) *pathEnd = '?';

	 return true;
 }
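A sketch of the newer signature's proxy/POST path (all values hypothetical; the two-argument atoip() call mirrors the commented-out one in example #3 below):

// hypothetical caller: POST through an http proxy with basic auth
HttpRequest req;
char url[] = "http://example.com/form?foo=bar"; // "?foo=bar" becomes the body
req.set ( url , 0 , -1 , 0 ,
	  NULL ,                // userAgent: fall back to g_conf.m_spiderUserAgent
	  "HTTP/1.0" ,          // proto
	  true ,                // doPost: send "foo=bar" as the POST body
	  NULL ,                // cookieJar
	  NULL ,                // additionalHeader
	  -1 ,                  // postContentLen: -1 = use strlen(postData)
	  atoip("10.0.0.1",8) , // proxyIp: non-zero makes path the full url
	  "user:pass" );        // proxyUsernamePwd -> Proxy-Authorization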
Code example #3
void Blaster::gotDoc3 ( void *state, TcpSocket *s){
	StateBD2 *st2=(StateBD2 *)state;
	StateBD *st=st2->m_st;
	if (!s) {
		log (LOG_WARN,"blaster: Got a null s in gotDoc3."
		     "Happened because ip could not be found");
		st->m_numUrlDocsReceived++;
		//Free StateBD2
		mdelete(st2,sizeof(StateBD2),"Blaster4");
		if (st->m_numUrlDocsReceived==st->m_numUrlDocsSent){
			m_launched--;
			// Free stateBD
			freeStateBD(st);
		}
		return;
	}
	// bail if got cut off
	if ( s->m_readOffset == 0 ) {
		log("blasterDiff : lost the Request in gotDoc3");
		st->m_numUrlDocsReceived++;
		//Free StateBD2
		mdelete(st2,sizeof(StateBD2),"Blaster4");
		if (st->m_numUrlDocsReceived==st->m_numUrlDocsSent){
			m_launched--;
			// Free stateBD
			freeStateBD(st);
		}
		return;
	}
	char *reply = s->m_readBuf ;
	long  size  = s->m_readOffset;
	HttpMime mime;
	mime.set(reply,size,NULL);

	long httpStatus=mime.getHttpStatus();
	if(httpStatus==404){
		if (m_verbose)
			log(LOG_WARN,"blaster: The page was not found - 404");
		st->m_numUrlDocsReceived++;
	}
	// If the url is a redirect, check if it is still http (it might have
	// become https or something else, in which case we aren't going to
	// follow it).
	else if (httpStatus>=300){
		Url *u=mime.getLocationUrl();

		//If max number of redirects done, bail
		if(!st2->m_numRedirects--){
			log(LOG_WARN,"blaster: Max number of redirects "
			    "reached.");
			st->m_numUrlDocsReceived++;
		}
		//check if it is still http (might have become https or
		// something else, in which case we aren't going to follow it)
		else if (!u->isHttp()){
			log(LOG_WARN,"blaster: Redirection not for an http "
			    "page for url %s",u->getUrl());
			st->m_numUrlDocsReceived++;
		}
		// sometimes idiots don't supply us with a Location: mime
		else if ( u->getUrlLen() == 0 ) {
			log(LOG_WARN,"blaster: Redirect url is of 0 length");
			st->m_numUrlDocsReceived++;
		}
		else{
			// I'm not checking yet if the redirect url is the
			// same as the earlier url, as I've set the max number
			// of redirects to 6. Now get the redirect url. Do not
			// increase numUrlDocsReceived because this wrapper
			// will be called back for the page.
			if (m_verbose)
				log(LOG_WARN,"blaster: Downloading redirect"
				    " %s",u->getUrl());
			//Changing the url to the new place
			//st2->m_url.set(u,false);
			st2->m_url = u->getUrl();
			bool status = g_httpServer.getDoc (st2->m_url, // url
							    0,//ip
							    0 ,  // offset
							    -1 ,  // size
							    0 ,
							    st2 ,  // state
							    gotDocWrapper3,
							    60*1000, // timeout
							    0, // proxy ip
							    0, // proxy port
						    30*1024*1024, //maxLen
							    30*1024*1024);
			// If not blocked, there is an error.
			if (status ) 
				st->m_numUrlDocsReceived++;
			// if it blocked, gotDocWrapper3 will be called again
			// with st2 as its state, so we must NOT fall through
			// and free st2 below
			else
				return;
		}
	}
	else if(httpStatus<200){
		log(LOG_WARN,"blaster: Bad HTTP status %li",httpStatus);
		st->m_numUrlDocsReceived++;
	}
	else{
		// This means the page is still there, somewhere. Status must
		// be 200. So find it on server2. This server is assumed to be
		// running an instance of gb, so it is given the query in
		// the format 'xxxxx.com/search?q=url%3Ayyyy&code=gbmonitor'.
		// Then check if we have the exact page in the search results
		// that come back. The problem is that we do not know which
		// url has been got, so I get the location url from the mime.
		// The site name is in st->m_u2.getSite(), but copy it
		// because it is not null terminated.
		char tmp[1024];
		//char site[1024];//how long could a site be?
		long siteLen = 0;
		char *site    = getHostFast(st->m_u2,&siteLen);
		char c = site[siteLen];
		site[siteLen] = 0;
		//strncpy(site,st->m_u2.getSite(),
		//	st->m_u2.getSiteLen());
		//site[st->m_u2.getSiteLen()]='\0';
		sprintf(tmp,"%ssearch?"
			"code=gbmonitor&"
			"q=url%%3A%s",site,st2->m_url);
		site[siteLen] = c;
		if (m_verbose)
			log(LOG_WARN,"blaster: Checking %s",tmp);
		//Url u;
		//u.set(tmp,gbstrlen(tmp));
		//Now get the doc
		bool status = g_httpServer.getDoc ( tmp,//&u,
						    0,//ip
						    0,  // offset
						    -1 ,  // size
						    0 ,
						    st , // state
						    gotDocWrapper4,
						    60*1000, // timeout
						    0,//atoip("66.154.102.20",13),//proxy ip
						    0,//3128,//proxy port
						    30*1024*1024,
						    30*1024*1024);
		// continue if it blocked
		// If not blocked, there is an error. Since we are
		// getting the doc from a gigablast server, report it
		if (status ){
			st->m_numUrlDocsReceived++;
			log(LOG_WARN,"blaster: could not get back"
				    "%s from server in gotDoc3",tmp);
		}
	}
	// If we reached here, that means all the url redirects have been 
	// finished, and there is no need for st2. Free it
	mdelete(st2,sizeof(StateBD2),"Blaster4");


	if (st->m_numUrlDocsReceived==st->m_numUrlDocsSent){
		m_launched--;
		// Free stateBD
		freeStateBD(st);
	}
	return;
}
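Every exit path in gotDoc3() repeats the same bookkeeping: bump m_numUrlDocsReceived, free st2, and tear down st once every outstanding doc is accounted for. A hypothetical helper capturing that idiom (finishUrlDoc is not in the original):

// hypothetical refactor sketch of the repeated cleanup above
void Blaster::finishUrlDoc ( StateBD2 *st2 , StateBD *st ) {
	st->m_numUrlDocsReceived++;
	// free the per-redirect state, if any
	if ( st2 ) mdelete ( st2 , sizeof(StateBD2) , "Blaster4" );
	// when every doc we sent has come back, the batch is done
	if ( st->m_numUrlDocsReceived == st->m_numUrlDocsSent ) {
		m_launched--;
		freeStateBD ( st );
	}
}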
Code example #4
// . returns false if blocked, true otherwise
// . returns true on error and sets g_errno
bool SiteGetter::getSiteList ( ) {

top:
	// . setSite() will return TRUE and set g_errno on error, and returns
	//   false if it blocked adding a tag, which will call callback once
	//   tag is added
	// . stop at this point
	if ( m_pathDepth >= 3 ) return setSite();
	// or if no more
	if ( m_pathDepth >= m_maxPathDepth ) return setSite();

	// . make the termid
	// . but here it is based on "m_pathDepth", which ranges
	//   from 0 to N
	// . if m_pathDepth==0 use "www.xyz.com" as site
	// . if m_pathDepth==1 use "www.xyz.com/foo/" as site ...
	char *pend = getPathEnd ( m_url , m_pathDepth );
	// hash up to that
	//char *host = m_u.getHost();
	char *host = getHostFast ( m_url , NULL );
	// hash the prefix first to match XmlDoc::hashNoSplit()
	char *prefix = "siteterm";
	// hash that and we will incorporate it to match XmlDoc::hashNoSplit()
	int64_t ph = hash64 ( prefix , gbstrlen(prefix) );
	// . this should match basically what is in XmlDoc.cpp::hash()
	// . and this now does not include pages that have no outlinks 
	//   "underneath" them.
	int64_t termId = hash64 ( host , pend - host , ph ) & TERMID_MASK;

	// get all pages that have this as their termid!
	key144_t start ;
	key144_t end   ;
	g_posdb.makeStartKey ( &start, termId );
	g_posdb.makeEndKey   ( &end  , termId );

	// . now see how many urls are at this path depth from this hostname
	// . if it is a huge # then we know they are all subsites!
	//   because it is too bushy to be anything else
	// . i'd say 100 nodes is good enough to qualify as a homestead site

	int32_t minRecSizes = 5000000;
	// get the group this list is in
	//uint32_t gid ;
	//gid = getGroupId ( RDB_POSDB , (char *)&start , false ); //split?
	//uint32_t shardNum ;
	//shardNum = getShardNum( RDB_POSDB , (char *)&start , false ); //split?

	// i guess this is split by termid and not docid????
	int32_t shardNum = g_hostdb.getShardNumByTermId ( &start );

	// we need a group #. the column #.
	//int32_t split = g_hostdb.getGroupNum ( gid );
	// shortcut
	Msg0 *m = &m_msg0;
	// get the list. returns false if blocked.
	if ( ! m->getList ( -1                 , // hostId
			    0                  , // ip
			    0                  , // port
			    0                  , // maxCacheAge
			    false              , // addToCache
			    RDB_POSDB        ,
			    m_collnum             ,
			    &m_list            ,
			    (char *)&start     ,
			    (char *)&end       ,
			    minRecSizes        ,
			    this               ,
			    gotSiteListWrapper ,
			    m_niceness         , // MAX_NICENESS
			    // default parms follow
			    true  ,  // doErrorCorrection?
			    true  ,  // includeTree?
			    true  ,  // doMerge?
			    -1    ,  // firstHostId
			    0     ,  // startFileNum
			    -1    ,  // numFiles
			    999999,  // timeout
			    -1    ,  // syncPoint
			    -1    ,  // preferLocalReads
			    NULL  ,  // msg5
			    NULL  ,  // msg5b
			    false ,  // isrealmerge?
			    true  ,  // allowpagecache?
			    false ,  // forceLocalIndexdb?
			    false ,  // doIndexdbSplit? nosplit
			    shardNum ) )//split ))
		return false;

	// return false if this blocked
	if ( ! gotSiteList() ) return false;
	// error?
	if ( g_errno ) return true;
	// or all done
	if ( m_allDone ) return true;
	// otherwise, try the next path component!
	goto top;
}
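For reference, the termid composition used above in isolation (makeSiteTermId is a hypothetical name; hash64, gbstrlen, and TERMID_MASK are the codebase's own):

// hypothetical helper isolating the termid math from getSiteList()
int64_t makeSiteTermId ( char *host , char *pathEnd ) {
	// hash the prefix first to match XmlDoc::hashNoSplit()
	char *prefix = "siteterm";
	int64_t ph = hash64 ( prefix , gbstrlen(prefix) );
	// hash host..pathEnd seeded with the prefix hash, masked to a termid
	return hash64 ( host , pathEnd - host , ph ) & TERMID_MASK;
}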