// . form an HTTP request 
// . use size 0 for HEAD requests
// . use size -1 for GET whole doc requests
// . fill in your own offset/size for partial GET requests
// . returns false and sets g_errno on error
// . NOTE: http 1.1 uses Keep-Alive by default (use Connection: close to not)
bool HttpRequest::set (char *url,long offset,long size,time_t ifModifiedSince,
		       char *userAgent , char *proto , bool doPost ,
		       char *cookie ) {

	m_reqBufValid = false;

	long hlen ;
	long port = 80;
	char *hptr = getHostFast ( url , &hlen , &port );
	char *path = getPathFast ( url );

	char *pathEnd  = NULL;
	char *postData = NULL;
	if ( doPost ) {
		pathEnd  = strstr(path,"?");
		if ( pathEnd ) {
			*pathEnd = '\0';
			postData = pathEnd + 1;
		}
	}

	// if no legit host
	if ( hlen <= 0 || ! hptr ) { g_errno = EBADURL; return false; }
	// sanity check. port is only 16 bits
	if ( port > (long)0xffff ) { g_errno = EBADURL; return false; }
	// return false and set g_errno if url too big
	//if ( url->getUrlLen() + 400 >= MAX_REQ_LEN ) { 
	//	g_errno = EURLTOOBIG; return false;}
	// assume request type is a GET
	m_requestType = 0;
	// get the host NULL terminated
	char host[1024+8];
	//long hlen = url->getHostLen();
	strncpy ( host , hptr , hlen );
	host [ hlen ] = '\0';
	// then port
	//unsigned short port = url->getPort();
	if ( port != 80 ) {
		sprintf ( host + hlen , ":%lu" , port );
		hlen += gbstrlen ( host + hlen );
	}
	// the if-modified-since field
	char  ibuf[64];
	char *ims = "";
	if ( ifModifiedSince ) {
		// NOTE: ctime appends a \n 
		sprintf(ibuf,"If-Modified-Since: %s UTC",
			asctime(gmtime(&ifModifiedSince)));
		// get the length
		long ilen = gbstrlen(ibuf);
		// hack off \n from ctime - replace with \r\n\0
		ibuf [ ilen - 1 ] = '\r';
		ibuf [ ilen     ] = '\n';
		ibuf [ ilen + 1 ] = '\0';
		// set ims to this string
		ims = ibuf;
	}
	// . until we fix if-modified-since, take it out
	// . seems like we are being called with it as true when should not be
	ims="";

	// . use one in conf file if caller did not provide
	// . this is usually Gigabot/1.0
	if ( ! userAgent ) userAgent = g_conf.m_spiderUserAgent;
	// accept only these
	char *accept = "*/*";
	/*
		 "text/html, "
		 "text/plain, "
		 "text/xml, "
		 "application/pdf, "
		 "application/msword, "
		 "application/vnd.ms-excel, "
		 "application/mspowerpoint, "
		 "application/postscript";
	*/

	char *cmd = "GET";
	if ( size == 0 ) cmd = "HEAD";
	if ( doPost    ) cmd = "POST";

	 // . now use "Accept-Language: en" to tell servers we prefer english
	 // . i removed keep-alive connection since some connections close on
	 //   non-200 ok http statuses and we think they're open since close
	 //   signal (read 0 bytes) may have been delayed
	 char* acceptEncoding = "";
	 // the scraper is getting back gzipped search results from goog,
	 // so disable this for now
	 // i am re-enabling now for testing...
	 if(g_conf.m_gzipDownloads)
	 	 acceptEncoding = "Accept-Encoding: gzip;q=1.0\r\n";
	 // i thought this might stop wikipedia from forcing gzip on us
	 // but it did not!
	 // else
	 //	 acceptEncoding = "Accept-Encoding:\r\n";

	 // char *p = m_buf;
	 // init the safebuf to point to this buffer in our class to avoid
	 // a potential alloc
	 // m_reqBuf.setBuf ( m_buf , MAX_REQ_LEN , 0 , false, csUTF8 );
	 m_reqBuf.purge();
	 // indicate this is good
	 m_reqBufValid = true;

	 if ( size == 0 ) {
		 // 1 for HEAD requests
		 m_requestType = 1; 
		 m_reqBuf.safePrintf (
			   "%s %s %s\r\n" 
			   "Host: %s\r\n"
			   "%s"
			   "User-Agent: %s\r\n"
			   //"Connection: Keep-Alive\r\n" 
			   "Accept-Language: en\r\n"
			   //"Accept: */*\r\n\r\n" ,
			   "Accept: %s\r\n" ,
				 cmd,
			   path , proto, host , 
			   ims , userAgent , accept );
	 }
	 else if ( size != -1 ) 
		 m_reqBuf.safePrintf (
			   "%s %s %s\r\n" 
			   "Host: %s\r\n"
			   "%s"
			   "User-Agent: %s\r\n"
			   //"Connection: Keep-Alive\r\n"
			   "Accept-Language: en\r\n"
			   //"Accept: */*\r\n"
			   "Accept: %s\r\n"
			   "Range: bytes=%li-%li\r\n" ,
				cmd,
			   path ,
			   proto ,
			   host ,
			   ims  ,
			   userAgent ,
			   accept ,
			   offset ,
			   offset + size );
	 else if ( offset > 0  && size == -1 ) 
		 m_reqBuf.safePrintf (
			   "%s %s %s\r\n" 
			   "Host: %s\r\n"
			   "%s"
			   "User-Agent: %s\r\n"
			   //"Connection: Keep-Alive\r\n"
			   "Accept-Language: en\r\n"
			   //"Accept: */*\r\n"
			   "Accept: %s\r\n"
			   "Range: bytes=%li-\r\n" ,
				cmd,
			   path ,
			   proto ,
			   host ,
			   ims  ,
			   userAgent ,
			   accept ,
			   offset );
	 // Wget's request:
	 // GET / HTTP/1.0\r\nUser-Agent: Wget/1.10.2\r\nAccept: */*\r\nHost: 127.0.0.1:8000\r\nConnection: Keep-Alive\r\n\r\n
	 // firefox's request:
	 // GET /master?c=main HTTP/1.1\r\nHost: 10.5.1.203:8000\r\nUser-Agent: Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2.7) Gecko/20100715 Ubuntu/10.04 (lucid) Firefox/3.6.7\r\nAccept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\nAccept-Language: en-us,en;q=0.5\r\nAccept-Encoding: gzip,deflate\r\nAccept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7\r\nKeep-Alive: 115\r\nConnection: keep-alive\r\nReferer: http://10.5.0.2:8002/qpmdw.html\r\nCookie: __utma=267617550.1103353528.1269214594.1273256655.1276103782.12; __utmz=267617550.1269214594.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); _incvi=qCffL7N8chFyJLwWrBDMbNz2Q3EWmAnf4uA; s_lastvisit=1269900225815; s_pers=%20s_getnr%3D1276103782254-New%7C1339175782254%3B%20s_nrgvo%3DNew%7C1339175782258%3B\r\n\r\n
	 else {
		 // until we fix if-modified-since, take it out
		 //ims="";
		 //userAgent = "Wget/1.10.2";
		 //userAgent = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2.7) Gecko/20100715 Ubuntu/10.04 (lucid) Firefox/3.6.7";
		 //proto = "HTTP/1.0";
		 m_reqBuf.safePrintf (
			   "%s %s %s\r\n" 
			   "User-Agent: %s\r\n"
			   "Accept: */*\r\n" 
			   "Host: %s\r\n"
			   "%s"
			   //"Connection: Keep-Alive\r\n"
			   //"Accept-Language: en\r\n"
				"%s",
			   //"Accept: %s\r\n\r\n" ,
				//"\r\n",
				cmd,
			   path ,
			   proto ,
			   userAgent ,
			   host ,
			   ims ,
			   acceptEncoding);
			   //accept );
	 }

	 // cookie here
	 if ( cookie ) 
		 m_reqBuf.safePrintf("Cookie: %s\r\n",cookie );

	 // print content-length: if post
	 if ( postData ) {
		 // dammit... recaptcha does not work without this!!!!
		 m_reqBuf.safePrintf (
			      "Content-Type: "
			      "application/x-www-form-urlencoded\r\n");
		 long contentLen = strlen(postData);
		 m_reqBuf.safePrintf ("Content-Length: %li\r\n", contentLen );
		 m_reqBuf.safePrintf("\r\n");
		 m_reqBuf.safePrintf("%s",postData);
		 // log it for debug
		 //log("captch: %s",m_buf);
	 }
	 else {
		 m_reqBuf.safePrintf("\r\n");
	 }

	 // set m_bufLen
	 //m_bufLen = p - m_buf;//gbstrlen ( m_buf );
	 // sanity check
	 // if ( m_bufLen + 1 > MAX_REQ_LEN ) {
	 //	 log("build: HttpRequest buf is too small.");
	 //	 char *xx = NULL; *xx = 0;
	 // }

	 // restore url buffer
	 if ( pathEnd ) *pathEnd = '?';

	 return true;
 }
// . form an HTTP request 
// . use size 0 for HEAD requests
// . use size -1 for GET whole doc requests
// . fill in your own offset/size for partial GET requests
// . returns false and sets g_errno on error
// . NOTE: http 1.1 uses Keep-Alive by default (use Connection: close to not)
bool HttpRequest::set (char *url,int32_t offset,int32_t size,time_t ifModifiedSince,
		       const char *userAgent, const char *proto, bool doPost,
		       const char *cookieJar, const char *additionalHeader,
		       // if posting something, how many bytes is it?
		       int32_t postContentLen ,
		       // are we sending the request through an http proxy?
		       // if so this will be non-zero
		       int32_t proxyIp ,
		       const char *proxyUsernamePwd ) {

	m_reqBufValid = false;

	int32_t hlen ;
	int32_t port = 80;
	const char *hptr = getHostFast ( url , &hlen , &port );
	char *path = getPathFast ( url );

	// . use the full url if sending to an http proxy
	// . HACK: do NOT do this if it is httpS because we end up
	//   using the http tunnel using the CONNECT cmd and the squid proxy
	//   will just forward/proxy just the entire tcp packets.
	if ( proxyIp && strncmp(url,"https://",8) != 0 ) path = url;

	char *pathEnd  = NULL;
	const char *postData = NULL;
	if ( doPost ) {
		pathEnd  = strstr(path,"?");
		if ( pathEnd ) {
			*pathEnd = '\0';
			postData = pathEnd + 1;
		}
	}

	// if no legit host
	if ( hlen <= 0 || ! hptr ) { g_errno = EBADURL; return false; }
	// sanity check. port is only 16 bits
	if ( port > (int32_t)0xffff ) { g_errno = EBADURL; return false; }
	// return false and set g_errno if url too big
	//if ( url->getUrlLen() + 400 >= MAX_REQ_LEN ) { 
	//	g_errno = EURLTOOBIG; return false;}
	// assume request type is a GET
	m_requestType = RT_GET;//0;
	// get the host NULL terminated
	char host[1024+8];
	//int32_t hlen = url->getHostLen();
	strncpy ( host , hptr , hlen );
	host [ hlen ] = '\0';
	// then port
	//uint16_t port = url->getPort();
	if ( port != 80 ) {
		sprintf ( host + hlen , ":%" PRIu32 , (uint32_t)port );
		hlen += strlen ( host + hlen );
	}
	// the if-modified-since field
	const char *ims = "";
#if 0
	char  ibuf[64];
	if ( ifModifiedSince ) {
		struct tm tm_buf;
		char buf[64];
		// NOTE: ctime appends a \n 
		snprintf(ibuf, sizeof(ibuf), "If-Modified-Since: %s UTC",
			asctime_r(gmtime_r(&ifModifiedSince,&tm_buf),buf));
		// get the length
		int32_t ilen = strlen(ibuf);
		if( ilen && ilen < (int32_t)sizeof(ibuf)-1 ) {
			// hack off \n from ctime - replace with \r\n\0
			ibuf [ ilen - 1 ] = '\r';
			ibuf [ ilen     ] = '\n';
			ibuf [ ilen + 1 ] = '\0';
			// set ims to this string
			ims = ibuf;
		}
	}
	// . until we fix if-modified-since, take it out
	// . seems like we are being called with it as true when should not be
	ims="";
#endif

	// . use one in conf file if caller did not provide
	// . this is usually Gigabot/1.0
	if ( ! userAgent ) userAgent = g_conf.m_spiderUserAgent;
	// accept only these
	const char *accept = "*/*";
	/*
		 "text/html, "
		 "text/plain, "
		 "text/xml, "
		 "application/pdf, "
		 "application/msword, "
		 "application/vnd.ms-excel, "
		 "application/mspowerpoint, "
		 "application/postscript";
	*/

	const char *cmd = "GET";
	if ( size == 0 ) cmd = "HEAD";
	if ( doPost    ) cmd = "POST";

	// crap, can't spider nyt.com if we are 1.0, so use 1.0 but also
	// note Connection: Close\r\n when making requests
	//proto = "HTTP/1.1";

	SafeBuf tmp;
	const char *up = "";
	if ( proxyUsernamePwd && proxyUsernamePwd[0] ) {
		tmp.safePrintf("Proxy-Authorization: Basic ");
		tmp.base64Encode (proxyUsernamePwd,strlen(proxyUsernamePwd));
		tmp.safePrintf("\r\n");
		up = tmp.getBufStart();
	}

	 // . now use "Accept-Language: en" to tell servers we prefer english
	 // . i removed keep-alive connection since some connections close on
	 //   non-200 ok http statuses and we think they're open since close
	 //   signal (read 0 bytes) may have been delayed
	 const char* acceptEncoding = "";
	 // the scraper is getting back gzipped search results from goog,
	 // so disable this for now
	 // i am re-enabling now for testing...
	 if(g_conf.m_gzipDownloads)
	 	 acceptEncoding = "Accept-Encoding: gzip;q=1.0\r\n";
	 // i thought this might stop wikipedia from forcing gzip on us
	 // but it did not!
	 // else
	 //	 acceptEncoding = "Accept-Encoding:\r\n";

	 // char *p = m_buf;
	 // init the safebuf to point to this buffer in our class to avoid
	 // a potential alloc
	 // m_reqBuf.setBuf ( m_buf , MAX_REQ_LEN , 0 , false, csUTF8 );
	 m_reqBuf.purge();
	 // indicate this is good
	 m_reqBufValid = true;

	 if ( size == 0 ) {
		 // 1 for HEAD requests
		 m_requestType = RT_HEAD; 
		 m_reqBuf.safePrintf (
			   "%s %s %s\r\n" 
			   "Host: %s\r\n"
			   "%s"
			   "User-Agent: %s\r\n"
			   "Connection: Close\r\n"
			   //"Connection: Keep-Alive\r\n" 
			   "Accept-Language: en\r\n"
			   //"Accept: */*\r\n\r\n" ,
			   "Accept: %s\r\n" 
			   "%s"
			   ,
				 cmd,
			   path , proto, host , 
			   ims , userAgent , accept , up );
	 }
	 else 
	 if ( size != -1 ) 
		 m_reqBuf.safePrintf (
			   "%s %s %s\r\n" 
			   "Host: %s\r\n"
			   "%s"
			   "User-Agent: %s\r\n"
			   "Connection: Close\r\n"
			   //"Connection: Keep-Alive\r\n"
			   "Accept-Language: en\r\n"
			   //"Accept: */*\r\n"
			   "Accept: %s\r\n"
			   "Range: bytes=%" PRId32"-%" PRId32"\r\n" 
			   "%s"
			   ,
				cmd,
			   path ,
			   proto ,
			   host ,
			   ims  ,
			   userAgent ,
			   accept ,
			   offset ,
			   offset + size ,
				      up);
	 else 
	 if ( offset > 0 ) 	// size is -1
		 m_reqBuf.safePrintf (
			   "%s %s %s\r\n" 
			   "Host: %s\r\n"
			   "%s"
			   "User-Agent: %s\r\n"
			   "Connection: Close\r\n"
			   //"Connection: Keep-Alive\r\n"
			   "Accept-Language: en\r\n"
			   //"Accept: */*\r\n"
			   "Accept: %s\r\n"
			   "Range: bytes=%" PRId32"-\r\n" 
			   "%s"
			   ,
				cmd,
			   path ,
			   proto ,
			   host ,
			   ims  ,
			   userAgent ,
			   accept ,
			   offset ,
				      up );
	 // Wget's request:
	 // GET / HTTP/1.0\r\nUser-Agent: Wget/1.10.2\r\nAccept: */*\r\nHost: 127.0.0.1:8000\r\nConnection: Keep-Alive\r\n\r\n
	 // firefox's request:
	 // GET /master?c=main HTTP/1.1\r\nHost: 10.5.1.203:8000\r\nUser-Agent: Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2.7) Gecko/20100715 Ubuntu/10.04 (lucid) Firefox/3.6.7\r\nAccept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\nAccept-Language: en-us,en;q=0.5\r\nAccept-Encoding: gzip,deflate\r\nAccept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7\r\nKeep-Alive: 115\r\nConnection: keep-alive\r\nReferer: http://10.5.0.2:8002/qpmdw.html\r\nCookie: __utma=267617550.1103353528.1269214594.1273256655.1276103782.12; __utmz=267617550.1269214594.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); _incvi=qCffL7N8chFyJLwWrBDMbNz2Q3EWmAnf4uA; s_lastvisit=1269900225815; s_pers=%20s_getnr%3D1276103782254-New%7C1339175782254%3B%20s_nrgvo%3DNew%7C1339175782258%3B\r\n\r\n
	 else {
		 // until we fix if-modified-since, take it out
		 //ims="";
		 //userAgent = "Wget/1.10.2";
		 //userAgent = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2.7) Gecko/20100715 Ubuntu/10.04 (lucid) Firefox/3.6.7";
		 //proto = "HTTP/1.0";
		 m_reqBuf.safePrintf (
			   "%s %s %s\r\n" 
			   "User-Agent: %s\r\n"
			   "Accept: */*\r\n" 
			   "Host: %s\r\n"
			   "%s"
			   "Connection: Close\r\n"
			   //"Connection: Keep-Alive\r\n"
			   //"Accept-Language: en\r\n"
				"%s"
			   "%s"
			   ,
			   //"Accept: %s\r\n\r\n" ,
				//"\r\n",
				cmd,
			   path ,
			   proto ,
			   userAgent ,
			   host ,
			   ims ,
			   acceptEncoding,
				      up );
			   //accept );
	 }

	 if ( additionalHeader )
		 m_reqBuf.safePrintf("%s\r\n",additionalHeader );

	 // cookie here
	if (cookieJar) {
		HttpMime::addCookieHeader(cookieJar, url, &m_reqBuf);
	}

	 // print content-length: if post
	 if ( postData ) {
		 // dammit... recaptcha does not work without this!!!!
		 m_reqBuf.safePrintf (
			      "Content-Type: "
			      "application/x-www-form-urlencoded\r\n");
	 }

	 // we need this if doing a post even if postData is NULL
	 if ( doPost ) {
		 int32_t contentLen = 0;
		 if ( postData ) contentLen = strlen(postData);
		 // this overrides if provided. -1 is default
		 if ( postContentLen >= 0 ) contentLen = postContentLen;
		 m_reqBuf.safePrintf ("Content-Length: %" PRId32"\r\n", contentLen );
		 m_reqBuf.safePrintf("\r\n");
		 if ( postData ) m_reqBuf.safePrintf("%s",postData);
		 // log it for debug
		 //log("captch: %s",m_buf);
	 }

	 if ( ! doPost ) { // ! postData ) {
		 m_reqBuf.safePrintf("\r\n");
	 }

	 // restore url buffer
	 if ( pathEnd ) *pathEnd = '?';

	 return true;
 }