// . form an HTTP request
// . use size 0 for HEAD requests
// . use size -1 for GET whole doc requests
// . fill in your own offset/size for partial GET requests
// . returns false and sets g_errno on error
// . NOTE: http 1.1 uses Keep-Alive by default (use Connection: close to not)
bool HttpRequest::set ( char *url , long offset , long size ,
                        time_t ifModifiedSince , char *userAgent ,
                        char *proto , bool doPost , char *cookie ) {

        m_reqBufValid = false;

        long  hlen ;
        long  port = 80;
        char *hptr = getHostFast ( url , &hlen , &port );
        char *path = getPathFast ( url );

        char *pathEnd  = NULL;
        char *postData = NULL;
        if ( doPost ) {
                pathEnd = strstr ( path , "?" );
                if ( pathEnd ) {
                        *pathEnd = '\0';
                        postData = pathEnd + 1;
                }
        }

        // if no legit host
        if ( hlen <= 0 || ! hptr ) { g_errno = EBADURL; return false; }

        // sanity check. port is only 16 bits
        if ( port > (long)0xffff ) { g_errno = EBADURL; return false; }

        // return false and set g_errno if url too big
        //if ( url->getUrlLen() + 400 >= MAX_REQ_LEN ) {
        //        g_errno = EURLTOOBIG; return false; }

        // assume request type is a GET
        m_requestType = 0;

        // get the host NULL terminated
        char host[1024+8];
        //long hlen = url->getHostLen();
        strncpy ( host , hptr , hlen );
        host [ hlen ] = '\0';

        // then port
        //unsigned short port = url->getPort();
        if ( port != 80 ) {
                sprintf ( host + hlen , ":%li" , port );
                hlen += gbstrlen ( host + hlen );
        }

        // the if-modified-since field
        char  ibuf[64];
        char *ims = "";
        if ( ifModifiedSince ) {
                // NOTE: asctime appends a \n
                sprintf ( ibuf , "If-Modified-Since: %s UTC" ,
                          asctime ( gmtime ( &ifModifiedSince ) ) );
                // get the length
                long ilen = gbstrlen ( ibuf );
                // hack off \n from asctime - replace with \r\n\0
                ibuf [ ilen - 1 ] = '\r';
                ibuf [ ilen     ] = '\n';
                ibuf [ ilen + 1 ] = '\0';
                // set ims to this string
                ims = ibuf;
        }
        // . until we fix if-modified-since, take it out
        // . seems like we are being called with it as true when we should
        //   not be
        ims = "";

        // . use the one in the conf file if the caller did not provide one
        // . this is usually Gigabot/1.0
        if ( ! userAgent ) userAgent = g_conf.m_spiderUserAgent;

        // accept only these
        char *accept = "*/*";
        /*
        "text/html, "
        "text/plain, "
        "text/xml, "
        "application/pdf, "
        "application/msword, "
        "application/vnd.ms-excel, "
        "application/mspowerpoint, "
        "application/postscript";
        */

        char *cmd = "GET";
        if ( size == 0 ) cmd = "HEAD";
        if ( doPost    ) cmd = "POST";

        // . now use "Accept-Language: en" to tell servers we prefer english
        // . i removed the keep-alive connection since some connections close
        //   on non-200-ok http statuses and we think they're open since the
        //   close signal (read 0 bytes) may have been delayed
        char *acceptEncoding = "";
        // the scraper is getting back gzipped search results from goog,
        // so disable this for now
        // i am re-enabling now for testing...
        if ( g_conf.m_gzipDownloads )
                acceptEncoding = "Accept-Encoding: gzip;q=1.0\r\n";
        // i thought this might stop wikipedia from forcing gzip on us,
        // but it did not!
        //else
        //        acceptEncoding = "Accept-Encoding:\r\n";

        //char *p = m_buf;
        // . init the safebuf to point to this buffer in our class to avoid
        //   a potential alloc
        //m_reqBuf.setBuf ( m_buf , MAX_REQ_LEN , 0 , false , csUTF8 );
        m_reqBuf.purge();
        // indicate this is good
        m_reqBufValid = true;

        if ( size == 0 ) {
                // 1 for HEAD requests
                m_requestType = 1;
                m_reqBuf.safePrintf ( "%s %s %s\r\n"
                                      "Host: %s\r\n"
                                      "%s"
                                      "User-Agent: %s\r\n"
                                      //"Connection: Keep-Alive\r\n"
                                      "Accept-Language: en\r\n"
                                      //"Accept: */*\r\n\r\n"
                                      "Accept: %s\r\n" ,
                                      cmd , path , proto , host ,
                                      ims , userAgent , accept );
        }
        else if ( size != -1 )
                m_reqBuf.safePrintf ( "%s %s %s\r\n"
                                      "Host: %s\r\n"
                                      "%s"
                                      "User-Agent: %s\r\n"
                                      //"Connection: Keep-Alive\r\n"
                                      "Accept-Language: en\r\n"
                                      //"Accept: */*\r\n"
                                      "Accept: %s\r\n"
                                      "Range: bytes=%li-%li\r\n" ,
                                      cmd , path , proto , host ,
                                      ims , userAgent , accept ,
                                      offset , offset + size );
        else if ( offset > 0 && size == -1 )
                m_reqBuf.safePrintf ( "%s %s %s\r\n"
                                      "Host: %s\r\n"
                                      "%s"
                                      "User-Agent: %s\r\n"
                                      //"Connection: Keep-Alive\r\n"
                                      "Accept-Language: en\r\n"
                                      //"Accept: */*\r\n"
                                      "Accept: %s\r\n"
                                      "Range: bytes=%li-\r\n" ,
                                      cmd , path , proto , host ,
                                      ims , userAgent , accept ,
                                      offset );
        // Wget's request:
        // GET / HTTP/1.0\r\nUser-Agent: Wget/1.10.2\r\nAccept: */*\r\nHost: 127.0.0.1:8000\r\nConnection: Keep-Alive\r\n\r\n
        // firefox's request:
        // GET /master?c=main HTTP/1.1\r\nHost: 10.5.1.203:8000\r\nUser-Agent: Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2.7) Gecko/20100715 Ubuntu/10.04 (lucid) Firefox/3.6.7\r\nAccept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\nAccept-Language: en-us,en;q=0.5\r\nAccept-Encoding: gzip,deflate\r\nAccept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7\r\nKeep-Alive: 115\r\nConnection: keep-alive\r\nReferer: http://10.5.0.2:8002/qpmdw.html\r\nCookie: __utma=267617550.1103353528.1269214594.1273256655.1276103782.12; __utmz=267617550.1269214594.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); _incvi=qCffL7N8chFyJLwWrBDMbNz2Q3EWmAnf4uA; s_lastvisit=1269900225815; s_pers=%20s_getnr%3D1276103782254-New%7C1339175782254%3B%20s_nrgvo%3DNew%7C1339175782258%3B\r\n\r\n
        else {
                // until we fix if-modified-since, take it out
                //ims = "";
                //userAgent = "Wget/1.10.2";
                //userAgent = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2.7) Gecko/20100715 Ubuntu/10.04 (lucid) Firefox/3.6.7";
                //proto = "HTTP/1.0";
                m_reqBuf.safePrintf ( "%s %s %s\r\n"
                                      "User-Agent: %s\r\n"
                                      "Accept: */*\r\n"
                                      "Host: %s\r\n"
                                      "%s"
                                      //"Connection: Keep-Alive\r\n"
                                      //"Accept-Language: en\r\n"
                                      "%s" ,
                                      //"Accept: %s\r\n\r\n" ,
                                      //"\r\n" ,
                                      cmd , path , proto , userAgent ,
                                      host , ims , acceptEncoding );
                                      //accept );
        }

        // cookie here
        if ( cookie ) m_reqBuf.safePrintf ( "Cookie: %s\r\n" , cookie );

        // print Content-Length: if this is a post
        if ( postData ) {
                // dammit... recaptcha does not work without this!!!!
                m_reqBuf.safePrintf ( "Content-Type: "
                                      "application/x-www-form-urlencoded\r\n");
                long contentLen = strlen ( postData );
                m_reqBuf.safePrintf ( "Content-Length: %li\r\n" , contentLen );
                m_reqBuf.safePrintf ( "\r\n" );
                m_reqBuf.safePrintf ( "%s" , postData );
                // log it for debug
                //log("captcha: %s",m_buf);
        }
        else {
                m_reqBuf.safePrintf ( "\r\n" );
        }

        // set m_bufLen
        //m_bufLen = p - m_buf; //gbstrlen ( m_buf );
        // sanity check
        //if ( m_bufLen + 1 > MAX_REQ_LEN ) {
        //        log("build: HttpRequest buf is too small.");
        //        char *xx = NULL; *xx = 0;
        //}

        // restore the url buffer
        if ( pathEnd ) *pathEnd = '?';
        return true;
}
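// Editor's usage sketch (hypothetical, not part of the original source):
// how a caller might drive the set() above. It assumes the HttpRequest
// class, g_errno, log() and mstrerror() conventions of this codebase;
// the url and "HTTP/1.0" proto are placeholders.
static bool buildExampleRequests ( ) {
        HttpRequest r;
        char url[] = "http://example.com/index.html";
        // GET the whole document: size -1, offset 0
        if ( ! r.set ( url , 0 , -1 , 0 , NULL , "HTTP/1.0" , false , NULL ) ) {
                log("http: GET set failed: %s",mstrerror(g_errno));
                return false;
        }
        // HEAD request: size 0
        if ( ! r.set ( url , 0 , 0 , 0 , NULL , "HTTP/1.0" , false , NULL ) ) {
                log("http: HEAD set failed: %s",mstrerror(g_errno));
                return false;
        }
        // partial GET starting at offset 100 for roughly 100 bytes
        // (emits a "Range: bytes=100-200" header per the code above)
        if ( ! r.set ( url , 100 , 100 , 0 , NULL , "HTTP/1.0" , false , NULL ) ) {
                log("http: range set failed: %s",mstrerror(g_errno));
                return false;
        }
        return true;
}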
// . form an HTTP request
// . use size 0 for HEAD requests
// . use size -1 for GET whole doc requests
// . fill in your own offset/size for partial GET requests
// . returns false and sets g_errno on error
// . NOTE: http 1.1 uses Keep-Alive by default (use Connection: close to not)
bool HttpRequest::set ( char *url , int32_t offset , int32_t size ,
                        time_t ifModifiedSince ,
                        const char *userAgent ,
                        const char *proto ,
                        bool doPost ,
                        const char *cookieJar ,
                        const char *additionalHeader ,
                        // if posting something, how many bytes is it?
                        int32_t postContentLen ,
                        // are we sending the request through an http proxy?
                        // if so this will be non-zero
                        int32_t proxyIp ,
                        const char *proxyUsernamePwd ) {

        m_reqBufValid = false;

        int32_t hlen ;
        int32_t port = 80;
        const char *hptr = getHostFast ( url , &hlen , &port );
        char *path = getPathFast ( url );

        // . use the full url if sending to an http proxy
        // . HACK: do NOT do this if it is httpS because we end up using the
        //   http tunnel via the CONNECT cmd and the squid proxy will just
        //   forward/proxy the entire tcp packets.
        if ( proxyIp && strncmp(url,"https://",8) != 0 ) path = url;

        char *pathEnd = NULL;
        const char *postData = NULL;
        if ( doPost ) {
                pathEnd = strstr ( path , "?" );
                if ( pathEnd ) {
                        *pathEnd = '\0';
                        postData = pathEnd + 1;
                }
        }

        // if no legit host
        if ( hlen <= 0 || ! hptr ) { g_errno = EBADURL; return false; }

        // sanity check. port is only 16 bits
        if ( port > (int32_t)0xffff ) { g_errno = EBADURL; return false; }

        // return false and set g_errno if url too big
        //if ( url->getUrlLen() + 400 >= MAX_REQ_LEN ) {
        //        g_errno = EURLTOOBIG; return false; }

        // assume request type is a GET
        m_requestType = RT_GET; //0;

        // get the host NULL terminated
        char host[1024+8];
        //int32_t hlen = url->getHostLen();
        strncpy ( host , hptr , hlen );
        host [ hlen ] = '\0';

        // then port
        //uint16_t port = url->getPort();
        if ( port != 80 ) {
                sprintf ( host + hlen , ":%" PRIu32 , (uint32_t)port );
                hlen += strlen ( host + hlen );
        }

        // the if-modified-since field
        const char *ims = "";
#if 0
        char ibuf[64];
        if ( ifModifiedSince ) {
                struct tm tm_buf;
                char buf[64];
                // NOTE: asctime_r appends a \n
                snprintf ( ibuf , sizeof(ibuf) , "If-Modified-Since: %s UTC" ,
                           asctime_r ( gmtime_r ( &ifModifiedSince , &tm_buf ),
                                       buf ) );
                // get the length
                int32_t ilen = strlen ( ibuf );
                if ( ilen && ilen < (int32_t)sizeof(ibuf)-1 ) {
                        // hack off \n from asctime_r - replace with \r\n\0
                        ibuf [ ilen - 1 ] = '\r';
                        ibuf [ ilen     ] = '\n';
                        ibuf [ ilen + 1 ] = '\0';
                        // set ims to this string
                        ims = ibuf;
                }
        }
        // . until we fix if-modified-since, take it out
        // . seems like we are being called with it as true when we should
        //   not be
        ims = "";
#endif

        // . use the one in the conf file if the caller did not provide one
        // . this is usually Gigabot/1.0
        if ( ! userAgent ) userAgent = g_conf.m_spiderUserAgent;

        // accept only these
        const char *accept = "*/*";
        /*
        "text/html, "
        "text/plain, "
        "text/xml, "
        "application/pdf, "
        "application/msword, "
        "application/vnd.ms-excel, "
        "application/mspowerpoint, "
        "application/postscript";
        */

        const char *cmd = "GET";
        if ( size == 0 ) cmd = "HEAD";
        if ( doPost    ) cmd = "POST";

        // crap, we can't spider nyt.com if we are 1.0, so use 1.1 but also
        // send Connection: Close\r\n when making requests
        //proto = "HTTP/1.1";

        SafeBuf tmp;
        const char *up = "";
        if ( proxyUsernamePwd && proxyUsernamePwd[0] ) {
                tmp.safePrintf ( "Proxy-Authorization: Basic " );
                tmp.base64Encode ( proxyUsernamePwd ,
                                   strlen(proxyUsernamePwd) );
                tmp.safePrintf ( "\r\n" );
                up = tmp.getBufStart();
        }

        // . now use "Accept-Language: en" to tell servers we prefer english
        // . i removed the keep-alive connection since some connections close
        //   on non-200-ok http statuses and we think they're open since the
        //   close signal (read 0 bytes) may have been delayed
        const char *acceptEncoding = "";
        // the scraper is getting back gzipped search results from goog,
        // so disable this for now
        // i am re-enabling now for testing...
        if ( g_conf.m_gzipDownloads )
                acceptEncoding = "Accept-Encoding: gzip;q=1.0\r\n";
        // i thought this might stop wikipedia from forcing gzip on us,
        // but it did not!
        //else
        //        acceptEncoding = "Accept-Encoding:\r\n";

        //char *p = m_buf;
        // . init the safebuf to point to this buffer in our class to avoid
        //   a potential alloc
        //m_reqBuf.setBuf ( m_buf , MAX_REQ_LEN , 0 , false , csUTF8 );
        m_reqBuf.purge();
        // indicate this is good
        m_reqBufValid = true;

        if ( size == 0 ) {
                // 1 for HEAD requests
                m_requestType = RT_HEAD;
                m_reqBuf.safePrintf ( "%s %s %s\r\n"
                                      "Host: %s\r\n"
                                      "%s"
                                      "User-Agent: %s\r\n"
                                      "Connection: Close\r\n"
                                      //"Connection: Keep-Alive\r\n"
                                      "Accept-Language: en\r\n"
                                      //"Accept: */*\r\n\r\n"
                                      "Accept: %s\r\n"
                                      "%s" ,
                                      cmd , path , proto , host ,
                                      ims , userAgent , accept , up );
        }
        else if ( size != -1 )
                m_reqBuf.safePrintf ( "%s %s %s\r\n"
                                      "Host: %s\r\n"
                                      "%s"
                                      "User-Agent: %s\r\n"
                                      "Connection: Close\r\n"
                                      //"Connection: Keep-Alive\r\n"
                                      "Accept-Language: en\r\n"
                                      //"Accept: */*\r\n"
                                      "Accept: %s\r\n"
                                      "Range: bytes=%" PRId32 "-%" PRId32 "\r\n"
                                      "%s" ,
                                      cmd , path , proto , host ,
                                      ims , userAgent , accept ,
                                      offset , offset + size , up );
        else if ( offset > 0 ) // size is -1
                m_reqBuf.safePrintf ( "%s %s %s\r\n"
                                      "Host: %s\r\n"
                                      "%s"
                                      "User-Agent: %s\r\n"
                                      "Connection: Close\r\n"
                                      //"Connection: Keep-Alive\r\n"
                                      "Accept-Language: en\r\n"
                                      //"Accept: */*\r\n"
                                      "Accept: %s\r\n"
                                      "Range: bytes=%" PRId32 "-\r\n"
                                      "%s" ,
                                      cmd , path , proto , host ,
                                      ims , userAgent , accept ,
                                      offset , up );
        // Wget's request:
        // GET / HTTP/1.0\r\nUser-Agent: Wget/1.10.2\r\nAccept: */*\r\nHost: 127.0.0.1:8000\r\nConnection: Keep-Alive\r\n\r\n
        // firefox's request:
        // GET /master?c=main HTTP/1.1\r\nHost: 10.5.1.203:8000\r\nUser-Agent: Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2.7) Gecko/20100715 Ubuntu/10.04 (lucid) Firefox/3.6.7\r\nAccept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\nAccept-Language: en-us,en;q=0.5\r\nAccept-Encoding: gzip,deflate\r\nAccept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7\r\nKeep-Alive: 115\r\nConnection: keep-alive\r\nReferer: http://10.5.0.2:8002/qpmdw.html\r\nCookie: __utma=267617550.1103353528.1269214594.1273256655.1276103782.12; __utmz=267617550.1269214594.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); _incvi=qCffL7N8chFyJLwWrBDMbNz2Q3EWmAnf4uA; s_lastvisit=1269900225815; s_pers=%20s_getnr%3D1276103782254-New%7C1339175782254%3B%20s_nrgvo%3DNew%7C1339175782258%3B\r\n\r\n
        else {
                // until we fix if-modified-since, take it out
                //ims = "";
                //userAgent = "Wget/1.10.2";
                //userAgent = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2.7) Gecko/20100715 Ubuntu/10.04 (lucid) Firefox/3.6.7";
                //proto = "HTTP/1.0";
                m_reqBuf.safePrintf ( "%s %s %s\r\n"
                                      "User-Agent: %s\r\n"
                                      "Accept: */*\r\n"
                                      "Host: %s\r\n"
                                      "%s"
                                      "Connection: Close\r\n"
                                      //"Connection: Keep-Alive\r\n"
                                      //"Accept-Language: en\r\n"
                                      "%s"
                                      "%s" ,
                                      //"Accept: %s\r\n\r\n" ,
                                      //"\r\n" ,
                                      cmd , path , proto , userAgent ,
                                      host , ims , acceptEncoding , up );
                                      //accept );
        }

        if ( additionalHeader )
                m_reqBuf.safePrintf ( "%s\r\n" , additionalHeader );

        // cookie here
        if ( cookieJar ) {
                HttpMime::addCookieHeader ( cookieJar , url , &m_reqBuf );
        }

        // print Content-Type: if this is a post
        if ( postData ) {
                // dammit... recaptcha does not work without this!!!!
                m_reqBuf.safePrintf ( "Content-Type: "
                                      "application/x-www-form-urlencoded\r\n");
        }

        // we need a Content-Length: if doing a post, even if postData is NULL
        if ( doPost ) {
                int32_t contentLen = 0;
                if ( postData ) contentLen = strlen ( postData );
                // this overrides if provided. -1 is the default
                if ( postContentLen >= 0 ) contentLen = postContentLen;
                m_reqBuf.safePrintf ( "Content-Length: %" PRId32 "\r\n" ,
                                      contentLen );
                m_reqBuf.safePrintf ( "\r\n" );
                if ( postData ) m_reqBuf.safePrintf ( "%s" , postData );
                // log it for debug
                //log("captcha: %s",m_buf);
        }

        if ( ! doPost ) { // ! postData
                m_reqBuf.safePrintf ( "\r\n" );
        }

        // restore the url buffer
        if ( pathEnd ) *pathEnd = '?';
        return true;
}
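// Editor's sketch (hypothetical, not part of the original source): invoking
// the newer set() above to POST through an http proxy with basic auth. The
// url, proxy address and credentials are placeholders; a postContentLen of
// -1 means "derive Content-Length from strlen(postData)", per the code
// above. atoip() is the ip-parsing helper already referenced in this
// codebase.
static bool buildProxyPostExample ( HttpRequest *r ) {
        // the part after '?' becomes the POST body when doPost is true
        char url[] = "http://example.com/form?x=1&y=2";
        return r->set ( url ,
                        0 ,                   // offset
                        -1 ,                  // size: whole doc
                        0 ,                   // ifModifiedSince: none
                        NULL ,                // userAgent: use g_conf default
                        "HTTP/1.0" ,          // proto
                        true ,                // doPost
                        NULL ,                // cookieJar
                        NULL ,                // additionalHeader
                        -1 ,                  // postContentLen: use strlen
                        atoip("10.0.0.1",8) , // proxyIp (placeholder)
                        "user:pwd" );         // proxyUsernamePwd (placeholder)
}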
void Blaster::gotDoc3 ( void *state , TcpSocket *s ) {
        StateBD2 *st2 = (StateBD2 *)state;
        StateBD  *st  = st2->m_st;
        if ( ! s ) {
                log(LOG_WARN,"blaster: Got a null s in gotDoc3. "
                    "Happened because ip could not be found");
                st->m_numUrlDocsReceived++;
                // free StateBD2
                mdelete ( st2 , sizeof(StateBD2) , "Blaster4" );
                if ( st->m_numUrlDocsReceived == st->m_numUrlDocsSent ) {
                        m_launched--;
                        // free StateBD
                        freeStateBD ( st );
                }
                return;
        }

        // bail if got cut off
        if ( s->m_readOffset == 0 ) {
                log("blasterDiff: lost the request in gotDoc3");
                st->m_numUrlDocsReceived++;
                // free StateBD2
                mdelete ( st2 , sizeof(StateBD2) , "Blaster4" );
                if ( st->m_numUrlDocsReceived == st->m_numUrlDocsSent ) {
                        m_launched--;
                        // free StateBD
                        freeStateBD ( st );
                }
                return;
        }

        char *reply = s->m_readBuf ;
        long  size  = s->m_readOffset;
        HttpMime mime;
        mime.set ( reply , size , NULL );

        long httpStatus = mime.getHttpStatus();
        if ( httpStatus == 404 ) {
                if ( m_verbose )
                        log(LOG_WARN,"blaster: The page was not found - 404");
                st->m_numUrlDocsReceived++;
        }
        // if the url is a redirect, check if it is still http (it might have
        // become https or something else, in which case we aren't going to
        // follow it)
        else if ( httpStatus >= 300 ) {
                Url *u = mime.getLocationUrl();
                // if the max number of redirects is done, bail
                if ( ! st2->m_numRedirects-- ) {
                        log(LOG_WARN,"blaster: Max number of redirects "
                            "reached.");
                        st->m_numUrlDocsReceived++;
                }
                // check if it is still http (might have become https or
                // something else, in which case we aren't going to follow it)
                else if ( ! u->isHttp() ) {
                        log(LOG_WARN,"blaster: Redirection not for an http "
                            "page for url %s",u->getUrl());
                        st->m_numUrlDocsReceived++;
                }
                // sometimes idiots don't supply us with a Location: mime
                else if ( u->getUrlLen() == 0 ) {
                        log(LOG_WARN,"blaster: Redirect url is of 0 length");
                        st->m_numUrlDocsReceived++;
                }
                else {
                        // . I'm not checking as yet if the redirect url is
                        //   the same as the earlier url, as I've set the max
                        //   number of redirs to 6. Now let's get the redirect
                        //   url.
                        // . do not increase numUrlDocsReceived because this
                        //   wrapper will be called back for the page
                        if ( m_verbose )
                                log(LOG_WARN,"blaster: Downloading redirect"
                                    " %s",u->getUrl());
                        // changing the url to the new place
                        //st2->m_url.set(u,false);
                        st2->m_url = u->getUrl();
                        bool status = g_httpServer.getDoc ( st2->m_url , // url
                                                    0 ,  // ip
                                                    0 ,  // offset
                                                    -1 , // size
                                                    0 ,
                                                    st2 , // state
                                                    gotDocWrapper3 ,
                                                    60*1000 , // timeout
                                                    0 , // proxy ip
                                                    0 , // proxy port
                                                    30*1024*1024 , // maxLen
                                                    30*1024*1024 );
                        // if it blocked, st2 is still in use as the state
                        // for gotDocWrapper3, so do NOT fall through and
                        // free it below
                        if ( ! status ) return;
                        // if not blocked, there is an error
                        st->m_numUrlDocsReceived++;
                }
        }
        else if ( httpStatus < 200 ) {
                log(LOG_WARN,"blaster: Bad HTTP status %li",httpStatus);
                st->m_numUrlDocsReceived++;
        }
        else {
                // . this means the page is still there, somewhere. status
                //   must be 200. so find it on server2. that server is
                //   assumed to be running an instance of gb, so it shall be
                //   given the query in the format
                //   'xxxxx.com/search?q=url%3Ayyyy&code=gbmonitor'. then
                //   check if we have the exact page in the search results
                //   that have come back.
                // . so now the problem is that we do not know which url has
                //   been got, so i get the location url from the mime.
                // . the site name is in st->m_u2.getSite(), but copy it
                //   because it is not nulled
                char tmp[1024];
                //char site[1024]; // how long could a site be?
                long  siteLen = 0;
                char *site = getHostFast ( st->m_u2 , &siteLen );
                char  c = site[siteLen];
                site[siteLen] = 0;
                //strncpy(site,st->m_u2.getSite(),st->m_u2.getSiteLen());
                //site[st->m_u2.getSiteLen()]='\0';
                sprintf ( tmp , "%ssearch?"
                          "code=gbmonitor&"
                          "q=url%%3A%s" , site , st2->m_url );
                site[siteLen] = c;
                if ( m_verbose )
                        log(LOG_WARN,"blaster: Checking %s",tmp);
                //Url u;
                //u.set(tmp,gbstrlen(tmp));
                // now get the doc
                bool status = g_httpServer.getDoc ( tmp , //&u,
                                                    0 ,  // ip
                                                    0 ,  // offset
                                                    -1 , // size
                                                    0 ,
                                                    st , // state
                                                    gotDocWrapper4 ,
                                                    60*1000 , // timeout
                                                    0 , //atoip("66.154.102.20",13), proxy ip
                                                    0 , //3128, proxy port
                                                    30*1024*1024 ,
                                                    30*1024*1024 );
                // continue if it blocked
                // if not blocked, there is an error. since we are getting
                // the doc from a gigablast server, report it
                if ( status ) {
                        st->m_numUrlDocsReceived++;
                        log(LOG_WARN,"blaster: could not get back "
                            "%s from server in gotDoc3",tmp);
                }
        }

        // if we reached here, that means all the url redirects have been
        // finished, and there is no need for st2. free it
        mdelete ( st2 , sizeof(StateBD2) , "Blaster4" );
        if ( st->m_numUrlDocsReceived == st->m_numUrlDocsSent ) {
                m_launched--;
                // free StateBD
                freeStateBD ( st );
        }
        return;
}
// . returns false if blocked, true otherwise
// . returns true on error and sets g_errno
bool SiteGetter::getSiteList ( ) {

 top:
        // . setSite() will return true and set g_errno on error, and returns
        //   false if it blocked adding a tag, which will call the callback
        //   once the tag is added
        // . stop at this point
        if ( m_pathDepth >= 3 ) return setSite();
        // or if no more
        if ( m_pathDepth >= m_maxPathDepth ) return setSite();

        // . make the termid
        // . but here it is based on "m_pathDepth" which ranges from 1 to N
        // . if m_pathDepth==0 use "www.xyz.com" as site
        // . if m_pathDepth==1 use "www.xyz.com/foo/" as site ...
        char *pend = getPathEnd ( m_url , m_pathDepth );

        // hash up to that
        //char *host = m_u.getHost();
        char *host = getHostFast ( m_url , NULL );

        // hash the prefix first to match XmlDoc::hashNoSplit()
        char *prefix = "siteterm";
        // hash that and we will incorporate it to match XmlDoc::hashNoSplit()
        int64_t ph = hash64 ( prefix , gbstrlen(prefix) );

        // . this should match basically what is in XmlDoc.cpp::hash()
        // . and this now does not include pages that have no outlinks
        //   "underneath" them.
        int64_t termId = hash64 ( host , pend - host , ph ) & TERMID_MASK;

        // get all pages that have this as their termid!
        key144_t start ;
        key144_t end ;
        g_posdb.makeStartKey ( &start , termId );
        g_posdb.makeEndKey   ( &end   , termId );

        // . now see how many urls are at this path depth from this hostname
        // . if it is a huge # then we know they are all subsites!
        //   because it is too bushy to be anything else
        // . i'd say 100 nodes is good enough to qualify as a homestead site
        int32_t minRecSizes = 5000000;

        // get the group this list is in
        //uint32_t gid ;
        //gid = getGroupId ( RDB_POSDB , (char *)&start , false ); // split?
        //uint32_t shardNum ;
        //shardNum = getShardNum ( RDB_POSDB , (char *)&start , false );
        // i guess this is split by termid and not docid????
        int32_t shardNum = g_hostdb.getShardNumByTermId ( &start );

        // we need a group #. the column #.
        //int32_t split = g_hostdb.getGroupNum ( gid );

        // shortcut
        Msg0 *m = &m_msg0;

        // get the list. returns false if blocked.
        if ( ! m->getList ( -1                 , // hostId
                            0                  , // ip
                            0                  , // port
                            0                  , // maxCacheAge
                            false              , // addToCache
                            RDB_POSDB          ,
                            m_collnum          ,
                            &m_list            ,
                            (char *)&start     ,
                            (char *)&end       ,
                            minRecSizes        ,
                            this               ,
                            gotSiteListWrapper ,
                            m_niceness         , // MAX_NICENESS
                            // default parms follow
                            true    , // doErrorCorrection?
                            true    , // includeTree?
                            true    , // doMerge?
                            -1      , // firstHostId
                            0       , // startFileNum
                            -1      , // numFiles
                            999999  , // timeout
                            -1      , // syncPoint
                            -1      , // preferLocalReads
                            NULL    , // msg5
                            NULL    , // msg5b
                            false   , // isRealMerge?
                            true    , // allowPageCache?
                            false   , // forceLocalIndexdb?
                            false   , // doIndexdbSplit? nosplit
                            shardNum ) ) //split ))
                // return false if this blocked
                return false;

        // return false if gotSiteList() blocked
        if ( ! gotSiteList() ) return false;
        // error?
        if ( g_errno ) return true;
        // or all done
        if ( m_allDone ) return true;
        // otherwise, try the next path component!
        goto top;
}
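// Editor's sketch (hypothetical, not part of the original source): the site
// term hashed by getSiteList() above, isolated for clarity. It assumes
// hash64(), getPathEnd(), getHostFast() and TERMID_MASK behave as they are
// used in this file.
static int64_t exampleSiteTermId ( char *url , int32_t pathDepth ) {
        // end of the site path, e.g. depth 0 -> "www.xyz.com",
        // depth 1 -> "www.xyz.com/foo/"
        char *pend = getPathEnd ( url , pathDepth );
        char *host = getHostFast ( url , NULL );
        // hash the "siteterm" prefix first, to match XmlDoc::hashNoSplit()
        int64_t ph = hash64 ( "siteterm" , 8 );
        // then fold in the host plus the path-depth component
        return hash64 ( host , pend - host , ph ) & TERMID_MASK;
}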