// for procog
bool sendPageAnalyze ( TcpSocket *s , HttpRequest *r ) {

    // make a state
    State8 *st;
    try { st = new (State8); }
    catch ( ... ) {
        g_errno = ENOMEM;
        log("PageParser: new(%i): %s",
            (int)sizeof(State8),mstrerror(g_errno));
        return g_httpServer.sendErrorReply(s,500, mstrerror(g_errno));
    }
    mnew ( st , sizeof(State8) , "PageParser" );
    st->m_freeIt = true;
    st->m_state = NULL;
    //st->m_callback = callback;
    //st->m_q = q;
    //st->m_termFreqs = termFreqs;
    //st->m_termFreqWeights = termFreqWeights;
    //st->m_affWeights = affWeights;
    //st->m_total = (score_t)-1;
    st->m_indexCode = 0;
    st->m_blocked = false;
    st->m_didRootDom = false;
    st->m_didRootWWW = false;
    st->m_wasRootDom = false;
    st->m_u = NULL;

    // password, too
    long pwdLen = 0;
    char *pwd = r->getString ( "pwd" , &pwdLen );
    if ( pwdLen > 31 ) pwdLen = 31;
    if ( pwdLen > 0 ) strncpy ( st->m_pwd , pwd , pwdLen );
    st->m_pwd[pwdLen]='\0';

    // save socket ptr
    st->m_s = s;
    st->m_r.copy ( r );

    // get the collection
    char *coll = r->getString ( "c" , &st->m_collLen ,NULL /*default*/);
    if ( ! coll ) coll = g_conf.m_defaultColl;
    if ( ! coll ) coll = "main";
    long collLen = gbstrlen(coll);
    if ( collLen > MAX_COLL_LEN ) return sendErrorReply ( st , ENOBUFS );
    strcpy ( st->m_coll , coll );

    // version to use, if -1 use latest
    st->m_titleRecVersion = r->getLong("version",-1);
    if ( st->m_titleRecVersion == -1 )
        st->m_titleRecVersion = TITLEREC_CURRENT_VERSION;

    // default to 0 if not provided
    st->m_hopCount = r->getLong("hc",0);

    long old = r->getLong ( "old", 0 );

    // set query
    long qlen;
    char *qs = r->getString("q",&qlen,NULL);
    if ( qs ) st->m_tq.set2 ( qs , langUnknown , true );

    // url will override docid if given
    st->m_docId = r->getLongLong ("d",-1);
    st->m_docId = r->getLongLong ("docid",st->m_docId);

    long ulen;
    char *u = st->m_r.getString("u",&ulen,NULL);
    if ( ! u ) u = st->m_r.getString("url",&ulen,NULL);
    if ( ! u && st->m_docId == -1LL )
        return sendErrorReply ( st , EBADREQUEST );

    // set url in state class (may have length 0)
    //if ( u ) st->m_url.set ( u , ulen );
    //st->m_urlLen = ulen;
    st->m_u = u;
    st->m_ulen = 0;
    if ( u ) st->m_ulen = gbstrlen(u);

    // should we recycle link info?
    st->m_recycle   = r->getLong("recycle",1);
    st->m_recycle2  = r->getLong("recycleimp",0);
    st->m_render    = r->getLong("render"   ,0);
    st->m_recompute = r->getLong("recompute",0);
    // for quality computation... takes way longer cuz we have to
    // lookup the IP address of every outlink, so we can get its root
    // quality using Msg25 which needs to filter out voters from that IP
    // range.
    st->m_oips      = r->getLong("oips"     ,0);
    //st->m_page = r->getLong("page",1);

    long  linkInfoLen  = 0;
    // default is NULL
    char *linkInfoColl = r->getString ( "oli" , &linkInfoLen, NULL );
    if ( linkInfoColl ) strcpy ( st->m_linkInfoColl , linkInfoColl );
    else st->m_linkInfoColl[0] = '\0';

    // set the flag in our SafeBuf class so that Words.cpp knows to show
    // html or html source depending on this value
    //st->m_xbuf.m_renderHtml = st->m_render;

    // should we use the old title rec?
    st->m_old = old;

    // are we coming from a local machine?
    st->m_isLocal = r->isLocal();

    //no more setting the default root quality to 30, instead if we do not
    // know it setting it to -1
    st->m_rootQuality=-1;

    // header
    //xbuf->safePrintf("<meta http-equiv=\"Content-Type\" "
    //                 "content=\"text/html; charset=utf-8\">\n");

    XmlDoc *xd = &st->m_xd;

    long isXml = r->getLong("xml",0);

    // if got docid, use that
    if ( st->m_docId != -1 ) {
        if ( ! xd->set3 ( st->m_docId,
                          st->m_coll,
                          0 ) ) // niceness
            // return error reply if g_errno is set
            return sendErrorReply ( st , g_errno );
        // make this our callback in case something blocks
        xd->setCallback ( st , gotXmlDoc );
        xd->m_pbuf = &st->m_wbuf;
        // reset this flag
        st->m_donePrinting = false;
        // . set xd from the old title rec if recycle is true
        // . can also use XmlDoc::m_loadFromOldTitleRec flag
        //if ( st->m_recycle ) xd->m_recycleContent = true;
        xd->m_recycleContent = true; // force this on
        //xd->m_useSiteLinkBuf = true;
        //xd->m_usePageLinkBuf = true;
        if ( isXml ) xd->m_printInXml = true;
        // now tell it to fetch the old title rec
        if ( ! xd->loadFromOldTitleRec () )
            // return false if this blocks
            return false;
        return gotXmlDoc ( st );
    }

    // set this up
    SpiderRequest sreq;
    sreq.reset();
    if ( st->m_u ) strcpy(sreq.m_url,st->m_u);
    long firstIp = hash32n(st->m_u);
    if ( firstIp == -1 || firstIp == 0 ) firstIp = 1;
    // parentdocid of 0
    sreq.setKey( firstIp, 0LL, false );
    sreq.m_isPageParser  = 1;
    sreq.m_hopCount      = st->m_hopCount;
    sreq.m_hopCountValid = 1;
    sreq.m_fakeFirstIp   = 1;
    sreq.m_firstIp       = firstIp;
    Url nu;
    nu.set(sreq.m_url);
    sreq.m_domHash32  = nu.getDomainHash32();
    sreq.m_siteHash32 = nu.getHostHash32();

    // . get provided content if any
    // . will be NULL if none provided
    // . "content" may contain a MIME
    long  contentLen = 0;
    char *content = r->getString ( "content" , &contentLen , NULL );
    // is the "content" url-encoded? default is true.
    bool contentIsEncoded = true;
    // mark doesn't like to url-encode his content
    if ( ! content ) {
        content    = r->getUnencodedContent    ();
        contentLen = r->getUnencodedContentLen ();
        contentIsEncoded = false;
    }
    // ensure null
    if ( contentLen == 0 ) content = NULL;

    //uint8_t contentType = CT_HTML;
    //if ( isXml ) contentType = CT_XML;
    long ctype = r->getLong("ctype",CT_HTML);

    // . use the enormous power of our new XmlDoc class
    // . this returns false if blocked
    if ( ! xd->set4 ( &sreq       ,
                      NULL        ,
                      st->m_coll  ,
                      // we need this so the term table is set!
                      &st->m_wbuf , // XmlDoc::m_pbuf
                      0, // try 0 now! 1 ,//PP_NICENESS ))
                      content ,
                      false, // deletefromindex
                      0, // forced ip
                      ctype ))
        // return error reply if g_errno is set
        return sendErrorReply ( st , g_errno );

    // make this our callback in case something blocks
    xd->setCallback ( st , gotXmlDoc );

    // reset this flag
    st->m_donePrinting = false;

    // prevent a core here in the event we download the page content
    xd->m_crawlDelayValid = true;
    xd->m_crawlDelay      = 0;

    // . set xd from the old title rec if recycle is true
    // . can also use XmlDoc::m_loadFromOldTitleRec flag
    //if ( st->m_recycle ) xd->m_recycleContent = true;
    // only recycle if docid is given!!
    if ( st->m_recycle ) xd->m_recycleContent = true;
    // force this on
    //xd->m_useSiteLinkBuf = true;
    //xd->m_usePageLinkBuf = true;
    if ( isXml ) xd->m_printInXml = true;

    return gotXmlDoc ( st );
}
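// Illustrative sketch only (the helper name below is made up and nothing
// calls it): sendPageAnalyze() above and the parser paths below fake a
// "first IP" for the SpiderRequest by hashing the URL with hash32n() and
// remapping 0 and -1, which are not legal key values, to 1.
static long makeFakeFirstIpSketch ( char *url ) {
    long firstIp = hash32n ( url );
    if ( firstIp == -1 || firstIp == 0 ) firstIp = 1;
    return firstIp;
}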
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool sendPageGet ( TcpSocket *s , HttpRequest *r ) {
    // get the collection
    long  collLen = 0;
    char *coll    = r->getString("c",&collLen);
    if ( ! coll || ! coll[0] ) {
        //coll = g_conf.m_defaultColl;
        coll = g_conf.getDefaultColl( r->getHost(), r->getHostLen() );
        collLen = gbstrlen(coll);
    }
    // ensure collection not too big
    if ( collLen >= MAX_COLL_LEN ) {
        g_errno = ECOLLTOOBIG;
        return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
    }
    // get the collection rec
    CollectionRec *cr = g_collectiondb.getRec ( coll );
    if ( ! cr ) {
        g_errno = ENOCOLLREC;
        log("query: Archived copy retrieval failed. "
            "No collection record found for "
            "collection \"%s\".",coll);
        return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
    }
    // does this collection ban this IP?
    if ( ! cr->hasSearchPermission ( s ) ) {
        g_errno = ENOPERM;
        //log("PageGet::sendDynamicReply0: permission denied for %s",
        //    iptoa(s->m_ip) );
        g_msg = " (error: permission denied)";
        return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
    }
    // . get fields from cgi field of the requested url
    // . get the search query
    long qlen = 0;
    char *q = r->getString ( "q" , &qlen , NULL /*default*/);
    // ensure query not too big
    if ( qlen >= MAX_QUERY_LEN-1 ) {
        g_errno = EQUERYTOOBIG;
        return g_httpServer.sendErrorReply (s,500 ,mstrerror(g_errno));
    }
    // the docId
    long long docId = r->getLongLong ( "d" , 0LL /*default*/ );
    // get url
    char *url = r->getString ( "u",NULL);
    if ( docId == 0 && ! url ) {
        g_errno = EMISSINGINPUT;
        return g_httpServer.sendErrorReply (s,500 ,mstrerror(g_errno));
    }

    // . should we do a sequential lookup?
    // . we need to match summary here so we need to know this
    //bool seq = r->getLong ( "seq" , false );
    // restrict to root file?
    bool rtq = r->getLong ( "rtq" , false );

    // . get the titleRec
    // . TODO: redirect client to a better http server to save bandwidth
    State2 *st ;
    try { st = new (State2); }
    catch (... ) {
        g_errno = ENOMEM;
        log("PageGet: new(%i): %s",
            (int)sizeof(State2),mstrerror(g_errno));
        return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
    }
    mnew ( st , sizeof(State2) , "PageGet1" );

    // save the socket and if Host: is local in the Http request Mime
    st->m_socket  = s;
    st->m_isAdmin = g_conf.isCollAdmin ( s , r );
    st->m_isLocal = r->isLocal();
    st->m_docId   = docId;
    st->m_printed = false;

    // include header ... "this page cached by Gigablast on..."
    st->m_includeHeader     = r->getLong ("ih"    , true );
    st->m_includeBaseHref   = r->getLong ("ibh"   , false );
    st->m_queryHighlighting = r->getLong ("qh"    , true );
    st->m_strip             = r->getLong ("strip" , 0 );
    st->m_clickAndScroll    = r->getLong ("cas"   , true );
    st->m_cnsPage           = r->getLong ("cnsp"  , true );

    char *langAbbr = r->getString("qlang",NULL);
    st->m_langId = langUnknown;
    if ( langAbbr ) {
        uint8_t langId = getLangIdFromAbbr ( langAbbr );
        st->m_langId = langId;
    }

    strncpy ( st->m_coll , coll , MAX_COLL_LEN+1 );

    // store query for query highlighting
    st->m_netTestResults = r->getLong ("rnettest", false );
    //if( st->m_netTestResults ) {
    //    mdelete ( st , sizeof(State2) , "PageGet1" );
    //    delete ( st );
    //    return sendPageNetResult( s );
    //}

    if ( q && qlen > 0 ) strcpy ( st->m_q , q );
    else                 st->m_q[0] = '\0';
    st->m_qlen = qlen;
    //st->m_seq     = seq;
    st->m_rtq      = rtq;
    st->m_boolFlag = r->getLong ("bq", 2 /*default is 2*/ );
    st->m_isBanned  = false;
    st->m_noArchive = false;
    st->m_socket    = s;
    st->m_format    = r->getReplyFormat();
    // default to 0 niceness
    st->m_niceness  = 0;
    st->m_r.copy ( r );
    //st->m_cr = cr;
    st->m_printDisclaimer = true;
    if ( st->m_cnsPage )
        st->m_printDisclaimer = false;
    if ( st->m_strip ) // ! st->m_evbits.isEmpty() )
        st->m_printDisclaimer = false;

    // should we cache it?
    char useCache = r->getLong ( "usecache" , 1 );
    char rcache   = r->getLong ( "rcache"   , 1 );
    char wcache   = r->getLong ( "wcache"   , 1 );
    long cacheAge = r->getLong ( "cacheAge" , 60*60 ); // default one hour
    if ( useCache == 0 ) { cacheAge = 0; wcache = 0; }
    if ( rcache   == 0 )   cacheAge = 0;

    // . fetch the TitleRec
    // . a max cache age of 0 means not to read from the cache
    XmlDoc *xd = &st->m_xd;

    // url based?
    if ( url ) {
        SpiderRequest sreq;
        sreq.reset();
        strcpy(sreq.m_url, url );
        sreq.setDataSize();
        // this returns false if "coll" is invalid
        if ( ! xd->set4 ( &sreq , NULL , coll , NULL , st->m_niceness ) )
            goto hadSetError;
    }
    // . when getTitleRec() is called it will load the old one
    //   since XmlDoc::m_setFromTitleRec will be true
    // . niceness is 0
    // . use st->m_coll since XmlDoc just points to it!
    // . this returns false if "coll" is invalid
    else if ( ! xd->set3 ( docId , st->m_coll , 0 ) ) {
    hadSetError:
        mdelete ( st , sizeof(State2) , "PageGet1" );
        delete ( st );
        g_errno = ENOMEM;
        log("PageGet: set3: %s", mstrerror(g_errno));
        return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
    }

    // if it blocks while it loads title rec, it will re-call this routine
    xd->setCallback ( st , processLoopWrapper );

    // good to go!
    return processLoop ( st );
}
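// Example request handled by sendPageGet() above (the path, collection and
// docid are hypothetical; only the cgi parm names come from the code above):
//
//   GET /get?c=main&d=123456789&strip=1&ih=0&qh=1
//
// fetches the archived copy of docid 123456789 from collection "main",
// stripped, without the "cached by Gigablast" header, with query-term
// highlighting left on. Passing u=<url> instead of d=<docid> makes the code
// above build a SpiderRequest and use XmlDoc::set4() rather than set3().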
// . a new interface so Msg3b can call this with "s" set to NULL
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool sendPageParser2 ( TcpSocket   *s ,
                       HttpRequest *r ,
                       State8      *st ,
                       long long    docId ,
                       Query       *q , // in query term space, not imap space
                       long long   *termFreqs ,       // in imap space
                       float       *termFreqWeights , // in imap space
                       float       *affWeights ,
                       void        *state ,
                       void       (* callback)(void *state) ) {

    //log("parser: read sock=%li",s->m_sd);

    // might be a simple request to add something to a validated.*.txt file
    // from XmlDoc::print() or XmlDoc::validateOutput()
    char *add = r->getString("add",NULL);
    //long long uh64 = r->getLongLong("uh64",0LL);
    char *uh64str = r->getString("uh64",NULL);
    //char *divTag = r->getString("div",NULL);
    if ( uh64str ) {
        // convert add to number
        long addNum = 0;
        if ( to_lower_a(add[0])=='t' ) // "true" or "false"?
            addNum = 1;
        // convert it. skip beginning "str" inserted to prevent
        // javascript from messing with the long long since it
        // was rounding it!
        //long long uh64 = atoll(uh64str);//+3);
        // urldecode that
        //long divTagLen = gbstrlen(divTag);
        //long newLen = urlDecode ( divTag , divTag , divTagLen );
        // null term?
        //divTag[newLen] = '\0';
        // do it. this is defined in XmlDoc.cpp
        //addCheckboxSpan ( uh64 , divTag , addNum );
        // make basic reply
        char *reply;
        reply = "HTTP/1.0 200 OK\r\n"
                "Connection: Close\r\n";
        // that is it! send a basic reply ok
        bool status = g_httpServer.sendDynamicPage( s ,
                                                    reply,
                                                    gbstrlen(reply),
                                                    -1, //cachetime
                                                    false ,//postreply?
                                                    NULL, //ctype
                                                    -1 , //httpstatus
                                                    NULL,//cookie
                                                    "utf-8");
        return status;
    }

    // make a state
    if (   st ) st->m_freeIt = false;
    if ( ! st ) {
        try { st = new (State8); }
        catch ( ... ) {
            g_errno = ENOMEM;
            log("PageParser: new(%i): %s",
                (int)sizeof(State8),mstrerror(g_errno));
            return g_httpServer.sendErrorReply(s,500, mstrerror(g_errno));
        }
        mnew ( st , sizeof(State8) , "PageParser" );
        st->m_freeIt = true;
    }

    // msg3b uses this to get a score from the query
    st->m_state           = state;
    st->m_callback        = callback;
    st->m_q               = q;
    st->m_termFreqs       = termFreqs;
    st->m_termFreqWeights = termFreqWeights;
    st->m_affWeights      = affWeights;
    //st->m_total         = (score_t)-1;
    st->m_indexCode       = 0;
    st->m_blocked         = false;
    st->m_didRootDom      = false;
    st->m_didRootWWW      = false;
    st->m_wasRootDom      = false;
    st->m_u               = NULL;
    st->m_recompute       = false;
    //st->m_url.reset();

    // do not allow more than one to be launched at a time if in
    // a quickpoll. will cause quickpoll in quickpoll.
    g_inPageParser = true;

    // password, too
    long pwdLen = 0;
    char *pwd = r->getString ( "pwd" , &pwdLen );
    if ( pwdLen > 31 ) pwdLen = 31;
    if ( pwdLen > 0 ) strncpy ( st->m_pwd , pwd , pwdLen );
    st->m_pwd[pwdLen]='\0';

    // save socket ptr
    st->m_s = s;
    st->m_r.copy ( r );

    // get the collection
    char *coll = r->getString ( "c" , &st->m_collLen ,NULL /*default*/);
    if ( st->m_collLen > MAX_COLL_LEN )
        return sendErrorReply ( st , ENOBUFS );
    if ( ! coll )
        return sendErrorReply ( st , ENOCOLLREC );
    strcpy ( st->m_coll , coll );

    // version to use, if -1 use latest
    st->m_titleRecVersion = r->getLong("version",-1);
    if ( st->m_titleRecVersion == -1 )
        st->m_titleRecVersion = TITLEREC_CURRENT_VERSION;

    // default to 0 if not provided
    st->m_hopCount = r->getLong("hc",0);

    //long ulen = 0;
    //char *u = r->getString ( "u" , &ulen , NULL /*default*/);
    long old = r->getLong ( "old", 0 );

    // set query
    long qlen;
    char *qs = r->getString("q",&qlen,NULL);
    if ( qs ) st->m_tq.set2 ( qs , langUnknown , true );

    // url will override docid if given
    if ( ! st->m_u || ! st->m_u[0] )
        st->m_docId = r->getLongLong ("docid",-1);
    else
        st->m_docId = -1;

    // set url in state class (may have length 0)
    //if ( u ) st->m_url.set ( u , ulen );
    //st->m_urlLen = ulen;
    st->m_u = st->m_r.getString("u",&st->m_ulen,NULL);

    // should we recycle link info?
    st->m_recycle  = r->getLong("recycle",0);
    st->m_recycle2 = r->getLong("recycleimp",0);
    st->m_render   = r->getLong("render" ,0);
    // for quality computation... takes way longer cuz we have to
    // lookup the IP address of every outlink, so we can get its root
    // quality using Msg25 which needs to filter out voters from that IP
    // range.
    st->m_oips     = r->getLong("oips" ,0);

    long  linkInfoLen  = 0;
    // default is NULL
    char *linkInfoColl = r->getString ( "oli" , &linkInfoLen, NULL );
    if ( linkInfoColl ) strcpy ( st->m_linkInfoColl , linkInfoColl );
    else st->m_linkInfoColl[0] = '\0';

    // set the flag in our SafeBuf class so that Words.cpp knows to show
    // html or html source depending on this value
    st->m_xbuf.m_renderHtml = st->m_render;

    // should we use the old title rec?
    st->m_old = old;

    // are we coming from a local machine?
    st->m_isLocal = r->isLocal();

    //no more setting the default root quality to 30, instead if we do not
    // know it setting it to -1
    st->m_rootQuality=-1;

    // header
    SafeBuf *xbuf = &st->m_xbuf;
    xbuf->safePrintf("<meta http-equiv=\"Content-Type\" "
                     "content=\"text/html; charset=utf-8\">\n");

    // print standard header
    g_pages.printAdminTop ( xbuf , st->m_s , &st->m_r );

    // print the standard header for admin pages
    char *dd     = "";
    char *rr     = "";
    char *rr2    = "";
    char *render = "";
    char *oips   = "";
    char *us     = "";
    if ( st->m_u && st->m_u[0] ) us = st->m_u;
    //if ( st->m_sfn != -1 ) sprintf ( rtu , "%li",st->m_sfn );
    if ( st->m_old      ) dd     = " checked";
    if ( st->m_recycle  ) rr     = " checked";
    if ( st->m_recycle2 ) rr2    = " checked";
    if ( st->m_render   ) render = " checked";
    if ( st->m_oips     ) oips   = " checked";

    xbuf->safePrintf(
        "<style>"
        ".poo { background-color:#%s;}\n"
        "</style>\n" ,
        LIGHT_BLUE );

    long clen;
    char *contentParm = r->getString("content",&clen,"");

    // print the input form
    xbuf->safePrintf (
        "<style>\n"
        "h2{font-size: 12px; color: #666666;}\n"
        ".gbtag { border: 1px solid gray;"
        "background: #ffffef;display:inline;}\n"
        ".gbcomment { border: 1px solid gray;"
        "color: #888888; font-style:italic; "
        "background: #ffffef;display:inline;}\n"
        ".token { border: 1px solid gray;"
        "background: #f0ffff;display:inline;}\n"
        ".spam { border: 1px solid gray;"
        "background: #af0000;"
        "color: #ffffa0;}"
        ".hs {color: #009900;}"
        "</style>\n"

        "<center>"
        "<table %s>"
        "<tr><td colspan=5><center><b>"
        "Parser"
        "</b></center></td></tr>\n"

        "<tr class=poo>"
        "<td>"
        "<b>url</b>"
        "<br><font size=-2>"
        "Type in <b>FULL</b> url to parse."
        "</font>"
        "</td>"
        "<td>"
        "<input type=text name=u value=\"%s\" size=\"40\">\n"
        "</td>"
        "</tr>"

        /*
        "<tr class=poo>"
        "<td>"
        "Parser version to use: "
        "</td>"
        "<td>"
        "<input type=text name=\"version\" size=\"4\" value=\"-1\"> "
        "</td>"
        "<td>"
        "(-1 means to use latest title rec version)<br>"
        "</td>"
        "</tr>"
        */

        /*
        "<tr class=poo>"
        "<td>"
        "Hop count to use: "
        "</td>"
        "<td>"
        "<input type=text name=\"hc\" size=\"4\" value=\"%li\"> "
        "</td>"
        "<td>"
        "(-1 is unknown. For root urls hopcount is always 0)<br>"
        "</td>"
        "</tr>"
        */

        "<tr class=poo>"
        "<td>"
        "<b>use cached</b>"
        "<br><font size=-2>"
        "Load page from cache (titledb)?"
        "</font>"
        "</td>"
        "<td>"
        "<input type=checkbox name=old value=1%s> "
        "</td>"
        "</tr>"

        /*
        "<tr class=poo>"
        "<td>"
        "Reparse root:"
        "</td>"
        "<td>"
        "<input type=checkbox name=artr value=1%s> "
        "</td>"
        "<td>"
        "Apply selected ruleset to root to update quality"
        "</td>"
        "</tr>"
        */

        "<tr class=poo>"
        "<td>"
        "<b>recycle link info</b>"
        "<br><font size=-2>"
        "Recycle the link info from the title rec"
        "</font>"
        "</td>"
        "<td>"
        "<input type=checkbox name=recycle value=1%s> "
        "</td>"
        "</tr>"

        /*
        "<tr class=poo>"
        "<td>"
        "Recycle Link Info Imported:"
        "</td>"
        "<td>"
        "<input type=checkbox name=recycleimp value=1%s> "
        "</td>"
        "<td>"
        "Recycle the link info imported from other coll"
        "</td>"
        "</tr>"
        */

        "<tr class=poo>"
        "<td>"
        "<b>render html</b>"
        "<br><font size=-2>"
        "Render document content as HTML"
        "</font>"
        "</td>"
        "<td>"
        "<input type=checkbox name=render value=1%s> "
        "</td>"
        "</tr>"

        /*
        "<tr class=poo>"
        "<td>"
        "Lookup outlinks' ruleset, ips, quality:"
        "</td>"
        "<td>"
        "<input type=checkbox name=oips value=1%s> "
        "</td>"
        "<td>"
        "To compute quality lookup IP addresses of roots "
        "of outlinks."
        "</td>"
        "</tr>"

        "<tr class=poo>"
        "<td>"
        "LinkInfo Coll:"
        "</td>"
        "<td>"
        "<input type=text name=\"oli\" size=\"10\" value=\"\"> "
        "</td>"
        "<td>"
        "Leave empty usually. Uses this coll to lookup link info."
        "</td>"
        "</tr>"
        */

        "<tr class=poo>"
        "<td>"
        "<b>optional query</b>"
        "<br><font size=-2>"
        "Leave empty usually. For title generation only."
        "</font>"
        "</td>"
        "<td>"
        "<input type=text name=\"q\" size=\"20\" value=\"\"> "
        "</td>"
        "</tr>",

        TABLE_STYLE,
        us ,
        dd,
        rr,
        render );

    xbuf->safePrintf(
        "<tr class=poo>"
        "<td>"
        "<b>content type below is</b>"
        "<br><font size=-2>"
        "Is the content below HTML? XML? JSON?"
        "</font>"
        "</td>"
        "<td>"
        //"<input type=checkbox name=xml value=1> "
        "<select name=ctype>\n"
        "<option value=%li selected>HTML</option>\n"
        "<option value=%li>XML</option>\n"
        "<option value=%li>JSON</option>\n"
        "</select>\n"
        "</td>"
        "</tr>",
        (long)CT_HTML,
        (long)CT_XML,
        (long)CT_JSON );

    xbuf->safePrintf(
        "<tr class=poo>"
        "<td><b>content</b>"
        "<br><font size=-2>"
        "Use this content for the provided <i>url</i> "
        "rather than downloading it from the web."
        "</font>"
        "</td>"
        "<td>"
        "<textarea rows=10 cols=80 name=content>"
        "%s"
        "</textarea>"
        "</td>"
        "</tr>"

        "</table>"
        "</center>"
        "</form>"
        "<br>",
        //oips ,
        contentParm );

    xbuf->safePrintf(
        "<center>"
        "<input type=submit value=Submit>"
        "</center>" );

    // just print the page if no url given
    if ( ! st->m_u || ! st->m_u[0] ) return processLoop ( st );

    XmlDoc *xd = &st->m_xd;

    // set this up
    SpiderRequest sreq;
    sreq.reset();
    strcpy(sreq.m_url,st->m_u);
    long firstIp = hash32n(st->m_u);
    if ( firstIp == -1 || firstIp == 0 ) firstIp = 1;
    // parentdocid of 0
    sreq.setKey( firstIp, 0LL, false );
    sreq.m_isPageParser  = 1;
    sreq.m_hopCount      = st->m_hopCount;
    sreq.m_hopCountValid = 1;
    sreq.m_fakeFirstIp   = 1;
    sreq.m_firstIp       = firstIp;
    Url nu;
    nu.set(sreq.m_url);
    sreq.m_domHash32  = nu.getDomainHash32();
    sreq.m_siteHash32 = nu.getHostHash32();

    // . get provided content if any
    // . will be NULL if none provided
    // . "content" may contain a MIME
    long  contentLen = 0;
    char *content = r->getString ( "content" , &contentLen , NULL );
    // is the "content" url-encoded? default is true.
    bool contentIsEncoded = true;
    // mark doesn't like to url-encode his content
    if ( ! content ) {
        content    = r->getUnencodedContent    ();
        contentLen = r->getUnencodedContentLen ();
        contentIsEncoded = false;
    }
    // ensure null
    if ( contentLen == 0 ) content = NULL;

    uint8_t contentType = CT_HTML;
    if ( r->getBool("xml",0) ) contentType = CT_XML;
    contentType = r->getLong("ctype",contentType);//CT_HTML);

    // if facebook, load xml content from title rec...
    bool isFacebook = (bool)strstr(st->m_u,"http://www.facebook.com/");
    if ( isFacebook && ! content ) {
        long long docId = g_titledb.getProbableDocId(st->m_u);
        sprintf(sreq.m_url ,"%llu", docId );
        sreq.m_isPageReindex = true;
    }

    // hack
    if ( content ) {
        st->m_dbuf.purge();
        st->m_dbuf.safeStrcpy(content);
        //char *data = strstr(content,"\r\n\r\n");
        //long dataPos = 0;
        //if ( data ) dataPos = (data + 4) - content;
        //st->m_dbuf.convertJSONtoXML(0,dataPos);
        //st->m_dbuf.decodeJSON(0);
        content = st->m_dbuf.getBufStart();
    }

    // . use the enormous power of our new XmlDoc class
    // . this returns false if blocked
    if ( ! xd->set4 ( &sreq       ,
                      NULL        ,
                      st->m_coll  ,
                      &st->m_wbuf ,
                      0 ,//PP_NICENESS ))
                      content ,
                      false, // deletefromindex
                      0, // forced ip
                      contentType ))
        // return error reply if g_errno is set
        return sendErrorReply ( st , g_errno );

    // make this our callback in case something blocks
    xd->setCallback ( st , processLoop );

    // . set xd from the old title rec if recycle is true
    // . can also use XmlDoc::m_loadFromOldTitleRec flag
    if ( st->m_recycle ) xd->m_recycleContent = true;

    return processLoop ( st );
}
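// Minimal sketch (the helper name is made up and nothing calls it) of the
// content-source precedence used by the parser pages above: an explicit
// "content" cgi parm wins, otherwise the raw un-decoded POST body is used,
// and an empty buffer is normalized to NULL so XmlDoc::set4() downloads the
// url instead of treating it as provided content.
static char *pickProvidedContentSketch ( HttpRequest *r , long *contentLen ) {
    *contentLen = 0;
    char *content = r->getString ( "content" , contentLen , NULL );
    if ( ! content ) {
        content     = r->getUnencodedContent    ();
        *contentLen = r->getUnencodedContentLen ();
    }
    if ( *contentLen == 0 ) content = NULL;
    return content;
}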
// . this returns false if it blocks, true otherwise
// . sets g_errno on failure
bool Msg1c::gotList ( ) {

    if ( g_errno ) return true;

    int64_t *tmpDocIds = m_msg3a.getDocIds();
    int32_t  numDocIds = m_msg3a.getNumDocIds();

    if ( m_startNum > 0) {
        numDocIds -= m_startNum;
        tmpDocIds = &tmpDocIds[m_startNum];
    }

    m_numDocIds = numDocIds; // save for reporting

    // log it
    log(LOG_INFO,"admin: Got %" PRId32" docIds for query reindex.",
        numDocIds);

    // bail if no need
    if ( numDocIds <= 0 ) return true;

    // force spiders on on entire network. they will propagate from
    // host #0...
    g_conf.m_spideringEnabled = true;

    int32_t nowGlobal = getTimeGlobal();

    HashTableX dt;
    char dbuf[1024];
    dt.set(8,0,64,dbuf,1024,false,0,"ddocids");

    m_sb.setLabel("reiadd");

    State13 *st = (State13 *)m_state;
    GigablastRequest *gr = &st->m_gr;

    m_numDocIdsAdded = 0;

    // list consists of docIds, loop through each one
    for(int32_t i = 0; i < numDocIds; i++) {
        int64_t docId = tmpDocIds[i];

        // when searching events we get multiple docids that are the same
        if ( dt.isInTable ( &docId ) ) continue;

        // add it
        if ( ! dt.addKey ( &docId ) ) return true;

        SpiderRequest sr;
        sr.reset();

        // url is a docid!
        sprintf ( sr.m_url , "%" PRIu64 , docId );

        // make a fake first ip
        // use only 64k values so we don't stress doledb/waittrees/etc.
        // for large #'s of docids
        int32_t firstIp = (docId & 0x0000ffff);

        // bits 6-13 of the docid are the domain hash so use those
        // when doing a REINDEX (not delete!) to ensure that requests
        // on the same domain go to the same shard, at least when
        // we have up to 256 shards. if we have more than 256 shards
        // at this point some shards will not participate in the
        // query reindex/delete process because of this, so
        // we'll want to allow more bits in that case perhaps.
        // check out Hostdb::getShardNum(RDB_SPIDERDB) in Hostdb.cpp
        // to see what shard is responsible for storing and indexing
        // this SpiderRequest based on the firstIp.
        if ( ! m_forceDel ) {
            // if we are a REINDEX not a delete because
            // deletes don't need to spider/redownload the doc
            // so the distribution can be more random
            firstIp >>= 6;
            firstIp &= 0xff;
        }

        // 0 is not a legit val. it'll core below.
        if ( firstIp == 0 ) {
            firstIp = 1;
        }

        // use a fake ip
        sr.m_firstIp       = firstIp;
        // we are not really injecting...
        sr.m_isInjecting   = false;//true;
        sr.m_hopCount      = -1;
        sr.m_isPageReindex = 1;
        sr.m_urlIsDocId    = 1;
        sr.m_fakeFirstIp   = 1;

        // now you can recycle content instead of re-downloading it
        // for every docid
        sr.m_recycleContent = gr->m_recycleContent;

        // if this is zero we end up getting deduped in
        // dedupSpiderList() if there was a SpiderReply whose
        // spider time was > 0
        sr.m_addedTime = nowGlobal;

        sr.m_forceDelete = m_forceDel ? 1 : 0;

        // . complete its m_key member
        // . parentDocId is used to make the key, but only allow one
        //   page reindex spider request per url... so use "0"
        // . this will set "uh48" to hash64b(m_url) which is the docid
        sr.setKey( firstIp, 0LL , false );

        // how big to serialize
        int32_t recSize = sr.getRecSize();

        m_numDocIdsAdded++;

        // store it
        if ( ! m_sb.safeMemcpy ( (char *)&sr , recSize ) ) {
            // g_errno must be set
            if ( ! g_errno ) { g_process.shutdownAbort(true); }

            log(LOG_LOGIC,
                "admin: Query reindex size of %" PRId32" "
                "too big. Aborting. Bad engineer." ,
                (int32_t)0);//m_list.getListSize() );
            return true;
        }
    }
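    // Worked example of the fake-firstIp mapping above (the docid is made
    // up): for docId 0x1234567 the low 16 bits are 0x4567. A reindex then
    // keeps bits 6-13 of the docid: 0x4567 >> 6 = 0x115, 0x115 & 0xff = 0x15,
    // so docids sharing those domain-hash bits dole out to the same shard.
    // A delete skips that step and uses 0x4567 directly for a more even
    // spread. In both cases a result of 0 is remapped to 1 before setKey().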
// . "uf" is printf url format to scrape with a %s for the query // . example: uf="http://www.google.com/search?num=50&q=%s&scoring=d&filter=0"; bool Msg7::scrapeQuery ( ) { // advance round now in case we return early m_round++; // error? if ( m_qbuf.length() > 500 ) { g_errno = EQUERYTOOBIG; return true; } // first encode the query SafeBuf ebuf; ebuf.urlEncode ( m_qbuf.getBufStart() ); // queryUNEncoded ); char *uf; if ( m_round == 1 ) // set to 1 for debugging uf="http://www.google.com/search?num=20&" "q=%s&scoring=d&filter=0"; //uf = "https://startpage.com/do/search?q=%s"; //uf = "http://www.google.com/" // "/cse?cx=013269018370076798483%3A8eec3papwpi&" // "ie=UTF-8&q=%s&" // "num=20"; else uf="http://www.bing.com/search?q=%s"; // skip bing for now //if ( m_round == 2 ) // return true; //if ( m_round == 1 ) // return true; // make the url we will download char ubuf[2048]; sprintf ( ubuf , uf , ebuf.getBufStart() ); // log it log("inject: SCRAPING %s",ubuf); SpiderRequest sreq; sreq.reset(); // set the SpiderRequest strcpy(sreq.m_url, ubuf); // . tell it to only add the hosts of each outlink for now! // . that will be passed on to when XmlDoc calls Links::set() i guess // . xd will not reschedule the scraped url into spiderdb either sreq.m_isScraping = 1; sreq.m_fakeFirstIp = 1; long firstIp = hash32n(ubuf); if ( firstIp == 0 || firstIp == -1 ) firstIp = 1; sreq.m_firstIp = firstIp; // parent docid is 0 sreq.setKey(firstIp,0LL,false); // forceDEl = false, niceness = 0 m_xd.set4 ( &sreq , NULL , m_coll , NULL , 0 ); //m_xd.m_isScraping = true; // download without throttling //m_xd.m_throttleDownload = false; // disregard this m_xd.m_useRobotsTxt = false; // this will tell it to index ahrefs first before indexing // the doc. but do NOT do this if we are from ahrefs.com // ourselves to avoid recursive explosion!! if ( m_useAhrefs ) m_xd.m_useAhrefs = true; m_xd.m_reallyInjectLinks = m_injectLinks; // // rather than just add the links of the page to spiderdb, // let's inject them! // m_xd.setCallback ( this , doneInjectingLinksWrapper ); // niceness is 0 m_linkDedupTable.set(4,0,512,NULL,0,false,0,"ldtab2"); // do we actually inject the links, or just scrape? if ( ! m_xd.injectLinks ( &m_linkDedupTable , NULL, this , doneInjectingLinksWrapper ) ) return false; // otherwise, just download the google/bing search results so we // can display them in xml //else if ( m_xd.getUtf8Content() == (char **)-1 ) // return false; // print reply.. //printReply(); return true; }
bool Msg7::inject ( char *url ,
                    long  forcedIp ,
                    char *content ,
                    long  contentLen ,
                    bool  recycleContent,
                    uint8_t contentType,
                    char *coll ,
                    bool  quickReply ,
                    char *username ,
                    char *pwd ,
                    long  niceness,
                    void *state ,
                    void (*callback)(void *state),
                    long  firstIndexed,
                    long  lastSpidered,
                    long  hopCount,
                    char  newOnly,
                    short charset,
                    char  spiderLinks,
                    char  deleteIt,
                    char  hasMime,
                    bool  doConsistencyTesting ) {

    m_quickReply = quickReply;

    // store coll
    if ( ! coll ) { g_errno = ENOCOLLREC; return true; }
    long collLen = gbstrlen ( coll );
    if ( collLen > MAX_COLL_LEN ) collLen = MAX_COLL_LEN;
    strncpy ( m_coll , coll , collLen );
    m_coll [ collLen ] = '\0';

    // store user
    //long ulen = 0;
    //if ( username ) ulen = gbstrlen(username);
    //if ( ulen >= MAX_USER_SIZE-1 ) {g_errno = EBUFOVERFLOW; return true;}
    //if ( username ) strcpy( m_username, username );

    // store password
    //long pwdLen = 0;
    //if ( pwd ) pwdLen = gbstrlen(pwd);
    //m_pwd [ 0 ] ='\0';
    //if ( pwdLen > 31 ) pwdLen = 31;
    //if ( pwdLen > 0 ) strncpy ( m_pwd , pwd , pwdLen );
    //m_pwd [ pwdLen ] = '\0';

    // store url
    if ( ! url ) { g_errno = 0; return true; }
    long urlLen = gbstrlen(url);
    if ( urlLen > MAX_URL_LEN ) { g_errno = EBADENGINEER; return true; }
    // skip injecting if no url given! just print the admin page.
    if ( urlLen <= 0 ) return true;
    //strcpy ( m_url , url );

    if ( g_repairMode ) { g_errno = EREPAIRING; return true; }

    // send template reply if no content supplied
    if ( ! content && ! recycleContent ) {
        log("inject: no content supplied to inject command and "
            "recycleContent is false.");
        //return true;
    }

    // clean url?
    // normalize and add www. if it needs it
    Url uu;
    uu.set ( url , gbstrlen(url) , true );
    // remove >'s i guess and store in st1->m_url[] buffer
    char cleanUrl[MAX_URL_LEN+1];
    urlLen = cleanInput ( cleanUrl,
                          MAX_URL_LEN,
                          uu.getUrl(),
                          uu.getUrlLen() );

    // this can go on the stack since set4() copies it
    SpiderRequest sreq;
    sreq.reset();
    strcpy(sreq.m_url, cleanUrl );
    // parentdocid of 0
    long firstIp = hash32n(cleanUrl);
    if ( firstIp == -1 || firstIp == 0 ) firstIp = 1;
    sreq.setKey( firstIp,0LL, false );
    sreq.m_isInjecting   = 1;
    sreq.m_isPageInject  = 1;
    sreq.m_hopCount      = hopCount;
    sreq.m_hopCountValid = 1;
    sreq.m_fakeFirstIp   = 1;
    sreq.m_firstIp       = firstIp;

    // shortcut
    XmlDoc *xd = &m_xd;

    // log it now
    //log("inject: injecting doc %s",cleanUrl);

    static char s_dummy[3];
    // sometimes the content is indeed NULL...
    if ( newOnly && ! content ) {
        // don't let it be NULL because then xmldoc will
        // try to download the page!
        s_dummy[0] = '\0';
        content = s_dummy;
        //char *xx=NULL;*xx=0;
    }

    // . use the enormous power of our new XmlDoc class
    // . this returns false with g_errno set on error
    if ( //m_needsSet &&
         ! xd->set4 ( &sreq ,
                      NULL  ,
                      m_coll ,
                      NULL  , // pbuf
                      // give it a niceness of 1, we have to be
                      // careful since we are a niceness of 0!!!!
                      niceness, // 1 ,
                      // inject this content
                      content ,
                      deleteIt, // false, // deleteFromIndex ,
                      forcedIp ,
                      contentType ,
                      lastSpidered ,
                      hasMime )) {
        // g_errno should be set if that returned false
        if ( ! g_errno ) { char *xx=NULL;*xx=0; }
        return true;
    }

    // do not re-call the set
    //m_needsSet = false;

    // make this our callback in case something blocks
    xd->setCallback ( state , callback );

    xd->m_doConsistencyTesting = doConsistencyTesting;

    // . set xd from the old title rec if recycle is true
    // . can also use XmlDoc::m_loadFromOldTitleRec flag
    if ( recycleContent ) xd->m_recycleContent = true;

    // othercrap
    if ( firstIndexed ) {
        xd->m_firstIndexedDate = firstIndexed;
        xd->m_firstIndexedDateValid = true;
    }

    if ( lastSpidered ) {
        xd->m_spideredTime = lastSpidered;
        xd->m_spideredTimeValid = true;
    }

    if ( hopCount != -1 ) {
        xd->m_hopCount = hopCount;
        xd->m_hopCountValid = true;
    }

    if ( charset != -1 && charset != csUnknown ) {
        xd->m_charset = charset;
        xd->m_charsetValid = true;
    }

    // avoid looking up ip of each outlink to add "firstip" tag to tagdb
    // because that can be slow!!!!!!!
    xd->m_spiderLinks      = spiderLinks;
    xd->m_spiderLinks2     = spiderLinks;
    xd->m_spiderLinksValid = true;

    // . newOnly is true --> do not inject if document is already indexed!
    // . maybe just set indexCode
    xd->m_newOnly = newOnly;

    // do not re-lookup the robots.txt
    xd->m_isAllowed       = true;
    xd->m_isAllowedValid  = true;
    xd->m_crawlDelay      = -1; // unknown
    xd->m_crawlDelayValid = true;

    // set this now
    g_inPageInject = true;

    // log it now
    //log("inject: indexing injected doc %s",cleanUrl);

    // . now tell it to index
    // . this returns false if blocked
    bool status = xd->indexDoc ( );

    // log it. i guess only for errors when it does not block?
    // because xmldoc.cpp::indexDoc calls logIt()
    if ( status ) xd->logIt();

    // undo it
    g_inPageInject = false;

    // note that it blocked
    //if ( ! status ) log("inject: blocked for %s",cleanUrl);

    // return false if it blocked
    return status;
}
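// Hypothetical call illustrating Msg7::inject()'s long argument list (the
// url, collection, content, state and callback below are all made up; the
// argument order mirrors the declaration above):
//
//   Msg7 *m7 = ...;
//   bool done = m7->inject ( "http://example.com/" , // url
//                            0 ,                     // forcedIp (none)
//                            "<html>hi</html>" ,     // content
//                            15 ,                    // contentLen
//                            false ,                 // recycleContent
//                            CT_HTML ,               // contentType
//                            "main" ,                // coll
//                            false ,                 // quickReply
//                            NULL , NULL ,           // username , pwd
//                            0 ,                     // niceness
//                            myState , myCallback ,
//                            0 , 0 ,            // firstIndexed, lastSpidered
//                            -1 ,               // hopCount (unknown)
//                            0 ,                // newOnly
//                            csUnknown ,        // charset
//                            1 ,                // spiderLinks
//                            0 ,                // deleteIt
//                            0 ,                // hasMime
//                            false );           // doConsistencyTesting
//
// A return value of true means it finished (or failed with g_errno set)
// without blocking; false means the callback will be invoked later.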
void Scraper::gotPhrase ( ) {

    // error getting random phrase? bail!
    if ( g_errno )
        log("scraper: got error getting random phrase: %s",
            mstrerror(g_errno));

    CollectionRec *cr = g_collectiondb.getRec ( m_coll );

 loop:
    // what type of query should we do?
    m_qtype = rand() % 3;

    // make sure web, news, blog is enabled
    if ( m_qtype == 0 && ! cr->m_scrapingEnabledWeb   ) goto loop;
    if ( m_qtype == 1 && ! cr->m_scrapingEnabledNews  ) goto loop;
    if ( m_qtype == 2 && ! cr->m_scrapingEnabledBlogs ) goto loop;

    // scraping is off when repairing obviously
    if ( g_repairMode ) return;

    // get it
    char *s = g_wiki.m_randPhrase;

    // convert _'s to spaces
    for ( char *p = s ; *p ; p++ )
        if ( *p == '_' ) *p = ' ';

    // . url encode the random phrase
    // . truncate it to 200 bytes to keep things sane
    // . Wiki::doneReadingWiki() keeps it below 128 i think anyway
    char qe[400];
    urlEncode(qe, 200, s , gbstrlen(s) );
    char *end = qe + 390;

    // half the time append a random word from the dictionary so that we
    // discover those tail-end sites better
    if ( m_qtype == 0 && (rand() % 2) ) {
        // point into it for appending
        char *p = qe + gbstrlen(qe);
        // add a space, url encoded
        *p++ = '+';
        // append a random word to it from dictionary
        char *rw = g_speller.getRandomWord();
        // append that in
        urlEncode( p , end - p - 1 , rw , gbstrlen(rw) );
    }

    // make a query to scrape
    char buf[2048];

    char *uf ;
    if      ( m_qtype == 0 )
        uf="http://www.google.com/search?num=50&q=%s&scoring=d"
           "&filter=0";
    // google news query? sort by date.
    else if ( m_qtype == 1 )
        uf="http://news.google.com/news?num=50&q=%s&sort=n"
           "&filter=0";
    // google blog query?
    else if ( m_qtype == 2 )
        uf="http://www.google.com/blogsearch?num=50&q=%s&scoring=d"
           "&filter=0";
    // sanity check
    else { char *xx=NULL;*xx=0; }

    // make the url we will download
    sprintf ( buf , uf , qe );

    SpiderRequest sreq;
    sreq.reset();
    // set the SpiderRequest to the url we built, not the format string
    strcpy(sreq.m_url, buf);

    // . tell it to only add the hosts of each outlink for now!
    // . that will be passed on to when XmlDoc calls Links::set() i guess
    // . xd will not reschedule the scraped url into spiderdb either
    sreq.m_isScraping = 1;
    sreq.m_fakeFirstIp = 1;
    long firstIp = hash32n(buf);
    if ( firstIp == 0 || firstIp == -1 ) firstIp = 1;
    sreq.m_firstIp = firstIp;
    // parent docid is 0
    sreq.setKey(firstIp,0LL,false);

    // forceDel = false, niceness = 0
    m_xd.set4 ( &sreq , NULL , m_coll , NULL , 0 );

    //m_xd.m_isScraping = true;

    // download without throttling
    //m_xd.m_throttleDownload = false;

    // disregard this
    m_xd.m_useRobotsTxt = false;

    // call this when index completes
    m_xd.setCallback ( NULL , indexedDocWrapper );

    // assume it blocked
    m_numSent++;

    // scraper is special
    m_xd.m_usePosdb     = false;
    m_xd.m_useDatedb    = false;
    m_xd.m_useClusterdb = false;
    m_xd.m_useLinkdb    = false;
    m_xd.m_useSpiderdb  = true; // only this one i guess
    m_xd.m_useTitledb   = false;
    m_xd.m_useTagdb     = false;
    m_xd.m_usePlacedb   = false;
    //m_xd.m_useTimedb    = false;
    //m_xd.m_useSectiondb = false;
    //m_xd.m_useRevdb     = false;

    // . return false if this blocks
    // . will add the spider recs to spiderdb of the outlinks
    // . will add "ingoogle", etc. tags for each outlink
    if ( ! m_xd.indexDoc ( ) ) return ;

    // we didn't block
    indexedDoc ( );
}
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool sendPageAddUrl ( TcpSocket *s , HttpRequest *r ) {
    // . get fields from cgi field of the requested url
    // . get the search query
    long  urlLen = 0;
    char *url = r->getString ( "u" , &urlLen , NULL /*default*/);

    // see if they provided a url of a file of urls if they did not
    // provide a url to add directly
    //bool isAdmin = g_collectiondb.isAdmin ( r , s );
    bool isAdmin = r->getIsLocal();
    long  ufuLen = 0;
    char *ufu = NULL;
    if ( isAdmin )
        // get the url of a file of urls (ufu)
        ufu = r->getString ( "ufu" , &ufuLen , NULL );

    // can't be too long, that's obnoxious
    if ( urlLen > MAX_URL_LEN || ufuLen > MAX_URL_LEN ) {
        g_errno = EBUFTOOSMALL;
        g_msg = " (error: url too long)";
        return g_httpServer.sendErrorReply(s,500,"url too long");
    }

    // get the collection
    long  collLen = 0;
    char *coll = r->getString("c",&collLen);
    if ( ! coll || ! coll[0] ) {
        //coll = g_conf.m_defaultColl;
        coll = g_conf.getDefaultColl( r->getHost(), r->getHostLen() );
        collLen = gbstrlen(coll);
    }

    // get collection rec
    CollectionRec *cr = g_collectiondb.getRec ( coll );
    // bitch if no collection rec found
    if ( ! cr ) {
        g_errno = ENOCOLLREC;
        g_msg = " (error: no collection)";
        return g_httpServer.sendErrorReply(s,500,"no coll rec");
    }

    // . make sure the ip is not banned
    // . we may also have an exclusive list of IPs for private collections
    if ( ! cr->hasSearchPermission ( s ) ) {
        g_errno = ENOPERM;
        g_msg = " (error: permission denied)";
        return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
    }

    // make a new state
    State1 *st1 ;
    try { st1 = new (State1); }
    catch ( ... ) {
        g_errno = ENOMEM;
        log("PageAddUrl: new(%i): %s",
            (int)sizeof(State1),mstrerror(g_errno));
        return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
    }
    mnew ( st1 , sizeof(State1) , "PageAddUrl" );

    // save socket and isAdmin
    st1->m_socket  = s;
    st1->m_isAdmin = isAdmin;

    // assume no url buf yet, set below
    //st1->m_ubuf      = NULL;
    //st1->m_ubufAlloc = NULL;
    //st1->m_metaList  = NULL;

    // save the url
    st1->m_url[0] = '\0';
    if ( url ) {
        // normalize and add www. if it needs it
        Url uu;
        uu.set ( url , gbstrlen(url) , true );
        // remove >'s i guess and store in st1->m_url[] buffer
        st1->m_urlLen = cleanInput ( st1->m_url,
                                     MAX_URL_LEN,
                                     uu.getUrl(),
                                     uu.getUrlLen() );
        // point to that as the url "buf" to add
        //st1->m_ubuf      = st1->m_url;
        //st1->m_ubufSize  = urlLen;
        //st1->m_ubufAlloc = NULL; // do not free it!
    }

    // save the "ufu" (url of file of urls)
    st1->m_ufu[0] = '\0';
    st1->m_ufuLen = ufuLen;
    memcpy ( st1->m_ufu , ufu , ufuLen );
    st1->m_ufu[ufuLen] = '\0';

    st1->m_doTuringTest = cr->m_doTuringTest;
    char *username = g_users.getUsername(r);
    if ( username ) strcpy(st1->m_username,username);
    //st1->m_user = g_pages.getUserType ( s , r );

    st1->m_spiderLinks = true;
    st1->m_strip = true;
    //st1->m_raw = r->getLong("raw",0);

    // init state2
    for ( long i = 0; i < 5; i++ ) {
        st1->m_state2[i].m_buf       = NULL;
        st1->m_state2[i].m_bufLen    = 0;
        st1->m_state2[i].m_bufMaxLen = 0;
    }

    // save the collection name in the State1 class
    if ( collLen > MAX_COLL_LEN ) collLen = MAX_COLL_LEN;
    strncpy ( st1->m_coll , coll , collLen );
    st1->m_coll [ collLen ] = '\0';

    // assume they answered turing test correctly
    st1->m_goodAnswer = true;

    // if addurl is turned off, just print "disabled" msg
    if ( ! g_conf.m_addUrlEnabled ) return sendReply ( st1 , false );
    // can also be turned off in the collection rec
    if ( ! cr->m_addUrlEnabled    ) return sendReply ( st1 , false );
    // or if in read-only mode
    if ( g_conf.m_readOnlyMode    ) return sendReply ( st1 , false );
    // cannot add if another Msg10 from here is still in progress
    if ( s_inprogress             ) return sendReply ( st1 , true );

    // use now as the spiderTime

    // get ip of submitter
    //unsigned long h = ipdom ( s->m_ip );
    // . use top 2 bytes now, some isps have large blocks
    // . if this causes problems, then they can do pay for inclusion
    unsigned long h = iptop ( s->m_ip );

    long  codeLen;
    char *code = r->getString("code", &codeLen);
    if ( g_autoBan.hasCode(code, codeLen, s->m_ip) ) {
        long  uipLen = 0;
        char *uip = r->getString("uip",&uipLen);
        long  hip = 0;
        // use the uip when we have a raw query to test if
        // we can submit
        if ( uip ) {
            hip = atoip(uip, uipLen);
            h = iptop( hip );
        }
    }

    st1->m_strip = r->getLong("strip",0);

    // Remember, for cgi, if the box is not checked, then it is not
    // reported in the request, so set default return value to 0
    long spiderLinks = r->getLong("spiderLinks",-1);
    // also support all lowercase like PageInject.cpp uses
    if ( spiderLinks == -1 )
        spiderLinks = r->getLong("spiderlinks",0);

    // . should we force it into spiderdb even if already in there
    // . use to manually update spider times for a url
    // . however, will not remove old scheduled spider times
    // . mdw: made force on the default
    st1->m_forceRespider = r->getLong("force",1); // 0);

    long now = getTimeGlobal();

    // . allow 1 submit every 1 hour
    // . restrict by submitter domain ip
    if ( ! st1->m_isAdmin &&
         ! canSubmit ( h , now , cr->m_maxAddUrlsPerIpDomPerDay ) ) {
        // return error page
        g_errno = ETOOEARLY;
        return sendReply ( st1 , true );
    }

    //st1->m_query = r->getString( "qts", &st1->m_queryLen );

    // check it, if turing test is enabled for this collection
    if ( ! st1->m_isAdmin && cr->m_doTuringTest &&
         ! g_turingTest.isHuman(r) ) {
        // log note so we know it didn't make it
        g_msg = " (error: bad answer)";
        //log("PageAddUrl:: addurl failed for %s : bad answer",
        //    iptoa(s->m_ip));
        st1->m_goodAnswer = false;
        return sendReply ( st1 , true /*addUrl enabled?*/ );
    }

    //if ( st1->m_queryLen > 0 )
    //    return getPages( st1 );

    // if no url given, just print a blank page
    if ( ! url ) return sendReply ( st1 , true );

    //
    // make a SpiderRequest
    //
    SpiderRequest *sreq = &st1->m_sreq;
    // reset it
    sreq->reset();

    // make the probable docid
    long long probDocId = g_titledb.getProbableDocId ( st1->m_url );

    // make one up, like we do in PageReindex.cpp
    long firstIp = (probDocId & 0xffffffff);

    // . now fill it up
    // . TODO: calculate the other values... lazy!!! (m_isRSSExt,
    //   m_siteNumInlinks,...)
    sreq->m_isNewOutlink = 1;
    sreq->m_isAddUrl     = 1;
    sreq->m_addedTime    = now;
    sreq->m_fakeFirstIp  = 1;
    sreq->m_probDocId    = probDocId;
    sreq->m_firstIp      = firstIp;
    sreq->m_hopCount     = 0;

    // its valid if root
    Url uu;
    uu.set ( st1->m_url );
    if ( uu.isRoot() ) sreq->m_hopCountValid = true;

    // too big?
    //long len = st1->m_urlLen;

    // the url! includes \0
    strcpy ( sreq->m_url , st1->m_url );
    // call this to set sreq->m_dataSize now
    sreq->setDataSize();
    // make the key dude -- after setting url
    sreq->setKey ( firstIp , 0LL, false );
    // need a fake first ip lest we core!
    //sreq->m_firstIp = (pdocId & 0xffffffff);

    // how to set m_firstIp? i guess addurl can be throttled independently
    // of the other urls??? use the hash of the domain for it!
    long  dlen;
    char *dom = getDomFast ( st1->m_url , &dlen );
    // fake it for this...
    //sreq->m_firstIp = hash32 ( dom , dlen );

    // sanity
    if ( ! dom ) {
        g_errno = EBADURL;
        return sendReply ( st1 , true );
    }

    // shortcut
    Msg4 *m = &st1->m_msg4;

    // now add that to spiderdb using msg4
    if ( ! m->addMetaList ( (char *)sreq ,
                            sreq->getRecSize() ,
                            coll ,
                            st1 , // state
                            addedStuff ,
                            MAX_NICENESS ,
                            RDB_SPIDERDB ) )
        // we blocked
        return false;

    // send back the reply
    return sendReply ( st1 , true );
}
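// Notes on the add-url bookkeeping above (the addresses are made up): the
// fake firstIp is just the low 32 bits of the url's probable docid, and the
// per-submitter throttle key "h" comes from iptop(), which keeps only the
// top two bytes of the client IP, so e.g. 1.2.3.4 and 1.2.77.88 share the
// same canSubmit() quota controlled by cr->m_maxAddUrlsPerIpDomPerDay.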