// . a new interface so Msg3b can call this with "s" set to NULL
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool sendPageParser2 ( TcpSocket   *s ,
		       HttpRequest *r ,
		       State8      *st ,
		       int64_t      docId ,
		       Query       *q , // in query term space, not imap space
		       int64_t     *termFreqs       , // in imap space
		       float       *termFreqWeights , // in imap space
		       float       *affWeights      ,
		       void        *state ,
		       void       (* callback)(void *state) ) {

	//log("parser: read sock=%"INT32"",s->m_sd);

	// might be a simple request to add something to the
	// validated.*.txt file from XmlDoc::print() or
	// XmlDoc::validateOutput()
	char *add = r->getString("add",NULL);
	//int64_t uh64 = r->getLongLong("uh64",0LL);
	char *uh64str = r->getString("uh64",NULL);
	//char *divTag = r->getString("div",NULL);
	if ( uh64str ) {
		// convert add to number. "add" is optional, so check for
		// NULL before dereferencing it.
		int32_t addNum = 0;
		if ( add && to_lower_a(add[0]) == 't' ) // "true"/"false"?
			addNum = 1;
		// convert it. skip beginning "str" inserted to prevent
		// javascript from messing with the int64_t since it
		// was rounding it!
		//int64_t uh64 = atoll(uh64str);//+3);
		// urldecode that
		//int32_t divTagLen = gbstrlen(divTag);
		//int32_t newLen = urlDecode ( divTag , divTag , divTagLen );
		// null term?
		//divTag[newLen] = '\0';
		// do it. this is defined in XmlDoc.cpp
		//addCheckboxSpan ( uh64 , divTag , addNum );
		// make basic reply
		char *reply;
		reply = "HTTP/1.0 200 OK\r\n"
			"Connection: Close\r\n";
		// that is it! send a basic reply ok
		bool status = g_httpServer.sendDynamicPage( s ,
							    reply ,
							    gbstrlen(reply),
							    -1 , //cachetime
							    false,//postreply?
							    NULL, //ctype
							    -1 , //httpstatus
							    NULL,//cookie
							    "utf-8");
		return status;
	}

	// make a state
	if (   st ) st->m_freeIt = false;
	if ( ! st ) {
		try { st = new (State8); }
		catch ( ... ) {
			g_errno = ENOMEM;
			log("PageParser: new(%i): %s",
			    (int)sizeof(State8),mstrerror(g_errno));
			return g_httpServer.sendErrorReply(s,500,
						mstrerror(g_errno));
		}
		mnew ( st , sizeof(State8) , "PageParser" );
		st->m_freeIt = true;
	}

	// msg3b uses this to get a score from the query
	st->m_state           = state;
	st->m_callback        = callback;
	st->m_q               = q;
	st->m_termFreqs       = termFreqs;
	st->m_termFreqWeights = termFreqWeights;
	st->m_affWeights      = affWeights;
	//st->m_total         = (score_t)-1;
	st->m_indexCode       = 0;
	st->m_blocked         = false;
	st->m_didRootDom      = false;
	st->m_didRootWWW      = false;
	st->m_wasRootDom      = false;
	st->m_u               = NULL;
	st->m_recompute       = false;
	//st->m_url.reset();

	// do not allow more than one to be launched at a time if in
	// a quickpoll. will cause quickpoll in quickpoll.
	g_inPageParser = true;

	// password, too
	int32_t pwdLen = 0;
	char *pwd = r->getString ( "pwd" , &pwdLen );
	if ( pwdLen > 31 ) pwdLen = 31;
	if ( pwdLen > 0 ) strncpy ( st->m_pwd , pwd , pwdLen );
	st->m_pwd[pwdLen] = '\0';

	// save socket ptr
	st->m_s = s;
	st->m_r.copy ( r );

	// get the collection
	char *coll = r->getString ( "c" , &st->m_collLen , NULL /*default*/);
	if ( st->m_collLen > MAX_COLL_LEN )
		return sendErrorReply ( st , ENOBUFS );
	if ( ! coll )
		return sendErrorReply ( st , ENOCOLLREC );
	strcpy ( st->m_coll , coll );

	// version to use, if -1 use latest
	st->m_titleRecVersion = r->getLong("version",-1);
	if ( st->m_titleRecVersion == -1 )
		st->m_titleRecVersion = TITLEREC_CURRENT_VERSION;

	// default to 0 if not provided
	st->m_hopCount = r->getLong("hc",0);

	//int32_t ulen = 0;
	//char *u = r->getString ( "u" , &ulen , NULL /*default*/);
	int32_t old = r->getLong ( "old", 0 );

	// set query
	int32_t qlen;
	char *qs = r->getString("q",&qlen,NULL);
	if ( qs ) st->m_tq.set2 ( qs , langUnknown , true );

	// . set url in state class (may have length 0)
	// . get it from the copied request so the pointer stays valid
	// . set it BEFORE the docid check below; otherwise m_u is still
	//   NULL at that point and a supplied url could never override
	//   the docid
	//if ( u ) st->m_url.set ( u , ulen );
	//st->m_urlLen = ulen;
	st->m_u = st->m_r.getString("u",&st->m_ulen,NULL);

	// url will override docid if given
	if ( ! st->m_u || ! st->m_u[0] )
		st->m_docId = r->getLongLong ("docid",-1);
	else
		st->m_docId = -1;

	// should we recycle link info?
	st->m_recycle  = r->getLong("recycle",0);
	st->m_recycle2 = r->getLong("recycleimp",0);
	st->m_render   = r->getLong("render" ,0);
	// for quality computation... takes way longer cuz we have to
	// lookup the IP address of every outlink, so we can get its root
	// quality using Msg25 which needs to filter out voters from that
	// IP range.
	st->m_oips     = r->getLong("oips" ,0);

	int32_t linkInfoLen = 0;
	// default is NULL
	char *linkInfoColl = r->getString ( "oli" , &linkInfoLen, NULL );
	if ( linkInfoColl ) strcpy ( st->m_linkInfoColl , linkInfoColl );
	else                st->m_linkInfoColl[0] = '\0';

	// set the flag in our SafeBuf class so that Words.cpp knows to
	// show html or html source depending on this value
	st->m_xbuf.m_renderHtml = st->m_render;

	// should we use the old title rec?
	st->m_old = old;

	// are we coming from a local machine?
	st->m_isLocal = r->isLocal();

	// no more setting the default root quality to 30; instead, if we
	// do not know it, set it to -1
	st->m_rootQuality = -1;

	// header
	SafeBuf *xbuf = &st->m_xbuf;
	xbuf->safePrintf("<meta http-equiv=\"Content-Type\" "
			 "content=\"text/html; charset=utf-8\">\n");

	// print standard header
	g_pages.printAdminTop ( xbuf , st->m_s , &st->m_r );

	// print the standard header for admin pages
	char *dd     = "";
	char *rr     = "";
	char *rr2    = "";
	char *render = "";
	char *oips   = "";
	char *us     = "";
	if ( st->m_u && st->m_u[0] ) us = st->m_u;
	//if ( st->m_sfn != -1 ) sprintf ( rtu , "%"INT32"",st->m_sfn );
	if ( st->m_old      ) dd     = " checked";
	if ( st->m_recycle  ) rr     = " checked";
	if ( st->m_recycle2 ) rr2    = " checked";
	if ( st->m_render   ) render = " checked";
	if ( st->m_oips     ) oips   = " checked";

	xbuf->safePrintf(
			 "<style>"
			 ".poo { background-color:#%s;}\n"
			 "</style>\n" ,
			 LIGHT_BLUE );

	int32_t clen;
	char *contentParm = r->getString("content",&clen,"");

	// print the input form
	xbuf->safePrintf (
			  "<style>\n"
			  "h2{font-size: 12px; color: #666666;}\n"
			  ".gbtag { border: 1px solid gray;"
			  "background: #ffffef;display:inline;}\n"
			  ".gbcomment { border: 1px solid gray;"
			  "color: #888888; font-style:italic; "
			  "background: #ffffef;display:inline;}\n"
			  ".token { border: 1px solid gray;"
			  "background: #f0ffff;display:inline;}\n"
			  ".spam { border: 1px solid gray;"
			  "background: #af0000;"
			  "color: #ffffa0;}"
			  ".hs {color: #009900;}"
			  "</style>\n"
			  "<center>"
			  "<table %s>"

			  "<tr><td colspan=5><center><b>"
			  "Parser"
			  "</b></center></td></tr>\n"

			  "<tr class=poo>"
			  "<td>"
			  "<b>url</b>"
			  "<br><font size=-2>"
			  "Type in <b>FULL</b> url to parse."
			  "</font>"
			  "</td>"
			  "<td>"
			  "<input type=text name=u value=\"%s\" size=\"40\">\n"
			  "</td>"
			  "</tr>"

			  /*
			  "<tr class=poo>"
			  "<td>"
			  "Parser version to use: "
			  "</td>"
			  "<td>"
			  "<input type=text name=\"version\" size=\"4\" value=\"-1\"> "
			  "</td>"
			  "<td>"
			  "(-1 means to use latest title rec version)<br>"
			  "</td>"
			  "</tr>"
			  */

			  /*
			  "<tr class=poo>"
			  "<td>"
			  "Hop count to use: "
			  "</td>"
			  "<td>"
			  "<input type=text name=\"hc\" size=\"4\" value=\"%"INT32"\"> "
			  "</td>"
			  "<td>"
			  "(-1 is unknown. For root urls hopcount is always 0)<br>"
			  "</td>"
			  "</tr>"
			  */

			  "<tr class=poo>"
			  "<td>"
			  "<b>use cached</b>"
			  "<br><font size=-2>"
			  "Load page from cache (titledb)?"
			  "</font>"
			  "</td>"
			  "<td>"
			  "<input type=checkbox name=old value=1%s> "
			  "</td>"
			  "</tr>"

			  /*
			  "<tr class=poo>"
			  "<td>"
			  "Reparse root:"
			  "</td>"
			  "<td>"
			  "<input type=checkbox name=artr value=1%s> "
			  "</td>"
			  "<td>"
			  "Apply selected ruleset to root to update quality"
			  "</td>"
			  "</tr>"
			  */

			  "<tr class=poo>"
			  "<td>"
			  "<b>recycle link info</b>"
			  "<br><font size=-2>"
			  "Recycle the link info from the title rec"
			  "</font>"
			  "</td>"
			  "<td>"
			  "<input type=checkbox name=recycle value=1%s> "
			  "</td>"
			  "</tr>"

			  /*
			  "<tr class=poo>"
			  "<td>"
			  "Recycle Link Info Imported:"
			  "</td>"
			  "<td>"
			  "<input type=checkbox name=recycleimp value=1%s> "
			  "</td>"
			  "<td>"
			  "Recycle the link info imported from other coll"
			  "</td>"
			  "</tr>"
			  */

			  "<tr class=poo>"
			  "<td>"
			  "<b>render html</b>"
			  "<br><font size=-2>"
			  "Render document content as HTML"
			  "</font>"
			  "</td>"
			  "<td>"
			  "<input type=checkbox name=render value=1%s> "
			  "</td>"
			  "</tr>"

			  /*
			  "<tr class=poo>"
			  "<td>"
			  "Lookup outlinks' ruleset, ips, quality:"
			  "</td>"
			  "<td>"
			  "<input type=checkbox name=oips value=1%s> "
			  "</td>"
			  "<td>"
			  "To compute quality lookup IP addresses of roots "
			  "of outlinks."
			  "</td>"
			  "</tr>"

			  "<tr class=poo>"
			  "<td>"
			  "LinkInfo Coll:"
			  "</td>"
			  "<td>"
			  "<input type=text name=\"oli\" size=\"10\" value=\"\"> "
			  "</td>"
			  "<td>"
			  "Leave empty usually. Uses this coll to lookup link info."
			  "</td>"
			  "</tr>"
			  */

			  "<tr class=poo>"
			  "<td>"
			  "<b>optional query</b>"
			  "<br><font size=-2>"
			  "Leave empty usually. For title generation only."
			  "</font>"
			  "</td>"
			  "<td>"
			  "<input type=text name=\"q\" size=\"20\" value=\"\"> "
			  "</td>"
			  "</tr>" ,

			  TABLE_STYLE ,
			  us ,
			  dd ,
			  rr ,
			  render );

	xbuf->safePrintf(
			 "<tr class=poo>"
			 "<td>"
			 "<b>content type below is</b>"
			 "<br><font size=-2>"
			 "Is the content below HTML? XML? JSON?"
			 "</font>"
			 "</td>"
			 "<td>"
			 //"<input type=checkbox name=xml value=1> "
			 "<select name=ctype>\n"
			 "<option value=%"INT32" selected>HTML</option>\n"
			 "<option value=%"INT32">XML</option>\n"
			 "<option value=%"INT32">JSON</option>\n"
			 "</select>\n"
			 "</td>"
			 "</tr>" ,
			 (int32_t)CT_HTML ,
			 (int32_t)CT_XML ,
			 (int32_t)CT_JSON );

	xbuf->safePrintf(
			 "<tr class=poo>"
			 "<td><b>content</b>"
			 "<br><font size=-2>"
			 "Use this content for the provided <i>url</i> "
			 "rather than downloading it from the web."
			 "</font>"
			 "</td>"
			 "<td>"
			 "<textarea rows=10 cols=80 name=content>"
			 "%s"
			 "</textarea>"
			 "</td>"
			 "</tr>"

			 "</table>"
			 "</center>"
			 "</form>"
			 "<br>" ,
			 //oips ,
			 contentParm );

	xbuf->safePrintf(
			 "<center>"
			 "<input type=submit value=Submit>"
			 "</center>" );

	// just print the page if no url given
	if ( ! st->m_u || ! st->m_u[0] ) return processLoop ( st );

	XmlDoc *xd = &st->m_xd;

	// set this up
	SpiderRequest sreq;
	sreq.reset();
	strcpy(sreq.m_url,st->m_u);
	int32_t firstIp = hash32n(st->m_u);
	if ( firstIp == -1 || firstIp == 0 ) firstIp = 1;
	// parentdocid of 0
	sreq.setKey( firstIp, 0LL, false );
	sreq.m_isPageParser  = 1;
	sreq.m_hopCount      = st->m_hopCount;
	sreq.m_hopCountValid = 1;
	sreq.m_fakeFirstIp   = 1;
	sreq.m_firstIp       = firstIp;
	Url nu;
	nu.set(sreq.m_url);
	sreq.m_domHash32  = nu.getDomainHash32();
	sreq.m_siteHash32 = nu.getHostHash32();

	// . get provided content if any
	// . will be NULL if none provided
	// . "content" may contain a MIME
	int32_t contentLen = 0;
	char *content = r->getString ( "content" , &contentLen , NULL );
	// is the "content" url-encoded? default is true.
	bool contentIsEncoded = true;
	// mark doesn't like to url-encode his content
	if ( ! content ) {
		content          = r->getUnencodedContent    ();
		contentLen       = r->getUnencodedContentLen ();
		contentIsEncoded = false;
	}
	// ensure null
	if ( contentLen == 0 ) content = NULL;

	uint8_t contentType = CT_HTML;
	if ( r->getBool("xml",0) ) contentType = CT_XML;
	contentType = r->getLong("ctype",contentType);

	// if facebook, load xml content from title rec...
	bool isFacebook = (bool)strstr(st->m_u,"http://www.facebook.com/");
	if ( isFacebook && ! content ) {
		int64_t docId = g_titledb.getProbableDocId(st->m_u);
		sprintf(sreq.m_url ,"%"UINT64"", docId );
		sreq.m_isPageReindex = true;
	}

	// hack
	if ( content ) {
		st->m_dbuf.purge();
		st->m_dbuf.safeStrcpy(content);
		//char *data = strstr(content,"\r\n\r\n");
		//int32_t dataPos = 0;
		//if ( data ) dataPos = (data + 4) - content;
		//st->m_dbuf.convertJSONtoXML(0,dataPos);
		//st->m_dbuf.decodeJSON(0);
		content = st->m_dbuf.getBufStart();
	}

	// . use the enormous power of our new XmlDoc class
	// . this returns false if blocked
	if ( ! xd->set4 ( &sreq ,
			  NULL ,
			  st->m_coll ,
			  &st->m_wbuf ,
			  0 , // niceness //PP_NICENESS
			  content ,
			  false , // deletefromindex
			  0 , // forced ip
			  contentType ))
		// return error reply if g_errno is set
		return sendErrorReply ( st , g_errno );

	// make this our callback in case something blocks
	xd->setCallback ( st , processLoop );

	// . set xd from the old title rec if recycle is true
	// . can also use XmlDoc::m_loadFromOldTitleRec flag
	if ( st->m_recycle ) xd->m_recycleContent = true;

	return processLoop ( st );
}
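
// Illustrative only: the handler above is driven entirely by request
// parms, so a request exercising it might look like the line below.
// The page path is an assumption; the parm names ("c", "u", "old",
// "recycle", "render", "q", "ctype", "content") are the ones parsed
// above.
//
//	GET /admin/parser?c=main&u=http%3A%2F%2Fexample.com%2F&old=1&render=1
//
// With no "u" and no "docid" the handler just prints the form via
// processLoop(st); with a url it builds a fake SpiderRequest (fake
// first ip = hash32n(url)) and hands everything to XmlDoc::set4().
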
// for procog
bool sendPageAnalyze ( TcpSocket *s , HttpRequest *r ) {

	// make a state
	State8 *st;
	try { st = new (State8); }
	catch ( ... ) {
		g_errno = ENOMEM;
		log("PageParser: new(%i): %s",
		    (int)sizeof(State8),mstrerror(g_errno));
		return g_httpServer.sendErrorReply(s,500,
						   mstrerror(g_errno));
	}
	mnew ( st , sizeof(State8) , "PageParser" );
	st->m_freeIt = true;
	st->m_state  = NULL;
	//st->m_callback = callback;
	//st->m_q = q;
	//st->m_termFreqs = termFreqs;
	//st->m_termFreqWeights = termFreqWeights;
	//st->m_affWeights = affWeights;
	//st->m_total = (score_t)-1;
	st->m_indexCode  = 0;
	st->m_blocked    = false;
	st->m_didRootDom = false;
	st->m_didRootWWW = false;
	st->m_wasRootDom = false;
	st->m_u          = NULL;

	// password, too
	int32_t pwdLen = 0;
	char *pwd = r->getString ( "pwd" , &pwdLen );
	if ( pwdLen > 31 ) pwdLen = 31;
	if ( pwdLen > 0 ) strncpy ( st->m_pwd , pwd , pwdLen );
	st->m_pwd[pwdLen] = '\0';

	// save socket ptr
	st->m_s = s;
	st->m_r.copy ( r );

	// get the collection
	char *coll = r->getString ( "c" , &st->m_collLen ,NULL /*default*/);
	if ( ! coll ) coll = g_conf.m_defaultColl;
	if ( ! coll ) coll = "main";
	int32_t collLen = gbstrlen(coll);
	if ( collLen > MAX_COLL_LEN ) return sendErrorReply ( st , ENOBUFS );
	strcpy ( st->m_coll , coll );

	// version to use, if -1 use latest
	st->m_titleRecVersion = r->getLong("version",-1);
	if ( st->m_titleRecVersion == -1 )
		st->m_titleRecVersion = TITLEREC_CURRENT_VERSION;

	// default to 0 if not provided
	st->m_hopCount = r->getLong("hc",0);

	int32_t old = r->getLong ( "old", 0 );

	// set query
	int32_t qlen;
	char *qs = r->getString("q",&qlen,NULL);
	if ( qs ) st->m_tq.set2 ( qs , langUnknown , true );

	// url will override docid if given
	st->m_docId = r->getLongLong ("d",-1);
	st->m_docId = r->getLongLong ("docid",st->m_docId);

	int32_t ulen;
	char *u = st->m_r.getString("u",&ulen,NULL);
	if ( ! u ) u = st->m_r.getString("url",&ulen,NULL);
	if ( ! u && st->m_docId == -1LL )
		return sendErrorReply ( st , EBADREQUEST );

	// set url in state class (may have length 0)
	//if ( u ) st->m_url.set ( u , ulen );
	//st->m_urlLen = ulen;
	st->m_u = u;
	st->m_ulen = 0;
	if ( u ) st->m_ulen = gbstrlen(u);

	// should we recycle link info?
	st->m_recycle   = r->getLong("recycle",1);
	st->m_recycle2  = r->getLong("recycleimp",0);
	st->m_render    = r->getLong("render" ,0);
	st->m_recompute = r->getLong("recompute" ,0);
	// for quality computation... takes way longer cuz we have to
	// lookup the IP address of every outlink, so we can get its root
	// quality using Msg25 which needs to filter out voters from that
	// IP range.
	st->m_oips      = r->getLong("oips" ,0);
	//st->m_page = r->getLong("page",1);

	int32_t linkInfoLen = 0;
	// default is NULL
	char *linkInfoColl = r->getString ( "oli" , &linkInfoLen, NULL );
	if ( linkInfoColl ) strcpy ( st->m_linkInfoColl , linkInfoColl );
	else                st->m_linkInfoColl[0] = '\0';

	// set the flag in our SafeBuf class so that Words.cpp knows to
	// show html or html source depending on this value
	//st->m_xbuf.m_renderHtml = st->m_render;

	// should we use the old title rec?
	st->m_old = old;

	// are we coming from a local machine?
	st->m_isLocal = r->isLocal();

	// no more setting the default root quality to 30; instead, if we
	// do not know it, set it to -1
	st->m_rootQuality = -1;

	// header
	//xbuf->safePrintf("<meta http-equiv=\"Content-Type\" "
	//		 "content=\"text/html; charset=utf-8\">\n");

	XmlDoc *xd = &st->m_xd;

	int32_t isXml = r->getLong("xml",0);

	// if got docid, use that
	if ( st->m_docId != -1 ) {
		if ( ! xd->set3 ( st->m_docId ,
				  st->m_coll ,
				  0 ) ) // niceness
			// return error reply if g_errno is set
			return sendErrorReply ( st , g_errno );
		// make this our callback in case something blocks
		xd->setCallback ( st , gotXmlDoc );
		xd->m_pbuf = &st->m_wbuf;
		// reset this flag
		st->m_donePrinting = false;
		// . set xd from the old title rec if recycle is true
		// . can also use XmlDoc::m_loadFromOldTitleRec flag
		//if ( st->m_recycle ) xd->m_recycleContent = true;
		xd->m_recycleContent = true;
		// force this on
		//xd->m_useSiteLinkBuf = true;
		//xd->m_usePageLinkBuf = true;
		if ( isXml ) xd->m_printInXml = true;
		// now tell it to fetch the old title rec
		if ( ! xd->loadFromOldTitleRec () )
			// return false if this blocks
			return false;
		return gotXmlDoc ( st );
	}

	// set this up
	SpiderRequest sreq;
	sreq.reset();
	if ( st->m_u ) strcpy(sreq.m_url,st->m_u);
	int32_t firstIp = hash32n(st->m_u);
	if ( firstIp == -1 || firstIp == 0 ) firstIp = 1;
	// parentdocid of 0
	sreq.setKey( firstIp, 0LL, false );
	sreq.m_isPageParser  = 1;
	sreq.m_hopCount      = st->m_hopCount;
	sreq.m_hopCountValid = 1;
	sreq.m_fakeFirstIp   = 1;
	sreq.m_firstIp       = firstIp;
	Url nu;
	nu.set(sreq.m_url);
	sreq.m_domHash32  = nu.getDomainHash32();
	sreq.m_siteHash32 = nu.getHostHash32();

	// . get provided content if any
	// . will be NULL if none provided
	// . "content" may contain a MIME
	int32_t contentLen = 0;
	char *content = r->getString ( "content" , &contentLen , NULL );
	// is the "content" url-encoded? default is true.
	bool contentIsEncoded = true;
	// mark doesn't like to url-encode his content
	if ( ! content ) {
		content          = r->getUnencodedContent    ();
		contentLen       = r->getUnencodedContentLen ();
		contentIsEncoded = false;
	}
	// ensure null
	if ( contentLen == 0 ) content = NULL;

	//uint8_t contentType = CT_HTML;
	//if ( isXml ) contentType = CT_XML;
	int32_t ctype = r->getLong("ctype",CT_HTML);

	// . use the enormous power of our new XmlDoc class
	// . this returns false if blocked
	if ( ! xd->set4 ( &sreq ,
			  NULL ,
			  st->m_coll ,
			  // we need this so the term table is set!
			  &st->m_wbuf , // XmlDoc::m_pbuf
			  0 , // niceness. try 0 now! was 1 //PP_NICENESS
			  content ,
			  false , // deletefromindex
			  0 , // forced ip
			  ctype ))
		// return error reply if g_errno is set
		return sendErrorReply ( st , g_errno );

	// make this our callback in case something blocks
	xd->setCallback ( st , gotXmlDoc );

	// reset this flag
	st->m_donePrinting = false;

	// prevent a core here in the event we download the page content
	xd->m_crawlDelayValid = true;
	xd->m_crawlDelay      = 0;

	// . set xd from the old title rec if recycle is true
	// . can also use XmlDoc::m_loadFromOldTitleRec flag
	//if ( st->m_recycle ) xd->m_recycleContent = true;
	// only recycle if docid is given!!
	if ( st->m_recycle ) xd->m_recycleContent = true;

	// force this on
	//xd->m_useSiteLinkBuf = true;
	//xd->m_usePageLinkBuf = true;
	if ( isXml ) xd->m_printInXml = true;

	return gotXmlDoc ( st );
}
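
// Illustrative only (page path is an assumption; parm names are the
// ones parsed above): sendPageAnalyze() takes either a docid or a url.
//
//	GET /admin/analyze?c=main&docid=123456&xml=1
//	GET /admin/analyze?c=main&url=http%3A%2F%2Fexample.com%2F&recycle=0
//
// The docid path goes through XmlDoc::set3() + loadFromOldTitleRec();
// the url path builds a SpiderRequest and goes through XmlDoc::set4().
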
void processReply ( char *reply , long replyLen ) {

	// store our current reply
	SafeBuf fb2;
	fb2.safeMemcpy(reply,replyLen);
	fb2.nullTerm();

	// log that we got the reply
	log("qa: got reply(len=%li)(errno=%s)=%s",
	    replyLen,mstrerror(g_errno),reply);

	char *content    = NULL;
	long  contentLen = 0;

	// get mime
	if ( reply ) {
		HttpMime mime;
		mime.set ( reply, replyLen , NULL );
		// only hash content since mime has a timestamp in it
		content    = mime.getContent();
		contentLen = mime.getContentLen();
		// sanity: content must be null terminated
		if ( content && contentLen > 0 && content[contentLen] ) {
			char *xx=NULL;*xx=0; }
	}

	if ( ! content ) {
		content    = "";
		contentLen = 0;
	}

	s_content = content;

	// take out the volatile fields so they do not affect the CRC
	markOut ( content , "<currentTimeUTC>");
	markOut ( content , "<responseTimeMS>");

	// until i figure this one out, take it out
	markOut ( content , "<docsInCollection>");

	// until i figure this one out, take it out
	markOut ( content , "<hits>");

	// for those links in the html pages
	markOut ( content , "rand64=");

	// for json
	markOut ( content , "\"currentTimeUTC\":" );
	markOut ( content , "\"responseTimeMS\":");
	markOut ( content , "\"docsInCollection\":");

	// for xml
	markOut ( content , "<currentTimeUTC>" );
	markOut ( content , "<responseTimeMS>");
	markOut ( content , "<docsInCollection>");

	// indexed 1 day ago
	markOut ( content , "indexed:");
	// modified 1 day ago
	markOut ( content , "modified:");

	// s_gigabitCount... it is a perpetually incrementing static
	// counter in PageResults.cpp
	markOut ( content , "ccc(");
	markOut ( content , "id=fd");
	markOut ( content , "id=sd");

	// for some reason the term freq seems to change a little in
	// the scoring table
	markOut ( content , "id=tf");

	// make checksum. we ignore back to back spaces so this
	// hash works for <docsInCollection>10 vs <docsInCollection>9
	long contentCRC = 0;
	if ( content ) contentCRC = qa_hash32 ( content );

	// note it
	log("qa: got contentCRC of %lu",contentCRC);

	// if what we expected, save to disk if not there yet, then
	// call s_callback() to resume the qa pipeline
	/*
	if ( contentCRC == s_expectedCRC ) {
		// save content if good
		char fn3[1024];
		sprintf(fn3,"%sqa/content.%lu",g_hostdb.m_dir,contentCRC);
		File ff;
		ff.set ( fn3 );
		if ( ! ff.doesExist() ) {
			// if not there yet then save it
			fb2.save(fn3);
		}
		// . continue on with the qa process
		// . which qa function that may be
		//s_callback();
		return;
	}
	*/

	//
	// if crc of content does not match what was expected then do a
	// diff so we can see why not
	//

	// this means caller does not care about the response
	if ( ! s_checkCRC ) {
		//s_callback();
		return;
	}

	//const char *emsg = "qa: bad contentCRC of %li should be %li "
	//	"\n";//"phase=%li\n";
	//fprintf(stderr,emsg,contentCRC,s_expectedCRC);//,s_phase-1);

	// hash url
	long urlHash32 = hash32n ( s_url.getUrl() );

	// combine test function too since two tests may use the same url
	long nameHash = hash32n ( s_qt->m_testName );

	// combine together
	urlHash32 = hash32h ( nameHash , urlHash32 );

	static bool s_init = false;
	if ( ! s_init ) {
		s_init = true;
		s_ht.set(4,4,1024,NULL,0,false,0,"qaht");
		// make symlink
		//char cmd[512];
		//snprintf(cmd,"cd %s/html ;ln -s ../qa ./qa",
		//	 g_hostdb.m_dir);
		//system(cmd);
		char dir[1024];
		snprintf(dir,1000,"%sqa",g_hostdb.m_dir);
		long status = ::mkdir ( dir ,
					S_IRUSR | S_IWUSR | S_IXUSR |
					S_IRGRP | S_IWGRP | S_IXGRP |
					S_IROTH | S_IXOTH );
		if ( status == -1 && errno != EEXIST && errno )
			log("qa: Failed to make directory %s: %s.",
			    dir,mstrerror(errno));
		// try to load from disk
		SafeBuf fn;
		fn.safePrintf("%s/qa/",g_hostdb.m_dir);
		log("qa: loading crctable.dat");
		s_ht.load ( fn.getBufStart() , "crctable.dat" );
	}

	// save this reply's content to disk so it can be inspected/diffed
	char fn2[1024];
	sprintf(fn2,"%sqa/content.%lu",g_hostdb.m_dir,contentCRC);
	fb2.save ( fn2 );

	// look up in hashtable to see what reply crc should be
	long *val = (long *)s_ht.getValue ( &urlHash32 );

	// just return if the same
	if ( val && contentCRC == *val ) {
		g_qaOutput.safePrintf("<b style=color:green;>"
				      "passed test</b><br>%s : "
				      "<a href=%s>%s</a> (urlhash=%lu "
				      "crc=<a href=/qa/content.%lu>"
				      "%lu</a>)<br>"
				      "<hr>",
				      s_qt->m_testName,
				      s_url.getUrl(),
				      s_url.getUrl(),
				      urlHash32,
				      contentCRC,
				      contentCRC);
		return;
	}

	if ( ! val ) {
		// add it so we know
		s_ht.addKey ( &urlHash32 , &contentCRC );
		g_qaOutput.safePrintf("<b style=color:blue;>"
				      "first time testing</b><br>%s : "
				      "<a href=%s>%s</a> "
				      "(urlhash=%lu "
				      "crc=<a href=/qa/content.%lu>%lu"
				      "</a>)<br>"
				      "<hr>",
				      s_qt->m_testName,
				      s_url.getUrl(),
				      s_url.getUrl(),
				      urlHash32,
				      contentCRC,
				      contentCRC);
		return;
	}

	log("qa: crc changed for url %s from %li to %li",
	    s_url.getUrl(),*val,contentCRC);

	// get response on file
	SafeBuf fb1;
	char fn1[1024];
	sprintf(fn1,"%sqa/content.%lu",g_hostdb.m_dir, *val);
	fb1.load(fn1);
	fb1.nullTerm();

	// do the diff between the two replies so we can see what changed
	char cmd[1024];
	sprintf(cmd,"diff %s %s > /tmp/diffout",fn1,fn2);
	log("qa: %s\n",cmd);
	system(cmd);

	g_numErrors++;

	g_qaOutput.safePrintf("<b style=color:red;>FAILED TEST</b><br>%s : "
			      "<a href=%s>%s</a> (urlhash=%lu)<br>"
			      "<input type=checkbox name=urlhash%lu value=1 "
			      // use ajax to update test crc. if you undo
			      // your check then it should put the old val
			      // back. when you first click the checkbox it
			      // should gray out the diff i guess.
			      "onclick=submitchanges(%lu,%lu);> "
			      "Accept changes"
			      "<br>"
			      "original on left, new on right. "
			      "oldcrc = <a href=/qa/content.%lu>%lu</a>"
			      " != <a href=/qa/content.%lu>%lu</a> = newcrc"
			      "<br>diff output follows:<br>"
			      "<pre id=%lu style=background-color:0xffffff;>",
			      s_qt->m_testName,
			      s_url.getUrl(),
			      s_url.getUrl(),
			      urlHash32,
			      // input checkbox name field
			      urlHash32,
			      // submitchanges() parms
			      urlHash32,
			      contentCRC,
			      // original/old content.%lu
			      *val,
			      *val,
			      // new content.%lu
			      contentCRC,
			      contentCRC,
			      // for the pre tag id:
			      urlHash32);

	// store in output
	SafeBuf sb;
	sb.load("/tmp/diffout");
	g_qaOutput.htmlEncode ( sb.getBufStart() );
	g_qaOutput.safePrintf("</pre><br><hr>");

	// if this is zero allow it to slide by. it is learning mode i
	// guess, so we can learn what crc we need to use.
	// otherwise, stop right there for debugging
	//if ( s_expectedCRC != 0 ) exit(1);

	// keep on going
	//s_callback();
}
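
// markOut() is defined elsewhere in this file; a minimal sketch of the
// behavior processReply() relies on might look like the following. This
// is a hypothetical implementation, not the real one: find each
// occurrence of "needle" and space out the volatile bytes after it, so
// two replies differing only in timestamps/counters hash to the same CRC.
static void markOutSketch ( char *content , const char *needle ) {
	if ( ! content ) return;
	long nlen = gbstrlen ( (char *)needle );
	for ( char *p = content ; (p = strstr ( p , needle )) ; ) {
		// skip the marker itself, then blank the value after it
		p += nlen;
		while ( *p && *p != '<' && *p != ',' &&
			*p != '"' && *p != '\n' )
			*p++ = ' ';
	}
}
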
void Scraper::gotPhrase ( ) {

	// error getting random phrase? bail!
	if ( g_errno )
		log("scraper: got error getting random phrase: %s",
		    mstrerror(g_errno));

	CollectionRec *cr = g_collectiondb.getRec ( m_coll );
	// bail if the collection is gone; cr is dereferenced below
	if ( ! cr ) return;

 loop:
	// what type of query should we do?
	m_qtype = rand() % 3;

	// make sure web, news, blog is enabled
	if ( m_qtype == 0 && ! cr->m_scrapingEnabledWeb   ) goto loop;
	if ( m_qtype == 1 && ! cr->m_scrapingEnabledNews  ) goto loop;
	if ( m_qtype == 2 && ! cr->m_scrapingEnabledBlogs ) goto loop;

	// scraping is off when repairing obviously
	if ( g_repairMode ) return;

	// get it
	char *s = g_wiki.m_randPhrase;

	// convert _'s to spaces
	for ( char *p = s ; *p ; p++ )
		if ( *p == '_' ) *p = ' ';

	// . url encode the random phrase
	// . truncate it to 200 bytes to keep things sane
	// . Wiki::doneReadingWiki() keeps it below 128 i think anyway
	char qe[400];
	urlEncode(qe, 200, s , gbstrlen(s) );
	char *end = qe + 390;

	// half the time append a random word from dictionary so that we
	// discover those tail-end sites better
	if ( m_qtype == 0 && (rand() % 2) ) {
		// point into it for appending
		char *p = qe + gbstrlen(qe);
		// add a space, url encoded
		*p++ = '+';
		// append a random word to it from dictionary
		char *rw = g_speller.getRandomWord();
		// append that in
		urlEncode( p , end - p - 1 , rw , gbstrlen(rw) );
	}

	// make a query to scrape
	char buf[2048];
	char *uf;
	if      ( m_qtype == 0 )
		uf="http://www.google.com/search?num=50&q=%s&scoring=d"
			"&filter=0";
	// google news query? sort by date.
	else if ( m_qtype == 1 )
		uf="http://news.google.com/news?num=50&q=%s&sort=n"
			"&filter=0";
	// google blog query?
	else if ( m_qtype == 2 )
		uf="http://www.google.com/blogsearch?num=50&q=%s&scoring=d"
			"&filter=0";
	// sanity check
	else { char *xx=NULL;*xx=0; }

	// make the url we will download
	sprintf ( buf , uf , qe );

	SpiderRequest sreq;
	// set the SpiderRequest. use the fully formed url in "buf", not
	// the printf format string "uf".
	strcpy(sreq.m_url, buf);
	// . tell it to only add the hosts of each outlink for now!
	// . that will be passed on to when XmlDoc calls Links::set() i
	//   guess
	// . xd will not reschedule the scraped url into spiderdb either
	sreq.m_isScraping  = 1;
	sreq.m_fakeFirstIp = 1;
	long firstIp = hash32n(buf);
	if ( firstIp == 0 || firstIp == -1 ) firstIp = 1;
	sreq.m_firstIp = firstIp;
	// parent docid is 0
	sreq.setKey(firstIp,0LL,false);

	// forceDel = false, niceness = 0
	m_xd.set4 ( &sreq , NULL , m_coll , NULL , 0 );

	//m_xd.m_isScraping = true;

	// download without throttling
	//m_xd.m_throttleDownload = false;

	// disregard this
	m_xd.m_useRobotsTxt = false;

	// call this when index completes
	m_xd.setCallback ( NULL , indexedDocWrapper );

	// assume it blocked
	m_numSent++;

	// scraper is special
	m_xd.m_usePosdb     = false;
	m_xd.m_useDatedb    = false;
	m_xd.m_useClusterdb = false;
	m_xd.m_useLinkdb    = false;
	m_xd.m_useSpiderdb  = true; // only this one i guess
	m_xd.m_useTitledb   = false;
	m_xd.m_useTagdb     = false;
	m_xd.m_usePlacedb   = false;
	//m_xd.m_useTimedb    = false;
	//m_xd.m_useSectiondb = false;
	//m_xd.m_useRevdb     = false;

	// . return false if this blocks
	// . will add the spider recs to spiderdb of the outlinks
	// . will add "ingoogle", etc. tags for each outlink
	if ( ! m_xd.indexDoc ( ) ) return;

	// we didn't block
	indexedDoc ( );
}
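
// For example (illustrative values): with m_qtype == 0 and the encoded
// phrase qe = "flying+buttresses", the sprintf() above expands the %s
// in "uf" to give
//	buf = "http://www.google.com/search?num=50&q=flying+buttresses"
//	      "&scoring=d&filter=0"
// which is the url the SpiderRequest downloads.
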
// . "uf" is printf url format to scrape with a %s for the query // . example: uf="http://www.google.com/search?num=50&q=%s&scoring=d&filter=0"; bool Msg7::scrapeQuery ( ) { // advance round now in case we return early m_round++; GigablastRequest *gr = &m_gr; // error? char *qts = gr->m_queryToScrape; if ( ! qts ) { char *xx=NULL;*xx=0; } if ( gbstrlen(qts) > 500 ) { g_errno = EQUERYTOOBIG; return true; } // first encode the query SafeBuf ebuf; ebuf.urlEncode ( qts ); // queryUNEncoded ); ebuf.nullTerm(); char *uf; if ( m_round == 1 ) // set to 1 for debugging uf="http://www.google.com/search?num=20&" "q=%s&scoring=d&filter=0"; //uf = "https://startpage.com/do/search?q=%s"; //uf = "http://www.google.com/" // "/cse?cx=013269018370076798483%3A8eec3papwpi&" // "ie=UTF-8&q=%s&" // "num=20"; else uf="http://www.bing.com/search?q=%s"; // skip bing for now //if ( m_round == 2 ) // return true; //if ( m_round == 1 ) // return true; // make the url we will download char ubuf[2048]; sprintf ( ubuf , uf , ebuf.getBufStart() ); // log it log("inject: SCRAPING %s",ubuf); SpiderRequest sreq; sreq.reset(); // set the SpiderRequest strcpy(sreq.m_url, ubuf); // . tell it to only add the hosts of each outlink for now! // . that will be passed on to when XmlDoc calls Links::set() i guess // . xd will not reschedule the scraped url into spiderdb either sreq.m_isScraping = 1; sreq.m_fakeFirstIp = 1; long firstIp = hash32n(ubuf); if ( firstIp == 0 || firstIp == -1 ) firstIp = 1; sreq.m_firstIp = firstIp; // parent docid is 0 sreq.setKey(firstIp,0LL,false); char *coll2 = gr->m_coll; CollectionRec *cr = g_collectiondb.getRec ( coll2 ); // forceDEl = false, niceness = 0 m_xd.set4 ( &sreq , NULL , cr->m_coll , NULL , 0 ); //m_xd.m_isScraping = true; // download without throttling //m_xd.m_throttleDownload = false; // disregard this m_xd.m_useRobotsTxt = false; // this will tell it to index ahrefs first before indexing // the doc. but do NOT do this if we are from ahrefs.com // ourselves to avoid recursive explosion!! if ( m_useAhrefs ) m_xd.m_useAhrefs = true; m_xd.m_reallyInjectLinks = true;//gr->m_injectLinks; // // rather than just add the links of the page to spiderdb, // let's inject them! // m_xd.setCallback ( this , doneInjectingLinksWrapper ); // niceness is 0 m_linkDedupTable.set(4,0,512,NULL,0,false,0,"ldtab2"); // do we actually inject the links, or just scrape? if ( ! m_xd.injectLinks ( &m_linkDedupTable , NULL, this , doneInjectingLinksWrapper ) ) return false; // otherwise, just download the google/bing search results so we // can display them in xml //else if ( m_xd.getUtf8Content() == (char **)-1 ) // return false; // print reply.. //printReply(); return true; }
// . m_key bitmap in statsdb:
//   tttttttt tttttttt tttttttt tttttttt  t = time in milliseconds, t1
//   tttttttt tttttttt tttttttt tttttttt
//   hhhhhhhh hhhhhhhh hhhhhhhh hhhhhhhh  h = hash32 of m_title
// . returns false if could not add stat, true otherwise
// . does not set g_errno if we return false, just to keep things simple
// . we only add the stat to our local statsdb rdb, but because
//   we might be dumping statsdb to disk or something it is possible
//   we get an ETRYAGAIN error, so we try to accumulate stats in a
//   local buffer in that case
// . "label" is something like "queryLatency" or whatever
// . [t1,t2] are the time endpoints for the operation being measured
// . "value" is usually "numBytes", or a quantity indicator of whatever
//   was processed
// . oldVal and newVal reflect a state change, like maybe changing the
//   value of a parm. typically for such things t1 equals t2
bool Statsdb::addStat ( int32_t     niceness ,
			const char *label ,
			int64_t     t1Arg ,
			int64_t     t2Arg ,
			float       value , // y-value really, "numBytes"
			int32_t     parmHash ,
			float       oldVal ,
			float       newVal ,
			int32_t     userId32 ) {

	if ( ! g_conf.m_useStatsdb ) return true;

	// so Process.cpp can turn it off when dumping core
	if ( m_disabled ) return true;

	// not thread safe!
	//if ( g_threads.amThread() ) {
	//	log("statsdb: called from thread");
	//	g_process.shutdownAbort(true);
	//}

	// . for now we can only add stats if we are synced with host #0
	//   clock
	// . this is kinda a hack and it would be nice to not miss stats!
	if ( ! isClockInSync() ) return true;

	RdbTree *tree = &m_rdb.m_tree;
	// do not add stats to our tree if it is loading
	if ( tree->m_isLoading ) return true;

	// convert into host #0 synced time
	t1Arg = localToGlobalTimeMilliseconds ( t1Arg );
	t2Arg = localToGlobalTimeMilliseconds ( t2Arg );

	// sanity check
	if ( ! label ) { g_process.shutdownAbort(true); }

	int32_t labelHash;
	if ( parmHash ) labelHash = parmHash;
	else            labelHash = hash32n ( label );

	// fix it for parm changes, and docs_indexed stat, etc.
	if ( t1Arg == t2Arg ) t2Arg++;

	// how many SECONDS did the op take? (convert from ms to secs)
	float dtms   = (t2Arg - t1Arg);
	float dtSecs = dtms / 1000.0;

	// we have already flushed stats 30+ seconds old, so if this op
	// took 30 seconds, discard it!
	if ( dtSecs >= 30 ) {
		//log("statsdb: stat is %"PRId32" secs > 30 secs old, "
		//    "discarding.", (int32_t)dtSecs);
		return true;
	}

	int64_t nextup;

	// loop over all "second" buckets
	for ( int64_t tx = t1Arg ; tx < t2Arg ; tx = nextup ) {

		// get next second-aligned point in milliseconds
		nextup = ((tx + 1000) / 1000) * 1000;

		// truncate if we need to
		if ( nextup > t2Arg ) nextup = t2Arg;

		// . how much of the stat is in this time interval?
		// . like if operation took 3 seconds, we might cover
		//   50% of the first 1-second interval. so we use this
		//   as a weight for the stats we keep for that particular
		//   second. then we can plot a point for each second
		//   in time which is an average of all the queries that
		//   were in progress at that second.
		float fractionTime = ((float)(nextup - tx)) / dtms;

		// . get the time point bucket in which this stat belongs
		// . every "second" in time has a bucket
		uint32_t t1 = tx / 1000;

		StatKey sk;
		memset(&sk,0,sizeof(sk));
		sk.m_zero      = 0x01; // make it a positive key
		sk.m_time1     = t1;
		sk.m_labelHash = labelHash;

		// so we can show just the stats for a particular user...
		if ( userId32 ) {
			sk.m_zero  = userId32;
			// make it positive
			sk.m_zero |= 0x01;
		}

		// if we already have added a bucket for this "second"
		// then get it from the tree so we can add to its
		// accumulated stats.
		int32_t node1 = tree->getNode ( 0 , (char *)&sk );
		int32_t node2;

		StatData *sd;

		// get that stat, see if we are accumulating it already
		if ( node1 >= 0 )
			sd = (StatData *)tree->getData ( node1 );

		// make a new one if not there
		else {
			StatData tmp;
			// init it
			memset(&tmp,0,sizeof(tmp));
			tmp.m_totalOps      = 0.0;
			tmp.m_totalQuantity = 0.0;
			tmp.m_totalTime     = 0.0;

			// save this
			int32_t saved = g_errno;

			// need to add using rdb so it can gbmemcpy the data
			if ( ! m_rdb.addRecord ( (collnum_t)0 ,
						 (char *)&sk ,
						 (char *)&tmp ,
						 sizeof(StatData) ,
						 niceness ) ) {
				if ( g_errno != ETRYAGAIN )
					log("statsdb: add rec failed: %s",
					    mstrerror(g_errno));
				// caller does not care about g_errno
				g_errno = saved;
				return false;
			}
			// caller does not care about g_errno
			g_errno = saved;

			// get the node in the tree
			//sd = (StatData *)tree->getData ( node1 );

			// must be there!
			node2 = tree->getNode ( 0 , (char *)&sk );
			// must be there!
			if ( node2 < 0 ) { g_process.shutdownAbort(true); }

			// point to it
			sd = (StatData *)tree->getData ( node2 );
		}

		// use the milliseconds elapsed as the value if none given
		//if ( value == 0 && ! parmHash )
		//	value = t2Arg - t1Arg;

		// if we got it for this time, accumulate it
		sd->m_totalOps      += 1      * fractionTime;
		sd->m_totalQuantity += value  * fractionTime;
		sd->m_totalTime     += dtSecs * fractionTime;

		if ( ! parmHash ) continue;

		// a parm change is a point event, not an accumulation
		sd->m_totalOps      = 0;
		sd->m_totalQuantity = oldVal;
		sd->m_newVal        = newVal;
		// no fractions for this!
		break;
	}

	//logf(LOG_DEBUG,"statsdb: sp=0x%"PRIx32,(int32_t)sp);

	return true;
}
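
// Worked example of the bucketing above (illustrative numbers): an op
// with t1Arg=1000ms and t2Arg=3500ms has dtms=2500. The loop visits
// three second-aligned buckets:
//	[1000,2000)  fractionTime = 1000/2500 = 0.4
//	[2000,3000)  fractionTime = 1000/2500 = 0.4
//	[3000,3500)  fractionTime =  500/2500 = 0.2
// so each per-second StatData accumulates 0.4/0.4/0.2 ops plus the same
// fraction of "value" and dtSecs, and the fractions sum to 1.0.
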
bool Statsdb::init ( ) {

	// pixel borders for the graph
	m_bx = 10;
	m_by = 40;

	// keep it at least at 20MB otherwise it is filling up the tree
	// constantly and dumping
	int32_t maxTreeMem = g_conf.m_statsdbMaxTreeMem;
	if ( maxTreeMem < 10000000 ) maxTreeMem = 10000000;

	int32_t nodeSize = sizeof(StatData)+8+12+4 + sizeof(collnum_t);
	// for debug
	//nodeSize = 50000;

	// . We take a snapshot of g_stats every minute.
	// . Each sample struct taken from g_stats ranges from 1k - 2k
	//   after compression depending on the state of the
	//   all errors arrays.
	uint32_t maxTreeNodes  = maxTreeMem / nodeSize;
	uint32_t maxCacheNodes = g_conf.m_statsdbMaxCacheMem / nodeSize;

	// assume low niceness
	m_niceness = 0;

	// init the label table
	static char s_buf[832];
	if ( ! m_labelTable.set(4,sizeof(Label *),64,
				s_buf,832,false,0,"statcolors") )
		return false;

	// stock the table
	int32_t n = (int32_t)sizeof(s_labels) / sizeof(Label);
	for ( int32_t i = 0 ; i < n ; i++ ) {
		Label *bb = &s_labels[i];
		// hash the label
		bb->m_labelHash = hash32n ( bb->m_label );
		// then incorporate the bool parm
		bb->m_graphHash = hash32h ( bb->m_labelHash ,
					    bb->m_graphType );
		// add it to the label table, keyed by graph hash
		if ( ! m_labelTable.addKey (&bb->m_graphHash,&bb ) ) {
			g_process.shutdownAbort(true); }
	}

	// sanity test
	//Stat ts;
	//ts.setKey ( 0x123456789LL , 0x7654321 );
	//if ( ts.getTime1() != 0x123456789LL ) {
	//	g_process.shutdownAbort(true); }
	//if ( ts.getLabelHash() != 0x7654321 ) {
	//	g_process.shutdownAbort(true); }
	//ts.setKey ( 1268261684329LL , -246356284 );
	//if ( ts.getTime1() != 1268261684329LL ) {
	//	g_process.shutdownAbort(true); }
	//if ( ts.getLabelHash() != -246356284 ) {
	//	g_process.shutdownAbort(true); }

	// call this twice per second
	if ( ! g_loop.registerSleepCallback(500,NULL,flushStatsWrapper))
		return log("statsdb: Failed to initialize timer callback2.");

	m_init = true;

	// make the rec cache 0 bytes, cuz we are just using page cache now
	if ( ! m_rdb.init ( g_hostdb.m_dir , // working directory
			    "statsdb" , // dbname
			    true , // dedup keys
			    sizeof(StatData) , // fixed record size
			    200 , //g_conf.m_statsdbMinFilesToMerge
			    maxTreeMem ,
			    maxTreeNodes ,
			    true , // balance tree?
			    0 , // m_statsdbMaxCchMem
			    maxCacheNodes ,
			    false , // use half keys?
			    false , // cache from disk?
			    NULL , // page cache pointer
			    false , // is titledb?
			    false ,
			    sizeof(key96_t) , // key size
			    false , // bias disk page cache?
			    true ) ) // is collectionless?
		return false;

	m_disabled = false;

	// normally Collectiondb.addColl() will call Rdb::addColl() which
	// will init the CollectionRec::m_rdbBase, which is what
	// Rdb::getBase(collnum_t) will return. however, for collectionless
	// rdb databases we set Rdb::m_collectionlessBase special here.
	return m_rdb.addRdbBase1 ( NULL );
}
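
// Illustrative only: once init() has run, a caller records a stat with
// addStat() above. The label, "numBytesProcessed", and the
// gettimeofdayInMilliseconds() helper are assumptions for this sketch;
// parmHash/oldVal/newVal are 0 because this is a measured operation,
// not a parm change, and userId32=0 means no per-user breakdown.
//
//	int64_t t1 = gettimeofdayInMilliseconds();
//	// ... perform the operation being measured ...
//	int64_t t2 = gettimeofdayInMilliseconds();
//	g_statsdb.addStat ( 0 ,              // niceness
//			    "queryLatency" , // label
//			    t1 , t2 ,
//			    (float)numBytesProcessed , // "value"
//			    0 ,   // parmHash: 0 = not a parm change
//			    0.0 , // oldVal (parm changes only)
//			    0.0 , // newVal (parm changes only)
//			    0 );  // userId32: 0 = all users
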