bool loadUrls ( ) {
    static bool s_loaded = false;
    if ( s_loaded ) return true;
    s_loaded = true;
    // use injectme3 file
    s_ubuf1.load("./injectme3");
    // scan for +++URL: xxxxx
    char *s = s_ubuf1.getBufStart();
    for ( ; *s ; s++ ) {
        if ( strncmp(s,"+++URL: ",8) ) continue;
        // got one. \0 term it for s_contentPtrs below. this
        // terminates the previous record's content.
        *s = '\0';
        // point to the url itself
        s += 8;
        // find end of it
        char *e = s;
        for ( ; *e && ! is_wspace_a(*e); e++ );
        // null term it
        if ( *e ) *e = '\0';
        // store ptr
        s_ubuf2.pushLong((long)s);
        // skip past that
        s = e;
        // point to content, which starts right after the \0
        s_cbuf2.pushLong((long)(s+1));
    }
    // make array of url ptrs
    s_urlPtrs     = (char **)s_ubuf2.getBufStart();
    s_contentPtrs = (char **)s_cbuf2.getBufStart();
    // derive the url count for deleteUrls() below (this assumes
    // pushLong() stored sizeof(long) bytes per url ptr above)
    s_numUrls = s_ubuf2.length() / sizeof(long);
    return true;
}
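// For reference, a minimal sketch of the ./injectme3 layout that loadUrls()
// parses. These urls and content are hypothetical placeholders, not the real
// qa data: each record is a "+++URL: <url>" line whose content runs until
// the next "+++URL: " marker (or end of file).
static const char *s_injectMe3Example =
    "+++URL: http://www.example.com/page1\n"
    "<html><body>content for page1</body></html>\n"
    "+++URL: http://www.example.com/page2\n"
    "<html><body>content for page2</body></html>\n";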
bool getUrl( char *path , void (* callback) (void *state, TcpSocket *sock) ) {
    SafeBuf sb;
    sb.safePrintf ( "http://%s:%li%s"
                    , iptoa(g_hostdb.m_myHost->m_ip)
                    , (long)g_hostdb.m_myHost->m_port
                    , path );
    Url u;
    u.set ( sb.getBufStart() );
    if ( ! g_httpServer.getDoc ( u.getUrl() ,
                                 0 , // ip
                                 0 , // offset
                                 -1 , // size
                                 0 , // ifmodsince
                                 NULL ,
                                 callback ,
                                 60*1000, // timeout
                                 0, // proxyip
                                 0, // proxyport
                                 -1, // maxtextdoclen
                                 -1, // maxotherdoclen
                                 NULL ) ) // useragent
        return false;
    // error?
    log("qa: getUrl error: %s",mstrerror(g_errno));
    return true;
}
// for example, RENAME log000 to log000-bak20131104-181932
static bool renameCurrentLogFile ( ) {
    File f;
    char tmp[16];
    sprintf(tmp,"log%03" PRId32,g_hostdb.m_hostId);
    f.set ( g_hostdb.m_dir , tmp );
    // make new filename like log000-bak20131104-181932
    time_t now = time(NULL);
    struct tm tm_buf;
    tm *tm1 = gmtime_r(&now,&tm_buf);
    char tmp2[64];
    strftime(tmp2,64,"%Y%m%d-%H%M%S",tm1);
    SafeBuf newName;
    if ( ! newName.safePrintf ( "%slog%03" PRId32"-bak%s",
                                g_hostdb.m_dir,
                                g_hostdb.m_hostId,
                                tmp2 ) ) {
        fprintf(stderr,"log rename failed\n");
        return false;
    }
    // rename log000 to log000-bak20131104-181932
    if ( f.doesExist() ) {
        //fprintf(stdout,"renaming file\n");
        f.rename ( newName.getBufStart() );
    }
    return true;
}
bool saveHashTable ( ) {
    if ( s_ht.m_numSlotsUsed <= 0 ) return true;
    SafeBuf fn;
    fn.safePrintf("%s/qa/",g_hostdb.m_dir);
    log("qa: saving crctable.dat");
    s_ht.save ( fn.getBufStart() , "crctable.dat" );
    return true;
}
bool qascrape ( ) {
    //
    // delete the 'qatest123' collection
    //
    //static bool s_x1 = false;
    if ( ! s_flags[0] ) {
        s_flags[0] = true;
        if ( ! getUrl ( "/admin/delcoll?xml=1&delcoll=qatest123" ) )
            return false;
    }
    //
    // add the 'qatest123' collection
    //
    //static bool s_x2 = false;
    if ( ! s_flags[1] ) {
        s_flags[1] = true;
        if ( ! getUrl ( "/admin/addcoll?addcoll=qatest123&xml=1" ,
                        // checksum of reply expected
                        238170006 ) )
            return false;
    }
    // scrape it
    if ( ! s_flags[3] ) {
        s_flags[3] = true;
        SafeBuf sb;
        sb.safePrintf( "/admin/inject?c=qatest123&"
                       "format=xml&qts=test");
        if ( ! getUrl ( sb.getBufStart() , 999 ) )
            return false;
    }
    // verify the checksummed search results for the 'test' query
    //static bool s_y4 = false;
    if ( ! s_flags[6] ) {
        s_flags[6] = true;
        if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&"
                        "q=test",
                        -1310551262 ) )
            return false;
    }
    //static bool s_fee2 = false;
    if ( ! s_flags[13] ) {
        s_flags[13] = true;
        log("qa: SUCCESSFULLY COMPLETED "
            "QA SCRAPE TEST");
        return true;
    }
    return true;
}
// . returns false if blocked, true otherwise
// . returns true right away on a quick error, e.g. a connect failure
bool getUrl( char *path , long checkCRC = 0 , char *post = NULL ) {
    SafeBuf sb;
    sb.safePrintf ( "http://%s:%li%s"
                    , iptoa(g_hostdb.m_myHost->m_ip)
                    , (long)g_hostdb.m_myHost->m_httpPort
                    , path );
    s_checkCRC = checkCRC;
    // searches are GETs, everything else is POSTed
    bool doPost = true;
    if ( strncmp ( path , "/search" , 7 ) == 0 )
        doPost = false;
    //Url u;
    s_url.set ( sb.getBufStart() );
    log("qa: getting %s",sb.getBufStart());
    if ( ! g_httpServer.getDoc ( s_url.getUrl() ,
                                 0 , // ip
                                 0 , // offset
                                 -1 , // size
                                 0 , // ifmodsince
                                 NULL ,
                                 gotReplyWrapper,
                                 999999*1000, // timeout ms
                                 0, // proxyip
                                 0, // proxyport
                                 -1, // maxtextdoclen
                                 -1, // maxotherdoclen
                                 NULL , // useragent
                                 "HTTP/1.0" , // protocol
                                 doPost , // doPost
                                 NULL , // cookie
                                 NULL , // additionalHeader
                                 NULL , // fullRequest
                                 post ) )
        return false;
    // error?
    processReply ( NULL , 0 );
    //log("qa: getUrl error: %s",mstrerror(g_errno));
    return true;
}
// . the url being requested
// . removes &code= facebook cruft
bool HttpRequest::getCurrentUrl ( SafeBuf &cu ) {
    // make sure we got enough room
    if ( ! cu.reserve ( m_hostLen + 64 + m_plen + 1 + 1 ) ) return false;
    // need a "Host: "
    char *host = m_host;
    if ( ! host ) host = APPSUBDOMAIN;
    cu.safePrintf("http");
    if ( m_isSSL ) cu.pushChar('s');
    cu.safePrintf("://%s",host);
    char *path = m_path;
    long  plen = m_plen;
    if ( ! path ) { path = "/"; plen = 1; }
    // . scan path and change \0 back to = or &
    // . similar logic in HttpServer.cpp for logging!
    char *dst = cu.getBuf();
    char *src = path;
    char *srcEnd = path + plen;
    // the separators alternate, starting with '='
    char dd = '=';
    for ( ; src < srcEnd ; src++ , dst++ ) {
        *dst = *src;
        if ( *src ) continue;
        *dst = dd;
        if ( dd == '=' ) dd = '&';
        else             dd = '=';
    }
    *dst = '\0';
    // cut it off at facebook's &code=
    char *buf  = cu.getBufStart();
    char *code = strstr( buf,"&code=");
    // fix for eventguru.com/blog.html?code=
    if ( ! code ) code = strstr(buf,"?code=");
    // hack that off if there
    if ( code ) {
        *code = '\0';
        dst   = code;
    }
    // update length
    cu.setLength( dst - cu.getBufStart() );
    return true;
}
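// Standalone sketch of the restore loop in getCurrentUrl() above, operating
// on hypothetical local buffers rather than HttpRequest members. The parser
// stores the path with each '=' and '&' replaced by '\0' so every parm name
// and value is a C string; the loop puts the separators back, alternating
// '=' and '&' starting with '='.
static void restorePathSketch ( ) {
    // "/search?q=test&n=10" as it would sit in m_path/m_plen
    char path[] = "/search?q\0test\0n\010";
    long plen   = sizeof(path) - 1; // includes the embedded \0's
    char out[64];
    char *dst = out;
    char dd = '=';
    for ( long i = 0 ; i < plen ; i++ , dst++ ) {
        *dst = path[i];
        if ( path[i] ) continue;
        *dst = dd;
        if ( dd == '=' ) dd = '&';
        else             dd = '=';
    }
    *dst = '\0';
    // out now holds "/search?q=test&n=10"
}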
// . run a series of tests to ensure that gb is functioning properly
// . uses the ./qa subdirectory to hold archive pages, ips, spider dates to
//   ensure consistency between tests for exact replays
bool qatest ( ) {
    if ( s_registered ) {
        g_loop.unregisterSleepCallback(NULL,qatestWrapper);
        s_registered = false;
    }
    if ( ! s_callback ) s_callback = qatest;
    if ( ! g_qaSock ) return true;
    // returns true when done, false when blocked
    //if ( ! qainject ( ) ) return false;
    // returns true when done, false when blocked
    //if ( ! qaspider ( ) ) return false;
    long n = sizeof(s_qatests)/sizeof(QATest);
    for ( long i = 0 ; i < n ; i++ ) {
        QATest *qt = &s_qatests[i];
        if ( ! qt->m_doTest ) continue;
        // remember which test is running
        s_qt = qt;
        // point to its flags
        s_flags = qt->m_flags;
        // call the qa test function. it returns false if blocked.
        if ( ! qt->m_func() ) return false;
    }
    // save this
    saveHashTable();
    // do not reset since we don't reload it above!
    //s_ht.reset();
    //if ( g_numErrors )
    //    g_qaOutput.safePrintf("<input type=submit value=submit><br>");
    g_qaOutput.safePrintf("<br>DONE RUNNING QA TESTS<br>");
    // . print the output
    // . the result of each test is stored in the g_qaOutput safebuf
    g_httpServer.sendDynamicPage(g_qaSock,
                                 g_qaOutput.getBufStart(),
                                 g_qaOutput.length(),
                                 -1/*cachetime*/);
    g_qaOutput.purge();
    g_qaSock = NULL;
    return true;
}
void doneInjectingLinksWrapper ( void *state ) {
    Msg7 *msg7 = (Msg7 *)state;
    SafeBuf *sb = &msg7->m_sb;
    // copy the serps into our buf
    if ( ! g_errno ) {
        // print header
        if ( sb->length() == 0 ) {
            // print header of page
            sb->safePrintf("<?xml version=\"1.0\" "
                           "encoding=\"UTF-8\" ?>\n"
                           "<response>\n" );
        }
        // serp header
        if ( msg7->m_round == 1 )
            sb->safePrintf("\t<googleResults>\n");
        else
            sb->safePrintf("\t<bingResults>\n");
        // print results
        sb->safeMemcpy(&msg7->m_xd.m_serpBuf);
        // end that
        if ( msg7->m_round == 1 )
            sb->safePrintf("\t</googleResults>\n");
        else
            sb->safePrintf("\t</bingResults>\n");
    }
    // do bing now
    if ( msg7->m_round == 1 ) {
        // return if it blocks
        if ( ! msg7->scrapeQuery() ) return;
    }
    // otherwise, parse out the search results so steve can display them
    if ( g_errno )
        sb->safePrintf("<error><![CDATA[%s]]></error>\n",
                       mstrerror(g_errno));
    // print tail of page
    sb->safePrintf("</response>\n");
    // page is not more than 32k
    //char buf[1024*32];
    //char *p = buf;
    // return docid and hostid
    //p += sprintf ( p , "scraping status ");
    // print error msg out, too or "Success"
    //p += sprintf ( p , "%s", mstrerror(g_errno));
    TcpSocket *sock = msg7->m_socket;
    g_httpServer.sendDynamicPage ( sock,
                                   sb->getBufStart(),
                                   sb->length(),
                                   -1/*cachetime*/);
    // hopefully sb buffer is copied because this will free it:
    mdelete ( msg7, sizeof(Msg7) , "PageInject" );
    delete (msg7);
}
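// Shape of the XML envelope doneInjectingLinksWrapper() accumulates across
// the two scrape rounds (round 1 = google, round 2 = bing), per the
// safePrintf calls above:
//
//   <?xml version="1.0" encoding="UTF-8" ?>
//   <response>
//       <googleResults> ...serps from round 1... </googleResults>
//       <bingResults>   ...serps from round 2... </bingResults>
//   </response>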
bool deleteUrls ( ) {
    static long s_ii2 = 0;
    for ( ; s_ii2 < s_numUrls ; ) {
        // pre-inc it
        s_ii2++;
        // reject using html api. (the cursor was pre-incremented,
        // so index with s_ii2-1 to hit the current url.)
        SafeBuf sb;
        sb.safePrintf( "/admin/inject?c=qatest123&delete=1&u=");
        sb.urlEncode ( s_urlPtrs[s_ii2-1] );
        return getUrl ( sb.getBufStart() , qatestWrapper );
    }
    return true;
}
// ensure search results are consistent
bool searchTest2 () {
    // resumable cursor, survives across calls
    static long s_qi2 = 0;
    long nq = sizeof(s_queries)/sizeof(char *);
    for ( ; s_qi2 < nq ; ) {
        // pre-inc it
        s_qi2++;
        // search using html api. (index with s_qi2-1 since the
        // cursor was pre-incremented.)
        SafeBuf sb;
        // qa=1 tells gb to exclude "variable" or "random" things
        // from the serps so we can checksum it consistently
        sb.safePrintf ( "/search?c=qatest123&qa=1&q=" );
        sb.urlEncode ( s_queries[s_qi2-1] );
        return getUrl ( sb.getBufStart() , doneSearching2 );
    }
    return true;
}
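// Note: deleteUrls() and searchTest2() share the same resumable-loop pattern
// used throughout this qa code: a static cursor survives across calls, each
// call issues exactly one http request and returns false (blocked), and the
// registered callback (qatestWrapper/doneSearching2) re-enters the qa test
// driver, which calls the function again until the cursor passes the end
// and it finally returns true.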
bool JsonItem::getCompoundName ( SafeBuf &nameBuf ) {
    // reset, but don't free mem etc. just set m_length to 0
    nameBuf.reset();
    // get its full compound name like "meta.twitter.title"
    JsonItem *p = this;//ji;
    char *lastName = NULL;
    char *nameArray[20];
    int32_t numNames = 0;
    for ( ; p ; p = p->m_parent ) {
        // empty name?
        if ( ! p->m_name    ) continue;
        if ( ! p->m_name[0] ) continue;
        // dup? can happen with arrays. parent of string
        // in object, has same name as his parent, the
        // name of the array. "dupname":[{"a":"b"},{"c":"d"}]
        if ( p->m_name == lastName ) continue;
        // update
        lastName = p->m_name;
        // add it up
        nameArray[numNames++] = p->m_name;
        // too many names?
        if ( numNames < 15 ) continue;
        log("build: too many names in json tag");
        break;
    }
    // assemble the names in reverse order which is correct order
    for ( int32_t i = 1 ; i <= numNames ; i++ ) {
        // copy into our safebuf
        if ( ! nameBuf.safeStrcpy ( nameArray[numNames-i]) )
            return false;
        // separate names with periods
        if ( ! nameBuf.pushChar('.') ) return false;
    }
    // remove last period
    nameBuf.removeLastChar('.');
    // and null terminate
    if ( ! nameBuf.nullTerm() ) return false;
    // change all :'s in names to .'s since : is reserved!
    char *px = nameBuf.getBufStart();
    for ( ; *px ; px++ )
        if ( *px == ':' ) *px = '.';
    return true;
}
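// Hypothetical illustration of the compound names getCompoundName() builds:
// for the JSON below, the JsonItem holding "some tweet" walks its m_parent
// chain collecting {"title","twitter","meta"}, and the reverse-order
// assembly yields "meta.twitter.title". Any ':' inside a name would then be
// rewritten to '.' since ':' is reserved.
//
//   { "meta": { "twitter": { "title": "some tweet" } } }
//     => compound name "meta.twitter.title"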
bool Log::init ( char *filename ) {
    // set the main process id
    //s_pid = getpidtid();
    setPid();
    // init these
    m_numErrors =  0;
    m_bufPtr    =  0;
    m_fd        = -1;
    m_disabled  = false;
#ifdef DEBUG
    g_dbufSize = 4096;
    g_dbuf = (char*)mmalloc(g_dbufSize,"Log: DebugBuffer");
    if (!g_dbuf) fprintf(stderr, "Unable to init debug buffer");
#endif
    //m_hostname = g_conf.m_hostname;
    //m_port = port;
    // is there a filename to log our errors to?
    m_filename = filename;
    if ( ! m_filename ) return true;
    // skip this for now
    //return true;

    //
    // RENAME log000 to log000-2013_11_04-18:19:32
    //
    if ( g_conf.m_runAsDaemon ) {
        File f;
        char tmp[16];
        sprintf(tmp,"log%03li",g_hostdb.m_hostId);
        f.set ( g_hostdb.m_dir , tmp );
        // make new filename like log000-2013_11_04-18:19:32
        time_t now = getTimeLocal();
        tm *tm1 = gmtime((const time_t *)&now);
        char tmp2[64];
        strftime(tmp2,64,"%Y_%m_%d-%T",tm1);
        SafeBuf newName;
        if ( ! newName.safePrintf ( "%slog%03li-%s",
                                    g_hostdb.m_dir,
                                    g_hostdb.m_hostId,
                                    tmp2 ) ) {
            fprintf(stderr,"log rename failed\n");
            return false;
        }
        // rename log000 to log000-2013_11_04-18:19:32
        if ( f.doesExist() ) {
            //fprintf(stdout,"renaming file\n");
            f.rename ( newName.getBufStart() );
        }
    }
    // open it for appending.
    // create with -rw-rw-r-- permissions if it's not there.
    m_fd = open ( m_filename ,
                  O_APPEND | O_CREAT | O_RDWR ,
                  S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH );
    if ( m_fd >= 0 ) return true;
    // bitch to stderr and return false on error
    fprintf(stderr,"could not open log file %s for appending\n",
            m_filename);
    return false;
}
// . now come here when we got the necessary index lists
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool Msg39::intersectLists ( ) { // bool updateReadInfo ) {
    // bail on error
    if ( g_errno ) {
    hadError:
        log("msg39: Had error getting termlists: %s.",
            mstrerror(g_errno));
        if ( ! g_errno ) { char *xx=NULL;*xx=0; }
        //sendReply (m_slot,this,NULL,0,0,true);
        return true;
    }
    // timestamp log
    if ( m_debug ) {
        log(LOG_DEBUG,"query: msg39: [%"PTRFMT"] "
            "Got %"INT32" lists in %"INT64" ms"
            , (PTRTYPE)this,m_tmpq.getNumTerms(),
            gettimeofdayInMilliseconds() - m_startTime);
        m_startTime = gettimeofdayInMilliseconds();
    }
    // breathe
    QUICKPOLL ( m_r->m_niceness );
    // ensure collection not deleted from under us
    CollectionRec *cr = g_collectiondb.getRec ( m_r->m_collnum );
    if ( ! cr ) {
        g_errno = ENOCOLLREC;
        goto hadError;
    }
    // . set the IndexTable so it can set its score weights from the
    //   termFreqs of each termId in the query
    // . this now takes into account the special termIds used for sorting
    //   by date (0xdadadada and 0xdadadad2 & TERMID_MASK)
    // . it should weight them so much so that the summation of scores
    //   from other query terms cannot make up for a lower date score
    // . this will actually calculate the top
    // . this might also change m_tmpq.m_termSigns
    // . this won't do anything if it was already called
    m_posdbTable.init ( &m_tmpq ,
                        m_debug ,
                        this ,
                        &m_tt ,
                        m_r->m_collnum,//ptr_coll ,
                        &m_msg2 , // m_lists ,
                        //m_tmpq.m_numTerms , // m_numLists
                        m_r );
    // breathe
    QUICKPOLL ( m_r->m_niceness );
    // . we have to do this here now too
    // . but if we are getting weights, we don't need m_tt!
    // . actually we were using it before for rat=0/bool queries but
    //   i got rid of NO_RAT_SLOTS
    if ( ! m_allocedTree && ! m_posdbTable.allocTopTree() ) {
        if ( ! g_errno ) { char *xx=NULL;*xx=0; }
        //sendReply ( m_slot , this , NULL , 0 , 0 , true);
        return true;
    }
    // if msg2 had ALL empty lists we can cut it short
    if ( m_posdbTable.m_topTree->m_numNodes == 0 ) {
        //estimateHitsAndSendReply ( );
        return true;
    }
    // we have to allocate this with each call because each call can
    // be a different docid range from doDocIdSplitLoop.
    if ( ! m_posdbTable.allocWhiteListTable() ) {
        log("msg39: Had error allocating white list table: %s.",
            mstrerror(g_errno));
        if ( ! g_errno ) { char *xx=NULL;*xx=0; }
        //sendReply (m_slot,this,NULL,0,0,true);
        return true;
    }
    // do not re do it if doing docid range splitting
    m_allocedTree = true;
    // . now we must call this separately here, not in allocTopTree()
    // . we have to re-set the QueryTermInfos with each docid range split
    //   since it will set the list ptrs from the msg2 lists
    if ( ! m_posdbTable.setQueryTermInfo () ) return true;
    // print query term bit numbers here
    for ( int32_t i = 0 ; m_debug && i < m_tmpq.getNumTerms() ; i++ ) {
        QueryTerm *qt = &m_tmpq.m_qterms[i];
        //utf16ToUtf8(bb, 256, qt->m_term, qt->m_termLen);
        char *tpc = qt->m_term + qt->m_termLen;
        char  tmp = *tpc;
        *tpc = '\0';
        SafeBuf sb;
        sb.safePrintf("query: msg39: BITNUM query term #%"INT32" \"%s\" "
                      "bitnum=%"INT32" ", i , qt->m_term, qt->m_bitNum );
        // put it back
        *tpc = tmp;
        logf(LOG_DEBUG,"%s",sb.getBufStart());
    }
    // timestamp log
    if ( m_debug ) {
        log(LOG_DEBUG,"query: msg39: [%"PTRFMT"] "
            "Preparing to intersect "
            "took %"INT64" ms",
            (PTRTYPE)this,
            gettimeofdayInMilliseconds() - m_startTime );
        m_startTime = gettimeofdayInMilliseconds();
    }
    // time it
    int64_t start = gettimeofdayInMilliseconds();
    int64_t diff;
    // . don't bother making a thread if lists are small
    // . look at STAGE? in IndexReadInfo.cpp to see how we read in stages
    // . it's always saying msg39 handler is hogging cpu...could this be it
    //if ( m_msg2.getTotalRead() < 2000*8 ) goto skipThread;
    // debug
    //goto skipThread;
    // . NOW! let's do this in a thread so we can continue to service
    //   incoming requests
    // . don't launch more than 1 thread at a time for this
    // . set callback when thread done
    // breathe
    QUICKPOLL ( m_r->m_niceness );
    // . create the thread
    // . only one of these type of threads should be launched at a time
    if ( ! m_debug &&
         g_threads.call ( INTERSECT_THREAD , // threadType
                          m_r->m_niceness ,
                          this , // top 4 bytes must be cback
                          controlLoopWrapper2,//threadDoneWrapper ,
                          addListsWrapper ) ) {
        m_blocked = true;
        return false;
    }
    // if it failed
    //log(LOG_INFO,"query: Intersect thread creation failed. Doing "
    //    "blocking. Hurts performance.");
    // check tree
    if ( m_tt.m_nodes == NULL ) {
        log(LOG_LOGIC,"query: msg39: Badness.");
        char *xx = NULL; *xx = 0;
    }
    // sometimes we skip the thread
    //skipThread:
    // . addLists() should never have a problem
    // . g_errno should be set by prepareToAddLists() above if there is
    //   going to be a problem
    //if ( m_r->m_useNewAlgo )
    m_posdbTable.intersectLists10_r ( );
    //else
    //    m_posdbTable.intersectLists9_r ( );
    // time it
    diff = gettimeofdayInMilliseconds() - start;
    if ( diff > 10 ) log("query: Took %"INT64" ms for intersection",diff);
    // returns false if blocked, true otherwise
    //return addedLists ();
    return true;
}
// . returns false if blocked, true otherwise
// . sets g_errno on error
// . called either from
//   1) doDocIdSplitLoop
//   2) or getDocIds2() if only 1 docidsplit
bool Msg39::getLists () {
    if ( m_debug ) m_startTime = gettimeofdayInMilliseconds();
    // . ask Indexdb for the IndexLists we need for these termIds
    // . each rec in an IndexList is a termId/score/docId tuple
    //
    // restrict to docid range?
    //
    // . get the docid start and end
    // . do docid partitioning so we can send to all hosts
    //   in the network, not just one stripe
    int64_t docIdStart = 0;
    int64_t docIdEnd = MAX_DOCID;
    // . restrict to this docid?
    // . will really make gbdocid:| searches much faster!
    int64_t dr = m_tmpq.m_docIdRestriction;
    if ( dr ) {
        docIdStart = dr;
        docIdEnd   = dr + 1;
    }
    // . override
    // . this is set from Msg39::doDocIdSplitLoop() to compute
    //   search results in stages, so that we do not load massive
    //   termlists into memory and get OOM (out of memory)
    if ( m_r->m_minDocId != -1 ) docIdStart = m_r->m_minDocId;
    if ( m_r->m_maxDocId != -1 ) docIdEnd   = m_r->m_maxDocId+1;
    // if we have twins, then make sure the twins read different
    // pieces of the same docid range to make things 2x faster
    //bool useTwins = false;
    //if ( g_hostdb.getNumStripes() == 2 ) useTwins = true;
    //if ( useTwins ) {
    //    int64_t delta2 = ( docIdEnd - docIdStart ) / 2;
    //    if ( m_r->m_stripe == 0 ) docIdEnd = docIdStart + delta2;
    //    else                      docIdStart = docIdStart + delta2;
    //}
    // new striping logic:
    int32_t numStripes = g_hostdb.getNumStripes();
    int64_t delta2 = ( docIdEnd - docIdStart ) / numStripes;
    int32_t stripe = g_hostdb.getMyHost()->m_stripe;
    docIdStart += delta2 * stripe; // is this right?
    docIdEnd = docIdStart + delta2;
    // add 1 to be safe so we don't lose a docid
    docIdEnd++;
    // TODO: add triplet support later for this to split the
    // read 3 ways. 4 ways for quads, etc.
    //if ( g_hostdb.getNumStripes() >= 3 ) { char *xx=NULL;*xx=0;}
    // do not go over MAX_DOCID because it gets masked and
    // ends up being 0!!! and we get empty lists
    if ( docIdEnd > MAX_DOCID ) docIdEnd = MAX_DOCID;
    // remember so Msg2.cpp can use them to restrict the termlists
    // from "whiteList" as well
    m_docIdStart = docIdStart;
    m_docIdEnd   = docIdEnd;
    //
    // set startkey/endkey for each term/termlist
    //
    for ( int32_t i = 0 ; i < m_tmpq.getNumTerms() ; i++ ) {
        // breathe
        QUICKPOLL ( m_r->m_niceness );
        // shortcuts
        QueryTerm *qterm = &m_tmpq.m_qterms[i];
        char *sk = qterm->m_startKey;
        char *ek = qterm->m_endKey;
        // get the term id
        int64_t tid = m_tmpq.getTermId(i);
        // if only 1 stripe
        //if ( g_hostdb.getNumStripes() == 1 ) {
        //    docIdStart = 0;
        //    docIdEnd   = MAX_DOCID;
        //}
        // debug
        if ( m_debug )
            log("query: setting sk/ek for docids %"INT64""
                " to %"INT64" for termid=%"INT64""
                , docIdStart
                , docIdEnd
                , tid );
        // store now in qterm
        g_posdb.makeStartKey ( sk , tid , docIdStart );
        g_posdb.makeEndKey   ( ek , tid , docIdEnd );
        qterm->m_ks = sizeof(POSDBKEY);//key144_t);
    }
    // debug msg
    if ( m_debug || g_conf.m_logDebugQuery ) {
        for ( int32_t i = 0 ; i < m_tmpq.getNumTerms() ; i++ ) {
            // get the term in utf8
            //char bb[256];
            QueryTerm *qt = &m_tmpq.m_qterms[i];
            //utf16ToUtf8(bb, 256, qt->m_term, qt->m_termLen);
            char *tpc = qt->m_term + qt->m_termLen;
            char  tmp = *tpc;
            *tpc = '\0';
            char sign = qt->m_termSign;
            if ( sign == 0 ) sign = '0';
            QueryWord *qw = qt->m_qword;
            int32_t wikiPhrId = qw->m_wikiPhraseId;
            if ( m_tmpq.isPhrase(i) ) wikiPhrId = 0;
            char leftwikibigram = 0;
            char rightwikibigram = 0;
            if ( qt->m_leftPhraseTerm &&
                 qt->m_leftPhraseTerm->m_isWikiHalfStopBigram )
                leftwikibigram = 1;
            if ( qt->m_rightPhraseTerm &&
                 qt->m_rightPhraseTerm->m_isWikiHalfStopBigram )
                rightwikibigram = 1;
            /*
            char c = m_tmpq.getTermSign(i);
            char tt[512];
            int32_t ttlen = m_tmpq.getTermLen(i);
            if ( ttlen > 254 ) ttlen = 254;
            if ( ttlen < 0   ) ttlen = 0;
            // old:painful: convert each term from unicode to ascii
            gbmemcpy ( tt , m_tmpq.getTerm(i) , ttlen );
            */
            int32_t isSynonym = 0;
            QueryTerm *st = qt->m_synonymOf;
            if ( st ) isSynonym = true;
            SafeBuf sb;
            // now we can display it
            //tt[ttlen]='\0';
            //if ( c == '\0' ) c = ' ';
            sb.safePrintf(
                "query: msg39: [%"PTRFMT"] "
                "query term #%"INT32" \"%s\" "
                "phr=%"INT32" termId=%"UINT64" rawTermId=%"UINT64" "
                //"estimatedTermFreq=%"INT64" (+/- ~16000) "
                "tfweight=%.02f "
                "sign=%c "
                "numPlusses=%hhu "
                "required=%"INT32" "
                "fieldcode=%"INT32" "
                "ebit=0x%0"XINT64" "
                "impBits=0x%0"XINT64" "
                "wikiphrid=%"INT32" "
                "leftwikibigram=%"INT32" "
                "rightwikibigram=%"INT32" "
                //"range.startTermNum=%hhi range.endTermNum=%hhi "
                //"minRecSizes=%"INT32" "
                "readSizeInBytes=%"INT32" "
                //"ebit=0x%"XINT64" "
                //"impBits=0x%"XINT64" "
                "hc=%"INT32" "
                "component=%"INT32" "
                "otermLen=%"INT32" "
                "isSynonym=%"INT32" "
                "querylangid=%"INT32" " ,
                (PTRTYPE)this ,
                i ,
                qt->m_term,//bb ,
                (int32_t)m_tmpq.isPhrase (i) ,
                m_tmpq.getTermId    (i) ,
                m_tmpq.getRawTermId (i) ,
                ((float *)m_r->ptr_termFreqWeights)[i] ,
                sign , //c ,
                0 ,
                (int32_t)qt->m_isRequired,
                (int32_t)qt->m_fieldCode,
                (int64_t)qt->m_explicitBit  ,
                (int64_t)qt->m_implicitBits ,
                wikiPhrId,
                (int32_t)leftwikibigram,
                (int32_t)rightwikibigram,
                ((int32_t *)m_r->ptr_readSizes)[i] ,
                //(int64_t)m_tmpq.m_qterms[i].m_explicitBit  ,
                //(int64_t)m_tmpq.m_qterms[i].m_implicitBits ,
                (int32_t)m_tmpq.m_qterms[i].m_hardCount ,
                (int32_t)m_tmpq.m_componentCodes[i],
                (int32_t)m_tmpq.getTermLen(i) ,
                isSynonym,
                (int32_t)m_tmpq.m_langId ); // ,tt
            // put it back
            *tpc = tmp;
            if ( st ) {
                int32_t stnum = st - m_tmpq.m_qterms;
                sb.safePrintf("synofterm#=%"INT32"",stnum);
                //sb.safeMemcpy(st->m_term,st->m_termLen);
                sb.pushChar(' ');
                sb.safePrintf("synwid0=%"INT64" ",qt->m_synWids0);
                sb.safePrintf("synwid1=%"INT64" ",qt->m_synWids1);
                sb.safePrintf("synalnumwords=%"INT32" ",
                              qt->m_numAlnumWordsInSynonym);
                // like for synonym "nj" it's base,
                // "new jersey" has 2 alnum words!
                sb.safePrintf("synbasealnumwords=%"INT32" ",
                              qt->m_numAlnumWordsInBase);
            }
            logf(LOG_DEBUG,"%s",sb.getBufStart());
        }
        m_tmpq.printBooleanTree();
    }
    // timestamp log
    if ( m_debug )
        log(LOG_DEBUG,"query: msg39: [%"PTRFMT"] "
            "Getting %"INT32" index lists ",
            (PTRTYPE)this,m_tmpq.getNumTerms());
    // . now get the index lists themselves
    // . return if it blocked
    // . not doing a merge (last parm) means that the lists we receive
    //   will be an appending of a bunch of lists so keys won't be in order
    // . merging is unnecessary for us here because we hash the keys anyway
    // . and merging takes up valuable cpu time
    // . caution: the index lists returned from Msg2 are now compressed
    // . now i'm merging because it's 10 times faster than hashing anyway
    //   and the reply buf should now always be <= minRecSizes so we can
    //   pre-allocate one better, and, 3) this should fix the yahoo.com
    //   reindex bug
    char rdbId = RDB_POSDB;
    // . TODO: MDW: fix
    // . partap says there is a bug in this??? we can't cache UOR'ed lists?
    bool checkCache = false;
    // split is us????
    //int32_t split = g_hostdb.m_myHost->m_group;
    int32_t split = g_hostdb.m_myHost->m_shardNum;
    // call msg2
    if ( ! m_msg2.getLists ( rdbId ,
                             m_r->m_collnum,//m_r->ptr_coll ,
                             m_r->m_maxAge ,
                             m_r->m_addToCache ,
                             //m_tmpq.m_qterms ,
                             &m_tmpq,
                             m_r->ptr_whiteList,
                             // we need to restrict docid range for
                             // whitelist as well! this is from
                             // doDocIdSplitLoop()
                             m_docIdStart,
                             m_docIdEnd,
                             // how much of each termlist to read in bytes
                             (int32_t *)m_r->ptr_readSizes ,
                             //m_tmpq.getNumTerms() , // numLists
                             // 1-1 with query terms
                             m_lists ,
                             this ,
                             controlLoopWrapper,//gotListsWrapper ,
                             m_r ,
                             m_r->m_niceness ,
                             true , // do merge?
                             m_debug ,
                             NULL , // best hostids
                             m_r->m_restrictPosdbForQuery ,
                             split ,
                             checkCache )) {
        m_blocked = true;
        return false;
    }
    // error?
    //if ( g_errno ) {
    //    log("msg39: Had error getting termlists2: %s.",
    //        mstrerror(g_errno));
    //    // don't bail out here because we are in docIdSplitLoop()
    //    //sendReply (m_slot,this,NULL,0,0,true);
    //    return true;
    //}
    //return gotLists ( true );
    return true;
}
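// Worked sketch of the striping math in Msg39::getLists() above, assuming a
// hypothetical 2-stripe network over the full docid range. Each stripe gets
// an equal slice, the trailing docIdEnd++ guards against losing a boundary
// docid (adjacent slices may overlap by at most one docid), and the final
// clamp keeps docIdEnd from being masked down to 0:
//
//   delta2 = ( MAX_DOCID - 0 ) / 2
//   stripe 0: docIdStart = 0      , docIdEnd = delta2   + 1
//   stripe 1: docIdStart = delta2 , docIdEnd = 2*delta2 + 1 -> MAX_DOCID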
void Msg39::estimateHitsAndSendReply ( ) {
    // this Msg39 is no longer in use
    m_inUse = false;
    // now this for the query loop on the QueryLogEntries.
    m_topDocId50 = 0LL;
    m_topScore50 = 0.0;
    // a little hack for the seo pipeline in xmldoc.cpp
    m_topDocId  = 0LL;
    m_topScore  = 0.0;
    m_topDocId2 = 0LL;
    m_topScore2 = 0.0;
    int32_t ti = m_tt.getHighNode();
    if ( ti >= 0 ) {
        TopNode *t = &m_tt.m_nodes[ti];
        m_topDocId = t->m_docId;
        m_topScore = t->m_score;
    }
    // try the 2nd one too
    int32_t ti2 = -1;
    if ( ti >= 0 ) ti2 = m_tt.getNext ( ti );
    if ( ti2 >= 0 ) {
        TopNode *t2 = &m_tt.m_nodes[ti2];
        m_topDocId2 = t2->m_docId;
        m_topScore2 = t2->m_score;
    }
    // convenience ptrs. we will store the docids/scores into these arrays
    int64_t *topDocIds;
    double  *topScores;
    key_t   *topRecs;
    // numDocIds counts docs in all tiers when using toptree.
    int32_t numDocIds = m_tt.m_numUsedNodes;
    // the msg39 reply we send back
    int32_t replySize;
    char   *reply;
    //m_numTotalHits = m_posdbTable.m_docIdVoteBuf.length() / 6;
    // make the reply?
    Msg39Reply mr;
    // this is what you want to look at if there is no seo.cpp module...
    if ( ! m_callback ) {
        // if we got clusterdb recs in here, use 'em
        if ( m_gotClusterRecs ) numDocIds = m_numVisible;
        // don't send more than the docs that are asked for
        if ( numDocIds > m_r->m_docsToGet) numDocIds =m_r->m_docsToGet;
        // # of QueryTerms in query
        int32_t nqt = m_tmpq.m_numTerms;
        // start setting the stuff
        mr.m_numDocIds = numDocIds;
        // copy # estimated hits into 8 bytes of reply
        //int64_t est = m_posdbTable.m_estimatedTotalHits;
        // ensure it has at least as many results as we got
        //if ( est < numDocIds ) est = numDocIds;
        // or if too big...
        //if ( numDocIds < m_r->m_docsToGet ) est = numDocIds;
        // . total estimated hits
        // . this is now an EXACT count!
        mr.m_estimatedHits = m_numTotalHits;
        // sanity check
        mr.m_nqt = nqt;
        // the m_errno if any
        mr.m_errno = m_errno;
        // shortcut
        PosdbTable *pt = &m_posdbTable;
        // the score info, in no particular order right now
        mr.ptr_scoreInfo  = pt->m_scoreInfoBuf.getBufStart();
        mr.size_scoreInfo = pt->m_scoreInfoBuf.length();
        // that has offset references into posdbtable::m_pairScoreBuf
        // and m_singleScoreBuf, so we need those too now
        mr.ptr_pairScoreBuf    = pt->m_pairScoreBuf.getBufStart();
        mr.size_pairScoreBuf   = pt->m_pairScoreBuf.length();
        mr.ptr_singleScoreBuf  = pt->m_singleScoreBuf.getBufStart();
        mr.size_singleScoreBuf = pt->m_singleScoreBuf.length();
        // save some time since seo.cpp gets from posdbtable directly,
        // so we can avoid serializing/copying this stuff at least
        if ( ! m_r->m_makeReply ) {
            mr.size_scoreInfo      = 0;
            mr.size_pairScoreBuf   = 0;
            mr.size_singleScoreBuf = 0;
        }
        //mr.m_sectionStats = pt->m_sectionStats;
        // reserve space for these guys, we fill them in below
        mr.ptr_docIds      = NULL;
        mr.ptr_scores      = NULL;
        mr.ptr_clusterRecs = NULL;
        // this is how much space to reserve
        mr.size_docIds = 8 * numDocIds; // int64_t
        mr.size_scores = sizeof(double) * numDocIds; // float
        // if not doing site clustering, we won't have these perhaps...
        if ( m_gotClusterRecs )
            mr.size_clusterRecs = sizeof(key_t) *numDocIds;
        else
            mr.size_clusterRecs = 0;

#define MAX_FACETS 20000

        /////////////////
        //
        // FACETS
        //
        /////////////////

        // We can have multiple gbfacet: terms in a query so
        // serialize all the QueryTerm::m_facetHashTables into
        // Msg39Reply::ptr_facetHashList.
        //
        // combine the facet hash lists of each query term into
        // a list of lists. each list is preceded by the query term
        // id of the query term (like gbfacet:xpathsitehash12345)
        // followed by a 4 byte length of the following 32-bit
        // facet values
        int32_t need = 0;
        for ( int32_t i = 0 ; i < m_tmpq.m_numTerms; i++ ) {
            QueryTerm *qt = &m_tmpq.m_qterms[i];
            // skip if not facet
            if ( qt->m_fieldCode != FIELD_GBFACETSTR &&
                 qt->m_fieldCode != FIELD_GBFACETINT &&
                 qt->m_fieldCode != FIELD_GBFACETFLOAT )
                continue;
            HashTableX *ft = &qt->m_facetHashTable;
            if ( ft->m_numSlotsUsed == 0 ) continue;
            int32_t used = ft->m_numSlotsUsed;
            // limit for memory
            if ( used > (int32_t)MAX_FACETS ) {
                log("msg39: truncating facet list to 20000 "
                    "from %"INT32" for %s",used,qt->m_term);
                used = (int32_t)MAX_FACETS;
            }
            // store query term id 64 bit
            need += 8;
            // then size
            need += 4;
            // then buckets. keys and counts
            need += (4+sizeof(FacetEntry)) * used;
        }
        // allocate
        SafeBuf tmp;
        if ( ! tmp.reserve ( need ) ) {
            log("query: Could not allocate memory "
                "to hold reply facets");
            sendReply(m_slot,this,NULL,0,0,true);
            return;
        }
        // point to there
        char *p = tmp.getBufStart();
        for ( int32_t i = 0 ; i < m_tmpq.m_numTerms ; i++ ) {
            QueryTerm *qt = &m_tmpq.m_qterms[i];
            // skip if not facet
            if ( qt->m_fieldCode != FIELD_GBFACETSTR &&
                 qt->m_fieldCode != FIELD_GBFACETINT &&
                 qt->m_fieldCode != FIELD_GBFACETFLOAT )
                continue;
            // get all the facet hashes and their counts
            HashTableX *ft = &qt->m_facetHashTable;
            // skip if none
            if ( ft->m_numSlotsUsed == 0 ) continue;
            // store query term id 64 bit
            *(int64_t *)p = qt->m_termId;
            p += 8;
            int32_t used = ft->getNumSlotsUsed();
            if ( used > (int32_t)MAX_FACETS )
                used = (int32_t)MAX_FACETS;
            // store count
            *(int32_t *)p = used;
            p += 4;
            int32_t count = 0;
            // for sanity check
            char *pend = p + (used * (4+sizeof(FacetEntry)));
            // serialize the key/val pairs
            for ( int32_t k = 0 ; k < ft->m_numSlots ; k++ ) {
                // skip empty buckets
                if ( ! ft->m_flags[k] ) continue;
                // store key. the hash of the facet value.
                *(int32_t *)p = ft->getKey32FromSlot(k);
                p += 4;
                // then store count
                //*(int32_t *)p = ft->getVal32FromSlot(k); p += 4;
                // now this has a docid on it so we can
                // lookup the text of the facet in Msg40.cpp
                FacetEntry *fe;
                fe = (FacetEntry *)ft->getValFromSlot(k);
                // sanity
                // no, count can be zero if its a range facet
                // that was never added to. we add those
                // empty FacetEntries only for range facets
                // in Posdb.cpp
                //if(fe->m_count == 0 ) { char *xx=NULL;*xx=0;}
                gbmemcpy ( p , fe , sizeof(FacetEntry) );
                p += sizeof(FacetEntry);
                // do not breach
                if ( ++count >= (int32_t)MAX_FACETS ) break;
            }
            // sanity check
            if ( p != pend ) { char *xx=NULL;*xx=0; }
            // do the next query term
        }
        // now point to that so it can be serialized below
        mr.ptr_facetHashList  = tmp.getBufStart();
        mr.size_facetHashList = p - tmp.getBufStart();//tmp.length();

        /////////////
        //
        // END FACETS
        //
        /////////////

        // . that is pretty much it, so serialize it into buffer, "reply"
        // . mr.ptr_docIds, etc., will point into the buffer so we can
        //   re-serialize into it below from the tree
        // . returns NULL and sets g_errno on error
        // . "true" means we should make mr.ptr_* reference into the
        //   newly serialized buffer.
        reply = serializeMsg ( sizeof(Msg39Reply), // baseSize
                               &mr.size_docIds, // firstSizeParm
                               &mr.size_clusterRecs,//lastSizePrm
                               &mr.ptr_docIds , // firstStrPtr
                               &mr , // thisPtr
                               &replySize ,
                               NULL ,
                               0 ,
                               true ) ;
        if ( ! reply ) {
            log("query: Could not allocate memory "
                "to hold reply of docids to send back.");
            sendReply(m_slot,this,NULL,0,0,true);
            return;
        }
        topDocIds = (int64_t *) mr.ptr_docIds;
        topScores = (double  *) mr.ptr_scores;
        topRecs   = (key_t   *) mr.ptr_clusterRecs;
    }
    int32_t docCount = 0;
    // loop over all results in the TopTree
    for ( int32_t ti = m_tt.getHighNode() ; ti >= 0 ;
          ti = m_tt.getPrev(ti) ) {
        // get the guy
        TopNode *t = &m_tt.m_nodes[ti];
        // skip if clusterLevel is bad!
        if ( m_gotClusterRecs && t->m_clusterLevel != CR_OK )
            continue;
        // if not sending back a reply... we were called from seo.cpp
        // State3f logic to evaluate a QueryLogEntry, etc.
        if ( m_callback ) {
            // skip results past #50
            if ( docCount > 50 ) continue;
            // set this
            m_topScore50 = t->m_score;
            m_topDocId50 = t->m_docId;
            // that's it
            continue;
        }
        // get the docid ptr
        //char    *diptr = t->m_docIdPtr;
        //int64_t  docId = getDocIdFromPtr(diptr);
        // sanity check
        if ( t->m_docId < 0 ) { char *xx=NULL; *xx=0; }
        //add it to the reply
        topDocIds [docCount] = t->m_docId;
        topScores [docCount] = t->m_score;
        if ( m_tt.m_useIntScores )
            topScores[docCount] = (double)t->m_intScore;
        // supply clusterdb rec? only for full splits
        if ( m_gotClusterRecs )
            topRecs [docCount] = t->m_clusterRec;
        //topExplicits [docCount] = getNumBitsOn(t->m_explicits)
        docCount++;
        // 50th score? set this for seo.cpp. if less than 50 results
        // we want the score of the last doc then.
        if ( docCount <= 50 ) m_topScore50 = t->m_score;
        if ( m_debug ) {
            logf(LOG_DEBUG,"query: msg39: [%"PTRFMT"] "
                 "%03"INT32") docId=%012"UINT64" sum=%.02f",
                 (PTRTYPE)this, docCount,
                 t->m_docId,t->m_score);
        }
        //don't send more than the docs that are wanted
        if ( docCount >= numDocIds ) break;
    }
    if ( docCount > 300 && m_debug )
        log("query: Had %"INT32" nodes in top tree",docCount);
    // this is sensitive info
    if ( m_debug ) {
        log(LOG_DEBUG,
            "query: msg39: [%"PTRFMT"] "
            "Intersected lists took %"INT64" (%"INT64") "
            "ms "
            "docIdsToGet=%"INT32" docIdsGot=%"INT32" "
            "q=%s",
            (PTRTYPE)this ,
            m_posdbTable.m_addListsTime ,
            gettimeofdayInMilliseconds() - m_startTime ,
            m_r->m_docsToGet ,
            numDocIds ,
            m_tmpq.getQuery() );
    }
    // if we blocked because we used a thread then call callback if
    // summoned from a msg3f handler and not a msg39 handler
    if ( m_callback ) {
        // if we blocked call user callback
        if ( m_blocked ) m_callback ( m_state );
        // if not sending back a udp reply, return now
        return;
    }
    // now send back the reply
    sendReply(m_slot,this,reply,replySize,replySize,false);
    return;
}
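// Byte layout of the facet list serialized into Msg39Reply::ptr_facetHashList
// above, reconstructed from the "need" computation (repeated once per
// gbfacet: query term):
//
//   [ 8 bytes : QueryTerm::m_termId, e.g. gbfacet:xpathsitehash12345 ]
//   [ 4 bytes : used, the number of buckets that follow              ]
//   [ used x ( 4-byte facet-value hash + sizeof(FacetEntry) )        ]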
bool sendReply ( void *state , bool addUrlEnabled ) {
    // allow others to add now
    //s_inprogress = false;
    // get the state properly
    //gr *st1 = (gr *) state;
    GigablastRequest *gr = (GigablastRequest *)state;
    // in order to see what sites are being added log it, then we can
    // more easily remove sites from sitesearch.gigablast.com that are
    // being added but not being searched
    SafeBuf xb;
    if ( gr->m_urlsBuf ) {
        xb.safeTruncateEllipsis ( gr->m_urlsBuf , 200 );
        log(LOG_INFO,"http: add url %s (%s)",
            xb.getBufStart(),mstrerror(g_errno));
    }
    char format = gr->m_hr.getReplyFormat();
    TcpSocket *sock = gr->m_socket;
    if ( format == FORMAT_JSON || format == FORMAT_XML ) {
        bool status = g_httpServer.sendSuccessReply ( gr );
        // nuke state
        mdelete ( gr , sizeof(GigablastRequest) , "PageAddUrl" );
        delete (gr);
        return status;
    }
    long  ulen = 0;
    char *url  = gr->m_urlsBuf;
    if ( url ) ulen = gbstrlen (url);
    // re-null it out if just http://
    bool printUrl = true;
    if ( ulen == 0       ) printUrl = false;
    if ( ! gr->m_urlsBuf ) printUrl = false;
    if ( ulen==7 && printUrl && !strncasecmp(url,"http://",7))
        printUrl = false;
    if ( ulen==8 && printUrl && !strncasecmp(url,"https://",8))
        printUrl = false;
    // page is not more than 32k
    char buf[1024*32+MAX_URL_LEN*2];
    SafeBuf sb(buf, 1024*32+MAX_URL_LEN*2);
    //char rawbuf[1024*8];
    //SafeBuf rb(rawbuf, 1024*8);
    //rb.safePrintf("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n");
    //rb.safePrintf("<status>\n");
    //CollectionRec *cr = g_collectiondb.getRec ( gr->m_coll );
    // collection name
    char tt [ 128 ];
    tt[0] = '\0';
    g_pages.printAdminTop ( &sb , sock , &gr->m_hr );
    // display url
    //char *url = gr->m_urlsBuf;
    //if ( url && ! url[0] ) url = NULL;
    // watch out for NULLs
    if ( ! url ) url = "http://";
    // if there was an error let them know
    //char msg[MAX_URL_LEN + 1024];
    SafeBuf mbuf;
    //char *pm = "";
    if ( g_errno ) {
        mbuf.safePrintf("<center><font color=red>");
        mbuf.safePrintf("Error adding url(s): <b>%s[%i]</b>",
                        mstrerror(g_errno) , g_errno);
        mbuf.safePrintf("</font></center>");
        //pm = msg;
        //rb.safePrintf("Error adding url(s): %s[%i]",
        //              mstrerror(g_errno) , g_errno);
    }
    else if ( printUrl ) {
        mbuf.safePrintf("<center><font color=red>");
        mbuf.safePrintf("<b><u>");
        mbuf.safeTruncateEllipsis(gr->m_urlsBuf,200);
        mbuf.safePrintf("</u></b> added to spider "
                        "queue "
                        "successfully<br><br>");
        mbuf.safePrintf("</font></center>");
        //rb.safePrintf("%s added to spider "
        //              "queue successfully", url );
        //pm = msg;
        //url = "http://";
        //else
        //    pm = "Don't forget to <a href=/gigaboost.html>"
        //         "Gigaboost</a> your URL.";
    }
    if ( mbuf.length() )
        sb.safeStrcpy ( mbuf.getBufStart() );
    g_parms.printParmTable ( &sb , sock , &gr->m_hr );
    // print the final tail
    g_pages.printTail ( &sb, true ); // admin?
    // clear g_errno, if any, so our reply send goes through
    g_errno = 0;
    // nuke state
    mdelete ( gr , sizeof(GigablastRequest) , "PageAddUrl" );
    delete (gr);
    return g_httpServer.sendDynamicPage (sock,
                                         sb.getBufStart(),
                                         sb.length(),
                                         -1 ); // cachetime
}
bool qaspider1 ( ) {
    //
    // delete the 'qatest123' collection
    //
    //static bool s_x1 = false;
    if ( ! s_flags[0] ) {
        s_flags[0] = true;
        if ( ! getUrl ( "/admin/delcoll?xml=1&delcoll=qatest123" ) )
            return false;
    }
    //
    // add the 'qatest123' collection
    //
    //static bool s_x2 = false;
    if ( ! s_flags[1] ) {
        s_flags[1] = true;
        if ( ! getUrl ( "/admin/addcoll?addcoll=qatest123&xml=1" ,
                        // checksum of reply expected
                        238170006 ) )
            return false;
    }
    // restrict hopcount to 0 or 1 in url filters so we do not spider
    // too deep
    //static bool s_z1 = false;
    if ( ! s_flags[2] ) {
        s_flags[2] = true;
        SafeBuf sb;
        sb.safePrintf("&c=qatest123&"
                      // make it the custom filter
                      "ufp=0&"
                      "fe=%%21ismanualadd+%%26%%26+%%21insitelist&hspl=0&hspl=1&fsf=0.000000&mspr=0&mspi=1&xg=1000&fsp=-3&"
                      // take out hopcount for now, just test quotas
                      //"fe1=tag%%3Ashallow+%%26%%26+hopcount%%3C%%3D1&hspl1=0&hspl1=1&fsf1=1.000000&mspr1=1&mspi1=1&xg1=1000&fsp1=3&"
                      // just one spider out allowed for consistency
                      "fe1=tag%%3Ashallow+%%26%%26+sitepages%%3C%%3D20&hspl1=0&hspl1=1&fsf1=1.000000&mspr1=1&mspi1=1&xg1=1000&fsp1=45&"
                      "fe2=default&hspl2=0&hspl2=1&fsf2=1.000000&mspr2=0&mspi2=1&xg2=1000&fsp2=45&"
                      );
        if ( ! getUrl ( "/admin/filters",0,sb.getBufStart()) )
            return false;
    }
    // set the site list to a few sites
    //static bool s_z2 = false;
    if ( ! s_flags[3] ) {
        s_flags[3] = true;
        SafeBuf sb;
        sb.safePrintf("&c=qatest123&format=xml&sitelist=");
        sb.urlEncode("tag:shallow site:www.walmart.com\r\n"
                     "tag:shallow site:http://www.ibm.com/\r\n");
        sb.nullTerm();
        if ( ! getUrl ("/admin/settings",0,sb.getBufStart() ) )
            return false;
    }
    //
    // use the add url interface now
    // walmart.com above was not seeded because of the site: directive
    // so this will seed it.
    //
    //static bool s_y2 = false;
    if ( ! s_flags[4] ) {
        s_flags[4] = true;
        SafeBuf sb;
        // delim=+++URL:
        sb.safePrintf("&c=qatest123"
                      "&format=json"
                      "&strip=1"
                      "&spiderlinks=1"
                      "&urls=www.walmart.com+ibm.com"
                      );
        // . now a list of websites we want to spider
        // . the space is already encoded as +
        //sb.urlEncode(s_urls1);
        if ( ! getUrl ( "/admin/addurl",0,sb.getBufStart()) )
            return false;
    }
    //
    // wait for spidering to stop
    //
 checkagain:
    // wait until spider finishes. check the spider status page
    // in json to see when completed
    //static bool s_k1 = false;
    if ( ! s_flags[5] ) {
        // wait a few seconds, call sleep timer... then call qatest()
        //usleep(5000000); // 5 seconds
        wait(3.0);
        s_flags[5] = true;
        return false;
    }
    if ( ! s_flags[15] ) {
        s_flags[15] = true;
        if ( ! getUrl ( "/admin/status?format=json&c=qatest123",0) )
            return false;
    }
    //static bool s_k2 = false;
    if ( ! s_flags[6] ) {
        // ensure spiders are done.
        // "Nothing currently available to spider"
        if ( s_content&&!strstr(s_content,"Nothing currently avail")){
            s_flags[5]  = false;
            s_flags[15] = false;
            goto checkagain;
        }
        s_flags[6] = true;
    }
    // wait for index msg4 to not be cached to ensure all results indexed
    if ( ! s_flags[22] ) {
        s_flags[22] = true;
        wait(1.5);
    }
    // verify no results for gbhopcount:2 query
    //static bool s_y4 = false;
    if ( ! s_flags[7] ) {
        s_flags[7] = true;
        if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&"
                        "q=gbhopcount%3A2",
                        -1672870556 ) )
            return false;
    }
    // but some for gbhopcount:0 query
    //static bool s_t0 = false;
    if ( ! s_flags[8] ) {
        s_flags[8] = true;
        if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&"
                        "q=gbhopcount%3A0",
                        908338607 ) )
            return false;
    }
    // check facet sections query for walmart
    //static bool s_y5 = false;
    if ( ! s_flags[9] ) {
        s_flags[9] = true;
        if ( ! getUrl ( "/search?c=qatest123&format=json&stream=1&"
                        "q=gbfacetstr%3Agbxpathsitehash2492664135",
                        55157060 ) )
            return false;
    }
    //static bool s_y6 = false;
    if ( ! s_flags[10] ) {
        s_flags[10] = true;
        if ( ! getUrl ( "/get?page=4&q=gbfacetstr:gbxpathsitehash2492664135&qlang=xx&c=qatest123&d=9861563119&cnsp=0" , 999 ) )
            return false;
    }
    // in xml
    //static bool s_y7 = false;
    if ( ! s_flags[11] ) {
        s_flags[11] = true;
        if ( ! getUrl ( "/get?xml=1&page=4&q=gbfacetstr:gbxpathsitehash2492664135&qlang=xx&c=qatest123&d=9861563119&cnsp=0" , 999 ) )
            return false;
    }
    // and json
    //static bool s_y8 = false;
    if ( ! s_flags[12] ) {
        s_flags[12] = true;
        if ( ! getUrl ( "/get?json=1&page=4&q=gbfacetstr:gbxpathsitehash2492664135&qlang=xx&c=qatest123&d=9861563119&cnsp=0" , 999 ) )
            return false;
    }
    // delete the collection
    //static bool s_fee = false;
    //if ( ! s_flags[13] ) {
    //    s_flags[13] = true;
    //    if ( ! getUrl ( "/admin/delcoll?delcoll=qatest123" ) )
    //        return false;
    //}
    if ( ! s_flags[17] ) {
        s_flags[17] = true;
        if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&"
                        "q=site2%3Awww.walmart.com+"
                        "gbsortby%3Agbspiderdate",
                        999 ) )
            return false;
    }
    // xpath is like a title here i think. check the returned
    // facet table in the left column
    if ( ! s_flags[18] ) {
        s_flags[18] = true;
        if ( ! getUrl ( "/search?c=qatest123&qa=1&format=html&"
                        "q=gbfacetstr%3Agbxpathsitehash3624590799" ,
                        999 ) )
            return false;
    }
    if ( ! s_flags[19] ) {
        s_flags[19] = true;
        if ( ! getUrl ( "/search?c=qatest123&qa=1&xml=1&"
                        "q=gbfacetint%3Agbhopcount" ,
                        999 ) )
            return false;
    }
    if ( ! s_flags[20] ) {
        s_flags[20] = true;
        if ( ! getUrl ( "/search?c=qatest123&qa=1&json=1&"
                        "q=gbfacetint%3Alog.score" ,
                        999 ) )
            return false;
    }
    if ( ! s_flags[21] ) {
        s_flags[21] = true;
        if ( ! getUrl ( "/search?c=qatest123&qa=1&xml=1&"
                        "q=gbfacetfloat%3Atalks.rating" ,
                        999 ) )
            return false;
    }
    if ( ! s_flags[23] ) {
        s_flags[23] = true;
        // test facets mixed with gigabits in left hand column
        if ( ! getUrl ( "/search?c=qatest123&qa=1&html=1&"
                        "q=gbfacetint%3Agbhopcount+walmart" ,
                        999 ) )
            return false;
    }
    //static bool s_fee2 = false;
    if ( ! s_flags[14] ) {
        s_flags[14] = true;
        log("qa: SUCCESSFULLY COMPLETED "
            "QA SPIDER1 TEST");
        return true;
    }
    return true;
}
bool sendPageAddDelColl ( TcpSocket *s , HttpRequest *r , bool add ) {
    // get collection name
    //int32_t  nclen;
    //char *nc  = r->getString ( "nc"  , &nclen );
    //int32_t  cpclen;
    //char *cpc = r->getString ( "cpc" , &cpclen );
    g_errno = 0;
    //bool cast = r->getLong("cast",0);
    const char *msg = NULL;
    // if any host in network is dead, do not do this
    //if ( g_hostdb.hasDeadHost() ) msg = "A host in the network is dead.";
    char format = r->getReplyFormat();
    if ( format == FORMAT_XML || format == FORMAT_JSON ) {
        // no addcoll given?
        int32_t page = g_pages.getDynamicPageNumber ( r );
        const char *addcoll = r->getString("addcoll",NULL);
        const char *delcoll = r->getString("delcoll",NULL);
        if ( ! addcoll ) addcoll = r->getString("addColl",NULL);
        if ( ! delcoll ) delcoll = r->getString("delColl",NULL);
        if ( page == PAGE_ADDCOLL && ! addcoll ) {
            g_errno = EBADENGINEER;
            const char *msg = "no addcoll parm provided";
            return g_httpServer.sendErrorReply(s,g_errno,msg,NULL);
        }
        if ( page == PAGE_DELCOLL && ! delcoll ) {
            g_errno = EBADENGINEER;
            const char *msg = "no delcoll parm provided";
            return g_httpServer.sendErrorReply(s,g_errno,msg,NULL);
        }
        return g_httpServer.sendSuccessReply(s,format);
    }
    // error?
    const char *action  = r->getString("action",NULL);
    const char *addColl = r->getString("addcoll",NULL);
    char buf [ 64*1024 ];
    SafeBuf p(buf, 64*1024);
    //
    // CLOUD SEARCH ENGINE SUPPORT - GIGABOT ERRORS
    //
    SafeBuf gtmp;
    char *gmsg = NULL;
    // is it too big?
    if ( action && addColl && strlen(addColl) > MAX_COLL_LEN ) {
        gtmp.safePrintf("search engine name is too long");
        gmsg = gtmp.getBufStart();
    }
    // from Collectiondb.cpp::addNewColl() ensure coll name is legit
    const char *x = addColl;
    for ( ; x && *x ; x++ ) {
        if ( is_alnum_a(*x) ) continue;
        if ( *x == '-' ) continue;
        if ( *x == '_' ) continue; // underscore now allowed
        break;
    }
    if ( x && *x ) {
        g_errno = EBADENGINEER;
        gtmp.safePrintf("<font color=red>Error. \"%s\" is a "
                        "malformed name because it "
                        "contains the '%c' character.</font><br><br>",
                        addColl,*x);
        gmsg = gtmp.getBufStart();
    }
    //
    // END GIGABOT ERRORS
    //
    //
    // CLOUD SEARCH ENGINE SUPPORT
    //
    // if added the coll successfully, do not print same page, jump to
    // printing the basic settings page so they can add sites to it.
    // crap, this GET request, "r", is missing the "c" parm sometimes.
    // we need to use the "addcoll" parm anyway. maybe print a meta
    // redirect then?
    char guide = r->getLong("guide",0);
    // do not redirect if gmsg is set, there was a problem with the name
    if ( action && ! msg && format == FORMAT_HTML && guide && ! gmsg ) {
        //return g_parms.sendPageGeneric ( s, r, PAGE_BASIC_SETTINGS );
        // just redirect to it
        if ( addColl )
            p.safePrintf("<meta http-equiv=Refresh "
                         "content=\"0; URL=/admin/settings"
                         "?guide=1&c=%s\">",
                         addColl);
        return g_httpServer.sendDynamicPage (s,
                                             p.getBufStart(),
                                             p.length());
    }
    // print standard header
    g_pages.printAdminTop ( &p , s , r , NULL,
                            "onload=document."
                            "getElementById('acbox').focus();");
    if ( g_errno ) {
        msg = mstrerror( g_errno );
    }
    if ( msg && ! guide ) {
        const char *cc = "deleting";
        if ( add ) cc = "adding";
        p.safePrintf (
            "<center>\n"
            "<font color=red>"
            "<b>Error %s collection: %s. "
            "See log file for details.</b>"
            "</font>"
            "</center><br>\n",cc,msg);
    }
    //
    // CLOUD SEARCH ENGINE SUPPORT
    //
    if ( add && guide )
        printGigabotAdvice ( &p , PAGE_ADDCOLL , r , gmsg );
    // print the add collection box
    if ( add /*&& (! nc[0] || g_errno ) */ ) {
        const char *t1 = "Add Collection";
        if ( guide ) t1 = "Add Search Engine";
        p.safePrintf (
            "<center>\n<table %s>\n"
            "<tr class=hdrow><td colspan=2>"
            "<center><b>%s</b></center>"
            "</td></tr>\n"
            ,TABLE_STYLE
            ,t1 );
        const char *t2 = "collection";
        if ( guide ) t2 = "search engine";
        const char *str = addColl;
        if ( ! addColl ) str = "";
        p.safePrintf (
            "<tr bgcolor=#%s>"
            "<td><b>name of new %s to add</td>\n"
            "<td><input type=text name=addcoll size=30 "
            "id=acbox "
            "value=\"%s\">"
            "</td></tr>\n"
            , LIGHT_BLUE
            , t2
            , str );
        // don't show the clone box if we are under gigabot the guide
        if ( ! guide )
            p.safePrintf(
                "<tr bgcolor=#%s>"
                "<td><b>clone settings from this "
                "collection</b>"
                "<br><font size=1>Copy settings from "
                "this pre-existing collection. Leave "
                "blank to "
                "accept default values.</font></td>\n"
                "<td><input type=text name=clonecoll "
                "size=30>"
                "</td>"
                "</tr>"
                , LIGHT_BLUE );
        // collection pwds
        p.safePrintf(
            "<tr bgcolor=#%s>"
            "<td><b>collection passwords"
            "</b>"
            "<br><font size=1>List of white space separated "
            "passwords allowed to administer collection."
            "</font>"
            "</td>\n"
            "<td><input type=text name=collpwd "
            "size=60>"
            "</td>"
            "</tr>"
            , LIGHT_BLUE );
        // ips box for security
        p.safePrintf(
            "<tr bgcolor=#%s>"
            "<td><b>collection ips"
            "</b>"
            "<br><font size=1>List of white space separated "
            "IPs allowed to administer collection."
            "</font>"
            "</td>\n"
            "<td><input type=text name=collips "
            "size=60>"
            "</td>"
            "</tr>"
            , LIGHT_BLUE );
        // now list collections from which to copy the config
        //p.safePrintf (
        //    "<tr><td><b>copy configuration from this "
        //    "collection</b><br><font size=1>Leave blank to "
        //    "accept default values.</font></td>\n"
        //    "<td><input type=text name=cpc value=\"%s\" size=30>"
        //    "</td></tr>\n",coll);
        p.safePrintf ( "</table></center><br>\n");
        // wrap up the form started by printAdminTop
        g_pages.printAdminBottom ( &p );
        int32_t bufLen = p.length();
        return g_httpServer.sendDynamicPage (s,p.getBufStart(),bufLen);
    }
    // if we added a collection, print its page
    //if ( add && nc[0] && ! g_errno )
    //    return g_parms.sendPageGeneric2 ( s , r , PAGE_SEARCH ,
    //                                      nc , pwd );
    if ( g_collectiondb.m_numRecsUsed <= 0 ) goto skip;
    // print all collections out in a checklist so you can check the
    // ones you want to delete, the values will be the id of that collectn
    p.safePrintf (
        "<center>\n<table %s>\n"
        "<tr class=hdrow><td><center><b>Delete Collections"
        "</b></center></td></tr>\n"
        "<tr bgcolor=#%s><td>"
        "<center><b>Select the collections you wish to delete. "
        //"<font color=red>This feature is currently under "
        //"development.</font>"
        "</b></center></td></tr>\n"
        "<tr bgcolor=#%s><td>"
        // table within a table
        "<center><table width=20%%>\n",
        TABLE_STYLE,
        LIGHT_BLUE,
        DARK_BLUE );
    for ( int32_t i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) {
        CollectionRec *cr = g_collectiondb.m_recs[i];
        if ( ! cr ) continue;
        p.safePrintf (
            "<tr bgcolor=#%s><td>"
            "<input type=checkbox name=delcoll value=\"%s\"> "
            "%s</td></tr>\n",
            DARK_BLUE,
            cr->m_coll,cr->m_coll);
    }
    p.safePrintf( "</table></center></td></tr></table><br>\n" );
 skip:
    // wrap up the form started by printAdminTop
    g_pages.printAdminBottom ( &p );
    int32_t bufLen = p.length();
    return g_httpServer.sendDynamicPage (s,p.getBufStart(),bufLen);
}
bool processLoop ( void *state ) {
    // cast it
    State8 *st = (State8 *)state;
    // get the xmldoc
    XmlDoc *xd = &st->m_xd;
    // error?
    if ( g_errno ) return sendErrorReply ( st , g_errno );
    // shortcut
    SafeBuf *xbuf = &st->m_xbuf;
    if ( st->m_u && st->m_u[0] ) {
        // . save the ips.txt file if we are the test coll
        // . saveTestBuf() is a function in Msge1.cpp
        CollectionRec *cr = xd->getCollRec();
        if ( xd && cr && cr->m_coll && !strcmp(cr->m_coll,"qatest123"))
            // use same dir that XmlDoc::getTestDir() would use
            //saveTestBuf ( "test-page-parser" );
            saveTestBuf("qa");
        // now get the meta list, in the process it will print out a
        // bunch of junk into st->m_xbuf
        char *metalist = xd->getMetaList ( );
        if ( ! metalist ) return sendErrorReply ( st , g_errno );
        // return false if it blocked
        if ( metalist == (void *)-1 ) return false;
        // for debug...
        if ( ! xd->m_indexCode ) xd->doConsistencyTest ( false );
        // print it out
        xd->printDoc( xbuf );
    }
    // print reason we can't analyze it (or index it)
    //if ( st->m_indexCode != 0 ) {
    //    xbuf->safePrintf ("<br><br><b>indexCode: %s</b>\n<br>",
    //                      mstrerror(st->m_indexCode));
    //}
    // we are done
    g_inPageParser = false;
    // print the final tail
    //p += g_httpServer.printTail ( p , pend - p );
    //log("parser: send sock=%li",st->m_s->m_sd);
    // now encapsulate it in html head/tail and send it off
    bool status = g_httpServer.sendDynamicPage( st->m_s ,
                                                xbuf->getBufStart(),
                                                xbuf->length() ,
                                                -1, //cachetime
                                                false ,//postreply?
                                                NULL, //ctype
                                                -1 , //httpstatus
                                                NULL,//cookie
                                                "utf-8");
    // delete the state now
    if ( st->m_freeIt ) {
        mdelete ( st , sizeof(State8) , "PageParser" );
        delete (st);
    }
    // return the status
    return status;
}
bool Msg3a::gotAllSplitReplies ( ) {
    // if any of the split requests had an error, give up and set m_errno
    // but don't set it for non critical errors like query truncation
    if ( m_errno ) {
        g_errno = m_errno;
        return true;
    }
    // also reset the finalbuf and the oldNumTopDocIds
    if ( m_finalBuf ) {
        mfree ( m_finalBuf, m_finalBufSize, "Msg3aF" );
        m_finalBuf     = NULL;
        m_finalBufSize = 0;
    }
    // update our estimated total hits
    m_numTotalEstimatedHits = 0;
    for ( long i = 0; i < m_numHosts ; i++ ) {
        // get that host that gave us the reply
        //Host *h = g_hostdb.getHost(i);
        // . get the reply from multicast
        // . multicast should have destroyed all slots, but saved reply
        // . we are responsible for freeing the reply
        // . we need to call this even if g_errno or m_errno is
        //   set so we can free the replies in Msg3a::reset()
        // . if we don't call getBestReply() on it multicast should
        //   free it, because Multicast::m_ownReadBuf is still true
        Multicast *m = &m_mcast[i];
        bool freeit = false;
        long  replySize = 0;
        long  replyMaxSize;
        char *rbuf;
        Msg39Reply *mr;
        // . only get it if the reply not already full
        // . if reply already processed, skip
        // . perhaps it had no more docids to give us or all termlists
        //   were exhausted on its disk and this is a re-call
        // . we have to re-process it for count m_numTotalEstHits, etc.
        rbuf = m->getBestReply ( &replySize ,
                                 &replyMaxSize ,
                                 &freeit ,
                                 true ); //stealIt?
        // cast it
        mr = (Msg39Reply *)rbuf;
        // in case of mem leak, re-label from "mcast" to this so we
        // can determine where it came from, "Msg3a-GBR"
        relabel( rbuf, replyMaxSize , "Msg3a-GBR" );
        // . we must be able to free it... we must own it
        // . this is true if we should free it, but we should not have
        //   to free it since it is owned by the slot?
        if ( freeit ) {
            log(LOG_LOGIC,"query: msg3a: Steal failed.");
            char *xx = NULL; *xx=0;
        }
        // bad reply?
        if ( ! mr ) {
            log(LOG_LOGIC,"query: msg3a: Bad NULL reply.");
            m_reply       [i] = NULL;
            m_replyMaxSize[i] = 0;
            // it might have been timed out, just ignore it!!
            continue;
            // if size is 0 it can be Msg39 giving us an error!
            g_errno = EBADREPLYSIZE;
            m_errno = EBADREPLYSIZE;
            // all reply buffers should be freed on reset()
            return true;
        }
        // how did this happen?
        if ( replySize < 29 && ! mr->m_errno ) {
            // if size is 0 it can be Msg39 giving us an error!
            g_errno = EBADREPLYSIZE;
            m_errno = EBADREPLYSIZE;
            log(LOG_LOGIC,"query: msg3a: Bad reply size of %li.",
                replySize);
            // all reply buffers should be freed on reset()
            return true;
        }
        // can this be non-null? we shouldn't be overwriting one
        // without freeing it...
        if ( m_reply[i] )
            // note the mem leak now
            log("query: mem leaking a 0x39 reply");
        // cast it and set it
        m_reply       [i] = mr;
        m_replyMaxSize[i] = replyMaxSize;
        // deserialize it (just sets the ptr_ and size_ member vars)
        //mr->deserialize ( );
        deserializeMsg ( sizeof(Msg39Reply) ,
                         &mr->size_docIds,
                         &mr->size_clusterRecs,
                         &mr->ptr_docIds,
                         mr->m_buf );
        // sanity check
        if ( mr->m_nqt != m_q->getNumTerms() ) {
            g_errno = EBADREPLY;
            m_errno = EBADREPLY;
            log("query: msg3a: Split reply qterms=%li != %li.",
                (long)mr->m_nqt,(long)m_q->getNumTerms() );
            return true;
        }
        // return if split had an error, but not for a non-critical
        // error like query truncation
        if ( mr->m_errno && mr->m_errno != EQUERYTRUNCATED ) {
            g_errno = mr->m_errno;
            m_errno = mr->m_errno;
            log("query: msg3a: Split had error: %s",
                mstrerror(g_errno));
            return true;
        }
        // skip down here if reply was already set
        //skip:
        // . add up the total hits from each split, this is how many
        //   total results the latest split is estimated to be able to
        //   return
        // . THIS should now be exact since we read all termlists
        //   of posdb...
        m_numTotalEstimatedHits += mr->m_estimatedHits;
        // debug log stuff
        if ( ! m_debug ) continue;
        // cast these for printing out
        long long *docIds = (long long *)mr->ptr_docIds;
        score_t   *scores = (score_t   *)mr->ptr_scores;
        // print out every docid in this split reply
        for ( long j = 0; j < mr->m_numDocIds ; j++ ) {
            // print out score_t
            logf( LOG_DEBUG,
                  "query: msg3a: [%lu] %03li) "
                  "split=%li docId=%012llu domHash=0x%02lx "
                  "score=%lu" ,
                  (unsigned long)this ,
                  j ,
                  i ,
                  docIds [j] ,
                  (long)g_titledb.getDomHash8FromDocId(docIds[j]),
                  (long)scores[j] );
        }
    }
    // this seems to always return true!
    mergeLists ( );
    if ( ! m_r->m_useSeoResultsCache ) return true;
    // now cache the reply
    SafeBuf cr;
    long dataSize = 4 + 4 + 4 + m_numDocIds * (8+4+4);
    long need = sizeof(key_t) + 4 + dataSize;
    bool status = cr.reserve ( need );
    // sanity
    if ( ( m_ckey.n0 & 0x01 ) == 0x00 ) { char *xx=NULL; *xx=0; }
    // ignore errors
    g_errno = 0;
    // return on error with g_errno cleared if cache add failed
    if ( ! status ) return true;
    // add to buf otherwise
    cr.safeMemcpy ( &m_ckey , sizeof(key_t) );
    cr.safeMemcpy ( &dataSize , 4 );
    long now = getTimeGlobal();
    cr.pushLong ( now );
    cr.pushLong ( m_numDocIds );
    cr.pushLong ( m_numTotalEstimatedHits );//Results );
    long max = m_numDocIds;
    // then the docids
    for ( long i = 0 ; i < max ; i++ )
        cr.pushLongLong(m_docIds[i] );
    for ( long i = 0 ; i < max ; i++ )
        cr.pushFloat(m_scores[i]);
    for ( long i = 0 ; i < max ; i++ )
        cr.pushLong(getSiteHash26(i));
    // sanity
    if ( cr.length() != need ) { char *xx=NULL; *xx=0; }
    // make these
    key_t startKey;
    key_t endKey;
    startKey = m_ckey;
    // clear delbit
    startKey.n0 &= 0xfffffffffffffffeLL;
    // end key is us
    endKey = m_ckey;
    // that is the single record
    m_seoCacheList.set ( cr.getBufStart() ,
                         cr.length(),
                         cr.getBufStart(), // alloc
                         cr.getCapacity(), // alloc size
                         (char *)&startKey,
                         (char *)&endKey,
                         -1, // fixeddatasize
                         true, // owndata?
                         false,// use half keys?
                         sizeof(key_t) );
    // do not allow cr to free it, msg1 will
    cr.detachBuf();
    // note it
    //log("seopipe: storing ckey=%s q=%s"
    //    ,KEYSTR(&m_ckey,12)
    //    ,m_r->ptr_query
    //    );
    //log("msg1: sending niceness=%li",(long)m_r->m_niceness);
    // this will often block, but who cares!? it just sends a request off
    if ( ! m_msg1.addList ( &m_seoCacheList ,
                            RDB_SERPDB,//RDB_CACHEDB,
                            m_r->ptr_coll,
                            this, // state
                            gotSerpdbReplyWrapper, // callback
                            false, // forcelocal?
                            m_r->m_niceness ) ) {
        //log("blocked");
        return false;
    }
    // we can safely delete m_msg17... just return true
    return true;
}
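// Layout of the serpdb cache record assembled above, following the pushes
// into "cr" (so dataSize = 4 + 4 + 4 + m_numDocIds * (8+4+4)):
//
//   [ sizeof(key_t) : m_ckey                            ]
//   [ 4 bytes       : dataSize                          ]
//   [ 4 bytes       : now, from getTimeGlobal()         ]
//   [ 4 bytes       : m_numDocIds                       ]
//   [ 4 bytes       : m_numTotalEstimatedHits           ]
//   [ 8 x n         : docids                            ]
//   [ 4 x n         : scores, stored as floats          ]
//   [ 4 x n         : site hashes, from getSiteHash26() ]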
// . make a web page from results stored in msg40
// . send it on TcpSocket "s" when done
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool gotIndexList ( void *state ) {
    // the state
    State10 *st = (State10 *) state;
    // launch more
    if ( ! launchRequests ( st ) ) return false;

    /*
    // get the date list
    //fprintf(stderr,"termId now=%lli\n",st->m_termId);
    //fprintf(stderr,"should be=%lli\n",(st->m_termId & TERMID_MASK));
    // . now get the indexList for this termId
    // . date is complemented, so start with bigger one first
    key128_t startKey = g_datedb.makeStartKey ( st->m_termId ,0xffffffff);
    key128_t endKey   = g_datedb.makeEndKey   ( st->m_termId ,0x0);
    // get the rdb ptr to titledb's rdb
    //Rdb *rdb = g_indexdb.getRdb();
    // -1 means read from all files in Indexdb
    long numFiles = -1;
    // make it zero if caller doesn't want to hit the disk
    if ( ! st->m_useDisk ) numFiles = 0;
    // get the title rec at or after this docId
    if ( ! st->m_msg0.getList ( -1 ,
                                0 ,
                                0 ,
                                0 , // max cache age
                                false , // add to cache?
                                RDB_DATEDB , // rdbId of 2 = indexdb
                                st->m_coll ,
                                &st->m_list2 ,
                                (char *)&startKey ,
                                (char *)&endKey ,
                                st->m_numRecs * sizeof(key128_t),//recSizes
                                //st->m_useTree , // include tree?
                                //st->m_useCache , // include cache?
                                //false , // add to cache?
                                //0 , // startFileNum
                                //numFiles , // numFiles
                                st , // state
                                gotIndexListWrapper2 ,
                                0 ) ) // niceness
        return false;
    // otherwise call gotResults which returns false if blocked, true else
    // and sets g_errno on error
    return gotIndexList2 ( (void *) st , NULL );
}

void gotIndexListWrapper2 ( void *state , RdbList *list ) {
    gotIndexList2 ( state , list );
}

void addedKeyWrapper ( void *state ) {
    gotIndexList2 ( state, NULL );
}

// . make a web page from results stored in msg40
// . send it on TcpSocket "s" when done
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool gotIndexList2 ( void *state , RdbList *list ) {
    // the state
    State10 *st = (State10 *) state;
    */

    // get the socket
    TcpSocket *s = st->m_socket;
    // don't allow pages bigger than 128k in cache
    //char  buf [ 64*1024 ];
    // a ptr into "buf"
    //char *p    = buf;
    //char *pend = buf + 64*1024;

    /*
    // get termId
    key_t k = *(key_t *)st->m_list.getStartKey();
    long long termId = g_indexdb.getTermId ( k );
    // get groupId from termId
    //unsigned long groupId = k.n1 & g_hostdb.m_groupMask;
    unsigned long groupId = g_indexdb.getGroupIdFromKey ( &k );
    long hostnum = g_hostdb.makeHostId ( groupId );
    */

    // check box " checked" strings
    char *ubs = "";
    char *uts = "";
    char *uds = "";
    char *ucs = "";
    char *add = "";
    char *del = "";
    if ( st->m_useDatedb) ubs = " checked";
    if ( st->m_useTree  ) uts = " checked";
    if ( st->m_useDisk  ) uds = " checked";
    if ( st->m_useCache ) ucs = " checked";
    if ( st->m_add      ) add = " checked";
    if ( st->m_del      ) del = " checked";
    SafeBuf *pbuf = &st->m_pbuf;
    g_pages.printAdminTop ( pbuf , st->m_socket , &st->m_r );
    // get base, returns NULL and sets g_errno to ENOCOLLREC on error
    RdbBase *base;
    if (!(base=getRdbBase((uint8_t)RDB_INDEXDB,st->m_coll))) return true;
    // print the standard header for admin pages
    pbuf->safePrintf (
        "<center>\n"
        "<table cellpadding=2><tr><td colspan=4>"
        "useDatedb:<input type=checkbox value=1 name=ub%s> "
        "useTree:<input type=checkbox value=1 name=ut%s> "
        "useDisk:<input type=checkbox value=1 name=ud%s> "
        "useCache:<input type=checkbox value=1 name=uc%s> "
        "ADD:<input type=checkbox value=1 name=add%s> "
        "DELETE:<input type=checkbox value=1 name=del%s>"
        "</td></tr><tr><td>"
        "query:"
        "</td><td>"
        "<input type=text name=q value=\"%s\" size=20>"
        "</td><td>"
        "collection:"
        "</td><td>"
        "<input type=text name=c value=\"%s\" size=10>"
        "</td></tr><tr><td>"
        "termId:"
        "</td><td>"
        "<input type=text name=t value=%lli size=20>"
        "</td><td>"
        "numRecs:"
        "</td><td>"
        "<input type=text name=numRecs value=%li size=10> "
        "</td></tr><tr><td>"
        "docId:"
        "</td><td>"
        "<input type=text name=d value=%lli size=20> "
        "</td><td>"
        "score:"
        "</td><td>"
        "<input type=text name=score value=%li size=10> "
        "</td><td>"
        "<input type=submit value=ok border=0>"
        "</td></tr>"
        "<tr><td colspan=2>"
        "term appears in about %lli docs +/- %li"
        "</td></tr>"
        //"<tr><td colspan=2>"
        //"this indexlist held by host #%li and twins"
        //"</td></tr>"
        "</table>"
        "</form><br><br>" ,
        ubs, uts, uds, ucs, add, del,
        st->m_query ,
        st->m_coll ,
        st->m_termId ,
        st->m_numRecs ,
        st->m_docId ,
        (long)st->m_score ,
        st->m_termFreq ,
        2 * (long)GB_INDEXDB_PAGE_SIZE / 6 * base->getNumFiles() );
        //hostnum );
    if ( g_errno || (st->m_list.isEmpty() ) ) {//&&st->m_list2.isEmpty())){
        if (g_errno)pbuf->safePrintf("Error = %s",mstrerror(g_errno));
        else        pbuf->safePrintf("List is empty");
        pbuf->safePrintf("</center>");
        // erase g_errno for sending
        g_errno = 0;
        // now encapsulate it in html head/tail and send it off
        bool status = g_httpServer.sendDynamicPage(s ,
                                                   pbuf->getBufStart(),
                                                   pbuf->length() );
        // delete it
        mdelete ( st , sizeof(State10) , "PageIndexdb" );
        delete (st);
        return status;
    }
    pbuf->safePrintf (
        "<table cellpadding=1 border=1>"
        "<tr><td>#</td><td>score</td>"
        "<td>docId</td><td>domHash</td></tr>");
    //if ( searchingEvents
    // now print the score/docId of indexlist
    long i = 0;
    for ( st->m_list.resetListPtr () ;
          ! st->m_list.isExhausted () ;
          st->m_list.skipCurrentRecord () ) {
        // break if buf is low
        //if ( p + 1024 >= pend ) break;
        // but set the ip/port to a host that has this titleRec
        // stored locally!
        long long docId = st->m_list.getCurrentDocId () ;
        unsigned long groupId = getGroupIdFromDocId ( docId );
        // get the first host's hostId in this groupId
        Host *h = g_hostdb.getFastestHostInGroup ( groupId );
        // . pick the first host to handle the cached titleRec request
        // . we assume it has the best time and is up!! TODO: fix!
        // . use local ip though if it was an internal request
        // . otherwise, use the external ip
        //unsigned long ip = h->m_externalIp;
        unsigned long ip = h->m_ip;
        // use the NAT mapped port
        unsigned short port = h->m_externalHttpPort;
        // log the first docid so we can blaster url: queries
        // to PageIndexdb and see if they are in indexdb
        if ( i == 0 )
            logf(LOG_INFO,"indexdb: %llu %s",docId,st->m_query);
        // adjust ip/port if local
        if ( st->m_isLocal ) {
            ip   = h->m_ip;
            port = h->m_httpPort;
        }
        unsigned long date = 0;
        if ( st->m_useDatedb )
            date = (unsigned long)st->m_list.getCurrentDate();
        uint8_t dh = g_titledb.getDomHash8FromDocId ( docId );
        char ds[32];
        ds[0]=0;
        if ( st->m_useDatedb ) sprintf (ds,"%lu/",date);
        pbuf->safePrintf (
            "<tr><td>%li.</td>"
            "<td>%s%i</td>"
            "<td>"
            //"<a href=http://%s:%hu/master/titledb?d=%llu>"
            "<a href=/master/titledb?c=%s&d=%llu>"
            "%llu"
            //"<td><a href=/cgi/4.cgi?d=%llu>%llu"
            "</td>"
            "<td>"
            "0x%02lx"
            "</td>"
            "</tr>\n" ,
            i++,
            ds, (int)st->m_list.getCurrentScore() ,
            //iptoa(ip) , port ,
            st->m_coll, docId ,
            docId ,
            (long)dh );
    }
    pbuf->safePrintf ( "</table>" );

    /*
    if ( ! st->m_list2.isEmpty() )
        p += sprintf ( p ,
                       "<br>"
                       "<br>"
                       "<table cellpadding=1 border=1>"
                       "<tr><td>#</td><td>termId</td>"
                       "<td>date</td><td>score</td>"
                       "<td>docId</td></tr>");
    // now print the score/docId of datedb list
    i = 0;
    for ( st->m_list2.resetListPtr () ;
          ! st->m_list2.isExhausted () ;
          st->m_list2.skipCurrentRecord () ) {
        // break if buf is low
        if ( p + 1024 >= pend ) break;
        // but set the ip/port to a host that has this titleRec
        // stored locally!
        long long docId = st->m_list2.getCurrentDocId () ;
        unsigned long groupId = g_titledb.getGroupId ( docId );
        // get the first host's hostId in this groupId
        Host *h = g_hostdb.getFastestHostInGroup ( groupId );
        // . pick the first host to handle the cached titleRec request
        // . we assume it has the best time and is up!! TODO: fix!
        // . use local ip though if it was an internal request
        // . otherwise, use the external ip
        //unsigned long ip = h->m_externalIp;
        unsigned long ip = h->m_ip;
        // use the NAT mapped port
        unsigned short port = h->m_externalHttpPort;
        // adjust ip/port if local
        if ( st->m_isLocal ) {
            ip   = h->m_ip;
            port = h->m_httpPort;
        }
        // debug
        char kb[16];
        st->m_list2.getCurrentKey(kb);
        //log(LOG_INFO,"debug: n1=%016llx n0=%016llx",
        //    *(long long *)(kb+8),*(long long *)(kb+0));
        //if ( (unsigned long)st->m_list2.getCurrentDate() == 0 )
        //    log("STOP");
        sprintf ( p ,
                  "<tr><td>%li.</td>"
                  "<td>%llu</td>"
                  "<td>%lu</td><td>%i</td>"
                  "<td>"
                  //"<a href=http://%s:%hu/master/titledb?d=%llu>"
                  "<a href=/master/titledb?c=%s&d=%llu>"
                  "%llu"
                  //"<td><a href=/cgi/4.cgi?d=%llu>%llu"
                  "</td></tr>\n" ,
                  i++,
                  st->m_list2.getTermId16(kb) ,
                  (unsigned long)st->m_list2.getCurrentDate() ,
                  (int)st->m_list2.getCurrentScore() ,
                  //iptoa(ip) , port ,
                  st->m_coll, docId ,
                  docId );
        p += gbstrlen ( p );
    }
    */

    if ( ! st->m_list.isEmpty() )
        pbuf->safePrintf ( "</table>" );
    // print msg if we could fit all into buf
    //if ( p + 1024 >= pend ) {
    //    sprintf ( p ,"... truncated ...
no mem" ); // p += gbstrlen ( p ); //} // print the final tail //p += g_httpServer.printTail ( p , pend - p ); pbuf->safePrintf ( "</center>\n"); // now encapsulate it in html head/tail and send it off bool status = g_httpServer.sendDynamicPage ( s , pbuf->getBufStart() , pbuf->length() ); // delete the state mdelete ( st , sizeof(State10) , "PageIndexdb" ); delete (st) ; return status; }
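// The admin form above re-renders each checkbox with a " checked"
// attribute when the corresponding state flag is set, so the page
// reflects the current settings on redisplay. A standalone sketch of
// that pattern using only the standard library; renderCheckbox() is a
// hypothetical helper, not from the original source.
#include <cstdio>
#include <string>

static std::string renderCheckbox ( const char *name , bool on ) {
	std::string s = "<input type=checkbox value=1 name=";
	s += name;
	if ( on ) s += " checked";  // preserve current state on redisplay
	s += ">";
	return s;
}

// usage:
// printf ( "useTree:%s\n" , renderCheckbox ( "ut" , useTree ).c_str() );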
// . "uf" is printf url format to scrape with a %s for the query // . example: uf="http://www.google.com/search?num=50&q=%s&scoring=d&filter=0"; bool Msg7::scrapeQuery ( ) { // advance round now in case we return early m_round++; // error? if ( m_qbuf.length() > 500 ) { g_errno = EQUERYTOOBIG; return true; } // first encode the query SafeBuf ebuf; ebuf.urlEncode ( m_qbuf.getBufStart() ); // queryUNEncoded ); char *uf; if ( m_round == 1 ) // set to 1 for debugging uf="http://www.google.com/search?num=20&" "q=%s&scoring=d&filter=0"; //uf = "https://startpage.com/do/search?q=%s"; //uf = "http://www.google.com/" // "/cse?cx=013269018370076798483%3A8eec3papwpi&" // "ie=UTF-8&q=%s&" // "num=20"; else uf="http://www.bing.com/search?q=%s"; // skip bing for now //if ( m_round == 2 ) // return true; //if ( m_round == 1 ) // return true; // make the url we will download char ubuf[2048]; sprintf ( ubuf , uf , ebuf.getBufStart() ); // log it log("inject: SCRAPING %s",ubuf); SpiderRequest sreq; sreq.reset(); // set the SpiderRequest strcpy(sreq.m_url, ubuf); // . tell it to only add the hosts of each outlink for now! // . that will be passed on to when XmlDoc calls Links::set() i guess // . xd will not reschedule the scraped url into spiderdb either sreq.m_isScraping = 1; sreq.m_fakeFirstIp = 1; long firstIp = hash32n(ubuf); if ( firstIp == 0 || firstIp == -1 ) firstIp = 1; sreq.m_firstIp = firstIp; // parent docid is 0 sreq.setKey(firstIp,0LL,false); // forceDEl = false, niceness = 0 m_xd.set4 ( &sreq , NULL , m_coll , NULL , 0 ); //m_xd.m_isScraping = true; // download without throttling //m_xd.m_throttleDownload = false; // disregard this m_xd.m_useRobotsTxt = false; // this will tell it to index ahrefs first before indexing // the doc. but do NOT do this if we are from ahrefs.com // ourselves to avoid recursive explosion!! if ( m_useAhrefs ) m_xd.m_useAhrefs = true; m_xd.m_reallyInjectLinks = m_injectLinks; // // rather than just add the links of the page to spiderdb, // let's inject them! // m_xd.setCallback ( this , doneInjectingLinksWrapper ); // niceness is 0 m_linkDedupTable.set(4,0,512,NULL,0,false,0,"ldtab2"); // do we actually inject the links, or just scrape? if ( ! m_xd.injectLinks ( &m_linkDedupTable , NULL, this , doneInjectingLinksWrapper ) ) return false; // otherwise, just download the google/bing search results so we // can display them in xml //else if ( m_xd.getUtf8Content() == (char **)-1 ) // return false; // print reply.. //printReply(); return true; }
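// scrapeQuery() above alternates search engines by round number and
// substitutes the url-encoded query into a printf-style template. A
// minimal standalone sketch of that flow; urlEncodeSimple() is a
// hypothetical stand-in for SafeBuf::urlEncode() and only escapes what
// this example needs.
#include <cctype>
#include <cstdio>
#include <string>

static std::string urlEncodeSimple ( const std::string &q ) {
	std::string out;
	for ( unsigned char c : q ) {
		if ( isalnum ( c ) ) { out += (char)c; continue; }
		char tmp[8];
		snprintf ( tmp , sizeof(tmp) , "%%%02X" , c );
		out += tmp;
	}
	return out;
}

static std::string makeScrapeUrl ( long round , const std::string &q ) {
	// round 1 hits google, later rounds hit bing, as above
	const char *uf = ( round == 1 )
		? "http://www.google.com/search?num=20&q=%s&scoring=d&filter=0"
		: "http://www.bing.com/search?q=%s";
	char ubuf[2048];
	snprintf ( ubuf , sizeof(ubuf) , uf , urlEncodeSimple(q).c_str() );
	return ubuf;
}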
bool sendReply ( void *state ) { StateCatdb *st = (StateCatdb*)state; // check for error if (g_errno) { if (st->m_catLookup) log("PageCatdb: Msg8b had error getting Site Rec: %s", mstrerror(g_errno)); else log("PageCatdb: Msg2a had error generating Catdb: %s", mstrerror(g_errno)); st->m_catLookup = false; g_errno = 0; } long long endTime = gettimeofdayInMilliseconds(); // page buffer SafeBuf sb; sb.reserve(64*1024); // . print standard header // . do not print big links if only an assassin, just print host ids g_pages.printAdminTop ( &sb, st->m_socket , &st->m_r ); sb.safePrintf( "<style>" ".poo { background-color:#%s;}\n" "</style>\n" , LIGHT_BLUE ); sb.safePrintf ( "<table %s>" "<tr><td colspan=2>" "<center><font size=+1><b>Catdb</b></font></center>" "</td></tr>", TABLE_STYLE ); // instructions sb.safePrintf("<tr bgcolor=#%s>" "<td colspan=3>" "<font size=-2>" "<center>" "Don't just start using this, you need to follow the " "instructions in the <i>admin guide</i> for adding " "DMOZ support." "</center>" "</font>" "</td>" "</tr>" ,DARK_BLUE ); // print the generate Catdb link sb.safePrintf ( "<tr class=poo><td>Update Catdb from DMOZ data.</td>" "<td><center>" "<a href=\"/master/catdb?c=%s&gencatdb=2\">" "Update Catdb</a> " "</center></td></tr>", st->m_coll ); sb.safePrintf ( "<tr class=poo>" "<td>Generate New Catdb from DMOZ data.</td>" "<td><center>" "<a href=\"/master/catdb?c=%s&gencatdb=1\">" "Generate Catdb</a> " "</center></td></tr>", st->m_coll ); if (st->m_genCatdb) sb.safePrintf ( "<tr class=poo>" "<td> Catdb Generation took %lli ms." "</td></tr>", endTime - st->m_startTime ); // print Url Category Lookup sb.safePrintf ( "<tr class=poo><td>Lookup Category of Url.</td>" "<td><input type=text name=caturl size=80" " value=\""); if (st->m_catLookup) { sb.safeMemcpy(st->m_url.getUrl(), st->m_url.getUrlLen()); } sb.safePrintf("\"></center></td></tr>" ); // print Url Info if Lookup was done if (st->m_catLookup) { sb.safePrintf("<tr><td>"); // print the url sb.safeMemcpy(st->m_url.getUrl(), st->m_url.getUrlLen()); sb.safePrintf(" (%lli ms)</td><td>", endTime - st->m_startTime ); // print each category id and path for (long i = 0; i < st->m_catRec.m_numCatids; i++) { sb.safePrintf("<b>[%li] ", st->m_catRec.m_catids[i]); g_categories->printPathFromId(&sb, st->m_catRec.m_catids[i]); sb.safePrintf("</b><br>"); // lookup title and summary char title[1024]; long titleLen = 0; char summ[4096]; long summLen = 0; char anchor[256]; unsigned char anchorLen = 0; g_categories->getTitleAndSummary( st->m_url.getUrl(), st->m_url.getUrlLen(), st->m_catRec.m_catids[i], title, &titleLen, 1023, summ, &summLen, 4095, anchor, &anchorLen, 255 ); title[titleLen] = '\0'; summ[summLen] = '\0'; anchor[anchorLen] = '\0'; // print title and summary sb.safePrintf("<b>Title:</b> %s<br>" "<b>Summary:</b> %s<br>", title, summ); if (anchorLen > 0) sb.safePrintf("<b>Anchor:</b> %s<br>", anchor); sb.safePrintf("<br>"); } sb.safePrintf("<b>Filenum:</b> %li<br>", st->m_catRec.m_filenum); // print indirect catids if (st->m_catRec.m_numIndCatids > 0) { sb.safePrintf("<hr><b>Indirect Catids [%li]:" "</b><br>\n", st->m_catRec.m_numIndCatids ); for (long i = 0; i < st->m_catRec.m_numIndCatids; i++) { sb.safePrintf("%lu<br>", st->m_catRec.m_indCatids[i]); } } sb.safePrintf("</td></tr>"); } // end it sb.safePrintf ( "</center></td></tr></table>" ); // print submit button sb.safePrintf ( "<br><center>" "<input type=submit value=\"Submit\" border=0>" "</form></center>" ); // print the final tail //p += g_httpServer.printTail ( p , 
pend - p ); // clear g_errno, if any, so our reply send goes through g_errno = 0; // extract the socket TcpSocket *s = st->m_socket; // clear the state mdelete ( st, sizeof(StateCatdb), "PageCatdb" ); delete st; // . send this page // . encapsulates in html header and tail // . make a Mime return g_httpServer.sendDynamicPage(s , sb.getBufStart(), sb.length()); }
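// getTitleAndSummary() above fills fixed stack buffers and the caller
// null-terminates afterwards (summ[summLen] = '\0'), so every max
// length passed in must leave one byte of headroom below the buffer
// size -- 1023 for title[1024], 4095 for summ[4096], 255 for
// anchor[256]. A tiny standalone sketch of that convention;
// fillBounded() is a hypothetical helper.
#include <cstring>

static void fillBounded ( char *dst , long *dstLen , long maxLen ,
			  const char *src ) {
	long n = (long)strlen ( src );
	if ( n > maxLen ) n = maxLen;  // maxLen must be sizeof(dst)-1
	memcpy ( dst , src , n );
	*dstLen = n;
}

// usage:
// char summ[4096]; long summLen;
// fillBounded ( summ , &summLen , (long)sizeof(summ)-1 , raw );
// summ[summLen] = '\0';  // safe: headroom was reserved above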
void doneReindexing ( void *state ) { // cast it State13 *st = (State13 *)state; GigablastRequest *gr = &st->m_gr; // note it if ( gr->m_query && gr->m_query[0] ) log(LOG_INFO,"admin: Done with query reindex. %s", mstrerror(g_errno)); //// // // print the html page // ///// HttpRequest *hr = &gr->m_hr; char format = hr->getReplyFormat(); SafeBuf sb; const char *ct = "text/html"; if ( format == FORMAT_JSON ) ct = "application/json"; if ( format == FORMAT_XML ) ct = "text/xml"; if ( format == FORMAT_XML ) { sb.safePrintf("<response>\n" "\t<statusCode>0</statusCode>\n" "\t<statusMsg>Success</statusMsg>\n" "\t<matchingResults>%" PRId32"</matchingResults>\n" "</response>" , st->m_msg1c.m_numDocIdsAdded ); g_httpServer.sendDynamicPage ( gr->m_socket, sb.getBufStart(), sb.length(), -1, false,ct); mdelete ( st , sizeof(State13) , "PageTagdb" ); delete (st); return; } if ( format == FORMAT_JSON ) { sb.safePrintf("{\"response\":{\n" "\t\"statusCode\":0,\n" "\t\"statusMsg\":\"Success\",\n" "\t\"matchingResults\":%" PRId32"\n" "}\n" "}\n" , st->m_msg1c.m_numDocIdsAdded ); g_httpServer.sendDynamicPage ( gr->m_socket, sb.getBufStart(), sb.length(), -1, false,ct); mdelete ( st , sizeof(State13) , "PageTagdb" ); delete (st); return; } g_pages.printAdminTop ( &sb , gr->m_socket , &gr->m_hr ); sb.safePrintf("<style>" ".poo { background-color:#%s;}\n" "</style>\n" , LIGHT_BLUE ); // // print error msg if any // if ( gr->m_query && gr->m_query[0] && ! g_errno ) sb.safePrintf ( "<center><font color=red><b>Success. " "Added %" PRId32" docid(s) to " "spider queue.</b></font></center><br>" , st->m_msg1c.m_numDocIdsAdded ); if ( gr->m_query && gr->m_query[0] && g_errno ) sb.safePrintf ( "<center><font color=red><b>Error. " "%s</b></font></center><br>" , mstrerror(g_errno)); // print the reindex interface g_parms.printParmTable ( &sb , gr->m_socket , &gr->m_hr ); g_httpServer.sendDynamicPage ( gr->m_socket, sb.getBufStart(), sb.length(), -1, false); mdelete ( st , sizeof(State13) , "PageTagdb" ); delete (st); }
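// doneReindexing() above emits the same status payload in XML, JSON or
// HTML depending on the requested reply format. A standalone sketch of
// that branch, assuming only the standard library; the local Format
// enum mirrors the FORMAT_* constants but is not from the original
// source.
#include <cstdio>
#include <string>

enum class Format { Html , Xml , Json };

static std::string statusReply ( Format f , long matching ) {
	char buf[256];
	if ( f == Format::Xml )
		snprintf ( buf , sizeof(buf) ,
			   "<response>\n"
			   "\t<statusCode>0</statusCode>\n"
			   "\t<statusMsg>Success</statusMsg>\n"
			   "\t<matchingResults>%ld</matchingResults>\n"
			   "</response>" , matching );
	else if ( f == Format::Json )
		snprintf ( buf , sizeof(buf) ,
			   "{\"response\":{\n"
			   "\t\"statusCode\":0,\n"
			   "\t\"statusMsg\":\"Success\",\n"
			   "\t\"matchingResults\":%ld\n"
			   "}}\n" , matching );
	else
		snprintf ( buf , sizeof(buf) ,
			   "<center><b>Success. Added %ld docid(s) to "
			   "spider queue.</b></center>" , matching );
	return buf;
}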
bool sendReply ( void *state ) { GigablastRequest *gr = (GigablastRequest *)state; // in order to see what sites are being added log it, then we can // more easily remove sites from sitesearch.gigablast.com that are // being added but not being searched SafeBuf xb; if ( gr->m_urlsBuf ) { xb.safeTruncateEllipsis ( gr->m_urlsBuf , 200 ); log( LOG_INFO, "http: add url %s (%s)", xb.getBufStart(), mstrerror( g_errno ) ); } char format = gr->m_hr.getReplyFormat(); TcpSocket *sock = gr->m_socket; if ( format == FORMAT_JSON || format == FORMAT_XML ) { bool status = g_httpServer.sendSuccessReply ( gr ); // nuke state mdelete ( gr , sizeof(GigablastRequest) , "PageAddUrl" ); delete (gr); return status; } int32_t ulen = 0; const char *url = gr->m_urlsBuf; if ( url ) ulen = gbstrlen (url); // re-null it out if just http:// bool printUrl = true; if ( ulen == 0 ) printUrl = false; if ( ! gr->m_urlsBuf ) printUrl = false; if ( ulen==7 && printUrl && !strncasecmp(url,"http://",7)) printUrl = false; if ( ulen==8 && printUrl && !strncasecmp(url,"https://",8)) printUrl = false; // page is not more than 32k char buf[1024*32+MAX_URL_LEN*2]; SafeBuf sb(buf, 1024*32+MAX_URL_LEN*2); g_pages.printAdminTop ( &sb , sock , &gr->m_hr ); // if there was an error let them know SafeBuf mbuf; if ( g_errno ) { mbuf.safePrintf("<center><font color=red>"); mbuf.safePrintf("Error adding url(s): <b>%s[%i]</b>", mstrerror(g_errno) , g_errno); mbuf.safePrintf("</font></center>"); } else if ( printUrl ) { mbuf.safePrintf("<center><font color=red>"); mbuf.safePrintf("<b><u>"); mbuf.safeTruncateEllipsis(gr->m_urlsBuf,200); mbuf.safePrintf("</u></b></font> added to spider queue successfully<br><br>"); mbuf.safePrintf("</font></center>"); } if ( mbuf.length() ) { sb.safeStrcpy( mbuf.getBufStart() ); } g_parms.printParmTable ( &sb , sock , &gr->m_hr ); // print the final tail g_pages.printTail ( &sb, true ); // admin? // clear g_errno, if any, so our reply send goes through g_errno = 0; // nuke state mdelete ( gr , sizeof(GigablastRequest) , "PageAddUrl" ); delete (gr); return g_httpServer.sendDynamicPage( sock, sb.getBufStart(), sb.length(), -1 ); // cachetime }
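// mdelete() must be told the allocation's size. A classic pitfall here
// is writing sizeof(gr) on the pointer variable, which yields the
// pointer size (4 or 8 bytes) rather than the object size -- hence
// sizeof(GigablastRequest) above. A tiny standalone illustration; the
// Req struct is hypothetical.
struct Req { char m_buf[512]; };

static_assert ( sizeof(Req*) != sizeof(Req) ,
		"sizeof(ptr) is the pointer size, not the object size" );
// so a tracking allocator must be passed sizeof(Req), never sizeof(r)
// when r is a Req*.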
// returns false if blocked, true otherwise bool processLoop ( void *state ) { // get it State2 *st = (State2 *)state; // get the tcp socket from the state TcpSocket *s = st->m_socket; // get it XmlDoc *xd = &st->m_xd; if ( ! xd->m_loaded ) { // setting just the docid. niceness is 0. //xd->set3 ( st->m_docId , st->m_coll , 0 ); // callback xd->setCallback ( state , processLoop ); // . and tell it to load from the old title rec // . this sets xd->m_oldTitleRec/m_oldTitleRecSize // . this sets xd->ptr_* and all other member vars from // the old title rec if found in titledb. if ( ! xd->loadFromOldTitleRec ( ) ) return false; } if ( g_errno ) return sendErrorReply ( st , g_errno ); // now force it to load old title rec //char **tr = xd->getTitleRec(); SafeBuf *tr = xd->getTitleRecBuf(); // blocked? return false if so. it will call processLoop() when it rets if ( tr == (void *)-1 ) return false; // we did not block. check for error? this will free "st" too. if ( ! tr ) return sendErrorReply ( st , g_errno ); // if title rec was empty, that is a problem if ( xd->m_titleRecBuf.length() == 0 ) return sendErrorReply ( st , ENOTFOUND); // set callback char *na = xd->getIsNoArchive(); // wait if blocked if ( na == (void *)-1 ) return false; // error? if ( ! na ) return sendErrorReply ( st , g_errno ); // forbidden? allow turkeys through though... if ( ! st->m_isAdmin && *na ) return sendErrorReply ( st , ENOCACHE ); SafeBuf *sb = &st->m_sb; // &page=4 will print rainbow sections if ( ! st->m_printed && st->m_r.getLong("page",0) ) { // do not repeat this call st->m_printed = true; // this will call us again since we called // xd->setCallback() above to us if ( ! xd->printDocForProCog ( sb , &st->m_r ) ) return false; } char *contentType = "text/html"; char format = st->m_format; if ( format == FORMAT_XML ) contentType = "text/xml"; if ( format == FORMAT_JSON ) contentType = "application/json"; // if we printed a special page (like rainbow sections) then return now if ( st->m_printed ) { bool status = g_httpServer.sendDynamicPage (s, //buf,bufLen, sb->getBufStart(), sb->getLength(), -1,false, //"text/html", contentType, -1, NULL, "utf8" ); // nuke state2 mdelete ( st , sizeof(State2) , "PageGet1" ); delete (st); return status; } /* // this was calling XmlDoc and setting sections, etc. to // get the SpiderReply junk... no no no // is it banned or filtered? this ignores the TagRec in the titleRec // and uses msg8a to get it fresh instead char *vi = xd->getIsFiltered();//Visible( ); // wait if blocked if ( vi == (void *)-1 ) return false; // error? if ( ! vi ) return sendErrorReply ( st , g_errno ); // banned? if ( ! st->m_isAdmin && ! *vi ) return sendErrorReply (st,EDOCBANNED); */ // get the utf8 content char **utf8 = xd->getUtf8Content(); //long len = xd->size_utf8Content - 1; // wait if blocked??? if ( utf8 == (void *)-1 ) return false; // strange if ( xd->size_utf8Content<=0) { log("pageget: utf8 content <= 0"); return sendErrorReply(st,EBADENGINEER ); } // alloc error? if ( ! utf8 ) return sendErrorReply ( st , g_errno ); // get this host Host *h = g_hostdb.getHost ( g_hostdb.m_hostId ); if ( ! 
h ) { log("pageget: hostid %li is bad",g_hostdb.m_hostId); return sendErrorReply(st,EBADENGINEER ); } char *content = xd->ptr_utf8Content; long contentLen = xd->size_utf8Content - 1; // shortcut char strip = st->m_strip; // alloc buffer now //char *buf = NULL; //long bufMaxSize = 0; //bufMaxSize = len + ( 32 * 1024 ) ; //bufMaxSize = contentLen + ( 32 * 1024 ) ; //buf = (char *)mmalloc ( bufMaxSize , "PageGet2" ); //char *p = buf; //char *bufEnd = buf + bufMaxSize; //if ( ! buf ) { // return sendErrorReply ( st , g_errno ); //} // for undoing the header //char *start1 = p; long startLen1 = sb->length(); // we are always utf8 if ( strip != 2 ) sb->safePrintf( "<meta http-equiv=\"Content-Type\" " "content=\"text/html;charset=utf8\">\n"); // base href //Url *base = &xd->m_firstUrl; //if ( xd->ptr_redirUrl.m_url[0] ) // base = &xd->m_redirUrl; char *base = xd->ptr_firstUrl; if ( xd->ptr_redirUrl ) base = xd->ptr_redirUrl; //Url *redir = *xd->getRedirUrl(); if ( strip != 2 ) { sb->safePrintf ( "<BASE HREF=\"%s\">" , base ); //p += gbstrlen ( p ); } // default colors in case css files missing if ( strip != 2 ) { sb->safePrintf( "\n<style type=\"text/css\">\n" "body{background-color:white;color:black;}\n" "</style>\n"); //p += gbstrlen ( p ); } //char format = st->m_format; if ( format == FORMAT_XML ) sb->reset(); if ( format == FORMAT_JSON ) sb->reset(); // for undoing the stuff below long startLen2 = sb->length();//p; // query should be NULL terminated char *q = st->m_q; long qlen = st->m_qlen; char styleTitle[128] = "font-size:14px;font-weight:600;" "color:#000000;"; char styleText[128] = "font-size:14px;font-weight:400;" "color:#000000;"; char styleLink[128] = "font-size:14px;font-weight:400;" "color:#0000ff;"; char styleTell[128] = "font-size:14px;font-weight:600;" "color:#cc0000;"; // get the url of the title rec Url *f = xd->getFirstUrl(); bool printDisclaimer = st->m_printDisclaimer; if ( xd->m_contentType == CT_JSON ) printDisclaimer = false; if ( format == FORMAT_XML ) printDisclaimer = false; if ( format == FORMAT_JSON ) printDisclaimer = false; char tbuf[100]; tbuf[0] = 0; time_t lastSpiderDate = xd->m_spideredTime; if ( printDisclaimer || format == FORMAT_XML || format == FORMAT_JSON ) { struct tm *timeStruct = gmtime ( &lastSpiderDate ); strftime ( tbuf, 100,"%b %d, %Y UTC", timeStruct); } // We should always be displaying this disclaimer. // - May eventually want to display this at a different location // on the page, or on the click 'n' scroll browser page itself // when this page is not being viewed solo. // CNS: if ( ! st->m_clickNScroll ) { if ( printDisclaimer ) { sb->safePrintf(//sprintf ( p , //"<BASE HREF=\"%s\">" //"<table border=1 width=100%%>" //"<tr><td>" "<table border=\"1\" bgcolor=\"#" BGCOLOR "\" cellpadding=\"10\" " //"id=\"gbcnsdisctable\" class=\"gbcnsdisctable_v\"" "cellspacing=\"0\" width=\"100%%\" color=\"#ffffff\">" "<tr" //" id=\"gbcnsdisctr\" class=\"gbcnsdisctr_v\"" "><td>" //"<font face=times,sans-serif color=black size=-1>" "<span style=\"%s\">" "This is Gigablast's cached page of </span>" "<a href=\"%s\" style=\"%s\">%s</a>" "" , styleTitle, f->getUrl(), styleLink, f->getUrl() ); //p += gbstrlen ( p ); // then the rest //sprintf(p , sb->safePrintf( "<span style=\"%s\">. 
" "Gigablast is not responsible for the content of " "this page.</span>", styleTitle ); //p += gbstrlen ( p ); sb->safePrintf ( "<br/><span style=\"%s\">" "Cached: </span>" "<span style=\"%s\">", styleTitle, styleText ); //p += gbstrlen ( p ); // then the spider date in GMT // time_t lastSpiderDate = xd->m_spideredTime; // struct tm *timeStruct = gmtime ( &lastSpiderDate ); // char tbuf[100]; // strftime ( tbuf, 100,"%b %d, %Y UTC", timeStruct); //p += gbstrlen ( p ); sb->safeStrcpy(tbuf); // Moved over from PageResults.cpp sb->safePrintf( "</span> - <a href=\"" "/get?" "q=%s&c=%s&rtq=%li&" "d=%lli&strip=1\"" " style=\"%s\">" "[stripped]</a>", q , st->m_coll , (long)st->m_rtq, st->m_docId, styleLink ); // a link to alexa if ( f->getUrlLen() > 5 ) { sb->safePrintf( " - <a href=\"http:" "//web.archive.org/web/*/%s\"" " style=\"%s\">" "[older copies]</a>" , f->getUrl(), styleLink ); } if (st->m_noArchive){ sb->safePrintf( " - <span style=\"%s\"><b>" "[NOARCHIVE]</b></span>", styleTell ); } if (st->m_isBanned){ sb->safePrintf(" - <span style=\"%s\"><b>" "[BANNED]</b></span>", styleTell ); } // only print this if we got a query if ( qlen > 0 ) { sb->safePrintf("<br/><br/><span style=\"%s\"> " "These search terms have been " "highlighted: ", styleText ); //p += gbstrlen ( p ); } } // how much space left in p? //long avail = bufEnd - p; // . make the url that we're outputting for (like in PageResults.cpp) // . "thisUrl" is the baseUrl for click & scroll char thisUrl[MAX_URL_LEN]; char *thisUrlEnd = thisUrl + MAX_URL_LEN; char *x = thisUrl; // . use the external ip of our gateway // . construct the NAT mapped port // . you should have used iptables to map port to the correct // internal ip:port //unsigned long ip =g_conf.m_mainExternalIp ; // h->m_externalIp; //unsigned short port=g_conf.m_mainExternalPort;//h->m_externalHttpPort // local check //if ( st->m_isLocal ) { unsigned long ip = h->m_ip; unsigned short port = h->m_httpPort; //} //sprintf ( x , "http://%s:%li/get?q=" , iptoa ( ip ) , port ); // . we no longer put the port in here // . but still need http:// since we use <base href=> if (port == 80) sprintf(x,"http://%s/get?q=",iptoa(ip)); else sprintf(x,"http://%s:%hu/get?q=",iptoa(ip),port); x += gbstrlen ( x ); // the query url encoded long elen = urlEncode ( x , thisUrlEnd - x , q , qlen ); x += elen; // separate cgi vars with a & //sprintf ( x, "&seq=%li&rtq=%lid=%lli", // (long)st->m_seq,(long)st->m_rtq,st->m_msg22.getDocId()); sprintf ( x, "&d=%lli",st->m_docId ); x += gbstrlen(x); // set our query for highlighting Query qq; qq.set2 ( q, st->m_langId , true ); // print the query terms into our highlight buffer Highlight hi; // make words so we can set the scores to ignore fielded terms Words qw; qw.set ( q , // content being highlighted, utf8 qlen , // content being highlighted, utf8 TITLEREC_CURRENT_VERSION, true , // computeIds false ); // hasHtmlEntities? // . assign scores of 0 to query words that should be ignored // . TRICKY: loop over words in qq.m_qwords, but they should be 1-1 // with words in qw. // . sanity check //if ( qw.getNumWords() != qq.m_numWords ) { char *xx = NULL; *xx = 0;} // declare up here Matches m; // do the loop //Scores ss; //ss.set ( &qw , NULL ); //for ( long i = 0 ; i < qq.m_numWords ; i++ ) // if ( ! m.matchWord ( &qq.m_qwords[i],i ) ) ss.m_scores[i] = 0; // now set m.m_matches[] to those words in qw that match a query word // or phrase in qq. m.setQuery ( &qq ); //m.addMatches ( &qw , &ss , true ); m.addMatches ( &qw ); long hilen = 0; // CNS: if ( ! 
st->m_clickNScroll ) { // and highlight the matches if ( printDisclaimer ) { hilen = hi.set ( //p , //avail , sb , &qw , // words to highlight &m , // matches relative to qw false , // doSteming false , // st->m_clickAndScroll , (char *)thisUrl );// base url for ClcknScrll //p += hilen; // now an hr //memcpy ( p , "</span></table></table>\n" , 24 ); p += 24; sb->safeStrcpy("</span></table></table>\n"); } bool includeHeader = st->m_includeHeader; // do not show header for json object display if ( xd->m_contentType == CT_JSON ) includeHeader = false; if ( format == FORMAT_XML ) includeHeader = false; if ( format == FORMAT_JSON ) includeHeader = false; //mfree(uq, uqCapacity, "PageGet"); // undo the header writes if we should if ( ! includeHeader ) { // including base href is off by default when not including // the header, so the caller must explicitly turn it back on if ( st->m_includeBaseHref ) sb->m_length=startLen2;//p=start2; else sb->m_length=startLen1;//p=start1; } //sb->safeStrcpy(tbuf); if ( format == FORMAT_XML ) { sb->safePrintf("<response>\n"); sb->safePrintf("<statusCode>0</statusCode>\n"); sb->safePrintf("<statusMsg>Success</statusMsg>\n"); sb->safePrintf("<url><![CDATA["); sb->cdataEncode(xd->m_firstUrl.m_url); sb->safePrintf("]]></url>\n"); sb->safePrintf("<docId>%llu</docId>\n",xd->m_docId); sb->safePrintf("\t<cachedTimeUTC>%lu</cachedTimeUTC>\n", lastSpiderDate); sb->safePrintf("\t<cachedTimeStr>%s</cachedTimeStr>\n",tbuf); } if ( format == FORMAT_JSON ) { sb->safePrintf("{\"response\":{\n"); sb->safePrintf("\t\"statusCode\":0,\n"); sb->safePrintf("\t\"statusMsg\":\"Success\",\n"); sb->safePrintf("\t\"url\":\""); sb->jsonEncode(xd->m_firstUrl.m_url); sb->safePrintf("\",\n"); sb->safePrintf("\t\"docId\":%llu,\n",xd->m_docId); sb->safePrintf("\t\"cachedTimeUTC\":%lu,\n",lastSpiderDate); sb->safePrintf("\t\"cachedTimeStr\":\"%s\",\n",tbuf); } // identify start of <title> tag we wrote out char *sbstart = sb->getBufStart(); char *sbend = sb->getBufEnd(); char *titleStart = NULL; char *titleEnd = NULL; for ( char *t = sbstart ; t < sbend ; t++ ) { // title tag? if ( t[0]!='<' ) continue; if ( to_lower_a(t[1])!='t' ) continue; if ( to_lower_a(t[2])!='i' ) continue; if ( to_lower_a(t[3])!='t' ) continue; if ( to_lower_a(t[4])!='l' ) continue; if ( to_lower_a(t[5])!='e' ) continue; // point to it char *x = t + 5; // max - to keep things fast char *max = x + 500; for ( ; *x && *x != '>' && x < max ; x++ ); x++; // find end char *e = x; for ( ; *e && e < max ; e++ ) { if ( e[0]=='<' && to_lower_a(e[1])=='/' && to_lower_a(e[2])=='t' && to_lower_a(e[3])=='i' && to_lower_a(e[4])=='t' && to_lower_a(e[5])=='l' && to_lower_a(e[6])=='e' ) break; } if ( e < max ) { titleStart = x; titleEnd = e; } break; } // . print title at top! // . consider moving if ( titleStart ) { char *ebuf = st->m_r.getString("eb"); if ( ! ebuf ) ebuf = ""; //p += sprintf ( p , sb->safePrintf( "<table border=1 " "cellpadding=10 " "cellspacing=0 " "width=100%% " "color=#ffffff>" ); long printLinks = st->m_r.getLong("links",0); if ( ! printDisclaimer && printLinks ) sb->safePrintf(//p += sprintf ( p , // first put cached and live link "<tr>" "<td bgcolor=lightyellow>" // print cached link //"<center>" " " "<b>" "<a " "style=\"font-size:18px;font-weight:600;" "color:#000000;\" " "href=\"" "/get?" 
"c=%s&d=%lli&qh=0&cnsp=1&eb=%s\">" "cached link</a>" " " "<a " "style=\"font-size:18px;font-weight:600;" "color:#000000;\" " "href=%s>live link</a>" "</b>" //"</center>" "</td>" "</tr>\n" ,st->m_coll ,st->m_docId ,ebuf ,thisUrl // st->ptr_ubuf ); if ( printLinks ) { sb->safePrintf(//p += sprintf ( p , "<tr><td bgcolor=pink>" "<span style=\"font-size:18px;" "font-weight:600;" "color:#000000;\">" " " "<b>PAGE TITLE:</b> " ); long tlen = titleEnd - titleStart; sb->safeMemcpy ( titleStart , tlen ); sb->safePrintf ( "</span></td></tr>" ); } sb->safePrintf( "</table><br>\n" ); } // is the content preformatted? bool pre = false; char ctype = (char)xd->m_contentType; if ( ctype == CT_TEXT ) pre = true ; // text/plain if ( ctype == CT_DOC ) pre = true ; // filtered msword if ( ctype == CT_PS ) pre = true ; // filtered postscript if ( format == FORMAT_XML ) pre = false; if ( format == FORMAT_JSON ) pre = false; // if it is content-type text, add a <pre> if ( pre ) {//p + 5 < bufEnd && pre ) { sb->safePrintf("<pre>"); //p += 5; } if ( st->m_strip == 1 ) contentLen = stripHtml( content, contentLen, (long)xd->m_version, st->m_strip ); // it returns -1 and sets g_errno on error, line OOM if ( contentLen == -1 ) { //if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" ); return sendErrorReply ( st , g_errno ); } Xml xml; Words ww; // if no highlighting, skip it bool queryHighlighting = st->m_queryHighlighting; if ( st->m_strip == 2 ) queryHighlighting = false; // do not do term highlighting if json if ( xd->m_contentType == CT_JSON ) queryHighlighting = false; SafeBuf tmp; SafeBuf *xb = sb; if ( format == FORMAT_XML ) xb = &tmp; if ( format == FORMAT_JSON ) xb = &tmp; if ( ! queryHighlighting ) { xb->safeMemcpy ( content , contentLen ); //p += contentLen ; } else { // get the content as xhtml (should be NULL terminated) //Words *ww = xd->getWords(); if ( ! xml.set ( content , contentLen , false , 0 , false , TITLEREC_CURRENT_VERSION , false , 0 , CT_HTML ) ) { // niceness is 0 //if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" ); return sendErrorReply ( st , g_errno ); } if ( ! ww.set ( &xml , true , 0 ) ) { // niceness is 0 //if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" ); return sendErrorReply ( st , g_errno ); } // sanity check //if ( ! xd->m_wordsValid ) { char *xx=NULL;*xx=0; } // how much space left in p? 
//avail = bufEnd - p; Matches m; m.setQuery ( &qq ); m.addMatches ( &ww ); hilen = hi.set ( xb , // p , avail , &ww , &m , false /*doStemming?*/ , st->m_clickAndScroll , thisUrl /*base url for click & scroll*/); //p += hilen; log(LOG_DEBUG, "query: Done highlighting cached page content"); } if ( format == FORMAT_XML ) { sb->safePrintf("\t<content><![CDATA["); sb->cdataEncode ( xb->getBufStart() ); sb->safePrintf("]]></content>\n"); sb->safePrintf("</response>\n"); } if ( format == FORMAT_JSON ) { sb->safePrintf("\t\"content\":\"\n"); sb->jsonEncode ( xb->getBufStart() ); sb->safePrintf("\"\n}\n}\n"); } // if it is content-type text, add a </pre> if ( pre ) { // p + 6 < bufEnd && pre ) { sb->safeMemcpy ( "</pre>" , 6 ); //p += 6; } // calculate bufLen //long bufLen = p - buf; long ct = xd->m_contentType; // now filter the entire buffer to escape out the xml tags // so it is displayed nice SafeBuf newbuf; if ( ct == CT_XML ) { // encode the xml tags into <tagname> sequences if ( !newbuf.htmlEncodeXmlTags ( sb->getBufStart() , sb->getLength(), 0)){// niceness=0 //if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" ); return sendErrorReply ( st , g_errno ); } // free out buffer that we alloc'd before returning since this // should have copied it into another buffer //if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" ); // reassign //buf = newbuf.getBufStart(); //bufLen = newbuf.length(); sb->stealBuf ( &newbuf ); } // now encapsulate it in html head/tail and send it off // sendErr: contentType = "text/html"; if ( strip == 2 ) contentType = "text/xml"; // xml is usually buggy and this throws browser off //if ( ctype == CT_XML ) contentType = "text/xml"; if ( xd->m_contentType == CT_JSON ) contentType = "application/json"; if ( format == FORMAT_XML ) contentType = "text/xml"; if ( format == FORMAT_JSON ) contentType = "application/json"; // safebuf, sb, is a member of "st" so this should copy the buffer // when it constructs the http reply, and we gotta call delete(st) // AFTER this so sb is still valid. bool status = g_httpServer.sendDynamicPage (s, //buf,bufLen, sb->getBufStart(), sb->getLength(), -1,false, contentType, -1, NULL, "utf8" ); // nuke state2 mdelete ( st , sizeof(State2) , "PageGet1" ); delete (st); // free out buffer that we alloc'd before returning since this // should have copied it into another buffer //if ( ct == CT_XML ) newbuf.purge(); //else if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" ); // and convey the status return status; }
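// processLoop() above locates the <title>...</title> span with a
// bounded, case-insensitive scan so it can echo the page title at the
// top of the cached copy. A standalone sketch of the same scan over a
// NUL-terminated buffer; findTitle() is a hypothetical helper.
#include <strings.h>

static bool findTitle ( const char *buf , const char **start ,
			const char **end ) {
	for ( const char *t = buf ; *t ; t++ ) {
		if ( t[0] != '<' ) continue;
		if ( strncasecmp ( t , "<title" , 6 ) ) continue;
		// skip to the '>' that closes the open tag, within bounds
		const char *x   = t + 6;
		const char *max = x + 500;  // cap the scan to keep it fast
		while ( *x && *x != '>' && x < max ) x++;
		if ( *x != '>' ) return false;
		x++;
		// find the closing tag within the same bound
		const char *e = x;
		for ( ; *e && e < max ; e++ )
			if ( ! strncasecmp ( e , "</title" , 7 ) ) break;
		if ( e >= max || ! *e ) return false;
		*start = x;
		*end   = e;
		return true;
	}
	return false;
}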
bool qainject2 ( ) { //if ( ! s_callback ) s_callback = qainject2; // // delete the 'qatest123' collection // //static bool s_x1 = false; if ( ! s_flags[0] ) { s_flags[0] = true; if ( ! getUrl ( "/admin/delcoll?xml=1&delcoll=qatest123" ) ) return false; } // // add the 'qatest123' collection // //static bool s_x2 = false; if ( ! s_flags[1] ) { s_flags[1] = true; if ( ! getUrl ( "/admin/addcoll?addcoll=qatest123&xml=1" , // checksum of reply expected 238170006 ) ) return false; } // // try delimiter-based injecting // //static bool s_y2 = false; if ( ! s_flags[7] ) { s_flags[7] = true; SafeBuf sb; // delim=+++URL: sb.safePrintf("&c=qatest123&deleteurl=0&" "delim=%%2B%%2B%%2BURL%%3A&format=xml&u=xyz.com&" "hasmime=1&content="); // use injectme3 file SafeBuf ubuf; ubuf.load("./injectme3"); sb.urlEncode(ubuf.getBufStart()); if ( ! getUrl ( "/admin/inject", // check reply, seems to have only a single // docid in it -1970198487, sb.getBufStart()) ) return false; } // now query check //static bool s_y4 = false; if ( ! s_flags[8] ) { wait(1.5); s_flags[8] = true; return false; } if ( ! s_flags[14] ) { s_flags[14] = true; if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&q=%2Bthe", -1804253505 ) ) return false; } //static bool s_y5 = false; if ( ! s_flags[9] ) { s_flags[9] = true; if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&q=sports" "+news&ns=1&tml=20&smxcpl=30&" "sw=10&showimages=1" ,-1874756636 ) ) return false; } //static bool s_y6 = false; if ( ! s_flags[10] ) { s_flags[10] = true; if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&q=sports" "+news&ns=1&tml=20&smxcpl=30&" "sw=10&showimages=0&hacr=1" ,1651330319 ) ) return false; } //static bool s_y7 = false; if ( ! s_flags[11] ) { s_flags[11] = true; if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&q=sports" "+news&ns=1&tml=20&smxcpl=30&" "sw=10&showimages=0&sc=1" ,-1405546537 ) ) return false; } // // delete the 'qatest123' collection // if ( ! s_flags[12] ) { s_flags[12] = true; if ( ! getUrl ( "/admin/delcoll?xml=1&delcoll=qatest123" ) ) return false; } //static bool s_fee2 = false; if ( ! s_flags[13] ) { s_flags[13] = true; log("qa: SUCCESSFULLY COMPLETED " "QA INJECT TEST 2"); //if ( s_callback == qainject ) exit(0); return true; } return true; }
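// The QA tests above are written as resumable state machines: each
// step is guarded by an s_flags[] slot that is set *before* the async
// request launches, so when the reply callback re-enters the function
// it falls through the completed steps to the next unfinished one. A
// minimal standalone sketch of the pattern; startStep() stands in for
// getUrl() and is hypothetical.
#include <cstdio>

static bool s_steps[4] = {};

static bool startStep ( int i ) {
	printf ( "qa: launching step %d\n" , i );
	return false;  // pretend the request blocked; callback re-enters
}

static bool qaTest ( ) {
	for ( int i = 0 ; i < 4 ; i++ ) {
		if ( s_steps[i] ) continue;  // already launched this step
		s_steps[i] = true;           // mark before the async call
		if ( ! startStep ( i ) ) return false;  // blocked
	}
	printf ( "qa: SUCCESSFULLY COMPLETED TEST\n" );
	return true;
}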
bool gotXmlDoc ( void *state ) { // cast it State8 *st = (State8 *)state; // get the xmldoc XmlDoc *xd = &st->m_xd; // if we loaded from old title rec, it should be there! // . save the ips.txt file if we are the test coll // . saveTestBuf() is a function in Msge1.cpp //if ( xd && xd->m_coll && ! strcmp ( xd->m_coll , "qatest123")) // // use same dir that XmlDoc::getTestDir() would use // saveTestBuf ( "test-page-parser" ); // error? if ( g_errno ) return sendErrorReply ( st , g_errno ); // shortcut SafeBuf *xbuf = &st->m_xbuf; bool printIt = false; if ( st->m_u && st->m_u[0] ) printIt = true; if ( st->m_docId != -1LL ) printIt = true; if ( st->m_donePrinting ) printIt = false; // do not re-call this if printDocForProCog blocked... (check length()) if ( printIt ) { // mark as done st->m_donePrinting = true; // always re-compute the page inlinks dynamically, do not // use the ptr_linkInfo1 stored in titlerec!! // NO! not if set from titlerec/docid if ( st->m_recompute ) xd->m_linkInfo1Valid = false; // try a recompute regardless, because we do not store the // bad inlinkers, and ppl want to see why they are bad! //xd->m_linkInfo1Valid = false; // now get the meta list, in the process it will print out a // bunch of junk into st->m_xbuf //char *metalist = xd->getMetaList ( ); //if ( ! metalist ) return sendErrorReply ( st , g_errno ); // return false if it blocked //if ( metalist == (void *)-1 ) return false; // for debug... //if ( ! xd->m_indexCode ) xd->doConsistencyTest ( false ); // . print it out // . returns false if blocks, true otherwise // . sets g_errno on error if ( ! xd->printDocForProCog ( xbuf , &st->m_r ) ) return false; // error? if ( g_errno ) return sendErrorReply ( st , g_errno ); } long isXml = st->m_r.getLong("xml",0); char ctype2 = CT_HTML; if ( isXml ) ctype2 = CT_XML; // now encapsulate it in html head/tail and send it off bool status = g_httpServer.sendDynamicPage( st->m_s , xbuf->getBufStart(), xbuf->length() , -1, //cachtime false ,//postreply? &ctype2, -1 , //httpstatus NULL,//cookie "utf-8"); // delete the state now if ( st->m_freeIt ) { mdelete ( st , sizeof(State8) , "PageParser" ); delete (st); } // return the status return status; }
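// gotXmlDoc() may be re-entered when printDocForProCog() blocks, so it
// guards the printing step with m_donePrinting, set *before* the call
// that can block. A standalone sketch of the idiom; State and
// printDoc() here are hypothetical.
struct PrintState { bool m_donePrinting = false; };

static bool printDoc ( PrintState * ) { return true; }  // false = blocked

static bool gotDoc ( PrintState *st ) {
	if ( ! st->m_donePrinting ) {
		// set first: a re-entry from the callback skips this block
		st->m_donePrinting = true;
		if ( ! printDoc ( st ) ) return false;  // will call us again
	}
	// ... send the accumulated page buffer ...
	return true;
}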
// . make a web page from results stored in msg40 // . send it on TcpSocket "s" when done // . returns false if blocked, true otherwise // . sets g_errno on error bool gotTitleRec ( void *state ) { // cast the State4 out State4 *st = (State4 *) state; // get the socket TcpSocket *s = st->m_socket; SafeBuf sb; // get its docId long long docId = st->m_docId; // make the query string for passing to different hosts char qs[64]; sprintf(qs,"&d=%lli",docId); if ( docId==0LL ) qs[0] = 0; // print standard header sb.reserve2x ( 32768 ); g_pages.printAdminTop (&sb, st->m_socket, &st->m_r ); //PAGE_TITLEDB, // st->m_username,//NULL , // st->m_coll , st->m_pwd , s->m_ip , qs ); // shortcut XmlDoc *xd = &st->m_xd; // . deal with errors // . print none if no title rec at or after the provided docId if ( g_errno || docId == 0LL || xd->m_titleRecBuf.length() <= 0 ) { // print docId in box sb.safePrintf ( "<center>\nEnter docId: " "<input type=text name=d value=%lli size=15>", docId); sb.safePrintf ( "</form><br>\n" ); if ( docId == 0 ) sb.safePrintf("<br>"); else if ( g_errno ) sb.safePrintf("<br><br>Error = %s",mstrerror(g_errno)); else sb.safePrintf("<br><br>No titleRec for that docId " "or higher"); // print where it should be //unsigned long gid = getGroupIdFromDocId ( docId ); //Host *hosts = g_hostdb.getGroup(gid); long shardNum = getShardNumFromDocId ( docId ); Host *hosts = g_hostdb.getShard ( shardNum ); long hostId = -1; if ( hosts ) hostId = hosts[0].m_hostId; sb.safePrintf("<br><br>docId on host #%li and twins.",hostId); sb.safePrintf ( "\n</center>" ); mdelete ( st , sizeof(State4) , "PageTitledb"); delete (st); // erase g_errno for sending g_errno = 0; // now encapsulate it in html head/tail and send it off return g_httpServer.sendDynamicPage ( s , sb.getBufStart(), sb.length() ); } // print docId in box sb.safePrintf ("<br>\n" "<center>Enter docId: " "<input type=text name=d value=%lli size=15>", docId ); // print where it should be //unsigned long gid = getGroupIdFromDocId ( docId ); //Host *hosts = g_hostdb.getGroup(gid); long shardNum = getShardNumFromDocId ( docId ); Host *hosts = g_hostdb.getShard ( shardNum ); long hostId = -1; if ( hosts ) hostId = hosts[0].m_hostId; sb.safePrintf("<br><br>docId on host #%li and twins.",hostId); sb.safePrintf ( "</form><br>\n" ); //char *coll = st->m_coll; Title *ti = xd->getTitle(); if ( ! ti ) { log ( "admin: Could not set title" ); return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno)); } // sanity check. should not block if ( ! xd->m_titleValid ) { char *xx=NULL;*xx=0; } // print it out xd->printDoc ( &sb ); // don't forget to cleanup mdelete ( st , sizeof(State4) , "PageTitledb"); delete (st); // now encapsulate it in html head/tail and send it off return g_httpServer.sendDynamicPage (s, sb.getBufStart(), sb.length());}
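// gotTitleRec() above reports which host should hold a docId by
// mapping the docId to a shard and taking that shard's first host. A
// standalone sketch of one plausible mapping (modulo over the high
// bits); the real getShardNumFromDocId() may differ, so treat this as
// an assumption for illustration only.
#include <cstdint>

static int32_t shardFromDocId ( int64_t docId , int32_t numShards ) {
	// use the high-order bits so nearby docids spread across shards
	uint64_t h = (uint64_t)docId >> 14;
	return (int32_t)( h % (uint64_t)numShards );
}

// usage:
// long shardNum = shardFromDocId ( docId , numShards );
// Host *hosts = shardTable[shardNum]; // first host answers for shard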