bool loadUrls ( ) { static bool s_loaded = false; if ( s_loaded ) return true; s_loaded = true; // use injectme3 file s_ubuf1.load("./injectme3"); // scan for +++URL: xxxxx char *s = s_ubuf1.getBufStart(); for ( ; *s ; s++ ) { if ( strncmp(s,"+++URL: ",8) ) continue; // got one // \0 term it for s_contentPtrs below *s = '\0'; // find end of it s += 8; char *e = s; for ( ; *e && ! is_wspace_a(*e); e++ ); // null term it if ( *e ) *e = '\0'; // store ptr s_ubuf2.pushLong((long)s); // skip past that s = e; // point to content s_cbuf2.pushLong((long)(s+1)); } // make array of url ptrs s_urlPtrs = (char **)s_ubuf2.getBufStart(); s_contentPtrs= (char **)s_cbuf2.getBufStart(); return true; }
bool Msg3a::gotAllSplitReplies ( ) { // if any of the split requests had an error, give up and set m_errno // but don't set if for non critical errors like query truncation if ( m_errno ) { g_errno = m_errno; return true; } // also reset the finalbuf and the oldNumTopDocIds if ( m_finalBuf ) { mfree ( m_finalBuf, m_finalBufSize, "Msg3aF" ); m_finalBuf = NULL; m_finalBufSize = 0; } // update our estimated total hits m_numTotalEstimatedHits = 0; for ( long i = 0; i < m_numHosts ; i++ ) { // get that host that gave us the reply //Host *h = g_hostdb.getHost(i); // . get the reply from multicast // . multicast should have destroyed all slots, but saved reply // . we are responsible for freeing the reply // . we need to call this even if g_errno or m_errno is // set so we can free the replies in Msg3a::reset() // . if we don't call getBestReply() on it multicast should // free it, because Multicast::m_ownReadBuf is still true Multicast *m = &m_mcast[i]; bool freeit = false; long replySize = 0; long replyMaxSize; char *rbuf; Msg39Reply *mr; // . only get it if the reply not already full // . if reply already processed, skip // . perhaps it had no more docids to give us or all termlists // were exhausted on its disk and this is a re-call // . we have to re-process it for count m_numTotalEstHits, etc. rbuf = m->getBestReply ( &replySize , &replyMaxSize , &freeit , true ); //stealIt? // cast it mr = (Msg39Reply *)rbuf; // in case of mem leak, re-label from "mcast" to this so we // can determine where it came from, "Msg3a-GBR" relabel( rbuf, replyMaxSize , "Msg3a-GBR" ); // . we must be able to free it... we must own it // . this is true if we should free it, but we should not have // to free it since it is owned by the slot? if ( freeit ) { log(LOG_LOGIC,"query: msg3a: Steal failed."); char *xx = NULL; *xx=0; } // bad reply? if ( ! mr ) { log(LOG_LOGIC,"query: msg3a: Bad NULL reply."); m_reply [i] = NULL; m_replyMaxSize[i] = 0; // it might have been timd out, just ignore it!! continue; // if size is 0 it can be Msg39 giving us an error! g_errno = EBADREPLYSIZE; m_errno = EBADREPLYSIZE; // all reply buffers should be freed on reset() return true; } // how did this happen? if ( replySize < 29 && ! mr->m_errno ) { // if size is 0 it can be Msg39 giving us an error! g_errno = EBADREPLYSIZE; m_errno = EBADREPLYSIZE; log(LOG_LOGIC,"query: msg3a: Bad reply size of %li.", replySize); // all reply buffers should be freed on reset() return true; } // can this be non-null? we shouldn't be overwriting one // without freeing it... if ( m_reply[i] ) // note the mem leak now log("query: mem leaking a 0x39 reply"); // cast it and set it m_reply [i] = mr; m_replyMaxSize[i] = replyMaxSize; // deserialize it (just sets the ptr_ and size_ member vars) //mr->deserialize ( ); deserializeMsg ( sizeof(Msg39Reply) , &mr->size_docIds, &mr->size_clusterRecs, &mr->ptr_docIds, mr->m_buf ); // sanity check if ( mr->m_nqt != m_q->getNumTerms() ) { g_errno = EBADREPLY; m_errno = EBADREPLY; log("query: msg3a: Split reply qterms=%li != %li.", (long)mr->m_nqt,(long)m_q->getNumTerms() ); return true; } // return if split had an error, but not for a non-critical // error like query truncation if ( mr->m_errno && mr->m_errno != EQUERYTRUNCATED ) { g_errno = mr->m_errno; m_errno = mr->m_errno; log("query: msg3a: Split had error: %s", mstrerror(g_errno)); return true; } // skip down here if reply was already set //skip: // add of the total hits from each split, this is how many // total results the lastest split is estimated to be able to // return // . THIS should now be exact since we read all termlists // of posdb... m_numTotalEstimatedHits += mr->m_estimatedHits; // debug log stuff if ( ! m_debug ) continue; // cast these for printing out long long *docIds = (long long *)mr->ptr_docIds; score_t *scores = (score_t *)mr->ptr_scores; // print out every docid in this split reply for ( long j = 0; j < mr->m_numDocIds ; j++ ) { // print out score_t logf( LOG_DEBUG, "query: msg3a: [%lu] %03li) " "split=%li docId=%012llu domHash=0x%02lx " "score=%lu" , (unsigned long)this , j , i , docIds [j] , (long)g_titledb.getDomHash8FromDocId(docIds[j]), (long)scores[j] ); } } // this seems to always return true! mergeLists ( ); if ( ! m_r->m_useSeoResultsCache ) return true; // now cache the reply SafeBuf cr; long dataSize = 4 + 4 + 4 + m_numDocIds * (8+4+4); long need = sizeof(key_t) + 4 + dataSize; bool status = cr.reserve ( need ); // sanity if ( ( m_ckey.n0 & 0x01 ) == 0x00 ) { char *xx=NULL; *xx=0; } // ignore errors g_errno = 0; // return on error with g_errno cleared if cache add failed if ( ! status ) return true; // add to buf otherwise cr.safeMemcpy ( &m_ckey , sizeof(key_t) ); cr.safeMemcpy ( &dataSize , 4 ); long now = getTimeGlobal(); cr.pushLong ( now ); cr.pushLong ( m_numDocIds ); cr.pushLong ( m_numTotalEstimatedHits );//Results ); long max = m_numDocIds; // then the docids for ( long i = 0 ; i < max ; i++ ) cr.pushLongLong(m_docIds[i] ); for ( long i = 0 ; i < max ; i++ ) cr.pushFloat(m_scores[i]); for ( long i = 0 ; i < max ; i++ ) cr.pushLong(getSiteHash26(i)); // sanity if ( cr.length() != need ) { char *xx=NULL; *xx=0; } // make these key_t startKey; key_t endKey; startKey = m_ckey; // clear delbit startKey.n0 &= 0xfffffffffffffffeLL; // end key is us endKey = m_ckey; // that is the single record m_seoCacheList.set ( cr.getBufStart() , cr.length(), cr.getBufStart(), // alloc cr.getCapacity(), // alloc size (char *)&startKey, (char *)&endKey, -1, // fixeddatasize true, // owndata? false,// use half keys? sizeof(key_t) ); // do not allow cr to free it, msg1 will cr.detachBuf(); // note it //log("seopipe: storing ckey=%s q=%s" // ,KEYSTR(&m_ckey,12) // ,m_r->ptr_query // ); //log("msg1: sending niceness=%li",(long)m_r->m_niceness); // this will often block, but who cares!? it just sends a request off if ( ! m_msg1.addList ( &m_seoCacheList , RDB_SERPDB,//RDB_CACHEDB, m_r->ptr_coll, this, // state gotSerpdbReplyWrapper, // callback false, // forcelocal? m_r->m_niceness ) ) { //log("blocked"); return false; } // we can safely delete m_msg17... just return true return true; }
// // . ENTRY POINT FOR IMPORTING TITLEDB RECS FROM ANOTHER CLUSTER // . when user clicks 'begin' in import page we come here.. // . so when that parm changes in Parms.cpp we sense that and call // beginImport(CollectionRec *cr) // . or on startup we call resumeImports to check each coll for // an import in progress. // . search for files named titledb*.dat // . if none found just return // . when msg7 inject competes it calls this // . call this from sleep wrapper in Process.cpp // . returns false if would block (outstanding injects), true otherwise // . sets g_errno on error bool ImportState::importLoop ( ) { CollectionRec *cr = g_collectiondb.getRec ( m_collnum ); if ( ! cr || g_hostdb.m_hostId != 0 ) { // if coll was deleted! log("import: collnum %li deleted while importing into", (long)m_collnum); //if ( m_numOut > m_numIn ) return true; // delete the entire import state i guess // what happens if we have a msg7 reply come back in? // it should see the collrec is NULL and just fail. mdelete ( this, sizeof(ImportState) , "impstate"); delete (this); return true; } INJECTLOOP: // stop if waiting on outstanding injects long long out = m_numOut - m_numIn; if ( out >= cr->m_numImportInjects ) { g_errno = 0; return false; } if ( ! cr->m_importEnabled ) { // wait for all to return if ( out > 0 ) return false; // then delete it log("import: collnum %li import loop disabled", (long)m_collnum); mdelete ( this, sizeof(ImportState) , "impstate"); delete (this); return true; } // scan each titledb file scanning titledb0001.dat first, // titledb0003.dat second etc. //long long offset = -1; // . when offset is too big for current m_bigFile file then // we go to the next and set offset to 0. // . sets m_bf and m_fileOffset if ( ! setCurrentTitleFileAndOffset ( ) ) {//cr , -1 ); log("import: import: no files to read"); //goto INJECTLOOP; return true; } // this is -1 if none remain! if ( m_fileOffset == -1 ) { log("import: import fileoffset is -1. done."); return true; } long long saved = m_fileOffset; //Msg7 *msg7; //GigablastRequest *gr; //SafeBuf *sbuf = NULL; long need = 12; long dataSize = -1; //XmlDoc xd; key_t tkey; bool status; SafeBuf tmp; SafeBuf *sbuf = &tmp; long long docId; long shardNum; long key; Multicast *mcast; char *req; long reqSize; if ( m_fileOffset >= m_bfFileSize ) { log("inject: import: done processing file %li %s", m_bfFileId,m_bf.getFilename()); goto nextFile; } // read in title rec key and data size status = m_bf.read ( &tkey, sizeof(key_t) , m_fileOffset ); //if ( n != 12 ) goto nextFile; if ( g_errno ) { log("inject: import: reading file error: %s. advancing " "to next file",mstrerror(g_errno)); goto nextFile; } m_fileOffset += 12; // if negative key, skip if ( (tkey.n0 & 0x01) == 0 ) { goto INJECTLOOP; } // if non-negative then read in size status = m_bf.read ( &dataSize , 4 , m_fileOffset ); if ( g_errno ) { log("main: failed to read in title rec " "file. %s. Skipping file %s", mstrerror(g_errno),m_bf.getFilename()); goto nextFile; } m_fileOffset += 4; need += 4; need += dataSize; need += 4; // collnum, first 4 bytes if ( dataSize < 0 || dataSize > 500000000 ) { log("main: could not scan in titledb rec of " "corrupt dataSize of %li. BAILING ENTIRE " "SCAN of file %s",dataSize,m_bf.getFilename()); goto nextFile; } //gr = &msg7->m_gr; //XmlDoc *xd = getAvailXmlDoc(); //msg7 = getAvailMsg7(); mcast = getAvailMulticast(); // if none, must have to wait for some to come back to us if ( ! mcast ) { // restore file offset //m_fileOffset = saved; // no, must have been a oom or something log("import: import no mcast available"); return true;//false; } // this is for holding a compressed titlerec //sbuf = &mcast->m_sbuf;//&gr->m_sbuf; // point to start of buf sbuf->reset(); // ensure we have enough room sbuf->reserve ( need ); // collnum first 4 bytes sbuf->pushLong( (long)m_collnum ); // store title key sbuf->safeMemcpy ( &tkey , sizeof(key_t) ); // then datasize if any. neg rec will have -1 datasize if ( dataSize >= 0 ) sbuf->pushLong ( dataSize ); // then read data rec itself into it, compressed titlerec part if ( dataSize > 0 ) { // read in the titlerec after the key/datasize status = m_bf.read ( sbuf->getBuf() , dataSize , m_fileOffset ); if ( g_errno ) { // n != dataSize ) { log("main: failed to read in title rec " "file. %s. Skipping file %s", mstrerror(g_errno),m_bf.getFilename()); // essentially free up this msg7 now //msg7->m_inUse = false; //msg7->reset(); goto nextFile; } // advance m_fileOffset += dataSize; // it's good, count it sbuf->m_length += dataSize; } // set xmldoc from the title rec //xd->set ( sbuf.getBufStart() ); //xd->m_masterState = NULL; //xd->m_masterCallback ( titledbInjectLoop ); // we use this so we know where the doc we are injecting // was in the foregien titledb file. so we can update our bookmark // code. mcast->m_hackFileOff = saved;//m_fileOffset; mcast->m_hackFileId = m_bfFileId; // // inject a title rec buf this time, we are doing an import // FROM A TITLEDB FILE!!! // //gr->m_titleRecBuf = &sbuf; // break it down into gw // xd.set2 ( sbuf.getBufStart() , // sbuf.length() , // max size // cr->m_coll, // use our coll // NULL , // pbuf for page parser // 1 , // niceness // NULL ); //sreq ); // // note it // log("import: importing %s",xd.m_firstUrl.getUrl()); // now we can set gr for the injection // TODO: inject the whole "sbuf" so we get sitenuminlinks etc // all exactly the same... // gr->m_url = xd.getFirstUrl()->getUrl(); // gr->m_queryToScrape = NULL; // gr->m_contentDelim = 0; // gr->m_contentTypeStr = g_contentTypeStrings [xd.m_contentType]; // gr->m_contentFile = NULL; // gr->m_content = xd.ptr_utf8Content; // gr->m_diffbotReply = NULL; // gr->m_injectLinks = false; // gr->m_spiderLinks = true; // gr->m_shortReply = false; // gr->m_newOnly = false; // gr->m_deleteUrl = false; // gr->m_recycle = true; // recycle content? or sitelinks? // gr->m_dedup = false; // gr->m_hasMime = false; // gr->m_doConsistencyTesting = false; // gr->m_getSections = false; // gr->m_gotSections = false; // gr->m_charset = xd.m_charset; // gr->m_hopCount = xd.m_hopCount; // // point to next doc in the titledb file // //m_fileOffset += need; // get docid from key docId = g_titledb.getDocIdFromKey ( &tkey ); // get shard that holds the titlerec for it shardNum = g_hostdb.getShardNumFromDocId ( docId ); // for selecting which host in the shard receives it key = (long)docId; m_numOut++; // then index it. master callback will be called //if ( ! xd->index() ) return false; // TODO: make this forward the request to an appropriate host!! // . gr->m_sbuf is set to the titlerec so this should handle that // and use XmlDoc::set4() or whatever // if ( msg7->injectTitleRec ( msg7 , // state // gotMsg7ReplyWrapper , // callback // cr )) { // // it didn't block somehow... // msg7->m_inUse = false; // msg7->gotMsg7Reply(); // } req = sbuf->getBufStart(); reqSize = sbuf->length(); if ( reqSize != need ) { char *xx=NULL;*xx=0 ; } // do not free it, let multicast free it after sending it sbuf->detachBuf(); if ( ! mcast->send ( req , reqSize , 0x07 , true , // ownmsg? shardNum, false, // send to whole shard? key , // for selecting host in shard mcast , // state NULL , // state2 gotMulticastReplyWrapper , 999999 ) ) { // total timeout in seconds log("import: import mcast had error: %s",mstrerror(g_errno)); m_numIn++; } goto INJECTLOOP; nextFile: // invalidate this flag //m_offIsValid = false; // . and call this function. we add one to m_bfFileId so we // do not re-get the file we just injected. // . sets m_bf and m_fileOffset // . returns false if nothing to read if ( ! setCurrentTitleFileAndOffset ( ) ) { //cr , m_bfFileId+1 ); log("import: import: no files left to read"); //goto INJECTLOOP; return true; } // if it returns NULL we are done! log("main: titledb injection loop completed. waiting for " "outstanding injects to return."); if ( m_numOut > m_numIn ) return false; log("main: all injects have returned. DONE."); // dummy return return true; }