void Blaster::gotDoc2 ( void *state , TcpSocket *s ) {
        StateBD *st = (StateBD *)state;
        // bail if we got cut off
        if ( s->m_readOffset == 0 ) {
                log("blaster: Lost the Request in gotDoc2");
                m_launched--;
                //No need to point p2
                // free the StateBD
                freeStateBD(st);
                return;
        }

        // . don't let TcpServer free m_buf when socket is recycled/closed
        // . we own it now and are responsible for freeing it
        //s->m_readBuf = NULL;

        long long now = gettimeofdayInMilliseconds();

        // now that we have both docs, parse out their mimes and contents
        char *reply1 = st->m_buf1;
        long  size1  = st->m_buf1Len;
        HttpMime mime1;
        mime1.set ( reply1 , size1 , NULL );
        char *content1    = reply1 + mime1.getMimeLen();
        long  content1Len = size1  - mime1.getMimeLen();
        unsigned long h = hash32 ( content1 , content1Len );
        // log msg
        if ( g_errno )
                logf(LOG_INFO,"blaster: got doc (%li) (%li ms) %s : %s",
                     s->m_readOffset ,
                     (long)(now - s->m_startTime) ,
                     st->m_u2 ,
                     mstrerror(g_errno) );
        else
                logf(LOG_INFO,"blaster: got doc (%li) (%li ms) "
                     "(hash=%lx) %s",
                     s->m_readOffset ,
                     (long)(now - s->m_startTime) ,
                     h ,
                     st->m_u2 );
        if ( m_verbose ) {
                log(LOG_WARN,"blaster: content1Len=%li, content1=%s",
                    content1Len,content1);
                log(LOG_WARN,"\n");
        }

        char *reply2 = s->m_readBuf;
        long  size2  = s->m_readOffset;
        HttpMime mime2;
        mime2.set ( reply2 , size2 , NULL );
        char *content2    = reply2 + mime2.getMimeLen();
        long  content2Len = size2  - mime2.getMimeLen();
        if ( m_verbose )
                log(LOG_WARN,"blaster: content2Len=%li, content2=%s",
                    content2Len,content2);

        // now that we have both contents, get the url links out of
        // these pages. (getSearchLinks() used to pull out the first x
        // links found.)
        /*
        st->m_links1 = (char *)mmalloc(200*MAX_URL_LEN,"Blaster3");
        st->m_links2 = st->m_links1 + 100*MAX_URL_LEN;
        st->m_numLinks1 = 100;
        st->m_numLinks2 = 100;
        */
        /*
        long numLinks1 = getSearchLinks(content1,content1Len,
                                        st->m_links1,st->m_numLinks1);
        long numLinks2 = getSearchLinks(content2,content2Len,
                                        st->m_links2,st->m_numLinks2);
        */

        content1[content1Len] = '\0';
        //short csEnum1 = get_iana_charset(mime1.getCharset(),
        //                                 mime1.getCharsetLen());
        /*
        if (csEnum1 == csUnknown)
                log(LOG_DEBUG, "blaster: Unknown charset : %s",
                    mime2.getCharset());
        */
        Xml xml1;
        // assume utf8
        if ( ! xml1.set ( content1 ,
                          content1Len ,
                          false ,
                          0 ,
                          false ,
                          TITLEREC_CURRENT_VERSION ) ) {
                log(LOG_WARN,"blaster: Couldn't set XML1 Class in gotDoc2");
        }
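        // The comparison below is a set difference computed with 32-bit
        // hashes: every outlink of doc2 is hashed into a table, then
        // every outlink of doc1 probes that table, and any probe that
        // misses is treated as a url that doc2 lacks. A hash collision
        // between two different urls would mask a genuinely missing url,
        // which is an accepted risk for a testing tool like blaster.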
        Links links1;
        Url parent;
        parent.set ( st->m_u1 );
        if ( ! links1.set ( false , // useRelNoFollow
                            &xml1 ,
                            &parent , // parent url (mime1.getLocationUrl())
                            false , // setLinkHashes
                            NULL ,  // baseUrl
                            TITLEREC_CURRENT_VERSION , // version
                            0 ,     // niceness
                            false , // parent is permalink?
                            NULL ) ) { // oldLinks
                log(LOG_WARN,"blaster: Couldn't set Links Class in gotDoc2");
        }

        content2[content2Len] = '\0';
        //short csEnum2 = get_iana_charset(mime2.getCharset(),
        //                                 mime2.getCharsetLen());
        /*
        if (csEnum2 == csUnknown)
                log(LOG_DEBUG, "blaster: Unknown charset : %s",
                    mime2.getCharset());
        */
        Xml xml2;
        if ( ! xml2.set ( content2 ,
                          content2Len ,
                          false ,
                          0 ,
                          false ,
                          TITLEREC_CURRENT_VERSION ) ) {
                log(LOG_WARN,"blaster: Couldn't set XML2 Class in gotDoc2");
        }
        Links links2;
        parent.set ( st->m_u2 );
        if ( ! links2.set ( false , // useRelNoFollow
                            &xml2 ,
                            &parent , // parent url (mime2.getLocationUrl())
                            false , // setLinkHashes
                            NULL ,  // baseUrl
                            TITLEREC_CURRENT_VERSION , // version
                            0 ,     // niceness
                            false , // parent is permalink?
                            NULL ) ) { // oldLinks
                log(LOG_WARN,"blaster: Couldn't set links2 Class in gotDoc2");
        }

        // put the hashes of doc2's links into a hash table, since we
        // only have about a hundred or so of them
        HashTableT<unsigned long, bool> urlHash;
        // put the urls from doc2 into the hash table, but first skip
        // links that point back to google or gigablast (for now). for
        // msn and yahoo we have to add other checks.
        char domain2[256];
        long dlen = 0;
        char *dom = getDomFast ( st->m_u2 , &dlen );
        // clamp so the strncpy below cannot overflow domain2
        if ( dlen > 255 ) dlen = 255;
        if ( dom ) strncpy ( domain2 , dom , dlen );
        domain2[dlen] = '\0';
        for ( long i = 0 ; i < links2.getNumLinks() ; i++ ) {
                // skip any link that points back into a search engine
                // itself (result caches, archive copies, etc.)
                char *ss = links2.getLink(i);
                char *p;
                p = strstr(ss,domain2);
                if (p) continue;
                p = strstr(ss,"google.");
                if (p) continue;
                p = strstr(ss,"cache:"); // google's cache page
                if (p) continue;
                p = strstr(ss,"gigablast.");
                if (p) continue;
                p = strstr(ss,"web.archive.org"); // older copies on gigablast
                if (p) continue;
                p = strstr(ss,"search.yahoo.com"); // from gigablast search
                if (p) continue;
                p = strstr(ss,"search.msn.com"); // from gigablast search
                if (p) continue;
                p = strstr(ss,"s.teoma.com"); // from gigablast search
                if (p) continue;
                p = strstr(ss,"search.dmoz.org"); // from gigablast search
                if (p) continue;
                p = strstr(ss,"www.answers.com"); // from gigablast search
                if (p) continue;
                p = strstr(ss,"cc.msncache.com"); // msn's cache page
                if (p) continue;

                if ( m_verbose )
                        log(LOG_WARN,"blaster: link in Doc2=%s",
                            links2.getLink(i));
                unsigned long h = hash32Lower_a ( links2.getLink(i) ,
                                                  links2.getLinkLen(i) );
                // no need to check for a conflict here: a dup link just
                // maps to the same slot
                urlHash.addKey(h,1);
        }

        // now check if the urls from doc1 are in doc2. save the ones
        // that are not in there for later.
        /* long numUrlsToCheck = links2.getNumLinks(); */
        long numUrlsNotFound = 0;
        /*
        if (numLinks1 < numUrlsToCheck)
                numUrlsToCheck = numLinks1;
        */
        char domain1[256];
        dlen = 0;
        dom = getDomFast ( st->m_u1 , &dlen );
        // clamp so the strncpy below cannot overflow domain1
        if ( dlen > 255 ) dlen = 255;
        if ( dom ) strncpy ( domain1 , dom , dlen );
        domain1[dlen] = '\0';
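        // Probe loop: the same engine/cache blacklist used above is
        // applied to doc1's links, so engine-internal urls never count
        // as content links. Only a link that survives the blacklist AND
        // misses in urlHash is treated as "in doc1 but not in doc2" and
        // fetched below.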
        // this must be initialized once per pair of docs, before the
        // fetch loop, or the error count below is wrong
        st->m_numUrlDocsReceived = 0;
        for ( long i = 0 ; i < links1.getNumLinks() ; i++ ) {
                char *ss = links1.getLink(i);
                char *p;
                p = strstr(ss,domain1);
                if (p) continue;
                p = strstr(ss,"google.");
                if (p) continue;
                p = strstr(ss,"cache:"); // google's cache page
                if (p) continue;
                p = strstr(ss,"gigablast.");
                if (p) continue;
                p = strstr(ss,"web.archive.org"); // older copies on gigablast
                if (p) continue;
                p = strstr(ss,"search.yahoo.com"); // from gigablast search
                if (p) continue;
                p = strstr(ss,"search.msn.com"); // from gigablast search
                if (p) continue;
                p = strstr(ss,"s.teoma.com"); // from gigablast search
                if (p) continue;
                p = strstr(ss,"search.dmoz.org"); // from gigablast search
                if (p) continue;
                p = strstr(ss,"www.answers.com"); // from gigablast search
                if (p) continue;
                p = strstr(ss,"cc.msncache.com"); // msn's cache page
                if (p) continue;

                if ( m_verbose )
                        log(LOG_WARN,"blaster: link in Doc1=%s",
                            links1.getLink(i));
                unsigned long h = hash32Lower_a ( links1.getLink(i) ,
                                                  links1.getLinkLen(i) );
                long slot = urlHash.getSlot(h);
                if ( slot != -1 ) continue;

                // the url is not present in doc2, so get its doc
                if ( m_verbose || m_justDisplay )
                        log(LOG_WARN,"blaster: NOT FOUND %s in %s",
                            links1.getLink(i),domain2);
                numUrlsNotFound++;
                // don't do anything else if we just have to display
                // the urls
                if ( m_justDisplay ) continue;

                // now get the doc for this url
                StateBD2 *st2;
                try { st2 = new (StateBD2); }
                catch ( ... ) {
                        g_errno = ENOMEM;
                        log("blaster: Failed. "
                            "Could not allocate %li bytes for query. "
                            "Returning HTTP status of 500.",
                            (long)sizeof(StateBD2));
                        return;
                }
                mnew ( st2 , sizeof(StateBD2) , "Blaster4" );
                // point to the big state
                st2->m_st = st;
                // Msg16 does 6 redirects, so we do 6 too
                st2->m_numRedirects = 6;
                //st2->m_url.set(links1.getLink(i),links1.getLinkLen(i));
                st2->m_url = links1.getLink(i);
                // no need for a proxy ip here, since we are fetching
                // docs from different IPs. faster this way.
                bool status = g_httpServer.getDoc ( st2->m_url , // url
                                                    0 ,  // ip
                                                    0 ,  // offset
                                                    -1 , // size
                                                    0 ,  // ifModifiedSince
                                                    st2 , // state
                                                    gotDocWrapper3 , // callback
                                                    60*1000 , // timeout
                                                    0 , // proxy ip
                                                    0 , // proxy port
                                                    30*1024*1024 ,   // maxLen
                                                    30*1024*1024 ); // maxOtherLen
                // continue if it blocked
                if ( ! status ) continue;
                // if it did not block, there was an error
                st->m_numUrlDocsReceived++;
        }
        st->m_numUrlDocsSent = numUrlsNotFound;
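        // Bookkeeping sketch (assuming gotDocWrapper3 follows the usual
        // callback convention in this file): m_numUrlDocsSent is how
        // many fetches we tried to launch, and m_numUrlDocsReceived was
        // bumped above once per fetch that failed synchronously; the
        // callback is expected to bump it again as each outstanding
        // fetch completes, so the pair is finished when received
        // catches up with sent.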
        // there might have been errors while sending the docs, so
        // check for that here
        if ( st->m_numUrlDocsReceived > 0 &&
             st->m_numUrlDocsReceived <= st->m_numUrlDocsSent ) {
                log(LOG_WARN,"blaster: %li docs could not be sent due "
                    "to error",st->m_numUrlDocsReceived);
                m_launched--;
                freeStateBD(st);
                return;
        }

        if ( numUrlsNotFound == 0 ) {
                // job done for this pair
                log(LOG_WARN,"blaster: All urls from %s found in %s",
                    domain1,domain2);
                m_launched--;
                // free the StateBD
                freeStateBD(st);
                return;
        }

        log(LOG_WARN,"blaster: %li urls from %s Not found in %s",
            numUrlsNotFound,domain1,domain2);

        if ( m_justDisplay ) {
                m_launched--;
                // free the StateBD
                freeStateBD(st);
        }
        return;
}

// . merge all the replies together
// . put final merged docids into m_docIds[],m_bitScores[],m_scores[],...
// . this calls Msg51 to get cluster levels when done merging
// . Msg51 remembers clusterRecs from previous call to avoid repeating
//   lookups
// . returns false if blocked, true otherwise
// . sets g_errno and returns true on error
bool Msg3a::mergeLists ( ) {

        // time how long the merge takes
        if ( m_debug ) {
                logf( LOG_DEBUG, "query: msg3a: --- Final DocIds --- " );
                m_startTime = gettimeofdayInMilliseconds();
        }

        // reset our final docids count here in case we are a re-call
        m_numDocIds = 0;
        // a secondary count, how many unique docids we scanned, and
        // not necessarily added to the m_docIds[] array
        //m_totalDocCount = 0; // long docCount = 0;
        m_moreDocIdsAvail = true;

        // shortcut
        //long numSplits = m_numHosts; //indexdbSplit;

        // . point to the various docids, etc. in each split reply
        // . tcPtr = term count. how many required query terms does the
        //   doc have? formerly called topExplicits in IndexTable2.cpp
        long long *diPtr [MAX_INDEXDB_SPLIT];
        float     *rsPtr [MAX_INDEXDB_SPLIT];
        key_t     *ksPtr [MAX_INDEXDB_SPLIT];
        long long *diEnd [MAX_INDEXDB_SPLIT];
        for ( long j = 0 ; j < m_numHosts ; j++ ) {
                Msg39Reply *mr = m_reply[j];
                // if we have gbdocid:| in the query this could be NULL
                if ( ! mr ) {
                        diPtr[j] = NULL;
                        diEnd[j] = NULL;
                        rsPtr[j] = NULL;
                        ksPtr[j] = NULL;
                        continue;
                }
                diPtr [j] = (long long *)mr->ptr_docIds;
                rsPtr [j] = (float     *)mr->ptr_scores;
                ksPtr [j] = (key_t     *)mr->ptr_clusterRecs;
                diEnd [j] = (long long *)(mr->ptr_docIds +
                                          mr->m_numDocIds * 8);
        }

        // clear if we had it
        if ( m_finalBuf ) {
                mfree ( m_finalBuf , m_finalBufSize , "Msg3aF" );
                m_finalBuf     = NULL;
                m_finalBufSize = 0;
        }

        //
        // HACK: START section stats merge
        //
        m_sectionStats.reset();
        long sneed = 0;
        for ( long j = 0 ; j < m_numHosts ; j++ ) {
                Msg39Reply *mr = m_reply[j];
                if ( ! mr ) continue;
                sneed += mr->size_siteHashList/4;
        }
        HashTableX dt;
        //char tmpBuf[5000];
        if ( sneed && ! dt.set ( 4,0,sneed,NULL,0,false,
                                 m_r->m_niceness,"uniqsit") )
                return true;
        for ( long j = 0 ; sneed && j < m_numHosts ; j++ ) {
                Msg39Reply *mr = m_reply[j];
                if ( ! mr ) continue;
                SectionStats *src = &mr->m_sectionStats;
                SectionStats *dst = &m_sectionStats;
                dst->m_onSiteDocIds  += src->m_onSiteDocIds;
                dst->m_offSiteDocIds += src->m_offSiteDocIds;
                // now the list should be the unique site hashes that
                // had the section hash. we need to uniquify them again
                // here.
                long *p  = (long *)mr->ptr_siteHashList;
                long  np = mr->size_siteHashList / 4;
                // hash them up, no dups!
                for ( long k = 0 ; k < np ; k++ )
                        dt.addKey(&p[k]);
                // update our count based on that
                dst->m_numUniqueSites = dt.getNumSlotsUsed();
        }
        if ( m_r->m_getSectionStats ) return true;
        //
        // HACK: END section stats merge
        //

        // sanity check
        if ( m_docsToGet <= 0 ) { char *xx=NULL; *xx=0; }

        // . how much do we need to store final merged docids, etc.?
        // . docid=8 score=4 bitScore=1 clusterRecs=key_t clusterLevels=1
        long need = m_docsToGet * (8+4+sizeof(key_t)+
                                   sizeof(DocIdScore *)+1);
        // allocate it
        m_finalBuf     = (char *)mmalloc ( need , "finalBuf" );
        m_finalBufSize = need;
        // g_errno should be set if this fails
        if ( ! m_finalBuf ) return true;
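        // m_finalBuf layout after the carve-up below, back to back, one
        // entry per docid for m_docsToGet docids:
        //
        //   m_docIds        : 8 bytes (long long)
        //   m_scores        : sizeof(float)
        //   m_clusterRecs   : sizeof(key_t)
        //   m_clusterLevels : 1 byte
        //   m_scoreInfos    : sizeof(DocIdScore *)
        //
        // this is why "need" above has to match the pointer arithmetic
        // below exactly, which the p != pend sanity check enforces.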
        // hook into it
        char *p = m_finalBuf;
        m_docIds        = (long long *)p;
        p += m_docsToGet * 8;
        m_scores        = (float *)p;
        p += m_docsToGet * sizeof(float);
        m_clusterRecs   = (key_t *)p;
        p += m_docsToGet * sizeof(key_t);
        m_clusterLevels = (char *)p;
        p += m_docsToGet * 1;
        m_scoreInfos    = (DocIdScore **)p;
        p += m_docsToGet * sizeof(DocIdScore *);
        // sanity check
        char *pend = m_finalBuf + need;
        if ( p != pend ) { char *xx = NULL; *xx = 0; }

        // . now allocate for the hash table
        // . get at least twice as many slots as docids
        HashTableT<long long,char> htable;
        // returns false and sets g_errno on error
        if ( ! htable.set ( m_docsToGet * 2 ) ) return true;
        // hash table for doing site clustering, provided we are fully
        // split and we got the site recs now
        HashTableT<long long,long> htable2;
        if ( m_r->m_doSiteClustering &&
             ! htable2.set ( m_docsToGet * 2 ) )
                return true;

        //
        // ***MERGE ALL SPLITS INTO m_docIds[], etc.***
        //
        // . merge all lists in m_replyDocIds[splitNum]
        // . we may be re-called later after m_docsToGet is increased
        //   if too many docids were clustered/filtered out after the
        //   call to Msg51.
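        // The loop below is a k-way merge: each split reply arrives
        // sorted by score, so on every pass we scan the k head pointers,
        // pick the best head (highest score, ties broken by lower
        // docid), emit it if its docid has not been seen yet, then
        // advance that split's pointers. Cost is O(totalDocIds * k),
        // which is fine for the small k (number of hosts) involved here.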
 mergeLoop:
        // the winning docid will be diPtr[maxj]
        long maxj = -1;
        //Msg39Reply *mr;
        long hslot;
        // get the next highest-scoring docid from all the split lists
        for ( long j = 0 ; j < m_numHosts ; j++ ) {
                // . skip exhausted lists
                // . these both should be NULL if the reply was skipped
                //   because we did a gbdocid:| query
                if ( diPtr[j] >= diEnd[j] ) continue;
                // compare the score
                if ( maxj == -1 ) { maxj = j; continue; }
                if ( *rsPtr[j] < *rsPtr[maxj] ) continue;
                if ( *rsPtr[j] > *rsPtr[maxj] ) { maxj = j; continue; }
                // prefer lower docids on top
                if ( *diPtr[j] < *diPtr[maxj] ) { maxj = j; continue; }
        }

        if ( maxj == -1 ) {
                m_moreDocIdsAvail = false;
                goto doneMerge;
        }

        // only do this logic if we have clusterdb recs included
        if ( m_r->m_doSiteClustering &&
             // if the clusterLevel was set to CR_*errorCode* then this
             // key will be 0, so in that case it might have been a not
             // found or whatever, so let it through regardless
             ksPtr[maxj]->n0 != 0LL &&
             ksPtr[maxj]->n1 != 0 ) {
                // get the hostname hash, a long long
                long long sh =
                        g_clusterdb.getSiteHash26 ((char *)ksPtr[maxj]);
                // do we have enough from this hostname already?
                long slot = htable2.getSlot ( sh );
                // if this hostname is already visible, do not
                // over-display it...
                if ( slot >= 0 ) {
                        // get the count
                        long val = htable2.getValueFromSlot ( slot );
                        // . if already 2 or more, give up
                        // . if the site hash is 0, that usually means a
                        //   "not found" in clusterdb, and the
                        //   accompanying cluster level would be set as
                        //   such, but since we did not copy the cluster
                        //   levels over in the merge algo above, we
                        //   don't know for sure... cluster recs are set
                        //   to 0 in the Msg39.cpp clustering.
                        if ( sh && val >= 2 ) goto skip;
                        // inc the count
                        val++;
                        // store it
                        htable2.setValue ( slot , val );
                }
                // . add it, this should be pre-allocated!
                // . returns false and sets g_errno on error
                else if ( ! htable2.addKey(sh,1) ) return true;
        }

        hslot = htable.getSlot ( *diPtr[maxj] );

        // . only add it to the final list if the docid is "unique"
        // . BUT since different event ids share the same docid,
        //   exception!
        if ( hslot < 0 ) {
                // always inc this
                //m_totalDocCount++;
                // only do this if we need more
                if ( m_numDocIds < m_docsToGet ) {
                        // get the reply this docid came from
                        Msg39Reply *mr = m_reply[maxj];
                        // point to the array of DocIdScores
                        DocIdScore *ds = (DocIdScore *)mr->ptr_scoreInfo;
                        long nds = mr->size_scoreInfo/sizeof(DocIdScore);
                        DocIdScore *dp = NULL;
                        for ( long i = 0 ; i < nds ; i++ ) {
                                if ( ds[i].m_docId != *diPtr[maxj] )
                                        continue;
                                dp = &ds[i];
                                break;
                        }
                        // add the max to the final merged lists
                        m_docIds[m_numDocIds] = *diPtr[maxj];
                        // this is empty if no scoring info was supplied!
                        if ( ! dp ) {
                                if ( m_r->m_getDocIdScoringInfo )
                                        log("msg3a: CRAP! got empty "
                                            "score info for d=%lli",
                                            m_docIds[m_numDocIds]);
                                //char *xx=NULL; *xx=0;
                                // 261561804684 qry = www.yahoo
                        }
                        // point to the single DocIdScore for this docid
                        m_scoreInfos[m_numDocIds] = dp;
                        // reset these just in case
                        if ( dp ) {
                                dp->m_singleScores = NULL;
                                dp->m_pairScores   = NULL;
                        }
                        // now fix DocIdScore::m_pairScores and
                        // m_singleScores ptrs so they reference into the
                        // Msg39Reply::ptr_pairScoreBuf and
                        // ptr_singleScoreBuf like they should. it seems
                        // we do not free the Msg39Replies so we should
                        // be ok referencing them.
                        if ( dp && dp->m_singlesOffset >= 0 )
                                dp->m_singleScores = (SingleScore *)
                                        (mr->ptr_singleScoreBuf +
                                         dp->m_singlesOffset);
                        if ( dp && dp->m_pairsOffset >= 0 )
                                dp->m_pairScores = (PairScore *)
                                        (mr->ptr_pairScoreBuf +
                                         dp->m_pairsOffset);
                        // turn it into a float, that is what rscore_t
                        // is. we do this to make it easier for
                        // PostQueryRerank.cpp
                        m_scores[m_numDocIds] = (float)*rsPtr[maxj];
                        if ( m_r->m_doSiteClustering )
                                m_clusterRecs[m_numDocIds] = *ksPtr[maxj];
                        // clear this out
                        //m_eventIdBits[m_numDocIds].clear();
                        // set this for use below
                        hslot = m_numDocIds;
                        // point to the next available slot to add to
                        m_numDocIds++;
                }
                // if it has ALL the required query terms, count it
                //if ( *bsPtr[maxj] & 0x60 ) m_numAbove++;
                // . add it, this should be pre-allocated!
                // . returns false and sets g_errno on error
                if ( ! htable.addKey(*diPtr[maxj],1) ) return true;
        }

 skip:
        // increment the split pointers from which we took the max
        rsPtr[maxj]++;
        diPtr[maxj]++;
        ksPtr[maxj]++;
        // get the next highest docid and add it in
        if ( m_numDocIds < m_docsToGet ) goto mergeLoop;

 doneMerge:

        if ( m_debug ) {
                // show how long the merge took
                logf( LOG_DEBUG,"query: msg3a: [%lu] merged %li docs "
                      "from %li splits in %llu ms. ",
                      (unsigned long)this,
                      m_numDocIds,
                      (long)m_numHosts,
                      gettimeofdayInMilliseconds() - m_startTime );
                // show the final merged docids
                for ( long i = 0 ; i < m_numDocIds ; i++ ) {
                        long sh = 0;
                        if ( m_r->m_doSiteClustering )
                                sh = g_clusterdb.getSiteHash26((char *)
                                        &m_clusterRecs[i]);
                        // print out the score
                        logf(LOG_DEBUG,"query: msg3a: [%lu] "
                             "%03li) merged docId=%012llu "
                             "score=%.01f hosthash=0x%lx",
                             (unsigned long)this,
                             i,
                             m_docIds[i],
                             (float)m_scores[i],
                             sh );
                }
        }

        // if we had a full split, we should have gotten the cluster
        // recs from each split already
        memset ( m_clusterLevels , CR_OK , m_numDocIds );

        return true;
}
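// note: the CR_OK memset above optimistically marks every merged docid
// as "ok". per the comments at the top of mergeLists(), Msg51 supplies
// the real cluster levels once the merge is done, and mergeLists() may
// be re-called with a larger m_docsToGet if too many docids are then
// clustered or filtered out.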