コード例 #1
0
// . serialize this Msg20Reply into "buf" (capacity "bufSize")
// . returns the number of bytes written
// . NOTE(review): the old comment here claimed "returns NULL and set
//   g_errno on error" but the return type is int32_t; if the reply does
//   not fit in bufSize we abort the process instead of failing softly
int32_t Msg20Reply::serialize(char *buf, int32_t bufSize) const {
#ifdef _VALGRIND_
	// under valgrind, verify that the fixed-size struct and every
	// variable-length buffer we are about to serialize is fully
	// defined (i.e. initialized) before we copy it out
	VALGRIND_CHECK_MEM_IS_DEFINED(this,sizeof(*this));
	if(ptr_htag)
		VALGRIND_CHECK_MEM_IS_DEFINED(ptr_htag,size_htag);
	if(ptr_ubuf)
		VALGRIND_CHECK_MEM_IS_DEFINED(ptr_ubuf,size_ubuf);
	if(ptr_rubuf)
		VALGRIND_CHECK_MEM_IS_DEFINED(ptr_rubuf,size_rubuf);
	if(ptr_displaySum)
		VALGRIND_CHECK_MEM_IS_DEFINED(ptr_displaySum,size_displaySum);
	if(ptr_dbuf)
		VALGRIND_CHECK_MEM_IS_DEFINED(ptr_dbuf,size_dbuf);
	if(ptr_vbuf)
		VALGRIND_CHECK_MEM_IS_DEFINED(ptr_vbuf,size_vbuf);
	if(ptr_imgData)
		VALGRIND_CHECK_MEM_IS_DEFINED(ptr_imgData,size_imgData);
	if(ptr_site)
		VALGRIND_CHECK_MEM_IS_DEFINED(ptr_site,size_site);
	if(ptr_linkInfo)
		VALGRIND_CHECK_MEM_IS_DEFINED(ptr_linkInfo,size_linkInfo);
	if(ptr_outlinks)
		VALGRIND_CHECK_MEM_IS_DEFINED(ptr_outlinks,size_outlinks);
	if(ptr_vector1)
		VALGRIND_CHECK_MEM_IS_DEFINED(ptr_vector1,size_vector1);
	if(ptr_vector2)
		VALGRIND_CHECK_MEM_IS_DEFINED(ptr_vector2,size_vector2);
	if(ptr_vector3)
		VALGRIND_CHECK_MEM_IS_DEFINED(ptr_vector3,size_vector3);
	if(ptr_linkText)
		VALGRIND_CHECK_MEM_IS_DEFINED(ptr_linkText,size_linkText);
	if(ptr_surroundingText)
		VALGRIND_CHECK_MEM_IS_DEFINED(ptr_surroundingText,size_surroundingText);
	if(ptr_linkUrl)
		VALGRIND_CHECK_MEM_IS_DEFINED(ptr_linkUrl,size_linkUrl);
	if(ptr_rssItem)
		VALGRIND_CHECK_MEM_IS_DEFINED(ptr_rssItem,size_rssItem);
	if(ptr_categories)
		VALGRIND_CHECK_MEM_IS_DEFINED(ptr_categories,size_categories);
	if(ptr_content)
		VALGRIND_CHECK_MEM_IS_DEFINED(ptr_content,size_content);
	if(ptr_templateVector)
		VALGRIND_CHECK_MEM_IS_DEFINED(ptr_templateVector,size_templateVector);
	if(ptr_metadataBuf)
		VALGRIND_CHECK_MEM_IS_DEFINED(ptr_metadataBuf,size_metadataBuf);
	if(ptr_note)
		VALGRIND_CHECK_MEM_IS_DEFINED(ptr_note,size_note);
#endif
	int32_t retSize;
	// flatten the base struct plus the string ranges described by the
	// parallel size_tbuf..size_note / ptr_tbuf.. member runs into "buf";
	// retSize receives the total serialized length
	serializeMsg(sizeof(*this),
	             &size_tbuf, &size_note,
	             &ptr_tbuf,
	             this,
	             &retSize,
	             buf, bufSize);
	// sanity: the caller's buffer must have been big enough
	if ( retSize > bufSize ) { g_process.shutdownAbort(true); }
	// return it
	return retSize;
}
コード例 #2
0
// . serialize this request into a freshly allocated buffer
// . returns a pointer to that buffer, with *retSize set to the
//   serialized length
// . returns NULL on allocation failure (g_errno should be set)
char *Msg20Request::serialize(int32_t *retSize) const {
	// how much room the serialized form requires
	const int32_t allocSize = getStoredSize();

	// grab a buffer of exactly that size
	char *serBuf = (char *)mmalloc ( allocSize , "Msg20Ra" );
	if ( ! serBuf )
		return NULL; // g_errno set by mmalloc

	// flatten the base struct plus the ptr_/size_ string ranges
	// (size_qbuf..size_displayMetas, starting at ptr_qbuf) into it
	return serializeMsg(sizeof(*this),
			    &size_qbuf, &size_displayMetas,
			    &ptr_qbuf,
			    this,
			    retSize,
			    serBuf, allocSize);
}
コード例 #3
0
// . const-friendly front end for serializeMsg()
// . strips const from the incoming pointers and forwards to the
//   mutable overload with makePtrsRefNewBuf=false, so the source
//   message itself is never modified
// . returns ptr to the buffer we serialized into, or NULL with
//   g_errno set on error
char *serializeMsg(int32_t             baseSize,
		   const int32_t      *firstSizeParm,
		   const int32_t      *lastSizeParm,
		   const char * const *firstStrPtr,
		   const void         *thisPtr,
		   int32_t            *retSize,
		   char               *userBuf,
		   int32_t             userBufSize)
{
	// casts are safe: the callee does not write through these when
	// makePtrsRefNewBuf is false
	int32_t *firstSize = const_cast<int32_t*>(firstSizeParm);
	int32_t *lastSize  = const_cast<int32_t*>(lastSizeParm);
	char   **firstStr  = const_cast<char**>(firstStrPtr);
	void    *basePtr   = const_cast<void*>(thisPtr);
	return serializeMsg(baseSize, firstSize, lastSize, firstStr,
	                    basePtr, retSize, userBuf, userBufSize,
	                    false);
}
コード例 #4
0
// . drain the final results out of the TopTree (m_tt) into a serialized
//   Msg39Reply and send it back over udp via sendReply()
// . if m_callback is set we were summoned from the seo pipeline (msg3f
//   handler): no udp reply is built or sent, we only record the top
//   docid/score info and invoke the callback
void Msg39::estimateHitsAndSendReply ( ) {

	// no longer in use
	m_inUse = false;

	// now this for the query loop on the QueryLogEntries.
	m_topDocId50 = 0LL;
	m_topScore50 = 0.0;

	// a little hack for the seo pipeline in xmldoc.cpp
	m_topDocId  = 0LL;
	m_topScore  = 0.0;
	m_topDocId2 = 0LL;
	m_topScore2 = 0.0;
	// highest-scoring node in the top tree, if any
	int32_t ti = m_tt.getHighNode();
	if ( ti >= 0 ) {
		TopNode *t = &m_tt.m_nodes[ti];
		m_topDocId = t->m_docId;
		m_topScore = t->m_score;
	}
	// try the 2nd one too
	int32_t ti2 = -1;
	if ( ti >= 0 ) ti2 = m_tt.getNext ( ti );
	if ( ti2 >= 0 ) {
		TopNode *t2 = &m_tt.m_nodes[ti2];
		m_topDocId2 = t2->m_docId;
		m_topScore2 = t2->m_score;
	}

	// convenience ptrs. we will store the docids/scores into these arrays
	// (they point into the serialized reply buffer, set below)
	int64_t *topDocIds;
	double    *topScores;
	key_t     *topRecs;

	// numDocIds counts docs in all tiers when using toptree.
	int32_t numDocIds = m_tt.m_numUsedNodes;

	// the msg39 reply we send back
	int32_t  replySize;
	char *reply;

	//m_numTotalHits = m_posdbTable.m_docIdVoteBuf.length() / 6;

	// make the reply?
	Msg39Reply mr;

	// this is what you want to look at if there is no seo.cpp module...
	if ( ! m_callback ) {
		// if we got clusterdb recs in here, use 'em
		if ( m_gotClusterRecs ) numDocIds = m_numVisible;
		
		// don't send more than the docs that are asked for
		if ( numDocIds > m_r->m_docsToGet) numDocIds =m_r->m_docsToGet;

		// # of QueryTerms in query
		int32_t nqt = m_tmpq.m_numTerms;
		// start setting the stuff
		mr.m_numDocIds = numDocIds;
		// copy # estiamted hits into 8 bytes of reply
		//int64_t est = m_posdbTable.m_estimatedTotalHits;
		// ensure it has at least as many results as we got
		//if ( est < numDocIds ) est = numDocIds;
		// or if too big...
		//if ( numDocIds < m_r->m_docsToGet ) est = numDocIds;
		// . total estimated hits
		// . this is now an EXACT count!
		mr.m_estimatedHits = m_numTotalHits;
		// sanity check
		mr.m_nqt = nqt;
		// the m_errno if any
		mr.m_errno = m_errno;
		// int16_tcut
		PosdbTable *pt = &m_posdbTable;
		// the score info, in no particular order right now
		mr.ptr_scoreInfo  = pt->m_scoreInfoBuf.getBufStart();
		mr.size_scoreInfo = pt->m_scoreInfoBuf.length();
		// that has offset references into posdbtable::m_pairScoreBuf 
		// and m_singleScoreBuf, so we need those too now
		mr.ptr_pairScoreBuf    = pt->m_pairScoreBuf.getBufStart();
		mr.size_pairScoreBuf   = pt->m_pairScoreBuf.length();
		mr.ptr_singleScoreBuf  = pt->m_singleScoreBuf.getBufStart();
		mr.size_singleScoreBuf = pt->m_singleScoreBuf.length();
		// save some time since seo.cpp gets from posdbtable directly,
		// so we can avoid serializing/copying this stuff at least
		if ( ! m_r->m_makeReply ) {
			mr.size_scoreInfo      = 0;
			mr.size_pairScoreBuf   = 0;
			mr.size_singleScoreBuf = 0;
		}
		//mr.m_sectionStats    = pt->m_sectionStats;
		// reserve space for these guys, we fill them in below
		// (NULL ptrs + nonzero sizes tell serializeMsg to reserve room)
		mr.ptr_docIds       = NULL;
		mr.ptr_scores       = NULL;
		mr.ptr_clusterRecs  = NULL;
		// this is how much space to reserve
		mr.size_docIds      = 8 * numDocIds; // int64_t
		mr.size_scores      = sizeof(double) * numDocIds; // double
		// if not doing site clustering, we won't have these perhaps...
		if ( m_gotClusterRecs ) 
			mr.size_clusterRecs = sizeof(key_t) *numDocIds;
		else    
			mr.size_clusterRecs = 0;

		// cap on facet entries serialized per query term
		#define MAX_FACETS 20000

		/////////////////
		//
		// FACETS
		//
		/////////////////

		// We can have multiple gbfacet: terms in a query so
		// serialize all the QueryTerm::m_facetHashTables into
		// Msg39Reply::ptr_facetHashList.
		//
		// combine the facet hash lists of each query term into
		// a list of lists. each lsit is preceeded by the query term
		// id of the query term (like gbfacet:xpathsitehash12345)
		// followed by a 4 byte length of the following 32-bit
		// facet values
		// first pass: compute how many bytes all facet lists need
		int32_t need = 0;
		for ( int32_t i = 0 ; i < m_tmpq.m_numTerms; i++ ) {
			QueryTerm *qt = &m_tmpq.m_qterms[i];
			// skip if not facet
			if ( qt->m_fieldCode != FIELD_GBFACETSTR &&
			     qt->m_fieldCode != FIELD_GBFACETINT &&
			     qt->m_fieldCode != FIELD_GBFACETFLOAT )
				continue;
			HashTableX *ft = &qt->m_facetHashTable;
			if ( ft->m_numSlotsUsed == 0 ) continue;
			int32_t used = ft->m_numSlotsUsed;
			// limit for memory
			if ( used > (int32_t)MAX_FACETS ) {
				log("msg39: truncating facet list to 20000 "
				    "from %"INT32" for %s",used,qt->m_term);
				used = (int32_t)MAX_FACETS;
			}
			// store query term id 64 bit
			need += 8;
			// then size
			need += 4;
			// then buckets. keys and counts
			need += (4+sizeof(FacetEntry)) * used;
		}
		// allocate
		SafeBuf tmp;
		if ( ! tmp.reserve ( need ) ) {
			log("query: Could not allocate memory "
			    "to hold reply facets");
			sendReply(m_slot,this,NULL,0,0,true);
			return;
		}
		// second pass: serialize each facet list into "tmp"
		char *p = tmp.getBufStart();
		for ( int32_t i = 0 ; i < m_tmpq.m_numTerms ; i++ ) {
			QueryTerm *qt = &m_tmpq.m_qterms[i];
			// skip if not facet
			if ( qt->m_fieldCode != FIELD_GBFACETSTR &&
			     qt->m_fieldCode != FIELD_GBFACETINT &&
			     qt->m_fieldCode != FIELD_GBFACETFLOAT )
				continue;
			// get all the facet hashes and their counts
			HashTableX *ft = &qt->m_facetHashTable;
			// skip if none
			if ( ft->m_numSlotsUsed == 0 ) continue;
			// store query term id 64 bit
			*(int64_t *)p = qt->m_termId;
			p += 8;
			int32_t used = ft->getNumSlotsUsed();
			if ( used > (int32_t)MAX_FACETS ) 
				used = (int32_t)MAX_FACETS;
			// store count
			*(int32_t *)p = used;
			p += 4;
			int32_t count = 0;
			// for sanity check
			char *pend = p + (used * (4+sizeof(FacetEntry)));
			// serialize the key/val pairs
			for ( int32_t k = 0 ; k < ft->m_numSlots ; k++ ) {
				// skip empty buckets
				if ( ! ft->m_flags[k] ) continue;
				// store key. the hash of the facet value.
				*(int32_t *)p = ft->getKey32FromSlot(k); p += 4;
				// then store count
				//*(int32_t *)p = ft->getVal32FromSlot(k); p += 4;
				// now this has a docid on it so we can
				// lookup the text of the facet in Msg40.cpp
				FacetEntry *fe;
				fe = (FacetEntry *)ft->getValFromSlot(k);
				// sanity
				// no, count can be zero if its a range facet
				// that was never added to. we add those
				// empty FaceEntries only for range facets
				// in Posdb.cpp
				//if(fe->m_count == 0 ) { char *xx=NULL;*xx=0;}
				gbmemcpy ( p , fe , sizeof(FacetEntry) );
				p += sizeof(FacetEntry);
				// do not breach
				if ( ++count >= (int32_t)MAX_FACETS ) break;
			}
			// sanity check
			if ( p != pend ) { char *xx=NULL;*xx=0; }
			// do the next query term
		}
		// now point to that so it can be serialized below
		mr.ptr_facetHashList  = tmp.getBufStart();
		mr.size_facetHashList = p - tmp.getBufStart();//tmp.length();

		/////////////
		//
		// END FACETS
		//
		/////////////


		// . that is pretty much it,so serialize it into buffer,"reply"
		// . mr.ptr_docIds, etc., will point into the buffer so we can
		//   re-serialize into it below from the tree
		// . returns NULL and sets g_errno on error
		// . "true" means we should make mr.ptr_* reference into the 
		//   newly  serialized buffer.
		reply = serializeMsg ( sizeof(Msg39Reply), // baseSize
				       &mr.size_docIds, // firstSizeParm
				       &mr.size_clusterRecs,//lastSizePrm
				       &mr.ptr_docIds , // firstStrPtr
				       &mr , // thisPtr
				       &replySize , 
				       NULL , 
				       0 , 
				       true ) ;
		if ( ! reply ) {
			log("query: Could not allocated memory "
			    "to hold reply of docids to send back.");
			sendReply(m_slot,this,NULL,0,0,true);
			return;
		}
		// the ptr_* members now reference the reply buffer; fill
		// the arrays in the toptree loop below
		topDocIds    = (int64_t *) mr.ptr_docIds;
		topScores    = (double    *) mr.ptr_scores;
		topRecs      = (key_t     *) mr.ptr_clusterRecs;
	}

	int32_t docCount = 0;
	// loop over all results in the TopTree, best first
	for ( int32_t ti = m_tt.getHighNode() ; ti >= 0 ; 
	      ti = m_tt.getPrev(ti) ) {
		// get the guy
		TopNode *t = &m_tt.m_nodes[ti];
		// skip if clusterLevel is bad!
		if ( m_gotClusterRecs && t->m_clusterLevel != CR_OK ) 
			continue;

		// if not sending back a reply... we were called from seo.cpp
		// State3f logic to evaluate a QueryLogEntry, etc.
		if ( m_callback ) {
			// skip results past #50
			// NOTE(review): docCount is never incremented on this
			// path, so this guard never fires and m_topScore50/
			// m_topDocId50 end up holding the LAST node visited,
			// not the 50th -- confirm this is intended
			if ( docCount > 50 ) continue;
			// set this
			m_topScore50 = t->m_score;
			m_topDocId50 = t->m_docId;
			// that's it
			continue;
		}

		// get the docid ptr
		//char      *diptr = t->m_docIdPtr;
		//int64_t  docId = getDocIdFromPtr(diptr);
		// sanity check
		if ( t->m_docId < 0 ) { char *xx=NULL; *xx=0; }
		//add it to the reply
		topDocIds         [docCount] = t->m_docId;
		topScores         [docCount] = t->m_score;
		if ( m_tt.m_useIntScores ) 
			topScores[docCount] = (double)t->m_intScore;
		// supply clusterdb rec? only for full splits
		if ( m_gotClusterRecs ) 
			topRecs [docCount] = t->m_clusterRec;
		//topExplicits      [docCount] = 
		//	getNumBitsOn(t->m_explicits)
		docCount++;

		// 50th score? set this for seo.cpp. if less than 50 results
		// we want the score of the last doc then.
		if ( docCount <= 50 ) m_topScore50 = t->m_score;
		
		if ( m_debug ) {
			logf(LOG_DEBUG,"query: msg39: [%"PTRFMT"] "
			    "%03"INT32") docId=%012"UINT64" sum=%.02f",
			    (PTRTYPE)this, docCount,
			    t->m_docId,t->m_score);
		}
		//don't send more than the docs that are wanted
		if ( docCount >= numDocIds ) break;
	}
 	if ( docCount > 300 && m_debug )
		log("query: Had %"INT32" nodes in top tree",docCount);

	// this is sensitive info
	if ( m_debug ) {
		log(LOG_DEBUG,
		    "query: msg39: [%"PTRFMT"] "
		    "Intersected lists took %"INT64" (%"INT64") "
		    "ms "
		    "docIdsToGet=%"INT32" docIdsGot=%"INT32" "
		    "q=%s",
		    (PTRTYPE)this                        ,
		    m_posdbTable.m_addListsTime       ,
		    gettimeofdayInMilliseconds() - m_startTime ,
		    m_r->m_docsToGet                       ,
		    numDocIds                         ,
		    m_tmpq.getQuery()                 );
	}


	// if we blocked because we used a thread then call callback if
	// summoned from a msg3f handler and not a msg39 handler
	if ( m_callback ) {
		// if we blocked call user callback
		if ( m_blocked ) m_callback ( m_state );
		// if not sending back a udp reply, return now
		return;
	}

	// now send back the reply
	sendReply(m_slot,this,reply,replySize,replySize,false);
	return;
}
コード例 #5
0
// . called once the seo result-cache lookup (m_seoCacheList) completes
// . on a cache hit: points m_docIds/m_scores/m_siteHashes26 directly
//   into the cached record and returns true (all done)
// . on a miss: (re)serializes the Msg39Request and multicasts it to
//   every host; returns false if we blocked waiting on replies, true
//   on error / nothing launched (m_errno may be set)
bool Msg3a::gotCacheReply ( ) {

    // in cache?
    if ( ! m_seoCacheList.isEmpty() ) {
        // note it
        //log("seopipe: found ckey=%s q=%s"
        //    ,KEYSTR(&m_ckey,12)
        //    ,m_r->ptr_query
        //    );
        // parse the cached record in place: key, datasize, timestamp,
        // then the counts and the docid/score/sitehash arrays
        char *p = m_seoCacheList.getList();
        // skip key
        p += sizeof(key_t);
        // datasize
        p += 4;
        // timestamp
        //long cachedTime = *(long *)p;
        p += 4;
        // # docids
        m_numDocIds = *(long *)p;
        p += 4;
        // total # results
        m_numTotalEstimatedHits = *(long *)p;
        p += 4;
        // docids
        m_docIds = (long long *)p;
        p += 8 * m_numDocIds;
        // scores
        m_scores = (float *)p;
        p += sizeof(float) * m_numDocIds;
        // site hashes
        m_siteHashes26 = (long *)p;
        p += 4 * m_numDocIds;
        // log to log as well
        char tmp[50000];
        p = tmp;
        p += sprintf(p,
                     "seopipe: hit cache "
                     "docids=%li "
                     "query=\"%s\" ",
                     m_numDocIds,
                     m_r->ptr_query );
        // log each docid
        //for ( long i = 0 ; i < m_numDocIds ; i++ ) {
        //	//float score = m_msg3a->getScores()[i];
        //	long long d = m_docIds[i];
        //	//long sh32 = m_msg3a->getSiteHash32(i);
        //	p += sprintf(p,"d%li=%lli ",i,d);
        //}
        log("%s",tmp);
        // all done!
        return true;
    }

    CollectionRec *cr;
    // NOTE(review): getRec() result is dereferenced below without a
    // NULL check -- confirm callers guarantee a valid collection here
    cr = g_collectiondb.getRec(m_r->ptr_coll,m_r->size_coll-1);

    setTermFreqWeights ( cr->m_coll,m_q,m_termFreqs , m_termFreqWeights );

    if ( m_debug ) {
        //long long *termIds = m_q->getTermIds();
        //if ( m_numCandidates ) termIds = m_synIds;
        for ( long i = 0 ; i < m_q->m_numTerms ; i++ ) {
            // get the term in utf8
            QueryTerm *qt = &m_q->m_qterms[i];
            //char bb[256];
            //utf16ToUtf8(bb, 256, qt->m_term, qt->m_termLen);
            // temporarily NUL-terminate the term for logging,
            // restored below
            char *tpc = qt->m_term + qt->m_termLen;
            char c = *tpc;
            *tpc = 0;
            // this term freq is estimated from the rdbmap and
            // does not hit disk...
            logf(LOG_DEBUG,"query: term #%li \"%s\" "
                 "termid=%lli termFreq=%lli termFreqWeight=%.03f",
                 i,
                 qt->m_term,
                 qt->m_termId,
                 m_termFreqs[i],
                 m_termFreqWeights[i]);
            // put it back
            *tpc = c;
        }
    }

    // time how long to get each split's docids
    if ( m_debug )
        m_startTime = gettimeofdayInMilliseconds();

    // reset replies received count
    m_numReplies  = 0;
    // shortcut
    long n = m_q->m_numTerms;

    /////////////////////////////
    //
    // set the Msg39 request
    //
    /////////////////////////////

    // free if we should
    if ( m_rbufPtr && m_rbufPtr != m_rbuf ) {
        mfree ( m_rbufPtr , m_rbufSize , "Msg3a");
        m_rbufPtr = NULL;
    }

    // a tmp buf
    long readSizes[MAX_QUERY_TERMS];
    // update our read info
    for ( long j = 0; j < n ; j++ ) {
        // the read size for THIS query term
        long rs = 300000000; // toRead; 300MB i guess...
        // limit to 50MB man! this was 30MB but the
        // 'time enough for love' query was hitting 30MB termlists.
        //rs = 50000000;
        rs = DEFAULT_POSDB_READSIZE;//90000000; // 90MB!
        // if section stats, limit to 1MB
        if ( m_r->m_getSectionStats ) rs = 1000000;
        // get the jth query term
        QueryTerm *qt = &m_q->m_qterms[j];
        // if query term is ignored, skip it
        if ( qt->m_ignored ) rs = 0;
        // set it
        readSizes[j] = rs;
    }

    // serialize this
    m_r->ptr_readSizes  = (char *)readSizes;
    m_r->size_readSizes = 4 * n;
    // and this
    m_r->ptr_termFreqWeights  = (char *)m_termFreqWeights;
    m_r->size_termFreqWeights = 4 * n;
    // store query into request, might have changed since we called
    // Query::expandQuery() above
    m_r->ptr_query  = m_q->m_orig;
    m_r->size_query = m_q->m_origLen+1;
    // free us?
    // NOTE(review): m_rbufPtr was already freed and NULLed above and is
    // not reassigned in between, so this second free block appears to
    // be redundant dead code
    if ( m_rbufPtr && m_rbufPtr != m_rbuf ) {
        mfree ( m_rbufPtr , m_rbufSize, "Msg3a" );
        m_rbufPtr = NULL;
    }
    m_r->m_stripe = 0;
    // debug thing
    g_r        = m_r;
    // . (re)serialize the request
    // . returns NULL and sets g_errno on error
    // . "m_rbuf" is a local storage space that can save a malloc
    // . do not "makePtrsRefNewBuf" because if we do that and this gets
    //   called a 2nd time because m_getWeights got set to 0, then we
    //   end up copying over ourselves.
    m_rbufPtr = serializeMsg ( sizeof(Msg39Request),
                               &m_r->size_readSizes,
                               &m_r->size_coll,
                               &m_r->ptr_readSizes,
                               m_r,
                               &m_rbufSize ,
                               m_rbuf ,
                               RBUF_SIZE ,
                               false );

    if ( ! m_rbufPtr ) return true;

    // free this one too
    m_rbuf2.purge();
    // and copy that!
    // m_rbuf2 is the stripe-1 variant of the same request
    if ( ! m_rbuf2.safeMemcpy ( m_rbufPtr , m_rbufSize ) ) return true;
    // and tweak it
    ((Msg39Request *)(m_rbuf2.getBufStart()))->m_stripe = 1;

    /////////////////////////////
    //
    // end formulating the Msg39 request
    //
    /////////////////////////////

    // . set timeout based on docids requested!
    // . the more docs requested the longer it will take to get
    long timeout = (50 * m_docsToGet) / 1000;
    // at least 20 seconds
    if ( timeout < 20 ) timeout = 20;
    // override? this is USUALLY -1, but DupDectector.cpp needs it
    // high because it is a spider time thing.
    if ( m_r->m_timeout > 0 ) timeout = m_r->m_timeout;
    // for new posdb stuff
    if ( timeout < 60 ) timeout = 60;

    long long qh = 0LL;
    if ( m_q ) qh = m_q->getQueryHash();

    m_numHosts = g_hostdb.getNumHosts();
    // only send to one host?
    if ( ! m_q->isSplit() ) m_numHosts = 1;

    // now we run it over ALL hosts that are up!
    for ( long i = 0; i < m_numHosts ; i++ ) { // m_indexdbSplit; i++ ) {
        // get that host
        Host *h = g_hostdb.getHost(i);
        // if not a full split, just round robin the group, i am not
        // going to sweat over performance on non-fully split indexes
        // because they suck really bad anyway compared to full
        // split indexes. "gid" is already set if we are not split.
        unsigned long gid = h->m_groupId;//g_hostdb.getGroupId(i);
        long firstHostId = h->m_hostId;
        // get strip num
        char *req = m_rbufPtr;
        // if sending to twin, use slightly different request
        if ( h->m_stripe == 1 ) req = m_rbuf2.getBufStart();
        // if we are a non-split query, like gbdom:xyz.com just send
        // to the host that has the first termid local. it will call
        // msg2 to download all termlists. msg2 should be smart
        // enough to download the "non split" termlists over the net.
        // TODO: fix msg2 to do that...
        if ( ! m_q->isSplit() ) {
            long long     tid  = m_q->getTermId(0);
            key_t         k    = g_indexdb.makeKey(tid,1,1,false );
            // split = false! do not split
            gid = getGroupId ( RDB_POSDB,&k,false);
            firstHostId = -1;
        }
        // debug log
        if ( m_debug )
            logf(LOG_DEBUG,"query: Msg3a[%lu]: forwarding request "
                 "of query=%s to groupid 0x%lx.",
                 (long)this, m_q->getQuery(), gid);
        // send to this guy
        Multicast *m = &m_mcast[i];
        // clear it for transmit
        m->reset();
        // . send out a msg39 request to each split
        // . multicasts to a host in group "groupId"
        // . we always block waiting for the reply with a multicast
        // . returns false and sets g_errno on error
        // . sends the request to fastest host in group "groupId"
        // . if that host takes more than about 5 secs then sends to
        //   next host
        // . key should be largest termId in group we're sending to
        bool status;
        status = m->send ( req , // m_rbufPtr         ,
                           m_rbufSize        , // request size
                           0x39              , // msgType 0x39
                           false             , // mcast owns m_request?
                           gid               , // group to send to
                           false             , // send to whole group?
                           (long)qh          , // 0 // startKey.n1
                           this              , // state1 data
                           m                 , // state2 data
                           gotReplyWrapper3a ,
                           timeout           , // in seconds
                           m_r->m_niceness   ,
                           false             , // realtime?
                           firstHostId, // -1// bestHandlingHostId ,
                           NULL              , // m_replyBuf   ,
                           0                 , // MSG39REPLYSIZE,
                           // this is true if multicast should free the
                           // reply, otherwise caller is responsible
                           // for freeing it after calling
                           // getBestReply().
                           // actually, this should always be false,
                           // there is a bug in Multicast.cpp.
                           // no, if we error out and never steal
                           // the buffers then they will go unfreed
                           // so they are freed by multicast by default
                           // then we steal control explicitly
                           true             );
        // if successfully launch, do the next one
        if ( status ) continue;
        // . this serious error should make the whole query fail
        // . must allow other replies to come in though, so keep going
        m_numReplies++;
        log("query: Multicast Msg3a had error: %s",mstrerror(g_errno));
        m_errno = g_errno;
        g_errno = 0;
    }
    // return false if blocked on a reply
    if ( m_numReplies < m_numHosts ) return false;//indexdbSplit )
    // . otherwise, we did not block... error?
    // . it must have been an error or just no new lists available!!
    // . if we call gotAllSplitReplies() here, and we were called by
    //   mergeLists() we end up calling mergeLists() again... bad. so
    //   just return true in that case.
    //return gotAllSplitReplies();
    return true;
}
コード例 #6
0
// . older (long-based) variant of estimateHitsAndSendReply(): drains
//   the TopTree (m_tt) into a serialized Msg39Reply and sends it back
//   via sendReply(), additionally carrying the site-hash list and
//   section stats
// . if m_callback is set (seo pipeline) no udp reply is sent; only top
//   docid/score info is recorded before invoking the callback
void Msg39::estimateHits ( ) {

	// no longer in use
	m_inUse = false;

	// now this for the query loop on the QueryLogEntries.
	m_topDocId50 = 0LL;
	m_topScore50 = 0.0;

	// a little hack for the seo pipeline in xmldoc.cpp
	m_topDocId  = 0LL;
	m_topScore  = 0.0;
	m_topDocId2 = 0LL;
	m_topScore2 = 0.0;
	// highest-scoring node in the top tree, if any
	long ti = m_tt.getHighNode();
	if ( ti >= 0 ) {
		TopNode *t = &m_tt.m_nodes[ti];
		m_topDocId = t->m_docId;
		m_topScore = t->m_score;
	}
	// try the 2nd one too
	long ti2 = -1;
	if ( ti >= 0 ) ti2 = m_tt.getNext ( ti );
	if ( ti2 >= 0 ) {
		TopNode *t2 = &m_tt.m_nodes[ti2];
		m_topDocId2 = t2->m_docId;
		m_topScore2 = t2->m_score;
	}

	// convenience ptrs. we will store the docids/scores into these arrays
	// (they point into the serialized reply buffer, set below)
	long long *topDocIds;
	float     *topScores;
	key_t     *topRecs;

	// numDocIds counts docs in all tiers when using toptree.
	long numDocIds = m_tt.m_numUsedNodes;

	// the msg39 reply we send back
	long  replySize;
	char *reply;

	//m_numTotalHits = m_posdbTable.m_docIdVoteBuf.length() / 6;

	// make the reply?
	Msg39Reply mr;

	if ( ! m_callback ) {
		// if we got clusterdb recs in here, use 'em
		if ( m_gotClusterRecs ) numDocIds = m_numVisible;
		
		// don't send more than the docs that are asked for
		if ( numDocIds > m_r->m_docsToGet) numDocIds =m_r->m_docsToGet;

		// # of QueryTerms in query
		long nqt = m_tmpq.m_numTerms;
		// start setting the stuff
		mr.m_numDocIds = numDocIds;
		// copy # estiamted hits into 8 bytes of reply
		//long long est = m_posdbTable.m_estimatedTotalHits;
		// ensure it has at least as many results as we got
		//if ( est < numDocIds ) est = numDocIds;
		// or if too big...
		//if ( numDocIds < m_r->m_docsToGet ) est = numDocIds;
		// . total estimated hits
		// . this is now an EXACT count!
		mr.m_estimatedHits = m_numTotalHits;
		// sanity check
		mr.m_nqt = nqt;
		// the m_errno if any
		mr.m_errno = m_errno;
		// shortcut
		PosdbTable *pt = &m_posdbTable;
		// the score info, in no particular order right now
		mr.ptr_scoreInfo  = pt->m_scoreInfoBuf.getBufStart();
		mr.size_scoreInfo = pt->m_scoreInfoBuf.length();
		// that has offset references into posdbtable::m_pairScoreBuf 
		// and m_singleScoreBuf, so we need those too now
		mr.ptr_pairScoreBuf    = pt->m_pairScoreBuf.getBufStart();
		mr.size_pairScoreBuf   = pt->m_pairScoreBuf.length();
		mr.ptr_singleScoreBuf  = pt->m_singleScoreBuf.getBufStart();
		mr.size_singleScoreBuf = pt->m_singleScoreBuf.length();
		// save some time since seo.cpp gets from posdbtable directly,
		// so we can avoid serializing/copying this stuff at least
		if ( ! m_r->m_makeReply ) {
			mr.size_scoreInfo      = 0;
			mr.size_pairScoreBuf   = 0;
			mr.size_singleScoreBuf = 0;
		}
		// and now the sitehash list if it exists
		mr.ptr_siteHashList  = pt->m_siteHashList.getBufStart();
		mr.size_siteHashList = pt->m_siteHashList.length();
		mr.m_sectionStats    = pt->m_sectionStats;
		// reserve space for these guys, we fill them in below
		// (NULL ptrs + nonzero sizes tell serializeMsg to reserve room)
		mr.ptr_docIds       = NULL;
		mr.ptr_scores       = NULL;
		mr.ptr_clusterRecs  = NULL;
		// this is how much space to reserve
		mr.size_docIds      = 8 * numDocIds; // long long
		mr.size_scores      = 4 * numDocIds; // float
		// if not doing site clustering, we won't have these perhaps...
		if ( m_gotClusterRecs ) 
			mr.size_clusterRecs = sizeof(key_t) *numDocIds;
		else    
			mr.size_clusterRecs = 0;
		// . that is pretty much it,so serialize it into buffer,"reply"
		// . mr.ptr_docIds, etc., will point into the buffer so we can
		//   re-serialize into it below from the tree
		// . returns NULL and sets g_errno on error
		// . "true" means we should make mr.ptr_* reference into the 
		//   newly  serialized buffer.
		reply = serializeMsg ( sizeof(Msg39Reply), // baseSize
				       &mr.size_docIds, // firstSizeParm
				       &mr.size_clusterRecs,//lastSizePrm
				       &mr.ptr_docIds , // firstStrPtr
				       &mr , // thisPtr
				       &replySize , 
				       NULL , 
				       0 , 
				       true ) ;
		if ( ! reply ) {
			log("query: Could not allocated memory "
			    "to hold reply of docids to send back.");
			sendReply(m_slot,this,NULL,0,0,true);
			return ; 
		}
		// the ptr_* members now reference the reply buffer; fill
		// the arrays in the toptree loop below
		topDocIds    = (long long *) mr.ptr_docIds;
		topScores    = (float     *) mr.ptr_scores;
		topRecs      = (key_t     *) mr.ptr_clusterRecs;
	}

	long docCount = 0;
	// loop over all results in the TopTree, best first
	for ( long ti = m_tt.getHighNode() ; ti >= 0 ; 
	      ti = m_tt.getPrev(ti) ) {
		// get the guy
		TopNode *t = &m_tt.m_nodes[ti];
		// skip if clusterLevel is bad!
		if ( m_gotClusterRecs && t->m_clusterLevel != CR_OK ) 
			continue;

		// if not sending back a reply... we were called from seo.cpp
		// State3f logic to evaluate a QueryLogEntry, etc.
		if ( m_callback ) {
			// skip results past #50
			// NOTE(review): docCount is never incremented on this
			// path, so this guard never fires and m_topScore50/
			// m_topDocId50 end up holding the LAST node visited,
			// not the 50th -- confirm this is intended
			if ( docCount > 50 ) continue;
			// set this
			m_topScore50 = t->m_score;
			m_topDocId50 = t->m_docId;
			// that's it
			continue;
		}

		// get the docid ptr
		//char      *diptr = t->m_docIdPtr;
		//long long  docId = getDocIdFromPtr(diptr);
		// sanity check
		if ( t->m_docId < 0 ) { char *xx=NULL; *xx=0; }
		//add it to the reply
		topDocIds         [docCount] = t->m_docId;
		topScores         [docCount] = t->m_score;
		// supply clusterdb rec? only for full splits
		if ( m_gotClusterRecs ) 
			topRecs [docCount] = t->m_clusterRec;
		//topExplicits      [docCount] = 
		//	getNumBitsOn(t->m_explicits)
		docCount++;

		// 50th score? set this for seo.cpp. if less than 50 results
		// we want the score of the last doc then.
		if ( docCount <= 50 ) m_topScore50 = t->m_score;
		
		if ( m_debug ) {
			log(LOG_DEBUG,"query: msg39: [%lu] "
			    "%03li) docId=%012llu sum=%.02f",
			    (long)this, docCount,
			    t->m_docId,t->m_score);
		}
		//don't send more than the docs that are wanted
		if ( docCount >= numDocIds ) break;
	}
 	if ( docCount > 300 && m_debug )
		log("query: Had %li nodes in top tree",docCount);

	// this is sensitive info
	if ( m_debug ) {
		log(LOG_DEBUG,
		    "query: msg39: [%li] Intersected lists took %lli (%lli) "
		    "ms "
		    "docIdsToGet=%li docIdsGot=%li "
		    "q=%s",
		    (long)this                        ,
		    m_posdbTable.m_addListsTime       ,
		    gettimeofdayInMilliseconds() - m_startTime ,
		    m_r->m_docsToGet                       ,
		    numDocIds                         ,
		    m_tmpq.getQuery()                 );
	}


	// if we blocked because we used a thread then call callback if
	// summoned from a msg3f handler and not a msg39 handler
	if ( m_callback ) {
		// if we blocked call user callback
		if ( m_blocked ) m_callback ( m_state );
		// if not sending back a udp reply, return now
		return;
	}

	// now send back the reply
	sendReply(m_slot,this,reply,replySize,replySize,false);
	return;
}