// returns the serialized size in bytes; aborts if the serialized data
// would overflow bufSize
int32_t Msg20Reply::serialize(char *buf, int32_t bufSize) const {
#ifdef _VALGRIND_
	VALGRIND_CHECK_MEM_IS_DEFINED(this,sizeof(*this));
	if(ptr_htag) VALGRIND_CHECK_MEM_IS_DEFINED(ptr_htag,size_htag);
	if(ptr_ubuf) VALGRIND_CHECK_MEM_IS_DEFINED(ptr_ubuf,size_ubuf);
	if(ptr_rubuf) VALGRIND_CHECK_MEM_IS_DEFINED(ptr_rubuf,size_rubuf);
	if(ptr_displaySum) VALGRIND_CHECK_MEM_IS_DEFINED(ptr_displaySum,size_displaySum);
	if(ptr_dbuf) VALGRIND_CHECK_MEM_IS_DEFINED(ptr_dbuf,size_dbuf);
	if(ptr_vbuf) VALGRIND_CHECK_MEM_IS_DEFINED(ptr_vbuf,size_vbuf);
	if(ptr_imgData) VALGRIND_CHECK_MEM_IS_DEFINED(ptr_imgData,size_imgData);
	if(ptr_site) VALGRIND_CHECK_MEM_IS_DEFINED(ptr_site,size_site);
	if(ptr_linkInfo) VALGRIND_CHECK_MEM_IS_DEFINED(ptr_linkInfo,size_linkInfo);
	if(ptr_outlinks) VALGRIND_CHECK_MEM_IS_DEFINED(ptr_outlinks,size_outlinks);
	if(ptr_vector1) VALGRIND_CHECK_MEM_IS_DEFINED(ptr_vector1,size_vector1);
	if(ptr_vector2) VALGRIND_CHECK_MEM_IS_DEFINED(ptr_vector2,size_vector2);
	if(ptr_vector3) VALGRIND_CHECK_MEM_IS_DEFINED(ptr_vector3,size_vector3);
	if(ptr_linkText) VALGRIND_CHECK_MEM_IS_DEFINED(ptr_linkText,size_linkText);
	if(ptr_surroundingText) VALGRIND_CHECK_MEM_IS_DEFINED(ptr_surroundingText,size_surroundingText);
	if(ptr_linkUrl) VALGRIND_CHECK_MEM_IS_DEFINED(ptr_linkUrl,size_linkUrl);
	if(ptr_rssItem) VALGRIND_CHECK_MEM_IS_DEFINED(ptr_rssItem,size_rssItem);
	if(ptr_categories) VALGRIND_CHECK_MEM_IS_DEFINED(ptr_categories,size_categories);
	if(ptr_content) VALGRIND_CHECK_MEM_IS_DEFINED(ptr_content,size_content);
	if(ptr_templateVector) VALGRIND_CHECK_MEM_IS_DEFINED(ptr_templateVector,size_templateVector);
	if(ptr_metadataBuf) VALGRIND_CHECK_MEM_IS_DEFINED(ptr_metadataBuf,size_metadataBuf);
	if(ptr_note) VALGRIND_CHECK_MEM_IS_DEFINED(ptr_note,size_note);
#endif
	int32_t retSize;
	serializeMsg(sizeof(*this),
	             &size_tbuf, &size_note,
	             &ptr_tbuf,
	             this,
	             &retSize,
	             buf, bufSize);
	if ( retSize > bufSize ) {
		g_process.shutdownAbort(true);
	}
	// return it
	return retSize;
}
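// A caller-side sketch (hypothetical, not part of the original source):
// serialize() writes into a caller-owned buffer and aborts the process if
// the buffer is too small, so the buffer should be sized up front. This
// sketch assumes Msg20Reply has a getStoredSize() that reports the exact
// serialized size, as Msg20Request does below; "exampleSerializeReply" and
// the "M20RplEx" label are illustrative names only.
static char *exampleSerializeReply ( const Msg20Reply *reply ,
                                     int32_t *retSize ) {
	// size the buffer to exactly what serialize() will write
	int32_t need = reply->getStoredSize();
	char *buf = (char *)mmalloc ( need , "M20RplEx" );
	// g_errno is set by mmalloc on failure
	if ( ! buf ) return NULL;
	*retSize = reply->serialize ( buf , need );
	return buf;
}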
// . returns ptr to the buffer we serialize into
// . returns NULL and sets g_errno on error
char *Msg20Request::serialize(int32_t *retSize) const {
	// make a buffer to serialize into
	int32_t need = getStoredSize();
	// alloc if we should
	char *buf = (char *)mmalloc ( need , "Msg20Ra" );
	// bail on error, g_errno should be set
	if ( ! buf ) return NULL;
	return serializeMsg(sizeof(*this),
	                    &size_qbuf, &size_displayMetas,
	                    &ptr_qbuf,
	                    this,
	                    retSize,
	                    buf, need);
}
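// A minimal usage sketch (illustrative only, not part of the original
// source): the caller owns the malloc'd buffer serialize() returns and
// must release it with mfree(), using the returned size and the same
// "Msg20Ra" label used for the allocation above. This assumes the
// serialized size equals getStoredSize(), which the single exact-size
// allocation above implies. "exampleSendRequest" is a hypothetical name.
static bool exampleSendRequest ( Msg20Request *req ) {
	int32_t reqSize;
	char *reqBuf = req->serialize ( &reqSize );
	// g_errno is set on allocation failure
	if ( ! reqBuf ) return false;
	// ... hand reqBuf/reqSize to the transport layer here ...
	mfree ( reqBuf , reqSize , "Msg20Ra" );
	return true;
}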
// . returns ptr to the buffer we serialize into
// . returns NULL and sets g_errno on error
char *serializeMsg(int32_t baseSize,
                   const int32_t *firstSizeParm,
                   const int32_t *lastSizeParm,
                   const char * const *firstStrPtr,
                   const void *thisPtr,
                   int32_t *retSize,
                   char *userBuf,
                   int32_t userBufSize) {
	// const wrapper: delegate to the non-const overload. the final
	// "false" means do not make the ptr_* members reference the new
	// buffer, so the message itself is never modified.
	return serializeMsg(baseSize,
	                    const_cast<int32_t*>(firstSizeParm),
	                    const_cast<int32_t*>(lastSizeParm),
	                    const_cast<char**>(firstStrPtr),
	                    const_cast<void*>(thisPtr),
	                    retSize,
	                    userBuf,
	                    userBufSize,
	                    false);
}
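// A sketch of the serializeMsg() layout contract (the "ExampleMsg" struct
// and "exampleSerialize" function are hypothetical, not part of this
// codebase): a message keeps its variable-length fields as a contiguous
// run of ptr_* members mirrored by a parallel run of size_* members in
// the same order. serializeMsg() copies the base struct and then appends
// each buffer after it; firstSizeParm/lastSizeParm bracket the size run
// and firstStrPtr marks where the pointer run begins.
namespace {
struct ExampleMsg {
	int32_t m_fixedField;   // fixed-size fields come first
	char   *ptr_first;      // variable-length buffers: a contiguous
	char   *ptr_second;     //   run of ptr_* members...
	int32_t size_first;     // ...mirrored by a parallel run of
	int32_t size_second;    //   size_* members in the same order
};
// serialize an ExampleMsg the same way Msg20Request::serialize() does
char *exampleSerialize ( ExampleMsg *m , int32_t *retSize ,
                         char *buf , int32_t bufSize ) {
	return serializeMsg ( sizeof(ExampleMsg),
	                      &m->size_first,  // firstSizeParm
	                      &m->size_second, // lastSizeParm
	                      &m->ptr_first,   // firstStrPtr
	                      m,               // thisPtr
	                      retSize,
	                      buf, bufSize );
}
}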
void Msg39::estimateHitsAndSendReply ( ) {

	// no longer in use
	m_inUse = false;

	// now this for the query loop on the QueryLogEntries.
	m_topDocId50 = 0LL;
	m_topScore50 = 0.0;

	// a little hack for the seo pipeline in xmldoc.cpp
	m_topDocId  = 0LL;
	m_topScore  = 0.0;
	m_topDocId2 = 0LL;
	m_topScore2 = 0.0;

	int32_t ti = m_tt.getHighNode();
	if ( ti >= 0 ) {
		TopNode *t = &m_tt.m_nodes[ti];
		m_topDocId = t->m_docId;
		m_topScore = t->m_score;
	}

	// try the 2nd one too
	int32_t ti2 = -1;
	if ( ti >= 0 ) ti2 = m_tt.getNext ( ti );
	if ( ti2 >= 0 ) {
		TopNode *t2 = &m_tt.m_nodes[ti2];
		m_topDocId2 = t2->m_docId;
		m_topScore2 = t2->m_score;
	}

	// convenience ptrs. we will store the docids/scores into these arrays
	int64_t *topDocIds;
	double  *topScores;
	key_t   *topRecs;

	// numDocIds counts docs in all tiers when using toptree.
	int32_t numDocIds = m_tt.m_numUsedNodes;

	// the msg39 reply we send back
	int32_t  replySize;
	char    *reply;

	//m_numTotalHits = m_posdbTable.m_docIdVoteBuf.length() / 6;

	// make the reply?
	Msg39Reply mr;

	// this is what you want to look at if there is no seo.cpp module...
	if ( ! m_callback ) {
		// if we got clusterdb recs in here, use 'em
		if ( m_gotClusterRecs ) numDocIds = m_numVisible;

		// don't send more than the docs that are asked for
		if ( numDocIds > m_r->m_docsToGet ) numDocIds = m_r->m_docsToGet;

		// # of QueryTerms in query
		int32_t nqt = m_tmpq.m_numTerms;

		// start setting the stuff
		mr.m_numDocIds = numDocIds;

		// copy # estimated hits into 8 bytes of reply
		//int64_t est = m_posdbTable.m_estimatedTotalHits;
		// ensure it has at least as many results as we got
		//if ( est < numDocIds ) est = numDocIds;
		// or if too big...
		//if ( numDocIds < m_r->m_docsToGet ) est = numDocIds;

		// . total estimated hits
		// . this is now an EXACT count!
		mr.m_estimatedHits = m_numTotalHits;

		// sanity check
		mr.m_nqt = nqt;

		// the m_errno if any
		mr.m_errno = m_errno;

		// shortcut
		PosdbTable *pt = &m_posdbTable;

		// the score info, in no particular order right now
		mr.ptr_scoreInfo  = pt->m_scoreInfoBuf.getBufStart();
		mr.size_scoreInfo = pt->m_scoreInfoBuf.length();

		// that has offset references into posdbtable::m_pairScoreBuf
		// and m_singleScoreBuf, so we need those too now
		mr.ptr_pairScoreBuf    = pt->m_pairScoreBuf.getBufStart();
		mr.size_pairScoreBuf   = pt->m_pairScoreBuf.length();
		mr.ptr_singleScoreBuf  = pt->m_singleScoreBuf.getBufStart();
		mr.size_singleScoreBuf = pt->m_singleScoreBuf.length();

		// save some time since seo.cpp gets from posdbtable directly,
		// so we can avoid serializing/copying this stuff at least
		if ( ! m_r->m_makeReply ) {
			mr.size_scoreInfo      = 0;
			mr.size_pairScoreBuf   = 0;
			mr.size_singleScoreBuf = 0;
		}

		//mr.m_sectionStats = pt->m_sectionStats;

		// reserve space for these guys, we fill them in below
		mr.ptr_docIds      = NULL;
		mr.ptr_scores      = NULL;
		mr.ptr_clusterRecs = NULL;
		// this is how much space to reserve
		mr.size_docIds = 8 * numDocIds;              // int64_t
		mr.size_scores = sizeof(double) * numDocIds; // double
		// if not doing site clustering, we won't have these perhaps...
		if ( m_gotClusterRecs )
			mr.size_clusterRecs = sizeof(key_t) * numDocIds;
		else
			mr.size_clusterRecs = 0;

#define MAX_FACETS 20000

		/////////////////
		//
		// FACETS
		//
		/////////////////

		// We can have multiple gbfacet: terms in a query so
		// serialize all the QueryTerm::m_facetHashTables into
		// Msg39Reply::ptr_facetHashList.
		//
		// combine the facet hash lists of each query term into
		// a list of lists. each list is preceded by the query term
		// id of the query term (like gbfacet:xpathsitehash12345)
		// followed by a 4 byte length of the following 32-bit
		// facet values
		int32_t need = 0;
		for ( int32_t i = 0 ; i < m_tmpq.m_numTerms ; i++ ) {
			QueryTerm *qt = &m_tmpq.m_qterms[i];
			// skip if not facet
			if ( qt->m_fieldCode != FIELD_GBFACETSTR &&
			     qt->m_fieldCode != FIELD_GBFACETINT &&
			     qt->m_fieldCode != FIELD_GBFACETFLOAT )
				continue;
			HashTableX *ft = &qt->m_facetHashTable;
			if ( ft->m_numSlotsUsed == 0 ) continue;
			int32_t used = ft->m_numSlotsUsed;
			// limit for memory
			if ( used > (int32_t)MAX_FACETS ) {
				log("msg39: truncating facet list to 20000 "
				    "from %"INT32" for %s",used,qt->m_term);
				used = (int32_t)MAX_FACETS;
			}
			// store query term id 64 bit
			need += 8;
			// then size
			need += 4;
			// then buckets. keys and counts
			need += (4+sizeof(FacetEntry)) * used;
		}
		// allocate
		SafeBuf tmp;
		if ( ! tmp.reserve ( need ) ) {
			log("query: Could not allocate memory "
			    "to hold reply facets");
			sendReply(m_slot,this,NULL,0,0,true);
			return;
		}
		// point to there
		char *p = tmp.getBufStart();
		for ( int32_t i = 0 ; i < m_tmpq.m_numTerms ; i++ ) {
			QueryTerm *qt = &m_tmpq.m_qterms[i];
			// skip if not facet
			if ( qt->m_fieldCode != FIELD_GBFACETSTR &&
			     qt->m_fieldCode != FIELD_GBFACETINT &&
			     qt->m_fieldCode != FIELD_GBFACETFLOAT )
				continue;
			// get all the facet hashes and their counts
			HashTableX *ft = &qt->m_facetHashTable;
			// skip if none
			if ( ft->m_numSlotsUsed == 0 ) continue;
			// store query term id 64 bit
			*(int64_t *)p = qt->m_termId;
			p += 8;
			int32_t used = ft->getNumSlotsUsed();
			if ( used > (int32_t)MAX_FACETS )
				used = (int32_t)MAX_FACETS;
			// store count
			*(int32_t *)p = used;
			p += 4;
			int32_t count = 0;
			// for sanity check
			char *pend = p + (used * (4+sizeof(FacetEntry)));
			// serialize the key/val pairs
			for ( int32_t k = 0 ; k < ft->m_numSlots ; k++ ) {
				// skip empty buckets
				if ( ! ft->m_flags[k] ) continue;
				// store key. the hash of the facet value.
				*(int32_t *)p = ft->getKey32FromSlot(k);
				p += 4;
				// then store count
				//*(int32_t *)p = ft->getVal32FromSlot(k); p += 4;
				// now this has a docid on it so we can
				// lookup the text of the facet in Msg40.cpp
				FacetEntry *fe;
				fe = (FacetEntry *)ft->getValFromSlot(k);
				// sanity
				// no, count can be zero if its a range facet
				// that was never added to. we add those
				// empty FacetEntries only for range facets
				// in Posdb.cpp
				//if(fe->m_count == 0 ) { char *xx=NULL;*xx=0;}
				gbmemcpy ( p , fe , sizeof(FacetEntry) );
				p += sizeof(FacetEntry);
				// do not breach
				if ( ++count >= (int32_t)MAX_FACETS ) break;
			}
			// sanity check
			if ( p != pend ) { char *xx=NULL;*xx=0; }
			// do the next query term
		}
		// now point to that so it can be serialized below
		mr.ptr_facetHashList  = tmp.getBufStart();
		mr.size_facetHashList = p - tmp.getBufStart();//tmp.length();

		/////////////
		//
		// END FACETS
		//
		/////////////

		// . that is pretty much it, so serialize it into buffer, "reply"
		// . mr.ptr_docIds, etc., will point into the buffer so we can
		//   re-serialize into it below from the tree
		// . returns NULL and sets g_errno on error
		// . "true" means we should make mr.ptr_* reference into the
		//   newly serialized buffer.
		reply = serializeMsg ( sizeof(Msg39Reply),   // baseSize
		                       &mr.size_docIds,      // firstSizeParm
		                       &mr.size_clusterRecs, // lastSizeParm
		                       &mr.ptr_docIds,       // firstStrPtr
		                       &mr,                  // thisPtr
		                       &replySize,
		                       NULL,
		                       0,
		                       true );
		if ( ! reply ) {
			log("query: Could not allocate memory "
			    "to hold reply of docids to send back.");
			sendReply(m_slot,this,NULL,0,0,true);
			return;
		}
		topDocIds = (int64_t *) mr.ptr_docIds;
		topScores = (double  *) mr.ptr_scores;
		topRecs   = (key_t   *) mr.ptr_clusterRecs;
	}

	int32_t docCount = 0;
	// loop over all results in the TopTree
	for ( int32_t ti = m_tt.getHighNode() ; ti >= 0 ;
	      ti = m_tt.getPrev(ti) ) {
		// get the guy
		TopNode *t = &m_tt.m_nodes[ti];
		// skip if clusterLevel is bad!
		if ( m_gotClusterRecs && t->m_clusterLevel != CR_OK )
			continue;
		// if not sending back a reply... we were called from seo.cpp
		// State3f logic to evaluate a QueryLogEntry, etc.
		if ( m_callback ) {
			// skip results past #50
			if ( docCount > 50 ) continue;
			// set this
			m_topScore50 = t->m_score;
			m_topDocId50 = t->m_docId;
			// that's it
			continue;
		}
		// get the docid ptr
		//char *diptr = t->m_docIdPtr;
		//int64_t docId = getDocIdFromPtr(diptr);
		// sanity check
		if ( t->m_docId < 0 ) { char *xx=NULL; *xx=0; }
		// add it to the reply
		topDocIds [docCount] = t->m_docId;
		topScores [docCount] = t->m_score;
		if ( m_tt.m_useIntScores )
			topScores[docCount] = (double)t->m_intScore;
		// supply clusterdb rec? only for full splits
		if ( m_gotClusterRecs )
			topRecs [docCount] = t->m_clusterRec;
		//topExplicits [docCount] =
		//	getNumBitsOn(t->m_explicits);
		docCount++;
		// 50th score? set this for seo.cpp. if less than 50 results
		// we want the score of the last doc then.
		if ( docCount <= 50 ) m_topScore50 = t->m_score;
		if ( m_debug ) {
			logf(LOG_DEBUG,"query: msg39: [%"PTRFMT"] "
			     "%03"INT32") docId=%012"UINT64" sum=%.02f",
			     (PTRTYPE)this, docCount,
			     t->m_docId, t->m_score);
		}
		// don't send more than the docs that are wanted
		if ( docCount >= numDocIds ) break;
	}

	if ( docCount > 300 && m_debug )
		log("query: Had %"INT32" nodes in top tree",docCount);

	// this is sensitive info
	if ( m_debug ) {
		log(LOG_DEBUG,
		    "query: msg39: [%"PTRFMT"] "
		    "Intersected lists took %"INT64" (%"INT64") ms "
		    "docIdsToGet=%"INT32" docIdsGot=%"INT32" "
		    "q=%s",
		    (PTRTYPE)this,
		    m_posdbTable.m_addListsTime,
		    gettimeofdayInMilliseconds() - m_startTime,
		    m_r->m_docsToGet,
		    numDocIds,
		    m_tmpq.getQuery() );
	}

	// if we blocked because we used a thread then call callback if
	// summoned from a msg3f handler and not a msg39 handler
	if ( m_callback ) {
		// if we blocked call user callback
		if ( m_blocked ) m_callback ( m_state );
		// if not sending back a udp reply, return now
		return;
	}

	// now send back the reply
	sendReply(m_slot,this,reply,replySize,replySize,false);
	return;
}
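// A decoding sketch documenting the facet wire format written above (the
// "exampleScanFacetList" helper is hypothetical, not part of the original
// source; Msg40.cpp is the real consumer of this buffer). The list is a
// sequence of per-term records: a 64-bit query term id, a 32-bit bucket
// count, then count pairs of (32-bit facet value hash, FacetEntry).
static void exampleScanFacetList ( char *p , char *pend ) {
	while ( p < pend ) {
		// 8-byte query term id (e.g. of gbfacet:xpathsitehash12345)
		int64_t termId = *(int64_t *)p; p += 8;
		// 4-byte count of buckets that follow
		int32_t used   = *(int32_t *)p; p += 4;
		for ( int32_t k = 0 ; k < used ; k++ ) {
			// 4-byte hash of the facet value
			int32_t valHash = *(int32_t *)p; p += 4;
			// FacetEntry holds the count and a docid for
			// looking up the facet text later
			FacetEntry *fe = (FacetEntry *)p;
			p += sizeof(FacetEntry);
			// m_count may be 0 for range facets never added to
			logf(LOG_DEBUG,"query: termid=%"INT64" "
			     "facethash=%"INT32" count=%"INT32,
			     termId, valHash, (int32_t)fe->m_count);
		}
	}
}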
bool Msg3a::gotCacheReply ( ) {

	// in cache?
	if ( ! m_seoCacheList.isEmpty() ) {
		// note it
		//log("seopipe: found ckey=%s q=%s"
		//    ,KEYSTR(&m_ckey,12)
		//    ,m_r->ptr_query
		//    );
		char *p = m_seoCacheList.getList();
		// skip key
		p += sizeof(key_t);
		// datasize
		p += 4;
		// timestamp
		//long cachedTime = *(long *)p;
		p += 4;
		// # docids
		m_numDocIds = *(long *)p;
		p += 4;
		// total # results
		m_numTotalEstimatedHits = *(long *)p;
		p += 4;
		// docids
		m_docIds = (long long *)p;
		p += 8 * m_numDocIds;
		// scores
		m_scores = (float *)p;
		p += sizeof(float) * m_numDocIds;
		// site hashes
		m_siteHashes26 = (long *)p;
		p += 4 * m_numDocIds;
		// log to log as well
		char tmp[50000];
		p = tmp;
		p += sprintf(p,
		             "seopipe: hit cache "
		             "docids=%li "
		             "query=\"%s\" ",
		             m_numDocIds,
		             m_r->ptr_query );
		// log each docid
		//for ( long i = 0 ; i < m_numDocIds ; i++ ) {
		//	//float score = m_msg3a->getScores()[i];
		//	long long d = m_docIds[i];
		//	//long sh32 = m_msg3a->getSiteHash32(i);
		//	p += sprintf(p,"d%li=%lli ",i,d);
		//}
		log("%s",tmp);
		// all done!
		return true;
	}

	CollectionRec *cr;
	cr = g_collectiondb.getRec(m_r->ptr_coll,m_r->size_coll-1);

	setTermFreqWeights ( cr->m_coll, m_q, m_termFreqs, m_termFreqWeights );

	if ( m_debug ) {
		//long long *termIds = m_q->getTermIds();
		//if ( m_numCandidates ) termIds = m_synIds;
		for ( long i = 0 ; i < m_q->m_numTerms ; i++ ) {
			// get the term in utf8
			QueryTerm *qt = &m_q->m_qterms[i];
			//char bb[256];
			//utf16ToUtf8(bb, 256, qt->m_term, qt->m_termLen);
			// NUL-terminate the term in place for logging
			char *tpc = qt->m_term + qt->m_termLen;
			char  c   = *tpc;
			*tpc = 0;
			// this term freq is estimated from the rdbmap and
			// does not hit disk...
			logf(LOG_DEBUG,"query: term #%li \"%s\" "
			     "termid=%lli termFreq=%lli termFreqWeight=%.03f",
			     i,
			     qt->m_term,
			     qt->m_termId,
			     m_termFreqs[i],
			     m_termFreqWeights[i]);
			// put it back
			*tpc = c;
		}
	}

	// time how long to get each split's docids
	if ( m_debug ) m_startTime = gettimeofdayInMilliseconds();

	// reset replies received count
	m_numReplies = 0;

	// shortcut
	long n = m_q->m_numTerms;

	/////////////////////////////
	//
	// set the Msg39 request
	//
	/////////////////////////////

	// free if we should
	if ( m_rbufPtr && m_rbufPtr != m_rbuf ) {
		mfree ( m_rbufPtr , m_rbufSize , "Msg3a" );
		m_rbufPtr = NULL;
	}

	// a tmp buf
	long readSizes[MAX_QUERY_TERMS];
	// update our read info
	for ( long j = 0 ; j < n ; j++ ) {
		// the read size for THIS query term
		long rs = 300000000; // toRead; 300MB i guess...
		// limit to 50MB man! this was 30MB but the
		// 'time enough for love' query was hitting 30MB termlists.
		//rs = 50000000;
		rs = DEFAULT_POSDB_READSIZE; //90000000; // 90MB!
		// if section stats, limit to 1MB
		if ( m_r->m_getSectionStats ) rs = 1000000;
		// get the jth query term
		QueryTerm *qt = &m_q->m_qterms[j];
		// if query term is ignored, skip it
		if ( qt->m_ignored ) rs = 0;
		// set it
		readSizes[j] = rs;
	}

	// serialize this
	m_r->ptr_readSizes  = (char *)readSizes;
	m_r->size_readSizes = 4 * n;
	// and this
	m_r->ptr_termFreqWeights  = (char *)m_termFreqWeights;
	m_r->size_termFreqWeights = 4 * n;
	// store query into request, might have changed since we called
	// Query::expandQuery() above
	m_r->ptr_query  = m_q->m_orig;
	m_r->size_query = m_q->m_origLen+1;

	// free us?
	if ( m_rbufPtr && m_rbufPtr != m_rbuf ) {
		mfree ( m_rbufPtr , m_rbufSize , "Msg3a" );
		m_rbufPtr = NULL;
	}

	m_r->m_stripe = 0;

	// debug thing
	g_r = m_r;

	// . (re)serialize the request
	// . returns NULL and sets g_errno on error
	// . "m_rbuf" is a local storage space that can save a malloc
	// . do not "makePtrsRefNewBuf" because if we do that and this gets
	//   called a 2nd time because m_getWeights got set to 0, then we
	//   end up copying over ourselves.
	m_rbufPtr = serializeMsg ( sizeof(Msg39Request),
	                           &m_r->size_readSizes,
	                           &m_r->size_coll,
	                           &m_r->ptr_readSizes,
	                           m_r,
	                           &m_rbufSize,
	                           m_rbuf,
	                           RBUF_SIZE,
	                           false );
	if ( ! m_rbufPtr ) return true;

	// free this one too
	m_rbuf2.purge();
	// and copy that!
	if ( ! m_rbuf2.safeMemcpy ( m_rbufPtr , m_rbufSize ) ) return true;
	// and tweak it
	((Msg39Request *)(m_rbuf2.getBufStart()))->m_stripe = 1;

	/////////////////////////////
	//
	// end formulating the Msg39 request
	//
	/////////////////////////////

	// . set timeout based on docids requested!
	// . the more docs requested the longer it will take to get
	long timeout = (50 * m_docsToGet) / 1000;
	// at least 20 seconds
	if ( timeout < 20 ) timeout = 20;
	// override? this is USUALLY -1, but DupDetector.cpp needs it
	// high because it is a spider time thing.
	if ( m_r->m_timeout > 0 ) timeout = m_r->m_timeout;
	// for new posdb stuff
	if ( timeout < 60 ) timeout = 60;

	long long qh = 0LL;
	if ( m_q ) qh = m_q->getQueryHash();

	m_numHosts = g_hostdb.getNumHosts();
	// only send to one host?
	if ( ! m_q->isSplit() ) m_numHosts = 1;

	// now we run it over ALL hosts that are up!
	for ( long i = 0 ; i < m_numHosts ; i++ ) { // m_indexdbSplit; i++ ) {
		// get that host
		Host *h = g_hostdb.getHost(i);
		// if not a full split, just round robin the group, i am not
		// going to sweat over performance on non-fully split indexes
		// because they suck really bad anyway compared to full
		// split indexes. "gid" is already set if we are not split.
		unsigned long gid = h->m_groupId; //g_hostdb.getGroupId(i);
		long firstHostId = h->m_hostId;
		// get stripe num
		char *req = m_rbufPtr;
		// if sending to twin, use slightly different request
		if ( h->m_stripe == 1 ) req = m_rbuf2.getBufStart();
		// if we are a non-split query, like gbdom:xyz.com just send
		// to the host that has the first termid local. it will call
		// msg2 to download all termlists. msg2 should be smart
		// enough to download the "non split" termlists over the net.
		// TODO: fix msg2 to do that...
		if ( ! m_q->isSplit() ) {
			long long tid = m_q->getTermId(0);
			// split = false! do not split
			key_t k = g_indexdb.makeKey(tid,1,1,false);
			gid = getGroupId ( RDB_POSDB, &k, false );
			firstHostId = -1;
		}
		// debug log
		if ( m_debug )
			logf(LOG_DEBUG,"query: Msg3a[%lu]: forwarding request "
			     "of query=%s to groupid 0x%lx.",
			     (long)this,
			     m_q->getQuery(),
			     gid);
		// send to this guy
		Multicast *m = &m_mcast[i];
		// clear it for transmit
		m->reset();
		// . send out a msg39 request to each split
		// . multicasts to a host in group "groupId"
		// . we always block waiting for the reply with a multicast
		// . returns false and sets g_errno on error
		// . sends the request to fastest host in group "groupId"
		// . if that host takes more than about 5 secs then sends to
		//   next host
		// . key should be largest termId in group we're sending to
		bool status;
		status = m->send ( req ,         // m_rbufPtr
		                   m_rbufSize ,  // request size
		                   0x39 ,        // msgType 0x39
		                   false ,       // mcast owns m_request?
		                   gid ,         // group to send to
		                   false ,       // send to whole group?
		                   (long)qh ,    // 0 // startKey.n1
		                   this ,        // state1 data
		                   m ,           // state2 data
		                   gotReplyWrapper3a ,
		                   timeout ,     // in seconds
		                   m_r->m_niceness ,
		                   false ,       // realtime?
		                   firstHostId , // -1 // bestHandlingHostId
		                   NULL ,        // m_replyBuf
		                   0 ,           // MSG39REPLYSIZE
		                   // this is true if multicast should free the
		                   // reply, otherwise caller is responsible
		                   // for freeing it after calling
		                   // getBestReply().
		                   // actually, this should always be false,
		                   // there is a bug in Multicast.cpp.
		                   // no, if we error out and never steal
		                   // the buffers then they will go unfreed
		                   // so they are freed by multicast by default
		                   // then we steal control explicitly
		                   true );
		// if successfully launched, do the next one
		if ( status ) continue;
		// . this serious error should make the whole query fail
		// . must allow other replies to come in though, so keep going
		m_numReplies++;
		log("query: Multicast Msg3a had error: %s",mstrerror(g_errno));
		m_errno = g_errno;
		g_errno = 0;
	}

	// return false if blocked on a reply
	if ( m_numReplies < m_numHosts ) return false; //indexdbSplit
	// . otherwise, we did not block... error?
	// . it must have been an error or just no new lists available!!
	// . if we call gotAllSplitReplies() here, and we were called by
	//   mergeLists() we end up calling mergeLists() again... bad. so
	//   just return true in that case.
	//return gotAllSplitReplies();
	return true;
}
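// For reference, a sketch (the "exampleCacheRecSize" helper is
// hypothetical, not part of the original source) of the seo cache record
// layout that gotCacheReply() parses above, useful when computing how
// much space a cached reply occupies:
//
//   key_t  cache key              sizeof(key_t) bytes
//   long   dataSize               4 bytes
//   long   cachedTime             4 bytes
//   long   numDocIds              4 bytes
//   long   numTotalEstimatedHits  4 bytes
//   docids                        8 * numDocIds bytes (long long)
//   scores                        4 * numDocIds bytes (float)
//   siteHashes26                  4 * numDocIds bytes (long)
static long exampleCacheRecSize ( long numDocIds ) {
	return sizeof(key_t) + 4 + 4 + 4 + 4
	     + 8 * numDocIds   // docids
	     + 4 * numDocIds   // scores
	     + 4 * numDocIds;  // site hashes
}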
void Msg39::estimateHits ( ) {

	// no longer in use
	m_inUse = false;

	// now this for the query loop on the QueryLogEntries.
	m_topDocId50 = 0LL;
	m_topScore50 = 0.0;

	// a little hack for the seo pipeline in xmldoc.cpp
	m_topDocId  = 0LL;
	m_topScore  = 0.0;
	m_topDocId2 = 0LL;
	m_topScore2 = 0.0;

	long ti = m_tt.getHighNode();
	if ( ti >= 0 ) {
		TopNode *t = &m_tt.m_nodes[ti];
		m_topDocId = t->m_docId;
		m_topScore = t->m_score;
	}

	// try the 2nd one too
	long ti2 = -1;
	if ( ti >= 0 ) ti2 = m_tt.getNext ( ti );
	if ( ti2 >= 0 ) {
		TopNode *t2 = &m_tt.m_nodes[ti2];
		m_topDocId2 = t2->m_docId;
		m_topScore2 = t2->m_score;
	}

	// convenience ptrs. we will store the docids/scores into these arrays
	long long *topDocIds;
	float     *topScores;
	key_t     *topRecs;

	// numDocIds counts docs in all tiers when using toptree.
	long numDocIds = m_tt.m_numUsedNodes;

	// the msg39 reply we send back
	long  replySize;
	char *reply;

	//m_numTotalHits = m_posdbTable.m_docIdVoteBuf.length() / 6;

	// make the reply?
	Msg39Reply mr;

	if ( ! m_callback ) {
		// if we got clusterdb recs in here, use 'em
		if ( m_gotClusterRecs ) numDocIds = m_numVisible;

		// don't send more than the docs that are asked for
		if ( numDocIds > m_r->m_docsToGet ) numDocIds = m_r->m_docsToGet;

		// # of QueryTerms in query
		long nqt = m_tmpq.m_numTerms;

		// start setting the stuff
		mr.m_numDocIds = numDocIds;

		// copy # estimated hits into 8 bytes of reply
		//long long est = m_posdbTable.m_estimatedTotalHits;
		// ensure it has at least as many results as we got
		//if ( est < numDocIds ) est = numDocIds;
		// or if too big...
		//if ( numDocIds < m_r->m_docsToGet ) est = numDocIds;

		// . total estimated hits
		// . this is now an EXACT count!
		mr.m_estimatedHits = m_numTotalHits;

		// sanity check
		mr.m_nqt = nqt;

		// the m_errno if any
		mr.m_errno = m_errno;

		// shortcut
		PosdbTable *pt = &m_posdbTable;

		// the score info, in no particular order right now
		mr.ptr_scoreInfo  = pt->m_scoreInfoBuf.getBufStart();
		mr.size_scoreInfo = pt->m_scoreInfoBuf.length();

		// that has offset references into posdbtable::m_pairScoreBuf
		// and m_singleScoreBuf, so we need those too now
		mr.ptr_pairScoreBuf    = pt->m_pairScoreBuf.getBufStart();
		mr.size_pairScoreBuf   = pt->m_pairScoreBuf.length();
		mr.ptr_singleScoreBuf  = pt->m_singleScoreBuf.getBufStart();
		mr.size_singleScoreBuf = pt->m_singleScoreBuf.length();

		// save some time since seo.cpp gets from posdbtable directly,
		// so we can avoid serializing/copying this stuff at least
		if ( ! m_r->m_makeReply ) {
			mr.size_scoreInfo      = 0;
			mr.size_pairScoreBuf   = 0;
			mr.size_singleScoreBuf = 0;
		}

		// and now the sitehash list if it exists
		mr.ptr_siteHashList  = pt->m_siteHashList.getBufStart();
		mr.size_siteHashList = pt->m_siteHashList.length();

		mr.m_sectionStats = pt->m_sectionStats;

		// reserve space for these guys, we fill them in below
		mr.ptr_docIds      = NULL;
		mr.ptr_scores      = NULL;
		mr.ptr_clusterRecs = NULL;
		// this is how much space to reserve
		mr.size_docIds = 8 * numDocIds; // long long
		mr.size_scores = 4 * numDocIds; // float
		// if not doing site clustering, we won't have these perhaps...
		if ( m_gotClusterRecs )
			mr.size_clusterRecs = sizeof(key_t) * numDocIds;
		else
			mr.size_clusterRecs = 0;

		// . that is pretty much it, so serialize it into buffer, "reply"
		// . mr.ptr_docIds, etc., will point into the buffer so we can
		//   re-serialize into it below from the tree
		// . returns NULL and sets g_errno on error
		// . "true" means we should make mr.ptr_* reference into the
		//   newly serialized buffer.
		reply = serializeMsg ( sizeof(Msg39Reply),   // baseSize
		                       &mr.size_docIds,      // firstSizeParm
		                       &mr.size_clusterRecs, // lastSizeParm
		                       &mr.ptr_docIds,       // firstStrPtr
		                       &mr,                  // thisPtr
		                       &replySize,
		                       NULL,
		                       0,
		                       true );
		if ( ! reply ) {
			log("query: Could not allocate memory "
			    "to hold reply of docids to send back.");
			sendReply(m_slot,this,NULL,0,0,true);
			return;
		}
		topDocIds = (long long *) mr.ptr_docIds;
		topScores = (float     *) mr.ptr_scores;
		topRecs   = (key_t     *) mr.ptr_clusterRecs;
	}

	long docCount = 0;
	// loop over all results in the TopTree
	for ( long ti = m_tt.getHighNode() ; ti >= 0 ;
	      ti = m_tt.getPrev(ti) ) {
		// get the guy
		TopNode *t = &m_tt.m_nodes[ti];
		// skip if clusterLevel is bad!
		if ( m_gotClusterRecs && t->m_clusterLevel != CR_OK )
			continue;
		// if not sending back a reply... we were called from seo.cpp
		// State3f logic to evaluate a QueryLogEntry, etc.
		if ( m_callback ) {
			// skip results past #50
			if ( docCount > 50 ) continue;
			// set this
			m_topScore50 = t->m_score;
			m_topDocId50 = t->m_docId;
			// that's it
			continue;
		}
		// get the docid ptr
		//char *diptr = t->m_docIdPtr;
		//long long docId = getDocIdFromPtr(diptr);
		// sanity check
		if ( t->m_docId < 0 ) { char *xx=NULL; *xx=0; }
		// add it to the reply
		topDocIds [docCount] = t->m_docId;
		topScores [docCount] = t->m_score;
		// supply clusterdb rec? only for full splits
		if ( m_gotClusterRecs )
			topRecs [docCount] = t->m_clusterRec;
		//topExplicits [docCount] =
		//	getNumBitsOn(t->m_explicits);
		docCount++;
		// 50th score? set this for seo.cpp. if less than 50 results
		// we want the score of the last doc then.
		if ( docCount <= 50 ) m_topScore50 = t->m_score;
		if ( m_debug ) {
			log(LOG_DEBUG,"query: msg39: [%lu] "
			    "%03li) docId=%012llu sum=%.02f",
			    (long)this, docCount,
			    t->m_docId, t->m_score);
		}
		// don't send more than the docs that are wanted
		if ( docCount >= numDocIds ) break;
	}

	if ( docCount > 300 && m_debug )
		log("query: Had %li nodes in top tree",docCount);

	// this is sensitive info
	if ( m_debug ) {
		log(LOG_DEBUG,
		    "query: msg39: [%li] Intersected lists took %lli (%lli) ms "
		    "docIdsToGet=%li docIdsGot=%li "
		    "q=%s",
		    (long)this,
		    m_posdbTable.m_addListsTime,
		    gettimeofdayInMilliseconds() - m_startTime,
		    m_r->m_docsToGet,
		    numDocIds,
		    m_tmpq.getQuery() );
	}

	// if we blocked because we used a thread then call callback if
	// summoned from a msg3f handler and not a msg39 handler
	if ( m_callback ) {
		// if we blocked call user callback
		if ( m_blocked ) m_callback ( m_state );
		// if not sending back a udp reply, return now
		return;
	}

	// now send back the reply
	sendReply(m_slot,this,reply,replySize,replySize,false);
	return;
}
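// A consumer-side sketch (the "exampleScanReply" helper is hypothetical,
// not part of the original source): once a Msg39Reply has been
// deserialized so its ptr_* members point back into the reply buffer, the
// docid and score arrays read back exactly as the two functions above
// wrote them. Note that estimateHits() packs 4-byte float scores while
// estimateHitsAndSendReply() packs 8-byte doubles, so a reader must match
// the version of the writer; this sketch assumes the float layout.
static void exampleScanReply ( Msg39Reply *mr ) {
	long long *docIds = (long long *)mr->ptr_docIds;
	float     *scores = (float     *)mr->ptr_scores;
	for ( long i = 0 ; i < mr->m_numDocIds ; i++ )
		logf(LOG_DEBUG,"query: result #%li docid=%lli score=%f",
		     i, docIds[i], scores[i]);
}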