// return false if blocked, true otherwise bool Msg39::addedLists ( ) { if ( m_posdbTable.m_t1 ) { // . measure time to add the lists in bright green // . use darker green if rat is false (default OR) long color; //char *label; color = 0x0000ff00 ; //label = "termlist_intersect"; g_stats.addStat_r ( 0 , m_posdbTable.m_t1 , m_posdbTable.m_t2 , color ); } // accumulate total hits count over each docid split m_numTotalHits += m_posdbTable.m_docIdVoteBuf.length() / 6; // before wrapping up, complete our docid split loops! // so do not send the reply back yet... send reply back from // the docid loop function... doDocIdSplitLoop() if ( m_numDocIdSplits >= 2 ) return true; // . save some memory,free m_topDocIdPtrs2,m_topScores2,m_topExplicits2 // . the m_topTree should have been filled from the call to // IndexTable2::fillTopDocIds() and it no longer has ptrs to the // docIds, but has the docIds themselves //m_posdbTable.freeMem(); // error? if ( m_posdbTable.m_errno ) { // we do not need to store the intersection i guess...?? m_posdbTable.freeMem(); g_errno = m_posdbTable.m_errno; log("query: posdbtable had error = %s",mstrerror(g_errno)); sendReply ( m_slot , this , NULL , 0 , 0 ,true); return true; } // should we put cluster recs in the tree? //m_gotClusterRecs = ( g_conf.m_fullSplit && m_r->m_doSiteClustering ); m_gotClusterRecs = ( m_r->m_doSiteClustering ); // . before we send the top docids back, lookup their site hashes // in clusterdb so we can do filtering at this point. // BUT only do this if we are in a "full split" config, because that // way we can guarantee all clusterdb recs are local (on this host) // and should be in the page cache. the page cache should do ultra // quick lookups and no memcpy()'s for this operation. it should // be <<1ms to lookup thousands of docids. // . 
when doing innerLoopSiteClustering we always use top tree now // because our number of "top docids" can be somewhat unpredictably // large due to having a ton of results with the same "domain hash" // (see the "vcount" in IndexTable2.cpp) // . do NOT do if we are just "getting weights", phr and aff weights if ( m_gotClusterRecs ) { // . set the clusterdb recs in the top tree return setClusterRecs ( ) ; } // if we did not call setClusterRecs, go on to estimate the hits estimateHits(); return true; }
// . returns false if blocks true otherwise
// . phase-driven state machine so we can resume after any async step:
//   1. read all termlists for docid range
//   2. intersect termlists to get the intersecting docids
//   3. increment docid ranges and keep going
//   4. when done return the top docids
bool Msg39::controlLoop ( ) {
	// re-entry point for each successive docid range
 loop:
	// bail out and reply if any prior step left an error
	if ( g_errno ) {
	hadError:
		log(LOG_LOGIC,"query: msg39: controlLoop: %s." ,
		    mstrerror(g_errno) );
		sendReply ( m_slot , this , NULL , 0 , 0 , true );
		return true;
	}
	// phase 0: compute this iteration's docid range and load termlists
	if ( m_phase == 0 ) {
		// next phase
		m_phase++;
		// the starting docid...
		int64_t d0 = m_ddd;
		// shortcut: width of each docid range
		int64_t delta = MAX_DOCID / (int64_t)m_r->m_numDocIdSplits;
		// advance to point to the exclusive endpoint
		m_ddd += delta;
		// ensure this is exclusive of ddd since it will be
		// inclusive in the following iteration.
		int64_t d1 = m_ddd;
		// fix rounding errors: clamp the final range to MAX_DOCID
		if ( d1 + 20LL > MAX_DOCID ) {
			d1    = MAX_DOCID;
			m_ddd = MAX_DOCID;
		}
		// fix it: tell the request which docid window to use
		m_r->m_minDocId = d0;
		m_r->m_maxDocId = d1; // -1; // exclude d1
		// allow posdbtable re-initialization each time to set
		// the msg2 termlist ptrs anew, otherwise we core in
		// call to PosdbTable::init() below
		//m_posdbTable.m_initialized = false;
		// reset ourselves, partially, anyway, not tmpq etc.
		reset2();
		// debug log
		if ( ! m_r->m_forSectionStats && m_debug )
			log("msg39: docid split phase %"INT64"-%"INT64"",
			    d0,d1);
		// wtf?
		//if ( d0 >= d1 ) break;
		// load termlists for these docid ranges using msg2 from
		// posdb. returns false if it blocks.
		if ( ! getLists() ) return false;
	}
	// phase 1: intersect the loaded termlists
	if ( m_phase == 1 ) {
		m_phase++;
		// intersect the lists we loaded using a thread.
		// returns false if it blocks.
		if ( ! intersectLists() ) return false;
		// error?
		if ( g_errno ) goto hadError;
	}
	// phase 2: sum up some stats, then loop back for the next range
	if ( m_phase == 2 ) {
		m_phase++;
		if ( m_posdbTable.m_t1 ) {
			// . measure time to add the lists in bright green
			// . use darker green if rat is false (default OR)
			int32_t color;
			//char *label;
			color = 0x0000ff00 ;
			//label = "termlist_intersect";
			g_stats.addStat_r ( 0 ,
					    m_posdbTable.m_t1 ,
					    m_posdbTable.m_t2 ,
					    color );
		}
		// accumulate total hits count over each docid split
		// (6 bytes per entry in the vote buffer)
		m_numTotalHits += m_posdbTable.m_docIdVoteBuf.length() / 6;
		// minus the shit we filtered out because of gbminint/gbmaxint/
		// gbmin/gbmax/gbsortby/gbrevsortby/gbsortbyint/gbrevsortbyint
		m_numTotalHits -= m_posdbTable.m_filtered;
		// error?
		if ( m_posdbTable.m_errno ) {
			// we do not need to store the intersection i guess..??
			m_posdbTable.freeMem();
			g_errno = m_posdbTable.m_errno;
			log("query: posdbtable had error = %s",
			    mstrerror(g_errno));
			sendReply ( m_slot , this , NULL , 0 , 0 ,true);
			return true;
		}
		// if we have more docid ranges remaining do more
		if ( m_ddd < m_dddEnd ) {
			m_phase = 0;
			goto loop;
		}
	}
	// phase 3: ok, we are done, get cluster recs of the winning docids
	if ( m_phase == 3 ) {
		m_phase++;
		// . this loads them using msg51 from clusterdb
		// . if m_r->m_doSiteClustering is false it just returns true
		// . this sets m_gotClusterRecs to true if we get them
		// . returns false if it blocks
		if ( ! setClusterRecs ( ) ) return false;
		// error setting clusterrecs?
		if ( g_errno ) goto hadError;
	}
	// process the cluster recs if we got them
	if ( m_gotClusterRecs && ! gotClusterRecs() ) goto hadError;
	// . all done! set stats and send back reply
	// . only sends back the cluster recs if m_gotClusterRecs is true
	estimateHitsAndSendReply();
	return true;
}
// . returns false if blocked, true if done // . to avoid running out of memory, generate the search results for // multiple smaller docid-ranges, one range at a time. bool Msg39::doDocIdSplitLoop ( ) { long long delta = MAX_DOCID / (long long)m_numDocIdSplits; for ( ; m_ddd < m_dddEnd ; ) { // the starting docid... long long d0 = m_ddd; // advance to point to the exclusive endpoint m_ddd += delta; // ensure this is exclusive of ddd since it will be // inclusive in the following iteration. long long d1 = m_ddd; // fix rounding errors if ( d1 + 20LL > MAX_DOCID ) { d1 = MAX_DOCID; m_ddd = MAX_DOCID; } // fix it m_r->m_minDocId = d0; m_r->m_maxDocId = d1; // -1; // exclude d1 // allow posdbtable re-initialization each time to set // the msg2 termlist ptrs anew, otherwise we core in // call to PosdbTable::init() below //m_posdbTable.m_initialized = false; // reset ourselves, partially, anyway, not tmpq etc. reset2(); // debug log log("msg39: docid split phase %lli-%lli",d0,d1); // wtf? if ( d0 >= d1 ) break; // use this //m_debug = true; //log("call1"); // . get the lists // . i think this always should block! // . it will also intersect the termlists to get the search // results and accumulate the winners into the "tree" if ( ! getLists() ) return false; //log("call2 g_errno=%li",(long)g_errno); // if there was an error, stop! if ( g_errno ) break; } // return error reply if we had an error if ( g_errno ) { log("msg39: Had error3: %s.", mstrerror(g_errno)); sendReply (m_slot,this,NULL,0,0 , true); return true; } if ( m_debug ) log("msg39: done with all docid range splits"); // all done. this will send reply back //estimateHits(); //addedLists(); // should we put cluster recs in the tree? //m_gotClusterRecs = ( g_conf.m_fullSplit && m_r->m_doSiteClustering ); m_gotClusterRecs = ( m_r->m_doSiteClustering ); // . before we send the top docids back, lookup their site hashes // in clusterdb so we can do filtering at this point. 
// BUT only do this if we are in a "full split" config, because that // way we can guarantee all clusterdb recs are local (on this host) // and should be in the page cache. the page cache should do ultra // quick lookups and no memcpy()'s for this operation. it should // be <<1ms to lookup thousands of docids. // . when doing innerLoopSiteClustering we always use top tree now // because our number of "top docids" can be somewhat unpredictably // large due to having a ton of results with the same "domain hash" // (see the "vcount" in IndexTable2.cpp) // . do NOT do if we are just "getting weights", phr and aff weights if ( m_gotClusterRecs ) { // . set the clusterdb recs in the top tree // . this calls estimateHits() in its reply wrapper when done return setClusterRecs ( ) ; } // if we did not call setClusterRecs, go on to estimate the hits estimateHits(); // no block, we are done return true; }