bool Titledb::isLocal ( int64_t docId ) {
	// shift it up (64 minus 38) bits so we can mask it
	//key96_t key = makeTitleRecKey ( docId , false /*isDelKey?*/ );
	// mask upper bits of the top 4 bytes
	//return ( getGroupIdFromDocId ( docId ) == g_hostdb.m_groupId ) ;
	return ( getShardNumFromDocId(docId) == getMyShardNum() );
}
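// Illustrative sketch (added; not part of the original source): isLocal()
// lets a caller decide whether the titlerec for a docId lives on this host's
// shard or has to be fetched over the network. The helpers below are
// hypothetical and only show the intended branch; the real paths are a local
// Msg5 read versus a remote Msg0 request.
//
//	if ( g_titledb.isLocal ( docId ) )
//		readTitleRecLocally   ( docId ); // hypothetical helper (Msg5)
//	else
//		requestTitleRecRemote ( docId ); // hypothetical helper (Msg0)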
/*
bool Monitordb::addColl ( char *coll, bool doVerify ) {
	if ( ! m_rdb.addColl ( coll ) ) return false;
	if ( ! doVerify ) return true;
	// verify
	if ( verify(coll) ) return true;
	// if not allowing scale, return false
	if ( ! g_conf.m_allowScale ) return false;
	// otherwise let it go
	log ( "db: Verify failed, but scaling is allowed, passing." );
	return true;
}
*/
bool Monitordb::verify ( char *coll ) {
	log ( LOG_INFO, "db: Verifying Monitordb for coll %s...", coll );
	g_threads.disableThreads();

	Msg5 msg5;
	Msg5 msg5b;
	RdbList list;
	key224_t startKey;
	key224_t endKey;
	startKey.setMin();
	endKey.setMax();
	long minRecSizes = 64000;
	CollectionRec *cr = g_collectiondb.getRec(coll);

	if ( ! msg5.getList ( RDB_MONITORDB   ,
			      cr->m_collnum,
			      &list         ,
			      (char*)&startKey      ,
			      (char*)&endKey        ,
			      minRecSizes   ,
			      true          , // includeTree   ,
			      false         , // add to cache?
			      0             , // max cache age
			      0             , // startFileNum  ,
			      -1            , // numFiles      ,
			      NULL          , // state
			      NULL          , // callback
			      0             , // niceness
			      false         , // err correction?
			      NULL          ,
			      0             ,
			      -1            ,
			      true          ,
			      -1LL          ,
			      &msg5b        ,
			      true          )) {
		g_threads.enableThreads();
		return log("db: HEY! it did not block");
	}

	long count = 0;
	long got   = 0;
	for ( list.resetListPtr() ; ! list.isExhausted() ;
	      list.skipCurrentRecord() ) {
		key224_t k;
		list.getCurrentKey((char*)&k);
		count++;
		uint32_t shardNum = getShardNum ( RDB_MONITORDB , &k );
		if ( shardNum == getMyShardNum() ) got++;
	}
	if ( got != count ) {
		log ("db: Out of first %li records in Monitordb , "
		     "only %li belong to our group.",count,got);

		/*
		// repeat with log
		for ( list.resetListPtr() ; ! list.isExhausted() ;
		      list.skipCurrentRecord() ) {

			key224_t k;
			list.getCurrentKey((char*)&k);
			uint32_t shardNum = getShardNum ( RDB_MONITORDB , &k );
			long groupNum = g_hostdb.getGroupNum(groupId);
			unsigned long sh32 ;
			sh32 = g_monitordb.getLinkeeSiteHash32_uk(&k);
			uint16_t sh16 = sh32 >> 19;
			log("db: sh16=0x%lx group=%li",
			    (long)sh16,groupNum);
		}
		*/


		// exit if NONE, we probably got the wrong data
		if ( got == 0 ) log("db: Are you sure you have the "
				    "right "
				    "data in the right directory? "
				    "Exiting.");
		log ( "db: Exiting due to inconsistency.");
		g_threads.enableThreads();
		return g_conf.m_bypassValidation;
	}
	log ( LOG_INFO, "db: Monitordb passed verification successfully for "
	      "%li recs.", count );
	// DONE
	g_threads.enableThreads();
	return true;
}
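// Note (added): every per-rdb verify() in this section follows the same
// pattern, sketched here once for reference using names already present in
// the code:
//
//	// 1. read a sample list (tree + files) for the collection via Msg5
//	// 2. for each key k in the list:
//	//        if ( getShardNum ( rdbId , &k ) == getMyShardNum() ) got++;
//	// 3. if got != count, the data on disk does not belong to this shard;
//	//    either bail out (return g_conf.m_bypassValidation) or, as the
//	//    newer Titledb::verify() does, warn and let autoscale handle it.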
// . THIS Msg0 class must be alloc'd, i.e. not on the stack, etc.
// . if list is stored locally this tries to get it locally
// . otherwise tries to get the list from the network
// . returns false if blocked, true otherwise
// . sets g_errno on error
// . NOTE: i was having problems with queries being cached too long; the
//   cache here is a NETWORK cache, so when the machine that owns the list
//   updates it on disk it can't flush our cache... so use a small
//   maxCacheAge of, say, 30 seconds or so
bool Msg0::getList ( int64_t hostId      , // host to ask (-1 if none)
		     int32_t      ip          , // info on hostId
		     int16_t     port        ,
		     int32_t      maxCacheAge , // max cached age in seconds
		     bool      addToCache  , // add net recv'd list to cache?
		     char      rdbId       , // specifies the rdb
		     collnum_t collnum ,
		     RdbList  *list        ,
		     const char     *startKey    ,
		     const char     *endKey      ,
		     int32_t      minRecSizes ,  // use -1 for no max
		     void     *state       ,
		     void    (* callback)(void *state ),//, RdbList *list ) ,
		     int32_t      niceness    ,
		     bool      doErrorCorrection ,
		     bool      includeTree ,
		     bool      doMerge     ,
		     int32_t      firstHostId   ,
		     int32_t      startFileNum  ,
		     int32_t      numFiles      ,
		     int64_t      timeout       ,
		     int64_t syncPoint     ,
		     int32_t      preferLocalReads ,
		     Msg5     *msg5             ,
		     bool      isRealMerge      ,
		     bool      allowPageCache    ,
		     bool      forceLocalIndexdb ,
		     bool      noSplit ,
		     int32_t      forceParitySplit  ) {
	logTrace( g_conf.m_logTraceMsg0, "BEGIN. hostId: %" PRId64", rdbId: %d", hostId, (int)rdbId );

	// warning
	if ( collnum < 0 ) log(LOG_LOGIC,"net: NULL collection. msg0.");

	// reset the list they passed us
	list->reset();
	// get keySize of rdb
	m_ks = getKeySizeFromRdbId ( rdbId );
	
//	if( g_conf.m_logTraceMsg0 ) 
//	{
//		log("%s:%s:%d: rdbId. [%d]", __FILE__,__func__,__LINE__, (int)rdbId);
//		log("%s:%s:%d: m_ks.. [%d]", __FILE__,__func__,__LINE__, (int)m_ks);
//		log("%s:%s:%d: hostId [%" PRId64"]", __FILE__,__func__,__LINE__, hostId);
//	}

	// if startKey > endKey, don't read anything
	//if ( startKey > endKey ) return true;
	if ( KEYCMP(startKey,endKey,m_ks)>0 ) { char *xx=NULL;*xx=0; } // was: return true
	// . reset hostid if it is dead
	// . this is causing UOR queries to take forever when we have a dead host
	if ( hostId >= 0 && g_hostdb.isDead ( hostId ) ) hostId = -1;
	// no longer accept negative minrecsize
	if ( minRecSizes < 0 ) {
		g_errno = EBADENGINEER;
		logTrace( g_conf.m_logTraceMsg0, "END" );

		log(LOG_LOGIC, "net: msg0: Negative minRecSizes no longer supported.");
		char *xx=NULL;*xx=0;
	}

	// remember these
	m_state         = state;
	m_callback      = callback;
	m_list          = list;
	m_hostId        = hostId;
	m_niceness      = niceness;
	m_addToCache    = addToCache;
	// . these define our request 100%
	KEYSET(m_startKey,startKey,m_ks);
	KEYSET(m_endKey,endKey,m_ks);
	m_minRecSizes   = minRecSizes;
	m_rdbId         = rdbId;
	m_collnum = collnum;//          = coll;
	m_isRealMerge   = isRealMerge;
	m_allowPageCache = allowPageCache;

	// . group to ask is based on the first key 
	// . we only do 1 group per call right now
	// . groupMask must turn on higher bits first (count downwards kinda)
	// . titledb and spiderdb use special masks to get groupId

	// if diffbot.cpp is reading spiderdb from each shard we have to
	// get groupid from hostid here lest we core in getGroupId() below.
	// it does that for dumping spiderdb to the client browser. they
	// can download the whole enchilada.
	if ( hostId >= 0 && m_rdbId == RDB_SPIDERDB )
		m_shardNum = 0;
	// did they force it? core until i figure out what this is
	else if ( forceParitySplit >= 0 ) 
		//m_groupId =  g_hostdb.getGroupId ( forceParitySplit );
		m_shardNum = forceParitySplit;
	else
		//m_groupId = getGroupId ( m_rdbId , startKey , ! noSplit );
		m_shardNum = getShardNum ( m_rdbId , startKey );

	// if we are looking up a termlist in posdb that is split by termid and
	// not the usual docid, then we have to compute the shard from the
	// termid in the posdb key instead
	if ( noSplit && m_rdbId == RDB_POSDB )
		m_shardNum = g_hostdb.getShardNumByTermId ( startKey );

	// how is this used?
	if ( forceLocalIndexdb ) m_shardNum = getMyShardNum();


//	if( g_conf.m_logTraceMsg0 ) log("%s:%s:%d: shardNum [%" PRId32"]", __FILE__,__func__, __LINE__, m_shardNum);


	// . store these parameters
	// . get a handle to the rdb in case we can satisfy locally
	// . returns NULL and sets g_errno on error
	QUICKPOLL((m_niceness));
	Rdb *rdb = getRdbFromId ( m_rdbId );
	if ( ! rdb ) return true;
	// we need the fixedDataSize
	m_fixedDataSize = rdb->getFixedDataSize();
	m_useHalfKeys   = rdb->useHalfKeys();
	// . debug msg
	// . Msg2 does this when checking for a cached compound list.
	//   compound lists do not actually exist, they are merges of smaller
	//   UOR'd lists.
	if ( maxCacheAge != 0 && ! addToCache && (numFiles > 0 || includeTree)) {
		log( LOG_LOGIC, "net: msg0: Weird. check but don't add... rdbid=%" PRId32".", ( int32_t ) m_rdbId );
	}

	// set this here since we may not call msg5 if list not local
	//m_list->setFixedDataSize ( m_fixedDataSize );

	// . now that we do load balancing we don't want to do a disk lookup
	//   even if local if we are merging or dumping
	// . UNLESS g_conf.m_preferLocalReads is true
	if ( preferLocalReads == -1 ) 
		preferLocalReads = g_conf.m_preferLocalReads;

	// . always prefer local for full split clusterdb
	// . and keep the tfndb/titledb lookups in the same stripe
	// . so basically we can't do biased caches if fully split
	//if ( g_conf.m_fullSplit ) preferLocalReads = true;
	preferLocalReads = true;

	// is it stored locally?
	bool isLocal = ( m_hostId == -1 && //g_hostdb.m_groupId == m_groupId );
			 m_shardNum == getMyShardNum() );
	// only do local lookups if this is true
	if ( ! preferLocalReads ) isLocal = false;

	/*
	int64_t singleDocIdQuery = 0LL;
	if ( rdbId == RDB_POSDB ) {
		int64_t d1 = g_posdb.getDocId(m_startKey);
		int64_t d2 = g_posdb.getDocId(m_endKey);
		if ( d1+1 == d2 ) singleDocIdQuery = d1;
	}

	// . try the LOCAL termlist cache
	// . so when msg2 is evaluating a gbdocid:| query and it has to
	//   use msg0 to go across the network to get the same damn termlist
	//   over and over again for the same docid, this will help alot.
	// . ideally it'd be nice if the seo pipe in xmldoc.cpp can try to
	//   send the same gbdocid:xxxx docids to the same hosts. maybe hash
	//   based on docid into the list of hosts and if that host is busy
	//   just chain until we find someone not busy.
	if ( singleDocIdQuery &&
	     getListFromTermListCache ( coll,
					m_startKey,
					m_endKey,
					maxCacheAge,
					list ) )
		// found!
		return true;
	*/

	// but always local if only one host
	if ( g_hostdb.getNumHosts() == 1 ) isLocal = true;

	// . if the group is local then do it locally
	// . Msg5::getList() returns false if blocked, true otherwise
	// . Msg5::getList() sets g_errno on error
	// . don't do this if m_hostId was specified
	if ( isLocal ) {
		logTrace( g_conf.m_logTraceMsg0, "isLocal" );

		if ( msg5 ) {
			m_msg5 = msg5;
			m_deleteMsg5 = false;
		}
		else {
			try { m_msg5 = new ( Msg5 ); } 
			catch ( ... ) {
				g_errno = ENOMEM;
				log("net: Local alloc for disk read failed "
				    "while tring to read data for %s. "
				    "Trying remote request.",
				    getDbnameFromId(m_rdbId));
				goto skip;
			}
			mnew ( m_msg5 , sizeof(Msg5) , "Msg0::Msg5" );
			m_deleteMsg5 = true;
		}

		QUICKPOLL(m_niceness);
		if ( ! m_msg5->getList ( rdbId,
					 m_collnum ,
					 m_list ,
					 m_startKey ,
					 m_endKey   ,
					 m_minRecSizes ,
					 includeTree   , // include Tree?
					 addToCache    , // addToCache?
					 maxCacheAge   ,
					 startFileNum  , 
					 numFiles      ,
					 this ,
					 gotListWrapper2   ,
					 niceness          ,
					 doErrorCorrection ,
					 NULL , // cacheKeyPtr
					 0    , // retryNum
					 -1   , // maxRetries
					 true , // compensateForMerge
					 syncPoint ,
					 m_isRealMerge ,
					 m_allowPageCache ) ) {
			logTrace( g_conf.m_logTraceMsg0, "END, return false" );
			return false;
		}

		// nuke it
		reset();
		logTrace( g_conf.m_logTraceMsg0, "END, return true" );
		return true;
	}
skip:
	// debug msg
	if ( g_conf.m_logDebugQuery )
		log(LOG_DEBUG,"net: msg0: Sending request for data to "
		    "shard=%" PRIu32" "
		    "listPtr=%" PTRFMT" minRecSizes=%" PRId32" termId=%" PRIu64" "
		    //"startKey.n1=%" PRIx32",n0=%" PRIx64" (niceness=%" PRId32")",
		    "startKey.n1=%" PRIx64",n0=%" PRIx64" (niceness=%" PRId32")",
		    //g_hostdb.makeHostId ( m_groupId ) ,
		    m_shardNum,
		    (PTRTYPE)m_list,
		    m_minRecSizes, g_posdb.getTermId(m_startKey) , 
		    //m_startKey.n1,m_startKey.n0 , (int32_t)m_niceness);
		    KEY1(m_startKey,m_ks),KEY0(m_startKey),
		    (int32_t)m_niceness);

	char *replyBuf = NULL;
	int32_t  replyBufMaxSize = 0;
	bool  freeReply = true;

	// . make a request with the info above (note: not in network order)
	// . IMPORTANT!!!!! if you change this change 
	//   Multicast.cpp::sleepWrapper1 too!!!!!!!!!!!!
	//   no, not anymore, we commented out that request peeking code
	char *p = m_request;
	*(int64_t *) p = syncPoint        ; p += 8;
	//*(key_t     *) p = m_startKey       ; p += sizeof(key_t);
	//*(key_t     *) p = m_endKey         ; p += sizeof(key_t);
	*(int32_t      *) p = m_minRecSizes    ; p += 4;
	*(int32_t      *) p = startFileNum     ; p += 4;
	*(int32_t      *) p = numFiles         ; p += 4;
	*(int32_t      *) p = maxCacheAge      ; p += 4;
	if ( p - m_request != RDBIDOFFSET ) { char *xx=NULL;*xx=0; }
	*p               = m_rdbId          ; p++;
	*p               = addToCache       ; p++;
	*p               = doErrorCorrection; p++;
	*p               = includeTree      ; p++;
	*p               = (char)niceness   ; p++;
	*p               = (char)m_allowPageCache; p++;
	KEYSET(p,m_startKey,m_ks);          ; p+=m_ks;
	KEYSET(p,m_endKey,m_ks);            ; p+=m_ks;
	// NULL terminated collection name
	//strcpy ( p , coll ); p += gbstrlen ( coll ); *p++ = '\0';
	*(collnum_t *)p = m_collnum; p += sizeof(collnum_t);
	m_requestSize    = p - m_request;
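	// request layout recap (added comment; mirrors the writes above):
	//   bytes  0-7  : syncPoint        (int64_t)
	//   bytes  8-11 : m_minRecSizes    (int32_t)
	//   bytes 12-15 : startFileNum     (int32_t)
	//   bytes 16-19 : numFiles         (int32_t)
	//   bytes 20-23 : maxCacheAge      (int32_t)
	//   byte  24    : m_rdbId          (this is RDBIDOFFSET)
	//   bytes 25-29 : addToCache, doErrorCorrection, includeTree,
	//                 niceness, allowPageCache  (one byte each)
	//   next m_ks   : m_startKey
	//   next m_ks   : m_endKey
	//   last        : m_collnum        (collnum_t)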
	// ask an individual host for this list if hostId is NOT -1
	if ( m_hostId != -1 ) {
		// get Host
		Host *h = g_hostdb.getHost ( m_hostId );
		if ( ! h ) { 
			g_errno = EBADHOSTID; 
			log(LOG_LOGIC,"net: msg0: Bad hostId of %" PRId64".", m_hostId);
			logTrace( g_conf.m_logTraceMsg0, "END, return true. Bad hostId" );
			return true;
		}
		
		// if niceness is 0, use the higher priority udpServer
		UdpServer *us ;
		uint16_t port;
		QUICKPOLL(m_niceness);

		us = &g_udpServer ; port = h->m_port ; 
		// . returns false on error and sets g_errno, true otherwise
		// . calls callback when reply is received (or error)
		// . we return true if it returns false
		if ( ! us->sendRequest ( m_request     ,
					 m_requestSize ,
					 0x00          , // msgType
					 h->m_ip       ,
					 port          ,
					 m_hostId      ,
					 NULL          , // the slotPtr
					 this          ,
					 gotSingleReplyWrapper ,
					 timeout       ,
					 -1            , // backoff
					 -1            , // maxwait
					 replyBuf      ,
					 replyBufMaxSize ,
					 m_niceness     ) ) { // cback niceness
			logTrace( g_conf.m_logTraceMsg0, "END, return true. Request sent" );
			return true;
		}
		
		// return false cuz it blocked
		logTrace( g_conf.m_logTraceMsg0, "END, return false. sendRequest blocked" );
		return false;
	}
	// timing debug
	if ( g_conf.m_logTimingNet )
		m_startTime = gettimeofdayInMilliseconds();
	else
		m_startTime = 0;

	// . get the top int32_t of the key
	// . i guess this will work for 128 bit keys... hmmmmm
	int32_t keyTop = hash32 ( (char *)startKey , m_ks );

	// . otherwise, multicast to a host in group "groupId"
	// . returns false and sets g_errno on error
	// . calls callback on completion
	// . select first host to send to in group based on upper 32 bits
	//   of termId (m_startKey.n1)
	// . need to send out to all the indexdb split hosts
	m_numRequests = 0;
	m_numReplies  = 0;
	//for ( int32_t i = 0; i < m_numSplit; i++ ) {

	QUICKPOLL(m_niceness);
	//int32_t gr;
	char *buf;
	buf = replyBuf;

	// get the multicast
	Multicast *m = &m_mcast;

        if ( ! m->send ( m_request    , 
			      m_requestSize,
			      0x00         , // msgType 0x00
			      false        , // does multicast own request?
			 m_shardNum ,
			      false        , // send to whole group?
			      //m_startKey.n1, // key is passed on startKey
			      keyTop       , // key is passed on startKey
			      this         , // state data
			      NULL         , // state data
			      gotMulticastReplyWrapper0 ,
			      timeout*1000 , // timeout
			      niceness     ,
			      firstHostId  ,
			      buf             ,
			      replyBufMaxSize ,
			      freeReply       , // free reply buf?
			      true            , // do disk load balancing?
			      maxCacheAge     ,
			      //(key_t *)cacheKey        ,
			      // multicast uses it for determining the best
			      // host to send the request to when doing 
			      // disk load balancing. if the host has our 
			      // data cached, then it will probably get to
			      // handle the request. for now let's just assume
			      // this is a 96-bit key. TODO: fix...
			 0 , // *(key_t *)cacheKey        ,
			      rdbId           ,
			      minRecSizes     ) ) 
	{
		log(LOG_ERROR, "net: Failed to send request for data from %s in shard "
		    "#%" PRIu32" over network: %s.",
		    getDbnameFromId(m_rdbId),m_shardNum, mstrerror(g_errno));
		// but speed it up
		m_errno = g_errno;
		m->reset();
		if ( m_numRequests > 0 ) {
			logTrace( g_conf.m_logTraceMsg0, "END - returning false" );
			
			return false;
		}

		logTrace( g_conf.m_logTraceMsg0, "END - returning true" );
		return true;
	}

	m_numRequests++;

	// we blocked
	logTrace( g_conf.m_logTraceMsg0, "END - returning false, blocked" );
	return false;
}
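// Illustrative caller sketch (added; names are hypothetical): Msg0::getList()
// uses the usual blocking convention in this codebase -- it returns false if
// it blocked and will invoke the callback later, and returns true if it
// finished (or failed) immediately, with g_errno set on error.
//
//	// static void gotListWrapper ( void *state ) { /* use msg0->m_list */ }
//	//
//	// if ( ! msg0->getList ( -1 /*hostId*/ , 0 /*ip*/ , 0 /*port*/ ,
//	//                        30 /*maxCacheAge secs*/ , false /*addToCache*/ ,
//	//                        rdbId , collnum , &list ,
//	//                        startKey , endKey , minRecSizes ,
//	//                        state , gotListWrapper , niceness , ... ) )
//	//	return false; // blocked, gotListWrapper will be called
//	// if ( g_errno ) { /* handle error */ }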
/*
bool Titledb::addColl ( char *coll, bool doVerify ) {
	if ( ! m_rdb.addColl ( coll ) ) return false;
	if ( ! doVerify ) return true;
	// verify
	if ( verify(coll) ) return true;
	// if not allowing scale, return false
	if ( ! g_conf.m_allowScale ) return false;
	// otherwise let it go
	log ( "db: Verify failed, but scaling is allowed, passing." );
	return true;
}
*/
bool Titledb::verify(const char *coll) {
	log ( LOG_DEBUG, "db: Verifying Titledb for coll %s...", coll );

	Msg5 msg5;
	RdbList list;
	key96_t startKey;
	key96_t endKey;
	startKey.setMin();
	endKey.setMax();
	//int32_t minRecSizes = 64000;
	const CollectionRec *cr = g_collectiondb.getRec(coll);

	if ( ! msg5.getList ( RDB_TITLEDB   ,
			      cr->m_collnum       ,
			      &list         ,
			      startKey      ,
			      endKey        ,
			      1024*1024     , // minRecSizes   ,
			      true          , // includeTree   ,
			      0             , // max cache age
			      0             , // startFileNum  ,
			      -1            , // numFiles      ,
			      NULL          , // state
			      NULL          , // callback
			      0             , // niceness
			      false         , // err correction?
			      NULL          , // cache key ptr
			      0             , // retry num
			      -1            , // maxRetries
			      -1LL          , // sync point
			      false         , // isRealMerge
			      true))          // allowPageCache
	{
		log(LOG_DEBUG, "db: HEY! it did not block");
		return false;
	}

	int32_t count = 0;
	int32_t got   = 0;
	for ( list.resetListPtr() ; ! list.isExhausted() ;
	      list.skipCurrentRecord() ) {
		key96_t k = list.getCurrentKey();
		// skip negative keys
		if ( (k.n0 & 0x01) == 0x00 ) continue;
		count++;
		//uint32_t groupId = getGroupId ( RDB_TITLEDB , &k );
		//if ( groupId == g_hostdb.m_groupId ) got++;
		uint32_t shardNum = getShardNum ( RDB_TITLEDB, &k );
		if ( shardNum == getMyShardNum() ) got++;
	}
	if ( got != count ) {
		// tally it up
		g_rebalance.m_numForeignRecs += count - got;
		log ("db: Out of first %" PRId32" records in titledb, "
		     "only %" PRId32" belong to our shard. c=%s",count,got,coll);
		// exit if NONE, we probably got the wrong data
		if ( count > 10 && got == 0 ) 
			log("db: Are you sure you have the right "
				   "data in the right directory? "
			    "coll=%s "
			    "Exiting.",
			    coll);
		// repeat with log
		for ( list.resetListPtr() ; ! list.isExhausted() ;
		      list.skipCurrentRecord() ) {
			key96_t k = list.getCurrentKey();
			//uint32_t groupId = getGroupId ( RDB_TITLEDB,&k);
			//int32_t groupNum = g_hostdb.getGroupNum(groupId);
			int32_t shardNum = getShardNum ( RDB_TITLEDB, &k );
			log("db: docid=%" PRId64" shard=%" PRId32,
			    getDocId(&k),shardNum);
		}
		//if ( g_conf.m_bypassValidation ) return true;
		//if ( g_conf.m_allowScale ) return true;
		// don't exit any more, allow it, but do not delete
		// recs that belong to different shards when we merge now!
		log ( "db: db shards unbalanced. "
		      "Click autoscale in master controls.");
		//return false;
		return true;
	}

	log ( LOG_DEBUG, "db: Titledb passed verification successfully for %" PRId32
			" recs.", count );
	// DONE
	return true;
}
// . returns false if blocked, true otherwise
// . sets g_errno on error
// . "termIds/termFreqs" should NOT be on the stack in case we block
// . i based this on ../titledb/Msg23.cpp 
bool Msg36::getTermFreq ( char      *coll       ,
			  long       maxAge     ,
			  long long  termId     ,
			  void      *state      ,
			  void (* callback)(void *state ) ,
			  long       niceness   ,
			  bool       exactCount ,
			  bool       incCount   ,
			  bool       decCount   ,
			  bool       isSplit) {
	// sanity check
	if ( termId == 0LL ) { 
		g_errno = EBADENGINEER;
		log("quota: msg36: termid is 0.");
		return true;
	}
	// warning
	if ( ! coll ) log(LOG_LOGIC,"quota: msg36: NULL collection.");
	// no more quotas here!
	if ( incCount || decCount ) { char *xx = NULL; *xx = 0; }
	// sanity check
	//if ( incCount && ! exactCount ) { char *xx = NULL; *xx = 0; }
	//if ( decCount && ! exactCount ) { char *xx = NULL; *xx = 0; }
	// sanity check
	//if ( incCount && isSplit ) { char *xx = NULL; *xx = 0; }
	//if ( decCount && isSplit ) { char *xx = NULL; *xx = 0; }
	// cannot call handler asynchronously when doing exact counts...
	//if ( exactCount ) niceness = MAX_NICENESS;
	// keep a pointer for the caller
	m_state    = state;
	m_callback = callback;
	m_termFreq = 0LL;
	m_niceness = niceness;

	m_errno    = 0LL;
	m_isSplit = isSplit;
	// TODO: have a local by-pass for speed
	// if we have this termlist local then we can skip the network stuff
	//if ( g_indexdb.isLocal ( termId ) ) { return getTermFreqLocally(); }

	// make a key from our termId, and if docId is provided, that too.
	key144_t key ;
	g_posdb.makeStartKey ( &key, termId , 0LL );
	// . now what group do we belong to?
	// . groupMask has hi bits set before it sets low bits
	//unsigned long groupId = key.n1 & g_hostdb.m_groupMask;
	//unsigned long groupId;
	/*
	if ( g_hostdb.m_indexSplits > 1 ) 
		groupId = g_indexdb.getBaseGroupId(&key);
	else
		groupId = g_indexdb.getGroupIdFromKey(&key);
	*/
	//groupId = g_indexdb.getNoSplitGroupId(&key);
	uint32_t shardNum = getShardNum ( RDB_POSDB , &key );
	
	log(LOG_DEBUG,"quota: msg36 termid=%lli inc=%li dec=%li "
	    "sending to shard=%li\n",termId,(long)incCount,(long)decCount,
	    (long)shardNum);

		//unsigned long groupId = g_indexdb.getBaseGroupId(&key);
	                                    //getGroupIdFromKey ( &key );
	// . what is the ideal hostId based on this key?
	// . this is what multicast does to determine the 1st host to send to
	//if ( groupId == g_hostdb.m_groupId &&
	bool local = true;
	if ( g_hostdb.m_indexSplits != 1   ) local = false;
	if ( shardNum != getMyShardNum()   ) local = false;
	//if ( g_conf.m_fullSplit            ) local = true;
	local = true;
	if ( exactCount                    ) local = false;
	//if ( g_hostdb.m_indexSplits == 1 &&
	//     groupId == g_hostdb.m_groupId &&
	//     //!g_conf.m_interfaceMachine &&
	//    !exactCount ) {
	if ( local ) {
		//long numHosts;
		//Host *hosts = g_hostdb.getGroup(g_hostdb.m_groupId,&numHosts);
		//unsigned long i = ((unsigned long)groupId/*key*/) % numHosts;
		// if it's us then no need to multicast to ourselves
		//if(hosts[i].m_hostId==g_hostdb.m_hostId||g_conf.m_fullSplit) {
		m_termFreq = g_posdb.getTermFreq ( coll , termId );
		// clear g_errno
		g_errno = 0;
		return true;
	}

	// . make a request
	// . just send the termId and collection name
	char *p = m_request;
	*p = 0;
	// exact flag
	if ( exactCount ) *p |= 0x01;
	//if ( incCount   ) *p |= 0x02;
	//if ( decCount   ) *p |= 0x04;
	if ( m_niceness ) *p |= 0x08;
	p++;
	*(long long *)p = termId ; p += sizeof(long long);
	strcpy ( p , coll ); p += gbstrlen(coll) + 1; // copy includes \0
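	// request layout recap (added comment; mirrors the writes above):
	//   byte  0    : flag byte -- 0x01 = exactCount, 0x08 = niceness
	//   bytes 1-8  : termId (long long)
	//   rest       : NUL-terminated collection name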

	long timeout = 5;
	//if ( incCount || decCount ) timeout = 9999999;
	if ( exactCount           ) timeout = 9999999;

	// . need to send out to all the indexdb split hosts
	m_numRequests = 0;
	m_numReplies  = 0;
	bool blocked  = false;
	// just do one host and multiply his count by the split
	// for now to increase performance
	bool semiExact = true;
	if(!m_isSplit) semiExact = false;
	// send a request for every split
	for ( long i = 0; i < g_hostdb.m_indexSplits; i++ ) {
		long  gr;
		char *buf;
		// semiExact overrides all
		if ( semiExact && g_hostdb.m_indexSplits > 1 ) {
			long nn = (unsigned long)termId % 
				g_hostdb.m_indexSplits;
			// sanity check
			if ( nn < 0 || nn >= g_hostdb.m_indexSplits ) {
				char *xx = NULL; *xx = 0; }
			//gr = g_indexdb.getSplitGroupId ( groupId , nn);
			// need to select the first buffer
			buf = &m_reply[i*8];
			// do not use this!
			char *xx=NULL;*xx=0;
		}			
		else if ( g_hostdb.m_indexSplits > 1 && m_isSplit) {
			//gr  = g_indexdb.getSplitGroupId ( groupId, i );
			buf = &m_reply[i*8];
			// do not use this!
			char *xx=NULL;*xx=0;
		}
		else {
			gr  = shardNum;  //this is just the baseGroupId
			buf = m_reply;
		}
		// in case it fails somehow
		*(long long *)buf = 0LL;

		// .  multicast to a host in group
		// . returns false and sets g_errno on error
		if ( ! m_mcast[i].
		     send ( m_request    , 
			    p - m_request, // request size
			    0x36         , // msgType 0x36
			    false        , // multicast owns msg?
			    gr           , // shard num
			    false        , // send to whole group?
			    termId       , // key is termId
			    this         , // state data
			    NULL         , // state data
			    gotReplyWrapper36 ,
			    timeout,
			    //5               , // 5 second timeout
			    m_niceness      ,
			    false           , // realtime?
			    -1              , // first hostid
			    buf             ,
			    8               ,
			    false           ) ) { // free reply buf?
			log("quota: msg36: sending mcast had error: %s",
			    mstrerror(g_errno));
			//return true;
		}
		else {
			m_numRequests++;
			blocked = true;
		}
		// only launch (attempt to launch) one request if semiExact
		if ( semiExact ) break;
		// if we are not split, only one host has the termlist
		if ( ! m_isSplit ) break;
		// no inefficient looping! let's nuke this mcast array
		char *xx = NULL; *xx = 0;
	}
	// we blocked on the multicast
	if ( blocked ) return false;
	return true;
}
bool Clusterdb::verify ( char *coll ) {
	log ( LOG_DEBUG, "db: Verifying Clusterdb for coll %s...", coll );
	g_jobScheduler.disallow_new_jobs();

	Msg5 msg5;
	RdbList list;
	key_t startKey;
	key_t endKey;
	startKey.setMin();
	endKey.setMax();
	//int32_t minRecSizes = 64000;
	CollectionRec *cr = g_collectiondb.getRec(coll);
	
	if ( ! msg5.getList ( RDB_CLUSTERDB ,
			      cr->m_collnum          ,
			      &list         ,
			      startKey      ,
			      endKey        ,
			      64000         , // minRecSizes   ,
			      true          , // includeTree   ,
			      false         , // add to cache?
			      0             , // max cache age
			      0             , // startFileNum  ,
			      -1            , // numFiles      ,
			      NULL          , // state
			      NULL          , // callback
			      0             , // niceness
			      false         , // err correction?
			      NULL          ,
			      0             ,
			      -1            ,
			      true          ,
			      -1LL          ,
			      true          )) {
		g_jobScheduler.allow_new_jobs();
		log("db: HEY! it did not block");
		return false;
	}

	int32_t count = 0;
	int32_t got   = 0;
	for ( list.resetListPtr() ; ! list.isExhausted() ;
	      list.skipCurrentRecord() ) {
		key_t k = list.getCurrentKey();
		// skip negative keys
		if ( (k.n0 & 0x01) == 0x00 ) continue;
		count++;
		//uint32_t groupId = getGroupId ( RDB_CLUSTERDB , &k );
		//if ( groupId == g_hostdb.m_groupId ) got++;
		uint32_t shardNum = getShardNum( RDB_CLUSTERDB , &k );
		if ( shardNum == getMyShardNum() ) got++;
	}
	if ( got != count ) {
		// tally it up
		g_rebalance.m_numForeignRecs += count - got;
		log ("db: Out of first %" PRId32" records in clusterdb, "
		     "only %" PRId32" belong to our group.",count,got);
		// exit if NONE, we probably got the wrong data
		if ( got == 0 ) log("db: Are you sure you have the "
					   "right "
					   "data in the right directory? "
					   "Exiting.");
		log ( "db: Exiting due to Clusterdb inconsistency." );
		g_jobScheduler.allow_new_jobs();
		return g_conf.m_bypassValidation;
	}
	log ( LOG_DEBUG, "db: Clusterdb passed verification successfully for "
			"%" PRId32" recs.", count );
	// DONE
	g_jobScheduler.allow_new_jobs();
	return true;
}
bool Syncdb::verify ( char *coll ) {
	log ( LOG_INFO, "db: Verifying Syncdb for coll %s...", coll );
	g_threads.disableThreads();

	Msg5 msg5;
	Msg5 msg5b;
	RdbList list;
	key_t startKey;
	key_t endKey;
	startKey.setMin();
	endKey.setMax();
	CollectionRec *cr = g_collectiondb.getRec(coll);
	
	if ( ! msg5.getList ( RDB_SYNCDB    ,
			      cr->m_collnum          ,
			      &list         ,
			      startKey      ,
			      endKey        ,
			      64000         , // minRecSizes   ,
			      true          , // includeTree   ,
			      false         , // add to cache?
			      0             , // max cache age
			      0             , // startFileNum  ,
			      -1            , // numFiles      ,
			      NULL          , // state
			      NULL          , // callback
			      0             , // niceness
			      false         , // err correction?
			      NULL          ,
			      0             ,
			      -1            ,
			      true          ,
			      -1LL          ,
			      &msg5b        ,
			      true          )) {
		g_threads.enableThreads();
		return log("db: HEY! it did not block");
	}

	long count = 0;
	long got   = 0;
	for ( list.resetListPtr() ; ! list.isExhausted() ;
	      list.skipCurrentRecord() ) {
		key_t k = list.getCurrentKey();
		count++;
		//unsigned long groupId = getGroupId ( RDB_SYNCDB , &k );
		//if ( groupId == g_hostdb.m_groupId ) got++;
		uint32_t shardNum = getShardNum ( RDB_SYNCDB , (char *)&k );
		if ( shardNum == getMyShardNum() ) got++;
	}
	if ( got != count ) {
		log ("db: Out of first %li records in syncdb, "
		     "only %li belong to our group.",count,got);
		// exit if NONE, we probably got the wrong data
		if ( got == 0 ) log("db: Are you sure you have the "
					   "right "
					   "data in the right directory? "
					   "Exiting.");
		log ( "db: Exiting due to Syncdb inconsistency." );
		g_threads.enableThreads();
		return g_conf.m_bypassValidation;
	}
	log ( LOG_INFO, "db: Syncdb passed verification successfully for "
			"%li recs.", count );
	// DONE
	g_threads.enableThreads();
	return true;
}
// . THIS Msg0 class must be alloc'd, i.e. not on the stack, etc.
// . if list is stored locally this tries to get it locally
// . otherwise tries to get the list from the network
// . returns false if blocked, true otherwise
// . sets g_errno on error
// . NOTE: i was having problems with queries being cached too long; the
//   cache here is a NETWORK cache, so when the machine that owns the list
//   updates it on disk it can't flush our cache... so use a small
//   maxCacheAge of, say, 30 seconds or so
bool Msg0::getList ( int64_t hostId      , // host to ask (-1 if none)
		     int32_t      ip          , // info on hostId
		     int16_t     port        ,
		     int32_t      maxCacheAge , // max cached age in seconds
		     bool      addToCache  , // add net recv'd list to cache?
		     char      rdbId       , // specifies the rdb
		     //char     *coll        ,
		     collnum_t collnum ,
		     RdbList  *list        ,
		     //key_t     startKey    , 
		     //key_t     endKey      , 
		     char     *startKey    ,
		     char     *endKey      ,
		     int32_t      minRecSizes ,  // use -1 for no max
		     void     *state       ,
		     void    (* callback)(void *state ),//, RdbList *list ) ,
		     int32_t      niceness    ,
		     bool      doErrorCorrection ,
		     bool      includeTree ,
		     bool      doMerge     ,
		     int32_t      firstHostId   ,
		     int32_t      startFileNum  ,
		     int32_t      numFiles      ,
		     int32_t      timeout       ,
		     int64_t syncPoint     ,
		     int32_t      preferLocalReads ,
		     Msg5     *msg5             ,
		     Msg5     *msg5b            ,
		     bool      isRealMerge      ,
//#ifdef SPLIT_INDEXDB
		     bool      allowPageCache    ,
		     bool      forceLocalIndexdb ,
		     bool      noSplit , // doIndexdbSplit    ,
		     int32_t      forceParitySplit  ) {
//#else
//		     bool      allowPageCache ) {
//#endif
	// this is obsolete! mostly, but we need it for PageIndexdb.cpp to 
	// show a "termlist" for a given query term in its entirety so you 
	// don't have to check each machine in the network. if this is true it
	// means to query each split and merge the results together into a
	// single unified termlist. only applies to indexdb/datedb.
	//if ( doIndexdbSplit ) { char *xx = NULL; *xx = 0; }
	// note this because if caller is wrong it hurts performance major!!
	//if ( doIndexdbSplit ) 
	//	logf(LOG_DEBUG,"net: doing msg0 with indexdb split true");
	// warning
	if ( collnum < 0 ) log(LOG_LOGIC,"net: NULL collection. msg0.");

	//if ( doIndexdbSplit ) { char *xx=NULL;*xx=0; }

	// reset the list they passed us
	list->reset();
	// get keySize of rdb
	m_ks = getKeySizeFromRdbId ( rdbId );
	// if startKey > endKey, don't read anything
	//if ( startKey > endKey ) return true;
	if ( KEYCMP(startKey,endKey,m_ks)>0 ) { char *xx=NULL;*xx=0; } // was: return true
	// . reset hostid if it is dead
	// . this is causing UOR queries to take forever when we have a dead host
	if ( hostId >= 0 && g_hostdb.isDead ( hostId ) ) hostId = -1;
	// no longer accept negative minrecsize
	if ( minRecSizes < 0 ) {
		g_errno = EBADENGINEER;
		log(LOG_LOGIC,
		    "net: msg0: Negative minRecSizes no longer supported.");
		char *xx=NULL;*xx=0;
		return true;
	}

	// debug msg
	//if ( niceness != 0 ) log("HEY start");
	// ensure startKey last bit clear, endKey last bit set
	//if ( (startKey.n0 & 0x01) == 0x01 ) 
	//	log("Msg0::getList: warning startKey lastbit set"); 
	//if ( (endKey.n0   & 0x01) == 0x00 ) 
	//	log("Msg0::getList: warning endKey lastbit clear"); 
	// remember these
	m_state         = state;
	m_callback      = callback;
	m_list          = list;
	m_hostId        = hostId;
	m_niceness      = niceness;
	//m_ip            = ip;
	//m_port          = port;
	m_addToCache    = addToCache;
	// . these define our request 100%
	//m_startKey      = startKey;
	//m_endKey        = endKey;
	KEYSET(m_startKey,startKey,m_ks);
	KEYSET(m_endKey,endKey,m_ks);
	m_minRecSizes   = minRecSizes;
	m_rdbId         = rdbId;
	m_collnum = collnum;//          = coll;
	m_isRealMerge   = isRealMerge;
	m_allowPageCache = allowPageCache;

	// . group to ask is based on the first key 
	// . we only do 1 group per call right now
	// . groupMask must turn on higher bits first (count downwards kinda)
	// . titledb and spiderdb use special masks to get groupId

	// if diffbot.cpp is reading spiderdb from each shard we have to
	// get groupid from hostid here lest we core in getGroupId() below.
	// it does that for dumping spiderdb to the client browser. they
	// can download the whole enchilada.
	if ( hostId >= 0 && m_rdbId == RDB_SPIDERDB )
		m_shardNum = 0;
	// did they force it? core until i figure out what this is
	else if ( forceParitySplit >= 0 ) 
		//m_groupId =  g_hostdb.getGroupId ( forceParitySplit );
		m_shardNum = forceParitySplit;
	else
		//m_groupId = getGroupId ( m_rdbId , startKey , ! noSplit );
		m_shardNum = getShardNum ( m_rdbId , startKey );

	// if we are looking up a termlist in posdb that is split by termid and
	// not the usual docid, then we have to compute the shard from the
	// termid in the posdb key instead
	if ( noSplit && m_rdbId == RDB_POSDB )
		m_shardNum = g_hostdb.getShardNumByTermId ( startKey );

	// how is this used?
	//if ( forceLocalIndexdb ) m_groupId = g_hostdb.m_groupId;
	if ( forceLocalIndexdb ) m_shardNum = getMyShardNum();

	// . store these parameters
	// . get a handle to the rdb in case we can satisfy locally
	// . returns NULL and sets g_errno on error
	QUICKPOLL((m_niceness));
	Rdb *rdb = getRdbFromId ( m_rdbId );
	if ( ! rdb ) return true;
	// we need the fixedDataSize
	m_fixedDataSize = rdb->getFixedDataSize();
	m_useHalfKeys   = rdb->useHalfKeys();
	// . debug msg
	// . Msg2 does this when checking for a cached compound list.
	//   compound lists do not actually exist, they are merges of smaller
	//   UOR'd lists.
	if ( maxCacheAge != 0 && ! addToCache && (numFiles > 0 || includeTree))
		log(LOG_LOGIC,"net: msg0: "
		    "Weird. check but don't add... rdbid=%"INT32".",(int32_t)m_rdbId);
	// set this here since we may not call msg5 if list not local
	//m_list->setFixedDataSize ( m_fixedDataSize );

	// . now that we do load balancing we don't want to do a disk lookup
	//   even if local if we are merging or dumping
	// . UNLESS g_conf.m_preferLocalReads is true
	if ( preferLocalReads == -1 ) 
		preferLocalReads = g_conf.m_preferLocalReads;

	// . always prefer local for full split clusterdb
	// . and keep the tfndb/titledb lookups in the same stripe
	// . so basically we can't do biased caches if fully split
	//if ( g_conf.m_fullSplit ) preferLocalReads = true;
	preferLocalReads = true;

	// is it stored locally?
	bool isLocal = ( m_hostId == -1 && //g_hostdb.m_groupId == m_groupId );
			 m_shardNum == getMyShardNum() );
	// only do local lookups if this is true
	if ( ! preferLocalReads ) isLocal = false;

	/*
	m_numSplit = 1;
	if ( g_hostdb.m_indexSplits > 1 &&
	     ( rdbId == RDB_POSDB || rdbId==RDB_DATEDB)&&
	     ! forceLocalIndexdb && doIndexdbSplit ) {
		isLocal  = false;
		//m_numSplit = INDEXDB_SPLIT;
		m_numSplit = g_hostdb.m_indexSplits;
		char *xx=NULL;*xx=0;
	}
	*/
	/*
	int64_t singleDocIdQuery = 0LL;
	if ( rdbId == RDB_POSDB ) {
		int64_t d1 = g_posdb.getDocId(m_startKey);
		int64_t d2 = g_posdb.getDocId(m_endKey);
		if ( d1+1 == d2 ) singleDocIdQuery = d1;
	}

	// . try the LOCAL termlist cache
	// . so when msg2 is evaluating a gbdocid:| query and it has to
	//   use msg0 to go across the network to get the same damn termlist
	//   over and over again for the same docid, this will help alot.
	// . ideally it'd be nice if the seo pipe in xmldoc.cpp can try to
	//   send the same gbdocid:xxxx docids to the same hosts. maybe hash
	//   based on docid into the list of hosts and if that host is busy
	//   just chain until we find someone not busy.
	if ( singleDocIdQuery &&
	     getListFromTermListCache ( coll,
					m_startKey,
					m_endKey,
					maxCacheAge,
					list ) )
		// found!
		return true;
	*/

	// but always local if only one host
	if ( g_hostdb.getNumHosts() == 1 ) isLocal = true;

	// force a msg0 if doing a docid restrictive query like
	// gbdocid:xxxx|<query> so we call cacheTermLists() 
	//if ( singleDocIdQuery ) isLocal = false;

	// . if the group is local then do it locally
	// . Msg5::getList() returns false if blocked, true otherwise
	// . Msg5::getList() sets g_errno on error
	// . don't do this if m_hostId was specified
	if ( isLocal ) { // && !g_conf.m_interfaceMachine ) {
		if ( msg5 ) {
			m_msg5 = msg5;
			m_deleteMsg5 = false;
		}
		else {
			try { m_msg5 = new ( Msg5 ); } 
			catch ( ... ) {
				g_errno = ENOMEM;
				log("net: Local alloc for disk read failed "
				    "while tring to read data for %s. "
				    "Trying remote request.",
				    getDbnameFromId(m_rdbId));
				goto skip;
			}
			mnew ( m_msg5 , sizeof(Msg5) , "Msg0" );
			m_deleteMsg5 = true;
		}

		QUICKPOLL(m_niceness);
		// same for msg5b
		if ( msg5b ) {
			m_msg5b = msg5b;
			m_deleteMsg5b = false;
		}
		/*
		else if ( m_rdbId == RDB_TITLEDB ) {
			try { m_msg5b = new ( Msg5 ); } 
			catch ( ... ) {
				g_errno = ENOMEM;
				log("net: Local alloc for disk read failed "
				    "while tring to read data for %s. "
				    "Trying remote request. 2.",
				    getDbnameFromId(m_rdbId));
				goto skip;
			}
			mnew ( m_msg5b , sizeof(Msg5) , "Msg0b" );
			m_deleteMsg5b = true;
		}
		*/
		QUICKPOLL(m_niceness);
		if ( ! m_msg5->getList ( rdbId,
					 m_collnum ,
					 m_list ,
					 m_startKey ,
					 m_endKey   ,
					 m_minRecSizes ,
					 includeTree   , // include Tree?
					 addToCache    , // addToCache?
					 maxCacheAge   ,
					 startFileNum  , 
					 numFiles      ,
					 this ,
					 gotListWrapper2   ,
					 niceness          ,
					 doErrorCorrection ,
					 NULL , // cacheKeyPtr
					 0    , // retryNum
					 -1   , // maxRetries
					 true , // compensateForMerge
					 syncPoint ,
					 NULL,//m_msg5b   ,
					 m_isRealMerge ,
					 m_allowPageCache ) ) return false;
		// nuke it
		reset();
		return true;
	}
skip:
	// debug msg
	if ( g_conf.m_logDebugQuery )
		log(LOG_DEBUG,"net: msg0: Sending request for data to "
		    "shard=%"UINT32" "
		    "listPtr=%"PTRFMT" minRecSizes=%"INT32" termId=%"UINT64" "
		    //"startKey.n1=%"XINT32",n0=%"XINT64" (niceness=%"INT32")",
		    "startKey.n1=%"XINT64",n0=%"XINT64" (niceness=%"INT32")",
		    //g_hostdb.makeHostId ( m_groupId ) ,
		    m_shardNum,
		    (PTRTYPE)m_list,
		    m_minRecSizes, g_posdb.getTermId(m_startKey) , 
		    //m_startKey.n1,m_startKey.n0 , (int32_t)m_niceness);
		    KEY1(m_startKey,m_ks),KEY0(m_startKey),
		    (int32_t)m_niceness);

	char *replyBuf = NULL;
	int32_t  replyBufMaxSize = 0;
	bool  freeReply = true;

	// adjust niceness for net transmission
	bool realtime = false;
	//if ( minRecSizes + 32 < TMPBUFSIZE ) realtime = true;

	// if we're niceness 0 we need to pre-allocate for reply since it
	// might be received within the asynchronous signal handler which
	// cannot call mmalloc()
	if ( realtime ) { // niceness <= 0 || netnice == 0 ) {
		// . we should not get back more than minRecSizes bytes since 
		//   we are now performing merges
		// . it should not slow things down too much since the hashing
		//   is 10 times slower than merging anyhow...
		// . CAUTION: if rdb is not fixed-datasize then this will
		//            not work for us! it can exceed m_minRecSizes.
		replyBufMaxSize = m_minRecSizes ;
		// . get a little extra to fix the error where we ask for 64 
		//   but get 72
		// . where is that coming from?
		// . when getting titleRecs we often exceed the minRecSizes 
		// . ?Msg8? was having trouble. was short 32 bytes sometimes.
		replyBufMaxSize += 36;
		// why add ten percent?
		//replyBufMaxSize *= 110 ;
		//replyBufMaxSize /= 100 ;
		// make a buffer to hold the reply
//#ifdef SPLIT_INDEXDB
/*
		if ( m_numSplit > 1 ) {
			m_replyBufSize = replyBufMaxSize * m_numSplit;
			replyBuf = (char *) mmalloc(m_replyBufSize, "Msg0");
			m_replyBuf  = replyBuf;
			freeReply = false;
		}
		else
*/
//#endif
			replyBuf = (char *) mmalloc(replyBufMaxSize , "Msg0");
		// g_errno is set and we return true if it failed
		if ( ! replyBuf ) {
			log("net: Failed to pre-allocate %"INT32" bytes to hold "
			    "data read remotely from %s: %s.",
			    replyBufMaxSize,getDbnameFromId(m_rdbId),
			    mstrerror(g_errno));
			return true;
		}
	}

	// . make a request with the info above (note: not in network order)
	// . IMPORTANT!!!!! if you change this change 
	//   Multicast.cpp::sleepWrapper1 too!!!!!!!!!!!!
	//   no, not anymore, we commented out that request peeking code
	char *p = m_request;
	*(int64_t *) p = syncPoint        ; p += 8;
	//*(key_t     *) p = m_startKey       ; p += sizeof(key_t);
	//*(key_t     *) p = m_endKey         ; p += sizeof(key_t);
	*(int32_t      *) p = m_minRecSizes    ; p += 4;
	*(int32_t      *) p = startFileNum     ; p += 4;
	*(int32_t      *) p = numFiles         ; p += 4;
	*(int32_t      *) p = maxCacheAge      ; p += 4;
	if ( p - m_request != RDBIDOFFSET ) { char *xx=NULL;*xx=0; }
	*p               = m_rdbId          ; p++;
	*p               = addToCache       ; p++;
	*p               = doErrorCorrection; p++;
	*p               = includeTree      ; p++;
	*p               = (char)niceness   ; p++;
	*p               = (char)m_allowPageCache; p++;
	KEYSET(p,m_startKey,m_ks);          ; p+=m_ks;
	KEYSET(p,m_endKey,m_ks);            ; p+=m_ks;
	// NULL terminated collection name
	//strcpy ( p , coll ); p += gbstrlen ( coll ); *p++ = '\0';
	*(collnum_t *)p = m_collnum; p += sizeof(collnum_t);
	m_requestSize    = p - m_request;
	// ask an individual host for this list if hostId is NOT -1
	if ( m_hostId != -1 ) {
		// get Host
		Host *h = g_hostdb.getHost ( m_hostId );
		if ( ! h ) { 
			g_errno = EBADHOSTID; 
			log(LOG_LOGIC,"net: msg0: Bad hostId of %"INT64".",
			    m_hostId);
			return true;
		}
		// if niceness is 0, use the higher priority udpServer
		UdpServer *us ;
		uint16_t port;
		QUICKPOLL(m_niceness);
		//if ( niceness <= 0 || netnice == 0 ) { 
		//if ( realtime ) {
		//	us = &g_udpServer2; port = h->m_port2; }
		//else                 { 
		us = &g_udpServer ; port = h->m_port ; 
		// . returns false on error and sets g_errno, true otherwise
		// . calls callback when reply is received (or error)
		// . we return true if it returns false
		if ( ! us->sendRequest ( m_request     ,
					 m_requestSize ,
					 0x00          , // msgType
					 h->m_ip       ,
					 port          ,
					 m_hostId      ,
					 NULL          , // the slotPtr
					 this          ,
					 gotSingleReplyWrapper ,
					 timeout       ,
					 -1            , // backoff
					 -1            , // maxwait
					 replyBuf      ,
					 replyBufMaxSize ,
					 m_niceness     ) ) // cback niceness
			return true;
		// return false cuz it blocked
		return false;
	}
	// timing debug
	if ( g_conf.m_logTimingNet )
		m_startTime = gettimeofdayInMilliseconds();
	else
		m_startTime = 0;
	//if ( m_rdbId == RDB_INDEXDB ) log("Msg0:: getting remote indexlist. "
	//			"termId=%"UINT64", "
	//			"groupNum=%"UINT32"",
	//			g_indexdb.getTermId(m_startKey) ,
	//			g_hostdb.makeHostId ( m_groupId ) );

	/*
	// make the cache key so we can see what remote host cached it, if any
	char cacheKey[MAX_KEY_BYTES];
	//key_t cacheKey = makeCacheKey ( startKey     ,
	makeCacheKey ( startKey     ,
		       endKey       ,
		       includeTree  ,
		       minRecSizes  ,
		       startFileNum ,
		       numFiles     ,
		       cacheKey     ,
		       m_ks         );
	*/

	// . get the top int32_t of the key
	// . i guess this will work for 128 bit keys... hmmmmm
	int32_t keyTop = hash32 ( (char *)startKey , m_ks );

	/*
	// allocate space
	if ( m_numSplit > 1 ) {
		int32_t  need = m_numSplit * sizeof(Multicast) ;
		char *buf  = (char *)mmalloc ( need,"msg0mcast" );
		if ( ! buf ) return true;
		m_mcasts = (Multicast *)buf;
		for ( int32_t i = 0; i < m_numSplit ; i++ )
			m_mcasts[i].constructor();
	}
	*/

        // . otherwise, multicast to a host in group "groupId"
	// . returns false and sets g_errno on error
	// . calls callback on completion
	// . select first host to send to in group based on upper 32 bits
	//   of termId (m_startKey.n1)
//#ifdef SPLIT_INDEXDB
	// . need to send out to all the indexdb split hosts
	m_numRequests = 0;
	m_numReplies  = 0;
	//for ( int32_t i = 0; i < m_numSplit; i++ ) {

	QUICKPOLL(m_niceness);
	//int32_t gr;
	char *buf;
	/*
	if ( m_numSplit > 1 ) {
		gr  = g_indexdb.getSplitGroupId ( baseGroupId, i );
		buf = &replyBuf[i*replyBufMaxSize];
	}
	else {
	*/
	//gr  = m_groupId;
	buf = replyBuf;
	//}

	// get the multicast
	Multicast *m = &m_mcast;
	//if ( m_numSplit > 1 ) m = &m_mcasts[i];

        if ( ! m->send ( m_request    , 
//#else
//        if ( ! m_mcast.send ( m_request    , 
//#endif
			      m_requestSize, 
			      0x00         , // msgType 0x00
			      false        , // does multicast own request?
			 m_shardNum ,
//#ifdef SPLIT_INDEXDB
//			      gr           , // group + offset
//#else
//			      m_groupId    , // group to send to (groupKey)
//#endif
			      false        , // send to whole group?
			      //m_startKey.n1, // key is passed on startKey
			      keyTop       , // key is passed on startKey
			      this         , // state data
			      NULL         , // state data
			      gotMulticastReplyWrapper0 ,
			      timeout      , // timeout in seconds (was 30)
			      niceness     ,
			      realtime     ,
			      firstHostId  ,
//#ifdef SPLIT_INDEXDB
//			      &replyBuf[i*replyBufMaxSize] ,
//#else
//			      replyBuf        ,
//#endif
			      buf             ,
			      replyBufMaxSize ,
			      freeReply       , // free reply buf?
			      true            , // do disk load balancing?
			      maxCacheAge     ,
			      //(key_t *)cacheKey        ,
			      // multicast uses it for determining the best
			      // host to send the request to when doing 
			      // disk load balancing. if the host has our 
			      // data cached, then it will probably get to
			      // handle the request. for now let's just assume
			      // this is a 96-bit key. TODO: fix...
			 0 , // *(key_t *)cacheKey        ,
			      rdbId           ,
			      minRecSizes     ) ) {
		log("net: Failed to send request for data from %s in shard "
		    "#%"UINT32" over network: %s.",
		    getDbnameFromId(m_rdbId),m_shardNum, mstrerror(g_errno));
		// no, multicast will free this when it is destroyed
		//if (replyBuf) mfree ( replyBuf , replyBufMaxSize , "Msg22" );
		// but speed it up
//#ifdef SPLIT_INDEXDB
		m_errno = g_errno;
		m->reset();
		if ( m_numRequests > 0 )
			return false;
//#else
//		m_mcast.reset();
//#endif
		return true;
	}
//#ifdef SPLIT_INDEXDB
	m_numRequests++;

//#endif
	// we blocked
	return false;
}
bool Indexdb::verify ( char *coll ) {
	return true;
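	// note (added): the early return above disables Indexdb verification;
	// the code below is retained but unreachable.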
	log ( LOG_INFO, "db: Verifying Indexdb for coll %s...", coll );
	g_threads.disableThreads();

	Msg5 msg5;
	Msg5 msg5b;
	RdbList list;
	key_t startKey;
	key_t endKey;
	startKey.setMin();
	endKey.setMax();
	//long minRecSizes = 64000;
	
	if ( ! msg5.getList ( RDB_INDEXDB   ,
			      coll          ,
			      &list         ,
			      startKey      ,
			      endKey        ,
			      64000         , // minRecSizes   ,
			      true          , // includeTree   ,
			      false         , // add to cache?
			      0             , // max cache age
			      0             , // startFileNum  ,
			      -1            , // numFiles      ,
			      NULL          , // state
			      NULL          , // callback
			      0             , // niceness
			      false         , // err correction?
			      NULL          ,
			      0             ,
			      -1            ,
			      true          ,
			      -1LL          ,
			      &msg5b        ,
			      true          )) {
		g_threads.enableThreads();
		return log("db: HEY! it did not block");
	}

	long count = 0;
	long got   = 0;
	bool printedKey = false;
	bool printedZeroKey = false;
	for ( list.resetListPtr() ; ! list.isExhausted() ;
	      list.skipCurrentRecord() ) {
		key_t k = list.getCurrentKey();
		count++;
		//unsigned long groupId = k.n1 & g_hostdb.m_groupMask;
		//unsigned long groupId = getGroupId ( RDB_INDEXDB , &k );
		//if ( groupId == g_hostdb.m_groupId ) got++;
		unsigned long shardNum = getShardNum( RDB_INDEXDB , &k );
		if ( shardNum == getMyShardNum() ) got++;
		else if ( !printedKey ) {
			log ( "db: Found bad key in list (only printing once): "
			      "%lx %llx", k.n1, k.n0 );
			printedKey = true;
		}
		if ( k.n1 == 0 && k.n0 == 0 ) {
			if ( !printedZeroKey ) {
				log ( "db: Found Zero key in list, passing. "
				      "(only printing once)." );
				printedZeroKey = true;
			}
			if ( shardNum != getMyShardNum() )
				got++;
		}
	}
	if ( got != count ) {
		log ("db: Out of first %li records in indexdb, only %li belong "
		     "to our group.",count,got);
		// exit if NONE, we probably got the wrong data
		if ( got == 0 ) log("db: Are you sure you have the "
				    "right "
				    "data in the right directory? "
				    "Exiting.");
		log ( "db: Exiting due to Indexdb inconsistency." );
		g_threads.enableThreads();
		return g_conf.m_bypassValidation;
	}
	log ( LOG_INFO, "db: Indexdb passed verification successfully for %li "
			"recs.", count );
	// DONE
	g_threads.enableThreads();
	return true;
}
void Indexdb::deepVerify ( char *coll ) {
	log ( LOG_INFO, "db: Deep Verifying Indexdb for coll %s...", coll );
	g_threads.disableThreads();

	Msg5 msg5;
	Msg5 msg5b;
	RdbList list;
	key_t startKey;
	key_t endKey;
	startKey.setMin();
	endKey.setMax();
	//long minRecSizes = 64000;
	
	collnum_t collnum = g_collectiondb.getCollnum(coll);
	RdbBase *rdbBase = g_indexdb.m_rdb.getBase(collnum);
	long numFiles = rdbBase->getNumFiles();
	long currentFile = 0;
	
deepLoop:
	// done after scanning all files
	if ( currentFile >= numFiles ) {
		g_threads.enableThreads();
		log ( LOG_INFO, "db: Finished deep verify for %li files.",
				numFiles );
		return;
	}
	// scan this file
	if ( ! msg5.getList ( RDB_INDEXDB   ,
			      coll          ,
			      &list         ,
			      startKey      ,
			      endKey        ,
			      64000         , // minRecSizes   ,
			      true          , // includeTree   ,
			      false         , // add to cache?
			      0             , // max cache age
			      currentFile   , // startFileNum  ,
			      1             , // numFiles      ,
			      NULL          , // state
			      NULL          , // callback
			      0             , // niceness
			      false         , // err correction?
			      NULL          ,
			      0             ,
			      -1            ,
			      true          ,
			      -1LL          ,
			      &msg5b        ,
			      false         )) {
		g_threads.enableThreads();
		log("db: HEY! it did not block");
		return;
	}

	long count = 0;
	long got   = 0;
	for ( list.resetListPtr() ; ! list.isExhausted() ;
	      list.skipCurrentRecord() ) {
		key_t k = list.getCurrentKey();
		count++;
		//unsigned long groupId = k.n1 & g_hostdb.m_groupMask;
		//unsigned long groupId = getGroupId ( RDB_INDEXDB , &k );
		//if ( groupId == g_hostdb.m_groupId ) got++;
		unsigned long shardNum = getShardNum( RDB_INDEXDB , &k );
		if ( shardNum == getMyShardNum() ) got++;
	}
	if ( got != count ) {
		BigFile *f = rdbBase->getFile(currentFile);
		log ("db: File %s: Out of first %li records in indexdb, "
		     "only %li belong to our group.",
		     f->getFilename(),count,got );
	}
	//else
	//	log ( LOG_INFO, "db: File %li: Indexdb passed verification "
	//	      "successfully for %li recs.",currentFile,count );
	// next file
	currentFile++;
	goto deepLoop;
}
// . returns false if blocked, true otherwise
// . sets g_errno on error
// . if the list is sorted by keys this will be the most efficient
bool Msg1::sendSomeOfList ( ) {
	// sanity check
	if ( m_list->m_ks !=  8 &&
	     m_list->m_ks != 12 &&
	     m_list->m_ks != 16 &&
	     m_list->m_ks != 24 ) { 
		g_process.shutdownAbort(true); }
	// debug msg
	//log("sendSomeOfList: mcast=%" PRIu32" exhausted=%" PRId32,
	//    (int32_t)&m_mcast,(int32_t)m_list->isExhausted());
 loop:
	// return true if list exhausted and nothing left to add
	if ( m_list->isExhausted() ) return true;
	// get key of the first record in the list
	//key_t firstKey = m_list->getCurrentKey();
	char firstKey[MAX_KEY_BYTES];
	m_list->getCurrentKey(firstKey);
 	QUICKPOLL(m_niceness);
	// get groupId from this key
	//uint32_t groupId ; 
	// . use the new Hostdb.h inlined function
	uint32_t shardNum = getShardNum ( m_rdbId , firstKey );

	// point to start of data we're going to send
	char *dataStart = m_list->getListPtr();
	// how many records belong to the same group as "firstKey"
	//key_t key;
	char key[MAX_KEY_BYTES];
	while ( ! m_list->isExhausted() ) {
		//key = m_list->getCurrentKey();
		m_list->getCurrentKey(key);
#ifdef GBSANITYCHECK
		// no half bits in here!
		// debug point
		if ( m_list->useHalfKeys() && 
		     m_list->isHalfBitOn ( m_list->getCurrentRec() ) )
			log(LOG_LOGIC,"net: msg1: Got half bit. Bad "
			    "engineer.");
#endif
		// . if key belongs to same group as firstKey then continue
		// . titledb now uses last bits of docId to determine groupId
		// . but uses the top 32 bits of key still
		// . spiderdb uses last 64 bits to determine groupId
		// . tfndb now is like titledb(top 32 bits are top 32 of docId)
		//if ( getGroupId(m_rdbId,key) != groupId ) goto done;
		if ( getShardNum(m_rdbId,key) != shardNum ) goto done;

		// . break so we don't send more than MAX_DGRAMS defined in 
		//   UdpServer.cpp.
		// . let's boost it from 16k to 64k for speed
		if ( m_list->getListPtr() - dataStart > 64*1024 ) goto done;
		// . point to next record
		// . will point passed records if no more left!
 		QUICKPOLL(m_niceness);
		//int32_t crec = m_list->getCurrentRecSize();
		m_list->skipCurrentRecord();
		// sanity check
		if ( m_list->m_listPtr > m_list->m_listEnd ) {
			g_process.shutdownAbort(true); }
	}
 done:
	// now point to the end of the data
	char *dataEnd = m_list->getListPtr();
	// . if force local is true we force the data to be added locally
	// . this fixes the bug we had from spiderdb since a key got corrupted
	//   just enough to put it into a different groupId (but not out
	//   of order) so we couldn't delete it cuz our delete keys would go
	//   elsewhere
	if ( m_forceLocal && shardNum != getMyShardNum() &&
	     ! g_conf.m_interfaceMachine ) {
		// make the groupId local, our group
		//groupId = g_hostdb.m_groupId;
		// bitch about this to log it
		log("net: Data does not belong in shard %" PRIu32", but adding "
		    "to %s anyway. Probable data corruption.",
		    (uint32_t)shardNum,getDbnameFromId(m_rdbId));
	}
	
 	QUICKPOLL(m_niceness);

	// sanity test for new rdbs
	if ( m_list->m_fixedDataSize != getDataSizeFromRdbId(m_rdbId) ) {
		g_process.shutdownAbort(true); }

	// . now send this list to the host
	// . this returns false if blocked, true otherwise
	// . it also sets g_errno on error
	// . if it blocked return false
	if ( ! sendData ( shardNum , dataStart , dataEnd - dataStart ) )
		return false;
	// if there was an error return true
	if ( g_errno ) return true;
	// otherwise, keep adding
	goto loop;
}
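// --------------------------------------------------------------------------
// Illustrative sketch, not part of the original source: the loop above carves
// a key-sorted list into runs of consecutive records that hash to the same
// shard, stopping a run once it grows past ~64KB so a single multicast stays
// under the datagram limit. The same chunking logic over a flat array of
// fixed-size records, with shardOf() standing in for getShardNum(), could be
// written as:
#include <cstdint>
#include <cstring>

struct Chunk { const char *start; long size; uint32_t shard; };

// hypothetical stand-in for getShardNum(rdbId,key)
static uint32_t shardOf ( const char *rec , uint32_t numShards ) {
	uint64_t k;
	memcpy ( &k , rec , sizeof(k) );
	return (uint32_t)( k % numShards );
}

// carve off the next run of same-shard records starting at "p";
// advances "p" past the run and returns its bounds
static Chunk nextChunk ( const char *&p , const char *end ,
			 long recSize , uint32_t numShards ) {
	Chunk c;
	c.start = p;
	c.shard = shardOf ( p , numShards );
	// always take at least one record, then extend while the shard
	// stays the same and the run stays under roughly 64KB
	p += recSize;
	while ( p < end &&
		shardOf ( p , numShards ) == c.shard &&
		p - c.start < 64*1024 )
		p += recSize;
	c.size = p - c.start;
	return c;
}
// each Chunk would then be handed off much like sendData(shard,start,size)
// --------------------------------------------------------------------------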
/*
bool Revdb::addColl ( char *coll, bool doVerify ) {
	if ( ! m_rdb.addColl ( coll ) ) return false;
	if ( ! doVerify ) return true;
	// verify
	if ( verify(coll) ) return true;
	// if not allowing scale, return false
	if ( ! g_conf.m_allowScale ) return false;
	// otherwise let it go
	log ( "db: Verify failed, but scaling is allowed, passing." );
	return true;
}
*/
bool Revdb::verify ( char *coll ) {
	log ( LOG_INFO, "db: Verifying Revdb for coll %s...", coll );
	g_threads.disableThreads();

	Msg5 msg5;
	Msg5 msg5b;
	RdbList list;
	key_t startKey;
	key_t endKey;
	startKey.setMin();
	endKey.setMax();
	//int32_t minRecSizes = 64000;
	CollectionRec *cr = g_collectiondb.getRec(coll);

	if ( ! msg5.getList ( RDB_REVDB   ,
			      cr->m_collnum       ,
			      &list         ,
			      startKey      ,
			      endKey        ,
			      1024*1024     , // minRecSizes   ,
			      true          , // includeTree   ,
			      false         , // add to cache?
			      0             , // max cache age
			      0             , // startFileNum  ,
			      -1            , // numFiles      ,
			      NULL          , // state
			      NULL          , // callback
			      0             , // niceness
			      false         , // err correction?
			      NULL          , // cache key ptr
			      0             , // retry num
			      -1            , // maxRetries
			      true          , // compensate for merge
			      -1LL          , // sync point
			      &msg5b        ,
			      false         )) {
		g_threads.enableThreads();
		return log("db: HEY! it did not block");
	}

	int32_t count = 0;
	int32_t got   = 0;
	for ( list.resetListPtr() ; ! list.isExhausted() ;
	      list.skipCurrentRecord() ) {
		key_t k = list.getCurrentKey();
		count++;
		//uint32_t groupId = getGroupId ( RDB_REVDB , &k );
		//if ( groupId == g_hostdb.m_groupId ) got++;
		uint32_t shardNum = getShardNum( RDB_REVDB , &k );
		if ( shardNum == getMyShardNum() ) got++;
	}
	if ( got != count ) {
		log ("db: Out of first %"INT32" records in revdb, "
		     "only %"INT32" belong to our group.",count,got);
		// exit if NONE, we probably got the wrong data
		if ( count > 10 && got == 0 ) 
			log("db: Are you sure you have the right "
				   "data in the right directory? "
				   "Exiting.");
		log ( "db: Exiting due to Revdb inconsistency." );
		g_threads.enableThreads();
		return g_conf.m_bypassValidation;
	}

	log ( LOG_INFO, "db: Revdb passed verification successfully for %"INT32""
			" recs.", count );
	// DONE
	g_threads.enableThreads();
	return true;
}
/*
bool Placedb::addColl ( char *coll, bool doVerify ) {
	if ( ! m_rdb.addColl ( coll ) ) return false;
	if ( ! doVerify ) return true;
	// verify
	if ( verify(coll) ) return true;
	// if not allowing scale, return false
	if ( ! g_conf.m_allowScale ) return false;
	// otherwise let it go
	log ( "db: Verify failed, but scaling is allowed, passing." );
	return true;
}
*/
bool Placedb::verify ( char *coll ) {
	log ( LOG_INFO, "db: Verifying Placedb for coll %s...", coll );
	g_threads.disableThreads();

	Msg5 msg5;
	Msg5 msg5b;
	RdbList list;
	key_t startKey;
	startKey.setMin();
	key_t endKey;
	endKey.setMax();
	
	if ( ! msg5.getList ( RDB_PLACEDB     ,
			      coll          ,
			      &list         ,
			      startKey      ,
			      endKey        ,
			      64000         , // minRecSizes   ,
			      true          , // includeTree   ,
			      false         , // add to cache?
			      0             , // max cache age
			      0             , // startFileNum  ,
			      -1            , // numFiles      ,
			      NULL          , // state
			      NULL          , // callback
			      0             , // niceness
			      false         , // err correction?
			      NULL          ,
			      0             ,
			      -1            ,
			      true          ,
			      -1LL          ,
			      &msg5b        ,
			      true          ,
			      false         )) { // allow page cache?
		g_threads.enableThreads();
		return log("db: HEY! it did not block");
	}

	long count = 0;
	long got   = 0;
	bool printedKey = false;
	bool printedZeroKey = false;
	for ( list.resetListPtr() ; ! list.isExhausted() ;
	      list.skipCurrentRecord() ) {
		key_t k = list.getCurrentKey();
		count++;
		// verify the group
		uint32_t shardNum = getShardNum ( RDB_PLACEDB , (char *)&k );
		if ( shardNum == getMyShardNum() )
			got++;
		else if ( !printedKey ) {
			log ("db: Found bad key in list (only printing once): "
			      "%lx %llx", k.n1, k.n0 );
			printedKey = true;
		}
		if ( k.n1 == 0 && k.n0 == 0 ) {
			if ( !printedZeroKey ) {
				log ( "db: Found Zero key in list, passing. "
				      "(only printing once)." );
				printedZeroKey = true;
			}
			// pass if we didn't match above
			if ( shardNum != getMyShardNum() )
				got++;
		}
	}
	if ( got != count ) {
		log("db: Out of first %li records in placedb, only %li passed "
		     "verification.",count,got);
		// exit if NONE, we probably got the wrong data
		if ( got == 0 ) log("db: Are you sure you have the "
					   "right "
					   "data in the right directory? "
					   "Exiting.");
		g_threads.enableThreads();
		// if only one let it slide, i saw this happen on gb1 cluster
		if ( got - count >= -1 && got - count <= 1 )
			return true;
		log ( "db: Exiting due to Placedb inconsistency." );
		return g_conf.m_bypassValidation;
	}

	log ( LOG_INFO, "db: Placedb passed verification successfully for %li "
			"recs.", count );
	// DONE
	g_threads.enableThreads();
	return true;
}
Example #14
// . return false if blocked, true otherwise
// . sets g_errno on error
bool Msg1::sendData ( unsigned long shardNum, char *listData , long listSize) {
	// debug msg
	//log("sendData: mcast=%lu listSize=%li",
	//    (long)&m_mcast,(long)listSize);

	// bail if this is an interface machine, don't write to the main
	if ( g_conf.m_interfaceMachine ) return true;
	// return true if no data
	if ( listSize == 0 ) return true;
	// how many hosts in this group
	//long numHosts = g_hostdb.getNumHostsPerShard();
	// . NOTE: for now i'm removing this until I handle ETRYAGAIN errors
	//         properly... by waiting and retrying...
	// . if this is local data just for us just do an addList to OUR rdb
	/*
	if ( groupId == g_hostdb.m_groupId  && numHosts == 1 ) {
		// this sets g_errno on error
		Msg0 msg0;
		Rdb *rdb = msg0.getRdb ( (char) m_rdbId );
		if ( ! rdb ) return true;
		// make a list from this data
		RdbList list;
		list.set (listData,listSize,listSize,rdb->getFixedDataSize(),
			  false) ; // ownData?
		// this returns false and sets g_errno on error
		rdb->addList ( &list );
		// . if we got a ETRYAGAIN cuz the buffer we add to was full
		//   then we should sleep and try again!
		// . return false cuz this blocks for a period of time
		//   before trying again
		if ( g_errno == ETRYAGAIN ) {
			// try adding again in 1 second
			registerSleepCallback ( 1000, slot, tryAgainWrapper1 );
			// return now
			return false;
		}
		// . always return true cuz we did not block
		// . g_errno may be set
		return true;
	}
	*/
	// if the data is being added to our group, don't send ourselves
	// a msg1, if we can add it right now
	bool sendToSelf = true;
	if ( shardNum == getMyShardNum() &&
	     ! g_conf.m_interfaceMachine ) {
		// get the rdb to which it belongs, use Msg0::getRdb()
		Rdb *rdb = getRdbFromId ( (char) m_rdbId );
		if ( ! rdb ) goto skip;
		// key size
		long ks = getKeySizeFromRdbId ( m_rdbId );
		// reset g_errno
		g_errno = 0;
		// . make a list from this data
		// . skip over the first 4 bytes which is the rdbId
		// . TODO: embed the rdbId in the msgtype or something...
		RdbList list;
		// set the list
		list.set ( listData ,
			   listSize ,
			   listData ,
			   listSize ,
			   rdb->getFixedDataSize() ,
			   false                   ,  // ownData?
			   rdb->useHalfKeys()      ,
			   ks                      ); 
		// note that
		//log("msg1: local addlist niceness=%li",m_niceness);
		// this returns false and sets g_errno on error
		rdb->addList ( m_coll , &list , m_niceness );
		// if titledb, add tfndb recs to map the title recs
		//if ( ! g_errno && rdb == g_titledb.getRdb() && m_injecting ) 
		//	// this returns false and sets g_errno on error
		//	updateTfndb ( m_coll , &list , true , m_niceness);
		// if no error, no need to use a Msg1 UdpSlot for ourselves
		if ( ! g_errno ) sendToSelf = false;
		else {
			log("rdb: msg1 had error: %s",mstrerror(g_errno));
			// this is messing up generate catdb's huge rdblist add
			// why did we put it in there??? from msg9b.cpp
			//return true;
		}
		
 		QUICKPOLL(m_niceness);
		// if we're the only one in the group, bail, we're done
		if ( ! sendToSelf &&
		     g_hostdb.getNumHostsPerShard() == 1 ) return true;
	}
skip:
	// . make an add record request to multicast to a bunch of machines
	// . this will alloc new space, returns NULL on failure
	//char *request = makeRequest ( listData, listSize, groupId , 
	//m_rdbId , &requestLen );
	long collLen = gbstrlen ( m_coll );
	// . returns NULL and sets g_errno on error
	// . calculate total size of the record
	// . 1 byte for rdbId, 1 byte for flags,
	//   then collection NULL terminated, then list
	long requestLen = 1 + 1 + collLen + 1 + listSize ;
	// make the request
	char *request = (char *) mmalloc ( requestLen ,"Msg1" );
	if ( ! request ) return true;
	char *p = request;
	// store the rdbId at top of request
	*p++ = m_rdbId;
	// then the flags
	*p = 0;
	if ( m_injecting ) *p |= 0x80;
	p++;
	// then collection name
	memcpy ( p , m_coll , collLen );
	p += collLen;
	*p++ = '\0';
	// sanity check
	if ( collLen <= 0 ) {
		log(LOG_LOGIC,"net: No collection specified for list add.");
		//char *xx = NULL; *xx = 0;
		g_errno = ENOCOLLREC;
		return true;
	}
	//if ( m_deleteRecs    ) request[1] |= 0x80;
	//if ( m_overwriteRecs ) request[1] |= 0x40;
	// store the list after coll
	memcpy ( p , listData , listSize );
 	QUICKPOLL(m_niceness);
	// debug msg
	//if ( ! m_waitForReply ) // (m_rdbId == RDB_SPIDERDB || 
	//m_rdbId == RDB_TFNDB)  )
	//	// if we don't get here we lose it!!!!!!!!!!!!!!!!!!!!!
	//	log("using mcast=%lu rdbId=%li listData=%lu listSize=%lu "
	//	    "gid=%lu",
	//	   (long)&m_mcast,(long)m_rdbId,(long)listData,(long)listSize,
	//	    groupId);
	// for small packets
	//long niceness = 2;
	//if ( requestLen < TMPBUFSIZE - 32 ) niceness = 0;
	//log("msg1: sending mcast niceness=%li",m_niceness);
	// . multicast to all hosts in group "groupId"
	// . multicast::send() returns false and sets g_errno on error
	// . we return false if we block, true otherwise
	// . will loop indefinitely if a host in this group is down
	key_t k; k.setMin();
	if ( m_mcast.send ( request    , // sets mcast->m_msg    to this
			    requestLen , // sets mcast->m_msgLen to this
			    0x01       , // msgType for add rdb record
			    true       , // does multicast own msg?
			    shardNum   , // group to send to (groupKey)
			    true       , // send to whole group?
			    0          , // key is useless for us
			    this       , // state data
			    NULL       , // state data 2
			    gotReplyWrapper1 ,
			    60         , // timeout in secs
			    m_niceness , // niceness 
			    false    , // realtime
			    -1    , // first host to try
			    NULL  , // replyBuf        = NULL ,
			    0     , // replyBufMaxSize = 0 ,
			    true  , // freeReplyBuf    = true ,
			    false , // doDiskLoadBalancing = false ,
			    -1    , // no max cache age limit
			    //(key_t)0 , // cache key
			    k    , // cache key
			    RDB_NONE , // bogus rdbId
			    -1    , // unknown minRecSizes read size
			    sendToSelf ))
		return false;

 	QUICKPOLL(m_niceness);
	// g_errno should be set
	log("net: Had error when sending request to add data to %s in shard "
	    "#%lu: %s.", getDbnameFromId(m_rdbId),shardNum,mstrerror(g_errno));
	return true;	
}
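// --------------------------------------------------------------------------
// Illustrative sketch, not part of the original source: the request that
// sendData() multicasts above is a flat buffer laid out as
//   [1 byte rdbId][1 byte flags][collection name][NUL][raw list data]
// A standalone packer for that layout, using plain malloc() in place of the
// engine's mmalloc() and a hypothetical FLAG_INJECTING constant for the 0x80
// flag bit, could look like:
#include <cstdlib>
#include <cstring>

static const char FLAG_INJECTING = (char)0x80;

// returns a newly malloc'd request buffer (caller frees), or NULL on
// allocation failure; *reqLen receives the buffer's total size
static char *packAddRequest ( char rdbId , bool injecting ,
			      const char *coll ,
			      const char *listData , long listSize ,
			      long *reqLen ) {
	long collLen = (long)strlen ( coll );
	*reqLen = 1 + 1 + collLen + 1 + listSize;
	char *req = (char *)malloc ( *reqLen );
	if ( ! req ) return NULL;
	char *p = req;
	*p++ = rdbId;                                // which rdb to add to
	*p++ = injecting ? FLAG_INJECTING : 0;       // flags byte
	memcpy ( p , coll , collLen ); p += collLen; // collection name
	*p++ = '\0';                                 // NUL terminator
	memcpy ( p , listData , listSize );          // the record list itself
	return req;
}
// --------------------------------------------------------------------------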
Example #15
// . returns false if blocked, true otherwise
// . sets g_errno on error
// . if the list is sorted by keys this will be the most efficient
bool Msg1::sendSomeOfList ( ) {
	// sanity check
	if ( m_list->m_ks !=  8 &&
	     m_list->m_ks != 12 &&
	     m_list->m_ks != 16 &&
	     m_list->m_ks != 24 ) { 
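		// unsupported key size: deliberately dereference NULL to abort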
		char *xx=NULL;*xx=0; }
	// debug msg
	//log("sendSomeOfList: mcast=%lu exhausted=%li",
	//    (long)&m_mcast,(long)m_list->isExhausted());
 loop:
	// return true if list exhausted and nothing left to add
	if ( m_list->isExhausted() ) return true;
	// get key of the first record in the list
	//key_t firstKey = m_list->getCurrentKey();
	char firstKey[MAX_KEY_BYTES];
	m_list->getCurrentKey(firstKey);
 	QUICKPOLL(m_niceness);
	// get groupId from this key
	//unsigned long groupId ; 
	// . use the new Hostdb.h inlined function
	uint32_t shardNum = getShardNum ( m_rdbId , firstKey );
	// . default is to use top bits of the key
	// . but if we're adding to titledb use last bits in the top of key
	// . but if we're adding to spiderdb we use the last long in the key
	// . tfndb urlRec key same as titleRec key
	/*
	if      ( m_rdbId == RDB_INDEXDB )
		groupId = g_indexdb.getGroupIdFromKey((key_t *)firstKey);
	else if ( m_rdbId == RDB_DATEDB )
		groupId = g_datedb.getGroupIdFromKey((key128_t *)firstKey);
	else if ( m_rdbId == RDB_TITLEDB)
		groupId = g_titledb.getGroupIdFromKey((key_t *)firstKey);
	else if ( m_rdbId == RDB_CHECKSUMDB)
		groupId = g_checksumdb.getGroupId ( firstKey );
	else if ( m_rdbId == RDB_SPIDERDB )
		groupId = g_spiderdb.getGroupId ( (key_t *)firstKey );
	else if ( m_rdbId == RDB_TFNDB )
		groupId = g_tfndb.getGroupId    ( (key_t *)firstKey );
	else if ( m_rdbId == RDB_CLUSTERDB )
		groupId = g_clusterdb.getGroupIdFromKey((key_t *)firstKey);

	else if ( m_rdbId == RDB2_INDEXDB2 )
		groupId = g_indexdb.getGroupIdFromKey((key_t *)firstKey);
	else if ( m_rdbId == RDB2_DATEDB2 )
		groupId = g_datedb.getGroupIdFromKey((key128_t *)firstKey);
	else if ( m_rdbId == RDB2_TITLEDB2)
		groupId = g_titledb.getGroupIdFromKey((key_t *)firstKey);
	else if ( m_rdbId == RDB2_CHECKSUMDB2)
		groupId = g_checksumdb.getGroupId ( firstKey );
	else if ( m_rdbId == RDB2_SPIDERDB2 )
		groupId = g_spiderdb.getGroupId ( (key_t *)firstKey );
	else if ( m_rdbId == RDB2_TFNDB2 )
		groupId = g_tfndb.getGroupId    ( (key_t *)firstKey );
	else if ( m_rdbId == RDB2_CLUSTERDB2 )
		groupId = g_clusterdb.getGroupIdFromKey((key_t *)firstKey);
	//else    groupId=firstKey.n1 & g_hostdb.m_groupMask;
	else    groupId = (((key_t *)firstKey)->n1) & g_hostdb.m_groupMask;
	*/
	// point to start of data we're going to send
	char *dataStart = m_list->getListPtr();
	// how many records belong to the same group as "firstKey"
	//key_t key;
	char key[MAX_KEY_BYTES];
	while ( ! m_list->isExhausted() ) {
		//key = m_list->getCurrentKey();
		m_list->getCurrentKey(key);
#ifdef _SANITYCHECK_
		// no half bits in here!
		// debug point
		if ( m_list->useHalfKeys() && 
		     m_list->isHalfBitOn ( m_list->getCurrentRec() ) )
			log(LOG_LOGIC,"net: msg1: Got half bit. Bad "
			    "engineer.");
#endif
		// . if key belongs to same group as firstKey then continue
		// . titledb now uses last bits of docId to determine groupId
		// . but uses the top 32 bits of key still
		// . spiderdb uses last 64 bits to determine groupId
		// . tfndb now is like titledb(top 32 bits are top 32 of docId)
		//if ( getGroupId(m_rdbId,key) != groupId ) goto done;
		if ( getShardNum(m_rdbId,key) != shardNum ) goto done;
		/*
		switch ( m_rdbId ) {
		case RDB_TITLEDB: 
			if(g_titledb.getGroupIdFromKey((key_t *)key)!=groupId) 
			goto done;
			break;
		case RDB_CHECKSUMDB:
			if(g_checksumdb.getGroupId (         key)!=groupId)
			goto done;
			break;
		case RDB_SPIDERDB: 
			if ( g_spiderdb.getGroupId ((key_t *)key) != groupId) 
			goto done;
			break;
		case RDB_TFNDB:	
			if ( g_tfndb.getGroupId    ((key_t *)key) != groupId) 
			goto done;
			break;
		case RDB_CLUSTERDB:
		      if(g_clusterdb.getGroupIdFromKey((key_t *)key)!=groupId) 
			goto done;
			break;
		case RDB_DATEDB:
		       if(g_datedb.getGroupIdFromKey((key128_t *)key)!=groupId)
			goto done;
			break;
		case RDB_INDEXDB:
		       if(g_indexdb.getGroupIdFromKey((key_t *)key)!=groupId)
			goto done;
			break;
		//default:if ((key.n1&g_hostdb.m_groupMask)  != groupId) 
		default:  if ( ((((key_t *)key)->n1) & g_hostdb.m_groupMask) !=
			       groupId) 
			goto done;
		}
		*/
		// . break so we don't send more than MAX_DGRAMS defined in 
		//   UdpServer.cpp.
		// . let's boost it from 16k to 64k for speed
		if ( m_list->getListPtr() - dataStart > 64*1024 ) goto done;
		// . point to next record
		// . will point passed records if no more left!
 		QUICKPOLL(m_niceness);
		//long crec = m_list->getCurrentRecSize();
		m_list->skipCurrentRecord();
		// sanity check
		if ( m_list->m_listPtr > m_list->m_listEnd ) {
			char *xx=NULL;*xx=0; }
	}
 done:
	// now point to the end of the data
	char *dataEnd = m_list->getListPtr();
	// . if force local is true we force the data to be added locally
	// . this fixes the bug we had from spiderdb since a key got corrupted
	//   just enough to put it into a different groupId (but not out
	//   of order) so we couldn't delete it cuz our delete keys would go
	//   elsewhere
	if ( m_forceLocal && shardNum != getMyShardNum() &&
	     ! g_conf.m_interfaceMachine ) {
		// make the groupId local, our group
		//groupId = g_hostdb.m_groupId;
		// bitch about this to log it
		log("net: Data does not belong in shard %lu, but adding "
		    "to %s anyway. Probable data corruption.",
		    (unsigned long)shardNum,getDbnameFromId(m_rdbId));
	}
	
 	QUICKPOLL(m_niceness);

	// sanity test for new rdbs
	if ( m_list->m_fixedDataSize != getDataSizeFromRdbId(m_rdbId) ) {
		char *xx=NULL;*xx=0; }

	// little debug thing for genCatdb from msg9b's huge list add
	//if ( m_list->m_listSize > 10000000 )
	//	log("msg1: adding chunk @ %li of %li bytes",
	//	    (long)(dataStart - m_list->m_list) ,
	//	    (long)m_list->m_listSize );

	// . now send this list to the host
	// . this returns false if blocked, true otherwise
	// . it also sets g_errno on error
	// . if it blocked return false
	if ( ! sendData ( shardNum , dataStart , dataEnd - dataStart ) )
		return false;
	// if there was an error return true
	if ( g_errno ) return true;
	// otherwise, keep adding
	goto loop;
}