// . return false if blocked, true otherwise
// . set g_errno on error
// . list should be truncated, possible have all negative keys removed,
//   and de-duped thanks to RdbList::indexMerge_r() and RdbList::merge_r()
bool RdbMerge::dumpList ( ) {
	// return true on g_errno
	if ( g_errno ) return true;

	// . it's suspended so we count this as blocking
	// . resumeMerge() will call getNextList() again, not dumpList() so
	//   don't advance m_startKey
	if ( m_isSuspended ) {
		m_isReadyToSave = true;
		return false;
	}

	// . set the list to only those records that should be in our group
	// . filter the records that don't belong in this group via groupId
	//filterList ( &m_list );

	// . compute the new m_startKey to get the next list from disk
	// . m_list was formed via RdbList::merge() 
	// . m_list may be empty because of negative/positive collisions
	//   but there may still be data left
	//m_startKey = m_list.getLastKey() ;
	//m_list.getLastKey(m_startKey) ;
	// if we use getLastKey() for this the merge completes but then
	// tries to merge two empty lists and cores in the merge function
	// because of that. i guess it relies on endkey rollover only and
	// not on reading less than minRecSizes to determine when to stop
	// doing the merge.
	m_list.getEndKey(m_startKey) ;
	//m_startKey += (uint32_t)1;
	KEYADD(m_startKey,m_ks);

	/////
	//
	// dedup for spiderdb before we dump it. try to save disk space.
	//
	/////
	if ( m_rdbId == RDB_SPIDERDB )
		// removeNegRecs? = false
		dedupSpiderdbList(&m_list, false);

	// if the startKey rolled over we're done
	//if ( m_startKey.n0 == 0LL && m_startKey.n1 == 0 ) m_doneMerging=true;
	if ( KEYCMP(m_startKey,KEYMIN(),m_ks)==0 ) m_doneMerging = true;
	// debug msg
	log(LOG_DEBUG,"db: Dumping list.");
	// debug msg
	//fprintf(stderr,"list startKey.n1=%" PRIu32",n0=%" PRIu64", endKey.n1=%" PRIu32",n0=%" PRIu64","
	//	" size=%" PRId32"\n",
	//	m_list.getStartKey().n1, 
	//	m_list.getStartKey().n0, 
	//	m_list.getLastKey().n1, 
	//	m_list.getLastKey().n0,  m_list.getListSize() );
	// . send the whole list to the dump
	// . it returns false if blocked, true otherwise
	// . it sets g_errno on error
	// . it calls dumpListWrapper when done dumping
	// . return true if m_dump had an error or it did not block
	// . if it gets a EFILECLOSED error it will keep retrying forever
	return m_dump.dumpList ( &m_list , m_niceness , false/*recall?*/ ) ;
}
Esempio n. 2
0
// . THIS Msg0 class must be alloc'd, i.e. not on the stack, etc.
// . if list is stored locally this tries to get it locally
// . otherwise tries to get the list from the network
// . returns false if blocked, true otherwise
// . sets g_errno on error
// . NOTE: i was having problems with queries being cached too long, you
//   see the cache here is a NETWORK cache, so when the machines that owns
//   the list updates it on disk it can't flush our cache... so use a small
//   maxCacheAge of like , 30 seconds or so...
bool Msg0::getList ( int64_t hostId      , // host to ask (-1 if none)
		     int32_t      ip          , // info on hostId
		     int16_t     port        ,
		     int32_t      maxCacheAge , // max cached age in seconds
		     bool      addToCache  , // add net recv'd list to cache?
		     char      rdbId       , // specifies the rdb
		     collnum_t collnum ,
		     RdbList  *list        ,
		     const char     *startKey    ,
		     const char     *endKey      ,
		     int32_t      minRecSizes ,  // use -1 for no max
		     void     *state       ,
		     void    (* callback)(void *state ),//, RdbList *list ) ,
		     int32_t      niceness    ,
		     bool      doErrorCorrection ,
		     bool      includeTree ,
		     bool      doMerge     ,
		     int32_t      firstHostId   ,
		     int32_t      startFileNum  ,
		     int32_t      numFiles      ,
		     int64_t      timeout       ,
		     int64_t syncPoint     ,
		     int32_t      preferLocalReads ,
		     Msg5     *msg5             ,
		     bool      isRealMerge      ,
		     bool      allowPageCache    ,
		     bool      forceLocalIndexdb ,
		     bool      noSplit ,
		     int32_t      forceParitySplit  ) {
	logTrace( g_conf.m_logTraceMsg0, "BEGIN. hostId: %" PRId64", rdbId: %d", hostId, (int)rdbId );

	// warning
	if ( collnum < 0 ) log(LOG_LOGIC,"net: NULL collection. msg0.");

	// reset the list they passed us
	list->reset();
	// get keySize of rdb
	m_ks = getKeySizeFromRdbId ( rdbId );
	
//	if( g_conf.m_logTraceMsg0 ) 
//	{
//		log("%s:%s:%d: rdbId. [%d]", __FILE__,__func__,__LINE__, (int)rdbId);
//		log("%s:%s:%d: m_ks.. [%d]", __FILE__,__func__,__LINE__, (int)m_ks);
//		log("%s:%s:%d: hostId [%" PRId64"]", __FILE__,__func__,__LINE__, hostId);
//	}

	// if startKey > endKey, don't read anything
	//if ( startKey > endKey ) return true;
	if ( KEYCMP(startKey,endKey,m_ks)>0 ) { char *xx=NULL;*xx=0; }//rettrue
	// . reset hostid if it is dead
	// . this is causing UOR queries to take forever when we have a dead
	if ( hostId >= 0 && g_hostdb.isDead ( hostId ) ) hostId = -1;
	// no longer accept negative minrecsize
	if ( minRecSizes < 0 ) {
		g_errno = EBADENGINEER;
		logTrace( g_conf.m_logTraceMsg0, "END" );

		log(LOG_LOGIC, "net: msg0: Negative minRecSizes no longer supported.");
		char *xx=NULL;*xx=0;
	}

	// remember these
	m_state         = state;
	m_callback      = callback;
	m_list          = list;
	m_hostId        = hostId;
	m_niceness      = niceness;
	m_addToCache    = addToCache;
	// . these define our request 100%
	KEYSET(m_startKey,startKey,m_ks);
	KEYSET(m_endKey,endKey,m_ks);
	m_minRecSizes   = minRecSizes;
	m_rdbId         = rdbId;
	m_collnum = collnum;//          = coll;
	m_isRealMerge   = isRealMerge;
	m_allowPageCache = allowPageCache;

	// . group to ask is based on the first key 
	// . we only do 1 group per call right now
	// . groupMask must turn on higher bits first (count downwards kinda)
	// . titledb and spiderdb use special masks to get groupId

	// if diffbot.cpp is reading spiderdb from each shard we have to
	// get groupid from hostid here lest we core in getGroupId() below.
	// it does that for dumping spiderdb to the client browser. they
	// can download the whole enchilada.
	if ( hostId >= 0 && m_rdbId == RDB_SPIDERDB )
		m_shardNum = 0;
	// did they force it? core until i figure out what this is
	else if ( forceParitySplit >= 0 ) 
		//m_groupId =  g_hostdb.getGroupId ( forceParitySplit );
		m_shardNum = forceParitySplit;
	else
		//m_groupId = getGroupId ( m_rdbId , startKey , ! noSplit );
		m_shardNum = getShardNum ( m_rdbId , startKey );

	// if we are looking up a termlist in posdb that is split by termid and
	// not the usual docid then we have to set this posdb key bit that tells
	// us that ...
	if ( noSplit && m_rdbId == RDB_POSDB )
		m_shardNum = g_hostdb.getShardNumByTermId ( startKey );

	// how is this used?
	if ( forceLocalIndexdb ) m_shardNum = getMyShardNum();


//	if( g_conf.m_logTraceMsg0 ) log("%s:%s:%d: shardNum [%" PRId32"]", __FILE__,__func__, __LINE__, m_shardNum);


	// . store these parameters
	// . get a handle to the rdb in case we can satisfy locally
	// . returns NULL and sets g_errno on error
	QUICKPOLL((m_niceness));
	Rdb *rdb = getRdbFromId ( m_rdbId );
	if ( ! rdb ) return true;
	// we need the fixedDataSize
	m_fixedDataSize = rdb->getFixedDataSize();
	m_useHalfKeys   = rdb->useHalfKeys();
	// . debug msg
	// . Msg2 does this when checking for a cached compound list.
	//   compound lists do not actually exist, they are merges of smaller
	//   UOR'd lists.
	if ( maxCacheAge != 0 && ! addToCache && (numFiles > 0 || includeTree)) {
		log( LOG_LOGIC, "net: msg0: Weird. check but don't add... rdbid=%" PRId32".", ( int32_t ) m_rdbId );
	}

	// set this here since we may not call msg5 if list not local
	//m_list->setFixedDataSize ( m_fixedDataSize );

	// . now that we do load balancing we don't want to do a disk lookup
	//   even if local if we are merging or dumping
	// . UNLESS g_conf.m_preferLocalReads is true
	if ( preferLocalReads == -1 ) 
		preferLocalReads = g_conf.m_preferLocalReads;

	// . always prefer local for full split clusterdb
	// . and keep the tfndb/titledb lookups in the same stripe
	// . so basically we can't do biased caches if fully split
	//if ( g_conf.m_fullSplit ) preferLocalReads = true;
	preferLocalReads = true;

	// it it stored locally?
	bool isLocal = ( m_hostId == -1 && //g_hostdb.m_groupId == m_groupId );
			 m_shardNum == getMyShardNum() );
	// only do local lookups if this is true
	if ( ! preferLocalReads ) isLocal = false;

	/*
	int64_t singleDocIdQuery = 0LL;
	if ( rdbId == RDB_POSDB ) {
		int64_t d1 = g_posdb.getDocId(m_startKey);
		int64_t d2 = g_posdb.getDocId(m_endKey);
		if ( d1+1 == d2 ) singleDocIdQuery = d1;
	}

	// . try the LOCAL termlist cache
	// . so when msg2 is evaluating a gbdocid:| query and it has to
	//   use msg0 to go across the network to get the same damn termlist
	//   over and over again for the same docid, this will help alot.
	// . ideally it'd be nice if the seo pipe in xmldoc.cpp can try to
	//   send the same gbdocid:xxxx docids to the same hosts. maybe hash
	//   based on docid into the list of hosts and if that host is busy
	//   just chain until we find someone not busy.
	if ( singleDocIdQuery &&
	     getListFromTermListCache ( coll,
					m_startKey,
					m_endKey,
					maxCacheAge,
					list ) )
		// found!
		return true;
	*/

	// but always local if only one host
	if ( g_hostdb.getNumHosts() == 1 ) isLocal = true;

	// . if the group is local then do it locally
	// . Msg5::getList() returns false if blocked, true otherwise
	// . Msg5::getList() sets g_errno on error
	// . don't do this if m_hostId was specified
	if ( isLocal ) {
		logTrace( g_conf.m_logTraceMsg0, "isLocal" );

		if ( msg5 ) {
			m_msg5 = msg5;
			m_deleteMsg5 = false;
		}
		else {
			try { m_msg5 = new ( Msg5 ); } 
			catch ( ... ) {
				g_errno = ENOMEM;
				log("net: Local alloc for disk read failed "
				    "while tring to read data for %s. "
				    "Trying remote request.",
				    getDbnameFromId(m_rdbId));
				goto skip;
			}
			mnew ( m_msg5 , sizeof(Msg5) , "Msg0::Msg5" );
			m_deleteMsg5 = true;
		}

		QUICKPOLL(m_niceness);
		if ( ! m_msg5->getList ( rdbId,
					 m_collnum ,
					 m_list ,
					 m_startKey ,
					 m_endKey   ,
					 m_minRecSizes ,
					 includeTree   , // include Tree?
					 addToCache    , // addToCache?
					 maxCacheAge   ,
					 startFileNum  , 
					 numFiles      ,
					 this ,
					 gotListWrapper2   ,
					 niceness          ,
					 doErrorCorrection ,
					 NULL , // cacheKeyPtr
					 0    , // retryNum
					 -1   , // maxRetries
					 true , // compensateForMerge
					 syncPoint ,
					 m_isRealMerge ,
					 m_allowPageCache ) ) {
			logTrace( g_conf.m_logTraceMsg0, "END, return false" );
			return false;
		}

		// nuke it
		reset();
		logTrace( g_conf.m_logTraceMsg0, "END, return true" );
		return true;
	}
skip:
	// debug msg
	if ( g_conf.m_logDebugQuery )
		log(LOG_DEBUG,"net: msg0: Sending request for data to "
		    "shard=%" PRIu32" "
		    "listPtr=%" PTRFMT" minRecSizes=%" PRId32" termId=%" PRIu64" "
		    //"startKey.n1=%" PRIx32",n0=%" PRIx64" (niceness=%" PRId32")",
		    "startKey.n1=%" PRIx64",n0=%" PRIx64" (niceness=%" PRId32")",
		    //g_hostdb.makeHostId ( m_groupId ) ,
		    m_shardNum,
		    (PTRTYPE)m_list,
		    m_minRecSizes, g_posdb.getTermId(m_startKey) , 
		    //m_startKey.n1,m_startKey.n0 , (int32_t)m_niceness);
		    KEY1(m_startKey,m_ks),KEY0(m_startKey),
		    (int32_t)m_niceness);

	char *replyBuf = NULL;
	int32_t  replyBufMaxSize = 0;
	bool  freeReply = true;

	// . make a request with the info above (note: not in network order)
	// . IMPORTANT!!!!! if you change this change 
	//   Multicast.cpp::sleepWrapper1 too!!!!!!!!!!!!
	//   no, not anymore, we commented out that request peeking code
	char *p = m_request;
	*(int64_t *) p = syncPoint        ; p += 8;
	//*(key_t     *) p = m_startKey       ; p += sizeof(key_t);
	//*(key_t     *) p = m_endKey         ; p += sizeof(key_t);
	*(int32_t      *) p = m_minRecSizes    ; p += 4;
	*(int32_t      *) p = startFileNum     ; p += 4;
	*(int32_t      *) p = numFiles         ; p += 4;
	*(int32_t      *) p = maxCacheAge      ; p += 4;
	if ( p - m_request != RDBIDOFFSET ) { char *xx=NULL;*xx=0; }
	*p               = m_rdbId          ; p++;
	*p               = addToCache       ; p++;
	*p               = doErrorCorrection; p++;
	*p               = includeTree      ; p++;
	*p               = (char)niceness   ; p++;
	*p               = (char)m_allowPageCache; p++;
	KEYSET(p,m_startKey,m_ks);          ; p+=m_ks;
	KEYSET(p,m_endKey,m_ks);            ; p+=m_ks;
	// NULL terminated collection name
	//strcpy ( p , coll ); p += gbstrlen ( coll ); *p++ = '\0';
	*(collnum_t *)p = m_collnum; p += sizeof(collnum_t);
	m_requestSize    = p - m_request;
	// ask an individual host for this list if hostId is NOT -1
	if ( m_hostId != -1 ) {
		// get Host
		Host *h = g_hostdb.getHost ( m_hostId );
		if ( ! h ) { 
			g_errno = EBADHOSTID; 
			log(LOG_LOGIC,"net: msg0: Bad hostId of %" PRId64".", m_hostId);
			logTrace( g_conf.m_logTraceMsg0, "END, return true. Bad hostId" );
			return true;
		}
		
		// if niceness is 0, use the higher priority udpServer
		UdpServer *us ;
		uint16_t port;
		QUICKPOLL(m_niceness);

		us = &g_udpServer ; port = h->m_port ; 
		// . returns false on error and sets g_errno, true otherwise
		// . calls callback when reply is received (or error)
		// . we return true if it returns false
		if ( ! us->sendRequest ( m_request     ,
					 m_requestSize ,
					 0x00          , // msgType
					 h->m_ip       ,
					 port          ,
					 m_hostId      ,
					 NULL          , // the slotPtr
					 this          ,
					 gotSingleReplyWrapper ,
					 timeout       ,
					 -1            , // backoff
					 -1            , // maxwait
					 replyBuf      ,
					 replyBufMaxSize ,
					 m_niceness     ) ) { // cback niceness
			logTrace( g_conf.m_logTraceMsg0, "END, return true. Request sent" );
			return true;
		}
		
		// return false cuz it blocked
		logTrace( g_conf.m_logTraceMsg0, "END, return false. sendRequest blocked" );
		return false;
	}
	// timing debug
	if ( g_conf.m_logTimingNet )
		m_startTime = gettimeofdayInMilliseconds();
	else
		m_startTime = 0;

	// . get the top int32_t of the key
	// . i guess this will work for 128 bit keys... hmmmmm
	int32_t keyTop = hash32 ( (char *)startKey , m_ks );

	// . otherwise, multicast to a host in group "groupId"
	// . returns false and sets g_errno on error
	// . calls callback on completion
	// . select first host to send to in group based on upper 32 bits
	//   of termId (m_startKey.n1)
	// . need to send out to all the indexdb split hosts
	m_numRequests = 0;
	m_numReplies  = 0;
	//for ( int32_t i = 0; i < m_numSplit; i++ ) {

	QUICKPOLL(m_niceness);
	//int32_t gr;
	char *buf;
	buf = replyBuf;

	// get the multicast
	Multicast *m = &m_mcast;

        if ( ! m->send ( m_request    , 
			      m_requestSize,
			      0x00         , // msgType 0x00
			      false        , // does multicast own request?
			 m_shardNum ,
			      false        , // send to whole group?
			      //m_startKey.n1, // key is passed on startKey
			      keyTop       , // key is passed on startKey
			      this         , // state data
			      NULL         , // state data
			      gotMulticastReplyWrapper0 ,
			      timeout*1000 , // timeout
			      niceness     ,
			      firstHostId  ,
			      buf             ,
			      replyBufMaxSize ,
			      freeReply       , // free reply buf?
			      true            , // do disk load balancing?
			      maxCacheAge     ,
			      //(key_t *)cacheKey        ,
			      // multicast uses it for determining the best
			      // host to send the request to when doing 
			      // disk load balancing. if the host has our 
			      // data cached, then it will probably get to
			      // handle the request. for now let's just assume
			      // this is a 96-bit key. TODO: fix...
			 0 , // *(key_t *)cacheKey        ,
			      rdbId           ,
			      minRecSizes     ) ) 
	{
		log(LOG_ERROR, "net: Failed to send request for data from %s in shard "
		    "#%" PRIu32" over network: %s.",
		    getDbnameFromId(m_rdbId),m_shardNum, mstrerror(g_errno));
		// but speed it up
		m_errno = g_errno;
		m->reset();
		if ( m_numRequests > 0 ) {
			logTrace( g_conf.m_logTraceMsg0, "END - returning false" );
			
			return false;
		}

		logTrace( g_conf.m_logTraceMsg0, "END - returning true" );
		return true;
	}

	m_numRequests++;

	// we blocked
	logTrace( g_conf.m_logTraceMsg0, "END - returning false, blocked" );
	return false;
}
Esempio n. 3
0
// . return false if blocked, true otherwise
// . set g_errno on error
// . read list of keys in [startKey,endKey] range
// . read at least "minRecSizes" bytes of keys in that range
// . the "m_endKey" of resulting, merged list may have a smaller endKey
//   than the argument, "endKey" due to limitation by "minRecSizes"
// . resulting list will contain ALL keys between ITS [m_startKey,m_endKey]
// . final merged list "should" try to have a size of at least "minRecSizes"
//   but due to negative/postive rec elimination may be less
// . the endKey of the lists we read may be <= "endKey" provided
// . we try to shrink the endKey if minRecSizes is >= 0 in order to
//   avoid excessive reading
// . by shrinking the endKey we cannot take into account the size of deleted
//   records, so therefore we may fall short of "minRecSizes" in actuality,
//   in fact, the returned list may even be empty with a shrunken endKey
// . we merge all lists read from disk into the provided "list"
// . caller should call Msg3.getList(int32_t i) and Msg3:getNumLists() to retrieve
// . this makes the query engine faster since we don't need to merge the docIds
//   and can just send them across the network separately and they will be
//   hashed into IndexTable's table w/o having to do time-wasting merging.
// . caller can specify array of filenums to read from so incremental syncing
//   in Sync class can just read from titledb*.dat files that were formed
//   since the last sync point.
bool Msg3::readList  ( char           rdbId         ,
		       collnum_t collnum ,
		       const char       *startKeyArg   ,
		       const char       *endKeyArg     ,
		       int32_t           minRecSizes   , // max size of scan
		       int32_t           startFileNum  , // first file to scan
		       int32_t           numFiles      , // rel. to startFileNum
		       void          *state         , // for callback
		       void        (* callback ) ( void *state ) ,
		       int32_t           niceness      ,
		       int32_t           retryNum      ,
		       int32_t           maxRetries    ,
		       bool           compensateForMerge ,
		       bool           justGetEndKey ,
		       bool           allowPageCache ,
		       bool           hitDisk        ) {

	// set this to true to validate
	m_validateCache = false;//true;

	// clear, this MUST be done so if we return true g_errno is correct
	g_errno = 0;
	// assume lists are not checked for corruption
	m_listsChecked = false;
	// warn
	if ( minRecSizes < -1 ) {
		log(LOG_LOGIC,"db: Msg3 got minRecSizes of %" PRId32", changing "
		    "to -1.",minRecSizes);
		minRecSizes = -1;
	}
	// reset m_alloc and data in all lists in case we are a re-call
	reset();
	// warning
	if ( collnum < 0 ) log(LOG_LOGIC,"net: NULL collection. msg3.");
	// remember the callback
	m_rdbId              = rdbId;
	m_collnum = collnum;
	m_callback           = callback;
	m_state              = state;
	m_niceness           = niceness;
	m_numScansCompleted  = 0;
	m_retryNum           = retryNum;
	m_maxRetries         = maxRetries;
	m_compensateForMerge = compensateForMerge;
	m_allowPageCache     = allowPageCache;
	m_hitDisk            = hitDisk;
	m_hadCorruption      = false;
	// get keySize of rdb
	m_ks = getKeySizeFromRdbId ( m_rdbId );
	// reset the group error
	m_errno    = 0;
	// . reset all our lists 
	// . these are reset in call the RdbScan::setRead() below
	//for ( int32_t i = 0 ; i < MAX_RDB_FILES ; i++ ) m_lists[i].reset();
	// . ensure startKey last bit clear, endKey last bit set
	// . no! this warning is now only in Msg5
	// . if RdbMerge is merging some files, not involving the root 
	//   file, then we can expect to get a lot of unmatched negative recs.
	// . as a consequence, our endKeys may often be negative. This means
	//   it may not annihilate with the positive key, but we should only
	//   miss like this at the boundaries of the lists we fetch.
	// . so in that case RdbList::merge will stop merging once the
	//   minRecSizes limit is reached even if it means ending on a negative
	//   rec key
	//if ( (startKey.n0 & 0x01) == 0x01 ) 
	if ( !KEYNEG(startKeyArg) )
		log(LOG_REMIND,"net: msg3: StartKey lastbit set."); 
	if (  KEYNEG(endKeyArg) )
		log(LOG_REMIND,"net: msg3: EndKey lastbit clear."); 

	// declare vars here becaues of 'goto skip' below
	int32_t mergeFileNum = -1 ;
	int32_t max ;

	// get base, returns NULL and sets g_errno to ENOCOLLREC on error
	RdbBase *base = getRdbBase( m_rdbId, m_collnum );
	if ( ! base ) {
		return true;
	}

	// store the file numbers in the array, these are the files we read
	m_numFileNums = 0;

	// save startFileNum here, just for recall
	m_startFileNum = startFileNum;
	m_numFiles     = numFiles;

	// . if we have a merge going on, we may have to change startFileNum
	// . if some files get unlinked because merge completes then our 
	//   reads will detect the error and loop back here
	// . we launch are reads right after this without giving up the cpu
	//   and we use file descriptors, so any changes to Rdb::m_files[]
	//   should not hurt us
	// . WARNING: just make sure you don't lose control of cpu until after
	//   you call RdbScan::set()
	// . we use hasMergeFile() instead of isMerging() because he may not 
	//   be merging cuz he got suspended or he restarted and
	//   hasn't called attemptMerge() yet, but he may still contain it
	if ( g_conf.m_logDebugQuery )
		log(LOG_DEBUG,
		    "net: msg3: "
		    "c=%" PRId32" hmf=%" PRId32" sfn=%" PRId32" msfn=%" PRId32" nf=%" PRId32" db=%s.",
		     (int32_t)compensateForMerge,(int32_t)base->hasMergeFile(),
		     (int32_t)startFileNum,(int32_t)base->m_mergeStartFileNum-1,
		     (int32_t)numFiles,base->m_dbname);
	int32_t pre = -10;
	if ( compensateForMerge && base->hasMergeFile() && 
	     startFileNum >= base->m_mergeStartFileNum - 1 &&
	     (startFileNum > 0 || numFiles != -1) ) {
		// now also include the file being merged into, but only
		// if we are reading from a file being merged...
		if ( startFileNum < base->m_mergeStartFileNum +
		     base->m_numFilesToMerge - 1 )
			//m_fileNums [ m_numFileNums++ ] =
			//	base->m_mergeStartFileNum - 1;
			pre = base->m_mergeStartFileNum - 1;
		// debug msg
		if ( g_conf.m_logDebugQuery )
			log(LOG_DEBUG,
			   "net: msg3: startFileNum from %" PRId32" to %" PRId32" (mfn=%" PRId32")",
			    startFileNum,startFileNum+1,mergeFileNum);
		// if merge file was inserted before us, inc our file number
		startFileNum++;
	}
	// adjust num files if we need to, as well
	if ( compensateForMerge && base->hasMergeFile() && 
	     startFileNum < base->m_mergeStartFileNum - 1 &&
	     numFiles != -1 &&
	     startFileNum + numFiles - 1 >= base->m_mergeStartFileNum - 1 ) {
		// debug msg
		if ( g_conf.m_logDebugQuery )
			log(LOG_DEBUG,"net: msg3: numFiles up one.");
		// if merge file was inserted before us, inc our file number
		numFiles++;
	}

	// . how many rdb files does this base have?
	// . IMPORTANT: this can change since files are unstable because they
	//   might have all got merged into one!
	// . so do this check to make sure we're safe... especially if
	//   there was an error before and we called readList() on ourselves
	max = base->getNumFiles();
	// -1 means we should scan ALL the files in the base
	if ( numFiles == -1 ) numFiles = max;
	// limit it by startFileNum, however
	if ( numFiles > max - startFileNum ) numFiles = max - startFileNum;
	// set g_errno and return true if it is < 0
	if ( numFiles < 0 ) { 
		log(LOG_LOGIC,
		   "net: msg3: readList: numFiles = %" PRId32" < 0 (max=%" PRId32")(sf=%" PRId32")",
		    numFiles , max , startFileNum );
		g_errno = EBADENGINEER; 
		// force core dump
		char *xx=NULL;*xx=0;
		return true; 
	}

	// . allocate buffer space
	// . m_scans, m_startpg, m_endpg, m_hintKeys, m_hintOffsets,
	//   m_fileNums, m_lists
	int32_t chunk = sizeof(RdbScan) + // m_scans
		4 +                    // m_startpg
		4 +                    // m_endpg
		//sizeof(key_t) +        // m_hintKeys
		m_ks +                 // m_hintKeys
		4 +                    // m_hintOffsets
		4 +                    // m_fileNums
		sizeof(RdbList) ;      // m_lists
	int32_t nn   = numFiles;
	if ( pre != -10 ) nn++;
	m_numChunks = nn;
	int32_t need = nn * (chunk);
	m_alloc = m_buf;
	if ( need > (int32_t)MSG3_BUF_SIZE ) {
		m_allocSize = need;
		m_alloc = (char *)mcalloc ( need , "Msg3" );
		if ( ! m_alloc ) {
			log("disk: Could not allocate %" PRId32" bytes read "
			    "structures to read %s.",need,base->m_dbname);
			return true;
		}
	}
	char *p = m_alloc;
	m_scans       = (RdbScan *)p; p += nn * sizeof(RdbScan);
	m_startpg     = (int32_t    *)p; p += nn * 4;
	m_endpg       = (int32_t    *)p; p += nn * 4;
	//m_hintKeys    = (key_t   *)p; p += nn * sizeof(key_t);
	m_hintKeys    = (char    *)p; p += nn * m_ks;
	m_hintOffsets = (int32_t    *)p; p += nn * 4;
	m_fileNums    = (int32_t    *)p; p += nn * 4;
	m_lists       = (RdbList *)p; p += nn * sizeof(RdbList);
	// sanity check
	if ( p - m_alloc != need ) {
		log(LOG_LOGIC,"disk: Bad malloc in Msg3.cpp.");
		char *xx = NULL; *xx = 0;
	}
	// call constructors
	for ( int32_t i = 0 ; i < nn ; i++ ) m_lists[i].constructor();
	// make fix from up top
	if ( pre != -10 ) m_fileNums [ m_numFileNums++ ] = pre;

	// store them all
	for ( int32_t i = startFileNum ; i < startFileNum + numFiles ; i++ )
		m_fileNums [ m_numFileNums++ ] = i;

	// . remove file nums that are being unlinked after a merge now
	// . keep it here (below skip: label) so sync point reads can use it
	int32_t n = 0;
	for ( int32_t i = 0 ; i < m_numFileNums ; i++ ) {
		// skip those that are being unlinked after the merge
		if ( base->m_isUnlinking && 
		     m_fileNums[i] >= base->m_mergeStartFileNum &&
		     m_fileNums[i] <  base->m_mergeStartFileNum + 
		                      base->m_numFilesToMerge      )
			continue;
		// otherwise, keep it
		m_fileNums[n++] = m_fileNums[i];
	}
	m_numFileNums = n;

	// . if root file is being merged, he's file #0, & root file is file #1
	// . this is a hack so caller gets what he wants
	//if ( startFileNum == 0 && base->getFileId(0) == 0 && numFiles == 1 )
	//	numFiles = 2;

	// remember the file range we should scan
	m_numScansStarted    = 0;
	m_numScansCompleted  = 0;
	//m_startKey           = startKey;
	//m_endKey             = endKey;
	//m_constrainKey       = endKey; // set in case justGetEndKey is true
	KEYSET(m_startKey,startKeyArg,m_ks);
	KEYSET(m_endKey,endKeyArg,m_ks);
	KEYSET(m_constrainKey,endKeyArg,m_ks);//set incase justGetEndKey istrue
	m_minRecSizes        = minRecSizes;
	m_compensateForMerge = compensateForMerge;
	// bail if 0 files to scan -- no! need to set startKey/endKey
	if ( numFiles == 0 ) return true;
	// don't read anything if endKey < startKey
	//if ( m_startKey > m_endKey ) return true;
	if ( KEYCMP(m_startKey,m_endKey,m_ks)>0 ) return true;
	// keep the original in tact in case g_errno == ETRYAGAIN
	//m_endKeyOrig        = endKey;
	KEYSET(m_endKeyOrig,endKeyArg,m_ks);
	m_minRecSizesOrig   = minRecSizes;
	// start reading at this key
	m_fileStartKey = startKeyArg;
	// start the timer, keep it fast for clusterdb though
	if ( g_conf.m_logTimingDb ) m_startTime = gettimeofdayInMilliseconds();
	// translate base to an id, for the sake of m_msg0
	//char baseId = m_msg0->getRdbId ( base );
	// map ptrs
	RdbMap **maps = base->getMaps();
	// . we now boost m_minRecSizes to account for negative recs 
	// . but not if only reading one list, cuz it won't get merged and
	//   it will be too big to send back
	if ( m_numFileNums > 1 ) compensateForNegativeRecs ( base );
	// . often endKey is too big for an efficient read of minRecSizes bytes
	//   because we end up reading too much from all the files
	// . this will set m_startpg[i], m_endpg[i] for each RdbScan/RdbFile
	//   to ensure we read "minRecSizes" worth of records, not much more
	// . returns the new endKey for all ranges
	// . now this just overwrites m_endKey
	//m_endKey = setPageRanges ( base           ,
	setPageRanges ( base           ,
			m_fileNums     ,
			m_numFileNums  ,
			m_fileStartKey , // start reading @ key
			m_endKey       , // stop reading @ key
			m_minRecSizes  );

	// . NEVER let m_endKey be a negative key, because it will 
	//   always be unmatched, since delbit is cleared
	// . adjusting it here ensures our generated hints are valid
	// . we will use this key to call constrain() with
	//m_constrainKey = m_endKey;
	//if ( ( m_constrainKey.n0 & 0x01) == 0x00 ) 
	//	m_constrainKey -= (uint32_t)1;
	KEYSET(m_constrainKey,m_endKey,m_ks);
	if ( KEYNEG(m_constrainKey) )
		KEYSUB(m_constrainKey,m_ks);

	// Msg5 likes to get the endkey for getting the list from the tree
	if ( justGetEndKey ) return true;

	// sanity check
	if ( m_numFileNums > nn ) {
		log(LOG_LOGIC,"disk: Failed sanity check in Msg3.");
		char *xx = NULL; *xx = 0;
	}

	// debug msg
	//log("msg3 getting list (msg5=%" PRIu32")",m_state);
	// . MDW removed this -- go ahead an end on a delete key
	// . RdbMerge might not pick it up this round, but oh well
	// . so we can have both positive and negative co-existing in same file
	// make sure the last bit is set so we don't end on a delete key
	//m_endKey.n0 |= 0x01LL;
	// . now start reading/scanning the files
	// . our m_scans array starts at 0
	for ( int32_t i = 0 ; i < m_numFileNums ; i++ ) {
		// get the page range
		//int32_t p1 = m_startpg [ i ];
		//int32_t p2 = m_endpg   [ i ];
		//#ifdef GBSANITYCHECK
		int32_t fn = m_fileNums[i];
		// this can happen somehow!
		if ( fn < 0 ) {
			log(LOG_LOGIC,"net: msg3: fn=%" PRId32". Bad engineer.",fn);
			continue;
		}
		// sanity check
		if ( i > 0 && m_fileNums[i-1] >= fn ) {
			log(LOG_LOGIC,
			    "net: msg3: files must be read in order "
			    "from oldest to newest so RdbList::indexMerge_r "
			    "works properly. Otherwise, corruption will "
			    "result. ");
			char *xx = NULL; *xx = 0;
			return true;
		}
		// . sanity check?
		// . no, we must get again since we turn on endKey's last bit
		int32_t p1 , p2;
		maps[fn]->getPageRange ( m_fileStartKey , 
					m_endKey       , 
					&p1            , 
					&p2            ,
					NULL           );
		//if ( p1 != p1c || p2 != p2c ) {
		//	fprintf(stderr,"Msg3::bad page range\n");
		//	sleep(50000);
		//}
		// sanity check, each endpg's key should be > endKey
		//if ( p2 < maps[fn]->getNumPages() && 
		//     maps[fn]->getKey ( p2 ) <= m_endKey ) {
		//	fprintf(stderr,"Msg3::bad page range 2\n");
		//	sleep(50000);
		//}
		//#endif
		//int32_t p1 , p2; 
		//maps[fn]->getPageRange (startKey,endKey,minRecSizes,&p1,&p2);
		// now get some read info
		int64_t offset      = maps[fn]->getAbsoluteOffset ( p1 );
		int32_t      bytesToRead = maps[fn]->getRecSizes ( p1, p2, false);
		// max out the endkey for this list
		// debug msg
		//#ifdef _DEBUG_		
		//if ( minRecSizes == 2000000 ) 
		//log("Msg3:: reading %" PRId32" bytes from file #%" PRId32,bytesToRead,i);
		//#endif
		// inc our m_numScans
		m_numScansStarted++;
		// . keep stats on our disk accesses
		// . count disk seeks (assuming no fragmentation)
		// . count disk bytes read
		if ( bytesToRead > 0 ) {
			base->m_rdb->didSeek (             );
			base->m_rdb->didRead ( bytesToRead );
		}
		// . the startKey may be different for each RdbScan class
		// . RdbLists must have all keys within their [startKey,endKey]
		// . therefore set startKey individually from first page in map
		// . this endKey must be >= m_endKey 
		// . this startKey must be < m_startKey
		//key_t startKey = maps[fn]->getKey ( p1 );
		//key_t endKey   = maps[fn]->getKey ( p2 );
		char startKey2 [ MAX_KEY_BYTES ];
		char endKey2   [ MAX_KEY_BYTES ];
		maps[fn]->getKey ( p1 , startKey2 );
		maps[fn]->getKey ( p2 , endKey2 );
		//char *startKey = maps[fn]->getKeyPtr ( p1 );
		//char *endKey   = maps[fn]->getKeyPtr ( p2 );
		// store in here
		m_startpg [ i ] = p1;
		m_endpg   [ i ] = p2;

		// . we read UP TO that endKey, so reduce by 1
		// . but iff p2 is NOT the last page in the map/file
		// . maps[fn]->getKey(lastPage) will return the LAST KEY
		//   and maps[fn]->getOffset(lastPage) the length of the file
		//if ( maps[fn]->getNumPages()!=p2) endKey -=(uint32_t)1;
		if ( maps[fn]->getNumPages() != p2 ) KEYSUB(endKey2,m_ks);
		// otherwise, if we're reading all pages, then force the
		// endKey to virtual inifinite
		//else endKey.setMax();
		else KEYMAX(endKey2,m_ks);

		// . set up the hints
		// . these are only used if we are only reading from 1 file
		// . these are used to call constrain() so we can constrain
		//   the end of the list w/o looping through all the recs
		//   in the list
		int32_t h2 = p2 ;
		// decrease by one page if we're on the last page
		if ( h2 > p1 && maps[fn]->getNumPages() == h2 ) h2--;
		// . decrease hint page until key is <= endKey on that page
		//   AND offset is NOT -1 because the old way would give
		//   us hints passed the endkey
		// . also decrease so we can constrain on minRecSizes in
		//   case we're the only list being read
		// . use >= m_minRecSizes instead of >, otherwise we may
		//   never be able to set "size" in RdbList::constrain()
		//   because "p" could equal "maxPtr" right away
		while ( h2 > p1 && 
			//( maps[fn]->getKey   (h2) > m_constrainKey ||
		      (KEYCMP(maps[fn]->getKeyPtr(h2),m_constrainKey,m_ks)>0||
			  maps[fn]->getOffset(h2) == -1            ||
			  maps[fn]->getAbsoluteOffset(h2) - offset >=
			  m_minRecSizes ) )
			h2--;
		// now set the hint
		m_hintOffsets [ i ] = maps[fn]->getAbsoluteOffset ( h2 ) -
			              maps[fn]->getAbsoluteOffset ( p1 ) ;
		//m_hintKeys    [ i ] = maps[fn]->getKey            ( h2 );
		KEYSET(&m_hintKeys[i*m_ks],maps[fn]->getKeyPtr(h2),m_ks);

		// reset g_errno before calling setRead()
		g_errno = 0;
		// . this fix is now in RdbList::checklist_r()
		// . we can now have dup keys, so, we may read in
		//   a rec with key "lastMinKey" even though we don't read
		//   in the first key on the end page, so don't subtract 1...
		//if ( endKey != m_endKeyOrig ) 
		//	endKey += (uint32_t) 1;

		// timing debug
		if ( g_conf.m_logTimingDb )
			log(LOG_TIMING,
			    "net: msg: reading %" PRId32" bytes from %s file #%" PRId32" "
			     "(niceness=%" PRId32")",
			     bytesToRead,base->m_dbname,i,m_niceness);

		// log huge reads, those hurt us
		if ( bytesToRead > 150000000 ) {
			logf(LOG_INFO,"disk: Reading %" PRId32" bytes at offset %" PRId64" "
			    "from %s.",
			    bytesToRead,offset,base->m_dbname);
		}

		// if any keys in the map are the same report corruption
		char tmpKey    [16];
		char lastTmpKey[16];
		int32_t ccount = 0;
		if ( bytesToRead     > 10000000      && 
		     bytesToRead / 2 > m_minRecSizes &&
		     base->m_fixedDataSize >= 0        ) {
			for ( int32_t pn = p1 ; pn <= p2 ; pn++ ) {
				maps[fn]->getKey ( pn , tmpKey );
				if ( KEYCMP(tmpKey,lastTmpKey,m_ks) == 0 ) 
					ccount++;
				gbmemcpy(lastTmpKey,tmpKey,m_ks);
			}
		}
		if ( ccount > 10 ) {
			logf(LOG_INFO,"disk: Reading %" PRId32" bytes from %s file #"
			     "%" PRId32" when min "
			     "required is %" PRId32". Map is corrupt and has %" PRId32" "
			     "identical consecutive page keys because the "
			     "map was \"repaired\" because out of order keys "
			     "in the index.",
			     (int32_t)bytesToRead,
			     base->m_dbname,fn,
			     (int32_t)m_minRecSizes,
			     (int32_t)ccount);
			m_numScansCompleted++;
			m_errno = ECORRUPTDATA;
			m_hadCorruption = true;
			//m_maxRetries = 0;
			break;
		}

		////////
		//
		// try to get from PAGE CACHE
		//
		////////
		BigFile *ff = base->getFile(m_fileNums[i]);
		RdbCache *rpc = getDiskPageCache ( m_rdbId );
		if ( ! m_allowPageCache ) rpc = NULL;
		// . vfd is unique 64 bit file id
		// . if file is opened vfd is -1, only set in call to open()
		int64_t vfd = ff->getVfd();
		key192_t ck = makeCacheKey ( vfd , offset, bytesToRead);
		char *rec; int32_t recSize;
		bool inCache = false;
		if ( rpc && vfd != -1 && ! m_validateCache ) 
			inCache = rpc->getRecord ( (collnum_t)0 , // collnum
						   (char *)&ck , 
						   &rec , 
						   &recSize ,
						   true , // copy?
						   -1 , // maxAge, none 
						   true ); // inccounts?
		m_scans[i].m_inPageCache = false;
		if ( inCache ) {
			m_scans[i].m_inPageCache = true;
			m_numScansCompleted++;
			// now we have to store this value, 6 or 12 so
			// we can modify the hint appropriately
			m_scans[i].m_shifted = *rec;
			m_lists[i].set ( rec +1,
					 recSize-1 ,
					 rec , // alloc
					 recSize , // allocSize
					 startKey2 ,
					 endKey2 ,
					 base->m_fixedDataSize ,
					 true , // owndata
					 base->useHalfKeys() ,
					 getKeySizeFromRdbId ( m_rdbId ) );
			continue;
		}

		// . do the scan/read of file #i
		// . this returns false if blocked, true otherwise
		// . this will set g_errno on error
		bool done = m_scans[i].setRead (base->getFile(m_fileNums[i]),
						base->m_fixedDataSize ,
						 offset                 ,
						 bytesToRead            ,
						 startKey2              ,
						 endKey2                ,
						m_ks                    ,
						 &m_lists[i]            ,
						 this                   ,
						 doneScanningWrapper    ,
						 base->useHalfKeys()    ,
						m_rdbId,
						 m_niceness             ,
						 m_allowPageCache       ,
						 m_hitDisk              ) ;
		// . damn, usually the above will indirectly launch a thread
		//   to do the reading, but it sets g_errno to EINTR,
		//   "interrupted system call"!
		// . i guess the thread does the read w/o blocking and then
		//   queues the signal on g_loop's queue before it exits
		// . try ignoring, and keep going
		if ( g_errno == EINTR ) {
			log("net: Interrupted system call while reading file. "
			    "Ignoring.");
			g_errno = 0;
		}
		// debug msg
		//fprintf(stderr,"Msg3:: reading %" PRId32" bytes from file #%" PRId32","
		//	"done=%" PRId32",offset=%" PRId64",g_errno=%s,"
		//	"startKey=n1=%" PRIu32",n0=%" PRIu64",  "
		//	"endKey=n1=%" PRIu32",n0=%" PRIu64"\n",
		//	bytesToRead,i,(int32_t)done,offset,mstrerror(g_errno),
		//	m_startKey,m_endKey);
		//if ( bytesToRead == 0 )
		//	fprintf(stderr,"shit\n");
		// if it did not block then it completed, so count it
		if ( done ) m_numScansCompleted++;
		// break on an error, and remember g_errno in case we block
		if ( g_errno && g_errno != ENOTHREADSLOTS ) { 
			int32_t tt = LOG_WARN;
			if ( g_errno == EFILECLOSED ) tt = LOG_INFO;
			log(tt,"disk: Reading %s had error: %s.",
			    base->m_dbname, mstrerror(g_errno));
			m_errno = g_errno; 
			break; 
		}
	}
	// debug test
	//if ( rand() % 100 <= 10 ) m_errno = EIO;

	// if we blocked, return false
	if ( m_numScansCompleted < m_numScansStarted ) return false;
	// . if all scans completed without blocking then wrap it up & ret true
	// . doneScanning may now block if it finds data corruption and must
	//   get the list remotely
	return doneScanning();
}
Esempio n. 4
0
// . returns a new, smaller endKey
// . shrinks endKey while still preserving the minRecSizes requirement
// . this is the most confusing subroutine in the project
// . this now OVERWRITES endKey with the new one
//key_t Msg3::setPageRanges ( RdbBase *base ,
void  Msg3::setPageRanges ( RdbBase *base ,
			    int32_t  *fileNums      ,
			    int32_t   numFileNums   ,
			    const char  *startKey   ,
			    char  *endKey        ,
			    int32_t   minRecSizes   ) {
	// sanity check
	//if ( m_ks != 12 && m_ks != 16 ) { char *xx=NULL;*xx=0; }
	// get the file maps from the rdb
	RdbMap **maps = base->getMaps();
	// . initialize the startpg/endpg for each file
	// . we read from the first offset on m_startpg to offset on m_endpg
	// . since we set them equal that means an empty range for each file
	for ( int32_t i = 0 ; i < numFileNums ; i++ ) {
		int32_t fn = fileNums[i];
		if ( fn < 0 ) { char *xx = NULL; *xx = 0; }
		m_startpg[i] = maps[fn]->getPage( startKey );
		m_endpg  [i] = m_startpg[i];
	}
	// just return if minRecSizes 0 (no reading needed)
	//if ( minRecSizes <= 0 ) return endKey ;
	if ( minRecSizes <= 0 ) return;
	// calculate minKey minus one
	//key_t lastMinKey ;
	char lastMinKey[MAX_KEY_BYTES];
	char lastMinKeyIsValid = 0;
	// loop until we find the page ranges that barely satisfy "minRecSizes"
  loop:
	// find the map whose next page has the lowest key
	int32_t  minpg   = -1;
	//key_t minKey; 
	char minKey[MAX_KEY_BYTES];
	for ( int32_t i = 0 ; i < numFileNums ; i++ ) {
		int32_t fn = fileNums[i];
		// this guy is out of race if his end key > "endKey" already
		//if ( maps[fn]->getKey ( m_endpg[i] ) > endKey ) continue;
		if(KEYCMP(maps[fn]->getKeyPtr(m_endpg[i]),endKey,m_ks)>0)
			continue;
		// get the next page after m_endpg[i]
		int32_t nextpg = m_endpg[i] + 1;
		// if endpg[i]+1 == m_numPages then we maxed out this range
		if ( nextpg > maps[fn]->getNumPages() ) continue;
		// . but this may have an offset of -1
		// . which means the page has no key starting on it and
		//   it's occupied by a rec which starts on a previous page
		while ( nextpg < maps[fn]->getNumPages() &&
			maps[fn]->getOffset ( nextpg ) == -1 ) nextpg++;
		// . continue if his next page doesn't have the minimum key
		// . if nextpg == getNumPages() then it returns the LAST KEY
		//   contained in the corresponding RdbFile
		//if ( minpg != -1 && maps[fn]->getKey ( nextpg ) > minKey ) 
		if (minpg != -1 && 
		    KEYCMP(maps[fn]->getKeyPtr(nextpg),minKey,m_ks)>0)continue;
		// . we got a winner, his next page has the current min key
		// . if m_endpg[i]+1 == getNumPages() then getKey() returns the
		//   last key in the mapped file
		// . minKey should never equal the key on m_endpg[i] UNLESS 
		//   it's on page #m_numPages
		//minKey = maps[fn]->getKey ( nextpg );
		KEYSET(minKey,maps[fn]->getKeyPtr(nextpg),m_ks);
		minpg  = i;
		// if minKey is same as the current key on this endpg, inc it
		// so we cause some advancement, otherwise, we'll loop forever
		//if ( minKey != maps[fn]->getKey ( m_endpg[i] ) ) continue;
		if ( KEYCMP(minKey,maps[fn]->getKeyPtr(m_endpg[i]),m_ks)!=0) 
			continue;
		//minKey += (uint32_t) 1;
		KEYADD(minKey,m_ks);
	}
	// . we're done if we hit the end of all maps in the race
	// . return the max end key
	// key_t maxEndKey; maxEndKey.setMax(); return maxEndKey; }
	// . no, just the endKey
	//if ( minpg  == -1 ) return endKey;
	if ( minpg  == -1 ) return;
	// sanity check
	if ( lastMinKeyIsValid && KEYCMP(minKey,lastMinKey,m_ks)<=0 ) {
		g_errno = ECORRUPTDATA;
		log("db: Got corrupted map in memory for %s. This is almost "
		    "always because of bad memory. Please replace your RAM.",
		    base->m_dbname);
		// do not wait for any merge to complete... otherwise
		// Rdb.cpp will not close until the merge is done
		g_merge.m_isMerging  = false;
		g_merge2.m_isMerging = false;
		// to complete
		// shutdown with urgent=true so threads are disabled.
		g_process.shutdown(true);
		//g_numCorrupt++;
		// sleep for now until we make sure this works
		//sleep(2000);
		return;
	}
	// don't let minKey exceed endKey, however
	//if ( minKey > endKey ) {
	if ( KEYCMP(minKey,endKey,m_ks)>0 ) {
		//minKey      = endKey ;
		//minKey     += (uint32_t) 1;
		//lastMinKey  = endKey;
		KEYSET(minKey,endKey,m_ks);
		KEYADD(minKey,m_ks);
		KEYSET(lastMinKey,endKey,m_ks);
	}
	else {
		//lastMinKey = minKey ;
		//lastMinKey -= (uint32_t) 1;
		KEYSET(lastMinKey,minKey,m_ks);
		KEYSUB(lastMinKey,m_ks);
	}
	// it is now valid
	lastMinKeyIsValid = 1;
	// . advance m_endpg[i] so that next page < minKey 
	// . we want to read UP TO the first key on m_endpg[i]
	for ( int32_t i = 0 ; i < m_numFileNums ; i++ ) {
		int32_t fn = fileNums[i];
		m_endpg[i] = maps[fn]->getEndPage ( m_endpg[i], lastMinKey );
	}
	// . if the minKey is BIGGER than the provided endKey we're done
	// . we don't necessarily include records whose key is "minKey"
	//if ( minKey > endKey ) return endKey;
	if ( KEYCMP(minKey,endKey,m_ks)>0) return;
	// . calculate recSizes per page within [startKey,minKey-1]
	// . compute bytes of records in [startKey,minKey-1] for each map
	// . this includes negative records so we may have annihilations
	//   when merging into "diskList" and get less than what we wanted
	//   but endKey should be shortened, so our caller will know to call
	//   again if he wants more
	int32_t recSizes = 0;
	for ( int32_t i = 0 ; i < m_numFileNums ; i++ ) {
		int32_t fn = fileNums[i];
		recSizes += maps[fn]->getMinRecSizes ( m_startpg[i] , 
						       m_endpg  [i] ,
						       startKey     , 
						       lastMinKey   ,
						       false        );
	}
	// if we hit it then return minKey -1 so we only read UP TO "minKey"
	// not including "minKey"
	//if ( recSizes >= minRecSizes ) 
	if ( recSizes >= minRecSizes ) {
		// . sanity check
		// . this sanity check fails sometimes, but leave it
		//   out for now... causes the Illegal endkey msgs in
		//   RdbList::indexMerge_r()
		//if ( KEYNEG(lastMinKey) ) { char *xx=NULL;*xx=0; }
		KEYSET(endKey,lastMinKey,m_ks);
		//return lastMinKey;
		return;
	}
	// keep on truckin'
	goto loop;
}
// . THIS Msg0 class must be alloc'd, i.e. not on the stack, etc.
// . if list is stored locally this tries to get it locally
// . otherwise tries to get the list from the network
// . returns false if blocked, true otherwise
// . sets g_errno on error
// . NOTE: i was having problems with queries being cached too long, you
//   see the cache here is a NETWORK cache, so when the machines that owns
//   the list updates it on disk it can't flush our cache... so use a small
//   maxCacheAge of like , 30 seconds or so...
bool Msg0::getList ( int64_t hostId      , // host to ask (-1 if none)
		     int32_t      ip          , // info on hostId
		     int16_t     port        ,
		     int32_t      maxCacheAge , // max cached age in seconds
		     bool      addToCache  , // add net recv'd list to cache?
		     char      rdbId       , // specifies the rdb
		     //char     *coll        ,
		     collnum_t collnum ,
		     RdbList  *list        ,
		     //key_t     startKey    , 
		     //key_t     endKey      , 
		     char     *startKey    ,
		     char     *endKey      ,
		     int32_t      minRecSizes ,  // use -1 for no max
		     void     *state       ,
		     void    (* callback)(void *state ),//, RdbList *list ) ,
		     int32_t      niceness    ,
		     bool      doErrorCorrection ,
		     bool      includeTree ,
		     bool      doMerge     ,
		     int32_t      firstHostId   ,
		     int32_t      startFileNum  ,
		     int32_t      numFiles      ,
		     int32_t      timeout       ,
		     int64_t syncPoint     ,
		     int32_t      preferLocalReads ,
		     Msg5     *msg5             ,
		     Msg5     *msg5b            ,
		     bool      isRealMerge      ,
//#ifdef SPLIT_INDEXDB
		     bool      allowPageCache    ,
		     bool      forceLocalIndexdb ,
		     bool      noSplit , // doIndexdbSplit    ,
		     int32_t      forceParitySplit  ) {
//#else
//		     bool      allowPageCache ) {
//#endif
	// this is obsolete! mostly, but we need it for PageIndexdb.cpp to 
	// show a "termlist" for a given query term in its entirety so you 
	// don't have to check each machine in the network. if this is true it
	// means to query each split and merge the results together into a
	// single unified termlist. only applies to indexdb/datedb.
	//if ( doIndexdbSplit ) { char *xx = NULL; *xx = 0; }
	// note this because if caller is wrong it hurts performance major!!
	//if ( doIndexdbSplit ) 
	//	logf(LOG_DEBUG,"net: doing msg0 with indexdb split true");
	// warning
	if ( collnum < 0 ) log(LOG_LOGIC,"net: NULL collection. msg0.");

	//if ( doIndexdbSplit ) { char *xx=NULL;*xx=0; }

	// reset the list they passed us
	list->reset();
	// get keySize of rdb
	m_ks = getKeySizeFromRdbId ( rdbId );
	// if startKey > endKey, don't read anything
	//if ( startKey > endKey ) return true;
	if ( KEYCMP(startKey,endKey,m_ks)>0 ) { char *xx=NULL;*xx=0; }//rettrue
	// . reset hostid if it is dead
	// . this is causing UOR queries to take forever when we have a dead
	if ( hostId >= 0 && g_hostdb.isDead ( hostId ) ) hostId = -1;
	// no longer accept negative minrecsize
	if ( minRecSizes < 0 ) {
		g_errno = EBADENGINEER;
		log(LOG_LOGIC,
		    "net: msg0: Negative minRecSizes no longer supported.");
		char *xx=NULL;*xx=0;
		return true;
	}

	// debug msg
	//if ( niceness != 0 ) log("HEY start");
	// ensure startKey last bit clear, endKey last bit set
	//if ( (startKey.n0 & 0x01) == 0x01 ) 
	//	log("Msg0::getList: warning startKey lastbit set"); 
	//if ( (endKey.n0   & 0x01) == 0x00 ) 
	//	log("Msg0::getList: warning endKey lastbit clear"); 
	// remember these
	m_state         = state;
	m_callback      = callback;
	m_list          = list;
	m_hostId        = hostId;
	m_niceness      = niceness;
	//m_ip            = ip;
	//m_port          = port;
	m_addToCache    = addToCache;
	// . these define our request 100%
	//m_startKey      = startKey;
	//m_endKey        = endKey;
	KEYSET(m_startKey,startKey,m_ks);
	KEYSET(m_endKey,endKey,m_ks);
	m_minRecSizes   = minRecSizes;
	m_rdbId         = rdbId;
	m_collnum = collnum;//          = coll;
	m_isRealMerge   = isRealMerge;
	m_allowPageCache = allowPageCache;

	// . group to ask is based on the first key 
	// . we only do 1 group per call right now
	// . groupMask must turn on higher bits first (count downwards kinda)
	// . titledb and spiderdb use special masks to get groupId

	// if diffbot.cpp is reading spiderdb from each shard we have to
	// get groupid from hostid here lest we core in getGroupId() below.
	// it does that for dumping spiderdb to the client browser. they
	// can download the whole enchilada.
	if ( hostId >= 0 && m_rdbId == RDB_SPIDERDB )
		m_shardNum = 0;
	// did they force it? core until i figure out what this is
	else if ( forceParitySplit >= 0 ) 
		//m_groupId =  g_hostdb.getGroupId ( forceParitySplit );
		m_shardNum = forceParitySplit;
	else
		//m_groupId = getGroupId ( m_rdbId , startKey , ! noSplit );
		m_shardNum = getShardNum ( m_rdbId , startKey );

	// if we are looking up a termlist in posdb that is split by termid and
	// not the usual docid then we have to set this posdb key bit that tells
	// us that ...
	if ( noSplit && m_rdbId == RDB_POSDB )
		m_shardNum = g_hostdb.getShardNumByTermId ( startKey );

	// how is this used?
	//if ( forceLocalIndexdb ) m_groupId = g_hostdb.m_groupId;
	if ( forceLocalIndexdb ) m_shardNum = getMyShardNum();

	// . store these parameters
	// . get a handle to the rdb in case we can satisfy locally
	// . returns NULL and sets g_errno on error
	QUICKPOLL((m_niceness));
	Rdb *rdb = getRdbFromId ( m_rdbId );
	if ( ! rdb ) return true;
	// we need the fixedDataSize
	m_fixedDataSize = rdb->getFixedDataSize();
	m_useHalfKeys   = rdb->useHalfKeys();
	// . debug msg
	// . Msg2 does this when checking for a cached compound list.
	//   compound lists do not actually exist, they are merges of smaller
	//   UOR'd lists.
	if ( maxCacheAge != 0 && ! addToCache && (numFiles > 0 || includeTree))
		log(LOG_LOGIC,"net: msg0: "
		    "Weird. check but don't add... rdbid=%"INT32".",(int32_t)m_rdbId);
	// set this here since we may not call msg5 if list not local
	//m_list->setFixedDataSize ( m_fixedDataSize );

	// . now that we do load balancing we don't want to do a disk lookup
	//   even if local if we are merging or dumping
	// . UNLESS g_conf.m_preferLocalReads is true
	if ( preferLocalReads == -1 ) 
		preferLocalReads = g_conf.m_preferLocalReads;

	// . always prefer local for full split clusterdb
	// . and keep the tfndb/titledb lookups in the same stripe
	// . so basically we can't do biased caches if fully split
	//if ( g_conf.m_fullSplit ) preferLocalReads = true;
	preferLocalReads = true;

	// it it stored locally?
	bool isLocal = ( m_hostId == -1 && //g_hostdb.m_groupId == m_groupId );
			 m_shardNum == getMyShardNum() );
	// only do local lookups if this is true
	if ( ! preferLocalReads ) isLocal = false;

	/*
	m_numSplit = 1;
	if ( g_hostdb.m_indexSplits > 1 &&
	     ( rdbId == RDB_POSDB || rdbId==RDB_DATEDB)&&
	     ! forceLocalIndexdb && doIndexdbSplit ) {
		isLocal  = false;
		//m_numSplit = INDEXDB_SPLIT;
		m_numSplit = g_hostdb.m_indexSplits;
		char *xx=NULL;*xx=0;
	}
	*/
	/*
	int64_t singleDocIdQuery = 0LL;
	if ( rdbId == RDB_POSDB ) {
		int64_t d1 = g_posdb.getDocId(m_startKey);
		int64_t d2 = g_posdb.getDocId(m_endKey);
		if ( d1+1 == d2 ) singleDocIdQuery = d1;
	}

	// . try the LOCAL termlist cache
	// . so when msg2 is evaluating a gbdocid:| query and it has to
	//   use msg0 to go across the network to get the same damn termlist
	//   over and over again for the same docid, this will help alot.
	// . ideally it'd be nice if the seo pipe in xmldoc.cpp can try to
	//   send the same gbdocid:xxxx docids to the same hosts. maybe hash
	//   based on docid into the list of hosts and if that host is busy
	//   just chain until we find someone not busy.
	if ( singleDocIdQuery &&
	     getListFromTermListCache ( coll,
					m_startKey,
					m_endKey,
					maxCacheAge,
					list ) )
		// found!
		return true;
	*/

	// but always local if only one host
	if ( g_hostdb.getNumHosts() == 1 ) isLocal = true;

	// force a msg0 if doing a docid restrictive query like
	// gbdocid:xxxx|<query> so we call cacheTermLists() 
	//if ( singleDocIdQuery ) isLocal = false;

	// . if the group is local then do it locally
	// . Msg5::getList() returns false if blocked, true otherwise
	// . Msg5::getList() sets g_errno on error
	// . don't do this if m_hostId was specified
	if ( isLocal ) { // && !g_conf.m_interfaceMachine ) {
		if ( msg5 ) {
			m_msg5 = msg5;
			m_deleteMsg5 = false;
		}
		else {
			try { m_msg5 = new ( Msg5 ); } 
			catch ( ... ) {
				g_errno = ENOMEM;
				log("net: Local alloc for disk read failed "
				    "while tring to read data for %s. "
				    "Trying remote request.",
				    getDbnameFromId(m_rdbId));
				goto skip;
			}
			mnew ( m_msg5 , sizeof(Msg5) , "Msg0" );
			m_deleteMsg5 = true;
		}

		QUICKPOLL(m_niceness);
		// same for msg5b
		if ( msg5b ) {
			m_msg5b = msg5b;
			m_deleteMsg5b = false;
		}
		/*
		else if ( m_rdbId == RDB_TITLEDB ) {
			try { m_msg5b = new ( Msg5 ); } 
			catch ( ... ) {
				g_errno = ENOMEM;
				log("net: Local alloc for disk read failed "
				    "while tring to read data for %s. "
				    "Trying remote request. 2.",
				    getDbnameFromId(m_rdbId));
				goto skip;
			}
			mnew ( m_msg5b , sizeof(Msg5) , "Msg0b" );
			m_deleteMsg5b = true;
		}
		*/
		QUICKPOLL(m_niceness);
		if ( ! m_msg5->getList ( rdbId,
					 m_collnum ,
					 m_list ,
					 m_startKey ,
					 m_endKey   ,
					 m_minRecSizes ,
					 includeTree   , // include Tree?
					 addToCache    , // addToCache?
					 maxCacheAge   ,
					 startFileNum  , 
					 numFiles      ,
					 this ,
					 gotListWrapper2   ,
					 niceness          ,
					 doErrorCorrection ,
					 NULL , // cacheKeyPtr
					 0    , // retryNum
					 -1   , // maxRetries
					 true , // compensateForMerge
					 syncPoint ,
					 NULL,//m_msg5b   ,
					 m_isRealMerge ,
					 m_allowPageCache ) ) return false;
		// nuke it
		reset();
		return true;
	}
skip:
	// debug msg
	if ( g_conf.m_logDebugQuery )
		log(LOG_DEBUG,"net: msg0: Sending request for data to "
		    "shard=%"UINT32" "
		    "listPtr=%"PTRFMT" minRecSizes=%"INT32" termId=%"UINT64" "
		    //"startKey.n1=%"XINT32",n0=%"XINT64" (niceness=%"INT32")",
		    "startKey.n1=%"XINT64",n0=%"XINT64" (niceness=%"INT32")",
		    //g_hostdb.makeHostId ( m_groupId ) ,
		    m_shardNum,
		    (PTRTYPE)m_list,
		    m_minRecSizes, g_posdb.getTermId(m_startKey) , 
		    //m_startKey.n1,m_startKey.n0 , (int32_t)m_niceness);
		    KEY1(m_startKey,m_ks),KEY0(m_startKey),
		    (int32_t)m_niceness);

	char *replyBuf = NULL;
	int32_t  replyBufMaxSize = 0;
	bool  freeReply = true;

	// adjust niceness for net transmission
	bool realtime = false;
	//if ( minRecSizes + 32 < TMPBUFSIZE ) realtime = true;

	// if we're niceness 0 we need to pre-allocate for reply since it
	// might be received within the asynchronous signal handler which
	// cannot call mmalloc()
	if ( realtime ) { // niceness <= 0 || netnice == 0 ) {
		// . we should not get back more than minRecSizes bytes since 
		//   we are now performing merges
		// . it should not slow things down too much since the hashing
		//   is 10 times slower than merging anyhow...
		// . CAUTION: if rdb is not fixed-datasize then this will
		//            not work for us! it can exceed m_minRecSizes.
		replyBufMaxSize = m_minRecSizes ;
		// . get a little extra to fix the error where we ask for 64 
		//   but get 72
		// . where is that coming from?
		// . when getting titleRecs we often exceed the minRecSizes 
		// . ?Msg8? was having trouble. was int16_t 32 bytes sometimes.
		replyBufMaxSize += 36;
		// why add ten percent?
		//replyBufMaxSize *= 110 ;
		//replyBufMaxSize /= 100 ;
		// make a buffer to hold the reply
//#ifdef SPLIT_INDEXDB
/*
		if ( m_numSplit > 1 ) {
			m_replyBufSize = replyBufMaxSize * m_numSplit;
			replyBuf = (char *) mmalloc(m_replyBufSize, "Msg0");
			m_replyBuf  = replyBuf;
			freeReply = false;
		}
		else
*/
//#endif
			replyBuf = (char *) mmalloc(replyBufMaxSize , "Msg0");
		// g_errno is set and we return true if it failed
		if ( ! replyBuf ) {
			log("net: Failed to pre-allocate %"INT32" bytes to hold "
			    "data read remotely from %s: %s.",
			    replyBufMaxSize,getDbnameFromId(m_rdbId),
			    mstrerror(g_errno));
			return true;
		}
	}

	// . make a request with the info above (note: not in network order)
	// . IMPORTANT!!!!! if you change this change 
	//   Multicast.cpp::sleepWrapper1 too!!!!!!!!!!!!
	//   no, not anymore, we commented out that request peeking code
	char *p = m_request;
	*(int64_t *) p = syncPoint        ; p += 8;
	//*(key_t     *) p = m_startKey       ; p += sizeof(key_t);
	//*(key_t     *) p = m_endKey         ; p += sizeof(key_t);
	*(int32_t      *) p = m_minRecSizes    ; p += 4;
	*(int32_t      *) p = startFileNum     ; p += 4;
	*(int32_t      *) p = numFiles         ; p += 4;
	*(int32_t      *) p = maxCacheAge      ; p += 4;
	if ( p - m_request != RDBIDOFFSET ) { char *xx=NULL;*xx=0; }
	*p               = m_rdbId          ; p++;
	*p               = addToCache       ; p++;
	*p               = doErrorCorrection; p++;
	*p               = includeTree      ; p++;
	*p               = (char)niceness   ; p++;
	*p               = (char)m_allowPageCache; p++;
	KEYSET(p,m_startKey,m_ks);          ; p+=m_ks;
	KEYSET(p,m_endKey,m_ks);            ; p+=m_ks;
	// NULL terminated collection name
	//strcpy ( p , coll ); p += gbstrlen ( coll ); *p++ = '\0';
	*(collnum_t *)p = m_collnum; p += sizeof(collnum_t);
	m_requestSize    = p - m_request;
	// ask an individual host for this list if hostId is NOT -1
	if ( m_hostId != -1 ) {
		// get Host
		Host *h = g_hostdb.getHost ( m_hostId );
		if ( ! h ) { 
			g_errno = EBADHOSTID; 
			log(LOG_LOGIC,"net: msg0: Bad hostId of %"INT64".",
			    m_hostId);
			return true;
		}
		// if niceness is 0, use the higher priority udpServer
		UdpServer *us ;
		uint16_t port;
		QUICKPOLL(m_niceness);
		//if ( niceness <= 0 || netnice == 0 ) { 
		//if ( realtime ) {
		//	us = &g_udpServer2; port = h->m_port2; }
		//else                 { 
		us = &g_udpServer ; port = h->m_port ; 
		// . returns false on error and sets g_errno, true otherwise
		// . calls callback when reply is received (or error)
		// . we return true if it returns false
		if ( ! us->sendRequest ( m_request     ,
					 m_requestSize ,
					 0x00          , // msgType
					 h->m_ip       ,
					 port          ,
					 m_hostId      ,
					 NULL          , // the slotPtr
					 this          ,
					 gotSingleReplyWrapper ,
					 timeout       ,
					 -1            , // backoff
					 -1            , // maxwait
					 replyBuf      ,
					 replyBufMaxSize ,
					 m_niceness     ) ) // cback niceness
			return true;
		// return false cuz it blocked
		return false;
	}
	// timing debug
	if ( g_conf.m_logTimingNet )
		m_startTime = gettimeofdayInMilliseconds();
	else
		m_startTime = 0;
	//if ( m_rdbId == RDB_INDEXDB ) log("Msg0:: getting remote indexlist. "
	//			"termId=%"UINT64", "
	//			"groupNum=%"UINT32"",
	//			g_indexdb.getTermId(m_startKey) ,
	//			g_hostdb.makeHostId ( m_groupId ) );

	/*
	// make the cache key so we can see what remote host cached it, if any
	char cacheKey[MAX_KEY_BYTES];
	//key_t cacheKey = makeCacheKey ( startKey     ,
	makeCacheKey ( startKey     ,
		       endKey       ,
		       includeTree  ,
		       minRecSizes  ,
		       startFileNum ,
		       numFiles     ,
		       cacheKey     ,
		       m_ks         );
	*/

	// . get the top int32_t of the key
	// . i guess this will work for 128 bit keys... hmmmmm
	int32_t keyTop = hash32 ( (char *)startKey , m_ks );

	/*
	// allocate space
	if ( m_numSplit > 1 ) {
		int32_t  need = m_numSplit * sizeof(Multicast) ;
		char *buf  = (char *)mmalloc ( need,"msg0mcast" );
		if ( ! buf ) return true;
		m_mcasts = (Multicast *)buf;
		for ( int32_t i = 0; i < m_numSplit ; i++ )
			m_mcasts[i].constructor();
	}
	*/

        // . otherwise, multicast to a host in group "groupId"
	// . returns false and sets g_errno on error
	// . calls callback on completion
	// . select first host to send to in group based on upper 32 bits
	//   of termId (m_startKey.n1)
//#ifdef SPLIT_INDEXDB
	// . need to send out to all the indexdb split hosts
	m_numRequests = 0;
	m_numReplies  = 0;
	//for ( int32_t i = 0; i < m_numSplit; i++ ) {

	QUICKPOLL(m_niceness);
	//int32_t gr;
	char *buf;
	/*
	if ( m_numSplit > 1 ) {
		gr  = g_indexdb.getSplitGroupId ( baseGroupId, i );
		buf = &replyBuf[i*replyBufMaxSize];
	}
	else {
	*/
	//gr  = m_groupId;
	buf = replyBuf;
	//}

	// get the multicast
	Multicast *m = &m_mcast;
	//if ( m_numSplit > 1 ) m = &m_mcasts[i];

        if ( ! m->send ( m_request    , 
//#else
//        if ( ! m_mcast.send ( m_request    , 
//#endif
			      m_requestSize, 
			      0x00         , // msgType 0x00
			      false        , // does multicast own request?
			 m_shardNum ,
//#ifdef SPLIT_INDEXDB
//			      gr           , // group + offset
//#else
//			      m_groupId    , // group to send to (groupKey)
//#endif
			      false        , // send to whole group?
			      //m_startKey.n1, // key is passed on startKey
			      keyTop       , // key is passed on startKey
			      this         , // state data
			      NULL         , // state data
			      gotMulticastReplyWrapper0 ,
			      timeout      , // timeout in seconds (was 30)
			      niceness     ,
			      realtime     ,
			      firstHostId  ,
//#ifdef SPLIT_INDEXDB
//			      &replyBuf[i*replyBufMaxSize] ,
//#else
//			      replyBuf        ,
//#endif
			      buf             ,
			      replyBufMaxSize ,
			      freeReply       , // free reply buf?
			      true            , // do disk load balancing?
			      maxCacheAge     ,
			      //(key_t *)cacheKey        ,
			      // multicast uses it for determining the best
			      // host to send the request to when doing 
			      // disk load balancing. if the host has our 
			      // data cached, then it will probably get to
			      // handle the request. for now let's just assume
			      // this is a 96-bit key. TODO: fix...
			 0 , // *(key_t *)cacheKey        ,
			      rdbId           ,
			      minRecSizes     ) ) {
		log("net: Failed to send request for data from %s in shard "
		    "#%"UINT32" over network: %s.",
		    getDbnameFromId(m_rdbId),m_shardNum, mstrerror(g_errno));
		// no, multicast will free this when it is destroyed
		//if (replyBuf) mfree ( replyBuf , replyBufMaxSize , "Msg22" );
		// but speed it up
//#ifdef SPLIT_INDEXDB
		m_errno = g_errno;
		m->reset();
		if ( m_numRequests > 0 )
			return false;
//#else
//		m_mcast.reset();
//#endif
		return true;
	}
//#ifdef SPLIT_INDEXDB
	m_numRequests++;

//#endif
	// we blocked
	return false;
}
// . return false if blocked, true otherwise
// . sets g_errno on error
// . this one is also called by RdbMerge to dump lists
bool RdbDump::dumpList ( RdbList *list , int32_t niceness , bool recall ) {

	// if we had a write error and are being recalled...
	if ( recall ) { m_offset -= m_bytesToWrite; goto recallskip; }
	// assume we don't hack the list
	m_hacked = false;
	m_hacked12 = false;
	// save ptr to list... why?
	m_list = list;
	// nothing to do if list is empty
	if ( m_list->isEmpty() ) return true;
	// we're now in dump mode again 
	m_isDumping = true;
	//#ifdef GBSANITYCHECK
	// don't check list if we're dumping an unordered list from tree!
	if ( g_conf.m_verifyWrites && m_orderedDump ) {
		m_list->checkList_r ( false /*removedNegRecs?*/ );
		// print list stats
		// log("dump: sk=%s ",KEYSTR(m_list->m_startKey,m_ks));
		// log("dump: ek=%s ",KEYSTR(m_list->m_endKey,m_ks));
	}
	//#endif

	// before calling RdbMap::addList(), always reset list ptr
	// since we no longer call this in RdbMap::addList() so we don't
	// mess up the possible HACK below
	m_list->resetListPtr();

	// . SANITY CHECK
	// . ensure first key is >= last key added to the map map
	if ( m_offset > 0 && m_map ) {
		//key_t k       = m_list->getCurrentKey();
		char k[MAX_KEY_BYTES];
		m_list->getCurrentKey(k);
		//key_t lastKey = m_map->getLastKey    (); // m_lastKey
		char lastKey[MAX_KEY_BYTES];
		m_map->getLastKey(lastKey);
		//char *lastKey = m_map->getLastKey();
		//if ( k <= lastKey ) {
		if ( KEYCMP(k,lastKey,m_ks)<=0 ) {
			log(LOG_LOGIC,"db: Dumping list key out of order. "
			    //"lastKey.n1=%"XINT32" n0=%"XINT64" k.n1=%"XINT32" n0=%"XINT64"",
			    //lastKey.n1,lastKey.n0,k.n1,k.n0);
			    "lastKey=%s k=%s",
			    KEYSTR(lastKey,m_ks),
			    KEYSTR(k,m_ks));
			g_errno = EBADENGINEER;
			//return true;
			char *xx = NULL; *xx = 0;
		}
	}

	if ( g_conf.m_verifyWrites ) {
		char rdbId = 0;
		if ( m_rdb ) rdbId = m_rdb->m_rdbId;
		m_list->checkList_r(false,false,rdbId);//RDB_POSDB);
		m_list->resetListPtr();
	}

	// HACK! POSDB
	if ( m_ks == 18 && m_orderedDump && m_offset > 0 ) {
		char k[MAX_KEY_BYTES];
		m_list->getCurrentKey(k);
		// . same top 6 bytes as last key we added?
		// . if so, we should only add 6 bytes from this key, not 12
		//   so on disk it is compressed consistently
		if ( memcmp ( (k             ) + (m_ks-12) ,
			      (m_prevLastKey ) + (m_ks-12) , 12 ) == 0 ) {
			char tmp[MAX_KEY_BYTES];
			char *p = m_list->getList();
			// swap high 12 bytes with low 6 bytes for first key
			gbmemcpy ( tmp   , p            , m_ks-12 );
			gbmemcpy ( p     , p + (m_ks-12) ,      12 );
			gbmemcpy ( p + 12, tmp          , m_ks-12 );
			// big hack here
			m_list->m_list         = p + 12;
			m_list->m_listPtr      = p + 12;
			m_list->m_listPtrLo    = p ;
			m_list->m_listPtrHi    = p + 6;
			m_list->m_listSize    -= 12 ;
			// turn on both bits to indicate double compression
			*(p+12) |= 0x06;
			m_hacked12 = true;
		}
	}

	// . HACK
	// . if we're doing an ordered dump then hack the list's first 12 byte
	//   key to make it a 6 byte iff the last key we dumped last time
	//   shares the same top 6 bytes as the first key of this list
	// . this way we maintain compression consistency on the disk
	//   so IndexTable.cpp can expect all 6 byte keys for the same termid
	//   and RdbList::checkList_r() can expect the half bits to always be
	//   on when they can be on
	// . IMPORTANT: calling m_list->resetListPtr() will mess this HACK up!!
	if ( m_useHalfKeys && m_orderedDump && m_offset > 0 && ! m_hacked12 ) {
		//key_t k = m_list->getCurrentKey();	
		char k[MAX_KEY_BYTES];
		m_list->getCurrentKey(k);
		// . same top 6 bytes as last key we added?
		// . if so, we should only add 6 bytes from this key, not 12
		//   so on disk it is compressed consistently
		//if ( memcmp ( ((char *)&k             ) + 6 ,
		//	      ((char *)&m_prevLastKey ) + 6 , 6 ) == 0 ) {
		if ( memcmp ( (k             ) + (m_ks-6) ,
			      (m_prevLastKey ) + (m_ks-6) , 6 ) == 0 ) {
			m_hacked = true;
			//char tmp[6];
			char tmp[MAX_KEY_BYTES];
			char *p = m_list->getList();
			//gbmemcpy ( tmp   , p     , 6 );
			//gbmemcpy ( p     , p + 6 , 6 );
			//gbmemcpy ( p + 6 , tmp   , 6 );
			gbmemcpy ( tmp   , p            , m_ks-6 );
			gbmemcpy ( p     , p + (m_ks-6) ,      6 );
			gbmemcpy ( p + 6 , tmp          , m_ks-6 );
			// big hack here
			m_list->m_list       = p + 6;
			m_list->m_listPtr    = p + 6;
			// make this work for POSDB, too
			m_list->m_listPtrLo  = p + 6 + 6;
			m_list->m_listPtrHi  = p ;
			m_list->m_listSize  -= 6 ;
			// hack on the half bit, too
			*(p+6) |= 0x02;
		}
	}

	// update old last key
	//m_prevLastKey = m_list->getLastKey();
	m_list->getLastKey(m_prevLastKey);

	// now write it to disk
	m_buf          = m_list->getList    ();
	m_bytesToWrite = m_list->getListSize();
	//#ifdef GBSANITYCHECK
	//if (m_list->getListSize()!=m_list->getListEnd() - m_list->getList()){
	//	log("RdbDump::dumpList: major problem here!");
	//	sleep(50000);
	//}
	//#endif
 recallskip:
	// make sure we have enough mem to add to map after a successful
	// dump up here, otherwise, if we write it and fail to add to map
	// the map is not in sync if we core thereafter
	if ( m_addToMap && m_map && ! m_map->prealloc ( m_list ) ) {
		log("db: Failed to prealloc list into map: %s.",
		    mstrerror(g_errno));
		// g_errno should be set to something if that failed
		if ( ! g_errno ) { char *xx = NULL; *xx = 0; }
		return true;
	}
	// tab to the old offset
	int64_t offset = m_offset;
	// might as well update the offset now, even before write is done
	m_offset += m_bytesToWrite ;
	// write thread is out
	m_writing = true;
	//m_bytesWritten = 0;

	// sanity check
	//log("dump: writing %"INT32" bytes at offset %"INT64"",m_bytesToWrite,offset);

	// . if we're called by RdbMerge directly use m_callback/m_state
	// . otherwise, use doneWritingWrapper() which will call dumpTree()
	// . BigFile::write() return 0 if blocked,-1 on error,>0 on completion
	// . it also sets g_errno on error
	bool isDone = m_file->write ( m_buf          ,
				      m_bytesToWrite ,
				      offset         ,
				      &m_fstate      ,
				      this           ,
				      doneWritingWrapper ,
				      niceness         );
	// debug msg
	//log("RdbDump dumped %"INT32" bytes, done=%"INT32"\n",
	//	m_bytesToWrite,isDone); 
	// return false if it blocked
	if ( ! isDone ) return false;
	// done writing
	m_writing = false;
	// return true on error
	if ( g_errno    ) return true;
	// . delete list from tree, incorporate list into cache, add to map
	// . returns false if blocked, true otherwise, sets g_errno on error
	// . will only block in calling updateTfndb()
	return doneDumpingList ( true );
}
// . returns false if blocked, true otherwise
// . sets g_errno on error
// . dumps the RdbTree, m_tree, into m_file
// . also sets and writes the RdbMap for m_file
// . we methodically get RdbLists from the RdbTree 
// . dumped recs are ordered by key if "orderedDump" was true in call to set()
//   otherwise, lists are ordered by node #
// . we write each list of recs to the file until the whole tree has been done
// . we delete all records in list from the tree after we've written the list
// . if a cache was provided we incorporate the list into the cache before
//   deleting it from the tree to keep the cache in sync. NO we do NOT!
// . called again by writeBuf() when it's done writing the whole list
bool RdbDump::dumpTree ( bool recall ) {
	// set up some vars
	//int32_t  nextNode;
	//key_t maxEndKey;
	//maxEndKey.setMax();
	char maxEndKey[MAX_KEY_BYTES];
	KEYMAX(maxEndKey,m_ks);
	// if dumping statsdb, we can only dump records 30 seconds old or
	// more because Statsdb.cpp can "back modify" such records in the tree
	// because it may have a query that took 10 seconds come in then it
	// needs to add a partial stat to the last 10 stats for those 10 secs.
	// we use Global time at this juncture
	if ( m_rdb->m_rdbId == RDB_STATSDB ) {
		int32_t nowSecs = getTimeGlobal();
		StatKey *sk = (StatKey *)maxEndKey;
		sk->m_zero      = 0x01;
		sk->m_labelHash = 0xffffffff;
		// leave last 60 seconds in there just to be safe
		sk->m_time1     = nowSecs - 60;
	}

	// this list will hold the list of nodes/recs from m_tree
	m_list = &m_ourList;
	// convert coll to collnum
	//collnum_t collnum = g_collectiondb.getCollnum ( m_coll );
	// a collnum of -1 is for collectionless rdbs
	//if ( collnum < 0 ) {
	//	//if ( g_catdb->getRdb() == m_rdb )
	//	if ( ! m_rdb->m_isCollectionLess ) {
	//		char *xx=NULL;*xx=0; //return true;
	//	}
	//	g_errno = 0;
	//	collnum = 0;
	//}
	// getMemOccupiedForList2() can take some time, so breathe
	int32_t niceness = 1;
 loop:
	// if the lastKey was the max end key last time then we're done
	if ( m_rolledOver     ) return true;
	// this is set to -1 when we're done with our unordered dump
	if ( m_nextNode == -1 ) return true;
	// . NOTE: list's buffer space should be re-used!! (TODO)
	// . "lastNode" is set to the last node # in the list
	bool status = true;
	//if ( ! m_orderedDump ) {
	//	status = ((RdbTree *)m_tree)->getListUnordered ( m_nextNode ,
	//							 m_maxBufSize ,
	//							 m_list , 
	//							 &nextNode );
	//	// this is -1 when no more nodes are left
	//	m_nextNode = nextNode;
	//}
	// "lastKey" is set to the last key in the list
	//else {
	{

		// can we remove neg recs?
		// class RdbBase *base = m_rdb->getBase(m_collnum);
		// bool removeNegRecs = false;
		// if ( base->m_numFiles <= 0 ) removeNegRecs = true;

		if ( recall ) goto skip;

		// debug msg
		//log("RdbDump:: getting list");
		m_t1 = gettimeofdayInMilliseconds();
		if(m_tree)
			status = m_tree->getList ( m_collnum       ,
					   m_nextKey     , 
					   maxEndKey     ,
					   m_maxBufSize  , // max recSizes
					   m_list        , 
					   &m_numPosRecs   ,
					   &m_numNegRecs   ,
					   m_useHalfKeys ,
						   niceness );
		else if(m_buckets)
			status = m_buckets->getList ( m_collnum,
					   m_nextKey     , 
					   maxEndKey     ,
					   m_maxBufSize  , // max recSizes
					   m_list        , 
					   &m_numPosRecs   ,
					   &m_numNegRecs   ,
					   m_useHalfKeys );


		// don't dump out any neg recs if it is our first time dumping
		// to a file for this rdb/coll. TODO: implement this later.
		//if ( removeNegRecs )
		//	m_list.removeNegRecs();

 		// if(!m_list->checkList_r ( false , // removeNegRecs?
 		// 			 false , // sleep on problem?
 		// 			 m_rdb->m_rdbId )) {
 		// 	log("db: list to dump is not sane!");
		// 	char *xx=NULL;*xx=0;
 		// }


	skip:
		int64_t t2;
		//key_t lastKey;
		char *lastKey;
		// if error getting list (out of memory?)
		if ( ! status ) goto hadError;
		// debug msg
		t2 = gettimeofdayInMilliseconds();
		log(LOG_INFO,"db: Get list took %"INT64" ms. "
		    "%"INT32" positive. %"INT32" negative.",
		    t2 - m_t1 , m_numPosRecs , m_numNegRecs );
		// keep a total count for reporting when done
		m_totalPosDumped += m_numPosRecs;
		m_totalNegDumped += m_numNegRecs;
		// . check the list we got from the tree for problems
		// . ensures keys are ordered from lowest to highest as well
		//#ifdef GBSANITYCHECK
		if ( g_conf.m_verifyWrites ) {
			char *s = "none";
			if ( m_rdb ) s = getDbnameFromId(m_rdb->m_rdbId);
			log("dump: verifying list before dumping (rdb=%s)",s);
			m_list->checkList_r ( false , // removeNegRecs?
					      false , // sleep on problem?
					      m_rdb->m_rdbId );
		}
		// if list is empty, we're done!
		if ( status && m_list->isEmpty() ) {
			// consider that a rollover?
			if ( m_rdb->m_rdbId == RDB_STATSDB )
				m_rolledOver = true;
			return true;
		}
		// get the last key of the list
		lastKey = m_list->getLastKey();
		// advance m_nextKey
		//m_nextKey  = lastKey ;
		//m_nextKey += (uint32_t)1;
		//if ( m_nextKey < lastKey ) m_rolledOver = true;
		KEYSET(m_nextKey,lastKey,m_ks);
		KEYADD(m_nextKey,1,m_ks);
		if (KEYCMP(m_nextKey,lastKey,m_ks)<0) m_rolledOver = true;
	      // debug msg
	      //log(0,"RdbDump:lastKey.n1=%"UINT32",n0=%"UINT64"",lastKey.n1,lastKey.n0);
	      //log(0,"RdbDump:next.n1=%"UINT32",n0=%"UINT64"",m_nextKey.n1,m_nextKey.n0);
	}
	// . return true on error, g_errno should have been set
	// . this is probably out of memory error
	if ( ! status ) {
	hadError:
		log("db: Had error getting data for dump: %s. Retrying.", 
		    mstrerror(g_errno));
		// debug msg
		//log("RdbDump::getList: sleeping and retrying");
		// retry for the remaining two types of errors
		if (!g_loop.registerSleepCallback(1000,this,tryAgainWrapper2)){
			log(
			    "db: Retry failed. Could not register callback.");
			return true;
		}
		// wait for sleep
		return false;
	}
	// if list is empty, we're done!
	if ( m_list->isEmpty() ) return true;
	// . set m_firstKeyInQueue and m_lastKeyInQueue
	// . this doesn't work if you're doing an unordered dump, but we should
	//   not allow adds when closing
	m_lastKeyInQueue  = m_list->getLastKey();
	//m_firstKeyInQueue = m_list->getCurrentKey();
	m_list->getCurrentKey(m_firstKeyInQueue);
	// . write this list to disk
	// . returns false if blocked, true otherwise
	// . sets g_errno on error
	// . if this blocks it should call us (dumpTree() back)
	if ( ! dumpList ( m_list , m_niceness , false ) ) return false;
	// close up shop on a write/dumpList error
	if ( g_errno ) return true;
	// . if dumpList() did not block then keep on truckin'
	// . otherwise, wait for callback of dumpTree()
	goto loop;
}
Esempio n. 8
0
int main()
{
	atddtree_key min = 1;
	atddtree_key max;// = 1000;
	atddtree_key keys[10] = {2,3,5,8,16,23,26,35,48,50};
	int i;
	atddtree* t;

	int nuser = 1024*1024-1;	//1024*1024*64-1;
	double mean_ia = 205;

	
	gsl_rng *r;
		const gsl_rng_type *T;
		int n=5;
		double u;
			
		T=gsl_rng_ranlxs0;		//设随机数生成器类型是 ranlxs0
		
		
		//gen arrival
		gsl_rng_default_seed = ((unsigned long)(time(NULL))); 	 //设seed值为当前时间
		r=gsl_rng_alloc(T); 	 //生成实例
		
		double* exp_sample_ir = MALLOC(nuser, double);
		double abstemp = 0;
		for(i=0;i<nuser;i++)	{
				exp_sample_ir[i] = gsl_ran_exponential(r, mean_ia);
				//exp_sample_ir[i] =  2+(i%10000)*0.3;
#ifdef LOGISTIC				
				abstemp = gsl_ran_logistic(r, 1);
				if(abstemp<0)	{
					abstemp=0-abstemp;
				}
				exp_sample_ir[i] = abstemp;				
#endif	
				//exp_sample_ir[i] = 5*gsl_ran_beta(r, 5, 1);
				//exp_sample_ir[i] = 5*gsl_ran_lognormal(r, 5, 0.25);
				//printf("exp: %f\n", exp_sample_ir[i]);
		}
		
		
		double* arrival_real	= MALLOC(nuser, double);
		arrival_real[0] = 1.0;
		for(i=1;i<nuser;i++)	{
			arrival_real[i] = arrival_real[i-1]+exp_sample_ir[i-1];
			//printf("arrival_real: %f\n", arrival_real[i]);
		}

		atddtree_key* arrival	= MALLOC(nuser, atddtree_key);
		for(i=0;i<nuser;i++)	{
					arrival[i] = (atddtree_key)arrival_real[i];
					//printf("arrival: %ld\n", arrival[i]);
		}


		max = 0;
	for(i=0;i<nuser;i++)	{
		
		if(KEYCMP(arrival[i],max)>0)	{
			KEYCPY(max,arrival[i]);
		}
		
	}

	printf("---max=%ld\n", max);









	
	t = atddtree_create(&min, &max);
	for(i=0;i<nuser;i++)	{
		atddtree_insert(t, arrival+i);
		//printf("insert %ld, height=%d\n", arrival[i], t->h);
	}
	printf("height=%d\n", t->h);
}