// . return false if blocked, true otherwise
// . set g_errno on error
// . list should be truncated, possibly have all negative keys removed,
//   and de-duped thanks to RdbList::indexMerge_r() and RdbList::merge_r()
bool RdbMerge::dumpList ( ) {
	// return true on g_errno
	if ( g_errno ) return true;

	// . it's suspended so we count this as blocking
	// . resumeMerge() will call getNextList() again, not dumpList() so
	//   don't advance m_startKey
	if ( m_isSuspended ) {
		m_isReadyToSave = true;
		return false;
	}

	// . set the list to only those records that should be in our group
	// . filter the records that don't belong in this group via groupId
	//filterList ( &m_list );

	// . compute the new m_startKey to get the next list from disk
	// . m_list was formed via RdbList::merge() 
	// . m_list may be empty because of negative/positive collisions
	//   but there may still be data left
	//m_startKey = m_list.getLastKey() ;
	//m_list.getLastKey(m_startKey) ;
	// if we use getLastKey() for this the merge completes but then
	// tries to merge two empty lists and cores in the merge function
	// because of that. i guess it relies on endkey rollover only and
	// not on reading less than minRecSizes to determine when to stop
	// doing the merge.
	m_list.getEndKey(m_startKey) ;
	//m_startKey += (uint32_t)1;
	KEYADD(m_startKey,m_ks);

	/////
	//
	// dedup for spiderdb before we dump it. try to save disk space.
	//
	/////
	if ( m_rdbId == RDB_SPIDERDB )
		// removeNegRecs? = false
		dedupSpiderdbList(&m_list, false);

	// if the startKey rolled over we're done
	//if ( m_startKey.n0 == 0LL && m_startKey.n1 == 0 ) m_doneMerging=true;
	if ( KEYCMP(m_startKey,KEYMIN(),m_ks)==0 ) m_doneMerging = true;
	// debug msg
	log(LOG_DEBUG,"db: Dumping list.");
	// debug msg
	//fprintf(stderr,"list startKey.n1=%" PRIu32",n0=%" PRIu64", endKey.n1=%" PRIu32",n0=%" PRIu64","
	//	" size=%" PRId32"\n",
	//	m_list.getStartKey().n1, 
	//	m_list.getStartKey().n0, 
	//	m_list.getLastKey().n1, 
	//	m_list.getLastKey().n0,  m_list.getListSize() );
	// . send the whole list to the dump
	// . it returns false if blocked, true otherwise
	// . it sets g_errno on error
	// . it calls dumpListWrapper when done dumping
	// . return true if m_dump had an error or it did not block
	// . if it gets an EFILECLOSED error it will keep retrying forever
	return m_dump.dumpList ( &m_list , m_niceness , false/*recall?*/ ) ;
}
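
// . a minimal self-contained sketch (NOT the real KEYADD/KEYCMP/KEYMIN
//   macros) of the fixed-width key idiom dumpList() leans on: a key is an
//   m_ks-byte integer, KEYADD increments it with a carry, and a key that
//   comes back all-zero after the add has rolled over, which is how
//   m_doneMerging gets set above
// . byte order (least significant byte first) is an assumption here for
//   illustration; keyAdd()/keyIsMin() are hypothetical names, not part of
//   this codebase
static void keyAdd ( unsigned char *k , char ks ) {
	// add one; stop as soon as a byte does not carry
	for ( char i = 0 ; i < ks ; i++ ) if ( ++k[i] ) return;
}
static bool keyIsMin ( const unsigned char *k , char ks ) {
	// rolled over iff every byte carried back around to zero
	for ( char i = 0 ; i < ks ; i++ ) if ( k[i] ) return false;
	return true;
}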
// . returns a new, smaller endKey
// . shrinks endKey while still preserving the minRecSizes requirement
// . this is the most confusing subroutine in the project
// . this now OVERWRITES endKey with the new one
//key_t Msg3::setPageRanges ( RdbBase *base ,
void  Msg3::setPageRanges ( RdbBase *base ,
			    int32_t  *fileNums      ,
			    int32_t   numFileNums   ,
			    const char  *startKey   ,
			    char  *endKey        ,
			    int32_t   minRecSizes   ) {
	// sanity check
	//if ( m_ks != 12 && m_ks != 16 ) { char *xx=NULL;*xx=0; }
	// get the file maps from the rdb
	RdbMap **maps = base->getMaps();
	// . initialize the startpg/endpg for each file
	// . we read from the first offset on m_startpg to the first offset
	//   on m_endpg
	// . since we set them equal that means an empty range for each file
	for ( int32_t i = 0 ; i < numFileNums ; i++ ) {
		int32_t fn = fileNums[i];
		if ( fn < 0 ) { char *xx = NULL; *xx = 0; }
		m_startpg[i] = maps[fn]->getPage( startKey );
		m_endpg  [i] = m_startpg[i];
	}
	// just return if minRecSizes 0 (no reading needed)
	//if ( minRecSizes <= 0 ) return endKey ;
	if ( minRecSizes <= 0 ) return;
	// calculate minKey minus one
	//key_t lastMinKey ;
	char lastMinKey[MAX_KEY_BYTES];
	char lastMinKeyIsValid = 0;
	// loop until we find the page ranges that barely satisfy "minRecSizes"
  loop:
	// find the map whose next page has the lowest key
	int32_t  minpg   = -1;
	//key_t minKey; 
	char minKey[MAX_KEY_BYTES];
	for ( int32_t i = 0 ; i < numFileNums ; i++ ) {
		int32_t fn = fileNums[i];
		// this guy is out of the race if his end key > "endKey" already
		//if ( maps[fn]->getKey ( m_endpg[i] ) > endKey ) continue;
		if(KEYCMP(maps[fn]->getKeyPtr(m_endpg[i]),endKey,m_ks)>0)
			continue;
		// get the next page after m_endpg[i]
		int32_t nextpg = m_endpg[i] + 1;
		// if endpg[i]+1 > getNumPages() then we maxed out this range
		if ( nextpg > maps[fn]->getNumPages() ) continue;
		// . but this may have an offset of -1
		// . which means the page has no key starting on it and
		//   it's occupied by a rec which starts on a previous page
		while ( nextpg < maps[fn]->getNumPages() &&
			maps[fn]->getOffset ( nextpg ) == -1 ) nextpg++;
		// . continue if his next page doesn't have the minimum key
		// . if nextpg == getNumPages() then it returns the LAST KEY
		//   contained in the corresponding RdbFile
		//if ( minpg != -1 && maps[fn]->getKey ( nextpg ) > minKey ) 
		if (minpg != -1 && 
		    KEYCMP(maps[fn]->getKeyPtr(nextpg),minKey,m_ks)>0)continue;
		// . we got a winner, his next page has the current min key
		// . if m_endpg[i]+1 == getNumPages() then getKey() returns the
		//   last key in the mapped file
		// . minKey should never equal the key on m_endpg[i] UNLESS 
		//   it's on page #m_numPages
		//minKey = maps[fn]->getKey ( nextpg );
		KEYSET(minKey,maps[fn]->getKeyPtr(nextpg),m_ks);
		minpg  = i;
		// if minKey is the same as the current key on this endpg, inc
		// it so we cause some advancement; otherwise we'll loop forever
		//if ( minKey != maps[fn]->getKey ( m_endpg[i] ) ) continue;
		if ( KEYCMP(minKey,maps[fn]->getKeyPtr(m_endpg[i]),m_ks)!=0) 
			continue;
		//minKey += (uint32_t) 1;
		KEYADD(minKey,m_ks);
	}
	// . we're done if we hit the end of all maps in the race
	// . return the max end key
	// key_t maxEndKey; maxEndKey.setMax(); return maxEndKey; }
	// . no, just the endKey
	//if ( minpg  == -1 ) return endKey;
	if ( minpg  == -1 ) return;
	// sanity check
	if ( lastMinKeyIsValid && KEYCMP(minKey,lastMinKey,m_ks)<=0 ) {
		g_errno = ECORRUPTDATA;
		log("db: Got corrupted map in memory for %s. This is almost "
		    "always because of bad memory. Please replace your RAM.",
		    base->m_dbname);
		// do not wait for any merge to complete... otherwise
		// Rdb.cpp will not close until the merge is done
		g_merge.m_isMerging  = false;
		g_merge2.m_isMerging = false;
		// shutdown with urgent=true so threads are disabled.
		g_process.shutdown(true);
		//g_numCorrupt++;
		// sleep for now until we make sure this works
		//sleep(2000);
		return;
	}
	// don't let minKey exceed endKey, however
	//if ( minKey > endKey ) {
	if ( KEYCMP(minKey,endKey,m_ks)>0 ) {
		//minKey      = endKey ;
		//minKey     += (uint32_t) 1;
		//lastMinKey  = endKey;
		KEYSET(minKey,endKey,m_ks);
		KEYADD(minKey,m_ks);
		KEYSET(lastMinKey,endKey,m_ks);
	}
	else {
		//lastMinKey = minKey ;
		//lastMinKey -= (uint32_t) 1;
		KEYSET(lastMinKey,minKey,m_ks);
		KEYSUB(lastMinKey,m_ks);
	}
	// it is now valid
	lastMinKeyIsValid = 1;
	// . advance m_endpg[i] so that next page < minKey 
	// . we want to read UP TO the first key on m_endpg[i]
	for ( int32_t i = 0 ; i < m_numFileNums ; i++ ) {
		int32_t fn = fileNums[i];
		m_endpg[i] = maps[fn]->getEndPage ( m_endpg[i], lastMinKey );
	}
	// . if the minKey is BIGGER than the provided endKey we're done
	// . we don't necessarily include records whose key is "minKey"
	//if ( minKey > endKey ) return endKey;
	if ( KEYCMP(minKey,endKey,m_ks)>0) return;
	// . calculate recSizes per page within [startKey,minKey-1]
	// . compute bytes of records in [startKey,minKey-1] for each map
	// . this includes negative records so we may have annihilations
	//   when merging into "diskList" and get less than what we wanted
	//   but endKey should be shortened, so our caller will know to call
	//   again if he wants more
	int32_t recSizes = 0;
	for ( int32_t i = 0 ; i < m_numFileNums ; i++ ) {
		int32_t fn = fileNums[i];
		recSizes += maps[fn]->getMinRecSizes ( m_startpg[i] , 
						       m_endpg  [i] ,
						       startKey     , 
						       lastMinKey   ,
						       false        );
	}
	// if we hit it then set endKey to lastMinKey (minKey - 1) so we only
	// read UP TO "minKey", not including it
	//if ( recSizes >= minRecSizes ) 
	if ( recSizes >= minRecSizes ) {
		// . sanity check
		// . this sanity check fails sometimes, but leave it
		//   out for now... causes the Illegal endkey msgs in
		//   RdbList::indexMerge_r()
		//if ( KEYNEG(lastMinKey) ) { char *xx=NULL;*xx=0; }
		KEYSET(endKey,lastMinKey,m_ks);
		//return lastMinKey;
		return;
	}
	// keep on truckin'
	goto loop;
}
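
// . a toy, self-contained model of the "race" loop above, with each file's
//   map reduced to an ascending array of first-keys, one per page: keep
//   advancing the file whose next unread page has the lowest key, and once
//   the pages covered reach minRecSizes, return a tightened endKey just
//   below the first key we did NOT include
// . PageMap and shrinkEndKeyToy are hypothetical; the real maps also track
//   offsets, pages with offset -1, key ties across files and multi-byte
//   keys, all glossed over here
#include <climits>
#include <vector>
struct PageMap {
	std::vector<int> firstKeyOnPage; // ascending, one entry per page
	int              bytesPerPage;
};
static int shrinkEndKeyToy ( const std::vector<PageMap> &maps ,
			     int startKey , int endKey , int minRecSizes ) {
	int n = (int)maps.size();
	// pg[i] = next unread page of file i, starting at startKey
	std::vector<int> pg ( n , 0 );
	for ( int i = 0 ; i < n ; i++ )
		while ( pg[i] < (int)maps[i].firstKeyOnPage.size() &&
			maps[i].firstKeyOnPage[pg[i]] < startKey ) pg[i]++;
	int recSizes = 0;
	for ( ; ; ) {
		// find the map whose next unread page has the lowest key
		int minpg = -1, minKey = INT_MAX;
		for ( int i = 0 ; i < n ; i++ ) {
			if ( pg[i] >= (int)maps[i].firstKeyOnPage.size() )
				continue;
			int k = maps[i].firstKeyOnPage[pg[i]];
			if ( k > endKey || k >= minKey ) continue;
			minKey = k;
			minpg  = i;
		}
		// every map exhausted: endKey cannot be shrunk
		if ( minpg == -1 ) return endKey;
		// read enough: stop just below the next key we'd include
		if ( recSizes >= minRecSizes ) return minKey - 1;
		recSizes += maps[minpg].bytesPerPage;
		pg[minpg]++;
	}
}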
// . buffer is used for reading and writing
// . return false if blocked, true otherwise
// . sets g_errno on error
// . if niceness is 0 merge will block, otherwise will not block
// . we now use niceness of 1 which should spawn threads that don't allow
//   niceness 2 threads to launch while they're running
// . spider process now uses mostly niceness 2 
// . we need the merge to take priority over spider processes on disk otherwise
//   there's too much contention from spider lookups on disk for the merge
//   to finish in a decent amount of time and we end up getting too many files!
bool RdbMerge::merge ( char     rdbId        ,
		       //char    *coll         , //RdbBase *base         , 
		       collnum_t collnum,
		       BigFile *target       , 
		       RdbMap  *targetMap    ,
		       long     id2          , // target's secondary id
		       long     startFileNum , 
		       long     numFiles     ,
		       long     niceness     ,
		       class DiskPageCache *pc   ,
		       long long maxTargetFileSize ,
		       char     keySize      ) {
	// reset ourselves
	reset();
	// set it
	m_rdbId = rdbId;
	Rdb *rdb = getRdbFromId ( rdbId );
	// get base, returns NULL and sets g_errno to ENOCOLLREC on error
	RdbBase *base; if (!(base=getRdbBase(m_rdbId,collnum))) return true;
	// don't breach the max
	//if ( numFiles > m_maxFilesToMerge ) numFiles = m_maxFilesToMerge;
	// reset this map! its m_crcs needs to be reset
	//targetMap->reset();
	// remember some parms
	//if ( ! coll && rdb->m_isCollectionLess )
	//	strcpy ( m_coll , rdb->m_dbname );
	//else
	//	strcpy ( m_coll , coll );

	m_collnum = collnum;
	if ( rdb->m_isCollectionLess ) m_collnum = 0;

	m_target          = target;
	m_targetMap       = targetMap;
	m_id2             = id2;
	m_startFileNum    = startFileNum;
	m_numFiles        = numFiles;
	m_dedup           = base->m_dedup;
	m_fixedDataSize   = base->m_fixedDataSize;
	m_niceness        = niceness;
	m_pc              = pc;
	m_maxTargetFileSize = maxTargetFileSize;
	m_doneMerging     = false;
	m_ks              = keySize;
	// . set the key range we want to retrieve from the files
	// . just get from the files, not tree (not cache?)
	//m_startKey.setMin();
	//m_endKey.setMax();
	KEYMIN(m_startKey,m_ks);
	KEYMAX(m_endKey,m_ks);
	// if we're resuming a killed merge, set m_startKey to last
	// key the map knows about.
	// the dump will start dumping at the end of the targetMap's data file.
	if ( m_targetMap->getNumRecs() > 0 ) {
		log(LOG_INIT,"db: Resuming a killed merge.");
		//m_startKey = m_targetMap->getLastKey();
		m_targetMap->getLastKey(m_startKey);
		//m_startKey += (unsigned long) 1;
		KEYADD(m_startKey,m_ks);
		// if power goes out and we are not doing synchronous writes
		// then we could have completely lost some data and unlinked
		// a part file from the file being merged, so that the data is
		// gone. to be able to resume merging, we must increment the
		// startKey until it references a valid offset in all the 
		// files being merged. invalid offsets will reference parts 
		// that have been chopped.
		/*
		RdbMap  **maps  = rdb->getMaps();
		BigFile **files = rdb->getFiles();
		for ( long i=m_startFileNum;i<m_startFileNum+m_numFiles;i++){
			long long minOff = 0LL;
			long k = 0;
			while ( k < files[i]->m_maxParts &&
				!   files[i]->m_files[k]    ) {
				k++;
				minOff += MAX_PART_SIZE;
			}
			long pn0 = maps[i]->getPage ( m_startKey );
			long pn  = pn0;
			while ( maps[i]->getAbsoluteOffset(pn) < minOff ) pn++;
			if ( pn != pn0 ) {
				log("db: Lost data during merge. Starting "
				    "merge at page number %li from %li for "
				    "file.",pn,pn0);
				m_startKey = maps[i]->getKey ( pn );
			}
		}
		*/
	}
	// free our list's memory, just in case
	//m_list.freeList();
	// . we may have multiple hosts running on the same cpu/hardDrive
	// . therefore, to maximize disk space, we should only have 1 merge
	//   at a time going on between these hosts
	// . now tfndb has own merge class since titledb merge writes url recs
	/*
	if ( s_isMergeLocked ) {
		//log("RdbMerge::merge: someone else merging sleeping.");
		log("RdbMerge::merge: someone else merging. bad engineer.");
		return false;
		// if it fails then sleep until it works
		//returng_loop.registerSleepCallback(5000,this,getLockWrapper);
	}
	*/
	return gotLock();
}
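
// . a minimal sketch of the resume-from-a-killed-merge idea above, assuming
//   only that the target's map remembers the last key it has flushed for
//   the half-written file; MergeMapLike and computeResumeKey are
//   hypothetical names, not this codebase's API, and the byte order is an
//   assumption for illustration
#include <cstring>
struct MergeMapLike {
	int           numRecs;
	unsigned char lastKey[16]; // least significant byte first (assumed)
};
static void computeResumeKey ( const MergeMapLike *map ,
			       unsigned char *startKey , char ks ) {
	// fresh merge: start from the minimum key
	if ( map->numRecs <= 0 ) { memset ( startKey , 0 , ks ); return; }
	// resumed merge: start one past the last key already dumped
	memcpy ( startKey , map->lastKey , ks );
	for ( char i = 0 ; i < ks ; i++ ) if ( ++startKey[i] ) break;
}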
// . returns false if blocked, true otherwise
// . sets g_errno on error
// . dumps the RdbTree, m_tree, into m_file
// . also sets and writes the RdbMap for m_file
// . we methodically get RdbLists from the RdbTree 
// . dumped recs are ordered by key if "orderedDump" was true in call to set()
//   otherwise, lists are ordered by node #
// . we write each list of recs to the file until the whole tree has been done
// . we delete all records in list from the tree after we've written the list
// . if a cache was provided we incorporate the list into the cache before
//   deleting it from the tree to keep the cache in sync. NO we do NOT!
// . called again by writeBuf() when it's done writing the whole list
bool RdbDump::dumpTree ( bool recall ) {
	// set up some vars
	//int32_t  nextNode;
	//key_t maxEndKey;
	//maxEndKey.setMax();
	char maxEndKey[MAX_KEY_BYTES];
	KEYMAX(maxEndKey,m_ks);
	// if dumping statsdb, we can only dump records 30 seconds old or
	// more because Statsdb.cpp can "back modify" such records in the tree
	// because it may have a query that took 10 seconds come in then it
	// needs to add a partial stat to the last 10 stats for those 10 secs.
	// we use Global time at this juncture
	if ( m_rdb->m_rdbId == RDB_STATSDB ) {
		int32_t nowSecs = getTimeGlobal();
		StatKey *sk = (StatKey *)maxEndKey;
		sk->m_zero      = 0x01;
		sk->m_labelHash = 0xffffffff;
		// leave last 60 seconds in there just to be safe
		sk->m_time1     = nowSecs - 60;
	}

	// this list will hold the list of nodes/recs from m_tree
	m_list = &m_ourList;
	// convert coll to collnum
	//collnum_t collnum = g_collectiondb.getCollnum ( m_coll );
	// a collnum of -1 is for collectionless rdbs
	//if ( collnum < 0 ) {
	//	//if ( g_catdb->getRdb() == m_rdb )
	//	if ( ! m_rdb->m_isCollectionLess ) {
	//		char *xx=NULL;*xx=0; //return true;
	//	}
	//	g_errno = 0;
	//	collnum = 0;
	//}
	// getMemOccupiedForList2() can take some time, so breathe
	int32_t niceness = 1;
 loop:
	// if the lastKey was the max end key last time then we're done
	if ( m_rolledOver     ) return true;
	// this is set to -1 when we're done with our unordered dump
	if ( m_nextNode == -1 ) return true;
	// . NOTE: list's buffer space should be re-used!! (TODO)
	// . "lastNode" is set to the last node # in the list
	bool status = true;
	//if ( ! m_orderedDump ) {
	//	status = ((RdbTree *)m_tree)->getListUnordered ( m_nextNode ,
	//							 m_maxBufSize ,
	//							 m_list , 
	//							 &nextNode );
	//	// this is -1 when no more nodes are left
	//	m_nextNode = nextNode;
	//}
	// "lastKey" is set to the last key in the list
	//else {
	{

		// can we remove neg recs?
		// class RdbBase *base = m_rdb->getBase(m_collnum);
		// bool removeNegRecs = false;
		// if ( base->m_numFiles <= 0 ) removeNegRecs = true;

		if ( recall ) goto skip;

		// debug msg
		//log("RdbDump:: getting list");
		m_t1 = gettimeofdayInMilliseconds();
		if ( m_tree )
			status = m_tree->getList ( m_collnum     ,
						   m_nextKey     ,
						   maxEndKey     ,
						   m_maxBufSize  , // max recSizes
						   m_list        ,
						   &m_numPosRecs ,
						   &m_numNegRecs ,
						   m_useHalfKeys ,
						   niceness      );
		else if ( m_buckets )
			status = m_buckets->getList ( m_collnum     ,
						      m_nextKey     ,
						      maxEndKey     ,
						      m_maxBufSize  , // max recSizes
						      m_list        ,
						      &m_numPosRecs ,
						      &m_numNegRecs ,
						      m_useHalfKeys );


		// don't dump out any neg recs if it is our first time dumping
		// to a file for this rdb/coll. TODO: implement this later.
		//if ( removeNegRecs )
		//	m_list.removeNegRecs();

		// if(!m_list->checkList_r ( false , // removeNegRecs?
		// 			 false , // sleep on problem?
		// 			 m_rdb->m_rdbId )) {
		// 	log("db: list to dump is not sane!");
		// 	char *xx=NULL;*xx=0;
		// }


	skip:
		int64_t t2;
		//key_t lastKey;
		char *lastKey;
		// if error getting list (out of memory?)
		if ( ! status ) goto hadError;
		// debug msg
		t2 = gettimeofdayInMilliseconds();
		log(LOG_INFO,"db: Get list took %"INT64" ms. "
		    "%"INT32" positive. %"INT32" negative.",
		    t2 - m_t1 , m_numPosRecs , m_numNegRecs );
		// keep a total count for reporting when done
		m_totalPosDumped += m_numPosRecs;
		m_totalNegDumped += m_numNegRecs;
		// . check the list we got from the tree for problems
		// . ensures keys are ordered from lowest to highest as well
		//#ifdef GBSANITYCHECK
		if ( g_conf.m_verifyWrites ) {
			char *s = "none";
			if ( m_rdb ) s = getDbnameFromId(m_rdb->m_rdbId);
			log("dump: verifying list before dumping (rdb=%s)",s);
			m_list->checkList_r ( false , // removeNegRecs?
					      false , // sleep on problem?
					      m_rdb->m_rdbId );
		}
		// if list is empty, we're done!
		if ( status && m_list->isEmpty() ) {
			// consider that a rollover?
			if ( m_rdb->m_rdbId == RDB_STATSDB )
				m_rolledOver = true;
			return true;
		}
		// get the last key of the list
		lastKey = m_list->getLastKey();
		// advance m_nextKey
		//m_nextKey  = lastKey ;
		//m_nextKey += (uint32_t)1;
		//if ( m_nextKey < lastKey ) m_rolledOver = true;
		KEYSET(m_nextKey,lastKey,m_ks);
		KEYADD(m_nextKey,m_ks);
		if (KEYCMP(m_nextKey,lastKey,m_ks)<0) m_rolledOver = true;
		// debug msg
		//log(0,"RdbDump:lastKey.n1=%"UINT32",n0=%"UINT64"",lastKey.n1,lastKey.n0);
		//log(0,"RdbDump:next.n1=%"UINT32",n0=%"UINT64"",m_nextKey.n1,m_nextKey.n0);
	}
	// . return true on error, g_errno should have been set
	// . this is probably out of memory error
	if ( ! status ) {
	hadError:
		log("db: Had error getting data for dump: %s. Retrying.", 
		    mstrerror(g_errno));
		// debug msg
		//log("RdbDump::getList: sleeping and retrying");
		// retry for the remaining two types of errors
		if (!g_loop.registerSleepCallback(1000,this,tryAgainWrapper2)){
			log("db: Retry failed. Could not register callback.");
			return true;
		}
		// wait for sleep
		return false;
	}
	// if list is empty, we're done!
	if ( m_list->isEmpty() ) return true;
	// . set m_firstKeyInQueue and m_lastKeyInQueue
	// . this doesn't work if you're doing an unordered dump, but we should
	//   not allow adds when closing
	m_lastKeyInQueue  = m_list->getLastKey();
	//m_firstKeyInQueue = m_list->getCurrentKey();
	m_list->getCurrentKey(m_firstKeyInQueue);
	// . write this list to disk
	// . returns false if blocked, true otherwise
	// . sets g_errno on error
	// . if this blocks it should call us (dumpTree() back)
	if ( ! dumpList ( m_list , m_niceness , false ) ) return false;
	// close up shop on a write/dumpList error
	if ( g_errno ) return true;
	// . if dumpList() did not block then keep on truckin'
	// . otherwise, wait for callback of dumpTree()
	goto loop;
}
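
// . a compact, self-contained model of the loop above: pull a size-bounded,
//   key-ordered batch starting at nextKey, "dump" it, resume at lastKey+1,
//   and stop on an empty batch or on key rollover; std::map stands in for
//   the tree/buckets, unsigned int for the key, and drainOrderedToy is a
//   hypothetical name, not this codebase's API
#include <cstdio>
#include <map>
#include <utility>
#include <vector>
static void drainOrderedToy ( std::map<unsigned,int> &tree , int maxBatch ) {
	unsigned nextKey    = 0;
	bool     rolledOver = false;
	while ( ! rolledOver ) {
		// grab up to maxBatch recs with key >= nextKey, in key order
		std::vector< std::pair<unsigned,int> > batch;
		std::map<unsigned,int>::iterator it =
			tree.lower_bound ( nextKey );
		for ( ; it != tree.end() &&
			      (int)batch.size() < maxBatch ; ++it )
			batch.push_back ( *it );
		// nothing left in range: the dump is done
		if ( batch.empty() ) return;
		// a real dump writes the batch to disk here
		for ( size_t i = 0 ; i < batch.size() ; i++ )
			printf ( "key=%u\n" , batch[i].first );
		// advance; unsigned wrap-around means lastKey was the max
		unsigned lastKey = batch.back().first;
		nextKey = lastKey + 1;
		if ( nextKey < lastKey ) rolledOver = true;
	}
}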