// . scan all Rdb databases of the "qatest123" collection so the caller can
//   ensure there are no recs (it was a clean delete)
// . "rdbId" is the scan cursor AND the state handed to the read callback:
//   it holds the last rdb id already requested and is advanced before each
//   new request, so calling this again with the same pointer resumes the scan
// . returns true right away if the collection does not exist
// . returns false as soon as s_msg0.getList() returns false (presumably the
//   read blocked and gotList33 will be called with "rdbId" as its state --
//   confirm against Msg0), true once every id below RDB_END was requested
bool checkRdbLists ( long *rdbId ) {
	CollectionRec *cr = g_collectiondb.getRec("qatest123");
	if ( ! cr ) return true;
	collnum_t cn = cr->m_collnum;
	for ( ; *rdbId < RDB_END ; ) {
		// pre-inc it
		*rdbId = *rdbId + 1;
		// . the pre-increment can land exactly on RDB_END, which the
		//   loop condition treats as the exclusive upper bound
		// . do not issue a read for it; the cursor still ends up at
		//   RDB_END exactly as it did before
		if ( *rdbId >= RDB_END ) break;
		// request the full key range of this rdb
		char minKey[MAX_KEY_BYTES];
		char maxKey[MAX_KEY_BYTES];
		KEYMIN(minKey,MAX_KEY_BYTES);
		KEYMAX(maxKey,MAX_KEY_BYTES);
		if ( ! s_msg0.getList ( 0 , // hostid
					0 , // ip
					0 , // port
					0 , // cacheage
					false, // addtocache
					*rdbId , // rdbid
					cn , // collnum
					&s_list ,
					minKey ,
					maxKey ,
					1000 , // minrecsizes
					rdbId , // state
					gotList33,
					0 // niceness
					) )
			return false;
	}
	return true;
}
// . return false if blocked, true otherwise
// . set g_errno on error
// . read list of keys in [startKey,endKey] range
// . read at least "minRecSizes" bytes of keys in that range
// . the "m_endKey" of resulting, merged list may have a smaller endKey
//   than the argument, "endKey" due to limitation by "minRecSizes"
// . resulting list will contain ALL keys between ITS [m_startKey,m_endKey]
// . final merged list "should" try to have a size of at least "minRecSizes"
//   but due to negative/positive rec elimination may be less
// . the endKey of the lists we read may be <= "endKey" provided
// . we try to shrink the endKey if minRecSizes is >= 0 in order to
//   avoid excessive reading
// . by shrinking the endKey we cannot take into account the size of deleted
//   records, so therefore we may fall short of "minRecSizes" in actuality,
//   in fact, the returned list may even be empty with a shrunken endKey
// . we merge all lists read from disk into the provided "list"
// . caller should call Msg3.getList(int32_t i) and Msg3:getNumLists() to retrieve
// . this makes the query engine faster since we don't need to merge the docIds
//   and can just send them across the network separately and they will be
//   hashed into IndexTable's table w/o having to do time-wasting merging.
// . caller can specify array of filenums to read from so incremental syncing
//   in Sync class can just read from titledb*.dat files that were formed
//   since the last sync point.
// . a "minRecSizes" below -1 is logged and clamped to -1
// . a "numFiles" of -1 means scan every file the base has
// . if "justGetEndKey" is true we return true right after m_endKey and
//   m_constrainKey have been computed, without starting any scan
// . we also return true without reading if the rdb base cannot be found, if
//   there are 0 files to scan or if the start key is greater than the end key
bool Msg3::readList  ( char           rdbId         ,
		       collnum_t collnum ,
		       const char       *startKeyArg   ,
		       const char       *endKeyArg     ,
		       int32_t           minRecSizes   , // max size of scan
		       int32_t           startFileNum  , // first file to scan
		       int32_t           numFiles      , // rel. to startFileNum
		       void          *state         , // for callback
		       void        (* callback ) ( void *state ) ,
		       int32_t           niceness      ,
		       int32_t           retryNum      ,
		       int32_t           maxRetries    ,
		       bool           compensateForMerge ,
		       bool           justGetEndKey ,
		       bool           allowPageCache ,
		       bool           hitDisk        ) {

	// set this to true to validate
	m_validateCache = false;//true;

	// clear, this MUST be done so if we return true g_errno is correct
	g_errno = 0;
	// assume lists are not checked for corruption
	m_listsChecked = false;
	// warn
	if ( minRecSizes < -1 ) {
		log(LOG_LOGIC,"db: Msg3 got minRecSizes of %" PRId32", changing "
		    "to -1.",minRecSizes);
		minRecSizes = -1;
	}
	// reset m_alloc and data in all lists in case we are a re-call
	reset();
	// warning
	if ( collnum < 0 ) log(LOG_LOGIC,"net: NULL collection. msg3.");
	// remember the callback
	m_rdbId              = rdbId;
	m_collnum = collnum;
	m_callback           = callback;
	m_state              = state;
	m_niceness           = niceness;
	m_numScansCompleted  = 0;
	m_retryNum           = retryNum;
	m_maxRetries         = maxRetries;
	m_compensateForMerge = compensateForMerge;
	m_allowPageCache     = allowPageCache;
	m_hitDisk            = hitDisk;
	m_hadCorruption      = false;
	// get keySize of rdb
	m_ks = getKeySizeFromRdbId ( m_rdbId );
	// reset the group error
	m_errno    = 0;
	// . the per-file lists are reset in the call to RdbScan::setRead() below
	// . ensure startKey last bit clear, endKey last bit set
	// . if RdbMerge is merging some files, not involving the root 
	//   file, then we can expect to get a lot of unmatched negative recs.
	// . as a consequence, our endKeys may often be negative. This means
	//   it may not annihilate with the positive key, but we should only
	//   miss like this at the boundaries of the lists we fetch.
	// . so in that case RdbList::merge will stop merging once the
	//   minRecSizes limit is reached even if it means ending on a negative
	//   rec key
	if ( !KEYNEG(startKeyArg) )
		log(LOG_REMIND,"net: msg3: StartKey lastbit set."); 
	if (  KEYNEG(endKeyArg) )
		log(LOG_REMIND,"net: msg3: EndKey lastbit clear."); 

	// . "mergeFileNum" is never reassigned below, so the debug log that
	//   prints it as "mfn" always shows -1
	int32_t mergeFileNum = -1 ;
	int32_t max ;

	// get base, returns NULL and sets g_errno to ENOCOLLREC on error
	RdbBase *base = getRdbBase( m_rdbId, m_collnum );
	if ( ! base ) {
		return true;
	}

	// store the file numbers in the array, these are the files we read
	m_numFileNums = 0;

	// save startFileNum here, just for recall
	m_startFileNum = startFileNum;
	m_numFiles     = numFiles;

	// . if we have a merge going on, we may have to change startFileNum
	// . if some files get unlinked because merge completes then our 
	//   reads will detect the error and loop back here
	// . we launch our reads right after this without giving up the cpu
	//   and we use file descriptors, so any changes to Rdb::m_files[]
	//   should not hurt us
	// . WARNING: just make sure you don't lose control of cpu until after
	//   you call RdbScan::set()
	// . we use hasMergeFile() instead of isMerging() because he may not 
	//   be merging cuz he got suspended or he restarted and
	//   hasn't called attemptMerge() yet, but he may still contain it
	if ( g_conf.m_logDebugQuery )
		log(LOG_DEBUG,
		    "net: msg3: "
		    "c=%" PRId32" hmf=%" PRId32" sfn=%" PRId32" msfn=%" PRId32" nf=%" PRId32" db=%s.",
		     (int32_t)compensateForMerge,(int32_t)base->hasMergeFile(),
		     (int32_t)startFileNum,(int32_t)base->m_mergeStartFileNum-1,
		     (int32_t)numFiles,base->m_dbname);
	// . -10 means "no merge target file to prepend"
	// . otherwise this is the file number of the file being merged into,
	//   stored as the first entry of m_fileNums once that array exists
	int32_t pre = -10;
	if ( compensateForMerge && base->hasMergeFile() && 
	     startFileNum >= base->m_mergeStartFileNum - 1 &&
	     (startFileNum > 0 || numFiles != -1) ) {
		// now also include the file being merged into, but only
		// if we are reading from a file being merged...
		if ( startFileNum < base->m_mergeStartFileNum +
		     base->m_numFilesToMerge - 1 )
			pre = base->m_mergeStartFileNum - 1;
		// debug msg
		if ( g_conf.m_logDebugQuery )
			log(LOG_DEBUG,
			   "net: msg3: startFileNum from %" PRId32" to %" PRId32" (mfn=%" PRId32")",
			    startFileNum,startFileNum+1,mergeFileNum);
		// if merge file was inserted before us, inc our file number
		startFileNum++;
	}
	// adjust num files if we need to, as well
	if ( compensateForMerge && base->hasMergeFile() && 
	     startFileNum < base->m_mergeStartFileNum - 1 &&
	     numFiles != -1 &&
	     startFileNum + numFiles - 1 >= base->m_mergeStartFileNum - 1 ) {
		// debug msg
		if ( g_conf.m_logDebugQuery )
			log(LOG_DEBUG,"net: msg3: numFiles up one.");
		// if merge file was inserted before us, inc our file number
		numFiles++;
	}

	// . how many rdb files does this base have?
	// . IMPORTANT: this can change since files are unstable because they
	//   might have all got merged into one!
	// . so do this check to make sure we're safe... especially if
	//   there was an error before and we called readList() on ourselves
	max = base->getNumFiles();
	// -1 means we should scan ALL the files in the base
	if ( numFiles == -1 ) numFiles = max;
	// limit it by startFileNum, however
	if ( numFiles > max - startFileNum ) numFiles = max - startFileNum;
	// set g_errno and return true if it is < 0
	if ( numFiles < 0 ) { 
		log(LOG_LOGIC,
		   "net: msg3: readList: numFiles = %" PRId32" < 0 (max=%" PRId32")(sf=%" PRId32")",
		    numFiles , max , startFileNum );
		g_errno = EBADENGINEER; 
		// force core dump
		char *xx=NULL;*xx=0;
		return true; 
	}

	// . allocate buffer space
	// . m_scans, m_startpg, m_endpg, m_hintKeys, m_hintOffsets,
	//   m_fileNums, m_lists
	int32_t chunk = sizeof(RdbScan) + // m_scans
		4 +                    // m_startpg
		4 +                    // m_endpg
		m_ks +                 // m_hintKeys
		4 +                    // m_hintOffsets
		4 +                    // m_fileNums
		sizeof(RdbList) ;      // m_lists
	// one extra slot if we prepend the merge target file
	int32_t nn   = numFiles;
	if ( pre != -10 ) nn++;
	m_numChunks = nn;
	int32_t need = nn * (chunk);
	// use the built-in buffer unless it is too small
	m_alloc = m_buf;
	if ( need > (int32_t)MSG3_BUF_SIZE ) {
		m_allocSize = need;
		m_alloc = (char *)mcalloc ( need , "Msg3" );
		if ( ! m_alloc ) {
			log("disk: Could not allocate %" PRId32" bytes read "
			    "structures to read %s.",need,base->m_dbname);
			return true;
		}
	}
	// carve the buffer up into the parallel arrays
	char *p = m_alloc;
	m_scans       = (RdbScan *)p; p += nn * sizeof(RdbScan);
	m_startpg     = (int32_t    *)p; p += nn * 4;
	m_endpg       = (int32_t    *)p; p += nn * 4;
	m_hintKeys    = (char    *)p; p += nn * m_ks;
	m_hintOffsets = (int32_t    *)p; p += nn * 4;
	m_fileNums    = (int32_t    *)p; p += nn * 4;
	m_lists       = (RdbList *)p; p += nn * sizeof(RdbList);
	// sanity check
	if ( p - m_alloc != need ) {
		log(LOG_LOGIC,"disk: Bad malloc in Msg3.cpp.");
		char *xx = NULL; *xx = 0;
	}
	// call constructors
	for ( int32_t i = 0 ; i < nn ; i++ ) m_lists[i].constructor();
	// make fix from up top
	if ( pre != -10 ) m_fileNums [ m_numFileNums++ ] = pre;

	// store them all
	for ( int32_t i = startFileNum ; i < startFileNum + numFiles ; i++ )
		m_fileNums [ m_numFileNums++ ] = i;

	// . remove file nums that are being unlinked after a merge now
	// . compacts m_fileNums in place
	int32_t n = 0;
	for ( int32_t i = 0 ; i < m_numFileNums ; i++ ) {
		// skip those that are being unlinked after the merge
		if ( base->m_isUnlinking && 
		     m_fileNums[i] >= base->m_mergeStartFileNum &&
		     m_fileNums[i] <  base->m_mergeStartFileNum + 
		                      base->m_numFilesToMerge      )
			continue;
		// otherwise, keep it
		m_fileNums[n++] = m_fileNums[i];
	}
	m_numFileNums = n;

	// remember the file range we should scan
	m_numScansStarted    = 0;
	m_numScansCompleted  = 0;
	KEYSET(m_startKey,startKeyArg,m_ks);
	KEYSET(m_endKey,endKeyArg,m_ks);
	KEYSET(m_constrainKey,endKeyArg,m_ks);//set incase justGetEndKey istrue
	m_minRecSizes        = minRecSizes;
	m_compensateForMerge = compensateForMerge;
	// bail if 0 files to scan -- no! need to set startKey/endKey
	if ( numFiles == 0 ) return true;
	// don't read anything if endKey < startKey
	if ( KEYCMP(m_startKey,m_endKey,m_ks)>0 ) return true;
	// keep the original intact in case g_errno == ETRYAGAIN
	KEYSET(m_endKeyOrig,endKeyArg,m_ks);
	m_minRecSizesOrig   = minRecSizes;
	// start reading at this key
	m_fileStartKey = startKeyArg;
	// start the timer, keep it fast for clusterdb though
	if ( g_conf.m_logTimingDb ) m_startTime = gettimeofdayInMilliseconds();
	// map ptrs
	RdbMap **maps = base->getMaps();
	// . we now boost m_minRecSizes to account for negative recs 
	// . but not if only reading one list, cuz it won't get merged and
	//   it will be too big to send back
	if ( m_numFileNums > 1 ) compensateForNegativeRecs ( base );
	// . often endKey is too big for an efficient read of minRecSizes bytes
	//   because we end up reading too much from all the files
	// . this will set m_startpg[i], m_endpg[i] for each RdbScan/RdbFile
	//   to ensure we read "minRecSizes" worth of records, not much more
	// . now this just overwrites m_endKey
	setPageRanges ( base           ,
			m_fileNums     ,
			m_numFileNums  ,
			m_fileStartKey , // start reading @ key
			m_endKey       , // stop reading @ key
			m_minRecSizes  );

	// . NEVER let m_endKey be a negative key, because it will 
	//   always be unmatched, since delbit is cleared
	// . adjusting it here ensures our generated hints are valid
	// . we will use this key to call constrain() with
	KEYSET(m_constrainKey,m_endKey,m_ks);
	if ( KEYNEG(m_constrainKey) )
		KEYSUB(m_constrainKey,m_ks);

	// Msg5 likes to get the endkey for getting the list from the tree
	if ( justGetEndKey ) return true;

	// sanity check
	if ( m_numFileNums > nn ) {
		log(LOG_LOGIC,"disk: Failed sanity check in Msg3.");
		char *xx = NULL; *xx = 0;
	}

	// . MDW removed forcing the endKey's last bit on -- go ahead and end
	//   on a delete key
	// . RdbMerge might not pick it up this round, but oh well
	// . so we can have both positive and negative co-existing in same file
	// . now start reading/scanning the files
	// . our m_scans array starts at 0
	for ( int32_t i = 0 ; i < m_numFileNums ; i++ ) {
		int32_t fn = m_fileNums[i];
		// this can happen somehow!
		if ( fn < 0 ) {
			log(LOG_LOGIC,"net: msg3: fn=%" PRId32". Bad engineer.",fn);
			continue;
		}
		// sanity check
		if ( i > 0 && m_fileNums[i-1] >= fn ) {
			log(LOG_LOGIC,
			    "net: msg3: files must be read in order "
			    "from oldest to newest so RdbList::indexMerge_r "
			    "works properly. Otherwise, corruption will "
			    "result. ");
			char *xx = NULL; *xx = 0;
			return true;
		}
		// . get the page range for this file
		// . we must get it again since we turn on endKey's last bit
		int32_t p1 , p2;
		maps[fn]->getPageRange ( m_fileStartKey , 
					m_endKey       , 
					&p1            , 
					&p2            ,
					NULL           );
		// now get some read info
		int64_t offset      = maps[fn]->getAbsoluteOffset ( p1 );
		int32_t      bytesToRead = maps[fn]->getRecSizes ( p1, p2, false);
		// inc our m_numScans
		m_numScansStarted++;
		// . keep stats on our disk accesses
		// . count disk seeks (assuming no fragmentation)
		// . count disk bytes read
		if ( bytesToRead > 0 ) {
			base->m_rdb->didSeek (             );
			base->m_rdb->didRead ( bytesToRead );
		}
		// . the startKey may be different for each RdbScan class
		// . RdbLists must have all keys within their [startKey,endKey]
		// . therefore set startKey individually from first page in map
		// . this endKey must be >= m_endKey 
		// . this startKey must be < m_startKey
		char startKey2 [ MAX_KEY_BYTES ];
		char endKey2   [ MAX_KEY_BYTES ];
		maps[fn]->getKey ( p1 , startKey2 );
		maps[fn]->getKey ( p2 , endKey2 );
		// store in here
		m_startpg [ i ] = p1;
		m_endpg   [ i ] = p2;

		// . we read UP TO that endKey, so reduce by 1
		// . but iff p2 is NOT the last page in the map/file
		// . maps[fn]->getKey(lastPage) will return the LAST KEY
		//   and maps[fn]->getOffset(lastPage) the length of the file
		if ( maps[fn]->getNumPages() != p2 ) KEYSUB(endKey2,m_ks);
		// otherwise, if we're reading all pages, then force the
		// endKey to virtual infinite
		else KEYMAX(endKey2,m_ks);

		// . set up the hints
		// . these are only used if we are only reading from 1 file
		// . these are used to call constrain() so we can constrain
		//   the end of the list w/o looping through all the recs
		//   in the list
		int32_t h2 = p2 ;
		// decrease by one page if we're on the last page
		if ( h2 > p1 && maps[fn]->getNumPages() == h2 ) h2--;
		// . decrease hint page until key is <= endKey on that page
		//   AND offset is NOT -1 because the old way would give
		//   us hints passed the endkey
		// . also decrease so we can constrain on minRecSizes in
		//   case we're the only list being read
		// . use >= m_minRecSizes instead of >, otherwise we may
		//   never be able to set "size" in RdbList::constrain()
		//   because "p" could equal "maxPtr" right away
		while ( h2 > p1 && 
		      (KEYCMP(maps[fn]->getKeyPtr(h2),m_constrainKey,m_ks)>0||
			  maps[fn]->getOffset(h2) == -1            ||
			  maps[fn]->getAbsoluteOffset(h2) - offset >=
			  m_minRecSizes ) )
			h2--;
		// now set the hint
		m_hintOffsets [ i ] = maps[fn]->getAbsoluteOffset ( h2 ) -
			              maps[fn]->getAbsoluteOffset ( p1 ) ;
		KEYSET(&m_hintKeys[i*m_ks],maps[fn]->getKeyPtr(h2),m_ks);

		// reset g_errno before calling setRead()
		g_errno = 0;
		// . we can now have dup keys, so, we may read in
		//   a rec with key "lastMinKey" even though we don't read
		//   in the first key on the end page, so we don't add 1 back
		//   to the endKey here
		// . that fix is now in RdbList::checklist_r()

		// timing debug
		if ( g_conf.m_logTimingDb )
			log(LOG_TIMING,
			    "net: msg: reading %" PRId32" bytes from %s file #%" PRId32" "
			     "(niceness=%" PRId32")",
			     bytesToRead,base->m_dbname,i,m_niceness);

		// log huge reads, those hurt us
		if ( bytesToRead > 150000000 ) {
			logf(LOG_INFO,"disk: Reading %" PRId32" bytes at offset %" PRId64" "
			    "from %s.",
			    bytesToRead,offset,base->m_dbname);
		}

		// . if any keys in the map are the same report corruption
		// . these buffers are sized MAX_KEY_BYTES, like startKey2 and
		//   endKey2 above, because m_ks bytes get copied into them; a
		//   fixed 16 is too small for any rdb whose keys are longer
		char tmpKey    [MAX_KEY_BYTES];
		char lastTmpKey[MAX_KEY_BYTES];
		// lastTmpKey holds nothing until the first page is copied in,
		// so do not compare against it before then
		bool haveLastKey = false;
		int32_t ccount = 0;
		if ( bytesToRead     > 10000000      && 
		     bytesToRead / 2 > m_minRecSizes &&
		     base->m_fixedDataSize >= 0        ) {
			for ( int32_t pn = p1 ; pn <= p2 ; pn++ ) {
				maps[fn]->getKey ( pn , tmpKey );
				if ( haveLastKey &&
				     KEYCMP(tmpKey,lastTmpKey,m_ks) == 0 ) 
					ccount++;
				gbmemcpy(lastTmpKey,tmpKey,m_ks);
				haveLastKey = true;
			}
		}
		if ( ccount > 10 ) {
			logf(LOG_INFO,"disk: Reading %" PRId32" bytes from %s file #"
			     "%" PRId32" when min "
			     "required is %" PRId32". Map is corrupt and has %" PRId32" "
			     "identical consecutive page keys because the "
			     "map was \"repaired\" because out of order keys "
			     "in the index.",
			     (int32_t)bytesToRead,
			     base->m_dbname,fn,
			     (int32_t)m_minRecSizes,
			     (int32_t)ccount);
			// this scan was counted as started above, so count
			// it as completed too since we never launch it
			m_numScansCompleted++;
			m_errno = ECORRUPTDATA;
			m_hadCorruption = true;
			break;
		}

		////////
		//
		// try to get from PAGE CACHE
		//
		////////
		BigFile *ff = base->getFile(m_fileNums[i]);
		RdbCache *rpc = getDiskPageCache ( m_rdbId );
		if ( ! m_allowPageCache ) rpc = NULL;
		// . vfd is unique 64 bit file id
		// . if file is opened vfd is -1, only set in call to open()
		int64_t vfd = ff->getVfd();
		key192_t ck = makeCacheKey ( vfd , offset, bytesToRead);
		char *rec; int32_t recSize;
		bool inCache = false;
		if ( rpc && vfd != -1 && ! m_validateCache ) 
			inCache = rpc->getRecord ( (collnum_t)0 , // collnum
						   (char *)&ck , 
						   &rec , 
						   &recSize ,
						   true , // copy?
						   -1 , // maxAge, none 
						   true ); // inccounts?
		m_scans[i].m_inPageCache = false;
		if ( inCache ) {
			m_scans[i].m_inPageCache = true;
			m_numScansCompleted++;
			// now we have to store this value, 6 or 12 so
			// we can modify the hint appropriately
			m_scans[i].m_shifted = *rec;
			// the first byte of the cached rec is the shift
			// value, the list data follows it
			m_lists[i].set ( rec +1,
					 recSize-1 ,
					 rec , // alloc
					 recSize , // allocSize
					 startKey2 ,
					 endKey2 ,
					 base->m_fixedDataSize ,
					 true , // owndata
					 base->useHalfKeys() ,
					 getKeySizeFromRdbId ( m_rdbId ) );
			continue;
		}

		// . do the scan/read of file #i
		// . this returns false if blocked, true otherwise
		// . this will set g_errno on error
		bool done = m_scans[i].setRead (base->getFile(m_fileNums[i]),
						base->m_fixedDataSize ,
						 offset                 ,
						 bytesToRead            ,
						 startKey2              ,
						 endKey2                ,
						m_ks                    ,
						 &m_lists[i]            ,
						 this                   ,
						 doneScanningWrapper    ,
						 base->useHalfKeys()    ,
						m_rdbId,
						 m_niceness             ,
						 m_allowPageCache       ,
						 m_hitDisk              ) ;
		// . damn, usually the above will indirectly launch a thread
		//   to do the reading, but it sets g_errno to EINTR,
		//   "interrupted system call"!
		// . i guess the thread does the read w/o blocking and then
		//   queues the signal on g_loop's queue before it exits
		// . try ignoring, and keep going
		if ( g_errno == EINTR ) {
			log("net: Interrupted system call while reading file. "
			    "Ignoring.");
			g_errno = 0;
		}
		// if it did not block then it completed, so count it
		if ( done ) m_numScansCompleted++;
		// break on an error, and remember g_errno in case we block
		if ( g_errno && g_errno != ENOTHREADSLOTS ) { 
			int32_t tt = LOG_WARN;
			if ( g_errno == EFILECLOSED ) tt = LOG_INFO;
			log(tt,"disk: Reading %s had error: %s.",
			    base->m_dbname, mstrerror(g_errno));
			m_errno = g_errno; 
			break; 
		}
	}

	// if we blocked, return false
	if ( m_numScansCompleted < m_numScansStarted ) return false;
	// . if all scans completed without blocking then wrap it up & ret true
	// . doneScanning may now block if it finds data corruption and must
	//   get the list remotely
	return doneScanning();
}
// . set up a merge of "numFiles" files, starting at file # "startFileNum",
//   of the rdb identified by "rdbId"/"collnum" into "target"/"targetMap"
// . return false if blocked, true otherwise
// . sets g_errno on error
// . if niceness is 0 merge will block, otherwise will not block
// . we now use niceness of 1 which should spawn threads that don't allow
//   niceness 2 threads to launch while they're running
// . spider process now uses mostly niceness 2 
// . we need the merge to take priority over spider processes on disk otherwise
//   there's too much contention from spider lookups on disk for the merge
//   to finish in a decent amount of time and we end up getting too many files!
// . returns true without doing anything else if the rdb base can not be found
// . otherwise the return value is whatever gotLock() returns
bool RdbMerge::merge ( char     rdbId        ,
		       collnum_t collnum,
		       BigFile *target       , 
		       RdbMap  *targetMap    ,
		       long     id2          , // target's secondary id
		       long     startFileNum , 
		       long     numFiles     ,
		       long     niceness     ,
		       class DiskPageCache *pc   ,
		       long long maxTargetFileSize ,
		       char     keySize      ) {
	// wipe any state left over from a previous merge
	reset();

	// look up the rdb and the per-collection base we are merging for
	m_rdbId = rdbId;
	Rdb *rdb = getRdbFromId ( rdbId );
	// getRdbBase returns NULL and sets g_errno to ENOCOLLREC on error
	RdbBase *base = getRdbBase ( m_rdbId , collnum );
	if ( ! base ) return true;

	// a collectionless rdb always uses collnum 0
	if ( rdb->m_isCollectionLess ) m_collnum = 0;
	else                           m_collnum = collnum;

	// parms taken from the base
	m_dedup         = base->m_dedup;
	m_fixedDataSize = base->m_fixedDataSize;

	// parms handed in by the caller
	m_target            = target;
	m_targetMap         = targetMap;
	m_id2               = id2;
	m_startFileNum      = startFileNum;
	m_numFiles          = numFiles;
	m_niceness          = niceness;
	m_pc                = pc;
	m_maxTargetFileSize = maxTargetFileSize;
	m_ks                = keySize;

	// we have not finished yet
	m_doneMerging = false;

	// . by default merge the entire key range
	// . we only read from the files, not from the tree
	KEYMIN(m_startKey,m_ks);
	KEYMAX(m_endKey,m_ks);

	// an empty target map means this is a fresh merge, so go
	if ( m_targetMap->getNumRecs() <= 0 ) return gotLock();

	// . otherwise we are resuming a killed merge
	// . begin one key past the last key the target map knows about
	// . the dump will start dumping at the end of the targetMap's data file
	// . NOTE: the old logic that advanced the start key past part files
	//   lost in a power outage is disabled, so no such adjustment is made
	log(LOG_INIT,"db: Resuming a killed merge.");
	m_targetMap->getLastKey(m_startKey);
	KEYADD(m_startKey,1,m_ks);

	// . the old "only one merge at a time per machine" lock check is
	//   disabled, so we go straight to gotLock()
	return gotLock();
}
// . returns false if blocked, true otherwise
// . sets g_errno on error
// . dumps the RdbTree, m_tree, into m_file
// . also sets and writes the RdbMap for m_file
// . we methodically get RdbLists from the RdbTree 
// . dumped recs are ordered by key if "orderedDump" was true in call to set()
//   otherwise, lists are ordered by node #
// . we write each list of recs to the file until the whole tree has been done
// . we delete all records in list from the tree after we've written the list
// . if a cache was provided we incorporate the list into the cache before
//   deleting it from the tree to keep the cache in sync. NO we do NOT!
// . called again by writeBuf() when it's done writing the whole list
bool RdbDump::dumpTree ( bool recall ) {
	// set up some vars
	//int32_t  nextNode;
	//key_t maxEndKey;
	//maxEndKey.setMax();
	char maxEndKey[MAX_KEY_BYTES];
	KEYMAX(maxEndKey,m_ks);
	// if dumping statsdb, we can only dump records 30 seconds old or
	// more because Statsdb.cpp can "back modify" such records in the tree
	// because it may have a query that took 10 seconds come in then it
	// needs to add a partial stat to the last 10 stats for those 10 secs.
	// we use Global time at this juncture
	if ( m_rdb->m_rdbId == RDB_STATSDB ) {
		int32_t nowSecs = getTimeGlobal();
		StatKey *sk = (StatKey *)maxEndKey;
		sk->m_zero      = 0x01;
		sk->m_labelHash = 0xffffffff;
		// leave last 60 seconds in there just to be safe
		sk->m_time1     = nowSecs - 60;
	}

	// this list will hold the list of nodes/recs from m_tree
	m_list = &m_ourList;
	// convert coll to collnum
	//collnum_t collnum = g_collectiondb.getCollnum ( m_coll );
	// a collnum of -1 is for collectionless rdbs
	//if ( collnum < 0 ) {
	//	//if ( g_catdb->getRdb() == m_rdb )
	//	if ( ! m_rdb->m_isCollectionLess ) {
	//		char *xx=NULL;*xx=0; //return true;
	//	}
	//	g_errno = 0;
	//	collnum = 0;
	//}
	// getMemOccupiedForList2() can take some time, so breathe
	int32_t niceness = 1;
 loop:
	// if the lastKey was the max end key last time then we're done
	if ( m_rolledOver     ) return true;
	// this is set to -1 when we're done with our unordered dump
	if ( m_nextNode == -1 ) return true;
	// . NOTE: list's buffer space should be re-used!! (TODO)
	// . "lastNode" is set to the last node # in the list
	bool status = true;
	//if ( ! m_orderedDump ) {
	//	status = ((RdbTree *)m_tree)->getListUnordered ( m_nextNode ,
	//							 m_maxBufSize ,
	//							 m_list , 
	//							 &nextNode );
	//	// this is -1 when no more nodes are left
	//	m_nextNode = nextNode;
	//}
	// "lastKey" is set to the last key in the list
	//else {
	{

		// can we remove neg recs?
		// class RdbBase *base = m_rdb->getBase(m_collnum);
		// bool removeNegRecs = false;
		// if ( base->m_numFiles <= 0 ) removeNegRecs = true;

		if ( recall ) goto skip;

		// debug msg
		//log("RdbDump:: getting list");
		m_t1 = gettimeofdayInMilliseconds();
		if(m_tree)
			status = m_tree->getList ( m_collnum       ,
					   m_nextKey     , 
					   maxEndKey     ,
					   m_maxBufSize  , // max recSizes
					   m_list        , 
					   &m_numPosRecs   ,
					   &m_numNegRecs   ,
					   m_useHalfKeys ,
						   niceness );
		else if(m_buckets)
			status = m_buckets->getList ( m_collnum,
					   m_nextKey     , 
					   maxEndKey     ,
					   m_maxBufSize  , // max recSizes
					   m_list        , 
					   &m_numPosRecs   ,
					   &m_numNegRecs   ,
					   m_useHalfKeys );


		// don't dump out any neg recs if it is our first time dumping
		// to a file for this rdb/coll. TODO: implement this later.
		//if ( removeNegRecs )
		//	m_list.removeNegRecs();

 		// if(!m_list->checkList_r ( false , // removeNegRecs?
 		// 			 false , // sleep on problem?
 		// 			 m_rdb->m_rdbId )) {
 		// 	log("db: list to dump is not sane!");
		// 	char *xx=NULL;*xx=0;
 		// }


	skip:
		int64_t t2;
		//key_t lastKey;
		char *lastKey;
		// if error getting list (out of memory?)
		if ( ! status ) goto hadError;
		// debug msg
		t2 = gettimeofdayInMilliseconds();
		log(LOG_INFO,"db: Get list took %"INT64" ms. "
		    "%"INT32" positive. %"INT32" negative.",
		    t2 - m_t1 , m_numPosRecs , m_numNegRecs );
		// keep a total count for reporting when done
		m_totalPosDumped += m_numPosRecs;
		m_totalNegDumped += m_numNegRecs;
		// . check the list we got from the tree for problems
		// . ensures keys are ordered from lowest to highest as well
		//#ifdef GBSANITYCHECK
		if ( g_conf.m_verifyWrites ) {
			char *s = "none";
			if ( m_rdb ) s = getDbnameFromId(m_rdb->m_rdbId);
			log("dump: verifying list before dumping (rdb=%s)",s);
			m_list->checkList_r ( false , // removeNegRecs?
					      false , // sleep on problem?
					      m_rdb->m_rdbId );
		}
		// if list is empty, we're done!
		if ( status && m_list->isEmpty() ) {
			// consider that a rollover?
			if ( m_rdb->m_rdbId == RDB_STATSDB )
				m_rolledOver = true;
			return true;
		}
		// get the last key of the list
		lastKey = m_list->getLastKey();
		// advance m_nextKey
		//m_nextKey  = lastKey ;
		//m_nextKey += (uint32_t)1;
		//if ( m_nextKey < lastKey ) m_rolledOver = true;
		KEYSET(m_nextKey,lastKey,m_ks);
		KEYADD(m_nextKey,1,m_ks);
		if (KEYCMP(m_nextKey,lastKey,m_ks)<0) m_rolledOver = true;
	      // debug msg
	      //log(0,"RdbDump:lastKey.n1=%"UINT32",n0=%"UINT64"",lastKey.n1,lastKey.n0);
	      //log(0,"RdbDump:next.n1=%"UINT32",n0=%"UINT64"",m_nextKey.n1,m_nextKey.n0);
	}
	// . return true on error, g_errno should have been set
	// . this is probably out of memory error
	if ( ! status ) {
	hadError:
		log("db: Had error getting data for dump: %s. Retrying.", 
		    mstrerror(g_errno));
		// debug msg
		//log("RdbDump::getList: sleeping and retrying");
		// retry for the remaining two types of errors
		if (!g_loop.registerSleepCallback(1000,this,tryAgainWrapper2)){
			log(
			    "db: Retry failed. Could not register callback.");
			return true;
		}
		// wait for sleep
		return false;
	}
	// if list is empty, we're done!
	if ( m_list->isEmpty() ) return true;
	// . set m_firstKeyInQueue and m_lastKeyInQueue
	// . this doesn't work if you're doing an unordered dump, but we should
	//   not allow adds when closing
	m_lastKeyInQueue  = m_list->getLastKey();
	//m_firstKeyInQueue = m_list->getCurrentKey();
	m_list->getCurrentKey(m_firstKeyInQueue);
	// . write this list to disk
	// . returns false if blocked, true otherwise
	// . sets g_errno on error
	// . if this blocks it should call us (dumpTree() back)
	if ( ! dumpList ( m_list , m_niceness , false ) ) return false;
	// close up shop on a write/dumpList error
	if ( g_errno ) return true;
	// . if dumpList() did not block then keep on truckin'
	// . otherwise, wait for callback of dumpTree()
	goto loop;
}