void RdbMerge::doneMerging ( ) {
	// let RdbDump free its m_verifyBuf buffer if it existed
	m_dump.reset();
	// debug msg
	//fprintf(stderr,"exiting, g_errno=%s!\n",mstrerror(g_errno));
	//exit(-1);
	// . free the list's memory, reset() doesn't do it
	// . when merging titledb i'm still seeing 200MB allocs to read from
	//   tfndb.
	m_list.freeList();
	// nuke our msg3
	//delete (m_msg3);
	// log a msg
	log(LOG_INFO,"db: Merge status: %s.",mstrerror(g_errno));
	// . reset our class
	// . this will free its cutoff keys buffer, trash buffer, treelist
	// . TODO: should we not reset to keep the mem handy for next time
	//   to help avoid out of mem errors?
	m_msg5.reset();
	// . do we really need these anymore?
	// . turn these off before calling incorporateMerge() since it
	//   will call attemptMerge() on all the other dbs
	m_isMerging     = false;
	m_isSuspended   = false;
	// get base, returns NULL and sets g_errno to ENOCOLLREC on error
	RdbBase *base; if (!(base=getRdbBase(m_rdbId,m_coll))) return;
	// pass g_errno on to incorporate merge so merged file can be unlinked
	base->incorporateMerge ( );
	// nuke the lock so others can merge
	//s_isMergeLocked = false;
}
Example #2
void doneScanningWrapper ( void *state ) {
	Msg3 *THIS = (Msg3 *) state;
	// inc the scan count
	THIS->m_numScansCompleted++;
	// we decided to try to ignore these errors
	if ( g_errno == EINTR ) {
		log("net: Interrupted system call while reading file. "
		    "Ignoring.");
		g_errno = 0;
	}
	// if we had an error, remember it
	if ( g_errno ) { 
		// get base, returns NULL and sets g_errno to ENOCOLLREC on err
		RdbBase *base = getRdbBase( THIS->m_rdbId, THIS->m_collnum );
		char *dbname = "NOT FOUND";
		if ( base ) dbname = base->m_dbname;
		int32_t tt = LOG_WARN;
		if ( g_errno == EFILECLOSED ) tt = LOG_INFO;
		log(tt,"net: Reading %s had error: %s.",
		    dbname,mstrerror(g_errno));
		THIS->m_errno = g_errno; 
		g_errno = 0; 
	}
	// return now if we're awaiting more scan completions
	if ( THIS->m_numScansCompleted < THIS->m_numScansStarted ) return;
	// . give control to doneScanning
	// . return if it blocks
	if ( ! THIS->doneScanning() ) return;
	// if one of our lists was *huge* and could not alloc mem, it was
	// due to corruption
	if ( THIS->m_hadCorruption ) g_errno = ECORRUPTDATA;
	// if it doesn't block call the callback, g_errno may be set
	THIS->m_callback ( THIS->m_state );
}
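The wrapper above illustrates the completion-counting convention Msg3 uses for its fanned-out reads: every scan shares one callback, and only the invocation that sees the last completion carries on. Below is a minimal, self-contained sketch of just that convention; the struct and function names are illustrative, not part of the original source.

#include <cstdint>

// illustrative state shared by N asynchronous reads (not from the original source)
struct MultiReadState {
	int32_t  m_numScansStarted;    // reads launched
	int32_t  m_numScansCompleted;  // reads finished so far
	void   (*m_callback)(void *);  // the original caller's callback
	void    *m_state;              // the original caller's state
};

static void oneScanDoneWrapper ( void *state ) {
	MultiReadState *s = (MultiReadState *)state;
	// count this completion
	s->m_numScansCompleted++;
	// still awaiting other reads? then just return
	if ( s->m_numScansCompleted < s->m_numScansStarted ) return;
	// the last read to finish hands control back to the caller
	s->m_callback ( s->m_state );
}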
Example #3
void RdbMerge::doneMerging ( ) {
	// save this
	int32_t saved = g_errno;
	// let RdbDump free its m_verifyBuf buffer if it existed
	m_dump.reset();
	// debug msg
	//fprintf(stderr,"exiting, g_errno=%s!\n",mstrerror(g_errno));
	//exit(-1);
	// . free the list's memory, reset() doesn't do it
	// . when merging titledb i'm still seeing 200MB allocs to read from
	//   tfndb.
	m_list.freeList();
	// nuke our msg3
	//delete (m_msg3);
	// log a msg
	log(LOG_INFO,"db: Merge status: %s.",mstrerror(g_errno));
	// . reset our class
	// . this will free its cutoff keys buffer, trash buffer, treelist
	// . TODO: should we not reset to keep the mem handy for next time
	//   to help avoid out of mem errors?
	m_msg5.reset();
	// . do we really need these anymore?
	// . turn these off before calling incorporateMerge() since it
	//   will call attemptMerge() on all the other dbs
	m_isMerging     = false;
	m_isSuspended   = false;

	// if collection rec was deleted while merging files for it
	// then the rdbbase should be NULL i guess.
	if ( saved == ENOCOLLREC ) return;

	// if we are exiting then dont bother renaming the files around now.
	// this prevents a core in RdbBase::incorporateMerge()
	if ( g_process.m_mode == EXIT_MODE ) {
		log("merge: exiting. not ending merge.");
		return;
	}

	// get base, returns NULL and sets g_errno to ENOCOLLREC on error
	RdbBase *base = getRdbBase( m_rdbId, m_collnum );
	if ( ! base ) {
		return;
	}
	// pass g_errno on to incorporate merge so merged file can be unlinked
	base->incorporateMerge ( );
	// nuke the lock so others can merge
	//s_isMergeLocked = false;
}
void setTermFreqWeights ( char *coll,
                          Query *q ,
                          long long *termFreqs,
                          float *termFreqWeights ) {

    long long numDocsInColl = 0;
    RdbBase *base = getRdbBase ( RDB_CLUSTERDB  , coll );
    if ( base ) numDocsInColl = base->getNumGlobalRecs();
    // negative? clamp it if so
    if ( numDocsInColl < 0 ) {
        log("query: Got num docs in coll of %lli < 0",numDocsInColl);
        // avoid divide by zero below
        numDocsInColl = 1;
    }
    // now get term freqs again, like the good old days
    long long *termIds = q->getTermIds();
    // just use rdbmap to estimate!
    for ( long i = 0 ; i < q->getNumTerms(); i++ ) {
        long long tf = g_posdb.getTermFreq ( coll ,termIds[i]);
        if ( termFreqs ) termFreqs[i] = tf;
        float tfw = getTermFreqWeight(tf,numDocsInColl);
        termFreqWeights[i] = tfw;
    }
}
Example #5
// . but now that we may get a list remotely to fix data corruption,
//   this may indeed block
bool Msg3::doneScanning ( ) {
	QUICKPOLL(m_niceness);
	// . did we have any error on any scan?
	// . if so, repeat ALL of the scans
	g_errno = m_errno;
	// 2 retries is the default
	int32_t max = 2;
	// see if explicitly provided by the caller
	if ( m_maxRetries >= 0 ) max = m_maxRetries;
	// now use -1 (no max) as the default no matter what
	max = -1;
	// ENOMEM is particularly contagious, so watch out with it...
	if ( g_errno == ENOMEM && m_maxRetries == -1 ) max = 0;
	// msg0 sets maxRetries to 2, don't let max stay set to -1
	if ( g_errno == ENOMEM && m_maxRetries != -1 ) max = m_maxRetries;
	// when thread cannot alloc enough read buf it keeps the read buf
	// set to NULL and BigFile.cpp sets g_errno to EBUFTOOSMALL
	if ( g_errno == EBUFTOOSMALL && m_maxRetries == -1 ) max = 0;
	// msg0 sets maxRetries to 2, don't let max stay set to -1
	if ( g_errno == EBUFTOOSMALL && m_maxRetries != -1 ) max = m_maxRetries;
	// . if no thread slots available, that hogs up serious memory.
	//   the size of Msg3 is 82k, so having just 5000 of them is 430MB.
	// . i just made Msg3 alloc mem when it needs more than about 2k
	//   so this problem is greatly reduced, therefore let's keep 
	//   retrying... forever if no thread slots in thread queue since
	//   we become the thread queue in a way.
	if ( g_errno == ENOTHREADSLOTS ) max = -1;
	// this is set above if the map has the same consecutive key repeated
	// and the read is enormous
	if ( g_errno == ECORRUPTDATA ) max = 0;
	// usually bad disk failures, don't retry those forever
	//if ( g_errno == EIO ) max = 3;
	// no, now our hitachis return these even when they're good so
	// we have to keep retrying forever
	if ( g_errno == EIO ) max = -1;
	// count these so we do not take drives offline just because
	// kernel ring buffer complains...
	if ( g_errno == EIO ) g_numIOErrors++;
	// bail early on high priority reads for these errors
	if ( g_errno == EDISKSTUCK && m_niceness == 0 ) max = 0;
	if ( g_errno == EIO        && m_niceness == 0 ) max = 0;

	// how does this happen? we should never bail out on a low priority
	// disk read... we just wait for it to complete...
	if ( g_errno == EDISKSTUCK && m_niceness != 0 ) { char *xx=NULL;*xx=0;}

	// on EIO, give up and call it corrupt after a while. some hitachis
	// have I/O errors on little spots, like gk88, maybe we can fix him
	if ( g_errno == EIO && m_retryNum >= 5 ) {
		m_errno = ECORRUPTDATA;
		m_hadCorruption = true;
		// do not do any retries any more
		max = 0;
	}

	// convert m_errno to ECORRUPTDATA if it is EBUFTOOSMALL and the
	// max of the bytesToRead are over 500MB.
	// if bytesToRead was ludicrous, then assume that the data file
	// was corrupted, the map was regenerated and it patched
	// over the corrupted bits which were 500MB or more in size.
	// we cannot practically allocate that much, so let's just
	// give back an empty buffer. treat it like corruption...
	// the way it patches is to store the same key over all the corrupted
	// pages, which can get pretty big. so if you read a range with that
	// key you will be hurting!!
	// this may be the same scenario as when the rdbmap has consecutive
	// same keys. see above where we set m_errno to ECORRUPTDATA...
	if ( g_errno == EBUFTOOSMALL ) { 
		int32_t biggest = 0;
		for ( int32_t i = 0 ; i < m_numFileNums ; i++ ) {
			if ( m_scans[i].m_bytesToRead < biggest ) continue;
			biggest = m_scans[i].m_bytesToRead;
		}
		if ( biggest > 500000000 ) {
			log("db: Max read size was %" PRId32" > 500000000. Assuming "
			    "corrupt data in data file.",biggest);
			m_errno = ECORRUPTDATA;
			m_hadCorruption = true;
			// do not do any retries on this, the read was > 500MB
			max = 0;
		}
	}

	// if shutting down gb then limit to 20 so we can shutdown because
	// it can't shutdown until all threads are out of the queue i think
	if ( g_process.m_mode == EXIT_MODE && max < 0 ) {
		//log("msg3: forcing retries to 0 because shutting down");
		max = 0;
	}

	// get base, returns NULL and sets g_errno to ENOCOLLREC on error
	RdbBase *base = getRdbBase( m_rdbId, m_collnum );
	if ( ! base ) {
		return true;
	}

	// this really slows things down because it blocks the cpu so
	// leave it out for now
#ifdef GBSANITYCHECK
	// check for corruption here, do not do it again in Msg5 if we pass
	if ( ! g_errno ) { // && g_conf.m_doErrorCorrection ) {
		int32_t i;
		for ( i = 0 ; i < m_numFileNums ; i++ )
			if ( ! m_lists[i].checkList_r ( false, false ) ) break;
		if ( i < m_numFileNums ) {
			g_errno = ECORRUPTDATA;
			m_errno = ECORRUPTDATA;
			max     = g_conf.m_corruptRetries; // try 100 times
			log("db: Encountered corrupt list in file %s.",
			    base->getFile(m_fileNums[i])->getFilename());
		}
		else
			m_listsChecked = true;
	}
#endif

	// try to fix this error i've seen
	if ( g_errno == EBADENGINEER && max == -1 )
		max = 100;

	// . if we had a ETRYAGAIN error, then try again now
	// . it usually means the whole file or a part of it was deleted 
	//   before we could finish reading it, so we should re-read all now
	// . RdbMerge deletes BigFiles after it merges them and also chops
	//   off file heads
	// . now that we have threads i'd imagine we'd get EBADFD or something
	// . i've also seen "illegal seek" as well
	if ( m_errno && (m_retryNum < max || max < 0) &&
	     // this will complete in due time, we can't call a sleep wrapper
	     // on it because the read is really still pending...
	     m_errno != EDISKSTUCK ) {
		// print the error
		static time_t s_time  = 0;
		time_t now = getTime();
		if ( now - s_time > 5 || g_errno != ENOTHREADSLOTS ) {
			log("net: Had error reading %s: %s. Retrying. "
			    "(retry #%" PRId32")", 
			    base->m_dbname,mstrerror(m_errno) , m_retryNum );
			s_time = now;
		}
		// send email alert if in an infinite loop, but don't send
		// more than once every 2 hours
		static int32_t s_lastSendTime = 0;
		if ( m_retryNum == 100 && getTime() - s_lastSendTime > 3600*2){
			// remove this for now it is going off all the time
			//g_pingServer.sendEmail(NULL,//g_hostdb.getMyHost(),
			//		       "100 read retries",true);
			s_lastSendTime = getTime();
		}
		// clear g_errno since we are about to call readList() again
		g_errno = 0;
		// free the list buffer since if we have 1000 Msg3s retrying
		// it will totally use all of our memory
		for ( int32_t i = 0 ; i < m_numChunks ; i++ ) 
			m_lists[i].destructor();
		// count retries
		m_retryNum++;
		// backoff scheme, wait 200ms more each retry
		int32_t wait ;
		if ( m_retryNum == 1 ) wait = 10;
		else                   wait = 200 * m_retryNum;
		// . don't wait more than 10 secs between tries
		// . i've seen gf0 and gf16 get mega saturated
		if ( wait > 10000 ) wait = 10000;
		// wait that many ms before retrying
		if ( g_loop.registerSleepCallback ( wait  , // ms
						    this  ,
						    doneSleepingWrapper3,
						    m_niceness))
			return false;
		// otherwise, registration failed
		log(
		    "net: Failed to register sleep callback for retry. "
		    "Abandoning read. This is bad.");
		// return, g_errno should be set
		g_errno = EBUFTOOSMALL;
		m_errno = EBUFTOOSMALL;
		return true;
	}

	// if we got an error and should not retry any more then give up
	if ( g_errno ) {
		log(
		    "net: Had error reading %s: %s. Giving up after %" PRId32" "
		    "retries.",
		    base->m_dbname,mstrerror(g_errno) , m_retryNum );
		return true;
	}

	// note it if the retry finally worked
	if ( m_retryNum > 0 ) 
		log(LOG_INFO,"disk: Read succeeded after retrying %" PRId32" times.",
		    (int32_t)m_retryNum);

	// count total bytes for logging
	int32_t count = 0;
	// . constrain all lists to make merging easier
	// . if we have only one list, then that's nice cuz the constrain
	//   will allow us to send it right away w/ zero copying
	// . if we have only 1 list, it won't be merged into a final list,
	//   that is, we'll just set m_list = &m_lists[i]
	for ( int32_t i = 0 ; i < m_numFileNums ; i++ ) {
		QUICKPOLL(m_niceness);
		// count total bytes for logging
		count += m_lists[i].getListSize();
		// . hint offset is relative to the offset of first key we read
		// . if that key was only 6 bytes RdbScan shifts the list buf
		//   down 6 bytes to make the first key 12 bytes... a 
		//   requirement for all RdbLists
		// . don't inc it, though, if it was 0, pointing to the start
		//   of the list because our shift won't affect that
		if ( m_scans[i].m_shifted == 6 && m_hintOffsets[i] > 0 ) 
			m_hintOffsets[i] += 6;
		// posdb double compression
		if ( m_scans[i].m_shifted == 12 && m_hintOffsets[i] > 0 ) 
			m_hintOffsets[i] += 12;
		// . don't constrain on minRecSizes here because it may
		//   make our endKey smaller, which will cause problems
		//   when Msg5 merges these lists.
		// . If all lists have different endKeys RdbList's merge
		//   chooses the min and will merge in recs beyond that
		//   causing a bad list BECAUSE we don't check to make
		//   sure that recs we are adding are below the endKey
		// . if we only read from one file then constrain based 
		//   on minRecSizes so we can send the list back w/o merging
		//   OR if just merging with RdbTree's list
		int32_t mrs ;
		// . constrain to m_minRecSizesOrig, not m_minRecSizes cuz 
		//   that  could be adjusted by compensateForNegativeRecs()
		// . but, really, they should be the same if we only read from
		//   the root file
		if ( m_numFileNums == 1 ) mrs = m_minRecSizesOrig;
		else                      mrs = -1;
		// . this returns false and sets g_errno on error
		// . like if data is corrupt
		BigFile *ff = base->getFile(m_fileNums[i]);
		// if we did a merge really quick and delete one of the 
		// files we were reading, i've seen 'ff' be NULL
		char *filename = "lostfilename";
		if ( ff ) filename = ff->getFilename();

		// compute cache info
		RdbCache *rpc = getDiskPageCache ( m_rdbId );
		if ( ! m_allowPageCache ) rpc = NULL;
		// init to -1 so the vfd checks below are safe if ff is NULL
		int64_t vfd = -1;
		if ( ff ) vfd = ff->getVfd();
		key192_t ck ;
		if ( ff )
			ck = makeCacheKey ( vfd ,
					    m_scans[i].m_offset ,
					    m_scans[i].m_bytesToRead );
		if ( m_validateCache && ff && rpc && vfd != -1 ) {
			bool inCache;
			char *rec; int32_t recSize;
			inCache = rpc->getRecord ( (collnum_t)0 , // collnum
						   (char *)&ck , 
						   &rec , 
						   &recSize ,
						   true , // copy?
						   -1 , // maxAge, none 
						   true ); // inccounts?
			if ( inCache && 
			     // 1st byte is RdbScan::m_shifted
			     ( m_lists[i].m_listSize != recSize-1 ||
			       memcmp ( m_lists[i].m_list , rec+1,recSize-1) ||
			       *rec != m_scans[i].m_shifted ) ) {
				log("msg3: cache did not validate");
				char *xx=NULL;*xx=0;
			}
			// only free the copy if the cache actually returned one
			if ( inCache ) mfree ( rec , recSize , "vca" );
		}


		///////
		//
		// STORE IN PAGE CACHE
		//
		///////
		// store what we read in the cache. don't bother storing
		// if it was a retry, just in case something strange happened.
		// store pre-constrain call is more efficient.
		if ( m_retryNum<=0 && ff && rpc && vfd != -1 &&
		     ! m_scans[i].m_inPageCache )
			rpc->addRecord ( (collnum_t)0 , // collnum
					 (char *)&ck , 
					 // rec1 is this little thingy
					 &m_scans[i].m_shifted,
					 1,
					 // rec2
					 m_lists[i].getList() ,
					 m_lists[i].getListSize() ,
					 0 ); // timestamp. 0 = now

		QUICKPOLL(m_niceness);

		// if from our 'page' cache, no need to constrain
		if ( ! m_lists[i].constrain ( m_startKey       ,
					      m_constrainKey   , // m_endKey
					      mrs           , // m_minRecSizes
					      m_hintOffsets[i] ,
					      //m_hintKeys   [i] ,
					      &m_hintKeys   [i*m_ks] ,
					      filename,//ff->getFilename() ,
					      m_niceness ) ) {
			log("net: Had error while constraining list read from "
			    "%s: %s/%s. vfd=%" PRId32" parts=%" PRId32". "
			    "This is likely caused by corrupted "
			    "data on disk.", 
			    mstrerror(g_errno), ff->getDir(),
			    ff->getFilename(), ff->m_vfd , 
			    (int32_t)ff->m_numParts );
			continue;
		}
	}

	// print the time
	if ( g_conf.m_logTimingDb ) {
		int64_t now = gettimeofdayInMilliseconds();
		int64_t took = now - m_startTime;
		log(LOG_TIMING,
		    "net: Took %" PRId64" ms to read %" PRId32" lists of %" PRId32" bytes total"
		     " from %s (niceness=%" PRId32").",
		     took,m_numFileNums,count,base->m_dbname,m_niceness);
	}
	return true;
}
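The retry path in doneScanning() backs off linearly: the first retry waits 10ms, each later retry waits 200ms per retry number, capped at 10 seconds. Below is a minimal, self-contained sketch of that computation; the function name is illustrative, not part of the original source.

#include <cstdint>

// illustrative backoff helper (not from the original source)
static int32_t computeRetryWaitMs ( int32_t retryNum ) {
	// first retry is nearly immediate
	if ( retryNum == 1 ) return 10;
	// otherwise back off linearly, 200ms per retry
	int32_t wait = 200 * retryNum;
	// never wait more than 10 secs between tries
	if ( wait > 10000 ) wait = 10000;
	return wait;
}
// e.g. retry #1 -> 10ms, #2 -> 400ms, #5 -> 1000ms, #50 and up -> 10000ms (capped)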
Example #6
// . return false if blocked, true otherwise
// . set g_errno on error
// . read list of keys in [startKey,endKey] range
// . read at least "minRecSizes" bytes of keys in that range
// . the "m_endKey" of resulting, merged list may have a smaller endKey
//   than the argument, "endKey" due to limitation by "minRecSizes"
// . resulting list will contain ALL keys between ITS [m_startKey,m_endKey]
// . final merged list "should" try to have a size of at least "minRecSizes"
//   but due to negative/positive rec elimination may be less
// . the endKey of the lists we read may be <= "endKey" provided
// . we try to shrink the endKey if minRecSizes is >= 0 in order to
//   avoid excessive reading
// . by shrinking the endKey we cannot take into account the size of deleted
//   records, so therefore we may fall short of "minRecSizes" in actuality,
//   in fact, the returned list may even be empty with a shrunken endKey
// . we merge all lists read from disk into the provided "list"
// . caller should call Msg3::getList(int32_t i) and Msg3::getNumLists() to retrieve
// . this makes the query engine faster since we don't need to merge the docIds
//   and can just send them across the network separately and they will be
//   hashed into IndexTable's table w/o having to do time-wasting merging.
// . caller can specify array of filenums to read from so incremental syncing
//   in Sync class can just read from titledb*.dat files that were formed
//   since the last sync point.
bool Msg3::readList  ( char           rdbId         ,
		       collnum_t collnum ,
		       const char       *startKeyArg   ,
		       const char       *endKeyArg     ,
		       int32_t           minRecSizes   , // max size of scan
		       int32_t           startFileNum  , // first file to scan
		       int32_t           numFiles      , // rel. to startFileNum
		       void          *state         , // for callback
		       void        (* callback ) ( void *state ) ,
		       int32_t           niceness      ,
		       int32_t           retryNum      ,
		       int32_t           maxRetries    ,
		       bool           compensateForMerge ,
		       bool           justGetEndKey ,
		       bool           allowPageCache ,
		       bool           hitDisk        ) {

	// set this to true to validate
	m_validateCache = false;//true;

	// clear, this MUST be done so if we return true g_errno is correct
	g_errno = 0;
	// assume lists are not checked for corruption
	m_listsChecked = false;
	// warn
	if ( minRecSizes < -1 ) {
		log(LOG_LOGIC,"db: Msg3 got minRecSizes of %" PRId32", changing "
		    "to -1.",minRecSizes);
		minRecSizes = -1;
	}
	// reset m_alloc and data in all lists in case we are a re-call
	reset();
	// warning
	if ( collnum < 0 ) log(LOG_LOGIC,"net: NULL collection. msg3.");
	// remember the callback
	m_rdbId              = rdbId;
	m_collnum = collnum;
	m_callback           = callback;
	m_state              = state;
	m_niceness           = niceness;
	m_numScansCompleted  = 0;
	m_retryNum           = retryNum;
	m_maxRetries         = maxRetries;
	m_compensateForMerge = compensateForMerge;
	m_allowPageCache     = allowPageCache;
	m_hitDisk            = hitDisk;
	m_hadCorruption      = false;
	// get keySize of rdb
	m_ks = getKeySizeFromRdbId ( m_rdbId );
	// reset the group error
	m_errno    = 0;
	// . reset all our lists 
	// . these are reset in call the RdbScan::setRead() below
	//for ( int32_t i = 0 ; i < MAX_RDB_FILES ; i++ ) m_lists[i].reset();
	// . ensure startKey last bit clear, endKey last bit set
	// . no! this warning is now only in Msg5
	// . if RdbMerge is merging some files, not involving the root 
	//   file, then we can expect to get a lot of unmatched negative recs.
	// . as a consequence, our endKeys may often be negative. This means
	//   it may not annihilate with the positive key, but we should only
	//   miss like this at the boundaries of the lists we fetch.
	// . so in that case RdbList::merge will stop merging once the
	//   minRecSizes limit is reached even if it means ending on a negative
	//   rec key
	//if ( (startKey.n0 & 0x01) == 0x01 ) 
	if ( !KEYNEG(startKeyArg) )
		log(LOG_REMIND,"net: msg3: StartKey lastbit set."); 
	if (  KEYNEG(endKeyArg) )
		log(LOG_REMIND,"net: msg3: EndKey lastbit clear."); 

	// declare vars here because of 'goto skip' below
	int32_t mergeFileNum = -1 ;
	int32_t max ;

	// get base, returns NULL and sets g_errno to ENOCOLLREC on error
	RdbBase *base = getRdbBase( m_rdbId, m_collnum );
	if ( ! base ) {
		return true;
	}

	// store the file numbers in the array, these are the files we read
	m_numFileNums = 0;

	// save startFileNum here, just for recall
	m_startFileNum = startFileNum;
	m_numFiles     = numFiles;

	// . if we have a merge going on, we may have to change startFileNum
	// . if some files get unlinked because merge completes then our 
	//   reads will detect the error and loop back here
	// . we launch our reads right after this without giving up the cpu
	//   and we use file descriptors, so any changes to Rdb::m_files[]
	//   should not hurt us
	// . WARNING: just make sure you don't lose control of cpu until after
	//   you call RdbScan::set()
	// . we use hasMergeFile() instead of isMerging() because he may not 
	//   be merging cuz he got suspended or he restarted and
	//   hasn't called attemptMerge() yet, but he may still contain it
	if ( g_conf.m_logDebugQuery )
		log(LOG_DEBUG,
		    "net: msg3: "
		    "c=%" PRId32" hmf=%" PRId32" sfn=%" PRId32" msfn=%" PRId32" nf=%" PRId32" db=%s.",
		     (int32_t)compensateForMerge,(int32_t)base->hasMergeFile(),
		     (int32_t)startFileNum,(int32_t)base->m_mergeStartFileNum-1,
		     (int32_t)numFiles,base->m_dbname);
	int32_t pre = -10;
	if ( compensateForMerge && base->hasMergeFile() && 
	     startFileNum >= base->m_mergeStartFileNum - 1 &&
	     (startFileNum > 0 || numFiles != -1) ) {
		// now also include the file being merged into, but only
		// if we are reading from a file being merged...
		if ( startFileNum < base->m_mergeStartFileNum +
		     base->m_numFilesToMerge - 1 )
			//m_fileNums [ m_numFileNums++ ] =
			//	base->m_mergeStartFileNum - 1;
			pre = base->m_mergeStartFileNum - 1;
		// debug msg
		if ( g_conf.m_logDebugQuery )
			log(LOG_DEBUG,
			   "net: msg3: startFileNum from %" PRId32" to %" PRId32" (mfn=%" PRId32")",
			    startFileNum,startFileNum+1,mergeFileNum);
		// if merge file was inserted before us, inc our file number
		startFileNum++;
	}
	// adjust num files if we need to, as well
	if ( compensateForMerge && base->hasMergeFile() && 
	     startFileNum < base->m_mergeStartFileNum - 1 &&
	     numFiles != -1 &&
	     startFileNum + numFiles - 1 >= base->m_mergeStartFileNum - 1 ) {
		// debug msg
		if ( g_conf.m_logDebugQuery )
			log(LOG_DEBUG,"net: msg3: numFiles up one.");
		// if merge file was inserted before us, inc our file number
		numFiles++;
	}

	// . how many rdb files does this base have?
	// . IMPORTANT: this can change since files are unstable because they
	//   might have all got merged into one!
	// . so do this check to make sure we're safe... especially if
	//   there was an error before and we called readList() on ourselves
	max = base->getNumFiles();
	// -1 means we should scan ALL the files in the base
	if ( numFiles == -1 ) numFiles = max;
	// limit it by startFileNum, however
	if ( numFiles > max - startFileNum ) numFiles = max - startFileNum;
	// set g_errno and return true if it is < 0
	if ( numFiles < 0 ) { 
		log(LOG_LOGIC,
		   "net: msg3: readList: numFiles = %" PRId32" < 0 (max=%" PRId32")(sf=%" PRId32")",
		    numFiles , max , startFileNum );
		g_errno = EBADENGINEER; 
		// force core dump
		char *xx=NULL;*xx=0;
		return true; 
	}

	// . allocate buffer space
	// . m_scans, m_startpg, m_endpg, m_hintKeys, m_hintOffsets,
	//   m_fileNums, m_lists
	int32_t chunk = sizeof(RdbScan) + // m_scans
		4 +                    // m_startpg
		4 +                    // m_endpg
		//sizeof(key_t) +        // m_hintKeys
		m_ks +                 // m_hintKeys
		4 +                    // m_hintOffsets
		4 +                    // m_fileNums
		sizeof(RdbList) ;      // m_lists
	int32_t nn   = numFiles;
	if ( pre != -10 ) nn++;
	m_numChunks = nn;
	int32_t need = nn * (chunk);
	m_alloc = m_buf;
	if ( need > (int32_t)MSG3_BUF_SIZE ) {
		m_allocSize = need;
		m_alloc = (char *)mcalloc ( need , "Msg3" );
		if ( ! m_alloc ) {
			log("disk: Could not allocate %" PRId32" bytes read "
			    "structures to read %s.",need,base->m_dbname);
			return true;
		}
	}
	char *p = m_alloc;
	m_scans       = (RdbScan *)p; p += nn * sizeof(RdbScan);
	m_startpg     = (int32_t    *)p; p += nn * 4;
	m_endpg       = (int32_t    *)p; p += nn * 4;
	//m_hintKeys    = (key_t   *)p; p += nn * sizeof(key_t);
	m_hintKeys    = (char    *)p; p += nn * m_ks;
	m_hintOffsets = (int32_t    *)p; p += nn * 4;
	m_fileNums    = (int32_t    *)p; p += nn * 4;
	m_lists       = (RdbList *)p; p += nn * sizeof(RdbList);
	// sanity check
	if ( p - m_alloc != need ) {
		log(LOG_LOGIC,"disk: Bad malloc in Msg3.cpp.");
		char *xx = NULL; *xx = 0;
	}
	// call constructors
	for ( int32_t i = 0 ; i < nn ; i++ ) m_lists[i].constructor();
	// make fix from up top
	if ( pre != -10 ) m_fileNums [ m_numFileNums++ ] = pre;

	// store them all
	for ( int32_t i = startFileNum ; i < startFileNum + numFiles ; i++ )
		m_fileNums [ m_numFileNums++ ] = i;

	// . remove file nums that are being unlinked after a merge now
	// . keep it here (below skip: label) so sync point reads can use it
	int32_t n = 0;
	for ( int32_t i = 0 ; i < m_numFileNums ; i++ ) {
		// skip those that are being unlinked after the merge
		if ( base->m_isUnlinking && 
		     m_fileNums[i] >= base->m_mergeStartFileNum &&
		     m_fileNums[i] <  base->m_mergeStartFileNum + 
		                      base->m_numFilesToMerge      )
			continue;
		// otherwise, keep it
		m_fileNums[n++] = m_fileNums[i];
	}
	m_numFileNums = n;

	// . if root file is being merged, he's file #0, & root file is file #1
	// . this is a hack so caller gets what he wants
	//if ( startFileNum == 0 && base->getFileId(0) == 0 && numFiles == 1 )
	//	numFiles = 2;

	// remember the file range we should scan
	m_numScansStarted    = 0;
	m_numScansCompleted  = 0;
	//m_startKey           = startKey;
	//m_endKey             = endKey;
	//m_constrainKey       = endKey; // set in case justGetEndKey is true
	KEYSET(m_startKey,startKeyArg,m_ks);
	KEYSET(m_endKey,endKeyArg,m_ks);
	KEYSET(m_constrainKey,endKeyArg,m_ks);//set incase justGetEndKey istrue
	m_minRecSizes        = minRecSizes;
	m_compensateForMerge = compensateForMerge;
	// bail if 0 files to scan -- no! need to set startKey/endKey
	if ( numFiles == 0 ) return true;
	// don't read anything if endKey < startKey
	//if ( m_startKey > m_endKey ) return true;
	if ( KEYCMP(m_startKey,m_endKey,m_ks)>0 ) return true;
	// keep the original intact in case g_errno == ETRYAGAIN
	//m_endKeyOrig        = endKey;
	KEYSET(m_endKeyOrig,endKeyArg,m_ks);
	m_minRecSizesOrig   = minRecSizes;
	// start reading at this key
	m_fileStartKey = startKeyArg;
	// start the timer, keep it fast for clusterdb though
	if ( g_conf.m_logTimingDb ) m_startTime = gettimeofdayInMilliseconds();
	// translate base to an id, for the sake of m_msg0
	//char baseId = m_msg0->getRdbId ( base );
	// map ptrs
	RdbMap **maps = base->getMaps();
	// . we now boost m_minRecSizes to account for negative recs 
	// . but not if only reading one list, cuz it won't get merged and
	//   it will be too big to send back
	if ( m_numFileNums > 1 ) compensateForNegativeRecs ( base );
	// . often endKey is too big for an efficient read of minRecSizes bytes
	//   because we end up reading too much from all the files
	// . this will set m_startpg[i], m_endpg[i] for each RdbScan/RdbFile
	//   to ensure we read "minRecSizes" worth of records, not much more
	// . returns the new endKey for all ranges
	// . now this just overwrites m_endKey
	//m_endKey = setPageRanges ( base           ,
	setPageRanges ( base           ,
			m_fileNums     ,
			m_numFileNums  ,
			m_fileStartKey , // start reading @ key
			m_endKey       , // stop reading @ key
			m_minRecSizes  );

	// . NEVER let m_endKey be a negative key, because it will 
	//   always be unmatched, since delbit is cleared
	// . adjusting it here ensures our generated hints are valid
	// . we will use this key to call constrain() with
	//m_constrainKey = m_endKey;
	//if ( ( m_constrainKey.n0 & 0x01) == 0x00 ) 
	//	m_constrainKey -= (uint32_t)1;
	KEYSET(m_constrainKey,m_endKey,m_ks);
	if ( KEYNEG(m_constrainKey) )
		KEYSUB(m_constrainKey,m_ks);

	// Msg5 likes to get the endkey for getting the list from the tree
	if ( justGetEndKey ) return true;

	// sanity check
	if ( m_numFileNums > nn ) {
		log(LOG_LOGIC,"disk: Failed sanity check in Msg3.");
		char *xx = NULL; *xx = 0;
	}

	// debug msg
	//log("msg3 getting list (msg5=%" PRIu32")",m_state);
	// . MDW removed this -- go ahead and end on a delete key
	// . RdbMerge might not pick it up this round, but oh well
	// . so we can have both positive and negative co-existing in same file
	// make sure the last bit is set so we don't end on a delete key
	//m_endKey.n0 |= 0x01LL;
	// . now start reading/scanning the files
	// . our m_scans array starts at 0
	for ( int32_t i = 0 ; i < m_numFileNums ; i++ ) {
		// get the page range
		//int32_t p1 = m_startpg [ i ];
		//int32_t p2 = m_endpg   [ i ];
		//#ifdef GBSANITYCHECK
		int32_t fn = m_fileNums[i];
		// this can happen somehow!
		if ( fn < 0 ) {
			log(LOG_LOGIC,"net: msg3: fn=%" PRId32". Bad engineer.",fn);
			continue;
		}
		// sanity check
		if ( i > 0 && m_fileNums[i-1] >= fn ) {
			log(LOG_LOGIC,
			    "net: msg3: files must be read in order "
			    "from oldest to newest so RdbList::indexMerge_r "
			    "works properly. Otherwise, corruption will "
			    "result. ");
			char *xx = NULL; *xx = 0;
			return true;
		}
		// . sanity check?
		// . no, we must get again since we turn on endKey's last bit
		int32_t p1 , p2;
		maps[fn]->getPageRange ( m_fileStartKey , 
					m_endKey       , 
					&p1            , 
					&p2            ,
					NULL           );
		//if ( p1 != p1c || p2 != p2c ) {
		//	fprintf(stderr,"Msg3::bad page range\n");
		//	sleep(50000);
		//}
		// sanity check, each endpg's key should be > endKey
		//if ( p2 < maps[fn]->getNumPages() && 
		//     maps[fn]->getKey ( p2 ) <= m_endKey ) {
		//	fprintf(stderr,"Msg3::bad page range 2\n");
		//	sleep(50000);
		//}
		//#endif
		//int32_t p1 , p2; 
		//maps[fn]->getPageRange (startKey,endKey,minRecSizes,&p1,&p2);
		// now get some read info
		int64_t offset      = maps[fn]->getAbsoluteOffset ( p1 );
		int32_t      bytesToRead = maps[fn]->getRecSizes ( p1, p2, false);
		// max out the endkey for this list
		// debug msg
		//#ifdef _DEBUG_		
		//if ( minRecSizes == 2000000 ) 
		//log("Msg3:: reading %" PRId32" bytes from file #%" PRId32,bytesToRead,i);
		//#endif
		// inc our m_numScans
		m_numScansStarted++;
		// . keep stats on our disk accesses
		// . count disk seeks (assuming no fragmentation)
		// . count disk bytes read
		if ( bytesToRead > 0 ) {
			base->m_rdb->didSeek (             );
			base->m_rdb->didRead ( bytesToRead );
		}
		// . the startKey may be different for each RdbScan class
		// . RdbLists must have all keys within their [startKey,endKey]
		// . therefore set startKey individually from first page in map
		// . this endKey must be >= m_endKey 
		// . this startKey must be < m_startKey
		//key_t startKey = maps[fn]->getKey ( p1 );
		//key_t endKey   = maps[fn]->getKey ( p2 );
		char startKey2 [ MAX_KEY_BYTES ];
		char endKey2   [ MAX_KEY_BYTES ];
		maps[fn]->getKey ( p1 , startKey2 );
		maps[fn]->getKey ( p2 , endKey2 );
		//char *startKey = maps[fn]->getKeyPtr ( p1 );
		//char *endKey   = maps[fn]->getKeyPtr ( p2 );
		// store in here
		m_startpg [ i ] = p1;
		m_endpg   [ i ] = p2;

		// . we read UP TO that endKey, so reduce by 1
		// . but iff p2 is NOT the last page in the map/file
		// . maps[fn]->getKey(lastPage) will return the LAST KEY
		//   and maps[fn]->getOffset(lastPage) the length of the file
		//if ( maps[fn]->getNumPages()!=p2) endKey -=(uint32_t)1;
		if ( maps[fn]->getNumPages() != p2 ) KEYSUB(endKey2,m_ks);
		// otherwise, if we're reading all pages, then force the
		// endKey to virtually infinite
		//else endKey.setMax();
		else KEYMAX(endKey2,m_ks);

		// . set up the hints
		// . these are only used if we are only reading from 1 file
		// . these are used to call constrain() so we can constrain
		//   the end of the list w/o looping through all the recs
		//   in the list
		int32_t h2 = p2 ;
		// decrease by one page if we're on the last page
		if ( h2 > p1 && maps[fn]->getNumPages() == h2 ) h2--;
		// . decrease hint page until key is <= endKey on that page
		//   AND offset is NOT -1 because the old way would give
		//   us hints past the endkey
		// . also decrease so we can constrain on minRecSizes in
		//   case we're the only list being read
		// . use >= m_minRecSizes instead of >, otherwise we may
		//   never be able to set "size" in RdbList::constrain()
		//   because "p" could equal "maxPtr" right away
		while ( h2 > p1 && 
			//( maps[fn]->getKey   (h2) > m_constrainKey ||
		      (KEYCMP(maps[fn]->getKeyPtr(h2),m_constrainKey,m_ks)>0||
			  maps[fn]->getOffset(h2) == -1            ||
			  maps[fn]->getAbsoluteOffset(h2) - offset >=
			  m_minRecSizes ) )
			h2--;
		// now set the hint
		m_hintOffsets [ i ] = maps[fn]->getAbsoluteOffset ( h2 ) -
			              maps[fn]->getAbsoluteOffset ( p1 ) ;
		//m_hintKeys    [ i ] = maps[fn]->getKey            ( h2 );
		KEYSET(&m_hintKeys[i*m_ks],maps[fn]->getKeyPtr(h2),m_ks);

		// reset g_errno before calling setRead()
		g_errno = 0;
		// . this fix is now in RdbList::checklist_r()
		// . we can now have dup keys, so, we may read in
		//   a rec with key "lastMinKey" even though we don't read
		//   in the first key on the end page, so don't subtract 1...
		//if ( endKey != m_endKeyOrig ) 
		//	endKey += (uint32_t) 1;

		// timing debug
		if ( g_conf.m_logTimingDb )
			log(LOG_TIMING,
			    "net: msg: reading %" PRId32" bytes from %s file #%" PRId32" "
			     "(niceness=%" PRId32")",
			     bytesToRead,base->m_dbname,i,m_niceness);

		// log huge reads, those hurt us
		if ( bytesToRead > 150000000 ) {
			logf(LOG_INFO,"disk: Reading %" PRId32" bytes at offset %" PRId64" "
			    "from %s.",
			    bytesToRead,offset,base->m_dbname);
		}

		// if any keys in the map are the same report corruption
		char tmpKey    [16];
		char lastTmpKey[16];
		int32_t ccount = 0;
		if ( bytesToRead     > 10000000      && 
		     bytesToRead / 2 > m_minRecSizes &&
		     base->m_fixedDataSize >= 0        ) {
			for ( int32_t pn = p1 ; pn <= p2 ; pn++ ) {
				maps[fn]->getKey ( pn , tmpKey );
				// lastTmpKey is unset on the first page, so
				// only compare from the second page onward
				if ( pn > p1 &&
				     KEYCMP(tmpKey,lastTmpKey,m_ks) == 0 ) 
					ccount++;
				gbmemcpy(lastTmpKey,tmpKey,m_ks);
			}
		}
		if ( ccount > 10 ) {
			logf(LOG_INFO,"disk: Reading %" PRId32" bytes from %s file #"
			     "%" PRId32" when min "
			     "required is %" PRId32". Map is corrupt and has %" PRId32" "
			     "identical consecutive page keys because the "
			     "map was \"repaired\" because out of order keys "
			     "in the index.",
			     (int32_t)bytesToRead,
			     base->m_dbname,fn,
			     (int32_t)m_minRecSizes,
			     (int32_t)ccount);
			m_numScansCompleted++;
			m_errno = ECORRUPTDATA;
			m_hadCorruption = true;
			//m_maxRetries = 0;
			break;
		}

		////////
		//
		// try to get from PAGE CACHE
		//
		////////
		BigFile *ff = base->getFile(m_fileNums[i]);
		RdbCache *rpc = getDiskPageCache ( m_rdbId );
		if ( ! m_allowPageCache ) rpc = NULL;
		// . vfd is unique 64 bit file id
	// . if file is not yet opened vfd is -1; it only gets set in open()
		int64_t vfd = ff->getVfd();
		key192_t ck = makeCacheKey ( vfd , offset, bytesToRead);
		char *rec; int32_t recSize;
		bool inCache = false;
		if ( rpc && vfd != -1 && ! m_validateCache ) 
			inCache = rpc->getRecord ( (collnum_t)0 , // collnum
						   (char *)&ck , 
						   &rec , 
						   &recSize ,
						   true , // copy?
						   -1 , // maxAge, none 
						   true ); // inccounts?
		m_scans[i].m_inPageCache = false;
		if ( inCache ) {
			m_scans[i].m_inPageCache = true;
			m_numScansCompleted++;
			// now we have to store this value, 6 or 12 so
			// we can modify the hint appropriately
			m_scans[i].m_shifted = *rec;
			m_lists[i].set ( rec +1,
					 recSize-1 ,
					 rec , // alloc
					 recSize , // allocSize
					 startKey2 ,
					 endKey2 ,
					 base->m_fixedDataSize ,
					 true , // owndata
					 base->useHalfKeys() ,
					 getKeySizeFromRdbId ( m_rdbId ) );
			continue;
		}

		// . do the scan/read of file #i
		// . this returns false if blocked, true otherwise
		// . this will set g_errno on error
		bool done = m_scans[i].setRead (base->getFile(m_fileNums[i]),
						base->m_fixedDataSize ,
						 offset                 ,
						 bytesToRead            ,
						 startKey2              ,
						 endKey2                ,
						m_ks                    ,
						 &m_lists[i]            ,
						 this                   ,
						 doneScanningWrapper    ,
						 base->useHalfKeys()    ,
						m_rdbId,
						 m_niceness             ,
						 m_allowPageCache       ,
						 m_hitDisk              ) ;
		// . damn, usually the above will indirectly launch a thread
		//   to do the reading, but it sets g_errno to EINTR,
		//   "interrupted system call"!
		// . i guess the thread does the read w/o blocking and then
		//   queues the signal on g_loop's queue before it exits
		// . try ignoring, and keep going
		if ( g_errno == EINTR ) {
			log("net: Interrupted system call while reading file. "
			    "Ignoring.");
			g_errno = 0;
		}
		// debug msg
		//fprintf(stderr,"Msg3:: reading %" PRId32" bytes from file #%" PRId32","
		//	"done=%" PRId32",offset=%" PRId64",g_errno=%s,"
		//	"startKey=n1=%" PRIu32",n0=%" PRIu64",  "
		//	"endKey=n1=%" PRIu32",n0=%" PRIu64"\n",
		//	bytesToRead,i,(int32_t)done,offset,mstrerror(g_errno),
		//	m_startKey,m_endKey);
		//if ( bytesToRead == 0 )
		//	fprintf(stderr,"shit\n");
		// if it did not block then it completed, so count it
		if ( done ) m_numScansCompleted++;
		// break on an error, and remember g_errno in case we block
		if ( g_errno && g_errno != ENOTHREADSLOTS ) { 
			int32_t tt = LOG_WARN;
			if ( g_errno == EFILECLOSED ) tt = LOG_INFO;
			log(tt,"disk: Reading %s had error: %s.",
			    base->m_dbname, mstrerror(g_errno));
			m_errno = g_errno; 
			break; 
		}
	}
	// debug test
	//if ( rand() % 100 <= 10 ) m_errno = EIO;

	// if we blocked, return false
	if ( m_numScansCompleted < m_numScansStarted ) return false;
	// . if all scans completed without blocking then wrap it up & ret true
	// . doneScanning may now block if it finds data corruption and must
	//   get the list remotely
	return doneScanning();
}
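For reference, here is a hypothetical caller sketch of Msg3::readList() based only on the signature and return convention shown above. The wrapper and helper names, the key-buffer setup and the parameter values chosen are illustrative assumptions, not taken from the original source.

// illustrative callback fired once all scans complete (hypothetical name)
static void gotDiskListWrapper ( void *state ) {
	Msg3 *msg3 = (Msg3 *)state;
	if ( msg3->m_errno ) return; // read failed after retries
	// msg3->getList(i) for i < msg3->getNumLists() now hold the lists
}

// hypothetical caller: full-range titledb read over all files
// returns false if the read blocked, true otherwise (g_errno set on error)
static bool startTitledbRead ( Msg3 *msg3 , collnum_t collnum ) {
	char startKey [ MAX_KEY_BYTES ];
	char endKey   [ MAX_KEY_BYTES ];
	char ks = getKeySizeFromRdbId ( RDB_TITLEDB );
	KEYMIN ( startKey , ks );
	KEYMAX ( endKey   , ks );
	return msg3->readList ( RDB_TITLEDB        ,
				collnum            ,
				startKey           ,
				endKey             ,
				1024*1024          , // minRecSizes (~1MB)
				0                  , // startFileNum
				-1                 , // numFiles, -1 = all
				msg3               , // state for callback
				gotDiskListWrapper ,
				MAX_NICENESS       , // niceness
				0                  , // retryNum
				-1                 , // maxRetries, use default
				true               , // compensateForMerge
				false              , // justGetEndKey
				true               , // allowPageCache
				true               ); // hitDisk
}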
bool RdbMerge::getAnotherList ( ) {
	log(LOG_DEBUG,"db: Getting another list for merge.");
	// clear it up in case it was already set
	g_errno = 0;
	// get base, returns NULL and sets g_errno to ENOCOLLREC on error
	RdbBase *base; if (!(base=getRdbBase(m_rdbId,m_collnum))) return true;
	// if merging titledb files, we must adjust m_endKey so we do
	// not have to read a huge 200MB+ tfndb list
	//key_t newEndKey = m_endKey;
	char newEndKey[MAX_KEY_BYTES];
	KEYSET(newEndKey,m_endKey,m_ks);

	//CollectionRec *cr = g_collectiondb.getRec ( m_collnum );
	//char *coll = cr->m_coll;

	/*
	if ( m_rdbId == RDB_TITLEDB ) { // && m_rdbId == RDB_TFNDB ) {
		//long long docId1 = g_titledb.getDocIdFromKey ( m_startKey );
	       long long docId1=g_titledb.getDocIdFromKey((key_t *)m_startKey);
		//long long docId2 = g_titledb.getDocIdFromKey ( m_endKey );
		// tfndb is pretty much uniformly distributed
		RdbBase *ubase = getRdbBase(RDB_TFNDB,m_coll);
		if ( ! ubase ) return true;
		long long space    = ubase->getDiskSpaceUsed();
		//long long readSize = (space * (docId2-docId1)) / DOCID_MASK;
		long long bufSize  = g_conf.m_mergeBufSize;
		// for now force to 100k
		bufSize = 100000;
		if ( bufSize > space ) bufSize = space;
		long long docId3   = (long long) (((double)bufSize /
						  (double)space) *
			(double)DOCID_MASK  + docId1);
		// constrain newEndKey based on docId3
		if ( docId3 < 0 ) docId3 = DOCID_MASK;
		//if ( docId3 >= DOCID_MASK ) newEndKey.setMax();
		if ( docId3 >= DOCID_MASK ) KEYMAX(newEndKey,m_ks);
		//else newEndKey = g_titledb.makeLastKey ( docId3 );
		else {
			key_t nk = g_titledb.makeLastKey(docId3);
			KEYSET(newEndKey,(char *)&nk,m_ks);
		}
		//log(LOG_DEBUG,"build: remapping endkey from %lx.%llx to "
		//    "%lx.%llx to avoid big tfndb read.",
		//    m_endKey.n1,m_endKey.n0, newEndKey.n1,newEndKey.n0);
		log(LOG_DEBUG,"build: remapping endkey from %llx.%llx to "
		    "%llx.%llx to avoid big tfndb read.",
		    KEY1(m_endKey,m_ks),KEY0(m_endKey),
		    KEY1(newEndKey,m_ks),KEY0(newEndKey));
	}
	*/
	// . this returns false if blocked, true otherwise
	// . sets g_errno on error
	// . we return false if it blocked
	// . m_maxBufSize may be exceeded by a rec, it's just a target size
	// . niceness is usually MAX_NICENESS, but reindex.cpp sets to 0
	// . this was a call to Msg3, but i made it call Msg5 since
	//   we now do the merging in Msg5, not in msg3 anymore
	// . this will now handle truncation, dup and neg rec removal
	// . it remembers last termId and count so it can truncate even when
	//   IndexList is split between successive reads
	// . IMPORTANT: when merging titledb we could be merging about 255
	//   files, so if we are limited to only X fds it can have a cascade
	//   effect where reading from one file closes the fd of another file
	//   in the read (since we call open before spawning the read thread)
	//   and can therefore take 255 retries for the Msg3 to complete 
	//   because each read gives an EFILECLOSED error.
	//   so to fix it we allow one retry for each file in the read plus
	//   the original retry of 25
	long nn = base->getNumFiles();
	if ( m_numFiles > 0 && m_numFiles < nn ) nn = m_numFiles;
	// don't access any biased page caches
	bool usePageCache = true;
	if ( m_rdbId == RDB_CLUSTERDB )
		usePageCache = false;
	// . i don't trust page cache too much (mdw)... well, give it a shot
	// . see if ths helps fix WD corruption... i doubt it
	usePageCache = false;
	// for now force to 100k
	long bufSize = 100000; // g_conf.m_mergeBufSize , // minRecSizes
	// get it
	return m_msg5.getList ( m_rdbId        ,
				m_collnum           ,
				&m_list        ,
				m_startKey     ,
				newEndKey      , // usually is maxed!
				bufSize        ,
				false          , // includeTree?
				false          , // add to cache?
				0              , // max cache age for lookup
				m_startFileNum , // startFileNum
				m_numFiles     ,
				this           , // state 
				gotListWrapper , // callback
				m_niceness     , // niceness
				true           , // do error correction?
				NULL           , // cache key ptr
				0              , // retry #
				nn + 75        , // max retries (make it high)
				false          , // compensate for merge?
				-1LL           , // sync point
				&m_msg5b       ,
				true           , // isRealMerge? absolutely!
				usePageCache   );
}
// . buffer is used for reading and writing
// . return false if blocked, true otherwise
// . sets g_errno on error
// . if niceness is 0 merge will block, otherwise will not block
// . we now use niceness of 1 which should spawn threads that don't allow
//   niceness 2 threads to launch while they're running
// . spider process now uses mostly niceness 2 
// . we need the merge to take priority over spider processes on disk otherwise
//   there's too much contention from spider lookups on disk for the merge
//   to finish in a decent amount of time and we end up getting too many files!
bool RdbMerge::merge ( char     rdbId        ,
		       //char    *coll         , //RdbBase *base         , 
		       collnum_t collnum,
		       BigFile *target       , 
		       RdbMap  *targetMap    ,
		       long     id2          , // target's secondary id
		       long     startFileNum , 
		       long     numFiles     ,
		       long     niceness     ,
		       class DiskPageCache *pc   ,
		       long long maxTargetFileSize ,
		       char     keySize      ) {
	// reset ourselves
	reset();
	// set it
	m_rdbId = rdbId;
	Rdb *rdb = getRdbFromId ( rdbId );
	// get base, returns NULL and sets g_errno to ENOCOLLREC on error
	RdbBase *base; if (!(base=getRdbBase(m_rdbId,collnum))) return true;
	// don't breach the max
	//if ( numFiles > m_maxFilesToMerge ) numFiles = m_maxFilesToMerge;
	// reset this map! it's m_crcs needs to be reset
	//targetMap->reset();
	// remember some parms
	//if ( ! coll && rdb->m_isCollectionLess )
	//	strcpy ( m_coll , rdb->m_dbname );
	//else
	//	strcpy ( m_coll , coll );

	m_collnum = collnum;
	if ( rdb->m_isCollectionLess ) m_collnum = 0;

	m_target          = target;
	m_targetMap       = targetMap;
	m_id2             = id2;
	m_startFileNum    = startFileNum;
	m_numFiles        = numFiles;
	m_dedup           = base->m_dedup;
	m_fixedDataSize   = base->m_fixedDataSize;
	m_niceness        = niceness;
	m_pc              = pc;
	m_maxTargetFileSize = maxTargetFileSize;
	m_doneMerging     = false;
	m_ks              = keySize;
	// . set the key range we want to retrieve from the files
	// . just get from the files, not tree (not cache?)
	//m_startKey.setMin();
	//m_endKey.setMax();
	KEYMIN(m_startKey,m_ks);
	KEYMAX(m_endKey,m_ks);
	// if we're resuming a killed merge, set m_startKey to last
	// key the map knows about.
	// the dump will start dumping at the end of the targetMap's data file.
	if ( m_targetMap->getNumRecs() > 0 ) {
		log(LOG_INIT,"db: Resuming a killed merge.");
		//m_startKey = m_targetMap->getLastKey();
		m_targetMap->getLastKey(m_startKey);
		//m_startKey += (unsigned long) 1;
		KEYADD(m_startKey,1,m_ks);
		// if power goes out and we are not doing synchronous writes
		// then we could have completely lost some data and unlinked
		// a part file from the file being merged, so that the data is
		// gone. to be able to resume merging, we must increment the
		// startKey until it references a valid offset in all the 
		// files being merged. invalid offsets will reference parts 
		// that have been chopped.
		/*
		RdbMap  **maps  = rdb->getMaps();
		BigFile **files = rdb->getFiles();
		for ( long i=m_startFileNum;i<m_startFileNum+m_numFiles;i++){
			long long minOff = 0LL;
			long k = 0;
			while ( k < files[i]->m_maxParts &&
				!   files[i]->m_files[k]    ) {
				k++;
				minOff += MAX_PART_SIZE;
			}
			long pn0 = maps[i]->getPage ( m_startKey );
			long pn  = pn0;
			while ( maps[i]->getAbsoluteOffset(pn) < minOff ) pn++;
			if ( pn != pn0 ) {
				log("db: Lost data during merge. Starting "
				    "merge at page number %li from %li for "
				    "file.",pn,pn0);
				m_startKey = maps[i]->getKey ( pn );
			}
		}
		*/
	}
	// free our list's memory, just in case
	//m_list.freeList();
	// . we may have multiple hosts running on the same cpu/hardDrive
	// . therefore, to maximize disk space, we should only have 1 merge
	//   at a time going on between these hosts
	// . now tfndb has own merge class since titledb merge writes url recs
	/*
	if ( s_isMergeLocked ) {
		//log("RdbMerge::merge: someone else merging sleeping.");
		log("RdbMerge::merge: someone else merging. bad engineer.");
		return false;
		// if it fails then sleep until it works
		//returng_loop.registerSleepCallback(5000,this,getLockWrapper);
	}
	*/
	return gotLock();
}
// . return false if blocked, true otherwise
// . sets g_errno on error
bool RdbMerge::getNextList ( ) {
	// return true if g_errno is set
	if ( g_errno || m_doneMerging ) return true;
	// it's suspended so we count this as blocking
	if ( m_isSuspended ) {
		m_isReadyToSave = true;
		return false;
	}
	// if the power is off, suspend the merging
	if ( ! g_process.m_powerIsOn ) {
		m_isReadyToSave = true;
		doSleep();
		return false;
	}
	// no chop threads
	m_numThreads = 0;
	// get base, returns NULL and sets g_errno to ENOCOLLREC on error
	RdbBase *base; if (!(base=getRdbBase(m_rdbId,m_collnum))) return true;
	// . if a contributor has just surpassed a "part" in his BigFile
	//   then we can delete that part from the BigFile and the map
	for ( long i = m_startFileNum ; i < m_startFileNum + m_numFiles; i++ ){
		RdbMap    *map    = base->m_maps[i];
		long       page   = map->getPage ( m_startKey );
		long long  offset = map->getAbsoluteOffset ( page );
		BigFile   *file   = base->m_files[i];
		long       part   = file->getPartNum ( offset ) ;
		if ( part == 0 ) continue;
		// i've seen this bug happen if we chop a part off on our
		// last dump and the merge never completes for some reason...
		// so if we're in the last part then don't chop the part b4 us
		if ( part >= file->m_maxParts - 1 ) continue;
		// if we already unlinked part # (part-1) then continue
		if ( ! file->doesPartExist ( part - 1 ) ) continue;
		// . otherwise, excise from the map
		// . we must be able to chop the mapped segments corresponding
		//   EXACTLY to the part file
		// . therefore, PAGES_PER_SEGMENT define'd in RdbMap.h must
		//   evenly divide MAX_PART_SIZE in BigFile.h
		// . i do this check in RdbMap.cpp
		if ( ! map->chopHead ( MAX_PART_SIZE ) ) {
			// we had an error!
			log("db: Failed to remove data from map for "
			    "%s.part%li.",
			    file->getFilename(),part);
			return true;
		}
		// . also, unlink any part files BELOW part # "part"
		// . this returns false if it blocked, true otherwise
		// . this sets g_errno on error
		// . now we just unlink part file #(part-1) explicitly
		if ( ! file->chopHead ( part - 1 , chopWrapper , this ) ) 
			m_numThreads++;
		if ( ! g_errno ) continue;
		log("db: Failed to unlink file %s.part%li.",
		    file->getFilename(),part);
		return true;
	}
	// wait for file to be unlinked before getting list
	if ( m_numThreads > 0 ) return false;
	// otherwise, get it now
	return getAnotherList ( );
}
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool RdbMerge::gotLock ( ) {
	// get total recSizes of files we're merging
	//long totalSize = 0;
	//for ( long i=m_startFileNum ; i < m_startFileNum + m_numFiles ; i++ )
	//totalSize += m_base->m_files[i]->getSize();
	// . grow the map now so it doesn't have to keep growing dynamically
	//   which wastes memory
	// . setMapSize() returns false and sets g_errno on error
	// . we return true if it had an error
	//if ( ! m_targetMap->setMapSizeFromFileSize ( totalSize ) ) {
	//log("RdbMerge::getLockFile: targetMap setMapSize failed");
	//return true;
	//}

	// . get last mapped offset
	// . this may actually be smaller than the file's actual size
	//   but the excess is not in the map, so we need to do it again
	long long startOffset = m_targetMap->getFileSize();

	// if startOffset is > 0 use the last key as RdbDump:m_prevLastKey
	// so it can compress the next key it dumps provided m_useHalfKeys
	// is true (key compression) and the next key has the same top 6 bytes
	// as m_prevLastKey
	//key_t prevLastKey;
	//if ( startOffset > 0 ) prevLastKey = m_targetMap->getLastKey();
	//else                   prevLastKey.setMin();
	char prevLastKey[MAX_KEY_BYTES];
	if ( startOffset > 0 ) m_targetMap->getLastKey(prevLastKey);
	else                   KEYMIN(prevLastKey,m_ks);

	// get base, returns NULL and sets g_errno to ENOCOLLREC on error
	RdbBase *base; if (!(base=getRdbBase(m_rdbId,m_collnum))) return true;

	// . set up a file to dump the records into
	// . returns false and sets g_errno on error
	// . this will open m_target as O_RDWR | O_NONBLOCK | O_ASYNC ...
	m_dump.set ( m_collnum          ,
		     m_target           ,
		     m_id2              ,
		     //m_startFileNum - 1 , // merge fileNum in Rdb::m_files[]
		     (m_rdbId == RDB_TITLEDB||m_rdbId== RDB2_TITLEDB2) ,
		     NULL         , // buckets to dump is NULL, we call dumpList
		     NULL         , // tree to dump is NULL, we call dumpList
		     m_targetMap  ,
		     NULL         , // for caching dumped tree
		     0            , // m_maxBufSize. not needed if no tree! 
		     true         , // orderedDump?
		     m_dedup      ,
		     m_niceness   , // niceness of dump
		     this         , // state
		     dumpListWrapper ,
		     base->useHalfKeys() ,
		     startOffset  ,
		     prevLastKey  ,
		     m_ks         ,
		     m_pc         ,
		     m_maxTargetFileSize ,
		     NULL                ); // set m_base::m_needsToSave? no.
	// what kind of error?
	if ( g_errno ) {
		log("db: gotLock: %s.", mstrerror(g_errno) );
		return true;
	}
	// . create a new msg3
	// . don't keep static because it contains a msg3, treeList & diskList
	// . these can take up  many megs of mem
	// . yes, but we need to avoid fragmentation, so hold on to our mem!
	//m_msg3 = new (Msg3);
	//if ( ! m_msg3 ) return false;
	// we're now merging since the dump was set up successfully
	m_isMerging     = true;
	// make it suspended for now
	m_isSuspended   = true;
	// grab the lock
	//s_isMergeLocked = true;
	// . this unsuspends it
	// . this returns false on error and sets g_errno
	// . it returns true if blocked or merge completed successfully
	return resumeMerge ( );
}
void handleRequest22 ( UdpSlot *slot , long netnice ) {
	// shortcut
	UdpServer *us = &g_udpServer;
	// get the request
	Msg22Request *r = (Msg22Request *)slot->m_readBuf;
       // get this
	//char *coll = g_collectiondb.getCollName ( r->m_collnum );

	// sanity check
	long  requestSize = slot->m_readBufSize;
	if ( requestSize < r->getMinSize() ) {
		log("db: Got bad request size of %li bytes for title record. "
		    "Need at least 28.",  requestSize );
		us->sendErrorReply ( slot , EBADREQUESTSIZE );
		return;
	}

	// get base, returns NULL and sets g_errno to ENOCOLLREC on error
	RdbBase *tbase; 
	if ( ! (tbase=getRdbBase(RDB_TITLEDB,r->m_collnum) ) ) {
		log("db: Could not get title rec in collection # %li "
		    "because rdbbase is null.",
		    (long)r->m_collnum);
		g_errno = EBADENGINEER;
		us->sendErrorReply ( slot , g_errno ); 
		return; 
	}

	// overwrite what is in there so niceness conversion algo works
	r->m_niceness = netnice;

	// if just checking tfndb, do not do the cache lookup in clusterdb
	if ( r->m_justCheckTfndb ) r->m_maxCacheAge = 0;

	// keep track of stats
	//if    (r->m_justCheckTfndb)
	//       g_tfndb.getRdb()->readRequestGet(requestSize);
	//      else    
       g_titledb.getRdb()->readRequestGet  (requestSize);

       // breathe
       QUICKPOLL ( r->m_niceness);

       // sanity check
       if ( r->m_collnum < 0 ) { char *xx=NULL;*xx=0; }


       // make the state now
       State22 *st ;
       try { st = new (State22); }
       catch ( ... ) {
	       g_errno = ENOMEM;
	       log("query: Msg22: new(%i): %s", sizeof(State22),
		   mstrerror(g_errno));
	       us->sendErrorReply ( slot , g_errno );
	       return;
       }
       mnew ( st , sizeof(State22) , "Msg22" );
       
       // store ptr to the msg22request
       st->m_r = r;
       // save for sending back reply
       st->m_slot = slot;

       // then tell slot not to free it since m_r references it!
       // so we'll have to free it when we destroy State22
       st->m_slotAllocSize = slot->m_readBufMaxSize;
       st->m_slotReadBuf   = slot->m_readBuf;
       slot->m_readBuf = NULL;

       // . make the keys for getting recs from tfndb
       // . url recs map docid to the title file # that contains the titleRec
       //key_t uk1 ;
       //key_t uk2 ;
       // . if docId was explicitly specified...
       // . we may get multiple tfndb recs
       if ( ! r->m_url[0] ) {
	       // there are no del bits in tfndb
	       //uk1 = g_tfndb.makeMinKey ( r->m_docId );
	       //uk2 = g_tfndb.makeMaxKey ( r->m_docId );
	       st->m_docId1 = r->m_docId;
	       st->m_docId2 = r->m_docId;
       }

       // but if we are requesting an available docid, it might be taken
       // so try the range
       if ( r->m_getAvailDocIdOnly ) {
	       long long pd = r->m_docId;
	       long long d1 = g_titledb.getFirstProbableDocId ( pd );
	       long long d2 = g_titledb.getLastProbableDocId  ( pd );
	       // sanity - bad url with bad subdomain?
	       if ( pd < d1 || pd > d2 ) { char *xx=NULL;*xx=0; }
	       // make sure we get a decent sample in titledb then in 
	       // case the docid we wanted is not available
	       st->m_docId1 = d1;
	       st->m_docId2 = d2;
       }

       // . otherwise, url was given, like from Msg15
       // . we may get multiple tfndb recs
       if ( r->m_url[0] ) {
	       long  dlen = 0;
	       // this causes ip based urls to be inconsistent with the call
	       // to getProbableDocId(url) below
	       char *dom  = getDomFast ( r->m_url , &dlen );
	       // bogus url?
	       if ( ! dom ) {
		       log("msg22: got bad url in request: %s",r->m_url);
		       g_errno = EBADURL;
		       us->sendErrorReply ( slot , g_errno ); 
		       mdelete ( st , sizeof(State22) , "Msg22" );
		       delete ( st );
		       return; 
	       }
	       long long pd = g_titledb.getProbableDocId (r->m_url,dom,dlen);
	       long long d1 = g_titledb.getFirstProbableDocId ( pd );
	       long long d2 = g_titledb.getLastProbableDocId  ( pd );
	       // sanity - bad url with bad subdomain?
	       if ( pd < d1 || pd > d2 ) { char *xx=NULL;*xx=0; }
	       // there are no del bits in tfndb
	       //uk1 = g_tfndb.makeMinKey ( d1 );
	       //uk2 = g_tfndb.makeMaxKey ( d2 );
	       // store these
	       st->m_pd     = pd;
	       st->m_docId1 = d1;
	       st->m_docId2 = d2;
	       st->m_uh48   = hash64b ( r->m_url ) & 0x0000ffffffffffffLL;
       }
       
       QUICKPOLL ( r->m_niceness );
       
       /*
       // shortcut
       Rdb *tdb = g_titledb.getRdb();

       // init this
       st->m_tfn2 = -1;
       // skip tfndb lookup if we can. saves some time.
       if ( g_conf.m_readOnlyMode && 
	    // must not be a *url* lookup, it must be a docid lookup
	    ! r->m_url[0] &&
	    // tree must be empty too i guess
	    tdb->getTree()->getNumUsedNodes() ==0 ) {
	       // the RdbBase contains the BigFiles for tfndb
	       RdbBase *base = tdb->m_bases[r->m_collnum];
	       // can only have one titledb file
	       if ( base->getNumFiles() == 1 ) {
		       // now we can get RdbBase
		       st->m_tfn2 = base->m_fileIds2[0];
		       // sanity check
		       if ( st->m_tfn2 < 0 ) { char *xx = NULL; *xx = 0; }
	       }
       }

       // check the tree for this docid
       RdbTree *tt = tdb->getTree();
       // make titledb keys
       key_t startKey = g_titledb.makeFirstKey ( st->m_docId1 );
       key_t endKey   = g_titledb.makeLastKey  ( st->m_docId2 );
       long  n        = tt->getNextNode ( r->m_collnum , startKey );
       
       // there should only be one match, one titlerec per docid!
       for ( ; n >= 0 ; n = tt->getNextNode ( n ) ) {
	       // break if collnum does not match. we exceeded our tree range.
	       if ( tt->getCollnum ( n ) != r->m_collnum ) break;
	       // get the key of this node
	       key_t k = *(key_t *)tt->getKey(n);
	       // if passed limit, break out, no match
	       if ( k > endKey ) break;
	       // if we had a url make sure uh48 matches
	       if ( r->m_url[0] ) {
		       // get it
		       long long uh48 = g_titledb.getUrlHash48(&k);
		       // sanity check
		       if ( st->m_uh48 == 0 ) { char *xx=NULL;*xx=0; }
		       // we must match this exactly
		       if ( uh48 != st->m_uh48 ) continue;
	       }
	       // . if we matched a negative key, then skip
	       // . just break out here and enter the normal logic
	       // . it should load tfndb and find that it is not in tfndb
	       //   because when you add a negative key to titledb in
	       //   Rdb::addList, it adds a negative rec to tfndb immediately
	       // . NO! because we add the negative key to the tree when we
	       //   delete the old titledb rec, then we add the new one!
	       //   when a negative key is added Rdb::addRecord() removes
	       //   the positive key (and vice versa) from the tree.
	       if ( KEYNEG((char *)&k) ) continue;
	       // if just checking for its existence, we are done
	       if ( r->m_justCheckTfndb ) {
		       us->sendReply_ass ( NULL,0,NULL,0,slot);
		       // don't forget to free the state
		       mdelete ( st , sizeof(State22) , "Msg22" );
		       delete ( st );
		       return;
	       }
	       // ok, we got a match, return it
	       char *data     = tt->getData     ( n );
	       long  dataSize = tt->getDataSize ( n );
	       // weird!
	       if ( dataSize == 0 ) { char *xx=NULL;*xx=0; }
	       // send the whole rec back
	       long need = 12 + 4 + dataSize;
	       // will this copy it? not!
	       char *buf = (char *)mmalloc ( need , "msg22t" );
	       if ( ! buf ) {
		       us->sendErrorReply ( slot , g_errno ); 
		       mdelete ( st , sizeof(State22) , "Msg22" );
		       delete ( st );
		       return; 
	       }
	       // log it
	       if ( g_conf.m_logDebugSpider )
		       logf(LOG_DEBUG,"spider: found %s in titledb tree",
			    r->m_url);
	       // store in the buf for sending
	       char *p = buf;
	       // store key
	       *(key_t *)p = k; p += sizeof(key_t);
	       // then dataSize
	       *(long *)p = dataSize; p += 4;
	       // then the data
	       memcpy ( p , data , dataSize ); p += dataSize;
	       // send off the record
	       us->sendReply_ass (buf, need,buf, need,slot);
	       // don't forget to free the state
	       mdelete ( st , sizeof(State22) , "Msg22" );
	       delete ( st );
	       return;
       }

       // if we did not need to consult tfndb cuz we only have one file
       if ( st->m_tfn2 >= 0 ) {
	       gotUrlListWrapper ( st , NULL , NULL );
	       return;
       }

       // . get the list of url recs for this docid range
       // . this should not block, tfndb SHOULD all be in memory all the time
       // . use 500 million for min recsizes to get all in range
       // . no, using 500MB causes problems for RdbTree::getList, so use
       //   100k. how many recs can there be?
       if ( ! st->m_msg5.getList ( RDB_TFNDB         ,
				   coll              ,
				   &st->m_ulist      ,
				   uk1               , // startKey
				   uk2               , // endKey
				   // use 0x7fffffff precisely because it
				   // will determine exactly how long the
				   // tree list needs to allocate in Msg5.cpp
				   0x7fffffff        , // minRecSizes
				   true              , // includeTree?
				   false             , // addToCache?
				   0                 , // max cache age
				   0                 , // startFileNum
				   -1                , // numFiles (-1 =all)
				   st                ,
				   gotUrlListWrapper ,
				   r->m_niceness     ,
				   true              ))// error correction?
	       return ;
       // we did not block
       gotUrlListWrapper ( st , NULL , NULL );
}

static void gotTitleList ( void *state , RdbList *list , Msg5 *msg5 ) ;

void gotUrlListWrapper ( void *state , RdbList *list , Msg5 *msg5 ) {
	// shortcuts
	State22   *st = (State22 *)state;
	UdpServer *us = &g_udpServer;

	// bail on error
	if ( g_errno ) { 
		log("db: Had error getting info from tfndb: %s.",
		    mstrerror(g_errno));
		log("db: uk1.n1=%li n0=%lli uk2.n1=%li n0=%lli "
		    "d1=%lli d2=%lli.",
		    ((key_t *)st->m_msg5.m_startKey)->n1 ,
		    ((key_t *)st->m_msg5.m_startKey)->n0 ,
		    ((key_t *)st->m_msg5.m_endKey)->n1   ,
		    ((key_t *)st->m_msg5.m_endKey)->n0   ,
		    st->m_docId1 ,
		    st->m_docId2 );
		us->sendErrorReply ( st->m_slot , g_errno ); 
		mdelete ( st , sizeof(State22) , "Msg22" );
		delete ( st );
		return;
	}

	// shortcuts
	RdbList      *ulist = &st->m_ulist;
	Msg22Request *r     = st->m_r;
	char         *coll  = g_collectiondb.getCollName ( r->m_collnum );

	// point to top just in case
	ulist->resetListPtr();

	// get base, returns NULL and sets g_errno to ENOCOLLREC on error
	RdbBase *tbase = getRdbBase(RDB_TITLEDB,coll);

	// set probable docid
	long long pd = 0LL;
	if ( r->m_url[0] ) {
		pd = g_titledb.getProbableDocId(r->m_url);
		// sanity
		if ( pd != st->m_pd ) { char *xx=NULL;*xx=0; }
	}

	// . these are both meant to be available docids
	// . if ad2 gets exhausted we use ad1
	long long ad1 = st->m_docId1;
	long long ad2 = pd;


	long tfn = -1;
	// sanity check. make sure did not load from tfndb if did not need to
	if ( ! ulist->isExhausted() && st->m_tfn2 >= 0 ) {char *xx=NULL;*xx=0;}
	// if only one titledb file and none in memory use it
	if ( st->m_tfn2 >= 0 ) tfn = st->m_tfn2;

	// we may have multiple tfndb recs but we should NEVER have to read
	// multiple titledb files...
	for ( ; ! ulist->isExhausted() ; ulist->skipCurrentRecord() ) {
		// breathe
		QUICKPOLL ( r->m_niceness );
		// get first rec
		key_t k = ulist->getCurrentKey();
		// . skip negative keys
		// . seems to happen when we have tfndb in the tree...
		if ( KEYNEG((char *)&k) ) continue;

		// if we have a url and no docid, we gotta check uh48!
		if ( r->m_url[0] && g_tfndb.getUrlHash48(&k)!=st->m_uh48){
			// get docid of that guy
			long long dd = g_tfndb.getDocId(&k);
			// if matches avail docid, inc it
			if ( dd == ad1 ) ad1++;
			if ( dd == ad2 ) ad2++;
			// try next tfndb key
			continue;
		}
		// . get file num this rec is stored in
		// . this is updated right after the file num is merged by
		//   scanning all records in tfndb. this is very quick if all
		//   of tfndb is in memory, otherwise, it might take a few
		//   seconds. update call done in RdbMerge::incorporateMerge().
		tfn = g_tfndb.getTfn ( &k );
		// i guess we got a good match!
		break;
	}

	// sanity check. 255 used to mean in spiderdb or in tree
	if ( tfn >= 255 ) { char *xx=NULL;*xx=0; }


	// maybe no available docid if we breached our range
	if ( ad1 >= pd           ) ad1 = 0LL;
	if ( ad2 >  st->m_docId2 ) ad2 = 0LL;
	// get best
	long long ad = ad2;
	// but wrap around if we need to
	if ( ad == 0LL ) ad = ad1;

	// breathe
	QUICKPOLL ( r->m_niceness);

	// . log if different
	// . if our url rec was in there, this could still be different
	//   if there was another url rec in there with the same docid and
	//   a different extension, but with a tfn of 255, meaning that it
	//   is just in spiderdb and not in titledb yet. so it hasn't been
	//   assigned a permanent docid...
	// . another way "ad" may be different now is from the old bug which
	//   did not chain the docid properly because it limited the docid
	//   chaining to one titleRec file. so conceivably we can have 
	//   different docs sharing the same docids, but with different 
	//   url hash extensions. for instance, on host #9 we have:
	//   00f3b2ff63aec3a9 docId=261670033643 e=0x58 tfn=117 clean=0 half=0 
	//   00f3b2ff63af66c9 docId=261670033643 e=0x6c tfn=217 clean=0 half=0 
	// . Msg16 will only use the avail docid if the titleRec is not found
	if ( r->m_url[0] && pd != ad ) {
		//log(LOG_INFO,"build: Docid %lli collided. %s Changing "
		//
		// http://www.airliegardens.org/events.asp?dt=2&date=8/5/2011
		//
		// COLLIDES WITH
		//
		// http://www.bbonline.com/i/chicago.html
		//
		// collision alert!
		log("spider: Docid %lli collided. %s Changing "
		    "to %lli.", r->m_docId , r->m_url , ad );
		// debug this for now
		//char *xx=NULL;*xx=0; 
	}

	// remember it
	st->m_availDocId = ad;

	// if tfn is -1 then it was not in titledb
	if ( tfn == -1 ) { 
		// store docid in reply
		char *p = st->m_slot->m_tmpBuf;
		// send back the available docid
		*(long long *)p = ad;
		// send it
		us->sendReply_ass ( p , 8 , p , 8 , st->m_slot );
		// don't forget to free state
		mdelete ( st , sizeof(State22) , "Msg22" );
		delete ( st );
		return;
	}

	// sanity
	if ( tfn < 0 ) { char *xx=NULL;*xx=0; }

	// breathe
	QUICKPOLL ( r->m_niceness );

	// ok, if just "checking tfndb" no need to go further
	if ( r->m_justCheckTfndb ) {
		// send back a good reply (empty means found!)
		us->sendReply_ass ( NULL,0,NULL,0,st->m_slot);
		// don't forget to free the state
		mdelete ( st , sizeof(State22) , "Msg22" );
		delete ( st );
		return;
	}

	// . compute the file scan range
	// . tfn is now equivalent to Rdb's id2, a secondary file id, it
	//   follows the hyphen in "titledb0001-023.dat"
	// . default to just scan the root file AND the tree, cuz we're
	//   assuming restrictToRoot was set to true so we did not get a tfndb
	//   list
	// . even if a file number is given, always check the tree in case
	//   it got re-spidered
	// . shit, but we can still miss it if it gets dumped right after
	//   our thread is spawned, in which case we'd fall back to the old
	//   version. no. because if it's in the tree now we get it before
	//   spawning a thread. there is no blocking. TRICKY. so if it is in 
	//   the tree at this point we'll get it, but may end up scanning the
	//   file with the older version of the doc... not too bad.
	long startFileNum = tbase->getFileNumFromId2 ( tfn );

	// if tfn refers to a missing titledb file...
	if ( startFileNum < 0 ) {
		if ( r->m_url[0] ) log("db: titledb missing url %s",r->m_url);
		else        log("db: titledb missing docid %lli", r->m_docId);
		us->sendErrorReply ( st->m_slot,ENOTFOUND );
		mdelete ( st , sizeof(State22) , "Msg22" );
		delete ( st ); 
		return ;
	}

	// save this
	st->m_tfn = tfn;
	*/
	// make the cacheKey ourself, since Msg5 would make the key wrong
	// since it would base it on startFileNum and numFiles
	key_t cacheKey ; cacheKey.n1 = 0; cacheKey.n0 = r->m_docId;
	// make titledb keys
	key_t startKey = g_titledb.makeFirstKey ( st->m_docId1 );
	key_t endKey   = g_titledb.makeLastKey  ( st->m_docId2 );

	// . load the list of title recs from disk now
	// . our file range should be solid
	// . use 500 million for min recsizes to get all in range
	if ( ! st->m_msg5.getList ( RDB_TITLEDB       ,
				    r->m_collnum ,
				    &st->m_tlist      ,
				    startKey          , // startKey
				    endKey            , // endKey
				    500000000         , // minRecSizes
				    true              , // includeTree
				    false,//r->m_addToCache   , // addToCache?
				    0,//r->m_maxCacheAge  , // max cache age
				    0,//startFileNum      ,
				    -1                 , // numFiles
				    st , // state             ,
				    gotTitleList      ,
				    r->m_niceness     ,
				    true              , // do error correct?
				    &cacheKey         ,
				    0                 , // retry num
				    -1                , // maxRetries
				    true              , // compensate for merge
				    -1LL              , // sync point
				    &st->m_msg5b      ) ) return ;

	// we did not block, nice... in cache?
	gotTitleList ( st , NULL , NULL );
}
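
handleRequest22 clamps a url's probable docid into the range [getFirstProbableDocId(pd), getLastProbableDocId(pd)] and keeps a 48-bit url hash (uh48) to tell apart docids that collide inside that range. The sketch below only illustrates the masking involved; the FNV hash, the docid derivation and the 8-bit bucket width are assumptions for illustration, not titledb's actual key layout.

#include <cstdint>
#include <cstdio>

// Illustrative 64-bit hash (FNV-1a); stands in for the real hash64b().
static uint64_t hash64(const char *s) {
	uint64_t h = 1469598103934665603ULL;
	for (; *s; s++) { h ^= (uint8_t)*s; h *= 1099511628211ULL; }
	return h;
}

// Hypothetical range helpers: the real getFirstProbableDocId() and
// getLastProbableDocId() depend on titledb's key layout; here we simply
// mask off the low 8 bits to form a small bucket around the probable docid.
static int64_t firstProbableDocId(int64_t pd) { return pd & ~0xffLL; }
static int64_t lastProbableDocId (int64_t pd) { return pd |  0xffLL; }

int main() {
	const char *url = "http://www.example.com/page.html";
	// probable docid derived from the hash (illustrative, not titledb's scheme)
	int64_t pd = (int64_t)(hash64(url) & 0x0000003fffffffffLL);
	int64_t d1 = firstProbableDocId(pd);
	int64_t d2 = lastProbableDocId (pd);
	// 48-bit url hash kept to disambiguate docids that collide in the range
	uint64_t uh48 = hash64(url) & 0x0000ffffffffffffULL;
	std::printf("pd=%lld range=[%lld,%lld] uh48=%012llx\n",
	            (long long)pd, (long long)d1, (long long)d2,
	            (unsigned long long)uh48);
	return 0;
}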
// . make a web page from results stored in msg40
// . send it on TcpSocket "s" when done
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool gotIndexList ( void *state ) {
	// the state
	State10 *st = (State10 *) state;
	// launch more
	if ( ! launchRequests ( st ) ) return false;
	/*
	// get the date list
	//fprintf(stderr,"termId now=%lli\n",st->m_termId);
	//fprintf(stderr,"should be=%lli\n",(st->m_termId & TERMID_MASK));
	// . now get the indexList for this termId
	// . date is complemented, so start with bigger one first
	key128_t startKey = g_datedb.makeStartKey ( st->m_termId ,0xffffffff);
	key128_t endKey   = g_datedb.makeEndKey   ( st->m_termId ,0x0);
	// get the rdb ptr to titledb's rdb
	//Rdb *rdb = g_indexdb.getRdb();
	// -1 means read from all files in Indexdb
	long numFiles = -1;
	// make it zero if caller doesn't want to hit the disk
	if ( ! st->m_useDisk ) numFiles = 0;
	// get the title rec at or after this docId
	if ( ! st->m_msg0.getList ( -1 ,
				    0  ,
				    0  ,
				    0  ,    // max cache age
				    false , // add to cache?
				    RDB_DATEDB  , // rdbId of 2 = indexdb
				    st->m_coll ,
				    &st->m_list2  ,
				    (char *)&startKey  ,
				    (char *)&endKey    ,
				    st->m_numRecs * sizeof(key128_t),//recSizes
				    //st->m_useTree   , // include tree?
				    //st->m_useCache  , // include cache?
				    //false     , // add to cache?
				    //0         , // startFileNum
				    //numFiles  , // numFiles
				    st        , // state
				    gotIndexListWrapper2 ,
				    0  ) )  // niceness
		return false;
	// otherwise call gotResults which returns false if blocked, true else
	// and sets g_errno on error
	return gotIndexList2 ( (void *) st , NULL );
}


void gotIndexListWrapper2 ( void *state , RdbList *list ) {
	gotIndexList2 ( state , list );
}

void addedKeyWrapper ( void *state ) {
	gotIndexList2 ( state, NULL );
}

// . make a web page from results stored in msg40
// . send it on TcpSocket "s" when done
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool gotIndexList2 ( void *state , RdbList *list ) {
	// the state
	State10 *st = (State10 *) state;
	*/
	// get the socket
	TcpSocket *s = st->m_socket;
	// don't allow pages bigger than 128k in cache
	//char  buf [ 64*1024 ];
	// a ptr into "buf"
	//char *p    = buf;
	//char *pend = buf + 64*1024;
	/*
	// get termId
	key_t k = *(key_t *)st->m_list.getStartKey();
	long long termId = g_indexdb.getTermId ( k );
	// get groupId from termId
	//unsigned long groupId = k.n1 & g_hostdb.m_groupMask;
	unsigned long groupId = g_indexdb.getGroupIdFromKey ( &k );
	long hostnum = g_hostdb.makeHostId ( groupId );
	*/
	// check box " checked" strings
	char *ubs = "";
	char *uts = "";
	char *uds = "";
	char *ucs = "";
	char *add = "";
	char *del = "";
	if ( st->m_useDatedb) ubs = " checked";
	if ( st->m_useTree  ) uts = " checked";
	if ( st->m_useDisk  ) uds = " checked";
	if ( st->m_useCache ) ucs = " checked";
	if ( st->m_add      ) add = " checked";
	if ( st->m_del      ) del = " checked";

	SafeBuf *pbuf = &st->m_pbuf;

	g_pages.printAdminTop ( pbuf , st->m_socket , &st->m_r );

	// get base, returns NULL and sets g_errno to ENOCOLLREC on error
	RdbBase *base; 
	if (!(base=getRdbBase((uint8_t)RDB_INDEXDB,st->m_coll))) return true;

	// print the standard header for admin pages
	pbuf->safePrintf ( 
		  "<center>\n"
		  "<table cellpadding=2><tr><td colspan=4>"
		  "useDatedb:<input type=checkbox value=1 name=ub%s> "
		  "useTree:<input type=checkbox value=1 name=ut%s> "
		  "useDisk:<input type=checkbox value=1 name=ud%s> "
		  "useCache:<input type=checkbox value=1 name=uc%s> "
		  "ADD:<input type=checkbox value=1 name=add%s> "
		  "DELETE:<input type=checkbox value=1 name=del%s>"
		  "</td></tr><tr><td>"
		  "query:"
		  "</td><td>"
		  "<input type=text name=q value=\"%s\" size=20>"
		  "</td><td>"
		  "collection:"
		  "</td><td>"
		  "<input type=text name=c value=\"%s\" size=10>"
		  "</td></tr><tr><td>"
		  "termId:"
		  "</td><td>"
		  "<input type=text name=t value=%lli size=20>"
		  "</td><td>"
		  "numRecs:"
		  "</td><td>"
		  "<input type=text name=numRecs value=%li size=10> "
		  "</td></tr><tr><td>"
		  "docId:"
		  "</td><td>"
		  "<input type=text name=d value=%lli size=20> "
		  "</td><td>"
		  "score:"
		  "</td><td>"
		  "<input type=text name=score value=%li size=10> "
		  "</td><td>"
		  "<input type=submit value=ok border=0>"
		  "</td></tr>"
		  "<tr><td colspan=2>"
		  "term appears in about %lli docs +/- %li"
		  "</td></tr>"
		  //"<tr><td colspan=2>"
		  //"this indexlist held by host #%li and twins"
		  //"</td></tr>"
		  "</table>"
		  "</form><br><br>" ,
		  ubs, uts, uds, ucs, add, del,
		  st->m_query , st->m_coll , st->m_termId  , 
		  st->m_numRecs  ,
		  st->m_docId , (long)st->m_score ,
		  st->m_termFreq ,
		  2 * (long)GB_INDEXDB_PAGE_SIZE / 6 * 
		  base->getNumFiles() );
		  //hostnum );

	if ( g_errno || (st->m_list.isEmpty() ) ) {//&&st->m_list2.isEmpty())){
		if (g_errno)pbuf->safePrintf("Error = %s",mstrerror(g_errno));
		else        pbuf->safePrintf("List is empty");
		pbuf->safePrintf("</center>");
		// erase g_errno for sending
		g_errno = 0;
		// now encapsulate it in html head/tail and send it off
		bool status = g_httpServer.sendDynamicPage(s , 
							   pbuf->getBufStart(),
							   pbuf->length() );
		// delete it
		mdelete ( st , sizeof(State10) , "PageIndexdb" );
		delete (st);
		return status;
	}

	pbuf->safePrintf ( 
		  "<table cellpadding=1 border=1>" 
		  "<tr><td>#</td><td>score</td>"
		  "<td>docId</td><td>domHash</td></tr>");

	//if ( searchingEvents

	// now print the score/docId of indexlist
	long i = 0;
	for (   st->m_list.resetListPtr () ;
	      ! st->m_list.isExhausted  () ;
		st->m_list.skipCurrentRecord () ) {
		// break if buf is low
		//if ( p + 1024 >= pend ) break;
		// but set the ip/port to a host that has this titleRec
		// stored locally!
		long long     docId   = st->m_list.getCurrentDocId () ;
		unsigned long groupId = getGroupIdFromDocId ( docId );
		// get the first host's hostId in this groupId
		Host *h = g_hostdb.getFastestHostInGroup ( groupId );
		// . pick the first host to handle the cached titleRec request
		// . we assume it has the best time and is up!! TODO: fix!
		// . use local ip though if it was an internal request
		// . otherwise, use the external ip
		//unsigned long  ip   = h->m_externalIp;
		unsigned long  ip   = h->m_ip;
		// use the NAT mapped port
		unsigned short port = h->m_externalHttpPort;
		// log the first docid so we can blaster url: queries
		// to PageIndexdb and see if they are in indexdb
		if ( i == 0 ) 
			logf(LOG_INFO,"indexdb: %llu %s",docId,st->m_query);
		// adjust ip/port if local
		if ( st->m_isLocal ) {
			ip   = h->m_ip;
			port = h->m_httpPort;
		}
		unsigned long date = 0;
		if ( st->m_useDatedb )
			date = (unsigned long)st->m_list.getCurrentDate();
		uint8_t dh = g_titledb.getDomHash8FromDocId ( docId );
		char ds[32];
		ds[0]=0;
		if ( st->m_useDatedb ) sprintf (ds,"%lu/",date);
		pbuf->safePrintf ( 
			  "<tr><td>%li.</td>"
			  "<td>%s%i</td>"
			  "<td>"
			  //"<a href=http://%s:%hu/master/titledb?d=%llu>"
			  "<a href=/master/titledb?c=%s&d=%llu>"
			  "%llu"
			  //"<td><a href=/cgi/4.cgi?d=%llu>%llu"
			  "</td>"
			  "<td>"
			  "0x%02lx"
			  "</td>"
			  "</tr>\n" ,
			  i++,
			  ds, (int)st->m_list.getCurrentScore() ,
			  //iptoa(ip) , port ,
			  st->m_coll,
			  docId , 
			  docId ,
			  (long)dh );
	}	
	pbuf->safePrintf ( "</table>" );

	/*
	if ( ! st->m_list2.isEmpty() ) 
		p += sprintf ( p ,
			       "<br>"
			       "<br>"
			       "<table cellpadding=1 border=1>" 
			       "<tr><td>#</td><td>termId</td>"
			       "<td>date</td><td>score</td>"
			       "<td>docId</td></tr>");

	// now print the score/docId of datedb list
	i = 0;
	for (   st->m_list2.resetListPtr () ;
	      ! st->m_list2.isExhausted  () ;
		st->m_list2.skipCurrentRecord () ) {
		// break if buf is low
		if ( p + 1024 >= pend ) break;
		// but set the ip/port to a host that has this titleRec
		// stored locally!
		long long     docId   = st->m_list2.getCurrentDocId () ;
		unsigned long groupId = g_titledb.getGroupId ( docId );
		// get the first host's hostId in this groupId
		Host *h = g_hostdb.getFastestHostInGroup ( groupId );
		// . pick the first host to handle the cached titleRec request
		// . we assume it has the best time and is up!! TODO: fix!
		// . use local ip though if it was an internal request
		// . otherwise, use the external ip
		//unsigned long  ip   = h->m_externalIp;
		unsigned long  ip   = h->m_ip;
		// use the NAT mapped port
		unsigned short port = h->m_externalHttpPort;
		// adjust ip/port if local
		if ( st->m_isLocal ) {
			ip   = h->m_ip;
			port = h->m_httpPort;
		}
		// debug
		char kb[16];
		st->m_list2.getCurrentKey(kb);
		//log(LOG_INFO,"debug: n1=%016llx n0=%016llx",
		//    *(long long *)(kb+8),*(long long *)(kb+0));
		//if ( (unsigned long)st->m_list2.getCurrentDate() == 0 )
		//	log("STOP");
		sprintf ( p , 
			  "<tr><td>%li.</td>"
			  "<td>%llu</td>"
			  "<td>%lu</td><td>%i</td>"
			  "<td>"
			  //"<a href=http://%s:%hu/master/titledb?d=%llu>"
			  "<a href=/master/titledb?c=%s&d=%llu>"
			  "%llu"
			  //"<td><a href=/cgi/4.cgi?d=%llu>%llu"
			  "</td></tr>\n" ,
			  i++,
			  st->m_list2.getTermId16(kb) ,
			  (unsigned long)st->m_list2.getCurrentDate() ,
			  (int)st->m_list2.getCurrentScore() ,
			  //iptoa(ip) , port ,
			  st->m_coll,
			  docId , 
			  docId );
		p += gbstrlen ( p );
	}	
	*/
	if ( ! st->m_list.isEmpty() ) 
		pbuf->safePrintf ( "</table>" );


	// print msg if we could fit all into buf
	//if ( p + 1024 >= pend ) {
	//	sprintf ( p ,"... truncated ... no mem" );
	//	p += gbstrlen ( p );		
	//}
	// print the final tail
	//p += g_httpServer.printTail ( p , pend - p );
	pbuf->safePrintf ( "</center>\n");
	// now encapsulate it in html head/tail and send it off
	bool status = g_httpServer.sendDynamicPage ( s , 
						     pbuf->getBufStart() ,
						     pbuf->length() );
	// delete the state
	mdelete ( st , sizeof(State10) , "PageIndexdb" );
	delete (st) ;
	return status;
}
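
The admin page above walks its RdbList with the resetListPtr()/isExhausted()/skipCurrentRecord() cursor idiom. The following self-contained sketch reproduces that idiom over a trivial in-memory record vector; FlatList and the Rec layout are hypothetical, not the real indexdb record format.

#include <cstdint>
#include <cstdio>
#include <vector>

// Minimal cursor illustrating the resetListPtr()/isExhausted()/
// skipCurrentRecord() loop used above.
struct Rec { int64_t docId; uint32_t score; };

class FlatList {
public:
	void add(Rec r)              { m_recs.push_back(r); }
	void resetListPtr()          { m_i = 0; }
	bool isExhausted() const     { return m_i >= m_recs.size(); }
	void skipCurrentRecord()     { m_i++; }
	const Rec &current() const   { return m_recs[m_i]; }
private:
	std::vector<Rec> m_recs;
	size_t m_i = 0;
};

int main() {
	FlatList list;
	list.add({261670033643LL, 57});
	list.add({261670033644LL, 12});
	long i = 0;
	for (list.resetListPtr(); !list.isExhausted(); list.skipCurrentRecord())
		std::printf("%ld. docId=%lld score=%u\n", i++,
		            (long long)list.current().docId, list.current().score);
	return 0;
}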
Example #13
void handleRequest22 ( UdpSlot *slot , int32_t netnice ) {
	// shortcut
	UdpServer *us = &g_udpServer;
	// get the request
	Msg22Request *r = (Msg22Request *)slot->m_readBuf;

	// sanity check
	int32_t  requestSize = slot->m_readBufSize;
	if ( requestSize < r->getMinSize() ) {
		log("db: Got bad request size of %" PRId32" bytes for title record. "
		    "Need at least 28.",  requestSize );
		log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.", __FILE__, __func__, __LINE__);
		us->sendErrorReply ( slot , EBADREQUESTSIZE );
		return;
	}

	// get base, returns NULL and sets g_errno to ENOCOLLREC on error
	RdbBase *tbase = getRdbBase( RDB_TITLEDB, r->m_collnum );
	if ( ! tbase ) {
		log("db: Could not get title rec in collection # %" PRId32" because rdbbase is null.", (int32_t)r->m_collnum);
		g_errno = EBADENGINEER;
		log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.", __FILE__, __func__, __LINE__);
		us->sendErrorReply ( slot , g_errno ); 
		return; 
	}

	// overwrite what is in there so niceness conversion algo works
	r->m_niceness = netnice;

	// if just checking tfndb, do not do the cache lookup in clusterdb
	if ( r->m_justCheckTfndb ) {
		r->m_maxCacheAge = 0;
	}

	g_titledb.getRdb()->readRequestGet  (requestSize);

	// breathe
	QUICKPOLL ( r->m_niceness);

	// sanity check
	if ( r->m_collnum < 0 ) { char *xx=NULL;*xx=0; }


	// make the state now
	State22 *st ;
	try { st = new (State22); }
	catch ( ... ) {
		g_errno = ENOMEM;
		log("query: Msg22: new(%" PRId32"): %s", (int32_t)sizeof(State22),
		mstrerror(g_errno));
		log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.", __FILE__, __func__, __LINE__);
		us->sendErrorReply ( slot , g_errno );
		return;
	}
	mnew ( st , sizeof(State22) , "Msg22" );

	// store ptr to the msg22request
	st->m_r = r;
	// save for sending back reply
	st->m_slot = slot;

	// then tell slot not to free it since m_r references it!
	// so we'll have to free it when we destroy State22
	st->m_slotAllocSize = slot->m_readBufMaxSize;
	st->m_slotReadBuf   = slot->m_readBuf;
	slot->m_readBuf = NULL;

	// . if docId was explicitly specified...
	// . we may get multiple tfndb recs
	if ( ! r->m_url[0] ) {
	   st->m_docId1 = r->m_docId;
	   st->m_docId2 = r->m_docId;
	}

	// but if we are requesting an available docid, it might be taken
	// so try the range
	if ( r->m_getAvailDocIdOnly ) {
	   int64_t pd = r->m_docId;
	   int64_t d1 = g_titledb.getFirstProbableDocId ( pd );
	   int64_t d2 = g_titledb.getLastProbableDocId  ( pd );
	   // sanity - bad url with bad subdomain?
	   if ( pd < d1 || pd > d2 ) { char *xx=NULL;*xx=0; }
	   // make sure we get a decent sample in titledb then in
	   // case the docid we wanted is not available
	   st->m_docId1 = d1;
	   st->m_docId2 = d2;
	}

	// . otherwise, url was given, like from Msg15
	// . we may get multiple tfndb recs
	if ( r->m_url[0] ) {
	   int32_t  dlen = 0;
	   // this causes ip based urls to be inconsistent with the call
	   // to getProbableDocId(url) below
	   char *dom  = getDomFast ( r->m_url , &dlen );
	   // bogus url?
	   if ( ! dom ) {
	       log("msg22: got bad url in request: %s from "
		   "hostid %" PRId32" for msg22 call ",
		   r->m_url,slot->m_host->m_hostId);
	       g_errno = EBADURL;
	       log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.", __FILE__, __func__, __LINE__);
	       us->sendErrorReply ( slot , g_errno );
	       mdelete ( st , sizeof(State22) , "Msg22" );
	       delete ( st );
	       return;
	   }
	   int64_t pd = g_titledb.getProbableDocId (r->m_url,dom,dlen);
	   int64_t d1 = g_titledb.getFirstProbableDocId ( pd );
	   int64_t d2 = g_titledb.getLastProbableDocId  ( pd );
	   // sanity - bad url with bad subdomain?
	   if ( pd < d1 || pd > d2 ) { char *xx=NULL;*xx=0; }
	   // store these
	   st->m_pd     = pd;
	   st->m_docId1 = d1;
	   st->m_docId2 = d2;
	   st->m_uh48   = hash64b ( r->m_url ) & 0x0000ffffffffffffLL;
	}

	QUICKPOLL ( r->m_niceness );

	// make the cacheKey ourself, since Msg5 would make the key wrong
	// since it would base it on startFileNum and numFiles
	key_t cacheKey ; cacheKey.n1 = 0; cacheKey.n0 = r->m_docId;
	// make titledb keys
	key_t startKey = g_titledb.makeFirstKey ( st->m_docId1 );
	key_t endKey   = g_titledb.makeLastKey  ( st->m_docId2 );

	// . load the list of title recs from disk now
	// . our file range should be solid
	// . use 500 million for min recsizes to get all in range
	if ( ! st->m_msg5.getList ( RDB_TITLEDB       ,
				    r->m_collnum ,
				    &st->m_tlist      ,
				    startKey          , // startKey
				    endKey            , // endKey
				    500000000         , // minRecSizes
				    true              , // includeTree
				    false,//r->m_addToCache   , // addToCache?
				    0,//r->m_maxCacheAge  , // max cache age
				    0,//startFileNum      ,
				    -1                 , // numFiles
				    st , // state             ,
				    gotTitleList      ,
				    r->m_niceness     ,
				    true              , // do error correct?
				    &cacheKey         ,
				    0                 , // retry num
				    -1                , // maxRetries
				    true              , // compensate for merge
				    -1LL              ) ) // sync point
		return ;

	// we did not block, nice... in cache?
	gotTitleList ( st , NULL , NULL );
}
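
The final st->m_msg5.getList(...) call relies on the usual non-blocking contract: it returns false when the read blocked (the supplied callback fires later), and true when the data was available immediately (the caller then invokes the callback itself, as the last two lines above do). Below is a minimal sketch of that contract using a hypothetical AsyncStore, not the real Msg5 class.

#include <cstdio>

// Sketch of the "return false if blocked, true if done" convention.
struct AsyncStore {
	// Returns true if the result was available immediately; otherwise
	// returns false and promises to invoke cb(state) once the read completes.
	bool getList(bool haveItCached, void *state, void (*cb)(void *)) {
		if (haveItCached) return true;   // caller must invoke cb itself
		m_pendingCb = cb; m_pendingState = state;
		return false;                    // blocked; cb fires later
	}
	void completeIo() { if (m_pendingCb) m_pendingCb(m_pendingState); }
	void (*m_pendingCb)(void *) = nullptr;
	void *m_pendingState = nullptr;
};

static void gotList(void *state) { std::printf("got list for state %p\n", state); }

int main() {
	AsyncStore store;
	int state = 0;
	// mirrors: if ( ! msg5.getList(...) ) return;  gotTitleList(st,NULL,NULL);
	if (!store.getList(false, &state, gotList))
		store.completeIo();   // simulate the disk read finishing
	else
		gotList(&state);      // did not block: call the callback directly
	return 0;
}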
Example #14
bool RdbMerge::getAnotherList ( ) {
	log(LOG_DEBUG,"db: Getting another list for merge.");
	// clear it up in case it was already set
	g_errno = 0;
	// get base, returns NULL and sets g_errno to ENOCOLLREC on error
	RdbBase *base = getRdbBase( m_rdbId, m_collnum );
	if ( ! base ) {
		return true;
	}
	// if merging titledb files, we must adjust m_endKey so we do
	// not have to read a huge 200MB+ tfndb list
	//key_t newEndKey = m_endKey;
	char newEndKey[MAX_KEY_BYTES];
	KEYSET(newEndKey,m_endKey,m_ks);

	// . this returns false if blocked, true otherwise
	// . sets g_errno on error
	// . we return false if it blocked
	// . m_maxBufSize may be exceeded by a rec, it's just a target size
	// . niceness is usually MAX_NICENESS, but reindex.cpp sets to 0
	// . this was a call to Msg3, but i made it call Msg5 since
	//   we now do the merging in Msg5, not in msg3 anymore
	// . this will now handle truncation, dup and neg rec removal
	// . it remembers last termId and count so it can truncate even when
	//   IndexList is split between successive reads
	// . IMPORTANT: when merging titledb we could be merging about 255
	//   files, so if we are limited to only X fds it can have a cascade
	//   effect where reading from one file closes the fd of another file
	//   in the read (since we call open before spawning the read thread)
	//   and can therefore take 255 retries for the Msg3 to complete 
	//   because each read gives an EFILECLOSED error.
	//   so to fix it we allow one retry for each file in the read plus
	//   the original retry of 25
	int32_t nn = base->getNumFiles();
	if ( m_numFiles > 0 && m_numFiles < nn ) nn = m_numFiles;
	// don't access any biased page caches
	bool usePageCache = true;
	if ( m_rdbId == RDB_CLUSTERDB )
		usePageCache = false;
	// . i don't trust page cache too much (mdw)... well, give it a shot
	// . see if ths helps fix WD corruption... i doubt it
	usePageCache = false;
	// for now force to 100k
	int32_t bufSize = 100000; // g_conf.m_mergeBufSize , // minRecSizes
	// get it
	return m_msg5.getList ( m_rdbId        ,
				m_collnum           ,
				&m_list        ,
				m_startKey     ,
				newEndKey      , // usually is maxed!
				bufSize        ,
				false          , // includeTree?
				false          , // add to cache?
				0              , // max cache age for lookup
				m_startFileNum , // startFileNum
				m_numFiles     ,
				this           , // state 
				gotListWrapper , // callback
				m_niceness     , // niceness
				true           , // do error correction?
				NULL           , // cache key ptr
				0              , // retry #
				nn + 75        , // max retries (mk it high)
				false          , // compensate for merge?
				-1LL           , // sync point
				true           , // isRealMerge? absolutely!
				usePageCache   );
}
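
Two details above are easy to miss: the end key is copied as a raw, variable-width byte string (KEYSET copies m_ks bytes into a MAX_KEY_BYTES buffer), and the retry budget grows with the number of files in the merge so a cascade of closed file descriptors cannot exhaust it. A small sketch of both follows; MAX_KEY_BYTES_SKETCH and the base of 75 retries are illustrative constants, not the project's exact values.

#include <cstdio>
#include <cstring>

static const int MAX_KEY_BYTES_SKETCH = 28;

// keys are keySize bytes (e.g. 12, 16 or 18) stored in a max-width buffer
static void keySet(char *dst, const char *src, int keySize) {
	std::memcpy(dst, src, keySize);
}

int main() {
	char endKey[MAX_KEY_BYTES_SKETCH];
	char newEndKey[MAX_KEY_BYTES_SKETCH];
	std::memset(endKey, 0xff, sizeof(endKey));   // a "maxed" end key
	int keySize = 16;
	keySet(newEndKey, endKey, keySize);

	// one extra retry per file in the merge, on top of a fixed base, so a
	// cascade of closed fds cannot use up the whole retry budget
	int numFilesInMerge = 255;
	int maxRetries = numFilesInMerge + 75;
	std::printf("keySize=%d maxRetries=%d firstByte=0x%02x\n",
	            keySize, maxRetries, (unsigned char)newEndKey[0]);
	return 0;
}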