Code Example #1
File: omrosdump_helpers.c  Project: bjornvar/omr
/**
 * \internal Make sure the DBF table, which points to all the memory regions collected
 * by CCCPSE, is as compact as possible. The goal is to cut the number of ELF
 * Phdrs down as much as possible by merging adjoining memory regions. Each
 * DBFITEM lives in protected memory, so marking one requires SPK=0. This
 * function therefore runs in SPK=0 for its duration, and restores SPK=1 prior to return.
 *
 * \param[in] dbfHdr Address of the Java dump buffer as returned from CCCPSE.
 *
 * \return
 *      The number of program headers after merge
 */
static uintptr_t
adjustDBFTable(DBFHDR *dbfHdr)
{
	DBFITEM *dbfTbl = (DBFITEM *)(dbfHdr + 1); /* DBTI[0] starts right after dbfHdr    */
	DBFITEM *pred = dbfTbl; /* Predecessor DBTI                        */
	/*
	 * For first pass only, set pred = curr. We never want to address zero.
	 * We'll use it only to test bitfields which aren't possible to be set
	 * for the very first item.
	 */
	DBFITEM *curr = dbfTbl; /* Current DBTI under examination        */
	DBFITEM *succ = pred + 1; /* Successor DBTI                        */
	DBFITEM *limit = dbfTbl + (dbfHdr->ijavcnt); /* DBTI[n+1] address                */
	uintptr_t phnum = 0; /* Number of expected Elf64_Phdrs        */
	uintptr_t wGap; /* ACCUMULATOR: total ibc gap            */
	void *wVstart; /* WORK: new p_vaddr                    */
	IDATA chunkSize = 0L; /* Semi-permanent per set ... size        */
	IDATA totalIBCSize = 0L; /* Total sizes of all writeable sets    */

	KEY0(); /* JDB's in SPK=0, get auth to write it.*/
	while (curr < limit) {
		memset(DBTITYPE, 0, sizeof(char) + (sizeof(DBTIGAP)));/* Clear DBTI tail            */
		calcGap(curr, succ, &wGap); /* Get GAP size.                        */

		if (0 == wGap) {
			/*
			 *    This pair qualifies to be combined
			 */
			if (PSETSTRT || PSETMIDL) {
				CSETMIDL = TRUE; /* It is either mid-set ...                    */
			} else { /*  ... or starts a new set.                */
				CSETSTRT = TRUE;
				wVstart = DBTIVADDR; /* If it's a start, hold the start vaddr.   */
				chunkSize += DBTISIZ;
				DBTIGAP = wGap;
			}
		} else {
			/*
			 *    This pair does not qualify to be combined. Is it last in set, or is its
			 *    predecessor not part of this one?
			 */
			if (PSETMIDL || PSETSTRT) { /*    It's the end of a set, so summarize        */
				CSETLAST = TRUE; /*     the set's start adrs & size in this    */
				chunkSize += DBTISIZ; /*     JDB index item, it's the one we'll        */
				DBTISIZ += chunkSize; /*     be building the Elf64_Phdr from.        */
				chunkSize = 0L;
				DBTIVADDR = wVstart; /*    Reset starting address & size            */
				wVstart = 0L; /*     accumulators.                            */
			} else {
				CSETSOLO = TRUE; /*    This one is a standalone item.            */
			}
			/*
			 * We will only write Elf64_Phdr items for those DBF items marked CSETLAST
			 * or CSETSOLO.
			 */
			if (CSETLAST || CSETSOLO) {
				phnum += 1; /* We will write a Phdr because of this item*/
			}
		}
		DBTIGAP = wGap; /* Save the gap for later analysis            */
		pred = curr; /* Advance our pair pointers                */
		curr = succ;
		succ += 1;
	}
	UNKEY(); /* Restore SPK=1                            */
	return phnum; /* Return revised count to caller.            */
}
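
The merge rule is easier to see in isolation. Here is a minimal, self-contained sketch of the same coalescing idea; the Region type and countMergedPhdrs() are illustrative stand-ins rather than OMR code, and the real function does its bookkeeping in the DBFITEM bitfields while running in SPK=0.

#include <stddef.h>
#include <stdint.h>

/* Illustrative stand-in for a DBFITEM: one captured memory region. */
typedef struct {
	uint64_t vaddr; /* starting virtual address of the region */
	uint64_t size;  /* length of the region in bytes */
} Region;

/*
 * Count the Elf64_Phdrs needed once zero-gap neighbors are merged.
 * Regions are assumed sorted by ascending vaddr, as in the JDB.
 */
static size_t
countMergedPhdrs(const Region *tbl, size_t n)
{
	size_t phnum = 0;
	size_t i = 0;
	while (i < n) {
		uint64_t end = tbl[i].vaddr + tbl[i].size;
		/* Absorb successors that begin exactly where this set ends. */
		while ((i + 1) < n && tbl[i + 1].vaddr == end) {
			i += 1;
			end += tbl[i].size;
		}
		phnum += 1; /* one Phdr per merged set (CSETLAST) or solo item (CSETSOLO) */
		i += 1;
	}
	return phnum;
}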
Code Example #2
File: omrosdump_helpers.c  Project: bjornvar/omr
/**
 * \internal Build the z/TPF ELF-format core dump from the JDB built by CCCPSE.
 *
 * First figure out the absolute file path (starts with a '/' and
 * ends with a file suffix) we are to write the dump to, then ensure we
 * have enough room remaining in the Java dump buffer left for us by
 * CCCPSE. If we do not, return an error. Otherwise, section the buffer
 * space off, building in order:
 *    - The ELF64 header
 *    - The ELF64 Program Header section
 *    - The ELF64 NOTE section
 *      
 * Finally, write all the above to the indicated filename, and follow it
 * with a copy of storage contents from the Java dump buffer, which is
 * laid out in ascending address order, with the ELF64_Phdrs created to
 * match the dump buffer. Expect a final file size measured in tens
 * of megabytes.
 *
 * As a matter of protocol, set the first 8 bytes of the Java Dump
 * Buffer (struct ijavdbf) to zeros so CCCPSE knows it's okay to 
 * write over its contents once we're done working with it.
 *
 * The arg block is used for the bi-directional passing of data. Those of its
 * members which are pointers (most of them, in fact) are used mainly for 
 * output or at least have an output role. The fields of <arg> which are
 * required are shown as input parameters below. The fields of <arg> which
 * are used for output are detailed as return values.
 *
 * Since pthread_create() limits us to one pointer to type void as the
 * function's input, and requires the function to return a void pointer, this
 * function takes the address of the <arg> block as its input parameter and
 * returns a pointer to type char: the file name which now contains the full
 * path name of the ELF-format core dump file. There are also fields in <arg>
 * that will be updated as described below.
 *
 * \param[in,out]  arg                 Pointer to a user-created datatype 'args',
 *                                     declared in header file j9osdump_helpers.h
 * \param[in]      arg->OSFilename     Pointer to scratch storage for a path +
 *                                     file name string. It will be presumed
 *                                     to be at least PATH_MAX+1 in size.
 * \param[in]      arg->wkspcSize      32-bit quantity representing the size in
 *                                     bytes of the wkSpace field, following.
 * \param[in]      arg->wkSpace        64-bit pointer to scratch workspace for
 *                                     use by this function. It is recommended
 *                                     to make it equal to PATH_MAX, since file &
 *                                     path names will be built in this space.
 * \param[in,out]  arg->flags          Indicators as to what is present and what
 *                                     is not.
 * \param[in]      arg->sii            Pointer to scratch storage to be used
 *                                     forever more as a siginfo_t block
 * \param[in]      arg->uct            Pointer to scratch storage to be used
 *                                     forever more as a ucontext_t block
 * \param[in]      arg->sct            Pointer to scratch storage to be used
 *                                     forever more as a sigcontext block
 * \param[in]      arg->portLibrary    Pointer to an initialized OMRPortLibrary
 *                                     block. If there isn't one at call time,
 *                                     leave this value NULL and set flag
 *                                     J9ZTPF_NO_PORT_LIBRARY
 * \param[in]      arg->dibPtr         Address of the DIB attached to the faulting
 *                                     UOW at post-interrupt time.
 * 
 * \returns    Pointer to a NUL-terminated string representing the absolute path
 *               name at which the core dump file was written.
 *            
 * \returns    arg->sii                   Filled in.
 * \returns    arg->uct                   Filled in.
 * \returns    arg->sct                   Filled in.
 * \returns    arg->rc                   Final return code. Zero if successful
 *                                       (core file built), non-zero if not.
 * \returns    arg->OSFilename           Same as the function return value.
 */
void *
ztpfBuildCoreFile(void *argv_block)
{
#define  MOUNT_TFS    4  /* TPF Filesystem equate from imount.h */

	args *arg = (args *) argv_block;
	uint8_t *buffer, *endBuffer;
	DBFHDR *dbfptr; /* Ptr to JDB's index header                */
	DIB *dibPtr = dibAddr(arg); /* Ptr to the Dump I'chg Block    */
	Elf64_Ehdr *ehdrp; /* Pointer to Elf64 file header                */
	Elf64_Phdr *phdrp; /* Pointer to Elf64_Phdr block                */
	Elf64_Nhdr *narea; /* Pointer to the ELF NOTE data                */
	char pathName[PATH_MAX]; /* Working buffer for core.* fname.*/
	char *ofn = dumpFilename(arg); /* Output dump file path    */
	uint32_t ofd; /* File descriptor for output dump            */
	uintptr_t rc; /* Working return code d.o.                    */
	uintptr_t wPtr; /* Working byte-sized pointer d.o.            */
	uint64_t phCount; /* Counter of Elf64_Phdrs required            */
	uint8_t *imageBuffer; /* Start of the JDB's ICB            */
	uint64_t imageBufferSize; /* Size of data in the ICB            */
	uint64_t octetsToWrite = 0UL; /* Count of bytes to write        */
	uint64_t octetsWritten = 0UL; /* Count of bytes written        */
	uint64_t spcAvailable;

	/*
	 *    If there is a Java dump buffer belonging to this process, then
	 *    convert its contents into an Elf64 core dump file; otherwise
	 *    return failure.
	 */
	if (!(dibPtr->dibjdb)) { /* No dump buffer? Not possible.*/
		dumpFlags(arg) |= J9TPF_NO_JAVA_DUMPBUFFER;
		returnCode (arg) = -1; /* Set error flags & bad RC ...    */
		return NULL; /* 'Bye.                        */
	}
	dbfptr = dibPtr->dibjdb; /* Pick up the dump buffer ptr    */
	if (0L == dbfptr->ijavcnt) { /* Did CCCPSE write us one?        */
		returnCode (arg) = -1; /* Nope. JDB is locked...        */
		dumpFlags(arg) |= J9TPF_JDUMPBUFFER_LOCKED;
		return NULL; /* See ya next time.            */
	}
	/*
	 *     Calculate the start, end, and net length of the output buffer we'll
	 *     use for the ELF-dictated start of the file content. The start address
	 *     should, as a matter of good practice, start on a paragraph boundary.
	 */
	buffer = (uint8_t *) (dbfptr->ijavbfp); /* Get buffer start as uint8_t ptr    */
	buffer = (uint8_t *) NEXT_PARA(buffer); /* Start it on next paragraph    */
	/*    boundary.                    */
	endBuffer = buffer + dbfptr->ijavbfl; /* Get bytes left in buffer        */
	spcAvailable = endBuffer - buffer; /* Get corrected count of bytes    */

	phCount = dbfptr->ijavcnt;

	int numJavaPgms = sizeof(javaPgmsToDump) / 5; /* assumes 5-byte entries: 4-char pgm name + NUL */
	octetsToWrite = sizeof(Elf64_Ehdr)
			+ ((phCount + 1 + numJavaPgms) * sizeof(Elf64_Phdr)) +
			NOTE_TABLE_SIZE;
	/*
	 *     Uh oh. Not enough free space remaining in the dump buffer. Now
	 *     we've gotta go to the heap and see if there's enough there. Not
	 *     much hope; but we have to try it.
	 */
	if (octetsToWrite > spcAvailable) { /* Check for available memory,    */
		buffer = malloc64(octetsToWrite); /*  anywhere. We're desperate.    */
		if (!buffer) { /* If we can't buffer our I/Os,    */
			returnCode (arg) = -1; /*  we are done. Indicate error */
			errMsg(arg, "Cannot buffer dump file, out of memory");
			dumpFlags(arg) |= J9TPF_OUT_OF_BUFFERSPACE;
			KEY0();
			dbfptr->ijavcnt = 0L; /* Unlock the JDB so CCCPSE can reuse it*/
			UNKEY();
			return NULL; /* See ya 'round.                */
		}
	}
	/*
	 *    We're ready to write the file. First, we need a full path + filename
	 *    to represent the "OS filename" we're going to write. Get that path
	 *    from the final file name passed in the <tt>args</tt> block, then
	 *    follow that with "core.%X.%d" with the TOD in the second node and
	 *    the PID number in the third. In this way, we can identify the OS
	 *    filename later.
	 *
	 *    Next, we'll try to open it in CREATE+WRITE_ONLY mode. If it fails,
	 *    halt; else write away!
	 */
	splitPathName(ofn, pathName);
	/*
	 *    The "OS filename" will look like ${path}/core.${TOD_in_hex}.${pid}
	 */
	sprintf(workSpace(arg), "core.%lX.%d", dibPtr->dstckf,
			dumpSiginfo(arg)->si_pid);
	{
		int pathLen = strlen(pathName);
		char ebcdicPath[pathLen + 1];
		a2e_len(pathName, ebcdicPath, strlen(pathName));
		int fsystype = pathconf(ebcdicPath, _TPF_PC_FS_TYPE);
		if (fsystype == MOUNT_TFS) {
			errMsg(arg, "Cannot use the tfs for java dumps: path used='%s'\n",
					pathName);
			returnCode (arg) = -1;
			KEY0();
			dbfptr->ijavcnt = 0L; /* Unlock the JDB so CCCPSE can reuse it*/
			UNKEY();
			return NULL;
			//dir_path_name is within the TFS
		}
	}
	strcat(pathName, workSpace(arg));
	ofd = open(pathName, O_WRONLY | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR); /* O_CREAT requires a mode */

	if (-1 == ofd) {
		errMsg(arg, "Cannot open() filename %s: %s\n", pathName, strerror(errno));
		returnCode (arg) = -1;
		KEY0();
		dbfptr->ijavcnt = 0L; /* Unlock the JDB so CCCPSE can reuse it*/
		UNKEY();
		return NULL;
	}
	/*
	 *    The file is named and opened. Now we have to go build its ELF-dictated
	 *    parts. Set up pointers to the start of each sub-section, and then go
	 *    build them. When that's complete, write the file in two parts: first,
	 *    the ELF-dictated start, and then follow that with the data in the
	 *    ICB.
	 */
	wPtr = (uintptr_t) buffer; /*    Calc section addresses with single    */
	                           /*     byte arithmetic                    */
	ehdrp = (Elf64_Ehdr *) wPtr; /*    Elf64_Ehdr pointer                    */
	phdrp = (Elf64_Phdr *) (ehdrp + 1); /*    Elf64_Phdr pointer                    */
	buildELFHeader(ehdrp);
	uint64_t numBytes = 0;
	uint64_t pCount = buildELFPheaderBlk(phdrp, dbfptr, phCount, &numBytes);
	KEY0(); /* Get into SPK 0, then write the        */
	ehdrp->e_phnum = pCount; /*  absolutely final Phdr count and     */
	ehdrp->e_phoff = 0x40; /*    offset of its table into place in    */
	UNKEY(); /*  the Ehdr, then get back to SPK 1.    */
	wPtr = (uintptr_t) phdrp; /* Calculate the end address        */
	wPtr += ((phCount + 1 + numJavaPgms) * sizeof(Elf64_Phdr)); /*    of the Elf64_Phdr section        */
	narea = (Elf64_Nhdr *) wPtr; /* Calculate the address of NOTE sec,    */
	buildELFNoteArea(narea, dibPtr); /*    and go write it there.                */
	/*
	 *    Write the ELF-dictated portion of the file first.
	 */
	octetsWritten = writeDumpFile(ofd, buffer, octetsToWrite);
	if (-1 == octetsWritten) {
		errMsg(arg, "Error writing dump file %s: %s", ofn, strerror(errno));
		dumpFlags(arg) |= J9TPF_FILE_SYSTEM_ERROR;
		KEY0();
		dbfptr->ijavcnt = 0L; /* Unlock the JDB so CCCPSE can reuse it*/
		UNKEY();
		return NULL;
	}
	/*
	 *    Finish the output with the ICB portion of the Java Dump Buffer
	 */
	imageBuffer = (uint8_t *) cinfc_fast(CINFC_CMMJDB);
	octetsWritten = writeDumpFile(ofd, imageBuffer, numBytes);
	if (-1 == octetsWritten) {
		errMsg(arg, "Error writing dump file %s: %s", ofn, strerror(errno));
		dumpFlags(arg) |= J9TPF_FILE_SYSTEM_ERROR;
		KEY0();
		dbfptr->ijavcnt = 0L; /* Unlock the JDB so CCCPSE can reuse it*/
		UNKEY();
		return NULL;
	}
	int i = 0;
	for (i = 0; i < numJavaPgms; i++) {
		struct pat * pgmpat = progc(javaPgmsToDump[i], PROGC_PBI);
		struct ifetch *pgmbase = pgmpat->patgca;
		if (pgmbase != NULL) {
			int offset = pgmbase->_iftch_txt_off + pgmbase->_iftch_txt_size
					+ 0x1000;
			char * text = ((char*) pgmbase) + offset;
			uint64_t size = pgmpat->patpsize - offset;
			octetsWritten = writeDumpFile(ofd, text, size);
			if (-1 == octetsWritten) {
				errMsg(arg, "Error writing dump file %s: %s", ofn,
						strerror(errno));
				dumpFlags(arg) |= J9TPF_FILE_SYSTEM_ERROR;
				KEY0();
				dbfptr->ijavcnt = 0L; /* Unlock the JDB so CCCPSE can reuse it*/
				UNKEY();
				return NULL;
			}
		}
	}
	/*
	 *    That's all folks. Close the file and return the so-called "OS Filename"
	 *    to the caller as if the OS had written it.
	 */
	rc = close(ofd); /* Only try to close() the file once    */
	if (-1 == rc) {
		errMsg(arg, "I/O error attempting to close %s:%s\n", ofn,
				strerror(errno));
		KEY0();
		dbfptr->ijavcnt = 0L; /* Unlock the JDB so CCCPSE can reuse it*/
		UNKEY();
		return NULL;
	}
	/*
	 * Make sure we have a recognizable IPC permission on the OS filename,
	 *  then make sure the JDB buffer is unlocked in case CPSE needs it again.
	 */
	rc = chmod(pathName, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH);
	KEY0();
	dbfptr->ijavcnt = 0L; /* Unlock the JDB so CCCPSE can reuse it*/
	UNKEY();
	/*
	 * Return the filename to the caller. Remember that the buffer containing
	 * it is non-JVM-managed heap space and should be free()d back to it to
	 * avoid a mem leak.
	 */
	strcpy(dumpFilename(arg), ofn); /* Store it in the args block so the */
	return (void *) ofn; /*  caller always has it, and goback.*/
}
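
For reference, the ELF-dictated prefix built above is laid out in a fixed order: the Elf64_Ehdr at offset 0, the Phdr table at 0x40 (that is, sizeof(Elf64_Ehdr), matching the e_phoff stored above), the NOTE area right after the last Phdr, and the memory image data after the NOTE. A small sketch of that offset arithmetic, assuming <elf.h>; DumpLayout and planLayout() are hypothetical names, not part of this file.

#include <elf.h>
#include <stdint.h>

/* Hypothetical offset plan for the dump-file prefix described above. */
typedef struct {
	uint64_t phdrOff; /* Phdr table: follows the Ehdr, i.e. 0x40 */
	uint64_t noteOff; /* NOTE area: follows the last Phdr */
	uint64_t dataOff; /* memory image: follows the NOTE area */
} DumpLayout;

static DumpLayout
planLayout(uint64_t phnum, uint64_t noteSize)
{
	DumpLayout l;
	l.phdrOff = sizeof(Elf64_Ehdr); /* 0x40 for ELF64 */
	l.noteOff = l.phdrOff + phnum * sizeof(Elf64_Phdr);
	l.dataOff = l.noteOff + noteSize;
	return l;
}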
Code Example #3
// . THIS Msg0 class must be alloc'd, i.e. not on the stack, etc.
// . if list is stored locally this tries to get it locally
// . otherwise tries to get the list from the network
// . returns false if blocked, true otherwise
// . sets g_errno on error
// . NOTE: i was having problems with queries being cached too long, you
//   see the cache here is a NETWORK cache, so when the machine that owns
//   the list updates it on disk it can't flush our cache... so use a small
//   maxCacheAge of, like, 30 seconds or so...
bool Msg0::getList ( long long hostId      , // host to ask (-1 if none)
		     long      ip          , // info on hostId
		     short     port        ,
		     long      maxCacheAge , // max cached age in seconds
		     bool      addToCache  , // add net recv'd list to cache?
		     char      rdbId       , // specifies the rdb
		     char     *coll        ,
		     RdbList  *list        ,
		     //key_t     startKey    , 
		     //key_t     endKey      , 
		     char     *startKey    ,
		     char     *endKey      ,
		     long      minRecSizes ,  // use -1 for no max
		     void     *state       ,
		     void    (* callback)(void *state ),//, RdbList *list ) ,
		     long      niceness    ,
		     bool      doErrorCorrection ,
		     bool      includeTree ,
		     bool      doMerge     ,
		     long      firstHostId   ,
		     long      startFileNum  ,
		     long      numFiles      ,
		     long      timeout       ,
		     long long syncPoint     ,
		     long      preferLocalReads ,
		     Msg5     *msg5             ,
		     Msg5     *msg5b            ,
		     bool      isRealMerge      ,
//#ifdef SPLIT_INDEXDB
		     bool      allowPageCache    ,
		     bool      forceLocalIndexdb ,
		     bool      noSplit , // doIndexdbSplit    ,
		     long      forceParitySplit  ) {
//#else
//		     bool      allowPageCache ) {
//#endif
	// this is obsolete! mostly, but we need it for PageIndexdb.cpp to 
	// show a "termlist" for a given query term in its entirety so you 
	// don't have to check each machine in the network. if this is true it
	// means to query each split and merge the results together into a
	// single unified termlist. only applies to indexdb/datedb.
	//if ( doIndexdbSplit ) { char *xx = NULL; *xx = 0; }
	// note this because if caller is wrong it hurts performance major!!
	//if ( doIndexdbSplit ) 
	//	logf(LOG_DEBUG,"net: doing msg0 with indexdb split true");
	// warning
	if ( ! coll ) log(LOG_LOGIC,"net: NULL collection. msg0.");

	//if ( doIndexdbSplit ) { char *xx=NULL;*xx=0; }

	// reset the list they passed us
	list->reset();
	// get keySize of rdb
	m_ks = getKeySizeFromRdbId ( rdbId );
	// if startKey > endKey, don't read anything
	//if ( startKey > endKey ) return true;
	if ( KEYCMP(startKey,endKey,m_ks)>0 ) { char *xx=NULL;*xx=0; }//rettrue
	// . reset hostid if it is dead
	// . this is causing UOR queries to take forever when we have a dead
	if ( hostId >= 0 && g_hostdb.isDead ( hostId ) ) hostId = -1;
	// no longer accept negative minrecsize
	if ( minRecSizes < 0 ) {
		g_errno = EBADENGINEER;
		log(LOG_LOGIC,
		    "net: msg0: Negative minRecSizes no longer supported.");
		char *xx=NULL;*xx=0;
		return true;
	}

	// debug msg
	//if ( niceness != 0 ) log("HEY start");
	// ensure startKey last bit clear, endKey last bit set
	//if ( (startKey.n0 & 0x01) == 0x01 ) 
	//	log("Msg0::getList: warning startKey lastbit set"); 
	//if ( (endKey.n0   & 0x01) == 0x00 ) 
	//	log("Msg0::getList: warning endKey lastbit clear"); 
	// remember these
	m_state         = state;
	m_callback      = callback;
	m_list          = list;
	m_hostId        = hostId;
	m_niceness      = niceness;
	//m_ip            = ip;
	//m_port          = port;
	m_addToCache    = addToCache;
	// . these define our request 100%
	//m_startKey      = startKey;
	//m_endKey        = endKey;
	KEYSET(m_startKey,startKey,m_ks);
	KEYSET(m_endKey,endKey,m_ks);
	m_minRecSizes   = minRecSizes;
	m_rdbId         = rdbId;
	m_coll          = coll;
	m_isRealMerge   = isRealMerge;
	m_allowPageCache = allowPageCache;

	// . group to ask is based on the first key 
	// . we only do 1 group per call right now
	// . groupMask must turn on higher bits first (count downwards kinda)
	// . titledb and spiderdb use special masks to get groupId

	// if diffbot.cpp is reading spiderdb from each shard we have to
	// get groupid from hostid here lest we core in getGroupId() below.
	// it does that for dumping spiderdb to the client browser. they
	// can download the whole enchilada.
	if ( hostId >= 0 && m_rdbId == RDB_SPIDERDB )
		m_shardNum = 0;
	// did they force it? core until i figure out what this is
	else if ( forceParitySplit >= 0 ) 
		//m_groupId =  g_hostdb.getGroupId ( forceParitySplit );
		m_shardNum = forceParitySplit;
	else
		//m_groupId = getGroupId ( m_rdbId , startKey , ! noSplit );
		m_shardNum = getShardNum ( m_rdbId , startKey );

	// if we are looking up a termlist in posdb that is split by termid and
	// not the usual docid then we have to set this posdb key bit that tells
	// us that ...
	if ( noSplit && m_rdbId == RDB_POSDB )
		m_shardNum = g_hostdb.getShardNumByTermId ( startKey );

	// how is this used?
	//if ( forceLocalIndexdb ) m_groupId = g_hostdb.m_groupId;
	if ( forceLocalIndexdb ) m_shardNum = getMyShardNum();

	// . store these parameters
	// . get a handle to the rdb in case we can satisfy locally
	// . returns NULL and sets g_errno on error
	QUICKPOLL((m_niceness));
	Rdb *rdb = getRdbFromId ( m_rdbId );
	if ( ! rdb ) return true;
	// we need the fixedDataSize
	m_fixedDataSize = rdb->getFixedDataSize();
	m_useHalfKeys   = rdb->useHalfKeys();
	// . debug msg
	// . Msg2 does this when checking for a cached compound list.
	//   compound lists do not actually exist, they are merges of smaller
	//   UOR'd lists.
	if ( maxCacheAge != 0 && ! addToCache && (numFiles > 0 || includeTree))
		log(LOG_LOGIC,"net: msg0: "
		    "Weird. check but don't add... rdbid=%li.",(long)m_rdbId);
	// set this here since we may not call msg5 if list not local
	//m_list->setFixedDataSize ( m_fixedDataSize );

	// . now that we do load balancing we don't want to do a disk lookup
	//   even if local if we are merging or dumping
	// . UNLESS g_conf.m_preferLocalReads is true
	if ( preferLocalReads == -1 ) 
		preferLocalReads = g_conf.m_preferLocalReads;

	// . always prefer local for full split clusterdb
	// . and keep the tfndb/titledb lookups in the same stripe
	// . so basically we can't do biased caches if fully split
	//if ( g_conf.m_fullSplit ) preferLocalReads = true;
	preferLocalReads = true;

	// is it stored locally?
	bool isLocal = ( m_hostId == -1 && //g_hostdb.m_groupId == m_groupId );
			 m_shardNum == getMyShardNum() );
	// only do local lookups if this is true
	if ( ! preferLocalReads ) isLocal = false;

	/*
	m_numSplit = 1;
	if ( g_hostdb.m_indexSplits > 1 &&
	     ( rdbId == RDB_POSDB || rdbId==RDB_DATEDB)&&
	     ! forceLocalIndexdb && doIndexdbSplit ) {
		isLocal  = false;
		//m_numSplit = INDEXDB_SPLIT;
		m_numSplit = g_hostdb.m_indexSplits;
		char *xx=NULL;*xx=0;
	}
	*/
	/*
	long long singleDocIdQuery = 0LL;
	if ( rdbId == RDB_POSDB ) {
		long long d1 = g_posdb.getDocId(m_startKey);
		long long d2 = g_posdb.getDocId(m_endKey);
		if ( d1+1 == d2 ) singleDocIdQuery = d1;
	}

	// . try the LOCAL termlist cache
	// . so when msg2 is evaluating a gbdocid:| query and it has to
	//   use msg0 to go across the network to get the same damn termlist
	//   over and over again for the same docid, this will help alot.
	// . ideally it'd be nice if the seo pipe in xmldoc.cpp can try to
	//   send the same gbdocid:xxxx docids to the same hosts. maybe hash
	//   based on docid into the list of hosts and if that host is busy
	//   just chain until we find someone not busy.
	if ( singleDocIdQuery &&
	     getListFromTermListCache ( coll,
					m_startKey,
					m_endKey,
					maxCacheAge,
					list ) )
		// found!
		return true;
	*/

	// but always local if only one host
	if ( g_hostdb.getNumHosts() == 1 ) isLocal = true;

	// force a msg0 if doing a docid restrictive query like
	// gbdocid:xxxx|<query> so we call cacheTermLists() 
	//if ( singleDocIdQuery ) isLocal = false;

	// . if the group is local then do it locally
	// . Msg5::getList() returns false if blocked, true otherwise
	// . Msg5::getList() sets g_errno on error
	// . don't do this if m_hostId was specified
	if ( isLocal ) { // && !g_conf.m_interfaceMachine ) {
		if ( msg5 ) {
			m_msg5 = msg5;
			m_deleteMsg5 = false;
		}
		else {
			try { m_msg5 = new ( Msg5 ); } 
			catch ( ... ) {
				g_errno = ENOMEM;
				log("net: Local alloc for disk read failed "
				    "while tring to read data for %s. "
				    "Trying remote request.",
				    getDbnameFromId(m_rdbId));
				goto skip;
			}
			mnew ( m_msg5 , sizeof(Msg5) , "Msg0" );
			m_deleteMsg5 = true;
		}

		QUICKPOLL(m_niceness);
		// same for msg5b
		if ( msg5b ) {
			m_msg5b = msg5b;
			m_deleteMsg5b = false;
		}
		/*
		else if ( m_rdbId == RDB_TITLEDB ) {
			try { m_msg5b = new ( Msg5 ); } 
			catch ( ... ) {
				g_errno = ENOMEM;
				log("net: Local alloc for disk read failed "
				    "while tring to read data for %s. "
				    "Trying remote request. 2.",
				    getDbnameFromId(m_rdbId));
				goto skip;
			}
			mnew ( m_msg5b , sizeof(Msg5) , "Msg0b" );
			m_deleteMsg5b = true;
		}
		*/
		QUICKPOLL(m_niceness);
		if ( ! m_msg5->getList ( rdbId,
					 coll ,
					 m_list ,
					 m_startKey ,
					 m_endKey   ,
					 m_minRecSizes ,
					 includeTree   , // include Tree?
					 addToCache    , // addToCache?
					 maxCacheAge   ,
					 startFileNum  , 
					 numFiles      ,
					 this ,
					 gotListWrapper2   ,
					 niceness          ,
					 doErrorCorrection ,
					 NULL , // cacheKeyPtr
					 0    , // retryNum
					 -1   , // maxRetries
					 true , // compensateForMerge
					 syncPoint ,
					 NULL,//m_msg5b   ,
					 m_isRealMerge ,
					 m_allowPageCache ) ) return false;
		// nuke it
		reset();
		return true;
	}
skip:
	// debug msg
	if ( g_conf.m_logDebugQuery )
		log(LOG_DEBUG,"net: msg0: Sending request for data to "
		    "shard=%lu listPtr=%li minRecSizes=%li termId=%llu "
		    //"startKey.n1=%lx,n0=%llx (niceness=%li)",
		    "startKey.n1=%llx,n0=%llx (niceness=%li)",
		    //g_hostdb.makeHostId ( m_groupId ) ,
		    m_shardNum,
		    (long)m_list,
		    m_minRecSizes, g_posdb.getTermId(m_startKey) , 
		    //m_startKey.n1,m_startKey.n0 , (long)m_niceness);
		    KEY1(m_startKey,m_ks),KEY0(m_startKey),
		    (long)m_niceness);

	char *replyBuf = NULL;
	long  replyBufMaxSize = 0;
	bool  freeReply = true;

	// adjust niceness for net transmission
	bool realtime = false;
	//if ( minRecSizes + 32 < TMPBUFSIZE ) realtime = true;

	// if we're niceness 0 we need to pre-allocate for reply since it
	// might be received within the asynchronous signal handler which
	// cannot call mmalloc()
	if ( realtime ) { // niceness <= 0 || netnice == 0 ) {
		// . we should not get back more than minRecSizes bytes since 
		//   we are now performing merges
		// . it should not slow things down too much since the hashing
		//   is 10 times slower than merging anyhow...
		// . CAUTION: if rdb is not fixed-datasize then this will
		//            not work for us! it can exceed m_minRecSizes.
		replyBufMaxSize = m_minRecSizes ;
		// . get a little extra to fix the error where we ask for 64 
		//   but get 72
		// . where is that coming from?
		// . when getting titleRecs we often exceed the minRecSizes 
		// . ?Msg8? was having trouble. was short 32 bytes sometimes.
		replyBufMaxSize += 36;
		// why add ten percent?
		//replyBufMaxSize *= 110 ;
		//replyBufMaxSize /= 100 ;
		// make a buffer to hold the reply
//#ifdef SPLIT_INDEXDB
/*
		if ( m_numSplit > 1 ) {
			m_replyBufSize = replyBufMaxSize * m_numSplit;
			replyBuf = (char *) mmalloc(m_replyBufSize, "Msg0");
			m_replyBuf  = replyBuf;
			freeReply = false;
		}
		else
*/
//#endif
			replyBuf = (char *) mmalloc(replyBufMaxSize , "Msg0");
		// g_errno is set and we return true if it failed
		if ( ! replyBuf ) {
			log("net: Failed to pre-allocate %li bytes to hold "
			    "data read remotely from %s: %s.",
			    replyBufMaxSize,getDbnameFromId(m_rdbId),
			    mstrerror(g_errno));
			return true;
		}
	}

	// . make a request with the info above (note: not in network order)
	// . IMPORTANT!!!!! if you change this change 
	//   Multicast.cpp::sleepWrapper1 too!!!!!!!!!!!!
	//   no, not anymore, we commented out that request peeking code
	char *p = m_request;
	*(long long *) p = syncPoint        ; p += 8;
	//*(key_t     *) p = m_startKey       ; p += sizeof(key_t);
	//*(key_t     *) p = m_endKey         ; p += sizeof(key_t);
	*(long      *) p = m_minRecSizes    ; p += 4;
	*(long      *) p = startFileNum     ; p += 4;
	*(long      *) p = numFiles         ; p += 4;
	*(long      *) p = maxCacheAge      ; p += 4;
	*p               = m_rdbId          ; p++;
	*p               = addToCache       ; p++;
	*p               = doErrorCorrection; p++;
	*p               = includeTree      ; p++;
	*p               = (char)niceness   ; p++;
	*p               = (char)m_allowPageCache; p++;
	KEYSET(p,m_startKey,m_ks); p += m_ks;
	KEYSET(p,m_endKey,m_ks);   p += m_ks;
	// NULL terminated collection name
	strcpy ( p , coll ); p += gbstrlen ( coll ); *p++ = '\0';
	m_requestSize    = p - m_request;
	// ask an individual host for this list if hostId is NOT -1
	if ( m_hostId != -1 ) {
		// get Host
		Host *h = g_hostdb.getHost ( m_hostId );
		if ( ! h ) { 
			g_errno = EBADHOSTID; 
			log(LOG_LOGIC,"net: msg0: Bad hostId of %lli.",
			    m_hostId);
			return true;
		}
		// if niceness is 0, use the higher priority udpServer
		UdpServer *us ;
		unsigned short port;
		QUICKPOLL(m_niceness);
		//if ( niceness <= 0 || netnice == 0 ) { 
		//if ( realtime ) {
		//	us = &g_udpServer2; port = h->m_port2; }
		//else                 { 
		us = &g_udpServer ; port = h->m_port ; 
		// . returns false on error and sets g_errno, true otherwise
		// . calls callback when reply is received (or error)
		// . we return true if it returns false
		if ( ! us->sendRequest ( m_request     ,
					 m_requestSize ,
					 0x00          , // msgType
					 h->m_ip       ,
					 port          ,
					 m_hostId      ,
					 NULL          , // the slotPtr
					 this          ,
					 gotSingleReplyWrapper ,
					 timeout       ,
					 -1            , // backoff
					 -1            , // maxwait
					 replyBuf      ,
					 replyBufMaxSize ,
					 m_niceness     ) ) // cback niceness
			return true;
		// return false cuz it blocked
		return false;
	}
	// timing debug
	if ( g_conf.m_logTimingNet )
		m_startTime = gettimeofdayInMilliseconds();
	else
		m_startTime = 0;
	//if ( m_rdbId == RDB_INDEXDB ) log("Msg0:: getting remote indexlist. "
	//			"termId=%llu, "
	//			"groupNum=%lu",
	//			g_indexdb.getTermId(m_startKey) ,
	//			g_hostdb.makeHostId ( m_groupId ) );

	/*
	// make the cache key so we can see what remote host cached it, if any
	char cacheKey[MAX_KEY_BYTES];
	//key_t cacheKey = makeCacheKey ( startKey     ,
	makeCacheKey ( startKey     ,
		       endKey       ,
		       includeTree  ,
		       minRecSizes  ,
		       startFileNum ,
		       numFiles     ,
		       cacheKey     ,
		       m_ks         );
	*/

	// . get the top long of the key
	// . i guess this will work for 128 bit keys... hmmmmm
	long keyTop = hash32 ( (char *)startKey , m_ks );

	/*
	// allocate space
	if ( m_numSplit > 1 ) {
		long  need = m_numSplit * sizeof(Multicast) ;
		char *buf  = (char *)mmalloc ( need,"msg0mcast" );
		if ( ! buf ) return true;
		m_mcasts = (Multicast *)buf;
		for ( long i = 0; i < m_numSplit ; i++ )
			m_mcasts[i].constructor();
	}
	*/

        // . otherwise, multicast to a host in group "groupId"
	// . returns false and sets g_errno on error
	// . calls callback on completion
	// . select first host to send to in group based on upper 32 bits
	//   of termId (m_startKey.n1)
//#ifdef SPLIT_INDEXDB
	// . need to send out to all the indexdb split hosts
	m_numRequests = 0;
	m_numReplies  = 0;
	//for ( long i = 0; i < m_numSplit; i++ ) {

	QUICKPOLL(m_niceness);
	//long gr;
	char *buf;
	/*
	if ( m_numSplit > 1 ) {
		gr  = g_indexdb.getSplitGroupId ( baseGroupId, i );
		buf = &replyBuf[i*replyBufMaxSize];
	}
	else {
	*/
	//gr  = m_groupId;
	buf = replyBuf;
	//}

	// get the multicast
	Multicast *m = &m_mcast;
	//if ( m_numSplit > 1 ) m = &m_mcasts[i];

        if ( ! m->send ( m_request    , 
//#else
//        if ( ! m_mcast.send ( m_request    , 
//#endif
			      m_requestSize, 
			      0x00         , // msgType 0x00
			      false        , // does multicast own request?
			 m_shardNum ,
//#ifdef SPLIT_INDEXDB
//			      gr           , // group + offset
//#else
//			      m_groupId    , // group to send to (groupKey)
//#endif
			      false        , // send to whole group?
			      //m_startKey.n1, // key is passed on startKey
			      keyTop       , // key is passed on startKey
			      this         , // state data
			      NULL         , // state data
			      gotMulticastReplyWrapper0 ,
			      timeout      , // timeout in seconds (was 30)
			      niceness     ,
			      realtime     ,
			      firstHostId  ,
//#ifdef SPLIT_INDEXDB
//			      &replyBuf[i*replyBufMaxSize] ,
//#else
//			      replyBuf        ,
//#endif
			      buf             ,
			      replyBufMaxSize ,
			      freeReply       , // free reply buf?
			      true            , // do disk load balancing?
			      maxCacheAge     ,
			      //(key_t *)cacheKey        ,
			      // multicast uses it for determining the best
			      // host to send the request to when doing 
			      // disk load balancing. if the host has our 
			      // data cached, then it will probably get to
			      // handle the request. for now let's just assume
			      // this is a 96-bit key. TODO: fix...
			 0 , // *(key_t *)cacheKey        ,
			      rdbId           ,
			      minRecSizes     ) ) {
		log("net: Failed to send request for data from %s in shard "
		    "#%lu over network: %s.",
		    getDbnameFromId(m_rdbId),m_shardNum, mstrerror(g_errno));
		// no, multicast will free this when it is destroyed
		//if (replyBuf) mfree ( replyBuf , replyBufMaxSize , "Msg22" );
		// but speed it up
//#ifdef SPLIT_INDEXDB
		m_errno = g_errno;
		m->reset();
		if ( m_numRequests > 0 )
			return false;
//#else
//		m_mcast.reset();
//#endif
		return true;
	}
//#ifdef SPLIT_INDEXDB
	m_numRequests++;

//#endif
	// we blocked
	return false;
}
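
A usage sketch of the blocking convention the header comment describes: getList() returns false when it blocked (the callback fires later) and true when the list, or g_errno, is ready immediately. State0, startRead(), and the specific argument values are illustrative assumptions; the identifiers they use (Msg0, RdbList, MAX_KEY_BYTES, RDB_POSDB, g_errno, mstrerror, log) are from the codebase.

class State0 {
public:
	Msg0    m_msg0;
	RdbList m_list;
	char    m_startKey[MAX_KEY_BYTES];
	char    m_endKey  [MAX_KEY_BYTES];
};

static void gotListWrapper ( void *state ) {
	State0 *st = (State0 *)state;
	if ( g_errno ) { log("net: read failed: %s",mstrerror(g_errno)); return; }
	// consume st->m_list here
}

static bool startRead ( State0 *st , char *coll ) {
	if ( ! st->m_msg0.getList ( -1    , // hostId, -1 = pick host by shard
				    0     , // ip   (only used with a hostId)
				    0     , // port (only used with a hostId)
				    30    , // maxCacheAge, secs (see NOTE above)
				    false , // addToCache
				    RDB_POSDB ,
				    coll  ,
				    &st->m_list    ,
				    st->m_startKey ,
				    st->m_endKey   ,
				    64000 , // minRecSizes
				    st    , // state handed to callback
				    gotListWrapper ,
				    0     , // niceness
				    true  , // doErrorCorrection
				    true  , // includeTree
				    true  , // doMerge
				    -1    , // firstHostId
				    0     , // startFileNum
				    -1    , // numFiles
				    60    , // timeout, secs
				    -1    , // syncPoint
				    -1    , // preferLocalReads, -1 = use g_conf
				    NULL  , // msg5
				    NULL  , // msg5b
				    false , // isRealMerge
				    true  , // allowPageCache
				    false , // forceLocalIndexdb
				    false , // noSplit
				    -1    ) ) // forceParitySplit
		return false;      // blocked; gotListWrapper runs on completion
	gotListWrapper ( st ); // completed (or failed) synchronously
	return true;
}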