/**
 * \internal Make sure the DBF table, which points to all the memory regions
 * collected by CCCPSE, is as compact as possible. The goal is to cut the
 * number of ELF Phdrs down as much as possible by merging adjoining memory
 * regions. To do this, it needs to mark each DBFITEM, which is in protected
 * memory, SPK=0. This function therefore runs in SPK=0 for its duration and
 * restores SPK=1 prior to return.
 *
 * \param[in] dbfHdr Address of the Java dump buffer as returned from CCCPSE.
 *
 * \return The number of program headers after the merge.
 */
static uintptr_t
adjustDBFTable(DBFHDR *dbfHdr)
{
	DBFITEM *dbfTbl = (DBFITEM *)(dbfHdr + 1);	/* DBTI[0] starts right after dbfHdr */
	DBFITEM *pred = dbfTbl;				/* Predecessor DBTI */
	/*
	 * For the first pass only, set pred = curr. We never want to address
	 * zero. We'll use it only to test bitfields which cannot possibly be
	 * set for the very first item.
	 */
	DBFITEM *curr = dbfTbl;				/* Current DBTI under examination */
	DBFITEM *succ = pred + 1;			/* Successor DBTI */
	DBFITEM *limit = dbfTbl + (dbfHdr->ijavcnt);	/* DBTI[n+1] address */
	uintptr_t phnum = 0;		/* Number of expected Elf64_Phdrs */
	uintptr_t wGap;			/* ACCUMULATOR: total ibc gap */
	void *wVstart = NULL;		/* WORK: new p_vaddr */
	IDATA chunkSize = 0L;		/* Semi-permanent per-set size */
	IDATA totalIBCSize = 0L;	/* Total sizes of all writeable sets */

	KEY0();				/* JDBs are in SPK=0; get authority to write. */
	while (curr < limit) {
		memset(DBTITYPE, 0, sizeof(char) + (sizeof(DBTIGAP)));	/* Clear DBTI tail */
		calcGap(curr, succ, &wGap);	/* Get GAP size. */
		if (0 == wGap) {
			/*
			 * This pair qualifies to be combined.
			 */
			if (PSETSTRT || PSETMIDL) {
				CSETMIDL = TRUE;	/* It is either mid-set ... */
			} else {			/* ... or starts a new set. */
				CSETSTRT = TRUE;
				wVstart = DBTIVADDR;	/* If it's a start, hold the start vaddr. */
			}
			chunkSize += DBTISIZ;	/* Accumulate size for start and mid-set items alike. */
			DBTIGAP = wGap;
		} else {
			/*
			 * This pair does not qualify to be combined. Is it last
			 * in set, or is its predecessor not part of this one?
			 */
			if (PSETMIDL || PSETSTRT) {	/* It's the end of a set, so summarize	*/
				CSETLAST = TRUE;	/* the set's start adrs & size in this	*/
				chunkSize += DBTISIZ;	/* JDB index item; it's the one we'll	*/
				DBTISIZ = chunkSize;	/* be building the Elf64_Phdr from.	*/
				chunkSize = 0L;
				DBTIVADDR = wVstart;	/* Reset starting address & size	*/
				wVstart = NULL;		/* accumulators.			*/
			} else {
				CSETSOLO = TRUE;	/* This one is a standalone item. */
			}
			/*
			 * We will only write Elf64_Phdr items for those DBF
			 * items marked CSETLAST or CSETSOLO.
			 */
			if (CSETLAST || CSETSOLO) {
				phnum += 1;	/* We will write a Phdr because of this item. */
			}
		}
		DBTIGAP = wGap;		/* Save the gap for later analysis */
		pred = curr;		/* Advance our pair pointers */
		curr = succ;
		succ += 1;
	}
	UNKEY();			/* Restore SPK=1 */
	return phnum;			/* Return revised count to caller. */
}
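/*
 * For orientation: a minimal sketch of the adjacency test that calcGap()
 * above presumably performs. The 'vaddr' and 'size' member names here are
 * hypothetical stand-ins for the real DBFITEM fields that the DBTIVADDR
 * and DBTISIZ macros dereference; the authoritative layout comes from the
 * CCCPSE headers. A gap of zero means the successor region begins exactly
 * where the current one ends, so the pair can be covered by one Elf64_Phdr.
 */
static void
calcGapSketch(DBFITEM *curr, DBFITEM *succ, uintptr_t *wGap)
{
	uintptr_t currEnd = (uintptr_t)curr->vaddr + curr->size;	/* end of current region */
	*wGap = (uintptr_t)succ->vaddr - currEnd;			/* 0 == regions adjoin */
}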
/**
 * \internal Build the z/TPF ELF-format core dump from the JDB built by CCCPSE.
 *
 * First figure out the absolute file path (starts with a '/' and ends with a
 * file suffix) we are to write the dump to, then ensure we have enough room
 * left in the Java dump buffer CCCPSE left for us. If we do not, return an
 * error. Otherwise, section the buffer space off, building in order:
 *	- The ELF64 header
 *	- The ELF64 Program Header section
 *	- The ELF64 NOTE section
 *
 * Finally, write all the above to the indicated filename, and follow it with
 * a copy of storage contents from the Java dump buffer, which is laid out in
 * ascending address order, with the Elf64_Phdrs created to match the dump
 * buffer. Expect a final file size measured in tens of megabytes.
 *
 * As a matter of protocol, set the first 8 bytes of the Java Dump Buffer
 * (struct ijavdbf) to zeros so CCCPSE knows it's okay to write over its
 * contents once we're done working with it.
 *
 * The arg block is used for the bi-directional passing of data. Those of its
 * members which are pointers (most of them, in fact) are used mainly for
 * output or at least have an output role. The fields of <arg> which are
 * required are shown as input parameters below. The fields of <arg> which
 * are used for output are detailed as return values.
 *
 * Since the pthread_create() call limits us to one pointer to type void as
 * the function's input, and the function is required to return a void
 * pointer, this function takes the address of the <arg> block as its input
 * parameter and returns a pointer to type char representing the full path
 * name of the ELF-format core dump file. There are also fields in <arg>
 * that will be updated as described below.
 *
 * \param[in,out] arg		Pointer to a user-created datatype 'args',
 *				declared in header file j9osdump_helpers.h
 * \param[in] arg->OSFilename	Pointer to scratch storage for a path + file
 *				name string. It is presumed to be at least
 *				PATH_MAX+1 in size.
 * \param[in] arg->wkspcSize	32-bit quantity representing the size in bytes
 *				of the wkSpace field, following.
 * \param[in] arg->wkSpace	64-bit pointer to scratch workspace for use by
 *				this function. It is recommended to make it
 *				equal to PATH_MAX, since file & path names
 *				will be built in this space.
 * \param[in,out] arg->flags	Indicators as to what is present and what is
 *				not.
 * \param[in] arg->sii		Pointer to scratch storage to be used forever
 *				more as a siginfo_t block
 * \param[in] arg->uct		Pointer to scratch storage to be used forever
 *				more as a ucontext_t block
 * \param[in] arg->sct		Pointer to scratch storage to be used forever
 *				more as a sigcontext block
 * \param[in] arg->portLibrary	Pointer to an initialized OMRPortLibrary
 *				block. If there isn't one at call time, leave
 *				this value NULL and set flag
 *				J9ZTPF_NO_PORT_LIBRARY
 * \param[in] arg->dibPtr	Address of the DIB attached to the faulting
 *				UOW at post-interrupt time.
 *
 * \returns Pointer to a NUL-terminated string representing the absolute path
 *	    name at which the core dump file was written.
 *
 * \returns arg->sii	    Filled in.
 * \returns arg->uct	    Filled in.
 * \returns arg->sct	    Filled in.
 * \returns arg->rc	    Final return code. Zero if successful (core file
 *			    built), non-zero if not.
 * \returns arg->OSFilename Same as the function return value.
 */
void *
ztpfBuildCoreFile(void *argv_block)
{
#define MOUNT_TFS 4			/* TPF Filesystem equate from imount.h */
	args *arg = (args *)argv_block;
	uint8_t *buffer, *endBuffer;
	DBFHDR *dbfptr;			/* Ptr to JDB's index header */
	DIB *dibPtr = dibAddr(arg);	/* Ptr to the Dump I'chg Block */
	Elf64_Ehdr *ehdrp;		/* Pointer to Elf64 file header */
	Elf64_Phdr *phdrp;		/* Pointer to Elf64_Phdr block */
	Elf64_Nhdr *narea;		/* Pointer to the ELF NOTE data */
	char pathName[PATH_MAX];	/* Working buffer for core.* fname. */
	char *ofn = dumpFilename(arg);	/* Output dump file path */
	uint32_t ofd;			/* File descriptor for output dump */
	uintptr_t rc;			/* Working return code d.o. */
	uintptr_t wPtr;			/* Working byte-sized pointer d.o. */
	uint64_t phCount;		/* Counter of Elf64_Phdrs required */
	uint8_t *imageBuffer;		/* Start of the JDB's ICB */
	uint64_t imageBufferSize;	/* Size of data in the ICB */
	uint64_t octetsToWrite = 0UL;	/* Count of bytes to write */
	uint64_t octetsWritten = 0UL;	/* Count of bytes written */
	uint64_t spcAvailable;

	/*
	 * If there is a Java dump buffer belonging to this process, then
	 * convert its contents into an Elf64 core dump file; otherwise
	 * return failure.
	 */
	if (!(dibPtr->dibjdb)) {	/* No dump buffer? Not possible. */
		dumpFlags(arg) |= J9TPF_NO_JAVA_DUMPBUFFER;
		returnCode(arg) = -1;	/* Set error flags & bad RC ... */
		return NULL;		/* 'Bye. */
	}
	dbfptr = dibPtr->dibjdb;	/* Pick up the dump buffer ptr */
	if (0L == dbfptr->ijavcnt) {	/* Did CCCPSE write us one? */
		returnCode(arg) = -1;	/* Nope. JDB is locked... */
		dumpFlags(arg) |= J9TPF_JDUMPBUFFER_LOCKED;
		return NULL;		/* See ya next time. */
	}
	/*
	 * Calculate the start, end, and net length of the output buffer we'll
	 * use for the ELF-dictated start of the file content. The start
	 * address should, as a matter of good practice, start on a paragraph
	 * boundary.
	 */
	buffer = (uint8_t *)(dbfptr->ijavbfp);	/* Get buffer start as uint8_t ptr */
	buffer = (uint8_t *)NEXT_PARA(buffer);	/* Start it on next paragraph boundary. */
	endBuffer = buffer + dbfptr->ijavbfl;	/* Get bytes left in buffer */
	spcAvailable = endBuffer - buffer;	/* Get corrected count of bytes */
	phCount = dbfptr->ijavcnt;
	int numJavaPgms = sizeof(javaPgmsToDump) / 5;	/* Each table entry is a 5-byte program name */
	octetsToWrite = sizeof(Elf64_Ehdr) +
		((phCount + 1 + numJavaPgms) * sizeof(Elf64_Phdr)) + NOTE_TABLE_SIZE;
	/*
	 * Uh oh. Not enough free space remaining in the dump buffer. Now
	 * we've gotta go to the heap and see if there's enough there. Not
	 * much hope; but we have to try it.
	 */
	if (octetsToWrite > spcAvailable) {	/* Check for available memory, */
		buffer = malloc64(octetsToWrite); /* anywhere. We're desperate. */
		if (!buffer) {			/* If we can't buffer our I/Os, */
			returnCode(arg) = -1;	/* we are done. Indicate error. */
			errMsg(arg, "Cannot buffer dump file, out of memory");
			dumpFlags(arg) |= J9TPF_OUT_OF_BUFFERSPACE;
			KEY0();
			dbfptr->ijavcnt = 0L;	/* Unlock the JDB so CCCPSE can reuse it */
			UNKEY();
			return NULL;		/* See ya 'round. */
		}
	}
	/*
	 * We're ready to write the file. First, we need a full path + filename
	 * to represent the "OS filename" we're going to write. Get that path
	 * from the final file name passed in the <tt>args</tt> block, then
	 * follow that with "core.%X.%d" with the TOD in the second node and
	 * the PID number in the third. In this way, we can identify the OS
	 * filename later.
	 *
	 * Next, we'll try to open it in CREATE+WRITE_ONLY mode. If it fails,
	 * halt; else write away!
	 */
	splitPathName(ofn, pathName);
	/*
	 * The "OS filename" will look like ${path}/core.${TOD_in_hex}.${pid}
	 */
	sprintf(workSpace(arg), "core.%lX.%d", dibPtr->dstckf,
		dumpSiginfo(arg)->si_pid);
	{
		int pathLen = strlen(pathName);
		char ebcdicPath[pathLen + 1];
		a2e_len(pathName, ebcdicPath, pathLen);
		ebcdicPath[pathLen] = '\0';	/* a2e_len may not terminate the string */
		int fsystype = pathconf(ebcdicPath, _TPF_PC_FS_TYPE);
		if (fsystype == MOUNT_TFS) {	/* dir path name is within the TFS */
			errMsg(arg, "Cannot use the tfs for java dumps: path used='%s'\n",
			       pathName);
			returnCode(arg) = -1;
			KEY0();
			dbfptr->ijavcnt = 0L;	/* Unlock the JDB so CCCPSE can reuse it */
			UNKEY();
			return NULL;
		}
	}
	strcat(pathName, workSpace(arg));
	ofd = open(pathName, O_WRONLY | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR);
	if (-1 == ofd) {
		errMsg(arg, "Cannot open() filename %s: %s\n", pathName,
		       strerror(errno));
		returnCode(arg) = -1;
		KEY0();
		dbfptr->ijavcnt = 0L;	/* Unlock the JDB so CCCPSE can reuse it */
		UNKEY();
		return NULL;
	}
	/*
	 * The file is named and opened. Now we have to go build its
	 * ELF-dictated parts. Set up pointers to the start of each
	 * sub-section, and then go build them. When that's complete, write
	 * the file in two parts: first the ELF-dictated start, then follow
	 * that with the data in the ICB.
	 */
	wPtr = (uintptr_t)buffer;	/* Calc section addresses with single- */
					/* byte arithmetic. */
	ehdrp = (Elf64_Ehdr *)wPtr;	/* Elf64_Ehdr pointer */
	phdrp = (Elf64_Phdr *)(ehdrp + 1); /* Elf64_Phdr pointer */
	buildELFHeader(ehdrp);
	uint64_t numBytes = 0;
	uint64_t pCount = buildELFPheaderBlk(phdrp, dbfptr, phCount, &numBytes);
	KEY0();				/* Get into SPK 0, then write the */
	ehdrp->e_phnum = pCount;	/* absolutely final Phdr count and */
	ehdrp->e_phoff = 0x40;		/* offset of its table into place in */
	UNKEY();			/* the Ehdr, then get back to SPK 1. */
	wPtr = (uintptr_t)phdrp;	/* Calculate the end address */
	wPtr += ((phCount + 1 + numJavaPgms) * sizeof(Elf64_Phdr)); /* of the Elf64_Phdr section */
	narea = (Elf64_Nhdr *)wPtr;	/* Calculate the address of NOTE sec, */
	buildELFNoteArea(narea, dibPtr); /* and go write it there. */
	/*
	 * Write the ELF-dictated portion of the file first.
	 */
	octetsWritten = writeDumpFile(ofd, buffer, octetsToWrite);
	if (-1 == octetsWritten) {
		errMsg(arg, "Error writing dump file %s: %s", ofn, strerror(errno));
		dumpFlags(arg) |= J9TPF_FILE_SYSTEM_ERROR;
		KEY0();
		dbfptr->ijavcnt = 0L;	/* Unlock the JDB so CCCPSE can reuse it */
		UNKEY();
		return NULL;
	}
	/*
	 * Finish the output with the ICB portion of the Java Dump Buffer.
	 */
	imageBuffer = (uint8_t *)cinfc_fast(CINFC_CMMJDB);
	octetsWritten = writeDumpFile(ofd, imageBuffer, numBytes);
	if (-1 == octetsWritten) {
		errMsg(arg, "Error writing dump file %s: %s", ofn, strerror(errno));
		dumpFlags(arg) |= J9TPF_FILE_SYSTEM_ERROR;
		KEY0();
		dbfptr->ijavcnt = 0L;	/* Unlock the JDB so CCCPSE can reuse it */
		UNKEY();
		return NULL;
	}
	int i = 0;
	for (i = 0; i < numJavaPgms; i++) {
		struct pat *pgmpat = progc(javaPgmsToDump[i], PROGC_PBI);
		struct ifetch *pgmbase = pgmpat->patgca;
		if (pgmbase != NULL) {
			int offset = pgmbase->_iftch_txt_off +
				pgmbase->_iftch_txt_size + 0x1000;
			char *text = ((char *)pgmbase) + offset;
			uint64_t size = pgmpat->patpsize - offset;
			octetsWritten = writeDumpFile(ofd, text, size);
			if (-1 == octetsWritten) {
				errMsg(arg, "Error writing dump file %s: %s",
				       ofn, strerror(errno));
				dumpFlags(arg) |= J9TPF_FILE_SYSTEM_ERROR;
				KEY0();
				dbfptr->ijavcnt = 0L;	/* Unlock the JDB so CCCPSE can reuse it */
				UNKEY();
				return NULL;
			}
		}
	}
	/*
	 * That's all, folks. Close the file and return the so-called
	 * "OS Filename" to the caller as if the OS had written it.
	 */
	rc = close(ofd);		/* Only try to close() the file once */
	if (-1 == rc) {
		errMsg(arg, "I/O error attempting to close %s:%s\n", ofn,
		       strerror(errno));
		KEY0();
		dbfptr->ijavcnt = 0L;	/* Unlock the JDB so CCCPSE can reuse it */
		UNKEY();
		return NULL;
	}
	/*
	 * Make sure we have a recognizable IPC permission on the OS filename,
	 * then make sure the JDB buffer is unlocked in case CCCPSE needs it
	 * again. Note that we chmod() the full path just built, not the bare
	 * file name held in the workspace.
	 */
	rc = chmod(pathName, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH);
	KEY0();
	dbfptr->ijavcnt = 0L;		/* Unlock the JDB so CCCPSE can reuse it */
	UNKEY();
	/*
	 * Return the filename to the caller. Remember that the buffer
	 * containing it is non-JVM-managed heap space and should be free()d
	 * back to it to avoid a memory leak.
	 */
	strcpy(dumpFilename(arg), ofn);	/* Store it in the args block so the */
	return (void *)ofn;		/* caller always has it, and go back. */
}
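/*
 * For orientation: a minimal sketch of what buildELFHeader() above
 * presumably fills in, using the standard <elf.h> constants for a
 * big-endian 64-bit s390x core file. The real implementation elsewhere
 * in this file may set additional fields; e_phoff and e_phnum are
 * patched in afterward by ztpfBuildCoreFile(), as shown above.
 */
static void
buildELFHeaderSketch(Elf64_Ehdr *ehdrp)
{
	memset(ehdrp, 0, sizeof(*ehdrp));
	memcpy(ehdrp->e_ident, ELFMAG, SELFMAG);	/* 0x7f 'E' 'L' 'F' */
	ehdrp->e_ident[EI_CLASS] = ELFCLASS64;		/* 64-bit object */
	ehdrp->e_ident[EI_DATA] = ELFDATA2MSB;		/* z/Architecture is big-endian */
	ehdrp->e_ident[EI_VERSION] = EV_CURRENT;
	ehdrp->e_type = ET_CORE;			/* core-dump file type */
	ehdrp->e_machine = EM_S390;			/* s390/s390x */
	ehdrp->e_version = EV_CURRENT;
	ehdrp->e_ehsize = sizeof(Elf64_Ehdr);		/* 0x40: matches the e_phoff set above */
	ehdrp->e_phentsize = sizeof(Elf64_Phdr);
}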
// . THIS Msg0 class must be alloc'd, i.e. not on the stack, etc.
// . if list is stored locally this tries to get it locally
// . otherwise tries to get the list from the network
// . returns false if blocked, true otherwise
// . sets g_errno on error
// . NOTE: i was having problems with queries being cached too long, you
//   see the cache here is a NETWORK cache, so when the machine that owns
//   the list updates it on disk it can't flush our cache... so use a small
//   maxCacheAge of like 30 seconds or so...
bool Msg0::getList ( long long hostId      , // host to ask (-1 if none)
		     long      ip          , // info on hostId
		     short     port        ,
		     long      maxCacheAge , // max cached age in seconds
		     bool      addToCache  , // add net recv'd list to cache?
		     char      rdbId       , // specifies the rdb
		     char     *coll        ,
		     RdbList  *list        ,
		     //key_t    startKey   ,
		     //key_t    endKey     ,
		     char     *startKey    ,
		     char     *endKey      ,
		     long      minRecSizes , // use -1 for no max
		     void     *state       ,
		     void    (* callback)(void *state ),//, RdbList *list ) ,
		     long      niceness    ,
		     bool      doErrorCorrection ,
		     bool      includeTree ,
		     bool      doMerge     ,
		     long      firstHostId ,
		     long      startFileNum ,
		     long      numFiles    ,
		     long      timeout     ,
		     long long syncPoint   ,
		     long      preferLocalReads ,
		     Msg5     *msg5        ,
		     Msg5     *msg5b       ,
		     bool      isRealMerge ,
		     //#ifdef SPLIT_INDEXDB
		     bool      allowPageCache ,
		     bool      forceLocalIndexdb ,
		     bool      noSplit , // doIndexdbSplit ,
		     long      forceParitySplit ) {
		     //#else
		     //bool    allowPageCache ) {
		     //#endif
	// this is obsolete! mostly, but we need it for PageIndexdb.cpp to
	// show a "termlist" for a given query term in its entirety so you
	// don't have to check each machine in the network. if this is true it
	// means to query each split and merge the results together into a
	// single unified termlist. only applies to indexdb/datedb.
	//if ( doIndexdbSplit ) { char *xx = NULL; *xx = 0; }
	// note this because if caller is wrong it hurts performance major!!
	//if ( doIndexdbSplit )
	//	logf(LOG_DEBUG,"net: doing msg0 with indexdb split true");
	// warning
	if ( ! coll ) log(LOG_LOGIC,"net: NULL collection. msg0.");

	//if ( doIndexdbSplit ) { char *xx=NULL;*xx=0; }

	// reset the list they passed us
	list->reset();
	// get keySize of rdb
	m_ks = getKeySizeFromRdbId ( rdbId );
	// if startKey > endKey, don't read anything
	//if ( startKey > endKey ) return true;
	if ( KEYCMP(startKey,endKey,m_ks)>0 ) { char *xx=NULL;*xx=0; }//rettrue
	// . reset hostid if it is dead
	// . this is causing UOR queries to take forever when we have a dead host
	if ( hostId >= 0 && g_hostdb.isDead ( hostId ) ) hostId = -1;
	// no longer accept negative minrecsize
	if ( minRecSizes < 0 ) {
		g_errno = EBADENGINEER;
		log(LOG_LOGIC, "net: msg0: Negative minRecSizes no longer "
		    "supported.");
		char *xx=NULL;*xx=0;
		return true;
	}
	// debug msg
	//if ( niceness != 0 ) log("HEY start");
	// ensure startKey last bit clear, endKey last bit set
	//if ( (startKey.n0 & 0x01) == 0x01 )
	//	log("Msg0::getList: warning startKey lastbit set");
	//if ( (endKey.n0 & 0x01) == 0x00 )
	//	log("Msg0::getList: warning endKey lastbit clear");
	// remember these
	m_state      = state;
	m_callback   = callback;
	m_list       = list;
	m_hostId     = hostId;
	m_niceness   = niceness;
	//m_ip       = ip;
	//m_port     = port;
	m_addToCache = addToCache;
	// . these define our request 100%
	//m_startKey = startKey;
	//m_endKey   = endKey;
	KEYSET(m_startKey,startKey,m_ks);
	KEYSET(m_endKey,endKey,m_ks);
	m_minRecSizes    = minRecSizes;
	m_rdbId          = rdbId;
	m_coll           = coll;
	m_isRealMerge    = isRealMerge;
	m_allowPageCache = allowPageCache;
	// . group to ask is based on the first key
	// . we only do 1 group per call right now
	// . groupMask must turn on higher bits first (count downwards kinda)
	// . titledb and spiderdb use special masks to get groupId

	// if diffbot.cpp is reading spiderdb from each shard we have to
	// get groupid from hostid here lest we core in getGroupId() below.
	// it does that for dumping spiderdb to the client browser. they
	// can download the whole enchilada.
	if ( hostId >= 0 && m_rdbId == RDB_SPIDERDB )
		m_shardNum = 0;
	// did they force it? core until i figure out what this is
	else if ( forceParitySplit >= 0 )
		//m_groupId =  g_hostdb.getGroupId ( forceParitySplit );
		m_shardNum = forceParitySplit;
	else
		//m_groupId = getGroupId ( m_rdbId , startKey , ! noSplit );
		m_shardNum = getShardNum ( m_rdbId , startKey );

	// if we are looking up a termlist in posdb that is split by termid and
	// not the usual docid then we have to set this posdb key bit that
	// tells us that ...
	if ( noSplit && m_rdbId == RDB_POSDB )
		m_shardNum = g_hostdb.getShardNumByTermId ( startKey );

	// how is this used?
	//if ( forceLocalIndexdb ) m_groupId = g_hostdb.m_groupId;
	if ( forceLocalIndexdb ) m_shardNum = getMyShardNum();

	// . store these parameters
	// . get a handle to the rdb in case we can satisfy locally
	// . returns NULL and sets g_errno on error
	QUICKPOLL((m_niceness));
	Rdb *rdb = getRdbFromId ( m_rdbId );
	if ( ! rdb ) return true;
	// we need the fixedDataSize
	m_fixedDataSize = rdb->getFixedDataSize();
	m_useHalfKeys   = rdb->useHalfKeys();
	// . debug msg
	// . Msg2 does this when checking for a cached compound list.
	//   compound lists do not actually exist, they are merges of smaller
	//   UOR'd lists.
	if ( maxCacheAge != 0 && ! addToCache && (numFiles > 0 || includeTree))
		log(LOG_LOGIC,"net: msg0: "
		    "Weird. check but don't add... rdbid=%li.",(long)m_rdbId);
	// set this here since we may not call msg5 if list not local
	//m_list->setFixedDataSize ( m_fixedDataSize );

	// . now that we do load balancing we don't want to do a disk lookup
	//   even if local if we are merging or dumping
	// . UNLESS g_conf.m_preferLocalReads is true
	if ( preferLocalReads == -1 )
		preferLocalReads = g_conf.m_preferLocalReads;

	// . always prefer local for full split clusterdb
	// . and keep the tfndb/titledb lookups in the same stripe
	// . so basically we can't do biased caches if fully split
	//if ( g_conf.m_fullSplit ) preferLocalReads = true;
	preferLocalReads = true;

	// is it stored locally?
	bool isLocal = ( m_hostId == -1 &&
			 //g_hostdb.m_groupId == m_groupId );
			 m_shardNum == getMyShardNum() );
	// only do local lookups if this is true
	if ( ! preferLocalReads ) isLocal = false;

	/*
	m_numSplit = 1;
	if ( g_hostdb.m_indexSplits > 1 &&
	     ( rdbId == RDB_POSDB || rdbId==RDB_DATEDB)&&
	     ! forceLocalIndexdb && doIndexdbSplit ) {
		isLocal  = false;
		//m_numSplit = INDEXDB_SPLIT;
		m_numSplit = g_hostdb.m_indexSplits;
		char *xx=NULL;*xx=0;
	}
	*/

	/*
	long long singleDocIdQuery = 0LL;
	if ( rdbId == RDB_POSDB ) {
		long long d1 = g_posdb.getDocId(m_startKey);
		long long d2 = g_posdb.getDocId(m_endKey);
		if ( d1+1 == d2 ) singleDocIdQuery = d1;
	}

	// . try the LOCAL termlist cache
	// . so when msg2 is evaluating a gbdocid:| query and it has to
	//   use msg0 to go across the network to get the same damn termlist
	//   over and over again for the same docid, this will help a lot.
	// . ideally it'd be nice if the seo pipe in xmldoc.cpp can try to
	//   send the same gbdocid:xxxx docids to the same hosts. maybe hash
	//   based on docid into the list of hosts and if that host is busy
	//   just chain until we find someone not busy.
	if ( singleDocIdQuery &&
	     getListFromTermListCache ( coll,
					m_startKey,
					m_endKey,
					maxCacheAge,
					list ) )
		// found!
		return true;
	*/

	// but always local if only one host
	if ( g_hostdb.getNumHosts() == 1 ) isLocal = true;

	// force a msg0 if doing a docid restrictive query like
	// gbdocid:xxxx|<query> so we call cacheTermLists()
	//if ( singleDocIdQuery ) isLocal = false;

	// . if the group is local then do it locally
	// . Msg5::getList() returns false if blocked, true otherwise
	// . Msg5::getList() sets g_errno on error
	// . don't do this if m_hostId was specified
	if ( isLocal ) { // && !g_conf.m_interfaceMachine ) {
		if ( msg5 ) {
			m_msg5 = msg5;
			m_deleteMsg5 = false;
		}
		else {
			try { m_msg5 = new ( Msg5 ); }
			catch ( ... ) {
				g_errno = ENOMEM;
				log("net: Local alloc for disk read failed "
				    "while trying to read data for %s. "
				    "Trying remote request.",
				    getDbnameFromId(m_rdbId));
				goto skip;
			}
			mnew ( m_msg5 , sizeof(Msg5) , "Msg0" );
			m_deleteMsg5 = true;
		}
		QUICKPOLL(m_niceness);
		// same for msg5b
		if ( msg5b ) {
			m_msg5b = msg5b;
			m_deleteMsg5b = false;
		}
		/*
		else if ( m_rdbId == RDB_TITLEDB ) {
			try { m_msg5b = new ( Msg5 ); }
			catch ( ... ) {
				g_errno = ENOMEM;
				log("net: Local alloc for disk read failed "
				    "while trying to read data for %s. "
				    "Trying remote request. 2.",
				    getDbnameFromId(m_rdbId));
				goto skip;
			}
			mnew ( m_msg5b , sizeof(Msg5) , "Msg0b" );
			m_deleteMsg5b = true;
		}
		*/
		QUICKPOLL(m_niceness);
		if ( ! m_msg5->getList ( rdbId,
					 coll ,
					 m_list ,
					 m_startKey ,
					 m_endKey ,
					 m_minRecSizes ,
					 includeTree , // include Tree?
					 addToCache  , // addToCache?
					 maxCacheAge ,
					 startFileNum ,
					 numFiles ,
					 this ,
					 gotListWrapper2 ,
					 niceness ,
					 doErrorCorrection ,
					 NULL , // cacheKeyPtr
					 0    , // retryNum
					 -1   , // maxRetries
					 true , // compensateForMerge
					 syncPoint ,
					 NULL,//m_msg5b ,
					 m_isRealMerge ,
					 m_allowPageCache ) )
			return false;
		// nuke it
		reset();
		return true;
	}
skip:
	// debug msg
	if ( g_conf.m_logDebugQuery )
		log(LOG_DEBUG,"net: msg0: Sending request for data to "
		    "shard=%lu listPtr=%li minRecSizes=%li termId=%llu "
		    //"startKey.n1=%lx,n0=%llx (niceness=%li)",
		    "startKey.n1=%llx,n0=%llx (niceness=%li)",
		    //g_hostdb.makeHostId ( m_groupId ) ,
		    m_shardNum,
		    (long)m_list,
		    m_minRecSizes,
		    g_posdb.getTermId(m_startKey) ,
		    //m_startKey.n1,m_startKey.n0 , (long)m_niceness);
		    KEY1(m_startKey,m_ks),KEY0(m_startKey),
		    (long)m_niceness);

	char *replyBuf        = NULL;
	long  replyBufMaxSize = 0;
	bool  freeReply       = true;

	// adjust niceness for net transmission
	bool realtime = false;
	//if ( minRecSizes + 32 < TMPBUFSIZE ) realtime = true;

	// if we're niceness 0 we need to pre-allocate for reply since it
	// might be received within the asynchronous signal handler which
	// cannot call mmalloc()
	if ( realtime ) { // niceness <= 0 || netnice == 0 ) {
		// . we should not get back more than minRecSizes bytes since
		//   we are now performing merges
		// . it should not slow things down too much since the hashing
		//   is 10 times slower than merging anyhow...
		// . CAUTION: if rdb is not fixed-datasize then this will
		//   not work for us! it can exceed m_minRecSizes.
		replyBufMaxSize = m_minRecSizes ;
		// . get a little extra to fix the error where we ask for 64
		//   but get 72
		// . where is that coming from?
		// . when getting titleRecs we often exceed the minRecSizes
		// . ?Msg8? was having trouble. was short 32 bytes sometimes.
		replyBufMaxSize += 36;
		// why add ten percent?
		//replyBufMaxSize *= 110 ;
		//replyBufMaxSize /= 100 ;
		// make a buffer to hold the reply
		//#ifdef SPLIT_INDEXDB
		/*
		if ( m_numSplit > 1 ) {
			m_replyBufSize = replyBufMaxSize * m_numSplit;
			replyBuf = (char *) mmalloc(m_replyBufSize, "Msg0");
			m_replyBuf  = replyBuf;
			freeReply = false;
		}
		else
		*/
		//#endif
		replyBuf = (char *) mmalloc(replyBufMaxSize , "Msg0");
		// g_errno is set and we return true if it failed
		if ( ! replyBuf ) {
			log("net: Failed to pre-allocate %li bytes to hold "
			    "data read remotely from %s: %s.",
			    replyBufMaxSize,getDbnameFromId(m_rdbId),
			    mstrerror(g_errno));
			return true;
		}
	}

	// . make a request with the info above (note: not in network order)
	// . IMPORTANT!!!!! if you change this change
	//   Multicast.cpp::sleepWrapper1 too!!!!!!!!!!!!
	//   no, not anymore, we commented out that request peeking code
	char *p = m_request;
	*(long long *) p = syncPoint     ; p += 8;
	//*(key_t *)   p = m_startKey    ; p += sizeof(key_t);
	//*(key_t *)   p = m_endKey      ; p += sizeof(key_t);
	*(long *)      p = m_minRecSizes ; p += 4;
	*(long *)      p = startFileNum  ; p += 4;
	*(long *)      p = numFiles      ; p += 4;
	*(long *)      p = maxCacheAge   ; p += 4;
	*p = m_rdbId               ; p++;
	*p = addToCache            ; p++;
	*p = doErrorCorrection     ; p++;
	*p = includeTree           ; p++;
	*p = (char)niceness        ; p++;
	*p = (char)m_allowPageCache; p++;
	KEYSET(p,m_startKey,m_ks)  ; p += m_ks;
	KEYSET(p,m_endKey,m_ks)    ; p += m_ks;
	// NULL terminated collection name
	strcpy ( p , coll );
	p += gbstrlen ( coll );
	*p++ = '\0';
	m_requestSize = p - m_request;

	// ask an individual host for this list if hostId is NOT -1
	if ( m_hostId != -1 ) {
		// get Host
		Host *h = g_hostdb.getHost ( m_hostId );
		if ( ! h ) {
			g_errno = EBADHOSTID;
			log(LOG_LOGIC,"net: msg0: Bad hostId of %lli.",
			    m_hostId);
			return true;
		}
		// if niceness is 0, use the higher priority udpServer
		UdpServer *us ;
		unsigned short port;
		QUICKPOLL(m_niceness);
		//if ( niceness <= 0 || netnice == 0 ) {
		//if ( realtime ) {
		//	us = &g_udpServer2; port = h->m_port2; }
		//else {
		us = &g_udpServer ; port = h->m_port ;
		// . returns false on error and sets g_errno, true otherwise
		// . calls callback when reply is received (or error)
		// . we return true if it returns false
		if ( ! us->sendRequest ( m_request ,
					 m_requestSize ,
					 0x00 , // msgType
					 h->m_ip ,
					 port ,
					 m_hostId ,
					 NULL , // the slotPtr
					 this ,
					 gotSingleReplyWrapper ,
					 timeout ,
					 -1 , // backoff
					 -1 , // maxwait
					 replyBuf ,
					 replyBufMaxSize ,
					 m_niceness ) ) // cback niceness
			return true;
		// return false cuz it blocked
		return false;
	}
	// timing debug
	if ( g_conf.m_logTimingNet )
		m_startTime = gettimeofdayInMilliseconds();
	else
		m_startTime = 0;
	//if ( m_rdbId == RDB_INDEXDB ) log("Msg0:: getting remote indexlist. "
	//	"termId=%llu, "
	//	"groupNum=%lu",
	//	g_indexdb.getTermId(m_startKey) ,
	//	g_hostdb.makeHostId ( m_groupId ) );

	/*
	// make the cache key so we can see what remote host cached it, if any
	char cacheKey[MAX_KEY_BYTES];
	//key_t cacheKey = makeCacheKey ( startKey       ,
	makeCacheKey ( startKey     ,
		       endKey       ,
		       includeTree  ,
		       minRecSizes  ,
		       startFileNum ,
		       numFiles     ,
		       cacheKey     ,
		       m_ks         );
	*/

	// . get the top long of the key
	// . i guess this will work for 128 bit keys... hmmmmm
	long keyTop = hash32 ( (char *)startKey , m_ks );

	/*
	// allocate space
	if ( m_numSplit > 1 ) {
		long  need = m_numSplit * sizeof(Multicast) ;
		char *buf  = (char *)mmalloc ( need,"msg0mcast" );
		if ( ! buf ) return true;
		m_mcasts = (Multicast *)buf;
		for ( long i = 0; i < m_numSplit ; i++ )
			m_mcasts[i].constructor();
	}
	*/

	// . otherwise, multicast to a host in group "groupId"
	// . returns false and sets g_errno on error
	// . calls callback on completion
	// . select first host to send to in group based on upper 32 bits
	//   of termId (m_startKey.n1)
	//#ifdef SPLIT_INDEXDB
	// . need to send out to all the indexdb split hosts
	m_numRequests = 0;
	m_numReplies  = 0;
	//for ( long i = 0; i < m_numSplit; i++ ) {
	QUICKPOLL(m_niceness);
	//long gr;
	char *buf;
	/*
	if ( m_numSplit > 1 ) {
		gr  = g_indexdb.getSplitGroupId ( baseGroupId, i );
		buf = &replyBuf[i*replyBufMaxSize];
	}
	else {
	*/
	//gr  = m_groupId;
	buf = replyBuf;
	//}
	// get the multicast
	Multicast *m = &m_mcast;
	//if ( m_numSplit > 1 ) m = &m_mcasts[i];

	if ( ! m->send ( m_request ,
	//#else
	//if ( ! m_mcast.send ( m_request ,
	//#endif
			 m_requestSize,
			 0x00 , // msgType 0x00
			 false , // does multicast own request?
			 m_shardNum ,
			 //#ifdef SPLIT_INDEXDB
			 //gr , // group + offset
			 //#else
			 //m_groupId , // group to send to (groupKey)
			 //#endif
			 false , // send to whole group?
			 //m_startKey.n1, // key is passed on startKey
			 keyTop , // key is passed on startKey
			 this , // state data
			 NULL , // state data
			 gotMulticastReplyWrapper0 ,
			 timeout , // timeout in seconds (was 30)
			 niceness ,
			 realtime ,
			 firstHostId ,
			 //#ifdef SPLIT_INDEXDB
			 //&replyBuf[i*replyBufMaxSize] ,
			 //#else
			 //replyBuf ,
			 //#endif
			 buf ,
			 replyBufMaxSize ,
			 freeReply , // free reply buf?
			 true , // do disk load balancing?
			 maxCacheAge ,
			 //(key_t *)cacheKey ,
			 // multicast uses it for determining the best
			 // host to send the request to when doing
			 // disk load balancing. if the host has our
			 // data cached, then it will probably get to
			 // handle the request. for now let's just assume
			 // this is a 96-bit key. TODO: fix...
			 0 , // *(key_t *)cacheKey ,
			 rdbId ,
			 minRecSizes ) ) {
		log("net: Failed to send request for data from %s in shard "
		    "#%lu over network: %s.",
		    getDbnameFromId(m_rdbId),m_shardNum,
		    mstrerror(g_errno));
		// no, multicast will free this when it is destroyed
		//if (replyBuf) mfree ( replyBuf , replyBufMaxSize , "Msg22" );
		// but speed it up
		//#ifdef SPLIT_INDEXDB
		m_errno = g_errno;
		m->reset();
		if ( m_numRequests > 0 )
			return false;
		//#else
		//m_mcast.reset();
		//#endif
		return true;
	}
	//#ifdef SPLIT_INDEXDB
	m_numRequests++;
	//#endif
	// we blocked
	return false;
}
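// For reference: whoever handles msg type 0x00 on the receiving end must
// unpack m_request in exactly the order it was packed above. This sketch
// simply mirrors that packing code; the local variable names are
// stand-ins, and the real deserializer registered for msg type 0x00 may
// differ in detail.
static void parseMsg0Request ( char *req ) {
	char *p = req;
	long long syncPoint     = *(long long *)p ; p += 8;
	long  minRecSizes       = *(long *)p      ; p += 4;
	long  startFileNum      = *(long *)p      ; p += 4;
	long  numFiles          = *(long *)p      ; p += 4;
	long  maxCacheAge       = *(long *)p      ; p += 4;
	char  rdbId             = *p++;
	char  addToCache        = *p++;
	char  doErrorCorrection = *p++;
	char  includeTree       = *p++;
	char  niceness          = *p++;
	char  allowPageCache    = *p++;
	char  ks                = getKeySizeFromRdbId ( rdbId );
	char *startKey          = p ; p += ks;
	char *endKey            = p ; p += ks;
	char *coll              = p ; // NULL-terminated collection name
}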