// . this should only be called by m_mcast when it has successfully sent to // ALL hosts in group "groupId" void gotReplyWrapper1 ( void *state , void *state2 ) { Msg1 *THIS = (Msg1 *)state; // print the error if any if ( g_errno && g_errno != ETRYAGAIN ) log("net: Got bad reply when attempting to add data " "to %s: %s",getDbnameFromId(THIS->m_rdbId), mstrerror(g_errno)); //int32_t address = (int32_t)THIS->m_callback; // if our list to send is exhausted then we're done! if ( THIS->m_list->isExhausted() ) { //if(g_conf.m_profilingEnabled){ // g_profiler.startTimer(address, __PRETTY_FUNCTION__); //} if ( THIS->m_callback ) THIS->m_callback ( THIS->m_state ); //if(g_conf.m_profilingEnabled){ // if(!g_profiler.endTimer(address, __PRETTY_FUNCTION__)) // log(LOG_WARN,"admin: Couldn't add the fn %" PRId32, // (int32_t)address); //} return; } // otherwise we got more to send to groups if ( THIS->sendSomeOfList() ) { //if(g_conf.m_profilingEnabled){ // g_profiler.startTimer(address, __PRETTY_FUNCTION__); //} if ( THIS->m_callback ) THIS->m_callback ( THIS->m_state ); //if(g_conf.m_profilingEnabled){ // if(!g_profiler.endTimer(address, __PRETTY_FUNCTION__)) // log(LOG_WARN,"admin: Couldn't add the fn %" PRId32, // (int32_t)address); //} return; } }
// . THIS Msg0 class must be alloc'd, i.e. not on the stack, etc.
// . if list is stored locally this tries to get it locally
// . otherwise tries to get the list from the network
// . returns false if blocked, true otherwise
// . sets g_errno on error
// . NOTE: i was having problems with queries being cached too long, you
//   see the cache here is a NETWORK cache, so when the machines that owns
//   the list updates it on disk it can't flush our cache... so use a small
//   maxCacheAge of like , 30 seconds or so...
//
// Overview of what the body does, in order:
// . resets "list", derives the key size from "rdbId" and deliberately
//   crashes (NULL write) if startKey > endKey or if minRecSizes < 0
// . a "hostId" that g_hostdb reports dead is replaced by -1
// . picks m_shardNum (spiderdb+hostId, forceParitySplit, key based,
//   termid based for posdb when "noSplit", or our own shard when
//   "forceLocalIndexdb")
// . if the read is local, hands it to Msg5 (the caller's "msg5" or a
//   newly allocated one) and returns
// . otherwise serializes the request into m_request and sends it either
//   to the single host "hostId" via g_udpServer or to the shard through
//   m_mcast
//
// Notes on parameters:
// . "ip" and "doMerge" are not referenced in the body; the "port"
//   parameter is shadowed by a local in the single-host branch
// . "preferLocalReads" is unconditionally overwritten with true below, so
//   neither the argument nor g_conf.m_preferLocalReads has any effect
// . "timeout" is passed unchanged to UdpServer::sendRequest() but is
//   multiplied by 1000 for Multicast::send()
//   NOTE(review): presumably seconds vs. milliseconds -- confirm
bool Msg0::getList ( int64_t hostId      , // host to ask (-1 if none)
		     int32_t      ip          , // info on hostId
		     int16_t     port        ,
		     int32_t      maxCacheAge , // max cached age in seconds
		     bool      addToCache  , // add net recv'd list to cache?
		     char      rdbId       , // specifies the rdb
		     collnum_t collnum ,
		     RdbList  *list        ,
		     const char     *startKey    ,
		     const char     *endKey      ,
		     int32_t      minRecSizes ,  // must be >= 0, negative values crash below
		     void     *state       ,
		     void    (* callback)(void *state ),//, RdbList *list ) ,
		     int32_t      niceness    ,
		     bool      doErrorCorrection ,
		     bool      includeTree ,
		     bool      doMerge     ,
		     int32_t      firstHostId   ,
		     int32_t      startFileNum  ,
		     int32_t      numFiles      ,
		     int64_t      timeout       ,
		     int64_t syncPoint     ,
		     int32_t      preferLocalReads ,
		     Msg5     *msg5             ,
		     bool      isRealMerge      ,
		     bool      allowPageCache    ,
		     bool      forceLocalIndexdb ,
		     bool      noSplit ,
		     int32_t      forceParitySplit  ) {
	logTrace( g_conf.m_logTraceMsg0, "BEGIN. hostId: %" PRId64", rdbId: %d", hostId, (int)rdbId );

	// warning
	if ( collnum < 0 ) log(LOG_LOGIC,"net: NULL collection. msg0.");

	// reset the list they passed us
	list->reset();
	// get keySize of rdb
	m_ks = getKeySizeFromRdbId ( rdbId );

	// if( g_conf.m_logTraceMsg0 )
	// {
	// 	log("%s:%s:%d: rdbId. [%d]", __FILE__,__func__,__LINE__, (int)rdbId);
	// 	log("%s:%s:%d: m_ks.. [%d]", __FILE__,__func__,__LINE__, (int)m_ks);
	// 	log("%s:%s:%d: hostId [%" PRId64"]", __FILE__,__func__,__LINE__, hostId);
	// }

	// if startKey > endKey, don't read anything
	// (the early return was replaced by a deliberate crash)
	//if ( startKey > endKey ) return true;
	if ( KEYCMP(startKey,endKey,m_ks)>0 ) { char *xx=NULL;*xx=0; }//rettrue
	// . reset hostid if it is dead
	// . this is causing UOR queries to take forever when we have a dead
	if ( hostId >= 0 && g_hostdb.isDead ( hostId ) ) hostId = -1;
	// no longer accept negative minrecsize: log it, then crash on purpose
	if ( minRecSizes < 0 ) {
		g_errno = EBADENGINEER;
		logTrace( g_conf.m_logTraceMsg0, "END" );
		log(LOG_LOGIC, "net: msg0: Negative minRecSizes no longer supported.");
		char *xx=NULL;*xx=0;
	}

	// remember these
	m_state         = state;
	m_callback      = callback;
	m_list          = list;
	m_hostId        = hostId;
	m_niceness      = niceness;
	m_addToCache    = addToCache;
	// . these define our request 100%
	KEYSET(m_startKey,startKey,m_ks);
	KEYSET(m_endKey,endKey,m_ks);
	m_minRecSizes   = minRecSizes;
	m_rdbId         = rdbId;
	m_collnum = collnum;// = coll;
	m_isRealMerge   = isRealMerge;
	m_allowPageCache = allowPageCache;

	// . group to ask is based on the first key
	// . we only do 1 group per call right now
	// . groupMask must turn on higher bits first (count downwards kinda)
	// . titledb and spiderdb use special masks to get groupId

	// if diffbot.cpp is reading spiderdb from each shard we have to
	// get groupid from hostid here lest we core in getGroupId() below.
	// it does that for dumping spiderdb to the client browser. they
	// can download the whole enchilada.
	if ( hostId >= 0 && m_rdbId == RDB_SPIDERDB )
		m_shardNum = 0;
	// did they force it? core until i figure out what this is
	else if ( forceParitySplit >= 0 )
		//m_groupId =  g_hostdb.getGroupId ( forceParitySplit );
		m_shardNum = forceParitySplit;
	else
		//m_groupId = getGroupId ( m_rdbId , startKey , ! noSplit );
		m_shardNum = getShardNum ( m_rdbId , startKey );

	// if we are looking up a termlist in posdb that is split by termid and
	// not the usual docid then we have to set this posdb key bit that tells
	// us that ...
	if ( noSplit && m_rdbId == RDB_POSDB )
		m_shardNum = g_hostdb.getShardNumByTermId ( startKey );

	// how is this used?
	if ( forceLocalIndexdb ) m_shardNum = getMyShardNum();

	// if( g_conf.m_logTraceMsg0 ) log("%s:%s:%d: shardNum [%" PRId32"]", __FILE__,__func__, __LINE__, m_shardNum);

	// . store these parameters
	// . get a handle to the rdb in case we can satisfy locally
	// . returns NULL and sets g_errno on error
	QUICKPOLL((m_niceness));
	Rdb *rdb = getRdbFromId ( m_rdbId );
	if ( ! rdb ) return true;
	// we need the fixedDataSize
	m_fixedDataSize = rdb->getFixedDataSize();
	m_useHalfKeys   = rdb->useHalfKeys();
	// . debug msg
	// . Msg2 does this when checking for a cached compound list.
	//   compound lists do not actually exist, they are merges of smaller
	//   UOR'd lists.
	if ( maxCacheAge != 0 && ! addToCache && (numFiles > 0 || includeTree)) {
		log( LOG_LOGIC, "net: msg0: Weird. check but don't add... rdbid=%" PRId32".", ( int32_t ) m_rdbId );
	}

	// set this here since we may not call msg5 if list not local
	//m_list->setFixedDataSize ( m_fixedDataSize );

	// . now that we do load balancing we don't want to do a disk lookup
	//   even if local if we are merging or dumping
	// . UNLESS g_conf.m_preferLocalReads is true
	if ( preferLocalReads == -1 )
		preferLocalReads = g_conf.m_preferLocalReads;

	// . always prefer local for full split clusterdb
	// . and keep the tfndb/titledb lookups in the same stripe
	// . so basically we can't do biased caches if fully split
	// . this assignment overrides whatever was chosen just above
	//if ( g_conf.m_fullSplit ) preferLocalReads = true;
	preferLocalReads = true;

	// is it stored locally?
	bool isLocal = ( m_hostId == -1 && //g_hostdb.m_groupId == m_groupId );
			 m_shardNum == getMyShardNum() );
	// only do local lookups if this is true
	if ( ! preferLocalReads ) isLocal = false;

	/*
	int64_t singleDocIdQuery = 0LL;
	if ( rdbId == RDB_POSDB ) {
		int64_t d1 = g_posdb.getDocId(m_startKey);
		int64_t d2 = g_posdb.getDocId(m_endKey);
		if ( d1+1 == d2 ) singleDocIdQuery = d1;
	}

	// . try the LOCAL termlist cache
	// . so when msg2 is evaluating a gbdocid:| query and it has to
	//   use msg0 to go across the network to get the same damn termlist
	//   over and over again for the same docid, this will help alot.
	// . ideally it'd be nice if the seo pipe in xmldoc.cpp can try to
	//   send the same gbdocid:xxxx docids to the same hosts. maybe hash
	//   based on docid into the list of hosts and if that host is busy
	//   just chain until we find someone not busy.
	if ( singleDocIdQuery &&
	     getListFromTermListCache ( coll,
					m_startKey,
					m_endKey,
					maxCacheAge,
					list ) )
		// found!
		return true;
	*/

	// but always local if only one host
	if ( g_hostdb.getNumHosts() == 1 ) isLocal = true;

	// . if the group is local then do it locally
	// . Msg5::getList() returns false if blocked, true otherwise
	// . Msg5::getList() sets g_errno on error
	// . don't do this if m_hostId was specified
	if ( isLocal ) {
		logTrace( g_conf.m_logTraceMsg0, "isLocal" );

		// use the caller's Msg5 if given (we must not delete it),
		// otherwise allocate our own and remember to delete it
		if ( msg5 ) {
			m_msg5 = msg5;
			m_deleteMsg5 = false;
		}
		else {
			try { m_msg5 = new ( Msg5 ); }
			catch ( ... ) {
				g_errno = ENOMEM;
				log("net: Local alloc for disk read failed "
				    "while tring to read data for %s. "
				    "Trying remote request.",
				    getDbnameFromId(m_rdbId));
				// fall back to the network path below
				goto skip;
			}
			mnew ( m_msg5 , sizeof(Msg5) , "Msg0::Msg5" );
			m_deleteMsg5 = true;
		}

		QUICKPOLL(m_niceness);
		if ( ! m_msg5->getList ( rdbId,
					 m_collnum ,
					 m_list ,
					 m_startKey ,
					 m_endKey   ,
					 m_minRecSizes ,
					 includeTree   , // include Tree?
					 addToCache    , // addToCache?
					 maxCacheAge   ,
					 startFileNum  ,
					 numFiles      ,
					 this ,
					 gotListWrapper2   ,
					 niceness          ,
					 doErrorCorrection ,
					 NULL , // cacheKeyPtr
					 0    , // retryNum
					 -1   , // maxRetries
					 true , // compensateForMerge
					 syncPoint ,
					 m_isRealMerge ,
					 m_allowPageCache ) ) {
			// Msg5 blocked; gotListWrapper2 is the registered callback
			logTrace( g_conf.m_logTraceMsg0, "END, return false" );
			return false;
		}

		// nuke it
		reset();
		logTrace( g_conf.m_logTraceMsg0, "END, return true" );
		return true;
	}
skip:
	// debug msg
	if ( g_conf.m_logDebugQuery )
		log(LOG_DEBUG,"net: msg0: Sending request for data to "
		    "shard=%" PRIu32" "
		    "listPtr=%" PTRFMT" minRecSizes=%" PRId32" termId=%" PRIu64" "
		    //"startKey.n1=%" PRIx32",n0=%" PRIx64" (niceness=%" PRId32")",
		    "startKey.n1=%" PRIx64",n0=%" PRIx64" (niceness=%" PRId32")",
		    //g_hostdb.makeHostId ( m_groupId ) ,
		    m_shardNum,
		    (PTRTYPE)m_list,
		    m_minRecSizes, g_posdb.getTermId(m_startKey) ,
		    //m_startKey.n1,m_startKey.n0 , (int32_t)m_niceness);
		    KEY1(m_startKey,m_ks),KEY0(m_startKey),
		    (int32_t)m_niceness);

	// no reply buffer is pre-allocated in this version
	char *replyBuf = NULL;
	int32_t  replyBufMaxSize = 0;
	bool  freeReply = true;

	// . make a request with the info above (note: not in network order)
	// . IMPORTANT!!!!! if you change this change
	//   Multicast.cpp::sleepWrapper1 too!!!!!!!!!!!!
	//   no, not anymore, we commented out that request peeking code
	// . layout: syncPoint(8) minRecSizes(4) startFileNum(4) numFiles(4)
	//   maxCacheAge(4) then six 1-byte fields starting with the rdbId,
	//   then startKey and endKey (m_ks bytes each), then the collnum
	char *p = m_request;
	*(int64_t *) p = syncPoint        ; p += 8;
	//*(key_t     *) p = m_startKey       ; p += sizeof(key_t);
	//*(key_t     *) p = m_endKey         ; p += sizeof(key_t);
	*(int32_t      *) p = m_minRecSizes    ; p += 4;
	*(int32_t      *) p = startFileNum     ; p += 4;
	*(int32_t      *) p = numFiles         ; p += 4;
	*(int32_t      *) p = maxCacheAge      ; p += 4;
	// sanity: the rdbId byte must land exactly at RDBIDOFFSET, else crash
	if ( p - m_request != RDBIDOFFSET ) { char *xx=NULL;*xx=0; }
	*p               = m_rdbId          ; p++;
	*p               = addToCache       ; p++;
	*p               = doErrorCorrection; p++;
	*p               = includeTree      ; p++;
	*p               = (char)niceness   ; p++;
	*p               = (char)m_allowPageCache; p++;
	KEYSET(p,m_startKey,m_ks); ; p+=m_ks;
	KEYSET(p,m_endKey,m_ks); ; p+=m_ks;
	// NULL terminated collection name
	// (replaced by the binary collnum)
	//strcpy ( p , coll ); p += gbstrlen ( coll ); *p++ = '\0';
	*(collnum_t *)p = m_collnum; p += sizeof(collnum_t);
	m_requestSize    = p - m_request;
	// ask an individual host for this list if hostId is NOT -1
	if ( m_hostId != -1 ) {
		// get Host
		Host *h = g_hostdb.getHost ( m_hostId );
		if ( ! h ) {
			g_errno = EBADHOSTID;
			log(LOG_LOGIC,"net: msg0: Bad hostId of %" PRId64".", m_hostId);
			logTrace( g_conf.m_logTraceMsg0, "END, return true. Bad hostId" );
			return true;
		}

		// if niceness is 0, use the higher priority udpServer
		// (only g_udpServer is used here; this local "port" shadows
		// the function parameter of the same name)
		UdpServer *us ;
		uint16_t port;
		QUICKPOLL(m_niceness);

		us = &g_udpServer ; port = h->m_port ;
		// . returns false on error and sets g_errno, true otherwise
		// . calls callback when reply is received (or error)
		// . we return true if it returns false
		if ( ! us->sendRequest ( m_request     ,
					 m_requestSize ,
					 0x00          , // msgType
					 h->m_ip       ,
					 port          ,
					 m_hostId      ,
					 NULL          , // the slotPtr
					 this          ,
					 gotSingleReplyWrapper ,
					 timeout       ,
					 -1            , // backoff
					 -1            , // maxwait
					 replyBuf      ,
					 replyBufMaxSize ,
					 m_niceness     ) ) { // cback niceness
			// NOTE(review): this branch is the sendRequest() failure
			// path, yet the trace text below says "Request sent"
			logTrace( g_conf.m_logTraceMsg0, "END, return true. Request sent" );
			return true;
		}

		// return false cuz it blocked
		logTrace( g_conf.m_logTraceMsg0, "END, return false. sendRequest blocked" );
		return false;
	}
	// timing debug
	if ( g_conf.m_logTimingNet )
		m_startTime = gettimeofdayInMilliseconds();
	else
		m_startTime = 0;

	// . get the top int32_t of the key
	// . i guess this will work for 128 bit keys... hmmmmm
	int32_t keyTop = hash32 ( (char *)startKey , m_ks );

	// . otherwise, multicast to a host in group "groupId"
	// . returns false and sets g_errno on error
	// . calls callback on completion
	// . select first host to send to in group based on upper 32 bits
	//   of termId (m_startKey.n1)
	// . need to send out to all the indexdb split hosts
	m_numRequests = 0;
	m_numReplies  = 0;
	//for ( int32_t i = 0; i < m_numSplit; i++ ) {

	QUICKPOLL(m_niceness);
	//int32_t gr;
	char *buf;
	buf = replyBuf;

	// get the multicast
	Multicast *m = &m_mcast;

	if ( ! m->send ( m_request    ,
			 m_requestSize,
			 0x00         , // msgType 0x00
			 false        , // does multicast own request?
			 m_shardNum ,
			 false        , // send to whole group?
			 //m_startKey.n1, // key is passed on startKey
			 keyTop       , // key is passed on startKey
			 this         , // state data
			 NULL         , // state data
			 gotMulticastReplyWrapper0 ,
			 timeout*1000 , // timeout
			 niceness     ,
			 firstHostId  ,
			 buf             ,
			 replyBufMaxSize ,
			 freeReply       , // free reply buf?
			 true            , // do disk load balancing?
			 maxCacheAge     ,
			 //(key_t *)cacheKey        ,
			 // multicast uses it for determining the best
			 // host to send the request to when doing
			 // disk load balancing. if the host has our
			 // data cached, then it will probably get to
			 // handle the request. for now let's just assume
			 // this is a 96-bit key. TODO: fix...
			 0 , // *(key_t *)cacheKey        ,
			 rdbId           ,
			 minRecSizes     ) ) {
		log(LOG_ERROR, "net: Failed to send request for data from %s in shard "
		    "#%" PRIu32" over network: %s.",
		    getDbnameFromId(m_rdbId),m_shardNum, mstrerror(g_errno));
		// but speed it up
		m_errno = g_errno;
		m->reset();
		// m_numRequests was zeroed above and is only incremented after
		// a successful send, so this test is false on this path
		if ( m_numRequests > 0 ) {
			logTrace( g_conf.m_logTraceMsg0, "END - returning false" );
			return false;
		}

		logTrace( g_conf.m_logTraceMsg0, "END - returning true" );
		return true;
	}

	m_numRequests++;

	// we blocked
	logTrace( g_conf.m_logTraceMsg0, "END - returning false, blocked" );
	return false;
}
// . captures the current state of "slot": transaction id, peer ip/port,
//   message type, niceness values, datagram/ack counters, resend count,
//   timeout, timestamps and callback/handler flags
// . m_description and m_extraInfo start out zeroed
// . a short description is then derived from the message type; for msg
//   types 0, 1 and 13 the request bytes are inspected (the read buffer for
//   incoming slots, the send buffer otherwise)
// . for dns slots the slot's extra info is copied into m_extraInfo instead
// . message types not listed in the switch leave both strings empty
UdpStatistic::UdpStatistic(const UdpSlot &slot)
	: m_transId(slot.getTransId()),
	  m_ip(slot.getIp()),
	  m_port(slot.getPort()),
	  m_msgType(slot.getMsgType()),
	  m_description(),
	  m_niceness(slot.getNiceness()),
	  m_convertedNiceness(slot.getConvertedNiceness()),
	  m_numDatagramRead(slot.getNumDgramsRead()),
	  m_numDatagramSent(slot.getNumDgramsSent()),
	  m_numAckRead(slot.getNumAcksRead()),
	  m_numAckSent(slot.getNumAcksSent()),
	  m_numPendingRead(slot.getDatagramsToRead()),
	  m_numPendingSend(slot.getDatagramsToSend()),
	  m_resendCount(slot.getResendCount()),
	  m_timeout(slot.getTimeout()),
	  m_startTime(slot.getStartTime()),
	  m_lastReadTime(slot.getLastReadTime()),
	  m_lastSendTime(slot.getLastSendTime()),
	  m_hasCallback(slot.hasCallback()),
	  m_hasCalledHandler(slot.hasCalledHandler()),
	  m_hasCalledCallback(slot.hasCalledCallback()),
	  m_extraInfo() {
	// pick the buffer that holds the request for this slot
	char *buf;
	int32_t bufSize;

	if (slot.isIncoming()) {
		buf = slot.m_readBuf;
		bufSize = slot.m_readBufSize;
	} else {
		buf = slot.m_sendBuf;
		bufSize = slot.m_sendBufSize;
	}

	switch (m_msgType) {
		case msg_type_0:
			// the rdbId byte sits at RDBIDOFFSET in a msg0 request
			if (buf && bufSize > RDBIDOFFSET) {
				rdbid_t rdbId = static_cast<rdbid_t>(buf[RDBIDOFFSET]);
				snprintf(m_description, sizeof(m_description), "get from %s", getDbnameFromId(rdbId));
			}
			break;
		case msg_type_1:
			// the rdbId is read from the first byte, so require at
			// least one byte instead of only a non-NULL pointer
			if (buf && bufSize > 0) {
				rdbid_t rdbId = static_cast<rdbid_t>(buf[0]);
				snprintf(m_description, sizeof(m_description), "add to %s", getDbnameFromId(rdbId));
			}
			break;
		case msg_type_4:
			strcpy(m_description, "meta add");
			break;
		case msg_type_7:
			strcpy(m_description, "inject");
			break;
		case msg_type_c:
			strcpy(m_description, "getting ip");
			break;
		case msg_type_11:
			strcpy(m_description, "ping");
			break;
		case msg_type_13:
			// only interpret the buffer as a Msg13Request when it is
			// large enough to hold one
			if (buf && static_cast<size_t>(bufSize) >= sizeof(Msg13Request)) {
				Msg13Request *r = reinterpret_cast<Msg13Request*>(buf);
				// robots.txt fetches are labelled as such, everything
				// else is a regular page (the two labels used to be
				// swapped)
				snprintf(m_description, sizeof(m_description), "get %s", r->m_isRobotsTxt ? "robot.txt" : "web page");
			}
			break;
		case msg_type_1f:
			strcpy(m_description, "get remote log");
			break;
		case msg_type_20:
			strcpy(m_description, "get summary");
			break;
		case msg_type_22:
			strcpy(m_description, "get titlerec");
			break;
		case msg_type_25:
			strcpy(m_description, "get link info");
			break;
		case msg_type_39:
			strcpy(m_description, "get docids");
			break;
		case msg_type_3e:
			strcpy(m_description, "sync parms");
			break;
		case msg_type_3f:
			strcpy(m_description, "update parms");
			break;
		case msg_type_54:
			strcpy(m_description, "proxy spider");
			break;
		case msg_type_c1:
			strcpy(m_description, "get crawl info");
			break;
		case msg_type_fd:
			strcpy(m_description, "proxy forward");
			break;
		case msg_type_dns:
			// NOTE(review): unbounded copy; assumes the slot's extra
			// info always fits in m_extraInfo -- confirm the sizes
			strcpy(m_extraInfo, slot.getExtraInfo());
			break;
	}
}
// . THIS Msg0 class must be alloc'd, i.e. not on the stack, etc.
// . if list is stored locally this tries to get it locally
// . otherwise tries to get the list from the network
// . returns false if blocked, true otherwise
// . sets g_errno on error
// . NOTE: i was having problems with queries being cached too long, you
//   see the cache here is a NETWORK cache, so when the machines that owns
//   the list updates it on disk it can't flush our cache... so use a small
//   maxCacheAge of like , 30 seconds or so...
//
// Overview of what the body does, in order:
// . resets "list", derives the key size from "rdbId" and deliberately
//   crashes (NULL write) if startKey > endKey or if minRecSizes < 0
// . a "hostId" that g_hostdb reports dead is replaced by -1
// . picks m_shardNum, then reads locally through Msg5 when the shard is
//   ours and no host was specified
// . otherwise serializes the request into m_request and sends it either
//   to the single host "hostId" via g_udpServer or to the shard through
//   m_mcast
//
// Notes on parameters:
// . "ip", "port" (shadowed by a local below) and "doMerge" are not
//   referenced in the body
// . "msg5b" is only stored in m_msg5b; NULL is what gets passed on to
//   Msg5::getList()
// . "preferLocalReads" is unconditionally overwritten with true below
bool Msg0::getList ( int64_t hostId      , // host to ask (-1 if none)
		     int32_t      ip          , // info on hostId
		     int16_t     port        ,
		     int32_t      maxCacheAge , // max cached age in seconds
		     bool      addToCache  , // add net recv'd list to cache?
		     char      rdbId       , // specifies the rdb
		     //char     *coll        ,
		     collnum_t collnum ,
		     RdbList  *list        ,
		     //key_t     startKey    ,
		     //key_t     endKey      ,
		     char     *startKey    ,
		     char     *endKey      ,
		     int32_t      minRecSizes ,  // must be >= 0, negative values crash below
		     void     *state       ,
		     void    (* callback)(void *state ),//, RdbList *list ) ,
		     int32_t      niceness    ,
		     bool      doErrorCorrection ,
		     bool      includeTree ,
		     bool      doMerge     ,
		     int32_t      firstHostId   ,
		     int32_t      startFileNum  ,
		     int32_t      numFiles      ,
		     int32_t      timeout       ,
		     int64_t syncPoint     ,
		     int32_t      preferLocalReads ,
		     Msg5     *msg5             ,
		     Msg5     *msg5b            ,
		     bool      isRealMerge      ,
//#ifdef SPLIT_INDEXDB
		     bool      allowPageCache    ,
		     bool      forceLocalIndexdb ,
		     bool      noSplit , // doIndexdbSplit    ,
		     int32_t      forceParitySplit  ) {
//#else
//		     bool      allowPageCache ) {
//#endif
	// this is obsolete! mostly, but we need it for PageIndexdb.cpp to
	// show a "termlist" for a given query term in its entirety so you
	// don't have to check each machine in the network. if this is true it
	// means to query each split and merge the results together into a
	// single unified termlist. only applies to indexdb/datedb.
	//if ( doIndexdbSplit ) { char *xx = NULL; *xx = 0; }
	// note this because if caller is wrong it hurts performance major!!
	//if ( doIndexdbSplit )
	//	logf(LOG_DEBUG,"net: doing msg0 with indexdb split true");
	// warning
	if ( collnum < 0 ) log(LOG_LOGIC,"net: NULL collection. msg0.");

	//if ( doIndexdbSplit ) { char *xx=NULL;*xx=0; }

	// reset the list they passed us
	list->reset();
	// get keySize of rdb
	m_ks = getKeySizeFromRdbId ( rdbId );
	// if startKey > endKey, don't read anything
	// (the early return was replaced by a deliberate crash)
	//if ( startKey > endKey ) return true;
	if ( KEYCMP(startKey,endKey,m_ks)>0 ) { char *xx=NULL;*xx=0; }//rettrue
	// . reset hostid if it is dead
	// . this is causing UOR queries to take forever when we have a dead
	if ( hostId >= 0 && g_hostdb.isDead ( hostId ) ) hostId = -1;
	// no longer accept negative minrecsize: log it, then crash on purpose
	// (the return after the NULL write is not expected to be reached)
	if ( minRecSizes < 0 ) {
		g_errno = EBADENGINEER;
		log(LOG_LOGIC,
		    "net: msg0: Negative minRecSizes no longer supported.");
		char *xx=NULL;*xx=0;
		return true;
	}

	// debug msg
	//if ( niceness != 0 ) log("HEY start");
	// ensure startKey last bit clear, endKey last bit set
	//if ( (startKey.n0 & 0x01) == 0x01 )
	//	log("Msg0::getList: warning startKey lastbit set");
	//if ( (endKey.n0   & 0x01) == 0x00 )
	//	log("Msg0::getList: warning endKey lastbit clear");
	// remember these
	m_state         = state;
	m_callback      = callback;
	m_list          = list;
	m_hostId        = hostId;
	m_niceness      = niceness;
	//m_ip            = ip;
	//m_port          = port;
	m_addToCache    = addToCache;
	// . these define our request 100%
	//m_startKey      = startKey;
	//m_endKey        = endKey;
	KEYSET(m_startKey,startKey,m_ks);
	KEYSET(m_endKey,endKey,m_ks);
	m_minRecSizes   = minRecSizes;
	m_rdbId         = rdbId;
	m_collnum = collnum;// = coll;
	m_isRealMerge   = isRealMerge;
	m_allowPageCache = allowPageCache;

	// . group to ask is based on the first key
	// . we only do 1 group per call right now
	// . groupMask must turn on higher bits first (count downwards kinda)
	// . titledb and spiderdb use special masks to get groupId

	// if diffbot.cpp is reading spiderdb from each shard we have to
	// get groupid from hostid here lest we core in getGroupId() below.
	// it does that for dumping spiderdb to the client browser. they
	// can download the whole enchilada.
	if ( hostId >= 0 && m_rdbId == RDB_SPIDERDB )
		m_shardNum = 0;
	// did they force it? core until i figure out what this is
	else if ( forceParitySplit >= 0 )
		//m_groupId =  g_hostdb.getGroupId ( forceParitySplit );
		m_shardNum = forceParitySplit;
	else
		//m_groupId = getGroupId ( m_rdbId , startKey , ! noSplit );
		m_shardNum = getShardNum ( m_rdbId , startKey );

	// if we are looking up a termlist in posdb that is split by termid and
	// not the usual docid then we have to set this posdb key bit that tells
	// us that ...
	if ( noSplit && m_rdbId == RDB_POSDB )
		m_shardNum = g_hostdb.getShardNumByTermId ( startKey );

	// how is this used?
	//if ( forceLocalIndexdb ) m_groupId = g_hostdb.m_groupId;
	if ( forceLocalIndexdb ) m_shardNum = getMyShardNum();

	// . store these parameters
	// . get a handle to the rdb in case we can satisfy locally
	// . returns NULL and sets g_errno on error
	QUICKPOLL((m_niceness));
	Rdb *rdb = getRdbFromId ( m_rdbId );
	if ( ! rdb ) return true;
	// we need the fixedDataSize
	m_fixedDataSize = rdb->getFixedDataSize();
	m_useHalfKeys   = rdb->useHalfKeys();
	// . debug msg
	// . Msg2 does this when checking for a cached compound list.
	//   compound lists do not actually exist, they are merges of smaller
	//   UOR'd lists.
	if ( maxCacheAge != 0 && ! addToCache && (numFiles > 0 || includeTree))
		log(LOG_LOGIC,"net: msg0: "
		    "Weird. check but don't add... rdbid=%"INT32".",(int32_t)m_rdbId);
	// set this here since we may not call msg5 if list not local
	//m_list->setFixedDataSize ( m_fixedDataSize );

	// . now that we do load balancing we don't want to do a disk lookup
	//   even if local if we are merging or dumping
	// . UNLESS g_conf.m_preferLocalReads is true
	if ( preferLocalReads == -1 )
		preferLocalReads = g_conf.m_preferLocalReads;

	// . always prefer local for full split clusterdb
	// . and keep the tfndb/titledb lookups in the same stripe
	// . so basically we can't do biased caches if fully split
	// . this assignment overrides whatever was chosen just above
	//if ( g_conf.m_fullSplit ) preferLocalReads = true;
	preferLocalReads = true;

	// is it stored locally?
	bool isLocal = ( m_hostId == -1 && //g_hostdb.m_groupId == m_groupId );
			 m_shardNum == getMyShardNum() );
	// only do local lookups if this is true
	if ( ! preferLocalReads ) isLocal = false;

	/*
	m_numSplit = 1;
	if ( g_hostdb.m_indexSplits > 1 &&
	     ( rdbId == RDB_POSDB || rdbId==RDB_DATEDB)&&
	     ! forceLocalIndexdb && doIndexdbSplit ) {
		isLocal  = false;
		//m_numSplit = INDEXDB_SPLIT;
		m_numSplit = g_hostdb.m_indexSplits;
		char *xx=NULL;*xx=0;
	}
	*/

	/*
	int64_t singleDocIdQuery = 0LL;
	if ( rdbId == RDB_POSDB ) {
		int64_t d1 = g_posdb.getDocId(m_startKey);
		int64_t d2 = g_posdb.getDocId(m_endKey);
		if ( d1+1 == d2 ) singleDocIdQuery = d1;
	}

	// . try the LOCAL termlist cache
	// . so when msg2 is evaluating a gbdocid:| query and it has to
	//   use msg0 to go across the network to get the same damn termlist
	//   over and over again for the same docid, this will help alot.
	// . ideally it'd be nice if the seo pipe in xmldoc.cpp can try to
	//   send the same gbdocid:xxxx docids to the same hosts. maybe hash
	//   based on docid into the list of hosts and if that host is busy
	//   just chain until we find someone not busy.
	if ( singleDocIdQuery &&
	     getListFromTermListCache ( coll,
					m_startKey,
					m_endKey,
					maxCacheAge,
					list ) )
		// found!
		return true;
	*/

	// but always local if only one host
	if ( g_hostdb.getNumHosts() == 1 ) isLocal = true;

	// force a msg0 if doing a docid restrictive query like
	// gbdocid:xxxx|<query> so we call cacheTermLists()
	//if ( singleDocIdQuery ) isLocal = false;

	// . if the group is local then do it locally
	// . Msg5::getList() returns false if blocked, true otherwise
	// . Msg5::getList() sets g_errno on error
	// . don't do this if m_hostId was specified
	if ( isLocal ) { // && !g_conf.m_interfaceMachine ) {
		// use the caller's Msg5 if given (we must not delete it),
		// otherwise allocate our own and remember to delete it
		if ( msg5 ) {
			m_msg5 = msg5;
			m_deleteMsg5 = false;
		}
		else {
			try { m_msg5 = new ( Msg5 ); }
			catch ( ... ) {
				g_errno = ENOMEM;
				log("net: Local alloc for disk read failed "
				    "while tring to read data for %s. "
				    "Trying remote request.",
				    getDbnameFromId(m_rdbId));
				// fall back to the network path below
				goto skip;
			}
			mnew ( m_msg5 , sizeof(Msg5) , "Msg0" );
			m_deleteMsg5 = true;
		}

		QUICKPOLL(m_niceness);
		// same for msg5b
		if ( msg5b ) {
			m_msg5b = msg5b;
			m_deleteMsg5b = false;
		}
		/*
		else if ( m_rdbId == RDB_TITLEDB ) {
			try { m_msg5b = new ( Msg5 ); }
			catch ( ... ) {
				g_errno = ENOMEM;
				log("net: Local alloc for disk read failed "
				    "while tring to read data for %s. "
				    "Trying remote request. 2.",
				    getDbnameFromId(m_rdbId));
				goto skip;
			}
			mnew ( m_msg5b , sizeof(Msg5) , "Msg0b" );
			m_deleteMsg5b = true;
		}
		*/
		QUICKPOLL(m_niceness);
		if ( ! m_msg5->getList ( rdbId,
					 m_collnum ,
					 m_list ,
					 m_startKey ,
					 m_endKey   ,
					 m_minRecSizes ,
					 includeTree   , // include Tree?
					 addToCache    , // addToCache?
					 maxCacheAge   ,
					 startFileNum  ,
					 numFiles      ,
					 this ,
					 gotListWrapper2   ,
					 niceness          ,
					 doErrorCorrection ,
					 NULL , // cacheKeyPtr
					 0    , // retryNum
					 -1   , // maxRetries
					 true , // compensateForMerge
					 syncPoint ,
					 NULL,//m_msg5b ,
					 m_isRealMerge ,
					 m_allowPageCache ) ) return false;
		// nuke it
		reset();
		return true;
	}
skip:
	// debug msg
	if ( g_conf.m_logDebugQuery )
		log(LOG_DEBUG,"net: msg0: Sending request for data to "
		    "shard=%"UINT32" "
		    "listPtr=%"PTRFMT" minRecSizes=%"INT32" termId=%"UINT64" "
		    //"startKey.n1=%"XINT32",n0=%"XINT64" (niceness=%"INT32")",
		    "startKey.n1=%"XINT64",n0=%"XINT64" (niceness=%"INT32")",
		    //g_hostdb.makeHostId ( m_groupId ) ,
		    m_shardNum,
		    (PTRTYPE)m_list,
		    m_minRecSizes, g_posdb.getTermId(m_startKey) ,
		    //m_startKey.n1,m_startKey.n0 , (int32_t)m_niceness);
		    KEY1(m_startKey,m_ks),KEY0(m_startKey),
		    (int32_t)m_niceness);

	char *replyBuf = NULL;
	int32_t  replyBufMaxSize = 0;
	bool  freeReply = true;

	// adjust niceness for net transmission
	// ("realtime" is never set to true in this body, so the
	// pre-allocation block below does not execute)
	bool realtime = false;
	//if ( minRecSizes + 32 < TMPBUFSIZE ) realtime = true;

	// if we're niceness 0 we need to pre-allocate for reply since it
	// might be received within the asynchronous signal handler which
	// cannot call mmalloc()
	if ( realtime ) { // niceness <= 0 || netnice == 0 ) {
		// . we should not get back more than minRecSizes bytes since
		//   we are now performing merges
		// . it should not slow things down too much since the hashing
		//   is 10 times slower than merging anyhow...
		// . CAUTION: if rdb is not fixed-datasize then this will
		//   not work for us! it can exceed m_minRecSizes.
		replyBufMaxSize = m_minRecSizes ;
		// . get a little extra to fix the error where we ask for 64
		//   but get 72
		// . where is that coming from?
		// . when getting titleRecs we often exceed the minRecSizes
		// . ?Msg8? was having trouble. was int16_t 32 bytes sometimes.
		replyBufMaxSize += 36;
		// why add ten percent?
		//replyBufMaxSize *= 110 ;
		//replyBufMaxSize /= 100 ;
		// make a buffer to hold the reply
//#ifdef SPLIT_INDEXDB
		/*
		if ( m_numSplit > 1 ) {
			m_replyBufSize = replyBufMaxSize * m_numSplit;
			replyBuf = (char *) mmalloc(m_replyBufSize, "Msg0");
			m_replyBuf  = replyBuf;
			freeReply = false;
		}
		else
		*/
//#endif
			replyBuf = (char *) mmalloc(replyBufMaxSize , "Msg0");
		// g_errno is set and we return true if it failed
		if ( ! replyBuf ) {
			log("net: Failed to pre-allocate %"INT32" bytes to hold "
			    "data read remotely from %s: %s.",
			    replyBufMaxSize,getDbnameFromId(m_rdbId),
			    mstrerror(g_errno));
			return true;
		}
	}

	// . make a request with the info above (note: not in network order)
	// . IMPORTANT!!!!! if you change this change
	//   Multicast.cpp::sleepWrapper1 too!!!!!!!!!!!!
	//   no, not anymore, we commented out that request peeking code
	// . layout: syncPoint(8) minRecSizes(4) startFileNum(4) numFiles(4)
	//   maxCacheAge(4) then six 1-byte fields starting with the rdbId,
	//   then startKey and endKey (m_ks bytes each), then the collnum
	char *p = m_request;
	*(int64_t *) p = syncPoint        ; p += 8;
	//*(key_t     *) p = m_startKey       ; p += sizeof(key_t);
	//*(key_t     *) p = m_endKey         ; p += sizeof(key_t);
	*(int32_t      *) p = m_minRecSizes    ; p += 4;
	*(int32_t      *) p = startFileNum     ; p += 4;
	*(int32_t      *) p = numFiles         ; p += 4;
	*(int32_t      *) p = maxCacheAge      ; p += 4;
	// sanity: the rdbId byte must land exactly at RDBIDOFFSET, else crash
	if ( p - m_request != RDBIDOFFSET ) { char *xx=NULL;*xx=0; }
	*p               = m_rdbId          ; p++;
	*p               = addToCache       ; p++;
	*p               = doErrorCorrection; p++;
	*p               = includeTree      ; p++;
	*p               = (char)niceness   ; p++;
	*p               = (char)m_allowPageCache; p++;
	KEYSET(p,m_startKey,m_ks); ; p+=m_ks;
	KEYSET(p,m_endKey,m_ks); ; p+=m_ks;
	// NULL terminated collection name
	// (replaced by the binary collnum)
	//strcpy ( p , coll ); p += gbstrlen ( coll ); *p++ = '\0';
	*(collnum_t *)p = m_collnum; p += sizeof(collnum_t);
	m_requestSize    = p - m_request;
	// ask an individual host for this list if hostId is NOT -1
	if ( m_hostId != -1 ) {
		// get Host
		Host *h = g_hostdb.getHost ( m_hostId );
		if ( ! h ) {
			g_errno = EBADHOSTID;
			log(LOG_LOGIC,"net: msg0: Bad hostId of %"INT64".",
			    m_hostId);
			return true;
		}
		// if niceness is 0, use the higher priority udpServer
		// (only g_udpServer is used here; this local "port" shadows
		// the function parameter of the same name)
		UdpServer *us ;
		uint16_t port;
		QUICKPOLL(m_niceness);
		//if ( niceness <= 0 || netnice == 0 ) {
		//if ( realtime ) {
		//	us = &g_udpServer2; port = h->m_port2; }
		//else                 {
		us = &g_udpServer ; port = h->m_port ;
		// . returns false on error and sets g_errno, true otherwise
		// . calls callback when reply is received (or error)
		// . we return true if it returns false
		if ( ! us->sendRequest ( m_request     ,
					 m_requestSize ,
					 0x00          , // msgType
					 h->m_ip       ,
					 port          ,
					 m_hostId      ,
					 NULL          , // the slotPtr
					 this          ,
					 gotSingleReplyWrapper ,
					 timeout       ,
					 -1            , // backoff
					 -1            , // maxwait
					 replyBuf      ,
					 replyBufMaxSize ,
					 m_niceness     ) ) // cback niceness
			return true;
		// return false cuz it blocked
		return false;
	}
	// timing debug
	if ( g_conf.m_logTimingNet )
		m_startTime = gettimeofdayInMilliseconds();
	else
		m_startTime = 0;
	//if ( m_rdbId == RDB_INDEXDB ) log("Msg0:: getting remote indexlist. "
	//			"termId=%"UINT64", "
	//			"groupNum=%"UINT32"",
	//			g_indexdb.getTermId(m_startKey) ,
	//			g_hostdb.makeHostId ( m_groupId ) );

	/*
	// make the cache key so we can see what remote host cached it, if any
	char cacheKey[MAX_KEY_BYTES];
	//key_t cacheKey = makeCacheKey ( startKey     ,
	makeCacheKey ( startKey     ,
		       endKey       ,
		       includeTree  ,
		       minRecSizes  ,
		       startFileNum ,
		       numFiles     ,
		       cacheKey     ,
		       m_ks         );
	*/

	// . get the top int32_t of the key
	// . i guess this will work for 128 bit keys... hmmmmm
	int32_t keyTop = hash32 ( (char *)startKey , m_ks );

	/*
	// allocate space
	if ( m_numSplit > 1 ) {
		int32_t  need = m_numSplit * sizeof(Multicast) ;
		char *buf  = (char *)mmalloc ( need,"msg0mcast" );
		if ( ! buf ) return true;
		m_mcasts = (Multicast *)buf;
		for ( int32_t i = 0; i < m_numSplit ; i++ )
			m_mcasts[i].constructor();
	}
	*/

	// . otherwise, multicast to a host in group "groupId"
	// . returns false and sets g_errno on error
	// . calls callback on completion
	// . select first host to send to in group based on upper 32 bits
	//   of termId (m_startKey.n1)
//#ifdef SPLIT_INDEXDB
	// . need to send out to all the indexdb split hosts
	m_numRequests = 0;
	m_numReplies  = 0;
	//for ( int32_t i = 0; i < m_numSplit; i++ ) {

	QUICKPOLL(m_niceness);
	//int32_t gr;
	char *buf;
	/*
	if ( m_numSplit > 1 ) {
		gr  = g_indexdb.getSplitGroupId ( baseGroupId, i );
		buf = &replyBuf[i*replyBufMaxSize];
	}
	else {
	*/
	//gr  = m_groupId;
	buf = replyBuf;
	//}

	// get the multicast
	Multicast *m = &m_mcast;
	//if ( m_numSplit > 1 ) m = &m_mcasts[i];

	if ( ! m->send ( m_request    ,
//#else
//	if ( ! m_mcast.send ( m_request    ,
//#endif
			 m_requestSize,
			 0x00         , // msgType 0x00
			 false        , // does multicast own request?
			 m_shardNum ,
//#ifdef SPLIT_INDEXDB
//			 gr           , // group + offset
//#else
//			 m_groupId    , // group to send to (groupKey)
//#endif
			 false        , // send to whole group?
			 //m_startKey.n1, // key is passed on startKey
			 keyTop       , // key is passed on startKey
			 this         , // state data
			 NULL         , // state data
			 gotMulticastReplyWrapper0 ,
			 timeout      , // timeout in seconds (was 30)
			 niceness     ,
			 realtime     ,
			 firstHostId  ,
//#ifdef SPLIT_INDEXDB
//			 &replyBuf[i*replyBufMaxSize] ,
//#else
//			 replyBuf        ,
//#endif
			 buf             ,
			 replyBufMaxSize ,
			 freeReply       , // free reply buf?
			 true            , // do disk load balancing?
			 maxCacheAge     ,
			 //(key_t *)cacheKey        ,
			 // multicast uses it for determining the best
			 // host to send the request to when doing
			 // disk load balancing. if the host has our
			 // data cached, then it will probably get to
			 // handle the request. for now let's just assume
			 // this is a 96-bit key. TODO: fix...
			 0 , // *(key_t *)cacheKey        ,
			 rdbId           ,
			 minRecSizes     ) ) {
		log("net: Failed to send request for data from %s in shard "
		    "#%"UINT32" over network: %s.",
		    getDbnameFromId(m_rdbId),m_shardNum, mstrerror(g_errno));
		// no, multicast will free this when it is destroyed
		//if (replyBuf) mfree ( replyBuf , replyBufMaxSize , "Msg22" );
		// but speed it up
//#ifdef SPLIT_INDEXDB
		m_errno = g_errno;
		m->reset();
		// m_numRequests was zeroed above and is only incremented after
		// a successful send, so this test is false on this path
		if ( m_numRequests > 0 )
			return false;
//#else
//		m_mcast.reset();
//#endif
		return true;
	}
//#ifdef SPLIT_INDEXDB
	m_numRequests++;

//#endif
	// we blocked
	return false;
}
// . returns false if blocked, true otherwise // . sets g_errno on error // . dumps the RdbTree, m_tree, into m_file // . also sets and writes the RdbMap for m_file // . we methodically get RdbLists from the RdbTree // . dumped recs are ordered by key if "orderedDump" was true in call to set() // otherwise, lists are ordered by node # // . we write each list of recs to the file until the whole tree has been done // . we delete all records in list from the tree after we've written the list // . if a cache was provided we incorporate the list into the cache before // deleting it from the tree to keep the cache in sync. NO we do NOT! // . called again by writeBuf() when it's done writing the whole list bool RdbDump::dumpTree ( bool recall ) { // set up some vars //int32_t nextNode; //key_t maxEndKey; //maxEndKey.setMax(); char maxEndKey[MAX_KEY_BYTES]; KEYMAX(maxEndKey,m_ks); // if dumping statsdb, we can only dump records 30 seconds old or // more because Statsdb.cpp can "back modify" such records in the tree // because it may have a query that took 10 seconds come in then it // needs to add a partial stat to the last 10 stats for those 10 secs. // we use Global time at this juncture if ( m_rdb->m_rdbId == RDB_STATSDB ) { int32_t nowSecs = getTimeGlobal(); StatKey *sk = (StatKey *)maxEndKey; sk->m_zero = 0x01; sk->m_labelHash = 0xffffffff; // leave last 60 seconds in there just to be safe sk->m_time1 = nowSecs - 60; } // this list will hold the list of nodes/recs from m_tree m_list = &m_ourList; // convert coll to collnum //collnum_t collnum = g_collectiondb.getCollnum ( m_coll ); // a collnum of -1 is for collectionless rdbs //if ( collnum < 0 ) { // //if ( g_catdb->getRdb() == m_rdb ) // if ( ! 
m_rdb->m_isCollectionLess ) { // char *xx=NULL;*xx=0; //return true; // } // g_errno = 0; // collnum = 0; //} // getMemOccupiedForList2() can take some time, so breathe int32_t niceness = 1; loop: // if the lastKey was the max end key last time then we're done if ( m_rolledOver ) return true; // this is set to -1 when we're done with our unordered dump if ( m_nextNode == -1 ) return true; // . NOTE: list's buffer space should be re-used!! (TODO) // . "lastNode" is set to the last node # in the list bool status = true; //if ( ! m_orderedDump ) { // status = ((RdbTree *)m_tree)->getListUnordered ( m_nextNode , // m_maxBufSize , // m_list , // &nextNode ); // // this is -1 when no more nodes are left // m_nextNode = nextNode; //} // "lastKey" is set to the last key in the list //else { { // can we remove neg recs? // class RdbBase *base = m_rdb->getBase(m_collnum); // bool removeNegRecs = false; // if ( base->m_numFiles <= 0 ) removeNegRecs = true; if ( recall ) goto skip; // debug msg //log("RdbDump:: getting list"); m_t1 = gettimeofdayInMilliseconds(); if(m_tree) status = m_tree->getList ( m_collnum , m_nextKey , maxEndKey , m_maxBufSize , // max recSizes m_list , &m_numPosRecs , &m_numNegRecs , m_useHalfKeys , niceness ); else if(m_buckets) status = m_buckets->getList ( m_collnum, m_nextKey , maxEndKey , m_maxBufSize , // max recSizes m_list , &m_numPosRecs , &m_numNegRecs , m_useHalfKeys ); // don't dump out any neg recs if it is our first time dumping // to a file for this rdb/coll. TODO: implement this later. //if ( removeNegRecs ) // m_list.removeNegRecs(); // if(!m_list->checkList_r ( false , // removeNegRecs? // false , // sleep on problem? // m_rdb->m_rdbId )) { // log("db: list to dump is not sane!"); // char *xx=NULL;*xx=0; // } skip: int64_t t2; //key_t lastKey; char *lastKey; // if error getting list (out of memory?) if ( ! status ) goto hadError; // debug msg t2 = gettimeofdayInMilliseconds(); log(LOG_INFO,"db: Get list took %"INT64" ms. 
" "%"INT32" positive. %"INT32" negative.", t2 - m_t1 , m_numPosRecs , m_numNegRecs ); // keep a total count for reporting when done m_totalPosDumped += m_numPosRecs; m_totalNegDumped += m_numNegRecs; // . check the list we got from the tree for problems // . ensures keys are ordered from lowest to highest as well //#ifdef GBSANITYCHECK if ( g_conf.m_verifyWrites ) { char *s = "none"; if ( m_rdb ) s = getDbnameFromId(m_rdb->m_rdbId); log("dump: verifying list before dumping (rdb=%s)",s); m_list->checkList_r ( false , // removeNegRecs? false , // sleep on problem? m_rdb->m_rdbId ); } // if list is empty, we're done! if ( status && m_list->isEmpty() ) { // consider that a rollover? if ( m_rdb->m_rdbId == RDB_STATSDB ) m_rolledOver = true; return true; } // get the last key of the list lastKey = m_list->getLastKey(); // advance m_nextKey //m_nextKey = lastKey ; //m_nextKey += (uint32_t)1; //if ( m_nextKey < lastKey ) m_rolledOver = true; KEYSET(m_nextKey,lastKey,m_ks); KEYADD(m_nextKey,1,m_ks); if (KEYCMP(m_nextKey,lastKey,m_ks)<0) m_rolledOver = true; // debug msg //log(0,"RdbDump:lastKey.n1=%"UINT32",n0=%"UINT64"",lastKey.n1,lastKey.n0); //log(0,"RdbDump:next.n1=%"UINT32",n0=%"UINT64"",m_nextKey.n1,m_nextKey.n0); } // . return true on error, g_errno should have been set // . this is probably out of memory error if ( ! status ) { hadError: log("db: Had error getting data for dump: %s. Retrying.", mstrerror(g_errno)); // debug msg //log("RdbDump::getList: sleeping and retrying"); // retry for the remaining two types of errors if (!g_loop.registerSleepCallback(1000,this,tryAgainWrapper2)){ log( "db: Retry failed. Could not register callback."); return true; } // wait for sleep return false; } // if list is empty, we're done! if ( m_list->isEmpty() ) return true; // . set m_firstKeyInQueue and m_lastKeyInQueue // . 
this doesn't work if you're doing an unordered dump, but we should // not allow adds when closing m_lastKeyInQueue = m_list->getLastKey(); //m_firstKeyInQueue = m_list->getCurrentKey(); m_list->getCurrentKey(m_firstKeyInQueue); // . write this list to disk // . returns false if blocked, true otherwise // . sets g_errno on error // . if this blocks it should call us (dumpTree() back) if ( ! dumpList ( m_list , m_niceness , false ) ) return false; // close up shop on a write/dumpList error if ( g_errno ) return true; // . if dumpList() did not block then keep on truckin' // . otherwise, wait for callback of dumpTree() goto loop; }
// . return false if blocked, true otherwise // . sets g_errno on error bool Msg1::sendData ( uint32_t shardNum, char *listData , int32_t listSize) { // debug msg //log("sendData: mcast=%" PRIu32" listSize=%" PRId32, // (int32_t)&m_mcast,(int32_t)listSize); // bail if this is an interface machine, don't write to the main if ( g_conf.m_interfaceMachine ) return true; // return true if no data if ( listSize == 0 ) return true; // how many hosts in this group //int32_t numHosts = g_hostdb.getNumHostsPerShard(); // . NOTE: for now i'm removing this until I handle ETRYAGAIN errors // properly... by waiting and retrying... // . if this is local data just for us just do an addList to OUR rdb /* if ( groupId == g_hostdb.m_groupId && numHosts == 1 ) { // this sets g_errno on error Msg0 msg0; Rdb *rdb = msg0.getRdb ( (char) m_rdbId ); if ( ! rdb ) return true; // make a list from this data RdbList list; list.set (listData,listSize,listSize,rdb->getFixedDataSize(), false) ; // ownData? // this returns false and sets g_errno on error rdb->addList ( &list ); // . if we got a ETRYAGAIN cuz the buffer we add to was full // then we should sleep and try again! // . return false cuz this blocks for a period of time // before trying again if ( g_errno == ETRYAGAIN ) { // try adding again in 1 second registerSleepCallback ( 1000, slot, tryAgainWrapper1 ); // return now return false; } // . always return true cuz we did not block // . g_errno may be set return true; } */ // if the data is being added to our group, don't send ourselves // a msg1, if we can add it right now // MDW: crap this is getting ETRYAGAIN and it isn't being tried again // i guess and Spider.cpp fails to add to doledb but the doleiptable // maintains a positive count, thereby hanging the spiders. let's // just always go through multicast so it will auto-retry ETRYAGAIN /* bool sendToSelf = true; if ( shardNum == getMyShardNum() && ! 
g_conf.m_interfaceMachine ) { // get the rdb to which it belongs, use Msg0::getRdb() Rdb *rdb = getRdbFromId ( (char) m_rdbId ); if ( ! rdb ) goto skip; // key size int32_t ks = getKeySizeFromRdbId ( m_rdbId ); // reset g_errno g_errno = 0; // . make a list from this data // . skip over the first 4 bytes which is the rdbId // . TODO: embed the rdbId in the msgtype or something... RdbList list; // set the list list.set ( listData , listSize , listData , listSize , rdb->getFixedDataSize() , false , // ownData? rdb->useHalfKeys() , ks ); // note that //log("msg1: local addlist niceness=%" PRId32,m_niceness); // this returns false and sets g_errno on error rdb->addList ( m_coll , &list , m_niceness ); // if titledb, add tfndb recs to map the title recs //if ( ! g_errno && rdb == g_titledb.getRdb() && m_injecting ) // // this returns false and sets g_errno on error // updateTfndb ( m_coll , &list , true , m_niceness); // if no error, no need to use a Msg1 UdpSlot for ourselves if ( ! g_errno ) sendToSelf = false; else { log("rdb: msg1 coll=%s rdb=%s had error: %s", m_coll,rdb->m_dbname,mstrerror(g_errno)); // this is messing up generate catdb's huge rdblist add // why did we put it in there??? from msg9b.cpp //return true; } QUICKPOLL(m_niceness); // if we're the only one in the group, bail, we're done if ( ! sendToSelf && g_hostdb.getNumHostsPerShard() == 1 ) return true; } skip: */ // . make an add record request to multicast to a bunch of machines // . this will alloc new space, returns NULL on failure //char *request = makeRequest ( listData, listSize, groupId , //m_rdbId , &requestLen ); //int32_t collLen = strlen ( m_coll ); // . returns NULL and sets g_errno on error // . calculate total size of the record // . 1 byte for rdbId, 1 byte for flags, // then collection NULL terminated, then list int32_t requestLen = 1 + 1 + sizeof(collnum_t) + listSize ; // make the request char *request = (char *) mmalloc ( requestLen ,"Msg1" ); if ( ! 
request ) return true; char *p = request; // store the rdbId at top of request *p++ = m_rdbId; // then the flags *p = 0; if ( m_injecting ) *p |= 0x80; p++; // then collection name //gbmemcpy ( p , m_coll , collLen ); //p += collLen; //*p++ = '\0'; *(collnum_t *)p = m_collnum; p += sizeof(collnum_t); // sanity check //if ( collLen <= 0 ) { // log(LOG_LOGIC,"net: No collection specified for list add."); // //g_process.shutdownAbort(true); // g_errno = ENOCOLLREC; // return true; //} //if ( m_deleteRecs ) request[1] |= 0x80; //if ( m_overwriteRecs ) request[1] |= 0x40; // store the list after coll gbmemcpy ( p , listData , listSize ); QUICKPOLL(m_niceness); // for small packets //int32_t niceness = 2; //if ( requestLen < TMPBUFSIZE - 32 ) niceness = 0; //log("msg1: sending mcast niceness=%" PRId32,m_niceness); // . multicast to all hosts in group "groupId" // . multicast::send() returns false and sets g_errno on error // . we return false if we block, true otherwise // . will loop indefinitely if a host in this group is down key_t k; k.setMin(); if ( m_mcast.send ( request , // sets mcast->m_msg to this requestLen , // sets mcast->m_msgLen to this msg_type_1 , true , // does multicast own msg? shardNum , // group to send to (groupKey) true , // send to whole group? 0 , // key is useless for us this , // state data NULL , // state data gotReplyWrapper1 , multicast_msg1_senddata_timeout , // timeout m_niceness , // niceness -1 , // first host to try NULL , // replyBuf = NULL , 0 , // replyBufMaxSize = 0 , true , // freeReplyBuf = true , false , // doDiskLoadBalancing = false , -1 , // no max cache age limit //(key_t)0 , // cache key k , // cache key RDB_NONE , // bogus rdbId -1 , // unknown minRecSizes read size true )) // sendToSelf )) return false; QUICKPOLL(m_niceness); // g_errno should be set log("net: Had error when sending request to add data to %s in shard " "#%" PRIu32": %s.", getDbnameFromId(m_rdbId),shardNum,mstrerror(g_errno)); return true; }
// . returns false if blocked, true otherwise // . sets g_errno on error // . if the list is sorted by keys this will be the most efficient bool Msg1::sendSomeOfList ( ) { // sanity check if ( m_list->m_ks != 8 && m_list->m_ks != 12 && m_list->m_ks != 16 && m_list->m_ks != 24 ) { g_process.shutdownAbort(true); } // debug msg //log("sendSomeOfList: mcast=%" PRIu32" exhausted=%" PRId32, // (int32_t)&m_mcast,(int32_t)m_list->isExhausted()); loop: // return true if list exhausted and nothing left to add if ( m_list->isExhausted() ) return true; // get key of the first record in the list //key_t firstKey = m_list->getCurrentKey(); char firstKey[MAX_KEY_BYTES]; m_list->getCurrentKey(firstKey); QUICKPOLL(m_niceness); // get groupId from this key //uint32_t groupId ; // . use the new Hostdb.h inlined function uint32_t shardNum = getShardNum ( m_rdbId , firstKey ); // point to start of data we're going to send char *dataStart = m_list->getListPtr(); // how many records belong to the same group as "firstKey" //key_t key; char key[MAX_KEY_BYTES]; while ( ! m_list->isExhausted() ) { //key = m_list->getCurrentKey(); m_list->getCurrentKey(key); #ifdef GBSANITYCHECK // no half bits in here! // debug point if ( m_list->useHalfKeys() && m_list->isHalfBitOn ( m_list->getCurrentRec() ) ) log(LOG_LOGIC,"net: msg1: Got half bit. Bad " "engineer."); #endif // . if key belongs to same group as firstKey then continue // . titledb now uses last bits of docId to determine groupId // . but uses the top 32 bits of key still // . spiderdb uses last 64 bits to determine groupId // . tfndb now is like titledb(top 32 bits are top 32 of docId) //if ( getGroupId(m_rdbId,key) != groupId ) goto done; if ( getShardNum(m_rdbId,key) != shardNum ) goto done; // . break so we don't send more than MAX_DGRAMS defined in // UdpServer.cpp. // . let's boost it from 16k to 64k for speed if ( m_list->getListPtr() - dataStart > 64*1024 ) goto done; // . point to next record // . 
will point passed records if no more left! QUICKPOLL(m_niceness); //int32_t crec = m_list->getCurrentRecSize(); m_list->skipCurrentRecord(); // sanity check if ( m_list->m_listPtr > m_list->m_listEnd ) { g_process.shutdownAbort(true); } } done: // now point to the end of the data char *dataEnd = m_list->getListPtr(); // . if force local is true we force the data to be added locally // . this fixes the bug we had from spiderdb since a key got corrupted // just enough to put it into a different groupId (but not out // of order) so we couldn't delete it cuz our delete keys would go // elsewhere if ( m_forceLocal && shardNum != getMyShardNum() && ! g_conf.m_interfaceMachine ) { // make the groupId local, our group //groupId = g_hostdb.m_groupId; // bitch about this to log it log("net: Data does not belong in shard %" PRIu32", but adding " "to %s anyway. Probable data corruption.", (uint32_t)shardNum,getDbnameFromId(m_rdbId)); } QUICKPOLL(m_niceness); // sanity test for new rdbs if ( m_list->m_fixedDataSize != getDataSizeFromRdbId(m_rdbId) ) { g_process.shutdownAbort(true); } // . now send this list to the host // . this returns false if blocked, true otherwise // . it also sets g_errno on error // . if it blocked return false if ( ! sendData ( shardNum , dataStart , dataEnd - dataStart ) ) return false; // if there was an error return true if ( g_errno ) return true; // otherwise, keep adding goto loop; }
// . send an add command to all machines in the appropriate group // . returns false if blocked, true otherwise // . sets g_errno on error // . groupId is -1 if we choose it automatically // . if waitForReply is false we return true right away, but we can only // launch MAX_MSG1S requests without waiting for replies, and // when the reply does come back we do NOT call the callback bool Msg1::addList ( RdbList *list , char rdbId , collnum_t collnum, // char *coll , void *state , void (* callback)(void *state) , bool forceLocal , int32_t niceness , bool injecting , bool waitForReply , bool *inTransit ) { // warning if ( collnum<0 ) log(LOG_LOGIC,"net: bad collection. msg1.cpp."); // if list has no records in it return true if ( ! list || list->isEmpty() ) return true; // sanity check if ( list->m_ks != 8 && list->m_ks != 12 && list->m_ks != 16 && list->m_ks != 24 ) { g_process.shutdownAbort(true); } // start at the beginning list->resetListPtr(); // if caller does not want reply try to accomodate him if ( ! waitForReply && list != &m_ourList ) { Msg1 *Y = getMsg1(); if ( ! Y ) { waitForReply = true; log(LOG_DEBUG,"net: msg1: " "No floating request slots " "available for adding data. " "Blocking on reply."); goto skip; } // steal the list, we don't want caller to free it gbmemcpy ( &Y->m_ourList , list , sizeof(RdbList) ); QUICKPOLL(niceness); // if list is small enough use our buf if ( ! list->m_ownData && list->m_listSize <= MSG1_BUF_SIZE ) { gbmemcpy ( Y->m_buf , list->m_list , list->m_listSize ); Y->m_ourList.m_list = Y->m_buf; Y->m_ourList.m_listEnd = Y->m_buf + list->m_listSize; Y->m_ourList.m_alloc = NULL; Y->m_ourList.m_ownData = false; } // otherwise, we cannot copy it and i don't want to mdup it... else if ( ! list->m_ownData ) { log(LOG_LOGIC,"net: msg1: List must own data. 
Bad " "engineer."); g_process.shutdownAbort(true); } // lastly, if it was a clean steal, don't let list free it else list->m_ownData = false; // reset m_listPtr and m_listPtrHi so we pass the isExhausted() // check in sendSomeOfList() below Y->m_ourList.resetListPtr(); // sanity test if ( Y->m_ourList.isExhausted() ) { log(LOG_LOGIC,"net: msg1: List is exhausted. " "Bad engineer."); g_process.shutdownAbort(true); } // now re-call bool inTransit; bool status = Y->addList ( &Y->m_ourList , rdbId , collnum , Y , // state returnMsg1 , // callback forceLocal , niceness , injecting , waitForReply , &inTransit ) ; // if we really blocked return false if ( ! status ) return false; // otherwise, it may have returned true because waitForReply // is false, but the request may still be in transit if ( inTransit ) return true; // debug msg //log("did not block, listSize=%" PRId32,m->m_ourList.m_listSize); // we did it without blocking, but it is still in transit // unless there was an error if ( g_errno ) log("net: Adding data to %s had error: %s.", getDbnameFromId(rdbId), mstrerror(g_errno)); // otherwise, if not in transit and no g_errno then it must // have really completed without blocking. in which case // we are done with "Y" returnMsg1 ( (void *)Y ); return true; } skip: // remember these vars m_list = list; m_rdbId = rdbId; m_collnum = collnum; m_state = state; m_callback = callback; m_forceLocal = forceLocal; m_niceness = niceness; m_injecting = injecting; m_waitForReply = waitForReply; QUICKPOLL(niceness); // reset m_listPtr to point to first record again list->resetListPtr(); // is the request in transit? assume not (assume did not block) if ( inTransit ) *inTransit = false; // . not all records in the list may belong to the same group // . records should be sorted by key so we don't need to sort them // . 
if this did not block, return true if ( sendSomeOfList ( ) ) return true; // it is in transit if ( inTransit ) *inTransit = true; // if we should waitForReply return false if ( m_waitForReply ) return false; // tell caller we did not block on the reply, even though we did return true; }
// . returns false if blocked, true otherwise // . sets g_errno on error // . if the list is sorted by keys this will be the most efficient bool Msg1::sendSomeOfList ( ) { // sanity check if ( m_list->m_ks != 8 && m_list->m_ks != 12 && m_list->m_ks != 16 && m_list->m_ks != 24 ) { char *xx=NULL;*xx=0; } // debug msg //log("sendSomeOfList: mcast=%lu exhausted=%li", // (long)&m_mcast,(long)m_list->isExhausted()); loop: // return true if list exhausted and nothing left to add if ( m_list->isExhausted() ) return true; // get key of the first record in the list //key_t firstKey = m_list->getCurrentKey(); char firstKey[MAX_KEY_BYTES]; m_list->getCurrentKey(firstKey); QUICKPOLL(m_niceness); // get groupId from this key //unsigned long groupId ; // . use the new Hostdb.h inlined function uint32_t shardNum = getShardNum ( m_rdbId , firstKey ); // . default is to use top bits of the key // . but if we're adding to titledb use last bits in the top of key // . but if we're adding to spiderdb we use the last long in the key // . 
tfndb urlRec key same as titleRec key /* if ( m_rdbId == RDB_INDEXDB ) groupId = g_indexdb.getGroupIdFromKey((key_t *)firstKey); else if ( m_rdbId == RDB_DATEDB ) groupId = g_datedb.getGroupIdFromKey((key128_t *)firstKey); else if ( m_rdbId == RDB_TITLEDB) groupId = g_titledb.getGroupIdFromKey((key_t *)firstKey); else if ( m_rdbId == RDB_CHECKSUMDB) groupId = g_checksumdb.getGroupId ( firstKey ); else if ( m_rdbId == RDB_SPIDERDB ) groupId = g_spiderdb.getGroupId ( (key_t *)firstKey ); else if ( m_rdbId == RDB_TFNDB ) groupId = g_tfndb.getGroupId ( (key_t *)firstKey ); else if ( m_rdbId == RDB_CLUSTERDB ) groupId = g_clusterdb.getGroupIdFromKey((key_t *)firstKey); else if ( m_rdbId == RDB2_INDEXDB2 ) groupId = g_indexdb.getGroupIdFromKey((key_t *)firstKey); else if ( m_rdbId == RDB2_DATEDB2 ) groupId = g_datedb.getGroupIdFromKey((key128_t *)firstKey); else if ( m_rdbId == RDB2_TITLEDB2) groupId = g_titledb.getGroupIdFromKey((key_t *)firstKey); else if ( m_rdbId == RDB2_CHECKSUMDB2) groupId = g_checksumdb.getGroupId ( firstKey ); else if ( m_rdbId == RDB2_SPIDERDB2 ) groupId = g_spiderdb.getGroupId ( (key_t *)firstKey ); else if ( m_rdbId == RDB2_TFNDB2 ) groupId = g_tfndb.getGroupId ( (key_t *)firstKey ); else if ( m_rdbId == RDB2_CLUSTERDB2 ) groupId = g_clusterdb.getGroupIdFromKey((key_t *)firstKey); //else groupId=firstKey.n1 & g_hostdb.m_groupMask; else groupId = (((key_t *)firstKey)->n1) & g_hostdb.m_groupMask; */ // point to start of data we're going to send char *dataStart = m_list->getListPtr(); // how many records belong to the same group as "firstKey" //key_t key; char key[MAX_KEY_BYTES]; while ( ! m_list->isExhausted() ) { //key = m_list->getCurrentKey(); m_list->getCurrentKey(key); #ifdef _SANITYCHECK_ // no half bits in here! // debug point if ( m_list->useHalfKeys() && m_list->isHalfBitOn ( m_list->getCurrentRec() ) ) log(LOG_LOGIC,"net: msg1: Got half bit. Bad " "engineer."); #endif // . if key belongs to same group as firstKey then continue // . 
titledb now uses last bits of docId to determine groupId // . but uses the top 32 bits of key still // . spiderdb uses last 64 bits to determine groupId // . tfndb now is like titledb(top 32 bits are top 32 of docId) //if ( getGroupId(m_rdbId,key) != groupId ) goto done; if ( getShardNum(m_rdbId,key) != shardNum ) goto done; /* switch ( m_rdbId ) { case RDB_TITLEDB: if(g_titledb.getGroupIdFromKey((key_t *)key)!=groupId) goto done; break; case RDB_CHECKSUMDB: if(g_checksumdb.getGroupId ( key)!=groupId) goto done; break; case RDB_SPIDERDB: if ( g_spiderdb.getGroupId ((key_t *)key) != groupId) goto done; break; case RDB_TFNDB: if ( g_tfndb.getGroupId ((key_t *)key) != groupId) goto done; break; case RDB_CLUSTERDB: if(g_clusterdb.getGroupIdFromKey((key_t *)key)!=groupId) goto done; break; case RDB_DATEDB: if(g_datedb.getGroupIdFromKey((key128_t *)key)!=groupId) goto done; break; case RDB_INDEXDB: if(g_indexdb.getGroupIdFromKey((key_t *)key)!=groupId) goto done; break; //default:if ((key.n1&g_hostdb.m_groupMask) != groupId) default: if ( ((((key_t *)key)->n1) & g_hostdb.m_groupMask) != groupId) goto done; } */ // . break so we don't send more than MAX_DGRAMS defined in // UdpServer.cpp. // . let's boost it from 16k to 64k for speed if ( m_list->getListPtr() - dataStart > 64*1024 ) goto done; // . point to next record // . will point passed records if no more left! QUICKPOLL(m_niceness); //long crec = m_list->getCurrentRecSize(); m_list->skipCurrentRecord(); // sanity check if ( m_list->m_listPtr > m_list->m_listEnd ) { char *xx=NULL;*xx=0; } } done: // now point to the end of the data char *dataEnd = m_list->getListPtr(); // . if force local is true we force the data to be added locally // . this fixes the bug we had from spiderdb since a key got corrupted // just enough to put it into a different groupId (but not out // of order) so we couldn't delete it cuz our delete keys would go // elsewhere if ( m_forceLocal && shardNum != getMyShardNum() && ! 
g_conf.m_interfaceMachine ) { // make the groupId local, our group //groupId = g_hostdb.m_groupId; // bitch about this to log it log("net: Data does not belong in shard %lu, but adding " "to %s anyway. Probable data corruption.", (unsigned long)shardNum,getDbnameFromId(m_rdbId)); } QUICKPOLL(m_niceness); // sanity test for new rdbs if ( m_list->m_fixedDataSize != getDataSizeFromRdbId(m_rdbId) ) { char *xx=NULL;*xx=0; } // little debug thing for genCatdb from msg9b's huge list add //if ( m_list->m_listSize > 10000000 ) // log("msg1: adding chunk @ %li of %li bytes", // (long)(dataStart - m_list->m_list) , // (long)m_list->m_listSize ); // . now send this list to the host // . this returns false if blocked, true otherwise // . it also sets g_errno on error // . if it blocked return false if ( ! sendData ( shardNum , dataStart , dataEnd - dataStart ) ) return false; // if there was an error return true if ( g_errno ) return true; // otherwise, keep adding goto loop; }