// . returns false on error // . called from PageHosts.cpp!!! bool Syncdb::syncHost ( long syncHostId ) { Host *sh = g_hostdb.getHost ( syncHostId ); if ( ! sh ) return log("sync: bad host id %li",syncHostId); // get its group //Host *hosts = g_hostdb.getGroup ( sh->m_groupId ); Host *hosts = g_hostdb.getShard ( sh->m_shardNum ); // get the best twin for it to sync from for ( long i = 0 ; i < g_hostdb.getNumHostsPerShard() ; i++ ) { // get host Host *h = &hosts[i]; // skip if dead if ( g_hostdb.isDead ( h ) ) continue; // skip if permanent out of sync if ( h->m_isPermanentOutOfSync ) continue; // not itself! it must be dead... wtf!? if ( h == sh ) continue; // save it long tmp = syncHostId; // log it log("sync: sending sync request to host id #%li",h->m_hostId); // shortcut UdpServer *us = &g_udpServer; // use that guy if ( us->sendRequest ( (char *)&tmp , 4 , 0x55 , // SYNCDB REQUEST 0 , // ip 0 , // port h->m_hostId , // hostId NULL , // retSlot NULL , // state gotReplyWrapper55 , // wrapper 15 , // timeout -1 , // backoff -1 , // maxWait NULL , // replyBuf 0 , // replyBufSize MAX_NICENESS )) // success return true; // note it log("sync: had error sending sync request to host id #%li: %s", h->m_hostId,mstrerror(g_errno)); // error! return false; } // none to sync from return log("sync: could not find adequate twin to sync from!"); }
// . returns false if blocked, true otherwise.
// . returns true and sets g_errno on error
// . before we can spider for a SpiderRequest we must be granted the lock
// . each group (shard) shares the same doledb and each host in the shard
//   competes for spidering all those urls
// . that way if a host goes down its load is taken over
// . NOTE(review): this function is DISABLED -- the unconditional
//   "char *xx=NULL;*xx=0;" below crashes on entry. kept for reference.
bool Msg12::getLocks ( int64_t uh48, // probDocId ,
		       char *url ,
		       DOLEDBKEY *doledbKey,
		       collnum_t collnum,
		       int32_t sameIpWaitTime,
		       int32_t maxSpidersOutPerIp,
		       int32_t firstIp,
		       void *state ,
		       void (* callback)(void *state) ) {

	// sanity: must not be called while msg12 replies are outstanding
	if ( m_numRequests != m_numReplies ) { char *xx=NULL;*xx=0; }

	// no longer used: crash intentionally if anyone still calls this
	char *xx=NULL;*xx=0;

	// get # of hosts in each mirror group (shard)
	int32_t hpg = g_hostdb.getNumHostsPerShard();

	// reset state for this lock round
	m_numRequests = 0;
	m_numReplies  = 0;
	m_grants      = 0;
	m_removing    = false;
	m_confirming  = false;

	// . the lock key is derived from the url hash (uh48) and firstIp,
	//   not the actual docid, so a url maps to the same lock whether or
	//   not it is already indexed under a different actual docid; query
	//   reindex (PageReindex.cpp) adds docid-based spider requests and
	//   only knows the docid, not the uh48
	m_lockKeyUh48 = makeLockTableKey ( uh48 , firstIp );
	m_url = url;
	m_callback = callback;
	m_state = state;
	m_hasLock = false;
	m_origUh48 = uh48;
	// support ability to spider multiple urls from same ip
	m_doledbKey = *doledbKey;
	m_collnum = collnum;
	m_sameIpWaitTime = sameIpWaitTime;
	m_maxSpidersOutPerIp = maxSpidersOutPerIp;
	m_firstIp = firstIp;

	// sanity check: uh48 and the lock key are just 6 bytes (48 bits)
	if ( uh48 & 0xffff000000000000LL ) { char *xx=NULL;*xx=0; }
	if ( m_lockKeyUh48 & 0xffff000000000000LL ) { char *xx=NULL;*xx=0; }

	// cache time for a missed lock, in seconds
	int32_t ct = 120;
	// if docid based assume it was a query reindex and keep it short!
	// otherwise we end up waiting 120 seconds for a query reindex to
	// go through on a docid we just spidered. TODO: use m_urlIsDocId
	if ( url && is_digit(url[0]) ) ct = 2;
	// . the fixed ages above were preventing new additions to doledb
	//   when only spidering a few IPs, so randomize to break twin
	//   contention (NOTE: this overrides BOTH assignments above)
	ct = rand() % 10;

	// . check our cache of missed locks to avoid repetitive asking
	// . getLong() returns -1 if not in cache
	int32_t lockTime ;
	lockTime = g_spiderLoop.m_lockCache.getLong(0,m_lockKeyUh48,ct,true);
	// if we recently missed this lock, return true now with m_hasLock
	// left false
	if ( lockTime >= 0 ) {
		if ( g_conf.m_logDebugSpider )
			logf(LOG_DEBUG,"spider: cached missed lock for %s "
			     "lockkey=%" PRIu64, m_url,m_lockKeyUh48);
		return true;
	}

	if ( g_conf.m_logDebugSpider )
		logf(LOG_DEBUG,"spider: sending lock request for %s "
		     "lockkey=%" PRIu64, m_url,m_lockKeyUh48);

	// the same shard that has the spiderRequest/Reply is the one
	// responsible for locking
	Host *hosts = g_hostdb.getMyShard();

	// shortcut
	UdpServer *us = &g_udpServer;

	static int32_t s_lockSequence = 0;
	// remember the lock sequence # in case we have to call remove locks
	m_lockSequence = s_lockSequence++;

	// fill in the lock request we broadcast to the shard
	LockRequest *lr = &m_lockRequest;
	lr->m_lockKeyUh48 = m_lockKeyUh48;
	lr->m_firstIp = m_firstIp;
	lr->m_removeLock = 0;
	lr->m_lockSequence = m_lockSequence;
	lr->m_collnum = collnum;

	// reset counts
	m_numRequests = 0;
	m_numReplies  = 0;

	// point to start of the request buffer
	char *request = (char *)lr;
	int32_t requestSize = sizeof(LockRequest);

	// loop over hosts in that shard
	for ( int32_t i = 0 ; i < hpg ; i++ ) {
		// get a host
		Host *h = &hosts[i];
		// skip if dead! no need to get a reply from dead guys
		if ( g_hostdb.isDead (h) ) continue;
		// note it
		if ( g_conf.m_logDebugSpider )
			logf(LOG_DEBUG,"spider: sent lock "
			     "request #%" PRId32" for lockkey=%" PRIu64" %s to "
			     "hid=%" PRId32,m_numRequests,m_lockKeyUh48,
			     m_url,h->m_hostId);
		// send request to him
		if ( ! us->sendRequest ( request ,
					 requestSize ,
					 0x12 , // msgType
					 h->m_ip ,
					 h->m_port ,
					 h->m_hostId ,
					 NULL , // retSlotPtrPtr
					 this , // state data
					 gotLockReplyWrapper ,
					 udpserver_sendrequest_infinite_timeout ) )
			// udpserver returns false and sets g_errno on error
			return true;
		// count them
		m_numRequests++;
	}

	// if we sent anything, block until the replies come in
	if ( m_numRequests > 0 ) return false;

	// m_hasLock should be false... all lock hosts seem dead... wait
	if ( g_conf.m_logDebugSpider )
		logf(LOG_DEBUG,"spider: all lock hosts seem dead for %s "
		     "lockkey=%" PRIu64, m_url,m_lockKeyUh48);
	return true;
}
// . broadcast a ConfirmRequest to every live host in our shard telling
//   them the lock was won, so they remove the url from doledb and re-add
//   an entry to the waiting tree
// . returns false if blocked waiting for replies, true otherwise
// . NOTE(review): DISABLED -- crashes intentionally on entry (see below)
bool Msg12::confirmLockAcquisition ( ) {
	// sanity: no msg12 replies may be outstanding
	if ( m_numRequests != m_numReplies ) { char *xx=NULL;*xx=0; }
	// no longer used: crash intentionally if anyone still calls this
	char *xx=NULL;*xx=0;
	// we are now confirming
	m_confirming = true;
	// . make that the request
	// . m_lockSequence should still be valid from the lock round
	ConfirmRequest *cq = &m_confirmRequest;
	char *request = (char *)cq;
	int32_t requestSize = sizeof(ConfirmRequest);
	// sanity: the receiving handler distinguishes lock requests from
	// confirm requests by size, so the structs must differ in size
	if ( requestSize == sizeof(LockRequest)){ char *xx=NULL;*xx=0; }
	// set it
	cq->m_collnum = m_collnum;
	cq->m_doledbKey = m_doledbKey;
	cq->m_firstIp = m_firstIp;
	cq->m_lockKeyUh48 = m_lockKeyUh48;
	cq->m_maxSpidersOutPerIp = m_maxSpidersOutPerIp;
	// . the same shard that has the spiderRequest/Reply is the one
	//   responsible for locking
	// . it must be OUR shard because we check our local lock table to
	//   see if a doled url is locked before spidering it ourselves
	Host *hosts = g_hostdb.getMyShard();
	// shortcut
	UdpServer *us = &g_udpServer;
	// get # of hosts in each mirror group
	int32_t hpg = g_hostdb.getNumHostsPerShard();
	// reset counts
	m_numRequests = 0;
	m_numReplies  = 0;
	// note it
	if ( g_conf.m_logDebugSpider )
		log("spider: confirming lock for uh48=%" PRIu64" firstip=%s",
		    m_lockKeyUh48,iptoa(m_firstIp));
	// loop over hosts in that shard
	for ( int32_t i = 0 ; i < hpg ; i++ ) {
		// get a host
		Host *h = &hosts[i];
		// skip if dead! no need to get a reply from dead guys
		if ( g_hostdb.isDead ( h ) ) continue;
		// send request to him
		if ( ! us->sendRequest ( request ,
					 requestSize ,
					 0x12 , // msgType
					 h->m_ip ,
					 h->m_port ,
					 h->m_hostId ,
					 NULL , // retSlotPtrPtr
					 this , // state data
					 gotLockReplyWrapper ,
					 udpserver_sendrequest_infinite_timeout ) )
			// udpserver returns false and sets g_errno on error
			return true;
		// count them
		m_numRequests++;
	}
	// block?
	if ( m_numRequests > 0 ) return false;
	// did not block
	return true;
}
// . broadcast a remove-lock request to every live host in our shard for
//   the current lock key (used after a partial set of lock grants)
// . returns false if blocked waiting for replies, true otherwise
// . NOTE(review): DISABLED -- crashes intentionally on entry (see below)
bool Msg12::removeAllLocks ( ) {
	// sanity: no msg12 replies may be outstanding
	if ( m_numRequests != m_numReplies ) { char *xx=NULL;*xx=0; }
	// no longer used: crash intentionally if anyone still calls this
	char *xx=NULL;*xx=0;
	if ( g_conf.m_logDebugSpider )
		logf(LOG_DEBUG,"spider: removing all locks for %s %" PRIu64,
		     m_url,m_lockKeyUh48);
	// we are now removing
	m_removing = true;
	// reuse the lock request but flag it as a removal
	LockRequest *lr = &m_lockRequest;
	lr->m_lockKeyUh48 = m_lockKeyUh48;
	lr->m_lockSequence = m_lockSequence; // still valid from lock round
	lr->m_firstIp = m_firstIp;
	lr->m_removeLock = 1;
	// reset counts
	m_numRequests = 0;
	m_numReplies  = 0;
	// point to start of the request buffer
	char *request = (char *)lr;
	int32_t requestSize = sizeof(LockRequest);
	// . our own shard is responsible for locking because we check our
	//   local lock table before spidering a doled url ourselves
	Host *hosts = g_hostdb.getMyShard();
	// shortcut
	UdpServer *us = &g_udpServer;
	// get # of hosts in each mirror group
	int32_t hpg = g_hostdb.getNumHostsPerShard();
	// loop over hosts in that shard
	for ( int32_t i = 0 ; i < hpg ; i++ ) {
		// get a host
		Host *h = &hosts[i];
		// skip if dead! no need to get a reply from dead guys
		if ( g_hostdb.isDead ( h ) ) continue;
		// send request to him
		if ( ! us->sendRequest ( request ,
					 requestSize ,
					 0x12 , // msgType
					 h->m_ip ,
					 h->m_port ,
					 h->m_hostId ,
					 NULL , // retSlotPtrPtr
					 this , // state data
					 gotLockReplyWrapper ,
					 udpserver_sendrequest_infinite_timeout ) )
			// udpserver returns false and sets g_errno on error
			return true;
		// count them
		m_numRequests++;
	}
	// block?
	if ( m_numRequests > 0 ) return false;
	// did not block
	return true;
}
// . returns true if all done, false if waiting for more replies
// . invoked (via gotLockReplyWrapper) once per host reply to a lock,
//   confirm, or remove-lock broadcast (msg 0x12)
// . NOTE(review): like the senders, this begins with an intentional
//   crash ("no longer use this") -- the msg12 path is disabled
bool Msg12::gotLockReply ( UdpSlot *slot ) {

	// no longer used: crash intentionally if anyone still calls this
	char *xx=NULL;*xx=0;

	// got reply
	m_numReplies++;

	// don't let udpserver free the request, it's our m_request[]
	slot->m_sendBufAlloc = NULL;

	// the reply payload: a single byte, 1 = lock granted
	char *reply = slot->m_readBuf;
	int32_t replySize = slot->m_readBufSize;

	// if error, treat as a not grant
	if ( g_errno ) {
		bool logIt = true;
		// note it
		if ( g_conf.m_logDebugSpider )
			log("spider: got msg12 reply error = %s",
			    mstrerror(g_errno));
		// . if we got an ETRYAGAIN when trying to confirm our lock
		//   that means doledb was saving/dumping to disk and we
		//   could not remove the record from doledb and add an
		//   entry to the waiting tree, so we need to keep trying
		if ( g_errno == ETRYAGAIN && m_confirming ) {
			// count it again since we now expect another reply
			m_numRequests++;
			// use what we were using
			char *request = (char *)&m_confirmRequest;
			int32_t requestSize = sizeof(ConfirmRequest);
			// NOTE(review): no NULL check on h -- assumes the
			// replying host is still in the host table; confirm
			Host *h = g_hostdb.getHost(slot->m_hostId);
			// send request to him
			UdpServer *us = &g_udpServer;
			if ( ! us->sendRequest ( request ,
						 requestSize ,
						 0x12 , // msgType
						 h->m_ip ,
						 h->m_port ,
						 h->m_hostId ,
						 NULL , // retSlotPtrPt
						 this , // state data
						 gotLockReplyWrapper ,
						 udpserver_sendrequest_infinite_timeout ) )
				// resend is in flight; wait for its reply
				return false;
			// resend failed; don't spam the log!
			static int32_t s_last = 0;
			int32_t now = getTimeLocal();
			if ( now - s_last >= 1 ) {
				s_last = now;
				log("spider: error re-sending confirm "
				    "request: %s", mstrerror(g_errno));
			}
		}
		// only log ETRYAGAIN errors every few seconds
		if ( g_errno == ETRYAGAIN ) {
			static time_t s_lastTime = 0;
			time_t now = getTimeLocal();
			logIt = false;
			if ( now - s_lastTime >= 3 ) {
				logIt = true;
				s_lastTime = now;
			}
		}
		if ( logIt )
			log ( "sploop: host had error getting lock url=%s"
			      ": %s" , m_url,mstrerror(g_errno) );
	}

	// grant or not
	if ( replySize == 1 && ! g_errno && *reply == 1 ) m_grants++;

	// wait for all to get back
	if ( m_numReplies < m_numRequests ) return false;

	// all done if we were removing
	if ( m_removing ) {
		// note it
		if ( g_conf.m_logDebugSpider )
			logf(LOG_DEBUG,"spider: done removing all locks "
			     "(replies=%" PRId32") for %s",
			     m_numReplies,m_url);
		// we are done
		m_gettingLocks = false;
		return true;
	}

	// all done if we were confirming
	if ( m_confirming ) {
		// note it
		if ( g_conf.m_logDebugSpider )
			logf(LOG_DEBUG,"spider: done confirming all locks "
			     "for %s uh48=%" PRId64,m_url,m_origUh48);
		// we are done
		m_gettingLocks = false;
		// . keep processing
		// . if the collection was nuked from under us the spiderUrl2
		//   will return true and set g_errno
		if ( ! m_callback ) return g_spiderLoop.spiderUrl2();
		// if we had a callback let our parent call it
		return true;
	}

	// if got ALL locks, spider it
	if ( m_grants == m_numReplies ) {
		// note it
		if ( g_conf.m_logDebugSpider )
			logf(LOG_DEBUG,"spider: got lock for docid=lockkey=%" PRIu64,
			     m_lockKeyUh48);
		// flag this
		m_hasLock = true;
		// . now tell our shard to remove from doledb and re-add to
		//   the waiting tree; the evalIpLoop() function then skips
		//   this probable docid because it is in the LOCK TABLE
		// . this logic should allow us to spider multiple urls
		//   from the same IP at the same time
		// . returns false if it would block
		if ( ! confirmLockAcquisition ( ) ) return false;
		// . we did it without blocking, maybe cuz we are a single node
		// . if the collection was nuked from under us the spiderUrl2
		//   will return true and set g_errno
		if ( ! m_callback ) g_spiderLoop.spiderUrl2 ( );
		// all done
		return true;
	}

	// note it
	if ( g_conf.m_logDebugSpider )
		logf(LOG_DEBUG,"spider: missed lock for %s lockkey=%" PRIu64" "
		     "(grants=%" PRId32")",
		     m_url,m_lockKeyUh48,m_grants);

	// . if it was locked by another then add to our lock cache so we do
	//   not try to lock it again right away
	// . if grants is not 0 then one host granted us the lock, but not
	//   all hosts, so we should keep trying until one host has it
	//   100% locked
	if ( m_grants == 0 ) {
		int32_t now = getTimeGlobal();
		g_spiderLoop.m_lockCache.addLong(0,m_lockKeyUh48,now,NULL);
	}

	// reset again
	m_numRequests = 0;
	m_numReplies  = 0;

	// no need to remove them if none were granted because another
	// host in our group might have it 100% locked
	if ( m_grants == 0 ) {
		// no longer in locks operation mode
		m_gettingLocks = false;
		// all done
		return true;
	}

	// note that
	if ( g_conf.m_logDebugSpider )
		logf(LOG_DEBUG,"spider: sending request to all in shard to "
		     "remove lock uh48=%" PRIu64". grants=%" PRId32,
		     m_lockKeyUh48,(int32_t)m_grants);

	// . remove the partial set of locks we got
	// . we cannot remove only locks "from our hostid" because another
	//   one of our own spiders might be the one holding them
	if ( ! removeAllLocks ( ) ) return false;

	// if did not block, how'd that happen?
	log("sploop: did not block in removeAllLocks: %s",mstrerror(g_errno));
	return true;
}
// . THIS Msg0 class must be alloc'd, i.e. not on the stack, etc.
// . if list is stored locally this tries to get it locally
// . otherwise tries to get the list from the network
// . returns false if blocked, true otherwise
// . sets g_errno on error
// . NOTE: i was having problems with queries being cached too long, you
//   see the cache here is a NETWORK cache, so when the machine that owns
//   the list updates it on disk it can't flush our cache... so use a
//   small maxCacheAge of like 30 seconds or so
bool Msg0::getList ( int64_t hostId , // host to ask (-1 if none)
		     int32_t ip , // info on hostId
		     int16_t port ,
		     int32_t maxCacheAge , // max cached age in seconds
		     bool addToCache , // add net recv'd list to cache?
		     char rdbId , // specifies the rdb
		     collnum_t collnum ,
		     RdbList *list ,
		     const char *startKey ,
		     const char *endKey ,
		     int32_t minRecSizes , // use -1 for no max
		     void *state ,
		     void (* callback)(void *state ),
		     int32_t niceness ,
		     bool doErrorCorrection ,
		     bool includeTree ,
		     bool doMerge , // (unused in this body)
		     int32_t firstHostId ,
		     int32_t startFileNum ,
		     int32_t numFiles ,
		     int64_t timeout ,
		     int64_t syncPoint ,
		     int32_t preferLocalReads ,
		     Msg5 *msg5 ,
		     bool isRealMerge ,
		     bool allowPageCache ,
		     bool forceLocalIndexdb ,
		     bool noSplit ,
		     int32_t forceParitySplit ) {
	logTrace( g_conf.m_logTraceMsg0, "BEGIN. hostId: %" PRId64", rdbId: %d", hostId, (int)rdbId );

	// warning
	if ( collnum < 0 ) log(LOG_LOGIC,"net: NULL collection. msg0.");

	// reset the list they passed us
	list->reset();

	// get keySize of rdb
	m_ks = getKeySizeFromRdbId ( rdbId );

	// a backwards key range is a caller bug; crash intentionally
	if ( KEYCMP(startKey,endKey,m_ks)>0 ) { char *xx=NULL;*xx=0; }

	// . reset hostid if it is dead
	// . this is causing UOR queries to take forever when we have a dead
	if ( hostId >= 0 && g_hostdb.isDead ( hostId ) ) hostId = -1;

	// no longer accept negative minrecsizes
	if ( minRecSizes < 0 ) {
		g_errno = EBADENGINEER;
		logTrace( g_conf.m_logTraceMsg0, "END" );
		log(LOG_LOGIC,
		    "net: msg0: Negative minRecSizes no longer supported.");
		// intentional crash: callers must be fixed
		char *xx=NULL;*xx=0;
	}

	// remember these
	m_state = state;
	m_callback = callback;
	m_list = list;
	m_hostId = hostId;
	m_niceness = niceness;
	m_addToCache = addToCache;
	// . these define our request 100%
	KEYSET(m_startKey,startKey,m_ks);
	KEYSET(m_endKey,endKey,m_ks);
	m_minRecSizes = minRecSizes;
	m_rdbId = rdbId;
	m_collnum = collnum;
	m_isRealMerge = isRealMerge;
	m_allowPageCache = allowPageCache;

	// . the shard to ask is based on the first key
	// . we only do 1 shard per call right now
	// . if diffbot.cpp is reading spiderdb from each shard we have to
	//   avoid getShardNum() here lest we core; it dumps spiderdb to the
	//   client browser so they can download the whole enchilada
	if ( hostId >= 0 && m_rdbId == RDB_SPIDERDB )
		m_shardNum = 0;
	// did they force it? core until i figure out what this is
	else if ( forceParitySplit >= 0 )
		m_shardNum = forceParitySplit;
	else
		m_shardNum = getShardNum ( m_rdbId , startKey );

	// if we are looking up a termlist in posdb that is split by termid
	// and not the usual docid then use the termid-based shard
	if ( noSplit && m_rdbId == RDB_POSDB )
		m_shardNum = g_hostdb.getShardNumByTermId ( startKey );

	// force a local read regardless of which shard owns the key
	if ( forceLocalIndexdb ) m_shardNum = getMyShardNum();

	// . get a handle to the rdb in case we can satisfy locally
	// . returns NULL and sets g_errno on error
	QUICKPOLL((m_niceness));
	Rdb *rdb = getRdbFromId ( m_rdbId );
	if ( ! rdb ) return true;

	// we need the fixedDataSize
	m_fixedDataSize = rdb->getFixedDataSize();
	m_useHalfKeys = rdb->useHalfKeys();

	// . debug msg
	// . Msg2 does this when checking for a cached compound list.
	//   compound lists do not actually exist, they are merges of
	//   smaller UOR'd lists.
	if ( maxCacheAge != 0 && ! addToCache && (numFiles > 0 || includeTree)) {
		log( LOG_LOGIC, "net: msg0: Weird. check but don't add... rdbid=%" PRId32".", ( int32_t ) m_rdbId );
	}

	// . now that we do load balancing we don't want to do a disk lookup
	//   even if local if we are merging or dumping
	// . UNLESS g_conf.m_preferLocalReads is true
	if ( preferLocalReads == -1 )
		preferLocalReads = g_conf.m_preferLocalReads;

	// . always prefer local for full split clusterdb, keeping the
	//   tfndb/titledb lookups in the same stripe
	// . NOTE(review): this unconditionally overrides the value computed
	//   just above, making the preferLocalReads parameter moot
	preferLocalReads = true;

	// it it stored locally?
	bool isLocal = ( m_hostId == -1 &&
			 m_shardNum == getMyShardNum() );

	// only do local lookups if this is true
	if ( ! preferLocalReads ) isLocal = false;

	// but always local if only one host
	if ( g_hostdb.getNumHosts() == 1 ) isLocal = true;

	// . if the shard is local then do it locally
	// . Msg5::getList() returns false if blocked, true otherwise
	// . Msg5::getList() sets g_errno on error
	// . don't do this if m_hostId was specified
	if ( isLocal ) {
		logTrace( g_conf.m_logTraceMsg0, "isLocal" );
		// use the caller-supplied Msg5 if given, else alloc one
		if ( msg5 ) {
			m_msg5 = msg5;
			m_deleteMsg5 = false;
		}
		else {
			try { m_msg5 = new ( Msg5 ); }
			catch ( ... ) {
				// alloc failed; fall through to the
				// network path below
				g_errno = ENOMEM;
				log("net: Local alloc for disk read failed "
				    "while tring to read data for %s. "
				    "Trying remote request.",
				    getDbnameFromId(m_rdbId));
				goto skip;
			}
			mnew ( m_msg5 , sizeof(Msg5) , "Msg0::Msg5" );
			m_deleteMsg5 = true;
		}
		QUICKPOLL(m_niceness);
		if ( ! m_msg5->getList ( rdbId,
					 m_collnum ,
					 m_list ,
					 m_startKey ,
					 m_endKey ,
					 m_minRecSizes ,
					 includeTree , // include Tree?
					 addToCache , // addToCache?
					 maxCacheAge ,
					 startFileNum ,
					 numFiles ,
					 this ,
					 gotListWrapper2 ,
					 niceness ,
					 doErrorCorrection ,
					 NULL , // cacheKeyPtr
					 0 , // retryNum
					 -1 , // maxRetries
					 true , // compensateForMerge
					 syncPoint ,
					 m_isRealMerge ,
					 m_allowPageCache ) ) {
			logTrace( g_conf.m_logTraceMsg0, "END, return false" );
			return false;
		}
		// nuke it
		reset();
		logTrace( g_conf.m_logTraceMsg0, "END, return true" );
		return true;
	}
 skip:
	// debug msg
	if ( g_conf.m_logDebugQuery )
		log(LOG_DEBUG,"net: msg0: Sending request for data to "
		    "shard=%" PRIu32" "
		    "listPtr=%" PTRFMT" minRecSizes=%" PRId32" termId=%" PRIu64" "
		    "startKey.n1=%" PRIx64",n0=%" PRIx64" (niceness=%" PRId32")",
		    m_shardNum,
		    (PTRTYPE)m_list,
		    m_minRecSizes,
		    g_posdb.getTermId(m_startKey) ,
		    KEY1(m_startKey,m_ks),KEY0(m_startKey),
		    (int32_t)m_niceness);

	char *replyBuf = NULL;
	int32_t replyBufMaxSize = 0;
	bool freeReply = true;

	// . serialize the request into m_request (note: NOT network order)
	// . the layout must match the 0x00 handler's parse; the sanity
	//   check below pins where the rdbId byte lands (RDBIDOFFSET)
	char *p = m_request;
	*(int64_t *) p = syncPoint ; p += 8;
	*(int32_t *) p = m_minRecSizes ; p += 4;
	*(int32_t *) p = startFileNum ; p += 4;
	*(int32_t *) p = numFiles ; p += 4;
	*(int32_t *) p = maxCacheAge ; p += 4;
	if ( p - m_request != RDBIDOFFSET ) { char *xx=NULL;*xx=0; }
	*p = m_rdbId ; p++;
	*p = addToCache ; p++;
	*p = doErrorCorrection; p++;
	*p = includeTree ; p++;
	*p = (char)niceness ; p++;
	*p = (char)m_allowPageCache; p++;
	KEYSET(p,m_startKey,m_ks); ; p+=m_ks;
	KEYSET(p,m_endKey,m_ks); ; p+=m_ks;
	// NUL byte where the NULL-terminated collection name used to go
	*p++ = '\0';
	*(collnum_t *)p = m_collnum; p += sizeof(collnum_t);
	m_requestSize = p - m_request;

	// ask an individual host for this list if hostId is NOT -1
	if ( m_hostId != -1 ) {
		// get Host
		Host *h = g_hostdb.getHost ( m_hostId );
		if ( ! h ) {
			g_errno = EBADHOSTID;
			log(LOG_LOGIC,"net: msg0: Bad hostId of %" PRId64".",
			    m_hostId);
			logTrace( g_conf.m_logTraceMsg0, "END, return true. Bad hostId" );
			return true;
		}
		UdpServer *us ;
		uint16_t port;
		QUICKPOLL(m_niceness);
		us = &g_udpServer ;
		port = h->m_port ;
		// . returns false on error and sets g_errno, true otherwise
		// . calls callback when reply is received (or error)
		// . we return true if it returns false
		if ( ! us->sendRequest ( m_request ,
					 m_requestSize ,
					 0x00 , // msgType
					 h->m_ip ,
					 port ,
					 m_hostId ,
					 NULL , // the slotPtr
					 this ,
					 gotSingleReplyWrapper ,
					 timeout ,
					 -1 , // backoff
					 -1 , // maxwait
					 replyBuf ,
					 replyBufMaxSize ,
					 m_niceness ) ) { // cback niceness
			logTrace( g_conf.m_logTraceMsg0, "END, return true. Request sent" );
			return true;
		}
		// return false cuz it blocked
		logTrace( g_conf.m_logTraceMsg0, "END, return false. sendRequest blocked" );
		return false;
	}

	// timing debug
	if ( g_conf.m_logTimingNet )
		m_startTime = gettimeofdayInMilliseconds();
	else
		m_startTime = 0;

	// . hash the startKey; multicast uses this to pick the first host
	//   in the shard to try (disk load balancing)
	int32_t keyTop = hash32 ( (char *)startKey , m_ks );

	// . otherwise, multicast to a host in shard m_shardNum
	// . returns false and sets g_errno on error
	// . calls callback on completion
	m_numRequests = 0;
	m_numReplies = 0;
	QUICKPOLL(m_niceness);
	char *buf;
	buf = replyBuf;
	// get the multicast
	Multicast *m = &m_mcast;
	if ( ! m->send ( m_request ,
			 m_requestSize,
			 0x00 , // msgType 0x00
			 false , // does multicast own request?
			 m_shardNum ,
			 false , // send to whole group?
			 keyTop , // key is passed on startKey
			 this , // state data
			 NULL , // state data
			 gotMulticastReplyWrapper0 ,
			 // NOTE(review): timeout is scaled by 1000 here --
			 // confirm the units Multicast::send expects
			 timeout*1000 , // timeout
			 niceness ,
			 firstHostId ,
			 buf ,
			 replyBufMaxSize ,
			 freeReply , // free reply buf?
			 true , // do disk load balancing?
			 maxCacheAge ,
			 // cache key (0 for now); multicast uses it for
			 // determining the best host to send the request
			 // to when doing disk load balancing
			 0 ,
			 rdbId ,
			 minRecSizes ) ) {
		log(LOG_ERROR,
		    "net: Failed to send request for data from %s in shard "
		    "#%" PRIu32" over network: %s.",
		    getDbnameFromId(m_rdbId),m_shardNum,
		    mstrerror(g_errno));
		// but speed it up
		m_errno = g_errno;
		m->reset();
		if ( m_numRequests > 0 ) {
			logTrace( g_conf.m_logTraceMsg0, "END - returning false" );
			return false;
		}
		logTrace( g_conf.m_logTraceMsg0, "END - returning true" );
		return true;
	}
	m_numRequests++;
	// we blocked
	logTrace( g_conf.m_logTraceMsg0, "END - returning false, blocked" );
	return false;
}
// . THIS Msg0 class must be alloc'd, i.e. not on the stack, etc.
// . if list is stored locally this tries to get it locally
// . otherwise tries to get the list from the network
// . returns false if blocked, true otherwise
// . sets g_errno on error
// . NOTE: i was having problems with queries being cached too long, you
//   see the cache here is a NETWORK cache, so when the machine that owns
//   the list updates it on disk it can't flush our cache... so use a small
//   maxCacheAge of like 30 seconds or so...
// . overall flow:
//   (1) sanity-check arguments and reset the caller's list
//   (2) record the request parameters on this Msg0 (the async reply
//       handlers read them back from member variables)
//   (3) pick the shard that owns [startKey,endKey] for this rdb
//   (4) if that shard is us and local reads are preferred, satisfy the
//       request locally via Msg5 (disk + tree)
//   (5) otherwise serialize the request into m_request and either send
//       it to one explicit host (hostId != -1) or multicast it to the
//       owning shard; the callback fires when the reply arrives
bool Msg0::getList ( int64_t hostId      , // host to ask (-1 if none)
		     int32_t ip          , // info on hostId
		     int16_t port        ,
		     int32_t maxCacheAge , // max cached age in seconds
		     bool addToCache     , // add net recv'd list to cache?
		     char rdbId          , // specifies the rdb
		     //char *coll ,
		     collnum_t collnum ,
		     RdbList *list    ,
		     //key_t startKey ,
		     //key_t endKey   ,
		     char *startKey ,
		     char *endKey   ,
		     int32_t minRecSizes ,  // use -1 for no max
		     void *state          ,
		     void (* callback)(void *state ),//, RdbList *list ) ,
		     int32_t niceness         ,
		     bool doErrorCorrection   ,
		     bool includeTree         ,
		     bool doMerge             , // NOTE(review): appears unused below
		     int32_t firstHostId      ,
		     int32_t startFileNum     ,
		     int32_t numFiles         ,
		     int32_t timeout          ,
		     int64_t syncPoint        ,
		     int32_t preferLocalReads ,
		     Msg5 *msg5               ,
		     Msg5 *msg5b              ,
		     bool isRealMerge         ,
		     //#ifdef SPLIT_INDEXDB
		     bool allowPageCache    ,
		     bool forceLocalIndexdb ,
		     bool noSplit , // doIndexdbSplit ,
		     int32_t forceParitySplit ) {
	//#else
	//	     bool allowPageCache ) {
	//#endif
	// this is obsolete! mostly, but we need it for PageIndexdb.cpp to
	// show a "termlist" for a given query term in its entirety so you
	// don't have to check each machine in the network. if this is true it
	// means to query each split and merge the results together into a
	// single unified termlist. only applies to indexdb/datedb.
	//if ( doIndexdbSplit ) { char *xx = NULL; *xx = 0; }
	// note this because if caller is wrong it hurts performance major!!
	//if ( doIndexdbSplit )
	//	logf(LOG_DEBUG,"net: doing msg0 with indexdb split true");
	// warning
	if ( collnum < 0 ) log(LOG_LOGIC,"net: NULL collection. msg0.");

	//if ( doIndexdbSplit ) { char *xx=NULL;*xx=0; }

	// reset the list they passed us so a failed/empty read does not
	// leave stale records in it
	list->reset();

	// get keySize of rdb (keys are variable width per rdb: 12/16/etc.)
	m_ks = getKeySizeFromRdbId ( rdbId );

	// if startKey > endKey, don't read anything
	//if ( startKey > endKey ) return true;
	// an inverted key range is a caller bug: crash deliberately so the
	// core dump identifies the bad caller (house style for asserts)
	if ( KEYCMP(startKey,endKey,m_ks)>0 ) { char *xx=NULL;*xx=0; }//rettrue

	// . reset hostid if it is dead
	// . this is causing UOR queries to take forever when we have a dead
	//   host, so fall back to the normal shard-selection path instead
	if ( hostId >= 0 && g_hostdb.isDead ( hostId ) ) hostId = -1;

	// no longer accept negative minrecsize
	if ( minRecSizes < 0 ) {
		g_errno = EBADENGINEER;
		log(LOG_LOGIC, "net: msg0: Negative minRecSizes no longer supported.");
		// deliberate crash: negative minRecSizes is a caller bug
		char *xx=NULL;*xx=0;
		return true;
	}

	// debug msg
	//if ( niceness != 0 ) log("HEY start");

	// ensure startKey last bit clear, endKey last bit set
	//if ( (startKey.n0 & 0x01) == 0x01 )
	//	log("Msg0::getList: warning startKey lastbit set");
	//if ( (endKey.n0 & 0x01) == 0x00 )
	//	log("Msg0::getList: warning endKey lastbit clear");

	// remember these: the async reply wrappers only get "this", so all
	// request parameters must be stashed in members before we can block
	m_state       = state;
	m_callback    = callback;
	m_list        = list;
	m_hostId      = hostId;
	m_niceness    = niceness;
	//m_ip        = ip;
	//m_port      = port;
	m_addToCache  = addToCache;
	// . these define our request 100%
	//m_startKey  = startKey;
	//m_endKey    = endKey;
	KEYSET(m_startKey,startKey,m_ks);
	KEYSET(m_endKey,endKey,m_ks);
	m_minRecSizes    = minRecSizes;
	m_rdbId          = rdbId;
	m_collnum        = collnum;// = coll;
	m_isRealMerge    = isRealMerge;
	m_allowPageCache = allowPageCache;

	// . group to ask is based on the first key
	// . we only do 1 group per call right now
	// . groupMask must turn on higher bits first (count downwards kinda)
	// . titledb and spiderdb use special masks to get groupId

	// if diffbot.cpp is reading spiderdb from each shard we have to
	// get groupid from hostid here lest we core in getGroupId() below.
	// it does that for dumping spiderdb to the client browser. they
	// can download the whole enchilada.
	if ( hostId >= 0 && m_rdbId == RDB_SPIDERDB )
		m_shardNum = 0;
	// did they force it? core until i figure out what this is
	else if ( forceParitySplit >= 0 )
		//m_groupId =  g_hostdb.getGroupId ( forceParitySplit );
		m_shardNum = forceParitySplit;
	else
		//m_groupId = getGroupId ( m_rdbId , startKey , ! noSplit );
		m_shardNum = getShardNum ( m_rdbId , startKey );

	// if we are looking up a termlist in posdb that is split by termid and
	// not the usual docid then we have to set this posdb key bit that tells
	// us that ...
	if ( noSplit && m_rdbId == RDB_POSDB )
		m_shardNum = g_hostdb.getShardNumByTermId ( startKey );

	// how is this used?
	//if ( forceLocalIndexdb ) m_groupId = g_hostdb.m_groupId;
	if ( forceLocalIndexdb ) m_shardNum = getMyShardNum();

	// . store these parameters
	// . get a handle to the rdb in case we can satisfy locally
	// . returns NULL and sets g_errno on error
	QUICKPOLL((m_niceness));
	Rdb *rdb = getRdbFromId ( m_rdbId );
	if ( ! rdb ) return true;

	// we need the fixedDataSize
	m_fixedDataSize = rdb->getFixedDataSize();
	m_useHalfKeys   = rdb->useHalfKeys();

	// . debug msg
	// . Msg2 does this when checking for a cached compound list.
	//   compound lists do not actually exist, they are merges of smaller
	//   UOR'd lists.
	if ( maxCacheAge != 0 && ! addToCache && (numFiles > 0 || includeTree))
		log(LOG_LOGIC,"net: msg0: "
		    "Weird. check but don't add... rdbid=%"INT32".",
		    (int32_t)m_rdbId);

	// set this here since we may not call msg5 if list not local
	//m_list->setFixedDataSize ( m_fixedDataSize );

	// . now that we do load balancing we don't want to do a disk lookup
	//   even if local if we are merging or dumping
	// . UNLESS g_conf.m_preferLocalReads is true
	if ( preferLocalReads == -1 )
		preferLocalReads = g_conf.m_preferLocalReads;

	// . always prefer local for full split clusterdb
	// . and keep the tfndb/titledb lookups in the same stripe
	// . so basically we can't do biased caches if fully split
	//if ( g_conf.m_fullSplit ) preferLocalReads = true;
	// NOTE(review): this unconditionally overrides both the parameter and
	// the config flag above, making the preceding two statements dead
	preferLocalReads = true;

	// is it stored locally? (only if no explicit host was requested and
	// we are in the shard that owns the key range)
	bool isLocal = ( m_hostId == -1 &&
			 //g_hostdb.m_groupId == m_groupId );
			 m_shardNum == getMyShardNum() );

	// only do local lookups if this is true
	if ( ! preferLocalReads ) isLocal = false;

	/*
	m_numSplit = 1;
	if ( g_hostdb.m_indexSplits > 1 &&
	     ( rdbId == RDB_POSDB || rdbId==RDB_DATEDB)&&
	     ! forceLocalIndexdb && doIndexdbSplit ) {
		isLocal  = false;
		//m_numSplit = INDEXDB_SPLIT;
		m_numSplit = g_hostdb.m_indexSplits;
		char *xx=NULL;*xx=0;
	}
	*/

	/*
	int64_t singleDocIdQuery = 0LL;
	if ( rdbId == RDB_POSDB ) {
		int64_t d1 = g_posdb.getDocId(m_startKey);
		int64_t d2 = g_posdb.getDocId(m_endKey);
		if ( d1+1 == d2 ) singleDocIdQuery = d1;
	}

	// . try the LOCAL termlist cache
	// . so when msg2 is evaluating a gbdocid:| query and it has to
	//   use msg0 to go across the network to get the same damn termlist
	//   over and over again for the same docid, this will help alot.
	// . ideally it'd be nice if the seo pipe in xmldoc.cpp can try to
	//   send the same gbdocid:xxxx docids to the same hosts. maybe hash
	//   based on docid into the list of hosts and if that host is busy
	//   just chain until we find someone not busy.
	if ( singleDocIdQuery &&
	     getListFromTermListCache ( coll,
					m_startKey,
					m_endKey,
					maxCacheAge,
					list ) )
		// found!
		return true;
	*/

	// but always local if only one host
	if ( g_hostdb.getNumHosts() == 1 ) isLocal = true;

	// force a msg0 if doing a docid restrictive query like
	// gbdocid:xxxx|<query> so we call cacheTermLists()
	//if ( singleDocIdQuery ) isLocal = false;

	// . if the group is local then do it locally
	// . Msg5::getList() returns false if blocked, true otherwise
	// . Msg5::getList() sets g_errno on error
	// . don't do this if m_hostId was specified
	if ( isLocal ) { // && !g_conf.m_interfaceMachine ) {
		// use the caller-supplied Msg5 if given, else allocate our
		// own and remember to delete it in reset()
		if ( msg5 ) {
			m_msg5 = msg5;
			m_deleteMsg5 = false;
		}
		else {
			try { m_msg5 = new ( Msg5 ); }
			catch ( ... ) {
				g_errno = ENOMEM;
				log("net: Local alloc for disk read failed "
				    "while tring to read data for %s. "
				    "Trying remote request.",
				    getDbnameFromId(m_rdbId));
				// fall back to the network path below
				goto skip;
			}
			mnew ( m_msg5 , sizeof(Msg5) , "Msg0" );
			m_deleteMsg5 = true;
		}
		QUICKPOLL(m_niceness);
		// same for msg5b
		if ( msg5b ) {
			m_msg5b = msg5b;
			m_deleteMsg5b = false;
		}
		/*
		else if ( m_rdbId == RDB_TITLEDB ) {
			try { m_msg5b = new ( Msg5 ); }
			catch ( ... ) {
				g_errno = ENOMEM;
				log("net: Local alloc for disk read failed "
				    "while tring to read data for %s. "
				    "Trying remote request. 2.",
				    getDbnameFromId(m_rdbId));
				goto skip;
			}
			mnew ( m_msg5b , sizeof(Msg5) , "Msg0b" );
			m_deleteMsg5b = true;
		}
		*/
		QUICKPOLL(m_niceness);
		// read from local tree+disk; returns false if it blocked and
		// gotListWrapper2 will fire later with "this" as state
		if ( ! m_msg5->getList ( rdbId,
					 m_collnum ,
					 m_list ,
					 m_startKey ,
					 m_endKey ,
					 m_minRecSizes ,
					 includeTree   , // include Tree?
					 addToCache    , // addToCache?
					 maxCacheAge   ,
					 startFileNum  ,
					 numFiles      ,
					 this ,
					 gotListWrapper2 ,
					 niceness ,
					 doErrorCorrection ,
					 NULL , // cacheKeyPtr
					 0    , // retryNum
					 -1   , // maxRetries
					 true , // compensateForMerge
					 syncPoint ,
					 NULL,//m_msg5b ,
					 m_isRealMerge ,
					 m_allowPageCache ) )
			return false;
		// nuke it
		reset();
		return true;
	}
skip:
	// debug msg
	if ( g_conf.m_logDebugQuery )
		log(LOG_DEBUG,"net: msg0: Sending request for data to "
		    "shard=%"UINT32" "
		    "listPtr=%"PTRFMT" minRecSizes=%"INT32" termId=%"UINT64" "
		    //"startKey.n1=%"XINT32",n0=%"XINT64" (niceness=%"INT32")",
		    "startKey.n1=%"XINT64",n0=%"XINT64" (niceness=%"INT32")",
		    //g_hostdb.makeHostId ( m_groupId ) ,
		    m_shardNum,
		    (PTRTYPE)m_list,
		    m_minRecSizes,
		    g_posdb.getTermId(m_startKey) ,
		    //m_startKey.n1,m_startKey.n0 , (int32_t)m_niceness);
		    KEY1(m_startKey,m_ks),KEY0(m_startKey),
		    (int32_t)m_niceness);

	char *replyBuf = NULL;
	int32_t replyBufMaxSize = 0;
	bool freeReply = true;

	// adjust niceness for net transmission
	// NOTE(review): realtime is always false here, so the pre-allocation
	// branch below is currently dead code
	bool realtime = false;
	//if ( minRecSizes + 32 < TMPBUFSIZE ) realtime = true;

	// if we're niceness 0 we need to pre-allocate for reply since it
	// might be received within the asynchronous signal handler which
	// cannot call mmalloc()
	if ( realtime ) { // niceness <= 0 || netnice == 0 ) {
		// . we should not get back more than minRecSizes bytes since
		//   we are now performing merges
		// . it should not slow things down too much since the hashing
		//   is 10 times slower than merging anyhow...
		// . CAUTION: if rdb is not fixed-datasize then this will
		//   not work for us! it can exceed m_minRecSizes.
		replyBufMaxSize = m_minRecSizes ;
		// . get a little extra to fix the error where we ask for 64
		//   but get 72
		// . where is that coming from?
		// . when getting titleRecs we often exceed the minRecSizes
		// . ?Msg8? was having trouble. was int16_t 32 bytes sometimes.
		replyBufMaxSize += 36;
		// why add ten percent?
		//replyBufMaxSize *= 110 ;
		//replyBufMaxSize /= 100 ;
		// make a buffer to hold the reply
//#ifdef SPLIT_INDEXDB
		/*
		if ( m_numSplit > 1 ) {
			m_replyBufSize = replyBufMaxSize * m_numSplit;
			replyBuf = (char *) mmalloc(m_replyBufSize, "Msg0");
			m_replyBuf  = replyBuf;
			freeReply = false;
		}
		else
		*/
//#endif
			replyBuf = (char *) mmalloc(replyBufMaxSize , "Msg0");
		// g_errno is set and we return true if it failed
		if ( ! replyBuf ) {
			log("net: Failed to pre-allocate %"INT32" bytes to hold "
			    "data read remotely from %s: %s.",
			    replyBufMaxSize,getDbnameFromId(m_rdbId),
			    mstrerror(g_errno));
			return true;
		}
	}

	// . make a request with the info above (note: not in network order)
	// . IMPORTANT!!!!! if you change this change
	//   Multicast.cpp::sleepWrapper1 too!!!!!!!!!!!!
	// no, not anymore, we commented out that request peeking code
	// . wire layout must keep the rdbId byte at RDBIDOFFSET (checked
	//   below) so the remote handler can parse it
	char *p = m_request;
	*(int64_t *) p = syncPoint        ; p += 8;
	//*(key_t  *) p = m_startKey      ; p += sizeof(key_t);
	//*(key_t  *) p = m_endKey        ; p += sizeof(key_t);
	*(int32_t *) p = m_minRecSizes    ; p += 4;
	*(int32_t *) p = startFileNum     ; p += 4;
	*(int32_t *) p = numFiles         ; p += 4;
	*(int32_t *) p = maxCacheAge      ; p += 4;
	// deliberate crash if the serialized header drifted from the
	// agreed-upon offset of the rdbId byte
	if ( p - m_request != RDBIDOFFSET ) { char *xx=NULL;*xx=0; }
	*p = m_rdbId                      ; p++;
	*p = addToCache                   ; p++;
	*p = doErrorCorrection            ; p++;
	*p = includeTree                  ; p++;
	*p = (char)niceness               ; p++;
	*p = (char)m_allowPageCache       ; p++;
	KEYSET(p,m_startKey,m_ks);        ; p+=m_ks;
	KEYSET(p,m_endKey,m_ks);          ; p+=m_ks;
	// NULL terminated collection name
	//strcpy ( p , coll ); p += gbstrlen ( coll ); *p++ = '\0';
	*(collnum_t *)p = m_collnum; p += sizeof(collnum_t);
	m_requestSize = p - m_request;

	// ask an individual host for this list if hostId is NOT -1
	if ( m_hostId != -1 ) {
		// get Host
		Host *h = g_hostdb.getHost ( m_hostId );
		if ( ! h ) {
			g_errno = EBADHOSTID;
			log(LOG_LOGIC,"net: msg0: Bad hostId of %"INT64".",
			    m_hostId);
			return true;
		}
		// if niceness is 0, use the higher priority udpServer
		UdpServer *us ;
		uint16_t port;
		QUICKPOLL(m_niceness);
		//if ( niceness <= 0 || netnice == 0 ) {
		//if ( realtime ) {
		//	us = &g_udpServer2; port = h->m_port2; }
		//else {
		us = &g_udpServer ; port = h->m_port ;
		// . returns false on error and sets g_errno, true otherwise
		// . calls callback when reply is received (or error)
		// . we return true if it returns false
		if ( ! us->sendRequest ( m_request ,
					 m_requestSize ,
					 0x00          , // msgType
					 h->m_ip       ,
					 port          ,
					 m_hostId      ,
					 NULL          , // the slotPtr
					 this          ,
					 gotSingleReplyWrapper ,
					 timeout ,
					 -1      , // backoff
					 -1      , // maxwait
					 replyBuf ,
					 replyBufMaxSize ,
					 m_niceness ) ) // cback niceness
			return true;
		// return false cuz it blocked
		return false;
	}

	// timing debug
	if ( g_conf.m_logTimingNet )
		m_startTime = gettimeofdayInMilliseconds();
	else
		m_startTime = 0;

	//if ( m_rdbId == RDB_INDEXDB ) log("Msg0:: getting remote indexlist. "
	//			"termId=%"UINT64", "
	//			"groupNum=%"UINT32"",
	//			g_indexdb.getTermId(m_startKey) ,
	//			g_hostdb.makeHostId ( m_groupId ) );

	/*
	// make the cache key so we can see what remote host cached it, if any
	char cacheKey[MAX_KEY_BYTES];
	//key_t cacheKey = makeCacheKey ( startKey     ,
	makeCacheKey ( startKey     ,
		       endKey       ,
		       includeTree  ,
		       minRecSizes  ,
		       startFileNum ,
		       numFiles     ,
		       cacheKey     ,
		       m_ks         );
	*/

	// . get the top int32_t of the key
	// . i guess this will work for 128 bit keys... hmmmmm
	// . used by multicast to pick which twin in the shard to ask first
	int32_t keyTop = hash32 ( (char *)startKey , m_ks );

	/*
	// allocate space
	if ( m_numSplit > 1 ) {
		int32_t  need = m_numSplit * sizeof(Multicast) ;
		char *buf = (char *)mmalloc ( need,"msg0mcast" );
		if ( ! buf ) return true;
		m_mcasts = (Multicast *)buf;
		for ( int32_t i = 0; i < m_numSplit ; i++ )
			m_mcasts[i].constructor();
	}
	*/

	// . otherwise, multicast to a host in group "groupId"
	// . returns false and sets g_errno on error
	// . calls callback on completion
	// . select first host to send to in group based on upper 32 bits
	//   of termId (m_startKey.n1)
//#ifdef SPLIT_INDEXDB
	// . need to send out to all the indexdb split hosts
	m_numRequests = 0;
	m_numReplies  = 0;
	//for ( int32_t i = 0; i < m_numSplit; i++ ) {
	QUICKPOLL(m_niceness);
	//int32_t gr;
	char *buf;
	/*
	if ( m_numSplit > 1 ) {
		gr  = g_indexdb.getSplitGroupId ( baseGroupId, i );
		buf = &replyBuf[i*replyBufMaxSize];
	}
	else {
	*/
	//gr  = m_groupId;
	buf = replyBuf;
	//}
	// get the multicast
	Multicast *m = &m_mcast;
	//if ( m_numSplit > 1 ) m = &m_mcasts[i];
	if ( ! m->send ( m_request    ,
//#else
//	if ( ! m_mcast.send ( m_request    ,
//#endif
			 m_requestSize,
			 0x00         , // msgType 0x00
			 false        , // does multicast own request?
			 m_shardNum   ,
//#ifdef SPLIT_INDEXDB
//			 gr           , // group + offset
//#else
//			 m_groupId    , // group to send to (groupKey)
//#endif
			 false        , // send to whole group?
			 //m_startKey.n1, // key is passed on startKey
			 keyTop       , // key is passed on startKey
			 this         , // state data
			 NULL         , // state data
			 gotMulticastReplyWrapper0 ,
			 timeout      , // timeout in seconds (was 30)
			 niceness     ,
			 realtime     ,
			 firstHostId  ,
//#ifdef SPLIT_INDEXDB
//			 &replyBuf[i*replyBufMaxSize] ,
//#else
//			 replyBuf     ,
//#endif
			 buf             ,
			 replyBufMaxSize ,
			 freeReply       , // free reply buf?
			 true            , // do disk load balancing?
			 maxCacheAge     ,
			 //(key_t *)cacheKey ,
			 // multicast uses it for determining the best
			 // host to send the request to when doing
			 // disk load balancing. if the host has our
			 // data cached, then it will probably get to
			 // handle the request. for now let's just assume
			 // this is a 96-bit key. TODO: fix...
			 0 , // *(key_t *)cacheKey ,
			 rdbId ,
			 minRecSizes ) ) {
		log("net: Failed to send request for data from %s in shard "
		    "#%"UINT32" over network: %s.",
		    getDbnameFromId(m_rdbId),m_shardNum,
		    mstrerror(g_errno));
		// no, multicast will free this when it is destroyed
		//if (replyBuf) mfree ( replyBuf , replyBufMaxSize , "Msg22" );
		// but speed it up
//#ifdef SPLIT_INDEXDB
		m_errno = g_errno;
		m->reset();
		// a prior request is still outstanding: stay "blocked"
		if ( m_numRequests > 0 )
			return false;
//#else
//		m_mcast.reset();
//#endif
		return true;
	}
//#ifdef SPLIT_INDEXDB
	m_numRequests++;
//#endif
	// we blocked
	return false;
}