// . returns false if blocked, true otherwise // . see if other pages we've indexed have this same image url bool Images::launchRequests ( ) { // loop over all images for ( long i = m_i ; i < m_numImages ; i++ ) { // advance m_i++; // assume no error m_errors[i] = 0; // make the keys. each term is a gbimage:<imageUrl> term // so we are searching for the image url to see how often // it is repeated on other pages. key144_t startKey ; key144_t endKey ; g_posdb.makeStartKey(&startKey,m_termIds[i]); g_posdb.makeEndKey (&endKey ,m_termIds[i]); // get our residing groupid //unsigned long gid = g_indexdb.getNoSplitGroupId(&startKey); // no split is true for this one, so we do not split by docid //uint32_t gid = getGroupId(RDB_INDEXDB,&startKey,false); unsigned long shardNum; shardNum = getShardNum(RDB_POSDB,&startKey); // get the termlist if ( ! m_msg0.getList ( -1 , // hostid -1 , // ip -1 , // port 0 , // maxAge false , // addToCache? RDB_POSDB, m_collnum , &m_list , // RdbList ptr (char *)&startKey , (char *)&endKey , 1024 , // minRecSize this , gotTermListWrapper , MAX_NICENESS , false , // err correction? true , // inc tree? true , // domergeobsolete -1 , // firstHostId 0 , // start filenum -1 , // numFiles 30 , // timeout -1 , // syncpoint -1 , // preferlocalreads NULL , // msg5 NULL , // msg5b false , // isRealMerge? true , // allow pg cache false , // focelocalindexdb false , // doIndexdbSplit? shardNum ))// force paritysplit return false; // process the msg36 response gotTermList (); } // i guess we didn't block return downloadImages(); }
/* bool Monitordb::addColl ( char *coll, bool doVerify ) { if ( ! m_rdb.addColl ( coll ) ) return false; if ( ! doVerify ) return true; // verify if ( verify(coll) ) return true; // if not allowing scale, return false if ( ! g_conf.m_allowScale ) return false; // otherwise let it go log ( "db: Verify failed, but scaling is allowed, passing." ); return true; } */ bool Monitordb::verify ( char *coll ) { log ( LOG_INFO, "db: Verifying Monitordb for coll %s...", coll ); g_threads.disableThreads(); Msg5 msg5; Msg5 msg5b; RdbList list; key224_t startKey; key224_t endKey; startKey.setMin(); endKey.setMax(); long minRecSizes = 64000; CollectionRec *cr = g_collectiondb.getRec(coll); if ( ! msg5.getList ( RDB_MONITORDB , cr->m_collnum, &list , (char*)&startKey , (char*)&endKey , minRecSizes , true , // includeTree , false , // add to cache? 0 , // max cache age 0 , // startFileNum , -1 , // numFiles , NULL , // state NULL , // callback 0 , // niceness false , // err correction? NULL , 0 , -1 , true , -1LL , &msg5b , true )) { g_threads.enableThreads(); return log("db: HEY! it did not block"); } long count = 0; long got = 0; for ( list.resetListPtr() ; ! list.isExhausted() ; list.skipCurrentRecord() ) { key224_t k; list.getCurrentKey((char*)&k); count++; uint32_t shardNum = getShardNum ( RDB_MONITORDB , &k ); if ( shardNum == getMyShardNum() ) got++; } if ( got != count ) { log ("db: Out of first %li records in Monitordb , " "only %li belong to our group.",count,got); /* // repeat with log for ( list.resetListPtr() ; ! 
list.isExhausted() ; list.skipCurrentRecord() ) { key224_t k; list.getCurrentKey((char*)&k); uint32_t shardNum = getShardNum ( RDB_MONITORDB , &k ); long groupNum = g_hostdb.getGroupNum(groupId); unsigned long sh32 ; sh32 = g_monitordb.getLinkeeSiteHash32_uk(&k); uint16_t sh16 = sh32 >> 19; log("db: sh16=0x%lx group=%li", (long)sh16,groupNum); } */ // exit if NONE, we probably got the wrong data if ( got == 0 ) log("db: Are you sure you have the " "right " "data in the right directory? " "Exiting."); log ( "db: Exiting due to inconsistency."); g_threads.enableThreads(); return g_conf.m_bypassValidation; } log ( LOG_INFO, "db: Monitordb passed verification successfully for " "%li recs.", count ); // DONE g_threads.enableThreads(); return true; }
/* bool Titledb::addColl ( char *coll, bool doVerify ) { if ( ! m_rdb.addColl ( coll ) ) return false; if ( ! doVerify ) return true; // verify if ( verify(coll) ) return true; // if not allowing scale, return false if ( ! g_conf.m_allowScale ) return false; // otherwise let it go log ( "db: Verify failed, but scaling is allowed, passing." ); return true; } */ bool Titledb::verify(const char *coll) { log ( LOG_DEBUG, "db: Verifying Titledb for coll %s...", coll ); Msg5 msg5; RdbList list; key96_t startKey; key96_t endKey; startKey.setMin(); endKey.setMax(); //int32_t minRecSizes = 64000; const CollectionRec *cr = g_collectiondb.getRec(coll); if ( ! msg5.getList ( RDB_TITLEDB , cr->m_collnum , &list , startKey , endKey , 1024*1024 , // minRecSizes , true , // includeTree , 0 , // max cache age 0 , // startFileNum , -1 , // numFiles , NULL , // state NULL , // callback 0 , // niceness false , // err correction? NULL , // cache key ptr 0 , // retry num -1 , // maxRetries -1LL , // sync point false , // isRealMerge true)) // allowPageCache { log(LOG_DEBUG, "db: HEY! it did not block"); return false; } int32_t count = 0; int32_t got = 0; for ( list.resetListPtr() ; ! list.isExhausted() ; list.skipCurrentRecord() ) { key96_t k = list.getCurrentKey(); // skip negative keys if ( (k.n0 & 0x01) == 0x00 ) continue; count++; //uint32_t groupId = getGroupId ( RDB_TITLEDB , &k ); //if ( groupId == g_hostdb.m_groupId ) got++; uint32_t shardNum = getShardNum ( RDB_TITLEDB, &k ); if ( shardNum == getMyShardNum() ) got++; } if ( got != count ) { // tally it up g_rebalance.m_numForeignRecs += count - got; log ("db: Out of first %" PRId32" records in titledb, " "only %" PRId32" belong to our shard. c=%s",count,got,coll); // exit if NONE, we probably got the wrong data if ( count > 10 && got == 0 ) log("db: Are you sure you have the right " "data in the right directory? " "coll=%s " "Exiting.", coll); // repeat with log for ( list.resetListPtr() ; ! 
list.isExhausted() ; list.skipCurrentRecord() ) { key96_t k = list.getCurrentKey(); //uint32_t groupId = getGroupId ( RDB_TITLEDB,&k); //int32_t groupNum = g_hostdb.getGroupNum(groupId); int32_t shardNum = getShardNum ( RDB_TITLEDB, &k ); log("db: docid=%" PRId64" shard=%" PRId32, getDocId(&k),shardNum); } //if ( g_conf.m_bypassValidation ) return true; //if ( g_conf.m_allowScale ) return true; // don't exit any more, allow it, but do not delete // recs that belong to different shards when we merge now! log ( "db: db shards unbalanced. " "Click autoscale in master controls."); //return false; return true; } log ( LOG_DEBUG, "db: Titledb passed verification successfully for %" PRId32 " recs.", count ); // DONE return true; }
// . THIS Msg0 class must be alloc'd, i.e. not on the stack, etc.
// . if list is stored locally this tries to get it locally
// . otherwise tries to get the list from the network
// . returns false if blocked, true otherwise
// . sets g_errno on error
// . NOTE: i was having problems with queries being cached too long, you
//   see the cache here is a NETWORK cache, so when the machines that owns
//   the list updates it on disk it can't flush our cache... so use a small
//   maxCacheAge of like , 30 seconds or so...
bool Msg0::getList ( int64_t hostId      , // host to ask (-1 if none)
		     int32_t ip          , // info on hostId
		     int16_t port        ,
		     int32_t maxCacheAge , // max cached age in seconds
		     bool    addToCache  , // add net recv'd list to cache?
		     char    rdbId       , // specifies the rdb
		     collnum_t collnum   ,
		     RdbList *list       ,
		     const char *startKey ,
		     const char *endKey   ,
		     int32_t minRecSizes , // use -1 for no max
		     void   *state       ,
		     void  (* callback)(void *state ),//, RdbList *list ) ,
		     int32_t niceness    ,
		     bool    doErrorCorrection ,
		     bool    includeTree ,
		     bool    doMerge     ,
		     int32_t firstHostId ,
		     int32_t startFileNum ,
		     int32_t numFiles    ,
		     int64_t timeout     ,
		     int64_t syncPoint   ,
		     int32_t preferLocalReads ,
		     Msg5   *msg5        ,
		     bool    isRealMerge ,
		     bool    allowPageCache ,
		     bool    forceLocalIndexdb ,
		     bool    noSplit     ,
		     int32_t forceParitySplit ) {

	logTrace( g_conf.m_logTraceMsg0, "BEGIN. hostId: %" PRId64", rdbId: %d", hostId, (int)rdbId );

	// warning
	if ( collnum < 0 ) log(LOG_LOGIC,"net: NULL collection. msg0.");

	// reset the list they passed us
	list->reset();

	// get keySize of rdb
	m_ks = getKeySizeFromRdbId ( rdbId );

	// if( g_conf.m_logTraceMsg0 )
	// {
	//	log("%s:%s:%d: rdbId. [%d]", __FILE__,__func__,__LINE__, (int)rdbId);
	//	log("%s:%s:%d: m_ks.. [%d]", __FILE__,__func__,__LINE__, (int)m_ks);
	//	log("%s:%s:%d: hostId [%" PRId64"]", __FILE__,__func__,__LINE__, hostId);
	// }

	// if startKey > endKey, don't read anything
	//if ( startKey > endKey ) return true;
	// deliberate crash sentinel: an inverted key range is a caller bug
	if ( KEYCMP(startKey,endKey,m_ks)>0 ) { char *xx=NULL;*xx=0; }//rettrue

	// . reset hostid if it is dead
	// . this is causing UOR queries to take forever when we have a dead
	if ( hostId >= 0 && g_hostdb.isDead ( hostId ) ) hostId = -1;

	// no longer accept negative minrecsize
	// NOTE(review): this path crashes on purpose (sentinel write below);
	// the g_errno set here is never actually returned to a caller
	if ( minRecSizes < 0 ) {
		g_errno = EBADENGINEER;
		logTrace( g_conf.m_logTraceMsg0, "END" );
		log(LOG_LOGIC, "net: msg0: Negative minRecSizes no longer supported.");
		char *xx=NULL;*xx=0;
	}

	// remember these
	m_state      = state;
	m_callback   = callback;
	m_list       = list;
	m_hostId     = hostId;
	m_niceness   = niceness;
	m_addToCache = addToCache;
	// . these define our request 100%
	KEYSET(m_startKey,startKey,m_ks);
	KEYSET(m_endKey,endKey,m_ks);
	m_minRecSizes    = minRecSizes;
	m_rdbId          = rdbId;
	m_collnum        = collnum;// = coll;
	m_isRealMerge    = isRealMerge;
	m_allowPageCache = allowPageCache;

	// . group to ask is based on the first key
	// . we only do 1 group per call right now
	// . groupMask must turn on higher bits first (count downwards kinda)
	// . titledb and spiderdb use special masks to get groupId
	// if diffbot.cpp is reading spiderdb from each shard we have to
	// get groupid from hostid here lest we core in getGroupId() below.
	// it does that for dumping spiderdb to the client browser. they
	// can download the whole enchilada.
	if ( hostId >= 0 && m_rdbId == RDB_SPIDERDB )
		m_shardNum = 0;
	// did they force it? core until i figure out what this is
	else if ( forceParitySplit >= 0 )
		//m_groupId =  g_hostdb.getGroupId ( forceParitySplit );
		m_shardNum = forceParitySplit;
	else
		//m_groupId = getGroupId ( m_rdbId , startKey , ! noSplit );
		m_shardNum = getShardNum ( m_rdbId , startKey );

	// if we are looking up a termlist in posdb that is split by termid and
	// not the usual docid then we have to set this posdb key bit that tells
	// us that ...
	if ( noSplit && m_rdbId == RDB_POSDB )
		m_shardNum = g_hostdb.getShardNumByTermId ( startKey );

	// how is this used?
	if ( forceLocalIndexdb ) m_shardNum = getMyShardNum();

	// if( g_conf.m_logTraceMsg0 )
	//	log("%s:%s:%d: shardNum [%" PRId32"]", __FILE__,__func__, __LINE__, m_shardNum);

	// . store these parameters
	// . get a handle to the rdb in case we can satisfy locally
	// . returns NULL and sets g_errno on error
	QUICKPOLL((m_niceness));
	Rdb *rdb = getRdbFromId ( m_rdbId );
	if ( ! rdb ) return true;

	// we need the fixedDataSize
	m_fixedDataSize = rdb->getFixedDataSize();
	m_useHalfKeys   = rdb->useHalfKeys();

	// . debug msg
	// . Msg2 does this when checking for a cached compound list.
	//   compound lists do not actually exist, they are merges of smaller
	//   UOR'd lists.
	if ( maxCacheAge != 0 && ! addToCache && (numFiles > 0 || includeTree)) {
		log( LOG_LOGIC, "net: msg0: Weird. check but don't add... rdbid=%" PRId32".", ( int32_t ) m_rdbId );
	}

	// set this here since we may not call msg5 if list not local
	//m_list->setFixedDataSize ( m_fixedDataSize );

	// . now that we do load balancing we don't want to do a disk lookup
	//   even if local if we are merging or dumping
	// . UNLESS g_conf.m_preferLocalReads is true
	if ( preferLocalReads == -1 )
		preferLocalReads = g_conf.m_preferLocalReads;

	// . always prefer local for full split clusterdb
	// . and keep the tfndb/titledb lookups in the same stripe
	// . so basically we can't do biased caches if fully split
	//if ( g_conf.m_fullSplit ) preferLocalReads = true;
	// NOTE(review): unconditional override — the config / -1 logic just
	// above is effectively dead; local reads are always preferred
	preferLocalReads = true;

	// it it stored locally?
	bool isLocal = ( m_hostId == -1 &&
			 //g_hostdb.m_groupId == m_groupId );
			 m_shardNum == getMyShardNum() );

	// only do local lookups if this is true
	if ( ! preferLocalReads ) isLocal = false;

	/*
	int64_t singleDocIdQuery = 0LL;
	if ( rdbId == RDB_POSDB ) {
		int64_t d1 = g_posdb.getDocId(m_startKey);
		int64_t d2 = g_posdb.getDocId(m_endKey);
		if ( d1+1 == d2 ) singleDocIdQuery = d1;
	}

	// . try the LOCAL termlist cache
	// . so when msg2 is evaluating a gbdocid:| query and it has to
	//   use msg0 to go across the network to get the same damn termlist
	//   over and over again for the same docid, this will help alot.
	// . ideally it'd be nice if the seo pipe in xmldoc.cpp can try to
	//   send the same gbdocid:xxxx docids to the same hosts. maybe hash
	//   based on docid into the list of hosts and if that host is busy
	//   just chain until we find someone not busy.
	if ( singleDocIdQuery &&
	     getListFromTermListCache ( coll,
					m_startKey,
					m_endKey,
					maxCacheAge,
					list ) )
		// found!
		return true;
	*/

	// but always local if only one host
	if ( g_hostdb.getNumHosts() == 1 ) isLocal = true;

	// . if the group is local then do it locally
	// . Msg5::getList() returns false if blocked, true otherwise
	// . Msg5::getList() sets g_errno on error
	// . don't do this if m_hostId was specified
	if ( isLocal ) {
		logTrace( g_conf.m_logTraceMsg0, "isLocal" );
		// caller may supply a Msg5 to reuse; otherwise we own one
		if ( msg5 ) {
			m_msg5 = msg5;
			m_deleteMsg5 = false;
		}
		else {
			try { m_msg5 = new ( Msg5 ); }
			catch ( ... ) {
				// allocation failed; fall through to the
				// network path rather than erroring out
				g_errno = ENOMEM;
				log("net: Local alloc for disk read failed "
				    "while tring to read data for %s. "
				    "Trying remote request.",
				    getDbnameFromId(m_rdbId));
				goto skip;
			}
			mnew ( m_msg5 , sizeof(Msg5) , "Msg0::Msg5" );
			m_deleteMsg5 = true;
		}
		QUICKPOLL(m_niceness);
		if ( ! m_msg5->getList ( rdbId,
					 m_collnum ,
					 m_list ,
					 m_startKey ,
					 m_endKey ,
					 m_minRecSizes ,
					 includeTree , // include Tree?
					 addToCache , // addToCache?
					 maxCacheAge ,
					 startFileNum ,
					 numFiles ,
					 this ,
					 gotListWrapper2 ,
					 niceness ,
					 doErrorCorrection ,
					 NULL , // cacheKeyPtr
					 0 , // retryNum
					 -1 , // maxRetries
					 true , // compensateForMerge
					 syncPoint ,
					 m_isRealMerge ,
					 m_allowPageCache ) ) {
			logTrace( g_conf.m_logTraceMsg0, "END, return false" );
			return false;
		}
		// nuke it
		reset();
		logTrace( g_conf.m_logTraceMsg0, "END, return true" );
		return true;
	}
skip:
	// debug msg
	if ( g_conf.m_logDebugQuery )
		log(LOG_DEBUG,"net: msg0: Sending request for data to "
		    "shard=%" PRIu32" "
		    "listPtr=%" PTRFMT" minRecSizes=%" PRId32" termId=%" PRIu64" "
		    //"startKey.n1=%" PRIx32",n0=%" PRIx64" (niceness=%" PRId32")",
		    "startKey.n1=%" PRIx64",n0=%" PRIx64" (niceness=%" PRId32")",
		    //g_hostdb.makeHostId ( m_groupId ) ,
		    m_shardNum,
		    (PTRTYPE)m_list,
		    m_minRecSizes,
		    g_posdb.getTermId(m_startKey) ,
		    //m_startKey.n1,m_startKey.n0 , (int32_t)m_niceness);
		    KEY1(m_startKey,m_ks),KEY0(m_startKey),
		    (int32_t)m_niceness);

	char *replyBuf = NULL;
	int32_t replyBufMaxSize = 0;
	bool freeReply = true;

	// . make a request with the info above (note: not in network order)
	// . IMPORTANT!!!!! if you change this change
	//   Multicast.cpp::sleepWrapper1 too!!!!!!!!!!!!
	//   no, not anymore, we commented out that request peeking code
	// layout must match the handler's parse; offsets are fixed
	char *p = m_request;
	*(int64_t *) p = syncPoint     ; p += 8;
	//*(key_t   *) p = m_startKey    ; p += sizeof(key_t);
	//*(key_t   *) p = m_endKey      ; p += sizeof(key_t);
	*(int32_t  *) p = m_minRecSizes ; p += 4;
	*(int32_t  *) p = startFileNum  ; p += 4;
	*(int32_t  *) p = numFiles      ; p += 4;
	*(int32_t  *) p = maxCacheAge   ; p += 4;
	// sanity: rdbId byte must land exactly at RDBIDOFFSET
	if ( p - m_request != RDBIDOFFSET ) { char *xx=NULL;*xx=0; }
	*p = m_rdbId            ; p++;
	*p = addToCache         ; p++;
	*p = doErrorCorrection  ; p++;
	*p = includeTree        ; p++;
	*p = (char)niceness     ; p++;
	*p = (char)m_allowPageCache; p++;
	KEYSET(p,m_startKey,m_ks); ; p+=m_ks;
	KEYSET(p,m_endKey,m_ks);   ; p+=m_ks;
	// NULL terminated collection name
	//strcpy ( p , coll ); p += gbstrlen ( coll ); *p++ = '\0';
	*(collnum_t *)p = m_collnum; p += sizeof(collnum_t);
	m_requestSize = p - m_request;

	// ask an individual host for this list if hostId is NOT -1
	if ( m_hostId != -1 ) {
		// get Host
		Host *h = g_hostdb.getHost ( m_hostId );
		if ( ! h ) {
			g_errno = EBADHOSTID;
			log(LOG_LOGIC,"net: msg0: Bad hostId of %" PRId64".",
			    m_hostId);
			logTrace( g_conf.m_logTraceMsg0, "END, return true. Bad hostId" );
			return true;
		}
		// if niceness is 0, use the higher priority udpServer
		UdpServer *us ;
		uint16_t   port;
		QUICKPOLL(m_niceness);
		us   = &g_udpServer ;
		port = h->m_port ;
		// . returns false on error and sets g_errno, true otherwise
		// . calls callback when reply is received (or error)
		// . we return true if it returns false
		if ( ! us->sendRequest ( m_request ,
					 m_requestSize ,
					 0x00 , // msgType
					 h->m_ip ,
					 port ,
					 m_hostId ,
					 NULL , // the slotPtr
					 this ,
					 gotSingleReplyWrapper ,
					 timeout ,
					 -1 , // backoff
					 -1 , // maxwait
					 replyBuf ,
					 replyBufMaxSize ,
					 m_niceness ) ) { // cback niceness
			logTrace( g_conf.m_logTraceMsg0, "END, return true. Request sent" );
			return true;
		}
		// return false cuz it blocked
		logTrace( g_conf.m_logTraceMsg0, "END, return false. sendRequest blocked" );
		return false;
	}
	// timing debug
	if ( g_conf.m_logTimingNet )
		m_startTime = gettimeofdayInMilliseconds();
	else
		m_startTime = 0;

	// . get the top int32_t of the key
	// . i guess this will work for 128 bit keys... hmmmmm
	int32_t keyTop = hash32 ( (char *)startKey , m_ks );

	// . otherwise, multicast to a host in group "groupId"
	// . returns false and sets g_errno on error
	// . calls callback on completion
	// . select first host to send to in group based on upper 32 bits
	//   of termId (m_startKey.n1)
	// . need to send out to all the indexdb split hosts
	m_numRequests = 0;
	m_numReplies  = 0;
	//for ( int32_t i = 0; i < m_numSplit; i++ ) {
	QUICKPOLL(m_niceness);
	//int32_t gr;
	char *buf;
	buf = replyBuf;
	// get the multicast
	Multicast *m = &m_mcast;
	if ( ! m->send ( m_request ,
			 m_requestSize,
			 0x00 , // msgType 0x00
			 false , // does multicast own request?
			 m_shardNum ,
			 false , // send to whole group?
			 //m_startKey.n1, // key is passed on startKey
			 keyTop , // key is passed on startKey
			 this , // state data
			 NULL , // state data
			 gotMulticastReplyWrapper0 ,
			 timeout*1000 , // timeout
			 niceness ,
			 firstHostId ,
			 buf ,
			 replyBufMaxSize ,
			 freeReply , // free reply buf?
			 true , // do disk load balancing?
			 maxCacheAge ,
			 //(key_t *)cacheKey ,
			 // multicast uses it for determining the best
			 // host to send the request to when doing
			 // disk load balancing. if the host has our
			 // data cached, then it will probably get to
			 // handle the request. for now let's just assume
			 // this is a 96-bit key. TODO: fix...
			 0 , // *(key_t *)cacheKey ,
			 rdbId ,
			 minRecSizes ) ) {
		log(LOG_ERROR,
		    "net: Failed to send request for data from %s in shard "
		    "#%" PRIu32" over network: %s.",
		    getDbnameFromId(m_rdbId),m_shardNum,
		    mstrerror(g_errno));
		// but speed it up
		m_errno = g_errno;
		m->reset();
		if ( m_numRequests > 0 ) {
			logTrace( g_conf.m_logTraceMsg0, "END - returning false" );
			return false;
		}
		logTrace( g_conf.m_logTraceMsg0, "END - returning true" );
		return true;
	}
	m_numRequests++;
	// we blocked
	logTrace( g_conf.m_logTraceMsg0, "END - returning false, blocked" );
	return false;
}
bool Clusterdb::verify ( char *coll ) { log ( LOG_DEBUG, "db: Verifying Clusterdb for coll %s...", coll ); g_jobScheduler.disallow_new_jobs(); Msg5 msg5; RdbList list; key_t startKey; key_t endKey; startKey.setMin(); endKey.setMax(); //int32_t minRecSizes = 64000; CollectionRec *cr = g_collectiondb.getRec(coll); if ( ! msg5.getList ( RDB_CLUSTERDB , cr->m_collnum , &list , startKey , endKey , 64000 , // minRecSizes , true , // includeTree , false , // add to cache? 0 , // max cache age 0 , // startFileNum , -1 , // numFiles , NULL , // state NULL , // callback 0 , // niceness false , // err correction? NULL , 0 , -1 , true , -1LL , true )) { g_jobScheduler.allow_new_jobs(); log("db: HEY! it did not block"); return false; } int32_t count = 0; int32_t got = 0; for ( list.resetListPtr() ; ! list.isExhausted() ; list.skipCurrentRecord() ) { key_t k = list.getCurrentKey(); // skip negative keys if ( (k.n0 & 0x01) == 0x00 ) continue; count++; //uint32_t groupId = getGroupId ( RDB_CLUSTERDB , &k ); //if ( groupId == g_hostdb.m_groupId ) got++; uint32_t shardNum = getShardNum( RDB_CLUSTERDB , &k ); if ( shardNum == getMyShardNum() ) got++; } if ( got != count ) { // tally it up g_rebalance.m_numForeignRecs += count - got; log ("db: Out of first %" PRId32" records in clusterdb, " "only %" PRId32" belong to our group.",count,got); // exit if NONE, we probably got the wrong data if ( got == 0 ) log("db: Are you sure you have the " "right " "data in the right directory? " "Exiting."); log ( "db: Exiting due to Clusterdb inconsistency." ); g_jobScheduler.allow_new_jobs(); return g_conf.m_bypassValidation; } log ( LOG_DEBUG, "db: Clusterdb passed verification successfully for " "%" PRId32" recs.", count ); // DONE g_jobScheduler.allow_new_jobs(); return true; }
// . THIS Msg0 class must be alloc'd, i.e. not on the stack, etc. // . if list is stored locally this tries to get it locally // . otherwise tries to get the list from the network // . returns false if blocked, true otherwise // . sets g_errno on error // . NOTE: i was having problems with queries being cached too long, you // see the cache here is a NETWORK cache, so when the machines that owns // the list updates it on disk it can't flush our cache... so use a small // maxCacheAge of like , 30 seconds or so... bool Msg0::getList ( int64_t hostId , // host to ask (-1 if none) int32_t ip , // info on hostId int16_t port , int32_t maxCacheAge , // max cached age in seconds bool addToCache , // add net recv'd list to cache? char rdbId , // specifies the rdb //char *coll , collnum_t collnum , RdbList *list , //key_t startKey , //key_t endKey , char *startKey , char *endKey , int32_t minRecSizes , // use -1 for no max void *state , void (* callback)(void *state ),//, RdbList *list ) , int32_t niceness , bool doErrorCorrection , bool includeTree , bool doMerge , int32_t firstHostId , int32_t startFileNum , int32_t numFiles , int32_t timeout , int64_t syncPoint , int32_t preferLocalReads , Msg5 *msg5 , Msg5 *msg5b , bool isRealMerge , //#ifdef SPLIT_INDEXDB bool allowPageCache , bool forceLocalIndexdb , bool noSplit , // doIndexdbSplit , int32_t forceParitySplit ) { //#else // bool allowPageCache ) { //#endif // this is obsolete! mostly, but we need it for PageIndexdb.cpp to // show a "termlist" for a given query term in its entirety so you // don't have to check each machine in the network. if this is true it // means to query each split and merge the results together into a // single unified termlist. only applies to indexdb/datedb. //if ( doIndexdbSplit ) { char *xx = NULL; *xx = 0; } // note this because if caller is wrong it hurts performance major!! 
//if ( doIndexdbSplit ) // logf(LOG_DEBUG,"net: doing msg0 with indexdb split true"); // warning if ( collnum < 0 ) log(LOG_LOGIC,"net: NULL collection. msg0."); //if ( doIndexdbSplit ) { char *xx=NULL;*xx=0; } // reset the list they passed us list->reset(); // get keySize of rdb m_ks = getKeySizeFromRdbId ( rdbId ); // if startKey > endKey, don't read anything //if ( startKey > endKey ) return true; if ( KEYCMP(startKey,endKey,m_ks)>0 ) { char *xx=NULL;*xx=0; }//rettrue // . reset hostid if it is dead // . this is causing UOR queries to take forever when we have a dead if ( hostId >= 0 && g_hostdb.isDead ( hostId ) ) hostId = -1; // no longer accept negative minrecsize if ( minRecSizes < 0 ) { g_errno = EBADENGINEER; log(LOG_LOGIC, "net: msg0: Negative minRecSizes no longer supported."); char *xx=NULL;*xx=0; return true; } // debug msg //if ( niceness != 0 ) log("HEY start"); // ensure startKey last bit clear, endKey last bit set //if ( (startKey.n0 & 0x01) == 0x01 ) // log("Msg0::getList: warning startKey lastbit set"); //if ( (endKey.n0 & 0x01) == 0x00 ) // log("Msg0::getList: warning endKey lastbit clear"); // remember these m_state = state; m_callback = callback; m_list = list; m_hostId = hostId; m_niceness = niceness; //m_ip = ip; //m_port = port; m_addToCache = addToCache; // . these define our request 100% //m_startKey = startKey; //m_endKey = endKey; KEYSET(m_startKey,startKey,m_ks); KEYSET(m_endKey,endKey,m_ks); m_minRecSizes = minRecSizes; m_rdbId = rdbId; m_collnum = collnum;// = coll; m_isRealMerge = isRealMerge; m_allowPageCache = allowPageCache; // . group to ask is based on the first key // . we only do 1 group per call right now // . groupMask must turn on higher bits first (count downwards kinda) // . titledb and spiderdb use special masks to get groupId // if diffbot.cpp is reading spiderdb from each shard we have to // get groupid from hostid here lest we core in getGroupId() below. // it does that for dumping spiderdb to the client browser. 
they // can download the whole enchilada. if ( hostId >= 0 && m_rdbId == RDB_SPIDERDB ) m_shardNum = 0; // did they force it? core until i figure out what this is else if ( forceParitySplit >= 0 ) //m_groupId = g_hostdb.getGroupId ( forceParitySplit ); m_shardNum = forceParitySplit; else //m_groupId = getGroupId ( m_rdbId , startKey , ! noSplit ); m_shardNum = getShardNum ( m_rdbId , startKey ); // if we are looking up a termlist in posdb that is split by termid and // not the usual docid then we have to set this posdb key bit that tells // us that ... if ( noSplit && m_rdbId == RDB_POSDB ) m_shardNum = g_hostdb.getShardNumByTermId ( startKey ); // how is this used? //if ( forceLocalIndexdb ) m_groupId = g_hostdb.m_groupId; if ( forceLocalIndexdb ) m_shardNum = getMyShardNum(); // . store these parameters // . get a handle to the rdb in case we can satisfy locally // . returns NULL and sets g_errno on error QUICKPOLL((m_niceness)); Rdb *rdb = getRdbFromId ( m_rdbId ); if ( ! rdb ) return true; // we need the fixedDataSize m_fixedDataSize = rdb->getFixedDataSize(); m_useHalfKeys = rdb->useHalfKeys(); // . debug msg // . Msg2 does this when checking for a cached compound list. // compound lists do not actually exist, they are merges of smaller // UOR'd lists. if ( maxCacheAge != 0 && ! addToCache && (numFiles > 0 || includeTree)) log(LOG_LOGIC,"net: msg0: " "Weird. check but don't add... rdbid=%"INT32".",(int32_t)m_rdbId); // set this here since we may not call msg5 if list not local //m_list->setFixedDataSize ( m_fixedDataSize ); // . now that we do load balancing we don't want to do a disk lookup // even if local if we are merging or dumping // . UNLESS g_conf.m_preferLocalReads is true if ( preferLocalReads == -1 ) preferLocalReads = g_conf.m_preferLocalReads; // . always prefer local for full split clusterdb // . and keep the tfndb/titledb lookups in the same stripe // . 
so basically we can't do biased caches if fully split //if ( g_conf.m_fullSplit ) preferLocalReads = true; preferLocalReads = true; // it it stored locally? bool isLocal = ( m_hostId == -1 && //g_hostdb.m_groupId == m_groupId ); m_shardNum == getMyShardNum() ); // only do local lookups if this is true if ( ! preferLocalReads ) isLocal = false; /* m_numSplit = 1; if ( g_hostdb.m_indexSplits > 1 && ( rdbId == RDB_POSDB || rdbId==RDB_DATEDB)&& ! forceLocalIndexdb && doIndexdbSplit ) { isLocal = false; //m_numSplit = INDEXDB_SPLIT; m_numSplit = g_hostdb.m_indexSplits; char *xx=NULL;*xx=0; } */ /* int64_t singleDocIdQuery = 0LL; if ( rdbId == RDB_POSDB ) { int64_t d1 = g_posdb.getDocId(m_startKey); int64_t d2 = g_posdb.getDocId(m_endKey); if ( d1+1 == d2 ) singleDocIdQuery = d1; } // . try the LOCAL termlist cache // . so when msg2 is evaluating a gbdocid:| query and it has to // use msg0 to go across the network to get the same damn termlist // over and over again for the same docid, this will help alot. // . ideally it'd be nice if the seo pipe in xmldoc.cpp can try to // send the same gbdocid:xxxx docids to the same hosts. maybe hash // based on docid into the list of hosts and if that host is busy // just chain until we find someone not busy. if ( singleDocIdQuery && getListFromTermListCache ( coll, m_startKey, m_endKey, maxCacheAge, list ) ) // found! return true; */ // but always local if only one host if ( g_hostdb.getNumHosts() == 1 ) isLocal = true; // force a msg0 if doing a docid restrictive query like // gbdocid:xxxx|<query> so we call cacheTermLists() //if ( singleDocIdQuery ) isLocal = false; // . if the group is local then do it locally // . Msg5::getList() returns false if blocked, true otherwise // . Msg5::getList() sets g_errno on error // . don't do this if m_hostId was specified if ( isLocal ) { // && !g_conf.m_interfaceMachine ) { if ( msg5 ) { m_msg5 = msg5; m_deleteMsg5 = false; } else { try { m_msg5 = new ( Msg5 ); } catch ( ... 
) { g_errno = ENOMEM; log("net: Local alloc for disk read failed " "while tring to read data for %s. " "Trying remote request.", getDbnameFromId(m_rdbId)); goto skip; } mnew ( m_msg5 , sizeof(Msg5) , "Msg0" ); m_deleteMsg5 = true; } QUICKPOLL(m_niceness); // same for msg5b if ( msg5b ) { m_msg5b = msg5b; m_deleteMsg5b = false; } /* else if ( m_rdbId == RDB_TITLEDB ) { try { m_msg5b = new ( Msg5 ); } catch ( ... ) { g_errno = ENOMEM; log("net: Local alloc for disk read failed " "while tring to read data for %s. " "Trying remote request. 2.", getDbnameFromId(m_rdbId)); goto skip; } mnew ( m_msg5b , sizeof(Msg5) , "Msg0b" ); m_deleteMsg5b = true; } */ QUICKPOLL(m_niceness); if ( ! m_msg5->getList ( rdbId, m_collnum , m_list , m_startKey , m_endKey , m_minRecSizes , includeTree , // include Tree? addToCache , // addToCache? maxCacheAge , startFileNum , numFiles , this , gotListWrapper2 , niceness , doErrorCorrection , NULL , // cacheKeyPtr 0 , // retryNum -1 , // maxRetries true , // compensateForMerge syncPoint , NULL,//m_msg5b , m_isRealMerge , m_allowPageCache ) ) return false; // nuke it reset(); return true; } skip: // debug msg if ( g_conf.m_logDebugQuery ) log(LOG_DEBUG,"net: msg0: Sending request for data to " "shard=%"UINT32" " "listPtr=%"PTRFMT" minRecSizes=%"INT32" termId=%"UINT64" " //"startKey.n1=%"XINT32",n0=%"XINT64" (niceness=%"INT32")", "startKey.n1=%"XINT64",n0=%"XINT64" (niceness=%"INT32")", //g_hostdb.makeHostId ( m_groupId ) , m_shardNum, (PTRTYPE)m_list, m_minRecSizes, g_posdb.getTermId(m_startKey) , //m_startKey.n1,m_startKey.n0 , (int32_t)m_niceness); KEY1(m_startKey,m_ks),KEY0(m_startKey), (int32_t)m_niceness); char *replyBuf = NULL; int32_t replyBufMaxSize = 0; bool freeReply = true; // adjust niceness for net transmission bool realtime = false; //if ( minRecSizes + 32 < TMPBUFSIZE ) realtime = true; // if we're niceness 0 we need to pre-allocate for reply since it // might be received within the asynchronous signal handler which // cannot 
call mmalloc() if ( realtime ) { // niceness <= 0 || netnice == 0 ) { // . we should not get back more than minRecSizes bytes since // we are now performing merges // . it should not slow things down too much since the hashing // is 10 times slower than merging anyhow... // . CAUTION: if rdb is not fixed-datasize then this will // not work for us! it can exceed m_minRecSizes. replyBufMaxSize = m_minRecSizes ; // . get a little extra to fix the error where we ask for 64 // but get 72 // . where is that coming from? // . when getting titleRecs we often exceed the minRecSizes // . ?Msg8? was having trouble. was int16_t 32 bytes sometimes. replyBufMaxSize += 36; // why add ten percent? //replyBufMaxSize *= 110 ; //replyBufMaxSize /= 100 ; // make a buffer to hold the reply //#ifdef SPLIT_INDEXDB /* if ( m_numSplit > 1 ) { m_replyBufSize = replyBufMaxSize * m_numSplit; replyBuf = (char *) mmalloc(m_replyBufSize, "Msg0"); m_replyBuf = replyBuf; freeReply = false; } else */ //#endif replyBuf = (char *) mmalloc(replyBufMaxSize , "Msg0"); // g_errno is set and we return true if it failed if ( ! replyBuf ) { log("net: Failed to pre-allocate %"INT32" bytes to hold " "data read remotely from %s: %s.", replyBufMaxSize,getDbnameFromId(m_rdbId), mstrerror(g_errno)); return true; } } // . make a request with the info above (note: not in network order) // . IMPORTANT!!!!! if you change this change // Multicast.cpp::sleepWrapper1 too!!!!!!!!!!!! 
// no, not anymore, we commented out that request peeking code char *p = m_request; *(int64_t *) p = syncPoint ; p += 8; //*(key_t *) p = m_startKey ; p += sizeof(key_t); //*(key_t *) p = m_endKey ; p += sizeof(key_t); *(int32_t *) p = m_minRecSizes ; p += 4; *(int32_t *) p = startFileNum ; p += 4; *(int32_t *) p = numFiles ; p += 4; *(int32_t *) p = maxCacheAge ; p += 4; if ( p - m_request != RDBIDOFFSET ) { char *xx=NULL;*xx=0; } *p = m_rdbId ; p++; *p = addToCache ; p++; *p = doErrorCorrection; p++; *p = includeTree ; p++; *p = (char)niceness ; p++; *p = (char)m_allowPageCache; p++; KEYSET(p,m_startKey,m_ks); ; p+=m_ks; KEYSET(p,m_endKey,m_ks); ; p+=m_ks; // NULL terminated collection name //strcpy ( p , coll ); p += gbstrlen ( coll ); *p++ = '\0'; *(collnum_t *)p = m_collnum; p += sizeof(collnum_t); m_requestSize = p - m_request; // ask an individual host for this list if hostId is NOT -1 if ( m_hostId != -1 ) { // get Host Host *h = g_hostdb.getHost ( m_hostId ); if ( ! h ) { g_errno = EBADHOSTID; log(LOG_LOGIC,"net: msg0: Bad hostId of %"INT64".", m_hostId); return true; } // if niceness is 0, use the higher priority udpServer UdpServer *us ; uint16_t port; QUICKPOLL(m_niceness); //if ( niceness <= 0 || netnice == 0 ) { //if ( realtime ) { // us = &g_udpServer2; port = h->m_port2; } //else { us = &g_udpServer ; port = h->m_port ; // . returns false on error and sets g_errno, true otherwise // . calls callback when reply is received (or error) // . we return true if it returns false if ( ! 
us->sendRequest ( m_request , m_requestSize , 0x00 , // msgType h->m_ip , port , m_hostId , NULL , // the slotPtr this , gotSingleReplyWrapper , timeout , -1 , // backoff -1 , // maxwait replyBuf , replyBufMaxSize , m_niceness ) ) // cback niceness return true; // return false cuz it blocked return false; } // timing debug if ( g_conf.m_logTimingNet ) m_startTime = gettimeofdayInMilliseconds(); else m_startTime = 0; //if ( m_rdbId == RDB_INDEXDB ) log("Msg0:: getting remote indexlist. " // "termId=%"UINT64", " // "groupNum=%"UINT32"", // g_indexdb.getTermId(m_startKey) , // g_hostdb.makeHostId ( m_groupId ) ); /* // make the cache key so we can see what remote host cached it, if any char cacheKey[MAX_KEY_BYTES]; //key_t cacheKey = makeCacheKey ( startKey , makeCacheKey ( startKey , endKey , includeTree , minRecSizes , startFileNum , numFiles , cacheKey , m_ks ); */ // . get the top int32_t of the key // . i guess this will work for 128 bit keys... hmmmmm int32_t keyTop = hash32 ( (char *)startKey , m_ks ); /* // allocate space if ( m_numSplit > 1 ) { int32_t need = m_numSplit * sizeof(Multicast) ; char *buf = (char *)mmalloc ( need,"msg0mcast" ); if ( ! buf ) return true; m_mcasts = (Multicast *)buf; for ( int32_t i = 0; i < m_numSplit ; i++ ) m_mcasts[i].constructor(); } */ // . otherwise, multicast to a host in group "groupId" // . returns false and sets g_errno on error // . calls callback on completion // . select first host to send to in group based on upper 32 bits // of termId (m_startKey.n1) //#ifdef SPLIT_INDEXDB // . need to send out to all the indexdb split hosts m_numRequests = 0; m_numReplies = 0; //for ( int32_t i = 0; i < m_numSplit; i++ ) { QUICKPOLL(m_niceness); //int32_t gr; char *buf; /* if ( m_numSplit > 1 ) { gr = g_indexdb.getSplitGroupId ( baseGroupId, i ); buf = &replyBuf[i*replyBufMaxSize]; } else { */ //gr = m_groupId; buf = replyBuf; //} // get the multicast Multicast *m = &m_mcast; //if ( m_numSplit > 1 ) m = &m_mcasts[i]; if ( ! 
m->send ( m_request , //#else // if ( ! m_mcast.send ( m_request , //#endif m_requestSize, 0x00 , // msgType 0x00 false , // does multicast own request? m_shardNum , //#ifdef SPLIT_INDEXDB // gr , // group + offset //#else // m_groupId , // group to send to (groupKey) //#endif false , // send to whole group? //m_startKey.n1, // key is passed on startKey keyTop , // key is passed on startKey this , // state data NULL , // state data gotMulticastReplyWrapper0 , timeout , // timeout in seconds (was 30) niceness , realtime , firstHostId , //#ifdef SPLIT_INDEXDB // &replyBuf[i*replyBufMaxSize] , //#else // replyBuf , //#endif buf , replyBufMaxSize , freeReply , // free reply buf? true , // do disk load balancing? maxCacheAge , //(key_t *)cacheKey , // multicast uses it for determining the best // host to send the request to when doing // disk load balancing. if the host has our // data cached, then it will probably get to // handle the request. for now let's just assume // this is a 96-bit key. TODO: fix... 0 , // *(key_t *)cacheKey , rdbId , minRecSizes ) ) { log("net: Failed to send request for data from %s in shard " "#%"UINT32" over network: %s.", getDbnameFromId(m_rdbId),m_shardNum, mstrerror(g_errno)); // no, multicast will free this when it is destroyed //if (replyBuf) mfree ( replyBuf , replyBufMaxSize , "Msg22" ); // but speed it up //#ifdef SPLIT_INDEXDB m_errno = g_errno; m->reset(); if ( m_numRequests > 0 ) return false; //#else // m_mcast.reset(); //#endif return true; } //#ifdef SPLIT_INDEXDB m_numRequests++; //#endif // we blocked return false; }
// . returns false if blocked, true otherwise
// . sets g_errno on error
// . "termIds/termFreqs" should NOT be on the stack in case we block
// . i based this on ../titledb/Msg23.cpp
// . asks the shard that holds termId's posdb termlist how many docs
//   contain the term; the answer ends up in m_termFreq
// . incCount/decCount (old quota support) are no longer allowed and
//   will intentionally crash below
bool Msg36::getTermFreq ( char *coll ,
			  long maxAge ,
			  long long termId ,
			  void *state ,
			  void (* callback)(void *state ) ,
			  long niceness ,
			  bool exactCount ,
			  bool incCount ,
			  bool decCount ,
			  bool isSplit) {
	// sanity check
	if ( termId == 0LL ) {
		g_errno = EBADENGINEER;
		log("quota: msg36: termid is 0.");
		return true;
	}
	// warning
	if ( ! coll ) log(LOG_LOGIC,"quota: msg36: NULL collection.");
	// no more quotas here! (deliberate crash sentinel)
	if ( incCount || decCount ) { char *xx = NULL; *xx = 0; }
	// sanity check
	//if ( incCount && ! exactCount ) { char *xx = NULL; *xx = 0; }
	//if ( decCount && ! exactCount ) { char *xx = NULL; *xx = 0; }
	// sanity check
	//if ( incCount && isSplit ) { char *xx = NULL; *xx = 0; }
	//if ( decCount && isSplit ) { char *xx = NULL; *xx = 0; }
	// cannot call handler asynchronously when doing exact counts...
	//if ( exactCount ) niceness = MAX_NICENESS;
	// keep a pointer for the caller
	m_state = state;
	m_callback = callback;
	m_termFreq = 0LL;
	m_niceness = niceness;
	m_errno = 0LL;
	m_isSplit = isSplit;
	// TODO: have a local by-pass for speed
	// if we have this termlist local then we can skip the network stuff
	//if ( g_indexdb.isLocal ( termId ) ) { return getTermFreqLocally(); }
	// make a key from our termId, and if docId is provided, that too.
	key144_t key ;
	g_posdb.makeStartKey ( &key, termId , 0LL );
	// . now what group do we belong to?
	// . groupMask has hi bits set before it sets low bits
	//unsigned long groupId = key.n1 & g_hostdb.m_groupMask;
	//unsigned long groupId;
	/*
	if ( g_hostdb.m_indexSplits > 1 )
		groupId = g_indexdb.getBaseGroupId(&key);
	else
		groupId = g_indexdb.getGroupIdFromKey(&key);
	*/
	//groupId = g_indexdb.getNoSplitGroupId(&key);
	uint32_t shardNum = getShardNum ( RDB_POSDB , &key );
	log(LOG_DEBUG,"quota: msg36 termid=%lli inc=%li dec=%li "
	    "sending to shard=%li\n",termId,(long)incCount,(long)decCount,
	    (long)shardNum);
	//unsigned long groupId = g_indexdb.getBaseGroupId(&key);
	//getGroupIdFromKey ( &key );
	// . what is the ideal hostId based on this key?
	// . this is what multicast does to determine the 1st host to send to
	//if ( groupId == g_hostdb.m_groupId &&
	bool local = true;
	if ( g_hostdb.m_indexSplits != 1 ) local = false;
	if ( shardNum != getMyShardNum() ) local = false;
	//if ( g_conf.m_fullSplit ) local = true;
	// NOTE(review): this unconditionally overrides the three checks
	// just above, so every non-exact count takes the local path --
	// confirm that is intended before removing the dead code
	local = true;
	if ( exactCount ) local = false;
	//if ( g_hostdb.m_indexSplits == 1 &&
	//     groupId == g_hostdb.m_groupId &&
	//     //!g_conf.m_interfaceMachine &&
	//     !exactCount ) {
	if ( local ) {
		//long numHosts;
		//Host *hosts = g_hostdb.getGroup(g_hostdb.m_groupId,&numHosts);
		//unsigned long i = ((unsigned long)groupId/*key*/) % numHosts;
		// if it's us then no need to multicast to ourselves
		//if(hosts[i].m_hostId==g_hostdb.m_hostId||g_conf.m_fullSplit) {
		// answer from our own posdb; no network round trip
		m_termFreq = g_posdb.getTermFreq ( coll , termId );
		// clear g_errno
		g_errno = 0;
		return true;
	}
	// . make a request
	// . just send the termId and collection name
	char *p = m_request;
	*p = 0;
	// exact flag
	if ( exactCount ) *p |= 0x01;
	//if ( incCount ) *p |= 0x02;
	//if ( decCount ) *p |= 0x04;
	if ( m_niceness ) *p |= 0x08;
	p++;
	*(long long *)p = termId ; p += sizeof(long long);
	strcpy ( p , coll ); p += gbstrlen(coll) + 1; // copy includes \0
	// exact counts can take a very long time, so effectively no timeout
	long timeout = 5;
	//if ( incCount || decCount ) timeout = 9999999;
	if ( exactCount ) timeout = 9999999;
	// . need to send out to all the indexdb split hosts
	m_numRequests = 0;
	m_numReplies = 0;
	bool blocked = false;
	// just do one host and multiply his count by the split
	// for now to increase performance
	bool semiExact = true;
	if(!m_isSplit) semiExact = false;
	// send a request for every split
	for ( long i = 0; i < g_hostdb.m_indexSplits; i++ ) {
		// NOTE(review): gr is only assigned in the else branch;
		// the first two branches crash on purpose before it is read
		long gr;
		char *buf;
		// semiExact overrides all
		if ( semiExact && g_hostdb.m_indexSplits > 1 ) {
			long nn = (unsigned long)termId %
				g_hostdb.m_indexSplits;
			// sanity check
			if ( nn < 0 || nn >= g_hostdb.m_indexSplits ) {
				char *xx = NULL; *xx = 0; }
			//gr = g_indexdb.getSplitGroupId ( groupId , nn);
			// need to select the first buffer
			buf = &m_reply[i*8];
			// do not use this!
			char *xx=NULL;*xx=0;
		}
		else if ( g_hostdb.m_indexSplits > 1 && m_isSplit) {
			//gr = g_indexdb.getSplitGroupId ( groupId, i );
			buf = &m_reply[i*8];
			// do not use this!
			char *xx=NULL;*xx=0;
		}
		else {
			gr = shardNum; //this is just the baseGroupId
			buf = m_reply;
		}
		// in case it fails somehow
		*(long long *)buf = 0LL;
		// . multicast to a host in group
		// . returns false and sets g_errno on error
		if ( ! m_mcast[i].send ( m_request ,
					 p - m_request, // request size
					 0x36 , // msgType 0x36
					 false , // multicast owns msg?
					 gr , // shard num
					 false , // send to whole group?
					 termId , // key is termId
					 this , // state data
					 NULL , // state data
					 gotReplyWrapper36 ,
					 timeout, //5 , // 5 second timeout
					 m_niceness ,
					 false , // realtime?
					 -1 , // first hostid
					 buf ,
					 8 ,
					 false ) ) { // free reply buf?
			log("quota: msg36: sending mcast had error: %s",
			    mstrerror(g_errno));
			//return true;
		}
		else {
			m_numRequests++;
			blocked = true;
		}
		// only launch (attempt to launch) one request if semiExact
		if ( semiExact ) break;
		// is we are not split only one host has the termlist
		if ( ! m_isSplit ) break;
		// no inefficient looping! let's nuke this mcast array
		char *xx = NULL; *xx = 0;
	}
	// we blocked on the multicast
	if ( blocked ) return false;
	return true;
}
void Indexdb::deepVerify ( char *coll ) { log ( LOG_INFO, "db: Deep Verifying Indexdb for coll %s...", coll ); g_threads.disableThreads(); Msg5 msg5; Msg5 msg5b; RdbList list; key_t startKey; key_t endKey; startKey.setMin(); endKey.setMax(); //long minRecSizes = 64000; collnum_t collnum = g_collectiondb.getCollnum(coll); RdbBase *rdbBase = g_indexdb.m_rdb.getBase(collnum); long numFiles = rdbBase->getNumFiles(); long currentFile = 0; deepLoop: // done after scanning all files if ( currentFile >= numFiles ) { g_threads.enableThreads(); log ( LOG_INFO, "db: Finished deep verify for %li files.", numFiles ); return; } // scan this file if ( ! msg5.getList ( RDB_INDEXDB , coll , &list , startKey , endKey , 64000 , // minRecSizes , true , // includeTree , false , // add to cache? 0 , // max cache age currentFile , // startFileNum , 1 , // numFiles , NULL , // state NULL , // callback 0 , // niceness false , // err correction? NULL , 0 , -1 , true , -1LL , &msg5b , false )) { g_threads.enableThreads(); log("db: HEY! it did not block"); return; } long count = 0; long got = 0; for ( list.resetListPtr() ; ! list.isExhausted() ; list.skipCurrentRecord() ) { key_t k = list.getCurrentKey(); count++; //unsigned long groupId = k.n1 & g_hostdb.m_groupMask; //unsigned long groupId = getGroupId ( RDB_INDEXDB , &k ); //if ( groupId == g_hostdb.m_groupId ) got++; unsigned long shardNum = getShardNum( RDB_INDEXDB , &k ); if ( shardNum == getMyShardNum() ) got++; } if ( got != count ) { BigFile *f = rdbBase->getFile(currentFile); log ("db: File %s: Out of first %li records in indexdb, " "only %li belong to our group.", f->getFilename(),count,got ); } //else // log ( LOG_INFO, "db: File %li: Indexdb passed verification " // "successfully for %li recs.",currentFile,count ); // next file currentFile++; goto deepLoop; }
bool Syncdb::verify ( char *coll ) { log ( LOG_INFO, "db: Verifying Syncdb for coll %s...", coll ); g_threads.disableThreads(); Msg5 msg5; Msg5 msg5b; RdbList list; key_t startKey; key_t endKey; startKey.setMin(); endKey.setMax(); CollectionRec *cr = g_collectiondb.getRec(coll); if ( ! msg5.getList ( RDB_SYNCDB , cr->m_collnum , &list , startKey , endKey , 64000 , // minRecSizes , true , // includeTree , false , // add to cache? 0 , // max cache age 0 , // startFileNum , -1 , // numFiles , NULL , // state NULL , // callback 0 , // niceness false , // err correction? NULL , 0 , -1 , true , -1LL , &msg5b , true )) { g_threads.enableThreads(); return log("db: HEY! it did not block"); } long count = 0; long got = 0; for ( list.resetListPtr() ; ! list.isExhausted() ; list.skipCurrentRecord() ) { key_t k = list.getCurrentKey(); count++; //unsigned long groupId = getGroupId ( RDB_SYNCDB , &k ); //if ( groupId == g_hostdb.m_groupId ) got++; uint32_t shardNum = getShardNum ( RDB_SYNCDB , (char *)&k ); if ( shardNum == getMyShardNum() ) got++; } if ( got != count ) { log ("db: Out of first %li records in syncdb, " "only %li belong to our group.",count,got); // exit if NONE, we probably got the wrong data if ( got == 0 ) log("db: Are you sure you have the " "right " "data in the right directory? " "Exiting."); log ( "db: Exiting due to Syncdb inconsistency." ); g_threads.enableThreads(); return g_conf.m_bypassValidation; } log ( LOG_INFO, "db: Syncdb passed verification successfully for " "%li recs.", count ); // DONE g_threads.enableThreads(); return true; }
// . sample the front of indexdb and confirm every key shards to us
// . NOTE(review): the "return true" on the first line short-circuits
//   the whole routine, so this verification is currently DISABLED and
//   everything below it is dead code -- confirm that is intentional
bool Indexdb::verify ( char *coll ) {
	return true;
	log ( LOG_INFO, "db: Verifying Indexdb for coll %s...", coll );
	// make the msg5 read synchronous
	g_threads.disableThreads();
	Msg5 msg5;
	Msg5 msg5b;
	RdbList list;
	key_t startKey;
	key_t endKey;
	startKey.setMin();
	endKey.setMax();
	//long minRecSizes = 64000;
	// sample up to 64k of recs from the tree plus all files
	if ( ! msg5.getList ( RDB_INDEXDB ,
			      coll ,
			      &list ,
			      startKey ,
			      endKey ,
			      64000 , // minRecSizes ,
			      true , // includeTree ,
			      false , // add to cache?
			      0 , // max cache age
			      0 , // startFileNum ,
			      -1 , // numFiles ,
			      NULL , // state
			      NULL , // callback
			      0 , // niceness
			      false , // err correction?
			      NULL ,
			      0 ,
			      -1 ,
			      true ,
			      -1LL ,
			      &msg5b ,
			      true )) {
		g_threads.enableThreads();
		return log("db: HEY! it did not block");
	}
	long count = 0;
	long got = 0;
	bool printedKey = false;
	bool printedZeroKey = false;
	for ( list.resetListPtr() ; ! list.isExhausted() ;
	      list.skipCurrentRecord() ) {
		key_t k = list.getCurrentKey();
		count++;
		//unsigned long groupId = k.n1 & g_hostdb.m_groupMask;
		//unsigned long groupId = getGroupId ( RDB_INDEXDB , &k );
		//if ( groupId == g_hostdb.m_groupId ) got++;
		unsigned long shardNum = getShardNum( RDB_INDEXDB , &k );
		if ( shardNum == getMyShardNum() ) got++;
		// log the first misplaced key only, to avoid log spam
		else if ( !printedKey ) {
			log ( "db: Found bad key in list (only printing "
			      "once): "
			      "%lx %llx", k.n1, k.n0 );
			printedKey = true;
		}
		// all-zero keys are tolerated: if the zero key failed the
		// shard check above, count it as passing anyway
		if ( k.n1 == 0 && k.n0 == 0 ) {
			if ( !printedZeroKey ) {
				log ( "db: Found Zero key in list, passing. "
				      "(only printing once)." );
				printedZeroKey = true;
			}
			if ( shardNum != getMyShardNum() ) got++;
		}
	}
	if ( got != count ) {
		log ("db: Out of first %li records in indexdb, only %li "
		     "belong "
		     "to our group.",count,got);
		// exit if NONE, we probably got the wrong data
		if ( got == 0 ) log("db: Are you sure you have the "
				    "right "
				    "data in the right directory? "
				    "Exiting.");
		log ( "db: Exiting due to Indexdb inconsistency." );
		g_threads.enableThreads();
		// operator can force the host up despite the mismatch
		return g_conf.m_bypassValidation;
	}
	log ( LOG_INFO, "db: Indexdb passed verification successfully for "
	      "%li "
	      "recs.", count );
	// DONE
	g_threads.enableThreads();
	return true;
}
// . returns false if blocked, true otherwise
// . sets g_errno on error
// . if the list is sorted by keys this will be the most efficient
// . carves m_list into runs of consecutive records that all map to the
//   same shard (capped at 64k per run) and hands each run to sendData()
bool Msg1::sendSomeOfList ( ) {
	// sanity check: only 8/12/16/24 byte keys are supported
	if ( m_list->m_ks != 8 &&
	     m_list->m_ks != 12 &&
	     m_list->m_ks != 16 &&
	     m_list->m_ks != 24 ) {
		g_process.shutdownAbort(true); }
	// debug msg
	//log("sendSomeOfList: mcast=%" PRIu32" exhausted=%" PRId32,
	//    (int32_t)&m_mcast,(int32_t)m_list->isExhausted());
 loop:
	// return true if list exhausted and nothing left to add
	if ( m_list->isExhausted() ) return true;
	// get key of the first record in the list; its shard determines
	// where this whole chunk is sent
	//key_t firstKey = m_list->getCurrentKey();
	char firstKey[MAX_KEY_BYTES];
	m_list->getCurrentKey(firstKey);
	QUICKPOLL(m_niceness);
	// get groupId from this key
	//uint32_t groupId ;
	// . use the new Hostdb.h inlined function
	uint32_t shardNum = getShardNum ( m_rdbId , firstKey );
	// point to start of data we're going to send
	char *dataStart = m_list->getListPtr();
	// how many records belong to the same group as "firstKey"
	//key_t key;
	char key[MAX_KEY_BYTES];
	while ( ! m_list->isExhausted() ) {
		//key = m_list->getCurrentKey();
		m_list->getCurrentKey(key);
#ifdef GBSANITYCHECK
		// no half bits in here!
		// debug point
		if ( m_list->useHalfKeys() &&
		     m_list->isHalfBitOn ( m_list->getCurrentRec() ) )
			log(LOG_LOGIC,"net: msg1: Got half bit. Bad "
			    "engineer.");
#endif
		// . if key belongs to same group as firstKey then continue
		// . titledb now uses last bits of docId to determine groupId
		// . but uses the top 32 bits of key still
		// . spiderdb uses last 64 bits to determine groupId
		// . tfndb now is like titledb(top 32 bits are top 32 of docId)
		//if ( getGroupId(m_rdbId,key) != groupId ) goto done;
		if ( getShardNum(m_rdbId,key) != shardNum ) goto done;
		// . break so we don't send more than MAX_DGRAMS defined in
		//   UdpServer.cpp.
		// . let's boost it from 16k to 64k for speed
		if ( m_list->getListPtr() - dataStart > 64*1024 ) goto done;
		// . point to next record
		// . will point passed records if no more left!
		QUICKPOLL(m_niceness);
		//int32_t crec = m_list->getCurrentRecSize();
		m_list->skipCurrentRecord();
		// sanity check: list ptr must never run past the end
		if ( m_list->m_listPtr > m_list->m_listEnd ) {
			g_process.shutdownAbort(true); }
	}
 done:
	// now point to the end of the data
	char *dataEnd = m_list->getListPtr();
	// . if force local is true we force the data to be added locally
	// . this fixes the bug we had from spiderdb since a key got corrupted
	//   just enough to put it into a different groupId (but not out
	//   of order) so we couldn't delete it cuz our delete keys would go
	//   elsewhere
	if ( m_forceLocal && shardNum != getMyShardNum() &&
	     ! g_conf.m_interfaceMachine ) {
		// make the groupId local, our group
		//groupId = g_hostdb.m_groupId;
		// bitch about this to log it
		log("net: Data does not belong in shard %" PRIu32", but adding "
		    "to %s anyway. Probable data corruption.",
		    (uint32_t)shardNum,getDbnameFromId(m_rdbId));
	}
	QUICKPOLL(m_niceness);
	// sanity test for new rdbs
	if ( m_list->m_fixedDataSize != getDataSizeFromRdbId(m_rdbId) ) {
		g_process.shutdownAbort(true); }
	// . now send this list to the host
	// . this returns false if blocked, true otherwise
	// . it also sets g_errno on error
	// . if it blocked return false
	if ( ! sendData ( shardNum , dataStart , dataEnd - dataStart ) )
		return false;
	// if there was an error return true
	if ( g_errno ) return true;
	// otherwise, keep adding
	goto loop;
}
/* bool Revdb::addColl ( char *coll, bool doVerify ) { if ( ! m_rdb.addColl ( coll ) ) return false; if ( ! doVerify ) return true; // verify if ( verify(coll) ) return true; // if not allowing scale, return false if ( ! g_conf.m_allowScale ) return false; // otherwise let it go log ( "db: Verify failed, but scaling is allowed, passing." ); return true; } */ bool Revdb::verify ( char *coll ) { log ( LOG_INFO, "db: Verifying Revdb for coll %s...", coll ); g_threads.disableThreads(); Msg5 msg5; Msg5 msg5b; RdbList list; key_t startKey; key_t endKey; startKey.setMin(); endKey.setMax(); //int32_t minRecSizes = 64000; CollectionRec *cr = g_collectiondb.getRec(coll); if ( ! msg5.getList ( RDB_REVDB , cr->m_collnum , &list , startKey , endKey , 1024*1024 , // minRecSizes , true , // includeTree , false , // add to cache? 0 , // max cache age 0 , // startFileNum , -1 , // numFiles , NULL , // state NULL , // callback 0 , // niceness false , // err correction? NULL , // cache key ptr 0 , // retry num -1 , // maxRetries true , // compensate for merge -1LL , // sync point &msg5b , false )) { g_threads.enableThreads(); return log("db: HEY! it did not block"); } int32_t count = 0; int32_t got = 0; for ( list.resetListPtr() ; ! list.isExhausted() ; list.skipCurrentRecord() ) { key_t k = list.getCurrentKey(); count++; //uint32_t groupId = getGroupId ( RDB_REVDB , &k ); //if ( groupId == g_hostdb.m_groupId ) got++; uint32_t shardNum = getShardNum( RDB_REVDB , &k ); if ( shardNum == getMyShardNum() ) got++; } if ( got != count ) { log ("db: Out of first %"INT32" records in revdb, " "only %"INT32" belong to our group.",count,got); // exit if NONE, we probably got the wrong data if ( count > 10 && got == 0 ) log("db: Are you sure you have the right " "data in the right directory? " "Exiting."); log ( "db: Exiting due to Revdb inconsistency." 
); g_threads.enableThreads(); return g_conf.m_bypassValidation; } log ( LOG_INFO, "db: Revdb passed verification successfully for %"INT32"" " recs.", count ); // DONE g_threads.enableThreads(); return true; }
/* bool Placedb::addColl ( char *coll, bool doVerify ) { if ( ! m_rdb.addColl ( coll ) ) return false; if ( ! doVerify ) return true; // verify if ( verify(coll) ) return true; // if not allowing scale, return false if ( ! g_conf.m_allowScale ) return false; // otherwise let it go log ( "db: Verify failed, but scaling is allowed, passing." ); return true; } */ bool Placedb::verify ( char *coll ) { log ( LOG_INFO, "db: Verifying Placedb for coll %s...", coll ); g_threads.disableThreads(); Msg5 msg5; Msg5 msg5b; RdbList list; key_t startKey; startKey.setMin(); key_t endKey; endKey.setMax(); if ( ! msg5.getList ( RDB_PLACEDB , coll , &list , startKey , endKey , 64000 , // minRecSizes , true , // includeTree , false , // add to cache? 0 , // max cache age 0 , // startFileNum , -1 , // numFiles , NULL , // state NULL , // callback 0 , // niceness false , // err correction? NULL , 0 , -1 , true , -1LL , &msg5b , true , false )) { // allow page cache? g_threads.enableThreads(); return log("db: HEY! it did not block"); } long count = 0; long got = 0; bool printedKey = false; bool printedZeroKey = false; for ( list.resetListPtr() ; ! list.isExhausted() ; list.skipCurrentRecord() ) { key_t k = list.getCurrentKey(); count++; // verify the group uint32_t shardNum = getShardNum ( RDB_PLACEDB , (char *)&k ); if ( shardNum == getMyShardNum() ) got++; else if ( !printedKey ) { log ("db: Found bad key in list (only printing once): " "%lx %llx", k.n1, k.n0 ); printedKey = true; } if ( k.n1 == 0 && k.n0 == 0 ) { if ( !printedZeroKey ) { log ( "db: Found Zero key in list, passing. " "(only printing once)." ); printedZeroKey = true; } // pass if we didn't match above if ( shardNum != getMyShardNum() ) got++; } } if ( got != count ) { log("db: Out of first %li records in placedb, only %li passed " "verification.",count,got); // exit if NONE, we probably got the wrong data if ( got == 0 ) log("db: Are you sure you have the " "right " "data in the right directory? 
" "Exiting."); g_threads.enableThreads(); // if only one let it slide, i saw this happen on gb1 cluster if ( got - count >= -1 && got - count <= 1 ) return true; log ( "db: Exiting due to Placedb inconsistency." ); return g_conf.m_bypassValidation; } log ( LOG_INFO, "db: Placedb passed verification successfully for %li " "recs.", count ); // DONE g_threads.enableThreads(); return true; }
// . returns false if blocked, true otherwise // . sets g_errno on error // . if the list is sorted by keys this will be the most efficient bool Msg1::sendSomeOfList ( ) { // sanity check if ( m_list->m_ks != 8 && m_list->m_ks != 12 && m_list->m_ks != 16 && m_list->m_ks != 24 ) { char *xx=NULL;*xx=0; } // debug msg //log("sendSomeOfList: mcast=%lu exhausted=%li", // (long)&m_mcast,(long)m_list->isExhausted()); loop: // return true if list exhausted and nothing left to add if ( m_list->isExhausted() ) return true; // get key of the first record in the list //key_t firstKey = m_list->getCurrentKey(); char firstKey[MAX_KEY_BYTES]; m_list->getCurrentKey(firstKey); QUICKPOLL(m_niceness); // get groupId from this key //unsigned long groupId ; // . use the new Hostdb.h inlined function uint32_t shardNum = getShardNum ( m_rdbId , firstKey ); // . default is to use top bits of the key // . but if we're adding to titledb use last bits in the top of key // . but if we're adding to spiderdb we use the last long in the key // . 
tfndb urlRec key same as titleRec key /* if ( m_rdbId == RDB_INDEXDB ) groupId = g_indexdb.getGroupIdFromKey((key_t *)firstKey); else if ( m_rdbId == RDB_DATEDB ) groupId = g_datedb.getGroupIdFromKey((key128_t *)firstKey); else if ( m_rdbId == RDB_TITLEDB) groupId = g_titledb.getGroupIdFromKey((key_t *)firstKey); else if ( m_rdbId == RDB_CHECKSUMDB) groupId = g_checksumdb.getGroupId ( firstKey ); else if ( m_rdbId == RDB_SPIDERDB ) groupId = g_spiderdb.getGroupId ( (key_t *)firstKey ); else if ( m_rdbId == RDB_TFNDB ) groupId = g_tfndb.getGroupId ( (key_t *)firstKey ); else if ( m_rdbId == RDB_CLUSTERDB ) groupId = g_clusterdb.getGroupIdFromKey((key_t *)firstKey); else if ( m_rdbId == RDB2_INDEXDB2 ) groupId = g_indexdb.getGroupIdFromKey((key_t *)firstKey); else if ( m_rdbId == RDB2_DATEDB2 ) groupId = g_datedb.getGroupIdFromKey((key128_t *)firstKey); else if ( m_rdbId == RDB2_TITLEDB2) groupId = g_titledb.getGroupIdFromKey((key_t *)firstKey); else if ( m_rdbId == RDB2_CHECKSUMDB2) groupId = g_checksumdb.getGroupId ( firstKey ); else if ( m_rdbId == RDB2_SPIDERDB2 ) groupId = g_spiderdb.getGroupId ( (key_t *)firstKey ); else if ( m_rdbId == RDB2_TFNDB2 ) groupId = g_tfndb.getGroupId ( (key_t *)firstKey ); else if ( m_rdbId == RDB2_CLUSTERDB2 ) groupId = g_clusterdb.getGroupIdFromKey((key_t *)firstKey); //else groupId=firstKey.n1 & g_hostdb.m_groupMask; else groupId = (((key_t *)firstKey)->n1) & g_hostdb.m_groupMask; */ // point to start of data we're going to send char *dataStart = m_list->getListPtr(); // how many records belong to the same group as "firstKey" //key_t key; char key[MAX_KEY_BYTES]; while ( ! m_list->isExhausted() ) { //key = m_list->getCurrentKey(); m_list->getCurrentKey(key); #ifdef _SANITYCHECK_ // no half bits in here! // debug point if ( m_list->useHalfKeys() && m_list->isHalfBitOn ( m_list->getCurrentRec() ) ) log(LOG_LOGIC,"net: msg1: Got half bit. Bad " "engineer."); #endif // . if key belongs to same group as firstKey then continue // . 
titledb now uses last bits of docId to determine groupId // . but uses the top 32 bits of key still // . spiderdb uses last 64 bits to determine groupId // . tfndb now is like titledb(top 32 bits are top 32 of docId) //if ( getGroupId(m_rdbId,key) != groupId ) goto done; if ( getShardNum(m_rdbId,key) != shardNum ) goto done; /* switch ( m_rdbId ) { case RDB_TITLEDB: if(g_titledb.getGroupIdFromKey((key_t *)key)!=groupId) goto done; break; case RDB_CHECKSUMDB: if(g_checksumdb.getGroupId ( key)!=groupId) goto done; break; case RDB_SPIDERDB: if ( g_spiderdb.getGroupId ((key_t *)key) != groupId) goto done; break; case RDB_TFNDB: if ( g_tfndb.getGroupId ((key_t *)key) != groupId) goto done; break; case RDB_CLUSTERDB: if(g_clusterdb.getGroupIdFromKey((key_t *)key)!=groupId) goto done; break; case RDB_DATEDB: if(g_datedb.getGroupIdFromKey((key128_t *)key)!=groupId) goto done; break; case RDB_INDEXDB: if(g_indexdb.getGroupIdFromKey((key_t *)key)!=groupId) goto done; break; //default:if ((key.n1&g_hostdb.m_groupMask) != groupId) default: if ( ((((key_t *)key)->n1) & g_hostdb.m_groupMask) != groupId) goto done; } */ // . break so we don't send more than MAX_DGRAMS defined in // UdpServer.cpp. // . let's boost it from 16k to 64k for speed if ( m_list->getListPtr() - dataStart > 64*1024 ) goto done; // . point to next record // . will point passed records if no more left! QUICKPOLL(m_niceness); //long crec = m_list->getCurrentRecSize(); m_list->skipCurrentRecord(); // sanity check if ( m_list->m_listPtr > m_list->m_listEnd ) { char *xx=NULL;*xx=0; } } done: // now point to the end of the data char *dataEnd = m_list->getListPtr(); // . if force local is true we force the data to be added locally // . this fixes the bug we had from spiderdb since a key got corrupted // just enough to put it into a different groupId (but not out // of order) so we couldn't delete it cuz our delete keys would go // elsewhere if ( m_forceLocal && shardNum != getMyShardNum() && ! 
g_conf.m_interfaceMachine ) { // make the groupId local, our group //groupId = g_hostdb.m_groupId; // bitch about this to log it log("net: Data does not belong in shard %lu, but adding " "to %s anyway. Probable data corruption.", (unsigned long)shardNum,getDbnameFromId(m_rdbId)); } QUICKPOLL(m_niceness); // sanity test for new rdbs if ( m_list->m_fixedDataSize != getDataSizeFromRdbId(m_rdbId) ) { char *xx=NULL;*xx=0; } // little debug thing for genCatdb from msg9b's huge list add //if ( m_list->m_listSize > 10000000 ) // log("msg1: adding chunk @ %li of %li bytes", // (long)(dataStart - m_list->m_list) , // (long)m_list->m_listSize ); // . now send this list to the host // . this returns false if blocked, true otherwise // . it also sets g_errno on error // . if it blocked return false if ( ! sendData ( shardNum , dataStart , dataEnd - dataStart ) ) return false; // if there was an error return true if ( g_errno ) return true; // otherwise, keep adding goto loop; }