~ScopeCache() { if (inShutdown()) return; clear(); }
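// A minimal, self-contained sketch of the shutdown-guard destructor pattern used by
// ~ScopeCache above: during process shutdown, a destructor skips cleanup because the
// globals it would touch may already be gone. Names here (g_inShutdown, Cache) are
// hypothetical stand-ins, not MongoDB APIs.
#include <atomic>
#include <map>
#include <string>

static std::atomic<bool> g_inShutdown{false};

class Cache {
public:
    ~Cache() {
        if (g_inShutdown.load())
            return; // process is exiting; don't touch possibly-destroyed state
        _entries.clear();
    }

private:
    std::map<std::string, std::string> _entries;
};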
void run() {
    if ( _token.size() == 0 && _name.size() == 0 ) {
        LOG(1) << "mms not configured" << endl;
        return;
    }

    if ( _token.size() == 0 ) {
        log() << "no token for mms - not running" << endl;
        return;
    }

    if ( _name.size() == 0 ) {
        log() << "no name for mms - not running" << endl;
        return;
    }

    log() << "mms monitor starting... token:" << _token
          << " name:" << _name
          << " interval: " << _secsToSleep << endl;

    Client::initThread( "mms" );
    Client& c = cc();

    // TODO: using direct client is bad, but easy for now

    while ( ! inShutdown() ) {
        sleepsecs( _secsToSleep );

        try {
            stringstream url;
            url << _baseurl << "?"
                << "token=" << _token << "&"
                << "name=" << _name << "&"
                << "ts=" << time(0);

            BSONObjBuilder bb;
            // duplicated so the post has everything
            bb.append( "token" , _token );
            bb.append( "name" , _name );
            bb.appendDate( "ts" , jsTime() );

            // any commands
            _add( bb , "buildinfo" );
            _add( bb , "serverStatus" );

            BSONObj postData = bb.obj();

            LOG(1) << "mms url: " << url.str() << "\n\t post: " << postData << endl;

            HttpClient http; // renamed from 'c' to avoid shadowing the Client& above
            HttpClient::Result r;
            int rc = http.post( url.str() , postData.jsonString() , &r );
            LOG(1) << "\t response code: " << rc << endl;
            if ( rc != 200 ) {
                log() << "mms error response code:" << rc << endl;
                LOG(1) << "mms error body:" << r.getEntireResponse() << endl;
            }
        }
        catch ( std::exception& e ) {
            log() << "mms exception: " << e.what() << endl;
        }
    }

    c.shutdown();
}
void runSyncThread() {
    Client::initThread("rsSync");
    cc().getAuthorizationSession()->grantInternalAuthorization();
    ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator();

    // Set initial indexPrefetch setting
    const std::string& prefetch = replCoord->getSettings().rsIndexPrefetch;
    if (!prefetch.empty()) {
        BackgroundSync::IndexPrefetchConfig prefetchConfig = BackgroundSync::PREFETCH_ALL;
        if (prefetch == "none")
            prefetchConfig = BackgroundSync::PREFETCH_NONE;
        else if (prefetch == "_id_only")
            prefetchConfig = BackgroundSync::PREFETCH_ID_ONLY;
        else if (prefetch == "all")
            prefetchConfig = BackgroundSync::PREFETCH_ALL;
        else {
            warning() << "unrecognized indexPrefetch setting " << prefetch << ", defaulting "
                      << "to \"all\"";
        }
        BackgroundSync::get()->setIndexPrefetchConfig(prefetchConfig);
    }

    while (!inShutdown()) {
        // After a reconfig, we may not be in the replica set anymore, so
        // check that we are in the set (and not an arbiter) before
        // trying to sync with other replicas.
        // TODO(spencer): Use a condition variable to await loading a config
        if (replCoord->getMemberState().startup()) {
            warning() << "did not receive a valid config yet, sleeping 5 seconds ";
            sleepsecs(5);
            continue;
        }

        const MemberState memberState = replCoord->getMemberState();

        // An arbiter can never transition to any other state, and doesn't replicate, ever
        if (memberState.arbiter()) {
            break;
        }

        // If we are removed then we don't belong to the set anymore
        if (memberState.removed()) {
            sleepsecs(5);
            continue;
        }

        try {
            if (memberState.primary() && !replCoord->isWaitingForApplierToDrain()) {
                sleepsecs(1);
                continue;
            }

            bool initialSyncRequested = BackgroundSync::get()->getInitialSyncRequestedFlag();
            // Check criteria for doing an initial sync:
            // 1. If the oplog is empty, do an initial sync
            // 2. If minValid has _initialSyncFlag set, do an initial sync
            // 3. If initialSyncRequested is true
            if (getGlobalReplicationCoordinator()->getMyLastOptime().isNull() ||
                    getInitialSyncFlag() ||
                    initialSyncRequested) {
                syncDoInitialSync();
                continue; // start from top again in case sync failed.
            }

            if (!replCoord->setFollowerMode(MemberState::RS_RECOVERING)) {
                continue;
            }

            /* we have some data.  continue tailing. */
            SyncTail tail(BackgroundSync::get(), multiSyncApply);
            tail.oplogApplication();
        }
        catch(const DBException& e) {
            log() << "Received exception while syncing: " << e.toString();
            sleepsecs(10);
        }
        catch(const std::exception& e) {
            log() << "Received exception while syncing: " << e.what();
            sleepsecs(10);
        }
    }
    cc().shutdown();
}
NetworkInterfaceMock::~NetworkInterfaceMock() {
    stdx::unique_lock<stdx::mutex> lk(_mutex);
    invariant(!_hasStarted || inShutdown());
    invariant(_scheduled.empty());
    invariant(_blackHoled.empty());
}
void BackgroundSync::produce() {
    // this oplog reader does not do a handshake because we don't want the server it's syncing
    // from to track how far it has synced
    OplogReader r(false /* doHandshake */);

    // find a target to sync from the last op time written
    getOplogReader(r);

    // no server found
    {
        boost::unique_lock<boost::mutex> lock(_mutex);

        if (_currentSyncTarget == NULL) {
            lock.unlock();
            sleepsecs(1); // if there is no one to sync from
            return;
        }

        r.tailingQueryGTE(rsoplog, _lastOpTimeFetched);
    }

    // if target cut connections between connecting and querying (for
    // example, because it stepped down) we might not have a cursor
    if (!r.haveCursor()) {
        return;
    }

    while (MONGO_FAIL_POINT(rsBgSyncProduce)) {
        sleepmillis(0);
    }

    uassert(1000, "replSet source for syncing doesn't seem to be await capable -- is it an older version of mongodb?", r.awaitCapable() );

    if (isRollbackRequired(r)) {
        stop();
        return;
    }

    while (!inShutdown()) {
        while (!inShutdown()) {
            if (!r.moreInCurrentBatch()) {
                if (theReplSet->gotForceSync()) {
                    return;
                }

                if (isAssumingPrimary() || theReplSet->isPrimary()) {
                    return;
                }

                // re-evaluate quality of sync target
                if (shouldChangeSyncTarget()) {
                    return;
                }

                // record time for each getmore
                {
                    TimerHolder batchTimer(&getmoreReplStats);
                    r.more();
                }

                // increment
                networkByteStats.increment(r.currentBatchMessageSize());
            }

            if (!r.more())
                break;

            BSONObj o = r.nextSafe().getOwned();
            opsReadStats.increment();

            {
                boost::unique_lock<boost::mutex> lock(_mutex);
                _appliedBuffer = false;
            }

            OCCASIONALLY {
                LOG(2) << "bgsync buffer has " << _buffer.size() << " bytes" << rsLog;
            }

            // the blocking queue will wait (forever) until there's room for us to push
            _buffer.push(o);
            bufferCountGauge.increment();
            bufferSizeGauge.increment(getSize(o));

            {
                boost::unique_lock<boost::mutex> lock(_mutex);
                _lastH = o["h"].numberLong();
                _lastOpTimeFetched = o["ts"]._opTime();
            }
        } // end while

        {
            boost::unique_lock<boost::mutex> lock(_mutex);
            if (_pause || !_currentSyncTarget || !_currentSyncTarget->hbinfo().hbstate.readable()) {
                return;
            }
        }

        r.tailCheck();
        if( !r.haveCursor() ) {
            LOG(1) << "replSet end syncTail pass" << rsLog;
            return;
        }

        // looping back is ok because this is a tailable cursor
    }
}
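// The produce() loops in this file share one shape: pull items from a source, push them
// into a buffer, and re-check a shutdown flag on every iteration so the thread can exit
// promptly. A minimal sketch of that loop, with Source and Buffer as hypothetical
// stand-ins (not MongoDB types):
#include <atomic>
#include <deque>
#include <optional>

struct Source {
    // hypothetical source; returns an item, or nothing when exhausted
    std::optional<int> next() { return std::nullopt; }
};
struct Buffer {
    std::deque<int> q;
    void push(int v) { q.push_back(v); } // stands in for the blocking _buffer.push(o)
};

void produceLoop(Source& src, Buffer& buf, const std::atomic<bool>& shutdown) {
    while (!shutdown.load()) {
        std::optional<int> item = src.next();
        if (!item)
            return; // source exhausted or target changed; caller decides what happens next
        buf.push(*item);
    }
}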
void Listener::initAndListen() {
    checkTicketNumbers();
    vector<SockAddr> mine = ipToAddrs(_ip.c_str(), _port);
    vector<int> socks;
    SOCKET maxfd = 0; // needed for select()

    for (vector<SockAddr>::iterator it=mine.begin(), end=mine.end(); it != end; ++it) {
        SockAddr& me = *it;

        SOCKET sock = ::socket(me.getType(), SOCK_STREAM, 0);
        if ( sock == INVALID_SOCKET ) {
            log() << "ERROR: listen(): invalid socket? " << errnoWithDescription() << endl;
        }

        if (me.getType() == AF_UNIX) {
#if !defined(_WIN32)
            if (unlink(me.getAddr().c_str()) == -1) {
                int x = errno;
                if (x != ENOENT) {
                    log() << "couldn't unlink socket file " << me << errnoWithDescription(x) << " skipping" << endl;
                    continue;
                }
            }
#endif
        }
        else if (me.getType() == AF_INET6) {
            // IPv6 can also accept IPv4 connections as mapped addresses (::ffff:127.0.0.1)
            // That causes a conflict if we don't set it to IPV6_ONLY
            const int one = 1;
            setsockopt(sock, IPPROTO_IPV6, IPV6_V6ONLY, (const char*) &one, sizeof(one));
        }

        prebindOptions( sock );

        if ( ::bind(sock, me.raw(), me.addressSize) != 0 ) {
            int x = errno;
            log() << "listen(): bind() failed " << errnoWithDescription(x) << " for socket: " << me.toString() << endl;
            if ( x == EADDRINUSE )
                log() << " addr already in use" << endl;
            closesocket(sock);
            return;
        }

#if !defined(_WIN32)
        if (me.getType() == AF_UNIX) {
            if (chmod(me.getAddr().c_str(), 0777) == -1) {
                log() << "couldn't chmod socket file " << me << errnoWithDescription() << endl;
            }
            ListeningSockets::get()->addPath( me.getAddr() );
        }
#endif

        if ( ::listen(sock, 128) != 0 ) {
            log() << "listen(): listen() failed " << errnoWithDescription() << endl;
            closesocket(sock);
            return;
        }

        ListeningSockets::get()->add( sock );
        socks.push_back(sock);
        if (sock > maxfd)
            maxfd = sock;
    }

    static long connNumber = 0;
    struct timeval maxSelectTime;
    while ( ! inShutdown() ) {
        fd_set fds[1];
        FD_ZERO(fds);

        for (vector<int>::iterator it=socks.begin(), end=socks.end(); it != end; ++it) {
            FD_SET(*it, fds);
        }

        maxSelectTime.tv_sec = 0;
        maxSelectTime.tv_usec = 10000;
        const int ret = select(maxfd+1, fds, NULL, NULL, &maxSelectTime);

        if (ret == 0) {
#if defined(__linux__)
            _elapsedTime += ( 10000 - maxSelectTime.tv_usec ) / 1000;
#else
            _elapsedTime += 10;
#endif
            continue;
        }
        _elapsedTime += ret; // assume 1ms to grab connection. very rough

        if (ret < 0) {
            int x = errno;
#ifdef EINTR
            if ( x == EINTR ) {
                log() << "select() signal caught, continuing" << endl;
                continue;
            }
#endif
            if ( ! inShutdown() )
                log() << "select() failure: ret=" << ret << " " << errnoWithDescription(x) << endl;
            return;
        }

        for (vector<int>::iterator it=socks.begin(), end=socks.end(); it != end; ++it) {
            if (! (FD_ISSET(*it, fds)))
                continue;

            SockAddr from;
            int s = accept(*it, from.raw(), &from.addressSize);
            if ( s < 0 ) {
                int x = errno; // so no global issues
                if ( x == ECONNABORTED || x == EBADF ) {
                    log() << "Listener on port " << _port << " aborted" << endl;
                    return;
                }
                if ( x == 0 && inShutdown() ) {
                    return; // socket closed
                }
                if( !inShutdown() )
                    log() << "Listener: accept() returns " << s << " " << errnoWithDescription(x) << endl;
                continue;
            }
            if (from.getType() != AF_UNIX)
                disableNagle(s);
            if ( _logConnect && ! cmdLine.quiet )
                log() << "connection accepted from " << from.toString() << " #" << ++connNumber << endl;
            accepted(s, from);
        }
    }
}
void Listener::initAndListen() {
    checkTicketNumbers();

    vector<SOCKET> socks;

    { // normal sockets
        vector<SockAddr> mine = ipToAddrs(_ip.c_str(),
                                          _port,
                                          (!cmdLine.noUnixSocket && useUnixSockets()));
        if ( ! _setupSockets( mine , socks ) )
            return;
    }

    SOCKET maxfd = 0; // needed for select()
    for ( unsigned i=0; i<socks.size(); i++ ) {
        if ( socks[i] > maxfd )
            maxfd = socks[i];
    }

#ifdef MONGO_SSL
    _logListen(_port, _ssl);
#else
    _logListen(_port, false);
#endif

    struct timeval maxSelectTime;
    while ( ! inShutdown() ) {
        fd_set fds[1];
        FD_ZERO(fds);

        for (vector<SOCKET>::iterator it=socks.begin(), end=socks.end(); it != end; ++it) {
            FD_SET(*it, fds);
        }

        maxSelectTime.tv_sec = 0;
        maxSelectTime.tv_usec = 10000;
        const int ret = select(maxfd+1, fds, NULL, NULL, &maxSelectTime);

        if (ret == 0) {
#if defined(__linux__)
            _elapsedTime += ( 10000 - maxSelectTime.tv_usec ) / 1000;
#else
            _elapsedTime += 10;
#endif
            continue;
        }

        if (ret < 0) {
            int x = errno;
#ifdef EINTR
            if ( x == EINTR ) {
                log() << "select() signal caught, continuing" << endl;
                continue;
            }
#endif
            if ( ! inShutdown() )
                log() << "select() failure: ret=" << ret << " " << errnoWithDescription(x) << endl;
            return;
        }

#if defined(__linux__)
        _elapsedTime += max(ret, (int)(( 10000 - maxSelectTime.tv_usec ) / 1000));
#else
        _elapsedTime += ret; // assume 1ms to grab connection. very rough
#endif

        for (vector<SOCKET>::iterator it=socks.begin(), end=socks.end(); it != end; ++it) {
            if (! (FD_ISSET(*it, fds)))
                continue;

            SockAddr from;
            int s = accept(*it, from.raw(), &from.addressSize);
            if ( s < 0 ) {
                int x = errno; // so no global issues

                if ( x == ECONNABORTED || x == EBADF ) {
                    log() << "Listener on port " << _port << " aborted" << endl;
                    return;
                }

                if ( x == 0 && inShutdown() ) {
                    return; // socket closed
                }

                if( !inShutdown() ) {
                    log() << "Listener: accept() returns " << s << " " << errnoWithDescription(x) << endl;
                    if (x == EMFILE || x == ENFILE) {
                        // Connection still in listen queue but we can't accept it yet
                        error() << "Out of file descriptors. Waiting one second before trying to accept more connections." << warnings;
                        sleepsecs(1);
                    }
                }
                continue;
            }

            if (from.getType() != AF_UNIX)
                disableNagle(s);

#ifdef SO_NOSIGPIPE
            // ignore SIGPIPE signals on osx, to avoid process exit
            const int one = 1;
            setsockopt( s , SOL_SOCKET, SO_NOSIGPIPE, &one, sizeof(int));
#endif

            long long myConnectionNumber = globalConnectionNumber.addAndFetch(1);

            if ( _logConnect && ! cmdLine.quiet ){
                int conns = globalTicketHolder.used()+1;
                const char* word = (conns == 1 ? " connection" : " connections");
                log() << "connection accepted from " << from.toString() << " #" << myConnectionNumber << " (" << conns << word << " now open)" << endl;
            }

            boost::shared_ptr<Socket> pnewSock( new Socket(s, from) );
#ifdef MONGO_SSL
            if (_ssl) {
                pnewSock->secureAccepted(_ssl);
            }
#endif
            accepted( pnewSock , myConnectionNumber );
        }
    }
}
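// Both initAndListen() variants above drive accept() from select() with a short timeout
// so the loop can observe shutdown between waits, and treat EINTR as retryable. A
// stripped-down POSIX sketch of that structure (error handling and connection hand-off
// elided; listenFd is assumed to already be bound and listening):
#include <atomic>
#include <cerrno>
#include <sys/select.h>
#include <sys/socket.h>

void acceptLoop(int listenFd, const std::atomic<bool>& shutdown) {
    while (!shutdown.load()) {
        fd_set fds;
        FD_ZERO(&fds);
        FD_SET(listenFd, &fds);
        timeval tv{0, 10000}; // 10ms, so shutdown is noticed quickly
        int ret = select(listenFd + 1, &fds, nullptr, nullptr, &tv);
        if (ret == 0)
            continue;     // timeout: loop around and re-check the shutdown flag
        if (ret < 0) {
            if (errno == EINTR)
                continue; // interrupted by a signal: retry
            return;       // real error: give up
        }
        int conn = accept(listenFd, nullptr, nullptr);
        if (conn >= 0) {
            // hand 'conn' off to a per-connection handler here
        }
    }
}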
void BackgroundSync::_produce(
    OperationContext* txn,
    ReplicationCoordinatorExternalState* replicationCoordinatorExternalState) {
    // this oplog reader does not do a handshake because we don't want the server it's syncing
    // from to track how far it has synced
    {
        stdx::unique_lock<stdx::mutex> lock(_mutex);
        if (_lastOpTimeFetched.isNull()) {
            // then we're initial syncing and we're still waiting for this to be set
            lock.unlock();
            sleepsecs(1); // if there is no one to sync from
            return;
        }

        if (_replCoord->isWaitingForApplierToDrain() || _replCoord->getMemberState().primary() ||
            inShutdownStrict()) {
            return;
        }
    }

    while (MONGO_FAIL_POINT(rsBgSyncProduce)) {
        sleepmillis(0);
    }

    // find a target to sync from the last optime fetched
    OpTime lastOpTimeFetched;
    HostAndPort source;
    {
        stdx::unique_lock<stdx::mutex> lock(_mutex);
        lastOpTimeFetched = _lastOpTimeFetched;
        _syncSourceHost = HostAndPort();
    }
    SyncSourceResolverResponse syncSourceResp =
        _syncSourceResolver.findSyncSource(txn, lastOpTimeFetched);

    if (syncSourceResp.syncSourceStatus == ErrorCodes::OplogStartMissing) {
        // All (accessible) sync sources were too stale.
        error() << "too stale to catch up -- entering maintenance mode";
        log() << "Our newest OpTime : " << lastOpTimeFetched;
        log() << "Earliest OpTime available is " << syncSourceResp.earliestOpTimeSeen;
        log() << "See http://dochub.mongodb.org/core/resyncingaverystalereplicasetmember";
        StorageInterface::get(txn)
            ->setMinValid(txn, {lastOpTimeFetched, syncSourceResp.earliestOpTimeSeen});
        auto status = _replCoord->setMaintenanceMode(true);
        if (!status.isOK()) {
            warning() << "Failed to transition into maintenance mode.";
        }
        bool worked = _replCoord->setFollowerMode(MemberState::RS_RECOVERING);
        if (!worked) {
            warning() << "Failed to transition into " << MemberState(MemberState::RS_RECOVERING)
                      << ". Current state: " << _replCoord->getMemberState();
        }
        return;
    } else if (syncSourceResp.isOK() && !syncSourceResp.getSyncSource().empty()) {
        stdx::lock_guard<stdx::mutex> lock(_mutex);
        _syncSourceHost = syncSourceResp.getSyncSource();
        source = _syncSourceHost;
    } else {
        if (!syncSourceResp.isOK()) {
            log() << "failed to find sync source, received error "
                  << syncSourceResp.syncSourceStatus.getStatus();
        }
        // No sync source found.
        sleepsecs(1);
        return;
    }

    long long lastHashFetched;
    {
        stdx::lock_guard<stdx::mutex> lock(_mutex);
        if (_stopped) {
            return;
        }
        lastOpTimeFetched = _lastOpTimeFetched;
        lastHashFetched = _lastFetchedHash;
        _replCoord->signalUpstreamUpdater();
    }

    // "lastFetched" not used. Already set in _enqueueDocuments.
    Status fetcherReturnStatus = Status::OK();
    DataReplicatorExternalStateBackgroundSync dataReplicatorExternalState(
        _replCoord, replicationCoordinatorExternalState, this);
    OplogFetcher* oplogFetcher;
    try {
        auto config = _replCoord->getConfig();
        auto onOplogFetcherShutdownCallbackFn =
            [&fetcherReturnStatus](const Status& status, const OpTimeWithHash& lastFetched) {
                fetcherReturnStatus = status;
            };

        stdx::lock_guard<stdx::mutex> lock(_mutex);
        _oplogFetcher = stdx::make_unique<OplogFetcher>(
            &_threadPoolTaskExecutor,
            OpTimeWithHash(lastHashFetched, lastOpTimeFetched),
            source,
            NamespaceString(rsOplogName),
            config,
            &dataReplicatorExternalState,
            stdx::bind(&BackgroundSync::_enqueueDocuments,
                       this,
                       stdx::placeholders::_1,
                       stdx::placeholders::_2,
                       stdx::placeholders::_3,
                       stdx::placeholders::_4),
            onOplogFetcherShutdownCallbackFn);
        oplogFetcher = _oplogFetcher.get();
    } catch (const mongo::DBException& ex) {
        fassertFailedWithStatus(34440, exceptionToStatus());
    }

    LOG(1) << "scheduling fetcher to read remote oplog on " << _syncSourceHost << " starting at "
           << oplogFetcher->getCommandObject_forTest()["filter"];
    auto scheduleStatus = oplogFetcher->startup();
    if (!scheduleStatus.isOK()) {
        warning() << "unable to schedule fetcher to read remote oplog on " << source << ": "
                  << scheduleStatus;
        return;
    }
    oplogFetcher->join();
    LOG(1) << "fetcher stopped reading remote oplog on " << source;

    // If the background sync is stopped after the fetcher is started, we need to
    // re-evaluate our sync source and oplog common point.
    if (isStopped()) {
        return;
    }

    if (fetcherReturnStatus.code() == ErrorCodes::OplogOutOfOrder) {
        // This is bad because it means that our source
        // has not returned oplog entries in ascending ts order, and they need to be.
        warning() << fetcherReturnStatus.toString();
        // Do not blacklist the server here, it will be blacklisted when we try to reuse it,
        // if it can't return a matching oplog start from the last fetch oplog ts field.
        return;
    } else if (fetcherReturnStatus.code() == ErrorCodes::OplogStartMissing ||
               fetcherReturnStatus.code() == ErrorCodes::RemoteOplogStale) {
        // Rollback is a synchronous operation that uses the task executor and may not be
        // executed inside the fetcher callback.
        const int messagingPortTags = 0;
        ConnectionPool connectionPool(messagingPortTags);
        std::unique_ptr<ConnectionPool::ConnectionPtr> connection;
        auto getConnection = [&connection, &connectionPool, source]() -> DBClientBase* {
            if (!connection.get()) {
                connection.reset(new ConnectionPool::ConnectionPtr(
                    &connectionPool, source, Date_t::now(), kOplogSocketTimeout));
            }
            return connection->get();
        };

        {
            stdx::lock_guard<stdx::mutex> lock(_mutex);
            lastOpTimeFetched = _lastOpTimeFetched;
        }

        log() << "Starting rollback due to " << fetcherReturnStatus;

        // Wait till all buffered oplog entries have drained and been applied.
        auto lastApplied = _replCoord->getMyLastAppliedOpTime();
        if (lastApplied != lastOpTimeFetched) {
            log() << "Waiting for all operations from " << lastApplied << " until "
                  << lastOpTimeFetched << " to be applied before starting rollback.";
            while (lastOpTimeFetched > (lastApplied = _replCoord->getMyLastAppliedOpTime())) {
                sleepmillis(10);
                if (isStopped() || inShutdown()) {
                    return;
                }
            }
        }

        // check that we are at minvalid, otherwise we cannot roll back as we may be in an
        // inconsistent state
        BatchBoundaries boundaries = StorageInterface::get(txn)->getMinValid(txn);
        if (!boundaries.start.isNull() || boundaries.end > lastApplied) {
            fassertNoTrace(18750,
                           Status(ErrorCodes::UnrecoverableRollbackError,
                                  str::stream() << "need to rollback, but in inconsistent state. "
                                                << "minvalid: " << boundaries.end.toString()
                                                << " > our last optime: "
                                                << lastApplied.toString()));
        }

        _rollback(txn, source, getConnection);
        stop();
    } else if (fetcherReturnStatus == ErrorCodes::InvalidBSON) {
        Seconds blacklistDuration(60);
        warning() << "Fetcher got invalid BSON while querying oplog. Blacklisting sync source "
                  << source << " for " << blacklistDuration << ".";
        _replCoord->blacklistSyncSource(source, Date_t::now() + blacklistDuration);
    } else if (!fetcherReturnStatus.isOK()) {
        warning() << "Fetcher error querying oplog: " << fetcherReturnStatus.toString();
    }
}
void RangeDeleter::doWork() {
    Client::initThreadIfNotAlready("RangeDeleter");
    Client* client = &cc();

    while (!inShutdown() && !stopRequested()) {
        string errMsg;

        RangeDeleteEntry* nextTask = NULL;

        {
            stdx::unique_lock<stdx::mutex> sl(_queueMutex);
            while (_taskQueue.empty()) {
                _taskQueueNotEmptyCV.wait_for(
                    sl, Milliseconds(kNotEmptyTimeoutMillis).toSystemDuration());

                if (stopRequested()) {
                    log() << "stopping range deleter worker" << endl;
                    return;
                }

                if (_taskQueue.empty()) {
                    // Try to check if some deletes are ready and move them to the
                    // ready queue.

                    TaskList::iterator iter = _notReadyQueue.begin();
                    while (iter != _notReadyQueue.end()) {
                        RangeDeleteEntry* entry = *iter;

                        set<CursorId> cursorsNow;
                        if (entry->options.waitForOpenCursors) {
                            auto txn = client->makeOperationContext();
                            _env->getCursorIds(txn.get(), entry->options.range.ns, &cursorsNow);
                        }

                        set<CursorId> cursorsLeft;
                        std::set_intersection(entry->cursorsToWait.begin(),
                                              entry->cursorsToWait.end(),
                                              cursorsNow.begin(),
                                              cursorsNow.end(),
                                              std::inserter(cursorsLeft, cursorsLeft.end()));

                        entry->cursorsToWait.swap(cursorsLeft);

                        if (entry->cursorsToWait.empty()) {
                            (*iter)->stats.queueEndTS = jsTime();
                            _taskQueue.push_back(*iter);
                            _taskQueueNotEmptyCV.notify_one();
                            iter = _notReadyQueue.erase(iter);
                        } else {
                            logCursorsWaiting(entry);
                            ++iter;
                        }
                    }
                }
            }

            if (stopRequested()) {
                log() << "stopping range deleter worker" << endl;
                return;
            }

            nextTask = _taskQueue.front();
            _taskQueue.pop_front();

            _deletesInProgress++;
        }

        {
            auto txn = client->makeOperationContext();
            nextTask->stats.deleteStartTS = jsTime();
            bool delResult = _env->deleteRange(
                txn.get(), *nextTask, &nextTask->stats.deletedDocCount, &errMsg);
            nextTask->stats.deleteEndTS = jsTime();

            if (delResult) {
                nextTask->stats.waitForReplStartTS = jsTime();

                if (!_waitForMajority(txn.get(), &errMsg)) {
                    warning() << "Error encountered while waiting for replication: " << errMsg;
                }

                nextTask->stats.waitForReplEndTS = jsTime();
            } else {
                warning() << "Error encountered while trying to delete range: " << redact(errMsg);
            }
        }

        {
            stdx::lock_guard<stdx::mutex> sl(_queueMutex);

            NSMinMax setEntry(nextTask->options.range.ns,
                              nextTask->options.range.minKey,
                              nextTask->options.range.maxKey);
            deletePtrElement(&_deleteSet, &setEntry);
            _deletesInProgress--;

            if (nextTask->doneSignal) {
                nextTask->doneSignal->set();
            }
        }

        recordDelStats(new DeleteJobStats(nextTask->stats));
        delete nextTask;
        nextTask = NULL;
    }
}
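// RangeDeleter::doWork() waits on a condition variable with a timeout and re-checks its
// stop flag after every wakeup, which is the standard way to make a queue worker
// stoppable (including against spurious wakeups). A minimal sketch of that wait/check
// pattern, with WorkQueue as a hypothetical stand-in:
#include <chrono>
#include <condition_variable>
#include <deque>
#include <mutex>

struct WorkQueue {
    std::deque<int> tasks;
    std::mutex mu;
    std::condition_variable notEmpty;
    bool stop = false;

    // Returns false when asked to stop; otherwise pops one task into 'out'.
    bool waitForTask(int& out) {
        std::unique_lock<std::mutex> lk(mu);
        while (tasks.empty()) {
            notEmpty.wait_for(lk, std::chrono::milliseconds(200));
            if (stop)
                return false; // checked after each wakeup, timeouts and spurious ones alike
        }
        out = tasks.front();
        tasks.pop_front();
        return true;
    }
};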
void _insert( Request& r , DbMessage& d, ChunkManagerPtr manager,
              vector<BSONObj>& insertsRemaining,
              map<ChunkPtr, vector<BSONObj> > insertsForChunks,
              int retries = 0 )
{
    uassert( 16055, str::stream() << "too many retries during bulk insert, "
                                  << insertsRemaining.size() << " inserts remaining",
             retries < 30 );
    uassert( 16056, str::stream() << "shutting down server during bulk insert, "
                                  << insertsRemaining.size() << " inserts remaining",
             ! inShutdown() );

    const int flags = d.reservedField() | InsertOption_ContinueOnError; // ContinueOnError is always on when using sharding.

    _groupInserts( manager, insertsRemaining, insertsForChunks );

    while( ! insertsForChunks.empty() ){

        ChunkPtr c = insertsForChunks.begin()->first;
        vector<BSONObj>& objs = insertsForChunks.begin()->second;

        const Shard& shard = c->getShard();
        const string& ns = r.getns();

        ShardConnection dbcon( shard, ns, manager );

        try {

            LOG(4) << " server:" << c->getShard().toString() << " bulk insert " << objs.size() << " documents" << endl;

            // Taken from single-shard bulk insert, should not need multiple methods in future
            // insert( c->getShard() , r.getns() , objs , flags);

            // It's okay if the version is set here, an exception will be thrown if the version is incompatible
            dbcon.setVersion();

            dbcon->insert( ns , objs , flags);
            // TODO: Option for safe inserts here - can then use this for all inserts

            dbcon.done();

            int bytesWritten = 0;
            for (vector<BSONObj>::iterator vecIt = objs.begin(); vecIt != objs.end(); ++vecIt) {
                r.gotInsert(); // Record the correct number of individual inserts
                bytesWritten += (*vecIt).objsize();
            }

            // TODO: The only reason we're grouping by chunks here is for auto-split, more efficient
            // to track this separately and bulk insert to shards
            if ( r.getClientInfo()->autoSplitOk() )
                c->splitIfShould( bytesWritten );

        }
        catch ( StaleConfigException& e ) {
            // Cleanup the connection
            dbcon.done();

            // Assume the inserts did *not* succeed, so we don't want to erase them

            int logLevel = retries < 2;
            LOG( logLevel ) << "retrying bulk insert of " << objs.size() << " documents to chunk " << c << " because of StaleConfigException: " << e << endl;

            if( retries > 2 ){
                versionManager.forceRemoteCheckShardVersionCB( e.getns() );
            }

            // TODO: Replace with actual chunk handling code, simplify request
            r.reset();
            manager = r.getChunkManager();

            if( ! manager ) {
                // TODO : We can probably handle this better?
                uasserted( 14804, "collection no longer sharded" );
            }
            // End TODO

            // We may need to regroup at least some of our inserts since our chunk manager may have changed
            _insert( r, d, manager, insertsRemaining, insertsForChunks, retries + 1 );
            return;
        }
        catch( UserException& ){
            // Unexpected exception, so don't clean up the conn
            dbcon.kill();

            // These inserts won't be retried, as something weird happened here
            insertsForChunks.erase( insertsForChunks.begin() );

            // Throw if this is the last chunk bulk-inserted to
            if( insertsForChunks.empty() ){
                throw;
            }
        }

        insertsForChunks.erase( insertsForChunks.begin() );
    }
}
std::string NetworkInterfaceASIO::getDiagnosticString() {
    str::stream output;
    output << "NetworkInterfaceASIO";
    output << " inShutdown: " << inShutdown();
    return output;
}
void threadRun( MessagingPort * inPort) {
    TicketHolderReleaser connTicketReleaser( &Listener::globalTicketHolder );

    {
        string threadName = "conn";
        if ( inPort->connectionId() > 0 )
            threadName = str::stream() << threadName << inPort->connectionId();
        setThreadName( threadName.c_str() );
    }

    verify( inPort );
    inPort->psock->setLogLevel(1);
    scoped_ptr<MessagingPort> p( inPort );

    p->psock->postFork();

    string otherSide;

    Message m;
    try {
        LastError * le = new LastError();
        lastError.reset( le ); // lastError now has ownership

        otherSide = p->psock->remoteString();

        handler->connected( p.get() );

        while ( ! inShutdown() ) {
            m.reset();
            p->psock->clearCounters();

            if ( ! p->recv(m) ) {
                if( !cmdLine.quiet ){
                    int conns = Listener::globalTicketHolder.used()-1;
                    const char* word = (conns == 1 ? " connection" : " connections");
                    log() << "end connection " << otherSide << " (" << conns << word << " now open)" << endl;
                }
                p->shutdown();
                break;
            }

            handler->process( m , p.get() , le );
            networkCounter.hit( p->psock->getBytesIn() , p->psock->getBytesOut() );
        }
    }
    catch ( AssertionException& e ) {
        log() << "AssertionException handling request, closing client connection: " << e << endl;
        p->shutdown();
    }
    catch ( SocketException& e ) {
        log() << "SocketException handling request, closing client connection: " << e << endl;
        p->shutdown();
    }
    catch ( const DBException& e ) { // must be right above std::exception to avoid catching subclasses
        log() << "DBException handling request, closing client connection: " << e << endl;
        p->shutdown();
    }
    catch ( std::exception &e ) {
        error() << "Uncaught std::exception: " << e.what() << ", terminating" << endl;
        dbexit( EXIT_UNCAUGHT );
    }
    catch ( ... ) {
        error() << "Uncaught exception, terminating" << endl;
        dbexit( EXIT_UNCAUGHT );
    }

    handler->disconnected( p.get() );
}
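// threadRun() wraps its receive loop in a ladder of catch blocks so a bad request closes
// only that connection, while an unknown exception takes the process down. A schematic
// sketch of that exception boundary; Connection and handle() are hypothetical stand-ins,
// and std::abort() stands in for dbexit(EXIT_UNCAUGHT):
#include <cstdlib>
#include <stdexcept>

struct Connection {
    bool recv() { return false; } // hypothetical: false means the peer closed
    void shutdown() {}
};
inline void handle(Connection&) {} // hypothetical per-message handler

void connectionLoop(Connection& c, const bool& inShutdownFlag) {
    try {
        while (!inShutdownFlag) {
            if (!c.recv())
                break;  // peer closed: normal end of this connection
            handle(c);
        }
    }
    catch (const std::runtime_error&) {
        c.shutdown();   // per-request failure: close only this connection
    }
    catch (...) {
        std::abort();   // unknown state: terminate rather than continue corrupted
    }
}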
Status NetworkInterfaceASIO::startCommand(const TaskExecutor::CallbackHandle& cbHandle,
                                          RemoteCommandRequest& request,
                                          const RemoteCommandCompletionFn& onFinish) {
    MONGO_ASIO_INVARIANT(onFinish, "Invalid completion function");
    {
        stdx::lock_guard<stdx::mutex> lk(_inProgressMutex);
        const auto insertResult = _inGetConnection.emplace(cbHandle);
        // We should never see the same CallbackHandle added twice
        MONGO_ASIO_INVARIANT_INLOCK(insertResult.second, "Same CallbackHandle added twice");
    }

    if (inShutdown()) {
        return {ErrorCodes::ShutdownInProgress, "NetworkInterfaceASIO shutdown in progress"};
    }

    LOG(2) << "startCommand: " << redact(request.toString());

    auto getConnectionStartTime = now();

    auto statusMetadata = attachMetadataIfNeeded(request, _metadataHook.get());
    if (!statusMetadata.isOK()) {
        return statusMetadata;
    }

    auto nextStep = [this, getConnectionStartTime, cbHandle, request, onFinish](
        StatusWith<ConnectionPool::ConnectionHandle> swConn) {

        if (!swConn.isOK()) {
            LOG(2) << "Failed to get connection from pool for request " << request.id << ": "
                   << swConn.getStatus();

            bool wasPreviouslyCanceled = false;
            {
                stdx::lock_guard<stdx::mutex> lk(_inProgressMutex);
                wasPreviouslyCanceled = _inGetConnection.erase(cbHandle) == 0;
            }

            onFinish(wasPreviouslyCanceled
                         ? Status(ErrorCodes::CallbackCanceled, "Callback canceled")
                         : swConn.getStatus());
            signalWorkAvailable();
            return;
        }

        auto conn = static_cast<connection_pool_asio::ASIOConnection*>(swConn.getValue().get());

        AsyncOp* op = nullptr;

        stdx::unique_lock<stdx::mutex> lk(_inProgressMutex);

        const auto eraseCount = _inGetConnection.erase(cbHandle);

        // If we didn't find the request, we've been canceled
        if (eraseCount == 0) {
            lk.unlock();

            onFinish({ErrorCodes::CallbackCanceled, "Callback canceled"});

            // Though we were canceled, we know that the stream is fine, so indicate success.
            conn->indicateSuccess();

            signalWorkAvailable();

            return;
        }

        // We can't release the AsyncOp until we know we were not canceled.
        auto ownedOp = conn->releaseAsyncOp();
        op = ownedOp.get();

        // This AsyncOp may be recycled. We expect timeout and canceled to be clean.
        // If this op was most recently used to connect, its state transitions won't have been
        // reset, so we do that here.
        MONGO_ASIO_INVARIANT_INLOCK(!op->canceled(), "AsyncOp has dirty canceled flag", op);
        MONGO_ASIO_INVARIANT_INLOCK(!op->timedOut(), "AsyncOp has dirty timeout flag", op);
        op->clearStateTransitions();

        // Now that we're inProgress, an external cancel can touch our op, but
        // not until we release the inProgressMutex.
        _inProgress.emplace(op, std::move(ownedOp));

        op->_cbHandle = std::move(cbHandle);
        op->_request = std::move(request);
        op->_onFinish = std::move(onFinish);
        op->_connectionPoolHandle = std::move(swConn.getValue());
        op->startProgress(getConnectionStartTime);

        // This ditches the lock and gets us onto the strand (so we're
        // threadsafe)
        op->_strand.post([this, op, getConnectionStartTime] {
            // Set timeout now that we have the correct request object
            if (op->_request.timeout != RemoteCommandRequest::kNoTimeout) {
                // Subtract the time it took to get the connection from the pool from the request
                // timeout.
                auto getConnectionDuration = now() - getConnectionStartTime;
                if (getConnectionDuration >= op->_request.timeout) {
                    // We only assume that the request timer is guaranteed to fire *after* the
                    // timeout duration - but make no stronger assumption. It is thus possible that
                    // we have already exceeded the timeout. In this case we timeout the operation
                    // manually.
                    std::stringstream msg;
                    msg << "Remote command timed out while waiting to get a connection from the "
                        << "pool, took " << getConnectionDuration;
                    return _completeOperation(op, {ErrorCodes::ExceededTimeLimit, msg.str()});
                }

                // The above conditional guarantees that the adjusted timeout will never underflow.
                MONGO_ASIO_INVARIANT(
                    op->_request.timeout > getConnectionDuration, "timeout underflowed", op);
                const auto adjustedTimeout = op->_request.timeout - getConnectionDuration;
                const auto requestId = op->_request.id;

                op->_timeoutAlarm = op->_owner->_timerFactory->make(&op->_strand, adjustedTimeout);

                std::shared_ptr<AsyncOp::AccessControl> access;
                std::size_t generation;
                {
                    stdx::lock_guard<stdx::mutex> lk(op->_access->mutex);
                    access = op->_access;
                    generation = access->id;
                }

                op->_timeoutAlarm->asyncWait(
                    [this, op, access, generation, requestId, adjustedTimeout](std::error_code ec) {
                        // We must pass a check for safe access before using op inside the
                        // callback or we may attempt access on an invalid pointer.
                        stdx::lock_guard<stdx::mutex> lk(access->mutex);
                        if (generation != access->id) {
                            // The operation has been cleaned up, do not access.
                            return;
                        }

                        if (!ec) {
                            LOG(2) << "Request " << requestId << " timed out"
                                   << ", adjusted timeout after getting connection from pool was "
                                   << adjustedTimeout << ", op was " << redact(op->toString());

                            op->timeOut_inlock();
                        } else {
                            LOG(2) << "Failed to time request " << requestId
                                   << "out: " << ec.message() << ", op was "
                                   << redact(op->toString());
                        }
                    });
            }

            _beginCommunication(op);
        });
    };

    _connectionPool.get(request.target, request.timeout, nextStep);

    return Status::OK();
}
bool receivedGetMore(DbResponse& dbresponse, Message& m, CurOp& curop ) {
    bool ok = true;

    DbMessage d(m);

    const char *ns = d.getns();
    int ntoreturn = d.pullInt();
    long long cursorid = d.pullInt64();

    curop.debug().ns = ns;
    curop.debug().ntoreturn = ntoreturn;
    curop.debug().cursorid = cursorid;

    shared_ptr<AssertionException> ex;
    scoped_ptr<Timer> timer;
    int pass = 0;
    bool exhaust = false;
    QueryResult* msgdata = 0;
    OpTime last;
    while( 1 ) {
        try {
            if (str::startsWith(ns, "local.oplog.")){
                if (pass == 0) {
                    mutex::scoped_lock lk(OpTime::m);
                    last = OpTime::getLast(lk);
                }
                else {
                    last.waitForDifferent(1000/*ms*/);
                }
            }

            Client::ReadContext ctx(ns);

            // call this readlocked so state can't change
            replVerifyReadsOk();
            msgdata = processGetMore(ns, ntoreturn, cursorid, curop, pass, exhaust);
        }
        catch ( AssertionException& e ) {
            ex.reset( new AssertionException( e.getInfo().msg, e.getCode() ) );
            ok = false;
            break;
        }

        if (msgdata == 0) {
            // this should only happen with QueryOption_AwaitData
            exhaust = false;
            massert(13073, "shutting down", !inShutdown() );
            if ( ! timer ) {
                timer.reset( new Timer() );
            }
            else {
                if ( timer->seconds() >= 4 ) {
                    // after about 4 seconds, return. pass stops at 1000 normally.
                    // we want to return occasionally so slave can checkpoint.
                    pass = 10000;
                }
            }
            pass++;
            if (debug)
                sleepmillis(20);
            else
                sleepmillis(2);

            // note: the 1100 is because of the waitForDifferent above
            // should eventually clean this up a bit
            curop.setExpectedLatencyMs( 1100 + timer->millis() );

            continue;
        }
        break;
    }

    if (ex) {
        exhaust = false;

        BSONObjBuilder err;
        ex->getInfo().append( err );
        BSONObj errObj = err.done();

        log() << errObj << endl;

        curop.debug().exceptionInfo = ex->getInfo();

        if (ex->getCode() == 13436) {
            replyToQuery(ResultFlag_ErrSet, m, dbresponse, errObj);
            curop.debug().responseLength = dbresponse.response->header()->dataLen();
            curop.debug().nreturned = 1;
            return ok;
        }

        msgdata = emptyMoreResult(cursorid);
    }

    Message *resp = new Message();
    resp->setData(msgdata, true);
    curop.debug().responseLength = resp->header()->dataLen();
    curop.debug().nreturned = msgdata->nReturned;

    dbresponse.response = resp;
    dbresponse.responseTo = m.header()->id;

    if( exhaust ) {
        curop.debug().exhaust = true;
        dbresponse.exhaust = ns;
    }

    return ok;
}
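// receivedGetMore() polls for new data with a pass counter and roughly a 4-second cap
// before returning an empty batch, so a waiting client gets periodic responses and can
// checkpoint. A compact sketch of that bounded-await idea; fetch() and the int payload
// are hypothetical stand-ins:
#include <chrono>
#include <functional>
#include <optional>
#include <thread>

std::optional<int> awaitData(const std::function<std::optional<int>()>& fetch) {
    auto deadline = std::chrono::steady_clock::now() + std::chrono::seconds(4);
    while (std::chrono::steady_clock::now() < deadline) {
        if (auto v = fetch())
            return v;                                              // data arrived
        std::this_thread::sleep_for(std::chrono::milliseconds(2)); // brief poll interval
    }
    return std::nullopt; // timed out: hand back an empty result so the caller can checkpoint
}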
/**
 * Handles incoming messages from a given socket.
 *
 * Terminating conditions:
 * 1. Assertions while handling the request.
 * 2. Socket is closed.
 * 3. Server is shutting down (based on inShutdown)
 *
 * @param arg this method is in charge of cleaning up the arg object.
 *
 * @return NULL
 */
static void* handleIncomingMsg(void* arg) {
    TicketHolderReleaser connTicketReleaser( &Listener::globalTicketHolder );

    scoped_ptr<HandleIncomingMsgParam> himArg(static_cast<HandleIncomingMsgParam*>(arg));
    MessagingPort* inPort = himArg->inPort;
    MessageHandler* handler = himArg->handler;

    {
        string threadName = "conn";
        if ( inPort->connectionId() > 0 )
            threadName = str::stream() << threadName << inPort->connectionId();
        setThreadName( threadName.c_str() );
    }

    verify( inPort );
    inPort->psock->setLogLevel(logger::LogSeverity::Debug(1));
    scoped_ptr<MessagingPort> p( inPort );

    string otherSide;

    Message m;
    try {
        LastError * le = new LastError();
        lastError.reset( le ); // lastError now has ownership

        otherSide = p->psock->remoteString();

        handler->connected( p.get() );

        while ( ! inShutdown() ) {
            m.reset();
            p->psock->clearCounters();

            if ( ! p->recv(m) ) {
                if (!serverGlobalParams.quiet) {
                    int conns = Listener::globalTicketHolder.used()-1;
                    const char* word = (conns == 1 ? " connection" : " connections");
                    log() << "end connection " << otherSide << " (" << conns << word << " now open)" << endl;
                }
                p->shutdown();
                break;
            }

            handler->process( m , p.get() , le );
            networkCounter.hit( p->psock->getBytesIn() , p->psock->getBytesOut() );
        }
    }
    catch ( AssertionException& e ) {
        log() << "AssertionException handling request, closing client connection: " << e << endl;
        p->shutdown();
    }
    catch ( SocketException& e ) {
        log() << "SocketException handling request, closing client connection: " << e << endl;
        p->shutdown();
    }
    catch ( const DBException& e ) { // must be right above std::exception to avoid catching subclasses
        log() << "DBException handling request, closing client connection: " << e << endl;
        p->shutdown();
    }
    catch ( std::exception &e ) {
        error() << "Uncaught std::exception: " << e.what() << ", terminating" << endl;
        dbexit( EXIT_UNCAUGHT );
    }

    // Normal disconnect path.
#ifdef MONGO_SSL
    SSLManagerInterface* manager = getSSLManager();
    if (manager)
        manager->cleanupThreadLocals();
#endif
    handler->disconnected( p.get() );

    return NULL;
}
void Listener::initAndListen() {
    if (!_setupSocketsSuccessful) {
        return;
    }

#ifdef MONGO_SSL
    _logListen(_port, _ssl);
#else
    _logListen(_port, false);
#endif

    OwnedPointerVector<EventHolder> eventHolders;
    boost::scoped_array<WSAEVENT> events(new WSAEVENT[_socks.size()]);

    // Populate events array with an event for each socket we are watching
    for (size_t count = 0; count < _socks.size(); ++count) {
        EventHolder* ev(new EventHolder);
        eventHolders.mutableVector().push_back(ev);
        events[count] = ev->get();
    }

    while ( ! inShutdown() ) {
        // Turn on listening for accept-ready sockets
        for (size_t count = 0; count < _socks.size(); ++count) {
            int status = WSAEventSelect(_socks[count], events[count], FD_ACCEPT | FD_CLOSE);
            if (status == SOCKET_ERROR) {
                const int mongo_errno = WSAGetLastError();

                // During shutdown, we may fail to listen on the socket if it has already
                // been closed
                if (inShutdown()) {
                    return;
                }

                error() << "Windows WSAEventSelect returned "
                        << errnoWithDescription(mongo_errno) << endl;
                fassertFailed(16727);
            }
        }

        // Wait till one of them goes active, or we time out
        DWORD result = WSAWaitForMultipleEvents(_socks.size(),
                                                events.get(),
                                                FALSE,  // don't wait for all the events
                                                10,     // timeout, in ms
                                                FALSE); // do not allow I/O interruptions

        if (result == WSA_WAIT_FAILED) {
            const int mongo_errno = WSAGetLastError();
            error() << "Windows WSAWaitForMultipleEvents returned "
                    << errnoWithDescription(mongo_errno) << endl;
            fassertFailed(16723);
        }

        if (result == WSA_WAIT_TIMEOUT) {
            _elapsedTime += 10;
            continue;
        }
        _elapsedTime += 1; // assume 1ms to grab connection. very rough

        // Determine which socket is ready
        DWORD eventIndex = result - WSA_WAIT_EVENT_0;
        WSANETWORKEVENTS networkEvents;
        // Extract event details, and clear event for next pass
        int status = WSAEnumNetworkEvents(_socks[eventIndex], events[eventIndex], &networkEvents);
        if (status == SOCKET_ERROR) {
            const int mongo_errno = WSAGetLastError();
            error() << "Windows WSAEnumNetworkEvents returned "
                    << errnoWithDescription(mongo_errno) << endl;
            continue;
        }

        if (networkEvents.lNetworkEvents & FD_CLOSE) {
            log() << "listen socket closed" << endl;
            break;
        }

        if (!(networkEvents.lNetworkEvents & FD_ACCEPT)) {
            error() << "Unexpected network event: " << networkEvents.lNetworkEvents << endl;
            continue;
        }

        int iec = networkEvents.iErrorCode[FD_ACCEPT_BIT];
        if (iec != 0) {
            error() << "Windows socket accept did not work:" << errnoWithDescription(iec) << endl;
            continue;
        }

        status = WSAEventSelect(_socks[eventIndex], NULL, 0);
        if (status == SOCKET_ERROR) {
            const int mongo_errno = WSAGetLastError();
            error() << "Windows WSAEventSelect returned "
                    << errnoWithDescription(mongo_errno) << endl;
            continue;
        }

        disableNonblockingMode(_socks[eventIndex]);

        SockAddr from;
        int s = accept(_socks[eventIndex], from.raw(), &from.addressSize);
        if ( s < 0 ) {
            int x = errno; // so no global issues

            if (x == EBADF) {
                log() << "Port " << _port << " is no longer valid" << endl;
                continue;
            }
            else if (x == ECONNABORTED) {
                log() << "Listener on port " << _port << " aborted" << endl;
                continue;
            }
            if ( x == 0 && inShutdown() ) {
                return; // socket closed
            }

            if( !inShutdown() ) {
                log() << "Listener: accept() returns " << s << " " << errnoWithDescription(x) << endl;
                if (x == EMFILE || x == ENFILE) {
                    // Connection still in listen queue but we can't accept it yet
                    error() << "Out of file descriptors. Waiting one second before"
                               " trying to accept more connections." << warnings;
                    sleepsecs(1);
                }
            }
            continue;
        }

        if (from.getType() != AF_UNIX)
            disableNagle(s);

        long long myConnectionNumber = globalConnectionNumber.addAndFetch(1);

        if (_logConnect && !serverGlobalParams.quiet) {
            int conns = globalTicketHolder.used()+1;
            const char* word = (conns == 1 ? " connection" : " connections");
            log() << "connection accepted from " << from.toString() << " #" << myConnectionNumber << " (" << conns << word << " now open)" << endl;
        }

        boost::shared_ptr<Socket> pnewSock( new Socket(s, from) );
#ifdef MONGO_SSL
        if (_ssl) {
            pnewSock->secureAccepted(_ssl);
        }
#endif
        accepted( pnewSock , myConnectionNumber );
    }
}
void TransportLayerMock::shutdown() {
    if (!inShutdown()) {
        _shutdown = true;
        endAllSessions(Session::kEmptyTagMask);
    }
}
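// TransportLayerMock::shutdown() is idempotent: it checks its own flag before tearing
// anything down, so repeated calls are harmless. A tiny sketch of that guard; MockLayer
// is a hypothetical stand-in, and a real implementation would also need synchronization
// if shutdown() can race with other threads:
class MockLayer {
public:
    void shutdown() {
        if (_shutdown)
            return;    // second and later calls are no-ops
        _shutdown = true;
        // ... end sessions, release resources ...
    }
    bool inShutdown() const { return _shutdown; }

private:
    bool _shutdown = false;
};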
void RangeDeleter::doWork() {
    _env->initThread();

    while (!inShutdown() && !stopRequested()) {
        string errMsg;

        boost::scoped_ptr<OperationContext> txn(getGlobalEnvironment()->newOpCtx());

        RangeDeleteEntry* nextTask = NULL;

        {
            scoped_lock sl(_queueMutex);
            while (_taskQueue.empty()) {
                _taskQueueNotEmptyCV.timed_wait(sl.boost(),
                                                duration::milliseconds(kNotEmptyTimeoutMillis));

                if (stopRequested()) {
                    log() << "stopping range deleter worker" << endl;
                    return;
                }

                if (_taskQueue.empty()) {
                    // Try to check if some deletes are ready and move them to the
                    // ready queue.

                    TaskList::iterator iter = _notReadyQueue.begin();
                    while (iter != _notReadyQueue.end()) {
                        RangeDeleteEntry* entry = *iter;

                        set<CursorId> cursorsNow;
                        {
                            if (entry->options.waitForOpenCursors) {
                                _env->getCursorIds(txn.get(),
                                                   entry->options.range.ns,
                                                   &cursorsNow);
                            }
                        }

                        set<CursorId> cursorsLeft;
                        std::set_intersection(entry->cursorsToWait.begin(),
                                              entry->cursorsToWait.end(),
                                              cursorsNow.begin(),
                                              cursorsNow.end(),
                                              std::inserter(cursorsLeft, cursorsLeft.end()));

                        entry->cursorsToWait.swap(cursorsLeft);

                        if (entry->cursorsToWait.empty()) {
                            (*iter)->stats.queueEndTS = jsTime();
                            _taskQueue.push_back(*iter);
                            _taskQueueNotEmptyCV.notify_one();
                            iter = _notReadyQueue.erase(iter);
                        }
                        else {
                            logCursorsWaiting(entry);
                            ++iter;
                        }
                    }
                }
            }

            if (stopRequested()) {
                log() << "stopping range deleter worker" << endl;
                return;
            }

            nextTask = _taskQueue.front();
            _taskQueue.pop_front();

            _deletesInProgress++;
        }

        {
            nextTask->stats.deleteStartTS = jsTime();
            bool delResult = _env->deleteRange(txn.get(),
                                               *nextTask,
                                               &nextTask->stats.deletedDocCount,
                                               &errMsg);
            nextTask->stats.deleteEndTS = jsTime();

            if (delResult) {
                nextTask->stats.waitForReplStartTS = jsTime();

                if (!_waitForMajority(txn.get(), &errMsg)) {
                    warning() << "Error encountered while waiting for replication: " << errMsg;
                }

                nextTask->stats.waitForReplEndTS = jsTime();
            }
            else {
                warning() << "Error encountered while trying to delete range: "
                          << errMsg << endl;
            }
        }

        {
            scoped_lock sl(_queueMutex);

            NSMinMax setEntry(nextTask->options.range.ns,
                              nextTask->options.range.minKey,
                              nextTask->options.range.maxKey);
            deletePtrElement(&_deleteSet, &setEntry);
            _deletesInProgress--;

            if (nextTask->notifyDone) {
                nextTask->notifyDone->notifyOne();
            }
        }

        recordDelStats(new DeleteJobStats(nextTask->stats));
        delete nextTask;
        nextTask = NULL;
    }
}
void BackgroundSync::produce(OperationContext* txn) {
    // this oplog reader does not do a handshake because we don't want the server it's syncing
    // from to track how far it has synced
    {
        boost::unique_lock<boost::mutex> lock(_mutex);
        if (_lastOpTimeFetched.isNull()) {
            // then we're initial syncing and we're still waiting for this to be set
            lock.unlock();
            sleepsecs(1); // if there is no one to sync from
            return;
        }

        // Wait until we've applied the ops we have before we choose a sync target
        while (!_appliedBuffer && !inShutdown()) {
            _condvar.wait(lock);
        }
        if (inShutdown()) {
            return;
        }
    }

    while (MONGO_FAIL_POINT(rsBgSyncProduce)) {
        sleepmillis(0);
    }

    // find a target to sync from the last optime fetched
    OpTime lastOpTimeFetched;
    {
        boost::unique_lock<boost::mutex> lock(_mutex);
        lastOpTimeFetched = _lastOpTimeFetched;
        _syncSourceHost = HostAndPort();
    }
    _syncSourceReader.resetConnection();
    _syncSourceReader.connectToSyncSource(txn, lastOpTimeFetched, _replCoord);

    {
        boost::unique_lock<boost::mutex> lock(_mutex);
        // no server found
        if (_syncSourceReader.getHost().empty()) {
            lock.unlock();
            sleepsecs(1); // if there is no one to sync from
            return;
        }
        lastOpTimeFetched = _lastOpTimeFetched;
        _syncSourceHost = _syncSourceReader.getHost();
        _replCoord->signalUpstreamUpdater();
    }

    _syncSourceReader.tailingQueryGTE(rsoplog, lastOpTimeFetched);

    // if target cut connections between connecting and querying (for
    // example, because it stepped down) we might not have a cursor
    if (!_syncSourceReader.haveCursor()) {
        return;
    }

    if (_rollbackIfNeeded(txn, _syncSourceReader)) {
        stop();
        return;
    }

    while (!inShutdown()) {
        if (!_syncSourceReader.moreInCurrentBatch()) {
            // Check some things periodically
            // (whenever we run out of items in the
            // current cursor batch)

            int bs = _syncSourceReader.currentBatchMessageSize();
            if( bs > 0 && bs < BatchIsSmallish ) {
                // on a very low latency network, if we don't wait a little, we'll be
                // getting ops to write almost one at a time.  this will both be expensive
                // for the upstream server as well as potentially defeating our parallel
                // application of batches on the secondary.
                //
                // the inference here is basically if the batch is really small, we are
                // "caught up".
                //
                sleepmillis(SleepToAllowBatchingMillis);
            }

            // If we are transitioning to primary state, we need to leave
            // this loop in order to go into bgsync-pause mode.
            if (_replCoord->isWaitingForApplierToDrain() ||
                _replCoord->getCurrentMemberState().primary()) {
                return;
            }

            // re-evaluate quality of sync target
            if (shouldChangeSyncSource()) {
                return;
            }

            {
                // record time for each getmore
                TimerHolder batchTimer(&getmoreReplStats);

                // This calls receiveMore() on the oplogreader cursor.
                // It can wait up to five seconds for more data.
                _syncSourceReader.more();
            }
            networkByteStats.increment(_syncSourceReader.currentBatchMessageSize());

            if (!_syncSourceReader.moreInCurrentBatch()) {
                // If there is still no data from upstream, check a few more things
                // and then loop back for another pass at getting more data
                {
                    boost::unique_lock<boost::mutex> lock(_mutex);
                    if (_pause) {
                        return;
                    }
                }

                _syncSourceReader.tailCheck();
                if( !_syncSourceReader.haveCursor() ) {
                    LOG(1) << "replSet end syncTail pass";
                    return;
                }

                continue;
            }
        }

        // If we are transitioning to primary state, we need to leave
        // this loop in order to go into bgsync-pause mode.
        if (_replCoord->isWaitingForApplierToDrain() ||
            _replCoord->getCurrentMemberState().primary()) {
            LOG(1) << "waiting for draining or we are primary, not adding more ops to buffer";
            return;
        }

        // At this point, we are guaranteed to have at least one thing to read out
        // of the oplogreader cursor.
        BSONObj o = _syncSourceReader.nextSafe().getOwned();
        opsReadStats.increment();

        {
            boost::unique_lock<boost::mutex> lock(_mutex);
            _appliedBuffer = false;
        }

        OCCASIONALLY {
            LOG(2) << "bgsync buffer has " << _buffer.size() << " bytes";
        }

        bufferCountGauge.increment();
        bufferSizeGauge.increment(getSize(o));
        _buffer.push(o);

        {
            boost::unique_lock<boost::mutex> lock(_mutex);
            _lastFetchedHash = o["h"].numberLong();
            _lastOpTimeFetched = o["ts"]._opTime();
            LOG(3) << "replSet lastOpTimeFetched: " << _lastOpTimeFetched.toStringPretty();
        }
    }
}
void BackgroundSync::produce() {
    // this oplog reader does not do a handshake because we don't want the server it's syncing
    // from to track how far it has synced
    OplogReader r;
    OpTime lastOpTimeFetched;

    // find a target to sync from the last op time written
    getOplogReader(r);

    // no server found
    {
        boost::unique_lock<boost::mutex> lock(_mutex);

        if (_currentSyncTarget == NULL) {
            lock.unlock();
            sleepsecs(1); // if there is no one to sync from
            return;
        }

        lastOpTimeFetched = _lastOpTimeFetched;
    }

    r.tailingQueryGTE(rsoplog, lastOpTimeFetched);

    // if target cut connections between connecting and querying (for
    // example, because it stepped down) we might not have a cursor
    if (!r.haveCursor()) {
        return;
    }

    uassert(1000, "replSet source for syncing doesn't seem to be await capable -- is it an older version of mongodb?", r.awaitCapable() );

    if (isRollbackRequired(r)) {
        stop();
        return;
    }

    while (!inShutdown()) {
        if (!r.moreInCurrentBatch()) {
            // Check some things periodically
            // (whenever we run out of items in the
            // current cursor batch)

            int bs = r.currentBatchMessageSize();
            if( bs > 0 && bs < BatchIsSmallish ) {
                // on a very low latency network, if we don't wait a little, we'll be
                // getting ops to write almost one at a time.  this will both be expensive
                // for the upstream server as well as potentially defeating our parallel
                // application of batches on the secondary.
                //
                // the inference here is basically if the batch is really small, we are
                // "caught up".
                //
                sleepmillis(SleepToAllowBatchingMillis);
            }

            if (theReplSet->gotForceSync()) {
                return;
            }

            // If we are transitioning to primary state, we need to leave
            // this loop in order to go into bgsync-pause mode.
            if (isAssumingPrimary() || theReplSet->isPrimary()) {
                return;
            }

            // re-evaluate quality of sync target
            if (shouldChangeSyncTarget()) {
                return;
            }

            {
                // record time for each getmore
                TimerHolder batchTimer(&getmoreReplStats);

                // This calls receiveMore() on the oplogreader cursor.
                // It can wait up to five seconds for more data.
                r.more();
            }
            networkByteStats.increment(r.currentBatchMessageSize());

            if (!r.moreInCurrentBatch()) {
                // If there is still no data from upstream, check a few more things
                // and then loop back for another pass at getting more data
                {
                    boost::unique_lock<boost::mutex> lock(_mutex);
                    if (_pause || !_currentSyncTarget || !_currentSyncTarget->hbinfo().hbstate.readable()) {
                        return;
                    }
                }

                r.tailCheck();
                if( !r.haveCursor() ) {
                    LOG(1) << "replSet end syncTail pass" << rsLog;
                    return;
                }

                continue;
            }
        }

        // At this point, we are guaranteed to have at least one thing to read out
        // of the oplogreader cursor.
        BSONObj o = r.nextSafe().getOwned();
        opsReadStats.increment();

        {
            boost::unique_lock<boost::mutex> lock(_mutex);
            _appliedBuffer = false;
        }

        OCCASIONALLY {
            LOG(2) << "bgsync buffer has " << _buffer.size() << " bytes" << rsLog;
        }

        // the blocking queue will wait (forever) until there's room for us to push
        _buffer.push(o);
        bufferCountGauge.increment();
        bufferSizeGauge.increment(getSize(o));

        {
            boost::unique_lock<boost::mutex> lock(_mutex);
            _lastH = o["h"].numberLong();
            _lastOpTimeFetched = o["ts"]._opTime();
            LOG(3) << "replSet lastOpTimeFetched: " << _lastOpTimeFetched.toStringPretty() << rsLog;
        }
    }
}
void Balancer::run() {
    // this is the body of a BackgroundJob so if we throw here we're basically ending the
    // balancer thread prematurely
    while ( ! inShutdown() ) {
        if ( ! _init() ) {
            log() << "will retry to initialize balancer in one minute" << endl;
            sleepsecs( 60 );
            continue;
        }
        break;
    }

    int sleepTime = 10;

    // getConnectionString and dist lock constructor does not throw, which is what we expect
    // while on the balancer thread
    ConnectionString config = configServer.getConnectionString();
    DistributedLock balanceLock( config , "balancer" );

    while ( ! inShutdown() ) {

        try {

            ScopedDbConnection conn(config.toString(), 30);

            // ping has to be first so we keep things in the config server in sync
            _ping();

            // use fresh shard state
            Shard::reloadShardInfo();

            // refresh chunk size (even though another balancer might be active)
            Chunk::refreshChunkSize();

            SettingsType balancerConfig;
            string errMsg;

            if (!grid.getBalancerSettings(&balancerConfig, &errMsg)) {
                warning() << errMsg;
                return;
            }

            // now make sure we should even be running
            if (balancerConfig.isKeySet() && // balancer config doc exists
                    !grid.shouldBalance(balancerConfig)) {
                LOG(1) << "skipping balancing round because balancing is disabled" << endl;

                // Ping again so scripts can determine if we're active without waiting
                _ping( true );

                conn.done();

                sleepsecs( sleepTime );
                continue;
            }

            uassert( 13258 , "oids broken after resetting!" , _checkOIDs() );

            {
                dist_lock_try lk( &balanceLock , "doing balance round" );
                if ( ! lk.got() ) {
                    LOG(1) << "skipping balancing round because another balancer is active" << endl;

                    // Ping again so scripts can determine if we're active without waiting
                    _ping( true );

                    conn.done();

                    sleepsecs( sleepTime ); // no need to wake up soon
                    continue;
                }

                if ( !isConfigServerConsistent() ) {
                    conn.done();
                    warning() << "Skipping balancing round because data inconsistency"
                              << " was detected amongst the config servers." << endl;
                    sleepsecs( sleepTime );
                    continue;
                }

                const bool waitForDelete = (balancerConfig.isWaitForDeleteSet() ?
                        balancerConfig.getWaitForDelete() : false);

                scoped_ptr<WriteConcernOptions> writeConcern;
                if (balancerConfig.isKeySet()) { // if balancer doc exists.
                    StatusWith<WriteConcernOptions*> extractStatus =
                            balancerConfig.extractWriteConcern();
                    if (extractStatus.isOK()) {
                        writeConcern.reset(extractStatus.getValue());
                    }
                    else {
                        warning() << extractStatus.toString();
                    }
                }

                LOG(1) << "*** start balancing round. "
                       << "waitForDelete: " << waitForDelete
                       << ", secondaryThrottle: "
                       << (writeConcern.get() ? writeConcern->toBSON().toString() : "default")
                       << endl;

                vector<CandidateChunkPtr> candidateChunks;
                _doBalanceRound( conn.conn() , &candidateChunks );

                if ( candidateChunks.size() == 0 ) {
                    LOG(1) << "no need to move any chunk" << endl;
                    _balancedLastTime = 0;
                }
                else {
                    _balancedLastTime = _moveChunks(&candidateChunks,
                                                    writeConcern.get(),
                                                    waitForDelete );
                }

                LOG(1) << "*** end of balancing round" << endl;
            }

            // Ping again so scripts can determine if we're active without waiting
            _ping( true );

            conn.done();

            sleepsecs( _balancedLastTime ? sleepTime / 10 : sleepTime );
        }
        catch ( std::exception& e ) {
            log() << "caught exception while doing balance: " << e.what() << endl;

            // Just to match the opening statement if in log level 1
            LOG(1) << "*** End of balancing round" << endl;

            sleepsecs( sleepTime ); // sleep a fair amount b/c of error
            continue;
        }
    }
}
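// Balancer::run() retries initialization in a loop (sleeping between attempts) until it
// succeeds or the process enters shutdown. A minimal sketch of that retry gate, with the
// init callable and the shutdown flag as hypothetical stand-ins:
#include <atomic>
#include <chrono>
#include <functional>
#include <thread>

bool waitForInit(const std::function<bool()>& init, const std::atomic<bool>& shutdown) {
    while (!shutdown.load()) {
        if (init())
            return true; // initialized; proceed to the main loop
        std::this_thread::sleep_for(std::chrono::seconds(60)); // "retry ... in one minute"
    }
    return false; // process is shutting down; never initialized
}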
void _insert( Request& r , DbMessage& d, ChunkManagerPtr manager ) {
    const int flags = d.reservedField();
    bool keepGoing = flags & InsertOption_KeepGoing; // modified before assertion if should abort

    while ( d.moreJSObjs() ) {
        try {
            BSONObj o = d.nextJsObj();
            if ( ! manager->hasShardKey( o ) ) {

                bool bad = true;

                if ( manager->getShardKey().partOfShardKey( "_id" ) ) {
                    BSONObjBuilder b;
                    b.appendOID( "_id" , 0 , true );
                    b.appendElements( o );
                    o = b.obj();
                    bad = ! manager->hasShardKey( o );
                }

                if ( bad ) {
                    log() << "tried to insert object without shard key: " << r.getns() << " " << o << endl;
                    uasserted( 8011 , "tried to insert object without shard key" );
                }

            }

            // Many operations benefit from having the shard key early in the object
            o = manager->getShardKey().moveToFront(o);

            const int maxTries = 30;

            bool gotThrough = false;
            for ( int i=0; i<maxTries; i++ ) {
                try {
                    ChunkPtr c = manager->findChunk( o );
                    log(4) << " server:" << c->getShard().toString() << " " << o << endl;
                    insert( c->getShard() , r.getns() , o , flags);

                    r.gotInsert();
                    if ( r.getClientInfo()->autoSplitOk() )
                        c->splitIfShould( o.objsize() );
                    gotThrough = true;
                    break;
                }
                catch ( StaleConfigException& e ) {
                    int logLevel = i < ( maxTries / 2 );
                    LOG( logLevel ) << "retrying insert because of StaleConfigException: " << e << " object: " << o << endl;
                    r.reset();

                    unsigned long long old = manager->getSequenceNumber();
                    manager = r.getChunkManager();

                    // check for null before logging the new sequence number (originally the
                    // log line dereferenced 'manager' before this null check)
                    if (!manager) {
                        keepGoing = false;
                        uasserted(14804, "collection no longer sharded");
                    }

                    LOG( logLevel ) << " sequence number - old: " << old << " new: " << manager->getSequenceNumber() << endl;
                }
                sleepmillis( i * 20 );
            }

            assert( inShutdown() || gotThrough ); // not caught below
        }
        catch (const UserException&){
            if (!keepGoing || !d.moreJSObjs()){
                throw;
            }
            // otherwise ignore and keep going
        }
    }
}
void BackgroundSync::produce() {
    // this oplog reader does not do a handshake because we don't want the server it's syncing
    // from to track how far it has synced
    OplogReader r(false /* doHandshake */);

    // find a target to sync from the last op time written
    getOplogReader(r);

    // no server found
    {
        boost::unique_lock<boost::mutex> lock(_mutex);

        if (_currentSyncTarget == NULL) {
            lock.unlock();
            sleepsecs(1); // if there is no one to sync from
            return;
        }

        r.tailingQueryGTE(rsoplog, _lastOpTimeFetched);
    }

    // if target cut connections between connecting and querying (for
    // example, because it stepped down) we might not have a cursor
    if (!r.haveCursor()) {
        return;
    }

    uassert(1000, "replSet source for syncing doesn't seem to be await capable -- is it an older version of mongodb?", r.awaitCapable() );

    if (isRollbackRequired(r)) {
        stop();
        return;
    }

    while (!inShutdown()) {
        while (!inShutdown()) {
            if (!r.moreInCurrentBatch()) {
                if (theReplSet->gotForceSync()) {
                    return;
                }

                if (theReplSet->isPrimary()) {
                    return;
                }

                {
                    boost::unique_lock<boost::mutex> lock(_mutex);
                    if (!_currentSyncTarget || !_currentSyncTarget->hbinfo().hbstate.readable()) {
                        return;
                    }
                }

                r.more();
            }

            if (!r.more())
                break;

            BSONObj o = r.nextSafe().getOwned();

            Timer timer;
            // the blocking queue will wait (forever) until there's room for us to push
            OCCASIONALLY {
                LOG(2) << "bgsync buffer has " << _buffer.size() << " bytes" << rsLog;
            }
            _buffer.push(o);

            {
                boost::unique_lock<boost::mutex> lock(_mutex);

                // update counters
                _queueCounter.waitTime += timer.millis();
                _queueCounter.numElems++;
                _lastH = o["h"].numberLong();
                _lastOpTimeFetched = o["ts"]._opTime();
            }
        } // end while

        {
            boost::unique_lock<boost::mutex> lock(_mutex);
            if (_pause || !_currentSyncTarget || !_currentSyncTarget->hbinfo().hbstate.readable()) {
                return;
            }
        }

        r.tailCheck();
        if( !r.haveCursor() ) {
            LOG(1) << "replSet end syncTail pass" << rsLog;
            return;
        }

        // looping back is ok because this is a tailable cursor
    }
}
void BackgroundSync::_fetcherCallback(const StatusWith<Fetcher::QueryResponse>& result,
                                      BSONObjBuilder* bob,
                                      const HostAndPort& source,
                                      OpTime lastOpTimeFetched,
                                      long long lastFetchedHash,
                                      Status* remoteOplogStartStatus) {
    // if target cut connections between connecting and querying (for
    // example, because it stepped down) we might not have a cursor
    if (!result.isOK()) {
        return;
    }

    if (inShutdown()) {
        return;
    }

    // Check if we have been paused.
    if (isPaused()) {
        return;
    }

    const auto& queryResponse = result.getValue();

    // Forward metadata (containing liveness information) to replication coordinator.
    bool receivedMetadata =
        queryResponse.otherFields.metadata.hasElement(rpc::kReplSetMetadataFieldName);
    if (receivedMetadata) {
        auto metadataResult =
            rpc::ReplSetMetadata::readFromMetadata(queryResponse.otherFields.metadata);
        if (!metadataResult.isOK()) {
            error() << "invalid replication metadata from sync source " << source << ": "
                    << metadataResult.getStatus() << ": " << queryResponse.otherFields.metadata;
            return;
        }
        _replCoord->processReplSetMetadata(metadataResult.getValue());
    }

    const auto& documents = queryResponse.documents;
    auto documentBegin = documents.cbegin();
    auto documentEnd = documents.cend();

    // Check start of remote oplog and, if necessary, stop fetcher to execute rollback.
    if (queryResponse.first) {
        auto getNextOperation = [&documentBegin, documentEnd]() -> StatusWith<BSONObj> {
            if (documentBegin == documentEnd) {
                return Status(ErrorCodes::OplogStartMissing, "remote oplog start missing");
            }
            return *(documentBegin++);
        };

        *remoteOplogStartStatus =
            checkRemoteOplogStart(getNextOperation, lastOpTimeFetched, lastFetchedHash);
        if (!remoteOplogStartStatus->isOK()) {
            // Stop fetcher and execute rollback.
            return;
        }

        // If this is the first batch and no rollback is needed, we should have advanced
        // the document iterator.
        invariant(documentBegin != documents.cbegin());
    }

    // process documents
    int currentBatchMessageSize = 0;
    for (auto documentIter = documentBegin; documentIter != documentEnd; ++documentIter) {
        if (inShutdown()) {
            return;
        }

        // If we are transitioning to primary state, we need to leave
        // this loop in order to go into bgsync-pause mode.
        if (_replCoord->isWaitingForApplierToDrain() || _replCoord->getMemberState().primary()) {
            LOG(1) << "waiting for draining or we are primary, not adding more ops to buffer";
            return;
        }

        // At this point, we are guaranteed to have at least one thing to read out
        // of the fetcher.
        const BSONObj& o = *documentIter;
        currentBatchMessageSize += o.objsize();
        opsReadStats.increment();

        if (MONGO_FAIL_POINT(stepDownWhileDrainingFailPoint)) {
            sleepsecs(20);
        }

        {
            stdx::unique_lock<stdx::mutex> lock(_mutex);
            _appliedBuffer = false;
        }

        OCCASIONALLY {
            LOG(2) << "bgsync buffer has " << _buffer.size() << " bytes";
        }

        bufferCountGauge.increment();
        bufferSizeGauge.increment(getSize(o));
        _buffer.push(o);

        {
            stdx::unique_lock<stdx::mutex> lock(_mutex);
            _lastFetchedHash = o["h"].numberLong();
            _lastOpTimeFetched = fassertStatusOK(28770, OpTime::parseFromBSON(o));
            LOG(3) << "lastOpTimeFetched: " << _lastOpTimeFetched;
        }
    }

    // record time for each batch
    getmoreReplStats.recordMillis(durationCount<Milliseconds>(queryResponse.elapsedMillis));
    networkByteStats.increment(currentBatchMessageSize);

    // Check some things periodically
    // (whenever we run out of items in the current cursor batch)
    if (currentBatchMessageSize > 0 && currentBatchMessageSize < BatchIsSmallish) {
        // on a very low latency network, if we don't wait a little, we'll be
        // getting ops to write almost one at a time.  this will both be expensive
        // for the upstream server as well as potentially defeating our parallel
        // application of batches on the secondary.
        //
        // the inference here is basically if the batch is really small, we are
        // "caught up".
        sleepmillis(SleepToAllowBatchingMillis);
    }

    // If we are transitioning to primary state, we need to leave
    // this loop in order to go into bgsync-pause mode.
    if (_replCoord->isWaitingForApplierToDrain() || _replCoord->getMemberState().primary()) {
        return;
    }

    // re-evaluate quality of sync target
    if (_shouldChangeSyncSource(source)) {
        return;
    }

    // Check if we have been paused.
    if (isPaused()) {
        return;
    }

    // We fill in 'bob' to signal the fetcher to process with another getMore.
    invariant(bob);
    bob->append("getMore", queryResponse.cursorId);
    bob->append("collection", queryResponse.nss.coll());
    bob->append("maxTimeMS", durationCount<Milliseconds>(fetcherMaxTimeMS));
    if (receivedMetadata) {
        bob->append("term", _replCoord->getTerm());
    }
}
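// The first-batch check above hands checkRemoteOplogStart a lambda that yields
// the leading remote document; if that document doesn't line up with the last
// (optime, hash) pair we fetched, our oplog has diverged and the caller rolls
// back. A toy model of that decision with simplified stand-in types (not the
// tree's OpTime/Status, and only an assumed reading of the real check):
struct SimpleOpTime { long long ts; long long term; };

enum class StartCheck { Ok, OplogStartMissing, Diverged };

StartCheck checkRemoteOplogStartSketch(bool haveFirstDoc,
                                       SimpleOpTime firstRemoteOpTime,
                                       long long firstRemoteHash,
                                       SimpleOpTime lastOpTimeFetched,
                                       long long lastFetchedHash) {
    if (!haveFirstDoc) {
        // Mirrors the OplogStartMissing status returned by the lambda above.
        return StartCheck::OplogStartMissing;
    }
    bool sameOpTime = firstRemoteOpTime.ts == lastOpTimeFetched.ts &&
                      firstRemoteOpTime.term == lastOpTimeFetched.term;
    if (!sameOpTime || firstRemoteHash != lastFetchedHash) {
        // Local oplog diverged from the sync source: stop the fetcher and
        // execute rollback, as the comment in _fetcherCallback says.
        return StartCheck::Diverged;
    }
    return StartCheck::Ok;
}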
/* tail an oplog.  ok to return, will be re-called. */
void SyncTail::oplogApplication() {
    OpQueueBatcher batcher(this);

    OperationContextImpl txn;
    auto replCoord = ReplicationCoordinator::get(&txn);
    ApplyBatchFinalizer finalizer(replCoord);

    OpTime originalEndOpTime(getMinValid(&txn).end);
    while (!inShutdown()) {
        OpQueue ops;

        do {
            if (BackgroundSync::get()->getInitialSyncRequestedFlag()) {
                // got a resync command
                return;
            }

            tryToGoLiveAsASecondary(&txn, replCoord);

            // Blocks up to a second waiting for a batch to be ready to apply. If one doesn't
            // become ready in time, we'll loop again so we can do the above checks periodically.
            ops = batcher.getNextBatch(Seconds(1));
        } while (!inShutdown() && ops.empty());

        if (inShutdown())
            return;

        invariant(!ops.empty());

        const BSONObj lastOp = ops.back().raw;

        if (lastOp.isEmpty()) {
            // This means that the network thread has coalesced and we have processed all of its
            // data.
            invariant(ops.getDeque().size() == 1);
            if (replCoord->isWaitingForApplierToDrain()) {
                replCoord->signalDrainComplete(&txn);
            }
            continue;  // This wasn't a real op. Don't try to apply it.
        }

        handleSlaveDelay(lastOp);

        // Set minValid to the last OpTime that needs to be applied, in this batch or from the
        // (last) failed batch, whichever is larger.
        // This will cause this node to go into RECOVERING state
        // if we crash and restart before the update finishes.
        const OpTime start(getLastSetTimestamp(), OpTime::kUninitializedTerm);

        // Take the max of the first endOpTime (if we recovered) and the end of our batch.
        const auto lastOpTime = fassertStatusOK(28773, OpTime::parseFromOplogEntry(lastOp));

        // Setting end to the max of originalEndOpTime and lastOpTime (the end of the batch)
        // ensures that we keep pushing out the point where we can become consistent
        // and allow reads. If we recover and end up doing smaller batches we must pass the
        // originalEndOpTime before we are good.
        //
        // For example:
        // batch apply, 20-40, end = 40
        // batch failure,
        // restart
        // batch apply, 20-25, end = max(25, 40) = 40
        // batch apply, 25-45, end = 45
        const OpTime end(std::max(originalEndOpTime, lastOpTime));

        // This write will not journal/checkpoint.
        setMinValid(&txn, {start, end});

        OpTime finalOpTime = multiApply(&txn, ops);
        setNewTimestamp(finalOpTime.getTimestamp());

        setMinValid(&txn, end, DurableRequirement::None);
        finalizer.record(finalOpTime);
    }
}
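// The minValid bookkeeping above is subtle, so here is a toy model of the
// invariant with plain integers standing in for OpTimes. It reproduces the
// worked example from the comment: the "end" watermark can only move forward,
// even when recovery re-applies in smaller batches.
#include <algorithm>
#include <cassert>

struct MinValid { long long start; long long end; };

MinValid beforeBatch(long long batchStart, long long batchEnd, long long recoveredEnd) {
    // Matches: end = max(originalEndOpTime, lastOpTime).
    return MinValid{batchStart, std::max(recoveredEnd, batchEnd)};
}

int main() {
    MinValid mv = beforeBatch(20, 40, 0);  // batch apply 20-40
    assert(mv.end == 40);
    mv = beforeBatch(20, 25, mv.end);      // crash, restart, smaller batch 20-25
    assert(mv.end == 40);                  // still must reach 40 to be consistent
    mv = beforeBatch(25, 45, mv.end);      // batch apply 25-45
    assert(mv.end == 45);
    return 0;
}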
void Balancer::run() {
    // this is the body of a BackgroundJob so if we throw here we're basically ending the
    // balancer thread prematurely
    while ( ! inShutdown() ) {
        if ( ! _init() ) {
            log() << "will retry to initialize balancer in one minute" << endl;
            sleepsecs( 60 );
            continue;
        }
        break;
    }

    // getConnectionString and the dist lock constructor do not throw, which is what we expect
    // while on the balancer thread
    ConnectionString config = configServer.getConnectionString();
    DistributedLock balanceLock( config , "balancer" );

    while ( ! inShutdown() ) {
        try {
            ScopedDbConnection conn( config );

            // ping has to be first so we keep things in the config server in sync
            _ping( conn.conn() );

            // now make sure we should even be running
            if ( ! grid.shouldBalance() ) {
                LOG(1) << "skipping balancing round because balancing is disabled" << endl;
                conn.done();
                sleepsecs( 30 );
                continue;
            }

            uassert( 13258 , "oids broken after resetting!" , _checkOIDs() );

            // use fresh shard state
            Shard::reloadShardInfo();

            {
                dist_lock_try lk( &balanceLock , "doing balance round" );
                if ( ! lk.got() ) {
                    LOG(1) << "skipping balancing round because another balancer is active" << endl;
                    conn.done();
                    sleepsecs( 30 ); // no need to wake up soon
                    continue;
                }

                LOG(1) << "*** start balancing round" << endl;

                vector<CandidateChunkPtr> candidateChunks;
                _doBalanceRound( conn.conn() , &candidateChunks );
                if ( candidateChunks.size() == 0 ) {
                    LOG(1) << "no need to move any chunk" << endl;
                }
                else {
                    _balancedLastTime = _moveChunks( &candidateChunks );
                }

                LOG(1) << "*** end of balancing round" << endl;
            }

            conn.done();

            sleepsecs( _balancedLastTime ? 5 : 10 );
        }
        catch ( std::exception& e ) {
            log() << "caught exception while doing balance: " << e.what() << endl;

            // Just to match the opening statement if in log level 1
            LOG(1) << "*** End of balancing round" << endl;

            sleepsecs( 30 ); // sleep a fair amount b/c of error
            continue;
        }
    }
}
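// The round above is guarded by dist_lock_try: acquire on construction,
// report success via got(), release on destruction, so the lock cannot leak
// past the scope even when a continue or exception exits early. A local-mutex
// sketch of that RAII shape (the real class takes a distributed lock through
// the config servers; this only models the ownership pattern):
#include <mutex>

class ScopedTryLock {
public:
    explicit ScopedTryLock(std::mutex* m) : _m(m), _got(_m->try_lock()) {}
    ~ScopedTryLock() {
        if (_got)
            _m->unlock();
    }
    bool got() const { return _got; }

    ScopedTryLock(const ScopedTryLock&) = delete;
    ScopedTryLock& operator=(const ScopedTryLock&) = delete;

private:
    std::mutex* _m;
    bool _got;  // true only if we own the lock and must release it
};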
void SyncSourceFeedback::run() {
    Client::initThread("SyncSourceFeedbackThread");
    OperationContextImpl txn;

    bool positionChanged = false;
    bool handshakeNeeded = false;
    ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator();
    while (!inShutdown()) {
        // TODO(spencer): Remove once legacy repl coordinator is gone.
        {
            boost::unique_lock<boost::mutex> lock(_mtx);
            while (!_positionChanged && !_handshakeNeeded && !_shutdownSignaled) {
                _cond.wait(lock);
            }

            if (_shutdownSignaled) {
                break;
            }

            positionChanged = _positionChanged;
            handshakeNeeded = _handshakeNeeded;
            _positionChanged = false;
            _handshakeNeeded = false;
        }

        MemberState state = replCoord->getCurrentMemberState();
        if (state.primary() || state.fatal() || state.startup()) {
            _resetConnection();
            continue;
        }

        const Member* target = BackgroundSync::get()->getSyncTarget();
        if (_syncTarget != target) {
            _resetConnection();
            _syncTarget = target;
        }

        if (!hasConnection()) {
            // fix connection if need be
            if (!target) {
                sleepmillis(500);
                continue;
            }
            if (!_connect(&txn, target->h())) {
                sleepmillis(500);
                continue;
            }
            handshakeNeeded = true;
        }

        if (handshakeNeeded) {
            if (!replHandshake(&txn)) {
                boost::unique_lock<boost::mutex> lock(_mtx);
                _handshakeNeeded = true;
                continue;
            }
        }

        if (positionChanged) {
            if (!updateUpstream(&txn)) {
                boost::unique_lock<boost::mutex> lock(_mtx);
                _positionChanged = true;
            }
        }
    }
    cc().shutdown();
}
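// The feedback thread above follows a wait/snapshot/clear pattern: sleep on a
// condition variable until a flag is raised, copy and clear the flags under
// the lock, act on the copies outside it, and re-raise a flag if the action
// fails so the next pass retries. A minimal sketch of that pattern with
// standard primitives; all names here are illustrative.
#include <condition_variable>
#include <mutex>

class FlagWaiter {
public:
    void signalPositionChanged() {
        std::lock_guard<std::mutex> lk(_mtx);
        _positionChanged = true;
        _cond.notify_one();
    }

    void runOnce() {
        bool positionChanged;
        {
            std::unique_lock<std::mutex> lk(_mtx);
            _cond.wait(lk, [this] { return _positionChanged || _shutdown; });
            if (_shutdown)
                return;
            positionChanged = _positionChanged;
            _positionChanged = false;  // clear before releasing the lock
        }
        if (positionChanged && !tryReportPosition()) {
            // Failed: re-raise the flag so the next pass retries, exactly as
            // the feedback thread does with _positionChanged above.
            std::lock_guard<std::mutex> lk(_mtx);
            _positionChanged = true;
        }
    }

private:
    bool tryReportPosition() { return true; }  // stand-in for updateUpstream

    std::mutex _mtx;
    std::condition_variable _cond;
    bool _positionChanged = false;
    bool _shutdown = false;
};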
void WriteBackListener::run() {
    int secsToSleep = 0;
    scoped_ptr<ChunkVersion> lastNeededVersion;
    int lastNeededCount = 0;
    bool needsToReloadShardInfo = false;

    while ( ! inShutdown() ) {

        if ( ! Shard::isAShardNode( _addr ) ) {
            LOG(1) << _addr << " is not a shard node" << endl;
            sleepsecs( 60 );
            continue;
        }

        try {
            if (needsToReloadShardInfo) {
                // It's possible this shard was removed
                Shard::reloadShardInfo();
                needsToReloadShardInfo = false;
            }

            ScopedDbConnection conn(_addr);

            BSONObj result;

            {
                BSONObjBuilder cmd;
                cmd.appendOID( "writebacklisten" , &serverID ); // Command will block for data
                if ( ! conn->runCommand( "admin" , cmd.obj() , result ) ) {
                    result = result.getOwned();
                    log() << "writebacklisten command failed!  " << result << endl;
                    conn.done();
                    continue;
                }
            }
            conn.done();

            LOG(1) << "writebacklisten result: " << result << endl;

            BSONObj data = result.getObjectField( "data" );
            if ( data.getBoolField( "writeBack" ) ) {
                string ns = data["ns"].valuestrsafe();

                ConnectionIdent cid( "" , 0 );
                OID wid;
                if ( data["connectionId"].isNumber() && data["id"].type() == jstOID ) {
                    string s = "";
                    if ( data["instanceIdent"].type() == String )
                        s = data["instanceIdent"].String();
                    cid = ConnectionIdent( s , data["connectionId"].numberLong() );
                    wid = data["id"].OID();
                }
                else {
                    warning() << "mongos/mongod version mismatch (1.7.5 is the split)" << endl;
                }

                int len; // not used, but needed for next call
                Message msg( (void*)data["msg"].binData( len ) , false );
                massert( 10427 , "invalid writeback message" , msg.header()->valid() );

                DBConfigPtr db = grid.getDBConfig( ns );
                ChunkVersion needVersion = ChunkVersion::fromBSON( data, "version" );

                //
                // TODO: Refactor the sharded strategy to correctly handle all sharding state
                // changes itself, we can't rely on WBL to do this for us b/c anything could
                // reset our state in-between. We should always reload here for efficiency when
                // possible, but staleness is also caught in the loop below.
                //

                ChunkManagerPtr manager;
                ShardPtr primary;
                db->getChunkManagerOrPrimary( ns, manager, primary );

                ChunkVersion currVersion;
                if( manager ) currVersion = manager->getVersion();

                LOG(1) << "connectionId: " << cid << " writebackId: " << wid
                       << " needVersion : " << needVersion.toString()
                       << " mine : " << currVersion.toString() << endl;

                LOG(1) << msg.toString() << endl;

                //
                // We should reload only if we need to update our version to be compatible *and*
                // we haven't already done so. This avoids lots of reloading when we remove/add
                // a sharded collection
                //

                bool alreadyReloaded = lastNeededVersion &&
                                       lastNeededVersion->isEquivalentTo( needVersion );

                if( alreadyReloaded ){
                    LOG(1) << "wbl already reloaded config information for version "
                           << needVersion << ", at version " << currVersion << endl;
                }
                else if( lastNeededVersion ) {
                    log() << "new version change detected to " << needVersion.toString()
                          << ", " << lastNeededCount << " writebacks processed at "
                          << lastNeededVersion->toString() << endl;
                    lastNeededCount = 0;
                }

                //
                // Set our lastNeededVersion for next time
                //

                lastNeededVersion.reset( new ChunkVersion( needVersion ) );
                lastNeededCount++;

                //
                // Determine if we should reload, if so, reload
                //

                bool shouldReload = ! needVersion.isWriteCompatibleWith( currVersion ) &&
                                    ! alreadyReloaded;

                if( shouldReload && currVersion.isSet()
                                 && needVersion.isSet()
                                 && currVersion.hasCompatibleEpoch( needVersion ) )
                {
                    //
                    // If we disagree about versions only, reload the chunk manager
                    //

                    db->getChunkManagerIfExists( ns, true );
                }
                else if( shouldReload ){
                    //
                    // If we disagree about anything else, reload the full db
                    //

                    warning() << "reloading config data for " << db->getName() << ", "
                              << "wanted version " << needVersion.toString()
                              << " but currently have version " << currVersion.toString() << endl;

                    db->reload();
                }

                // do request and then call getLastError
                // we have to call getLastError so we can return the right fields to the user
                // if they decide to call getLastError

                BSONObj gle;
                int attempts = 0;
                while ( true ) {
                    attempts++;

                    try {
                        Request r( msg , 0 );
                        r.init();

                        r.d().reservedField() |= Reserved_FromWriteback;

                        ClientInfo * ci = r.getClientInfo();
                        if (AuthorizationManager::isAuthEnabled()) {
                            ci->getAuthorizationSession()->grantInternalAuthorization(
                                    UserName("_writebackListener", "local"));
                        }
                        ci->noAutoSplit();

                        r.process( attempts );

                        ci->newRequest(); // this so we flip prev and cur shards

                        BSONObjBuilder b;
                        string errmsg;
                        if ( ! ci->getLastError( "admin",
                                                 BSON( "getLastError" << 1 ),
                                                 b, errmsg, true ) )
                        {
                            b.appendBool( "commandFailed" , true );

                            if( ! b.hasField( "errmsg" ) ){
                                b.append( "errmsg", errmsg );
                                gle = b.obj();
                            }
                            else if( errmsg.size() > 0 ){
                                // Rebuild GLE object with errmsg
                                // TODO: Make this less clumsy by improving GLE interface
                                gle = b.obj();

                                if( gle["errmsg"].type() == String ){
                                    BSONObj gleNoErrmsg =
                                            gle.filterFieldsUndotted( BSON( "errmsg" << 1 ),
                                                                      false );
                                    BSONObjBuilder bb;
                                    bb.appendElements( gleNoErrmsg );
                                    bb.append( "errmsg", gle["errmsg"].String() +
                                                         " ::and:: " + errmsg );
                                    gle = bb.obj().getOwned();
                                }
                            }
                        }
                        else{
                            gle = b.obj();
                        }

                        if ( gle["code"].numberInt() == 9517 ) {

                            log() << "new version change detected, " << lastNeededCount
                                  << " writebacks processed previously" << endl;

                            lastNeededVersion.reset();
                            lastNeededCount = 1;

                            log() << "writeback failed because of stale config, retrying attempts: "
                                  << attempts << endl;
                            LOG(1) << "writeback error : " << gle << endl;

                            //
                            // Bringing this in line with the similar retry logic elsewhere
                            //
                            // TODO: Reloading the chunk manager may not help if we dropped a
                            // collection, but we don't actually have that info in the writeback
                            // error
                            //

                            if( attempts <= 2 ){
                                db->getChunkManagerIfExists( ns, true );
                            }
                            else{
                                versionManager.forceRemoteCheckShardVersionCB( ns );
                                sleepsecs( attempts - 1 );
                            }

                            uassert( 15884, str::stream()
                                            << "Could not reload chunk manager after "
                                            << attempts << " attempts.", attempts <= 4 );

                            continue;
                        }

                        ci->clearSinceLastGetError();
                    }
                    catch ( DBException& e ) {
                        error() << "error processing writeback: " << e << endl;
                        BSONObjBuilder b;
                        e.getInfo().append( b, "err", "code" );
                        gle = b.obj();
                    }

                    break;
                }

                {
                    scoped_lock lk( _seenWritebacksLock );
                    WBStatus& s = _seenWritebacks[cid];
                    s.id = wid;
                    s.gle = gle;
                }
            }
            else if ( result["noop"].trueValue() ) {
                // no-op
            }
            else {
                log() << "unknown writeBack result: " << result << endl;
            }

            secsToSleep = 0;
            continue;
        }
        catch ( std::exception& e ) {
            // Attention! Do not call any method that would throw an exception
            // (or assert) in this block.

            if ( inShutdown() ) {
                // we're shutting down, so just clean up
                return;
            }

            log() << "WriteBackListener exception : " << e.what() << endl;

            needsToReloadShardInfo = true;
        }
        catch ( ... ) {
            log() << "WriteBackListener uncaught exception!" << endl;
        }

        secsToSleep++;
        sleepsecs(secsToSleep);
        if ( secsToSleep > 10 )
            secsToSleep = 0;
    }

    log() << "WriteBackListener exiting : address no longer in cluster " << _addr;
}
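// The reload logic above reduces to a small decision table: no reload if the
// versions are already write-compatible or we already reloaded; a cheap
// chunk-manager refresh if only the version numbers disagree under the same
// epoch; a full db reload for anything else (e.g. a dropped and recreated
// collection). A sketch of that table with simplified stand-in types (the
// real ChunkVersion carries major/minor/epoch):
struct Version { bool set; long long epoch; };

enum class ReloadAction { None, ChunkManagerOnly, FullDbReload };

ReloadAction chooseReload(bool writeCompatible, bool alreadyReloaded,
                          const Version& curr, const Version& need) {
    bool shouldReload = !writeCompatible && !alreadyReloaded;
    if (!shouldReload)
        return ReloadAction::None;
    // Same epoch and both versions set: only the version numbers disagree,
    // so refreshing the chunk manager is enough.
    if (curr.set && need.set && curr.epoch == need.epoch)
        return ReloadAction::ChunkManagerOnly;
    // Anything else: reload the whole database config.
    return ReloadAction::FullDbReload;
}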
void Balancer::run() {
    // this is the body of a BackgroundJob so if we throw here we're basically ending the
    // balancer thread prematurely
    while ( ! inShutdown() ) {
        if ( ! _init() ) {
            log() << "will retry to initialize balancer in one minute" << endl;
            sleepsecs( 60 );
            continue;
        }
        break;
    }

    int sleepTime = 30;

    // getConnectionString and the dist lock constructor do not throw, which is what we expect
    // while on the balancer thread
    ConnectionString config = configServer.getConnectionString();
    DistributedLock balanceLock( config , "balancer" );

    while ( ! inShutdown() ) {

        try {

            ScopedDbConnection conn(config.toString(), 30);

            // ping has to be first so we keep things in the config server in sync
            _ping( conn.conn() );

            // use fresh shard state
            Shard::reloadShardInfo();

            // refresh chunk size (even though another balancer might be active)
            Chunk::refreshChunkSize();

            BSONObj balancerConfig;
            // now make sure we should even be running
            if ( ! grid.shouldBalance( "", &balancerConfig ) ) {
                LOG(1) << "skipping balancing round because balancing is disabled" << endl;

                // Ping again so scripts can determine if we're active without waiting
                _ping( conn.conn(), true );

                conn.done();

                sleepsecs( sleepTime );
                continue;
            }

            sleepTime = balancerConfig[SettingsType::shortBalancerSleep()].trueValue() ? 30 : 6;

            uassert( 13258 , "oids broken after resetting!" , _checkOIDs() );

            {
                dist_lock_try lk( &balanceLock , "doing balance round" );
                if ( ! lk.got() ) {
                    LOG(1) << "skipping balancing round because another balancer is active" << endl;

                    // Ping again so scripts can determine if we're active without waiting
                    _ping( conn.conn(), true );

                    conn.done();

                    sleepsecs( sleepTime ); // no need to wake up soon
                    continue;
                }

                LOG(1) << "*** start balancing round" << endl;

                bool waitForDelete = false;
                if (balancerConfig["_waitForDelete"].trueValue()) {
                    waitForDelete = balancerConfig["_waitForDelete"].trueValue();
                }

                bool secondaryThrottle = true; // default to on
                if ( balancerConfig[SettingsType::secondaryThrottle()].type() ) {
                    secondaryThrottle = balancerConfig[SettingsType::secondaryThrottle()].trueValue();
                }

                LOG(1) << "waitForDelete: " << waitForDelete << endl;
                LOG(1) << "secondaryThrottle: " << secondaryThrottle << endl;

                vector<CandidateChunkPtr> candidateChunks;
                _doBalanceRound( conn.conn() , &candidateChunks );

                if ( candidateChunks.size() == 0 ) {
                    LOG(1) << "no need to move any chunk" << endl;
                    _balancedLastTime = 0;
                }
                else {
                    _balancedLastTime = _moveChunks( &candidateChunks, secondaryThrottle,
                                                     waitForDelete );
                }

                LOG(1) << "*** end of balancing round" << endl;
            }

            // Ping again so scripts can determine if we're active without waiting
            _ping( conn.conn(), true );

            conn.done();

            sleepsecs( _balancedLastTime ? sleepTime / 6 : sleepTime );
        }
        catch ( std::exception& e ) {
            log() << "caught exception while doing balance: " << e.what() << endl;

            // Just to match the opening statement if in log level 1
            LOG(1) << "*** End of balancing round" << endl;

            sleepsecs( sleepTime ); // sleep a fair amount b/c of error
            continue;
        }
    }
}
void WriteBackListener::run(){
    OID lastID;
    lastID.clear();
    int secsToSleep = 0;
    while ( ! inShutdown() && Shard::isMember( _addr ) ){

        if ( lastID.isSet() ){
            scoped_lock lk( _seenWritebacksLock );
            _seenWritebacks.insert( lastID );
            lastID.clear();
        }

        try {
            ScopedDbConnection conn( _addr );

            BSONObj result;

            {
                BSONObjBuilder cmd;
                cmd.appendOID( "writebacklisten" , &serverID ); // Command will block for data
                if ( ! conn->runCommand( "admin" , cmd.obj() , result ) ){
                    log() << "writebacklisten command failed! " << result << endl;
                    conn.done();
                    continue;
                }
            }

            log(1) << "writebacklisten result: " << result << endl;

            BSONObj data = result.getObjectField( "data" );
            if ( data.getBoolField( "writeBack" ) ){
                string ns = data["ns"].valuestrsafe();
                {
                    BSONElement e = data["id"];
                    if ( e.type() == jstOID )
                        lastID = e.OID();
                }
                int len;

                Message m( (void*)data["msg"].binData( len ) , false );
                massert( 10427 , "invalid writeback message" , m.header()->valid() );

                DBConfigPtr db = grid.getDBConfig( ns );
                ShardChunkVersion needVersion( data["version"] );

                log(1) << "writeback id: " << lastID
                       << " needVersion : " << needVersion.toString()
                       << " mine : " << db->getChunkManager( ns )->getVersion().toString()
                       << endl; // TODO change to log(3)

                if ( logLevel ) log(1) << debugString( m ) << endl;

                if ( needVersion.isSet() && needVersion <= db->getChunkManager( ns )->getVersion() ){
                    // this means when the write went originally, the version was old
                    // if we're here, it means we've already updated the config, so don't
                    // need to do it again
                    //db->getChunkManager( ns , true ); // SERVER-1349
                }
                else {
                    // we received a writeback object that was sent to a previous version of a shard
                    // the actual shard may not have the object the writeback operation is for
                    // we need to reload the chunk manager and get the new shard versions
                    db->getChunkManager( ns , true );
                }

                Request r( m , 0 );
                r.init();
                r.process();
            }
            else if ( result["noop"].trueValue() ){
                // no-op
            }
            else {
                log() << "unknown writeBack result: " << result << endl;
            }

            conn.done();
            secsToSleep = 0;
            continue;
        }
        catch ( std::exception& e ){ // catch by reference to avoid slicing the exception

            if ( inShutdown() ){
                // we're shutting down, so just clean up
                return;
            }

            log() << "WriteBackListener exception : " << e.what() << endl;

            // It's possible this shard was removed
            Shard::reloadShardInfo();
        }
        catch ( ... ){
            log() << "WriteBackListener uncaught exception!" << endl;
        }
        secsToSleep++;
        sleepsecs(secsToSleep);
        if ( secsToSleep > 10 )
            secsToSleep = 0;
    }

    log() << "WriteBackListener exiting : address no longer in cluster " << _addr;
}