// Loads the chunk ranges for this collection, retrying up to 3 times when the
// loaded chunk map fails validation. Validated state is swapped into the
// member maps; on repeated failure msgasserted aborts construction so no
// reference to an invalid config can escape.
void ChunkManager::loadExistingRanges(const ChunkManager* oldManager) {
    int tries = 3;
    while (tries--) {
        ChunkMap chunkMap;
        set<Shard> shards;
        ShardVersionMap shardVersions;
        Timer t;  // measures load duration for the log line below
        bool success = _load(chunkMap, shards, &shardVersions, oldManager);
        if (success) {
            log() << "ChunkManager: time to load chunks for " << _ns << ": " << t.millis() << "ms"
                  << " sequenceNumber: " << _sequenceNumber
                  << " version: " << _version.toString()
                  << " based on: "
                  << (oldManager ? oldManager->getVersion().toString() : "(empty)");

            // TODO: Merge into diff code above, so we validate in one place
            if (isChunkMapValid(chunkMap)) {
                // Only install the freshly loaded state once it has been validated.
                _chunkMap.swap(chunkMap);
                _shards.swap(shards);
                _shardVersions.swap(shardVersions);
                _chunkRanges.reloadAll(_chunkMap);
                return;
            }
        }

        // Small chunk maps are cheap to dump and help debug a bad load.
        if (_chunkMap.size() < 10) {
            _printChunks();
        }

        warning() << "ChunkManager loaded an invalid config for " << _ns << ", trying again";

        // Back off a little longer on each successive retry (10ms, 20ms, 30ms).
        sleepmillis(10 * (3 - tries));
    }

    // This will abort construction so we should never have a reference to an invalid config
    msgasserted(13282,
                str::stream() << "Couldn't load a valid config for " << _ns
                              << " after 3 attempts. Please try again.");
}
// Blocks until the slave sync thread finishes its current pass (or 30 seconds
// elapse), so that the sync sources are reloaded with the new saved state on
// the next pass.
//
// Fix: the Lock::TempRelease local was named 't', shadowing the Timer 't'
// whose millis() drives the loop's timeout check. The behavior happened to be
// correct (the shadow only lived inside the inner scope), but the shadowing
// was confusing and bugprone; the release guard is now named distinctly.
//
// @param txn    operation context whose locks are temporarily released while
//               waiting, so the sync thread can make progress
// @param errmsg set to a timeout message when false is returned
// @return true once syncing has stopped; false on a 30s timeout
bool waitForSyncToFinish(OperationContext* txn, string &errmsg) const {
    // Wait for slave thread to finish syncing, so sources will be
    // reloaded with new saved state on next pass.
    Timer t;
    while ( 1 ) {
        if ( syncing == 0 || t.millis() > 30000 )
            break;
        {
            // Temporarily give up our locks so the sync thread can run;
            // relinquishSyncingSome asks it to yield its own resources.
            Lock::TempRelease tempRelease(txn->lockState());
            relinquishSyncingSome = 1;
            sleepmillis(1);
        }
    }
    if ( syncing ) {
        errmsg = "timeout waiting for sync() to finish";
        return false;
    }
    return true;
}
void MiniWebServer::run() { SockAddr from; while ( 1 ) { int s = accept(sock, from.getSockAddr(), &from.addressSize); if ( s < 0 ) { if ( errno == ECONNABORTED ) { log() << "Listener on port " << port << " aborted." << endl; return; } log() << "MiniWebServer: accept() returns " << s << " errno:" << errno << endl; sleepmillis(200); continue; } disableNagle(s); RARELY log() << "MiniWebServer: connection accepted from " << from.toString() << endl; accepted( s, from ); closesocket(s); } }
// Marks this session cache as shutting down and, once all concurrent users of
// releaseSession have drained, closes every cached session. Only the first
// caller proceeds to closeAll(); later callers return immediately.
void WiredTigerSessionCache::shuttingDown() {
    uint32_t actual = _shuttingDown.load();
    uint32_t expected;

    // Try to atomically set _shuttingDown flag, but just return if another thread was first.
    do {
        expected = actual;
        actual = _shuttingDown.compareAndSwap(expected, expected | kShuttingDownMask);
        if (actual & kShuttingDownMask)
            return;  // another thread already initiated shutdown
    } while (actual != expected);

    // Spin as long as there are threads in releaseSession
    // (presumably the bits below kShuttingDownMask count in-flight
    // releaseSession calls — confirm against releaseSession's implementation).
    while (_shuttingDown.load() != kShuttingDownMask) {
        sleepmillis(1);
    }

    closeAll();
}
// Waits (up to one hour) for the writeback identified by 'oid' to be recorded
// for the given connection, polling the shared _seenWritebacks map every 10ms.
// Logs a progress warning at most every ~10 seconds while waiting. Throws via
// uasserted on timeout.
/* static */ BSONObj WriteBackListener::waitFor( const ConnectionIdent& ident, const OID& oid ) {
    Timer t;                 // total wait time, bounds the loop at 60 minutes
    Timer lastMessageTimer;  // throttles the periodic progress warning
    while ( t.minutes() < 60 ) {
        {
            scoped_lock lk( _seenWritebacksLock );
            // NOTE: operator[] default-constructs an entry if 'ident' is absent.
            WBStatus s = _seenWritebacks[ident];

            if ( oid < s.id ) {
                // this means we're waiting for a GLE that already passed.
                // it should be impossible because once we call GLE, no other
                // writebacks should happen with that connection id
                msgasserted( 14041 , str::stream() << "got writeback waitfor for older id " <<
                             " oid: " << oid << " s.id: " << s.id << " ident: " << ident.toString() );
            }
            else if ( oid == s.id ) {
                return s.gle;
            }

            // Stay in lock so we can use the status
            if( lastMessageTimer.seconds() > 10 ){
                warning() << "waiting for writeback " << oid
                          << " from connection " << ident.toString()
                          << " for " << t.seconds() << " secs"
                          << ", currently at id " << s.id << endl;

                lastMessageTimer.reset();
            }
        }
        sleepmillis( 10 );
    }
    uasserted( 13403 , str::stream() << "didn't get writeback for: " << oid
                    << " after: " << t.millis() << " ms"
                    << " from connection " << ident.toString() );
    throw 1; // never gets here
}
void BackgroundSync::_run() { Client::initThread("rsBackgroundSync"); AuthorizationSession::get(cc())->grantInternalAuthorization(); while (!inShutdown()) { try { _runProducer(); } catch (const DBException& e) { std::string msg(str::stream() << "sync producer problem: " << redact(e)); error() << msg; _replCoord->setMyHeartbeatMessage(msg); sleepmillis(100); // sleep a bit to keep from hammering this thread with temp. errors. } catch (const std::exception& e2) { // redact(std::exception&) doesn't work severe() << "sync producer exception: " << redact(e2.what()); fassertFailed(28546); } } stop(); }
void t() { for( int i = 0; i < 20; i++ ) { sleepmillis(21); string fn = "/tmp/t1"; MongoMMF f; unsigned long long len = 1 * 1024 * 1024; assert( f.create(fn, len, /*sequential*/rand()%2==0) ); { char *p = (char *) f.getView(); assert(p); // write something to the private view as a test strcpy(p, "hello"); } if( cmdLine.dur ) { char *w = (char *) f.view_write(); strcpy(w + 6, "world"); } MongoFileFinder ff; ASSERT( ff.findByPath(fn) ); } }
static void durThread() { Client::initThread("dur"); const int HowOftenToGroupCommitMs = 100; while( 1 ) { try { int millis = HowOftenToGroupCommitMs; { Timer t; journalRotate(); // note we do this part outside of mongomutex millis -= t.millis(); if( millis < 5 || millis > HowOftenToGroupCommitMs ) millis = 5; } sleepmillis(millis); go(); } catch(std::exception& e) { log() << "exception in durThread " << e.what() << endl; } } }
// Distributed-lock stress worker: repeatedly tries to acquire the "test" lock
// and, while holding it, checks that no other thread bumped the shared counter
// — a change would indicate the lock failed to provide mutual exclusion.
static void runThread() {
    while (keepGoing) {
        try {
            if (current->lock_try( "test" )) {
                int before = count.addAndFetch(1);
                sleepmillis(3);  // widen the window so a broken lock is likely to be caught
                int after = count.loadRelaxed();

                if (after != before) {
                    // Another thread incremented the counter while we held the lock.
                    error() << " before: " << before << " after: " << after << endl;
                }

                current->unlock();
            }
        }
        catch ( const DBException& ex ) {
            log() << "*** !Could not try distributed lock." << causedBy( ex ) << endl;
        }
    }
}
// Background flusher thread: periodically fsyncs all memory-mapped data files,
// pacing itself so that the time spent flushing is subtracted from the
// configured --syncdelay interval.
void DataFileSync::run() {
    Client::initThread( name().c_str() );

    // Log unusual syncdelay settings once at startup.
    if (mmapv1GlobalOptions.syncdelay == 0) {
        log() << "warning: --syncdelay 0 is not recommended and can have strange performance" << endl;
    }
    else if (mmapv1GlobalOptions.syncdelay == 1) {
        log() << "--syncdelay 1" << endl;
    }
    else if (mmapv1GlobalOptions.syncdelay != 60) {
        LOG(1) << "--syncdelay " << mmapv1GlobalOptions.syncdelay << endl;
    }

    int time_flushing = 0;  // duration of the previous flush, in ms
    while ( ! inShutdown() ) {
        _diaglog.flush();
        if (mmapv1GlobalOptions.syncdelay == 0) {
            // in case at some point we add an option to change at runtime
            sleepsecs(5);
            continue;
        }

        // Sleep for what remains of the interval after the previous flush
        // (never a negative duration).
        sleepmillis((long long) std::max(0.0, (mmapv1GlobalOptions.syncdelay * 1000) - time_flushing));

        if ( inShutdown() ) {
            // occasional issue trying to flush during shutdown when sleep interrupted
            break;
        }

        Date_t start = jsTime();
        StorageEngine* storageEngine = getGlobalEnvironment()->getGlobalStorageEngine();
        int numFiles = storageEngine->flushAllFiles( true );
        time_flushing = (int) (jsTime() - start);

        _flushed(time_flushing);

        // Report slow flushes (>= 10s) at default verbosity; otherwise debug only.
        if( logger::globalLogDomain()->shouldLog(logger::LogSeverity::Debug(1)) || time_flushing >= 10000 ) {
            log() << "flushing mmaps took " << time_flushing << "ms " << " for " << numFiles << " files" << endl;
        }
    }
}
void BackgroundSync::handleSlaveDelay(uint64_t opTimestamp) { dassert(_opSyncRunning); uint64_t slaveDelayMillis = theReplSet->myConfig().slaveDelay * 1000; uint64_t currTime = curTimeMillis64(); uint64_t timeOpShouldBeApplied = opTimestamp + slaveDelayMillis; while (currTime < timeOpShouldBeApplied) { uint64_t sleepTime = (timeOpShouldBeApplied - currTime); // let's sleep for at most one second sleepmillis((sleepTime < 1000) ? sleepTime : 1000); // check if we should bail out, as we don't want to // sleep the whole time possibly long delay time // if we see we should be stopping { boost::unique_lock<boost::mutex> lck(_mutex); if (!_opSyncShouldRun) { break; } } // reset currTime currTime = curTimeMillis64(); } }
// writebacklisten command handler: long-polls (up to 5 minutes) the writeback
// queue registered for the requesting mongos and returns either one queued
// writeback document ("data") or a "noop" so the caller's socket doesn't
// time out.
bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
    cc().curop()->suppressFromCurop();
    cc().curop()->setExpectedLatencyMs( 30000 );

    BSONElement e = cmdObj.firstElement();
    if ( e.type() != jstOID ) {
        errmsg = "need oid as first value";
        return 0;
    }

    // get the command issuer's (a mongos) serverID
    const OID id = e.__oid();

    // the command issuer is blocked awaiting a response
    // we want to do return at least at every 5 minutes so sockets don't timeout
    BSONObj z;
    if ( writeBackManager.getWritebackQueue(id.str())->queue.blockingPop( z, 5 * 60 /* 5 minutes */ ) ) {
        LOG(1) << "WriteBackCommand got : " << z << endl;
        result.append( "data" , z );
    }
    else {
        result.appendBool( "noop" , true );
    }

#ifdef _DEBUG
    // Debug builds inject random latency to shake out timing assumptions in
    // the writeback protocol.
    PseudoRandom r(static_cast<int64_t>(time(0)));
    // Sleep a short amount of time usually
    int sleepFor = r.nextInt32( 10 );
    sleepmillis( sleepFor );

    // Sleep a longer amount of time every once and awhile
    int sleepLong = r.nextInt32( 50 );
    if( sleepLong == 0 ) sleepsecs( 2 );
#endif

    return true;
}
// Runs a rollback against the given sync source. On success, signals the
// applier, waits for it to drain the buffer, and clears the ROLLBACK state.
// Aborts the process only when syncRollback reports an unrecoverable state;
// other failures are logged and retried later by the caller.
void BackgroundSync::_rollback(OperationContext* txn,
                               const HostAndPort& source,
                               stdx::function<DBClientBase*()> getConnection) {
    // Abort only when syncRollback detects we are in a unrecoverable state.
    // In other cases, we log the message contained in the error status and retry later.
    auto status = syncRollback(txn,
                               OplogInterfaceLocal(txn, rsOplogName),
                               RollbackSourceImpl(getConnection, source, rsOplogName),
                               _replCoord);
    if (status.isOK()) {
        // When the syncTail thread sees there is no new data by adding something to the buffer.
        _signalNoNewDataForApplier();
        // Wait until the buffer is empty.
        // This is an indication that syncTail has removed the sentinel marker from the buffer
        // and reset its local lastAppliedOpTime via the replCoord.
        while (!_buffer.empty()) {
            sleepmillis(10);
            if (inShutdown()) {
                return;  // give up waiting if we're shutting down
            }
        }
        // It is now safe to clear the ROLLBACK state, which may result in the applier thread
        // transitioning to SECONDARY. This is safe because the applier thread has now reloaded
        // the new rollback minValid from the database.
        if (!_replCoord->setFollowerMode(MemberState::RS_RECOVERING)) {
            warning() << "Failed to transition into " << MemberState(MemberState::RS_RECOVERING)
                      << "; expected to be in state " << MemberState(MemberState::RS_ROLLBACK)
                      << " but found self in " << _replCoord->getMemberState();
        }
        return;
    }
    if (ErrorCodes::UnrecoverableRollbackError == status.code()) {
        // Unrecoverable: crash deliberately rather than continue in a bad state.
        fassertNoTrace(28723, status);
    }
    warning() << "rollback cannot proceed at this time (retrying later): " << status;
}
// Reports our replication progress (_consumedOpTime) upstream. With a modern
// sync source this goes through the updater API; otherwise we fall back to
// advancing a tailing cursor on the sync-source-feedback connection up to the
// last op we have written.
void BackgroundSync::markOplog() {
    LOG(3) << "replset markOplog: " << _consumedOpTime << " "
           << theReplSet->lastOpTimeWritten << rsLog;

    if (theReplSet->syncSourceFeedback.supportsUpdater()) {
        // Preferred path: report progress directly via the updater.
        _consumedOpTime = theReplSet->lastOpTimeWritten;
        theReplSet->syncSourceFeedback.updateSelfInMap(theReplSet->lastOpTimeWritten);
    }
    else {
        // Legacy path: drive the feedback cursor forward under its lock.
        boost::unique_lock<boost::mutex> oplogLockSSF(theReplSet->syncSourceFeedback.oplock);

        if (!hasCursor()) {
            // No cursor yet; release the lock before backing off.
            oplogLockSSF.unlock();
            sleepmillis(500);
            return;
        }

        if (!theReplSet->syncSourceFeedback.moreInCurrentBatch()) {
            theReplSet->syncSourceFeedback.more();
        }

        if (!theReplSet->syncSourceFeedback.more()) {
            theReplSet->syncSourceFeedback.tailCheck();
            return;
        }

        // if this member has written the op at optime T
        // we want to nextSafe up to and including T
        while (_consumedOpTime < theReplSet->lastOpTimeWritten
               && theReplSet->syncSourceFeedback.more()) {
            BSONObj temp = theReplSet->syncSourceFeedback.nextSafe();
            _consumedOpTime = temp["ts"]._opTime();
        }

        // call more() to signal the sync target that we've synced T
        theReplSet->syncSourceFeedback.more();
    }
}
// Waits for the writeback identified by 'oid' on this connection, polling the
// shared _seenWritebacks map every 10ms for 5000 iterations (~50 seconds of
// sleep, plus lock time). Throws via uasserted on timeout.
/* static */ BSONObj WriteBackListener::waitFor( const ConnectionIdent& ident, const OID& oid ) {
    Timer t;
    for ( int i=0; i<5000; i++ ) {
        {
            scoped_lock lk( _seenWritebacksLock );
            // NOTE: operator[] default-constructs an entry if 'ident' is absent.
            WBStatus s = _seenWritebacks[ident];
            if ( oid < s.id ) {
                // this means we're waiting for a GLE that already passed.
                // it should be impossible because once we call GLE, no other
                // writebacks should happen with that connection id
                msgasserted( 14041 , str::stream() << "got writeback waitfor for older id " <<
                             " oid: " << oid << " s.id: " << s.id << " ident: " << ident.toString() );
            }
            else if ( oid == s.id ) {
                return s.gle;
            }
        }
        sleepmillis( 10 );
    }
    uasserted( 13403 , str::stream() << "didn't get writeback for: " << oid
                    << " after: " << t.millis() << " ms" );
    throw 1; // never gets here
}
void run() { Client::initThread("fsyncjob"); Client& c = cc(); { scoped_lock lk(lockedForWritingMutex); lockedForWriting++; } readlock lk(""); MemoryMappedFile::flushAll(true); log() << "db is now locked for snapshotting, no writes allowed. use db.$cmd.sys.unlock.findOne() to unlock" << endl; _ready = true; while( 1 ) { if( unlockRequested ) { unlockRequested = false; break; } sleepmillis(20); } { scoped_lock lk(lockedForWritingMutex); lockedForWriting--; } c.shutdown(); }
// Producer half of background sync: connects to a sync source, tails its
// oplog, and pushes fetched ops into _buffer for the applier. Returns when
// there is no sync source, on shutdown, on state changes (primary/force
// sync/target change), or when the tailing cursor dies.
void BackgroundSync::produce() {
    // this oplog reader does not do a handshake because we don't want the server it's syncing
    // from to track how far it has synced
    OplogReader r(false /* doHandshake */);

    // find a target to sync from the last op time written
    getOplogReader(r);

    // no server found
    {
        boost::unique_lock<boost::mutex> lock(_mutex);
        if (_currentSyncTarget == NULL) {
            lock.unlock();
            sleepsecs(1);  // if there is no one to sync from
            return;
        }
        // Issue the tailing query while still holding _mutex so the target
        // and _lastOpTimeFetched are read consistently.
        r.tailingQueryGTE(rsoplog, _lastOpTimeFetched);
    }

    // if target cut connections between connecting and querying (for
    // example, because it stepped down) we might not have a cursor
    if (!r.haveCursor()) {
        return;
    }

    // Test hook: stall production while the fail point is enabled.
    while (MONGO_FAIL_POINT(rsBgSyncProduce)) {
        sleepmillis(0);
    }

    uassert(1000, "replSet source for syncing doesn't seem to be await capable -- is it an older version of mongodb?", r.awaitCapable() );

    if (isRollbackRequired(r)) {
        stop();
        return;
    }

    while (!inShutdown()) {
        // Inner loop: drain the cursor, fetching more batches as needed.
        while (!inShutdown()) {
            if (!r.moreInCurrentBatch()) {
                int bs = r.currentBatchMessageSize();
                if( bs > 0 && bs < BatchIsSmallish ) {
                    // on a very low latency network, if we don't wait a little, we'll be
                    // getting ops to write almost one at a time. this will both be expensive
                    // for the upstream server as well as potentially defeating our parallel
                    // application of batches on the secondary.
                    //
                    // the inference here is basically if the batch is really small, we are
                    // "caught up".
                    //
                    // dassert( !Lock::isLocked() );
                    sleepmillis(SleepToAllowBatchingMillis);
                }

                // Batch exhausted: re-check conditions that should end production.
                if (theReplSet->gotForceSync()) {
                    return;
                }

                if (isAssumingPrimary() || theReplSet->isPrimary()) {
                    return;
                }

                // re-evaluate quality of sync target
                if (shouldChangeSyncTarget()) {
                    return;
                }

                //record time for each getmore
                {
                    TimerHolder batchTimer(&getmoreReplStats);
                    r.more();
                }
                //increment
                networkByteStats.increment(r.currentBatchMessageSize());
            }

            if (!r.more())
                break;

            BSONObj o = r.nextSafe().getOwned();
            opsReadStats.increment();

            // An op is in flight: the applier must not report the buffer applied.
            {
                boost::unique_lock<boost::mutex> lock(_mutex);
                _appliedBuffer = false;
            }

            OCCASIONALLY {
                LOG(2) << "bgsync buffer has " << _buffer.size() << " bytes" << rsLog;
            }
            // the blocking queue will wait (forever) until there's room for us to push
            _buffer.push(o);
            bufferCountGauge.increment();
            bufferSizeGauge.increment(getSize(o));

            // Record the hash and optime of the last fetched op.
            {
                boost::unique_lock<boost::mutex> lock(_mutex);
                _lastH = o["h"].numberLong();
                _lastOpTimeFetched = o["ts"]._opTime();
            }
        } // end while

        // Cursor exhausted: bail if paused or the target is gone/unreadable.
        {
            boost::unique_lock<boost::mutex> lock(_mutex);
            if (_pause || !_currentSyncTarget || !_currentSyncTarget->hbinfo().hbstate.readable()) {
                return;
            }
        }

        r.tailCheck();
        if( !r.haveCursor() ) {
            LOG(1) << "replSet end syncTail pass" << rsLog;
            return;
        }

        // looping back is ok because this is a tailable cursor
    }
}
// authenticate command handler (MONGODB-CR style): validates the nonce issued
// by getnonce, looks up the user in <db>.system.users, verifies the
// md5(nonce + user + pwd) digest, and grants read-only or full authorization.
// NOTE(review): failure paths sleep for different durations (10ms vs 30ms vs
// none), so response timing differs by failure cause — confirm this is
// intentional before relying on it.
bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
    log(1) << " authenticate: " << cmdObj << endl;

    string user = cmdObj.getStringField("user");
    string key = cmdObj.getStringField("key");
    string received_nonce = cmdObj.getStringField("nonce");

    if( user.empty() || key.empty() || received_nonce.empty() ) {
        log() << "field missing/wrong type in received authenticate command " << cc().database()->name << endl;
        errmsg = "auth fails";
        sleepmillis(10);
        return false;
    }

    stringstream digestBuilder;

    {
        // Verify the client echoed back the nonce we handed out; consume it
        // (release) so it can only be used once.
        bool reject = false;
        nonce *ln = lastNonce.release();
        if ( ln == 0 ) {
            reject = true;
        }
        else {
            digestBuilder << hex << *ln;
            reject = digestBuilder.str() != received_nonce;
        }

        if ( reject ) {
            log() << "auth: bad nonce received or getnonce not called. could be a driver bug or a security attack. db:" << cc().database()->name << endl;
            errmsg = "auth fails";
            sleepmillis(30);
            return false;
        }
    }

    static BSONObj userPattern = fromjson("{\"user\":1}");
    string systemUsers = cc().database()->name + ".system.users";
    // Lazily ensure the index used by the user lookup below.
    OCCASIONALLY Helpers::ensureIndex(systemUsers.c_str(), userPattern, false, "user_1");

    BSONObj userObj;
    {
        BSONObjBuilder b;
        b << "user" << user;
        BSONObj query = b.done();
        if( !Helpers::findOne(systemUsers.c_str(), query, userObj) ) {
            log() << "auth: couldn't find user " << user << ", " << systemUsers << endl;
            errmsg = "auth fails";
            return false;
        }
    }

    // Expected key is md5(hex(nonce) + user + pwd); digestBuilder already
    // contains the hex nonce from the verification above.
    md5digest d;
    {
        string pwd = userObj.getStringField("pwd");
        digestBuilder << user << pwd;
        string done = digestBuilder.str();

        md5_state_t st;
        md5_init(&st);
        md5_append(&st, (const md5_byte_t *) done.c_str(), done.size());
        md5_finish(&st, d);
    }

    string computed = digestToString( d );

    if ( key != computed ){
        log() << "auth: key mismatch " << user << ", ns:" << ns << endl;
        errmsg = "auth fails";
        return false;
    }

    AuthenticationInfo *ai = cc().getAuthenticationInfo();

    if ( userObj[ "readOnly" ].isBoolean() && userObj[ "readOnly" ].boolean() ) {
        if ( readLockSupported() ){
            ai->authorizeReadOnly( cc().database()->name.c_str() );
        }
        else {
            // Fall back to full authorization when read-only isn't supported.
            log() << "warning: old version of boost, read-only users not supported" << endl;
            ai->authorize( cc().database()->name.c_str() );
        }
    }
    else {
        ai->authorize( cc().database()->name.c_str() );
    }
    return true;
}
// Lock-manager stress worker: each iteration exercises a different mix of
// global read/write locks, DB locks, temp releases, and nesting, selected by
// i % 7 (and i % 11 within the i % 7 == 6 branch).
virtual void subthread(int tnumber) {
    Client::initThread("mongomutextest");

    const ServiceContext::UniqueOperationContext txnPtr = cc().makeOperationContext();
    OperationContext& txn = *txnPtr;

    sleepmillis(0);
    for (int i = 0; i < N; i++) {
        int x = std::rand();
        bool sometimes = (x % 15 == 0);
        if (i % 7 == 0) {
            Lock::GlobalRead r(txn.lockState());  // nested test
            Lock::GlobalRead r2(txn.lockState());
        } else if (i % 7 == 1) {
            Lock::GlobalRead r(txn.lockState());
            ASSERT(txn.lockState()->isReadLocked());
        } else if (i % 7 == 4 && tnumber == 1 /*only one upgrader legal*/) {
            Lock::GlobalWrite w(txn.lockState());
            ASSERT(txn.lockState()->isW());
            // NOTE(review): this condition is dead code — we are inside the
            // i % 7 == 4 branch, so i % 7 == 2 can never hold here.
            if (i % 7 == 2) {
                Lock::TempRelease t(txn.lockState());
            }
        } else if (i % 7 == 2) {
            Lock::GlobalWrite w(txn.lockState());
            ASSERT(txn.lockState()->isW());
            if (sometimes) {
                Lock::TempRelease t(txn.lockState());
            }
        } else if (i % 7 == 3) {
            Lock::GlobalWrite w(txn.lockState());
            {
                Lock::TempRelease t(txn.lockState());
            }
            Lock::GlobalRead r(txn.lockState());
            ASSERT(txn.lockState()->isW());
            if (sometimes) {
                Lock::TempRelease t(txn.lockState());
            }
        } else if (i % 7 == 5) {
            {
                ScopedTransaction scopedXact(&txn, MODE_IS);
                Lock::DBLock r(txn.lockState(), "foo", MODE_S);
            }
            {
                ScopedTransaction scopedXact(&txn, MODE_IS);
                Lock::DBLock r(txn.lockState(), "bar", MODE_S);
            }
        } else if (i % 7 == 6) {
            if (i > N / 2) {
                int q = i % 11;
                if (q == 0) {
                    // Re-entrant DB locks plus a second database.
                    ScopedTransaction scopedXact(&txn, MODE_IS);

                    Lock::DBLock r(txn.lockState(), "foo", MODE_S);
                    ASSERT(txn.lockState()->isDbLockedForMode("foo", MODE_S));
                    Lock::DBLock r2(txn.lockState(), "foo", MODE_S);
                    ASSERT(txn.lockState()->isDbLockedForMode("foo", MODE_S));
                    Lock::DBLock r3(txn.lockState(), "local", MODE_S);

                    ASSERT(txn.lockState()->isDbLockedForMode("foo", MODE_S));
                    ASSERT(txn.lockState()->isDbLockedForMode("local", MODE_S));
                } else if (q == 1) {
                    // test locking local only -- with no preceding lock
                    {
                        ScopedTransaction scopedXact(&txn, MODE_IS);
                        Lock::DBLock x(txn.lockState(), "local", MODE_S);
                    }
                    {
                        ScopedTransaction scopedXact(&txn, MODE_IX);
                        Lock::DBLock x(txn.lockState(), "local", MODE_X);

                        //  No actual writing here, so no WriteUnitOfWork
                        if (sometimes) {
                            Lock::TempRelease t(txn.lockState());
                        }
                    }
                } else if (q == 1) {
                    // NOTE(review): duplicate condition — this q == 1 branch is
                    // unreachable (the one above matches first). Presumably this
                    // was meant to be q == 2, leaving the "admin" locking case
                    // never exercised — confirm against the test's intent.
                    {
                        ScopedTransaction scopedXact(&txn, MODE_IS);
                        Lock::DBLock x(txn.lockState(), "admin", MODE_S);
                    }
                    {
                        ScopedTransaction scopedXact(&txn, MODE_IX);
                        Lock::DBLock x(txn.lockState(), "admin", MODE_X);
                    }
                } else if (q == 3) {
                    ScopedTransaction scopedXact(&txn, MODE_IX);

                    Lock::DBLock x(txn.lockState(), "foo", MODE_X);
                    Lock::DBLock y(txn.lockState(), "admin", MODE_S);
                } else if (q == 4) {
                    ScopedTransaction scopedXact(&txn, MODE_IS);

                    Lock::DBLock x(txn.lockState(), "foo2", MODE_S);
                    Lock::DBLock y(txn.lockState(), "admin", MODE_S);
                } else {
                    ScopedTransaction scopedXact(&txn, MODE_IX);

                    Lock::DBLock w(txn.lockState(), "foo", MODE_X);
                    {
                        Lock::TempRelease t(txn.lockState());
                    }
                    Lock::DBLock r2(txn.lockState(), "foo", MODE_S);
                    Lock::DBLock r3(txn.lockState(), "local", MODE_S);
                }
            } else {
                ScopedTransaction scopedXact(&txn, MODE_IS);

                Lock::DBLock r(txn.lockState(), "foo", MODE_S);
                Lock::DBLock r2(txn.lockState(), "foo", MODE_S);
                Lock::DBLock r3(txn.lockState(), "local", MODE_S);
            }
        }
        pm.hit();
    }
}
// Selects a sync source and connects the oplog reader to it. Vetoes members
// we cannot connect to (60s default) or that are stale (600s), and records the
// chosen target in _currentSyncTarget. If every candidate was stale, reports
// that we have gone stale and backs off for two minutes.
void BackgroundSync::getOplogReader(OplogReader& r) {
    const Member *target = NULL, *stale = NULL;
    BSONObj oldest;  // filled in by isStale with the source's oldest oplog entry

    {
        boost::unique_lock<boost::mutex> lock(_mutex);
        if (_lastOpTimeFetched.isNull()) {
            // then we're initial syncing and we're still waiting for this to be set
            _currentSyncTarget = NULL;
            return;
        }

        // Wait until we've applied the ops we have before we choose a sync target
        while (!_appliedBuffer) {
            _condvar.wait(lock);
        }
    }

    // Test hook: stall while the fail point is enabled.
    while (MONGO_FAIL_POINT(rsBgSyncProduce)) {
        sleepmillis(0);
    }

    verify(r.conn() == NULL);

    while ((target = theReplSet->getMemberToSyncTo()) != NULL) {
        string current = target->fullName();

        if (!r.connect(current)) {
            LOG(2) << "replSet can't connect to " << current << " to read operations" << rsLog;
            r.resetConnection();
            theReplSet->veto(current);
            sleepsecs(1);
            continue;
        }

        if (isStale(r, oldest)) {
            // Source is too far ahead; veto for 10 minutes and remember it in
            // case every candidate turns out to be stale.
            r.resetConnection();
            theReplSet->veto(current, 600);
            stale = target;
            continue;
        }

        // if we made it here, the target is up and not stale
        {
            boost::unique_lock<boost::mutex> lock(_mutex);
            _currentSyncTarget = target;
        }

        // Point the sync-source-feedback channel at the same target.
        boost::unique_lock<boost::mutex> oplogLockSSF(theReplSet->syncSourceFeedback.oplock);
        theReplSet->syncSourceFeedback.connect(target);

        return;
    }

    // the only viable sync target was stale
    if (stale) {
        theReplSet->goStale(stale, oldest);
        sleepsecs(120);
    }

    {
        boost::unique_lock<boost::mutex> lock(_mutex);
        _currentSyncTarget = NULL;
    }
}
// Producer half of background sync (single-loop variant): connects to a sync
// source, tails its oplog, and pushes fetched ops into _buffer for the
// applier. Returns when there is no sync source, on shutdown, on state
// changes, or when the tailing cursor dies.
void BackgroundSync::produce() {
    // this oplog reader does not do a handshake because we don't want the server it's syncing
    // from to track how far it has synced
    OplogReader r;
    OpTime lastOpTimeFetched;

    // find a target to sync from the last op time written
    getOplogReader(r);

    // no server found
    {
        boost::unique_lock<boost::mutex> lock(_mutex);

        if (_currentSyncTarget == NULL) {
            lock.unlock();
            sleepsecs(1);  // if there is no one to sync from
            return;
        }

        // Snapshot the fetch position under the lock; the query is issued
        // outside the lock.
        lastOpTimeFetched = _lastOpTimeFetched;
    }

    r.tailingQueryGTE(rsoplog, lastOpTimeFetched);

    // if target cut connections between connecting and querying (for
    // example, because it stepped down) we might not have a cursor
    if (!r.haveCursor()) {
        return;
    }

    uassert(1000, "replSet source for syncing doesn't seem to be await capable -- is it an older version of mongodb?", r.awaitCapable() );

    if (isRollbackRequired(r)) {
        stop();
        return;
    }

    while (!inShutdown()) {
        if (!r.moreInCurrentBatch()) {
            // Check some things periodically
            // (whenever we run out of items in the
            // current cursor batch)

            int bs = r.currentBatchMessageSize();
            if( bs > 0 && bs < BatchIsSmallish ) {
                // on a very low latency network, if we don't wait a little, we'll be
                // getting ops to write almost one at a time. this will both be expensive
                // for the upstream server as well as potentially defeating our parallel
                // application of batches on the secondary.
                //
                // the inference here is basically if the batch is really small, we are
                // "caught up".
                //
                // dassert( !Lock::isLocked() );
                sleepmillis(SleepToAllowBatchingMillis);
            }

            if (theReplSet->gotForceSync()) {
                return;
            }

            // If we are transitioning to primary state, we need to leave
            // this loop in order to go into bgsync-pause mode.
            if (isAssumingPrimary() || theReplSet->isPrimary()) {
                return;
            }

            // re-evaluate quality of sync target
            if (shouldChangeSyncTarget()) {
                return;
            }

            {
                //record time for each getmore
                TimerHolder batchTimer(&getmoreReplStats);

                // This calls receiveMore() on the oplogreader cursor.
                // It can wait up to five seconds for more data.
                r.more();
            }
            networkByteStats.increment(r.currentBatchMessageSize());

            if (!r.moreInCurrentBatch()) {
                // If there is still no data from upstream, check a few more things
                // and then loop back for another pass at getting more data
                {
                    boost::unique_lock<boost::mutex> lock(_mutex);
                    if (_pause ||
                        !_currentSyncTarget ||
                        !_currentSyncTarget->hbinfo().hbstate.readable()) {
                        return;
                    }
                }

                r.tailCheck();
                if( !r.haveCursor() ) {
                    LOG(1) << "replSet end syncTail pass" << rsLog;
                    return;
                }

                continue;
            }
        }

        // At this point, we are guaranteed to have at least one thing to read out
        // of the oplogreader cursor.
        BSONObj o = r.nextSafe().getOwned();
        opsReadStats.increment();

        // An op is in flight: the applier must not report the buffer applied.
        {
            boost::unique_lock<boost::mutex> lock(_mutex);
            _appliedBuffer = false;
        }

        OCCASIONALLY {
            LOG(2) << "bgsync buffer has " << _buffer.size() << " bytes" << rsLog;
        }

        // the blocking queue will wait (forever) until there's room for us to push
        _buffer.push(o);
        bufferCountGauge.increment();
        bufferSizeGauge.increment(getSize(o));

        // Record the hash and optime of the last fetched op.
        {
            boost::unique_lock<boost::mutex> lock(_mutex);
            _lastH = o["h"].numberLong();
            _lastOpTimeFetched = o["ts"]._opTime();
            LOG(3) << "replSet lastOpTimeFetched: " << _lastOpTimeFetched.toStringPretty() << rsLog;
        }
    }
}
// Fetcher-based producer: chooses a sync source, schedules an oplog Fetcher on
// the task executor to tail the source's oplog, and — if the fetcher reports
// that our oplog has diverged from the source's — runs rollback synchronously
// afterwards (rollback must not run inside the fetcher callback).
void BackgroundSync::_produce(OperationContext* txn, executor::TaskExecutor* taskExecutor) {
    // this oplog reader does not do a handshake because we don't want the server it's syncing
    // from to track how far it has synced
    {
        stdx::unique_lock<stdx::mutex> lock(_mutex);
        if (_lastOpTimeFetched.isNull()) {
            // then we're initial syncing and we're still waiting for this to be set
            lock.unlock();
            sleepsecs(1);
            // if there is no one to sync from
            return;
        }

        if (_replCoord->isWaitingForApplierToDrain() || _replCoord->getMemberState().primary() ||
            inShutdownStrict()) {
            return;
        }
    }

    // Test hook: stall production while the fail point is enabled.
    while (MONGO_FAIL_POINT(rsBgSyncProduce)) {
        sleepmillis(0);
    }


    // find a target to sync from the last optime fetched
    OpTime lastOpTimeFetched;
    {
        stdx::unique_lock<stdx::mutex> lock(_mutex);
        lastOpTimeFetched = _lastOpTimeFetched;
        _syncSourceHost = HostAndPort();
    }
    OplogReader syncSourceReader;
    syncSourceReader.connectToSyncSource(txn, lastOpTimeFetched, _replCoord);

    // no server found
    if (syncSourceReader.getHost().empty()) {
        sleepsecs(1);
        // if there is no one to sync from
        return;
    }

    long long lastHashFetched;
    {
        stdx::lock_guard<stdx::mutex> lock(_mutex);
        if (_pause) {
            return;
        }
        lastOpTimeFetched = _lastOpTimeFetched;
        lastHashFetched = _lastFetchedHash;
        _syncSourceHost = syncSourceReader.getHost();
        _replCoord->signalUpstreamUpdater();
    }

    const Milliseconds oplogSocketTimeout(OplogReader::kSocketTimeout);

    // Prefer host in oplog reader to _syncSourceHost because _syncSourceHost may be cleared
    // if sync source feedback fails.
    const HostAndPort source = syncSourceReader.getHost();
    syncSourceReader.resetConnection();
    // no more references to oplog reader from here on.

    // If this status is not OK after the fetcher returns from wait(),
    // proceed to execute rollback
    Status remoteOplogStartStatus = Status::OK();

    auto fetcherCallback = stdx::bind(&BackgroundSync::_fetcherCallback,
                                      this,
                                      stdx::placeholders::_1,
                                      stdx::placeholders::_3,
                                      stdx::cref(source),
                                      lastOpTimeFetched,
                                      lastHashFetched,
                                      &remoteOplogStartStatus);

    // Tailable, awaitData oplog query starting at the last fetched timestamp.
    auto cmdObj = BSON("find" << nsToCollectionSubstring(rsOplogName) <<
                       "filter" << BSON("ts" << BSON("$gte" << lastOpTimeFetched.getTimestamp())) <<
                       "tailable" << true <<
                       "oplogReplay" << true <<
                       "awaitData" << true <<
                       "maxTimeMS" << int(fetcherMaxTimeMS.count()));
    Fetcher fetcher(taskExecutor,
                    source,
                    nsToDatabase(rsOplogName),
                    cmdObj,
                    fetcherCallback,
                    rpc::makeEmptyMetadata());
    auto scheduleStatus = fetcher.schedule();
    if (!scheduleStatus.isOK()) {
        warning() << "unable to schedule fetcher to read remote oplog on " << source << ": "
                  << scheduleStatus;
        return;
    }
    fetcher.wait();

    // If the background sync is paused after the fetcher is started, we need to
    // re-evaluate our sync source and oplog common point.
    if (isPaused()) {
        return;
    }

    // Execute rollback if necessary.
    // Rollback is a synchronous operation that uses the task executor and may not be
    // executed inside the fetcher callback.
    if (!remoteOplogStartStatus.isOK()) {
        const int messagingPortTags = 0;
        ConnectionPool connectionPool(messagingPortTags);
        std::unique_ptr<ConnectionPool::ConnectionPtr> connection;
        // Lazily opens (and caches) a pooled connection to the sync source for
        // the rollback machinery.
        auto getConnection =
            [&connection, &connectionPool, oplogSocketTimeout, source]() -> DBClientBase* {
                if (!connection.get()) {
                    connection.reset(new ConnectionPool::ConnectionPtr(
                        &connectionPool, source, Date_t::now(), oplogSocketTimeout));
                };
                return connection->get();
            };

        log() << "starting rollback: " << remoteOplogStartStatus;
        _rollback(txn, source, getConnection);
        stop();
    }
}
// Polls the migration recipient (with exponential backoff up to 1024ms) until
// it reports a "steady" state and all cloned documents have been transferred,
// at which point it is appropriate to enter the critical section. Any error,
// mismatch, or excessive memory use cancels the clone (via the scope guard)
// and returns a non-OK status.
Status MigrationChunkClonerSourceLegacy::awaitUntilCriticalSectionIsAppropriate(
    OperationContext* txn, Milliseconds maxTimeToWait) {
    invariant(!txn->lockState()->isLocked());

    // On any early return, make sure the clone is cancelled on the recipient.
    auto scopedGuard = MakeGuard([&] { cancelClone(txn); });

    const auto startTime = Date_t::now();

    int iteration = 0;
    while ((Date_t::now() - startTime) < maxTimeToWait) {
        // Exponential sleep backoff, up to 1024ms. Don't sleep much on the first few iterations,
        // since we want empty chunk migrations to be fast.
        sleepmillis(1 << std::min(iteration, 10));
        iteration++;

        auto responseStatus = _callRecipient(BSON(kRecvChunkStatus << _args.getNss().ns()));
        if (!responseStatus.isOK()) {
            return {responseStatus.getStatus().code(),
                    str::stream()
                        << "Failed to contact recipient shard to monitor data transfer due to "
                        << responseStatus.getStatus().toString()};
        }

        BSONObj res = std::move(responseStatus.getValue());

        log() << "moveChunk data transfer progress: " << res
              << " my mem used: " << _memoryUsed;

        if (res["state"].String() == "steady") {
            // Ensure all cloned docs have actually been transferred
            const std::size_t locsRemaining = _cloneLocs.size();
            if (locsRemaining != 0) {
                return {
                    ErrorCodes::OperationIncomplete,
                    str::stream() << "cannot enter critical section before all data is cloned, "
                                  << locsRemaining
                                  << " locs were not transferred but to-shard thinks they are all cloned"};
            }

            // Success: disarm the cancel guard and allow the critical section.
            scopedGuard.Dismiss();
            return Status::OK();
        }

        if (res["state"].String() == "fail") {
            return {ErrorCodes::OperationFailed, "Data transfer error"};
        }

        if (res["ns"].str() != _args.getNss().ns() ||
            res["from"].str() != _donorCS.toString() ||
            !res["min"].isABSONObj() ||
            res["min"].Obj().woCompare(_args.getMinKey()) != 0 ||
            !res["max"].isABSONObj() ||
            res["max"].Obj().woCompare(_args.getMaxKey()) != 0) {
            // This can happen when the destination aborted the migration and received another
            // recvChunk before this thread sees the transition to the abort state. This is
            // currently possible only if multiple migrations are happening at once. This is an
            // unfortunate consequence of the shards not being able to keep track of multiple
            // incoming and outgoing migrations.
            return {ErrorCodes::OperationIncomplete,
                    "Destination shard aborted migration because a new one is running"};
        }

        if (_memoryUsed > 500 * 1024 * 1024) {
            // This is too much memory for us to use so we're going to abort the migration
            return {ErrorCodes::ExceededMemoryLimit,
                    "Aborting migration because of high memory usage"};
        }

        Status interruptStatus = txn->checkForInterruptNoAssert();
        if (!interruptStatus.isOK()) {
            return interruptStatus;
        }
    }

    // NOTE(review): on timeout the guard is dismissed, so cancelClone is NOT
    // called here — confirm the caller handles cleanup for this status.
    scopedGuard.Dismiss();
    return {ErrorCodes::ExceededTimeLimit, "Timed out waiting for the cloner to catch up"};
}
/** * @return true if had to do something */ bool checkShardVersion( DBClientBase * conn_in , const string& ns , ChunkManagerPtr refManager, bool authoritative , int tryNumber ) { // TODO: cache, optimize, etc... WriteBackListener::init( *conn_in ); DBConfigPtr conf = grid.getDBConfig( ns ); if ( ! conf ) return false; DBClientBase* conn = getVersionable( conn_in ); verify(conn); // errors thrown above unsigned long long officialSequenceNumber = 0; ChunkManagerPtr manager; const bool isSharded = conf->isSharded( ns ); if ( isSharded ) { manager = conf->getChunkManagerIfExists( ns , authoritative ); // It's possible the chunk manager was reset since we checked whether sharded was true, // so must check this here. if( manager ) officialSequenceNumber = manager->getSequenceNumber(); } // Check this manager against the reference manager if( isSharded && manager ){ Shard shard = Shard::make( conn->getServerAddress() ); if( refManager && ! refManager->compatibleWith( manager, shard ) ){ throw SendStaleConfigException( ns, str::stream() << "manager (" << manager->getVersion( shard ).toString() << " : " << manager->getSequenceNumber() << ") " << "not compatible with reference manager (" << refManager->getVersion( shard ).toString() << " : " << refManager->getSequenceNumber() << ") " << "on shard " << shard.getName() << " (" << shard.getAddress().toString() << ")", refManager->getVersion( shard ), manager->getVersion( shard ) ); } } else if( refManager ){ Shard shard = Shard::make( conn->getServerAddress() ); string msg( str::stream() << "not sharded (" << ( (manager.get() == 0) ? 
string( "<none>" ) : str::stream() << manager->getSequenceNumber() ) << ") but has reference manager (" << refManager->getSequenceNumber() << ") " << "on conn " << conn->getServerAddress() << " (" << conn_in->getServerAddress() << ")" ); throw SendStaleConfigException( ns, msg, refManager->getVersion( shard ), ShardChunkVersion( 0, OID() )); } // has the ChunkManager been reloaded since the last time we updated the connection-level version? // (ie., last time we issued the setShardVersions below) unsigned long long sequenceNumber = connectionShardStatus.getSequence(conn,ns); if ( sequenceNumber == officialSequenceNumber ) { return false; } ShardChunkVersion version = ShardChunkVersion( 0, OID() ); if ( isSharded && manager ) { version = manager->getVersion( Shard::make( conn->getServerAddress() ) ); } if( ! version.isSet() ){ LOG(0) << "resetting shard version of " << ns << " on " << conn->getServerAddress() << ", " << ( ! isSharded ? "no longer sharded" : ( ! manager ? "no chunk manager found" : "version is zero" ) ) << endl; } LOG(2) << " have to set shard version for conn: " << conn->getServerAddress() << " ns:" << ns << " my last seq: " << sequenceNumber << " current: " << officialSequenceNumber << " version: " << version << " manager: " << manager.get() << endl; const string versionableServerAddress(conn->getServerAddress()); BSONObj result; if ( setShardVersion( *conn , ns , version , authoritative , result ) ) { // success! LOG(1) << " setShardVersion success: " << result << endl; connectionShardStatus.setSequence( conn , ns , officialSequenceNumber ); return true; } LOG(1) << " setShardVersion failed!\n" << result << endl; if ( result["need_authoritative"].trueValue() ) massert( 10428 , "need_authoritative set but in authoritative mode already" , ! authoritative ); if ( ! 
authoritative ) { // use the original connection and get a fresh versionable connection // since conn can be invalidated (or worse, freed) after the failure checkShardVersion(conn_in, ns, refManager, 1, tryNumber + 1); return true; } if ( result["reloadConfig"].trueValue() ) { if( result["version"].timestampTime() == 0 ){ warning() << "reloading full configuration for " << conf->getName() << ", connection state indicates significant version changes" << endl; // reload db conf->reload(); } else { // reload config conf->getChunkManager( ns , true ); } } const int maxNumTries = 7; if ( tryNumber < maxNumTries ) { LOG( tryNumber < ( maxNumTries / 2 ) ? 1 : 0 ) << "going to retry checkShardVersion host: " << versionableServerAddress << " " << result << endl; sleepmillis( 10 * tryNumber ); // use the original connection and get a fresh versionable connection // since conn can be invalidated (or worse, freed) after the failure checkShardVersion(conn_in, ns, refManager, true, tryNumber + 1); return true; } string errmsg = str::stream() << "setShardVersion failed host: " << versionableServerAddress << " " << result; log() << " " << errmsg << endl; massert( 10429 , errmsg , 0 ); return true; }
bool RangeDeleter::deleteNow(const std::string& ns, const BSONObj& min, const BSONObj& max, const BSONObj& shardKeyPattern, bool secondaryThrottle, string* errMsg) { if (stopRequested()) { *errMsg = "deleter is already stopped."; return false; } string dummy; if (errMsg == NULL) errMsg = &dummy; NSMinMax deleteRange(ns, min, max); { scoped_lock sl(_queueMutex); if (!canEnqueue_inlock(ns, min, max, errMsg)) { return false; } _deleteSet.insert(&deleteRange); _stats->incTotalDeletes_inlock(); // Note: count for pending deletes is an integral part of the shutdown story. // Therefore, to simplify things, there is no "pending" state for deletes in // deleteNow, the state transition is simply inProgress -> done. _stats->incInProgressDeletes_inlock(); } set<CursorId> cursorsToWait; _env->getCursorIds(ns, &cursorsToWait); long long checkIntervalMillis = 5; if (!cursorsToWait.empty()) { log() << "rangeDeleter waiting for " << cursorsToWait.size() << " cursors in " << ns << " to finish" << endl; } while (!cursorsToWait.empty()) { set<CursorId> cursorsNow; _env->getCursorIds(ns, &cursorsNow); set<CursorId> cursorsLeft; std::set_intersection(cursorsToWait.begin(), cursorsToWait.end(), cursorsNow.begin(), cursorsNow.end(), std::inserter(cursorsLeft, cursorsLeft.end())); cursorsToWait.swap(cursorsLeft); if (stopRequested()) { *errMsg = "deleter was stopped."; scoped_lock sl(_queueMutex); _deleteSet.erase(&deleteRange); _stats->decInProgressDeletes_inlock(); _stats->decTotalDeletes_inlock(); if (!_stats->hasInProgress_inlock()) { _nothingInProgressCV.notify_one(); } return false; } if (checkIntervalMillis < MaxCurorCheckIntervalMillis) { checkIntervalMillis *= 2; } sleepmillis(checkIntervalMillis); } bool result = _env->deleteRange(ns, min, max, shardKeyPattern, secondaryThrottle, errMsg); { scoped_lock sl(_queueMutex); _deleteSet.erase(&deleteRange); _stats->decInProgressDeletes_inlock(); _stats->decTotalDeletes_inlock(); if (!_stats->hasInProgress_inlock()) { 
_nothingInProgressCV.notify_one(); } } return result; }
/** * @return true if had to do something */ bool checkShardVersion( DBClientBase& conn_in , const string& ns , bool authoritative , int tryNumber ) { // TODO: cache, optimize, etc... WriteBackListener::init( conn_in ); DBConfigPtr conf = grid.getDBConfig( ns ); if ( ! conf ) return false; DBClientBase* conn = 0; switch ( conn_in.type() ) { case ConnectionString::INVALID: assert(0); break; case ConnectionString::MASTER: // great conn = &conn_in; break; case ConnectionString::PAIR: assert( ! "pair not support for sharding" ); break; case ConnectionString::SYNC: // TODO: we should check later that we aren't actually sharded on this conn = &conn_in; break; case ConnectionString::SET: DBClientReplicaSet* set = (DBClientReplicaSet*)&conn_in; conn = &(set->masterConn()); break; } assert(conn); unsigned long long officialSequenceNumber = 0; ChunkManagerPtr manager; const bool isSharded = conf->isSharded( ns ); if ( isSharded ) { manager = conf->getChunkManagerIfExists( ns , authoritative ); // It's possible the chunk manager was reset since we checked whether sharded was true, // so must check this here. if( manager ) officialSequenceNumber = manager->getSequenceNumber(); } // has the ChunkManager been reloaded since the last time we updated the connection-level version? // (ie., last time we issued the setShardVersions below) unsigned long long sequenceNumber = connectionShardStatus.getSequence(conn,ns); if ( sequenceNumber == officialSequenceNumber ) { return false; } ShardChunkVersion version = 0; if ( isSharded && manager ) { version = manager->getVersion( Shard::make( conn->getServerAddress() ) ); } LOG(2) << " have to set shard version for conn: " << conn << " ns:" << ns << " my last seq: " << sequenceNumber << " current: " << officialSequenceNumber << " version: " << version << " manager: " << manager.get() << endl; BSONObj result; if ( setShardVersion( *conn , ns , version , authoritative , result ) ) { // success! 
LOG(1) << " setShardVersion success: " << result << endl; connectionShardStatus.setSequence( conn , ns , officialSequenceNumber ); return true; } LOG(1) << " setShardVersion failed!\n" << result << endl; if ( result["need_authoritative"].trueValue() ) massert( 10428 , "need_authoritative set but in authoritative mode already" , ! authoritative ); if ( ! authoritative ) { checkShardVersion( *conn , ns , 1 , tryNumber + 1 ); return true; } if ( result["reloadConfig"].trueValue() ) { if( result["version"].timestampTime() == 0 ){ // reload db conf->reload(); } else { // reload config conf->getChunkManager( ns , true ); } } const int maxNumTries = 7; if ( tryNumber < maxNumTries ) { LOG( tryNumber < ( maxNumTries / 2 ) ? 1 : 0 ) << "going to retry checkShardVersion host: " << conn->getServerAddress() << " " << result << endl; sleepmillis( 10 * tryNumber ); checkShardVersion( *conn , ns , true , tryNumber + 1 ); return true; } string errmsg = str::stream() << "setShardVersion failed host: " << conn->getServerAddress() << " " << result; log() << " " << errmsg << endl; massert( 10429 , errmsg , 0 ); return true; }
/**
 * Body for one worker thread of the distributed-lock skew stress test.
 *
 * Reads test parameters from 'cmdObj' (lockName, skewRange, threadWait, threadSleep,
 * takeoverMS, hangThreads), applies a per-thread virtual clock skew, then repeatedly
 * tries to acquire/release the shared DistributedLock while checking mutual exclusion
 * via the shared 'count' counter.  Results (errors/skew/takeover) are appended to
 * 'result'.
 *
 * Shared state used: 'lock' (the DistributedLock under test), 'count', 'keepGoing',
 * and this->name — members of the enclosing test command.
 */
void runThread(ConnectionString& hostConn, unsigned threadId, unsigned seed,
               BSONObj& cmdObj, BSONObjBuilder& result) {

    stringstream ss;
    ss << "thread-" << threadId;
    setThreadName(ss.str().c_str());

    // Lock name
    string lockName = string_field(cmdObj, "lockName", this->name + "_lock");

    // Range of clock skew in diff threads
    int skewRange = (int) number_field(cmdObj, "skewRange", 1);

    // How long to wait with the lock
    int threadWait = (int) number_field(cmdObj, "threadWait", 30);
    if(threadWait <= 0) threadWait = 1;

    // Max amount of time (ms) a thread waits before checking the lock again
    int threadSleep = (int) number_field(cmdObj, "threadSleep", 30);
    if(threadSleep <= 0) threadSleep = 1;

    // How long until the lock is forced in ms, only compared locally
    unsigned long long takeoverMS = (unsigned long long) number_field(cmdObj, "takeoverMS", 0);

    // Whether or not we should hang some threads
    int hangThreads = (int) number_field(cmdObj, "hangThreads", 0);

    // Deterministic per-thread RNGs, all seeded from 'seed'.
    boost::mt19937 gen((boost::mt19937::result_type) seed);

    boost::variate_generator<boost::mt19937&, boost::uniform_int<> > randomSkew(gen, boost::uniform_int<>(0, skewRange));
    boost::variate_generator<boost::mt19937&, boost::uniform_int<> > randomWait(gen, boost::uniform_int<>(1, threadWait));
    boost::variate_generator<boost::mt19937&, boost::uniform_int<> > randomSleep(gen, boost::uniform_int<>(1, threadSleep));
    boost::variate_generator<boost::mt19937&, boost::uniform_int<> > randomNewLock(gen, boost::uniform_int<>(0, 3));

    int skew = 0;
    if (!lock.get()) {

        // Pick a skew, but the first two threads skew the whole range
        if(threadId == 0)
            skew = -skewRange / 2;
        else if(threadId == 1)
            skew = skewRange / 2;
        else skew = randomSkew() - (skewRange / 2);

        // Skew this thread
        jsTimeVirtualThreadSkew( skew );

        log() << "Initializing lock with skew of " << skew << " for thread " << threadId << endl;

        lock.reset(new DistributedLock(hostConn, lockName, takeoverMS, true ));

        log() << "Skewed time " << jsTime() << "  for thread " << threadId << endl
              << "  max wait (with lock: " << threadWait << ", after lock: " << threadSleep << ")" << endl
              << "  takeover in " << takeoverMS << "(ms remote)" << endl;

    }

    DistributedLock* myLock = lock.get();

    bool errors = false;
    BSONObj lockObj;
    while (keepGoing) {
        try {

            if (myLock->lock_try("Testing distributed lock with skew.", false, &lockObj )) {

                log() << "**** Locked for thread " << threadId << " with ts " << lockObj["ts"] << endl;

                // Re-entrant acquire must succeed on alternating iterations...
                if( count.loadRelaxed() % 2 == 1 && ! myLock->lock_try( "Testing lock re-entry.", true ) ) {
                    errors = true;
                    log() << "**** !Could not re-enter lock already held" << endl;
                    break;
                }

                // ...and a non-re-entrant acquire of a held lock must fail.
                if( count.loadRelaxed() % 3 == 1 && myLock->lock_try( "Testing lock non-re-entry.", false ) ) {
                    errors = true;
                    log() << "**** !Invalid lock re-entry" << endl;
                    break;
                }

                // Mutual-exclusion check: nobody else may bump 'count' while we sleep
                // holding the lock.
                int before = count.addAndFetch(1);
                int sleep = randomWait();
                sleepmillis(sleep);
                int after = count.loadRelaxed();

                if(after != before) {
                    errors = true;
                    log() << "**** !Bad increment while sleeping with lock for: " << sleep << "ms" << endl;
                    break;
                }

                // Unlock only half the time...
                if(hangThreads == 0 || threadId % hangThreads != 0) {
                    log() << "**** Unlocking for thread " << threadId << " with ts " << lockObj["ts"] << endl;
                    myLock->unlock( &lockObj );
                }
                else {
                    log() << "**** Not unlocking for thread " << threadId << endl;
                    verify( DistributedLock::killPinger( *myLock ) );
                    // We're simulating a crashed process...
                    break;
                }
            }

        }
        catch( const DBException& ex ) {
            log() << "*** !Could not try distributed lock." << causedBy( ex ) << endl;
            break;
        }

        // Create a new lock 1/3 of the time
        if( randomNewLock() > 1 ){
            lock.reset(new DistributedLock( hostConn, lockName, takeoverMS, true ));
            myLock = lock.get();
        }

        sleepmillis(randomSleep());
    }

    result << "errors" << errors
           << "skew" << skew
           << "takeover" << (long long) takeoverMS
           << "localTimeout" << (takeoverMS > 0);

}
/**
 * Fetcher callback invoked with one oplog query/getMore batch from the sync source.
 *
 * Validates the start of the remote oplog on the first batch (triggering rollback via
 * 'remoteOplogStartStatus' on mismatch), pushes each fetched document into _buffer
 * while updating last-fetched bookkeeping and stats, and finally fills in 'bob' to
 * request another getMore.  Returns early — WITHOUT filling 'bob', which stops the
 * fetcher — on error, shutdown, pause, drain-to-primary transition, or a sync-source
 * change.
 */
void BackgroundSync::_fetcherCallback(const StatusWith<Fetcher::QueryResponse>& result,
                                      BSONObjBuilder* bob,
                                      const HostAndPort& source,
                                      OpTime lastOpTimeFetched,
                                      long long lastFetchedHash,
                                      Status* remoteOplogStartStatus) {
    // if target cut connections between connecting and querying (for
    // example, because it stepped down) we might not have a cursor
    if (!result.isOK()) {
        return;
    }

    if (inShutdown()) {
        return;
    }

    // Check if we have been paused.
    if (isPaused()) {
        return;
    }

    const auto& queryResponse = result.getValue();
    const auto& documents = queryResponse.documents;
    auto documentBegin = documents.cbegin();
    auto documentEnd = documents.cend();

    // Check start of remote oplog and, if necessary, stop fetcher to execute rollback.
    if (queryResponse.first) {
        // Lazily yields documents one at a time; consumes from documentBegin so the
        // loop below skips whatever checkRemoteOplogStart already read.
        auto getNextOperation = [&documentBegin, documentEnd]() -> StatusWith<BSONObj> {
            if (documentBegin == documentEnd) {
                return Status(ErrorCodes::OplogStartMissing, "remote oplog start missing");
            }
            return *(documentBegin++);
        };

        *remoteOplogStartStatus =
            checkRemoteOplogStart(getNextOperation, lastOpTimeFetched, lastFetchedHash);
        if (!remoteOplogStartStatus->isOK()) {
            // Stop fetcher and execute rollback.
            return;
        }

        // If this is the first batch and no rollback is needed, we should have advanced
        // the document iterator.
        invariant(documentBegin != documents.cbegin());
    }

    // process documents
    int currentBatchMessageSize = 0;
    for (auto documentIter = documentBegin; documentIter != documentEnd; ++documentIter) {
        if (inShutdown()) {
            return;
        }

        // If we are transitioning to primary state, we need to leave
        // this loop in order to go into bgsync-pause mode.
        if (_replCoord->isWaitingForApplierToDrain() || _replCoord->getMemberState().primary()) {
            LOG(1) << "waiting for draining or we are primary, not adding more ops to buffer";
            return;
        }

        // At this point, we are guaranteed to have at least one thing to read out
        // of the fetcher.
        const BSONObj& o = *documentIter;
        currentBatchMessageSize += o.objsize();
        opsReadStats.increment();

        if (MONGO_FAIL_POINT(stepDownWhileDrainingFailPoint)) {
            sleepsecs(20);
        }

        {
            stdx::unique_lock<stdx::mutex> lock(_mutex);
            _appliedBuffer = false;
        }

        OCCASIONALLY {
            LOG(2) << "bgsync buffer has " << _buffer.size() << " bytes";
        }

        bufferCountGauge.increment();
        bufferSizeGauge.increment(getSize(o));
        _buffer.push(o);

        // Record hash/optime of the op just buffered so the next query resumes there.
        {
            stdx::unique_lock<stdx::mutex> lock(_mutex);
            _lastFetchedHash = o["h"].numberLong();
            _lastOpTimeFetched = extractOpTime(o);
            LOG(3) << "lastOpTimeFetched: " << _lastOpTimeFetched;
        }
    }

    // record time for each batch
    getmoreReplStats.recordMillis(queryResponse.elapsedMillis.count());
    networkByteStats.increment(currentBatchMessageSize);

    // Check some things periodically
    // (whenever we run out of items in the
    // current cursor batch)
    if (currentBatchMessageSize > 0 && currentBatchMessageSize < BatchIsSmallish) {
        // on a very low latency network, if we don't wait a little, we'll be
        // getting ops to write almost one at a time.  this will both be expensive
        // for the upstream server as well as potentially defeating our parallel
        // application of batches on the secondary.
        //
        // the inference here is basically if the batch is really small, we are
        // "caught up".
        //
        sleepmillis(SleepToAllowBatchingMillis);
    }

    // If we are transitioning to primary state, we need to leave
    // this loop in order to go into bgsync-pause mode.
    if (_replCoord->isWaitingForApplierToDrain() || _replCoord->getMemberState().primary()) {
        return;
    }

    // re-evaluate quality of sync target
    if (_shouldChangeSyncSource(source)) {
        return;
    }

    // Check if we have been paused.
    if (isPaused()) {
        return;
    }

    // We fill in 'bob' to signal the fetcher to process with another getMore.
    invariant(bob);
    bob->append("getMore", queryResponse.cursorId);
    bob->append("collection", queryResponse.nss.coll());
    bob->append("maxTimeMS", int(fetcherMaxTimeMS.count()));
}
/**
 * Performs MONGODB-CR (challenge-response) authentication for 'user'.
 *
 * Expects 'cmdObj' to carry "key" (the client's MD5 response) and "nonce" (echo of
 * the server-issued nonce).  Verifies the pending nonce session, recomputes the
 * expected digest of nonce + username + stored password digest, and on match
 * authorizes the user on the current client's AuthorizationSession.
 *
 * Returns Status::OK() on success; ProtocolError / AuthenticationFailed / BadValue
 * otherwise.  The small sleeps on failure paths presumably slow down brute-force
 * probing — TODO(review): confirm intent.
 */
Status CmdAuthenticate::_authenticateCR(const UserName& user, const BSONObj& cmdObj) {

    // The internal cluster user must use x509 when that cluster auth mode is active.
    if (user == internalSecurity.user->getName() &&
        serverGlobalParams.clusterAuthMode == "x509") {
        return Status(ErrorCodes::AuthenticationFailed,
                      "Mechanism x509 is required for internal cluster authentication");
    }

    if (!_areNonceAuthenticateCommandsEnabled) {
        // SERVER-8461, MONGODB-CR must be enabled for authenticating the internal user, so that
        // cluster members may communicate with each other.
        if (user != internalSecurity.user->getName()) {
            return Status(ErrorCodes::BadValue, _nonceAuthenticateCommandsDisabledMessage);
        }
    }

    string key = cmdObj.getStringField("key");
    string received_nonce = cmdObj.getStringField("nonce");

    if( user.getUser().empty() || key.empty() || received_nonce.empty() ) {
        sleepmillis(10);
        return Status(ErrorCodes::ProtocolError,
                      "field missing/wrong type in received authenticate command");
    }

    stringstream digestBuilder;

    {
        // Take ownership of (and clear) any pending authentication session; the nonce
        // is single-use.
        ClientBasic *client = ClientBasic::getCurrent();
        boost::scoped_ptr<AuthenticationSession> session;
        client->swapAuthenticationSession(session);

        if (!session || session->getType() != AuthenticationSession::SESSION_TYPE_MONGO) {
            sleepmillis(30);
            return Status(ErrorCodes::ProtocolError, "No pending nonce");
        }
        else {
            // The client must echo back the exact hex form of the issued nonce.
            nonce64 nonce = static_cast<MongoAuthenticationSession*>(session.get())->getNonce();
            digestBuilder << hex << nonce;
            if (digestBuilder.str() != received_nonce) {
                sleepmillis(30);
                return Status(ErrorCodes::AuthenticationFailed, "Received wrong nonce.");
            }
        }
    }

    User* userObj;
    Status status = getGlobalAuthorizationManager()->acquireUser(user, &userObj);
    if (!status.isOK()) {
        // Failure to find the privilege document indicates no-such-user, a fact that we do not
        // wish to reveal to the client.  So, we return AuthenticationFailed rather than passing
        // through the returned status.
        return Status(ErrorCodes::AuthenticationFailed, status.toString());
    }
    string pwd = userObj->getCredentials().password;
    getGlobalAuthorizationManager()->releaseUser(userObj);

    // Expected response: MD5(hex(nonce) + username + password-digest).
    md5digest d;
    {
        digestBuilder << user.getUser() << pwd;
        string done = digestBuilder.str();

        md5_state_t st;
        md5_init(&st);
        md5_append(&st, (const md5_byte_t *) done.c_str(), done.size());
        md5_finish(&st, d);
    }

    string computed = digestToString( d );

    // NOTE(review): this is not a constant-time comparison — consider whether a
    // timing-safe compare is warranted here.
    if ( key != computed ) {
        return Status(ErrorCodes::AuthenticationFailed, "key mismatch");
    }

    AuthorizationSession* authorizationSession =
        ClientBasic::getCurrent()->getAuthorizationSession();
    status = authorizationSession->addAndAuthorizeUser(user);
    if (!status.isOK()) {
        return status;
    }

    return Status::OK();
}
bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { lastError.disableForCommand(); ShardedConnectionInfo* info = ShardedConnectionInfo::get( true ); bool authoritative = cmdObj.getBoolField( "authoritative" ); string configdb = cmdObj["configdb"].valuestrsafe(); { // configdb checking if ( configdb.size() == 0 ) { errmsg = "no configdb"; return false; } if ( shardingState.enabled() ) { if ( configdb != shardingState.getConfigServer() ) { errmsg = "specified a different configdb!"; return false; } } else { if ( ! authoritative ) { result.appendBool( "need_authoritative" , true ); errmsg = "first setShardVersion"; return false; } shardingState.enable( configdb ); configServer.init( configdb ); } } if ( cmdObj["shard"].type() == String ) { shardingState.gotShardName( cmdObj["shard"].String() ); shardingState.gotShardHost( cmdObj["shardHost"].String() ); } { // setting up ids if ( cmdObj["serverID"].type() != jstOID ) { // TODO: fix this //errmsg = "need serverID to be an OID"; //return 0; } else { OID clientId = cmdObj["serverID"].__oid(); if ( ! info->hasID() ) { info->setID( clientId ); } else if ( clientId != info->getID() ) { errmsg = "server id has changed!"; return 0; } } } unsigned long long version = extractVersion( cmdObj["version"] , errmsg ); if ( errmsg.size() ) { return false; } string ns = cmdObj["setShardVersion"].valuestrsafe(); if ( ns.size() == 0 ) { errmsg = "need to speciy fully namespace"; return false; } const ConfigVersion oldVersion = info->getVersion(ns); const ConfigVersion globalVersion = shardingState.getVersion(ns); if ( oldVersion > 0 && globalVersion == 0 ) { // this had been reset info->setVersion( ns , 0 ); } if ( version == 0 && globalVersion == 0 ) { // this connection is cleaning itself info->setVersion( ns , 0 ); return true; } if ( version == 0 && globalVersion > 0 ) { if ( ! 
authoritative ) { result.appendBool( "need_authoritative" , true ); result.append( "ns" , ns ); result.appendTimestamp( "globalVersion" , globalVersion ); result.appendTimestamp( "oldVersion" , oldVersion ); errmsg = "dropping needs to be authoritative"; return false; } log() << "wiping data for: " << ns << endl; result.appendTimestamp( "beforeDrop" , globalVersion ); // only setting global version on purpose // need clients to re-find meta-data shardingState.resetVersion( ns ); info->setVersion( ns , 0 ); return true; } if ( version < oldVersion ) { errmsg = "you already have a newer version of collection '" + ns + "'"; result.append( "ns" , ns ); result.appendTimestamp( "oldVersion" , oldVersion ); result.appendTimestamp( "newVersion" , version ); result.appendTimestamp( "globalVersion" , globalVersion ); return false; } if ( version < globalVersion ) { while ( shardingState.inCriticalMigrateSection() ) { dbtemprelease r; sleepmillis(2); OCCASIONALLY log() << "waiting till out of critical section" << endl; } errmsg = "going to older version for global for collection '" + ns + "'"; result.append( "ns" , ns ); result.appendTimestamp( "version" , version ); result.appendTimestamp( "globalVersion" , globalVersion ); return false; } if ( globalVersion == 0 && ! cmdObj.getBoolField( "authoritative" ) ) { // need authoritative for first look result.append( "ns" , ns ); result.appendBool( "need_authoritative" , true ); errmsg = "first time for collection '" + ns + "'"; return false; } { dbtemprelease unlock; ShardChunkVersion currVersion = version; if ( ! 
shardingState.trySetVersion( ns , currVersion ) ) { errmsg = str::stream() << "client version differs from config's for colleciton '" << ns << "'"; result.append( "ns" , ns ); result.appendTimestamp( "version" , version ); result.appendTimestamp( "globalVersion" , currVersion ); return false; } } info->setVersion( ns , version ); result.appendTimestamp( "oldVersion" , oldVersion ); result.append( "ok" , 1 ); return true; }