void Socket::recv( char * buf , int len ) { int retries = 0; while( len > 0 ) { int ret = -1; if (MONGO_FAIL_POINT(throwSockExcep)) { #if defined(_WIN32) WSASetLastError(WSAENETUNREACH); #else errno = ENETUNREACH; #endif } else { ret = unsafe_recv(buf, len); } if (ret <= 0) { _handleRecvError(ret, len, &retries); continue; } if ( len <= 4 && ret != len ) LOG(_logLevel) << "Socket recv() got " << ret << " bytes wanted len=" << len << endl; fassert(16508, ret <= len); len -= ret; buf += ret; } }
// sends all data or throws an exception void Socket::send( const char * data , int len, const char *context ) { while( len > 0 ) { int ret = -1; if (MONGO_FAIL_POINT(throwSockExcep)) { #if defined(_WIN32) WSASetLastError(WSAENETUNREACH); #else errno = ENETUNREACH; #endif } else { ret = _send(data, len); } if (ret == -1) _handleSendError(ret, context); _bytesOut += ret; fassert(16507, ret <= len); len -= ret; data += ret; } }
/**
 * Runs a getMore against a cluster cursor: checks the cursor out of the cursor manager, pulls
 * results until the batch is full (by count or by size) or the stream ends, hands the cursor
 * back to the manager and returns the batch.
 *
 * Any non-OK status from the checkout, from setting the awaitData timeout, or from fetching the
 * next result is returned to the caller as-is. The returned cursor id is 0 when the cursor was
 * exhausted, otherwise the id from the request.
 */
StatusWith<CursorResponse> ClusterFind::runGetMore(OperationContext* txn,
                                                   const GetMoreRequest& request) {
    auto manager = grid.getCursorManager();

    auto checkedOut = manager->checkOutCursor(request.nss, request.cursorid);
    if (!checkedOut.isOK()) {
        return checkedOut.getStatus();
    }
    auto& cursor = checkedOut.getValue();
    invariant(request.cursorid == cursor.getCursorId());

    // If the fail point is enabled, busy wait until it is disabled.
    while (MONGO_FAIL_POINT(keepCursorPinnedDuringGetMore)) {
    }

    if (request.awaitDataTimeout) {
        auto timeoutStatus = cursor.setAwaitDataTimeout(*request.awaitDataTimeout);
        if (!timeoutStatus.isOK()) {
            return timeoutStatus;
        }
    }

    std::vector<BSONObj> results;
    int bufferedBytes = 0;
    long long wantedCount = request.batchSize.value_or(0);
    long long firstIndex = cursor.getNumReturnedSoFar();
    auto finalState = ClusterCursorManager::CursorState::NotExhausted;

    while (!FindCommon::enoughForGetMore(wantedCount, results.size())) {
        auto fetched = cursor.next();
        if (!fetched.isOK()) {
            return fetched.getStatus();
        }

        if (!fetched.getValue()) {
            // End-of-stream. A tailable cursor stays open; anything else is now exhausted.
            if (!cursor.isTailable()) {
                finalState = ClusterCursorManager::CursorState::Exhausted;
            }
            break;
        }

        if (!FindCommon::haveSpaceForNext(*fetched.getValue(), results.size(), bufferedBytes)) {
            // The document does not fit in this batch; queue it back on the cursor.
            cursor.queueResult(*fetched.getValue());
            break;
        }

        // Add the document to the batch, accounting for the space overhead of returning it
        // inside a BSON array.
        bufferedBytes += (fetched.getValue()->objsize() + kPerDocumentOverheadBytesUpperBound);
        results.push_back(std::move(*fetched.getValue()));
    }

    // Transfer ownership of the cursor back to the cursor manager.
    cursor.returnCursor(finalState);

    CursorId replyCursorId = request.cursorid;
    if (finalState == ClusterCursorManager::CursorState::Exhausted) {
        replyCursorId = CursorId(0);
    }
    return CursorResponse(request.nss, replyCursorId, std::move(results), firstIndex);
}
/**
 * Starts the asynchronous send/receive sequence for the command attached to 'op' and invokes
 * 'handler' once the response body has been received, or earlier with ProtocolError when the
 * response header does not answer the request that was sent.
 *
 * When the NetworkInterfaceASIOasyncRunCommandFail fail point is active, nothing is sent and the
 * operation is run through _validateAndRun() with a "network unreachable" error instead.
 *
 * The callbacks are declared last-stage-first because each stage captures the one after it:
 *   1 - send the given command
 *   2 - receive a header for the response
 *   3 - validate the header and receive the response body
 *   4 - advance the state machine by calling handler()
 */
void NetworkInterfaceASIO::_asyncRunCommand(AsyncOp* op, NetworkOpHandler handler) {
    LOG(2) << "Starting asynchronous command " << op->request().id << " on host "
           << op->request().target.toString();

    if (MONGO_FAIL_POINT(NetworkInterfaceASIOasyncRunCommandFail)) {
        _validateAndRun(op, asio::error::basic_errors::network_unreachable, [] {});
        return;
    }

    auto cmd = op->command();

    // Stage 4: body received. _validateAndRun is not called here; the caller is assumed to
    // do so.
    auto onBodyReceived = [this, cmd, handler, op](std::error_code ec, size_t bytes) {
        handler(ec, bytes);
    };

    // Stage 3: header received. The operation could have been canceled after starting the
    // command but before receiving the header, hence the trip through _validateAndRun.
    auto onHeaderReceived =
        [this, cmd, handler, onBodyReceived, op](std::error_code ec, size_t bytes) {
            _validateAndRun(op, ec, [this, op, onBodyReceived, ec, bytes, cmd, handler] {
                // The response must be addressed to the message that was sent.
                uint32_t sentId = cmd->toSend().header().getId();
                uint32_t respondedToId = cmd->header().constView().getResponseToMsgId();
                if (respondedToId != sentId) {
                    LOG(3) << "got wrong response:"
                           << " expected response id: " << sentId
                           << ", got response id: " << respondedToId;
                    return handler(make_error_code(ErrorCodes::ProtocolError), bytes);
                }

                asyncRecvMessageBody(cmd->conn().stream(),
                                     &cmd->header(),
                                     &cmd->toRecv(),
                                     std::move(onBodyReceived));
            });
        };

    // Stage 2: request sent; start reading the response header.
    auto onMessageSent =
        [this, cmd, handler, onHeaderReceived, op](std::error_code ec, size_t bytes) {
            _validateAndRun(op, ec, [this, cmd, op, onHeaderReceived] {
                asyncRecvMessageHeader(
                    cmd->conn().stream(), &cmd->header(), std::move(onHeaderReceived));
            });
        };

    // Stage 1: send the request.
    asyncSendMessage(cmd->conn().stream(), &cmd->toSend(), std::move(onMessageSent));
}
bool OperationContext::hasDeadlineExpired() const { if (!hasDeadline()) { return false; } if (MONGO_FAIL_POINT(maxTimeNeverTimeOut)) { return false; } if (MONGO_FAIL_POINT(maxTimeAlwaysTimeOut)) { return true; } // TODO: Remove once all OperationContexts are properly connected to Clients and ServiceContexts // in tests. if (MONGO_unlikely(!getClient() || !getServiceContext())) { return false; } const auto now = getServiceContext()->getFastClockSource()->now(); return now >= getDeadline(); }
/**
 * Executor callback that inserts the documents buffered in '_documentsToInsert' through the
 * collection loader.
 *
 * - A non-OK callback status is recorded on 'onCompletionGuard' and nothing is inserted.
 * - An empty buffer produces a warning; the result is set to OK only when this is the last batch.
 * - An insert failure is recorded on 'onCompletionGuard'.
 * - On success the copy statistics and progress meter are updated (before the insert is
 *   attempted) and, when 'lastBatch' is true, the result is set to OK.
 *
 * The initialSyncHangDuringCollectionClone fail point can park this callback (with the mutex
 * released) once enough documents have been copied for the configured namespace.
 */
void CollectionCloner::_insertDocumentsCallback(
    const executor::TaskExecutor::CallbackArgs& cbd,
    bool lastBatch,
    std::shared_ptr<OnCompletionGuard> onCompletionGuard) {
    if (!cbd.status.isOK()) {
        stdx::lock_guard<stdx::mutex> guard(_mutex);
        onCompletionGuard->setResultAndCancelRemainingWork_inlock(guard, cbd.status);
        return;
    }

    UniqueLock lk(_mutex);

    if (_documentsToInsert.empty()) {
        warning() << "_insertDocumentsCallback, but no documents to insert for ns:" << _destNss;
        if (lastBatch) {
            onCompletionGuard->setResultAndCancelRemainingWork_inlock(lk, Status::OK());
        }
        return;
    }

    // Take ownership of the buffered documents, leaving the member buffer empty.
    std::vector<BSONObj> batch;
    _documentsToInsert.swap(batch);

    _stats.documentsCopied += batch.size();
    ++_stats.fetchBatches;
    _progressMeter.hit(static_cast<int>(batch.size()));

    invariant(_collLoader);
    const auto insertStatus = _collLoader->insertDocuments(batch.cbegin(), batch.cend());
    if (!insertStatus.isOK()) {
        onCompletionGuard->setResultAndCancelRemainingWork_inlock(lk, insertStatus);
        return;
    }

    MONGO_FAIL_POINT_BLOCK(initialSyncHangDuringCollectionClone, options) {
        const BSONObj& fpData = options.getData();
        const bool nsMatches = fpData["namespace"].String() == _destNss.ns();
        if (nsMatches &&
            static_cast<int>(_stats.documentsCopied) >= fpData["numDocsToClone"].numberInt()) {
            // Do not hold the mutex while parked on the fail point.
            lk.unlock();
            log() << "initial sync - initialSyncHangDuringCollectionClone fail point "
                     "enabled. Blocking until fail point is disabled.";
            while (MONGO_FAIL_POINT(initialSyncHangDuringCollectionClone) &&
                   !_isShuttingDown()) {
                mongo::sleepsecs(1);
            }
            lk.lock();
        }
    }

    if (lastBatch) {
        // The last batch has been copied over: clean up and report success.
        onCompletionGuard->setResultAndCancelRemainingWork_inlock(lk, Status::OK());
    }
}
// sends all data or throws an exception void Socket::send( const char * data , int len, const char *context ) { while( len > 0 ) { int ret = _send( data , len ); if (ret == -1 || MONGO_FAIL_POINT(throwSockExcep)) { _handleSendError(ret, context); } _bytesOut += ret; fassert(16507, ret <= len); len -= ret; data += ret; } }
/**
 * Commits this unit of work.
 *
 * Preconditions (enforced with invariants): the unit of work has been neither committed nor
 * released, and the operation context is in the kActiveUnitOfWork state.
 *
 * Only a top-level unit of work commits the recovery unit and moves the operation context to
 * kNotInUnitOfWork; every unit of work ends the locker's write unit of work and marks itself
 * committed. The sleepBeforeCommit fail point delays a top-level commit by 100 milliseconds.
 */
void WriteUnitOfWork::commit() {
    invariant(!_committed);
    invariant(!_released);
    invariant(_opCtx->_ruState == RecoveryUnitState::kActiveUnitOfWork);

    const bool isOutermost = _toplevel;

    if (isOutermost && MONGO_FAIL_POINT(sleepBeforeCommit)) {
        sleepFor(Milliseconds(100));
    }

    if (isOutermost) {
        _opCtx->recoveryUnit()->commitUnitOfWork();
        _opCtx->_ruState = RecoveryUnitState::kNotInUnitOfWork;
    }

    _opCtx->lockState()->endWriteUnitOfWork();
    _committed = true;
}
/**
 * Decides whether the record at 'loc' needs to be fetched before it is used.
 *
 * @return a newly allocated MmapV1RecordFetcher (created with 'new') when a fetch is required,
 *         NULL otherwise.
 *
 * For testing: while the recordNeedsFetchFail fail point is enabled, every call bumps a counter
 * and a fetch is requested whenever the counter is a multiple of kNeedsFetchFailFreq, without
 * consulting the RecordAccessTracker at all. Otherwise a fetch is requested exactly when
 * checkAccessedAndMark() returns false for the record.
 */
RecordFetcher* MmapV1ExtentManager::recordNeedsFetch( const DiskLoc& loc ) const {
    Record* rec = _recordForV1( loc );

    bool fetchRequired = false;

    if ( MONGO_FAIL_POINT( recordNeedsFetchFail ) ) {
        needsFetchFailCounter.increment();
        fetchRequired = ( needsFetchFailCounter.get() % kNeedsFetchFailFreq ) == 0;
    }

    // Only ask the tracker when the fail point did not already force a fetch.
    if ( !fetchRequired ) {
        fetchRequired = !_recordAccessTracker->checkAccessedAndMark( rec );
    }

    if ( !fetchRequired ) {
        return NULL;
    }
    return new MmapV1RecordFetcher( rec );
}
std::unique_ptr<RecordFetcher> MmapV1ExtentManager::recordNeedsFetch(const DiskLoc& loc) const { if (loc.isNull()) return {}; MmapV1RecordHeader* record = _recordForV1( loc ); // For testing: if failpoint is enabled we randomly request fetches without // going to the RecordAccessTracker. if ( MONGO_FAIL_POINT( recordNeedsFetchFail ) ) { needsFetchFailCounter.increment(); if ( ( needsFetchFailCounter.get() % kNeedsFetchFailFreq ) == 0 ) { return stdx::make_unique<MmapV1RecordFetcher>( record ); } } if ( !_recordAccessTracker->checkAccessedAndMark( record ) ) { return stdx::make_unique<MmapV1RecordFetcher>( record ); } return {}; }
void Socket::recv(char* buf, int len) { while (len > 0) { int ret = -1; if (MONGO_FAIL_POINT(throwSockExcep)) { #if defined(_WIN32) WSASetLastError(WSAENETUNREACH); #else errno = ENETUNREACH; #endif if (ret <= 0) { handleRecvError(ret, len); continue; } } else { ret = unsafe_recv(buf, len); } fassert(16508, ret <= len); len -= ret; buf += ret; } }
int Balancer::_moveChunks(OperationContext* txn, const BalancerChunkSelectionPolicy::MigrateInfoVector& candidateChunks, const MigrationSecondaryThrottleOptions& secondaryThrottle, bool waitForDelete) { int movedCount = 0; for (const auto& migrateInfo : candidateChunks) { // If the balancer was disabled since we started this round, don't start new chunks // moves. if (!Grid::get(txn)->getBalancerConfiguration()->isBalancerActive() || MONGO_FAIL_POINT(skipBalanceRound)) { LOG(1) << "Stopping balancing round early as balancing was disabled"; return movedCount; } // Changes to metadata, borked metadata, and connectivity problems between shards // should cause us to abort this chunk move, but shouldn't cause us to abort the entire // round of chunks. // // TODO(spencer): We probably *should* abort the whole round on issues communicating // with the config servers, but its impossible to distinguish those types of failures // at the moment. // // TODO: Handle all these things more cleanly, since they're expected problems const NamespaceString nss(migrateInfo.ns); try { shared_ptr<DBConfig> cfg = uassertStatusOK(grid.catalogCache()->getDatabase(txn, nss.db().toString())); // NOTE: We purposely do not reload metadata here, since _getCandidateChunks already // tried to do so once shared_ptr<ChunkManager> cm = cfg->getChunkManager(txn, migrateInfo.ns); uassert(28628, str::stream() << "Collection " << migrateInfo.ns << " was deleted while balancing was active. 
Aborting balancing round.", cm); shared_ptr<Chunk> c = cm->findIntersectingChunk(txn, migrateInfo.minKey); if (c->getMin().woCompare(migrateInfo.minKey) || c->getMax().woCompare(migrateInfo.maxKey)) { // Likely a split happened somewhere, so force reload the chunk manager cm = cfg->getChunkManager(txn, migrateInfo.ns, true); invariant(cm); c = cm->findIntersectingChunk(txn, migrateInfo.minKey); if (c->getMin().woCompare(migrateInfo.minKey) || c->getMax().woCompare(migrateInfo.maxKey)) { log() << "chunk mismatch after reload, ignoring will retry issue " << migrateInfo; continue; } } BSONObj res; if (c->moveAndCommit(txn, migrateInfo.to, Grid::get(txn)->getBalancerConfiguration()->getMaxChunkSizeBytes(), secondaryThrottle, waitForDelete, 0, /* maxTimeMS */ res)) { movedCount++; continue; } log() << "balancer move failed: " << res << ", migrate: " << migrateInfo; Status moveStatus = getStatusFromCommandResult(res); if (moveStatus == ErrorCodes::ChunkTooBig || res["chunkTooBig"].trueValue()) { // Reload just to be safe cm = cfg->getChunkManager(txn, migrateInfo.ns); invariant(cm); c = cm->findIntersectingChunk(txn, migrateInfo.minKey); log() << "performing a split because migrate failed for size reasons"; auto splitStatus = c->split(txn, Chunk::normal, NULL); if (!splitStatus.isOK()) { log() << "marking chunk as jumbo: " << c->toString(); c->markAsJumbo(txn); // We increment moveCount so we do another round right away movedCount++; } } } catch (const DBException& ex) { log() << "balancer move " << migrateInfo << " failed" << causedBy(ex); } } return movedCount; }
/**
 * Body of the balancer background thread.
 *
 * First retries _init() once a minute until it succeeds (or shutdown begins), then loops until
 * shutdown running one balancing round per iteration:
 *   - ping, then refresh the shard registry and balancer configuration
 *   - skip the round when the refresh fails, the balancer is inactive, the skipBalanceRound
 *     fail point is on, or the "balancer" distributed lock cannot be taken
 *   - enforce tag ranges, select candidate chunks and move them
 *   - ping again and sleep (a short interval when the previous round moved something)
 *
 * std::exception raised inside a round is caught and logged, the round is recorded as failed
 * through logAction(), and the loop sleeps before retrying.
 */
void Balancer::run() {
    Client::initThread("Balancer");

    // This is the body of a BackgroundJob so if we throw here we're basically ending the balancer
    // thread prematurely.
    while (!inShutdown()) {
        auto txn = cc().makeOperationContext();
        if (!_init(txn.get())) {
            log() << "will retry to initialize balancer in one minute";
            sleepsecs(60);
            continue;
        }

        break;
    }

    // Sleep interval between rounds; may be overridden below by a fail point.
    Seconds balanceRoundInterval(kBalanceRoundDefaultInterval);

    while (!inShutdown()) {
        // A fresh operation context is created for every round.
        auto txn = cc().makeOperationContext();

        BalanceRoundDetails roundDetails;

        try {
            // ping has to be first so we keep things in the config server in sync
            _ping(txn.get(), false);

            // Testing hook: take the sleep interval from the fail point's "sleepSecs" field.
            // Note that the overridden value stays in effect for later iterations, since
            // nothing here resets it.
            MONGO_FAIL_POINT_BLOCK(balancerRoundIntervalSetting, scopedBalancerRoundInterval) {
                const BSONObj& data = scopedBalancerRoundInterval.getData();
                balanceRoundInterval = Seconds(data["sleepSecs"].numberInt());
            }

            // Use fresh shard state and balancer settings
            Grid::get(txn.get())->shardRegistry()->reload(txn.get());

            auto balancerConfig = Grid::get(txn.get())->getBalancerConfiguration();
            Status refreshStatus = balancerConfig->refreshAndCheck(txn.get());
            if (!refreshStatus.isOK()) {
                warning() << "Skipping balancing round" << causedBy(refreshStatus);
                sleepFor(balanceRoundInterval);
                continue;
            }

            // now make sure we should even be running
            if (!balancerConfig->isBalancerActive() || MONGO_FAIL_POINT(skipBalanceRound)) {
                LOG(1) << "skipping balancing round because balancing is disabled";

                // Ping again so scripts can determine if we're active without waiting
                _ping(txn.get(), true);

                sleepFor(balanceRoundInterval);
                continue;
            }

            uassert(13258, "oids broken after resetting!", _checkOIDs(txn.get()));

            // The nested scope bounds the lifetime of 'scopedDistLock': it goes out of scope
            // before the final ping and sleep below.
            {
                auto scopedDistLock = grid.catalogManager(txn.get())
                                          ->distLock(txn.get(),
                                                     "balancer",
                                                     "doing balance round",
                                                     DistLockManager::kSingleLockAttemptTimeout);

                if (!scopedDistLock.isOK()) {
                    LOG(1) << "skipping balancing round" << causedBy(scopedDistLock.getStatus());

                    // Ping again so scripts can determine if we're active without waiting
                    _ping(txn.get(), true);

                    sleepFor(balanceRoundInterval);  // no need to wake up soon
                    continue;
                }

                LOG(1) << "*** start balancing round. "
                       << "waitForDelete: " << balancerConfig->waitForDelete()
                       << ", secondaryThrottle: "
                       << balancerConfig->getSecondaryThrottle().toBSON();

                // Only performed on some iterations (OCCASIONALLY); a failure to obtain the
                // cluster statistics throws out of the round via uassertStatusOK.
                OCCASIONALLY warnOnMultiVersion(
                    uassertStatusOK(_clusterStats->getStats(txn.get())));

                // A tag-range failure is only logged; the round continues regardless.
                Status status = _enforceTagRanges(txn.get());
                if (!status.isOK()) {
                    warning() << "Failed to enforce tag ranges" << causedBy(status);
                } else {
                    LOG(1) << "Done enforcing tag range boundaries.";
                }

                const auto candidateChunks = uassertStatusOK(
                    _chunkSelectionPolicy->selectChunksToMove(txn.get(), _balancedLastTime));

                if (candidateChunks.empty()) {
                    LOG(1) << "no need to move any chunk";
                    _balancedLastTime = 0;
                } else {
                    // Remember how many chunks were moved; this drives the sleep interval below.
                    _balancedLastTime = _moveChunks(txn.get(),
                                                    candidateChunks,
                                                    balancerConfig->getSecondaryThrottle(),
                                                    balancerConfig->waitForDelete());

                    roundDetails.setSucceeded(static_cast<int>(candidateChunks.size()),
                                              _balancedLastTime);

                    grid.catalogManager(txn.get())
                        ->logAction(txn.get(), "balancer.round", "", roundDetails.toBSON());
                }

                LOG(1) << "*** End of balancing round";
            }

            // Ping again so scripts can determine if we're active without waiting
            _ping(txn.get(), true);

            // Use the short interval when the round that just finished moved something.
            sleepFor(_balancedLastTime ? kShortBalanceRoundInterval : balanceRoundInterval);
        } catch (const std::exception& e) {
            log() << "caught exception while doing balance: " << e.what();

            // Just to match the opening statement if in log level 1
            LOG(1) << "*** End of balancing round";

            // This round failed, tell the world!
            roundDetails.setFailed(e.what());

            grid.catalogManager(txn.get())
                ->logAction(txn.get(), "balancer.round", "", roundDetails.toBSON());

            // Sleep a fair amount before retrying because of the error
            sleepFor(balanceRoundInterval);
        }
    }
}
void Socket::_handleSendError(int ret, const char* context) { #ifdef MONGO_SSL if (_ssl) { LOG(_logLevel) << "SSL Error ret: " << ret << " err: " << SSL_get_error(_ssl , ret) << " " << ERR_error_string(ERR_get_error(), NULL) << endl; throw SocketException(SocketException::SEND_ERROR , remoteString()); } #endif #if defined(_WIN32) const int mongo_errno = WSAGetLastError(); if ( mongo_errno == WSAETIMEDOUT && _timeout != 0 ) { #else const int mongo_errno = errno; if ( ( mongo_errno == EAGAIN || mongo_errno == EWOULDBLOCK ) && _timeout != 0 ) { #endif LOG(_logLevel) << "Socket " << context << " send() timed out " << remoteString() << endl; throw SocketException(SocketException::SEND_TIMEOUT , remoteString()); } else { LOG(_logLevel) << "Socket " << context << " send() " << errnoWithDescription(mongo_errno) << ' ' << remoteString() << endl; throw SocketException(SocketException::SEND_ERROR , remoteString()); } } void Socket::_handleRecvError(int ret, int len, int* retries) { if (ret == 0 || MONGO_FAIL_POINT(throwSockExcep)) { LOG(3) << "Socket recv() conn closed? 
" << remoteString() << endl; throw SocketException(SocketException::CLOSED , remoteString()); } // ret < 0 #ifdef MONGO_SSL if (_ssl) { LOG(_logLevel) << "SSL Error ret: " << ret << " err: " << SSL_get_error(_ssl , ret) << " " << ERR_error_string(ERR_get_error(), NULL) << endl; throw SocketException(SocketException::RECV_ERROR, remoteString()); } #endif #if defined(_WIN32) int e = WSAGetLastError(); #else int e = errno; # if defined(EINTR) if (e == EINTR) { LOG(_logLevel) << "EINTR retry " << ++*retries << endl; return; } # endif #endif #if defined(_WIN32) // Windows if ((e == EAGAIN || e == WSAETIMEDOUT) && _timeout > 0) { #else if (e == EAGAIN && _timeout > 0) { #endif // this is a timeout LOG(_logLevel) << "Socket recv() timeout " << remoteString() <<endl; throw SocketException(SocketException::RECV_TIMEOUT, remoteString()); } LOG(_logLevel) << "Socket recv() " << errnoWithDescription(e) << " " << remoteString() <<endl; throw SocketException(SocketException::RECV_ERROR , remoteString()); } void Socket::setTimeout( double secs ) { setSockTimeouts( _fd, secs ); } #if defined(_WIN32) struct WinsockInit { WinsockInit() { WSADATA d; if ( WSAStartup(MAKEWORD(2,2), &d) != 0 ) { out() << "ERROR: wsastartup failed " << errnoWithDescription() << endl; problem() << "ERROR: wsastartup failed " << errnoWithDescription() << endl; _exit(EXIT_NTSERVICE_ERROR); } } } winsock_init; #endif } // namespace mongo
/** sends all data or throws an exception * @param context descriptive for logging */ void Socket::send( const vector< pair< char *, int > > &data, const char *context ) { #ifdef MONGO_SSL if ( _ssl ) { _send( data , context ); return; } #endif #if defined(_WIN32) // TODO use scatter/gather api _send( data , context ); #else vector<struct iovec> d( data.size() ); int i = 0; for (vector< pair<char *, int> >::const_iterator j = data.begin(); j != data.end(); ++j) { if ( j->second > 0 ) { d[ i ].iov_base = j->first; d[ i ].iov_len = j->second; ++i; _bytesOut += j->second; } } struct msghdr meta; memset( &meta, 0, sizeof( meta ) ); meta.msg_iov = &d[ 0 ]; meta.msg_iovlen = d.size(); while( meta.msg_iovlen > 0 ) { int ret = ::sendmsg( _fd , &meta , portSendFlags ); if ( ret == -1 || MONGO_FAIL_POINT(throwSockExcep)) { if ( errno != EAGAIN || _timeout == 0 ) { LOG(_logLevel) << "Socket " << context << " send() " << errnoWithDescription() << ' ' << remoteString() << endl; throw SocketException( SocketException::SEND_ERROR , remoteString() ); } else { LOG(_logLevel) << "Socket " << context << " send() remote timeout " << remoteString() << endl; throw SocketException( SocketException::SEND_TIMEOUT , remoteString() ); } } else { struct iovec *& i = meta.msg_iov; while( ret > 0 ) { if ( i->iov_len > unsigned( ret ) ) { i->iov_len -= ret; i->iov_base = (char*)(i->iov_base) + ret; ret = 0; } else { ret -= i->iov_len; ++i; --(meta.msg_iovlen); } } } } #endif }
/**
 * Body of the "WTOplogJournalThread" thread. Loops until '_shuttingDown' is observed.
 *
 * Each iteration waits to be told that operations are waiting for the journal, optionally
 * delays for a while, fetches the current all-committed value, makes the data durable and then
 * publishes that value as the new oplog read timestamp (never moving it backward), finally
 * notifying capped waiters on the oplog record store.
 *
 * @param sessionCache         used to obtain the connection, to wait for durability and to
 *                             reach the KV engine
 * @param oplogRecordStore     queried for capped waiters and notified when more data may be
 *                             visible
 * @param updateOldestTimestamp when true, the oldest timestamp is also set to the new value
 */
void WiredTigerOplogManager::_oplogJournalThreadLoop(WiredTigerSessionCache* sessionCache,
                                                     WiredTigerRecordStore* oplogRecordStore,
                                                     const bool updateOldestTimestamp) noexcept {
    Client::initThread("WTOplogJournalThread");

    // This thread updates the oplog read timestamp, the timestamp used to read from the oplog with
    // forward cursors.  The timestamp is used to hide oplog entries that might be committed but
    // have uncommitted entries ahead of them.
    while (true) {
        stdx::unique_lock<stdx::mutex> lk(_oplogVisibilityStateMutex);
        {
            MONGO_IDLE_THREAD_BLOCK;
            // Block until there is work to do or shutdown has been requested.
            _opsWaitingForJournalCV.wait(lk,
                                         [&] { return _shuttingDown || _opsWaitingForJournal; });

            // If we're not shutting down and nobody is actively waiting for the oplog to become
            // durable, delay journaling a bit to reduce the sync rate.
            auto journalDelay = Milliseconds(storageGlobalParams.journalCommitIntervalMs.load());
            // A configured interval of zero falls back to the engine's default delay.
            if (journalDelay == Milliseconds(0)) {
                journalDelay = Milliseconds(WiredTigerKVEngine::kDefaultJournalDelayMillis);
            }
            auto now = Date_t::now();
            auto deadline = now + journalDelay;
            // Any of these conditions ends the delay early.
            auto shouldSyncOpsWaitingForJournal = [&] {
                return _shuttingDown || _opsWaitingForVisibility ||
                    oplogRecordStore->haveCappedWaiters();
            };

            // Eventually it would be more optimal to merge this with the normal journal flushing
            // and block for either oplog tailers or operations waiting for oplog visibility. For
            // now this loop will poll once a millisecond up to the journalDelay to see if we have
            // any waiters yet. This reduces sync-related I/O on the primary when secondaries are
            // lagged, but will avoid significant delays in confirming majority writes on replica
            // sets with infrequent writes.
            // Callers of waitForAllEarlierOplogWritesToBeVisible() like causally consistent reads
            // will preempt this delay.
            while (now < deadline &&
                   !_opsWaitingForJournalCV.wait_until(
                       lk, now.toSystemTimePoint(), shouldSyncOpsWaitingForJournal)) {
                // 'now' is advanced by hand here rather than re-read from the clock.
                now += Milliseconds(1);
            }
        }

        // Testing hook: stay parked, with the mutex released while sleeping, for as long as the
        // fail point is enabled and shutdown has not been requested.
        while (!_shuttingDown && MONGO_FAIL_POINT(WTPausePrimaryOplogDurabilityLoop)) {
            lk.unlock();
            sleepmillis(10);
            lk.lock();
        }

        if (_shuttingDown) {
            log() << "oplog journal thread loop shutting down";
            return;
        }
        invariant(_opsWaitingForJournal);
        // Consume the request, then release the mutex for the work below.
        _opsWaitingForJournal = false;
        lk.unlock();

        const uint64_t newTimestamp = fetchAllCommittedValue(sessionCache->conn());

        // The newTimestamp may actually go backward during secondary batch application,
        // where we commit data file changes separately from oplog changes, so ignore
        // a non-incrementing timestamp.
        if (newTimestamp <= _oplogReadTimestamp.load()) {
            LOG(2) << "no new oplog entries were made visible: " << newTimestamp;
            continue;
        }

        // In order to avoid oplog holes after an unclean shutdown, we must ensure this proposed
        // oplog read timestamp's documents are durable before publishing that timestamp.
        sessionCache->waitUntilDurable(/*forceCheckpoint=*/false, false);

        lk.lock();
        // Publish the new timestamp value.  Avoid going backward.
        // (The value is re-read under the lock because it was last checked without it.)
        auto oldTimestamp = getOplogReadTimestamp();
        if (newTimestamp > oldTimestamp) {
            _setOplogReadTimestamp(lk, newTimestamp);
        }
        lk.unlock();

        if (updateOldestTimestamp) {
            const bool force = false;
            sessionCache->getKVEngine()->setOldestTimestamp(Timestamp(newTimestamp), force);
        }

        // Wake up any await_data cursors and tell them more data might be visible now.
        oplogRecordStore->notifyCappedWaitersIfNeeded();
    }
}
/**
 * Called by db/instance.cpp.  This is the getMore entry point.
 *
 * Looks up the cursor identified by 'cursorid', pulls further results from its executor into a
 * reply buffer and returns that buffer as a QueryResult view. When the cursor cannot be found
 * the reply carries ResultFlag_CursorNotFound and a cursor id of 0.
 *
 * @param txn                operation context for this request
 * @param ns                 namespace named in the request; must match the cursor's namespace
 * @param ntoreturn          requested batch size, forwarded to enoughForGetMore()
 * @param cursorid           id of the cursor to advance
 * @param curop              current operation; receives the cursor's leftover time limit
 * @param pass               when QueryOption_AwaitData is in use, the caller will make repeated
 *                           calls when this method returns an empty result, incrementing pass on
 *                           each call.  Thus, pass == 0 indicates this is the first "attempt"
 *                           before any 'awaiting'.
 * @param exhaust            out: set when the saved cursor has QueryOption_Exhaust
 * @param isCursorAuthorized out: set to true once the namespace check has passed
 *
 * @return the reply view, or NULL to ask the AwaitData handling in the caller to try again.
 */
QueryResult::View getMore(OperationContext* txn,
                          const char* ns,
                          int ntoreturn,
                          long long cursorid,
                          CurOp& curop,
                          int pass,
                          bool& exhaust,
                          bool* isCursorAuthorized) {
    // For testing, we may want to fail if we receive a getmore.
    if (MONGO_FAIL_POINT(failReceivedGetmore)) {
        invariant(0);
    }

    exhaust = false;

    const NamespaceString nss(ns);

    // Depending on the type of cursor being operated on, we hold locks for the whole getMore,
    // or none of the getMore, or part of the getMore.  The three cases in detail:
    //
    // 1) Normal cursor: we lock with "ctx" and hold it for the whole getMore.
    // 2) Cursor owned by global cursor manager: we don't lock anything.  These cursors don't
    //    own any collection state.
    // 3) Agg cursor: we lock with "ctx", then release, then relock with "unpinDBLock" and
    //    "unpinCollLock".  This is because agg cursors handle locking internally (hence the
    //    release), but the pin and unpin of the cursor must occur under the collection lock.
    //    We don't use our AutoGetCollectionForRead "ctx" to relock, because
    //    AutoGetCollectionForRead checks the sharding version (and we want the relock for the
    //    unpin to succeed even if the sharding version has changed).
    //
    // Note that we declare our locks before our ClientCursorPin, in order to ensure that the
    // pin's destructor is called before the lock destructors (so that the unpin occurs under
    // the lock).
    boost::scoped_ptr<AutoGetCollectionForRead> ctx;
    boost::scoped_ptr<Lock::DBLock> unpinDBLock;
    boost::scoped_ptr<Lock::CollectionLock> unpinCollLock;

    // Pick the cursor manager that owns this cursor id: the global one, or the collection's.
    CursorManager* cursorManager;
    CursorManager* globalCursorManager = CursorManager::getGlobalCursorManager();
    if (globalCursorManager->ownsCursorId(cursorid)) {
        cursorManager = globalCursorManager;
    }
    else {
        ctx.reset(new AutoGetCollectionForRead(txn, nss));
        Collection* collection = ctx->getCollection();
        uassert( 17356, "collection dropped between getMore calls", collection );
        cursorManager = collection->getCursorManager();
    }

    LOG(5) << "Running getMore, cursorid: " << cursorid << endl;

    // This checks to make sure the operation is allowed on a replicated node.  Since we are not
    // passing in a query object (necessary to check SlaveOK query option), the only state where
    // reads are allowed is PRIMARY (or master in master/slave).  This function uasserts if
    // reads are not okay.
    Status status = repl::getGlobalReplicationCoordinator()->checkCanServeReadsFor(
            txn,
            nss,
            true);
    uassertStatusOK(status);

    // A pin performs a CC lookup and if there is a CC, increments the CC's pin value so it
    // doesn't time out.  Also informs ClientCursor that there is somebody actively holding the
    // CC, so don't delete it.
    ClientCursorPin ccPin(cursorManager, cursorid);
    ClientCursor* cc = ccPin.c();

    // If we're not being called from DBDirectClient we want to associate the RecoveryUnit
    // used to create the execution machinery inside the cursor with our OperationContext.
    // If we throw or otherwise exit this method in a disorderly fashion, we must ensure
    // that further calls to getMore won't fail, and that the provided OperationContext
    // has a valid RecoveryUnit.  As such, we use RAII to accomplish this.
    //
    // This must be destroyed before the ClientCursor is destroyed.
    std::auto_ptr<ScopedRecoveryUnitSwapper> ruSwapper;

    // These are set in the QueryResult msg we return.
    int resultFlags = ResultFlag_AwaitCapable;

    int numResults = 0;
    int startingResult = 0;

    // The reply header is filled in at the very end; reserve room for it at the front of the
    // buffer now.
    const int InitialBufSize =
        512 + sizeof(QueryResult::Value) + MaxBytesToReturnToClientAtOnce;

    BufBuilder bb(InitialBufSize);
    bb.skip(sizeof(QueryResult::Value));

    if (NULL == cc) {
        // No such cursor: report it through the reply rather than by throwing.
        cursorid = 0;
        resultFlags = ResultFlag_CursorNotFound;
    }
    else {
        // Check for spoofing of the ns such that it does not match the one originally
        // there for the cursor.
        uassert(ErrorCodes::Unauthorized,
                str::stream() << "Requested getMore on namespace " << ns << ", but cursor "
                              << cursorid << " belongs to namespace " << cc->ns(),
                ns == cc->ns());
        *isCursorAuthorized = true;

        // Restore the RecoveryUnit if we need to.
        if (txn->getClient()->isInDirectClient()) {
            if (cc->hasRecoveryUnit())
                invariant(txn->recoveryUnit() == cc->getUnownedRecoveryUnit());
        }
        else {
            if (!cc->hasRecoveryUnit()) {
                // Start using a new RecoveryUnit
                cc->setOwnedRecoveryUnit(
                    getGlobalServiceContext()->getGlobalStorageEngine()->newRecoveryUnit());

            }
            // Swap RecoveryUnit(s) between the ClientCursor and OperationContext.
            ruSwapper.reset(new ScopedRecoveryUnitSwapper(cc, txn));
        }

        // Reset timeout timer on the cursor since the cursor is still in use.
        cc->setIdleTime(0);

        // If the operation that spawned this cursor had a time limit set, apply leftover
        // time to this getmore.
        curop.setMaxTimeMicros(cc->getLeftoverMaxTimeMicros());
        txn->checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point.

        // Only done on the first attempt, not on AwaitData retries.
        if (0 == pass) {
            cc->updateSlaveLocation(txn);
        }

        if (cc->isAggCursor()) {
            // Agg cursors handle their own locking internally.
            ctx.reset(); // unlocks
        }

        // If we're replaying the oplog, we save the last time that we read.
        Timestamp slaveReadTill;

        // What number result are we starting at?  Used to fill out the reply.
        startingResult = cc->pos();

        // What gives us results.
        PlanExecutor* exec = cc->getExecutor();
        const int queryOptions = cc->queryOptions();

        // Get results out of the executor.
        exec->restoreState(txn);

        BSONObj obj;
        PlanExecutor::ExecState state;
        while (PlanExecutor::ADVANCED == (state = exec->getNext(&obj, NULL))) {
            // Add result to output buffer.
            bb.appendBuf((void*)obj.objdata(), obj.objsize());

            // Count the result.
            ++numResults;

            // Possibly note slave's position in the oplog.
            if (queryOptions & QueryOption_OplogReplay) {
                BSONElement e = obj["ts"];
                if (Date == e.type() || bsonTimestamp == e.type()) {
                    slaveReadTill = e.timestamp();
                }
            }

            // Stop once the batch is large enough, by count or by size.
            if (enoughForGetMore(ntoreturn, numResults, bb.len())) {
                break;
            }
        }

        if (PlanExecutor::DEAD == state || PlanExecutor::FAILURE == state) {
            // Propagate this error to caller.
            if (PlanExecutor::FAILURE == state) {
                scoped_ptr<PlanStageStats> stats(exec->getStats());
                error() << "Plan executor error, stats: "
                        << Explain::statsToBSON(*stats);
                uasserted(17406, "getMore executor error: " +
                          WorkingSetCommon::toStatusString(obj));
            }

            // In the old system tailable capped cursors would be killed off at the
            // cursorid level.  If a tailable capped cursor is nuked the cursorid
            // would vanish.
            //
            // In the new system they die and are cleaned up later (or time out).
            // So this is where we get to remove the cursorid.
            if (0 == numResults) {
                resultFlags = ResultFlag_CursorNotFound;
            }
        }

        const bool shouldSaveCursor =
                shouldSaveCursorGetMore(state, exec, isCursorTailable(cc));

        // In order to deregister a cursor, we need to be holding the DB + collection lock and
        // if the cursor is aggregation, we release these locks.
        if (cc->isAggCursor()) {
            invariant(NULL == ctx.get());
            unpinDBLock.reset(new Lock::DBLock(txn->lockState(), nss.db(), MODE_IS));
            unpinCollLock.reset(new Lock::CollectionLock(txn->lockState(), nss.ns(), MODE_IS));
        }

        // Our two possible ClientCursorPin cleanup paths are:
        // 1) If the cursor is not going to be saved, we call deleteUnderlying() on the pin.
        // 2) If the cursor is going to be saved, we simply let the pin go out of scope.  In
        //    this case, the pin's destructor will be invoked, which will call release() on the
        //    pin.  Because our ClientCursorPin is declared after our lock is declared, this
        //    will happen under the lock.
        if (!shouldSaveCursor) {
            // The swapper must go away before the cursor it refers to is deleted.
            ruSwapper.reset();
            ccPin.deleteUnderlying();

            // cc is now invalid, as is the executor
            cursorid = 0;
            cc = NULL;
            curop.debug().cursorExhausted = true;

            LOG(5) << "getMore NOT saving client cursor, ended with state "
                   << PlanExecutor::statestr(state)
                   << endl;
        }
        else {
            // Continue caching the ClientCursor.
            cc->incPos(numResults);
            exec->saveState();
            LOG(5) << "getMore saving client cursor ended with state "
                   << PlanExecutor::statestr(state)
                   << endl;

            if (PlanExecutor::IS_EOF == state && (queryOptions & QueryOption_CursorTailable)) {
                if (!txn->getClient()->isInDirectClient()) {
                    // Don't stash the RU. Get a new one on the next getMore.
                    ruSwapper->dismiss();
                }

                if ((queryOptions & QueryOption_AwaitData)
                        && (numResults == 0)
                        && (pass < 1000)) {
                    // Bubble up to the AwaitData handling code in receivedGetMore which will
                    // try again.
                    return NULL;
                }
            }

            // Possibly note slave's position in the oplog.
            if ((queryOptions & QueryOption_OplogReplay) && !slaveReadTill.isNull()) {
                cc->slaveReadTill(slaveReadTill);
            }

            exhaust = (queryOptions & QueryOption_Exhaust);

            // If the getmore had a time limit, remaining time is "rolled over" back to the
            // cursor (for use by future getmore ops).
            cc->setLeftoverMaxTimeMicros( curop.getRemainingMaxTimeMicros() );
        }
    }

    // Fill in the reply header over the space reserved at the front of the buffer.
    QueryResult::View qr = bb.buf();
    qr.msgdata().setLen(bb.len());
    qr.msgdata().setOperation(opReply);
    qr.setResultFlags(resultFlags);
    qr.setCursorId(cursorid);
    qr.setStartingFrom(startingResult);
    qr.setNReturned(numResults);
    // Detach the buffer from the builder before returning the view that points into it.
    bb.decouple();
    LOG(5) << "getMore returned " << numResults << " results\n";
    return qr;
}
void BackgroundSync::_fetcherCallback(const StatusWith<Fetcher::QueryResponse>& result, BSONObjBuilder* bob, const HostAndPort& source, OpTime lastOpTimeFetched, long long lastFetchedHash, Status* remoteOplogStartStatus) { // if target cut connections between connecting and querying (for // example, because it stepped down) we might not have a cursor if (!result.isOK()) { return; } if (inShutdown()) { return; } // Check if we have been paused. if (isPaused()) { return; } const auto& queryResponse = result.getValue(); const auto& documents = queryResponse.documents; auto documentBegin = documents.cbegin(); auto documentEnd = documents.cend(); // Check start of remote oplog and, if necessary, stop fetcher to execute rollback. if (queryResponse.first) { auto getNextOperation = [&documentBegin, documentEnd]() -> StatusWith<BSONObj> { if (documentBegin == documentEnd) { return Status(ErrorCodes::OplogStartMissing, "remote oplog start missing"); } return *(documentBegin++); }; *remoteOplogStartStatus = checkRemoteOplogStart(getNextOperation, lastOpTimeFetched, lastFetchedHash); if (!remoteOplogStartStatus->isOK()) { // Stop fetcher and execute rollback. return; } // If this is the first batch and no rollback is needed, we should have advanced // the document iterator. invariant(documentBegin != documents.cbegin()); } // process documents int currentBatchMessageSize = 0; for (auto documentIter = documentBegin; documentIter != documentEnd; ++documentIter) { if (inShutdown()) { return; } // If we are transitioning to primary state, we need to leave // this loop in order to go into bgsync-pause mode. if (_replCoord->isWaitingForApplierToDrain() || _replCoord->getMemberState().primary()) { LOG(1) << "waiting for draining or we are primary, not adding more ops to buffer"; return; } // At this point, we are guaranteed to have at least one thing to read out // of the fetcher. 
const BSONObj& o = *documentIter; currentBatchMessageSize += o.objsize(); opsReadStats.increment(); if (MONGO_FAIL_POINT(stepDownWhileDrainingFailPoint)) { sleepsecs(20); } { stdx::unique_lock<stdx::mutex> lock(_mutex); _appliedBuffer = false; } OCCASIONALLY { LOG(2) << "bgsync buffer has " << _buffer.size() << " bytes"; } bufferCountGauge.increment(); bufferSizeGauge.increment(getSize(o)); _buffer.push(o); { stdx::unique_lock<stdx::mutex> lock(_mutex); _lastFetchedHash = o["h"].numberLong(); _lastOpTimeFetched = extractOpTime(o); LOG(3) << "lastOpTimeFetched: " << _lastOpTimeFetched; } } // record time for each batch getmoreReplStats.recordMillis(queryResponse.elapsedMillis.count()); networkByteStats.increment(currentBatchMessageSize); // Check some things periodically // (whenever we run out of items in the // current cursor batch) if (currentBatchMessageSize > 0 && currentBatchMessageSize < BatchIsSmallish) { // on a very low latency network, if we don't wait a little, we'll be // getting ops to write almost one at a time. this will both be expensive // for the upstream server as well as potentially defeating our parallel // application of batches on the secondary. // // the inference here is basically if the batch is really small, we are // "caught up". // sleepmillis(SleepToAllowBatchingMillis); } // If we are transitioning to primary state, we need to leave // this loop in order to go into bgsync-pause mode. if (_replCoord->isWaitingForApplierToDrain() || _replCoord->getMemberState().primary()) { return; } // re-evaluate quality of sync target if (_shouldChangeSyncSource(source)) { return; } // Check if we have been paused. if (isPaused()) { return; } // We fill in 'bob' to signal the fetcher to process with another getMore. invariant(bob); bob->append("getMore", queryResponse.cursorId); bob->append("collection", queryResponse.nss.coll()); bob->append("maxTimeMS", int(fetcherMaxTimeMS.count())); }
/**
 * Drops the collection or view named 'collectionName', retrying on write conflicts.
 *
 * Appends "ns" (unless already present) and, for collections, "nIndexesWas" to 'result'.
 * Returns NamespaceNotFound when neither a collection nor a view with that name exists,
 * NotMaster when the writes are replicated and this node cannot accept writes for the
 * namespace, or the status of the underlying drop.
 */
Status dropCollection(OperationContext* opCtx,
                      const NamespaceString& collectionName,
                      BSONObjBuilder& result,
                      const repl::OpTime& dropOpTime,
                      DropCollectionSystemCollectionMode systemCollectionMode) {
    const bool quiet = serverGlobalParams.quiet.load();
    if (!quiet) {
        log() << "CMD: drop " << collectionName;
    }

    return writeConflictRetry(opCtx, "drop", collectionName.ns(), [&]() -> Status {
        AutoGetDb dbLock(opCtx, collectionName.db(), MODE_X);
        Database* const database = dbLock.getDb();

        Collection* collection = nullptr;
        if (database) {
            collection = database->getCollection(opCtx, collectionName);
        }
        // The view catalog is only consulted when no collection of that name exists.
        auto viewDef = database && !collection
            ? database->getViewCatalog()->lookup(opCtx, collectionName.ns())
            : nullptr;

        if (MONGO_FAIL_POINT(hangDuringDropCollection)) {
            log() << "hangDuringDropCollection fail point enabled. Blocking until fail point is "
                     "disabled.";
            MONGO_FAIL_POINT_PAUSE_WHILE_SET(hangDuringDropCollection);
        }

        const bool targetExists = database && (collection || viewDef);
        if (!targetExists) {
            return Status(ErrorCodes::NamespaceNotFound, "ns not found");
        }

        const bool shardVersionCheck = true;
        OldClientContext context(opCtx, collectionName.ns(), shardVersionCheck);

        // Replicated (user-initiated) writes are rejected when this node cannot accept them.
        if (opCtx->writesAreReplicated() &&
            !repl::ReplicationCoordinator::get(opCtx)->canAcceptWritesFor(opCtx, collectionName)) {
            return Status(ErrorCodes::NotMaster,
                          str::stream() << "Not primary while dropping collection "
                                        << collectionName);
        }

        WriteUnitOfWork wunit(opCtx);
        if (!result.hasField("ns")) {
            result.append("ns", collectionName.ns());
        }

        if (!collection) {
            invariant(viewDef);
            Status viewStatus = database->dropView(opCtx, collectionName.ns());
            if (!viewStatus.isOK()) {
                return viewStatus;
            }
        } else {
            invariant(!viewDef);
            // Capture the index count before the drop so it can be reported afterwards.
            int indexCount = collection->getIndexCatalog()->numIndexesTotal(opCtx);

            BackgroundOperation::assertNoBgOpInProgForNs(collectionName.ns());

            const bool allowSystemDrop = systemCollectionMode !=
                DropCollectionSystemCollectionMode::kDisallowSystemCollectionDrops;
            Status dropStatus = allowSystemDrop
                ? database->dropCollectionEvenIfSystem(opCtx, collectionName, dropOpTime)
                : database->dropCollection(opCtx, collectionName.ns(), dropOpTime);
            if (!dropStatus.isOK()) {
                return dropStatus;
            }

            result.append("nIndexesWas", indexCount);
        }

        wunit.commit();
        return Status::OK();
    });
}
/**
 * Connects 'r' to a member to sync from and records it as the current sync target.
 *
 * Candidates that cannot be reached or that are stale are vetoed and the next candidate is
 * tried. If no candidate is accepted, the current sync target is cleared; if the last rejected
 * candidate was stale, the replica set is additionally told to go stale.
 */
void BackgroundSync::getOplogReader(OplogReader& r) {
    const Member* staleMember = NULL;
    BSONObj oldest;

    {
        boost::unique_lock<boost::mutex> lock(_mutex);
        if (_lastOpTimeFetched.isNull()) {
            // We're initial syncing and still waiting for the last fetched optime to be set.
            _currentSyncTarget = NULL;
            return;
        }

        // Don't choose a sync target until the ops we already have were applied.
        while (!_appliedBuffer) {
            _condvar.wait(lock);
        }
    }

    // Spin for as long as the rsBgSyncProduce fail point is enabled.
    while (MONGO_FAIL_POINT(rsBgSyncProduce)) {
        sleepmillis(0);
    }

    verify(r.conn() == NULL);

    for (const Member* candidate = theReplSet->getMemberToSyncTo(); candidate != NULL;
         candidate = theReplSet->getMemberToSyncTo()) {
        string candidateName = candidate->fullName();

        if (!r.connect(candidateName)) {
            LOG(2) << "replSet can't connect to " << candidateName << " to read operations"
                   << rsLog;
            r.resetConnection();
            theReplSet->veto(candidateName);
            sleepsecs(1);
            continue;
        }

        if (isStale(r, oldest)) {
            r.resetConnection();
            theReplSet->veto(candidateName, 600);
            staleMember = candidate;
            continue;
        }

        // The candidate is up and not stale: adopt it.
        {
            boost::unique_lock<boost::mutex> lock(_mutex);
            _currentSyncTarget = candidate;
        }

        boost::unique_lock<boost::mutex> oplogLockSSF(theReplSet->syncSourceFeedback.oplock);
        theReplSet->syncSourceFeedback.connect(candidate);

        return;
    }

    // The only viable sync target was stale.
    if (staleMember) {
        theReplSet->goStale(staleMember, oldest);
        sleepsecs(120);
    }

    {
        boost::unique_lock<boost::mutex> lock(_mutex);
        _currentSyncTarget = NULL;
    }
}
/* tail an oplog. ok to return, will be re-called. */ void SyncTail::oplogApplication() { ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator(); ApplyBatchFinalizer finalizer(replCoord); OperationContextImpl txn; OpTime originalEndOpTime(getMinValid(&txn).end); while (!inShutdown()) { OpQueue ops; Timer batchTimer; int lastTimeChecked = 0; do { int now = batchTimer.seconds(); // apply replication batch limits if (!ops.empty()) { if (now > replBatchLimitSeconds) break; if (ops.getDeque().size() > replBatchLimitOperations) break; } // occasionally check some things // (always checked in the first iteration of this do-while loop, because // ops is empty) if (ops.empty() || now > lastTimeChecked) { BackgroundSync* bgsync = BackgroundSync::get(); if (bgsync->getInitialSyncRequestedFlag()) { // got a resync command return; } lastTimeChecked = now; // can we become secondary? // we have to check this before calling mgr, as we must be a secondary to // become primary tryToGoLiveAsASecondary(&txn, replCoord); } const int slaveDelaySecs = durationCount<Seconds>(replCoord->getSlaveDelaySecs()); if (!ops.empty() && slaveDelaySecs > 0) { const BSONObj lastOp = ops.back(); const unsigned int opTimestampSecs = lastOp["ts"].timestamp().getSecs(); // Stop the batch as the lastOp is too new to be applied. If we continue // on, we can get ops that are way ahead of the delay and this will // make this thread sleep longer when handleSlaveDelay is called // and apply ops much sooner than we like. 
if (opTimestampSecs > static_cast<unsigned int>(time(0) - slaveDelaySecs)) { break; } } if (MONGO_FAIL_POINT(rsSyncApplyStop)) { break; } // keep fetching more ops as long as we haven't filled up a full batch yet } while (!tryPopAndWaitForMore(&txn, &ops, replCoord) && // tryPopAndWaitForMore returns // true when we need to end a // batch early (ops.getSize() < replBatchLimitBytes) && !inShutdown()); // For pausing replication in tests while (MONGO_FAIL_POINT(rsSyncApplyStop)) { sleepmillis(0); if (inShutdown()) return; } if (ops.empty()) { continue; } const BSONObj lastOp = ops.back(); handleSlaveDelay(lastOp); // Set minValid to the last OpTime that needs to be applied, in this batch or from the // (last) failed batch, whichever is larger. // This will cause this node to go into RECOVERING state // if we should crash and restart before updating finishing. const OpTime start(getLastSetTimestamp(), OpTime::kUninitializedTerm); // Take the max of the first endOptime (if we recovered) and the end of our batch. const auto lastOpTime = fassertStatusOK(28773, OpTime::parseFromOplogEntry(lastOp)); // Setting end to the max of originalEndOpTime and lastOpTime (the end of the batch) // ensures that we keep pushing out the point where we can become consistent // and allow reads. If we recover and end up doing smaller batches we must pass the // originalEndOpTime before we are good. // // For example: // batch apply, 20-40, end = 40 // batch failure, // restart // batch apply, 20-25, end = max(25, 40) = 40 // batch apply, 25-45, end = 45 const OpTime end(std::max(originalEndOpTime, lastOpTime)); // This write will not journal/checkpoint. setMinValid(&txn, {start, end}); OpTime finalOpTime = multiApply(&txn, ops); setNewTimestamp(finalOpTime.getTimestamp()); setMinValid(&txn, end, DurableRequirement::None); finalizer.record(finalOpTime); } }
/**
 * Runs one pass of the oplog producer:
 *   1. bails out early if there is no last fetched optime yet, if this node is draining or
 *      primary (and not catching up), or if we are shutting down;
 *   2. resolves a sync source with a SyncSourceResolver;
 *   3. runs an OplogFetcher against that source until it stops;
 *   4. reacts to the fetcher's final status (rollback, blacklisting, or just logging).
 *
 * Returning is always fine; the caller is expected to invoke this again.
 */
void BackgroundSync::_produce(OperationContext* txn) {
    // Test hook: spin for as long as the pauseRsBgSyncProducer fail point is enabled.
    while (MONGO_FAIL_POINT(pauseRsBgSyncProducer)) {
        sleepmillis(0);
    }

    // this oplog reader does not do a handshake because we don't want the server it's syncing
    // from to track how far it has synced
    {
        stdx::unique_lock<stdx::mutex> lock(_mutex);
        if (_lastOpTimeFetched.isNull()) {
            // then we're initial syncing and we're still waiting for this to be set
            lock.unlock();
            sleepsecs(1);
            // if there is no one to sync from
            return;
        }

        if (!_replCoord->isCatchingUp() &&
            (_replCoord->isWaitingForApplierToDrain() || _replCoord->getMemberState().primary())) {
            return;
        }

        if (_inShutdown_inlock()) {
            return;
        }
    }

    // find a target to sync from the last optime fetched
    OpTime lastOpTimeFetched;
    HostAndPort source;
    SyncSourceResolverResponse syncSourceResp;
    SyncSourceResolver* syncSourceResolver = nullptr;
    OpTime minValid;

    // Read the saved minValid before taking _mutex (as the original code did), but defer the
    // comparison until 'lastOpTimeFetched' has actually been loaded below. Comparing against
    // the still default-constructed local would not compare against the last fetched optime.
    const bool recovering = _replCoord->getMemberState().recovering();
    OpTime minValidSaved;
    if (recovering) {
        minValidSaved = StorageInterface::get(txn)->getMinValid(txn);
    }

    {
        stdx::unique_lock<stdx::mutex> lock(_mutex);
        lastOpTimeFetched = _lastOpTimeFetched;
        if (recovering && minValidSaved > lastOpTimeFetched) {
            minValid = minValidSaved;
        }
        _syncSourceHost = HostAndPort();
        _syncSourceResolver = stdx::make_unique<SyncSourceResolver>(
            _replicationCoordinatorExternalState->getTaskExecutor(),
            _replCoord,
            lastOpTimeFetched,
            minValid,
            [&syncSourceResp](const SyncSourceResolverResponse& resp) { syncSourceResp = resp; });
        syncSourceResolver = _syncSourceResolver.get();
    }
    // This may deadlock if called inside the mutex because SyncSourceResolver::startup() calls
    // ReplicationCoordinator::chooseNewSyncSource(). ReplicationCoordinatorImpl's mutex has to
    // acquired before BackgroundSync's.
    // It is safe to call startup() outside the mutex on this instance of SyncSourceResolver because
    // we do not destroy this instance outside of this function. The raw pointer captured under
    // the mutex is used so that the member is not read without holding _mutex.
    auto status = syncSourceResolver->startup();
    if (ErrorCodes::CallbackCanceled == status || ErrorCodes::isShutdownError(status.code())) {
        return;
    }
    fassertStatusOK(40349, status);
    syncSourceResolver->join();
    syncSourceResolver = nullptr;
    {
        stdx::unique_lock<stdx::mutex> lock(_mutex);
        _syncSourceResolver.reset();
    }

    if (syncSourceResp.syncSourceStatus == ErrorCodes::OplogStartMissing) {
        // All (accessible) sync sources were too stale.
        if (_replCoord->isCatchingUp()) {
            warning() << "Too stale to catch up.";
            log() << "Our newest OpTime : " << lastOpTimeFetched;
            log() << "Earliest OpTime available is " << syncSourceResp.earliestOpTimeSeen
                  << " from " << syncSourceResp.getSyncSource();
            sleepsecs(1);
            return;
        }

        error() << "too stale to catch up -- entering maintenance mode";
        log() << "Our newest OpTime : " << lastOpTimeFetched;
        log() << "Earliest OpTime available is " << syncSourceResp.earliestOpTimeSeen;
        log() << "See http://dochub.mongodb.org/core/resyncingaverystalereplicasetmember";
        auto status = _replCoord->setMaintenanceMode(true);
        if (!status.isOK()) {
            warning() << "Failed to transition into maintenance mode: " << status;
        }
        bool worked = _replCoord->setFollowerMode(MemberState::RS_RECOVERING);
        if (!worked) {
            warning() << "Failed to transition into " << MemberState(MemberState::RS_RECOVERING)
                      << ". Current state: " << _replCoord->getMemberState();
        }
        return;
    } else if (syncSourceResp.isOK() && !syncSourceResp.getSyncSource().empty()) {
        stdx::lock_guard<stdx::mutex> lock(_mutex);
        _syncSourceHost = syncSourceResp.getSyncSource();
        source = _syncSourceHost;
    } else {
        if (!syncSourceResp.isOK()) {
            log() << "failed to find sync source, received error "
                  << syncSourceResp.syncSourceStatus.getStatus();
        }
        // No sync source found.
        sleepsecs(1);
        return;
    }

    long long lastHashFetched;
    {
        stdx::lock_guard<stdx::mutex> lock(_mutex);
        if (_stopped) {
            return;
        }
        // Re-read the fetch position: it may have moved while the sync source was resolved.
        lastOpTimeFetched = _lastOpTimeFetched;
        lastHashFetched = _lastFetchedHash;
        if (!_replCoord->isCatchingUp()) {
            _replCoord->signalUpstreamUpdater();
        }
    }

    // Set the applied point if unset. This is most likely the first time we've established a sync
    // source since stepping down or otherwise clearing the applied point. We need to set this here,
    // before the OplogWriter gets a chance to append to the oplog.
    if (StorageInterface::get(txn)->getAppliedThrough(txn).isNull()) {
        StorageInterface::get(txn)->setAppliedThrough(txn, _replCoord->getMyLastAppliedOpTime());
    }

    // "lastFetched" not used. Already set in _enqueueDocuments.
    Status fetcherReturnStatus = Status::OK();
    DataReplicatorExternalStateBackgroundSync dataReplicatorExternalState(
        _replCoord, _replicationCoordinatorExternalState, this);
    OplogFetcher* oplogFetcher = nullptr;
    try {
        auto executor = _replicationCoordinatorExternalState->getTaskExecutor();
        auto config = _replCoord->getConfig();
        auto onOplogFetcherShutdownCallbackFn =
            [&fetcherReturnStatus](const Status& status, const OpTimeWithHash& lastFetched) {
                fetcherReturnStatus = status;
            };

        stdx::lock_guard<stdx::mutex> lock(_mutex);
        _oplogFetcher = stdx::make_unique<OplogFetcher>(
            executor,
            OpTimeWithHash(lastHashFetched, lastOpTimeFetched),
            source,
            NamespaceString(rsOplogName),
            config,
            _replicationCoordinatorExternalState->getOplogFetcherMaxFetcherRestarts(),
            &dataReplicatorExternalState,
            stdx::bind(&BackgroundSync::_enqueueDocuments,
                       this,
                       stdx::placeholders::_1,
                       stdx::placeholders::_2,
                       stdx::placeholders::_3),
            onOplogFetcherShutdownCallbackFn);
        oplogFetcher = _oplogFetcher.get();
    } catch (const mongo::DBException&) {
        fassertFailedWithStatus(34440, exceptionToStatus());
    }

    // Log the local copy 'source' rather than reading _syncSourceHost without holding _mutex.
    LOG(1) << "scheduling fetcher to read remote oplog on " << source << " starting at "
           << oplogFetcher->getCommandObject_forTest()["filter"];
    auto scheduleStatus = oplogFetcher->startup();
    if (!scheduleStatus.isOK()) {
        warning() << "unable to schedule fetcher to read remote oplog on " << source << ": "
                  << scheduleStatus;
        return;
    }

    oplogFetcher->join();
    LOG(1) << "fetcher stopped reading remote oplog on " << source;

    // If the background sync is stopped after the fetcher is started, we need to
    // re-evaluate our sync source and oplog common point.
    if (isStopped()) {
        return;
    }

    if (fetcherReturnStatus.code() == ErrorCodes::OplogOutOfOrder) {
        // This is bad because it means that our source
        // has not returned oplog entries in ascending ts order, and they need to be.

        warning() << redact(fetcherReturnStatus);
        // Do not blacklist the server here, it will be blacklisted when we try to reuse it,
        // if it can't return a matching oplog start from the last fetch oplog ts field.
        return;
    } else if (fetcherReturnStatus.code() == ErrorCodes::OplogStartMissing ||
               fetcherReturnStatus.code() == ErrorCodes::RemoteOplogStale) {
        if (_replCoord->isCatchingUp()) {
            warning() << "Rollback situation detected in catch-up mode; catch-up mode will end.";
            sleepsecs(1);
            return;
        }

        // Rollback is a synchronous operation that uses the task executor and may not be
        // executed inside the fetcher callback.
        const int messagingPortTags = 0;
        ConnectionPool connectionPool(messagingPortTags);
        std::unique_ptr<ConnectionPool::ConnectionPtr> connection;
        // Creates the connection to the sync source on first use and reuses it afterwards.
        auto getConnection = [&connection, &connectionPool, source]() -> DBClientBase* {
            if (!connection.get()) {
                connection.reset(new ConnectionPool::ConnectionPtr(
                    &connectionPool, source, Date_t::now(), kRollbackOplogSocketTimeout));
            }
            return connection->get();
        };

        {
            stdx::lock_guard<stdx::mutex> lock(_mutex);
            lastOpTimeFetched = _lastOpTimeFetched;
        }

        log() << "Starting rollback due to " << redact(fetcherReturnStatus);

        // Wait till all buffered oplog entries have drained and been applied.
        auto lastApplied = _replCoord->getMyLastAppliedOpTime();
        if (lastApplied != lastOpTimeFetched) {
            log() << "Waiting for all operations from " << lastApplied << " until "
                  << lastOpTimeFetched << " to be applied before starting rollback.";
            while (lastOpTimeFetched > (lastApplied = _replCoord->getMyLastAppliedOpTime())) {
                sleepmillis(10);
                if (isStopped() || inShutdown()) {
                    return;
                }
            }
        }
        // check that we are at minvalid, otherwise we cannot roll back as we may be in an
        // inconsistent state
        const auto minValid = StorageInterface::get(txn)->getMinValid(txn);
        if (lastApplied < minValid) {
            fassertNoTrace(18750,
                           Status(ErrorCodes::UnrecoverableRollbackError,
                                  str::stream() << "need to rollback, but in inconsistent state. "
                                                << "minvalid: "
                                                << minValid.toString()
                                                << " > our last optime: "
                                                << lastApplied.toString()));
        }

        _rollback(txn, source, getConnection);
        stop();
    } else if (fetcherReturnStatus == ErrorCodes::InvalidBSON) {
        Seconds blacklistDuration(60);
        warning() << "Fetcher got invalid BSON while querying oplog. Blacklisting sync source "
                  << source << " for " << blacklistDuration << ".";
        _replCoord->blacklistSyncSource(source, Date_t::now() + blacklistDuration);
    } else if (!fetcherReturnStatus.isOK()) {
        warning() << "Fetcher stopped querying remote oplog with error: "
                  << redact(fetcherReturnStatus);
    }
}
/**
 * Returns the next result for consumer 'consumerId'.
 *
 * If that consumer's buffer is empty, the calling thread either becomes the loader (when no
 * loader is registered) and fills the buffers via loadNextBatch(), or waits on
 * _haveBufferSpace until it is woken up. An error raised while loading is recorded in
 * _errorInLoadNextBatch and rethrown; every later caller then fails with ExchangePassthrough.
 */
DocumentSource::GetNextResult Exchange::getNext(OperationContext* opCtx, size_t consumerId) {
    stdx::unique_lock<stdx::mutex> lk(_mutex);

    while (true) {
        // Refuse to continue once loading has failed.
        if (!_errorInLoadNextBatch.isOK()) {
            uasserted(ErrorCodes::ExchangePassthrough,
                      "Exchange failed due to an error on different thread.");
        }

        // Hand out a buffered document if this consumer has one.
        if (!_consumers[consumerId]->isEmpty()) {
            auto result = _consumers[consumerId]->getNext();
            unblockLoading(consumerId);
            return result;
        }

        // Some other consumer is already loading the buffers; all we can do is wait and retry.
        if (_loadingThreadId != kInvalidThreadId) {
            _haveBufferSpace.wait(lk);
            continue;
        }

        // Nothing buffered and nobody loading: this consumer does the loading.
        LOG(3) << "A consumer " << consumerId << " begins loading";

        try {
            _loadingThreadId = consumerId;

            _pipeline->reattachToOperationContext(opCtx);

            // Returns once some exchange buffer is full and no further progress is possible;
            // the return value is the index of the full consumer buffer.
            size_t blockedConsumerId = loadNextBatch();

            if (MONGO_FAIL_POINT(exchangeFailLoadNextBatch)) {
                log() << "exchangeFailLoadNextBatch fail point enabled.";
                uasserted(ErrorCodes::FailPointEnabled,
                          "Asserting on loading the next batch due to failpoint.");
            }

            _pipeline->detachFromOperationContext();

            // Loading cannot continue until the consumer with the full buffer consumes some
            // documents.
            _loadingThreadId = blockedConsumerId;

            // Wake everybody up so they can try to make progress.
            _haveBufferSpace.notify_all();
        } catch (const DBException& ex) {
            _errorInLoadNextBatch = ex.toStatus();

            // Blocked threads must be woken so they can see the error and fail too; this may
            // only happen after _errorInLoadNextBatch has been set.
            _haveBufferSpace.notify_all();

            throw;
        }
    }
}
bool receivedGetMore(DbResponse& dbresponse, Message& m, CurOp& curop ) { bool ok = true; DbMessage d(m); const char *ns = d.getns(); int ntoreturn = d.pullInt(); long long cursorid = d.pullInt64(); curop.debug().ns = ns; curop.debug().ntoreturn = ntoreturn; curop.debug().cursorid = cursorid; shared_ptr<AssertionException> ex; scoped_ptr<Timer> timer; int pass = 0; bool exhaust = false; QueryResult* msgdata = 0; OpTime last; while( 1 ) { bool isCursorAuthorized = false; try { const NamespaceString nsString( ns ); uassert( 16258, str::stream() << "Invalid ns [" << ns << "]", nsString.isValid() ); Status status = cc().getAuthorizationManager()->checkAuthForGetMore(ns); uassert(16543, status.reason(), status.isOK()); if (str::startsWith(ns, "local.oplog.")){ while (MONGO_FAIL_POINT(rsStopGetMore)) { sleepmillis(0); } if (pass == 0) { mutex::scoped_lock lk(OpTime::m); last = OpTime::getLast(lk); } else { last.waitForDifferent(1000/*ms*/); } } msgdata = processGetMore(ns, ntoreturn, cursorid, curop, pass, exhaust, &isCursorAuthorized); } catch ( AssertionException& e ) { if ( isCursorAuthorized ) { // If a cursor with id 'cursorid' was authorized, it may have been advanced // before an exception terminated processGetMore. Erase the ClientCursor // because it may now be out of sync with the client's iteration state. // SERVER-7952 // TODO Temporary code, see SERVER-4563 for a cleanup overview. ClientCursor::erase( cursorid ); } ex.reset( new AssertionException( e.getInfo().msg, e.getCode() ) ); ok = false; break; } if (msgdata == 0) { // this should only happen with QueryOption_AwaitData exhaust = false; massert(13073, "shutting down", !inShutdown() ); if ( ! timer ) { timer.reset( new Timer() ); } else { if ( timer->seconds() >= 4 ) { // after about 4 seconds, return. pass stops at 1000 normally. // we want to return occasionally so slave can checkpoint. 
pass = 10000; } } pass++; if (debug) sleepmillis(20); else sleepmillis(2); // note: the 1100 is beacuse of the waitForDifferent above // should eventually clean this up a bit curop.setExpectedLatencyMs( 1100 + timer->millis() ); continue; } break; }; if (ex) { exhaust = false; BSONObjBuilder err; ex->getInfo().append( err ); BSONObj errObj = err.done(); log() << errObj << endl; curop.debug().exceptionInfo = ex->getInfo(); if (ex->getCode() == 13436) { replyToQuery(ResultFlag_ErrSet, m, dbresponse, errObj); curop.debug().responseLength = dbresponse.response->header()->dataLen(); curop.debug().nreturned = 1; return ok; } msgdata = emptyMoreResult(cursorid); } Message *resp = new Message(); resp->setData(msgdata, true); curop.debug().responseLength = resp->header()->dataLen(); curop.debug().nreturned = msgdata->nReturned; dbresponse.response = resp; dbresponse.responseTo = m.header()->id; if( exhaust ) { curop.debug().exhaust = true; dbresponse.exhaustNS = ns; } return ok; }
/**
 * Runs one pass of the oplog producer: picks a sync source with an OplogReader, tails its
 * oplog with a Fetcher whose batches are handled by _fetcherCallback, and executes rollback
 * afterwards if the callback reported a bad remote oplog start.
 */
void BackgroundSync::_produce(OperationContext* txn, executor::TaskExecutor* taskExecutor) {
    // this oplog reader does not do a handshake because we don't want the server it's syncing
    // from to track how far it has synced
    {
        stdx::unique_lock<stdx::mutex> lock(_mutex);
        if (_lastOpTimeFetched.isNull()) {
            // We're initial syncing and still waiting for this to be set.
            lock.unlock();
            sleepsecs(1);
            return;
        }

        if (_replCoord->isWaitingForApplierToDrain() || _replCoord->getMemberState().primary() ||
            inShutdownStrict()) {
            return;
        }
    }

    // Spin for as long as the rsBgSyncProduce fail point is enabled.
    while (MONGO_FAIL_POINT(rsBgSyncProduce)) {
        sleepmillis(0);
    }

    // Find a target to sync from, starting at the last optime fetched.
    OpTime lastOpTimeFetched;
    {
        stdx::unique_lock<stdx::mutex> lock(_mutex);
        lastOpTimeFetched = _lastOpTimeFetched;
        _syncSourceHost = HostAndPort();
    }
    OplogReader syncSourceReader;
    syncSourceReader.connectToSyncSource(txn, lastOpTimeFetched, _replCoord);

    // No server found: there is no one to sync from.
    if (syncSourceReader.getHost().empty()) {
        sleepsecs(1);
        return;
    }

    long long lastHashFetched;
    {
        stdx::lock_guard<stdx::mutex> lock(_mutex);
        if (_pause) {
            return;
        }
        lastOpTimeFetched = _lastOpTimeFetched;
        lastHashFetched = _lastFetchedHash;
        _syncSourceHost = syncSourceReader.getHost();
        _replCoord->signalUpstreamUpdater();
    }

    const Milliseconds rollbackSocketTimeout(OplogReader::kSocketTimeout);

    // Prefer the host in the oplog reader to _syncSourceHost because _syncSourceHost may be
    // cleared if sync source feedback fails.
    const HostAndPort source = syncSourceReader.getHost();
    // No more references to the oplog reader from here on.
    syncSourceReader.resetConnection();

    // If this status is not OK after the fetcher returns from wait(), rollback is executed.
    Status remoteOplogStartStatus = Status::OK();

    auto onBatch = stdx::bind(&BackgroundSync::_fetcherCallback,
                              this,
                              stdx::placeholders::_1,
                              stdx::placeholders::_3,
                              stdx::cref(source),
                              lastOpTimeFetched,
                              lastHashFetched,
                              &remoteOplogStartStatus);

    auto findCmd = BSON("find" << nsToCollectionSubstring(rsOplogName) << "filter"
                               << BSON("ts" << BSON("$gte" << lastOpTimeFetched.getTimestamp()))
                               << "tailable" << true << "oplogReplay" << true << "awaitData"
                               << true << "maxTimeMS" << int(fetcherMaxTimeMS.count()));
    Fetcher fetcher(taskExecutor,
                    source,
                    nsToDatabase(rsOplogName),
                    findCmd,
                    onBatch,
                    rpc::makeEmptyMetadata());
    auto scheduleStatus = fetcher.schedule();
    if (!scheduleStatus.isOK()) {
        warning() << "unable to schedule fetcher to read remote oplog on " << source << ": "
                  << scheduleStatus;
        return;
    }
    fetcher.wait();

    // If the background sync is paused after the fetcher is started, we need to
    // re-evaluate our sync source and oplog common point.
    if (isPaused()) {
        return;
    }

    // Nothing more to do unless the remote oplog start check asked for a rollback.
    if (remoteOplogStartStatus.isOK()) {
        return;
    }

    // Rollback is a synchronous operation that uses the task executor and may not be
    // executed inside the fetcher callback.
    const int messagingPortTags = 0;
    ConnectionPool rollbackPool(messagingPortTags);
    std::unique_ptr<ConnectionPool::ConnectionPtr> rollbackConn;
    // Creates the connection to the sync source on first use and reuses it afterwards.
    auto getConnection =
        [&rollbackConn, &rollbackPool, rollbackSocketTimeout, source]() -> DBClientBase* {
        if (!rollbackConn.get()) {
            rollbackConn.reset(new ConnectionPool::ConnectionPtr(
                &rollbackPool, source, Date_t::now(), rollbackSocketTimeout));
        }
        return rollbackConn->get();
    };

    log() << "starting rollback: " << remoteOplogStartStatus;
    _rollback(txn, source, getConnection);
    stop();
}
// Theory of operation for waitForConditionOrInterruptNoAssertUntil and markKilled: // // An operation indicates to potential killers that it is waiting on a condition variable by setting // _waitMutex and _waitCV, while holding the lock on its parent Client. It then unlocks its Client, // unblocking any killers, which are required to have locked the Client before calling markKilled. // // When _waitMutex and _waitCV are set, killers must lock _waitMutex before setting the _killCode, // and must signal _waitCV before releasing _waitMutex. Unfortunately, they must lock _waitMutex // without holding a lock on Client to avoid a deadlock with callers of // waitForConditionOrInterruptNoAssertUntil(). So, in the event that _waitMutex is set, the killer // increments _numKillers, drops the Client lock, acquires _waitMutex and then re-acquires the // Client lock. We know that the Client, its OperationContext and _waitMutex will remain valid // during this period because the caller of waitForConditionOrInterruptNoAssertUntil will not return // while _numKillers > 0 and will not return until it has itself reacquired _waitMutex. Instead, // that caller will keep waiting on _waitCV until _numKillers drops to 0. // // In essence, when _waitMutex is set, _killCode is guarded by _waitMutex and _waitCV, but when // _waitMutex is not set, it is guarded by the Client spinlock. Changing _waitMutex is itself // guarded by the Client spinlock and _numKillers. // // When _numKillers does drop to 0, the waiter will null out _waitMutex and _waitCV. // // This implementation adds a minimum of two spinlock acquire-release pairs to every condition // variable wait. 
/**
 * Waits on 'cv' (with 'm' locked by the caller) until notified, until 'deadline' passes, or
 * until the operation is interrupted.
 *
 * Returns:
 *   - the non-OK status from checkForInterruptNoAssert() if the operation is interrupted,
 *     either before the wait starts or after it ends;
 *   - Status(_timeoutError, ...) if the wait timed out on the operation's own deadline;
 *   - otherwise the stdx::cv_status produced by the wait.
 *
 * While waiting, _waitMutex and _waitCV are published (under the Client lock) so that killers
 * can find them; they are cleared again only once _numKillers is 0.
 */
StatusWith<stdx::cv_status> OperationContext::waitForConditionOrInterruptNoAssertUntil(
    stdx::condition_variable& cv, stdx::unique_lock<stdx::mutex>& m, Date_t deadline) noexcept {
    invariant(getClient());
    {
        stdx::lock_guard<Client> clientLock(*getClient());
        // No other wait may be in progress on this operation.
        invariant(!_waitMutex);
        invariant(!_waitCV);
        invariant(0 == _numKillers);

        // This interrupt check must be done while holding the client lock, so as not to race with a
        // concurrent caller of markKilled.
        auto status = checkForInterruptNoAssert();
        if (!status.isOK()) {
            return status;
        }
        // Publish the mutex and condition variable we are about to wait on.
        _waitMutex = m.mutex();
        _waitCV = &cv;
    }

    // If the maxTimeNeverTimeOut failpoint is set, behave as though the operation's deadline does
    // not exist. Under normal circumstances, if the op has an existing deadline which is sooner
    // than the deadline passed into this method, we replace our deadline with the op's. This means
    // that we expect to time out at the same time as the existing deadline expires. If, when we
    // time out, we find that the op's deadline has not expired (as will always be the case if
    // maxTimeNeverTimeOut is set) then we assume that the incongruity is due to a clock mismatch
    // and return _timeoutError regardless. To prevent this behaviour, only consider the op's
    // deadline in the event that the maxTimeNeverTimeOut failpoint is not set.
    bool opHasDeadline = (hasDeadline() && !MONGO_FAIL_POINT(maxTimeNeverTimeOut));

    if (opHasDeadline) {
        deadline = std::min(deadline, getDeadline());
    }

    // Perform the actual wait: untimed when the effective deadline is Date_t::max(), otherwise
    // bounded by 'deadline'.
    const auto waitStatus = [&] {
        if (Date_t::max() == deadline) {
            Waitable::wait(_baton.get(), getServiceContext()->getPreciseClockSource(), cv, m);
            return stdx::cv_status::no_timeout;
        }
        return getServiceContext()->getPreciseClockSource()->waitForConditionUntil(
            cv, m, deadline, _baton.get());
    }();

    // Continue waiting on cv until no other thread is attempting to kill this one.
    Waitable::wait(_baton.get(), getServiceContext()->getPreciseClockSource(), cv, m, [this] {
        stdx::lock_guard<Client> clientLock(*getClient());
        if (0 == _numKillers) {
            // No killer is in flight: withdraw the published mutex/cv and stop waiting.
            _waitMutex = nullptr;
            _waitCV = nullptr;
            return true;
        }
        return false;
    });

    // An interrupt delivered during the wait takes precedence over the wait result.
    auto status = checkForInterruptNoAssert();
    if (!status.isOK()) {
        return status;
    }
    if (opHasDeadline && waitStatus == stdx::cv_status::timeout && deadline == getDeadline()) {
        // It's possible that the system clock used in stdx::condition_variable::wait_until
        // is slightly ahead of the FastClock used in checkForInterrupt. In this case,
        // we treat the operation as though it has exceeded its time limit, just as if the
        // FastClock and system clock had agreed.
        if (!_hasArtificialDeadline) {
            markKilled(_timeoutError);
        }
        return Status(_timeoutError, "operation exceeded time limit");
    }

    return waitStatus;
}
/**
 * Called by db/instance.cpp.  This is the getMore entry point.
 *
 * Pins the ClientCursor identified by 'cursorid', pulls results from the cursor's
 * PlanExecutor into a reply buffer until a batch limit is hit or the executor stops
 * advancing, and returns the buffer as an opReply message.
 *
 * txn - operation context; its RecoveryUnit is swapped with the cursor's for the duration
 *       of the call unless 'fromDBDirectClient' is true.
 * ns - namespace the getMore was issued against.  Must equal the namespace stored on the
 *      cursor, otherwise uassert 17011 fires.
 * ntoreturn - maximum number of documents for this batch; 0 disables the count limit, in
 *             which case only the MaxBytesToReturnToClientAtOnce byte limit ends the batch.
 * cursorid - id of the cursor to continue.  Taken by value; the local copy is zeroed when
 *            the cursor is not found or is not kept, and that value is written to the reply.
 * curop - receives the cursor's leftover max-time budget; whatever remains afterwards is
 *         written back to the cursor when the cursor is kept.
 * pass - when QueryOption_AwaitData is in use, the caller will make repeated calls
 *        when this method returns an empty result, incrementing pass on each call.
 *        Thus, pass == 0 indicates this is the first "attempt" before any 'awaiting'.
 * exhaust - out-parameter.  Set to false on entry and, when the cursor is kept, to whether
 *           QueryOption_Exhaust is set on the cursor.
 * isCursorAuthorized - out-parameter.  Set to true once the namespace check has passed; it
 *                      is not written when the cursor is not found.
 * fromDBDirectClient - when true no RecoveryUnit swap is performed; instead, if the cursor
 *                      has a RecoveryUnit it must already be the one on 'txn'.
 *
 * Returns the reply message.  Returns NULL without building a reply when a tailable
 * AwaitData cursor hit EOF with zero results and pass < 1000.
 *
 * uasserts 17356 if the collection no longer exists, 17011 on a namespace mismatch and
 * 17406 on a plan executor error; also propagates failures from checkCanServeReadsFor()
 * and checkForInterrupt().
 */
QueryResult::View newGetMore(OperationContext* txn,
                             const char* ns,
                             int ntoreturn,
                             long long cursorid,
                             CurOp& curop,
                             int pass,
                             bool& exhaust,
                             bool* isCursorAuthorized,
                             bool fromDBDirectClient) {

    // For testing, we may want to fail if we receive a getmore.
    if (MONGO_FAIL_POINT(failReceivedGetmore)) {
        invariant(0);
    }

    exhaust = false;

    // This is a read lock.  It is held through 'ctx' so that it can be dropped early
    // (see the agg cursor case below) by resetting the pointer.
    const NamespaceString nss(ns);
    scoped_ptr<AutoGetCollectionForRead> ctx(new AutoGetCollectionForRead(txn, nss));
    Collection* collection = ctx->getCollection();
    uassert( 17356, "collection dropped between getMore calls", collection );

    QLOG() << "Running getMore, cursorid: " << cursorid << endl;

    // This checks to make sure the operation is allowed on a replicated node.  Since we are not
    // passing in a query object (necessary to check SlaveOK query option), the only state where
    // reads are allowed is PRIMARY (or master in master/slave).  This function uasserts if
    // reads are not okay.
    Status status = repl::getGlobalReplicationCoordinator()->checkCanServeReadsFor(
        txn,
        nss,
        true);
    uassertStatusOK(status);

    // A pin performs a CC lookup and if there is a CC, increments the CC's pin value so it
    // doesn't time out.  Also informs ClientCursor that there is somebody actively holding the
    // CC, so don't delete it.
    ClientCursorPin ccPin(collection, cursorid);
    ClientCursor* cc = ccPin.c();

    // If we're not being called from DBDirectClient we want to associate the RecoveryUnit
    // used to create the execution machinery inside the cursor with our OperationContext.
    // If we throw or otherwise exit this method in a disorderly fashion, we must ensure
    // that further calls to getMore won't fail, and that the provided OperationContext
    // has a valid RecoveryUnit.  As such, we use RAII to accomplish this.
    //
    // This must be destroyed before the ClientCursor is destroyed.  It is declared after
    // 'ccPin', so on scope exit it is destroyed first.
    //
    // NOTE(review): std::auto_ptr is deprecated since C++11 and removed in C++17 -- confirm
    // the toolchain before touching this declaration.
    std::auto_ptr<ScopedRecoveryUnitSwapper> ruSwapper;

    // These are set in the QueryResult msg we return.
    int resultFlags = ResultFlag_AwaitCapable;

    int numResults = 0;
    int startingResult = 0;

    const int InitialBufSize =
        512 + sizeof(QueryResult::Value) + MaxBytesToReturnToClientAtOnce;

    // Leave room at the front of the buffer for the reply header, which is filled in at the
    // very end of this function.
    BufBuilder bb(InitialBufSize);
    bb.skip(sizeof(QueryResult::Value));

    if (NULL == cc) {
        // No such cursor: reply with cursor id 0 and the CursorNotFound flag.
        cursorid = 0;
        resultFlags = ResultFlag_CursorNotFound;
    }
    else {
        // Quote: check for spoofing of the ns such that it does not match the one originally
        // there for the cursor
        uassert(17011, "auth error", str::equals(ns, cc->ns().c_str()));
        *isCursorAuthorized = true;

        // Restore the RecoveryUnit if we need to.
        if (fromDBDirectClient) {
            if (cc->hasRecoveryUnit())
                invariant(txn->recoveryUnit() == cc->getUnownedRecoveryUnit());
        }
        else {
            if (!cc->hasRecoveryUnit()) {
                // Start using a new RecoveryUnit
                cc->setOwnedRecoveryUnit(
                    getGlobalEnvironment()->getGlobalStorageEngine()->newRecoveryUnit(txn));
            }
            // Swap RecoveryUnit(s) between the ClientCursor and OperationContext.
            ruSwapper.reset(new ScopedRecoveryUnitSwapper(cc, txn));
        }

        // Reset timeout timer on the cursor since the cursor is still in use.
        cc->setIdleTime(0);

        // TODO: fail point?

        // If the operation that spawned this cursor had a time limit set, apply leftover
        // time to this getmore.
        curop.setMaxTimeMicros(cc->getLeftoverMaxTimeMicros());
        txn->checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point.

        // Only on the first attempt; later AwaitData passes skip this.
        if (0 == pass) {
            cc->updateSlaveLocation(txn, curop);
        }

        if (cc->isAggCursor) {
            // Agg cursors handle their own locking internally.
            ctx.reset(); // unlocks
        }

        CollectionMetadataPtr collMetadata = cc->getCollMetadata();

        // If we're replaying the oplog, we save the last time that we read.
        OpTime slaveReadTill;

        // What number result are we starting at?  Used to fill out the reply.
        startingResult = cc->pos();

        // What gives us results.
        PlanExecutor* exec = cc->getExecutor();
        const int queryOptions = cc->queryOptions();

        // Get results out of the executor.
        exec->restoreState(txn);

        BSONObj obj;
        PlanExecutor::ExecState state;
        while (PlanExecutor::ADVANCED == (state = exec->getNext(&obj, NULL))) {
            // Add result to output buffer.
            bb.appendBuf((void*)obj.objdata(), obj.objsize());

            // Count the result.
            ++numResults;

            // Possibly note slave's position in the oplog.
            if (queryOptions & QueryOption_OplogReplay) {
                BSONElement e = obj["ts"];
                if (Date == e.type() || Timestamp == e.type()) {
                    slaveReadTill = e._opTime();
                }
            }

            // Stop once the batch is full, either by document count (when a count limit was
            // given) or by reply size.  'state' stays ADVANCED in this case.
            if ((ntoreturn && numResults >= ntoreturn)
                || bb.len() > MaxBytesToReturnToClientAtOnce) {
                break;
            }
        }

        // We save the client cursor when there might be more results, and hence we may receive
        // another getmore. If we receive a EOF or an error, or 'exec' is dead, then we know
        // that we will not be producing more results. We indicate that the cursor is closed by
        // sending a cursorId of 0 back to the client.
        //
        // On the other hand, if we retrieve all results necessary for this batch, then
        // 'saveClientCursor' is true and we send a valid cursorId back to the client. In
        // this case, there may or may not actually be more results (for example, the next call
        // to getNext(...) might just return EOF).
        bool saveClientCursor = false;

        if (PlanExecutor::DEAD == state || PlanExecutor::EXEC_ERROR == state) {
            // Propagate this error to caller.  Only EXEC_ERROR throws; DEAD falls through.
            if (PlanExecutor::EXEC_ERROR == state) {
                scoped_ptr<PlanStageStats> stats(exec->getStats());
                error() << "Plan executor error, stats: "
                        << Explain::statsToBSON(*stats);
                uasserted(17406, "getMore executor error: " +
                          WorkingSetCommon::toStatusString(obj));
            }

            // If we're dead there's no way to get more results.
            saveClientCursor = false;

            // In the old system tailable capped cursors would be killed off at the
            // cursorid level.  If a tailable capped cursor is nuked the cursorid
            // would vanish.
            //
            // In the new system they die and are cleaned up later (or time out).
            // So this is where we get to remove the cursorid.
            if (0 == numResults) {
                resultFlags = ResultFlag_CursorNotFound;
            }
        }
        else if (PlanExecutor::IS_EOF == state) {
            // EOF is also end of the line unless it's tailable.
            saveClientCursor = queryOptions & QueryOption_CursorTailable;
        }
        else {
            // The loop above was left through the batch-full 'break'.
            verify(PlanExecutor::ADVANCED == state);
            saveClientCursor = true;
        }

        if (!saveClientCursor) {
            // Release the RecoveryUnit swapper before the cursor it refers to is deleted.
            ruSwapper.reset();
            ccPin.deleteUnderlying();
            // cc is now invalid, as is the executor
            cursorid = 0;
            cc = NULL;
            QLOG() << "getMore NOT saving client cursor, ended with state "
                   << PlanExecutor::statestr(state)
                   << endl;
        }
        else {
            // Continue caching the ClientCursor.
            cc->incPos(numResults);
            exec->saveState();
            QLOG() << "getMore saving client cursor ended with state "
                   << PlanExecutor::statestr(state)
                   << endl;

            if (PlanExecutor::IS_EOF == state && (queryOptions & QueryOption_CursorTailable)) {
                if (!fromDBDirectClient) {
                    // Don't stash the RU. Get a new one on the next getMore.
                    ruSwapper.reset();
                    delete cc->releaseOwnedRecoveryUnit();
                }

                if ((queryOptions & QueryOption_AwaitData)
                        && (numResults == 0)
                        && (pass < 1000)) {
                    // Bubble up to the AwaitData handling code in receivedGetMore which will
                    // try again.
                    return NULL;
                }
            }

            // Possibly note slave's position in the oplog.
            if ((queryOptions & QueryOption_OplogReplay) && !slaveReadTill.isNull()) {
                cc->slaveReadTill(slaveReadTill);
            }

            exhaust = (queryOptions & QueryOption_Exhaust);

            // If the getmore had a time limit, remaining time is "rolled over" back to the
            // cursor (for use by future getmore ops).
            cc->setLeftoverMaxTimeMicros( curop.getRemainingMaxTimeMicros() );
        }
    }

    // Fill in the reply header in the space skipped at the start of the buffer, then detach
    // the buffer from the builder before handing it back.
    QueryResult::View qr = bb.buf();
    qr.msgdata().setLen(bb.len());
    qr.msgdata().setOperation(opReply);
    qr.setResultFlags(resultFlags);
    qr.setCursorId(cursorid);
    qr.setStartingFrom(startingResult);
    qr.setNReturned(numResults);
    bb.decouple();
    QLOG() << "getMore returned " << numResults << " results\n";
    return qr;
}
/* tail an oplog. ok to return, will be re-called. */ void SyncTail::oplogApplication() { while( 1 ) { OpQueue ops; verify( !Lock::isLocked() ); Timer batchTimer; int lastTimeChecked = 0; do { if (theReplSet->isPrimary()) { massert(16620, "there are ops to sync, but I'm primary", ops.empty()); return; } int now = batchTimer.seconds(); // apply replication batch limits if (!ops.empty()) { if (now > replBatchLimitSeconds) break; if (ops.getDeque().size() > replBatchLimitOperations) break; } // occasionally check some things // (always checked in the first iteration of this do-while loop, because // ops is empty) if (ops.empty() || now > lastTimeChecked) { { boost::unique_lock<boost::mutex> lock(theReplSet->initialSyncMutex); if (theReplSet->initialSyncRequested) { // got a resync command return; } } lastTimeChecked = now; // can we become secondary? // we have to check this before calling mgr, as we must be a secondary to // become primary if (!theReplSet->isSecondary()) { OpTime minvalid; OperationContextImpl txn; theReplSet->tryToGoLiveAsASecondary(&txn, minvalid); } // normally msgCheckNewState gets called periodically, but in a single node // replset there are no heartbeat threads, so we do it here to be sure. this is // relevant if the singleton member has done a stepDown() and needs to come back // up. if (theReplSet->config().members.size() == 1 && theReplSet->myConfig().potentiallyHot()) { Manager* mgr = theReplSet->mgr; // When would mgr be null? During replsettest'ing, in which case we should // fall through and actually apply ops as if we were a real secondary. 
if (mgr) { mgr->send(stdx::bind(&Manager::msgCheckNewState, theReplSet->mgr)); sleepsecs(1); // There should never be ops to sync in a 1-member set, anyway return; } } } const int slaveDelaySecs = theReplSet->myConfig().slaveDelay; if (!ops.empty() && slaveDelaySecs > 0) { const BSONObj& lastOp = ops.getDeque().back(); const unsigned int opTimestampSecs = lastOp["ts"]._opTime().getSecs(); // Stop the batch as the lastOp is too new to be applied. If we continue // on, we can get ops that are way ahead of the delay and this will // make this thread sleep longer when handleSlaveDelay is called // and apply ops much sooner than we like. if (opTimestampSecs > static_cast<unsigned int>(time(0) - slaveDelaySecs)) { break; } } // keep fetching more ops as long as we haven't filled up a full batch yet } while (!tryPopAndWaitForMore(&ops) && // tryPopAndWaitForMore returns true // when we need to end a batch early (ops.getSize() < replBatchLimitBytes)); // For pausing replication in tests while (MONGO_FAIL_POINT(rsSyncApplyStop)) { sleepmillis(0); } const BSONObj& lastOp = ops.getDeque().back(); setOplogVersion(lastOp); handleSlaveDelay(lastOp); // Set minValid to the last op to be applied in this next batch. // This will cause this node to go into RECOVERING state // if we should crash and restart before updating the oplog theReplSet->setMinValid(lastOp); if (BackgroundSync::get()->isAssumingPrimary()) { LOG(1) << "about to apply batch up to optime: " << ops.getDeque().back()["ts"]._opTime().toStringPretty(); } multiApply(ops.getDeque(), multiSyncApply); if (BackgroundSync::get()->isAssumingPrimary()) { LOG(1) << "about to update oplog to optime: " << ops.getDeque().back()["ts"]._opTime().toStringPretty(); } applyOpsToOplog(&ops.getDeque()); // If we're just testing (no manager), don't keep looping if we exhausted the bgqueue if (!theReplSet->mgr) { BSONObj op; if (!peek(&op)) { return; } } } }
void BackgroundSync::produce() { // this oplog reader does not do a handshake because we don't want the server it's syncing // from to track how far it has synced OplogReader r(false /* doHandshake */); // find a target to sync from the last op time written getOplogReader(r); // no server found { boost::unique_lock<boost::mutex> lock(_mutex); if (_currentSyncTarget == NULL) { lock.unlock(); sleepsecs(1); // if there is no one to sync from return; } r.tailingQueryGTE(rsoplog, _lastOpTimeFetched); } // if target cut connections between connecting and querying (for // example, because it stepped down) we might not have a cursor if (!r.haveCursor()) { return; } while (MONGO_FAIL_POINT(rsBgSyncProduce)) { sleepmillis(0); } uassert(1000, "replSet source for syncing doesn't seem to be await capable -- is it an older version of mongodb?", r.awaitCapable() ); if (isRollbackRequired(r)) { stop(); return; } while (!inShutdown()) { while (!inShutdown()) { if (!r.moreInCurrentBatch()) { int bs = r.currentBatchMessageSize(); if( bs > 0 && bs < BatchIsSmallish ) { // on a very low latency network, if we don't wait a little, we'll be // getting ops to write almost one at a time. this will both be expensive // for the upstream server as well as postentiallyd efating our parallel // application of batches on the secondary. // // the inference here is basically if the batch is really small, we are // "caught up". 
// dassert( !Lock::isLocked() ); sleepmillis(SleepToAllowBatchingMillis); } if (theReplSet->gotForceSync()) { return; } if (isAssumingPrimary() || theReplSet->isPrimary()) { return; } // re-evaluate quality of sync target if (shouldChangeSyncTarget()) { return; } //record time for each getmore { TimerHolder batchTimer(&getmoreReplStats); r.more(); } //increment networkByteStats.increment(r.currentBatchMessageSize()); } if (!r.more()) break; BSONObj o = r.nextSafe().getOwned(); opsReadStats.increment(); { boost::unique_lock<boost::mutex> lock(_mutex); _appliedBuffer = false; } OCCASIONALLY { LOG(2) << "bgsync buffer has " << _buffer.size() << " bytes" << rsLog; } // the blocking queue will wait (forever) until there's room for us to push _buffer.push(o); bufferCountGauge.increment(); bufferSizeGauge.increment(getSize(o)); { boost::unique_lock<boost::mutex> lock(_mutex); _lastH = o["h"].numberLong(); _lastOpTimeFetched = o["ts"]._opTime(); } } // end while { boost::unique_lock<boost::mutex> lock(_mutex); if (_pause || !_currentSyncTarget || !_currentSyncTarget->hbinfo().hbstate.readable()) { return; } } r.tailCheck(); if( !r.haveCursor() ) { LOG(1) << "replSet end syncTail pass" << rsLog; return; } // looping back is ok because this is a tailable cursor } }
/**
 * Handles a getMore for a ClientCursor that is not backed by a Runner.  Cursors that do
 * have a Runner are handed to newGetMore() instead.
 *
 * Walks the underlying Cursor, appending matching, owned, non-duplicate documents to the
 * reply until 'ntoreturn' documents or MaxBytesToReturnToClientAtOnce bytes have been
 * gathered, or the cursor runs out.
 *
 * ns - namespace the getMore was issued against; must equal the namespace stored on the
 *      cursor, otherwise uassert 14833 fires.
 * ntoreturn - maximum number of documents for this batch; 0 disables the count limit.
 * cursorid - id of the cursor to continue.  Taken by value; the local copy is zeroed when
 *            the cursor is not found or is erased, and that value is written to the reply.
 * curop - receives the cursor's leftover max-time budget and its query; the remaining time
 *         is written back to the cursor when the cursor survives.
 * pass - attempt counter.  Slave location is only updated when it is 0, and the AwaitData
 *        early return below only happens while it is < 1000.
 * exhaust - out-parameter.  Set to false here and, when the cursor survives, to whether
 *           QueryOption_Exhaust is set on the cursor.
 * isCursorAuthorized - out-parameter.  Set to true once the namespace check has passed.
 *
 * Returns a heap buffer holding the reply (decoupled from the local BufBuilder), or 0 when
 * a tailable AwaitData cursor has nothing new and pass < 1000 -- presumably so the caller
 * can wait and retry; confirm against the caller.
 */
QueryResult* processGetMore(const char* ns,
                            int ntoreturn,
                            long long cursorid,
                            CurOp& curop,
                            int pass,
                            bool& exhaust,
                            bool* isCursorAuthorized ) {

    bool hasRunner = false;

    // Scoped to kill the pin after seeing if the runner's there.
    {
        // See if there's a runner.  We do this until agg. is behind a Runner instead of a CC.
        ClientCursorPin p(cursorid);
        ClientCursor *cc = p.c();
        if (NULL != cc && NULL != cc->getRunner()) {
            hasRunner = true;
        }
    }

    if (hasRunner) {
        return newGetMore(ns, ntoreturn, cursorid, curop, pass, exhaust,
                          isCursorAuthorized);
    }

    exhaust = false;

    int bufSize = 512 + sizeof( QueryResult ) + MaxBytesToReturnToClientAtOnce;

    // Leave room at the front of the buffer for the reply header, which is filled in at the
    // end of this function.
    BufBuilder b( bufSize );
    b.skip(sizeof(QueryResult));
    int resultFlags = ResultFlag_AwaitCapable;
    int start = 0;
    int n = 0;

    scoped_ptr<Client::ReadContext> ctx(new Client::ReadContext(ns));
    // call this readlocked so state can't change
    replVerifyReadsOk();

    ClientCursorPin p(cursorid);
    ClientCursor *cc = p.c();

    if ( unlikely(!cc) ) {
        // No such cursor: reply with cursor id 0 and the CursorNotFound flag.
        LOGSOME << "getMore: cursorid not found " << ns << " " << cursorid << endl;
        cursorid = 0;
        resultFlags = ResultFlag_CursorNotFound;
    }
    else {
        // Some internal users create a ClientCursor with a Runner.  Don't crash if this
        // happens.  Instead, hand them off to the new framework.
        if (NULL != cc->getRunner()) {
            p.release();
            return newGetMore(ns, ntoreturn, cursorid, curop, pass, exhaust,
                              isCursorAuthorized);
        }

        // check for spoofing of the ns such that it does not match the one originally there
        // for the cursor
        uassert(14833, "auth error", str::equals(ns, cc->ns().c_str()));

        *isCursorAuthorized = true;

        // This must be done after auth check to ensure proper cleanup.
        uassert(16951, "failing getmore due to set failpoint",
                !MONGO_FAIL_POINT(getMoreError));

        // If the operation that spawned this cursor had a time limit set, apply leftover
        // time to this getmore.
        curop.setMaxTimeMicros( cc->getLeftoverMaxTimeMicros() );
        killCurrentOp.checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point.

        // Only on the first attempt; later passes skip this.
        if ( pass == 0 )
            cc->updateSlaveLocation( curop );

        int queryOptions = cc->queryOptions();

        curop.debug().query = cc->query();
        curop.setQuery( cc->query() );

        // Position of the first document of this batch; reported as startingFrom.
        start = cc->pos();
        Cursor *c = cc->c();

        if (!c->requiresLock()) {
            // make sure it won't be destroyed under us
            fassert(16952, !c->shouldDestroyOnNSDeletion());
            fassert(16953, !c->supportYields());
            ctx.reset(); // unlocks
        }

        c->recoverFromYield();
        // Location of the last document added to the reply; handed to storeOpForSlave().
        DiskLoc last;

        // This metadata may be stale, but it's the state of chunking when the cursor was
        // created.
        CollectionMetadataPtr metadata = cc->getCollMetadata();
        KeyPattern keyPattern( metadata ? metadata->getKeyPattern() : BSONObj() );

        while ( 1 ) {
            if ( !c->ok() ) {
                if ( c->tailable() ) {
                    // when a tailable cursor hits "EOF", ok() goes false, and current() is
                    // null.  however advance() can still be retries as a reactivation attempt.
                    // when there is new data, it will return true.  that's what we are doing
                    // here.
                    if ( c->advance() )
                        continue;

                    // Nothing gathered and the client asked to wait for data: return no
                    // reply at all.
                    if( n == 0 && (queryOptions & QueryOption_AwaitData) && pass < 1000 ) {
                        return 0;
                    }

                    // NOTE(review): this exit does not call cc->incPos( n ), unlike the
                    // batch-full exit below -- confirm that is intended when n > 0.
                    break;
                }
                // Non-tailable cursor is exhausted: unpin it, erase it and report id 0.
                p.release();
                bool ok = ClientCursor::erase(cursorid);
                verify(ok);
                cursorid = 0;
                cc = 0;
                break;
            }

            MatchDetails details;
            if ( cc->fields && cc->fields->getArrayOpType() == Projection::ARRAY_OP_POSITIONAL ) {
                // field projection specified, and contains an array operator
                details.requestElemMatchKey();
            }

            // in some cases (clone collection) there won't be a matcher
            if ( !c->currentMatches( &details ) ) {
                // Not a match: nothing to add, just advance below.
            }
            else if ( metadata && !metadata->keyBelongsToMe( extractKey(c, keyPattern ) ) ) {
                LOG(2) << "cursor skipping document in un-owned chunk: " << c->current()
                       << endl;
            }
            else {
                if( c->getsetdup(c->currLoc()) ) {
                    //out() << " but it's a dup \n";
                }
                else {
                    last = c->currLoc();
                    n++;

                    // Fill out the fields requested by the query.
                    const Projection::KeyOnly *keyFieldsOnly = c->keyFieldsOnly();
                    if ( keyFieldsOnly ) {
                        fillQueryResultFromObj( b, 0, keyFieldsOnly->hydrate(
                        c->currKey() ), &details );
                    }
                    else {
                        DiskLoc loc = c->currLoc();
                        fillQueryResultFromObj( b, cc->fields.get(), c->current(), &details,
                                ( ( cc->pq.get() && cc->pq->showDiskLoc() ) ? &loc : 0 ) );
                    }

                    // Batch is full (by count or by bytes): step past the document just
                    // returned, record the new position and stop.
                    if ( ( ntoreturn && n >= ntoreturn ) || b.len() > MaxBytesToReturnToClientAtOnce ) {
                        c->advance();
                        cc->incPos( n );
                        break;
                    }
                }
            }
            c->advance();

            // A false return from yieldSometimes() ends the batch and drops the cursor.
            // NOTE(review): unlike the exhausted-cursor path above, the pin is not released
            // before erase() here -- confirm that is intended.
            if ( ! cc->yieldSometimes( ( c->ok() && c->keyFieldsOnly() ) ?
                                       ClientCursor::DontNeed : ClientCursor::WillNeed ) ) {
                ClientCursor::erase(cursorid);
                cursorid = 0;
                cc = 0;
                break;
            }
        }

        // The cursor survived this batch: save its position so a later getMore can resume.
        if ( cc ) {
            if ( c->supportYields() ) {
                ClientCursor::YieldData data;
                verify( cc->prepareToYield( data ) );
            }
            else {
                cc->c()->noteLocation();
            }
            cc->storeOpForSlave( last );
            exhaust = cc->queryOptions() & QueryOption_Exhaust;

            // If the getmore had a time limit, remaining time is "rolled over" back to the
            // cursor (for use by future getmore ops).
            cc->setLeftoverMaxTimeMicros( curop.getRemainingMaxTimeMicros() );
        }
    }

    // Fill in the reply header in the space skipped at the start of the buffer, then detach
    // the buffer from the builder before handing it back.
    QueryResult *qr = (QueryResult *) b.buf();
    qr->len = b.len();
    qr->setOperation(opReply);
    qr->_resultFlags() = resultFlags;
    qr->cursorId = cursorid;
    qr->startingFrom = start;
    qr->nReturned = n;
    b.decouple();

    return qr;
}