/**
 * Creates a memory device context backed by a 24-bit, top-down DIB section of the requested
 * size, selects the DIB into the context, writes the context out through writeTIFF(), and
 * hands the context to the caller.
 *
 * The previous implementation returned the address of a local variable and let the scope guard
 * delete the device context before returning, so callers always received a dangling pointer to
 * a destroyed handle. It also leaked the DIB section.
 *
 * @param width   Image width in pixels.
 * @param height  Image height in pixels.
 * @return A heap-allocated HDC, or NULL if the device context or the DIB section could not be
 *         created. On success the caller owns everything and must, in this order: select a
 *         different bitmap into (or delete) the device context, ::DeleteObject() the DIB
 *         (obtainable via ::GetCurrentObject(*result, OBJ_BITMAP)), ::DeleteDC(*result), and
 *         finally `delete result`.
 */
HDC * CreateImage(long width, long height)
{
    BITMAPINFO bmInfo = {0};  // Aggregate initialisation already zeroes the whole structure.
    bmInfo.bmiHeader.biSize = sizeof(BITMAPINFOHEADER);
    bmInfo.bmiHeader.biWidth = width;
    bmInfo.bmiHeader.biHeight = height * -1;  // Negative gives us a top-down image.
    bmInfo.bmiHeader.biPlanes = 1;
    bmInfo.bmiHeader.biBitCount = 24;
    bmInfo.bmiHeader.biCompression = BI_RGB;

    HDC hdc = ::CreateCompatibleDC(NULL);
    if (hdc == NULL)
    {
        return NULL;
    }
    ScopeGuard guardDC = MakeGuard(::DeleteDC, hdc);

    unsigned char *pBits = NULL;
    HBITMAP hbm = ::CreateDIBSection(hdc, &bmInfo, DIB_RGB_COLORS, (void **)&pBits, NULL, 0);
    if (hbm == NULL)
    {
        return NULL;  // guardDC releases the device context.
    }
    ScopeGuard guardBitmap = MakeGuard(::DeleteObject, hbm);

    // Select the DIB into the context so that drawing on (and saving) the context targets our
    // pixels. A bitmap cannot be deleted while it is selected, so on an early exit the guard
    // below restores the previous bitmap before guardBitmap and guardDC run (guards fire in
    // reverse order of declaration).
    HGDIOBJ previousBitmap = ::SelectObject(hdc, hbm);
    ScopeGuard guardSelection = MakeGuard(::SelectObject, hdc, previousBitmap);

    writeTIFF("C:\\Flake.tif", hdc);

    // Allocate the result before dismissing the guards: if the allocation throws, everything
    // created above is still cleaned up.
    HDC *result = new HDC(hdc);

    guardSelection.Dismiss();
    guardBitmap.Dismiss();
    guardDC.Dismiss();
    return result;
}
/**
 * Deletes the given session ids from the persistent sessions collection.
 *
 * All locks held by the operation are released for the duration of the call and re-acquired
 * (uninterruptibly) before returning, on both the normal and the exceptional path.
 *
 * Returns the number of sessions actually removed. Failures reported by the sessions collection
 * are rethrown through uassertStatusOK().
 */
int removeSessionsRecords(OperationContext* opCtx,
                          SessionsCollection& sessionsCollection,
                          const LogicalSessionIdSet& sessionIdsToRemove) {
    if (sessionIdsToRemove.empty()) {
        return 0;
    }

    // Yield every lock currently held; the guard below takes them back on scope exit.
    Locker* lockState = opCtx->lockState();
    Locker::LockSnapshot savedLocks;
    invariant(lockState->saveLockStateAndUnlock(&savedLocks));

    const auto relockOnExit = MakeGuard([&] {
        UninterruptibleLockGuard noInterrupt(opCtx->lockState());
        lockState->restoreLockState(opCtx, savedLocks);
    });

    // With the top-level locks gone, also let go of any lower-level (storage engine-specific)
    // resources. Yielding here is safe.
    opCtx->recoveryUnit()->abandonSnapshot();

    // Record the yield in CurOp.
    CurOp::get(opCtx)->yielded();

    auto removedIds =
        uassertStatusOK(sessionsCollection.findRemovedSessions(opCtx, sessionIdsToRemove));
    uassertStatusOK(sessionsCollection.removeTransactionRecords(opCtx, removedIds));

    return removedIds.size();
}
/**
 * Moves the migration from kCreated to kCloning: logs the "moveChunk.start" change event,
 * creates the clone driver, registers this manager with the collection's sharding state and
 * starts the clone. Must be called without any locks held.
 *
 * cleanupOnError() runs on every exit path except the successful one.
 */
Status MigrationSourceManager::startClone(OperationContext* txn) {
    invariant(!txn->lockState()->isLocked());
    invariant(_state == kCreated);
    auto cleanupGuard = MakeGuard([&] { cleanupOnError(txn); });

    grid.catalogClient(txn)->logChange(txn,
                                       "moveChunk.start",
                                       _args.getNss().ns(),
                                       BSON("min" << _args.getMinKey() << "max"
                                                  << _args.getMaxKey()
                                                  << "from"
                                                  << _args.getFromShardId()
                                                  << "to"
                                                  << _args.getToShardId()));

    _cloneDriver = stdx::make_unique<MigrationChunkClonerSourceLegacy>(
        _args, _committedMetadata->getKeyPattern());

    {
        // Hook this manager up to the collection so it is notified by the replication subsystem.
        ScopedTransaction scopedXact(txn, MODE_IX);
        AutoGetCollection autoColl(txn, _args.getNss(), MODE_IX, MODE_X);

        auto shardingState = CollectionShardingState::get(txn, _args.getNss().ns());
        shardingState->setMigrationSourceManager(txn, this);
    }

    Status cloneStatus = _cloneDriver->startClone(txn);
    if (!cloneStatus.isOK()) {
        return cloneStatus;
    }

    _state = kCloning;
    cleanupGuard.Dismiss();
    return Status::OK();
}
/**
 * Runs the server side of the SSL handshake on 'socket' and returns the resulting connection.
 *
 * SSL_accept() is retried until _doneWithSSLOp() reports that the operation has finished. A
 * final result other than 1 is passed to _handleSSLError(). If control leaves this function
 * early, the connection's SSL object and network BIO are freed by the guards.
 *
 * NOTE(review): 'conn' itself is not released on that early-exit path -- confirm whether
 * _handleSSLError() can throw and, if so, who owns the SSLConnection.
 */
SSLConnection* SSLManager::accept(Socket* socket) {
    SSLConnection* conn = new SSLConnection(_serverContext, socket);
    ScopeGuard freeSslOnError = MakeGuard(::SSL_free, conn->ssl);
    ScopeGuard freeBioOnError = MakeGuard(::BIO_free, conn->networkBIO);

    int rc = ::SSL_accept(conn->ssl);
    while (!_doneWithSSLOp(conn, rc)) {
        rc = ::SSL_accept(conn->ssl);
    }

    if (rc != 1) {
        _handleSSLError(SSL_get_error(conn, rc));
    }

    freeSslOnError.Dismiss();
    freeBioOnError.Dismiss();
    return conn;
}
/**
 * Starts the clone: resolves the donor connection string and the recipient host, stores the
 * currently available documents and sends the start-chunk-clone command to the recipient.
 * Must be called without any locks held.
 *
 * cancelClone() runs on every exit path except the successful one.
 */
Status MigrationChunkClonerSourceLegacy::startClone(OperationContext* txn) {
    invariant(!txn->lockState()->isLocked());
    auto cancelGuard = MakeGuard([&] { cancelClone(txn); });

    // Look up the donor shard and remember its connection string.
    {
        auto donorLookup = grid.shardRegistry()->getShard(txn, _args.getFromShardId());
        if (!donorLookup.isOK()) {
            return donorLookup.getStatus();
        }
        _donorCS = donorLookup.getValue()->getConnString();
    }

    // Look up the recipient shard and pick its primary host.
    {
        auto recipientLookup = grid.shardRegistry()->getShard(txn, _args.getToShardId());
        if (!recipientLookup.isOK()) {
            return recipientLookup.getStatus();
        }

        auto recipient = recipientLookup.getValue();
        auto primaryHost = recipient->getTargeter()->findHost(
            ReadPreferenceSetting{ReadPreference::PrimaryOnly});
        if (!primaryHost.isOK()) {
            return primaryHost.getStatus();
        }
        _recipientHost = std::move(primaryHost.getValue());
    }

    // Record the documents which are available right now.
    Status storeStatus = _storeCurrentLocs(txn);
    if (!storeStatus.isOK()) {
        return storeStatus;
    }

    // Ask the recipient shard to begin cloning.
    BSONObjBuilder startCloneCmd;
    StartChunkCloneRequest::appendAsCommand(&startCloneCmd,
                                            _args.getNss(),
                                            _sessionId,
                                            _args.getConfigServerCS(),
                                            _donorCS,
                                            _args.getFromShardId(),
                                            _args.getToShardId(),
                                            _args.getMinKey(),
                                            _args.getMaxKey(),
                                            _shardKeyPattern.toBSON(),
                                            _args.getSecondaryThrottle());

    auto recipientReply = _callRecipient(startCloneCmd.obj());
    if (!recipientReply.isOK()) {
        return recipientReply.getStatus();
    }

    cancelGuard.Dismiss();
    return Status::OK();
}
/**
 * Wraps 'fd' in an SSL object via _secure() and runs the server side of the handshake on it.
 *
 * A result other than 1 from SSL_accept() is passed to _handleSSLError(). If control leaves
 * this function early, the SSL object is freed by the guard; otherwise it is returned to the
 * caller.
 */
SSL* SSLManager::accept(int fd) {
    SSL* session = _secure(fd);
    ScopeGuard freeOnError = MakeGuard(::SSL_free, session);

    const int rc = SSL_accept(session);
    if (rc != 1) {
        _handleSSLError(SSL_get_error(session, rc));
    }

    freeOnError.Dismiss();
    return session;
}
void GlobalEnvironmentMongoD::setGlobalStorageEngine(const std::string& name) { // This should be set once. invariant(!_storageEngine); const StorageEngine::Factory* factory = _storageFactories[name]; uassert(18656, str::stream() << "Cannot start server with an unknown storage engine: " << name, factory); std::string canonicalName = factory->getCanonicalName().toString(); // Do not proceed if data directory has been used by a different storage engine previously. std::auto_ptr<StorageEngineMetadata> metadata = StorageEngineMetadata::validate(storageGlobalParams.dbpath, canonicalName); // Validate options in metadata against current startup options. if (metadata.get()) { uassertStatusOK(factory->validateMetadata(*metadata, storageGlobalParams)); } try { _lockFile.reset(new StorageEngineLockFile(storageGlobalParams.dbpath)); } catch (const std::exception& ex) { uassert(28596, str::stream() << "Unable to determine status of lock file in the data directory " << storageGlobalParams.dbpath << ": " << ex.what(), false); } if (_lockFile->createdByUncleanShutdown()) { warning() << "Detected unclean shutdown - " << _lockFile->getFilespec() << " is not empty."; } uassertStatusOK(_lockFile->open()); ScopeGuard guard = MakeGuard(&StorageEngineLockFile::close, _lockFile.get()); _storageEngine = factory->create(storageGlobalParams, *_lockFile); _storageEngine->finishInit(); uassertStatusOK(_lockFile->writePid()); // Write a new metadata file if it is not present. if (!metadata.get()) { metadata.reset(new StorageEngineMetadata(storageGlobalParams.dbpath)); metadata->setStorageEngine(canonicalName); metadata->setStorageEngineOptions(factory->createMetadataOptions(storageGlobalParams)); uassertStatusOK(metadata->write()); } guard.Dismiss(); _supportsDocLocking = _storageEngine->supportsDocLocking(); }
/**
 * Starts a two-step insert of '_key' into the index '_idx' and returns the continuation object
 * that describes the pending insert. The caller receives ownership of the returned object.
 *
 * If twoStepInsert() does not return normally, the continuation is deleted by the guard.
 */
IndexInsertionContinuation *beginInsertIntoIndex(
    int idxNo, IndexDetails &_idx,
    DiskLoc _recordLoc, const BSONObj &_key,
    const Ordering& _order, bool dupsAllowed) {

    IndexInsertionContinuationImpl<V> *pendingInsert =
        new IndexInsertionContinuationImpl<V>(_idx.head, _recordLoc, _key, _order, _idx);

    // Release the allocation again unless the first step of the insert succeeds.
    ScopeGuard deleteOnFailure =
        MakeGuard(boost::checked_delete<IndexInsertionContinuation>, pendingInsert);

    _idx.head.btree<V>()->twoStepInsert(_idx.head, *pendingInsert, dupsAllowed);

    deleteOnFailure.Dismiss();
    return pendingInsert;
}
/**
 * Begins a WiredTiger transaction on 'session' and sets its read timestamp to the local
 * snapshot. When 'ignorePrepare' is set, the transaction is opened with "ignore_prepare=true".
 *
 * The transaction is rolled back by the guard if this function is left before the read
 * timestamp has been set.
 */
void WiredTigerSnapshotManager::beginTransactionOnLocalSnapshot(WT_SESSION* session,
                                                                bool ignorePrepare) const {
    invariantWTOK(
        session->begin_transaction(session, (ignorePrepare) ? "ignore_prepare=true" : nullptr));
    auto rollbackOnExit =
        MakeGuard([&] { invariant(session->rollback_transaction(session, nullptr) == 0); });

    stdx::lock_guard<stdx::mutex> lock(_localSnapshotMutex);
    invariant(_localSnapshot);

    LOG(3) << "begin_transaction on local snapshot " << _localSnapshot.get().toString();
    auto readTimestampStatus = setTransactionReadTimestamp(_localSnapshot.get(), session);
    fassert(50775, readTimestampStatus);

    rollbackOnExit.Dismiss();
}
/**
 * Moves the migration from kCloning to kCloneCaughtUp by waiting, for at most
 * kMaxWaitToEnterCriticalSectionTimeout, until the clone driver reports that the critical
 * section may be entered. Must be called without any locks held.
 *
 * cleanupOnError() runs on every exit path except the successful one.
 */
Status MigrationSourceManager::awaitToCatchUp(OperationContext* txn) {
    invariant(!txn->lockState()->isLocked());
    invariant(_state == kCloning);
    auto cleanupGuard = MakeGuard([&] { cleanupOnError(txn); });

    // Wait for the cloner to decide that the critical section can be entered.
    Status waitStatus = _cloneDriver->awaitUntilCriticalSectionIsAppropriate(
        txn, kMaxWaitToEnterCriticalSectionTimeout);
    if (!waitStatus.isOK()) {
        return waitStatus;
    }

    _state = kCloneCaughtUp;
    cleanupGuard.Dismiss();
    return Status::OK();
}
/**
 * Begins a WiredTiger transaction on 'session', sets its read timestamp to the committed
 * snapshot and returns that snapshot's timestamp.
 *
 * Throws ReadConcernMajorityNotAvailableYet (via uassert) when no committed snapshot is
 * available. The transaction is rolled back by the guard if this function is left before the
 * read timestamp has been set.
 */
Timestamp WiredTigerSnapshotManager::beginTransactionOnCommittedSnapshot(
    WT_SESSION* session) const {
    invariantWTOK(session->begin_transaction(session, nullptr));
    auto rollbackOnExit =
        MakeGuard([&] { invariant(session->rollback_transaction(session, nullptr) == 0); });

    stdx::lock_guard<stdx::mutex> lock(_committedSnapshotMutex);
    uassert(ErrorCodes::ReadConcernMajorityNotAvailableYet,
            "Committed view disappeared while running operation",
            _committedSnapshot);

    auto readTimestampStatus = setTransactionReadTimestamp(_committedSnapshot.get(), session);
    fassert(30635, readTimestampStatus);

    rollbackOnExit.Dismiss();
    return *_committedSnapshot;
}
void WiredTigerOplogManager::waitForAllEarlierOplogWritesToBeVisible( const WiredTigerRecordStore* oplogRecordStore, OperationContext* opCtx) { invariant(opCtx->lockState()->isNoop() || !opCtx->lockState()->inAWriteUnitOfWork()); // In order to reliably detect rollback situations, we need to fetch the latestVisibleTimestamp // prior to querying the end of the oplog. auto currentLatestVisibleTimestamp = getOplogReadTimestamp(); // Procedure: issue a read on a reverse cursor (which is not subject to the oplog visibility // rules), see what is last, and wait for that to become visible. std::unique_ptr<SeekableRecordCursor> cursor = oplogRecordStore->getCursor(opCtx, false /* false = reverse cursor */); auto lastRecord = cursor->next(); if (!lastRecord) { LOG(2) << "Trying to query an empty oplog"; opCtx->recoveryUnit()->abandonSnapshot(); return; } const auto waitingFor = lastRecord->id; // Close transaction before we wait. opCtx->recoveryUnit()->abandonSnapshot(); stdx::unique_lock<stdx::mutex> lk(_oplogVisibilityStateMutex); // Prevent any scheduled journal flushes from being delayed and blocking this wait excessively. _opsWaitingForVisibility++; invariant(_opsWaitingForVisibility > 0); auto exitGuard = MakeGuard([&] { _opsWaitingForVisibility--; }); opCtx->waitForConditionOrInterrupt(_opsBecameVisibleCV, lk, [&] { auto newLatestVisibleTimestamp = getOplogReadTimestamp(); if (newLatestVisibleTimestamp < currentLatestVisibleTimestamp) { LOG(1) << "oplog latest visible timestamp went backwards"; // If the visibility went backwards, this means a rollback occurred. // Thus, we are finished waiting. return true; } currentLatestVisibleTimestamp = newLatestVisibleTimestamp; RecordId latestVisible = RecordId(currentLatestVisibleTimestamp); if (latestVisible < waitingFor) { LOG(2) << "Operation is waiting for " << waitingFor << "; latestVisible is " << currentLatestVisibleTimestamp; } return latestVisible >= waitingFor; }); }
/**
 * Runs 'task' inside an AlternativeClientRegion for '_client' and returns its status.
 * _releaseResources() is invoked unless the task returns an OK status. The task is executed in
 * a noexcept context, and any exception reaching the handler below ends the process through
 * std::terminate().
 */
Status CollectionBulkLoaderImpl::_runTaskReleaseResourcesOnFailure(F task) noexcept {
    AlternativeClientRegion acr(_client);
    ScopeGuard releaseOnFailure = MakeGuard(&CollectionBulkLoaderImpl::_releaseResources, this);

    try {
        const auto taskStatus = [&task]() noexcept { return task(); }();
        if (!taskStatus.isOK()) {
            return taskStatus;
        }
        releaseOnFailure.Dismiss();
        return taskStatus;
    } catch (...) {
        std::terminate();
    }
}
/**
 * Begins a WiredTiger transaction on 'session' and sets its read timestamp to the oplog read
 * timestamp reported by 'oplogManager', passing 'true' for the roundToOldest argument.
 *
 * The transaction is rolled back by the guard if this function is left before the read
 * timestamp has been set.
 */
void WiredTigerSnapshotManager::beginTransactionOnOplog(WiredTigerOplogManager* oplogManager,
                                                        WT_SESSION* session) const {
    invariantWTOK(session->begin_transaction(session, nullptr));
    auto rollbackOnExit =
        MakeGuard([&] { invariant(session->rollback_transaction(session, nullptr) == 0); });

    auto oplogReadTimestamp = oplogManager->getOplogReadTimestamp();

    // The value has to survive the round trip through Timestamp unchanged.
    invariant(Timestamp(static_cast<unsigned long long>(oplogReadTimestamp)).asULL() ==
              oplogReadTimestamp);

    auto readTimestampStatus =
        setTransactionReadTimestamp(Timestamp(static_cast<unsigned long long>(oplogReadTimestamp)),
                                    session,
                                    true /* roundToOldest */);
    fassert(50771, readTimestampStatus);

    rollbackOnExit.Dismiss();
}
/**
 * Schedules every request produced by the algorithm on 'executor' and returns the event that is
 * signalled once sufficient responses have been received. May only be called once.
 *
 * Returns the error from makeEvent() if no event could be created, and ShutdownInProgress if
 * the executor refuses a request for that reason. Any other scheduling failure is fatal. When
 * the function is left early after the event was created, the event is signalled by the guard.
 */
StatusWith<ReplicationExecutor::EventHandle> ScatterGatherRunner::start(
    ReplicationExecutor* executor,
    const stdx::function<void ()>& onCompletion) {

    invariant(!_started);
    _started = true;
    _actualResponses = 0;
    _onCompletion = onCompletion;

    StatusWith<ReplicationExecutor::EventHandle> sufficientResponsesEvent =
        executor->makeEvent();
    if (!sufficientResponsesEvent.isOK()) {
        return sufficientResponsesEvent;
    }
    _sufficientResponsesReceived = sufficientResponsesEvent.getValue();

    // Make sure nobody waits forever on the event if we bail out below.
    ScopeGuard signalOnEarlyReturn = MakeGuard(
        &ScatterGatherRunner::_signalSufficientResponsesReceived, this, executor);

    const ReplicationExecutor::RemoteCommandCallbackFn responseHandler = stdx::bind(
        &ScatterGatherRunner::_processResponse, stdx::placeholders::_1, this);

    std::vector<RemoteCommandRequest> pendingRequests = _algorithm->getRequests();
    for (size_t idx = 0; idx < pendingRequests.size(); ++idx) {
        const StatusWith<ReplicationExecutor::CallbackHandle> scheduled =
            executor->scheduleRemoteCommand(pendingRequests[idx], responseHandler);
        if (scheduled.getStatus() == ErrorCodes::ShutdownInProgress) {
            return StatusWith<ReplicationExecutor::EventHandle>(scheduled.getStatus());
        }
        fassert(18743, scheduled.getStatus());
        _callbacks.push_back(scheduled.getValue());
    }

    // With nothing scheduled, or with the algorithm already satisfied, signal right away.
    if (_callbacks.empty() || _algorithm->hasReceivedSufficientResponses()) {
        invariant(_algorithm->hasReceivedSufficientResponses());
        _signalSufficientResponsesReceived(executor);
    }

    signalOnEarlyReturn.Dismiss();
    return sufficientResponsesEvent;
}
/**
 * Obtains a ticket for 'mode' when tickets are required, and records the resulting client
 * state. Waits without a time limit when 'deadline' is Date_t::max().
 *
 * Returns LOCK_TIMEOUT if the ticket could not be obtained before 'deadline', otherwise LOCK_OK.
 * If the wait ends without a ticket (timeout or interruption), the client state is reset to
 * kInactive by the guard.
 */
LockResult LockerImpl<IsForMMAPV1>::_acquireTicket(OperationContext* opCtx,
                                                   LockMode mode,
                                                   Date_t deadline) {
    const bool isReader = isSharedLockMode(mode);
    auto ticketHolder = shouldAcquireTicket() ? ticketHolders[mode] : nullptr;

    if (ticketHolder) {
        _clientState.store(isReader ? kQueuedReader : kQueuedWriter);

        // Put the client state back if we leave without having obtained a ticket.
        auto resetStateGuard = MakeGuard([&] { _clientState.store(kInactive); });

        if (deadline != Date_t::max()) {
            if (!ticketHolder->waitForTicketUntil(opCtx, deadline)) {
                return LOCK_TIMEOUT;
            }
        } else {
            ticketHolder->waitForTicket(opCtx);
        }

        resetStateGuard.Dismiss();
    }

    _clientState.store(isReader ? kActiveReader : kActiveWriter);
    return LOCK_OK;
}
void FFmpegDecoder::videoParseRunnable() { CHANNEL_LOG(ffmpeg_threads) << "Video thread started"; m_videoStartClock = VIDEO_START_CLOCK_NOT_INITIALIZED; double videoClock = 0; // pts of last decoded frame / predicted pts of next decoded frame VideoParseContext context{}; for (;;) { if (m_isPaused && !m_isVideoSeekingWhilePaused) { boost::unique_lock<boost::mutex> locker(m_isPausedMutex); while (m_isPaused && !m_isVideoSeekingWhilePaused) { m_isPausedCV.wait(locker); } } for (;;) { AVPacket packet; if (!m_videoPacketsQueue.pop(packet, [this] { return m_isPaused && !m_isVideoSeekingWhilePaused; })) { break; } auto packetGuard = MakeGuard(&packet, av_packet_unref); if (!handleVideoPacket(packet, videoClock, context) || m_isPaused && !m_isVideoSeekingWhilePaused) { break; } } } }
/**
 * Moves the migration from kCloneCaughtUp to kCriticalSection. Must be called without any locks
 * held.
 *
 * Marks the start of a metadata operation through ShardingStateRecovery, verifies under the
 * collection lock that the collection's sharding metadata still matches the version this
 * migration started with, and then creates the critical section signal.
 *
 * Returns the error from ShardingStateRecovery::startMetadataOp() if it fails, and
 * IncompatibleShardingMetadata if the collection has no metadata any more or its collection
 * version changed. cleanupOnError() runs on every exit path except the successful one.
 */
Status MigrationSourceManager::enterCriticalSection(OperationContext* txn) {
    invariant(!txn->lockState()->isLocked());
    invariant(_state == kCloneCaughtUp);
    auto scopedGuard = MakeGuard([&] { cleanupOnError(txn); });

    // Mark the shard as running critical operation, which requires recovery on crash
    Status status = ShardingStateRecovery::startMetadataOp(txn);
    if (!status.isOK()) {
        return status;
    }

    {
        ScopedTransaction scopedXact(txn, MODE_IX);
        AutoGetCollection autoColl(txn, _args.getNss(), MODE_IX, MODE_X);

        auto css = CollectionShardingState::get(txn, _args.getNss().ns());
        auto metadata = css->getMetadata();
        if (!metadata ||
            !metadata->getCollVersion().equals(_committedMetadata->getCollVersion())) {
            // This branch is also taken when there is no metadata at all, so the message must
            // not dereference 'metadata' unconditionally.
            return {ErrorCodes::IncompatibleShardingMetadata,
                    str::stream()
                        << "Sharding metadata changed while holding distributed lock. Expected: "
                        << _committedMetadata->getCollVersion().toString()
                        << ", actual: "
                        << (metadata ? metadata->getCollVersion().toString()
                                     : std::string("unsharded collection"))};
        }

        // IMPORTANT: After this line, the critical section is in place and needs to be rolled back
        // if anything fails, which would prevent commit to the config servers.
        _critSecSignal = std::make_shared<Notification<void>>();
    }

    log() << "Successfully entered critical section.";

    _state = kCriticalSection;
    scopedGuard.Dismiss();
    return Status::OK();
}
/**
 * Obtains a ticket for 'mode' when tickets are required, and records the resulting client
 * state.
 *
 * When a maximum lock timeout is configured and uninterruptible locks have not been requested,
 * 'deadline' is capped at now + that timeout. The wait is interruptible through 'opCtx' unless
 * uninterruptible locks were requested, and has no time limit when the effective deadline is
 * Date_t::max().
 *
 * Returns LOCK_TIMEOUT if the ticket could not be obtained before the deadline, otherwise
 * LOCK_OK. If the wait ends without a ticket, the client state is reset to kInactive.
 */
LockResult LockerImpl::_acquireTicket(OperationContext* opCtx, LockMode mode, Date_t deadline) {
    const bool isReader = isSharedLockMode(mode);
    auto ticketHolder = shouldAcquireTicket() ? ticketHolders[mode] : nullptr;

    if (ticketHolder) {
        _clientState.store(isReader ? kQueuedReader : kQueuedWriter);

        if (_maxLockTimeout && !_uninterruptibleLocksRequested) {
            deadline = std::min(deadline, Date_t::now() + _maxLockTimeout.get());
        }

        // Put the client state back if we leave without having obtained a ticket.
        auto resetStateGuard = MakeGuard([&] { _clientState.store(kInactive); });

        OperationContext* waitCtx = _uninterruptibleLocksRequested ? nullptr : opCtx;
        if (deadline != Date_t::max()) {
            if (!ticketHolder->waitForTicketUntil(waitCtx, deadline)) {
                return LOCK_TIMEOUT;
            }
        } else {
            ticketHolder->waitForTicket(waitCtx);
        }

        resetStateGuard.Dismiss();
    }

    _clientState.store(isReader ? kActiveReader : kActiveWriter);
    return LOCK_OK;
}
/**
 * Interrupt callback installed on the JS runtime.
 *
 * The callback is uninstalled while it runs and re-installed by the guard on exit. It performs
 * a full GC when one is pending (otherwise it lets the engine decide via JS_MaybeGC). If a kill
 * is pending for the scope, it stops the deadline, unregisters the operation and stores a
 * JSInterpreterFailure status.
 *
 * Returns false when a kill is pending, true otherwise.
 */
bool MozJSImplScope::_interruptCallback(JSContext* cx) {
    auto scope = getScope(cx);

    JS_SetInterruptCallback(scope->_runtime, nullptr);
    auto reinstallGuard =
        MakeGuard([&]() { JS_SetInterruptCallback(scope->_runtime, _interruptCallback); });

    if (!scope->_pendingGC.load()) {
        JS_MaybeGC(cx);
    } else {
        scope->_pendingGC.store(false);
        JS_GC(scope->_runtime);
    }

    if (!scope->isKillPending()) {
        return true;
    }

    scope->_engine->getDeadlineMonitor().stopDeadline(scope);
    scope->unregisterOperation();
    scope->_status = Status(ErrorCodes::JSInterpreterFailure, "Interrupted by the host");
    return false;
}
/**
 * Callback run when a response (or an error) for a batch request to the remote at 'remoteIndex'
 * arrives. Runs entirely under '_mutex'.
 *
 * If the merger is no longer alive, only the cursor id is extracted (best effort) and, once no
 * batch requests are outstanding, the kill is completed. Otherwise the response is parsed: on
 * failure the remote's status is updated (with a possible retry of initial cursor
 * establishment, and with errors cleared when partial results are allowed); on success the
 * batch is buffered and, if nothing is buffered afterwards for a non-tailable cursor, the next
 * batch is requested.
 *
 * @param cbData       Callback arguments carrying the remote command response.
 * @param remoteIndex  Index into '_remotes' of the remote this response belongs to.
 */
void AsyncResultsMerger::handleBatchResponse(
    const executor::TaskExecutor::RemoteCommandCallbackArgs& cbData, size_t remoteIndex) {
    stdx::lock_guard<stdx::mutex> lk(_mutex);

    auto& remote = _remotes[remoteIndex];

    // Clear the callback handle. This indicates that we are no longer waiting on a response from
    // 'remote'.
    remote.cbHandle = executor::TaskExecutor::CallbackHandle();

    // If we're in the process of shutting down then there's no need to process the batch.
    if (_lifecycleState != kAlive) {
        invariant(_lifecycleState == kKillStarted);

        // Make sure to wake up anyone waiting on '_currentEvent' if we're shutting down.
        signalCurrentEventIfReady_inlock();

        // Make a best effort to parse the response and retrieve the cursor id. We need the cursor
        // id in order to issue a killCursors command against it.
        if (cbData.response.isOK()) {
            auto cursorResponse = parseCursorResponse(cbData.response.getValue().data, remote);
            if (cursorResponse.isOK()) {
                remote.cursorId = cursorResponse.getValue().getCursorId();
            }
        }

        // If we're killed and we're not waiting on any more batches to come back, then we are ready
        // to kill the cursors on the remote hosts and clean up this cursor. Schedule the
        // killCursors command and signal that this cursor is now safe to destroy. We have to
        // promise not to touch any members of this class because 'this' could become invalid as
        // soon as we signal the event.
        if (!haveOutstandingBatchRequests_inlock()) {
            // If the event handle is invalid, then the executor is in the middle of shutting down,
            // and we can't schedule any more work for it to complete.
            if (_killCursorsScheduledEvent.isValid()) {
                scheduleKillCursors_inlock();
                _executor->signalEvent(_killCursorsScheduledEvent);
            }

            _lifecycleState = kKillComplete;
        }

        return;
    }

    // Every early return from this point on signals anyone waiting on an event, if ready() is
    // true.
    ScopeGuard signaller = MakeGuard(&AsyncResultsMerger::signalCurrentEventIfReady_inlock, this);

    // Either the parsed cursor response, or the error the command itself failed with.
    StatusWith<CursorResponse> cursorResponseStatus(
        cbData.response.isOK() ? parseCursorResponse(cbData.response.getValue().data, remote)
                               : cbData.response.getStatus());

    if (!cursorResponseStatus.isOK()) {
        auto shard = remote.getShard();
        if (!shard) {
            remote.status = Status(cursorResponseStatus.getStatus().code(),
                                   str::stream() << "Could not find shard " << *remote.shardId
                                                 << " containing host "
                                                 << remote.getTargetHost().toString());
        } else {
            shard->updateReplSetMonitor(remote.getTargetHost(), cursorResponseStatus.getStatus());

            // Retry initial cursor establishment if possible.  Never retry getMores to avoid
            // accidentally skipping results.
            if (!remote.cursorId && remote.retryCount < kMaxNumFailedHostRetryAttempts &&
                shard->isRetriableError(cursorResponseStatus.getStatus().code(),
                                        Shard::RetryPolicy::kIdempotent)) {
                invariant(remote.shardId);
                LOG(1) << "Initial cursor establishment failed with retriable error and will be "
                          "retried"
                       << causedBy(redact(cursorResponseStatus.getStatus()));

                ++remote.retryCount;

                // Since we potentially updated the targeter that the last host it chose might be
                // faulty, the call below may end up getting a different host.
                remote.status = askForNextBatch_inlock(remoteIndex);
                if (remote.status.isOK()) {
                    return;
                }

                // If we end up here, it means we failed to schedule the retry request, which is a
                // more severe error that should not be retried. Just pass through to the error
                // handling logic below.
            } else {
                remote.status = cursorResponseStatus.getStatus();
            }
        }

        // Unreachable host errors are swallowed if the 'allowPartialResults' option is set. We
        // remove the unreachable host entirely from consideration by marking it as exhausted.
        if (_params.isAllowPartialResults) {
            remote.status = Status::OK();

            // Clear the results buffer and cursor id.
            std::queue<BSONObj> emptyBuffer;
            std::swap(remote.docBuffer, emptyBuffer);
            remote.cursorId = 0;
        }

        return;
    }

    // Cursor id successfully established.
    auto cursorResponse = std::move(cursorResponseStatus.getValue());
    remote.cursorId = cursorResponse.getCursorId();
    remote.initialCmdObj = boost::none;

    for (const auto& obj : cursorResponse.getBatch()) {
        // If there's a sort, we're expecting the remote node to give us back a sort key.
        if (!_params.sort.isEmpty() &&
            obj[ClusterClientCursorParams::kSortKeyField].type() != BSONType::Object) {
            remote.status = Status(ErrorCodes::InternalError,
                                   str::stream() << "Missing field '"
                                                 << ClusterClientCursorParams::kSortKeyField
                                                 << "' in document: "
                                                 << obj);
            return;
        }

        remote.docBuffer.push(obj);
        ++remote.fetchedCount;
    }

    // If we're doing a sorted merge, then we have to make sure to put this remote onto the
    // merge queue.
    if (!_params.sort.isEmpty() && !cursorResponse.getBatch().empty()) {
        _mergeQueue.push(remoteIndex);
    }

    // If the cursor is tailable and we just received an empty batch, the next return value should
    // be boost::none in order to indicate the end of the batch.
    if (_params.isTailable && !remote.hasNext()) {
        _eofNext = true;
    }

    // If even after receiving this batch we still don't have anything buffered (i.e. the batchSize
    // was zero), then can schedule work to retrieve the next batch right away.
    //
    // We do not ask for the next batch if the cursor is tailable, as batches received from remote
    // tailable cursors should be passed through to the client without asking for more batches.
    if (!_params.isTailable && !remote.hasNext() && !remote.exhausted()) {
        remote.status = askForNextBatch_inlock(remoteIndex);
        if (!remote.status.isOK()) {
            return;
        }
    }

    // ScopeGuard requires dismiss on success, but we want waiter to be signalled on success as
    // well as failure.
    signaller.Dismiss();
    signalCurrentEventIfReady_inlock();
}
/**
 * Polls the recipient shard's transfer status until it reports the "steady" state, an error is
 * detected, or 'maxTimeToWait' has elapsed. Must be called without any locks held.
 *
 * @param txn            Operation context, also used to check for interruption.
 * @param maxTimeToWait  Upper bound on the total time spent polling.
 * @return Status::OK() once the recipient is "steady" and no cloned locations remain;
 *         otherwise an error describing why the critical section cannot be entered
 *         (recipient unreachable, transfer failed, migration aborted, memory limit exceeded,
 *         interruption, or ExceededTimeLimit on timeout).
 *
 * cancelClone() runs on every error return inside the loop.
 */
Status MigrationChunkClonerSourceLegacy::awaitUntilCriticalSectionIsAppropriate(
    OperationContext* txn, Milliseconds maxTimeToWait) {
    invariant(!txn->lockState()->isLocked());
    auto scopedGuard = MakeGuard([&] { cancelClone(txn); });

    const auto startTime = Date_t::now();

    int iteration = 0;
    while ((Date_t::now() - startTime) < maxTimeToWait) {
        // Exponential sleep backoff, up to 1024ms. Don't sleep much on the first few iterations,
        // since we want empty chunk migrations to be fast.
        sleepmillis(1 << std::min(iteration, 10));
        iteration++;

        auto responseStatus = _callRecipient(BSON(kRecvChunkStatus << _args.getNss().ns()));
        if (!responseStatus.isOK()) {
            return {responseStatus.getStatus().code(),
                    str::stream()
                        << "Failed to contact recipient shard to monitor data transfer due to "
                        << responseStatus.getStatus().toString()};
        }

        BSONObj res = std::move(responseStatus.getValue());

        log() << "moveChunk data transfer progress: " << res << " my mem used: " << _memoryUsed;

        if (res["state"].String() == "steady") {
            // Ensure all cloned docs have actually been transferred
            const std::size_t locsRemaining = _cloneLocs.size();
            if (locsRemaining != 0) {
                return {
                    ErrorCodes::OperationIncomplete,
                    str::stream()
                        << "cannot enter critical section before all data is cloned, "
                        << locsRemaining
                        << " locs were not transferred but to-shard thinks they are all cloned"};
            }

            // Success: keep the clone alive for the caller.
            scopedGuard.Dismiss();
            return Status::OK();
        }

        if (res["state"].String() == "fail") {
            return {ErrorCodes::OperationFailed, "Data transfer error"};
        }

        // The status document must describe this very migration (namespace, donor and range).
        if (res["ns"].str() != _args.getNss().ns() || res["from"].str() != _donorCS.toString() ||
            !res["min"].isABSONObj() || res["min"].Obj().woCompare(_args.getMinKey()) != 0 ||
            !res["max"].isABSONObj() || res["max"].Obj().woCompare(_args.getMaxKey()) != 0) {
            // This can happen when the destination aborted the migration and received another
            // recvChunk before this thread sees the transition to the abort state. This is
            // currently possible only if multiple migrations are happening at once. This is an
            // unfortunate consequence of the shards not being able to keep track of multiple
            // incoming and outgoing migrations.
            return {ErrorCodes::OperationIncomplete,
                    "Destination shard aborted migration because a new one is running"};
        }

        // 500 MiB cap on the memory tracked in '_memoryUsed'.
        if (_memoryUsed > 500 * 1024 * 1024) {
            // This is too much memory for us to use so we're going to abort the migration
            return {ErrorCodes::ExceededMemoryLimit,
                    "Aborting migration because of high memory usage"};
        }

        Status interruptStatus = txn->checkForInterruptNoAssert();
        if (!interruptStatus.isOK()) {
            return interruptStatus;
        }
    }

    // NOTE(review): the guard is dismissed on the timeout path too, so cancelClone() is NOT
    // invoked here even though an error is returned -- confirm that this is intentional.
    scopedGuard.Dismiss();
    return {ErrorCodes::ExceededTimeLimit, "Timed out waiting for the cloner to catch up"};
}
hints.ai_protocol = 0; if (mode == HostnameCanonicalizationMode::kForward) { hints.ai_flags = AI_CANONNAME; } int err; shim_addrinfo* info; auto nativeHostName = shim_toNativeString(hostName.c_str()); if ((err = shim_getaddrinfo(nativeHostName.c_str(), nullptr, &hints, &info)) != 0) { ONCE { warning() << "Failed to obtain address information for hostname " << hostName << ": " << getAddrInfoStrError(err); } return results; } const auto guard = MakeGuard([&shim_freeaddrinfo, &info] { shim_freeaddrinfo(info); }); if (mode == HostnameCanonicalizationMode::kForward) { results.emplace_back(shim_fromNativeString(info->ai_canonname)); return results; } bool encounteredErrors = false; std::stringstream getNameInfoErrors; getNameInfoErrors << "Failed to obtain name info for: [ "; for (shim_addrinfo* p = info; p; p = p->ai_next) { shim_char host[NI_MAXHOST] = {}; if ((err = shim_getnameinfo( p->ai_addr, p->ai_addrlen, host, sizeof(host), nullptr, 0, NI_NAMEREQD)) == 0) { results.emplace_back(shim_fromNativeString(host)); } else {
/**
 * Reads one complete message from the socket into 'm'.
 *
 * Reads the fixed-size header first, handles two special cases (an HTTP GET sent to the native
 * port, and -- on the first packet -- an SSL handshake), validates the announced length and
 * then reads the remainder of the message into a freshly allocated buffer whose ownership is
 * handed to 'm'.
 *
 * @param m  Message that receives the data on success; reset when a SocketException occurs.
 * @return true if a message was read; false for an HTTP request, an invalid message length or
 *         a SocketException. Other exceptions (e.g. from uassert) propagate to the caller.
 */
bool MessagingPort::recv(Message& m) {
    try {
#ifdef MONGO_CONFIG_SSL
    // Re-entry point after a completed SSL handshake: read the header again over SSL.
    again:
#endif
        // mmm( log() << "*  recv() sock:" << this->sock << endl; )
        MSGHEADER::Value header;
        int headerLen = sizeof(MSGHEADER::Value);
        psock->recv((char*)&header, headerLen);
        int len = header.constView().getMessageLength();

        // 542393671 == 0x20544547, i.e. the bytes "GET " read as a little-endian 32-bit integer.
        if (len == 542393671) {
            // an http GET
            string msg =
                "It looks like you are trying to access MongoDB over HTTP on the native driver "
                "port.\n";
            LOG(psock->getLogLevel()) << msg;
            std::stringstream ss;
            ss << "HTTP/1.0 200 OK\r\nConnection: close\r\nContent-Type: "
                  "text/plain\r\nContent-Length: " << msg.size() << "\r\n\r\n" << msg;
            string s = ss.str();
            send(s.c_str(), s.size(), "http");
            return false;
        }
        // If responseTo is not 0 or -1 for first packet assume SSL
        else if (psock->isAwaitingHandshake()) {
#ifndef MONGO_CONFIG_SSL
            if (header.constView().getResponseTo() != 0 &&
                header.constView().getResponseTo() != -1) {
                uasserted(17133,
                          "SSL handshake requested, SSL feature not available in this build");
            }
#else
            if (header.constView().getResponseTo() != 0 &&
                header.constView().getResponseTo() != -1) {
                uassert(17132,
                        "SSL handshake received but server is started without SSL support",
                        sslGlobalParams.sslMode.load() != SSLParams::SSLMode_disabled);
                // The header bytes already read are passed on to the handshake.
                setX509SubjectName(
                    psock->doSSLHandshake(reinterpret_cast<const char*>(&header), sizeof(header)));
                psock->setHandshakeReceived();
                goto again;
            }
            uassert(17189,
                    "The server is configured to only allow SSL connections",
                    sslGlobalParams.sslMode.load() != SSLParams::SSLMode_requireSSL);
#endif  // MONGO_CONFIG_SSL
        }
        if (static_cast<size_t>(len) < sizeof(MSGHEADER::Value) ||
            static_cast<size_t>(len) > MaxMessageSizeBytes) {
            LOG(0) << "recv(): message len " << len << " is invalid. "
                   << "Min " << sizeof(MSGHEADER::Value) << " Max: " << MaxMessageSizeBytes;
            return false;
        }

        psock->setHandshakeReceived();

        // Round the allocation size up to the next multiple of 1024 bytes.
        int z = (len + 1023) & 0xfffffc00;
        verify(z >= len);
        MsgData::View md = reinterpret_cast<char*>(mongolMalloc(z));
        // Free the buffer if anything below fails before ownership passes to 'm'.
        ScopeGuard guard = MakeGuard(free, md.view2ptr());
        verify(md.view2ptr());

        memcpy(md.view2ptr(), &header, headerLen);
        int left = len - headerLen;

        psock->recv(md.data(), left);

        guard.Dismiss();
        m.setData(md.view2ptr(), true);
        return true;

    } catch (const SocketException& e) {
        logger::LogSeverity severity = psock->getLogLevel();
        if (!e.shouldPrint())
            severity = severity.lessSevere();
        LOG(severity) << "SocketException: remote: " << remote() << " error: " << e;
        m.reset();
        return false;
    }
}
/**
 * Drops the database 'dbName'.
 *
 * Works in up to three phases:
 *   1. Under the global write lock, marks the database drop-pending and drops its collections
 *      (skipping collections that are already drop-pending, for which the oplog is disabled, or
 *      that are system.indexes). If there is nothing to wait for, the database is dropped right
 *      away.
 *   2. With locks temporarily released, waits for the collection drops (or for the most recent
 *      drop-pending collection) to replicate.
 *   3. Re-acquires the global write lock and finishes dropping the database.
 *
 * @param opCtx   Operation context.
 * @param dbName  Name of the database to drop.
 * @return OK on success; NamespaceNotFound if the database does not exist (initially or after
 *         the wait); NotMaster if replicated writes cannot be accepted for the database; or
 *         the error from awaiting replication, with added context.
 * Throws (uassert) IllegalOperation when the server is in read-only mode.
 */
Status dropDatabase(OperationContext* opCtx, const std::string& dbName) {
    uassert(ErrorCodes::IllegalOperation,
            "Cannot drop a database in read-only mode",
            !storageGlobalParams.readOnly);
    // TODO (Kal): OldClientContext legacy, needs to be removed
    {
        CurOp::get(opCtx)->ensureStarted();
        stdx::lock_guard<Client> lk(*opCtx->getClient());
        CurOp::get(opCtx)->setNS_inlock(dbName);
    }

    auto replCoord = repl::ReplicationCoordinator::get(opCtx);
    std::size_t numCollectionsToDrop = 0;

    // We have to wait for the last drop-pending collection to be removed if there are no
    // collections to drop.
    repl::OpTime latestDropPendingOpTime;

    using Result = boost::optional<Status>;
    // Get an optional result--if it's there, early return; otherwise, wait for collections to drop.
    auto result = writeConflictRetry(opCtx, "dropDatabase_collection", dbName, [&] {
        Lock::GlobalWrite lk(opCtx);
        AutoGetDb autoDB(opCtx, dbName, MODE_X);
        Database* const db = autoDB.getDb();
        if (!db) {
            return Result(Status(ErrorCodes::NamespaceNotFound,
                                 str::stream() << "Could not drop database " << dbName
                                               << " because it does not exist"));
        }

        bool userInitiatedWritesAndNotPrimary =
            opCtx->writesAreReplicated() && !replCoord->canAcceptWritesForDatabase(opCtx, dbName);

        if (userInitiatedWritesAndNotPrimary) {
            return Result(
                Status(ErrorCodes::NotMaster,
                       str::stream() << "Not primary while dropping database " << dbName));
        }

        log() << "dropDatabase " << dbName << " - starting";
        db->setDropPending(opCtx, true);

        // If Database::dropCollectionEvenIfSystem() fails, we should reset the drop-pending state
        // on Database.
        auto dropPendingGuard = MakeGuard([&db, opCtx] { db->setDropPending(opCtx, false); });

        for (auto collection : *db) {
            const auto& nss = collection->ns();
            // Collections that are already drop-pending are not dropped again; only the latest
            // of their drop optimes is remembered so that it can be waited for below.
            if (nss.isDropPendingNamespace() && replCoord->isReplEnabled() &&
                opCtx->writesAreReplicated()) {
                log() << "dropDatabase " << dbName << " - found drop-pending collection: " << nss;
                latestDropPendingOpTime = std::max(
                    latestDropPendingOpTime, uassertStatusOK(nss.getDropPendingNamespaceOpTime()));
                continue;
            }
            if (replCoord->isOplogDisabledFor(opCtx, nss) || nss.isSystemDotIndexes()) {
                continue;
            }
            log() << "dropDatabase " << dbName << " - dropping collection: " << nss;
            WriteUnitOfWork wunit(opCtx);
            fassertStatusOK(40476, db->dropCollectionEvenIfSystem(opCtx, nss));
            wunit.commit();
            numCollectionsToDrop++;
        }
        dropPendingGuard.Dismiss();

        // If there are no collection drops to wait for, we complete the drop database operation.
        if (numCollectionsToDrop == 0U && latestDropPendingOpTime.isNull()) {
            return Result(_finishDropDatabase(opCtx, dbName, db));
        }

        // Nothing final to report yet: the caller has to wait for replication first.
        return Result(boost::none);
    });

    if (result) {
        return *result;
    }

    // If waitForWriteConcern() returns an error or throws an exception, we should reset the
    // drop-pending state on Database.
    auto dropPendingGuardWhileAwaitingReplication = MakeGuard([dbName, opCtx] {
        Lock::GlobalWrite lk(opCtx);
        AutoGetDb autoDB(opCtx, dbName, MODE_X);
        if (auto db = autoDB.getDb()) {
            db->setDropPending(opCtx, false);
        }
    });

    {
        // Holding of any locks is disallowed while awaiting replication because this can
        // potentially block for long time while doing network activity.
        //
        // Even though dropDatabase() does not explicitly acquire any locks before awaiting
        // replication, it is possible that the caller of this function may already have acquired
        // a lock. The applyOps command is an example of a dropDatabase() caller that does this.
        // Therefore, we have to release any locks using a TempRelease RAII object.
        //
        // TODO: Remove the use of this TempRelease object when SERVER-29802 is completed.
        // The work in SERVER-29802 will adjust the locking rules around applyOps operations and
        // dropDatabase is expected to be one of the operations where we expect to no longer acquire
        // the global lock.
        Lock::TempRelease release(opCtx->lockState());

        if (numCollectionsToDrop > 0U) {
            // Wait for the collection drops performed above.
            auto status =
                replCoord->awaitReplicationOfLastOpForClient(opCtx, kDropDatabaseWriteConcern)
                    .status;
            if (!status.isOK()) {
                return Status(status.code(),
                              str::stream() << "dropDatabase " << dbName << " failed waiting for "
                                            << numCollectionsToDrop
                                            << " collection drops to replicate: "
                                            << status.reason());
            }

            log() << "dropDatabase " << dbName << " - successfully dropped " << numCollectionsToDrop
                  << " collections. dropping database";
        } else {
            // Nothing was dropped here, so wait for the most recent drop-pending collection.
            invariant(!latestDropPendingOpTime.isNull());
            auto status =
                replCoord
                    ->awaitReplication(opCtx, latestDropPendingOpTime, kDropDatabaseWriteConcern)
                    .status;
            if (!status.isOK()) {
                return Status(
                    status.code(),
                    str::stream()
                        << "dropDatabase "
                        << dbName
                        << " failed waiting for pending collection drops (most recent drop optime: "
                        << latestDropPendingOpTime.toString()
                        << ") to replicate: "
                        << status.reason());
            }

            log() << "dropDatabase " << dbName
                  << " - pending collection drops completed. dropping database";
        }
    }

    dropPendingGuardWhileAwaitingReplication.Dismiss();

    return writeConflictRetry(opCtx, "dropDatabase_database", dbName, [&] {
        Lock::GlobalWrite lk(opCtx);
        AutoGetDb autoDB(opCtx, dbName, MODE_X);
        if (auto db = autoDB.getDb()) {
            return _finishDropDatabase(opCtx, dbName, db);
        }

        return Status(ErrorCodes::NamespaceNotFound,
                      str::stream() << "Could not drop database " << dbName
                                    << " because it does not exist after dropping "
                                    << numCollectionsToDrop
                                    << " collection(s).");
    });
}
void ServiceContextMongoD::initializeGlobalStorageEngine() { // This should be set once. invariant(!_storageEngine); // We should have a _lockFile or be in read-only mode. Confusingly, we can still have a lockFile // if we are in read-only mode. This can happen if the server is started in read-only mode on a // writable dbpath. invariant(_lockFile || storageGlobalParams.readOnly); const std::string dbpath = storageGlobalParams.dbpath; if (auto existingStorageEngine = StorageEngineMetadata::getStorageEngineForPath(dbpath)) { if (storageGlobalParams.engineSetByUser) { // Verify that the name of the user-supplied storage engine matches the contents of // the metadata file. const StorageEngine::Factory* factory = mapFindWithDefault(_storageFactories, storageGlobalParams.engine, static_cast<const StorageEngine::Factory*>(nullptr)); if (factory) { uassert(28662, str::stream() << "Cannot start server. Detected data files in " << dbpath << " created by" << " the '" << *existingStorageEngine << "' storage engine, but the" << " specified storage engine was '" << factory->getCanonicalName() << "'.", factory->getCanonicalName() == *existingStorageEngine); } } else { // Otherwise set the active storage engine as the contents of the metadata file. log() << "Detected data files in " << dbpath << " created by the '" << *existingStorageEngine << "' storage engine, so setting the active" << " storage engine to '" << *existingStorageEngine << "'."; storageGlobalParams.engine = *existingStorageEngine; } } else if (!storageGlobalParams.engineSetByUser) { // Ensure the default storage engine is available with this build of mongod. uassert(28663, str::stream() << "Cannot start server. The default storage engine '" << storageGlobalParams.engine << "' is not available with this build of mongod. Please specify a different" << " storage engine explicitly, e.g. 
--storageEngine=mmapv1.", isRegisteredStorageEngine(storageGlobalParams.engine)); } const StorageEngine::Factory* factory = _storageFactories[storageGlobalParams.engine]; uassert(18656, str::stream() << "Cannot start server with an unknown storage engine: " << storageGlobalParams.engine, factory); if (storageGlobalParams.readOnly) { uassert(34368, str::stream() << "Server was started in read-only mode, but the configured storage engine, " << storageGlobalParams.engine << ", does not support read-only operation", factory->supportsReadOnly()); } std::unique_ptr<StorageEngineMetadata> metadata = StorageEngineMetadata::forPath(dbpath); if (storageGlobalParams.readOnly) { uassert(34415, "Server was started in read-only mode, but the storage metadata file was not" " found.", metadata.get()); } // Validate options in metadata against current startup options. if (metadata.get()) { uassertStatusOK(factory->validateMetadata(*metadata, storageGlobalParams)); } ScopeGuard guard = MakeGuard([&] { if (_lockFile) { _lockFile->close(); } }); _storageEngine = factory->create(storageGlobalParams, _lockFile.get()); _storageEngine->finishInit(); if (_lockFile) { uassertStatusOK(_lockFile->writePid()); } // Write a new metadata file if it is not present. if (!metadata.get()) { invariant(!storageGlobalParams.readOnly); metadata.reset(new StorageEngineMetadata(storageGlobalParams.dbpath)); metadata->setStorageEngine(factory->getCanonicalName().toString()); metadata->setStorageEngineOptions(factory->createMetadataOptions(storageGlobalParams)); uassertStatusOK(metadata->write()); } guard.Dismiss(); _supportsDocLocking = _storageEngine->supportsDocLocking(); }
/**
 * Processes the response to a batch request previously sent to the remote at 'remoteIndex'.
 *
 * Runs with '_mutex' held for its whole duration. When the merger is being killed, the batch is
 * ignored and, once no batch requests remain outstanding, the kill is completed. Otherwise the
 * response is parsed, the remote's cursor id and buffer are updated, and another batch is
 * requested if nothing is buffered yet and the cursor is not tailable. On every path taken while
 * the merger is alive, signalCurrentEventIfReady_inlock() is called before returning.
 */
void AsyncResultsMerger::handleBatchResponse(
    const executor::TaskExecutor::RemoteCommandCallbackArgs& cbData,
    OperationContext* opCtx,
    size_t remoteIndex) {
    stdx::lock_guard<stdx::mutex> lock(_mutex);

    auto& remoteCursor = _remotes[remoteIndex];

    // Resetting the handle records that no response from this remote is pending any more.
    remoteCursor.cbHandle = executor::TaskExecutor::CallbackHandle();

    // A merger that is shutting down does not process the batch at all.
    if (_lifecycleState != kAlive) {
        invariant(_lifecycleState == kKillStarted);

        // Anyone blocked on '_currentEvent' must still be woken up during shutdown.
        signalCurrentEventIfReady_inlock();

        // Other batches are still in flight; the last one to arrive completes the kill.
        if (haveOutstandingBatchRequests_inlock()) {
            return;
        }

        // This was the last outstanding batch, so the remote cursors can be killed and this
        // cursor cleaned up. An invalid event handle means the executor is itself shutting down
        // and cannot accept more work, in which case nothing is scheduled. Signalling the event
        // announces that this cursor is safe to destroy, so 'this' may become invalid as soon as
        // the event is signalled.
        if (_killCursorsScheduledEvent.isValid()) {
            scheduleKillCursors_inlock(opCtx);
            _executor->signalEvent(_killCursorsScheduledEvent);
        }
        _lifecycleState = kKillComplete;
        return;
    }

    // From here on, every early return wakes up waiters on the current event (if ready).
    ScopeGuard wakeWaitersOnExit =
        MakeGuard(&AsyncResultsMerger::signalCurrentEventIfReady_inlock, this);

    // Either the parsed cursor response, or the transport-level error if the command failed.
    StatusWith<CursorResponse> swResponse = [&]() -> StatusWith<CursorResponse> {
        if (cbData.response.isOK()) {
            return parseCursorResponse(cbData.response.data, remoteCursor);
        }
        return cbData.response.status;
    }();

    if (!swResponse.isOK()) {
        auto owningShard = remoteCursor.getShard();
        if (owningShard) {
            owningShard->updateReplSetMonitor(remoteCursor.getTargetHost(),
                                              swResponse.getStatus());
            remoteCursor.status = swResponse.getStatus();
        } else {
            remoteCursor.status = Status(swResponse.getStatus().code(),
                                         str::stream() << "Could not find shard containing host "
                                                       << remoteCursor.getTargetHost().toString());
        }

        // With 'allowPartialResults' set the error is swallowed: the remote is taken out of
        // consideration by marking it as exhausted.
        if (_params->isAllowPartialResults) {
            remoteCursor.status = Status::OK();

            // Drop anything buffered so far and forget the cursor id.
            std::queue<ClusterQueryResult> discarded;
            std::swap(remoteCursor.docBuffer, discarded);
            remoteCursor.cursorId = 0;
        }

        return;
    }

    // The response was received and parsed successfully.
    auto response = std::move(swResponse.getValue());

    // The shard reports a cursor id of '0' once its cursor has been exhausted.
    remoteCursor.cursorId = response.getCursorId();

    // Stash the batch in the remote's buffer.
    if (!addBatchToBuffer(remoteIndex, response.getBatch())) {
        return;
    }

    if (_params->isTailable) {
        // An empty batch from a tailable cursor means the next return value must be boost::none to
        // mark the end of the batch. (Tailable cursors are only valid on unsharded collections, so
        // the end of one shard's batch is the end of the overall batch.) No further batch is
        // requested here: batches from remote tailable cursors are passed straight through to the
        // client.
        if (!remoteCursor.hasNext()) {
            _eofNext = true;
        }
    } else if (!remoteCursor.hasNext() && !remoteCursor.exhausted()) {
        // Still nothing buffered after this batch (i.e. the batchSize was zero), so the next
        // batch can be requested right away.
        remoteCursor.status = askForNextBatch_inlock(opCtx, remoteIndex);
        if (!remoteCursor.status.isOK()) {
            return;
        }
    }

    // The guard only fires on early exits once dismissed, but waiters must be signalled on success
    // too, so dismiss it and signal explicitly.
    wakeWaitersOnExit.Dismiss();
    signalCurrentEventIfReady_inlock();
}
/**
 * Adds the host(s) described by 'shardConnectionString' to the cluster as a new shard and returns
 * the name of the shard.
 *
 * 'shardProposedName' may be null, in which case a name is generated unless host validation
 * already supplied one; if non-null it must not be empty. 'maxSize' is recorded on the shard entry
 * only when it is greater than zero.
 *
 * If _checkIfShardExists() reports that the shard already exists, the call is a no-op and the
 * existing shard's name is returned. Otherwise the candidate is validated, a command is sent to
 * it (_addShard or a shardIdentity upsert, depending on the feature compatibility version), its
 * FCV is set, an entry is inserted into config.shards, the databases discovered on it are recorded
 * and the change is logged.
 *
 * Returns BadValue for an invalid connection string or an empty proposed name, OperationFailed if
 * the candidate holds a database that already exists in the cluster or if the shard cannot be
 * found after being added, and otherwise the first error reported by any of the steps above.
 */
StatusWith<std::string> ShardingCatalogManager::addShard(
    OperationContext* opCtx,
    const std::string* shardProposedName,
    const ConnectionString& shardConnectionString,
    const long long maxSize) {
    if (shardConnectionString.type() == ConnectionString::INVALID) {
        return {ErrorCodes::BadValue, "Invalid connection string"};
    }

    if (shardProposedName && shardProposedName->empty()) {
        return {ErrorCodes::BadValue, "shard name cannot be empty"};
    }

    // Only one addShard operation can be in progress at a time.
    Lock::ExclusiveLock lk(opCtx->lockState(), _kShardMembershipLock);

    // Check if this shard has already been added (can happen in the case of a retry after a network
    // error, for example) and thus this addShard request should be considered a no-op.
    auto existingShard =
        _checkIfShardExists(opCtx, shardConnectionString, shardProposedName, maxSize);
    if (!existingShard.isOK()) {
        return existingShard.getStatus();
    }
    if (existingShard.getValue()) {
        // These hosts already belong to an existing shard, so report success and terminate the
        // addShard request. Make sure to set the last optime for the client to the system last
        // optime so that we'll still wait for replication so that this state is visible in the
        // committed snapshot.
        repl::ReplClientInfo::forClient(opCtx->getClient()).setLastOpToSystemLastOpTime(opCtx);
        return existingShard.getValue()->getName();
    }

    // Force a reload of the ShardRegistry to ensure that, in case this addShard is to re-add a
    // replica set that has recently been removed, we have detached the ReplicaSetMonitor for the
    // set with that setName from the ReplicaSetMonitorManager and will create a new
    // ReplicaSetMonitor when targeting the set below.
    // Note: This is necessary because as of 3.4, removeShard is performed by mongos (unlike
    // addShard), so the ShardRegistry is not synchronously reloaded on the config server when a
    // shard is removed.
    if (!Grid::get(opCtx)->shardRegistry()->reload(opCtx)) {
        // If the first reload joined an existing one, call reload again to ensure the reload is
        // fresh.
        Grid::get(opCtx)->shardRegistry()->reload(opCtx);
    }

    // TODO: Don't create a detached Shard object, create a detached RemoteCommandTargeter instead.
    const std::shared_ptr<Shard> shard{
        Grid::get(opCtx)->shardRegistry()->createConnection(shardConnectionString)};
    invariant(shard);
    auto targeter = shard->getTargeter();

    // Runs on every return path below until it is dismissed just before the successful return.
    auto stopMonitoringGuard = MakeGuard([&] {
        if (shardConnectionString.type() == ConnectionString::SET) {
            // This is a workaround for the case where we could have some bad shard being
            // requested to be added and we put that bad connection string on the global replica set
            // monitor registry. It needs to be cleaned up so that when a correct replica set is
            // added, it will be recreated.
            ReplicaSetMonitor::remove(shardConnectionString.getSetName());
        }
    });

    // Validate the specified connection string may serve as shard at all
    auto shardStatus =
        _validateHostAsShard(opCtx, targeter, shardProposedName, shardConnectionString);
    if (!shardStatus.isOK()) {
        return shardStatus.getStatus();
    }
    // Refers to the value held inside 'shardStatus'; it is modified in place below.
    ShardType& shardType = shardStatus.getValue();

    // Check that none of the existing shard candidate's dbs exist already
    auto dbNamesStatus = _getDBNamesListFromShard(opCtx, targeter);
    if (!dbNamesStatus.isOK()) {
        return dbNamesStatus.getStatus();
    }

    for (const auto& dbName : dbNamesStatus.getValue()) {
        auto dbt = Grid::get(opCtx)->catalogClient()->getDatabase(
            opCtx, dbName, repl::ReadConcernLevel::kLocalReadConcern);
        if (dbt.isOK()) {
            // The cluster already knows a database with this name: refuse to add the shard.
            const auto& dbDoc = dbt.getValue().value;
            return Status(ErrorCodes::OperationFailed,
                          str::stream() << "can't add shard "
                                        << "'"
                                        << shardConnectionString.toString()
                                        << "'"
                                        << " because a local database '"
                                        << dbName
                                        << "' exists in another "
                                        << dbDoc.getPrimary());
        } else if (dbt != ErrorCodes::NamespaceNotFound) {
            // NamespaceNotFound is the expected outcome; any other error aborts the operation.
            return dbt.getStatus();
        }
    }

    // Check that the shard candidate does not have a local config.system.sessions collection
    auto res = _dropSessionsCollection(opCtx, targeter);

    if (!res.isOK()) {
        return res.withContext(
            "can't add shard with a local copy of config.system.sessions, please drop this "
            "collection from the shard manually and try again.");
    }

    // If a name for a shard wasn't provided, generate one
    if (shardType.getName().empty()) {
        auto result = generateNewShardName(opCtx);
        if (!result.isOK()) {
            return result.getStatus();
        }
        shardType.setName(result.getValue());
    }

    if (maxSize > 0) {
        shardType.setMaxSizeMB(maxSize);
    }

    // Helper function that runs a command on the to-be shard and returns the status
    auto runCmdOnNewShard = [this, &opCtx, &targeter](const BSONObj& cmd) -> Status {
        auto swCommandResponse =
            _runCommandForAddShard(opCtx, targeter.get(), NamespaceString::kAdminDb, cmd);
        if (!swCommandResponse.isOK()) {
            return swCommandResponse.getStatus();
        }
        // Grabs the underlying status from a StatusWith object by taking the first
        // non-OK status, if there is one. This is needed due to the semantics of
        // _runCommandForAddShard.
        auto commandResponse = std::move(swCommandResponse.getValue());
        BatchedCommandResponse batchResponse;
        return Shard::CommandResponse::processBatchWriteResponse(commandResponse, &batchResponse);
    };

    AddShard addShardCmd = add_shard_util::createAddShardCmd(opCtx, shardType.getName());

    // Built by an immediately-invoked lambda so that the command form can depend on the FCV.
    auto addShardCmdBSON = [&]() {
        // In 4.2, use the _addShard command to add the shard, which in turn inserts a
        // shardIdentity document into the shard and triggers sharding state initialization.
        // In the unlikely scenario that there's a downgrade to 4.0 between the
        // construction of this command object and the issuing of the command
        // on the receiving shard, the user will receive a rather harmless
        // CommandNotFound error for _addShard, and can simply retry.
        if (serverGlobalParams.featureCompatibility.getVersion() ==
            ServerGlobalParams::FeatureCompatibility::Version::kFullyUpgradedTo42) {
            // Needed for IDL toBSON method
            BSONObj passthroughFields;
            return addShardCmd.toBSON(passthroughFields);
        } else {
            // To support backwards compatibility with v4.0 shards, insert a shardIdentity document
            // directly.
            return add_shard_util::createShardIdentityUpsertForAddShard(addShardCmd);
        }
    }();

    auto addShardStatus = runCmdOnNewShard(addShardCmdBSON);

    if (!addShardStatus.isOK()) {
        return addShardStatus;
    }

    {
        // Hold the fcvLock across checking the FCV, sending setFCV to the new shard, and
        // writing the entry for the new shard to config.shards. This ensures the FCV doesn't change
        // after we send setFCV to the new shard, but before we write its entry to config.shards.
        // (Note, we don't use a Global IX lock here, because we don't want to hold the global lock
        // while blocking on the network).
        invariant(!opCtx->lockState()->isLocked());
        Lock::SharedLock lk(opCtx->lockState(), FeatureCompatibilityVersion::fcvLock);

        // Send 4.2 if this node is fully upgraded or upgrading to 4.2, and 4.0 in all other cases.
        BSONObj setFCVCmd;
        switch (serverGlobalParams.featureCompatibility.getVersion()) {
            case ServerGlobalParams::FeatureCompatibility::Version::kFullyUpgradedTo42:
            case ServerGlobalParams::FeatureCompatibility::Version::kUpgradingTo42:
                setFCVCmd = BSON(FeatureCompatibilityVersionCommandParser::kCommandName
                                 << FeatureCompatibilityVersionParser::kVersion42
                                 << WriteConcernOptions::kWriteConcernField
                                 << opCtx->getWriteConcern().toBSON());
                break;
            default:
                setFCVCmd = BSON(FeatureCompatibilityVersionCommandParser::kCommandName
                                 << FeatureCompatibilityVersionParser::kVersion40
                                 << WriteConcernOptions::kWriteConcernField
                                 << opCtx->getWriteConcern().toBSON());
                break;
        }
        auto versionResponse =
            _runCommandForAddShard(opCtx, targeter.get(), NamespaceString::kAdminDb, setFCVCmd);
        // Both the outer status and the command's own status must be OK.
        if (!versionResponse.isOK()) {
            return versionResponse.getStatus();
        }

        if (!versionResponse.getValue().commandStatus.isOK()) {
            return versionResponse.getValue().commandStatus;
        }

        log() << "going to insert new entry for shard into config.shards: "
              << shardType.toString();

        Status result = Grid::get(opCtx)->catalogClient()->insertConfigDocument(
            opCtx,
            ShardType::ConfigNS,
            shardType.toBSON(),
            ShardingCatalogClient::kLocalWriteConcern);
        if (!result.isOK()) {
            log() << "error adding shard: " << shardType.toBSON() << " err: " << result.reason();
            return result;
        }
    }

    // Add all databases which were discovered on the new shard
    for (const auto& dbName : dbNamesStatus.getValue()) {
        DatabaseType dbt(dbName, shardType.getName(), false, databaseVersion::makeNew());

        {
            const auto status = Grid::get(opCtx)->catalogClient()->updateConfigDocument(
                opCtx,
                DatabaseType::ConfigNS,
                BSON(DatabaseType::name(dbName)),
                dbt.toBSON(),
                true,
                ShardingCatalogClient::kLocalWriteConcern);
            // A failure to record a database is only logged; it does not fail the addShard.
            if (!status.isOK()) {
                log() << "adding shard " << shardConnectionString.toString()
                      << " even though could not add database " << dbName;
            }
        }
    }

    // Record in changelog
    BSONObjBuilder shardDetails;
    shardDetails.append("name", shardType.getName());
    shardDetails.append("host", shardConnectionString.toString());

    // The result of logChange() is not checked.
    Grid::get(opCtx)->catalogClient()->logChange(
        opCtx, "addShard", "", shardDetails.obj(), ShardingCatalogClient::kMajorityWriteConcern);

    // Ensure the added shard is visible to this process.
    auto shardRegistry = Grid::get(opCtx)->shardRegistry();
    if (!shardRegistry->getShard(opCtx, shardType.getName()).isOK()) {
        return {ErrorCodes::OperationFailed,
                "Could not find shard metadata for shard after adding it. This most likely "
                "indicates that the shard was removed immediately after it was added."};
    }
    stopMonitoringGuard.Dismiss();

    return shardType.getName();
}
/**
 * Commits the donation of the chunk described by '_args' to the recipient shard.
 *
 * Asks the clone driver to commit the clone, then writes the new chunk ownership (and a version
 * bump for one chunk remaining on this shard, if any) to the config server through
 * applyChunkOpsDeprecated(). If that write reports success, the cached collection metadata is
 * updated. If it reports failure, the collection metadata is refreshed to find out whether the
 * commit nevertheless took effect; inconsistent outcomes are treated as fatal via fassert.
 *
 * Must be called with no locks held and while in the kCriticalSection state. On every error
 * return, cleanupOnError() is run by a scope guard; on success _cleanup() is called instead and
 * Status::OK() is returned.
 */
Status MigrationSourceManager::commitDonateChunk(OperationContext* txn) {
    invariant(!txn->lockState()->isLocked());
    invariant(_state == kCriticalSection);
    // Dismissed just before the explicit _cleanup() call at the end of the function.
    auto scopedGuard = MakeGuard([&] { cleanupOnError(txn); });

    // Tell the recipient shard to fetch the latest changes
    Status commitCloneStatus = _cloneDriver->commitClone(txn);

    // The fail point turns a successful clone commit into an InternalError.
    if (MONGO_FAIL_POINT(failMigrationCommit) && commitCloneStatus.isOK()) {
        commitCloneStatus = {ErrorCodes::InternalError,
                             "Failing _recvChunkCommit due to failpoint."};
    }

    if (!commitCloneStatus.isOK()) {
        return {commitCloneStatus.code(),
                str::stream() << "commit clone failed due to " << commitCloneStatus.toString()};
    }

    // Generate the next collection version.
    ChunkVersion uncommittedCollVersion = _committedMetadata->getCollVersion();
    uncommittedCollVersion.incMajor();

    // applyOps preparation for reflecting the uncommitted metadata on the config server

    // Preconditions: built from the collection's chunks ordered by lastmod descending and the
    // currently committed collection version as the expected lastmod.
    BSONArrayBuilder preCond;
    {
        BSONObjBuilder b;
        b.append("ns", ChunkType::ConfigNS);
        b.append("q",
                 BSON("query" << BSON(ChunkType::ns(_args.getNss().ns())) << "orderby"
                              << BSON(ChunkType::DEPRECATED_lastmod() << -1)));
        {
            BSONObjBuilder bb(b.subobjStart("res"));

            // TODO: For backwards compatibility, we can't yet require an epoch here
            bb.appendTimestamp(ChunkType::DEPRECATED_lastmod(),
                               _committedMetadata->getCollVersion().toLong());
            bb.done();
        }
        preCond.append(b.obj());
    }

    // Update for the chunk which is being donated: it gets the new major version and the
    // recipient shard as its owner.
    BSONArrayBuilder updates;
    {
        BSONObjBuilder op;
        op.append("op", "u");
        op.appendBool("b", false);  // No upserting
        op.append("ns", ChunkType::ConfigNS);

        BSONObjBuilder n(op.subobjStart("o"));
        n.append(ChunkType::name(), ChunkType::genID(_args.getNss().ns(), _args.getMinKey()));
        uncommittedCollVersion.addToBSON(n, ChunkType::DEPRECATED_lastmod());
        n.append(ChunkType::ns(), _args.getNss().ns());
        n.append(ChunkType::min(), _args.getMinKey());
        n.append(ChunkType::max(), _args.getMaxKey());
        n.append(ChunkType::shard(), _args.getToShardId());
        n.done();

        BSONObjBuilder q(op.subobjStart("o2"));
        q.append(ChunkType::name(), ChunkType::genID(_args.getNss().ns(), _args.getMinKey()));
        q.done();

        updates.append(op.obj());
    }

    // Update for a chunk that stays on this shard, if there is one

    // Version at which the next highest lastmod will be set. If the chunk being moved is the last
    // in the shard, nextVersion is that chunk's lastmod otherwise the highest version is from the
    // chunk being bumped on the FROM-shard.
    ChunkVersion nextVersion = uncommittedCollVersion;

    // If we have chunks left on the FROM shard, update the version of one of them as well. We can
    // figure that out by grabbing the metadata as it has been changed.
    if (_committedMetadata->getNumChunks() > 1) {
        ChunkType bumpChunk;
        invariant(_committedMetadata->getDifferentChunk(_args.getMinKey(), &bumpChunk));

        BSONObj bumpMin = bumpChunk.getMin();
        BSONObj bumpMax = bumpChunk.getMax();
        nextVersion.incMinor();

        // The bumped chunk must not be the chunk that is being donated.
        dassert(bumpMin.woCompare(_args.getMinKey()) != 0);

        BSONObjBuilder op;
        op.append("op", "u");
        op.appendBool("b", false);
        op.append("ns", ChunkType::ConfigNS);

        BSONObjBuilder n(op.subobjStart("o"));
        n.append(ChunkType::name(), ChunkType::genID(_args.getNss().ns(), bumpMin));
        nextVersion.addToBSON(n, ChunkType::DEPRECATED_lastmod());
        n.append(ChunkType::ns(), _args.getNss().ns());
        n.append(ChunkType::min(), bumpMin);
        n.append(ChunkType::max(), bumpMax);
        n.append(ChunkType::shard(), _args.getFromShardId());
        n.done();

        BSONObjBuilder q(op.subobjStart("o2"));
        q.append(ChunkType::name(), ChunkType::genID(_args.getNss().ns(), bumpMin));
        q.done();

        updates.append(op.obj());

        log() << "moveChunk updating self version to: " << nextVersion << " through " << bumpMin
              << " -> " << bumpMax << " for collection '" << _args.getNss().ns() << "'";
    } else {
        log() << "moveChunk moved last chunk out for collection '" << _args.getNss().ns() << "'";
    }

    MONGO_FAIL_POINT_PAUSE_WHILE_SET(hangBeforeCommitMigration);

    Status applyOpsStatus = grid.catalogClient(txn)->applyChunkOpsDeprecated(
        txn, updates.arr(), preCond.arr(), _args.getNss().ns(), nextVersion);

    // The fail point overrides the real result, so the write may actually have been applied.
    if (MONGO_FAIL_POINT(failCommitMigrationCommand)) {
        applyOpsStatus = Status(ErrorCodes::InternalError,
                                "Failpoint 'failCommitMigrationCommand' generated error");
    }

    if (applyOpsStatus.isOK()) {
        // Now that applyOps succeeded and the new collection version is committed, update the
        // collection metadata to the new collection version and forget the migrated chunk.
        ScopedTransaction scopedXact(txn, MODE_IX);
        AutoGetCollection autoColl(txn, _args.getNss(), MODE_IX, MODE_X);

        ChunkType migratingChunkToForget;
        migratingChunkToForget.setMin(_args.getMinKey());
        migratingChunkToForget.setMax(_args.getMaxKey());
        _committedMetadata =
            _committedMetadata->cloneMigrate(migratingChunkToForget, uncommittedCollVersion);
        auto css = CollectionShardingState::get(txn, _args.getNss().ns());
        css->setMetadata(_committedMetadata);
    } else {
        // This could be an unrelated error (e.g. network error). Check whether the metadata update
        // succeeded by refreshing the collection metadata from the config server and checking that
        // the original chunks no longer exist.

        warning() << "Migration metadata commit may have failed: refreshing metadata to check"
                  << causedBy(applyOpsStatus);

        // Need to get the latest optime in case the refresh request goes to a secondary --
        // otherwise the read won't wait for the write that applyChunkOpsDeprecated may have done.
        Status status = grid.catalogClient(txn)->logChange(
            txn,
            "moveChunk.validating",
            _args.getNss().ns(),
            BSON("min" << _args.getMinKey() << "max" << _args.getMaxKey() << "from"
                       << _args.getFromShardId()
                       << "to"
                       << _args.getToShardId()));
        if (!status.isOK()) {
            fassertStatusOK(
                40137,
                {status.code(),
                 str::stream() << "applyOps failed to commit chunk [" << _args.getMinKey() << ","
                               << _args.getMaxKey()
                               << ") due to "
                               << causedBy(applyOpsStatus)
                               << ", and updating the optime with a write before refreshing the "
                               << "metadata also failed: "
                               << causedBy(status)});
        }

        ShardingState* const shardingState = ShardingState::get(txn);
        ChunkVersion shardVersion;
        Status refreshStatus =
            shardingState->refreshMetadataNow(txn, _args.getNss().ns(), &shardVersion);
        // The status passed to fassertStatusOK carries refreshStatus's code, so whether this
        // fires is determined by the refresh result.
        fassertStatusOK(34431,
                        {refreshStatus.code(),
                         str::stream() << "applyOps failed to commit chunk [" << _args.getMinKey()
                                       << ","
                                       << _args.getMaxKey()
                                       << ") due to "
                                       << causedBy(applyOpsStatus)
                                       << ", and refreshing collection metadata failed: "
                                       << causedBy(refreshStatus)});

        {
            ScopedTransaction scopedXact(txn, MODE_IS);
            AutoGetCollection autoColl(txn, _args.getNss(), MODE_IS);

            auto css = CollectionShardingState::get(txn, _args.getNss());
            std::shared_ptr<CollectionMetadata> refreshedMetadata = css->getMetadata();

            if (refreshedMetadata->keyBelongsToMe(_args.getMinKey())) {
                invariant(refreshedMetadata->getCollVersion() ==
                          _committedMetadata->getCollVersion());

                // After refresh, the collection metadata indicates that the donor shard still owns
                // the chunk, so no migration changes were written to the config server metadata.
                return {applyOpsStatus.code(),
                        str::stream() << "Migration was not committed, applyOps failed: "
                                      << causedBy(applyOpsStatus)};
            }

            ChunkVersion refreshedCollectionVersion = refreshedMetadata->getCollVersion();
            if (!refreshedCollectionVersion.equals(nextVersion)) {
                // The refreshed collection metadata's collection version does not match the control
                // chunk's updated collection version, which should now be the highest. The control
                // chunk was not committed, but the migrated chunk was. This state is not
                // recoverable.
                fassertStatusOK(40138,
                                {applyOpsStatus.code(),
                                 str::stream() << "Migration was partially committed, state is "
                                               << "unrecoverable. applyOps error: "
                                               << causedBy(applyOpsStatus)});
            }
        }
    }

    MONGO_FAIL_POINT_PAUSE_WHILE_SET(hangBeforeLeavingCriticalSection);

    // Success: switch from the error-path cleanup to the regular cleanup.
    scopedGuard.Dismiss();
    _cleanup(txn);

    // The result of this logChange() call is not checked.
    grid.catalogClient(txn)->logChange(txn,
                                       "moveChunk.commit",
                                       _args.getNss().ns(),
                                       BSON("min" << _args.getMinKey() << "max" << _args.getMaxKey()
                                                  << "from"
                                                  << _args.getFromShardId()
                                                  << "to"
                                                  << _args.getToShardId()));

    return Status::OK();
}
/**
 * Renames a collection. The source namespace is read from the command field whose name is the
 * command's own 'name'; the target namespace is read from the "to" field.
 *
 * Runs entirely under the global write lock. If source and target are in the same database the
 * namespace is simply renamed. Otherwise a new target collection is created, the index
 * descriptions and all documents are copied over, and the source collection is dropped.
 *
 * Command options read here: "dropTarget" (drop an existing target instead of failing) and
 * "stayTemp" (passed through to Database::renameCollection for same-database renames).
 *
 * Returns true on success. On failure returns false with 'errmsg' set, or the value of
 * appendCommandStatus(result, status). Unless 'fromRepl' is set, both namespaces are checked with
 * userAllowedWriteNS() and the command is logged with repl::logOp() on success.
 */
virtual bool run(OperationContext* txn,
                 const string& dbname,
                 BSONObj& cmdObj,
                 int,
                 string& errmsg,
                 BSONObjBuilder& result,
                 bool fromRepl) {
    Lock::GlobalWrite globalWriteLock(txn->lockState());

    string source = cmdObj.getStringField( name.c_str() );
    string target = cmdObj.getStringField( "to" );

    // We stay in source context the whole time. This is mostly to set the CurOp namespace.
    Client::Context ctx(txn, source);

    if ( !NamespaceString::validCollectionComponent(target.c_str()) ) {
        errmsg = "invalid collection name: " + target;
        return false;
    }
    if ( source.empty() || target.empty() ) {
        errmsg = "invalid command syntax";
        return false;
    }

    if (!fromRepl) { // If it got through on the master, need to allow it here too
        Status sourceStatus = userAllowedWriteNS(source);
        if (!sourceStatus.isOK()) {
            errmsg = "error with source namespace: " + sourceStatus.reason();
            return false;
        }

        Status targetStatus = userAllowedWriteNS(target);
        if (!targetStatus.isOK()) {
            errmsg = "error with target namespace: " + targetStatus.reason();
            return false;
        }
    }

    if (NamespaceString(source).coll() == "system.indexes"
        || NamespaceString(target).coll() == "system.indexes") {
        errmsg = "renaming system.indexes is not allowed";
        return false;
    }

    // The source database is only looked up, never opened; a missing database or collection is
    // reported as a missing source namespace.
    Database* const sourceDB = dbHolder().get(txn, nsToDatabase(source));
    Collection* const sourceColl = sourceDB ? sourceDB->getCollection(txn, source)
                                            : NULL;
    if (!sourceColl) {
        errmsg = "source namespace does not exist";
        return false;
    }

    {
        // Ensure that collection name does not exceed maximum length.
        // Ensure that index names do not push the length over the max.
        // Iterator includes unfinished indexes.
        IndexCatalog::IndexIterator sourceIndIt =
            sourceColl->getIndexCatalog()->getIndexIterator( txn, true );
        int longestIndexNameLength = 0;
        while ( sourceIndIt.more() ) {
            int thisLength = sourceIndIt.next()->indexName().length();
            if ( thisLength > longestIndexNameLength )
                longestIndexNameLength = thisLength;
        }

        // NOTE(review): the right-hand side is computed as int and then stored in an unsigned
        // int; if an index name were long enough to make it negative the limit would wrap to a
        // very large value. Confirm index name lengths are bounded elsewhere.
        unsigned int longestAllowed =
            min(int(NamespaceString::MaxNsCollectionLen),
                int(NamespaceString::MaxNsLen) - 2/*strlen(".$")*/ - longestIndexNameLength);
        if (target.size() > longestAllowed) {
            StringBuilder sb;
            sb << "collection name length of " << target.size()
               << " exceeds maximum length of " << longestAllowed
               << ", allowing for index names";
            errmsg = sb.str();
            return false;
        }
    }

    const std::vector<BSONObj> indexesInProg = stopIndexBuilds(txn, sourceDB, cmdObj);
    // Dismissed on success; on any failure return below the guard calls
    // IndexBuilder::restoreIndexes with the stopped builds.
    ScopeGuard indexBuildRestorer = MakeGuard(IndexBuilder::restoreIndexes, indexesInProg);

    Database* const targetDB = dbHolder().openDb(txn, nsToDatabase(target));

    {
        WriteUnitOfWork wunit(txn);

        // Check if the target namespace exists and if dropTarget is true.
        // If target exists and dropTarget is not true, return false.
        if (targetDB->getCollection(txn, target)) {
            if (!cmdObj["dropTarget"].trueValue()) {
                errmsg = "target namespace exists";
                return false;
            }

            Status s = targetDB->dropCollection(txn, target);
            if ( !s.isOK() ) {
                errmsg = s.toString();
                return false;
            }
        }

        // If we are renaming in the same database, just
        // rename the namespace and we're done.
        if (sourceDB == targetDB) {
            Status s = targetDB->renameCollection(txn,
                                                  source,
                                                  target,
                                                  cmdObj["stayTemp"].trueValue() );
            if (!s.isOK()) {
                return appendCommandStatus(result, s);
            }

            if (!fromRepl) {
                repl::logOp(txn, "c", (dbname + ".$cmd").c_str(), cmdObj);
            }

            wunit.commit();
            indexBuildRestorer.Dismiss();
            return true;
        }

        // Cross-database rename: commit the drop of the old target (if any) before copying.
        wunit.commit();
    }

    // If we get here, we are renaming across databases, so we must copy all the data and
    // indexes, then remove the source collection.

    // Create the target collection. It will be removed if we fail to copy the collection.
    // TODO use a temp collection and unset the temp flag on success.
    Collection* targetColl = NULL;
    {
        CollectionOptions options;
        options.setNoIdIndex();

        // Only the capped settings are carried over from the source collection's options.
        if (sourceColl->isCapped()) {
            const CollectionOptions sourceOpts =
                sourceColl->getCatalogEntry()->getCollectionOptions(txn);

            options.capped = true;
            options.cappedSize = sourceOpts.cappedSize;
            options.cappedMaxDocs = sourceOpts.cappedMaxDocs;
        }

        WriteUnitOfWork wunit(txn);

        // No logOp necessary because the entire renameCollection command is one logOp.
        targetColl = targetDB->createCollection(txn, target, options);
        if (!targetColl) {
            errmsg = "Failed to create target collection.";
            return false;
        }

        wunit.commit();
    }

    // Dismissed on success; otherwise the partially built target collection is dropped.
    ScopeGuard targetCollectionDropper = MakeGuard(dropCollection, txn, targetDB, target);

    MultiIndexBlock indexer(txn, targetColl);
    indexer.allowInterruption();

    // Copy the index descriptions from the source collection, adjusting the ns field.
    {
        std::vector<BSONObj> indexesToCopy;
        IndexCatalog::IndexIterator sourceIndIt =
            sourceColl->getIndexCatalog()->getIndexIterator( txn, true );
        while (sourceIndIt.more()) {
            const BSONObj currIndex = sourceIndIt.next()->infoObj();

            // Process the source index: "ns" is appended first with the target namespace, then
            // the source spec's elements are added with appendElementsUnique().
            BSONObjBuilder newIndex;
            newIndex.append("ns", target);
            newIndex.appendElementsUnique(currIndex);
            indexesToCopy.push_back(newIndex.obj());
        }
        // NOTE(review): the return value of init() is not checked here; confirm whether it can
        // report a failure that should abort the rename.
        indexer.init(indexesToCopy);
    }

    {
        // Copy over all the data from source collection to target collection.
        boost::scoped_ptr<RecordIterator> sourceIt(sourceColl->getIterator(txn));
        while (!sourceIt->isEOF()) {
            txn->checkForInterrupt(false);

            const BSONObj obj = sourceColl->docFor(txn, sourceIt->getNext());

            // Each document is inserted and committed in its own unit of work.
            WriteUnitOfWork wunit(txn);
            // No logOp necessary because the entire renameCollection command is one logOp.
            Status status = targetColl->insertDocument(txn, obj, &indexer, true).getStatus();
            if (!status.isOK())
                return appendCommandStatus(result, status);
            wunit.commit();
        }
    }

    Status status = indexer.doneInserting();
    if (!status.isOK())
        return appendCommandStatus(result, status);

    {
        // Getting here means we successfully built the target copy. We now remove the
        // source collection and finalize the rename.
        WriteUnitOfWork wunit(txn);

        Status status = sourceDB->dropCollection(txn, source);
        if (!status.isOK())
            return appendCommandStatus(result, status);

        indexer.commit();

        if (!fromRepl) {
            repl::logOp(txn, "c", (dbname + ".$cmd").c_str(), cmdObj);
        }

        wunit.commit();
    }

    indexBuildRestorer.Dismiss();
    targetCollectionDropper.Dismiss();
    return true;
}