// Handles a newly accepted connection: enforces the global connection limit,
// then spawns a detached thread running handleIncomingMsg() over the new
// socket. On success, ownership of the messaging port passes to that thread.
virtual void accepted(boost::shared_ptr<Socket> psocket, long long connectionId ) {
    // If we bail out on any early-return/failure path below, sleep briefly so
    // a tight accept/refuse cycle cannot spin the listener at full speed.
    ScopeGuard sleepAfterClosingPort = MakeGuard(sleepmillis, 2);
    std::auto_ptr<MessagingPortWithHandler> portWithHandler(
        new MessagingPortWithHandler(psocket, _handler, connectionId));

    // Refuse the connection when the global ticket pool is exhausted.
    if ( ! Listener::globalTicketHolder.tryAcquire() ) {
        log() << "connection refused because too many open connections: "
              << Listener::globalTicketHolder.used() << endl;
        return;
    }

    try {
#ifndef __linux__  // TODO: consider making this ifdef _WIN32
        // Non-Linux: let boost pick the stack size. The boost::thread object is
        // destroyed at the end of the inner scope, which detaches the thread.
        {
            boost::thread thr(stdx::bind(&handleIncomingMsg, portWithHandler.get()));
        }
#else
        // Linux: create the thread by hand so we can cap its stack size.
        pthread_attr_t attrs;
        pthread_attr_init(&attrs);
        pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED);

        static const size_t STACK_SIZE =
            1024 * 1024;  // if we change this we need to update the warning

        struct rlimit limits;
        verify(getrlimit(RLIMIT_STACK, &limits) == 0);
        if (limits.rlim_cur > STACK_SIZE) {
            size_t stackSizeToSet = STACK_SIZE;
#if !__has_feature(address_sanitizer)
            // Halve the stack in debug builds -- but not under ASAN, which
            // needs the extra head room.
            if (kDebugBuild)
                stackSizeToSet /= 2;
#endif
            pthread_attr_setstacksize(&attrs, stackSizeToSet);
        } else if (limits.rlim_cur < 1024 * 1024) {
            warning() << "Stack size set to " << (limits.rlim_cur / 1024) << "KB. We suggest 1MB"
                      << endl;
        }

        pthread_t thread;
        int failed = pthread_create(&thread, &attrs, &handleIncomingMsg, portWithHandler.get());

        pthread_attr_destroy(&attrs);

        if (failed) {
            log() << "pthread_create failed: " << errnoWithDescription(failed) << endl;
            throw boost::thread_resource_error();  // for consistency with boost::thread
        }
#endif  // __linux__

        // The spawned thread now owns the port: release it from the auto_ptr
        // and skip the failure-path sleep.
        portWithHandler.release();
        sleepAfterClosingPort.Dismiss();
    } catch ( boost::thread_resource_error& ) {
        // Could not spawn a thread: return the ticket and drop the connection.
        Listener::globalTicketHolder.release();
        log() << "can't create new thread, closing connection" << endl;
    } catch ( ... ) {
        Listener::globalTicketHolder.release();
        log() << "unknown error accepting new socket" << endl;
    }
}
// Kicks off the scatter-gather: creates the completion event and schedules one
// remote command (with 'processResponseCB' as the response callback) per
// request produced by the algorithm. Returns the event that is signaled once
// sufficient responses have been received.
StatusWith<EventHandle> ScatterGatherRunner::RunnerImpl::start(
    const RemoteCommandCallbackFn processResponseCB) {
    LockGuard lk(_mutex);
    invariant(!_started);
    _started = true;
    StatusWith<EventHandle> evh = _executor->makeEvent();
    if (!evh.isOK()) {
        return evh;
    }
    _sufficientResponsesReceived = evh.getValue();
    // If we return early (e.g. executor shutdown), signal the event anyway so
    // that waiters are released.
    ScopeGuard earlyReturnGuard = MakeGuard(&RunnerImpl::_signalSufficientResponsesReceived, this);

    std::vector<RemoteCommandRequest> requests = _algorithm->getRequests();
    for (size_t i = 0; i < requests.size(); ++i) {
        log() << "Scheduling remote command request for " << _logMessage << ": "
              << requests[i].toString();
        const StatusWith<CallbackHandle> cbh =
            _executor->scheduleRemoteCommand(requests[i], processResponseCB);
        if (cbh.getStatus() == ErrorCodes::ShutdownInProgress) {
            return StatusWith<EventHandle>(cbh.getStatus());
        }
        // Any scheduling failure other than shutdown is fatal.
        fassert(18743, cbh.getStatus());
        _callbacks.push_back(cbh.getValue());
    }

    // If nothing was scheduled (or the algorithm is already satisfied), the
    // event must be signaled now -- no callback will ever do it.
    if (_callbacks.empty() || _algorithm->hasReceivedSufficientResponses()) {
        invariant(_algorithm->hasReceivedSufficientResponses());
        _signalSufficientResponsesReceived();
    }

    earlyReturnGuard.Dismiss();
    return evh;
}
// Performs the server side of an SSL handshake on file descriptor 'fd' and
// returns the resulting SSL object. If the handshake does not complete, the
// SSL object is freed before the error is dispatched to _handleSSLError().
SSL* SSLManager::accept(int fd) {
    SSL* conn = _secure(fd);

    // Release the SSL object on any path that does not reach the happy exit.
    ScopeGuard freeOnError = MakeGuard(::SSL_free, conn);

    const int status = SSL_accept(conn);
    if (status != 1) {
        _handleSSLError(SSL_get_error(conn, status));
    }

    freeOnError.Dismiss();
    return conn;
}
// Instantiates the storage engine registered under 'name'. Validates that the
// dbpath was not previously used by a different engine, acquires the storage
// engine lock file, and writes a fresh metadata file if none exists.
void GlobalEnvironmentMongoD::setGlobalStorageEngine(const std::string& name) {
    // This should be set once.
    invariant(!_storageEngine);

    const StorageEngine::Factory* factory = _storageFactories[name];

    uassert(18656,
            str::stream() << "Cannot start server with an unknown storage engine: " << name,
            factory);

    std::string canonicalName = factory->getCanonicalName().toString();

    // Do not proceed if data directory has been used by a different storage engine previously.
    std::auto_ptr<StorageEngineMetadata> metadata =
        StorageEngineMetadata::validate(storageGlobalParams.dbpath, canonicalName);

    // Validate options in metadata against current startup options.
    if (metadata.get()) {
        uassertStatusOK(factory->validateMetadata(*metadata, storageGlobalParams));
    }

    try {
        _lockFile.reset(new StorageEngineLockFile(storageGlobalParams.dbpath));
    } catch (const std::exception& ex) {
        // Re-raise as a startup failure with a descriptive message.
        uassert(28596,
                str::stream() << "Unable to determine status of lock file in the data directory "
                              << storageGlobalParams.dbpath << ": " << ex.what(),
                false);
    }
    if (_lockFile->createdByUncleanShutdown()) {
        warning() << "Detected unclean shutdown - " << _lockFile->getFilespec()
                  << " is not empty.";
    }
    uassertStatusOK(_lockFile->open());

    // Close the lock file if engine creation/initialization throws below.
    ScopeGuard guard = MakeGuard(&StorageEngineLockFile::close, _lockFile.get());
    _storageEngine = factory->create(storageGlobalParams, *_lockFile);
    _storageEngine->finishInit();
    uassertStatusOK(_lockFile->writePid());

    // Write a new metadata file if it is not present.
    if (!metadata.get()) {
        metadata.reset(new StorageEngineMetadata(storageGlobalParams.dbpath));
        metadata->setStorageEngine(canonicalName);
        metadata->setStorageEngineOptions(factory->createMetadataOptions(storageGlobalParams));
        uassertStatusOK(metadata->write());
    }

    guard.Dismiss();

    _supportsDocLocking = _storageEngine->supportsDocLocking();
}
// Starts a two-step b-tree index insertion and returns the continuation object
// the caller later uses to complete it. If twoStepInsert() throws, the
// continuation is deleted here; otherwise ownership passes to the caller.
IndexInsertionContinuation *beginInsertIntoIndex(
        int idxNo, IndexDetails &_idx, DiskLoc _recordLoc, const BSONObj &_key,
        const Ordering& _order, bool dupsAllowed) {
    IndexInsertionContinuationImpl<V> *step = new IndexInsertionContinuationImpl<V>(
        _idx.head, _recordLoc, _key, _order, _idx);

    // Guard the allocation until the insert's first step has succeeded.
    ScopeGuard deleter = MakeGuard(boost::checked_delete<IndexInsertionContinuation>, step);
    _idx.head.btree<V>()->twoStepInsert(_idx.head, *step, dupsAllowed);

    deleter.Dismiss();
    return step;
}
// Runs 'task' inside this loader's alternative client region. Whenever the
// task does not return an OK status, _releaseResources() is invoked via the
// guard. The function is noexcept; an escaping exception terminates.
Status CollectionBulkLoaderImpl::_runTaskReleaseResourcesOnFailure(F task) noexcept {
    AlternativeClientRegion acr(_client);
    ScopeGuard cleanup = MakeGuard(&CollectionBulkLoaderImpl::_releaseResources, this);
    try {
        // The noexcept lambda wrapper makes any throw from 'task' terminate
        // rather than unwind.
        const auto result = [&task]() noexcept { return task(); }();
        if (result.isOK()) {
            // Success: keep the resources alive for the caller.
            cleanup.Dismiss();
        }
        return result;
    } catch (...) {
        std::terminate();
    }
}
// Performs the server side of an SSL handshake over 'socket'. SSL_accept is
// retried until _doneWithSSLOp() reports the operation has finished. On
// handshake failure, the connection's SSL state and network BIO are freed
// before the error is dispatched.
SSLConnection* SSLManager::accept(Socket* socket) {
    SSLConnection* conn = new SSLConnection(_serverContext, socket);

    // Free both OpenSSL resources on any path that does not reach the happy exit.
    ScopeGuard freeSSL = MakeGuard(::SSL_free, conn->ssl);
    ScopeGuard freeBIO = MakeGuard(::BIO_free, conn->networkBIO);

    int rc;
    do {
        rc = ::SSL_accept(conn->ssl);
    } while (!_doneWithSSLOp(conn, rc));

    if (rc != 1) {
        _handleSSLError(SSL_get_error(conn, rc));
    }

    freeSSL.Dismiss();
    freeBIO.Dismiss();
    return conn;
}
// Kicks off the scatter-gather on 'executor': creates the completion event,
// records 'onCompletion', and schedules one remote command per request
// produced by the algorithm. Returns the event that is signaled once
// sufficient responses have been received.
StatusWith<ReplicationExecutor::EventHandle> ScatterGatherRunner::start(
    ReplicationExecutor* executor, const stdx::function<void ()>& onCompletion) {
    invariant(!_started);
    _started = true;
    _actualResponses = 0;
    _onCompletion = onCompletion;
    StatusWith<ReplicationExecutor::EventHandle> evh = executor->makeEvent();
    if (!evh.isOK()) {
        return evh;
    }
    _sufficientResponsesReceived = evh.getValue();
    // If we return early (e.g. executor shutdown), signal the event anyway so
    // that waiters are released.
    ScopeGuard earlyReturnGuard = MakeGuard(
        &ScatterGatherRunner::_signalSufficientResponsesReceived, this, executor);
    const ReplicationExecutor::RemoteCommandCallbackFn cb = stdx::bind(
        &ScatterGatherRunner::_processResponse, stdx::placeholders::_1, this);

    std::vector<RemoteCommandRequest> requests = _algorithm->getRequests();
    for (size_t i = 0; i < requests.size(); ++i) {
        const StatusWith<ReplicationExecutor::CallbackHandle> cbh =
            executor->scheduleRemoteCommand(requests[i], cb);
        if (cbh.getStatus() == ErrorCodes::ShutdownInProgress) {
            return StatusWith<ReplicationExecutor::EventHandle>(cbh.getStatus());
        }
        // Any scheduling failure other than shutdown is fatal.
        fassert(18743, cbh.getStatus());
        _callbacks.push_back(cbh.getValue());
    }

    // If nothing was scheduled (or the algorithm is already satisfied), the
    // event must be signaled now -- no callback will ever do it.
    if (_callbacks.empty() || _algorithm->hasReceivedSufficientResponses()) {
        invariant(_algorithm->hasReceivedSufficientResponses());
        _signalSufficientResponsesReceived(executor);
    }

    earlyReturnGuard.Dismiss();
    return evh;
}
// Deletes every junk item that is neither ugly nor useful, nulling out its
// slot in m_storage. Scanning stops at the first ugly item, which is returned;
// NULL is returned when the whole pile has been processed. Guards arrange for
// invariants to be checked and storage holes eliminated on every exit, while
// observers are notified only when no exception is propagating.
Junk * Attic::RemoveUselessJunk( void ) {
    ScopeGuard notifyGuard = MakeObjGuard( *this, &Attic::CallObservers );
    ScopeGuard invariantGuard = MakeObjGuard( *this, &Attic::CheckInvariants, __FUNCTION__, __LINE__ );
    ScopeGuard compactGuard = MakeObjGuard( *this, &Attic::EliminateHoles );
    // Observers should not hear about a change that is being unwound.
    notifyGuard.SetExceptionPolicy( ScopeGuardImplBase::CallIfNoException );
    (void)invariantGuard;
    (void)compactGuard;

    for ( JunkPileIter it( m_storage.begin() ); it != m_storage.end(); ++it ) {
        Junk * item = *it;
        if ( item->IsUgly() ) {
            return item;
        }
        if ( item->IsUseful() ) {
            continue;
        }
        delete item;
        *it = NULL;
    }
    return NULL;
}
// Callback invoked by the task executor when a getMore batch (or error) comes
// back from the remote at 'remoteIndex'. Buffers returned documents, records
// per-remote errors, maintains the sorted-merge queue, and signals
// '_currentEvent' waiters; also drives kill/cleanup when shutting down.
void AsyncClusterClientCursor::handleBatchResponse(
    const executor::TaskExecutor::RemoteCommandCallbackArgs& cbData, size_t remoteIndex) {
    stdx::lock_guard<stdx::mutex> lk(_mutex);

    auto& remote = _remotes[remoteIndex];

    // Clear the callback handle. This indicates that we are no longer waiting on a response from
    // 'remote'.
    remote.cbHandle = executor::TaskExecutor::CallbackHandle();

    // If we're in the process of shutting down then there's no need to process the batch.
    if (_lifecycleState != kAlive) {
        invariant(_lifecycleState == kKillStarted);

        // Make sure to wake up anyone waiting on '_currentEvent' if we're shutting down.
        signalCurrentEvent_inlock();

        // If we're killed and we're not waiting on any more batches to come back, then we are ready
        // to kill the cursors on the remote hosts and clean up this cursor. Schedule the
        // killCursors command and signal that this cursor is safe now safe to destroy. We have to
        // promise not to touch any members of this class because 'this' could become invalid as
        // soon as we signal the event.
        if (!haveOutstandingBatchRequests_inlock()) {
            // If the event handle is invalid, then the executor is in the middle of shutting down,
            // and we can't schedule any more work for it to complete.
            if (_killCursorsScheduledEvent.isValid()) {
                scheduleKillCursors_inlock();
                _executor->signalEvent(_killCursorsScheduledEvent);
            }

            _lifecycleState = kKillComplete;
        }
        return;
    }

    // Early return from this point on signal anyone waiting on an event, if ready() is true.
    ScopeGuard signaller = MakeGuard(&AsyncClusterClientCursor::signalCurrentEvent_inlock, this);

    if (!cbData.response.isOK()) {
        _remotes[remoteIndex].status = cbData.response.getStatus();
        return;
    }

    auto getMoreParseStatus = GetMoreResponse::parseFromBSON(cbData.response.getValue().data);
    if (!getMoreParseStatus.isOK()) {
        _remotes[remoteIndex].status = getMoreParseStatus.getStatus();
        return;
    }

    auto getMoreResponse = getMoreParseStatus.getValue();

    // If we have a cursor established, and we get a non-zero cursorid that is not equal to the
    // established cursorid, we will fail the operation.
    if (remote.cursorId && getMoreResponse.cursorId != 0 &&
        *remote.cursorId != getMoreResponse.cursorId) {
        _remotes[remoteIndex].status =
            Status(ErrorCodes::BadValue,
                   str::stream() << "Expected cursorid " << *remote.cursorId << " but received "
                                 << getMoreResponse.cursorId);
        return;
    }

    remote.cursorId = getMoreResponse.cursorId;

    for (const auto& obj : getMoreResponse.batch) {
        remote.docBuffer.push(obj);
    }

    // If we're doing a sorted merge, then we have to make sure to put this remote onto the
    // merge queue.
    if (!_params.sort.isEmpty() && !getMoreResponse.batch.empty()) {
        _mergeQueue.push(remoteIndex);
    }

    // ScopeGuard requires dismiss on success, but we want waiter to be signalled on success as
    // well as failure.
    signaller.Dismiss();
    signalCurrentEvent_inlock();
}
// Selects and instantiates the storage engine for this server, reconciling the
// configured engine with any metadata left in the dbpath by a previous run.
// Also enforces read-only-mode constraints and (re)writes the storage engine
// metadata file.
void ServiceContextMongoD::initializeGlobalStorageEngine() {
    // This should be set once.
    invariant(!_storageEngine);

    // We should have a _lockFile or be in read-only mode. Confusingly, we can still have a lockFile
    // if we are in read-only mode. This can happen if the server is started in read-only mode on a
    // writable dbpath.
    invariant(_lockFile || storageGlobalParams.readOnly);

    const std::string dbpath = storageGlobalParams.dbpath;
    if (auto existingStorageEngine = StorageEngineMetadata::getStorageEngineForPath(dbpath)) {
        if (storageGlobalParams.engineSetByUser) {
            // Verify that the name of the user-supplied storage engine matches the contents of
            // the metadata file.
            const StorageEngine::Factory* factory =
                mapFindWithDefault(_storageFactories,
                                   storageGlobalParams.engine,
                                   static_cast<const StorageEngine::Factory*>(nullptr));

            if (factory) {
                uassert(28662,
                        str::stream()
                            << "Cannot start server. Detected data files in " << dbpath
                            << " created by"
                            << " the '" << *existingStorageEngine << "' storage engine, but the"
                            << " specified storage engine was '" << factory->getCanonicalName()
                            << "'.",
                        factory->getCanonicalName() == *existingStorageEngine);
            }
        } else {
            // Otherwise set the active storage engine as the contents of the metadata file.
            log() << "Detected data files in " << dbpath << " created by the '"
                  << *existingStorageEngine << "' storage engine, so setting the active"
                  << " storage engine to '" << *existingStorageEngine << "'.";
            storageGlobalParams.engine = *existingStorageEngine;
        }
    } else if (!storageGlobalParams.engineSetByUser) {
        // Ensure the default storage engine is available with this build of mongod.
        uassert(28663,
                str::stream()
                    << "Cannot start server. The default storage engine '"
                    << storageGlobalParams.engine
                    << "' is not available with this build of mongod. Please specify a different"
                    << " storage engine explicitly, e.g. --storageEngine=mmapv1.",
                isRegisteredStorageEngine(storageGlobalParams.engine));
    }

    const StorageEngine::Factory* factory = _storageFactories[storageGlobalParams.engine];

    uassert(18656,
            str::stream() << "Cannot start server with an unknown storage engine: "
                          << storageGlobalParams.engine,
            factory);

    if (storageGlobalParams.readOnly) {
        uassert(34368,
                str::stream()
                    << "Server was started in read-only mode, but the configured storage engine, "
                    << storageGlobalParams.engine << ", does not support read-only operation",
                factory->supportsReadOnly());
    }

    std::unique_ptr<StorageEngineMetadata> metadata = StorageEngineMetadata::forPath(dbpath);

    if (storageGlobalParams.readOnly) {
        uassert(34415,
                "Server was started in read-only mode, but the storage metadata file was not"
                " found.",
                metadata.get());
    }

    // Validate options in metadata against current startup options.
    if (metadata.get()) {
        uassertStatusOK(factory->validateMetadata(*metadata, storageGlobalParams));
    }

    // Close the lock file (when we hold one) if engine creation or
    // initialization throws below.
    ScopeGuard guard = MakeGuard([&] {
        if (_lockFile) {
            _lockFile->close();
        }
    });

    _storageEngine = factory->create(storageGlobalParams, _lockFile.get());
    _storageEngine->finishInit();

    if (_lockFile) {
        uassertStatusOK(_lockFile->writePid());
    }

    // Write a new metadata file if it is not present.
    if (!metadata.get()) {
        invariant(!storageGlobalParams.readOnly);
        metadata.reset(new StorageEngineMetadata(storageGlobalParams.dbpath));
        metadata->setStorageEngine(factory->getCanonicalName().toString());
        metadata->setStorageEngineOptions(factory->createMetadataOptions(storageGlobalParams));
        uassertStatusOK(metadata->write());
    }

    guard.Dismiss();

    _supportsDocLocking = _storageEngine->supportsDocLocking();
}
// Deletes the documents produced by the child stage one at a time, retrying an
// individual document on WriteConflictException and, when
// _params.returnDeleted is set, returning the deleted document to the caller
// (possibly on a later work() call via _idReturning).
PlanStage::StageState DeleteStage::work(WorkingSetID* out) {
    ++_commonStats.works;

    // Adds the amount of time taken by work() to executionTimeMillis.
    ScopedTimer timer(&_commonStats.executionTimeMillis);

    if (isEOF()) {
        return PlanStage::IS_EOF;
    }
    invariant(_collection);  // If isEOF() returns false, we must have a collection.

    // It is possible that after a delete was executed, a WriteConflictException occurred
    // and prevented us from returning ADVANCED with the old version of the document.
    if (_idReturning != WorkingSet::INVALID_ID) {
        // We should only get here if we were trying to return something before.
        invariant(_params.returnDeleted);

        WorkingSetMember* member = _ws->get(_idReturning);
        invariant(member->getState() == WorkingSetMember::OWNED_OBJ);

        *out = _idReturning;
        _idReturning = WorkingSet::INVALID_ID;
        ++_commonStats.advanced;
        return PlanStage::ADVANCED;
    }

    // Either retry the last WSM we worked on or get a new one from our child.
    WorkingSetID id;
    StageState status;
    if (_idRetrying == WorkingSet::INVALID_ID) {
        status = child()->work(&id);
    } else {
        status = ADVANCED;
        id = _idRetrying;
        _idRetrying = WorkingSet::INVALID_ID;
    }

    if (PlanStage::ADVANCED == status) {
        WorkingSetMember* member = _ws->get(id);

        // We want to free this member when we return, unless we need to retry it.
        ScopeGuard memberFreer = MakeGuard(&WorkingSet::free, _ws, id);

        if (!member->hasLoc()) {
            // We expect to be here because of an invalidation causing a force-fetch, and
            // doc-locking storage engines do not issue invalidations.
            ++_specificStats.nInvalidateSkips;
            ++_commonStats.needTime;
            return PlanStage::NEED_TIME;
        }
        RecordId rloc = member->loc;
        // Deletes can't have projections. This means that covering analysis will always add
        // a fetch. We should always get fetched data, and never just key data.
        invariant(member->hasObj());

        try {
            // If the snapshot changed, then we have to make sure we have the latest copy of the
            // doc and that it still matches.
            std::unique_ptr<RecordCursor> cursor;
            if (getOpCtx()->recoveryUnit()->getSnapshotId() != member->obj.snapshotId()) {
                cursor = _collection->getCursor(getOpCtx());
                if (!WorkingSetCommon::fetch(getOpCtx(), _ws, id, cursor)) {
                    // Doc is already deleted. Nothing more to do.
                    ++_commonStats.needTime;
                    return PlanStage::NEED_TIME;
                }

                // Make sure the re-fetched doc still matches the predicate.
                if (_params.canonicalQuery &&
                    !_params.canonicalQuery->root()->matchesBSON(member->obj.value(), NULL)) {
                    // Doesn't match.
                    ++_commonStats.needTime;
                    return PlanStage::NEED_TIME;
                }
            }

            // Ensure that the BSONObj underlying the WorkingSetMember is owned because saveState()
            // is allowed to free the memory.
            if (_params.returnDeleted) {
                member->makeObjOwnedIfNeeded();
            }

            // TODO: Do we want to buffer docs and delete them in a group rather than
            // saving/restoring state repeatedly?
            try {
                if (supportsDocLocking()) {
                    // Doc-locking engines require this before saveState() since they don't use
                    // invalidations.
                    WorkingSetCommon::prepareForSnapshotChange(_ws);
                }
                child()->saveState();
            } catch (const WriteConflictException& wce) {
                // saveState() must not throw a write conflict; treat it as fatal.
                std::terminate();
            }

            if (_params.returnDeleted) {
                // Save a copy of the document that is about to get deleted.
                BSONObj deletedDoc = member->obj.value();
                member->obj.setValue(deletedDoc.getOwned());
                member->loc = RecordId();
                member->transitionToOwnedObj();
            }

            // Do the write, unless this is an explain.
            if (!_params.isExplain) {
                WriteUnitOfWork wunit(getOpCtx());
                _collection->deleteDocument(getOpCtx(), rloc);
                wunit.commit();
            }

            ++_specificStats.docsDeleted;
        } catch (const WriteConflictException& wce) {
            // Ensure that the BSONObj underlying the WorkingSetMember is owned because it may be
            // freed when we yield.
            member->makeObjOwnedIfNeeded();
            _idRetrying = id;
            memberFreer.Dismiss();  // Keep this member around so we can retry deleting it.
            *out = WorkingSet::INVALID_ID;
            _commonStats.needYield++;
            return NEED_YIELD;
        }

        // As restoreState may restore (recreate) cursors, cursors are tied to the
        // transaction in which they are created, and a WriteUnitOfWork is a
        // transaction, make sure to restore the state outside of the WritUnitOfWork.
        try {
            child()->restoreState();
        } catch (const WriteConflictException& wce) {
            // Note we don't need to retry anything in this case since the delete already
            // was committed. However, we still need to return the deleted document
            // (if it was requested).
            if (_params.returnDeleted) {
                // member->obj should refer to the deleted document.
                invariant(member->getState() == WorkingSetMember::OWNED_OBJ);

                _idReturning = id;
                // Keep this member around so that we can return it on the next work() call.
                memberFreer.Dismiss();
            }
            *out = WorkingSet::INVALID_ID;
            _commonStats.needYield++;
            return NEED_YIELD;
        }

        if (_params.returnDeleted) {
            // member->obj should refer to the deleted document.
            invariant(member->getState() == WorkingSetMember::OWNED_OBJ);

            memberFreer.Dismiss();  // Keep this member around so we can return it.
            *out = id;
            ++_commonStats.advanced;
            return PlanStage::ADVANCED;
        }

        ++_commonStats.needTime;
        return PlanStage::NEED_TIME;
    } else if (PlanStage::FAILURE == status || PlanStage::DEAD == status) {
        *out = id;
        // If a stage fails, it may create a status WSM to indicate why it failed, in which case
        // 'id' is valid. If ID is invalid, we create our own error message.
        if (WorkingSet::INVALID_ID == id) {
            const std::string errmsg = "delete stage failed to read in results from child";
            *out = WorkingSetCommon::allocateStatusMember(
                _ws, Status(ErrorCodes::InternalError, errmsg));
        }
        return status;
    } else if (PlanStage::NEED_TIME == status) {
        ++_commonStats.needTime;
    } else if (PlanStage::NEED_YIELD == status) {
        *out = id;
        ++_commonStats.needYield;
    }

    return status;
}
// Selects and instantiates the storage engine for this server, reconciling the
// configured engine with any metadata left in the dbpath by a previous run,
// then acquires the storage engine lock file and (re)writes the metadata file.
void ServiceContextMongoD::initializeGlobalStorageEngine() {
    // This should be set once.
    invariant(!_storageEngine);

    const std::string dbpath = storageGlobalParams.dbpath;
    if (auto existingStorageEngine = StorageEngineMetadata::getStorageEngineForPath(dbpath)) {
        if (storageGlobalParams.engineSetByUser) {
            // Verify that the name of the user-supplied storage engine matches the contents of
            // the metadata file.
            const StorageEngine::Factory* factory =
                mapFindWithDefault(_storageFactories,
                                   storageGlobalParams.engine,
                                   static_cast<const StorageEngine::Factory*>(nullptr));

            if (factory) {
                uassert(28662,
                        str::stream()
                            << "Cannot start server. Detected data files in " << dbpath
                            << " created by"
                            << " the '" << *existingStorageEngine << "' storage engine, but the"
                            << " specified storage engine was '" << factory->getCanonicalName()
                            << "'.",
                        factory->getCanonicalName() == *existingStorageEngine);
            }
        } else {
            // Otherwise set the active storage engine as the contents of the metadata file.
            log() << "Detected data files in " << dbpath << " created by the '"
                  << *existingStorageEngine << "' storage engine, so setting the active"
                  << " storage engine to '" << *existingStorageEngine << "'.";
            storageGlobalParams.engine = *existingStorageEngine;
        }
    } else if (!storageGlobalParams.engineSetByUser) {
        // Ensure the default storage engine is available with this build of mongod.
        uassert(28663,
                str::stream()
                    << "Cannot start server. The default storage engine '"
                    << storageGlobalParams.engine
                    << "' is not available with this build of mongod. Please specify a different"
                    << " storage engine explicitly, e.g. --storageEngine=mmapv1.",
                isRegisteredStorageEngine(storageGlobalParams.engine));
    }

    const StorageEngine::Factory* factory = _storageFactories[storageGlobalParams.engine];

    uassert(18656,
            str::stream() << "Cannot start server with an unknown storage engine: "
                          << storageGlobalParams.engine,
            factory);

    std::unique_ptr<StorageEngineMetadata> metadata = StorageEngineMetadata::forPath(dbpath);

    // Validate options in metadata against current startup options.
    if (metadata.get()) {
        uassertStatusOK(factory->validateMetadata(*metadata, storageGlobalParams));
    }

    try {
        _lockFile.reset(new StorageEngineLockFile(storageGlobalParams.dbpath));
    } catch (const std::exception& ex) {
        // Re-raise as a startup failure with a descriptive message.
        uassert(28596,
                str::stream() << "Unable to determine status of lock file in the data directory "
                              << storageGlobalParams.dbpath << ": " << ex.what(),
                false);
    }
    if (_lockFile->createdByUncleanShutdown()) {
        warning() << "Detected unclean shutdown - " << _lockFile->getFilespec()
                  << " is not empty.";
    }
    uassertStatusOK(_lockFile->open());

    // Close the lock file if engine creation/initialization throws below.
    ScopeGuard guard = MakeGuard(&StorageEngineLockFile::close, _lockFile.get());
    _storageEngine = factory->create(storageGlobalParams, *_lockFile);
    _storageEngine->finishInit();
    uassertStatusOK(_lockFile->writePid());

    // Write a new metadata file if it is not present.
    if (!metadata.get()) {
        metadata.reset(new StorageEngineMetadata(storageGlobalParams.dbpath));
        metadata->setStorageEngine(factory->getCanonicalName().toString());
        metadata->setStorageEngineOptions(factory->createMetadataOptions(storageGlobalParams));
        uassertStatusOK(metadata->write());
    }

    guard.Dismiss();

    _supportsDocLocking = _storageEngine->supportsDocLocking();
}
// Flushes cached collection size/count statistics to the backing WiredTiger
// size-storer table inside a single WT transaction. Only entries marked dirty
// (or whose live RecordStore stats diverge from the cache) are written; the
// transaction is rolled back if any write fails.
void WiredTigerSizeStorer::syncCache(bool syncToDisk) {
    stdx::lock_guard<stdx::mutex> cursorLock(_cursorMutex);
    _checkMagic();

    // Snapshot the dirty entries under _entriesMutex so the WiredTiger writes
    // below can run without holding it.
    Map myMap;
    {
        stdx::lock_guard<stdx::mutex> lk(_entriesMutex);
        for (Map::iterator it = _entries.begin(); it != _entries.end(); ++it) {
            std::string uriKey = it->first;
            Entry& entry = it->second;
            if (entry.rs) {
                // Refresh from the live RecordStore; mark dirty on any change.
                if (entry.dataSize != entry.rs->dataSize(NULL)) {
                    entry.dataSize = entry.rs->dataSize(NULL);
                    entry.dirty = true;
                }
                if (entry.numRecords != entry.rs->numRecords(NULL)) {
                    entry.numRecords = entry.rs->numRecords(NULL);
                    entry.dirty = true;
                }
            }

            if (!entry.dirty)
                continue;
            myMap[uriKey] = entry;
        }
    }

    if (myMap.empty())
        return;  // Nothing to do.

    WT_SESSION* session = _session.getSession();
    invariantWTOK(session->begin_transaction(session, syncToDisk ? "sync=true" : ""));
    // Roll the transaction back if any write below fails or throws.
    ScopeGuard rollbacker = MakeGuard(session->rollback_transaction, session, "");

    for (Map::iterator it = myMap.begin(); it != myMap.end(); ++it) {
        string uriKey = it->first;
        Entry& entry = it->second;

        // Serialize the stats for this uri as a small BSON document.
        BSONObj data;
        {
            BSONObjBuilder b;
            b.append("numRecords", entry.numRecords);
            b.append("dataSize", entry.dataSize);
            data = b.obj();
        }

        LOG(2) << "WiredTigerSizeStorer::storeInto " << uriKey << " -> " << redact(data);

        WiredTigerItem key(uriKey.c_str(), uriKey.size());
        WiredTigerItem value(data.objdata(), data.objsize());
        _cursor->set_key(_cursor, key.Get());
        _cursor->set_value(_cursor, value.Get());
        invariantWTOK(_cursor->insert(_cursor));
    }

    invariantWTOK(_cursor->reset(_cursor));
    rollbacker.Dismiss();
    invariantWTOK(session->commit_transaction(session, NULL));

    // Commit succeeded: mark all cached entries clean.
    {
        stdx::lock_guard<stdx::mutex> lk(_entriesMutex);
        for (Map::iterator it = _entries.begin(); it != _entries.end(); ++it) {
            it->second.dirty = false;
        }
    }
}
// Deletes the documents produced by the child stage one at a time, retrying an
// individual document on WriteConflictException and logging the delete via the
// op observer when _params.shouldCallLogOp is set.
PlanStage::StageState DeleteStage::work(WorkingSetID* out) {
    ++_commonStats.works;

    // Adds the amount of time taken by work() to executionTimeMillis.
    ScopedTimer timer(&_commonStats.executionTimeMillis);

    if (isEOF()) {
        return PlanStage::IS_EOF;
    }
    invariant(_collection);  // If isEOF() returns false, we must have a collection.

    // Either retry the last WSM we worked on or get a new one from our child.
    WorkingSetID id;
    StageState status;
    if (_idRetrying == WorkingSet::INVALID_ID) {
        status = _child->work(&id);
    } else {
        status = ADVANCED;
        id = _idRetrying;
        _idRetrying = WorkingSet::INVALID_ID;
    }

    if (PlanStage::ADVANCED == status) {
        WorkingSetMember* member = _ws->get(id);

        // We want to free this member when we return, unless we need to retry it.
        ScopeGuard memberFreer = MakeGuard(&WorkingSet::free, _ws, id);

        if (!member->hasLoc()) {
            // We expect to be here because of an invalidation causing a force-fetch, and
            // doc-locking storage engines do not issue invalidations.
            dassert(!supportsDocLocking());
            ++_specificStats.nInvalidateSkips;
            ++_commonStats.needTime;
            return PlanStage::NEED_TIME;
        }
        RecordId rloc = member->loc;

        try {
            // If the snapshot changed, then we have to make sure we have the latest copy of the
            // doc and that it still matches.
            if (_txn->recoveryUnit()->getSnapshotId() != member->obj.snapshotId()) {
                if (!WorkingSetCommon::fetch(_txn, member, _collection)) {
                    // Doc is already deleted. Nothing more to do.
                    ++_commonStats.needTime;
                    return PlanStage::NEED_TIME;
                }

                // Make sure the re-fetched doc still matches the predicate.
                if (_params.canonicalQuery &&
                    !_params.canonicalQuery->root()->matchesBSON(member->obj.value(), NULL)) {
                    // Doesn't match.
                    ++_commonStats.needTime;
                    return PlanStage::NEED_TIME;
                }
            }

            // TODO: Do we want to buffer docs and delete them in a group rather than
            // saving/restoring state repeatedly?
            try {
                _child->saveState();
                if (supportsDocLocking()) {
                    // Doc-locking engines require this after saveState() since they don't use
                    // invalidations.
                    WorkingSetCommon::prepareForSnapshotChange(_ws);
                }
            } catch ( const WriteConflictException& wce ) {
                // saveState() must not throw a write conflict; treat it as fatal.
                std::terminate();
            }

            // Do the write, unless this is an explain.
            if (!_params.isExplain) {
                WriteUnitOfWork wunit(_txn);

                const bool deleteCappedOK = false;
                const bool deleteNoWarn = false;
                // Only capture the deleted document when it must be logged.
                BSONObj deletedDoc;

                _collection->deleteDocument(_txn,
                                            rloc,
                                            deleteCappedOK,
                                            deleteNoWarn,
                                            _params.shouldCallLogOp ? &deletedDoc : NULL);

                if (_params.shouldCallLogOp) {
                    if (deletedDoc.isEmpty()) {
                        log() << "Deleted object without id in collection " << _collection->ns()
                              << ", not logging.";
                    } else {
                        getGlobalServiceContext()->getOpObserver()->onDelete(
                            _txn, _collection->ns().ns(), deletedDoc, _params.fromMigrate);
                    }
                }

                wunit.commit();
            }

            ++_specificStats.docsDeleted;
        } catch ( const WriteConflictException& wce ) {
            _idRetrying = id;
            memberFreer.Dismiss();  // Keep this member around so we can retry deleting it.
            *out = WorkingSet::INVALID_ID;
            _commonStats.needYield++;
            return NEED_YIELD;
        }

        // As restoreState may restore (recreate) cursors, cursors are tied to the
        // transaction in which they are created, and a WriteUnitOfWork is a
        // transaction, make sure to restore the state outside of the WritUnitOfWork.
        try {
            _child->restoreState(_txn);
        } catch ( const WriteConflictException& wce ) {
            // Note we don't need to retry anything in this case since the delete already
            // was committed.
            *out = WorkingSet::INVALID_ID;
            _commonStats.needYield++;
            return NEED_YIELD;
        }

        ++_commonStats.needTime;
        return PlanStage::NEED_TIME;
    } else if (PlanStage::FAILURE == status) {
        *out = id;
        // If a stage fails, it may create a status WSM to indicate why it failed, in which case
        // 'id' is valid. If ID is invalid, we create our own error message.
        if (WorkingSet::INVALID_ID == id) {
            const std::string errmsg = "delete stage failed to read in results from child";
            *out = WorkingSetCommon::allocateStatusMember(_ws,
                                                          Status(ErrorCodes::InternalError,
                                                                 errmsg));
            return PlanStage::FAILURE;
        }
        return status;
    } else if (PlanStage::NEED_TIME == status) {
        ++_commonStats.needTime;
    } else if (PlanStage::NEED_YIELD == status) {
        *out = id;
        ++_commonStats.needYield;
    }

    return status;
}
/**
 * Runs the aggregation described by 'request' (originally parsed from 'cmdObj') against
 * 'origNss', appending either a first-batch cursor response or explain output to 'result'.
 *
 * Special cases handled before the pipeline executes:
 *  - $changeStream pipelines are retargeted at the oplog namespace and, when no read concern
 *    was supplied, upgraded to 'majority' with a wait for a snapshot;
 *  - views are resolved into an expanded pipeline and re-run via a recursive call, with
 *    collection locks released first (recursive lock acquisition would prohibit yielding);
 *  - collectionless aggregations with no foreign namespaces take no collection lock at all,
 *    only an AutoStatsTracker for CurOp/Top reporting.
 *
 * Returns Status::OK() on success, or a non-OK Status for view/collation errors; may also
 * throw via uassertStatusOK / assertSupportsReadConcern.
 */
Status runAggregate(OperationContext* opCtx,
                    const NamespaceString& origNss,
                    const AggregationRequest& request,
                    const BSONObj& cmdObj,
                    BSONObjBuilder& result) {
    // For operations on views, this will be the underlying namespace.
    NamespaceString nss = request.getNamespaceString();

    // The collation to use for this aggregation. boost::optional to distinguish between the case
    // where the collation has not yet been resolved, and where it has been resolved to nullptr.
    boost::optional<std::unique_ptr<CollatorInterface>> collatorToUse;

    unique_ptr<PlanExecutor, PlanExecutor::Deleter> exec;
    boost::intrusive_ptr<ExpressionContext> expCtx;
    Pipeline* unownedPipeline;
    auto curOp = CurOp::get(opCtx);
    {
        const LiteParsedPipeline liteParsedPipeline(request);

        // Check whether the parsed pipeline supports the given read concern.
        liteParsedPipeline.assertSupportsReadConcern(opCtx, request.getExplain());

        if (liteParsedPipeline.hasChangeStream()) {
            nss = NamespaceString::kRsOplogNamespace;

            // If the read concern is not specified, upgrade to 'majority' and wait to make sure we
            // have a snapshot available.
            if (!repl::ReadConcernArgs::get(opCtx).hasLevel()) {
                const repl::ReadConcernArgs readConcern(
                    repl::ReadConcernLevel::kMajorityReadConcern);
                uassertStatusOK(waitForReadConcern(opCtx, readConcern, true));
            }

            if (!origNss.isCollectionlessAggregateNS()) {
                // AutoGetCollectionForReadCommand will raise an error if 'origNss' is a view.
                AutoGetCollectionForReadCommand origNssCtx(opCtx, origNss);

                // Resolve the collator to either the user-specified collation or the default
                // collation of the collection on which $changeStream was invoked, so that we do not
                // end up resolving the collation on the oplog.
                invariant(!collatorToUse);
                Collection* origColl = origNssCtx.getCollection();
                collatorToUse.emplace(resolveCollator(opCtx, request, origColl));
            }
        }

        const auto& pipelineInvolvedNamespaces = liteParsedPipeline.getInvolvedNamespaces();

        // If emplaced, AutoGetCollectionForReadCommand will throw if the sharding version for this
        // connection is out of date. If the namespace is a view, the lock will be released before
        // re-running the expanded aggregation.
        boost::optional<AutoGetCollectionForReadCommand> ctx;

        // If this is a collectionless aggregation, we won't create 'ctx' but will still need an
        // AutoStatsTracker to record CurOp and Top entries.
        boost::optional<AutoStatsTracker> statsTracker;

        // If this is a collectionless aggregation with no foreign namespaces, we don't want to
        // acquire any locks. Otherwise, lock the collection or view.
        if (nss.isCollectionlessAggregateNS() && pipelineInvolvedNamespaces.empty()) {
            statsTracker.emplace(opCtx, nss, Top::LockType::NotLocked, 0);
        } else {
            ctx.emplace(opCtx, nss, AutoGetCollection::ViewMode::kViewsPermitted);
        }

        // 'collection' stays null for collectionless aggregations.
        Collection* collection = ctx ? ctx->getCollection() : nullptr;

        // The collator may already have been set if this is a $changeStream pipeline. If not,
        // resolve the collator to either the user-specified collation or the collection default.
        if (!collatorToUse) {
            collatorToUse.emplace(resolveCollator(opCtx, request, collection));
        }

        // If this is a view, resolve it by finding the underlying collection and stitching view
        // pipelines and this request's pipeline together. We then release our locks before
        // recursively calling runAggregate(), which will re-acquire locks on the underlying
        // collection. (The lock must be released because recursively acquiring locks on the
        // database will prohibit yielding.)
        if (ctx && ctx->getView() && !liteParsedPipeline.startsWithCollStats()) {
            invariant(nss != NamespaceString::kRsOplogNamespace);
            invariant(!nss.isCollectionlessAggregateNS());

            // Check that the default collation of 'view' is compatible with the operation's
            // collation. The check is skipped if the request did not specify a collation.
            if (!request.getCollation().isEmpty()) {
                invariant(collatorToUse);  // Should already be resolved at this point.
                if (!CollatorInterface::collatorsMatch(ctx->getView()->defaultCollator(),
                                                       collatorToUse->get())) {
                    return {ErrorCodes::OptionNotSupportedOnView,
                            "Cannot override a view's default collation"};
                }
            }

            ViewShardingCheck::throwResolvedViewIfSharded(opCtx, ctx->getDb(), ctx->getView());

            auto resolvedView = ctx->getDb()->getViewCatalog()->resolveView(opCtx, nss);
            if (!resolvedView.isOK()) {
                return resolvedView.getStatus();
            }

            // With the view & collation resolved, we can relinquish locks.
            ctx.reset();

            // Parse the resolved view into a new aggregation request.
            auto newRequest = resolvedView.getValue().asExpandedViewAggregation(request);
            auto newCmd = newRequest.serializeToCommandObj().toBson();

            auto status = runAggregate(opCtx, origNss, newRequest, newCmd, result);
            {
                // Set the namespace of the curop back to the view namespace so ctx records
                // stats on this view namespace on destruction.
                stdx::lock_guard<Client> lk(*opCtx->getClient());
                curOp->setNS_inlock(nss.ns());
            }
            return status;
        }

        invariant(collatorToUse);
        expCtx.reset(
            new ExpressionContext(opCtx,
                                  request,
                                  std::move(*collatorToUse),
                                  std::make_shared<PipelineD::MongoDInterface>(opCtx),
                                  uassertStatusOK(resolveInvolvedNamespaces(opCtx, request))));
        expCtx->tempDir = storageGlobalParams.dbpath + "/_tmp";
        auto session = OperationContextSession::get(opCtx);
        expCtx->inSnapshotReadOrMultiDocumentTransaction =
            session && session->inSnapshotReadOrMultiDocumentTransaction();

        auto pipeline = uassertStatusOK(Pipeline::parse(request.getPipeline(), expCtx));

        // Check that the view's collation matches the collation of any views involved in the
        // pipeline.
        if (!pipelineInvolvedNamespaces.empty()) {
            invariant(ctx);
            auto pipelineCollationStatus = collatorCompatibleWithPipeline(
                opCtx, ctx->getDb(), expCtx->getCollator(), pipeline.get());
            if (!pipelineCollationStatus.isOK()) {
                return pipelineCollationStatus;
            }
        }

        pipeline->optimizePipeline();

        if (kDebugBuild && !expCtx->explain && !expCtx->fromMongos) {
            // Make sure all operations round-trip through Pipeline::serialize() correctly by
            // re-parsing every command in debug builds. This is important because sharded
            // aggregations rely on this ability. Skipping when fromMongos because this has
            // already been through the transformation (and this un-sets expCtx->fromMongos).
            pipeline = reparsePipeline(pipeline.get(), request, expCtx);
        }

        // Prepare a PlanExecutor to provide input into the pipeline, if needed.
        if (liteParsedPipeline.hasChangeStream()) {
            // If we are using a change stream, the cursor stage should have a simple collation,
            // regardless of what the user's collation was.
            std::unique_ptr<CollatorInterface> collatorForCursor = nullptr;
            auto collatorStash = expCtx->temporarilyChangeCollator(std::move(collatorForCursor));
            PipelineD::prepareCursorSource(collection, nss, &request, pipeline.get());
        } else {
            PipelineD::prepareCursorSource(collection, nss, &request, pipeline.get());
        }
        // Optimize again, since there may be additional optimizations that can be done after adding
        // the initial cursor stage. Note this has to be done outside the above blocks to ensure
        // this process uses the correct collation if it does any string comparisons.
        pipeline->optimizePipeline();

        // Transfer ownership of the Pipeline to the PipelineProxyStage.
        unownedPipeline = pipeline.get();
        auto ws = make_unique<WorkingSet>();
        auto proxy = make_unique<PipelineProxyStage>(opCtx, std::move(pipeline), ws.get());

        // This PlanExecutor will simply forward requests to the Pipeline, so does not need to
        // yield or to be registered with any collection's CursorManager to receive invalidations.
        // The Pipeline may contain PlanExecutors which *are* yielding PlanExecutors and which *are*
        // registered with their respective collection's CursorManager
        auto statusWithPlanExecutor =
            PlanExecutor::make(opCtx, std::move(ws), std::move(proxy), nss, PlanExecutor::NO_YIELD);
        invariant(statusWithPlanExecutor.isOK());
        exec = std::move(statusWithPlanExecutor.getValue());

        {
            auto planSummary = Explain::getPlanSummary(exec.get());
            stdx::lock_guard<Client> lk(*opCtx->getClient());
            curOp->setPlanSummary_inlock(std::move(planSummary));
        }
    }

    // Having released the collection lock, we can now create a cursor that returns results from the
    // pipeline. This cursor owns no collection state, and thus we register it with the global
    // cursor manager. The global cursor manager does not deliver invalidations or kill
    // notifications; the underlying PlanExecutor(s) used by the pipeline will be receiving
    // invalidations and kill notifications themselves, not the cursor we create here.
    ClientCursorParams cursorParams(
        std::move(exec),
        origNss,
        AuthorizationSession::get(opCtx->getClient())->getAuthenticatedUserNames(),
        opCtx->recoveryUnit()->getReadConcernLevel(),
        cmdObj);
    if (expCtx->tailableMode == TailableModeEnum::kTailableAndAwaitData) {
        cursorParams.setTailable(true);
        cursorParams.setAwaitData(true);
    }

    auto pin =
        CursorManager::getGlobalCursorManager()->registerCursor(opCtx, std::move(cursorParams));

    // Destroy the registered cursor on any path that does not explicitly keep it alive
    // (explain, errors, or a non-kept cursor response).
    ScopeGuard cursorFreer = MakeGuard(&ClientCursorPin::deleteUnderlying, &pin);

    // If both explain and cursor are specified, explain wins.
    if (expCtx->explain) {
        Explain::explainPipelineExecutor(
            pin.getCursor()->getExecutor(), *(expCtx->explain), &result);
    } else {
        // Cursor must be specified, if explain is not.
        const bool keepCursor =
            handleCursorCommand(opCtx, origNss, pin.getCursor(), request, result);
        if (keepCursor) {
            cursorFreer.Dismiss();
        }
    }

    if (!expCtx->explain) {
        PlanSummaryStats stats;
        Explain::getSummaryStats(*(pin.getCursor()->getExecutor()), &stats);
        curOp->debug().setPlanSummaryMetrics(stats);
        curOp->debug().nreturned = stats.nReturned;
    }

    // Any code that needs the cursor pinned must be inside the try block, above.
    return Status::OK();
}
/**
 * Callback invoked by the task executor when a remote batch (initial command or getMore)
 * completes for the remote at 'remoteIndex'.
 *
 * Runs entirely under '_mutex'. Responsibilities, in order:
 *  - clears the remote's callback handle (we are no longer awaiting this host);
 *  - if the merger is being killed, best-effort parses the cursor id (needed for killCursors)
 *    and, once no batches remain outstanding, schedules killCursors and marks kKillComplete;
 *  - on error, records a per-remote status, optionally retrying initial cursor establishment
 *    for retriable errors, and swallows the failure entirely when 'allowPartialResults' is set;
 *  - on success, buffers the returned documents (validating sort keys when merging by sort),
 *    updates merge-queue/tailable/EOF bookkeeping, and may immediately request the next batch.
 *
 * The 'signaller' ScopeGuard guarantees waiters on the current event are woken on every early
 * return; the explicit Dismiss + signal at the end preserves that on the success path too.
 */
void AsyncResultsMerger::handleBatchResponse(
    const executor::TaskExecutor::RemoteCommandCallbackArgs& cbData, size_t remoteIndex) {
    stdx::lock_guard<stdx::mutex> lk(_mutex);

    auto& remote = _remotes[remoteIndex];

    // Clear the callback handle. This indicates that we are no longer waiting on a response from
    // 'remote'.
    remote.cbHandle = executor::TaskExecutor::CallbackHandle();

    // If we're in the process of shutting down then there's no need to process the batch.
    if (_lifecycleState != kAlive) {
        invariant(_lifecycleState == kKillStarted);

        // Make sure to wake up anyone waiting on '_currentEvent' if we're shutting down.
        signalCurrentEventIfReady_inlock();

        // Make a best effort to parse the response and retrieve the cursor id. We need the cursor
        // id in order to issue a killCursors command against it.
        if (cbData.response.isOK()) {
            auto cursorResponse = parseCursorResponse(cbData.response.getValue().data, remote);
            if (cursorResponse.isOK()) {
                remote.cursorId = cursorResponse.getValue().getCursorId();
            }
        }

        // If we're killed and we're not waiting on any more batches to come back, then we are ready
        // to kill the cursors on the remote hosts and clean up this cursor. Schedule the
        // killCursors command and signal that this cursor is now safe to destroy. We have to
        // promise not to touch any members of this class because 'this' could become invalid as
        // soon as we signal the event.
        if (!haveOutstandingBatchRequests_inlock()) {
            // If the event handle is invalid, then the executor is in the middle of shutting down,
            // and we can't schedule any more work for it to complete.
            if (_killCursorsScheduledEvent.isValid()) {
                scheduleKillCursors_inlock();
                _executor->signalEvent(_killCursorsScheduledEvent);
            }

            _lifecycleState = kKillComplete;
        }

        return;
    }

    // Early return from this point on signal anyone waiting on an event, if ready() is true.
    ScopeGuard signaller = MakeGuard(&AsyncResultsMerger::signalCurrentEventIfReady_inlock, this);

    StatusWith<CursorResponse> cursorResponseStatus(
        cbData.response.isOK() ? parseCursorResponse(cbData.response.getValue().data, remote)
                               : cbData.response.getStatus());

    if (!cursorResponseStatus.isOK()) {
        auto shard = remote.getShard();
        if (!shard) {
            remote.status = Status(cursorResponseStatus.getStatus().code(),
                                   str::stream() << "Could not find shard " << *remote.shardId
                                                 << " containing host "
                                                 << remote.getTargetHost().toString());
        } else {
            shard->updateReplSetMonitor(remote.getTargetHost(), cursorResponseStatus.getStatus());

            // Retry initial cursor establishment if possible. Never retry getMores to avoid
            // accidentally skipping results.
            if (!remote.cursorId && remote.retryCount < kMaxNumFailedHostRetryAttempts &&
                shard->isRetriableError(cursorResponseStatus.getStatus().code(),
                                        Shard::RetryPolicy::kIdempotent)) {
                invariant(remote.shardId);
                LOG(1) << "Initial cursor establishment failed with retriable error and will be "
                          "retried"
                       << causedBy(redact(cursorResponseStatus.getStatus()));

                ++remote.retryCount;

                // Since we potentially updated the targeter that the last host it chose might be
                // faulty, the call below may end up getting a different host.
                remote.status = askForNextBatch_inlock(remoteIndex);
                if (remote.status.isOK()) {
                    return;
                }

                // If we end up here, it means we failed to schedule the retry request, which is a
                // more severe error that should not be retried. Just pass through to the error
                // handling logic below.
            } else {
                remote.status = cursorResponseStatus.getStatus();
            }
        }

        // Unreachable host errors are swallowed if the 'allowPartialResults' option is set. We
        // remove the unreachable host entirely from consideration by marking it as exhausted.
        if (_params.isAllowPartialResults) {
            remote.status = Status::OK();

            // Clear the results buffer and cursor id.
            std::queue<BSONObj> emptyBuffer;
            std::swap(remote.docBuffer, emptyBuffer);
            remote.cursorId = 0;
        }

        return;
    }

    // Cursor id successfully established.
    auto cursorResponse = std::move(cursorResponseStatus.getValue());
    remote.cursorId = cursorResponse.getCursorId();
    remote.initialCmdObj = boost::none;

    for (const auto& obj : cursorResponse.getBatch()) {
        // If there's a sort, we're expecting the remote node to give us back a sort key.
        if (!_params.sort.isEmpty() &&
            obj[ClusterClientCursorParams::kSortKeyField].type() != BSONType::Object) {
            remote.status = Status(ErrorCodes::InternalError,
                                   str::stream() << "Missing field '"
                                                 << ClusterClientCursorParams::kSortKeyField
                                                 << "' in document: "
                                                 << obj);
            return;
        }

        remote.docBuffer.push(obj);
        ++remote.fetchedCount;
    }

    // If we're doing a sorted merge, then we have to make sure to put this remote onto the
    // merge queue.
    if (!_params.sort.isEmpty() && !cursorResponse.getBatch().empty()) {
        _mergeQueue.push(remoteIndex);
    }

    // If the cursor is tailable and we just received an empty batch, the next return value should
    // be boost::none in order to indicate the end of the batch.
    if (_params.isTailable && !remote.hasNext()) {
        _eofNext = true;
    }

    // If even after receiving this batch we still don't have anything buffered (i.e. the batchSize
    // was zero), then can schedule work to retrieve the next batch right away.
    //
    // We do not ask for the next batch if the cursor is tailable, as batches received from remote
    // tailable cursors should be passed through to the client without asking for more batches.
    if (!_params.isTailable && !remote.hasNext() && !remote.exhausted()) {
        remote.status = askForNextBatch_inlock(remoteIndex);
        if (!remote.status.isOK()) {
            return;
        }
    }

    // ScopeGuard requires dismiss on success, but we want waiter to be signalled on success as
    // well as failure.
    signaller.Dismiss();
    signalCurrentEventIfReady_inlock();
}
/**
 * Executes the getMore command: locates the target cursor, generates the next batch from its
 * PlanExecutor, and appends a getMore response object to 'result'.
 *
 * Locking regimes (see the detailed comment below): normal cursors hold 'ctx' for the whole
 * call; global-cursor-manager cursors take no lock; agg cursors drop 'ctx' after pinning and
 * relock with 'unpinDBLock'/'unpinCollLock' only to unpin.
 *
 * Also handles: RecoveryUnit swap between cursor and OperationContext, leftover-maxTimeMS
 * rollover, and awaitData blocking on the capped-insert notifier when the first batch is empty
 * at EOF.
 *
 * Returns true on success; command errors are reported via appendCommandStatus.
 */
bool run(OperationContext* txn,
         const std::string& dbname,
         BSONObj& cmdObj,
         int options,
         std::string& errmsg,
         BSONObjBuilder& result) override {
    // Counted as a getMore, not as a command.
    globalOpCounters.gotGetMore();

    if (txn->getClient()->isInDirectClient()) {
        return appendCommandStatus(result,
                                   Status(ErrorCodes::IllegalOperation,
                                          "Cannot run getMore command from eval()"));
    }

    StatusWith<GetMoreRequest> parseStatus = GetMoreRequest::parseFromBSON(dbname, cmdObj);
    if (!parseStatus.isOK()) {
        return appendCommandStatus(result, parseStatus.getStatus());
    }
    const GetMoreRequest& request = parseStatus.getValue();

    // Depending on the type of cursor being operated on, we hold locks for the whole
    // getMore, or none of the getMore, or part of the getMore. The three cases in detail:
    //
    // 1) Normal cursor: we lock with "ctx" and hold it for the whole getMore.
    // 2) Cursor owned by global cursor manager: we don't lock anything. These cursors
    //    don't own any collection state.
    // 3) Agg cursor: we lock with "ctx", then release, then relock with "unpinDBLock" and
    //    "unpinCollLock". This is because agg cursors handle locking internally (hence the
    //    release), but the pin and unpin of the cursor must occur under the collection
    //    lock. We don't use our AutoGetCollectionForRead "ctx" to relock, because
    //    AutoGetCollectionForRead checks the sharding version (and we want the relock for
    //    the unpin to succeed even if the sharding version has changed).
    //
    // Note that we declare our locks before our ClientCursorPin, in order to ensure that
    // the pin's destructor is called before the lock destructors (so that the unpin occurs
    // under the lock).
    std::unique_ptr<AutoGetCollectionForRead> ctx;
    std::unique_ptr<Lock::DBLock> unpinDBLock;
    std::unique_ptr<Lock::CollectionLock> unpinCollLock;

    CursorManager* cursorManager;
    CursorManager* globalCursorManager = CursorManager::getGlobalCursorManager();
    if (globalCursorManager->ownsCursorId(request.cursorid)) {
        cursorManager = globalCursorManager;
    } else {
        ctx.reset(new AutoGetCollectionForRead(txn, request.nss));
        Collection* collection = ctx->getCollection();
        if (!collection) {
            return appendCommandStatus(result,
                                       Status(ErrorCodes::OperationFailed,
                                              "collection dropped between getMore calls"));
        }
        cursorManager = collection->getCursorManager();
    }

    ClientCursorPin ccPin(cursorManager, request.cursorid);
    ClientCursor* cursor = ccPin.c();
    if (!cursor) {
        // We didn't find the cursor.
        return appendCommandStatus(result,
                                   Status(ErrorCodes::CursorNotFound,
                                          str::stream() << "Cursor not found, cursor id: "
                                                        << request.cursorid));
    }

    if (request.nss.ns() != cursor->ns()) {
        return appendCommandStatus(result,
                                   Status(ErrorCodes::Unauthorized,
                                          str::stream()
                                              << "Requested getMore on namespace '"
                                              << request.nss.ns()
                                              << "', but cursor belongs to a different namespace"));
    }

    const bool hasOwnMaxTime = CurOp::get(txn)->isMaxTimeSet();

    // Validation related to awaitData.
    if (isCursorAwaitData(cursor)) {
        invariant(isCursorTailable(cursor));

        if (!hasOwnMaxTime) {
            Status status(ErrorCodes::BadValue,
                          str::stream() << "Must set maxTimeMS on a getMore if the initial "
                                        << "query had 'awaitData' set: " << cmdObj);
            return appendCommandStatus(result, status);
        }

        if (cursor->isAggCursor()) {
            Status status(ErrorCodes::BadValue,
                          "awaitData cannot be set on an aggregation cursor");
            return appendCommandStatus(result, status);
        }
    }

    // On early return, get rid of the cursor.
    ScopeGuard cursorFreer = MakeGuard(&GetMoreCmd::cleanupCursor, txn, &ccPin, request);

    if (!cursor->hasRecoveryUnit()) {
        // Start using a new RecoveryUnit.
        cursor->setOwnedRecoveryUnit(
            getGlobalServiceContext()->getGlobalStorageEngine()->newRecoveryUnit());
    }

    // Swap RecoveryUnit(s) between the ClientCursor and OperationContext.
    ScopedRecoveryUnitSwapper ruSwapper(cursor, txn);

    // Reset timeout timer on the cursor since the cursor is still in use.
    cursor->setIdleTime(0);

    // If there is no time limit set directly on this getMore command, but the operation
    // that spawned this cursor had a time limit set, then we have to apply any leftover
    // time to this getMore.
    if (!hasOwnMaxTime) {
        CurOp::get(txn)->setMaxTimeMicros(cursor->getLeftoverMaxTimeMicros());
    }
    txn->checkForInterrupt();  // May trigger maxTimeAlwaysTimeOut fail point.

    if (cursor->isAggCursor()) {
        // Agg cursors handle their own locking internally.
        ctx.reset();  // unlocks
    }

    PlanExecutor* exec = cursor->getExecutor();
    exec->restoreState(txn);

    // If we're tailing a capped collection, retrieve a monotonically increasing insert
    // counter.
    uint64_t lastInsertCount = 0;
    if (isCursorAwaitData(cursor)) {
        invariant(ctx->getCollection()->isCapped());
        lastInsertCount = ctx->getCollection()->getCappedInsertNotifier()->getCount();
    }

    CursorId respondWithId = 0;
    BSONArrayBuilder nextBatch;
    BSONObj obj;
    PlanExecutor::ExecState state;
    int numResults = 0;

    Status batchStatus = generateBatch(cursor, request, &nextBatch, &state, &numResults);
    if (!batchStatus.isOK()) {
        return appendCommandStatus(result, batchStatus);
    }

    // If this is an await data cursor, and we hit EOF without generating any results, then
    // we block waiting for new oplog data to arrive.
    if (isCursorAwaitData(cursor) && state == PlanExecutor::IS_EOF && numResults == 0) {
        // Retrieve the notifier which we will wait on until new data arrives. We make sure
        // to do this in the lock because once we drop the lock it is possible for the
        // collection to become invalid. The notifier itself will outlive the collection if
        // the collection is dropped, as we keep a shared_ptr to it.
        auto notifier = ctx->getCollection()->getCappedInsertNotifier();

        // Save the PlanExecutor and drop our locks.
        exec->saveState();
        ctx.reset();

        // Block waiting for data.
        Microseconds timeout(CurOp::get(txn)->getRemainingMaxTimeMicros());
        notifier->waitForInsert(lastInsertCount, timeout);
        notifier.reset();

        ctx.reset(new AutoGetCollectionForRead(txn, request.nss));
        exec->restoreState(txn);

        // We woke up because either the timed_wait expired, or there was more data. Either
        // way, attempt to generate another batch of results.
        batchStatus = generateBatch(cursor, request, &nextBatch, &state, &numResults);
        if (!batchStatus.isOK()) {
            return appendCommandStatus(result, batchStatus);
        }
    }

    if (shouldSaveCursorGetMore(state, exec, isCursorTailable(cursor))) {
        respondWithId = request.cursorid;

        exec->saveState();

        // If maxTimeMS was set directly on the getMore rather than being rolled over
        // from a previous find, then don't roll remaining micros over to the next
        // getMore.
        if (!hasOwnMaxTime) {
            cursor->setLeftoverMaxTimeMicros(CurOp::get(txn)->getRemainingMaxTimeMicros());
        }

        cursor->incPos(numResults);

        if (isCursorTailable(cursor) && state == PlanExecutor::IS_EOF) {
            // Rather than swapping their existing RU into the client cursor, tailable
            // cursors should get a new recovery unit.
            ruSwapper.dismiss();
        }
    } else {
        CurOp::get(txn)->debug().cursorExhausted = true;
    }

    appendGetMoreResponseObject(respondWithId, request.nss.ns(), nextBatch.arr(), &result);

    if (respondWithId) {
        cursorFreer.Dismiss();

        // If we are operating on an aggregation cursor, then we dropped our collection lock
        // earlier and need to reacquire it in order to clean up our ClientCursorPin.
        if (cursor->isAggCursor()) {
            invariant(NULL == ctx.get());
            unpinDBLock.reset(new Lock::DBLock(txn->lockState(), request.nss.db(), MODE_IS));
            unpinCollLock.reset(
                new Lock::CollectionLock(txn->lockState(), request.nss.ns(), MODE_IS));
        }
    }

    return true;
}
/**
 * Creates, configures, and binds one listening socket per address returned by ipToAddrs()
 * for this listener's ip/port, appending each successfully-bound socket to '_socks'.
 *
 * Per-address handling:
 *  - Unix domain sockets: unlink any stale socket file first, and after bind chmod the file
 *    to the configured permissions and register the path for cleanup;
 *  - IPv6 sockets: set IPV6_V6ONLY so the socket does not also accept IPv4-mapped
 *    connections (which would conflict with the separate IPv4 socket);
 *  - all (non-Windows): set SO_REUSEADDR.
 *
 * A ScopeGuard closes the socket on any early exit; it is dismissed only once the socket is
 * safely stored in '_socks'. '_setupSocketsSuccessful' is set to true only if every address
 * was bound (early returns on invalid address or bind failure leave it unset).
 */
void Listener::setupSockets() {
    checkTicketNumbers();

#if !defined(_WIN32)
    _mine = ipToAddrs(_ip.c_str(), _port, (!serverGlobalParams.noUnixSocket && useUnixSockets()));
#else
    _mine = ipToAddrs(_ip.c_str(), _port, false);
#endif

    for (std::vector<SockAddr>::const_iterator it = _mine.begin(), end = _mine.end(); it != end;
         ++it) {
        const SockAddr& me = *it;

        if (!me.isValid()) {
            error() << "listen(): socket is invalid." << endl;
            return;
        }

        SOCKET sock = ::socket(me.getType(), SOCK_STREAM, 0);
        // Close the socket on any early exit; dismissed once it is stored in '_socks'.
        ScopeGuard socketGuard = MakeGuard(&closesocket, sock);
        massert(15863,
                str::stream() << "listen(): invalid socket? " << errnoWithDescription(),
                sock >= 0);

        if (me.getType() == AF_UNIX) {
#if !defined(_WIN32)
            // Remove a stale socket file left by a previous process; only ENOENT is benign.
            if (unlink(me.getAddr().c_str()) == -1) {
                if (errno != ENOENT) {
                    error() << "Failed to unlink socket file " << me << " "
                            << errnoWithDescription(errno);
                    fassertFailedNoTrace(28578);
                }
            }
#endif
        } else if (me.getType() == AF_INET6) {
            // IPv6 can also accept IPv4 connections as mapped addresses (::ffff:127.0.0.1)
            // That causes a conflict if we don't do set it to IPV6_ONLY
            const int one = 1;
            setsockopt(sock, IPPROTO_IPV6, IPV6_V6ONLY, (const char*)&one, sizeof(one));
        }

#if !defined(_WIN32)
        {
            const int one = 1;
            if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one)) < 0)
                log() << "Failed to set socket opt, SO_REUSEADDR" << endl;
        }
#endif

        if (::bind(sock, me.raw(), me.addressSize) != 0) {
            int x = errno;
            error() << "listen(): bind() failed " << errnoWithDescription(x)
                    << " for socket: " << me.toString() << endl;
            if (x == EADDRINUSE)
                error() << "  addr already in use" << endl;
            return;
        }

#if !defined(_WIN32)
        if (me.getType() == AF_UNIX) {
            if (chmod(me.getAddr().c_str(), serverGlobalParams.unixSocketPermissions) == -1) {
                error() << "Failed to chmod socket file " << me << " "
                        << errnoWithDescription(errno);
                fassertFailedNoTrace(28582);
            }
            ListeningSockets::get()->addPath(me.getAddr());
        }
#endif

        _socks.push_back(sock);
        socketGuard.Dismiss();
    }

    _setupSocketsSuccessful = true;
}
/**
 * Performs one unit of update work: pulls (or retries) a single document from the child
 * stage, re-verifies it still matches, applies the update, and optionally prepares the
 * pre- or post-image for return to the caller.
 *
 * Retry protocol: on a WriteConflictException during match-checking or the update itself,
 * the WorkingSetMember is kept alive (memberFreer.Dismiss()) and the stage retries it via
 * prepareToRetryWSM(); '_idReturning' holds a document whose return was interrupted by a
 * WCE after the update already committed. Once the child is exhausted, a pending upsert
 * insert (if any) is performed in doInsert().
 *
 * Returns ADVANCED with '*out' set when a document is being returned, NEED_TIME/NEED_YIELD
 * for retries, IS_EOF when fully done, and FAILURE (with a status member) on child failure.
 */
PlanStage::StageState UpdateStage::doWork(WorkingSetID* out) {
    if (isEOF()) {
        return PlanStage::IS_EOF;
    }

    if (doneUpdating()) {
        // Even if we're done updating, we may have some inserting left to do.
        if (needInsert()) {
            // TODO we may want to handle WriteConflictException here. Currently we bounce it
            // out to a higher level since if this WCEs it is likely that we raced with another
            // upsert that may have matched our query, and therefore this may need to perform an
            // update rather than an insert. Bouncing to the higher level allows restarting the
            // query in this case.
            doInsert();

            invariant(isEOF());
            if (_params.request->shouldReturnNewDocs()) {
                // Want to return the document we just inserted, create it as a WorkingSetMember
                // so that we can return it.
                BSONObj newObj = _specificStats.objInserted;
                *out = _ws->allocate();
                WorkingSetMember* member = _ws->get(*out);
                member->obj = Snapshotted<BSONObj>(getOpCtx()->recoveryUnit()->getSnapshotId(),
                                                   newObj.getOwned());
                member->transitionToOwnedObj();
                return PlanStage::ADVANCED;
            }
        }

        // At this point either we're done updating and there was no insert to do,
        // or we're done updating and we're done inserting. Either way, we're EOF.
        invariant(isEOF());
        return PlanStage::IS_EOF;
    }

    // If we're here, then we still have to ask for results from the child and apply
    // updates to them. We should only get here if the collection exists.
    invariant(_collection);

    // It is possible that after an update was applied, a WriteConflictException
    // occurred and prevented us from returning ADVANCED with the requested version
    // of the document.
    if (_idReturning != WorkingSet::INVALID_ID) {
        // We should only get here if we were trying to return something before.
        invariant(_params.request->shouldReturnAnyDocs());

        WorkingSetMember* member = _ws->get(_idReturning);
        invariant(member->getState() == WorkingSetMember::OWNED_OBJ);

        *out = _idReturning;
        _idReturning = WorkingSet::INVALID_ID;
        return PlanStage::ADVANCED;
    }

    // Either retry the last WSM we worked on or get a new one from our child.
    WorkingSetID id;
    StageState status;
    if (_idRetrying == WorkingSet::INVALID_ID) {
        status = child()->work(&id);
    } else {
        status = ADVANCED;
        id = _idRetrying;
        _idRetrying = WorkingSet::INVALID_ID;
    }

    if (PlanStage::ADVANCED == status) {
        // Need to get these things from the result returned by the child.
        RecordId recordId;

        WorkingSetMember* member = _ws->get(id);

        // We want to free this member when we return, unless we need to retry updating or returning
        // it.
        ScopeGuard memberFreer = MakeGuard(&WorkingSet::free, _ws, id);

        if (!member->hasRecordId()) {
            // We expect to be here because of an invalidation causing a force-fetch.
            ++_specificStats.nInvalidateSkips;
            return PlanStage::NEED_TIME;
        }
        recordId = member->recordId;

        // Updates can't have projections. This means that covering analysis will always add
        // a fetch. We should always get fetched data, and never just key data.
        invariant(member->hasObj());

        // We fill this with the new RecordIds of moved doc so we don't double-update.
        if (_updatedRecordIds && _updatedRecordIds->count(recordId) > 0) {
            // Found a RecordId that refers to a document we had already updated. Note that
            // we can never remove from _updatedRecordIds because updates by other clients
            // could cause us to encounter a document again later.
            return PlanStage::NEED_TIME;
        }

        bool docStillMatches;
        try {
            docStillMatches = write_stage_common::ensureStillMatches(
                _collection, getOpCtx(), _ws, id, _params.canonicalQuery);
        } catch (const WriteConflictException&) {
            // There was a problem trying to detect if the document still exists, so retry.
            memberFreer.Dismiss();
            return prepareToRetryWSM(id, out);
        }

        if (!docStillMatches) {
            // Either the document has been deleted, or it has been updated such that it no longer
            // matches the predicate.
            if (shouldRestartUpdateIfNoLongerMatches(_params)) {
                throw WriteConflictException();
            }
            return PlanStage::NEED_TIME;
        }

        // Ensure that the BSONObj underlying the WorkingSetMember is owned because saveState()
        // is allowed to free the memory.
        member->makeObjOwnedIfNeeded();

        // Save state before making changes
        WorkingSetCommon::prepareForSnapshotChange(_ws);
        try {
            child()->saveState();
        } catch (const WriteConflictException&) {
            std::terminate();
        }

        // If we care about the pre-updated version of the doc, save it out here.
        BSONObj oldObj;
        if (_params.request->shouldReturnOldDocs()) {
            oldObj = member->obj.value().getOwned();
        }

        BSONObj newObj;
        try {
            // Do the update, get us the new version of the doc.
            newObj = transformAndUpdate(member->obj, recordId);
        } catch (const WriteConflictException&) {
            memberFreer.Dismiss();  // Keep this member around so we can retry updating it.
            return prepareToRetryWSM(id, out);
        }

        // Set member's obj to be the doc we want to return.
        if (_params.request->shouldReturnAnyDocs()) {
            if (_params.request->shouldReturnNewDocs()) {
                member->obj = Snapshotted<BSONObj>(getOpCtx()->recoveryUnit()->getSnapshotId(),
                                                   newObj.getOwned());
            } else {
                invariant(_params.request->shouldReturnOldDocs());
                member->obj.setValue(oldObj);
            }
            member->recordId = RecordId();
            member->transitionToOwnedObj();
        }

        // This should be after transformAndUpdate to make sure we actually updated this doc.
        ++_specificStats.nMatched;

        // Restore state after modification

        // As restoreState may restore (recreate) cursors, make sure to restore the
        // state outside of the WritUnitOfWork.
        try {
            child()->restoreState();
        } catch (const WriteConflictException&) {
            // Note we don't need to retry updating anything in this case since the update
            // already was committed. However, we still need to return the updated document
            // (if it was requested).
            if (_params.request->shouldReturnAnyDocs()) {
                // member->obj should refer to the document we want to return.
                invariant(member->getState() == WorkingSetMember::OWNED_OBJ);

                _idReturning = id;
                // Keep this member around so that we can return it on the next work() call.
                memberFreer.Dismiss();
            }
            *out = WorkingSet::INVALID_ID;
            return NEED_YIELD;
        }

        if (_params.request->shouldReturnAnyDocs()) {
            // member->obj should refer to the document we want to return.
            invariant(member->getState() == WorkingSetMember::OWNED_OBJ);

            memberFreer.Dismiss();  // Keep this member around so we can return it.
            *out = id;
            return PlanStage::ADVANCED;
        }

        return PlanStage::NEED_TIME;
    } else if (PlanStage::IS_EOF == status) {
        // The child is out of results, but we might not be done yet because we still might
        // have to do an insert.
        return PlanStage::NEED_TIME;
    } else if (PlanStage::FAILURE == status) {
        *out = id;
        // If a stage fails, it may create a status WSM to indicate why it failed, in which case
        // 'id' is valid. If ID is invalid, we create our own error message.
        if (WorkingSet::INVALID_ID == id) {
            const std::string errmsg = "update stage failed to read in results from child";
            *out = WorkingSetCommon::allocateStatusMember(
                _ws, Status(ErrorCodes::InternalError, errmsg));
            return PlanStage::FAILURE;
        }
        return status;
    } else if (PlanStage::NEED_YIELD == status) {
        *out = id;
    }

    return status;
}
/**
 * Callback invoked by the task executor when a getMore (or initial find) response arrives from
 * the remote at '_remotes[remoteIndex]'.
 *
 * Runs under '_mutex'. On any failure path, records the error in 'remote.status' and returns;
 * the ScopeGuard below then wakes up anyone waiting on the current event. On success, buffers
 * the returned documents and, if the buffer is still empty, immediately schedules the next
 * batch request.
 */
void AsyncResultsMerger::handleBatchResponse(
    const executor::TaskExecutor::RemoteCommandCallbackArgs& cbData, size_t remoteIndex) {
    stdx::lock_guard<stdx::mutex> lk(_mutex);

    auto& remote = _remotes[remoteIndex];

    // Clear the callback handle. This indicates that we are no longer waiting on a response from
    // 'remote'.
    remote.cbHandle = executor::TaskExecutor::CallbackHandle();

    // If we're in the process of shutting down then there's no need to process the batch.
    if (_lifecycleState != kAlive) {
        invariant(_lifecycleState == kKillStarted);

        // Make sure to wake up anyone waiting on '_currentEvent' if we're shutting down.
        signalCurrentEventIfReady_inlock();

        // If we're killed and we're not waiting on any more batches to come back, then we are
        // ready to kill the cursors on the remote hosts and clean up this cursor. Schedule the
        // killCursors command and signal that this cursor is now safe to destroy. We have to
        // promise not to touch any members of this class because 'this' could become invalid as
        // soon as we signal the event.
        if (!haveOutstandingBatchRequests_inlock()) {
            // If the event handle is invalid, then the executor is in the middle of shutting
            // down, and we can't schedule any more work for it to complete.
            if (_killCursorsScheduledEvent.isValid()) {
                scheduleKillCursors_inlock();
                _executor->signalEvent(_killCursorsScheduledEvent);
            }

            _lifecycleState = kKillComplete;
        }
        return;
    }

    // Early return from this point on signal anyone waiting on an event, if ready() is true.
    ScopeGuard signaller = MakeGuard(&AsyncResultsMerger::signalCurrentEventIfReady_inlock, this);

    if (!cbData.response.isOK()) {
        remote.status = cbData.response.getStatus();

        // If we failed to retrieve the batch because we couldn't contact the remote, we notify
        // the targeter that the host is unreachable. The caller can then retry on a new host.
        if (remote.status == ErrorCodes::HostUnreachable && remote.shardId) {
            auto shard = _params.shardRegistry->getShard(_params.txn, *remote.shardId);
            if (!shard) {
                remote.status = Status(ErrorCodes::HostUnreachable,
                                       str::stream() << "Could not find shard " << *remote.shardId
                                                     << " containing host "
                                                     << remote.hostAndPort.toString());
            } else {
                shard->getTargeter()->markHostUnreachable(remote.hostAndPort);
            }
        }

        return;
    }

    auto getMoreParseStatus = CursorResponse::parseFromBSON(cbData.response.getValue().data);
    if (!getMoreParseStatus.isOK()) {
        remote.status = getMoreParseStatus.getStatus();
        return;
    }

    auto cursorResponse = getMoreParseStatus.getValue();

    // If we have a cursor established, and we get a non-zero cursorid that is not equal to the
    // established cursorid, we will fail the operation.
    if (remote.cursorId && cursorResponse.cursorId != 0 &&
        *remote.cursorId != cursorResponse.cursorId) {
        remote.status = Status(ErrorCodes::BadValue,
                               str::stream() << "Expected cursorid " << *remote.cursorId
                                             << " but received " << cursorResponse.cursorId);
        return;
    }

    remote.cursorId = cursorResponse.cursorId;
    // The initial command has been sent; subsequent fetches use getMore, so the original command
    // object is no longer needed.
    remote.cmdObj = boost::none;

    for (const auto& obj : cursorResponse.batch) {
        // If there's a sort, we're expecting the remote node to give us back a sort key.
        if (!_params.sort.isEmpty() &&
            obj[ClusterClientCursorParams::kSortKeyField].type() != BSONType::Object) {
            remote.status = Status(ErrorCodes::InternalError,
                                   str::stream() << "Missing field '"
                                                 << ClusterClientCursorParams::kSortKeyField
                                                 << "' in document: " << obj);
            return;
        }

        remote.docBuffer.push(obj);
        ++remote.fetchedCount;
    }

    // If we're doing a sorted merge, then we have to make sure to put this remote onto the
    // merge queue.
    if (!_params.sort.isEmpty() && !cursorResponse.batch.empty()) {
        _mergeQueue.push(remoteIndex);
    }

    // If the cursor is tailable and we just received an empty batch, the next return value should
    // be boost::none in order to indicate the end of the batch.
    if (_params.isTailable && !remote.hasNext()) {
        _eofNext = true;
    }

    // If even after receiving this batch we still don't have anything buffered (i.e. the
    // batchSize was zero), then can schedule work to retrieve the next batch right away.
    //
    // We do not ask for the next batch if the cursor is tailable, as batches received from remote
    // tailable cursors should be passed through to the client without asking for more batches.
    if (!_params.isTailable && !remote.hasNext() && !remote.exhausted()) {
        auto nextBatchStatus = askForNextBatch_inlock(remoteIndex);
        if (!nextBatchStatus.isOK()) {
            remote.status = nextBatchStatus;
            return;
        }
    }

    // ScopeGuard requires dismiss on success, but we want waiter to be signalled on success as
    // well as failure.
    signaller.Dismiss();
    signalCurrentEventIfReady_inlock();
}
/**
 * Performs one unit of work for the delete stage: pulls (or retries) a WorkingSetMember from
 * the child, re-checks that the document still matches, deletes it inside a WriteUnitOfWork,
 * and — when '_params.returnDeleted' is set — returns the deleted document to the caller.
 *
 * WriteConflictException handling: retries are arranged by stashing the member id in
 * '_idRetrying' (before the delete) or '_idReturning' (after the delete committed but before we
 * could return the document) and yielding via NEED_YIELD.
 */
PlanStage::StageState DeleteStage::doWork(WorkingSetID* out) {
    if (isEOF()) {
        return PlanStage::IS_EOF;
    }
    invariant(_collection);  // If isEOF() returns false, we must have a collection.

    // It is possible that after a delete was executed, a WriteConflictException occurred
    // and prevented us from returning ADVANCED with the old version of the document.
    if (_idReturning != WorkingSet::INVALID_ID) {
        // We should only get here if we were trying to return something before.
        invariant(_params.returnDeleted);

        WorkingSetMember* member = _ws->get(_idReturning);
        invariant(member->getState() == WorkingSetMember::OWNED_OBJ);

        *out = _idReturning;
        _idReturning = WorkingSet::INVALID_ID;
        return PlanStage::ADVANCED;
    }

    // Either retry the last WSM we worked on or get a new one from our child.
    WorkingSetID id;
    if (_idRetrying != WorkingSet::INVALID_ID) {
        id = _idRetrying;
        _idRetrying = WorkingSet::INVALID_ID;
    } else {
        auto status = child()->work(&id);
        switch (status) {
            case PlanStage::ADVANCED:
                break;

            case PlanStage::FAILURE:
            case PlanStage::DEAD:
                // The stage which produces a failure is responsible for allocating a working set
                // member with error details.
                invariant(WorkingSet::INVALID_ID != id);
                *out = id;
                return status;

            case PlanStage::NEED_TIME:
                return status;

            case PlanStage::NEED_YIELD:
                *out = id;
                return status;

            case PlanStage::IS_EOF:
                return status;

            default:
                MONGO_UNREACHABLE;
        }
    }

    // We advanced, or are retrying, and id is set to the WSM to work on.
    WorkingSetMember* member = _ws->get(id);

    // We want to free this member when we return, unless we need to retry deleting or returning
    // it.
    ScopeGuard memberFreer = MakeGuard(&WorkingSet::free, _ws, id);

    invariant(member->hasRecordId());
    RecordId recordId = member->recordId;
    // Deletes can't have projections. This means that covering analysis will always add
    // a fetch. We should always get fetched data, and never just key data.
    invariant(member->hasObj());

    // Ensure the document still exists and matches the predicate.
    bool docStillMatches;
    try {
        docStillMatches = write_stage_common::ensureStillMatches(
            _collection, getOpCtx(), _ws, id, _params.canonicalQuery);
    } catch (const WriteConflictException&) {
        // There was a problem trying to detect if the document still exists, so retry.
        memberFreer.Dismiss();
        return prepareToRetryWSM(id, out);
    }

    if (!docStillMatches) {
        // Either the document has already been deleted, or it has been updated such that it no
        // longer matches the predicate.
        if (shouldRestartDeleteIfNoLongerMatches(_params)) {
            throw WriteConflictException();
        }
        return PlanStage::NEED_TIME;
    }

    // Ensure that the BSONObj underlying the WorkingSetMember is owned because saveState() is
    // allowed to free the memory.
    if (_params.returnDeleted) {
        // Save a copy of the document that is about to get deleted, but keep it in the
        // RID_AND_OBJ state in case we need to retry deleting it.
        BSONObj deletedDoc = member->obj.value();
        member->obj.setValue(deletedDoc.getOwned());
    }

    // TODO: Do we want to buffer docs and delete them in a group rather than saving/restoring
    // state repeatedly?

    WorkingSetCommon::prepareForSnapshotChange(_ws);
    try {
        child()->saveState();
    } catch (const WriteConflictException&) {
        // saveState() must not throw a write conflict; if it does, there is no safe way to
        // recover, so abort the process.
        std::terminate();
    }

    // Do the write, unless this is an explain.
    if (!_params.isExplain) {
        try {
            WriteUnitOfWork wunit(getOpCtx());
            _collection->deleteDocument(getOpCtx(),
                                        _params.stmtId,
                                        recordId,
                                        _params.opDebug,
                                        _params.fromMigrate,
                                        false,
                                        _params.returnDeleted ? Collection::StoreDeletedDoc::On
                                                              : Collection::StoreDeletedDoc::Off);
            wunit.commit();
        } catch (const WriteConflictException&) {
            memberFreer.Dismiss();  // Keep this member around so we can retry deleting it.
            return prepareToRetryWSM(id, out);
        }
    }
    ++_specificStats.docsDeleted;

    if (_params.returnDeleted) {
        // After deleting the document, the RecordId associated with this member is invalid.
        // Remove the 'recordId' from the WorkingSetMember before returning it.
        member->recordId = RecordId();
        member->transitionToOwnedObj();
    }

    // As restoreState may restore (recreate) cursors, cursors are tied to the transaction in
    // which they are created, and a WriteUnitOfWork is a transaction, make sure to restore the
    // state outside of the WriteUnitOfWork.
    try {
        child()->restoreState();
    } catch (const WriteConflictException&) {
        // Note we don't need to retry anything in this case since the delete already was
        // committed. However, we still need to return the deleted document (if it was requested).
        if (_params.returnDeleted) {
            // member->obj should refer to the deleted document.
            invariant(member->getState() == WorkingSetMember::OWNED_OBJ);

            _idReturning = id;
            // Keep this member around so that we can return it on the next work() call.
            memberFreer.Dismiss();
        }
        *out = WorkingSet::INVALID_ID;
        return NEED_YIELD;
    }

    if (_params.returnDeleted) {
        // member->obj should refer to the deleted document.
        invariant(member->getState() == WorkingSetMember::OWNED_OBJ);

        memberFreer.Dismiss();  // Keep this member around so we can return it.
        *out = id;
        return PlanStage::ADVANCED;
    }

    return PlanStage::NEED_TIME;
}
/**
 * Reads one wire-protocol message from the socket into 'm'.
 *
 * Returns true on success. Returns false (without throwing) when the peer sent an HTTP GET,
 * when the message length is out of range, or on a SocketException.
 *
 * Special first-packet handling: an HTTP GET gets a canned plain-text reply, and a packet whose
 * responseTo field is neither 0 nor -1 is treated as the start of an SSL handshake (when built
 * with SSL support), after which reading restarts via 'goto again'.
 */
bool MessagingPort::recv(Message& m) {
    try {
#ifdef MONGO_CONFIG_SSL
    again:
#endif
        // mmm( log() << "*  recv() sock:" << this->sock << endl; )
        MSGHEADER::Value header;
        int headerLen = sizeof(MSGHEADER::Value);
        psock->recv((char*)&header, headerLen);
        int len = header.constView().getMessageLength();

        // 542393671 is the first 4 bytes of "GET " interpreted as a little-endian int,
        // i.e. someone pointed an HTTP client at this port.
        if (len == 542393671) {
            // an http GET
            string msg =
                "It looks like you are trying to access MongoDB over HTTP on the native driver "
                "port.\n";
            LOG(psock->getLogLevel()) << msg;
            std::stringstream ss;
            ss << "HTTP/1.0 200 OK\r\nConnection: close\r\nContent-Type: "
                  "text/plain\r\nContent-Length: "
               << msg.size() << "\r\n\r\n"
               << msg;
            string s = ss.str();
            send(s.c_str(), s.size(), "http");
            return false;
        }
        // If responseTo is not 0 or -1 for first packet assume SSL
        else if (psock->isAwaitingHandshake()) {
#ifndef MONGO_CONFIG_SSL
            if (header.constView().getResponseTo() != 0 &&
                header.constView().getResponseTo() != -1) {
                uasserted(17133,
                          "SSL handshake requested, SSL feature not available in this build");
            }
#else
            if (header.constView().getResponseTo() != 0 &&
                header.constView().getResponseTo() != -1) {
                uassert(17132,
                        "SSL handshake received but server is started without SSL support",
                        sslGlobalParams.sslMode.load() != SSLParams::SSLMode_disabled);
                // The bytes already read are the start of the TLS handshake; hand them to the
                // SSL layer and then restart message reading on the now-encrypted socket.
                setX509SubjectName(
                    psock->doSSLHandshake(reinterpret_cast<const char*>(&header), sizeof(header)));
                psock->setHandshakeReceived();
                goto again;
            }
            uassert(17189,
                    "The server is configured to only allow SSL connections",
                    sslGlobalParams.sslMode.load() != SSLParams::SSLMode_requireSSL);
#endif  // MONGO_CONFIG_SSL
        }
        if (static_cast<size_t>(len) < sizeof(MSGHEADER::Value) ||
            static_cast<size_t>(len) > MaxMessageSizeBytes) {
            LOG(0) << "recv(): message len " << len << " is invalid. "
                   << "Min " << sizeof(MSGHEADER::Value) << " Max: " << MaxMessageSizeBytes;
            return false;
        }

        psock->setHandshakeReceived();

        // Round the allocation up to a multiple of 1024 (0xfffffc00 == ~1023).
        int z = (len + 1023) & 0xfffffc00;
        verify(z >= len);
        MsgData::View md = reinterpret_cast<char*>(mongolMalloc(z));
        // Free the buffer on any early exit (e.g. recv throwing) until ownership is handed to 'm'.
        ScopeGuard guard = MakeGuard(free, md.view2ptr());
        verify(md.view2ptr());

        // We already consumed the header bytes; copy them in and read only the remainder.
        memcpy(md.view2ptr(), &header, headerLen);
        int left = len - headerLen;

        psock->recv(md.data(), left);

        guard.Dismiss();
        // 'true' transfers ownership of the malloc'd buffer to the Message.
        m.setData(md.view2ptr(), true);
        return true;

    } catch (const SocketException& e) {
        logger::LogSeverity severity = psock->getLogLevel();
        if (!e.shouldPrint())
            severity = severity.lessSevere();
        LOG(severity) << "SocketException: remote: " << remote() << " error: " << e;
        m.reset();
        return false;
    }
}
//Attempt to atomically create the lock file at 'lockfilepath' and write housekeeping info into
//it. Returns false if the lock file already exists (someone else holds the lock); returns true
//once the file has been created and fully written. On any write failure the partially created
//lock file is removed via the scope guard. //throw FileError
bool tryLock(const Zstring& lockfilepath) //throw FileError
{
#ifdef ZEN_WIN
#ifdef TODO_MinFFS_ActivatePriviledge
    //privileges may be needed to access files on certain volumes; failure to acquire them is
    //non-fatal, so errors are deliberately swallowed here
    try { activatePrivilege(SE_BACKUP_NAME); }
    catch (const FileError&) {}
    try { activatePrivilege(SE_RESTORE_NAME); }
    catch (const FileError&) {}
#endif//TODO_MinFFS_ActivatePriviledge

    //CREATE_NEW makes creation atomic: it fails if the file already exists
    const HANDLE fileHandle = ::CreateFile(applyLongPathPrefix(lockfilepath).c_str(), //_In_      LPCTSTR lpFileName,
                                           //use both when writing over network, see comment in file_io.cpp
                                           GENERIC_READ | GENERIC_WRITE,              //_In_      DWORD dwDesiredAccess,
                                           FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, //_In_      DWORD dwShareMode,
                                           nullptr,                   //_In_opt_  LPSECURITY_ATTRIBUTES lpSecurityAttributes,
                                           CREATE_NEW,                //_In_      DWORD dwCreationDisposition,
                                           FILE_ATTRIBUTE_NORMAL | FILE_FLAG_BACKUP_SEMANTICS, //_In_      DWORD dwFlagsAndAttributes,
                                           nullptr);                  //_In_opt_  HANDLE hTemplateFile
    if (fileHandle == INVALID_HANDLE_VALUE)
    {
        const DWORD ec = ::GetLastError(); //copy before directly/indirectly making other system calls!
        if (ec == ERROR_FILE_EXISTS || //confirmed to be used
            ec == ERROR_ALREADY_EXISTS) //comment on msdn claims, this one is used on Windows Mobile 6
            return false;

        throw FileError(replaceCpy(_("Cannot write file %x."), L"%x", fmtPath(lockfilepath)), formatSystemError(L"CreateFile", ec));
    }
    ScopeGuard guardLockFile = zen::makeGuard([&] { removeFile(lockfilepath); });
    FileOutput fileOut(fileHandle, lockfilepath); //pass handle ownership

    //be careful to avoid CreateFile() + CREATE_ALWAYS on a hidden file -> see file_io.cpp
    //=> we don't need it that badly
    //::SetFileAttributes(applyLongPathPrefix(lockfilepath).c_str(), FILE_ATTRIBUTE_HIDDEN); //(try to) hide it

#elif defined ZEN_LINUX || defined ZEN_MAC
    const mode_t oldMask = ::umask(0); //important: we want the lock file to have exactly the permissions specified
    ZEN_ON_SCOPE_EXIT(::umask(oldMask));

    //O_EXCL contains a race condition on NFS file systems: http://linux.die.net/man/2/open
    const int fileHandle = ::open(lockfilepath.c_str(), O_CREAT | O_EXCL | O_WRONLY,
                                  S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH);
    if (fileHandle == -1)
    {
        if (errno == EEXIST)
            return false;
        else
            THROW_LAST_FILE_ERROR(replaceCpy(_("Cannot write file %x."), L"%x", fmtPath(lockfilepath)), L"open");
    }
    ScopeGuard guardLockFile = zen::makeGuard([&] { removeFile(lockfilepath); });
    FileOutput fileOut(fileHandle, lockfilepath); //pass handle ownership
#endif

    //write housekeeping info: user, process info, lock GUID
    ByteArray binStream = [&]
    {
        MemStreamOut streamOut;
        LockInformation(FromCurrentProcess()).toStream(streamOut);
        return streamOut.ref();
    }();

    if (!binStream.empty())
        fileOut.write(&*binStream.begin(), binStream.size()); //throw FileError

    guardLockFile.dismiss(); //lockfile created successfully
    return true;
}
void DoExceptionTests( void ) { ::std::cout << ::std::endl; { try { ::std::cout << "The function should be called." << ::std::endl; ScopeGuard guard = MakeGuard( &CalledAlways ); (void)guard; FunctionMightThrow( false ); } catch ( ... ) { ::std::cout << "Caught exception." << ::std::endl; assert( false ); } } ::std::cout << ::std::endl; { try { ::std::cout << "The function should be called." << ::std::endl; ScopeGuard guard = MakeGuard( &CalledAlways ); (void)guard; FunctionMightThrow( true ); assert( false ); } catch ( ... ) { ::std::cout << "Caught exception." << ::std::endl; } } ::std::cout << ::std::endl; { try { ::std::cout << "The function should be called." << ::std::endl; ScopeGuard guard = MakeGuard( &CalledIfNoException ); guard.SetExceptionPolicy( ScopeGuardImplBase::CallIfNoException ); FunctionMightThrow( false ); ::std::cout << "No exception thrown." << ::std::endl; } catch ( ... ) { ::std::cout << "Caught exception." << ::std::endl; assert( false ); } } ::std::cout << ::std::endl; { try { ::std::cout << "The function should not be called." << ::std::endl; ScopeGuard guard = MakeGuard( &CalledIfNoException ); guard.SetExceptionPolicy( ScopeGuardImplBase::CallIfNoException ); FunctionMightThrow( true ); assert( false ); } catch ( ... ) { ::std::cout << "Caught exception." << ::std::endl; } } ::std::cout << ::std::endl; { try { ::std::cout << "The function should not be called." << ::std::endl; ScopeGuard guard = MakeGuard( &CalledIfException ); guard.SetExceptionPolicy( ScopeGuardImplBase::CallIfException ); FunctionMightThrow( false ); ::std::cout << "No exception thrown." << ::std::endl; } catch ( ... ) { ::std::cout << "Caught exception." << ::std::endl; assert( false ); } } ::std::cout << ::std::endl; { try { ::std::cout << "The function should be called." << ::std::endl; ScopeGuard guard = MakeGuard( &CalledIfException ); guard.SetExceptionPolicy( ScopeGuardImplBase::CallIfException ); FunctionMightThrow( true ); assert( false ); } catch ( ... 
) { ::std::cout << "Caught exception." << ::std::endl; } } ::std::cout << ::std::endl; }
/**
 * Runs a query using the following steps:
 *   1) Parsing.
 *   2) Acquire locks.
 *   3) Plan query, obtaining an executor that can run it.
 *   4) Setup a cursor for the query, which may be used on subsequent getMores.
 *   5) Generate the first batch.
 *   6) Save state for getMore.
 *   7) Generate response to send to the client.
 *
 * TODO: Rather than using the sharding version available in thread-local storage (i.e. the
 *       call to ShardingState::needCollectionMetadata() below), shard version information
 *       should be passed as part of the command parameter.
 */
bool run(OperationContext* txn,
         const std::string& dbname,
         BSONObj& cmdObj,
         int options,
         std::string& errmsg,
         BSONObjBuilder& result) override {
    const std::string fullns = parseNs(dbname, cmdObj);
    const NamespaceString nss(fullns);
    if (!nss.isValid()) {
        return appendCommandStatus(result,
                                   {ErrorCodes::InvalidNamespace,
                                    str::stream() << "Invalid collection name: " << nss.ns()});
    }

    // Although it is a command, a find command gets counted as a query.
    globalOpCounters.gotQuery();

    if (txn->getClient()->isInDirectClient()) {
        return appendCommandStatus(
            result, Status(ErrorCodes::IllegalOperation, "Cannot run find command from eval()"));
    }

    // 1a) Parse the command BSON to a LiteParsedQuery.
    const bool isExplain = false;
    auto lpqStatus = LiteParsedQuery::makeFromFindCommand(nss, cmdObj, isExplain);
    if (!lpqStatus.isOK()) {
        return appendCommandStatus(result, lpqStatus.getStatus());
    }

    auto& lpq = lpqStatus.getValue();

    // Validate term, if provided.
    if (auto term = lpq->getReplicationTerm()) {
        auto replCoord = repl::ReplicationCoordinator::get(txn);
        Status status = replCoord->updateTerm(*term);
        // Note: updateTerm returns ok if term stayed the same.
        if (!status.isOK()) {
            return appendCommandStatus(result, status);
        }
    }

    // Fill out curop information.
    long long ntoreturn = lpq->getBatchSize().value_or(0);
    beginQueryOp(txn, nss, cmdObj, ntoreturn, lpq->getSkip());

    // 1b) Finish the parsing step by using the LiteParsedQuery to create a CanonicalQuery.
    WhereCallbackReal whereCallback(txn, nss.db());
    auto statusWithCQ = CanonicalQuery::canonicalize(lpq.release(), whereCallback);
    if (!statusWithCQ.isOK()) {
        return appendCommandStatus(result, statusWithCQ.getStatus());
    }
    std::unique_ptr<CanonicalQuery> cq = std::move(statusWithCQ.getValue());

    // 2) Acquire locks.
    AutoGetCollectionForRead ctx(txn, nss);
    Collection* collection = ctx.getCollection();

    const int dbProfilingLevel =
        ctx.getDb() ? ctx.getDb()->getProfilingLevel() : serverGlobalParams.defaultProfile;

    ShardingState* const shardingState = ShardingState::get(txn);

    // It is possible that the sharding version will change during yield while we are
    // retrieving a plan executor. If this happens we will throw an error and mongos will
    // retry.
    const ChunkVersion shardingVersionAtStart = shardingState->getVersion(nss.ns());

    // 3) Get the execution plan for the query.
    auto statusWithPlanExecutor =
        getExecutorFind(txn, collection, nss, std::move(cq), PlanExecutor::YIELD_AUTO);
    if (!statusWithPlanExecutor.isOK()) {
        return appendCommandStatus(result, statusWithPlanExecutor.getStatus());
    }

    std::unique_ptr<PlanExecutor> exec = std::move(statusWithPlanExecutor.getValue());

    // TODO: Currently, chunk ranges are kept around until all ClientCursors created while
    // the chunk belonged on this node are gone. Separating chunk lifetime management from
    // ClientCursor should allow this check to go away.
    if (!shardingState->getVersion(nss.ns()).isWriteCompatibleWith(shardingVersionAtStart)) {
        // Version changed while retrieving a PlanExecutor. Terminate the operation,
        // signaling that mongos should retry.
        throw SendStaleConfigException(nss.ns(),
                                       "version changed during find command",
                                       shardingVersionAtStart,
                                       shardingState->getVersion(nss.ns()));
    }

    if (!collection) {
        // No collection. Just fill out curop indicating that there were zero results and
        // there is no ClientCursor id, and then return.
        const long long numResults = 0;
        const CursorId cursorId = 0;
        endQueryOp(txn, *exec, dbProfilingLevel, numResults, cursorId);
        appendCursorResponseObject(cursorId, nss.ns(), BSONArray(), &result);
        return true;
    }

    const LiteParsedQuery& pq = exec->getCanonicalQuery()->getParsed();

    // 4) If possible, register the execution plan inside a ClientCursor, and pin that
    //    cursor. In this case, ownership of the PlanExecutor is transferred to the
    //    ClientCursor, and 'exec' becomes null.
    //
    // First unregister the PlanExecutor so it can be re-registered with ClientCursor.
    exec->deregisterExec();

    // Create a ClientCursor containing this plan executor. We don't have to worry
    // about leaking it as it's inserted into a global map by its ctor.
    ClientCursor* cursor =
        new ClientCursor(collection->getCursorManager(),
                         exec.release(),
                         nss.ns(),
                         txn->recoveryUnit()->isReadingFromMajorityCommittedSnapshot(),
                         pq.getOptions(),
                         pq.getFilter());
    CursorId cursorId = cursor->cursorid();
    ClientCursorPin ccPin(collection->getCursorManager(), cursorId);

    // On early return, get rid of the cursor.
    ScopeGuard cursorFreer = MakeGuard(&ClientCursorPin::deleteUnderlying, ccPin);

    invariant(!exec);
    PlanExecutor* cursorExec = cursor->getExecutor();

    // 5) Stream query results, adding them to a BSONArray as we go.
    BSONArrayBuilder firstBatch;
    BSONObj obj;
    PlanExecutor::ExecState state;
    long long numResults = 0;
    while (!enoughForFirstBatch(pq, numResults, firstBatch.len()) &&
           PlanExecutor::ADVANCED == (state = cursorExec->getNext(&obj, NULL))) {
        // If adding this object will cause us to exceed the BSON size limit, then we stash
        // it for later.
        if (firstBatch.len() + obj.objsize() > BSONObjMaxUserSize && numResults > 0) {
            cursorExec->enqueue(obj);
            break;
        }

        // Add result to output buffer.
        firstBatch.append(obj);
        numResults++;
    }

    // Throw an assertion if query execution fails for any reason.
    if (PlanExecutor::FAILURE == state || PlanExecutor::DEAD == state) {
        const std::unique_ptr<PlanStageStats> stats(cursorExec->getStats());
        error() << "Plan executor error during find command: " << PlanExecutor::statestr(state)
                << ", stats: " << Explain::statsToBSON(*stats);

        return appendCommandStatus(result,
                                   Status(ErrorCodes::OperationFailed,
                                          str::stream()
                                              << "Executor error during find command: "
                                              << WorkingSetCommon::toStatusString(obj)));
    }

    // 6) Set up the cursor for getMore.
    if (shouldSaveCursor(txn, collection, state, cursorExec)) {
        // State will be restored on getMore.
        cursorExec->saveState();
        cursorExec->detachFromOperationContext();

        cursor->setLeftoverMaxTimeMicros(CurOp::get(txn)->getRemainingMaxTimeMicros());
        cursor->setPos(numResults);
    } else {
        // Cursor id of 0 tells the client there is nothing more to fetch.
        cursorId = 0;
    }

    // Fill out curop based on the results.
    endQueryOp(txn, *cursorExec, dbProfilingLevel, numResults, cursorId);

    // 7) Generate the response object to send to the client.
    appendCursorResponseObject(cursorId, nss.ns(), firstBatch.arr(), &result);
    if (cursorId) {
        // The cursor outlives this command (getMore will use it), so don't free it.
        cursorFreer.Dismiss();
    }
    return true;
}
/**
 * Implements the createIndexes command: validates the requested index specs, creates the target
 * collection if needed, and builds all requested indexes via MultiIndexBlock (optionally in the
 * background, which temporarily downgrades the exclusive DB lock to an intent lock during the
 * bulk-insert phase).
 *
 * The 'lastOpSetterGuard' ensures that on any early-return/error path this client's last op is
 * advanced to the system's last op time; it is dismissed only on full success.
 */
virtual bool run(OperationContext* txn,
                 const string& dbname,
                 BSONObj& cmdObj,
                 int options,
                 string& errmsg,
                 BSONObjBuilder& result) {
    const NamespaceString ns(parseNs(dbname, cmdObj));

    Status status = userAllowedWriteNS(ns);
    if (!status.isOK())
        return appendCommandStatus(result, status);

    if (cmdObj["indexes"].type() != Array) {
        errmsg = "indexes has to be an array";
        result.append("cmdObj", cmdObj);
        return false;
    }

    std::vector<BSONObj> specs;
    {
        BSONObjIterator i(cmdObj["indexes"].Obj());
        while (i.more()) {
            BSONElement e = i.next();
            if (e.type() != Object) {
                errmsg = "everything in indexes has to be an Object";
                result.append("cmdObj", cmdObj);
                return false;
            }
            specs.push_back(e.Obj());
        }
    }

    if (specs.size() == 0) {
        errmsg = "no indexes to add";
        return false;
    }

    // check specs
    for (size_t i = 0; i < specs.size(); i++) {
        BSONObj spec = specs[i];
        // Default the 'ns' field to the command's namespace when the client omitted it.
        if (spec["ns"].eoo()) {
            spec = _addNsToSpec(ns, spec);
            specs[i] = spec;
        }

        if (spec["ns"].type() != String) {
            errmsg = "ns field must be a string";
            result.append("spec", spec);
            return false;
        }

        std::string nsFromUser = spec["ns"].String();
        if (nsFromUser.empty()) {
            errmsg = "ns field cannot be an empty string";
            result.append("spec", spec);
            return false;
        }

        if (ns != nsFromUser) {
            errmsg = str::stream() << "value of ns field '" << nsFromUser
                                   << "' doesn't match namespace " << ns.ns();
            result.append("spec", spec);
            return false;
        }
    }

    // now we know we have to create index(es)
    // Note: createIndexes command does not currently respect shard versioning.
    ScopedTransaction transaction(txn, MODE_IX);
    Lock::DBLock dbLock(txn->lockState(), ns.db(), MODE_X);
    if (!repl::getGlobalReplicationCoordinator()->canAcceptWritesFor(ns)) {
        return appendCommandStatus(
            result,
            Status(ErrorCodes::NotMaster,
                   str::stream() << "Not primary while creating indexes in " << ns.ns()));
    }

    Database* db = dbHolder().get(txn, ns.db());
    if (!db) {
        db = dbHolder().openDb(txn, ns.db());
    }

    Collection* collection = db->getCollection(ns.ns());
    if (collection) {
        result.appendBool("createdCollectionAutomatically", false);
    } else {
        // The target collection doesn't exist yet; implicitly create it before building indexes.
        MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
            WriteUnitOfWork wunit(txn);
            collection = db->createCollection(txn, ns.ns(), CollectionOptions());
            invariant(collection);
            wunit.commit();
        }
        MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "createIndexes", ns.ns());
        result.appendBool("createdCollectionAutomatically", true);
    }

    const int numIndexesBefore = collection->getIndexCatalog()->numIndexesTotal(txn);
    result.append("numIndexesBefore", numIndexesBefore);

    auto client = txn->getClient();
    // On early return, still advance this client's last op so the caller's write concern waits
    // against an up-to-date op time. Dismissed only on the full-success path below.
    ScopeGuard lastOpSetterGuard =
        MakeObjGuard(repl::ReplClientInfo::forClient(client),
                     &repl::ReplClientInfo::setLastOpToSystemLastOpTime,
                     txn);

    MultiIndexBlock indexer(txn, collection);
    indexer.allowBackgroundBuilding();
    indexer.allowInterruption();

    const size_t origSpecsSize = specs.size();
    indexer.removeExistingIndexes(&specs);

    if (specs.size() == 0) {
        result.append("numIndexesAfter", numIndexesBefore);
        result.append("note", "all indexes already exist");
        return true;
    }

    if (specs.size() != origSpecsSize) {
        result.append("note", "index already exists");
    }

    for (size_t i = 0; i < specs.size(); i++) {
        const BSONObj& spec = specs[i];
        if (spec["unique"].trueValue()) {
            status = checkUniqueIndexConstraints(txn, ns.ns(), spec["key"].Obj());

            if (!status.isOK()) {
                return appendCommandStatus(result, status);
            }
        }
        if (spec["v"].isNumber() && spec["v"].numberInt() == 0) {
            return appendCommandStatus(
                result,
                Status(ErrorCodes::CannotCreateIndex,
                       str::stream() << "illegal index specification: " << spec << ". "
                                     << "The option v:0 cannot be passed explicitly"));
        }
    }

    MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
        uassertStatusOK(indexer.init(specs));
    }
    MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "createIndexes", ns.ns());

    // If we're a background index, replace exclusive db lock with an intent lock, so that
    // other readers and writers can proceed during this phase.
    if (indexer.getBuildInBackground()) {
        txn->recoveryUnit()->abandonSnapshot();
        dbLock.relockWithMode(MODE_IX);
        // Re-check primary status: we may have stepped down while the lock was released.
        if (!repl::getGlobalReplicationCoordinator()->canAcceptWritesFor(ns)) {
            return appendCommandStatus(
                result,
                Status(ErrorCodes::NotMaster,
                       str::stream() << "Not primary while creating background indexes in "
                                     << ns.ns()));
        }
    }

    try {
        Lock::CollectionLock colLock(txn->lockState(), ns.ns(), MODE_IX);
        uassertStatusOK(indexer.insertAllDocumentsInCollection());
    } catch (const DBException& e) {
        invariant(e.getCode() != ErrorCodes::WriteConflict);
        // Must have exclusive DB lock before we clean up the index build via the
        // destructor of 'indexer'.
        if (indexer.getBuildInBackground()) {
            try {
                // This function cannot throw today, but we will preemptively prepare for
                // that day, to avoid data corruption due to lack of index cleanup.
                txn->recoveryUnit()->abandonSnapshot();
                dbLock.relockWithMode(MODE_X);
                if (!repl::getGlobalReplicationCoordinator()->canAcceptWritesFor(ns)) {
                    return appendCommandStatus(
                        result,
                        Status(ErrorCodes::NotMaster,
                               str::stream()
                                   << "Not primary while creating background indexes in "
                                   << ns.ns() << ": cleaning up index build failure due to "
                                   << e.toString()));
                }
            } catch (...) {
                std::terminate();
            }
        }
        throw;
    }
    // Need to return db lock back to exclusive, to complete the index build.
    if (indexer.getBuildInBackground()) {
        txn->recoveryUnit()->abandonSnapshot();
        dbLock.relockWithMode(MODE_X);

        uassert(ErrorCodes::NotMaster,
                str::stream() << "Not primary while completing index build in " << dbname,
                repl::getGlobalReplicationCoordinator()->canAcceptWritesFor(ns));

        // The db/collection may have been dropped while we held only an intent lock.
        Database* db = dbHolder().get(txn, ns.db());
        uassert(28551, "database dropped during index build", db);
        uassert(28552, "collection dropped during index build", db->getCollection(ns.ns()));
    }

    MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
        WriteUnitOfWork wunit(txn);

        indexer.commit();

        for (size_t i = 0; i < specs.size(); i++) {
            std::string systemIndexes = ns.getSystemIndexesCollection();
            getGlobalServiceContext()->getOpObserver()->onCreateIndex(
                txn, systemIndexes, specs[i]);
        }

        wunit.commit();
    }
    MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "createIndexes", ns.ns());

    result.append("numIndexesAfter", collection->getIndexCatalog()->numIndexesTotal(txn));

    // The index build wrote oplog entries of its own, so the guard's fallback is unnecessary.
    lastOpSetterGuard.Dismiss();

    return true;
}
/**
 * Executor callback invoked when the remote at '_remotes[remoteIndex]' answers (or fails to
 * answer) an outstanding batch request.
 *
 * Behavior, in order:
 *   - Clears the remote's callback handle (we are no longer waiting on this remote).
 *   - If this merger is being killed, optionally schedules killCursors and may complete the
 *     kill; must not touch 'this' after signalling (see inline comment).
 *   - On error: records the error on the remote, or swallows it when 'allowPartialResults'
 *     is set by marking the remote exhausted.
 *   - On success: stores the batch, and may immediately request the next batch (e.g. when
 *     batchSize was zero and nothing got buffered).
 *
 * Holds '_mutex' for the entire body; every early-return path signals waiters via the
 * 'signaller' ScopeGuard, and the success path signals explicitly after dismissing it.
 */
void AsyncResultsMerger::handleBatchResponse(
    const executor::TaskExecutor::RemoteCommandCallbackArgs& cbData,
    OperationContext* opCtx,
    size_t remoteIndex) {
    stdx::lock_guard<stdx::mutex> lk(_mutex);

    auto& remote = _remotes[remoteIndex];

    // Clear the callback handle. This indicates that we are no longer waiting on a response from
    // 'remote'.
    remote.cbHandle = executor::TaskExecutor::CallbackHandle();

    // If we're in the process of shutting down then there's no need to process the batch.
    if (_lifecycleState != kAlive) {
        invariant(_lifecycleState == kKillStarted);

        // Make sure to wake up anyone waiting on '_currentEvent' if we're shutting down.
        signalCurrentEventIfReady_inlock();

        // If we're killed and we're not waiting on any more batches to come back, then we are
        // ready to kill the cursors on the remote hosts and clean up this cursor. Schedule the
        // killCursors command and signal that this cursor is now safe to destroy. We have to
        // promise not to touch any members of this class because 'this' could become invalid as
        // soon as we signal the event.
        if (!haveOutstandingBatchRequests_inlock()) {
            // If the event handle is invalid, then the executor is in the middle of shutting down,
            // and we can't schedule any more work for it to complete.
            if (_killCursorsScheduledEvent.isValid()) {
                scheduleKillCursors_inlock(opCtx);
                _executor->signalEvent(_killCursorsScheduledEvent);
            }

            _lifecycleState = kKillComplete;
        }

        return;
    }

    // Early returns from this point on must signal anyone waiting on an event, if ready() is
    // true; the guard below does that automatically on scope exit unless dismissed.
    ScopeGuard signaller = MakeGuard(&AsyncResultsMerger::signalCurrentEventIfReady_inlock, this);

    // Either parse the successful reply or propagate the transport-level error as the status.
    StatusWith<CursorResponse> cursorResponseStatus(
        cbData.response.isOK() ? parseCursorResponse(cbData.response.data, remote)
                               : cbData.response.status);

    if (!cursorResponseStatus.isOK()) {
        auto shard = remote.getShard();
        if (!shard) {
            // Preserve the original error code but explain that the owning shard is unknown.
            remote.status = Status(cursorResponseStatus.getStatus().code(),
                                   str::stream() << "Could not find shard containing host "
                                                 << remote.getTargetHost().toString());
        } else {
            // Let the replica set monitor react to the failure (e.g. mark the host down).
            shard->updateReplSetMonitor(remote.getTargetHost(), cursorResponseStatus.getStatus());
            remote.status = cursorResponseStatus.getStatus();
        }

        // Unreachable host errors are swallowed if the 'allowPartialResults' option is set. We
        // remove the unreachable host entirely from consideration by marking it as exhausted.
        if (_params->isAllowPartialResults) {
            remote.status = Status::OK();

            // Clear the results buffer and cursor id.
            std::queue<ClusterQueryResult> emptyBuffer;
            std::swap(remote.docBuffer, emptyBuffer);
            remote.cursorId = 0;
        }

        return;
    }

    // Response successfully received.
    auto cursorResponse = std::move(cursorResponseStatus.getValue());

    // Update the cursorId; it is sent as '0' when the cursor has been exhausted on the shard.
    remote.cursorId = cursorResponse.getCursorId();

    // Save the batch in the remote's buffer.
    if (!addBatchToBuffer(remoteIndex, cursorResponse.getBatch())) {
        return;
    }

    // If the cursor is tailable and we just received an empty batch, the next return value should
    // be boost::none in order to indicate the end of the batch.
    // (Note: tailable cursors are only valid on unsharded collections, so the end of the batch
    // from one shard means the end of the overall batch).
    if (_params->isTailable && !remote.hasNext()) {
        _eofNext = true;
    }

    // If even after receiving this batch we still don't have anything buffered (i.e. the batchSize
    // was zero), then can schedule work to retrieve the next batch right away.
    //
    // We do not ask for the next batch if the cursor is tailable, as batches received from remote
    // tailable cursors should be passed through to the client without asking for more batches.
    if (!_params->isTailable && !remote.hasNext() && !remote.exhausted()) {
        remote.status = askForNextBatch_inlock(opCtx, remoteIndex);
        if (!remote.status.isOK()) {
            return;
        }
    }

    // ScopeGuard requires dismiss on success, but we want waiter to be signalled on success as
    // well as failure.
    signaller.Dismiss();
    signalCurrentEventIfReady_inlock();
}
/**
 * Performs one unit of work for the delete stage: pull (or retry) one WorkingSetMember from the
 * child, re-verify it against the predicate if the snapshot changed, delete the document inside
 * a WriteUnitOfWork, and — when 'returnDeleted' is set — hand the (owned) deleted document back
 * to the caller.
 *
 * WriteConflictException handling drives two pieces of retry state:
 *   - '_idRetrying'   : the member to re-attempt the delete on after a yield (WCE before the
 *                       delete committed).
 *   - '_idReturning'  : a member whose delete already committed but whose ADVANCED return was
 *                       pre-empted by a WCE during restoreState(); returned on the next call.
 *
 * Returns ADVANCED (only with 'returnDeleted'), NEED_TIME, NEED_YIELD, IS_EOF, or a failure
 * state propagated from the child.
 */
PlanStage::StageState DeleteStage::doWork(WorkingSetID* out) {
    if (isEOF()) {
        return PlanStage::IS_EOF;
    }
    invariant(_collection);  // If isEOF() returns false, we must have a collection.

    // It is possible that after a delete was executed, a WriteConflictException occurred
    // and prevented us from returning ADVANCED with the old version of the document.
    if (_idReturning != WorkingSet::INVALID_ID) {
        // We should only get here if we were trying to return something before.
        invariant(_params.returnDeleted);

        WorkingSetMember* member = _ws->get(_idReturning);
        invariant(member->getState() == WorkingSetMember::OWNED_OBJ);

        *out = _idReturning;
        _idReturning = WorkingSet::INVALID_ID;
        return PlanStage::ADVANCED;
    }

    // Either retry the last WSM we worked on or get a new one from our child.
    WorkingSetID id;
    if (_idRetrying != WorkingSet::INVALID_ID) {
        id = _idRetrying;
        _idRetrying = WorkingSet::INVALID_ID;
    } else {
        auto status = child()->work(&id);

        switch (status) {
            case PlanStage::ADVANCED:
                break;

            case PlanStage::FAILURE:
            case PlanStage::DEAD:
                *out = id;

                // If a stage fails, it may create a status WSM to indicate why it failed, in which
                // case 'id' is valid. If ID is invalid, we create our own error message.
                if (WorkingSet::INVALID_ID == id) {
                    const std::string errmsg = "delete stage failed to read in results from child";
                    *out = WorkingSetCommon::allocateStatusMember(
                        _ws, Status(ErrorCodes::InternalError, errmsg));
                }
                return status;

            case PlanStage::NEED_TIME:
                return status;

            case PlanStage::NEED_YIELD:
                *out = id;
                return status;

            case PlanStage::IS_EOF:
                return status;

            default:
                MONGO_UNREACHABLE;
        }
    }

    // We advanced, or are retrying, and id is set to the WSM to work on.
    WorkingSetMember* member = _ws->get(id);

    // We want to free this member when we return, unless we need to retry it.
    ScopeGuard memberFreer = MakeGuard(&WorkingSet::free, _ws, id);

    if (!member->hasRecordId()) {
        // We expect to be here because of an invalidation causing a force-fetch.
        ++_specificStats.nInvalidateSkips;
        return PlanStage::NEED_TIME;
    }
    RecordId recordId = member->recordId;
    // Deletes can't have projections. This means that covering analysis will always add
    // a fetch. We should always get fetched data, and never just key data.
    invariant(member->hasObj());

    try {
        // If the snapshot changed, then we have to make sure we have the latest copy of the
        // doc and that it still matches.
        std::unique_ptr<SeekableRecordCursor> cursor;
        if (getOpCtx()->recoveryUnit()->getSnapshotId() != member->obj.snapshotId()) {
            cursor = _collection->getCursor(getOpCtx());
            if (!WorkingSetCommon::fetch(getOpCtx(), _ws, id, cursor)) {
                // Doc is already deleted. Nothing more to do.
                return PlanStage::NEED_TIME;
            }

            // Make sure the re-fetched doc still matches the predicate.
            if (_params.canonicalQuery &&
                !_params.canonicalQuery->root()->matchesBSON(member->obj.value(), NULL)) {
                // Doesn't match.
                return PlanStage::NEED_TIME;
            }
        }

        // Ensure that the BSONObj underlying the WorkingSetMember is owned because saveState()
        // is allowed to free the memory.
        if (_params.returnDeleted) {
            // Save a copy of the document that is about to get deleted, but keep it in the
            // RID_AND_OBJ state in case we need to retry deleting it.
            BSONObj deletedDoc = member->obj.value();
            member->obj.setValue(deletedDoc.getOwned());
        }

        // TODO: Do we want to buffer docs and delete them in a group rather than
        // saving/restoring state repeatedly?

        try {
            WorkingSetCommon::prepareForSnapshotChange(_ws);
            child()->saveState();
        } catch (const WriteConflictException& wce) {
            // saveState() is not expected to throw here; treat a WCE as fatal rather than risk
            // continuing with inconsistent stage state.
            std::terminate();
        }

        // Do the write, unless this is an explain.
        if (!_params.isExplain) {
            WriteUnitOfWork wunit(getOpCtx());
            _collection->deleteDocument(getOpCtx(), recordId, _params.fromMigrate);
            wunit.commit();
        }

        ++_specificStats.docsDeleted;
    } catch (const WriteConflictException& wce) {
        // When we're doing a findAndModify with a sort, the sort will have a limit of 1, so will
        // not produce any more results even if there is another matching document. Re-throw the
        // WCE here so that these operations get another chance to find a matching document. The
        // findAndModify command should automatically retry if it gets a WCE.
        // TODO: this is not necessary if there was no sort specified.
        if (_params.returnDeleted) {
            throw;
        }
        _idRetrying = id;
        memberFreer.Dismiss();  // Keep this member around so we can retry deleting it.
        *out = WorkingSet::INVALID_ID;
        return NEED_YIELD;
    }

    if (_params.returnDeleted) {
        // After deleting the document, the RecordId associated with this member is invalid.
        // Remove the 'recordId' from the WorkingSetMember before returning it.
        member->recordId = RecordId();
        member->transitionToOwnedObj();
    }

    // As restoreState may restore (recreate) cursors, cursors are tied to the
    // transaction in which they are created, and a WriteUnitOfWork is a
    // transaction, make sure to restore the state outside of the WritUnitOfWork.
    try {
        child()->restoreState();
    } catch (const WriteConflictException& wce) {
        // Note we don't need to retry anything in this case since the delete already
        // was committed. However, we still need to return the deleted document
        // (if it was requested).
        if (_params.returnDeleted) {
            // member->obj should refer to the deleted document.
            invariant(member->getState() == WorkingSetMember::OWNED_OBJ);

            _idReturning = id;
            // Keep this member around so that we can return it on the next work() call.
            memberFreer.Dismiss();
        }
        *out = WorkingSet::INVALID_ID;
        return NEED_YIELD;
    }

    if (_params.returnDeleted) {
        // member->obj should refer to the deleted document.
        invariant(member->getState() == WorkingSetMember::OWNED_OBJ);

        memberFreer.Dismiss();  // Keep this member around so we can return it.
        *out = id;
        return PlanStage::ADVANCED;
    }

    return PlanStage::NEED_TIME;
}
/**
 * Implements the renameCollection command.
 *
 * Same-database renames are performed in a single WriteUnitOfWork via
 * Database::renameCollection(). Cross-database renames copy the data: create the target
 * collection (no _id index yet), copy all index specs with the 'ns' field rewritten, bulk-copy
 * every document through a MultiIndexBlock, then drop the source collection.
 *
 * Two ScopeGuards provide cleanup on any failure path and are dismissed only on success:
 *   - 'indexBuildRestorer'      : restarts index builds that were stopped on the source.
 *   - 'targetCollectionDropper' : drops the half-built target collection (cross-db path only).
 *
 * Runs under the global write lock. On failure, sets 'errmsg' (or appends a command status to
 * 'result') and returns false; replicates as a single 'renameCollection' oplog entry unless
 * 'fromRepl' is set.
 */
virtual bool run(OperationContext* txn,
                 const string& dbname,
                 BSONObj& cmdObj,
                 int,
                 string& errmsg,
                 BSONObjBuilder& result,
                 bool fromRepl) {
    Lock::GlobalWrite globalWriteLock(txn->lockState());
    string source = cmdObj.getStringField( name.c_str() );
    string target = cmdObj.getStringField( "to" );

    // We stay in source context the whole time. This is mostly to set the CurOp namespace.
    Client::Context ctx(txn, source);

    if ( !NamespaceString::validCollectionComponent(target.c_str()) ) {
        errmsg = "invalid collection name: " + target;
        return false;
    }
    if ( source.empty() || target.empty() ) {
        errmsg = "invalid command syntax";
        return false;
    }

    if (!fromRepl) {
        // If it got through on the master, need to allow it here too.
        Status sourceStatus = userAllowedWriteNS(source);
        if (!sourceStatus.isOK()) {
            errmsg = "error with source namespace: " + sourceStatus.reason();
            return false;
        }

        Status targetStatus = userAllowedWriteNS(target);
        if (!targetStatus.isOK()) {
            errmsg = "error with target namespace: " + targetStatus.reason();
            return false;
        }
    }

    if (NamespaceString(source).coll() == "system.indexes" ||
        NamespaceString(target).coll() == "system.indexes") {
        errmsg = "renaming system.indexes is not allowed";
        return false;
    }

    Database* const sourceDB = dbHolder().get(txn, nsToDatabase(source));
    Collection* const sourceColl = sourceDB ? sourceDB->getCollection(txn, source) : NULL;
    if (!sourceColl) {
        errmsg = "source namespace does not exist";
        return false;
    }

    {
        // Ensure that collection name does not exceed maximum length.
        // Ensure that index names do not push the length over the max.
        // Iterator includes unfinished indexes.
        IndexCatalog::IndexIterator sourceIndIt =
            sourceColl->getIndexCatalog()->getIndexIterator( txn, true );
        int longestIndexNameLength = 0;
        while ( sourceIndIt.more() ) {
            int thisLength = sourceIndIt.next()->indexName().length();
            if ( thisLength > longestIndexNameLength )
                longestIndexNameLength = thisLength;
        }

        unsigned int longestAllowed =
            min(int(NamespaceString::MaxNsCollectionLen),
                int(NamespaceString::MaxNsLen) - 2/*strlen(".$")*/ - longestIndexNameLength);
        if (target.size() > longestAllowed) {
            StringBuilder sb;
            sb << "collection name length of " << target.size()
               << " exceeds maximum length of " << longestAllowed
               << ", allowing for index names";
            errmsg = sb.str();
            return false;
        }
    }

    const std::vector<BSONObj> indexesInProg = stopIndexBuilds(txn, sourceDB, cmdObj);
    // Dismissed on success: restores the stopped index builds on any failure path.
    ScopeGuard indexBuildRestorer = MakeGuard(IndexBuilder::restoreIndexes, indexesInProg);

    Database* const targetDB = dbHolder().openDb(txn, nsToDatabase(target));

    {
        WriteUnitOfWork wunit(txn);

        // Check if the target namespace exists and if dropTarget is true.
        // If target exists and dropTarget is not true, return false.
        if (targetDB->getCollection(txn, target)) {
            if (!cmdObj["dropTarget"].trueValue()) {
                errmsg = "target namespace exists";
                return false;
            }

            Status s = targetDB->dropCollection(txn, target);
            if ( !s.isOK() ) {
                errmsg = s.toString();
                return false;
            }
        }

        // If we are renaming in the same database, just
        // rename the namespace and we're done.
        if (sourceDB == targetDB) {
            Status s = targetDB->renameCollection(txn, source, target,
                                                  cmdObj["stayTemp"].trueValue() );
            if (!s.isOK()) {
                return appendCommandStatus(result, s);
            }

            if (!fromRepl) {
                repl::logOp(txn, "c", (dbname + ".$cmd").c_str(), cmdObj);
            }

            wunit.commit();
            indexBuildRestorer.Dismiss();
            return true;
        }

        wunit.commit();
    }

    // If we get here, we are renaming across databases, so we must copy all the data and
    // indexes, then remove the source collection.

    // Create the target collection. It will be removed if we fail to copy the collection.
    // TODO use a temp collection and unset the temp flag on success.
    Collection* targetColl = NULL;
    {
        CollectionOptions options;
        // No _id index yet; indexes (including _id) are copied from the source below.
        options.setNoIdIndex();

        if (sourceColl->isCapped()) {
            const CollectionOptions sourceOpts =
                sourceColl->getCatalogEntry()->getCollectionOptions(txn);

            options.capped = true;
            options.cappedSize = sourceOpts.cappedSize;
            options.cappedMaxDocs = sourceOpts.cappedMaxDocs;
        }

        WriteUnitOfWork wunit(txn);

        // No logOp necessary because the entire renameCollection command is one logOp.
        targetColl = targetDB->createCollection(txn, target, options);
        if (!targetColl) {
            errmsg = "Failed to create target collection.";
            return false;
        }

        wunit.commit();
    }

    // Dismissed on success: drops the partially-copied target collection on failure.
    ScopeGuard targetCollectionDropper = MakeGuard(dropCollection, txn, targetDB, target);

    MultiIndexBlock indexer(txn, targetColl);
    indexer.allowInterruption();

    // Copy the index descriptions from the source collection, adjusting the ns field.
    {
        std::vector<BSONObj> indexesToCopy;
        IndexCatalog::IndexIterator sourceIndIt =
            sourceColl->getIndexCatalog()->getIndexIterator( txn, true );
        while (sourceIndIt.more()) {
            const BSONObj currIndex = sourceIndIt.next()->infoObj();

            // Process the source index: force 'ns' to the target namespace, keep all other
            // fields from the original spec.
            BSONObjBuilder newIndex;
            newIndex.append("ns", target);
            newIndex.appendElementsUnique(currIndex);
            indexesToCopy.push_back(newIndex.obj());
        }
        indexer.init(indexesToCopy);
    }

    {
        // Copy over all the data from source collection to target collection.
        boost::scoped_ptr<RecordIterator> sourceIt(sourceColl->getIterator(txn));
        while (!sourceIt->isEOF()) {
            txn->checkForInterrupt(false);

            const BSONObj obj = sourceColl->docFor(txn, sourceIt->getNext());

            WriteUnitOfWork wunit(txn);
            // No logOp necessary because the entire renameCollection command is one logOp.
            Status status = targetColl->insertDocument(txn, obj, &indexer, true).getStatus();
            if (!status.isOK())
                return appendCommandStatus(result, status);
            wunit.commit();
        }
    }

    Status status = indexer.doneInserting();
    if (!status.isOK())
        return appendCommandStatus(result, status);

    {
        // Getting here means we successfully built the target copy. We now remove the
        // source collection and finalize the rename.
        WriteUnitOfWork wunit(txn);

        Status status = sourceDB->dropCollection(txn, source);
        if (!status.isOK())
            return appendCommandStatus(result, status);

        indexer.commit();

        if (!fromRepl) {
            repl::logOp(txn, "c", (dbname + ".$cmd").c_str(), cmdObj);
        }

        wunit.commit();
    }

    indexBuildRestorer.Dismiss();
    targetCollectionDropper.Dismiss();
    return true;
}