boost::optional<Date_t> CollectionRangeDeleter::cleanUpNextRange(
    OperationContext* opCtx,
    NamespaceString const& nss,
    OID const& epoch,
    int maxToDelete,
    CollectionRangeDeleter* forTestOnly) {

    StatusWith<int> wrote = 0;

    auto range = boost::optional<ChunkRange>(boost::none);
    auto notification = DeleteNotification();

    {
        UninterruptibleLockGuard noInterrupt(opCtx->lockState());
        AutoGetCollection autoColl(opCtx, nss, MODE_IX);

        auto* const collection = autoColl.getCollection();
        auto* const css = CollectionShardingRuntime::get(opCtx, nss);
        auto& metadataManager = css->_metadataManager;
        auto* const self = forTestOnly ? forTestOnly : &metadataManager->_rangesToClean;

        const auto scopedCollectionMetadata =
            metadataManager->getActiveMetadata(metadataManager, boost::none);

        if (!scopedCollectionMetadata) {
            LOG(0) << "Abandoning any range deletions because the metadata for " << nss.ns()
                   << " was reset";
            stdx::lock_guard<stdx::mutex> lk(css->_metadataManager->_managerLock);
            css->_metadataManager->_clearAllCleanups(lk);
            return boost::none;
        }

        const auto& metadata = *scopedCollectionMetadata;

        if (!forTestOnly && (!collection || !metadata->isSharded())) {
            if (!collection) {
                LOG(0) << "Abandoning any range deletions left over from dropped " << nss.ns();
            } else {
                LOG(0) << "Abandoning any range deletions left over from previously sharded"
                       << nss.ns();
            }

            stdx::lock_guard<stdx::mutex> lk(css->_metadataManager->_managerLock);
            css->_metadataManager->_clearAllCleanups(lk);
            return boost::none;
        }

        if (!forTestOnly && metadata->getCollVersion().epoch() != epoch) {
            LOG(1) << "Range deletion task for " << nss.ns() << " epoch " << epoch << " woke;"
                   << " (current is " << metadata->getCollVersion() << ")";
            return boost::none;
        }

        bool writeOpLog = false;

        {
            stdx::lock_guard<stdx::mutex> scopedLock(css->_metadataManager->_managerLock);
            if (self->isEmpty()) {
                LOG(1) << "No further range deletions scheduled on " << nss.ns();
                return boost::none;
            }

            auto& orphans = self->_orphans;
            if (orphans.empty()) {
                // We have delayed deletions; see if any are ready.
                auto& df = self->_delayedOrphans.front();
                if (df.whenToDelete > Date_t::now()) {
                    LOG(0) << "Deferring deletion of " << nss.ns() << " range "
                           << redact(df.range.toString()) << " until " << df.whenToDelete;
                    return df.whenToDelete;
                }

                // Move a single range from _delayedOrphans to _orphans
                orphans.splice(orphans.end(), self->_delayedOrphans, self->_delayedOrphans.begin());
                LOG(1) << "Proceeding with deferred deletion of " << nss.ns() << " range "
                       << redact(orphans.front().range.toString());

                writeOpLog = true;
            }

            invariant(!orphans.empty());
            const auto& frontRange = orphans.front().range;
            range.emplace(frontRange.getMin().getOwned(), frontRange.getMax().getOwned());
            notification = orphans.front().notification;
        }

        invariant(range);

        if (writeOpLog) {
            // Secondaries will watch for this update, and kill any queries that may depend on
            // documents in the range -- excepting any queries with a read-concern option
            // 'ignoreChunkMigration'
            try {
                AutoGetCollection autoAdmin(
                    opCtx, NamespaceString::kServerConfigurationNamespace, MODE_IX);

                Helpers::upsert(opCtx,
                                NamespaceString::kServerConfigurationNamespace.ns(),
                                BSON("_id"
                                     << "startRangeDeletion"
                                     << "ns"
                                     << nss.ns()
                                     << "epoch"
                                     << epoch
                                     << "min"
                                     << range->getMin()
                                     << "max"
                                     << range->getMax()));
            } catch (const DBException& e) {
                stdx::lock_guard<stdx::mutex> scopedLock(css->_metadataManager->_managerLock);
                css->_metadataManager->_clearAllCleanups(
                    scopedLock,
                    e.toStatus("cannot push startRangeDeletion record to Op Log,"
                               " abandoning scheduled range deletions"));
                return boost::none;
            }
        }

        try {
            wrote = self->_doDeletion(
                opCtx, collection, metadata->getKeyPattern(), *range, maxToDelete);
        } catch (const DBException& e) {
            wrote = e.toStatus();
            warning() << e.what();
        }
    }  // drop autoColl

    if (!wrote.isOK() || wrote.getValue() == 0) {
        if (wrote.isOK()) {
            LOG(0) << "No documents remain to delete in " << nss << " range "
                   << redact(range->toString());
        }

        // Wait for majority replication even when wrote isn't OK or == 0, because it might have
        // been OK and/or > 0 previously, and the deletions must be persistent before notifying
        // clients in _pop().

        LOG(0) << "Waiting for majority replication of local deletions in " << nss.ns() << " range "
               << redact(range->toString());

        repl::ReplClientInfo::forClient(opCtx->getClient()).setLastOpToSystemLastOpTime(opCtx);
        const auto clientOpTime = repl::ReplClientInfo::forClient(opCtx->getClient()).getLastOp();

        // Wait for replication outside the lock
        const auto status = [&] {
            try {
                WriteConcernResult unusedWCResult;
                return waitForWriteConcern(
                    opCtx, clientOpTime, kMajorityWriteConcern, &unusedWCResult);
            } catch (const DBException& e) {
                return e.toStatus();
            }
        }();

        // Get the lock again to finish off this range (including notifying, if necessary).
        // Don't allow lock interrupts while cleaning up.
        UninterruptibleLockGuard noInterrupt(opCtx->lockState());
        AutoGetCollection autoColl(opCtx, nss, MODE_IX);
        auto* const css = CollectionShardingRuntime::get(opCtx, nss);
        auto& metadataManager = css->_metadataManager;
        auto* const self = forTestOnly ? forTestOnly : &metadataManager->_rangesToClean;

        stdx::lock_guard<stdx::mutex> scopedLock(css->_metadataManager->_managerLock);

        if (!status.isOK()) {
            LOG(0) << "Error when waiting for write concern after removing " << nss << " range "
                   << redact(range->toString()) << " : " << redact(status.reason());

            // If the range was already popped (e.g. by a drop of nss during the
            // waitForWriteConcern above), its notification would have been triggered, so this
            // check suffices to ensure that it is safe to pop the range here
            if (!notification.ready()) {
                invariant(!self->isEmpty() && self->_orphans.front().notification == notification);
                LOG(0) << "Abandoning deletion of latest range in " << nss.ns() << " after local "
                       << "deletions because of replication failure";
                self->_pop(status);
            }
        } else {
            LOG(0) << "Finished deleting documents in " << nss.ns() << " range "
                   << redact(range->toString());

            self->_pop(wrote.getStatus());
        }

        if (!self->_orphans.empty()) {
            LOG(1) << "Deleting " << nss.ns() << " range "
                   << redact(self->_orphans.front().range.toString()) << " next.";
        }

        return Date_t::now() + stdx::chrono::milliseconds{rangeDeleterBatchDelayMS.load()};
    }

    invariant(range);
    invariant(wrote.getStatus());
    invariant(wrote.getValue() > 0);

    notification.abandon();
    return Date_t::now() + stdx::chrono::milliseconds{rangeDeleterBatchDelayMS.load()};
}
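
cleanUpNextRange() returns boost::optional<Date_t>: boost::none means the task should not be rescheduled, while a date tells the caller when to run the next batch. Below is a minimal, self-contained sketch of that re-arming contract, using std::optional and a hypothetical stub in place of the real cleaner:

#include <chrono>
#include <optional>
#include <thread>

using Clock = std::chrono::steady_clock;

// Hypothetical stand-in for cleanUpNextRange(): returns the next wake time,
// or std::nullopt once no further deletions are scheduled.
std::optional<Clock::time_point> cleanUpNextRangeStub() {
    static int batchesLeft = 3;
    if (batchesLeft-- <= 0)
        return std::nullopt;                               // nothing left to do
    return Clock::now() + std::chrono::milliseconds(10);  // next batch delay
}

int main() {
    // Re-arm the task until the cleaner reports there is nothing left to delete.
    while (auto nextWake = cleanUpNextRangeStub())
        std::this_thread::sleep_until(*nextWake);
}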
Example #2
StatusWith<bool> FTDCFileReader::hasNext() {
    while (true) {
        if (_state == State::kNeedsDoc) {
            if (_stream.eof()) {
                return {false};
            }

            auto swDoc = readDocument();
            if (!swDoc.isOK()) {
                return swDoc.getStatus();
            }

            if (swDoc.getValue().isEmpty()) {
                return {false};
            }

            _parent = swDoc.getValue();

            // metadata or metrics?
            auto swType = FTDCBSONUtil::getBSONDocumentType(_parent);
            if (!swType.isOK()) {
                return swType.getStatus();
            }

            auto swId = FTDCBSONUtil::getBSONDocumentId(_parent);
            if (!swId.isOK()) {
                return swId.getStatus();
            }

            _dateId = swId.getValue();

            FTDCBSONUtil::FTDCType type = swType.getValue();

            if (type == FTDCBSONUtil::FTDCType::kMetadata) {
                _state = State::kMetadataDoc;

                auto swMetadata = FTDCBSONUtil::getBSONDocumentFromMetadataDoc(_parent);
                if (!swMetadata.isOK()) {
                    return swMetadata.getStatus();
                }

                _metadata = swMetadata.getValue();
            } else if (type == FTDCBSONUtil::FTDCType::kMetricChunk) {
                _state = State::kMetricChunk;

                auto swDocs = FTDCBSONUtil::getMetricsFromMetricDoc(_parent, &_decompressor);
                if (!swDocs.isOK()) {
                    return swDocs.getStatus();
                }

                _docs = swDocs.getValue();

                // There is always at least the reference document
                _pos = 0;
            }

            return {true};
        }

        // We previously returned a metadata document, now we need another document from disk
        if (_state == State::kMetadataDoc) {
            _state = State::kNeedsDoc;
            continue;
        }

        // If we have a metric chunk, return the next document in the chunk until the chunk is
        // exhausted
        if (_state == State::kMetricChunk) {
            if (_pos + 1 == _docs.size()) {
                _state = State::kNeedsDoc;
                continue;
            }

            _pos++;

            return {true};
        }
    }
}
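
hasNext() is a small state machine: kNeedsDoc reads a document from disk and decides whether it is metadata or a metric chunk, kMetadataDoc forces another read, and kMetricChunk walks the decompressed documents until the chunk is exhausted. Here is a self-contained sketch of the same pattern with the FTDC specifics stripped out; ChunkReader and its int "documents" are hypothetical stand-ins:

#include <cstddef>
#include <utility>
#include <vector>

// Hypothetical reader mirroring the state machine above: fetch a chunk, hand
// out its documents one at a time, then fetch the next chunk.
class ChunkReader {
public:
    explicit ChunkReader(std::vector<std::vector<int>> chunks)
        : _chunks(std::move(chunks)) {}

    bool hasNext() {
        while (true) {
            if (_state == State::kNeedsChunk) {
                if (_chunkIdx == _chunks.size())
                    return false;  // no more data on "disk"
                _docs = _chunks[_chunkIdx++];
                if (_docs.empty())
                    continue;      // skip empty chunks
                _pos = 0;          // position at the first (reference) doc
                _state = State::kInChunk;
                return true;
            }
            // Inside a chunk: advance until it is exhausted, then fetch another.
            if (_pos + 1 == _docs.size()) {
                _state = State::kNeedsChunk;
                continue;
            }
            ++_pos;
            return true;
        }
    }

    // Valid only after hasNext() returned true.
    int next() const {
        return _docs[_pos];
    }

private:
    enum class State { kNeedsChunk, kInChunk };
    State _state = State::kNeedsChunk;
    std::vector<std::vector<int>> _chunks;
    std::vector<int> _docs;
    std::size_t _chunkIdx = 0;
    std::size_t _pos = 0;
};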
Example #3
void Strategy::queryOp(OperationContext* txn, Request& request) {
    verify(!NamespaceString(request.getns()).isCommand());

    Timer queryTimer;

    globalOpCounters.gotQuery();

    QueryMessage q(request.d());

    NamespaceString ns(q.ns);
    ClientBasic* client = txn->getClient();
    AuthorizationSession* authSession = AuthorizationSession::get(client);
    Status status = authSession->checkAuthForFind(ns, false);
    audit::logQueryAuthzCheck(client, ns, q.query, status.code());
    uassertStatusOK(status);

    LOG(3) << "query: " << q.ns << " " << q.query << " ntoreturn: " << q.ntoreturn
           << " options: " << q.queryOptions;

    if (q.ntoreturn == 1 && strstr(q.ns, ".$cmd"))
        throw UserException(8010, "something is wrong, shouldn't see a command here");

    if (q.queryOptions & QueryOption_Exhaust) {
        uasserted(18526,
                  string("the 'exhaust' query option is invalid for mongos queries: ") + q.ns +
                      " " + q.query.toString());
    }

    // Spigot which controls whether OP_QUERY style find on mongos uses the new ClusterClientCursor
    // code path.
    // TODO: Delete the spigot and always use the new code.
    if (useClusterClientCursor) {
        // Determine the default read preference mode based on the value of the slaveOk flag.
        ReadPreference readPreferenceOption = (q.queryOptions & QueryOption_SlaveOk)
            ? ReadPreference::SecondaryPreferred
            : ReadPreference::PrimaryOnly;
        ReadPreferenceSetting readPreference(readPreferenceOption, TagSet());

        BSONElement rpElem;
        auto readPrefExtractStatus = bsonExtractTypedField(
            q.query, LiteParsedQuery::kWrappedReadPrefField, mongo::Object, &rpElem);

        if (readPrefExtractStatus.isOK()) {
            auto parsedRps = ReadPreferenceSetting::fromBSON(rpElem.Obj());
            uassertStatusOK(parsedRps.getStatus());
            readPreference = parsedRps.getValue();
        } else if (readPrefExtractStatus != ErrorCodes::NoSuchKey) {
            uassertStatusOK(readPrefExtractStatus);
        }

        auto canonicalQuery = CanonicalQuery::canonicalize(q, ExtensionsCallbackNoop());
        uassertStatusOK(canonicalQuery.getStatus());

        // If the $explain flag was set, we must run the operation on the shards as an explain
        // command rather than a find command.
        if (canonicalQuery.getValue()->getParsed().isExplain()) {
            const LiteParsedQuery& lpq = canonicalQuery.getValue()->getParsed();
            BSONObj findCommand = lpq.asFindCommand();

            // We default to allPlansExecution verbosity.
            auto verbosity = ExplainCommon::EXEC_ALL_PLANS;

            const bool secondaryOk = (readPreference.pref != ReadPreference::PrimaryOnly);
            rpc::ServerSelectionMetadata metadata(secondaryOk, readPreference);

            BSONObjBuilder explainBuilder;
            uassertStatusOK(
                Strategy::explainFind(txn, findCommand, lpq, verbosity, metadata, &explainBuilder));

            BSONObj explainObj = explainBuilder.done();
            replyToQuery(0,  // query result flags
                         request.p(),
                         request.m(),
                         static_cast<const void*>(explainObj.objdata()),
                         explainObj.objsize(),
                         1,  // numResults
                         0,  // startingFrom
                         CursorId(0));
            return;
        }

        // Do the work to generate the first batch of results. This blocks waiting to get responses
        // from the shard(s).
        std::vector<BSONObj> batch;

        // A cursor id of 0 means the cursor is exhausted; otherwise we assume that a cursor
        // with the returned id can be retrieved via the ClusterCursorManager.
        auto cursorId =
            ClusterFind::runQuery(txn, *canonicalQuery.getValue(), readPreference, &batch);
        uassertStatusOK(cursorId.getStatus());

        // TODO: this constant should be shared between mongos and mongod, and should
        // not be inside ShardedClientCursor.
        BufBuilder buffer(ShardedClientCursor::INIT_REPLY_BUFFER_SIZE);

        // Fill out the response buffer.
        int numResults = 0;
        for (const auto& obj : batch) {
            buffer.appendBuf((void*)obj.objdata(), obj.objsize());
            numResults++;
        }

        replyToQuery(0,  // query result flags
                     request.p(),
                     request.m(),
                     buffer.buf(),
                     buffer.len(),
                     numResults,
                     0,  // startingFrom
                     cursorId.getValue());
        return;
    }

    QuerySpec qSpec((string)q.ns, q.query, q.fields, q.ntoskip, q.ntoreturn, q.queryOptions);

    // Parse "$maxTimeMS".
    StatusWith<int> maxTimeMS =
        LiteParsedQuery::parseMaxTimeMS(q.query[LiteParsedQuery::queryOptionMaxTimeMS]);
    uassert(17233, maxTimeMS.getStatus().reason(), maxTimeMS.isOK());

    if (_isSystemIndexes(q.ns) && doShardedIndexQuery(txn, request, qSpec)) {
        return;
    }

    ParallelSortClusteredCursor* cursor = new ParallelSortClusteredCursor(qSpec, CommandInfo());
    verify(cursor);

    // TODO:  Move out to Request itself, not strategy based
    try {
        cursor->init(txn);

        if (qSpec.isExplain()) {
            BSONObjBuilder explain_builder;
            cursor->explain(explain_builder);
            explain_builder.appendNumber("executionTimeMillis",
                                         static_cast<long long>(queryTimer.millis()));
            BSONObj b = explain_builder.obj();

            replyToQuery(0, request.p(), request.m(), b);
            delete (cursor);
            return;
        }
    } catch (...) {
        delete cursor;
        throw;
    }

    // TODO: Revisit all of this when we revisit the sharded cursor cache

    if (cursor->getNumQueryShards() != 1) {
        // More than one shard (or zero), manage with a ShardedClientCursor
        // NOTE: We may also have *zero* shards here when the returnPartial flag is set.
        // Currently the code in ShardedClientCursor handles this.

        ShardedClientCursorPtr cc(new ShardedClientCursor(q, cursor));

        BufBuilder buffer(ShardedClientCursor::INIT_REPLY_BUFFER_SIZE);
        int docCount = 0;
        const int startFrom = cc->getTotalSent();
        bool hasMore = cc->sendNextBatch(q.ntoreturn, buffer, docCount);

        if (hasMore) {
            LOG(5) << "storing cursor : " << cc->getId();

            int cursorLeftoverMillis = maxTimeMS.getValue() - queryTimer.millis();
            if (maxTimeMS.getValue() == 0) {  // 0 represents "no limit".
                cursorLeftoverMillis = kMaxTimeCursorNoTimeLimit;
            } else if (cursorLeftoverMillis <= 0) {
                cursorLeftoverMillis = kMaxTimeCursorTimeLimitExpired;
            }

            cursorCache.store(cc, cursorLeftoverMillis);
        }

        replyToQuery(0,
                     request.p(),
                     request.m(),
                     buffer.buf(),
                     buffer.len(),
                     docCount,
                     startFrom,
                     hasMore ? cc->getId() : 0);
    } else {
        // Only one shard is used

        // Remote cursors are stored remotely, we shouldn't need this around.
        unique_ptr<ParallelSortClusteredCursor> cursorDeleter(cursor);

        ShardPtr shard = grid.shardRegistry()->getShard(txn, cursor->getQueryShardId());
        verify(shard.get());
        DBClientCursorPtr shardCursor = cursor->getShardCursor(shard->getId());

        // Implicitly stores the cursor in the cache
        request.reply(*(shardCursor->getMessage()), shardCursor->originalHost());

        // We don't want to kill the cursor remotely if there's still data left
        shardCursor->decouple();
    }
}
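
The stored-cursor branch above budgets the remaining maxTimeMS for the cursor: 0 means no limit, and a non-positive remainder means the limit has already expired. A minimal sketch of that arithmetic, with hypothetical sentinels standing in for kMaxTimeCursorNoTimeLimit and kMaxTimeCursorTimeLimitExpired:

#include <cassert>

// Hypothetical sentinels standing in for kMaxTimeCursorNoTimeLimit and
// kMaxTimeCursorTimeLimitExpired.
constexpr int kNoTimeLimit = -1;
constexpr int kTimeLimitExpired = 0;

int computeCursorLeftoverMillis(int maxTimeMS, int elapsedMillis) {
    if (maxTimeMS == 0)  // 0 represents "no limit"
        return kNoTimeLimit;
    const int leftover = maxTimeMS - elapsedMillis;
    return leftover > 0 ? leftover : kTimeLimitExpired;
}

int main() {
    assert(computeCursorLeftoverMillis(0, 50) == kNoTimeLimit);
    assert(computeCursorLeftoverMillis(100, 30) == 70);
    assert(computeCursorLeftoverMillis(100, 150) == kTimeLimitExpired);
}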
Example #4
void repairDatabasesAndCheckVersion(OperationContext* txn) {
    LOG(1) << "enter repairDatabases (to check pdfile version #)";

    ScopedTransaction transaction(txn, MODE_X);
    Lock::GlobalWrite lk(txn->lockState());

    vector<string> dbNames;

    StorageEngine* storageEngine = txn->getServiceContext()->getGlobalStorageEngine();
    storageEngine->listDatabases(&dbNames);

    // Repair all databases first, so that we do not try to open them if they are in bad shape
    if (storageGlobalParams.repair) {
        invariant(!storageGlobalParams.readOnly);
        for (vector<string>::const_iterator i = dbNames.begin(); i != dbNames.end(); ++i) {
            const string dbName = *i;
            LOG(1) << "    Repairing database: " << dbName;

            fassert(18506, repairDatabase(txn, storageEngine, dbName));
        }
    }

    const repl::ReplSettings& replSettings = repl::getGlobalReplicationCoordinator()->getSettings();

    // On replica set members we only clear temp collections on DBs other than "local" during
    // promotion to primary. On pure slaves, they are only cleared when the oplog tells them
    // to. The local DB is special because it is not replicated.  See SERVER-10927 for more
    // details.
    const bool shouldClearNonLocalTmpCollections =
        !(checkIfReplMissingFromCommandLine(txn) || replSettings.usingReplSets() ||
          replSettings.isSlave());

    const bool shouldDoCleanupForSERVER23299 = isSubjectToSERVER23299(txn);

    for (vector<string>::const_iterator i = dbNames.begin(); i != dbNames.end(); ++i) {
        const string dbName = *i;
        LOG(1) << "    Recovering database: " << dbName;

        Database* db = dbHolder().openDb(txn, dbName);
        invariant(db);

        // First thing after opening the database is to check for file compatibility,
        // otherwise we might crash if this is a deprecated format.
        auto status = db->getDatabaseCatalogEntry()->currentFilesCompatible(txn);
        if (!status.isOK()) {
            if (status.code() == ErrorCodes::CanRepairToDowngrade) {
                // Convert CanRepairToDowngrade statuses to MustUpgrade statuses to avoid logging a
                // potentially confusing and inaccurate message.
                //
                // TODO SERVER-24097: Log a message informing the user that they can start the
                // current version of mongod with --repair and then proceed with normal startup.
                status = {ErrorCodes::MustUpgrade, status.reason()};
            }
            severe() << "Unable to start mongod due to an incompatibility with the data files and"
                        " this version of mongod: "
                     << redact(status);
            severe() << "Please consult our documentation when trying to downgrade to a previous"
                        " major release";
            quickExit(EXIT_NEED_UPGRADE);
            return;
        }

        // Check if admin.system.version contains an invalid featureCompatibilityVersion.
        // If a valid featureCompatibilityVersion is present, cache it as a server parameter.
        if (dbName == "admin") {
            if (Collection* versionColl =
                    db->getCollection(FeatureCompatibilityVersion::kCollection)) {
                BSONObj featureCompatibilityVersion;
                if (Helpers::findOne(txn,
                                     versionColl,
                                     BSON("_id" << FeatureCompatibilityVersion::kParameterName),
                                     featureCompatibilityVersion)) {
                    auto version = FeatureCompatibilityVersion::parse(featureCompatibilityVersion);
                    if (!version.isOK()) {
                        severe() << version.getStatus();
                        fassertFailedNoTrace(40283);
                    }
                    serverGlobalParams.featureCompatibility.version.store(version.getValue());
                }
            }
        }

        // Major versions match, check indexes
        const string systemIndexes = db->name() + ".system.indexes";

        Collection* coll = db->getCollection(systemIndexes);
        unique_ptr<PlanExecutor> exec(
            InternalPlanner::collectionScan(txn, systemIndexes, coll, PlanExecutor::YIELD_MANUAL));

        BSONObj index;
        PlanExecutor::ExecState state;
        while (PlanExecutor::ADVANCED == (state = exec->getNext(&index, NULL))) {
            const BSONObj key = index.getObjectField("key");
            const string plugin = IndexNames::findPluginName(key);

            if (db->getDatabaseCatalogEntry()->isOlderThan24(txn)) {
                if (IndexNames::existedBefore24(plugin)) {
                    continue;
                }

                log() << "Index " << index << " claims to be of type '" << plugin << "', "
                      << "which is either invalid or did not exist before v2.4. "
                      << "See the upgrade section: "
                      << "http://dochub.mongodb.org/core/upgrade-2.4" << startupWarningsLog;
            }

            if (index["v"].isNumber() && index["v"].numberInt() == 0) {
                log() << "WARNING: The index: " << index << " was created with the deprecated"
                      << " v:0 format.  This format will not be supported in a future release."
                      << startupWarningsLog;
                log() << "\t To fix this, you need to rebuild this index."
                      << " For instructions, see http://dochub.mongodb.org/core/rebuild-v0-indexes"
                      << startupWarningsLog;
            }
        }

        // Non-yielding collection scans from InternalPlanner will never error.
        invariant(PlanExecutor::IS_EOF == state);

        if (replSettings.usingReplSets()) {
            // We only care about the _id index if we are in a replset
            checkForIdIndexes(txn, db);
            // Ensure oplog is capped (mmap does not guarantee order of inserts on noncapped
            // collections)
            if (db->name() == "local") {
                checkForCappedOplog(txn, db);
            }
        }

        if (shouldDoCleanupForSERVER23299) {
            handleSERVER23299ForDb(txn, db);
        }

        if (!storageGlobalParams.readOnly &&
            (shouldClearNonLocalTmpCollections || dbName == "local")) {
            db->clearTmpCollections(txn);
        }
    }

    LOG(1) << "done repairDatabases";
}
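
The compatibility check near the top of the loop deliberately remaps CanRepairToDowngrade to MustUpgrade so the user sees a single actionable error. A reduced sketch of that remapping, with a hypothetical miniature Status type:

#include <string>

// Hypothetical miniature Status; only what the remap needs.
enum class ErrorCode { OK, CanRepairToDowngrade, MustUpgrade };

struct Status {
    ErrorCode code;
    std::string reason;
    bool isOK() const { return code == ErrorCode::OK; }
};

Status checkFilesCompatible(Status raw) {
    if (!raw.isOK() && raw.code == ErrorCode::CanRepairToDowngrade) {
        // Report MustUpgrade instead, to avoid a confusing two-step message.
        return {ErrorCode::MustUpgrade, raw.reason};
    }
    return raw;
}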
Example #5
StatusWith<BSONObj> FTDCFileReader::readDocument() {
    if (!_stream.is_open()) {
        return {ErrorCodes::FileNotOpen, "open() needs to be called first."};
    }

    char buf[sizeof(std::int32_t)];

    _stream.read(buf, sizeof(buf));

    if (sizeof(buf) != _stream.gcount()) {
        // Did we read exactly zero bytes and hit the eof?
        // Then return an empty document to indicate we are done reading the file.
        if (0 == _stream.gcount() && _stream.eof()) {
            return BSONObj();
        }

        return {ErrorCodes::FileStreamFailed,
                str::stream() << "Failed to read 4 bytes from file \'" << _file.generic_string()
                              << "\'"};
    }

    std::uint32_t bsonLength = ConstDataView(buf).read<LittleEndian<std::int32_t>>();

    // Reads past the end of the file will be caught below
    // The interim file sentinel is 8 bytes of zero.
    if (bsonLength == 0) {
        return BSONObj();
    }

    // Reads past the end of the file will be caught below
    if (bsonLength > _fileSize || bsonLength < BSONObj::kMinBSONLength) {
        return {ErrorCodes::InvalidLength,
                str::stream() << "Invalid BSON length found in file \'" << _file.generic_string()
                              << "\'"};
    }

    // Read the BSON document
    _buffer.resize(bsonLength);

    // Stuff the length into the front
    memcpy(_buffer.data(), buf, sizeof(std::int32_t));

    // Read the length - 4 bytes from the file
    std::int32_t readSize = bsonLength - sizeof(std::int32_t);

    _stream.read(_buffer.data() + sizeof(std::int32_t), readSize);

    if (readSize != _stream.gcount()) {
        return {ErrorCodes::FileStreamFailed,
                str::stream() << "Failed to read " << readSize << " bytes from file \'"
                              << _file.generic_string() << "\'"};
    }

    ConstDataRange cdr(_buffer.data(), _buffer.data() + bsonLength);

    // TODO: Validated only validates objects based on a flag, which is the default at the moment
    auto swl = cdr.read<Validated<BSONObj>>();
    if (!swl.isOK()) {
        return swl.getStatus();
    }

    return {swl.getValue().val};
}
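
readDocument() frames BSON on disk as a 4-byte little-endian length prefix followed by the document body, and stuffs the length back into the front of the buffer before validating. A self-contained sketch of that framing logic, with error handling reduced to std::nullopt and hypothetical names:

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <istream>
#include <optional>
#include <vector>

// Hypothetical frame reader: 4-byte little-endian length, then length - 4
// bytes of body; the length bytes are copied back into the buffer's front.
std::optional<std::vector<char>> readLengthPrefixedRecord(std::istream& in,
                                                          std::uint32_t maxLen) {
    char lenBuf[sizeof(std::uint32_t)];
    in.read(lenBuf, sizeof(lenBuf));
    if (in.gcount() != static_cast<std::streamsize>(sizeof(lenBuf)))
        return std::nullopt;  // EOF or short read

    // Assemble the little-endian length byte by byte (portable).
    std::uint32_t len = 0;
    for (std::size_t i = 0; i < sizeof(lenBuf); ++i)
        len |= std::uint32_t(static_cast<unsigned char>(lenBuf[i])) << (8 * i);

    if (len < sizeof(lenBuf) || len > maxLen)
        return std::nullopt;  // corrupt length field

    std::vector<char> buffer(len);
    std::memcpy(buffer.data(), lenBuf, sizeof(lenBuf));  // length goes in front
    in.read(buffer.data() + sizeof(lenBuf), len - sizeof(lenBuf));
    if (in.gcount() != static_cast<std::streamsize>(len - sizeof(lenBuf)))
        return std::nullopt;  // truncated body
    return buffer;
}

Example #6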
StatusWith<ShardType> ShardingCatalogManager::_validateHostAsShard(
    OperationContext* opCtx,
    std::shared_ptr<RemoteCommandTargeter> targeter,
    const std::string* shardProposedName,
    const ConnectionString& connectionString) {
    auto swCommandResponse =
        _runCommandForAddShard(opCtx, targeter.get(), "admin", BSON("isMaster" << 1));
    if (swCommandResponse.getStatus() == ErrorCodes::IncompatibleServerVersion) {
        return swCommandResponse.getStatus().withReason(
            str::stream() << "Cannot add " << connectionString.toString()
                          << " as a shard because its binary version is not compatible with "
                             "the cluster's featureCompatibilityVersion.");
    } else if (!swCommandResponse.isOK()) {
        return swCommandResponse.getStatus();
    }

    // Check for a command response error
    auto resIsMasterStatus = std::move(swCommandResponse.getValue().commandStatus);
    if (!resIsMasterStatus.isOK()) {
        return resIsMasterStatus.withContext(str::stream()
                                             << "Error running isMaster against "
                                             << targeter->connectionString().toString());
    }

    auto resIsMaster = std::move(swCommandResponse.getValue().response);

    // Fail if the node being added is a mongos.
    const std::string msg = resIsMaster.getStringField("msg");
    if (msg == "isdbgrid") {
        return {ErrorCodes::IllegalOperation, "cannot add a mongos as a shard"};
    }

    // Extract the maxWireVersion so we can verify that the node being added has a binary version
    // greater than or equal to the cluster's featureCompatibilityVersion. We expect an incompatible
    // binary node to be unable to communicate, returning an IncompatibleServerVersion error,
    // because of our internal wire version protocol. So we can safely invariant here that the node
    // is compatible.
    long long maxWireVersion;
    Status status = bsonExtractIntegerField(resIsMaster, "maxWireVersion", &maxWireVersion);
    if (!status.isOK()) {
        return status.withContext(str::stream() << "isMaster returned invalid 'maxWireVersion' "
                                                << "field when attempting to add "
                                                << connectionString.toString()
                                                << " as a shard");
    }
    if (serverGlobalParams.featureCompatibility.getVersion() >
        ServerGlobalParams::FeatureCompatibility::Version::kFullyUpgradedTo36) {
        // If FCV 4.0, or upgrading to / downgrading from, wire version must be LATEST.
        invariant(maxWireVersion == WireVersion::LATEST_WIRE_VERSION);
    } else if (serverGlobalParams.featureCompatibility.getVersion() >
                   ServerGlobalParams::FeatureCompatibility::Version::kFullyDowngradedTo34 &&
               serverGlobalParams.featureCompatibility.getVersion() <=
                   ServerGlobalParams::FeatureCompatibility::Version::kFullyUpgradedTo36) {
        // If FCV 3.6, or upgrading to / downgrading from, wire version must be v3.6
        // LATEST_WIRE_VERSION or greater.
        invariant(maxWireVersion >= WireVersion::LATEST_WIRE_VERSION - 1);
    } else {
        // If FCV 3.4, wire version cannot be less than v3.4 LATEST_WIRE_VERSION.
        invariant(serverGlobalParams.featureCompatibility.getVersion() ==
                  ServerGlobalParams::FeatureCompatibility::Version::kFullyDowngradedTo34);
        invariant(maxWireVersion >= WireVersion::LATEST_WIRE_VERSION - 2);
    }

    // Check whether there is a master. If there isn't, the replica set may not have been
    // initiated. If the connection is a standalone, it will return true for isMaster.
    bool isMaster;
    status = bsonExtractBooleanField(resIsMaster, "ismaster", &isMaster);
    if (!status.isOK()) {
        return status.withContext(str::stream() << "isMaster returned invalid 'ismaster' "
                                                << "field when attempting to add "
                                                << connectionString.toString()
                                                << " as a shard");
    }
    if (!isMaster) {
        return {ErrorCodes::NotMaster,
                str::stream()
                    << connectionString.toString()
                    << " does not have a master. If this is a replica set, ensure that it has a"
                    << " healthy primary and that the set has been properly initiated."};
    }

    const std::string providedSetName = connectionString.getSetName();
    const std::string foundSetName = resIsMaster["setName"].str();

    // Make sure the specified replica set name (if any) matches the actual shard's replica set
    if (providedSetName.empty() && !foundSetName.empty()) {
        return {ErrorCodes::OperationFailed,
                str::stream() << "host is part of set " << foundSetName << "; "
                              << "use replica set url format "
                              << "<setname>/<server1>,<server2>, ..."};
    }

    if (!providedSetName.empty() && foundSetName.empty()) {
        return {ErrorCodes::OperationFailed,
                str::stream() << "host did not return a set name; "
                              << "is the replica set still initializing? "
                              << resIsMaster};
    }

    // Make sure the set name specified in the connection string matches the one that its hosts
    // belong to
    if (!providedSetName.empty() && (providedSetName != foundSetName)) {
        return {ErrorCodes::OperationFailed,
                str::stream() << "the provided connection string (" << connectionString.toString()
                              << ") does not match the actual set name "
                              << foundSetName};
    }

    // Is it a config server?
    if (resIsMaster.hasField("configsvr")) {
        return {ErrorCodes::OperationFailed,
                str::stream() << "Cannot add " << connectionString.toString()
                              << " as a shard since it is a config server"};
    }

    // If the shard is part of a replica set, make sure all the hosts mentioned in the connection
    // string are part of the set. It is fine if not all members of the set are mentioned in the
    // connection string, though.
    if (!providedSetName.empty()) {
        std::set<std::string> hostSet;

        BSONObjIterator iter(resIsMaster["hosts"].Obj());
        while (iter.more()) {
            hostSet.insert(iter.next().String());  // host:port
        }

        if (resIsMaster["passives"].isABSONObj()) {
            BSONObjIterator piter(resIsMaster["passives"].Obj());
            while (piter.more()) {
                hostSet.insert(piter.next().String());  // host:port
            }
        }

        if (resIsMaster["arbiters"].isABSONObj()) {
            BSONObjIterator piter(resIsMaster["arbiters"].Obj());
            while (piter.more()) {
                hostSet.insert(piter.next().String());  // host:port
            }
        }

        for (const auto& hostEntry : connectionString.getServers()) {
            const auto& host = hostEntry.toString();  // host:port
            if (hostSet.find(host) == hostSet.end()) {
                return {ErrorCodes::OperationFailed,
                        str::stream() << "in seed list " << connectionString.toString() << ", host "
                                      << host
                                      << " does not belong to replica set "
                                      << foundSetName
                                      << "; found "
                                      << resIsMaster.toString()};
            }
        }
    }

    std::string actualShardName;

    if (shardProposedName) {
        actualShardName = *shardProposedName;
    } else if (!foundSetName.empty()) {
        // Default it to the name of the replica set
        actualShardName = foundSetName;
    }

    // Disallow adding shard replica set with name 'config'
    if (actualShardName == NamespaceString::kConfigDb) {
        return {ErrorCodes::BadValue, "use of shard replica set with name 'config' is not allowed"};
    }

    // Retrieve the most up to date connection string that we know from the replica set monitor (if
    // this is a replica set shard, otherwise it will be the same value as connectionString).
    ConnectionString actualShardConnStr = targeter->connectionString();

    ShardType shard;
    shard.setName(actualShardName);
    shard.setHost(actualShardConnStr.toString());
    shard.setState(ShardType::ShardState::kShardAware);

    return shard;
}
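
The seed-list check near the end collects the set's reported members (hosts, passives, arbiters) into a std::set and requires every host in the connection string to appear in it. A reduced sketch of that membership test, with types simplified to std::string:

#include <set>
#include <string>
#include <vector>

// Every host:port in the seed list must be one of the members the replica
// set reported (hosts, passives, or arbiters).
bool seedListBelongsToSet(const std::vector<std::string>& seedList,
                          const std::set<std::string>& knownMembers) {
    for (const auto& host : seedList) {
        if (knownMembers.find(host) == knownMembers.end())
            return false;  // host not reported by the set
    }
    return true;
}

Example #7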
StatusWith<ShardDrainingStatus> ShardingCatalogManager::removeShard(OperationContext* opCtx,
                                                                    const ShardId& shardId) {
    // Check preconditions for removing the shard
    std::string name = shardId.toString();
    auto countStatus = _runCountCommandOnConfig(
        opCtx,
        ShardType::ConfigNS,
        BSON(ShardType::name() << NE << name << ShardType::draining(true)));
    if (!countStatus.isOK()) {
        return countStatus.getStatus();
    }
    if (countStatus.getValue() > 0) {
        return Status(ErrorCodes::ConflictingOperationInProgress,
                      "Can't have more than one draining shard at a time");
    }

    countStatus =
        _runCountCommandOnConfig(opCtx, ShardType::ConfigNS, BSON(ShardType::name() << NE << name));
    if (!countStatus.isOK()) {
        return countStatus.getStatus();
    }
    if (countStatus.getValue() == 0) {
        return Status(ErrorCodes::IllegalOperation, "Can't remove last shard");
    }

    // Figure out if shard is already draining
    countStatus = _runCountCommandOnConfig(
        opCtx, ShardType::ConfigNS, BSON(ShardType::name() << name << ShardType::draining(true)));
    if (!countStatus.isOK()) {
        return countStatus.getStatus();
    }

    auto* const shardRegistry = Grid::get(opCtx)->shardRegistry();

    if (countStatus.getValue() == 0) {
        log() << "going to start draining shard: " << name;

        auto updateStatus = Grid::get(opCtx)->catalogClient()->updateConfigDocument(
            opCtx,
            ShardType::ConfigNS,
            BSON(ShardType::name() << name),
            BSON("$set" << BSON(ShardType::draining(true))),
            false,
            ShardingCatalogClient::kLocalWriteConcern);
        if (!updateStatus.isOK()) {
            log() << "error starting removeShard: " << name
                  << causedBy(redact(updateStatus.getStatus()));
            return updateStatus.getStatus();
        }

        shardRegistry->reload(opCtx);

        // Record start in changelog
        Grid::get(opCtx)
            ->catalogClient()
            ->logChange(opCtx,
                        "removeShard.start",
                        "",
                        BSON("shard" << name),
                        ShardingCatalogClient::kLocalWriteConcern)
            .transitional_ignore();

        return ShardDrainingStatus::STARTED;
    }

    // Draining has already started, now figure out how many chunks and databases are still on the
    // shard.
    countStatus =
        _runCountCommandOnConfig(opCtx, ChunkType::ConfigNS, BSON(ChunkType::shard(name)));
    if (!countStatus.isOK()) {
        return countStatus.getStatus();
    }
    const long long chunkCount = countStatus.getValue();

    countStatus =
        _runCountCommandOnConfig(opCtx, DatabaseType::ConfigNS, BSON(DatabaseType::primary(name)));
    if (!countStatus.isOK()) {
        return countStatus.getStatus();
    }
    const long long databaseCount = countStatus.getValue();

    if (chunkCount > 0 || databaseCount > 0) {
        // Still more draining to do
        LOG(0) << "chunkCount: " << chunkCount;
        LOG(0) << "databaseCount: " << databaseCount;
        return ShardDrainingStatus::ONGOING;
    }

    // Draining is done, now finish removing the shard.
    log() << "going to remove shard: " << name;
    audit::logRemoveShard(opCtx->getClient(), name);

    Status status = Grid::get(opCtx)->catalogClient()->removeConfigDocuments(
        opCtx,
        ShardType::ConfigNS,
        BSON(ShardType::name() << name),
        ShardingCatalogClient::kLocalWriteConcern);
    if (!status.isOK()) {
        log() << "Error concluding removeShard operation on: " << name
              << "; err: " << status.reason();
        return status;
    }

    shardConnectionPool.removeHost(name);
    ReplicaSetMonitor::remove(name);

    shardRegistry->reload(opCtx);

    // Record finish in changelog
    Grid::get(opCtx)
        ->catalogClient()
        ->logChange(opCtx,
                    "removeShard",
                    "",
                    BSON("shard" << name),
                    ShardingCatalogClient::kLocalWriteConcern)
        .transitional_ignore();

    return ShardDrainingStatus::COMPLETED;
}
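
removeShard() is re-entrant: callers invoke it repeatedly and react to the returned draining status (STARTED on the first call, ONGOING while chunks or databases remain, COMPLETED once draining is done). A minimal sketch of that polling contract, with a hypothetical stub in place of the real config reads:

#include <iostream>

enum class ShardDrainingStatus { STARTED, ONGOING, COMPLETED };

// Hypothetical stub: pretend two chunks must drain before removal finishes.
ShardDrainingStatus removeShardStub() {
    static int chunksLeft = 2;
    if (chunksLeft == 2) {
        --chunksLeft;
        return ShardDrainingStatus::STARTED;
    }
    return chunksLeft-- > 0 ? ShardDrainingStatus::ONGOING
                            : ShardDrainingStatus::COMPLETED;
}

int main() {
    while (removeShardStub() != ShardDrainingStatus::COMPLETED)
        std::cout << "shard still draining\n";
}

Example #8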
StatusWith<BalanceChunkRequest> BalanceChunkRequest::parseFromConfigCommand(const BSONObj& obj) {
    auto chunkStatus = ChunkType::fromBSON(obj);
    if (!chunkStatus.isOK()) {
        return chunkStatus.getStatus();
    }

    // The secondary throttle options being sent to the config server are contained within a
    // sub-object on the request because they contain the writeConcern field, which when sent to
    // the config server gets checked for only being w:1 or w:majority.
    BSONObj secondaryThrottleObj;

    {
        BSONElement secondaryThrottleElement;
        auto secondaryThrottleElementStatus =
            bsonExtractTypedField(obj, kSecondaryThrottle, Object, &secondaryThrottleElement);

        if (secondaryThrottleElementStatus.isOK()) {
            secondaryThrottleObj = secondaryThrottleElement.Obj();
        } else if (secondaryThrottleElementStatus != ErrorCodes::NoSuchKey) {
            return secondaryThrottleElementStatus;
        }
    }

    auto secondaryThrottleStatus =
        MigrationSecondaryThrottleOptions::createFromCommand(secondaryThrottleObj);
    if (!secondaryThrottleStatus.isOK()) {
        return secondaryThrottleStatus.getStatus();
    }

    BalanceChunkRequest request(std::move(chunkStatus.getValue()),
                                std::move(secondaryThrottleStatus.getValue()));

    {
        Status status =
            bsonExtractBooleanFieldWithDefault(obj, kWaitForDelete, false, &request._waitForDelete);
        if (!status.isOK()) {
            return status;
        }
    }

    {
        long long maxChunkSizeBytes;
        Status status =
            bsonExtractIntegerFieldWithDefault(obj, kMaxChunkSizeBytes, 0, &maxChunkSizeBytes);
        if (!status.isOK()) {
            return status;
        }

        request._maxChunkSizeBytes = static_cast<int64_t>(maxChunkSizeBytes);
    }

    {
        std::string toShardId;
        Status status = bsonExtractStringField(obj, kToShardId, &toShardId);
        if (status.isOK()) {
            if (toShardId.empty()) {
                return {ErrorCodes::BadValue, "To shard cannot be empty"};
            }

            request._toShardId = std::move(toShardId);
        } else if (status != ErrorCodes::NoSuchKey) {
            return status;
        }
    }

    return request;
}
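
Several fields here follow the same "optional field, NoSuchKey is fine" pattern: a missing key falls back to a default, while a present-but-invalid key is an error. A reduced sketch of that pattern, with a string map standing in for BSON:

#include <map>
#include <optional>
#include <string>

// A missing key yields the default; a present but malformed value yields
// std::nullopt (an error). Names here are hypothetical.
std::optional<bool> extractBoolWithDefault(
    const std::map<std::string, std::string>& obj,
    const std::string& key,
    bool defaultValue) {
    auto it = obj.find(key);
    if (it == obj.end())
        return defaultValue;  // NoSuchKey: fall back to the default
    if (it->second == "true")
        return true;
    if (it->second == "false")
        return false;
    return std::nullopt;  // present but not a boolean
}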
Example #9
UUID UUID::parse(const BSONObj& obj) {
    auto res = parse(obj.getField("uuid"));
    uassert(40566, res.getStatus().reason(), res.isOK());
    return res.getValue();
}
Example #10
StatusWith<CursorResponse> ClusterFind::runGetMore(OperationContext* txn,
                                                   const GetMoreRequest& request) {
    auto cursorManager = Grid::get(txn)->getCursorManager();

    auto pinnedCursor = cursorManager->checkOutCursor(request.nss, request.cursorid, txn);
    if (!pinnedCursor.isOK()) {
        return pinnedCursor.getStatus();
    }
    invariant(request.cursorid == pinnedCursor.getValue().getCursorId());

    // If the fail point is enabled, busy wait until it is disabled.
    while (MONGO_FAIL_POINT(keepCursorPinnedDuringGetMore)) {
    }

    if (request.awaitDataTimeout) {
        auto status = pinnedCursor.getValue().setAwaitDataTimeout(*request.awaitDataTimeout);
        if (!status.isOK()) {
            return status;
        }
    }

    std::vector<BSONObj> batch;
    int bytesBuffered = 0;
    long long batchSize = request.batchSize.value_or(0);
    long long startingFrom = pinnedCursor.getValue().getNumReturnedSoFar();
    auto cursorState = ClusterCursorManager::CursorState::NotExhausted;
    while (!FindCommon::enoughForGetMore(batchSize, batch.size())) {
        auto next = pinnedCursor.getValue().next();
        if (!next.isOK()) {
            return next.getStatus();
        }

        if (next.getValue().isEOF()) {
            // We reached end-of-stream.
            if (!pinnedCursor.getValue().isTailable()) {
                cursorState = ClusterCursorManager::CursorState::Exhausted;
            }
            break;
        }

        if (!FindCommon::haveSpaceForNext(
                *next.getValue().getResult(), batch.size(), bytesBuffered)) {
            pinnedCursor.getValue().queueResult(*next.getValue().getResult());
            break;
        }

        // Add doc to the batch. Account for the space overhead associated with returning this doc
        // inside a BSON array.
        bytesBuffered +=
            (next.getValue().getResult()->objsize() + kPerDocumentOverheadBytesUpperBound);
        batch.push_back(std::move(*next.getValue().getResult()));
    }

    // Transfer ownership of the cursor back to the cursor manager.
    pinnedCursor.getValue().returnCursor(cursorState);

    CursorId idToReturn = (cursorState == ClusterCursorManager::CursorState::Exhausted)
        ? CursorId(0)
        : request.cursorid;
    return CursorResponse(request.nss, idToReturn, std::move(batch), startingFrom);
}
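
The batch loop above enforces two limits: a document count (batchSize, where 0 means unlimited) and a byte budget that accounts for per-document array overhead; a document that would overflow the budget is queued back for the next getMore. A self-contained sketch of that loop, with hypothetical budget constants:

#include <cstddef>
#include <string>
#include <vector>

// Hypothetical budget constants; the real values live elsewhere.
constexpr std::size_t kMaxBatchBytes = 16 * 1024 * 1024;
constexpr std::size_t kPerDocOverhead = 16;  // array-element overhead estimate

// batchSizeLimit == 0 means "no count limit", mirroring batchSize.value_or(0).
std::vector<std::string> fillBatch(std::vector<std::string>& source,
                                   std::size_t batchSizeLimit) {
    std::vector<std::string> batch;
    std::size_t bytesBuffered = 0;
    while (!source.empty() &&
           (batchSizeLimit == 0 || batch.size() < batchSizeLimit)) {
        const std::string& next = source.front();
        // Always admit the first document; afterwards respect the byte budget,
        // leaving an oversized document behind for the following batch.
        if (!batch.empty() &&
            bytesBuffered + next.size() + kPerDocOverhead > kMaxBatchBytes)
            break;
        bytesBuffered += next.size() + kPerDocOverhead;
        batch.push_back(next);
        source.erase(source.begin());
    }
    return batch;
}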
Example #11
Status ClusterAggregate::runAggregate(OperationContext* txn,
                                      const Namespaces& namespaces,
                                      BSONObj cmdObj,
                                      int options,
                                      BSONObjBuilder* result) {
    auto dbname = namespaces.executionNss.db().toString();
    auto status = grid.catalogCache()->getDatabase(txn, dbname);
    if (!status.isOK()) {
        appendEmptyResultSet(*result, status.getStatus(), namespaces.requestedNss.ns());
        return Status::OK();
    }

    std::shared_ptr<DBConfig> conf = status.getValue();

    if (!conf->isShardingEnabled()) {
        return aggPassthrough(txn, namespaces, conf, cmdObj, result, options);
    }

    auto request = AggregationRequest::parseFromBSON(namespaces.executionNss, cmdObj);
    if (!request.isOK()) {
        return request.getStatus();
    }

    boost::intrusive_ptr<ExpressionContext> mergeCtx =
        new ExpressionContext(txn, request.getValue());
    mergeCtx->inRouter = true;
    // explicitly *not* setting mergeCtx->tempDir

    // Parse and optimize the pipeline specification.
    auto pipeline = Pipeline::parse(request.getValue().getPipeline(), mergeCtx);
    if (!pipeline.isOK()) {
        return pipeline.getStatus();
    }

    for (auto&& ns : pipeline.getValue()->getInvolvedCollections()) {
        uassert(28769, str::stream() << ns.ns() << " cannot be sharded", !conf->isSharded(ns.ns()));
        // We won't try to execute anything on a mongos, but we still have to populate this map
        // so that any $lookups etc will be able to have a resolved view definition. It's okay
        // that this is incorrect, we will repopulate the real resolved namespace map on the
        // mongod.
        // TODO SERVER-25038 This should become unnecessary once we can get the involved
        // namespaces before parsing.
        mergeCtx->resolvedNamespaces[ns.coll()] = {ns, std::vector<BSONObj>{}};
    }

    if (!conf->isSharded(namespaces.executionNss.ns())) {
        return aggPassthrough(txn, namespaces, conf, cmdObj, result, options);
    }

    ChunkManagerPtr chunkMgr = conf->getChunkManager(txn, namespaces.executionNss.ns());

    // If there was no collation specified, but there is a default collation for the collection,
    // use that.
    if (request.getValue().getCollation().isEmpty() && chunkMgr->getDefaultCollator()) {
        mergeCtx->setCollator(chunkMgr->getDefaultCollator()->clone());
    }

    // Now that we know the collation we'll be using, inject the ExpressionContext and optimize.
    // TODO SERVER-25038: this must happen before we parse the pipeline, since we can make
    // string comparisons during parse time.
    pipeline.getValue()->injectExpressionContext(mergeCtx);
    pipeline.getValue()->optimizePipeline();

    // If the first $match stage is an exact match on the shard key (with a simple collation or
    // no string matching), we only have to send it to one shard, so send the command to that
    // shard.
    BSONObj firstMatchQuery = pipeline.getValue()->getInitialQuery();
    BSONObj shardKeyMatches;
    shardKeyMatches = uassertStatusOK(
        chunkMgr->getShardKeyPattern().extractShardKeyFromQuery(txn, firstMatchQuery));
    bool singleShard = false;
    if (!shardKeyMatches.isEmpty()) {
        auto chunk = chunkMgr->findIntersectingChunk(
            txn, shardKeyMatches, request.getValue().getCollation());
        if (chunk.isOK()) {
            singleShard = true;
        }
    }

    // Don't need to split pipeline if the first $match is an exact match on shard key, unless
    // there is a stage that needs to be run on the primary shard.
    const bool needPrimaryShardMerger = pipeline.getValue()->needsPrimaryShardMerger();
    const bool needSplit = !singleShard || needPrimaryShardMerger;

    // Split the pipeline into pieces for mongod(s) and this mongos. If needSplit is true,
    // 'pipeline' will become the merger side.
    boost::intrusive_ptr<Pipeline> shardPipeline(needSplit ? pipeline.getValue()->splitForSharded()
                                                           : pipeline.getValue());

    // Create the command for the shards. The 'fromRouter' field means produce output to be
    // merged.
    MutableDocument commandBuilder(request.getValue().serializeToCommandObj());
    commandBuilder[AggregationRequest::kPipelineName] = Value(shardPipeline->serialize());
    if (needSplit) {
        commandBuilder[AggregationRequest::kFromRouterName] = Value(true);
        commandBuilder[AggregationRequest::kCursorName] =
            Value(DOC(AggregationRequest::kBatchSizeName << 0));
    }

    // These fields are not part of the AggregationRequest since they are not handled by the
    // aggregation subsystem, so we serialize them separately.
    const std::initializer_list<StringData> fieldsToPropagateToShards = {
        "$queryOptions", "readConcern", QueryRequest::cmdOptionMaxTimeMS,
    };
    for (auto&& field : fieldsToPropagateToShards) {
        commandBuilder[field] = Value(cmdObj[field]);
    }

    BSONObj shardedCommand = commandBuilder.freeze().toBson();
    BSONObj shardQuery = shardPipeline->getInitialQuery();

    // Run the command on the shards
    // TODO need to make sure cursors are killed if a retry is needed
    std::vector<Strategy::CommandResult> shardResults;
    Strategy::commandOp(txn,
                        dbname,
                        shardedCommand,
                        options,
                        namespaces.executionNss.ns(),
                        shardQuery,
                        request.getValue().getCollation(),
                        &shardResults);

    if (mergeCtx->isExplain) {
        // This must be checked before we start modifying result.
        uassertAllShardsSupportExplain(shardResults);

        if (needSplit) {
            *result << "needsPrimaryShardMerger" << needPrimaryShardMerger << "splitPipeline"
                    << DOC("shardsPart" << shardPipeline->writeExplainOps() << "mergerPart"
                                        << pipeline.getValue()->writeExplainOps());
        } else {
            *result << "splitPipeline" << BSONNULL;
        }

        BSONObjBuilder shardExplains(result->subobjStart("shards"));
        for (size_t i = 0; i < shardResults.size(); i++) {
            shardExplains.append(shardResults[i].shardTargetId,
                                 BSON("host" << shardResults[i].target.toString() << "stages"
                                             << shardResults[i].result["stages"]));
        }

        return Status::OK();
    }

    if (!needSplit) {
        invariant(shardResults.size() == 1);
        invariant(shardResults[0].target.getServers().size() == 1);
        auto executorPool = grid.getExecutorPool();
        const BSONObj reply =
            uassertStatusOK(storePossibleCursor(shardResults[0].target.getServers()[0],
                                                shardResults[0].result,
                                                namespaces.requestedNss,
                                                executorPool->getArbitraryExecutor(),
                                                grid.getCursorManager()));
        result->appendElements(reply);
        return getStatusFromCommandResult(reply);
    }

    pipeline.getValue()->addInitialSource(
        DocumentSourceMergeCursors::create(parseCursors(shardResults), mergeCtx));

    MutableDocument mergeCmd(request.getValue().serializeToCommandObj());
    mergeCmd["pipeline"] = Value(pipeline.getValue()->serialize());
    mergeCmd["cursor"] = Value(cmdObj["cursor"]);

    if (cmdObj.hasField("$queryOptions")) {
        mergeCmd["$queryOptions"] = Value(cmdObj["$queryOptions"]);
    }

    if (cmdObj.hasField(QueryRequest::cmdOptionMaxTimeMS)) {
        mergeCmd[QueryRequest::cmdOptionMaxTimeMS] =
            Value(cmdObj[QueryRequest::cmdOptionMaxTimeMS]);
    }

    mergeCmd.setField("writeConcern", Value(cmdObj["writeConcern"]));

    // Not propagating readConcern to merger since it doesn't do local reads.

    // If the user didn't specify a collation already, make sure there's a collation attached to
    // the merge command, since the merging shard may not have the collection metadata.
    if (mergeCmd.peek()["collation"].missing()) {
        mergeCmd.setField("collation",
                          mergeCtx->getCollator()
                              ? Value(mergeCtx->getCollator()->getSpec().toBSON())
                              : Value(Document{CollationSpec::kSimpleSpec}));
    }

    std::string outputNsOrEmpty;
    if (DocumentSourceOut* out =
            dynamic_cast<DocumentSourceOut*>(pipeline.getValue()->getSources().back().get())) {
        outputNsOrEmpty = out->getOutputNs().ns();
    }

    // Run merging command on random shard, unless a stage needs the primary shard. Need to use
    // ShardConnection so that the merging mongod is sent the config servers on connection init.
    auto& prng = txn->getClient()->getPrng();
    const auto& mergingShardId = needPrimaryShardMerger
        ? conf->getPrimaryId()
        : shardResults[prng.nextInt32(shardResults.size())].shardTargetId;
    const auto mergingShard = uassertStatusOK(grid.shardRegistry()->getShard(txn, mergingShardId));

    ShardConnection conn(mergingShard->getConnString(), outputNsOrEmpty);
    BSONObj mergedResults =
        aggRunCommand(conn.get(), namespaces, mergeCmd.freeze().toBson(), options);
    conn.done();

    if (auto wcErrorElem = mergedResults["writeConcernError"]) {
        appendWriteConcernErrorToCmdResponse(mergingShardId, wcErrorElem, *result);
    }

    // Copy output from merging (primary) shard to the output object from our command.
    // Also, propagates errmsg and code if ok == false.
    result->appendElementsUnique(mergedResults);

    return getStatusFromCommandResult(result->asTempObj());
}
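
When needSplit is true, splitForSharded() divides the pipeline into a shards part and a merger part. Below is a deliberately reduced sketch of the idea; the real implementation also rewrites stages (for example, $sort becomes a shard-side sort plus a merge sort), and the stage names and predicate here are hypothetical:

#include <string>
#include <utility>
#include <vector>

// Hypothetical predicate: stages that force a merger.
bool mustRunOnMerger(const std::string& stage) {
    return stage == "$sort" || stage == "$group" || stage == "$out";
}

// Everything before the first merge-requiring stage runs on the shards; that
// stage and everything after it run on the merger.
std::pair<std::vector<std::string>, std::vector<std::string>> splitForSharded(
    const std::vector<std::string>& pipeline) {
    std::vector<std::string> shardsPart, mergerPart;
    bool splitting = false;
    for (const auto& stage : pipeline) {
        if (!splitting && mustRunOnMerger(stage))
            splitting = true;
        (splitting ? mergerPart : shardsPart).push_back(stage);
    }
    return {shardsPart, mergerPart};
}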
Example #12
StatusWith<CursorId> ClusterFind::runQuery(OperationContext* txn,
                                           const CanonicalQuery& query,
                                           const ReadPreferenceSetting& readPref,
                                           std::vector<BSONObj>* results,
                                           BSONObj* viewDefinition) {
    invariant(results);

    // Projection on the reserved sort key field is illegal in mongos.
    if (query.getQueryRequest().getProj().hasField(ClusterClientCursorParams::kSortKeyField)) {
        return {ErrorCodes::BadValue,
                str::stream() << "Projection contains illegal field '"
                              << ClusterClientCursorParams::kSortKeyField
                              << "': "
                              << query.getQueryRequest().getProj()};
    }

    auto dbConfig = Grid::get(txn)->catalogCache()->getDatabase(txn, query.nss().db().toString());
    if (dbConfig.getStatus() == ErrorCodes::NamespaceNotFound) {
        // If the database doesn't exist, we successfully return an empty result set without
        // creating a cursor.
        return CursorId(0);
    } else if (!dbConfig.isOK()) {
        return dbConfig.getStatus();
    }

    std::shared_ptr<ChunkManager> chunkManager;
    std::shared_ptr<Shard> primary;
    dbConfig.getValue()->getChunkManagerOrPrimary(txn, query.nss().ns(), chunkManager, primary);

    // Re-target and re-send the initial find command to the shards until we have established the
    // shard version.
    for (size_t retries = 1; retries <= kMaxStaleConfigRetries; ++retries) {
        auto cursorId = runQueryWithoutRetrying(
            txn, query, readPref, chunkManager.get(), std::move(primary), results, viewDefinition);
        if (cursorId.isOK()) {
            return cursorId;
        }
        auto status = std::move(cursorId.getStatus());

        if (!ErrorCodes::isStaleShardingError(status.code()) &&
            status != ErrorCodes::ShardNotFound) {
            // Errors other than trying to reach a non-existent shard or receiving a stale
            // metadata message from MongoD are fatal to the operation. Network errors and
            // replication retries happen at the level of the AsyncResultsMerger.
            return status;
        }

        LOG(1) << "Received error status for query " << redact(query.toStringShort())
               << " on attempt " << retries << " of " << kMaxStaleConfigRetries << ": "
               << redact(status);

        const bool staleEpoch = (status == ErrorCodes::StaleEpoch);
        if (staleEpoch) {
            if (!dbConfig.getValue()->reload(txn)) {
                // If the reload failed, that means the database wasn't found, so
                // successfully return an empty result set without creating a cursor.
                return CursorId(0);
            }
        }
        chunkManager =
            dbConfig.getValue()->getChunkManagerIfExists(txn, query.nss().ns(), true, staleEpoch);
        if (!chunkManager) {
            dbConfig.getValue()->getChunkManagerOrPrimary(
                txn, query.nss().ns(), chunkManager, primary);
        }
    }

    return {ErrorCodes::StaleShardVersion,
            str::stream() << "Retried " << kMaxStaleConfigRetries
                          << " times without successfully establishing shard version."};
}
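runQuery() retries only on the stale-metadata class of errors, refreshing routing information between attempts and giving up after a fixed number of tries. A self-contained sketch of that retry shape, with hand-rolled Status and error codes standing in for the real mongo types:

#include <functional>
#include <string>

// Illustrative stand-ins for Status/StatusWith; not the real mongo types.
struct Status {
    int code = 0;  // 0 == OK
    std::string reason;
    bool isOK() const { return code == 0; }
};

constexpr int kStale = 1;  // the retriable "stale metadata" class of errors
constexpr size_t kMaxStaleConfigRetries = 10;

// Retry an operation while it keeps failing with a retriable (stale) error,
// refreshing the cached routing info between attempts. Non-retriable errors
// and success both end the loop immediately.
Status runWithStaleRetries(const std::function<Status()>& attempt,
                           const std::function<void()>& refreshRoutingInfo) {
    for (size_t retries = 1; retries <= kMaxStaleConfigRetries; ++retries) {
        Status s = attempt();
        if (s.isOK() || s.code != kStale) {
            return s;  // success, or an error that is fatal to the operation
        }
        refreshRoutingInfo();  // e.g. reload the chunk manager / db config
    }
    return {2, "exhausted retries without establishing shard version"};
}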
Example #13
StatusWith<AggregationRequest> AggregationRequest::parseFromBSON(
    NamespaceString nss,
    const BSONObj& cmdObj,
    boost::optional<ExplainOptions::Verbosity> explainVerbosity) {
    // Parse required parameters.
    auto pipelineElem = cmdObj[kPipelineName];
    auto pipeline = AggregationRequest::parsePipelineFromBSON(pipelineElem);
    if (!pipeline.isOK()) {
        return pipeline.getStatus();
    }

    AggregationRequest request(std::move(nss), std::move(pipeline.getValue()));

    const std::initializer_list<StringData> optionsParsedElseWhere = {kPipelineName, kCommandName};

    bool hasCursorElem = false;
    bool hasExplainElem = false;

    bool hasFromMongosElem = false;
    bool hasNeedsMergeElem = false;

    // Parse optional parameters.
    for (auto&& elem : cmdObj) {
        auto fieldName = elem.fieldNameStringData();

        if (QueryRequest::kUnwrappedReadPrefField == fieldName) {
            // We expect this field to be validated elsewhere.
            request.setUnwrappedReadPref(elem.embeddedObject());
        } else if (std::find(optionsParsedElseWhere.begin(),
                             optionsParsedElseWhere.end(),
                             fieldName) != optionsParsedElseWhere.end()) {
            // Ignore options that are parsed elsewhere.
        } else if (kCursorName == fieldName) {
            long long batchSize;
            auto status =
                CursorRequest::parseCommandCursorOptions(cmdObj, kDefaultBatchSize, &batchSize);
            if (!status.isOK()) {
                return status;
            }

            hasCursorElem = true;
            request.setBatchSize(batchSize);
        } else if (kCollationName == fieldName) {
            if (elem.type() != BSONType::Object) {
                return {ErrorCodes::TypeMismatch,
                        str::stream() << kCollationName << " must be an object, not a "
                                      << typeName(elem.type())};
            }
            request.setCollation(elem.embeddedObject().getOwned());
        } else if (QueryRequest::cmdOptionMaxTimeMS == fieldName) {
            auto maxTimeMs = QueryRequest::parseMaxTimeMS(elem);
            if (!maxTimeMs.isOK()) {
                return maxTimeMs.getStatus();
            }

            request.setMaxTimeMS(maxTimeMs.getValue());
        } else if (repl::ReadConcernArgs::kReadConcernFieldName == fieldName) {
            if (elem.type() != BSONType::Object) {
                return {ErrorCodes::TypeMismatch,
                        str::stream() << repl::ReadConcernArgs::kReadConcernFieldName
                                      << " must be an object, not a "
                                      << typeName(elem.type())};
            }
            request.setReadConcern(elem.embeddedObject().getOwned());
        } else if (kHintName == fieldName) {
            if (BSONType::Object == elem.type()) {
                request.setHint(elem.embeddedObject());
            } else if (BSONType::String == elem.type()) {
                request.setHint(BSON("$hint" << elem.valueStringData()));
            } else {
                return Status(ErrorCodes::FailedToParse,
                              str::stream()
                                  << kHintName
                                  << " must be specified as a string representing an index"
                                  << " name, or an object representing an index's key pattern");
            }
        } else if (kCommentName == fieldName) {
            if (elem.type() != BSONType::String) {
                return {ErrorCodes::TypeMismatch,
                        str::stream() << kCommentName << " must be a string, not a "
                                      << typeName(elem.type())};
            }
            request.setComment(elem.str());
        } else if (kExplainName == fieldName) {
            if (elem.type() != BSONType::Bool) {
                return {ErrorCodes::TypeMismatch,
                        str::stream() << kExplainName << " must be a boolean, not a "
                                      << typeName(elem.type())};
            }

            hasExplainElem = true;
            if (elem.Bool()) {
                request.setExplain(ExplainOptions::Verbosity::kQueryPlanner);
            }
        } else if (kFromMongosName == fieldName) {
            if (elem.type() != BSONType::Bool) {
                return {ErrorCodes::TypeMismatch,
                        str::stream() << kFromMongosName << " must be a boolean, not a "
                                      << typeName(elem.type())};
            }

            hasFromMongosElem = true;
            request.setFromMongos(elem.Bool());
        } else if (kNeedsMergeName == fieldName) {
            if (elem.type() != BSONType::Bool) {
                return {ErrorCodes::TypeMismatch,
                        str::stream() << kNeedsMergeName << " must be a boolean, not a "
                                      << typeName(elem.type())};
            }

            hasNeedsMergeElem = true;
            request.setNeedsMerge(elem.Bool());
        } else if (kMergeByPBRTName == fieldName) {
            if (elem.type() != BSONType::Bool) {
                return {ErrorCodes::TypeMismatch,
                        str::stream() << kMergeByPBRTName << " must be a boolean, not a "
                                      << typeName(elem.type())};
            }

            request.setMergeByPBRT(elem.Bool());
        } else if (kAllowDiskUseName == fieldName) {
            if (storageGlobalParams.readOnly) {
                return {ErrorCodes::IllegalOperation,
                        str::stream() << "The '" << kAllowDiskUseName
                                      << "' option is not permitted in read-only mode."};
            } else if (elem.type() != BSONType::Bool) {
                return {ErrorCodes::TypeMismatch,
                        str::stream() << kAllowDiskUseName << " must be a boolean, not a "
                                      << typeName(elem.type())};
            }
            request.setAllowDiskUse(elem.Bool());
        } else if (kExchangeName == fieldName) {
            try {
                IDLParserErrorContext ctx("internalExchange");
                request.setExchangeSpec(ExchangeSpec::parse(ctx, elem.Obj()));
            } catch (const DBException& ex) {
                return ex.toStatus();
            }
        } else if (bypassDocumentValidationCommandOption() == fieldName) {
            request.setBypassDocumentValidation(elem.trueValue());
        } else if (WriteConcernOptions::kWriteConcernField == fieldName) {
            if (elem.type() != BSONType::Object) {
                return {ErrorCodes::TypeMismatch,
                        str::stream() << fieldName << " must be an object, not a "
                                      << typeName(elem.type())};
            }

            WriteConcernOptions writeConcern;
            uassertStatusOK(writeConcern.parse(elem.embeddedObject()));
            request.setWriteConcern(writeConcern);
        } else if (kRuntimeConstants == fieldName) {
            try {
                IDLParserErrorContext ctx("internalRuntimeConstants");
                request.setRuntimeConstants(RuntimeConstants::parse(ctx, elem.Obj()));
            } catch (const DBException& ex) {
                return ex.toStatus();
            }
        } else if (!isGenericArgument(fieldName)) {
            return {ErrorCodes::FailedToParse,
                    str::stream() << "unrecognized field '" << elem.fieldName() << "'"};
        }
    }

    if (explainVerbosity) {
        if (hasExplainElem) {
            return {
                ErrorCodes::FailedToParse,
                str::stream() << "The '" << kExplainName
                              << "' option is illegal when a explain verbosity is also provided"};
        }

        request.setExplain(explainVerbosity);
    }

    // 'hasExplainElem' implies an aggregate command-level explain option, which does not require
    // a cursor argument.
    if (!hasCursorElem && !hasExplainElem) {
        return {ErrorCodes::FailedToParse,
                str::stream()
                    << "The '"
                    << kCursorName
                    << "' option is required, except for aggregate with the explain argument"};
    }

    if (request.getExplain() && cmdObj[WriteConcernOptions::kWriteConcernField]) {
        return {ErrorCodes::FailedToParse,
                str::stream() << "Aggregation explain does not support the'"
                              << WriteConcernOptions::kWriteConcernField
                              << "' option"};
    }

    if (hasNeedsMergeElem && !hasFromMongosElem) {
        return {ErrorCodes::FailedToParse,
                str::stream() << "Cannot specify '" << kNeedsMergeName << "' without '"
                              << kFromMongosName
                              << "'"};
    }

    return request;
}
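The parser above makes a single pass over the command object, dispatching on field name and type-checking each element before use, with unrecognized fields rejected outright. A compact sketch of the same dispatch-and-validate loop over a std::map, assuming illustrative option names (explain, batchSize, comment):

#include <map>
#include <string>
#include <variant>

// Stand-ins: a "BSON element" is modeled as a variant and parse failures as
// an error string; the real code uses BSONElement and Status.
using Element = std::variant<bool, long long, std::string>;

struct ParsedOptions {
    bool explain = false;
    long long batchSize = 101;
    std::string comment;
};

// Walk every field once, dispatch on the name, and type-check before use;
// unrecognized fields are an error, mirroring the loop above.
bool parseOptions(const std::map<std::string, Element>& cmd,
                  ParsedOptions* out,
                  std::string* errmsg) {
    for (const auto& [name, elem] : cmd) {
        if (name == "explain") {
            if (!std::holds_alternative<bool>(elem)) {
                *errmsg = "explain must be a boolean";
                return false;
            }
            out->explain = std::get<bool>(elem);
        } else if (name == "batchSize") {
            if (!std::holds_alternative<long long>(elem)) {
                *errmsg = "batchSize must be a number";
                return false;
            }
            out->batchSize = std::get<long long>(elem);
        } else if (name == "comment") {
            if (!std::holds_alternative<std::string>(elem)) {
                *errmsg = "comment must be a string";
                return false;
            }
            out->comment = std::get<std::string>(elem);
        } else {
            *errmsg = "unrecognized field '" + name + "'";
            return false;
        }
    }
    return true;
}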
Example #14
File: db.cpp Project: Machyne/mongo
ExitCode _initAndListen(int listenPort) {
    Client::initThread("initandlisten");

    _initWireSpec();
    auto globalServiceContext = getGlobalServiceContext();

    globalServiceContext->setFastClockSource(FastClockSourceFactory::create(Milliseconds(10)));
    globalServiceContext->setOpObserver(stdx::make_unique<OpObserver>());

    DBDirectClientFactory::get(globalServiceContext)
        .registerImplementation([](OperationContext* txn) {
            return std::unique_ptr<DBClientBase>(new DBDirectClient(txn));
        });

    const repl::ReplSettings& replSettings = repl::getGlobalReplicationCoordinator()->getSettings();

    {
        ProcessId pid = ProcessId::getCurrent();
        LogstreamBuilder l = log(LogComponent::kControl);
        l << "MongoDB starting : pid=" << pid << " port=" << serverGlobalParams.port
          << " dbpath=" << storageGlobalParams.dbpath;
        if (replSettings.isMaster())
            l << " master=" << replSettings.isMaster();
        if (replSettings.isSlave())
            l << " slave=" << (int)replSettings.isSlave();

        const bool is32bit = sizeof(int*) == 4;
        l << (is32bit ? " 32" : " 64") << "-bit host=" << getHostNameCached() << endl;
    }

    DEV log(LogComponent::kControl) << "DEBUG build (which is slower)" << endl;

#if defined(_WIN32)
    VersionInfoInterface::instance().logTargetMinOS();
#endif

    logProcessDetails();

    checked_cast<ServiceContextMongoD*>(getGlobalServiceContext())->createLockFile();

    transport::TransportLayerLegacy::Options options;
    options.port = listenPort;
    options.ipList = serverGlobalParams.bind_ip;

    auto sep =
        stdx::make_unique<ServiceEntryPointMongod>(getGlobalServiceContext()->getTransportLayer());
    auto sepPtr = sep.get();

    getGlobalServiceContext()->setServiceEntryPoint(std::move(sep));

    // Create, start, and attach the TL
    auto transportLayer = stdx::make_unique<transport::TransportLayerLegacy>(options, sepPtr);
    auto res = transportLayer->setup();
    if (!res.isOK()) {
        error() << "Failed to set up listener: " << res;
        return EXIT_NET_ERROR;
    }

    std::shared_ptr<DbWebServer> dbWebServer;
    if (serverGlobalParams.isHttpInterfaceEnabled) {
        dbWebServer.reset(new DbWebServer(serverGlobalParams.bind_ip,
                                          serverGlobalParams.port + 1000,
                                          getGlobalServiceContext(),
                                          new RestAdminAccess()));
        if (!dbWebServer->setupSockets()) {
            error() << "Failed to set up sockets for HTTP interface during startup.";
            return EXIT_NET_ERROR;
        }
    }

    getGlobalServiceContext()->initializeGlobalStorageEngine();

#ifdef MONGO_CONFIG_WIREDTIGER_ENABLED
    if (WiredTigerCustomizationHooks::get(getGlobalServiceContext())->restartRequired()) {
        exitCleanly(EXIT_CLEAN);
    }
#endif

    // Warn if we detect configurations for multiple registered storage engines in
    // the same configuration file/environment.
    if (serverGlobalParams.parsedOpts.hasField("storage")) {
        BSONElement storageElement = serverGlobalParams.parsedOpts.getField("storage");
        invariant(storageElement.isABSONObj());
        BSONObj storageParamsObj = storageElement.Obj();
        BSONObjIterator i = storageParamsObj.begin();
        while (i.more()) {
            BSONElement e = i.next();
            // Ignore if field name under "storage" matches current storage engine.
            if (storageGlobalParams.engine == e.fieldName()) {
                continue;
            }

            // Warn if field name matches non-active registered storage engine.
            if (getGlobalServiceContext()->isRegisteredStorageEngine(e.fieldName())) {
                warning() << "Detected configuration for non-active storage engine "
                          << e.fieldName() << " when current storage engine is "
                          << storageGlobalParams.engine;
            }
        }
    }

    if (!getGlobalServiceContext()->getGlobalStorageEngine()->getSnapshotManager()) {
        if (moe::startupOptionsParsed.count("replication.enableMajorityReadConcern") &&
            moe::startupOptionsParsed["replication.enableMajorityReadConcern"].as<bool>()) {
            // Note: we are intentionally only erroring if the user explicitly requested that we
            // enable majority read concern. We do not error if they are implicitly enabled for
            // CSRS because a required step in the upgrade procedure can involve an mmapv1 node in
            // the CSRS in the REMOVED state. This is handled by the TopologyCoordinator.
            invariant(replSettings.isMajorityReadConcernEnabled());
            severe() << "Majority read concern requires a storage engine that supports"
                     << " snapshots, such as wiredTiger. " << storageGlobalParams.engine
                     << " does not support snapshots.";
            exitCleanly(EXIT_BADOPTIONS);
        }
    }

    logMongodStartupWarnings(storageGlobalParams, serverGlobalParams);

    {
        stringstream ss;
        ss << endl;
        ss << "*********************************************************************" << endl;
        ss << " ERROR: dbpath (" << storageGlobalParams.dbpath << ") does not exist." << endl;
        ss << " Create this directory or give existing directory in --dbpath." << endl;
        ss << " See http://dochub.mongodb.org/core/startingandstoppingmongo" << endl;
        ss << "*********************************************************************" << endl;
        uassert(10296, ss.str().c_str(), boost::filesystem::exists(storageGlobalParams.dbpath));
    }

    {
        stringstream ss;
        ss << "repairpath (" << storageGlobalParams.repairpath << ") does not exist";
        uassert(12590, ss.str().c_str(), boost::filesystem::exists(storageGlobalParams.repairpath));
    }

    // TODO:  This should go into a MONGO_INITIALIZER once we have figured out the correct
    // dependencies.
    if (snmpInit) {
        snmpInit();
    }

    if (!storageGlobalParams.readOnly) {
        boost::filesystem::remove_all(storageGlobalParams.dbpath + "/_tmp/");
    }

    if (mmapv1GlobalOptions.journalOptions & MMAPV1Options::JournalRecoverOnly)
        return EXIT_NET_ERROR;

    if (mongodGlobalParams.scriptingEnabled) {
        ScriptEngine::setup();
    }

    auto startupOpCtx = getGlobalServiceContext()->makeOperationContext(&cc());

    repairDatabasesAndCheckVersion(startupOpCtx.get());

    if (storageGlobalParams.upgrade) {
        log() << "finished checking dbs";
        exitCleanly(EXIT_CLEAN);
    }

    uassertStatusOK(getGlobalAuthorizationManager()->initialize(startupOpCtx.get()));

    /* this is for security on certain platforms (nonce generation) */
    srand((unsigned)(curTimeMicros64() ^ startupSrandTimer.micros()));

    // The snapshot thread provides historical collection level and lock statistics for use
    // by the web interface. Only needed when HTTP is enabled.
    if (serverGlobalParams.isHttpInterfaceEnabled) {
        statsSnapshotThread.go();

        invariant(dbWebServer);
        stdx::thread web(stdx::bind(&webServerListenThread, dbWebServer));
        web.detach();
    }

    AuthorizationManager* globalAuthzManager = getGlobalAuthorizationManager();
    if (globalAuthzManager->shouldValidateAuthSchemaOnStartup()) {
        Status status = authindex::verifySystemIndexes(startupOpCtx.get());
        if (!status.isOK()) {
            log() << redact(status);
            exitCleanly(EXIT_NEED_UPGRADE);
        }

        // SERVER-14090: Verify that auth schema version is schemaVersion26Final.
        int foundSchemaVersion;
        status =
            globalAuthzManager->getAuthorizationVersion(startupOpCtx.get(), &foundSchemaVersion);
        if (!status.isOK()) {
            log() << "Auth schema version is incompatible: "
                  << "User and role management commands require auth data to have "
                  << "at least schema version " << AuthorizationManager::schemaVersion26Final
                  << " but startup could not verify schema version: " << status;
            exitCleanly(EXIT_NEED_UPGRADE);
        }
        if (foundSchemaVersion < AuthorizationManager::schemaVersion26Final) {
            log() << "Auth schema version is incompatible: "
                  << "User and role management commands require auth data to have "
                  << "at least schema version " << AuthorizationManager::schemaVersion26Final
                  << " but found " << foundSchemaVersion << ". In order to upgrade "
                  << "the auth schema, first downgrade MongoDB binaries to version "
                  << "2.6 and then run the authSchemaUpgrade command.";
            exitCleanly(EXIT_NEED_UPGRADE);
        }
    } else if (globalAuthzManager->isAuthEnabled()) {
        error() << "Auth must be disabled when starting without auth schema validation";
        exitCleanly(EXIT_BADOPTIONS);
    } else {
        // If authSchemaValidation is disabled and server is running without auth,
        // warn the user and continue startup without authSchema metadata checks.
        log() << startupWarningsLog;
        log() << "** WARNING: Startup auth schema validation checks are disabled for the "
                 "database."
              << startupWarningsLog;
        log() << "**          This mode should only be used to manually repair corrupted auth "
                 "data."
              << startupWarningsLog;
    }

    auto shardingInitialized =
        uassertStatusOK(ShardingState::get(startupOpCtx.get())
                            ->initializeShardingAwarenessIfNeeded(startupOpCtx.get()));
    if (shardingInitialized) {
        reloadShardRegistryUntilSuccess(startupOpCtx.get());
    }

    if (!storageGlobalParams.readOnly) {
        logStartup(startupOpCtx.get());

        startFTDC();

        getDeleter()->startWorkers();

        restartInProgressIndexesFromLastShutdown(startupOpCtx.get());

        if (serverGlobalParams.clusterRole == ClusterRole::ShardServer) {
            // Note: For replica sets, ShardingStateRecovery happens on transition to primary.
            if (!repl::getGlobalReplicationCoordinator()->isReplEnabled()) {
                uassertStatusOK(ShardingStateRecovery::recover(startupOpCtx.get()));
            }
        } else if (serverGlobalParams.clusterRole == ClusterRole::ConfigServer) {
            uassertStatusOK(
                initializeGlobalShardingStateForMongod(startupOpCtx.get(),
                                                       ConnectionString::forLocal(),
                                                       kDistLockProcessIdForConfigServer));
            Balancer::create(startupOpCtx->getServiceContext());
        }

        repl::getGlobalReplicationCoordinator()->startup(startupOpCtx.get());

        const unsigned long long missingRepl =
            checkIfReplMissingFromCommandLine(startupOpCtx.get());
        if (missingRepl) {
            log() << startupWarningsLog;
            log() << "** WARNING: mongod started without --replSet yet " << missingRepl
                  << " documents are present in local.system.replset" << startupWarningsLog;
            log() << "**          Restart with --replSet unless you are doing maintenance and "
                  << " no other clients are connected." << startupWarningsLog;
            log() << "**          The TTL collection monitor will not start because of this."
                  << startupWarningsLog;
            log() << "**         ";
            log() << " For more info see http://dochub.mongodb.org/core/ttlcollections";
            log() << startupWarningsLog;
        } else {
            startTTLBackgroundJob();
        }

        if (!replSettings.usingReplSets() && !replSettings.isSlave() &&
            storageGlobalParams.engine != "devnull") {
            ScopedTransaction transaction(startupOpCtx.get(), MODE_X);
            Lock::GlobalWrite lk(startupOpCtx.get()->lockState());
            FeatureCompatibilityVersion::setIfCleanStartup(
                startupOpCtx.get(), repl::StorageInterface::get(getGlobalServiceContext()));
        }

        if (replSettings.usingReplSets() || (!replSettings.isMaster() && replSettings.isSlave()) ||
            !internalValidateFeaturesAsMaster) {
            serverGlobalParams.featureCompatibility.validateFeaturesAsMaster.store(false);
        }
    }

    startClientCursorMonitor();

    PeriodicTask::startRunningPeriodicTasks();

    // MessageServer::run will return when the exit code closes its socket, and we don't
    // need the operation context anymore.
    startupOpCtx.reset();

    auto start = getGlobalServiceContext()->addAndStartTransportLayer(std::move(transportLayer));
    if (!start.isOK()) {
        error() << "Failed to start the listener: " << start.toString();
        return EXIT_NET_ERROR;
    }

#ifndef _WIN32
    mongo::signalForkSuccess();
#else
    if (ntservice::shouldStartService()) {
        ntservice::reportStatus(SERVICE_RUNNING);
        log() << "Service running";
    }
#endif

    return waitForShutdown();
}
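_initAndListen() is essentially a linear sequence of initialization steps in which the first failure aborts startup with a step-specific exit code. A minimal sketch of that shape (the step list, names, and exit-code values are all illustrative):

#include <cstdio>
#include <functional>
#include <string>
#include <vector>

enum ExitCode { EXIT_CLEAN = 0, EXIT_NET_ERROR, EXIT_BADOPTIONS };  // illustrative values

struct StartupStep {
    std::string name;
    std::function<bool()> run;  // returns false on failure
    ExitCode onFailure;
};

// Run each initialization step in order; the first failure logs the step
// name and returns its associated exit code, as _initAndListen does with
// listener setup, storage init, auth checks, etc.
ExitCode runStartupSequence(const std::vector<StartupStep>& steps) {
    for (const auto& step : steps) {
        if (!step.run()) {
            std::fprintf(stderr, "startup step failed: %s\n", step.name.c_str());
            return step.onFailure;
        }
    }
    return EXIT_CLEAN;
}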
StatusWith<Shard::CommandResponse> ShardingCatalogManager::_runCommandForAddShard(
    OperationContext* opCtx,
    RemoteCommandTargeter* targeter,
    const std::string& dbName,
    const BSONObj& cmdObj) {
    auto swHost = targeter->findHost(opCtx, ReadPreferenceSetting{ReadPreference::PrimaryOnly});
    if (!swHost.isOK()) {
        return swHost.getStatus();
    }
    auto host = std::move(swHost.getValue());

    executor::RemoteCommandRequest request(
        host, dbName, cmdObj, rpc::makeEmptyMetadata(), nullptr, Seconds(30));

    executor::RemoteCommandResponse response =
        Status(ErrorCodes::InternalError, "Internal error running command");

    auto swCallbackHandle = _executorForAddShard->scheduleRemoteCommand(
        request, [&response](const executor::TaskExecutor::RemoteCommandCallbackArgs& args) {
            response = args.response;
        });
    if (!swCallbackHandle.isOK()) {
        return swCallbackHandle.getStatus();
    }

    // Block until the command is carried out
    _executorForAddShard->wait(swCallbackHandle.getValue());

    if (response.status == ErrorCodes::ExceededTimeLimit) {
        LOG(0) << "Operation timed out with status " << redact(response.status);
    }

    if (!response.isOK()) {
        if (!Shard::shouldErrorBePropagated(response.status.code())) {
            return {ErrorCodes::OperationFailed,
                    str::stream() << "failed to run command " << cmdObj
                                  << " when attempting to add shard "
                                  << targeter->connectionString().toString()
                                  << causedBy(response.status)};
        }
        return response.status;
    }

    BSONObj result = response.data.getOwned();

    Status commandStatus = getStatusFromCommandResult(result);
    if (!Shard::shouldErrorBePropagated(commandStatus.code())) {
        commandStatus = {ErrorCodes::OperationFailed,
                         str::stream() << "failed to run command " << cmdObj
                                       << " when attempting to add shard "
                                       << targeter->connectionString().toString()
                                       << causedBy(commandStatus)};
    }

    Status writeConcernStatus = getWriteConcernStatusFromCommandResult(result);
    if (!Shard::shouldErrorBePropagated(writeConcernStatus.code())) {
        writeConcernStatus = {ErrorCodes::OperationFailed,
                              str::stream() << "failed to satisfy writeConcern for command "
                                            << cmdObj
                                            << " when attempting to add shard "
                                            << targeter->connectionString().toString()
                                            << causedBy(writeConcernStatus)};
    }

    return Shard::CommandResponse(std::move(host),
                                  std::move(result),
                                  response.metadata.getOwned(),
                                  std::move(commandStatus),
                                  std::move(writeConcernStatus));
}
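_runCommandForAddShard() schedules the remote command asynchronously, captures the response in a callback, and then blocks until the executor signals completion. The same rendezvous can be sketched with a std::promise/std::future pair (Response and the echo body are placeholders, not the real executor API):

#include <future>
#include <string>
#include <thread>

struct Response {
    bool ok = false;
    std::string data;
};

// Schedule work on another thread and block until its callback delivers the
// response -- the same shape as scheduleRemoteCommand() followed by wait().
Response runCommandAndWait(const std::string& cmd) {
    std::promise<Response> promise;
    std::future<Response> result = promise.get_future();

    std::thread worker([cmd, &promise]() {
        // The callback fills in the response; here we just echo the command.
        promise.set_value(Response{true, "ran: " + cmd});
    });

    Response response = result.get();  // block until the command is carried out
    worker.join();
    return response;
}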
Example #16
Status KeyGenerator::generateNewKeysIfNeeded(OperationContext* opCtx) {

    if (MONGO_FAIL_POINT(disableKeyGeneration)) {
        return {ErrorCodes::FailPointEnabled, "key generation disabled"};
    }

    auto currentTime = LogicalClock::get(opCtx)->getClusterTime();
    auto keyStatus = _client->getNewKeys(opCtx, _purpose, currentTime);

    if (!keyStatus.isOK()) {
        return keyStatus.getStatus();
    }

    const auto& newKeys = keyStatus.getValue();
    auto keyIter = newKeys.cbegin();

    LogicalTime currentKeyExpiresAt;

    long long keyId = currentTime.asTimestamp().asLL();

    if (keyIter == newKeys.cend()) {
        currentKeyExpiresAt = addSeconds(currentTime, _keyValidForInterval);
        auto status = insertNewKey(opCtx, _client, keyId, _purpose, currentKeyExpiresAt);

        if (!status.isOK()) {
            return status;
        }

        keyId++;
    } else if (keyIter->getExpiresAt() < currentTime) {
        currentKeyExpiresAt = addSeconds(currentTime, _keyValidForInterval);
        auto status = insertNewKey(opCtx, _client, keyId, _purpose, currentKeyExpiresAt);

        if (!status.isOK()) {
            return status;
        }

        keyId++;
        ++keyIter;
    } else {
        currentKeyExpiresAt = keyIter->getExpiresAt();
        ++keyIter;
    }

    // Create a new key in advance if we don't have a key on standby after the current one
    // expires.
    // Note: Convert this block into a loop if more reserved keys are desired.
    if (keyIter == newKeys.cend()) {
        auto reserveKeyExpiresAt = addSeconds(currentKeyExpiresAt, _keyValidForInterval);
        auto status = insertNewKey(opCtx, _client, keyId, _purpose, reserveKeyExpiresAt);

        if (!status.isOK()) {
            return status;
        }
    } else if (keyIter->getExpiresAt() < currentTime) {
        currentKeyExpiresAt = addSeconds(currentKeyExpiresAt, _keyValidForInterval);
        auto status = insertNewKey(opCtx, _client, keyId, _purpose, currentKeyExpiresAt);

        if (!status.isOK()) {
            return status;
        }
    }

    return Status::OK();
}
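The generator's invariant is that there is always one key valid now plus one standby key covering the interval after it expires; new keys are minted only to restore that invariant. A tiny sketch of the counting logic, with integer timestamps standing in for LogicalTime:

#include <cstdint>
#include <vector>

using TimePoint = std::uint64_t;  // stand-in for LogicalTime

// Given the expiration times of the known keys and the current time, decide
// how many new keys must be minted so that one valid key is active now and
// one standby key covers the period after it expires.
int keysToGenerate(const std::vector<TimePoint>& expirations, TimePoint now) {
    int valid = 0;
    for (TimePoint expiresAt : expirations) {
        if (expiresAt >= now) {
            ++valid;  // still usable now or later
        }
    }
    // Want a current key plus one reserve; generate whatever is missing.
    return valid >= 2 ? 0 : 2 - valid;
}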
StatusWith<boost::optional<ShardType>> ShardingCatalogManager::_checkIfShardExists(
    OperationContext* opCtx,
    const ConnectionString& proposedShardConnectionString,
    const std::string* proposedShardName,
    long long proposedShardMaxSize) {
    // Check whether any host in the connection is already part of the cluster.
    const auto existingShards = Grid::get(opCtx)->catalogClient()->getAllShards(
        opCtx, repl::ReadConcernLevel::kLocalReadConcern);
    if (!existingShards.isOK()) {
        return existingShards.getStatus().withContext(
            "Failed to load existing shards during addShard");
    }

    // Now check if this shard already exists - if it already exists *with the same options* then
    // the addShard request can return success early without doing anything more.
    for (const auto& existingShard : existingShards.getValue().value) {
        auto swExistingShardConnStr = ConnectionString::parse(existingShard.getHost());
        if (!swExistingShardConnStr.isOK()) {
            return swExistingShardConnStr.getStatus();
        }
        auto existingShardConnStr = std::move(swExistingShardConnStr.getValue());

        // Function for determining if the options for the shard that is being added match the
        // options of an existing shard that conflicts with it.
        auto shardsAreEquivalent = [&]() {
            if (proposedShardName && *proposedShardName != existingShard.getName()) {
                return false;
            }
            if (proposedShardConnectionString.type() != existingShardConnStr.type()) {
                return false;
            }
            if (proposedShardConnectionString.type() == ConnectionString::SET &&
                proposedShardConnectionString.getSetName() != existingShardConnStr.getSetName()) {
                return false;
            }
            if (proposedShardMaxSize != existingShard.getMaxSizeMB()) {
                return false;
            }
            return true;
        };

        if (existingShardConnStr.type() == ConnectionString::SET &&
            proposedShardConnectionString.type() == ConnectionString::SET &&
            existingShardConnStr.getSetName() == proposedShardConnectionString.getSetName()) {
            // An existing shard has the same replica set name as the shard being added.
            // If the options aren't the same, then this is an error,
            // but if the options match then the addShard operation should be immediately
            // considered a success and terminated.
            if (shardsAreEquivalent()) {
                return {existingShard};
            } else {
                return {ErrorCodes::IllegalOperation,
                        str::stream() << "A shard already exists containing the replica set '"
                                      << existingShardConnStr.getSetName()
                                      << "'"};
            }
        }

        for (const auto& existingHost : existingShardConnStr.getServers()) {
            // Look if any of the hosts in the existing shard are present within the shard trying
            // to be added.
            for (const auto& addingHost : proposedShardConnectionString.getServers()) {
                if (existingHost == addingHost) {
                    // At least one of the hosts in the shard being added already exists in an
                    // existing shard.  If the options aren't the same, then this is an error,
                    // but if the options match then the addShard operation should be immediately
                    // considered a success and terminated.
                    if (shardsAreEquivalent()) {
                        return {existingShard};
                    } else {
                        return {ErrorCodes::IllegalOperation,
                                str::stream() << "'" << addingHost.toString() << "' "
                                              << "is already a member of the existing shard '"
                                              << existingShard.getHost()
                                              << "' ("
                                              << existingShard.getName()
                                              << ")."};
                    }
                }
            }
        }

        if (proposedShardName && *proposedShardName == existingShard.getName()) {
            // If we get here then we're trying to add a shard with the same name as an existing
            // shard, but there was no overlap in the hosts between the existing shard and the
            // proposed connection string for the new shard.
            return {ErrorCodes::IllegalOperation,
                    str::stream() << "A shard named " << *proposedShardName << " already exists"};
        }
    }

    return {boost::none};
}
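_checkIfShardExists() is what makes addShard idempotent: an exact match with an existing shard returns that shard so a retried request succeeds as a no-op, while a partial match (same identity, different options) is an error. A reduced sketch of that three-way decision, with a flattened ShardEntry standing in for the real shard metadata:

#include <optional>
#include <string>
#include <vector>

struct ShardEntry {
    std::string name;
    std::string host;
    long long maxSizeMB = 0;
};

// Returns the existing entry when the proposed shard exactly matches one
// already registered (a retried addShard is then a no-op); sets *conflict
// when the identity overlaps but the options differ.
std::optional<ShardEntry> findEquivalentShard(const std::vector<ShardEntry>& existing,
                                              const ShardEntry& proposed,
                                              bool* conflict) {
    *conflict = false;
    for (const auto& shard : existing) {
        bool overlap = (shard.name == proposed.name) || (shard.host == proposed.host);
        if (!overlap) {
            continue;
        }
        bool equivalent = shard.name == proposed.name && shard.host == proposed.host &&
            shard.maxSizeMB == proposed.maxSizeMB;
        if (equivalent) {
            return shard;  // already added with the same options: succeed early
        }
        *conflict = true;  // same identity, different options: an error
        return std::nullopt;
    }
    return std::nullopt;  // genuinely new shard
}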
void SyncSourceFeedback::run(BackgroundSync* bgsync) {
    Client::initThread("SyncSourceFeedback");

    // Task executor used to run replSetUpdatePosition command on sync source.
    auto net = executor::makeNetworkInterface("NetworkInterfaceASIO-SyncSourceFeedback");
    auto pool = stdx::make_unique<executor::NetworkInterfaceThreadPool>(net.get());
    executor::ThreadPoolTaskExecutor executor(std::move(pool), std::move(net));
    executor.startup();
    ON_BLOCK_EXIT([&executor]() {
        executor.shutdown();
        executor.join();
    });

    HostAndPort syncTarget;

    // keepAliveInterval indicates how frequently to forward progress in the absence of updates.
    Milliseconds keepAliveInterval(0);

    while (true) {  // breaks once _shutdownSignaled is true
        auto txn = cc().makeOperationContext();

        if (keepAliveInterval == Milliseconds(0)) {
            keepAliveInterval = calculateKeepAliveInterval(txn.get(), _mtx);
        }

        {
            // Take the SyncSourceFeedback lock before calling into ReplicationCoordinator
            // to avoid a deadlock, because ReplicationCoordinator could conceivably call
            // back into this class.
            stdx::unique_lock<stdx::mutex> lock(_mtx);
            while (!_positionChanged && !_shutdownSignaled) {
                if (_cond.wait_for(lock, keepAliveInterval.toSystemDuration()) ==
                        stdx::cv_status::timeout) {
                    MemberState state = ReplicationCoordinator::get(txn.get())->getMemberState();
                    if (!(state.primary() || state.startup())) {
                        break;
                    }
                }
            }

            if (_shutdownSignaled) {
                break;
            }

            _positionChanged = false;
        }

        {
            stdx::lock_guard<stdx::mutex> lock(_mtx);
            MemberState state = ReplicationCoordinator::get(txn.get())->getMemberState();
            if (state.primary() || state.startup()) {
                continue;
            }
        }

        const HostAndPort target = bgsync->getSyncTarget();
        // Log sync source changes.
        if (target.empty()) {
            if (syncTarget != target) {
                syncTarget = target;
            }
            // Loop back around again; the keepalive functionality will cause us to retry
            continue;
        }

        if (syncTarget != target) {
            LOG(1) << "setting syncSourceFeedback to " << target;
            syncTarget = target;

            // Update keepalive value from config.
            auto oldKeepAliveInterval = keepAliveInterval;
            keepAliveInterval = calculateKeepAliveInterval(txn.get(), _mtx);
            if (oldKeepAliveInterval != keepAliveInterval) {
                LOG(1) << "new syncSourceFeedback keep alive duration = " << keepAliveInterval
                       << " (previously " << oldKeepAliveInterval << ")";
            }
        }

        Reporter reporter(
            &executor,
            makePrepareReplSetUpdatePositionCommandFn(txn.get(), _mtx, syncTarget, bgsync),
            syncTarget,
            keepAliveInterval);
        {
            stdx::lock_guard<stdx::mutex> lock(_mtx);
            _reporter = &reporter;
        }
        ON_BLOCK_EXIT([this]() {
            stdx::lock_guard<stdx::mutex> lock(_mtx);
            _reporter = nullptr;
        });

        auto status = _updateUpstream(txn.get(), bgsync);
        if (!status.isOK()) {
            LOG(1) << "The replication progress command (replSetUpdatePosition) failed and will be "
                   "retried: "
                   << status;
        }
    }
}
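The heart of the loop above is a condition-variable wait that ends either because the position changed, shutdown was signaled, or the keepalive interval expired (in which case progress is forwarded anyway). A standalone sketch of that wait, using std::condition_variable directly:

#include <chrono>
#include <condition_variable>
#include <mutex>

// Wait until either new progress is signaled or the keepalive interval
// elapses; a timeout means "send an update anyway so upstream knows we are
// alive", which is the shape of the wait_for loop above.
bool waitForProgressOrKeepAlive(std::mutex& mtx,
                                std::condition_variable& cond,
                                bool& positionChanged,
                                bool& shutdownSignaled,
                                std::chrono::milliseconds keepAliveInterval) {
    std::unique_lock<std::mutex> lock(mtx);
    while (!positionChanged && !shutdownSignaled) {
        if (cond.wait_for(lock, keepAliveInterval) == std::cv_status::timeout) {
            return false;  // keepalive expired: forward progress anyway
        }
    }
    positionChanged = false;
    return !shutdownSignaled;  // true means a real position change arrived
}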
StatusWith<std::string> ShardingCatalogManager::addShard(
    OperationContext* opCtx,
    const std::string* shardProposedName,
    const ConnectionString& shardConnectionString,
    const long long maxSize) {
    if (shardConnectionString.type() == ConnectionString::INVALID) {
        return {ErrorCodes::BadValue, "Invalid connection string"};
    }

    if (shardProposedName && shardProposedName->empty()) {
        return {ErrorCodes::BadValue, "shard name cannot be empty"};
    }

    // Only one addShard operation can be in progress at a time.
    Lock::ExclusiveLock lk(opCtx->lockState(), _kShardMembershipLock);

    // Check if this shard has already been added (can happen in the case of a retry after a network
    // error, for example) and thus this addShard request should be considered a no-op.
    auto existingShard =
        _checkIfShardExists(opCtx, shardConnectionString, shardProposedName, maxSize);
    if (!existingShard.isOK()) {
        return existingShard.getStatus();
    }
    if (existingShard.getValue()) {
        // These hosts already belong to an existing shard, so report success and terminate the
        // addShard request.  Make sure to set the last optime for the client to the system last
        // optime so that we'll still wait for replication so that this state is visible in the
        // committed snapshot.
        repl::ReplClientInfo::forClient(opCtx->getClient()).setLastOpToSystemLastOpTime(opCtx);
        return existingShard.getValue()->getName();
    }

    // Force a reload of the ShardRegistry to ensure that, in case this addShard is to re-add a
    // replica set that has recently been removed, we have detached the ReplicaSetMonitor for the
    // set with that setName from the ReplicaSetMonitorManager and will create a new
    // ReplicaSetMonitor when targeting the set below.
    // Note: This is necessary because as of 3.4, removeShard is performed by mongos (unlike
    // addShard), so the ShardRegistry is not synchronously reloaded on the config server when a
    // shard is removed.
    if (!Grid::get(opCtx)->shardRegistry()->reload(opCtx)) {
        // If the first reload joined an existing one, call reload again to ensure the reload is
        // fresh.
        Grid::get(opCtx)->shardRegistry()->reload(opCtx);
    }

    // TODO: Don't create a detached Shard object, create a detached RemoteCommandTargeter instead.
    const std::shared_ptr<Shard> shard{
        Grid::get(opCtx)->shardRegistry()->createConnection(shardConnectionString)};
    invariant(shard);
    auto targeter = shard->getTargeter();

    auto stopMonitoringGuard = MakeGuard([&] {
        if (shardConnectionString.type() == ConnectionString::SET) {
            // This is a workaround for the case where we could have some bad shard being
            // requested to be added and we put that bad connection string on the global replica set
            // monitor registry. It needs to be cleaned up so that when a correct replica set is
            // added, it will be recreated.
            ReplicaSetMonitor::remove(shardConnectionString.getSetName());
        }
    });

    // Validate the specified connection string may serve as shard at all
    auto shardStatus =
        _validateHostAsShard(opCtx, targeter, shardProposedName, shardConnectionString);
    if (!shardStatus.isOK()) {
        return shardStatus.getStatus();
    }
    ShardType& shardType = shardStatus.getValue();

    // Check that none of the existing shard candidate's dbs exist already
    auto dbNamesStatus = _getDBNamesListFromShard(opCtx, targeter);
    if (!dbNamesStatus.isOK()) {
        return dbNamesStatus.getStatus();
    }

    for (const auto& dbName : dbNamesStatus.getValue()) {
        auto dbt = Grid::get(opCtx)->catalogClient()->getDatabase(
            opCtx, dbName, repl::ReadConcernLevel::kLocalReadConcern);
        if (dbt.isOK()) {
            const auto& dbDoc = dbt.getValue().value;
            return Status(ErrorCodes::OperationFailed,
                          str::stream() << "can't add shard "
                                        << "'"
                                        << shardConnectionString.toString()
                                        << "'"
                                        << " because a local database '"
                                        << dbName
                                        << "' exists in another "
                                        << dbDoc.getPrimary());
        } else if (dbt != ErrorCodes::NamespaceNotFound) {
            return dbt.getStatus();
        }
    }

    // Check that the shard candidate does not have a local config.system.sessions collection
    auto res = _dropSessionsCollection(opCtx, targeter);

    if (!res.isOK()) {
        return res.withContext(
            "can't add shard with a local copy of config.system.sessions, please drop this "
            "collection from the shard manually and try again.");
    }

    // If a name for a shard wasn't provided, generate one
    if (shardType.getName().empty()) {
        auto result = generateNewShardName(opCtx);
        if (!result.isOK()) {
            return result.getStatus();
        }
        shardType.setName(result.getValue());
    }

    if (maxSize > 0) {
        shardType.setMaxSizeMB(maxSize);
    }

    // Insert a shardIdentity document onto the shard. This also triggers sharding initialization on
    // the shard.
    LOG(2) << "going to insert shardIdentity document into shard: " << shardType;
    auto commandRequest = createShardIdentityUpsertForAddShard(opCtx, shardType.getName());
    auto swCommandResponse = _runCommandForAddShard(opCtx, targeter.get(), "admin", commandRequest);
    if (!swCommandResponse.isOK()) {
        return swCommandResponse.getStatus();
    }
    auto commandResponse = std::move(swCommandResponse.getValue());
    BatchedCommandResponse batchResponse;
    auto batchResponseStatus =
        Shard::CommandResponse::processBatchWriteResponse(commandResponse, &batchResponse);
    if (!batchResponseStatus.isOK()) {
        return batchResponseStatus;
    }

    // The featureCompatibilityVersion should be the same throughout the cluster. We don't
    // explicitly send writeConcern majority to the added shard, because a 3.4 mongod will reject
    // it (setFCV did not support writeConcern until 3.6), and a 3.6 mongod will still default to
    // majority writeConcern.
    //
    // TODO SERVER-32045: propagate the user's writeConcern
    auto versionResponse = _runCommandForAddShard(
        opCtx,
        targeter.get(),
        "admin",
        BSON(FeatureCompatibilityVersion::kCommandName << FeatureCompatibilityVersion::toString(
                 serverGlobalParams.featureCompatibility.getVersion())));
    if (!versionResponse.isOK()) {
        return versionResponse.getStatus();
    }

    if (!versionResponse.getValue().commandStatus.isOK()) {
        return versionResponse.getValue().commandStatus;
    }

    log() << "going to insert new entry for shard into config.shards: " << shardType.toString();

    Status result = Grid::get(opCtx)->catalogClient()->insertConfigDocument(
        opCtx,
        ShardType::ConfigNS,
        shardType.toBSON(),
        ShardingCatalogClient::kMajorityWriteConcern);
    if (!result.isOK()) {
        log() << "error adding shard: " << shardType.toBSON() << " err: " << result.reason();
        return result;
    }

    // Add all databases which were discovered on the new shard
    for (const auto& dbName : dbNamesStatus.getValue()) {
        DatabaseType dbt(dbName, shardType.getName(), false);
        Status status = Grid::get(opCtx)->catalogClient()->updateDatabase(opCtx, dbName, dbt);
        if (!status.isOK()) {
            log() << "adding shard " << shardConnectionString.toString()
                  << " even though could not add database " << dbName;
        }
    }

    // Record in changelog
    BSONObjBuilder shardDetails;
    shardDetails.append("name", shardType.getName());
    shardDetails.append("host", shardConnectionString.toString());

    Grid::get(opCtx)
        ->catalogClient()
        ->logChange(
            opCtx, "addShard", "", shardDetails.obj(), ShardingCatalogClient::kMajorityWriteConcern)
        .transitional_ignore();

    // Ensure the added shard is visible to this process.
    auto shardRegistry = Grid::get(opCtx)->shardRegistry();
    if (!shardRegistry->getShard(opCtx, shardType.getName()).isOK()) {
        return {ErrorCodes::OperationFailed,
                "Could not find shard metadata for shard after adding it. This most likely "
                "indicates that the shard was removed immediately after it was added."};
    }
    stopMonitoringGuard.Dismiss();

    return shardType.getName();
}
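addShard() leans on a dismissible cleanup guard: the ReplicaSetMonitor removal runs on every early-return path and is dismissed only once the shard is fully added. A minimal guard in the spirit of MakeGuard/Dismiss (the usage comment mirrors the flow above; removeReplicaSetMonitor is an illustrative helper name):

#include <functional>
#include <utility>

// A dismissible guard: the cleanup runs on every early return unless the
// happy path explicitly dismisses it.
class ScopeGuard {
public:
    explicit ScopeGuard(std::function<void()> onExit) : _onExit(std::move(onExit)) {}
    ~ScopeGuard() {
        if (_armed)
            _onExit();
    }
    void dismiss() { _armed = false; }

    ScopeGuard(const ScopeGuard&) = delete;
    ScopeGuard& operator=(const ScopeGuard&) = delete;

private:
    std::function<void()> _onExit;
    bool _armed = true;
};

// Usage mirrors addShard(): arm the cleanup first, dismiss only on success.
//   ScopeGuard stopMonitoring([&] { removeReplicaSetMonitor(setName); });
//   ... many early-return validation steps ...
//   stopMonitoring.dismiss();  // reached only when the shard was fully added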
Example #20
void OplogReader::connectToSyncSource(OperationContext* txn,
                                      const OpTime& lastOpTimeFetched,
                                      ReplicationCoordinator* replCoord) {
    const Timestamp sentinelTimestamp(duration_cast<Seconds>(Milliseconds(curTimeMillis64())), 0);
    const OpTime sentinel(sentinelTimestamp, std::numeric_limits<long long>::max());
    OpTime oldestOpTimeSeen = sentinel;

    invariant(conn() == NULL);

    while (true) {
        HostAndPort candidate = replCoord->chooseNewSyncSource(lastOpTimeFetched.getTimestamp());

        if (candidate.empty()) {
            if (oldestOpTimeSeen == sentinel) {
                // If, in this invocation of connectToSyncSource(), we did not successfully
                // connect to any node ahead of us, we apparently have no sync sources to
                // connect to. This situation is common; e.g. if there are no writes to the
                // primary at the moment.
                return;
            }

            // Connected to at least one member, but in all cases we were too stale to use them
            // as a sync source.
            error() << "too stale to catch up -- entering maintenance mode";
            log() << "our last optime : " << lastOpTimeFetched;
            log() << "oldest available is " << oldestOpTimeSeen;
            log() << "See http://dochub.mongodb.org/core/resyncingaverystalereplicasetmember";
            setMinValid(txn, {lastOpTimeFetched, oldestOpTimeSeen});
            auto status = replCoord->setMaintenanceMode(true);
            if (!status.isOK()) {
                warning() << "Failed to transition into maintenance mode.";
            }
            bool worked = replCoord->setFollowerMode(MemberState::RS_RECOVERING);
            if (!worked) {
                warning() << "Failed to transition into " << MemberState(MemberState::RS_RECOVERING)
                          << ". Current state: " << replCoord->getMemberState();
            }
            return;
        }

        if (!connect(candidate)) {
            LOG(2) << "can't connect to " << candidate.toString() << " to read operations";
            resetConnection();
            replCoord->blacklistSyncSource(candidate, Date_t::now() + Seconds(10));
            continue;
        }
        // Read the first (oldest) op and confirm that it's not newer than our last
        // fetched op. Otherwise, we have fallen off the back of that source's oplog.
        BSONObj remoteOldestOp(findOne(rsOplogName.c_str(), Query()));
        OpTime remoteOldOpTime =
            fassertStatusOK(28776, OpTime::parseFromOplogEntry(remoteOldestOp));

        // remoteOldOpTime may come from a very old config, so we cannot compare their terms.
        if (!lastOpTimeFetched.isNull() &&
            lastOpTimeFetched.getTimestamp() < remoteOldOpTime.getTimestamp()) {
            // We're too stale to use this sync source.
            resetConnection();
            replCoord->blacklistSyncSource(candidate, Date_t::now() + Minutes(1));
            if (oldestOpTimeSeen.getTimestamp() > remoteOldOpTime.getTimestamp()) {
                warning() << "we are too stale to use " << candidate.toString()
                          << " as a sync source";
                oldestOpTimeSeen = remoteOldOpTime;
            }
            continue;
        }


        // TODO: If we were too stale (recovering with maintenance mode on), then turn it off, to
        //       allow becoming secondary/etc.

        // Got a valid sync source.
        return;
    }  // while (true)
}
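connectToSyncSource() cycles through candidates, briefly blacklisting each one that fails to connect or proves too stale so the next pass skips it. A reduced sketch of that blacklist-aware selection loop (probe() is an illustrative stand-in for connect-and-check):

#include <chrono>
#include <map>
#include <optional>
#include <string>
#include <vector>

using Clock = std::chrono::steady_clock;

// Try candidates in order, skipping any that are blacklisted until some
// future time; a failed probe blacklists the candidate briefly so the next
// pass moves on, echoing blacklistSyncSource() above.
std::optional<std::string> chooseSource(const std::vector<std::string>& candidates,
                                        std::map<std::string, Clock::time_point>& blacklist,
                                        bool (*probe)(const std::string&)) {
    const auto now = Clock::now();
    for (const auto& host : candidates) {
        auto it = blacklist.find(host);
        if (it != blacklist.end() && it->second > now) {
            continue;  // still blacklisted
        }
        if (probe(host)) {
            return host;  // got a valid sync source
        }
        blacklist[host] = now + std::chrono::seconds(10);  // brief penalty
    }
    return std::nullopt;  // no usable source this round
}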
StatusWith<BalancerSettingsType> BalancerSettingsType::fromBSON(const BSONObj& obj) {
    BalancerSettingsType settings;

    {
        bool stopped;
        Status status = bsonExtractBooleanFieldWithDefault(obj, kStopped, false, &stopped);
        if (!status.isOK())
            return status;
        if (stopped) {
            settings._mode = kOff;
        } else {
            std::string modeStr;
            status = bsonExtractStringFieldWithDefault(obj, kMode, kBalancerModes[kFull], &modeStr);
            if (!status.isOK())
                return status;
            auto it = std::find(std::begin(kBalancerModes), std::end(kBalancerModes), modeStr);
            if (it == std::end(kBalancerModes)) {
                return Status(ErrorCodes::BadValue, "Invalid balancer mode");
            }

            settings._mode = static_cast<BalancerMode>(it - std::begin(kBalancerModes));
        }
    }

    {
        BSONElement activeWindowElem;
        Status status = bsonExtractTypedField(obj, kActiveWindow, Object, &activeWindowElem);
        if (status.isOK()) {
            const BSONObj balancingWindowObj = activeWindowElem.Obj();
            if (balancingWindowObj.isEmpty()) {
                return Status(ErrorCodes::BadValue, "activeWindow not specified");
            }

            // Check if both 'start' and 'stop' are present
            const std::string start = balancingWindowObj.getField("start").str();
            const std::string stop = balancingWindowObj.getField("stop").str();

            if (start.empty() || stop.empty()) {
                return Status(ErrorCodes::BadValue,
                              str::stream()
                                  << "must specify both start and stop of balancing window: "
                                  << balancingWindowObj);
            }

            // Check that both 'start' and 'stop' are valid time-of-day
            boost::posix_time::ptime startTime;
            boost::posix_time::ptime stopTime;
            if (!toPointInTime(start, &startTime) || !toPointInTime(stop, &stopTime)) {
                return Status(ErrorCodes::BadValue,
                              str::stream() << kActiveWindow << " format is "
                                            << "{ start: \"hh:mm\" , stop: \"hh:mm\" }");
            }

            // Check that start and stop designate different time points
            if (startTime == stopTime) {
                return Status(ErrorCodes::BadValue,
                              str::stream() << "start and stop times must be different");
            }

            settings._activeWindowStart = startTime;
            settings._activeWindowStop = stopTime;
        } else if (status != ErrorCodes::NoSuchKey) {
            return status;
        }
    }

    {
        auto secondaryThrottleStatus =
            MigrationSecondaryThrottleOptions::createFromBalancerConfig(obj);
        if (!secondaryThrottleStatus.isOK()) {
            return secondaryThrottleStatus.getStatus();
        }

        settings._secondaryThrottle = std::move(secondaryThrottleStatus.getValue());
    }

    {
        bool waitForDelete;
        Status status =
            bsonExtractBooleanFieldWithDefault(obj, kWaitForDelete, false, &waitForDelete);
        if (!status.isOK())
            return status;

        settings._waitForDelete = waitForDelete;
    }

    return settings;
}
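The activeWindow validation requires both endpoints to be present, to parse as "hh:mm" times of day, and to differ. A standalone sketch of those checks, using sscanf in place of the boost ptime parsing:

#include <cstdio>
#include <string>

struct TimeOfDay {
    int hour = 0;
    int minute = 0;
};

// Parse "hh:mm" and range-check the fields.
bool toTimeOfDay(const std::string& s, TimeOfDay* out) {
    int h = 0, m = 0;
    if (std::sscanf(s.c_str(), "%d:%d", &h, &m) != 2)
        return false;
    if (h < 0 || h > 23 || m < 0 || m > 59)
        return false;
    out->hour = h;
    out->minute = m;
    return true;
}

// Mirror the three activeWindow checks: both endpoints present, both valid
// times of day, and the two endpoints distinct.
bool validateActiveWindow(const std::string& start,
                          const std::string& stop,
                          std::string* errmsg) {
    if (start.empty() || stop.empty()) {
        *errmsg = "must specify both start and stop of balancing window";
        return false;
    }
    TimeOfDay s, e;
    if (!toTimeOfDay(start, &s) || !toTimeOfDay(stop, &e)) {
        *errmsg = "activeWindow format is { start: \"hh:mm\", stop: \"hh:mm\" }";
        return false;
    }
    if (s.hour == e.hour && s.minute == e.minute) {
        *errmsg = "start and stop times must be different";
        return false;
    }
    return true;
}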
Example #22
mongo::Status mongo::cloneCollectionAsCapped(OperationContext* opCtx,
                                             Database* db,
                                             const std::string& shortFrom,
                                             const std::string& shortTo,
                                             long long size,
                                             bool temp) {
    NamespaceString fromNss(db->name(), shortFrom);
    NamespaceString toNss(db->name(), shortTo);

    Collection* fromCollection = db->getCollection(opCtx, fromNss);
    if (!fromCollection) {
        if (db->getViewCatalog()->lookup(opCtx, fromNss.ns())) {
            return Status(ErrorCodes::CommandNotSupportedOnView,
                          str::stream() << "cloneCollectionAsCapped not supported for views: "
                                        << fromNss.ns());
        }
        return Status(ErrorCodes::NamespaceNotFound,
                      str::stream() << "source collection " << fromNss.ns() << " does not exist");
    }

    if (fromNss.isDropPendingNamespace()) {
        return Status(ErrorCodes::NamespaceNotFound,
                      str::stream() << "source collection " << fromNss.ns()
                                    << " is currently in a drop-pending state.");
    }

    if (db->getCollection(opCtx, toNss)) {
        return Status(ErrorCodes::NamespaceExists,
                      str::stream() << "cloneCollectionAsCapped failed - destination collection "
                                    << toNss.ns()
                                    << " already exists. source collection: "
                                    << fromNss.ns());
    }

    // create new collection
    {
        auto options = fromCollection->getCatalogEntry()->getCollectionOptions(opCtx);
        // The capped collection will get its own new unique id, as the conversion isn't reversible,
        // so it can't be rolled back.
        options.uuid.reset();
        options.capped = true;
        options.cappedSize = size;
        if (temp)
            options.temp = true;

        BSONObjBuilder cmd;
        cmd.append("create", toNss.coll());
        cmd.appendElements(options.toBSON());
        Status status = createCollection(opCtx, toNss.db().toString(), cmd.done());
        if (!status.isOK())
            return status;
    }

    Collection* toCollection = db->getCollection(opCtx, toNss);
    invariant(toCollection);  // we created above

    // how much data to ignore because it won't fit anyway
    // datasize and extentSize can't be compared exactly, so add some padding to 'size'

    long long allocatedSpaceGuess =
        std::max(static_cast<long long>(size * 2),
                 static_cast<long long>(toCollection->getRecordStore()->storageSize(opCtx) * 2));

    long long excessSize = fromCollection->dataSize(opCtx) - allocatedSpaceGuess;

    auto exec = InternalPlanner::collectionScan(opCtx,
                                                fromNss.ns(),
                                                fromCollection,
                                                PlanExecutor::WRITE_CONFLICT_RETRY_ONLY,
                                                InternalPlanner::FORWARD);

    Snapshotted<BSONObj> objToClone;
    RecordId loc;
    PlanExecutor::ExecState state = PlanExecutor::FAILURE;  // suppress uninitialized warnings

    DisableDocumentValidation validationDisabler(opCtx);

    int retries = 0;  // non-zero when retrying our last document.
    while (true) {
        if (!retries) {
            state = exec->getNextSnapshotted(&objToClone, &loc);
        }

        switch (state) {
            case PlanExecutor::IS_EOF:
                return Status::OK();
            case PlanExecutor::ADVANCED: {
                if (excessSize > 0) {
                    // 4x is for padding, power of 2, etc...
                    excessSize -= (4 * objToClone.value().objsize());
                    continue;
                }
                break;
            }
            default:
                // Unreachable as:
                // 1) We require a read lock (at a minimum) on the "from" collection
                //    and won't yield, preventing collection drop and PlanExecutor::DEAD
                // 2) PlanExecutor::FAILURE is only returned on PlanStage::FAILURE. The
                //    CollectionScan PlanStage does not have a FAILURE scenario.
                // 3) All other PlanExecutor states are handled above
                MONGO_UNREACHABLE;
        }

        try {
            // Make sure we are working with the latest version of the document.
            if (objToClone.snapshotId() != opCtx->recoveryUnit()->getSnapshotId() &&
                !fromCollection->findDoc(opCtx, loc, &objToClone)) {
                // doc was deleted so don't clone it.
                retries = 0;
                continue;
            }

            WriteUnitOfWork wunit(opCtx);
            OpDebug* const nullOpDebug = nullptr;
            uassertStatusOK(toCollection->insertDocument(
                opCtx, InsertStatement(objToClone.value()), nullOpDebug, true));
            wunit.commit();

            // Go to the next document
            retries = 0;
        } catch (const WriteConflictException&) {
            CurOp::get(opCtx)->debug().writeConflicts++;
            retries++;  // logAndBackoff expects this to be 1 on first call.
            WriteConflictException::logAndBackoff(retries, "cloneCollectionAsCapped", fromNss.ns());

            // Can't use writeConflictRetry since we need to save/restore exec around call to
            // abandonSnapshot.
            exec->saveState();
            opCtx->recoveryUnit()->abandonSnapshot();
            auto restoreStatus = exec->restoreState();  // Handles any WCEs internally.
            if (!restoreStatus.isOK()) {
                return restoreStatus;
            }
        }
    }

    MONGO_UNREACHABLE;
}