void forceDatabaseRefresh(OperationContext* opCtx, const StringData dbName) {
    invariant(!opCtx->lockState()->isLocked());
    invariant(!opCtx->getClient()->isInDirectClient());

    auto const shardingState = ShardingState::get(opCtx);
    invariant(shardingState->canAcceptShardedCommands());

    const auto refreshedDbVersion =
        uassertStatusOK(Grid::get(opCtx)->catalogCache()->getDatabaseWithRefresh(opCtx, dbName))
            .databaseVersion();

    // First, check under a shared lock if another thread already updated the cached version.
    // This is a best-effort optimization so that as few threads as possible convoy on the
    // exclusive lock below.
    auto databaseHolder = DatabaseHolder::get(opCtx);
    {
        // Take the DBLock directly rather than using AutoGetDb, to prevent a recursive call
        // into checkDbVersion().
        Lock::DBLock dbLock(opCtx, dbName, MODE_IS);
        auto db = databaseHolder->getDb(opCtx, dbName);
        if (!db) {
            log() << "Database " << dbName
                  << " has been dropped; not caching the refreshed databaseVersion";
            return;
        }

        auto& dss = DatabaseShardingState::get(db);
        auto dssLock = DatabaseShardingState::DSSLock::lock(opCtx, &dss);

        const auto cachedDbVersion = dss.getDbVersion(opCtx, dssLock);
        if (cachedDbVersion && cachedDbVersion->getUuid() == refreshedDbVersion.getUuid() &&
            cachedDbVersion->getLastMod() >= refreshedDbVersion.getLastMod()) {
            LOG(2) << "Skipping setting cached databaseVersion for " << dbName
                   << " to refreshed version " << refreshedDbVersion.toBSON()
                   << " because current cached databaseVersion is already "
                   << cachedDbVersion->toBSON();
            return;
        }
    }

    // The cached version is older than the refreshed version; update the cached version.
    Lock::DBLock dbLock(opCtx, dbName, MODE_X);
    auto db = databaseHolder->getDb(opCtx, dbName);
    if (!db) {
        log() << "Database " << dbName
              << " has been dropped; not caching the refreshed databaseVersion";
        return;
    }

    auto& dss = DatabaseShardingState::get(db);
    auto dssLock = DatabaseShardingState::DSSLock::lockExclusive(opCtx, &dss);

    dss.setDbVersion(opCtx, std::move(refreshedDbVersion), dssLock);
}
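The function above is a double-checked locking pattern: the cached version is first inspected under a shared (MODE_IS) database lock, and the exclusive (MODE_X) lock is taken only if the cache still looks stale. Below is a minimal sketch of the same idea using standard library primitives rather than the MongoDB lock manager; the cache type and version comparison are invented for illustration, and unlike the function above the sketch re-checks the version after acquiring the exclusive lock.

#include <optional>
#include <shared_mutex>

std::shared_mutex cacheMutex;
std::optional<long long> cachedVersion;  // stand-in for the cached database version

void installRefreshedVersion(long long refreshedVersion) {
    {
        // Fast path: most threads should observe an up-to-date cache here and return
        // without ever queueing on the exclusive lock.
        std::shared_lock<std::shared_mutex> readLock(cacheMutex);
        if (cachedVersion && *cachedVersion >= refreshedVersion) {
            return;
        }
    }

    // Slow path: re-check under the exclusive lock, since another thread may have
    // refreshed the cache between the two lock acquisitions, then install the value.
    std::unique_lock<std::shared_mutex> writeLock(cacheMutex);
    if (!cachedVersion || *cachedVersion < refreshedVersion) {
        cachedVersion = refreshedVersion;
    }
}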
Example No. 2
Status ShardingStateRecovery::recover(OperationContext* txn) {
    BSONObj recoveryDocBSON;

    try {
        AutoGetCollection autoColl(txn, NamespaceString::kConfigCollectionNamespace, MODE_IS);
        if (!Helpers::findOne(
                txn, autoColl.getCollection(), RecoveryDocument::getQuery(), recoveryDocBSON)) {
            return Status::OK();
        }
    } catch (const DBException& ex) {
        return ex.toStatus();
    }

    const auto recoveryDocStatus = RecoveryDocument::fromBSON(recoveryDocBSON);
    if (!recoveryDocStatus.isOK())
        return recoveryDocStatus.getStatus();

    const auto recoveryDoc = std::move(recoveryDocStatus.getValue());

    log() << "Sharding state recovery process found document " << recoveryDoc.toBSON();

    // Make sure the sharding state is initialized
    ShardingState* const shardingState = ShardingState::get(txn);

    shardingState->initialize(txn, recoveryDoc.getConfigsvr().toString());
    shardingState->setShardName(recoveryDoc.getShardName());

    if (!recoveryDoc.getMinOpTimeUpdaters()) {
        // Treat the minOpTime as up-to-date
        grid.shardRegistry()->advanceConfigOpTime(recoveryDoc.getMinOpTime());
        return Status::OK();
    }

    log() << "Sharding state recovery document indicates there were "
          << recoveryDoc.getMinOpTimeUpdaters()
          << " metadata change operations in flight. Contacting the config server primary in order "
             "to retrieve the most recent opTime.";

    // Need to fetch the latest opTime from the config server, so do a logging write
    Status status =
        grid.catalogManager(txn)->logChange(txn,
                                            "Sharding recovery thread",
                                            "Sharding minOpTime recovery",
                                            NamespaceString::kConfigCollectionNamespace.ns(),
                                            recoveryDocBSON);
    if (!status.isOK())
        return status;

    log() << "Sharding state recovered. New config server opTime is "
          << grid.shardRegistry()->getConfigOpTime();

    // Finally, clear the recovery document so next time we don't need to recover
    status = modifyRecoveryDocument(txn, RecoveryDocument::Clear, kMajorityWriteConcern);
    if (!status.isOK()) {
        warning() << "Failed to reset sharding state recovery document due to " << status;
    }

    return Status::OK();
}
Example No. 3
Status ShardingStateRecovery::recover(OperationContext* opCtx) {
    if (serverGlobalParams.clusterRole != ClusterRole::ShardServer) {
        return Status::OK();
    }

    BSONObj recoveryDocBSON;

    try {
        AutoGetCollection autoColl(opCtx, NamespaceString::kConfigCollectionNamespace, MODE_IS);
        if (!Helpers::findOne(
                opCtx, autoColl.getCollection(), RecoveryDocument::getQuery(), recoveryDocBSON)) {
            return Status::OK();
        }
    } catch (const DBException& ex) {
        return ex.toStatus();
    }

    const auto recoveryDocStatus = RecoveryDocument::fromBSON(recoveryDocBSON);
    if (!recoveryDocStatus.isOK())
        return recoveryDocStatus.getStatus();

    const auto recoveryDoc = std::move(recoveryDocStatus.getValue());

    log() << "Sharding state recovery process found document " << redact(recoveryDoc.toBSON());

    ShardingState* const shardingState = ShardingState::get(opCtx);
    invariant(shardingState->enabled());

    if (!recoveryDoc.getMinOpTimeUpdaters()) {
        // Treat the minOpTime as up-to-date
        grid.advanceConfigOpTime(recoveryDoc.getMinOpTime());
        return Status::OK();
    }

    log() << "Sharding state recovery document indicates there were "
          << recoveryDoc.getMinOpTimeUpdaters()
          << " metadata change operations in flight. Contacting the config server primary in order "
             "to retrieve the most recent opTime.";

    // Need to fetch the latest opTime from the config server, so do a logging write
    Status status =
        grid.catalogClient(opCtx)->logChange(opCtx,
                                             "Sharding minOpTime recovery",
                                             NamespaceString::kConfigCollectionNamespace.ns(),
                                             recoveryDocBSON,
                                             ShardingCatalogClient::kMajorityWriteConcern);
    if (!status.isOK())
        return status;

    log() << "Sharding state recovered. New config server opTime is " << grid.configOpTime();

    // Finally, clear the recovery document so next time we don't need to recover
    status = modifyRecoveryDocument(opCtx, RecoveryDocument::Clear, kLocalWriteConcern);
    if (!status.isOK()) {
        warning() << "Failed to reset sharding state recovery document due to " << redact(status);
    }

    return Status::OK();
}
Example No. 4
void ReplicationRecoveryImpl::_applyToEndOfOplog(OperationContext* opCtx,
                                                 Timestamp oplogApplicationStartPoint,
                                                 Timestamp topOfOplog) {
    invariant(!oplogApplicationStartPoint.isNull());
    invariant(!topOfOplog.isNull());

    // Check if we have any unapplied ops in our oplog. It is important that this is done after
    // deleting the ragged end of the oplog.
    if (oplogApplicationStartPoint == topOfOplog) {
        log()
            << "No oplog entries to apply for recovery. appliedThrough is at the top of the oplog.";
        return;  // We've applied all the valid oplog we have.
    } else if (oplogApplicationStartPoint > topOfOplog) {
        severe() << "Applied op " << oplogApplicationStartPoint.toBSON()
                 << " not found. Top of oplog is " << topOfOplog.toBSON() << '.';
        fassertFailedNoTrace(40313);
    }

    log() << "Replaying stored operations from " << oplogApplicationStartPoint.toBSON()
          << " (exclusive) to " << topOfOplog.toBSON() << " (inclusive).";

    DBDirectClient db(opCtx);
    auto cursor = db.query(NamespaceString::kRsOplogNamespace.ns(),
                           QUERY("ts" << BSON("$gte" << oplogApplicationStartPoint)),
                           /*batchSize*/ 0,
                           /*skip*/ 0,
                           /*projection*/ nullptr,
                           QueryOption_OplogReplay);

    // Check that the first document matches our appliedThrough point then skip it since it's
    // already been applied.
    if (!cursor->more()) {
        // This should really be impossible because we check above that the top of the oplog is
        // strictly > appliedThrough. If this fails it represents a serious bug in either the
        // storage engine or query's implementation of OplogReplay.
        severe() << "Couldn't find any entries in the oplog >= "
                 << oplogApplicationStartPoint.toBSON() << " which should be impossible.";
        fassertFailedNoTrace(40293);
    }

    auto firstTimestampFound =
        fassertStatusOK(40291, OpTime::parseFromOplogEntry(cursor->nextSafe())).getTimestamp();
    if (firstTimestampFound != oplogApplicationStartPoint) {
        severe() << "Oplog entry at " << oplogApplicationStartPoint.toBSON()
                 << " is missing; actual entry found is " << firstTimestampFound.toBSON();
        fassertFailedNoTrace(40292);
    }

    // Apply the remaining ops one at a time, but don't log them because they are already logged.
    UnreplicatedWritesBlock uwb(opCtx);

    while (cursor->more()) {
        auto entry = cursor->nextSafe();
        fassertStatusOK(40294,
                        SyncTail::syncApply(opCtx, entry, OplogApplication::Mode::kRecovering));
        _consistencyMarkers->setAppliedThrough(
            opCtx, fassertStatusOK(40295, OpTime::parseFromOplogEntry(entry)));
    }
}
StatusWith<OplogApplier::Operations> OplogApplier::getNextApplierBatch(
    OperationContext* opCtx, const BatchLimits& batchLimits) {
    if (batchLimits.ops == 0) {
        return Status(ErrorCodes::InvalidOptions, "Batch size must be greater than 0.");
    }

    std::uint32_t totalBytes = 0;
    Operations ops;
    BSONObj op;
    while (_oplogBuffer->peek(opCtx, &op)) {
        auto entry = OplogEntry(op);

        // Check for an oplog version change. If the version field is absent, it defaults to 1.
        if (entry.getVersion() != OplogEntry::kOplogVersion) {
            std::string message = str::stream()
                << "expected oplog version " << OplogEntry::kOplogVersion << " but found version "
                << entry.getVersion() << " in oplog entry: " << redact(entry.toBSON());
            severe() << message;
            return {ErrorCodes::BadValue, message};
        }

        // Commands must be processed one at a time. The only exception to this is applyOps because
        // applyOps oplog entries are effectively containers for CRUD operations. Therefore, it is
        // safe to batch applyOps commands with CRUD operations when reading from the oplog buffer.
        if (entry.isCommand() && (entry.getCommandType() != OplogEntry::CommandType::kApplyOps ||
                                  entry.shouldPrepare())) {
            if (ops.empty()) {
                // Apply commands one-at-a-time.
                ops.push_back(std::move(entry));
                BSONObj opToPopAndDiscard;
                invariant(_oplogBuffer->tryPop(opCtx, &opToPopAndDiscard));
                dassert(ops.back() == OplogEntry(opToPopAndDiscard));
            }

            // Otherwise, apply what we have so far and come back for the command.
            return std::move(ops);
        }

        // Apply replication batch limits.
        if (ops.size() >= batchLimits.ops) {
            return std::move(ops);
        }

        // Never return an empty batch if there are operations left.
        if ((totalBytes + entry.getRawObjSizeBytes() >= batchLimits.bytes) && (ops.size() > 0)) {
            return std::move(ops);
        }

        // Add op to buffer.
        totalBytes += entry.getRawObjSizeBytes();
        ops.push_back(std::move(entry));
        BSONObj opToPopAndDiscard;
        invariant(_oplogBuffer->tryPop(opCtx, &opToPopAndDiscard));
        dassert(ops.back() == OplogEntry(opToPopAndDiscard));
    }
    return std::move(ops);
}
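A minimal sketch, not taken from the source, of how a caller might drain the oplog buffer through the method above. It assumes only the BatchLimits fields referenced in the function (ops and bytes); applyBatch() is a hypothetical helper standing in for whatever applies one batch.

// 'applier' (an OplogApplier*) and 'opCtx' are assumed to be available to the caller.
OplogApplier::BatchLimits limits;
limits.ops = 1000;                // cut a batch after 1000 entries...
limits.bytes = 16 * 1024 * 1024;  // ...or after roughly 16 MB of raw oplog data

while (true) {
    auto batch = uassertStatusOK(applier->getNextApplierBatch(opCtx, limits));
    if (batch.empty()) {
        break;  // the oplog buffer has been drained
    }
    applyBatch(opCtx, std::move(batch));  // hypothetical helper
}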
Example No. 6
void RollbackResyncsCollectionOptionsTest::resyncCollectionOptionsTest(
    CollectionOptions localCollOptions,
    BSONObj remoteCollOptionsObj,
    BSONObj collModCmd,
    std::string collName) {
    createOplog(_opCtx.get());

    auto dbName = "test";
    auto nss = NamespaceString(dbName, collName);

    auto coll = _createCollection(_opCtx.get(), nss.toString(), localCollOptions);

    auto commonOpUuid = unittest::assertGet(UUID::parse("f005ba11-cafe-bead-f00d-123456789abc"));
    auto commonOpBson = BSON("ts" << Timestamp(1, 1) << "t" << 1LL << "op"
                                  << "n"
                                  << "o"
                                  << BSONObj()
                                  << "ns"
                                  << "rollback_test.test"
                                  << "ui"
                                  << commonOpUuid);

    auto commonOperation = std::make_pair(commonOpBson, RecordId(1));

    auto collectionModificationOperation =
        makeCommandOp(Timestamp(Seconds(2), 0), coll->uuid(), nss.toString(), collModCmd, 2);

    RollbackSourceWithCollectionOptions rollbackSource(
        std::unique_ptr<OplogInterface>(new OplogInterfaceMock({commonOperation})),
        remoteCollOptionsObj);

    ASSERT_OK(syncRollback(_opCtx.get(),
                           OplogInterfaceMock({collectionModificationOperation, commonOperation}),
                           rollbackSource,
                           {},
                           _coordinator,
                           _replicationProcess.get()));

    // Make sure the collection options are correct.
    AutoGetCollectionForReadCommand autoColl(_opCtx.get(), NamespaceString(nss.toString()));
    auto collAfterRollbackOptions =
        autoColl.getCollection()->getCatalogEntry()->getCollectionOptions(_opCtx.get());

    BSONObjBuilder expectedOptionsBob;
    if (localCollOptions.uuid) {
        localCollOptions.uuid.get().appendToBuilder(&expectedOptionsBob, "uuid");
    }
    expectedOptionsBob.appendElements(remoteCollOptionsObj);

    ASSERT_BSONOBJ_EQ(expectedOptionsBob.obj(), collAfterRollbackOptions.toBSON());
}
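A hypothetical invocation of the test helper above, wrapped in a TEST_F case with an invented name; the collMod command, remote options, and collection name are made up for illustration. The expectation encoded by the helper is that after rollback the local options equal the remote options plus the preserved local UUID.

TEST_F(RollbackResyncsCollectionOptionsTest, HypotheticalValidationLevelCase) {
    CollectionOptions localCollOptions;
    localCollOptions.uuid = UUID::gen();

    resyncCollectionOptionsTest(localCollOptions,
                                BSON("validationLevel" << "strict"),
                                BSON("collMod" << "coll" << "validationLevel" << "moderate"),
                                "coll");
}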
Example No. 7
std::string LocksType::toString() const {
    return toBSON().toString();
}
Example No. 8
void ReplicationRecoveryImpl::recoverFromOplog(OperationContext* opCtx) try {
    if (_consistencyMarkers->getInitialSyncFlag(opCtx)) {
        log() << "No recovery needed. Initial sync flag set.";
        return;  // Initial Sync will take over so no cleanup is needed.
    }

    const auto truncateAfterPoint = _consistencyMarkers->getOplogTruncateAfterPoint(opCtx);
    const auto appliedThrough = _consistencyMarkers->getAppliedThrough(opCtx);

    if (!truncateAfterPoint.isNull()) {
        log() << "Removing unapplied entries starting at: " << truncateAfterPoint.toBSON();
        _truncateOplogTo(opCtx, truncateAfterPoint);
    }

    // Clear the truncateAfterPoint so that we don't truncate the next batch of oplog entries
    // erroneously.
    _consistencyMarkers->setOplogTruncateAfterPoint(opCtx, {});

    // TODO (SERVER-30556): Delete this line since the old oplog delete from point cannot exist.
    _consistencyMarkers->removeOldOplogDeleteFromPointField(opCtx);

    auto topOfOplogSW = _getLastAppliedOpTime(opCtx);
    boost::optional<OpTime> topOfOplog = boost::none;
    if (topOfOplogSW.getStatus() != ErrorCodes::CollectionIsEmpty &&
        topOfOplogSW.getStatus() != ErrorCodes::NamespaceNotFound) {
        fassertStatusOK(40290, topOfOplogSW);
        topOfOplog = topOfOplogSW.getValue();
    }

    // If we have a checkpoint timestamp, then we recovered to a timestamp and should set the
    // initial data timestamp to that. Otherwise, we simply recovered the data on disk so we should
    // set the initial data timestamp to the top OpTime in the oplog once the data is consistent
    // there. If there is nothing in the oplog, then we do not set the initial data timestamp.
    auto checkpointTimestamp = _consistencyMarkers->getCheckpointTimestamp(opCtx);
    if (!checkpointTimestamp.isNull()) {

        // If we have a checkpoint timestamp, we set the initial data timestamp now so that
        // the operations we apply below can be given the proper timestamps.
        _storageInterface->setInitialDataTimestamp(opCtx->getServiceContext(),
                                                   SnapshotName(checkpointTimestamp));
    }

    // Oplog is empty. There are no oplog entries to apply, so we exit recovery. If there was a
    // checkpointTimestamp then we already set the initial data timestamp. Otherwise, there is
    // nothing to set it to.
    if (!topOfOplog) {
        log() << "No oplog entries to apply for recovery. Oplog is empty.";
        return;
    }

    if (auto startPoint = _getOplogApplicationStartPoint(checkpointTimestamp, appliedThrough)) {
        _applyToEndOfOplog(opCtx, startPoint.get(), topOfOplog->getTimestamp());
    }

    // If we don't have a checkpoint timestamp, then we are either not running a storage engine
    // that supports "recover to stable timestamp" or we just upgraded from a version that didn't.
    // In both cases, the data on disk is not consistent until we have applied all oplog entries to
    // the end of the oplog, since we do not know which ones actually got applied before shutdown.
    // As a result, we do not set the initial data timestamp until after we have applied to the end
    // of the oplog.
    if (checkpointTimestamp.isNull()) {
        _storageInterface->setInitialDataTimestamp(opCtx->getServiceContext(),
                                                   SnapshotName(topOfOplog->getTimestamp()));
    }

} catch (...) {
    severe() << "Caught exception during replication recovery: " << exceptionToStatus();
    std::terminate();
}
Example No. 9
Status createCollectionForApplyOps(OperationContext* opCtx,
                                   const std::string& dbName,
                                   const BSONElement& ui,
                                   const BSONObj& cmdObj,
                                   const BSONObj& idIndex) {
    invariant(opCtx->lockState()->isDbLockedForMode(dbName, MODE_X));
    auto db = dbHolder().get(opCtx, dbName);
    const NamespaceString newCollName(Command::parseNsCollectionRequired(dbName, cmdObj));
    auto newCmd = cmdObj;

    // If a UUID is given, see if we need to rename a collection out of the way, and whether the
    // collection already exists under a different name. If so, rename it into place. As this is
    // done during replay of the oplog, the operations do not need to be atomic, just idempotent.
    // We need to do the renaming part in a separate transaction, as we cannot transactionally
    // create a database on MMAPv1, which could result in createCollection failing if the database
    // does not yet exist.
    if (ui.ok()) {
        // Return an optional, indicating whether we need to return early (if the collection
        // already exists, or in case of an error).
        using Result = boost::optional<Status>;
        auto result =
            writeConflictRetry(opCtx, "createCollectionForApplyOps", newCollName.ns(), [&] {
                WriteUnitOfWork wunit(opCtx);
                // Options need the field to be named "uuid", so parse/recreate.
                auto uuid = uassertStatusOK(UUID::parse(ui));
                uassert(ErrorCodes::InvalidUUID,
                        "Invalid UUID in applyOps create command: " + uuid.toString(),
                        uuid.isRFC4122v4());

                auto& catalog = UUIDCatalog::get(opCtx);
                auto currentName = catalog.lookupNSSByUUID(uuid);
                OpObserver* opObserver = getGlobalServiceContext()->getOpObserver();
                if (currentName == newCollName)
                    return Result(Status::OK());

                // In the case of oplog replay, a future command may have created or renamed a
                // collection with that same name. In that case, renaming this future collection to
                // a random temporary name is correct: once all entries are replayed no temporary
                // names will remain.  On MMAPv1 the rename can result in index names that are too
                // long. However this should only happen for initial sync and "resync collection"
                // for rollback, so we can let the error propagate resulting in an abort and restart
                // of the initial sync or result in rollback to fassert, requiring a resync of that
                // node.
                const bool stayTemp = true;
                if (auto futureColl = db ? db->getCollection(opCtx, newCollName) : nullptr) {
                    auto tmpNameResult = db->makeUniqueCollectionNamespace(opCtx, "tmp%%%%%");
                    if (!tmpNameResult.isOK()) {
                        return Result(Status(tmpNameResult.getStatus().code(),
                                             str::stream() << "Cannot generate temporary "
                                                              "collection namespace for applyOps "
                                                              "create command: collection: "
                                                           << newCollName.ns()
                                                           << ". error: "
                                                           << tmpNameResult.getStatus().reason()));
                    }
                    const auto& tmpName = tmpNameResult.getValue();
                    Status status =
                        db->renameCollection(opCtx, newCollName.ns(), tmpName.ns(), stayTemp);
                    if (!status.isOK())
                        return Result(status);
                    opObserver->onRenameCollection(opCtx,
                                                   newCollName,
                                                   tmpName,
                                                   futureColl->uuid(),
                                                   /*dropTarget*/ false,
                                                   /*dropTargetUUID*/ {},
                                                   stayTemp);
                }

                // If the collection with the requested UUID already exists, but with a different
                // name, just rename it to 'newCollName'.
                if (catalog.lookupCollectionByUUID(uuid)) {
                    Status status =
                        db->renameCollection(opCtx, currentName.ns(), newCollName.ns(), stayTemp);
                    if (!status.isOK())
                        return Result(status);
                    opObserver->onRenameCollection(opCtx,
                                                   currentName,
                                                   newCollName,
                                                   uuid,
                                                   /*dropTarget*/ false,
                                                   /*dropTargetUUID*/ {},
                                                   stayTemp);

                    wunit.commit();
                    return Result(Status::OK());
                }

                // A new collection with the specific UUID must be created, so add the UUID to the
                // creation options. Regular user collection creation commands cannot do this.
                auto uuidObj = uuid.toBSON();
                newCmd = cmdObj.addField(uuidObj.firstElement());
                wunit.commit();

                return Result(boost::none);
            });

        if (result) {
            return *result;
        }
    }

    return createCollection(
        opCtx, newCollName, newCmd, idIndex, CollectionOptions::parseForStorage);
}
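The lambda passed to writeConflictRetry above signals control flow through its return type: an engaged boost::optional<Status> means "stop and hand this status back to the caller", while boost::none means "fall through to the normal createCollection path". A minimal sketch of that idiom, with hypothetical conditions and operation name:

using Result = boost::optional<Status>;
auto result = writeConflictRetry(opCtx, "exampleOperation", nss.ns(), [&]() -> Result {
    if (workAlreadyDone)  // hypothetical: nothing left to do, report success to the caller
        return Result(Status::OK());
    if (!preconditionHolds)  // hypothetical: abort with an error
        return Result(Status(ErrorCodes::BadValue, "precondition failed"));
    return Result(boost::none);  // no early return; continue after the retry block
});
if (result) {
    return *result;
}
// ...the non-early-return path continues here...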
Example No. 10
std::string ReadPreferenceSetting::toString() const {
    return toBSON().toString();
}
Example No. 11
std::string CollectionBulkLoaderImpl::toString() const {
    return toBSON().toString();
}
Example No. 12
std::string ChunkType::toString() const {
    return toBSON().toString();
}
Example No. 13
void ChunkVersion::appendForCommands(BSONObjBuilder* builder) const {
    builder->appendArray(kShardVersionField, toBSON());
}
Example No. 14
std::string SettingsType::toString() const {
    return toBSON().toString();
}
Example No. 15
void ReplicationRecoveryImpl::recoverFromOplog(OperationContext* opCtx,
                                               boost::optional<Timestamp> stableTimestamp) try {
    if (_consistencyMarkers->getInitialSyncFlag(opCtx)) {
        log() << "No recovery needed. Initial sync flag set.";
        return;  // Initial Sync will take over so no cleanup is needed.
    }

    const auto serviceCtx = getGlobalServiceContext();
    inReplicationRecovery(serviceCtx) = true;
    ON_BLOCK_EXIT([serviceCtx] {
        invariant(
            inReplicationRecovery(serviceCtx),
            "replication recovery flag is unexpectedly unset when exiting recoverFromOplog()");
        inReplicationRecovery(serviceCtx) = false;
    });

    const auto truncateAfterPoint = _consistencyMarkers->getOplogTruncateAfterPoint(opCtx);
    if (!truncateAfterPoint.isNull()) {
        log() << "Removing unapplied entries starting at: " << truncateAfterPoint.toBSON();
        _truncateOplogTo(opCtx, truncateAfterPoint);

        // Clear the truncateAfterPoint so that we don't truncate the next batch of oplog entries
        // erroneously.
        _consistencyMarkers->setOplogTruncateAfterPoint(opCtx, {});
        opCtx->recoveryUnit()->waitUntilDurable();
    }

    auto topOfOplogSW = _getTopOfOplog(opCtx);
    if (topOfOplogSW.getStatus() == ErrorCodes::CollectionIsEmpty ||
        topOfOplogSW.getStatus() == ErrorCodes::NamespaceNotFound) {
        // Oplog is empty. There are no oplog entries to apply, so we exit recovery and go into
        // initial sync.
        log() << "No oplog entries to apply for recovery. Oplog is empty.";
        return;
    }
    fassert(40290, topOfOplogSW);
    const auto topOfOplog = topOfOplogSW.getValue();

    // If we were passed in a stable timestamp, we are in rollback recovery and should recover from
    // that stable timestamp. Otherwise, we're recovering at startup. If this storage engine
    // supports recover to stable timestamp or enableMajorityReadConcern=false, we ask it for the
    // recovery timestamp. If the storage engine returns a timestamp, we recover from that point.
    // However, if the storage engine returns "none", the storage engine does not have a stable
    // checkpoint and we must recover from an unstable checkpoint instead.
    const bool supportsRecoveryTimestamp =
        _storageInterface->supportsRecoveryTimestamp(opCtx->getServiceContext());
    if (!stableTimestamp && supportsRecoveryTimestamp) {
        stableTimestamp = _storageInterface->getRecoveryTimestamp(opCtx->getServiceContext());
    }

    const auto appliedThrough = _consistencyMarkers->getAppliedThrough(opCtx);
    invariant(!stableTimestamp || stableTimestamp->isNull() || appliedThrough.isNull() ||
                  *stableTimestamp == appliedThrough.getTimestamp(),
              str::stream() << "Stable timestamp " << stableTimestamp->toString()
                            << " does not equal appliedThrough timestamp "
                            << appliedThrough.toString());

    if (stableTimestamp) {
        invariant(supportsRecoveryTimestamp);
        _recoverFromStableTimestamp(opCtx, *stableTimestamp, appliedThrough, topOfOplog);
    } else {
        _recoverFromUnstableCheckpoint(opCtx, appliedThrough, topOfOplog);
    }

    _reconstructPreparedTransactions(opCtx);
} catch (...) {
    severe() << "Caught exception during replication recovery: " << exceptionToStatus();
    std::terminate();
}
Example No. 16
std::string CollectionCloner::Stats::toString() const {
    return toBSON().toString();
}
Example No. 17
Status ShardingStateRecovery::recover(OperationContext* txn) {
    if (serverGlobalParams.clusterRole != ClusterRole::ShardServer) {
        return Status::OK();
    }

    BSONObj recoveryDocBSON;

    try {
        AutoGetCollection autoColl(txn, NamespaceString::kConfigCollectionNamespace, MODE_IS);
        if (!Helpers::findOne(
                txn, autoColl.getCollection(), RecoveryDocument::getQuery(), recoveryDocBSON)) {
            return Status::OK();
        }
    } catch (const DBException& ex) {
        return ex.toStatus();
    }

    const auto recoveryDocStatus = RecoveryDocument::fromBSON(recoveryDocBSON);
    if (!recoveryDocStatus.isOK())
        return recoveryDocStatus.getStatus();

    const auto recoveryDoc = std::move(recoveryDocStatus.getValue());

    log() << "Sharding state recovery process found document " << recoveryDoc.toBSON();

    // Make sure the sharding state is initialized
    ShardingState* const shardingState = ShardingState::get(txn);

    // For backwards compatibility. Shards added by a v3.4 cluster should have been initialized by
    // the shard identity document.
    // TODO(SERVER-25276): Remove this after 3.4 since 3.4 shards should always have ShardingState
    // initialized by this point.
    if (!shardingState->enabled()) {
        shardingState->initializeFromConfigConnString(txn, recoveryDoc.getConfigsvr().toString());
        shardingState->setShardName(recoveryDoc.getShardName());
    }

    if (!recoveryDoc.getMinOpTimeUpdaters()) {
        // Treat the minOpTime as up-to-date
        grid.advanceConfigOpTime(recoveryDoc.getMinOpTime());
        return Status::OK();
    }

    log() << "Sharding state recovery document indicates there were "
          << recoveryDoc.getMinOpTimeUpdaters()
          << " metadata change operations in flight. Contacting the config server primary in order "
             "to retrieve the most recent opTime.";

    // Need to fetch the latest opTime from the config server, so do a logging write
    Status status =
        grid.catalogClient(txn)->logChange(txn,
                                           "Sharding minOpTime recovery",
                                           NamespaceString::kConfigCollectionNamespace.ns(),
                                           recoveryDocBSON,
                                           ShardingCatalogClient::kMajorityWriteConcern);
    if (!status.isOK())
        return status;

    log() << "Sharding state recovered. New config server opTime is " << grid.configOpTime();

    // Finally, clear the recovery document so next time we don't need to recover
    status = modifyRecoveryDocument(txn, RecoveryDocument::Clear, kLocalWriteConcern);
    if (!status.isOK()) {
        warning() << "Failed to reset sharding state recovery document due to " << status;
    }

    return Status::OK();
}
Example No. 18
std::string MongosType::toString() const {
    return toBSON().toString();
}
Example No. 19
std::string ReadConcernArgs::toString() const {
    return toBSON().toString();
}
Example No. 20
std::vector<BSONObj> MongoProcessCommon::getCurrentOps(
    const boost::intrusive_ptr<ExpressionContext>& expCtx,
    CurrentOpConnectionsMode connMode,
    CurrentOpSessionsMode sessionMode,
    CurrentOpUserMode userMode,
    CurrentOpTruncateMode truncateMode,
    CurrentOpCursorMode cursorMode) const {
    OperationContext* opCtx = expCtx->opCtx;
    AuthorizationSession* ctxAuth = AuthorizationSession::get(opCtx->getClient());

    std::vector<BSONObj> ops;

    for (ServiceContext::LockedClientsCursor cursor(opCtx->getClient()->getServiceContext());
         Client* client = cursor.next();) {
        invariant(client);

        stdx::lock_guard<Client> lk(*client);

        // If auth is disabled, ignore the allUsers parameter.
        if (ctxAuth->getAuthorizationManager().isAuthEnabled() &&
            userMode == CurrentOpUserMode::kExcludeOthers &&
            !ctxAuth->isCoauthorizedWithClient(client)) {
            continue;
        }

        // Ignore inactive connections unless 'idleConnections' is true.
        if (!client->getOperationContext() && connMode == CurrentOpConnectionsMode::kExcludeIdle) {
            continue;
        }

        // Delegate to the mongoD- or mongoS-specific implementation of _reportCurrentOpForClient.
        ops.emplace_back(_reportCurrentOpForClient(opCtx, client, truncateMode));
    }

    // If 'cursorMode' is set to include idle cursors, retrieve them and add them to ops.
    if (cursorMode == CurrentOpCursorMode::kIncludeCursors) {

        for (auto&& cursor : getIdleCursors(expCtx, userMode)) {
            BSONObjBuilder cursorObj;
            auto ns = cursor.getNs();
            auto lsid = cursor.getLsid();
            cursorObj.append("type", "idleCursor");
            cursorObj.append("host", getHostNameCached());
            cursorObj.append("ns", ns->toString());
            // If in legacy read mode, lsid is not present.
            if (lsid) {
                cursorObj.append("lsid", lsid->toBSON());
            }
            cursor.setNs(boost::none);
            cursor.setLsid(boost::none);
            // On mongos, planSummary is not present.
            auto planSummaryData = cursor.getPlanSummary();
            if (planSummaryData) {
                auto planSummaryText = planSummaryData->toString();
                // Plan summary has to appear in the top level object, not the cursor object.
                // We remove it, create the op, then put it back.
                cursor.setPlanSummary(boost::none);
                cursorObj.append("planSummary", planSummaryText);
                cursorObj.append("cursor", cursor.toBSON());
                cursor.setPlanSummary(StringData(planSummaryText));
            } else {
                cursorObj.append("cursor", cursor.toBSON());
            }
            ops.emplace_back(cursorObj.obj());
            cursor.setNs(ns);
            cursor.setLsid(lsid);
        }
    }

    // If we need to report on idle Sessions, defer to the mongoD or mongoS implementations.
    if (sessionMode == CurrentOpSessionsMode::kIncludeIdle) {
        _reportCurrentOpsForIdleSessions(opCtx, userMode, &ops);
    }

    return ops;
}
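A hypothetical call to the method above, using only the enumerators that appear in its body; expCtx and mongoProcess are assumed to exist, and the truncate mode is left as a caller-supplied value since none of its enumerators are shown.

// 'mongoProcess', 'expCtx', and 'truncateMode' are assumed to be provided by the caller.
std::vector<BSONObj> ops =
    mongoProcess->getCurrentOps(expCtx,
                                CurrentOpConnectionsMode::kExcludeIdle,  // skip idle connections
                                CurrentOpSessionsMode::kIncludeIdle,     // report idle sessions too
                                CurrentOpUserMode::kExcludeOthers,       // only the caller's own ops
                                truncateMode,                            // caller-supplied
                                CurrentOpCursorMode::kIncludeCursors);   // include idle cursors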
std::string DatabasesCloner::Stats::toString() const {
    return toBSON().toString();
}
Example No. 22
mongo::Status mongo::cloneCollectionAsCapped(OperationContext* opCtx,
                                             Database* db,
                                             const std::string& shortFrom,
                                             const std::string& shortTo,
                                             long long size,
                                             bool temp) {
    NamespaceString fromNss(db->name(), shortFrom);
    NamespaceString toNss(db->name(), shortTo);

    Collection* fromCollection = db->getCollection(opCtx, fromNss);
    if (!fromCollection) {
        if (db->getViewCatalog()->lookup(opCtx, fromNss.ns())) {
            return Status(ErrorCodes::CommandNotSupportedOnView,
                          str::stream() << "cloneCollectionAsCapped not supported for views: "
                                        << fromNss.ns());
        }
        return Status(ErrorCodes::NamespaceNotFound,
                      str::stream() << "source collection " << fromNss.ns() << " does not exist");
    }

    if (fromNss.isDropPendingNamespace()) {
        return Status(ErrorCodes::NamespaceNotFound,
                      str::stream() << "source collection " << fromNss.ns()
                                    << " is currently in a drop-pending state.");
    }

    if (db->getCollection(opCtx, toNss)) {
        return Status(ErrorCodes::NamespaceExists,
                      str::stream() << "cloneCollectionAsCapped failed - destination collection "
                                    << toNss.ns()
                                    << " already exists. source collection: "
                                    << fromNss.ns());
    }

    // create new collection
    {
        auto options = fromCollection->getCatalogEntry()->getCollectionOptions(opCtx);
        // The capped collection will get its own new unique id, as the conversion isn't reversible,
        // so it can't be rolled back.
        options.uuid.reset();
        options.capped = true;
        options.cappedSize = size;
        if (temp)
            options.temp = true;

        BSONObjBuilder cmd;
        cmd.append("create", toNss.coll());
        cmd.appendElements(options.toBSON());
        Status status = createCollection(opCtx, toNss.db().toString(), cmd.done());
        if (!status.isOK())
            return status;
    }

    Collection* toCollection = db->getCollection(opCtx, toNss);
    invariant(toCollection);  // we created above

    // How much data to ignore because it won't fit anyway: dataSize and extentSize can't be
    // compared exactly, so add some padding to 'size'.

    long long allocatedSpaceGuess =
        std::max(static_cast<long long>(size * 2),
                 static_cast<long long>(toCollection->getRecordStore()->storageSize(opCtx) * 2));

    long long excessSize = fromCollection->dataSize(opCtx) - allocatedSpaceGuess;

    auto exec = InternalPlanner::collectionScan(opCtx,
                                                fromNss.ns(),
                                                fromCollection,
                                                PlanExecutor::WRITE_CONFLICT_RETRY_ONLY,
                                                InternalPlanner::FORWARD);

    Snapshotted<BSONObj> objToClone;
    RecordId loc;
    PlanExecutor::ExecState state = PlanExecutor::FAILURE;  // suppress uninitialized warnings

    DisableDocumentValidation validationDisabler(opCtx);

    int retries = 0;  // non-zero when retrying our last document.
    while (true) {
        if (!retries) {
            state = exec->getNextSnapshotted(&objToClone, &loc);
        }

        switch (state) {
            case PlanExecutor::IS_EOF:
                return Status::OK();
            case PlanExecutor::ADVANCED: {
                if (excessSize > 0) {
                    // 4x is for padding, power of 2, etc...
                    excessSize -= (4 * objToClone.value().objsize());
                    continue;
                }
                break;
            }
            default:
                // Unreachable as:
                // 1) We require a read lock (at a minimum) on the "from" collection
                //    and won't yield, preventing collection drop and PlanExecutor::DEAD
                // 2) PlanExecutor::FAILURE is only returned on PlanStage::FAILURE. The
                //    CollectionScan PlanStage does not have a FAILURE scenario.
                // 3) All other PlanExecutor states are handled above
                MONGO_UNREACHABLE;
        }

        try {
            // Make sure we are working with the latest version of the document.
            if (objToClone.snapshotId() != opCtx->recoveryUnit()->getSnapshotId() &&
                !fromCollection->findDoc(opCtx, loc, &objToClone)) {
                // doc was deleted so don't clone it.
                retries = 0;
                continue;
            }

            WriteUnitOfWork wunit(opCtx);
            OpDebug* const nullOpDebug = nullptr;
            uassertStatusOK(toCollection->insertDocument(
                opCtx, InsertStatement(objToClone.value()), nullOpDebug, true));
            wunit.commit();

            // Go to the next document
            retries = 0;
        } catch (const WriteConflictException&) {
            CurOp::get(opCtx)->debug().additiveMetrics.incrementWriteConflicts(1);
            retries++;  // logAndBackoff expects this to be 1 on first call.
            WriteConflictException::logAndBackoff(retries, "cloneCollectionAsCapped", fromNss.ns());

            // Can't use writeConflictRetry since we need to save/restore exec around call to
            // abandonSnapshot.
            exec->saveState();
            opCtx->recoveryUnit()->abandonSnapshot();
            auto restoreStatus = exec->restoreState();  // Handles any WCEs internally.
            if (!restoreStatus.isOK()) {
                return restoreStatus;
            }
        }
    }

    MONGO_UNREACHABLE;
}
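cloneCollectionAsCapped above handles WriteConflictException by hand because, as its comment notes, the PlanExecutor must be saved and restored around abandonSnapshot(), which rules out the writeConflictRetry helper. A minimal sketch of that manual retry shape, with a hypothetical doInsert() standing in for the actual insert:

int attempts = 0;
while (true) {
    try {
        WriteUnitOfWork wunit(opCtx);
        doInsert(opCtx);  // hypothetical write that may throw WriteConflictException
        wunit.commit();
        break;
    } catch (const WriteConflictException&) {
        ++attempts;  // logAndBackoff expects 1 on the first call
        WriteConflictException::logAndBackoff(attempts, "cloneCollectionAsCapped", fromNss.ns());
        exec->saveState();
        opCtx->recoveryUnit()->abandonSnapshot();
        auto restoreStatus = exec->restoreState();
        if (!restoreStatus.isOK()) {
            return restoreStatus;
        }
    }
}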
Example No. 23
std::string ChangeLogType::toString() const {
    return toBSON().toString();
}
Example No. 24
void ChunkVersion::appendWithFieldForCommands(BSONObjBuilder* builder, StringData field) const {
    builder->appendArray(field, toBSON());
}
Example No. 25
std::string ShardType::toString() const {
    return toBSON().toString();
}
Example No. 26
std::string ShardCollectionType::toString() const {
    return toBSON().toString();
}