bool DropPendingCollectionReaper::rollBackDropPendingCollection(
    OperationContext* opCtx, const OpTime& opTime, const NamespaceString& collectionNamespace) {
    // Every node rolls back its own drop-pending collection renames. We should never replicate
    // these renames because these are internal operations.
    UnreplicatedWritesBlock uwb(opCtx);

    const auto pendingNss = collectionNamespace.makeDropPendingNamespace(opTime);
    {
        stdx::lock_guard<stdx::mutex> lock(_mutex);
        const auto equalRange = _dropPendingNamespaces.equal_range(opTime);
        const auto& lowerBound = equalRange.first;
        const auto& upperBound = equalRange.second;
        auto matcher = [&pendingNss](const auto& pair) { return pair.second == pendingNss; };
        auto it = std::find_if(lowerBound, upperBound, matcher);
        if (it == upperBound) {
            warning() << "Cannot find drop-pending namespace at OpTime " << opTime
                      << " for collection " << collectionNamespace << " to roll back.";
            return false;
        }

        _dropPendingNamespaces.erase(it);
    }

    log() << "Rolling back collection drop for " << pendingNss << " with drop OpTime " << opTime
          << " to namespace " << collectionNamespace;

    return true;
}
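
Every snippet on this page constructs an UnreplicatedWritesBlock on the stack before writing locally, so the shape of that guard is worth a look. Below is a minimal, self-contained sketch of the RAII idea, using a hypothetical OperationContext stand-in rather than MongoDB's real type: the constructor saves and clears the replication flag, and the destructor restores it on every exit path, including exceptions.

#include <cassert>

struct OperationContext {
    bool writesReplicated = true;  // hypothetical stand-in for writesAreReplicated()
};

class UnreplicatedWritesBlock {
public:
    explicit UnreplicatedWritesBlock(OperationContext* opCtx)
        : _opCtx(opCtx), _wasReplicated(opCtx->writesReplicated) {
        _opCtx->writesReplicated = false;
    }
    UnreplicatedWritesBlock(const UnreplicatedWritesBlock&) = delete;
    UnreplicatedWritesBlock& operator=(const UnreplicatedWritesBlock&) = delete;
    ~UnreplicatedWritesBlock() {
        _opCtx->writesReplicated = _wasReplicated;
    }

private:
    OperationContext* const _opCtx;
    const bool _wasReplicated;
};

int main() {
    OperationContext opCtx;
    {
        UnreplicatedWritesBlock uwb(&opCtx);
        assert(!opCtx.writesReplicated);  // writes in this scope stay out of the oplog
    }
    assert(opCtx.writesReplicated);  // restored when the block leaves scope
    return 0;
}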
Example #2
void ReplicationRecoveryImpl::_applyToEndOfOplog(OperationContext* opCtx,
                                                 Timestamp oplogApplicationStartPoint,
                                                 Timestamp topOfOplog) {
    invariant(!oplogApplicationStartPoint.isNull());
    invariant(!topOfOplog.isNull());

    // Check if we have any unapplied ops in our oplog. It is important that this is done after
    // deleting the ragged end of the oplog.
    if (oplogApplicationStartPoint == topOfOplog) {
        log()
            << "No oplog entries to apply for recovery. appliedThrough is at the top of the oplog.";
        return;  // We've applied all the valid oplog we have.
    } else if (oplogApplicationStartPoint > topOfOplog) {
        severe() << "Applied op " << oplogApplicationStartPoint.toBSON()
                 << " not found. Top of oplog is " << topOfOplog.toBSON() << '.';
        fassertFailedNoTrace(40313);
    }

    log() << "Replaying stored operations from " << oplogApplicationStartPoint.toBSON()
          << " (exclusive) to " << topOfOplog.toBSON() << " (inclusive).";

    DBDirectClient db(opCtx);
    auto cursor = db.query(NamespaceString::kRsOplogNamespace.ns(),
                           QUERY("ts" << BSON("$gte" << oplogApplicationStartPoint)),
                           /*batchSize*/ 0,
                           /*skip*/ 0,
                           /*projection*/ nullptr,
                           QueryOption_OplogReplay);

    // Check that the first document matches our appliedThrough point then skip it since it's
    // already been applied.
    if (!cursor->more()) {
        // This should really be impossible because we check above that the top of the oplog is
        // strictly > appliedThrough. If this fails it represents a serious bug in either the
        // storage engine or query's implementation of OplogReplay.
        severe() << "Couldn't find any entries in the oplog >= "
                 << oplogApplicationStartPoint.toBSON() << " which should be impossible.";
        fassertFailedNoTrace(40293);
    }

    auto firstTimestampFound =
        fassertStatusOK(40291, OpTime::parseFromOplogEntry(cursor->nextSafe())).getTimestamp();
    if (firstTimestampFound != oplogApplicationStartPoint) {
        severe() << "Oplog entry at " << oplogApplicationStartPoint.toBSON()
                 << " is missing; actual entry found is " << firstTimestampFound.toBSON();
        fassertFailedNoTrace(40292);
    }

    // Apply remaining ops one at a time, but don't log them because they are already logged.
    UnreplicatedWritesBlock uwb(opCtx);

    while (cursor->more()) {
        auto entry = cursor->nextSafe();
        fassertStatusOK(40294,
                        SyncTail::syncApply(opCtx, entry, OplogApplication::Mode::kRecovering));
        _consistencyMarkers->setAppliedThrough(
            opCtx, fassertStatusOK(40295, OpTime::parseFromOplogEntry(entry)));
    }
}
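
A detail that is easy to miss in the control flow above: the start point is exclusive because the entry at appliedThrough has already been applied, while the top of the oplog is inclusive. Here is a self-contained sketch of that loop with plain integers standing in for timestamps (all names below are illustrative, not MongoDB's):

#include <cassert>
#include <cstdio>
#include <vector>

struct Entry {
    int ts;  // stands in for the oplog entry's timestamp
};

// Replays every entry after startPoint (exclusive) up to topOfOplog
// (inclusive), advancing the appliedThrough checkpoint after each apply.
int replayToEnd(const std::vector<Entry>& oplog, int startPoint, int topOfOplog) {
    assert(startPoint <= topOfOplog);
    if (startPoint == topOfOplog) {
        return startPoint;  // nothing to apply
    }

    // Find the first entry >= startPoint; it must exist and must be exactly
    // the start point (mirrors fasserts 40293 and 40292 above).
    std::size_t i = 0;
    while (i < oplog.size() && oplog[i].ts < startPoint) {
        ++i;
    }
    assert(i < oplog.size());
    assert(oplog[i].ts == startPoint);
    ++i;  // already applied, so skip it

    int appliedThrough = startPoint;
    for (; i < oplog.size(); ++i) {
        std::printf("applying op at ts=%d\n", oplog[i].ts);  // the apply step
        appliedThrough = oplog[i].ts;  // checkpoint only after a successful apply
    }
    return appliedThrough;
}

int main() {
    const std::vector<Entry> oplog = {{1}, {2}, {3}, {4}};
    assert(replayToEnd(oplog, 2, 4) == 4);
    return 0;
}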
Example #3
/* Globals assumed to be defined in the surrounding source file: the node
 * count, the node list, the per-node distance buffer, and the uwb() ranging
 * call. The declarations below are illustrative guesses, not the originals. */
extern int num;
extern int nodes[];
extern int dis[];
extern int uwb(int node);

/* Measure the UWB range to every known node and return the distance buffer. */
int* measure()
{
    int i;
    for (i = 0; i < num; i++) {
        dis[i] = uwb(nodes[i]);
    }
    return dis;
}
Example #4
void DropPendingCollectionReaper::dropCollectionsOlderThan(OperationContext* opCtx,
                                                           const OpTime& opTime) {
    DropPendingNamespaces toDrop;
    {
        stdx::lock_guard<stdx::mutex> lock(_mutex);
        for (auto it = _dropPendingNamespaces.cbegin();
             it != _dropPendingNamespaces.cend() && it->first <= opTime;
             ++it) {
            toDrop.insert(*it);
        }
    }

    if (toDrop.empty()) {
        return;
    }

    {
        // Every node cleans up its own drop-pending collections. We should never replicate these
        // drops because these are internal operations.
        UnreplicatedWritesBlock uwb(opCtx);

        for (const auto& opTimeAndNamespace : toDrop) {
            const auto& dropOpTime = opTimeAndNamespace.first;
            const auto& nss = opTimeAndNamespace.second;
            log() << "Completing collection drop for " << nss << " with drop optime " << dropOpTime
                  << " (notification optime: " << opTime << ")";
            Status status = Status::OK();
            try {
                // dropCollection could throw an interrupt exception, since it acquires db locks.
                status = _storageInterface->dropCollection(opCtx, nss);
            } catch (...) {
                status = exceptionToStatus();
            }
            if (!status.isOK()) {
                warning() << "Failed to remove drop-pending collection " << nss
                          << " with drop optime " << dropOpTime
                          << " (notification optime: " << opTime << "): " << status;
            }
        }
    }

    {
        // Entries must be removed AFTER drops are completed, so that getEarliestDropOpTime()
        // returns appropriate results.
        stdx::lock_guard<stdx::mutex> lock(_mutex);
        auto it = _dropPendingNamespaces.cbegin();
        while (it != _dropPendingNamespaces.cend() && it->first <= opTime) {
            if (toDrop.find(it->first) != toDrop.cend()) {
                it = _dropPendingNamespaces.erase(it);
            } else {
                ++it;
            }
        }
    }
}
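
The function above is careful about lock scope: it never holds _mutex across dropCollection, and it erases entries only after the drops complete. Here is a self-contained sketch of that two-phase shape using std::multimap (the class and method names below are illustrative):

#include <iostream>
#include <map>
#include <mutex>
#include <string>

using PendingMap = std::multimap<int, std::string>;  // optime -> namespace

class Reaper {
public:
    void add(int opTime, std::string ns) {
        std::lock_guard<std::mutex> lock(_mutex);
        _pending.emplace(opTime, std::move(ns));
    }

    void dropOlderThan(int opTime) {
        // Phase 1: snapshot the matching entries under the lock.
        PendingMap toDrop;
        {
            std::lock_guard<std::mutex> lock(_mutex);
            for (auto it = _pending.cbegin();
                 it != _pending.cend() && it->first <= opTime;
                 ++it) {
                toDrop.insert(*it);
            }
        }

        // Phase 2: slow per-entry work runs with the lock released.
        for (const auto& entry : toDrop) {
            std::cout << "dropping " << entry.second << '\n';
        }

        // Phase 3: re-acquire the lock and erase only what was snapshotted.
        std::lock_guard<std::mutex> lock(_mutex);
        auto it = _pending.cbegin();
        while (it != _pending.cend() && it->first <= opTime) {
            if (toDrop.find(it->first) != toDrop.cend()) {
                it = _pending.erase(it);
            } else {
                ++it;
            }
        }
    }

private:
    std::mutex _mutex;
    PendingMap _pending;
};

int main() {
    Reaper reaper;
    reaper.add(1, "db.a");
    reaper.add(2, "db.b");
    reaper.dropOlderThan(1);  // drops and erases only db.a
    return 0;
}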
Example #5
void ReplicationRecoveryImpl::_reconstructPreparedTransactions(OperationContext* opCtx) {
    DBDirectClient client(opCtx);
    const auto cursor = client.query(NamespaceString::kSessionTransactionsTableNamespace,
                                     {BSON("state"
                                           << "prepared")});

    // Iterate over each entry in the transactions table that has a prepared transaction.
    while (cursor->more()) {
        const auto txnRecordObj = cursor->next();
        const auto txnRecord = SessionTxnRecord::parse(
            IDLParserErrorContext("recovering prepared transaction"), txnRecordObj);

        invariant(txnRecord.getState() == DurableTxnStateEnum::kPrepared);

        // Get the prepareTransaction oplog entry corresponding to this transactions table entry.
        invariant(!opCtx->recoveryUnit()->getPointInTimeReadTimestamp());
        const auto prepareOpTime = txnRecord.getLastWriteOpTime();
        invariant(!prepareOpTime.isNull());
        TransactionHistoryIterator iter(prepareOpTime);
        invariant(iter.hasNext());
        const auto prepareOplogEntry = iter.next(opCtx);

        {
            // Make a new opCtx so that we can set the lsid when applying the prepare transaction
            // oplog entry.
            auto newClient =
                opCtx->getServiceContext()->makeClient("reconstruct-prepared-transactions");
            AlternativeClientRegion acr(newClient);
            const auto newOpCtx = cc().makeOperationContext();
            repl::UnreplicatedWritesBlock uwb(newOpCtx.get());

            // Snapshot transaction can never conflict with the PBWM lock.
            newOpCtx->lockState()->setShouldConflictWithSecondaryBatchApplication(false);

            // TODO: SERVER-40177 This should be removed once it is guaranteed operations applied on
            // recovering nodes cannot encounter unnecessary prepare conflicts.
            newOpCtx->recoveryUnit()->setIgnorePrepared(true);

            // Checks out the session, applies the operations and prepares the transactions.
            uassertStatusOK(applyRecoveredPrepareTransaction(newOpCtx.get(), prepareOplogEntry));
        }
    }
}
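
One detail worth calling out above is AlternativeClientRegion: it temporarily installs the new client on the current thread and puts the previous one back when the region ends. A minimal sketch of that scoped-swap idea, with a thread-local pointer standing in for MongoDB's client machinery (all names below are illustrative):

#include <cassert>
#include <string>

thread_local const std::string* currentClient = nullptr;  // what cc() would return

class AlternativeClientRegion {
public:
    explicit AlternativeClientRegion(const std::string* newClient) : _saved(currentClient) {
        currentClient = newClient;
    }
    ~AlternativeClientRegion() {
        currentClient = _saved;  // restore the original client
    }

private:
    const std::string* const _saved;
};

int main() {
    const std::string original = "main-client";
    const std::string reconstruct = "reconstruct-prepared-transactions";
    currentClient = &original;
    {
        AlternativeClientRegion acr(&reconstruct);
        assert(currentClient == &reconstruct);  // code in this scope sees the new client
    }
    assert(currentClient == &original);  // restored when the region ends
    return 0;
}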
Example #6
Status ReplicationCoordinatorExternalStateImpl::runRepairOnLocalDB(OperationContext* opCtx) {
    try {
        Lock::GlobalWrite globalWrite(opCtx);
        StorageEngine* engine = getGlobalServiceContext()->getGlobalStorageEngine();

        if (!engine->isMmapV1()) {
            return Status::OK();
        }

        UnreplicatedWritesBlock uwb(opCtx);
        Status status = repairDatabase(opCtx, engine, localDbName, false, false);

        // Open the database before returning, then surface any repair failure.
        dbHolder().openDb(opCtx, localDbName);
        if (!status.isOK()) {
            return status;
        }
    } catch (const DBException& ex) {
        return ex.toStatus();
    }
    return Status::OK();
}
Example #7
void CollectionCloner::_listIndexesCallback(const Fetcher::QueryResponseStatus& fetchResult,
                                            Fetcher::NextAction* nextAction,
                                            BSONObjBuilder* getMoreBob) {
    const bool collectionIsEmpty = fetchResult == ErrorCodes::NamespaceNotFound;
    if (collectionIsEmpty) {
        // Schedule collection creation and finish callback.
        auto&& scheduleResult =
            _scheduleDbWorkFn([this](const executor::TaskExecutor::CallbackArgs& cbd) {
                if (!cbd.status.isOK()) {
                    _finishCallback(cbd.status);
                    return;
                }
                auto opCtx = cbd.opCtx;
                UnreplicatedWritesBlock uwb(opCtx);
                auto&& createStatus =
                    _storageInterface->createCollection(opCtx, _destNss, _options);
                _finishCallback(createStatus);
            });
        if (!scheduleResult.isOK()) {
            _finishCallback(scheduleResult.getStatus());
        }
        return;
    }
    if (!fetchResult.isOK()) {
        _finishCallback(fetchResult.getStatus().withContext(
            str::stream() << "listIndexes call failed on collection '" << _sourceNss.ns() << "'"));
        return;
    }

    auto batchData(fetchResult.getValue());
    auto&& documents = batchData.documents;

    if (documents.empty()) {
        warning() << "No indexes found for collection " << _sourceNss.ns() << " while cloning from "
                  << _source;
    }

    UniqueLock lk(_mutex);
    // When listing indexes by UUID, the sync source may use a different name for the collection
    // as a result of renaming or two-phase drop. Since the index spec also includes a 'ns' field,
    // this must be rewritten.
    BSONObjBuilder nsFieldReplacementBuilder;
    nsFieldReplacementBuilder.append("ns", _sourceNss.ns());
    BSONElement nsFieldReplacementElem = nsFieldReplacementBuilder.done().firstElement();

    // We may be called with multiple batches leading to a need to grow _indexSpecs.
    _indexSpecs.reserve(_indexSpecs.size() + documents.size());
    for (auto&& doc : documents) {
        // The addField replaces the 'ns' field with the correct name, see above.
        if (StringData("_id_") == doc["name"].str()) {
            _idIndexSpec = doc.addField(nsFieldReplacementElem);
            continue;
        }
        _indexSpecs.push_back(doc.addField(nsFieldReplacementElem));
    }
    lk.unlock();

    // The fetcher will continue to call with kGetMore until an error or the last batch.
    if (*nextAction == Fetcher::NextAction::kGetMore) {
        invariant(getMoreBob);
        getMoreBob->append("getMore", batchData.cursorId);
        getMoreBob->append("collection", batchData.nss.coll());
        return;
    }

    // We have all of the indexes now, so we can start cloning the collection data.
    auto&& scheduleResult = _scheduleDbWorkFn(
        [=](const executor::TaskExecutor::CallbackArgs& cbd) { _beginCollectionCallback(cbd); });
    if (!scheduleResult.isOK()) {
        _finishCallback(scheduleResult.getStatus());
        return;
    }
}
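
Here is a compact sketch of the 'ns' rewrite performed above, with std::map standing in for BSONObj (the type and member names below are illustrative): each fetched spec's "ns" value is overwritten with the namespace the cloner expects, and the _id index is routed to its own slot rather than the general list.

#include <cassert>
#include <map>
#include <string>
#include <vector>

using IndexSpec = std::map<std::string, std::string>;

struct IndexSpecCollector {
    std::string sourceNs;
    IndexSpec idIndexSpec;
    std::vector<IndexSpec> indexSpecs;

    // May be called once per batch, so _indexSpecs grows across calls.
    void addBatch(std::vector<IndexSpec> docs) {
        indexSpecs.reserve(indexSpecs.size() + docs.size());
        for (auto& doc : docs) {
            doc["ns"] = sourceNs;  // mirrors doc.addField(nsFieldReplacementElem)
            if (doc["name"] == "_id_") {
                idIndexSpec = doc;
                continue;
            }
            indexSpecs.push_back(std::move(doc));
        }
    }
};

int main() {
    IndexSpecCollector c{"test.coll"};
    c.addBatch({
        {{"name", "_id_"}, {"ns", "other.name"}},
        {{"name", "a_1"}, {"ns", "other.name"}},
    });
    assert(c.idIndexSpec["ns"] == "test.coll");
    assert(c.indexSpecs.size() == 1 && c.indexSpecs[0]["ns"] == "test.coll");
    return 0;
}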
Example #8
Status applyOps(OperationContext* opCtx,
                const std::string& dbName,
                const BSONObj& applyOpCmd,
                BSONObjBuilder* result) {
    bool allowAtomic = false;
    uassertStatusOK(
        bsonExtractBooleanFieldWithDefault(applyOpCmd, "allowAtomic", true, &allowAtomic));
    auto areOpsCrudOnly = _areOpsCrudOnly(applyOpCmd);
    auto isAtomic = allowAtomic && areOpsCrudOnly;
    auto hasPrecondition = _hasPrecondition(applyOpCmd);

    boost::optional<Lock::GlobalWrite> globalWriteLock;
    boost::optional<Lock::DBLock> dbWriteLock;

    // There's only one case where we are allowed to take the database lock instead of the global
    // lock - no preconditions; only CRUD ops; and non-atomic mode.
    if (!hasPrecondition && areOpsCrudOnly && !allowAtomic) {
        dbWriteLock.emplace(opCtx, dbName, MODE_X);
    } else {
        globalWriteLock.emplace(opCtx);
    }

    bool userInitiatedWritesAndNotPrimary = opCtx->writesAreReplicated() &&
        !repl::getGlobalReplicationCoordinator()->canAcceptWritesForDatabase(opCtx, dbName);

    if (userInitiatedWritesAndNotPrimary)
        return Status(ErrorCodes::NotMaster,
                      str::stream() << "Not primary while applying ops to database " << dbName);

    if (hasPrecondition) {
        auto status = _checkPrecondition(opCtx, applyOpCmd, result);
        if (!status.isOK()) {
            return status;
        }
    }

    int numApplied = 0;
    if (!isAtomic)
        return _applyOps(opCtx, dbName, applyOpCmd, result, &numApplied);

    // Perform write ops atomically
    invariant(globalWriteLock);
    try {
        writeConflictRetry(opCtx, "applyOps", dbName, [&] {
            BSONObjBuilder intermediateResult;
            WriteUnitOfWork wunit(opCtx);
            numApplied = 0;
            {
                // Suppress replication for atomic operations until end of applyOps.
                repl::UnreplicatedWritesBlock uwb(opCtx);
                uassertStatusOK(
                    _applyOps(opCtx, dbName, applyOpCmd, &intermediateResult, &numApplied));
            }
            // Generate oplog entry for all atomic ops collectively.
            if (opCtx->writesAreReplicated()) {
                // We want this applied atomically on secondaries, so we rewrite the oplog entry
                // without the precondition for speed.

                BSONObjBuilder cmdBuilder;

                for (auto elem : applyOpCmd) {
                    auto name = elem.fieldNameStringData();
                    if (name == kPreconditionFieldName)
                        continue;
                    if (name == "bypassDocumentValidation")
                        continue;
                    cmdBuilder.append(elem);
                }

                const BSONObj cmdRewritten = cmdBuilder.done();

                auto opObserver = getGlobalServiceContext()->getOpObserver();
                invariant(opObserver);
                opObserver->onApplyOps(opCtx, dbName, cmdRewritten);
            }
            wunit.commit();
            result->appendElements(intermediateResult.obj());
        });
    } catch (const DBException& ex) {
        if (ex.getCode() == ErrorCodes::NamespaceNotFound) {
            // Retry in non-atomic mode, since MMAP cannot implicitly create a new database
            // within an active WriteUnitOfWork.
            return _applyOps(opCtx, dbName, applyOpCmd, result, &numApplied);
        }
        BSONArrayBuilder ab;
        ++numApplied;
        for (int j = 0; j < numApplied; j++)
            ab.append(false);
        result->append("applied", numApplied);
        result->append("code", ex.getCode());
        result->append("codeName", ErrorCodes::errorString(ErrorCodes::fromInt(ex.getCode())));
        result->append("errmsg", ex.what());
        result->append("results", ab.arr());
        return Status(ErrorCodes::UnknownError, ex.what());
    }

    return Status::OK();
}
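
The lock choice at the top of applyOps is a three-way condition that is easy to misread. Below is a small sketch of just that decision (the enum and function names are illustrative, not MongoDB's): the weaker database-exclusive lock is taken only when there is no precondition to check, every op is plain CRUD, and atomic application was explicitly disabled; every other combination takes the global write lock.

#include <cassert>

enum class LockChoice { kDatabaseExclusive, kGlobalWrite };

LockChoice chooseApplyOpsLock(bool hasPrecondition, bool areOpsCrudOnly, bool allowAtomic) {
    if (!hasPrecondition && areOpsCrudOnly && !allowAtomic) {
        return LockChoice::kDatabaseExclusive;  // MODE_X DB lock above
    }
    return LockChoice::kGlobalWrite;
}

int main() {
    assert(chooseApplyOpsLock(false, true, false) == LockChoice::kDatabaseExclusive);
    assert(chooseApplyOpsLock(false, true, true) == LockChoice::kGlobalWrite);    // atomic mode
    assert(chooseApplyOpsLock(true, true, false) == LockChoice::kGlobalWrite);    // precondition
    assert(chooseApplyOpsLock(false, false, false) == LockChoice::kGlobalWrite);  // non-CRUD ops
    return 0;
}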