Exemple #1
0
void Collection::deleteDocument(
    OperationContext* txn, const RecordId& loc, bool cappedOK, bool noWarn, BSONObj* deletedId) {
    if (isCapped() && !cappedOK) {
        log() << "failing remove on a capped ns " << _ns << endl;
        uasserted(10089, "cannot remove from a capped collection");
        return;
    }

    Snapshotted<BSONObj> doc = docFor(txn, loc);

    BSONElement e = doc.value()["_id"];
    BSONObj id;
    if (e.type()) {
        id = e.wrap();
        if (deletedId) {
            *deletedId = e.wrap();
        }
    }

    /* check if any cursors point to us.  if so, advance them. */
    _cursorManager.invalidateDocument(txn, loc, INVALIDATION_DELETION);

    _indexCatalog.unindexRecord(txn, doc.value(), loc, noWarn);

    _recordStore->deleteRecord(txn, loc);

    if (!id.isEmpty()) {
        getGlobalServiceContext()->getOpObserver()->onDelete(txn, ns().ns(), id);
    }
}
PlanExecutor::ExecState PlanExecutor::getNext(BSONObj* objOut, RecordId* dlOut) {
    Snapshotted<BSONObj> snapshotted;
    ExecState state = getNextImpl(objOut ? &snapshotted : NULL, dlOut);

    if (objOut) {
        *objOut = snapshotted.value();
    }

    return state;
}
Exemple #3
0
void Collection::deleteDocument(
    OperationContext* txn, const RecordId& loc, bool fromMigrate, bool cappedOK, bool noWarn) {
    if (isCapped() && !cappedOK) {
        log() << "failing remove on a capped ns " << _ns << endl;
        uasserted(10089, "cannot remove from a capped collection");
        return;
    }

    Snapshotted<BSONObj> doc = docFor(txn, loc);

    auto opObserver = getGlobalServiceContext()->getOpObserver();
    OpObserver::DeleteState deleteState = opObserver->aboutToDelete(txn, ns(), doc.value());

    /* check if any cursors point to us.  if so, advance them. */
    _cursorManager.invalidateDocument(txn, loc, INVALIDATION_DELETION);

    _indexCatalog.unindexRecord(txn, doc.value(), loc, noWarn);

    _recordStore->deleteRecord(txn, loc);

    opObserver->onDelete(txn, ns(), std::move(deleteState), fromMigrate);
}
Exemple #4
0
StatusWith<RecordData> Collection::updateDocumentWithDamages(
    OperationContext* txn,
    const RecordId& loc,
    const Snapshotted<RecordData>& oldRec,
    const char* damageSource,
    const mutablebson::DamageVector& damages,
    oplogUpdateEntryArgs& args) {
    dassert(txn->lockState()->isCollectionLockedForMode(ns().toString(), MODE_IX));
    invariant(oldRec.snapshotId() == txn->recoveryUnit()->getSnapshotId());
    invariant(updateWithDamagesSupported());

    // Broadcast the mutation so that query results stay correct.
    _cursorManager.invalidateDocument(txn, loc, INVALIDATION_MUTATION);

    auto newRecStatus =
        _recordStore->updateWithDamages(txn, loc, oldRec.value(), damageSource, damages);

    if (newRecStatus.isOK()) {
        args.ns = ns().ns();
        getGlobalServiceContext()->getOpObserver()->onUpdate(txn, args);
    }
    return newRecStatus;
}
Status MigrationChunkClonerSourceLegacy::nextCloneBatch(OperationContext* txn,
                                                        Collection* collection,
                                                        BSONArrayBuilder* arrBuilder) {
    dassert(txn->lockState()->isCollectionLockedForMode(_args.getNss().ns(), MODE_IS));

    ElapsedTracker tracker(txn->getServiceContext()->getFastClockSource(),
                           internalQueryExecYieldIterations,
                           Milliseconds(internalQueryExecYieldPeriodMS.load()));

    stdx::lock_guard<stdx::mutex> sl(_mutex);

    std::set<RecordId>::iterator it;

    for (it = _cloneLocs.begin(); it != _cloneLocs.end(); ++it) {
        // We must always make progress in this method by at least one document because empty return
        // indicates there is no more initial clone data.
        if (arrBuilder->arrSize() && tracker.intervalHasElapsed()) {
            break;
        }

        Snapshotted<BSONObj> doc;
        if (collection->findDoc(txn, *it, &doc)) {
            // Use the builder size instead of accumulating the document sizes directly so that we
            // take into consideration the overhead of BSONArray indices.
            if (arrBuilder->arrSize() &&
                (arrBuilder->len() + doc.value().objsize() + 1024) > BSONObjMaxUserSize) {
                break;
            }

            arrBuilder->append(doc.value());
        }
    }

    _cloneLocs.erase(_cloneLocs.begin(), it);

    return Status::OK();
}
Exemple #6
0
Status MultiIndexBlock::insertAllDocumentsInCollection(std::set<RecordId>* dupsOut) {
    const char* curopMessage = _buildInBackground ? "Index Build (background)" : "Index Build";
    const auto numRecords = _collection->numRecords(_txn);
    stdx::unique_lock<Client> lk(*_txn->getClient());
    ProgressMeterHolder progress(*_txn->setMessage_inlock(curopMessage, curopMessage, numRecords));
    lk.unlock();

    Timer t;

    unsigned long long n = 0;

    unique_ptr<PlanExecutor> exec(InternalPlanner::collectionScan(
        _txn, _collection->ns().ns(), _collection, PlanExecutor::YIELD_MANUAL));
    if (_buildInBackground) {
        invariant(_allowInterruption);
        exec->setYieldPolicy(PlanExecutor::YIELD_AUTO);
    } else {
        exec->setYieldPolicy(PlanExecutor::WRITE_CONFLICT_RETRY_ONLY);
    }

    Snapshotted<BSONObj> objToIndex;
    RecordId loc;
    PlanExecutor::ExecState state;
    int retries = 0;  // non-zero when retrying our last document.
    while (retries ||
           (PlanExecutor::ADVANCED == (state = exec->getNextSnapshotted(&objToIndex, &loc)))) {
        try {
            if (_allowInterruption)
                _txn->checkForInterrupt();

            // Make sure we are working with the latest version of the document.
            if (objToIndex.snapshotId() != _txn->recoveryUnit()->getSnapshotId() &&
                !_collection->findDoc(_txn, loc, &objToIndex)) {
                // doc was deleted so don't index it.
                retries = 0;
                continue;
            }

            // Done before insert so we can retry document if it WCEs.
            progress->setTotalWhileRunning(_collection->numRecords(_txn));

            WriteUnitOfWork wunit(_txn);
            Status ret = insert(objToIndex.value(), loc);
            if (_buildInBackground)
                exec->saveState();
            if (ret.isOK()) {
                wunit.commit();
            } else if (dupsOut && ret.code() == ErrorCodes::DuplicateKey) {
                // If dupsOut is non-null, we should only fail the specific insert that
                // led to a DuplicateKey rather than the whole index build.
                dupsOut->insert(loc);
            } else {
                // Fail the index build hard.
                return ret;
            }
            if (_buildInBackground)
                exec->restoreState();  // Handles any WCEs internally.

            // Go to the next document
            progress->hit();
            n++;
            retries = 0;
        } catch (const WriteConflictException& wce) {
            CurOp::get(_txn)->debug().writeConflicts++;
            retries++;  // logAndBackoff expects this to be 1 on first call.
            wce.logAndBackoff(retries, "index creation", _collection->ns().ns());

            // Can't use WRITE_CONFLICT_RETRY_LOOP macros since we need to save/restore exec
            // around call to abandonSnapshot.
            exec->saveState();
            _txn->recoveryUnit()->abandonSnapshot();
            exec->restoreState();  // Handles any WCEs internally.
        }
    }

    uassert(28550,
            "Unable to complete index build due to collection scan failure: " +
                WorkingSetCommon::toStatusString(objToIndex.value()),
            state == PlanExecutor::IS_EOF);

    progress->finished();

    Status ret = doneInserting(dupsOut);
    if (!ret.isOK())
        return ret;

    log() << "build index done.  scanned " << n << " total records. " << t.seconds() << " secs"
          << endl;

    return Status::OK();
}
Exemple #7
0
StatusWith<RecordId> Collection::updateDocument(OperationContext* txn,
                                                const RecordId& oldLocation,
                                                const Snapshotted<BSONObj>& oldDoc,
                                                const BSONObj& newDoc,
                                                bool enforceQuota,
                                                bool indexesAffected,
                                                OpDebug* debug,
                                                oplogUpdateEntryArgs& args) {
    {
        auto status = checkValidation(txn, newDoc);
        if (!status.isOK()) {
            if (_validationLevel == STRICT_V) {
                return status;
            }
            // moderate means we have to check the old doc
            auto oldDocStatus = checkValidation(txn, oldDoc.value());
            if (oldDocStatus.isOK()) {
                // transitioning from good -> bad is not ok
                return status;
            }
            // bad -> bad is ok in moderate mode
        }
    }

    dassert(txn->lockState()->isCollectionLockedForMode(ns().toString(), MODE_IX));
    invariant(oldDoc.snapshotId() == txn->recoveryUnit()->getSnapshotId());

    if (_needCappedLock) {
        // X-lock the metadata resource for this capped collection until the end of the WUOW. This
        // prevents the primary from executing with more concurrency than secondaries.
        // See SERVER-21646.
        Lock::ResourceLock{txn->lockState(), ResourceId(RESOURCE_METADATA, _ns.ns()), MODE_X};
    }

    SnapshotId sid = txn->recoveryUnit()->getSnapshotId();

    BSONElement oldId = oldDoc.value()["_id"];
    if (!oldId.eoo() && (oldId != newDoc["_id"]))
        return StatusWith<RecordId>(
            ErrorCodes::InternalError, "in Collection::updateDocument _id mismatch", 13596);

    // The MMAPv1 storage engine implements capped collections in a way that does not allow records
    // to grow beyond their original size. If MMAPv1 part of a replicaset with storage engines that
    // do not have this limitation, replication could result in errors, so it is necessary to set a
    // uniform rule here. Similarly, it is not sufficient to disallow growing records, because this
    // happens when secondaries roll back an update shrunk a record. Exactly replicating legacy
    // MMAPv1 behavior would require padding shrunk documents on all storage engines. Instead forbid
    // all size changes.
    const auto oldSize = oldDoc.value().objsize();
    if (_recordStore->isCapped() && oldSize != newDoc.objsize())
        return {ErrorCodes::CannotGrowDocumentInCappedNamespace,
                str::stream() << "Cannot change the size of a document in a capped collection: "
                              << oldSize << " != " << newDoc.objsize()};

    // At the end of this step, we will have a map of UpdateTickets, one per index, which
    // represent the index updates needed to be done, based on the changes between oldDoc and
    // newDoc.
    OwnedPointerMap<IndexDescriptor*, UpdateTicket> updateTickets;
    if (indexesAffected) {
        IndexCatalog::IndexIterator ii = _indexCatalog.getIndexIterator(txn, true);
        while (ii.more()) {
            IndexDescriptor* descriptor = ii.next();
            IndexCatalogEntry* entry = ii.catalogEntry(descriptor);
            IndexAccessMethod* iam = ii.accessMethod(descriptor);

            InsertDeleteOptions options;
            options.logIfError = false;
            options.dupsAllowed =
                !(KeyPattern::isIdKeyPattern(descriptor->keyPattern()) || descriptor->unique()) ||
                repl::getGlobalReplicationCoordinator()->shouldIgnoreUniqueIndex(descriptor);
            UpdateTicket* updateTicket = new UpdateTicket();
            updateTickets.mutableMap()[descriptor] = updateTicket;
            Status ret = iam->validateUpdate(txn,
                                             oldDoc.value(),
                                             newDoc,
                                             oldLocation,
                                             options,
                                             updateTicket,
                                             entry->getFilterExpression());
            if (!ret.isOK()) {
                return StatusWith<RecordId>(ret);
            }
        }
    }

    // This can call back into Collection::recordStoreGoingToMove.  If that happens, the old
    // object is removed from all indexes.
    StatusWith<RecordId> newLocation = _recordStore->updateRecord(
        txn, oldLocation, newDoc.objdata(), newDoc.objsize(), _enforceQuota(enforceQuota), this);

    if (!newLocation.isOK()) {
        return newLocation;
    }

    // At this point, the old object may or may not still be indexed, depending on if it was
    // moved. If the object did move, we need to add the new location to all indexes.
    if (newLocation.getValue() != oldLocation) {
        if (debug) {
            if (debug->nmoved == -1)  // default of -1 rather than 0
                debug->nmoved = 1;
            else
                debug->nmoved += 1;
        }

        std::vector<BsonRecord> bsonRecords;
        BsonRecord bsonRecord = {newLocation.getValue(), &newDoc};
        bsonRecords.push_back(bsonRecord);
        Status s = _indexCatalog.indexRecords(txn, bsonRecords);
        if (!s.isOK())
            return StatusWith<RecordId>(s);
        invariant(sid == txn->recoveryUnit()->getSnapshotId());
        args.ns = ns().ns();
        getGlobalServiceContext()->getOpObserver()->onUpdate(txn, args);

        return newLocation;
    }

    // Object did not move.  We update each index with each respective UpdateTicket.

    if (debug)
        debug->keyUpdates = 0;

    if (indexesAffected) {
        IndexCatalog::IndexIterator ii = _indexCatalog.getIndexIterator(txn, true);
        while (ii.more()) {
            IndexDescriptor* descriptor = ii.next();
            IndexAccessMethod* iam = ii.accessMethod(descriptor);

            int64_t updatedKeys;
            Status ret = iam->update(txn, *updateTickets.mutableMap()[descriptor], &updatedKeys);
            if (!ret.isOK())
                return StatusWith<RecordId>(ret);
            if (debug)
                debug->keyUpdates += updatedKeys;
        }
    }

    invariant(sid == txn->recoveryUnit()->getSnapshotId());
    args.ns = ns().ns();
    getGlobalServiceContext()->getOpObserver()->onUpdate(txn, args);

    return newLocation;
}
Exemple #8
0
StatusWith<RecordId> Collection::updateDocument(OperationContext* txn,
                                                const RecordId& oldLocation,
                                                const Snapshotted<BSONObj>& oldDoc,
                                                const BSONObj& newDoc,
                                                bool enforceQuota,
                                                bool indexesAffected,
                                                OpDebug* debug,
                                                oplogUpdateEntryArgs& args) {
    {
        auto status = checkValidation(txn, newDoc);
        if (!status.isOK()) {
            if (_validationLevel == STRICT_V) {
                return status;
            }
            // moderate means we have to check the old doc
            auto oldDocStatus = checkValidation(txn, oldDoc.value());
            if (oldDocStatus.isOK()) {
                // transitioning from good -> bad is not ok
                return status;
            }
            // bad -> bad is ok in moderate mode
        }
    }

    dassert(txn->lockState()->isCollectionLockedForMode(ns().toString(), MODE_IX));
    invariant(oldDoc.snapshotId() == txn->recoveryUnit()->getSnapshotId());

    SnapshotId sid = txn->recoveryUnit()->getSnapshotId();

    BSONElement oldId = oldDoc.value()["_id"];
    if (!oldId.eoo() && (oldId != newDoc["_id"]))
        return StatusWith<RecordId>(
            ErrorCodes::InternalError, "in Collection::updateDocument _id mismatch", 13596);

    // At the end of this step, we will have a map of UpdateTickets, one per index, which
    // represent the index updates needed to be done, based on the changes between oldDoc and
    // newDoc.
    OwnedPointerMap<IndexDescriptor*, UpdateTicket> updateTickets;
    if (indexesAffected) {
        IndexCatalog::IndexIterator ii = _indexCatalog.getIndexIterator(txn, true);
        while (ii.more()) {
            IndexDescriptor* descriptor = ii.next();
            IndexCatalogEntry* entry = ii.catalogEntry(descriptor);
            IndexAccessMethod* iam = ii.accessMethod(descriptor);

            InsertDeleteOptions options;
            options.logIfError = false;
            options.dupsAllowed =
                !(KeyPattern::isIdKeyPattern(descriptor->keyPattern()) || descriptor->unique()) ||
                repl::getGlobalReplicationCoordinator()->shouldIgnoreUniqueIndex(descriptor);
            UpdateTicket* updateTicket = new UpdateTicket();
            updateTickets.mutableMap()[descriptor] = updateTicket;
            Status ret = iam->validateUpdate(txn,
                                             oldDoc.value(),
                                             newDoc,
                                             oldLocation,
                                             options,
                                             updateTicket,
                                             entry->getFilterExpression());
            if (!ret.isOK()) {
                return StatusWith<RecordId>(ret);
            }
        }
    }

    // This can call back into Collection::recordStoreGoingToMove.  If that happens, the old
    // object is removed from all indexes.
    StatusWith<RecordId> newLocation = _recordStore->updateRecord(
        txn, oldLocation, newDoc.objdata(), newDoc.objsize(), _enforceQuota(enforceQuota), this);

    if (!newLocation.isOK()) {
        return newLocation;
    }

    // At this point, the old object may or may not still be indexed, depending on if it was
    // moved. If the object did move, we need to add the new location to all indexes.
    if (newLocation.getValue() != oldLocation) {
        if (debug) {
            if (debug->nmoved == -1)  // default of -1 rather than 0
                debug->nmoved = 1;
            else
                debug->nmoved += 1;
        }

        Status s = _indexCatalog.indexRecord(txn, newDoc, newLocation.getValue());
        if (!s.isOK())
            return StatusWith<RecordId>(s);
        invariant(sid == txn->recoveryUnit()->getSnapshotId());
        args.ns = ns().ns();
        getGlobalServiceContext()->getOpObserver()->onUpdate(txn, args);

        return newLocation;
    }

    // Object did not move.  We update each index with each respective UpdateTicket.

    if (debug)
        debug->keyUpdates = 0;

    if (indexesAffected) {
        IndexCatalog::IndexIterator ii = _indexCatalog.getIndexIterator(txn, true);
        while (ii.more()) {
            IndexDescriptor* descriptor = ii.next();
            IndexAccessMethod* iam = ii.accessMethod(descriptor);

            int64_t updatedKeys;
            Status ret = iam->update(txn, *updateTickets.mutableMap()[descriptor], &updatedKeys);
            if (!ret.isOK())
                return StatusWith<RecordId>(ret);
            if (debug)
                debug->keyUpdates += updatedKeys;
        }
    }

    invariant(sid == txn->recoveryUnit()->getSnapshotId());
    args.ns = ns().ns();
    getGlobalServiceContext()->getOpObserver()->onUpdate(txn, args);

    return newLocation;
}
bool MigrationSourceManager::clone(OperationContext* txn, string& errmsg, BSONObjBuilder& result) {
    ElapsedTracker tracker(internalQueryExecYieldIterations, internalQueryExecYieldPeriodMS);

    int allocSize = 0;

    {
        AutoGetCollection autoColl(txn, _getNS(), MODE_IS);

        stdx::lock_guard<stdx::mutex> sl(_mutex);
        if (!_active) {
            errmsg = "not active";
            return false;
        }

        Collection* collection = autoColl.getCollection();
        if (!collection) {
            errmsg = str::stream() << "collection " << _nss.toString() << " does not exist";
            return false;
        }

        allocSize = std::min(
            BSONObjMaxUserSize,
            static_cast<int>((12 + collection->averageObjectSize(txn)) * cloneLocsRemaining()));
    }

    bool isBufferFilled = false;
    BSONArrayBuilder clonedDocsArrayBuilder(allocSize);
    while (!isBufferFilled) {
        AutoGetCollection autoColl(txn, _getNS(), MODE_IS);

        stdx::lock_guard<stdx::mutex> sl(_mutex);
        if (!_active) {
            errmsg = "not active";
            return false;
        }

        // TODO: fix SERVER-16540 race
        Collection* collection = autoColl.getCollection();
        if (!collection) {
            errmsg = str::stream() << "collection " << _nss.toString() << " does not exist";
            return false;
        }

        stdx::lock_guard<stdx::mutex> lk(_cloneLocsMutex);

        std::set<RecordId>::iterator cloneLocsIter = _cloneLocs.begin();
        for (; cloneLocsIter != _cloneLocs.end(); ++cloneLocsIter) {
            if (tracker.intervalHasElapsed())  // should I yield?
                break;

            RecordId recordId = *cloneLocsIter;
            Snapshotted<BSONObj> doc;
            if (!collection->findDoc(txn, recordId, &doc)) {
                // doc was deleted
                continue;
            }

            // Use the builder size instead of accumulating 'doc's size so that we take
            // into consideration the overhead of BSONArray indices, and *always*
            // append one doc.
            if (clonedDocsArrayBuilder.arrSize() != 0 &&
                (clonedDocsArrayBuilder.len() + doc.value().objsize() + 1024) >
                    BSONObjMaxUserSize) {
                isBufferFilled = true;  // break out of outer while loop
                break;
            }

            clonedDocsArrayBuilder.append(doc.value());
        }

        _cloneLocs.erase(_cloneLocs.begin(), cloneLocsIter);

        // Note: must be holding _cloneLocsMutex, don't move this inside while condition!
        if (_cloneLocs.empty()) {
            break;
        }
    }

    result.appendArray("objects", clonedDocsArrayBuilder.arr());
    return true;
}
Exemple #10
0
BSONObj UpdateStage::transformAndUpdate(const Snapshotted<BSONObj>& oldObj, RecordId& recordId) {
    const UpdateRequest* request = _params.request;
    UpdateDriver* driver = _params.driver;
    CanonicalQuery* cq = _params.canonicalQuery;
    UpdateLifecycle* lifecycle = request->getLifecycle();

    // If asked to return new doc, default to the oldObj, in case nothing changes.
    BSONObj newObj = oldObj.value();

    // Ask the driver to apply the mods. It may be that the driver can apply those "in
    // place", that is, some values of the old document just get adjusted without any
    // change to the binary layout on the bson layer. It may be that a whole new document
    // is needed to accomodate the new bson layout of the resulting document. In any event,
    // only enable in-place mutations if the underlying storage engine offers support for
    // writing damage events.
    _doc.reset(oldObj.value(),
               (_collection->updateWithDamagesSupported()
                    ? mutablebson::Document::kInPlaceEnabled
                    : mutablebson::Document::kInPlaceDisabled));

    BSONObj logObj;

    bool docWasModified = false;

    Status status = Status::OK();
    const bool validateForStorage = getOpCtx()->writesAreReplicated() && _enforceOkForStorage;
    FieldRefSet immutablePaths;
    if (getOpCtx()->writesAreReplicated() && !request->isFromMigration()) {
        if (lifecycle) {
            auto immutablePathsVector =
                getImmutableFields(getOpCtx(), request->getNamespaceString());
            if (immutablePathsVector) {
                immutablePaths.fillFrom(
                    transitional_tools_do_not_use::unspool_vector(*immutablePathsVector));
            }
        }
        immutablePaths.keepShortest(&idFieldRef);
    }
    if (!driver->needMatchDetails()) {
        // If we don't need match details, avoid doing the rematch
        status = driver->update(
            StringData(), &_doc, validateForStorage, immutablePaths, &logObj, &docWasModified);
    } else {
        // If there was a matched field, obtain it.
        MatchDetails matchDetails;
        matchDetails.requestElemMatchKey();

        dassert(cq);
        verify(cq->root()->matchesBSON(oldObj.value(), &matchDetails));

        string matchedField;
        if (matchDetails.hasElemMatchKey())
            matchedField = matchDetails.elemMatchKey();

        status = driver->update(
            matchedField, &_doc, validateForStorage, immutablePaths, &logObj, &docWasModified);
    }

    if (!status.isOK()) {
        uasserted(16837, status.reason());
    }

    // Skip adding _id field if the collection is capped (since capped collection documents can
    // neither grow nor shrink).
    const auto createIdField = !_collection->isCapped();

    // Ensure if _id exists it is first
    status = ensureIdFieldIsFirst(&_doc);
    if (status.code() == ErrorCodes::InvalidIdField) {
        // Create ObjectId _id field if we are doing that
        if (createIdField) {
            addObjectIDIdField(&_doc);
        }
    } else {
        uassertStatusOK(status);
    }

    // See if the changes were applied in place
    const char* source = NULL;
    const bool inPlace = _doc.getInPlaceUpdates(&_damages, &source);

    if (inPlace && _damages.empty()) {
        // An interesting edge case. A modifier didn't notice that it was really a no-op
        // during its 'prepare' phase. That represents a missed optimization, but we still
        // shouldn't do any real work. Toggle 'docWasModified' to 'false'.
        //
        // Currently, an example of this is '{ $push : { x : {$each: [], $sort: 1} } }' when the 'x'
        // array exists and is already sorted.
        docWasModified = false;
    }

    if (docWasModified) {

        // Prepare to write back the modified document
        WriteUnitOfWork wunit(getOpCtx());

        RecordId newRecordId;
        OplogUpdateEntryArgs args;
        if (!request->isExplain()) {
            invariant(_collection);
            auto* css = CollectionShardingState::get(getOpCtx(), _collection->ns());
            args.nss = _collection->ns();
            args.uuid = _collection->uuid();
            args.stmtId = request->getStmtId();
            args.update = logObj;
            args.criteria = css->getMetadata().extractDocumentKey(newObj);
            uassert(16980,
                    "Multi-update operations require all documents to have an '_id' field",
                    !request->isMulti() || args.criteria.hasField("_id"_sd));
            args.fromMigrate = request->isFromMigration();
            args.storeDocOption = getStoreDocMode(*request);
            if (args.storeDocOption == OplogUpdateEntryArgs::StoreDocOption::PreImage) {
                args.preImageDoc = oldObj.value().getOwned();
            }
        }

        if (inPlace) {
            if (!request->isExplain()) {
                newObj = oldObj.value();
                const RecordData oldRec(oldObj.value().objdata(), oldObj.value().objsize());

                Snapshotted<RecordData> snap(oldObj.snapshotId(), oldRec);

                StatusWith<RecordData> newRecStatus = _collection->updateDocumentWithDamages(
                    getOpCtx(), recordId, std::move(snap), source, _damages, &args);

                newObj = uassertStatusOK(std::move(newRecStatus)).releaseToBson();
            }

            newRecordId = recordId;
        } else {
            // The updates were not in place. Apply them through the file manager.

            newObj = _doc.getObject();
            uassert(17419,
                    str::stream() << "Resulting document after update is larger than "
                                  << BSONObjMaxUserSize,
                    newObj.objsize() <= BSONObjMaxUserSize);

            if (!request->isExplain()) {
                newRecordId = _collection->updateDocument(getOpCtx(),
                                                          recordId,
                                                          oldObj,
                                                          newObj,
                                                          true,
                                                          driver->modsAffectIndices(),
                                                          _params.opDebug,
                                                          &args);
            }
        }

        invariant(oldObj.snapshotId() == getOpCtx()->recoveryUnit()->getSnapshotId());
        wunit.commit();

        // If the document moved, we might see it again in a collection scan (maybe it's
        // a document after our current document).
        //
        // If the document is indexed and the mod changes an indexed value, we might see
        // it again.  For an example, see the comment above near declaration of
        // updatedRecordIds.
        //
        // This must be done after the wunit commits so we are sure we won't be rolling back.
        if (_updatedRecordIds && (newRecordId != recordId || driver->modsAffectIndices())) {
            _updatedRecordIds->insert(newRecordId);
        }
    }

    // Only record doc modifications if they wrote (exclude no-ops). Explains get
    // recorded as if they wrote.
    if (docWasModified || request->isExplain()) {
        _specificStats.nModified++;
    }

    return newObj;
}
Exemple #11
0
Status cloneCollectionAsCapped(OperationContext* txn,
                               Database* db,
                               const std::string& shortFrom,
                               const std::string& shortTo,
                               double size,
                               bool temp) {
    std::string fromNs = db->name() + "." + shortFrom;
    std::string toNs = db->name() + "." + shortTo;

    Collection* fromCollection = db->getCollection(fromNs);
    if (!fromCollection)
        return Status(ErrorCodes::NamespaceNotFound,
                      str::stream() << "source collection " << fromNs << " does not exist");

    if (db->getCollection(toNs))
        return Status(ErrorCodes::NamespaceExists, "to collection already exists");

    // create new collection
    MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
        const auto fromOptions =
            fromCollection->getCatalogEntry()->getCollectionOptions(txn).toBSON();
        OldClientContext ctx(txn, toNs);
        BSONObjBuilder spec;
        spec.appendBool("capped", true);
        spec.append("size", size);
        if (temp)
            spec.appendBool("temp", true);
        spec.appendElementsUnique(fromOptions);

        WriteUnitOfWork wunit(txn);
        Status status = userCreateNS(txn, ctx.db(), toNs, spec.done());
        if (!status.isOK())
            return status;
        wunit.commit();
    }
    MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "cloneCollectionAsCapped", fromNs);

    Collection* toCollection = db->getCollection(toNs);
    invariant(toCollection);  // we created above

    // how much data to ignore because it won't fit anyway
    // datasize and extentSize can't be compared exactly, so add some padding to 'size'

    long long allocatedSpaceGuess =
        std::max(static_cast<long long>(size * 2),
                 static_cast<long long>(toCollection->getRecordStore()->storageSize(txn) * 2));

    long long excessSize = fromCollection->dataSize(txn) - allocatedSpaceGuess;

    std::unique_ptr<PlanExecutor> exec(
        InternalPlanner::collectionScan(txn, fromNs, fromCollection, InternalPlanner::FORWARD));

    exec->setYieldPolicy(PlanExecutor::WRITE_CONFLICT_RETRY_ONLY);

    Snapshotted<BSONObj> objToClone;
    RecordId loc;
    PlanExecutor::ExecState state = PlanExecutor::FAILURE;  // suppress uninitialized warnings

    DisableDocumentValidation validationDisabler(txn);

    int retries = 0;  // non-zero when retrying our last document.
    while (true) {
        if (!retries) {
            state = exec->getNextSnapshotted(&objToClone, &loc);
        }

        switch (state) {
            case PlanExecutor::IS_EOF:
                return Status::OK();
            case PlanExecutor::ADVANCED: {
                if (excessSize > 0) {
                    // 4x is for padding, power of 2, etc...
                    excessSize -= (4 * objToClone.value().objsize());
                    continue;
                }
                break;
            }
            default:
                // Unreachable as:
                // 1) We require a read lock (at a minimum) on the "from" collection
                //    and won't yield, preventing collection drop and PlanExecutor::DEAD
                // 2) PlanExecutor::FAILURE is only returned on PlanStage::FAILURE. The
                //    CollectionScan PlanStage does not have a FAILURE scenario.
                // 3) All other PlanExecutor states are handled above
                invariant(false);
        }

        try {
            // Make sure we are working with the latest version of the document.
            if (objToClone.snapshotId() != txn->recoveryUnit()->getSnapshotId() &&
                !fromCollection->findDoc(txn, loc, &objToClone)) {
                // doc was deleted so don't clone it.
                retries = 0;
                continue;
            }

            WriteUnitOfWork wunit(txn);
            toCollection->insertDocument(txn, objToClone.value(), true, txn->writesAreReplicated());
            wunit.commit();

            // Go to the next document
            retries = 0;
        } catch (const WriteConflictException& wce) {
            CurOp::get(txn)->debug().writeConflicts++;
            retries++;  // logAndBackoff expects this to be 1 on first call.
            wce.logAndBackoff(retries, "cloneCollectionAsCapped", fromNs);

            // Can't use WRITE_CONFLICT_RETRY_LOOP macros since we need to save/restore exec
            // around call to abandonSnapshot.
            exec->saveState();
            txn->recoveryUnit()->abandonSnapshot();
            exec->restoreState(txn);  // Handles any WCEs internally.
        }
    }

    invariant(false);  // unreachable
}
Exemple #12
0
mongo::Status mongo::cloneCollectionAsCapped(OperationContext* opCtx,
                                             Database* db,
                                             const std::string& shortFrom,
                                             const std::string& shortTo,
                                             long long size,
                                             bool temp) {
    NamespaceString fromNss(db->name(), shortFrom);
    NamespaceString toNss(db->name(), shortTo);

    Collection* fromCollection = db->getCollection(opCtx, fromNss);
    if (!fromCollection) {
        if (db->getViewCatalog()->lookup(opCtx, fromNss.ns())) {
            return Status(ErrorCodes::CommandNotSupportedOnView,
                          str::stream() << "cloneCollectionAsCapped not supported for views: "
                                        << fromNss.ns());
        }
        return Status(ErrorCodes::NamespaceNotFound,
                      str::stream() << "source collection " << fromNss.ns() << " does not exist");
    }

    if (fromNss.isDropPendingNamespace()) {
        return Status(ErrorCodes::NamespaceNotFound,
                      str::stream() << "source collection " << fromNss.ns()
                                    << " is currently in a drop-pending state.");
    }

    if (db->getCollection(opCtx, toNss)) {
        return Status(ErrorCodes::NamespaceExists,
                      str::stream() << "cloneCollectionAsCapped failed - destination collection "
                                    << toNss.ns()
                                    << " already exists. source collection: "
                                    << fromNss.ns());
    }

    // create new collection
    {
        auto options = fromCollection->getCatalogEntry()->getCollectionOptions(opCtx);
        // The capped collection will get its own new unique id, as the conversion isn't reversible,
        // so it can't be rolled back.
        options.uuid.reset();
        options.capped = true;
        options.cappedSize = size;
        if (temp)
            options.temp = true;

        BSONObjBuilder cmd;
        cmd.append("create", toNss.coll());
        cmd.appendElements(options.toBSON());
        Status status = createCollection(opCtx, toNss.db().toString(), cmd.done());
        if (!status.isOK())
            return status;
    }

    Collection* toCollection = db->getCollection(opCtx, toNss);
    invariant(toCollection);  // we created above

    // how much data to ignore because it won't fit anyway
    // datasize and extentSize can't be compared exactly, so add some padding to 'size'

    long long allocatedSpaceGuess =
        std::max(static_cast<long long>(size * 2),
                 static_cast<long long>(toCollection->getRecordStore()->storageSize(opCtx) * 2));

    long long excessSize = fromCollection->dataSize(opCtx) - allocatedSpaceGuess;

    auto exec = InternalPlanner::collectionScan(opCtx,
                                                fromNss.ns(),
                                                fromCollection,
                                                PlanExecutor::WRITE_CONFLICT_RETRY_ONLY,
                                                InternalPlanner::FORWARD);

    Snapshotted<BSONObj> objToClone;
    RecordId loc;
    PlanExecutor::ExecState state = PlanExecutor::FAILURE;  // suppress uninitialized warnings

    DisableDocumentValidation validationDisabler(opCtx);

    int retries = 0;  // non-zero when retrying our last document.
    while (true) {
        if (!retries) {
            state = exec->getNextSnapshotted(&objToClone, &loc);
        }

        switch (state) {
            case PlanExecutor::IS_EOF:
                return Status::OK();
            case PlanExecutor::ADVANCED: {
                if (excessSize > 0) {
                    // 4x is for padding, power of 2, etc...
                    excessSize -= (4 * objToClone.value().objsize());
                    continue;
                }
                break;
            }
            default:
                // Unreachable as:
                // 1) We require a read lock (at a minimum) on the "from" collection
                //    and won't yield, preventing collection drop and PlanExecutor::DEAD
                // 2) PlanExecutor::FAILURE is only returned on PlanStage::FAILURE. The
                //    CollectionScan PlanStage does not have a FAILURE scenario.
                // 3) All other PlanExecutor states are handled above
                MONGO_UNREACHABLE;
        }

        try {
            // Make sure we are working with the latest version of the document.
            if (objToClone.snapshotId() != opCtx->recoveryUnit()->getSnapshotId() &&
                !fromCollection->findDoc(opCtx, loc, &objToClone)) {
                // doc was deleted so don't clone it.
                retries = 0;
                continue;
            }

            WriteUnitOfWork wunit(opCtx);
            OpDebug* const nullOpDebug = nullptr;
            uassertStatusOK(toCollection->insertDocument(
                opCtx, InsertStatement(objToClone.value()), nullOpDebug, true));
            wunit.commit();

            // Go to the next document
            retries = 0;
        } catch (const WriteConflictException&) {
            CurOp::get(opCtx)->debug().additiveMetrics.incrementWriteConflicts(1);
            retries++;  // logAndBackoff expects this to be 1 on first call.
            WriteConflictException::logAndBackoff(retries, "cloneCollectionAsCapped", fromNss.ns());

            // Can't use writeConflictRetry since we need to save/restore exec around call to
            // abandonSnapshot.
            exec->saveState();
            opCtx->recoveryUnit()->abandonSnapshot();
            auto restoreStatus = exec->restoreState();  // Handles any WCEs internally.
            if (!restoreStatus.isOK()) {
                return restoreStatus;
            }
        }
    }

    MONGO_UNREACHABLE;
}
Exemple #13
0
    void run() {
        OldClientWriteContext ctx(&_txn, ns());
        Database* db = ctx.db();
        Collection* coll = db->getCollection(ns());
        if (!coll) {
            WriteUnitOfWork wuow(&_txn);
            coll = db->createCollection(&_txn, ns());
            wuow.commit();
        }

        {
            WriteUnitOfWork wuow(&_txn);
            fillData();
            wuow.commit();
        }

        // The data we're going to later invalidate.
        set<RecordId> locs;
        getLocs(&locs, coll);

        std::unique_ptr<PlanExecutor> exec(makePlanExecutorWithSortStage(coll));
        SortStage* ss = static_cast<SortStage*>(exec->getRootStage());
        QueuedDataStage* ms = static_cast<QueuedDataStage*>(ss->getChildren()[0]);

        // Have sort read in data from the queued data stage.
        const int firstRead = 5;
        for (int i = 0; i < firstRead; ++i) {
            WorkingSetID id = WorkingSet::INVALID_ID;
            PlanStage::StageState status = ss->work(&id);
            ASSERT_NOT_EQUALS(PlanStage::ADVANCED, status);
        }

        // We should have read in the first 'firstRead' locs.  Invalidate the first one.
        // Since it's in the WorkingSet, the updates should not be reflected in the output.
        exec->saveState();
        set<RecordId>::iterator it = locs.begin();
        Snapshotted<BSONObj> oldDoc = coll->docFor(&_txn, *it);

        OID updatedId = oldDoc.value().getField("_id").OID();
        SnapshotId idBeforeUpdate = oldDoc.snapshotId();
        // We purposefully update the document to have a 'foo' value greater than limit().
        // This allows us to check that we don't return the new copy of a doc by asserting
        // foo < limit().
        BSONObj newDoc = BSON("_id" << updatedId << "foo" << limit() + 10);
        oplogUpdateEntryArgs args;
        {
            WriteUnitOfWork wuow(&_txn);
            coll->updateDocument(&_txn, *it, oldDoc, newDoc, false, false, NULL, args);
            wuow.commit();
        }
        exec->restoreState(&_txn);

        // Read the rest of the data from the queued data stage.
        while (!ms->isEOF()) {
            WorkingSetID id = WorkingSet::INVALID_ID;
            ss->work(&id);
        }

        // Let's just invalidate everything now. Already read into ss, so original values
        // should be fetched.
        exec->saveState();
        while (it != locs.end()) {
            oldDoc = coll->docFor(&_txn, *it);
            {
                WriteUnitOfWork wuow(&_txn);
                coll->updateDocument(&_txn, *it++, oldDoc, newDoc, false, false, NULL, args);
                wuow.commit();
            }
        }
        exec->restoreState(&_txn);

        // Verify that it's sorted, the right number of documents are returned, and they're all
        // in the expected range.
        int count = 0;
        int lastVal = 0;
        int thisVal;
        while (!ss->isEOF()) {
            WorkingSetID id = WorkingSet::INVALID_ID;
            PlanStage::StageState status = ss->work(&id);
            if (PlanStage::ADVANCED != status) {
                ASSERT_NE(status, PlanStage::FAILURE);
                ASSERT_NE(status, PlanStage::DEAD);
                continue;
            }
            WorkingSetMember* member = exec->getWorkingSet()->get(id);
            ASSERT(member->hasObj());
            if (member->obj.value().getField("_id").OID() == updatedId) {
                ASSERT(idBeforeUpdate == member->obj.snapshotId());
            }
            thisVal = member->obj.value().getField("foo").Int();
            ASSERT_LTE(lastVal, thisVal);
            // Expect docs in range [0, limit)
            ASSERT_LTE(0, thisVal);
            ASSERT_LT(thisVal, limit());
            lastVal = thisVal;
            ++count;
        }
        // Returns all docs.
        ASSERT_EQUALS(limit(), count);
    }
Exemple #14
0
        static bool runImpl(OperationContext* txn,
                            const string& dbname,
                            const string& ns,
                            const BSONObj& query,
                            const BSONObj& fields,
                            const BSONObj& update,
                            const BSONObj& sort,
                            bool upsert,
                            bool returnNew,
                            bool remove ,
                            BSONObjBuilder& result,
                            string& errmsg) {

            AutoGetOrCreateDb autoDb(txn, dbname, MODE_IX);
            Lock::CollectionLock collLock(txn->lockState(), ns, MODE_IX);
            Client::Context ctx(txn, ns, autoDb.getDb(), autoDb.justCreated());

            if (!repl::getGlobalReplicationCoordinator()->canAcceptWritesForDatabase(dbname)) {
                return appendCommandStatus(result, Status(ErrorCodes::NotMaster, str::stream()
                    << "Not primary while running findAndModify in " << ns));
            }

            Collection* collection = ctx.db()->getCollection(ns);

            const WhereCallbackReal whereCallback(txn, StringData(ns));

            if ( !collection ) {
                if ( !upsert ) {
                    // no collectio and no upsert, so can't possible do anything
                    _appendHelper( result, BSONObj(), false, fields, whereCallback );
                    return true;
                }
                // no collection, but upsert, so we want to create it
                // problem is we only have IX on db and collection :(
                // so we tell our caller who can do it
                errmsg = "no-collection";
                return false;
            }

            Snapshotted<BSONObj> snapshotDoc;
            RecordId loc;
            bool found = false;
            {
                CanonicalQuery* cq;
                const BSONObj projection;
                const long long skip = 0;
                const long long limit = -1; // 1 document requested; negative indicates hard limit.
                uassertStatusOK(CanonicalQuery::canonicalize(ns,
                                                             query,
                                                             sort,
                                                             projection,
                                                             skip,
                                                             limit,
                                                             &cq,
                                                             whereCallback));

                PlanExecutor* rawExec;
                uassertStatusOK(getExecutor(txn,
                                            collection,
                                            cq,
                                            PlanExecutor::YIELD_AUTO,
                                            &rawExec,
                                            QueryPlannerParams::DEFAULT));

                scoped_ptr<PlanExecutor> exec(rawExec);

                PlanExecutor::ExecState state = exec->getNextSnapshotted(&snapshotDoc, &loc);
                if (PlanExecutor::ADVANCED == state) {
                    found = true;
                }
                else if (PlanExecutor::FAILURE == state || PlanExecutor::DEAD == state) {
                    if (PlanExecutor::FAILURE == state &&
                        WorkingSetCommon::isValidStatusMemberObject(snapshotDoc.value())) {
                        const Status errorStatus =
                            WorkingSetCommon::getMemberObjectStatus(snapshotDoc.value());
                        invariant(!errorStatus.isOK());
                        uasserted(errorStatus.code(), errorStatus.reason());
                    }
                    uasserted(ErrorCodes::OperationFailed,
                              str::stream() << "executor returned " << PlanExecutor::statestr(state)
                                            << " while finding document to update");
                }
                else {
                    invariant(PlanExecutor::IS_EOF == state);
                }
            }

            WriteUnitOfWork wuow(txn);
            if (found) {
                // We found a doc, but it might not be associated with the active snapshot.
                // If the doc has changed or is no longer in the collection, we will throw a
                // write conflict exception and start again from the beginning.
                if (txn->recoveryUnit()->getSnapshotId() != snapshotDoc.snapshotId()) {
                    BSONObj oldObj = snapshotDoc.value();
                    if (!collection->findDoc(txn, loc, &snapshotDoc)) {
                        // Got deleted in the new snapshot.
                        throw WriteConflictException();
                    }

                    if (!oldObj.binaryEqual(snapshotDoc.value())) {
                        // Got updated in the new snapshot.
                        throw WriteConflictException();
                    }
                }

                // If we get here without throwing, then we should have the copy of the doc from
                // the latest snapshot.
                invariant(txn->recoveryUnit()->getSnapshotId() == snapshotDoc.snapshotId());
            }

            BSONObj doc = snapshotDoc.value();
            BSONObj queryModified = query;
            if (found && !doc["_id"].eoo() && !CanonicalQuery::isSimpleIdQuery(query)) {
                // we're going to re-write the query to be more efficient
                // we have to be a little careful because of positional operators
                // maybe we can pass this all through eventually, but right now isn't an easy way
                
                bool hasPositionalUpdate = false;
                {
                    // if the update has a positional piece ($)
                    // then we need to pull all query parts in
                    // so here we check for $
                    // a little hacky
                    BSONObjIterator i( update );
                    while ( i.more() ) {
                        const BSONElement& elem = i.next();
                        
                        if ( elem.fieldName()[0] != '$' || elem.type() != Object )
                            continue;

                        BSONObjIterator j( elem.Obj() );
                        while ( j.more() ) {
                            if ( str::contains( j.next().fieldName(), ".$" ) ) {
                                hasPositionalUpdate = true;
                                break;
                            }
                        }
                    }
                }

                BSONObjBuilder b(query.objsize() + 10);
                b.append( doc["_id"] );
                
                bool addedAtomic = false;

                BSONObjIterator i(query);
                while ( i.more() ) {
                    const BSONElement& elem = i.next();

                    if ( str::equals( "_id" , elem.fieldName() ) ) {
                        // we already do _id
                        continue;
                    }
                    
                    if ( ! hasPositionalUpdate ) {
                        // if there is a dotted field, accept we may need more query parts
                        continue;
                    }
                    
                    if ( ! addedAtomic ) {
                        b.appendBool( "$atomic" , true );
                        addedAtomic = true;
                    }

                    b.append( elem );
                }

                queryModified = b.obj();
            }

            if ( remove ) {
                _appendHelper(result, doc, found, fields, whereCallback);
                if ( found ) {
                    deleteObjects(txn, ctx.db(), ns, queryModified, PlanExecutor::YIELD_MANUAL,
                                  true, true);
                    BSONObjBuilder le( result.subobjStart( "lastErrorObject" ) );
                    le.appendNumber( "n" , 1 );
                    le.done();
                }
            }
            else {
                // update
                if ( ! found && ! upsert ) {
                    // didn't have it, and am not upserting
                    _appendHelper(result, doc, found, fields, whereCallback);
                }
                else {
                    // we found it or we're updating
                    
                    if ( ! returnNew ) {
                        _appendHelper(result, doc, found, fields, whereCallback);
                    }
                    
                    const NamespaceString requestNs(ns);
                    UpdateRequest request(requestNs);

                    request.setQuery(queryModified);
                    request.setUpdates(update);
                    request.setUpsert(upsert);
                    request.setUpdateOpLog();
                    request.setStoreResultDoc(returnNew);

                    request.setYieldPolicy(PlanExecutor::YIELD_MANUAL);

                    // TODO(greg) We need to send if we are ignoring
                    // the shard version below, but for now no
                    UpdateLifecycleImpl updateLifecycle(false, requestNs);
                    request.setLifecycle(&updateLifecycle);
                    UpdateResult res = mongo::update(txn,
                                                     ctx.db(),
                                                     request,
                                                     &txn->getCurOp()->debug());

                    if (!found && res.existing) {
                        // No match was found during the read part of this find and modify, which
                        // means that we're here doing an upsert. But the update also told us that
                        // we modified an *already existing* document. This probably means that
                        // the query reported EOF based on an out-of-date snapshot. This should be
                        // a rare event, so we handle it by throwing a write conflict.
                        throw WriteConflictException();
                    }

                    if ( !collection ) {
                        // collection created by an upsert
                        collection = ctx.db()->getCollection(ns);
                    }

                    LOG(3) << "update result: "  << res ;
                    if (returnNew) {
                        dassert(!res.newObj.isEmpty());
                        _appendHelper(result, res.newObj, true, fields, whereCallback);
                    }

                    BSONObjBuilder le( result.subobjStart( "lastErrorObject" ) );
                    le.appendBool( "updatedExisting" , res.existing );
                    le.appendNumber( "n" , res.numMatched );
                    if ( !res.upserted.isEmpty() ) {
                        le.append( res.upserted[kUpsertedFieldName] );
                    }
                    le.done();
                }
            }

            // Committing the WUOW can close the current snapshot. Until this happens, the
            // snapshot id should not have changed.
            if (found) {
                invariant(txn->recoveryUnit()->getSnapshotId() == snapshotDoc.snapshotId());
            }
            wuow.commit();

            return true;
        }
    virtual bool run(OperationContext* txn,
                     const string& dbname,
                     BSONObj& cmdObj,
                     int,
                     string& errmsg,
                     BSONObjBuilder& result,
                     bool fromRepl) {
        ScopedTransaction transaction(txn, MODE_X);
        Lock::GlobalWrite globalWriteLock(txn->lockState());
        string source = cmdObj.getStringField(name.c_str());
        string target = cmdObj.getStringField("to");

        if (!fromRepl &&
            !repl::getGlobalReplicationCoordinator()->canAcceptWritesForDatabase(dbname)) {
            return appendCommandStatus(result,
                                       Status(ErrorCodes::NotMaster,
                                              str::stream()
                                                  << "Not primary while renaming collection "
                                                  << source << " to " << target));
        }

        // We stay in source context the whole time. This is mostly to set the CurOp namespace.
        Client::Context ctx(txn, source);

        if (!NamespaceString::validCollectionComponent(target.c_str())) {
            errmsg = "invalid collection name: " + target;
            return false;
        }
        if (source.empty() || target.empty()) {
            errmsg = "invalid command syntax";
            return false;
        }

        if ((repl::getGlobalReplicationCoordinator()->getReplicationMode() !=
             repl::ReplicationCoordinator::modeNone)) {
            if (NamespaceString(source).isOplog()) {
                errmsg = "can't rename live oplog while replicating";
                return false;
            }
            if (NamespaceString(target).isOplog()) {
                errmsg = "can't rename to live oplog while replicating";
                return false;
            }
        }

        if (NamespaceString::oplog(source) != NamespaceString::oplog(target)) {
            errmsg = "If either the source or target of a rename is an oplog name, both must be";
            return false;
        }

        if (!fromRepl) {  // If it got through on the master, need to allow it here too
            Status sourceStatus = userAllowedWriteNS(source);
            if (!sourceStatus.isOK()) {
                errmsg = "error with source namespace: " + sourceStatus.reason();
                return false;
            }

            Status targetStatus = userAllowedWriteNS(target);
            if (!targetStatus.isOK()) {
                errmsg = "error with target namespace: " + targetStatus.reason();
                return false;
            }
        }

        if (NamespaceString(source).coll() == "system.indexes" ||
            NamespaceString(target).coll() == "system.indexes") {
            errmsg = "renaming system.indexes is not allowed";
            return false;
        }

        Database* const sourceDB = dbHolder().get(txn, nsToDatabase(source));
        Collection* const sourceColl = sourceDB ? sourceDB->getCollection(source) : NULL;
        if (!sourceColl) {
            errmsg = "source namespace does not exist";
            return false;
        }

        {
            // Ensure that collection name does not exceed maximum length.
            // Ensure that index names do not push the length over the max.
            // Iterator includes unfinished indexes.
            IndexCatalog::IndexIterator sourceIndIt =
                sourceColl->getIndexCatalog()->getIndexIterator(txn, true);
            int longestIndexNameLength = 0;
            while (sourceIndIt.more()) {
                int thisLength = sourceIndIt.next()->indexName().length();
                if (thisLength > longestIndexNameLength)
                    longestIndexNameLength = thisLength;
            }

            unsigned int longestAllowed =
                min(int(NamespaceString::MaxNsCollectionLen),
                    int(NamespaceString::MaxNsLen) - 2 /*strlen(".$")*/ - longestIndexNameLength);
            if (target.size() > longestAllowed) {
                StringBuilder sb;
                sb << "collection name length of " << target.size() << " exceeds maximum length of "
                   << longestAllowed << ", allowing for index names";
                errmsg = sb.str();
                return false;
            }
        }

        BackgroundOperation::assertNoBgOpInProgForNs(source);

        Database* const targetDB = dbHolder().openDb(txn, nsToDatabase(target));

        {
            WriteUnitOfWork wunit(txn);

            // Check if the target namespace exists and if dropTarget is true.
            // If target exists and dropTarget is not true, return false.
            if (targetDB->getCollection(target)) {
                if (!cmdObj["dropTarget"].trueValue()) {
                    errmsg = "target namespace exists";
                    return false;
                }

                Status s = targetDB->dropCollection(txn, target);
                if (!s.isOK()) {
                    errmsg = s.toString();
                    return false;
                }
            }

            // If we are renaming in the same database, just
            // rename the namespace and we're done.
            if (sourceDB == targetDB) {
                Status s =
                    targetDB->renameCollection(txn, source, target, cmdObj["stayTemp"].trueValue());
                if (!s.isOK()) {
                    return appendCommandStatus(result, s);
                }

                if (!fromRepl) {
                    repl::logOp(txn, "c", (dbname + ".$cmd").c_str(), cmdObj);
                }

                wunit.commit();
                return true;
            }

            wunit.commit();
        }

        // If we get here, we are renaming across databases, so we must copy all the data and
        // indexes, then remove the source collection.

        // Create the target collection. It will be removed if we fail to copy the collection.
        // TODO use a temp collection and unset the temp flag on success.
        Collection* targetColl = NULL;
        {
            CollectionOptions options;
            options.setNoIdIndex();

            if (sourceColl->isCapped()) {
                const CollectionOptions sourceOpts =
                    sourceColl->getCatalogEntry()->getCollectionOptions(txn);

                options.capped = true;
                options.cappedSize = sourceOpts.cappedSize;
                options.cappedMaxDocs = sourceOpts.cappedMaxDocs;
            }

            WriteUnitOfWork wunit(txn);

            // No logOp necessary because the entire renameCollection command is one logOp.
            targetColl = targetDB->createCollection(txn, target, options);
            if (!targetColl) {
                errmsg = "Failed to create target collection.";
                return false;
            }

            wunit.commit();
        }

        // Dismissed on success
        ScopeGuard targetCollectionDropper = MakeGuard(dropCollection, txn, targetDB, target);

        MultiIndexBlock indexer(txn, targetColl);
        indexer.allowInterruption();

        // Copy the index descriptions from the source collection, adjusting the ns field.
        {
            std::vector<BSONObj> indexesToCopy;
            IndexCatalog::IndexIterator sourceIndIt =
                sourceColl->getIndexCatalog()->getIndexIterator(txn, true);
            while (sourceIndIt.more()) {
                const BSONObj currIndex = sourceIndIt.next()->infoObj();

                // Process the source index.
                BSONObjBuilder newIndex;
                newIndex.append("ns", target);
                newIndex.appendElementsUnique(currIndex);
                indexesToCopy.push_back(newIndex.obj());
            }
            indexer.init(indexesToCopy);
        }

        {
            // Copy over all the data from source collection to target collection.
            boost::scoped_ptr<RecordIterator> sourceIt(sourceColl->getIterator(txn));
            while (!sourceIt->isEOF()) {
                txn->checkForInterrupt();

                const Snapshotted<BSONObj> obj = sourceColl->docFor(txn, sourceIt->getNext());

                WriteUnitOfWork wunit(txn);
                // No logOp necessary because the entire renameCollection command is one logOp.
                Status status =
                    targetColl->insertDocument(txn, obj.value(), &indexer, true).getStatus();
                if (!status.isOK())
                    return appendCommandStatus(result, status);
                wunit.commit();
            }
        }

        Status status = indexer.doneInserting();
        if (!status.isOK())
            return appendCommandStatus(result, status);

        {
            // Getting here means we successfully built the target copy. We now remove the
            // source collection and finalize the rename.
            WriteUnitOfWork wunit(txn);

            Status status = sourceDB->dropCollection(txn, source);
            if (!status.isOK())
                return appendCommandStatus(result, status);

            indexer.commit();

            if (!fromRepl) {
                repl::logOp(txn, "c", (dbname + ".$cmd").c_str(), cmdObj);
            }

            wunit.commit();
        }

        targetCollectionDropper.Dismiss();
        return true;
    }
Status MultiIndexBlockImpl::insertAllDocumentsInCollection(std::set<RecordId>* dupsOut) {
    const char* curopMessage = _buildInBackground ? "Index Build (background)" : "Index Build";
    const auto numRecords = _collection->numRecords(_opCtx);
    stdx::unique_lock<Client> lk(*_opCtx->getClient());
    ProgressMeterHolder progress(
        CurOp::get(_opCtx)->setMessage_inlock(curopMessage, curopMessage, numRecords));
    lk.unlock();

    Timer t;

    unsigned long long n = 0;

    PlanExecutor::YieldPolicy yieldPolicy;
    if (_buildInBackground) {
        invariant(_allowInterruption);
        yieldPolicy = PlanExecutor::YIELD_AUTO;
    } else {
        yieldPolicy = PlanExecutor::WRITE_CONFLICT_RETRY_ONLY;
    }
    auto exec =
        InternalPlanner::collectionScan(_opCtx, _collection->ns().ns(), _collection, yieldPolicy);

    Snapshotted<BSONObj> objToIndex;
    RecordId loc;
    PlanExecutor::ExecState state;
    int retries = 0;  // non-zero when retrying our last document.
    while (retries ||
           (PlanExecutor::ADVANCED == (state = exec->getNextSnapshotted(&objToIndex, &loc))) ||
           MONGO_FAIL_POINT(hangAfterStartingIndexBuild)) {
        try {
            if (_allowInterruption)
                _opCtx->checkForInterrupt();

            if (!(retries || (PlanExecutor::ADVANCED == state))) {
                // The only reason we are still in the loop is hangAfterStartingIndexBuild.
                log() << "Hanging index build due to 'hangAfterStartingIndexBuild' failpoint";
                invariant(_allowInterruption);
                sleepmillis(1000);
                continue;
            }

            // Make sure we are working with the latest version of the document.
            if (objToIndex.snapshotId() != _opCtx->recoveryUnit()->getSnapshotId() &&
                !_collection->findDoc(_opCtx, loc, &objToIndex)) {
                // doc was deleted so don't index it.
                retries = 0;
                continue;
            }

            // Done before insert so we can retry document if it WCEs.
            progress->setTotalWhileRunning(_collection->numRecords(_opCtx));

            WriteUnitOfWork wunit(_opCtx);
            Status ret = insert(objToIndex.value(), loc);
            if (_buildInBackground)
                exec->saveState();
            if (ret.isOK()) {
                wunit.commit();
            } else if (dupsOut && ret.code() == ErrorCodes::DuplicateKey) {
                // If dupsOut is non-null, we should only fail the specific insert that
                // led to a DuplicateKey rather than the whole index build.
                dupsOut->insert(loc);
            } else {
                // Fail the index build hard.
                return ret;
            }
            if (_buildInBackground) {
                auto restoreStatus = exec->restoreState();  // Handles any WCEs internally.
                if (!restoreStatus.isOK()) {
                    return restoreStatus;
                }
            }

            // Go to the next document
            progress->hit();
            n++;
            retries = 0;
        } catch (const WriteConflictException&) {
            CurOp::get(_opCtx)->debug().writeConflicts++;
            retries++;  // logAndBackoff expects this to be 1 on first call.
            WriteConflictException::logAndBackoff(
                retries, "index creation", _collection->ns().ns());

            // Can't use writeConflictRetry since we need to save/restore exec around call to
            // abandonSnapshot.
            exec->saveState();
            _opCtx->recoveryUnit()->abandonSnapshot();
            auto restoreStatus = exec->restoreState();  // Handles any WCEs internally.
            if (!restoreStatus.isOK()) {
                return restoreStatus;
            }
        }
    }

    uassert(28550,
            "Unable to complete index build due to collection scan failure: " +
                WorkingSetCommon::toStatusString(objToIndex.value()),
            state == PlanExecutor::IS_EOF);

    if (MONGO_FAIL_POINT(hangAfterStartingIndexBuildUnlocked)) {
        // Unlock before hanging so replication recognizes we've completed.
        Locker::LockSnapshot lockInfo;
        _opCtx->lockState()->saveLockStateAndUnlock(&lockInfo);
        while (MONGO_FAIL_POINT(hangAfterStartingIndexBuildUnlocked)) {
            log() << "Hanging index build with no locks due to "
                     "'hangAfterStartingIndexBuildUnlocked' failpoint";
            sleepmillis(1000);
        }
        // If we want to support this, we'd need to regrab the lock and be sure that all callers are
        // ok with us yielding. They should be for BG indexes, but not for foreground.
        invariant(!"the hangAfterStartingIndexBuildUnlocked failpoint can't be turned off");
    }

    progress->finished();

    Status ret = doneInserting(dupsOut);
    if (!ret.isOK())
        return ret;

    log() << "build index done.  scanned " << n << " total records. " << t.seconds() << " secs";

    return Status::OK();
}