/**
 * Removes the document stored at 'loc' from this collection.
 *
 * If the collection is capped and 'cappedOK' is false, the attempt is logged and the
 * operation uasserts with code 10089.
 *
 * When the document carries an '_id' element, a wrapped copy of it is stored into
 * '*deletedId' (if 'deletedId' is non-null) and is also handed to the op observer's
 * onDelete() once the record has been removed. Documents without an '_id' are removed
 * without notifying the op observer.
 *
 * 'noWarn' is forwarded unchanged to the index catalog's unindexRecord().
 */
void Collection::deleteDocument(OperationContext* txn,
                                const RecordId& loc,
                                bool cappedOK,
                                bool noWarn,
                                BSONObj* deletedId) {
    if (isCapped()) {
        if (!cappedOK) {
            log() << "failing remove on a capped ns " << _ns << endl;
            uasserted(10089, "cannot remove from a capped collection");
            return;
        }
    }

    Snapshotted<BSONObj> victim = docFor(txn, loc);

    // Capture the _id (when present) before the record itself goes away.
    BSONElement idElem = victim.value()["_id"];
    BSONObj wrappedId;
    if (idElem.type()) {
        wrappedId = idElem.wrap();
        if (deletedId) {
            *deletedId = idElem.wrap();
        }
    }

    // Let cursors know this location is about to be deleted, then drop the index
    // entries, and finally the record itself.
    _cursorManager.invalidateDocument(txn, loc, INVALIDATION_DELETION);
    _indexCatalog.unindexRecord(txn, victim.value(), loc, noWarn);
    _recordStore->deleteRecord(txn, loc);

    if (wrappedId.isEmpty()) {
        return;
    }
    getGlobalServiceContext()->getOpObserver()->onDelete(txn, ns().ns(), wrappedId);
}
/**
 * Convenience wrapper around getNextImpl() for callers that want a plain BSONObj
 * rather than a Snapshotted<BSONObj>.
 *
 * 'objOut' may be null, in which case no object is requested from getNextImpl().
 * 'dlOut' is forwarded unchanged. Returns whatever state getNextImpl() returned.
 */
PlanExecutor::ExecState PlanExecutor::getNext(BSONObj* objOut, RecordId* dlOut) {
    Snapshotted<BSONObj> fetched;

    // Only ask for the object when the caller actually wants it.
    Snapshotted<BSONObj>* const sink = objOut ? &fetched : NULL;
    ExecState result = getNextImpl(sink, dlOut);

    if (objOut) {
        *objOut = fetched.value();
    }
    return result;
}
/**
 * Removes the document stored at 'loc' from this collection and reports the deletion
 * to the global op observer.
 *
 * If the collection is capped and 'cappedOK' is false, the attempt is logged and the
 * operation uasserts with code 10089.
 *
 * The op observer is consulted twice: aboutToDelete() runs while the document is still
 * readable and yields a DeleteState, which is then moved into onDelete() together with
 * 'fromMigrate' after the record has been removed.
 *
 * 'noWarn' is forwarded unchanged to the index catalog's unindexRecord().
 */
void Collection::deleteDocument(OperationContext* txn,
                                const RecordId& loc,
                                bool fromMigrate,
                                bool cappedOK,
                                bool noWarn) {
    if (isCapped()) {
        if (!cappedOK) {
            log() << "failing remove on a capped ns " << _ns << endl;
            uasserted(10089, "cannot remove from a capped collection");
            return;
        }
    }

    Snapshotted<BSONObj> victim = docFor(txn, loc);

    // Give the observer a look at the document before it disappears.
    auto observer = getGlobalServiceContext()->getOpObserver();
    OpObserver::DeleteState pendingDelete = observer->aboutToDelete(txn, ns(), victim.value());

    // Let cursors know this location is about to be deleted, then drop the index
    // entries, and finally the record itself.
    _cursorManager.invalidateDocument(txn, loc, INVALIDATION_DELETION);
    _indexCatalog.unindexRecord(txn, victim.value(), loc, noWarn);
    _recordStore->deleteRecord(txn, loc);

    observer->onDelete(txn, ns(), std::move(pendingDelete), fromMigrate);
}
/**
 * Applies a set of in-place byte-level changes ('damages', sourced from 'damageSource')
 * to the record at 'loc' through the record store.
 *
 * Preconditions checked here: the collection is locked in MODE_IX (debug builds only),
 * 'oldRec' belongs to the operation's current snapshot, and
 * updateWithDamagesSupported() is true.
 *
 * On success 'args.ns' is set to this collection's namespace and the op observer's
 * onUpdate() is invoked. On failure the op observer is not notified.
 *
 * Returns the status/record produced by the record store's updateWithDamages().
 */
StatusWith<RecordData> Collection::updateDocumentWithDamages(
    OperationContext* txn,
    const RecordId& loc,
    const Snapshotted<RecordData>& oldRec,
    const char* damageSource,
    const mutablebson::DamageVector& damages,
    oplogUpdateEntryArgs& args) {
    dassert(txn->lockState()->isCollectionLockedForMode(ns().toString(), MODE_IX));
    invariant(oldRec.snapshotId() == txn->recoveryUnit()->getSnapshotId());
    invariant(updateWithDamagesSupported());

    // Cursors must hear about the mutation so that query results stay correct.
    _cursorManager.invalidateDocument(txn, loc, INVALIDATION_MUTATION);

    auto damagedRecord =
        _recordStore->updateWithDamages(txn, loc, oldRec.value(), damageSource, damages);
    if (!damagedRecord.isOK()) {
        return damagedRecord;
    }

    args.ns = ns().ns();
    getGlobalServiceContext()->getOpObserver()->onUpdate(txn, args);
    return damagedRecord;
}
/**
 * Appends the next batch of documents to be cloned to 'arrBuilder'.
 *
 * Walks '_cloneLocs' in order, looking each RecordId up in 'collection'. Locations whose
 * document can no longer be found are skipped. The walk stops early when either the
 * elapsed tracker reports that its interval has passed or appending the next document
 * would push the builder past BSONObjMaxUserSize (with 1024 bytes of slack); both
 * checks only apply once at least one document is already in the builder.
 *
 * Every location that was visited (appended or skipped) is removed from '_cloneLocs'.
 * Always returns Status::OK().
 */
Status MigrationChunkClonerSourceLegacy::nextCloneBatch(OperationContext* txn,
                                                        Collection* collection,
                                                        BSONArrayBuilder* arrBuilder) {
    dassert(txn->lockState()->isCollectionLockedForMode(_args.getNss().ns(), MODE_IS));

    ElapsedTracker yieldTracker(txn->getServiceContext()->getFastClockSource(),
                                internalQueryExecYieldIterations,
                                Milliseconds(internalQueryExecYieldPeriodMS.load()));

    stdx::lock_guard<stdx::mutex> lk(_mutex);

    std::set<RecordId>::iterator cursor = _cloneLocs.begin();
    while (cursor != _cloneLocs.end()) {
        // An empty result tells the caller there is no more initial clone data, so never
        // bail out for time reasons before at least one document has been produced.
        if (arrBuilder->arrSize() && yieldTracker.intervalHasElapsed()) {
            break;
        }

        Snapshotted<BSONObj> doc;
        if (!collection->findDoc(txn, *cursor, &doc)) {
            ++cursor;
            continue;
        }

        // Measure against the builder's own length rather than a running sum of document
        // sizes, so the overhead of the BSONArray indices is accounted for.
        if (arrBuilder->arrSize() &&
            (arrBuilder->len() + doc.value().objsize() + 1024) > BSONObjMaxUserSize) {
            break;
        }

        arrBuilder->append(doc.value());
        ++cursor;
    }

    // Drop everything that was consumed in this call.
    _cloneLocs.erase(_cloneLocs.begin(), cursor);

    return Status::OK();
}
/**
 * Scans every document of '_collection' and feeds it to insert(), then finishes the
 * build by calling doneInserting().
 *
 * 'dupsOut' may be null. When it is non-null, a DuplicateKey error from insert() does
 * not abort the build; the offending RecordId is recorded in '*dupsOut' instead. Any
 * other insert() failure is returned immediately.
 *
 * A WriteConflictException thrown while processing a document causes that same document
 * to be retried after a backoff. The function uasserts (28550) if the collection scan
 * ends in any state other than IS_EOF.
 *
 * Returns Status::OK() on success, or the first failing status from insert() or
 * doneInserting().
 */
Status MultiIndexBlock::insertAllDocumentsInCollection(std::set<RecordId>* dupsOut) {
    const char* curopMessage = _buildInBackground ? "Index Build (background)" : "Index Build";
    const auto numRecords = _collection->numRecords(_txn);
    // The client is locked only for as long as it takes to install the progress message.
    stdx::unique_lock<Client> lk(*_txn->getClient());
    ProgressMeterHolder progress(*_txn->setMessage_inlock(curopMessage, curopMessage, numRecords));
    lk.unlock();

    Timer t;

    // Number of documents successfully processed; reported in the final log line.
    unsigned long long n = 0;

    unique_ptr<PlanExecutor> exec(InternalPlanner::collectionScan(
        _txn, _collection->ns().ns(), _collection, PlanExecutor::YIELD_MANUAL));
    // Background builds yield automatically; foreground builds only retry on write conflicts.
    if (_buildInBackground) {
        invariant(_allowInterruption);
        exec->setYieldPolicy(PlanExecutor::YIELD_AUTO);
    } else {
        exec->setYieldPolicy(PlanExecutor::WRITE_CONFLICT_RETRY_ONLY);
    }

    Snapshotted<BSONObj> objToIndex;
    RecordId loc;
    PlanExecutor::ExecState state;
    int retries = 0;  // non-zero when retrying our last document.
    // While retrying, the short-circuit on 'retries' keeps the executor from advancing, so
    // 'objToIndex' and 'loc' still refer to the document that hit the write conflict.
    while (retries ||
           (PlanExecutor::ADVANCED == (state = exec->getNextSnapshotted(&objToIndex, &loc)))) {
        try {
            if (_allowInterruption)
                _txn->checkForInterrupt();

            // Make sure we are working with the latest version of the document.
            if (objToIndex.snapshotId() != _txn->recoveryUnit()->getSnapshotId() &&
                !_collection->findDoc(_txn, loc, &objToIndex)) {
                // doc was deleted so don't index it.
                retries = 0;
                continue;
            }

            // Done before insert so we can retry document if it WCEs.
            progress->setTotalWhileRunning(_collection->numRecords(_txn));

            WriteUnitOfWork wunit(_txn);
            Status ret = insert(objToIndex.value(), loc);
            if (_buildInBackground)
                exec->saveState();
            if (ret.isOK()) {
                wunit.commit();
            } else if (dupsOut && ret.code() == ErrorCodes::DuplicateKey) {
                // If dupsOut is non-null, we should only fail the specific insert that
                // led to a DuplicateKey rather than the whole index build.
                dupsOut->insert(loc);
            } else {
                // Fail the index build hard.
                return ret;
            }
            if (_buildInBackground)
                exec->restoreState();  // Handles any WCEs internally.

            // Go to the next document
            progress->hit();
            n++;
            retries = 0;
        } catch (const WriteConflictException& wce) {
            CurOp::get(_txn)->debug().writeConflicts++;
            retries++;  // logAndBackoff expects this to be 1 on first call.
            wce.logAndBackoff(retries, "index creation", _collection->ns().ns());

            // Can't use WRITE_CONFLICT_RETRY_LOOP macros since we need to save/restore exec
            // around call to abandonSnapshot.
            exec->saveState();
            _txn->recoveryUnit()->abandonSnapshot();
            exec->restoreState();  // Handles any WCEs internally.
        }
    }

    // The loop above exits only when the executor stops returning ADVANCED; anything other
    // than a clean EOF is treated as a scan failure.
    uassert(28550,
            "Unable to complete index build due to collection scan failure: " +
                WorkingSetCommon::toStatusString(objToIndex.value()),
            state == PlanExecutor::IS_EOF);

    progress->finished();

    Status ret = doneInserting(dupsOut);
    if (!ret.isOK())
        return ret;

    log() << "build index done. scanned " << n << " total records. " << t.seconds() << " secs"
          << endl;

    return Status::OK();
}
/**
 * Replaces the document at 'oldLocation' (whose current contents are 'oldDoc') with
 * 'newDoc', keeping indexes in step and notifying the op observer.
 *
 * Steps, in order:
 *   1. Validate 'newDoc'. A failure is returned immediately when the validation level is
 *      STRICT_V; otherwise it is returned only if 'oldDoc' itself passes validation.
 *   2. Reject a change of '_id' (InternalError, 13596) and, for capped record stores, any
 *      change in document size (CannotGrowDocumentInCappedNamespace).
 *   3. If 'indexesAffected', pre-validate the update against every index and keep one
 *      UpdateTicket per index.
 *   4. Write the new document through the record store.
 *   5. If the record moved, index the document at its new location; otherwise apply the
 *      UpdateTickets from step 3.
 *   6. Set 'args.ns' and call the op observer's onUpdate().
 *
 * 'debug' may be null; when non-null its 'nmoved' / 'keyUpdates' counters are updated.
 * 'enforceQuota' is passed through _enforceQuota() to the record store.
 *
 * Returns the (possibly new) RecordId on success, or the first failing status.
 */
StatusWith<RecordId> Collection::updateDocument(OperationContext* txn,
                                                const RecordId& oldLocation,
                                                const Snapshotted<BSONObj>& oldDoc,
                                                const BSONObj& newDoc,
                                                bool enforceQuota,
                                                bool indexesAffected,
                                                OpDebug* debug,
                                                oplogUpdateEntryArgs& args) {
    {
        auto status = checkValidation(txn, newDoc);
        if (!status.isOK()) {
            if (_validationLevel == STRICT_V) {
                return status;
            }
            // moderate means we have to check the old doc
            auto oldDocStatus = checkValidation(txn, oldDoc.value());
            if (oldDocStatus.isOK()) {
                // transitioning from good -> bad is not ok
                return status;
            }
            // bad -> bad is ok in moderate mode
        }
    }

    dassert(txn->lockState()->isCollectionLockedForMode(ns().toString(), MODE_IX));
    invariant(oldDoc.snapshotId() == txn->recoveryUnit()->getSnapshotId());

    if (_needCappedLock) {
        // X-lock the metadata resource for this capped collection until the end of the WUOW. This
        // prevents the primary from executing with more concurrency than secondaries.
        // See SERVER-21646.
        // NOTE(review): the ResourceLock below is an unnamed temporary, so the C++ object is
        // destroyed at the end of this statement. The comment above implies the lock itself
        // stays held until the WUOW ends (presumably the locker defers the release while a
        // WUOW is active) -- confirm against the lock manager.
        Lock::ResourceLock{txn->lockState(), ResourceId(RESOURCE_METADATA, _ns.ns()), MODE_X};
    }

    // Remembered so we can assert below that the snapshot did not change under us.
    SnapshotId sid = txn->recoveryUnit()->getSnapshotId();

    BSONElement oldId = oldDoc.value()["_id"];
    if (!oldId.eoo() && (oldId != newDoc["_id"]))
        return StatusWith<RecordId>(
            ErrorCodes::InternalError, "in Collection::updateDocument _id mismatch", 13596);

    // The MMAPv1 storage engine implements capped collections in a way that does not allow records
    // to grow beyond their original size. If MMAPv1 is part of a replica set with storage engines
    // that do not have this limitation, replication could result in errors, so it is necessary to
    // set a uniform rule here. Similarly, it is not sufficient to disallow growing records, because
    // this happens when secondaries roll back an update that shrunk a record. Exactly replicating
    // legacy MMAPv1 behavior would require padding shrunk documents on all storage engines. Instead
    // forbid all size changes.
    const auto oldSize = oldDoc.value().objsize();
    if (_recordStore->isCapped() && oldSize != newDoc.objsize())
        return {ErrorCodes::CannotGrowDocumentInCappedNamespace,
                str::stream() << "Cannot change the size of a document in a capped collection: "
                              << oldSize
                              << " != "
                              << newDoc.objsize()};

    // At the end of this step, we will have a map of UpdateTickets, one per index, which
    // represent the index updates needed to be done, based on the changes between oldDoc and
    // newDoc.
    OwnedPointerMap<IndexDescriptor*, UpdateTicket> updateTickets;
    if (indexesAffected) {
        IndexCatalog::IndexIterator ii = _indexCatalog.getIndexIterator(txn, true);
        while (ii.more()) {
            IndexDescriptor* descriptor = ii.next();
            IndexCatalogEntry* entry = ii.catalogEntry(descriptor);
            IndexAccessMethod* iam = ii.accessMethod(descriptor);

            InsertDeleteOptions options;
            options.logIfError = false;
            // Duplicates are allowed unless the index is the _id index or is unique; the
            // replication coordinator can override that and allow them anyway.
            options.dupsAllowed =
                !(KeyPattern::isIdKeyPattern(descriptor->keyPattern()) || descriptor->unique()) ||
                repl::getGlobalReplicationCoordinator()->shouldIgnoreUniqueIndex(descriptor);
            // The ticket is stored in 'updateTickets' before validateUpdate() runs, so the map
            // holds it even on the early-return path below.
            UpdateTicket* updateTicket = new UpdateTicket();
            updateTickets.mutableMap()[descriptor] = updateTicket;
            Status ret = iam->validateUpdate(txn,
                                             oldDoc.value(),
                                             newDoc,
                                             oldLocation,
                                             options,
                                             updateTicket,
                                             entry->getFilterExpression());
            if (!ret.isOK()) {
                return StatusWith<RecordId>(ret);
            }
        }
    }

    // This can call back into Collection::recordStoreGoingToMove. If that happens, the old
    // object is removed from all indexes.
    StatusWith<RecordId> newLocation = _recordStore->updateRecord(
        txn, oldLocation, newDoc.objdata(), newDoc.objsize(), _enforceQuota(enforceQuota), this);

    if (!newLocation.isOK()) {
        return newLocation;
    }

    // At this point, the old object may or may not still be indexed, depending on if it was
    // moved. If the object did move, we need to add the new location to all indexes.
    if (newLocation.getValue() != oldLocation) {
        if (debug) {
            if (debug->nmoved == -1)  // default of -1 rather than 0
                debug->nmoved = 1;
            else
                debug->nmoved += 1;
        }

        // indexRecords() takes a batch; build a batch of one for the moved document.
        std::vector<BsonRecord> bsonRecords;
        BsonRecord bsonRecord = {newLocation.getValue(), &newDoc};
        bsonRecords.push_back(bsonRecord);
        Status s = _indexCatalog.indexRecords(txn, bsonRecords);
        if (!s.isOK())
            return StatusWith<RecordId>(s);
        invariant(sid == txn->recoveryUnit()->getSnapshotId());
        args.ns = ns().ns();
        getGlobalServiceContext()->getOpObserver()->onUpdate(txn, args);

        return newLocation;
    }

    // Object did not move. We update each index with each respective UpdateTicket.

    if (debug)
        debug->keyUpdates = 0;

    if (indexesAffected) {
        IndexCatalog::IndexIterator ii = _indexCatalog.getIndexIterator(txn, true);
        while (ii.more()) {
            IndexDescriptor* descriptor = ii.next();
            IndexAccessMethod* iam = ii.accessMethod(descriptor);

            int64_t updatedKeys;
            Status ret = iam->update(txn, *updateTickets.mutableMap()[descriptor], &updatedKeys);
            if (!ret.isOK())
                return StatusWith<RecordId>(ret);
            if (debug)
                debug->keyUpdates += updatedKeys;
        }
    }

    invariant(sid == txn->recoveryUnit()->getSnapshotId());
    args.ns = ns().ns();
    getGlobalServiceContext()->getOpObserver()->onUpdate(txn, args);

    return newLocation;
}
/**
 * Replaces the document at 'oldLocation' (whose current contents are 'oldDoc') with
 * 'newDoc', keeping indexes in step and notifying the op observer.
 *
 * Steps, in order:
 *   1. Validate 'newDoc'. A failure is returned immediately when the validation level is
 *      STRICT_V; otherwise it is returned only if 'oldDoc' itself passes validation.
 *   2. Reject a change of '_id' (InternalError, 13596).
 *   3. If 'indexesAffected', pre-validate the update against every index and keep one
 *      UpdateTicket per index.
 *   4. Write the new document through the record store.
 *   5. If the record moved, index the document at its new location; otherwise apply the
 *      UpdateTickets from step 3.
 *   6. Set 'args.ns' and call the op observer's onUpdate().
 *
 * 'debug' may be null; when non-null its 'nmoved' / 'keyUpdates' counters are updated.
 * 'enforceQuota' is passed through _enforceQuota() to the record store.
 *
 * Returns the (possibly new) RecordId on success, or the first failing status.
 */
StatusWith<RecordId> Collection::updateDocument(OperationContext* txn,
                                                const RecordId& oldLocation,
                                                const Snapshotted<BSONObj>& oldDoc,
                                                const BSONObj& newDoc,
                                                bool enforceQuota,
                                                bool indexesAffected,
                                                OpDebug* debug,
                                                oplogUpdateEntryArgs& args) {
    {
        auto status = checkValidation(txn, newDoc);
        if (!status.isOK()) {
            if (_validationLevel == STRICT_V) {
                return status;
            }
            // moderate means we have to check the old doc
            auto oldDocStatus = checkValidation(txn, oldDoc.value());
            if (oldDocStatus.isOK()) {
                // transitioning from good -> bad is not ok
                return status;
            }
            // bad -> bad is ok in moderate mode
        }
    }

    dassert(txn->lockState()->isCollectionLockedForMode(ns().toString(), MODE_IX));
    invariant(oldDoc.snapshotId() == txn->recoveryUnit()->getSnapshotId());

    // Remembered so we can assert below that the snapshot did not change under us.
    SnapshotId sid = txn->recoveryUnit()->getSnapshotId();

    BSONElement oldId = oldDoc.value()["_id"];
    if (!oldId.eoo() && (oldId != newDoc["_id"]))
        return StatusWith<RecordId>(
            ErrorCodes::InternalError, "in Collection::updateDocument _id mismatch", 13596);

    // At the end of this step, we will have a map of UpdateTickets, one per index, which
    // represent the index updates needed to be done, based on the changes between oldDoc and
    // newDoc.
    OwnedPointerMap<IndexDescriptor*, UpdateTicket> updateTickets;
    if (indexesAffected) {
        IndexCatalog::IndexIterator ii = _indexCatalog.getIndexIterator(txn, true);
        while (ii.more()) {
            IndexDescriptor* descriptor = ii.next();
            IndexCatalogEntry* entry = ii.catalogEntry(descriptor);
            IndexAccessMethod* iam = ii.accessMethod(descriptor);

            InsertDeleteOptions options;
            options.logIfError = false;
            // Duplicates are allowed unless the index is the _id index or is unique; the
            // replication coordinator can override that and allow them anyway.
            options.dupsAllowed =
                !(KeyPattern::isIdKeyPattern(descriptor->keyPattern()) || descriptor->unique()) ||
                repl::getGlobalReplicationCoordinator()->shouldIgnoreUniqueIndex(descriptor);
            // The ticket is stored in 'updateTickets' before validateUpdate() runs, so the map
            // holds it even on the early-return path below.
            UpdateTicket* updateTicket = new UpdateTicket();
            updateTickets.mutableMap()[descriptor] = updateTicket;
            Status ret = iam->validateUpdate(txn,
                                             oldDoc.value(),
                                             newDoc,
                                             oldLocation,
                                             options,
                                             updateTicket,
                                             entry->getFilterExpression());
            if (!ret.isOK()) {
                return StatusWith<RecordId>(ret);
            }
        }
    }

    // This can call back into Collection::recordStoreGoingToMove. If that happens, the old
    // object is removed from all indexes.
    StatusWith<RecordId> newLocation = _recordStore->updateRecord(
        txn, oldLocation, newDoc.objdata(), newDoc.objsize(), _enforceQuota(enforceQuota), this);

    if (!newLocation.isOK()) {
        return newLocation;
    }

    // At this point, the old object may or may not still be indexed, depending on if it was
    // moved. If the object did move, we need to add the new location to all indexes.
    if (newLocation.getValue() != oldLocation) {
        if (debug) {
            if (debug->nmoved == -1)  // default of -1 rather than 0
                debug->nmoved = 1;
            else
                debug->nmoved += 1;
        }

        Status s = _indexCatalog.indexRecord(txn, newDoc, newLocation.getValue());
        if (!s.isOK())
            return StatusWith<RecordId>(s);
        invariant(sid == txn->recoveryUnit()->getSnapshotId());
        args.ns = ns().ns();
        getGlobalServiceContext()->getOpObserver()->onUpdate(txn, args);

        return newLocation;
    }

    // Object did not move. We update each index with each respective UpdateTicket.

    if (debug)
        debug->keyUpdates = 0;

    if (indexesAffected) {
        IndexCatalog::IndexIterator ii = _indexCatalog.getIndexIterator(txn, true);
        while (ii.more()) {
            IndexDescriptor* descriptor = ii.next();
            IndexAccessMethod* iam = ii.accessMethod(descriptor);

            int64_t updatedKeys;
            Status ret = iam->update(txn, *updateTickets.mutableMap()[descriptor], &updatedKeys);
            if (!ret.isOK())
                return StatusWith<RecordId>(ret);
            if (debug)
                debug->keyUpdates += updatedKeys;
        }
    }

    invariant(sid == txn->recoveryUnit()->getSnapshotId());
    args.ns = ns().ns();
    getGlobalServiceContext()->getOpObserver()->onUpdate(txn, args);

    return newLocation;
}
/**
 * Builds one batch of documents to be cloned and appends it to 'result' as the array
 * field "objects".
 *
 * Documents are taken from '_cloneLocs' in order; locations whose document can no longer
 * be found are skipped. Each consumed location is erased from '_cloneLocs'. The batch is
 * closed when appending another document would exceed BSONObjMaxUserSize (with 1024
 * bytes of slack) or when '_cloneLocs' becomes empty. When the elapsed tracker fires,
 * the inner loop is left and the collection lock and mutexes are re-acquired before
 * continuing.
 *
 * Returns false and sets 'errmsg' if the migration is not active or the collection no
 * longer exists; returns true otherwise.
 */
bool MigrationSourceManager::clone(OperationContext* txn, string& errmsg, BSONObjBuilder& result) {
    ElapsedTracker tracker(internalQueryExecYieldIterations, internalQueryExecYieldPeriodMS);

    int allocSize = 0;

    // First pass under the locks: only used to size the output buffer.
    {
        AutoGetCollection autoColl(txn, _getNS(), MODE_IS);

        stdx::lock_guard<stdx::mutex> sl(_mutex);
        if (!_active) {
            errmsg = "not active";
            return false;
        }

        Collection* collection = autoColl.getCollection();
        if (!collection) {
            errmsg = str::stream() << "collection " << _nss.toString() << " does not exist";
            return false;
        }

        // NOTE(review): the product is narrowed to int before std::min is applied. If
        // (12 + averageObjectSize) * cloneLocsRemaining() can exceed INT_MAX, the cast result
        // may be negative and win the min -- confirm the operand types and ranges.
        allocSize = std::min(
            BSONObjMaxUserSize,
            static_cast<int>((12 + collection->averageObjectSize(txn)) * cloneLocsRemaining()));
    }

    bool isBufferFilled = false;
    BSONArrayBuilder clonedDocsArrayBuilder(allocSize);
    // Each iteration re-acquires the collection lock and both mutexes, so leaving the inner
    // loop on a tracker tick releases them before the next pass.
    while (!isBufferFilled) {
        AutoGetCollection autoColl(txn, _getNS(), MODE_IS);

        stdx::lock_guard<stdx::mutex> sl(_mutex);
        if (!_active) {
            errmsg = "not active";
            return false;
        }

        // TODO: fix SERVER-16540 race
        Collection* collection = autoColl.getCollection();
        if (!collection) {
            errmsg = str::stream() << "collection " << _nss.toString() << " does not exist";
            return false;
        }

        stdx::lock_guard<stdx::mutex> lk(_cloneLocsMutex);

        std::set<RecordId>::iterator cloneLocsIter = _cloneLocs.begin();
        for (; cloneLocsIter != _cloneLocs.end(); ++cloneLocsIter) {
            if (tracker.intervalHasElapsed())  // should I yield?
                break;

            RecordId recordId = *cloneLocsIter;
            Snapshotted<BSONObj> doc;
            if (!collection->findDoc(txn, recordId, &doc)) {
                // doc was deleted
                continue;
            }

            // Use the builder size instead of accumulating 'doc's size so that we take
            // into consideration the overhead of BSONArray indices, and *always*
            // append one doc.
            if (clonedDocsArrayBuilder.arrSize() != 0 &&
                (clonedDocsArrayBuilder.len() + doc.value().objsize() + 1024) >
                    BSONObjMaxUserSize) {
                isBufferFilled = true;  // break out of outer while loop
                break;
            }

            clonedDocsArrayBuilder.append(doc.value());
        }

        // Everything before 'cloneLocsIter' has been appended or found deleted.
        _cloneLocs.erase(_cloneLocs.begin(), cloneLocsIter);

        // Note: must be holding _cloneLocsMutex, don't move this inside while condition!
        if (_cloneLocs.empty()) {
            break;
        }
    }

    result.appendArray("objects", clonedDocsArrayBuilder.arr());
    return true;
}
/**
 * Applies the stage's update modifiers to 'oldObj' and, if the document actually
 * changed, writes the result back to the collection.
 *
 * The update driver is run against a mutable copy of 'oldObj'. If the driver reports a
 * modification, the write is performed inside a WriteUnitOfWork, either as an in-place
 * update with damages or as a full document replacement. For explain requests no write
 * is issued, but the document is still counted as modified.
 *
 * 'recordId' is the location of 'oldObj'. The resulting location is inserted into
 * '_updatedRecordIds' (when that set exists) if the document moved or the mods affect
 * indexes.
 *
 * Returns the post-update document, or the value of 'oldObj' when nothing changed.
 * uasserts on driver failure (16837), on an oversized result (17419), and on a
 * multi-update whose document key lacks '_id' (16980).
 */
BSONObj UpdateStage::transformAndUpdate(const Snapshotted<BSONObj>& oldObj, RecordId& recordId) {
    const UpdateRequest* request = _params.request;
    UpdateDriver* driver = _params.driver;
    CanonicalQuery* cq = _params.canonicalQuery;
    UpdateLifecycle* lifecycle = request->getLifecycle();

    // If asked to return new doc, default to the oldObj, in case nothing changes.
    BSONObj newObj = oldObj.value();

    // Ask the driver to apply the mods. It may be that the driver can apply those "in
    // place", that is, some values of the old document just get adjusted without any
    // change to the binary layout on the bson layer. It may be that a whole new document
    // is needed to accommodate the new bson layout of the resulting document. In any event,
    // only enable in-place mutations if the underlying storage engine offers support for
    // writing damage events.
    _doc.reset(oldObj.value(),
               (_collection->updateWithDamagesSupported()
                    ? mutablebson::Document::kInPlaceEnabled
                    : mutablebson::Document::kInPlaceDisabled));

    // Filled in by the driver; later passed to the op observer as 'args.update'.
    BSONObj logObj;

    bool docWasModified = false;

    Status status = Status::OK();
    const bool validateForStorage = getOpCtx()->writesAreReplicated() && _enforceOkForStorage;
    // Immutable paths are only collected for replicated writes that do not come from a
    // migration; otherwise the set stays empty.
    FieldRefSet immutablePaths;
    if (getOpCtx()->writesAreReplicated() && !request->isFromMigration()) {
        if (lifecycle) {
            auto immutablePathsVector =
                getImmutableFields(getOpCtx(), request->getNamespaceString());
            if (immutablePathsVector) {
                immutablePaths.fillFrom(
                    transitional_tools_do_not_use::unspool_vector(*immutablePathsVector));
            }
        }
        immutablePaths.keepShortest(&idFieldRef);
    }
    if (!driver->needMatchDetails()) {
        // If we don't need match details, avoid doing the rematch
        status = driver->update(
            StringData(), &_doc, validateForStorage, immutablePaths, &logObj, &docWasModified);
    } else {
        // If there was a matched field, obtain it.
        MatchDetails matchDetails;
        matchDetails.requestElemMatchKey();

        dassert(cq);
        // The rematch runs inside verify(): its side effect (filling 'matchDetails') is needed.
        verify(cq->root()->matchesBSON(oldObj.value(), &matchDetails));

        string matchedField;
        if (matchDetails.hasElemMatchKey())
            matchedField = matchDetails.elemMatchKey();

        status = driver->update(
            matchedField, &_doc, validateForStorage, immutablePaths, &logObj, &docWasModified);
    }

    if (!status.isOK()) {
        uasserted(16837, status.reason());
    }

    // Skip adding _id field if the collection is capped (since capped collection documents can
    // neither grow nor shrink).
    const auto createIdField = !_collection->isCapped();

    // Ensure if _id exists it is first
    status = ensureIdFieldIsFirst(&_doc);
    if (status.code() == ErrorCodes::InvalidIdField) {
        // Create ObjectId _id field if we are doing that
        if (createIdField) {
            addObjectIDIdField(&_doc);
        }
    } else {
        uassertStatusOK(status);
    }

    // See if the changes were applied in place
    const char* source = NULL;
    const bool inPlace = _doc.getInPlaceUpdates(&_damages, &source);

    if (inPlace && _damages.empty()) {
        // An interesting edge case. A modifier didn't notice that it was really a no-op
        // during its 'prepare' phase. That represents a missed optimization, but we still
        // shouldn't do any real work. Toggle 'docWasModified' to 'false'.
        //
        // Currently, an example of this is '{ $push : { x : {$each: [], $sort: 1} } }' when the 'x'
        // array exists and is already sorted.
        docWasModified = false;
    }

    if (docWasModified) {
        // Prepare to write back the modified document
        WriteUnitOfWork wunit(getOpCtx());

        RecordId newRecordId;
        OplogUpdateEntryArgs args;
        // The op observer arguments are only populated for real (non-explain) updates.
        if (!request->isExplain()) {
            invariant(_collection);
            auto* css = CollectionShardingState::get(getOpCtx(), _collection->ns());
            args.nss = _collection->ns();
            args.uuid = _collection->uuid();
            args.stmtId = request->getStmtId();
            args.update = logObj;
            // 'newObj' still holds the pre-update document at this point.
            args.criteria = css->getMetadata().extractDocumentKey(newObj);
            uassert(16980,
                    "Multi-update operations require all documents to have an '_id' field",
                    !request->isMulti() || args.criteria.hasField("_id"_sd));
            args.fromMigrate = request->isFromMigration();
            args.storeDocOption = getStoreDocMode(*request);
            if (args.storeDocOption == OplogUpdateEntryArgs::StoreDocOption::PreImage) {
                args.preImageDoc = oldObj.value().getOwned();
            }
        }

        if (inPlace) {
            if (!request->isExplain()) {
                newObj = oldObj.value();
                const RecordData oldRec(oldObj.value().objdata(), oldObj.value().objsize());

                Snapshotted<RecordData> snap(oldObj.snapshotId(), oldRec);

                StatusWith<RecordData> newRecStatus = _collection->updateDocumentWithDamages(
                    getOpCtx(), recordId, std::move(snap), source, _damages, &args);

                newObj = uassertStatusOK(std::move(newRecStatus)).releaseToBson();
            }

            // In this branch the record id is reported unchanged.
            newRecordId = recordId;
        } else {
            // The updates were not in place. Apply them through the file manager.

            newObj = _doc.getObject();
            uassert(17419,
                    str::stream() << "Resulting document after update is larger than "
                                  << BSONObjMaxUserSize,
                    newObj.objsize() <= BSONObjMaxUserSize);

            if (!request->isExplain()) {
                newRecordId = _collection->updateDocument(getOpCtx(),
                                                          recordId,
                                                          oldObj,
                                                          newObj,
                                                          true,
                                                          driver->modsAffectIndices(),
                                                          _params.opDebug,
                                                          &args);
            }
        }

        invariant(oldObj.snapshotId() == getOpCtx()->recoveryUnit()->getSnapshotId());
        wunit.commit();

        // If the document moved, we might see it again in a collection scan (maybe it's
        // a document after our current document).
        //
        // If the document is indexed and the mod changes an indexed value, we might see
        // it again. For an example, see the comment above near declaration of
        // updatedRecordIds.
        //
        // This must be done after the wunit commits so we are sure we won't be rolling back.
        if (_updatedRecordIds && (newRecordId != recordId || driver->modsAffectIndices())) {
            _updatedRecordIds->insert(newRecordId);
        }
    }

    // Only record doc modifications if they wrote (exclude no-ops). Explains get
    // recorded as if they wrote.
    if (docWasModified || request->isExplain()) {
        _specificStats.nModified++;
    }

    return newObj;
}
/**
 * Creates a new capped collection '<db>.<shortTo>' of the given 'size' and copies
 * documents into it from '<db>.<shortFrom>'.
 *
 * The new collection's creation spec sets "capped" and "size", sets "temp" when 'temp'
 * is true, and then adds the source collection's options through appendElementsUnique().
 *
 * Documents are copied with a forward collection scan. While the source holds more data
 * than the estimated capacity of the destination, leading documents are skipped rather
 * than copied. A WriteConflictException during an insert causes the same document to be
 * retried after a backoff. Document validation is disabled for the duration of the copy.
 *
 * Returns NamespaceNotFound if the source does not exist, NamespaceExists if the
 * destination already exists, the failing status from collection creation, or
 * Status::OK() once the scan reaches EOF.
 */
Status cloneCollectionAsCapped(OperationContext* txn,
                               Database* db,
                               const std::string& shortFrom,
                               const std::string& shortTo,
                               double size,
                               bool temp) {
    std::string fromNs = db->name() + "." + shortFrom;
    std::string toNs = db->name() + "." + shortTo;

    Collection* fromCollection = db->getCollection(fromNs);
    if (!fromCollection)
        return Status(ErrorCodes::NamespaceNotFound,
                      str::stream() << "source collection " << fromNs << " does not exist");

    if (db->getCollection(toNs))
        return Status(ErrorCodes::NamespaceExists, "to collection already exists");

    // create new collection
    MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
        const auto fromOptions =
            fromCollection->getCatalogEntry()->getCollectionOptions(txn).toBSON();
        OldClientContext ctx(txn, toNs);
        BSONObjBuilder spec;
        spec.appendBool("capped", true);
        spec.append("size", size);
        if (temp)
            spec.appendBool("temp", true);
        spec.appendElementsUnique(fromOptions);

        WriteUnitOfWork wunit(txn);
        Status status = userCreateNS(txn, ctx.db(), toNs, spec.done());
        if (!status.isOK())
            return status;
        wunit.commit();
    }
    MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "cloneCollectionAsCapped", fromNs);

    Collection* toCollection = db->getCollection(toNs);
    invariant(toCollection);  // we created above

    // how much data to ignore because it won't fit anyway
    // datasize and extentSize can't be compared exactly, so add some padding to 'size'

    long long allocatedSpaceGuess =
        std::max(static_cast<long long>(size * 2),
                 static_cast<long long>(toCollection->getRecordStore()->storageSize(txn) * 2));

    long long excessSize = fromCollection->dataSize(txn) - allocatedSpaceGuess;

    std::unique_ptr<PlanExecutor> exec(
        InternalPlanner::collectionScan(txn, fromNs, fromCollection, InternalPlanner::FORWARD));

    exec->setYieldPolicy(PlanExecutor::WRITE_CONFLICT_RETRY_ONLY);

    Snapshotted<BSONObj> objToClone;
    RecordId loc;
    PlanExecutor::ExecState state = PlanExecutor::FAILURE;  // suppress uninitialized warnings

    DisableDocumentValidation validationDisabler(txn);

    int retries = 0;  // non-zero when retrying our last document.
    while (true) {
        // Only advance the scan when we are not retrying; on a retry 'state', 'objToClone'
        // and 'loc' still describe the document that hit the write conflict.
        if (!retries) {
            state = exec->getNextSnapshotted(&objToClone, &loc);
        }

        switch (state) {
            case PlanExecutor::IS_EOF:
                return Status::OK();
            case PlanExecutor::ADVANCED: {
                if (excessSize > 0) {
                    // 4x is for padding, power of 2, etc...
                    excessSize -= (4 * objToClone.value().objsize());
                    continue;
                }
                break;
            }
            default:
                // Unreachable as:
                // 1) We require a read lock (at a minimum) on the "from" collection
                //    and won't yield, preventing collection drop and PlanExecutor::DEAD
                // 2) PlanExecutor::FAILURE is only returned on PlanStage::FAILURE. The
                //    CollectionScan PlanStage does not have a FAILURE scenario.
                // 3) All other PlanExecutor states are handled above
                invariant(false);
        }

        try {
            // Make sure we are working with the latest version of the document.
            if (objToClone.snapshotId() != txn->recoveryUnit()->getSnapshotId() &&
                !fromCollection->findDoc(txn, loc, &objToClone)) {
                // doc was deleted so don't clone it.
                retries = 0;
                continue;
            }

            WriteUnitOfWork wunit(txn);
            toCollection->insertDocument(txn, objToClone.value(), true, txn->writesAreReplicated());
            wunit.commit();

            // Go to the next document
            retries = 0;
        } catch (const WriteConflictException& wce) {
            CurOp::get(txn)->debug().writeConflicts++;
            retries++;  // logAndBackoff expects this to be 1 on first call.
            wce.logAndBackoff(retries, "cloneCollectionAsCapped", fromNs);

            // Can't use WRITE_CONFLICT_RETRY_LOOP macros since we need to save/restore exec
            // around call to abandonSnapshot.
            exec->saveState();
            txn->recoveryUnit()->abandonSnapshot();
            exec->restoreState(txn);  // Handles any WCEs internally.
        }
    }

    invariant(false);  // unreachable
}
/**
 * Creates a new capped collection '<db>.<shortTo>' of the given 'size' and copies
 * documents into it from '<db>.<shortFrom>'.
 *
 * The new collection starts from the source collection's options with the uuid cleared,
 * 'capped' set, 'cappedSize' set to 'size', and 'temp' set when requested.
 *
 * Documents are copied with a forward collection scan. While the source holds more data
 * than the estimated capacity of the destination, leading documents are skipped rather
 * than copied. A WriteConflictException during an insert causes the same document to be
 * retried after a backoff. Document validation is disabled for the duration of the copy.
 *
 * Returns:
 *   - CommandNotSupportedOnView if the source namespace is a view,
 *   - NamespaceNotFound if the source does not exist or is drop-pending,
 *   - NamespaceExists if the destination already exists,
 *   - the failing status from createCollection() or from restoring the executor,
 *   - Status::OK() once the scan reaches EOF.
 * A failed insert is raised through uassertStatusOK() rather than returned.
 */
mongo::Status mongo::cloneCollectionAsCapped(OperationContext* opCtx,
                                             Database* db,
                                             const std::string& shortFrom,
                                             const std::string& shortTo,
                                             long long size,
                                             bool temp) {
    NamespaceString fromNss(db->name(), shortFrom);
    NamespaceString toNss(db->name(), shortTo);

    Collection* fromCollection = db->getCollection(opCtx, fromNss);
    if (!fromCollection) {
        // Distinguish "is a view" from "does not exist" for a clearer error.
        if (db->getViewCatalog()->lookup(opCtx, fromNss.ns())) {
            return Status(ErrorCodes::CommandNotSupportedOnView,
                          str::stream() << "cloneCollectionAsCapped not supported for views: "
                                        << fromNss.ns());
        }
        return Status(ErrorCodes::NamespaceNotFound,
                      str::stream() << "source collection " << fromNss.ns() << " does not exist");
    }

    if (fromNss.isDropPendingNamespace()) {
        return Status(ErrorCodes::NamespaceNotFound,
                      str::stream() << "source collection " << fromNss.ns()
                                    << " is currently in a drop-pending state.");
    }

    if (db->getCollection(opCtx, toNss)) {
        return Status(ErrorCodes::NamespaceExists,
                      str::stream() << "cloneCollectionAsCapped failed - destination collection "
                                    << toNss.ns()
                                    << " already exists. source collection: "
                                    << fromNss.ns());
    }

    // create new collection
    {
        auto options = fromCollection->getCatalogEntry()->getCollectionOptions(opCtx);
        // The capped collection will get its own new unique id, as the conversion isn't reversible,
        // so it can't be rolled back.
        options.uuid.reset();
        options.capped = true;
        options.cappedSize = size;
        if (temp)
            options.temp = true;

        BSONObjBuilder cmd;
        cmd.append("create", toNss.coll());
        cmd.appendElements(options.toBSON());
        Status status = createCollection(opCtx, toNss.db().toString(), cmd.done());
        if (!status.isOK())
            return status;
    }

    Collection* toCollection = db->getCollection(opCtx, toNss);
    invariant(toCollection);  // we created above

    // how much data to ignore because it won't fit anyway
    // datasize and extentSize can't be compared exactly, so add some padding to 'size'

    long long allocatedSpaceGuess =
        std::max(static_cast<long long>(size * 2),
                 static_cast<long long>(toCollection->getRecordStore()->storageSize(opCtx) * 2));

    long long excessSize = fromCollection->dataSize(opCtx) - allocatedSpaceGuess;

    auto exec = InternalPlanner::collectionScan(opCtx,
                                                fromNss.ns(),
                                                fromCollection,
                                                PlanExecutor::WRITE_CONFLICT_RETRY_ONLY,
                                                InternalPlanner::FORWARD);

    Snapshotted<BSONObj> objToClone;
    RecordId loc;
    PlanExecutor::ExecState state = PlanExecutor::FAILURE;  // suppress uninitialized warnings

    DisableDocumentValidation validationDisabler(opCtx);

    int retries = 0;  // non-zero when retrying our last document.
    while (true) {
        // Only advance the scan when we are not retrying; on a retry 'state', 'objToClone'
        // and 'loc' still describe the document that hit the write conflict.
        if (!retries) {
            state = exec->getNextSnapshotted(&objToClone, &loc);
        }

        switch (state) {
            case PlanExecutor::IS_EOF:
                return Status::OK();
            case PlanExecutor::ADVANCED: {
                if (excessSize > 0) {
                    // 4x is for padding, power of 2, etc...
                    excessSize -= (4 * objToClone.value().objsize());
                    continue;
                }
                break;
            }
            default:
                // Unreachable as:
                // 1) We require a read lock (at a minimum) on the "from" collection
                //    and won't yield, preventing collection drop and PlanExecutor::DEAD
                // 2) PlanExecutor::FAILURE is only returned on PlanStage::FAILURE. The
                //    CollectionScan PlanStage does not have a FAILURE scenario.
                // 3) All other PlanExecutor states are handled above
                MONGO_UNREACHABLE;
        }

        try {
            // Make sure we are working with the latest version of the document.
            if (objToClone.snapshotId() != opCtx->recoveryUnit()->getSnapshotId() &&
                !fromCollection->findDoc(opCtx, loc, &objToClone)) {
                // doc was deleted so don't clone it.
                retries = 0;
                continue;
            }

            WriteUnitOfWork wunit(opCtx);
            OpDebug* const nullOpDebug = nullptr;
            uassertStatusOK(toCollection->insertDocument(
                opCtx, InsertStatement(objToClone.value()), nullOpDebug, true));
            wunit.commit();

            // Go to the next document
            retries = 0;
        } catch (const WriteConflictException&) {
            CurOp::get(opCtx)->debug().additiveMetrics.incrementWriteConflicts(1);
            retries++;  // logAndBackoff expects this to be 1 on first call.
            WriteConflictException::logAndBackoff(retries, "cloneCollectionAsCapped", fromNss.ns());

            // Can't use writeConflictRetry since we need to save/restore exec around call to
            // abandonSnapshot.
            exec->saveState();
            opCtx->recoveryUnit()->abandonSnapshot();
            auto restoreStatus = exec->restoreState();  // Handles any WCEs internally.
            if (!restoreStatus.isOK()) {
                return restoreStatus;
            }
        }
    }

    MONGO_UNREACHABLE;
}
void run() { OldClientWriteContext ctx(&_txn, ns()); Database* db = ctx.db(); Collection* coll = db->getCollection(ns()); if (!coll) { WriteUnitOfWork wuow(&_txn); coll = db->createCollection(&_txn, ns()); wuow.commit(); } { WriteUnitOfWork wuow(&_txn); fillData(); wuow.commit(); } // The data we're going to later invalidate. set<RecordId> locs; getLocs(&locs, coll); std::unique_ptr<PlanExecutor> exec(makePlanExecutorWithSortStage(coll)); SortStage* ss = static_cast<SortStage*>(exec->getRootStage()); QueuedDataStage* ms = static_cast<QueuedDataStage*>(ss->getChildren()[0]); // Have sort read in data from the queued data stage. const int firstRead = 5; for (int i = 0; i < firstRead; ++i) { WorkingSetID id = WorkingSet::INVALID_ID; PlanStage::StageState status = ss->work(&id); ASSERT_NOT_EQUALS(PlanStage::ADVANCED, status); } // We should have read in the first 'firstRead' locs. Invalidate the first one. // Since it's in the WorkingSet, the updates should not be reflected in the output. exec->saveState(); set<RecordId>::iterator it = locs.begin(); Snapshotted<BSONObj> oldDoc = coll->docFor(&_txn, *it); OID updatedId = oldDoc.value().getField("_id").OID(); SnapshotId idBeforeUpdate = oldDoc.snapshotId(); // We purposefully update the document to have a 'foo' value greater than limit(). // This allows us to check that we don't return the new copy of a doc by asserting // foo < limit(). BSONObj newDoc = BSON("_id" << updatedId << "foo" << limit() + 10); oplogUpdateEntryArgs args; { WriteUnitOfWork wuow(&_txn); coll->updateDocument(&_txn, *it, oldDoc, newDoc, false, false, NULL, args); wuow.commit(); } exec->restoreState(&_txn); // Read the rest of the data from the queued data stage. while (!ms->isEOF()) { WorkingSetID id = WorkingSet::INVALID_ID; ss->work(&id); } // Let's just invalidate everything now. Already read into ss, so original values // should be fetched. 
exec->saveState(); while (it != locs.end()) { oldDoc = coll->docFor(&_txn, *it); { WriteUnitOfWork wuow(&_txn); coll->updateDocument(&_txn, *it++, oldDoc, newDoc, false, false, NULL, args); wuow.commit(); } } exec->restoreState(&_txn); // Verify that it's sorted, the right number of documents are returned, and they're all // in the expected range. int count = 0; int lastVal = 0; int thisVal; while (!ss->isEOF()) { WorkingSetID id = WorkingSet::INVALID_ID; PlanStage::StageState status = ss->work(&id); if (PlanStage::ADVANCED != status) { ASSERT_NE(status, PlanStage::FAILURE); ASSERT_NE(status, PlanStage::DEAD); continue; } WorkingSetMember* member = exec->getWorkingSet()->get(id); ASSERT(member->hasObj()); if (member->obj.value().getField("_id").OID() == updatedId) { ASSERT(idBeforeUpdate == member->obj.snapshotId()); } thisVal = member->obj.value().getField("foo").Int(); ASSERT_LTE(lastVal, thisVal); // Expect docs in range [0, limit) ASSERT_LTE(0, thisVal); ASSERT_LT(thisVal, limit()); lastVal = thisVal; ++count; } // Returns all docs. ASSERT_EQUALS(limit(), count); }
static bool runImpl(OperationContext* txn, const string& dbname, const string& ns, const BSONObj& query, const BSONObj& fields, const BSONObj& update, const BSONObj& sort, bool upsert, bool returnNew, bool remove , BSONObjBuilder& result, string& errmsg) { AutoGetOrCreateDb autoDb(txn, dbname, MODE_IX); Lock::CollectionLock collLock(txn->lockState(), ns, MODE_IX); Client::Context ctx(txn, ns, autoDb.getDb(), autoDb.justCreated()); if (!repl::getGlobalReplicationCoordinator()->canAcceptWritesForDatabase(dbname)) { return appendCommandStatus(result, Status(ErrorCodes::NotMaster, str::stream() << "Not primary while running findAndModify in " << ns)); } Collection* collection = ctx.db()->getCollection(ns); const WhereCallbackReal whereCallback(txn, StringData(ns)); if ( !collection ) { if ( !upsert ) { // no collectio and no upsert, so can't possible do anything _appendHelper( result, BSONObj(), false, fields, whereCallback ); return true; } // no collection, but upsert, so we want to create it // problem is we only have IX on db and collection :( // so we tell our caller who can do it errmsg = "no-collection"; return false; } Snapshotted<BSONObj> snapshotDoc; RecordId loc; bool found = false; { CanonicalQuery* cq; const BSONObj projection; const long long skip = 0; const long long limit = -1; // 1 document requested; negative indicates hard limit. 
uassertStatusOK(CanonicalQuery::canonicalize(ns, query, sort, projection, skip, limit, &cq, whereCallback)); PlanExecutor* rawExec; uassertStatusOK(getExecutor(txn, collection, cq, PlanExecutor::YIELD_AUTO, &rawExec, QueryPlannerParams::DEFAULT)); scoped_ptr<PlanExecutor> exec(rawExec); PlanExecutor::ExecState state = exec->getNextSnapshotted(&snapshotDoc, &loc); if (PlanExecutor::ADVANCED == state) { found = true; } else if (PlanExecutor::FAILURE == state || PlanExecutor::DEAD == state) { if (PlanExecutor::FAILURE == state && WorkingSetCommon::isValidStatusMemberObject(snapshotDoc.value())) { const Status errorStatus = WorkingSetCommon::getMemberObjectStatus(snapshotDoc.value()); invariant(!errorStatus.isOK()); uasserted(errorStatus.code(), errorStatus.reason()); } uasserted(ErrorCodes::OperationFailed, str::stream() << "executor returned " << PlanExecutor::statestr(state) << " while finding document to update"); } else { invariant(PlanExecutor::IS_EOF == state); } } WriteUnitOfWork wuow(txn); if (found) { // We found a doc, but it might not be associated with the active snapshot. // If the doc has changed or is no longer in the collection, we will throw a // write conflict exception and start again from the beginning. if (txn->recoveryUnit()->getSnapshotId() != snapshotDoc.snapshotId()) { BSONObj oldObj = snapshotDoc.value(); if (!collection->findDoc(txn, loc, &snapshotDoc)) { // Got deleted in the new snapshot. throw WriteConflictException(); } if (!oldObj.binaryEqual(snapshotDoc.value())) { // Got updated in the new snapshot. throw WriteConflictException(); } } // If we get here without throwing, then we should have the copy of the doc from // the latest snapshot. 
invariant(txn->recoveryUnit()->getSnapshotId() == snapshotDoc.snapshotId()); } BSONObj doc = snapshotDoc.value(); BSONObj queryModified = query; if (found && !doc["_id"].eoo() && !CanonicalQuery::isSimpleIdQuery(query)) { // we're going to re-write the query to be more efficient // we have to be a little careful because of positional operators // maybe we can pass this all through eventually, but right now isn't an easy way bool hasPositionalUpdate = false; { // if the update has a positional piece ($) // then we need to pull all query parts in // so here we check for $ // a little hacky BSONObjIterator i( update ); while ( i.more() ) { const BSONElement& elem = i.next(); if ( elem.fieldName()[0] != '$' || elem.type() != Object ) continue; BSONObjIterator j( elem.Obj() ); while ( j.more() ) { if ( str::contains( j.next().fieldName(), ".$" ) ) { hasPositionalUpdate = true; break; } } } } BSONObjBuilder b(query.objsize() + 10); b.append( doc["_id"] ); bool addedAtomic = false; BSONObjIterator i(query); while ( i.more() ) { const BSONElement& elem = i.next(); if ( str::equals( "_id" , elem.fieldName() ) ) { // we already do _id continue; } if ( ! hasPositionalUpdate ) { // if there is a dotted field, accept we may need more query parts continue; } if ( ! addedAtomic ) { b.appendBool( "$atomic" , true ); addedAtomic = true; } b.append( elem ); } queryModified = b.obj(); } if ( remove ) { _appendHelper(result, doc, found, fields, whereCallback); if ( found ) { deleteObjects(txn, ctx.db(), ns, queryModified, PlanExecutor::YIELD_MANUAL, true, true); BSONObjBuilder le( result.subobjStart( "lastErrorObject" ) ); le.appendNumber( "n" , 1 ); le.done(); } } else { // update if ( ! found && ! upsert ) { // didn't have it, and am not upserting _appendHelper(result, doc, found, fields, whereCallback); } else { // we found it or we're updating if ( ! 
returnNew ) { _appendHelper(result, doc, found, fields, whereCallback); } const NamespaceString requestNs(ns); UpdateRequest request(requestNs); request.setQuery(queryModified); request.setUpdates(update); request.setUpsert(upsert); request.setUpdateOpLog(); request.setStoreResultDoc(returnNew); request.setYieldPolicy(PlanExecutor::YIELD_MANUAL); // TODO(greg) We need to send if we are ignoring // the shard version below, but for now no UpdateLifecycleImpl updateLifecycle(false, requestNs); request.setLifecycle(&updateLifecycle); UpdateResult res = mongo::update(txn, ctx.db(), request, &txn->getCurOp()->debug()); if (!found && res.existing) { // No match was found during the read part of this find and modify, which // means that we're here doing an upsert. But the update also told us that // we modified an *already existing* document. This probably means that // the query reported EOF based on an out-of-date snapshot. This should be // a rare event, so we handle it by throwing a write conflict. throw WriteConflictException(); } if ( !collection ) { // collection created by an upsert collection = ctx.db()->getCollection(ns); } LOG(3) << "update result: " << res ; if (returnNew) { dassert(!res.newObj.isEmpty()); _appendHelper(result, res.newObj, true, fields, whereCallback); } BSONObjBuilder le( result.subobjStart( "lastErrorObject" ) ); le.appendBool( "updatedExisting" , res.existing ); le.appendNumber( "n" , res.numMatched ); if ( !res.upserted.isEmpty() ) { le.append( res.upserted[kUpsertedFieldName] ); } le.done(); } } // Committing the WUOW can close the current snapshot. Until this happens, the // snapshot id should not have changed. if (found) { invariant(txn->recoveryUnit()->getSnapshotId() == snapshotDoc.snapshotId()); } wuow.commit(); return true; }
virtual bool run(OperationContext* txn, const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { ScopedTransaction transaction(txn, MODE_X); Lock::GlobalWrite globalWriteLock(txn->lockState()); string source = cmdObj.getStringField(name.c_str()); string target = cmdObj.getStringField("to"); if (!fromRepl && !repl::getGlobalReplicationCoordinator()->canAcceptWritesForDatabase(dbname)) { return appendCommandStatus(result, Status(ErrorCodes::NotMaster, str::stream() << "Not primary while renaming collection " << source << " to " << target)); } // We stay in source context the whole time. This is mostly to set the CurOp namespace. Client::Context ctx(txn, source); if (!NamespaceString::validCollectionComponent(target.c_str())) { errmsg = "invalid collection name: " + target; return false; } if (source.empty() || target.empty()) { errmsg = "invalid command syntax"; return false; } if ((repl::getGlobalReplicationCoordinator()->getReplicationMode() != repl::ReplicationCoordinator::modeNone)) { if (NamespaceString(source).isOplog()) { errmsg = "can't rename live oplog while replicating"; return false; } if (NamespaceString(target).isOplog()) { errmsg = "can't rename to live oplog while replicating"; return false; } } if (NamespaceString::oplog(source) != NamespaceString::oplog(target)) { errmsg = "If either the source or target of a rename is an oplog name, both must be"; return false; } if (!fromRepl) { // If it got through on the master, need to allow it here too Status sourceStatus = userAllowedWriteNS(source); if (!sourceStatus.isOK()) { errmsg = "error with source namespace: " + sourceStatus.reason(); return false; } Status targetStatus = userAllowedWriteNS(target); if (!targetStatus.isOK()) { errmsg = "error with target namespace: " + targetStatus.reason(); return false; } } if (NamespaceString(source).coll() == "system.indexes" || NamespaceString(target).coll() == "system.indexes") { errmsg = "renaming system.indexes is 
not allowed"; return false; } Database* const sourceDB = dbHolder().get(txn, nsToDatabase(source)); Collection* const sourceColl = sourceDB ? sourceDB->getCollection(source) : NULL; if (!sourceColl) { errmsg = "source namespace does not exist"; return false; } { // Ensure that collection name does not exceed maximum length. // Ensure that index names do not push the length over the max. // Iterator includes unfinished indexes. IndexCatalog::IndexIterator sourceIndIt = sourceColl->getIndexCatalog()->getIndexIterator(txn, true); int longestIndexNameLength = 0; while (sourceIndIt.more()) { int thisLength = sourceIndIt.next()->indexName().length(); if (thisLength > longestIndexNameLength) longestIndexNameLength = thisLength; } unsigned int longestAllowed = min(int(NamespaceString::MaxNsCollectionLen), int(NamespaceString::MaxNsLen) - 2 /*strlen(".$")*/ - longestIndexNameLength); if (target.size() > longestAllowed) { StringBuilder sb; sb << "collection name length of " << target.size() << " exceeds maximum length of " << longestAllowed << ", allowing for index names"; errmsg = sb.str(); return false; } } BackgroundOperation::assertNoBgOpInProgForNs(source); Database* const targetDB = dbHolder().openDb(txn, nsToDatabase(target)); { WriteUnitOfWork wunit(txn); // Check if the target namespace exists and if dropTarget is true. // If target exists and dropTarget is not true, return false. if (targetDB->getCollection(target)) { if (!cmdObj["dropTarget"].trueValue()) { errmsg = "target namespace exists"; return false; } Status s = targetDB->dropCollection(txn, target); if (!s.isOK()) { errmsg = s.toString(); return false; } } // If we are renaming in the same database, just // rename the namespace and we're done. 
if (sourceDB == targetDB) { Status s = targetDB->renameCollection(txn, source, target, cmdObj["stayTemp"].trueValue()); if (!s.isOK()) { return appendCommandStatus(result, s); } if (!fromRepl) { repl::logOp(txn, "c", (dbname + ".$cmd").c_str(), cmdObj); } wunit.commit(); return true; } wunit.commit(); } // If we get here, we are renaming across databases, so we must copy all the data and // indexes, then remove the source collection. // Create the target collection. It will be removed if we fail to copy the collection. // TODO use a temp collection and unset the temp flag on success. Collection* targetColl = NULL; { CollectionOptions options; options.setNoIdIndex(); if (sourceColl->isCapped()) { const CollectionOptions sourceOpts = sourceColl->getCatalogEntry()->getCollectionOptions(txn); options.capped = true; options.cappedSize = sourceOpts.cappedSize; options.cappedMaxDocs = sourceOpts.cappedMaxDocs; } WriteUnitOfWork wunit(txn); // No logOp necessary because the entire renameCollection command is one logOp. targetColl = targetDB->createCollection(txn, target, options); if (!targetColl) { errmsg = "Failed to create target collection."; return false; } wunit.commit(); } // Dismissed on success ScopeGuard targetCollectionDropper = MakeGuard(dropCollection, txn, targetDB, target); MultiIndexBlock indexer(txn, targetColl); indexer.allowInterruption(); // Copy the index descriptions from the source collection, adjusting the ns field. { std::vector<BSONObj> indexesToCopy; IndexCatalog::IndexIterator sourceIndIt = sourceColl->getIndexCatalog()->getIndexIterator(txn, true); while (sourceIndIt.more()) { const BSONObj currIndex = sourceIndIt.next()->infoObj(); // Process the source index. BSONObjBuilder newIndex; newIndex.append("ns", target); newIndex.appendElementsUnique(currIndex); indexesToCopy.push_back(newIndex.obj()); } indexer.init(indexesToCopy); } { // Copy over all the data from source collection to target collection. 
boost::scoped_ptr<RecordIterator> sourceIt(sourceColl->getIterator(txn)); while (!sourceIt->isEOF()) { txn->checkForInterrupt(); const Snapshotted<BSONObj> obj = sourceColl->docFor(txn, sourceIt->getNext()); WriteUnitOfWork wunit(txn); // No logOp necessary because the entire renameCollection command is one logOp. Status status = targetColl->insertDocument(txn, obj.value(), &indexer, true).getStatus(); if (!status.isOK()) return appendCommandStatus(result, status); wunit.commit(); } } Status status = indexer.doneInserting(); if (!status.isOK()) return appendCommandStatus(result, status); { // Getting here means we successfully built the target copy. We now remove the // source collection and finalize the rename. WriteUnitOfWork wunit(txn); Status status = sourceDB->dropCollection(txn, source); if (!status.isOK()) return appendCommandStatus(result, status); indexer.commit(); if (!fromRepl) { repl::logOp(txn, "c", (dbname + ".$cmd").c_str(), cmdObj); } wunit.commit(); } targetCollectionDropper.Dismiss(); return true; }
/**
 * Scans the whole collection and feeds every document to insert(), then finishes the build
 * with doneInserting(dupsOut).
 *
 * Each document is inserted in its own WriteUnitOfWork. A WriteConflictException makes the
 * same document be retried after backing off and abandoning the snapshot.
 *
 * @param dupsOut  when non-null, the location of a document whose insert fails with
 *                 DuplicateKey is added to this set instead of failing the whole build.
 *
 * @return Status::OK() on success; otherwise the first failing Status from insert(), from
 *         restoring the executor, or from doneInserting(). uasserts with code 28550 when
 *         the collection scan stops in any state other than IS_EOF.
 */
Status MultiIndexBlockImpl::insertAllDocumentsInCollection(std::set<RecordId>* dupsOut) {
    const char* curopMessage = _buildInBackground ? "Index Build (background)" : "Index Build";
    const auto numRecords = _collection->numRecords(_opCtx);

    // The progress message is installed while holding the client lock.
    stdx::unique_lock<Client> lk(*_opCtx->getClient());
    ProgressMeterHolder progress(
        CurOp::get(_opCtx)->setMessage_inlock(curopMessage, curopMessage, numRecords));
    lk.unlock();

    Timer t;

    unsigned long long n = 0;

    // Background builds must be interruptible and yield automatically.
    if (_buildInBackground) {
        invariant(_allowInterruption);
    }
    const PlanExecutor::YieldPolicy yieldPolicy = _buildInBackground
        ? PlanExecutor::YIELD_AUTO
        : PlanExecutor::WRITE_CONFLICT_RETRY_ONLY;

    auto exec =
        InternalPlanner::collectionScan(_opCtx, _collection->ns().ns(), _collection, yieldPolicy);

    Snapshotted<BSONObj> objToIndex;
    RecordId loc;
    PlanExecutor::ExecState state;
    int retries = 0;  // non-zero when retrying our last document.

    // The executor is only advanced when we are not retrying. The failpoint keeps the loop
    // alive even after the scan has stopped advancing.
    while (retries ||
           (PlanExecutor::ADVANCED == (state = exec->getNextSnapshotted(&objToIndex, &loc))) ||
           MONGO_FAIL_POINT(hangAfterStartingIndexBuild)) {
        try {
            if (_allowInterruption) {
                _opCtx->checkForInterrupt();
            }

            const bool haveDocument = retries || (PlanExecutor::ADVANCED == state);
            if (!haveDocument) {
                // The only reason we are still in the loop is hangAfterStartingIndexBuild.
                log() << "Hanging index build due to 'hangAfterStartingIndexBuild' failpoint";
                invariant(_allowInterruption);
                sleepmillis(1000);
                continue;
            }

            // Make sure we are working with the latest version of the document.
            const bool fromOlderSnapshot =
                objToIndex.snapshotId() != _opCtx->recoveryUnit()->getSnapshotId();
            if (fromOlderSnapshot && !_collection->findDoc(_opCtx, loc, &objToIndex)) {
                // doc was deleted so don't index it.
                retries = 0;
                continue;
            }

            // Done before insert so we can retry document if it WCEs.
            progress->setTotalWhileRunning(_collection->numRecords(_opCtx));

            WriteUnitOfWork wunit(_opCtx);
            Status ret = insert(objToIndex.value(), loc);
            if (_buildInBackground) {
                exec->saveState();
            }

            if (!ret.isOK()) {
                // If dupsOut is non-null, we should only fail the specific insert that
                // led to a DuplicateKey rather than the whole index build.
                const bool tolerateDuplicate =
                    dupsOut && ret.code() == ErrorCodes::DuplicateKey;
                if (!tolerateDuplicate) {
                    // Fail the index build hard.
                    return ret;
                }
                dupsOut->insert(loc);
            } else {
                wunit.commit();
            }

            if (_buildInBackground) {
                auto restoreStatus = exec->restoreState();  // Handles any WCEs internally.
                if (!restoreStatus.isOK()) {
                    return restoreStatus;
                }
            }

            // Go to the next document
            progress->hit();
            n++;
            retries = 0;
        } catch (const WriteConflictException&) {
            CurOp::get(_opCtx)->debug().writeConflicts++;
            retries++;  // logAndBackoff expects this to be 1 on first call.
            WriteConflictException::logAndBackoff(
                retries, "index creation", _collection->ns().ns());

            // Can't use writeConflictRetry since we need to save/restore exec around call to
            // abandonSnapshot.
            exec->saveState();
            _opCtx->recoveryUnit()->abandonSnapshot();
            auto restoreStatus = exec->restoreState();  // Handles any WCEs internally.
            if (!restoreStatus.isOK()) {
                return restoreStatus;
            }
        }
    }

    // The scan has to have run to completion.
    uassert(28550,
            "Unable to complete index build due to collection scan failure: " +
                WorkingSetCommon::toStatusString(objToIndex.value()),
            state == PlanExecutor::IS_EOF);

    if (MONGO_FAIL_POINT(hangAfterStartingIndexBuildUnlocked)) {
        // Unlock before hanging so replication recognizes we've completed.
        Locker::LockSnapshot lockInfo;
        _opCtx->lockState()->saveLockStateAndUnlock(&lockInfo);
        while (MONGO_FAIL_POINT(hangAfterStartingIndexBuildUnlocked)) {
            log() << "Hanging index build with no locks due to "
                     "'hangAfterStartingIndexBuildUnlocked' failpoint";
            sleepmillis(1000);
        }
        // If we want to support this, we'd need to regrab the lock and be sure that all callers
        // are ok with us yielding. They should be for BG indexes, but not for foreground.
        invariant(!"the hangAfterStartingIndexBuildUnlocked failpoint can't be turned off");
    }

    progress->finished();

    Status ret = doneInserting(dupsOut);
    if (!ret.isOK()) {
        return ret;
    }

    log() << "build index done. scanned " << n << " total records. " << t.seconds() << " secs";

    return Status::OK();
}