// This free function is used by the writer threads to apply each op void multiSyncApply(const std::vector<BSONObj>& ops, SyncTail* st) { initializeWriterThread(); OperationContextImpl txn; txn.setReplicatedWrites(false); DisableDocumentValidation validationDisabler(&txn); // allow us to get through the magic barrier txn.lockState()->setIsBatchWriter(true); bool convertUpdatesToUpserts = true; for (std::vector<BSONObj>::const_iterator it = ops.begin(); it != ops.end(); ++it) { try { const Status s = SyncTail::syncApply(&txn, *it, convertUpdatesToUpserts); if (!s.isOK()) { severe() << "Error applying operation (" << it->toString() << "): " << s; fassertFailedNoTrace(16359); } } catch (const DBException& e) { severe() << "writer worker caught exception: " << causedBy(e) << " on: " << it->toString(); if (inShutdown()) { return; } fassertFailedNoTrace(16360); } } }
// This free function is used by the initial sync writer threads to apply each op void multiInitialSyncApply(const std::vector<BSONObj>& ops, SyncTail* st) { initializeWriterThread(); OperationContextImpl txn; txn.setReplicatedWrites(false); DisableDocumentValidation validationDisabler(&txn); // allow us to get through the magic barrier txn.lockState()->setIsBatchWriter(true); bool convertUpdatesToUpserts = false; for (std::vector<BSONObj>::const_iterator it = ops.begin(); it != ops.end(); ++it) { try { const Status s = SyncTail::syncApply(&txn, *it, convertUpdatesToUpserts); if (!s.isOK()) { if (st->shouldRetry(&txn, *it)) { const Status s2 = SyncTail::syncApply(&txn, *it, convertUpdatesToUpserts); if (!s2.isOK()) { severe() << "Error applying operation (" << it->toString() << "): " << s2; fassertFailedNoTrace(15915); } } // If shouldRetry() returns false, fall through. // This can happen if the document that was moved and missed by Cloner // subsequently got deleted and no longer exists on the Sync Target at all } } catch (const DBException& e) { severe() << "writer worker caught exception: " << causedBy(e) << " on: " << it->toString(); if (inShutdown()) { return; } fassertFailedNoTrace(16361); } } }
// static
// Applies a single oplog entry 'op'.
//
// The entry's "ns" and "op" fields select the path taken below:
//   - during shutdown nothing is applied and OK is returned;
//   - an empty namespace, or one starting with '.', is skipped with OK (logged as an error
//     unless the entry is a no-op);
//   - commands ('c') run through 'applyCommandInLock' under a global write lock;
//   - no-ops ('n') and inserts into "system.indexes" run through 'applyOperationInLock'
//     under a MODE_X database lock.
// 'incrementOpsAppliedStats' is invoked after a command is applied and is forwarded to
// 'applyOperationInLock' otherwise.
//
// NOTE(review): this definition is cut off in the visible portion of the file; the handling
// of the remaining operation types and the closing brace are not shown here.
Status SyncTail::syncApply(OperationContext* txn,
                           const BSONObj& op,
                           bool convertUpdateToUpsert,
                           ApplyOperationInLockFn applyOperationInLock,
                           ApplyCommandInLockFn applyCommandInLock,
                           IncrementOpsAppliedStatsFn incrementOpsAppliedStats) {
    if (inShutdown()) {
        return Status::OK();
    }

    // Count each log op application as a separate operation, for reporting purposes
    CurOp individualOp(txn);

    const char* ns = op.getStringField("ns");
    verify(ns);

    // Only the first character of the "op" field is inspected below.
    const char* opType = op["op"].valuestrsafe();

    bool isCommand(opType[0] == 'c');
    bool isNoOp(opType[0] == 'n');

    if ((*ns == '\0') || (*ns == '.')) {
        // this is ugly
        // this is often a no-op
        // but can't be 100% sure
        if (!isNoOp) {
            error() << "skipping bad op in oplog: " << op.toString();
        }
        return Status::OK();
    }

    if (isCommand) {
        MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
            // a command may need a global write lock. so we will conservatively go
            // ahead and grab one here. suboptimal. :-(
            Lock::GlobalWrite globalWriteLock(txn->lockState());

            // special case apply for commands to avoid implicit database creation
            Status status = applyCommandInLock(txn, op);
            incrementOpsAppliedStats();
            return status;
        }
        MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "syncApply_command", ns);
    }

    // Shared helper for the non-command paths: applies 'op' against 'db' and rethrows a
    // WriteConflict status as a WriteConflictException so the enclosing retry loop sees it.
    auto applyOp = [&](Database* db) {
        // For non-initial-sync, we convert updates to upserts
        // to suppress errors when replaying oplog entries.
        txn->setReplicatedWrites(false);
        DisableDocumentValidation validationDisabler(txn);

        Status status =
            applyOperationInLock(txn, db, op, convertUpdateToUpsert, incrementOpsAppliedStats);
        if (!status.isOK() && status.code() == ErrorCodes::WriteConflict) {
            throw WriteConflictException();
        }
        return status;
    };

    // No-ops and inserts into "system.indexes" take the database lock in MODE_X.
    if (isNoOp || (opType[0] == 'i' && nsToCollectionSubstring(ns) == "system.indexes")) {
        auto opStr = isNoOp ? "syncApply_noop" : "syncApply_indexBuild";
        MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
            Lock::DBLock dbLock(txn->lockState(), nsToDatabaseSubstring(ns), MODE_X);
            OldClientContext ctx(txn, ns);
            return applyOp(ctx.db());
        }
        MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, opStr, ns);
    }
/**
 * Compacts 'collection' and its indexes.
 *
 * The caller is expected to hold the collection lock in MODE_X (checked with dassert).
 * Document validation is disabled for the duration of the call.
 *
 * Two strategies exist, chosen by the record store:
 *  - in-place: the record store and then the index catalog are compacted directly;
 *  - rebuild: every index spec is captured and validated, all indexes are dropped, the
 *    record store is compacted, and the indexes are rebuilt in the foreground through a
 *    MultiIndexBlock.
 *
 * Returns CommandNotSupported when the record store cannot compact, BadValue when index
 * builds are in progress, CannotCreateIndex when an existing index has an invalid key
 * pattern, or the first non-OK status reported by a compaction/index step. On success a
 * default-constructed CompactStats is returned. 'compactOptions' is not referenced by
 * this body.
 */
StatusWith<CompactStats> compactCollection(OperationContext* opCtx,
                                           Collection* collection,
                                           const CompactOptions* compactOptions) {
    dassert(opCtx->lockState()->isCollectionLockedForMode(collection->ns().toString(), MODE_X));

    DisableDocumentValidation validationDisabler(opCtx);

    auto rs = collection->getRecordStore();
    auto catalog = collection->getIndexCatalog();

    if (!rs->compactSupported()) {
        return StatusWith<CompactStats>(ErrorCodes::CommandNotSupported,
                                        str::stream()
                                            << "cannot compact collection with record store: "
                                            << rs->name());
    }

    if (rs->compactsInPlace()) {
        CompactStats inPlaceStats;

        const Status recordStatus = rs->compact(opCtx);
        if (!recordStatus.isOK()) {
            return StatusWith<CompactStats>(recordStatus);
        }

        // Compact all indexes (not including unfinished indexes)
        const Status indexStatus = catalog->compactIndexes(opCtx);
        if (!indexStatus.isOK()) {
            return StatusWith<CompactStats>(indexStatus);
        }

        return StatusWith<CompactStats>(inPlaceStats);
    }

    if (catalog->numIndexesInProgress(opCtx)) {
        return StatusWith<CompactStats>(ErrorCodes::BadValue,
                                        "cannot compact when indexes in progress");
    }

    // Capture the specs of every index so they can be rebuilt after the drop below. The
    // iterator lives in its own scope so it is gone before the indexes are dropped.
    std::vector<BSONObj> indexSpecs;
    {
        std::unique_ptr<IndexCatalog::IndexIterator> indexIt(
            catalog->getIndexIterator(opCtx, false));
        while (indexIt->more()) {
            const IndexDescriptor* desc = indexIt->next()->descriptor();

            // Compact always creates the new index in the foreground.
            const BSONObj spec = desc->infoObj().removeField(IndexDescriptor::kBackgroundFieldName);
            const BSONObj keyPattern = spec.getObjectField("key");
            const Status keyStatus =
                index_key_validate::validateKeyPattern(keyPattern, desc->version());
            if (!keyStatus.isOK()) {
                return StatusWith<CompactStats>(
                    ErrorCodes::CannotCreateIndex,
                    str::stream() << "Cannot compact collection due to invalid index " << spec
                                  << ": "
                                  << keyStatus.reason()
                                  << " For more info see"
                                  << " http://dochub.mongodb.org/core/index-validation");
            }
            indexSpecs.push_back(spec);
        }
    }

    // Give a chance to be interrupted *before* we drop all indexes.
    opCtx->checkForInterrupt();

    {
        // note that the drop indexes call also invalidates all clientcursors for the namespace,
        // which is important and wanted here
        WriteUnitOfWork dropUnit(opCtx);
        log() << "compact dropping indexes";
        catalog->dropAllIndexes(opCtx, true);
        dropUnit.commit();
    }

    CompactStats stats;

    MultiIndexBlock indexer(opCtx, collection);
    indexer.allowInterruption();
    indexer.ignoreUniqueConstraint();  // in compact we should be doing no checking

    const Status initStatus = indexer.init(indexSpecs).getStatus();
    if (!initStatus.isOK()) {
        return StatusWith<CompactStats>(initStatus);
    }

    const Status compactStatus = rs->compact(opCtx);
    if (!compactStatus.isOK()) {
        return StatusWith<CompactStats>(compactStatus);
    }

    log() << "starting index commits";
    const Status dumpStatus = indexer.dumpInsertsFromBulk();
    if (!dumpStatus.isOK()) {
        return StatusWith<CompactStats>(dumpStatus);
    }

    {
        WriteUnitOfWork commitUnit(opCtx);
        const Status commitStatus = indexer.commit();
        if (!commitStatus.isOK()) {
            return StatusWith<CompactStats>(commitStatus);
        }
        commitUnit.commit();
    }

    return StatusWith<CompactStats>(stats);
}
/**
 * Copies the documents of '<db>.<shortFrom>' into a newly created capped collection
 * '<db>.<shortTo>' of 'size' bytes (marked temporary when 'temp' is set).
 *
 * The new collection inherits the source collection's options, with "capped", "size" and
 * (optionally) "temp" taking precedence. Leading documents are skipped while the source
 * data is estimated not to fit into the destination.
 *
 * Returns NamespaceNotFound when the source is missing, NamespaceExists when the
 * destination already exists, the status of userCreateNS() when creation fails, and
 * InternalError when the collection scan dies or fails. Document validation is disabled
 * while documents are copied.
 */
Status cloneCollectionAsCapped(OperationContext* txn,
                               Database* db,
                               const std::string& shortFrom,
                               const std::string& shortTo,
                               double size,
                               bool temp) {
    const std::string dbPrefix = db->name() + ".";
    std::string fromNs = dbPrefix + shortFrom;
    std::string toNs = dbPrefix + shortTo;

    Collection* fromCollection = db->getCollection(fromNs);
    if (!fromCollection) {
        return Status(ErrorCodes::NamespaceNotFound,
                      str::stream() << "source collection " << fromNs << " does not exist");
    }

    if (db->getCollection(toNs)) {
        return Status(ErrorCodes::NamespaceExists, "to collection already exists");
    }

    // create new collection
    {
        const auto fromOptions =
            fromCollection->getCatalogEntry()->getCollectionOptions(txn).toBSON();
        OldClientContext ctx(txn, toNs);

        BSONObjBuilder spec;
        spec.appendBool("capped", true);
        spec.append("size", size);
        if (temp) {
            spec.appendBool("temp", true);
        }
        // Fields already set above are not overwritten by the source options.
        spec.appendElementsUnique(fromOptions);

        WriteUnitOfWork createUnit(txn);
        Status createStatus = userCreateNS(txn, ctx.db(), toNs, spec.done());
        if (!createStatus.isOK()) {
            return createStatus;
        }
        createUnit.commit();
    }

    Collection* toCollection = db->getCollection(toNs);
    invariant(toCollection);  // we created above

    // how much data to ignore because it won't fit anyway
    // datasize and extentSize can't be compared exactly, so add some padding to 'size'
    const long long paddedRequested = static_cast<long long>(size * 2);
    const long long paddedAllocated =
        static_cast<long long>(toCollection->getRecordStore()->storageSize(txn) * 2);
    long long allocatedSpaceGuess = std::max(paddedRequested, paddedAllocated);

    long long excessSize = fromCollection->dataSize(txn) - allocatedSpaceGuess;

    boost::scoped_ptr<PlanExecutor> exec(
        InternalPlanner::collectionScan(txn, fromNs, fromCollection, InternalPlanner::FORWARD));

    DisableDocumentValidation validationDisabler(txn);

    while (true) {
        BSONObj doc;
        const PlanExecutor::ExecState state = exec->getNext(&doc, NULL);

        if (state == PlanExecutor::IS_EOF) {
            return Status::OK();
        }

        if (state == PlanExecutor::DEAD) {
            db->dropCollection(txn, toNs);
            return Status(ErrorCodes::InternalError, "executor turned dead while iterating");
        }

        if (state == PlanExecutor::FAILURE) {
            return Status(ErrorCodes::InternalError, "executor error while iterating");
        }

        if (state != PlanExecutor::ADVANCED) {
            // Any other state carries no document; ask the executor again.
            continue;
        }

        if (excessSize > 0) {
            // Still skipping documents that are not expected to fit.
            excessSize -= (4 * doc.objsize());  // 4x is for padding, power of 2, etc...
            continue;
        }

        // NOTE(review): whatever insertDocument() reports is not inspected here.
        WriteUnitOfWork insertUnit(txn);
        toCollection->insertDocument(txn, doc, true, txn->writesAreReplicated());
        insertUnit.commit();
    }

    invariant(false);  // unreachable
}
/**
 * Copies the documents of '<db>.<shortFrom>' into a newly created capped collection
 * '<db>.<shortTo>' of 'size' bytes (marked temporary when 'temp' is set).
 *
 * The new collection inherits the source collection's options, with "capped", "size" and
 * (optionally) "temp" taking precedence. Leading documents are skipped while the source
 * data is estimated not to fit into the destination. Each document insert is retried on
 * WriteConflictException after backing off and abandoning the current snapshot.
 *
 * Returns NamespaceNotFound when the source is missing, NamespaceExists when the
 * destination already exists, or the status of userCreateNS() when creation fails.
 * Document validation is disabled while documents are copied.
 */
Status cloneCollectionAsCapped(OperationContext* txn,
                               Database* db,
                               const std::string& shortFrom,
                               const std::string& shortTo,
                               double size,
                               bool temp) {
    const std::string dbPrefix = db->name() + ".";
    std::string fromNs = dbPrefix + shortFrom;
    std::string toNs = dbPrefix + shortTo;

    Collection* fromCollection = db->getCollection(fromNs);
    if (!fromCollection) {
        return Status(ErrorCodes::NamespaceNotFound,
                      str::stream() << "source collection " << fromNs << " does not exist");
    }

    if (db->getCollection(toNs)) {
        return Status(ErrorCodes::NamespaceExists, "to collection already exists");
    }

    // create new collection
    MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
        const auto fromOptions =
            fromCollection->getCatalogEntry()->getCollectionOptions(txn).toBSON();
        OldClientContext ctx(txn, toNs);

        BSONObjBuilder spec;
        spec.appendBool("capped", true);
        spec.append("size", size);
        if (temp) {
            spec.appendBool("temp", true);
        }
        // Fields already set above are not overwritten by the source options.
        spec.appendElementsUnique(fromOptions);

        WriteUnitOfWork createUnit(txn);
        Status createStatus = userCreateNS(txn, ctx.db(), toNs, spec.done());
        if (!createStatus.isOK()) {
            return createStatus;
        }
        createUnit.commit();
    }
    MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "cloneCollectionAsCapped", fromNs);

    Collection* toCollection = db->getCollection(toNs);
    invariant(toCollection);  // we created above

    // how much data to ignore because it won't fit anyway
    // datasize and extentSize can't be compared exactly, so add some padding to 'size'
    const long long paddedRequested = static_cast<long long>(size * 2);
    const long long paddedAllocated =
        static_cast<long long>(toCollection->getRecordStore()->storageSize(txn) * 2);
    long long allocatedSpaceGuess = std::max(paddedRequested, paddedAllocated);

    long long excessSize = fromCollection->dataSize(txn) - allocatedSpaceGuess;

    std::unique_ptr<PlanExecutor> exec(
        InternalPlanner::collectionScan(txn, fromNs, fromCollection, InternalPlanner::FORWARD));

    exec->setYieldPolicy(PlanExecutor::WRITE_CONFLICT_RETRY_ONLY);

    Snapshotted<BSONObj> objToClone;
    RecordId loc;
    PlanExecutor::ExecState state = PlanExecutor::FAILURE;  // suppress uninitialized warnings

    DisableDocumentValidation validationDisabler(txn);

    int retries = 0;  // non-zero when retrying our last document.
    while (true) {
        // Only advance the scan when the previous document has been dealt with.
        if (retries == 0) {
            state = exec->getNextSnapshotted(&objToClone, &loc);
        }

        if (state == PlanExecutor::IS_EOF) {
            return Status::OK();
        }

        if (state == PlanExecutor::ADVANCED) {
            if (excessSize > 0) {
                // 4x is for padding, power of 2, etc...
                excessSize -= (4 * objToClone.value().objsize());
                continue;
            }
        } else {
            // Unreachable as:
            // 1) We require a read lock (at a minimum) on the "from" collection
            //    and won't yield, preventing collection drop and PlanExecutor::DEAD
            // 2) PlanExecutor::FAILURE is only returned on PlanStage::FAILURE. The
            //    CollectionScan PlanStage does not have a FAILURE scenario.
            // 3) All other PlanExecutor states are handled above
            invariant(false);
        }

        try {
            // Make sure we are working with the latest version of the document.
            const bool snapshotIsStale =
                objToClone.snapshotId() != txn->recoveryUnit()->getSnapshotId();
            if (snapshotIsStale && !fromCollection->findDoc(txn, loc, &objToClone)) {
                // doc was deleted so don't clone it.
                retries = 0;
                continue;
            }

            // NOTE(review): whatever insertDocument() reports is not inspected here.
            WriteUnitOfWork insertUnit(txn);
            toCollection->insertDocument(
                txn, objToClone.value(), true, txn->writesAreReplicated());
            insertUnit.commit();

            // Go to the next document
            retries = 0;
        } catch (const WriteConflictException& wce) {
            CurOp::get(txn)->debug().writeConflicts++;
            retries++;  // logAndBackoff expects this to be 1 on first call.
            wce.logAndBackoff(retries, "cloneCollectionAsCapped", fromNs);

            // Can't use WRITE_CONFLICT_RETRY_LOOP macros since we need to save/restore exec
            // around call to abandonSnapshot.
            exec->saveState();
            txn->recoveryUnit()->abandonSnapshot();
            exec->restoreState(txn);  // Handles any WCEs internally.
        }
    }

    invariant(false);  // unreachable
}
/**
 * Copies the documents of '<db>.<shortFrom>' into a newly created capped collection
 * '<db>.<shortTo>' of 'size' bytes (marked temporary when 'temp' is set).
 *
 * The new collection is created from the source collection's options with the UUID
 * cleared and the capped settings applied. Leading documents are skipped while the source
 * data is estimated not to fit into the destination. Each document insert is retried on
 * WriteConflictException after backing off and abandoning the current snapshot.
 *
 * Returns CommandNotSupportedOnView when the source is a view, NamespaceNotFound when it
 * is missing or drop-pending, NamespaceExists when the destination already exists, the
 * status of createCollection() when creation fails, or the status of restoring the
 * executor after a write conflict. A failed insert is raised through uassertStatusOK().
 */
mongo::Status mongo::cloneCollectionAsCapped(OperationContext* opCtx,
                                             Database* db,
                                             const std::string& shortFrom,
                                             const std::string& shortTo,
                                             long long size,
                                             bool temp) {
    NamespaceString fromNss(db->name(), shortFrom);
    NamespaceString toNss(db->name(), shortTo);

    Collection* fromCollection = db->getCollection(opCtx, fromNss);
    if (!fromCollection) {
        const bool sourceIsView = db->getViewCatalog()->lookup(opCtx, fromNss.ns()) ? true : false;
        if (sourceIsView) {
            return Status(ErrorCodes::CommandNotSupportedOnView,
                          str::stream() << "cloneCollectionAsCapped not supported for views: "
                                        << fromNss.ns());
        }

        return Status(ErrorCodes::NamespaceNotFound,
                      str::stream() << "source collection " << fromNss.ns() << " does not exist");
    }

    if (fromNss.isDropPendingNamespace()) {
        return Status(ErrorCodes::NamespaceNotFound,
                      str::stream() << "source collection " << fromNss.ns()
                                    << " is currently in a drop-pending state.");
    }

    if (db->getCollection(opCtx, toNss)) {
        return Status(ErrorCodes::NamespaceExists,
                      str::stream() << "cloneCollectionAsCapped failed - destination collection "
                                    << toNss.ns()
                                    << " already exists. source collection: "
                                    << fromNss.ns());
    }

    // create new collection
    {
        auto cappedOptions = fromCollection->getCatalogEntry()->getCollectionOptions(opCtx);
        // The capped collection will get its own new unique id, as the conversion isn't
        // reversible, so it can't be rolled back.
        cappedOptions.uuid.reset();
        cappedOptions.capped = true;
        cappedOptions.cappedSize = size;
        if (temp) {
            cappedOptions.temp = true;
        }

        BSONObjBuilder createCmd;
        createCmd.append("create", toNss.coll());
        createCmd.appendElements(cappedOptions.toBSON());

        Status createStatus = createCollection(opCtx, toNss.db().toString(), createCmd.done());
        if (!createStatus.isOK()) {
            return createStatus;
        }
    }

    Collection* toCollection = db->getCollection(opCtx, toNss);
    invariant(toCollection);  // we created above

    // how much data to ignore because it won't fit anyway
    // datasize and extentSize can't be compared exactly, so add some padding to 'size'
    const long long paddedRequested = static_cast<long long>(size * 2);
    const long long paddedAllocated =
        static_cast<long long>(toCollection->getRecordStore()->storageSize(opCtx) * 2);
    long long allocatedSpaceGuess = std::max(paddedRequested, paddedAllocated);

    long long excessSize = fromCollection->dataSize(opCtx) - allocatedSpaceGuess;

    auto exec = InternalPlanner::collectionScan(opCtx,
                                                fromNss.ns(),
                                                fromCollection,
                                                PlanExecutor::WRITE_CONFLICT_RETRY_ONLY,
                                                InternalPlanner::FORWARD);

    Snapshotted<BSONObj> objToClone;
    RecordId loc;
    PlanExecutor::ExecState state = PlanExecutor::FAILURE;  // suppress uninitialized warnings

    DisableDocumentValidation validationDisabler(opCtx);

    int retries = 0;  // non-zero when retrying our last document.
    while (true) {
        // Only advance the scan when the previous document has been dealt with.
        if (retries == 0) {
            state = exec->getNextSnapshotted(&objToClone, &loc);
        }

        if (state == PlanExecutor::IS_EOF) {
            return Status::OK();
        }

        if (state == PlanExecutor::ADVANCED) {
            if (excessSize > 0) {
                // 4x is for padding, power of 2, etc...
                excessSize -= (4 * objToClone.value().objsize());
                continue;
            }
        } else {
            // Unreachable as:
            // 1) We require a read lock (at a minimum) on the "from" collection
            //    and won't yield, preventing collection drop and PlanExecutor::DEAD
            // 2) PlanExecutor::FAILURE is only returned on PlanStage::FAILURE. The
            //    CollectionScan PlanStage does not have a FAILURE scenario.
            // 3) All other PlanExecutor states are handled above
            MONGO_UNREACHABLE;
        }

        try {
            // Make sure we are working with the latest version of the document.
            const bool snapshotIsStale =
                objToClone.snapshotId() != opCtx->recoveryUnit()->getSnapshotId();
            if (snapshotIsStale && !fromCollection->findDoc(opCtx, loc, &objToClone)) {
                // doc was deleted so don't clone it.
                retries = 0;
                continue;
            }

            WriteUnitOfWork insertUnit(opCtx);
            OpDebug* const nullOpDebug = nullptr;
            uassertStatusOK(toCollection->insertDocument(
                opCtx, InsertStatement(objToClone.value()), nullOpDebug, true));
            insertUnit.commit();

            // Go to the next document
            retries = 0;
        } catch (const WriteConflictException&) {
            CurOp::get(opCtx)->debug().additiveMetrics.incrementWriteConflicts(1);
            retries++;  // logAndBackoff expects this to be 1 on first call.
            WriteConflictException::logAndBackoff(
                retries, "cloneCollectionAsCapped", fromNss.ns());

            // Can't use writeConflictRetry since we need to save/restore exec around call to
            // abandonSnapshot.
            exec->saveState();
            opCtx->recoveryUnit()->abandonSnapshot();
            auto restoreStatus = exec->restoreState();  // Handles any WCEs internally.
            if (!restoreStatus.isOK()) {
                return restoreStatus;
            }
        }
    }

    MONGO_UNREACHABLE;
}
/**
 * Renames collection 'source' to 'target' while holding the global write lock.
 *
 * Within a single database the catalog entry is renamed in place. Across databases the
 * target collection is created, every index spec and document is copied over, and the
 * source collection is dropped afterwards; the partially built target is dropped again
 * if any step of the copy fails.
 *
 * @param txn        operation context; document validation is disabled for the call.
 * @param source     namespace of the existing collection.
 * @param target     namespace the collection should end up with.
 * @param dropTarget when true an existing target collection is dropped first; otherwise
 *                   an existing target yields NamespaceExists.
 * @param stayTemp   forwarded to Database::renameCollection() and the op observer.
 *
 * @return OK on success; NotMaster when replicated writes are requested but this node
 *         cannot accept writes for 'source'; NamespaceNotFound when the source does not
 *         exist; InvalidLength when the target name (allowing for index names) is too
 *         long; NamespaceExists as described above; OutOfDiskSpace when the target
 *         collection cannot be created; or the first non-OK status reported by a
 *         drop/rename/index/insert step.
 */
Status renameCollection(OperationContext* txn,
                        const NamespaceString& source,
                        const NamespaceString& target,
                        bool dropTarget,
                        bool stayTemp) {
    DisableDocumentValidation validationDisabler(txn);

    ScopedTransaction transaction(txn, MODE_X);
    Lock::GlobalWrite globalWriteLock(txn->lockState());
    // We stay in source context the whole time. This is mostly to set the CurOp namespace.
    OldClientContext ctx(txn, source.ns());

    bool userInitiatedWritesAndNotPrimary = txn->writesAreReplicated() &&
        !repl::getGlobalReplicationCoordinator()->canAcceptWritesFor(source);

    if (userInitiatedWritesAndNotPrimary) {
        return Status(ErrorCodes::NotMaster,
                      str::stream() << "Not primary while renaming collection " << source.ns()
                                    << " to " << target.ns());
    }

    Database* const sourceDB = dbHolder().get(txn, source.db());
    Collection* const sourceColl = sourceDB ? sourceDB->getCollection(source.ns()) : nullptr;
    if (!sourceColl) {
        return Status(ErrorCodes::NamespaceNotFound, "source namespace does not exist");
    }

    {
        // Ensure that collection name does not exceed maximum length.
        // Ensure that index names do not push the length over the max.
        // Iterator includes unfinished indexes.
        IndexCatalog::IndexIterator sourceIndIt =
            sourceColl->getIndexCatalog()->getIndexIterator(txn, true);
        int longestIndexNameLength = 0;
        while (sourceIndIt.more()) {
            int thisLength = sourceIndIt.next()->indexName().length();
            if (thisLength > longestIndexNameLength)
                longestIndexNameLength = thisLength;
        }

        unsigned int longestAllowed =
            std::min(int(NamespaceString::MaxNsCollectionLen),
                     int(NamespaceString::MaxNsLen) - 2 /*strlen(".$")*/ - longestIndexNameLength);
        if (target.size() > longestAllowed) {
            StringBuilder sb;
            sb << "collection name length of " << target.size() << " exceeds maximum length of "
               << longestAllowed << ", allowing for index names";
            return Status(ErrorCodes::InvalidLength, sb.str());
        }
    }

    BackgroundOperation::assertNoBgOpInProgForNs(source.ns());

    Database* const targetDB = dbHolder().openDb(txn, target.db());

    {
        // Every early return in this scope leaves 'wunit' uncommitted.
        WriteUnitOfWork wunit(txn);

        // Check if the target namespace exists and if dropTarget is true.
        // If target exists and dropTarget is not true, return false.
        if (targetDB->getCollection(target)) {
            if (!dropTarget) {
                return Status(ErrorCodes::NamespaceExists, "target namespace exists");
            }

            Status s = targetDB->dropCollection(txn, target.ns());
            if (!s.isOK()) {
                return s;
            }
        }

        // If we are renaming in the same database, just
        // rename the namespace and we're done.
        if (sourceDB == targetDB) {
            Status s = targetDB->renameCollection(txn, source.ns(), target.ns(), stayTemp);
            if (!s.isOK()) {
                return s;
            }

            getGlobalServiceContext()->getOpObserver()->onRenameCollection(
                txn, NamespaceString(source), NamespaceString(target), dropTarget, stayTemp);

            wunit.commit();
            return Status::OK();
        }

        wunit.commit();
    }

    // If we get here, we are renaming across databases, so we must copy all the data and
    // indexes, then remove the source collection.

    // Create the target collection. It will be removed if we fail to copy the collection.
    // TODO use a temp collection and unset the temp flag on success.
    Collection* targetColl = nullptr;
    {
        CollectionOptions options = sourceColl->getCatalogEntry()->getCollectionOptions(txn);

        WriteUnitOfWork wunit(txn);

        // No logOp necessary because the entire renameCollection command is one logOp.
        bool shouldReplicateWrites = txn->writesAreReplicated();
        txn->setReplicatedWrites(false);
        targetColl = targetDB->createCollection(txn,
                                                target.ns(),
                                                options,
                                                false);  // _id index build with others later.
        txn->setReplicatedWrites(shouldReplicateWrites);
        if (!targetColl) {
            return Status(ErrorCodes::OutOfDiskSpace, "Failed to create target collection.");
        }

        wunit.commit();
    }

    // Dismissed on success
    ScopeGuard targetCollectionDropper = MakeGuard(dropCollection, txn, targetDB, target.ns());

    MultiIndexBlock indexer(txn, targetColl);
    indexer.allowInterruption();

    // Copy the index descriptions from the source collection, adjusting the ns field.
    {
        std::vector<BSONObj> indexesToCopy;
        IndexCatalog::IndexIterator sourceIndIt =
            sourceColl->getIndexCatalog()->getIndexIterator(txn, true);
        while (sourceIndIt.more()) {
            const BSONObj currIndex = sourceIndIt.next()->infoObj();

            // Process the source index. "ns" is appended first so the source spec's own
            // "ns" is skipped by appendElementsUnique().
            BSONObjBuilder newIndex;
            newIndex.append("ns", target.ns());
            newIndex.appendElementsUnique(currIndex);
            indexesToCopy.push_back(newIndex.obj());
        }

        // A failure to set up the index builds must abort the rename; continuing would
        // copy the data and drop the source without the indexes having been initialized.
        // Returning here lets 'targetCollectionDropper' remove the new target collection.
        Status initStatus = indexer.init(indexesToCopy);
        if (!initStatus.isOK()) {
            return initStatus;
        }
    }

    {
        // Copy over all the data from source collection to target collection.
        auto cursor = sourceColl->getCursor(txn);
        while (auto record = cursor->next()) {
            txn->checkForInterrupt();

            const auto obj = record->data.releaseToBson();

            WriteUnitOfWork wunit(txn);
            // No logOp necessary because the entire renameCollection command is one logOp.
            bool shouldReplicateWrites = txn->writesAreReplicated();
            txn->setReplicatedWrites(false);
            Status status = targetColl->insertDocument(txn, obj, &indexer, true);
            txn->setReplicatedWrites(shouldReplicateWrites);
            if (!status.isOK())
                return status;
            wunit.commit();
        }
    }

    Status status = indexer.doneInserting();
    if (!status.isOK())
        return status;

    {
        // Getting here means we successfully built the target copy. We now remove the
        // source collection and finalize the rename.
        WriteUnitOfWork wunit(txn);

        bool shouldReplicateWrites = txn->writesAreReplicated();
        txn->setReplicatedWrites(false);
        Status status = sourceDB->dropCollection(txn, source.ns());
        txn->setReplicatedWrites(shouldReplicateWrites);
        if (!status.isOK())
            return status;

        indexer.commit();

        getGlobalServiceContext()->getOpObserver()->onRenameCollection(
            txn, NamespaceString(source), NamespaceString(target), dropTarget, stayTemp);

        wunit.commit();
    }

    targetCollectionDropper.Dismiss();
    return Status::OK();
}