virtual Status explain(OperationContext* txn,
                       const std::string& dbname,
                       const BSONObj& cmdObj,
                       ExplainCommon::Verbosity verbosity,
                       const rpc::ServerSelectionMetadata&,
                       BSONObjBuilder* out) const {
    const string ns = parseNs(dbname, cmdObj);

    AutoGetCollectionForRead ctx(txn, ns);
    Collection* collection = ctx.getCollection();

    StatusWith<unique_ptr<PlanExecutor>> executor =
        getPlanExecutor(txn, collection, ns, cmdObj, true);
    if (!executor.isOK()) {
        return executor.getStatus();
    }

    Explain::explainStages(executor.getValue().get(), verbosity, out);
    return Status::OK();
}
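// The snippets in this section all follow the same StatusWith<T> error-handling
// contract: check isOK(), propagate getStatus() on failure, and only call
// getValue() on success. A minimal, self-contained sketch of that contract
// (StatusWithLite is a simplified hypothetical stand-in, not MongoDB's actual
// mongo::StatusWith implementation):
#include <iostream>
#include <optional>
#include <string>

template <typename T>
class StatusWithLite {  // hypothetical stand-in for mongo::StatusWith<T>
public:
    StatusWithLite(T value) : _value(std::move(value)) {}
    StatusWithLite(std::string error) : _error(std::move(error)) {}
    bool isOK() const { return _value.has_value(); }
    const std::string& getStatus() const { return _error; }
    const T& getValue() const { return *_value; }  // only valid when isOK()
private:
    std::optional<T> _value;
    std::string _error;
};

StatusWithLite<int> parsePort(const std::string& s) {
    if (s.empty()) return StatusWithLite<int>("empty port string");
    return StatusWithLite<int>(std::stoi(s));
}

int main() {
    StatusWithLite<int> port = parsePort("27017");
    if (!port.isOK()) {  // propagate the error, exactly as the snippets do
        std::cerr << port.getStatus() << '\n';
        return 1;
    }
    std::cout << "port: " << port.getValue() << '\n';  // safe: isOK() checked
}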
StatusWith<DiskLoc> RecordStoreV1Base::updateRecord( OperationContext* txn,
                                                     const DiskLoc& oldLocation,
                                                     const char* data,
                                                     int dataSize,
                                                     bool enforceQuota,
                                                     UpdateMoveNotifier* notifier ) {
    Record* oldRecord = recordFor( oldLocation );
    if ( oldRecord->netLength() >= dataSize ) {
        // we fit
        _paddingFits( txn );
        memcpy( txn->recoveryUnit()->writingPtr( oldRecord->data(), dataSize ), data, dataSize );
        return StatusWith<DiskLoc>( oldLocation );
    }

    if ( isCapped() )
        return StatusWith<DiskLoc>( ErrorCodes::InternalError,
                                    "failing update: objects in a capped ns cannot grow",
                                    10003 );

    // we have to move
    _paddingTooSmall( txn );

    StatusWith<DiskLoc> newLocation = _insertRecord( txn, data, dataSize, enforceQuota );
    if ( !newLocation.isOK() )
        return newLocation;

    // insert worked, so we delete old record
    if ( notifier ) {
        Status moveStatus = notifier->recordStoreGoingToMove( txn,
                                                              oldLocation,
                                                              oldRecord->data(),
                                                              oldRecord->netLength() );
        if ( !moveStatus.isOK() )
            return StatusWith<DiskLoc>( moveStatus );
    }

    deleteRecord( txn, oldLocation );

    return newLocation;
}
StatusWith<std::string> KVCatalog::newOrphanedIdent(OperationContext* opCtx, std::string ident) {
    // The collection will be named local.orphan.xxxxx.
    std::string identNs = ident;
    std::replace(identNs.begin(), identNs.end(), '-', '_');
    std::string ns = NamespaceString(NamespaceString::kOrphanCollectionDb,
                                     NamespaceString::kOrphanCollectionPrefix + identNs)
                         .ns();

    stdx::lock_guard<stdx::mutex> lk(_identsLock);
    Entry& old = _idents[ns];
    if (!old.ident.empty()) {
        return Status(ErrorCodes::NamespaceExists,
                      str::stream() << ns << " already exists in the catalog");
    }
    opCtx->recoveryUnit()->registerChange(new AddIdentChange(this, ns));

    // Generate a new UUID for the orphaned collection.
    CollectionOptions optionsWithUUID;
    optionsWithUUID.uuid.emplace(CollectionUUID::gen());
    BSONObj obj;
    {
        BSONObjBuilder b;
        b.append("ns", ns);
        b.append("ident", ident);
        BSONCollectionCatalogEntry::MetaData md;
        md.ns = ns;
        // Default options with newly generated UUID.
        md.options = optionsWithUUID;
        // Not Prefixed.
        md.prefix = KVPrefix::kNotPrefixed;
        b.append("md", md.toBSON());
        obj = b.obj();
    }
    StatusWith<RecordId> res = _rs->insertRecord(opCtx, obj.objdata(), obj.objsize(), Timestamp());
    if (!res.isOK())
        return res.getStatus();

    old = Entry(ident, res.getValue());
    LOG(1) << "stored meta data for orphaned collection " << ns << " @ " << res.getValue();
    return StatusWith<std::string>(std::move(ns));
}
Status KVCatalog::newCollection(OperationContext* opCtx,
                                StringData ns,
                                const CollectionOptions& options) {
    invariant(opCtx->lockState() == NULL ||
              opCtx->lockState()->isDbLockedForMode(nsToDatabaseSubstring(ns), MODE_X));

    std::unique_ptr<Lock::ResourceLock> rLk;
    if (!_isRsThreadSafe && opCtx->lockState()) {
        rLk.reset(new Lock::ResourceLock(opCtx->lockState(), resourceIdCatalogMetadata, MODE_X));
    }

    const string ident = _newUniqueIdent(ns, "collection");

    stdx::lock_guard<stdx::mutex> lk(_identsLock);
    Entry& old = _idents[ns.toString()];
    if (!old.ident.empty()) {
        return Status(ErrorCodes::NamespaceExists, "collection already exists");
    }

    opCtx->recoveryUnit()->registerChange(new AddIdentChange(this, ns));

    BSONObj obj;
    {
        BSONObjBuilder b;
        b.append("ns", ns);
        b.append("ident", ident);
        BSONCollectionCatalogEntry::MetaData md;
        md.ns = ns.toString();
        md.options = options;
        b.append("md", md.toBSON());
        obj = b.obj();
    }

    StatusWith<RecordId> res = _rs->insertRecord(opCtx, obj.objdata(), obj.objsize(), false);
    if (!res.isOK())
        return res.getStatus();

    old = Entry(ident, res.getValue());
    LOG(1) << "stored meta data for " << ns << " @ " << res.getValue();
    return Status::OK();
}
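// Both KVCatalog functions above pair the in-memory _idents mutation with
// recoveryUnit()->registerChange(new AddIdentChange(...)), so the entry is
// undone if the storage transaction aborts. A minimal, self-contained sketch
// of that rollback-handler idiom (SimpleChange, SimpleRecoveryUnit, and
// AddIdentChangeLite are hypothetical stand-ins, not MongoDB's RecoveryUnit
// interface):
#include <map>
#include <memory>
#include <string>
#include <vector>

struct SimpleChange {  // stand-in for a RecoveryUnit change hook
    virtual ~SimpleChange() {}
    virtual void commit() = 0;
    virtual void rollback() = 0;
};

class SimpleRecoveryUnit {
public:
    void registerChange(SimpleChange* c) { _changes.emplace_back(c); }
    void commitUnit() {
        for (auto& c : _changes) c->commit();
        _changes.clear();
    }
    void abortUnit() {  // roll back in reverse registration order
        for (auto it = _changes.rbegin(); it != _changes.rend(); ++it) (*it)->rollback();
        _changes.clear();
    }
private:
    std::vector<std::unique_ptr<SimpleChange>> _changes;
};

std::map<std::string, std::string> gIdents;  // stand-in for KVCatalog::_idents

struct AddIdentChangeLite : SimpleChange {  // analogous to AddIdentChange
    explicit AddIdentChangeLite(std::string ns) : _ns(std::move(ns)) {}
    void commit() override {}
    void rollback() override { gIdents.erase(_ns); }  // undo the map insert
    std::string _ns;
};

int main() {
    SimpleRecoveryUnit ru;
    gIdents["test.foo"] = "collection-0";
    ru.registerChange(new AddIdentChangeLite("test.foo"));
    ru.abortUnit();  // simulated transaction abort: the catalog entry is removed
    return gIdents.count("test.foo");  // 0
}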
void ReplicationCoordinatorImpl::_handleHeartbeatResponseAction(
    const HeartbeatResponseAction& action,
    const StatusWith<ReplSetHeartbeatResponse>& responseStatus) {
    switch (action.getAction()) {
        case HeartbeatResponseAction::NoAction:
            // Update the cached member state if different than the current topology member state
            if (_memberState != _topCoord->getMemberState()) {
                stdx::unique_lock<stdx::mutex> lk(_mutex);
                const PostMemberStateUpdateAction postUpdateAction =
                    _updateMemberStateFromTopologyCoordinator_inlock();
                lk.unlock();
                _performPostMemberStateUpdateAction(postUpdateAction);
            }
            break;
        case HeartbeatResponseAction::Reconfig:
            invariant(responseStatus.isOK());
            _scheduleHeartbeatReconfig(responseStatus.getValue().getConfig());
            break;
        case HeartbeatResponseAction::StartElection:
            if (isV1ElectionProtocol()) {
                _startElectSelfV1();
            } else {
                _startElectSelf();
            }
            break;
        case HeartbeatResponseAction::StepDownSelf:
            invariant(action.getPrimaryConfigIndex() == _selfIndex);
            log() << "Stepping down from primary in response to heartbeat";
            _stepDownStart();
            break;
        case HeartbeatResponseAction::StepDownRemotePrimary: {
            invariant(action.getPrimaryConfigIndex() != _selfIndex);
            _requestRemotePrimaryStepdown(
                _rsConfig.getMemberAt(action.getPrimaryConfigIndex()).getHostAndPort());
            break;
        }
        default:
            severe() << "Illegal heartbeat response action code " << int(action.getAction());
            invariant(false);
    }
}
// Goes over the request and preprocesses normalized versions of all the inserts in the request
static void normalizeInserts( const BatchedCommandRequest& request,
                              vector<StatusWith<BSONObj> >* normalizedInserts,
                              vector<PregeneratedKeys>* pregen ) {
    normalizedInserts->reserve(request.sizeWriteOps());
    for ( size_t i = 0; i < request.sizeWriteOps(); ++i ) {
        BSONObj insertDoc = request.getInsertRequest()->getDocumentsAt( i );
        StatusWith<BSONObj> normalInsert = fixDocumentForInsert( insertDoc );
        normalizedInserts->push_back( normalInsert );
        if ( request.getOrdered() && !normalInsert.isOK() )
            break;

        if ( !normalInsert.getValue().isEmpty() )
            insertDoc = normalInsert.getValue();

        pregen->push_back( PregeneratedKeys() );
        GeneratorHolder::getInstance()->prepare( request.getTargetingNS(),
                                                 insertDoc,
                                                 &pregen->back() );
    }
}
BtreeLogicTestHelper<OnDiskFormat>::BtreeLogicTestHelper(const BSONObj& order)
    : recordStore("TestRecordStore"),
      btree(&headManager,
            &recordStore,
            &cursorRegistry,
            Ordering::make(order),
            "TestIndex",
            /*isUnique*/ false) {
    static const string randomData("RandomStuff");

    // Generate a valid record location for a "fake" record, which we will repeatedly use
    // throughout the tests.
    OperationContextNoop opCtx;
    StatusWith<RecordId> s =
        recordStore.insertRecord(&opCtx, randomData.c_str(), randomData.length(), false);

    ASSERT_TRUE(s.isOK());
    ASSERT_EQUALS(1, recordStore.numRecords(NULL));

    dummyDiskLoc = DiskLoc::fromRecordId(s.getValue());
}
StatusWith<DiskLoc> Collection::insertDocument( OperationContext* txn,
                                                const BSONObj& doc,
                                                MultiIndexBlock& indexBlock ) {
    StatusWith<DiskLoc> loc = _recordStore->insertRecord( txn,
                                                          doc.objdata(),
                                                          doc.objsize(),
                                                          0 );

    if ( !loc.isOK() )
        return loc;

    InsertDeleteOptions indexOptions;
    indexOptions.logIfError = false;
    indexOptions.dupsAllowed = true; // in repair we should be doing no checking

    Status status = indexBlock.insert( doc, loc.getValue(), indexOptions );
    if ( !status.isOK() )
        return StatusWith<DiskLoc>( status );

    return loc;
}
/**
 * Perform a single insert into a collection.  Requires the insert be preprocessed and the
 * collection already has been created.
 *
 * Might fault or error, otherwise populates the result.
 */
static void singleInsert( OperationContext* txn,
                          const BSONObj& docToInsert,
                          Collection* collection,
                          WriteOpResult* result ) {
    const string& insertNS = collection->ns().ns();

    txn->lockState()->assertWriteLocked( insertNS );

    WriteUnitOfWork wunit(txn);
    StatusWith<DiskLoc> status = collection->insertDocument( txn, docToInsert, true );

    if ( !status.isOK() ) {
        result->setError(toWriteError(status.getStatus()));
    }
    else {
        repl::logOp( txn, "i", insertNS.c_str(), docToInsert );
        result->getStats().n = 1;
        wunit.commit();
    }
}
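// singleInsert relies on WriteUnitOfWork's RAII contract: the work becomes
// durable only if commit() is reached, and anything else (error path,
// exception) rolls back when the object is destroyed. A minimal,
// self-contained sketch of that commit-or-rollback idiom (ScopedUnitOfWork is
// a hypothetical stand-in, not MongoDB's WriteUnitOfWork):
#include <functional>
#include <iostream>

class ScopedUnitOfWork {
public:
    explicit ScopedUnitOfWork(std::function<void()> rollbackFn)
        : _rollback(std::move(rollbackFn)) {}
    void commit() { _committed = true; }
    ~ScopedUnitOfWork() {
        if (!_committed) _rollback();  // error path or exception: undo the work
    }
private:
    std::function<void()> _rollback;
    bool _committed = false;
};

int main() {
    int docCount = 0;
    {
        ScopedUnitOfWork wunit([&] { --docCount; });
        ++docCount;             // the "insert"
        bool insertOK = false;  // simulate a failed insert
        if (insertOK)
            wunit.commit();     // only the success path commits
    }                           // destructor rolls the insert back
    std::cout << "docCount after failed insert: " << docCount << '\n';  // 0
}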
StatusWith<ReplicationExecutor::CallbackHandle>
ReplicationExecutor::scheduleWorkWithGlobalExclusiveLock(const CallbackFn& work) {
    boost::lock_guard<boost::mutex> lk(_mutex);
    StatusWith<CallbackHandle> handle = enqueueWork_inlock(&_exclusiveLockInProgressQueue, work);
    if (handle.isOK()) {
        const stdx::function<void (OperationContext*)> doOp = stdx::bind(
            &ReplicationExecutor::doOperationWithGlobalExclusiveLock,
            this,
            stdx::placeholders::_1,
            handle.getValue());
        _dblockWorkers.schedule(makeNoExcept(stdx::bind(
            &NetworkInterface::runCallbackWithGlobalExclusiveLock,
            _networkInterface.get(),
            doOp)));
    }
    return handle;
}
Status ReplicationCoordinatorExternalStateImpl::storeLocalLastVoteDocument(
    OperationContext* opCtx, const LastVote& lastVote) {
    BSONObj lastVoteObj = lastVote.toBSON();
    try {
        Status status =
            writeConflictRetry(opCtx, "save replica set lastVote", lastVoteCollectionName, [&] {
                Lock::DBLock dbWriteLock(opCtx, lastVoteDatabaseName, MODE_X);

                // If there is no last vote document, we want to store one. Otherwise, we only want
                // to replace it if the new last vote document would have a higher term. We both
                // check the term of the current last vote document and insert the new document
                // under the DBLock to synchronize the two operations.
                BSONObj result;
                bool exists = Helpers::getSingleton(opCtx, lastVoteCollectionName, result);
                if (!exists) {
                    Helpers::putSingleton(opCtx, lastVoteCollectionName, lastVoteObj);
                } else {
                    StatusWith<LastVote> oldLastVoteDoc = LastVote::readFromLastVote(result);
                    if (!oldLastVoteDoc.isOK()) {
                        return oldLastVoteDoc.getStatus();
                    }
                    if (lastVote.getTerm() > oldLastVoteDoc.getValue().getTerm()) {
                        Helpers::putSingleton(opCtx, lastVoteCollectionName, lastVoteObj);
                    }
                }

                return Status::OK();
            });

        if (!status.isOK()) {
            return status;
        }

        opCtx->recoveryUnit()->waitUntilDurable();

        return Status::OK();
    } catch (const DBException& ex) {
        return ex.toStatus();
    }
}
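// storeLocalLastVoteDocument wraps its write in writeConflictRetry, which keeps
// re-running the lambda while it fails with a transient write conflict. A
// minimal, self-contained sketch of that retry-loop idiom (retryOnConflict and
// ConflictError are hypothetical stand-ins for MongoDB's writeConflictRetry and
// WriteConflictException):
#include <iostream>
#include <stdexcept>

struct ConflictError : std::runtime_error {
    ConflictError() : std::runtime_error("write conflict") {}
};

template <typename Fn>
auto retryOnConflict(Fn&& work) -> decltype(work()) {
    while (true) {
        try {
            return work();  // success (or a non-conflict error) ends the loop
        } catch (const ConflictError&) {
            // transient conflict: another writer won the race, try again
        }
    }
}

int main() {
    int attempts = 0;
    int value = retryOnConflict([&] {
        if (++attempts < 3) throw ConflictError();  // fail twice, then succeed
        return 42;
    });
    std::cout << "succeeded with " << value << " after " << attempts << " attempts\n";
}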
StatusWith<SettingsType> CatalogManagerReplicaSet::getGlobalSettings(OperationContext* txn,
                                                                     const string& key) {
    const auto configShard = grid.shardRegistry()->getShard(txn, "config");
    const auto readHost = configShard->getTargeter()->findHost(kConfigReadSelector);
    if (!readHost.isOK()) {
        return readHost.getStatus();
    }

    auto findStatus = _exhaustiveFindOnConfig(readHost.getValue(),
                                              NamespaceString(SettingsType::ConfigNS),
                                              BSON(SettingsType::key(key)),
                                              BSONObj(),
                                              1);
    if (!findStatus.isOK()) {
        return findStatus.getStatus();
    }

    const auto& docs = findStatus.getValue().value;
    if (docs.empty()) {
        return {ErrorCodes::NoMatchingDocument,
                str::stream() << "can't find settings document with key: " << key};
    }

    BSONObj settingsDoc = docs.front();
    StatusWith<SettingsType> settingsResult = SettingsType::fromBSON(settingsDoc);
    if (!settingsResult.isOK()) {
        return {ErrorCodes::FailedToParse,
                str::stream() << "error while parsing settings document: " << settingsDoc << " : "
                              << settingsResult.getStatus().toString()};
    }

    const SettingsType& settings = settingsResult.getValue();

    Status validationStatus = settings.validate();
    if (!validationStatus.isOK()) {
        return validationStatus;
    }

    return settingsResult;
}
Status EphemeralForTestRecordStore::insertRecordsWithDocWriter(OperationContext* opCtx,
                                                               const DocWriter* const* docs,
                                                               const Timestamp*,
                                                               size_t nDocs,
                                                               RecordId* idsOut) {
    stdx::lock_guard<stdx::recursive_mutex> lock(_data->recordsMutex);

    for (size_t i = 0; i < nDocs; i++) {
        const int len = docs[i]->documentSize();
        if (_isCapped && len > _cappedMaxSize) {
            // We use dataSize for capped rollover and we don't want to delete everything if we
            // know this won't fit.
            return Status(ErrorCodes::BadValue, "object to insert exceeds cappedMaxSize");
        }

        EphemeralForTestRecord rec(len);
        docs[i]->writeDocument(rec.data.get());

        RecordId loc;
        if (_data->isOplog) {
            StatusWith<RecordId> status = extractAndCheckLocForOplog(rec.data.get(), len);
            if (!status.isOK())
                return status.getStatus();
            loc = status.getValue();
        } else {
            loc = allocateLoc();
        }

        opCtx->recoveryUnit()->registerChange(new InsertChange(opCtx, _data, loc));
        _data->dataSize += len;
        _data->records[loc] = rec;

        cappedDeleteAsNeeded_inlock(opCtx);

        if (idsOut)
            idsOut[i] = loc;
    }

    return Status::OK();
}
bool WriteCmd::run(OperationContext* txn,
                   const string& dbName,
                   BSONObj& cmdObj,
                   int options,
                   string& errMsg,
                   BSONObjBuilder& result) {
    // Can't be run on secondaries.
    dassert(txn->writesAreReplicated());
    BatchedCommandRequest request(_writeType);
    BatchedCommandResponse response;

    if (!request.parseBSON(cmdObj, &errMsg) || !request.isValid(&errMsg)) {
        return appendCommandStatus(result, Status(ErrorCodes::FailedToParse, errMsg));
    }

    // Note that this is a runCommand, and therefore, the database and the collection name
    // are in different parts of the grammar for the command. But it's more convenient to
    // work with a NamespaceString. We build it here and replace it in the parsed command.
    // Internally, everything works with the namespace string as opposed to just the
    // collection name.
    NamespaceString nss(dbName, request.getNS());
    request.setNSS(nss);

    StatusWith<WriteConcernOptions> wcStatus = extractWriteConcern(cmdObj);

    if (!wcStatus.isOK()) {
        return appendCommandStatus(result, wcStatus.getStatus());
    }
    txn->setWriteConcern(wcStatus.getValue());

    WriteBatchExecutor writeBatchExecutor(
        txn, &globalOpCounters, &LastError::get(txn->getClient()));

    writeBatchExecutor.executeBatch(request, &response);

    result.appendElements(response.toBSON());
    return response.getOk();
}
StatusWith<RecordId> Collection::insertDocument(OperationContext* txn,
                                                const BSONObj& docToInsert,
                                                bool enforceQuota,
                                                bool fromMigrate) {
    {
        auto status = checkValidation(txn, docToInsert);
        if (!status.isOK())
            return status;
    }

    const SnapshotId sid = txn->recoveryUnit()->getSnapshotId();

    if (_indexCatalog.findIdIndex(txn)) {
        if (docToInsert["_id"].eoo()) {
            return StatusWith<RecordId>(ErrorCodes::InternalError,
                                        str::stream()
                                            << "Collection::insertDocument got "
                                               "document without _id for ns:" << _ns.ns());
        }
    }

    if (_mustTakeCappedLockOnInsert)
        synchronizeOnCappedInFlightResource(txn->lockState());

    StatusWith<RecordId> res = _insertDocument(txn, docToInsert, enforceQuota);
    invariant(sid == txn->recoveryUnit()->getSnapshotId());
    if (res.isOK()) {
        getGlobalServiceContext()->getOpObserver()->onInsert(txn, ns(), docToInsert, fromMigrate);

        // If there is a notifier object and another thread is waiting on it, then we notify
        // waiters of this document insert. Waiters keep a shared_ptr to '_cappedNotifier', so
        // there are waiters if this Collection's shared_ptr is not unique.
        if (_cappedNotifier && !_cappedNotifier.unique()) {
            _cappedNotifier->notifyOfInsert();
        }
    }

    return res;
}
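// The _cappedNotifier check above uses shared_ptr ownership counting as a
// cheap "does anyone else hold this?" test: waiters keep their own shared_ptr
// copy, so a use count above one means someone is waiting. A minimal,
// self-contained sketch of that idiom (Notifier is a hypothetical stand-in for
// CappedInsertNotifier; use_count() > 1 is the same test as !unique()):
#include <iostream>
#include <memory>

struct Notifier {
    void notifyOfInsert() { std::cout << "notifying waiters\n"; }
};

void onInsert(const std::shared_ptr<Notifier>& notifier) {
    // use_count() == 1 means no waiter holds a copy, so skip the notification.
    if (notifier && notifier.use_count() > 1) {
        notifier->notifyOfInsert();
    }
}

int main() {
    auto notifier = std::make_shared<Notifier>();
    onInsert(notifier);      // no waiters: nothing happens
    auto waiter = notifier;  // a "waiter" takes a reference
    onInsert(notifier);      // now use_count() == 2: notify
}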
StatusWith<DiskLoc> Collection::insertDocument( const BSONObj& docToInsert, bool enforceQuota ) {
    if ( _indexCatalog.findIdIndex() ) {
        if ( docToInsert["_id"].eoo() ) {
            return StatusWith<DiskLoc>( ErrorCodes::InternalError,
                                        "Collection::insertDocument got document without _id" );
        }
    }

    if ( _details->isCapped() ) {
        // TODO: old god not done
        Status ret = _indexCatalog.checkNoIndexConflicts( docToInsert );
        if ( !ret.isOK() )
            return StatusWith<DiskLoc>( ret );
    }

    StatusWith<DiskLoc> status = _insertDocument( docToInsert, enforceQuota );
    if ( status.isOK() ) {
        _details->paddingFits();
    }

    return status;
}
void WiredTigerUtil::fetchTypeAndSourceURI(OperationContext* opCtx,
                                           const std::string& tableUri,
                                           std::string* type,
                                           std::string* source) {
    std::string colgroupUri = "colgroup";
    const size_t colon = tableUri.find(':');
    invariant(colon != string::npos);
    colgroupUri += tableUri.substr(colon);
    StatusWith<std::string> colgroupResult = getMetadata(opCtx, colgroupUri);
    invariant(colgroupResult.isOK());
    WiredTigerConfigParser parser(colgroupResult.getValue());

    WT_CONFIG_ITEM typeItem;
    invariant(parser.get("type", &typeItem) == 0);
    invariant(typeItem.type == WT_CONFIG_ITEM::WT_CONFIG_ITEM_ID);
    *type = std::string(typeItem.str, typeItem.len);

    WT_CONFIG_ITEM sourceItem;
    invariant(parser.get("source", &sourceItem) == 0);
    invariant(sourceItem.type == WT_CONFIG_ITEM::WT_CONFIG_ITEM_STRING);
    *source = std::string(sourceItem.str, sourceItem.len);
}
Status KVCatalog::newCollection(OperationContext* opCtx,
                                StringData ns,
                                const CollectionOptions& options,
                                KVPrefix prefix) {
    invariant(opCtx->lockState()->isDbLockedForMode(nsToDatabaseSubstring(ns), MODE_X));

    const string ident = _newUniqueIdent(ns, "collection");

    stdx::lock_guard<stdx::mutex> lk(_identsLock);
    Entry& old = _idents[ns.toString()];
    if (!old.ident.empty()) {
        return Status(ErrorCodes::NamespaceExists, "collection already exists");
    }

    opCtx->recoveryUnit()->registerChange(new AddIdentChange(this, ns));

    BSONObj obj;
    {
        BSONObjBuilder b;
        b.append("ns", ns);
        b.append("ident", ident);
        BSONCollectionCatalogEntry::MetaData md;
        md.ns = ns.toString();
        md.options = options;
        md.prefix = prefix;
        b.append("md", md.toBSON());
        obj = b.obj();
    }

    const bool enforceQuota = false;
    // TODO SERVER-30638: using timestamp 0 for these inserts.
    StatusWith<RecordId> res =
        _rs->insertRecord(opCtx, obj.objdata(), obj.objsize(), Timestamp(), enforceQuota);
    if (!res.isOK())
        return res.getStatus();

    old = Entry(ident, res.getValue());
    LOG(1) << "stored meta data for " << ns << " @ " << res.getValue();
    return Status::OK();
}
Status ReplSetConfig::checkIfWriteConcernCanBeSatisfied(
    const WriteConcernOptions& writeConcern) const {
    if (!writeConcern.wMode.empty() && writeConcern.wMode != WriteConcernOptions::kMajority &&
        writeConcern.wMode != WriteConcernOptions::kInternalMajorityNoSnapshot) {
        StatusWith<ReplSetTagPattern> tagPatternStatus = findCustomWriteMode(writeConcern.wMode);
        if (!tagPatternStatus.isOK()) {
            return tagPatternStatus.getStatus();
        }

        ReplSetTagMatch matcher(tagPatternStatus.getValue());
        for (size_t j = 0; j < _members.size(); ++j) {
            const MemberConfig& memberConfig = _members[j];
            for (MemberConfig::TagIterator it = memberConfig.tagsBegin();
                 it != memberConfig.tagsEnd();
                 ++it) {
                if (matcher.update(*it)) {
                    return Status::OK();
                }
            }
        }

        // Even if all the nodes in the set had a given write it still would not satisfy this
        // write concern mode.
        return Status(ErrorCodes::UnsatisfiableWriteConcern,
                      str::stream() << "Not enough nodes match write concern mode \""
                                    << writeConcern.wMode << "\"");
    } else {
        int nodesRemaining = writeConcern.wNumNodes;
        for (size_t j = 0; j < _members.size(); ++j) {
            if (!_members[j].isArbiter()) {  // Only count data-bearing nodes
                --nodesRemaining;
                if (nodesRemaining <= 0) {
                    return Status::OK();
                }
            }
        }
        return Status(ErrorCodes::UnsatisfiableWriteConcern, "Not enough data-bearing nodes");
    }
}
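// The numeric branch of checkIfWriteConcernCanBeSatisfied just counts
// non-arbiter members. A tiny, self-contained illustration (Member is a
// hypothetical stand-in for MemberConfig): with members {data, data, arbiter},
// w:2 is satisfiable but w:3 is not, since arbiters bear no data.
#include <iostream>
#include <vector>

struct Member { bool isArbiter; };

bool canSatisfyNumericW(const std::vector<Member>& members, int w) {
    int nodesRemaining = w;
    for (const Member& m : members) {
        if (!m.isArbiter && --nodesRemaining <= 0)
            return true;  // enough data-bearing nodes found
    }
    return false;  // e.g. w:3 in a 3-member set that includes an arbiter
}

int main() {
    std::vector<Member> rs = {{false}, {false}, {true}};  // 2 data nodes + arbiter
    std::cout << canSatisfyNumericW(rs, 2) << ' '   // 1
              << canSatisfyNumericW(rs, 3) << '\n'; // 0
}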
bool Sync::shouldRetry(OperationContext* txn, const BSONObj& o) {
    const NamespaceString nss(o.getStringField("ns"));

    // Take an X lock on the database in order to preclude other modifications. Also, the
    // database might not exist yet, so create it.
    AutoGetOrCreateDb autoDb(txn, nss.db(), MODE_X);
    Database* const db = autoDb.getDb();

    // we don't have the object yet, which is possible on initial sync.  get it.
    log() << "adding missing object" << endl;  // rare enough we can log

    BSONObj missingObj = getMissingDoc(txn, db, o);

    if (missingObj.isEmpty()) {
        log() << "missing object not found on source. presumably deleted later in oplog" << endl;
        log() << "o2: " << o.getObjectField("o2").toString() << endl;
        log() << "o firstfield: " << o.getObjectField("o").firstElementFieldName() << endl;

        return false;
    } else {
        WriteUnitOfWork wunit(txn);

        Collection* const collection = db->getOrCreateCollection(txn, nss.toString());
        invariant(collection);

        StatusWith<RecordId> result = collection->insertDocument(txn, missingObj, true);
        uassert(15917,
                str::stream() << "failed to insert missing doc: "
                              << result.getStatus().toString(),
                result.isOK());

        LOG(1) << "inserted missing doc: " << missingObj.toString() << endl;

        wunit.commit();
        return true;
    }
}
StatusWith<DiskLoc> RecordStoreV1Base::insertRecord( TransactionExperiment* txn,
                                                     const DocWriter* doc,
                                                     int quotaMax ) {
    int lenWHdr = doc->documentSize() + Record::HeaderSize;
    if ( doc->addPadding() )
        lenWHdr = getRecordAllocationSize( lenWHdr );

    StatusWith<DiskLoc> loc = allocRecord( txn, lenWHdr, quotaMax );
    if ( !loc.isOK() )
        return loc;

    Record *r = recordFor( loc.getValue() );
    fassert( 17319, r->lengthWithHeaders() >= lenWHdr );

    r = reinterpret_cast<Record*>( txn->writingPtr(r, lenWHdr) );
    doc->writeDocument( r->data() );

    _addRecordToRecListInExtent(txn, r, loc.getValue());

    _details->incrementStats( txn, r->netLength(), 1 );

    return loc;
}
StatusWith<RecordId> Collection::_insertDocument(OperationContext* txn,
                                                 const BSONObj& docToInsert,
                                                 bool enforceQuota) {
    dassert(txn->lockState()->isCollectionLockedForMode(ns().toString(), MODE_IX));

    // TODO: for now, capped logic lives inside NamespaceDetails, which is hidden
    //       under the RecordStore, this feels broken since that should be a
    //       collection access method probably
    StatusWith<RecordId> loc = _recordStore->insertRecord(
        txn, docToInsert.objdata(), docToInsert.objsize(), _enforceQuota(enforceQuota));
    if (!loc.isOK())
        return loc;

    invariant(RecordId::min() < loc.getValue());
    invariant(loc.getValue() < RecordId::max());

    Status s = _indexCatalog.indexRecord(txn, docToInsert, loc.getValue());
    if (!s.isOK())
        return StatusWith<RecordId>(s);

    return loc;
}
/**
 * Perform a single insert into a collection.  Requires the insert be preprocessed and the
 * collection already has been created.
 *
 * Might fault or error, otherwise populates the result.
 */
static void singleInsert( const BatchItemRef& insertItem,
                          const BSONObj& normalInsert,
                          Collection* collection,
                          WriteOpResult* result ) {
    const string& insertNS = insertItem.getRequest()->getNS();

    Lock::assertWriteLocked( insertNS );

    try {
        // XXX - are we 100% sure that all !OK statuses do not write a document?
        StatusWith<DiskLoc> status = collection->insertDocument( normalInsert, true );

        if ( !status.isOK() ) {
            result->error = toWriteError( status.getStatus() );
        }
        else {
            logOp( "i", insertNS.c_str(), normalInsert );
            getDur().commitIfNeeded();
            result->stats.n = 1;
        }
    }
    catch ( const PageFaultException& ex ) {
        // TODO: An actual data structure that's not an exception for this
        result->fault = new PageFaultException( ex );
    }
    catch ( const DBException& ex ) {
        Status status(ex.toStatus());
        if (ErrorCodes::isInterruption(status.code())) {
            throw;
        }
        result->error = toWriteError(status);
    }
}
StatusWith<DiskLoc> RecordStoreV1Base::insertRecord( TransactionExperiment* txn,
                                                     const char* data,
                                                     int len,
                                                     int quotaMax ) {
    int lenWHdr = getRecordAllocationSize( len + Record::HeaderSize );
    fassert( 17208, lenWHdr >= ( len + Record::HeaderSize ) );

    StatusWith<DiskLoc> loc = allocRecord( txn, lenWHdr, quotaMax );
    if ( !loc.isOK() )
        return loc;

    Record *r = recordFor( loc.getValue() );
    fassert( 17210, r->lengthWithHeaders() >= lenWHdr );

    // copy the data
    r = reinterpret_cast<Record*>( txn->writingPtr(r, lenWHdr) );
    memcpy( r->data(), data, len );

    _addRecordToRecListInExtent(txn, r, loc.getValue());

    _details->incrementStats( txn, r->netLength(), 1 );

    return loc;
}
StatusWith<RecordId> Collection::insertDocument(OperationContext* txn,
                                                const BSONObj& doc,
                                                MultiIndexBlock* indexBlock,
                                                bool enforceQuota) {
    {
        auto status = checkValidation(txn, doc);
        if (!status.isOK())
            return status;
    }

    dassert(txn->lockState()->isCollectionLockedForMode(ns().toString(), MODE_IX));

    if (_mustTakeCappedLockOnInsert)
        synchronizeOnCappedInFlightResource(txn->lockState());

    StatusWith<RecordId> loc =
        _recordStore->insertRecord(txn, doc.objdata(), doc.objsize(), _enforceQuota(enforceQuota));

    if (!loc.isOK())
        return loc;

    Status status = indexBlock->insert(doc, loc.getValue());
    if (!status.isOK())
        return StatusWith<RecordId>(status);

    getGlobalServiceContext()->getOpObserver()->onInsert(txn, ns(), doc);

    // If there is a notifier object and another thread is waiting on it, then we notify waiters
    // of this document insert. Waiters keep a shared_ptr to '_cappedNotifier', so there are
    // waiters if this Collection's shared_ptr is not unique.
    if (_cappedNotifier && !_cappedNotifier.unique()) {
        _cappedNotifier->notifyOfInsert();
    }

    return loc;
}
StatusWith<DiskLoc> Collection::_insertDocument( OperationContext* txn,
                                                 const BSONObj& docToInsert,
                                                 bool enforceQuota ) {
    // TODO: for now, capped logic lives inside NamespaceDetails, which is hidden
    //       under the RecordStore, this feels broken since that should be a
    //       collection access method probably
    StatusWith<DiskLoc> loc = _recordStore->insertRecord( txn,
                                                          docToInsert.objdata(),
                                                          docToInsert.objsize(),
                                                          _enforceQuota( enforceQuota ) );
    if ( !loc.isOK() )
        return loc;

    _infoCache.notifyOfWriteOp();

    try {
        _indexCatalog.indexRecord(txn, docToInsert, loc.getValue());
    }
    catch ( AssertionException& e ) {
        if ( isCapped() ) {
            return StatusWith<DiskLoc>( ErrorCodes::InternalError,
                                        str::stream() << "unexpected index insertion failure on"
                                                      << " capped collection " << e.toString()
                                                      << " - collection and its index will not match" );
        }

        // indexRecord takes care of rolling back indexes
        // so we just have to delete the main storage
        _recordStore->deleteRecord( txn, loc.getValue() );
        return StatusWith<DiskLoc>( e.toStatus( "insertDocument" ) );
    }

    return loc;
}
Status CatalogManagerReplicaSet::shardCollection(OperationContext* txn,
                                                 const string& ns,
                                                 const ShardKeyPattern& fieldsAndOrder,
                                                 bool unique,
                                                 const vector<BSONObj>& initPoints,
                                                 const set<ShardId>& initShardIds) {
    // Lock the collection globally so that no other mongos can try to shard or drop the collection
    // at the same time.
    auto scopedDistLock = getDistLockManager()->lock(ns, "shardCollection");
    if (!scopedDistLock.isOK()) {
        return scopedDistLock.getStatus();
    }

    StatusWith<DatabaseType> status = getDatabase(nsToDatabase(ns));
    if (!status.isOK()) {
        return status.getStatus();
    }

    DatabaseType dbt = status.getValue();
    ShardId dbPrimaryShardId = dbt.getPrimary();
    const auto primaryShard = grid.shardRegistry()->getShard(dbPrimaryShardId);

    {
        // In 3.0 and prior we include this extra safety check that the collection is not getting
        // sharded concurrently by two different mongos instances. It is not 100%-proof, but it
        // reduces the chance that two invocations of shard collection will step on each other's
        // toes. Now we take the distributed lock so going forward this check won't be necessary
        // but we leave it around for compatibility with other mongoses from 3.0.
        // TODO(spencer): Remove this after 3.2 ships.
        const auto configShard = grid.shardRegistry()->getShard("config");
        const auto readHost = configShard->getTargeter()->findHost(kConfigReadSelector);
        if (!readHost.isOK()) {
            return readHost.getStatus();
        }

        auto countStatus = _runCountCommand(
            readHost.getValue(), NamespaceString(ChunkType::ConfigNS), BSON(ChunkType::ns(ns)));
        if (!countStatus.isOK()) {
            return countStatus.getStatus();
        }
        if (countStatus.getValue() > 0) {
            return Status(ErrorCodes::AlreadyInitialized,
                          str::stream() << "collection " << ns << " already sharded with "
                                        << countStatus.getValue() << " chunks.");
        }
    }

    // Record start in changelog
    {
        BSONObjBuilder collectionDetail;
        collectionDetail.append("shardKey", fieldsAndOrder.toBSON());
        collectionDetail.append("collection", ns);
        collectionDetail.append("primary", primaryShard->toString());

        {
            BSONArrayBuilder initialShards(collectionDetail.subarrayStart("initShards"));
            for (const ShardId& shardId : initShardIds) {
                initialShards.append(shardId);
            }
        }

        collectionDetail.append("numChunks", static_cast<int>(initPoints.size() + 1));

        logChange(txn->getClient()->clientAddress(true),
                  "shardCollection.start",
                  ns,
                  collectionDetail.obj());
    }

    ChunkManagerPtr manager(new ChunkManager(ns, fieldsAndOrder, unique));
    manager->createFirstChunks(dbPrimaryShardId, &initPoints, &initShardIds);
    manager->loadExistingRanges(nullptr);

    CollectionInfo collInfo;
    collInfo.useChunkManager(manager);
    collInfo.save(ns);
    manager->reload(true);

    // TODO(spencer) SERVER-19319: Send setShardVersion to primary shard so it knows to start
    // rejecting unversioned writes.

    BSONObj finishDetail = BSON("version"
                                << "");  // TODO(spencer) SERVER-19319 Report actual version used
    logChange(txn->getClient()->clientAddress(true), "shardCollection", ns, finishDetail);

    return Status::OK();
}
void BackgroundSync::_fetcherCallback(const StatusWith<Fetcher::QueryResponse>& result,
                                      BSONObjBuilder* bob,
                                      const HostAndPort& source,
                                      OpTime lastOpTimeFetched,
                                      long long lastFetchedHash,
                                      Status* remoteOplogStartStatus) {
    // if target cut connections between connecting and querying (for
    // example, because it stepped down) we might not have a cursor
    if (!result.isOK()) {
        return;
    }

    if (inShutdown()) {
        return;
    }

    // Check if we have been paused.
    if (isPaused()) {
        return;
    }

    const auto& queryResponse = result.getValue();
    const auto& documents = queryResponse.documents;
    auto documentBegin = documents.cbegin();
    auto documentEnd = documents.cend();

    // Check start of remote oplog and, if necessary, stop fetcher to execute rollback.
    if (queryResponse.first) {
        auto getNextOperation = [&documentBegin, documentEnd]() -> StatusWith<BSONObj> {
            if (documentBegin == documentEnd) {
                return Status(ErrorCodes::OplogStartMissing, "remote oplog start missing");
            }
            return *(documentBegin++);
        };

        *remoteOplogStartStatus =
            checkRemoteOplogStart(getNextOperation, lastOpTimeFetched, lastFetchedHash);
        if (!remoteOplogStartStatus->isOK()) {
            // Stop fetcher and execute rollback.
            return;
        }

        // If this is the first batch and no rollback is needed, we should have advanced
        // the document iterator.
        invariant(documentBegin != documents.cbegin());
    }

    // process documents
    int currentBatchMessageSize = 0;
    for (auto documentIter = documentBegin; documentIter != documentEnd; ++documentIter) {
        if (inShutdown()) {
            return;
        }

        // If we are transitioning to primary state, we need to leave
        // this loop in order to go into bgsync-pause mode.
        if (_replCoord->isWaitingForApplierToDrain() || _replCoord->getMemberState().primary()) {
            LOG(1) << "waiting for draining or we are primary, not adding more ops to buffer";
            return;
        }

        // At this point, we are guaranteed to have at least one thing to read out
        // of the fetcher.
        const BSONObj& o = *documentIter;
        currentBatchMessageSize += o.objsize();
        opsReadStats.increment();

        if (MONGO_FAIL_POINT(stepDownWhileDrainingFailPoint)) {
            sleepsecs(20);
        }

        {
            stdx::unique_lock<stdx::mutex> lock(_mutex);
            _appliedBuffer = false;
        }

        OCCASIONALLY {
            LOG(2) << "bgsync buffer has " << _buffer.size() << " bytes";
        }

        bufferCountGauge.increment();
        bufferSizeGauge.increment(getSize(o));
        _buffer.push(o);

        {
            stdx::unique_lock<stdx::mutex> lock(_mutex);
            _lastFetchedHash = o["h"].numberLong();
            _lastOpTimeFetched = extractOpTime(o);
            LOG(3) << "lastOpTimeFetched: " << _lastOpTimeFetched;
        }
    }

    // record time for each batch
    getmoreReplStats.recordMillis(queryResponse.elapsedMillis.count());

    networkByteStats.increment(currentBatchMessageSize);

    // Check some things periodically
    // (whenever we run out of items in the
    // current cursor batch)
    if (currentBatchMessageSize > 0 && currentBatchMessageSize < BatchIsSmallish) {
        // on a very low latency network, if we don't wait a little, we'll be
        // getting ops to write almost one at a time.  this will both be expensive
        // for the upstream server as well as potentially defeating our parallel
        // application of batches on the secondary.
        //
        // the inference here is basically if the batch is really small, we are
        // "caught up".
        sleepmillis(SleepToAllowBatchingMillis);
    }

    // If we are transitioning to primary state, we need to leave
    // this loop in order to go into bgsync-pause mode.
    if (_replCoord->isWaitingForApplierToDrain() || _replCoord->getMemberState().primary()) {
        return;
    }

    // re-evaluate quality of sync target
    if (_shouldChangeSyncSource(source)) {
        return;
    }

    // Check if we have been paused.
    if (isPaused()) {
        return;
    }

    // We fill in 'bob' to signal the fetcher to process with another getMore.
    invariant(bob);
    bob->append("getMore", queryResponse.cursorId);
    bob->append("collection", queryResponse.nss.coll());
    bob->append("maxTimeMS", int(fetcherMaxTimeMS.count()));
}
Status getOplogStartHack(OperationContext* txn,
                         Collection* collection,
                         CanonicalQuery* cq,
                         PlanExecutor** execOut) {
    invariant(cq);
    auto_ptr<CanonicalQuery> autoCq(cq);

    if (collection == NULL)
        return Status(ErrorCodes::InternalError, "getOplogStartHack called with a NULL collection");

    // A query can only do oplog start finding if it has a top-level $gt or $gte predicate over
    // the "ts" field (the operation's timestamp). Find that predicate and pass it to
    // the OplogStart stage.
    MatchExpression* tsExpr = NULL;
    if (MatchExpression::AND == cq->root()->matchType()) {
        // The query has an AND at the top-level.  See if any of the children
        // of the AND are $gt or $gte predicates over 'ts'.
        for (size_t i = 0; i < cq->root()->numChildren(); ++i) {
            MatchExpression* me = cq->root()->getChild(i);
            if (isOplogTsPred(me)) {
                tsExpr = me;
                break;
            }
        }
    } else if (isOplogTsPred(cq->root())) {
        // The root of the tree is a $gt or $gte predicate over 'ts'.
        tsExpr = cq->root();
    }

    if (NULL == tsExpr) {
        return Status(ErrorCodes::OplogOperationUnsupported,
                      "OplogReplay query does not contain top-level "
                      "$gt or $gte over the 'ts' field.");
    }

    DiskLoc startLoc = DiskLoc().setInvalid();

    // See if the RecordStore supports the oplogStartHack
    const BSONElement tsElem = extractOplogTsOptime(tsExpr);
    if (tsElem.type() == Timestamp) {
        StatusWith<DiskLoc> goal = oploghack::keyForOptime(tsElem._opTime());
        if (goal.isOK()) {
            startLoc = collection->getRecordStore()->oplogStartHack(txn, goal.getValue());
        }
    }

    if (startLoc.isValid()) {
        LOG(3) << "Using direct oplog seek";
    } else {
        LOG(3) << "Using OplogStart stage";

        // Fallback to trying the OplogStart stage.
        WorkingSet* oplogws = new WorkingSet();
        OplogStart* stage = new OplogStart(txn, collection, tsExpr, oplogws);
        PlanExecutor* rawExec;

        // Takes ownership of 'oplogws' and 'stage'.
        Status execStatus =
            PlanExecutor::make(txn, oplogws, stage, collection, PlanExecutor::YIELD_AUTO, &rawExec);
        invariant(execStatus.isOK());
        scoped_ptr<PlanExecutor> exec(rawExec);

        // The stage returns a DiskLoc of where to start.
        PlanExecutor::ExecState state = exec->getNext(NULL, &startLoc);

        // This is normal.  The start of the oplog is the beginning of the collection.
        if (PlanExecutor::IS_EOF == state) {
            return getExecutor(txn, collection, autoCq.release(), PlanExecutor::YIELD_AUTO, execOut);
        }

        // This is not normal.  An error was encountered.
        if (PlanExecutor::ADVANCED != state) {
            return Status(ErrorCodes::InternalError, "quick oplog start location had error...?");
        }
    }

    // Build our collection scan...
    CollectionScanParams params;
    params.collection = collection;
    params.start = startLoc;
    params.direction = CollectionScanParams::FORWARD;
    params.tailable = cq->getParsed().getOptions().tailable;

    WorkingSet* ws = new WorkingSet();
    CollectionScan* cs = new CollectionScan(txn, params, ws, cq->root());

    // Takes ownership of 'ws', 'cs', and 'cq'.
    return PlanExecutor::make(
        txn, ws, cs, autoCq.release(), collection, PlanExecutor::YIELD_AUTO, execOut);
}
StatusWith<std::vector<ShardEndpoint>> ChunkManagerTargeter::targetDelete(
    OperationContext* opCtx, const write_ops::DeleteOpEntry& deleteDoc) const {
    BSONObj shardKey;

    if (_routingInfo->cm()) {
        //
        // Sharded collections have the following further requirements for targeting:
        //
        // Limit-1 deletes must be targeted exactly by shard key *or* exact _id
        //

        // Get the shard key
        StatusWith<BSONObj> status =
            _routingInfo->cm()->getShardKeyPattern().extractShardKeyFromQuery(opCtx,
                                                                              deleteDoc.getQ());

        // Bad query
        if (!status.isOK())
            return status.getStatus();

        shardKey = status.getValue();
    }

    const auto collation = write_ops::collationOf(deleteDoc);

    // Target the shard key or delete query
    if (!shardKey.isEmpty()) {
        try {
            return std::vector<ShardEndpoint>{_targetShardKey(shardKey, collation, 0)};
        } catch (const DBException&) {
            // This delete is potentially not constrained to a single shard
        }
    }

    // We failed to target a single shard.

    // Parse delete query.
    auto qr = stdx::make_unique<QueryRequest>(getNS());
    qr->setFilter(deleteDoc.getQ());
    if (!collation.isEmpty()) {
        qr->setCollation(collation);
    }
    const boost::intrusive_ptr<ExpressionContext> expCtx;
    auto cq = CanonicalQuery::canonicalize(opCtx,
                                           std::move(qr),
                                           expCtx,
                                           ExtensionsCallbackNoop(),
                                           MatchExpressionParser::kAllowAllSpecialFeatures);
    if (!cq.isOK()) {
        return cq.getStatus().withContext(str::stream() << "Could not parse delete query "
                                                        << deleteDoc.getQ());
    }

    // Single deletes must target a single shard or be exact-ID.
    if (_routingInfo->cm() && !deleteDoc.getMulti() &&
        !isExactIdQuery(opCtx, *cq.getValue(), _routingInfo->cm().get())) {
        return Status(ErrorCodes::ShardKeyNotFound,
                      str::stream() << "A single delete on a sharded collection must contain an "
                                       "exact match on _id (and have the collection default "
                                       "collation) or contain the shard key (and have the simple "
                                       "collation). Delete request: " << deleteDoc.toBSON()
                                    << ", shard key pattern: "
                                    << _routingInfo->cm()->getShardKeyPattern().toString());
    }

    return _targetQuery(opCtx, deleteDoc.getQ(), collation);
}