UpdateResult update(UpdateRequest& request, UpdateDriver* driver) { const NamespaceString& nsString = request.getNamespaceString(); validateUpdate( nsString.ns().c_str(), request.getUpdates(), request.getQuery() ); NamespaceDetails* nsDetails = nsdetails( nsString.ns() ); NamespaceDetailsTransient* nsDetailsTransient = &NamespaceDetailsTransient::get( nsString.ns().c_str() ); OpDebug& debug = request.getDebug(); // TODO: This seems a bit circuitious. debug.updateobj = request.getUpdates(); driver->refreshIndexKeys( nsDetailsTransient->indexKeys() ); shared_ptr<Cursor> cursor = getOptimizedCursor( nsString.ns(), request.getQuery(), BSONObj(), request.getQueryPlanSelectionPolicy() ); // If the update was marked with '$isolated' (a.k.a '$atomic'), we are not allowed to // yield while evaluating the update loop below. // // TODO: Old code checks this repeatedly within the update loop. Is that necessary? It seems // that once atomic should be always atomic. const bool isolated = cursor->ok() && cursor->matcher() && cursor->matcher()->docMatcher().atomic(); // The 'cursor' the optimizer gave us may contain query plans that generate duplicate // diskloc's. We set up here the mechanims that will prevent us from processing those // twice if we see them. We also set up a 'ClientCursor' so that we can support // yielding. // // TODO: Is it valid to call this on a non-ok cursor? const bool dedupHere = cursor->autoDedup(); // // We'll start assuming we have one or more documents for this update. (Othwerwise, // we'll fallback to upserting.) // // We record that this will not be an upsert, in case a mod doesn't want to be applied // when in strict update mode. driver->setContext( ModifierInterface::ExecInfo::UPDATE_CONTEXT ); // Let's fetch each of them and pipe them through the update expression, making sure to // keep track of the necessary stats. Recall that we'll be pulling documents out of // cursors and some of them do not deduplicate the entries they generate. We have // deduping logic in here, too -- for now. unordered_set<DiskLoc, DiskLoc::Hasher> seenLocs; int numMatched = 0; debug.nscanned = 0; Client& client = cc(); mutablebson::Document doc; // If we are going to be yielding, we will need a ClientCursor scoped to this loop. We // only loop as long as the underlying cursor is OK. for ( auto_ptr<ClientCursor> clientCursor; cursor->ok(); ) { // If we haven't constructed a ClientCursor, and if the client allows us to throw // page faults, and if we are referring to a location that is likely not in // physical memory, then throw a PageFaultException. The entire operation will be // restarted. if ( clientCursor.get() == NULL && client.allowedToThrowPageFaultException() && !cursor->currLoc().isNull() && !cursor->currLoc().rec()->likelyInPhysicalMemory() ) { // We should never throw a PFE if we have already updated items. dassert((numMatched == 0) || (numMatched == debug.nupdateNoops)); throw PageFaultException( cursor->currLoc().rec() ); } if ( !isolated && debug.nscanned != 0 ) { // We are permitted to yield. To do so we need a ClientCursor, so create one // now if we have not yet done so. if ( !clientCursor.get() ) clientCursor.reset( new ClientCursor( QueryOption_NoCursorTimeout, cursor, nsString.ns() ) ); // Ask the client cursor to yield. We get two bits of state back: whether or not // we yielded, and whether or not we correctly recovered from yielding. 
bool yielded = false; const bool recovered = clientCursor->yieldSometimes( ClientCursor::WillNeed, &yielded ); if ( !recovered ) { // If we failed to recover from the yield, then the ClientCursor is already // gone. Release it so we don't destroy it a second time. clientCursor.release(); break; } if ( !cursor->ok() ) { // If the cursor died while we were yielded, just get out of the update loop. break; } if ( yielded ) { // We yielded and recovered OK, and our cursor is still good. Details about // our namespace may have changed while we were yielded, so we re-acquire // them here. If we can't do so, escape the update loop. Otherwise, refresh // the driver so that it knows about what is currently indexed. nsDetails = nsdetails( nsString.ns() ); if ( !nsDetails ) break; nsDetailsTransient = &NamespaceDetailsTransient::get( nsString.ns().c_str() ); // TODO: This copies the index keys, but it may not need to do so. driver->refreshIndexKeys( nsDetailsTransient->indexKeys() ); } } // Let's fetch the next candidate object for this update. Record* record = cursor->_current(); DiskLoc loc = cursor->currLoc(); const BSONObj oldObj = loc.obj(); // We count how many documents we scanned even though we may skip those that are // deemed duplicated. The final 'numUpdated' and 'nscanned' numbers may differ for // that reason. debug.nscanned++; // Skips this document if it: // a) doesn't match the query portion of the update // b) was deemed duplicate by the underlying cursor machinery // // Now, if we are going to update the document, // c) we don't want to do so while the cursor is at it, as that may invalidate // the cursor. So, we advance to next document, before issuing the update. MatchDetails matchDetails; matchDetails.requestElemMatchKey(); if ( !cursor->currentMatches( &matchDetails ) ) { // a) cursor->advance(); continue; } else if ( cursor->getsetdup( loc ) && dedupHere ) { // b) cursor->advance(); continue; } else if (!driver->isDocReplacement() && request.isMulti()) { // c) cursor->advance(); if ( dedupHere ) { if ( seenLocs.count( loc ) ) { continue; } } // There are certain kind of cursors that hold multiple pointers to data // underneath. $or cursors is one example. In a $or cursor, it may be the case // that when we did the last advance(), we finished consuming documents from // one of $or child and started consuming the next one. In that case, it is // possible that the last document of the previous child is the same as the // first document of the next (see SERVER-5198 and jstests/orp.js). // // So we advance the cursor here until we see a new diskloc. // // Note that we won't be yielding, and we may not do so for a while if we find // a particularly duplicated sequence of loc's. That is highly unlikely, // though. (See SERVER-5725, if curious, but "stage" based $or will make that // ticket moot). while( cursor->ok() && loc == cursor->currLoc() ) { cursor->advance(); } } // For some (unfortunate) historical reasons, not all cursors would be valid after // a write simply because we advanced them to a document not affected by the write. // To protect in those cases, not only we engaged in the advance() logic above, but // we also tell the cursor we're about to write a document that we've just seen. // prepareToTouchEarlierIterate() requires calling later // recoverFromTouchingEarlierIterate(), so we make a note here to do so. 
bool touchPreviousDoc = request.isMulti() && cursor->ok(); if ( touchPreviousDoc ) { if ( clientCursor.get() ) clientCursor->setDoingDeletes( true ); cursor->prepareToTouchEarlierIterate(); } // Found a matching document numMatched++; // Ask the driver to apply the mods. It may be that the driver can apply those "in // place", that is, some values of the old document just get adjusted without any // change to the binary layout on the bson layer. It may be that a whole new // document is needed to accomodate the new bson layout of the resulting document. doc.reset( oldObj, mutablebson::Document::kInPlaceEnabled ); BSONObj logObj; // If there was a matched field, obtain it. string matchedField; if (matchDetails.hasElemMatchKey()) matchedField = matchDetails.elemMatchKey(); Status status = driver->update( matchedField, &doc, &logObj ); if ( !status.isOK() ) { uasserted( 16837, status.reason() ); } // If the driver applied the mods in place, we can ask the mutable for what // changed. We call those changes "damages". :) We use the damages to inform the // journal what was changed, and then apply them to the original document // ourselves. If, however, the driver applied the mods out of place, we ask it to // generate a new, modified document for us. In that case, the file manager will // take care of the journaling details for us. // // This code flow is admittedly odd. But, right now, journaling is baked in the file // manager. And if we aren't using the file manager, we have to do jounaling // ourselves. bool objectWasChanged = false; BSONObj newObj; const char* source = NULL; mutablebson::DamageVector damages; bool inPlace = doc.getInPlaceUpdates(&damages, &source); if ( inPlace && !damages.empty() && !driver->modsAffectIndices() ) { nsDetails->paddingFits(); // All updates were in place. Apply them via durability and writing pointer. mutablebson::DamageVector::const_iterator where = damages.begin(); const mutablebson::DamageVector::const_iterator end = damages.end(); for( ; where != end; ++where ) { const char* sourcePtr = source + where->sourceOffset; void* targetPtr = getDur().writingPtr( const_cast<char*>(oldObj.objdata()) + where->targetOffset, where->size); std::memcpy(targetPtr, sourcePtr, where->size); } newObj = oldObj; debug.fastmod = true; objectWasChanged = true; } else { // The updates were not in place. Apply them through the file manager. newObj = doc.getObject(); DiskLoc newLoc = theDataFileMgr.updateRecord(nsString.ns().c_str(), nsDetails, nsDetailsTransient, record, loc, newObj.objdata(), newObj.objsize(), debug); // If we've moved this object to a new location, make sure we don't apply // that update again if our traversal picks the objecta again. // // We also take note that the diskloc if the updates are affecting indices. // Chances are that we're traversing one of them and they may be multi key and // therefore duplicate disklocs. if ( newLoc != loc || driver->modsAffectIndices() ) { seenLocs.insert( newLoc ); } objectWasChanged = true; } // Log Obj if ( request.shouldUpdateOpLog() ) { if ( driver->isDocReplacement() || !logObj.isEmpty() ) { BSONObj idQuery = driver->makeOplogEntryQuery(newObj, request.isMulti()); logOp("u", nsString.ns().c_str(), logObj , &idQuery, NULL, request.isFromMigration(), &newObj); } } // If it was noop since the document didn't change, record that. 
if (!objectWasChanged) debug.nupdateNoops++; if (!request.isMulti()) { break; } // If we used the cursor mechanism that prepares an earlier seen document for a // write we need to tell such mechanisms that the write is over. if ( touchPreviousDoc ) { cursor->recoverFromTouchingEarlierIterate(); } getDur().commitIfNeeded(); } // TODO: Can this be simplified? if ((numMatched > 0) || (numMatched == 0 && !request.isUpsert()) ) { debug.nupdated = numMatched; return UpdateResult( numMatched > 0 /* updated existing object(s) */, !driver->isDocReplacement() /* $mod or obj replacement */, numMatched /* # of docments update, even no-ops */, BSONObj() ); } // // We haven't found any existing document so an insert is done // (upsert is true). // debug.upsert = true; // Since this is an insert (no docs found and upsert:true), we will be logging it // as an insert in the oplog. We don't need the driver's help to build the // oplog record, then. We also set the context of the update driver to the INSERT_CONTEXT. // Some mods may only work in that context (e.g. $setOnInsert). driver->setLogOp( false ); driver->setContext( ModifierInterface::ExecInfo::INSERT_CONTEXT ); BSONObj baseObj; // Reset the document we will be writing to doc.reset( baseObj, mutablebson::Document::kInPlaceDisabled ); if ( request.getQuery().hasElement("_id") ) { uassertStatusOK(doc.root().appendElement(request.getQuery().getField("_id"))); } // If this is a $mod base update, we need to generate a document by examining the // query and the mods. Otherwise, we can use the object replacement sent by the user // update command that was parsed by the driver before. // In the following block we handle the query part, and then do the regular mods after. if ( *request.getUpdates().firstElementFieldName() == '$' ) { uassertStatusOK(UpdateDriver::createFromQuery(request.getQuery(), doc)); debug.fastmodinsert = true; } // Apply the update modifications and then log the update as an insert manually. Status status = driver->update( StringData(), &doc, NULL /* no oplog record */); if ( !status.isOK() ) { uasserted( 16836, status.reason() ); } BSONObj newObj = doc.getObject(); theDataFileMgr.insertWithObjMod( nsString.ns().c_str(), newObj, false, request.isGod() ); if ( request.shouldUpdateOpLog() ) { logOp( "i", nsString.ns().c_str(), newObj, NULL, NULL, request.isFromMigration(), &newObj ); } debug.nupdated = 1; return UpdateResult( false /* updated a non existing document */, !driver->isDocReplacement() /* $mod or obj replacement? */, 1 /* count of updated documents */, newObj /* object that was upserted */ ); }
boost::shared_ptr<Runner> PipelineD::prepareCursorSource( OperationContext* txn, Collection* collection, const intrusive_ptr<Pipeline>& pPipeline, const intrusive_ptr<ExpressionContext>& pExpCtx) { // get the full "namespace" name const string& fullName = pExpCtx->ns.ns(); pExpCtx->opCtx->lockState()->assertAtLeastReadLocked(fullName); // We will be modifying the source vector as we go Pipeline::SourceContainer& sources = pPipeline->sources; // Inject a MongodImplementation to sources that need them. for (size_t i = 0; i < sources.size(); i++) { DocumentSourceNeedsMongod* needsMongod = dynamic_cast<DocumentSourceNeedsMongod*>(sources[i].get()); if (needsMongod) { needsMongod->injectMongodInterface( boost::make_shared<MongodImplementation>(pExpCtx)); } } if (!sources.empty() && sources.front()->isValidInitialSource()) { if (dynamic_cast<DocumentSourceMergeCursors*>(sources.front().get())) { // Enable the hooks for setting up authentication on the subsequent internal // connections we are going to create. This would normally have been done // when SetShardVersion was called, but since SetShardVersion is never called // on secondaries, this is needed. ShardedConnectionInfo::addHook(); } return boost::shared_ptr<Runner>(); // don't need a cursor } // Look for an initial match. This works whether we got an initial query or not. // If not, it results in a "{}" query, which will be what we want in that case. const BSONObj queryObj = pPipeline->getInitialQuery(); if (!queryObj.isEmpty()) { // This will get built in to the Cursor we'll create, so // remove the match from the pipeline sources.pop_front(); } // Find the set of fields in the source documents depended on by this pipeline. const DepsTracker deps = pPipeline->getDependencies(queryObj); // Passing query an empty projection since it is faster to use ParsedDeps::extractFields(). // This will need to change to support covering indexes (SERVER-12015). There is an // exception for textScore since that can only be retrieved by a query projection. const BSONObj projectionForQuery = deps.needTextScore ? deps.toProjection() : BSONObj(); /* Look for an initial sort; we'll try to add this to the Cursor we create. If we're successful in doing that (further down), we'll remove the $sort from the pipeline, because the documents will already come sorted in the specified order as a result of the index scan. */ intrusive_ptr<DocumentSourceSort> sortStage; BSONObj sortObj; if (!sources.empty()) { sortStage = dynamic_cast<DocumentSourceSort*>(sources.front().get()); if (sortStage) { // build the sort key sortObj = sortStage->serializeSortKey(/*explain*/false).toBson(); } } // Create the Runner. // // If we try to create a Runner that includes both the match and the // sort, and the two are incompatible wrt the available indexes, then // we don't get a Runner back. // // So we try to use both first. If that fails, try again, without the // sort. // // If we don't have a sort, jump straight to just creating a Runner // without the sort. // // If we are able to incorporate the sort into the Runner, remove it // from the head of the pipeline. // // LATER - we should be able to find this out before we create the // cursor. Either way, we can then apply other optimizations there // are tickets for, such as SERVER-4507. 
const size_t runnerOptions = QueryPlannerParams::DEFAULT | QueryPlannerParams::INCLUDE_SHARD_FILTER | QueryPlannerParams::NO_BLOCKING_SORT ; boost::shared_ptr<Runner> runner; bool sortInRunner = false; const WhereCallbackReal whereCallback(pExpCtx->opCtx, pExpCtx->ns.db()); if (sortStage) { CanonicalQuery* cq; Status status = CanonicalQuery::canonicalize(pExpCtx->ns, queryObj, sortObj, projectionForQuery, &cq, whereCallback); Runner* rawRunner; if (status.isOK() && getRunner(txn, collection, cq, &rawRunner, runnerOptions).isOK()) { // success: The Runner will handle sorting for us using an index. runner.reset(rawRunner); sortInRunner = true; sources.pop_front(); if (sortStage->getLimitSrc()) { // need to reinsert coalesced $limit after removing $sort sources.push_front(sortStage->getLimitSrc()); } } } if (!runner.get()) { const BSONObj noSort; CanonicalQuery* cq; uassertStatusOK( CanonicalQuery::canonicalize(pExpCtx->ns, queryObj, noSort, projectionForQuery, &cq, whereCallback)); Runner* rawRunner; uassertStatusOK(getRunner(txn, collection, cq, &rawRunner, runnerOptions)); runner.reset(rawRunner); } // DocumentSourceCursor expects a yielding Runner that has had its state saved. runner->saveState(); // Put the Runner into a DocumentSourceCursor and add it to the front of the pipeline. intrusive_ptr<DocumentSourceCursor> pSource = DocumentSourceCursor::create(fullName, runner, pExpCtx); // Note the query, sort, and projection for explain. pSource->setQuery(queryObj); if (sortInRunner) pSource->setSort(sortObj); pSource->setProjection(deps.toProjection(), deps.toParsedDeps()); while (!sources.empty() && pSource->coalesce(sources.front())) { sources.pop_front(); } pPipeline->addInitialSource(pSource); return runner; }
void Strategy::getMore(OperationContext* txn, Request& r) { Timer getMoreTimer; const char* ns = r.getns(); const int ntoreturn = r.d().pullInt(); const long long id = r.d().pullInt64(); // TODO: Handle stale config exceptions here from coll being dropped or sharded during op // for now has same semantics as legacy request const NamespaceString nss(ns); auto statusGetDb = grid.catalogCache()->getDatabase(txn, nss.db().toString()); if (statusGetDb == ErrorCodes::DatabaseNotFound) { cursorCache.remove(id); replyToQuery(ResultFlag_CursorNotFound, r.p(), r.m(), 0, 0, 0); return; } uassertStatusOK(statusGetDb); shared_ptr<DBConfig> config = statusGetDb.getValue(); ShardPtr primary; ChunkManagerPtr info; config->getChunkManagerOrPrimary(ns, info, primary); // // TODO: Cleanup cursor cache, consolidate into single codepath // const string host = cursorCache.getRef(id); ShardedClientCursorPtr cursor = cursorCache.get(id); int cursorMaxTimeMS = cursorCache.getMaxTimeMS(id); // Cursor ids should not overlap between sharded and unsharded cursors massert(17012, str::stream() << "duplicate sharded and unsharded cursor id " << id << " detected for " << ns << ", duplicated on host " << host, NULL == cursorCache.get(id).get() || host.empty()); ClientBasic* client = ClientBasic::getCurrent(); NamespaceString nsString(ns); AuthorizationSession* authSession = AuthorizationSession::get(client); Status status = authSession->checkAuthForGetMore(nsString, id); audit::logGetMoreAuthzCheck(client, nsString, id, status.code()); uassertStatusOK(status); if (!host.empty()) { LOG(3) << "single getmore: " << ns; // we used ScopedDbConnection because we don't get about config versions // not deleting data is handled elsewhere // and we don't want to call setShardVersion ScopedDbConnection conn(host); Message response; bool ok = conn->callRead(r.m(), response); uassert(10204, "dbgrid: getmore: error calling db", ok); bool hasMore = (response.singleData().getCursor() != 0); if (!hasMore) { cursorCache.removeRef(id); } r.reply(response, "" /*conn->getServerAddress() */); conn.done(); return; } else if (cursor) { if (cursorMaxTimeMS == kMaxTimeCursorTimeLimitExpired) { cursorCache.remove(id); uasserted(ErrorCodes::ExceededTimeLimit, "operation exceeded time limit"); } // TODO: Try to match logic of mongod, where on subsequent getMore() we pull lots more data? BufBuilder buffer(ShardedClientCursor::INIT_REPLY_BUFFER_SIZE); int docCount = 0; const int startFrom = cursor->getTotalSent(); bool hasMore = cursor->sendNextBatch(ntoreturn, buffer, docCount); if (hasMore) { // still more data cursor->accessed(); if (cursorMaxTimeMS != kMaxTimeCursorNoTimeLimit) { // Update remaining amount of time in cursor cache. int cursorLeftoverMillis = cursorMaxTimeMS - getMoreTimer.millis(); if (cursorLeftoverMillis <= 0) { cursorLeftoverMillis = kMaxTimeCursorTimeLimitExpired; } cursorCache.updateMaxTimeMS(id, cursorLeftoverMillis); } } else { // we've exhausted the cursor cursorCache.remove(id); } replyToQuery(0, r.p(), r.m(), buffer.buf(), buffer.len(), docCount, startFrom, hasMore ? cursor->getId() : 0); return; } else { LOG(3) << "could not find cursor " << id << " in cache for " << ns; replyToQuery(ResultFlag_CursorNotFound, r.p(), r.m(), 0, 0, 0); return; } }
std::unique_ptr<RequestBuilderInterface> makeRequestBuilder(ProtocolSet clientProtos, ProtocolSet serverProtos) { return makeRequestBuilder(uassertStatusOK(negotiate(clientProtos, serverProtos))); }
void ensureIdIndexForNewNs( Collection* collection ) { if ( collection->ns().isSystem() && !legalClientSystemNS( collection->ns().ns(), false ) ) return; uassertStatusOK( collection->getIndexCatalog()->ensureHaveIdIndex() ); }
Status ClusterAggregate::runAggregate(OperationContext* txn, const Namespaces& namespaces, BSONObj cmdObj, int options, BSONObjBuilder* result) { auto scopedShardDbStatus = ScopedShardDatabase::getExisting(txn, namespaces.executionNss.db()); if (!scopedShardDbStatus.isOK()) { appendEmptyResultSet( *result, scopedShardDbStatus.getStatus(), namespaces.requestedNss.ns()); return Status::OK(); } auto request = AggregationRequest::parseFromBSON(namespaces.executionNss, cmdObj); if (!request.isOK()) { return request.getStatus(); } const auto conf = scopedShardDbStatus.getValue().db(); // Determine the appropriate collation and 'resolve' involved namespaces to make the // ExpressionContext. // We won't try to execute anything on a mongos, but we still have to populate this map so that // any $lookups, etc. will be able to have a resolved view definition. It's okay that this is // incorrect, we will repopulate the real resolved namespace map on the mongod. Note that we // need to check if any involved collections are sharded before forwarding an aggregation // command on an unsharded collection. StringMap<ExpressionContext::ResolvedNamespace> resolvedNamespaces; LiteParsedPipeline liteParsedPipeline(request.getValue()); for (auto&& ns : liteParsedPipeline.getInvolvedNamespaces()) { uassert(28769, str::stream() << ns.ns() << " cannot be sharded", !conf->isSharded(ns.ns())); resolvedNamespaces[ns.coll()] = {ns, std::vector<BSONObj>{}}; } if (!conf->isSharded(namespaces.executionNss.ns())) { return aggPassthrough(txn, namespaces, conf, cmdObj, result, options); } auto chunkMgr = conf->getChunkManager(txn, namespaces.executionNss.ns()); std::unique_ptr<CollatorInterface> collation; if (!request.getValue().getCollation().isEmpty()) { collation = uassertStatusOK(CollatorFactoryInterface::get(txn->getServiceContext()) ->makeFromBSON(request.getValue().getCollation())); } else if (chunkMgr->getDefaultCollator()) { collation = chunkMgr->getDefaultCollator()->clone(); } boost::intrusive_ptr<ExpressionContext> mergeCtx = new ExpressionContext( txn, request.getValue(), std::move(collation), std::move(resolvedNamespaces)); mergeCtx->inRouter = true; // explicitly *not* setting mergeCtx->tempDir // Parse and optimize the pipeline specification. auto pipeline = Pipeline::parse(request.getValue().getPipeline(), mergeCtx); if (!pipeline.isOK()) { return pipeline.getStatus(); } pipeline.getValue()->optimizePipeline(); // If the first $match stage is an exact match on the shard key (with a simple collation or // no string matching), we only have to send it to one shard, so send the command to that // shard. BSONObj firstMatchQuery = pipeline.getValue()->getInitialQuery(); BSONObj shardKeyMatches; shardKeyMatches = uassertStatusOK( chunkMgr->getShardKeyPattern().extractShardKeyFromQuery(txn, firstMatchQuery)); bool singleShard = false; if (!shardKeyMatches.isEmpty()) { auto chunk = chunkMgr->findIntersectingChunk( txn, shardKeyMatches, request.getValue().getCollation()); if (chunk.isOK()) { singleShard = true; } } // Don't need to split pipeline if the first $match is an exact match on shard key, unless // there is a stage that needs to be run on the primary shard. const bool needPrimaryShardMerger = pipeline.getValue()->needsPrimaryShardMerger(); const bool needSplit = !singleShard || needPrimaryShardMerger; // Split the pipeline into pieces for mongod(s) and this mongos. If needSplit is true, // 'pipeline' will become the merger side. boost::intrusive_ptr<Pipeline> shardPipeline(needSplit ? 
pipeline.getValue()->splitForSharded() : pipeline.getValue()); // Create the command for the shards. The 'fromRouter' field means produce output to be // merged. MutableDocument commandBuilder(request.getValue().serializeToCommandObj()); commandBuilder[AggregationRequest::kPipelineName] = Value(shardPipeline->serialize()); if (needSplit) { commandBuilder[AggregationRequest::kFromRouterName] = Value(true); commandBuilder[AggregationRequest::kCursorName] = Value(DOC(AggregationRequest::kBatchSizeName << 0)); } // These fields are not part of the AggregationRequest since they are not handled by the // aggregation subsystem, so we serialize them separately. const std::initializer_list<StringData> fieldsToPropagateToShards = { "$queryOptions", "readConcern", QueryRequest::cmdOptionMaxTimeMS, }; for (auto&& field : fieldsToPropagateToShards) { commandBuilder[field] = Value(cmdObj[field]); } BSONObj shardedCommand = commandBuilder.freeze().toBson(); BSONObj shardQuery = shardPipeline->getInitialQuery(); // Run the command on the shards // TODO need to make sure cursors are killed if a retry is needed std::vector<Strategy::CommandResult> shardResults; Strategy::commandOp(txn, namespaces.executionNss.db().toString(), shardedCommand, options, namespaces.executionNss.ns(), shardQuery, request.getValue().getCollation(), &shardResults); if (mergeCtx->isExplain) { // This must be checked before we start modifying result. uassertAllShardsSupportExplain(shardResults); if (needSplit) { *result << "needsPrimaryShardMerger" << needPrimaryShardMerger << "splitPipeline" << DOC("shardsPart" << shardPipeline->writeExplainOps() << "mergerPart" << pipeline.getValue()->writeExplainOps()); } else { *result << "splitPipeline" << BSONNULL; } BSONObjBuilder shardExplains(result->subobjStart("shards")); for (size_t i = 0; i < shardResults.size(); i++) { shardExplains.append(shardResults[i].shardTargetId, BSON("host" << shardResults[i].target.toString() << "stages" << shardResults[i].result["stages"])); } return Status::OK(); } if (!needSplit) { invariant(shardResults.size() == 1); invariant(shardResults[0].target.getServers().size() == 1); auto executorPool = Grid::get(txn)->getExecutorPool(); const BSONObj reply = uassertStatusOK(storePossibleCursor(shardResults[0].target.getServers()[0], shardResults[0].result, namespaces.requestedNss, executorPool->getArbitraryExecutor(), Grid::get(txn)->getCursorManager())); result->appendElements(reply); return getStatusFromCommandResult(reply); } pipeline.getValue()->addInitialSource( DocumentSourceMergeCursors::create(parseCursors(shardResults), mergeCtx)); MutableDocument mergeCmd(request.getValue().serializeToCommandObj()); mergeCmd["pipeline"] = Value(pipeline.getValue()->serialize()); mergeCmd["cursor"] = Value(cmdObj["cursor"]); if (cmdObj.hasField("$queryOptions")) { mergeCmd["$queryOptions"] = Value(cmdObj["$queryOptions"]); } if (cmdObj.hasField(QueryRequest::cmdOptionMaxTimeMS)) { mergeCmd[QueryRequest::cmdOptionMaxTimeMS] = Value(cmdObj[QueryRequest::cmdOptionMaxTimeMS]); } mergeCmd.setField("writeConcern", Value(cmdObj["writeConcern"])); mergeCmd.setField("readConcern", Value(cmdObj["readConcern"])); // If the user didn't specify a collation already, make sure there's a collation attached to // the merge command, since the merging shard may not have the collection metadata. if (mergeCmd.peek()["collation"].missing()) { mergeCmd.setField("collation", mergeCtx->getCollator() ? 
Value(mergeCtx->getCollator()->getSpec().toBSON()) : Value(Document{CollationSpec::kSimpleSpec})); } std::string outputNsOrEmpty; if (DocumentSourceOut* out = dynamic_cast<DocumentSourceOut*>(pipeline.getValue()->getSources().back().get())) { outputNsOrEmpty = out->getOutputNs().ns(); } // Run merging command on random shard, unless a stage needs the primary shard. Need to use // ShardConnection so that the merging mongod is sent the config servers on connection init. auto& prng = txn->getClient()->getPrng(); const auto& mergingShardId = (needPrimaryShardMerger || internalQueryAlwaysMergeOnPrimaryShard.load()) ? conf->getPrimaryId() : shardResults[prng.nextInt32(shardResults.size())].shardTargetId; const auto mergingShard = uassertStatusOK(Grid::get(txn)->shardRegistry()->getShard(txn, mergingShardId)); ShardConnection conn(mergingShard->getConnString(), outputNsOrEmpty); BSONObj mergedResults = aggRunCommand(txn, conn.get(), namespaces, mergeCmd.freeze().toBson(), options); conn.done(); if (auto wcErrorElem = mergedResults["writeConcernError"]) { appendWriteConcernErrorToCmdResponse(mergingShardId, wcErrorElem, *result); } // Copy output from merging (primary) shard to the output object from our command. // Also, propagates errmsg and code if ok == false. result->appendElementsUnique(mergedResults); return getStatusFromCommandResult(result->asTempObj()); }
UpdateResult update( OperationContext* txn, Database* db, const UpdateRequest& request, OpDebug* opDebug, UpdateDriver* driver, CanonicalQuery* cq) { LOG(3) << "processing update : " << request; std::auto_ptr<CanonicalQuery> cqHolder(cq); const NamespaceString& nsString = request.getNamespaceString(); UpdateLifecycle* lifecycle = request.getLifecycle(); Collection* collection = db->getCollection(nsString.ns()); validateUpdate(nsString.ns().c_str(), request.getUpdates(), request.getQuery()); // TODO: This seems a bit circuitious. opDebug->updateobj = request.getUpdates(); if (lifecycle) { lifecycle->setCollection(collection); driver->refreshIndexKeys(lifecycle->getIndexKeys()); } Runner* rawRunner; Status status = cq ? getRunner(collection, cqHolder.release(), &rawRunner) : getRunner(collection, nsString.ns(), request.getQuery(), &rawRunner, &cq); uassert(17243, "could not get runner " + request.getQuery().toString() + "; " + causedBy(status), status.isOK()); // Create the runner and setup all deps. auto_ptr<Runner> runner(rawRunner); // Register Runner with ClientCursor const ScopedRunnerRegistration safety(runner.get()); // // We'll start assuming we have one or more documents for this update. (Otherwise, // we'll fall-back to insert case (if upsert is true).) // // We are an update until we fall into the insert case below. driver->setContext(ModifierInterface::ExecInfo::UPDATE_CONTEXT); int numMatched = 0; // If the update was in-place, we may see it again. This only matters if we're doing // a multi-update; if we're not doing a multi-update we stop after one update and we // won't see any more docs. // // For example: If we're scanning an index {x:1} and performing {$inc:{x:5}}, we'll keep // moving the document forward and it will continue to reappear in our index scan. // Unless the index is multikey, the underlying query machinery won't de-dup. // // If the update wasn't in-place we may see it again. Our query may return the new // document and we wouldn't want to update that. // // So, no matter what, we keep track of where the doc wound up. typedef unordered_set<DiskLoc, DiskLoc::Hasher> DiskLocSet; const scoped_ptr<DiskLocSet> updatedLocs(request.isMulti() ? new DiskLocSet : NULL); // Reset these counters on each call. We might re-enter this function to retry this // update if we throw a page fault exception below, and we rely on these counters // reflecting only the actions taken locally. In particlar, we must have the no-op // counter reset so that we can meaningfully comapre it with numMatched above. opDebug->nscanned = 0; opDebug->nscannedObjects = 0; opDebug->nModified = 0; // Get the cached document from the update driver. mutablebson::Document& doc = driver->getDocument(); mutablebson::DamageVector damages; // Used during iteration of docs BSONObj oldObj; // Get first doc, and location Runner::RunnerState state = Runner::RUNNER_ADVANCED; uassert(ErrorCodes::NotMaster, mongoutils::str::stream() << "Not primary while updating " << nsString.ns(), !request.shouldCallLogOp() || isMasterNs(nsString.ns().c_str())); while (true) { // Get next doc, and location DiskLoc loc; state = runner->getNext(&oldObj, &loc); if (state != Runner::RUNNER_ADVANCED) { if (state == Runner::RUNNER_EOF) { // We have reached the logical end of the loop, so do yielding recovery break; } else { uassertStatusOK(Status(ErrorCodes::InternalError, str::stream() << " Update query failed -- " << Runner::statestr(state))); } } // We fill this with the new locs of moved doc so we don't double-update. 
if (updatedLocs && updatedLocs->count(loc) > 0) { continue; } // We count how many documents we scanned even though we may skip those that are // deemed duplicated. The final 'numMatched' and 'nscanned' numbers may differ for // that reason. // TODO: Do we want to pull this out of the underlying query plan? opDebug->nscanned++; // Found a matching document opDebug->nscannedObjects++; numMatched++; // Ask the driver to apply the mods. It may be that the driver can apply those "in // place", that is, some values of the old document just get adjusted without any // change to the binary layout on the bson layer. It may be that a whole new // document is needed to accomodate the new bson layout of the resulting document. doc.reset(oldObj, mutablebson::Document::kInPlaceEnabled); BSONObj logObj; FieldRefSet updatedFields; Status status = Status::OK(); if (!driver->needMatchDetails()) { // If we don't need match details, avoid doing the rematch status = driver->update(StringData(), &doc, &logObj, &updatedFields); } else { // If there was a matched field, obtain it. MatchDetails matchDetails; matchDetails.requestElemMatchKey(); dassert(cq); verify(cq->root()->matchesBSON(oldObj, &matchDetails)); string matchedField; if (matchDetails.hasElemMatchKey()) matchedField = matchDetails.elemMatchKey(); // TODO: Right now, each mod checks in 'prepare' that if it needs positional // data, that a non-empty StringData() was provided. In principle, we could do // that check here in an else clause to the above conditional and remove the // checks from the mods. status = driver->update(matchedField, &doc, &logObj, &updatedFields); } if (!status.isOK()) { uasserted(16837, status.reason()); } // Ensure _id exists and is first uassertStatusOK(ensureIdAndFirst(doc)); // If the driver applied the mods in place, we can ask the mutable for what // changed. We call those changes "damages". :) We use the damages to inform the // journal what was changed, and then apply them to the original document // ourselves. If, however, the driver applied the mods out of place, we ask it to // generate a new, modified document for us. In that case, the file manager will // take care of the journaling details for us. // // This code flow is admittedly odd. But, right now, journaling is baked in the file // manager. And if we aren't using the file manager, we have to do jounaling // ourselves. bool docWasModified = false; BSONObj newObj; const char* source = NULL; bool inPlace = doc.getInPlaceUpdates(&damages, &source); // If something changed in the document, verify that no immutable fields were changed // and data is valid for storage. if ((!inPlace || !damages.empty()) ) { if (!(request.isFromReplication() || request.isFromMigration())) { const std::vector<FieldRef*>* immutableFields = NULL; if (lifecycle) immutableFields = lifecycle->getImmutableFields(); uassertStatusOK(validate(oldObj, updatedFields, doc, immutableFields, driver->modOptions()) ); } } // Save state before making changes runner->saveState(); if (inPlace && !driver->modsAffectIndices()) { // If a set of modifiers were all no-ops, we are still 'in place', but there is // no work to do, in which case we want to consider the object unchanged. if (!damages.empty() ) { collection->updateDocumentWithDamages( txn, loc, source, damages ); docWasModified = true; opDebug->fastmod = true; } newObj = oldObj; } else { // The updates were not in place. Apply them through the file manager. 
newObj = doc.getObject(); uassert(17419, str::stream() << "Resulting document after update is larger than " << BSONObjMaxUserSize, newObj.objsize() <= BSONObjMaxUserSize); StatusWith<DiskLoc> res = collection->updateDocument(txn, loc, newObj, true, opDebug); uassertStatusOK(res.getStatus()); DiskLoc newLoc = res.getValue(); docWasModified = true; // If the document moved, we might see it again in a collection scan (maybe it's // a document after our current document). // // If the document is indexed and the mod changes an indexed value, we might see it // again. For an example, see the comment above near declaration of updatedLocs. if (updatedLocs && (newLoc != loc || driver->modsAffectIndices())) { updatedLocs->insert(newLoc); } } // Restore state after modification uassert(17278, "Update could not restore runner state after updating a document.", runner->restoreState()); // Call logOp if requested. if (request.shouldCallLogOp() && !logObj.isEmpty()) { BSONObj idQuery = driver->makeOplogEntryQuery(newObj, request.isMulti()); logOp(txn, "u", nsString.ns().c_str(), logObj , &idQuery, NULL, request.isFromMigration()); } // Only record doc modifications if they wrote (exclude no-ops) if (docWasModified) opDebug->nModified++; if (!request.isMulti()) { break; } // Opportunity for journaling to write during the update. txn->recoveryUnit()->commitIfNeeded(); } // TODO: Can this be simplified? if ((numMatched > 0) || (numMatched == 0 && !request.isUpsert()) ) { opDebug->nMatched = numMatched; return UpdateResult(numMatched > 0 /* updated existing object(s) */, !driver->isDocReplacement() /* $mod or obj replacement */, opDebug->nModified /* number of modified docs, no no-ops */, numMatched /* # of docs matched/updated, even no-ops */, BSONObj()); } // // We haven't found any existing document so an insert is done // (upsert is true). // opDebug->upsert = true; // Since this is an insert (no docs found and upsert:true), we will be logging it // as an insert in the oplog. We don't need the driver's help to build the // oplog record, then. We also set the context of the update driver to the INSERT_CONTEXT. // Some mods may only work in that context (e.g. $setOnInsert). driver->setLogOp(false); driver->setContext(ModifierInterface::ExecInfo::INSERT_CONTEXT); // Reset the document we will be writing to doc.reset(); // This remains the empty object in the case of an object replacement, but in the case // of an upsert where we are creating a base object from the query and applying mods, // we capture the query as the original so that we can detect immutable field mutations. BSONObj original = BSONObj(); // Calling createFromQuery will populate the 'doc' with fields from the query which // creates the base of the update for the inserterd doc (because upsert was true) if (cq) { uassertStatusOK(driver->populateDocumentWithQueryFields(cq, doc)); // Validate the base doc, as taken from the query -- no fields means validate all. FieldRefSet noFields; uassertStatusOK(validate(BSONObj(), noFields, doc, NULL, driver->modOptions())); if (!driver->isDocReplacement()) { opDebug->fastmodinsert = true; // We need all the fields from the query to compare against for validation below. 
original = doc.getObject(); } else { original = request.getQuery(); } } else { fassert(17354, CanonicalQuery::isSimpleIdQuery(request.getQuery())); BSONElement idElt = request.getQuery()["_id"]; original = idElt.wrap(); fassert(17352, doc.root().appendElement(idElt)); } // Apply the update modifications and then log the update as an insert manually. FieldRefSet updatedFields; status = driver->update(StringData(), &doc, NULL, &updatedFields); if (!status.isOK()) { uasserted(16836, status.reason()); } // Ensure _id exists and is first uassertStatusOK(ensureIdAndFirst(doc)); // Validate that the object replacement or modifiers resulted in a document // that contains all the immutable keys and can be stored. if (!(request.isFromReplication() || request.isFromMigration())){ const std::vector<FieldRef*>* immutableFields = NULL; if (lifecycle) immutableFields = lifecycle->getImmutableFields(); // This will only validate the modified fields if not a replacement. uassertStatusOK(validate(original, updatedFields, doc, immutableFields, driver->modOptions()) ); } // Only create the collection if the doc will be inserted. if (!collection) { collection = db->getCollection(request.getNamespaceString().ns()); if (!collection) { collection = db->createCollection(txn, request.getNamespaceString().ns()); } } // Insert the doc BSONObj newObj = doc.getObject(); uassert(17420, str::stream() << "Document to upsert is larger than " << BSONObjMaxUserSize, newObj.objsize() <= BSONObjMaxUserSize); StatusWith<DiskLoc> newLoc = collection->insertDocument(txn, newObj, !request.isGod() /*enforceQuota*/); uassertStatusOK(newLoc.getStatus()); if (request.shouldCallLogOp()) { logOp(txn, "i", nsString.ns().c_str(), newObj, NULL, NULL, request.isFromMigration()); } opDebug->nMatched = 1; return UpdateResult(false /* updated a non existing document */, !driver->isDocReplacement() /* $mod or obj replacement? */, 1 /* docs written*/, 1 /* count of updated documents */, newObj /* object that was upserted */ ); }
void ServiceContextMongoD::initializeGlobalStorageEngine() { // This should be set once. invariant(!_storageEngine); // We should have a _lockFile or be in read-only mode. Confusingly, we can still have a lockFile // if we are in read-only mode. This can happen if the server is started in read-only mode on a // writable dbpath. invariant(_lockFile || storageGlobalParams.readOnly); const std::string dbpath = storageGlobalParams.dbpath; if (auto existingStorageEngine = StorageEngineMetadata::getStorageEngineForPath(dbpath)) { if (storageGlobalParams.engineSetByUser) { // Verify that the name of the user-supplied storage engine matches the contents of // the metadata file. const StorageEngine::Factory* factory = mapFindWithDefault(_storageFactories, storageGlobalParams.engine, static_cast<const StorageEngine::Factory*>(nullptr)); if (factory) { uassert(28662, str::stream() << "Cannot start server. Detected data files in " << dbpath << " created by" << " the '" << *existingStorageEngine << "' storage engine, but the" << " specified storage engine was '" << factory->getCanonicalName() << "'.", factory->getCanonicalName() == *existingStorageEngine); } } else { // Otherwise set the active storage engine as the contents of the metadata file. log() << "Detected data files in " << dbpath << " created by the '" << *existingStorageEngine << "' storage engine, so setting the active" << " storage engine to '" << *existingStorageEngine << "'."; storageGlobalParams.engine = *existingStorageEngine; } } else if (!storageGlobalParams.engineSetByUser) { // Ensure the default storage engine is available with this build of mongod. uassert(28663, str::stream() << "Cannot start server. The default storage engine '" << storageGlobalParams.engine << "' is not available with this build of mongod. Please specify a different" << " storage engine explicitly, e.g. --storageEngine=mmapv1.", isRegisteredStorageEngine(storageGlobalParams.engine)); } const StorageEngine::Factory* factory = _storageFactories[storageGlobalParams.engine]; uassert(18656, str::stream() << "Cannot start server with an unknown storage engine: " << storageGlobalParams.engine, factory); if (storageGlobalParams.readOnly) { uassert(34368, str::stream() << "Server was started in read-only mode, but the configured storage engine, " << storageGlobalParams.engine << ", does not support read-only operation", factory->supportsReadOnly()); } std::unique_ptr<StorageEngineMetadata> metadata = StorageEngineMetadata::forPath(dbpath); if (storageGlobalParams.readOnly) { uassert(34415, "Server was started in read-only mode, but the storage metadata file was not" " found.", metadata.get()); } // Validate options in metadata against current startup options. if (metadata.get()) { uassertStatusOK(factory->validateMetadata(*metadata, storageGlobalParams)); } ScopeGuard guard = MakeGuard([&] { if (_lockFile) { _lockFile->close(); } }); _storageEngine = factory->create(storageGlobalParams, _lockFile.get()); _storageEngine->finishInit(); if (_lockFile) { uassertStatusOK(_lockFile->writePid()); } // Write a new metadata file if it is not present. if (!metadata.get()) { invariant(!storageGlobalParams.readOnly); metadata.reset(new StorageEngineMetadata(storageGlobalParams.dbpath)); metadata->setStorageEngine(factory->getCanonicalName().toString()); metadata->setStorageEngineOptions(factory->createMetadataOptions(storageGlobalParams)); uassertStatusOK(metadata->write()); } guard.Dismiss(); _supportsDocLocking = _storageEngine->supportsDocLocking(); }
void readRequestMetadata(OperationContext* opCtx, const BSONObj& metadataObj) { BSONElement readPreferenceElem; BSONElement auditElem; BSONElement configSvrElem; BSONElement trackingElem; BSONElement clientElem; BSONElement logicalTimeElem; for (const auto& metadataElem : metadataObj) { auto fieldName = metadataElem.fieldNameStringData(); if (fieldName == "$readPreference") { readPreferenceElem = metadataElem; } else if (fieldName == AuditMetadata::fieldName()) { auditElem = metadataElem; } else if (fieldName == ConfigServerMetadata::fieldName()) { configSvrElem = metadataElem; } else if (fieldName == ClientMetadata::fieldName()) { clientElem = metadataElem; } else if (fieldName == TrackingMetadata::fieldName()) { trackingElem = metadataElem; } else if (fieldName == LogicalTimeMetadata::fieldName()) { logicalTimeElem = metadataElem; } } if (readPreferenceElem) { ReadPreferenceSetting::get(opCtx) = uassertStatusOK(ReadPreferenceSetting::fromInnerBSON(readPreferenceElem)); } AuditMetadata::get(opCtx) = uassertStatusOK(AuditMetadata::readFromMetadata(auditElem)); uassertStatusOK(ClientMetadataIsMasterState::readFromMetadata(opCtx, clientElem)); ConfigServerMetadata::get(opCtx) = uassertStatusOK(ConfigServerMetadata::readFromMetadata(configSvrElem)); TrackingMetadata::get(opCtx) = uassertStatusOK(TrackingMetadata::readFromMetadata(trackingElem)); auto logicalClock = LogicalClock::get(opCtx); if (logicalClock) { auto logicalTimeMetadata = uassertStatusOK(rpc::LogicalTimeMetadata::readFromMetadata(logicalTimeElem)); auto& signedTime = logicalTimeMetadata.getSignedTime(); // LogicalTimeMetadata is default constructed if no cluster time metadata was sent, so a // default constructed SignedLogicalTime should be ignored. if (signedTime.getTime() != LogicalTime::kUninitialized) { // Cluster times are only sent by sharding aware mongod servers, so this point is only // reached in sharded clusters. if (serverGlobalParams.featureCompatibility.version.load() != ServerGlobalParams::FeatureCompatibility::Version::k34) { auto logicalTimeValidator = LogicalTimeValidator::get(opCtx); if (!LogicalTimeValidator::isAuthorizedToAdvanceClock(opCtx)) { if (!logicalTimeValidator) { uasserted(ErrorCodes::CannotVerifyAndSignLogicalTime, "Cannot accept logicalTime: " + signedTime.getTime().toString() + ". May not be a part of a sharded cluster"); } else { uassertStatusOK(logicalTimeValidator->validate(opCtx, signedTime)); } } uassertStatusOK(logicalClock->advanceClusterTime(signedTime.getTime())); } } } }
void updateSessionEntry(OperationContext* opCtx, const UpdateRequest& updateRequest) { // Current code only supports replacement update. dassert(UpdateDriver::isDocReplacement(updateRequest.getUpdates())); AutoGetCollection autoColl(opCtx, NamespaceString::kSessionTransactionsTableNamespace, MODE_IX); uassert(40527, str::stream() << "Unable to persist transaction state because the session transaction " "collection is missing. This indicates that the " << NamespaceString::kSessionTransactionsTableNamespace.ns() << " collection has been manually deleted.", autoColl.getCollection()); WriteUnitOfWork wuow(opCtx); auto collection = autoColl.getCollection(); auto idIndex = collection->getIndexCatalog()->findIdIndex(opCtx); uassert(40672, str::stream() << "Failed to fetch _id index for " << NamespaceString::kSessionTransactionsTableNamespace.ns(), idIndex); auto indexAccess = collection->getIndexCatalog()->getIndex(idIndex); // Since we are looking up a key inside the _id index, create a key object consisting of only // the _id field. auto idToFetch = updateRequest.getQuery().firstElement(); auto toUpdateIdDoc = idToFetch.wrap(); dassert(idToFetch.fieldNameStringData() == "_id"_sd); auto recordId = indexAccess->findSingle(opCtx, toUpdateIdDoc); auto startingSnapshotId = opCtx->recoveryUnit()->getSnapshotId(); if (recordId.isNull()) { // Upsert case. auto status = collection->insertDocument( opCtx, InsertStatement(updateRequest.getUpdates()), nullptr, true, false); if (status == ErrorCodes::DuplicateKey) { throw WriteConflictException(); } uassertStatusOK(status); wuow.commit(); return; } auto originalRecordData = collection->getRecordStore()->dataFor(opCtx, recordId); auto originalDoc = originalRecordData.toBson(); invariant(collection->getDefaultCollator() == nullptr); boost::intrusive_ptr<ExpressionContext> expCtx(new ExpressionContext(opCtx, nullptr)); auto matcher = fassertStatusOK( 40673, MatchExpressionParser::parse(updateRequest.getQuery(), std::move(expCtx))); if (!matcher->matchesBSON(originalDoc)) { // Document no longer match what we expect so throw WCE to make the caller re-examine. throw WriteConflictException(); } OplogUpdateEntryArgs args; args.nss = NamespaceString::kSessionTransactionsTableNamespace; args.uuid = collection->uuid(); args.update = updateRequest.getUpdates(); args.criteria = toUpdateIdDoc; args.fromMigrate = false; collection->updateDocument(opCtx, recordId, Snapshotted<BSONObj>(startingSnapshotId, originalDoc), updateRequest.getUpdates(), true, // enforceQuota false, // indexesAffected = false because _id is the only index nullptr, &args); wuow.commit(); }
auto mock = DocumentSourceMock::create(inputs); facetStage->setSource(mock.get()); auto output = facetStage->getNext(); ASSERT(output.isAdvanced()); ASSERT_DOCUMENT_EQ(output.getDocument(), Document(fromjson("{subPipe: [{_id: 0}, {_id: 1}]}"))); } // TODO: DocumentSourceFacet will have to propagate pauses if we ever allow nested $facets. DEATH_TEST_F(DocumentSourceFacetTest, ShouldFailIfGivenPausedInput, "Invariant failure !input.isPaused()") { auto ctx = getExpCtx(); auto firstDummy = DocumentSourcePassthrough::create(); auto pipeline = uassertStatusOK(Pipeline::create({firstDummy}, ctx)); auto facetStage = DocumentSourceFacet::create({{"subPipe", pipeline}}, ctx); auto mock = DocumentSourceMock::create(DocumentSource::GetNextResult::makePauseExecution()); facetStage->setSource(mock.get()); facetStage->getNext(); // This should cause a crash. } // // Miscellaneous. // TEST_F(DocumentSourceFacetTest, ShouldBeAbleToReParseSerializedStage) { auto ctx = getExpCtx();
boost::intrusive_ptr<DocumentSource> DocumentSourceMergeCursors::createFromBson( BSONElement elem, const boost::intrusive_ptr<ExpressionContext>& expCtx) { if (elem.type() == BSONType::Object) { // This is the modern serialization format. We de-serialize using the IDL. auto ownedObj = elem.embeddedObject().getOwned(); auto armParams = AsyncResultsMergerParams::parse(IDLParserErrorContext(kStageName), ownedObj); return new DocumentSourceMergeCursors( Grid::get(expCtx->opCtx)->getExecutorPool()->getArbitraryExecutor(), std::move(armParams), expCtx, std::move(ownedObj)); } // This is the old serialization format which can still be generated by mongos processes // older than 4.0. // TODO SERVER-34009 Remove support for this format. uassert(17026, "$mergeCursors stage expected either an array or an object as argument", elem.type() == BSONType::Array); const auto serializedRemotes = elem.Array(); uassert(50729, "$mergeCursors stage expected array with at least one entry", serializedRemotes.size() > 0); boost::optional<NamespaceString> nss; std::vector<RemoteCursor> remotes; for (auto&& cursor : serializedRemotes) { BSONElement nsElem; BSONElement hostElem; BSONElement idElem; uassert(17027, "$mergeCursors stage requires each cursor in array to be an object", cursor.type() == BSONType::Object); for (auto&& cursorElem : cursor.Obj()) { const auto fieldName = cursorElem.fieldNameStringData(); if (fieldName == "ns"_sd) { nsElem = cursorElem; } else if (fieldName == "host"_sd) { hostElem = cursorElem; } else if (fieldName == "id"_sd) { idElem = cursorElem; } else { uasserted(50730, str::stream() << "Unrecognized option " << fieldName << " within cursor provided to $mergeCursors: " << cursor); } } uassert( 50731, "$mergeCursors stage requires \'ns\' field with type string for each cursor in array", nsElem.type() == BSONType::String); // We require each cursor to have the same namespace. This isn't a fundamental limit of the // system, but needs to be true due to the implementation of AsyncResultsMerger, which // tracks one namespace for all cursors. uassert(50720, "$mergeCursors requires each cursor to have the same namespace", !nss || nss->ns() == nsElem.valueStringData()); nss = NamespaceString(nsElem.String()); uassert( 50721, "$mergeCursors stage requires \'host\' field with type string for each cursor in array", hostElem.type() == BSONType::String); auto host = uassertStatusOK(HostAndPort::parse(hostElem.valueStringData())); uassert(50722, "$mergeCursors stage requires \'id\' field with type long for each cursor in array", idElem.type() == BSONType::NumberLong); auto cursorId = idElem.Long(); // We are assuming that none of the cursors have been iterated at all, and so will not have // any data in the initial batch. // TODO SERVER-33323 We use a fake shard id because the AsyncResultsMerger won't use it for // anything, and finding the real one is non-trivial. RemoteCursor remoteCursor; remoteCursor.setShardId(ShardId("fakeShardIdForMergeCursors")); remoteCursor.setHostAndPort(std::move(host)); std::vector<BSONObj> emptyBatch; remoteCursor.setCursorResponse(CursorResponse{*nss, cursorId, emptyBatch}); remotes.push_back(std::move(remoteCursor)); } invariant(nss); // We know there is at least one cursor in 'serializedRemotes', and we require // each cursor to have a 'ns' field. 
AsyncResultsMergerParams params; params.setRemotes(std::move(remotes)); params.setNss(*nss); return new DocumentSourceMergeCursors( Grid::get(expCtx->opCtx)->getExecutorPool()->getArbitraryExecutor(), std::move(params), expCtx, elem.embeddedObject().getOwned()); }
long long DeleteExecutor::execute(OperationContext* txn, Database* db) { uassertStatusOK(prepare()); uassert(17417, mongoutils::str::stream() << "DeleteExecutor::prepare() failed to parse query " << _request->getQuery(), _isQueryParsed); const bool logop = _request->shouldCallLogOp(); const NamespaceString& ns(_request->getNamespaceString()); if (!_request->isGod()) { if (ns.isSystem()) { uassert(12050, "cannot delete from system namespace", legalClientSystemNS(ns.ns(), true)); } if (ns.ns().find('$') != string::npos) { log() << "cannot delete from collection with reserved $ in name: " << ns << endl; uasserted( 10100, "cannot delete from collection with reserved $ in name" ); } } Collection* collection = db->getCollection(ns.ns()); if (NULL == collection) { return 0; } uassert(10101, str::stream() << "cannot remove from a capped collection: " << ns.ns(), !collection->isCapped()); uassert(ErrorCodes::NotMaster, str::stream() << "Not primary while removing from " << ns.ns(), !logop || replset::isMasterNs(ns.ns().c_str())); long long nDeleted = 0; Runner* rawRunner; if (_canonicalQuery.get()) { uassertStatusOK(getRunner(collection, _canonicalQuery.release(), &rawRunner)); } else { CanonicalQuery* ignored; uassertStatusOK(getRunner(collection, ns.ns(), _request->getQuery(), &rawRunner, &ignored)); } auto_ptr<Runner> runner(rawRunner); ScopedRunnerRegistration safety(runner.get()); DiskLoc rloc; Runner::RunnerState state; CurOp* curOp = cc().curop(); int oldYieldCount = curOp->numYields(); while (Runner::RUNNER_ADVANCED == (state = runner->getNext(NULL, &rloc))) { if (oldYieldCount != curOp->numYields()) { uassert(ErrorCodes::NotMaster, str::stream() << "No longer primary while removing from " << ns.ns(), !logop || replset::isMasterNs(ns.ns().c_str())); oldYieldCount = curOp->numYields(); } BSONObj toDelete; // TODO: do we want to buffer docs and delete them in a group rather than // saving/restoring state repeatedly? runner->saveState(); collection->deleteDocument(txn, rloc, false, false, logop ? &toDelete : NULL ); runner->restoreState(txn); nDeleted++; if (logop) { if ( toDelete.isEmpty() ) { problem() << "Deleted object without id in collection " << collection->ns() << ", not logging."; } else { bool replJustOne = true; replset::logOp(txn, "d", ns.ns().c_str(), toDelete, 0, &replJustOne); } } if (!_request->isMulti()) { break; } if (!_request->isGod()) { txn->recoveryUnit()->commitIfNeeded(); } if (debug && _request->isGod() && nDeleted == 100) { log() << "warning high number of deletes with god=true " << " which could use significant memory b/c we don't commit journal"; } } return nDeleted; }
ChunkVersion forceShardFilteringMetadataRefresh(OperationContext* opCtx, const NamespaceString& nss, bool forceRefreshFromThisThread) { invariant(!opCtx->lockState()->isLocked()); invariant(!opCtx->getClient()->isInDirectClient()); auto* const shardingState = ShardingState::get(opCtx); invariant(shardingState->canAcceptShardedCommands()); auto routingInfo = uassertStatusOK(Grid::get(opCtx)->catalogCache()->getCollectionRoutingInfoWithRefresh( opCtx, nss, forceRefreshFromThisThread)); auto cm = routingInfo.cm(); if (!cm) { // No chunk manager, so unsharded. // Exclusive collection lock needed since we're now changing the metadata AutoGetCollection autoColl(opCtx, nss, MODE_IX, MODE_X); CollectionShardingRuntime::get(opCtx, nss) ->setFilteringMetadata(opCtx, CollectionMetadata()); return ChunkVersion::UNSHARDED(); } // Optimistic check with only IS lock in order to avoid threads piling up on the collection X // lock below { AutoGetCollection autoColl(opCtx, nss, MODE_IS); auto optMetadata = CollectionShardingState::get(opCtx, nss)->getCurrentMetadataIfKnown(); // We already have newer version if (optMetadata) { const auto& metadata = *optMetadata; if (metadata->isSharded() && metadata->getCollVersion().epoch() == cm->getVersion().epoch() && metadata->getCollVersion() >= cm->getVersion()) { LOG(1) << "Skipping refresh of metadata for " << nss << " " << metadata->getCollVersion() << " with an older " << cm->getVersion(); return metadata->getShardVersion(); } } } // Exclusive collection lock needed since we're now changing the metadata AutoGetCollection autoColl(opCtx, nss, MODE_IX, MODE_X); auto* const css = CollectionShardingRuntime::get(opCtx, nss); { auto optMetadata = CollectionShardingState::get(opCtx, nss)->getCurrentMetadataIfKnown(); // We already have newer version if (optMetadata) { const auto& metadata = *optMetadata; if (metadata->isSharded() && metadata->getCollVersion().epoch() == cm->getVersion().epoch() && metadata->getCollVersion() >= cm->getVersion()) { LOG(1) << "Skipping refresh of metadata for " << nss << " " << metadata->getCollVersion() << " with an older " << cm->getVersion(); return metadata->getShardVersion(); } } } CollectionMetadata metadata(std::move(cm), shardingState->shardId()); const auto newShardVersion = metadata.getShardVersion(); css->setFilteringMetadata(opCtx, std::move(metadata)); return newShardVersion; }
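The refresh above performs an optimistic check under the shared (IS) lock and then re-checks under the exclusive lock before installing new metadata, so readers do not pile up on the exclusive lock. A compact sketch of that double-checked pattern using only std::shared_mutex; CachedVersion and refreshTo are illustrative names, not part of the real code:

    #include <mutex>
    #include <shared_mutex>

    class CachedVersion {
    public:
        // Returns whichever version ends up installed.
        long long refreshTo(long long newVersion) {
            {
                std::shared_lock<std::shared_mutex> readLock(_mutex);
                if (_version >= newVersion)   // we already have a newer (or equal) version
                    return _version;
            }
            std::unique_lock<std::shared_mutex> writeLock(_mutex);
            if (_version >= newVersion)       // re-check: someone may have refreshed meanwhile
                return _version;
            _version = newVersion;
            return _version;
        }

    private:
        std::shared_mutex _mutex;
        long long _version = 0;
    };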
void ChunkSplitter::_runAutosplit(const NamespaceString& nss, const BSONObj& min, const BSONObj& max, long dataWritten) { if (!_isPrimary) { return; } try { const auto opCtx = cc().makeOperationContext(); const auto routingInfo = uassertStatusOK( Grid::get(opCtx.get())->catalogCache()->getCollectionRoutingInfo(opCtx.get(), nss)); uassert(ErrorCodes::NamespaceNotSharded, "Could not split chunk. Collection is no longer sharded", routingInfo.cm()); const auto cm = routingInfo.cm(); const auto chunk = cm->findIntersectingChunkWithSimpleCollation(min); // Stop if chunk's range differs from the range we were expecting to split. if ((0 != chunk.getMin().woCompare(min)) || (0 != chunk.getMax().woCompare(max)) || (chunk.getShardId() != ShardingState::get(opCtx.get())->getShardName())) { LOG(1) << "Cannot auto-split chunk with range '" << redact(ChunkRange(min, max).toString()) << "' for nss '" << nss << "' on shard '" << ShardingState::get(opCtx.get())->getShardName() << "' because since scheduling auto-split the chunk has been changed to '" << redact(chunk.toString()) << "'"; return; } const ChunkRange chunkRange(chunk.getMin(), chunk.getMax()); const auto balancerConfig = Grid::get(opCtx.get())->getBalancerConfiguration(); // Ensure we have the most up-to-date balancer configuration uassertStatusOK(balancerConfig->refreshAndCheck(opCtx.get())); if (!balancerConfig->getShouldAutoSplit()) { return; } const uint64_t maxChunkSizeBytes = balancerConfig->getMaxChunkSizeBytes(); LOG(1) << "about to initiate autosplit: " << redact(chunk.toString()) << " dataWritten since last check: " << dataWritten << " maxChunkSizeBytes: " << maxChunkSizeBytes; auto splitPoints = uassertStatusOK(splitVector(opCtx.get(), nss, cm->getShardKeyPattern().toBSON(), chunk.getMin(), chunk.getMax(), false, boost::none, boost::none, boost::none, maxChunkSizeBytes)); if (splitPoints.size() <= 1) { // No split points means there isn't enough data to split on; 1 split point means we // have between half the chunk size to full chunk size so there is no need to split yet return; } // We assume that if the chunk being split is the first (or last) one on the collection, // this chunk is likely to see more insertions. Instead of splitting mid-chunk, we use the // very first (or last) key as a split point. // // This heuristic is skipped for "special" shard key patterns that are not likely to produce // monotonically increasing or decreasing values (e.g. hashed shard keys). // Keeps track of the minKey of the top chunk after the split so we can migrate the chunk. 
BSONObj topChunkMinKey; if (KeyPattern::isOrderedKeyPattern(cm->getShardKeyPattern().toBSON())) { if (0 == cm->getShardKeyPattern().getKeyPattern().globalMin().woCompare(chunk.getMin())) { // MinKey is infinity (This is the first chunk on the collection) BSONObj key = findExtremeKeyForShard(opCtx.get(), nss, cm->getShardKeyPattern(), true); if (!key.isEmpty()) { splitPoints.front() = key.getOwned(); topChunkMinKey = cm->getShardKeyPattern().getKeyPattern().globalMin(); } } else if (0 == cm->getShardKeyPattern().getKeyPattern().globalMax().woCompare( chunk.getMax())) { // MaxKey is infinity (This is the last chunk on the collection) BSONObj key = findExtremeKeyForShard(opCtx.get(), nss, cm->getShardKeyPattern(), false); if (!key.isEmpty()) { splitPoints.back() = key.getOwned(); topChunkMinKey = key.getOwned(); } } } uassertStatusOK(splitChunkAtMultiplePoints(opCtx.get(), chunk.getShardId(), nss, cm->getShardKeyPattern(), cm->getVersion(), chunkRange, splitPoints)); const bool shouldBalance = isAutoBalanceEnabled(opCtx.get(), nss, balancerConfig); log() << "autosplitted " << nss << " chunk: " << redact(chunk.toString()) << " into " << (splitPoints.size() + 1) << " parts (maxChunkSizeBytes " << maxChunkSizeBytes << ")" << (topChunkMinKey.isEmpty() ? "" : " (top chunk migration suggested" + (std::string)(shouldBalance ? ")" : ", but no migrations allowed)")); // Balance the resulting chunks if the autobalance option is enabled and if we split at the // first or last chunk on the collection as part of top chunk optimization. if (!shouldBalance || topChunkMinKey.isEmpty()) { return; } // Tries to move the top chunk out of the shard to prevent the hot spot from staying on a // single shard. This is based on the assumption that succeeding inserts will fall on the // top chunk. moveChunk(opCtx.get(), nss, topChunkMinKey); } catch (const DBException& ex) { log() << "Unable to auto-split chunk " << redact(ChunkRange(min, max).toString()) << " in nss " << nss << causedBy(redact(ex.toStatus())); } catch (const std::exception& e) { log() << "caught exception while splitting chunk: " << redact(e.what()); } }
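The top-chunk heuristic above can be illustrated in isolation: when the chunk being split is the first or last chunk of the collection, the nearest split point is replaced with the extreme key actually seen, so the hot end chunk stays small and can be migrated. A sketch with integer keys, assuming a non-empty list of split points; all names here are hypothetical:

    #include <climits>
    #include <vector>

    struct SplitDecision {
        std::vector<int> splitPoints;
        bool topChunkSuggested = false;
    };

    // Assumes splitPoints is non-empty (the caller above only reaches this
    // point with at least two split points).
    SplitDecision adjustForTopChunk(std::vector<int> splitPoints,
                                    int chunkMin, int chunkMax, int extremeKey,
                                    int globalMin = INT_MIN, int globalMax = INT_MAX) {
        SplitDecision out;
        if (chunkMin == globalMin) {
            // First chunk of the collection: split right after the lowest key seen.
            splitPoints.front() = extremeKey;
            out.topChunkSuggested = true;
        } else if (chunkMax == globalMax) {
            // Last chunk of the collection: split right before the highest key seen.
            splitPoints.back() = extremeKey;
            out.topChunkSuggested = true;
        }
        out.splitPoints = std::move(splitPoints);
        return out;
    }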
bool RunOnAllShardsCommand::run(OperationContext* txn, const std::string& dbName, BSONObj& cmdObj, int options, std::string& errmsg, BSONObjBuilder& output) { LOG(1) << "RunOnAllShardsCommand db: " << dbName << " cmd:" << cmdObj; if (_implicitCreateDb) { uassertStatusOK(ScopedShardDatabase::getOrCreate(txn, dbName)); } std::vector<ShardId> shardIds; getShardIds(txn, dbName, cmdObj, shardIds); std::list<std::shared_ptr<Future::CommandResult>> futures; for (const ShardId& shardId : shardIds) { const auto shard = grid.shardRegistry()->getShard(txn, shardId); if (!shard) { continue; } futures.push_back(Future::spawnCommand( shard->getConnString().toString(), dbName, cmdObj, 0, NULL, _useShardConn)); } std::vector<ShardAndReply> results; BSONObjBuilder subobj(output.subobjStart("raw")); BSONObjBuilder errors; int commonErrCode = -1; std::list<std::shared_ptr<Future::CommandResult>>::iterator futuresit; std::vector<ShardId>::const_iterator shardIdsIt; BSONElement wcErrorElem; ShardId wcErrorShardId; bool hasWCError = false; // We iterate over the set of shard ids and their corresponding futures in parallel. // TODO: replace with zip iterator if we ever decide to use one from Boost or elsewhere for (futuresit = futures.begin(), shardIdsIt = shardIds.cbegin(); futuresit != futures.end() && shardIdsIt != shardIds.end(); ++futuresit, ++shardIdsIt) { std::shared_ptr<Future::CommandResult> res = *futuresit; if (res->join(txn)) { // success :) BSONObj result = res->result(); results.emplace_back(shardIdsIt->toString(), result); subobj.append(res->getServer(), result); if (!hasWCError) { if ((wcErrorElem = result["writeConcernError"])) { wcErrorShardId = *shardIdsIt; hasWCError = true; } } continue; } BSONObj result = res->result(); if (!hasWCError) { if ((wcErrorElem = result["writeConcernError"])) { wcErrorShardId = *shardIdsIt; hasWCError = true; } } if (result["errmsg"].type() || result["code"].numberInt() != 0) { result = specialErrorHandler(res->getServer(), dbName, cmdObj, result); BSONElement errmsgObj = result["errmsg"]; if (errmsgObj.eoo() || errmsgObj.String().empty()) { // it was fixed! results.emplace_back(shardIdsIt->toString(), result); subobj.append(res->getServer(), result); continue; } } // Handle "errmsg". if (!result["errmsg"].eoo()) { errors.appendAs(result["errmsg"], res->getServer()); } else { // Can happen if message is empty, for some reason errors.append(res->getServer(), str::stream() << "result without error message returned : " << result); } // Handle "code". int errCode = result["code"].numberInt(); if (commonErrCode == -1) { commonErrCode = errCode; } else if (commonErrCode != errCode) { commonErrCode = 0; } results.emplace_back(shardIdsIt->toString(), result); subobj.append(res->getServer(), result); } subobj.done(); if (hasWCError) { appendWriteConcernErrorToCmdResponse(wcErrorShardId, wcErrorElem, output); } BSONObj errobj = errors.done(); if (!errobj.isEmpty()) { errmsg = errobj.toString(); // If every error has a code, and the code for all errors is the same, then add // a top-level field "code" with this value to the output object. if (commonErrCode > 0) { output.append("code", commonErrCode); } return false; } aggregateResults(results, output); return true; }
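The per-shard error codes above are folded into a single "common" code with three states: -1 means no error seen yet, 0 means the shards disagreed, and a positive value means every shard failed with that same code (which is then worth surfacing as a top-level "code" field). A small sketch of just that folding rule, with an illustrative helper name:

    #include <vector>

    int foldCommonErrorCode(const std::vector<int>& perShardErrorCodes) {
        int commonErrCode = -1;
        for (int errCode : perShardErrorCodes) {
            if (commonErrCode == -1) {
                commonErrCode = errCode;          // first error seen
            } else if (commonErrCode != errCode) {
                commonErrCode = 0;                // codes disagree; report nothing
            }
        }
        return commonErrCode;
    }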
void operator()( DBClientCursorBatchIterator &i ) { Lock::GlobalWrite lk; context.relocked(); bool createdCollection = false; Collection* collection = NULL; while( i.moreInCurrentBatch() ) { if ( numSeen % 128 == 127 /*yield some*/ ) { collection = NULL; time_t now = time(0); if( now - lastLog >= 60 ) { // report progress if( lastLog ) log() << "clone " << to_collection << ' ' << numSeen << endl; lastLog = now; } mayInterrupt( _mayBeInterrupted ); dbtempreleaseif t( _mayYield ); } if ( isindex == false && collection == NULL ) { collection = context.db()->getCollection( to_collection ); if ( !collection ) { massert( 17321, str::stream() << "collection dropped during clone [" << to_collection << "]", !createdCollection ); createdCollection = true; collection = context.db()->createCollection( txn, to_collection ); verify( collection ); } } BSONObj tmp = i.nextSafe(); /* assure object is valid. note this will slow us down a little. */ const Status status = validateBSON(tmp.objdata(), tmp.objsize()); if (!status.isOK()) { out() << "Cloner: skipping corrupt object from " << from_collection << ": " << status.reason(); continue; } ++numSeen; BSONObj js = tmp; if ( isindex ) { verify(nsToCollectionSubstring(from_collection) == "system.indexes"); js = fixindex(context.db()->name(), tmp); indexesToBuild->push_back( js.getOwned() ); continue; } verify(nsToCollectionSubstring(from_collection) != "system.indexes"); StatusWith<DiskLoc> loc = collection->insertDocument( txn, js, true ); if ( !loc.isOK() ) { error() << "error: exception cloning object in " << from_collection << ' ' << loc.toString() << " obj:" << js; } uassertStatusOK( loc.getStatus() ); if ( logForRepl ) logOp(txn, "i", to_collection, js); txn->commitIfNeeded(); RARELY if ( time( 0 ) - saveLast > 60 ) { log() << numSeen << " objects cloned so far from collection " << from_collection; saveLast = time( 0 ); } } }
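The clone loop above paces itself: every 128 documents it considers yielding, and it logs progress at most once a minute. A stripped-down sketch of that cadence, with placeholder names and plain std::cout in place of the server's logger:

    #include <ctime>
    #include <iostream>

    void noteDocumentCloned(long long& numSeen, std::time_t& lastLog) {
        ++numSeen;
        if (numSeen % 128 == 127) {            // same cadence as the clone loop above
            std::time_t now = std::time(nullptr);
            if (now - lastLog >= 60) {         // report progress at most once a minute
                if (lastLog)
                    std::cout << "cloned " << numSeen << " documents so far\n";
                lastLog = now;
            }
            // the real loop also yields the lock and checks for interrupt here
        }
    }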
DiskLoc DataFileMgr::insert(const char* ns, const void* obuf, int32_t len, bool mayInterrupt, bool god, bool mayAddIndex, bool* addedID) { Database* database = cc().database(); bool wouldAddIndex = false; massert( 10093 , "cannot insert into reserved $ collection", god || NamespaceString::normal( ns ) ); uassert( 10094 , str::stream() << "invalid ns: " << ns , isValidNS( ns ) ); { const char *sys = strstr(ns, "system."); if ( sys ) { if ( !insert_checkSys(sys, ns, wouldAddIndex, obuf, god) ) return DiskLoc(); if ( mayAddIndex && wouldAddIndex ) { // TODO: this should be handled above this function BSONObj spec( static_cast<const char*>( obuf ) ); string collectionToIndex = spec.getStringField( "ns" ); uassert(10096, "invalid ns to index", collectionToIndex.find( '.' ) != string::npos); massert(10097, str::stream() << "trying to create index on wrong db " << " db: " << database->name() << " collection: " << collectionToIndex, database->ownsNS( collectionToIndex ) ); Collection* collection = database->getCollection( collectionToIndex ); if ( !collection ) { collection = database->createCollection( collectionToIndex, false, NULL, true ); verify( collection ); if ( !god ) ensureIdIndexForNewNs( collection ); } Status status = collection->getIndexCatalog()->createIndex( spec, mayInterrupt ); if ( status.code() == ErrorCodes::IndexAlreadyExists ) return DiskLoc(); uassertStatusOK( status ); return DiskLoc(); } } } Collection* collection = database->getCollection( ns ); if ( collection == NULL ) { collection = database->createCollection( ns, false, NULL, false ); int ies = Extent::initialSize(len); if( str::contains(ns, '$') && len + Record::HeaderSize >= BtreeData_V1::BucketSize - 256 && len + Record::HeaderSize <= BtreeData_V1::BucketSize + 256 ) { // probably an index. so we pick a value here for the first extent instead of using // initialExtentSize() which is more for user collections. // TODO: we could look at the # of records in the parent collection to be smarter here. ies = (32+4) * 1024; } collection->increaseStorageSize( ies, false); if ( !god ) ensureIdIndexForNewNs( collection ); } NamespaceDetails* d = collection->details(); IDToInsert idToInsert; // only initialized if needed if( !god ) { /* Check if we have an _id field. If we don't, we'll add it. Note that btree buckets which we insert aren't BSONObj's, but in that case god==true. */ BSONObj io((const char *) obuf); BSONElement idField = io.getField( "_id" ); uassert( 10099 , "_id cannot be an array", idField.type() != Array ); // we don't add _id for capped collections in local as they don't have an _id index if( idField.eoo() && !wouldAddIndex && nsToDatabase( ns ) != "local" && d->haveIdIndex() ) { if( addedID ) *addedID = true; idToInsert.init(); len += idToInsert.size(); } BSONElementManipulator::lookForTimestamps( io ); } int lenWHdr = d->getRecordAllocationSize( len + Record::HeaderSize ); fassert( 16440, lenWHdr >= ( len + Record::HeaderSize ) ); // If the collection is capped, check if the new object will violate a unique index // constraint before allocating space. 
if ( d->isCapped() && !god) { BSONObj temp = BSONObj( reinterpret_cast<const char *>( obuf ) ); Status ret = collection->getIndexCatalog()->checkNoIndexConflicts( temp ); uassert(12582, "duplicate key insert for unique index of capped collection", ret.isOK() ); } DiskLoc loc = allocateSpaceForANewRecord(ns, d, lenWHdr, god); if ( loc.isNull() ) { string errmsg = str::stream() << "insert: couldn't alloc space for object ns:" << ns << " capped:" << d->isCapped(); log() << errmsg; uasserted( 17248, errmsg ); } Record *r = loc.rec(); { verify( r->lengthWithHeaders() >= lenWHdr ); r = (Record*) getDur().writingPtr(r, lenWHdr); if( idToInsert.needed() ) { /* a little effort was made here to avoid a double copy when we add an ID */ int originalSize = *((int*) obuf); ((int&)*r->data()) = originalSize + idToInsert.size(); memcpy(r->data()+4, idToInsert.rawdata(), idToInsert.size()); memcpy(r->data()+4+idToInsert.size(), ((char*)obuf)+4, originalSize-4); } else { if( obuf ) // obuf can be null from internal callers memcpy(r->data(), obuf, len); } } addRecordToRecListInExtent(r, loc); d->incrementStats( r->netLength(), 1 ); // we don't bother resetting query optimizer stats for the god tables - also god is true when adding a btree bucket if ( !god ) collection->infoCache()->notifyOfWriteOp(); /* add this record to our indexes */ if ( d->getTotalIndexCount() > 0 ) { try { BSONObj obj(r->data()); collection->getIndexCatalog()->indexRecord(obj, loc); } catch( AssertionException& e ) { // should be a dup key error on _id index if( d->isCapped() ) { massert( 12583, "unexpected index insertion failure on capped collection", !d->isCapped() ); string s = e.toString(); s += " : on addIndex/capped - collection and its index will not match"; setLastError(0, s.c_str()); error() << s << endl; } else { // normal case -- we can roll back _deleteRecord(d, ns, r, loc); throw; } } } d->paddingFits(); return loc; }
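The _id splice above avoids copying the object twice by writing the adjusted length, then the new element, then the remainder of the original buffer. A self-contained sketch of the same splice over a [int32 length][payload] buffer; the helper name is made up, and the assumptions are noted in the comment:

    #include <cstdint>
    #include <cstring>
    #include <vector>

    // Assumes 'original' starts with its own 4-byte length prefix and
    // 'toInsert' is non-empty (mirroring the generated _id element above).
    std::vector<char> spliceAfterLengthPrefix(const std::vector<char>& original,
                                              const std::vector<char>& toInsert) {
        const std::int32_t originalSize = static_cast<std::int32_t>(original.size());
        const std::int32_t newSize = originalSize + static_cast<std::int32_t>(toInsert.size());

        std::vector<char> out(newSize);
        std::memcpy(out.data(), &newSize, sizeof(newSize));              // adjusted total length
        std::memcpy(out.data() + 4, toInsert.data(), toInsert.size());   // the inserted element
        std::memcpy(out.data() + 4 + toInsert.size(),                    // tail of the original,
                    original.data() + 4, originalSize - 4);              // skipping its own prefix
        return out;
    }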
void _initAndListen(int listenPort ) { Client::initThread("initandlisten"); bool is32bit = sizeof(int*) == 4; { ProcessId pid = ProcessId::getCurrent(); LogstreamBuilder l = log(); l << "MongoDB starting : pid=" << pid << " port=" << serverGlobalParams.port << " dbpath=" << storageGlobalParams.dbpath; if( replSettings.master ) l << " master=" << replSettings.master; if( replSettings.slave ) l << " slave=" << (int) replSettings.slave; l << ( is32bit ? " 32" : " 64" ) << "-bit host=" << getHostNameCached() << endl; } DEV log() << "_DEBUG build (which is slower)" << endl; logStartupWarnings(); #if defined(_WIN32) printTargetMinOS(); #endif logProcessDetails(); { stringstream ss; ss << endl; ss << "*********************************************************************" << endl; ss << " ERROR: dbpath (" << storageGlobalParams.dbpath << ") does not exist." << endl; ss << " Create this directory or give existing directory in --dbpath." << endl; ss << " See http://dochub.mongodb.org/core/startingandstoppingmongo" << endl; ss << "*********************************************************************" << endl; uassert(10296, ss.str().c_str(), boost::filesystem::exists(storageGlobalParams.dbpath)); } { stringstream ss; ss << "repairpath (" << storageGlobalParams.repairpath << ") does not exist"; uassert(12590, ss.str().c_str(), boost::filesystem::exists(storageGlobalParams.repairpath)); } // TODO check non-journal subdirs if using directory-per-db checkReadAhead(storageGlobalParams.dbpath); acquirePathLock(mongodGlobalParams.repair); boost::filesystem::remove_all(storageGlobalParams.dbpath + "/_tmp/"); FileAllocator::get()->start(); // TODO: This should go into a MONGO_INITIALIZER once we have figured out the correct // dependencies. if (snmpInit) { snmpInit(); } MONGO_ASSERT_ON_EXCEPTION_WITH_MSG( clearTmpFiles(), "clear tmp files" ); dur::startup(); if (storageGlobalParams.durOptions & StorageGlobalParams::DurRecoverOnly) return; unsigned long long missingRepl = checkIfReplMissingFromCommandLine(); if (missingRepl) { log() << startupWarningsLog; log() << "** WARNING: mongod started without --replSet yet " << missingRepl << " documents are present in local.system.replset" << startupWarningsLog; log() << "** Restart with --replSet unless you are doing maintenance and no" << " other clients are connected." << startupWarningsLog; log() << "** The TTL collection monitor will not start because of this." << startupWarningsLog; log() << "** For more info see http://dochub.mongodb.org/core/ttlcollections" << startupWarningsLog; log() << startupWarningsLog; } if (mongodGlobalParams.scriptingEnabled) { ScriptEngine::setup(); globalScriptEngine->setCheckInterruptCallback( jsInterruptCallback ); globalScriptEngine->setGetCurrentOpIdCallback( jsGetCurrentOpIdCallback ); } // On replica set members we only clear temp collections on DBs other than "local" during // promotion to primary. On pure slaves, they are only cleared when the oplog tells them to. // The local DB is special because it is not replicated. See SERVER-10927 for more details. 
const bool shouldClearNonLocalTmpCollections = !(missingRepl || replSettings.usingReplSets() || replSettings.slave == SimpleSlave); repairDatabasesAndCheckVersion(shouldClearNonLocalTmpCollections); if (mongodGlobalParams.upgrade) return; uassertStatusOK(getGlobalAuthorizationManager()->initialize()); /* this is for security on certain platforms (nonce generation) */ srand((unsigned) (curTimeMicros() ^ startupSrandTimer.micros())); // The snapshot thread provides historical collection level and lock statistics for use // by the web interface. Only needed when HTTP is enabled. if (serverGlobalParams.isHttpInterfaceEnabled) snapshotThread.go(); d.clientCursorMonitor.go(); PeriodicTask::startRunningPeriodicTasks(); if (missingRepl) { // a warning was logged earlier } else { startTTLBackgroundJob(); } #ifndef _WIN32 mongo::signalForkSuccess(); #endif if(getGlobalAuthorizationManager()->isAuthEnabled()) { // open admin db in case we need to use it later. TODO this is not the right way to // resolve this. Client::WriteContext c("admin", storageGlobalParams.dbpath); } authindex::configureSystemIndexes("admin"); getDeleter()->startWorkers(); // Starts a background thread that rebuilds all incomplete indices. indexRebuilder.go(); listen(listenPort); // listen() will return when exit code closes its socket. exitCleanly(EXIT_NET_ERROR); }
void KVRecordStoreCapped::deleteAsNeeded(OperationContext *txn) { if (!needsDelete(txn)) { // nothing to do return; } // Only one thread should do deletes at a time, otherwise they'll conflict. boost::mutex::scoped_lock lock(_cappedDeleteMutex, boost::defer_lock); if (_cappedMaxDocs != -1) { lock.lock(); } else { if (!lock.try_lock()) { // Someone else is deleting old records. Apply back-pressure if too far behind, // otherwise continue. if ((dataSize(txn) - _cappedMaxSize) < _cappedMaxSizeSlack) return; lock.lock(); // If we already waited, let someone else do cleanup unless we are significantly // over the limit. if ((dataSize(txn) - _cappedMaxSize) < (2 * _cappedMaxSizeSlack)) return; } } // we do this is a side transaction in case it aborts TempRecoveryUnitSwap swap(txn); int64_t ds = dataSize(txn); int64_t nr = numRecords(txn); int64_t sizeOverCap = (ds > _cappedMaxSize) ? ds - _cappedMaxSize : 0; int64_t sizeSaved = 0; int64_t docsOverCap = (_cappedMaxDocs != -1 && nr > _cappedMaxDocs) ? nr - _cappedMaxDocs : 0; int64_t docsRemoved = 0; try { WriteUnitOfWork wuow(txn); // We're going to notify the underlying store that we've // deleted this range of ids. In TokuFT, this will trigger an // optimize. RecordId firstDeleted, lastDeleted; Timer t; // Delete documents while we are over-full and the iterator has more. // // Note that the iterator we get has the _idTracker's logic // already built in, so we don't need to worry about deleting // records that are not yet committed, including the one we // just inserted for (boost::scoped_ptr<RecordIterator> iter(getIterator(txn)); ((sizeSaved < sizeOverCap || docsRemoved < docsOverCap) && !iter->isEOF()); ) { const RecordId oldest = iter->getNext(); ++docsRemoved; sizeSaved += iter->dataFor(oldest).size(); if (_cappedDeleteCallback) { // need to notify higher layers that a RecordId is about to be deleted uassertStatusOK(_cappedDeleteCallback->aboutToDeleteCapped(txn, oldest, iter->dataFor(oldest))); } deleteRecord(txn, oldest); if (firstDeleted.isNull()) { firstDeleted = oldest; } dassert(oldest > lastDeleted); lastDeleted = oldest; // Now, decide whether to keep working, we want to balance // staying on top of the deletion workload with the // latency of the client that's doing the deletes for // everyone. if (sizeOverCap >= _cappedMaxSizeSlack) { // If we're over the slack amount, everyone's going to // block on us anyway, so we may as well keep working. continue; } if (sizeOverCap < (_cappedMaxSizeSlack / 4) && docsRemoved >= 1000) { // If we aren't too much over and we've done a fair // amount of work, take a break. break; } else if (docsRemoved % 1000 == 0 && t.seconds() >= 4) { // If we're under the slack amount and we've already // spent a second working on this, return and give // someone else a chance to shoulder that latency. break; } } if (docsRemoved > 0) { _db->justDeletedCappedRange(txn, Slice::of(KeyString(firstDeleted)), Slice::of(KeyString(lastDeleted)), sizeSaved, docsRemoved); wuow.commit(); dassert(lastDeleted > _lastDeletedId); _lastDeletedId = lastDeleted; } } catch (WriteConflictException) { log() << "Got conflict truncating capped, ignoring."; return; } }
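The loop above balances staying on top of the deletion workload against the latency of the one client doing the deletes for everyone. That decision can be sketched separately; names are illustrative and the thresholds mirror the ones in the code above:

    enum class CappedDeletePace { KeepDeleting, TakeABreak };

    CappedDeletePace decidePace(long long sizeOverCap, long long slackBytes,
                                long long docsRemoved, int secondsSpent) {
        if (sizeOverCap >= slackBytes)
            return CappedDeletePace::KeepDeleting;      // far behind: everyone blocks on us anyway
        if (sizeOverCap < slackBytes / 4 && docsRemoved >= 1000)
            return CappedDeletePace::TakeABreak;        // close enough and worked enough
        if (docsRemoved > 0 && docsRemoved % 1000 == 0 && secondsSpent >= 4)
            return CappedDeletePace::TakeABreak;        // spent long enough on this pass
        return CappedDeletePace::KeepDeleting;
    }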
// throws DBException void buildAnIndex( OperationContext* txn, Collection* collection, IndexCatalogEntry* btreeState, bool mayInterrupt ) { const string ns = collection->ns().ns(); // our copy verify(txn->lockState()->isWriteLocked(ns)); const IndexDescriptor* idx = btreeState->descriptor(); const BSONObj& idxInfo = idx->infoObj(); LOG(0) << "build index on: " << ns << " properties: " << idx->toString() << endl; audit::logCreateIndex( currentClient.get(), &idxInfo, idx->indexName(), ns ); Timer t; // this is so that people know there are more keys to look at when doing // things like in place updates, etc... collection->infoCache()->addedIndex(); if ( collection->numRecords() == 0 ) { Status status = btreeState->accessMethod()->initializeAsEmpty(txn); massert( 17343, str::stream() << "IndexAccessMethod::initializeAsEmpty failed" << status.toString(), status.isOK() ); LOG(0) << "\t added index to empty collection"; return; } scoped_ptr<BackgroundOperation> backgroundOperation; bool doInBackground = false; if ( idxInfo["background"].trueValue() && !inDBRepair ) { doInBackground = true; backgroundOperation.reset( new BackgroundOperation(ns) ); uassert( 13130, "can't start bg index b/c in recursive lock (db.eval?)", !txn->lockState()->isRecursive() ); log() << "\t building index in background"; } Status status = btreeState->accessMethod()->initializeAsEmpty(txn); massert( 17342, str::stream() << "IndexAccessMethod::initializeAsEmpty failed" << status.toString(), status.isOK() ); IndexAccessMethod* bulk = doInBackground ? NULL : btreeState->accessMethod()->initiateBulk(txn); scoped_ptr<IndexAccessMethod> bulkHolder(bulk); IndexAccessMethod* iam = bulk ? bulk : btreeState->accessMethod(); if ( bulk ) log() << "\t building index using bulk method"; unsigned long long n = addExistingToIndex( txn, collection, btreeState->descriptor(), iam, doInBackground ); if ( bulk ) { LOG(1) << "\t bulk commit starting"; std::set<DiskLoc> dupsToDrop; Status status = btreeState->accessMethod()->commitBulk( bulk, mayInterrupt, &dupsToDrop ); // Code above us expects a uassert in case of dupkey errors. if (ErrorCodes::DuplicateKey == status.code()) { uassertStatusOK(status); } // Any other errors are probably bad and deserve a massert. massert( 17398, str::stream() << "commitBulk failed: " << status.toString(), status.isOK() ); if ( dupsToDrop.size() ) log() << "\t bulk dropping " << dupsToDrop.size() << " dups"; for( set<DiskLoc>::const_iterator i = dupsToDrop.begin(); i != dupsToDrop.end(); ++i ) { BSONObj toDelete; collection->deleteDocument( txn, *i, false /* cappedOk */, true /* noWarn */, &toDelete ); if (repl::getGlobalReplicationCoordinator()->canAcceptWritesForDatabase( collection->ns().db())) { repl::logOp(txn, "d", ns.c_str(), toDelete); } txn->recoveryUnit()->commitIfNeeded(); RARELY if ( mayInterrupt ) { txn->checkForInterrupt(); } } } verify( !btreeState->head().isNull() ); LOG(0) << "build index done. scanned " << n << " total records. " << t.millis() / 1000.0 << " secs" << endl; // this one is so people know that the index is finished collection->infoCache()->addedIndex(); }
ConnectionPool::ConnectionList::iterator ConnectionPool::acquireConnection( const HostAndPort& target, Date_t now, Milliseconds timeout) { stdx::unique_lock<stdx::mutex> lk(_mutex); // Clean up connections on stale/unused hosts _cleanUpStaleHosts_inlock(now); for (HostConnectionMap::iterator hostConns; (hostConns = _connections.find(target)) != _connections.end();) { // Clean up the requested host to remove stale/unused connections _cleanUpOlderThan_inlock(now, &hostConns->second); if (hostConns->second.empty()) { // prevent host from causing unnecessary cleanups _lastUsedHosts[hostConns->first] = kNeverTooStale; break; } _inUseConnections.splice( _inUseConnections.begin(), hostConns->second, hostConns->second.begin()); const ConnectionList::iterator candidate = _inUseConnections.begin(); lk.unlock(); try { if (candidate->conn->isStillConnected()) { // setSoTimeout takes a double representing the number of seconds for send and // receive timeouts. Thus, we must express 'timeout' in milliseconds and divide by // 1000.0 to get the number of seconds with a fractional part. candidate->conn->setSoTimeout(durationCount<Milliseconds>(timeout) / 1000.0); return candidate; } } catch (...) { lk.lock(); _destroyConnection_inlock(&_inUseConnections, candidate); throw; } lk.lock(); _destroyConnection_inlock(&_inUseConnections, candidate); } // No idle connection in the pool; make a new one. lk.unlock(); std::unique_ptr<DBClientConnection> conn; if (_hook) { conn.reset(new DBClientConnection( false, // auto reconnect 0, // socket timeout {}, // MongoURI [this, target](const executor::RemoteCommandResponse& isMasterReply) { return _hook->validateHost(target, isMasterReply); })); } else { conn.reset(new DBClientConnection()); } // setSoTimeout takes a double representing the number of seconds for send and receive // timeouts. Thus, we must express 'timeout' in milliseconds and divide by 1000.0 to get // the number of seconds with a fractional part. conn->setSoTimeout(durationCount<Milliseconds>(timeout) / 1000.0); uassertStatusOK(conn->connect(target, StringData())); conn->port().setTag(conn->port().getTag() | _messagingPortTags); if (isInternalAuthSet()) { conn->auth(getInternalUserAuthParams()); } if (_hook) { auto postConnectRequest = uassertStatusOK(_hook->makeRequest(target)); // We might not have a postConnectRequest if (postConnectRequest != boost::none) { auto start = Date_t::now(); auto reply = conn->runCommandWithMetadata(postConnectRequest->dbname, postConnectRequest->cmdObj.firstElementFieldName(), postConnectRequest->metadata, postConnectRequest->cmdObj); auto rcr = executor::RemoteCommandResponse(reply->getCommandReply().getOwned(), reply->getMetadata().getOwned(), Date_t::now() - start); uassertStatusOK(_hook->handleReply(target, std::move(rcr))); } } lk.lock(); return _inUseConnections.insert(_inUseConnections.begin(), ConnectionInfo(conn.release(), now)); }
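The setSoTimeout conversion noted twice above is just milliseconds to fractional seconds. In plain std::chrono, for illustration:

    #include <chrono>

    double toFractionalSeconds(std::chrono::milliseconds timeout) {
        // e.g. 2500ms -> 2.5, matching the "divide by 1000.0" comments above
        return std::chrono::duration_cast<std::chrono::duration<double>>(timeout).count();
    }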
shared_ptr<ChunkManager> ChunkManager::reload(OperationContext* txn, bool force) const {
    const NamespaceString nss(_ns);
    auto config = uassertStatusOK(grid.catalogCache()->getDatabase(txn, nss.db().toString()));
    return config->getChunkManagerIfExists(txn, getns(), force);
}
void Collection::_compactExtent(const DiskLoc diskloc, int extentNumber, vector<IndexAccessMethod*>& indexesToInsertTo, const CompactOptions* compactOptions, CompactStats* stats ) { log() << "compact begin extent #" << extentNumber << " for namespace " << _ns << " " << diskloc; unsigned oldObjSize = 0; // we'll report what the old padding was unsigned oldObjSizeWithPadding = 0; Extent *e = diskloc.ext(); e->assertOk(); verify( e->validates(diskloc) ); { // the next/prev pointers within the extent might not be in order so we first // page the whole thing in sequentially log() << "compact paging in len=" << e->length/1000000.0 << "MB" << endl; Timer t; size_t length = e->length; touch_pages( reinterpret_cast<const char*>(e), length ); int ms = t.millis(); if( ms > 1000 ) log() << "compact end paging in " << ms << "ms " << e->length/1000000.0/t.seconds() << "MB/sec" << endl; } { log() << "compact copying records" << endl; long long datasize = 0; long long nrecords = 0; DiskLoc L = e->firstRecord; if( !L.isNull() ) { while( 1 ) { Record *recOld = L.rec(); L = getExtentManager()->getNextRecordInExtent(L); BSONObj objOld = BSONObj::make(recOld); if ( compactOptions->validateDocuments && !objOld.valid() ) { // object is corrupt! log() << "compact skipping corrupt document!"; stats->corruptDocuments++; } else { unsigned docSize = objOld.objsize(); nrecords++; oldObjSize += docSize; oldObjSizeWithPadding += recOld->netLength(); unsigned lenWHdr = docSize + Record::HeaderSize; unsigned lenWPadding = lenWHdr; switch( compactOptions->paddingMode ) { case CompactOptions::NONE: if ( details()->isUserFlagSet(NamespaceDetails::Flag_UsePowerOf2Sizes) ) lenWPadding = details()->quantizePowerOf2AllocationSpace(lenWPadding); break; case CompactOptions::PRESERVE: // if we are preserving the padding, the record should not change size lenWPadding = recOld->lengthWithHeaders(); break; case CompactOptions::MANUAL: lenWPadding = compactOptions->computeRecordSize(lenWPadding); if (lenWPadding < lenWHdr || lenWPadding > BSONObjMaxUserSize / 2 ) { lenWPadding = lenWHdr; } break; } CompactDocWriter writer( objOld, lenWPadding ); StatusWith<DiskLoc> status = _recordStore->insertRecord( &writer, 0 ); uassertStatusOK( status.getStatus() ); datasize += _recordStore->recordFor( status.getValue() )->netLength(); InsertDeleteOptions options; options.logIfError = false; options.dupsAllowed = true; // in compact we should be doing no checking for ( size_t i = 0; i < indexesToInsertTo.size(); i++ ) { Status idxStatus = indexesToInsertTo[i]->insert( objOld, status.getValue(), options, NULL ); uassertStatusOK( idxStatus ); } } if( L.isNull() ) { // we just did the very last record from the old extent. 
// it's still pointed to by the old extent ext, but that will be fixed below after this loop
break; } // remove the old records (orphan them) periodically so our commit block doesn't get too large bool stopping = false; RARELY stopping = *killCurrentOp.checkForInterruptNoAssert() != 0; if( stopping || getDur().aCommitIsNeeded() ) { e->firstRecord.writing() = L; Record *r = L.rec(); getDur().writingInt(r->prevOfs()) = DiskLoc::NullOfs; getDur().commitIfNeeded(); killCurrentOp.checkForInterrupt(false); } } } // if !L.isNull() verify( details()->firstExtent() == diskloc ); verify( details()->lastExtent() != diskloc ); DiskLoc newFirst = e->xnext; details()->firstExtent().writing() = newFirst; newFirst.ext()->xprev.writing().Null(); getDur().writing(e)->markEmpty(); getExtentManager()->freeExtents( diskloc, diskloc ); getDur().commitIfNeeded(); { double op = 1.0; if( oldObjSize ) op = static_cast<double>(oldObjSizeWithPadding)/oldObjSize; log() << "compact finished extent #" << extentNumber << " containing " << nrecords << " documents (" << datasize/1000000.0 << "MB)" << " oldPadding: " << op << ' ' << static_cast<unsigned>(op*100.0)/100; } } }
StatusWith<ResolvedView> ViewCatalog::resolveView(OperationContext* opCtx, const NamespaceString& nss) { stdx::unique_lock<stdx::mutex> lock(_mutex); // Keep looping until the resolution completes. If the catalog is invalidated during the // resolution, we start over from the beginning. while (true) { // Points to the name of the most resolved namespace. const NamespaceString* resolvedNss = &nss; // Holds the combination of all the resolved views. std::vector<BSONObj> resolvedPipeline; // If the catalog has not been tampered with, all views seen during the resolution will have // the same collation. As an optimization, we fill out the collation spec only once. boost::optional<BSONObj> collation; // The last seen view definition, which owns the NamespaceString pointed to by // 'resolvedNss'. std::shared_ptr<ViewDefinition> lastViewDefinition; int depth = 0; for (; depth < ViewGraph::kMaxViewDepth; depth++) { while (MONGO_FAIL_POINT(hangDuringViewResolution)) { log() << "Yielding mutex and hanging due to 'hangDuringViewResolution' failpoint"; lock.unlock(); sleepmillis(1000); lock.lock(); } // If the catalog has been invalidated, bail and restart. if (!_valid.load()) { uassertStatusOK(_reloadIfNeeded_inlock(opCtx)); break; } auto view = _lookup_inlock(opCtx, resolvedNss->ns()); if (!view) { // Return error status if pipeline is too large. int pipelineSize = 0; for (auto obj : resolvedPipeline) { pipelineSize += obj.objsize(); } if (pipelineSize > ViewGraph::kMaxViewPipelineSizeBytes) { return {ErrorCodes::ViewPipelineMaxSizeExceeded, str::stream() << "View pipeline exceeds maximum size; maximum size is " << ViewGraph::kMaxViewPipelineSizeBytes}; } return StatusWith<ResolvedView>( {*resolvedNss, std::move(resolvedPipeline), std::move(collation.get())}); } resolvedNss = &view->viewOn(); if (!collation) { collation = view->defaultCollator() ? view->defaultCollator()->getSpec().toBSON() : CollationSpec::kSimpleSpec; } // Prepend the underlying view's pipeline to the current working pipeline. const std::vector<BSONObj>& toPrepend = view->pipeline(); resolvedPipeline.insert(resolvedPipeline.begin(), toPrepend.begin(), toPrepend.end()); // If the first stage is a $collStats, then we return early with the viewOn namespace. if (toPrepend.size() > 0 && !toPrepend[0]["$collStats"].eoo()) { return StatusWith<ResolvedView>( {*resolvedNss, std::move(resolvedPipeline), std::move(collation.get())}); } } if (depth >= ViewGraph::kMaxViewDepth) { return {ErrorCodes::ViewDepthLimitExceeded, str::stream() << "View depth too deep or view cycle detected; maximum depth is " << ViewGraph::kMaxViewDepth}; } }; MONGO_UNREACHABLE; }
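Stripped of locking, fail points, and collation handling, the resolution loop above follows viewOn links, prepends each view's pipeline, and enforces a depth limit that also catches cycles. A self-contained sketch with a stand-in catalog type (not the real ViewCatalog):

    #include <map>
    #include <stdexcept>
    #include <string>
    #include <vector>

    struct ViewDef {
        std::string viewOn;                  // namespace this view is defined on
        std::vector<std::string> pipeline;   // stages, outermost first
    };

    struct Resolved {
        std::string collection;              // the underlying, non-view namespace
        std::vector<std::string> pipeline;   // fully combined pipeline
    };

    Resolved resolveViewSketch(const std::map<std::string, ViewDef>& catalog,
                               std::string nss, int maxDepth = 20) {
        std::vector<std::string> combined;
        for (int depth = 0; depth < maxDepth; ++depth) {
            auto it = catalog.find(nss);
            if (it == catalog.end())
                return {nss, combined};      // reached a real collection: done
            // Prepend the underlying view's pipeline to the working pipeline.
            combined.insert(combined.begin(),
                            it->second.pipeline.begin(), it->second.pipeline.end());
            nss = it->second.viewOn;
        }
        throw std::runtime_error("view depth too deep or view cycle detected");
    }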
virtual bool run(OperationContext* opCtx, const string&, const BSONObj& cmdObj, BSONObjBuilder& result) { /* currently request to arbiter is (somewhat arbitrarily) an ismaster request that is not authenticated. */ if (cmdObj["forShell"].trueValue()) { LastError::get(opCtx->getClient()).disable(); } transport::Session::TagMask sessionTagsToSet = 0; transport::Session::TagMask sessionTagsToUnset = 0; // Tag connections to avoid closing them on stepdown. auto hangUpElement = cmdObj["hangUpOnStepDown"]; if (!hangUpElement.eoo() && !hangUpElement.trueValue()) { sessionTagsToSet |= transport::Session::kKeepOpen; } auto& clientMetadataIsMasterState = ClientMetadataIsMasterState::get(opCtx->getClient()); bool seenIsMaster = clientMetadataIsMasterState.hasSeenIsMaster(); if (!seenIsMaster) { clientMetadataIsMasterState.setSeenIsMaster(); } BSONElement element = cmdObj[kMetadataDocumentName]; if (!element.eoo()) { if (seenIsMaster) { uasserted(ErrorCodes::ClientMetadataCannotBeMutated, "The client metadata document may only be sent in the first isMaster"); } auto swParseClientMetadata = ClientMetadata::parse(element); uassertStatusOK(swParseClientMetadata.getStatus()); invariant(swParseClientMetadata.getValue()); swParseClientMetadata.getValue().get().logClientMetadata(opCtx->getClient()); clientMetadataIsMasterState.setClientMetadata( opCtx->getClient(), std::move(swParseClientMetadata.getValue())); } // Parse the optional 'internalClient' field. This is provided by incoming connections from // mongod and mongos. auto internalClientElement = cmdObj["internalClient"]; if (internalClientElement) { sessionTagsToSet |= transport::Session::kInternalClient; uassert(ErrorCodes::TypeMismatch, str::stream() << "'internalClient' must be of type Object, but was of type " << typeName(internalClientElement.type()), internalClientElement.type() == BSONType::Object); bool foundMaxWireVersion = false; for (auto&& elem : internalClientElement.Obj()) { auto fieldName = elem.fieldNameStringData(); if (fieldName == "minWireVersion") { // We do not currently use 'internalClient.minWireVersion'. continue; } else if (fieldName == "maxWireVersion") { foundMaxWireVersion = true; uassert(ErrorCodes::TypeMismatch, str::stream() << "'maxWireVersion' field of 'internalClient' must be " "of type int, but was of type " << typeName(elem.type()), elem.type() == BSONType::NumberInt); // All incoming connections from mongod/mongos of earlier versions should be // closed if the featureCompatibilityVersion is bumped to 3.6. 
if (elem.numberInt() >= WireSpec::instance().incomingInternalClient.maxWireVersion) { sessionTagsToSet |= transport::Session::kLatestVersionInternalClientKeepOpen; } else { sessionTagsToUnset |= transport::Session::kLatestVersionInternalClientKeepOpen; } } else { uasserted(ErrorCodes::BadValue, str::stream() << "Unrecognized field of 'internalClient': '" << fieldName << "'"); } } uassert(ErrorCodes::BadValue, "Missing required field 'maxWireVersion' of 'internalClient'", foundMaxWireVersion); } else { sessionTagsToUnset |= (transport::Session::kInternalClient | transport::Session::kLatestVersionInternalClientKeepOpen); sessionTagsToSet |= transport::Session::kExternalClientKeepOpen; } auto session = opCtx->getClient()->session(); if (session) { session->mutateTags( [sessionTagsToSet, sessionTagsToUnset](transport::Session::TagMask originalTags) { // After a mongos sends the initial "isMaster" command with its mongos client // information, it sometimes sends another "isMaster" command that is forwarded // from its client. Once kInternalClient has been set, we assume that any future // "isMaster" commands are forwarded in this manner, and we do not update the // session tags. if ((originalTags & transport::Session::kInternalClient) == 0) { return (originalTags | sessionTagsToSet) & ~sessionTagsToUnset; } else { return originalTags; } }); } appendReplicationInfo(opCtx, result, 0); if (serverGlobalParams.clusterRole == ClusterRole::ConfigServer) { const int configServerModeNumber = 2; result.append("configsvr", configServerModeNumber); } result.appendNumber("maxBsonObjectSize", BSONObjMaxUserSize); result.appendNumber("maxMessageSizeBytes", MaxMessageSizeBytes); result.appendNumber("maxWriteBatchSize", write_ops::kMaxWriteBatchSize); result.appendDate("localTime", jsTime()); result.append("logicalSessionTimeoutMinutes", localLogicalSessionTimeoutMinutes); result.appendNumber("connectionId", opCtx->getClient()->getConnectionId()); if (internalClientElement) { result.append("minWireVersion", WireSpec::instance().incomingInternalClient.minWireVersion); result.append("maxWireVersion", WireSpec::instance().incomingInternalClient.maxWireVersion); } else { result.append("minWireVersion", WireSpec::instance().incomingExternalClient.minWireVersion); result.append("maxWireVersion", WireSpec::instance().incomingExternalClient.maxWireVersion); } result.append("readOnly", storageGlobalParams.readOnly); const auto parameter = mapFindWithDefault(ServerParameterSet::getGlobal()->getMap(), "automationServiceDescriptor", static_cast<ServerParameter*>(nullptr)); if (parameter) parameter->append(opCtx, result, "automationServiceDescriptor"); if (opCtx->getClient()->session()) { MessageCompressorManager::forSession(opCtx->getClient()->session()) .serverNegotiate(cmdObj, &result); } auto& saslMechanismRegistry = SASLServerMechanismRegistry::get(opCtx->getServiceContext()); saslMechanismRegistry.advertiseMechanismNamesForUser(opCtx, cmdObj, &result); return true; }
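The session-tag update above reduces to a bitmask rule: apply the computed set/unset masks only if the connection has not already been tagged as an internal client. A sketch with illustrative constants:

    #include <cstdint>

    using TagMask = std::uint32_t;
    constexpr TagMask kInternalClient = 1u << 0;
    constexpr TagMask kKeepOpen = 1u << 1;

    TagMask mutateTags(TagMask originalTags, TagMask tagsToSet, TagMask tagsToUnset) {
        // Once kInternalClient is set, later isMaster commands (forwarded from the
        // mongos's own clients) must not change the session tags.
        if ((originalTags & kInternalClient) == 0)
            return (originalTags | tagsToSet) & ~tagsToUnset;
        return originalTags;
    }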
void Strategy::queryOp(OperationContext* txn, Request& r) { verify(!NamespaceString(r.getns()).isCommand()); Timer queryTimer; globalOpCounters.gotQuery(); QueryMessage q(r.d()); NamespaceString ns(q.ns); ClientBasic* client = txn->getClient(); AuthorizationSession* authSession = AuthorizationSession::get(client); Status status = authSession->checkAuthForQuery(ns, q.query); audit::logQueryAuthzCheck(client, ns, q.query, status.code()); uassertStatusOK(status); LOG(3) << "query: " << q.ns << " " << q.query << " ntoreturn: " << q.ntoreturn << " options: " << q.queryOptions; if (q.ntoreturn == 1 && strstr(q.ns, ".$cmd")) throw UserException(8010, "something is wrong, shouldn't see a command here"); if (q.queryOptions & QueryOption_Exhaust) { uasserted(18526, string("the 'exhaust' query option is invalid for mongos queries: ") + q.ns + " " + q.query.toString()); } // Spigot which controls whether OP_QUERY style find on mongos uses the new ClusterClientCursor // code path. // TODO: Delete the spigot and always use the new code. if (useClusterClientCursor) { auto txn = cc().makeOperationContext(); ReadPreferenceSetting readPreference(ReadPreference::PrimaryOnly, TagSet::primaryOnly()); BSONElement rpElem; auto readPrefExtractStatus = bsonExtractTypedField( q.query, LiteParsedQuery::kFindCommandReadPrefField, mongo::Object, &rpElem); if (readPrefExtractStatus.isOK()) { auto parsedRps = ReadPreferenceSetting::fromBSON(rpElem.Obj()); uassertStatusOK(parsedRps.getStatus()); readPreference = parsedRps.getValue(); } else if (readPrefExtractStatus != ErrorCodes::NoSuchKey) { uassertStatusOK(readPrefExtractStatus); } auto canonicalQuery = CanonicalQuery::canonicalize(q, WhereCallbackNoop()); uassertStatusOK(canonicalQuery.getStatus()); // Do the work to generate the first batch of results. This blocks waiting to get responses // from the shard(s). std::vector<BSONObj> batch; // 0 means the cursor is exhausted and // otherwise we assume that a cursor with the returned id can be retrieved via the // ClusterCursorManager auto cursorId = ClusterFind::runQuery(txn.get(), *canonicalQuery.getValue(), readPreference, &batch); uassertStatusOK(cursorId.getStatus()); // Build the response document. // TODO: this constant should be shared between mongos and mongod, and should // not be inside ShardedClientCursor. BufBuilder buffer(ShardedClientCursor::INIT_REPLY_BUFFER_SIZE); int numResults = 0; for (const auto& obj : batch) { buffer.appendBuf((void*)obj.objdata(), obj.objsize()); numResults++; } replyToQuery(0, // query result flags r.p(), r.m(), buffer.buf(), buffer.len(), numResults, 0, // startingFrom cursorId.getValue()); return; } QuerySpec qSpec((string)q.ns, q.query, q.fields, q.ntoskip, q.ntoreturn, q.queryOptions); // Parse "$maxTimeMS". StatusWith<int> maxTimeMS = LiteParsedQuery::parseMaxTimeMSQuery(q.query); uassert(17233, maxTimeMS.getStatus().reason(), maxTimeMS.isOK()); if (_isSystemIndexes(q.ns) && doShardedIndexQuery(txn, r, qSpec)) { return; } ParallelSortClusteredCursor* cursor = new ParallelSortClusteredCursor(qSpec, CommandInfo()); verify(cursor); // TODO: Move out to Request itself, not strategy based try { cursor->init(txn); if (qSpec.isExplain()) { BSONObjBuilder explain_builder; cursor->explain(explain_builder); explain_builder.appendNumber("executionTimeMillis", static_cast<long long>(queryTimer.millis())); BSONObj b = explain_builder.obj(); replyToQuery(0, r.p(), r.m(), b); delete (cursor); return; } } catch (...) 
{ delete cursor; throw; } // TODO: Revisit all of this when we revisit the sharded cursor cache if (cursor->getNumQueryShards() != 1) { // More than one shard (or zero), manage with a ShardedClientCursor // NOTE: We may also have *zero* shards here when the returnPartial flag is set. // Currently the code in ShardedClientCursor handles this. ShardedClientCursorPtr cc(new ShardedClientCursor(q, cursor)); BufBuilder buffer(ShardedClientCursor::INIT_REPLY_BUFFER_SIZE); int docCount = 0; const int startFrom = cc->getTotalSent(); bool hasMore = cc->sendNextBatch(q.ntoreturn, buffer, docCount); if (hasMore) { LOG(5) << "storing cursor : " << cc->getId(); int cursorLeftoverMillis = maxTimeMS.getValue() - queryTimer.millis(); if (maxTimeMS.getValue() == 0) { // 0 represents "no limit". cursorLeftoverMillis = kMaxTimeCursorNoTimeLimit; } else if (cursorLeftoverMillis <= 0) { cursorLeftoverMillis = kMaxTimeCursorTimeLimitExpired; } cursorCache.store(cc, cursorLeftoverMillis); } replyToQuery(0, r.p(), r.m(), buffer.buf(), buffer.len(), docCount, startFrom, hasMore ? cc->getId() : 0); } else { // Only one shard is used // Remote cursors are stored remotely, we shouldn't need this around. unique_ptr<ParallelSortClusteredCursor> cursorDeleter(cursor); ShardPtr shard = cursor->getQueryShard(); verify(shard.get()); DBClientCursorPtr shardCursor = cursor->getShardCursor(shard->getId()); // Implicitly stores the cursor in the cache r.reply(*(shardCursor->getMessage()), shardCursor->originalHost()); // We don't want to kill the cursor remotely if there's still data left shardCursor->decouple(); } }
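Stored cursors above carry the unspent portion of their maxTimeMS budget, with sentinels for "no limit" and "already expired". A sketch of that bookkeeping; the sentinel values here are placeholders, not the real kMaxTimeCursor constants:

    constexpr int kNoTimeLimit = -1;        // stand-in for kMaxTimeCursorNoTimeLimit
    constexpr int kTimeLimitExpired = 0;    // stand-in for kMaxTimeCursorTimeLimitExpired

    int cursorLeftoverMillis(int maxTimeMS, int elapsedMillis) {
        if (maxTimeMS == 0)                 // 0 means "no limit" on the query
            return kNoTimeLimit;
        int leftover = maxTimeMS - elapsedMillis;
        return leftover > 0 ? leftover : kTimeLimitExpired;
    }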
BSONObj rewriteCommandForListingOwnCollections(OperationContext* opCtx, const std::string& dbName, const BSONObj& cmdObj) { mutablebson::Document rewrittenCmdObj(cmdObj); mutablebson::Element ownCollections = mutablebson::findFirstChildNamed(rewrittenCmdObj.root(), "authorizedCollections"); AuthorizationSession* authzSession = AuthorizationSession::get(opCtx->getClient()); // We must strip $ownCollections from the delegated command. uassertStatusOK(ownCollections.remove()); BSONObj collectionFilter; // Extract and retain any previous filter mutablebson::Element oldFilter = mutablebson::findFirstChildNamed(rewrittenCmdObj.root(), "filter"); // Make a new filter, containing a $and array. mutablebson::Element newFilter = rewrittenCmdObj.makeElementObject("filter"); mutablebson::Element newFilterAnd = rewrittenCmdObj.makeElementArray("$and"); uassertStatusOK(newFilter.pushBack(newFilterAnd)); // Append a rule to the $and, which rejects system collections. mutablebson::Element systemCollectionsFilter = rewrittenCmdObj.makeElementObject( "", BSON("name" << BSON("$regex" << BSONRegEx("^(?!system\\.)")))); uassertStatusOK(newFilterAnd.pushBack(systemCollectionsFilter)); if (!authzSession->isAuthorizedForAnyActionOnResource( ResourcePattern::forDatabaseName(dbName))) { // We passed an auth check which said we might be able to render some collections, // but it doesn't seem like we should render all of them. We must filter. // Compute the set of collection names which would be permissible to return. std::set<std::string> collectionNames; for (UserNameIterator nameIter = authzSession->getAuthenticatedUserNames(); nameIter.more(); nameIter.next()) { User* authUser = authzSession->lookupUser(*nameIter); const User::ResourcePrivilegeMap& resourcePrivilegeMap = authUser->getPrivileges(); for (const std::pair<ResourcePattern, Privilege>& resourcePrivilege : resourcePrivilegeMap) { const auto& resource = resourcePrivilege.first; if (resource.isCollectionPattern() || (resource.isExactNamespacePattern() && resource.databaseToMatch() == dbName)) { collectionNames.emplace(resource.collectionToMatch().toString()); } } } // Construct a new filter predicate which returns only collections we were found to // have privileges for. BSONObjBuilder predicateBuilder; BSONObjBuilder nameBuilder(predicateBuilder.subobjStart("name")); BSONArrayBuilder setBuilder(nameBuilder.subarrayStart("$in")); // Load the de-duplicated set into a BSON array for (StringData collectionName : collectionNames) { setBuilder << collectionName; } setBuilder.done(); nameBuilder.done(); collectionFilter = predicateBuilder.obj(); // Filter the results by our collection names. mutablebson::Element newFilterAndIn = rewrittenCmdObj.makeElementObject("", collectionFilter); uassertStatusOK(newFilterAnd.pushBack(newFilterAndIn)); } // If there was a pre-existing filter, compose it with our new one. if (oldFilter.ok()) { uassertStatusOK(oldFilter.remove()); uassertStatusOK(newFilterAnd.pushBack(oldFilter)); } // Attach our new composite filter back onto the listCollections command object. uassertStatusOK(rewrittenCmdObj.root().pushBack(newFilter)); return rewrittenCmdObj.getObject(); }
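The rewritten filter above expresses two rules: exclude system collections and keep only names the user holds privileges on. The same idea over plain string sets, for illustration (the helper name is made up):

    #include <set>
    #include <string>
    #include <vector>

    std::vector<std::string> filterOwnCollections(const std::vector<std::string>& allNames,
                                                  const std::set<std::string>& privilegedNames) {
        std::vector<std::string> out;
        for (const auto& name : allNames) {
            if (name.rfind("system.", 0) == 0)        // mirrors the ^(?!system\.) regex
                continue;
            if (privilegedNames.count(name))          // mirrors the $in of permitted names
                out.push_back(name);
        }
        return out;
    }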
bool ChunkManager::_load(OperationContext* txn, ChunkMap& chunkMap, set<ShardId>& shardIds, ShardVersionMap* shardVersions, const ChunkManager* oldManager) { // Reset the max version, but not the epoch, when we aren't loading from the oldManager _version = ChunkVersion(0, 0, _version.epoch()); // If we have a previous version of the ChunkManager to work from, use that info to reduce // our config query if (oldManager && oldManager->getVersion().isSet()) { // Get the old max version _version = oldManager->getVersion(); // Load a copy of the old versions *shardVersions = oldManager->_shardVersions; // Load a copy of the chunk map, replacing the chunk manager with our own const ChunkMap& oldChunkMap = oldManager->getChunkMap(); // Could be v.expensive // TODO: If chunks were immutable and didn't reference the manager, we could do more // interesting things here for (const auto& oldChunkMapEntry : oldChunkMap) { shared_ptr<Chunk> oldC = oldChunkMapEntry.second; shared_ptr<Chunk> newC(new Chunk( this, oldC->getMin(), oldC->getMax(), oldC->getShardId(), oldC->getLastmod())); newC->setBytesWritten(oldC->getBytesWritten()); chunkMap.insert(make_pair(oldC->getMax(), newC)); } LOG(2) << "loading chunk manager for collection " << _ns << " using old chunk manager w/ version " << _version.toString() << " and " << oldChunkMap.size() << " chunks"; } // Attach a diff tracker for the versioned chunk data CMConfigDiffTracker differ(this); differ.attach(_ns, chunkMap, _version, *shardVersions); // Diff tracker should *always* find at least one chunk if collection exists // Get the diff query required auto diffQuery = differ.configDiffQuery(); repl::OpTime opTime; std::vector<ChunkType> chunks; uassertStatusOK(grid.catalogManager(txn)->getChunks( txn, diffQuery.query, diffQuery.sort, boost::none, &chunks, &opTime)); invariant(opTime >= _configOpTime); _configOpTime = opTime; int diffsApplied = differ.calculateConfigDiff(txn, chunks); if (diffsApplied > 0) { LOG(2) << "loaded " << diffsApplied << " chunks into new chunk manager for " << _ns << " with version " << _version; // Add all existing shards we find to the shards set for (ShardVersionMap::iterator it = shardVersions->begin(); it != shardVersions->end();) { shared_ptr<Shard> shard = grid.shardRegistry()->getShard(txn, it->first); if (shard) { shardIds.insert(it->first); ++it; } else { shardVersions->erase(it++); } } _configOpTime = opTime; return true; } else if (diffsApplied == 0) { // No chunks were found for the ns warning() << "no chunks found when reloading " << _ns << ", previous version was " << _version; // Set all our data to empty chunkMap.clear(); shardVersions->clear(); _version = ChunkVersion(0, 0, OID()); _configOpTime = opTime; return true; } else { // diffsApplied < 0 bool allInconsistent = (differ.numValidDiffs() == 0); if (allInconsistent) { // All versions are different, this can be normal warning() << "major change in chunk information found when reloading " << _ns << ", previous version was " << _version; } else { // Inconsistent load halfway through (due to yielding cursor during load) // should be rare warning() << "inconsistent chunks found when reloading " << _ns << ", previous version was " << _version << ", this should be rare"; } // Set all our data to empty to be extra safe chunkMap.clear(); shardVersions->clear(); _version = ChunkVersion(0, 0, OID()); return allInconsistent; } }
long long DeleteExecutor::execute() { uassertStatusOK(prepare()); uassert(17417, mongoutils::str::stream() << "DeleteExecutor::prepare() failed to parse query " << _request->getQuery(), _isQueryParsed); const bool logop = _request->shouldCallLogOp(); const NamespaceString& ns(_request->getNamespaceString()); if (!_request->isGod()) { if (ns.isSystem()) { uassert(12050, "cannot delete from system namespace", legalClientSystemNS(ns.ns(), true)); } if (ns.ns().find('$') != string::npos) { log() << "cannot delete from collection with reserved $ in name: " << ns << endl; uasserted( 10100, "cannot delete from collection with reserved $ in name" ); } } massert(17418, mongoutils::str::stream() << "dbname = " << currentClient.get()->database()->name() << "; ns = " << ns.ns(), currentClient.get()->database()->name() == nsToDatabaseSubstring(ns.ns())); Collection* collection = currentClient.get()->database()->getCollection(ns.ns()); if (NULL == collection) { return 0; } uassert(10101, str::stream() << "cannot remove from a capped collection: " << ns.ns(), !collection->isCapped()); uassert(ErrorCodes::NotMaster, str::stream() << "Not primary while removing from " << ns.ns(), !logop || isMasterNs(ns.ns().c_str())); long long nDeleted = 0; const bool canYield = !_request->isGod() && ( _canonicalQuery.get() ? !QueryPlannerCommon::hasNode(_canonicalQuery->root(), MatchExpression::ATOMIC) : LiteParsedQuery::isQueryIsolated(_request->getQuery())); Runner* rawRunner; if (_canonicalQuery.get()) { uassertStatusOK(getRunner(collection, _canonicalQuery.release(), &rawRunner)); } else { CanonicalQuery* ignored; uassertStatusOK(getRunner(collection, ns.ns(), _request->getQuery(), &rawRunner, &ignored)); } auto_ptr<Runner> runner(rawRunner); auto_ptr<ScopedRunnerRegistration> safety; if (canYield) { safety.reset(new ScopedRunnerRegistration(runner.get())); runner->setYieldPolicy(Runner::YIELD_AUTO); } DiskLoc rloc; Runner::RunnerState state; CurOp* curOp = cc().curop(); int oldYieldCount = curOp->numYields(); while (Runner::RUNNER_ADVANCED == (state = runner->getNext(NULL, &rloc))) { if (oldYieldCount != curOp->numYields()) { uassert(ErrorCodes::NotMaster, str::stream() << "No longer primary while removing from " << ns.ns(), !logop || isMasterNs(ns.ns().c_str())); oldYieldCount = curOp->numYields(); } BSONObj toDelete; // TODO: do we want to buffer docs and delete them in a group rather than // saving/restoring state repeatedly? runner->saveState(); collection->deleteDocument(rloc, false, false, logop ? &toDelete : NULL ); runner->restoreState(); nDeleted++; if (logop) { if ( toDelete.isEmpty() ) { problem() << "deleted object without id, not logging" << endl; } else { bool replJustOne = true; logOp("d", ns.ns().c_str(), toDelete, 0, &replJustOne); } } if (!_request->isMulti()) { break; } if (!_request->isGod()) { getDur().commitIfNeeded(); } if (debug && _request->isGod() && nDeleted == 100) { log() << "warning high number of deletes with god=true " << " which could use significant memory b/c we don't commit journal"; } } return nDeleted; }
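Both delete paths above re-verify that the node is still primary only when the yield count has changed since the last check. The pattern in isolation, with a generic predicate standing in for the replication check:

    #include <functional>
    #include <stdexcept>

    void recheckAfterYield(int& lastSeenYieldCount, int currentYieldCount,
                           const std::function<bool()>& stillPrimary) {
        if (lastSeenYieldCount != currentYieldCount) {
            if (!stillPrimary())
                throw std::runtime_error("no longer primary while removing documents");
            lastSeenYieldCount = currentYieldCount;
        }
    }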