/**
 * Performs one balancing round: determines which chunks (if any) should be moved.
 *
 * @param conn open connection to the config server
 * @param candidateChunks out-parameter; one entry is appended per collection the
 *        balancing policy decides should have a chunk migrated. Must be non-NULL.
 *
 * Side effects: reads config.collections/chunks/tags via 'conn' and ensures the
 * {ns:1,min:1} unique index on the tags collection. Throws via uassert(16356) if a
 * tag range loaded from the config server is invalid for a collection.
 */
void Balancer::_doBalanceRound( DBClientBase& conn, vector<CandidateChunkPtr>* candidateChunks ) {
    verify( candidateChunks );

    //
    // 1. Check whether there is any sharded collection to be balanced by querying
    // the ShardsNS::collections collection
    //

    auto_ptr<DBClientCursor> cursor = conn.query( ShardNS::collection , BSONObj() );
    vector< string > collections;
    while ( cursor->more() ) {
        BSONObj col = cursor->nextSafe();

        // sharded collections will have a shard "key".
        if ( ! col["key"].eoo() && ! col["noBalance"].trueValue() ){
            collections.push_back( col["_id"].String() );
        }
        else if( col["noBalance"].trueValue() ){
            LOG(1) << "not balancing collection " << col["_id"].String() << ", explicitly disabled" << endl;
        }
    }
    // Release the cursor (and its server-side resources) before the next query.
    cursor.reset();

    if ( collections.empty() ) {
        LOG(1) << "no collections to balance" << endl;
        return;
    }

    //
    // 2. Get a list of all the shards that are participating in this balance round
    // along with any maximum allowed quotas and current utilization. We get the
    // latter by issuing db.serverStatus() (mem.mapped) to all shards.
    //
    // TODO: skip unresponsive shards and mark information as stale.
    //

    vector<Shard> allShards;
    Shard::getAllShards( allShards );
    if ( allShards.size() < 2) {
        LOG(1) << "can't balance without more active shards" << endl;
        return;
    }

    ShardInfoMap shardInfo;
    for ( vector<Shard>::const_iterator it = allShards.begin(); it != allShards.end(); ++it ) {
        const Shard& s = *it;
        ShardStatus status = s.getStatus();
        shardInfo[ s.getName() ] = ShardInfo( s.getMaxSize(),
                                              status.mapped(),
                                              s.isDraining(),
                                              status.hasOpsQueued(),
                                              s.tags() );
    }

    //
    // 3. For each collection, check if the balancing policy recommends moving anything around.
    //

    for (vector<string>::const_iterator it = collections.begin(); it != collections.end(); ++it ) {
        const string& ns = *it;

        // Gather the collection's chunks grouped by owning shard, sorted by min key.
        // Jumbo chunks are excluded: they cannot be migrated.
        map< string,vector<BSONObj> > shardToChunksMap;
        cursor = conn.query( ShardNS::chunk , QUERY( "ns" << ns ).sort( "min" ) );
        while ( cursor->more() ) {
            BSONObj chunk = cursor->nextSafe();
            if ( chunk["jumbo"].trueValue() ) continue;
            vector<BSONObj>& chunks = shardToChunksMap[chunk["shard"].String()];
            // getOwned(): the cursor's buffer may be recycled by the next batch.
            chunks.push_back( chunk.getOwned() );
        }
        cursor.reset();

        if (shardToChunksMap.empty()) {
            LOG(1) << "skipping empty collection (" << ns << ")";
            continue;
        }

        for ( vector<Shard>::iterator i=allShards.begin(); i!=allShards.end(); ++i ) {
            // this just makes sure there is an entry in shardToChunksMap for every shard
            // (operator[] default-constructs the vector; the size() call discards its result)
            Shard s = *i;
            shardToChunksMap[s.getName()].size();
        }

        DistributionStatus status( shardInfo, shardToChunksMap );

        // load tags
        conn.ensureIndex( ShardNS::tags, BSON( "ns" << 1 << "min" << 1 ), true );
        cursor = conn.query( ShardNS::tags , QUERY( "ns" << ns ).sort( "min" ) );
        while ( cursor->more() ) {
            BSONObj tag = cursor->nextSafe();
            // Aborts the round if a tag range is malformed/overlapping for this ns.
            uassert( 16356 , str::stream() << "tag ranges not valid for: " << ns ,
                     status.addTagRange( TagRange( tag["min"].Obj().getOwned(),
                                                   tag["max"].Obj().getOwned(),
                                                   tag["tag"].String() ) ) );
        }
        cursor.reset();

        // Ask the policy for a migration candidate; ownership of the returned
        // CandidateChunk passes to the shared pointer pushed onto the out-vector.
        CandidateChunk* p = _policy->balance( ns, status, _balancedLastTime );
        if ( p ) candidateChunks->push_back( CandidateChunkPtr( p ) );
    }
}
void go( const string db , const boost::filesystem::path outdir ) { log() << "DATABASE: " << db << "\t to \t" << outdir.string() << endl; boost::filesystem::create_directories( outdir ); map <string, BSONObj> collectionOptions; multimap <string, BSONObj> indexes; vector <string> collections; // Save indexes for database string ins = db + ".system.indexes"; auto_ptr<DBClientCursor> cursor = conn( true ).query( ins.c_str() , Query() , 0 , 0 , 0 , QueryOption_SlaveOk | QueryOption_NoCursorTimeout ); while ( cursor->more() ) { BSONObj obj = cursor->nextSafe(); const string name = obj.getField( "ns" ).valuestr(); indexes.insert( pair<string, BSONObj> (name, obj.getOwned()) ); } string sns = db + ".system.namespaces"; cursor = conn( true ).query( sns.c_str() , Query() , 0 , 0 , 0 , QueryOption_SlaveOk | QueryOption_NoCursorTimeout ); while ( cursor->more() ) { BSONObj obj = cursor->nextSafe(); const string name = obj.getField( "name" ).valuestr(); if (obj.hasField("options")) { collectionOptions.insert( pair<string,BSONObj> (name, obj.getField("options").embeddedObject()) ); } // skip namespaces with $ in them only if we don't specify a collection to dump if ( _coll == "" && name.find( ".$" ) != string::npos ) { log(1) << "\tskipping collection: " << name << endl; continue; } const string filename = name.substr( db.size() + 1 ); //if a particular collections is specified, and it's not this one, skip it if ( _coll != "" && db + "." + _coll != name && _coll != name ) continue; // raise error before writing collection with non-permitted filename chars in the name size_t hasBadChars = name.find_first_of("/\0"); if (hasBadChars != string::npos){ error() << "Cannot dump " << name << ". Collection has '/' or null in the collection name." << endl; continue; } // Don't dump indexes if ( endsWith(name.c_str(), ".system.indexes") ) { continue; } if ( _coll != "" && db + "." 
+ _coll != name && _coll != name ) continue; collections.push_back(name); } for (vector<string>::iterator it = collections.begin(); it != collections.end(); ++it) { string name = *it; const string filename = name.substr( db.size() + 1 ); writeCollectionFile( name , outdir / ( filename + ".bson" ) ); writeMetadataFile( name, outdir / (filename + ".metadata.json"), collectionOptions, indexes); } }
/**
 * Stores additional error information for this write error.
 *
 * Takes an owned copy of 'errInfo' so the stored BSON remains valid after the
 * caller's buffer is released, then marks the field as set.
 */
void WriteErrorDetail::setErrInfo(const BSONObj& errInfo) {
    _errInfo = errInfo.getOwned();
    _isErrInfoSet = true;
}
/**
 * Performs one unit of work for the delete stage: pulls one candidate document
 * from the child stage, deletes it (unless this is an explain), and handles
 * WriteConflictException retries in both the delete and restore paths.
 *
 * @param out set to the WSM id being returned/advanced, to a status member on
 *        failure, or to WorkingSet::INVALID_ID when yielding.
 * @return ADVANCED (only when _params.returnDeleted), NEED_TIME, NEED_YIELD,
 *         IS_EOF, or the child's FAILURE/DEAD status.
 */
PlanStage::StageState DeleteStage::work(WorkingSetID* out) {
    ++_commonStats.works;

    // Adds the amount of time taken by work() to executionTimeMillis.
    ScopedTimer timer(&_commonStats.executionTimeMillis);

    if (isEOF()) {
        return PlanStage::IS_EOF;
    }
    invariant(_collection);  // If isEOF() returns false, we must have a collection.

    // It is possible that after a delete was executed, a WriteConflictException occurred
    // and prevented us from returning ADVANCED with the old version of the document.
    if (_idReturning != WorkingSet::INVALID_ID) {
        // We should only get here if we were trying to return something before.
        invariant(_params.returnDeleted);

        WorkingSetMember* member = _ws->get(_idReturning);
        invariant(member->getState() == WorkingSetMember::OWNED_OBJ);

        *out = _idReturning;
        _idReturning = WorkingSet::INVALID_ID;
        ++_commonStats.advanced;
        return PlanStage::ADVANCED;
    }

    // Either retry the last WSM we worked on or get a new one from our child.
    WorkingSetID id;
    if (_idRetrying != WorkingSet::INVALID_ID) {
        id = _idRetrying;
        _idRetrying = WorkingSet::INVALID_ID;
    } else {
        auto status = child()->work(&id);
        switch (status) {
            case PlanStage::ADVANCED:
                break;

            case PlanStage::FAILURE:
            case PlanStage::DEAD:
                *out = id;

                // If a stage fails, it may create a status WSM to indicate why it failed, in which
                // case 'id' is valid. If ID is invalid, we create our own error message.
                if (WorkingSet::INVALID_ID == id) {
                    const std::string errmsg = "delete stage failed to read in results from child";
                    *out = WorkingSetCommon::allocateStatusMember(
                        _ws, Status(ErrorCodes::InternalError, errmsg));
                }
                return status;

            case PlanStage::NEED_TIME:
                ++_commonStats.needTime;
                return status;

            case PlanStage::NEED_YIELD:
                *out = id;
                ++_commonStats.needYield;
                return status;

            case PlanStage::IS_EOF:
                return status;

            default:
                MONGO_UNREACHABLE;
        }
    }

    // We advanced, or are retrying, and id is set to the WSM to work on.
    WorkingSetMember* member = _ws->get(id);

    // We want to free this member when we return, unless we need to retry it.
    ScopeGuard memberFreer = MakeGuard(&WorkingSet::free, _ws, id);

    if (!member->hasLoc()) {
        // We expect to be here because of an invalidation causing a force-fetch.

        // When we're doing a findAndModify with a sort, the sort will have a limit of 1, so will
        // not produce any more results even if there is another matching document. Throw a WCE here
        // so that these operations get another chance to find a matching document. The
        // findAndModify command should automatically retry if it gets a WCE.
        // TODO: this is not necessary if there was no sort specified.
        if (_params.returnDeleted) {
            throw WriteConflictException();
        }
        ++_specificStats.nInvalidateSkips;
        ++_commonStats.needTime;
        return PlanStage::NEED_TIME;
    }
    RecordId rloc = member->loc;

    // Deletes can't have projections. This means that covering analysis will always add
    // a fetch. We should always get fetched data, and never just key data.
    invariant(member->hasObj());

    try {
        // If the snapshot changed, then we have to make sure we have the latest copy of the
        // doc and that it still matches.
        std::unique_ptr<SeekableRecordCursor> cursor;
        if (getOpCtx()->recoveryUnit()->getSnapshotId() != member->obj.snapshotId()) {
            cursor = _collection->getCursor(getOpCtx());
            if (!WorkingSetCommon::fetch(getOpCtx(), _ws, id, cursor)) {
                // Doc is already deleted. Nothing more to do.
                ++_commonStats.needTime;
                return PlanStage::NEED_TIME;
            }

            // Make sure the re-fetched doc still matches the predicate.
            if (_params.canonicalQuery &&
                !_params.canonicalQuery->root()->matchesBSON(member->obj.value(), NULL)) {
                // Doesn't match.
                ++_commonStats.needTime;
                return PlanStage::NEED_TIME;
            }
        }

        // Ensure that the BSONObj underlying the WorkingSetMember is owned because saveState()
        // is allowed to free the memory.
        if (_params.returnDeleted) {
            // Save a copy of the document that is about to get deleted, but keep it in the
            // LOC_AND_OBJ state in case we need to retry deleting it.
            BSONObj deletedDoc = member->obj.value();
            member->obj.setValue(deletedDoc.getOwned());
        }

        // TODO: Do we want to buffer docs and delete them in a group rather than
        // saving/restoring state repeatedly?
        try {
            if (supportsDocLocking()) {
                // Doc-locking engines require this before saveState() since they don't use
                // invalidations.
                WorkingSetCommon::prepareForSnapshotChange(_ws);
            }
            child()->saveState();
        } catch (const WriteConflictException& wce) {
            // saveState() must never throw a WCE; treat it as a fatal invariant violation.
            std::terminate();
        }

        // Do the write, unless this is an explain.
        if (!_params.isExplain) {
            WriteUnitOfWork wunit(getOpCtx());
            _collection->deleteDocument(getOpCtx(), rloc);
            wunit.commit();
        }

        ++_specificStats.docsDeleted;
    } catch (const WriteConflictException& wce) {
        // When we're doing a findAndModify with a sort, the sort will have a limit of 1, so will
        // not produce any more results even if there is another matching document. Re-throw the WCE
        // here so that these operations get another chance to find a matching document. The
        // findAndModify command should automatically retry if it gets a WCE.
        // TODO: this is not necessary if there was no sort specified.
        if (_params.returnDeleted) {
            throw;
        }
        _idRetrying = id;
        memberFreer.Dismiss();  // Keep this member around so we can retry deleting it.
        *out = WorkingSet::INVALID_ID;
        _commonStats.needYield++;
        return NEED_YIELD;
    }

    if (_params.returnDeleted) {
        // After deleting the document, the RecordId associated with this member is invalid.
        // Remove the 'loc' from the WorkingSetMember before returning it.
        member->loc = RecordId();
        member->transitionToOwnedObj();
    }

    // As restoreState may restore (recreate) cursors, cursors are tied to the
    // transaction in which they are created, and a WriteUnitOfWork is a
    // transaction, make sure to restore the state outside of the WriteUnitOfWork.
    try {
        child()->restoreState();
    } catch (const WriteConflictException& wce) {
        // Note we don't need to retry anything in this case since the delete already
        // was committed. However, we still need to return the deleted document
        // (if it was requested).
        if (_params.returnDeleted) {
            // member->obj should refer to the deleted document.
            invariant(member->getState() == WorkingSetMember::OWNED_OBJ);

            _idReturning = id;
            // Keep this member around so that we can return it on the next work() call.
            memberFreer.Dismiss();
        }
        *out = WorkingSet::INVALID_ID;
        _commonStats.needYield++;
        return NEED_YIELD;
    }

    if (_params.returnDeleted) {
        // member->obj should refer to the deleted document.
        invariant(member->getState() == WorkingSetMember::OWNED_OBJ);

        memberFreer.Dismiss();  // Keep this member around so we can return it.
        *out = id;
        ++_commonStats.advanced;
        return PlanStage::ADVANCED;
    }

    ++_commonStats.needTime;
    return PlanStage::NEED_TIME;
}
/**
 * Returns a new metadata object identical to this one, except that 'chunk' has been
 * split at each key in 'splitKeys' and the shard version bumped to 'newShardVersion'
 * (incremented once per additional resulting chunk).
 *
 * @param chunk           the chunk to split; must be owned exactly by this shard
 * @param splitKeys       ordered split points, each strictly inside [min, max)
 * @param newShardVersion version for the first resulting chunk; must be greater
 *                        than the current shard version
 * @param errMsg          optional out-parameter describing a failure
 * @return a newly allocated CollectionMetadata (caller owns it), or NULL on error
 *         with *errMsg filled in.
 */
CollectionMetadata* CollectionMetadata::cloneSplit( const ChunkType& chunk,
                                                    const vector<BSONObj>& splitKeys,
                                                    const ChunkVersion& newShardVersion,
                                                    string* errMsg ) const {
    // The error message string is optional.
    string dummy;
    if (errMsg == NULL) {
        errMsg = &dummy;
    }

    // The version required in both resulting chunks could be simply an increment in the
    // minor portion of the current version. However, we are enforcing uniqueness over the
    // attributes <ns, version> of the configdb collection 'chunks'. So in practice, a
    // migrate somewhere may force this split to pick up a version that has the major
    // portion higher than the one that this shard has been using.
    //
    // TODO drop the uniqueness constraint and tighten the check below so that only the
    // minor portion of version changes
    if (newShardVersion <= _shardVersion) {
        *errMsg = stream() << "cannot split chunk "
                           << rangeToString( chunk.getMin(), chunk.getMax() )
                           << ", new shard version " << newShardVersion.toString()
                           << " is not greater than current version "
                           << _shardVersion.toString();
        warning() << *errMsg << endl;
        return NULL;
    }

    // Check that we have the exact chunk that will be subtracted.
    if ( !rangeMapContains( _chunksMap, chunk.getMin(), chunk.getMax() ) ) {
        *errMsg = stream() << "cannot split chunk "
                           << rangeToString( chunk.getMin(), chunk.getMax() )
                           << ", this shard does not contain the chunk";

        if ( rangeMapOverlaps( _chunksMap, chunk.getMin(), chunk.getMax() ) ) {
            RangeVector overlap;
            getRangeMapOverlap( _chunksMap, chunk.getMin(), chunk.getMax(), &overlap );
            *errMsg += stream() << " and it overlaps " << overlapToString( overlap );
        }

        warning() << *errMsg << endl;
        return NULL;
    }

    // Check that each split key falls strictly inside the chunk being split.
    for ( vector<BSONObj>::const_iterator it = splitKeys.begin(); it != splitKeys.end(); ++it ) {
        if (!rangeContains(chunk.getMin(), chunk.getMax(), *it)) {
            *errMsg = stream() << "cannot split chunk "
                               << rangeToString( chunk.getMin(), chunk.getMax() )
                               << " at key " << *it;
            warning() << *errMsg << endl;
            return NULL;
        }
    }

    auto_ptr<CollectionMetadata> metadata(new CollectionMetadata);
    // Own the key pattern so the clone does not share the parent's buffer.
    // (The original discarded the result of getOwned(), a no-op.)
    metadata->_keyPattern = this->_keyPattern.getOwned();
    metadata->fillKeyPatternFields();
    metadata->_pendingMap = this->_pendingMap;
    metadata->_chunksMap = this->_chunksMap;
    metadata->_shardVersion = newShardVersion; // will increment 2nd, 3rd,... chunks below
    metadata->_collVersion = newShardVersion > _collVersion ? newShardVersion : _collVersion;

    // Carve the chunk into [min, k1), [k1, k2), ..., [kn, max). Each iteration
    // shortens the range that starts at 'startKey' to end at the split point and
    // inserts the new range [split, max); a later iteration will shorten that one
    // in turn. Using 'startKey' (not chunk.getMin()) is essential when there is
    // more than one split key, otherwise earlier boundaries are clobbered.
    BSONObj startKey = chunk.getMin();
    for ( vector<BSONObj>::const_iterator it = splitKeys.begin(); it != splitKeys.end(); ++it ) {
        BSONObj split = *it;
        metadata->_chunksMap[startKey] = split.getOwned();
        metadata->_chunksMap.insert( make_pair( split.getOwned(), chunk.getMax().getOwned() ) );
        metadata->_shardVersion.incMinor();
        startKey = split;
    }

    metadata->_collVersion =
        metadata->_shardVersion > _collVersion ? metadata->_shardVersion : _collVersion;
    metadata->fillRanges();

    dassert(metadata->isValid());
    return metadata.release();
}
/**
 * Run a query -- includes checking for and running a Command.
 * @return points to ns if exhaust mode. 0=normal mode
 * @locks the db mutex for reading (and potentially for writing temporarily to create a new db).
 * @asserts on scan and order memory exhaustion and other cases.
 */
string runQuery(Message& m, QueryMessage& q, CurOp& curop, Message &result) {
    shared_ptr<ParsedQuery> pq_shared( new ParsedQuery(q) );
    ParsedQuery& pq( *pq_shared );
    BSONObj jsobj = q.query;
    int queryOptions = q.queryOptions;
    const char *ns = q.ns;

    uassert( 16332 , "can't have an empty ns" , ns[0] );

    if( logLevel >= 2 )
        log() << "runQuery called " << ns << " " << jsobj << endl;

    curop.debug().ns = ns;
    curop.debug().ntoreturn = pq.getNumToReturn();
    curop.debug().query = jsobj;
    curop.setQuery(jsobj);

    uassert( 16256, str::stream() << "Invalid ns [" << ns << "]", NamespaceString::isValid(ns) );

    // Run a command.
    if ( pq.couldBeCommand() ) {
        curop.markCommand();
        BufBuilder bb;
        bb.skip(sizeof(QueryResult));
        BSONObjBuilder cmdResBuf;
        if ( runCommands(ns, jsobj, curop, bb, cmdResBuf, false, queryOptions) ) {
            curop.debug().iscommand = true;
            curop.debug().query = jsobj;

            // Wrap the command's reply buffer in a QueryResult header and hand
            // ownership of the buffer to 'result'.
            auto_ptr< QueryResult > qr;
            qr.reset( (QueryResult *) bb.buf() );
            bb.decouple();
            qr->setResultFlagsToOk();
            qr->len = bb.len();
            curop.debug().responseLength = bb.len();
            qr->setOperation(opReply);
            qr->cursorId = 0;
            qr->startingFrom = 0;
            qr->nReturned = 1;
            result.setData( qr.release(), true );
        }
        else {
            uasserted(13530, "bad or malformed command request?");
        }
        return "";
    }

    const bool explain = pq.isExplain();
    const bool tailable = pq.hasOption(QueryOption_CursorTailable);

    BSONObj order = pq.getOrder();
    BSONObj query = pq.getFilter();

    /* The ElemIter will not be happy if this isn't really an object. So throw exception here
       when that is true. (Which may indicate bad data from client.)
    */
    if ( query.objsize() == 0 ) {
        out() << "Bad query object?\n jsobj:";
        out() << jsobj.toString() << "\n query:";
        out() << query.toString() << endl;
        uassert( 10110 , "bad query object", false);
    }

    // Tailable cursors need to read newly written entries from the tail
    // of the collection. They manually arbitrate with the collection over
    // what data is readable and when, so we choose read uncommited isolation.
    OpSettings settings;
    settings.setQueryCursorMode(DEFAULT_LOCK_CURSOR);
    settings.setBulkFetch(true);
    settings.setCappedAppendPK(pq.hasOption(QueryOption_AddHiddenPK));
    cc().setOpSettings(settings);

    // If our caller has a transaction, it's multi-statement.
    const bool inMultiStatementTxn = cc().hasTxn();
    if (tailable) {
        // Because it's easier to disable this. It shouldn't be happening in a normal system.
        uassert(16812, "May not perform a tailable query in a multi-statement transaction.",
                !inMultiStatementTxn);
    }

    // Begin a read-only, snapshot transaction under normal circumstances.
    // If the cursor is tailable, we need to be able to read uncommitted data.
    const int txnFlags = (tailable ? DB_READ_UNCOMMITTED : DB_TXN_SNAPSHOT) | DB_TXN_READ_ONLY;
    LOCK_REASON(lockReason, "query");
    Client::ReadContext ctx(ns, lockReason);
    scoped_ptr<Client::Transaction> transaction(!inMultiStatementTxn ?
                                                new Client::Transaction(txnFlags) : NULL);

    bool hasRetried = false;
    while ( 1 ) {
        try {
            // Verify once per attempt that this node may serve reads.
            // (A second, redundant call inside the PK fast-path guard was removed.)
            replVerifyReadsOk(&pq);

            // Fast-path for primary key queries.
            if (!explain && !tailable) {
                if (_tryQueryByPKHack(ns, query, pq, curop, result)) {
                    if (transaction) {
                        transaction->commit();
                    }
                    return "";
                }
            }

            // sanity check the query and projection
            if (pq.getFields() != NULL) {
                pq.getFields()->validateQuery( query );
            }

            if (tailable) {
                Collection *cl = getCollection( ns );
                if (cl != NULL && !(cl->isCapped() || str::equals(ns, rsoplog))) {
                    uasserted( 13051,
                               "tailable cursor requested on non-capped, non-oplog collection" );
                }
                const BSONObj nat1 = BSON( "$natural" << 1 );
                if ( order.isEmpty() ) {
                    order = nat1;
                }
                else {
                    uassert( 13052, "only {$natural:1} order allowed for tailable cursor",
                             order == nat1 );
                }
            }

            // Run a regular query.

            // these now may stored in a ClientCursor or somewhere else,
            // so make sure we use a real copy
            jsobj = jsobj.getOwned();
            query = query.getOwned();
            order = order.getOwned();

            const ConfigVersion shardingVersionAtStart = shardingState.getVersion( ns );
            const bool getCachedExplainPlan = ! hasRetried && explain && ! pq.hasIndexSpecifier();
            const bool savedCursor = queryWithQueryOptimizer( queryOptions, ns, jsobj, curop,
                                                              query, order, pq_shared,
                                                              shardingVersionAtStart,
                                                              getCachedExplainPlan,
                                                              inMultiStatementTxn,
                                                              result );
            // Did not save the cursor, so we can commit the transaction now if it exists.
            if (transaction && !savedCursor) {
                transaction->commit();
            }
            return curop.debug().exhaust ? ns : "";
        }
        catch ( const QueryRetryException & ) {
            // In some cases the query may be retried if there is an in memory sort size assertion.
            verify( ! hasRetried );
            hasRetried = true;
        }
    }
}
bool runParsed(OperationContext* txn, const NamespaceString& origNss, const AggregationRequest& request, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result) { // For operations on views, this will be the underlying namespace. const NamespaceString& nss = request.getNamespaceString(); // Set up the ExpressionContext. intrusive_ptr<ExpressionContext> expCtx = new ExpressionContext(txn, request); expCtx->tempDir = storageGlobalParams.dbpath + "/_tmp"; // Parse the pipeline. auto statusWithPipeline = Pipeline::parse(request.getPipeline(), expCtx); if (!statusWithPipeline.isOK()) { return appendCommandStatus(result, statusWithPipeline.getStatus()); } auto pipeline = std::move(statusWithPipeline.getValue()); auto resolvedNamespaces = resolveInvolvedNamespaces(txn, pipeline, expCtx); if (!resolvedNamespaces.isOK()) { return appendCommandStatus(result, resolvedNamespaces.getStatus()); } expCtx->resolvedNamespaces = std::move(resolvedNamespaces.getValue()); unique_ptr<ClientCursorPin> pin; // either this OR the exec will be non-null unique_ptr<PlanExecutor> exec; auto curOp = CurOp::get(txn); { // This will throw if the sharding version for this connection is out of date. If the // namespace is a view, the lock will be released before re-running the aggregation. // Otherwise, the lock must be held continuously from now until we have we created both // the output ClientCursor and the input executor. This ensures that both are using the // same sharding version that we synchronize on here. This is also why we always need to // create a ClientCursor even when we aren't outputting to a cursor. See the comment on // ShardFilterStage for more details. AutoGetCollectionOrViewForRead ctx(txn, nss); Collection* collection = ctx.getCollection(); // If running $collStats on a view, we do not resolve the view since we want stats // on this view namespace. 
auto startsWithCollStats = [&pipeline]() { const Pipeline::SourceContainer& sources = pipeline->getSources(); return !sources.empty() && dynamic_cast<DocumentSourceCollStats*>(sources.front().get()); }; // If this is a view, resolve it by finding the underlying collection and stitching view // pipelines and this request's pipeline together. We then release our locks before // recursively calling run, which will re-acquire locks on the underlying collection. // (The lock must be released because recursively acquiring locks on the database will // prohibit yielding.) auto view = ctx.getView(); if (view && !startsWithCollStats()) { auto viewDefinition = ViewShardingCheck::getResolvedViewIfSharded(txn, ctx.getDb(), view); if (!viewDefinition.isOK()) { return appendCommandStatus(result, viewDefinition.getStatus()); } if (!viewDefinition.getValue().isEmpty()) { ViewShardingCheck::appendShardedViewStatus(viewDefinition.getValue(), &result); return false; } auto resolvedView = ctx.getDb()->getViewCatalog()->resolveView(txn, nss); if (!resolvedView.isOK()) { return appendCommandStatus(result, resolvedView.getStatus()); } // With the view resolved, we can relinquish locks. ctx.releaseLocksForView(); // Parse the resolved view into a new aggregation request. auto newCmd = resolvedView.getValue().asExpandedViewAggregation(request); if (!newCmd.isOK()) { return appendCommandStatus(result, newCmd.getStatus()); } auto newNss = resolvedView.getValue().getNamespace(); auto newRequest = AggregationRequest::parseFromBSON(newNss, newCmd.getValue()); if (!newRequest.isOK()) { return appendCommandStatus(result, newRequest.getStatus()); } bool status = runParsed( txn, origNss, newRequest.getValue(), newCmd.getValue(), errmsg, result); { // Set the namespace of the curop back to the view namespace so ctx records // stats on this view namespace on destruction. 
stdx::lock_guard<Client>(*txn->getClient()); curOp->setNS_inlock(nss.ns()); } return status; } // If the pipeline does not have a user-specified collation, set it from the collection // default. if (request.getCollation().isEmpty() && collection && collection->getDefaultCollator()) { invariant(!expCtx->getCollator()); expCtx->setCollator(collection->getDefaultCollator()->clone()); } // Propagate the ExpressionContext throughout all of the pipeline's stages and // expressions. pipeline->injectExpressionContext(expCtx); // The pipeline must be optimized after the correct collator has been set on it (by // injecting the ExpressionContext containing the collator). This is necessary because // optimization may make string comparisons, e.g. optimizing {$eq: [<str1>, <str2>]} to // a constant. pipeline->optimizePipeline(); if (kDebugBuild && !expCtx->isExplain && !expCtx->inShard) { // Make sure all operations round-trip through Pipeline::serialize() correctly by // re-parsing every command in debug builds. This is important because sharded // aggregations rely on this ability. Skipping when inShard because this has // already been through the transformation (and this un-sets expCtx->inShard). pipeline = reparsePipeline(pipeline, request, expCtx); } // This does mongod-specific stuff like creating the input PlanExecutor and adding // it to the front of the pipeline if needed. PipelineD::prepareCursorSource(collection, pipeline); // Create the PlanExecutor which returns results from the pipeline. The WorkingSet // ('ws') and the PipelineProxyStage ('proxy') will be owned by the created // PlanExecutor. auto ws = make_unique<WorkingSet>(); auto proxy = make_unique<PipelineProxyStage>(txn, pipeline, ws.get()); auto statusWithPlanExecutor = (NULL == collection) ? 
PlanExecutor::make( txn, std::move(ws), std::move(proxy), nss.ns(), PlanExecutor::YIELD_MANUAL) : PlanExecutor::make( txn, std::move(ws), std::move(proxy), collection, PlanExecutor::YIELD_MANUAL); invariant(statusWithPlanExecutor.isOK()); exec = std::move(statusWithPlanExecutor.getValue()); { auto planSummary = Explain::getPlanSummary(exec.get()); stdx::lock_guard<Client>(*txn->getClient()); curOp->setPlanSummary_inlock(std::move(planSummary)); } if (collection) { PlanSummaryStats stats; Explain::getSummaryStats(*exec, &stats); collection->infoCache()->notifyOfQuery(txn, stats.indexesUsed); } if (collection) { const bool isAggCursor = true; // enable special locking behavior ClientCursor* cursor = new ClientCursor(collection->getCursorManager(), exec.release(), nss.ns(), txn->recoveryUnit()->isReadingFromMajorityCommittedSnapshot(), 0, cmdObj.getOwned(), isAggCursor); pin.reset(new ClientCursorPin(collection->getCursorManager(), cursor->cursorid())); // Don't add any code between here and the start of the try block. } // At this point, it is safe to release the collection lock. // - In the case where we have a collection: we will need to reacquire the // collection lock later when cleaning up our ClientCursorPin. // - In the case where we don't have a collection: our PlanExecutor won't be // registered, so it will be safe to clean it up outside the lock. invariant(!exec || !collection); } try { // Unless set to true, the ClientCursor created above will be deleted on block exit. bool keepCursor = false; // Use of the aggregate command without specifying to use a cursor is deprecated. // Applications should migrate to using cursors. Cursors are strictly more useful than // outputting the results as a single document, since results that fit inside a single // BSONObj will also fit inside a single batch. // // We occasionally log a deprecation warning. 
if (!request.isCursorCommand()) { RARELY { warning() << "Use of the aggregate command without the 'cursor' " "option is deprecated. See " "http://dochub.mongodb.org/core/aggregate-without-cursor-deprecation."; } } // If both explain and cursor are specified, explain wins. if (expCtx->isExplain) { result << "stages" << Value(pipeline->writeExplainOps()); } else if (request.isCursorCommand()) { keepCursor = handleCursorCommand(txn, origNss.ns(), pin.get(), pin ? pin->c()->getExecutor() : exec.get(), request, result); } else { pipeline->run(result); } if (!expCtx->isExplain) { PlanSummaryStats stats; Explain::getSummaryStats(pin ? *pin->c()->getExecutor() : *exec.get(), &stats); curOp->debug().setPlanSummaryMetrics(stats); curOp->debug().nreturned = stats.nReturned; } // Clean up our ClientCursorPin, if needed. We must reacquire the collection lock // in order to do so. if (pin) { // We acquire locks here with DBLock and CollectionLock instead of using // AutoGetCollectionForRead. AutoGetCollectionForRead will throw if the // sharding version is out of date, and we don't care if the sharding version // has changed. Lock::DBLock dbLock(txn->lockState(), nss.db(), MODE_IS); Lock::CollectionLock collLock(txn->lockState(), nss.ns(), MODE_IS); if (keepCursor) { pin->release(); } else { pin->deleteUnderlying(); } } } catch (...) {
/**
 * Do the initial sync for this member.
 *
 * Drives the full initial-sync sequence: pick a sync source, read its last oplog
 * entry, clone all databases (or skip the clone when fastsync is set), apply the
 * oplog up to minValid, build indexes via a second clone pass, persist minValid,
 * and transition to RS_RECOVERING. On any recoverable failure it logs a heartbeat
 * message, sleeps, and returns so the caller can retry; clone failures also veto
 * the source for 10 minutes.
 */
void ReplSetImpl::_syncDoInitialSync() {
    replset::InitialSync init(replset::BackgroundSync::get());
    sethbmsg("initial sync pending",0);

    // if this is the first node, it may have already become primary
    if ( box.getState().primary() ) {
        sethbmsg("I'm already primary, no need for initial sync",0);
        return;
    }

    const Member *source = getMemberToSyncTo();
    if (!source) {
        sethbmsg("initial sync need a member to be primary or secondary to do our initial sync", 0);
        sleepsecs(15);
        return;
    }

    string sourceHostname = source->h().toString();
    init.setHostname(sourceHostname);
    OplogReader r;
    if( !r.connect(sourceHostname) ) {
        sethbmsg( str::stream() << "initial sync couldn't connect to " << source->h().toString() , 0);
        sleepsecs(15);
        return;
    }

    // The source's newest oplog entry — our initial application target.
    BSONObj lastOp = r.getLastOp(rsoplog);
    if( lastOp.isEmpty() ) {
        sethbmsg("initial sync couldn't read remote oplog", 0);
        sleepsecs(15);
        return;
    }

    if (replSettings.fastsync) {
        // fastsync: the data files are assumed to be present already, so only
        // the oplog needs priming.
        log() << "fastsync: skipping database clone" << rsLog;

        // prime oplog
        init.oplogApplication(lastOp, lastOp);
        return;
    }
    else {
        sethbmsg("initial sync drop all databases", 0);
        dropAllDatabasesExceptLocal();

        sethbmsg("initial sync clone all databases", 0);

        list<string> dbs = r.conn()->getDatabaseNames();

        // First clone pass: data only (dataPass=true); indexes come later.
        if ( ! _syncDoInitialSync_clone( sourceHostname.c_str(), dbs, true ) ) {
            // Avoid re-picking a source that just failed us for 10 minutes.
            veto(source->fullName(), 600);
            sleepsecs(300);
            return;
        }

        sethbmsg("initial sync data copy, starting syncup",0);

        BSONObj minValid;
        if ( ! _syncDoInitialSync_applyToHead( init, &r , source , lastOp , minValid ) ) {
            return;
        }

        lastOp = minValid;
        // its currently important that lastOp is equal to the last op we actually pulled
        // this is because the background thread only pulls each op once now
        // so if its now, we'll be waiting forever
        {
            // this takes whatever the last op the we got is
            // and stores it locally before we wipe it out below
            Lock::DBRead lk(rsoplog);
            Helpers::getLast(rsoplog, lastOp);
            lastOp = lastOp.getOwned();
        }

        // reset state, as that "didn't count"
        emptyOplog();
        lastOpTimeWritten = OpTime();
        lastH = 0;

        sethbmsg("initial sync building indexes",0);
        // Second clone pass: indexes only (dataPass=false).
        if ( ! _syncDoInitialSync_clone( sourceHostname.c_str(), dbs, false ) ) {
            veto(source->fullName(), 600);
            sleepsecs(300);
            return;
        }
    }

    sethbmsg("initial sync query minValid",0);

    BSONObj minValid;
    if ( ! _syncDoInitialSync_applyToHead( init, &r, source, lastOp, minValid ) ) {
        return;
    }

    // ---------

    sethbmsg("initial sync finishing up",0);

    verify( !box.getState().primary() ); // wouldn't make sense if we were.

    {
        Client::WriteContext cx( "local." );
        cx.ctx().db()->flushFiles(true);
        try {
            log() << "replSet set minValid=" << minValid["ts"]._opTime().toString() << rsLog;
        }
        catch(...) { } // best-effort logging only; never abort the sync for it
        theReplSet->setMinValid(minValid);
        cx.ctx().db()->flushFiles(true);
    }

    changeState(MemberState::RS_RECOVERING);
    sethbmsg("initial sync done",0);
}
// Reads chunk-diff documents from 'diffCursor' and folds them into the tracked
// range map and the per-collection / per-shard maximum versions.
// Returns the number of valid diffs applied, or -1 when the metadata read is
// detected to be inconsistent (epoch mismatch or overlapping ranges) and the
// caller must reload from scratch.
int ConfigDiffTracker<ValType,ShardType>::
calculateConfigDiff( DBClientCursorInterface& diffCursor ) {
    MONGO_LOG_DEFAULT_COMPONENT_LOCAL(::mongo::logger::LogComponent::kSharding);
    verifyAttached();
    // Apply the chunk changes to the ranges and versions
    //
    // Overall idea here is to work in two steps :
    // 1. For all the new chunks we find, increment the maximum version per-shard and
    // per-collection, and remove any conflicting chunks from the ranges
    // 2. For all the new chunks we're interested in (all of them for mongos, just chunks on the
    // shard for mongod) add them to the ranges
    //
    vector<BSONObj> newTracked;
    // Store epoch now so it doesn't change when we change max
    OID currEpoch = _maxVersion->epoch();
    _validDiffs = 0;
    while( diffCursor.more() ){
        BSONObj diffChunkDoc = diffCursor.next();
        ChunkVersion chunkVersion = ChunkVersion::fromBSON(diffChunkDoc, ChunkType::DEPRECATED_lastmod());
        // A malformed chunk document is skipped; it does not invalidate the
        // rest of the diff stream.
        if( diffChunkDoc[ChunkType::min()].type() != Object ||
            diffChunkDoc[ChunkType::max()].type() != Object ||
            diffChunkDoc[ChunkType::shard()].type() != String )
        {
            warning() << "got invalid chunk document " << diffChunkDoc
                      << " when trying to load differing chunks" << endl;
            continue;
        }
        // An unset version or an epoch change means the collection metadata we
        // started from is no longer valid.
        if( ! chunkVersion.isSet() || ! chunkVersion.hasEqualEpoch( currEpoch ) ){
            warning() << "got invalid chunk version " << chunkVersion << " in document " << diffChunkDoc
                      << " when trying to load differing chunks at version "
                      << ChunkVersion( _maxVersion->majorVersion(),
                                       _maxVersion->minorVersion(), currEpoch ) << endl;
            // Don't keep loading, since we know we'll be broken here
            return -1;
        }
        _validDiffs++;
        // Get max changed version and chunk version
        if( chunkVersion > *_maxVersion ) *_maxVersion = chunkVersion;
        // Chunk version changes
        ShardType shard = shardFor( diffChunkDoc[ChunkType::shard()].String() );
        typename map<ShardType, ChunkVersion>::iterator shardVersionIt = _maxShardVersions->find( shard );
        if( shardVersionIt == _maxShardVersions->end() || shardVersionIt->second < chunkVersion ){
            (*_maxShardVersions)[ shard ] = chunkVersion;
        }
        // See if we need to remove any chunks we are currently tracking b/c of this chunk's changes
        removeOverlapping(diffChunkDoc[ChunkType::min()].Obj(),
                          diffChunkDoc[ChunkType::max()].Obj());
        // Figure out which of the new chunks we need to track
        // Important - we need to actually own this doc, in case the cursor decides to getMore or unbuffer
        if( isTracked( diffChunkDoc ) ) newTracked.push_back( diffChunkDoc.getOwned() );
    }
    LOG(3) << "found " << _validDiffs << " new chunks for collection " << _ns
           << " (tracking " << newTracked.size() << "), new version is " << *_maxVersion << endl;
    for( vector<BSONObj>::iterator it = newTracked.begin(); it != newTracked.end(); it++ ){
        BSONObj chunkDoc = *it;
        // Important - we need to make sure we actually own the min and max here
        BSONObj min = chunkDoc[ChunkType::min()].Obj().getOwned();
        BSONObj max = chunkDoc[ChunkType::max()].Obj().getOwned();
        // Invariant enforced by sharding
        // It's possible to read inconsistent state b/c of getMore() and yielding, so we want
        // to detect as early as possible.
        // TODO: This checks for overlap, we also should check for holes here iff we're tracking
        // all chunks
        if( isOverlapping( min, max ) ) return -1;
        _currMap->insert( rangeFor( chunkDoc, min, max ) );
    }
    return _validDiffs;
}
bool ClientInfo::getLastError( const BSONObj& options , BSONObjBuilder& result , bool fromWriteBackListener ) { set<string> * shards = getPrev(); if ( shards->size() == 0 ) { result.appendNull( "err" ); return true; } vector<WBInfo> writebacks; // handle single server if ( shards->size() == 1 ) { string theShard = *(shards->begin() ); BSONObj res; bool ok = false; { ShardConnection conn( theShard , "" ); try { ok = conn->runCommand( "admin" , options , res ); } catch( std::exception &e ) { warning() << "could not get last error from shard " << theShard << causedBy( e ) << endl; // Catch everything that happens here, since we need to ensure we return our connection when we're // finished. conn.done(); return false; } res = res.getOwned(); conn.done(); } _addWriteBack( writebacks , res ); // hit other machines just to block for ( set<string>::const_iterator i=sinceLastGetError().begin(); i!=sinceLastGetError().end(); ++i ) { string temp = *i; if ( temp == theShard ) continue; try { ShardConnection conn( temp , "" ); ON_BLOCK_EXIT_OBJ( conn, &ShardConnection::done ); _addWriteBack( writebacks , conn->getLastErrorDetailed() ); } catch( std::exception &e ){ warning() << "could not clear last error from shard " << temp << causedBy( e ) << endl; } } clearSinceLastGetError(); if ( writebacks.size() ){ vector<BSONObj> v = _handleWriteBacks( writebacks , fromWriteBackListener ); if ( v.size() == 0 && fromWriteBackListener ) { // ok } else { // this will usually be 1 // it can be greater than 1 if a write to a different shard // than the last write op had a writeback // all we're going to report is the first // since that's the current write // but we block for all verify( v.size() >= 1 ); result.appendElements( v[0] ); result.appendElementsUnique( res ); result.append( "writebackGLE" , v[0] ); result.append( "initialGLEHost" , theShard ); } } else { result.append( "singleShard" , theShard ); result.appendElements( res ); } return ok; } BSONArrayBuilder bbb( 
result.subarrayStart( "shards" ) ); BSONObjBuilder shardRawGLE; long long n = 0; int updatedExistingStat = 0; // 0 is none, -1 has but false, 1 has true // hit each shard vector<string> errors; vector<BSONObj> errorObjects; for ( set<string>::iterator i = shards->begin(); i != shards->end(); i++ ) { string theShard = *i; bbb.append( theShard ); boost::scoped_ptr<ShardConnection> conn; BSONObj res; bool ok = false; try { conn.reset( new ShardConnection( theShard , "" ) ); // constructor can throw if shard is down ok = (*conn)->runCommand( "admin" , options , res ); shardRawGLE.append( theShard , res ); } catch( std::exception &e ){ // Safe to return here, since we haven't started any extra processing yet, just collecting // responses. warning() << "could not get last error from a shard " << theShard << causedBy( e ) << endl; conn->done(); return false; } _addWriteBack( writebacks, res ); string temp = DBClientWithCommands::getLastErrorString( res ); if ( (*conn)->type() != ConnectionString::SYNC && ( ok == false || temp.size() ) ) { errors.push_back( temp ); errorObjects.push_back( res ); } n += res["n"].numberLong(); if ( res["updatedExisting"].type() ) { if ( res["updatedExisting"].trueValue() ) updatedExistingStat = 1; else if ( updatedExistingStat == 0 ) updatedExistingStat = -1; } conn->done(); } bbb.done(); result.append( "shardRawGLE" , shardRawGLE.obj() ); result.appendNumber( "n" , n ); if ( updatedExistingStat ) result.appendBool( "updatedExisting" , updatedExistingStat > 0 ); // hit other machines just to block for ( set<string>::const_iterator i=sinceLastGetError().begin(); i!=sinceLastGetError().end(); ++i ) { string temp = *i; if ( shards->count( temp ) ) continue; ShardConnection conn( temp , "" ); try { _addWriteBack( writebacks, conn->getLastErrorDetailed() ); } catch( std::exception &e ){ warning() << "could not clear last error from a shard " << temp << causedBy( e ) << endl; } conn.done(); } clearSinceLastGetError(); if ( errors.size() == 0 ) { 
result.appendNull( "err" ); _handleWriteBacks( writebacks , fromWriteBackListener ); return true; } result.append( "err" , errors[0].c_str() ); { // errs BSONArrayBuilder all( result.subarrayStart( "errs" ) ); for ( unsigned i=0; i<errors.size(); i++ ) { all.append( errors[i].c_str() ); } all.done(); } { // errObjects BSONArrayBuilder all( result.subarrayStart( "errObjects" ) ); for ( unsigned i=0; i<errorObjects.size(); i++ ) { all.append( errorObjects[i] ); } all.done(); } _handleWriteBacks( writebacks , fromWriteBackListener ); return true; }
// Advances the index scan by one step: returns the entry the cursor currently
// points at (if it passes dedup and filtering) and moves the cursor to the
// next entry. Returns NEED_TIME for bookkeeping steps, ADVANCED with a filled
// working-set member, or IS_EOF.
PlanStage::StageState IndexScan::work(WorkingSetID* out) {
    ++_commonStats.works;

    // Adds the amount of time taken by work() to executionTimeMillis.
    ScopedTimer timer(&_commonStats.executionTimeMillis);

    // If we examined multiple keys in a prior work cycle, make up for it here by returning
    // NEED_TIME. This is done for plan ranking. Refer to the comment for '_checkEndKeys'
    // in the .h for details.
    if (_checkEndKeys > 0) {
        --_checkEndKeys;
        ++_commonStats.needTime;
        return PlanStage::NEED_TIME;
    }

    if (NULL == _indexCursor.get()) {
        // First call to work(). Perform possibly heavy init.
        initIndexScan();
        checkEnd();
        ++_commonStats.needTime;
        return PlanStage::NEED_TIME;
    }
    else if (_yieldMovedCursor) {
        _yieldMovedCursor = false;
        // Note that we're not calling next() here. We got the next thing when we recovered
        // from yielding.
    }

    if (isEOF()) {
        return PlanStage::IS_EOF;
    }

    // Grab the next (key, value) from the index.
    BSONObj keyObj = _indexCursor->getKey();
    DiskLoc loc = _indexCursor->getValue();

    // Move to the next result.
    // The underlying IndexCursor points at the *next* thing we want to return. We do this so
    // that if we're scanning an index looking for docs to delete we don't continually clobber
    // the thing we're pointing at.
    _indexCursor->next();
    checkEnd();

    if (_shouldDedup) {
        ++_specificStats.dupsTested;
        // Multi-key indexes can produce the same DiskLoc more than once; only
        // return each location a single time.
        if (_returned.end() != _returned.find(loc)) {
            ++_specificStats.dupsDropped;
            ++_commonStats.needTime;
            return PlanStage::NEED_TIME;
        }
        else {
            _returned.insert(loc);
        }
    }

    if (Filter::passes(keyObj, _keyPattern, _filter)) {
        if (NULL != _filter) {
            ++_specificStats.matchTested;
        }

        // We must make a copy of the on-disk data since it can mutate during the execution of
        // this query.
        BSONObj ownedKeyObj = keyObj.getOwned();

        // Fill out the WSM.
        WorkingSetID id = _workingSet->allocate();
        WorkingSetMember* member = _workingSet->get(id);
        member->loc = loc;
        member->keyData.push_back(IndexKeyDatum(_keyPattern, ownedKeyObj));
        member->state = WorkingSetMember::LOC_AND_IDX;

        if (_params.addKeyMetadata) {
            BSONObjBuilder bob;
            bob.appendKeys(_keyPattern, ownedKeyObj);
            member->addComputed(new IndexKeyComputedData(bob.obj()));
        }

        *out = id;
        ++_commonStats.advanced;
        return PlanStage::ADVANCED;
    }

    // Key did not pass the filter; account for the work and try again next call.
    ++_commonStats.needTime;
    return PlanStage::NEED_TIME;
}
// filemd5 command: computes the MD5 of a GridFS file by streaming its chunks
// in (files_id, n) order. Supports resumable operation via "partialOk" /
// "md5state" / "startAt" so mongos can spread the work across shards.
// Locks are dropped and re-acquired around each chunk's md5_append because
// hashing is expensive.
bool run(OperationContext* txn,
         const string& dbname,
         BSONObj& jsobj,
         int,
         string& errmsg,
         BSONObjBuilder& result) {
    const std::string ns = parseNs(dbname, jsobj);

    md5digest d;
    md5_state_t st;
    md5_init(&st);

    int n = 0;

    bool partialOk = jsobj["partialOk"].trueValue();
    if (partialOk) {
        // WARNING: This code depends on the binary layout of md5_state. It will not be
        // compatible with different md5 libraries or work correctly in an environment with
        // mongod's of different endians. It is ok for mongos to be a different endian since
        // it just passes the buffer through to another mongod.
        BSONElement stateElem = jsobj["md5state"];
        if (!stateElem.eoo()) {
            int len;
            const char* data = stateElem.binDataClean(len);
            massert(16247, "md5 state not correct size", len == sizeof(st));
            memcpy(&st, data, sizeof(st));
        }
        n = jsobj["startAt"].numberInt();
    }

    // Scan chunks for this file starting at chunk number n, ordered so the
    // hash is appended in sequence.
    BSONObj query = BSON("files_id" << jsobj["filemd5"] << "n" << GTE << n);
    BSONObj sort = BSON("files_id" << 1 << "n" << 1);

    MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
        CanonicalQuery* cq;
        if (!CanonicalQuery::canonicalize(ns, query, sort, BSONObj(), &cq).isOK()) {
            uasserted(17240, "Can't canonicalize query " + query.toString());
            return 0;
        }

        // Check shard version at startup.
        // This will throw before we've done any work if shard version is outdated
        // We drop and re-acquire these locks every document because md5'ing is expensive
        unique_ptr<AutoGetCollectionForRead> ctx(new AutoGetCollectionForRead(txn, ns));
        Collection* coll = ctx->getCollection();

        PlanExecutor* rawExec;
        if (!getExecutor(txn,
                         coll,
                         cq,
                         PlanExecutor::YIELD_MANUAL,
                         &rawExec,
                         QueryPlannerParams::NO_TABLE_SCAN).isOK()) {
            uasserted(17241, "Can't get executor for query " + query.toString());
            return 0;
        }

        unique_ptr<PlanExecutor> exec(rawExec);
        // Process notifications when the lock is released/reacquired in the loop below
        exec->registerExec();

        BSONObj obj;
        PlanExecutor::ExecState state;
        while (PlanExecutor::ADVANCED == (state = exec->getNext(&obj, NULL))) {
            BSONElement ne = obj["n"];
            verify(ne.isNumber());
            int myn = ne.numberInt();
            if (n != myn) {
                if (partialOk) {
                    break;  // skipped chunk is probably on another shard
                }
                log() << "should have chunk: " << n << " have:" << myn << endl;
                dumpChunks(txn, ns, query, sort);
                uassert(10040, "chunks out of order", n == myn);
            }

            // make a copy of obj since we access data in it while yielding locks
            BSONObj owned = obj.getOwned();
            exec->saveState();
            // UNLOCKED
            ctx.reset();

            int len;
            const char* data = owned["data"].binDataClean(len);
            // This is potentially an expensive operation, so do it out of the lock
            md5_append(&st, (const md5_byte_t*)(data), len);
            n++;

            try {
                // RELOCKED
                ctx.reset(new AutoGetCollectionForRead(txn, ns));
            } catch (const SendStaleConfigException& ex) {
                LOG(1) << "chunk metadata changed during filemd5, will retarget and continue";
                break;
            }

            // Have the lock again. See if we were killed.
            if (!exec->restoreState(txn)) {
                if (!partialOk) {
                    uasserted(13281, "File deleted during filemd5 command");
                }
            }
        }

        if (partialOk)
            result.appendBinData("md5state", sizeof(st), BinDataGeneral, &st);

        // This must be *after* the capture of md5state since it mutates st
        md5_finish(&st, d);

        result.append("numChunks", n);
        result.append("md5", digestToString(d));
    }
    MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "filemd5", dbname);
    return true;
}
void ParsedQuery::initFields( const BSONObj& fields ) { if ( fields.isEmpty() ) return; _fields.reset( new Projection() ); _fields->init( fields.getOwned() ); }
// Renames one namespace inside this database's MMAPv1 catalog: moves the
// NamespaceDetails entry, rewrites the system.namespaces spec document, and
// refreshes the in-memory collection cache. 'stayTemp' keeps the "temp"
// collection option on the renamed namespace. Partial rollback is attempted
// if the .ns move fails midway.
Status MMAPV1DatabaseCatalogEntry::_renameSingleNamespace( OperationContext* txn,
                                                           const StringData& fromNS,
                                                           const StringData& toNS,
                                                           bool stayTemp ) {
    // some sanity checking
    NamespaceDetails* fromDetails = _namespaceIndex.details( fromNS );
    if ( !fromDetails )
        return Status( ErrorCodes::BadValue, "from namespace doesn't exist" );

    if ( _namespaceIndex.details( toNS ) )
        return Status( ErrorCodes::BadValue, "to namespace already exists" );

    // Invalidate the cached Entry for the source before touching on-disk state.
    _removeFromCache( fromNS );

    // at this point, we haven't done anything destructive yet

    // ----
    // actually start moving
    // ----

    // this could throw, but if it does we're ok
    _namespaceIndex.add_ns( txn, toNS, fromDetails );
    NamespaceDetails* toDetails = _namespaceIndex.details( toNS );

    try {
        toDetails->copyingFrom(txn, toNS, _namespaceIndex, fromDetails); // fixes extraOffset
    }
    catch( DBException& ) {
        // could end up here if .ns is full - if so try to clean up / roll back a little
        _namespaceIndex.kill_ns( txn, toNS );
        throw;
    }

    // at this point, code .ns stuff moved

    _namespaceIndex.kill_ns( txn, fromNS );
    fromDetails = NULL;

    // fix system.namespaces
    BSONObj newSpec;
    DiskLoc oldSpecLocation;
    {
        BSONObj oldSpec;
        {
            // Linear scan of the namespace record store for the source's spec doc.
            RecordStoreV1Base* rs = _getNamespaceRecordStore();
            scoped_ptr<RecordIterator> it( rs->getIterator(txn) );
            while ( !it->isEOF() ) {
                DiskLoc loc = it->getNext();
                BSONObj entry = it->dataFor( loc ).toBson();
                if ( fromNS == entry["name"].String() ) {
                    oldSpecLocation = loc;
                    oldSpec = entry.getOwned();
                    break;
                }
            }
        }
        invariant( !oldSpec.isEmpty() );
        invariant( !oldSpecLocation.isNull() );

        // Rebuild the options object: drop any "create" option (it names the old
        // namespace) and drop "temp" unless stayTemp was requested.
        BSONObjBuilder b;
        BSONObjIterator i( oldSpec.getObjectField( "options" ) );
        while( i.more() ) {
            BSONElement e = i.next();
            if ( strcmp( e.fieldName(), "create" ) != 0 ) {
                if (stayTemp || (strcmp(e.fieldName(), "temp") != 0))
                    b.append( e );
            }
            else {
                b << "create" << toNS;
            }
        }
        newSpec = b.obj();
    }

    _addNamespaceToNamespaceCollection( txn, toNS, newSpec.isEmpty() ? 0 : &newSpec );

    _getNamespaceRecordStore()->deleteRecord( txn, oldSpecLocation );

    // Register the renamed collection in the in-memory cache under the new name.
    boost::mutex::scoped_lock lk( _collectionsLock );
    Entry*& entry = _collections[toNS.toString()];
    invariant( entry == NULL );
    entry = new Entry();
    _fillInEntry_inlock( txn, toNS, entry );

    return Status::OK();
}
// Implements the replSetReconfig command: validates the proposed config,
// checks it is a legal change from the current config, verifies members are
// reachable, and installs it. With force=true before the set is initialized,
// the config is merely stashed for later application.
bool _run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
    if( cmdObj["replSetReconfig"].type() != Object ) {
        errmsg = "no configuration specified";
        return false;
    }

    // We might want to add the protocol version of theReplSet->config() if it exists,
    // instead of just blindly adding our compiled-in CURRENT_PROTOCOL_VERSION. But for
    // TokuMX 1.0 it doesn't matter.
    BSONObj configObj = ReplSetConfig::addProtocolVersionIfMissing(cmdObj["replSetReconfig"].Obj());
    bool force = cmdObj.hasField("force") && cmdObj["force"].trueValue();

    if( force && !theReplSet ) {
        // Set not started yet: remember the config and apply it asynchronously.
        replSettings.reconfig = configObj.getOwned();
        result.append("msg", "will try this config momentarily, try running rs.conf() again in a few seconds");
        return true;
    }

    if ( !check(errmsg, result) ) {
        return false;
    }

    if( !force && !theReplSet->box.getState().primary() ) {
        errmsg = "replSetReconfig command must be sent to the current replica set primary.";
        return false;
    }

    {
        // just make sure we can get a write lock before doing anything else. we'll reacquire one
        // later. of course it could be stuck then, but this check lowers the risk if weird things
        // are up - we probably don't want a change to apply 30 minutes after the initial attempt.
        time_t t = time(0);
        Lock::GlobalWrite lk;
        if( time(0)-t > 20 ) {
            errmsg = "took a long time to get write lock, so not initiating. Initiate when server less busy?";
            return false;
        }
    }

    try {
        scoped_ptr<ReplSetConfig> newConfig (ReplSetConfig::make(configObj, force));

        log() << "replSet replSetReconfig config object parses ok, "
              << newConfig->members.size() << " members specified" << rsLog;

        if( !ReplSetConfig::legalChange(theReplSet->getConfig(), *newConfig, errmsg) ) {
            return false;
        }

        checkMembersUpForConfigChange(*newConfig, result, false);

        log() << "replSet replSetReconfig [2]" << rsLog;

        theReplSet->haveNewConfig(*newConfig, true);
        ReplSet::startupStatusMsg.set("replSetReconfig'd");
    }
    catch( DBException& e ) {
        log() << "replSet replSetReconfig exception: " << e.what() << rsLog;
        throw;
    }
    catch( string& se ) {
        // Some validation paths throw plain strings; surface them as errmsg.
        log() << "replSet reconfig exception: " << se << rsLog;
        errmsg = se;
        return false;
    }

    resetSlaveCache();
    return true;
}
void BatchedUpdateDocument::setQuery(const BSONObj& query) { _query = query.getOwned(); _isQuerySet = true; }
/**
 * Run a query with a cursor provided by the query optimizer, or FindingStartCursor.
 * @returns true if client cursor was saved, false if the query has completed.
 */
bool queryWithQueryOptimizer( int queryOptions, const string& ns,
                              const BSONObj &jsobj, CurOp& curop,
                              const BSONObj &query, const BSONObj &order,
                              const shared_ptr<ParsedQuery> &pq_shared,
                              const ConfigVersion &shardingVersionAtStart,
                              const bool getCachedExplainPlan,
                              const bool inMultiStatementTxn,
                              Message &result ) {

    const ParsedQuery &pq( *pq_shared );
    shared_ptr<Cursor> cursor;
    QueryPlanSummary queryPlan;

    // numToReturn == 1 disables tailing (findOne-style request).
    const bool tailable = pq.hasOption( QueryOption_CursorTailable ) && pq.getNumToReturn() != 1;

    LOG(1) << "query beginning read-only transaction. tailable: " << tailable << endl;

    BSONObj oldPlan;
    if (getCachedExplainPlan) {
        scoped_ptr<MultiPlanScanner> mps( MultiPlanScanner::make( ns.c_str(), query, order ) );
        oldPlan = mps->cachedPlanExplainSummary();
    }

    cursor = getOptimizedCursor( ns.c_str(), query, order, QueryPlanSelectionPolicy::any(),
                                 pq_shared, false, &queryPlan );
    verify( cursor );

    // Tailable cursors must be marked as such before any use. This is so that
    // the implementation knows that uncommitted data cannot be returned.
    if ( tailable ) {
        cursor->setTailable();
    }

    scoped_ptr<QueryResponseBuilder> queryResponseBuilder
            ( QueryResponseBuilder::make( pq, cursor, queryPlan, oldPlan ) );
    bool saveClientCursor = false;
    int options = QueryOption_NoCursorTimeout;
    if (pq.hasOption( QueryOption_OplogReplay )) {
        options |= QueryOption_OplogReplay;
    }
    // create a client cursor that does not create a cursorID.
    // The cursor ID will be created if and only if we save
    // the client cursor further below
    ClientCursor::Holder ccPointer( new ClientCursor( options, cursor, ns, BSONObj(), false, false ) );

    // for oplog cursors, we check if we are reading data that is too old and might
    // be stale.
    bool opChecked = false;
    bool slaveLocationUpdated = false;
    BSONObj last;
    bool lastBSONObjSet = false;
    for ( ; cursor->ok(); cursor->advance() ) {

        if ( pq.getMaxScan() && cursor->nscanned() > pq.getMaxScan() ) {
            break;
        }

        if ( !queryResponseBuilder->addMatch() ) {
            continue;
        }

        // Note slave's position in the oplog.
        if ( pq.hasOption( QueryOption_OplogReplay ) ) {
            BSONObj current = cursor->current();
            last = current.copy();
            lastBSONObjSet = true;

            // the first row returned is equal to the last element that
            // the slave has synced up to, so we might as well update
            // the slave location
            if (!slaveLocationUpdated) {
                ccPointer->updateSlaveLocation(curop);
                slaveLocationUpdated = true;
            }
            // check if data we are about to return may be too stale
            if (!opChecked) {
                ccPointer->storeOpForSlave(current);
                uassert(16785, "oplog cursor reading data that is too old",
                        !ccPointer->lastOpForSlaveTooOld());
                opChecked = true;
            }
        }

        if ( pq.isExplain() ) {
            if ( queryResponseBuilder->enoughTotalResults() ) {
                break;
            }
        }
        else if ( queryResponseBuilder->enoughForFirstBatch() ) {
            // if only 1 requested, no cursor saved for efficiency...we assume it is findOne()
            if ( pq.wantMore() && pq.getNumToReturn() != 1 ) {
                queryResponseBuilder->finishedFirstBatch();
                if ( cursor->advance() ) {
                    saveClientCursor = true;
                }
            }
            break;
        }
    }

    // If the tailing request succeeded
    if ( cursor->tailable() ) {
        saveClientCursor = true;
    }

    if ( ! shardingState.getVersion( ns ).isWriteCompatibleWith( shardingVersionAtStart ) ) {
        // if the version changed during the query
        // we might be missing some data
        // and its safe to send this as mongos can resend
        // at this point
        throw SendStaleConfigException( ns , "version changed during initial query",
                                        shardingVersionAtStart,
                                        shardingState.getVersion( ns ) );
    }

    int nReturned = queryResponseBuilder->handoff( result );

    // Release the temporary (ID-less) client cursor before possibly saving a real one.
    ccPointer.reset();
    long long cursorid = 0;
    if ( saveClientCursor ) {
        // Create a new ClientCursor, with a default timeout.
        ccPointer.reset( new ClientCursor( queryOptions, cursor, ns,
                                           jsobj.getOwned(), inMultiStatementTxn ) );
        cursorid = ccPointer->cursorid();
        DEV tlog(2) << "query has more, cursorid: " << cursorid << endl;

        if ( !ccPointer->ok() && ccPointer->c()->tailable() ) {
            DEV tlog() << "query has no more but tailable, cursorid: " << cursorid << endl;
        }

        if( queryOptions & QueryOption_Exhaust ) {
            curop.debug().exhaust = true;
        }

        // Set attributes for getMore.
        ccPointer->setChunkManager( queryResponseBuilder->chunkManager() );
        ccPointer->setPos( nReturned );
        ccPointer->pq = pq_shared;
        ccPointer->fields = pq.getFieldPtr();
        if (pq.hasOption( QueryOption_OplogReplay ) && lastBSONObjSet) {
            ccPointer->storeOpForSlave(last);
        }
        if (!inMultiStatementTxn) {
            // This cursor is not part of a multi-statement transaction, so
            // we pass off the current client's transaction stack to the
            // cursor so that it may be live as long as the cursor.
            cc().swapTransactionStack(ccPointer->transactions);
            verify(!cc().hasTxn());
        }
        ccPointer.release();
    }

    QueryResult *qr = (QueryResult *) result.header();
    qr->cursorId = cursorid;
    curop.debug().cursorid = ( cursorid == 0 ? -1 : qr->cursorId );
    qr->setResultFlagsToOk();
    // qr->len is updated automatically by appendData()
    curop.debug().responseLength = qr->len;
    qr->setOperation(opReply);
    qr->startingFrom = 0;
    qr->nReturned = nReturned;

    curop.debug().nscanned = ( cursor ? cursor->nscanned() : 0LL );
    curop.debug().ntoskip = pq.getSkip();
    curop.debug().nreturned = nReturned;

    return saveClientCursor;
}
void BatchedUpdateDocument::setUpdateExpr(const BSONObj& updateExpr) { _updateExpr = updateExpr.getOwned(); _isUpdateExprSet = true; }
/** * Invariant: idObj should belong to a document that is part of the active chunk being migrated */ LogOpForShardingHandler(MigrationSourceManager* migrateSourceManager, const BSONObj& idObj, const char op) : _migrationSourceManager(migrateSourceManager), _idObj(idObj.getOwned()), _op(op) {}
void BatchedUpdateDocument::setCollation(const BSONObj& collation) { _collation = collation.getOwned(); _isCollationSet = true; }
void operator()( DBClientCursorBatchIterator &i ) { Lock::GlobalWrite lk; context.relocked(); bool createdCollection = false; Collection* collection = NULL; while( i.moreInCurrentBatch() ) { if ( numSeen % 128 == 127 /*yield some*/ ) { collection = NULL; time_t now = time(0); if( now - lastLog >= 60 ) { // report progress if( lastLog ) log() << "clone " << to_collection << ' ' << numSeen << endl; lastLog = now; } mayInterrupt( _mayBeInterrupted ); dbtempreleaseif t( _mayYield ); } if ( isindex == false && collection == NULL ) { collection = context.db()->getCollection( to_collection ); if ( !collection ) { massert( 17321, str::stream() << "collection dropped during clone [" << to_collection << "]", !createdCollection ); createdCollection = true; collection = context.db()->createCollection( to_collection ); verify( collection ); } } BSONObj tmp = i.nextSafe(); /* assure object is valid. note this will slow us down a little. */ const Status status = validateBSON(tmp.objdata(), tmp.objsize()); if (!status.isOK()) { out() << "Cloner: skipping corrupt object from " << from_collection << ": " << status.reason(); continue; } ++numSeen; BSONObj js = tmp; if ( isindex ) { verify(nsToCollectionSubstring(from_collection) == "system.indexes"); js = fixindex(tmp); indexesToBuild->push_back( js.getOwned() ); continue; } verify(nsToCollectionSubstring(from_collection) != "system.indexes"); StatusWith<DiskLoc> loc = collection->insertDocument( js, true ); if ( !loc.isOK() ) { error() << "error: exception cloning object in " << from_collection << ' ' << loc.toString() << " obj:" << js; } uassertStatusOK( loc.getStatus() ); if ( logForRepl ) logOp("i", to_collection, js); getDur().commitIfNeeded(); RARELY if ( time( 0 ) - saveLast > 60 ) { log() << numSeen << " objects cloned so far from collection " << from_collection; saveLast = time( 0 ); } } }
// Appends replication status information to 'result'. For replica sets this
// delegates to the isMaster response; for master/slave it reports ismaster
// directly and, at level > 0, enumerates local.sources, optionally (level > 1)
// contacting each source to report oplog range and replication lag.
void appendReplicationInfo(OperationContext* txn, BSONObjBuilder& result, int level) {
    ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator();
    if (replCoord->getSettings().usingReplSets()) {
        IsMasterResponse isMasterResponse;
        replCoord->fillIsMasterForReplSet(&isMasterResponse);
        result.appendElements(isMasterResponse.toBSON());
        if (level) {
            replCoord->appendSlaveInfoData(&result);
        }
        return;
    }

    // TODO(dannenberg) replAllDead is bad and should be removed when master slave is removed
    if (replAllDead) {
        result.append("ismaster", 0);
        string s = string("dead: ") + replAllDead;
        result.append("info", s);
    } else {
        result.appendBool("ismaster",
                          getGlobalReplicationCoordinator()->isMasterForReportingPurposes());
    }

    if (level) {
        BSONObjBuilder sources(result.subarrayStart("sources"));

        int n = 0;
        list<BSONObj> src;
        {
            // Collect all source documents first so the collection scan is not
            // held open while we contact remote hosts below.
            const char* localSources = "local.sources";
            AutoGetCollectionForRead ctx(txn, localSources);
            unique_ptr<PlanExecutor> exec(InternalPlanner::collectionScan(
                txn, localSources, ctx.getCollection(), PlanExecutor::YIELD_MANUAL));
            BSONObj obj;
            PlanExecutor::ExecState state;
            while (PlanExecutor::ADVANCED == (state = exec->getNext(&obj, NULL))) {
                src.push_back(obj.getOwned());
            }

            // Non-yielding collection scans from InternalPlanner will never error.
            invariant(PlanExecutor::IS_EOF == state);
        }

        for (list<BSONObj>::const_iterator i = src.begin(); i != src.end(); i++) {
            BSONObj s = *i;
            BSONObjBuilder bb;
            bb.append(s["host"]);
            string sourcename = s["source"].valuestr();
            if (sourcename != "main")
                bb.append(s["source"]);
            {
                BSONElement e = s["syncedTo"];
                BSONObjBuilder t(bb.subobjStart("syncedTo"));
                t.appendDate("time", e.timestampTime());
                t.append("inc", e.timestampInc());
                t.done();
            }

            if (level > 1) {
                wassert(!txn->lockState()->isLocked());
                // note: there is no so-style timeout on this connection; perhaps we should have
                // one.
                ScopedDbConnection conn(s["host"].valuestr());

                DBClientConnection* cliConn = dynamic_cast<DBClientConnection*>(&conn.conn());
                if (cliConn && replAuthenticate(cliConn)) {
                    // Oldest and newest oplog entries bound the master's
                    // retained history for this source.
                    BSONObj first = conn->findOne((string) "local.oplog.$" + sourcename,
                                                  Query().sort(BSON("$natural" << 1)));
                    BSONObj last = conn->findOne((string) "local.oplog.$" + sourcename,
                                                 Query().sort(BSON("$natural" << -1)));
                    bb.appendDate("masterFirst", first["ts"].timestampTime());
                    bb.appendDate("masterLast", last["ts"].timestampTime());
                    const auto lag = (last["ts"].timestampTime() - s["syncedTo"].timestampTime());
                    bb.append("lagSeconds", durationCount<Milliseconds>(lag) / 1000.0);
                }
                conn.done();
            }

            sources.append(BSONObjBuilder::numStr(n++), bb.obj());
        }

        sources.done();

        replCoord->appendSlaveInfoData(&result);
    }
}
// Gathers getLastError results for 'dbName' from every shard touched by the
// previous operation, aggregating "n"/"updatedExisting", collecting pending
// writebacks, and optionally timing w>1 / w:"mode" waits via gleWtimeStats.
// Returns false (with errmsg set) only when a shard could not be contacted;
// a GLE that reports a write error still returns true with "err" populated.
bool ClientInfo::getLastError( const string& dbName,
                               const BSONObj& options,
                               BSONObjBuilder& result,
                               string& errmsg,
                               bool fromWriteBackListener)
{
    scoped_ptr<TimerHolder> gleTimerHolder;

    if ( ! fromWriteBackListener ) {
        // Only time GLEs that actually wait for replication (numeric w > 1 or
        // a named write concern mode).
        bool doTiming = false;
        const BSONElement& e = options["w"];
        if ( e.isNumber() ) {
            doTiming = e.numberInt() > 1;
        }
        else if ( e.type() == String ) {
            doTiming = true;
        }
        if ( doTiming ) {
            gleTimerHolder.reset( new TimerHolder( &gleWtimeStats ) );
        }
    }

    set<string> * shards = getPrev();

    if ( shards->size() == 0 ) {
        result.appendNull( "err" );
        return true;
    }

    vector<WBInfo> writebacks;

    //
    // TODO: These branches should be collapsed into a single codepath
    //

    // handle single server
    if ( shards->size() == 1 ) {
        string theShard = *(shards->begin() );

        BSONObj res;
        bool ok = false;
        {
            LOG(5) << "gathering response for gle from: " << theShard << endl;

            ShardConnection conn( theShard , "" );
            try {
                ok = conn->runCommand( dbName , options , res );
            }
            catch( std::exception &e ) {
                string message =
                        str::stream() << "could not get last error from shard "
                                      << theShard << causedBy( e );

                warning() << message << endl;
                errmsg = message;

                // Catch everything that happens here, since we need to ensure we return our
                // connection when we're finished.
                conn.done();
                return false;
            }

            // Own the response before the pooled connection is returned.
            res = res.getOwned();
            conn.done();
        }

        _addWriteBack( writebacks, res, true );

        LOG(4) << "gathering writebacks from " << sinceLastGetError().size() << " hosts for"
               << " gle (" << theShard << ")" << endl;

        // hit other machines just to block
        for ( set<string>::const_iterator i=sinceLastGetError().begin();
              i!=sinceLastGetError().end(); ++i ) {
            string temp = *i;
            if ( temp == theShard )
                continue;

            LOG(5) << "gathering writebacks for single-shard gle from: " << temp << endl;

            try {
                ShardConnection conn( temp , "" );
                // Guarantee the connection is returned even on throw.
                ON_BLOCK_EXIT_OBJ( conn, &ShardConnection::done );
                _addWriteBack( writebacks, conn->getLastErrorDetailed(), false );
            }
            catch( std::exception &e ){
                warning() << "could not clear last error from shard " << temp
                          << causedBy( e ) << endl;
            }
        }
        clearSinceLastGetError();

        LOG(4) << "checking " << writebacks.size() << " writebacks for"
               << " gle (" << theShard << ")" << endl;

        if ( writebacks.size() ){
            vector<BSONObj> v = _handleWriteBacks( writebacks , fromWriteBackListener );
            if ( v.size() == 0 && fromWriteBackListener ) {
                // ok
            }
            else {
                // this will usually be 1
                // it can be greater than 1 if a write to a different shard
                // than the last write op had a writeback
                // all we're going to report is the first
                // since that's the current write
                // but we block for all
                verify( v.size() >= 1 );

                if ( res["writebackSince"].numberInt() > 0 ) {
                    // got writeback from older op
                    // ignore the result from it, just needed to wait
                    result.appendElements( res );
                }
                else if ( writebacks[0].fromLastOperation ) {
                    result.appendElements( v[0] );
                    result.appendElementsUnique( res );
                    result.append( "writebackGLE" , v[0] );
                    result.append( "initialGLEHost" , theShard );
                    result.append( "initialGLE", res );
                }
                else {
                    // there was a writeback
                    // but its from an old operations
                    // so all that's important is that we block, not that we return stats
                    result.appendElements( res );
                }
            }
        }
        else {
            result.append( "singleShard" , theShard );
            result.appendElements( res );
        }

        return ok;
    }

    // Multi-shard path: query every shard, aggregate counters, report the raw
    // per-shard responses under "shardRawGLE".
    BSONArrayBuilder bbb( result.subarrayStart( "shards" ) );
    BSONObjBuilder shardRawGLE;

    long long n = 0;
    int updatedExistingStat = 0; // 0 is none, -1 has but false, 1 has true

    // hit each shard
    vector<string> errors;
    vector<BSONObj> errorObjects;
    for ( set<string>::iterator i = shards->begin(); i != shards->end(); i++ ) {
        string theShard = *i;
        bbb.append( theShard );

        LOG(5) << "gathering a response for gle from: " << theShard << endl;

        boost::scoped_ptr<ShardConnection> conn;
        BSONObj res;
        bool ok = false;
        try {
            conn.reset( new ShardConnection( theShard , "" ) ); // constructor can throw if shard is down
            ok = (*conn)->runCommand( dbName , options , res );
            shardRawGLE.append( theShard , res );
        }
        catch( std::exception &e ){
            // Safe to return here, since we haven't started any extra processing yet, just
            // collecting responses.
            string message =
                    str::stream() << "could not get last error from a shard "
                                  << theShard << causedBy( e );

            warning() << message << endl;
            errmsg = message;

            // conn is null when the ShardConnection constructor itself threw.
            if (conn)
                conn->done();

            return false;
        }

        _addWriteBack( writebacks, res, true );

        string temp = DBClientWithCommands::getLastErrorString( res );
        if ( (*conn)->type() != ConnectionString::SYNC && ( ok == false || temp.size() ) ) {
            errors.push_back( temp );
            errorObjects.push_back( res );
        }

        n += res["n"].numberLong();
        if ( res["updatedExisting"].type() ) {
            if ( res["updatedExisting"].trueValue() )
                updatedExistingStat = 1;
            else if ( updatedExistingStat == 0 )
                updatedExistingStat = -1;
        }

        conn->done();
    }

    bbb.done();
    result.append( "shardRawGLE" , shardRawGLE.obj() );

    result.appendNumber( "n" , n );
    if ( updatedExistingStat )
        result.appendBool( "updatedExisting" , updatedExistingStat > 0 );

    LOG(4) << "gathering writebacks from " << sinceLastGetError().size() << " hosts for"
           << " gle (" << shards->size() << " shards)" << endl;

    // hit other machines just to block
    for ( set<string>::const_iterator i=sinceLastGetError().begin();
          i!=sinceLastGetError().end(); ++i ) {
        string temp = *i;
        if ( shards->count( temp ) )
            continue;

        LOG(5) << "gathering writebacks for multi-shard gle from: " << temp << endl;

        ShardConnection conn( temp , "" );
        try {
            _addWriteBack( writebacks, conn->getLastErrorDetailed(), false );
        }
        catch( std::exception &e ){
            warning() << "could not clear last error from a shard " << temp
                      << causedBy( e ) << endl;
        }
        conn.done();
    }
    clearSinceLastGetError();

    LOG(4) << "checking " << writebacks.size() << " writebacks for"
           << " gle (" << shards->size() << " shards)" << endl;

    if ( errors.size() == 0 ) {
        result.appendNull( "err" );
        _handleWriteBacks( writebacks , fromWriteBackListener );
        return true;
    }

    result.append( "err" , errors[0].c_str() );

    { // errs
        BSONArrayBuilder all( result.subarrayStart( "errs" ) );
        for ( unsigned i=0; i<errors.size(); i++ ) {
            all.append( errors[i].c_str() );
        }
        all.done();
    }

    { // errObjects
        BSONArrayBuilder all( result.subarrayStart( "errObjects" ) );
        for ( unsigned i=0; i<errorObjects.size(); i++ ) {
            all.append( errorObjects[i] );
        }
        all.done();
    }

    _handleWriteBacks( writebacks , fromWriteBackListener );
    return true;
}
// Implements the listIndexes command: streams the index specs of one
// collection back to the client as a cursor-style first batch, creating a
// ClientCursor only if the batch limit is hit before the specs run out.
//
// Returns true on success; command-level failures (bad argument, missing
// db/collection, executor construction failure) are reported through
// appendCommandStatus and uassert.
bool run(OperationContext* txn,
         const string& dbname,
         BSONObj& cmdObj,
         int,
         string& errmsg,
         BSONObjBuilder& result) {
    // The command's first element names the target collection,
    // e.g. { listIndexes: "coll" }. It must be a non-empty string.
    BSONElement first = cmdObj.firstElement();
    uassert(28528,
            str::stream() << "Argument to listIndexes must be of type String, not "
                          << typeName(first.type()),
            first.type() == String);
    StringData collectionName = first.valueStringData();
    uassert(28529,
            str::stream() << "Argument to listIndexes must be a collection name, "
                          << "not the empty string",
            !collectionName.empty());
    const NamespaceString ns(dbname, collectionName);

    // With no explicit batchSize, effectively return everything in the first
    // batch (still capped below by the byte limit).
    const long long defaultBatchSize = std::numeric_limits<long long>::max();
    long long batchSize;
    Status parseCursorStatus = parseCommandCursorOptions(cmdObj, defaultBatchSize, &batchSize);
    if (!parseCursorStatus.isOK()) {
        return appendCommandStatus(result, parseCursorStatus);
    }

    // Take a read lock on the collection for the duration of catalog access.
    AutoGetCollectionForRead autoColl(txn, ns);
    if (!autoColl.getDb()) {
        return appendCommandStatus(result, Status(ErrorCodes::NamespaceNotFound, "no database"));
    }
    const Collection* collection = autoColl.getCollection();
    if (!collection) {
        return appendCommandStatus(result, Status(ErrorCodes::NamespaceNotFound, "no collection"));
    }
    const CollectionCatalogEntry* cce = collection->getCatalogEntry();
    invariant(cce);

    // Snapshot the index names; retried as a unit on write conflicts, so the
    // vector is cleared before each attempt.
    vector<string> indexNames;
    MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
        indexNames.clear();
        cce->getAllIndexes(txn, &indexNames);
    }
    MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "listIndexes", ns.ns());

    // Feed each index spec into a QueuedDataStage so the result can be
    // served through the normal PlanExecutor/cursor machinery.
    auto ws = make_unique<WorkingSet>();
    auto root = make_unique<QueuedDataStage>(txn, ws.get());
    for (size_t i = 0; i < indexNames.size(); i++) {
        BSONObj indexSpec;
        MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
            indexSpec = cce->getIndexSpec(txn, indexNames[i]);
        }
        MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "listIndexes", ns.ns());
        WorkingSetID id = ws->allocate();
        WorkingSetMember* member = ws->get(id);
        member->keyData.clear();
        member->loc = RecordId();
        // getOwned(): the member must own its copy of the spec independently
        // of the catalog buffer.
        member->obj = Snapshotted<BSONObj>(SnapshotId(), indexSpec.getOwned());
        member->transitionToOwnedObj();
        root->pushBack(id);
    }

    // Cursor namespace is of the form "<db>.$cmd.listIndexes.<coll>";
    // dasserts verify it round-trips through NamespaceString parsing.
    std::string cursorNamespace = str::stream() << dbname << ".$cmd." << name << "." << ns.coll();
    dassert(NamespaceString(cursorNamespace).isValid());
    dassert(NamespaceString(cursorNamespace).isListIndexesCursorNS());
    dassert(ns == NamespaceString(cursorNamespace).getTargetNSForListIndexes());

    auto statusWithPlanExecutor = PlanExecutor::make(
        txn, std::move(ws), std::move(root), cursorNamespace, PlanExecutor::YIELD_MANUAL);
    if (!statusWithPlanExecutor.isOK()) {
        return appendCommandStatus(result, statusWithPlanExecutor.getStatus());
    }
    unique_ptr<PlanExecutor> exec = std::move(statusWithPlanExecutor.getValue());

    // Drain the executor into the first batch, stopping at either the
    // requested batchSize or the per-reply byte limit.
    BSONArrayBuilder firstBatch;
    const int byteLimit = FindCommon::kMaxBytesToReturnToClientAtOnce;
    for (long long objCount = 0; objCount < batchSize && firstBatch.len() < byteLimit; objCount++) {
        BSONObj next;
        PlanExecutor::ExecState state = exec->getNext(&next, NULL);
        if (state == PlanExecutor::IS_EOF) {
            break;
        }
        invariant(state == PlanExecutor::ADVANCED);
        firstBatch.append(next);
    }

    // If results remain, park the executor in a ClientCursor (which takes
    // ownership via release()) so the client can getMore; cursorId 0 means
    // the cursor is exhausted.
    CursorId cursorId = 0LL;
    if (!exec->isEOF()) {
        exec->saveState();
        exec->detachFromOperationContext();
        ClientCursor* cursor =
            new ClientCursor(CursorManager::getGlobalCursorManager(),
                             exec.release(),
                             cursorNamespace,
                             txn->recoveryUnit()->isReadingFromMajorityCommittedSnapshot());
        cursorId = cursor->cursorid();
    }

    appendCursorResponseObject(cursorId, cursorNamespace, firstBatch.arr(), &result);
    return true;
}
// Background thread body: repeatedly issues the blocking "writebacklisten"
// command against one mongod (_addr), and for each writeback received,
// reconciles shard-version state, replays the original client message, and
// records the resulting getLastError under the originating connection ident.
//
// Loops until inShutdown(); on errors it backs off with an increasing sleep
// (reset after 10s). Exits early only if _addr is no longer in the cluster
// is signalled via shutdown during exception handling.
void WriteBackListener::run() {
    int secsToSleep = 0;
    // Tracks the last shard version we reloaded config for, so repeated
    // writebacks at the same version don't trigger redundant reloads.
    scoped_ptr<ShardChunkVersion> lastNeededVersion;
    int lastNeededCount = 0;
    while ( ! inShutdown() ) {

        if ( ! Shard::isAShardNode( _addr ) ) {
            LOG(1) << _addr << " is not a shard node" << endl;
            sleepsecs( 60 );
            continue;
        }

        try {
            scoped_ptr<ScopedDbConnection> conn(
                    ScopedDbConnection::getInternalScopedDbConnection( _addr ) );

            BSONObj result;

            {
                BSONObjBuilder cmd;
                cmd.appendOID( "writebacklisten" , &serverID ); // Command will block for data
                if ( ! conn->get()->runCommand( "admin" , cmd.obj() , result ) ) {
                    // getOwned(): result may reference the connection's buffer,
                    // which we are about to release with done().
                    result = result.getOwned();
                    log() << "writebacklisten command failed! " << result << endl;
                    conn->done();
                    continue;
                }

            }
            conn->done();

            LOG(1) << "writebacklisten result: " << result << endl;

            BSONObj data = result.getObjectField( "data" );
            if ( data.getBoolField( "writeBack" ) ) {
                string ns = data["ns"].valuestrsafe();

                // Identify which client connection this writeback belongs to,
                // so the eventual GLE can be routed back to it.
                ConnectionIdent cid( "" , 0 );
                OID wid;
                if ( data["connectionId"].isNumber() && data["id"].type() == jstOID ) {
                    string s = "";
                    if ( data["instanceIdent"].type() == String )
                        s = data["instanceIdent"].String();
                    cid = ConnectionIdent( s , data["connectionId"].numberLong() );
                    wid = data["id"].OID();
                }
                else {
                    warning() << "mongos/mongod version mismatch (1.7.5 is the split)" << endl;
                }

                int len; // not used, but needed for next call
                // Wrap the embedded binary payload as the original wire
                // message (false = Message does not own the buffer).
                Message msg( (void*)data["msg"].binData( len ) , false );
                massert( 10427 , "invalid writeback message" , msg.header()->valid() );

                DBConfigPtr db = grid.getDBConfig( ns );
                ShardChunkVersion needVersion = ShardChunkVersion::fromBSON( data, "version" );

                //
                // TODO: Refactor the sharded strategy to correctly handle all sharding state changes itself,
                //  we can't rely on WBL to do this for us b/c anything could reset our state in-between.
                //  We should always reload here for efficiency when possible, but staleness is also caught in the
                //  loop below.
                //

                ChunkManagerPtr manager;
                ShardPtr primary;
                db->getChunkManagerOrPrimary( ns, manager, primary );

                ShardChunkVersion currVersion;
                if( manager ) currVersion = manager->getVersion();

                LOG(1) << "connectionId: " << cid << " writebackId: " << wid << " needVersion : " << needVersion.toString()
                       << " mine : " << currVersion.toString() << endl;

                LOG(1) << msg.toString() << endl;

                //
                // We should reload only if we need to update our version to be compatible *and* we
                // haven't already done so. This avoids lots of reloading when we remove/add a sharded collection
                //

                bool alreadyReloaded = lastNeededVersion &&
                                       lastNeededVersion->isEquivalentTo( needVersion );

                if( alreadyReloaded ){
                    LOG(1) << "wbl already reloaded config information for version "
                           << needVersion << ", at version " << currVersion << endl;
                }
                else if( lastNeededVersion ) {
                    log() << "new version change detected to " << needVersion.toString()
                          << ", " << lastNeededCount << " writebacks processed at "
                          << lastNeededVersion->toString() << endl;
                    lastNeededCount = 0;
                }

                //
                // Set our lastNeededVersion for next time
                //

                lastNeededVersion.reset( new ShardChunkVersion( needVersion ) );
                lastNeededCount++;

                //
                // Determine if we should reload, if so, reload
                //

                bool shouldReload = ! needVersion.isWriteCompatibleWith( currVersion ) &&
                                    ! alreadyReloaded;

                if( shouldReload && currVersion.isSet()
                                 && needVersion.isSet()
                                 && currVersion.hasCompatibleEpoch( needVersion ) )
                {
                    //
                    // If we disagree about versions only, reload the chunk manager
                    //

                    db->getChunkManagerIfExists( ns, true );
                }
                else if( shouldReload ){
                    //
                    // If we disagree about anything else, reload the full db
                    //

                    warning() << "reloading config data for " << db->getName() << ", "
                              << "wanted version " << needVersion.toString()
                              << " but currently have version " << currVersion.toString() << endl;

                    db->reload();
                }

                // do request and then call getLastError
                // we have to call getLastError so we can return the right fields to the user if they decide to call getLastError

                BSONObj gle;
                int attempts = 0;
                while ( true ) {
                    attempts++;

                    try {
                        // Re-dispatch the original client message through the
                        // normal request path.
                        Request r( msg , 0 );
                        r.init();

                        r.d().reservedField() |= Reserved_FromWriteback;

                        ClientInfo * ci = r.getClientInfo();
                        if (!noauth) {
                            // TODO: Figure out why this is 'admin' instead of 'local'.
                            ci->getAuthenticationInfo()->authorize("admin", internalSecurity.user);
                        }
                        ci->noAutoSplit();

                        r.process( attempts );

                        ci->newRequest(); // this so we flip prev and cur shards

                        BSONObjBuilder b;
                        string errmsg;
                        if ( ! ci->getLastError( "admin",
                                                 BSON( "getLastError" << 1 ),
                                                 b, errmsg, true ) )
                        {
                            b.appendBool( "commandFailed" , true );

                            if( ! b.hasField( "errmsg" ) ){

                                b.append( "errmsg", errmsg );
                                gle = b.obj();
                            }
                            else if( errmsg.size() > 0 ){

                                // Rebuild GLE object with errmsg
                                // TODO: Make this less clumsy by improving GLE interface
                                gle = b.obj();

                                if( gle["errmsg"].type() == String ){

                                    BSONObj gleNoErrmsg =
                                            gle.filterFieldsUndotted( BSON( "errmsg" << 1 ),
                                                                      false );
                                    BSONObjBuilder bb;
                                    bb.appendElements( gleNoErrmsg );
                                    bb.append( "errmsg", gle["errmsg"].String() +
                                                         " ::and:: " +
                                                         errmsg );
                                    gle = bb.obj().getOwned();
                                }
                            }
                        }
                        else{
                            gle = b.obj();
                        }

                        log() << "GLE is " << gle << endl;

                        // Error code 9517 indicates stale config; reset our
                        // version tracking and retry (up to 4 attempts).
                        if ( gle["code"].numberInt() == 9517 ) {

                            log() << "new version change detected, "
                                  << lastNeededCount << " writebacks processed previously" << endl;

                            lastNeededVersion.reset();
                            lastNeededCount = 1;

                            log() << "writeback failed because of stale config, retrying attempts: " << attempts << endl;
                            LOG(1) << "writeback error : " << gle << endl;

                            //
                            // Bringing this in line with the similar retry logic elsewhere
                            //
                            // TODO: Reloading the chunk manager may not help if we dropped a
                            // collection, but we don't actually have that info in the writeback
                            // error
                            //

                            if( attempts <= 2 ){
                                db->getChunkManagerIfExists( ns, true );
                            }
                            else{
                                versionManager.forceRemoteCheckShardVersionCB( ns );
                                sleepsecs( attempts - 1 );
                            }

                            uassert( 15884, str::stream()
                                     << "Could not reload chunk manager after "
                                     << attempts << " attempts.", attempts <= 4 );

                            continue;
                        }

                        ci->clearSinceLastGetError();
                    }
                    catch ( DBException& e ) {
                        error() << "error processing writeback: " << e << endl;
                        BSONObjBuilder b;
                        e.getInfo().append( b, "err", "code" );
                        gle = b.obj();
                    }

                    break;
                }

                {
                    // Publish the GLE result so the originating mongos
                    // connection can pick it up.
                    scoped_lock lk( _seenWritebacksLock );
                    WBStatus& s = _seenWritebacks[cid];
                    s.id = wid;
                    s.gle = gle;
                }
            }
            else if ( result["noop"].trueValue() ) {
                // no-op
            }
            else {
                log() << "unknown writeBack result: " << result << endl;
            }

            secsToSleep = 0;
            continue;
        }
        catch ( std::exception& e ) {

            if ( inShutdown() ) {
                // we're shutting down, so just clean up
                return;
            }

            log() << "WriteBackListener exception : " << e.what() << endl;

            // It's possible this shard was removed
            Shard::reloadShardInfo();
        }
        catch ( ... ) {
            log() << "WriteBackListener uncaught exception!" << endl;
        }

        // Exponential-ish backoff on failure, reset after 10 seconds.
        secsToSleep++;
        sleepsecs(secsToSleep);
        if ( secsToSleep > 10 )
            secsToSleep = 0;
    }

    log() << "WriteBackListener exiting : address no longer in cluster " << _addr;
}
void operator()( DBClientCursorBatchIterator &i ) { Lock::GlobalWrite lk; if ( context ) { context->relocked(); } while( i.moreInCurrentBatch() ) { if ( n % 128 == 127 /*yield some*/ ) { time_t now = time(0); if( now - lastLog >= 60 ) { // report progress if( lastLog ) log() << "clone " << to_collection << ' ' << n << endl; lastLog = now; } mayInterrupt( _mayBeInterrupted ); dbtempreleaseif t( _mayYield ); } BSONObj tmp = i.nextSafe(); /* assure object is valid. note this will slow us down a little. */ if ( !tmp.valid() ) { stringstream ss; ss << "Cloner: skipping corrupt object from " << from_collection; BSONElement e = tmp.firstElement(); try { e.validate(); ss << " firstElement: " << e; } catch( ... ) { ss << " firstElement corrupt"; } out() << ss.str() << endl; continue; } ++n; BSONObj js = tmp; if ( isindex ) { verify( strstr(from_collection, "system.indexes") ); js = fixindex(tmp); storedForLater->push_back( js.getOwned() ); continue; } try { theDataFileMgr.insertWithObjMod(to_collection, js); if ( logForRepl ) logOp("i", to_collection, js); getDur().commitIfNeeded(); } catch( UserException& e ) { log() << "warning: exception cloning object in " << from_collection << ' ' << e.what() << " obj:" << js.toString() << '\n'; } RARELY if ( time( 0 ) - saveLast > 60 ) { log() << n << " objects cloned so far from collection " << from_collection << endl; saveLast = time( 0 ); } } }
void BatchedInsertRequest::setWriteConcern(const BSONObj& writeConcern) { _writeConcern = writeConcern.getOwned(); _isWriteConcernSet = true; }
MigrateInfo* BalancerPolicy::balance( const string& ns, const DistributionStatus& distribution, int balancedLastTime ) { // 1) check for shards that policy require to us to move off of: // draining only // 2) check tag policy violations // 3) then we make sure chunks are balanced for each tag // ---- // 1) check things we have to move { const set<string>& shards = distribution.shards(); for ( set<string>::const_iterator z = shards.begin(); z != shards.end(); ++z ) { string shard = *z; const ShardInfo& info = distribution.shardInfo( shard ); if ( ! info.isDraining() ) continue; if ( distribution.numberOfChunksInShard( shard ) == 0 ) continue; // now we know we need to move to chunks off this shard // we will if we are allowed if ( info.hasOpsQueued() ) { warning() << "want to shed load from " << shard << " but can't because it has ops queued" << endl; continue; } const vector<BSONObj>& chunks = distribution.getChunks( shard ); unsigned numJumboChunks = 0; // since we have to move all chunks, lets just do in order for ( unsigned i=0; i<chunks.size(); i++ ) { BSONObj chunkToMove = chunks[i]; if ( _isJumbo( chunkToMove ) ) { numJumboChunks++; continue; } string tag = distribution.getTagForChunk( chunkToMove ); string to = distribution.getBestReceieverShard( tag ); if ( to.size() == 0 ) { warning() << "want to move chunk: " << chunkToMove << "(" << tag << ") " << "from " << shard << " but can't find anywhere to put it" << endl; continue; } log() << "going to move " << chunkToMove << " from " << shard << "(" << tag << ")" << " to " << to << endl; return new MigrateInfo( ns, to, shard, chunkToMove.getOwned() ); } warning() << "can't find any chunk to move from: " << shard << " but we want to. 
" << " numJumboChunks: " << numJumboChunks << endl; } } // 2) tag violations if ( distribution.tags().size() > 0 ) { const set<string>& shards = distribution.shards(); for ( set<string>::const_iterator i = shards.begin(); i != shards.end(); ++i ) { string shard = *i; const ShardInfo& info = distribution.shardInfo( shard ); const vector<BSONObj>& chunks = distribution.getChunks( shard ); for ( unsigned j = 0; j < chunks.size(); j++ ) { string tag = distribution.getTagForChunk( chunks[j] ); if ( info.hasTag( tag ) ) continue; // uh oh, this chunk is in the wrong place log() << "chunk " << chunks[j] << " is not on a shard with the right tag: " << tag << endl; if ( _isJumbo( chunks[j] ) ) { warning() << "chunk " << chunks[j] << " is jumbo, so cannot be moved" << endl; continue; } string to = distribution.getBestReceieverShard( tag ); if ( to.size() == 0 ) { log() << "no where to put it :(" << endl; continue; } verify( to != shard ); log() << " going to move to: " << to << endl; return new MigrateInfo( ns, to, shard, chunks[j].getOwned() ); } } } // 3) for each tag balance int threshold = 8; if ( balancedLastTime || distribution.totalChunks() < 20 ) threshold = 2; else if ( distribution.totalChunks() < 80 ) threshold = 4; // randomize the order in which we balance the tags // this is so that one bad tag doesn't prevent others from getting balanced vector<string> tags; { set<string> t = distribution.tags(); for ( set<string>::const_iterator i = t.begin(); i != t.end(); ++i ) tags.push_back( *i ); tags.push_back( "" ); std::random_shuffle( tags.begin(), tags.end() ); } for ( unsigned i=0; i<tags.size(); i++ ) { string tag = tags[i]; string from = distribution.getMostOverloadedShard( tag ); if ( from.size() == 0 ) continue; unsigned max = distribution.numberOfChunksInShardWithTag( from, tag ); if ( max == 0 ) continue; string to = distribution.getBestReceieverShard( tag ); if ( to.size() == 0 ) { log() << "no available shards to take chunks for tag [" << tag << "]" << 
endl; return NULL; } unsigned min = distribution.numberOfChunksInShardWithTag( to, tag ); const int imbalance = max - min; LOG(1) << "collection : " << ns << endl; LOG(1) << "donor : " << from << " chunks on " << max << endl; LOG(1) << "receiver : " << to << " chunks on " << min << endl; LOG(1) << "threshold : " << threshold << endl; if ( imbalance < threshold ) continue; const vector<BSONObj>& chunks = distribution.getChunks( from ); unsigned numJumboChunks = 0; for ( unsigned j = 0; j < chunks.size(); j++ ) { if ( distribution.getTagForChunk( chunks[j] ) != tag ) continue; if ( _isJumbo( chunks[j] ) ) { numJumboChunks++; continue; } log() << " ns: " << ns << " going to move " << chunks[j] << " from: " << from << " to: " << to << " tag [" << tag << "]" << endl; return new MigrateInfo( ns, to, from, chunks[j] ); } if ( numJumboChunks ) { error() << "shard: " << from << "ns: " << ns << "has too many chunks, but they are all jumbo " << " numJumboChunks: " << numJumboChunks << endl; continue; } verify( false ); // should be impossible } // Everything is balanced here! return NULL; }
IndexBuilder::IndexBuilder(const BSONObj& index) : BackgroundJob(true /* self-delete */), _index(index.getOwned()), _name(str::stream() << "repl index builder " << _indexBuildCount.addAndFetch(1)) { }
void StorageEngineMetadata::setStorageEngineOptions(const BSONObj& storageEngineOptions) { _storageEngineOptions = storageEngineOptions.getOwned(); }