bool setShardVersion(DBClientBase& conn, const string& ns, const string& configServerPrimary, ChunkVersion version, ChunkManager* manager, bool authoritative, BSONObj& result) { BSONObjBuilder cmdBuilder; cmdBuilder.append("setShardVersion", ns); cmdBuilder.append("configdb", configServerPrimary); Shard s = Shard::make(conn.getServerAddress()); cmdBuilder.append("shard", s.getName()); cmdBuilder.append("shardHost", s.getConnString()); if (ns.size() > 0) { version.addToBSON(cmdBuilder); } else { cmdBuilder.append("init", true); } if (authoritative) { cmdBuilder.appendBool("authoritative", 1); } BSONObj cmd = cmdBuilder.obj(); LOG(1) << " setShardVersion " << s.getName() << " " << conn.getServerAddress() << " " << ns << " " << cmd << (manager ? string(str::stream() << " " << manager->getSequenceNumber()) : ""); return conn.runCommand("admin", cmd, result, 0); }
bool Chunk::moveAndCommit(const Shard& to, long long chunkSize /* bytes */, const WriteConcernOptions* writeConcern, bool waitForDelete, int maxTimeMS, BSONObj& res) const { uassert( 10167 , "can't move shard to its current location!" , getShard() != to ); log() << "moving chunk ns: " << _manager->getns() << " moving ( " << toString() << ") " << _shard.toString() << " -> " << to.toString(); Shard from = _shard; ScopedDbConnection fromconn(from.getConnString()); BSONObjBuilder builder; builder.append("moveChunk", _manager->getns()); builder.append("from", from.getAddress().toString()); builder.append("to", to.getAddress().toString()); // NEEDED FOR 2.0 COMPATIBILITY builder.append("fromShard", from.getName()); builder.append("toShard", to.getName()); /////////////////////////////// builder.append("min", _min); builder.append("max", _max); builder.append("maxChunkSizeBytes", chunkSize); builder.append("shardId", genID()); builder.append("configdb", configServer.modelServer()); // For legacy secondary throttle setting. bool secondaryThrottle = true; if (writeConcern && writeConcern->wNumNodes <= 1 && writeConcern->wMode.empty()) { secondaryThrottle = false; } builder.append("secondaryThrottle", secondaryThrottle); if (secondaryThrottle && writeConcern) { builder.append("writeConcern", writeConcern->toBSON()); } builder.append("waitForDelete", waitForDelete); builder.append(LiteParsedQuery::cmdOptionMaxTimeMS, maxTimeMS); builder.append("epoch", _manager->getVersion().epoch()); bool worked = fromconn->runCommand("admin", builder.done(), res); fromconn.done(); LOG( worked ? 1 : 0 ) << "moveChunk result: " << res; // if succeeded, needs to reload to pick up the new location // if failed, mongos may be stale // reload is excessive here as the failure could be simply because collection metadata is taken _manager->reload(); return worked; }
void getAllShards( vector<Shard>& all ) { scoped_lock lk( _mutex ); std::set<string> seen; for ( map<string,Shard>::iterator i = _lookup.begin(); i!=_lookup.end(); ++i ) { Shard s = i->second; if ( s.getName() == "config" ) continue; if ( seen.count( s.getName() ) ) continue; seen.insert( s.getName() ); all.push_back( s ); } }
Status ChunkManagerTargeter::targetShardKey(const BSONObj& doc, ShardEndpoint** endpoint) const { invariant(NULL != _manager); dassert(_manager->hasShardKey(doc)); ChunkPtr chunk = _manager->findChunkForDoc(doc); Shard shard = chunk->getShard(); *endpoint = new ShardEndpoint(shard.getName(), _manager->getVersion(StringData(shard.getName()))); return Status::OK(); }
Status DBClientShardResolver::chooseWriteHost( const string& shardName, ConnectionString* shardHost ) const { // Declare up here for parsing later string errMsg; // Special-case for config if (shardName == "config") { *shardHost = ConnectionString::parse( configServer.modelServer(), errMsg ); dassert( errMsg == "" ); return Status::OK(); } // // First get the information about the shard from the shard cache // // Internally uses our shard cache, does no reload Shard shard = Shard::findIfExists( shardName ); if ( shard.getName() == "" ) { return Status( ErrorCodes::ShardNotFound, string("unknown shard name ") + shardName ); } return findMaster(shard.getConnString().toString(), shardHost); }
bool setShardVersion( DBClientBase & conn , const string& ns , ShardChunkVersion version , bool authoritative , BSONObj& result ){ BSONObjBuilder cmdBuilder; cmdBuilder.append( "setShardVersion" , ns.c_str() ); cmdBuilder.append( "configdb" , configServer.modelServer() ); cmdBuilder.appendTimestamp( "version" , version ); cmdBuilder.appendOID( "serverID" , &serverID ); if ( authoritative ) cmdBuilder.appendBool( "authoritative" , 1 ); Shard s = Shard::make( conn.getServerAddress() ); cmdBuilder.append( "shard" , s.getName() ); cmdBuilder.append( "shardHost" , s.getConnString() ); BSONObj cmd = cmdBuilder.obj(); log(1) << " setShardVersion " << s.getName() << " " << conn.getServerAddress() << " " << ns << " " << cmd << " " << &conn << endl; return conn.runCommand( "admin" , cmd , result ); }
void remove( const string& name ) { scoped_lock lk( _mutex ); for ( map<string,Shard>::iterator i = _lookup.begin(); i!=_lookup.end(); ) { Shard s = i->second; if ( s.getName() == name ) { _lookup.erase(i++); } else { ++i; } } }
bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) { string target = cmdObj.firstElement().valuestrsafe(); Shard s = Shard::make( target ); if ( ! grid.knowAboutShard( s.getConnString() ) ) { errmsg = "unknown shard"; return false; } ScopedDbConnection conn( configServer.getPrimary() ); if (conn->count("config.shards", BSON("_id" << NE << s.getName() << ShardFields::draining(true)))){ conn.done(); errmsg = "Can't have more than one draining shard at a time"; return false; } if (conn->count("config.shards", BSON("_id" << NE << s.getName())) == 0){ conn.done(); errmsg = "Can't remove last shard"; return false; } BSONObj primaryDoc = BSON( "_id" << NE << "local" << "primary" << s.getName() ); BSONObj dbInfo; // appended at end of result on success { boost::scoped_ptr<DBClientCursor> cursor (conn->query("config.databases", primaryDoc)); if (cursor->more()) { // skip block and allocations if empty BSONObjBuilder dbInfoBuilder; dbInfoBuilder.append("note", "you need to drop or movePrimary these databases"); BSONArrayBuilder dbs(dbInfoBuilder.subarrayStart("dbsToMove")); while (cursor->more()){ BSONObj db = cursor->nextSafe(); dbs.append(db["_id"]); } dbs.doneFast(); dbInfo = dbInfoBuilder.obj(); } } // If the server is not yet draining chunks, put it in draining mode. BSONObj searchDoc = BSON( "_id" << s.getName() ); BSONObj drainingDoc = BSON( "_id" << s.getName() << ShardFields::draining(true) ); BSONObj shardDoc = conn->findOne( "config.shards", drainingDoc ); if ( shardDoc.isEmpty() ) { // TODO prevent move chunks to this shard. log() << "going to start draining shard: " << s.getName() << endl; BSONObj newStatus = BSON( "$set" << BSON( ShardFields::draining(true) ) ); conn->update( "config.shards" , searchDoc , newStatus, false /* do no upsert */); errmsg = conn->getLastError(); if ( errmsg.size() ) { log() << "error starting remove shard: " << s.getName() << " err: " << errmsg << endl; return false; } BSONObj primaryLocalDoc = BSON("_id" << "local" << "primary" << s.getName() ); PRINT(primaryLocalDoc); if (conn->count("config.databases", primaryLocalDoc)) { log() << "This shard is listed as primary of local db. Removing entry." << endl; conn->remove("config.databases", BSON("_id" << "local")); errmsg = conn->getLastError(); if ( errmsg.size() ) { log() << "error removing local db: " << errmsg << endl; return false; } } Shard::reloadShardInfo(); result.append( "msg" , "draining started successfully" ); result.append( "state" , "started" ); result.append( "shard" , s.getName() ); result.appendElements(dbInfo); conn.done(); return true; } // If the server has been completely drained, remove it from the ConfigDB. // Check not only for chunks but also databases. BSONObj shardIDDoc = BSON( "shard" << shardDoc[ "_id" ].str() ); long long chunkCount = conn->count( "config.chunks" , shardIDDoc ); long long dbCount = conn->count( "config.databases" , primaryDoc ); if ( ( chunkCount == 0 ) && ( dbCount == 0 ) ) { log() << "going to remove shard: " << s.getName() << endl; conn->remove( "config.shards" , searchDoc ); errmsg = conn->getLastError(); if ( errmsg.size() ) { log() << "error concluding remove shard: " << s.getName() << " err: " << errmsg << endl; return false; } Shard::removeShard( shardDoc[ "_id" ].str() ); Shard::reloadShardInfo(); result.append( "msg" , "removeshard completed successfully" ); result.append( "state" , "completed" ); result.append( "shard" , s.getName() ); conn.done(); return true; } // If the server is already in draining mode, just report on its progress. // Report on databases (not just chunks) that are left too. result.append( "msg" , "draining ongoing" ); result.append( "state" , "ongoing" ); BSONObjBuilder inner; inner.append( "chunks" , chunkCount ); inner.append( "dbs" , dbCount ); result.append( "remaining" , inner.obj() ); result.appendElements(dbInfo); conn.done(); return true; }
bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { string target = cmdObj.firstElement().valuestrsafe(); Shard s = Shard::make( target ); if ( ! grid.knowAboutShard( s.getConnString() ) ) { errmsg = "unknown shard"; return false; } ScopedDbConnection conn( configServer.getPrimary() ); // If the server is not yet draining chunks, put it in draining mode. BSONObj searchDoc = BSON( "_id" << s.getName() ); BSONObj drainingDoc = BSON( "_id" << s.getName() << ShardFields::draining(true) ); BSONObj shardDoc = conn->findOne( "config.shards", drainingDoc ); if ( shardDoc.isEmpty() ) { // TODO prevent move chunks to this shard. log() << "going to start draining shard: " << s.getName() << endl; BSONObj newStatus = BSON( "$set" << BSON( ShardFields::draining(true) ) ); conn->update( "config.shards" , searchDoc , newStatus, false /* do no upsert */); errmsg = conn->getLastError(); if ( errmsg.size() ) { log() << "error starting remove shard: " << s.getName() << " err: " << errmsg << endl; return false; } Shard::reloadShardInfo(); result.append( "msg" , "draining started successfully" ); result.append( "state" , "started" ); result.append( "shard" , s.getName() ); conn.done(); return true; } // If the server has been completely drained, remove it from the ConfigDB. // Check not only for chunks but also databases. BSONObj shardIDDoc = BSON( "shard" << shardDoc[ "_id" ].str() ); long long chunkCount = conn->count( "config.chunks" , shardIDDoc ); BSONObj primaryDoc = BSON( "primary" << shardDoc[ "_id" ].str() ); long long dbCount = conn->count( "config.databases" , primaryDoc ); if ( ( chunkCount == 0 ) && ( dbCount == 0 ) ) { log() << "going to remove shard: " << s.getName() << endl; conn->remove( "config.shards" , searchDoc ); errmsg = conn->getLastError(); if ( errmsg.size() ) { log() << "error concluding remove shard: " << s.getName() << " err: " << errmsg << endl; return false; } Shard::removeShard( shardDoc[ "_id" ].str() ); Shard::reloadShardInfo(); result.append( "msg" , "removeshard completed successfully" ); result.append( "state" , "completed" ); result.append( "shard" , s.getName() ); conn.done(); return true; } // If the server is already in draining mode, just report on its progress. // Report on databases (not just chunks) that are left too. result.append( "msg" , "draining ongoing" ); result.append( "state" , "ongoing" ); BSONObjBuilder inner; inner.append( "chunks" , chunkCount ); inner.append( "dbs" , dbCount ); result.append( "remaining" , inner.obj() ); conn.done(); return true; }
/** * @return true if had to do something */ bool checkShardVersion( DBClientBase * conn_in , const string& ns , ChunkManagerPtr refManager, bool authoritative , int tryNumber ) { // TODO: cache, optimize, etc... WriteBackListener::init( *conn_in ); DBConfigPtr conf = grid.getDBConfig( ns ); if ( ! conf ) return false; DBClientBase* conn = getVersionable( conn_in ); verify(conn); // errors thrown above unsigned long long officialSequenceNumber = 0; ChunkManagerPtr manager; const bool isSharded = conf->isSharded( ns ); if ( isSharded ) { manager = conf->getChunkManagerIfExists( ns , authoritative ); // It's possible the chunk manager was reset since we checked whether sharded was true, // so must check this here. if( manager ) officialSequenceNumber = manager->getSequenceNumber(); } // Check this manager against the reference manager if( isSharded && manager ){ Shard shard = Shard::make( conn->getServerAddress() ); if( refManager && ! refManager->compatibleWith( manager, shard ) ){ throw SendStaleConfigException( ns, str::stream() << "manager (" << manager->getVersion( shard ).toString() << " : " << manager->getSequenceNumber() << ") " << "not compatible with reference manager (" << refManager->getVersion( shard ).toString() << " : " << refManager->getSequenceNumber() << ") " << "on shard " << shard.getName() << " (" << shard.getAddress().toString() << ")", refManager->getVersion( shard ), manager->getVersion( shard ) ); } } else if( refManager ){ Shard shard = Shard::make( conn->getServerAddress() ); string msg( str::stream() << "not sharded (" << ( (manager.get() == 0) ? string( "<none>" ) : str::stream() << manager->getSequenceNumber() ) << ") but has reference manager (" << refManager->getSequenceNumber() << ") " << "on conn " << conn->getServerAddress() << " (" << conn_in->getServerAddress() << ")" ); throw SendStaleConfigException( ns, msg, refManager->getVersion( shard ), ShardChunkVersion( 0, OID() )); } // has the ChunkManager been reloaded since the last time we updated the connection-level version? // (ie., last time we issued the setShardVersions below) unsigned long long sequenceNumber = connectionShardStatus.getSequence(conn,ns); if ( sequenceNumber == officialSequenceNumber ) { return false; } ShardChunkVersion version = ShardChunkVersion( 0, OID() ); if ( isSharded && manager ) { version = manager->getVersion( Shard::make( conn->getServerAddress() ) ); } if( ! version.isSet() ){ LOG(0) << "resetting shard version of " << ns << " on " << conn->getServerAddress() << ", " << ( ! isSharded ? "no longer sharded" : ( ! manager ? "no chunk manager found" : "version is zero" ) ) << endl; } LOG(2) << " have to set shard version for conn: " << conn->getServerAddress() << " ns:" << ns << " my last seq: " << sequenceNumber << " current: " << officialSequenceNumber << " version: " << version << " manager: " << manager.get() << endl; const string versionableServerAddress(conn->getServerAddress()); BSONObj result; if ( setShardVersion( *conn , ns , version , authoritative , result ) ) { // success! LOG(1) << " setShardVersion success: " << result << endl; connectionShardStatus.setSequence( conn , ns , officialSequenceNumber ); return true; } LOG(1) << " setShardVersion failed!\n" << result << endl; if ( result["need_authoritative"].trueValue() ) massert( 10428 , "need_authoritative set but in authoritative mode already" , ! authoritative ); if ( ! authoritative ) { // use the original connection and get a fresh versionable connection // since conn can be invalidated (or worse, freed) after the failure checkShardVersion(conn_in, ns, refManager, 1, tryNumber + 1); return true; } if ( result["reloadConfig"].trueValue() ) { if( result["version"].timestampTime() == 0 ){ warning() << "reloading full configuration for " << conf->getName() << ", connection state indicates significant version changes" << endl; // reload db conf->reload(); } else { // reload config conf->getChunkManager( ns , true ); } } const int maxNumTries = 7; if ( tryNumber < maxNumTries ) { LOG( tryNumber < ( maxNumTries / 2 ) ? 1 : 0 ) << "going to retry checkShardVersion host: " << versionableServerAddress << " " << result << endl; sleepmillis( 10 * tryNumber ); // use the original connection and get a fresh versionable connection // since conn can be invalidated (or worse, freed) after the failure checkShardVersion(conn_in, ns, refManager, true, tryNumber + 1); return true; } string errmsg = str::stream() << "setShardVersion failed host: " << versionableServerAddress << " " << result; log() << " " << errmsg << endl; massert( 10429 , errmsg , 0 ); return true; }
void Balancer::_doBalanceRound( DBClientBase& conn, vector<CandidateChunkPtr>* candidateChunks ) { verify( candidateChunks ); // // 1. Check whether there is any sharded collection to be balanced by querying // the ShardsNS::collections collection // auto_ptr<DBClientCursor> cursor = conn.query( ShardNS::collection , BSONObj() ); vector< string > collections; while ( cursor->more() ) { BSONObj col = cursor->nextSafe(); // sharded collections will have a shard "key". if ( ! col["key"].eoo() && ! col["noBalance"].trueValue() ){ collections.push_back( col["_id"].String() ); } else if( col["noBalance"].trueValue() ){ LOG(1) << "not balancing collection " << col["_id"].String() << ", explicitly disabled" << endl; } } cursor.reset(); if ( collections.empty() ) { LOG(1) << "no collections to balance" << endl; return; } // // 2. Get a list of all the shards that are participating in this balance round // along with any maximum allowed quotas and current utilization. We get the // latter by issuing db.serverStatus() (mem.mapped) to all shards. // // TODO: skip unresponsive shards and mark information as stale. // vector<Shard> allShards; Shard::getAllShards( allShards ); if ( allShards.size() < 2) { LOG(1) << "can't balance without more active shards" << endl; return; } ShardInfoMap shardInfo; for ( vector<Shard>::const_iterator it = allShards.begin(); it != allShards.end(); ++it ) { const Shard& s = *it; ShardStatus status = s.getStatus(); shardInfo[ s.getName() ] = ShardInfo( s.getMaxSize(), status.mapped(), s.isDraining(), status.hasOpsQueued(), s.tags() ); } // // 3. For each collection, check if the balancing policy recommends moving anything around. // for (vector<string>::const_iterator it = collections.begin(); it != collections.end(); ++it ) { const string& ns = *it; map< string,vector<BSONObj> > shardToChunksMap; cursor = conn.query( ShardNS::chunk , QUERY( "ns" << ns ).sort( "min" ) ); while ( cursor->more() ) { BSONObj chunk = cursor->nextSafe(); if ( chunk["jumbo"].trueValue() ) continue; vector<BSONObj>& chunks = shardToChunksMap[chunk["shard"].String()]; chunks.push_back( chunk.getOwned() ); } cursor.reset(); if (shardToChunksMap.empty()) { LOG(1) << "skipping empty collection (" << ns << ")"; continue; } for ( vector<Shard>::iterator i=allShards.begin(); i!=allShards.end(); ++i ) { // this just makes sure there is an entry in shardToChunksMap for every shard Shard s = *i; shardToChunksMap[s.getName()].size(); } DistributionStatus status( shardInfo, shardToChunksMap ); // load tags conn.ensureIndex( ShardNS::tags, BSON( "ns" << 1 << "min" << 1 ), true ); cursor = conn.query( ShardNS::tags , QUERY( "ns" << ns ).sort( "min" ) ); while ( cursor->more() ) { BSONObj tag = cursor->nextSafe(); uassert( 16356 , str::stream() << "tag ranges not valid for: " << ns , status.addTagRange( TagRange( tag["min"].Obj().getOwned(), tag["max"].Obj().getOwned(), tag["tag"].String() ) ) ); } cursor.reset(); CandidateChunk* p = _policy->balance( ns, status, _balancedLastTime ); if ( p ) candidateChunks->push_back( CandidateChunkPtr( p ) ); } }
void Balancer::_doBalanceRound( DBClientBase& conn, vector<CandidateChunkPtr>* candidateChunks ){ assert( candidateChunks ); // // 1. Check whether there is any sharded collection to be balanced by querying // the ShardsNS::database collection // // { "_id" : "test", "partitioned" : true, "primary" : "shard0", // "sharded" : { // "test.images" : { "key" : { "_id" : 1 }, "unique" : false }, // ... // } // } // auto_ptr<DBClientCursor> cursor = conn.query( ShardNS::database , BSON( "partitioned" << true ) ); vector< string > collections; while ( cursor->more() ){ BSONObj db = cursor->next(); // A database may be partitioned but not yet have a sharded collection. // 'cursor' will point to docs that do not contain the "sharded" key. Since // there'd be nothing to balance, we want to skip those here. BSONElement shardedColls = db["sharded"]; if ( shardedColls.eoo() ){ log(2) << "balancer: skipping database with no sharded collection (" << db["_id"].str() << ")" << endl; continue; } BSONObjIterator i( shardedColls.Obj() ); while ( i.more() ){ BSONElement e = i.next(); collections.push_back( e.fieldName() ); } } cursor.reset(); if ( collections.empty() ) { log(1) << "balancer: no collections to balance" << endl; return; } // // 2. Get a list of all the shards that are participating in this balance round // along with any maximum allowed quotas and current utilization. We get the // latter by issuing db.serverStatus() (mem.mapped) to all shards. // // TODO: skip unresponsive shards and mark information as stale. // vector<Shard> allShards; Shard::getAllShards( allShards ); if ( allShards.size() < 2) { log(1) << "balancer: can't balance without more active shards" << endl; return; } map< string, BSONObj > shardLimitsMap; for ( vector<Shard>::const_iterator it = allShards.begin(); it != allShards.end(); ++it ){ const Shard& s = *it; ShardStatus status = s.getStatus(); BSONObj limitsObj = BSON( ShardFields::maxSize( s.getMaxSize() ) << ShardFields::currSize( status.mapped() ) << ShardFields::draining( s.isDraining()) ); shardLimitsMap[ s.getName() ] = limitsObj; } // // 3. For each collection, check if the balancing policy recommends moving anything around. // for (vector<string>::const_iterator it = collections.begin(); it != collections.end(); ++it ) { const string& ns = *it; map< string,vector<BSONObj> > shardToChunksMap; cursor = conn.query( ShardNS::chunk , QUERY( "ns" << ns ).sort( "min" ) ); while ( cursor->more() ){ BSONObj chunk = cursor->next(); vector<BSONObj>& chunks = shardToChunksMap[chunk["shard"].String()]; chunks.push_back( chunk.getOwned() ); } cursor.reset(); if (shardToChunksMap.empty()) { log(1) << "balancer: skipping empty collection (" << ns << ")"; continue; } for ( vector<Shard>::iterator i=allShards.begin(); i!=allShards.end(); ++i ){ // this just makes sure there is an entry in shardToChunksMap for every shard Shard s = *i; shardToChunksMap[s.getName()].size(); } CandidateChunk* p = _policy->balance( ns , shardLimitsMap , shardToChunksMap , _balancedLastTime ); if ( p ) candidateChunks->push_back( CandidateChunkPtr( p ) ); } }
/** * Updates the remote cached version on the remote shard host (primary, in the case of replica * sets) if needed with a fully-qualified shard version for the given namespace: * config server(s) + shard name + shard version * * If no remote cached version has ever been set, an initial shard version is sent. * * If the namespace is empty and no version has ever been sent, the config server + shard name * is sent to the remote shard host to initialize the connection as coming from mongos. * NOTE: This initialization is *best-effort only*. Operations which wish to correctly version * must send the namespace. * * Config servers are special and are not (unless otherwise a shard) kept up to date with this * protocol. This is safe so long as config servers only contain unversioned collections. * * It is an error to call checkShardVersion with an unversionable connection (isVersionableCB). * * @return true if we contacted the remote host */ bool checkShardVersion( DBClientBase * conn_in , const string& ns , ChunkManagerPtr refManager, bool authoritative , int tryNumber ) { // TODO: cache, optimize, etc... // Empty namespaces are special - we require initialization but not versioning if (ns.size() == 0) { return initShardVersionEmptyNS(conn_in); } DBConfigPtr conf = grid.getDBConfig( ns ); if ( ! conf ) return false; DBClientBase* conn = getVersionable( conn_in ); verify(conn); // errors thrown above unsigned long long officialSequenceNumber = 0; ShardPtr primary; ChunkManagerPtr manager; if (authoritative) conf->getChunkManagerIfExists(ns, true); conf->getChunkManagerOrPrimary(ns, manager, primary); if (manager) officialSequenceNumber = manager->getSequenceNumber(); // Check this manager against the reference manager if( manager ){ Shard shard = Shard::make( conn->getServerAddress() ); if (refManager && !refManager->compatibleWith(*manager, shard.getName())) { const ChunkVersion refVersion(refManager->getVersion(shard.getName())); const ChunkVersion currentVersion(manager->getVersion(shard.getName())); string msg(str::stream() << "manager (" << currentVersion.toString() << " : " << manager->getSequenceNumber() << ") " << "not compatible with reference manager (" << refVersion.toString() << " : " << refManager->getSequenceNumber() << ") " << "on shard " << shard.getName() << " (" << shard.getAddress().toString() << ")"); throw SendStaleConfigException(ns, msg, refVersion, currentVersion); } } else if( refManager ){ Shard shard = Shard::make(conn->getServerAddress()); string msg( str::stream() << "not sharded (" << ( (manager.get() == 0) ? string( "<none>" ) : str::stream() << manager->getSequenceNumber() ) << ") but has reference manager (" << refManager->getSequenceNumber() << ") " << "on conn " << conn->getServerAddress() << " (" << conn_in->getServerAddress() << ")" ); throw SendStaleConfigException(ns, msg, refManager->getVersion(shard.getName()), ChunkVersion::UNSHARDED()); } // Do not send setShardVersion to collections on the config servers - this causes problems // when config servers are also shards and get SSV with conflicting names. // TODO: Make config servers regular shards if (primary && primary->getName() == "config") { return false; } // Has the ChunkManager been reloaded since the last time we updated the shard version over // this connection? If we've never updated the shard version, do so now. unsigned long long sequenceNumber = 0; if (connectionShardStatus.getSequence(conn, ns, &sequenceNumber)) { if (sequenceNumber == officialSequenceNumber) { return false; } } // Now that we're sure we're sending SSV and not to a single config server, get the shard Shard shard = Shard::make(conn->getServerAddress()); ChunkVersion version = ChunkVersion(0, 0, OID()); if (manager) version = manager->getVersion(shard.getName()); LOG(1) << "setting shard version of " << version << " for " << ns << " on shard " << shard.toString(); LOG(3) << "last version sent with chunk manager iteration " << sequenceNumber << ", current chunk manager iteration is " << officialSequenceNumber; BSONObj result; if (setShardVersion(*conn, ns, configServer.modelServer(), version, manager.get(), authoritative, result)) { LOG(1) << " setShardVersion success: " << result; connectionShardStatus.setSequence( conn , ns , officialSequenceNumber ); return true; } LOG(1) << " setShardVersion failed!\n" << result << endl; if ( result["need_authoritative"].trueValue() ) massert( 10428 , "need_authoritative set but in authoritative mode already" , ! authoritative ); if ( ! authoritative ) { // use the original connection and get a fresh versionable connection // since conn can be invalidated (or worse, freed) after the failure checkShardVersion(conn_in, ns, refManager, 1, tryNumber + 1); return true; } if ( result["reloadConfig"].trueValue() ) { if( result["version"].timestampTime() == 0 ){ warning() << "reloading full configuration for " << conf->name() << ", connection state indicates significant version changes"; // reload db conf->reload(); } else { // reload config conf->getChunkManager( ns , true ); } } const int maxNumTries = 7; if ( tryNumber < maxNumTries ) { LOG( tryNumber < ( maxNumTries / 2 ) ? 1 : 0 ) << "going to retry checkShardVersion shard: " << shard.toString() << " " << result; sleepmillis( 10 * tryNumber ); // use the original connection and get a fresh versionable connection // since conn can be invalidated (or worse, freed) after the failure checkShardVersion(conn_in, ns, refManager, true, tryNumber + 1); return true; } string errmsg = str::stream() << "setShardVersion failed shard: " << shard.toString() << " " << result; log() << " " << errmsg << endl; massert( 10429 , errmsg , 0 ); return true; }
Status DBClientShardResolver::chooseWriteHost( const string& shardName, ConnectionString* shardHost ) const { // Declare up here for parsing later string errMsg; // Special-case for config and admin if ( shardName == "config" || shardName == "admin" ) { *shardHost = ConnectionString::parse( configServer.modelServer(), errMsg ); dassert( errMsg == "" ); return Status::OK(); } // // First get the information about the shard from the shard cache // // Internally uses our shard cache, does no reload Shard shard = Shard::findIfExists( shardName ); if ( shard.getName() == "" ) { return Status( ErrorCodes::ShardNotFound, string("unknown shard name ") + shardName ); } ConnectionString rawShardHost = ConnectionString::parse( shard.getConnString(), errMsg ); dassert( errMsg == "" ); dassert( rawShardHost.type() == ConnectionString::SET || rawShardHost.type() == ConnectionString::MASTER ); if ( rawShardHost.type() == ConnectionString::MASTER ) { *shardHost = rawShardHost; return Status::OK(); } // // If we need to, then get the particular node we're targeting in the replica set // // Does not reload the monitor if it doesn't currently exist ReplicaSetMonitorPtr replMonitor = ReplicaSetMonitor::get( rawShardHost.getSetName(), false ); if ( !replMonitor ) { return Status( ErrorCodes::ReplicaSetNotFound, string("unknown replica set ") + rawShardHost.getSetName() ); } try { // This can throw when we don't find a master! HostAndPort masterHostAndPort = replMonitor->getMaster(); *shardHost = ConnectionString::parse( masterHostAndPort.toString( true ), errMsg ); dassert( errMsg == "" ); return Status::OK(); } catch ( const DBException& ) { return Status( ErrorCodes::HostNotFound, string("could not contact primary for replica set ") + replMonitor->getName() ); } // Unreachable dassert( false ); return Status( ErrorCodes::UnknownError, "" ); }
bool handleSpecialNamespaces( Request& r , QueryMessage& q ) { const char * ns = r.getns(); ns = strstr( r.getns() , ".$cmd.sys." ); if ( ! ns ) return false; ns += 10; r.checkAuth( Auth::WRITE ); BSONObjBuilder b; vector<Shard> shards; if ( strcmp( ns , "inprog" ) == 0 ) { Shard::getAllShards( shards ); BSONArrayBuilder arr( b.subarrayStart( "inprog" ) ); for ( unsigned i=0; i<shards.size(); i++ ) { Shard shard = shards[i]; ScopedDbConnection conn( shard ); BSONObj temp = conn->findOne( r.getns() , BSONObj() ); if ( temp["inprog"].isABSONObj() ) { BSONObjIterator i( temp["inprog"].Obj() ); while ( i.more() ) { BSONObjBuilder x; BSONObjIterator j( i.next().Obj() ); while( j.more() ) { BSONElement e = j.next(); if ( str::equals( e.fieldName() , "opid" ) ) { stringstream ss; ss << shard.getName() << ':' << e.numberInt(); x.append( "opid" , ss.str() ); } else if ( str::equals( e.fieldName() , "client" ) ) { x.appendAs( e , "client_s" ); } else { x.append( e ); } } arr.append( x.obj() ); } } conn.done(); } arr.done(); } else if ( strcmp( ns , "killop" ) == 0 ) { BSONElement e = q.query["op"]; if ( strstr( r.getns() , "admin." ) == 0 ) { b.append( "err" , "unauthorized" ); } else if ( e.type() != String ) { b.append( "err" , "bad op" ); b.append( e ); } else { b.append( e ); string s = e.String(); string::size_type i = s.find( ':' ); if ( i == string::npos ) { b.append( "err" , "bad opid" ); } else { string shard = s.substr( 0 , i ); int opid = atoi( s.substr( i + 1 ).c_str() ); b.append( "shard" , shard ); b.append( "shardid" , opid ); log() << "want to kill op: " << e << endl; Shard s(shard); ScopedDbConnection conn( s ); conn->findOne( r.getns() , BSON( "op" << opid ) ); conn.done(); } } } else if ( strcmp( ns , "unlock" ) == 0 ) { b.append( "err" , "can't do unlock through mongos" ); } else { log( LL_WARNING ) << "unknown sys command [" << ns << "]" << endl; return false; } BSONObj x = b.done(); replyToQuery(0, r.p(), r.m(), x); return true; }
void Balancer::_doBalanceRound( DBClientBase& conn, vector<CandidateChunkPtr>* candidateChunks ) { verify( candidateChunks ); // // 1. Check whether there is any sharded collection to be balanced by querying // the ShardsNS::collections collection // auto_ptr<DBClientCursor> cursor = conn.query(CollectionType::ConfigNS, BSONObj()); if ( NULL == cursor.get() ) { warning() << "could not query " << CollectionType::ConfigNS << " while trying to balance" << endl; return; } vector< string > collections; while ( cursor->more() ) { BSONObj col = cursor->nextSafe(); // sharded collections will have a shard "key". if ( ! col[CollectionType::keyPattern()].eoo() && ! col[CollectionType::noBalance()].trueValue() ){ collections.push_back( col[CollectionType::ns()].String() ); } else if( col[CollectionType::noBalance()].trueValue() ){ LOG(1) << "not balancing collection " << col[CollectionType::ns()].String() << ", explicitly disabled" << endl; } } cursor.reset(); if ( collections.empty() ) { LOG(1) << "no collections to balance" << endl; return; } // // 2. Get a list of all the shards that are participating in this balance round // along with any maximum allowed quotas and current utilization. We get the // latter by issuing db.serverStatus() (mem.mapped) to all shards. // // TODO: skip unresponsive shards and mark information as stale. // vector<Shard> allShards; Shard::getAllShards( allShards ); if ( allShards.size() < 2) { LOG(1) << "can't balance without more active shards" << endl; return; } ShardInfoMap shardInfo; DistributionStatus::populateShardInfoMap(allShards, &shardInfo); OCCASIONALLY warnOnMultiVersion( shardInfo ); // // 3. For each collection, check if the balancing policy recommends moving anything around. // for (vector<string>::const_iterator it = collections.begin(); it != collections.end(); ++it ) { const string& ns = *it; OwnedPointerMap<string, OwnedPointerVector<ChunkType> > shardToChunksMap; cursor = conn.query(ChunkType::ConfigNS, QUERY(ChunkType::ns(ns)).sort(ChunkType::min())); set<BSONObj> allChunkMinimums; while ( cursor->more() ) { BSONObj chunkDoc = cursor->nextSafe().getOwned(); auto_ptr<ChunkType> chunk(new ChunkType()); string errmsg; if (!chunk->parseBSON(chunkDoc, &errmsg)) { error() << "bad chunk format for " << chunkDoc << ": " << errmsg << endl; return; } allChunkMinimums.insert(chunk->getMin().getOwned()); OwnedPointerVector<ChunkType>*& chunkList = shardToChunksMap.mutableMap()[chunk->getShard()]; if (chunkList == NULL) { chunkList = new OwnedPointerVector<ChunkType>(); } chunkList->mutableVector().push_back(chunk.release()); } cursor.reset(); if (shardToChunksMap.map().empty()) { LOG(1) << "skipping empty collection (" << ns << ")"; continue; } for ( vector<Shard>::iterator i=allShards.begin(); i!=allShards.end(); ++i ) { // this just makes sure there is an entry in shardToChunksMap for every shard Shard s = *i; OwnedPointerVector<ChunkType>*& chunkList = shardToChunksMap.mutableMap()[s.getName()]; if (chunkList == NULL) { chunkList = new OwnedPointerVector<ChunkType>(); } } DistributionStatus status(shardInfo, shardToChunksMap.map()); // load tags Status result = clusterCreateIndex(TagsType::ConfigNS, BSON(TagsType::ns() << 1 << TagsType::min() << 1), true, // unique WriteConcernOptions::AllConfigs, NULL); if ( !result.isOK() ) { warning() << "could not create index tags_1_min_1: " << result.reason() << endl; continue; } cursor = conn.query(TagsType::ConfigNS, QUERY(TagsType::ns(ns)).sort(TagsType::min())); vector<TagRange> ranges; while ( cursor->more() ) { BSONObj tag = cursor->nextSafe(); TagRange tr(tag[TagsType::min()].Obj().getOwned(), tag[TagsType::max()].Obj().getOwned(), tag[TagsType::tag()].String()); ranges.push_back(tr); uassert(16356, str::stream() << "tag ranges not valid for: " << ns, status.addTagRange(tr) ); } cursor.reset(); DBConfigPtr cfg = grid.getDBConfig( ns ); if ( !cfg ) { warning() << "could not load db config to balance " << ns << " collection" << endl; continue; } // This line reloads the chunk manager once if this process doesn't know the collection // is sharded yet. ChunkManagerPtr cm = cfg->getChunkManagerIfExists( ns, true ); if ( !cm ) { warning() << "could not load chunks to balance " << ns << " collection" << endl; continue; } // loop through tags to make sure no chunk spans tags; splits on tag min. for all chunks bool didAnySplits = false; for ( unsigned i = 0; i < ranges.size(); i++ ) { BSONObj min = ranges[i].min; min = cm->getShardKey().extendRangeBound( min, false ); if ( allChunkMinimums.count( min ) > 0 ) continue; didAnySplits = true; log() << "ns: " << ns << " need to split on " << min << " because there is a range there" << endl; ChunkPtr c = cm->findIntersectingChunk( min ); vector<BSONObj> splitPoints; splitPoints.push_back( min ); Status status = c->multiSplit(splitPoints, NULL); if ( !status.isOK() ) { error() << "split failed: " << status << endl; } else { LOG(1) << "split worked" << endl; } break; } if ( didAnySplits ) { // state change, just wait till next round continue; } CandidateChunk* p = _policy->balance( ns, status, _balancedLastTime ); if ( p ) candidateChunks->push_back( CandidateChunkPtr( p ) ); } }
bool handleSpecialNamespaces( Request& r , QueryMessage& q ) { const char * ns = strstr( r.getns() , ".$cmd.sys." ); if ( ! ns ) return false; ns += 10; BSONObjBuilder b; vector<Shard> shards; ClientBasic* client = ClientBasic::getCurrent(); AuthorizationSession* authSession = client->getAuthorizationSession(); if ( strcmp( ns , "inprog" ) == 0 ) { const bool isAuthorized = authSession->isAuthorizedForActionsOnResource( ResourcePattern::forClusterResource(), ActionType::inprog); audit::logInProgAuthzCheck( client, q.query, isAuthorized ? ErrorCodes::OK : ErrorCodes::Unauthorized); uassert(ErrorCodes::Unauthorized, "not authorized to run inprog", isAuthorized); Shard::getAllShards( shards ); BSONArrayBuilder arr( b.subarrayStart( "inprog" ) ); for ( unsigned i=0; i<shards.size(); i++ ) { Shard shard = shards[i]; ScopedDbConnection conn(shard.getConnString()); BSONObj temp = conn->findOne( r.getns() , q.query ); if ( temp["inprog"].isABSONObj() ) { BSONObjIterator i( temp["inprog"].Obj() ); while ( i.more() ) { BSONObjBuilder x; BSONObjIterator j( i.next().Obj() ); while( j.more() ) { BSONElement e = j.next(); if ( str::equals( e.fieldName() , "opid" ) ) { stringstream ss; ss << shard.getName() << ':' << e.numberInt(); x.append( "opid" , ss.str() ); } else if ( str::equals( e.fieldName() , "client" ) ) { x.appendAs( e , "client_s" ); } else { x.append( e ); } } arr.append( x.obj() ); } } conn.done(); } arr.done(); } else if ( strcmp( ns , "killop" ) == 0 ) { const bool isAuthorized = authSession->isAuthorizedForActionsOnResource( ResourcePattern::forClusterResource(), ActionType::killop); audit::logKillOpAuthzCheck( client, q.query, isAuthorized ? ErrorCodes::OK : ErrorCodes::Unauthorized); uassert(ErrorCodes::Unauthorized, "not authorized to run killop", isAuthorized); BSONElement e = q.query["op"]; if ( e.type() != String ) { b.append( "err" , "bad op" ); b.append( e ); } else { b.append( e ); string s = e.String(); string::size_type i = s.find( ':' ); if ( i == string::npos ) { b.append( "err" , "bad opid" ); } else { string shard = s.substr( 0 , i ); int opid = atoi( s.substr( i + 1 ).c_str() ); b.append( "shard" , shard ); b.append( "shardid" , opid ); log() << "want to kill op: " << e << endl; Shard s(shard); ScopedDbConnection conn(s.getConnString()); conn->findOne( r.getns() , BSON( "op" << opid ) ); conn.done(); } } } else if ( strcmp( ns , "unlock" ) == 0 ) { b.append( "err" , "can't do unlock through mongos" ); } else { warning() << "unknown sys command [" << ns << "]" << endl; return false; } BSONObj x = b.done(); replyToQuery(0, r.p(), r.m(), x); return true; }
void Balancer::_doBalanceRound( DBClientBase& conn, vector<CandidateChunkPtr>* candidateChunks ) { verify( candidateChunks ); // // 1. Check whether there is any sharded collection to be balanced by querying // the ShardsNS::collections collection // auto_ptr<DBClientCursor> cursor = conn.query(CollectionType::ConfigNS, BSONObj()); vector< string > collections; while ( cursor->more() ) { BSONObj col = cursor->nextSafe(); // sharded collections will have a shard "key". if ( ! col[CollectionType::keyPattern()].eoo() && ! col[CollectionType::noBalance()].trueValue() ){ collections.push_back( col[CollectionType::ns()].String() ); } else if( col[CollectionType::noBalance()].trueValue() ){ LOG(1) << "not balancing collection " << col[CollectionType::ns()].String() << ", explicitly disabled" << endl; } } cursor.reset(); if ( collections.empty() ) { LOG(1) << "no collections to balance" << endl; return; } // // 2. Get a list of all the shards that are participating in this balance round // along with any maximum allowed quotas and current utilization. We get the // latter by issuing db.serverStatus() (mem.mapped) to all shards. // // TODO: skip unresponsive shards and mark information as stale. // vector<Shard> allShards; Shard::getAllShards( allShards ); if ( allShards.size() < 2) { LOG(1) << "can't balance without more active shards" << endl; return; } ShardInfoMap shardInfo; for ( vector<Shard>::const_iterator it = allShards.begin(); it != allShards.end(); ++it ) { const Shard& s = *it; ShardStatus status = s.getStatus(); shardInfo[ s.getName() ] = ShardInfo( s.getMaxSize(), status.mapped(), s.isDraining(), status.hasOpsQueued(), s.tags() ); } // // 3. For each collection, check if the balancing policy recommends moving anything around. // for (vector<string>::const_iterator it = collections.begin(); it != collections.end(); ++it ) { const string& ns = *it; map< string,vector<BSONObj> > shardToChunksMap; cursor = conn.query(ChunkType::ConfigNS, QUERY(ChunkType::ns(ns)).sort(ChunkType::min())); set<BSONObj> allChunkMinimums; while ( cursor->more() ) { BSONObj chunk = cursor->nextSafe().getOwned(); vector<BSONObj>& chunks = shardToChunksMap[chunk[ChunkType::shard()].String()]; allChunkMinimums.insert( chunk[ChunkType::min()].Obj() ); chunks.push_back( chunk ); } cursor.reset(); if (shardToChunksMap.empty()) { LOG(1) << "skipping empty collection (" << ns << ")"; continue; } for ( vector<Shard>::iterator i=allShards.begin(); i!=allShards.end(); ++i ) { // this just makes sure there is an entry in shardToChunksMap for every shard Shard s = *i; shardToChunksMap[s.getName()].size(); } DistributionStatus status( shardInfo, shardToChunksMap ); // load tags conn.ensureIndex(TagsType::ConfigNS, BSON(TagsType::ns() << 1 << TagsType::min() << 1), true); cursor = conn.query(TagsType::ConfigNS, QUERY(TagsType::ns(ns)).sort(TagsType::min())); vector<TagRange> ranges; while ( cursor->more() ) { BSONObj tag = cursor->nextSafe(); TagRange tr(tag[TagsType::min()].Obj().getOwned(), tag[TagsType::max()].Obj().getOwned(), tag[TagsType::tag()].String()); ranges.push_back(tr); uassert(16356, str::stream() << "tag ranges not valid for: " << ns, status.addTagRange(tr) ); } cursor.reset(); DBConfigPtr cfg = grid.getDBConfig( ns ); verify( cfg ); ChunkManagerPtr cm = cfg->getChunkManager( ns ); verify( cm ); // loop through tags to make sure no chunk spans tags; splits on tag min. for all chunks bool didAnySplits = false; for ( unsigned i = 0; i < ranges.size(); i++ ) { BSONObj min = ranges[i].min; min = cm->getShardKey().extendRangeBound( min, false ); if ( allChunkMinimums.count( min ) > 0 ) continue; didAnySplits = true; log() << "ns: " << ns << " need to split on " << min << " because there is a range there" << endl; ChunkPtr c = cm->findIntersectingChunk( min ); vector<BSONObj> splitPoints; splitPoints.push_back( min ); BSONObj res; if ( !c->multiSplit( splitPoints, res ) ) { error() << "split failed: " << res << endl; } else { LOG(1) << "split worked: " << res << endl; } break; } if ( didAnySplits ) { // state change, just wait till next round continue; } CandidateChunk* p = _policy->balance( ns, status, _balancedLastTime ); if ( p ) candidateChunks->push_back( CandidateChunkPtr( p ) ); } }
void Balancer::_doBalanceRound( DBClientBase& conn, vector<CandidateChunkPtr>* candidateChunks ) { assert( candidateChunks ); // // 1. Check whether there is any sharded collection to be balanced by querying // the ShardsNS::collections collection // auto_ptr<DBClientCursor> cursor = conn.query( ShardNS::collection , BSONObj() ); vector< string > collections; while ( cursor->more() ) { BSONObj col = cursor->nextSafe(); // sharded collections will have a shard "key". if ( ! col["key"].eoo() ) collections.push_back( col["_id"].String() ); } cursor.reset(); if ( collections.empty() ) { LOG(1) << "no collections to balance" << endl; return; } // // 2. Get a list of all the shards that are participating in this balance round // along with any maximum allowed quotas and current utilization. We get the // latter by issuing db.serverStatus() (mem.mapped) to all shards. // // TODO: skip unresponsive shards and mark information as stale. // vector<Shard> allShards; Shard::getAllShards( allShards ); if ( allShards.size() < 2) { LOG(1) << "can't balance without more active shards" << endl; return; } map< string, BSONObj > shardLimitsMap; for ( vector<Shard>::const_iterator it = allShards.begin(); it != allShards.end(); ++it ) { const Shard& s = *it; ShardStatus status = s.getStatus(); BSONObj limitsObj = BSON( ShardFields::maxSize( s.getMaxSize() ) << LimitsFields::currSize( status.mapped() ) << ShardFields::draining( s.isDraining() ) << LimitsFields::hasOpsQueued( status.hasOpsQueued() ) ); shardLimitsMap[ s.getName() ] = limitsObj; } // // 3. For each collection, check if the balancing policy recommends moving anything around. // for (vector<string>::const_iterator it = collections.begin(); it != collections.end(); ++it ) { const string& ns = *it; map< string,vector<BSONObj> > shardToChunksMap; cursor = conn.query( ShardNS::chunk , QUERY( "ns" << ns ).sort( "min" ) ); while ( cursor->more() ) { BSONObj chunk = cursor->nextSafe(); if ( chunk["jumbo"].trueValue() ) continue; vector<BSONObj>& chunks = shardToChunksMap[chunk["shard"].String()]; chunks.push_back( chunk.getOwned() ); } cursor.reset(); if (shardToChunksMap.empty()) { LOG(1) << "skipping empty collection (" << ns << ")"; continue; } for ( vector<Shard>::iterator i=allShards.begin(); i!=allShards.end(); ++i ) { // this just makes sure there is an entry in shardToChunksMap for every shard Shard s = *i; shardToChunksMap[s.getName()].size(); } CandidateChunk* p = _policy->balance( ns , shardLimitsMap , shardToChunksMap , _balancedLastTime ); if ( p ) candidateChunks->push_back( CandidateChunkPtr( p ) ); } }