bool setShardVersion(DBClientBase& conn,
                         const string& ns,
                         const string& configServerPrimary,
                         ChunkVersion version,
                         ChunkManager* manager,
                         bool authoritative,
                         BSONObj& result) {

        BSONObjBuilder cmdBuilder;
        cmdBuilder.append("setShardVersion", ns);
        cmdBuilder.append("configdb", configServerPrimary);

        Shard s = Shard::make(conn.getServerAddress());
        cmdBuilder.append("shard", s.getName());
        cmdBuilder.append("shardHost", s.getConnString());

        if (ns.size() > 0) {
            version.addToBSON(cmdBuilder);
        }
        else {
            cmdBuilder.append("init", true);
        }

        if (authoritative) {
            cmdBuilder.appendBool("authoritative", 1);
        }

        BSONObj cmd = cmdBuilder.obj();

        LOG(1) << "    setShardVersion  " << s.getName() << " " << conn.getServerAddress()
               << "  " << ns << "  " << cmd
               << (manager ? string(str::stream() << " " << manager->getSequenceNumber()) : "");

        return conn.runCommand("admin", cmd, result, 0);
    }
Beispiel #2
0
    bool Chunk::moveAndCommit(const Shard& to,
                              long long chunkSize /* bytes */,
                              const WriteConcernOptions* writeConcern,
                              bool waitForDelete,
                              int maxTimeMS,
                              BSONObj& res) const {
        uassert( 10167 ,  "can't move shard to its current location!" , getShard() != to );

        log() << "moving chunk ns: " << _manager->getns() << " moving ( " << toString() << ") "
              << _shard.toString() << " -> " << to.toString();

        Shard from = _shard;
        ScopedDbConnection fromconn(from.getConnString());

        BSONObjBuilder builder;
        builder.append("moveChunk", _manager->getns());
        builder.append("from", from.getAddress().toString());
        builder.append("to", to.getAddress().toString());
        // NEEDED FOR 2.0 COMPATIBILITY
        builder.append("fromShard", from.getName());
        builder.append("toShard", to.getName());
        ///////////////////////////////
        builder.append("min", _min);
        builder.append("max", _max);
        builder.append("maxChunkSizeBytes", chunkSize);
        builder.append("shardId", genID());
        builder.append("configdb", configServer.modelServer());

        // For legacy secondary throttle setting.
        bool secondaryThrottle = true;
        if (writeConcern &&
                writeConcern->wNumNodes <= 1 &&
                writeConcern->wMode.empty()) {
            secondaryThrottle = false;
        }

        builder.append("secondaryThrottle", secondaryThrottle);

        if (secondaryThrottle && writeConcern) {
            builder.append("writeConcern", writeConcern->toBSON());
        }

        builder.append("waitForDelete", waitForDelete);
        builder.append(LiteParsedQuery::cmdOptionMaxTimeMS, maxTimeMS);
        builder.append("epoch", _manager->getVersion().epoch());

        bool worked = fromconn->runCommand("admin", builder.done(), res);
        fromconn.done();

        LOG( worked ? 1 : 0 ) << "moveChunk result: " << res;

        // if succeeded, needs to reload to pick up the new location
        // if failed, mongos may be stale
        // reload is excessive here as the failure could be simply because collection metadata is taken
        _manager->reload();

        return worked;
    }
Beispiel #3
0
 void getAllShards( vector<Shard>& all ) {
     scoped_lock lk( _mutex );
     std::set<string> seen;
     for ( map<string,Shard>::iterator i = _lookup.begin(); i!=_lookup.end(); ++i ) {
         Shard s = i->second;
         if ( s.getName() == "config" )
             continue;
         if ( seen.count( s.getName() ) )
             continue;
         seen.insert( s.getName() );
         all.push_back( s );
     }
 }
    Status ChunkManagerTargeter::targetShardKey(const BSONObj& doc,
                                                ShardEndpoint** endpoint) const {

        invariant(NULL != _manager);
        dassert(_manager->hasShardKey(doc));

        ChunkPtr chunk = _manager->findChunkForDoc(doc);

        Shard shard = chunk->getShard();
        *endpoint = new ShardEndpoint(shard.getName(),
                                      _manager->getVersion(StringData(shard.getName())));

        return Status::OK();
    }
    Status DBClientShardResolver::chooseWriteHost( const string& shardName,
                                                   ConnectionString* shardHost ) const {

        // Declare up here for parsing later
        string errMsg;

        // Special-case for config
        if (shardName == "config") {
            *shardHost = ConnectionString::parse( configServer.modelServer(), errMsg );
            dassert( errMsg == "" );
            return Status::OK();
        }

        //
        // First get the information about the shard from the shard cache
        //

        // Internally uses our shard cache, does no reload
        Shard shard = Shard::findIfExists( shardName );
        if ( shard.getName() == "" ) {
            return Status( ErrorCodes::ShardNotFound,
                           string("unknown shard name ") + shardName );
        }
        return findMaster(shard.getConnString().toString(), shardHost);
    }
Beispiel #6
0
    bool setShardVersion( DBClientBase & conn , const string& ns , ShardChunkVersion version , bool authoritative , BSONObj& result ){

        BSONObjBuilder cmdBuilder;
        cmdBuilder.append( "setShardVersion" , ns.c_str() );
        cmdBuilder.append( "configdb" , configServer.modelServer() );
        cmdBuilder.appendTimestamp( "version" , version );
        cmdBuilder.appendOID( "serverID" , &serverID );
        if ( authoritative )
            cmdBuilder.appendBool( "authoritative" , 1 );

        Shard s = Shard::make( conn.getServerAddress() );
        cmdBuilder.append( "shard" , s.getName() );
        cmdBuilder.append( "shardHost" , s.getConnString() );
        BSONObj cmd = cmdBuilder.obj();
        
        log(1) << "    setShardVersion  " << s.getName() << " " << conn.getServerAddress() << "  " << ns << "  " << cmd << " " << &conn << endl;
        
        return conn.runCommand( "admin" , cmd , result );
    }
Beispiel #7
0
 void remove( const string& name ) {
     scoped_lock lk( _mutex );
     for ( map<string,Shard>::iterator i = _lookup.begin(); i!=_lookup.end(); ) {
         Shard s = i->second;
         if ( s.getName() == name ) {
             _lookup.erase(i++);
         }
         else {
             ++i;
         }
     }
 }
Beispiel #8
0
            bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
                string target = cmdObj.firstElement().valuestrsafe();
                Shard s = Shard::make( target );
                if ( ! grid.knowAboutShard( s.getConnString() ) ) {
                    errmsg = "unknown shard";
                    return false;
                }

                ScopedDbConnection conn( configServer.getPrimary() );

                if (conn->count("config.shards", BSON("_id" << NE << s.getName() << ShardFields::draining(true)))){
                    conn.done();
                    errmsg = "Can't have more than one draining shard at a time";
                    return false;
                }

                if (conn->count("config.shards", BSON("_id" << NE << s.getName())) == 0){
                    conn.done();
                    errmsg = "Can't remove last shard";
                    return false;
                }

                BSONObj primaryDoc = BSON( "_id" << NE << "local" << "primary" << s.getName() );
                BSONObj dbInfo; // appended at end of result on success
                {
                    boost::scoped_ptr<DBClientCursor> cursor (conn->query("config.databases", primaryDoc));
                    if (cursor->more()) { // skip block and allocations if empty
                        BSONObjBuilder dbInfoBuilder;
                        dbInfoBuilder.append("note", "you need to drop or movePrimary these databases");
                        BSONArrayBuilder dbs(dbInfoBuilder.subarrayStart("dbsToMove"));

                        while (cursor->more()){
                            BSONObj db = cursor->nextSafe();
                            dbs.append(db["_id"]);
                        }
                        dbs.doneFast();

                        dbInfo = dbInfoBuilder.obj();
                    }
                }

                // If the server is not yet draining chunks, put it in draining mode.
                BSONObj searchDoc = BSON( "_id" << s.getName() );
                BSONObj drainingDoc = BSON( "_id" << s.getName() << ShardFields::draining(true) );
                BSONObj shardDoc = conn->findOne( "config.shards", drainingDoc );
                if ( shardDoc.isEmpty() ) {

                    // TODO prevent move chunks to this shard.

                    log() << "going to start draining shard: " << s.getName() << endl;
                    BSONObj newStatus = BSON( "$set" << BSON( ShardFields::draining(true) ) );
                    conn->update( "config.shards" , searchDoc , newStatus, false /* do no upsert */);

                    errmsg = conn->getLastError();
                    if ( errmsg.size() ) {
                        log() << "error starting remove shard: " << s.getName() << " err: " << errmsg << endl;
                        return false;
                    }

                    BSONObj primaryLocalDoc = BSON("_id" << "local" <<  "primary" << s.getName() );
                    PRINT(primaryLocalDoc);
                    if (conn->count("config.databases", primaryLocalDoc)) {
                        log() << "This shard is listed as primary of local db. Removing entry." << endl;
                        conn->remove("config.databases", BSON("_id" << "local"));
                        errmsg = conn->getLastError();
                        if ( errmsg.size() ) {
                            log() << "error removing local db: " << errmsg << endl;
                            return false;
                        }
                    }

                    Shard::reloadShardInfo();

                    result.append( "msg"   , "draining started successfully" );
                    result.append( "state" , "started" );
                    result.append( "shard" , s.getName() );
                    result.appendElements(dbInfo);
                    conn.done();
                    return true;
                }

                // If the server has been completely drained, remove it from the ConfigDB.
                // Check not only for chunks but also databases.
                BSONObj shardIDDoc = BSON( "shard" << shardDoc[ "_id" ].str() );
                long long chunkCount = conn->count( "config.chunks" , shardIDDoc );
                long long dbCount = conn->count( "config.databases" , primaryDoc );
                if ( ( chunkCount == 0 ) && ( dbCount == 0 ) ) {
                    log() << "going to remove shard: " << s.getName() << endl;
                    conn->remove( "config.shards" , searchDoc );

                    errmsg = conn->getLastError();
                    if ( errmsg.size() ) {
                        log() << "error concluding remove shard: " << s.getName() << " err: " << errmsg << endl;
                        return false;
                    }

                    Shard::removeShard( shardDoc[ "_id" ].str() );
                    Shard::reloadShardInfo();

                    result.append( "msg"   , "removeshard completed successfully" );
                    result.append( "state" , "completed" );
                    result.append( "shard" , s.getName() );
                    conn.done();
                    return true;
                }

                // If the server is already in draining mode, just report on its progress.
                // Report on databases (not just chunks) that are left too.
                result.append( "msg"  , "draining ongoing" );
                result.append( "state" , "ongoing" );
                BSONObjBuilder inner;
                inner.append( "chunks" , chunkCount );
                inner.append( "dbs" , dbCount );
                result.append( "remaining" , inner.obj() );
                result.appendElements(dbInfo);

                conn.done();
                return true;
            }
Beispiel #9
0
            bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
                string target = cmdObj.firstElement().valuestrsafe();
                Shard s = Shard::make( target );
                if ( ! grid.knowAboutShard( s.getConnString() ) ) {
                    errmsg = "unknown shard";
                    return false;
                }

                ScopedDbConnection conn( configServer.getPrimary() );

                // If the server is not yet draining chunks, put it in draining mode.
                BSONObj searchDoc = BSON( "_id" << s.getName() );
                BSONObj drainingDoc = BSON( "_id" << s.getName() << ShardFields::draining(true) );
                BSONObj shardDoc = conn->findOne( "config.shards", drainingDoc );
                if ( shardDoc.isEmpty() ) {

                    // TODO prevent move chunks to this shard.

                    log() << "going to start draining shard: " << s.getName() << endl;
                    BSONObj newStatus = BSON( "$set" << BSON( ShardFields::draining(true) ) );
                    conn->update( "config.shards" , searchDoc , newStatus, false /* do no upsert */);

                    errmsg = conn->getLastError();
                    if ( errmsg.size() ) {
                        log() << "error starting remove shard: " << s.getName() << " err: " << errmsg << endl;
                        return false;
                    }

                    Shard::reloadShardInfo();

                    result.append( "msg"   , "draining started successfully" );
                    result.append( "state" , "started" );
                    result.append( "shard" , s.getName() );
                    conn.done();
                    return true;
                }

                // If the server has been completely drained, remove it from the ConfigDB.
                // Check not only for chunks but also databases.
                BSONObj shardIDDoc = BSON( "shard" << shardDoc[ "_id" ].str() );
                long long chunkCount = conn->count( "config.chunks" , shardIDDoc );
                BSONObj primaryDoc = BSON( "primary" << shardDoc[ "_id" ].str() );
                long long dbCount = conn->count( "config.databases" , primaryDoc );
                if ( ( chunkCount == 0 ) && ( dbCount == 0 ) ) {
                    log() << "going to remove shard: " << s.getName() << endl;
                    conn->remove( "config.shards" , searchDoc );

                    errmsg = conn->getLastError();
                    if ( errmsg.size() ) {
                        log() << "error concluding remove shard: " << s.getName() << " err: " << errmsg << endl;
                        return false;
                    }

                    Shard::removeShard( shardDoc[ "_id" ].str() );
                    Shard::reloadShardInfo();

                    result.append( "msg"   , "removeshard completed successfully" );
                    result.append( "state" , "completed" );
                    result.append( "shard" , s.getName() );
                    conn.done();
                    return true;
                }

                // If the server is already in draining mode, just report on its progress.
                // Report on databases (not just chunks) that are left too.
                result.append( "msg"  , "draining ongoing" );
                result.append( "state" , "ongoing" );
                BSONObjBuilder inner;
                inner.append( "chunks" , chunkCount );
                inner.append( "dbs" , dbCount );
                result.append( "remaining" , inner.obj() );

                conn.done();
                return true;
            }
Beispiel #10
0
    /**
     * @return true if had to do something
     */
    bool checkShardVersion( DBClientBase * conn_in , const string& ns , ChunkManagerPtr refManager, bool authoritative , int tryNumber ) {
        // TODO: cache, optimize, etc...

        WriteBackListener::init( *conn_in );

        DBConfigPtr conf = grid.getDBConfig( ns );
        if ( ! conf )
            return false;

        DBClientBase* conn = getVersionable( conn_in );
        verify(conn); // errors thrown above

        unsigned long long officialSequenceNumber = 0;

        ChunkManagerPtr manager;
        const bool isSharded = conf->isSharded( ns );
        if ( isSharded ) {
            manager = conf->getChunkManagerIfExists( ns , authoritative );
            // It's possible the chunk manager was reset since we checked whether sharded was true,
            // so must check this here.
            if( manager ) officialSequenceNumber = manager->getSequenceNumber();
        }

        // Check this manager against the reference manager
        if( isSharded && manager ){

            Shard shard = Shard::make( conn->getServerAddress() );
            if( refManager && ! refManager->compatibleWith( manager, shard ) ){
                throw SendStaleConfigException( ns, str::stream() << "manager (" << manager->getVersion( shard ).toString()  << " : " << manager->getSequenceNumber() << ") "
                                                                      << "not compatible with reference manager (" << refManager->getVersion( shard ).toString()  << " : " << refManager->getSequenceNumber() << ") "
                                                                      << "on shard " << shard.getName() << " (" << shard.getAddress().toString() << ")",
                                                refManager->getVersion( shard ), manager->getVersion( shard ) );
            }
        }
        else if( refManager ){
            Shard shard = Shard::make( conn->getServerAddress() );
            string msg( str::stream() << "not sharded ("
                        << ( (manager.get() == 0) ? string( "<none>" ) :
                                str::stream() << manager->getSequenceNumber() )
                        << ") but has reference manager ("
                        << refManager->getSequenceNumber() << ") "
                        << "on conn " << conn->getServerAddress() << " ("
                        << conn_in->getServerAddress() << ")" );

            throw SendStaleConfigException( ns, msg,
                    refManager->getVersion( shard ), ShardChunkVersion( 0, OID() ));
        }

        // has the ChunkManager been reloaded since the last time we updated the connection-level version?
        // (ie., last time we issued the setShardVersions below)
        unsigned long long sequenceNumber = connectionShardStatus.getSequence(conn,ns);
        if ( sequenceNumber == officialSequenceNumber ) {
            return false;
        }


        ShardChunkVersion version = ShardChunkVersion( 0, OID() );
        if ( isSharded && manager ) {
            version = manager->getVersion( Shard::make( conn->getServerAddress() ) );
        }

        if( ! version.isSet() ){
            LOG(0) << "resetting shard version of " << ns << " on " << conn->getServerAddress() << ", " <<
                      ( ! isSharded ? "no longer sharded" :
                      ( ! manager ? "no chunk manager found" :
                                    "version is zero" ) ) << endl;
        }

        LOG(2) << " have to set shard version for conn: " << conn->getServerAddress() << " ns:" << ns
               << " my last seq: " << sequenceNumber << "  current: " << officialSequenceNumber
               << " version: " << version << " manager: " << manager.get()
               << endl;

        const string versionableServerAddress(conn->getServerAddress());

        BSONObj result;
        if ( setShardVersion( *conn , ns , version , authoritative , result ) ) {
            // success!
            LOG(1) << "      setShardVersion success: " << result << endl;
            connectionShardStatus.setSequence( conn , ns , officialSequenceNumber );
            return true;
        }

        LOG(1) << "       setShardVersion failed!\n" << result << endl;

        if ( result["need_authoritative"].trueValue() )
            massert( 10428 ,  "need_authoritative set but in authoritative mode already" , ! authoritative );

        if ( ! authoritative ) {
            // use the original connection and get a fresh versionable connection
            // since conn can be invalidated (or worse, freed) after the failure
            checkShardVersion(conn_in, ns, refManager, 1, tryNumber + 1);
            return true;
        }
        
        if ( result["reloadConfig"].trueValue() ) {
            if( result["version"].timestampTime() == 0 ){

                warning() << "reloading full configuration for " << conf->getName()
                          << ", connection state indicates significant version changes" << endl;

                // reload db
                conf->reload();
            }
            else {
                // reload config
                conf->getChunkManager( ns , true );
            }
        }

        const int maxNumTries = 7;
        if ( tryNumber < maxNumTries ) {
            LOG( tryNumber < ( maxNumTries / 2 ) ? 1 : 0 ) 
                << "going to retry checkShardVersion host: " << versionableServerAddress << " " << result << endl;
            sleepmillis( 10 * tryNumber );
            // use the original connection and get a fresh versionable connection
            // since conn can be invalidated (or worse, freed) after the failure
            checkShardVersion(conn_in, ns, refManager, true, tryNumber + 1);
            return true;
        }
        
        string errmsg = str::stream() << "setShardVersion failed host: " << versionableServerAddress << " " << result;
        log() << "     " << errmsg << endl;
        massert( 10429 , errmsg , 0 );
        return true;
    }
Beispiel #11
0
    void Balancer::_doBalanceRound( DBClientBase& conn, vector<CandidateChunkPtr>* candidateChunks ) {
        verify( candidateChunks );

        //
        // 1. Check whether there is any sharded collection to be balanced by querying
        // the ShardsNS::collections collection
        //

        auto_ptr<DBClientCursor> cursor = conn.query( ShardNS::collection , BSONObj() );
        vector< string > collections;
        while ( cursor->more() ) {
            BSONObj col = cursor->nextSafe();

            // sharded collections will have a shard "key".
            if ( ! col["key"].eoo() && ! col["noBalance"].trueValue() ){
                collections.push_back( col["_id"].String() );
            }
            else if( col["noBalance"].trueValue() ){
                LOG(1) << "not balancing collection " << col["_id"].String() << ", explicitly disabled" << endl;
            }

        }
        cursor.reset();

        if ( collections.empty() ) {
            LOG(1) << "no collections to balance" << endl;
            return;
        }

        //
        // 2. Get a list of all the shards that are participating in this balance round
        // along with any maximum allowed quotas and current utilization. We get the
        // latter by issuing db.serverStatus() (mem.mapped) to all shards.
        //
        // TODO: skip unresponsive shards and mark information as stale.
        //

        vector<Shard> allShards;
        Shard::getAllShards( allShards );
        if ( allShards.size() < 2) {
            LOG(1) << "can't balance without more active shards" << endl;
            return;
        }
        
        ShardInfoMap shardInfo;
        for ( vector<Shard>::const_iterator it = allShards.begin(); it != allShards.end(); ++it ) {
            const Shard& s = *it;
            ShardStatus status = s.getStatus();
            shardInfo[ s.getName() ] = ShardInfo( s.getMaxSize(),
                                                  status.mapped(),
                                                  s.isDraining(),
                                                  status.hasOpsQueued(),
                                                  s.tags()
                                                  );
        }

        //
        // 3. For each collection, check if the balancing policy recommends moving anything around.
        //

        for (vector<string>::const_iterator it = collections.begin(); it != collections.end(); ++it ) {
            const string& ns = *it;

            map< string,vector<BSONObj> > shardToChunksMap;
            cursor = conn.query( ShardNS::chunk , QUERY( "ns" << ns ).sort( "min" ) );
            while ( cursor->more() ) {
                BSONObj chunk = cursor->nextSafe();
                if ( chunk["jumbo"].trueValue() )
                    continue;
                vector<BSONObj>& chunks = shardToChunksMap[chunk["shard"].String()];
                chunks.push_back( chunk.getOwned() );
            }
            cursor.reset();

            if (shardToChunksMap.empty()) {
                LOG(1) << "skipping empty collection (" << ns << ")";
                continue;
            }
            
            for ( vector<Shard>::iterator i=allShards.begin(); i!=allShards.end(); ++i ) {
                // this just makes sure there is an entry in shardToChunksMap for every shard
                Shard s = *i;
                shardToChunksMap[s.getName()].size();
            }

            DistributionStatus status( shardInfo, shardToChunksMap );
            
            // load tags
            conn.ensureIndex( ShardNS::tags, BSON( "ns" << 1 << "min" << 1 ), true );
            cursor = conn.query( ShardNS::tags , QUERY( "ns" << ns ).sort( "min" ) );
            while ( cursor->more() ) {
                BSONObj tag = cursor->nextSafe();
                uassert( 16356 , str::stream() << "tag ranges not valid for: " << ns ,
                         status.addTagRange( TagRange( tag["min"].Obj().getOwned(), 
                                                       tag["max"].Obj().getOwned(), 
                                                       tag["tag"].String() ) ) );
                    
            }
            cursor.reset();
            
            CandidateChunk* p = _policy->balance( ns, status, _balancedLastTime );
            if ( p ) candidateChunks->push_back( CandidateChunkPtr( p ) );
        }
    }
Beispiel #12
0
    void Balancer::_doBalanceRound( DBClientBase& conn, vector<CandidateChunkPtr>* candidateChunks ){
        assert( candidateChunks );

        //
        // 1. Check whether there is any sharded collection to be balanced by querying
        // the ShardsNS::database collection
        //
        // { "_id" : "test", "partitioned" : true, "primary" : "shard0",
        //   "sharded" : {
        //       "test.images" : { "key" : { "_id" : 1 }, "unique" : false },
        //       ...  
        //   }
        // }
        //

        auto_ptr<DBClientCursor> cursor = conn.query( ShardNS::database , BSON( "partitioned" << true ) );
        vector< string > collections;
        while ( cursor->more() ){
            BSONObj db = cursor->next();

            // A database may be partitioned but not yet have a sharded collection. 
            // 'cursor' will point to docs that do not contain the "sharded" key. Since 
            // there'd be nothing to balance, we want to skip those here.

            BSONElement shardedColls = db["sharded"];
            if ( shardedColls.eoo() ){
                log(2) << "balancer: skipping database with no sharded collection (" 
                      << db["_id"].str() << ")" << endl;
                continue;
            }
            
            BSONObjIterator i( shardedColls.Obj() );
            while ( i.more() ){
                BSONElement e = i.next();
                collections.push_back( e.fieldName() );
            }
        }
        cursor.reset();

        if ( collections.empty() ) {
            log(1) << "balancer: no collections to balance" << endl;
            return;
        }

        //
        // 2. Get a list of all the shards that are participating in this balance round
        // along with any maximum allowed quotas and current utilization. We get the
        // latter by issuing db.serverStatus() (mem.mapped) to all shards.
        //
        // TODO: skip unresponsive shards and mark information as stale.
        //
 
        vector<Shard> allShards;
        Shard::getAllShards( allShards );
        if ( allShards.size() < 2) {
            log(1) << "balancer: can't balance without more active shards" << endl;
            return;
        }

        map< string, BSONObj > shardLimitsMap; 
        for ( vector<Shard>::const_iterator it = allShards.begin(); it != allShards.end(); ++it ){
            const Shard& s = *it;
            ShardStatus status = s.getStatus();

            BSONObj limitsObj = BSON( ShardFields::maxSize( s.getMaxSize() ) << 
                                      ShardFields::currSize( status.mapped() ) <<
                                      ShardFields::draining( s.isDraining()) );

            shardLimitsMap[ s.getName() ] = limitsObj;
        }

        //
        // 3. For each collection, check if the balancing policy recommends moving anything around.
        //

        for (vector<string>::const_iterator it = collections.begin(); it != collections.end(); ++it ) {
            const string& ns = *it;

            map< string,vector<BSONObj> > shardToChunksMap;
            cursor = conn.query( ShardNS::chunk , QUERY( "ns" << ns ).sort( "min" ) );
            while ( cursor->more() ){
                BSONObj chunk = cursor->next();
                vector<BSONObj>& chunks = shardToChunksMap[chunk["shard"].String()];
                chunks.push_back( chunk.getOwned() );
            }
            cursor.reset();

            if (shardToChunksMap.empty()) {
                log(1) << "balancer: skipping empty collection (" << ns << ")";
                continue;
            }
                
            for ( vector<Shard>::iterator i=allShards.begin(); i!=allShards.end(); ++i ){
                // this just makes sure there is an entry in shardToChunksMap for every shard
                Shard s = *i;
                shardToChunksMap[s.getName()].size();
            }

            CandidateChunk* p = _policy->balance( ns , shardLimitsMap , shardToChunksMap , _balancedLastTime );
            if ( p ) candidateChunks->push_back( CandidateChunkPtr( p ) );
        }
    }
Beispiel #13
0
    /**
     * Updates the remote cached version on the remote shard host (primary, in the case of replica
     * sets) if needed with a fully-qualified shard version for the given namespace:
     *   config server(s) + shard name + shard version
     *
     * If no remote cached version has ever been set, an initial shard version is sent.
     *
     * If the namespace is empty and no version has ever been sent, the config server + shard name
     * is sent to the remote shard host to initialize the connection as coming from mongos.
     * NOTE: This initialization is *best-effort only*.  Operations which wish to correctly version
     * must send the namespace.
     *
     * Config servers are special and are not (unless otherwise a shard) kept up to date with this
     * protocol.  This is safe so long as config servers only contain unversioned collections.
     *
     * It is an error to call checkShardVersion with an unversionable connection (isVersionableCB).
     *
     * @return true if we contacted the remote host
     */
    bool checkShardVersion( DBClientBase * conn_in , const string& ns , ChunkManagerPtr refManager, bool authoritative , int tryNumber ) {
        // TODO: cache, optimize, etc...

        // Empty namespaces are special - we require initialization but not versioning
        if (ns.size() == 0) {
            return initShardVersionEmptyNS(conn_in);
        }

        DBConfigPtr conf = grid.getDBConfig( ns );
        if ( ! conf )
            return false;

        DBClientBase* conn = getVersionable( conn_in );
        verify(conn); // errors thrown above

        unsigned long long officialSequenceNumber = 0;

        ShardPtr primary;
        ChunkManagerPtr manager;
        if (authoritative)
            conf->getChunkManagerIfExists(ns, true);

        conf->getChunkManagerOrPrimary(ns, manager, primary);

        if (manager)
            officialSequenceNumber = manager->getSequenceNumber();

        // Check this manager against the reference manager
        if( manager ){

            Shard shard = Shard::make( conn->getServerAddress() );
            if (refManager && !refManager->compatibleWith(*manager, shard.getName())) {
                const ChunkVersion refVersion(refManager->getVersion(shard.getName()));
                const ChunkVersion currentVersion(manager->getVersion(shard.getName()));
                string msg(str::stream() << "manager ("
                        << currentVersion.toString()
                        << " : " << manager->getSequenceNumber() << ") "
                        << "not compatible with reference manager ("
                        << refVersion.toString()
                        << " : " << refManager->getSequenceNumber() << ") "
                        << "on shard " << shard.getName()
                        << " (" << shard.getAddress().toString() << ")");

                throw SendStaleConfigException(ns,
                                               msg,
                                               refVersion,
                                               currentVersion);
            }
        }
        else if( refManager ){

            Shard shard = Shard::make(conn->getServerAddress());
            string msg( str::stream() << "not sharded ("
                        << ( (manager.get() == 0) ? string( "<none>" ) :
                                str::stream() << manager->getSequenceNumber() )
                        << ") but has reference manager ("
                        << refManager->getSequenceNumber() << ") "
                        << "on conn " << conn->getServerAddress() << " ("
                        << conn_in->getServerAddress() << ")" );

            throw SendStaleConfigException(ns,
                                           msg,
                                           refManager->getVersion(shard.getName()),
                                           ChunkVersion::UNSHARDED());
        }

        // Do not send setShardVersion to collections on the config servers - this causes problems
        // when config servers are also shards and get SSV with conflicting names.
        // TODO: Make config servers regular shards
        if (primary && primary->getName() == "config") {
            return false;
        }

        // Has the ChunkManager been reloaded since the last time we updated the shard version over
        // this connection?  If we've never updated the shard version, do so now.
        unsigned long long sequenceNumber = 0;
        if (connectionShardStatus.getSequence(conn, ns, &sequenceNumber)) {
            if (sequenceNumber == officialSequenceNumber) {
                return false;
            }
        }

        // Now that we're sure we're sending SSV and not to a single config server, get the shard
        Shard shard = Shard::make(conn->getServerAddress());

        ChunkVersion version = ChunkVersion(0, 0, OID());
        if (manager)
            version = manager->getVersion(shard.getName());

        LOG(1) << "setting shard version of " << version << " for " << ns << " on shard "
               << shard.toString();

        LOG(3) << "last version sent with chunk manager iteration " << sequenceNumber
               << ", current chunk manager iteration is " << officialSequenceNumber;

        BSONObj result;
        if (setShardVersion(*conn,
                            ns,
                            configServer.modelServer(),
                            version,
                            manager.get(),
                            authoritative,
                            result)) {

            LOG(1) << "      setShardVersion success: " << result;
            connectionShardStatus.setSequence( conn , ns , officialSequenceNumber );
            return true;
        }

        LOG(1) << "       setShardVersion failed!\n" << result << endl;

        if ( result["need_authoritative"].trueValue() )
            massert( 10428 ,  "need_authoritative set but in authoritative mode already" , ! authoritative );

        if ( ! authoritative ) {
            // use the original connection and get a fresh versionable connection
            // since conn can be invalidated (or worse, freed) after the failure
            checkShardVersion(conn_in, ns, refManager, 1, tryNumber + 1);
            return true;
        }
        
        if ( result["reloadConfig"].trueValue() ) {
            if( result["version"].timestampTime() == 0 ){

                warning() << "reloading full configuration for " << conf->name()
                          << ", connection state indicates significant version changes";

                // reload db
                conf->reload();
            }
            else {
                // reload config
                conf->getChunkManager( ns , true );
            }
        }

        const int maxNumTries = 7;
        if ( tryNumber < maxNumTries ) {
            LOG( tryNumber < ( maxNumTries / 2 ) ? 1 : 0 ) 
                << "going to retry checkShardVersion shard: " << shard.toString() << " " << result;
            sleepmillis( 10 * tryNumber );
            // use the original connection and get a fresh versionable connection
            // since conn can be invalidated (or worse, freed) after the failure
            checkShardVersion(conn_in, ns, refManager, true, tryNumber + 1);
            return true;
        }
        
        string errmsg = str::stream() << "setShardVersion failed shard: " << shard.toString()
                                          << " " << result;
        log() << "     " << errmsg << endl;
        massert( 10429 , errmsg , 0 );
        return true;
    }
    Status DBClientShardResolver::chooseWriteHost( const string& shardName,
                                                   ConnectionString* shardHost ) const {

        // Declare up here for parsing later
        string errMsg;

        // Special-case for config and admin
        if ( shardName == "config" || shardName == "admin" ) {
            *shardHost = ConnectionString::parse( configServer.modelServer(), errMsg );
            dassert( errMsg == "" );
            return Status::OK();
        }

        //
        // First get the information about the shard from the shard cache
        //

        // Internally uses our shard cache, does no reload
        Shard shard = Shard::findIfExists( shardName );
        if ( shard.getName() == "" ) {
            return Status( ErrorCodes::ShardNotFound,
                           string("unknown shard name ") + shardName );
        }

        ConnectionString rawShardHost = ConnectionString::parse( shard.getConnString(), errMsg );
        dassert( errMsg == "" );
        dassert( rawShardHost.type() == ConnectionString::SET
                 || rawShardHost.type() == ConnectionString::MASTER );

        if ( rawShardHost.type() == ConnectionString::MASTER ) {
            *shardHost = rawShardHost;
            return Status::OK();
        }

        //
        // If we need to, then get the particular node we're targeting in the replica set
        //

        // Does not reload the monitor if it doesn't currently exist
        ReplicaSetMonitorPtr replMonitor = ReplicaSetMonitor::get( rawShardHost.getSetName(),
                                                                   false );
        if ( !replMonitor ) {
            return Status( ErrorCodes::ReplicaSetNotFound,
                           string("unknown replica set ") + rawShardHost.getSetName() );
        }

        try {
            // This can throw when we don't find a master!
            HostAndPort masterHostAndPort = replMonitor->getMaster();
            *shardHost = ConnectionString::parse( masterHostAndPort.toString( true ), errMsg );
            dassert( errMsg == "" );
            return Status::OK();
        }
        catch ( const DBException& ) {
            return Status( ErrorCodes::HostNotFound,
                           string("could not contact primary for replica set ")
                           + replMonitor->getName() );
        }

        // Unreachable
        dassert( false );
        return Status( ErrorCodes::UnknownError, "" );
    }
Beispiel #15
0
        bool handleSpecialNamespaces( Request& r , QueryMessage& q ) {
            const char * ns = r.getns();
            ns = strstr( r.getns() , ".$cmd.sys." );
            if ( ! ns )
                return false;
            ns += 10;

            r.checkAuth( Auth::WRITE );

            BSONObjBuilder b;
            vector<Shard> shards;

            if ( strcmp( ns , "inprog" ) == 0 ) {
                Shard::getAllShards( shards );

                BSONArrayBuilder arr( b.subarrayStart( "inprog" ) );

                for ( unsigned i=0; i<shards.size(); i++ ) {
                    Shard shard = shards[i];
                    ScopedDbConnection conn( shard );
                    BSONObj temp = conn->findOne( r.getns() , BSONObj() );
                    if ( temp["inprog"].isABSONObj() ) {
                        BSONObjIterator i( temp["inprog"].Obj() );
                        while ( i.more() ) {
                            BSONObjBuilder x;

                            BSONObjIterator j( i.next().Obj() );
                            while( j.more() ) {
                                BSONElement e = j.next();
                                if ( str::equals( e.fieldName() , "opid" ) ) {
                                    stringstream ss;
                                    ss << shard.getName() << ':' << e.numberInt();
                                    x.append( "opid" , ss.str() );
                                }
                                else if ( str::equals( e.fieldName() , "client" ) ) {
                                    x.appendAs( e , "client_s" );
                                }
                                else {
                                    x.append( e );
                                }
                            }
                            arr.append( x.obj() );
                        }
                    }
                    conn.done();
                }

                arr.done();
            }
            else if ( strcmp( ns , "killop" ) == 0 ) {
                BSONElement e = q.query["op"];
                if ( strstr( r.getns() , "admin." ) == 0 ) {
                    b.append( "err" , "unauthorized" );
                }
                else if ( e.type() != String ) {
                    b.append( "err" , "bad op" );
                    b.append( e );
                }
                else {
                    b.append( e );
                    string s = e.String();
                    string::size_type i = s.find( ':' );
                    if ( i == string::npos ) {
                        b.append( "err" , "bad opid" );
                    }
                    else {
                        string shard = s.substr( 0 , i );
                        int opid = atoi( s.substr( i + 1 ).c_str() );
                        b.append( "shard" , shard );
                        b.append( "shardid" , opid );

                        log() << "want to kill op: " << e << endl;
                        Shard s(shard);

                        ScopedDbConnection conn( s );
                        conn->findOne( r.getns() , BSON( "op" << opid ) );
                        conn.done();
                    }
                }
            }
            else if ( strcmp( ns , "unlock" ) == 0 ) {
                b.append( "err" , "can't do unlock through mongos" );
            }
            else {
                log( LL_WARNING ) << "unknown sys command [" << ns << "]" << endl;
                return false;
            }

            BSONObj x = b.done();
            replyToQuery(0, r.p(), r.m(), x);
            return true;
        }
Beispiel #16
0
    void Balancer::_doBalanceRound( DBClientBase& conn, vector<CandidateChunkPtr>* candidateChunks ) {
        verify( candidateChunks );

        //
        // 1. Check whether there is any sharded collection to be balanced by querying
        // the ShardsNS::collections collection
        //

        auto_ptr<DBClientCursor> cursor = conn.query(CollectionType::ConfigNS, BSONObj());

        if ( NULL == cursor.get() ) {
            warning() << "could not query " << CollectionType::ConfigNS
                      << " while trying to balance" << endl;
            return;
        }

        vector< string > collections;
        while ( cursor->more() ) {
            BSONObj col = cursor->nextSafe();

            // sharded collections will have a shard "key".
            if ( ! col[CollectionType::keyPattern()].eoo() &&
                 ! col[CollectionType::noBalance()].trueValue() ){
                collections.push_back( col[CollectionType::ns()].String() );
            }
            else if( col[CollectionType::noBalance()].trueValue() ){
                LOG(1) << "not balancing collection " << col[CollectionType::ns()].String()
                       << ", explicitly disabled" << endl;
            }

        }
        cursor.reset();

        if ( collections.empty() ) {
            LOG(1) << "no collections to balance" << endl;
            return;
        }

        //
        // 2. Get a list of all the shards that are participating in this balance round
        // along with any maximum allowed quotas and current utilization. We get the
        // latter by issuing db.serverStatus() (mem.mapped) to all shards.
        //
        // TODO: skip unresponsive shards and mark information as stale.
        //

        vector<Shard> allShards;
        Shard::getAllShards( allShards );
        if ( allShards.size() < 2) {
            LOG(1) << "can't balance without more active shards" << endl;
            return;
        }
        
        ShardInfoMap shardInfo;
        DistributionStatus::populateShardInfoMap(allShards, &shardInfo);

        OCCASIONALLY warnOnMultiVersion( shardInfo );

        //
        // 3. For each collection, check if the balancing policy recommends moving anything around.
        //

        for (vector<string>::const_iterator it = collections.begin(); it != collections.end(); ++it ) {
            const string& ns = *it;

            OwnedPointerMap<string, OwnedPointerVector<ChunkType> > shardToChunksMap;
            cursor = conn.query(ChunkType::ConfigNS,
                                QUERY(ChunkType::ns(ns)).sort(ChunkType::min()));

            set<BSONObj> allChunkMinimums;

            while ( cursor->more() ) {
                BSONObj chunkDoc = cursor->nextSafe().getOwned();

                auto_ptr<ChunkType> chunk(new ChunkType());
                string errmsg;
                if (!chunk->parseBSON(chunkDoc, &errmsg)) {
                    error() << "bad chunk format for " << chunkDoc
                            << ": " << errmsg << endl;
                    return;
                }

                allChunkMinimums.insert(chunk->getMin().getOwned());
                OwnedPointerVector<ChunkType>*& chunkList =
                        shardToChunksMap.mutableMap()[chunk->getShard()];

                if (chunkList == NULL) {
                    chunkList = new OwnedPointerVector<ChunkType>();
                }

                chunkList->mutableVector().push_back(chunk.release());
            }
            cursor.reset();

            if (shardToChunksMap.map().empty()) {
                LOG(1) << "skipping empty collection (" << ns << ")";
                continue;
            }

            for ( vector<Shard>::iterator i=allShards.begin(); i!=allShards.end(); ++i ) {
                // this just makes sure there is an entry in shardToChunksMap for every shard
                Shard s = *i;
                OwnedPointerVector<ChunkType>*& chunkList =
                        shardToChunksMap.mutableMap()[s.getName()];

                if (chunkList == NULL) {
                    chunkList = new OwnedPointerVector<ChunkType>();
                }
            }

            DistributionStatus status(shardInfo, shardToChunksMap.map());

            // load tags
            Status result = clusterCreateIndex(TagsType::ConfigNS,
                                               BSON(TagsType::ns() << 1 << TagsType::min() << 1),
                                               true, // unique
                                               WriteConcernOptions::AllConfigs,
                                               NULL);

            if ( !result.isOK() ) {
                warning() << "could not create index tags_1_min_1: " << result.reason() << endl;
                continue;
            }

            cursor = conn.query(TagsType::ConfigNS,
                                QUERY(TagsType::ns(ns)).sort(TagsType::min()));

            vector<TagRange> ranges;

            while ( cursor->more() ) {
                BSONObj tag = cursor->nextSafe();
                TagRange tr(tag[TagsType::min()].Obj().getOwned(),
                            tag[TagsType::max()].Obj().getOwned(),
                            tag[TagsType::tag()].String());
                ranges.push_back(tr);
                uassert(16356,
                        str::stream() << "tag ranges not valid for: " << ns,
                        status.addTagRange(tr) );

            }
            cursor.reset();

            DBConfigPtr cfg = grid.getDBConfig( ns );
            if ( !cfg ) {
                warning() << "could not load db config to balance " << ns << " collection" << endl;
                continue;
            }

            // This line reloads the chunk manager once if this process doesn't know the collection
            // is sharded yet.
            ChunkManagerPtr cm = cfg->getChunkManagerIfExists( ns, true );
            if ( !cm ) {
                warning() << "could not load chunks to balance " << ns << " collection" << endl;
                continue;
            }

            // loop through tags to make sure no chunk spans tags; splits on tag min. for all chunks
            bool didAnySplits = false;
            for ( unsigned i = 0; i < ranges.size(); i++ ) {
                BSONObj min = ranges[i].min;

                min = cm->getShardKey().extendRangeBound( min, false );

                if ( allChunkMinimums.count( min ) > 0 )
                    continue;

                didAnySplits = true;

                log() << "ns: " << ns << " need to split on "
                      << min << " because there is a range there" << endl;

                ChunkPtr c = cm->findIntersectingChunk( min );

                vector<BSONObj> splitPoints;
                splitPoints.push_back( min );

                Status status = c->multiSplit(splitPoints, NULL);
                if ( !status.isOK() ) {
                    error() << "split failed: " << status << endl;
                }
                else {
                    LOG(1) << "split worked" << endl;
                }
                break;
            }

            if ( didAnySplits ) {
                // state change, just wait till next round
                continue;
            }

            CandidateChunk* p = _policy->balance( ns, status, _balancedLastTime );
            if ( p ) candidateChunks->push_back( CandidateChunkPtr( p ) );
        }
    }
Beispiel #17
0
        bool handleSpecialNamespaces( Request& r , QueryMessage& q ) {
            const char * ns = strstr( r.getns() , ".$cmd.sys." );
            if ( ! ns )
                return false;
            ns += 10;

            BSONObjBuilder b;
            vector<Shard> shards;

            ClientBasic* client = ClientBasic::getCurrent();
            AuthorizationSession* authSession = client->getAuthorizationSession();
            if ( strcmp( ns , "inprog" ) == 0 ) {
                const bool isAuthorized = authSession->isAuthorizedForActionsOnResource(
                        ResourcePattern::forClusterResource(), ActionType::inprog);
                audit::logInProgAuthzCheck(
                        client, q.query, isAuthorized ? ErrorCodes::OK : ErrorCodes::Unauthorized);
                uassert(ErrorCodes::Unauthorized, "not authorized to run inprog", isAuthorized);

                Shard::getAllShards( shards );

                BSONArrayBuilder arr( b.subarrayStart( "inprog" ) );

                for ( unsigned i=0; i<shards.size(); i++ ) {
                    Shard shard = shards[i];
                    ScopedDbConnection conn(shard.getConnString());
                    BSONObj temp = conn->findOne( r.getns() , q.query );
                    if ( temp["inprog"].isABSONObj() ) {
                        BSONObjIterator i( temp["inprog"].Obj() );
                        while ( i.more() ) {
                            BSONObjBuilder x;

                            BSONObjIterator j( i.next().Obj() );
                            while( j.more() ) {
                                BSONElement e = j.next();
                                if ( str::equals( e.fieldName() , "opid" ) ) {
                                    stringstream ss;
                                    ss << shard.getName() << ':' << e.numberInt();
                                    x.append( "opid" , ss.str() );
                                }
                                else if ( str::equals( e.fieldName() , "client" ) ) {
                                    x.appendAs( e , "client_s" );
                                }
                                else {
                                    x.append( e );
                                }
                            }
                            arr.append( x.obj() );
                        }
                    }
                    conn.done();
                }

                arr.done();
            }
            else if ( strcmp( ns , "killop" ) == 0 ) {
                const bool isAuthorized = authSession->isAuthorizedForActionsOnResource(
                        ResourcePattern::forClusterResource(), ActionType::killop);
                audit::logKillOpAuthzCheck(
                        client,
                        q.query,
                        isAuthorized ? ErrorCodes::OK : ErrorCodes::Unauthorized);
                uassert(ErrorCodes::Unauthorized, "not authorized to run killop", isAuthorized);

                BSONElement e = q.query["op"];
                if ( e.type() != String ) {
                    b.append( "err" , "bad op" );
                    b.append( e );
                }
                else {
                    b.append( e );
                    string s = e.String();
                    string::size_type i = s.find( ':' );
                    if ( i == string::npos ) {
                        b.append( "err" , "bad opid" );
                    }
                    else {
                        string shard = s.substr( 0 , i );
                        int opid = atoi( s.substr( i + 1 ).c_str() );
                        b.append( "shard" , shard );
                        b.append( "shardid" , opid );

                        log() << "want to kill op: " << e << endl;
                        Shard s(shard);

                        ScopedDbConnection conn(s.getConnString());
                        conn->findOne( r.getns() , BSON( "op" << opid ) );
                        conn.done();
                    }
                }
            }
            else if ( strcmp( ns , "unlock" ) == 0 ) {
                b.append( "err" , "can't do unlock through mongos" );
            }
            else {
                warning() << "unknown sys command [" << ns << "]" << endl;
                return false;
            }

            BSONObj x = b.done();
            replyToQuery(0, r.p(), r.m(), x);
            return true;
        }
Beispiel #18
0
    void Balancer::_doBalanceRound( DBClientBase& conn, vector<CandidateChunkPtr>* candidateChunks ) {
        verify( candidateChunks );

        //
        // 1. Check whether there is any sharded collection to be balanced by querying
        // the ShardsNS::collections collection
        //

        auto_ptr<DBClientCursor> cursor = conn.query(CollectionType::ConfigNS, BSONObj());
        vector< string > collections;
        while ( cursor->more() ) {
            BSONObj col = cursor->nextSafe();

            // sharded collections will have a shard "key".
            if ( ! col[CollectionType::keyPattern()].eoo() &&
                 ! col[CollectionType::noBalance()].trueValue() ){
                collections.push_back( col[CollectionType::ns()].String() );
            }
            else if( col[CollectionType::noBalance()].trueValue() ){
                LOG(1) << "not balancing collection " << col[CollectionType::ns()].String()
                       << ", explicitly disabled" << endl;
            }

        }
        cursor.reset();

        if ( collections.empty() ) {
            LOG(1) << "no collections to balance" << endl;
            return;
        }

        //
        // 2. Get a list of all the shards that are participating in this balance round
        // along with any maximum allowed quotas and current utilization. We get the
        // latter by issuing db.serverStatus() (mem.mapped) to all shards.
        //
        // TODO: skip unresponsive shards and mark information as stale.
        //

        vector<Shard> allShards;
        Shard::getAllShards( allShards );
        if ( allShards.size() < 2) {
            LOG(1) << "can't balance without more active shards" << endl;
            return;
        }
        
        ShardInfoMap shardInfo;
        for ( vector<Shard>::const_iterator it = allShards.begin(); it != allShards.end(); ++it ) {
            const Shard& s = *it;
            ShardStatus status = s.getStatus();
            shardInfo[ s.getName() ] = ShardInfo( s.getMaxSize(),
                                                  status.mapped(),
                                                  s.isDraining(),
                                                  status.hasOpsQueued(),
                                                  s.tags()
                                                  );
        }

        //
        // 3. For each collection, check if the balancing policy recommends moving anything around.
        //

        for (vector<string>::const_iterator it = collections.begin(); it != collections.end(); ++it ) {
            const string& ns = *it;

            map< string,vector<BSONObj> > shardToChunksMap;
            cursor = conn.query(ChunkType::ConfigNS,
                                QUERY(ChunkType::ns(ns)).sort(ChunkType::min()));

            set<BSONObj> allChunkMinimums;

            while ( cursor->more() ) {
                BSONObj chunk = cursor->nextSafe().getOwned();
                vector<BSONObj>& chunks = shardToChunksMap[chunk[ChunkType::shard()].String()];
                allChunkMinimums.insert( chunk[ChunkType::min()].Obj() );
                chunks.push_back( chunk );
            }
            cursor.reset();

            if (shardToChunksMap.empty()) {
                LOG(1) << "skipping empty collection (" << ns << ")";
                continue;
            }

            for ( vector<Shard>::iterator i=allShards.begin(); i!=allShards.end(); ++i ) {
                // this just makes sure there is an entry in shardToChunksMap for every shard
                Shard s = *i;
                shardToChunksMap[s.getName()].size();
            }

            DistributionStatus status( shardInfo, shardToChunksMap );

            // load tags
            conn.ensureIndex(TagsType::ConfigNS,
                             BSON(TagsType::ns() << 1 << TagsType::min() << 1),
                             true);

            cursor = conn.query(TagsType::ConfigNS,
                                QUERY(TagsType::ns(ns)).sort(TagsType::min()));

            vector<TagRange> ranges;

            while ( cursor->more() ) {
                BSONObj tag = cursor->nextSafe();
                TagRange tr(tag[TagsType::min()].Obj().getOwned(),
                            tag[TagsType::max()].Obj().getOwned(),
                            tag[TagsType::tag()].String());
                ranges.push_back(tr);
                uassert(16356,
                        str::stream() << "tag ranges not valid for: " << ns,
                        status.addTagRange(tr) );

            }
            cursor.reset();

            DBConfigPtr cfg = grid.getDBConfig( ns );
            verify( cfg );
            ChunkManagerPtr cm = cfg->getChunkManager( ns );
            verify( cm );

            // loop through tags to make sure no chunk spans tags; splits on tag min. for all chunks
            bool didAnySplits = false;
            for ( unsigned i = 0; i < ranges.size(); i++ ) {
                BSONObj min = ranges[i].min;

                min = cm->getShardKey().extendRangeBound( min, false );

                if ( allChunkMinimums.count( min ) > 0 )
                    continue;

                didAnySplits = true;

                log() << "ns: " << ns << " need to split on "
                      << min << " because there is a range there" << endl;

                ChunkPtr c = cm->findIntersectingChunk( min );

                vector<BSONObj> splitPoints;
                splitPoints.push_back( min );

                BSONObj res;
                if ( !c->multiSplit( splitPoints, res ) ) {
                    error() << "split failed: " << res << endl;
                }
                else {
                    LOG(1) << "split worked: " << res << endl;
                }
                break;
            }

            if ( didAnySplits ) {
                // state change, just wait till next round
                continue;
            }

            CandidateChunk* p = _policy->balance( ns, status, _balancedLastTime );
            if ( p ) candidateChunks->push_back( CandidateChunkPtr( p ) );
        }
    }
Beispiel #19
0
    void Balancer::_doBalanceRound( DBClientBase& conn, vector<CandidateChunkPtr>* candidateChunks ) {
        assert( candidateChunks );

        //
        // 1. Check whether there is any sharded collection to be balanced by querying
        // the ShardsNS::collections collection
        //

        auto_ptr<DBClientCursor> cursor = conn.query( ShardNS::collection , BSONObj() );
        vector< string > collections;
        while ( cursor->more() ) {
            BSONObj col = cursor->nextSafe();

            // sharded collections will have a shard "key".
            if ( ! col["key"].eoo() )
                collections.push_back( col["_id"].String() );
        }
        cursor.reset();

        if ( collections.empty() ) {
            LOG(1) << "no collections to balance" << endl;
            return;
        }

        //
        // 2. Get a list of all the shards that are participating in this balance round
        // along with any maximum allowed quotas and current utilization. We get the
        // latter by issuing db.serverStatus() (mem.mapped) to all shards.
        //
        // TODO: skip unresponsive shards and mark information as stale.
        //

        vector<Shard> allShards;
        Shard::getAllShards( allShards );
        if ( allShards.size() < 2) {
            LOG(1) << "can't balance without more active shards" << endl;
            return;
        }

        map< string, BSONObj > shardLimitsMap;
        for ( vector<Shard>::const_iterator it = allShards.begin(); it != allShards.end(); ++it ) {
            const Shard& s = *it;
            ShardStatus status = s.getStatus();

            BSONObj limitsObj = BSON( ShardFields::maxSize( s.getMaxSize() ) <<
                                      LimitsFields::currSize( status.mapped() ) <<
                                      ShardFields::draining( s.isDraining() )  <<
                                      LimitsFields::hasOpsQueued( status.hasOpsQueued() )
                                    );

            shardLimitsMap[ s.getName() ] = limitsObj;
        }

        //
        // 3. For each collection, check if the balancing policy recommends moving anything around.
        //

        for (vector<string>::const_iterator it = collections.begin(); it != collections.end(); ++it ) {
            const string& ns = *it;

            map< string,vector<BSONObj> > shardToChunksMap;
            cursor = conn.query( ShardNS::chunk , QUERY( "ns" << ns ).sort( "min" ) );
            while ( cursor->more() ) {
                BSONObj chunk = cursor->nextSafe();
                if ( chunk["jumbo"].trueValue() )
                    continue;
                vector<BSONObj>& chunks = shardToChunksMap[chunk["shard"].String()];
                chunks.push_back( chunk.getOwned() );
            }
            cursor.reset();

            if (shardToChunksMap.empty()) {
                LOG(1) << "skipping empty collection (" << ns << ")";
                continue;
            }

            for ( vector<Shard>::iterator i=allShards.begin(); i!=allShards.end(); ++i ) {
                // this just makes sure there is an entry in shardToChunksMap for every shard
                Shard s = *i;
                shardToChunksMap[s.getName()].size();
            }

            CandidateChunk* p = _policy->balance( ns , shardLimitsMap , shardToChunksMap , _balancedLastTime );
            if ( p ) candidateChunks->push_back( CandidateChunkPtr( p ) );
        }
    }