Example #1
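    // Walks the list of candidate chunks and tries to migrate each one to its destination shard.
    // If a chunk's boundaries no longer match the cached metadata (most likely a concurrent
    // split), the chunk manager is reloaded once; a second mismatch skips that chunk. A move
    // that fails because the chunk is too big triggers a forced split, and if the split also
    // fails the chunk is marked as jumbo. Returns the number of chunks moved; jumbo-marked
    // chunks are counted too, so another balancing round starts right away.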
    int Balancer::_moveChunks( const vector<CandidateChunkPtr>* candidateChunks , bool secondaryThrottle ) {
        int movedCount = 0;

        for ( vector<CandidateChunkPtr>::const_iterator it = candidateChunks->begin(); it != candidateChunks->end(); ++it ) {
            const CandidateChunk& chunkInfo = *it->get();

            DBConfigPtr cfg = grid.getDBConfig( chunkInfo.ns );
            verify( cfg );

            ChunkManagerPtr cm = cfg->getChunkManager( chunkInfo.ns );
            verify( cm );

            ChunkPtr c = cm->findChunk( chunkInfo.chunk.min );
            if ( c->getMin().woCompare( chunkInfo.chunk.min ) || c->getMax().woCompare( chunkInfo.chunk.max ) ) {
                // likely a split happened somewhere
                cm = cfg->getChunkManager( chunkInfo.ns , true /* reload */);
                verify( cm );

                c = cm->findChunk( chunkInfo.chunk.min );
                if ( c->getMin().woCompare( chunkInfo.chunk.min ) || c->getMax().woCompare( chunkInfo.chunk.max ) ) {
                    log() << "chunk mismatch after reload, ignoring will retry issue " << chunkInfo.chunk.toString() << endl;
                    continue;
                }
            }

            BSONObj res;
            if ( c->moveAndCommit( Shard::make( chunkInfo.to ) , Chunk::MaxChunkSize , secondaryThrottle , res ) ) {
                movedCount++;
                continue;
            }

            // the move requires acquiring the collection metadata's lock, which can fail
            log() << "balancer move failed: " << res << " from: " << chunkInfo.from << " to: " << chunkInfo.to
                  << " chunk: " << chunkInfo.chunk << endl;

            if ( res["chunkTooBig"].trueValue() ) {
                // reload just to be safe
                cm = cfg->getChunkManager( chunkInfo.ns );
                verify( cm );
                c = cm->findChunk( chunkInfo.chunk.min );
                
                log() << "forcing a split because migrate failed for size reasons" << endl;
                
                res = BSONObj();
                c->singleSplit( true , res );
                log() << "forced split results: " << res << endl;
                
                if ( ! res["ok"].trueValue() ) {
                    log() << "marking chunk as jumbo: " << c->toString() << endl;
                    c->markAsJumbo();
                    // we increment movedCount so we do another round right away
                    movedCount++;
                }

            }
        }

        return movedCount;
    }
Example #2
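// Same migration loop, threaded through an OperationContext. Before each move it re-reads the
// balancer settings from the config servers and ends the round early if balancing was disabled
// (or the skipBalanceRound fail point is set). A failure while moving one chunk is caught and
// logged so the rest of the round can continue.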
int Balancer::_moveChunks(OperationContext* txn,
                          const vector<MigrateInfo>& candidateChunks,
                          const MigrationSecondaryThrottleOptions& secondaryThrottle,
                          bool waitForDelete) {
    int movedCount = 0;

    for (const auto& migrateInfo : candidateChunks) {
        // If the balancer was disabled since we started this round, don't start new chunk
        // moves.
        const auto balSettingsResult =
            grid.catalogManager(txn)->getGlobalSettings(txn, SettingsType::BalancerDocKey);

        const bool isBalSettingsAbsent =
            balSettingsResult.getStatus() == ErrorCodes::NoMatchingDocument;

        if (!balSettingsResult.isOK() && !isBalSettingsAbsent) {
            warning() << balSettingsResult.getStatus();
            return movedCount;
        }

        const SettingsType& balancerConfig =
            isBalSettingsAbsent ? SettingsType{} : balSettingsResult.getValue();

        if ((!isBalSettingsAbsent && !Chunk::shouldBalance(balancerConfig)) ||
            MONGO_FAIL_POINT(skipBalanceRound)) {
            LOG(1) << "Stopping balancing round early as balancing was disabled";
            return movedCount;
        }

        // Changes to metadata, borked metadata, and connectivity problems between shards
        // should cause us to abort this chunk move, but shouldn't cause us to abort the entire
        // round of chunks.
        //
        // TODO(spencer): We probably *should* abort the whole round on issues communicating
        // with the config servers, but it's impossible to distinguish those types of failures
        // at the moment.
        //
        // TODO: Handle all these things more cleanly, since they're expected problems

        const NamespaceString nss(migrateInfo.ns);

        try {
            shared_ptr<DBConfig> cfg =
                uassertStatusOK(grid.catalogCache()->getDatabase(txn, nss.db().toString()));

            // NOTE: We purposely do not reload metadata here, since _getCandidateChunks already
            // tried to do so once
            shared_ptr<ChunkManager> cm = cfg->getChunkManager(txn, migrateInfo.ns);
            uassert(28628,
                    str::stream()
                        << "Collection " << migrateInfo.ns
                        << " was deleted while balancing was active. Aborting balancing round.",
                    cm);

            ChunkPtr c = cm->findIntersectingChunk(txn, migrateInfo.chunk.min);

            if (c->getMin().woCompare(migrateInfo.chunk.min) ||
                c->getMax().woCompare(migrateInfo.chunk.max)) {
                // Likely a split happened somewhere, so force reload the chunk manager
                cm = cfg->getChunkManager(txn, migrateInfo.ns, true);
                invariant(cm);

                c = cm->findIntersectingChunk(txn, migrateInfo.chunk.min);

                if (c->getMin().woCompare(migrateInfo.chunk.min) ||
                    c->getMax().woCompare(migrateInfo.chunk.max)) {
                    log() << "chunk mismatch after reload, ignoring will retry issue "
                          << migrateInfo.chunk.toString();

                    continue;
                }
            }

            BSONObj res;
            if (c->moveAndCommit(txn,
                                 migrateInfo.to,
                                 Chunk::MaxChunkSize,
                                 secondaryThrottle,
                                 waitForDelete,
                                 0, /* maxTimeMS */
                                 res)) {
                movedCount++;
                continue;
            }

            // The move requires acquiring the collection metadata's lock, which can fail.
            log() << "balancer move failed: " << res << " from: " << migrateInfo.from
                  << " to: " << migrateInfo.to << " chunk: " << migrateInfo.chunk;

            Status moveStatus = getStatusFromCommandResult(res);

            if (moveStatus == ErrorCodes::ChunkTooBig || res["chunkTooBig"].trueValue()) {
                // Reload just to be safe
                cm = cfg->getChunkManager(txn, migrateInfo.ns);
                invariant(cm);

                c = cm->findIntersectingChunk(txn, migrateInfo.chunk.min);

                log() << "performing a split because migrate failed for size reasons";

                Status status = c->split(txn, Chunk::normal, NULL, NULL);
                log() << "split results: " << status;

                if (!status.isOK()) {
                    log() << "marking chunk as jumbo: " << c->toString();

                    c->markAsJumbo(txn);

                    // We increment movedCount so we do another round right away
                    movedCount++;
                }
            }
        } catch (const DBException& ex) {
            warning() << "could not move chunk " << migrateInfo.chunk.toString()
                      << ", continuing balancing round" << causedBy(ex);
        }
    }

    return movedCount;
}
Example #3
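    // Variant of the migration loop that also forwards a waitForDelete flag to moveAndCommit.
    // Each chunk move is wrapped in a try/catch so metadata or connectivity problems abort only
    // that move, not the whole balancing round.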
    int Balancer::_moveChunks(const vector<CandidateChunkPtr>* candidateChunks,
                              bool secondaryThrottle,
                              bool waitForDelete)
    {
        int movedCount = 0;

        for ( vector<CandidateChunkPtr>::const_iterator it = candidateChunks->begin(); it != candidateChunks->end(); ++it ) {
            const CandidateChunk& chunkInfo = *it->get();

            // Changes to metadata, borked metadata, and connectivity problems should cause us to
            // abort this chunk move, but shouldn't cause us to abort the entire round of chunks.
            // TODO: Handle all these things more cleanly, since they're expected problems
            try {

                DBConfigPtr cfg = grid.getDBConfig( chunkInfo.ns );
                verify( cfg );

                // NOTE: We purposely do not reload metadata here, since _doBalanceRound already
                // tried to do so once.
                ChunkManagerPtr cm = cfg->getChunkManager( chunkInfo.ns );
                verify( cm );

                ChunkPtr c = cm->findIntersectingChunk( chunkInfo.chunk.min );
                if ( c->getMin().woCompare( chunkInfo.chunk.min ) || c->getMax().woCompare( chunkInfo.chunk.max ) ) {
                    // likely a split happened somewhere
                    cm = cfg->getChunkManager( chunkInfo.ns , true /* reload */);
                    verify( cm );

                    c = cm->findIntersectingChunk( chunkInfo.chunk.min );
                    if ( c->getMin().woCompare( chunkInfo.chunk.min ) || c->getMax().woCompare( chunkInfo.chunk.max ) ) {
                        log() << "chunk mismatch after reload, ignoring will retry issue " << chunkInfo.chunk.toString() << endl;
                        continue;
                    }
                }

                BSONObj res;
                if (c->moveAndCommit(Shard::make(chunkInfo.to),
                                     Chunk::MaxChunkSize,
                                     secondaryThrottle,
                                     waitForDelete,
                                     0, /* maxTimeMS */
                                     res)) {
                    movedCount++;
                    continue;
                }

                // the move requires acquiring the collection metadata's lock, which can fail
                log() << "balancer move failed: " << res << " from: " << chunkInfo.from << " to: " << chunkInfo.to
                      << " chunk: " << chunkInfo.chunk << endl;

                if ( res["chunkTooBig"].trueValue() ) {
                    // reload just to be safe
                    cm = cfg->getChunkManager( chunkInfo.ns );
                    verify( cm );
                    c = cm->findIntersectingChunk( chunkInfo.chunk.min );

                    log() << "forcing a split because migrate failed for size reasons" << endl;

                    res = BSONObj();
                    c->singleSplit( true , res );
                    log() << "forced split results: " << res << endl;

                    if ( ! res["ok"].trueValue() ) {
                        log() << "marking chunk as jumbo: " << c->toString() << endl;
                        c->markAsJumbo();
                        // we increment movedCount so we do another round right away
                        movedCount++;
                    }

                }
            }
            catch( const DBException& ex ) {
                warning() << "could not move chunk " << chunkInfo.chunk.toString()
                          << ", continuing balancing round" << causedBy( ex ) << endl;
            }
        }

        return movedCount;
    }
Example #4
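// Variant that takes explicit WriteConcernOptions for the migration instead of a
// secondaryThrottle flag. Like the version above that re-checks the balancer settings, it stops
// the round early if balancing was disabled, and it still falls back to splitting (and, failing
// that, marking as jumbo) when a chunk is too big to move.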
int Balancer::_moveChunks(const vector<CandidateChunkPtr>* candidateChunks,
                          const WriteConcernOptions* writeConcern,
                          bool waitForDelete) {
    int movedCount = 0;

    for (vector<CandidateChunkPtr>::const_iterator it = candidateChunks->begin();
         it != candidateChunks->end();
         ++it) {
        // If the balancer was disabled since we started this round, don't start new
        // chunk moves.
        SettingsType balancerConfig;
        std::string errMsg;

        if (!grid.getBalancerSettings(&balancerConfig, &errMsg)) {
            warning() << errMsg;
            // No point in continuing the round if the config servers are unreachable.
            return movedCount;
        }

        if ((balancerConfig.isKeySet() &&  // balancer config doc exists
             !grid.shouldBalance(balancerConfig)) ||
            MONGO_FAIL_POINT(skipBalanceRound)) {
            LOG(1) << "Stopping balancing round early as balancing was disabled";
            return movedCount;
        }

        // Changes to metadata, borked metadata, and connectivity problems between shards should
        // cause us to abort this chunk move, but shouldn't cause us to abort the entire round
        // of chunks.
        // TODO(spencer): We probably *should* abort the whole round on issues communicating
        // with the config servers, but it's impossible to distinguish those types of failures
        // at the moment.
        // TODO: Handle all these things more cleanly, since they're expected problems
        const CandidateChunk& chunkInfo = *it->get();
        try {
            DBConfigPtr cfg = grid.getDBConfig(chunkInfo.ns);
            verify(cfg);

            // NOTE: We purposely do not reload metadata here, since _doBalanceRound already
            // tried to do so once.
            ChunkManagerPtr cm = cfg->getChunkManager(chunkInfo.ns);
            verify(cm);

            ChunkPtr c = cm->findIntersectingChunk(chunkInfo.chunk.min);
            if (c->getMin().woCompare(chunkInfo.chunk.min) ||
                c->getMax().woCompare(chunkInfo.chunk.max)) {
                // likely a split happened somewhere
                cm = cfg->getChunkManager(chunkInfo.ns, true /* reload */);
                verify(cm);

                c = cm->findIntersectingChunk(chunkInfo.chunk.min);
                if (c->getMin().woCompare(chunkInfo.chunk.min) ||
                    c->getMax().woCompare(chunkInfo.chunk.max)) {
                    log() << "chunk mismatch after reload, ignoring will retry issue "
                          << chunkInfo.chunk.toString() << endl;
                    continue;
                }
            }

            BSONObj res;
            if (c->moveAndCommit(Shard::make(chunkInfo.to),
                                 Chunk::MaxChunkSize,
                                 writeConcern,
                                 waitForDelete,
                                 0, /* maxTimeMS */
                                 res)) {
                movedCount++;
                continue;
            }

            // the move requires acquiring the collection metadata's lock, which can fail
            log() << "balancer move failed: " << res << " from: " << chunkInfo.from
                  << " to: " << chunkInfo.to << " chunk: " << chunkInfo.chunk << endl;

            if (res["chunkTooBig"].trueValue()) {
                // reload just to be safe
                cm = cfg->getChunkManager(chunkInfo.ns);
                verify(cm);
                c = cm->findIntersectingChunk(chunkInfo.chunk.min);

                log() << "performing a split because migrate failed for size reasons";

                Status status = c->split(Chunk::normal, NULL, NULL);
                log() << "split results: " << status << endl;

                if (!status.isOK()) {
                    log() << "marking chunk as jumbo: " << c->toString() << endl;
                    c->markAsJumbo();
                    // we increment movedCount so we do another round right away
                    movedCount++;
                }
            }
        } catch (const DBException& ex) {
            warning() << "could not move chunk " << chunkInfo.chunk.toString()
                      << ", continuing balancing round" << causedBy(ex) << endl;
        }
    }

    return movedCount;
}