int Balancer::_moveChunks( const vector<CandidateChunkPtr>* candidateChunks , bool secondaryThrottle ) {
    int movedCount = 0;

    for ( vector<CandidateChunkPtr>::const_iterator it = candidateChunks->begin();
          it != candidateChunks->end();
          ++it ) {
        const CandidateChunk& chunkInfo = *it->get();

        DBConfigPtr cfg = grid.getDBConfig( chunkInfo.ns );
        verify( cfg );

        ChunkManagerPtr cm = cfg->getChunkManager( chunkInfo.ns );
        verify( cm );

        ChunkPtr c = cm->findChunk( chunkInfo.chunk.min );
        if ( c->getMin().woCompare( chunkInfo.chunk.min ) ||
             c->getMax().woCompare( chunkInfo.chunk.max ) ) {
            // likely a split happened somewhere
            cm = cfg->getChunkManager( chunkInfo.ns , true /* reload */ );
            verify( cm );

            c = cm->findChunk( chunkInfo.chunk.min );
            if ( c->getMin().woCompare( chunkInfo.chunk.min ) ||
                 c->getMax().woCompare( chunkInfo.chunk.max ) ) {
                log() << "chunk mismatch after reload, ignoring will retry issue "
                      << chunkInfo.chunk.toString() << endl;
                continue;
            }
        }

        BSONObj res;
        if ( c->moveAndCommit( Shard::make( chunkInfo.to ),
                               Chunk::MaxChunkSize,
                               secondaryThrottle,
                               res ) ) {
            movedCount++;
            continue;
        }

        // the move requires acquiring the collection metadata's lock, which can fail
        log() << "balancer move failed: " << res
              << " from: " << chunkInfo.from
              << " to: " << chunkInfo.to
              << " chunk: " << chunkInfo.chunk << endl;

        if ( res["chunkTooBig"].trueValue() ) {
            // reload just to be safe
            cm = cfg->getChunkManager( chunkInfo.ns );
            verify( cm );
            c = cm->findChunk( chunkInfo.chunk.min );

            log() << "forcing a split because migrate failed for size reasons" << endl;

            res = BSONObj();
            c->singleSplit( true , res );
            log() << "forced split results: " << res << endl;

            if ( ! res["ok"].trueValue() ) {
                log() << "marking chunk as jumbo: " << c->toString() << endl;
                c->markAsJumbo();
                // we increment movedCount so we do another round right away
                movedCount++;
            }
        }
    }

    return movedCount;
}
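// ---------------------------------------------------------------------------
// A minimal sketch of the record shape the loop above consumes. The field
// names follow the accesses in _moveChunks (ns, from, to, chunk.min,
// chunk.max, chunk.toString()); the type names and member layout here are
// assumptions for illustration only -- the real definitions live elsewhere
// in the tree.
// ---------------------------------------------------------------------------
struct SketchChunkBounds {
    BSONObj min;  // inclusive lower bound of the chunk's key range
    BSONObj max;  // exclusive upper bound
    std::string toString() const {
        return min.toString() + " -> " + max.toString();
    }
};

struct SketchCandidateChunk {
    std::string ns;           // sharded collection namespace, e.g. "db.coll"
    std::string from;         // shard currently holding the chunk
    std::string to;           // shard the balancer wants to receive it
    SketchChunkBounds chunk;  // bounds used to re-find the chunk after a reload
};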
int Balancer::_moveChunks(OperationContext* txn,
                          const vector<MigrateInfo>& candidateChunks,
                          const MigrationSecondaryThrottleOptions& secondaryThrottle,
                          bool waitForDelete) {
    int movedCount = 0;

    for (const auto& migrateInfo : candidateChunks) {
        // If the balancer was disabled since we started this round, don't start new chunk
        // moves.
        const auto balSettingsResult =
            grid.catalogManager(txn)->getGlobalSettings(txn, SettingsType::BalancerDocKey);

        const bool isBalSettingsAbsent =
            balSettingsResult.getStatus() == ErrorCodes::NoMatchingDocument;

        if (!balSettingsResult.isOK() && !isBalSettingsAbsent) {
            warning() << balSettingsResult.getStatus();
            return movedCount;
        }

        const SettingsType& balancerConfig =
            isBalSettingsAbsent ? SettingsType{} : balSettingsResult.getValue();

        if ((!isBalSettingsAbsent && !Chunk::shouldBalance(balancerConfig)) ||
            MONGO_FAIL_POINT(skipBalanceRound)) {
            LOG(1) << "Stopping balancing round early as balancing was disabled";
            return movedCount;
        }

        // Changes to metadata, borked metadata, and connectivity problems between shards
        // should cause us to abort this chunk move, but shouldn't cause us to abort the entire
        // round of chunks.
        //
        // TODO(spencer): We probably *should* abort the whole round on issues communicating
        // with the config servers, but it's impossible to distinguish those types of failures
        // at the moment.
        //
        // TODO: Handle all these things more cleanly, since they're expected problems

        const NamespaceString nss(migrateInfo.ns);

        try {
            shared_ptr<DBConfig> cfg =
                uassertStatusOK(grid.catalogCache()->getDatabase(txn, nss.db().toString()));

            // NOTE: We purposely do not reload metadata here, since _getCandidateChunks already
            // tried to do so once
            shared_ptr<ChunkManager> cm = cfg->getChunkManager(txn, migrateInfo.ns);
            uassert(28628,
                    str::stream()
                        << "Collection " << migrateInfo.ns
                        << " was deleted while balancing was active. Aborting balancing round.",
                    cm);

            ChunkPtr c = cm->findIntersectingChunk(txn, migrateInfo.chunk.min);

            if (c->getMin().woCompare(migrateInfo.chunk.min) ||
                c->getMax().woCompare(migrateInfo.chunk.max)) {
                // Likely a split happened somewhere, so force reload the chunk manager
                cm = cfg->getChunkManager(txn, migrateInfo.ns, true);
                invariant(cm);

                c = cm->findIntersectingChunk(txn, migrateInfo.chunk.min);

                if (c->getMin().woCompare(migrateInfo.chunk.min) ||
                    c->getMax().woCompare(migrateInfo.chunk.max)) {
                    log() << "chunk mismatch after reload, ignoring will retry issue "
                          << migrateInfo.chunk.toString();
                    continue;
                }
            }

            BSONObj res;
            if (c->moveAndCommit(txn,
                                 migrateInfo.to,
                                 Chunk::MaxChunkSize,
                                 secondaryThrottle,
                                 waitForDelete,
                                 0, /* maxTimeMS */
                                 res)) {
                movedCount++;
                continue;
            }

            // The move requires acquiring the collection metadata's lock, which can fail.
            log() << "balancer move failed: " << res
                  << " from: " << migrateInfo.from
                  << " to: " << migrateInfo.to
                  << " chunk: " << migrateInfo.chunk;

            Status moveStatus = getStatusFromCommandResult(res);

            if (moveStatus == ErrorCodes::ChunkTooBig || res["chunkTooBig"].trueValue()) {
                // Reload just to be safe
                cm = cfg->getChunkManager(txn, migrateInfo.ns);
                invariant(cm);
                c = cm->findIntersectingChunk(txn, migrateInfo.chunk.min);

                log() << "performing a split because migrate failed for size reasons";

                Status status = c->split(txn, Chunk::normal, NULL, NULL);
                log() << "split results: " << status;

                if (!status.isOK()) {
                    log() << "marking chunk as jumbo: " << c->toString();
                    c->markAsJumbo(txn);

                    // We increment movedCount so we do another round right away
                    movedCount++;
                }
            }
        } catch (const DBException& ex) {
            warning() << "could not move chunk " << migrateInfo.chunk.toString()
                      << ", continuing balancing round" << causedBy(ex);
        }
    }

    return movedCount;
}
int Balancer::_moveChunks(const vector<CandidateChunkPtr>* candidateChunks,
                          bool secondaryThrottle,
                          bool waitForDelete) {
    int movedCount = 0;

    for ( vector<CandidateChunkPtr>::const_iterator it = candidateChunks->begin();
          it != candidateChunks->end();
          ++it ) {
        const CandidateChunk& chunkInfo = *it->get();

        // Changes to metadata, borked metadata, and connectivity problems should cause us to
        // abort this chunk move, but shouldn't cause us to abort the entire round of chunks.
        // TODO: Handle all these things more cleanly, since they're expected problems
        try {
            DBConfigPtr cfg = grid.getDBConfig( chunkInfo.ns );
            verify( cfg );

            // NOTE: We purposely do not reload metadata here, since _doBalanceRound already
            // tried to do so once.
            ChunkManagerPtr cm = cfg->getChunkManager( chunkInfo.ns );
            verify( cm );

            ChunkPtr c = cm->findIntersectingChunk( chunkInfo.chunk.min );
            if ( c->getMin().woCompare( chunkInfo.chunk.min ) ||
                 c->getMax().woCompare( chunkInfo.chunk.max ) ) {
                // likely a split happened somewhere
                cm = cfg->getChunkManager( chunkInfo.ns , true /* reload */ );
                verify( cm );

                c = cm->findIntersectingChunk( chunkInfo.chunk.min );
                if ( c->getMin().woCompare( chunkInfo.chunk.min ) ||
                     c->getMax().woCompare( chunkInfo.chunk.max ) ) {
                    log() << "chunk mismatch after reload, ignoring will retry issue "
                          << chunkInfo.chunk.toString() << endl;
                    continue;
                }
            }

            BSONObj res;
            if (c->moveAndCommit(Shard::make(chunkInfo.to),
                                 Chunk::MaxChunkSize,
                                 secondaryThrottle,
                                 waitForDelete,
                                 0, /* maxTimeMS */
                                 res)) {
                movedCount++;
                continue;
            }

            // the move requires acquiring the collection metadata's lock, which can fail
            log() << "balancer move failed: " << res
                  << " from: " << chunkInfo.from
                  << " to: " << chunkInfo.to
                  << " chunk: " << chunkInfo.chunk << endl;

            if ( res["chunkTooBig"].trueValue() ) {
                // reload just to be safe
                cm = cfg->getChunkManager( chunkInfo.ns );
                verify( cm );
                c = cm->findIntersectingChunk( chunkInfo.chunk.min );

                log() << "forcing a split because migrate failed for size reasons" << endl;

                res = BSONObj();
                c->singleSplit( true , res );
                log() << "forced split results: " << res << endl;

                if ( ! res["ok"].trueValue() ) {
                    log() << "marking chunk as jumbo: " << c->toString() << endl;
                    c->markAsJumbo();
                    // we increment movedCount so we do another round right away
                    movedCount++;
                }
            }
        }
        catch( const DBException& ex ) {
            warning() << "could not move chunk " << chunkInfo.chunk.toString()
                      << ", continuing balancing round" << causedBy( ex ) << endl;
        }
    }

    return movedCount;
}
int Balancer::_moveChunks(const vector<CandidateChunkPtr>* candidateChunks,
                          const WriteConcernOptions* writeConcern,
                          bool waitForDelete) {
    int movedCount = 0;

    for (vector<CandidateChunkPtr>::const_iterator it = candidateChunks->begin();
         it != candidateChunks->end();
         ++it) {
        // If the balancer was disabled since we started this round, don't start new
        // chunk moves.
        SettingsType balancerConfig;
        std::string errMsg;

        if (!grid.getBalancerSettings(&balancerConfig, &errMsg)) {
            warning() << errMsg;
            // No point in continuing the round if the config servers are unreachable.
            return movedCount;
        }

        if ((balancerConfig.isKeySet() &&  // balancer config doc exists
             !grid.shouldBalance(balancerConfig)) ||
            MONGO_FAIL_POINT(skipBalanceRound)) {
            LOG(1) << "Stopping balancing round early as balancing was disabled";
            return movedCount;
        }

        // Changes to metadata, borked metadata, and connectivity problems between shards should
        // cause us to abort this chunk move, but shouldn't cause us to abort the entire round
        // of chunks.
        // TODO(spencer): We probably *should* abort the whole round on issues communicating
        // with the config servers, but it's impossible to distinguish those types of failures
        // at the moment.
        // TODO: Handle all these things more cleanly, since they're expected problems
        const CandidateChunk& chunkInfo = *it->get();
        try {
            DBConfigPtr cfg = grid.getDBConfig(chunkInfo.ns);
            verify(cfg);

            // NOTE: We purposely do not reload metadata here, since _doBalanceRound already
            // tried to do so once.
            ChunkManagerPtr cm = cfg->getChunkManager(chunkInfo.ns);
            verify(cm);

            ChunkPtr c = cm->findIntersectingChunk(chunkInfo.chunk.min);
            if (c->getMin().woCompare(chunkInfo.chunk.min) ||
                c->getMax().woCompare(chunkInfo.chunk.max)) {
                // likely a split happened somewhere
                cm = cfg->getChunkManager(chunkInfo.ns, true /* reload */);
                verify(cm);

                c = cm->findIntersectingChunk(chunkInfo.chunk.min);
                if (c->getMin().woCompare(chunkInfo.chunk.min) ||
                    c->getMax().woCompare(chunkInfo.chunk.max)) {
                    log() << "chunk mismatch after reload, ignoring will retry issue "
                          << chunkInfo.chunk.toString() << endl;
                    continue;
                }
            }

            BSONObj res;
            if (c->moveAndCommit(Shard::make(chunkInfo.to),
                                 Chunk::MaxChunkSize,
                                 writeConcern,
                                 waitForDelete,
                                 0, /* maxTimeMS */
                                 res)) {
                movedCount++;
                continue;
            }

            // the move requires acquiring the collection metadata's lock, which can fail
            log() << "balancer move failed: " << res
                  << " from: " << chunkInfo.from
                  << " to: " << chunkInfo.to
                  << " chunk: " << chunkInfo.chunk << endl;

            if (res["chunkTooBig"].trueValue()) {
                // reload just to be safe
                cm = cfg->getChunkManager(chunkInfo.ns);
                verify(cm);
                c = cm->findIntersectingChunk(chunkInfo.chunk.min);

                log() << "performing a split because migrate failed for size reasons";

                Status status = c->split(Chunk::normal, NULL, NULL);
                log() << "split results: " << status << endl;

                if (!status.isOK()) {
                    log() << "marking chunk as jumbo: " << c->toString() << endl;
                    c->markAsJumbo();
                    // we increment movedCount so we do another round right away
                    movedCount++;
                }
            }
        } catch (const DBException& ex) {
            warning() << "could not move chunk " << chunkInfo.chunk.toString()
                      << ", continuing balancing round" << causedBy(ex) << endl;
        }
    }

    return movedCount;
}
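// ---------------------------------------------------------------------------
// The per-iteration gate at the top of the loop (present in the
// OperationContext-based version and in the WriteConcernOptions version
// above) makes a three-way decision on the balancer settings document. A
// minimal sketch under stated assumptions -- the helper and its bool inputs
// are hypothetical; the real checks go through grid.getBalancerSettings /
// grid.shouldBalance or the catalog manager's getGlobalSettings:
// ---------------------------------------------------------------------------
enum class BalancerGate { kContinue, kStopUnreachable, kStopDisabled };

BalancerGate checkBalancerGate(bool settingsFetched,
                               bool settingsDocExists,
                               bool balancingEnabled,
                               bool failPointActive) {
    if (!settingsFetched)
        return BalancerGate::kStopUnreachable;  // config servers unreachable: end the round
    if ((settingsDocExists && !balancingEnabled) || failPointActive)
        return BalancerGate::kStopDisabled;     // balancer turned off (or test fail point hit)
    return BalancerGate::kContinue;             // no settings doc means balancing defaults to on
}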