void Balancer::_doBalanceRound( DBClientBase& conn, vector<CandidateChunkPtr>* candidateChunks ) {
    verify( candidateChunks );

    //
    // 1. Check whether there is any sharded collection to be balanced by querying
    // the ShardsNS::collections collection
    //

    auto_ptr<DBClientCursor> cursor = conn.query(CollectionType::ConfigNS, BSONObj());

    if ( NULL == cursor.get() ) {
        warning() << "could not query " << CollectionType::ConfigNS
                  << " while trying to balance" << endl;
        return;
    }

    vector< string > collections;
    while ( cursor->more() ) {
        BSONObj col = cursor->nextSafe();

        // sharded collections will have a shard "key".
        if ( ! col[CollectionType::keyPattern()].eoo() &&
             ! col[CollectionType::noBalance()].trueValue() ){
            collections.push_back( col[CollectionType::ns()].String() );
        }
        else if( col[CollectionType::noBalance()].trueValue() ){
            LOG(1) << "not balancing collection " << col[CollectionType::ns()].String()
                   << ", explicitly disabled" << endl;
        }
    }
    cursor.reset();

    if ( collections.empty() ) {
        LOG(1) << "no collections to balance" << endl;
        return;
    }

    //
    // 2. Get a list of all the shards that are participating in this balance round
    // along with any maximum allowed quotas and current utilization. We get the
    // latter by issuing db.serverStatus() (mem.mapped) to all shards.
    //
    // TODO: skip unresponsive shards and mark information as stale.
    //

    ShardInfoMap shardInfo;
    Status loadStatus = DistributionStatus::populateShardInfoMap(&shardInfo);

    if (!loadStatus.isOK()) {
        warning() << "failed to load shard metadata" << causedBy(loadStatus) << endl;
        return;
    }

    if (shardInfo.size() < 2) {
        LOG(1) << "can't balance without more active shards" << endl;
        return;
    }

    OCCASIONALLY warnOnMultiVersion( shardInfo );

    //
    // 3. For each collection, check if the balancing policy recommends moving anything around.
    //

    for (vector<string>::const_iterator it = collections.begin();
         it != collections.end();
         ++it ) {
        const string& ns = *it;

        OwnedPointerMap<string, OwnedPointerVector<ChunkType> > shardToChunksMap;
        cursor = conn.query(ChunkType::ConfigNS,
                            QUERY(ChunkType::ns(ns)).sort(ChunkType::min()));

        set<BSONObj> allChunkMinimums;

        while ( cursor->more() ) {
            BSONObj chunkDoc = cursor->nextSafe().getOwned();

            auto_ptr<ChunkType> chunk(new ChunkType());
            string errmsg;
            if (!chunk->parseBSON(chunkDoc, &errmsg)) {
                error() << "bad chunk format for " << chunkDoc
                        << ": " << errmsg << endl;
                return;
            }

            allChunkMinimums.insert(chunk->getMin().getOwned());
            OwnedPointerVector<ChunkType>*& chunkList =
                    shardToChunksMap.mutableMap()[chunk->getShard()];

            if (chunkList == NULL) {
                chunkList = new OwnedPointerVector<ChunkType>();
            }

            chunkList->mutableVector().push_back(chunk.release());
        }
        cursor.reset();

        if (shardToChunksMap.map().empty()) {
            LOG(1) << "skipping empty collection (" << ns << ")";
            continue;
        }

        for (ShardInfoMap::const_iterator i = shardInfo.begin(); i != shardInfo.end(); ++i) {
            // this just makes sure there is an entry in shardToChunksMap for every shard
            OwnedPointerVector<ChunkType>*& chunkList =
                    shardToChunksMap.mutableMap()[i->first];

            if (chunkList == NULL) {
                chunkList = new OwnedPointerVector<ChunkType>();
            }
        }

        DistributionStatus status(shardInfo, shardToChunksMap.map());

        // load tags
        Status result = clusterCreateIndex(TagsType::ConfigNS,
                                           BSON(TagsType::ns() << 1 << TagsType::min() << 1),
                                           true, // unique
                                           WriteConcernOptions::AllConfigs,
                                           NULL);

        if ( !result.isOK() ) {
            warning() << "could not create index tags_1_min_1: " << result.reason() << endl;
            continue;
        }

        cursor = conn.query(TagsType::ConfigNS,
                            QUERY(TagsType::ns(ns)).sort(TagsType::min()));

        vector<TagRange> ranges;

        while ( cursor->more() ) {
            BSONObj tag = cursor->nextSafe();
            TagRange tr(tag[TagsType::min()].Obj().getOwned(),
                        tag[TagsType::max()].Obj().getOwned(),
                        tag[TagsType::tag()].String());
            ranges.push_back(tr);
            uassert(16356,
                    str::stream() << "tag ranges not valid for: " << ns,
                    status.addTagRange(tr) );
        }
        cursor.reset();

        DBConfigPtr cfg = grid.getDBConfig( ns );
        if ( !cfg ) {
            warning() << "could not load db config to balance " << ns << " collection" << endl;
            continue;
        }

        // This line reloads the chunk manager once if this process doesn't know the collection
        // is sharded yet.
        ChunkManagerPtr cm = cfg->getChunkManagerIfExists( ns, true );
        if ( !cm ) {
            warning() << "could not load chunks to balance " << ns << " collection" << endl;
            continue;
        }

        // loop through tags to make sure no chunk spans tags; splits on tag min. for all chunks
        bool didAnySplits = false;
        for ( unsigned i = 0; i < ranges.size(); i++ ) {
            BSONObj min = ranges[i].min;

            min = cm->getShardKey().extendRangeBound( min, false );

            if ( allChunkMinimums.count( min ) > 0 )
                continue;

            didAnySplits = true;

            log() << "ns: " << ns << " need to split on " << min
                  << " because there is a range there" << endl;

            ChunkPtr c = cm->findIntersectingChunk( min );

            vector<BSONObj> splitPoints;
            splitPoints.push_back( min );

            BSONObj res;
            if ( !c->multiSplit( splitPoints, res ) ) {
                error() << "split failed: " << res << endl;
            }
            else {
                LOG(1) << "split worked: " << res << endl;
            }
            break;
        }

        if ( didAnySplits ) {
            // state change, just wait till next round
            continue;
        }

        CandidateChunk* p = _policy->balance( ns, status, _balancedLastTime );
        if ( p ) candidateChunks->push_back( CandidateChunkPtr( p ) );
    }
}
/**
 * Upgrade v3 to v4 described here.
 *
 * This upgrade takes a config server without collection epochs (potentially) and adds
 * epochs to all mongo processes.
 *
 */
bool doUpgradeV3ToV4(const ConnectionString& configLoc,
                     const VersionType& lastVersionInfo,
                     string* errMsg)
{
    string dummy;
    if (!errMsg) errMsg = &dummy;

    verify(lastVersionInfo.getCurrentVersion() == UpgradeHistory_NoEpochVersion);

    if (lastVersionInfo.isUpgradeIdSet() && lastVersionInfo.getUpgradeId().isSet()) {

        //
        // Another upgrade failed, so cleanup may be necessary
        //

        BSONObj lastUpgradeState = lastVersionInfo.getUpgradeState();

        bool inCriticalSection;
        if (!FieldParser::extract(lastUpgradeState,
                                  inCriticalSectionField,
                                  &inCriticalSection,
                                  errMsg)) {

            *errMsg = stream() << "problem reading previous upgrade state"
                               << causedBy(errMsg);

            return false;
        }

        if (inCriticalSection) {

            // Manual intervention is needed here. Somehow our upgrade didn't get applied
            // consistently across config servers.

            *errMsg = cannotCleanupMessage;

            return false;
        }

        if (!_cleanupUpgradeState(configLoc, lastVersionInfo.getUpgradeId(), errMsg)) {

            // If we can't cleanup the old upgrade state, the user might have done it for us,
            // not a fatal problem (we'll just end up with extra collections).

            warning() << "could not cleanup previous upgrade state" << causedBy(errMsg) << endl;
            *errMsg = "";
        }
    }

    //
    // Check the versions of other mongo processes in the cluster before upgrade.
    // We can't upgrade if there are active pre-v2.2 processes in the cluster
    //

    Status mongoVersionStatus = checkClusterMongoVersions(configLoc,
                                                          string(minMongoProcessVersion));

    if (!mongoVersionStatus.isOK()) {

        *errMsg = stream() << "cannot upgrade with pre-v" << minMongoProcessVersion
                           << " mongo processes active in the cluster"
                           << causedBy(mongoVersionStatus);

        return false;
    }

    VersionType newVersionInfo;
    lastVersionInfo.cloneTo(&newVersionInfo);

    // Set our upgrade id and state
    OID upgradeId = OID::gen();
    newVersionInfo.setUpgradeId(upgradeId);
    newVersionInfo.setUpgradeState(BSONObj());

    // Write our upgrade id and state
    {
        scoped_ptr<ScopedDbConnection> connPtr;

        try {
            connPtr.reset(ScopedDbConnection::getInternalScopedDbConnection(configLoc, 30));
            ScopedDbConnection& conn = *connPtr;

            verify(newVersionInfo.isValid(NULL));

            conn->update(VersionType::ConfigNS,
                         BSON("_id" << 1 << VersionType::version_DEPRECATED(3)),
                         newVersionInfo.toBSON());
            _checkGLE(conn);
        }
        catch (const DBException& e) {

            *errMsg = stream() << "could not initialize version info for upgrade"
                               << causedBy(e);

            return false;
        }

        connPtr->done();
    }

    //
    // First lock all collection namespaces that exist
    //

    OwnedPointerMap<string, CollectionType> ownedCollections;
    const map<string, CollectionType*>& collections = ownedCollections.map();

    Status findCollectionsStatus = findAllCollectionsV3(configLoc, &ownedCollections);

    if (!findCollectionsStatus.isOK()) {

        *errMsg = stream() << "could not read collections from config server"
                           << causedBy(findCollectionsStatus);

        return false;
    }

    //
    // Acquire locks for all sharded collections
    // Something that didn't involve getting thousands of locks would be better.
    //

    OwnedPointerVector<ScopedDistributedLock> collectionLocks;

    log() << "acquiring locks for " << collections.size() << " sharded collections..."
          << endl;

    // WARNING - this string is used programmatically when forcing locks, be careful when
    // changing!
    // TODO: Add programmatic "why" field to lock collection
    string lockMessage = str::stream() << "ensuring epochs for config upgrade"
                                       << " (" << upgradeId.toString() << ")";

    if (!_acquireAllCollectionLocks(configLoc,
                                    collections,
                                    lockMessage,
                                    20 * 60 * 1000,
                                    &collectionLocks,
                                    errMsg)) {

        *errMsg = stream() << "could not acquire all namespace locks for upgrade"
                           << " (" << upgradeId.toString() << ")"
                           << causedBy(errMsg);

        return false;
    }

    // We are now preventing all splits and migrates for all sharded collections

    // Get working and backup suffixes
    string workingSuffix = genWorkingSuffix(upgradeId);
    string backupSuffix = genBackupSuffix(upgradeId);

    log() << "copying collection and chunk metadata to working and backup collections..."
          << endl;

    // Get a backup and working copy of the config.collections and config.chunks collections

    Status copyStatus = copyFrozenCollection(configLoc,
                                             CollectionType::ConfigNS,
                                             CollectionType::ConfigNS + workingSuffix);

    if (!copyStatus.isOK()) {

        *errMsg = stream() << "could not copy " << CollectionType::ConfigNS << " to "
                           << (CollectionType::ConfigNS + workingSuffix)
                           << causedBy(copyStatus);

        return false;
    }

    copyStatus = copyFrozenCollection(configLoc,
                                      CollectionType::ConfigNS,
                                      CollectionType::ConfigNS + backupSuffix);

    if (!copyStatus.isOK()) {

        *errMsg = stream() << "could not copy " << CollectionType::ConfigNS << " to "
                           << (CollectionType::ConfigNS + backupSuffix)
                           << causedBy(copyStatus);

        return false;
    }

    copyStatus = copyFrozenCollection(configLoc,
                                      ChunkType::ConfigNS,
                                      ChunkType::ConfigNS + workingSuffix);

    if (!copyStatus.isOK()) {

        *errMsg = stream() << "could not copy " << ChunkType::ConfigNS << " to "
                           << (ChunkType::ConfigNS + workingSuffix)
                           << causedBy(copyStatus);

        return false;
    }

    copyStatus = copyFrozenCollection(configLoc,
                                      ChunkType::ConfigNS,
                                      ChunkType::ConfigNS + backupSuffix);

    if (!copyStatus.isOK()) {

        *errMsg = stream() << "could not copy " << ChunkType::ConfigNS << " to "
                           << (ChunkType::ConfigNS + backupSuffix)
                           << causedBy(copyStatus);

        return false;
    }

    //
    // Go through sharded collections one-by-one and add epochs where missing
    //

    for (map<string, CollectionType*>::const_iterator it = collections.begin();
         it != collections.end(); ++it) {

        // Create a copy so that we can change the epoch later
        CollectionType collection;
        it->second->cloneTo(&collection);

        log() << "checking epochs for " << collection.getNS() << " collection..."
              << endl;

        OID epoch = collection.getEpoch();

        //
        // Go through chunks to find epoch if we haven't found it or to verify epoch is the same
        //

        OwnedPointerVector<ChunkType> ownedChunks;
        const vector<ChunkType*>& chunks = ownedChunks.vector();

        Status findChunksStatus = findAllChunks(configLoc, collection.getNS(), &ownedChunks);

        if (!findChunksStatus.isOK()) {

            *errMsg = stream() << "could not read chunks from config server"
                               << causedBy(findChunksStatus);

            return false;
        }

        for (vector<ChunkType*>::const_iterator chunkIt = chunks.begin();
             chunkIt != chunks.end(); ++chunkIt) {

            const ChunkType& chunk = *(*chunkIt);

            // If our chunk epoch is set and doesn't match
            if (epoch.isSet() && chunk.getVersion().epoch().isSet() &&
                chunk.getVersion().epoch() != epoch) {

                *errMsg = stream() << "chunk epoch for " << chunk.toString() << " in "
                                   << collection.getNS() << " does not match found epoch "
                                   << epoch;

                return false;
            }
            else if (!epoch.isSet() && chunk.getVersion().epoch().isSet()) {
                epoch = chunk.getVersion().epoch();
            }
        }

        //
        // Write collection epoch if needed
        //

        if (!collection.getEpoch().isSet()) {

            OID newEpoch = OID::gen();

            log() << "writing new epoch " << newEpoch << " for " << collection.getNS()
                  << " collection..." << endl;

            scoped_ptr<ScopedDbConnection> connPtr;

            try {
                connPtr.reset(ScopedDbConnection::getInternalScopedDbConnection(configLoc, 30));
                ScopedDbConnection& conn = *connPtr;

                conn->update(CollectionType::ConfigNS + workingSuffix,
                             BSON(CollectionType::ns(collection.getNS())),
                             BSON("$set" << BSON(CollectionType::DEPRECATED_lastmodEpoch(newEpoch))));
                _checkGLE(conn);
            }
            catch (const DBException& e) {

                *errMsg = stream() << "could not write a new epoch for " << collection.getNS()
                                   << causedBy(e);

                return false;
            }

            connPtr->done();
            collection.setEpoch(newEpoch);
        }

        epoch = collection.getEpoch();
        verify(epoch.isSet());

        //
        // Now write verified epoch to all chunks
        //

        log() << "writing epoch " << epoch << " for " << chunks.size() << " chunks in "
              << collection.getNS() << " collection..."
              << endl;

        {
            scoped_ptr<ScopedDbConnection> connPtr;

            try {
                connPtr.reset(ScopedDbConnection::getInternalScopedDbConnection(configLoc, 30));
                ScopedDbConnection& conn = *connPtr;

                // Multi-update of all chunks
                conn->update(ChunkType::ConfigNS + workingSuffix,
                             BSON(ChunkType::ns(collection.getNS())),
                             BSON("$set" << BSON(ChunkType::DEPRECATED_epoch(epoch))),
                             false,
                             true); // multi
                _checkGLE(conn);
            }
            catch (const DBException& e) {

                *errMsg = stream() << "could not write a new epoch " << epoch.toString()
                                   << " for chunks in " << collection.getNS()
                                   << causedBy(e);

                return false;
            }

            connPtr->done();
        }
    }

    //
    // Paranoid verify the collection writes
    //

    {
        scoped_ptr<ScopedDbConnection> connPtr;

        try {
            connPtr.reset(ScopedDbConnection::getInternalScopedDbConnection(configLoc, 30));
            ScopedDbConnection& conn = *connPtr;

            // Find collections with no epochs
            BSONObj emptyDoc =
                    conn->findOne(CollectionType::ConfigNS + workingSuffix,
                                  BSON("$unset" << BSON(CollectionType::DEPRECATED_lastmodEpoch() << 1)));

            if (!emptyDoc.isEmpty()) {

                *errMsg = stream() << "collection " << emptyDoc
                                   << " is still missing epoch after config upgrade";

                connPtr->done();
                return false;
            }

            // Find collections with empty epochs
            emptyDoc = conn->findOne(CollectionType::ConfigNS + workingSuffix,
                                     BSON(CollectionType::DEPRECATED_lastmodEpoch(OID())));

            if (!emptyDoc.isEmpty()) {

                *errMsg = stream() << "collection " << emptyDoc
                                   << " still has empty epoch after config upgrade";

                connPtr->done();
                return false;
            }

            // Find chunks with no epochs
            emptyDoc = conn->findOne(ChunkType::ConfigNS + workingSuffix,
                                     BSON("$unset" << BSON(ChunkType::DEPRECATED_epoch() << 1)));

            if (!emptyDoc.isEmpty()) {

                *errMsg = stream() << "chunk " << emptyDoc
                                   << " is still missing epoch after config upgrade";

                connPtr->done();
                return false;
            }

            // Find chunks with empty epochs
            emptyDoc = conn->findOne(ChunkType::ConfigNS + workingSuffix,
                                     BSON(ChunkType::DEPRECATED_epoch(OID())));

            if (!emptyDoc.isEmpty()) {

                *errMsg = stream() << "chunk " << emptyDoc
                                   << " still has empty epoch after config upgrade";

                connPtr->done();
                return false;
            }
        }
        catch (const DBException& e) {

            *errMsg = stream() << "could not verify epoch writes" << causedBy(e);

            return false;
        }

        connPtr->done();
    }

    //
    // Double check that our collections haven't changed
    //

    Status idCheckStatus = checkIdsTheSame(configLoc,
                                           CollectionType::ConfigNS,
                                           CollectionType::ConfigNS + workingSuffix);

    if (!idCheckStatus.isOK()) {

        *errMsg = stream() << CollectionType::ConfigNS
                           << " was modified while working on upgrade"
                           << causedBy(idCheckStatus);

        return false;
    }

    idCheckStatus = checkIdsTheSame(configLoc,
                                    ChunkType::ConfigNS,
                                    ChunkType::ConfigNS + workingSuffix);

    if (!idCheckStatus.isOK()) {

        *errMsg = stream() << ChunkType::ConfigNS
                           << " was modified while working on upgrade"
                           << causedBy(idCheckStatus);

        return false;
    }

    //
    // ENTER CRITICAL SECTION
    //

    newVersionInfo.setUpgradeState(BSON(inCriticalSectionField(true)));

    {
        scoped_ptr<ScopedDbConnection> connPtr;

        try {
            connPtr.reset(ScopedDbConnection::getInternalScopedDbConnection(configLoc, 30));
            ScopedDbConnection& conn = *connPtr;

            verify(newVersionInfo.isValid(NULL));

            conn->update(VersionType::ConfigNS,
                         BSON("_id" << 1 << VersionType::version_DEPRECATED(3)),
                         newVersionInfo.toBSON());
            _checkGLE(conn);
        }
        catch (const DBException& e) {

            // No cleanup message here since we're not sure if we wrote or not, and
            // not dangerous either way except to prevent further updates (at which point
            // the message is printed)
            *errMsg = stream() << "could not update version info to enter critical update section"
                               << causedBy(e);

            return false;
        }

        // AT THIS POINT ANY FAILURE REQUIRES MANUAL INTERVENTION!
        connPtr->done();
    }

    log() << "entered critical section for config upgrade" << endl;

    Status overwriteStatus = overwriteCollection(configLoc,
                                                 CollectionType::ConfigNS + workingSuffix,
                                                 CollectionType::ConfigNS);

    if (!overwriteStatus.isOK()) {

        error() << cleanupMessage << endl;
        *errMsg = stream() << "could not overwrite collection " << CollectionType::ConfigNS
                           << " with working collection "
                           << (CollectionType::ConfigNS + workingSuffix)
                           << causedBy(overwriteStatus);

        return false;
    }

    overwriteStatus = overwriteCollection(configLoc,
                                          ChunkType::ConfigNS + workingSuffix,
                                          ChunkType::ConfigNS);

    if (!overwriteStatus.isOK()) {

        error() << cleanupMessage << endl;
        *errMsg = stream() << "could not overwrite collection " << ChunkType::ConfigNS
                           << " with working collection "
                           << (ChunkType::ConfigNS + workingSuffix)
                           << causedBy(overwriteStatus);

        return false;
    }

    //
    // Finally update the version to latest and add clusterId to version
    //

    OID newClusterId = OID::gen();

    // Note: hardcoded versions, since this is a very particular upgrade
    // Note: DO NOT CLEAR the config version unless bumping the minCompatibleVersion,
    // we want to save the excludes that were set.

    newVersionInfo.setMinCompatibleVersion(UpgradeHistory_NoEpochVersion);
    newVersionInfo.setCurrentVersion(UpgradeHistory_MandatoryEpochVersion);
    newVersionInfo.setClusterId(newClusterId);

    // Leave critical section
    newVersionInfo.unsetUpgradeId();
    newVersionInfo.unsetUpgradeState();

    log() << "writing new version info and clusterId " << newClusterId << "..." << endl;

    {
        scoped_ptr<ScopedDbConnection> connPtr;

        try {
            connPtr.reset(ScopedDbConnection::getInternalScopedDbConnection(configLoc, 30));
            ScopedDbConnection& conn = *connPtr;

            verify(newVersionInfo.isValid(NULL));

            conn->update(VersionType::ConfigNS,
                         BSON("_id" << 1
                              << VersionType::version_DEPRECATED(UpgradeHistory_NoEpochVersion)),
                         newVersionInfo.toBSON());
            _checkGLE(conn);
        }
        catch (const DBException& e) {

            error() << cleanupMessage << endl;
            *errMsg = stream() << "could not write new version info "
                               << "and exit critical upgrade section"
                               << causedBy(e);

            return false;
        }

        connPtr->done();
    }

    //
    // END CRITICAL SECTION
    //

    return true;
}